Source code for simplechinese.representation

import numpy as np
import pandas as pd

from .nlp import extract_words

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans, DBSCAN, MeanShift

import jieba
# 60 is above logging.CRITICAL (50); presumably chosen so that importing this
# module silences all of jieba's log output (e.g. dictionary-loading messages).
jieba.setLogLevel(60)

def pca(x, n_components=2):
    """
    Perform dimension reduction with the principal component analysis algorithm.

    Args:
        x: A pandas.Series of vectors. Each element is one sample.

        n_components: The number of components to keep. It is passed to
            sklearn.decomposition.PCA unchanged.

    Returns:
        A pandas.Series with the same index as ``x`` whose elements are the
        reduced vectors, stored as lists.

    |
    """
    # Use a distinct local name so the estimator does not shadow this function.
    reducer = PCA(n_components=n_components)
    # list(x) turns the Series of vectors into a sequence of rows for sklearn.
    reduced = reducer.fit_transform(list(x))
    return pd.Series(reduced.tolist(), index=x.index)

def nmf(x, n_components=2):
    """
    Perform dimension reduction with the non-negative matrix factorization algorithm.

    Args:
        x: A pandas.Series of vectors. Each element is one sample.

        n_components: The number of components to keep. It is passed to
            sklearn.decomposition.NMF unchanged.

    Returns:
        A pandas.Series with the same index as ``x`` whose elements are the
        reduced vectors, stored as lists.

    |
    """
    # Use a distinct local name so the estimator does not shadow this function.
    # random_state is fixed so that the random initialisation is reproducible.
    factorizer = NMF(n_components=n_components, init="random", random_state=0)
    # list(x) turns the Series of vectors into a sequence of rows for sklearn.
    reduced = factorizer.fit_transform(list(x))
    return pd.Series(reduced.tolist(), index=x.index)

def term_frequency(x, mode=0, max_features=None, return_feature_names=False):
    """
    Extract the words and vectorize each element in the pandas.Series by the frequency of each word.

    Args:
        x: The pandas.Series to be parsed.

        max_features: The maximum number of features. It is passed to
            CountVectorizer unchanged.

        return_feature_names: Return the token words or not.

        mode: 0: No single character words. The words may be overlapped.
              1: Have single character words. The words may be overlapped.
              2: No single character words. The words are not overlapped.
              3: Have single character words. The words are not overlapped.
              4: Only single characters.

    Returns:
        The vectorization result: a pandas.Series with the same index as ``x``
        whose elements are lists of word counts. When ``return_feature_names``
        is true, a tuple ``(result, feature_names)`` is returned instead.

    Raises:
        ValueError: If ``mode`` is not one of 0-4 or ``x`` is not a pandas.Series.

    |
    """

    if mode not in [0, 1, 2, 3, 4]:
        raise ValueError("The mode should be chosen from 0-4.")
    if not isinstance(x, pd.Series):
        raise ValueError("The type of the input variable should be pandas.Series.")

    # The texts are pre-segmented into space-separated words below, so every
    # run of non-whitespace characters is one token (raw string: regex escape).
    vectorizer = CountVectorizer(max_features=max_features,
                                 lowercase=False,
                                 token_pattern=r"\S+")
    words = extract_words(x, isList=False, mode=mode, token=" ")
    counts = vectorizer.fit_transform(words).toarray().tolist()
    y = pd.Series(counts, index=x.index)

    if not return_feature_names:
        return y

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old method on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return (y, feature_names)

def tfidf(x, mode=0, max_features=None, min_df=1, return_feature_names=False):
    """
    Extract the words and vectorize each element in the pandas.Series by the tfidf scores.

    Args:
        x: The pandas.Series to be parsed.

        max_features: The maximum number of features. It is passed to
            TfidfVectorizer unchanged.

        min_df: The minimum document frequency of a word to be kept. It is
            passed to TfidfVectorizer unchanged.

        return_feature_names: Return the token words or not.

        mode: 0: No single character words. The words may be overlapped.
              1: Have single character words. The words may be overlapped.
              2: No single character words. The words are not overlapped.
              3: Have single character words. The words are not overlapped.
              4: Only single characters.

    Returns:
        The vectorization result: a pandas.Series with the same index as ``x``
        whose elements are lists of tfidf scores. When ``return_feature_names``
        is true, a tuple ``(result, feature_names)`` is returned instead.

    Raises:
        ValueError: If ``mode`` is not one of 0-4 or ``x`` is not a pandas.Series.

    |
    """

    if mode not in [0, 1, 2, 3, 4]:
        raise ValueError("The mode should be chosen from 0-4.")
    if not isinstance(x, pd.Series):
        raise ValueError("The type of the input variable should be pandas.Series.")

    # The texts are pre-segmented into space-separated words below, so every
    # run of non-whitespace characters is one token (raw string: regex escape).
    vectorizer = TfidfVectorizer(use_idf=True,
                                 max_features=max_features,
                                 min_df=min_df,
                                 token_pattern=r"\S+",
                                 lowercase=False)
    words = extract_words(x, isList=False, mode=mode, token=" ")
    scores = vectorizer.fit_transform(words).toarray().tolist()
    # Index by the validated input, matching term_frequency.
    y = pd.Series(scores, index=x.index)

    if not return_feature_names:
        return y

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old method on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return (y, feature_names)
Source code for simplechinese.representation

SimpleChinese

Navigation

Related Topics