# Source code for simplechinese.nlp

import re
import pickle
import warnings
import os

import numpy as np
import pandas as pd

import jieba
jieba.setLogLevel(60)

# Absolute path of the directory that contains this module.
_ROOT = os.path.abspath(os.path.dirname(__file__))


def _get_data(path):
    """Return the path of the file `path` inside the ``data`` directory next to this module."""
    data_dir = os.path.join(_ROOT, 'data')
    return os.path.join(data_dir, path)

# NOTE: pickle.load can execute arbitrary code from the file it reads. This is
# acceptable only because names.pickle is read from the ``data`` directory that
# sits next to this module; never point this at a user-supplied file.
with open(_get_data('names.pickle'), 'rb') as handle:
    names = pickle.load(handle)

# Lookup set used by extract_nouns(extract_mode="person").
_nr = names['nr'] | names['nrt'] | names['nrfg']
# Lookup set used by extract_nouns(extract_mode="place").
# NOTE(review): 'nrt' is also part of _nr above -- confirm the overlap is intended.
_ns = names['nrt'] | names['ns'] | names['nt']
# Union of every category in the pickle; used by extract_nouns(extract_mode="all").
_n = set().union(*names.values())

def extract_nums(x, isList=False, dtype=float):
    """
    Extract the numbers from a string, a pandas.Series, or a pandas.DataFrame.

    Every character other than a digit or ``.`` is treated as a separator, so signs and exponents are not recognised. Tokens that cannot be parsed as a float (for example ``1.2.3`` or a lone ``.``) are dropped.

    Args:
        x: The content to be parsed. Either a string, a pandas.Series, or a pandas.DataFrame.

        isList: A boolean that only affects pandas input. If it is True, each cell becomes a list of numbers, otherwise each cell becomes a string of numbers separated by spaces. A plain string input always returns a list.

        dtype: The type the parsed numbers are cast to (passed to numpy ``astype``). Defaults to float.

    Returns:
        The numbers in the input data.

    Raises:
        ValueError: If x is not a string, a pandas.Series, or a pandas.DataFrame.

    |
    """

    def get_float(element):
        # Tokens such as "1.2.3" or "." survive the regex but are not numbers.
        try:
            return float(element)
        except ValueError:
            return None

    def get_nums(text):
        candidates = [get_float(tok) for tok in re.sub('[^0-9.]', ' ', text).split()]
        nums = np.array([n for n in candidates if n is not None]).astype(dtype)
        return list(nums)

    def func(cell):
        nums = get_nums(cell)
        if isList:
            return nums
        return " ".join([str(n) for n in nums])

    if isinstance(x, str):
        return get_nums(x)
    if isinstance(x, pd.DataFrame):
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map. Check the class, not the instance, because attribute
        # access on an instance could resolve to a column named "map".
        if hasattr(pd.DataFrame, "map"):
            return x.map(func)
        return x.applymap(func)
    if isinstance(x, pd.Series):
        return x.apply(func)
    raise ValueError("The type of the input variable should be string, pd.Series, pandas.DataFrame.")

def extract_words(x, isList=False, mode=0, token="/"):
    """
    Extract the words from a string, a pandas.Series, or a pandas.DataFrame.

    Args:
        x: The content to be parsed. Either a string, a pandas.Series, or a pandas.DataFrame.

        isList: A boolean that only affects pandas input. If it is True, each cell becomes a list of words, otherwise each cell becomes a string of words separated by the token. A plain string input always returns a list.

        mode: 0: No single character words. The words may be overlapped.
              1: Have single character words. The words may be overlapped.
              2: No single character words. The words are not overlapped.
              3: Have single character words. The words are not overlapped.
              4: Only single characters.

        token: The token to separate words if isList is False.

    Returns:
        The separated words in the input data.

    Raises:
        ValueError: If mode is not one of 0-4, or if x is not a string, a pandas.Series, or a pandas.DataFrame.

    |
    """

    if mode not in (0, 1, 2, 3, 4):
        raise ValueError("The mode should be chosen from 0-4.")

    # Modes 1, 3 and 4 keep one-character words; modes 0 and 2 drop them.
    keep_single_chars = mode in (1, 3, 4)

    def get_words(_s):
        if mode in (0, 1):
            words = jieba.cut_for_search(_s)
        elif mode in (2, 3):
            words = jieba.cut(_s, cut_all=False)
        else:
            # Mode 4: every character is its own word; jieba is not involved.
            words = _s
        if keep_single_chars:
            return list(words)
        return [w for w in words if len(w) > 1]

    def func(cell):
        words = get_words(cell)
        if isList:
            return words
        return token.join([str(w) for w in words])

    if isinstance(x, str):
        return get_words(x)
    if isinstance(x, pd.DataFrame):
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map. Check the class, not the instance, because attribute
        # access on an instance could resolve to a column named "map".
        if hasattr(pd.DataFrame, "map"):
            return x.map(func)
        return x.applymap(func)
    if isinstance(x, pd.Series):
        return x.apply(func)
    raise ValueError("The type of the input variable should be string, pd.Series, pandas.DataFrame.")

def extract_nouns(x, isList=False, split_mode=0, extract_mode="all", token="/"):
    """
    Extract the nouns from a string, a pandas.Series, or a pandas.DataFrame. This function is still under development.

    The input is first split into words, then only the words found in the module's name sets (loaded from names.pickle) are kept.

    Args:
        x: The content to be parsed. Either a string, a pandas.Series, or a pandas.DataFrame.

        isList: A boolean that only affects pandas input. If it is True, each cell becomes a list of nouns, otherwise each cell becomes a string of nouns separated by the token. A plain string input always returns a list.

        split_mode: How the text is split into candidate words.
              0: No single character words. The words may be overlapped.
              1: Have single character words. The words may be overlapped.
              2: No single character words. The words are not overlapped.
              3: Have single character words. The words are not overlapped.
              4: Only single characters.

        extract_mode: Which name set the candidates are matched against (case-insensitive).
              "all": every known name.
              "person": people's names only (experimental, emits a warning).
              "place": place names only (experimental, emits a warning).

        token: The token to separate words if isList is False.

    Returns:
        The separated nouns in the input data.

    Raises:
        ValueError: If split_mode is not one of 0-4, if extract_mode is not "all", "person" or "place", or if x is not a string, a pandas.Series, or a pandas.DataFrame.

    |
    """

    if split_mode not in (0, 1, 2, 3, 4):
        raise ValueError("The mode should be chosen from 0-4.")
    # Normalise once so validation, the warning, and the set selection all
    # agree; previously "Person" passed validation but was treated as "all".
    extract_mode = extract_mode.lower()
    if extract_mode not in ("all", "person", "place"):
        raise ValueError("The mode should be chosen from \"all\", \"person\", and \"place\".")
    if extract_mode in ("person", "place"):
        warnings.warn("The function of extracting people or locations` names is still under developing...")

    # Modes 1, 3 and 4 keep one-character words; modes 0 and 2 drop them.
    keep_single_chars = split_mode in (1, 3, 4)

    def get_words(_s):
        if split_mode in (0, 1):
            words = jieba.cut_for_search(_s)
        elif split_mode in (2, 3):
            words = jieba.cut(_s, cut_all=False)
        else:
            # Mode 4: every character is its own word; jieba is not involved.
            words = _s
        if keep_single_chars:
            candidates = list(words)
        else:
            candidates = [w for w in words if len(w) > 1]

        if extract_mode == "person":
            known = _nr
        elif extract_mode == "place":
            known = _ns
        else:
            known = _n
        return [w for w in candidates if w in known]

    def func(cell):
        words = get_words(cell)
        if isList:
            return words
        return token.join([str(w) for w in words])

    if isinstance(x, str):
        return get_words(x)
    if isinstance(x, pd.DataFrame):
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map. Check the class, not the instance, because attribute
        # access on an instance could resolve to a column named "map".
        if hasattr(pd.DataFrame, "map"):
            return x.map(func)
        return x.applymap(func)
    if isinstance(x, pd.Series):
        return x.apply(func)
    raise ValueError("The type of the input variable should be string, pd.Series, pandas.DataFrame.")
# Source code for simplechinese.nlp
#
# SimpleChinese
#
# Navigation
#
# Related Topics