# Source code for simplechinese.nlp

import re
import pickle
import warnings
import os

import numpy as np
import pandas as pd

import jieba
jieba.setLogLevel(60)

# Absolute path of the directory that holds this module; bundled data files
# are resolved relative to it.
_ROOT = os.path.abspath(os.path.dirname(__file__))


def _get_data(path):
    """Return the location of ``path`` inside the ``data`` directory next to this module."""
    data_dir = os.path.join(_ROOT, 'data')
    return os.path.join(data_dir, path)

# Load the name tables from the ``data`` directory that sits next to this
# module. pickle.load must never be pointed at a file from an untrusted
# source; it is used here only on the package's own data file.
with open(_get_data('names.pickle'), 'rb') as handle:
    names = pickle.load(handle)

# Lookup sets consulted by extract_nouns. The values of ``names`` are combined
# with set union; the keys are presumably jieba part-of-speech tags (verify
# against the script that generates names.pickle).
_nr = names['nr'] | names['nrt'] | names['nrfg']
# NOTE(review): 'nrt' is included in both _nr and _ns -- confirm that this is
# intended for the "place" set.
_ns = names['nrt'] | names['ns'] | names['nt']
# Union of every table in a single pass, instead of rebuilding the whole set
# once per key (which also left the loop variable behind as a module global).
_n = set().union(*names.values())

def extract_nums(x, isList=False, dtype=float):
    """
    Extract the numbers from a string, a pandas.Series, or a pandas.DataFrame.

    Every character other than a digit or a dot is treated as a separator, so
    signs are not recognised. Pieces that ``float`` cannot parse (for example
    "." or "1.2.3") are dropped.

    Args:
        x: The content to be parsed. Either a string, a pandas.Series, or a
            pandas.DataFrame whose cells are strings.
        isList: A boolean that only affects pandas input. If it is True, every
            cell becomes a list of numbers; otherwise every cell becomes a
            string of the numbers separated by single spaces.
        dtype: The type the extracted numbers are cast to (passed to
            ``numpy.ndarray.astype``).

    Returns:
        For a string: a list of the numbers, regardless of ``isList``.
        For a pandas.Series / pandas.DataFrame: an object of the same kind
        with every cell converted as described for ``isList``.

    Raises:
        ValueError: If ``x`` is not a string, pandas.Series or
            pandas.DataFrame.
    """
    def get_nums(text):
        parsed = []
        for piece in re.sub('[^0-9.]', ' ', text).split():
            try:
                parsed.append(float(piece))
            except ValueError:
                # Runs of digits/dots that are not valid numbers are skipped.
                continue
        return list(np.array(parsed).astype(dtype))

    def func(cell):
        nums = get_nums(cell)
        if isList:
            return nums
        return " ".join([str(n) for n in nums])

    if isinstance(x, str):
        return get_nums(x)
    if isinstance(x, pd.DataFrame):
        # DataFrame.applymap was deprecated in pandas 2.1 and renamed to
        # DataFrame.map; check on the class so a column called "map" cannot
        # be mistaken for the method on older pandas versions.
        elementwise = x.map if hasattr(pd.DataFrame, "map") else x.applymap
        return elementwise(func)
    if isinstance(x, pd.Series):
        return x.apply(func)
    raise ValueError("The type of the input variable should be string, pd.Series, pandas.DataFrame.")
def extract_words(x, isList=False, mode=0, token="/"):
    """
    Extract the words from a string, a pandas.Series, or a pandas.DataFrame.

    Args:
        x: The content to be parsed. Either a string, a pandas.Series, or a
            pandas.DataFrame whose cells are strings.
        isList: A boolean that only affects pandas input. If it is True, every
            cell becomes a list of words; otherwise every cell becomes a
            string of the words joined by ``token``.
        mode:
            0: No single character words. The words may be overlapped.
            1: Have single character words. The words may be overlapped.
            2: No single character words. The words are not overlapped.
            3: Have single character words. The words are not overlapped.
            4: Only single characters.
        token: The token to separate words if isList is False.

    Returns:
        For a string: a list of the words, regardless of ``isList``.
        For a pandas.Series / pandas.DataFrame: an object of the same kind
        with every cell converted as described for ``isList``.

    Raises:
        ValueError: If ``mode`` is not one of 0-4, or if ``x`` is not a
            string, pandas.Series or pandas.DataFrame.
    """
    if mode not in [0, 1, 2, 3, 4]:
        raise ValueError("The mode should be chosen from 0-4.")

    def get_words(_s):
        if mode in [0, 1]:
            words = jieba.cut_for_search(_s)
        elif mode in [2, 3]:
            words = jieba.cut(_s, cut_all=False)
        else:
            # Mode 4: every character is its own word, jieba is not involved.
            words = list(_s)
        if mode in [1, 3, 4]:
            return list(words)
        # Modes 0 and 2 drop single-character words.
        return [w for w in words if len(w) > 1]

    def func(cell):
        words = get_words(cell)
        if isList:
            return words
        return token.join([str(w) for w in words])

    if isinstance(x, str):
        return get_words(x)
    if isinstance(x, pd.DataFrame):
        # DataFrame.applymap was deprecated in pandas 2.1 and renamed to
        # DataFrame.map; check on the class so a column called "map" cannot
        # be mistaken for the method on older pandas versions.
        elementwise = x.map if hasattr(pd.DataFrame, "map") else x.applymap
        return elementwise(func)
    if isinstance(x, pd.Series):
        return x.apply(func)
    raise ValueError("The type of the input variable should be string, pd.Series, pandas.DataFrame.")
def extract_nouns(x, isList=False, split_mode=0, extract_mode="all", token="/"):
    """
    Extract the nouns from a string, a pandas.Series, or a pandas.DataFrame.

    The text is first split into words (see ``split_mode``) and only the words
    found in the module's name tables are kept (see ``extract_mode``).
    This function is still under development.

    Args:
        x: The content to be parsed. Either a string, a pandas.Series, or a
            pandas.DataFrame whose cells are strings.
        isList: A boolean that only affects pandas input. If it is True, every
            cell becomes a list of nouns; otherwise every cell becomes a
            string of the nouns joined by ``token``.
        split_mode:
            0: No single character words. The words may be overlapped.
            1: Have single character words. The words may be overlapped.
            2: No single character words. The words are not overlapped.
            3: Have single character words. The words are not overlapped.
            4: Only single characters.
        extract_mode: Which name table to filter with, case-insensitive.
            "all": keep words found in ``_n``.
            "person": keep words found in ``_nr``; emits a warning.
            "place": keep words found in ``_ns``; emits a warning.
        token: The token to separate words if isList is False.

    Returns:
        For a string: a list of the nouns, regardless of ``isList``.
        For a pandas.Series / pandas.DataFrame: an object of the same kind
        with every cell converted as described for ``isList``.

    Raises:
        ValueError: If ``split_mode`` is not one of 0-4, if ``extract_mode``
            is not one of the supported names, or if ``x`` is not a string,
            pandas.Series or pandas.DataFrame.
    """
    if split_mode not in [0, 1, 2, 3, 4]:
        raise ValueError("The mode should be chosen from 0-4.")
    # Normalise once: the validation is case-insensitive, so every later
    # comparison has to see the same lower-cased value. Otherwise "Person"
    # would pass validation and then silently behave like "all".
    extract_mode = extract_mode.lower()
    if extract_mode not in ["all", "person", "place"]:
        raise ValueError("The mode should be chosen from \"all\", \"person\", and \"place\".")
    if extract_mode in ["person", "place"]:
        warnings.warn("The function of extracting people or locations` names is still under developing...")

    def get_words(_s):
        if split_mode in [0, 1]:
            words = jieba.cut_for_search(_s)
        elif split_mode in [2, 3]:
            words = jieba.cut(_s, cut_all=False)
        else:
            # Mode 4: every character is its own word, jieba is not involved.
            words = list(_s)
        if split_mode in [1, 3, 4]:
            candidates = list(words)
        else:
            # Modes 0 and 2 drop single-character words.
            candidates = [w for w in words if len(w) > 1]

        if extract_mode == "person":
            vocabulary = _nr
        elif extract_mode == "place":
            vocabulary = _ns
        else:
            vocabulary = _n
        return [w for w in candidates if w in vocabulary]

    def func(cell):
        words = get_words(cell)
        if isList:
            return words
        return token.join([str(w) for w in words])

    if isinstance(x, str):
        return get_words(x)
    if isinstance(x, pd.DataFrame):
        # DataFrame.applymap was deprecated in pandas 2.1 and renamed to
        # DataFrame.map; check on the class so a column called "map" cannot
        # be mistaken for the method on older pandas versions.
        elementwise = x.map if hasattr(pd.DataFrame, "map") else x.applymap
        return elementwise(func)
    if isinstance(x, pd.Series):
        return x.apply(func)
    raise ValueError("The type of the input variable should be string, pd.Series, pandas.DataFrame.")