Source code for simplechinese.preprocessing

import re

import numpy as np
import pandas as pd

def _parse(func, x):
    """
    Apply a string-transforming function to a string or to every cell of a
    pandas.DataFrame.

    Args:
        func: A callable taking one string and returning the transformed value.
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        ``func(x)`` when ``x`` is a string; otherwise a new pandas.DataFrame
        in which ``func`` has been applied to each cell.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    if isinstance(x, str):
        return func(x)
    if isinstance(x, pd.DataFrame):
        # ``DataFrame.applymap`` is deprecated since pandas 2.1 in favour of
        # ``DataFrame.map``. The lookup is done on the class (not the instance)
        # so that, on older pandas, a column named "map" cannot be mistaken
        # for the method.
        elementwise = getattr(type(x), "map", None) or type(x).applymap
        return elementwise(x, func)
    raise ValueError("The type of the input variable should be string or pandas.DataFrame.")

def only_digits(x):
    """
    Keep only the ASCII digits 0-9 of a string or of every cell of a
    pandas.DataFrame.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame containing only the digits, in
        their original order.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    def keep_digits(text):
        digits = re.findall(r'[0-9]', text)
        return "".join(digits)

    return _parse(keep_digits, x)
def only_zh(x):
    """
    Keep only the Chinese characters of a string or of every cell of a
    pandas.DataFrame.

    A character counts as Chinese here when it lies in the range
    U+4E00..U+9FFF.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame containing only Chinese characters,
        in their original order.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    def keep_chinese(text):
        # Each match is a run of consecutive Chinese characters.
        runs = re.findall(r'[\u4e00-\u9fff]+', text)
        return "".join(runs)

    return _parse(keep_chinese, x)
def only_en(x):
    """
    Keep only English letters (and spaces) of a string or of every cell of a
    pandas.DataFrame.

    Everything except A-Z, a-z and the space character is removed; note
    that spaces are deliberately kept by the pattern.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame containing only English letters
        and spaces.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    def keep_english(text):
        stripped = re.sub(r'[^\x41-\x5A\x61-\x7A ]', '', text)
        return stripped

    return _parse(keep_english, x)
def remove_space(x):
    """
    Remove all whitespace from a string or from every cell of a
    pandas.DataFrame.

    The text is split on runs of whitespace and glued back together, so
    tabs and newlines are removed as well as plain spaces.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame without whitespace.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    def drop_whitespace(text):
        pieces = text.split()
        return "".join(pieces)

    return _parse(drop_whitespace, x)
def remove_digits(x):
    """
    Remove the ASCII digits 0-9 from a string or from every cell of a
    pandas.DataFrame.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame without digits.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    def drop_digits(text):
        without_digits = re.sub('[0-9]', '', text)
        return without_digits

    return _parse(drop_digits, x)
def remove_zh(x):
    """
    Remove the Chinese characters from a string or from every cell of a
    pandas.DataFrame.

    A character counts as Chinese here when it lies in the range
    U+4E00..U+9FFF.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame without Chinese characters.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    def drop_chinese(text):
        without_chinese = re.sub(r'[\u4e00-\u9fff]+', '', text)
        return without_chinese

    return _parse(drop_chinese, x)
def remove_en(x):
    """
    Remove the English letters (A-Z and a-z) from a string or from every
    cell of a pandas.DataFrame.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame without English letters.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    def drop_english(text):
        without_english = re.sub(r'[\x41-\x5A\x61-\x7A]', '', text)
        return without_english

    return _parse(drop_english, x)
def remove_punctuations(x):
    """
    Remove punctuation from a string or from every cell of a
    pandas.DataFrame.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is removed. Because ``\\w`` includes the
    underscore, underscores are kept.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame without punctuation.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    def drop_punctuation(text):
        without_punctuation = re.sub(r'[^\w\s]', '', text)
        return without_punctuation

    return _parse(drop_punctuation, x)
def fillna(x):
    """
    Fill the N/As in a pandas.DataFrame with an empty string.

    A plain string is returned unchanged: it has no N/A cells, and it has no
    ``fillna`` method, so passing one through (as ``clean`` does) must not
    fail.

    Args:
        x: The content to be parsed. Either a pandas.DataFrame or a string.

    Returns:
        A pandas.DataFrame whose N/As are substituted with empty strings, or
        the input string itself.
    """
    if isinstance(x, str):
        return x
    return x.fillna("")
def toLower(x):
    """
    Convert the letters of a string, or of every cell of a
    pandas.DataFrame, to lowercase.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame with all letters in lowercase.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    return _parse(lambda text: text.lower(), x)
def toUpper(x):
    """
    Convert the letters of a string, or of every cell of a
    pandas.DataFrame, to uppercase.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame with all letters in uppercase.

    Raises:
        ValueError: If ``x`` is neither a string nor a pandas.DataFrame.
    """
    return _parse(lambda text: text.upper(), x)
def clean(x):
    """
    Normalize a string or a pandas.DataFrame by running the standard
    cleaning steps in order:

    1. fillna(): Fill the N/As in a pandas.DataFrame with an empty string
       (skipped for a plain string).
    2. toLower(): Transform alphabets to their lowercases.
    3. remove_punctuations(): Remove all the punctuations.
    4. remove_space(): Remove all the spaces.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new, cleaned string or pandas.DataFrame.
    """
    # A plain string has no N/A cells and no ``fillna`` method, so calling
    # fillna() on it would raise AttributeError; only data frames need it.
    y = x if isinstance(x, str) else fillna(x)
    y = toLower(y)
    y = remove_punctuations(y)
    y = remove_space(y)
    return y