Source code for simplechinese.preprocessing
import re
import numpy as np
import pandas as pd
def _parse(func, x):
if isinstance(x, str):
return func(x)
elif isinstance(x, pd.DataFrame):
return x.applymap(func)
else:
raise ValueError("The type of the input variable should be string or pandas.DataFrame.")
def only_digits(x):
    """
    Keep only the digits 0-9 found in a string or a pandas.DataFrame.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame in which every character other
        than the digits 0-9 has been dropped.
    """
    def keep_digits(text):
        # findall returns each single-digit match in order of appearance;
        # joining them back together discards everything in between.
        return "".join(re.findall(r'[0-9]', text))

    return _parse(keep_digits, x)
def only_zh(x):
    """
    Keep only the Chinese characters found in a string or a pandas.DataFrame.

    A character counts as Chinese when it lies in the range U+4E00 to U+9FFF.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame made up solely of the Chinese
        characters of the input, in their original order.
    """
    def keep_chinese(text):
        # Each match is a maximal run of consecutive Chinese characters.
        runs = re.findall(r'[\u4e00-\u9fff]+', text)
        return "".join(runs)

    return _parse(keep_chinese, x)
def only_en(x):
    """
    Keep only English letters (and spaces) in a string or a pandas.DataFrame.

    Every character other than A-Z, a-z and the space character is removed.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame containing only English letters
        and spaces.
    """
    # The negated class lists what survives: \x41-\x5A is A-Z, \x61-\x7A is a-z,
    # followed by a literal space.
    return _parse(lambda text: re.sub(r'[^\x41-\x5A\x61-\x7A ]', '', text), x)
def remove_space(x):
    """
    Strip every whitespace character from a string or a pandas.DataFrame.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame with all whitespace removed.
    """
    def drop_whitespace(text):
        # split() with no argument breaks on any run of whitespace, so tabs
        # and newlines are removed along with ordinary spaces.
        chunks = text.split()
        return "".join(chunks)

    return _parse(drop_whitespace, x)
def remove_digits(x):
    """
    Delete the digits 0-9 from a string or a pandas.DataFrame.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame with the digits 0-9 removed.
    """
    return _parse(lambda text: re.sub('[0-9]', '', text), x)
def remove_zh(x):
    """
    Delete the Chinese characters from a string or a pandas.DataFrame.

    A character counts as Chinese when it lies in the range U+4E00 to U+9FFF.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame with the Chinese characters removed.
    """
    def drop_chinese(text):
        stripped = re.sub(r'[\u4e00-\u9fff]+', '', text)
        return stripped

    return _parse(drop_chinese, x)
def remove_en(x):
    """
    Delete the English letters from a string or a pandas.DataFrame.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame with the letters A-Z and a-z removed.
    """
    # \x41-\x5A is A-Z and \x61-\x7A is a-z.
    return _parse(lambda text: re.sub(r'[\x41-\x5A\x61-\x7A]', '', text), x)
def remove_punctuations(x):
    """
    Delete punctuation from a string or a pandas.DataFrame.

    Anything that is neither a word character nor whitespace is treated as
    punctuation.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame with the punctuation removed.
    """
    def drop_punctuation(text):
        # \w also matches the underscore, so "_" is kept.
        cleaned = re.sub(r'[^\w\s]', '', text)
        return cleaned

    return _parse(drop_punctuation, x)
def fillna(x):
    """
    Fill the N/As in a pandas.DataFrame with an empty string.

    A plain string is also accepted and returned unchanged, so this function
    takes the same kinds of input as the other helpers in this module.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        The string itself when ``x`` is a string; otherwise the result of
        ``x.fillna("")``, i.e. a copy in which N/As are substituted with
        empty strings.
    """
    if isinstance(x, str):
        # A string has no missing cells to fill, and str has no .fillna
        # method, so hand it back as-is instead of raising AttributeError.
        return x
    return x.fillna("")
def toLower(x):
    """
    Convert the letters in a string or a pandas.DataFrame to lowercase.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame with its letters in lowercase.
    """
    return _parse(lambda text: text.lower(), x)
def toUpper(x):
    """
    Convert the letters in a string or a pandas.DataFrame to uppercase.

    Args:
        x: The content to be parsed. Either a string or a pandas.DataFrame.

    Returns:
        A new string or pandas.DataFrame with its letters in uppercase.
    """
    return _parse(lambda text: text.upper(), x)
def clean(x):
    """
    Run the standard cleaning pipeline over the input.

    The steps are applied in this order, each one receiving the output of
    the previous one:

    1. fillna(): Fill the N/As with an empty string.
    2. toLower(): Transform alphabets to their lowercases.
    3. remove_punctuations(): Remove all the punctuations.
    4. remove_space(): Remove all the spaces.

    Args:
        x: The content to be cleaned. It is handed to fillna() first, so it
            must be an input that fillna() accepts.

    Returns:
        The value returned by the final step, remove_space().
    """
    pipeline = (fillna, toLower, remove_punctuations, remove_space)
    result = x
    for step in pipeline:
        result = step(result)
    return result