def Text(text):
    """Wrap raw input as a normalized Python ``str``.

    Bytes input is decoded as UTF-8; the result is normalized to Unicode
    NFC so that equivalent composed/decomposed Vietnamese diacritic
    sequences compare equal before tokenization.

    Args:
        text: a ``str``, or a UTF-8 encoded ``bytes``-like object.

    Returns:
        The NFC-normalized ``str``.

    Raises:
        UnicodeDecodeError: if ``text`` is bytes that are not valid UTF-8.
    """
    # isinstance (not ``type(text) == str``) so str subclasses are accepted
    # as-is; the original type-equality check would route a str subclass to
    # .decode(), which str objects do not have in Python 3.
    if not isinstance(text, str):
        text = text.decode("utf-8")
    return unicodedata.normalize("NFC", text)