preprocess.py
import re
import string
from typing import List, Optional

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the required NLTK corpora once, at import time, so that concurrent
# callers do not race on the download.
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)


class Preprocessor:
    def __init__(
        self,
        lowercase=True,
        remove_non_ascii=True,
        remove_punctuation=False,
        lemmatization=True,
        remove_stopwords=True,
    ):
        """Performs preprocessing and tokenizes the input.

        Args:
            lowercase (bool, optional): Convert all ASCII characters to lowercase. Defaults to True.
            remove_non_ascii (bool, optional): Remove all non-ASCII characters from the input. Defaults to True.
            remove_punctuation (bool, optional): Remove all punctuation. Defaults to False.
            lemmatization (bool, optional): Use lemmatization, converting each word to its base form. Defaults to True.
            remove_stopwords (bool, optional): Remove English stopwords. Defaults to True.
        """
        self._lowercase = lowercase
        self._remove_non_ascii = remove_non_ascii
        self._remove_punctuation = remove_punctuation
        self._lemmatizer = WordNetLemmatizer() if lemmatization else None
        self._remove_stopwords = remove_stopwords
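
    # Any subset of steps can be toggled via the constructor; for example the
    # (hypothetical) configuration Preprocessor(remove_punctuation=True,
    # lemmatization=False) enables punctuation stripping and disables
    # lemmatization while leaving the other defaults in place.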
    def preprocess(self, input: str) -> Optional[str]:
        """Preprocess a string.

        Args:
            input (str): The string to preprocess

        Returns:
            Optional[str]: The preprocessed string, or the input unchanged if
                it is None or a pandas missing value
        """
        if input is None:
            return input
        if not isinstance(input, str) and pd.isnull(input):
            return input
        # Replace newlines with spaces. str.replace returns a new string, so
        # the result must be assigned back.
        input = input.replace("\n", " ")
        if self._remove_non_ascii or self._remove_punctuation:

            def include_char(c):
                if self._remove_non_ascii and c not in string.printable:
                    return False
                if self._remove_punctuation and c in string.punctuation:
                    return False
                return True

            input = "".join(ch for ch in input if include_char(ch))
        if self._lemmatizer:
            input = " ".join(map(self._lemmatizer.lemmatize, input.split()))
        if self._lowercase:
            input = input.lower()
        if self._remove_stopwords:
            # Use a set for O(1) membership tests instead of scanning a list.
            words = set(stopwords.words("english"))
            input = " ".join(w for w in input.split() if w not in words)
        return input
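
    # Rough illustration with the default settings (exact output depends on
    # the installed WordNet data): preprocess("The cats are running\n") yields
    # roughly "cat running" -- the newline is replaced, "cats" is lemmatized
    # to "cat", the text is lowercased, and the stopwords "the"/"are" are
    # dropped.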
    def tokenize(self, input: str) -> List[str]:
        """Tokenize the input by splitting on punctuation. Runs of punctuation
        are treated as tokens in their own right.

        Args:
            input (str): The string to tokenize

        Returns:
            List[str]: A list of tokens
        """
        return re.findall(r"\w+|[^\s\w]+", input)

    def tokenize_opt(self, input: Optional[str]) -> Optional[List[str]]:
        """Same as tokenize, but passes None and pandas missing values through."""
        if input is None:
            return input
        if not isinstance(input, str) and pd.isnull(input):
            return input
        return self.tokenize(input)
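
    # For example, the regex separates words from adjacent punctuation:
    #     tokenize("Hello, world!") -> ["Hello", ",", "world", "!"]
    #     tokenize("don't")         -> ["don", "'", "t"]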
    def preprocess_and_tokenize(self, input: str) -> List[str]:
        return self.tokenize(self.preprocess(input))

    def preprocess_and_tokenize_opt(self, input: Optional[str]) -> Optional[List[str]]:
        """Same as preprocess_and_tokenize, but accepts missing values such as
        Python's None or pandas' NA.

        Args:
            input (Optional[str]): The input to preprocess and tokenize

        Returns:
            Optional[List[str]]: The list of tokens, or the input unchanged if
                it is a missing value
        """
        if input is None:
            return input
        if not isinstance(input, str) and pd.isnull(input):
            return input
        return self.preprocess_and_tokenize(input)
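
    # A minimal pandas sketch (df and its "text" column are hypothetical):
    #     df["tokens"] = df["text"].map(Preprocessor().preprocess_and_tokenize_opt)
    # Missing values (None/NaN) pass through unchanged instead of raising.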


if __name__ == "__main__":
    import sys

    print("Text to preprocess: ", end="")
    sys.stdout.flush()
    input = sys.stdin.read()
    processor = Preprocessor()
    tokens = processor.preprocess_and_tokenize(input)
    print(tokens)