tokenizer.py
import string, re, emoji
from utils.translate_emoticon import emoticon_to_label
from utils.translate_emoji import emoji_to_label
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


class Tokenizer():
    """
    :Tokenizer:
    Converts a stream of characters into a list of (word, label) tuples. Words
    are either tokens (all tokens in the string) or terms (only unique words).
    The string is split on whitespace, punctuation, emojis and emoticons.
    :Usage:
    tokenizer = Tokenizer()
    tokenizer.get_tokens(text, OPTIONS) - for all tokens
    tokenizer.get_terms(text, OPTIONS) - for unique terms
    :Options:
    All options are boolean and default to False:
    -- lowercase
    -- stem
    -- replace_emojis
    -- replace_num
    -- remove_stopw
    -- remove_punct
    """

    def __init__(self):
        self.punct = string.punctuation + '’…”“£—@→♡°⁎«\\|/'
        self.stopwords = stopwords.words('english')
        self.stemmer = SnowballStemmer('english')

    def get_only_tokens(self, text):
        return [token for token, tag in self.get_tokens(text, False, False, False, False, False, False, False)]

    def get_tokens(self, text,
                   lowercase=False,
                   stem=False,
                   replace_emojis=False,
                   replace_num=False,
                   remove_stopw=False,
                   remove_punct=False,
                   addit_mode=True):
        # Convert to lowercase
        if lowercase:
            text = text.lower()
        # Codify emoticons as tokens
        for emoticon in emoticon_to_label.keys():
            if re.search(emoticon, text):
                if replace_emojis and addit_mode:
                    # Replace with label token plus the emoticon itself (eg ":-)" > "[#joy#] :-)")
                    replace_by = '[#{}#] {}'.format(emoticon_to_label[emoticon][1:-1], emoticon)
                elif replace_emojis:
                    # Replace the emoticon by its label
                    replace_by = '[#{}#]'.format(emoticon_to_label[emoticon][1:-1])
                else:
                    # Just format the emoticon
                    replace_by = '[#{}#]'.format(emoticon)
                text = re.sub(emoticon, replace_by, text)
        # Replace newline symbols with whitespace
        if re.search(r'\[NEWLINE\]', text):
            text = re.sub(r'\[NEWLINE\]', ' ', text)
        # Split the string and analyze each token separately
        result = []
        tokens = text.split()
        for token in tokens:
            # Token is a word
            if token.isalpha():
                result.append((token, 'word'))
            # Token is a special symbol, eg [#SYMBOL#]
            elif re.fullmatch(r'^\[#(.*)#\]$', token):
                if token.lower() == '[#triggerword#]':
                    result.append(('<TRIGGERWORD>', 'triggerword'))
                else:
                    # Should be an emoticon
                    result.append((token[2:-2], 'emoticon'))
            # Token is a removed url
            elif token.lower() == 'http://url.removed':
                result.append(('<URL>', 'url'))
            # Look for punctuation, emojis or whitespace
            else:
                new_token = ''
                for char in token:
                    # Character is punctuation
                    if char in self.punct:
                        # Add preceding characters to the results, if any
                        if new_token:
                            result += self.add_new_token(new_token, replace_num, stem, addit_mode)
                            new_token = ''
                        # Keep # and @ attached to the token; add other symbols to the result
                        if char == '#' or char == '@':
                            new_token += char
                        elif not remove_punct:
                            result.append((char, 'punctuation'))
                    # Character is a common emoji
                    elif char in emoji_to_label.keys():
                        # Add preceding characters to the results, if any
                        if new_token:
                            result += self.add_new_token(new_token, replace_num, stem, addit_mode)
                            new_token = ''
                        # Add the emoji to the results
                        if replace_emojis:
                            emoji_text = emoji_to_label[char]  # label, surrounded by colons
                            result.append((emoji_text[1:-1], 'emoji'))
                            if addit_mode:
                                result.append((char, 'emoji'))
                        else:
                            result.append((char, 'emoji'))
                    # Character is an uncommon emoji
                    elif char in emoji.UNICODE_EMOJI.keys():
                        # Add preceding characters to the results, if any
                        if new_token:
                            result += self.add_new_token(new_token, replace_num, stem, addit_mode)
                            new_token = ''
                        # Add the emoji
                        if replace_emojis:
                            emoji_text = emoji.demojize(char)  # label, surrounded by colons
                            result.append((emoji_text[1:-1], 'emoji'))
                            if addit_mode:
                                result.append((char, 'emoji'))
                        else:
                            result.append((char, 'emoji'))
                    # Character is alpha-numerical
                    else:
                        new_token += char
                # Final check after the loop
                if new_token:
                    result += self.add_new_token(new_token, replace_num, stem, addit_mode)
        # Remove stopwords
        if remove_stopw:
            result = self.remove_stopwords(result)
        return result

    def add_new_token(self, token, replace_num, stem, addit):
        """
        Returns a list with 1 or 2 elements: a tuple with the new token and the
        label (type) of the token. If the arguments replace_num or stem are
        True, additional token elements are added.
        """
        new_tokens = []
        if token[0] == '#':
            new_tokens.append((token, 'hashtag'))
        elif token[0] == '@':
            new_tokens.append((token, 'username'))
        elif token.isalpha():
            if stem:
                stemmed_token = self.stemmer.stem(token)
                if stemmed_token and stemmed_token != token:
                    new_tokens.append((stemmed_token, 'word'))
                    if addit:
                        new_tokens.append((token, 'word'))
                else:
                    # Stemming left the token unchanged; keep the original word
                    new_tokens.append((token, 'word'))
            else:
                new_tokens.append((token, 'word'))
        elif token.isnumeric():
            if replace_num:
                new_tokens.append(('<NUM>', 'numeric'))
                if addit:
                    new_tokens.append((token, 'numeric'))
            else:
                new_tokens.append((token, 'numeric'))
        else:
            new_tokens.append((token, 'other'))  # Shouldn't happen, but to be safe
        return new_tokens

    def remove_stopwords(self, word_list):
        processed_word_list = []
        for word in word_list:
            if word[1] == 'word' and word[0].lower() not in self.stopwords:
                processed_word_list.append(word)
            elif word[1] != 'word':
                processed_word_list.append(word)
        return processed_word_list

    def get_terms(self, text,
                  lowercase=False,
                  stem=False,
                  replace_emojis=False,
                  replace_num=False,
                  remove_stopw=False,
                  remove_punct=False):
        tokens = self.get_tokens(text, lowercase, stem, replace_emojis,
                                 replace_num, remove_stopw, remove_punct)
        unique_tokens = []  # list of strings
        terms = []  # list of tuples
        for token in tokens:
            if token[0] not in unique_tokens:
                unique_tokens.append(token[0])
                terms.append(token)
        return terms

    def get_stems(self, tokens):
        stemmed_tokens = []
        for token in tokens:
            if token[1] == 'word':
                new_token = self.stemmer.stem(token[0])
                if new_token:
                    stemmed_tokens.append((new_token, 'word'))
                else:
                    stemmed_tokens.append(token)
            else:
                stemmed_tokens.append(token)
        return stemmed_tokens


if __name__ == '__main__':
    tokenizer = Tokenizer()
    test_1 = 'HeLlO\t, WoRld! I\'m Tired of lo/sers <33333 1984 :)))) [NEWLINE] >:\\ 🤠 🙂 😃😄😆😍'
    test_2 = "much♡[NEWLINE]•2 … …texting&driving he's @USERNAME works. A[NEWLINE][NEWLINE]As Mom:\"its pretty done."
    test_3 = '#WeLoveYouJackson[NEWLINE]#ItsOnlyGOT7'
    test_4 = '#Love#Love @user'
    test_5 = '’…”“£—@→♡°⁎«\\|/'
    test_6 = 'fu*kers'

    choice = test_1
    tokens = tokenizer.get_tokens(choice,
                                  lowercase=False,
                                  stem=True,
                                  replace_emojis=True,
                                  replace_num=True,
                                  remove_stopw=False,
                                  remove_punct=False,
                                  addit_mode=False)
    print('Original input: {}\n'.format(choice))
    print('Tuple tokens ({}):\n{}\n'.format(len(tokens), tokens))
    print('String tokens:')
    print('"{}"\n'.format('" "'.join(t[0] for t in tokens)))