tokenizer.py
import spacy
from gensim.utils import simple_preprocess as spp
from gensim.parsing.preprocessing import STOPWORDS

# Load the spaCy model once at import time rather than on every call;
# the parser and NER components are disabled since only lemmas are needed.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def tokenize(msg):
    """Lemmatize a message with spaCy, then tokenize it with gensim."""
    lemmatized = " ".join(token.lemma_ for token in nlp(msg))
    # simple_preprocess lowercases, strips accents (deacc=True), and
    # drops tokens that are too short or too long.
    return spp(lemmatized, deacc=True)

def clean(tokens):
    """Drop gensim's built-in English stopwords from a token list."""
    return [t for t in tokens if t not in STOPWORDS]

def find_topic(msg, model, dic):
    """Return the ID of the most probable LDA topic for a message."""
    tokens = clean(tokenize(msg))
    bow = dic.doc2bow(tokens)           # bag-of-words vector for the message
    topic_probs = model[bow]            # (topic_id, probability) pairs
    topics = sorted(topic_probs, key=lambda x: x[1], reverse=True)
    return str(topics[0][0])            # ID of the highest-probability topic
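

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the original module. It shows
# one way the `model` and `dic` arguments expected by find_topic() might be
# built, using gensim's Dictionary and LdaModel. The toy documents, topic
# count, and training parameters below are illustrative assumptions only.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from gensim.corpora import Dictionary
    from gensim.models.ldamodel import LdaModel

    docs = [
        "The cat sat on the mat and chased the mouse.",
        "Stocks rallied as the market reacted to interest rate news.",
        "The dog barked at the cat in the garden.",
    ]

    # Preprocess every document with the same pipeline used at query time.
    texts = [clean(tokenize(d)) for d in docs]
    dic = Dictionary(texts)                    # token -> id mapping
    corpus = [dic.doc2bow(t) for t in texts]   # bag-of-words training corpus

    # Train a small LDA model; num_topics and passes are arbitrary here.
    model = LdaModel(corpus=corpus, id2word=dic, num_topics=2,
                     passes=10, random_state=42)

    print(find_topic("My dog loves chasing cats", model, dic))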