-
Notifications
You must be signed in to change notification settings - Fork 11
/
utils.py
61 lines (46 loc) · 1.61 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""
Some utilities.
"""
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def preprocess(tweet, ascii=True, ignore_rt_char=True, ignore_url=True,
               ignore_mention=True, ignore_hashtag=True,
               letter_only=True, remove_stopwords=True, min_tweet_len=3):
    """Normalize a raw tweet into a space-joined string of kept tokens.

    The tweet is lower-cased and split on whitespace; every token is then
    kept, dropped, or rewritten according to the flags below.

    Args:
        tweet: Raw tweet text.
        ascii: If true, return '' as soon as the tweet contains any
            character whose code point is outside the range 1..126.
        ignore_rt_char: If true, drop the token 'rt'.
        ignore_url: If true, drop tokens starting with 'http:' or 'https:'.
        ignore_mention: If true, drop tokens starting with '@'.
        ignore_hashtag: If true, drop tokens starting with '#'.
        letter_only: If true, keep only tokens for which ``str.isalpha()``
            holds. If false, tokens made only of digits are replaced by
            the placeholder '<num>'.
        remove_stopwords: If true, drop tokens found in NLTK's English
            stop-word list.
        min_tweet_len: Minimum number of kept tokens; a falsy value
            disables the check.

    Returns:
        The kept tokens joined by single spaces, or '' when the tweet is
        rejected (non-ASCII content or too few tokens left).
    """
    if ascii and any(not 0 < ord(char) < 127 for char in tweet):
        return ''

    # Load the corpus only when it is actually needed, and keep it in a
    # frozenset so each membership test is O(1) instead of a list scan.
    if remove_stopwords:
        stop_set = frozenset(stopwords.words('english'))
    else:
        stop_set = frozenset()

    kept = []
    for token in tweet.lower().split():
        if token in stop_set:
            continue
        if ignore_rt_char and token == 'rt':
            continue
        # Match both schemes; checking only 'https:' let plain http links
        # through whenever letter_only was disabled.
        if ignore_url and token.startswith(('http:', 'https:')):
            continue
        if ignore_mention and token.startswith('@'):
            continue
        if ignore_hashtag and token.startswith('#'):
            continue
        if letter_only:
            if not token.isalpha():
                continue
        elif token.isdigit():
            token = '<num>'
        kept.append(token)

    if min_tweet_len and len(kept) < min_tweet_len:
        return ''
    return ' '.join(kept)
def get_tfidf(tweet_list, top_n, max_features=5000):
    """Return the ``top_n`` vocabulary terms with the highest IDF weights.

    A ``TfidfVectorizer`` limited to ``max_features`` terms is fitted on
    ``tweet_list`` and its learned ``idf_`` vector is ranked in descending
    order.

    Args:
        tweet_list: Documents passed to ``TfidfVectorizer.fit``
            (presumably an iterable of tweet strings -- confirm with callers).
        top_n: Number of top-ranked terms to return.
        max_features: Vocabulary size cap forwarded to the vectorizer.

    Returns:
        A ``(names, idf_scores)`` pair: a list of feature names and the
        matching slice of the ``idf_`` array, both ordered from highest to
        lowest IDF.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    # Only the fitted vocabulary and idf_ are used, so the transformed
    # matrix does not need to be computed.
    vectorizer.fit(tweet_list)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    name_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = vectorizer.get_feature_names
    features = name_getter()

    top_indices = np.argsort(vectorizer.idf_)[::-1][:top_n]
    top_feature_name = [features[i] for i in top_indices]
    top_feature_idf = vectorizer.idf_[top_indices]
    return top_feature_name, top_feature_idf