TweetPreProcessing.py
import json
import re
import operator
import configparser
import math
from collections import Counter
from collections import defaultdict
import nltk
#nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import bigrams
import string
#from nltk.tokenize import word_tokenize
config = configparser.ConfigParser()
config.read('TwitterSentimentDetector.ini')
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
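# For reference: emoticon_re matches a token that is entirely an emoticon,
# e.g. ':)', ';-D' or '=(', while tokens_re splits a tweet into emoticons,
# HTML tags, @-mentions, hash-tags, URLs, numbers and words.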
punctuation = list(string.punctuation)
numbers = list(string.digits)
articles = ["a", "A", "The", "the"]
stop = stopwords.words('english') + punctuation + articles + numbers + ['2017', '2018'] + ['rt','RT', "we're", 'via']
def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
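# A quick illustration of the tokenizer (example input assumed, not read from the data):
#   preprocess("RT @marcobonzanini: just an example! :D http://example.com #NLP", lowercase=True)
#   -> ['rt', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#nlp']
# Emoticons such as ':D' keep their case because emoticon_re excludes them from lowercasing.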
fname = config['DEFAULT']['tweet_file']
# co-occurrence matrix
com = defaultdict(lambda : defaultdict(int))
# document frequency of terms not including stop words
count_stop_single = Counter()
# document frequency of terms including stop words
count_single = Counter()
tweetCount = 0.0
with open(fname, 'r') as f:
    for line in f:
        tweetCount = tweetCount + 1
        tweet = json.loads(line)  # load it as a Python dict
        # Create a list with all the terms
        terms_all = [term for term in preprocess(tweet['text'])]
        # Count each term only once per tweet, equivalent to Document Frequency
        count_single.update(set(terms_all))
        # Count hashtags only
        #terms_hash = [term for term in preprocess(tweet['text'])
        #              if term.startswith('#')]
        # Count terms only (no hashtags, no mentions)
        terms_only = [term for term in preprocess(tweet['text'])
                      if term not in stop and
                      not term.startswith(('#', '@'))]
        # note the double parentheses: startswith() takes a tuple
        # (not a list) when we pass several prefixes at once
        terms_stop = [term for term in preprocess(tweet['text']) if term not in stop]
        # Update the Counter (again once per tweet, so it stays a document frequency)
        count_stop_single.update(set(terms_stop))
        # The bigrams() function from NLTK will take a list of tokens
        # and produce a list of tuples using adjacent tokens
        #terms_bigram = bigrams(terms_stop)  # have to cast the bigrams as a list
        #print(list(terms_bigram))
        # Build co-occurrence matrix
        for i in range(len(terms_only)-1):
            for j in range(i+1, len(terms_only)):
                w1, w2 = sorted([terms_only[i], terms_only[j]])
                if w1 != w2:
                    com[w1][w2] += 1
#print(count_stop_single.most_common(5))
# no explicit close needed: the with statement closes the file
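# Illustrative trace of the loop above (toy values, not from the data): if a tweet
# yields terms_only == ['good', 'movie', 'night'], the nested loops add one count to
# com['good']['movie'], com['good']['night'] and com['movie']['night'], i.e. one
# count per unordered pair of distinct terms within the same tweet.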
# p_t is the probability of the term occurring in a document
p_t = {}
# p_t_com is the probability of two terms occurring together in a document
p_t_com = defaultdict(lambda : defaultdict(int))
#
for term, n in count_stop_single.items():
    p_t[term] = n / tweetCount
    for t2 in com[term]:
        p_t_com[term][t2] = com[term][t2] / tweetCount
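# Worked example (assumed numbers): if 'movie' appears in 3 of 10 tweets then
# p_t['movie'] == 0.3, and if 'good' and 'movie' appear together in 2 of those
# tweets then p_t_com['good']['movie'] == 0.2 (the pair is stored under the
# alphabetically first term, as in com).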
fname = config['DEFAULT']['positive_words']
positive_vocab = []
with open(fname, 'r') as f:
    positive_vocab = f.read().splitlines()
fname = config['DEFAULT']['negative_words']
negative_vocab = []
with open(fname, 'r') as f:
    negative_vocab = f.read().splitlines()
# pmi is the Pointwise Mutual Information of two terms: how much more often
# they co-occur than expected by chance; it is used below to measure how
# close a term is to seed words like "good" or "bad"
pmi = defaultdict(lambda : defaultdict(int))
for t1 in p_t:
    for t2 in com[t1]:
        denom = p_t[t1] * p_t[t2]
        pmi[t1][t2] = math.log((p_t_com[t1][t2] / denom), 2)
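# In other words: PMI(t1, t2) = log2( P(t1, t2) / (P(t1) * P(t2)) ), with the
# probabilities estimated from the per-tweet frequencies above; a positive value
# means t1 and t2 co-occur more often than they would if they were independent.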
semantic_orientation = {}
for term in p_t:
    positive_assoc = sum(pmi[term][tx] for tx in positive_vocab)
    negative_assoc = sum(pmi[term][tx] for tx in negative_vocab)
    semantic_orientation[term] = positive_assoc - negative_assoc
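# The loop above computes the semantic orientation
#   SO(t) = sum_p PMI(t, p) - sum_n PMI(t, n)
# over the positive and negative seed vocabularies; since pmi is a
# defaultdict of defaultdict(int), pairs that never co-occur simply contribute 0.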
semantic_sorted = sorted(semantic_orientation.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
top_pos = semantic_sorted[:10]
top_neg = semantic_sorted[-10:]
print(top_pos)
print(top_neg)
# Trump Tweets
#print("Jerusalem: ", semantic_orientation["Jerusalem"])
#print("Trump: ", semantic_orientation["Trump"])
#print("US: ", semantic_orientation["US"])
# Happy Tweets