preprocessing.py
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # For sentiment analysis
import pickle  # For loading the dataset from a pickle file (cPickle on Python 2)
import tqdm  # Progress bar
from collections import Counter  # For counting assigned sentiments
from pprint import pprint  # Useful for printing JSON-like objects
import numpy as np
from joblib import Memory
memory = Memory(location='/tmp/joblib')  # 'location' replaces the deprecated 'cachedir' argument
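# Note: results of functions decorated with @memory.cache (see get_A_and_labels below)
# are persisted under /tmp/joblib, keyed by the function arguments, so repeated runs
# with the same topic skip the expensive preprocessing.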
def load_data():
    # Load the pickled dataset; this might take several seconds (up to a minute)
    with open("news_sentiment.pickle", "rb") as f:
        articles = pickle.load(f)
    # print(len(articles), "articles were loaded")
    # print("Example article:")
    # pprint(articles[1040])
    return articles
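# Minimal sketch of the article structure this script assumes, inferred from the fields
# accessed below (the actual pickle may contain additional keys):
#
#   {
#       "news_topic": "ISIS War",   # or the Brexit topic
#       "source": "some_outlet",    # news outlet identifier
#       "introductions": [          # entity introductions extracted from the article text
#           {"person": "Some Entity", "text": "Sentence introducing the entity ..."},
#       ],
#   }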
def get_isis():
    print("get_isis")
    articles = load_data()
    # Separate articles from the two stories
    ISIS_articles = []
    Brexit_articles = []
    for a in articles:
        if a["news_topic"] == 'ISIS War':
            ISIS_articles.append(a)
        else:
            Brexit_articles.append(a)
    # print(len(ISIS_articles), "articles from ISIS War and", len(Brexit_articles), "articles from Brexit were loaded")
    return ISIS_articles
def get_brexit():
    print("get_brexit")
    articles = load_data()
    # Separate articles from the two stories
    ISIS_articles = []
    Brexit_articles = []
    for a in articles:
        if a["news_topic"] == 'ISIS War':
            ISIS_articles.append(a)
        else:
            Brexit_articles.append(a)
    # print(len(ISIS_articles), "articles from ISIS War and", len(Brexit_articles), "articles from Brexit were loaded")
    return Brexit_articles
def get_intros(topic_articles):
    print("get_intros")
    analyzer = SentimentIntensityAnalyzer()
    total_introductions = []
    for a in topic_articles:
        for intro in a.get('introductions', []):  # get the intros from each article
            intro['source'] = a['source']
            total_introductions.append(intro)  # collect intros across all articles
    # Run the VADER sentiment analyzer to get a compound sentiment value for each text
    # (tqdm.tqdm works in scripts; the original tqdm_notebook only works inside notebooks)
    for intro in tqdm.tqdm(total_introductions):
        intro['sentiment'] = analyzer.polarity_scores(intro['text'])['compound']
    return total_introductions
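# Quick sanity check of the VADER compound score used above (values lie in [-1, 1]);
# the sentences are illustrative examples, not taken from the dataset:
#
#   analyzer = SentimentIntensityAnalyzer()
#   analyzer.polarity_scores("The ceasefire brought welcome relief.")["compound"]   # positive
#   analyzer.polarity_scores("The attack killed dozens of civilians.")["compound"]  # negative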
def print_some_stuff(introductions):
    # Show the sentiment for a sample of the introductions
    subsample = np.random.choice(introductions, 100)
    for intro in subsample:
        if intro['sentiment'] != 0:  # print a few <entity, apposition, sentiment> triples where sentiment != 0
            print("---------------")
            print("Entity mentioned:", intro['person'])
            print(intro['text'])
            print("Sentiment:", intro['sentiment'])
def build_src_actor_dict(introductions):
    print("build_src_actor_dict")
    ent_source_sent = {}
    for intro in introductions:
        p = intro['person']
        s = intro['source']
        if p not in ent_source_sent:
            ent_source_sent[p] = {}  # allocate the nested dict for this entity
        if s not in ent_source_sent[p]:
            ent_source_sent[p][s] = []  # allocate the sentiment list for this (entity, source) pair
        ent_source_sent[p][s].append(intro['sentiment'])
    # print(ent_source_sent['Aleppo'])  # sentiments across sources for the single entity 'Aleppo'
    return ent_source_sent
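# Sketch of the mapping built above; the source names and values are made up for illustration:
#
#   ent_source_sent = {
#       "Aleppo": {"cnn": [-0.42, 0.0], "bbc": [-0.31]},
#       "Boris Johnson": {"guardian": [0.12, -0.05]},
#   }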
def filter_src_actor_dict(ent_source_sent):
    print("filter_src_actor_dict")
    # Drop entities that don't have enough data
    entities_kept = []
    for entity in ent_source_sent.keys():
        sentiments = ent_source_sent[entity]  # sentiments across all sources for this entity
        total_size = sum([len(sentiments[source]) for source in sentiments.keys()])
        if total_size >= 3:  # only keep entities with at least 3 sentiment values in total
            entities_kept.append(entity)
    # print("We will keep a total of", len(entities_kept), "/", len(ent_source_sent.keys()), "entities in our dataset")
    sources = set([])
    for entity in entities_kept:
        sources |= set(ent_source_sent[entity].keys())
    sources = list(sources)
    actor_per_source = {source: 0 for source in sources}
    total = 0
    for entity in entities_kept:
        for source in ent_source_sent[entity].keys():
            actor_per_source[source] += 1
            total += 1
    print("fraction of (actor, source) pairs per source")
    for source in actor_per_source:
        print(source, actor_per_source[source] / float(total))
    print("num_actors =", len(entities_kept), "num_sources =", len(sources))
    return entities_kept, sources
def build_algo_input(ent_source_sent, entities_kept, sources):
    print("build_algo_input")
    # Convert the sentiments for a given (actor, source) pair into a single aggregate
    # sentiment, discretized to +1, 0, or -1
    # Parameters: changing these affects the results you get
    Pos_neg_ratio = 2.0
    overall_ratio = 0.15
    pos_threshold = 0.15
    neg_threshold = -0.15
    N = len(entities_kept)
    M = len(sources)
    A = np.zeros((N, M))
    sentiment_counts = Counter()
    source2j = {source: j for j, source in enumerate(sources)}
    for i, entity in enumerate(entities_kept):
        for source in ent_source_sent[entity].keys():
            sent_array = np.array(ent_source_sent[entity][source])
            N_pos = float(len(np.where(sent_array > pos_threshold)[0]))  # count sentiments that are positive enough
            N_neg = float(len(np.where(sent_array < neg_threshold)[0]))  # count sentiments that are negative enough
            T = float(len(sent_array))
            aggregate_sentiment = 0
            if N_pos > Pos_neg_ratio * N_neg and N_pos > overall_ratio * T:
                aggregate_sentiment = 1
            elif N_neg > Pos_neg_ratio * N_pos and N_neg > overall_ratio * T:
                aggregate_sentiment = -1
            j = source2j[source]
            A[i, j] = aggregate_sentiment
            sentiment_counts[aggregate_sentiment] += 1  # keeps track of how many +1 / 0 / -1 were assigned
    # print("We allocated some sentiment in this matrix, the repartition is:", sentiment_counts)
    return A
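# Worked example of the aggregation rule above, with the default thresholds
# (the sentiment values are made up for illustration):
#
#   sent_array = [0.6, 0.2, -0.1, 0.0]  ->  N_pos = 2, N_neg = 0, T = 4
#   N_pos > 2.0 * N_neg (2 > 0) and N_pos > 0.15 * T (2 > 0.6)  ->  aggregate_sentiment = +1
#
#   sent_array = [0.2, -0.2]            ->  N_pos = 1, N_neg = 1, T = 2
#   neither side dominates by a factor of 2.0                   ->  aggregate_sentiment = 0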
def get_A(topic="isis"):
    get_f = get_isis if topic == "isis" else get_brexit
    global_ent_source_sent = build_src_actor_dict(get_intros(get_f()))
    global_entities_kept, global_sources = filter_src_actor_dict(global_ent_source_sent)
    global_A = build_algo_input(global_ent_source_sent, global_entities_kept, global_sources)
    return global_A
@memory.cache
def get_A_and_labels(topic="isis"):
    get_f = get_isis if topic == "isis" else get_brexit
    ent_source_sent = build_src_actor_dict(get_intros(get_f()))
    entities_kept, sources = filter_src_actor_dict(ent_source_sent)
    A = build_algo_input(ent_source_sent, entities_kept, sources)
    return A, sources
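# Usage sketch for the cached variant (the first call runs the full pipeline; later
# calls with the same topic are served from the joblib cache):
#
#   A, sources = get_A_and_labels("brexit")
#   print(A.shape, len(sources))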
if __name__ == "__main__":
    print(get_A("isis").shape)
    print(get_A("brexit").shape)