from typing import Any, List, Tuple

import gensim.downloader
import numpy as np
from gensim import corpora
from gensim.models import LdaModel
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer

from preprocess import Preprocessor


class SimilarityModel:
    def __init__(self, embedding_type, dataset=None, ngram_range=(1, 1), word2vec_type='word2vec-google-news-300'):
        """Perform textual relevance calculation.

        SimilarityModel calculates the average similarity between a
        target document and a list of related documents. The higher the
        value, the more similar the two documents are likely to be. Two
        kinds of similarity metrics are provided: cosine similarity and
        word matching.

        Parameters
        ----------
        embedding_type : string
            Must be 'tfidf' or 'word2vec'
        dataset : array_like (optional, default=None)
            Array of shape (n_samples, n_words). Each sample is a tokenised
            list of words. If embedding_type is 'word2vec', dataset is ignored.
        ngram_range : tuple_like (optional, default=(1, 1))
            The n-gram range passed to the TF-IDF vectorizer
        word2vec_type : string (optional, default='word2vec-google-news-300')
            Must be a valid gensim word2vec model. Only required when
            using 'word2vec' as embedding_type

        Returns
        -------
        tr : SimilarityModel
            A textual relevance instance

        Methods
        -------
        cosine_dist(prediction, contexts)
            Calculate the average cosine similarity of the prediction document
            and all the context documents (min: 0, max: 1)
        word_appearance(prediction, contexts)
            Calculate the average fraction of unique context-document words
            that also appear in the prediction document
        matching_score(prediction, contexts)
            Calculate the average TF-IDF-weighted word match of the prediction
            document and all the context documents

        Examples
        --------
        >>> pp = Preprocessor()
        >>> ds = DatasetLoader().load_fakenewsnet(drop_if_less_than_num_contexts=3)
        >>> df = ds.as_pandas()
        >>> df["content"] = df["content"].apply(pp.preprocess_and_tokenize)
        >>> tfidf = SimilarityModel('tfidf', df.content)
        >>> tfidf.cosine_dist(df['content'].iloc[0], [
        ...     pp.preprocess_and_tokenize(df['ctx2_content'].iloc[0]),
        ...     pp.preprocess_and_tokenize(df['ctx3_content'].iloc[0])])
        0.19219322243197196

        References
        ----------
        Alsuliman, F., Bhattacharyya, S., Slhoub, K., Nur, N. and Chambers,
        C.N., 2022, June. Social Media vs. News Platforms: A Cross-analysis for
        Fake News Detection Using Web Scraping and NLP. In Proceedings of the
        15th International Conference on PErvasive Technologies Related to
        Assistive Environments (pp. 190-196).
        """
        if embedding_type not in ['tfidf', 'word2vec']:
            raise ValueError(f'Word embedding type {embedding_type} is invalid. Valid types are tfidf and word2vec')
        self.embedding_type = embedding_type
        if self.embedding_type == 'tfidf':
            self.vectorizer = self.__get_embedding_tfidf(dataset, ngram_range)
        else:
            # Model must be in list(gensim.downloader.info()['models'].keys())
            self.__word2vec = gensim.downloader.load(word2vec_type)
            self.__word2vec_dim = gensim.downloader.info()['models'][word2vec_type]['parameters']['dimension']
            self.vectorizer = self.__get_embedding_word2vec()

    def __get_embedding_tfidf(self, dataset, ngram_range):
        '''Train an sklearn TfidfVectorizer on a tokenised dataset'''
        # Experiment with different TfidfVectorizer parameters here or use literature review results
        vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, ngram_range=ngram_range)
        vectorizer.fit(dataset)
        # https://stackoverflow.com/questions/24440332/numpy-scipy-sparse-matrix-to-vector
        return lambda docs: [mat.toarray().ravel() for mat in vectorizer.transform(docs)]
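
    # A hedged illustration (hypothetical tokens): after fitting, the returned
    # callable (assigned to self.vectorizer) maps a list of tokenised
    # documents to a list of dense 1-D numpy arrays, one per document, each of
    # vocabulary length:
    #
    #   vecs = self.vectorizer([['market', 'falls'], ['market']])
    #   len(vecs) == 2 and vecs[0].ndim == 1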

    def __get_embedding_word2vec(self):
        '''Return a lambda function to convert a tokenised dataset into matrices of word2vec'''
        return lambda docs: [self.__get_average_word2vec_vector(doc) for doc in docs]

    def __get_average_word2vec_vector(self, words):
        '''Reduce word2vec tokenised text (words) to an average vector

        Reference
        ---------
        Sitikhu, P., Pahi, K., Thapa, P. and Shakya, S., 2019, November.
        A comparison of semantic similarity methods for maximum human interpretability.
        In 2019 artificial intelligence for transforming business and society (AITB)
        (Vol. 1, pp. 1-4). IEEE.
        https://arxiv.org/pdf/1910.09129v1.pdf
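
        The document vector is the mean over the input words; out-of-vocabulary
        words contribute nothing to the sum but still count in the denominator:

            doc_vec = sum(word2vec[w] for w in words if w in word2vec) / len(words)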
        '''
        val = np.zeros(self.__word2vec_dim)
        for word in words:
            try:
                val += self.__word2vec[word]
            except KeyError:
                # Vectors of non-existent words are ignored
                continue
        # To calculate the document vector we take the average of all vectors.
        # Guard against an empty document to avoid division by zero.
        if len(words) == 0:
            return val
        return val / len(words)

    def cosine_dist(self, prediction, contexts):
        """Calculate the average cosine similarity of the prediction
        document against its context documents.

        Parameters
        ----------
        prediction : array_like
            Array of shape (n_words, ). The document to compare against its
            contexts
        contexts : array_like
            Array of shape (n_samples, n_words). Each sample is a tokenised
            list of words. Each sample is a context document of the prediction

        Returns
        -------
        dist : float
            average cosine similarity (i.e. 1 minus cosine distance), between
            0 and 1, of prediction against all contexts. 0 means there is no
            relationship between the two vectors, 1 means the two vectors are
            the same
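
        Examples
        --------
        A minimal sketch, assuming ``tfidf`` was fitted as in the class
        docstring (the token lists below are hypothetical):

        >>> tfidf.cosine_dist(['markets', 'fall'],
        ...                   [['markets', 'fall', 'sharply'],
        ...                    ['economy', 'slows']])  # doctest: +SKIP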
"""
predict_vec = self.vectorizer([prediction])[0]
context_vecs = self.vectorizer(contexts)
# https://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists
return np.mean([1 - spatial.distance.cosine(predict_vec, context_vec) for context_vec in context_vecs])

    def word_appearance(self, prediction, contexts):
        '''Calculate word appearance

        Count the unique prediction words that appear in a context document,
        divided by the total number of unique words in that context document.

        Parameters
        ----------
        prediction : array_like
            Array of shape (n_words, ). The document to compare against its
            contexts
        contexts : array_like
            Array of shape (n_samples, n_words). Each sample is a tokenised
            list of words. Each sample is a context document of the prediction

        Returns
        -------
        dist : float
            average word appearance of the prediction doc against each context
            doc. 0 means the documents share no words, 1 means every unique
            context word also appears in the prediction
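
        Examples
        --------
        A worked sketch with hypothetical tokens (this metric is pure set
        arithmetic and does not use the vectorizer): two of the four unique
        context words appear in the prediction, giving 2 / 4 = 0.5:

        >>> sm.word_appearance(['a', 'b', 'c'], [['b', 'c', 'd', 'e']])  # doctest: +SKIP
        0.5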
        '''
        unique_predict_words = set(prediction)
        out = []
        for context in contexts:
            unique_context_words = set(context)
            common_words = unique_predict_words.intersection(unique_context_words)
            out.append(len(common_words) / len(unique_context_words))
        return np.mean(out)

    def matching_score(self, prediction, contexts):
        '''Calculate the L1 norm of the TF-IDF vector of the unique words
        common to the prediction doc and a context doc, divided by the L1
        norm of the TF-IDF vector of the unique words in that context
        document. Note: this score is only meaningful with the TF-IDF
        vectorizer.

        Parameters
        ----------
        prediction : array_like
            Array of shape (n_words, ). The document to compare against its
            contexts
        contexts : array_like
            Array of shape (n_samples, n_words). Each sample is a tokenised
            list of words. Each sample is a context document of the prediction

        Returns
        -------
        dist : float
            average matching score of the prediction doc against each context
            doc. 0 means the documents share no words, 1 means every unique
            context word also appears in the prediction
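
        Examples
        --------
        A minimal sketch, assuming a TF-IDF SimilarityModel fitted as in the
        class docstring (token lists are hypothetical):

        >>> tfidf.matching_score(['markets', 'fall'],
        ...                      [['markets', 'fall', 'sharply']])  # doctest: +SKIP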
        '''
        unique_predict_words = set(prediction)
        out = []
        for context in contexts:
            unique_context_words = set(context)
            common_words = unique_predict_words.intersection(unique_context_words)
            # Assumes a TF-IDF vectorizer with vector values >= 0. Each word
            # set is vectorised as a single document, not word by word.
            out.append(np.sum(self.vectorizer([list(common_words)])[0])
                       / np.sum(self.vectorizer([list(unique_context_words)])[0]))
        return np.mean(out)
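

# A hedged usage sketch for the word2vec variant (the pretrained
# 'word2vec-google-news-300' vectors are downloaded by gensim on first use):
#
#   w2v = SimilarityModel('word2vec')
#   w2v.cosine_dist(['markets', 'fall'], [['markets', 'fall', 'sharply']])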


class LDASummaryExtractor:
    def __init__(self):
        # Punctuation is removed since we don't want punctuation to be a possible keyword.
        self.preprocessor = Preprocessor(remove_punctuation=True)

    def extract_main_themes(self, content: str,
                            num_words=10,
                            num_topics=1,
                            ignore_numerical_keywords=True) -> Tuple[List[Any], str]:
        """Extract the main themes out of the content in a news article.

        Args:
            content (str): The news article
            num_words (int, optional): The number of keywords to consider per topic. Defaults to 10.
            num_topics (int, optional): The number of topics to consider. Defaults to 1.
            ignore_numerical_keywords (bool, optional): Whether to skip
                keywords which are just numbers. Keywords which are just dates
                or money values can sometimes not be useful. Defaults to True.

        Returns:
            Tuple[List[Any], str]: A tuple consisting of the main themes
            extracted from the content and the string summary of length
            num_words of the topics.
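
        Example:
            A hedged sketch (``article_text`` is any news article string;
            the extracted keywords depend on the LDA fit)::

                themes, summary = LDASummaryExtractor().extract_main_themes(article_text)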
"""
        documents = list(map(self.preprocessor.preprocess_and_tokenize, content.split("\n")))
        dictionary = corpora.Dictionary(documents)
        corpus = list(map(dictionary.doc2bow, documents))
        lda_model = LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=20,
            # Reproducible
            random_state=42
        )
        main_themes = lda_model.show_topics(
            num_topics=num_topics,
            num_words=num_words,
            formatted=False
        )
        all_keywords: List[Tuple[str, float]] = []
        for _topic_id, words in main_themes:
            all_keywords.extend(words)
        # Sort by decreasing probability
        all_keywords.sort(key=lambda p: p[1], reverse=True)
        # Deduplicate keywords from all topics and select by probability,
        # keeping at most num_words words.
        keywords = []
        for word, _proba in all_keywords:
            if word in keywords:
                # Deduplication
                continue
            if ignore_numerical_keywords and word.isnumeric():
                continue
            keywords.append(word)
            if len(keywords) >= num_words:
                break
        return main_themes, " ".join(keywords)
if __name__ == "__main__":
import sys
print("Text to extract context from: ", end="", flush=True)
content = sys.stdin.read()
main_themes, summary_string = LDASummaryExtractor().extract_main_themes(content)
print(main_themes)
print(summary_string)