forked from oguuzhansahin/turkish-news-summarization
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tf_idf.py
177 lines (117 loc) · 4.52 KB
/
tf_idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import math
import pandas as pd
from nltk import (
sent_tokenize,
word_tokenize,
PorterStemmer
)
from nltk.corpus import stopwords
# Load the input table; the "Metin" column is the text summarised further down.
df = pd.read_csv("url_metin.csv")
# Keep only rows whose text contains the letter "a". `na=False` makes missing
# values count as "no match" directly instead of relying on `NaN == True`
# evaluating to False.
# NOTE(review): presumably a cheap way to drop empty / non-text rows -- confirm.
df = df[df["Metin"].str.contains("a", na=False)]
# Renumber the surviving rows from 0 and discard the old index rather than
# materialising it as an "index" column and dropping it again.
df = df.reset_index(drop=True)
haberler = df["Metin"].tolist()
#%%
# Count the word frequencies in each sentence and generate the frequency matrix
def _create_frequency_matrix(sentences):
    """Build a per-sentence table of lower-cased token counts.

    :param sentences: iterable of sentence strings.
    :return: dict mapping each sentence string to a dict of
        ``{lower-cased token: occurrences in that sentence}``. Sentences with
        identical text share a single entry (the last one processed wins).
    """
    frequency_matrix = {}
    for sentence in sentences:
        token_counts = {}
        for token in word_tokenize(sentence):
            key = token.lower()
            token_counts[key] = token_counts.get(key, 0) + 1
        frequency_matrix[sentence] = token_counts
    return frequency_matrix
# Calculate the term frequency (TF) of every word and generate the TF matrix
def _create_tf_matrix(freq_matrix):
tf_matrix = {}
for sent, f_table in freq_matrix.items():
tf_table = {}
count_words_in_sentence = len(f_table)
for word, count in f_table.items():
tf_table[word] = count / count_words_in_sentence
tf_matrix[sent] = tf_table
return tf_matrix
def _create_documents_per_words(freq_matrix):
word_per_doc_table = {}
for sent, f_table in freq_matrix.items():
for word, count in f_table.items():
if word in word_per_doc_table:
word_per_doc_table[word] += 1
else:
word_per_doc_table[word] = 1
return word_per_doc_table
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
idf_matrix = {}
for sent, f_table in freq_matrix.items():
idf_table = {}
for word in f_table.keys():
idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
idf_matrix[sent] = idf_table
return idf_matrix
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
tf_idf_matrix = {}
for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
tf_idf_table = {}
for (word1, tf), (word2, idf) in zip(f_table1.items(),
f_table2.items()): # here, keys are the same in both the table
tf_idf_table[word1] = float(tf * idf)
tf_idf_matrix[sent1] = tf_idf_table
return tf_idf_matrix
def _score_sentences(tf_idf_matrix) -> dict:
"""
score a sentence by its word's TF
Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
:rtype: dict
"""
sentenceValue = {}
for sent, f_table in tf_idf_matrix.items():
total_score_per_sentence = 0
count_words_in_sentence = len(f_table)
for word, tf_idf in f_table.items():
total_score_per_sentence += tf_idf
sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence
return sentenceValue
def _find_average_score(sentenceValue) -> int:
"""
Find the average score from the sentence value dictionary
:rtype: int
"""
sumValues = 0
for entry in sentenceValue:
sumValues += sentenceValue[entry]
# Average value of a sentence from original summary_text
average = (sumValues / len(sentenceValue))
return average
def _generate_summary(sentences, sentenceValue, threshold):
sentence_count = 0
summary = ''
for sentence in sentences:
if sentence in sentenceValue and sentenceValue[sentence] >= (threshold):
summary += " " + sentence
sentence_count += 1
return summary, sentence_count
# Summarise every article: each sentence is treated as one "document" for the
# TF-IDF computation, and the sentences whose score reaches the article's
# average score are kept.
summaries = []
for data in haberler:
    sentences = sent_tokenize(data)
    freq_matrix = _create_frequency_matrix(sentences)
    idf_matrix = _create_idf_matrix(
        freq_matrix,
        _create_documents_per_words(freq_matrix),
        len(sentences),
    )
    tf_idf_matrix = _create_tf_idf_matrix(_create_tf_matrix(freq_matrix), idf_matrix)
    sentence_scores = _score_sentences(tf_idf_matrix)
    # The per-article average score is the cut-off for inclusion.
    summary, sentence_count = _generate_summary(
        sentences,
        sentence_scores,
        _find_average_score(sentence_scores),
    )
    summaries.append([summary, sentence_count])
#%%
# Put each summary and its sentence count next to the article row at the same
# index position, then write the combined table to disk without the index.
ozet_cumle_sayisi = pd.DataFrame(data=summaries, columns=['Özet', 'Cümle_Sayisi'])
result = pd.concat([df, ozet_cumle_sayisi], axis=1, join='inner')
result.to_csv("metin_ozetleme.csv", index=False)
#import pandas as pd
#metin_ozetleme = pd.read_csv("metin_ozetleme.csv")