-
Notifications
You must be signed in to change notification settings - Fork 0
/
forward_frequency.py
77 lines (55 loc) · 1.7 KB
/
forward_frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import fileinput
import re
import sys
from sklearn import preprocessing
import hashlib
def _read_paragraphs(textfile):
    """Read *textfile* once and split it into paragraphs.

    A paragraph is a run of consecutive non-blank lines; a blank (empty or
    whitespace-only) line ends the current paragraph.  Each paragraph is
    returned as a list of its lines with surrounding whitespace stripped.

    A blank line that does not terminate a run of text (a leading blank
    line, or any blank line directly after another one) produces an empty
    paragraph, so it still counts toward the paragraph total.
    """
    paragraphs = []
    current = None  # None means "no paragraph currently open"
    with open(textfile, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                if current is None:
                    current = []
                current.append(text)
            else:
                paragraphs.append(current if current is not None else [])
                current = None
    # The last paragraph need not be followed by a blank line.
    if current is not None:
        paragraphs.append(current)
    return paragraphs


def forward_freq_calc(keyword_map, textfile):
    """Compute the forward frequency of every keyword in *keyword_map*.

    For each keyword the result is::

        float(keyword_map[key]) * paragraphs_containing_key / total_paragraphs

    A keyword is counted at most once per paragraph, and it is matched as a
    plain substring of a single (stripped) line, so a keyword that is split
    across two lines is not found.

    Args:
        keyword_map: dict mapping each keyword candidate (a string) to its
            frequency; the value is converted with ``float()``.
        textfile: path of the text file to scan, as a string.  Paragraphs
            are separated by blank lines.

    Returns:
        A new dict mapping each keyword to its forward frequency (a float).
        If the file contains no paragraphs at all, every keyword maps to
        0.0 instead of raising ZeroDivisionError.

    Raises:
        IOError/OSError: if *textfile* cannot be opened.
    """
    # Read the file a single time; the handle is closed by the helper.
    paragraphs = _read_paragraphs(textfile)
    total = len(paragraphs)

    ffreq_dict = {}
    for key in keyword_map:
        if total == 0:
            # Empty file: there is nothing to divide by.
            ffreq_dict[key] = 0.0
            continue
        # Number of paragraphs in which the keyword appears at least once.
        hits = sum(
            1 for paragraph in paragraphs
            if any(key in line for line in paragraph)
        )
        ffreq_dict[key] = float(keyword_map[key]) * float(hits) / float(total)
    return ffreq_dict
# Keyword candidates mapped to their frequencies (every value is 1 here).
# NOTE(review): this looks like sample input for forward_freq_calc -- confirm
# before relying on it elsewhere.
keyword_map = {
    'Present': 1,
    'Scalable Distributed Information Management System (SDIMS)': 1,
    'aggregates information': 1,
    'large-scale networked systems': 1,
    'large-scale distributed applications': 1,
    'information': 1,
    'summary': 1,
    'global': 1,
}
#dict = {}
#dict = forward_freq_calc(keyword_map, "testdoc.txt")
#for word in dict:
# print word
# print dict[word]