-
Notifications
You must be signed in to change notification settings - Fork 0
/
my_ngram.py
189 lines (137 loc) · 4.88 KB
/
my_ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import random
import os
# No explicit seed is given, so the random draws made by create_cross_list
# are not reproducible from one run to the next.
random.seed()
def from_file_to_list(input_file):
    """Read a data file and return its columns as concatenated strings.

    Every line is split on single spaces. The first field (the line number)
    is discarded; the remaining fields are appended, position by position,
    to one string per column.

    input_file: name of the file to read the data from.

    Returns a list of 8 strings; entry ``k`` is the concatenation of the
    k-th data field of every line, in file order.

    Raises IndexError if a line carries more than 8 data fields.
    """
    columns = [[] for _ in range(8)]
    # The context manager guarantees the handle is closed, even on error.
    with open(input_file) as file:
        for line in file:
            fields = line.rstrip("\n").split(" ")
            fields.pop(0)  # first column only indicates the line's number
            # A line written as "... h \n" leaves one empty field behind the
            # trailing space: drop it so it is not treated as a column. Lines
            # without that trailing space (or blank lines) are accepted too.
            if fields and fields[-1] == "":
                fields.pop()
            for idx, value in enumerate(fields):
                columns[idx].append(value)
    return ["".join(column) for column in columns]
def create_cross_list(protocol):
    """Randomly split the files of ``data/<protocol>`` into 10 folds.

    The sampling assumes the directory holds 20 files for each of 10 words,
    named ``<word>_<rest>``. Each fold receives 2 files per word and every
    file ends up in exactly one fold.

    protocol: name of the sub-directory of ``data`` to list.

    Returns a dict mapping the fold index (0-9) to a list of 20
    ``(word, filename)`` tuples, where ``word`` is the part of the file
    name that precedes the first underscore.
    """
    # os.listdir() returns entries in arbitrary order, but the index windows
    # used below require all files of one word to be contiguous. Sorting
    # groups the names that share the same "<word>_" prefix.
    filenames = sorted(os.listdir("data/" + protocol))
    cross_list = {k: [] for k in range(10)}
    files_per_word = 20  # files still available for each word
    for fold in range(10):
        window_start = 0
        window_end = 0
        for _ in range(10):
            # [window_start, window_end] spans the remaining files of the
            # current word; two of them are drawn without replacement.
            window_end += files_per_word - 1
            picked = filenames.pop(random.randint(window_start, window_end))
            cross_list[fold].append((picked.split("_")[0], picked))
            window_end -= 1  # the window shrank by the file just removed
            picked = filenames.pop(random.randint(window_start, window_end))
            cross_list[fold].append((picked.split("_")[0], picked))
            # After the two removals the next word starts where this
            # (shrunken) window ended.
            window_start = window_end
        files_per_word -= 2
    return cross_list
# Words for which n-gram tables are built.
words = ["come","girl","man","maybe","mine","name","read","right","science","thank"]


def initialize_grams():
    """Build an empty n-gram store.

    Returns a dict with one entry per sequence index (0 to 7); each entry
    maps every word of ``words`` to its own fresh, empty dict.
    """
    store = {}
    for sequence_index in range(8):
        per_word = {}
        for word in words:
            per_word[word] = {}
        store[sequence_index] = per_word
    return store
def compting_grams(list_word_file, grams, nb_grams, protocol):
    """Add the n-gram counts of a set of training files to ``grams``.

    list_word_file: iterable of ``(word, filename)`` pairs; each file is
        read from ``data/<protocol>/<filename>``.
    grams: store indexed as ``grams[sequence_index][word][size][gram]``;
        it is updated in place.
    nb_grams: largest gram size to count (sizes 1 to nb_grams inclusive).
    protocol: name of the sub-directory of ``data`` holding the files.

    For every size, the key ``"total"`` of the table holds the number of
    grams of that size that were counted.
    NOTE(review): a 5-character gram spelling "total" would share that key;
    presumably the data alphabet rules this out - confirm.

    Returns ``grams`` (the same object that was passed in).
    """
    for word, file in list_word_file:
        word_sequences = from_file_to_list("data/" + protocol + "/" + file)
        for seq_number, sequence in enumerate(word_sequences):
            word_grams = grams[seq_number][word]
            for size in range(1, nb_grams + 1):
                # Reuse the table filled by earlier files of the same word
                # instead of resetting it: the counts must accumulate over
                # the whole training set, not reflect the last file only.
                counts = word_grams.setdefault(size, {})
                for start in range(len(sequence) - size + 1):
                    gram = sequence[start:start + size]
                    counts[gram] = counts.get(gram, 0) + 1
                    counts["total"] = counts.get("total", 0) + 1
    return grams
def evaluate_word(sequence_grams, grams, word, n_gram, sequence_number):
    """Score how well ``word``'s n-gram counts explain a sequence.

    sequence_grams: dict mapping each gram of the sequence to a list of
        prefixes (shorter grams) that must be scored along with it.
    grams: store indexed as ``grams[sequence_number][word][size][gram]``,
        where each size table also holds a ``"total"`` entry.
    word: word whose tables are used.
    n_gram: size of the grams found in ``sequence_grams``.
    sequence_number: index of the sequence whose tables are used.

    Returns the product of one factor per gram and per prefix:
    ``count(g) / count(g[:-1])`` when ``g`` is known (``count(g) / total``
    for a single-character prefix), otherwise ``1 / total`` taken from the
    table one size smaller (from the size-1 table for a single character).
    Returns 1 when ``sequence_grams`` is empty.
    """
    probability = 1
    for gram, all_prefix in sequence_grams.items():
        tables = grams[sequence_number][word]

        # Factor contributed by the full-size gram itself.
        if gram in tables[n_gram]:
            factor = tables[n_gram][gram] / tables[n_gram - 1][gram[:-1]]
        else:
            factor = 1 / tables[n_gram - 1]["total"]
        probability *= factor

        # Factors contributed by the shorter prefixes attached to this gram.
        for prefix in all_prefix:
            size = len(prefix)
            known = prefix in tables[size]
            numerator = tables[size][prefix] if known else 1
            if size == 1:
                denominator = tables[size]["total"]
            elif known:
                denominator = tables[size - 1][prefix[:-1]]
            else:
                denominator = tables[size - 1]["total"]
            probability *= numerator / denominator
    return probability
def evaluate_sequence(sequence_number, sequence, grams, n_gram):
    """Guess which word most likely produced ``sequence``.

    sequence_number: index of the sequence; selects ``grams[sequence_number]``.
    sequence: string to classify.
    grams: n-gram store, passed through to ``evaluate_word``.
    n_gram: size of the grams extracted from ``sequence``.

    Returns the word of ``grams[sequence_number]`` with the highest score
    given by ``evaluate_word`` (the first one on ties), or None when no
    word gets a score strictly greater than 0.
    """
    # Map each distinct gram of the sequence to the prefixes scored with it.
    # Only the opening gram carries prefixes (its leading 1..n_gram-1
    # characters); every other gram carries none.
    sequence_grams = {}
    for start in range(len(sequence) - n_gram + 1):
        gram = sequence[start:start + n_gram]
        if start == 0:
            sequence_grams[gram] = [gram[:size] for size in range(1, n_gram)]
        else:
            # Keep the existing entry: if the opening gram shows up again
            # later, its prefix list must not be replaced by an empty one.
            sequence_grams.setdefault(gram, [])

    guessed_word = None
    max_probability = 0
    for word in grams[sequence_number]:
        p = evaluate_word(sequence_grams, grams, word, n_gram, sequence_number)
        if p > max_probability:
            max_probability = p
            guessed_word = word
    return guessed_word
def voting(filename, grams, n_grams, protocol):
    """Classify a file by majority vote over its 8 sequences.

    filename: file to classify, read from ``data/<protocol>/<filename>``.
    grams: n-gram store, passed through to ``evaluate_sequence``.
    n_grams: gram size, passed through to ``evaluate_sequence``.
    protocol: name of the sub-directory of ``data`` holding the file.

    Returns the guess made most often across the 8 sequences; on ties the
    guess that appeared first wins. A sequence whose guess is None votes
    for None like any other value.
    """
    word_sequences = from_file_to_list("data/" + protocol + "/" + filename)

    # Number of sequences that voted for each guess, in order of appearance.
    tally = {}
    for sequence_number in range(8):
        vote = evaluate_sequence(
            sequence_number, word_sequences[sequence_number], grams, n_grams
        )
        tally[vote] = tally.get(vote, 0) + 1

    # max() keeps the first key among those with the highest count.
    return max(tally, key=tally.get, default=None)
def execution_with_vote(protocol, n_gram):
    """Run a 10-fold cross-validation and count the correct guesses.

    protocol: name of the FOLDER inside ``data`` that holds the data to use.
    n_gram: gram size used both for counting and for classification.

    For each fold, the n-gram tables are rebuilt from the 9 other folds and
    the 20 files of the fold are classified with ``voting``.

    Returns the number of files whose guessed word equals their real word.
    """
    cross_list = create_cross_list(protocol)
    # Per-file outcomes; filled in below but not part of the return value.
    results = {fold: {} for fold in range(10)}
    correct_guesses = 0

    for test_fold in range(10):
        # Train on every fold except the one under test.
        grams = initialize_grams()
        training_folds = [fold for fold in range(10) if fold != test_fold]
        for fold in training_folds:
            grams = compting_grams(cross_list[fold], grams, n_gram, protocol)

        results[test_fold] = {position: {} for position in range(20)}
        for position in range(20):
            word, filename = cross_list[test_fold][position]
            outcome = results[test_fold][position]
            outcome["original"] = word
            best_guess = voting(filename, grams, n_gram, protocol)
            outcome["result"] = best_guess
            if word == best_guess:
                correct_guesses += 1

    return correct_guesses
import numpy as np

# Experiment driver: for every "hm<j>" data folder and every gram size from
# 5 to 15, average the success rate of 5 cross-validation runs and print it.
# NOTE(review): np.arange with a float step can yield values such as
# 0.15000000000000002, and str(j) keeps that representation in the folder
# name - confirm it matches the folders present under data/.
for j in np.arange(0.05, 0.3, 0.05):
    protocol_name = "hm" + str(j)
    for i in range(5, 16):
        moy = 0
        for r in range(5):
            nb_correct = execution_with_vote(protocol_name, i)
            # Each run classifies 200 files (10 folds of 20): turn the
            # count into a percentage.
            moy += nb_correct / 200 * 100
        print("hm", j, " / ngrams ", i, " prob: ", moy / 5)
#print(create_cross_list("sax"))
#print(execution_with_vote("sax", 29)[0]["girl"][29])