-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
309 lines (280 loc) · 11.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# -*- coding: utf-8 -*-
"""main.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1u7k93wZZaIgZHf-pT-5oYW4nMBxJ-2zS
"""
import csv
from google.colab import drive
drive.mount('/content/drive')
"""*******Reading all corpus from file into list*******"""
# Load the corpus as CSV rows: `poetry` is a list of rows, each row a list of
# comma-separated fields.
# The encoding is given explicitly because the corpus is non-ASCII text and
# the platform default is not guaranteed to be UTF-8; newline='' is what the
# csv module documents for files handed to csv.reader.
with open('/content/drive/My Drive/NLP/Corpus.txt', 'r',
          encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    poetry = list(reader)
# The `with` statement has already closed the file; no explicit close needed.
text = poetry[0]  # first row, printed as a quick sanity check
print(text)
"""Making a list of starting words"""
# Collect the first token of every corpus line as a candidate verse opener.
f1 = "/content/drive/My Drive/NLP/Corpus.txt"
# Upper-case Latin letters: an opener whose first character is one of these
# is rejected.
roman = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
# Tokens that are punctuation or separator runs and must not open a verse.
special = ['‘','٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪','%%%%%%%%%%%%%%%%%%%%', '!', '`', '"', ')', '(',"''", '.', ':','’’', "'", '"', '؟','‘','’','‘','،','“','’']
starting_words = []  # first words of lines, duplicates kept

# A context manager closes the file even if reading fails, and the explicit
# encoding keeps the non-ASCII text from depending on the platform default.
with open(f1, "r", encoding="utf-8") as fileref:
    line = fileref.readlines()

for words in line:
    word = words.split()
    # Skip blank lines, lines that open with a special token, and lines whose
    # first token begins with an upper-case Latin letter.
    if word and word[0] not in special and word[0][0] not in roman:
        starting_words.append(word[0])
"""Tokenizing the corpus into a word list"""
# Tokens dropped while flattening the corpus (punctuation and separator runs).
special = ['‘','‘‘','٪','٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪','%%%%%%%%%%%%%%%%%%%%', '!', '%', '`', '"', ')', '(',"''", '.', ':','’’', "'", '"', '؟','‘','’','‘','،','“','’']

# Flatten rows -> fields -> whitespace-separated tokens into one word list,
# keeping corpus order and discarding the special tokens above.
# NOTE(review): the final row of `poetry` is deliberately left out here to
# match the existing behaviour (the loop ran over range(len(poetry)-1));
# confirm whether that skip is intended or an off-by-one.
w_list = [
    token
    for row in poetry[:-1]
    for field in row
    for token in field.split()
    if token not in special
]
"""Probability calculation for Bigrams"""
def prob(w_list, s):
    """Return the word that most often follows ``s`` in ``w_list``.

    This is the arg-max of the bigram model P(next | s). Because every
    candidate shares the same denominator (the number of occurrences of
    ``s``), comparing raw follow-counts is equivalent to comparing
    probabilities.

    Args:
        w_list: The corpus as a list of word tokens, in reading order.
        s: The word whose most likely successor is wanted.

    Returns:
        The most frequent successor of ``s``. Ties go to the successor that
        appears first in the corpus. Returns ``''`` when ``s`` never occurs
        with a following word (including an empty corpus).
    """
    # One pass over adjacent pairs instead of rescanning the whole corpus for
    # every distinct successor.
    follow_counts = {}
    for current, following in zip(w_list, w_list[1:]):
        if current == s:
            follow_counts[following] = follow_counts.get(following, 0) + 1

    if not follow_counts:
        return ''
    # dicts keep insertion order and max() returns the first maximal item, so
    # ties resolve to the earliest-seen successor.
    return max(follow_counts, key=follow_counts.get)
"""Generating poetry through Bigram Model"""
import random

# Print three stanzas of four verses each using the forward bigram model.
# The very first verse is seeded with a random opening word plus its most
# likely successor and targets 5-8 words; every later verse is seeded with a
# single random opening word and targets 7-9 words.
start = random.choice(starting_words)
next_word = prob(w_list, start)
verse_words = [start, next_word]
verse_count = random.randint(5, 8)

for _stanza in range(3):
    for _verse in range(4):
        # Extend the chain one most-likely successor at a time until the
        # verse reaches its target length.
        while len(verse_words) < verse_count:
            next_word = prob(w_list, next_word)
            verse_words.append(next_word)
        print(' '.join(verse_words))

        # Seed the next verse and draw its target length.
        next_word = random.choice(starting_words)
        verse_words = [next_word]
        verse_count = random.randint(7, 9)
    print()  # blank line between stanzas
"""Probability function for Trigram"""
def tri_prob(w_list, r, r1):
    """Return the word that most often follows the pair ``r r1`` in ``w_list``.

    Args:
        w_list: The corpus as a list of word tokens, in reading order.
        r: First word of the context pair.
        r1: Second word of the context pair.

    Returns:
        The most frequent word seen immediately after the consecutive pair
        ``(r, r1)``. Ties go to the continuation that appears first in the
        corpus. Returns ``''`` when the pair never occurs with a following
        word (pair absent, pair only at the very end, or empty corpus).
    """
    # Count third words over complete trigrams only. zip() stops at the
    # shortest slice, so a pair sitting at the end of the corpus is simply
    # skipped instead of indexing past the end of the list.
    continuation_counts = {}
    for first, second, third in zip(w_list, w_list[1:], w_list[2:]):
        if first == r and second == r1:
            continuation_counts[third] = continuation_counts.get(third, 0) + 1

    if not continuation_counts:
        return ''
    # dicts keep insertion order and max() returns the first maximal item, so
    # ties resolve to the earliest-seen continuation.
    return max(continuation_counts, key=continuation_counts.get)
"""Generating Poetry through Trigram Model"""
import random

# Trigram-driven generation: three outer passes of four inner passes each,
# with a blank line printed after every outer pass. All names below are
# module-level state that is carried from one pass to the next.
res1 = ''  # second word of the context pair handed to tri_prob
res = ''  # first word of the context pair handed to tri_prob
next_word = ''
verse = ''  # text of the verse currently being assembled
counter = 0  # words counted toward the current verse_count target
done = []  # opening words that were drawn and then rejected (see note below)
verse_count = random.randint(5,8)#randomly generating words per verse
print_count = 0  # 0 or 1 => seed a new verse on the next pass; >1 => keep extending
for s in range(3):
    for v in range(4):
        if print_count == 1 or print_count == 0: #print_count 0 or 1 means start of a new verse
            start = random.choice(starting_words) #randomly draw a first word
            done.append(start)
            # NOTE(review): the word just drawn is added to `done` before the
            # membership test, so this loop always redraws at least once and
            # the word finally kept is not itself recorded in `done`. It also
            # cannot terminate once every distinct starting word is in `done`.
            # Confirm the intended de-duplication.
            while (start in done):
                start = random.choice(starting_words)
            res = start
            verse = verse + start #appending first word in verse
            next_word = prob(w_list, start) #second word comes from the bigram model
            res1 = next_word
            verse = verse + ' ' + next_word
            counter += 2
            print_count = 2
        if print_count > 1: #if not start of a new verse
            for i in range (verse_count):
                next_word = tri_prob(w_list, res, res1) #most likely word after the pair (res, res1)
                # Slide the two-word context window forward by one word.
                res = res1
                res1 = next_word
                verse = verse + ' ' + res1 #add next most probable word in verse
                counter += 1
                print_count += 1
                if counter == verse_count: #end of verse
                    print(verse) #print verse
                    verse = ''
                    #reset counters and randomly draw the word count for the next verse
                    print_count = 1
                    verse_count = random.randint(7,10)
                    counter = 0
                    # NOTE(review): there is no `break` here, so any iterations
                    # left in this loop keep appending to the freshly cleared
                    # verse and push print_count back above 1, which skips the
                    # seeding branch on the next pass -- confirm intent.
    print()
"""Probabilty function for Backward Bigram
"""
def back_prob(w_list, s):
    """Return the word that most often precedes ``s`` in ``w_list``.

    This is the arg-max of the backward bigram model P(previous | s).
    Because every candidate shares the same denominator, comparing raw
    counts is equivalent to comparing probabilities.

    Only genuinely adjacent pairs are considered: the first corpus word has
    no predecessor (no wrap-around to the last word), and the last corpus
    word is a valid target because it does have a predecessor.

    Args:
        w_list: The corpus as a list of word tokens, in reading order.
        s: The word whose most likely predecessor is wanted.

    Returns:
        The most frequent predecessor of ``s``. Ties go to the predecessor
        that appears first in the corpus. Returns ``''`` when ``s`` never
        occurs with a preceding word (including an empty corpus).
    """
    # Pair each word with the one before it; this never pairs index 0 with
    # index -1 and does include the final corpus word as a target.
    predecessor_counts = {}
    for previous, current in zip(w_list, w_list[1:]):
        if current == s:
            predecessor_counts[previous] = predecessor_counts.get(previous, 0) + 1

    if not predecessor_counts:
        return ''
    # dicts keep insertion order and max() returns the first maximal item, so
    # ties resolve to the earliest-seen predecessor.
    return max(predecessor_counts, key=predecessor_counts.get)
"""Generating Peotry through Backward Bigram model"""
import random

# Print three stanzas of four verses each using the backward bigram model.
# Each verse grows leftwards: it is seeded with a random corpus word and
# repeatedly prefixed with the most likely predecessor of its leftmost word.
# The very first verse starts from two words and targets 5-8 words; every
# later verse starts from one word and targets 7-9 words.
start = random.choice(w_list)
next_word = back_prob(w_list, start)
right_to_left = [start, next_word]  # verse words, rightmost word first
verse_count = random.randint(5, 8)

for _stanza in range(3):
    for _verse in range(4):
        # Keep prepending predecessors until the target length is reached.
        while len(right_to_left) < verse_count:
            next_word = back_prob(w_list, next_word)
            right_to_left.append(next_word)
        # Words were collected rightmost-first; flip them for reading order.
        print(' '.join(reversed(right_to_left)))

        # Seed the next verse and draw its target length.
        next_word = random.choice(w_list)
        right_to_left = [next_word]
        verse_count = random.randint(7, 9)
    print()  # blank line between stanzas
"""Generating Poetry through Bidirectional Model"""
import random

# Print three stanzas of four verses each, growing every verse outwards from
# a random seed word: each step appends the most likely successor of the
# rightmost word (forward bigram) and prepends the most likely predecessor of
# the leftmost word (backward bigram).
# `counter` starts at 1 and increases once per step, so a verse is printed
# after verse_count - 1 steps, i.e. with 2 * (verse_count - 1) + 1 words.
start = random.choice(w_list)  # seed word; afterwards tracks the rightmost word
next_word = start  # tracks the leftmost word
verse = start
counter = 1
verse_count = random.randint(4, 5)
for s in range(3):
    for v in range(4):
        for i in range(verse_count):
            res = prob(w_list, start)  # most likely word after the right edge
            verse = verse + ' ' + res  # extend the verse on the right
            start = res
            res1 = back_prob(w_list, next_word)  # most likely word before the left edge
            verse = res1 + ' ' + verse  # extend the verse on the left
            next_word = res1
            counter += 1
            if counter == verse_count:  # verse complete
                print(verse)
                # Re-seed BOTH edges from the new seed word. Previously only
                # `start` was reset, so the left side of each new verse kept
                # extending the previous verse's backward chain.
                start = random.choice(w_list)
                next_word = start
                verse = start
                verse_count = random.randint(4, 5)
                counter = 1
                break
    print()  # blank line between stanzas