# main.py

# -*- coding: utf-8 -*-
"""main.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1u7k93wZZaIgZHf-pT-5oYW4nMBxJ-2zS
"""


import csv
from google.colab import drive
drive.mount('/content/drive')

"""*******Reading all corpus from file into list*******"""

# newline='' is how the csv module expects its input file to be opened; the
# explicit encoding keeps the non-ASCII corpus text from depending on the
# platform default (assumes the corpus is stored as UTF-8 — TODO confirm).
with open('/content/drive/My Drive/NLP/Corpus.txt', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    poetry = list(reader)   # one list of comma-separated fields per corpus line
# The with-block has already closed the file, so no explicit close() is needed.
text = poetry[0] if poetry else []   # first line; an empty corpus yields []
print(text)

"""Making a list of starting words"""

f1 = "/content/drive/My Drive/NLP/Corpus.txt"
# Upper-case Latin letters: a line whose first word begins with one of these is
# not used as a verse opener.
roman = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
# Punctuation / separator tokens that must never be used as a starting word.
special = ['‘','٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪','%%%%%%%%%%%%%%%%%%%%', '!', '`', '"', ')', '(',"''", '.', ':','’’', "'", '"', '؟','‘','’','‘','،','“','’']
starting_words = []         # first word of every usable corpus line

# The context manager closes the file even if reading fails; the explicit
# encoding avoids relying on the platform default (assumes UTF-8 — TODO confirm).
with open(f1, "r", encoding="utf-8") as fileref:
    line = fileref.readlines()

# Keep the first whitespace-separated token of each non-empty line unless it is
# a special token or starts with an upper-case Latin letter.
for words in line:
    word = words.split()
    if not word:
        continue
    first = word[0]
    if first not in special and first[0] not in roman:
        starting_words.append(first)

"""Tokenizing the corpus into a word list"""

w_list = []
# Punctuation / separator tokens that are dropped from the word list.
special = ['‘','‘‘','٪','٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪٪','%%%%%%%%%%%%%%%%%%%%', '!', '%', '`', '"', ')', '(',"''", '.', ':','’’', "'", '"', '؟','‘','’','‘','،','“','’']
# Built once so each per-token membership test is a constant-time lookup.
special_lookup = set(special)
# Walk every row (including the last one) so the whole corpus is tokenised.
# Each row is the list of comma-separated fields of one line; each field is
# split on whitespace into words.
for row in poetry:
    for cell in row:
        w_list.extend(w for w in cell.split() if w not in special_lookup)

"""Probability calculation for Bigrams"""

def prob(w_list, s):
    """Return the word that most often follows ``s`` in ``w_list``.

    Args:
        w_list: Sequence of corpus tokens in reading order.
        s: The word whose most frequent successor is wanted.

    Returns:
        The most frequent successor of ``s``. Ties go to the successor that
        appears first in ``w_list``. Returns ``''`` when ``s`` never occurs
        with a following word (including when ``w_list`` is empty).
    """
    from collections import Counter

    # One pass over adjacent pairs replaces the nested recount of the corpus.
    followers = Counter(
        nxt for cur, nxt in zip(w_list, w_list[1:]) if cur == s
    )
    if not followers:
        return ''
    # Dividing by the fixed count of ``s`` would not change the ordering, so
    # the highest raw count is also the highest bigram probability. Counter
    # keeps first-seen order and max() returns the first maximal key.
    return max(followers, key=followers.get)

"""Generating poetry through Bigram Model"""

import random
# Seed the first verse: a random opener plus its most probable successor.
start = random.choice(starting_words)
next_word = prob(w_list, start)
verse = start + ' ' + next_word
counter = 2                          # words already placed in the current verse
verse_count = random.randint(5, 8)   # target length of the first verse

for s in range(3):          # three stanzas
    for v in range(4):      # four verses per stanza
        # Extend the verse one most-probable word at a time until it is full.
        while counter < verse_count:
            res = prob(w_list, next_word)
            verse = verse + ' ' + res
            next_word = res
            counter += 1
        print(verse)
        # Prepare the next verse: new random opener and a new target length.
        next_word = random.choice(starting_words)
        verse = next_word
        verse_count = random.randint(7, 9)
        counter = 1
    print()                 # blank line between stanzas

"""Probability function for Trigram"""

def tri_prob(w_list, r, r1):
    """Return the word that most often follows the pair ``r r1`` in ``w_list``.

    Args:
        w_list: Sequence of corpus tokens in reading order.
        r: First word of the context pair.
        r1: Second word of the context pair.

    Returns:
        The most frequent word seen directly after ``r`` followed by ``r1``.
        Ties go to the candidate that appears first in ``w_list``. Returns
        ``''`` when the pair never occurs with a following word (for example
        when it only occurs at the very end of ``w_list``).
    """
    from collections import Counter

    # zip() stops at the shortest input, so a pair at the end of the corpus
    # is skipped instead of indexing past the list.
    followers = Counter(
        third
        for first, second, third in zip(w_list, w_list[1:], w_list[2:])
        if first == r and second == r1
    )
    if not followers:
        return ''
    # All occurrences are tallied before choosing, so the result is the most
    # frequent follower rather than the first one found. The context count is
    # the same for every candidate, so the raw count ranks them by probability.
    return max(followers, key=followers.get)

"""Generating Poetry through Trigram Model"""

import random
res1 = ''
res = ''
next_word = ''
verse = ''
counter = 0
done = []                            # starting words already used for a verse
verse_count = random.randint(5, 8)   # target length of the first verse
for s in range(3):          # three stanzas
    for v in range(4):      # four verses per stanza
        # Pick an opener that has not been used yet. When every starting word
        # has been used, forget the history instead of looping forever.
        candidates = [w for w in starting_words if w not in done]
        if not candidates:
            done = []
            candidates = starting_words
        start = random.choice(candidates)
        done.append(start)

        # The second word comes from the bigram model because a trigram
        # lookup needs two words of context.
        res = start
        res1 = prob(w_list, start)
        verse = start + ' ' + res1
        counter = 2

        # Every following word is predicted from the previous two words.
        while counter < verse_count:
            next_word = tri_prob(w_list, res, res1)
            res = res1
            res1 = next_word
            verse = verse + ' ' + res1
            counter += 1

        print(verse)
        # Finish the verse here so the next one starts from a fresh opener
        # instead of continuing this verse's word chain.
        verse = ''
        counter = 0
        verse_count = random.randint(7, 10)
    print()                 # blank line between stanzas

"""Probability function for Backward Bigram"""

def back_prob(w_list, s):
    """Return the word that most often precedes ``s`` in ``w_list``.

    Args:
        w_list: Sequence of corpus tokens in reading order.
        s: The word whose most frequent predecessor is wanted.

    Returns:
        The most frequent predecessor of ``s``. Ties go to the predecessor
        that appears first in ``w_list``. Returns ``''`` when ``s`` never
        occurs with a preceding word (for example when it is only the first
        token, or ``w_list`` is empty).
    """
    from collections import Counter

    # Pairing each token with its real neighbour means the first token has no
    # predecessor (no wrap-around to the end of the list) and the last token
    # is still considered as a match for ``s``.
    predecessors = Counter(
        prev for prev, cur in zip(w_list, w_list[1:]) if cur == s
    )
    if not predecessors:
        return ''
    # Counter keeps first-seen order and max() returns the first maximal key.
    return max(predecessors, key=predecessors.get)

"""Generating Poetry through Backward Bigram model"""

import random
# Seed the first verse with a random corpus word and its most probable
# predecessor; the verse grows leftwards from there.
start = random.choice(w_list)
next_word = back_prob(w_list, start)
verse = next_word + ' ' + start
counter = 2                          # words already placed in the current verse
verse_count = random.randint(5, 8)   # target length (5-8) of the first verse

for s in range(3):          # three stanzas
    for v in range(4):      # four verses per stanza
        # Prepend the most probable predecessor until the verse is full.
        while counter < verse_count:
            res = back_prob(w_list, next_word)
            verse = res + ' ' + verse
            next_word = res
            counter += 1
        print(verse)
        # Prepare the next verse: new random end word and a new target length.
        next_word = random.choice(w_list)
        verse = next_word
        verse_count = random.randint(7, 9)
        counter = 1
    print()                 # blank line between stanzas

"""Generating Poetry through Bidirectional Model"""

import random
# Each verse grows outwards from a random centre word: ``start`` tracks the
# right-hand end (extended with the forward bigram model) and ``next_word``
# tracks the left-hand end (extended with the backward bigram model).
start = random.choice(w_list)
verse = start
next_word = start
counter = 1                          # growth steps taken, counting the centre
verse_count = random.randint(4, 5)   # each step after the centre adds 2 words

for s in range(3):          # three stanzas
    for v in range(4):      # four verses per stanza
        while counter < verse_count:
            res = prob(w_list, start)             # most probable word after the right end
            verse = verse + ' ' + res             # append on the right
            start = res
            res1 = back_prob(w_list, next_word)   # most probable word before the left end
            verse = res1 + ' ' + verse            # prepend on the left
            next_word = res1
            counter += 1
        print(verse)
        # Start the next verse from a new centre word. Both ends are reset so
        # the left side grows from this word, not from the previous verse.
        start = random.choice(w_list)
        next_word = start
        verse = start
        verse_count = random.randint(4, 5)
        counter = 1
    print()                 # blank line between stanzas