-
Notifications
You must be signed in to change notification settings - Fork 12
/
genlemmas.py
102 lines (82 loc) · 2.55 KB
/
genlemmas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import re
import gensim
import os
import itertools
import numpy as np
import argparse
import string
from nltk.corpus import stopwords
import glob
from bs4 import BeautifulSoup
from mycounter import Counter
import math
import pickle
from sklearn.metrics import pairwise
from nltk.stem import WordNetLemmatizer
from difflib import SequenceMatcher
import random
wordnet_lemmatizer = WordNetLemmatizer()
engstop = stopwords.words('english')
set_stopword=engstop
parser = argparse.ArgumentParser("Word2Vec")
def lemmaorstem(w):
return wordnet_lemmatizer.lemmatize(w)
def procline(q,stem=True):
q=q.replace('Which of the following ','').replace('Which of these ','').replace('Which statement best describes ','')
q=q.replace('Were do ','').replace('If a ','').replace('In which ','').replace('Which best ','').replace('What are ', '')
q=q.replace('What best ','').replace('What is ','').replace('What would ', '').replace('When a ','').replace('Which statement ', '')
q=q.replace('Why is ','').replace('Which ','')
q = q.replace('\t',' ').replace('\n',' ').replace('\r', ' ').replace(' ',' ').replace(' ',' ').replace(' ',' ')
q = re.sub('[^a-zA-Z0-9\- \']+', " ", q)
if stem==False:
q=[m for m in q.lower().split(' ') if m not in set_stopword and m!='' and m!=' ']
else:
q=[lemmaorstem(m) for m in q.lower().split(' ') if m not in set_stopword and m!='' and m!=' ']
return q
def genlemmas():
sents2=[]
for line in open('train/CK12clean.txt'):
sent=procline(line)
sents2.append(' '.join(sent))
f=open('CK12lemma.txt','wb+')
f.write('\n'.join(sents2))
f.close()
lines={}
lines2=[]
for line in open('train/bigquiz.txt'):
line=line.split('\t')
line=' '.join(line[1:])
lines[line]=None
for line in lines:
sent=procline(line)
lines2.append(' '.join(sent))
f=open('bigquizlemma.txt','wb+')
f.write('\n'.join(lines2))
f.close()
lines={}
lines2=[]
for line in open('train/bigquiz2.txt'):
line=line.split('\t')
line=' '.join(line[1:])
lines[line]=None
for line in lines:
sent=procline(line)
lines2.append(' '.join(sent))
sents.append(sent)
f=open('bigquizlemma2.txt','wb+')
f.write('\n'.join(lines2))
f.close()
lines={}
lines2=[]
for line in open('train/bigquiz3.txt'):
line=line.split('\t')
line=' '.join(line[1:])
lines[line]=None
for line in lines:
sent=procline(line)
lines2.append(' '.join(sent))
sents.append(sent)
f=open('bigquizlemma3.txt','wb+')
f.write('\n'.join(lines2))
f.close()
genlemmas()