-
Notifications
You must be signed in to change notification settings - Fork 12
/
indexES.py
109 lines (89 loc) · 4.07 KB
/
indexES.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from HTMLParser import HTMLParser
from datetime import datetime
import time
from elasticsearch import Elasticsearch
import re
from nltk.tag import pos_tag
import gensim
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import math
import nltk
from itertools import product, izip
# English stop-word list from NLTK; both names are bound to the same list object.
engstop = set_stopword = stopwords.words('english')
def indexquiz(es):
    """Index the distinct, cleaned quiz lines into the "quizlets" index.

    Every line of the six tab-separated files under ``train/`` is stripped of
    its first tab-separated field, normalized, and collected into a set so
    that each distinct text is indexed exactly once as ``{"text": ...}``.

    Args:
        es: Client exposing ``index(index=..., doc_type=..., body=...)``;
            ``main`` passes an ``Elasticsearch`` instance.

    Raises:
        IOError: If one of the input files cannot be opened. Nothing is
            indexed in that case, because indexing starts only after all
            files have been read.
    """
    sources = (
        'train/bigquiz.txt',
        'train/bigquiz2.txt',
        'train/bigquiz3.txt',
        'train/requiz.txt',
        'train/requiz2.txt',
        'train/requiz3.txt',
    )
    # Compiled once instead of being looked up for every line.
    # Runs of characters outside this whitelist become a single space.
    disallowed = re.compile('[^a-zA-Z0-9,. \"\']+')
    # The original chained ``.replace(' ', ' ')`` three times, which replaces
    # a space with a space and therefore does nothing; collapsing runs of
    # spaces is the evident intent.
    space_run = re.compile(' {2,}')

    unique_lines = set()
    for path in sources:
        # ``with`` guarantees the handle is closed; previously all six files
        # were opened and never closed.
        with open(path) as handle:
            for line in handle:
                # Drop the first tab-separated field and keep the remainder.
                line = ' '.join(line.split('\t')[1:])
                for ch in '\t\n\r':
                    line = line.replace(ch, ' ')
                line = space_run.sub(' ', line)
                unique_lines.add(disallowed.sub(" ", line))

    for line in unique_lines:
        # NOTE(review): no local ``type`` is visible here, so this passes the
        # Python builtin ``type`` as the document type. Left unchanged because
        # renaming the type changes where documents are stored -- confirm the
        # intended type name against the code that queries this index.
        es.index(index="quizlets", doc_type=type, body={"text": line})
def indexquizlemma(es):
    """Index the distinct, cleaned lemma quiz lines into "quizlets_lemma".

    Every line of the four files under ``train/`` is normalized and collected
    into a set so that each distinct text is indexed exactly once as
    ``{"text": ...}``. Unlike ``indexquiz`` the whole line is kept; no leading
    field is dropped.

    Args:
        es: Client exposing ``index(index=..., doc_type=..., body=...)``;
            ``main`` passes an ``Elasticsearch`` instance.

    Raises:
        IOError: If one of the input files cannot be opened. Nothing is
            indexed in that case, because indexing starts only after all
            files have been read.
    """
    sources = (
        'train/bigquizlemma.txt',
        'train/bigquizlemma2.txt',
        'train/bigquizlemma3.txt',
        'train/requizlemma.txt',
    )
    # Compiled once instead of being looked up for every line.
    # Runs of characters outside this whitelist become a single space.
    disallowed = re.compile('[^a-zA-Z0-9,. \"\']+')
    # The original chained ``.replace(' ', ' ')`` three times, which replaces
    # a space with a space and therefore does nothing; collapsing runs of
    # spaces is the evident intent.
    space_run = re.compile(' {2,}')

    unique_lines = set()
    for path in sources:
        # ``with`` guarantees the handle is closed; previously the files were
        # opened and never closed.
        with open(path) as handle:
            for line in handle:
                for ch in '\t\n\r':
                    line = line.replace(ch, ' ')
                line = space_run.sub(' ', line)
                unique_lines.add(disallowed.sub(" ", line))

    for line in unique_lines:
        # NOTE(review): no local ``type`` is visible here, so this passes the
        # Python builtin ``type`` as the document type. Left unchanged because
        # renaming the type changes where documents are stored -- confirm the
        # intended type name against the code that queries this index.
        es.index(index="quizlets_lemma", doc_type=type, body={"text": line})
def indexqalemma(es):
    """Index every cleaned line of ``train/CK12lemma.txt`` into "qa_lemma".

    Lines are normalized and indexed one by one as ``{"text": ...}``; no
    de-duplication is performed.

    Args:
        es: Client exposing ``index(index=..., doc_type=..., body=...)``;
            ``main`` passes an ``Elasticsearch`` instance.

    Raises:
        IOError: If the input file cannot be opened.
    """
    # Compiled once instead of being looked up for every line.
    # Runs of characters outside this whitelist become a single space.
    disallowed = re.compile('[^a-zA-Z0-9,. \"\']+')
    # The original chained ``.replace(' ', ' ')`` three times, which replaces
    # a space with a space and therefore does nothing; collapsing runs of
    # spaces is the evident intent.
    space_run = re.compile(' {2,}')

    # ``with`` guarantees the handle is closed, even if indexing raises.
    with open('train/CK12lemma.txt') as handle:
        for line in handle:
            for ch in '\t\n\r':
                line = line.replace(ch, ' ')
            line = space_run.sub(' ', line)
            line = disallowed.sub(" ", line)
            # NOTE(review): no local ``type`` is visible here, so this passes
            # the Python builtin ``type`` as the document type. Left unchanged
            # because renaming the type changes where documents are stored --
            # confirm the intended type name against the querying code.
            es.index(index="qa_lemma", doc_type=type, body={"text": line})
def indexqa(es):
    """Index every cleaned line of ``train/CK12clean.txt`` into "qa".

    Lines are normalized and indexed one by one as ``{"text": ...}``; no
    de-duplication is performed.

    Args:
        es: Client exposing ``index(index=..., doc_type=..., body=...)``;
            ``main`` passes an ``Elasticsearch`` instance.

    Raises:
        IOError: If the input file cannot be opened.
    """
    # Compiled once instead of being looked up for every line.
    # Runs of characters outside this whitelist become a single space.
    disallowed = re.compile('[^a-zA-Z0-9,. \"\']+')
    # The original chained ``.replace(' ', ' ')`` three times, which replaces
    # a space with a space and therefore does nothing; collapsing runs of
    # spaces is the evident intent.
    space_run = re.compile(' {2,}')

    # ``with`` guarantees the handle is closed, even if indexing raises.
    with open('train/CK12clean.txt') as handle:
        for line in handle:
            for ch in '\t\n\r':
                line = line.replace(ch, ' ')
            line = space_run.sub(' ', line)
            line = disallowed.sub(" ", line)
            # NOTE(review): no local ``type`` is visible here, so this passes
            # the Python builtin ``type`` as the document type. Left unchanged
            # because renaming the type changes where documents are stored --
            # confirm the intended type name against the querying code.
            es.index(index="qa", doc_type=type, body={"text": line})
def main():
    """Create a default Elasticsearch client and run every index builder.

    The builders run in a fixed order: lemma quiz data, raw quiz data,
    lemma QA data, raw QA data.
    """
    client = Elasticsearch()
    builders = (indexquizlemma, indexquiz, indexqalemma, indexqa)
    for build_index in builders:
        build_index(client)


# Run the indexing only when executed as a script, not when imported.
if __name__ == '__main__':
    main()