-
Notifications
You must be signed in to change notification settings - Fork 0
/
textProcessing.py
193 lines (164 loc) · 7.08 KB
/
textProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import bs4 as bs
import os
import subprocess
import requests
from pathParameter import parsCit,LOCDB, pdfInspector, grobid
from logWriter import writeLog, writeUserLog
#from imageProcessing import processIndividualGrobid
from imgProcessing import mapXmlOutputIndividual
def check_file_extension(filename, ext):
return '.' in filename and \
filename.rsplit('.', 1)[1] in ext
'''
Description:
method that creates a bib structure for the given text and stores it as xml file
Input:
UPLOAD_FOLDER: the folder from which the file is taken
OUTPUT_FOLDER: the output folder
Settings: list of all settings -> see app for more info
filename: the filename with timestamp
Output:
outputfile is created
'''
def fileuploadText(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
#checks the file extension and preprocesses the file based on it
if "True" in settings[1]:
Txt_Dummy = True
else:
Txt_Dummy = False
prepareText(UPLOAD_FOLDER, Txt_Dummy, filename)
mode = "extract_meta"
if Txt_Dummy:
mode = "extract_citations"
p = subprocess.Popen(["./citeExtract.pl -m" + mode + " " +LOCDB+"tmp/" + filename+ "_Textdummy.txt ",LOCDB+"tmp/" + filename + "_ParsText.xml"],shell=True,stdout=subprocess.PIPE,cwd=parsCit)
parscitstring= p.communicate()[0]
with open("tmp/" + filename + "_ParsText.xml", 'w') as f:
f.write(parscitstring)
outputxmlsoup =createBibstruct(filename)
if filename[-3:].lower() == "pdf":
output_grobid = processfileGrobid(UPLOAD_FOLDER, filename)
xmltags3 = output_grobid.find_all('BibStructured')
algotag3 = outputxmlsoup.algorithm
for curr in xmltags3:
algotag3.append(curr)
os.system("mv "+LOCDB+"tmp/"+filename+'_Textdummy.txt '+LOCDB+"processed-files/"+filename+'_Textdummy.txt')
os.system("mv "+LOCDB+"tmp/"+filename+'_ParsText.txt '+LOCDB+"processed-files/"+filename+'_ParsText.txt')
os.makedirs(OUTPUT_FOLDER + filename)
with open(OUTPUT_FOLDER + filename + "/Output" + filename+'.xml','w') as xmlf:
xmlf.write(outputxmlsoup.encode('utf-8'))
settings = []
settings.append("TXT")
settings.append(Txt_Dummy)
writeLog(filename, settings, True)
os.remove(UPLOAD_FOLDER + filename)
os.makedirs(OUTPUT_FOLDER + filename)
outputfile = OUTPUT_FOLDER + filename + "/Output" + filename+'.xml'
outputfilename = filename.replace(filename.split("_")[0], "")[1:]
if os.path.exists(outputfile):
print "Finished inputfile : " + outputfilename
writeUserLog("Finished inputfile : " + outputfilename)
else:
print "Error inputfile : " + outputfilename
writeUserLog("Error inputfile : " + outputfilename)
'''
Description:
method that pre-processes the text file and stores it as txt for further processing
Input:
UPLOAD_FOLDER: the folder from which the file is taken
Txt_Dummy: decide if text needs a dummy
filename: the filename with timestamp
Output:
Textdummy.txt is created
'''
def prepareText(UPLOAD_FOLDER, Txt_Dummy, filename):
#checks the file extension and converts the file to txt
if check_file_extension(filename,set(['pdf'])):
print "LOCDB:",LOCDB
print "UPLOAD_FOLDER:",UPLOAD_FOLDER
print "filename:",filename
print "java -jar docears-pdf-inspector.jar -title -text -out " +LOCDB +"tmp/" + filename+ "_Textdummy.txt " +LOCDB + UPLOAD_FOLDER + filename
subprocess.call(["java -jar docears-pdf-inspector.jar -title -text -out " +LOCDB +"tmp/" + filename+ "_Textdummy.txt " +LOCDB + UPLOAD_FOLDER + filename],shell=True,stdout=subprocess.PIPE,cwd=pdfInspector)
#removes the wrong title which is created by pdf-inspector
with open("tmp/" + filename+'_Textdummy.txt','r') as fin:
lines = fin.readlines()
firstline = lines[0].split("|")
if len(firstline) > 1:
lines[0] = firstline[1]
with open("tmp/" + filename+'_Textdummy.txt', 'w') as fout:
for line in lines:
fout.write(line)
elif check_file_extension(filename, set(['txt'])):
with open(UPLOAD_FOLDER + filename, 'r') as f:
text = f.read()
with open("tmp/" + filename+'_Textdummy.txt', 'w') as f:
f.write(text)
else:
path = UPLOAD_FOLDER + filename
args = ['libreoffice', '--headless', '--convert-to',
'txt', path]
subprocess.call(args, shell=False)
os.rename(filename.rsplit('.', 1)[0] + ".txt", "tmp/" + filename + "_Textdummy.txt")
if Txt_Dummy:
#reads dummy text
with open('dummy.txt','r') as dummy:
text=dummy.read()
#stores the dummy text and references as XMLdummy.txt
with open("tmp/" + filename+ '_Textdummy.txt','r') as dummytext:
content = dummytext.read()
with open("tmp/" + filename+ "_Textdummy.txt","w") as dummytext:
dummytext.write(text+'\n')
dummytext.write("REFERENCES\n\n")
dummytext.write(content)
'''
Description:
method that create the bibstruct out of the ParsText.xml
Input:
filename: the filename with timestamp
Output:
bib struct is returned as bs
'''
def createBibstruct(filename):
#find all parcit citations
with open("tmp/" + filename+"_ParsText.xml",'r') as f:
parsXmlsoup = bs.BeautifulSoup(f.read(),'xml')
parsXmltags= parsXmlsoup.find_all('citation', attrs={"valid" : "true"})
#finds headerinformation for the output bs created by parscit
parsXmltagsHead = parsXmlsoup.find('variant')
#create structure of output file and insert correct tags
soup = bs.BeautifulSoup("<algorithm></algorithm>",'xml')
algotag = soup.algorithm
soup.find('algorithm')['name']='LOCDB Web service'
soup.find('algorithm')['fname']=filename.split("_",1)[1]
#adds the header in correct format
if parsXmltagsHead is not None:
parsXmltagsHead.name = "HeadInformation"
del parsXmltagsHead['no']
algotag.append(parsXmltagsHead)
#adds all citations found by parscit
for parsCitSoup in parsXmltags:
new = parsCitSoup
new.name = "BibStructured"
new['detector'] = "ParsCit"
new['namer'] = "ParsCit"
del new['valid']
if new.contexts is not None:
new.contexts.extract()
algotag.append(new)
#removes confidence tag from everything
for c in algotag.find_all_next(confidence=True):
del c['confidence']
return soup
def processfileGrobid(UPLOAD_FOLDER, filename):
grobid_url = grobid+"processReferences"
files = {'input': open(LOCDB+UPLOAD_FOLDER+filename, 'rb')}
try:
response = requests.post(grobid_url, files=files)
except:
print "Error accessing Grobid Service"
xmlString = response.text.encode('utf8')
output = mapXmlOutputIndividual(xmlString)
xmltags= output.find_all('BibStructured')
for tag4 in xmltags:
tag4['detector'] = 'Grobid'
tag4['namer'] = 'Grobid'
return output