-
Notifications
You must be signed in to change notification settings - Fork 1
/
citationhtmlutil.py
133 lines (104 loc) · 5.7 KB
/
citationhtmlutil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import random
import re
import ntpath
# Template for one citation record as expected in the ML API output. The
# numbered placeholders mark the variable parts of the record.
CITATION_OUTPUT_PATTERN = '{0}> := TYPE:{1}SUBTYPE:{2}ANAPHORIC:{3}CO-REF:{4}NAME:{5} ##{6}#{7}#'
# Regex derived from the template: every placeholder becomes a greedy named
# group, listed here in placeholder order.
CITATION_REGX = CITATION_OUTPUT_PATTERN.format(*(
    f'(?P<{groupName}>.*)'
    for groupName in (
        'CITATIONID',
        'TYPE',
        'SUBTYPE',
        'ANAPHORIC',
        'COREF',
        'NAME',
        'OFFSETSTART',
        'OFFSETEND',
    )
))
class citationDetail:
    """One citation taken from the ML API response, plus derived properties.

    All fields start out with empty defaults and are expected to be filled
    in by the code that parses the ML output. Each instance is also given a
    random highlight colour on construction.
    """

    def __init__(self):
        # Composite key: the text after the last '-' is the citation id and
        # everything before that '-' is the file id.
        self.citationFileAndId = ''
        self.citationtype = ''
        self.subtype = ''
        self.anaphoric = False
        # Composite key of the citation this one co-refers to, if any.
        self.coref = ''
        self.name = ''
        self.offsetStart = 0
        self.offsetEnd = 0
        # Random '#RRGGBB' colour; each channel is drawn from 50..225.
        self.color = '#%02X%02X%02X' % tuple(
            random.randint(50, 225) for _ in range(3)
        )

    def CitationFileId(self):
        """Return the file part of the composite key.

        This is everything before the last '-'. When the key contains no
        '-' there is no file part and an empty string is returned (slicing
        on a failed ``rfind`` would instead cut off the last character).
        """
        return self.citationFileAndId.rpartition('-')[0]

    def CitationId(self):
        """Return the citation id: the text after the last '-' of the key.

        When the key contains no '-', the whole key is returned.
        """
        return self.citationFileAndId.rpartition('-')[2]

    def CorefCitationId(self):
        """Return the citation id of the co-referenced citation, or ''.

        The id is the text after the last '-' of ``coref`` (the whole value
        when it has no '-'). Values of four characters or fewer yield ''.
        """
        # NOTE(review): the 4-character threshold presumably filters out a
        # short "no co-reference" placeholder -- confirm against the ML output.
        if len(self.coref) > 4:
            return self.coref.rpartition('-')[2]
        return ''
class citationHtmlDocument:
    """Raw text together with the citations to highlight in it as HTML."""

    def __init__(self, citationDetails, citatation_raw_content):
        self.citationDetails = citationDetails
        self.sourceText = citatation_raw_content

    def createDocument(self):
        """Wrap every citation in a coloured ``<span>`` and return the text.

        The citation list is sorted in place by descending end offset and
        ``sourceText`` is overwritten with the decorated text.
        """
        # Handle the citation with the largest end offset first, so that
        # inserted tags do not move text at offsets still to be processed.
        self.citationDetails.sort(key=lambda detail: detail.offsetEnd, reverse=True)
        for detail in self.citationDetails:
            # Closing tag first, then the opening tag, both by offset.
            closed = (
                self.sourceText[:detail.offsetEnd]
                + '</span>'
                + self.sourceText[detail.offsetEnd:]
            )
            self.sourceText = closed
            openingTag = f'<span style="background-color: {detail.color}">'
            self.sourceText = (
                closed[:detail.offsetStart]
                + openingTag
                + closed[detail.offsetStart:]
            )
        return self.sourceText
def getCitationHtmlDetailsbyFile(citation_raw_file, citation_ml_output, citation_output_html):
    """Generate the highlighted citation HTML from files and write it out.

    Args:
        citation_raw_file: Path of the raw text file the citations refer to.
        citation_ml_output: Path of the file holding the citation ML output.
        citation_output_html: Path the generated HTML is written to. Its
            directory is not created here, so it must already exist.

    Raises:
        OSError: If an input file cannot be read or the output file cannot
            be written.
    """
    # Context managers make sure every handle is closed, even on errors.
    with open(citation_raw_file, 'r') as rawFile:
        rawFileContent = rawFile.read()
    with open(citation_ml_output, 'r') as mlOutputFile:
        mlOutputFileContent = mlOutputFile.read()

    citationHtmlContent = getCitationHtmlDetails(rawFileContent, mlOutputFileContent)

    with open(citation_output_html, 'w') as htmlFile:
        htmlFile.write(citationHtmlContent)
def getCitationHtmlDetails(rawFileContent, mlOutputFileContent):
    """Build the highlighted citation HTML from in-memory content.

    Parses ``mlOutputFileContent`` into citation details, pairs them with
    ``rawFileContent`` in a citationHtmlDocument and returns the document
    it creates.
    """
    parsedCitations = getCitationDetails(mlOutputFileContent)
    document = citationHtmlDocument(parsedCitations, rawFileContent)
    return document.createDocument()
def getCitationDetails(citationContent):
    """Parse the citation ML output into a list of citationDetail objects.

    Carriage returns, newlines and tabs are removed, the text is split on
    '<CITATION-' and every piece matching CITATION_REGX becomes one
    citationDetail. Pieces that do not match are skipped. Afterwards each
    citation that names a co-referenced citation takes over that
    citation's colour, so related citations are highlighted alike.

    Args:
        citationContent: Full text of the citation ML output.

    Returns:
        The parsed citationDetail objects, in order of appearance.

    Raises:
        ValueError: If a matched record has a non-integer offset.
    """
    # bool() of any non-empty string is True, so explicit negatives have to
    # be recognised by value. Unknown tokens stay truthy.
    # NOTE(review): the exact tokens the ML API emits are not visible here --
    # confirm this list against real output.
    falseTokens = ('', 'false', 'no', 'none', 'n', 'f', '0')

    # Compile once instead of once per record.
    citationRegex = re.compile(CITATION_REGX)

    flattened = citationContent.replace('\r', '').replace('\n', '').replace('\t', '')
    citationDetails = []
    for output in flattened.split('<CITATION-'):
        match = citationRegex.search(output)
        if match is None:
            continue
        detail = citationDetail()
        detail.citationFileAndId = match.group('CITATIONID').strip()
        detail.citationtype = match.group('TYPE').strip()
        detail.subtype = match.group('SUBTYPE').strip()
        detail.anaphoric = match.group('ANAPHORIC').strip().lower() not in falseTokens
        detail.coref = match.group('COREF').strip()
        # The name is stored without any double quotes.
        detail.name = match.group('NAME').replace('"', '').strip()
        detail.offsetStart = int(match.group('OFFSETSTART').strip())
        detail.offsetEnd = int(match.group('OFFSETEND').strip())
        citationDetails.append(detail)

    # Index by citation id; on duplicate ids the first occurrence wins.
    detailsById = {}
    for detail in citationDetails:
        detailsById.setdefault(detail.CitationId(), detail)

    for detail in citationDetails:
        corefId = detail.CorefCitationId()
        if corefId == '':
            continue
        target = detailsById.get(corefId)
        # A co-reference to a citation that was not parsed keeps its own
        # colour instead of aborting the whole parse.
        if target is not None:
            detail.color = target.color
    return citationDetails
def rreplace(input, old, new, occurrence):
    """Replace the last ``occurrence`` occurrences of ``old`` with ``new``.

    The string is split from the right at most ``occurrence`` times and the
    pieces are joined back together with ``new``.
    """
    return new.join(input.rsplit(old, maxsplit=occurrence))
def insert_text(input, index, text):
    """Return ``input`` with ``text`` inserted in front of position ``index``.

    ``index`` follows slice semantics, so values past the end append and
    negative values count from the end.
    """
    head, tail = input[:index], input[index:]
    return head + text + tail
#if __name__ == "__main__":
def htmlgeneration(path, output_dir='outputs'):
    """Generate the citation HTML file for one raw text file.

    The base name of ``path`` with every '.txt' removed is used to locate
    the ML output at ``<output_dir>/<name>-text.txt`` and to name the
    result ``<output_dir>/<name>-Html.html``. A completion message with the
    HTML path is printed at the end. For content that is already in memory,
    call getCitationHtmlDetails instead.

    Args:
        path: Path of the raw text file; both '/' and '\\' separators are
            understood when extracting the base name.
        output_dir: Directory holding the ML output and receiving the HTML.
            Defaults to 'outputs', relative to the working directory.
    """
    citation_raw_file = path
    baseName = ntpath.basename(citation_raw_file).replace('.txt', '')
    citation_ml_output = f'{output_dir}/{baseName}-text.txt'
    citation_output_html = f'{output_dir}/{baseName}-Html.html'
    getCitationHtmlDetailsbyFile(citation_raw_file, citation_ml_output, citation_output_html)
    print('Html Generation completed. Files available @ ' + citation_output_html)