This repository has been archived by the owner on Sep 25, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
requestingTranskribus.py
382 lines (327 loc) · 14.5 KB
/
requestingTranskribus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
import requests
import json
import os
import datetime
from bs4 import BeautifulSoup
from config import username, password, status
from config import collectionnames as collection_list
def create_folder(directory):
"""Create a new folder.
:param directory: path to new directory
:type directory: string
"""
if not os.path.exists(directory):
os.makedirs(directory)
# ------- LOGS -------
def initiate_log():
"""Initiate a log file in __logs__ directory, name after a timestamp.
"""
collections = ' '.join(["'%s'" % collection for collection in collection_list])
path_to_file = os.path.join(path_to_logs, "log-req-%s.txt") % TIMESTAMP
intro = """
RETRIEVING PAGE-XML FILES AND METADATA FROM TRANSKRIBUS
Script ran at: %s
For collection(s): %s
---------------------
\n""" % (now, collections)
with open(path_to_file, "w") as f:
f.write(intro)
def create_separation_in_log():
"""Create a visual separation in log file.
"""
path_to_file = os.path.join(path_to_logs, "log-req-%s.txt") % TIMESTAMP
separation = "\n =================================================== \n\n"
with open(path_to_file, "a") as f:
f.write(separation)
def create_log_entry(data, error_log, pages_new, pages_inprogress, pages_done, pages_final):
"""Create simple reports in log file.
:param data: contains all data retrieved from requesting Transkribus API on a collection's documents/subcollections.
:param pages_new: list of page numbers matching "NEW" status.
:param pages_inprogress: list of page numbers matching "IN PROGRESS" status.
:param pages_done: list of page numbers matching "DONE" status.
:param pages_final: list of page numbers matching "FINAL" status.
:param error_log: message on server errors, can be empty string.
:type data: dictionary
:type pages_new: list
:type pages_inprogress: list
:type pages_final: list
:type error_log: string
"""
data = data["md"]
nr_of_pages = data["nrOfPages"]
title = data["title"]
nr_of_new = data["nrOfNew"]
nr_of_inprogress = data["nrOfInProgress"]
nr_of_done = data["nrOfDone"]
nr_of_final = data["nrOfFinal"]
report_pages = "\nDocument '%s' contains %s pages.\n" % (title, nr_of_pages)
report_status = "New: %s\nIn Progress: %s\nDone: %s\nFinal:%s\n" % (nr_of_new, nr_of_inprogress, nr_of_done, nr_of_final)
report_which_pages = ""
if len(pages_new) != 0:
pages_new = str(pages_new).strip('[]')
report_which_pages += "Following pages have status 'NEW': %s\n" % pages_new
if len(pages_inprogress) != 0:
pages_inprogress = str(pages_inprogress).strip('[]')
report_which_pages += "Following pages have status 'IN PROGRESS': %s\n" % pages_inprogress
if len(pages_done) != 0:
pages_done = str(pages_done).strip("[]")
report_which_pages += "Following pages have status 'DONE': %s\n" % pages_done
if len(pages_final) != 0:
pages_final = str(pages_final).strip("[]")
report_which_pages += "Following pages have status 'FINAL': %s\n" % pages_final
report = report_pages + report_status + report_which_pages + error_log
path_to_file = os.path.join(path_to_logs, "log-req-%s.txt") % TIMESTAMP
with open(path_to_file, "a") as f:
f.write(report)
# ------- CREATE FILES ------
def create_transcript(url_transcript, url_image, page_nb, path_to_doc, document_title, document_desc, document_lang):
"""Create xml file containing a page transcript from Transkribus, in PAGE standard. File is name after
corresponding page number.
:param url_transcript: url to request transcript, provided by Transkribus.
:param url_image: url to request image of current document/subcollection page.
:param page_nb: current page number.
:param path_to_doc: path to directory for the current document/subcollection.
:param document_title: title of current document/subcollection.
:param document_desc: description of current document/subcollection.
:param document_lang: list of languages for current document/subcollection, separated by commas.
:type url_transcript: string
:type url_image: string
:type page_nb: integer
:type path_to_doc: string
:type document_title: string
:type document_desc: string
:type document_lang: string
:return: a status to signal possible server errors.
:rtype: boolean
"""
response = requests.request("GET", url_transcript)
document_title = "<title>%s</title>" % document_title
document_desc = "<desc>%s</desc>" % document_desc
document_page_nb = "<pagenumber>%s</pagenumber>" % page_nb
tag_title = BeautifulSoup(document_title, "xml")
tag_desc = BeautifulSoup(document_desc, "xml")
tag_page_nb = BeautifulSoup(document_page_nb, "xml")
tag_title = tag_title.title.extract()
tag_desc = tag_desc.desc.extract()
tag_page_nb = tag_page_nb.pagenumber.extract()
tag_title.name = "tu:title"
tag_desc.name = "tu:desc"
tag_page_nb.name = "tu:pagenumber"
if len(document_lang) != 0:
document_lang = ''.join(["<language>%s</language>" % l.strip() for l in document_lang.split(",")])
document_lang = "<languages>%s</languages>" % document_lang
tag_languages = BeautifulSoup(document_lang, "xml")
tag_languages = tag_languages.languages.extract()
tag_lang_list = tag_languages.findAll("language")
for tag in tag_lang_list:
tag.name = "tu:language"
if response.status_code == 503:
error = True
else:
error = False
xml = response.text
path_to_file = os.path.join(path_to_doc, "%s.xml") % page_nb
soup = BeautifulSoup(xml, "xml")
# Adding namespace declaration for element added by Time Us project
# Adding attributes to Page elements : @timeUs:url and @timeUs:id
if soup.PcGts:
soup.PcGts["xmlns:tu"] = "timeUs"
soup.Page["tu:url"] = url_image
soup.Page["tu:id"] = page_nb
soup.Metadata.append(tag_title)
soup.Metadata.append(tag_desc)
soup.Metadata.append(tag_page_nb)
if len(document_lang) != 0:
for tag in tag_lang_list:
soup.Metadata.append(tag)
with open(path_to_file, "w") as f:
f.write(str(soup))
return error
def get_transcripts(data, path_to_doc):
"""Identify page numbers matching targeted status.
:param data: contains all data retrieved from requesting Transkribus API on a collection's documents/subcollections.
:param path_to_doc: path to directory for the current document/subcollection.
:type data: dictionary
:type path_to_doc: string
:return: sum of all signals of server errors on pages in current document/subcollection.
:rtype: integer
"""
pages_new = []
pages_inprogress = []
pages_done = []
pages_final = []
errors = 0
page_list = data["pageList"]["pages"]
document_title = data["md"]["title"]
if "desc" in data["md"]:
document_desc = data["md"]["desc"]
else:
document_desc = "No description."
if "language" in data["md"]:
document_lang = data["md"]["language"]
else:
document_lang = ""
# Reporting
print("Creating xml files for %s" % document_title)
for page in page_list:
match = False
latest_transcript = page["tsList"]["transcripts"][0]
for stat in status:
if latest_transcript["status"] == stat:
match = True
if match is True:
url_transcript = latest_transcript["url"]
url_image = page["url"]
page_nb = latest_transcript["pageNr"]
page_stat = latest_transcript["status"]
error = create_transcript(url_transcript, url_image, page_nb, path_to_doc, document_title, document_desc, document_lang)
if error is True:
errors += 1
if page_stat == "NEW":
pages_new.append(page_nb)
elif page_stat == "IN PROGRESS":
pages_inprogress.append(page_nb)
elif page_stat == "DONE":
pages_done.append(page_nb)
elif page_stat == "FINAL":
pages_final.append(page_nb)
if errors == 0:
error_log = "\n"
else:
error_log = "%s server error(s) while retreiving xml files.\n" % errors
create_log_entry(data, error_log, pages_new, pages_inprogress, pages_done, pages_final)
return errors
def get_metadata(data):
"""Create JSON file containing document/subcollection's metadata in collection's directory.
:param data: contains all data retrieved from requesting Transkribus API on a collection's documents/subcollections.
:type data: dictionary
:return: path to directory for the current document/subcollection.
:rtype: string
"""
metadata = data["md"]
document_id = metadata["docId"]
document_title = metadata["title"]
document_title = document_title.replace("/", "-")
path_to_doc = os.path.join(path_to_col, "%s - %s") % (document_id, document_title)
create_folder(path_to_doc)
# Create metadata.json file for document
path_to_file = os.path.join(path_to_doc, "metadata.json")
with open(path_to_file, 'w') as file:
text = json.dumps(metadata)
file.write(text)
return path_to_doc
# ------- REQUESTS -------
def get_session_id():
"""Request Transkribus API to authenticate.
:return: session id for authentication.
:rtype: string
"""
url = "https://transkribus.eu/TrpServer/rest/auth/login"
payload = 'user=' + username + '&' + 'pw=' + password
headers = {'Content-Type': 'application/x-www-form-urlencoded'}
response = requests.request("POST", url, data=payload, headers=headers)
try:
soup = BeautifulSoup(response.text, "xml")
session_id = soup.sessionId.string
# Reporting
print("Successfully connected; session ID: %s." % session_id)
except Exception as e:
print(e)
print("Connection failed.")
session_id = None
return session_id
def get_collection_id(session_id):
"""Request Trankribus API to list collections accessible for current user and retrieve id of targeted collection.
:param session_id: session id for authentication.
:type session_id: string
:return: numerical id for targeted collection.
:rtype: integer
"""
url = "https://transkribus.eu/TrpServer/rest/collections/list"
querystring = {"JSESSIONID": session_id}
response = requests.request("GET", url, params=querystring)
json_file = json.loads(response.text)
collection_id = ''
for collection in json_file:
if collection['colName'] == collection_name:
collection_id = collection['colId']
# Reporting
if not collection_id:
print('Verify targeted collection name or user\'s rights, no collection named %s!' % collection_name)
else:
print("Found %s; ID: %s." % (collection_name, collection_id))
return collection_id
def get_document_id(session_id, collection_id):
"""Request Transkribus API to list documents/subcollections in targeted collection.
:param session_id: session id for authentication.
:param collection_id: numerical id for targeted collection.
:type session_id: string
:type collection_id: integer
:return: list of numerical id for documents/subcollections in targeted collection.
:rtype: list
"""
url = "https://transkribus.eu/TrpServer/rest/collections/%s/list" % collection_id
querystring = {"JSESSIONID": session_id}
response = requests.request("GET", url, params=querystring)
json_file = json.loads(response.text)
document_list = [document["docId"] for document in json_file]
# Reporting
id_list = ", ".join(map(str, document_list))
print("Found following document IDs in '%s' collection: %s." % (collection_name, id_list))
return document_list
def get_document_pages(session_id, collection_id, document_id):
"""Request Transkribus API to get document/subcollection's metadata.
:param session_id: session id for authentication.
:param collection_id: numerical id for targeted collection.
:param document_id: numerical id for document/subcollection in targeted collection.
:type session_id: string
:type collection_id: integer
:type document_id: integer
:return: sum of all signals of server errors for current document/subcollection.
:rtype: integer
"""
total_errors = 0
url = "https://transkribus.eu/TrpServer/rest/collections/%s/%s/fulldoc" % (collection_id, document_id)
querystring = {"JSESSIONID": session_id}
response = requests.request("GET", url, params=querystring)
json_file = json.loads(response.text)
# Get metadata and create metadata file and log
path_to_doc = get_metadata(json_file)
errors = get_transcripts(json_file, path_to_doc)
total_errors += errors
return total_errors
# ======================= #
now = datetime.datetime.now()
TIMESTAMP = "%s-%s-%s-%s-%s" % (now.year, now.month, now.day, now.hour, now.minute)
invalid_status = []
for stat in status:
if not (stat == "NEW" or stat == "IN PROGRESS" or stat == "DONE" or stat == "FINAL"):
print("Warning! %s is not a valid satus" % stat)
invalid_status.append(stat)
if len(invalid_status) > 0:
[status.remove(wrong) for wrong in invalid_status]
if len(status) > 0:
print("Working with valid status: %s" % (str(status).strip('[]')))
if len(status) > 0:
cwd = os.path.dirname(os.path.abspath(__file__))
path_to_data = os.path.join(cwd, "data")
path_to_logs = os.path.join(cwd, "__logs__")
session_id = get_session_id()
if session_id:
initiate_log()
for collection_name in collection_list:
path_to_col = os.path.join(path_to_data, collection_name)
collection_id = get_collection_id(session_id)
total_errors = 0
if collection_id:
list_of_document_id = get_document_id(session_id, collection_id)
print("Creating new folder in data/ for %s collection if does not already exist." % collection_name)
create_folder(path_to_col)
for document_id in list_of_document_id:
errors = get_document_pages(session_id, collection_id, document_id)
total_errors += errors
if total_errors != 0:
print("Warning! %s server error(s) while retrieving xml files!" % total_errors)
create_separation_in_log()
else:
print("No valid status to work with.")