-
Notifications
You must be signed in to change notification settings - Fork 0
/
findTextBreaks.py
102 lines (89 loc) · 3.62 KB
/
findTextBreaks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# coding=utf-8
# A script to read in catalog data, check the OCR volumes for the lines on which each text begins and ends,
# update the catalog with those line numbers, and write the data out again.
import os
from os.path import dirname, join
import codecs
import re
import sys
from lxml import etree
from OCRXml import Catalog, OCRVolume, Text, Functions
# Input and output locations, all expressed relative to the folder holding this script
my_path = dirname(__file__)
datafolder = join(my_path, '..', 'data')
volfolder = join(my_path, '..', 'volsource')  # Folder where vol OCR resides
catpath = join(datafolder, 'peltsek.xml')  # Path to the catalog data

# The output file name embeds the value returned by Functions.getDateTime()
# (presumably a date/time stamp, so successive runs do not overwrite each other -- confirm)
dt = Functions.getDateTime()
catout = join(datafolder, ''.join(('peltsek-with-lines_', dt, '.xml')))  # Path to write new catalog

# Instantiate the Peltsek Catalog
cat = Catalog.Catalog(catpath, 'Peltsek')
# Iterate through the volume ocr files in the given folder and, for every text the
# catalog lists in a volume, record the line its start page begins on; the previous
# text's end page is then adjusted to match.
# NOTE(review): the nesting in this block was reconstructed from a copy of the script
# whose indentation had been stripped. Confirm the placement of "lasttext = tnum" and of
# the last-text handling against the canonical file.

# Volume files are recognised by "-vol<digits>_" in their name; compiled once for the loop.
_vol_pattern = re.compile(r'\-vol(\d+)\_')


def _page_only(pagination):
    """Return the part of a "page.line" string before the first ".", or None for None."""
    if pagination is None:
        return None
    return pagination.split(".")[0]


for f in os.listdir(volfolder):
    # Determine volume info
    m = _vol_pattern.search(f)
    if m is None:
        # Stray files in the folder carry no volume number; skip them instead of
        # failing on m.group() and aborting the whole run.
        print("Skipping {0}: no volume number in file name".format(f))
        continue
    vnum = int(m.group(1))  # volume number
    volpath = join(volfolder, f)  # volume doc path
    print("Doing volume {0}".format(vnum))
    try:
        vol = OCRVolume.Vol(volpath, vnum)  # read in the vol and create object
        vtxtlist = cat.get_volume_toc(vnum, 'list')  # get text list for vol from catalog
        if not vtxtlist:
            # Nothing to annotate, and vtxtlist[-1] below would fail on an empty list
            print("No texts listed in catalog for volume {0}".format(vnum))
            continue

        lasttext = None  # key of the text handled in the previous iteration
        # Iterate through texts in the volume
        for t in vtxtlist:
            mystpg = _page_only(t['start'])  # start page without any line suffix
            tnum = int(t['key'])
            txt = cat.get_text(tnum, 'element')
            if mystpg is not None and txt is not None:
                # Find and assign start line for text; fall back to line 1 when the
                # volume reports no line (the value is concatenated below, so it is
                # assumed to be a string -- TODO confirm).
                lnum = vol.textStartLine(mystpg)
                if not lnum:
                    lnum = '1'
                txt.find('startpage').text = mystpg + '.' + lnum

                # Set end line for previous text based on this text's beginning
                if lasttext is not None:
                    prevtxt = cat.get_text(lasttext, 'element')
                    if prevtxt is not None:
                        prevend = prevtxt.find('endpage')
                        lep = prevend.text
                        if lep is not None:
                            if vol.textStartsAtTop(mystpg):
                                # This text opens its page, so the previous one ends on
                                # line 6 of the page before it.
                                lep = str(int(mystpg) - 1) + ".6"
                            else:
                                # Otherwise the previous text keeps its own end page and
                                # takes this text's start line as its end line.
                                lep = _page_only(lep) + '.' + lnum
                            prevend.text = lep
            lasttext = tnum

        # Set end page of last text in volume
        # NOTE(review): the key is passed exactly as found in the TOC entry here, whereas
        # the loop above converts it with int(); left unchanged.
        ltnum = vtxtlist[-1]['key']
        ltext = cat.get_text(ltnum, 'element')
        if ltext is not None:
            lastend = ltext.find('endpage')
            endpg = lastend.text
            if endpg is None:
                endpg = vtxtlist[-1]['end']  # fall back to the end page given in the TOC
            if endpg is not None and "." not in endpg:
                endpg = endpg + ".6"  # no line given: use line 6
            lastend.text = endpg
    except etree.XMLSyntaxError:
        print("XML Error for volume {0}".format(vnum))
    #except Exception:
    #    print "Generic error for volume {0}".format(vnum)
    #    print sys.exc_info()
    #    sys.exc_info()[2].print_stack()
# Run routine to fix missing paginations
cat.fix_missing_paginations()

# Run final check to fix problem where one text ends at e.g. 95.1 and next text starts 96.1
# Assume that 95.1 should be 95.6
prevTxt = None
for txt in cat.iter_texts("xml"):
    if prevTxt is not None:
        tst = txt.find('startpage')
        ptend = prevTxt.find('endpage')
        # An element can be present yet empty (text is None); testing the text as well
        # avoids a TypeError from the substring checks below.
        if tst is not None and ptend is not None and tst.text and ptend.text:
            # NOTE(review): this is a substring test, so a value such as "95.12" would
            # also match; kept as in the original.
            if ".1" in tst.text and ".1" in ptend.text:
                startpgnum = int(float(tst.text))  # page number without the line part
                prevendnum = int(float(ptend.text))
                if startpgnum == prevendnum + 1:
                    ptend.text = str(prevendnum) + ".6"
    prevTxt = txt
# Save the catalog, now carrying the updated line numbers, to the output path
cat.write(catout)
# Report where the result was written
report = "Output made to: {0}".format(catout)
print(report)