-
Notifications
You must be signed in to change notification settings - Fork 0
/
testTextBreaks.py
124 lines (104 loc) · 3.59 KB
/
testTextBreaks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# coding=utf-8
# A script to read in catalog data, check the OCR volumes for the line the text begins and ends on
# update and write out the data again.
#
# Usage:
# testTextBreaks.py vol=#
#
from os.path import dirname, join
from os import listdir
import sys
from codecs import open
from OCRXml import *
import time
from re import search
my_path = dirname(__file__)
textdelims = {u'རྒྱ་གར་སྐད་':0, u'བོད་སྐད་':0, u'རྫོགས་སོ':0, u'་ཕྱག་འཚལ':0, u'བཞུགས་':0, u'ཞེས་བ':0}
def searchForBreak(stline):
found = 0
for delim in textdelims:
if stline.find(delim) > -1:
textdelims[delim] += 1
found = 1
return found
# Parse arguments into hash
args= {}
for a in sys.argv:
if sys.argv.index(a) > 0:
if "=" in a:
pts = a.split("=")
args[pts[0]] = pts[1]
else:
args[sys.argv.index(a)] = a
# Set log file
ltime = time.localtime(time.time())
datearr = [str(ltime[0]), str(ltime[1]).zfill(2), str(ltime[2]).zfill(2)]
timearr = [str(ltime[3] - 5).zfill(2), str(ltime[4]).zfill(2), str(ltime[5]).zfill(2)]
dtstr = "-".join(datearr) + "-" + "".join(timearr)
logfile = join(my_path,"logs","textbreaktest_{0}.log").format(dtstr)
print "logfile is: {0}".format(logfile)
# COMMENT OUT OPENING OF LOG FILE TO GET IMMEDIATE RESULTS IN TERMINAL
sys.stdout = open(logfile, 'w', encoding='utf-8')
print 'Log file for Testing Text Breaks ({0})'.format(dtstr)
print 'arguments: {0}'.format(args)
my_path = dirname(__file__)
catpath = join(my_path, 'data', 'peltsek-with-lines.xml')
cat = Catalog.Catalog(catpath, 'Peltsek')
# Load vol ocr text names
volpath = join(my_path, '..', 'volsource')
vols = {}
vfiles = listdir(volpath)
for v in vfiles:
match = search("\-vol(\d+)\_",v)
if match:
vols[match.group(1)] = v
if args.has_key('vol'):
vkey = str(args['vol']).zfill(2)
txtct = 0
badtxt = 0
lnnotfound = 0
for vn in range(1,50):
vkey = str(vn).zfill(2)
if vols.has_key(vkey):
print "Volume", vols[vkey]
vol = OCRVolume.Vol(join(volpath, vols[vkey]), int(vkey))
vtoc = cat.get_volume_toc(int(vkey), 'texts')
# Iterate through texts in volume
for t in vtoc:
stpg = t.startpage
txtct += 1
# if it has a start page with a "." in it
if stpg is not None and '.' in stpg:
spp = t.startpage.split('.')
stline = vol.getLine(spp[0],spp[1])
if stline:
found = searchForBreak(stline)
else:
newpg = int(spp[0]) + 1
stline = vol.getLine(newpg,1)
if stline:
found = searchForBreak(stline)
else:
print "Cannot find line: {0}.{1}".format(spp[0],spp[1])
lnnotfound += 1
if found == 0:
print t.key, "(" + stpg + ") ", " no delim found:"
print stline
badtxt += 1
# if it has a start page but no "."
elif stpg is not None and stpg != "":
stline = vol.getLine(stline, 1)
if stline:
found = searchForBreak(stline)
if found == 0:
print t.key, "(" + stpg + ") ", " assuming line 1, no delim found!"
print stline
badtxt += 1
else:
print "No line number"
lnnotfound += 1
print "Total Texts: {0}".format(txtct)
print "Bad St lines: {0}".format(badtxt)
print "Lines not found or no number: {0}".format(lnnotfound)
for k in textdelims.keys():
print k, "=>", textdelims[k]