-
Notifications
You must be signed in to change notification settings - Fork 0
/
DoImage.pl
executable file
·274 lines (222 loc) · 8.2 KB
/
DoImage.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#!/usr/bin/perl
#
# Given a image file (jpeg, jpeg2000, png and tif),
# run OCR on it, generating xml, text, stats, and concordance.
# Steps:
# - check the DB, skip existing
# - preprocess the image
# - run Tesseract or Ocropus or..
# - make word stats
# - save to DB
#
# Temp files were saved to a local tree that mirrors the tdr (change this!)
# The directory will be created if it does not already exist.
#
# Before doing OCR, process the input image with 'adaptive thresholding':
# we need the image to appear like a photocopy so that Tesseract's binarization
# will be effective, though the source image might have uneven lighting and minor noise.
#
# This program is intended to be run in parallel on several servers, one instance per core.
#use common::sense;
use strict;
use warnings;
use diagnostics;
# warn user (from perspective of caller)
use Carp;
use Getopt::Long;
use Cwd 'abs_path';
use File::Path qw( make_path );
use File::Basename;
use Cwd;
use OCR::Ocrdb qw( existsOCR insertOCR );
use OCR::hocrUtils qw( hocr2words doFilterHocr);
use POSIX qw(strftime);
use IO::Compress::Gzip qw(gzip $GzipError) ;
use Graphics::Magick;
use constant { TRUE => 1, FALSE => 0 };
my $logFile;
# get imageMagick to scale and or rotate the image
# given: the input source file name
# the output file name (no extension)
# the extension of the input file
# the required brightening factor
# returns the output file name
#
# Note: GraphicsMagick is a replacement for ImageMagick,
# giving better performance and fewer feature changes with new versions
#
sub magicBrighten {
my ($sourceFile, $interfilenameNoExt, $ext, $brightenFactor) = @_;
my $ofilename = "";
# my $image = Image::Magick->new;
my $image=Graphics::Magick->new;
my $x = $image->read( $sourceFile);
if ("$x") { print $logFile "ERROR image read = $x \n"; }
# convert jpeg2000 images to jpeg
if( $image->Get('magick') eq "JP2") {
# Going for lossless conversion
$image->Set(quality=>100,compression=>'none',magick=>'jpeg');
# $image->Strip(); # strip an image of all profiles and comments. This works in ImageMagick but not in GM
$image->Profile(); # strip an image of all profiles and comments. This is the GM way instead of Strip
$ofilename = $interfilenameNoExt . '.jpg';
} else {
$ofilename = $interfilenameNoExt . $ext;
}
# my $tempfile = $interfilenameNoExt . "temp.jpg";
$image->Quantize(colorspace=>'gray');
my $q = $image->Clone();
$q->Blur( radius=>0.0, sigma=>30.0); # sigma defaults to 1.0
$x = $q->Composite ( compose=>'Divide', image=>$image );
# $x = $image->Composite ( compose=>'Divide_Src', image=>$q, composite=>'t' );
if ("$x") { print $logFile "ERROR image modu = $x \n"; }
# $x = $image->Mogrify( 'modulate', brightness=>$brightenFactor );
# if ("$x") { print $logFile "image brighten = $x \n"; }
$x = $q->Write( $ofilename);
if ("$x") { print $logFile "ERROR image write = $x \n"; }
# works well on typewriter copy
# ref http://www.imagemagick.org/Usage/compose/#divide
# this is the command line corresponding to the above perlmagick
# `convert $ofilename -colorspace gray \\( +clone -blur 0x20 \\) -compose Divide_Src -composite $tempfile`;
#tess3.03-IMlat
# `convert $ofilename -negate -lat 15x15+10% -negate $tempfile`;
# `mv $tempfile $ofilename`;
# The recommended way to destroy an object is with undef
undef $q;
undef $image;
return $ofilename;
}
# The .hocr contains these tags:
# <span
# class='ocrx_word'
# id='word_1_331'
# title='bbox 1451 679 1457 686; x_wconf 76'
# lang='eng'
# dir='ltr'>
# <em>I</em>
# </span>
##############################################
# Mainline
#
my $input = ".";
my $lang = "eng";
my $brightFactor = "99"; # obsolete
my $ocropus; # future
my $verbose;
my $help;
my $keep = FALSE; # obsolete
my $result = GetOptions (
"input=s" => \$input, # string
"lang=s" => \$lang, # string
"bright=s" => \$brightFactor, #
"ocropus" => \$ocropus, # flag
"help" => \$help, # flag
"keep" => \$keep, # flag
"verbose" => \$verbose); # flag
if( $help || $input eq "." ) {
warn "Usage $0 [--input=indirpath] [--lang=fra] --verbose\n";
warn "or $0 --help\n";
exit 0;
}
#print $logFile "WARN inupt $input \n";
my $tessver = `tesseract --version 2>&1`;
# the output is in the form of
# tesseract 3.03
# leptonica-1.71
# libjpeg 8d : libpng 1.2.51 : libtiff 4.0.3 : zlib 1.2.8
#
# we want just the version ie 3.03:
$tessver =~ s/tesseract ([0-9]*.[0-9]*).*/$1/s;
# The following gets saved in the DB ocrEngine field
my $enginePreproDescrip = "tess${tessver}-IMdivide";
open($logFile, '>>', "/var/log/ocr/DoImage.log")
|| croak "LOG open failed: $!";
my $oldfh = select($logFile); $| = 1; select($oldfh);
# example input path
# /collections/tdr/oocihm/444/oocihm.lac_reel_c8008/data/sip/data/files/1869.jpg
# remove trailing filename
my ($base, $dir, $ext) = fileparse( $input, qr/\.[^.]*/ );
#my ($base, $dir, $ext) = fileparse( $input, qr{\.stas});
my $inBase = $dir . $base . $ext;
if ($base eq "revisions") {
print $logFile "WARN pruned dir $inBase \n";
exit 0;
}
# remove the prefix directories
# previously we removed
#substr( $inBase, 0, 40) =~ s|/collections/||g ;
# now we have it automatically remove all up to the oocihm or oop:
# take a valid TDR filepath, leaving the prefix whatevers
my $matchedIt = ( $inBase =~ s|^.*?/([a-z]+/[0-9]{3}/[a-z]+\.[a-z_0-9]+?/data/sip/data/files/[0-9._a-z]+?)$|$1|i );
if( !$matchedIt) {
print $logFile "ERROR === bad dirpath $inBase = $dir . $base . $ext \n";
exit 0;
}
# check the DB to see if the item has been processed already with any version of the engine.
if( existsOCR ( $inBase, undef, $lang)) {
exit 0;
}
my $oDir = getcwd() . $dir;
my $interfilenameNoExt = $oDir . $base;
my $interfilename = $oDir . $base . $ext;
my $outHcr = $interfilename . ".hocr";
# make output dir
make_path $oDir;
my $starttime = time();
# brighten the input image by nn%
# also, convert any jp2 files to jpg and return the name
my $ofilename = magicBrighten ( $input, $interfilenameNoExt, $ext, $brightFactor);
# OCR the brightened image, producing a .hocr file:
`tesseract $ofilename $interfilename -l $lang quiet hocr`;
# check that the .hocr file was made
if ( ! -e $outHcr) {
print $logFile "ERROR ===no hcr $outHcr\n";
exit 0;
}
# remove the brightened file, which uses lots of disk space
unlink $ofilename;
# get the hocr info from the file
my $inhocr = "";
#Unset $/, the Input Record Separator, to make <> give the whole file at once.
{
local $/=undef;
open my $hocrFile, '<', $outHcr
or croak "Couldn't open $outHcr: $!";
$inhocr = <$hocrFile>;
close $hocrFile;
}
$inhocr = doFilterHocr ( $inhocr);
# save the filtered hocr to a file for debugging
# open my $out_fh, '>', $outHcr . "filt.hocr";
# print {$out_fh} $inhocr;
# close $out_fh;
# The hocr info is stored in a format that is not utf8 because
# that caused problems in the gzip compression below.
my $utf8flag1 = utf8::is_utf8($inhocr);
if( $utf8flag1) {
# we do not expect to see this in the log
print $logFile "WARN inhocr == utf8 == true \n";
}
# compress it
my $gzhocr = "";
gzip \$inhocr, \$gzhocr ;
# get some stats and text from the .hocr file
my ($avgwconf, $nwords, $nwords2, $unformattedtext, $diagnostic) = hocr2words( $outHcr);
if ( $diagnostic) {
print $logFile $diagnostic;
}
# remove the hocr file, which uses a bit of disk space
unlink $outHcr;
print $logFile "INFO nwords unique $nwords all $nwords2 $inBase \n";
# find the size of the input image
my ($device, $inode, $mode, $nlink, $uid, $gid, $rdev, $imgFileSize,
$atime, $mtime, $ctime, $blksize, $blocks) =
stat( $input);
# oops, this time should be UTC or maybe 'floating' timezone though it does not matter much.
my $time = time() - $starttime;
my $remarks = "";
# insert or replace in the DB
insertOCR ( $inBase, $enginePreproDescrip, $lang, $brightFactor, "100",
$avgwconf, $nwords,
$starttime,
$time, $remarks, $imgFileSize, $unformattedtext, $gzhocr) ;
exit 0;