-
Notifications
You must be signed in to change notification settings - Fork 0
/
hocr2html.pl
executable file
·83 lines (71 loc) · 2.03 KB
/
hocr2html.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/perl
#
# Given a hocr document, take the bboxes and
# produce an html with a word position layer.
#
# Input and output can be files or piped STDIN and STDOUT
#
# Input can be from the DB, given an imageFile name
#
use common::sense;
use OCR::hocrUtils qw( hocr2html );
use OCR::Ocrdb qw( getOCR );
#use IO::Compress::Gzip qw(gzip $GzipError) ;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
# utf-8 conversion
use Encode qw(decode encode);
use diagnostics;
use Getopt::Long;
use Data::Dumper;
# ---- Command-line options ---------------------------------------------
# --imageFile  image spec passed to getOCR() to fetch the hOCR from the DB;
#              when empty, the hOCR is read from the files named on the
#              command line, or from STDIN.
# --outfile    write the generated HTML to this file instead of STDOUT.
# --fontSize   integer handed through to hocr2html().
# --verbose    flag.
# --help       print the usage summary and exit successfully.
my $imageFile= "";
my $outfile= "";
my $fontSize= "35";
my $verbose;
my $help;

# Usage summary, shared by --help and by the bad-option error path.
my $usage = "Usage $0 ifilename > ofilename\n"
          . "or $0 < ifilename > ofilename \n"
          . "or $0 [--imageFile=DBimageSpec] \n"
          . "or $0 [--outfile=filename] --verbose\n"
          . "or $0 [--fontSize=number] \n"
          . "or $0 --help\n";

my $result = GetOptions (
    "imageFile=s" => \$imageFile, # string
    "outfile=s"   => \$outfile,   # string
    "fontSize=i"  => \$fontSize,  # integer
    "help"        => \$help,      # flag
    "verbose"     => \$verbose);  # flag

# GetOptions returns false when it met an unknown or malformed option (it
# has already warned about it on STDERR). Stop here rather than silently
# running with that option dropped.
if( !$result ) {
    print STDERR $usage;
    exit 2;
}

if( $help ) {
    print $usage;
    exit 0;
}
# ---- Obtain the hOCR input ---------------------------------------------
# Image file extension handed to hocr2html(); stays 'jpg' unless
# --imageFile carries a different one.
my $file_ext = 'jpg';
# The hOCR document to convert.
my $inhocr;
if( $imageFile eq "") {
    # No DB image requested: slurp the files named on the command line, or
    # STDIN. Undefining $/ (the input record separator) makes <> return
    # everything in a single read; 'local' confines that to this do-block.
    $inhocr = do { local $/; <> };
} else {
    # my $engine = "tess3.03-IMdivide";
    my $engine; # we want whatever engine, newest startTime
    my $gzhocr = getOCR( $imageFile, $engine);
    # Fail with a clear message instead of feeding undef to gunzip.
    defined $gzhocr
        or die "getOCR returned no data for image '$imageFile'\n";
    # uncompress it (this needs bytes, not utf-8)
    my $rawhocr;
    gunzip \$gzhocr, \$rawhocr
        or die "gunzip failed: $GunzipError\n";
    $inhocr = $rawhocr;
    # Take the text after the last dot as the extension. It must be
    # non-empty and contain no '/', so a dot inside a directory name or a
    # trailing dot leaves the 'jpg' default in place.
    if( $imageFile =~ m{ \. ([^./]+) \z }x ) {
        $file_ext = $1;
        # Diagnostic only; shown when --verbose is given.
        warn "image file extension: $file_ext\n" if $verbose;
    }
}
# ---- Convert and emit ----------------------------------------------------
# Produce the HTML from the hOCR, the requested font size and the image
# file extension.
my $layerhtml = hocr2html ( $inhocr, $fontSize, $file_ext);
if( $outfile eq "") {
    # No --outfile given: write to STDOUT so the script can be piped.
    print $layerhtml ;
} else {
    # Check every step: an unwritable path or a full disk must not pass
    # silently. Buffered write errors may only surface at close.
    open my $out, '>', $outfile
        or die "cannot open '$outfile' for writing: $!\n";
    print {$out} $layerhtml
        or die "cannot write to '$outfile': $!\n";
    close $out
        or die "cannot close '$outfile': $!\n";
}
# Reaching this point means success, so report a zero exit status (the
# failure paths above die with a non-zero status).
exit 0;
#------------------------