This repository has been archived by the owner on Jan 20, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 57
/
tests.rb
executable file
·608 lines (497 loc) · 33.5 KB
/
tests.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
#!/usr/bin/env jruby -J-Djava.awt.headless=true
# -*- coding: utf-8 -*-
require 'minitest'
require 'minitest/autorun'
require 'csv'
require_relative '../lib/tabula'
java_import Java::TechnologyTabula::Rectangle
# Returns the rows of +table+ as nested arrays of stripped cell text
# (delegates the conversion to lines_to_array).
def table_to_array(table)
  rows = table.rows
  lines_to_array(rows)
end
# Maps each line (a collection of objects responding to #text) to an array
# holding the whitespace-stripped text of each of its elements.
def lines_to_array(lines)
  lines.map { |line| line.map { |element| element.text.strip } }
end
# Builds a Tabula::Table from +lines+ by first reducing them to arrays of
# stripped text (see lines_to_array).
def lines_to_table(lines)
  text_rows = lines_to_array(lines)
  Tabula::Table.new_from_array(text_rows)
end
# I don't want to pollute the "real" class with a funny inspect method. Just for testing comparisons.
module Tabula
  class Table
    # Test-only override used when tables are printed (e.g. in assertion
    # failure messages). Each row is rendered as its cell texts joined by
    # commas.
    #
    # The result is a String: Object#inspect is expected to return one, and
    # Minitest post-processes the inspect output as a String when it builds
    # failure messages, so returning the bare Array would make a failing
    # comparison raise instead of reporting the difference.
    def inspect
      getRows.map { |row| row.map(&:getText).join(",") }.inspect
    end
  end
end
module Tabula
  class Line
    # Test-only override: renders the line as the inspected array of its
    # elements' text, using an empty string for nil elements.
    def inspect
      texts = @text_elements.map do |element|
        if element.nil?
          ''
        else
          element.text
        end
      end
      texts.inspect
    end
  end
end
# Checks the equality behaviour of Tabula::TextElement and Tabula::Line.
class TestEntityComparability < Minitest::Test
  # Elements with the same text (ignoring surrounding whitespace) must compare
  # equal even when their numeric constructor arguments differ; elements whose
  # text differs must compare unequal.
  def test_text_element_comparability
    base = Tabula::TextElement.new(0, 0, 0, 0, nil, 0, "Jeremy", 0, 0)
    expected_equal = [
      Tabula::TextElement.new(0, 0, 0, 0, nil, 0, " Jeremy \n", 0, 0),
      Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88, 0),
      Tabula::TextElement.new(5, 7, 1212, 121, nil, 15, "Jeremy", 55, 0)
    ]
    expected_unequal = [
      Tabula::TextElement.new(5, 7, 1212, 121, nil, 15, "jeremy b", 55, 0),
      Tabula::TextElement.new(5, 7, 1212, 121, nil, 15, "jeremy kj", 55, 0),
      Tabula::TextElement.new(0, 0, 0, 0, nil, 0, "jeremy kj", 55, 0)
    ]

    expected_equal.each { |other| assert_equal base, other }
    expected_unequal.each { |other| refute_equal base, other }
  end

  # Lines compare equal when their text elements match; trailing EMPTY
  # elements do not make a line unequal, but an extra non-empty element does.
  #
  # NOTE(review): this test builds TextElement with 8 arguments while
  # test_text_element_comparability passes 9 -- confirm both arities are
  # supported by the constructor.
  def test_line_comparability
    text_base = Tabula::TextElement.new(0, 0, 0, 0, nil, 0, "Jeremy", 0)
    text_two = Tabula::TextElement.new(0, 0, 0, 0, nil, 0, " Jeremy \n", 0)
    text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
    text_five = Tabula::TextElement.new(5, 7, 1212, 121, nil, 15, "jeremy b", 55)
    empty = Tabula::TextElement::EMPTY

    line_base = line_with(text_base, text_two, text_three)
    line_equal = line_with(text_base, text_two, text_three)
    line_equal_but_longer = line_with(text_base, text_two, text_three, empty, empty)
    line_unequal = line_with(text_base, text_two, text_three, text_five)
    line_unequal_and_longer = line_with(text_base, text_two, text_three, text_five, empty, empty)
    line_unequal_and_longer_and_different =
      line_with(text_base, text_two, text_three, text_five, empty, 'whatever')

    assert_equal line_base, line_equal
    assert_equal line_base, line_equal_but_longer
    refute_equal line_base, line_unequal
    refute_equal line_base, line_unequal_and_longer
    refute_equal line_base, line_unequal_and_longer_and_different
  end

  private

  # Builds a Tabula::Line whose text_elements are the given +elements+.
  def line_with(*elements)
    line = Tabula::Line.new
    line.text_elements = elements
    line
  end
end
# Exercises Tabula::Extraction::PagesInfoExtractor against data/gre.pdf.
class TestPagesInfoExtractor < Minitest::Test
  # Every page yielded must be a Tabula::Page, and exactly two are expected.
  def test_pages_info_extractor
    pdf_path = File.expand_path('data/gre.pdf', File.dirname(__FILE__))
    extractor = Tabula::Extraction::PagesInfoExtractor.new(pdf_path)

    page_count = 0
    extractor.pages.each do |page|
      assert_instance_of Tabula::Page, page
      page_count += 1
    end

    assert_equal 2, page_count
  end
end
# Basic ObjectExtractor checks against data/gre.pdf.
class TestDumper < Minitest::Test
  # The first extracted object must be a Tabula::Page.
  def test_extractor
    extractor = Tabula::Extraction::ObjectExtractor.new(gre_pdf_path)
    first_page = extractor.extract.first
    extractor.close!

    assert_instance_of Tabula::Page, first_page
  end

  # get_text on the given rectangle of the first page must return 151 items.
  def test_get_by_area
    extractor = Tabula::Extraction::ObjectExtractor.new(gre_pdf_path)
    first_page = extractor.extract.first
    characters = first_page.get_text(107.1, 60.10, 313.65, 291.79)
    extractor.close!

    assert_equal 151, characters.size
  end

  private

  # Absolute path of the data/gre.pdf fixture next to this file.
  def gre_pdf_path
    File.expand_path('data/gre.pdf', File.dirname(__FILE__))
  end
end
# Checks Tabula::Ruling.find_intersections on hand-built rulings.
class TestRulingIntersection < Minitest::Test
  # One horizontal ruling crossed by two vertical rulings must yield two
  # intersections at x = 3 and x = 4 (both at y = 10); a vertical ruling that
  # starts below the horizontal one must yield none.
  #
  # Assertions follow Minitest's assert_equal(expected, actual) order so that
  # failure messages label the values correctly.
  def test_ruling_intersection
    horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
    verticals = [Tabula::Ruling.new(1, 3, 0, 11),
                 Tabula::Ruling.new(1, 4, 0, 11)]
    ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
    assert_equal 2, ints.size
    assert_equal 3.0, ints[0][0].getX
    assert_equal 10.0, ints[0][0].getY
    assert_equal 4.0, ints[1][0].getX
    assert_equal 10.0, ints[1][0].getY

    verticals = [Tabula::Ruling.new(20, 3, 0, 11)]
    ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
    assert_equal 0, ints.size
  end
end
class TestExtractor < Minitest::Test
# Extracts the same area from pages 1 to 3 of data/strongschools.pdf and
# compares the first two rows of each page (header plus first record).
def test_extraction_of_multiple_pages
  header = ["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"]
  first_record_by_page = [
    ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"],
    ["Filice", "Gregory A", "15 Crocus Place", "Saint Paul", "MN", "55102", "Physician", "Veterans Affairs Medical Cent", "10/3/2012", "100.00"],
    ["Skovolt", "Glen and Anna", "1473 Grantham St.", "Saint Paul", "MN", "55108", "retired", "", "9/12/2012", "100.00"]
  ]

  first_record_by_page.each_with_index do |first_record, index|
    extracted = Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
                                     index + 1,
                                     [52.32857142857143, 15.557142857142859, 128.70000000000002, 767.9571428571429],
                                     :detect_ruling_lines => true)
    rows = table_to_array(extracted)
    assert_equal [header, first_record], rows[0...2]
  end
end
# Extracts every page of data/strongschools.pdf (:all) and checks that the
# merged text of each page contains the word listed for that page index.
def test_extraction_of_all_pages
  pdf_path = File.expand_path('data/strongschools.pdf', File.dirname(__FILE__))
  expected_unique_words_per_page = ["Lidstad", "Filice", "Spencer de Gutierrez", "Mancinis", "D'Aquila"]
  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, :all)
  begin
    pages = extractor.extract.to_a
    pages.each_with_index do |pdf_page, idx|
      text_chunks = Tabula::TextElement.merge_words(pdf_page.texts)
      page_text_joined = text_chunks.map(&:text).join(" ")
      assert_includes page_text_joined, expected_unique_words_per_page[idx],
                      "expected word missing from page #{idx + 1}"
    end
  ensure
    # The extractor was previously never closed; release it even when an
    # assertion fails.
    extractor.close!
  end
end
# Extracts a three-column table from page 1 of data/gre.pdf with the
# 'original' extraction method and compares every row.
def test_table_extraction_1
  pdf_path = File.expand_path('data/gre.pdf', File.dirname(__FILE__))
  extracted = Tabula.extract_table(pdf_path,
                                   1,
                                   [107.1, 57.9214, 394.5214, 290.7],
                                   :detect_ruling_lines => false,
                                   :extraction_method => 'original')
  rows = table_to_array(extracted)

  expected = [
    ["Prior Scale", "New Scale", "% Rank*"],
    ["800", "170", "99"],
    ["790", "170", "99"],
    ["780", "170", "99"],
    ["770", "170", "99"],
    ["760", "170", "99"],
    ["750", "169", "99"],
    ["740", "169", "99"],
    ["730", "168", "98"],
    ["720", "168", "98"],
    ["710", "167", "97"],
    ["700", "166", "96"],
    ["690", "165", "95"],
    ["680", "165", "95"],
    ["670", "164", "93"],
    ["660", "164", "93"],
    ["650", "163", "91"]
  ]
  assert_equal expected, rows
end
# Extracts the voting-record table from page 1 of
# data/argentina_diputados_voting_record.pdf (without ruling-line detection)
# and compares every row.
#
# The expected rows are written one per line. The "Cdad. Aut. Bs. As." value
# of the BRAWER row used to be broken across two source lines, which embedded
# a newline in the expected string; it is now a single-line literal matching
# the identical value in the BIANCHI row.
def test_diputados_voting_record
  table = table_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
                                              1,
                                              [269.875, 12.75, 792.5, 565],
                                              :detect_ruling_lines => false)
  expected = [
    ["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"],
    ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"],
    ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"],
    ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"],
    ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"],
    ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"],
    ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"],
    ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"],
    ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"],
    ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"],
    ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"],
    ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"],
    ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"],
    ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"],
    ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"],
    ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"],
    ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"],
    ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"],
    ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"],
    ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"],
    ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"],
    ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"],
    ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"],
    ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"],
    ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"],
    ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"],
    ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"],
    ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"],
    ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"],
    ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"],
    ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]
  ]
  assert_equal expected, table
end
# Builds a table from a small area of the first page of
# data/frx_2012_disclosure.pdf, using the page's vertical ruling lines as
# column separators, and checks that "REGIONAL PULMONARY & SLEEP" keeps the
# spaces around its ampersand.
def test_missing_spaces_around_an_ampersand
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
  character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
  begin
    page_obj = character_extractor.extract.first
    vertical_rulings = page_obj.ruling_lines.select(&:vertical?)

    top, left, bottom, right = 170, 28, 185, 833

    expected = Tabula::Table.new_from_array([
      ["", "REGIONAL PULMONARY & SLEEP"],
      ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
      ["", "MEDICINE"]
    ])
    area = page_obj.get_area(top, left, bottom, right)
    actual = lines_to_table(area.make_table(:vertical_rulings => vertical_rulings))

    assert_equal expected, actual
  ensure
    # Close the extractor even when the assertion above fails.
    character_extractor.close!
  end
end
# TODO Spaces inserted in words - fails
#
# Extracts a six-column table from page 1 of data/bo_page24.pdf (without
# ruling-line detection) and compares every row.
def test_bo_page24
  pdf_path = File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__))
  extracted = Tabula.extract_table(pdf_path,
                                   1,
                                   [425.625, 53.125, 575.714, 810.535],
                                   :detect_ruling_lines => false)
  rows = table_to_array(extracted)

  expected = [
    ["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"],
    ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"],
    ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"],
    ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"],
    ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"],
    ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"],
    ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"],
    ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"],
    ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"],
    ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"],
    ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"],
    ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]
  ]
  assert_equal expected, rows
end
# If a vertical ruling crosses over a word, the word should be split at that
# vertical ruling; before, the entire word would end up on one side of it.
#
# N.B. it's "MORGANTOWN", "WV" that we're most interested in here (it used to
# show up as ["MORGANTOWNWV", "", ""]).
def test_vertical_rulings_splitting_words
  pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))

  # An alternative result would also be semantically "correct"; the difference
  # is in how multi-line cells are handled. In that variant the third row's
  # fourth column reads "TIMOTHY GARTH" and a fourth row
  # ["", "", "", "ABRAHAMSON"] follows. Only the variant below is asserted.
  expected = Tabula::Table.new_from_array([
    ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
    ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
    ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
  ])

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2) #:all ) # 1..2643
  begin
    extractor.extract.each do |pdf_page|
      # The ruling positions below are given on a 1700-unit-wide scale and
      # are rescaled to the actual page width.
      scale_factor = pdf_page.width / 1700
      vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map do |n|
        Tabula::Ruling.new(0, n * scale_factor, 0, 1000)
      end

      area = pdf_page.get_area(250, 0, 325, 1700)
      table_lines = area.make_table(:vertical_rulings => vertical_rulings)
      assert_equal expected, lines_to_table(table_lines)
    end
  ensure
    # Close the extractor even when an assertion fails.
    extractor.close!
  end
end
# Extracts a table from page 1 of data/campaign_donors.pdf with explicit
# vertical rulings and compares every row.
def test_vertical_rulings_prevent_merging_of_columns
  expected = [
    ["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"],
    ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]
  ]

  ruling_positions = [147, 256, 310, 375, 431, 504]
  vertical_rulings = ruling_positions.map { |position| Tabula::Ruling.new(0, position, 0, 1000) }

  pdf_path = File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__))
  extracted = Tabula.extract_table(pdf_path,
                                   1,
                                   [255.57, 40.43, 398.76, 557.35],
                                   :vertical_rulings => vertical_rulings)
  assert_equal expected, table_to_array(extracted)
end
# Extracts the top of the table on page 1 of data/strongschools.pdf (with
# ruling-line detection) and compares all seven rows.
def test_get_spacing_and_merging_right
  pdf_path = File.expand_path('data/strongschools.pdf', File.dirname(__FILE__))
  extracted = Tabula.extract_table(pdf_path,
                                   1,
                                   [52.32857142857143, 15.557142857142859, 128.70000000000002, 767.9571428571429],
                                   :detect_ruling_lines => true)
  rows = table_to_array(extracted)

  expected = [
    ["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"],
    ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"],
    ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"],
    ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"],
    ["Schumacher / Bales", "Douglas L. / Patricia", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"],
    ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"],
    ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]
  ]
  assert_equal expected, rows
end
# Minimal host for the Tabula::HasCells mixin that is handed its cells
# directly.
class SpreadsheetsHasCellsTester
  include Tabula::HasCells

  attr_reader :cells
  attr_writer :cells

  # Stores +cells+ as given.
  def initialize(cells)
    @cells = cells
  end
end
# Minimal host for the Tabula::HasCells mixin that starts with an empty cell
# list and runs find_cells! on the supplied ruling lines.
class CellsHasCellsTester
  include Tabula::HasCells

  attr_accessor :cells, :horizontal_ruling_lines, :vertical_ruling_lines

  # Note the argument order: verticals first here, while find_cells! is
  # called with horizontals first.
  def initialize(vertical_ruling_lines, horizontal_ruling_lines)
    @vertical_ruling_lines = vertical_ruling_lines
    @horizontal_ruling_lines = horizontal_ruling_lines
    @cells = []
    find_cells!(@horizontal_ruling_lines, @vertical_ruling_lines)
  end
end
#just tests the algorithm
#
# Feeds hand-built rulings to find_cells! (through CellsHasCellsTester) and
# compares the resulting cells, ignoring order, with the expected set.
def test_lines_to_cells
  vertical_ruling_lines = [
    Tabula::Ruling.new(40.0, 18.0, 0.0, 40.0),
    Tabula::Ruling.new(44.0, 70.0, 0.0, 36.0),
    Tabula::Ruling.new(40.0, 226.0, 0.0, 40.0)
  ]

  # Nine horizontal rulings that differ only in their first argument.
  horizontal_ruling_lines = [40.0, 44.0, 50.0, 54.0, 60.0, 64.0, 70.0, 74.0, 80.0].map do |top|
    Tabula::Ruling.new(top, 18.0, 208.0, 0.0)
  end

  # One full-width cell on top, then two columns sharing the same seven
  # (top, height) bands.
  bands = [[44.0, 6.0], [50.0, 4.0], [54.0, 6.0], [60.0, 4.0], [64.0, 6.0], [70.0, 4.0], [74.0, 6.0]]
  expected_cells = [Tabula::Cell.new(40.0, 18.0, 208.0, 4.0)]
  [[18.0, 52.0], [70.0, 156.0]].each do |left, width|
    bands.each do |top, height|
      expected_cells << Tabula::Cell.new(top, left, width, height)
    end
  end

  tester = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines)
  actual_cells = tester.cells

  assert_equal Set.new(expected_cells), Set.new(actual_cells) #I don't care about order
end
#this is the real deal!!
#
# For every page of data/frx_2012_disclosure.pdf, the TSV rendering of the
# page's first spreadsheet must equal the contents of
# data/frx_2012_disclosure.tsv.
def test_extract_tabular_data_using_lines_and_spreadsheets
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
  expected_data_path = File.expand_path('data/frx_2012_disclosure.tsv', File.dirname(__FILE__))
  # File.read closes the file itself; open(...).read left the handle open.
  expected = File.read(expected_data_path)

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all)
  begin
    extractor.extract.each do |pdf_page|
      spreadsheet = pdf_page.spreadsheets.first
      assert_equal expected, spreadsheet.to_tsv
    end
  ensure
    # Close the extractor even when an assertion fails.
    extractor.close!
  end
end
# Currently skipped (see the skip message). The remaining code expects the
# first page of data/no_tables.pdf to yield no spreadsheets when extraction
# uses a line colour filter that accepts only colours whose components are
# all below 0.1.
def test_cope_with_a_tableless_page
  skip("line_color_filter unimplemented in tabula-java for now, see https://github.com/tabulapdf/tabula-java/issues/21")

  pdf_file_path = File.expand_path('data/no_tables.pdf', File.dirname(__FILE__))
  color_filter = lambda { |components| components.all? { |c| c < 0.1 } }
  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all, '',
                                                      :line_color_filter => color_filter)

  first_page = extractor.extract.to_a.first
  spreadsheets = first_page.spreadsheets
  extractor.close!

  assert_equal 0, spreadsheets.size
end
# The CSV rendering of the first spreadsheet on page 1 of
# data/spanning_cells.pdf must equal the contents of data/spanning_cells.csv.
def test_spanning_cells
  pdf_file_path = File.expand_path('data/spanning_cells.pdf', File.dirname(__FILE__))
  expected_data_path = File.expand_path('data/spanning_cells.csv', File.dirname(__FILE__))
  # File.read closes the file itself; open(...).read left the handle open.
  expected = File.read(expected_data_path)

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1])
  begin
    extractor.extract.each do |pdf_page|
      spreadsheet = pdf_page.spreadsheets.first
      assert_equal expected, spreadsheet.to_csv
    end
  ensure
    # Close the extractor even when an assertion fails.
    extractor.close!
  end
end
# Crops the ruling lines of page 1 of data/puertos1.pdf to a rectangle and
# expects 15 of the cropped rulings to be vertical.
def test_almost_vertical_lines
  pdf_file_path = File.expand_path('data/puertos1.pdf', File.dirname(__FILE__))
  top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
  # Rectangle is built from top, left, width and height.
  area = Rectangle.new(top, left,
                       right - left, bottom - top)

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1])
  begin
    extractor.extract.each do |pdf_page|
      rulings = Tabula::Ruling.crop_rulings_to_area(pdf_page.ruling_lines, area)
      # TODO assertion not entirely correct, should do the trick for now
      assert_equal 15, rulings.select(&:vertical?).count
    end
  ensure
    # Close the extractor even when an assertion fails.
    extractor.close!
  end
end
# Restricts page 1 of data/puertos1.pdf to an area, takes the first
# spreadsheet found there and checks its row count plus its first and last
# rows.
def test_extract_spreadsheet_within_an_area
  pdf_file_path = File.expand_path('data/puertos1.pdf', File.dirname(__FILE__))
  top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1])
  begin
    pdf_page = extractor.extract.first
    area = pdf_page.get_area(top, left, bottom, right)
    table = area.spreadsheets.first.getRows.map { |r| r.map(&:getText) }

    assert_equal 15, table.length
    assert_equal ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"], table.first
    assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
  ensure
    # Close the extractor even when an assertion fails.
    extractor.close!
  end
end
# Extracts a table from page 1 of data/nyc_2013fiscalreporttables.pdf with
# the 'original' method and checks two amounts in its second row.
def test_remove_repeated_text
  area = [101.82857142857144, 48.08571428571429, 497.8285714285715, 765.1285714285715]
  pdf_path = File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__))
  extracted = Tabula.extract_table(pdf_path,
                                   1,
                                   area,
                                   :detect_ruling_lines => false,
                                   :extraction_method => 'original')

  second_row = table_to_array(extracted)[1]
  assert_equal "$ 18,969,610", second_row[1]
  assert_equal "$ 18,157,722", second_row[2]
end
# one of those PDFs that put characters on top of another to make text "bold"
#
# The first cell extracted from the given area of data/wc2012.pdf must read
# "Community development". The assertion uses Minitest's
# assert_equal(expected, actual) order so failures are labelled correctly.
def test_remove_overlapping_text
  top, left, bottom, right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
  table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
                               1,
                               [top, left, bottom, right],
                               :detect_ruling_lines => false,
                               :extraction_method => 'original')
  ary = table_to_array(table)
  assert_equal "Community development", ary.first.first
end
# Collects the text of every spreadsheet cell on page 1 of
# data/sydney_disclosure_contract.pdf with line returns enabled, and compares
# the list with the expected strings (which contain "\r" separators).
def test_cells_including_line_returns
  pdf_file_path = File.expand_path('data/sydney_disclosure_contract.pdf', File.dirname(__FILE__))
  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1])

  data = []
  extractor.extract([1.to_java(:int)]).each do |pdf_page|
    pdf_page.spreadsheets.each do |spreadsheet|
      spreadsheet.cells.each do |cell|
        # this pattern is deprecated and maintained only for backwards-compatibility. please don't copy it.
        # data << cell.text(true) would be sufficient in modern Tabula for all three following lines.
        cell.options = { :use_line_returns => true, :cell_debug => 0 }
        data.push(cell.text)
      end
    end
  end
  extractor.close!

  expected = [
    "1295",
    "Name: Reino International Pty Ltd trading as Duncan Solutions \rAddress: 15/39 Herbet Street, St Leonards NSW 2065",
    "N/A",
    "Effective Date: 13 May 2013 \rDuration: 15 Weeks",
    "Supply, Installation and Maintenance of Parking Ticket Machines",
    "$3,148,800.00exgst",
    "N/A",
    "N/A",
    "Open Tender \rTender evaluation criteria included: \r- The schedule of prices \r- Compliance with technical specifications/Technical assessment \r- Operational Plan including maintenance procedures"
  ]
  assert_equal expected, data
end
# Every row extracted from the given area of data/repeated_spaces.pdf must
# have exactly 7 columns. The assertion uses Minitest's
# assert_equal(expected, actual) order so failures are labelled correctly.
def test_remove_repeated_spaces
  top, left, bottom, right = 304.9375, 78.625, 334.6875, 501.5
  table = Tabula.extract_table(File.expand_path('data/repeated_spaces.pdf', File.dirname(__FILE__)),
                               1,
                               [top, left, bottom, right],
                               :detect_ruling_lines => false,
                               :extraction_method => 'original')
  table_to_array(table).each do |row|
    assert_equal 7, row.size
  end
end
# Extracts a table from page 1 of data/monospaced1.pdf with the 'original'
# method and compares every row.
def test_monospaced_table
  area = [149.9142857142857, 89.10000000000001, 243.25714285714287, 721.2857142857143]
  pdf_path = File.expand_path('data/monospaced1.pdf', File.dirname(__FILE__))
  extracted = Tabula.extract_table(pdf_path,
                                   1,
                                   area,
                                   :detect_ruling_lines => false,
                                   :extraction_method => 'original')

  expected = [
    ["ALBERT LEA, MAYO CLINIC HEALTH SYS- ALBE", "0", "0", "0", "7", "7", ".0", ".0", ".0", "23.3", "10.4"],
    ["ROCHESTER, MAYO CLINIC METHODIST HOSPITA", "6", "7", "14", "11", "25", "27.3", "100.0", "37.8", "36.7", "37.3"],
    ["ROCHESTER, MAYO CLINIC ST. MARYS", "9", "0", "11", "7", "18", "40.9", ".0", "29.7", "23.3", "26.9"],
    ["BLUE EARTH, UNITED HOSPITAL DISTRICT", "3", "0", "4", "0", "4", "13.6", ".0", "10.8", ".0", "6.0"],
    ["FAIRMONT, MAYO CLINIC HEALTH SYSTEM -FAI", "1", "0", "2", "1", "3", "4.5", ".0", "5.4", "3.3", "4.5"],
    ["MANKATO, MAYO CLINIC HEALTH SYSTEM- MANK", "3", "0", "5", "3", "8", "13.6", ".0", "13.5", "10.0", "11.9"],
    ["ALL REGION 4 (TC) HOSPITALS", "0", "0", "1", "1", "2", ".0", ".0", "2.7", "3.3", "3.0"],
    ["", "22", "7", "37", "30", "67", "100.0", "100.0", "100.0", "100.0", "100.0"]
  ]
  assert_equal expected, table_to_array(extracted)
end
# Extracts the table of page 1 of data/monospaced_ascii_sep.pdf through
# extract_page(1).get_table and compares every row.
def test_monospaced_table_ascii_line_separator
  pdf_path = File.expand_path('data/monospaced_ascii_sep.pdf', File.dirname(__FILE__))
  expected = [
    ["Column A", "* ColB1", "ColB2 ColB3", "* Column C"],
    ["Value 1", "* 23.5", "66.811.0", "* Name 1"],
    ["Value 2", "* 33.2", "56.312.0", "* Name 2"],
    ["Value 3", "* 123.3", "200.4 123.9", "* Name 3"],
    ["Value 4", "* 24.5", "66.811.0", "* Name 1"],
    ["Value 5", "* 43.2", "80.114.5", "* Name 2"],
    ["Value 6", "* 100.6", "190.4 120.3", "* Name 3"],
    ["Value 7", "* 11.5", "66.811.0", "* Name 1"],
    ["Value 8", "* 37.4", "77.420.1", "* Name 2"],
    ["Value 9", "* 883.3", "110.4 111.2", "* Name 3"]
  ]

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [1])
  begin
    table = extractor.extract_page(1).get_table
    assert_equal expected, table_to_array(table)
  ensure
    # The extractor was previously never closed; release it even when the
    # assertion fails.
    extractor.close!
  end
end
# Extracts a single-row area from page 1 of data/indecago10.pdf with the
# 'original' method and checks the columns of its first row.
def test_bad_column_detection
  area = [535.5, 70.125, 549.3125, 532.3125]
  pdf_path = File.expand_path('data/indecago10.pdf', File.dirname(__FILE__))
  extracted = Tabula.extract_table(pdf_path,
                                   1,
                                   area,
                                   :detect_ruling_lines => false,
                                   :extraction_method => 'original')

  first_row = table_to_array(extracted).first
  assert_equal ["Comunicaciones", "104,29", "– –", "0,1", "0,6", "1,1", "0,3"], first_row
end
# Compares the table extracted from page 1 of data/french1.pdf with the rows
# of data/french1.tsv (tab-separated, each field stripped).
def test_character_merging_that_wasnt_working_previously
  expected_data_path = File.expand_path('data/french1.tsv', File.dirname(__FILE__))
  # Options are passed as keywords rather than as a positional Hash, which
  # CSV.read rejects on runtimes where its options are keyword-only.
  expected = CSV.read(expected_data_path, :col_sep => "\t")
  expected.map! { |r| r.map(&:strip) }

  top, left, bottom, right = 32.87142857142857, 41.72142857142857, 486.75, 694.0928571428572
  table = Tabula.extract_table(File.expand_path('data/french1.pdf', File.dirname(__FILE__)),
                               1,
                               [top, left, bottom, right],
                               :detect_ruling_lines => false,
                               :extraction_method => 'original')
  assert_equal expected, table_to_array(table)
end
# Extracts a table from page 1 of data/mineria.pdf with the 'spreadsheet'
# method and compares its leading rows with the expected ones.
#
# NOTE(review): this used to be `assert expected, actual`, which always
# passes (the second argument is only the failure message), and sliced three
# rows against two expected ones. It now really compares the first
# expected.size rows -- confirm the expected data against the fixture if this
# starts failing.
def test_issue78_some_ruling_lines_not_detected
  pdf_file_path = File.expand_path('data/mineria.pdf', File.dirname(__FILE__))
  area = [104.46890818740722, 13, 580.548646927163, 820.8271357581996]
  table = Tabula.extract_table(pdf_file_path,
                               1,
                               area,
                               :extraction_method => 'spreadsheet')
  expected = [
    ["1", "010000091", "086", "03/12/2012", "ACHAYAP MANTU ALDO", "ACHAYAP MANTU ALDO", "1", "OTROS", ".", ".", "AMAZONAS", "CONDORCANQ\rUI", "NIEVA", "", "", ""],
    ["2", "010000023", "022", "18/06/2012", "ACOSTA ROSALES YOSELIN BRICET", "ACOSTA ROSALES YOSELIN \rBRICET", "2", "TITULAR", "NANCY 11 ( REGISTRO \rCANCELADO )", "510001910", "AMAZONAS", "BONGARA", "FLORIDA", "18", "9,357,000", "174,000"]
  ]
  assert_equal expected, table_to_array(table)[0...expected.size]
end
# has_text? must be false for page 1 of data/gretna-owh-request.pdf and true
# for page 1 of data/brazil_crop_area.pdf.
def test_checks_for_text_on_page
  pdf_file_path = File.expand_path('data/gretna-owh-request.pdf', File.dirname(__FILE__))
  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1])
  begin
    extractor.extract.each do |pdf_page|
      refute pdf_page.has_text?
    end
  ensure
    # Close the extractor even when an assertion fails.
    extractor.close!
  end

  pdf_file_path = File.expand_path('data/brazil_crop_area.pdf', File.dirname(__FILE__))
  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1])
  begin
    extractor.extract.each do |pdf_page|
      assert pdf_page.has_text?
    end
  ensure
    extractor.close!
  end
end
# see
# https://github.com/tabulapdf/tabula-java/issues/55
# https://github.com/tabulapdf/tabula-extractor/commit/735b82450a40b3743333816b50cad470b1ca7b43#commitcomment-15087511
# this PDF is a printout of an Excel spreadsheet that has "spanning cells", that is, 10 or so data columns
# with six header cells -- some of the header cells are "merged" to span over several data columns.
# we cope with this internally by creating zero-width (or zero-height for cells that span rows) cells
# and in array/CSV output, these blank cells are inserted /after/ the text of the real cell.
#
# This method was previously also named test_spanning_cells, which silently
# replaced the earlier test of that name in this class so that only one of
# the two ever ran; it now has its own name.
def test_spanning_cells_merged_header
  pdf_file_path = File.expand_path("data/47008204D_USA.page4.pdf", File.dirname(__FILE__))
  expected_header_row = ["", "", "", "IEM Findings", "", "", "", "", "", "Remediation", "", "", "", "[Status]"]

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
  begin
    pdf_page = extractor.extract.first
    actual_header_row = table_to_array(pdf_page.spreadsheets.first).first
    assert_equal expected_header_row, actual_header_row
  ensure
    # The extractor was previously never closed.
    extractor.close!
  end
end
end
# Checks Page#is_tabular? against fixtures that are, and are not, expected to
# be detected as spreadsheets.
class TestIsTabularHeuristic < Minitest::Test
  EXPECTED_TO_BE_SPREADSHEET = ['GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf'].freeze
  #NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf']
  NOT_EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', '560015757GV_China.page1.pdf', 'bo_page24.pdf', 'campaign_donors.pdf'].freeze

  def test_heuristic_detects_spreadsheets
    EXPECTED_TO_BE_SPREADSHEET.each do |f|
      assert first_page_with_rulings(f).is_tabular?, "failed on file #{f}"
    end
  end

  def test_heuristic_detects_non_spreadsheets
    NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
      refute first_page_with_rulings(f).is_tabular?, "failed on file #{f}"
    end
  end

  private

  # Loads page 1 of data/<filename>, calls get_ruling_lines! on it, closes
  # the extractor and returns the page.
  def first_page_with_rulings(filename)
    path = File.expand_path('data/' + filename, File.dirname(__FILE__))
    extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
    begin
      page = extractor.extract.first
      page.get_ruling_lines!
      page
    ensure
      extractor.close!
    end
  end
end