Skip to content

Commit

Permalink
fixed some cases for multi-line rows
Browse files Browse the repository at this point in the history
  • Loading branch information
jazzido committed Apr 3, 2013
1 parent 02f178c commit e828265
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 22 deletions.
40 changes: 28 additions & 12 deletions lib/tabula.rb
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,15 @@ def should_add_space?(other)

# Merges +other+ into this TextElement: combines the two texts exactly once,
# then delegates to the superclass implementation of merge!.
#
# other - the TextElement to absorb.
#
# Raises TypeError unless +other+ is exactly a TextElement.
def merge!(other)
  raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
  # unless self.horizontally_overlaps?(other) or self.vertically_overlaps?(other)
  #   raise ArgumentError, "won't merge TextElements that don't overlap"
  # end

  # The text is combined BEFORE calling super so that the comparison below
  # still sees this element's own +top+ (assumption: super adjusts the
  # bounds -- confirm against ZoneEntity#merge!).
  if self.horizontally_overlaps?(other) && other.top < self.top
    # +other+ overlaps horizontally and has a smaller +top+ (presumably it
    # sits above this element), so its text goes first.
    self.text = other.text + self.text
  else
    self.text << other.text
  end
  super(other)
end

def to_h
Expand All @@ -139,25 +146,30 @@ def to_h


# A detected line (table row): holds the TextElements that belong to it.
class Line < ZoneEntity
  # TODO clean this up
  attr_accessor :text_elements

  def initialize
    self.text_elements = []
  end

  # Adds the text element +t+ to this line.
  #
  # * The first element is stored and its top/left/width/height are copied
  #   onto the line.
  # * A later element that horizontally overlaps an element already stored
  #   is merged into that element (via TextElement#merge!) instead of being
  #   stored separately.
  # * Any other element is stored and merged into the line itself through the
  #   inherited merge! (presumably this grows the line's bounds -- confirm
  #   against ZoneEntity#merge!).
  def <<(t)
    if self.text_elements.empty?
      self.text_elements << t
      self.top = t.top
      self.left = t.left
      self.width = t.width
      self.height = t.height
    else
      in_same_column = self.text_elements.find { |te| te.horizontally_overlaps?(t) }
      if in_same_column
        in_same_column.merge!(t)
      else
        self.text_elements << t
        self.merge!(t)
      end
    end
  end

end

class Column < ZoneEntity
Expand Down Expand Up @@ -258,9 +270,10 @@ def get_rows


# TODO finish writing this method
# it should be analogous to get_line_boundaries
# (i.e., take into account vertical ruling lines if available)
def group_by_columns
columns = []
vr = self.options[:vertical_rulings]
tes = self.text_elements.sort_by(&:left)

# we don't have vertical rulings
Expand Down Expand Up @@ -350,14 +363,16 @@ def merge_words!
end
end

# Matches strings made up entirely of whitespace. Anchored with \A and \z so
# the WHOLE string must be whitespace; ^ and $ anchor per line and would also
# match multi-line text that merely contains one blank line.
ONLY_SPACES_RE = /\A\s+\z/
def Tabula.make_table(text_elements, options={})
extractor = TableExtractor.new(text_elements, options)

# group by lines
lines = []
line_boundaries = extractor.get_line_boundaries


# find all the text elements
# contained within each detected line (table row) boundary
line_boundaries.each { |lb|
line = Line.new

Expand All @@ -368,6 +383,8 @@ def Tabula.make_table(text_elements, options={})
text_elements -= line_members

line_members.sort_by(&:left).each { |te|
# skip text_elements that only contain spaces
next if te.text =~ ONLY_SPACES_RE
line << te
}

Expand All @@ -376,21 +393,20 @@ def Tabula.make_table(text_elements, options={})

lines.sort_by!(&:top)

columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)

# # insert empty cells if needed
lines.each_with_index { |l, line_index|
next if l.text_elements.nil?
l.text_elements.compact! # TODO WHY do I have to do this?
l.text_elements.uniq! # TODO WHY do I have to do this?

l.text_elements = l.text_elements.sort_by(&:left)
l.text_elements.sort_by!(&:left)

# l.text_elements = Tabula.merge_words(l.text_elements)

next unless l.text_elements.size < columns.size

columns.sort_by(&:left).each_with_index do |c, i|
columns.each_with_index do |c, i|
if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, ''))
end
Expand Down
Binary file added test/test_pdfs/tabla_subsidios.pdf
Binary file not shown.
29 changes: 19 additions & 10 deletions test/test_table_analyzer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def setup
end

# Test teardown hook: securely removes the temporary directory stored in
# @tmp_dir.
def teardown
  FileUtils.remove_entry_secure(@tmp_dir)
end

# HOW TO WRITE A TEST - EXAMPLE
Expand All @@ -32,6 +32,24 @@ def teardown
# assert_equal lines_to_array(table), expected # assert the equality
# end

# Runs the extractor on test_pdfs/tabla_subsidios.pdf, builds a table from a
# fixed region using the detected horizontal and vertical rulings, and checks
# the resulting rows against the expected cell texts.
# (The literal 1 passed to the helpers is presumably the page number.)
def test_tabla_subsidios
  pdf_path = File.join(@script_path, 'test_pdfs/tabla_subsidios.pdf')
  run_jruby_extractor!(pdf_path)

  detected = detect_rulings(pdf_path, 1)
  elements = Tabula::XML.get_text_elements(@tmp_dir, 1, 26.87, 200.82, 715.62, 250.32)

  options = {
    :horizontal_rulings => detected[:horizontal],
    :vertical_rulings => detected[:vertical]
  }
  table = Tabula.make_table(elements, options)

  expected = [
    ["BA 014/12", "", "BA", "DOMINGO GONZALEZ Y CIA SA", "MT", "PyME", "1.573.476,50", "1.573.476,50", "50,00%", "786.738,25"],
    ["BA 015/12", "", "BA", "LABORATORIO WEIZUR ARGENTINA SA", "MT", "PyME", "700.163,00", "700.163,00", "50,00%", "350.081,50"],
    ["BA 017/12", "NA 022/12", "BA", "RIZOBACTER ARGENTINA S.A.", "I+D", "GRANDE", "3.000.000,00", " 2.927.040,00 ", "50,00%", "969.218,54"]
  ]

  assert_equal lines_to_array(table), expected
end


def test_argentina_diputados_voting_record
run_jruby_extractor!(File.join(@script_path,'test_pdfs/argentina_diputados_voting_record.pdf'))
Expand All @@ -52,14 +70,8 @@ def test_pharma_spaceless

text_elements = Tabula::XML.get_text_elements(@tmp_dir, 1, 49.9375, 85, 537.625, 130.6875)

rulings = detect_rulings(pdf_path, 1)
# table = Tabula.make_table(text_elements,
# :horizontal_rulings => rulings[:horizontal],
# :vertical_rulings => rulings[:vertical])

table = Tabula.make_table(text_elements)


# this file does not get spaces rendered by pdfbox (so XML lacks spaces) but
# Tabula.make_table should add spaces.
expected = [["ABRAHAM RESEARCH, PLLC", "VILLAREAL, MANUEL", "CRESCENT SPRINGS, ", "KY", "$", "3,748.70"], ["ABRAHAM RESEARCH, PLLC", "TELTSER, MATTHEW", "PEMBROKE PINES, FL", "$", "6,000.00"], ["ALBEMARLE RESEARCH CONSULTANTS, INC. CAROLINA ", "RESEARCH ", "SPECIALISTS", "HEYDER, ALBRECHT M.", "ELIZABETH CITY, NC", "$", "2,565.00"], ["ALBUQUERQUE NEUROSCIENCE, INC.", "DEMPSEY, GLENN MICHAEL", "ALBUQUERQUE, NM", "$", "71,955.50"], ["ALEXIAN BROTHERS BEHAVIORIAL HEALTH HOSPITAL", "LERMAN, MARK", "CHICAGO, IL", "$", "14,079.00"], ["ALLERGIC DISEASES SC", "COHEN, STEVEN H.", "WEST ALLIS, WI", "$", "1,786.00"]]
Expand All @@ -68,7 +80,6 @@ def test_pharma_spaceless
end

def test_bo_page24
skip("FAILING - CHARACTERS APPEAR OUT OF ORDER")
run_jruby_extractor!(File.join(@script_path,'test_pdfs/bo_page24.pdf'))
# Request URL:http://localhost:9393/pdf/d1bfae1be8d6b099c1b7bb7b401ca97310e61063/data?x1=50.089285714285715&x2=809.0178571428571&y1=432.5892857142857&y2=490.2678571428571&page=1

Expand Down Expand Up @@ -108,8 +119,6 @@ def run_mupdfdraw!(file, output_dir, width=560, page=nil)

cmd += " #{page}" unless page.nil?

puts cmd

`#{cmd}`
end

Expand Down

0 comments on commit e828265

Please sign in to comment.