fixed some cases for multi-line rows

ridjohansen · Apr 3, 2013 · e828265 · e828265
1 parent 02f178c
commit e828265
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 22 deletions.
diff --git a/lib/tabula.rb b/lib/tabula.rb
@@ -124,8 +124,15 @@ def should_add_space?(other)
 
     def merge!(other)
       raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
+      # unless self.horizontally_overlaps?(other) or self.vertically_overlaps?(other)
+      #   raise ArgumentError, "won't merge TextElements that don't overlap"
+      # end
+      if self.horizontally_overlaps?(other) and other.top < self.top
+        self.text = other.text + self.text
+      else
+        self.text << other.text
+      end
       super(other)
-      self.text << other.text
     end
 
     def to_h
@@ -139,25 +146,30 @@ def to_h
 
 
   class Line < ZoneEntity
-    # TODO clean this up
-    attr_accessor :text_elements
+     attr_accessor :text_elements
 
     def initialize
       self.text_elements = []
     end
 
     def <<(t)
-      self.text_elements << t
-      if self.text_elements.size == 1
+      if self.text_elements.size == 0
+        self.text_elements << t
         self.top = t.top
         self.left = t.left
         self.width = t.width
         self.height = t.height
       else
-        self.merge!(t)
+        if in_same_column = self.text_elements.find { |te| te.horizontally_overlaps?(t) }
+          in_same_column.merge!(t)
+        else
+          self.text_elements << t
+          self.merge!(t)
+        end
       end
     end
 
+
   end
 
   class Column < ZoneEntity
@@ -258,9 +270,10 @@ def get_rows
 
 
     # TODO finish writing this method
+    # it should work analogously to get_line_boundaries
+    # (i.e., take vertical ruling lines into account when available)
     def group_by_columns
       columns = []
-      vr = self.options[:vertical_rulings]
       tes = self.text_elements.sort_by(&:left)
 
       # we don't have vertical rulings
@@ -350,14 +363,16 @@ def merge_words!
     end
   end
 
+  ONLY_SPACES_RE = Regexp.new('^\s+$')
   def Tabula.make_table(text_elements, options={})
     extractor = TableExtractor.new(text_elements, options)
 
     # group by lines
     lines = []
     line_boundaries = extractor.get_line_boundaries
 
-
+    # find all the text elements
+    # contained within each detected line (table row) boundary
     line_boundaries.each { |lb|
       line = Line.new
 
@@ -368,6 +383,8 @@ def Tabula.make_table(text_elements, options={})
       text_elements -= line_members
 
       line_members.sort_by(&:left).each { |te|
+        # skip text_elements that only contain spaces
+        next if te.text =~ ONLY_SPACES_RE
         line << te
       }
 
@@ -376,21 +393,20 @@ def Tabula.make_table(text_elements, options={})
 
     lines.sort_by!(&:top)
 
-    columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
+    columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
 
     # insert empty cells if needed
     lines.each_with_index { |l, line_index|
       next if l.text_elements.nil?
       l.text_elements.compact! # TODO WHY do I have to do this?
       l.text_elements.uniq!  # TODO WHY do I have to do this?
-
-      l.text_elements = l.text_elements.sort_by(&:left)
+      l.text_elements.sort_by!(&:left)
 
       # l.text_elements = Tabula.merge_words(l.text_elements)
 
       next unless l.text_elements.size < columns.size
 
-      columns.sort_by(&:left).each_with_index do |c, i|
+      columns.each_with_index do |c, i|
         if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
           l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, ''))
         end

diff --git a/test/test_pdfs/tabla_subsidios.pdf b/test/test_pdfs/tabla_subsidios.pdf
diff --git a/test/test_table_analyzer.rb b/test/test_table_analyzer.rb
@@ -16,7 +16,7 @@ def setup
   end
 
   def teardown
-#    FileUtils.remove_entry_secure @tmp_dir
+    FileUtils.remove_entry_secure @tmp_dir
   end
 
   # HOW TO WRITE A TEST - EXAMPLE
@@ -32,6 +32,24 @@ def teardown
   #    assert_equal lines_to_array(table), expected # assert the equality
   #  end
 
+  def test_tabla_subsidios
+    pdf_path = File.join(@script_path,
+                         'test_pdfs/tabla_subsidios.pdf')
+    run_jruby_extractor!(pdf_path)
+
+    rulings = detect_rulings(pdf_path, 1)
+
+    text_elements = Tabula::XML.get_text_elements(@tmp_dir, 1, 26.87, 200.82, 715.62, 250.32)
+    table = Tabula.make_table(text_elements,
+                              :horizontal_rulings => rulings[:horizontal],
+                              :vertical_rulings => rulings[:vertical])
+
+    expected = [["BA 014/12", "", "BA", "DOMINGO GONZALEZ Y CIA SA", "MT", "PyME", "1.573.476,50", "1.573.476,50", "50,00%", "786.738,25"], ["BA 015/12", "", "BA", "LABORATORIO WEIZUR ARGENTINA SA", "MT", "PyME", "700.163,00", "700.163,00", "50,00%", "350.081,50"], ["BA 017/12", "NA 022/12", "BA", "RIZOBACTER ARGENTINA S.A.", "I+D", "GRANDE", "3.000.000,00", "             2.927.040,00 ", "50,00%", "969.218,54"]]
+
+    assert_equal lines_to_array(table), expected
+
+  end
+
 
   def test_argentina_diputados_voting_record
     run_jruby_extractor!(File.join(@script_path,'test_pdfs/argentina_diputados_voting_record.pdf'))
@@ -52,14 +70,8 @@ def test_pharma_spaceless
 
     text_elements = Tabula::XML.get_text_elements(@tmp_dir, 1, 49.9375, 85, 537.625, 130.6875)
 
-    rulings = detect_rulings(pdf_path, 1)
-    # table = Tabula.make_table(text_elements,
-    #                           :horizontal_rulings => rulings[:horizontal],
-    #                           :vertical_rulings => rulings[:vertical])
-
     table = Tabula.make_table(text_elements)
 
-
     # pdfbox does not render spaces for this file (so the XML lacks them), but
     # Tabula.make_table should add them.
     expected = [["ABRAHAM RESEARCH, PLLC", "VILLAREAL, MANUEL", "CRESCENT SPRINGS, ", "KY", "$", "3,748.70"], ["ABRAHAM RESEARCH, PLLC", "TELTSER, MATTHEW", "PEMBROKE PINES, FL", "$", "6,000.00"], ["ALBEMARLE RESEARCH CONSULTANTS, INC. CAROLINA ", "RESEARCH ", "SPECIALISTS", "HEYDER, ALBRECHT M.", "ELIZABETH CITY, NC", "$", "2,565.00"], ["ALBUQUERQUE NEUROSCIENCE, INC.", "DEMPSEY, GLENN MICHAEL", "ALBUQUERQUE, NM", "$", "71,955.50"], ["ALEXIAN BROTHERS BEHAVIORIAL HEALTH HOSPITAL", "LERMAN, MARK", "CHICAGO, IL", "$", "14,079.00"], ["ALLERGIC DISEASES SC", "COHEN, STEVEN H.", "WEST ALLIS, WI", "$", "1,786.00"]]
@@ -68,7 +80,6 @@ def test_pharma_spaceless
   end
 
   def test_bo_page24
-    skip("FAILING - CHARACTERS APPEAR OUT OF ORDER")
     run_jruby_extractor!(File.join(@script_path,'test_pdfs/bo_page24.pdf'))
     # Request URL:http://localhost:9393/pdf/d1bfae1be8d6b099c1b7bb7b401ca97310e61063/data?x1=50.089285714285715&x2=809.0178571428571&y1=432.5892857142857&y2=490.2678571428571&page=1
 
@@ -108,8 +119,6 @@ def run_mupdfdraw!(file, output_dir, width=560, page=nil)
 
     cmd += " #{page}" unless page.nil?
 
-    puts cmd
-
     `#{cmd}`
   end