Skip to content

Commit

Permalink
drop timeout, use safe_regexp for long running regex used by classified/table recognizer
Browse files Browse the repository at this point in the history
  • Loading branch information
robbi5 committed Jul 29, 2019
1 parent e1d8118 commit cbc00d1
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 33 deletions.
3 changes: 3 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ gem 'httparty', '~> 0.16.0'
# matching against known names in scrapers
gem 'fuzzy_match', '~> 2.1.0'

# handle long running regexps
gem 'safe_regexp', '~> 0.3.0'

# simple title and opengraph/twitter cards view helpers
gem 'tophat', '~> 2.3.0'

Expand Down
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ GEM
ruby-progressbar (1.10.0)
ruby_dep (1.5.0)
rubyzip (1.2.2)
safe_regexp (0.3.0)
sassc (2.0.1)
ffi (~> 1.9)
rake
Expand Down Expand Up @@ -428,6 +429,7 @@ DEPENDENCIES
redis (>= 3.3.5, < 5)
rubocop
rubyzip (~> 1.2.2)
safe_regexp (~> 0.3.0)
sassc-rails (~> 2.1.0)
saxerator (~> 0.9.5)
searchkick (= 3.1.1)
Expand Down
2 changes: 0 additions & 2 deletions app/jobs/contains_classified_information_job.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
require 'timeout'

class ContainsClassifiedInformationJob < PaperJob
queue_as :meta

Expand Down
2 changes: 0 additions & 2 deletions app/jobs/contains_table_job.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
require 'timeout'

class ContainsTableJob < PaperJob
queue_as :meta

Expand Down
12 changes: 9 additions & 3 deletions lib/classified_recognizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@ def match(regex, group, factor: 1)
return if @skip.include?(group)

m = nil
Timeout::timeout(5) do
m = text.scan(regex)
begin
m = SafeRegexp.execute(text, :scan, regex, timeout: 5)
rescue SafeRegexp::RegexpTimeout
m = nil
end
return if m.blank?

Expand All @@ -64,7 +66,11 @@ def match(regex, group, factor: 1)
def match_each(regex, group, factor: 1, &block)
return if @skip.include?(group)

m = text.scan(regex)
begin
m = SafeRegexp.execute(text, :scan, regex, timeout: 5)
rescue SafeRegexp::RegexpTimeout
m = nil
end
return if m.blank?

m.each do |match|
Expand Down
51 changes: 25 additions & 26 deletions lib/table_recognizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,34 +36,23 @@ def recognize
# Hint 5: "\nTabelle 2:\n"
match(/\nTabelle \d:\n/, :table_num)

begin
Timeout::timeout(5) do
# Hint 6: \d\n\d\n\d\n...
#m = text.scan(/\p{Zs}(\d[\p{Zs}\d]+\n\d[\p{Zs}\d]+)+/m)
match(/(\d[\p{Zs}\d]+\n\d[\p{Zs}\d]+)+/m, :looks_like_table_newlines, factor: 0.5) # TODO: lookahead/lookbehind?
end
rescue => e
# ignore failure
end
# Hint 6: \d\n\d\n\d\n...
#m = text.scan(/\p{Zs}(\d[\p{Zs}\d]+\n\d[\p{Zs}\d]+)+/m)
match(/(\d[\p{Zs}\d]+\n\d[\p{Zs}\d]+)+/m, :looks_like_table_newlines, factor: 0.5)

# Hint 7: Anlage 3 Tabelle 1, Anlage / Tabelle 1
match(/Anlage\s+[\d\/]+\s+Tabelle\s+\d+/m, :attachment_table)

begin
Timeout::timeout(5) do
# Hint 8: "\nAAA 10,1 10,2 10,3\nBBB 20 21,1 -1.022,2"
match_each(/\n([\p{Zs}\S]+?\p{Zs}+(\-?(?>(?:\d{1,3}(?>(?:\.\d{3}))*(?>(?:,\d+)?|\d*\.?\d+))\p{Zs}*)+))\n/m, :looks_like_table_values, factor: 0.5) do |match|
match.first.strip != match.second.strip &&
!match.first.strip.start_with?('vom') &&
!match.first.match('\d{2}\.\d{2}\.\d{4}') &&
!match.first.match('Seite\s+\d+\s+von\s+\d+') &&
!match.first.match('(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\s+\d{4}\s*\z') &&
!match.first.match('(?:Str\.\s\d+|[Pp]latz\s\d+)') &&
!match.first.strip.match('\A(?:[0-9]|[MCDXLVI])+\.\s+[^\n]+\s\d+\s*\z')
end
end
rescue => e
# ignore failure
# Hint 8: "\nAAA 10,1 10,2 10,3\nBBB 20 21,1 -1.022,2"
match_each(/\n([\p{Zs}\S]+?\p{Zs}+(\-?(?>(?:\d{1,3}(?>(?:\.\d{3}))*(?>(?:,\d+)?|\d*\.?\d+))\p{Zs}*)+))\n/m, :looks_like_table_values, factor: 0.5) do |match|
match.first.strip != match.second.strip &&
!match.first.strip.start_with?('vom') &&
!match.first.match('\d{2}\.\d{2}\.\d{4}') &&
!match.first.match('Seite\s+\d+\s+von\s+\d+') &&
!match.first.match('(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\s+\d{4}\s*\z') &&
!match.first.match('(?:Str\.?\s\d+|[Pp]latz\s\d+)') &&
!match.first.match('Drucksache') &&
!match.first.strip.match('\A(?:[0-9]|[MCDXLVI])+\.\s+[^\n]+\s\d+\s*\z')
end

{
Expand All @@ -78,7 +67,12 @@ def recognize
def match(regex, group, factor: 1)
return if @skip.include?(group)

m = text.scan(regex)
m = nil
begin
m = SafeRegexp.execute(text, :scan, regex, timeout: 5)
rescue SafeRegexp::RegexpTimeout
m = nil
end
return if m.blank?

@probability += factor * m.size
Expand All @@ -89,7 +83,12 @@ def match(regex, group, factor: 1)
def match_each(regex, group, factor: 1, &block)
return if @skip.include?(group)

m = text.scan(regex)
m = nil
begin
m = SafeRegexp.execute(text, :scan, regex, timeout: 5)
rescue SafeRegexp::RegexpTimeout
m = nil
end
return if m.blank?

m.each do |match|
Expand Down

0 comments on commit cbc00d1

Please sign in to comment.