diff --git a/Gemfile b/Gemfile index fd5ff5a..797ffbe 100644 --- a/Gemfile +++ b/Gemfile @@ -104,6 +104,9 @@ gem 'httparty', '~> 0.16.0' # matching against known names in scrapers gem 'fuzzy_match', '~> 2.1.0' +# handle long running regexps +gem 'safe_regexp', '~> 0.3.0' + # simple title and opengraph/twitter cards view helpers gem 'tophat', '~> 2.3.0' diff --git a/Gemfile.lock b/Gemfile.lock index dc7a084..360591a 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -311,6 +311,7 @@ GEM ruby-progressbar (1.10.0) ruby_dep (1.5.0) rubyzip (1.2.2) + safe_regexp (0.3.0) sassc (2.0.1) ffi (~> 1.9) rake @@ -428,6 +429,7 @@ DEPENDENCIES redis (>= 3.3.5, < 5) rubocop rubyzip (~> 1.2.2) + safe_regexp (~> 0.3.0) sassc-rails (~> 2.1.0) saxerator (~> 0.9.5) searchkick (= 3.1.1) diff --git a/app/jobs/contains_classified_information_job.rb b/app/jobs/contains_classified_information_job.rb index e0ce905..636a80b 100644 --- a/app/jobs/contains_classified_information_job.rb +++ b/app/jobs/contains_classified_information_job.rb @@ -1,5 +1,3 @@ -require 'timeout' - class ContainsClassifiedInformationJob < PaperJob queue_as :meta diff --git a/app/jobs/contains_table_job.rb b/app/jobs/contains_table_job.rb index eb717b3..8a3208a 100644 --- a/app/jobs/contains_table_job.rb +++ b/app/jobs/contains_table_job.rb @@ -1,5 +1,3 @@ -require 'timeout' - class ContainsTableJob < PaperJob queue_as :meta diff --git a/lib/classified_recognizer.rb b/lib/classified_recognizer.rb index 5a91e19..610c808 100644 --- a/lib/classified_recognizer.rb +++ b/lib/classified_recognizer.rb @@ -49,8 +49,10 @@ def match(regex, group, factor: 1) return if @skip.include?(group) m = nil - Timeout::timeout(5) do - m = text.scan(regex) + begin + m = SafeRegexp.execute(text, :scan, regex, timeout: 5) + rescue SafeRegexp::RegexpTimeout + m = nil end return if m.blank? @@ -64,7 +66,11 @@ def match(regex, group, factor: 1) def match_each(regex, group, factor: 1, &block) return if @skip.include?(group) - m = text.scan(regex) + begin + m = SafeRegexp.execute(text, :scan, regex, timeout: 5) + rescue SafeRegexp::RegexpTimeout + m = nil + end return if m.blank? m.each do |match| diff --git a/lib/table_recognizer.rb b/lib/table_recognizer.rb index a33ae83..c445931 100644 --- a/lib/table_recognizer.rb +++ b/lib/table_recognizer.rb @@ -36,34 +36,23 @@ def recognize # Hint 5: "\nTabelle 2:\n" match(/\nTabelle \d:\n/, :table_num) - begin - Timeout::timeout(5) do - # Hint 6: \d\n\d\n\d\n... - #m = text.scan(/\p{Zs}(\d[\p{Zs}\d]+\n\d[\p{Zs}\d]+)+/m) - match(/(\d[\p{Zs}\d]+\n\d[\p{Zs}\d]+)+/m, :looks_like_table_newlines, factor: 0.5) # TODO: lookahead/lookbehind? - end - rescue => e - # ignore failure - end + # Hint 6: \d\n\d\n\d\n... + #m = text.scan(/\p{Zs}(\d[\p{Zs}\d]+\n\d[\p{Zs}\d]+)+/m) + match(/(\d[\p{Zs}\d]+\n\d[\p{Zs}\d]+)+/m, :looks_like_table_newlines, factor: 0.5) # Hint 7: Anlage 3 Tabelle 1, Anlage / Tabelle 1 match(/Anlage\s+[\d\/]+\s+Tabelle\s+\d+/m, :attachment_table) - begin - Timeout::timeout(5) do - # Hint 8: "\nAAA 10,1 10,2 10,3\nBBB 20 21,1 -1.022,2" - match_each(/\n([\p{Zs}\S]+?\p{Zs}+(\-?(?>(?:\d{1,3}(?>(?:\.\d{3}))*(?>(?:,\d+)?|\d*\.?\d+))\p{Zs}*)+))\n/m, :looks_like_table_values, factor: 0.5) do |match| - match.first.strip != match.second.strip && - !match.first.strip.start_with?('vom') && - !match.first.match('\d{2}\.\d{2}\.\d{4}') && - !match.first.match('Seite\s+\d+\s+von\s+\d+') && - !match.first.match('(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\s+\d{4}\s*\z') && - !match.first.match('(?:Str\.\s\d+|[Pp]latz\s\d+)') && - !match.first.strip.match('\A(?:[0-9]|[MCDXLVI])+\.\s+[^\n]+\s\d+\s*\z') - end - end - rescue => e - # ignore failure + # Hint 8: "\nAAA 10,1 10,2 10,3\nBBB 20 21,1 -1.022,2" + match_each(/\n([\p{Zs}\S]+?\p{Zs}+(\-?(?>(?:\d{1,3}(?>(?:\.\d{3}))*(?>(?:,\d+)?|\d*\.?\d+))\p{Zs}*)+))\n/m, :looks_like_table_values, factor: 0.5) do |match| + match.first.strip != match.second.strip && + !match.first.strip.start_with?('vom') && + !match.first.match('\d{2}\.\d{2}\.\d{4}') && + !match.first.match('Seite\s+\d+\s+von\s+\d+') && + !match.first.match('(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\s+\d{4}\s*\z') && + !match.first.match('(?:Str\.?\s\d+|[Pp]latz\s\d+)') && + !match.first.match('Drucksache') && + !match.first.strip.match('\A(?:[0-9]|[MCDXLVI])+\.\s+[^\n]+\s\d+\s*\z') end { @@ -78,7 +67,12 @@ def recognize def match(regex, group, factor: 1) return if @skip.include?(group) - m = text.scan(regex) + m = nil + begin + m = SafeRegexp.execute(text, :scan, regex, timeout: 5) + rescue SafeRegexp::RegexpTimeout + m = nil + end return if m.blank? @probability += factor * m.size @@ -89,7 +83,12 @@ def match(regex, group, factor: 1) def match_each(regex, group, factor: 1, &block) return if @skip.include?(group) - m = text.scan(regex) + m = nil + begin + m = SafeRegexp.execute(text, :scan, regex, timeout: 5) + rescue SafeRegexp::RegexpTimeout + m = nil + end return if m.blank? m.each do |match|