diff --git a/Gemfile b/Gemfile
index ac48d28..1a40be7 100644
--- a/Gemfile
+++ b/Gemfile
@@ -45,3 +45,6 @@ gem 'wombat', '~> 2.2.1'
 
 # slugs
 gem 'friendly_id', '~> 5.0.0'
+
+# fix urls while scraping
+gem 'addressable', '~> 2.3.6', require: "addressable/uri"
diff --git a/Gemfile.lock b/Gemfile.lock
index fd1ed7f..dbb339a 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -27,6 +27,7 @@ GEM
       minitest (~> 5.1)
       thread_safe (~> 0.1)
       tzinfo (~> 1.1)
+    addressable (2.3.6)
     arel (5.0.1.20140414130214)
     builder (3.2.2)
     coffee-rails (4.0.1)
@@ -149,6 +150,7 @@ PLATFORMS
   ruby
 
 DEPENDENCIES
+  addressable (~> 2.3.6)
   coffee-rails (~> 4.0.0)
   friendly_id (~> 5.0.0)
   jbuilder (~> 2.0)
diff --git a/app/models/paper.rb b/app/models/paper.rb
index e94d206..2bf8242 100644
--- a/app/models/paper.rb
+++ b/app/models/paper.rb
@@ -30,4 +30,19 @@ def normalize_friendly_id(value)
   def full_reference
     legislative_term.to_s + '/' + reference.to_s
   end
+
+
+  # helper method to fix non-standard urls in the database
+  # apply it with: Paper.find_each(&:normalize_url)
+  #
+  # Records without a url are skipped: Addressable::URI.parse(nil) returns
+  # nil, so normalizing unconditionally would raise NoMethodError and abort
+  # the whole find_each run. Raises whatever save! raises on failure.
+  def normalize_url
+    return if self.url.nil?
+
+    normalized_url = Addressable::URI.parse(self.url).normalize.to_s
+    write_attribute(:url, normalized_url)
+    save!
+  end
 end
diff --git a/app/scrapers/bayern_landtag_scraper.rb b/app/scrapers/bayern_landtag_scraper.rb
index e621882..ba075a4 100644
--- a/app/scrapers/bayern_landtag_scraper.rb
+++ b/app/scrapers/bayern_landtag_scraper.rb
@@ -44,7 +44,9 @@ def scrape
           Date.parse(text.match(/([\d\.]+)$/)[1]) unless text.nil?
         end
         url 'xpath=.//a[not(contains(@href, "LASTFOLDER"))]/@href' do |href|
-          BASE_URL + href unless href.nil?
+          unless href.nil?
+            Addressable::URI.parse(BASE_URL + href).normalize.to_s
+          end
         end
         #text 'xpath=(following-sibling::tr[2]/td[contains(@class, "pad_bot0")])[1]'
         title 'xpath=following-sibling::tr[2]/td[3]' do |text|