From e9f666b0fdd2502b99f9a474ed5ac19b1f464bf7 Mon Sep 17 00:00:00 2001
From: Maximilian Richt
Date: Mon, 7 Jan 2019 19:02:49 +0100
Subject: [PATCH] use quicklink for detail url, fix utf8 and name/party extraction

---
 .../baden_wuerttemberg_landtag_scraper.rb     | 54 +++++++++++++------
 test/fixtures/bw/detail_page.html             | 12 ++---
 ...baden_wuerttemberg_landtag_scraper_test.rb | 22 ++++----
 3 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/app/scrapers/baden_wuerttemberg_landtag_scraper.rb b/app/scrapers/baden_wuerttemberg_landtag_scraper.rb
index edaec87..a78ab73 100644
--- a/app/scrapers/baden_wuerttemberg_landtag_scraper.rb
+++ b/app/scrapers/baden_wuerttemberg_landtag_scraper.rb
@@ -25,19 +25,41 @@ def self.extract_reference(full_reference)
   end
 
   def self.get_detail_url(legislative_term, reference)
-    mechanize = Mechanize.new
-    hashbody = {"action" => "SearchAndDisplay","sources" => ["Star"],"report" => {"rhl" => "main","rhlmode" => "add","format" => "suchergebnis-dokumentnummer","mime" => "html","sort" => "sDNRSO sRNRDS"},"search" => {"lines" => {"l1" => "D","l2" => "#{legislative_term}/#{reference}"},"serverrecordname" => "dokument"}}
-    mp = mechanize.post(SEARCH_URL + '/parlis/browse.tt.json', hashbody.to_json, 'Content-Type' => 'application/json')
-    rep_id = JSON.parse(mp.body)["report_id"]
+    SEARCH_URL + "/parlis/browse.tt.html?type=&action=qlink&q=WP=#{legislative_term}%20AND%20DNRF=#{reference}"
+  end
+
+  def self.get_report_url(m, legislative_term, reference)
+    hashbody = {
+      "action" => "SearchAndDisplay",
+      "sources" => ["Star"],
+      "report" => {
+        "rhl" => "main",
+        "rhlmode" => "add",
+        "format" => "suchergebnis-dokumentnummer",
+        "mime" => "html",
+        "sort" => "sDNRSO sRNRDS"
+      },
+      "search" => {
+        "lines" => {
+          "l1" => "D",
+          "l2" => "#{legislative_term}/#{reference}"
+        },
+        "serverrecordname" => "dokument"
+      }
+    }
+    mp = m.post(SEARCH_URL + '/parlis/browse.tt.json', hashbody.to_json, 'Content-Type' => 'application/json')
+    data = JSON.parse(mp.body)
+    return nil if data['fetched_hits'] <= 0
+    rep_id = data["report_id"]
     SEARCH_URL + "/parlis/report.tt.html?report_id=#{rep_id}"
   end
 
   def self.get_detail_link(page)
-    page.at('.fundstellenLinks')
+    page.at_css('.fundstellenLinks')
   end
 
   def self.get_detail_originators(page)
-    page.at('.drucksache-liste-urheber')
+    page.at_css('.drucksache-liste-urheber')
   end
 
   def self.link_is_answer?(originators)
@@ -48,7 +70,7 @@ def self.extract_doctype(match_result)
     case match_result.downcase
     when 'klanfr', 'klanf', 'kleine anfrage'
       Paper::DOCTYPE_MINOR_INTERPELLATION
-    when 'granfr', 'granf', 'große anfrage', 'Große Anfrage'
+    when 'granfr', 'granf', 'große anfrage', 'Große Anfrage'
       Paper::DOCTYPE_MAJOR_INTERPELLATION
     end
   end
@@ -58,16 +80,11 @@ def self.extract_detail_title(page)
   end
 
   def self.extract_from_originators(originators_line)
-    match_result = originators_line.lstrip.match(/(Kleine Anfrage?|Große Anfrage?)\s+(.+?)\s+?([\d\.\s]+)?\s+(?:und\s+Antwort)\s+(.+)/m)
+    match_result = originators_line.lstrip.match(/(Kleine Anfrage?|Große Anfrage?)\s+(.+?)\s+?([\d\.\s]+)?\s+(?:und\s+Antwort)\s+(.+)/m)
     return nil if match_result.nil?
     doctype = extract_doctype(match_result[1])
-    names = match_result[2].gsub(/\s+(?:u.a.|u.u.)/, '').strip.tr('()', '')
-    if doctype == Paper::DOCTYPE_MINOR_INTERPELLATION
-      originators = NamePartyExtractor.new(names, NamePartyExtractor::NAME_PARTY_COMMA).extract
-    elsif doctype == Paper::DOCTYPE_MAJOR_INTERPELLATION
-      parties = names.gsub(' und', ', ').split(',').map(&:strip)
-      originators = { people: [], parties: parties }
-    end
+    names = match_result[2].gsub(/\s+(?:u.a.|u.u.)/, '').strip
+    originators = NamePartyExtractor.new(names, NamePartyExtractor::NAME_BRACKET_PARTY).extract
 
     ministries = [match_result[4].strip]
     answerers = nil
@@ -83,7 +100,6 @@ def self.extract_from_originators(originators_line)
   def self.extract_meta(page)
     originators_text = get_detail_originators(page).text
     ometa = extract_from_originators(originators_text)
-    # when multiple originators exist, remove "and others" - we extract the other names later
     link_text = get_detail_link(page).text
     full_reference = link_text.lstrip.match(/Drucksache\s+(\d+\/\d+).\s+([\d\.\s]+)/m)[1]
     return nil if full_reference.nil?
@@ -238,7 +254,11 @@ class Detail < DetailScraper
   def scrape
     m = mechanize
-    page = m.get BadenWuerttembergLandtagScraper.get_detail_url(@legislative_term, @reference)
+    report_url = BadenWuerttembergLandtagScraper.get_report_url(m, @legislative_term, @reference)
+    fail 'Report not found' if report_url.nil?
+    page = m.get report_url
+    # fix missing encoding on report pages:
+    page.encoding = 'utf-8'
     BadenWuerttembergLandtagScraper.extract_detail_paper(page)
   end
 end
diff --git a/test/fixtures/bw/detail_page.html b/test/fixtures/bw/detail_page.html
index 66c7e21..49995e5 100644
--- a/test/fixtures/bw/detail_page.html
+++ b/test/fixtures/bw/detail_page.html
@@ -11,7 +11,7 @@

 Dokument
-Drucksache 16/5196 15.11.2018
+Drucksache 16/5196 15.11.2018
@@ -26,24 +26,24 @@
 Dokument
-Vorgänge
+Vorgänge
-Kleine Anfrage Klaus Hoher (FDP/DVP) 15.11.2018 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz
+Kleine Anfrage Klaus Hoher (FDP/DVP) 15.11.2018 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz
-Drucksache 16/5196
+Drucksache 16/5196
diff --git a/test/scraper/baden_wuerttemberg_landtag_scraper_test.rb b/test/scraper/baden_wuerttemberg_landtag_scraper_test.rb
index 4ef7acc..22e977e 100644
--- a/test/scraper/baden_wuerttemberg_landtag_scraper_test.rb
+++ b/test/scraper/baden_wuerttemberg_landtag_scraper_test.rb
@@ -52,8 +52,8 @@ def setup
     legislative_term = '16'
     reference = '5196'
     actual = @scraper.get_detail_url(legislative_term, reference)
-    expected = 'https://parlis.landtag-bw.de/parlis/report.tt.html?report_id=MjAxOTAxMDMtMTkwNDU4LTc5NTktTEJXOnN1Y2hlcmdlYm5pcy1kb2t1bWVudG51bW1lcjpodG1sOjo6MTpzRE5SU08gc1JOUkRT'
-    assert_equal(expected[0,70]+expected[93..-1], actual[0,70]+actual[93..-1])
+    expected = 'https://parlis.landtag-bw.de/parlis/browse.tt.html?type=&action=qlink&q=WP=16%20AND%20DNRF=5196'
+    assert_equal(expected, actual)
   end
 
   test 'get detail link from detail page' do
@@ -75,26 +75,26 @@ def setup
       doctype: Paper::DOCTYPE_MINOR_INTERPELLATION,
       published_at: Date.parse('2018-11-15'),
       originators: { people: ['Klaus Hoher'], parties: ['FDP/DVP'] },
-      answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] }
+      answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] }
     }
     assert_equal(expected, actual)
   end
 
   test 'extract meta information from long detail link' do
-    text = 'Kleine Anfrage Helmut Walter Rüeck (CDU), Nikolaos Sakellariou (SPD), Dr. Friedrich Bullinger (FDP/DVP) 24.07.2014 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz'
+    text = 'Kleine Anfrage Helmut Walter Rüeck (CDU), Nikolaos Sakellariou (SPD), Dr. Friedrich Bullinger (FDP/DVP) 24.07.2014 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz'
     actual = @scraper.extract_from_originators(text)
     expected = {
       doctype: Paper::DOCTYPE_MINOR_INTERPELLATION,
       published_at: Date.parse('2014-07-24'),
-      originators: { people: ['Helmut Walter Rüeck', 'Nikolaos Sakellariou', 'Dr. Friedrich Bullinger'], parties: ['CDU', 'SPD', 'FDP/DVP'] },
-      answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] }
+      originators: { people: ['Helmut Walter Rüeck', 'Nikolaos Sakellariou', 'Dr. Friedrich Bullinger'], parties: ['CDU', 'SPD', 'FDP/DVP'] },
+      answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] }
     }
     assert_equal(expected, actual)
   end
 
   test 'extract meta information from long detail link with newline' do
     skip "No such case known yet in new Detail Pages"
     text = "Kleine Anfrage Helmut Walter Räeck (CDU), Nikolaos Sakellariou (SPD), Dr.
 Friedrich Bullinger (FDP/DVP) 24.07.2014 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz "
     actual = @scraper.extract_from_originators(text)
     expected = {
       full_reference: '15/5544',
@@ -178,7 +178,7 @@
   test 'extract meta information from detail with multiple originator parties' do
     skip "Fix Grosse Anfragen later"
-    text = 'Große Anfrage Fraktion der CDU, Fraktion der SPD, Fraktion der FDP/DVP, Fraktion GRÃœNE 13.02.2013 und Antwort Landesregierung '
+    text = 'Große Anfrage Fraktion der CDU, Fraktion der SPD, Fraktion der FDP/DVP, Fraktion GRÜNE 13.02.2013 und Antwort Landesregierung '
     actual = @scraper.extract_from_originators(text)
     expected = {
       full_reference: '15/3038',
@@ -221,7 +221,7 @@
   test 'extract title from Detail' do
     actual = @scraper.extract_detail_title(@detail_page)
-    expected = 'Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg'
+    expected = 'Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg'
     assert_equal(expected, actual)
   end
 
@@ -232,12 +232,12 @@
       legislative_term: '16',
       reference: '5196',
       doctype: Paper::DOCTYPE_MINOR_INTERPELLATION,
-      title: 'Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg',
+      title: 'Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg',
       url: 'https://www.landtag-bw.de/files/live/sites/LTBW/files/dokumente/WP16/Drucksachen/5000/16%5F5196%5FD.pdf',
       published_at: Date.parse('2018-11-15'),
       is_answer: true,
       originators: { people: ['Klaus Hoher'], parties: ['FDP/DVP'] },
-      answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] },
+      answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] },
     }
     assert(expected <= actual)
   end
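
For reference, a minimal standalone sketch (not part of the patch) of the lookup flow the patch introduces: get_detail_url now only builds a stable quicklink URL from term and reference, while the JSON report search moves into get_report_url, which returns nil when the search has no hits; the Detail scraper then fetches the report page and forces UTF-8 before parsing. The host in SEARCH_URL and the request parameters are taken from the diff and the updated test; the top-level method and variable names below (detail_url, report_url, agent) are illustrative only, not the project's API.

require 'json'
require 'mechanize'

SEARCH_URL = 'https://parlis.landtag-bw.de'.freeze

# Quicklink-style detail URL, built purely from legislative term and reference.
def detail_url(legislative_term, reference)
  SEARCH_URL + "/parlis/browse.tt.html?type=&action=qlink&q=WP=#{legislative_term}%20AND%20DNRF=#{reference}"
end

# JSON search against browse.tt.json; returns the report page URL,
# or nil when the search produced no hits ('fetched_hits' <= 0).
def report_url(agent, legislative_term, reference)
  body = {
    'action' => 'SearchAndDisplay',
    'sources' => ['Star'],
    'report' => {
      'rhl' => 'main', 'rhlmode' => 'add',
      'format' => 'suchergebnis-dokumentnummer',
      'mime' => 'html', 'sort' => 'sDNRSO sRNRDS'
    },
    'search' => {
      'lines' => { 'l1' => 'D', 'l2' => "#{legislative_term}/#{reference}" },
      'serverrecordname' => 'dokument'
    }
  }
  response = agent.post(SEARCH_URL + '/parlis/browse.tt.json', body.to_json,
                        'Content-Type' => 'application/json')
  data = JSON.parse(response.body)
  return nil if data['fetched_hits'] <= 0
  SEARCH_URL + "/parlis/report.tt.html?report_id=#{data['report_id']}"
end

agent = Mechanize.new
puts detail_url(16, 5196) # stable link, suitable for storing or presenting

url = report_url(agent, 16, 5196)
if url.nil?
  warn 'Report not found'
else
  page = agent.get(url)
  # the report pages ship without charset information, so force UTF-8 before reading text
  page.encoding = 'utf-8'
  puts page.at('.fundstellenLinks')&.text
end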