Skip to content

Commit

Permalink
use quicklink for detail url, fix utf8 and name/party extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
robbi5 committed Jan 27, 2019
1 parent e65a85b commit e9f666b
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 34 deletions.
54 changes: 37 additions & 17 deletions app/scrapers/baden_wuerttemberg_landtag_scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,41 @@ def self.extract_reference(full_reference)
end

# Builds the PARLIS "quicklink" URL for a single printed paper.
#
# The URL is assembled purely from the arguments; no HTTP request is made.
#
# @param legislative_term [#to_s] value interpolated into the +WP=+ query field
# @param reference [#to_s] value interpolated into the +DNRF=+ query field
# @return [String] quicklink URL below SEARCH_URL
def self.get_detail_url(legislative_term, reference)
  SEARCH_URL + "/parlis/browse.tt.html?type=&action=qlink&q=WP=#{legislative_term}%20AND%20DNRF=#{reference}"
end

# Runs a document-number search against the PARLIS JSON endpoint and returns
# the URL of the HTML report generated for that search.
#
# @param m [#post] HTTP agent; must respond to +post(url, body, headers)+ and
#   return an object exposing the response text via +body+
# @param legislative_term [#to_s] first half of the "term/reference" search line
# @param reference [#to_s] second half of the "term/reference" search line
# @return [String, nil] report URL, or nil when the response reports no hits
#   or carries no report id
def self.get_report_url(m, legislative_term, reference)
  hashbody = {
    "action" => "SearchAndDisplay",
    "sources" => ["Star"],
    "report" => {
      "rhl" => "main",
      "rhlmode" => "add",
      "format" => "suchergebnis-dokumentnummer",
      "mime" => "html",
      "sort" => "sDNRSO sRNRDS"
    },
    "search" => {
      "lines" => {
        "l1" => "D",
        "l2" => "#{legislative_term}/#{reference}"
      },
      "serverrecordname" => "dokument"
    }
  }
  mp = m.post(SEARCH_URL + '/parlis/browse.tt.json', hashbody.to_json, 'Content-Type' => 'application/json')
  data = JSON.parse(mp.body)
  # A missing or null hit count is treated as "no hits" (nil.to_i == 0)
  # rather than raising NoMethodError on the comparison.
  return nil if data['fetched_hits'].to_i <= 0

  rep_id = data["report_id"]
  # Without a report id the URL would be unusable; signal "not found" instead.
  return nil if rep_id.nil?

  SEARCH_URL + "/parlis/report.tt.html?report_id=#{rep_id}"
end

# Looks up the document link element on a detail page.
#
# @param page [#at_css] parsed page; only +at_css+ is called on it
# @return [Object, nil] whatever +page.at_css+ yields for the
#   '.fundstellenLinks' selector
def self.get_detail_link(page)
  page.at_css('.fundstellenLinks')
end

# Looks up the originators element on a detail page.
#
# @param page [#at_css] parsed page; only +at_css+ is called on it
# @return [Object, nil] whatever +page.at_css+ yields for the
#   '.drucksache-liste-urheber' selector
def self.get_detail_originators(page)
  page.at_css('.drucksache-liste-urheber')
end

def self.link_is_answer?(originators)
Expand All @@ -48,7 +70,7 @@ def self.extract_doctype(match_result)
case match_result.downcase
when 'klanfr', 'klanf', 'kleine anfrage'
Paper::DOCTYPE_MINOR_INTERPELLATION
when 'granfr', 'granf', 'große anfrage', 'Große Anfrage'
when 'granfr', 'granf', 'große anfrage', 'Große Anfrage'
Paper::DOCTYPE_MAJOR_INTERPELLATION
end
end
Expand All @@ -58,16 +80,11 @@ def self.extract_detail_title(page)
end

def self.extract_from_originators(originators_line)
match_result = originators_line.lstrip.match(/(Kleine Anfrage?|Große Anfrage?)\s+(.+?)\s+?([\d\.\s]+)?\s+(?:und\s+Antwort)\s+(.+)/m)
match_result = originators_line.lstrip.match(/(Kleine Anfrage?|Große Anfrage?)\s+(.+?)\s+?([\d\.\s]+)?\s+(?:und\s+Antwort)\s+(.+)/m)
return nil if match_result.nil?
doctype = extract_doctype(match_result[1])
names = match_result[2].gsub(/\s+(?:u.a.|u.u.)/, '').strip.tr('()', '')
if doctype == Paper::DOCTYPE_MINOR_INTERPELLATION
originators = NamePartyExtractor.new(names, NamePartyExtractor::NAME_PARTY_COMMA).extract
elsif doctype == Paper::DOCTYPE_MAJOR_INTERPELLATION
parties = names.gsub(' und', ', ').split(',').map(&:strip)
originators = { people: [], parties: parties }
end
names = match_result[2].gsub(/\s+(?:u.a.|u.u.)/, '').strip
originators = NamePartyExtractor.new(names, NamePartyExtractor::NAME_BRACKET_PARTY).extract
ministries = [match_result[4].strip]

answerers = nil
Expand All @@ -83,7 +100,6 @@ def self.extract_from_originators(originators_line)
def self.extract_meta(page)
originators_text = get_detail_originators(page).text
ometa = extract_from_originators(originators_text)
# when multiple originators exist, remove "and others" - we extract the other names later
link_text = get_detail_link(page).text
full_reference = link_text.lstrip.match(/Drucksache\s+(\d+\/\d+).\s+([\d\.\s]+)/m)[1]
return nil if full_reference.nil?
Expand Down Expand Up @@ -238,7 +254,11 @@ class Detail < DetailScraper

# Fetches the report page for this paper and extracts the paper from it.
#
# The report URL is resolved first, so nothing is downloaded when the search
# finds no report.
#
# @return [Object] result of BadenWuerttembergLandtagScraper.extract_detail_paper
# @raise [RuntimeError] 'Report not found' when no report URL could be resolved
def scrape
  m = mechanize
  report_url = BadenWuerttembergLandtagScraper.get_report_url(m, @legislative_term, @reference)
  raise 'Report not found' if report_url.nil?

  page = m.get report_url
  # fix missing encoding on report pages:
  page.encoding = 'utf-8'
  BadenWuerttembergLandtagScraper.extract_detail_paper(page)
end
end
Expand Down
12 changes: 6 additions & 6 deletions test/fixtures/bw/detail_page.html
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ <h2> Dokument</h2>

<span class="dokumentLink">

<a class="fundstellenLinks" href="https://www.landtag-bw.de/files/live/sites/LTBW/files/dokumente/WP16/Drucksachen/5000/16%5F5196%5FD.pdf" target="_blank" title="In einem neuen Fenster öffnen">Drucksache 16/5196 15.11.2018</a>
<a class="fundstellenLinks" href="https://www.landtag-bw.de/files/live/sites/LTBW/files/dokumente/WP16/Drucksachen/5000/16%5F5196%5FD.pdf" target="_blank" title="In einem neuen Fenster öffnen">Drucksache 16/5196 15.11.2018</a>

</span>
<span class="pdf-size">
Expand All @@ -26,24 +26,24 @@ <h2> Dokument</h2>
<!-- oWBD02 = recn, oWBD03 = = dart; /parlis/dataEntry/erfassungDokument.thtm?de-recn= ?copy=true -->


<h2>Vorgänge</h2>
<h2>Vorgänge</h2>


<div class="efxRecordRepeater well" data-efx-rec="5c2e4e5ff45bc10e2e7347a1">
<button name="5c2e4e5ff45bc10e2e7347a1" class="partitalPrinter btn fundstelle-right" style="display:none; height: 30px; width:35px;"> <span class="glyphicon glyphicon-print" aria-hidden="true" style="font-size:12px; margin-left: -1px;"> </span> </button>
<button class="btn pull-right refresh" style ="display:none; margin-top: 20px; height: 30px; width:35px;" data-toggle="tooltip" data-original-title="Der Datensatz wurde möglicherweise aktualisiert. Datensatz in neuem Tab anzeigen." onclick="Javascript:window.open('/parlisa/browse.tt.html?type=&action=qlink&q=BAFO=BASIS AND VID=V-125000','_blank');"> <span class="glyphicon glyphicon-refresh" aria-hidden="true" style="font-size: 12px; margin-left: -1px;"> </span> </button>
<button class="btn pull-right refresh" style ="display:none; margin-top: 20px; height: 30px; width:35px;" data-toggle="tooltip" data-original-title="Der Datensatz wurde möglicherweise aktualisiert. Datensatz in neuem Tab anzeigen." onclick="Javascript:window.open('/parlisa/browse.tt.html?type=&action=qlink&q=BAFO=BASIS AND VID=V-125000','_blank');"> <span class="glyphicon glyphicon-refresh" aria-hidden="true" style="font-size: 12px; margin-left: -1px;"> </span> </button>


<div class="drucksache-liste-betreff"> <a data-toggle="tab" href="#detailTab-5c2e4e5ff45bc10e2e7347a1" id="5c2e4e5ff45bc10e2e7347a1" class="efxZoomTabVorgang"> Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg </a> </div>
<div class="drucksache-liste-betreff"> <a data-toggle="tab" href="#detailTab-5c2e4e5ff45bc10e2e7347a1" id="5c2e4e5ff45bc10e2e7347a1" class="efxZoomTabVorgang"> Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg </a> </div>
<div id="short-5c2e4e5ff45bc10e2e7347a1">
<div class="drucksache-liste-urheber"> <span class="typ">Kleine Anfrage Klaus Hoher (FDP/DVP) 15.11.2018 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz </span> <span class="urheber"></span> <span class="datum"></span> </div>
<div class="drucksache-liste-urheber"> <span class="typ">Kleine Anfrage Klaus Hoher (FDP/DVP) 15.11.2018 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz </span> <span class="urheber"></span> <span class="datum"></span> </div>




<span class="dokumentLink">

<a class="fundstellenLinks" href="https://www.landtag-bw.de/files/live/sites/LTBW/files/dokumente/WP16/Drucksachen/5000/16%5F5196%5FD.pdf" target="_blank" title="In einem neuen Fenster öffnen">Drucksache 16/5196</a>
<a class="fundstellenLinks" href="https://www.landtag-bw.de/files/live/sites/LTBW/files/dokumente/WP16/Drucksachen/5000/16%5F5196%5FD.pdf" target="_blank" title="In einem neuen Fenster öffnen">Drucksache 16/5196</a>

</span>
<span class="pdf-size">
Expand Down
22 changes: 11 additions & 11 deletions test/scraper/baden_wuerttemberg_landtag_scraper_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def setup
legislative_term = '16'
reference = '5196'
actual = @scraper.get_detail_url(legislative_term, reference)
expected = 'https://parlis.landtag-bw.de/parlis/report.tt.html?report_id=MjAxOTAxMDMtMTkwNDU4LTc5NTktTEJXOnN1Y2hlcmdlYm5pcy1kb2t1bWVudG51bW1lcjpodG1sOjo6MTpzRE5SU08gc1JOUkRT'
assert_equal(expected[0,70]+expected[93..-1], actual[0,70]+actual[93..-1])
expected = 'https://parlis.landtag-bw.de/parlis/browse.tt.html?type=&action=qlink&q=WP=16%20AND%20DNRF=5196'
assert_equal(expected, actual)
end

test 'get detail link from detail page' do
Expand All @@ -75,26 +75,26 @@ def setup
doctype: Paper::DOCTYPE_MINOR_INTERPELLATION,
published_at: Date.parse('2018-11-15'),
originators: { people: ['Klaus Hoher'], parties: ['FDP/DVP'] },
answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] }
answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] }
}
assert_equal(expected, actual)
end

test 'extract meta information from long detail link' do
text = 'Kleine Anfrage Helmut Walter Rüeck (CDU), Nikolaos Sakellariou (SPD), Dr. Friedrich Bullinger (FDP/DVP) 24.07.2014 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz'
text = 'Kleine Anfrage Helmut Walter Rüeck (CDU), Nikolaos Sakellariou (SPD), Dr. Friedrich Bullinger (FDP/DVP) 24.07.2014 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz'
actual = @scraper.extract_from_originators(text)
expected = {
doctype: Paper::DOCTYPE_MINOR_INTERPELLATION,
published_at: Date.parse('2014-07-24'),
originators: { people: ['Helmut Walter Rüeck', 'Nikolaos Sakellariou', 'Dr. Friedrich Bullinger'], parties: ['CDU', 'SPD', 'FDP/DVP'] },
answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] }
originators: { people: ['Helmut Walter Rüeck', 'Nikolaos Sakellariou', 'Dr. Friedrich Bullinger'], parties: ['CDU', 'SPD', 'FDP/DVP'] },
answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] }
}
assert_equal(expected, actual)
end

test 'extract meta information from long detail link with newline' do
skip "No such case known yet in new Detail Pages"
text = "Kleine Anfrage Helmut Walter Rüeck (CDU), Nikolaos Sakellariou (SPD), Dr. Friedrich Bullinger (FDP/DVP) 24.07.2014 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz "
text = "Kleine Anfrage Helmut Walter Rüeck (CDU), Nikolaos Sakellariou (SPD), Dr. Friedrich Bullinger (FDP/DVP) 24.07.2014 und Antwort Ministerium für Ländlichen Raum und Verbraucherschutz "
actual = @scraper.extract_from_originators(text)
expected = {
full_reference: '15/5544',
Expand Down Expand Up @@ -178,7 +178,7 @@ def setup

test 'extract meta information from detail with multiple originator parties' do
skip "Fix Grosse Anfragen later"
text = 'Große Anfrage Fraktion der CDU, Fraktion der SPD, Fraktion der FDP/DVP, Fraktion GRÜNE 13.02.2013 und Antwort Landesregierung '
text = 'Große Anfrage Fraktion der CDU, Fraktion der SPD, Fraktion der FDP/DVP, Fraktion GRÜNE 13.02.2013 und Antwort Landesregierung '
actual = @scraper.extract_from_originators(text)
expected = {
full_reference: '15/3038',
Expand Down Expand Up @@ -221,7 +221,7 @@ def setup

test 'extract title from Detail' do
actual = @scraper.extract_detail_title(@detail_page)
expected = 'Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg'
expected = 'Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg'
assert_equal(expected, actual)
end

Expand All @@ -232,12 +232,12 @@ def setup
legislative_term: '16',
reference: '5196',
doctype: Paper::DOCTYPE_MINOR_INTERPELLATION,
title: 'Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg',
title: 'Kennzeichnung von Streuobst und Streuobstprodukten aus Baden-Württemberg',
url: 'https://www.landtag-bw.de/files/live/sites/LTBW/files/dokumente/WP16/Drucksachen/5000/16%5F5196%5FD.pdf',
published_at: Date.parse('2018-11-15'),
is_answer: true,
originators: { people: ['Klaus Hoher'], parties: ['FDP/DVP'] },
answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] },
answerers: { ministries: ['Ministerium für Ländlichen Raum und Verbraucherschutz'] },
}
assert(expected <= actual)
end
Expand Down

0 comments on commit e9f666b

Please sign in to comment.