-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraper.rb
103 lines (81 loc) · 2.36 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true
require 'pry'
require 'scraped'
require 'scraperwiki'
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
# The members listing page: exposes the current (non-deputy) members.
class MembersPage < Scraped::HTML
  field :members do
    member_divs.map { |div| fragment div => MemberDiv }
  end

  private

  # Member <div>s under #personen, excluding any that appear after the
  # "Stellvertreter der Abgeordneten" heading (deputy members).
  def member_divs
    noko.css('div#personen .member').reject do |div|
      div.xpath('preceding::h2').map(&:text).last == 'Stellvertreter der Abgeordneten'
    end
  end
end
# A single member's <div class="member"> entry on the listing page.
class MemberDiv < Scraped::HTML
  field :id do
    # The site's internal member id, read from the data-id attribute.
    noko.at_css('@data-id').text
  end

  field :name do
    noko.css('.name a').text.tidy
  end

  field :image do
    # Portrait URL embedded in the inline style of the .pic element.
    # The dot before the extension is escaped so only genuine
    # ".jpg"/".png" suffixes match (the original unescaped "." matched
    # any character).
    noko.css('.pic @style').text[/(https?:.*?\.(jpg|png))/, 1]
  end

  field :email do
    # Strip the obfuscation codes the site splices into the address.
    EMAIL_EXTRAS.reduce(raw_email) { |email, str| email.sub(str, '') }
  end

  field :party do
    popup[1]
  end

  field :party_id do
    # NOTE(review): identical to :party — the popup exposes no separate
    # machine id, so the party name doubles as the identifier. Confirm
    # this is intentional.
    popup[1]
  end

  field :birth_date do
    # dd.mm.yyyy → yyyy-mm-dd (empty string when the popup has no date).
    popup[2].to_s.split('.').reverse.join('-')
  end

  field :region do
    # The region is the nearest preceding <h3> section heading.
    noko.xpath('preceding::h3').map(&:text).last
  end

  field :end_date do
    # "Zurückgetreten am dd.mm.yyyy" → ISO date; '' while still serving.
    # Date separators are escaped dots (the unescaped "." matched any
    # character between the digit groups).
    noko.css('.name').text[/Zurückgetreten am (\d{2}\.\d{2}\.\d{4})/, 1].to_s.split('.').reverse.join('-')
  end

  field :source do
    url
  end

  field :identifier__landtag do
    id
  end

  private

  # http://www.landtag.li/scripts/landtag-master.js?t=3 contains the
  # replacement codes. Assume for now that these are static. If they're
  # not, we'll need to fetch this and replace them dynamically.
  # Each code is listed twice, so `sub` (first occurrence only) removes
  # up to two occurrences of it from the address.
  EMAIL_EXTRAS = %w[
    fss32ixh kvx7n3i7 p6gktryw kvx7n3i7 93Fu2 fss32ixh kvx7n3i7 p6gktryw kvx7n3i7 93Fu2
  ].freeze

  # mailto: target, still containing the obfuscation codes.
  def raw_email
    noko.css('.email a/@href').text.sub('mailto:', '')
  end

  # Non-empty text lines of the hover popup.
  # NOTE(review): index meanings ([1] party, [2] birth date) are
  # inferred from the field usage above — verify against the live site.
  def popup
    noko.at_css('.pic div p').children.map(&:text).reject(&:empty?)
  end
end
# Scrape one term's member list from +url+ and persist every member,
# tagged with +termid+, via ScraperWiki (unique key: name + term).
def scrape_list(termid, url)
  response = Scraped::Request.new(url: url).response
  rows = MembersPage.new(response: response).members.map do |member|
    member.to_h.merge(term: termid)
  end
  if ENV['MORPH_DEBUG']
    # Print each record, dropping blank values, keys sorted for readability.
    rows.each { |row| puts row.reject { |_, v| v.to_s.empty? }.sort_by { |k, _| k }.to_h }
  end
  ScraperWiki.save_sqlite(%i(name term), rows)
end
# Term id → members-list URL, one entry per legislative period.
terms = [2017, 2013, 2009, 2005].map do |year|
  [year, format('https://www.landtag.li/abgeordnete/?jahr=%d', year)]
end.to_h

# Rebuild the table from scratch each run; the rescue swallows the
# error raised on the very first run, when no table exists yet.
ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil

terms.each { |termid, list_url| scrape_list(termid, list_url) }