-
Notifications
You must be signed in to change notification settings - Fork 16
/
wikidata-cities.rb
executable file
·71 lines (58 loc) · 1.76 KB
/
wikidata-cities.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/ruby
require 'rubygems'
require 'bundler/setup'
require 'open-uri'
require 'json'
require 'addressable'
require 'csv'
# When true, result rows for which no state code is available are left out of
# the CSV output (checked once per row in the result loop of this script).
SKIP_EMPTY_STATES = true
# SPARQL query that is sent to the Wikidata SPARQL endpoint.
#
# Per result row it selects:
#   ?website - value of P856 on the item ?q
#   ?name    - rdfs:label of ?q whose language tag is "de"
#   ?state   - value of P300 on ?qstate
#
# ?q is any item with a P31 statement whose value is wd:Q262166 or leads to it
# through a chain of P279 links; statements that carry a P582 or P576
# qualifier are excluded. ?qstate is any item reachable from ?q through zero
# or more P131 links that itself has P31 wd:Q1221156.
#
# NOTE(review): Q262166 is presumably "municipality of Germany", Q1221156
# "federal state of Germany" and P300 the ISO 3166-2 code - confirm on
# wikidata.org before relying on this description.
#
# Use the Wikidata Query Service (https://query.wikidata.org) to test this:
QUERY = <<-EOQ.freeze
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
SELECT DISTINCT ?website ?name ?state
WHERE
{
?q p:P31 ?statement .
?statement ps:P31/wdt:P279* wd:Q262166 .
MINUS { ?statement pq:P582|pq:P576 ?x } . # Without already gone entries (end date or dissolved)
?q wdt:P856 ?website .
?q rdfs:label ?name filter (lang(?name) = "de") .
?q wdt:P131* ?qstate .
?qstate wdt:P31 wd:Q1221156 .
?qstate wdt:P300 ?state .
}
EOQ
# Normalizes a host name for output: lowercases it and removes one leading
# "www." label.
#
# @param domain [String] host name, e.g. "WWW.Example.de"
# @return [String] the lowercased host without a leading "www."
def clean_domain(domain)
  # \A anchors at the very start of the string and the dot is escaped, so only
  # a literal "www." prefix is removed. Hosts such as "www2.example.de" or
  # "wwwx.de" are left intact instead of being cut down to ".example.de".
  domain.downcase.sub(/\Awww\./, '')
end
# Removes the country prefix from a hyphenated code by returning everything
# after the first hyphen, e.g. "DE-BY" => "BY".
#
# A string without a hyphen is returned unchanged in content; an empty string
# yields nil because there is no segment to return.
#
# @param str [String] code such as "DE-BY"
# @return [String, nil] the part after the first hyphen
def strip_iso3166_2_country(str)
  # Split at most once; the splat swallows the (optional) country segment so
  # that `subdivision` always receives the final piece.
  *_country, subdivision = str.split('-', 2)
  subdivision
end
# Build the request URL for the Wikidata SPARQL endpoint; the query text and
# the requested result format are passed as URL-encoded GET parameters.
uri = URI('https://query.wikidata.org/sparql')
uri.query = URI.encode_www_form(
  query: QUERY,
  format: 'json'
)

begin
  # Use open-uri's URI#open instead of Kernel#open: since Ruby 3.0 Kernel#open
  # no longer accepts URIs and would try to open a local file of that name.
  response = uri.open('User-Agent' => "ruby/#{RUBY_VERSION} german-gov-domains/1")
rescue OpenURI::HTTPError => e
  # Non-success HTTP status: report on stderr and stop with a failure code.
  STDERR.puts "# Got error: #{e.message}"
  exit 1
end
# Parse the JSON result of the SPARQL request and print one CSV line
# (domain, name, state) per usable result row to stdout.
data = JSON.parse(response.read)
data['results']['bindings'].each do |row|
  website = row['website']['value']
  name = row['name']['value']

  # A binding may lack ?state (or its value); treat that as an empty state.
  state = row.dig('state', 'value').to_s
  state = strip_iso3166_2_country(state) unless state.empty?
  next if state.empty? && SKIP_EMPTY_STATES

  # Extract the host of the website. A single malformed URL must not abort
  # the whole export, so such rows are reported on stderr and skipped.
  begin
    host = Addressable::URI.parse(website).normalize.host
  rescue Addressable::URI::InvalidURIError => e
    STDERR.puts "# Skipping #{name.inspect}: invalid website #{website.inspect} (#{e.message})"
    next
  end

  # URLs without a host part cannot yield a domain.
  if host.nil? || host.empty?
    STDERR.puts "# Skipping #{name.inspect}: no host in website #{website.inspect}"
    next
  end

  puts [clean_domain(host), name, state].to_csv
end