-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert.py
132 lines (103 loc) · 3.63 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import csv
import glob
import gzip
import io
import json
import os
import zipfile
import config as cfg
# Path of the uncompressed output file. The base name comes from the project's
# config module; convert_zip() appends to this file and gzip_outfile() reads it.
OUTFILE = f'./output/{cfg.outfile}.lpf.jsonl'
class Feature:
    """A single gazetteer row that can be rendered as a JSON feature."""

    def __init__(self, csv):
        """Store the raw row.

        Args:
            csv: Sequence of column strings for one record. ``to_json`` reads
                indexes 0, 1, 3-8, 10-12, 14 and 15.
        """
        # NOTE: the parameter name hides the ``csv`` module inside this method
        # only; it is kept so that existing callers are unaffected.
        self.fields = csv

    def to_json(self):
        """Serialise the row to a one-line JSON string.

        The result is a ``Feature`` object with an ``@id`` built from column 0,
        a ``properties`` object, a ``names`` list of ``{'toponym': ...}``
        entries and a ``Point`` geometry.

        Names are emitted in a stable order: the title (column 1) first, then
        the comma-separated alternates from column 3 in the order they appear,
        with duplicates and empty entries removed.

        Returns:
            str: The JSON document (no trailing newline).

        Raises:
            IndexError: If the row has fewer than 16 columns.
            ValueError: If population/elevation is non-empty but not an
                integer, or a coordinate is not a valid float.
        """
        fields = self.fields

        def parse_num(text):
            # Blank (or whitespace-only) columns become JSON null.
            text = text.strip()
            return int(text) if text else None

        # dict.fromkeys() de-duplicates while keeping insertion order, so the
        # output no longer depends on per-process string hash randomisation.
        candidates = [fields[1]]
        candidates.extend(alt.strip() for alt in fields[3].split(','))
        # Drop empty strings (in case the 'alternate names' column was empty!)
        names = [name for name in dict.fromkeys(candidates) if name]

        # Columns 10-12, skipping blanks; values are kept unstripped.
        admin_codes = [code for code in fields[10:13] if code.strip()]

        feature = {
            '@id': f'http://sws.geonames.org/{fields[0]}',
            'type': 'Feature',
            'properties': {
                'title': fields[1],
                'ccodes': [fields[8]],
                'admin_codes': admin_codes,
                'feature_class': fields[6],
                'feature_code': fields[7],
                'population': parse_num(fields[14]),
                'elevation': parse_num(fields[15]),
            },
            'names': [{'toponym': name} for name in names],
            'geometry': {
                'type': 'Point',
                # Column 5 precedes column 4 (presumably longitude, latitude
                # as GeoJSON expects - confirm against the input layout).
                'coordinates': [float(fields[5]), float(fields[4])],
            },
        }

        return json.dumps(feature)
class Concordances:
    """Index of GeoNames IDs that have a 'link' or 'wkdt' alternate-name row."""

    # link concordances from the alternateNamesV2.zip file
    def __init__(self, path='./downloads/alternateNamesV2.zip'):
        """Scan the alternate-names archive and build the lookup table.

        Args:
            path: Location of the zip archive (anything ``zipfile.ZipFile``
                accepts). It must contain a member ``alternateNamesV2.txt``
                holding tab-separated rows. Defaults to the downloads folder.

        Raises:
            OSError, zipfile.BadZipFile, KeyError: If the archive or its
                member cannot be opened.
        """
        print('Building concordance table (may take a while)')
        self.concordances = {}

        with zipfile.ZipFile(path, 'r') as archive:
            with archive.open('alternateNamesV2.txt') as f:
                # Decode explicitly instead of relying on the locale default.
                text = io.TextIOWrapper(f, encoding='utf-8', newline='')
                # The dump is plain TSV without quoting. With the default
                # quote handling a value starting with '"' would swallow the
                # following columns and rows.
                reader = csv.reader(text, delimiter='\t', quoting=csv.QUOTE_NONE)
                for row in reader:
                    if len(row) < 3:
                        # Blank or truncated line - nothing to index.
                        continue
                    if row[2] in ('link', 'wkdt'):
                        gn_id = row[1]
                        # NOTE(review): this records the tag from column 2
                        # ('link'/'wkdt'), not the link target itself, which
                        # presumably sits in column 3 - confirm intent.
                        self.concordances.setdefault(gn_id, []).append(row[2])

        print(f'Collected {len(self.concordances)} concordance links')

    # Checks if there is a concordance for the given GeoNames ID
    def includes(self, gn_id):
        """Return the list recorded for ``gn_id``, or ``None`` if absent."""
        return self.concordances.get(gn_id)
def convert_zip(ccode, maybe_concordances):
    """Convert one downloaded archive and append the result to OUTFILE.

    Reads ``./downloads/<ccode>.zip``, member ``<ccode>.txt``, as
    tab-separated rows and writes one JSON document per line.

    Args:
        ccode: Base name of the archive (without ``.zip``).
        maybe_concordances: Object with an ``includes(id)`` method, or a falsy
            value. When given, only rows whose first column has a truthy
            ``includes`` result are written.

    Returns:
        int: Number of records written.
    """
    written = 0

    # json.dumps() emits ASCII by default; the encoding is pinned anyway so
    # the output does not depend on the locale.
    with open(OUTFILE, 'a', encoding='utf-8') as outfile:
        with zipfile.ZipFile(f'./downloads/{ccode}.zip', 'r') as archive:
            with archive.open(f'{ccode}.txt') as f:
                # Decode explicitly instead of relying on the locale default.
                text = io.TextIOWrapper(f, encoding='utf-8', newline='')
                # The dump is plain TSV without quoting. With the default
                # quote handling a name starting with '"' would swallow the
                # following columns and rows.
                reader = csv.reader(text, delimiter='\t', quoting=csv.QUOTE_NONE)

                for row in reader:
                    if not row:
                        # csv yields [] for blank lines; there is no ID to use.
                        continue
                    if maybe_concordances and not maybe_concordances.includes(row[0]):
                        continue

                    feature = Feature(row)
                    outfile.write(f'{feature.to_json()}\n')
                    written += 1

    return written
def gzip_outfile():
    """Write a gzip-compressed copy of OUTFILE next to it as ``OUTFILE.gz``.

    The uncompressed file is read in binary mode and left in place.
    """
    compressed_path = f'{OUTFILE}.gz'
    with open(OUTFILE, 'rb') as plain:
        with gzip.open(compressed_path, 'wb') as packed:
            # Iterating a binary file yields it line by line.
            for line in plain:
                packed.write(line)
# Delete outfile (if exists). convert_zip() appends, so a stale file from an
# earlier run must not survive. Only "does not exist" is ignored; any other
# failure would otherwise lead to silently appending to old data.
try:
    os.remove(OUTFILE)
except FileNotFoundError:
    pass

# Opening a file for append does not create missing parent directories.
os.makedirs(os.path.dirname(OUTFILE), exist_ok=True)

maybe_concordances = Concordances() if cfg.require_concordance else None

# List all Zip files in the downloads folder...
zipfiles = glob.glob('./downloads/*.zip')
# ...and reduce each path to its base name without extension. os.path is used
# because glob may return platform-specific separators.
ccodes = [os.path.splitext(os.path.basename(f))[0] for f in zipfiles]
# The alternate-names archive is not a gazetteer dump. Not very nice...
ccodes = [ccode for ccode in ccodes if ccode != 'alternateNamesV2']

# Filter according to settings (an empty setting means "all")
if cfg.countries:
    ccodes = [ccode for ccode in ccodes if ccode in cfg.countries]

ccodes.sort()

# ...convert...
ctr = 0
for ccode in ccodes:
    print(f'Converting file {ccode}.zip')
    ctr += convert_zip(ccode, maybe_concordances)

print(f'Converted {ctr} records')

# ...and gzip
print('GZipping...')
gzip_outfile()
print(f'Wrote results to {OUTFILE}.')