From 83281f114291ff198920e12a05fd2b6c320be50b Mon Sep 17 00:00:00 2001 From: William Schueller Date: Mon, 20 Nov 2023 22:37:15 +0100 Subject: [PATCH] generic getters + geonames fillers --- gis_fillers/fillers/metafiller.py | 18 ++ gis_fillers/fillers/zones/__init__.py | 1 + gis_fillers/fillers/zones/geonames.py | 58 +++++ gis_fillers/getters/__init__.py | 2 +- gis_fillers/getters/generic_getters.py | 318 +++++++++++++++++++++++++ gis_fillers/getters/zone_getters.py | 88 +++---- gis_fillers/initscript.sql | 16 ++ requirements.txt | 6 +- tests/testmodule/test_db.py | 95 +++++--- 9 files changed, 522 insertions(+), 80 deletions(-) create mode 100644 gis_fillers/fillers/metafiller.py create mode 100644 gis_fillers/fillers/zones/geonames.py diff --git a/gis_fillers/fillers/metafiller.py b/gis_fillers/fillers/metafiller.py new file mode 100644 index 0000000..9a26f24 --- /dev/null +++ b/gis_fillers/fillers/metafiller.py @@ -0,0 +1,18 @@ +from .fillers import Fillers +from .zones import zaehlsprengel, countries, geonames + + +class MetaFiller(Filler): + def after_insert(self): + self.db.add_filler( + zaehlsprengel.ZaehlsprengelFiller(data_folder=self.data_folder) + ) + self.db.add_filler( + zaehlsprengel.SimplifiedZSFiller(data_folder=self.data_folder) + ) + self.db.add_filler( + zaehlsprengel.PopulationZSFiller(data_folder=self.data_folder) + ) + self.db.add_filler(zaehlsprengel.PLZFiller(data_folder=self.data_folder)) + self.db.add_filler(geonames.GeonamesFiller(data_folder=self.data_folder)) + self.db.add_filler(countries.CountriesFiller(data_folder=self.data_folder)) diff --git a/gis_fillers/fillers/zones/__init__.py b/gis_fillers/fillers/zones/__init__.py index dbd9888..d4167b0 100644 --- a/gis_fillers/fillers/zones/__init__.py +++ b/gis_fillers/fillers/zones/__init__.py @@ -2,3 +2,4 @@ from . import countries from . import hexagons from . import generic +from . import geonames diff --git a/gis_fillers/fillers/zones/geonames.py b/gis_fillers/fillers/zones/geonames.py new file mode 100644 index 0000000..a190bf5 --- /dev/null +++ b/gis_fillers/fillers/zones/geonames.py @@ -0,0 +1,58 @@ +from psycopg2 import extras +import zipfile +import os +from .. import fillers + + +class GeonamesFiller(fillers.Filler): + def __init__( + self, + force=False, + url_geonames="https://download.geonames.org/export/zip/allCountries.zip", + zipname="geonames_allCountries.zip", + **kwargs + ): + self.force = force + self.url_geonames = url_geonames + self.zipname = zipname + fillers.Filler.__init__(self, **kwargs) + + def prepare(self): + fillers.Filler.prepare(self) + if not self.force and self.check_done(): + self.done = True + elif not os.path.exists(os.path.join(self.data_folder, self.zipname)): + self.download(url=self.url_geonames, destination=self.zipname) + + def gen_extract(self): + with zipfile.ZipFile(os.path.join(self.data_folder, self.zipname), "r") as zf: + with zf.open("allCountries.txt", "r") as f: + for line in f: + elt = line.decode("utf8").split("\t") + yield dict( + country_code=elt[0], + zip_code=elt[1], + geo_lat=elt[-3], + geo_long=elt[-2], + ) + + def check_done(self): + self.db.cursor.execute("SELECT 1 FROM geonames_zipcodes LIMIT 1;") + return self.db.cursor.fetchone() == (1,) + + def apply(self): + extras.execute_batch( + self.db.cursor, + """ + INSERT INTO geonames_zipcodes(country_code,zip_code,geom) + VALUES( + %(country_code)s, + %(zip_code)s, + ST_SetSRID(ST_MakePoint(%(geo_long)s::double precision,%(geo_lat)s::double precision),4326) + ) + ON CONFLICT DO NOTHING; + """, + self.gen_extract(), + page_size=10**4, + ) + self.db.connection.commit() diff --git a/gis_fillers/getters/__init__.py b/gis_fillers/getters/__init__.py index d48089b..2f99745 100644 --- a/gis_fillers/getters/__init__.py +++ b/gis_fillers/getters/__init__.py @@ -1 +1 @@ -from .generic_getters import Getter +from .generic_getters import Getter, GISGetter diff --git a/gis_fillers/getters/generic_getters.py b/gis_fillers/getters/generic_getters.py index 4b5808b..78dde0c 100644 --- a/gis_fillers/getters/generic_getters.py +++ b/gis_fillers/getters/generic_getters.py @@ -1 +1,319 @@ from db_fillers import Getter +import string +import random +from psycopg2 import extras +import shapely +import geopandas as gpd +from geopy.geocoders import Nominatim +from shapely.geometry import Point +from shapely.affinity import translate + + +class GISGetter(Getter): + columns = ("geometry",) + + def get(self, db, **kwargs): + db.cursor.execute(self.query(), self.query_attributes()) + query_result = list(db.cursor.fetchall()) + gdf = gpd.GeoDataFrame( + self.parse_results(query_result=query_result), + crs="epsg:4326", + columns=self.columns, + ) + return gdf + + +class LocationPointsGetter(GISGetter): + """ + Mother class for specific location type queries + """ + + columns = ("location", "geometry", "lat", "long") + + def __init__(self, location_list, add_noise=False, noise_size=0.01, **kwargs): + self.location_list = location_list + self.add_noise = add_noise + self.noise_size = noise_size + Getter.__init__(self, **kwargs) + self.rnd_str = "".join( + random.choice(string.ascii_letters + string.digits) for _ in range(10) + ) + + def get(self, **kwargs): + gdf = GISGetter.get(self, **kwargs) + if self.add_noise: + tmp = [] + for index, poi in gdf.iterrows(): + new_point = translate( + gdf.loc[index, "geometry"], + xoff=random.random() * self.noise_size, + yoff=random.random() * self.noise_size, + ) + tmp.append( + { + "geometry": new_point, + } + ) + new_gdf = gpd.GeoDataFrame(tmp, columns=["geometry"]) + gdf["geometry"] = new_gdf["geometry"] + # gdf["lat"] = new_gdf["lat"] + # gdf["long"] = new_gdf["long"] + gdf["lat"] = gdf["geometry"].y + gdf["long"] = gdf["geometry"].x + + return gdf + + def prepare(self): + Getter.prepare(self) + self.db.cursor.execute( + f""" + CREATE TEMP TABLE IF NOT EXISTS temp_locations_{self.rnd_str}( + id BIGSERIAL PRIMARY KEY, + location text NOT NULL + ); + """ + ) + extras.execute_batch( + self.db.cursor, + f""" + INSERT INTO temp_locations_{self.rnd_str}( + location + ) + VALUES(%(loc)s) + """, + (dict(loc=loc) for loc in self.location_list), + ) + + def query(self): + raise NotImplementedError + + def query_attributes(self): + return dict() + + def parse_results(self, query_result): + """ + returns list of elements to be used for pandas or geopandas + """ + raise NotImplementedError + + def cleanup(self): + self.db.cursor.execute( + f""" + DROP TABLE IF EXISTS temp_locations_{self.rnd_str}; + """ + ) + + +class AreaPointsGetter(LocationPointsGetter): + """ + Returns coordinates of points from a list of areas by code and zone level + """ + + def __init__(self, zone_level="bezirk", location_ref_type="code", **kwargs): + self.zone_level = zone_level + if location_ref_type not in ( + "code", + "id", + "name", + ): + raise ValueError(f"Unrecognized ref_type for location:{location_ref_type}") + self.location_ref_type = location_ref_type + LocationPointsGetter.__init__(self, **kwargs) + + def query_attributes(self): + return dict(zone_level=self.zone_level, gis_type="zaehlsprengel") + + def query(self): + return f""" + WITH main_query AS(SELECT + tl.location, + CASE ST_NumGeometries(gd.geom) + WHEN 0 THEN NULL + WHEN 1 THEN ST_AsText(ST_GeometryN(ST_GeneratePoints(gd.geom,1),1)) + ELSE ST_AsText(ST_GeometryN(ST_GeneratePoints(gd.geom,100),(random()*100)::int+1)) + END AS geom + FROM temp_locations_{self.rnd_str} tl + LEFT OUTER JOIN zone_levels zl + ON zl.name=%(zone_level)s + LEFT OUTER JOIN zones z + ON zl.id=z.level + AND COALESCE(z.{self.location_ref_type}::text,z.id::text)=tl.location + LEFT OUTER JOIN gis_types gt + ON gt.name=%(gis_type)s + LEFT OUTER JOIN gis_data gd + ON gd.gis_type=gt.id + AND gd.zone_id=z.id + AND gd.zone_level=zl.id) + SELECT location,ST_Y(geom) AS geo_lat,ST_X(geom) AS geo_long,geom FROM main_query; + """ + + def parse_results(self, query_result): + return [ + { + "location": loc, + "lat": geo_lat, + "long": geo_long, + "geometry": shapely.wkt.loads(geo), + } + for ( + loc, + geo_lat, + geo_long, + geo, + ) in query_result + ] + + +class AddressPointsGetter(LocationPointsGetter): + """ + Returns coordinates of points from a list of addresses + Using a nominatim instance + """ + + columns = ("location", "lat", "long", "geometry") + + def __init__( + self, + nominatim_host="localhost", + nominatim_port=8080, + nominatim_user_agent="gis_fillers", + **kwargs, + ): + self.nominatim_host = nominatim_host + self.nominatim_port = nominatim_port + self.nominatim_user_agent = nominatim_user_agent + AreaPointsGetter.__init__(self, **kwargs) + self.set_geolocator() + + def set_geolocator(self): + if self.nominatim_host is None: + self.geolocator = Nominatim(user_agent=self.nominatim_user_agent) + else: + self.geolocator = Nominatim( + user_agent=self.nominatim_user_agent, + domain=f"{self.nominatim_host}:{self.nominatim_port}", + scheme="http", + ) + + def query(self): + return f""" + SELECT tl.location, + ST_Y(ca.geom) AS geo_lat, + ST_X(ca.geom) AS geo_long, + ST_AsText(ca.geom) AS geometry + FROM temp_locations_{self.rnd_str} tl + LEFT OUTER JOIN cached_addresses ca + ON ca.address=tl.location + ; + """ + + def parse_results(self, query_result): + ans = [ + { + "location": loc, + "lat": geo_lat, + "long": geo_long, + "geometry": shapely.wkt.loads(geo), + } + for ( + loc, + geo_lat, + geo_long, + geo, + ) in query_result + ] + to_resolve = [a for a in ans if a["geometry"] is None] + if len(to_resolve): + self.logger.info( + f"Resolving {len(to_resolve)} addresses out of {len(ans)}." + ) + temp_resolved = dict() + for a in to_resolve: + loc = a["location"] + if loc in temp_resolved.keys(): + geoloc = temp_resolved[loc] + else: + geoloc = self.geolocator.geocode(loc) + temp_resolved[loc] = geoloc + a["geometry"] = shapely.wkt.loads( + f"POINT({geoloc.longitude} {geoloc.latitude})" + ) + a["lat"], a["long"] = geoloc.latitude, geoloc.longitude + self.fill_cached_address(address=loc, geo_lat=a["lat"], geo_long=a["long"]) + return ans + + def fill_cached_address(self, address, geo_lat, geo_long): + self.db.cursor.execute( + """ + INSERT INTO cached_addresses(address,geom) + SELECT %(address)s,ST_SetSRID(ST_MakePoint(%(geo_long)s::double precision,%(geo_lat)s::double precision),4326) + On CONFLICT DO NOTHING; + """, + dict(address=address, geo_lat=geo_lat, geo_long=geo_long), + ) + self.db.connection.commit() + + +class ZipPointsGetter(LocationPointsGetter): + """ + Returns coordinates of points using zip codes and data from geonames + """ + + columns = ("location", "country_code", "lat", "long", "geometry") + + def prepare(self): + Getter.prepare(self) + self.rnd_str = "".join( + random.choice(string.ascii_letters + string.digits) for _ in range(10) + ) + self.db.cursor.execute( + f""" + CREATE TEMP TABLE IF NOT EXISTS temp_locations_{self.rnd_str}( + id BIGSERIAL PRIMARY KEY, + location text, + country_code text + ); + """ + ) + extras.execute_batch( + self.db.cursor, + f""" + INSERT INTO temp_locations_{self.rnd_str}( + location,country_code + ) + VALUES(%(loc)s,%(country)s) + """, + (dict(loc=loc, country=country) for country, loc in self.location_list), + ) + + def query(self): + return f""" + SELECT tl.location, + tl.country_code, + ST_Y(gz.geom) AS geo_lat, + ST_X(gz.geom) AS geo_long, + ST_AsText(gz.geom) AS geometry + FROM temp_locations_{self.rnd_str} tl + LEFT OUTER JOIN geonames_zipcodes gz + ON gz.country_code=tl.country_code + AND gz.zip_code=tl.location + ; + """ + + def parse_results(self, query_result): + return [ + { + "location": loc, + "country_code": c_code, + "lat": geo_lat, + "long": geo_long, + "geometry": shapely.wkt.loads(geo), + } + for ( + loc, + c_code, + geo_lat, + geo_long, + geo, + ) in query_result + ] diff --git a/gis_fillers/getters/zone_getters.py b/gis_fillers/getters/zone_getters.py index 6d89244..1f34657 100644 --- a/gis_fillers/getters/zone_getters.py +++ b/gis_fillers/getters/zone_getters.py @@ -1,4 +1,4 @@ -from . import Getter +from . import Getter, GISGetter import numpy as np import psycopg2 @@ -9,7 +9,7 @@ import pandas as pd -class PopulationGetter(Getter): +class PopulationGetter(GISGetter): """ Returns a geopandas dataframe with population per defined area level """ @@ -23,7 +23,7 @@ def __init__( simplified=True, **kwargs ): - Getter.__init__(self, **kwargs) + GISGetter.__init__(self, **kwargs) self.zone_level = zone_level self.zone_attribute = zone_attribute if simplified: @@ -33,42 +33,42 @@ def __init__( def query(self): return """ - SELECT q1.id,q1.level,q2.population,q1.name,q1.geometry, q1.area FROM - (SELECT z.id,z.level,z.name, ST_AsText(gd.geom) AS geometry, ST_Area(gd.geom,false)/10^6 AS area - FROM zones z - INNER JOIN zone_levels zl - ON zl.name=%(zone_level)s AND zl.id=z."level" - INNER JOIN gis_data gd - ON gd.zone_id =z.id AND gd.zone_level =z."level" - INNER JOIN gis_types gt - ON gd.gis_type =gt.id AND gt."name" =%(target_gt)s) AS q1 - INNER JOIN - (SELECT z.id,z.level,z.name,SUM(za.int_value::double precision*(COALESCE(zp.share,1.)::double precision)) AS population - FROM zones z - INNER JOIN zone_levels zl - ON zl.name=%(zone_level)s AND zl.id=z."level" - INNER JOIN zone_parents zp - ON zp.parent=z.id AND zp.parent_level=z.level - INNER JOIN zone_levels zl2 - ON zl2.id = zp.child_level AND zl2.name='zaehlsprengel' - INNER JOIN zone_attributes za - ON za.zone=zp.child AND za.zone_level=zp.child_level - INNER JOIN zone_attribute_types zat - ON zat.id=za.attribute AND zat.name='zs_population' - GROUP BY z.id,z.level,z.name - UNION - SELECT z.id,z.level,z.name,SUM(za.int_value::real) AS population - FROM zones z - INNER JOIN zone_levels zl - ON zl.name=%(zone_level)s AND zl.id=z."level" AND 'zaehlsprengel'=%(zone_level)s - INNER JOIN zone_attributes za - ON za.zone=z.id AND za.zone_level=zl.id - INNER JOIN zone_attribute_types zat - ON zat.id=za.attribute AND zat.name='zs_population' - GROUP BY z.id,z.level,z.name - ) AS q2 - ON q1.id=q2.id AND q1.level=q2.level - ;""" + SELECT q1.id,q1.level,q2.population,q1.name,q1.geometry, q1.area FROM + (SELECT z.id,z.level,z.name, ST_AsText(gd.geom) AS geometry, ST_Area(gd.geom,false)/10^6 AS area + FROM zones z + INNER JOIN zone_levels zl + ON zl.name=%(zone_level)s AND zl.id=z."level" + INNER JOIN gis_data gd + ON gd.zone_id =z.id AND gd.zone_level =z."level" + INNER JOIN gis_types gt + ON gd.gis_type =gt.id AND gt."name" =%(target_gt)s) AS q1 + INNER JOIN + (SELECT z.id,z.level,z.name,SUM(za.int_value::double precision*(COALESCE(zp.share,1.)::double precision)) AS population + FROM zones z + INNER JOIN zone_levels zl + ON zl.name=%(zone_level)s AND zl.id=z."level" + INNER JOIN zone_parents zp + ON zp.parent=z.id AND zp.parent_level=z.level + INNER JOIN zone_levels zl2 + ON zl2.id = zp.child_level AND zl2.name='zaehlsprengel' + INNER JOIN zone_attributes za + ON za.zone=zp.child AND za.zone_level=zp.child_level + INNER JOIN zone_attribute_types zat + ON zat.id=za.attribute AND zat.name='zs_population' + GROUP BY z.id,z.level,z.name + UNION + SELECT z.id,z.level,z.name,SUM(za.int_value::real) AS population + FROM zones z + INNER JOIN zone_levels zl + ON zl.name=%(zone_level)s AND zl.id=z."level" AND 'zaehlsprengel'=%(zone_level)s + INNER JOIN zone_attributes za + ON za.zone=z.id AND za.zone_level=zl.id + INNER JOIN zone_attribute_types zat + ON zat.id=za.attribute AND zat.name='zs_population' + GROUP BY z.id,z.level,z.name + ) AS q2 + ON q1.id=q2.id AND q1.level=q2.level + ;""" def query_attributes(self): return { @@ -89,16 +89,6 @@ def parse_results(self, query_result): for (zid, zlvl, pop, bez, geo, area) in query_result ] - def get(self, db, **kwargs): - db.cursor.execute(self.query(), self.query_attributes()) - query_result = list(db.cursor.fetchall()) - gdf = gpd.GeoDataFrame( - self.parse_results(query_result=query_result), - crs="epsg:4326", - columns=self.columns, - ) - return gdf - class PopulationDensityGetter(PopulationGetter): """ diff --git a/gis_fillers/initscript.sql b/gis_fillers/initscript.sql index ff789f1..677ff0a 100644 --- a/gis_fillers/initscript.sql +++ b/gis_fillers/initscript.sql @@ -118,6 +118,22 @@ CREATE INDEX IF NOT EXISTS zone_name_idx ON zones(name,level,id); --CREATE INDEX IF NOT EXISTS zs_attr_completenodate_idx2 ON zone_attributes(zone_level,zone,attribute,scenario,int_value); CREATE INDEX IF NOT EXISTS zs_attr_completenodate_idx2 ON zone_attributes(zone_level,zone,attribute,int_value); +CREATE TABLE IF NOT EXISTS cached_addresses( +address TEXT PRIMARY KEY, +geom GEOMETRY(POINT,4326) +); + +CREATE INDEX IF NOT EXISTS ca_gist_idx + ON cached_addresses + USING GIST (geom); + + +CREATE TABLE IF NOT EXISTS geonames_zipcodes( +country_code TEXT NOT NULL, +zip_code TEXT NOT NULL, +geom GEOMETRY(POINT,4326), +PRIMARY KEY(country_code,zip_code) +); CREATE TABLE IF NOT EXISTS _exec_info( id BIGSERIAL PRIMARY KEY, diff --git a/requirements.txt b/requirements.txt index fd17a97..c74b6bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,7 @@ matplotlib # camelot-py[cv] odfpy db_fillers -h3 -topojson \ No newline at end of file +h3<4.0 +topojson +geopy +contextily \ No newline at end of file diff --git a/tests/testmodule/test_db.py b/tests/testmodule/test_db.py index 9e68743..d536fd1 100644 --- a/tests/testmodule/test_db.py +++ b/tests/testmodule/test_db.py @@ -5,7 +5,7 @@ import gis_fillers as gf from gis_fillers import Database from gis_fillers.fillers import zones -from gis_fillers.getters import zone_getters +from gis_fillers.getters import zone_getters, generic_getters conninfo = { "host": "localhost", @@ -27,10 +27,10 @@ def test_init(): db.init_db() -def test_clean(): - db = Database(**conninfo) - db.clean_db() - db.init_db() +# def test_clean(): +# db = Database(**conninfo) +# db.clean_db() +# db.init_db() @pytest.fixture @@ -62,8 +62,8 @@ def test_simplified_zs(maindb): # def test_simplified_zs_mapshaper(maindb): -# maindb.add_filler(zones.zaehlsprengel.SimplifiedZSFiller(simplify_engine='mapshaper',geojson_gis_info_name="mapshaper_{YEAR}0101.geojson",)) -# maindb.fill_db() +# maindb.add_filler(zones.zaehlsprengel.SimplifiedZSFiller(simplify_engine='mapshaper',geojson_gis_info_name="mapshaper_{YEAR}0101.geojson",)) +# maindb.fill_db() def test_plz(maindb): @@ -90,13 +90,13 @@ def country(request): return request.param -def test_hexagons(maindb, res, country): - maindb.add_filler( - zones.hexagons.HexagonsFiller( - res=res, target_zone=country, target_zone_level="country" - ) - ) - maindb.fill_db() +# def test_hexagons(maindb, res, country): +# maindb.add_filler( +# zones.hexagons.HexagonsFiller( +# res=res, target_zone=country, target_zone_level="country" +# ) +# ) +# maindb.fill_db() bezirk_list = [918, 922] @@ -118,19 +118,58 @@ def res_bezirk(request): return request.param -def test_hexagons_bezirk(maindb, res_bezirk, bezirk): - maindb.add_filler( - zones.hexagons.HexagonsFiller( - res=res_bezirk, target_zone=bezirk, target_zone_level="bezirk" - ) - ) - maindb.fill_db() +# def test_hexagons_bezirk(maindb, res_bezirk, bezirk): +# maindb.add_filler( +# zones.hexagons.HexagonsFiller( +# res=res_bezirk, target_zone=bezirk, target_zone_level="bezirk" +# ) +# ) +# maindb.fill_db() + + +getters_list = [ + (zone_getters.PopulationGetter, dict(zone_level="bezirk", simplified=False)), + (zone_getters.PopulationDensityGetter, dict(zone_level="bezirk", simplified=False)), + ( + generic_getters.AreaPointsGetter, + dict(zone_level="bezirk", location_list=["101", "918", "902"] * 10), + ), + ( + generic_getters.AreaPointsGetter, + dict(zone_level="country", location_list=["AT", "FR", "TR"] * 10), + ), + ( + generic_getters.AreaPointsGetter, + dict( + zone_level="country", + location_list=["AT", "FR", "TR"] * 10, + noise_size=0.2, + add_noise=True, + ), + ), + ( + generic_getters.AreaPointsGetter, + dict( + zone_level="gemeinde", + location_ref_type="name", + location_list=["Neunkirchen", "Mistelbach", "Graz"] * 10, + ), + ), + ( + generic_getters.AddressPointsGetter, + dict( + location_list=["josefstadter str. 39", "wien", "ortaköy, istanbul"] * 10, + nominatim_host=None, + nominatim_user_agent="gis_fillers_test", + ), + ), +] + + +@pytest.fixture(params=getters_list) +def getter(request): + return request.param -def test_getters(maindb): - zone_getters.PopulationGetter( - db=maindb, zone_level="bezirk", simplified=False - ).get_result() - zone_getters.PopulationDensityGetter( - db=maindb, zone_level="bezirk", simplified=False - ).get_result() +def test_getters(maindb, getter): + getter[0](db=maindb, **getter[1]).get_result()