From d3c0b2f27cc26ebaa9057aa7ee65726e7f212a0f Mon Sep 17 00:00:00 2001 From: Simon Roberts Date: Fri, 22 Sep 2023 14:44:39 +1000 Subject: [PATCH 01/11] Implement support for Sqlite DB. Remove unused query method. --- code/db.py | 140 ++++++++++++++++++++++++++++------------------------- 1 file changed, 75 insertions(+), 65 deletions(-) diff --git a/code/db.py b/code/db.py index e46d610633..89808fb970 100644 --- a/code/db.py +++ b/code/db.py @@ -1,36 +1,26 @@ -import itertools import logging +import sqlite3 from argparse import ArgumentParser, Namespace -import data import psycopg2 from psycopg2.extras import NamedTupleCursor +import data +from abc import ABC, abstractmethod -class AddressDB: - """Connect to the GNAF Postgres database and query for addresses. See https://github.com/minus34/gnaf-loader""" - def __init__(self, database: str, host: str, port: str, user: str, password: str, create_index: bool = True): - """Connect to the database""" - conn = psycopg2.connect( - database=database, host=host, port=port, user=user, password=password, cursor_factory=NamedTupleCursor - ) +class DbDriver(ABC): + """Abstract class for DB connections.""" + @abstractmethod + def execute(self, query, vars=None): + """Return a list of Namespace objects for the provided query.""" + pass - self.cur = conn.cursor() +class AddressDB: + """Connect to our cut-down version of the GNAF Postgres database and query for addresses.""" - # detect the schema used by the DB - self.cur.execute("SELECT schema_name FROM information_schema.schemata where schema_name like 'gnaf_%'") - db_schema = self.cur.fetchone().schema_name - self.cur.execute(f"SET search_path TO {db_schema}") - conn.commit() - - # optionally create a DB index - if create_index: - logging.info("Creating DB index...") - self.cur.execute( - "CREATE INDEX IF NOT EXISTS address_name_state ON address_principals (locality_name, state)" - ) - conn.commit() + def __init__(self, db: DbDriver): + self.db = db def get_addresses(self, 
target_suburb: str, target_state: str) -> data.AddressList: """Return a list of Address for the provided suburb+state from the database.""" @@ -40,8 +30,6 @@ def get_addresses(self, target_suburb: str, target_state: str) -> data.AddressLi WHERE locality_name = %s AND state = %s LIMIT 100000""" - self.cur.execute(query, (target_suburb, target_state)) - return [ data.Address( name=f"{row.address} {target_suburb} {row.postcode}", @@ -49,35 +37,9 @@ def get_addresses(self, target_suburb: str, target_state: str) -> data.AddressLi longitude=float(row.longitude), latitude=float(row.latitude), ) - for row in self.cur.fetchall() + for row in self.db.execute(query, (target_suburb, target_state)) ] - def get_list_vs_total(self, suburbs_states: dict) -> dict: - """Calculate which fraction of the entire dataset is represented by the given list of state+suburb.""" - self.cur.execute("SELECT state, COUNT(*) FROM address_principals GROUP BY state") - states = {row.state: {"total": row.count} for row in self.cur.fetchall()} - - query_parts = ["(state = %s AND locality_name IN %s)\n"] * len(suburbs_states) - values = [[state, tuple(suburbs)] for state, suburbs in suburbs_states.items()] - all_values = tuple(itertools.chain.from_iterable(values)) - - query = f""" - SELECT state, COUNT(*) - FROM address_principals - WHERE\n{" OR ".join(query_parts)} - GROUP BY state - """ - self.cur.execute(query, all_values) # takes ~2 minutes - for row in self.cur.fetchall(): - states[row.state]["completed"] = row.count - - # add a totals row - total_completed = sum(sp.get("completed", 0) for sp in states.values()) - total = sum(sp.get("total", 0) for sp in states.values()) - states["total"] = {"completed": total_completed, "total": total} - - return states - def get_counts_by_suburb(self) -> dict[str, dict[str, int]]: """return a tally of addresses by state and suburb""" query = """ @@ -86,10 +48,9 @@ def get_counts_by_suburb(self) -> dict[str, dict[str, int]]: GROUP BY locality_name, state ORDER 
BY state, locality_name """ - self.cur.execute(query) results = {} - for record in self.cur.fetchall(): + for record in self.db.execute(query): if record.state not in results: results[record.state] = {} results[record.state][record.locality_name] = record.count @@ -108,10 +69,9 @@ def get_extents_by_suburb(self) -> dict: GROUP BY locality_name, state ORDER BY state, locality_name """ - self.cur.execute(query) results = {} - for record in self.cur.fetchall(): + for record in self.db.execute(query): if record.state not in results: results[record.state] = {} results[record.state][record.locality_name] = ( @@ -131,7 +91,7 @@ def add_db_arguments(parser: ArgumentParser): help="The password for the database user", default="password", ) - parser.add_argument("-H", "--dbhost", help="The hostname for the database", default="localhost") + parser.add_argument("-H", "--dbhost", help="The hostname for the database (or file-path for Sqlite)", default="localhost") parser.add_argument("-P", "--dbport", help="The port number for the database", default="5433") parser.add_argument( "-i", @@ -141,13 +101,63 @@ def add_db_arguments(parser: ArgumentParser): ) +class PostgresDb(DbDriver): + """Class that implements Postgresql DB connection.""" + def __init__(self, database: str, host: str, port: str, user: str, password: str, create_index: bool = True): + """Connect to the database""" + conn = psycopg2.connect( + database=database, host=host, port=port, user=user, password=password, cursor_factory=NamedTupleCursor + ) + + self.cur = conn.cursor() + + # detect the schema used by the DB + self.cur.execute("SELECT schema_name FROM information_schema.schemata where schema_name like 'gnaf_%'") + db_schema = self.cur.fetchone().schema_name + self.cur.execute(f"SET search_path TO {db_schema}") + conn.commit() + + # optionally create a DB index + if create_index: + logging.info("Creating DB index...") + self.cur.execute( + "CREATE INDEX IF NOT EXISTS address_name_state ON address_principals 
(locality_name, state)" + ) + conn.commit() + + def execute(self, query, vars=None): + """Return a list of Namespace objects for the provided query.""" + self.cur.execute(query, vars) + return self.cur.fetchall() + + +class SqliteDb(DbDriver): + """Class that implements Sqlite DB connection (to a file). Pass the filename as the dbhost.""" + def __init__(self, database_file: str): + """Connect to the database""" + conn = sqlite3.connect(database_file) + conn.row_factory = sqlite3.Row + self.cur = conn.cursor() + + def execute(self, query, vars=None): + """Return a list of Namespace objects for the provided query.""" + query = query.replace("%s", "?") + self.cur.execute(query, vars) + # sqlite doesn't support NamedTupleCursor, so we need to manually add the column names + return [Namespace(**dict(zip(x.keys(), x))) for x in self.cur.fetchall()] + + def connect_to_db(args: Namespace) -> AddressDB: """return a DB connection based on the provided args""" - return AddressDB( - "postgres", - args.dbhost, - args.dbport, - args.dbuser, - args.dbpassword, - args.create_index, - ) + if args.dbhost.endswith('.db'): + db = SqliteDb(args.dbhost) + else: + db = PostgresDb( + "postgres", + args.dbhost, + args.dbport, + args.dbuser, + args.dbpassword, + args.create_index, + ) + return AddressDB(db) From b9d09b68b662cef3ccff6b0efd6a5f81a39c3c05 Mon Sep 17 00:00:00 2001 From: Simon Roberts Date: Mon, 25 Sep 2023 09:28:31 +1000 Subject: [PATCH 02/11] Document how to create SQLite DB --- extra/db/README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/extra/db/README.md b/extra/db/README.md index 37a5f99d47..bf3ab7b044 100644 --- a/extra/db/README.md +++ b/extra/db/README.md @@ -24,6 +24,32 @@ REPOSITORY TAG IMAGE ID CREATED SIZE mydb latest 84af660a3493 39 seconds ago 3.73GB minus34/gnafloader latest d2c552c72a0a 10 days ago 32GB ``` +# Sqlite Version + +To create a SQLite DB from the full CSV file (as used in the Dockerfile) use: + +``` +sqlite3 
address_principals.db + +CREATE TABLE address_principals +( + gnaf_pid text NOT NULL, + address text NOT NULL, + locality_name text NOT NULL, + postcode INTEGER NULL, + state text NOT NULL, + latitude numeric(10,8) NOT NULL, + longitude numeric(11,8) NOT NULL +); + +CREATE INDEX address_name_state ON address_principals(locality_name, state); + +.mode csv +.import address_principals.csv address_principals +.exit +``` + +This will create a 1.5GB file (about 400MB of which is the index). ## References From 91875ceb6659002b6e10feb8e468917830471a6f Mon Sep 17 00:00:00 2001 From: Simon Roberts Date: Mon, 25 Sep 2023 09:59:54 +1000 Subject: [PATCH 03/11] DB tests using SQLite --- code/db.py | 4 ++- tests/data/README.md | 26 ++++++++++++++++++ tests/data/sample-addresses.db | Bin 0 -> 20480 bytes tests/test_db.py | 48 +++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 tests/data/README.md create mode 100644 tests/data/sample-addresses.db create mode 100644 tests/test_db.py diff --git a/code/db.py b/code/db.py index 89808fb970..8cbb032175 100644 --- a/code/db.py +++ b/code/db.py @@ -43,7 +43,7 @@ def get_addresses(self, target_suburb: str, target_state: str) -> data.AddressLi def get_counts_by_suburb(self) -> dict[str, dict[str, int]]: """return a tally of addresses by state and suburb""" query = """ - SELECT locality_name, state, COUNT(*) + SELECT locality_name, state, COUNT(*) as count FROM address_principals GROUP BY locality_name, state ORDER BY state, locality_name @@ -142,6 +142,8 @@ def __init__(self, database_file: str): def execute(self, query, vars=None): """Return a list of Namespace objects for the provided query.""" query = query.replace("%s", "?") + if vars is None: + vars = {} self.cur.execute(query, vars) # sqlite doesn't support NamedTupleCursor, so we need to manually add the column names return [Namespace(**dict(zip(x.keys(), x))) for x in self.cur.fetchall()] diff --git a/tests/data/README.md b/tests/data/README.md
new file mode 100644 index 0000000000..59649bf59b --- /dev/null +++ b/tests/data/README.md @@ -0,0 +1,26 @@ +To create sample data in sqllite use the following process: + +- create empty DB per process described in DB: + +``` +sqlite3 tests/data/sample-addresses.db + +-- create table and index per process described in DB +CREATE TABLE address_principals +( + gnaf_pid text NOT NULL, + address text NOT NULL, + locality_name text NOT NULL, + postcode INTEGER NULL, + state text NOT NULL, + latitude numeric(10,8) NOT NULL, + longitude numeric(11,8) NOT NULL +); + +CREATE INDEX address_name_state ON address_principals(locality_name, state); + +-- attach and import a subset of the data +attach database './extra/db/address_principals.db' as full_db; +INSERT INTO main.address_principals SELECT * FROM full_db.address_principals WHERE locality_name like '%SOMER%' ORDER BY RANDOM() LIMIT 100; +``` + diff --git a/tests/data/sample-addresses.db b/tests/data/sample-addresses.db new file mode 100644 index 0000000000000000000000000000000000000000..c28f0fb67b8686754bdf97f484b3e0465c69511f GIT binary patch literal 20480 zcmeHOX?PT6w$7g3n*aupBoq>s#sIZ1vZcDadPy(Uba&E3gqRJDB0&fWf&oQV6>u3u z?;UU$?(qKG=kHtJkZnK@P6Nmq3>;E(&~&eM4wAbef*)_30T zocAo1L`M5t+M1CGtyAu-Z$pTzE?L>xrb-0K%F4=xKl$*d<0}Wg$?NzJ{F|-#{&L^u zW(}z;HN&SlmROeML-;H6m0=*mK!$+~0~rP~3}hI{Fpyy&!$5|C3hNq{f!X^%JM&I>G`U6%cRGDU3ln zK2P3(h{4X)yJY-yVzV^<(Dxb15<2YeDJC-L|wZ}$sn z9%LA3K#*G})laCK+|q=!HBWCtf?^a2R{Q-G@JRwt`+;hI+K(rxq zcLK_WXs@4G-`3JL4X&Rw?at;YEsX;)w1Ty%wlcAG(yb}iV9IL@Hl?FU4Su%ujI@*= z|3~?0=KBl-83r;8WEjXWkYOOhK!$+~0~rP~3}hI{Fpy#3|0e@^lyct&_)nAWy)4Ul z%a`V(<}K!+X_#@7vCVKy-`BdrG}&;yp-6Yyy2g5=HQPd4vdkmQ6{f$Nj++LV@{E5r z7V96;->R?C3+6NC1=ge1xz?_hxt2+mTJse1ICGw9jp=^VHO5`W-x=!+yA7`!R_k}@ zr(5?}-?UD)j6<<2*s?sfb(=B?`r{~sg!lhjj z6gJBT>eOt@p}otlUEFyelx~Ji4m|^K|x>1A8X*bEHA~Wfr=+vty z_47$BFri(-6)3WMVd-Ug+b|{ZQ3miOucbH>Oy}t zCH>L?m#16on?wRR0F}93`}jWEB}Dza-r6h5)K`?Mo&y3{l3a+$?DW!NT&Ll*XX;(o zYA)%K&dlO;i`S%E?5^ELH|<{wmIk7x<7OfuWYz+(VW%&4~-;s 
zwgElk7!Y(M#<3KJVH}2dc%Q%{4zW6%K(NN(f^81$r(Jul^;b6Zd;IoF>n1m$vtfOb z_u~Y~VmL}OJQ5P3yg%p-0*GIX2+6|4>XCs;7hSX+peq~t>pksd-9%hxwFYuvbS^BOSj)bp!$eij{~v$@s#VkrWn<$d852u>pr2OqYJ zVX@lnQ4-b>1Bne@v*?M(--ivZxKF=4TNtdf4eb-RHpbUsBm?%c7)KzEuo&|TwTPP! zJB45*d6qrPV*Iv|JL#o57g9NXTusj+&n2-!XLBfFSd`@moRl{QKdpn{i0JTm{gQVo zw1?CatkVoiB@6>^z@;+Q z8+3z<)d<1rxX-6B{?VGD=A*#)zAY<{z2qCHvvCTS0I#Pxl8O5lhJ)Xelw2Akem;mr)gORw9B*j|=wf5-hw;uth@vp3} zpXltZvk}UJ!U&E+DU8Js-Y<;dgU+zZnx08z73VhAup#QnID3| z2$Voc4vC1>Q4eAltAkEHTr1O!2z(@-RFwPbCA_&R@J40B0P9;*57LfOjRTQ53y2+n zV5lc~cpclqmz~~M@mJW^pYC|}@%LOMV4N}sFbu~qG)K`G#Jgl01ZyOT0xqC0m(sw< zU%K?TS17J**uUg~T~))Ky;5TWasb5=EMz5bAQXWNkUrNI-NB4~7$Qr&X>I>@_jM_` z5@TqB0Vg7mh(`$9!=mUDkbn?V=xax=e7)e&?`A-*G)KE!2>5#HY{L`+OXDofq7)Gq zSPX=SqalCSDmSGRSJV25YwiKH?)u%zp83ve)wW3$0LtMsL84cT_eK2va~EF$+e}B^ znHd#(s5PI)DHcUhDxQ!)^U14g{tqALL|W0n*rt&KlE0>#&-jGQt1H{U>we|G~y2P zuB2}&L#6a&;jw9;wf$QERyCo<*%b^^K;S6JkPs>u662#T5z=FNAt=lFZ@yd-g8BZ} zHnCMjaw?Kb9Vz+KIK#3e%J|`Dd`Ykl94Z*(RctGL%6)#p`7rp@1Z02yDQB0I9pM39e+_Sa=PSEj z&T^g3L~Zx~B)MGtn z-x_mt_o93u2P$~w5nw3E;yB9^h`Uc^{O>PIWSuOXozV^Uz6KYcmjk<{ztw24w>-_5VY2re#_GY;Csow(Pgews7Wi z=GEq#;cfgr(|i+WJY`&CtT*;F95gI4(E4Ng$Mjx(fo`kr9vxcvRpGKiZ^6&d|KDCv zn*VA3qxtUq+`KJ$GxBV?$8&#|D?T$5&v%R3E*M2X%O!P>Zt0h#H&>3v!h8SUKBx9d$j@0&ccl^JqKVfAN=2Cz1=l%R;u)c zQ7pv)rquhp#UQU3iKt7y+4RZ3KeY`Q`rGllKPh)rtJH&pnwG$z<;GFqg%3)TIt4#U z^(gsi{%7}+u%Yvz#aM|ysL`iZc}4+Y$m{w`mOok@b&Y;VV%vVY}#3z!9W>hN#kHs zphqyU69RlNg1CfmVyKW(9Bn|}-<%T#QS#u|-k()9I^flUyLaE;=y!=JX9z7H z&Tt$7quVGS^{N|NU{QGZR&~hr^lZ95nF!T=;%Eadz(2C$Rb!ETA zh8-U=9+>{Y+jYgFNR`eesKSn>pkz$MFrfXmS$X?m)8b!uK?p8)pZL!MqrFgWt3gTV zP7^a2Y2Lwy1K{s4cU8K5N>n$mDn=au`N;5UwDgS~jhpMaqU4d5|7#(ktBsC{E@(Po)x4+w7apEMn zX|Kh-2Xa2UDmO4t=D;kNOSmcc70hs|L>SwrkH6C9OREch2IC%^u=TlTJiHt{N%6#_ z+(`nWboPER2Jc-i*TU-IeMrgHr^DQxr<_NI-f!afLzGY_lo$ zYY*_EF`s_admTRtfD~SY!K*#Up)eWtOTVHH^LvFmcFs+9J{!f?LCN&m_jWVy94U)a zXCzArhNL)nm*7BRVR>YpGMkDQgFDU-*f(%B5PjcP&+OfS5kRyu)}Wf?V00?$0muPp zIPdp?@+OH^M@RcNM_cH9VAs4A=LYt14cF8cuqzJ4>ZY8 
zaqT_%=fJ*KM&-1Leomd$X$}5zUlys^HP&tVq{%5;OvGkTER+Z@rfTFcQD`xis=N36Hq+E2mpaMQ@jUWFbWZe z5Y)V^o;+yP*PDRwL96Dx5%kb00txSfz$L*M_^S%!9MicuhFL(--j~<>p*-1$Wo7*f DA0ghq literal 0 HcmV?d00001 diff --git a/tests/test_db.py b/tests/test_db.py new file mode 100644 index 0000000000..d385557474 --- /dev/null +++ b/tests/test_db.py @@ -0,0 +1,48 @@ +import os +from argparse import ArgumentParser, Namespace + +import db + +SAMPLE_ADDRESSES_DB_FILE = f"{os.path.dirname(os.path.realpath(__file__))}/data/sample-addresses.db" + +def test_get_address(): + address_db = db.connect_to_db(Namespace(dbhost=SAMPLE_ADDRESSES_DB_FILE)) + addresses = address_db.get_addresses("SOMERVILLE", "VIC") + assert len(addresses) == 30 + assert addresses[0].name == "83 GUELPH STREET SOMERVILLE 3912" + assert addresses[0].gnaf_pid == "GAVIC421048228" + + +def test_get_counts_by_suburb(): + address_db = db.connect_to_db(Namespace(dbhost=SAMPLE_ADDRESSES_DB_FILE)) + counts = address_db.get_counts_by_suburb() + assert counts["VIC"]["SOMERVILLE"] == 30 + assert counts["VIC"]["SOMERS"] == 10 + assert counts["VIC"]["SOMERTON"] == 1 + assert len(counts["NSW"]) == 2 + assert len(counts["SA"]) == 1 + assert len(counts["TAS"]) == 1 + assert len(counts["WA"]) == 1 + + +def test_get_extents_by_suburb(): + address_db = db.connect_to_db(Namespace(dbhost=SAMPLE_ADDRESSES_DB_FILE)) + extents = address_db.get_extents_by_suburb() + assert extents["VIC"]["SOMERVILLE"] == ( + (-38.23846838, 145.162399), + (-38.21306546, 145.22678832), + ) + + +def test_add_db_arguments(): + parser = ArgumentParser() + db.add_db_arguments(parser) + args = parser.parse_args([]) + assert args.dbuser == "postgres" + assert args.dbpassword == "password" + assert args.dbhost == "localhost" + assert args.dbport == "5433" + assert args.create_index + + +# TODO: test postgres with mocks From 5a6dc0f577f94660716ab5ee2dd07beb689dd863 Mon Sep 17 00:00:00 2001 From: lyricnz Date: Mon, 25 Sep 2023 00:35:57 +0000 Subject: [PATCH 04/11] [MegaLinter] Apply 
linters fixes --- code/db.py | 15 ++++++++++----- tests/test_db.py | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/code/db.py b/code/db.py index 8cbb032175..afba32efe5 100644 --- a/code/db.py +++ b/code/db.py @@ -1,21 +1,22 @@ import logging import sqlite3 +from abc import ABC, abstractmethod from argparse import ArgumentParser, Namespace +import data import psycopg2 from psycopg2.extras import NamedTupleCursor -import data -from abc import ABC, abstractmethod - class DbDriver(ABC): """Abstract class for DB connections.""" + @abstractmethod def execute(self, query, vars=None): """Return a list of Namespace objects for the provided query.""" pass + class AddressDB: """Connect to our cut-down version of the GNAF Postgres database and query for addresses.""" @@ -91,7 +92,9 @@ def add_db_arguments(parser: ArgumentParser): help="The password for the database user", default="password", ) - parser.add_argument("-H", "--dbhost", help="The hostname for the database (or file-path for Sqlite)", default="localhost") + parser.add_argument( + "-H", "--dbhost", help="The hostname for the database (or file-path for Sqlite)", default="localhost" + ) parser.add_argument("-P", "--dbport", help="The port number for the database", default="5433") parser.add_argument( "-i", @@ -103,6 +106,7 @@ def add_db_arguments(parser: ArgumentParser): class PostgresDb(DbDriver): """Class that implements Postgresql DB connection.""" + def __init__(self, database: str, host: str, port: str, user: str, password: str, create_index: bool = True): """Connect to the database""" conn = psycopg2.connect( @@ -133,6 +137,7 @@ def execute(self, query, vars=None): class SqliteDb(DbDriver): """Class that implements Sqlite DB connection (to a file). 
Pass the filename as the dbhost.""" + def __init__(self, database_file: str): """Connect to the database""" conn = sqlite3.connect(database_file) @@ -151,7 +156,7 @@ def execute(self, query, vars=None): def connect_to_db(args: Namespace) -> AddressDB: """return a DB connection based on the provided args""" - if args.dbhost.endswith('.db'): + if args.dbhost.endswith(".db"): db = SqliteDb(args.dbhost) else: db = PostgresDb( diff --git a/tests/test_db.py b/tests/test_db.py index d385557474..a0de80e506 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -5,6 +5,7 @@ SAMPLE_ADDRESSES_DB_FILE = f"{os.path.dirname(os.path.realpath(__file__))}/data/sample-addresses.db" + def test_get_address(): address_db = db.connect_to_db(Namespace(dbhost=SAMPLE_ADDRESSES_DB_FILE)) addresses = address_db.get_addresses("SOMERVILLE", "VIC") From 6236b99e3c748de375d3e3df10cefebb4019019e Mon Sep 17 00:00:00 2001 From: Simon Roberts Date: Mon, 25 Sep 2023 10:44:11 +1000 Subject: [PATCH 05/11] Make sqlite-detection (by file-extension) a little more generic --- code/db.py | 6 ++++-- ...{sample-addresses.db => sample-addresses.sqlite} | Bin tests/test_db.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) rename tests/data/{sample-addresses.db => sample-addresses.sqlite} (100%) diff --git a/code/db.py b/code/db.py index afba32efe5..f6501cb1c9 100644 --- a/code/db.py +++ b/code/db.py @@ -3,10 +3,12 @@ from abc import ABC, abstractmethod from argparse import ArgumentParser, Namespace -import data import psycopg2 from psycopg2.extras import NamedTupleCursor +import data + +SQLITE_FILE_EXTENSIONS = {"db", "sqlite", "sqlite3", "db3", "s3db", "sl3"} class DbDriver(ABC): """Abstract class for DB connections.""" @@ -156,7 +158,7 @@ def execute(self, query, vars=None): def connect_to_db(args: Namespace) -> AddressDB: """return a DB connection based on the provided args""" - if args.dbhost.endswith(".db"): + if args.dbhost.split('.')[-1] in SQLITE_FILE_EXTENSIONS: db = SqliteDb(args.dbhost) 
else: db = PostgresDb( diff --git a/tests/data/sample-addresses.db b/tests/data/sample-addresses.sqlite similarity index 100% rename from tests/data/sample-addresses.db rename to tests/data/sample-addresses.sqlite diff --git a/tests/test_db.py b/tests/test_db.py index a0de80e506..22f0807eab 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -3,7 +3,7 @@ import db -SAMPLE_ADDRESSES_DB_FILE = f"{os.path.dirname(os.path.realpath(__file__))}/data/sample-addresses.db" +SAMPLE_ADDRESSES_DB_FILE = f"{os.path.dirname(os.path.realpath(__file__))}/data/sample-addresses.sqlite" def test_get_address(): From 48ddfff9f6424f06d003d9dcb1af6dfb95d356bf Mon Sep 17 00:00:00 2001 From: lyricnz Date: Mon, 25 Sep 2023 00:46:02 +0000 Subject: [PATCH 06/11] [MegaLinter] Apply linters fixes --- code/db.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/code/db.py b/code/db.py index f6501cb1c9..f208fd7a59 100644 --- a/code/db.py +++ b/code/db.py @@ -3,13 +3,13 @@ from abc import ABC, abstractmethod from argparse import ArgumentParser, Namespace +import data import psycopg2 from psycopg2.extras import NamedTupleCursor -import data - SQLITE_FILE_EXTENSIONS = {"db", "sqlite", "sqlite3", "db3", "s3db", "sl3"} + class DbDriver(ABC): """Abstract class for DB connections.""" @@ -158,7 +158,7 @@ def execute(self, query, vars=None): def connect_to_db(args: Namespace) -> AddressDB: """return a DB connection based on the provided args""" - if args.dbhost.split('.')[-1] in SQLITE_FILE_EXTENSIONS: + if args.dbhost.split(".")[-1] in SQLITE_FILE_EXTENSIONS: db = SqliteDb(args.dbhost) else: db = PostgresDb( From f77debb6db7885f2c55e86f4dc362f43886f4843 Mon Sep 17 00:00:00 2001 From: Simon Roberts Date: Tue, 26 Sep 2023 09:18:07 +1000 Subject: [PATCH 07/11] Script to convert docker/postgresql DB to sqlite --- extra/db/docker2sqlite.sh | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100755 extra/db/docker2sqlite.sh diff --git 
a/extra/db/docker2sqlite.sh b/extra/db/docker2sqlite.sh new file mode 100755 index 0000000000..f119811672 --- /dev/null +++ b/extra/db/docker2sqlite.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -ex + +# Extract CSV from the DB if we don't have it already. +# It's also available as part of the docker-build process, but this is a bit more flexible. +CSV_FILENAME=address_principals.csv +if [ -f $CSV_FILENAME ]; then + echo "CSV file already exists, skipping extract..." +else + docker run -d --name db --publish=5433:5432 lukeprior/nbn-upgrade-map-db:latest + sleep 5 # it takes a few seconds to be ready + psql -h localhost -p 5433 -U postgres -c 'COPY gnaf_cutdown.address_principals TO stdout WITH CSV HEADER' > $CSV_FILENAME + docker rm -f db +fi + +# Create a new sqlite DB with the contents of the CSV +DB_FILENAME=address_principals.sqllite +if [ -f $DB_FILENAME ]; then + echo "sqllite file already exists, skipping creation..." +else + sqlite3 $DB_FILENAME < Date: Wed, 27 Sep 2023 15:18:28 +1000 Subject: [PATCH 08/11] Create a release with address_principals.sqllite --- .github/workflows/publish-db-image.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/publish-db-image.yml b/.github/workflows/publish-db-image.yml index 3c5a78ac92..f88097d9fd 100644 --- a/.github/workflows/publish-db-image.yml +++ b/.github/workflows/publish-db-image.yml @@ -59,3 +59,13 @@ jobs: labels: ${{ steps.meta.outputs.labels }} build-args: | GNAF_LOADER_TAG=${{ steps.version.outputs.GNAF_LOADER_TAG }} + + - name: Convert the Postgres DB to SQLite + run: ./extra/db/docker2sqlite.sh + + - name: Release + uses: softprops/action-gh-release@v1 + with: + tag_name: sqlite-db-${{ steps.version.outputs.GNAF_LOADER_TAG }} + body: SQLite DB for the cutdown version of the GNAF address database + files: address_principals.sqllite From a7e3b92e366c726a99ff9b0b601301ca12075033 Mon Sep 17 00:00:00 2001 From: Simon Roberts Date: Thu, 28 Sep 2023 13:48:52 +1000 Subject: [PATCH 09/11] 
Fix sqlite output filename --- .github/workflows/publish-db-image.yml | 2 +- extra/db/docker2sqlite.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish-db-image.yml b/.github/workflows/publish-db-image.yml index f88097d9fd..30ccf33128 100644 --- a/.github/workflows/publish-db-image.yml +++ b/.github/workflows/publish-db-image.yml @@ -68,4 +68,4 @@ jobs: with: tag_name: sqlite-db-${{ steps.version.outputs.GNAF_LOADER_TAG }} body: SQLite DB for the cutdown version of the GNAF address database - files: address_principals.sqllite + files: address_principals.sqlite diff --git a/extra/db/docker2sqlite.sh b/extra/db/docker2sqlite.sh index f119811672..1ee0c1ee55 100755 --- a/extra/db/docker2sqlite.sh +++ b/extra/db/docker2sqlite.sh @@ -15,7 +15,7 @@ else fi # Create a new sqlite DB with the contents of the CSV -DB_FILENAME=address_principals.sqllite +DB_FILENAME=address_principals.sqlite if [ -f $DB_FILENAME ]; then echo "sqllite file already exists, skipping creation..." 
else From 20c1063f7dbb56e0257553958be715adbae34d73 Mon Sep 17 00:00:00 2001 From: Simon Roberts Date: Thu, 28 Sep 2023 13:50:31 +1000 Subject: [PATCH 10/11] Ignore the sqlite DB from git --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index af792ac291..12298bd433 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ cache code/__pycache__ megalinter-reports/ +address_principals.sqlite From dd24ab82fb0213c7636a1385d37d361424934162 Mon Sep 17 00:00:00 2001 From: Simon Roberts Date: Thu, 28 Sep 2023 13:52:46 +1000 Subject: [PATCH 11/11] Fix a couple of documentation references to SQLite --- extra/db/docker2sqlite.sh | 2 +- tests/data/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/extra/db/docker2sqlite.sh b/extra/db/docker2sqlite.sh index 1ee0c1ee55..98162916c3 100755 --- a/extra/db/docker2sqlite.sh +++ b/extra/db/docker2sqlite.sh @@ -17,7 +17,7 @@ fi # Create a new sqlite DB with the contents of the CSV DB_FILENAME=address_principals.sqlite if [ -f $DB_FILENAME ]; then - echo "sqllite file already exists, skipping creation..." + echo "SQLite file $DB_FILENAME already exists, skipping creation..." else sqlite3 $DB_FILENAME <