From df2eff6d7d9104d2ba9d629123b1b14ee44a05a8 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Wed, 28 Sep 2022 14:28:00 -0700 Subject: [PATCH 1/3] (HP-908): add migration to remove deprecated metadata keys --- .secrets.baseline | 20 ++++- ...6819874e85b9_remove_deprecated_metadata.py | 60 +++++++++++++++ tests/test_migrations.py | 76 +++++++++++++++++++ 3 files changed, 153 insertions(+), 3 deletions(-) create mode 100644 migrations/versions/6819874e85b9_remove_deprecated_metadata.py diff --git a/.secrets.baseline b/.secrets.baseline index f82193b4..6de6af72 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -1,9 +1,9 @@ { "exclude": { - "files": "^.secrets.baseline$", + "files": null, "lines": null }, - "generated_at": "2022-09-15T16:17:07Z", + "generated_at": "2022-09-28T21:26:32Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -90,6 +90,14 @@ "type": "Hex High Entropy String" } ], + "migrations/versions/6819874e85b9_remove_deprecated_metadata.py": [ + { + "hashed_secret": "ecdb6b62dc6de954dbbef8185029415aecae5e5a", + "is_verified": false, + "line_number": 15, + "type": "Hex High Entropy String" + } + ], "poetry.lock": [ { "hashed_secret": "940ab7206e90c8d2983ca7a38eca2d4a59d85fb5", @@ -102,7 +110,7 @@ { "hashed_secret": "143e9f2aca10dbd2711cb96047f4016f095e5709", "is_verified": false, - "line_number": 3638, + "line_number": 3898, "type": "Hex High Entropy String" } ], @@ -112,6 +120,12 @@ "is_verified": false, "line_number": 225, "type": "Hex High Entropy String" + }, + { + "hashed_secret": "ecdb6b62dc6de954dbbef8185029415aecae5e5a", + "is_verified": false, + "line_number": 298, + "type": "Hex High Entropy String" } ] }, diff --git a/migrations/versions/6819874e85b9_remove_deprecated_metadata.py b/migrations/versions/6819874e85b9_remove_deprecated_metadata.py new file mode 100644 index 00000000..8d535a36 --- /dev/null +++ b/migrations/versions/6819874e85b9_remove_deprecated_metadata.py @@ -0,0 +1,60 @@ +"""remove deprecated metadata + +Revision ID: 6819874e85b9 +Revises: 3354f2c466ec +Create Date: 2022-09-27 13:43:39.827523 + +""" +import json + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = "6819874e85b9" +down_revision = "3354f2c466ec" +branch_labels = None +depends_on = None + + +def escape(str): + # escape single quotes for SQL statement + return str.replace("'", "''") + + +def upgrade(): + """Remove deprecated metadata keys.""" + + remove_metadata_keys = ["_uploader_id", "_filename", "_bucket", "_file_extension"] + + # extract existing PK (guid) and authz data (resource_path) from the metadata column + connection = op.get_bind() + offset = 0 + limit = 500 + query = ( + f"SELECT guid, data FROM metadata ORDER BY guid LIMIT {limit} OFFSET {offset}" + ) + results = connection.execute(query).fetchall() + while results: + for r in results: + guid, data = r[0], r[1] + have_key = False + # scrub internal fields from metadata + for metadata_key in remove_metadata_keys: + if metadata_key in data.keys(): + data.pop(metadata_key) + have_key = True + if have_key: + sql_statement = f"""UPDATE metadata + SET data='{escape(json.dumps(data))}' + WHERE guid='{guid}'""" + connection.execute(sql_statement) + # Grab another batch of rows + offset += limit + query = f"SELECT guid, data FROM metadata ORDER BY guid LIMIT {limit} OFFSET {offset} " + results = connection.execute(query).fetchall() + + +def downgrade(): + pass diff --git a/tests/test_migrations.py b/tests/test_migrations.py index efeeda42..eff68a25 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -238,5 +238,81 @@ async def test_4d93784a25e5_upgrade( _reset_migrations() +@pytest.mark.asyncio +@pytest.mark.parametrize( + "old_metadata, new_metadata", + [ + # data has keys + ( + { + "foo": "bar", + "bizz": "buzz", + "_uploader_id": "uploader", + "_filename": "hello.txt", + "_bucket": "mybucket", + "_file_extension": ".txt", + }, + {"foo": "bar", "bizz": "buzz"}, + ), + # data unchanged if no keys are present + ( + {"foo": "bar", "bizz": "buzz"}, + {"foo": "bar", "bizz": "buzz"}, + ), + ], +) +async def test_6819874e85b9_upgrade(old_metadata: dict, new_metadata: dict): + """ + We can't create metadata by using the `client` fixture because of two reasons: + 1) Calling the API when the db is in the downgraded state will give + UndefinedColumnErrors (metadata.authz) because the db and the data model are out of sync + 2) this issue: https://github.com/encode/starlette/issues/440 + so inserting directly into the DB instead. 
+ """ + + # before "remove_deprecated_metadata" migration + alembic_main(["--raiseerr", "downgrade", "3354f2c466ec"]) + + fake_guid = "7891011" + authz_data = {"version": 0, "_resource_paths": ["/programs/DEV"]} + + async with db.with_bind(DB_DSN): + + # insert data + sql_old_metadata = escape(json.dumps(old_metadata)) + sql_authz_data = escape(json.dumps(authz_data)) + insert_stmt = f"INSERT INTO metadata(\"guid\", \"data\", \"authz\") VALUES ('{fake_guid}', '{sql_old_metadata}', '{sql_authz_data}')" + await db.scalar(db.text(insert_stmt)) + + try: + # check that the request data was inserted correctly + data = await db.all( + db.text( + f"SELECT guid, data, authz FROM metadata WHERE guid = '{fake_guid}'" + ) + ) + row = {k: v for k, v in data[0].items()} + assert row == {"guid": fake_guid, "data": old_metadata, "authz": authz_data} + + # run "add_authz_column" migration + alembic_main(["--raiseerr", "upgrade", "6819874e85b9"]) + + # check that the migration removed the deprecated keys + data = await db.all( + db.text( + f"SELECT guid, data, authz FROM metadata WHERE guid = '{fake_guid}'" + ) + ) + assert len(data) == 1 + row = {k: v for k, v in data[0].items()} + + assert row == {"guid": fake_guid, "data": new_metadata, "authz": authz_data} + + finally: + await db.all(db.text(f"DELETE FROM metadata WHERE guid = '{fake_guid}'")) + + _reset_migrations() + + def _reset_migrations(): alembic_main(["--raiseerr", "upgrade", "head"]) From 23efc363dce322e8374725b4ecb17a5d8cd4ca95 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Thu, 29 Sep 2022 10:08:30 -0700 Subject: [PATCH 2/3] (HP-908): simplify migration and test --- .secrets.baseline | 4 +-- ...6819874e85b9_remove_deprecated_metadata.py | 13 +++---- tests/test_migrations.py | 36 ++++++------------- 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 6de6af72..aa19dbed 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": null, "lines": null }, - "generated_at": "2022-09-28T21:26:32Z", + "generated_at": "2022-09-29T17:06:11Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -124,7 +124,7 @@ { "hashed_secret": "ecdb6b62dc6de954dbbef8185029415aecae5e5a", "is_verified": false, - "line_number": 298, + "line_number": 285, "type": "Hex High Entropy String" } ] diff --git a/migrations/versions/6819874e85b9_remove_deprecated_metadata.py b/migrations/versions/6819874e85b9_remove_deprecated_metadata.py index 8d535a36..c0c4feaa 100644 --- a/migrations/versions/6819874e85b9_remove_deprecated_metadata.py +++ b/migrations/versions/6819874e85b9_remove_deprecated_metadata.py @@ -28,7 +28,7 @@ def upgrade(): remove_metadata_keys = ["_uploader_id", "_filename", "_bucket", "_file_extension"] - # extract existing PK (guid) and authz data (resource_path) from the metadata column + # extract existing PK (guid) and metadata (data) columns connection = op.get_bind() offset = 0 limit = 500 @@ -39,17 +39,14 @@ def upgrade(): while results: for r in results: guid, data = r[0], r[1] - have_key = False # scrub internal fields from metadata for metadata_key in remove_metadata_keys: if metadata_key in data.keys(): data.pop(metadata_key) - have_key = True - if have_key: - sql_statement = f"""UPDATE metadata - SET data='{escape(json.dumps(data))}' - WHERE guid='{guid}'""" - connection.execute(sql_statement) + sql_statement = f"""UPDATE metadata + SET data='{escape(json.dumps(data))}' + WHERE guid='{guid}'""" + connection.execute(sql_statement) # Grab another batch of rows offset += limit 
query = f"SELECT guid, data FROM metadata ORDER BY guid LIMIT {limit} OFFSET {offset} " diff --git a/tests/test_migrations.py b/tests/test_migrations.py index eff68a25..1bf79ee4 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -239,29 +239,7 @@ async def test_4d93784a25e5_upgrade( @pytest.mark.asyncio -@pytest.mark.parametrize( - "old_metadata, new_metadata", - [ - # data has keys - ( - { - "foo": "bar", - "bizz": "buzz", - "_uploader_id": "uploader", - "_filename": "hello.txt", - "_bucket": "mybucket", - "_file_extension": ".txt", - }, - {"foo": "bar", "bizz": "buzz"}, - ), - # data unchanged if no keys are present - ( - {"foo": "bar", "bizz": "buzz"}, - {"foo": "bar", "bizz": "buzz"}, - ), - ], -) -async def test_6819874e85b9_upgrade(old_metadata: dict, new_metadata: dict): +async def test_6819874e85b9_upgrade(): """ We can't create metadata by using the `client` fixture because of two reasons: 1) Calling the API when the db is in the downgraded state will give @@ -274,6 +252,15 @@ async def test_6819874e85b9_upgrade(old_metadata: dict, new_metadata: dict): alembic_main(["--raiseerr", "downgrade", "3354f2c466ec"]) fake_guid = "7891011" + old_metadata = { + "foo": "bar", + "bizz": "buzz", + "_uploader_id": "uploader", + "_filename": "hello.txt", + "_bucket": "mybucket", + "_file_extension": ".txt", + } + new_metadata = {"foo": "bar", "bizz": "buzz"} authz_data = {"version": 0, "_resource_paths": ["/programs/DEV"]} async with db.with_bind(DB_DSN): @@ -294,7 +281,7 @@ async def test_6819874e85b9_upgrade(old_metadata: dict, new_metadata: dict): row = {k: v for k, v in data[0].items()} assert row == {"guid": fake_guid, "data": old_metadata, "authz": authz_data} - # run "add_authz_column" migration + # run "remove_deprecated_metadata" migration alembic_main(["--raiseerr", "upgrade", "6819874e85b9"]) # check that the migration removed the deprecated keys @@ -305,7 +292,6 @@ async def test_6819874e85b9_upgrade(old_metadata: dict, new_metadata: dict): ) assert len(data) == 1 row = {k: v for k, v in data[0].items()} - assert row == {"guid": fake_guid, "data": new_metadata, "authz": authz_data} finally: From 4ab83fe680da0b17c8b2feab839803a7dac8e5a4 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Mon, 3 Oct 2022 11:11:47 -0700 Subject: [PATCH 3/3] (HP-908): simplify comments in test --- .secrets.baseline | 4 ++-- tests/test_migrations.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index aa19dbed..b7107886 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": null, "lines": null }, - "generated_at": "2022-09-29T17:06:11Z", + "generated_at": "2022-10-03T18:10:28Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -124,7 +124,7 @@ { "hashed_secret": "ecdb6b62dc6de954dbbef8185029415aecae5e5a", "is_verified": false, - "line_number": 285, + "line_number": 283, "type": "Hex High Entropy String" } ] diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 1bf79ee4..9c8a2be3 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -241,10 +241,8 @@ async def test_4d93784a25e5_upgrade( @pytest.mark.asyncio async def test_6819874e85b9_upgrade(): """ - We can't create metadata by using the `client` fixture because of two reasons: - 1) Calling the API when the db is in the downgraded state will give - UndefinedColumnErrors (metadata.authz) because the db and the data model are out of sync - 2) this issue: https://github.com/encode/starlette/issues/440 + We can't 
create metadata by using the `client` fixture because of this issue: + https://github.com/encode/starlette/issues/440 so inserting directly into the DB instead. """
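
The upgrade loop's key-scrub is, at heart, a pure dict transformation, which is exactly what the parametrized cases in the first patch's test exercise. A minimal standalone sketch of that transformation (the `scrub` name and the asserts are illustrative, not code from the repo; the key tuple matches `remove_metadata_keys` in the migration):

    DEPRECATED_KEYS = ("_uploader_id", "_filename", "_bucket", "_file_extension")


    def scrub(data: dict) -> dict:
        """Return a copy of `data` without the deprecated internal keys."""
        return {k: v for k, v in data.items() if k not in DEPRECATED_KEYS}


    # Mirrors the test's before/after pairs: deprecated keys removed when
    # present, data unchanged when none are present.
    assert scrub({"foo": "bar", "_uploader_id": "uploader"}) == {"foo": "bar"}
    assert scrub({"foo": "bar", "bizz": "buzz"}) == {"foo": "bar", "bizz": "buzz"}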
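Both the migration and the test build SQL by interpolating JSON into f-strings, which is why the hand-rolled `escape` helper for single quotes is needed at all. Driver-bound parameters make that helper unnecessary and rule out quoting bugs entirely; a sketch of the same UPDATE against the connection that `op.get_bind()` returns (the `update_row` wrapper is illustrative; the table and column names are the migration's own):

    import json

    import sqlalchemy as sa


    def update_row(connection, guid: str, data: dict) -> None:
        # The driver escapes bound values, so quotes inside the JSON
        # payload cannot break out of the statement.
        connection.execute(
            sa.text("UPDATE metadata SET data = :data WHERE guid = :guid"),
            {"data": json.dumps(data), "guid": guid},
        )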
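Two smaller points on the batch loop. First, patch 2's simplification drops the `have_key` guard, so every fetched row is rewritten even when no deprecated key was present; that trades extra writes on a large table for less code. Second, `LIMIT/OFFSET` paging rescans from the start of the ordering on every batch, and it stays consistent here only because the scrub never changes `guid`, the ordering key. Keyset pagination sidesteps both the rescans and that subtlety; a sketch assuming `guid` is a non-empty text primary key, as in this schema (the `iter_batches` name is illustrative):

    import sqlalchemy as sa


    def iter_batches(connection, batch_size: int = 500):
        """Yield batches of (guid, data) rows in guid order, without OFFSET."""
        last_guid = ""
        while True:
            rows = connection.execute(
                sa.text(
                    "SELECT guid, data FROM metadata"
                    " WHERE guid > :last ORDER BY guid LIMIT :limit"
                ),
                {"last": last_guid, "limit": batch_size},
            ).fetchall()
            if not rows:
                break
            yield rows
            last_guid = rows[-1][0]

Each batch resumes strictly after the last guid seen, so rows updated in a previous batch are never revisited, and no full-table re-sort is needed per batch.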