From 75bee9a8603cb47fe4cd6d5af46ef07c9f7b8494 Mon Sep 17 00:00:00 2001
From: aaronfriedman
Date: Tue, 5 Nov 2024 14:21:16 -0500
Subject: [PATCH 1/2] Use updated barcode to patron_id query

---
 .github/workflows/run-tests.yml |  7 +--
 .python-version                 |  1 +
 CHANGELOG.md                    | 11 ++++
 Dockerfile                      |  2 +-
 README.md                       |  2 -
 config/devel.yaml               |  2 -
 config/production.yaml          |  4 +-
 config/qa.yaml                  |  4 +-
 devel_requirements.txt          |  3 ++
 helpers/query_helper.py         | 16 ++++--
 main.py                         | 82 ++++++++---------------------
 requirements.txt                |  7 +--
 tests/conftest.py               | 43 +++++++++++++++
 tests/test_helpers.py           | 41 ---------------
 tests/test_main.py              | 93 +++++----------------------------
 15 files changed, 116 insertions(+), 202 deletions(-)
 create mode 100644 .python-version
 create mode 100644 devel_requirements.txt
 create mode 100644 tests/conftest.py
 delete mode 100644 tests/test_helpers.py

diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index cfe4805..6a55aad 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -17,16 +17,17 @@ jobs:
       - name: Checkout repo
         uses: actions/checkout@v4
 
-      - name: Set up Python 3.9
-        uses: actions/setup-python@v4
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: '3.12'
           cache: 'pip'
 
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
+          pip install -r devel_requirements.txt
 
       - name: Run linter and test suite
         run: |
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..87dbaa1
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12.0
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3c078a3..25f8ff3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## 2024-11-05 -- v2.0.0
+### Added
+- Rewrite Sierra barcode --> patron_id query to use more efficient phrase_entry table
+- Delete Sierra query retry logic
+- Increase batch size to 10k
+
+### Fixed
+- Upgrade from Python 3.9 to Python 3.12
+- Upgrade to `nypl-py-utils` v1.4.0
+- Add .python-version, devel_requirements, and conftest files
+
 ## 2024-06-21 -- v1.1.0
 ### Fixed
 - Handle patron_home_library_code in standardized way (convert empty strings and 'none' to NULL)
diff --git a/Dockerfile b/Dockerfile
index 85ed9f8..a65505c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.9
+FROM python:3.12
 
 ADD . /src
 WORKDIR /src
diff --git a/README.md b/README.md
index 97c8cd5..502cac6 100644
--- a/README.md
+++ b/README.md
@@ -64,8 +64,6 @@ The first 11 unencrypted variables (every variable through `KINESIS_BATCH_SIZE`)
 | `S3_RESOURCE` | Name of the resource for the S3 cache. This differs between QA and prod and should be empty when not using the cache locally. |
 | `PC_RESERVE_SCHEMA_URL` | Platform API endpoint from which to retrieve the PcReserve Avro schema |
 | `KINESIS_BATCH_SIZE` | How many records should be sent to Kinesis at once. Kinesis supports up to 500 records per batch. |
-| `SIERRA_TIMEOUT` (optional) | Number of minutes a Sierra query is allowed to run before timing out. This should be `5` or fewer in `qa` and `production` to prevent an ECS bug where a statement timeout error isn't propagated and the connection hangs. Set to `5` by default. |
-| `MAX_SIERRA_ATTEMPTS` (optional) | Maximum number of attempts to try querying Sierra before erroring out. Hence, the maximum total Sierra query time will be `SIERRA_TIMEOUT` times `MAX_SIERRA_ATTEMPTS` minutes. Set to `10` by default. |
 | `LOG_LEVEL` (optional) | What level of logs should be output. Set to `info` by default. |
 | `MAX_BATCHES` (optional) | The maximum number of times the poller should poll Envisionware per session. If this is not set, the poller will continue querying until all new records in Envisionware have been processed. |
 | `IGNORE_CACHE` (optional) | Whether fetching and setting the state from S3 should not be done. If this is true, the `PCR_DATE_TIME` and `PCR_KEY` environment variables will be used for the initial state (or `2023-01-01 00:00:00 +0000` and `0` by default). |
diff --git a/config/devel.yaml b/config/devel.yaml
index 3ed3bbb..94f9f22 100644
--- a/config/devel.yaml
+++ b/config/devel.yaml
@@ -11,8 +11,6 @@ PLAINTEXT_VARIABLES:
   S3_RESOURCE:
   PC_RESERVE_SCHEMA_URL: https://qa-platform.nypl.org/api/v0.1/current-schemas/PcReserve
   KINESIS_BATCH_SIZE: 20
-  SIERRA_TIMEOUT: 10
-  MAX_SIERRA_ATTEMPTS: 3
   LOG_LEVEL: info
   MAX_BATCHES: 3
   IGNORE_CACHE: True
diff --git a/config/production.yaml b/config/production.yaml
index a6b87b5..3d4bb5c 100644
--- a/config/production.yaml
+++ b/config/production.yaml
@@ -6,13 +6,11 @@ PLAINTEXT_VARIABLES:
   SIERRA_DB_PORT: 1032
   SIERRA_DB_NAME: iii
   REDSHIFT_DB_NAME: production
-  ENVISIONWARE_BATCH_SIZE: 500
+  ENVISIONWARE_BATCH_SIZE: 10000
   S3_BUCKET: bic-poller-states-production
   S3_RESOURCE: pc_reserve_poller_state.json
   PC_RESERVE_SCHEMA_URL: https://platform.nypl.org/api/v0.1/current-schemas/PcReserve
   KINESIS_BATCH_SIZE: 500
-  SIERRA_TIMEOUT: 5
-  MAX_SIERRA_ATTEMPTS: 10
 ENCRYPTED_VARIABLES:
   ENVISIONWARE_DB_HOST: AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAGwwagYJKoZIhvcNAQcGoF0wWwIBADBWBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDAuBv/BphGCF+7eZXQIBEIApcoNy31Pd5JJDeeAmSKF8bGWMOZPLx9GgSMNxY1DTab6tICcqGK+onys=
   ENVISIONWARE_DB_USER: AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAGcwZQYJKoZIhvcNAQcGoFgwVgIBADBRBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDBYJiqjEoMyidrOt4QIBEIAk8jOvhrYElM4VzBEZ4pGnO7Dhr7R04mEU5y6mTqe+jUHEZMe9
diff --git a/config/qa.yaml b/config/qa.yaml
index 31a2036..af8b7d3 100644
--- a/config/qa.yaml
+++ b/config/qa.yaml
@@ -6,13 +6,11 @@ PLAINTEXT_VARIABLES:
   SIERRA_DB_PORT: 1032
   SIERRA_DB_NAME: iii
   REDSHIFT_DB_NAME: qa
-  ENVISIONWARE_BATCH_SIZE: 500
+  ENVISIONWARE_BATCH_SIZE: 10000
   S3_BUCKET: bic-poller-states-qa
   S3_RESOURCE: pc_reserve_poller_state.json
   PC_RESERVE_SCHEMA_URL: https://qa-platform.nypl.org/api/v0.1/current-schemas/PcReserve
   KINESIS_BATCH_SIZE: 500
-  SIERRA_TIMEOUT: 5
-  MAX_SIERRA_ATTEMPTS: 10
 ENCRYPTED_VARIABLES:
   ENVISIONWARE_DB_HOST: AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAGwwagYJKoZIhvcNAQcGoF0wWwIBADBWBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDAuBv/BphGCF+7eZXQIBEIApcoNy31Pd5JJDeeAmSKF8bGWMOZPLx9GgSMNxY1DTab6tICcqGK+onys=
   ENVISIONWARE_DB_USER: AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAGcwZQYJKoZIhvcNAQcGoFgwVgIBADBRBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDBYJiqjEoMyidrOt4QIBEIAk8jOvhrYElM4VzBEZ4pGnO7Dhr7R04mEU5y6mTqe+jUHEZMe9
diff --git a/devel_requirements.txt b/devel_requirements.txt
new file mode 100644
index 0000000..1ae9e07
--- /dev/null
+++ b/devel_requirements.txt
@@ -0,0 +1,3 @@
+black
+pytest
+pytest-mock
\ No newline at end of file
diff --git a/helpers/query_helper.py b/helpers/query_helper.py
index f0d94be..45ded64 100644
--- a/helpers/query_helper.py
+++ b/helpers/query_helper.py
@@ -10,13 +10,23 @@
     LIMIT {limit};"""
 
 _SIERRA_QUERY = """
-    SELECT
+    WITH phrase_barcodes AS (
+        SELECT record_id, index_entry
+        FROM sierra_view.phrase_entry
+        WHERE index_tag = 'b' AND record_id IN (
+            SELECT record_id
+            FROM sierra_view.phrase_entry
+            WHERE index_tag || index_entry IN ({})
+        )
+    )
+    SELECT
         barcode, id, ptype_code, pcode3,
         CASE WHEN LENGTH(TRIM(home_library_code)) = 0
             OR TRIM(home_library_code) = 'none' THEN NULL
             ELSE TRIM(home_library_code) END
-    FROM sierra_view.patron_view
-    WHERE barcode IN ({});"""
+    FROM phrase_barcodes INNER JOIN sierra_view.patron_view
+        ON phrase_barcodes.record_id = patron_view.id
+        AND phrase_barcodes.index_entry = patron_view.barcode;"""
 
 _REDSHIFT_QUERY = """
     SELECT patron_id, postal_code, geoid
diff --git a/main.py b/main.py
index ab157e7..298e1f5 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,4 @@
 import json
-import logging
 import os
 
 import pandas as pd
@@ -9,13 +8,10 @@
     build_redshift_query,
     build_sierra_query,
 )
-from nypl_py_utils.classes.avro_encoder import AvroEncoder
+from nypl_py_utils.classes.avro_client import AvroEncoder
 from nypl_py_utils.classes.kinesis_client import KinesisClient
 from nypl_py_utils.classes.mysql_client import MySQLClient
-from nypl_py_utils.classes.postgresql_client import (
-    PostgreSQLClient,
-    PostgreSQLClientError,
-)
+from nypl_py_utils.classes.postgresql_client import PostgreSQLClient
 from nypl_py_utils.classes.redshift_client import RedshiftClient
 from nypl_py_utils.classes.s3_client import S3Client
 from nypl_py_utils.functions.config_helper import load_env_file
@@ -71,8 +67,6 @@ def main():
         os.environ["REDSHIFT_DB_PASSWORD"],
     )
 
-    sierra_timeout = os.environ.get("SIERRA_TIMEOUT", "5")
-    max_sierra_attempts = int(os.environ.get("MAX_SIERRA_ATTEMPTS", "5"))
     has_max_batches = "MAX_BATCHES" in os.environ
     finished = False
     batch_number = 1
@@ -90,7 +84,7 @@ def main():
             )
         )
         envisionware_client.close_connection()
-        if len(pc_reserve_raw_data) == 0:
+        if not pc_reserve_raw_data:
             break
         pc_reserve_df = pd.DataFrame(
             data=pc_reserve_raw_data,
@@ -104,7 +98,10 @@ def main():
                 "staff_override",
             ],
         )
-        pc_reserve_df["key"] = pc_reserve_df["key"].astype("Int64").astype("string")
+        pc_reserve_df["key"] = pc_reserve_df["key"].astype("Int64")
+        pc_reserve_df[["key", "barcode"]] = pc_reserve_df[["key", "barcode"]].astype(
+            "string"
+        )
         pc_reserve_df["transaction_et"] = pc_reserve_df["transaction_et"].dt.date
 
         # Obfuscate key
@@ -113,39 +110,9 @@ def main():
             pc_reserve_df["key"] = list(executor.map(obfuscate, pc_reserve_df["key"]))
 
         # Query Sierra for patron info using the patron barcodes
-        barcodes_str = "','".join(
-            pc_reserve_df["barcode"].to_string(index=False).split()
-        )
-        barcodes_str = "'" + barcodes_str + "'"
-        sierra_query = build_sierra_query(barcodes_str)
-
+        barcodes_str = "'b" + "','b".join(pc_reserve_df["barcode"].unique()) + "'"
         sierra_client.connect()
-        logger.info(f"Setting Sierra query timeout to {sierra_timeout} minutes")
-        sierra_client.execute_query(f"SET statement_timeout='{sierra_timeout}min';")
-        logger.info("Querying Sierra for patron information by barcode")
-
-        initial_log_level = logging.getLogger("postgresql_client").getEffectiveLevel()
-        logging.getLogger("postgresql_client").setLevel(logging.CRITICAL)
-        finished = False
-        num_attempts = 1
-        while not finished:
-            try:
-                sierra_raw_data = sierra_client.execute_query(sierra_query)
-                finished = True
-            except PostgreSQLClientError as e:
-                if num_attempts < max_sierra_attempts:
-                    logger.info("Query failed -- trying again")
-                    num_attempts += 1
-                else:
-                    logger.error(
-                        f"Error executing Sierra query {sierra_query}:" "\n{e}"
-                    )
-                    sierra_client.close_connection()
-                    s3_client.close()
-                    kinesis_client.close()
-                    raise e from None
-        logging.getLogger("postgresql_client").setLevel(initial_log_level)
-
+        sierra_raw_data = sierra_client.execute_query(build_sierra_query(barcodes_str))
         sierra_client.close_connection()
         sierra_df = pd.DataFrame(
             data=sierra_raw_data,
@@ -158,14 +125,15 @@ def main():
             ],
         )
 
-        # Some barcodes correspond to multiple patron records. For these
-        # barcodes, do not use patron information from any of the records.
+        # Some barcodes correspond to multiple patron records. For these barcodes, do
+        # not use patron information from any of the records.
+        sierra_df = sierra_df[pd.notnull(sierra_df["barcode"])]
         sierra_df = sierra_df.drop_duplicates("barcode", keep=False)
         sierra_df["patron_id"] = sierra_df["patron_id"].astype("Int64").astype("string")
 
-        # Merge the dataframes, set the patron retrieval status, and obfuscate
-        # the patron_id. The patron_id is either the Sierra id or, if no Sierra
-        # id is found for the barcode, the barcode prepended with 'barcode '.
+        # Merge the dataframes, set the patron retrieval status, and obfuscate the
+        # patron_id. The patron_id is either the Sierra id or, if no Sierra id is found
+        # for the barcode, the barcode prepended with 'barcode '.
         pc_reserve_df = pc_reserve_df.merge(sierra_df, how="left", on="barcode")
         pc_reserve_df = pc_reserve_df.apply(_set_patron_retrieval_status, axis=1)
         with ThreadPoolExecutor() as executor:
@@ -173,14 +141,12 @@ def main():
                 executor.map(obfuscate, pc_reserve_df["patron_id"])
             )
 
-        # Query Redshift for the zip code and geoid using the obfuscated Sierra
-        # ids
-        sierra_ids = pc_reserve_df[pc_reserve_df["patron_retrieval_status"] == "found"][
-            "patron_id"
+        # Query Redshift for the zip code and geoid using the obfuscated Sierra ids
+        sierra_ids = pc_reserve_df.loc[
+            pc_reserve_df["patron_retrieval_status"] == "found", "patron_id"
         ]
-        if len(sierra_ids) > 0:
-            ids_str = "','".join(sierra_ids.to_string(index=False).split())
-            ids_str = "'" + ids_str + "'"
+        if not sierra_ids.empty:
+            ids_str = "'" + "','".join(sierra_ids.unique()) + "'"
             redshift_table = "patron_info"
             if os.environ["REDSHIFT_DB_NAME"] != "production":
                 redshift_table += "_" + os.environ["REDSHIFT_DB_NAME"]
@@ -248,7 +214,8 @@ def main():
 
 
 def _set_patron_retrieval_status(row):
-    if not pd.isnull(row["patron_id"]):
+    """Sets a barcode's Sierra retrieval status"""
+    if pd.notnull(row["patron_id"]):
         row["patron_retrieval_status"] = "found"
     elif row["barcode"].startswith("25555"):
         row["patron_retrieval_status"] = "guest_pass"
@@ -260,10 +227,7 @@ def _set_patron_retrieval_status(row):
 
 
 def _get_poller_state(s3_client, poller_state, batch_number):
-    """
-    Retrieves the poller state from the S3 cache, the config, or the local
-    memory
-    """
+    """Retrieves the poller state from the S3 cache, the config, or the local memory"""
     if os.environ.get("IGNORE_CACHE", False) != "True":
         return s3_client.fetch_cache()
     elif batch_number == 1:
diff --git a/requirements.txt b/requirements.txt
index b13f0a0..40e63f2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,2 @@
-black
-nypl-py-utils[avro-encoder,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper]==1.1.5
-pandas
-pytest
-pytest-mock
\ No newline at end of file
+nypl-py-utils[avro-client,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper]==1.4.0
+pandas
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..5c23402
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,43 @@
+import os
+import pytest
+
+# Sets OS vars for entire set of tests
+TEST_ENV_VARS = {
+    "ENVIRONMENT": "test_environment",
+    "AWS_REGION": "test_aws_region",
+    "ENVISIONWARE_DB_PORT": "test_envisionware_port",
+    "ENVISIONWARE_DB_NAME": "test_envisionware_name",
+    "ENVISIONWARE_DB_HOST": "test_envisionware_host",
+    "ENVISIONWARE_DB_USER": "test_envisionware_user",
+    "ENVISIONWARE_DB_PASSWORD": "test_envisionware_password",
+    "SIERRA_DB_PORT": "test_sierra_port",
+    "SIERRA_DB_NAME": "test_sierra_name",
+    "SIERRA_DB_HOST": "test_sierra_host",
+    "SIERRA_DB_USER": "test_sierra_user",
+    "SIERRA_DB_PASSWORD": "test_sierra_password",
+    "REDSHIFT_DB_NAME": "test_redshift_name",
+    "REDSHIFT_DB_HOST": "test_redshift_host",
+    "REDSHIFT_DB_USER": "test_redshift_user",
+    "REDSHIFT_DB_PASSWORD": "test_redshift_password",
+    "ENVISIONWARE_BATCH_SIZE": "4",
+    "MAX_BATCHES": "1",
+    "S3_BUCKET": "test_s3_bucket",
+    "S3_RESOURCE": "test_s3_resource",
+    "PC_RESERVE_SCHEMA_URL": "https://test_schema_url",
+    "KINESIS_BATCH_SIZE": "4",
+    "KINESIS_STREAM_ARN": "test_kinesis_stream",
+    "BCRYPT_SALT": "test_salt",
+}
+
+
+@pytest.fixture(scope="function", autouse=True)
+def tests_setup_and_teardown():
+    # Will be executed before each test
+    os.environ.update(TEST_ENV_VARS)
+
+    yield
+
+    # Will execute after each test
+    for os_config in TEST_ENV_VARS.keys():
+        if os_config in os.environ:
+            del os.environ[os_config]
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
deleted file mode 100644
index d64fef2..0000000
--- a/tests/test_helpers.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import os
-
-
-class TestHelpers:
-    ENV_VARS = {
-        "ENVIRONMENT": "test_environment",
-        "AWS_REGION": "test_aws_region",
-        "ENVISIONWARE_DB_PORT": "test_envisionware_port",
-        "ENVISIONWARE_DB_NAME": "test_envisionware_name",
-        "ENVISIONWARE_DB_HOST": "test_envisionware_host",
-        "ENVISIONWARE_DB_USER": "test_envisionware_user",
-        "ENVISIONWARE_DB_PASSWORD": "test_envisionware_password",
-        "SIERRA_DB_PORT": "test_sierra_port",
-        "SIERRA_DB_NAME": "test_sierra_name",
-        "SIERRA_DB_HOST": "test_sierra_host",
-        "SIERRA_DB_USER": "test_sierra_user",
-        "SIERRA_DB_PASSWORD": "test_sierra_password",
-        "REDSHIFT_DB_NAME": "test_redshift_name",
-        "REDSHIFT_DB_HOST": "test_redshift_host",
-        "REDSHIFT_DB_USER": "test_redshift_user",
-        "REDSHIFT_DB_PASSWORD": "test_redshift_password",
-        "ENVISIONWARE_BATCH_SIZE": "4",
-        "MAX_BATCHES": "1",
-        "S3_BUCKET": "test_s3_bucket",
-        "S3_RESOURCE": "test_s3_resource",
-        "PC_RESERVE_SCHEMA_URL": "https://test_schema_url",
-        "KINESIS_BATCH_SIZE": "4",
-        "KINESIS_STREAM_ARN": "test_kinesis_stream",
-        "BCRYPT_SALT": "test_salt",
-    }
-
-    @classmethod
-    def set_env_vars(cls):
-        for key, value in cls.ENV_VARS.items():
-            os.environ[key] = value
-
-    @classmethod
-    def clear_env_vars(cls):
-        for key in cls.ENV_VARS.keys():
-            if key in os.environ:
-                del os.environ[key]
diff --git a/tests/test_main.py b/tests/test_main.py
index a2a6106..9e73222 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -5,7 +5,6 @@
 from datetime import datetime
 
 from nypl_py_utils.classes.postgresql_client import PostgreSQLClientError
-from tests.test_helpers import TestHelpers
 
 _ENVISIONWARE_DATA = [
     (10000000, "barcode1", 100, datetime(2023, 1, 1, 1, 0, 0), "branch1", "area1",
@@ -23,6 +22,7 @@
     ("barcode3", 333333333333, 3, 30, "lib3"),
     ("barcode3", 300000000000, 30, 300, "lib30"),
     ("barcode4", 444444444444, 4, 40, "lib4"),
+    (None, 444444444444, None, None, None),
 ]
 
 _REDSHIFT_DATA = (["obf_id1", "zip1", "geoid1"], ["obf_id4", "zip4", None])
@@ -55,12 +55,6 @@
 
 
 class TestMain:
-    @pytest.fixture
-    def set_env_vars(cls):
-        TestHelpers.set_env_vars()
-        yield
-        TestHelpers.clear_env_vars()
-
     @pytest.fixture
     def mock_helpers(self, mocker):
         mocker.patch("main.load_env_file")
@@ -71,7 +65,7 @@ def _set_up_mock(self, package, mocker):
         mocker.patch(package, return_value=mock)
         return mock
 
-    def test_main_one_iteration(self, set_env_vars, mock_helpers, mocker):
+    def test_main_one_iteration(self, mock_helpers, mocker):
         mock_obfuscate = mocker.patch("main.obfuscate", side_effect=[
             "obf_key1", "obf_key2", "obf_key3", "obf_key4", "obf_id1",
             "obf_id2", "obf_id3", "obf_id4"])
@@ -97,7 +91,7 @@ def test_main_one_iteration(self, set_env_vars, mock_helpers, mocker):
             "main.build_sierra_query", return_value="SIERRA QUERY"
         )
         mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
-        mock_sierra_client.execute_query.side_effect = [None, _SIERRA_DATA]
+        mock_sierra_client.execute_query.return_value = _SIERRA_DATA
 
         mock_redshift_query = mocker.patch(
             "main.build_redshift_query", return_value="REDSHIFT QUERY"
@@ -106,6 +100,7 @@ def test_main_one_iteration(self, set_env_vars, mock_helpers, mocker):
         mock_redshift_client.execute_query.return_value = _REDSHIFT_DATA
 
         main.main()
+
         mock_s3_client.fetch_cache.assert_called_once()
 
         mock_envisionware_client.connect.assert_called_once()
@@ -117,10 +112,7 @@ def test_main_one_iteration(self, set_env_vars, mock_helpers, mocker):
 
         mock_sierra_client.connect.assert_called_once()
         mock_sierra_query.assert_called_once_with(
-            "'barcode1','25555000000000','barcode3','barcode4'"
-        )
-        mock_sierra_client.execute_query.assert_has_calls(
-            [mocker.call("SET statement_timeout='5min';"), mocker.call("SIERRA QUERY")]
+            "'bbarcode1','b25555000000000','bbarcode3','bbarcode4'"
         )
         mock_sierra_client.close_connection.assert_called_once()
 
@@ -152,7 +144,7 @@ def test_main_one_iteration(self, set_env_vars, mock_helpers, mocker):
         mock_s3_client.close.assert_called_once()
         mock_kinesis_client.close.assert_called_once()
 
-    def test_main_multiple_iterations(self, set_env_vars, mock_helpers, mocker):
+    def test_main_multiple_iterations(self, mock_helpers, mocker):
         del os.environ["MAX_BATCHES"]
         _TEST_ENVISIONWARE_DATA = [
             (i, "barcode{}".format(i), i, datetime(2023, i, i, i, 0, 0),
@@ -181,6 +173,7 @@ def test_main_multiple_iterations(self, set_env_vars, mock_helpers, mocker):
         ]
 
         main.main()
+
         assert mock_s3_client.fetch_cache.call_count == 2
         mock_s3_client.set_cache.assert_has_calls(
             [
@@ -197,7 +190,7 @@ def test_main_multiple_iterations(self, set_env_vars, mock_helpers, mocker):
             ]
         )
 
-    def test_main_no_envisionware_results(self, set_env_vars, mock_helpers, mocker):
+    def test_main_no_envisionware_results(self, mock_helpers, mocker):
         del os.environ["MAX_BATCHES"]
         mocker.patch("main.obfuscate")
         mocker.patch("main.build_envisionware_query")
@@ -214,6 +207,7 @@ def test_main_no_envisionware_results(self, set_env_vars, mock_helpers, mocker):
         mock_envisionware_client.execute_query.return_value = []
 
         main.main()
+
         mock_envisionware_client.close_connection.assert_called_once()
         mock_s3_client.fetch_cache.assert_called_once()
         mock_s3_client.close.assert_called_once()
@@ -225,7 +219,7 @@ def test_main_no_envisionware_results(self, set_env_vars, mock_helpers, mocker):
         mock_kinesis_client.send_records.assert_not_called()
         mock_s3_client.set_cache.assert_not_called()
 
-    def test_main_no_sierra_results(self, set_env_vars, mock_helpers, mocker):
+    def test_main_no_sierra_results(self, mock_helpers, mocker):
         _TEST_ENVISIONWARE_DATA = [
             (i, "barcode{}".format(i), i, datetime(2023, i, i, i, 0, 0),
              "branch{}".format(i), "area{}".format(i),
@@ -255,7 +249,7 @@ def test_main_no_sierra_results(self, set_env_vars, mock_helpers, mocker):
         mock_envisionware_client.execute_query.return_value = _TEST_ENVISIONWARE_DATA
 
         mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
-        mock_sierra_client.execute_query.side_effect = [None, []]
+        mock_sierra_client.execute_query.return_value = []
 
         main.main()
 
@@ -274,8 +268,7 @@ def test_main_no_sierra_results(self, set_env_vars, mock_helpers, mocker):
             ]
         )
 
-    def test_main_no_redshift_results(self, set_env_vars, mock_helpers,
-                                      mocker):
+    def test_main_no_redshift_results(self, mock_helpers, mocker):
         _TEST_ENVISIONWARE_DATA = [
             (i, "barcode{}".format(i), i, datetime(2023, i, i, i, 0, 0),
              "branch{}".format(i), "area{}".format(i),
@@ -308,7 +301,7 @@ def test_main_no_redshift_results(self, set_env_vars, mock_helpers,
         mock_envisionware_client.execute_query.return_value = _TEST_ENVISIONWARE_DATA
 
         mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
-        mock_sierra_client.execute_query.side_effect = [None, _TEST_SIERRA_DATA]
+        mock_sierra_client.execute_query.return_value = _TEST_SIERRA_DATA
 
         mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)
         mock_redshift_client.execute_query.return_value = []
@@ -328,63 +321,3 @@ def test_main_no_redshift_results(self, set_env_vars, mock_helpers,
                 mocker.call("5"),
             ]
         )
-
-    def test_main_sierra_query_retry(self, set_env_vars, mocker, caplog):
-        mocker.patch("main.build_envisionware_query")
-        mocker.patch("main.build_sierra_query")
-        mocker.patch("main.build_redshift_query")
-        mocker.patch("main.load_env_file")
-        mocker.patch("main.obfuscate", return_value="obfuscated")
-        mocker.patch("main.AvroEncoder")
-        mocker.patch("main.KinesisClient")
-        mocker.patch("main.RedshiftClient")
-        mocker.patch("main.S3Client")
-
-        mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
-        mock_envisionware_client.execute_query.return_value = _ENVISIONWARE_DATA
-
-        mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
-        mock_sierra_client.execute_query.side_effect = [
-            None,
-            PostgreSQLClientError("test error"),
-            PostgreSQLClientError("test error"),
-            [],
-        ]
-
-        with caplog.at_level(logging.ERROR):
-            main.main()
-
-        mock_sierra_client.connect.assert_called_once()
-        assert mock_sierra_client.execute_query.call_count == 4
-        assert caplog.text == ""
-        mock_sierra_client.close_connection.assert_called_once()
-
-    def test_main_sierra_query_timeout(self, set_env_vars, mocker, caplog):
-        mocker.patch("main.build_envisionware_query")
-        mocker.patch("main.build_sierra_query")
-        mocker.patch("main.build_redshift_query")
-        mocker.patch("main.load_env_file")
-        mocker.patch("main.obfuscate", return_value="obfuscated")
-        mocker.patch("main.AvroEncoder")
-        mocker.patch("main.RedshiftClient")
-
-        mock_kinesis_client = self._set_up_mock("main.KinesisClient", mocker)
-        mock_s3_client = self._set_up_mock("main.S3Client", mocker)
-
-        mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
-        mock_envisionware_client.execute_query.return_value = _ENVISIONWARE_DATA
-
-        mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
-        mock_sierra_client.execute_query.side_effect = [None] + (
-            [PostgreSQLClientError("test error")] * 5
-        )
-
-        with pytest.raises(PostgreSQLClientError), caplog.at_level(logging.ERROR):
-            main.main()
-
-        mock_sierra_client.connect.assert_called_once()
-        assert mock_sierra_client.execute_query.call_count == 6
-        assert "Error executing Sierra query" in caplog.text
-        mock_sierra_client.close_connection.assert_called_once()
-        mock_kinesis_client.close.assert_called_once()
-        mock_s3_client.close.assert_called_once()

From 2c0eaf5a08a5fcea42ce101947222f645bd02023 Mon Sep 17 00:00:00 2001
From: aaronfriedman
Date: Tue, 5 Nov 2024 15:22:17 -0500
Subject: [PATCH 2/2] Simplify query

---
 helpers/query_helper.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/helpers/query_helper.py b/helpers/query_helper.py
index 45ded64..ab85acf 100644
--- a/helpers/query_helper.py
+++ b/helpers/query_helper.py
@@ -10,23 +10,17 @@
     LIMIT {limit};"""
 
 _SIERRA_QUERY = """
-    WITH phrase_barcodes AS (
-        SELECT record_id, index_entry
-        FROM sierra_view.phrase_entry
-        WHERE index_tag = 'b' AND record_id IN (
-            SELECT record_id
-            FROM sierra_view.phrase_entry
-            WHERE index_tag || index_entry IN ({})
-        )
-    )
     SELECT
         barcode, id, ptype_code, pcode3,
         CASE WHEN LENGTH(TRIM(home_library_code)) = 0
             OR TRIM(home_library_code) = 'none' THEN NULL
             ELSE TRIM(home_library_code) END
-    FROM phrase_barcodes INNER JOIN sierra_view.patron_view
-        ON phrase_barcodes.record_id = patron_view.id
-        AND phrase_barcodes.index_entry = patron_view.barcode;"""
+    FROM sierra_view.patron_view
+    WHERE id IN (
+        SELECT record_id
+        FROM sierra_view.phrase_entry
+        WHERE index_tag || index_entry IN ({})
+    );"""
 
 _REDSHIFT_QUERY = """
     SELECT patron_id, postal_code, geoid
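
Illustration only (not part of either patch): a minimal sketch of how main.py, as modified above, is expected to drive the rewritten query. It assumes build_sierra_query() interpolates its argument into the IN ({}) placeholder of _SIERRA_QUERY, in line with the other query builders in helpers/query_helper.py. Barcodes live in sierra_view.phrase_entry under index_tag 'b', so each barcode is prefixed with 'b' before quoting so that it matches index_tag || index_entry.

    from helpers.query_helper import build_sierra_query

    # Hypothetical barcodes pulled from one Envisionware batch.
    barcodes = ["barcode1", "25555000000000"]

    # Prefix each barcode with 'b' and quote it for the SQL IN clause,
    # mirroring the new barcodes_str construction in main.py.
    barcodes_str = "'b" + "','b".join(barcodes) + "'"
    assert barcodes_str == "'bbarcode1','b25555000000000'"

    # The rendered statement selects patron rows from sierra_view.patron_view
    # whose ids appear in the phrase_entry subquery for those prefixed barcodes.
    print(build_sierra_query(barcodes_str))

This single indexed phrase_entry lookup is what the changelog describes as the more efficient replacement for scanning patron_view.barcode directly.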