Commit: Merge branch 'qa' into production

aaronfriedman6 committed Nov 14, 2024
2 parents 7daec9a + 63eee82 · commit c1340a2
Showing 15 changed files with 109 additions and 201 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/run-tests.yml
@@ -17,16 +17,17 @@ jobs:
       - name: Checkout repo
         uses: actions/checkout@v4
 
-      - name: Set up Python 3.9
-        uses: actions/setup-python@v4
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: '3.12'
           cache: 'pip'
 
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
+          pip install -r devel_requirements.txt
       - name: Run linter and test suite
         run: |
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
+3.12.0
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,14 @@
+## 2024-11-05 -- v2.0.0
+### Added
+- Rewrite Sierra barcode --> patron_id query to use more efficient phrase_entry table
+- Delete Sierra query retry logic
+- Increase batch size to 10k
+
+### Fixed
+- Upgrade from Python 3.9 to Python 3.12
+- Upgrade to `nypl-py-utils` v1.4.0
+- Add .python-version, devel_requirements, and conftest files
+
 ## 2024-06-21 -- v1.1.0
 ### Fixed
 - Handle patron_home_library_code in standardized way (convert empty strings and 'none' to NULL)
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.9
+FROM python:3.12
 ADD . /src
 WORKDIR /src
 
2 changes: 0 additions & 2 deletions README.md
@@ -64,8 +64,6 @@ The first 11 unencrypted variables (every variable through `KINESIS_BATCH_SIZE`)
 | `S3_RESOURCE` | Name of the resource for the S3 cache. This differs between QA and prod and should be empty when not using the cache locally. |
 | `PC_RESERVE_SCHEMA_URL` | Platform API endpoint from which to retrieve the PcReserve Avro schema |
 | `KINESIS_BATCH_SIZE` | How many records should be sent to Kinesis at once. Kinesis supports up to 500 records per batch. |
-| `SIERRA_TIMEOUT` (optional) | Number of minutes a Sierra query is allowed to run before timing out. This should be `5` or fewer in `qa` and `production` to prevent an ECS bug where a statement timeout error isn't propagated and the connection hangs. Set to `5` by default. |
-| `MAX_SIERRA_ATTEMPTS` (optional) | Maximum number of attempts to try querying Sierra before erroring out. Hence, the maximum total Sierra query time will be `SIERRA_TIMEOUT` times `MAX_SIERRA_ATTEMPTS` minutes. Set to `10` by default. |
 | `LOG_LEVEL` (optional) | What level of logs should be output. Set to `info` by default. |
 | `MAX_BATCHES` (optional) | The maximum number of times the poller should poll Envisionware per session. If this is not set, the poller will continue querying until all new records in Envisionware have been processed. |
 | `IGNORE_CACHE` (optional) | Whether fetching and setting the state from S3 should not be done. If this is true, the `PCR_DATE_TIME` and `PCR_KEY` environment variables will be used for the initial state (or `2023-01-01 00:00:00 +0000` and `0` by default). |
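Of the settings above, `KINESIS_BATCH_SIZE` is the one with a hard platform limit: Kinesis accepts at most 500 records per request. A minimal, self-contained sketch of batching under that cap — the chunking helper and stand-in records are illustrative assumptions, not this repo's code:

import os

def batches(records, size):
    """Yield successive slices of at most `size` records."""
    for i in range(0, len(records), size):
        yield records[i : i + size]

# KINESIS_BATCH_SIZE is documented above; 500 is Kinesis's per-request cap.
batch_size = min(int(os.environ.get("KINESIS_BATCH_SIZE", "500")), 500)

records = [f"record-{n}".encode() for n in range(1200)]  # stand-in payloads
for batch in batches(records, batch_size):
    # The real poller would push each batch to Kinesis here; we just count.
    print(f"would send {len(batch)} records")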
2 changes: 0 additions & 2 deletions config/devel.yaml
@@ -11,8 +11,6 @@ PLAINTEXT_VARIABLES:
   S3_RESOURCE:
   PC_RESERVE_SCHEMA_URL: https://qa-platform.nypl.org/api/v0.1/current-schemas/PcReserve
   KINESIS_BATCH_SIZE: 20
-  SIERRA_TIMEOUT: 10
-  MAX_SIERRA_ATTEMPTS: 3
   LOG_LEVEL: info
   MAX_BATCHES: 3
   IGNORE_CACHE: True
4 changes: 1 addition & 3 deletions config/production.yaml
@@ -6,13 +6,11 @@ PLAINTEXT_VARIABLES:
   SIERRA_DB_PORT: 1032
   SIERRA_DB_NAME: iii
   REDSHIFT_DB_NAME: production
-  ENVISIONWARE_BATCH_SIZE: 500
+  ENVISIONWARE_BATCH_SIZE: 10000
   S3_BUCKET: bic-poller-states-production
   S3_RESOURCE: pc_reserve_poller_state.json
   PC_RESERVE_SCHEMA_URL: https://platform.nypl.org/api/v0.1/current-schemas/PcReserve
   KINESIS_BATCH_SIZE: 500
-  SIERRA_TIMEOUT: 5
-  MAX_SIERRA_ATTEMPTS: 10
 ENCRYPTED_VARIABLES:
   ENVISIONWARE_DB_HOST: AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAGwwagYJKoZIhvcNAQcGoF0wWwIBADBWBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDAuBv/BphGCF+7eZXQIBEIApcoNy31Pd5JJDeeAmSKF8bGWMOZPLx9GgSMNxY1DTab6tICcqGK+onys=
   ENVISIONWARE_DB_USER: AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAGcwZQYJKoZIhvcNAQcGoFgwVgIBADBRBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDBYJiqjEoMyidrOt4QIBEIAk8jOvhrYElM4VzBEZ4pGnO7Dhr7R04mEU5y6mTqe+jUHEZMe9
4 changes: 1 addition & 3 deletions config/qa.yaml
@@ -6,13 +6,11 @@ PLAINTEXT_VARIABLES:
   SIERRA_DB_PORT: 1032
   SIERRA_DB_NAME: iii
   REDSHIFT_DB_NAME: qa
-  ENVISIONWARE_BATCH_SIZE: 500
+  ENVISIONWARE_BATCH_SIZE: 10000
   S3_BUCKET: bic-poller-states-qa
   S3_RESOURCE: pc_reserve_poller_state.json
   PC_RESERVE_SCHEMA_URL: https://qa-platform.nypl.org/api/v0.1/current-schemas/PcReserve
   KINESIS_BATCH_SIZE: 500
-  SIERRA_TIMEOUT: 5
-  MAX_SIERRA_ATTEMPTS: 10
 ENCRYPTED_VARIABLES:
   ENVISIONWARE_DB_HOST: AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAGwwagYJKoZIhvcNAQcGoF0wWwIBADBWBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDAuBv/BphGCF+7eZXQIBEIApcoNy31Pd5JJDeeAmSKF8bGWMOZPLx9GgSMNxY1DTab6tICcqGK+onys=
   ENVISIONWARE_DB_USER: AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAGcwZQYJKoZIhvcNAQcGoFgwVgIBADBRBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDBYJiqjEoMyidrOt4QIBEIAk8jOvhrYElM4VzBEZ4pGnO7Dhr7R04mEU5y6mTqe+jUHEZMe9
3 changes: 3 additions & 0 deletions devel_requirements.txt
@@ -0,0 +1,3 @@
+black
+pytest
+pytest-mock
8 changes: 6 additions & 2 deletions helpers/query_helper.py
@@ -10,13 +10,17 @@
     LIMIT {limit};"""
 
 _SIERRA_QUERY = """
-    SELECT
+    SELECT
        barcode, id, ptype_code, pcode3,
        CASE WHEN LENGTH(TRIM(home_library_code)) = 0
            OR TRIM(home_library_code) = 'none' THEN NULL
            ELSE TRIM(home_library_code) END
    FROM sierra_view.patron_view
-    WHERE barcode IN ({});"""
+    WHERE id IN (
+        SELECT record_id
+        FROM sierra_view.phrase_entry
+        WHERE index_tag || index_entry IN ({})
+    );"""
 
 _REDSHIFT_QUERY = """
     SELECT patron_id, postal_code, geoid
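The `{}` placeholder above is filled by main.py (further down this diff), which prefixes each barcode with "b" so the values match phrase_entry's `index_tag || index_entry` concatenation — "b" appears to be the index tag Sierra assigns to barcode entries. A runnable sketch under that assumption, with made-up barcodes and the home_library_code CASE expression omitted:

# Sketch of how the rewritten query is parameterized; barcodes are hypothetical.
sierra_query = """
    SELECT barcode, id, ptype_code, pcode3
    FROM sierra_view.patron_view
    WHERE id IN (
        SELECT record_id
        FROM sierra_view.phrase_entry
        WHERE index_tag || index_entry IN ({})
    );"""

barcodes = ["23333123456789", "23333987654321"]
barcodes_str = "'b" + "','b".join(barcodes) + "'"
print(sierra_query.format(barcodes_str))
# ... WHERE index_tag || index_entry IN ('b23333123456789','b23333987654321')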
82 changes: 23 additions & 59 deletions main.py
@@ -1,5 +1,4 @@
 import json
-import logging
 import os
 import pandas as pd

@@ -9,13 +8,10 @@
     build_redshift_query,
     build_sierra_query,
 )
-from nypl_py_utils.classes.avro_encoder import AvroEncoder
+from nypl_py_utils.classes.avro_client import AvroEncoder
 from nypl_py_utils.classes.kinesis_client import KinesisClient
 from nypl_py_utils.classes.mysql_client import MySQLClient
-from nypl_py_utils.classes.postgresql_client import (
-    PostgreSQLClient,
-    PostgreSQLClientError,
-)
+from nypl_py_utils.classes.postgresql_client import PostgreSQLClient
 from nypl_py_utils.classes.redshift_client import RedshiftClient
 from nypl_py_utils.classes.s3_client import S3Client
 from nypl_py_utils.functions.config_helper import load_env_file
@@ -71,8 +67,6 @@ def main():
os.environ["REDSHIFT_DB_PASSWORD"],
)

sierra_timeout = os.environ.get("SIERRA_TIMEOUT", "5")
max_sierra_attempts = int(os.environ.get("MAX_SIERRA_ATTEMPTS", "5"))
has_max_batches = "MAX_BATCHES" in os.environ
finished = False
batch_number = 1
@@ -90,7 +84,7 @@
             )
         )
         envisionware_client.close_connection()
-        if len(pc_reserve_raw_data) == 0:
+        if not pc_reserve_raw_data:
             break
         pc_reserve_df = pd.DataFrame(
             data=pc_reserve_raw_data,
@@ -104,7 +98,10 @@
"staff_override",
],
)
pc_reserve_df["key"] = pc_reserve_df["key"].astype("Int64").astype("string")
pc_reserve_df["key"] = pc_reserve_df["key"].astype("Int64")
pc_reserve_df[["key", "barcode"]] = pc_reserve_df[["key", "barcode"]].astype(
"string"
)
pc_reserve_df["transaction_et"] = pc_reserve_df["transaction_et"].dt.date

# Obfuscate key
@@ -113,39 +110,9 @@
pc_reserve_df["key"] = list(executor.map(obfuscate, pc_reserve_df["key"]))

# Query Sierra for patron info using the patron barcodes
barcodes_str = "','".join(
pc_reserve_df["barcode"].to_string(index=False).split()
)
barcodes_str = "'" + barcodes_str + "'"
sierra_query = build_sierra_query(barcodes_str)

barcodes_str = "'b" + "','b".join(pc_reserve_df["barcode"].unique()) + "'"
sierra_client.connect()
logger.info(f"Setting Sierra query timeout to {sierra_timeout} minutes")
sierra_client.execute_query(f"SET statement_timeout='{sierra_timeout}min';")
logger.info("Querying Sierra for patron information by barcode")

initial_log_level = logging.getLogger("postgresql_client").getEffectiveLevel()
logging.getLogger("postgresql_client").setLevel(logging.CRITICAL)
finished = False
num_attempts = 1
while not finished:
try:
sierra_raw_data = sierra_client.execute_query(sierra_query)
finished = True
except PostgreSQLClientError as e:
if num_attempts < max_sierra_attempts:
logger.info("Query failed -- trying again")
num_attempts += 1
else:
logger.error(
f"Error executing Sierra query {sierra_query}:" "\n{e}"
)
sierra_client.close_connection()
s3_client.close()
kinesis_client.close()
raise e from None
logging.getLogger("postgresql_client").setLevel(initial_log_level)

sierra_raw_data = sierra_client.execute_query(build_sierra_query(barcodes_str))
sierra_client.close_connection()
sierra_df = pd.DataFrame(
data=sierra_raw_data,
@@ -158,29 +125,28 @@
             ],
         )
 
-        # Some barcodes correspond to multiple patron records. For these
-        # barcodes, do not use patron information from any of the records.
+        # Some barcodes correspond to multiple patron records. For these barcodes, do
+        # not use patron information from any of the records.
         sierra_df = sierra_df[pd.notnull(sierra_df["barcode"])]
         sierra_df = sierra_df.drop_duplicates("barcode", keep=False)
         sierra_df["patron_id"] = sierra_df["patron_id"].astype("Int64").astype("string")
 
-        # Merge the dataframes, set the patron retrieval status, and obfuscate
-        # the patron_id. The patron_id is either the Sierra id or, if no Sierra
-        # id is found for the barcode, the barcode prepended with 'barcode '.
+        # Merge the dataframes, set the patron retrieval status, and obfuscate the
+        # patron_id. The patron_id is either the Sierra id or, if no Sierra id is found
+        # for the barcode, the barcode prepended with 'barcode '.
         pc_reserve_df = pc_reserve_df.merge(sierra_df, how="left", on="barcode")
         pc_reserve_df = pc_reserve_df.apply(_set_patron_retrieval_status, axis=1)
         with ThreadPoolExecutor() as executor:
             pc_reserve_df["patron_id"] = list(
                 executor.map(obfuscate, pc_reserve_df["patron_id"])
             )
 
-        # Query Redshift for the zip code and geoid using the obfuscated Sierra
-        # ids
-        sierra_ids = pc_reserve_df[pc_reserve_df["patron_retrieval_status"] == "found"][
-            "patron_id"
+        # Query Redshift for the zip code and geoid using the obfuscated Sierra ids
+        sierra_ids = pc_reserve_df.loc[
+            pc_reserve_df["patron_retrieval_status"] == "found", "patron_id"
         ]
-        if len(sierra_ids) > 0:
-            ids_str = "','".join(sierra_ids.to_string(index=False).split())
-            ids_str = "'" + ids_str + "'"
+        if not sierra_ids.empty:
+            ids_str = "'" + "','".join(sierra_ids.unique()) + "'"
             redshift_table = "patron_info"
             if os.environ["REDSHIFT_DB_NAME"] != "production":
                 redshift_table += "_" + os.environ["REDSHIFT_DB_NAME"]
@@ -248,7 +214,8 @@ def main():


 def _set_patron_retrieval_status(row):
-    if not pd.isnull(row["patron_id"]):
+    """Sets a barcode's Sierra retrieval status"""
+    if pd.notnull(row["patron_id"]):
         row["patron_retrieval_status"] = "found"
     elif row["barcode"].startswith("25555"):
         row["patron_retrieval_status"] = "guest_pass"
@@ -260,10 +227,7 @@ def _set_patron_retrieval_status(row):


 def _get_poller_state(s3_client, poller_state, batch_number):
-    """
-    Retrieves the poller state from the S3 cache, the config, or the local
-    memory
-    """
+    """Retrieves the poller state from the S3 cache, the config, or the local memory"""
     if os.environ.get("IGNORE_CACHE", False) != "True":
         return s3_client.fetch_cache()
     elif batch_number == 1:
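The tail of `_get_poller_state` is cut off in this view. Based on the README's description of `IGNORE_CACHE`, `PCR_DATE_TIME`, and `PCR_KEY`, the cache-bypass branches plausibly behave like the following sketch — the fallback key names and defaults are assumptions drawn from the README, not code shown in this diff:

import os

def _get_poller_state_sketch(s3_client, poller_state, batch_number):
    """Poller state comes from the S3 cache, then the config, then local memory."""
    if os.environ.get("IGNORE_CACHE", False) != "True":
        return s3_client.fetch_cache()
    elif batch_number == 1:
        # First batch with the cache ignored: seed state from env vars, falling
        # back to the defaults documented in the README (assumed key names).
        return {
            "PCR_DATE_TIME": os.environ.get("PCR_DATE_TIME", "2023-01-01 00:00:00 +0000"),
            "PCR_KEY": os.environ.get("PCR_KEY", "0"),
        }
    else:
        # Later batches reuse the state carried over in local memory.
        return poller_state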
7 changes: 2 additions & 5 deletions requirements.txt
@@ -1,5 +1,2 @@
-black
-nypl-py-utils[avro-encoder,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper]==1.1.5
-pandas
-pytest
-pytest-mock
+nypl-py-utils[avro-client,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper]==1.4.0
+pandas
43 changes: 43 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,43 @@
+import os
+import pytest
+
+# Sets OS vars for entire set of tests
+TEST_ENV_VARS = {
+    "ENVIRONMENT": "test_environment",
+    "AWS_REGION": "test_aws_region",
+    "ENVISIONWARE_DB_PORT": "test_envisionware_port",
+    "ENVISIONWARE_DB_NAME": "test_envisionware_name",
+    "ENVISIONWARE_DB_HOST": "test_envisionware_host",
+    "ENVISIONWARE_DB_USER": "test_envisionware_user",
+    "ENVISIONWARE_DB_PASSWORD": "test_envisionware_password",
+    "SIERRA_DB_PORT": "test_sierra_port",
+    "SIERRA_DB_NAME": "test_sierra_name",
+    "SIERRA_DB_HOST": "test_sierra_host",
+    "SIERRA_DB_USER": "test_sierra_user",
+    "SIERRA_DB_PASSWORD": "test_sierra_password",
+    "REDSHIFT_DB_NAME": "test_redshift_name",
+    "REDSHIFT_DB_HOST": "test_redshift_host",
+    "REDSHIFT_DB_USER": "test_redshift_user",
+    "REDSHIFT_DB_PASSWORD": "test_redshift_password",
+    "ENVISIONWARE_BATCH_SIZE": "4",
+    "MAX_BATCHES": "1",
+    "S3_BUCKET": "test_s3_bucket",
+    "S3_RESOURCE": "test_s3_resource",
+    "PC_RESERVE_SCHEMA_URL": "https://test_schema_url",
+    "KINESIS_BATCH_SIZE": "4",
+    "KINESIS_STREAM_ARN": "test_kinesis_stream",
+    "BCRYPT_SALT": "test_salt",
+}
+
+
+@pytest.fixture(scope="function", autouse=True)
+def tests_setup_and_teardown():
+    # Will be executed before each test
+    os.environ.update(TEST_ENV_VARS)
+
+    yield
+
+    # Will execute after each test
+    for os_config in TEST_ENV_VARS.keys():
+        if os_config in os.environ:
+            del os.environ[os_config]
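Because the fixture above is autouse, every test in the suite sees these variables without naming the fixture as a parameter. A minimal illustration (a hypothetical test, not part of this commit):

import os

def test_env_vars_are_set():
    # tests_setup_and_teardown has already run: autouse fixtures apply to all
    # tests in scope, and the teardown after `yield` removes the variables.
    assert os.environ["REDSHIFT_DB_NAME"] == "test_redshift_name"
    assert os.environ["KINESIS_BATCH_SIZE"] == "4"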
41 changes: 0 additions & 41 deletions tests/test_helpers.py

This file was deleted.
