-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use updated barcode to patron_id query #33
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
3.12.0 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
FROM python:3.9 | ||
FROM python:3.12 | ||
ADD . /src | ||
WORKDIR /src | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
black | ||
pytest | ||
pytest-mock |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,4 @@ | ||
import json | ||
import logging | ||
import os | ||
import pandas as pd | ||
|
||
|
@@ -9,13 +8,10 @@ | |
build_redshift_query, | ||
build_sierra_query, | ||
) | ||
from nypl_py_utils.classes.avro_encoder import AvroEncoder | ||
from nypl_py_utils.classes.avro_client import AvroEncoder | ||
from nypl_py_utils.classes.kinesis_client import KinesisClient | ||
from nypl_py_utils.classes.mysql_client import MySQLClient | ||
from nypl_py_utils.classes.postgresql_client import ( | ||
PostgreSQLClient, | ||
PostgreSQLClientError, | ||
) | ||
from nypl_py_utils.classes.postgresql_client import PostgreSQLClient | ||
from nypl_py_utils.classes.redshift_client import RedshiftClient | ||
from nypl_py_utils.classes.s3_client import S3Client | ||
from nypl_py_utils.functions.config_helper import load_env_file | ||
|
@@ -71,8 +67,6 @@ def main(): | |
os.environ["REDSHIFT_DB_PASSWORD"], | ||
) | ||
|
||
sierra_timeout = os.environ.get("SIERRA_TIMEOUT", "5") | ||
max_sierra_attempts = int(os.environ.get("MAX_SIERRA_ATTEMPTS", "5")) | ||
has_max_batches = "MAX_BATCHES" in os.environ | ||
finished = False | ||
batch_number = 1 | ||
|
@@ -90,7 +84,7 @@ def main(): | |
) | ||
) | ||
envisionware_client.close_connection() | ||
if len(pc_reserve_raw_data) == 0: | ||
if not pc_reserve_raw_data: | ||
break | ||
pc_reserve_df = pd.DataFrame( | ||
data=pc_reserve_raw_data, | ||
|
@@ -104,7 +98,10 @@ def main(): | |
"staff_override", | ||
], | ||
) | ||
pc_reserve_df["key"] = pc_reserve_df["key"].astype("Int64").astype("string") | ||
pc_reserve_df["key"] = pc_reserve_df["key"].astype("Int64") | ||
pc_reserve_df[["key", "barcode"]] = pc_reserve_df[["key", "barcode"]].astype( | ||
"string" | ||
) | ||
pc_reserve_df["transaction_et"] = pc_reserve_df["transaction_et"].dt.date | ||
|
||
# Obfuscate key | ||
|
@@ -113,39 +110,9 @@ def main(): | |
pc_reserve_df["key"] = list(executor.map(obfuscate, pc_reserve_df["key"])) | ||
|
||
# Query Sierra for patron info using the patron barcodes | ||
barcodes_str = "','".join( | ||
pc_reserve_df["barcode"].to_string(index=False).split() | ||
) | ||
barcodes_str = "'" + barcodes_str + "'" | ||
sierra_query = build_sierra_query(barcodes_str) | ||
|
||
barcodes_str = "'b" + "','b".join(pc_reserve_df["barcode"].unique()) + "'" | ||
sierra_client.connect() | ||
logger.info(f"Setting Sierra query timeout to {sierra_timeout} minutes") | ||
sierra_client.execute_query(f"SET statement_timeout='{sierra_timeout}min';") | ||
logger.info("Querying Sierra for patron information by barcode") | ||
|
||
initial_log_level = logging.getLogger("postgresql_client").getEffectiveLevel() | ||
logging.getLogger("postgresql_client").setLevel(logging.CRITICAL) | ||
finished = False | ||
num_attempts = 1 | ||
while not finished: | ||
try: | ||
sierra_raw_data = sierra_client.execute_query(sierra_query) | ||
finished = True | ||
except PostgreSQLClientError as e: | ||
if num_attempts < max_sierra_attempts: | ||
logger.info("Query failed -- trying again") | ||
num_attempts += 1 | ||
else: | ||
logger.error( | ||
f"Error executing Sierra query {sierra_query}:" "\n{e}" | ||
) | ||
sierra_client.close_connection() | ||
s3_client.close() | ||
kinesis_client.close() | ||
raise e from None | ||
logging.getLogger("postgresql_client").setLevel(initial_log_level) | ||
|
||
Comment on lines
-123
to
-148
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just for my own understanding -- you're getting rid of this because of this story right? Just want to double check There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No it's part of this one. The reason we were originally retrying is because the query would time out and fail a bunch. With this new query we don't expect that to happen. |
||
sierra_raw_data = sierra_client.execute_query(build_sierra_query(barcodes_str)) | ||
sierra_client.close_connection() | ||
sierra_df = pd.DataFrame( | ||
data=sierra_raw_data, | ||
|
@@ -158,29 +125,28 @@ def main(): | |
], | ||
) | ||
|
||
# Some barcodes correspond to multiple patron records. For these | ||
# barcodes, do not use patron information from any of the records. | ||
# Some barcodes correspond to multiple patron records. For these barcodes, do | ||
# not use patron information from any of the records. | ||
sierra_df = sierra_df[pd.notnull(sierra_df["barcode"])] | ||
sierra_df = sierra_df.drop_duplicates("barcode", keep=False) | ||
sierra_df["patron_id"] = sierra_df["patron_id"].astype("Int64").astype("string") | ||
|
||
# Merge the dataframes, set the patron retrieval status, and obfuscate | ||
# the patron_id. The patron_id is either the Sierra id or, if no Sierra | ||
# id is found for the barcode, the barcode prepended with 'barcode '. | ||
# Merge the dataframes, set the patron retrieval status, and obfuscate the | ||
# patron_id. The patron_id is either the Sierra id or, if no Sierra id is found | ||
# for the barcode, the barcode prepended with 'barcode '. | ||
pc_reserve_df = pc_reserve_df.merge(sierra_df, how="left", on="barcode") | ||
pc_reserve_df = pc_reserve_df.apply(_set_patron_retrieval_status, axis=1) | ||
with ThreadPoolExecutor() as executor: | ||
pc_reserve_df["patron_id"] = list( | ||
executor.map(obfuscate, pc_reserve_df["patron_id"]) | ||
) | ||
|
||
# Query Redshift for the zip code and geoid using the obfuscated Sierra | ||
# ids | ||
sierra_ids = pc_reserve_df[pc_reserve_df["patron_retrieval_status"] == "found"][ | ||
"patron_id" | ||
# Query Redshift for the zip code and geoid using the obfuscated Sierra ids | ||
sierra_ids = pc_reserve_df.loc[ | ||
pc_reserve_df["patron_retrieval_status"] == "found", "patron_id" | ||
] | ||
if len(sierra_ids) > 0: | ||
ids_str = "','".join(sierra_ids.to_string(index=False).split()) | ||
ids_str = "'" + ids_str + "'" | ||
if not sierra_ids.empty: | ||
ids_str = "'" + "','".join(sierra_ids.unique()) + "'" | ||
redshift_table = "patron_info" | ||
if os.environ["REDSHIFT_DB_NAME"] != "production": | ||
redshift_table += "_" + os.environ["REDSHIFT_DB_NAME"] | ||
|
@@ -248,7 +214,8 @@ def main(): | |
|
||
|
||
def _set_patron_retrieval_status(row): | ||
if not pd.isnull(row["patron_id"]): | ||
"""Sets a barcode's Sierra retrieval status""" | ||
if pd.notnull(row["patron_id"]): | ||
row["patron_retrieval_status"] = "found" | ||
elif row["barcode"].startswith("25555"): | ||
row["patron_retrieval_status"] = "guest_pass" | ||
|
@@ -260,10 +227,7 @@ def _set_patron_retrieval_status(row): | |
|
||
|
||
def _get_poller_state(s3_client, poller_state, batch_number): | ||
""" | ||
Retrieves the poller state from the S3 cache, the config, or the local | ||
memory | ||
""" | ||
"""Retrieves the poller state from the S3 cache, the config, or the local memory""" | ||
if os.environ.get("IGNORE_CACHE", False) != "True": | ||
return s3_client.fetch_cache() | ||
elif batch_number == 1: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,2 @@ | ||
black | ||
nypl-py-utils[avro-encoder,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper]==1.1.5 | ||
pandas | ||
pytest | ||
pytest-mock | ||
nypl-py-utils[avro-client,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper]==1.4.0 | ||
pandas |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import os | ||
import pytest | ||
|
||
# Sets OS vars for entire set of tests | ||
TEST_ENV_VARS = { | ||
"ENVIRONMENT": "test_environment", | ||
"AWS_REGION": "test_aws_region", | ||
"ENVISIONWARE_DB_PORT": "test_envisionware_port", | ||
"ENVISIONWARE_DB_NAME": "test_envisionware_name", | ||
"ENVISIONWARE_DB_HOST": "test_envisionware_host", | ||
"ENVISIONWARE_DB_USER": "test_envisionware_user", | ||
"ENVISIONWARE_DB_PASSWORD": "test_envisionware_password", | ||
"SIERRA_DB_PORT": "test_sierra_port", | ||
"SIERRA_DB_NAME": "test_sierra_name", | ||
"SIERRA_DB_HOST": "test_sierra_host", | ||
"SIERRA_DB_USER": "test_sierra_user", | ||
"SIERRA_DB_PASSWORD": "test_sierra_password", | ||
"REDSHIFT_DB_NAME": "test_redshift_name", | ||
"REDSHIFT_DB_HOST": "test_redshift_host", | ||
"REDSHIFT_DB_USER": "test_redshift_user", | ||
"REDSHIFT_DB_PASSWORD": "test_redshift_password", | ||
"ENVISIONWARE_BATCH_SIZE": "4", | ||
"MAX_BATCHES": "1", | ||
"S3_BUCKET": "test_s3_bucket", | ||
"S3_RESOURCE": "test_s3_resource", | ||
"PC_RESERVE_SCHEMA_URL": "https://test_schema_url", | ||
"KINESIS_BATCH_SIZE": "4", | ||
"KINESIS_STREAM_ARN": "test_kinesis_stream", | ||
"BCRYPT_SALT": "test_salt", | ||
} | ||
|
||
|
||
@pytest.fixture(scope="function", autouse=True) | ||
def tests_setup_and_teardown(): | ||
# Will be executed before each test | ||
os.environ.update(TEST_ENV_VARS) | ||
|
||
yield | ||
|
||
# Will execute after each test | ||
for os_config in TEST_ENV_VARS.keys(): | ||
if os_config in os.environ: | ||
del os.environ[os_config] |
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Last question -- any specific reason this was upped? Or is it just to improve performance?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that the query is more efficient we can send more barcodes at once!