Skip to content

Commit

Permalink
Merge branch 'main' into qa
Browse files (browse the repository at this point in the history)
  • Loading branch information
aaronfriedman6 committed Nov 25, 2024
2 parents 63eee82 + 0872b09 commit 7362714
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 181 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 2024-11-22 -- v2.0.1
### Added
- Use nypl-py-utils patron_data_helper methods to get Sierra and Redshift patron info

## 2024-11-05 -- v2.0.0
### Added
- Rewrite Sierra barcode --> patron_id query to use the more efficient phrase_entry table
Expand Down
26 changes: 0 additions & 26 deletions helpers/query_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,8 @@
ORDER BY pcrDateTime, pcrKey
LIMIT {limit};"""

# Sierra patron lookup template. Selects barcode, id, ptype_code, pcode3 and a
# normalized home_library_code (blank or 'none' values become NULL) from
# sierra_view.patron_view for every record whose phrase_entry
# index_tag || index_entry value is in the supplied list.
# The single positional placeholder is filled via str.format, so the value is
# interpolated directly into the SQL text; it must already be a quoted,
# comma-separated list built from trusted values.
_SIERRA_QUERY = """
SELECT
barcode, id, ptype_code, pcode3,
CASE WHEN LENGTH(TRIM(home_library_code)) = 0
OR TRIM(home_library_code) = 'none' THEN NULL
ELSE TRIM(home_library_code) END
FROM sierra_view.patron_view
WHERE id IN (
SELECT record_id
FROM sierra_view.phrase_entry
WHERE index_tag || index_entry IN ({})
);"""

# Redshift patron lookup template. Selects patron_id, postal_code and geoid
# from the table named by {table} for the patron ids listed in {ids}.
# Both fields are interpolated directly into the SQL text via str.format, so
# {ids} must already be a quoted, comma-separated list built from trusted values.
_REDSHIFT_QUERY = """
SELECT patron_id, postal_code, geoid
FROM {table}
WHERE patron_id IN ({ids});"""


def build_envisionware_query(date_time, key):
    """Return the Envisionware query text for one batch.

    ``date_time`` and ``key`` are passed to the template as the ``date_time``
    and ``key`` format fields. The ``limit`` field is read from the
    ``ENVISIONWARE_BATCH_SIZE`` environment variable.

    Raises:
        KeyError: if ``ENVISIONWARE_BATCH_SIZE`` is not set.
    """
    batch_size = os.environ["ENVISIONWARE_BATCH_SIZE"]
    return _ENVISIONWARE_QUERY.format(
        date_time=date_time,
        key=key,
        limit=batch_size,
    )


def build_sierra_query(barcodes):
    """Return the Sierra patron query with ``barcodes`` substituted in.

    ``barcodes`` fills the template's single positional placeholder as-is, so
    it must already be formatted as the contents of a SQL ``IN (...)`` list.
    """
    sierra_query = _SIERRA_QUERY.format(barcodes)
    return sierra_query


def build_redshift_query(table, ids):
    """Return the Redshift patron query for the given table and id list.

    ``table`` fills the ``{table}`` field and ``ids`` fills the ``{ids}``
    field of the template. Both are inserted as-is.
    """
    substitutions = {"table": table, "ids": ids}
    return _REDSHIFT_QUERY.format(**substitutions)
68 changes: 17 additions & 51 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@
import pandas as pd

from concurrent.futures import ThreadPoolExecutor
from helpers.query_helper import (
build_envisionware_query,
build_redshift_query,
build_sierra_query,
)
from helpers.query_helper import build_envisionware_query
from nypl_py_utils.classes.avro_client import AvroEncoder
from nypl_py_utils.classes.kinesis_client import KinesisClient
from nypl_py_utils.classes.mysql_client import MySQLClient
Expand All @@ -17,7 +13,10 @@
from nypl_py_utils.functions.config_helper import load_env_file
from nypl_py_utils.functions.log_helper import create_log
from nypl_py_utils.functions.obfuscation_helper import obfuscate

from nypl_py_utils.functions.patron_data_helper import (
get_redshift_patron_data,
get_sierra_patron_data_from_barcodes,
)

_DTYPE_MAP = {
"patron_id": "string",
Expand Down Expand Up @@ -109,60 +108,27 @@ def main():
with ThreadPoolExecutor() as executor:
pc_reserve_df["key"] = list(executor.map(obfuscate, pc_reserve_df["key"]))

# Query Sierra for patron info using the patron barcodes
barcodes_str = "'b" + "','b".join(pc_reserve_df["barcode"].unique()) + "'"
sierra_client.connect()
sierra_raw_data = sierra_client.execute_query(build_sierra_query(barcodes_str))
sierra_client.close_connection()
sierra_df = pd.DataFrame(
data=sierra_raw_data,
columns=[
"barcode",
"patron_id",
"ptype_code",
"pcode3",
"patron_home_library_code",
],
# Get patron info from Sierra, set the patron retrieval status, and obfuscate
# the patron_id. The patron_id is either the Sierra id or, if no Sierra id is
# found for the barcode, the barcode prepended with 'barcode '.
sierra_df = get_sierra_patron_data_from_barcodes(
sierra_client, pc_reserve_df["barcode"]
)

# Some barcodes correspond to multiple patron records. For these barcodes, do
# not use patron information from any of the records.
sierra_df = sierra_df[pd.notnull(sierra_df["barcode"])]
sierra_df = sierra_df.drop_duplicates("barcode", keep=False)
sierra_df["patron_id"] = sierra_df["patron_id"].astype("Int64").astype("string")

# Merge the dataframes, set the patron retrieval status, and obfuscate the
# patron_id. The patron_id is either the Sierra id or, if no Sierra id is found
# for the barcode, the barcode prepended with 'barcode '.
pc_reserve_df = pc_reserve_df.merge(sierra_df, how="left", on="barcode")
pc_reserve_df = pc_reserve_df.apply(_set_patron_retrieval_status, axis=1)
with ThreadPoolExecutor() as executor:
pc_reserve_df["patron_id"] = list(
executor.map(obfuscate, pc_reserve_df["patron_id"])
)

# Query Redshift for the zip code and geoid using the obfuscated Sierra ids
sierra_ids = pc_reserve_df.loc[
pc_reserve_df["patron_retrieval_status"] == "found", "patron_id"
]
if not sierra_ids.empty:
ids_str = "'" + "','".join(sierra_ids.unique()) + "'"
redshift_table = "patron_info"
if os.environ["REDSHIFT_DB_NAME"] != "production":
redshift_table += "_" + os.environ["REDSHIFT_DB_NAME"]
redshift_client.connect()
redshift_raw_data = redshift_client.execute_query(
build_redshift_query(redshift_table, ids_str)
)
redshift_client.close_connection()
else:
logger.info("No Sierra ids found to query Redshift with")
redshift_raw_data = []
redshift_df = pd.DataFrame(
data=redshift_raw_data, columns=["patron_id", "postal_code", "geoid"]
# Get additional patron info from Redshift, merge the dataframes, and convert
# field dtypes
redshift_df = get_redshift_patron_data(
redshift_client,
pc_reserve_df.loc[
pc_reserve_df["patron_retrieval_status"] == "found", "patron_id"
],
)

# Merge the dataframes and convert necessary fields to integers
pc_reserve_df = pc_reserve_df.merge(redshift_df, how="left", on="patron_id")
pc_reserve_df = pc_reserve_df.astype(_DTYPE_MAP)

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
nypl-py-utils[avro-client,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper]==1.4.0
nypl-py-utils[avro-client,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper,patron-data-helper]==1.6.0
pandas
150 changes: 47 additions & 103 deletions tests/test_main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import logging
import main
import os
import pandas as pd
import pytest

from datetime import datetime
from nypl_py_utils.classes.postgresql_client import PostgreSQLClientError
from pandas.testing import assert_series_equal

_ENVISIONWARE_DATA = [
(10000000, "barcode1", 100, datetime(2023, 1, 1, 1, 0, 0), "branch1", "area1",
Expand All @@ -17,15 +17,16 @@
"staff_override4"),
]

_SIERRA_DATA = [
("barcode1", 111111111111, 1, 10, "lib1"),
("barcode3", 333333333333, 3, 30, "lib3"),
("barcode3", 300000000000, 30, 300, "lib30"),
("barcode4", 444444444444, 4, 40, "lib4"),
(None, 444444444444, None, None, None),
]
_SIERRA_DF = pd.DataFrame([
("111111111111", "barcode1", 1, 10, "lib1"),
("444444444444", "barcode4", 4, 40, "lib4")],
columns=["patron_id", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"]).astype({"patron_id": "string"})

_REDSHIFT_DATA = (["obf_id1", "zip1", "geoid1"], ["obf_id4", "zip4", None])
_REDSHIFT_DF = pd.DataFrame(
[["obf_id1", "zip1", "geoid1"],],
columns=["patron_id", "postal_code", "geoid"]
)

_RESULTS = [
{"patron_id": "obf_id1", "ptype_code": 1, "patron_home_library_code": "lib1",
Expand All @@ -44,7 +45,7 @@
"area": "area3", "staff_override": "staff_override3",
"patron_retrieval_status": "missing"},
{"patron_id": "obf_id4", "ptype_code": 4, "patron_home_library_code": "lib4",
"pcode3": 40, "postal_code": "zip4", "geoid": None, "key": "obf_key4",
"pcode3": 40, "postal_code": None, "geoid": None, "key": "obf_key4",
"minutes_used": 400, "transaction_et": "2023-04-04", "branch": "branch4",
"area": "area4", "staff_override": "staff_override4",
"patron_retrieval_status": "found"},
Expand All @@ -69,6 +70,10 @@ def test_main_one_iteration(self, mock_helpers, mocker):
mock_obfuscate = mocker.patch("main.obfuscate", side_effect=[
"obf_key1", "obf_key2", "obf_key3", "obf_key4",
"obf_id1", "obf_id2", "obf_id3", "obf_id4"])
mock_get_sierra_patron_data_from_barcodes = mocker.patch(
"main.get_sierra_patron_data_from_barcodes", return_value=_SIERRA_DF)
mock_get_redshift_patron_data = mocker.patch(
"main.get_redshift_patron_data", return_value=_REDSHIFT_DF)

mock_kinesis_client = self._set_up_mock("main.KinesisClient", mocker)

Expand All @@ -86,18 +91,8 @@ def test_main_one_iteration(self, mock_helpers, mocker):
)
mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
mock_envisionware_client.execute_query.return_value = _ENVISIONWARE_DATA

mock_sierra_query = mocker.patch(
"main.build_sierra_query", return_value="SIERRA QUERY"
)
mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
mock_sierra_client.execute_query.return_value = _SIERRA_DATA

mock_redshift_query = mocker.patch(
"main.build_redshift_query", return_value="REDSHIFT QUERY"
)
mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)
mock_redshift_client.execute_query.return_value = _REDSHIFT_DATA

main.main()

Expand All @@ -110,18 +105,20 @@ def test_main_one_iteration(self, mock_helpers, mocker):
)
mock_envisionware_client.close_connection.assert_called_once()

mock_sierra_client.connect.assert_called_once()
mock_sierra_query.assert_called_once_with(
"'bbarcode1','b25555000000000','bbarcode3','bbarcode4'"
)
mock_sierra_client.close_connection.assert_called_once()

mock_redshift_client.connect.assert_called_once()
mock_redshift_query.assert_called_once_with(
"patron_info_test_redshift_name", "'obf_id1','obf_id4'"
)
mock_redshift_client.execute_query.assert_called_once_with("REDSHIFT QUERY")
mock_redshift_client.close_connection.assert_called_once()
mock_get_sierra_patron_data_from_barcodes.assert_called_once()
sierra_args = mock_get_sierra_patron_data_from_barcodes.call_args[0]
assert sierra_args[0] == mock_sierra_client
assert_series_equal(
sierra_args[1],
pd.Series(["barcode1", "25555000000000", "barcode3", "barcode4"],
name="barcode", dtype="string"))

mock_get_redshift_patron_data.assert_called_once()
redshift_args = mock_get_redshift_patron_data.call_args[0]
assert redshift_args[0] == mock_redshift_client
assert_series_equal(
redshift_args[1],
pd.Series(["obf_id1", "obf_id4"], name="patron_id", index=[0, 3]))

mock_obfuscate.assert_has_calls(
[
Expand Down Expand Up @@ -152,8 +149,6 @@ def test_main_multiple_iterations(self, mock_helpers, mocker):
"staff_override{}".format(i))
for i in range(1, 7)]
mocker.patch("main.obfuscate")
mocker.patch("main.build_sierra_query")
mocker.patch("main.build_redshift_query")
mocker.patch("main.AvroEncoder")
mocker.patch("main.KinesisClient")
mocker.patch("main.PostgreSQLClient")
Expand Down Expand Up @@ -194,14 +189,15 @@ def test_main_no_envisionware_results(self, mock_helpers, mocker):
del os.environ["MAX_BATCHES"]
mocker.patch("main.obfuscate")
mocker.patch("main.build_envisionware_query")
mocker.patch("main.build_sierra_query")
mocker.patch("main.build_redshift_query")
mocker.patch("main.PostgreSQLClient")
mocker.patch("main.RedshiftClient")
mock_get_sierra_patron_data_from_barcodes = mocker.patch(
"main.get_sierra_patron_data_from_barcodes")
mock_get_redshift_patron_data = mocker.patch("main.get_redshift_patron_data")

mock_s3_client = self._set_up_mock("main.S3Client", mocker)
mock_kinesis_client = self._set_up_mock("main.KinesisClient", mocker)
mock_avro_encoder = self._set_up_mock("main.AvroEncoder", mocker)
mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)

mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
mock_envisionware_client.execute_query.return_value = []
Expand All @@ -213,13 +209,13 @@ def test_main_no_envisionware_results(self, mock_helpers, mocker):
mock_s3_client.close.assert_called_once()
mock_kinesis_client.close.assert_called_once()

mock_sierra_client.connect.assert_not_called()
mock_redshift_client.connect.assert_not_called()
mock_get_sierra_patron_data_from_barcodes.assert_not_called()
mock_get_redshift_patron_data.assert_not_called()
mock_avro_encoder.encode_batch.assert_not_called()
mock_kinesis_client.send_records.assert_not_called()
mock_s3_client.set_cache.assert_not_called()

def test_main_no_sierra_results(self, mock_helpers, mocker):
def test_main_no_sierra_redshift_results(self, mock_helpers, mocker):
_TEST_ENVISIONWARE_DATA = [
(i, "barcode{}".format(i), i, datetime(2023, i, i, i, 0, 0),
"branch{}".format(i), "area{}".format(i),
Expand All @@ -236,24 +232,26 @@ def test_main_no_sierra_results(self, mock_helpers, mocker):
for i in range(1, 5)]

mocker.patch("main.build_envisionware_query")
mocker.patch("main.build_sierra_query")
mocker.patch("main.build_redshift_query")
mocker.patch("main.KinesisClient")
mocker.patch("main.S3Client")

mock_obfuscate = mocker.patch("main.obfuscate", return_value="obfuscated")
mocker.patch(
"main.get_sierra_patron_data_from_barcodes",
return_value=pd.DataFrame(
[], columns=["patron_id", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"]))
mocker.patch(
"main.get_redshift_patron_data",
return_value=pd.DataFrame(
[], columns=["patron_id", "postal_code", "geoid"]))
mock_avro_encoder = self._set_up_mock("main.AvroEncoder", mocker)
mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)

mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
mock_envisionware_client.execute_query.return_value = _TEST_ENVISIONWARE_DATA

mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
mock_sierra_client.execute_query.return_value = []

main.main()

mock_redshift_client.connect.assert_not_called()
mock_avro_encoder.encode_batch.assert_called_once_with(_RESULTS)
mock_obfuscate.assert_has_calls(
[
Expand All @@ -267,57 +265,3 @@ def test_main_no_sierra_results(self, mock_helpers, mocker):
mocker.call("barcode barcode4"),
]
)

def test_main_no_redshift_results(self, mock_helpers, mocker):
_TEST_ENVISIONWARE_DATA = [
(i, "barcode{}".format(i), i, datetime(2023, i, i, i, 0, 0),
"branch{}".format(i), "area{}".format(i),
"staff_override{}".format(i))
for i in range(1, 5)]
_TEST_SIERRA_DATA = [
("barcode{}".format(i), i+1, i, i, "lib{}".format(i))
for i in range(1, 5)]
_RESULTS = [
{"patron_id": "obfuscated", "ptype_code": i,
"patron_home_library_code": "lib{}".format(i), "pcode3": i,
"postal_code": None, "geoid": None, "key": "obfuscated",
"minutes_used": i, "transaction_et": "2023-0{}-0{}".format(i, i),
"branch": "branch{}".format(i), "area": "area{}".format(i),
"staff_override": "staff_override{}".format(i),
"patron_retrieval_status": "found"}
for i in range(1, 5)]

mocker.patch("main.build_envisionware_query")
mocker.patch("main.build_sierra_query")
mocker.patch("main.build_redshift_query")
mocker.patch("main.KinesisClient")
mocker.patch("main.S3Client")

mock_obfuscate = mocker.patch("main.obfuscate", return_value="obfuscated")
mock_avro_encoder = self._set_up_mock("main.AvroEncoder", mocker)
mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)

mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
mock_envisionware_client.execute_query.return_value = _TEST_ENVISIONWARE_DATA

mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
mock_sierra_client.execute_query.return_value = _TEST_SIERRA_DATA

mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)
mock_redshift_client.execute_query.return_value = []

main.main()

mock_avro_encoder.encode_batch.assert_called_once_with(_RESULTS)
mock_obfuscate.assert_has_calls(
[
mocker.call("1"),
mocker.call("2"),
mocker.call("3"),
mocker.call("4"),
mocker.call("2"),
mocker.call("3"),
mocker.call("4"),
mocker.call("5"),
]
)

0 comments on commit 7362714

Please sign in to comment.