From a4003d9940e7bcd05449b4f50378a953262dcbad Mon Sep 17 00:00:00 2001
From: maxachis
Date: Wed, 20 Mar 2024 08:37:57 -0400
Subject: [PATCH 01/72] Create module and db manager

---
 agency_homepage_searcher/__init__.py   |  0
 agency_homepage_searcher/db_manager.py | 15 +++++++++++++++
 2 files changed, 15 insertions(+)
 create mode 100644 agency_homepage_searcher/__init__.py
 create mode 100644 agency_homepage_searcher/db_manager.py

diff --git a/agency_homepage_searcher/__init__.py b/agency_homepage_searcher/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/agency_homepage_searcher/db_manager.py b/agency_homepage_searcher/db_manager.py
new file mode 100644
index 0000000..0ffa317
--- /dev/null
+++ b/agency_homepage_searcher/db_manager.py
@@ -0,0 +1,15 @@
+
+class DBManager:
+    """
+    Manages access to PostgreSQL database.
+    """
+
+    def __init__(self, db_name, db_user, db_password, db_host, db_port):
+        self.db_name = db_name
+        self.db_user = db_user
+        self.db_password = db_password
+        self.db_host = db_host
+        self.db_port = db_port
+        self.conn = None
+        self.cur = None
+        self.connect()
\ No newline at end of file

From fc02c7135c3da179ff7d04db715f4dec872c8a4e Mon Sep 17 00:00:00 2001
From: maxachis
Date: Wed, 20 Mar 2024 08:42:30 -0400
Subject: [PATCH 02/72] Create util module and place db_manager inside.

---
 agency_homepage_searcher/db_manager.py | 15 ----------
 util/__init__.py                       |  0
 util/db_manager.py                     | 38 ++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 15 deletions(-)
 delete mode 100644 agency_homepage_searcher/db_manager.py
 create mode 100644 util/__init__.py
 create mode 100644 util/db_manager.py

diff --git a/agency_homepage_searcher/db_manager.py b/agency_homepage_searcher/db_manager.py
deleted file mode 100644
index 0ffa317..0000000
--- a/agency_homepage_searcher/db_manager.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-class DBManager:
-    """
-    Manages access to PostgreSQL database.
-    """
-
-    def __init__(self, db_name, db_user, db_password, db_host, db_port):
-        self.db_name = db_name
-        self.db_user = db_user
-        self.db_password = db_password
-        self.db_host = db_host
-        self.db_port = db_port
-        self.conn = None
-        self.cur = None
-        self.connect()
\ No newline at end of file
diff --git a/util/__init__.py b/util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/util/db_manager.py b/util/db_manager.py
new file mode 100644
index 0000000..908bfb3
--- /dev/null
+++ b/util/db_manager.py
@@ -0,0 +1,38 @@
+
+import psycopg2
+
+
+class DBManager:
+    """
+    Manages access to PostgreSQL database.
+    """
+
+    def __init__(self, db_name, user, password, host, port):
+        self.conn = psycopg2.connect(
+            dbname=db_name,
+            user=user,
+            password=password,
+            host=host,
+            port=port
+        )
+        self.cursor = self.conn.cursor()
+
+    def __del__(self):
+        self.conn.close()
+
+    def execute(self, query, params=None):
+        self.cursor.execute(query, params)
+        self.conn.commit()
+        return self.cursor.fetchall()
+
+    def fetchall(self):
+        return self.cursor.fetchall()
+
+    def fetchone(self):
+        return self.cursor.fetchone()
+
+    def fetchmany(self, size):
+        return self.cursor.fetchmany(size)
+
+    def close(self):
+        self.conn.close()
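
A minimal usage sketch for the DBManager introduced above, assuming a reachable PostgreSQL instance; the connection values here are placeholders, not project credentials:

    from util.db_manager import DBManager

    # Placeholder connection details -- substitute real values.
    db_manager = DBManager(
        db_name="example_db",
        user="example_user",
        password="example_password",
        host="localhost",
        port=5432
    )
    # execute() commits, then returns cursor.fetchall(), e.g. [(1,)]
    rows = db_manager.execute("SELECT 1")
    print(rows)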
+ """ + + def __init__(self, db_name, user, password, host, port): + self.conn = psycopg2.connect( + dbname=db_name, + user=user, + password=password, + host=host, + port=port + ) + self.cursor = self.conn.cursor() + + def __del__(self): + self.conn.close() + + def execute(self, query, params=None): + self.cursor.execute(query, params) + self.conn.commit() + return self.cursor.fetchall() + + def fetchall(self): + return self.cursor.fetchall() + + def fetchone(self): + return self.cursor.fetchone() + + def fetchmany(self, size): + return self.cursor.fetchmany(size) + + def close(self): + self.conn.close() From 39a6e8728e9e4fd633e34340e8079e891948da18 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 20 Mar 2024 16:14:25 -0400 Subject: [PATCH 03/72] Add google-api-python-client to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index dac0ead..e447164 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ tqdm~=4.66.2 pytest~=8.0.1 pytest-mock==3.12.0 urllib3~=1.26.18 +google-api-python-client~=2.119.0 From df92821bf83e05186f83d6c028c7496f9b447215 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 20 Mar 2024 16:14:51 -0400 Subject: [PATCH 04/72] Update execute type hinting --- util/db_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/db_manager.py b/util/db_manager.py index 908bfb3..f011587 100644 --- a/util/db_manager.py +++ b/util/db_manager.py @@ -20,7 +20,7 @@ def __init__(self, db_name, user, password, host, port): def __del__(self): self.conn.close() - def execute(self, query, params=None): + def execute(self, query, params=None) -> list: self.cursor.execute(query, params) self.conn.commit() return self.cursor.fetchall() From 7a643e46dc07da041c053e85473453b207f36438 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 20 Mar 2024 16:15:11 -0400 Subject: [PATCH 05/72] Create AgencyInfo dataclass --- agency_homepage_searcher/agency_info.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 agency_homepage_searcher/agency_info.py diff --git a/agency_homepage_searcher/agency_info.py b/agency_homepage_searcher/agency_info.py new file mode 100644 index 0000000..5a44867 --- /dev/null +++ b/agency_homepage_searcher/agency_info.py @@ -0,0 +1,25 @@ + + +""" +A dataclass containing all information relevant for searching for an agency's homepage. +""" + +from dataclasses import dataclass +from typing import Union + + +@dataclass +class AgencyInfo: + """ + A dataclass containing all information relevant for searching for an agency's homepage. 
+ """ + agency_name: str + city: str + state: str + county: str + zip_code: str + website: Union[str, None] + agency_type: str + agency_id: str # This is the unique identifier for the agency in the database + def __str__(self): + return f"{self.agency_name} in {self.city}, {self.state} ({self.agency_type})" \ No newline at end of file From e1bc20be0b2ad9b6df4a80bb0e69b816391c89af Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 20 Mar 2024 16:15:22 -0400 Subject: [PATCH 06/72] Create GoogleSearcher class --- agency_homepage_searcher/google_searcher.py | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 agency_homepage_searcher/google_searcher.py diff --git a/agency_homepage_searcher/google_searcher.py b/agency_homepage_searcher/google_searcher.py new file mode 100644 index 0000000..752b971 --- /dev/null +++ b/agency_homepage_searcher/google_searcher.py @@ -0,0 +1,28 @@ +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError +from dotenv import load_dotenv +import os + + +class GoogleSearcher: + + def __init__(self): + load_dotenv() + self.api_key = os.getenv("CUSTOM_SEARCH_API_KEY") + self.cse_id = os.getenv("CUSTOM_SEARCH_ENGINE_ID") + # Check if api key and cse id are set + if self.api_key is None or self.cse_id is None: + raise RuntimeError("Custom search API key and CSE ID must be set in .env file") + + self.service = build("customsearch", "v1", developerKey=self.api_key) + + def search(self, query: str) -> list[dict]: + try: + res = self.service.cse().list(q=query, cx=self.cse_id).execute() + return res['items'] + # Process your results + except HttpError as e: + if e.resp.status == 403: + raise RuntimeError(f"Quota exceeded for the day. Original Error: {e}") + else: + raise RuntimeError(f"An error occurred: {e}") From 539a2922eee8636db52c600b63a2326e0ade1ca0 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 20 Mar 2024 16:15:50 -0400 Subject: [PATCH 07/72] Create HuggingFaceAPIManager --- .../huggingface_api_manager.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 agency_homepage_searcher/huggingface_api_manager.py diff --git a/agency_homepage_searcher/huggingface_api_manager.py b/agency_homepage_searcher/huggingface_api_manager.py new file mode 100644 index 0000000..0209ecb --- /dev/null +++ b/agency_homepage_searcher/huggingface_api_manager.py @@ -0,0 +1,25 @@ +from pathlib import Path + +import huggingface_hub + +class HuggingFaceAPIManager: + """ + A class to manage the HuggingFace API. 
+ """ + def __init__( + self, + access_token: str, + repo_id: str + ): + huggingface_hub.login( + token=access_token + ) + self.api = huggingface_hub.HfApi() + self.repo_id = repo_id + + def upload_file(self, local_file_path: Path, repo_file_path): + self.api.upload_file( + path_or_fileobj=local_file_path, + path_in_repo=repo_file_path, + repo_id=self.repo_id + ) From 564f82939b00557a10b5af0f90040719e9db5a93 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 20 Mar 2024 17:36:56 -0400 Subject: [PATCH 08/72] Create HomepageSearcher and associated dataclasses and support functions --- agency_homepage_searcher/homepage_searcher.py | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 agency_homepage_searcher/homepage_searcher.py diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py new file mode 100644 index 0000000..b192016 --- /dev/null +++ b/agency_homepage_searcher/homepage_searcher.py @@ -0,0 +1,236 @@ +import csv +import os +import tempfile +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import List, Union + +from agency_homepage_searcher.agency_info import AgencyInfo +from agency_homepage_searcher.google_searcher import GoogleSearcher +from agency_homepage_searcher.huggingface_api_manager import HuggingFaceAPIManager +from util.db_manager import DBManager + +STATE_ISO_TO_NAME_DICT = { + "AL": "Alabama", + "AK": "Alaska", + "AZ": "Arizona", + "AR": "Arkansas", + "CA": "California", + "CO": "Colorado", + "CT": "Connecticut", + "DE": "Delaware", + "FL": "Florida", + "GA": "Georgia", + "HI": "Hawaii", + "ID": "Idaho", + "IL": "Illinois", + "IN": "Indiana", + "IA": "Iowa", + "KS": "Kansas", + "KY": "Kentucky", + "LA": "Louisiana", + "ME": "Maine", + "MD": "Maryland", + "MA": "Massachusetts", + "MI": "Michigan", + "MN": "Minnesota", + "MS": "Mississippi", + "MO": "Missouri", + "MT": "Montana", + "NE": "Nebraska", + "NV": "Nevada", + "NH": "New Hampshire", + "NJ": "New Jersey", + "NM": "New Mexico", + "NY": "New York", + "NC": "North Carolina", + "ND": "North Dakota", + "OH": "Ohio", + "OK": "Oklahoma", + "OR": "Oregon", + "PA": "Pennsylvania", + "RI": "Rhode Island", + "SC": "South Carolina", + "SD": "South Dakota", + "TN": "Tennessee", + "TX": "Texas", + "UT": "Utah", + "VT": "Vermont", + "VA": "Virginia", + "WA": "Washington", + "WV": "West Virginia", + "WI": "Wisconsin", + "WY": "Wyoming" +} + +SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS = """ + SELECT + SUBMITTED_NAME, + JURISDICTION_TYPE, + STATE_ISO, + MUNICIPALITY, + COUNTY_NAME, + AIRTABLE_UID, + COUNT_DATA_SOURCES, + ZIP_CODE, + NO_WEB_PRESENCE -- Relevant + FROM + PUBLIC.AGENCIES + WHERE + approved = true + and homepage_url is null + ORDER BY COUNT_DATA_SOURCES DESC + LIMIT 100 -- Limiting to 100 in acknowledgment of the search engine quota +""" + + +@dataclass +class PossibleHomepageURL: + url: str + snippet: str + + +@dataclass +class SearchResults: + agency_id: str + search_results: List[PossibleHomepageURL] + + +def get_filename_friendly_timestamp() -> str: + # Get the current datetime + now = datetime.now() + # Format the datetime in a filename-friendly format + # Example: "2024-03-20_15-30-45" + return now.strftime("%Y-%m-%d_%H-%M-%S") + + +class HomepageSearcher: + def __init__( + self, + search_engine: GoogleSearcher, + database_manager: DBManager, + huggingface_api_manager: HuggingFaceAPIManager + ): + self.search_engine = search_engine + self.database_manager = database_manager + 
self.huggingface_api_manager = huggingface_api_manager + + def get_agencies_without_homepage_urls(self) -> list[AgencyInfo]: + # This is a placeholder for the actual functionality + agency_rows = self.database_manager.execute(SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS) + results = [] + + for agency_row in agency_rows: + try: + state_name = STATE_ISO_TO_NAME_DICT[agency_row[2]] + except KeyError: + raise ValueError(f"Invalid state ISO code: {agency_row[2]}") + agency_info = AgencyInfo( + agency_name=agency_row[0], + city=agency_row[3], + state=state_name, + county=agency_row[4], + zip_code=agency_row[7], + website=None, + agency_type=agency_row[1], + agency_id=agency_row[5] + ) + results.append(agency_info) + return results + + @staticmethod + def build_search_string(agency_info: AgencyInfo) -> str: + """ + Builds the search string which will be used in the search engine search + Args: + agency_info: + + Returns: + + """ + search_string = (f"{agency_info.agency_name} {agency_info.city} {agency_info.state} {agency_info.county} " + f"{agency_info.zip_code} {agency_info.website} {agency_info.agency_type}") + return search_string + + def search(self, agency_info: AgencyInfo) -> Union[SearchResults, None]: + # This is a placeholder for the actual search functionality + search_string = self.build_search_string(agency_info) + search_results = self.search_engine.search(search_string) + if search_results is None: # Quota exceeded + return None + # For now, return the first 10 results + search_result = SearchResults( + agency_id=agency_info.agency_id, + search_results=[PossibleHomepageURL(url=result['link'], snippet=result['snippet']) for result in + search_results]) + return search_result + + def search_until_quota_exceeded( + self, + agency_info_list: list[AgencyInfo], + max_searches: int = 100 + ) -> list[SearchResults]: + # This is a placeholder for the actual search functionality + search_results = [] + for search_count, agency_info in enumerate(agency_info_list): + if search_count >= max_searches: + break + search_result = self.search(agency_info) + if search_result is None: # Quota exceeded + break + search_results.append(search_result) + return search_results + + def write_to_temporary_csv(self, data: List[SearchResults]) -> Path: + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as tmpfile: + writer = csv.writer(tmpfile) + # Write the header + writer.writerow(["agency_id", "url", "snippet"]) + for search_result in data: + for possible_homepage_url in search_result.search_results: + writer.writerow([search_result.agency_id, possible_homepage_url.url, possible_homepage_url.snippet]) + # Remember the file name for later access + temp_file_path = Path(tmpfile.name) + return temp_file_path + + def search_and_upload(self, max_searches: int = 100): + agencies = self.get_agencies_without_homepage_urls() + search_results = self.search_until_quota_exceeded( + agency_info_list=agencies, + max_searches=max_searches + ) + temp_file_path = self.write_to_temporary_csv(search_results) + timestamp = get_filename_friendly_timestamp() + self.huggingface_api_manager.upload_file( + local_file_path=temp_file_path, + repo_file_path=f"/data/search_results_{timestamp}.csv" + ) + temp_file_path.unlink() # Clean up the temporary file + +if __name__ == "__main__": + # Load the custom search API key and CSE ID from the .env file + from dotenv import load_dotenv + load_dotenv() + google_searcher = GoogleSearcher( + api_key=os.getenv("CUSTOM_SEARCH_API_KEY"), + 
cse_id=os.getenv("CUSTOM_SEARCH_ENGINE_ID")) + db_manager = DBManager( + user=os.getenv("DIGITAL_OCEAN_DB_USERNAME"), + password=os.getenv("DIGITAL_OCEAN_DB_PASSWORD"), + host=os.getenv("DIGITAL_OCEAN_DB_HOST"), + port=os.getenv("DIGITAL_OCEAN_DB_PORT"), + db_name=os.getenv("DIGITAL_OCEAN_DB_NAME") + ) + huggingface_api_manager = HuggingFaceAPIManager( + access_token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"), + repo_id="PDAP/possible_homepage_urls" + ) + homepage_searcher = HomepageSearcher( + search_engine=google_searcher, + database_manager=db_manager, + huggingface_api_manager=huggingface_api_manager + ) + homepage_searcher.search_and_upload( + max_searches=1 + ) From 60bbea0bafc628a22a50c18a346b2b4cef9e0a4d Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 20 Mar 2024 17:37:11 -0400 Subject: [PATCH 09/72] Fix bug in upload_file function --- agency_homepage_searcher/huggingface_api_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/agency_homepage_searcher/huggingface_api_manager.py b/agency_homepage_searcher/huggingface_api_manager.py index 0209ecb..fe45545 100644 --- a/agency_homepage_searcher/huggingface_api_manager.py +++ b/agency_homepage_searcher/huggingface_api_manager.py @@ -21,5 +21,6 @@ def upload_file(self, local_file_path: Path, repo_file_path): self.api.upload_file( path_or_fileobj=local_file_path, path_in_repo=repo_file_path, - repo_id=self.repo_id + repo_id=self.repo_id, + repo_type="dataset" ) From 738268ddf4c9aef88c27599e441d758715f729c4 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 20 Mar 2024 17:37:41 -0400 Subject: [PATCH 10/72] Modify search function to return None if Quota exceeded, rather than raise runtime error --- agency_homepage_searcher/google_searcher.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/agency_homepage_searcher/google_searcher.py b/agency_homepage_searcher/google_searcher.py index 752b971..36bbb4a 100644 --- a/agency_homepage_searcher/google_searcher.py +++ b/agency_homepage_searcher/google_searcher.py @@ -16,13 +16,14 @@ def __init__(self): self.service = build("customsearch", "v1", developerKey=self.api_key) - def search(self, query: str) -> list[dict]: + def search(self, query: str) -> Union[list[dict], None]: try: res = self.service.cse().list(q=query, cx=self.cse_id).execute() return res['items'] # Process your results except HttpError as e: if e.resp.status == 403: - raise RuntimeError(f"Quota exceeded for the day. 
Original Error: {e}") + print("Quota exceeded for the day") + return None else: raise RuntimeError(f"An error occurred: {e}") From 7d1bd39da2b4a5b27c933fcc6cceb40a83f30247 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 20 Mar 2024 17:38:09 -0400 Subject: [PATCH 11/72] Move environmental retrieval functionality from outside of GoogleSearcher --- agency_homepage_searcher/google_searcher.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/agency_homepage_searcher/google_searcher.py b/agency_homepage_searcher/google_searcher.py index 36bbb4a..b4009da 100644 --- a/agency_homepage_searcher/google_searcher.py +++ b/agency_homepage_searcher/google_searcher.py @@ -1,18 +1,22 @@ +from typing import Union + from googleapiclient.discovery import build from googleapiclient.errors import HttpError from dotenv import load_dotenv import os - class GoogleSearcher: - def __init__(self): + def __init__( + self, + api_key: str, + cse_id: str + ): load_dotenv() - self.api_key = os.getenv("CUSTOM_SEARCH_API_KEY") - self.cse_id = os.getenv("CUSTOM_SEARCH_ENGINE_ID") - # Check if api key and cse id are set - if self.api_key is None or self.cse_id is None: - raise RuntimeError("Custom search API key and CSE ID must be set in .env file") + if api_key is None or cse_id is None: + raise RuntimeError("Custom search API key and CSE ID required") + self.api_key = api_key + self.cse_id = cse_id self.service = build("customsearch", "v1", developerKey=self.api_key) From 929fc90a5d6b2570e971b4cf626a3a3c7f81095f Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 21 Mar 2024 07:16:17 -0400 Subject: [PATCH 12/72] Add type hinting to search_and_upload --- agency_homepage_searcher/homepage_searcher.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index b192016..49da5b0 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -194,7 +194,15 @@ def write_to_temporary_csv(self, data: List[SearchResults]) -> Path: temp_file_path = Path(tmpfile.name) return temp_file_path - def search_and_upload(self, max_searches: int = 100): + def search_and_upload(self, max_searches: int = 100) -> None: + """ + Searches for possible homepage URLs for agencies without homepage URLs and uploads the results to HuggingFace. 
+ Args: + max_searches: the maximum number of searches to perform + + Returns: + + """ agencies = self.get_agencies_without_homepage_urls() search_results = self.search_until_quota_exceeded( agency_info_list=agencies, From 76cd17f0750fe0fda460b75d6ab56b2f32737743 Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 21 Mar 2024 07:22:10 -0400 Subject: [PATCH 13/72] Move huggingface_api_manager.py to util folder --- agency_homepage_searcher/homepage_searcher.py | 2 +- {agency_homepage_searcher => util}/huggingface_api_manager.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename {agency_homepage_searcher => util}/huggingface_api_manager.py (100%) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 49da5b0..2510d3d 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -8,7 +8,7 @@ from agency_homepage_searcher.agency_info import AgencyInfo from agency_homepage_searcher.google_searcher import GoogleSearcher -from agency_homepage_searcher.huggingface_api_manager import HuggingFaceAPIManager +from util.huggingface_api_manager import HuggingFaceAPIManager from util.db_manager import DBManager STATE_ISO_TO_NAME_DICT = { diff --git a/agency_homepage_searcher/huggingface_api_manager.py b/util/huggingface_api_manager.py similarity index 100% rename from agency_homepage_searcher/huggingface_api_manager.py rename to util/huggingface_api_manager.py From de67ba7fbfdbd37d26fff5442ceb0fd65a8003cc Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 21 Mar 2024 07:22:41 -0400 Subject: [PATCH 14/72] Update upload_file type hinting --- util/huggingface_api_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/huggingface_api_manager.py b/util/huggingface_api_manager.py index fe45545..e616205 100644 --- a/util/huggingface_api_manager.py +++ b/util/huggingface_api_manager.py @@ -17,7 +17,7 @@ def __init__( self.api = huggingface_hub.HfApi() self.repo_id = repo_id - def upload_file(self, local_file_path: Path, repo_file_path): + def upload_file(self, local_file_path: Path, repo_file_path: str): self.api.upload_file( path_or_fileobj=local_file_path, path_in_repo=repo_file_path, From 1cec6c53cf822c1d0fb691b18e19a3a25e40a967 Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 21 Mar 2024 07:23:27 -0400 Subject: [PATCH 15/72] Add docstrings --- util/huggingface_api_manager.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/util/huggingface_api_manager.py b/util/huggingface_api_manager.py index e616205..4b38e38 100644 --- a/util/huggingface_api_manager.py +++ b/util/huggingface_api_manager.py @@ -11,6 +11,12 @@ def __init__( access_token: str, repo_id: str ): + """ + Initializes the HuggingFace API manager. + Args: + access_token: the HuggingFace access token + repo_id: the repository ID + """ huggingface_hub.login( token=access_token ) @@ -18,6 +24,15 @@ def __init__( self.repo_id = repo_id def upload_file(self, local_file_path: Path, repo_file_path: str): + """ + Uploads a file to the HuggingFace dataset repository. 
+ Args: + local_file_path: the local file path + repo_file_path: the file path in the repository + + Returns: None + + """ self.api.upload_file( path_or_fileobj=local_file_path, path_in_repo=repo_file_path, From c02a84df6d590022195beabb3e6d6e6f237c7882 Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 21 Mar 2024 07:37:50 -0400 Subject: [PATCH 16/72] Move get_filename_friendly_timestamp to miscellaneous_functions.py --- agency_homepage_searcher/homepage_searcher.py | 10 +--------- util/miscellaneous_functions.py | 9 +++++++++ 2 files changed, 10 insertions(+), 9 deletions(-) create mode 100644 util/miscellaneous_functions.py diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 2510d3d..e2fd46b 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -2,7 +2,6 @@ import os import tempfile from dataclasses import dataclass -from datetime import datetime from pathlib import Path from typing import List, Union @@ -10,6 +9,7 @@ from agency_homepage_searcher.google_searcher import GoogleSearcher from util.huggingface_api_manager import HuggingFaceAPIManager from util.db_manager import DBManager +from util.miscellaneous_functions import get_filename_friendly_timestamp STATE_ISO_TO_NAME_DICT = { "AL": "Alabama", @@ -97,14 +97,6 @@ class SearchResults: search_results: List[PossibleHomepageURL] -def get_filename_friendly_timestamp() -> str: - # Get the current datetime - now = datetime.now() - # Format the datetime in a filename-friendly format - # Example: "2024-03-20_15-30-45" - return now.strftime("%Y-%m-%d_%H-%M-%S") - - class HomepageSearcher: def __init__( self, diff --git a/util/miscellaneous_functions.py b/util/miscellaneous_functions.py new file mode 100644 index 0000000..fbd4e24 --- /dev/null +++ b/util/miscellaneous_functions.py @@ -0,0 +1,9 @@ +from datetime import datetime + + +def get_filename_friendly_timestamp() -> str: + # Get the current datetime + now = datetime.now() + # Format the datetime in a filename-friendly format + # Example: "2024-03-20_15-30-45" + return now.strftime("%Y-%m-%d_%H-%M-%S") From ef48e496a0d7510694c8a21438f19497a3d469a1 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 08:02:14 -0400 Subject: [PATCH 17/72] Add executemany method --- util/db_manager.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/util/db_manager.py b/util/db_manager.py index f011587..da350a5 100644 --- a/util/db_manager.py +++ b/util/db_manager.py @@ -25,6 +25,11 @@ def execute(self, query, params=None) -> list: self.conn.commit() return self.cursor.fetchall() + def executemany(self, query, params=None) -> list: + self.cursor.executemany(query, params) + self.conn.commit() + return self.cursor.fetchall() + def fetchall(self): return self.cursor.fetchall() From 047925c2df8594350b6f90f699a510eef505b264 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 08:02:56 -0400 Subject: [PATCH 18/72] Add docstrings to some methods --- agency_homepage_searcher/homepage_searcher.py | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index e2fd46b..187e8f6 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -109,7 +109,10 @@ def __init__( self.huggingface_api_manager = huggingface_api_manager def get_agencies_without_homepage_urls(self) -> list[AgencyInfo]: - # This is a 
placeholder for the actual functionality + """ + Retrieves a list of agencies without homepage URLs. + Returns: list[AgencyInfo] + """ agency_rows = self.database_manager.execute(SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS) results = [] @@ -146,7 +149,12 @@ def build_search_string(agency_info: AgencyInfo) -> str: return search_string def search(self, agency_info: AgencyInfo) -> Union[SearchResults, None]: - # This is a placeholder for the actual search functionality + """ + Searches for possible homepage URLs for a single agency. + Args: + agency_info: information about the agency + Returns: either the search results or None if the quota is exceeded + """ search_string = self.build_search_string(agency_info) search_results = self.search_engine.search(search_string) if search_results is None: # Quota exceeded @@ -163,7 +171,13 @@ def search_until_quota_exceeded( agency_info_list: list[AgencyInfo], max_searches: int = 100 ) -> list[SearchResults]: - # This is a placeholder for the actual search functionality + """ + Searches for possible homepage URLs for agencies until the quota is exceeded. + Args: + agency_info_list: list[AgencyInfo] - the list of agencies to search + max_searches: int - the maximum number of searches to perform + Returns: list[SearchResults] - the search results + """ search_results = [] for search_count, agency_info in enumerate(agency_info_list): if search_count >= max_searches: @@ -175,6 +189,13 @@ def search_until_quota_exceeded( return search_results def write_to_temporary_csv(self, data: List[SearchResults]) -> Path: + """ + Writes the search results to a temporary CSV file + which will be uploaded to HuggingFace. + Args: + data: List[SearchResults] - the search results + Returns: Path - the path to the temporary file + """ with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as tmpfile: writer = csv.writer(tmpfile) # Write the header @@ -191,9 +212,7 @@ def search_and_upload(self, max_searches: int = 100) -> None: Searches for possible homepage URLs for agencies without homepage URLs and uploads the results to HuggingFace. 
Args: max_searches: the maximum number of searches to perform - - Returns: - + Returns: None """ agencies = self.get_agencies_without_homepage_urls() search_results = self.search_until_quota_exceeded( From a2fc866d3db54f6a3d8143c71ce9207f0ae5cfe8 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 08:03:38 -0400 Subject: [PATCH 19/72] Add search cache logic --- agency_homepage_searcher/homepage_searcher.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 187e8f6..8549801 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from pathlib import Path from typing import List, Union +from dotenv import load_dotenv from agency_homepage_searcher.agency_info import AgencyInfo from agency_homepage_searcher.google_searcher import GoogleSearcher @@ -84,6 +85,12 @@ LIMIT 100 -- Limiting to 100 in acknowledgment of the search engine quota """ +SQL_UPDATE_CACHE = """ + INSERT INTO PUBLIC.AGENCY_URL_SEARCH_CACHE + (agency_airtable_id) + VALUES (%s) +""" + @dataclass class PossibleHomepageURL: @@ -207,6 +214,17 @@ def write_to_temporary_csv(self, data: List[SearchResults]) -> Path: temp_file_path = Path(tmpfile.name) return temp_file_path + def update_search_cache(self, agency_ids: list[str]) -> None: + """ + Updates the search cache for the given agency IDs. + Args: + agency_ids: list[str] - the agency IDs to update + """ + self.database_manager.executemany( + SQL_UPDATE_CACHE, + [(agency_id,) for agency_id in agency_ids] + ) + def search_and_upload(self, max_searches: int = 100) -> None: """ Searches for possible homepage URLs for agencies without homepage URLs and uploads the results to HuggingFace. 
@@ -226,6 +244,10 @@ def search_and_upload(self, max_searches: int = 100) -> None: repo_file_path=f"/data/search_results_{timestamp}.csv" ) temp_file_path.unlink() # Clean up the temporary file + # Get the id of all agencies that were searched + agency_ids = [search_result.agency_id for search_result in search_results] + self.update_search_cache(agency_ids) + if __name__ == "__main__": # Load the custom search API key and CSE ID from the .env file From dc0c66d7c660dcb56319dc81726f08e8fa7f996e Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 08:04:29 -0400 Subject: [PATCH 20/72] Remove local import of load_dotenv --- agency_homepage_searcher/homepage_searcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 8549801..7e55e43 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -251,7 +251,6 @@ def search_and_upload(self, max_searches: int = 100) -> None: if __name__ == "__main__": # Load the custom search API key and CSE ID from the .env file - from dotenv import load_dotenv load_dotenv() google_searcher = GoogleSearcher( api_key=os.getenv("CUSTOM_SEARCH_API_KEY"), From ac72bc7ebaf8ac91900bdf2dcd5183dcfa3c2d78 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 13:51:33 -0400 Subject: [PATCH 21/72] Update requirements.txt - Add psycopg2-binary - Add huggingface-hub --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index e447164..8911598 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ pytest~=8.0.1 pytest-mock==3.12.0 urllib3~=1.26.18 google-api-python-client~=2.119.0 +psycopg2-binary~=2.9.6 +huggingface-hub~=0.20.3 \ No newline at end of file From ed0f6ce4e3ac14d89dd7091ca89cdb543d0517ab Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 13:51:51 -0400 Subject: [PATCH 22/72] Modify executemany - Add condition for when there is nothing to return, such as in an INSERT statement --- util/db_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/util/db_manager.py b/util/db_manager.py index da350a5..0404d4d 100644 --- a/util/db_manager.py +++ b/util/db_manager.py @@ -28,7 +28,10 @@ def execute(self, query, params=None) -> list: def executemany(self, query, params=None) -> list: self.cursor.executemany(query, params) self.conn.commit() - return self.cursor.fetchall() + try: + return self.cursor.fetchall() + except psycopg2.ProgrammingError: + return [] def fetchall(self): return self.cursor.fetchall() From a40cd3858b62e899a31519475ac9f8eff1f6b159 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 13:52:44 -0400 Subject: [PATCH 23/72] Modify SQL statmeents - correct column name in SQL_UPDATE_CACHE - Update SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS to not return entries which already exist in the cache. 
--- agency_homepage_searcher/homepage_searcher.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 7e55e43..4ddc8b9 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -79,15 +79,19 @@ FROM PUBLIC.AGENCIES WHERE - approved = true - and homepage_url is null + approved = true + AND homepage_url is null + AND NOT EXISTS ( + SELECT 1 FROM PUBLIC.AGENCY_URL_SEARCH_CACHE + WHERE PUBLIC.AGENCIES.AIRTABLE_UID = PUBLIC.AGENCY_URL_SEARCH_CACHE.agency_airtable_uid + ) ORDER BY COUNT_DATA_SOURCES DESC LIMIT 100 -- Limiting to 100 in acknowledgment of the search engine quota """ SQL_UPDATE_CACHE = """ INSERT INTO PUBLIC.AGENCY_URL_SEARCH_CACHE - (agency_airtable_id) + (agency_airtable_uid) VALUES (%s) """ From 4827f8c39b73d03236255f7990bbc84db05a62bc Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 13:53:20 -0400 Subject: [PATCH 24/72] Fix bug in write_to_temporary_csv - Bug was causing two newlines to appear in windows. --- agency_homepage_searcher/homepage_searcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 4ddc8b9..be574b3 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -208,7 +208,7 @@ def write_to_temporary_csv(self, data: List[SearchResults]) -> Path: Returns: Path - the path to the temporary file """ with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as tmpfile: - writer = csv.writer(tmpfile) + writer = csv.writer(tmpfile, lineterminator='\n') # Write the header writer.writerow(["agency_id", "url", "snippet"]) for search_result in data: From 10ea3618664f6d5dca202f00134ce40ff74dcde5 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 13:53:46 -0400 Subject: [PATCH 25/72] Add clarifying print statements --- agency_homepage_searcher/homepage_searcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index be574b3..d0afc2d 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -237,6 +237,7 @@ def search_and_upload(self, max_searches: int = 100) -> None: Returns: None """ agencies = self.get_agencies_without_homepage_urls() + print("Searching for homepage URLs...") search_results = self.search_until_quota_exceeded( agency_info_list=agencies, max_searches=max_searches @@ -247,6 +248,7 @@ def search_and_upload(self, max_searches: int = 100) -> None: local_file_path=temp_file_path, repo_file_path=f"/data/search_results_{timestamp}.csv" ) + print(f"Uploaded {len(search_results)} search results to HuggingFace: {temp_file_path}") temp_file_path.unlink() # Clean up the temporary file # Get the id of all agencies that were searched agency_ids = [search_result.agency_id for search_result in search_results] From f70ba3fe26f5c701172663af746349f215a268ef Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 14:11:11 -0400 Subject: [PATCH 26/72] Set default max searches in main script --- agency_homepage_searcher/homepage_searcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index d0afc2d..726c304 100644 --- 
a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -278,5 +278,5 @@ def search_and_upload(self, max_searches: int = 100) -> None: huggingface_api_manager=huggingface_api_manager ) homepage_searcher.search_and_upload( - max_searches=1 + max_searches=100 ) From 7b84daffc4defd1fbc63dc7963a8935398c5c795 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 14:11:49 -0400 Subject: [PATCH 27/72] Add logic for handling search errors in the middle of a search - This should help prevent an entire set of searches from being lost if an error occurs in one --- agency_homepage_searcher/homepage_searcher.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 726c304..e9adb7d 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -193,7 +193,12 @@ def search_until_quota_exceeded( for search_count, agency_info in enumerate(agency_info_list): if search_count >= max_searches: break - search_result = self.search(agency_info) + try: + search_result = self.search(agency_info) + except Exception as e: + print(f"An error occurred while searching for {agency_info}: {e}") + print("Returning existing search results") + return search_results if search_result is None: # Quota exceeded break search_results.append(search_result) From a67d17160fd431e17ef80c19dfb7e0a49ad63c52 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 25 Mar 2024 14:12:08 -0400 Subject: [PATCH 28/72] Refined logic for catching when quota exceeded. --- agency_homepage_searcher/google_searcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agency_homepage_searcher/google_searcher.py b/agency_homepage_searcher/google_searcher.py index b4009da..27350cf 100644 --- a/agency_homepage_searcher/google_searcher.py +++ b/agency_homepage_searcher/google_searcher.py @@ -26,7 +26,7 @@ def search(self, query: str) -> Union[list[dict], None]: return res['items'] # Process your results except HttpError as e: - if e.resp.status == 403: + if "Quota exceeded" in str(e): print("Quota exceeded for the day") return None else: From 57bbf71a2d39a745b9c7c26d001a2f7917183338 Mon Sep 17 00:00:00 2001 From: maxachis Date: Sat, 30 Mar 2024 20:21:16 -0400 Subject: [PATCH 29/72] Add utf-8 encoding and handle exceptions in CSV writing The CSV temporary file in the 'write_to_temporary_csv' function now has utf-8 encoding for better compatibility. Exception handling is incorporated to prevent crashes while writing rows to CSV. Print statements were added to log the number of search results obtained. --- agency_homepage_searcher/homepage_searcher.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index e9adb7d..e63e1a2 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -208,17 +208,21 @@ def write_to_temporary_csv(self, data: List[SearchResults]) -> Path: """ Writes the search results to a temporary CSV file which will be uploaded to HuggingFace. 
+ Args: data: List[SearchResults] - the search results Returns: Path - the path to the temporary file """ - with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as tmpfile: + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv', encoding='utf-8') as tmpfile: writer = csv.writer(tmpfile, lineterminator='\n') # Write the header writer.writerow(["agency_id", "url", "snippet"]) for search_result in data: - for possible_homepage_url in search_result.search_results: - writer.writerow([search_result.agency_id, possible_homepage_url.url, possible_homepage_url.snippet]) + try: + for possible_homepage_url in search_result.search_results: + writer.writerow([search_result.agency_id, possible_homepage_url.url, possible_homepage_url.snippet]) + except Exception as e: + raise(f"An unexpected error occurred while writing search results for {search_result.agency_id}: {e}") # Remember the file name for later access temp_file_path = Path(tmpfile.name) return temp_file_path @@ -247,6 +251,7 @@ def search_and_upload(self, max_searches: int = 100) -> None: agency_info_list=agencies, max_searches=max_searches ) + print(f"Obtained {len(search_results)} search results") temp_file_path = self.write_to_temporary_csv(search_results) timestamp = get_filename_friendly_timestamp() self.huggingface_api_manager.upload_file( From dac99e0435679a9f8a9454d6aec96b229757130c Mon Sep 17 00:00:00 2001 From: maxachis Date: Sat, 30 Mar 2024 20:21:29 -0400 Subject: [PATCH 30/72] Extract main function from homepage_searcher to main.py The main function of the application was moved from homepage_searcher.py to main.py to improve code organization. As part of the changes, the if __name__ == "__main__" clause was transferred to main.py. Also, environment variables were transferred to the GoogleSearcher, DBManager and HuggingFaceAPIManager constructors in main.py. 
--- agency_homepage_searcher/homepage_searcher.py | 25 -------------- agency_homepage_searcher/main.py | 34 +++++++++++++++++++ 2 files changed, 34 insertions(+), 25 deletions(-) create mode 100644 agency_homepage_searcher/main.py diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index e63e1a2..b342188 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -265,28 +265,3 @@ def search_and_upload(self, max_searches: int = 100) -> None: self.update_search_cache(agency_ids) -if __name__ == "__main__": - # Load the custom search API key and CSE ID from the .env file - load_dotenv() - google_searcher = GoogleSearcher( - api_key=os.getenv("CUSTOM_SEARCH_API_KEY"), - cse_id=os.getenv("CUSTOM_SEARCH_ENGINE_ID")) - db_manager = DBManager( - user=os.getenv("DIGITAL_OCEAN_DB_USERNAME"), - password=os.getenv("DIGITAL_OCEAN_DB_PASSWORD"), - host=os.getenv("DIGITAL_OCEAN_DB_HOST"), - port=os.getenv("DIGITAL_OCEAN_DB_PORT"), - db_name=os.getenv("DIGITAL_OCEAN_DB_NAME") - ) - huggingface_api_manager = HuggingFaceAPIManager( - access_token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"), - repo_id="PDAP/possible_homepage_urls" - ) - homepage_searcher = HomepageSearcher( - search_engine=google_searcher, - database_manager=db_manager, - huggingface_api_manager=huggingface_api_manager - ) - homepage_searcher.search_and_upload( - max_searches=100 - ) diff --git a/agency_homepage_searcher/main.py b/agency_homepage_searcher/main.py new file mode 100644 index 0000000..cf7c9d6 --- /dev/null +++ b/agency_homepage_searcher/main.py @@ -0,0 +1,34 @@ +import os + +from dotenv import load_dotenv + +from agency_homepage_searcher.google_searcher import GoogleSearcher +from agency_homepage_searcher.homepage_searcher import HomepageSearcher +from util.db_manager import DBManager +from util.huggingface_api_manager import HuggingFaceAPIManager + +if __name__ == "__main__": + # Load the custom search API key and CSE ID from the .env file + load_dotenv() + google_searcher = GoogleSearcher( + api_key=os.getenv("CUSTOM_SEARCH_API_KEY"), + cse_id=os.getenv("CUSTOM_SEARCH_ENGINE_ID")) + db_manager = DBManager( + user=os.getenv("DIGITAL_OCEAN_DB_USERNAME"), + password=os.getenv("DIGITAL_OCEAN_DB_PASSWORD"), + host=os.getenv("DIGITAL_OCEAN_DB_HOST"), + port=os.getenv("DIGITAL_OCEAN_DB_PORT"), + db_name=os.getenv("DIGITAL_OCEAN_DB_NAME") + ) + huggingface_api_manager = HuggingFaceAPIManager( + access_token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"), + repo_id="PDAP/possible_homepage_urls" + ) + homepage_searcher = HomepageSearcher( + search_engine=google_searcher, + database_manager=db_manager, + huggingface_api_manager=huggingface_api_manager + ) + homepage_searcher.search_and_upload( + max_searches=100 + ) From ae73e5ac390da16d69de3b066936b8d4a359172f Mon Sep 17 00:00:00 2001 From: maxachis Date: Sat, 30 Mar 2024 20:55:31 -0400 Subject: [PATCH 31/72] Add string generation method in agency_info.py A new method `get_search_string` has been added to the AgencyInfo class in the agency_info.py file. This method helps to construct the search string for search engines, improving the mechanism of searching agency information. Additionally, unnecessary blank lines at the beginning of the file have been removed for cleaner code formatting. 
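
For local runs of main.py, the environment variables read above would live in a .env file. A sketch with placeholder values only (the variable names come from main.py; the values are not real credentials):

    CUSTOM_SEARCH_API_KEY=<your-google-api-key>
    CUSTOM_SEARCH_ENGINE_ID=<your-cse-id>
    DIGITAL_OCEAN_DB_USERNAME=<db-username>
    DIGITAL_OCEAN_DB_PASSWORD=<db-password>
    DIGITAL_OCEAN_DB_HOST=<db-host>
    DIGITAL_OCEAN_DB_PORT=<db-port>
    DIGITAL_OCEAN_DB_NAME=<db-name>
    HUGGINGFACE_ACCESS_TOKEN=<hf-access-token>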

From ae73e5ac390da16d69de3b066936b8d4a359172f Mon Sep 17 00:00:00 2001
From: maxachis
Date: Sat, 30 Mar 2024 20:55:31 -0400
Subject: [PATCH 31/72] Add string generation method in agency_info.py

A new method `get_search_string` has been added to the AgencyInfo class in
the agency_info.py file. This method helps to construct the search string
for search engines, improving the mechanism of searching agency information.
Additionally, unnecessary blank lines at the beginning of the file have been
removed for cleaner code formatting.
---
 agency_homepage_searcher/agency_info.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/agency_homepage_searcher/agency_info.py b/agency_homepage_searcher/agency_info.py
index 5a44867..adbbb56 100644
--- a/agency_homepage_searcher/agency_info.py
+++ b/agency_homepage_searcher/agency_info.py
@@ -1,5 +1,3 @@
-
-
 """
 A dataclass containing all information relevant for searching for an agency's homepage.
 """
@@ -21,5 +19,14 @@ class AgencyInfo:
     website: Union[str, None]
     agency_type: str
     agency_id: str  # This is the unique identifier for the agency in the database
+
     def __str__(self):
-        return f"{self.agency_name} in {self.city}, {self.state} ({self.agency_type})"
\ No newline at end of file
+        return f"{self.agency_name} in {self.city}, {self.state} ({self.agency_type})"
+
+    def get_search_string(self) -> str:
+        """
+        Constructs the search string to be used in search engines.
+        """
+        search_string = (f"{self.agency_name} {self.city} {self.state} {self.county} "
+                         f"{self.zip_code} {self.website} {self.agency_type}")
+        return search_string

From 7b20b3e037bd28768793b3b19e9e4cefea3a78f9 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Sat, 30 Mar 2024 20:56:01 -0400
Subject: [PATCH 32/72] Refactor GoogleSearcher class and error handling

The GoogleSearcher class in the google_searcher.py file has been refactored
to improve clarity and functionality. Detailed explanations for methods and
attributes have been added, and the daily quota restriction handling has
been more effectively implemented with the addition of a new
QuotaExceededError. Additionally, the "Quota exceeded" branch of the HTTP
error handling now raises QuotaExceededError instead of returning None.
---
 agency_homepage_searcher/google_searcher.py | 44 +++++++++++++++++----
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/agency_homepage_searcher/google_searcher.py b/agency_homepage_searcher/google_searcher.py
index 27350cf..f9024cb 100644
--- a/agency_homepage_searcher/google_searcher.py
+++ b/agency_homepage_searcher/google_searcher.py
@@ -2,32 +2,62 @@
 
 from googleapiclient.discovery import build
 from googleapiclient.errors import HttpError
-from dotenv import load_dotenv
-import os
+
+
+class QuotaExceededError(Exception):
+    pass
 
 class GoogleSearcher:
+    """
+    A class that provides a GoogleSearcher object for performing searches using the Google Custom Search API.
+
+    Attributes:
+        api_key (str): The API key required for accessing the Google Custom Search API.
+        cse_id (str): The CSE (Custom Search Engine) ID required for identifying the specific search engine to use.
+        service (Google API service): The Google API service object for performing the search.
+
+    Methods:
+        __init__(api_key: str, cse_id: str)
+            Initializes a GoogleSearcher object with the provided API key and CSE ID. Raises a RuntimeError if either
+            the API key or CSE ID is None.
+
+        search(query: str) -> Union[list[dict], None]
+            Performs a search using the Google Custom Search API with the provided query string. Returns a list of
+            search results as dictionaries or None if the daily quota for the API has been exceeded. Raises a
+            RuntimeError if any other error occurs during the search.
+    """
+    GOOGLE_SERVICE_NAME = "customsearch"
+    GOOGLE_SERVICE_VERSION = "v1"
 
     def __init__(
             self,
             api_key: str,
             cse_id: str
     ):
-        load_dotenv()
         if api_key is None or cse_id is None:
             raise RuntimeError("Custom search API key and CSE ID required")
         self.api_key = api_key
         self.cse_id = cse_id
 
-        self.service = build("customsearch", "v1", developerKey=self.api_key)
+        self.service = build(self.GOOGLE_SERVICE_NAME,
+                             self.GOOGLE_SERVICE_VERSION,
+                             developerKey=self.api_key)
 
     def search(self, query: str) -> Union[list[dict], None]:
+        """
+        Searches for results using the specified query.
+
+        Args:
+            query (str): The query to search for.
+
+        Returns: Union[list[dict], None]: A list of dictionaries representing the search results.
+        If the daily quota is exceeded, None is returned.
+        """
         try:
             res = self.service.cse().list(q=query, cx=self.cse_id).execute()
             return res['items']
             # Process your results
         except HttpError as e:
             if "Quota exceeded" in str(e):
-                print("Quota exceeded for the day")
-                return None
+                raise QuotaExceededError("Quota exceeded for the day")
             else:
-                raise RuntimeError(f"An error occurred: {e}")
+                raise RuntimeError(f"An error occurred: {str(e)}")
\ No newline at end of file

From c372d66a97ee0e668d4a55e50185ff95050ec863 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Sat, 30 Mar 2024 20:56:23 -0400
Subject: [PATCH 33/72] Refactor homepage_searcher.py and improve error
 handling

The code in homepage_searcher.py has been streamlined to improve readability
and efficiency. The `search_until_quota_exceeded` method was renamed to
`search_until_limit_reached` to more accurately describe its behavior, and
the creation of `AgencyInfo` objects was moved to its own method for better
abstraction. Thorough comments were added for each method to provide clear
explanations. Additionally, error handling was enhanced to include the new
QuotaExceededError.
---
 agency_homepage_searcher/homepage_searcher.py | 175 +++++++++++-------
 1 file changed, 108 insertions(+), 67 deletions(-)

diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py
index b342188..5c86832 100644
--- a/agency_homepage_searcher/homepage_searcher.py
+++ b/agency_homepage_searcher/homepage_searcher.py
@@ -7,11 +7,13 @@
 from dotenv import load_dotenv
 
 from agency_homepage_searcher.agency_info import AgencyInfo
-from agency_homepage_searcher.google_searcher import GoogleSearcher
+from agency_homepage_searcher.google_searcher import GoogleSearcher, QuotaExceededError
 from util.huggingface_api_manager import HuggingFaceAPIManager
 from util.db_manager import DBManager
 from util.miscellaneous_functions import get_filename_friendly_timestamp
 
+MAX_SEARCHES = 100  # Maximum searches to perform at a time when searching for results
+
 STATE_ISO_TO_NAME_DICT = {
     "AL": "Alabama",
     "AK": "Alaska",
@@ -119,45 +121,37 @@ def __init__(
         self.database_manager = database_manager
         self.huggingface_api_manager = huggingface_api_manager
 
-    def get_agencies_without_homepage_urls(self) -> list[AgencyInfo]:
-        """
-        Retrieves a list of agencies without homepage URLs.
-        Returns: list[AgencyInfo]
-        """
-        agency_rows = self.database_manager.execute(SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS)
-        results = []
-
-        for agency_row in agency_rows:
-            try:
-                state_name = STATE_ISO_TO_NAME_DICT[agency_row[2]]
-            except KeyError:
-                raise ValueError(f"Invalid state ISO code: {agency_row[2]}")
-            agency_info = AgencyInfo(
-                agency_name=agency_row[0],
-                city=agency_row[3],
-                state=state_name,
-                county=agency_row[4],
-                zip_code=agency_row[7],
-                website=None,
-                agency_type=agency_row[1],
-                agency_id=agency_row[5]
-            )
-            results.append(agency_info)
-        return results
-
     @staticmethod
-    def build_search_string(agency_info: AgencyInfo) -> str:
+    def create_agency_info(agency_row: list) -> AgencyInfo:
         """
-        Builds the search string which will be used in the search engine search
+        Creates an AgencyInfo object using the provided agency data.
         Args:
-            agency_info:
-
+            agency_row: Data row of the agency from the database.
         Returns:
-
+            An AgencyInfo object.
+        """
+        try:
+            state_name = STATE_ISO_TO_NAME_DICT[agency_row[2]]
+        except KeyError:
+            raise ValueError(f"Invalid state ISO code: {agency_row[2]}")
+        return AgencyInfo(
+            agency_name=agency_row[0],
+            city=agency_row[3],
+            state=state_name,
+            county=agency_row[4],
+            zip_code=agency_row[7],
+            website=None,
+            agency_type=agency_row[1],
+            agency_id=agency_row[5]
+        )
+
+    def get_agencies_without_homepage_urls(self) -> list[AgencyInfo]:
+        """
+        Retrieves a list of agencies without homepage URLs.
+        Returns: list[AgencyInfo]
         """
-        search_string = (f"{agency_info.agency_name} {agency_info.city} {agency_info.state} {agency_info.county} "
-                         f"{agency_info.zip_code} {agency_info.website} {agency_info.agency_type}")
-        return search_string
+        agency_rows = self.database_manager.execute(SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS)
+        return [self.create_agency_info(agency_row) for agency_row in agency_rows]
 
     def search(self, agency_info: AgencyInfo) -> Union[SearchResults, None]:
         """
@@ -166,24 +160,36 @@ def search(self, agency_info: AgencyInfo) -> Union[SearchResults, None]:
             agency_info: information about the agency
         Returns: either the search results or None if the quota is exceeded
         """
-        search_string = self.build_search_string(agency_info)
-        search_results = self.search_engine.search(search_string)
-        if search_results is None:  # Quota exceeded
+        try:
+            search_results = self.search_engine.search(
+                query=agency_info.get_search_string()
+            )
+            first_ten_results = self._get_first_ten_results(search_results)
+            return SearchResults(agency_id=agency_info.agency_id, search_results=first_ten_results)
+        except QuotaExceededError:
+            print("Quota exceeded")
             return None
-        # For now, return the first 10 results
-        search_result = SearchResults(
-            agency_id=agency_info.agency_id,
-            search_results=[PossibleHomepageURL(url=result['link'], snippet=result['snippet']) for result in
-                            search_results])
-        return search_result
 
-    def search_until_quota_exceeded(
+    @staticmethod
+    def _get_first_ten_results(results: list[dict]):
+        """
+        Extracts first ten results and forms a list of PossibleHomepageURL objects.
+
+        Args:
+        - results: A list that fetches from the search engine.
+
+        Returns:
+        - List[PossibleHomepageURL]: list containing first ten or less elements.
+        """
+        return [PossibleHomepageURL(url=result['link'], snippet=result['snippet']) for result in results[:10]]
+
+    def search_until_limit_reached(
             self,
             agency_info_list: list[AgencyInfo],
-            max_searches: int = 100
+            max_searches: int = MAX_SEARCHES
     ) -> list[SearchResults]:
         """
-        Searches for possible homepage URLs for agencies until the quota is exceeded.
+        Searches for possible homepage URLs for agencies until the limit is reached.
         Args:
             agency_info_list: list[AgencyInfo] - the list of agencies to search
             max_searches: int - the maximum number of searches to perform
@@ -196,14 +202,27 @@ def search_until_quota_exceeded(
             try:
                 search_result = self.search(agency_info)
             except Exception as e:
-                print(f"An error occurred while searching for {agency_info}: {e}")
-                print("Returning existing search results")
-                return search_results
+                return self._handle_search_error(e, search_results)
             if search_result is None:  # Quota exceeded
                 break
             search_results.append(search_result)
         return search_results
 
+    @staticmethod
+    def _handle_search_error(error: Exception, search_results: list[SearchResults]) -> list[SearchResults]:
+        """
+        Handles search error and returns existing search results.
+
+        Args:
+            error (Exception): The error that occurred while searching.
+            search_results (list[SearchResults]): The existing search results.
+        Returns:
+            list[SearchResults]: The existing search results.
+        """
+        print(f"An error occurred while searching: {error}")
+        print("Returning existing search results")
+        return search_results
+
     def write_to_temporary_csv(self, data: List[SearchResults]) -> Path:
         """
         Writes the search results to a temporary CSV file
@@ -218,27 +237,55 @@ def write_to_temporary_csv(self, data: List[SearchResults]) -> Path:
             # Write the header
             writer.writerow(["agency_id", "url", "snippet"])
             for search_result in data:
-                try:
-                    for possible_homepage_url in search_result.search_results:
-                        writer.writerow([search_result.agency_id, possible_homepage_url.url, possible_homepage_url.snippet])
-                except Exception as e:
-                    raise(f"An unexpected error occurred while writing search results for {search_result.agency_id}: {e}")
+                self._write_search_result_to_csv(search_result, writer)
             # Remember the file name for later access
             temp_file_path = Path(tmpfile.name)
         return temp_file_path
 
+    @staticmethod
+    def _write_search_result_to_csv(search_result: SearchResults, writer: csv.writer) -> None:
+        """
+        Args:
+            search_result (SearchResults): An object that contains the search results.
+            writer (csv.writer): A writer object used to write the search results to a CSV file.
+
+        Raises:
+            Exception: If an unexpected error occurs while writing the search results.
+
+        Example:
+            search_result = SearchResults()
+            writer = csv.writer(...)
+            _write_search_result_to_csv(search_result, writer)
+        """
+        try:
+            for possible_homepage_url in search_result.search_results:
+                writer.writerow([search_result.agency_id, possible_homepage_url.url, possible_homepage_url.snippet])
+        except Exception as e:
+            raise f"An unexpected error occurred while writing search results for {search_result.agency_id}: {e}"
+
     def update_search_cache(self, agency_ids: list[str]) -> None:
         """
         Updates the search cache for the given agency IDs.
         Args:
             agency_ids: list[str] - the agency IDs to update
         """
-        self.database_manager.executemany(
-            SQL_UPDATE_CACHE,
-            [(agency_id,) for agency_id in agency_ids]
-        )
+        parameters = [(agency_id,) for agency_id in agency_ids]
+        self.database_manager.executemany(SQL_UPDATE_CACHE, parameters)
 
-    def search_and_upload(self, max_searches: int = 100) -> None:
+    def _try_search_agency_info(self, agency_info: AgencyInfo) -> Union[SearchResults, List]:
+        """
+        Args:
+            agency_info: The agency information to be searched.
+
+        Returns:
+            The result of the search operation, or an empty list if an error occurs during the search.
+        """
+        try:
+            return self.search(agency_info)
+        except Exception as e:
+            return self._handle_search_error(e, [])
+
+    def search_and_upload(self, max_searches: int = MAX_SEARCHES) -> None:
         """
         Searches for possible homepage URLs for agencies without homepage URLs and uploads the results to HuggingFace.
         Args:
@@ -246,11 +293,8 @@ def search_and_upload(self, max_searches: int = 100) -> None:
         Returns: None
         """
         agencies = self.get_agencies_without_homepage_urls()
-        print("Searching for homepage URLs...")
-        search_results = self.search_until_quota_exceeded(
-            agency_info_list=agencies,
-            max_searches=max_searches
-        )
+        print(f"Searching for homepage URLs for first {max_searches} agencies...")
+        search_results = self.search_until_limit_reached(agency_info_list=agencies, max_searches=max_searches)
         print(f"Obtained {len(search_results)} search results")
         temp_file_path = self.write_to_temporary_csv(search_results)
         timestamp = get_filename_friendly_timestamp()
@@ -260,8 +304,5 @@ def search_and_upload(self, max_searches: int = 100) -> None:
         )
         print(f"Uploaded {len(search_results)} search results to HuggingFace: {temp_file_path}")
         temp_file_path.unlink()  # Clean up the temporary file
-        # Get the id of all agencies that were searched
         agency_ids = [search_result.agency_id for search_result in search_results]
         self.update_search_cache(agency_ids)
-
-

From 4e1832f2363b3d914f880e7a4a0b23717476d7b6 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Sat, 30 Mar 2024 21:32:06 -0400
Subject: [PATCH 34/72] Update error message in google_searcher.py

The error message that is raised when either 'api_key' or 'cse_id'
variables is 'None' has been clarified in 'google_searcher.py'. Previously,
the error message stated, "Custom search API key and CSE ID required", but
this has been changed to "Custom search API key and CSE ID cannot be None."
to provide additional precision.
---
 agency_homepage_searcher/google_searcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/agency_homepage_searcher/google_searcher.py b/agency_homepage_searcher/google_searcher.py
index f9024cb..5c5db33 100644
--- a/agency_homepage_searcher/google_searcher.py
+++ b/agency_homepage_searcher/google_searcher.py
@@ -34,7 +34,7 @@ def __init__(
             cse_id: str
     ):
         if api_key is None or cse_id is None:
-            raise RuntimeError("Custom search API key and CSE ID required")
+            raise RuntimeError("Custom search API key and CSE ID cannot be None.")
         self.api_key = api_key
         self.cse_id = cse_id
 

From 540701e3625cc8ef7af74d3c58c48c49f3f4afd3 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Sat, 30 Mar 2024 21:32:18 -0400
Subject: [PATCH 35/72] Add unit tests for GoogleSearcher in
 agency_homepage_searcher module

Unit tests have been added to test the functionality of the GoogleSearcher
class in the agency_homepage_searcher module. These tests cover
initialization, search functionality, and error handling, including specific
tests for exceeding API quota and runtime errors.
---
 Tests/test_agency_homepage_searcher_unit.py | 44 +++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 Tests/test_agency_homepage_searcher_unit.py

diff --git a/Tests/test_agency_homepage_searcher_unit.py b/Tests/test_agency_homepage_searcher_unit.py
new file mode 100644
index 0000000..a675e39
--- /dev/null
+++ b/Tests/test_agency_homepage_searcher_unit.py
@@ -0,0 +1,44 @@
+import pytest
+from agency_homepage_searcher.google_searcher import GoogleSearcher, QuotaExceededError
+from googleapiclient.errors import HttpError
+from unittest.mock import Mock
+
+class TestGoogleSearcher:
+
+    @pytest.fixture
+    def google_searcher(self, mocker):
+        api_key = "test_api_key"
+        cse_id = "test_cse_id"
+        mock_service = mocker.patch("agency_homepage_searcher.google_searcher.build")
+
+        # Create a mock for the Google API service object and set it as the return_value for the 'build' method
+        mock_google_api_service = mocker.Mock()
+        mock_service.return_value = mock_google_api_service
+        return GoogleSearcher(api_key, cse_id)
+
+    def test_init(self, google_searcher):
+        assert google_searcher.api_key == "test_api_key"
+        assert google_searcher.cse_id == "test_cse_id"
+
+    def test_init_with_None_api_key_or_cse_id(self):
+        with pytest.raises(RuntimeError):
+            GoogleSearcher(None, "test_cse_id")
+        with pytest.raises(RuntimeError):
+            GoogleSearcher("test_api_key", None)
+
+    def test_search(self, google_searcher):
+        google_searcher.service.cse().list().execute.return_value = {
+            'items': 'result'
+        }
+        items = google_searcher.search("query")
+        assert items == "result"
+
+    def test_search_with_http_error_quota_exceeded(self, google_searcher):
+        google_searcher.service.cse().list().execute.side_effect = HttpError(Mock(), "Quota exceeded".encode())
+        with pytest.raises(QuotaExceededError):
+            google_searcher.search("query")
+
+    def test_search_with_http_error(self, google_searcher):
+        google_searcher.service.cse().list().execute.side_effect = HttpError(Mock(), "error".encode())
+        with pytest.raises(RuntimeError):
+            google_searcher.search("query")
These tests cover initialization, search functionality, and error handling, including specific tests for exceeding API quota and runtime errors. --- Tests/test_agency_homepage_searcher_unit.py | 44 +++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 Tests/test_agency_homepage_searcher_unit.py diff --git a/Tests/test_agency_homepage_searcher_unit.py b/Tests/test_agency_homepage_searcher_unit.py new file mode 100644 index 0000000..a675e39 --- /dev/null +++ b/Tests/test_agency_homepage_searcher_unit.py @@ -0,0 +1,44 @@ +import pytest +from agency_homepage_searcher.google_searcher import GoogleSearcher, QuotaExceededError +from googleapiclient.errors import HttpError +from unittest.mock import Mock + +class TestGoogleSearcher: + + @pytest.fixture + def google_searcher(self, mocker): + api_key = "test_api_key" + cse_id = "test_cse_id" + mock_service = mocker.patch("agency_homepage_searcher.google_searcher.build") + + # Create a mock for the Google API service object and set it as the return_value for the 'build' method + mock_google_api_service = mocker.Mock() + mock_service.return_value = mock_google_api_service + return GoogleSearcher(api_key, cse_id) + + def test_init(self, google_searcher): + assert google_searcher.api_key == "test_api_key" + assert google_searcher.cse_id == "test_cse_id" + + def test_init_with_None_api_key_or_cse_id(self): + with pytest.raises(RuntimeError): + GoogleSearcher(None, "test_cse_id") + with pytest.raises(RuntimeError): + GoogleSearcher("test_api_key", None) + + def test_search(self, google_searcher): + google_searcher.service.cse().list().execute.return_value = { + 'items': 'result' + } + items = google_searcher.search("query") + assert items == "result" + + def test_search_with_http_error_quota_exceeded(self, google_searcher): + google_searcher.service.cse().list().execute.side_effect = HttpError(Mock(), "Quota exceeded".encode()) + with pytest.raises(QuotaExceededError): + google_searcher.search("query") + + def test_search_with_http_error(self, google_searcher): + google_searcher.service.cse().list().execute.side_effect = HttpError(Mock(), "error".encode()) + with pytest.raises(RuntimeError): + google_searcher.search("query") From e2fe90dd2ebc8a0c47df7bee4825fdb59b557e45 Mon Sep 17 00:00:00 2001 From: maxachis Date: Sun, 31 Mar 2024 08:31:41 -0400 Subject: [PATCH 36/72] Update print statement after search completion Revised the success message after search to include the unique dataset URL on HuggingFace. Instead of just stating the local file path, it will now give the exact URL where the dataset has been uploaded on HuggingFace's platform for a more straightforward navigation to the uploaded datasets. 
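
As a rough sketch of the new message format (the repo_id value below is only an example; the real value comes from the configured HuggingFaceAPIManager at runtime):

    # Hypothetical values, for illustration only
    repo_id = "PDAP/possible_homepage_urls"
    num_results = 7
    print(f"Uploaded {num_results} search results to HuggingFace: "
          f"huggingface.co/datasets/{repo_id}")
    # -> Uploaded 7 search results to HuggingFace:
    #    huggingface.co/datasets/PDAP/possible_homepage_urls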
--- agency_homepage_searcher/homepage_searcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 5c86832..6af7279 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -302,7 +302,8 @@ def search_and_upload(self, max_searches: int = MAX_SEARCHES) -> None: local_file_path=temp_file_path, repo_file_path=f"/data/search_results_{timestamp}.csv" ) - print(f"Uploaded {len(search_results)} search results to HuggingFace: {temp_file_path}") + print(f"Uploaded {len(search_results)} search results to HuggingFace: " + f"huggingface.co/datasets/{self.huggingface_api_manager.repo_id}") temp_file_path.unlink() # Clean up the temporary file agency_ids = [search_result.agency_id for search_result in search_results] self.update_search_cache(agency_ids) From f7a7665f3f6f59ced820d5982deda86ea9506f04 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 1 Apr 2024 08:02:21 -0400 Subject: [PATCH 37/72] Refactor exception handling, search process and added upload to HuggingFace Refactored the exception handling in the csv writer process to use the standard Exception class. Altered 'get_agencies_without_homepage_urls' to return a list and reflect this change in variable naming. Added an explicit upload to HuggingFace function and a success message to print the HuggingFace dataset URL after upload. This enhances user experience by providing direct access to the uploaded datasets. --- agency_homepage_searcher/homepage_searcher.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 6af7279..9ec53c8 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -261,7 +261,7 @@ def _write_search_result_to_csv(search_result: SearchResults, writer: csv.writer for possible_homepage_url in search_result.search_results: writer.writerow([search_result.agency_id, possible_homepage_url.url, possible_homepage_url.snippet]) except Exception as e: - raise f"An unexpected error occurred while writing search results for {search_result.agency_id}: {e}" + raise Exception(f"An unexpected error occurred while writing search results for {search_result.agency_id}: {e}") def update_search_cache(self, agency_ids: list[str]) -> None: """ @@ -292,10 +292,22 @@ def search_and_upload(self, max_searches: int = MAX_SEARCHES) -> None: max_searches: the maximum number of searches to perform Returns: None """ - agencies = self.get_agencies_without_homepage_urls() + agency_info_list = self.get_agencies_without_homepage_urls() print(f"Searching for homepage URLs for first {max_searches} agencies...") - search_results = self.search_until_limit_reached(agency_info_list=agencies, max_searches=max_searches) + search_results = self.search_until_limit_reached(agency_info_list=agency_info_list, max_searches=max_searches) print(f"Obtained {len(search_results)} search results") + self.upload_to_huggingface(search_results) + agency_ids = [search_result.agency_id for search_result in search_results] + self.update_search_cache(agency_ids) + + def upload_to_huggingface(self, search_results: List[SearchResults]) -> None: + """ + Uploads search results to HuggingFace. + Args: + search_results (List): List of search results to upload. 
+ Returns: + None + """ temp_file_path = self.write_to_temporary_csv(search_results) timestamp = get_filename_friendly_timestamp() self.huggingface_api_manager.upload_file( @@ -305,5 +317,3 @@ def search_and_upload(self, max_searches: int = MAX_SEARCHES) -> None: print(f"Uploaded {len(search_results)} search results to HuggingFace: " f"huggingface.co/datasets/{self.huggingface_api_manager.repo_id}") temp_file_path.unlink() # Clean up the temporary file - agency_ids = [search_result.agency_id for search_result in search_results] - self.update_search_cache(agency_ids) From 42d08219d1e15b6d59ea93ba5d9e91acf813b5b5 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 1 Apr 2024 08:02:40 -0400 Subject: [PATCH 38/72] Expand unit tests for `homepage_searcher` and `google_searcher` Expanded the unit tests for `homepage_searcher` and `google_searcher` modules, now covering more scenarios and conditions. These include testing the `search_and_upload`, `upload_to_huggingface` and multiple `search` methods and the handling of exceptions. In addition to that, variables have been checked to ensure they were called with the expected arguments, enhancing the reliability and robustness of the codebase. --- Tests/test_agency_homepage_searcher_unit.py | 282 ++++++++++++++++++++ 1 file changed, 282 insertions(+) diff --git a/Tests/test_agency_homepage_searcher_unit.py b/Tests/test_agency_homepage_searcher_unit.py index a675e39..f336b82 100644 --- a/Tests/test_agency_homepage_searcher_unit.py +++ b/Tests/test_agency_homepage_searcher_unit.py @@ -1,7 +1,33 @@ +import os +import tempfile +from typing import io + import pytest from agency_homepage_searcher.google_searcher import GoogleSearcher, QuotaExceededError from googleapiclient.errors import HttpError from unittest.mock import Mock +# Assuming every class or constant being used in HomepageSearcher is imported +from agency_homepage_searcher import homepage_searcher +import csv +import pytest +from pathlib import Path +from unittest.mock import MagicMock + +# Assuming every class or constant being used in HomepageSearcher is imported +from agency_homepage_searcher.homepage_searcher import HomepageSearcher, AgencyInfo, GoogleSearcher, DBManager, \ + HuggingFaceAPIManager, PossibleHomepageURL + +# Following PEP 8, file's appropriate path is added before import. 
+from agency_homepage_searcher.homepage_searcher import ( + STATE_ISO_TO_NAME_DICT, + SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS, + MAX_SEARCHES, + SQL_UPDATE_CACHE, + SearchResults, + QuotaExceededError, + get_filename_friendly_timestamp +) + class TestGoogleSearcher: @@ -42,3 +68,259 @@ def test_search_with_http_error(self, google_searcher): google_searcher.service.cse().list().execute.side_effect = HttpError(Mock(), "error".encode()) with pytest.raises(RuntimeError): google_searcher.search("query") + + +class TestHomepageSearcher: + + @pytest.fixture + def test_homepage_searcher(mocker): + return HomepageSearcher( + search_engine=Mock(), + database_manager=Mock(), + huggingface_api_manager=Mock()) + + def test_search_and_upload(self, mocker, test_homepage_searcher, monkeypatch): + # Provide fake array for "get_agencies_without_homepage_urls" + mock_agency_info_list = [MagicMock(), MagicMock(), MagicMock()] + mock_get_agencies_without_homepage_urls = mocker.Mock(return_value=mock_agency_info_list) + test_homepage_searcher.get_agencies_without_homepage_urls = mock_get_agencies_without_homepage_urls + + # Provide fake array of Search Results for "search_until_limit_reached" method + mock_search_results = [] + mock_agency_ids = [] + for i in range(3): + mock_search_result = MagicMock(spec=SearchResults) + mock_search_result.agency_id = i + mock_search_results.append(mock_search_result) + mock_agency_ids.append(i) + mock_search_until_limit_reached = mocker.Mock(return_value=mock_search_results) + test_homepage_searcher.search_until_limit_reached = mock_search_until_limit_reached + + # Mock upload_to_huggingface method + mock_upload_to_huggingface = MagicMock(return_value=None) + test_homepage_searcher.upload_to_huggingface = mock_upload_to_huggingface + + # Mock update_search_cache method + mock_update_search_cache = MagicMock() + test_homepage_searcher.update_search_cache = mock_update_search_cache + + test_homepage_searcher.search_and_upload(max_searches=10) + + # Test all mocked functions called with proper arguments + mock_get_agencies_without_homepage_urls.assert_called_once() + mock_search_until_limit_reached.assert_called_once_with(agency_info_list=mock_agency_info_list, max_searches=10) + mock_upload_to_huggingface.assert_called_once() + mock_update_search_cache.assert_called_once_with(mock_agency_ids) + + def test_upload_to_huggingface(self, mocker, test_homepage_searcher, monkeypatch): + test_homepage_searcher.huggingface_api_manager.repo_id = "TestOrg/TestDataset" + + # Mock write_to_temporary_csv method + mock_file_path = MagicMock(spec=Path, return_value="fake_dir/fake_name.csv") + mock_write_to_temporary_csv = MagicMock(return_value=mock_file_path) + test_homepage_searcher.write_to_temporary_csv = mock_write_to_temporary_csv + + # Mock get_filename_friendly_timestamp_method + mock_filename_friendly_timestamp = "YYYY-MM-DD_hh-mm-ss" + monkeypatch.setattr( + target='agency_homepage_searcher.homepage_searcher.get_filename_friendly_timestamp', + name=lambda: mock_filename_friendly_timestamp + ) + + # Run upload_to_huggingface with mock methods + mock_search_results = [MagicMock(), MagicMock(), MagicMock()] + test_homepage_searcher.upload_to_huggingface(mock_search_results) + + # Assert all functions called with necessary arguments + mock_write_to_temporary_csv.assert_called_with(mock_search_results) + mock_file_path.unlink.assert_called_once() + test_homepage_searcher.huggingface_api_manager.upload_file.assert_called_once_with( + local_file_path=mock_file_path, + 
repo_file_path=f"/data/search_results_{mock_filename_friendly_timestamp}.csv" + ) + + def test_search_agency_info_success(self, test_homepage_searcher): + expected_result = MagicMock(spec=SearchResults) + + mock_search = MagicMock(return_value=expected_result) + test_homepage_searcher.search = mock_search + + mock_agency_info = MagicMock(spec=AgencyInfo) + result = test_homepage_searcher._try_search_agency_info(mock_agency_info) + + assert result == expected_result + + def test_search_agency_info_exception(self, test_homepage_searcher): + expected_result = [] + mock_agency_info = MagicMock(spec=AgencyInfo) + + mock_search = MagicMock(return_value=MagicMock(spec=SearchResults)) + mock_search.side_effect = Exception + test_homepage_searcher.search = mock_search + + result = test_homepage_searcher._try_search_agency_info(mock_agency_info) + + assert result == expected_result + + def test_update_search_cache(self, test_homepage_searcher): + # Create test parameter + test_agency_ids = ["test1", "test2", "test3"] + + # Configure the Mock DB manager to ensure update SQL is called + test_homepage_searcher.database_manager.executemany = MagicMock() + + # Call the function with our test data + test_homepage_searcher.update_search_cache(test_agency_ids) + + # Check that executemany was called with the expected arguments + test_homepage_searcher.database_manager.executemany.assert_called_once_with(SQL_UPDATE_CACHE, + [(agency_id,) for agency_id in + test_agency_ids]) + + def test_write_search_result_to_csv_success(self, test_homepage_searcher): + search_results = [PossibleHomepageURL("http://example.com", "example snippet"), + PossibleHomepageURL("http://test.com", "test snippet")] + search_result = Mock() + search_result.agency_id = "test_agency" + search_result.search_results = search_results + + writer_mock = MagicMock() + + HomepageSearcher._write_search_result_to_csv(search_result, writer_mock) + + + assert writer_mock.writerow.call_count == 2 + + def test_write_search_result_to_csv_failure(self, test_homepage_searcher): + search_results = [PossibleHomepageURL("http://example.com", "example snippet"), + PossibleHomepageURL("http://test.com", "test snippet")] + search_result = Mock() + search_result.agency_id = "test_agency" + search_result.search_results = search_results + + writer_mock = MagicMock() + writer_mock.writerow.side_effect = Exception() + + with pytest.raises(Exception): + HomepageSearcher._write_search_result_to_csv(search_result, writer_mock) + + @pytest.fixture() + def tmp_csv_setup_teardown(self): + yield open(file='test_tmpfile.csv', mode='w', encoding='utf-8') + os.remove('test_tmpfile.csv') + + def test_write_to_temporary_csv(self, test_homepage_searcher, monkeypatch, tmp_csv_setup_teardown): + open_tmpfile_csv_lambda = MagicMock(return_value=tmp_csv_setup_teardown) + monkeypatch.setattr( + "agency_homepage_searcher.homepage_searcher.tempfile.NamedTemporaryFile" + , open_tmpfile_csv_lambda) + + test_agency_id = "test_agency" + mock_search_results = [ + MagicMock(agency_id=test_agency_id, url="https://example.com", snippet="An example website."), + MagicMock(agency_id=test_agency_id, url="https://test.com", snippet="A test website."), + MagicMock(agency_id=test_agency_id, url="https://python.com", snippet="Python's official website."), + ] + search_results = [SearchResults(test_agency_id, mock_search_results)] + + tmp_filepath = test_homepage_searcher.write_to_temporary_csv(search_results) + assert isinstance(tmp_filepath, Path) + + with open(tmp_filepath, 'r') as f: + reader = 
csv.reader(f) + header = next(reader) + assert header == ["agency_id", "url", "snippet"] + + row_count = 0 + for i, row in enumerate(reader): + assert row == [search_results[0].agency_id, mock_search_results[i].url, mock_search_results[i].snippet] + row_count += 1 + assert row_count == 3 + + + @pytest.fixture + def mock_agencies(self): + return [MagicMock(spec=AgencyInfo) for _ in range(10)] + + @pytest.fixture + def mock_search_results(self): + return [MagicMock(spec=SearchResults) for _ in range(10)] + + def test_reach_max_searches(self, test_homepage_searcher, mock_agencies, mock_search_results): + # Set the search method to return a search result each time + test_homepage_searcher.search = MagicMock(side_effect=mock_search_results) + results = test_homepage_searcher.search_until_limit_reached(mock_agencies, max_searches=5) + assert len(results) == 5 + + def test_handle_exception(self, test_homepage_searcher, mock_agencies, mock_search_results): + # Set the search method to raise an exception + test_homepage_searcher.search = MagicMock(side_effect=Exception()) + test_homepage_searcher._handle_search_error = MagicMock() + results = test_homepage_searcher.search_until_limit_reached(mock_agencies, max_searches=5) + test_homepage_searcher._handle_search_error.assert_called() + + def test_quota_exceeded(self, test_homepage_searcher, mock_agencies, mock_search_results): + # Set the search method to return None one time and search results the rest of the times + test_homepage_searcher.search = MagicMock(side_effect=[None] + mock_search_results) + results = test_homepage_searcher.search_until_limit_reached(mock_agencies, max_searches=5) + assert len(results) == 0 + + @pytest.fixture + def mock_agency_info(self): + mock_agency_info = MagicMock(spec=AgencyInfo) + mock_agency_info.agency_id = 'id' + mock_agency_info.name = 'name' + return mock_agency_info + + def test_search_with_results(self, test_homepage_searcher, mock_agency_info): + test_homepage_searcher.search_engine.search = MagicMock(return_value=[{'link': 'http://test.com', 'snippet': 'test snippet'}]) + + result = test_homepage_searcher.search(mock_agency_info) + + assert isinstance(result, SearchResults) + assert result.agency_id == 'id' + assert len(result.search_results) == 1 + assert result.search_results[0].url == 'http://test.com' + + def test_search_without_results(self, test_homepage_searcher, mock_agency_info): + test_homepage_searcher.search_engine.search = MagicMock(return_value=[]) + + result = test_homepage_searcher.search(mock_agency_info) + + assert isinstance(result, SearchResults) + assert result.agency_id == 'id' + assert len(result.search_results) == 0 + + def test_search_quota_exceeded(self, test_homepage_searcher, mock_agency_info): + test_homepage_searcher.search_engine.search = MagicMock(side_effect=QuotaExceededError) + + result = test_homepage_searcher.search(mock_agency_info) + + assert result is None + + @pytest.fixture + def sample_valid_agency_row(self, ): + return ['Test Agency', 'Federal', 'CA', 'San Francisco', 'Alameda', + '5141', '5141', '94105'] + + @pytest.fixture + def sample_invalid_agency_row(self): + return ['Invalid Agency', 'Federal', 'XX', 'Invalid City', 'Invalid County', + '9999', '9999', '99999'] + + def test_create_agency_info_with_valid_agency_row(self, sample_valid_agency_row): + expected_agency_info = AgencyInfo( + agency_name='Test Agency', + city='San Francisco', + state='California', + county='Alameda', + zip_code='94105', + website=None, + agency_type='Federal', + agency_id='5141' + 
) + assert HomepageSearcher.create_agency_info(sample_valid_agency_row) == expected_agency_info + + def test_create_agency_info_with_invalid_agency_row(self, sample_invalid_agency_row): + with pytest.raises(ValueError): + HomepageSearcher.create_agency_info(sample_invalid_agency_row) \ No newline at end of file From ec90bfa159293d93d4075881f6cef48efc87c230 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 1 Apr 2024 08:08:39 -0400 Subject: [PATCH 39/72] Refined error message in homepage_searcher module Modified the error message in homepage_searcher.py to include both error type and error message for more specific debugging. This change works to better identify the nature of the issues when they occur during the runtime and helps in diagnosing and rectifying problems more efficiently. --- agency_homepage_searcher/homepage_searcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 9ec53c8..e12476c 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -219,7 +219,7 @@ def _handle_search_error(error: Exception, search_results: list[SearchResults]) Returns: list[SearchResults]: The existing search results. """ - print(f"An error occurred while searching: {error}") + print(f"An error occurred while searching. Error type: {type(error).__name__}, Error message: {error}") print("Returning existing search results") return search_results From f41573f28b231a34bfb0bc20111907ed83894b9e Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 1 Apr 2024 13:20:29 -0400 Subject: [PATCH 40/72] Simplified test_agency_homepage_searcher_unit.py imports Removed unnecessary imports in the file test_agency_homepage_searcher_unit.py. The cleanup adds to the readability of the file and supports more efficient debugging by avoiding unnecessary complexity. --- Tests/test_agency_homepage_searcher_unit.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/Tests/test_agency_homepage_searcher_unit.py b/Tests/test_agency_homepage_searcher_unit.py index f336b82..1c825b1 100644 --- a/Tests/test_agency_homepage_searcher_unit.py +++ b/Tests/test_agency_homepage_searcher_unit.py @@ -1,31 +1,18 @@ import os -import tempfile -from typing import io - -import pytest -from agency_homepage_searcher.google_searcher import GoogleSearcher, QuotaExceededError from googleapiclient.errors import HttpError from unittest.mock import Mock -# Assuming every class or constant being used in HomepageSearcher is imported -from agency_homepage_searcher import homepage_searcher import csv import pytest from pathlib import Path from unittest.mock import MagicMock -# Assuming every class or constant being used in HomepageSearcher is imported -from agency_homepage_searcher.homepage_searcher import HomepageSearcher, AgencyInfo, GoogleSearcher, DBManager, \ - HuggingFaceAPIManager, PossibleHomepageURL +from agency_homepage_searcher.homepage_searcher import HomepageSearcher, AgencyInfo, GoogleSearcher, \ + PossibleHomepageURL -# Following PEP 8, file's appropriate path is added before import. 
from agency_homepage_searcher.homepage_searcher import ( - STATE_ISO_TO_NAME_DICT, - SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS, - MAX_SEARCHES, SQL_UPDATE_CACHE, SearchResults, QuotaExceededError, - get_filename_friendly_timestamp ) From a72304b0118dc2f39aaba2f65c392d4d3454b7ab Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 1 Apr 2024 16:53:37 -0400 Subject: [PATCH 41/72] Add test_agency_homepage_searcher_integration.py This commit introduces a new test file, test_agency_homepage_searcher_integration.py. The file contains an integration test for the HomepageSearcher class in the agency_homepage_searcher module, validating expected interactions with the Google API, database manager, and HuggingFace API manager. --- ...st_agency_homepage_searcher_integration.py | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 Tests/test_agency_homepage_searcher_integration.py diff --git a/Tests/test_agency_homepage_searcher_integration.py b/Tests/test_agency_homepage_searcher_integration.py new file mode 100644 index 0000000..35f8bcd --- /dev/null +++ b/Tests/test_agency_homepage_searcher_integration.py @@ -0,0 +1,134 @@ +import csv +from typing import List +from unittest.mock import MagicMock + +import pytest + +from agency_homepage_searcher.agency_info import AgencyInfo +from agency_homepage_searcher.google_searcher import GoogleSearcher +from agency_homepage_searcher.homepage_searcher import HomepageSearcher, SearchResults +from util.db_manager import DBManager +from util.huggingface_api_manager import HuggingFaceAPIManager + +FAKE_SEARCH_ROW_COUNT = 10 + + +@pytest.fixture +def google_searcher(mocker): + api_key = "test_api_key" + cse_id = "test_cse_id" + mock_service = mocker.patch("agency_homepage_searcher.google_searcher.build") + + # Create a mock for the Google API service object and set it as the return_value for the 'build' method + mock_google_api_service = mocker.Mock() + mock_service.return_value = mock_google_api_service + return GoogleSearcher(api_key, cse_id) + +def get_fake_agency_info() -> AgencyInfo: + """ + Retr + Returns: + + """ + return AgencyInfo( + agency_name="Agency Police Agency", + city="Cityopolis", + state="PA", # Must be an actual state because it is put in the STATE_ISO_TO_NAME_DICT in homepage_searcher.py + county="Horborgor", + zip_code="31415", + website=None, + agency_type="Police Agency", + agency_id="abcdefghijklmnop" + ) + +def convert_agency_info_to_list(agency_info: AgencyInfo) -> list: + return [ + agency_info.agency_name, # 0 + agency_info.agency_type, # 1 + agency_info.state, # 2 + agency_info.city, # 3 + agency_info.county, # 4 + agency_info.agency_id, # 5 + agency_info.website, # 6 + agency_info.zip_code # 7 + ] + +def validate_search_query(query_string): + agency_info_list = convert_agency_info_to_list(get_fake_agency_info()) + for item in agency_info_list: + if item is None: + continue + assert item in query_string, f"Item {item} not found in query string {query_string}" + +def validate_agency(agency_ids: list[str]): + agency_id = get_fake_agency_info().agency_id + assert len(agency_ids) == 1 + assert agency_id == agency_ids[0], f"Agency ID {agency_id} not in expected argument ({agency_ids})" + +def mock_database_query(query_string): + return convert_agency_info_to_list(get_fake_agency_info()) + +def mock_search(q, cx): + + # Validate query is correct + validate_search_query(q) + + # Return fake data + return get_fake_search_data() + +def get_fake_search_data(): + """ + Generate fake search data + Returns: + + """ + 
fake_search_data = {'items': []} + for i in range(1, FAKE_SEARCH_ROW_COUNT + 1): + number = i + # ASCII value of 'a' is 97, so we add i - 1 to it to get the incremental letter + letter = chr(97 + (i - 1) % 26) # Use modulo 26 to loop back to 'a' after 'z' + fake_search_data['items'].append( + { + 'link': f'https://www.example.com/{number}', + 'snippet': f'This snippet contains the letter {letter}' + } + ) + return fake_search_data + + +def validate_upload_to_huggingface(search_results: List[SearchResults]) -> None: + fake_search_data_list = get_fake_search_data()['items'] + fake_agency_id = get_fake_agency_info().agency_id + + # Check there is only one search result + assert len(search_results) == 1, "There should be only one search result pass to upload_to_huggingface" + search_result = search_results[0] + assert search_result.agency_id == fake_agency_id, f"Search result agency id should match {fake_agency_id}, is {search_result.agency_id}" + assert len(search_result.search_results) == FAKE_SEARCH_ROW_COUNT, f"Number of search results should be {FAKE_SEARCH_ROW_COUNT}, is {len(search_result.search_results)}" + for i in range(FAKE_SEARCH_ROW_COUNT): + fake_search_data = fake_search_data_list[i] + possible_homepage_url = search_result.search_results[i] + assert fake_search_data['link'] == possible_homepage_url.url, f"Search result link {fake_search_data['link']} should match {possible_homepage_url.url}" + assert fake_search_data['snippet'] == possible_homepage_url.snippet, f"Search result snippet {fake_search_data['snippet']} should match {possible_homepage_url.snippet}" + +def test_agency_homepage_searcher_integration(monkeypatch, google_searcher): + + # Patch Google Searcher so that search call returns fake data + google_searcher.service.cse().list().execute.return_value = get_fake_search_data() + + homepage_searcher = HomepageSearcher( + search_engine=google_searcher, + database_manager=MagicMock(spec=DBManager), + huggingface_api_manager=MagicMock(spec=HuggingFaceAPIManager) + ) + + # Mock methods in homepage searcher that interface with external sources + # update_search_cache - verifies proper IDs + # get_agencies_without_homepage_urls - return list of fake agency info + # upload_to_huggingface - verifies proper search results + homepage_searcher.update_search_cache = validate_agency + homepage_searcher.get_agencies_without_homepage_urls = lambda: [get_fake_agency_info()] + homepage_searcher.upload_to_huggingface = validate_upload_to_huggingface + + homepage_searcher.search_and_upload(1) + From 3d629dcb2fc2f25b052ec20bb3bf08414960be74 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 1 Apr 2024 17:47:35 -0400 Subject: [PATCH 42/72] Fix import issues in agency_homepage_searcher Added a few lines of code to set the working directory to the root of the repository. This modification aims to fix the persistent import issues occurring in the 'agency_homepage_searcher' script. --- agency_homepage_searcher/main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/agency_homepage_searcher/main.py b/agency_homepage_searcher/main.py index cf7c9d6..22e1633 100644 --- a/agency_homepage_searcher/main.py +++ b/agency_homepage_searcher/main.py @@ -1,7 +1,12 @@ +import sys import os from dotenv import load_dotenv +# The below code sets the working directory to be the root of the entire repository +# This is done to solve otherwise quite annoying import issues. 
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
 from agency_homepage_searcher.google_searcher import GoogleSearcher
 from agency_homepage_searcher.homepage_searcher import HomepageSearcher
 from util.db_manager import DBManager

From 585f79ab762c0b5a8aec12d91ffbee3df69d4068 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Tue, 2 Apr 2024 17:45:51 -0400
Subject: [PATCH 43/72] Refine search string generation in agency_info.py

Added the regular expressions library and a cleanup step to the search string generation method in the 'agency_homepage_searcher' module, removing unwanted characters such as brackets, parentheses, and quotes from search strings. The `website` field, which is always None for agencies being searched, is also dropped from the search string.
---
 agency_homepage_searcher/agency_info.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/agency_homepage_searcher/agency_info.py b/agency_homepage_searcher/agency_info.py
index adbbb56..494a753 100644
--- a/agency_homepage_searcher/agency_info.py
+++ b/agency_homepage_searcher/agency_info.py
@@ -1,7 +1,7 @@
 """
 A dataclass containing all information relevant for searching for an agency's homepage.
 """
-
+import re
 from dataclasses import dataclass
 from typing import Union
 
@@ -28,5 +28,7 @@ def get_search_string(self) -> str:
         Constructs the search string to be used in search engines.
         """
         search_string = (f"{self.agency_name} {self.city} {self.state} {self.county} "
-                         f"{self.zip_code} {self.website} {self.agency_type}")
-        return search_string
+                         f"{self.zip_code} {self.agency_type}")
+        # Strip brackets, parentheses, and quotes from search strings
+        pattern = r'[\[\]\"\'\(\)]'
+        return re.sub(pattern, "", search_string)

From d9b3758364eb489cfbf92afe87377735140e101a Mon Sep 17 00:00:00 2001
From: maxachis
Date: Tue, 2 Apr 2024 17:46:10 -0400
Subject: [PATCH 44/72] Update pytest module and psycopg dependency in requirements.txt

Added the pytest-postgresql plugin and switched to the recommended psycopg[binary] package. These updates improve the testing process and facilitate proper PostgreSQL integration.
---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ca4f0e4..4eb691f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,7 @@ bs4~=0.0.2
 tqdm~=4.66.2
 pytest~=8.0.1
 pytest-mock==3.12.0
+pytest-postgresql~=6.0.0
 urllib3~=1.26.18
 # openai-playground only
 openai>=1.14.2
@@ -26,5 +27,5 @@ scikit-learn>=1.4.1.post1
 nltk>=3.6.7
 # agency_homepage_searcher only
 google-api-python-client~=2.119.0
-psycopg2-binary~=2.9.6
+psycopg[binary]~=2.9.6
 huggingface-hub~=0.20.3
\ No newline at end of file

From 97e859fbb002850718572e2d8e3ab307ce22a2db Mon Sep 17 00:00:00 2001
From: maxachis
Date: Tue, 2 Apr 2024 17:46:32 -0400
Subject: [PATCH 45/72] Add SearchResultEnum and update SQL query in homepage_searcher.py

Introduced a new enumeration, SearchResultEnum, for better handling of search responses. Modified the SQL_UPDATE_CACHE query to include the new 'search_result' column, allowing for better tracking of search results in the database.
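
A minimal sketch of how a cache row is expected to be written once this lands (the agency ID below is made up; the enum and query are defined in the diff that follows):

    # Hypothetical usage pairing the new enum with the two-column insert
    parameters = [("abc123uid", SearchResultEnum.FOUND_RESULTS.value)]
    database_manager.executemany(SQL_UPDATE_CACHE, parameters)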
--- agency_homepage_searcher/homepage_searcher.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index e12476c..6ec7fb3 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import List, Union from dotenv import load_dotenv +from enum import Enum from agency_homepage_searcher.agency_info import AgencyInfo from agency_homepage_searcher.google_searcher import GoogleSearcher, QuotaExceededError @@ -14,6 +15,13 @@ MAX_SEARCHES = 100 # Maximum searches to perform at a time when searching for results +class SearchResultEnum(Enum): + """ + This enum corresponds to an enum column in AGENCY_URL_SEARCH_CACHE + """ + FOUND_RESULTS = "found_results" + NO_RESULTS_FOUND = "no_results_found" + STATE_ISO_TO_NAME_DICT = { "AL": "Alabama", "AK": "Alaska", @@ -92,9 +100,9 @@ """ SQL_UPDATE_CACHE = """ - INSERT INTO PUBLIC.AGENCY_URL_SEARCH_CACHE - (agency_airtable_uid) - VALUES (%s) + INSERT INTO public.agency_url_search_cache + (agency_airtable_uid, search_result) + VALUES (%s, %s) """ From 96ed368ad5d44d9c9d5686457489b7cdd52cd96a Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 2 Apr 2024 17:46:47 -0400 Subject: [PATCH 46/72] Replace psycopg2 with psycopg in db_manager.py Switched from psycopg2 to psycopg library in the database manager. This change affects the connection establishment, fetching data from the database and handling database programming errors within the DBManager class. --- util/db_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util/db_manager.py b/util/db_manager.py index 0404d4d..1bc77f5 100644 --- a/util/db_manager.py +++ b/util/db_manager.py @@ -1,5 +1,5 @@ -import psycopg2 +import psycopg class DBManager: @@ -8,7 +8,7 @@ class DBManager: """ def __init__(self, db_name, user, password, host, port): - self.conn = psycopg2.connect( + self.conn = psycopg.connect( dbname=db_name, user=user, password=password, @@ -30,7 +30,7 @@ def executemany(self, query, params=None) -> list: self.conn.commit() try: return self.cursor.fetchall() - except psycopg2.ProgrammingError: + except psycopg.ProgrammingError: return [] def fetchall(self): From 88f08d37d3331414973bf43f6bb33e3675a3bb4b Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 2 Apr 2024 17:47:00 -0400 Subject: [PATCH 47/72] Add requirements for agency_homepage_searcher action Added a new requirements file to define the dependencies for the agency_homepage_searcher action. This includes specific versions for python-dotenv, google-api-python-client, psycopg2-binary, and huggingface-hub libraries. 
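
Assuming the eventual GitHub Action installs from this file, its setup step would look something like:

    pip install -r agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt

This keeps the action's dependency set much smaller than the repository-wide requirements.txt.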
--- .../requirements_agency_homepage_searcher_action.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt diff --git a/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt b/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt new file mode 100644 index 0000000..d3f58b0 --- /dev/null +++ b/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt @@ -0,0 +1,4 @@ +python-dotenv~=1.0.1 +google-api-python-client~=2.119.0 +psycopg2-binary~=2.9.6 +huggingface-hub~=0.20.3 \ No newline at end of file From 5cf2b86fbf4bed081337ac55eebdebfb32854d53 Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 2 Apr 2024 17:47:22 -0400 Subject: [PATCH 48/72] Add pytest_postgresql integration Added pytest_postgresql import to test_agency_homepage_searcher_integration.py for implementing integration testing. Readjusted code formatting to adhere to style guidelines, made changes to enhance readability. Included instructions and example for a PostgreSQL docker setup for testing, which needs to be moved to a README file in the future. --- ...st_agency_homepage_searcher_integration.py | 52 ++++++++++++++----- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/Tests/test_agency_homepage_searcher_integration.py b/Tests/test_agency_homepage_searcher_integration.py index 35f8bcd..54ae8b7 100644 --- a/Tests/test_agency_homepage_searcher_integration.py +++ b/Tests/test_agency_homepage_searcher_integration.py @@ -3,6 +3,7 @@ from unittest.mock import MagicMock import pytest +from pytest_postgresql import factories from agency_homepage_searcher.agency_info import AgencyInfo from agency_homepage_searcher.google_searcher import GoogleSearcher @@ -24,6 +25,7 @@ def google_searcher(mocker): mock_service.return_value = mock_google_api_service return GoogleSearcher(api_key, cse_id) + def get_fake_agency_info() -> AgencyInfo: """ Retr @@ -41,18 +43,20 @@ def get_fake_agency_info() -> AgencyInfo: agency_id="abcdefghijklmnop" ) + def convert_agency_info_to_list(agency_info: AgencyInfo) -> list: return [ - agency_info.agency_name, # 0 - agency_info.agency_type, # 1 - agency_info.state, # 2 - agency_info.city, # 3 - agency_info.county, # 4 - agency_info.agency_id, # 5 - agency_info.website, # 6 - agency_info.zip_code # 7 + agency_info.agency_name, # 0 + agency_info.agency_type, # 1 + agency_info.state, # 2 + agency_info.city, # 3 + agency_info.county, # 4 + agency_info.agency_id, # 5 + agency_info.website, # 6 + agency_info.zip_code # 7 ] + def validate_search_query(query_string): agency_info_list = convert_agency_info_to_list(get_fake_agency_info()) for item in agency_info_list: @@ -60,22 +64,25 @@ def validate_search_query(query_string): continue assert item in query_string, f"Item {item} not found in query string {query_string}" + def validate_agency(agency_ids: list[str]): agency_id = get_fake_agency_info().agency_id assert len(agency_ids) == 1 assert agency_id == agency_ids[0], f"Agency ID {agency_id} not in expected argument ({agency_ids})" + def mock_database_query(query_string): return convert_agency_info_to_list(get_fake_agency_info()) -def mock_search(q, cx): +def mock_search(q, cx): # Validate query is correct validate_search_query(q) # Return fake data return get_fake_search_data() + def get_fake_search_data(): """ Generate fake search data @@ -104,15 +111,18 @@ def validate_upload_to_huggingface(search_results: List[SearchResults]) -> None: assert len(search_results) == 
1, "There should be only one search result pass to upload_to_huggingface" search_result = search_results[0] assert search_result.agency_id == fake_agency_id, f"Search result agency id should match {fake_agency_id}, is {search_result.agency_id}" - assert len(search_result.search_results) == FAKE_SEARCH_ROW_COUNT, f"Number of search results should be {FAKE_SEARCH_ROW_COUNT}, is {len(search_result.search_results)}" + assert len( + search_result.search_results) == FAKE_SEARCH_ROW_COUNT, f"Number of search results should be {FAKE_SEARCH_ROW_COUNT}, is {len(search_result.search_results)}" for i in range(FAKE_SEARCH_ROW_COUNT): fake_search_data = fake_search_data_list[i] possible_homepage_url = search_result.search_results[i] - assert fake_search_data['link'] == possible_homepage_url.url, f"Search result link {fake_search_data['link']} should match {possible_homepage_url.url}" - assert fake_search_data['snippet'] == possible_homepage_url.snippet, f"Search result snippet {fake_search_data['snippet']} should match {possible_homepage_url.snippet}" + assert fake_search_data[ + 'link'] == possible_homepage_url.url, f"Search result link {fake_search_data['link']} should match {possible_homepage_url.url}" + assert fake_search_data[ + 'snippet'] == possible_homepage_url.snippet, f"Search result snippet {fake_search_data['snippet']} should match {possible_homepage_url.snippet}" -def test_agency_homepage_searcher_integration(monkeypatch, google_searcher): +def test_agency_homepage_searcher_integration(monkeypatch, google_searcher): # Patch Google Searcher so that search call returns fake data google_searcher.service.cse().list().execute.return_value = get_fake_search_data() @@ -132,3 +142,19 @@ def test_agency_homepage_searcher_integration(monkeypatch, google_searcher): homepage_searcher.search_and_upload(1) + +""" +This requires a postgresql docker container set up and listening on port 5432 with the password "mysecretpassword" +If you don't already have it installed in docker, run `docker pull postgres` +Then, run the following command: +docker run -p 5432:5432 --name some-postgres -e POSTGRES_PASSWORD=mysecretpassword -d postgres +With that up and running, the below code should work +TODO: Move this to a README, Max +""" +# +postgresql_in_docker = factories.postgresql_noproc(port="5432", password="mysecretpassword") +postgresql = factories.postgresql("postgresql_in_docker", dbname="test") + +def test_get_agencies_without_homepage_urls(postgresql): + cur = postgresql.cursor() + cur.execute("CREATE TABLE test (id serial PRIMARY KEY, num integer, data varchar);") From 71966ac458f5261cae36439642b68c0913336798 Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 2 Apr 2024 17:47:44 -0400 Subject: [PATCH 49/72] Refactor unit test and add test for character stripping Removed unnecessary blank lines to clean up the unit testing code for test agency homepage searcher. Extended the unit tests by adding a test case to verify that disallowed characters are being stripped correctly from the agency name search string. 
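
The behavior under test, sketched with the pattern introduced in agency_info.py earlier in this series (the sample agency name is invented):

    import re

    pattern = r'[\[\]\"\'\(\)]'  # pattern from AgencyInfo.get_search_string
    sample = 'Agency (Police) ["Dept"]'
    assert re.sub(pattern, "", sample) == "Agency Police Dept"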
--- Tests/test_agency_homepage_searcher_unit.py | 30 ++++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/Tests/test_agency_homepage_searcher_unit.py b/Tests/test_agency_homepage_searcher_unit.py index 1c825b1..68d5f20 100644 --- a/Tests/test_agency_homepage_searcher_unit.py +++ b/Tests/test_agency_homepage_searcher_unit.py @@ -175,7 +175,6 @@ def test_write_search_result_to_csv_success(self, test_homepage_searcher): HomepageSearcher._write_search_result_to_csv(search_result, writer_mock) - assert writer_mock.writerow.call_count == 2 def test_write_search_result_to_csv_failure(self, test_homepage_searcher): @@ -224,7 +223,6 @@ def test_write_to_temporary_csv(self, test_homepage_searcher, monkeypatch, tmp_c row_count += 1 assert row_count == 3 - @pytest.fixture def mock_agencies(self): return [MagicMock(spec=AgencyInfo) for _ in range(10)] @@ -260,7 +258,8 @@ def mock_agency_info(self): return mock_agency_info def test_search_with_results(self, test_homepage_searcher, mock_agency_info): - test_homepage_searcher.search_engine.search = MagicMock(return_value=[{'link': 'http://test.com', 'snippet': 'test snippet'}]) + test_homepage_searcher.search_engine.search = MagicMock( + return_value=[{'link': 'http://test.com', 'snippet': 'test snippet'}]) result = test_homepage_searcher.search(mock_agency_info) @@ -310,4 +309,27 @@ def test_create_agency_info_with_valid_agency_row(self, sample_valid_agency_row) def test_create_agency_info_with_invalid_agency_row(self, sample_invalid_agency_row): with pytest.raises(ValueError): - HomepageSearcher.create_agency_info(sample_invalid_agency_row) \ No newline at end of file + HomepageSearcher.create_agency_info(sample_invalid_agency_row) + + +def test_agency_info_get_search_string_character_strip(): + """ + Test that the get_search_string_character strip does not include disallowed characters + """ + disallowed_characters = ['[', ']', '\'', '\"', ')', '('] + agency_info = AgencyInfo( + agency_name='Agency', + city='San Francisco', + state='California', + county='Alameda', + zip_code='94105', + website=None, + agency_type='Federal', + agency_id='5141' + ) + for character in disallowed_characters: + agency_info.agency_name += character + search_string = agency_info.get_search_string() + for character in disallowed_characters: + assert character not in search_string, f'The character {character} is erroneously included in the search string {search_string}' + From 947662ff957bdd6be8d544e104f134286aa8bcaa Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 2 Apr 2024 17:47:55 -0400 Subject: [PATCH 50/72] Add README.md file for Agency Homepage Searcher module Introduces documentation for the Agency Homepage Searcher module, its functionality, environment setup, and execution. The README details the procedure of filling missing agency homepage data, requirements for execution, and gives a short guide on running the script. --- agency_homepage_searcher/README.md | 35 ++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 agency_homepage_searcher/README.md diff --git a/agency_homepage_searcher/README.md b/agency_homepage_searcher/README.md new file mode 100644 index 0000000..40ddd37 --- /dev/null +++ b/agency_homepage_searcher/README.md @@ -0,0 +1,35 @@ +# Agency Homepage Searcher + +### Single sentence summary + +This module uses google searches to fill in missing homepage data for agencies in the PDAP database. + +### Somewhat longer summary + +This module is designed to do the following: +1. 
Take existing data from the `AGENCIES` table in the PDAP PostgreSQL database
+2. Identify those agencies which lack a homepage
+3. Perform [automated Google searches](https://developers.google.com/custom-search/v1/overview) for potential homepages for the agency, using information from the database row
+4. Upload those automated searches to the PDAP Huggingface database at [PDAP/possible_homepage_urls](https://huggingface.co/datasets/PDAP/possible_homepage_urls)
+5. Update the AGENCY_URL_SEARCH_CACHE in the PDAP PostgreSQL database to ensure that rows in the `AGENCIES` table which have already been searched are not searched again
+
+## Environment Setup
+
+This script requires a number of environment variables to be provided in an associated `.env` file in the root directory in order to function correctly:
+
+* CUSTOM_SEARCH_API_KEY - The API key required for accessing the [Google Custom Search Engine](https://developers.google.com/custom-search/v1/overview).
+* CUSTOM_SEARCH_ENGINE_ID - The CSE (Custom Search Engine) ID required for identifying the specific search engine to use.
+* DIGITAL_OCEAN_DB_USERNAME - The username to be used for logging into the PostgreSQL database
+* DIGITAL_OCEAN_DB_PASSWORD - The password to be used for logging into the PostgreSQL database
+* DIGITAL_OCEAN_DB_HOST - The host to be used for logging into the PostgreSQL database
+* DIGITAL_OCEAN_DB_PORT - The port to be used for logging into the PostgreSQL database
+* DIGITAL_OCEAN_DB_NAME - The database name to be used for logging into the PostgreSQL database
+* HUGGINGFACE_ACCESS_TOKEN - An access token for a user with permissions to upload data to the [PDAP/possible_homepage_urls](https://huggingface.co/datasets/PDAP/possible_homepage_urls) dataset
+
+## Running script
+
+
+
+## Running tests
+
+TODO: Include notes on running integration test with database

From 8250d69b274c1270dc2b876aae9ead23847cafe7 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Tue, 2 Apr 2024 17:48:38 -0400
Subject: [PATCH 51/72] Create blank agency_homepage_searcher.yaml.

This file currently exists as a stand-in for a GitHub Action yaml file for automatically running the agency_homepage_searcher.
---
 .github/actions/agency_homepage_searcher.yaml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 .github/actions/agency_homepage_searcher.yaml

diff --git a/.github/actions/agency_homepage_searcher.yaml b/.github/actions/agency_homepage_searcher.yaml
new file mode 100644
index 0000000..e69de29

From ff1165bef55a6b9f3429470af4d4c65e554d6b33 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Wed, 10 Apr 2024 16:14:18 -0400
Subject: [PATCH 52/72] Improve search status handling and cache updating

Added `search_result_status` to the `SearchResults` class and restructured the `update_search_cache` method to handle a list of `SearchResults` instead of agency IDs only. This allows for more comprehensive search status tracking, improving the system's flexibility and accuracy in updating the search cache.
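
Roughly, a SearchResults instance now carries its outcome alongside the candidate URLs (field values below are illustrative; the dataclass change is in the diff that follows):

    result = SearchResults(
        agency_id="abc123uid",
        search_results=first_ten_results,
        search_result_status=SearchResultEnum.FOUND_RESULTS,
    )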
---
 agency_homepage_searcher/homepage_searcher.py | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py
index 6ec7fb3..20871f3 100644
--- a/agency_homepage_searcher/homepage_searcher.py
+++ b/agency_homepage_searcher/homepage_searcher.py
@@ -116,6 +116,7 @@ class PossibleHomepageURL:
 class SearchResults:
     agency_id: str
     search_results: List[PossibleHomepageURL]
+    search_result_status: SearchResultEnum
 
 
 class HomepageSearcher:
@@ -173,7 +174,15 @@ def search(self, agency_info: AgencyInfo) -> Union[SearchResults, None]:
                 query=agency_info.get_search_string()
             )
             first_ten_results = self._get_first_ten_results(search_results)
-            return SearchResults(agency_id=agency_info.agency_id, search_results=first_ten_results)
+            if len(first_ten_results) > 0:
+                search_result_status = SearchResultEnum.FOUND_RESULTS
+            else:
+                search_result_status = SearchResultEnum.NO_RESULTS_FOUND
+            return SearchResults(
+                agency_id=agency_info.agency_id,
+                search_results=first_ten_results,
+                search_result_status=search_result_status
+            )
         except QuotaExceededError:
             print("Quota exceeded")
             return None
@@ -271,13 +280,16 @@ def _write_search_result_to_csv(search_result: SearchResults, writer: csv.writer
         except Exception as e:
             raise Exception(f"An unexpected error occurred while writing search results for {search_result.agency_id}: {e}")
 
-    def update_search_cache(self, agency_ids: list[str]) -> None:
+    def update_search_cache(self, search_results: list[SearchResults]) -> None:
         """
         Updates the search cache for the given agency IDs.
         Args:
-            agency_ids: list[str] - the agency IDs to update
+            search_results: list[SearchResults] - the search results to record in the cache
         """
-        parameters = [(agency_id,) for agency_id in agency_ids]
+        parameters = []
+        for search_result in search_results:
+            parameter = (search_result.agency_id, search_result.search_result_status)
+            parameters.append(parameter)
         self.database_manager.executemany(SQL_UPDATE_CACHE, parameters)
 
     def _try_search_agency_info(self, agency_info: AgencyInfo) -> Union[SearchResults, List]:
@@ -305,8 +317,7 @@ def search_and_upload(self, max_searches: int = MAX_SEARCHES) -> None:
         search_results = self.search_until_limit_reached(agency_info_list=agency_info_list, max_searches=max_searches)
         print(f"Obtained {len(search_results)} search results")
         self.upload_to_huggingface(search_results)
-        agency_ids = [search_result.agency_id for search_result in search_results]
-        self.update_search_cache(agency_ids)
+        self.update_search_cache(search_results)
 
     def upload_to_huggingface(self, search_results: List[SearchResults]) -> None:
         """

From 9390affa097940793f399a96b1f07a3117f2aa80 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Wed, 10 Apr 2024 16:15:18 -0400
Subject: [PATCH 53/72] Refactor agency validation in search cache update

Changed the `validate_agency` helper to `validate_update_search_cache`, which handles `SearchResults` objects instead of just agency IDs. This keeps the integration test in step with the varied search result statuses, enhancing the search status handling and cache updating process.
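
In effect, the test's stand-in for update_search_cache now asserts on SearchResults objects rather than bare IDs, roughly:

    # Sketch of the renamed validator (mirrors the diff that follows)
    def validate_update_search_cache(search_results: list[SearchResults]) -> None:
        assert len(search_results) == 1
        assert search_results[0].agency_id == get_fake_agency_info().agency_id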
--- ...st_agency_homepage_searcher_integration.py | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/Tests/test_agency_homepage_searcher_integration.py b/Tests/test_agency_homepage_searcher_integration.py index 54ae8b7..bed426d 100644 --- a/Tests/test_agency_homepage_searcher_integration.py +++ b/Tests/test_agency_homepage_searcher_integration.py @@ -65,10 +65,10 @@ def validate_search_query(query_string): assert item in query_string, f"Item {item} not found in query string {query_string}" -def validate_agency(agency_ids: list[str]): +def validate_update_search_cache(search_results: list[SearchResults]): agency_id = get_fake_agency_info().agency_id - assert len(agency_ids) == 1 - assert agency_id == agency_ids[0], f"Agency ID {agency_id} not in expected argument ({agency_ids})" + assert len(search_results) == 1 + assert agency_id == search_results[0].agency_id, f"Agency ID {agency_id} not in expected argument ({search_results[0].agency_id})" def mock_database_query(query_string): @@ -136,25 +136,9 @@ def test_agency_homepage_searcher_integration(monkeypatch, google_searcher): # update_search_cache - verifies proper IDs # get_agencies_without_homepage_urls - return list of fake agency info # upload_to_huggingface - verifies proper search results - homepage_searcher.update_search_cache = validate_agency + homepage_searcher.update_search_cache = validate_update_search_cache homepage_searcher.get_agencies_without_homepage_urls = lambda: [get_fake_agency_info()] homepage_searcher.upload_to_huggingface = validate_upload_to_huggingface homepage_searcher.search_and_upload(1) - -""" -This requires a postgresql docker container set up and listening on port 5432 with the password "mysecretpassword" -If you don't already have it installed in docker, run `docker pull postgres` -Then, run the following command: -docker run -p 5432:5432 --name some-postgres -e POSTGRES_PASSWORD=mysecretpassword -d postgres -With that up and running, the below code should work -TODO: Move this to a README, Max -""" -# -postgresql_in_docker = factories.postgresql_noproc(port="5432", password="mysecretpassword") -postgresql = factories.postgresql("postgresql_in_docker", dbname="test") - -def test_get_agencies_without_homepage_urls(postgresql): - cur = postgresql.cursor() - cur.execute("CREATE TABLE test (id serial PRIMARY KEY, num integer, data varchar);") From ad0a5ab0314616e568275ef848946ce957900292 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 10 Apr 2024 16:15:33 -0400 Subject: [PATCH 54/72] Add SearchResultEnum to SearchResults in unit tests The `SearchResultEnum` has been included in the `SearchResults` object in the `test_agency_homepage_searcher_unit` module. This enhancement is expected to provide better categorization and handling of search outcomes, facilitating more accurate distinction between results during cache update tasks. 
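
Concretely, unit-test fixtures that build a SearchResults now pass the status explicitly, as in the diff below:

    search_results = [SearchResults(test_agency_id, mock_search_results,
                                    SearchResultEnum.FOUND_RESULTS)]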
---
 Tests/test_agency_homepage_searcher_unit.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Tests/test_agency_homepage_searcher_unit.py b/Tests/test_agency_homepage_searcher_unit.py
index 68d5f20..c2462a5 100644
--- a/Tests/test_agency_homepage_searcher_unit.py
+++ b/Tests/test_agency_homepage_searcher_unit.py
@@ -7,7 +7,7 @@
 from unittest.mock import MagicMock
 
 from agency_homepage_searcher.homepage_searcher import HomepageSearcher, AgencyInfo, GoogleSearcher, \
-    PossibleHomepageURL
+    PossibleHomepageURL, SearchResultEnum
 
 from agency_homepage_searcher.homepage_searcher import (
     SQL_UPDATE_CACHE,
@@ -207,7 +207,7 @@ def test_write_to_temporary_csv(self, test_homepage_searcher, monkeypatch, tmp_c
             MagicMock(agency_id=test_agency_id, url="https://test.com", snippet="A test website."),
             MagicMock(agency_id=test_agency_id, url="https://python.com", snippet="Python's official website."),
         ]
-        search_results = [SearchResults(test_agency_id, mock_search_results)]
+        search_results = [SearchResults(test_agency_id, mock_search_results, SearchResultEnum.FOUND_RESULTS)]
 
         tmp_filepath = test_homepage_searcher.write_to_temporary_csv(search_results)
         assert isinstance(tmp_filepath, Path)

From 4ac8e4f799097f8afb225aab19ad81e9df2b9b90 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Wed, 10 Apr 2024 16:47:49 -0400
Subject: [PATCH 55/72] Handle case when no 'items' in Google search result

Added a condition in the Google searching function to check whether the
'items' key exists in the dictionary returned by a search request. If it
does not, the function returns None to indicate that no search results
were found. This ensures the function handles all potential cases and
doesn't raise exceptions when the expected data isn't in the returned
result.
---
 agency_homepage_searcher/google_searcher.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/agency_homepage_searcher/google_searcher.py b/agency_homepage_searcher/google_searcher.py
index 5c5db33..6638770 100644
--- a/agency_homepage_searcher/google_searcher.py
+++ b/agency_homepage_searcher/google_searcher.py
@@ -54,6 +54,8 @@ def search(self, query: str) -> Union[list[dict], None]:
         """
         try:
             res = self.service.cse().list(q=query, cx=self.cse_id).execute()
+            if "items" not in res:
+                return None
             return res['items']
             # Process your results
         except HttpError as e:

From 69855a3ad6c7da5b77f0e728175377266d3e2229 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Wed, 10 Apr 2024 16:48:11 -0400
Subject: [PATCH 56/72] Improve error handling in homepage search results

Enhanced the homepage searcher's result handling to return an empty list
when the Google search results are None. This prevents exceptions from
being raised when no search results are found. In addition, the commit
handles the case where the 'snippet' field is not available in a search
result, and passes the correct parameter to the SQL_UPDATE_CACHE command.
---
 agency_homepage_searcher/homepage_searcher.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py
index 20871f3..fda6cd7 100644
--- a/agency_homepage_searcher/homepage_searcher.py
+++ b/agency_homepage_searcher/homepage_searcher.py
@@ -198,7 +198,16 @@ def _get_first_ten_results(results: list[dict]):
         Returns:
            - List[PossibleHomepageURL]: list containing first ten or fewer elements.
""" - return [PossibleHomepageURL(url=result['link'], snippet=result['snippet']) for result in results[:10]] + if results is None: + return [] + first_ten_results = [] + for result in results[:10]: + possible_homepage_url = PossibleHomepageURL( + url=result['link'], + snippet=result['snippet'] if 'snippet' in result else '' + ) + first_ten_results.append(possible_homepage_url) + return first_ten_results def search_until_limit_reached( self, @@ -288,7 +297,7 @@ def update_search_cache(self, search_results: list[SearchResults]) -> None: """ parameters = [] for search_result in search_results: - parameter = (search_result.agency_id, search_result.search_result_status) + parameter = (search_result.agency_id, search_result.search_result_status.value) parameters.append(parameter) self.database_manager.executemany(SQL_UPDATE_CACHE, parameters) From 847437d5d3631cf8412cc12bb04a77c3f95bb865 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 10 Apr 2024 16:48:25 -0400 Subject: [PATCH 57/72] Refactor homepage searcher tests and cache update Removed redundant `mock_agency_ids` in unit tests and refactored the call to `update_search_cache` method to use `mock_search_results` instead. Also refactored `update_search_cache` to operate with SearchResults objects, adjusting the corresponding database call to include `SearchResultEnum` status. --- Tests/test_agency_homepage_searcher_unit.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Tests/test_agency_homepage_searcher_unit.py b/Tests/test_agency_homepage_searcher_unit.py index c2462a5..9120634 100644 --- a/Tests/test_agency_homepage_searcher_unit.py +++ b/Tests/test_agency_homepage_searcher_unit.py @@ -74,12 +74,10 @@ def test_search_and_upload(self, mocker, test_homepage_searcher, monkeypatch): # Provide fake array of Search Results for "search_until_limit_reached" method mock_search_results = [] - mock_agency_ids = [] for i in range(3): mock_search_result = MagicMock(spec=SearchResults) mock_search_result.agency_id = i mock_search_results.append(mock_search_result) - mock_agency_ids.append(i) mock_search_until_limit_reached = mocker.Mock(return_value=mock_search_results) test_homepage_searcher.search_until_limit_reached = mock_search_until_limit_reached @@ -97,7 +95,7 @@ def test_search_and_upload(self, mocker, test_homepage_searcher, monkeypatch): mock_get_agencies_without_homepage_urls.assert_called_once() mock_search_until_limit_reached.assert_called_once_with(agency_info_list=mock_agency_info_list, max_searches=10) mock_upload_to_huggingface.assert_called_once() - mock_update_search_cache.assert_called_once_with(mock_agency_ids) + mock_update_search_cache.assert_called_once_with(mock_search_results) def test_upload_to_huggingface(self, mocker, test_homepage_searcher, monkeypatch): test_homepage_searcher.huggingface_api_manager.repo_id = "TestOrg/TestDataset" @@ -152,17 +150,25 @@ def test_search_agency_info_exception(self, test_homepage_searcher): def test_update_search_cache(self, test_homepage_searcher): # Create test parameter test_agency_ids = ["test1", "test2", "test3"] + all_search_results = [ + SearchResults( + agency_id=agency_id, + search_results=[], + search_result_status=SearchResultEnum.FOUND_RESULTS + ) for agency_id in test_agency_ids + ] # Configure the Mock DB manager to ensure update SQL is called test_homepage_searcher.database_manager.executemany = MagicMock() # Call the function with our test data - test_homepage_searcher.update_search_cache(test_agency_ids) + 
test_homepage_searcher.update_search_cache(all_search_results) # Check that executemany was called with the expected arguments - test_homepage_searcher.database_manager.executemany.assert_called_once_with(SQL_UPDATE_CACHE, - [(agency_id,) for agency_id in - test_agency_ids]) + test_homepage_searcher.database_manager.executemany.assert_called_once_with( + SQL_UPDATE_CACHE, + [(agency_id,SearchResultEnum.FOUND_RESULTS.value) for agency_id in test_agency_ids] + ) def test_write_search_result_to_csv_success(self, test_homepage_searcher): search_results = [PossibleHomepageURL("http://example.com", "example snippet"), From 014bdc54cda88f8fa0a585d2b051afc53a570beb Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 10 Apr 2024 17:02:07 -0400 Subject: [PATCH 58/72] Update package requirement psycopg2-binary to psycopg[binary] The package dependency psycopg2-binary is updated to psycopg[binary] in the requirements file. This change helps to install the binary version of the package psycopg when the entire software is deployed, rather than installing a compilation of the package. This can help prevent potential issues related to compilation. --- .../requirements_agency_homepage_searcher_action.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt b/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt index d3f58b0..3c540e9 100644 --- a/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt +++ b/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt @@ -1,4 +1,4 @@ python-dotenv~=1.0.1 google-api-python-client~=2.119.0 -psycopg2-binary~=2.9.6 +psycopg[binary]~=2.9.6 huggingface-hub~=0.20.3 \ No newline at end of file From 3b511d43fb699e2e8af93a039dd6ed5fbd0bd211 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 10 Apr 2024 17:02:24 -0400 Subject: [PATCH 59/72] Add daily run GitHub action for agency homepage searcher This commit creates a GitHub action that is set to run daily at 00:00. The action sets up Python, installs necessary dependencies and finally runs the agency homepage searcher script. Environment variables are also passed to provide necessary credentials and keys for the script's execution. 
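On the Python side, the script reads these variables with `os.getenv` (see `main.py`). The patch itself adds no validation; the following is only a sketch of a fail-fast check that could surface missing secrets early, using a subset of the variable names from the workflow below:

```python
import os

# A subset of the variables injected by the workflow's env block.
REQUIRED_ENV_VARS = (
    "CUSTOM_SEARCH_API_KEY",
    "CUSTOM_SEARCH_ENGINE_ID",
    "HUGGINGFACE_ACCESS_TOKEN",
)

missing = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing)}")
```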
--- .github/actions/agency_homepage_searcher.yaml | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.github/actions/agency_homepage_searcher.yaml b/.github/actions/agency_homepage_searcher.yaml index e69de29..457655f 100644 --- a/.github/actions/agency_homepage_searcher.yaml +++ b/.github/actions/agency_homepage_searcher.yaml @@ -0,0 +1,32 @@ +name: Run Script Daily +on: + schedule: + - cron: '0 0 * * *' # Run daily at 00:00 + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.11.3' + - name: Install Dependencies + run: | + python -m pip install --upgrade pip + cd agency_homepage_searcher + pip install -r requirements_agency_homepage_searcher_action.txt + - name: Run Agency Homepage Searcher + run: | + cd agency_homepage_searcher + python main.py + env: + CUSTOM_SEARCH_API_KEY: ${{ secrets.CUSTOM_SEARCH_API_KEY }} + CUSTOM_SEARCH_ENGINE_ID: ${{ secrets.CUSTOM_SEARCH_ENGINE_ID }} + DIGITAL_OCEAN_DB_USERNAME: ${{ secrets.DIGITAL_OCEAN_DB_USERNAME }} + DIGITAL_OCEAN_DB_PASSWORD: ${{ secrets.DIGITAL_OCEAN_DB_PASSWORD }} + DIGITAL_OCEAN_DB_HOST: ${{ secrets.DIGITAL_OCEAN_DB_HOST }} + DIGITAL_OCEAN_DB_PORT: ${{ secrets.DIGITAL_OCEAN_DB_PORT }} + DIGITAL_OCEAN_DB_NAME: ${{ secrets.DIGITAL_OCEAN_DB_NAME }} + HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }} \ No newline at end of file From 5ccf4c4c579e47717b10282b2e0e5681e9492b37 Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 10 Apr 2024 17:14:28 -0400 Subject: [PATCH 60/72] Update psycopg[binary] version in requirements The psycopg[binary] version has been updated in both requirements.txt and requirements_agency_homepage_searcher_action.txt. This upgrade contributes to the latest application functionality, ensuring up-to-date utilization of the library. --- .../requirements_agency_homepage_searcher_action.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt b/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt index 3c540e9..598f38f 100644 --- a/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt +++ b/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt @@ -1,4 +1,4 @@ python-dotenv~=1.0.1 google-api-python-client~=2.119.0 -psycopg[binary]~=2.9.6 +psycopg[binary]~=3.1.18 huggingface-hub~=0.20.3 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a8df547..fd8ce99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,5 +29,5 @@ scikit-learn>=1.4.1.post1 nltk>=3.6.7 # agency_homepage_searcher only google-api-python-client~=2.119.0 -psycopg[binary]~=2.9.6 +psycopg[binary]~=3.1.18 huggingface-hub~=0.20.3 \ No newline at end of file From a140e56a2ad993a8c8985b0e9ab24ac3cdf4912d Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 10 Apr 2024 17:16:47 -0400 Subject: [PATCH 61/72] Revise workflow file for agency_homepage_searcher action This revision simplifies the workflow in the Github Actions pipeline for the agency_homepage_searcher. Instead of moving into the directory before installing dependencies and running the script, the file paths are adjusted to execute these commands directly from the root directory. 
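One consequence of launching `python agency_homepage_searcher/main.py` from the repository root is that the working directory is now the root, so any CWD-relative file access resolves differently than it did under `cd agency_homepage_searcher`. The usual `pathlib` guard against that is sketched below (illustrative only; the patch does not add this):

```python
from pathlib import Path

# Resolves relative to this file, regardless of where the interpreter started.
MODULE_DIR = Path(__file__).resolve().parent

# CWD-relative: the repo root in the Action after this change.
cwd_relative = Path("requirements_agency_homepage_searcher_action.txt").resolve()
# File-relative: stable under both invocation styles.
file_relative = MODULE_DIR / "requirements_agency_homepage_searcher_action.txt"

print(f"cwd-relative resolves to:  {cwd_relative}")
print(f"file-relative resolves to: {file_relative}")
```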
--- .github/actions/agency_homepage_searcher.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/actions/agency_homepage_searcher.yaml b/.github/actions/agency_homepage_searcher.yaml index 457655f..613f4dc 100644 --- a/.github/actions/agency_homepage_searcher.yaml +++ b/.github/actions/agency_homepage_searcher.yaml @@ -15,12 +15,10 @@ jobs: - name: Install Dependencies run: | python -m pip install --upgrade pip - cd agency_homepage_searcher - pip install -r requirements_agency_homepage_searcher_action.txt + pip install -r agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt - name: Run Agency Homepage Searcher run: | - cd agency_homepage_searcher - python main.py + python agency_homepage_searcher/main.py env: CUSTOM_SEARCH_API_KEY: ${{ secrets.CUSTOM_SEARCH_API_KEY }} CUSTOM_SEARCH_ENGINE_ID: ${{ secrets.CUSTOM_SEARCH_ENGINE_ID }} From 7c851a535f6dbffa97d6699401458c9c609d4f9e Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 12 Apr 2024 13:06:56 -0400 Subject: [PATCH 62/72] Remove redundant huggingface-hub package. --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fd8ce99..0e89b3a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,4 +30,3 @@ nltk>=3.6.7 # agency_homepage_searcher only google-api-python-client~=2.119.0 psycopg[binary]~=3.1.18 -huggingface-hub~=0.20.3 \ No newline at end of file From b2346be1ed5a571919b31b4b3a2a1bd3ce1acb98 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 12 Apr 2024 13:15:32 -0400 Subject: [PATCH 63/72] Refactor state name lookup using USStateReference class The STATE_ISO_TO_NAME_DICT dictionary has been removed and replaced by a newly implemented class, USStateReference. This class fetches state names from the database using state ISO codes. It is important to note that the get_state_name method from this class is now employed in the create_agency_info method to retrieve state names. 
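In outline, the new class loads the ISO-to-name mapping once at construction and serves lookups from memory afterwards. A self-contained sketch of that shape (`FakeDBManager` and its two rows are invented stand-ins for the real `DBManager`; the tuple rows mirror what psycopg's default cursor returns):

```python
class FakeDBManager:
    """Invented stand-in for DBManager; returns (state_iso, state_name) tuples."""

    def execute(self, query, params=None):
        return [("CA", "California"), ("PA", "Pennsylvania")]


class USStateReference:
    """Loads the state_iso -> state_name mapping once, then serves lookups from memory."""

    def __init__(self, database_manager):
        rows = database_manager.execute(
            "SELECT state_iso, state_name FROM public.state_names;")
        self.dict = {iso: name for iso, name in rows}

    def get_state_name(self, state_iso: str) -> str:
        try:
            return self.dict[state_iso]
        except KeyError:
            raise ValueError(f"Invalid state ISO code: {state_iso}")


ref = USStateReference(FakeDBManager())
assert ref.get_state_name("PA") == "Pennsylvania"
```

The design choice here trades a single up-front query against per-lookup round trips: the state table is small and static, so caching it in a dict keeps `create_agency_info` free of database access.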
---
 agency_homepage_searcher/homepage_searcher.py | 81 ++++++-------------
 1 file changed, 23 insertions(+), 58 deletions(-)

diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py
index fda6cd7..41f7253 100644
--- a/agency_homepage_searcher/homepage_searcher.py
+++ b/agency_homepage_searcher/homepage_searcher.py
@@ -1,10 +1,8 @@
 import csv
-import os
 import tempfile
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Union
-from dotenv import load_dotenv
 from enum import Enum
 
 from agency_homepage_searcher.agency_info import AgencyInfo
@@ -15,6 +13,7 @@
 
 MAX_SEARCHES = 100  # Maximum searches to perform at a time when searching for results
 
+
 class SearchResultEnum(Enum):
     """
     This enum corresponds to an enum column in AGENCY_URL_SEARCH_CACHE
@@ -22,58 +21,23 @@ class SearchResultEnum(Enum):
     FOUND_RESULTS = "found_results"
     NO_RESULTS_FOUND = "no_results_found"
 
-STATE_ISO_TO_NAME_DICT = {
-    "AL": "Alabama",
-    "AK": "Alaska",
-    "AZ": "Arizona",
-    "AR": "Arkansas",
-    "CA": "California",
-    "CO": "Colorado",
-    "CT": "Connecticut",
-    "DE": "Delaware",
-    "FL": "Florida",
-    "GA": "Georgia",
-    "HI": "Hawaii",
-    "ID": "Idaho",
-    "IL": "Illinois",
-    "IN": "Indiana",
-    "IA": "Iowa",
-    "KS": "Kansas",
-    "KY": "Kentucky",
-    "LA": "Louisiana",
-    "ME": "Maine",
-    "MD": "Maryland",
-    "MA": "Massachusetts",
-    "MI": "Michigan",
-    "MN": "Minnesota",
-    "MS": "Mississippi",
-    "MO": "Missouri",
-    "MT": "Montana",
-    "NE": "Nebraska",
-    "NV": "Nevada",
-    "NH": "New Hampshire",
-    "NJ": "New Jersey",
-    "NM": "New Mexico",
-    "NY": "New York",
-    "NC": "North Carolina",
-    "ND": "North Dakota",
-    "OH": "Ohio",
-    "OK": "Oklahoma",
-    "OR": "Oregon",
-    "PA": "Pennsylvania",
-    "RI": "Rhode Island",
-    "SC": "South Carolina",
-    "SD": "South Dakota",
-    "TN": "Tennessee",
-    "TX": "Texas",
-    "UT": "Utah",
-    "VT": "Vermont",
-    "VA": "Virginia",
-    "WA": "Washington",
-    "WV": "West Virginia",
-    "WI": "Wisconsin",
-    "WY": "Wyoming"
-}
+
+class USStateReference:
+
+    def __init__(self, database_manager: DBManager):
+        rows = database_manager.execute("SELECT state_iso, state_name FROM public.state_names;")
+        self.dict = {}
+        for row in rows:
+            iso = row["state_iso"]
+            name = row["state_name"]
+            self.dict[iso] = name
+
+    def get_state_name(self, state_iso: str) -> str:
+        try:
+            return self.dict[state_iso]
+        except KeyError:
+            raise ValueError(f"Invalid state ISO code: {state_iso}")
+
 
 SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS = """
     SELECT
@@ -129,9 +93,9 @@ def __init__(
         self.search_engine = search_engine
         self.database_manager = database_manager
         self.huggingface_api_manager = huggingface_api_manager
+        self.us_state_reference = USStateReference(database_manager)
 
-    @staticmethod
-    def create_agency_info(agency_row: list) -> AgencyInfo:
+    def create_agency_info(self, agency_row: list) -> AgencyInfo:
         """
         Creates an AgencyInfo object using the provided agency data.
         Args:
            agency_row: A list of agency data
         Returns:
            An AgencyInfo object.
""" try: - state_name = STATE_ISO_TO_NAME_DICT[agency_row[2]] + state_name = self.us_state_reference.get_state_name(agency_row[2]) except KeyError: raise ValueError(f"Invalid state ISO code: {agency_row[2]}") return AgencyInfo( @@ -287,7 +251,8 @@ def _write_search_result_to_csv(search_result: SearchResults, writer: csv.writer for possible_homepage_url in search_result.search_results: writer.writerow([search_result.agency_id, possible_homepage_url.url, possible_homepage_url.snippet]) except Exception as e: - raise Exception(f"An unexpected error occurred while writing search results for {search_result.agency_id}: {e}") + raise Exception( + f"An unexpected error occurred while writing search results for {search_result.agency_id}: {e}") def update_search_cache(self, search_results: list[SearchResults]) -> None: """ From 5668c8a9e8bdccec88f0e12e85a7f412a54ddc87 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 12 Apr 2024 13:15:43 -0400 Subject: [PATCH 64/72] Remove pytest-postgresql dependency from requirements The pytest-postgresql dependency is removed from the requirements.txt file. This change is a part of ongoing refactoring efforts to simplify the project's dependencies and reduce potential conflict or compatibility issues. --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0e89b3a..52ebb18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,6 @@ bs4~=0.0.2 tqdm~=4.66.2 pytest~=8.0.1 pytest-mock==3.12.0 -pytest-postgresql~=6.0.0 urllib3~=1.26.18 # common_crawler only huggingface-hub~=0.22.2 From b63abc07f4f9e4fab6290bb581170d1bcd722c61 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 12 Apr 2024 16:01:40 -0400 Subject: [PATCH 65/72] Update instructions for API key and ID in README.md Instructions for obtaining `CUSTOM_SEARCH_API_KEY` and `CUSTOM_SEARCH_ENGINE_ID` have been elaborated. They now include specific directions on how to acquire an API key from the Google Custom Search Engine Overview and access the CSE ID from the Programmable Search Engine control panel. --- agency_homepage_searcher/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agency_homepage_searcher/README.md b/agency_homepage_searcher/README.md index 40ddd37..fab2ebb 100644 --- a/agency_homepage_searcher/README.md +++ b/agency_homepage_searcher/README.md @@ -17,8 +17,8 @@ This module is designed to do the following: This script requires a number of environment variables to be provided in an associated `.env` file in the root directory in order to function correctly: -* CUSTOM_SEARCH_API_KEY - The API key required for accessing the [Google Custom Search Engine](https://developers.google.com/custom-search/v1/overview.) -* CUSTOM_SEARCH_ENGINE_ID - The CSE (Custom Search Engine) ID required for identifying the specific search engine to use. +* CUSTOM_SEARCH_API_KEY - The API key required for accessing the [Google Custom Search Engine](https://developers.google.com/custom-search/v1/overview). Obtainable by clicking the "Get a Key" button in the linked overview, and associating it with an existing custom search engine or one that you create. +* CUSTOM_SEARCH_ENGINE_ID - The CSE (Custom Search Engine) ID required for identifying the specific search engine to use. Accessible by clicking on the search engine in the [Programmable Search Engine control panel](https://programmablesearchengine.google.com/controlpanel/all). 
* DIGITAL_OCEAN_DB_USERNAME - The username to be used for logging into the PostgreSQL database * DIGITAL_OCEAN_DB_PASSWORD - The password to be used for logging into the PostgreSQL database * DIGITAL_OCEAN_DB_HOST - The host to be used for logging into the PostgreSQL database From c0de6fe81dccadc367cf26b5677c03cd0c7847c8 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 12 Apr 2024 16:08:46 -0400 Subject: [PATCH 66/72] Update huggingface-hub version in requirements. The version of huggingface-hub was updated from 0.20.3 to 0.22.2 in the requirements file of agency_homepage_searcher. This change ensures that the application uses the most recent and secure version of huggingface-hub. --- .../requirements_agency_homepage_searcher_action.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt b/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt index 598f38f..c9f5644 100644 --- a/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt +++ b/agency_homepage_searcher/requirements_agency_homepage_searcher_action.txt @@ -1,4 +1,4 @@ python-dotenv~=1.0.1 google-api-python-client~=2.119.0 psycopg[binary]~=3.1.18 -huggingface-hub~=0.20.3 \ No newline at end of file +huggingface-hub~=0.22.2 \ No newline at end of file From d4185ff84def34c1166f7da63ff7ae11cac18901 Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 12 Apr 2024 16:09:17 -0400 Subject: [PATCH 67/72] Refactor variable assignment in row mapping. The assignment of iso and name variables in the row mapping for state_names has been changed from accessing by key to accessing by index. --- agency_homepage_searcher/homepage_searcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py index 41f7253..b2857a9 100644 --- a/agency_homepage_searcher/homepage_searcher.py +++ b/agency_homepage_searcher/homepage_searcher.py @@ -28,8 +28,8 @@ def __init__(self, database_manager: DBManager): rows = database_manager.execute("SELECT state_iso, state_name FROM public.state_names;") self.dict = {} for row in rows: - iso = row["state_iso"] - name = row["state_name"] + iso = row[0] + name = row[1] self.dict[iso] = name def get_state_name(self, state_iso: str) -> str: From 57465b0ee52f2288acb7a2233621e3bb7f22f94e Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 12 Apr 2024 16:24:47 -0400 Subject: [PATCH 68/72] Update test methods in HomepageSearcher test. This update revises mock object creation and method calls in the 'test_homepage_searcher' and 'test_create_agency_info_with_valid_agency_row' methods in the 'TestHomepageSearcher' class. The changes refine the process of mocking the USStateReference class and extracting the state name in test scenarios. 
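Stripped of fixture plumbing, the test strategy is: stub the state reference so its database query never runs, pin `get_state_name`'s return value, and assert on the lookup. A reduced sketch (the constructor below is simplified; the real `HomepageSearcher` also takes the search engine, database, and HuggingFace managers):

```python
from unittest.mock import MagicMock


class HomepageSearcher:
    """Simplified: the real class builds a USStateReference from its DBManager."""

    def __init__(self, us_state_reference):
        self.us_state_reference = us_state_reference

    def state_for_row(self, agency_row):
        # Stand-in for the state lookup inside create_agency_info.
        return self.us_state_reference.get_state_name(agency_row[2])


searcher = HomepageSearcher(us_state_reference=MagicMock())
searcher.us_state_reference.get_state_name = MagicMock(return_value="California")

assert searcher.state_for_row(["Test Agency", "Federal", "CA"]) == "California"
searcher.us_state_reference.get_state_name.assert_called_once_with("CA")
```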
--- Tests/test_agency_homepage_searcher_unit.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Tests/test_agency_homepage_searcher_unit.py b/Tests/test_agency_homepage_searcher_unit.py index 9120634..ecd6dca 100644 --- a/Tests/test_agency_homepage_searcher_unit.py +++ b/Tests/test_agency_homepage_searcher_unit.py @@ -60,7 +60,13 @@ def test_search_with_http_error(self, google_searcher): class TestHomepageSearcher: @pytest.fixture - def test_homepage_searcher(mocker): + def test_homepage_searcher(self, mocker, monkeypatch): + + mock_us_state_ref = mocker.Mock() # Create a Mock object using unittest.mock + + # Define your mock return value here + mock_us_state_ref.return_value = Mock() # Assign the necessary mock object/value + monkeypatch.setattr('agency_homepage_searcher.homepage_searcher.USStateReference', mock_us_state_ref) return HomepageSearcher( search_engine=Mock(), database_manager=Mock(), @@ -300,7 +306,8 @@ def sample_invalid_agency_row(self): return ['Invalid Agency', 'Federal', 'XX', 'Invalid City', 'Invalid County', '9999', '9999', '99999'] - def test_create_agency_info_with_valid_agency_row(self, sample_valid_agency_row): + def test_create_agency_info_with_valid_agency_row(self, test_homepage_searcher, sample_valid_agency_row): + test_homepage_searcher.us_state_reference.get_state_name = MagicMock(return_value="California") expected_agency_info = AgencyInfo( agency_name='Test Agency', city='San Francisco', @@ -311,12 +318,7 @@ def test_create_agency_info_with_valid_agency_row(self, sample_valid_agency_row) agency_type='Federal', agency_id='5141' ) - assert HomepageSearcher.create_agency_info(sample_valid_agency_row) == expected_agency_info - - def test_create_agency_info_with_invalid_agency_row(self, sample_invalid_agency_row): - with pytest.raises(ValueError): - HomepageSearcher.create_agency_info(sample_invalid_agency_row) - + assert test_homepage_searcher.create_agency_info(sample_valid_agency_row) == expected_agency_info def test_agency_info_get_search_string_character_strip(): """ From 7f7ce7da2115f5c4f4b65ef9168b17dc194548bd Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 16 Apr 2024 17:26:38 -0400 Subject: [PATCH 69/72] Remove unused imports in test_agency_homepage_searcher_integration Unused imports 'csv' and 'factories' from pytest_postgresql were removed from test_agency_homepage_searcher_integration.py to reduce unnecessary overhead and increase code readability. This is part of an ongoing effort to ensure clean and efficient code base. --- Tests/test_agency_homepage_searcher_integration.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Tests/test_agency_homepage_searcher_integration.py b/Tests/test_agency_homepage_searcher_integration.py index bed426d..db47e81 100644 --- a/Tests/test_agency_homepage_searcher_integration.py +++ b/Tests/test_agency_homepage_searcher_integration.py @@ -1,9 +1,7 @@ -import csv from typing import List from unittest.mock import MagicMock import pytest -from pytest_postgresql import factories from agency_homepage_searcher.agency_info import AgencyInfo from agency_homepage_searcher.google_searcher import GoogleSearcher From dcfbfbcff98796f01eac73a9fe6725817f35e213 Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 16 Apr 2024 17:30:43 -0400 Subject: [PATCH 70/72] Refactor DBManager to use database URL The parameters provided to initialize DBManager (user, password, host, port, and db_name) have been replaced with a single parameter - the database URL. 
This simplifies DBManager usage in the main module 'agency_homepage_searcher', enhancing readability and maintaining the security of database details. --- agency_homepage_searcher/main.py | 6 +----- util/db_manager.py | 10 ++-------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/agency_homepage_searcher/main.py b/agency_homepage_searcher/main.py index 22e1633..5d7be63 100644 --- a/agency_homepage_searcher/main.py +++ b/agency_homepage_searcher/main.py @@ -19,11 +19,7 @@ api_key=os.getenv("CUSTOM_SEARCH_API_KEY"), cse_id=os.getenv("CUSTOM_SEARCH_ENGINE_ID")) db_manager = DBManager( - user=os.getenv("DIGITAL_OCEAN_DB_USERNAME"), - password=os.getenv("DIGITAL_OCEAN_DB_PASSWORD"), - host=os.getenv("DIGITAL_OCEAN_DB_HOST"), - port=os.getenv("DIGITAL_OCEAN_DB_PORT"), - db_name=os.getenv("DIGITAL_OCEAN_DB_NAME") + database_url=os.getenv("DO_DATABASE_URL") ) huggingface_api_manager = HuggingFaceAPIManager( access_token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"), diff --git a/util/db_manager.py b/util/db_manager.py index 186bf6b..0de8815 100644 --- a/util/db_manager.py +++ b/util/db_manager.py @@ -7,14 +7,8 @@ class DBManager: Manages access to PostgreSQL database. """ - def __init__(self, db_name, user, password, host, port): - self.conn = psycopg.connect( - dbname=db_name, - user=user, - password=password, - host=host, - port=port - ) + def __init__(self, database_url: str): + self.conn = psycopg.connect(database_url) self.cursor = self.conn.cursor() def __del__(self): From e846650a1cc6e86bb973ec0b573869f589a17aba Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 17 Apr 2024 11:47:32 -0400 Subject: [PATCH 71/72] Update README.md with script and testing instructions Updated the README.md file in the agency_homepage_searcher directory. Replaced old database log-in details with unified database URL and included instructions for running script and tests. This simplifies database configuration and ensures users have proper guidance to run the application and its related tests. --- agency_homepage_searcher/README.md | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/agency_homepage_searcher/README.md b/agency_homepage_searcher/README.md index fab2ebb..af1784a 100644 --- a/agency_homepage_searcher/README.md +++ b/agency_homepage_searcher/README.md @@ -19,17 +19,22 @@ This script requires a number of environment variables to be provided in an asso * CUSTOM_SEARCH_API_KEY - The API key required for accessing the [Google Custom Search Engine](https://developers.google.com/custom-search/v1/overview). Obtainable by clicking the "Get a Key" button in the linked overview, and associating it with an existing custom search engine or one that you create. * CUSTOM_SEARCH_ENGINE_ID - The CSE (Custom Search Engine) ID required for identifying the specific search engine to use. Accessible by clicking on the search engine in the [Programmable Search Engine control panel](https://programmablesearchengine.google.com/controlpanel/all). 
-* DIGITAL_OCEAN_DB_USERNAME - The username to be used for logging into the PostgreSQL database
-* DIGITAL_OCEAN_DB_PASSWORD - The password to be used for logging into the PostgreSQL database
-* DIGITAL_OCEAN_DB_HOST - The host to be used for logging into the PostgreSQL database
-* DIGITAL_OCEAN_DB_PORT - The port to be used for logging into the PostgreSQL database
-* DIGITAL_OCEAN_DB_NAME - The database name to be used for logging into the PostgreSQL database
+* DO_DATABASE_URL - The database URL used to connect to the PostgreSQL database
 * HUGGINGFACE_ACCESS_TOKEN - An access token for a user with permissions to upload data to the [PDAP/possible_homepage_urls](https://huggingface.co/datasets/PDAP/possible_homepage_urls) dataset
 
 ## Running script
+To run the script, run the following from within this directory:
+```shell
+python main.py
+```
 
 ## Running tests
-TODO: Include notes on running integration test with database
+From the project root directory, run the following commands:
+
+```shell
+pytest Tests/test_agency_homepage_searcher_integration.py
+pytest Tests/test_agency_homepage_searcher_unit.py
+```
\ No newline at end of file

From 5213701cc8e0c67213df52ee116d7e8f61a70f63 Mon Sep 17 00:00:00 2001
From: maxachis
Date: Sat, 15 Jun 2024 18:14:08 -0400
Subject: [PATCH 72/72] Update search cache retrieval and update methods to use search cache endpoint rather than direct calls to database.

---
 agency_homepage_searcher/homepage_searcher.py | 47 +++++++++++++++----
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/agency_homepage_searcher/homepage_searcher.py b/agency_homepage_searcher/homepage_searcher.py
index b2857a9..9b1e298 100644
--- a/agency_homepage_searcher/homepage_searcher.py
+++ b/agency_homepage_searcher/homepage_searcher.py
@@ -1,10 +1,16 @@
 import csv
+import json
+import os
 import tempfile
+from http import HTTPStatus
+
+import requests
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Union
 from enum import Enum
 
+
 from agency_homepage_searcher.agency_info import AgencyInfo
 from agency_homepage_searcher.google_searcher import GoogleSearcher, QuotaExceededError
 from util.huggingface_api_manager import HuggingFaceAPIManager
 from util.db_manager import DBManager
 from util.miscellaneous_functions import get_filename_friendly_timestamp
 
 MAX_SEARCHES = 100  # Maximum searches to perform at a time when searching for results
-
+BASE_URL = "https://data-sources.pdap.io/api/"
+SEARCH_CACHE_ENDPOINT = "homepage-search-cache"
+FULL_CACHE_ENDPOINT = f"{BASE_URL}{SEARCH_CACHE_ENDPOINT}"
 
 class SearchResultEnum(Enum):
     """
@@ -94,6 +102,17 @@ def __init__(
         self.database_manager = database_manager
         self.huggingface_api_manager = huggingface_api_manager
         self.us_state_reference = USStateReference(database_manager)
+        self.pdap_api_key = os.getenv("PDAP_API_KEY")
+
+    def get_search_cache_header(self) -> dict:
+        """
+        Returns the request headers for the search cache endpoint.
+        Returns: dict
+        """
+        return {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.pdap_api_key}"
+        }
 
     def create_agency_info(self, agency_row: list) -> AgencyInfo:
         """
@@ -123,8 +142,16 @@ def get_agencies_without_homepage_urls(self) -> list[AgencyInfo]:
         """
         Retrieves a list of agencies without homepage URLs.
        Returns: list[AgencyInfo]
         """
-        agency_rows = self.database_manager.execute(SQL_GET_AGENCIES_WITHOUT_HOMEPAGE_URLS)
-        return [self.create_agency_info(agency_row) for agency_row in agency_rows]
+        # TODO: Implement endpoint
+        response = requests.get(
+            url=FULL_CACHE_ENDPOINT,
+            headers=self.get_search_cache_header()
+        )
+        if response.status_code != HTTPStatus.OK:
+            raise Exception(f"Failed to get search cache. Status code: {response.status_code}")
+
+        return [self.create_agency_info(row) for row in response.json()]
+
 
     def search(self, agency_info: AgencyInfo) -> Union[SearchResults, None]:
         """
@@ -260,11 +287,15 @@ def update_search_cache(self, search_results: list[SearchResults]) -> None:
         Args:
             search_results:
         """
-        parameters = []
-        for search_result in search_results:
-            parameter = (search_result.agency_id, search_result.search_result_status.value)
-            parameters.append(parameter)
-        self.database_manager.executemany(SQL_UPDATE_CACHE, parameters)
+
+        response = requests.post(
+            url=FULL_CACHE_ENDPOINT,
+            data=json.dumps([(search_result.agency_id, search_result.search_result_status.value) for search_result in search_results]),
+            headers=self.get_search_cache_header()
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise Exception(f"Failed to update search cache. Status code: {response.status_code}")
 
     def _try_search_agency_info(self, agency_info: AgencyInfo) -> Union[SearchResults, List]:
         """