From 9d9618aa0d2ac7fb58f7307ac55e936fd661522c Mon Sep 17 00:00:00 2001 From: eddie-m-m Date: Fri, 15 Nov 2024 17:06:58 -0800 Subject: [PATCH] Lint added files --- common_crawler/argparser.py | 43 +++-- common_crawler/cache.py | 16 +- common_crawler/crawler.py | 41 +++-- common_crawler/csv_manager.py | 23 +-- common_crawler/main.py | 108 +++++++----- common_crawler/utils.py | 2 +- .../ckan/ckan_scraper_toolkit.py | 14 +- .../ckan/scrape_ckan_data_portals.py | 1 + source_collectors/ckan/search_terms.py | 2 +- source_collectors/common_crawler/argparser.py | 43 +++-- source_collectors/common_crawler/cache.py | 16 +- source_collectors/common_crawler/crawler.py | 41 +++-- .../common_crawler/csv_manager.py | 23 +-- source_collectors/common_crawler/main.py | 108 +++++++----- source_collectors/common_crawler/utils.py | 2 +- .../convert_all_record_types_to_csv.py | 47 ++++- .../muckrock/create_foia_data_db.py | 165 +++++++++--------- .../muckrock/download_muckrock_foia.py | 12 +- .../generate_detailed_muckrock_csv.py | 68 +++++--- .../muckrock/get_all_record_types.py | 51 +++++- .../muckrock/get_allegheny_foias.py | 22 ++- source_collectors/muckrock/muck_get.py | 14 +- .../muckrock/muckrock_ml_labeler.py | 16 +- .../muckrock/search_foia_data_db.py | 82 +++++---- .../muckrock/search_local_foia_json.py | 26 +-- source_collectors/muckrock/utils.py | 11 +- 26 files changed, 616 insertions(+), 381 deletions(-) diff --git a/common_crawler/argparser.py b/common_crawler/argparser.py index 8cdf5b7..67f4a29 100644 --- a/common_crawler/argparser.py +++ b/common_crawler/argparser.py @@ -7,6 +7,7 @@ for the Common Crawler script. """ + def valid_common_crawl_id(common_crawl_id: str) -> bool: """ Validate the Common Crawl ID format. @@ -16,7 +17,8 @@ def valid_common_crawl_id(common_crawl_id: str) -> bool: Returns: True if the Common Crawl ID is valid, False otherwise """ - return re.match(r'CC-MAIN-\d{4}-\d{2}', common_crawl_id) is not None + return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None + def parse_args() -> argparse.Namespace: """ @@ -33,22 +35,41 @@ def parse_args() -> argparse.Namespace: """ parser = argparse.ArgumentParser( - description='Query the Common Crawl dataset and optionally save the results to a file.') + description="Query the Common Crawl dataset and optionally save the results to a file." 
+ ) # Add the required arguments - parser.add_argument('common_crawl_id', type=str, help='The Common Crawl ID') - parser.add_argument('url', type=str, help='The URL to query') - parser.add_argument('keyword', type=str, help='The keyword to search in the url') + parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID") + parser.add_argument("url", type=str, help="The URL to query") + parser.add_argument("keyword", type=str, help="The keyword to search in the url") # Optional arguments for the number of pages and the output file, and a flag to reset the cache - parser.add_argument('-c', '--config', type=str, default='config.ini', help='The configuration file to use') - parser.add_argument('-p', '--pages', type=int, default=1, help='The number of pages to search (default: 1)') - parser.add_argument('--reset-cache', action='store_true', default=False, - help='Reset the cache before starting the crawl') + parser.add_argument( + "-c", + "--config", + type=str, + default="config.ini", + help="The configuration file to use", + ) + parser.add_argument( + "-p", + "--pages", + type=int, + default=1, + help="The number of pages to search (default: 1)", + ) + parser.add_argument( + "--reset-cache", + action="store_true", + default=False, + help="Reset the cache before starting the crawl", + ) args = parser.parse_args() # Validate the Common Crawl ID format if not valid_common_crawl_id(args.common_crawl_id): - parser.error("Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW.") + parser.error( + "Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW." + ) # Read the configuration file config = configparser.ConfigParser() @@ -56,7 +77,7 @@ def parse_args() -> argparse.Namespace: # Combine parsed arguments with configuration file defaults app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) - app_parser.set_defaults(**config['DEFAULT']) + app_parser.set_defaults(**config["DEFAULT"]) app_args = app_parser.parse_args() diff --git a/common_crawler/cache.py b/common_crawler/cache.py index 2a48c0b..23d5881 100644 --- a/common_crawler/cache.py +++ b/common_crawler/cache.py @@ -8,11 +8,13 @@ - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results """ + class CommonCrawlerCacheManager: """ A class for managing the cache of Common Crawl search results. This class is responsible for adding, retrieving, and saving cache data. """ + def __init__(self, file_name: str = "cache", directory=None): """ Initializes the CacheStorage object with a file name and directory. @@ -41,7 +43,6 @@ def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: self.cache[index][url] = {} self.cache[index][url][keyword] = last_page - def get(self, index, url, keyword) -> int: """ Retrieves a page number from the cache. @@ -53,12 +54,15 @@ def get(self, index, url, keyword) -> int: Returns: int - the last page crawled """ - if index in self.cache and url in self.cache[index] and keyword in self.cache[index][url]: + if ( + index in self.cache + and url in self.cache[index] + and keyword in self.cache[index][url] + ): return self.cache[index][url][keyword] # The cache object does not exist. Return 0 as the default value. return 0 - def load_or_create_cache(self) -> dict: """ Loads the cache from the configured file path. 
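# A minimal, self-contained sketch (illustration only, not lines from this
# commit) of the nested cache layout the CommonCrawlerCacheManager above
# maintains: {crawl_id: {url: {keyword: last_page}}}, persisted as JSON.
# The file name and the sample values below are hypothetical.
import json

cache: dict = {}

def upsert(index: str, url: str, keyword: str, last_page: int) -> None:
    # Create intermediate levels on demand, then record the last page crawled.
    cache.setdefault(index, {}).setdefault(url, {})[keyword] = last_page

def get(index: str, url: str, keyword: str) -> int:
    # Missing entries fall back to page 0, matching the manager's default.
    return cache.get(index, {}).get(url, {}).get(keyword, 0)

upsert("CC-MAIN-2023-50", "*.gov", "police", last_page=3)
assert get("CC-MAIN-2023-50", "*.gov", "police") == 3
assert get("CC-MAIN-2023-50", "*.gov", "sheriff") == 0  # not cached yet

with open("cache_sketch.json", "w") as file:  # hypothetical throwaway file
    json.dump(cache, file, indent=4)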
@@ -66,12 +70,11 @@ def load_or_create_cache(self) -> dict: Returns: dict - the cache data """ try: - with open(self.file_path, 'r') as file: + with open(self.file_path, "r") as file: return json.load(file) except FileNotFoundError: return {} - def save_cache(self) -> None: """ Converts the cache object into a JSON-serializable format and saves it to the configured file path. @@ -79,10 +82,9 @@ def save_cache(self) -> None: persistence of crawl data across sessions. """ # Reformat cache data for JSON serialization - with open(self.file_path, 'w') as file: + with open(self.file_path, "w") as file: json.dump(self.cache, file, indent=4) - def reset_cache(self) -> None: """ Resets the cache to an empty state. diff --git a/common_crawler/crawler.py b/common_crawler/crawler.py index 9afba7d..0982ca5 100644 --- a/common_crawler/crawler.py +++ b/common_crawler/crawler.py @@ -16,7 +16,6 @@ # TODO: What happens when no results are found? How does the CommonCrawlerManager handle this? - @dataclass class CommonCrawlResult: last_page_search: int @@ -31,16 +30,17 @@ class CommonCrawlerManager: It validates crawl ids, manages pagination, and aggregates results. """ - def __init__(self, crawl_id='CC-MAIN-2023-50'): + def __init__(self, crawl_id="CC-MAIN-2023-50"): self.crawl_id = crawl_id - CC_INDEX_SERVER = 'http://index.commoncrawl.org/' - INDEX_NAME = f'{self.crawl_id}-index' - self.root_url = f'{CC_INDEX_SERVER}{INDEX_NAME}' + CC_INDEX_SERVER = "http://index.commoncrawl.org/" + INDEX_NAME = f"{self.crawl_id}-index" + self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}" def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult: print( f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages," - f" starting at page {start_page}") + f" starting at page {start_page}" + ) url_results = [] @@ -64,7 +64,9 @@ def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResul return CommonCrawlResult(last_page, url_results) - def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]: + def search_common_crawl_index( + self, url: str, page: int = 0, max_retries: int = 20 + ) -> list[dict]: """ This method is used to search the Common Crawl index for a given URL and page number Args: @@ -76,9 +78,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = """ encoded_url = quote_plus(url) search_url = URLWithParameters(self.root_url) - search_url.add_parameter('url', encoded_url) - search_url.add_parameter('output', 'json') - search_url.add_parameter('page', page) + search_url.add_parameter("url", encoded_url) + search_url.add_parameter("output", "json") + search_url.add_parameter("page", page) retries = 0 delay = 1 @@ -90,7 +92,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = return self.process_response(response, url, page) retries += 1 - print(f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})") + print( + f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})" + ) time.sleep(delay) print(f"Max retries exceeded. 
Failed to get records for {url} on page {page}.") @@ -106,19 +110,24 @@ def make_request(self, search_url: str) -> requests.Response: response.raise_for_status() return response except requests.exceptions.RequestException as e: - if response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR and 'SlowDown' in response.text: + if ( + response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR + and "SlowDown" in response.text + ): return None else: print(f"Failed to get records: {e}") return None - def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]: + def process_response( + self, response: requests.Response, url: str, page: int + ) -> list[dict]: """Processes the HTTP response and returns the parsed records if successful.""" if response.status_code == HTTPStatus.OK: - records = response.text.strip().split('\n') + records = response.text.strip().split("\n") print(f"Found {len(records)} records for {url} on page {page}") return [json.loads(record) for record in records] - elif 'First Page is 0, Last Page is 0' in response.text: + elif "First Page is 0, Last Page is 0" in response.text: print("No records exist in index matching the url search term") return None else: @@ -127,4 +136,4 @@ def process_response(self, response: requests.Response, url: str, page: int) -> @staticmethod def get_urls_with_keyword(records: list[dict], keyword) -> list[str]: - return [record['url'] for record in records if keyword in record['url']] + return [record["url"] for record in records if keyword in record["url"]] diff --git a/common_crawler/csv_manager.py b/common_crawler/csv_manager.py index 6986862..2b823b4 100644 --- a/common_crawler/csv_manager.py +++ b/common_crawler/csv_manager.py @@ -10,12 +10,7 @@ class CSVManager: Creates the file if it doesn't exist, and provides a method for adding new rows. 
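# A minimal sketch (illustration only, not lines from this commit) of the
# create-if-missing, append-only CSV pattern that the CSVManager above wraps.
# The file name and the sample row are hypothetical.
import csv
import os

path = "urls_sketch.csv"
headers = ["url"]

if not os.path.isfile(path):
    # Write the header row exactly once, when the file is first created.
    with open(path, mode="a", newline="", encoding="utf-8") as file:
        csv.writer(file).writerow(headers)

# Subsequent runs only append data rows.
with open(path, mode="a", newline="", encoding="utf-8") as file:
    csv.writer(file).writerow(["example.com/police-records"])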
""" - def __init__( - self, - file_name: str, - headers: list[str], - directory=None - ): + def __init__(self, file_name: str, headers: list[str], directory=None): self.file_path = get_file_path(f"{file_name}.csv", directory) self.headers = headers if not os.path.exists(self.file_path): @@ -29,9 +24,9 @@ def add_row(self, row_values: list[str] | tuple[str]): """ if isinstance(row_values, str): # Single values must be converted to a list format - row_values = [row_values] + row_values = [row_values] try: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(row_values) except Exception as e: @@ -45,9 +40,7 @@ def add_rows(self, results: list[list[str]]) -> None: Returns: None """ for result in results: - self.add_row( - result - ) + self.add_row(result) print(f"{len(results)} URLs written to {self.file_path}") def initialize_file(self): @@ -59,15 +52,17 @@ def initialize_file(self): file_exists = os.path.isfile(self.file_path) if not file_exists: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(self.headers) else: # Open and check that headers match - with open(self.file_path, mode='r', encoding='utf-8') as file: + with open(self.file_path, mode="r", encoding="utf-8") as file: header_row = next(csv.reader(file)) if header_row != self.headers: - raise ValueError(f"Header row in {self.file_path} does not match expected headers") + raise ValueError( + f"Header row in {self.file_path} does not match expected headers" + ) print(f"CSV file initialized at {self.file_path}") def delete_file(self): diff --git a/common_crawler/main.py b/common_crawler/main.py index ae27f55..b9dd012 100644 --- a/common_crawler/main.py +++ b/common_crawler/main.py @@ -10,7 +10,7 @@ # The below code sets the working directory to be the root of the entire repository # This is done to solve otherwise quite annoying import issues. 
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from util.huggingface_api_manager import HuggingFaceAPIManager from util.miscellaneous_functions import get_filename_friendly_timestamp @@ -35,30 +35,34 @@ class BatchInfo: notes: str filename: str + class LabelStudioError(Exception): """Custom exception for Label Studio Errors""" + pass -BATCH_HEADERS = ['Datetime', 'Source', 'Count', 'Keywords', 'Notes', 'Filename'] + +BATCH_HEADERS = ["Datetime", "Source", "Count", "Keywords", "Notes", "Filename"] + def get_current_time(): return str(datetime.now()) -def add_batch_info_to_csv(common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int) -> BatchInfo: +def add_batch_info_to_csv( + common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int +) -> BatchInfo: batch_info = BatchInfo( datetime=get_current_time(), source="Common Crawl", count=str(len(common_crawl_result.url_results)), keywords=f"{args.url} - {args.keyword}", notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", - filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}" + filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}", ) batch_info_csv_manager = CSVManager( - file_name='batch_info', - directory=args.data_dir, - headers=BATCH_HEADERS + file_name="batch_info", directory=args.data_dir, headers=BATCH_HEADERS ) batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) @@ -71,12 +75,11 @@ def main(): # Initialize the Cache cache_manager = CommonCrawlerCacheManager( - file_name=args.cache_filename, - directory=args.data_dir + file_name=args.cache_filename, directory=args.data_dir ) load_dotenv() - + # Initialize the HuggingFace API Manager hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") if not hf_access_token: @@ -84,10 +87,10 @@ def main(): "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://huggingface.co/settings/tokens and ensure you have write access to " - "https://huggingface.co/PDAP. Then include in .env file in root directory.") + "https://huggingface.co/PDAP. Then include in .env file in root directory." + ) huggingface_api_manager = HuggingFaceAPIManager( - access_token=hf_access_token, - repo_id=args.huggingface_repo_id + access_token=hf_access_token, repo_id=args.huggingface_repo_id ) ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") if not ls_access_token: @@ -95,13 +98,15 @@ def main(): "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://app.heartex.com/user/account and ensure you have read access to " - "https://app.heartex.com/projects/61550. Then include in .env file in root directory.") + "https://app.heartex.com/projects/61550. Then include in .env file in root directory." + ) ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") if not ls_project_id: raise ValueError( "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " "Please obtain a project ID by navigating to the Label Studio project " - "where it will be visibile in the url. Then include in .env file in root directory.") + "where it will be visibile in the url. Then include in .env file in root directory." 
+ ) try: print("Retrieving Label Studio data for deduplication") @@ -119,7 +124,9 @@ def main(): try: # Retrieve the last page from the cache, or 0 if it does not exist last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) - common_crawl_result = process_crawl_and_upload(args, last_page, huggingface_api_manager, label_studio_results) + common_crawl_result = process_crawl_and_upload( + args, last_page, huggingface_api_manager, label_studio_results + ) except ValueError as e: print(f"Error during crawling: {e}") return @@ -129,12 +136,14 @@ def main(): index=args.common_crawl_id, url=args.url, keyword=args.keyword, - last_page=common_crawl_result.last_page_search) + last_page=common_crawl_result.last_page_search, + ) cache_manager.save_cache() except ValueError as e: print(f"Error while saving cache manager: {e}") + def handle_remote_results_error(remote_results): """ Handles errors in the remote results @@ -151,6 +160,7 @@ def handle_remote_results_error(remote_results): else: raise LabelStudioError(f"Unexpected error: {remote_results}") + def validate_remote_results(remote_results): """ Validates the remote results retrieved from the Label Studio project @@ -166,7 +176,9 @@ def validate_remote_results(remote_results): print("No data in Label Studio project.") return [] elif "url" not in remote_results[0]["data"]: - raise LabelStudioError("Column 'url' not present in Label Studio project. Exiting...") + raise LabelStudioError( + "Column 'url' not present in Label Studio project. Exiting..." + ) else: return remote_results elif isinstance(remote_results, dict): @@ -174,6 +186,7 @@ def validate_remote_results(remote_results): else: raise LabelStudioError("Unexpected response type.") + def get_ls_data() -> list[dict] | None: """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. @@ -190,14 +203,14 @@ def get_ls_data() -> list[dict] | None: def strip_url(url: str) -> str: - """Strips http(s)://www. from the beginning of a url if applicable. + """Strips http(s)://www. from the beginning of a url if applicable. Args: url (str): The URL to strip. Returns: str: The stripped URL. - """ + """ result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) return result @@ -210,7 +223,7 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: Returns: list[str]: List of unique URLs. - """ + """ stripped_url_results = [strip_url(url) for url in url_results] unique_urls = collections.deque() adjust = 0 @@ -225,7 +238,9 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: return url_results -def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dict]) -> list[str]: +def remove_remote_duplicates( + url_results: list[str], label_studio_data: list[dict] +) -> list[str]: """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. Args: @@ -238,7 +253,9 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic try: remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] except TypeError: - print("Invalid Label Studio credentials. Database could not be checked for duplicates.") + print( + "Invalid Label Studio credentials. Database could not be checked for duplicates." 
+ ) return url_results remote_urls = set(remote_urls) @@ -254,10 +271,11 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic def handle_csv_and_upload( - common_crawl_result: CommonCrawlResult, - huggingface_api_manager: HuggingFaceAPIManager, - args: argparse.Namespace, - last_page: int): + common_crawl_result: CommonCrawlResult, + huggingface_api_manager: HuggingFaceAPIManager, + args: argparse.Namespace, + last_page: int, +): """ Handles the CSV file and uploads it to Hugging Face repository. Args: @@ -270,29 +288,27 @@ def handle_csv_and_upload( batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) csv_manager = CSVManager( - file_name=batch_info.filename, - headers=['url'], - directory=args.data_dir + file_name=batch_info.filename, headers=["url"], directory=args.data_dir ) csv_manager.add_rows(common_crawl_result.url_results) huggingface_api_manager.upload_file( local_file_path=csv_manager.file_path, - repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}" + repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}", ) print( - f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}") + f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}" + ) csv_manager.delete_file() def process_crawl_and_upload( - args: argparse.Namespace, - last_page: int, - huggingface_api_manager: HuggingFaceAPIManager, - label_studio_data: list[dict]) -> CommonCrawlResult: + args: argparse.Namespace, + last_page: int, + huggingface_api_manager: HuggingFaceAPIManager, + label_studio_data: list[dict], +) -> CommonCrawlResult: # Initialize the CommonCrawlerManager - crawler_manager = CommonCrawlerManager( - args.common_crawl_id - ) + crawler_manager = CommonCrawlerManager(args.common_crawl_id) # Determine the pages to search, based on the last page searched start_page = last_page + 1 # Use the parsed arguments @@ -300,7 +316,7 @@ def process_crawl_and_upload( search_term=args.url, keyword=args.keyword, num_pages=args.pages, - start_page=start_page + start_page=start_page, ) # Logic should conclude here if no results are found if not common_crawl_result.url_results: @@ -309,10 +325,16 @@ def process_crawl_and_upload( return common_crawl_result print("Removing urls already in the database") - common_crawl_result.url_results = remove_local_duplicates(common_crawl_result.url_results) - common_crawl_result.url_results = remove_remote_duplicates(common_crawl_result.url_results, label_studio_data) + common_crawl_result.url_results = remove_local_duplicates( + common_crawl_result.url_results + ) + common_crawl_result.url_results = remove_remote_duplicates( + common_crawl_result.url_results, label_studio_data + ) if not common_crawl_result.url_results: - print("No urls not already present in the database found. Ceasing main execution.") + print( + "No urls not already present in the database found. Ceasing main execution." + ) add_batch_info_to_csv(common_crawl_result, args, last_page) return common_crawl_result diff --git a/common_crawler/utils.py b/common_crawler/utils.py index 0848b02..3cea7af 100644 --- a/common_crawler/utils.py +++ b/common_crawler/utils.py @@ -12,7 +12,7 @@ def __init__(self, url): self.url = url def add_parameter(self, parameter, value): - if '?' in self.url: + if "?" 
in self.url: self.url += f"&{parameter}={value}" else: self.url += f"?{parameter}={value}" diff --git a/source_collectors/ckan/ckan_scraper_toolkit.py b/source_collectors/ckan/ckan_scraper_toolkit.py index 0d9dc44..5898c9f 100644 --- a/source_collectors/ckan/ckan_scraper_toolkit.py +++ b/source_collectors/ckan/ckan_scraper_toolkit.py @@ -1,4 +1,5 @@ """Toolkit of functions that use ckanapi to retrieve packages from CKAN data portals""" + from concurrent.futures import as_completed, ThreadPoolExecutor from dataclasses import dataclass, field from datetime import datetime @@ -150,10 +151,7 @@ def ckan_collection_search(base_url: str, collection_id: str) -> list[Package]: for dataset_content in soup.find_all(class_="dataset-content") ] - [ - packages.append(package.result()) - for package in as_completed(futures) - ] + [packages.append(package.result()) for package in as_completed(futures)] # Take a break to avoid being timed out if len(futures) >= 15: @@ -186,10 +184,12 @@ def _collection_search_get_package_data(dataset_content, base_url: str): record_format.text.strip() for record_format in dataset_content.find_all("li") ] package.record_format = list(set(package.record_format)) - + date = dataset_soup.find(property="dct:modified").text.strip() - package.source_last_updated = datetime.strptime(date, "%B %d, %Y").strftime("%Y-%d-%m") - + package.source_last_updated = datetime.strptime(date, "%B %d, %Y").strftime( + "%Y-%d-%m" + ) + return package diff --git a/source_collectors/ckan/scrape_ckan_data_portals.py b/source_collectors/ckan/scrape_ckan_data_portals.py index ef83b4d..57bd992 100644 --- a/source_collectors/ckan/scrape_ckan_data_portals.py +++ b/source_collectors/ckan/scrape_ckan_data_portals.py @@ -1,4 +1,5 @@ """Retrieves packages from CKAN data portals and parses relevant information then outputs to a CSV file""" + from itertools import chain import json import sys diff --git a/source_collectors/ckan/search_terms.py b/source_collectors/ckan/search_terms.py index 7fdbc34..179e58d 100644 --- a/source_collectors/ckan/search_terms.py +++ b/source_collectors/ckan/search_terms.py @@ -11,7 +11,7 @@ {"url": "https://open.jacksonms.gov/", "terms": ["tags:police"]}, {"url": "https://data.milwaukee.gov/", "terms": ["mpd", "wibr"]}, {"url": "https://data.sanantonio.gov/", "terms": ["sapd"]}, - {"url": "https://data.sanjoseca.gov/", "terms": ["police"]} + {"url": "https://data.sanjoseca.gov/", "terms": ["police"]}, ] group_search = [ diff --git a/source_collectors/common_crawler/argparser.py b/source_collectors/common_crawler/argparser.py index 8cdf5b7..67f4a29 100644 --- a/source_collectors/common_crawler/argparser.py +++ b/source_collectors/common_crawler/argparser.py @@ -7,6 +7,7 @@ for the Common Crawler script. """ + def valid_common_crawl_id(common_crawl_id: str) -> bool: """ Validate the Common Crawl ID format. @@ -16,7 +17,8 @@ def valid_common_crawl_id(common_crawl_id: str) -> bool: Returns: True if the Common Crawl ID is valid, False otherwise """ - return re.match(r'CC-MAIN-\d{4}-\d{2}', common_crawl_id) is not None + return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None + def parse_args() -> argparse.Namespace: """ @@ -33,22 +35,41 @@ def parse_args() -> argparse.Namespace: """ parser = argparse.ArgumentParser( - description='Query the Common Crawl dataset and optionally save the results to a file.') + description="Query the Common Crawl dataset and optionally save the results to a file." 
+ ) # Add the required arguments - parser.add_argument('common_crawl_id', type=str, help='The Common Crawl ID') - parser.add_argument('url', type=str, help='The URL to query') - parser.add_argument('keyword', type=str, help='The keyword to search in the url') + parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID") + parser.add_argument("url", type=str, help="The URL to query") + parser.add_argument("keyword", type=str, help="The keyword to search in the url") # Optional arguments for the number of pages and the output file, and a flag to reset the cache - parser.add_argument('-c', '--config', type=str, default='config.ini', help='The configuration file to use') - parser.add_argument('-p', '--pages', type=int, default=1, help='The number of pages to search (default: 1)') - parser.add_argument('--reset-cache', action='store_true', default=False, - help='Reset the cache before starting the crawl') + parser.add_argument( + "-c", + "--config", + type=str, + default="config.ini", + help="The configuration file to use", + ) + parser.add_argument( + "-p", + "--pages", + type=int, + default=1, + help="The number of pages to search (default: 1)", + ) + parser.add_argument( + "--reset-cache", + action="store_true", + default=False, + help="Reset the cache before starting the crawl", + ) args = parser.parse_args() # Validate the Common Crawl ID format if not valid_common_crawl_id(args.common_crawl_id): - parser.error("Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW.") + parser.error( + "Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW." + ) # Read the configuration file config = configparser.ConfigParser() @@ -56,7 +77,7 @@ def parse_args() -> argparse.Namespace: # Combine parsed arguments with configuration file defaults app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) - app_parser.set_defaults(**config['DEFAULT']) + app_parser.set_defaults(**config["DEFAULT"]) app_args = app_parser.parse_args() diff --git a/source_collectors/common_crawler/cache.py b/source_collectors/common_crawler/cache.py index 2a48c0b..23d5881 100644 --- a/source_collectors/common_crawler/cache.py +++ b/source_collectors/common_crawler/cache.py @@ -8,11 +8,13 @@ - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results """ + class CommonCrawlerCacheManager: """ A class for managing the cache of Common Crawl search results. This class is responsible for adding, retrieving, and saving cache data. """ + def __init__(self, file_name: str = "cache", directory=None): """ Initializes the CacheStorage object with a file name and directory. @@ -41,7 +43,6 @@ def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: self.cache[index][url] = {} self.cache[index][url][keyword] = last_page - def get(self, index, url, keyword) -> int: """ Retrieves a page number from the cache. @@ -53,12 +54,15 @@ def get(self, index, url, keyword) -> int: Returns: int - the last page crawled """ - if index in self.cache and url in self.cache[index] and keyword in self.cache[index][url]: + if ( + index in self.cache + and url in self.cache[index] + and keyword in self.cache[index][url] + ): return self.cache[index][url][keyword] # The cache object does not exist. Return 0 as the default value. return 0 - def load_or_create_cache(self) -> dict: """ Loads the cache from the configured file path. 
@@ -66,12 +70,11 @@ def load_or_create_cache(self) -> dict: Returns: dict - the cache data """ try: - with open(self.file_path, 'r') as file: + with open(self.file_path, "r") as file: return json.load(file) except FileNotFoundError: return {} - def save_cache(self) -> None: """ Converts the cache object into a JSON-serializable format and saves it to the configured file path. @@ -79,10 +82,9 @@ def save_cache(self) -> None: persistence of crawl data across sessions. """ # Reformat cache data for JSON serialization - with open(self.file_path, 'w') as file: + with open(self.file_path, "w") as file: json.dump(self.cache, file, indent=4) - def reset_cache(self) -> None: """ Resets the cache to an empty state. diff --git a/source_collectors/common_crawler/crawler.py b/source_collectors/common_crawler/crawler.py index 9afba7d..0982ca5 100644 --- a/source_collectors/common_crawler/crawler.py +++ b/source_collectors/common_crawler/crawler.py @@ -16,7 +16,6 @@ # TODO: What happens when no results are found? How does the CommonCrawlerManager handle this? - @dataclass class CommonCrawlResult: last_page_search: int @@ -31,16 +30,17 @@ class CommonCrawlerManager: It validates crawl ids, manages pagination, and aggregates results. """ - def __init__(self, crawl_id='CC-MAIN-2023-50'): + def __init__(self, crawl_id="CC-MAIN-2023-50"): self.crawl_id = crawl_id - CC_INDEX_SERVER = 'http://index.commoncrawl.org/' - INDEX_NAME = f'{self.crawl_id}-index' - self.root_url = f'{CC_INDEX_SERVER}{INDEX_NAME}' + CC_INDEX_SERVER = "http://index.commoncrawl.org/" + INDEX_NAME = f"{self.crawl_id}-index" + self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}" def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult: print( f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages," - f" starting at page {start_page}") + f" starting at page {start_page}" + ) url_results = [] @@ -64,7 +64,9 @@ def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResul return CommonCrawlResult(last_page, url_results) - def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]: + def search_common_crawl_index( + self, url: str, page: int = 0, max_retries: int = 20 + ) -> list[dict]: """ This method is used to search the Common Crawl index for a given URL and page number Args: @@ -76,9 +78,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = """ encoded_url = quote_plus(url) search_url = URLWithParameters(self.root_url) - search_url.add_parameter('url', encoded_url) - search_url.add_parameter('output', 'json') - search_url.add_parameter('page', page) + search_url.add_parameter("url", encoded_url) + search_url.add_parameter("output", "json") + search_url.add_parameter("page", page) retries = 0 delay = 1 @@ -90,7 +92,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = return self.process_response(response, url, page) retries += 1 - print(f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})") + print( + f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})" + ) time.sleep(delay) print(f"Max retries exceeded. 
Failed to get records for {url} on page {page}.") @@ -106,19 +110,24 @@ def make_request(self, search_url: str) -> requests.Response: response.raise_for_status() return response except requests.exceptions.RequestException as e: - if response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR and 'SlowDown' in response.text: + if ( + response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR + and "SlowDown" in response.text + ): return None else: print(f"Failed to get records: {e}") return None - def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]: + def process_response( + self, response: requests.Response, url: str, page: int + ) -> list[dict]: """Processes the HTTP response and returns the parsed records if successful.""" if response.status_code == HTTPStatus.OK: - records = response.text.strip().split('\n') + records = response.text.strip().split("\n") print(f"Found {len(records)} records for {url} on page {page}") return [json.loads(record) for record in records] - elif 'First Page is 0, Last Page is 0' in response.text: + elif "First Page is 0, Last Page is 0" in response.text: print("No records exist in index matching the url search term") return None else: @@ -127,4 +136,4 @@ def process_response(self, response: requests.Response, url: str, page: int) -> @staticmethod def get_urls_with_keyword(records: list[dict], keyword) -> list[str]: - return [record['url'] for record in records if keyword in record['url']] + return [record["url"] for record in records if keyword in record["url"]] diff --git a/source_collectors/common_crawler/csv_manager.py b/source_collectors/common_crawler/csv_manager.py index 6986862..2b823b4 100644 --- a/source_collectors/common_crawler/csv_manager.py +++ b/source_collectors/common_crawler/csv_manager.py @@ -10,12 +10,7 @@ class CSVManager: Creates the file if it doesn't exist, and provides a method for adding new rows. 
""" - def __init__( - self, - file_name: str, - headers: list[str], - directory=None - ): + def __init__(self, file_name: str, headers: list[str], directory=None): self.file_path = get_file_path(f"{file_name}.csv", directory) self.headers = headers if not os.path.exists(self.file_path): @@ -29,9 +24,9 @@ def add_row(self, row_values: list[str] | tuple[str]): """ if isinstance(row_values, str): # Single values must be converted to a list format - row_values = [row_values] + row_values = [row_values] try: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(row_values) except Exception as e: @@ -45,9 +40,7 @@ def add_rows(self, results: list[list[str]]) -> None: Returns: None """ for result in results: - self.add_row( - result - ) + self.add_row(result) print(f"{len(results)} URLs written to {self.file_path}") def initialize_file(self): @@ -59,15 +52,17 @@ def initialize_file(self): file_exists = os.path.isfile(self.file_path) if not file_exists: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(self.headers) else: # Open and check that headers match - with open(self.file_path, mode='r', encoding='utf-8') as file: + with open(self.file_path, mode="r", encoding="utf-8") as file: header_row = next(csv.reader(file)) if header_row != self.headers: - raise ValueError(f"Header row in {self.file_path} does not match expected headers") + raise ValueError( + f"Header row in {self.file_path} does not match expected headers" + ) print(f"CSV file initialized at {self.file_path}") def delete_file(self): diff --git a/source_collectors/common_crawler/main.py b/source_collectors/common_crawler/main.py index ae27f55..b9dd012 100644 --- a/source_collectors/common_crawler/main.py +++ b/source_collectors/common_crawler/main.py @@ -10,7 +10,7 @@ # The below code sets the working directory to be the root of the entire repository # This is done to solve otherwise quite annoying import issues. 
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from util.huggingface_api_manager import HuggingFaceAPIManager from util.miscellaneous_functions import get_filename_friendly_timestamp @@ -35,30 +35,34 @@ class BatchInfo: notes: str filename: str + class LabelStudioError(Exception): """Custom exception for Label Studio Errors""" + pass -BATCH_HEADERS = ['Datetime', 'Source', 'Count', 'Keywords', 'Notes', 'Filename'] + +BATCH_HEADERS = ["Datetime", "Source", "Count", "Keywords", "Notes", "Filename"] + def get_current_time(): return str(datetime.now()) -def add_batch_info_to_csv(common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int) -> BatchInfo: +def add_batch_info_to_csv( + common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int +) -> BatchInfo: batch_info = BatchInfo( datetime=get_current_time(), source="Common Crawl", count=str(len(common_crawl_result.url_results)), keywords=f"{args.url} - {args.keyword}", notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", - filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}" + filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}", ) batch_info_csv_manager = CSVManager( - file_name='batch_info', - directory=args.data_dir, - headers=BATCH_HEADERS + file_name="batch_info", directory=args.data_dir, headers=BATCH_HEADERS ) batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) @@ -71,12 +75,11 @@ def main(): # Initialize the Cache cache_manager = CommonCrawlerCacheManager( - file_name=args.cache_filename, - directory=args.data_dir + file_name=args.cache_filename, directory=args.data_dir ) load_dotenv() - + # Initialize the HuggingFace API Manager hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") if not hf_access_token: @@ -84,10 +87,10 @@ def main(): "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://huggingface.co/settings/tokens and ensure you have write access to " - "https://huggingface.co/PDAP. Then include in .env file in root directory.") + "https://huggingface.co/PDAP. Then include in .env file in root directory." + ) huggingface_api_manager = HuggingFaceAPIManager( - access_token=hf_access_token, - repo_id=args.huggingface_repo_id + access_token=hf_access_token, repo_id=args.huggingface_repo_id ) ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") if not ls_access_token: @@ -95,13 +98,15 @@ def main(): "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://app.heartex.com/user/account and ensure you have read access to " - "https://app.heartex.com/projects/61550. Then include in .env file in root directory.") + "https://app.heartex.com/projects/61550. Then include in .env file in root directory." + ) ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") if not ls_project_id: raise ValueError( "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " "Please obtain a project ID by navigating to the Label Studio project " - "where it will be visibile in the url. Then include in .env file in root directory.") + "where it will be visibile in the url. Then include in .env file in root directory." 
+ ) try: print("Retrieving Label Studio data for deduplication") @@ -119,7 +124,9 @@ def main(): try: # Retrieve the last page from the cache, or 0 if it does not exist last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) - common_crawl_result = process_crawl_and_upload(args, last_page, huggingface_api_manager, label_studio_results) + common_crawl_result = process_crawl_and_upload( + args, last_page, huggingface_api_manager, label_studio_results + ) except ValueError as e: print(f"Error during crawling: {e}") return @@ -129,12 +136,14 @@ def main(): index=args.common_crawl_id, url=args.url, keyword=args.keyword, - last_page=common_crawl_result.last_page_search) + last_page=common_crawl_result.last_page_search, + ) cache_manager.save_cache() except ValueError as e: print(f"Error while saving cache manager: {e}") + def handle_remote_results_error(remote_results): """ Handles errors in the remote results @@ -151,6 +160,7 @@ def handle_remote_results_error(remote_results): else: raise LabelStudioError(f"Unexpected error: {remote_results}") + def validate_remote_results(remote_results): """ Validates the remote results retrieved from the Label Studio project @@ -166,7 +176,9 @@ def validate_remote_results(remote_results): print("No data in Label Studio project.") return [] elif "url" not in remote_results[0]["data"]: - raise LabelStudioError("Column 'url' not present in Label Studio project. Exiting...") + raise LabelStudioError( + "Column 'url' not present in Label Studio project. Exiting..." + ) else: return remote_results elif isinstance(remote_results, dict): @@ -174,6 +186,7 @@ def validate_remote_results(remote_results): else: raise LabelStudioError("Unexpected response type.") + def get_ls_data() -> list[dict] | None: """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. @@ -190,14 +203,14 @@ def get_ls_data() -> list[dict] | None: def strip_url(url: str) -> str: - """Strips http(s)://www. from the beginning of a url if applicable. + """Strips http(s)://www. from the beginning of a url if applicable. Args: url (str): The URL to strip. Returns: str: The stripped URL. - """ + """ result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) return result @@ -210,7 +223,7 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: Returns: list[str]: List of unique URLs. - """ + """ stripped_url_results = [strip_url(url) for url in url_results] unique_urls = collections.deque() adjust = 0 @@ -225,7 +238,9 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: return url_results -def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dict]) -> list[str]: +def remove_remote_duplicates( + url_results: list[str], label_studio_data: list[dict] +) -> list[str]: """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. Args: @@ -238,7 +253,9 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic try: remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] except TypeError: - print("Invalid Label Studio credentials. Database could not be checked for duplicates.") + print( + "Invalid Label Studio credentials. Database could not be checked for duplicates." 
+ ) return url_results remote_urls = set(remote_urls) @@ -254,10 +271,11 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic def handle_csv_and_upload( - common_crawl_result: CommonCrawlResult, - huggingface_api_manager: HuggingFaceAPIManager, - args: argparse.Namespace, - last_page: int): + common_crawl_result: CommonCrawlResult, + huggingface_api_manager: HuggingFaceAPIManager, + args: argparse.Namespace, + last_page: int, +): """ Handles the CSV file and uploads it to Hugging Face repository. Args: @@ -270,29 +288,27 @@ def handle_csv_and_upload( batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) csv_manager = CSVManager( - file_name=batch_info.filename, - headers=['url'], - directory=args.data_dir + file_name=batch_info.filename, headers=["url"], directory=args.data_dir ) csv_manager.add_rows(common_crawl_result.url_results) huggingface_api_manager.upload_file( local_file_path=csv_manager.file_path, - repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}" + repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}", ) print( - f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}") + f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}" + ) csv_manager.delete_file() def process_crawl_and_upload( - args: argparse.Namespace, - last_page: int, - huggingface_api_manager: HuggingFaceAPIManager, - label_studio_data: list[dict]) -> CommonCrawlResult: + args: argparse.Namespace, + last_page: int, + huggingface_api_manager: HuggingFaceAPIManager, + label_studio_data: list[dict], +) -> CommonCrawlResult: # Initialize the CommonCrawlerManager - crawler_manager = CommonCrawlerManager( - args.common_crawl_id - ) + crawler_manager = CommonCrawlerManager(args.common_crawl_id) # Determine the pages to search, based on the last page searched start_page = last_page + 1 # Use the parsed arguments @@ -300,7 +316,7 @@ def process_crawl_and_upload( search_term=args.url, keyword=args.keyword, num_pages=args.pages, - start_page=start_page + start_page=start_page, ) # Logic should conclude here if no results are found if not common_crawl_result.url_results: @@ -309,10 +325,16 @@ def process_crawl_and_upload( return common_crawl_result print("Removing urls already in the database") - common_crawl_result.url_results = remove_local_duplicates(common_crawl_result.url_results) - common_crawl_result.url_results = remove_remote_duplicates(common_crawl_result.url_results, label_studio_data) + common_crawl_result.url_results = remove_local_duplicates( + common_crawl_result.url_results + ) + common_crawl_result.url_results = remove_remote_duplicates( + common_crawl_result.url_results, label_studio_data + ) if not common_crawl_result.url_results: - print("No urls not already present in the database found. Ceasing main execution.") + print( + "No urls not already present in the database found. Ceasing main execution." + ) add_batch_info_to_csv(common_crawl_result, args, last_page) return common_crawl_result diff --git a/source_collectors/common_crawler/utils.py b/source_collectors/common_crawler/utils.py index 0848b02..3cea7af 100644 --- a/source_collectors/common_crawler/utils.py +++ b/source_collectors/common_crawler/utils.py @@ -12,7 +12,7 @@ def __init__(self, url): self.url = url def add_parameter(self, parameter, value): - if '?' in self.url: + if "?" 
in self.url: self.url += f"&{parameter}={value}" else: self.url += f"?{parameter}={value}" diff --git a/source_collectors/muckrock/convert_all_record_types_to_csv.py b/source_collectors/muckrock/convert_all_record_types_to_csv.py index be6d536..30acdbb 100644 --- a/source_collectors/muckrock/convert_all_record_types_to_csv.py +++ b/source_collectors/muckrock/convert_all_record_types_to_csv.py @@ -1,12 +1,43 @@ -import subprocess -import os +# import subprocess +# import os -record_types = ['accident reports', 'arrest records', 'calls for service', 'car gps', 'citations', 'dispatch logs', 'dispatch recordings', - 'field contacts', 'incident reports', 'misc police activity', 'officer involved shootings', 'stops', 'surveys', 'use of force reports', - 'vehicle pursuits', 'complaints and misconduct', 'daily activity logs', 'training and hiring info', 'personnel records', 'annual and monthly reports', - 'budgets and finances', 'contact info and agency meta', 'geographic', 'list of data sources', 'policies and contracts', 'crime maps and reports', - 'crime statistics', 'media bulletins', 'records request info', 'resources', 'sex offender registry', 'wanted persons', 'booking reports', - 'court cases', 'incarceration records'] +record_types = [ + "accident reports", + "arrest records", + "calls for service", + "car gps", + "citations", + "dispatch logs", + "dispatch recordings", + "field contacts", + "incident reports", + "misc police activity", + "officer involved shootings", + "stops", + "surveys", + "use of force reports", + "vehicle pursuits", + "complaints and misconduct", + "daily activity logs", + "training and hiring info", + "personnel records", + "annual and monthly reports", + "budgets and finances", + "contact info and agency meta", + "geographic", + "list of data sources", + "policies and contracts", + "crime maps and reports", + "crime statistics", + "media bulletins", + "records request info", + "resources", + "sex offender registry", + "wanted persons", + "booking reports", + "court cases", + "incarceration records", +] print(len(record_types)) # json_files = [] diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py index 4480105..4adc555 100644 --- a/source_collectors/muckrock/create_foia_data_db.py +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -1,4 +1,4 @@ -''' +""" create_foia_data_db.py This script fetches data from the MuckRock FOIA API and stores it in a SQLite database. @@ -17,8 +17,7 @@ Error Handling: Errors encountered during API requests or database operations are logged to an `errors.log` file and/or printed to the console. 
-''' - +""" import requests import sqlite3 @@ -28,18 +27,19 @@ import time from typing import List, Tuple, Dict, Any, Union, Literal -logging.basicConfig(filename='errors.log', level=logging.ERROR, - format='%(levelname)s: %(message)s') +logging.basicConfig( + filename="errors.log", level=logging.ERROR, format="%(levelname)s: %(message)s" +) -base_url = 'https://www.muckrock.com/api_v1/foia/' -last_page_fetched = 'last_page_fetched.txt' +base_url = "https://www.muckrock.com/api_v1/foia/" +last_page_fetched = "last_page_fetched.txt" NO_MORE_DATA = -1 # flag for program exit JSON = Dict[str, Any] # type alias -create_table_query = ''' +create_table_query = """ CREATE TABLE IF NOT EXISTS results ( id INTEGER PRIMARY KEY, title TEXT, @@ -63,20 +63,20 @@ communications TEXT, absolute_url TEXT ) - ''' + """ -foia_insert_query = ''' +foia_insert_query = """ INSERT INTO results (id, title, slug, status, embargo_status, user, username, agency, datetime_submitted, date_due, days_until_due, date_followup, datetime_done, datetime_updated, date_embargo, tracking_id, price, disable_autofollowups, tags, communications, absolute_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - ''' + """ def create_db() -> bool: - ''' + """ Creates foia_data.db SQLite database with one table named `results`. Returns: @@ -84,23 +84,22 @@ def create_db() -> bool: Raises: sqlite3.Error: If the table creation operation fails, prints error and returns False. - ''' + """ try: - with sqlite3.connect('foia_data.db') as conn: + with sqlite3.connect("foia_data.db") as conn: conn.execute(create_table_query) conn.commit() - print('Successfully created foia_data.db!') + print("Successfully created foia_data.db!") return True except sqlite3.Error as e: - print(f'SQLite error: {e}.') - logging.error( - f'Failed to create foia_data.db due to SQLite error: {e}') + print(f"SQLite error: {e}.") + logging.error(f"Failed to create foia_data.db due to SQLite error: {e}") return False def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: - ''' + """ Fetches a page of 100 results from the MuckRock FOIA API. Args: @@ -111,30 +110,33 @@ def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: - JSON Dict[str, Any]: The response's JSON data, if the request is successful. - NO_MORE_DATA (int = -1): A constant, if there are no more pages to fetch (indicated by a 404 response). - None: If there is an error other than 404. 
- ''' + """ per_page = 100 response = requests.get( - base_url, params={'page': page, 'page_size': per_page, 'format': 'json'}) + base_url, params={"page": page, "page_size": per_page, "format": "json"} + ) if response.status_code == 200: return response.json() elif response.status_code == 404: - print('No more pages to fetch') + print("No more pages to fetch") return NO_MORE_DATA # Typically 404 response will mean there are no more pages to fetch elif 500 <= response.status_code < 600: - logging.error(f'Server error {response.status_code} on page {page}') + logging.error(f"Server error {response.status_code} on page {page}") page = page + 1 return fetch_page(page) else: - print(f'Error fetching page {page}: {response.status_code}') - logging.error(f'Fetching page {page} failed with response code: { - response.status_code}') + print(f"Error fetching page {page}: {response.status_code}") + logging.error( + f"Fetching page {page} failed with response code: { + response.status_code}" + ) return None def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: - ''' + """ Transforms the data recieved from the MuckRock FOIA API into a structured format for insertion into a database with `populate_db()`. Transforms JSON input into a list of tuples, as well as serializes the nested `tags` and `communications` fields into JSON strings. @@ -144,43 +146,44 @@ def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: Returns: transformed_data (List[Tuple[Any, ...]]: A list of tuples, where each tuple contains the fields of a single FOIA request. - ''' + """ transformed_data = [] - for result in data_to_transform.get('results', []): - result['tags'] = json.dumps(result.get('tags', [])) - result['communications'] = json.dumps( - result.get('communications', [])) - - transformed_data.append(( - result['id'], - result['title'], - result['slug'], - result['status'], - result['embargo_status'], - result['user'], - result['username'], - result['agency'], - result['datetime_submitted'], - result['date_due'], - result['days_until_due'], - result['date_followup'], - result['datetime_done'], - result['datetime_updated'], - result['date_embargo'], - result['tracking_id'], - result['price'], - result['disable_autofollowups'], - result['tags'], - result['communications'], - result['absolute_url'] - )) + for result in data_to_transform.get("results", []): + result["tags"] = json.dumps(result.get("tags", [])) + result["communications"] = json.dumps(result.get("communications", [])) + + transformed_data.append( + ( + result["id"], + result["title"], + result["slug"], + result["status"], + result["embargo_status"], + result["user"], + result["username"], + result["agency"], + result["datetime_submitted"], + result["date_due"], + result["days_until_due"], + result["date_followup"], + result["datetime_done"], + result["datetime_updated"], + result["date_embargo"], + result["tracking_id"], + result["price"], + result["disable_autofollowups"], + result["tags"], + result["communications"], + result["absolute_url"], + ) + ) return transformed_data def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: - ''' + """ Populates foia_data.db SQLite database with the transfomed FOIA request data. Args: @@ -193,9 +196,9 @@ def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: Raises: sqlite3.Error: If the insertion operation fails, attempts to retry operation (max_retries = 2). If retries are exhausted, logs error and exits. 
- ''' + """ - with sqlite3.connect('foia_data.db') as conn: + with sqlite3.connect("foia_data.db") as conn: retries = 0 max_retries = 2 @@ -203,51 +206,55 @@ def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: try: conn.executemany(foia_insert_query, transformed_data) conn.commit() - print('Successfully inserted data!') + print("Successfully inserted data!") return except sqlite3.Error as e: - print(f'SQLite error: {e}. Retrying...') + print(f"SQLite error: {e}. Retrying...") conn.rollback() retries += 1 time.sleep(1) if retries == max_retries: - print(f'Failed to insert data from page {page} after { - max_retries} attempts. Skipping to next page.') - logging.error(f'Failed to insert data from page {page} after { - max_retries} attempts.') + print( + f"Failed to insert data from page {page} after { + max_retries} attempts. Skipping to next page." + ) + logging.error( + f"Failed to insert data from page {page} after { + max_retries} attempts." + ) def main() -> None: - ''' + """ Main entry point for create_foia_data_db.py. This function orchestrates the process of fetching FOIA requests data from the MuckRock FOIA API, transforming it, and storing it in a SQLite database. - ''' + """ - if not os.path.exists('foia_data.db'): - print('Creating foia_data.db...') + if not os.path.exists("foia_data.db"): + print("Creating foia_data.db...") success = create_db() if success == False: - print('Failed to create foia_data.db') + print("Failed to create foia_data.db") return if os.path.exists(last_page_fetched): - with open(last_page_fetched, mode='r') as file: + with open(last_page_fetched, mode="r") as file: page = int(file.read()) + 1 else: page = 1 while True: - print(f'Fetching page {page}...') + print(f"Fetching page {page}...") page_data = fetch_page(page) if page_data == NO_MORE_DATA: break # Exit program because no more data exixts if page_data is None: - print(f'Skipping page {page}...') + print(f"Skipping page {page}...") page += 1 continue @@ -255,16 +262,18 @@ def main() -> None: populate_db(transformed_data, page) - with open(last_page_fetched, mode='w') as file: + with open(last_page_fetched, mode="w") as file: file.write(str(page)) page += 1 - print('create_foia_data_db.py run finished') + print("create_foia_data_db.py run finished") -if __name__ == '__main__': +if __name__ == "__main__": try: main() except Exception as e: - logging.error(f'An unexpected error occurred: {e}') - print('Check errors.log to review errors. Run create_foia_data_db.py again to continue') + logging.error(f"An unexpected error occurred: {e}") + print( + "Check errors.log to review errors. 
+        )
diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py
index c1a0380..86ede5d 100644
--- a/source_collectors/muckrock/download_muckrock_foia.py
+++ b/source_collectors/muckrock/download_muckrock_foia.py
@@ -12,15 +12,19 @@
 all_data = []
 output_file = "foia_data.json"
 
+
 # Function to fetch data from a specific page
 def fetch_page(page):
-    response = requests.get(base_url, params={"page": page, "page_size": per_page, "format": "json"})
+    response = requests.get(
+        base_url, params={"page": page, "page_size": per_page, "format": "json"}
+    )
     if response.status_code == 200:
         return response.json()
     else:
         print(f"Error fetching page {page}: {response.status_code}")
         return None
 
+
 # Fetch and store data from all pages
 while True:
     print(f"Fetching page {page}...")
@@ -30,14 +34,14 @@
         page += 1
         continue
 
-    all_data.extend(data['results'])
-    if not data['next']:
+    all_data.extend(data["results"])
+    if not data["next"]:
         break
     page += 1
 
 # Write data to JSON
-with open(output_file, mode='w', encoding='utf-8') as json_file:
+with open(output_file, mode="w", encoding="utf-8") as json_file:
     json.dump(all_data, json_file, indent=4)
 
 print(f"Data written to {output_file}")
diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py
index 4d57737..455084a 100644
--- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py
+++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py
@@ -7,23 +7,48 @@
 # Load the JSON data
 parser = argparse.ArgumentParser(description="Parse JSON from a file.")
-parser.add_argument('--json_file', type=str, required=True,
-                    help="Path to the JSON file")
+parser.add_argument(
+    "--json_file", type=str, required=True, help="Path to the JSON file"
+)
 args = parser.parse_args()
 
-with open(args.json_file, 'r') as f:
+with open(args.json_file, "r") as f:
     json_data = json.load(f)
 
 # Define the CSV headers
 headers = [
-    "name", "agency_described", "record_type", "description", "source_url",
-    "readme_url", "scraper_url", "state", "county", "municipality",
-    "agency_type", "jurisdiction_type", "View Archive", "agency_aggregation",
-    "agency_supplied", "supplying_entity", "agency_originated", "originating_agency",
-    "coverage_start", "source_last_updated", "coverage_end", "number_of_records_available",
-    "size", "access_type", "data_portal_type", "access_notes", "record_format", "update_frequency",
-    "update_method", "retention_schedule", "detail_level"
+    "name",
+    "agency_described",
+    "record_type",
+    "description",
+    "source_url",
+    "readme_url",
+    "scraper_url",
+    "state",
+    "county",
+    "municipality",
+    "agency_type",
+    "jurisdiction_type",
+    "View Archive",
+    "agency_aggregation",
+    "agency_supplied",
+    "supplying_entity",
+    "agency_originated",
+    "originating_agency",
+    "coverage_start",
+    "source_last_updated",
+    "coverage_end",
+    "number_of_records_available",
+    "size",
+    "access_type",
+    "data_portal_type",
+    "access_notes",
+    "record_format",
+    "update_frequency",
+    "update_method",
+    "retention_schedule",
+    "detail_level",
 ]
 
 
@@ -59,7 +84,7 @@ def get_jurisdiction(jurisdiction_id):
 output_csv = format_filename_json_to_csv(args.json_file)
 
 # Open a CSV file for writing
-with open(output_csv, 'w', newline='') as csvfile:
+with open(output_csv, "w", newline="") as csvfile:
     writer = csv.DictWriter(csvfile, fieldnames=headers)
 
     # Write the header row
@@ -87,8 +112,7 @@ def get_jurisdiction(jurisdiction_id):
             juris_type = "state"
         # local jurisdiction level
         if jurisdiction_level == "l":
-            parent_juris_data = get_jurisdiction(
-                jurisdiction_data.get("parent"))
+            parent_juris_data = get_jurisdiction(jurisdiction_data.get("parent"))
             state = parent_juris_data.get("abbrev")
             if "County" in jurisdiction_data.get("name"):
                 county = jurisdiction_data.get("name")
@@ -99,24 +123,24 @@ def get_jurisdiction(jurisdiction_id):
                 municipality = jurisdiction_data.get("name")
             juris_type = "local"
 
-        if 'Police' in agency_data.get("types"):
-            agency_type = 'law enforcement/police'
+        if "Police" in agency_data.get("types"):
+            agency_type = "law enforcement/police"
         else:
-            agency_type = ''
+            agency_type = ""
 
-        source_url = ''
+        source_url = ""
         absolute_url = item.get("absolute_url")
-        access_type = ''
+        access_type = ""
         for comm in item["communications"]:
             if comm["files"]:
-                source_url = absolute_url + '#files'
-                access_type = 'Web page,Download,API'
+                source_url = absolute_url + "#files"
+                access_type = "Web page,Download,API"
                 break
 
         # Extract the relevant fields from the JSON object
        csv_row = {
             "name": item.get("title", ""),
-            "agency_described": agency_data.get("name", "") + ' - ' + state,
+            "agency_described": agency_data.get("name", "") + " - " + state,
             "record_type": "",
             "description": "",
             "source_url": source_url,
@@ -145,7 +169,7 @@ def get_jurisdiction(jurisdiction_id):
             "update_frequency": "",
             "update_method": "",
             "retention_schedule": "",
-            "detail_level": ""
+            "detail_level": "",
         }
 
         # Write the extracted row to the CSV file
diff --git a/source_collectors/muckrock/get_all_record_types.py b/source_collectors/muckrock/get_all_record_types.py
index bcc8c0b..6fa955d 100644
--- a/source_collectors/muckrock/get_all_record_types.py
+++ b/source_collectors/muckrock/get_all_record_types.py
@@ -1,17 +1,50 @@
 import subprocess
 
-record_types = ['accident reports', 'arrest records', 'calls for service', 'car gps', 'citations', 'dispatch logs', 'dispatch recordings',
-                'field contacts', 'incident reports', 'misc police activity', 'officer involved shootings', 'stops', 'surveys', 'use of force reports',
-                'vehicle pursuits', 'complaints and misconduct', 'daily activity logs', 'training and hiring info', 'personnel records', 'annual and monthly reports',
-                'budgets and finances', 'contact info and agency meta', 'geographic', 'list of data sources', 'policies and contracts', 'crime maps and reports',
-                'crime statistics', 'media bulletins', 'records request info', 'resources', 'sex offender registry', 'wanted persons', 'booking reports',
-                'court cases', 'incarceration records']
+record_types = [
+    "accident reports",
+    "arrest records",
+    "calls for service",
+    "car gps",
+    "citations",
+    "dispatch logs",
+    "dispatch recordings",
+    "field contacts",
+    "incident reports",
+    "misc police activity",
+    "officer involved shootings",
+    "stops",
+    "surveys",
+    "use of force reports",
+    "vehicle pursuits",
+    "complaints and misconduct",
+    "daily activity logs",
+    "training and hiring info",
+    "personnel records",
+    "annual and monthly reports",
+    "budgets and finances",
+    "contact info and agency meta",
+    "geographic",
+    "list of data sources",
+    "policies and contracts",
+    "crime maps and reports",
+    "crime statistics",
+    "media bulletins",
+    "records request info",
+    "resources",
+    "sex offender registry",
+    "wanted persons",
+    "booking reports",
+    "court cases",
+    "incarceration records",
+]
 
 for record_type in record_types:
-    command = ['python', 'search_foia_data_db.py', '--search_for',
-               record_type]
+    command = ["python", "search_foia_data_db.py", "--search_for", record_type]
     try:
         subprocess.run(command, check=True)
     except subprocess.CalledProcessError as e:
-        print(f'An error occurred while executing the command for "{
-            record_type}": {e}')
+        print(
+            f'An error occurred while executing the command for "{
+            record_type}": {e}'
+        )
diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py
index 96cde83..02f7a4e 100644
--- a/source_collectors/muckrock/get_allegheny_foias.py
+++ b/source_collectors/muckrock/get_allegheny_foias.py
@@ -2,6 +2,7 @@
 import json
 import time
 
+
 # Function to fetch jurisdiction IDs based on town names from a text file
 def fetch_jurisdiction_ids(town_file, base_url):
     with open(town_file, "r") as file:
@@ -14,12 +15,14 @@ def fetch_jurisdiction_ids(town_file, base_url):
         response = requests.get(url)
         if response.status_code == 200:
             data = response.json()
-            for item in data.get('results', []):
-                if item['name'] in town_names:
-                    jurisdiction_ids[item['name']] = item['id']
+            for item in data.get("results", []):
+                if item["name"] in town_names:
+                    jurisdiction_ids[item["name"]] = item["id"]
             url = data.get("next")
 
-            print(f"Processed page, found {len(jurisdiction_ids)}/{len(town_names)} jurisdictions so far...")
+            print(
+                f"Processed page, found {len(jurisdiction_ids)}/{len(town_names)} jurisdictions so far..."
+            )
             time.sleep(1)  # To respect the rate limit
 
         elif response.status_code == 503:
@@ -31,6 +34,7 @@ def fetch_jurisdiction_ids(town_file, base_url):
 
     return jurisdiction_ids
 
+
 # Function to fetch FOIA data for each jurisdiction ID and save it to a JSON file
 def fetch_foia_data(jurisdiction_ids):
     all_data = []
@@ -42,7 +46,9 @@ def fetch_foia_data(jurisdiction_ids):
                 data = response.json()
                 all_data.extend(data.get("results", []))
                 url = data.get("next")
-                print(f"Fetching records for {name}, {len(all_data)} total records so far...")
+                print(
+                    f"Fetching records for {name}, {len(all_data)} total records so far..."
+                )
                 time.sleep(1)  # To respect the rate limit
             elif response.status_code == 503:
                 print(f"Error 503: Skipping page for {name}")
@@ -57,10 +63,13 @@ def fetch_foia_data(jurisdiction_ids):
 
     print(f"Saved {len(all_data)} records to foia_data_combined.json")
 
+
 # Main function to execute the script
 def main():
     town_file = "allegheny-county-towns.txt"
-    jurisdiction_url = "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126"
+    jurisdiction_url = (
+        "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126"
+    )
 
     # Fetch jurisdiction IDs based on town names
     jurisdiction_ids = fetch_jurisdiction_ids(town_file, jurisdiction_url)
@@ -69,6 +78,7 @@ def main():
     # Fetch FOIA data for each jurisdiction ID
     fetch_foia_data(jurisdiction_ids)
 
+
 # Run the main function
 if __name__ == "__main__":
     main()
diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py
index ed1db45..a0160a8 100644
--- a/source_collectors/muckrock/muck_get.py
+++ b/source_collectors/muckrock/muck_get.py
@@ -14,17 +14,23 @@
 while True:
     # Make the GET request with the search string as a query parameter
-    response = requests.get(base_url, params={"page" : page, "page_size" : per_page, "format": "json"})
+    response = requests.get(
+        base_url, params={"page": page, "page_size": per_page, "format": "json"}
+    )
 
     # Check if the request was successful
     if response.status_code == 200:
         # Parse the JSON response
         data = response.json()
 
-        if not data['results']:
+        if not data["results"]:
             break
 
-        filtered_results = [item for item in data['results'] if search_string.lower() in item['title'].lower()]
+        filtered_results = [
+            item
+            for item in data["results"]
+            if search_string.lower() in item["title"].lower()
+        ]
 
         all_results.extend(filtered_results)
 
@@ -44,7 +50,7 @@
 # Dump list into a JSON file
 json_out_file = search_string.replace(" ", "_") + ".json"
 
-with open(json_out_file, 'w') as json_file:
+with open(json_out_file, "w") as json_file:
     json.dump(all_results, json_file)
 
 print(f"List dumped into {json_out_file}")
diff --git a/source_collectors/muckrock/muckrock_ml_labeler.py b/source_collectors/muckrock/muckrock_ml_labeler.py
index dafd6de..46b6580 100644
--- a/source_collectors/muckrock/muckrock_ml_labeler.py
+++ b/source_collectors/muckrock/muckrock_ml_labeler.py
@@ -11,16 +11,22 @@
 # Load the dataset from command line argument
 parser = argparse.ArgumentParser(description="Load CSV file into a pandas DataFrame.")
-parser.add_argument('--csv_file', type=str, required=True, help="Path to the CSV file")
+parser.add_argument("--csv_file", type=str, required=True, help="Path to the CSV file")
 args = parser.parse_args()
 
 df = pd.read_csv(args.csv_file)
 
 # Combine multiple columns (e.g., 'url', 'html_title', 'h1') into a single text field for each row
-columns_to_combine = ['url_path', 'html_title', 'h1']  # Add other columns here as needed
-df['combined_text'] = df[columns_to_combine].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
+columns_to_combine = [
+    "url_path",
+    "html_title",
+    "h1",
+]  # Add other columns here as needed
+df["combined_text"] = df[columns_to_combine].apply(
+    lambda row: " ".join(row.values.astype(str)), axis=1
+)
 
 # Convert the combined text into a list
-texts = df['combined_text'].tolist()
+texts = df["combined_text"].tolist()
 
 # Tokenize the inputs
 inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
@@ -37,5 +43,5 @@
 predicted_labels = [labels[int(pred)] for pred in predictions]
 
 # Add the predicted labels to the dataframe and save
-df['predicted_label'] = predicted_labels
+df["predicted_label"] = predicted_labels
 df.to_csv("labeled_muckrock_dataset.csv", index=False)
diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py
index ff9aac6..1229059 100644
--- a/source_collectors/muckrock/search_foia_data_db.py
+++ b/source_collectors/muckrock/search_foia_data_db.py
@@ -1,4 +1,4 @@
-'''
+"""
 search_foia_data_db.py
 
 This script provides search functionality for the `foia_data.db` SQLite database. The search looks in `title`s and
@@ -16,8 +16,7 @@
 Error Handling:
     Errors encountered during database operations, JSON parsing, or file writing are printed to the console.
 
-'''
-
+"""
 
 import sqlite3
 import pandas as pd
@@ -26,37 +25,43 @@
 import os
 from typing import Union, List, Dict
 
-check_results_table_query = '''
+check_results_table_query = """
     SELECT name
     FROM sqlite_master
     WHERE (type = 'table') AND (name = 'results')
-    '''
+    """
 
-search_foia_query = '''
+search_foia_query = """
     SELECT *
     FROM results
    WHERE (title LIKE ? OR tags LIKE ?) AND (status = 'done')
-    '''
+    """
 
 
 def parser_init() -> argparse.ArgumentParser:
-    '''
+    """
     Initializes the argument parser for search_foia_data_db.py.
 
     Returns:
         argparse.ArgumentParser: The configured argument parser.
-    '''
+    """
 
     parser = argparse.ArgumentParser(
-        description='Search foia_data.db and generate a JSON file of resulting matches')
+        description="Search foia_data.db and generate a JSON file of resulting matches"
+    )
 
-    parser.add_argument('--search_for', type=str, required=True, metavar='',
-                        help='Provide a string to search foia_data.db')
+    parser.add_argument(
+        "--search_for",
+        type=str,
+        required=True,
+        metavar="",
+        help="Provide a string to search foia_data.db",
+    )
 
     return parser
 
 
 def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]:
-    '''
+    """
     Searches the foia_data.db database for FOIA request entries matching the provided search string.
 
     Args:
@@ -70,35 +75,35 @@ def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]:
     Raises:
         sqlite3.Error: If any database operation fails, prints error and returns None.
         Exception: If any unexpected error occurs, prints error and returns None.
-    '''
+    """
 
     print(f'Searching foia_data.db for "{search_string}"...')
 
     try:
-        with sqlite3.connect('foia_data.db') as conn:
+        with sqlite3.connect("foia_data.db") as conn:
 
             results_table = pd.read_sql_query(check_results_table_query, conn)
 
             if results_table.empty:
-                print('The `results` table does not exist in the database.')
+                print("The `results` table does not exist in the database.")
                 return None
 
-            params = [f'%{search_string}%', f'%{search_string}%']
+            params = [f"%{search_string}%", f"%{search_string}%"]
 
             df = pd.read_sql_query(search_foia_query, conn, params=params)
 
     except sqlite3.Error as e:
-        print(f'Sqlite error: {e}')
+        print(f"Sqlite error: {e}")
         return None
 
     except Exception as e:
-        print(f'An unexpected error occurred: {e}')
+        print(f"An unexpected error occurred: {e}")
         return None
 
     return df
 
 
 def parse_communications_column(communications) -> List[Dict]:
-    '''
+    """
     Parses a communications column value, decoding it from JSON format.
 
     Args:
@@ -110,19 +115,19 @@ def parse_communications_column(communications) -> List[Dict]:
 
     Raises:
         json.JSONDecodeError: If deserialization fails, prints error and returns empty list.
-    '''
+    """
 
     if pd.isna(communications):
         return []
 
     try:
         return json.loads(communications)
     except json.JSONDecodeError as e:
-        print(f'Error decoding JSON: {e}')
+        print(f"Error decoding JSON: {e}")
         return []
 
 
 def generate_json(df: pd.DataFrame, search_string: str) -> None:
-    '''
+    """
     Generates a JSON file from a pandas DataFrame.
 
     Args:
@@ -136,46 +141,49 @@ def generate_json(df: pd.DataFrame, search_string: str) -> None:
 
     Raises:
         Exception: If writing to JSON file operation fails, prints error and returns.
-    '''
+    """
 
-    output_json = f'{search_string.replace(' ', '_')}.json'
+    output_json = f"{search_string.replace(' ', '_')}.json"
 
     try:
-        df.to_json(output_json, orient='records', indent=4)
+        df.to_json(output_json, orient="records", indent=4)
         print(f'Matching entries written to "{output_json}"')
     except Exception as e:
-        print(f'An error occurred while writing JSON: {e}')
+        print(f"An error occurred while writing JSON: {e}")
 
 
 def main() -> None:
-    '''
+    """
     Function to search the foia_data.db database for entries matching a specified search string.
 
     Command Line Args:
         --search_for (str): A string to search for in the `title` and `tags` fields of FOIA requests.
-    '''
+    """
 
     parser = parser_init()
     args = parser.parse_args()
     search_string = args.search_for
 
-    if not os.path.exists('foia_data.db'):
-        print('foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it.')
+    if not os.path.exists("foia_data.db"):
+        print(
+            "foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it."
+        )
         return
 
     df = search_foia_db(search_string)
     if df is None:
         return
 
-    if not df['communications'].empty:
-        df['communications'] = df['communications'].apply(
-            parse_communications_column)
+    if not df["communications"].empty:
+        df["communications"] = df["communications"].apply(parse_communications_column)
 
-    print(f'Found {df.shape[0]} matching entries containing "{
-        search_string}" in the title or tags')
+    print(
+        f'Found {df.shape[0]} matching entries containing "{
+        search_string}" in the title or tags'
+    )
 
     generate_json(df, search_string)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py
index 9e61d49..66e6aca 100644
--- a/source_collectors/muckrock/search_local_foia_json.py
+++ b/source_collectors/muckrock/search_local_foia_json.py
@@ -1,38 +1,44 @@
 import json
 
 # Specify the JSON file path
-json_file = 'foia_data.json'
-search_string = 'use of force'
+json_file = "foia_data.json"
+search_string = "use of force"
 
 # Load the JSON data
-with open(json_file, 'r', encoding='utf-8') as file:
+with open(json_file, "r", encoding="utf-8") as file:
     data = json.load(file)
 
 # List to store matching entries
 matching_entries = []
 
+
 # Function to search within an entry
 def search_entry(entry):
     # Check if 'status' is 'done'
-    if entry.get('status') != 'done':
+    if entry.get("status") != "done":
         return False
-
+
     # Check if 'title' or 'tags' field contains the search string
-    title_match = 'title' in entry and search_string.lower() in entry['title'].lower()
-    tags_match = 'tags' in entry and any(search_string.lower() in tag.lower() for tag in entry['tags'])
-
+    title_match = "title" in entry and search_string.lower() in entry["title"].lower()
+    tags_match = "tags" in entry and any(
+        search_string.lower() in tag.lower() for tag in entry["tags"]
+    )
+
     return title_match or tags_match
 
+
 # Iterate through the data and collect matching entries
 for entry in data:
     if search_entry(entry):
         matching_entries.append(entry)
 
 # Output the results
-print(f"Found {len(matching_entries)} entries containing '{search_string}' in the title or tags.")
+print(
+    f"Found {len(matching_entries)} entries containing '{search_string}' in the title or tags."
+)
 
 # Optionally, write matching entries to a new JSON file
-with open('matching_entries.json', 'w', encoding='utf-8') as file:
+with open("matching_entries.json", "w", encoding="utf-8") as file:
     json.dump(matching_entries, file, indent=4)
 
 print(f"Matching entries written to 'matching_entries.json'")
diff --git a/source_collectors/muckrock/utils.py b/source_collectors/muckrock/utils.py
index ca66dc8..3d8b63d 100644
--- a/source_collectors/muckrock/utils.py
+++ b/source_collectors/muckrock/utils.py
@@ -1,18 +1,17 @@
-'''
+"""
 utils.py
 
 Provides useful functions for muckrock_tools.
 
 Functions:
 - format_filename_json_to_csv()
-'''
-
+"""
 
 import re
 
 
 def format_filename_json_to_csv(json_filename: str) -> str:
-    '''
+    """
     Converts JSON filename format to CSV filename format.
 
     Args:
@@ -21,7 +20,7 @@ def format_filename_json_to_csv(json_filename: str) -> str:
 
     Returns:
         csv_filename (str): A CSV filename string.
-    '''
-    csv_filename = re.sub(r'_(?=[^.]*$)', '-', json_filename[:-5]) + '.csv'
+    """
+    csv_filename = re.sub(r"_(?=[^.]*$)", "-", json_filename[:-5]) + ".csv"
 
     return csv_filename
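
Reviewer note (a hedged sketch, not part of the patch itself): several of the relinted call sites above, such as the logging.error(...) in create_foia_data_db.py and the print(...) calls in get_all_record_types.py and search_foia_data_db.py, keep a line break inside an f-string replacement field (the "{" ends one line and the expression continues on the next). That layout is only accepted on Python 3.12+ (PEP 701); earlier interpreters reject it with a SyntaxError. If the project needs to run on older Pythons, implicit concatenation of adjacent f-string literals produces the same message on any Python 3 release. The values below are illustrative only, not taken from the scripts:

    import logging

    page = 42          # illustrative value, not from the patch
    status_code = 503  # illustrative value, not from the patch

    # Pre-3.12-compatible equivalent of the split-brace form used in the
    # relinted files: adjacent f-string literals are concatenated at compile
    # time, so the logged message is identical.
    logging.error(
        f"Fetching page {page} failed with response code: "
        f"{status_code}"
    )

Whether to fold these onto one line, concatenate as above, or keep the 3.12-only form depends on the interpreter versions the collectors are expected to support.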