From 9d9618aa0d2ac7fb58f7307ac55e936fd661522c Mon Sep 17 00:00:00 2001 From: eddie-m-m Date: Fri, 15 Nov 2024 17:06:58 -0800 Subject: [PATCH] Lint added files --- common_crawler/argparser.py | 43 +++-- common_crawler/cache.py | 16 +- common_crawler/crawler.py | 41 +++-- common_crawler/csv_manager.py | 23 +-- common_crawler/main.py | 108 +++++++----- common_crawler/utils.py | 2 +- .../ckan/ckan_scraper_toolkit.py | 14 +- .../ckan/scrape_ckan_data_portals.py | 1 + source_collectors/ckan/search_terms.py | 2 +- source_collectors/common_crawler/argparser.py | 43 +++-- source_collectors/common_crawler/cache.py | 16 +- source_collectors/common_crawler/crawler.py | 41 +++-- .../common_crawler/csv_manager.py | 23 +-- source_collectors/common_crawler/main.py | 108 +++++++----- source_collectors/common_crawler/utils.py | 2 +- .../convert_all_record_types_to_csv.py | 47 ++++- .../muckrock/create_foia_data_db.py | 165 +++++++++--------- .../muckrock/download_muckrock_foia.py | 12 +- .../generate_detailed_muckrock_csv.py | 68 +++++--- .../muckrock/get_all_record_types.py | 51 +++++- .../muckrock/get_allegheny_foias.py | 22 ++- source_collectors/muckrock/muck_get.py | 14 +- .../muckrock/muckrock_ml_labeler.py | 16 +- .../muckrock/search_foia_data_db.py | 82 +++++---- .../muckrock/search_local_foia_json.py | 26 +-- source_collectors/muckrock/utils.py | 11 +- 26 files changed, 616 insertions(+), 381 deletions(-) diff --git a/common_crawler/argparser.py b/common_crawler/argparser.py index 8cdf5b7..67f4a29 100644 --- a/common_crawler/argparser.py +++ b/common_crawler/argparser.py @@ -7,6 +7,7 @@ for the Common Crawler script. """ + def valid_common_crawl_id(common_crawl_id: str) -> bool: """ Validate the Common Crawl ID format. @@ -16,7 +17,8 @@ def valid_common_crawl_id(common_crawl_id: str) -> bool: Returns: True if the Common Crawl ID is valid, False otherwise """ - return re.match(r'CC-MAIN-\d{4}-\d{2}', common_crawl_id) is not None + return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None + def parse_args() -> argparse.Namespace: """ @@ -33,22 +35,41 @@ def parse_args() -> argparse.Namespace: """ parser = argparse.ArgumentParser( - description='Query the Common Crawl dataset and optionally save the results to a file.') + description="Query the Common Crawl dataset and optionally save the results to a file." 
+ ) # Add the required arguments - parser.add_argument('common_crawl_id', type=str, help='The Common Crawl ID') - parser.add_argument('url', type=str, help='The URL to query') - parser.add_argument('keyword', type=str, help='The keyword to search in the url') + parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID") + parser.add_argument("url", type=str, help="The URL to query") + parser.add_argument("keyword", type=str, help="The keyword to search in the url") # Optional arguments for the number of pages and the output file, and a flag to reset the cache - parser.add_argument('-c', '--config', type=str, default='config.ini', help='The configuration file to use') - parser.add_argument('-p', '--pages', type=int, default=1, help='The number of pages to search (default: 1)') - parser.add_argument('--reset-cache', action='store_true', default=False, - help='Reset the cache before starting the crawl') + parser.add_argument( + "-c", + "--config", + type=str, + default="config.ini", + help="The configuration file to use", + ) + parser.add_argument( + "-p", + "--pages", + type=int, + default=1, + help="The number of pages to search (default: 1)", + ) + parser.add_argument( + "--reset-cache", + action="store_true", + default=False, + help="Reset the cache before starting the crawl", + ) args = parser.parse_args() # Validate the Common Crawl ID format if not valid_common_crawl_id(args.common_crawl_id): - parser.error("Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW.") + parser.error( + "Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW." + ) # Read the configuration file config = configparser.ConfigParser() @@ -56,7 +77,7 @@ def parse_args() -> argparse.Namespace: # Combine parsed arguments with configuration file defaults app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) - app_parser.set_defaults(**config['DEFAULT']) + app_parser.set_defaults(**config["DEFAULT"]) app_args = app_parser.parse_args() diff --git a/common_crawler/cache.py b/common_crawler/cache.py index 2a48c0b..23d5881 100644 --- a/common_crawler/cache.py +++ b/common_crawler/cache.py @@ -8,11 +8,13 @@ - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results """ + class CommonCrawlerCacheManager: """ A class for managing the cache of Common Crawl search results. This class is responsible for adding, retrieving, and saving cache data. """ + def __init__(self, file_name: str = "cache", directory=None): """ Initializes the CacheStorage object with a file name and directory. @@ -41,7 +43,6 @@ def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: self.cache[index][url] = {} self.cache[index][url][keyword] = last_page - def get(self, index, url, keyword) -> int: """ Retrieves a page number from the cache. @@ -53,12 +54,15 @@ def get(self, index, url, keyword) -> int: Returns: int - the last page crawled """ - if index in self.cache and url in self.cache[index] and keyword in self.cache[index][url]: + if ( + index in self.cache + and url in self.cache[index] + and keyword in self.cache[index][url] + ): return self.cache[index][url][keyword] # The cache object does not exist. Return 0 as the default value. return 0 - def load_or_create_cache(self) -> dict: """ Loads the cache from the configured file path. 
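# A minimal, self-contained sketch (illustration only, not lines from this
# commit) of the nested cache layout the CommonCrawlerCacheManager above
# maintains: {crawl_id: {url: {keyword: last_page}}}, persisted as JSON.
# The file name and the sample values below are hypothetical.
import json

cache: dict = {}

def upsert(index: str, url: str, keyword: str, last_page: int) -> None:
    # Create intermediate levels on demand, then record the last page crawled.
    cache.setdefault(index, {}).setdefault(url, {})[keyword] = last_page

def get(index: str, url: str, keyword: str) -> int:
    # Missing entries fall back to page 0, matching the manager's default.
    return cache.get(index, {}).get(url, {}).get(keyword, 0)

upsert("CC-MAIN-2023-50", "*.gov", "police", last_page=3)
assert get("CC-MAIN-2023-50", "*.gov", "police") == 3
assert get("CC-MAIN-2023-50", "*.gov", "sheriff") == 0  # not cached yet

with open("cache_sketch.json", "w") as file:  # hypothetical throwaway file
    json.dump(cache, file, indent=4)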
@@ -66,12 +70,11 @@ def load_or_create_cache(self) -> dict: Returns: dict - the cache data """ try: - with open(self.file_path, 'r') as file: + with open(self.file_path, "r") as file: return json.load(file) except FileNotFoundError: return {} - def save_cache(self) -> None: """ Converts the cache object into a JSON-serializable format and saves it to the configured file path. @@ -79,10 +82,9 @@ def save_cache(self) -> None: persistence of crawl data across sessions. """ # Reformat cache data for JSON serialization - with open(self.file_path, 'w') as file: + with open(self.file_path, "w") as file: json.dump(self.cache, file, indent=4) - def reset_cache(self) -> None: """ Resets the cache to an empty state. diff --git a/common_crawler/crawler.py b/common_crawler/crawler.py index 9afba7d..0982ca5 100644 --- a/common_crawler/crawler.py +++ b/common_crawler/crawler.py @@ -16,7 +16,6 @@ # TODO: What happens when no results are found? How does the CommonCrawlerManager handle this? - @dataclass class CommonCrawlResult: last_page_search: int @@ -31,16 +30,17 @@ class CommonCrawlerManager: It validates crawl ids, manages pagination, and aggregates results. """ - def __init__(self, crawl_id='CC-MAIN-2023-50'): + def __init__(self, crawl_id="CC-MAIN-2023-50"): self.crawl_id = crawl_id - CC_INDEX_SERVER = 'http://index.commoncrawl.org/' - INDEX_NAME = f'{self.crawl_id}-index' - self.root_url = f'{CC_INDEX_SERVER}{INDEX_NAME}' + CC_INDEX_SERVER = "http://index.commoncrawl.org/" + INDEX_NAME = f"{self.crawl_id}-index" + self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}" def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult: print( f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages," - f" starting at page {start_page}") + f" starting at page {start_page}" + ) url_results = [] @@ -64,7 +64,9 @@ def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResul return CommonCrawlResult(last_page, url_results) - def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]: + def search_common_crawl_index( + self, url: str, page: int = 0, max_retries: int = 20 + ) -> list[dict]: """ This method is used to search the Common Crawl index for a given URL and page number Args: @@ -76,9 +78,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = """ encoded_url = quote_plus(url) search_url = URLWithParameters(self.root_url) - search_url.add_parameter('url', encoded_url) - search_url.add_parameter('output', 'json') - search_url.add_parameter('page', page) + search_url.add_parameter("url", encoded_url) + search_url.add_parameter("output", "json") + search_url.add_parameter("page", page) retries = 0 delay = 1 @@ -90,7 +92,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = return self.process_response(response, url, page) retries += 1 - print(f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})") + print( + f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})" + ) time.sleep(delay) print(f"Max retries exceeded. 
Failed to get records for {url} on page {page}.") @@ -106,19 +110,24 @@ def make_request(self, search_url: str) -> requests.Response: response.raise_for_status() return response except requests.exceptions.RequestException as e: - if response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR and 'SlowDown' in response.text: + if ( + response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR + and "SlowDown" in response.text + ): return None else: print(f"Failed to get records: {e}") return None - def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]: + def process_response( + self, response: requests.Response, url: str, page: int + ) -> list[dict]: """Processes the HTTP response and returns the parsed records if successful.""" if response.status_code == HTTPStatus.OK: - records = response.text.strip().split('\n') + records = response.text.strip().split("\n") print(f"Found {len(records)} records for {url} on page {page}") return [json.loads(record) for record in records] - elif 'First Page is 0, Last Page is 0' in response.text: + elif "First Page is 0, Last Page is 0" in response.text: print("No records exist in index matching the url search term") return None else: @@ -127,4 +136,4 @@ def process_response(self, response: requests.Response, url: str, page: int) -> @staticmethod def get_urls_with_keyword(records: list[dict], keyword) -> list[str]: - return [record['url'] for record in records if keyword in record['url']] + return [record["url"] for record in records if keyword in record["url"]] diff --git a/common_crawler/csv_manager.py b/common_crawler/csv_manager.py index 6986862..2b823b4 100644 --- a/common_crawler/csv_manager.py +++ b/common_crawler/csv_manager.py @@ -10,12 +10,7 @@ class CSVManager: Creates the file if it doesn't exist, and provides a method for adding new rows. 
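# A minimal sketch (illustration only, not lines from this commit) of the
# create-if-missing, append-only CSV pattern that the CSVManager above wraps.
# The file name and the sample row are hypothetical.
import csv
import os

path = "urls_sketch.csv"
headers = ["url"]

if not os.path.isfile(path):
    # Write the header row exactly once, when the file is first created.
    with open(path, mode="a", newline="", encoding="utf-8") as file:
        csv.writer(file).writerow(headers)

# Subsequent runs only append data rows.
with open(path, mode="a", newline="", encoding="utf-8") as file:
    csv.writer(file).writerow(["example.com/police-records"])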
""" - def __init__( - self, - file_name: str, - headers: list[str], - directory=None - ): + def __init__(self, file_name: str, headers: list[str], directory=None): self.file_path = get_file_path(f"{file_name}.csv", directory) self.headers = headers if not os.path.exists(self.file_path): @@ -29,9 +24,9 @@ def add_row(self, row_values: list[str] | tuple[str]): """ if isinstance(row_values, str): # Single values must be converted to a list format - row_values = [row_values] + row_values = [row_values] try: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(row_values) except Exception as e: @@ -45,9 +40,7 @@ def add_rows(self, results: list[list[str]]) -> None: Returns: None """ for result in results: - self.add_row( - result - ) + self.add_row(result) print(f"{len(results)} URLs written to {self.file_path}") def initialize_file(self): @@ -59,15 +52,17 @@ def initialize_file(self): file_exists = os.path.isfile(self.file_path) if not file_exists: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(self.headers) else: # Open and check that headers match - with open(self.file_path, mode='r', encoding='utf-8') as file: + with open(self.file_path, mode="r", encoding="utf-8") as file: header_row = next(csv.reader(file)) if header_row != self.headers: - raise ValueError(f"Header row in {self.file_path} does not match expected headers") + raise ValueError( + f"Header row in {self.file_path} does not match expected headers" + ) print(f"CSV file initialized at {self.file_path}") def delete_file(self): diff --git a/common_crawler/main.py b/common_crawler/main.py index ae27f55..b9dd012 100644 --- a/common_crawler/main.py +++ b/common_crawler/main.py @@ -10,7 +10,7 @@ # The below code sets the working directory to be the root of the entire repository # This is done to solve otherwise quite annoying import issues. 
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from util.huggingface_api_manager import HuggingFaceAPIManager from util.miscellaneous_functions import get_filename_friendly_timestamp @@ -35,30 +35,34 @@ class BatchInfo: notes: str filename: str + class LabelStudioError(Exception): """Custom exception for Label Studio Errors""" + pass -BATCH_HEADERS = ['Datetime', 'Source', 'Count', 'Keywords', 'Notes', 'Filename'] + +BATCH_HEADERS = ["Datetime", "Source", "Count", "Keywords", "Notes", "Filename"] + def get_current_time(): return str(datetime.now()) -def add_batch_info_to_csv(common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int) -> BatchInfo: +def add_batch_info_to_csv( + common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int +) -> BatchInfo: batch_info = BatchInfo( datetime=get_current_time(), source="Common Crawl", count=str(len(common_crawl_result.url_results)), keywords=f"{args.url} - {args.keyword}", notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", - filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}" + filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}", ) batch_info_csv_manager = CSVManager( - file_name='batch_info', - directory=args.data_dir, - headers=BATCH_HEADERS + file_name="batch_info", directory=args.data_dir, headers=BATCH_HEADERS ) batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) @@ -71,12 +75,11 @@ def main(): # Initialize the Cache cache_manager = CommonCrawlerCacheManager( - file_name=args.cache_filename, - directory=args.data_dir + file_name=args.cache_filename, directory=args.data_dir ) load_dotenv() - + # Initialize the HuggingFace API Manager hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") if not hf_access_token: @@ -84,10 +87,10 @@ def main(): "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://huggingface.co/settings/tokens and ensure you have write access to " - "https://huggingface.co/PDAP. Then include in .env file in root directory.") + "https://huggingface.co/PDAP. Then include in .env file in root directory." + ) huggingface_api_manager = HuggingFaceAPIManager( - access_token=hf_access_token, - repo_id=args.huggingface_repo_id + access_token=hf_access_token, repo_id=args.huggingface_repo_id ) ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") if not ls_access_token: @@ -95,13 +98,15 @@ def main(): "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://app.heartex.com/user/account and ensure you have read access to " - "https://app.heartex.com/projects/61550. Then include in .env file in root directory.") + "https://app.heartex.com/projects/61550. Then include in .env file in root directory." + ) ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") if not ls_project_id: raise ValueError( "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " "Please obtain a project ID by navigating to the Label Studio project " - "where it will be visibile in the url. Then include in .env file in root directory.") + "where it will be visibile in the url. Then include in .env file in root directory." 
+ ) try: print("Retrieving Label Studio data for deduplication") @@ -119,7 +124,9 @@ def main(): try: # Retrieve the last page from the cache, or 0 if it does not exist last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) - common_crawl_result = process_crawl_and_upload(args, last_page, huggingface_api_manager, label_studio_results) + common_crawl_result = process_crawl_and_upload( + args, last_page, huggingface_api_manager, label_studio_results + ) except ValueError as e: print(f"Error during crawling: {e}") return @@ -129,12 +136,14 @@ def main(): index=args.common_crawl_id, url=args.url, keyword=args.keyword, - last_page=common_crawl_result.last_page_search) + last_page=common_crawl_result.last_page_search, + ) cache_manager.save_cache() except ValueError as e: print(f"Error while saving cache manager: {e}") + def handle_remote_results_error(remote_results): """ Handles errors in the remote results @@ -151,6 +160,7 @@ def handle_remote_results_error(remote_results): else: raise LabelStudioError(f"Unexpected error: {remote_results}") + def validate_remote_results(remote_results): """ Validates the remote results retrieved from the Label Studio project @@ -166,7 +176,9 @@ def validate_remote_results(remote_results): print("No data in Label Studio project.") return [] elif "url" not in remote_results[0]["data"]: - raise LabelStudioError("Column 'url' not present in Label Studio project. Exiting...") + raise LabelStudioError( + "Column 'url' not present in Label Studio project. Exiting..." + ) else: return remote_results elif isinstance(remote_results, dict): @@ -174,6 +186,7 @@ def validate_remote_results(remote_results): else: raise LabelStudioError("Unexpected response type.") + def get_ls_data() -> list[dict] | None: """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. @@ -190,14 +203,14 @@ def get_ls_data() -> list[dict] | None: def strip_url(url: str) -> str: - """Strips http(s)://www. from the beginning of a url if applicable. + """Strips http(s)://www. from the beginning of a url if applicable. Args: url (str): The URL to strip. Returns: str: The stripped URL. - """ + """ result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) return result @@ -210,7 +223,7 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: Returns: list[str]: List of unique URLs. - """ + """ stripped_url_results = [strip_url(url) for url in url_results] unique_urls = collections.deque() adjust = 0 @@ -225,7 +238,9 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: return url_results -def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dict]) -> list[str]: +def remove_remote_duplicates( + url_results: list[str], label_studio_data: list[dict] +) -> list[str]: """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. Args: @@ -238,7 +253,9 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic try: remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] except TypeError: - print("Invalid Label Studio credentials. Database could not be checked for duplicates.") + print( + "Invalid Label Studio credentials. Database could not be checked for duplicates." 
+ ) return url_results remote_urls = set(remote_urls) @@ -254,10 +271,11 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic def handle_csv_and_upload( - common_crawl_result: CommonCrawlResult, - huggingface_api_manager: HuggingFaceAPIManager, - args: argparse.Namespace, - last_page: int): + common_crawl_result: CommonCrawlResult, + huggingface_api_manager: HuggingFaceAPIManager, + args: argparse.Namespace, + last_page: int, +): """ Handles the CSV file and uploads it to Hugging Face repository. Args: @@ -270,29 +288,27 @@ def handle_csv_and_upload( batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) csv_manager = CSVManager( - file_name=batch_info.filename, - headers=['url'], - directory=args.data_dir + file_name=batch_info.filename, headers=["url"], directory=args.data_dir ) csv_manager.add_rows(common_crawl_result.url_results) huggingface_api_manager.upload_file( local_file_path=csv_manager.file_path, - repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}" + repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}", ) print( - f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}") + f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}" + ) csv_manager.delete_file() def process_crawl_and_upload( - args: argparse.Namespace, - last_page: int, - huggingface_api_manager: HuggingFaceAPIManager, - label_studio_data: list[dict]) -> CommonCrawlResult: + args: argparse.Namespace, + last_page: int, + huggingface_api_manager: HuggingFaceAPIManager, + label_studio_data: list[dict], +) -> CommonCrawlResult: # Initialize the CommonCrawlerManager - crawler_manager = CommonCrawlerManager( - args.common_crawl_id - ) + crawler_manager = CommonCrawlerManager(args.common_crawl_id) # Determine the pages to search, based on the last page searched start_page = last_page + 1 # Use the parsed arguments @@ -300,7 +316,7 @@ def process_crawl_and_upload( search_term=args.url, keyword=args.keyword, num_pages=args.pages, - start_page=start_page + start_page=start_page, ) # Logic should conclude here if no results are found if not common_crawl_result.url_results: @@ -309,10 +325,16 @@ def process_crawl_and_upload( return common_crawl_result print("Removing urls already in the database") - common_crawl_result.url_results = remove_local_duplicates(common_crawl_result.url_results) - common_crawl_result.url_results = remove_remote_duplicates(common_crawl_result.url_results, label_studio_data) + common_crawl_result.url_results = remove_local_duplicates( + common_crawl_result.url_results + ) + common_crawl_result.url_results = remove_remote_duplicates( + common_crawl_result.url_results, label_studio_data + ) if not common_crawl_result.url_results: - print("No urls not already present in the database found. Ceasing main execution.") + print( + "No urls not already present in the database found. Ceasing main execution." + ) add_batch_info_to_csv(common_crawl_result, args, last_page) return common_crawl_result diff --git a/common_crawler/utils.py b/common_crawler/utils.py index 0848b02..3cea7af 100644 --- a/common_crawler/utils.py +++ b/common_crawler/utils.py @@ -12,7 +12,7 @@ def __init__(self, url): self.url = url def add_parameter(self, parameter, value): - if '?' in self.url: + if "?" 
in self.url: self.url += f"&{parameter}={value}" else: self.url += f"?{parameter}={value}" diff --git a/source_collectors/ckan/ckan_scraper_toolkit.py b/source_collectors/ckan/ckan_scraper_toolkit.py index 0d9dc44..5898c9f 100644 --- a/source_collectors/ckan/ckan_scraper_toolkit.py +++ b/source_collectors/ckan/ckan_scraper_toolkit.py @@ -1,4 +1,5 @@ """Toolkit of functions that use ckanapi to retrieve packages from CKAN data portals""" + from concurrent.futures import as_completed, ThreadPoolExecutor from dataclasses import dataclass, field from datetime import datetime @@ -150,10 +151,7 @@ def ckan_collection_search(base_url: str, collection_id: str) -> list[Package]: for dataset_content in soup.find_all(class_="dataset-content") ] - [ - packages.append(package.result()) - for package in as_completed(futures) - ] + [packages.append(package.result()) for package in as_completed(futures)] # Take a break to avoid being timed out if len(futures) >= 15: @@ -186,10 +184,12 @@ def _collection_search_get_package_data(dataset_content, base_url: str): record_format.text.strip() for record_format in dataset_content.find_all("li") ] package.record_format = list(set(package.record_format)) - + date = dataset_soup.find(property="dct:modified").text.strip() - package.source_last_updated = datetime.strptime(date, "%B %d, %Y").strftime("%Y-%d-%m") - + package.source_last_updated = datetime.strptime(date, "%B %d, %Y").strftime( + "%Y-%d-%m" + ) + return package diff --git a/source_collectors/ckan/scrape_ckan_data_portals.py b/source_collectors/ckan/scrape_ckan_data_portals.py index ef83b4d..57bd992 100644 --- a/source_collectors/ckan/scrape_ckan_data_portals.py +++ b/source_collectors/ckan/scrape_ckan_data_portals.py @@ -1,4 +1,5 @@ """Retrieves packages from CKAN data portals and parses relevant information then outputs to a CSV file""" + from itertools import chain import json import sys diff --git a/source_collectors/ckan/search_terms.py b/source_collectors/ckan/search_terms.py index 7fdbc34..179e58d 100644 --- a/source_collectors/ckan/search_terms.py +++ b/source_collectors/ckan/search_terms.py @@ -11,7 +11,7 @@ {"url": "https://open.jacksonms.gov/", "terms": ["tags:police"]}, {"url": "https://data.milwaukee.gov/", "terms": ["mpd", "wibr"]}, {"url": "https://data.sanantonio.gov/", "terms": ["sapd"]}, - {"url": "https://data.sanjoseca.gov/", "terms": ["police"]} + {"url": "https://data.sanjoseca.gov/", "terms": ["police"]}, ] group_search = [ diff --git a/source_collectors/common_crawler/argparser.py b/source_collectors/common_crawler/argparser.py index 8cdf5b7..67f4a29 100644 --- a/source_collectors/common_crawler/argparser.py +++ b/source_collectors/common_crawler/argparser.py @@ -7,6 +7,7 @@ for the Common Crawler script. """ + def valid_common_crawl_id(common_crawl_id: str) -> bool: """ Validate the Common Crawl ID format. @@ -16,7 +17,8 @@ def valid_common_crawl_id(common_crawl_id: str) -> bool: Returns: True if the Common Crawl ID is valid, False otherwise """ - return re.match(r'CC-MAIN-\d{4}-\d{2}', common_crawl_id) is not None + return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None + def parse_args() -> argparse.Namespace: """ @@ -33,22 +35,41 @@ def parse_args() -> argparse.Namespace: """ parser = argparse.ArgumentParser( - description='Query the Common Crawl dataset and optionally save the results to a file.') + description="Query the Common Crawl dataset and optionally save the results to a file." 
+ ) # Add the required arguments - parser.add_argument('common_crawl_id', type=str, help='The Common Crawl ID') - parser.add_argument('url', type=str, help='The URL to query') - parser.add_argument('keyword', type=str, help='The keyword to search in the url') + parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID") + parser.add_argument("url", type=str, help="The URL to query") + parser.add_argument("keyword", type=str, help="The keyword to search in the url") # Optional arguments for the number of pages and the output file, and a flag to reset the cache - parser.add_argument('-c', '--config', type=str, default='config.ini', help='The configuration file to use') - parser.add_argument('-p', '--pages', type=int, default=1, help='The number of pages to search (default: 1)') - parser.add_argument('--reset-cache', action='store_true', default=False, - help='Reset the cache before starting the crawl') + parser.add_argument( + "-c", + "--config", + type=str, + default="config.ini", + help="The configuration file to use", + ) + parser.add_argument( + "-p", + "--pages", + type=int, + default=1, + help="The number of pages to search (default: 1)", + ) + parser.add_argument( + "--reset-cache", + action="store_true", + default=False, + help="Reset the cache before starting the crawl", + ) args = parser.parse_args() # Validate the Common Crawl ID format if not valid_common_crawl_id(args.common_crawl_id): - parser.error("Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW.") + parser.error( + "Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW." + ) # Read the configuration file config = configparser.ConfigParser() @@ -56,7 +77,7 @@ def parse_args() -> argparse.Namespace: # Combine parsed arguments with configuration file defaults app_parser = argparse.ArgumentParser(parents=[parser], add_help=False) - app_parser.set_defaults(**config['DEFAULT']) + app_parser.set_defaults(**config["DEFAULT"]) app_args = app_parser.parse_args() diff --git a/source_collectors/common_crawler/cache.py b/source_collectors/common_crawler/cache.py index 2a48c0b..23d5881 100644 --- a/source_collectors/common_crawler/cache.py +++ b/source_collectors/common_crawler/cache.py @@ -8,11 +8,13 @@ - CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results """ + class CommonCrawlerCacheManager: """ A class for managing the cache of Common Crawl search results. This class is responsible for adding, retrieving, and saving cache data. """ + def __init__(self, file_name: str = "cache", directory=None): """ Initializes the CacheStorage object with a file name and directory. @@ -41,7 +43,6 @@ def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None: self.cache[index][url] = {} self.cache[index][url][keyword] = last_page - def get(self, index, url, keyword) -> int: """ Retrieves a page number from the cache. @@ -53,12 +54,15 @@ def get(self, index, url, keyword) -> int: Returns: int - the last page crawled """ - if index in self.cache and url in self.cache[index] and keyword in self.cache[index][url]: + if ( + index in self.cache + and url in self.cache[index] + and keyword in self.cache[index][url] + ): return self.cache[index][url][keyword] # The cache object does not exist. Return 0 as the default value. return 0 - def load_or_create_cache(self) -> dict: """ Loads the cache from the configured file path. 
@@ -66,12 +70,11 @@ def load_or_create_cache(self) -> dict: Returns: dict - the cache data """ try: - with open(self.file_path, 'r') as file: + with open(self.file_path, "r") as file: return json.load(file) except FileNotFoundError: return {} - def save_cache(self) -> None: """ Converts the cache object into a JSON-serializable format and saves it to the configured file path. @@ -79,10 +82,9 @@ def save_cache(self) -> None: persistence of crawl data across sessions. """ # Reformat cache data for JSON serialization - with open(self.file_path, 'w') as file: + with open(self.file_path, "w") as file: json.dump(self.cache, file, indent=4) - def reset_cache(self) -> None: """ Resets the cache to an empty state. diff --git a/source_collectors/common_crawler/crawler.py b/source_collectors/common_crawler/crawler.py index 9afba7d..0982ca5 100644 --- a/source_collectors/common_crawler/crawler.py +++ b/source_collectors/common_crawler/crawler.py @@ -16,7 +16,6 @@ # TODO: What happens when no results are found? How does the CommonCrawlerManager handle this? - @dataclass class CommonCrawlResult: last_page_search: int @@ -31,16 +30,17 @@ class CommonCrawlerManager: It validates crawl ids, manages pagination, and aggregates results. """ - def __init__(self, crawl_id='CC-MAIN-2023-50'): + def __init__(self, crawl_id="CC-MAIN-2023-50"): self.crawl_id = crawl_id - CC_INDEX_SERVER = 'http://index.commoncrawl.org/' - INDEX_NAME = f'{self.crawl_id}-index' - self.root_url = f'{CC_INDEX_SERVER}{INDEX_NAME}' + CC_INDEX_SERVER = "http://index.commoncrawl.org/" + INDEX_NAME = f"{self.crawl_id}-index" + self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}" def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult: print( f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages," - f" starting at page {start_page}") + f" starting at page {start_page}" + ) url_results = [] @@ -64,7 +64,9 @@ def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResul return CommonCrawlResult(last_page, url_results) - def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]: + def search_common_crawl_index( + self, url: str, page: int = 0, max_retries: int = 20 + ) -> list[dict]: """ This method is used to search the Common Crawl index for a given URL and page number Args: @@ -76,9 +78,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = """ encoded_url = quote_plus(url) search_url = URLWithParameters(self.root_url) - search_url.add_parameter('url', encoded_url) - search_url.add_parameter('output', 'json') - search_url.add_parameter('page', page) + search_url.add_parameter("url", encoded_url) + search_url.add_parameter("output", "json") + search_url.add_parameter("page", page) retries = 0 delay = 1 @@ -90,7 +92,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = return self.process_response(response, url, page) retries += 1 - print(f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})") + print( + f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})" + ) time.sleep(delay) print(f"Max retries exceeded. 
Failed to get records for {url} on page {page}.") @@ -106,19 +110,24 @@ def make_request(self, search_url: str) -> requests.Response: response.raise_for_status() return response except requests.exceptions.RequestException as e: - if response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR and 'SlowDown' in response.text: + if ( + response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR + and "SlowDown" in response.text + ): return None else: print(f"Failed to get records: {e}") return None - def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]: + def process_response( + self, response: requests.Response, url: str, page: int + ) -> list[dict]: """Processes the HTTP response and returns the parsed records if successful.""" if response.status_code == HTTPStatus.OK: - records = response.text.strip().split('\n') + records = response.text.strip().split("\n") print(f"Found {len(records)} records for {url} on page {page}") return [json.loads(record) for record in records] - elif 'First Page is 0, Last Page is 0' in response.text: + elif "First Page is 0, Last Page is 0" in response.text: print("No records exist in index matching the url search term") return None else: @@ -127,4 +136,4 @@ def process_response(self, response: requests.Response, url: str, page: int) -> @staticmethod def get_urls_with_keyword(records: list[dict], keyword) -> list[str]: - return [record['url'] for record in records if keyword in record['url']] + return [record["url"] for record in records if keyword in record["url"]] diff --git a/source_collectors/common_crawler/csv_manager.py b/source_collectors/common_crawler/csv_manager.py index 6986862..2b823b4 100644 --- a/source_collectors/common_crawler/csv_manager.py +++ b/source_collectors/common_crawler/csv_manager.py @@ -10,12 +10,7 @@ class CSVManager: Creates the file if it doesn't exist, and provides a method for adding new rows. 
""" - def __init__( - self, - file_name: str, - headers: list[str], - directory=None - ): + def __init__(self, file_name: str, headers: list[str], directory=None): self.file_path = get_file_path(f"{file_name}.csv", directory) self.headers = headers if not os.path.exists(self.file_path): @@ -29,9 +24,9 @@ def add_row(self, row_values: list[str] | tuple[str]): """ if isinstance(row_values, str): # Single values must be converted to a list format - row_values = [row_values] + row_values = [row_values] try: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(row_values) except Exception as e: @@ -45,9 +40,7 @@ def add_rows(self, results: list[list[str]]) -> None: Returns: None """ for result in results: - self.add_row( - result - ) + self.add_row(result) print(f"{len(results)} URLs written to {self.file_path}") def initialize_file(self): @@ -59,15 +52,17 @@ def initialize_file(self): file_exists = os.path.isfile(self.file_path) if not file_exists: - with open(self.file_path, mode='a', newline='', encoding='utf-8') as file: + with open(self.file_path, mode="a", newline="", encoding="utf-8") as file: writer = csv.writer(file) writer.writerow(self.headers) else: # Open and check that headers match - with open(self.file_path, mode='r', encoding='utf-8') as file: + with open(self.file_path, mode="r", encoding="utf-8") as file: header_row = next(csv.reader(file)) if header_row != self.headers: - raise ValueError(f"Header row in {self.file_path} does not match expected headers") + raise ValueError( + f"Header row in {self.file_path} does not match expected headers" + ) print(f"CSV file initialized at {self.file_path}") def delete_file(self): diff --git a/source_collectors/common_crawler/main.py b/source_collectors/common_crawler/main.py index ae27f55..b9dd012 100644 --- a/source_collectors/common_crawler/main.py +++ b/source_collectors/common_crawler/main.py @@ -10,7 +10,7 @@ # The below code sets the working directory to be the root of the entire repository # This is done to solve otherwise quite annoying import issues. 
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from util.huggingface_api_manager import HuggingFaceAPIManager from util.miscellaneous_functions import get_filename_friendly_timestamp @@ -35,30 +35,34 @@ class BatchInfo: notes: str filename: str + class LabelStudioError(Exception): """Custom exception for Label Studio Errors""" + pass -BATCH_HEADERS = ['Datetime', 'Source', 'Count', 'Keywords', 'Notes', 'Filename'] + +BATCH_HEADERS = ["Datetime", "Source", "Count", "Keywords", "Notes", "Filename"] + def get_current_time(): return str(datetime.now()) -def add_batch_info_to_csv(common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int) -> BatchInfo: +def add_batch_info_to_csv( + common_crawl_result: CommonCrawlResult, args: argparse.Namespace, last_page: int +) -> BatchInfo: batch_info = BatchInfo( datetime=get_current_time(), source="Common Crawl", count=str(len(common_crawl_result.url_results)), keywords=f"{args.url} - {args.keyword}", notes=f"{args.common_crawl_id}, {args.pages} pages, starting at {last_page + 1}", - filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}" + filename=f"{args.output_filename}_{get_filename_friendly_timestamp()}", ) batch_info_csv_manager = CSVManager( - file_name='batch_info', - directory=args.data_dir, - headers=BATCH_HEADERS + file_name="batch_info", directory=args.data_dir, headers=BATCH_HEADERS ) batch_info_csv_manager.add_row(dataclasses.astuple(batch_info)) @@ -71,12 +75,11 @@ def main(): # Initialize the Cache cache_manager = CommonCrawlerCacheManager( - file_name=args.cache_filename, - directory=args.data_dir + file_name=args.cache_filename, directory=args.data_dir ) load_dotenv() - + # Initialize the HuggingFace API Manager hf_access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN") if not hf_access_token: @@ -84,10 +87,10 @@ def main(): "HUGGINGFACE_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://huggingface.co/settings/tokens and ensure you have write access to " - "https://huggingface.co/PDAP. Then include in .env file in root directory.") + "https://huggingface.co/PDAP. Then include in .env file in root directory." + ) huggingface_api_manager = HuggingFaceAPIManager( - access_token=hf_access_token, - repo_id=args.huggingface_repo_id + access_token=hf_access_token, repo_id=args.huggingface_repo_id ) ls_access_token = os.getenv("LABEL_STUDIO_ACCESS_TOKEN") if not ls_access_token: @@ -95,13 +98,15 @@ def main(): "LABEL_STUDIO_ACCESS_TOKEN not accessible in .env file in root directory. " "Please obtain access token from your personal account at " "https://app.heartex.com/user/account and ensure you have read access to " - "https://app.heartex.com/projects/61550. Then include in .env file in root directory.") + "https://app.heartex.com/projects/61550. Then include in .env file in root directory." + ) ls_project_id = os.getenv("LABEL_STUDIO_PROJECT_ID") if not ls_project_id: raise ValueError( "LABEL_STUDIO_PROJECT_ID not accessible in .env file in root directory. " "Please obtain a project ID by navigating to the Label Studio project " - "where it will be visibile in the url. Then include in .env file in root directory.") + "where it will be visibile in the url. Then include in .env file in root directory." 
+ ) try: print("Retrieving Label Studio data for deduplication") @@ -119,7 +124,9 @@ def main(): try: # Retrieve the last page from the cache, or 0 if it does not exist last_page = cache_manager.get(args.common_crawl_id, args.url, args.keyword) - common_crawl_result = process_crawl_and_upload(args, last_page, huggingface_api_manager, label_studio_results) + common_crawl_result = process_crawl_and_upload( + args, last_page, huggingface_api_manager, label_studio_results + ) except ValueError as e: print(f"Error during crawling: {e}") return @@ -129,12 +136,14 @@ def main(): index=args.common_crawl_id, url=args.url, keyword=args.keyword, - last_page=common_crawl_result.last_page_search) + last_page=common_crawl_result.last_page_search, + ) cache_manager.save_cache() except ValueError as e: print(f"Error while saving cache manager: {e}") + def handle_remote_results_error(remote_results): """ Handles errors in the remote results @@ -151,6 +160,7 @@ def handle_remote_results_error(remote_results): else: raise LabelStudioError(f"Unexpected error: {remote_results}") + def validate_remote_results(remote_results): """ Validates the remote results retrieved from the Label Studio project @@ -166,7 +176,9 @@ def validate_remote_results(remote_results): print("No data in Label Studio project.") return [] elif "url" not in remote_results[0]["data"]: - raise LabelStudioError("Column 'url' not present in Label Studio project. Exiting...") + raise LabelStudioError( + "Column 'url' not present in Label Studio project. Exiting..." + ) else: return remote_results elif isinstance(remote_results, dict): @@ -174,6 +186,7 @@ def validate_remote_results(remote_results): else: raise LabelStudioError("Unexpected response type.") + def get_ls_data() -> list[dict] | None: """Retrieves data from a Label Studio project to be used in deduplication of common crawl results. @@ -190,14 +203,14 @@ def get_ls_data() -> list[dict] | None: def strip_url(url: str) -> str: - """Strips http(s)://www. from the beginning of a url if applicable. + """Strips http(s)://www. from the beginning of a url if applicable. Args: url (str): The URL to strip. Returns: str: The stripped URL. - """ + """ result = re.search(r"^(?:https?://)?(?:www\.)?(.*)$", url).group(1) return result @@ -210,7 +223,7 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: Returns: list[str]: List of unique URLs. - """ + """ stripped_url_results = [strip_url(url) for url in url_results] unique_urls = collections.deque() adjust = 0 @@ -225,7 +238,9 @@ def remove_local_duplicates(url_results: list[str]) -> list[str]: return url_results -def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dict]) -> list[str]: +def remove_remote_duplicates( + url_results: list[str], label_studio_data: list[dict] +) -> list[str]: """Removes URLs from a list that are already present in the Label Studio project, ignoring http(s)://www. Args: @@ -238,7 +253,9 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic try: remote_urls = [strip_url(task["data"]["url"]) for task in label_studio_data] except TypeError: - print("Invalid Label Studio credentials. Database could not be checked for duplicates.") + print( + "Invalid Label Studio credentials. Database could not be checked for duplicates." 
+ ) return url_results remote_urls = set(remote_urls) @@ -254,10 +271,11 @@ def remove_remote_duplicates(url_results: list[str], label_studio_data: list[dic def handle_csv_and_upload( - common_crawl_result: CommonCrawlResult, - huggingface_api_manager: HuggingFaceAPIManager, - args: argparse.Namespace, - last_page: int): + common_crawl_result: CommonCrawlResult, + huggingface_api_manager: HuggingFaceAPIManager, + args: argparse.Namespace, + last_page: int, +): """ Handles the CSV file and uploads it to Hugging Face repository. Args: @@ -270,29 +288,27 @@ def handle_csv_and_upload( batch_info = add_batch_info_to_csv(common_crawl_result, args, last_page) csv_manager = CSVManager( - file_name=batch_info.filename, - headers=['url'], - directory=args.data_dir + file_name=batch_info.filename, headers=["url"], directory=args.data_dir ) csv_manager.add_rows(common_crawl_result.url_results) huggingface_api_manager.upload_file( local_file_path=csv_manager.file_path, - repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}" + repo_file_path=f"{args.output_filename}/{csv_manager.file_path.name}", ) print( - f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}") + f"Uploaded file to Hugging Face repo {huggingface_api_manager.repo_id} at {args.output_filename}/{csv_manager.file_path.name}" + ) csv_manager.delete_file() def process_crawl_and_upload( - args: argparse.Namespace, - last_page: int, - huggingface_api_manager: HuggingFaceAPIManager, - label_studio_data: list[dict]) -> CommonCrawlResult: + args: argparse.Namespace, + last_page: int, + huggingface_api_manager: HuggingFaceAPIManager, + label_studio_data: list[dict], +) -> CommonCrawlResult: # Initialize the CommonCrawlerManager - crawler_manager = CommonCrawlerManager( - args.common_crawl_id - ) + crawler_manager = CommonCrawlerManager(args.common_crawl_id) # Determine the pages to search, based on the last page searched start_page = last_page + 1 # Use the parsed arguments @@ -300,7 +316,7 @@ def process_crawl_and_upload( search_term=args.url, keyword=args.keyword, num_pages=args.pages, - start_page=start_page + start_page=start_page, ) # Logic should conclude here if no results are found if not common_crawl_result.url_results: @@ -309,10 +325,16 @@ def process_crawl_and_upload( return common_crawl_result print("Removing urls already in the database") - common_crawl_result.url_results = remove_local_duplicates(common_crawl_result.url_results) - common_crawl_result.url_results = remove_remote_duplicates(common_crawl_result.url_results, label_studio_data) + common_crawl_result.url_results = remove_local_duplicates( + common_crawl_result.url_results + ) + common_crawl_result.url_results = remove_remote_duplicates( + common_crawl_result.url_results, label_studio_data + ) if not common_crawl_result.url_results: - print("No urls not already present in the database found. Ceasing main execution.") + print( + "No urls not already present in the database found. Ceasing main execution." + ) add_batch_info_to_csv(common_crawl_result, args, last_page) return common_crawl_result diff --git a/source_collectors/common_crawler/utils.py b/source_collectors/common_crawler/utils.py index 0848b02..3cea7af 100644 --- a/source_collectors/common_crawler/utils.py +++ b/source_collectors/common_crawler/utils.py @@ -12,7 +12,7 @@ def __init__(self, url): self.url = url def add_parameter(self, parameter, value): - if '?' in self.url: + if "?" 
in self.url: self.url += f"&{parameter}={value}" else: self.url += f"?{parameter}={value}" diff --git a/source_collectors/muckrock/convert_all_record_types_to_csv.py b/source_collectors/muckrock/convert_all_record_types_to_csv.py index be6d536..30acdbb 100644 --- a/source_collectors/muckrock/convert_all_record_types_to_csv.py +++ b/source_collectors/muckrock/convert_all_record_types_to_csv.py @@ -1,12 +1,43 @@ -import subprocess -import os +# import subprocess +# import os -record_types = ['accident reports', 'arrest records', 'calls for service', 'car gps', 'citations', 'dispatch logs', 'dispatch recordings', - 'field contacts', 'incident reports', 'misc police activity', 'officer involved shootings', 'stops', 'surveys', 'use of force reports', - 'vehicle pursuits', 'complaints and misconduct', 'daily activity logs', 'training and hiring info', 'personnel records', 'annual and monthly reports', - 'budgets and finances', 'contact info and agency meta', 'geographic', 'list of data sources', 'policies and contracts', 'crime maps and reports', - 'crime statistics', 'media bulletins', 'records request info', 'resources', 'sex offender registry', 'wanted persons', 'booking reports', - 'court cases', 'incarceration records'] +record_types = [ + "accident reports", + "arrest records", + "calls for service", + "car gps", + "citations", + "dispatch logs", + "dispatch recordings", + "field contacts", + "incident reports", + "misc police activity", + "officer involved shootings", + "stops", + "surveys", + "use of force reports", + "vehicle pursuits", + "complaints and misconduct", + "daily activity logs", + "training and hiring info", + "personnel records", + "annual and monthly reports", + "budgets and finances", + "contact info and agency meta", + "geographic", + "list of data sources", + "policies and contracts", + "crime maps and reports", + "crime statistics", + "media bulletins", + "records request info", + "resources", + "sex offender registry", + "wanted persons", + "booking reports", + "court cases", + "incarceration records", +] print(len(record_types)) # json_files = [] diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py index 4480105..4adc555 100644 --- a/source_collectors/muckrock/create_foia_data_db.py +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -1,4 +1,4 @@ -''' +""" create_foia_data_db.py This script fetches data from the MuckRock FOIA API and stores it in a SQLite database. @@ -17,8 +17,7 @@ Error Handling: Errors encountered during API requests or database operations are logged to an `errors.log` file and/or printed to the console. 
-''' - +""" import requests import sqlite3 @@ -28,18 +27,19 @@ import time from typing import List, Tuple, Dict, Any, Union, Literal -logging.basicConfig(filename='errors.log', level=logging.ERROR, - format='%(levelname)s: %(message)s') +logging.basicConfig( + filename="errors.log", level=logging.ERROR, format="%(levelname)s: %(message)s" +) -base_url = 'https://www.muckrock.com/api_v1/foia/' -last_page_fetched = 'last_page_fetched.txt' +base_url = "https://www.muckrock.com/api_v1/foia/" +last_page_fetched = "last_page_fetched.txt" NO_MORE_DATA = -1 # flag for program exit JSON = Dict[str, Any] # type alias -create_table_query = ''' +create_table_query = """ CREATE TABLE IF NOT EXISTS results ( id INTEGER PRIMARY KEY, title TEXT, @@ -63,20 +63,20 @@ communications TEXT, absolute_url TEXT ) - ''' + """ -foia_insert_query = ''' +foia_insert_query = """ INSERT INTO results (id, title, slug, status, embargo_status, user, username, agency, datetime_submitted, date_due, days_until_due, date_followup, datetime_done, datetime_updated, date_embargo, tracking_id, price, disable_autofollowups, tags, communications, absolute_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - ''' + """ def create_db() -> bool: - ''' + """ Creates foia_data.db SQLite database with one table named `results`. Returns: @@ -84,23 +84,22 @@ def create_db() -> bool: Raises: sqlite3.Error: If the table creation operation fails, prints error and returns False. - ''' + """ try: - with sqlite3.connect('foia_data.db') as conn: + with sqlite3.connect("foia_data.db") as conn: conn.execute(create_table_query) conn.commit() - print('Successfully created foia_data.db!') + print("Successfully created foia_data.db!") return True except sqlite3.Error as e: - print(f'SQLite error: {e}.') - logging.error( - f'Failed to create foia_data.db due to SQLite error: {e}') + print(f"SQLite error: {e}.") + logging.error(f"Failed to create foia_data.db due to SQLite error: {e}") return False def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: - ''' + """ Fetches a page of 100 results from the MuckRock FOIA API. Args: @@ -111,30 +110,33 @@ def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: - JSON Dict[str, Any]: The response's JSON data, if the request is successful. - NO_MORE_DATA (int = -1): A constant, if there are no more pages to fetch (indicated by a 404 response). - None: If there is an error other than 404. 
- ''' + """ per_page = 100 response = requests.get( - base_url, params={'page': page, 'page_size': per_page, 'format': 'json'}) + base_url, params={"page": page, "page_size": per_page, "format": "json"} + ) if response.status_code == 200: return response.json() elif response.status_code == 404: - print('No more pages to fetch') + print("No more pages to fetch") return NO_MORE_DATA # Typically 404 response will mean there are no more pages to fetch elif 500 <= response.status_code < 600: - logging.error(f'Server error {response.status_code} on page {page}') + logging.error(f"Server error {response.status_code} on page {page}") page = page + 1 return fetch_page(page) else: - print(f'Error fetching page {page}: {response.status_code}') - logging.error(f'Fetching page {page} failed with response code: { - response.status_code}') + print(f"Error fetching page {page}: {response.status_code}") + logging.error( + f"Fetching page {page} failed with response code: { + response.status_code}" + ) return None def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: - ''' + """ Transforms the data recieved from the MuckRock FOIA API into a structured format for insertion into a database with `populate_db()`. Transforms JSON input into a list of tuples, as well as serializes the nested `tags` and `communications` fields into JSON strings. @@ -144,43 +146,44 @@ def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: Returns: transformed_data (List[Tuple[Any, ...]]: A list of tuples, where each tuple contains the fields of a single FOIA request. - ''' + """ transformed_data = [] - for result in data_to_transform.get('results', []): - result['tags'] = json.dumps(result.get('tags', [])) - result['communications'] = json.dumps( - result.get('communications', [])) - - transformed_data.append(( - result['id'], - result['title'], - result['slug'], - result['status'], - result['embargo_status'], - result['user'], - result['username'], - result['agency'], - result['datetime_submitted'], - result['date_due'], - result['days_until_due'], - result['date_followup'], - result['datetime_done'], - result['datetime_updated'], - result['date_embargo'], - result['tracking_id'], - result['price'], - result['disable_autofollowups'], - result['tags'], - result['communications'], - result['absolute_url'] - )) + for result in data_to_transform.get("results", []): + result["tags"] = json.dumps(result.get("tags", [])) + result["communications"] = json.dumps(result.get("communications", [])) + + transformed_data.append( + ( + result["id"], + result["title"], + result["slug"], + result["status"], + result["embargo_status"], + result["user"], + result["username"], + result["agency"], + result["datetime_submitted"], + result["date_due"], + result["days_until_due"], + result["date_followup"], + result["datetime_done"], + result["datetime_updated"], + result["date_embargo"], + result["tracking_id"], + result["price"], + result["disable_autofollowups"], + result["tags"], + result["communications"], + result["absolute_url"], + ) + ) return transformed_data def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: - ''' + """ Populates foia_data.db SQLite database with the transfomed FOIA request data. Args: @@ -193,9 +196,9 @@ def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: Raises: sqlite3.Error: If the insertion operation fails, attempts to retry operation (max_retries = 2). If retries are exhausted, logs error and exits. 
- ''' + """ - with sqlite3.connect('foia_data.db') as conn: + with sqlite3.connect("foia_data.db") as conn: retries = 0 max_retries = 2 @@ -203,51 +206,55 @@ def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: try: conn.executemany(foia_insert_query, transformed_data) conn.commit() - print('Successfully inserted data!') + print("Successfully inserted data!") return except sqlite3.Error as e: - print(f'SQLite error: {e}. Retrying...') + print(f"SQLite error: {e}. Retrying...") conn.rollback() retries += 1 time.sleep(1) if retries == max_retries: - print(f'Failed to insert data from page {page} after { - max_retries} attempts. Skipping to next page.') - logging.error(f'Failed to insert data from page {page} after { - max_retries} attempts.') + print( + f"Failed to insert data from page {page} after { + max_retries} attempts. Skipping to next page." + ) + logging.error( + f"Failed to insert data from page {page} after { + max_retries} attempts." + ) def main() -> None: - ''' + """ Main entry point for create_foia_data_db.py. This function orchestrates the process of fetching FOIA requests data from the MuckRock FOIA API, transforming it, and storing it in a SQLite database. - ''' + """ - if not os.path.exists('foia_data.db'): - print('Creating foia_data.db...') + if not os.path.exists("foia_data.db"): + print("Creating foia_data.db...") success = create_db() if success == False: - print('Failed to create foia_data.db') + print("Failed to create foia_data.db") return if os.path.exists(last_page_fetched): - with open(last_page_fetched, mode='r') as file: + with open(last_page_fetched, mode="r") as file: page = int(file.read()) + 1 else: page = 1 while True: - print(f'Fetching page {page}...') + print(f"Fetching page {page}...") page_data = fetch_page(page) if page_data == NO_MORE_DATA: break # Exit program because no more data exixts if page_data is None: - print(f'Skipping page {page}...') + print(f"Skipping page {page}...") page += 1 continue @@ -255,16 +262,18 @@ def main() -> None: populate_db(transformed_data, page) - with open(last_page_fetched, mode='w') as file: + with open(last_page_fetched, mode="w") as file: file.write(str(page)) page += 1 - print('create_foia_data_db.py run finished') + print("create_foia_data_db.py run finished") -if __name__ == '__main__': +if __name__ == "__main__": try: main() except Exception as e: - logging.error(f'An unexpected error occurred: {e}') - print('Check errors.log to review errors. Run create_foia_data_db.py again to continue') + logging.error(f"An unexpected error occurred: {e}") + print( + "Check errors.log to review errors. 
+        )
diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py
index c1a0380..86ede5d 100644
--- a/source_collectors/muckrock/download_muckrock_foia.py
+++ b/source_collectors/muckrock/download_muckrock_foia.py
@@ -12,15 +12,19 @@
 all_data = []
 output_file = "foia_data.json"
 
+
 # Function to fetch data from a specific page
 def fetch_page(page):
-    response = requests.get(base_url, params={"page": page, "page_size": per_page, "format": "json"})
+    response = requests.get(
+        base_url, params={"page": page, "page_size": per_page, "format": "json"}
+    )
     if response.status_code == 200:
         return response.json()
     else:
         print(f"Error fetching page {page}: {response.status_code}")
         return None
 
+
 # Fetch and store data from all pages
 while True:
     print(f"Fetching page {page}...")
@@ -30,14 +34,14 @@
         page += 1
         continue
 
-    all_data.extend(data['results'])
-    if not data['next']:
+    all_data.extend(data["results"])
+    if not data["next"]:
         break
     page += 1
 
 # Write data to JSON
-with open(output_file, mode='w', encoding='utf-8') as json_file:
+with open(output_file, mode="w", encoding="utf-8") as json_file:
     json.dump(all_data, json_file, indent=4)
 
 print(f"Data written to {output_file}")
diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py
index 4d57737..455084a 100644
--- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py
+++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py
@@ -7,23 +7,48 @@
 # Load the JSON data
 parser = argparse.ArgumentParser(description="Parse JSON from a file.")
-parser.add_argument('--json_file', type=str, required=True,
-                    help="Path to the JSON file")
+parser.add_argument(
+    "--json_file", type=str, required=True, help="Path to the JSON file"
+)
 args = parser.parse_args()
 
-with open(args.json_file, 'r') as f:
+with open(args.json_file, "r") as f:
     json_data = json.load(f)
 
 # Define the CSV headers
 headers = [
-    "name", "agency_described", "record_type", "description", "source_url",
-    "readme_url", "scraper_url", "state", "county", "municipality",
-    "agency_type", "jurisdiction_type", "View Archive", "agency_aggregation",
-    "agency_supplied", "supplying_entity", "agency_originated", "originating_agency",
-    "coverage_start", "source_last_updated", "coverage_end", "number_of_records_available",
-    "size", "access_type", "data_portal_type", "access_notes", "record_format", "update_frequency",
-    "update_method", "retention_schedule", "detail_level"
+    "name",
+    "agency_described",
+    "record_type",
+    "description",
+    "source_url",
+    "readme_url",
+    "scraper_url",
+    "state",
+    "county",
+    "municipality",
+    "agency_type",
+    "jurisdiction_type",
+    "View Archive",
+    "agency_aggregation",
+    "agency_supplied",
+    "supplying_entity",
+    "agency_originated",
+    "originating_agency",
+    "coverage_start",
+    "source_last_updated",
+    "coverage_end",
+    "number_of_records_available",
+    "size",
+    "access_type",
+    "data_portal_type",
+    "access_notes",
+    "record_format",
+    "update_frequency",
+    "update_method",
+    "retention_schedule",
+    "detail_level",
 ]
 
 
@@ -59,7 +84,7 @@ def get_jurisdiction(jurisdiction_id):
 output_csv = format_filename_json_to_csv(args.json_file)
 
 # Open a CSV file for writing
-with open(output_csv, 'w', newline='') as csvfile:
+with open(output_csv, "w", newline="") as csvfile:
     writer = csv.DictWriter(csvfile, fieldnames=headers)
 
     # Write the header row
@@ -87,8 +112,7 @@ def get_jurisdiction(jurisdiction_id):
             juris_type = "state"
         # local jurisdiction level
         if jurisdiction_level == "l":
-            parent_juris_data = get_jurisdiction(
-                jurisdiction_data.get("parent"))
+            parent_juris_data = get_jurisdiction(jurisdiction_data.get("parent"))
             state = parent_juris_data.get("abbrev")
             if "County" in jurisdiction_data.get("name"):
                 county = jurisdiction_data.get("name")
@@ -99,24 +123,24 @@ def get_jurisdiction(jurisdiction_id):
                 municipality = jurisdiction_data.get("name")
             juris_type = "local"
 
-        if 'Police' in agency_data.get("types"):
-            agency_type = 'law enforcement/police'
+        if "Police" in agency_data.get("types"):
+            agency_type = "law enforcement/police"
         else:
-            agency_type = ''
+            agency_type = ""
 
-        source_url = ''
+        source_url = ""
         absolute_url = item.get("absolute_url")
-        access_type = ''
+        access_type = ""
         for comm in item["communications"]:
             if comm["files"]:
-                source_url = absolute_url + '#files'
-                access_type = 'Web page,Download,API'
+                source_url = absolute_url + "#files"
+                access_type = "Web page,Download,API"
                 break
 
         # Extract the relevant fields from the JSON object
        csv_row = {
             "name": item.get("title", ""),
-            "agency_described": agency_data.get("name", "") + ' - ' + state,
+            "agency_described": agency_data.get("name", "") + " - " + state,
             "record_type": "",
             "description": "",
             "source_url": source_url,
@@ -145,7 +169,7 @@ def get_jurisdiction(jurisdiction_id):
             "update_frequency": "",
             "update_method": "",
             "retention_schedule": "",
-            "detail_level": ""
+            "detail_level": "",
         }
 
         # Write the extracted row to the CSV file
diff --git a/source_collectors/muckrock/get_all_record_types.py b/source_collectors/muckrock/get_all_record_types.py
index bcc8c0b..6fa955d 100644
--- a/source_collectors/muckrock/get_all_record_types.py
+++ b/source_collectors/muckrock/get_all_record_types.py
@@ -1,17 +1,50 @@
 import subprocess
 
-record_types = ['accident reports', 'arrest records', 'calls for service', 'car gps', 'citations', 'dispatch logs', 'dispatch recordings',
-                'field contacts', 'incident reports', 'misc police activity', 'officer involved shootings', 'stops', 'surveys', 'use of force reports',
-                'vehicle pursuits', 'complaints and misconduct', 'daily activity logs', 'training and hiring info', 'personnel records', 'annual and monthly reports',
-                'budgets and finances', 'contact info and agency meta', 'geographic', 'list of data sources', 'policies and contracts', 'crime maps and reports',
-                'crime statistics', 'media bulletins', 'records request info', 'resources', 'sex offender registry', 'wanted persons', 'booking reports',
-                'court cases', 'incarceration records']
+record_types = [
+    "accident reports",
+    "arrest records",
+    "calls for service",
+    "car gps",
+    "citations",
+    "dispatch logs",
+    "dispatch recordings",
+    "field contacts",
+    "incident reports",
+    "misc police activity",
+    "officer involved shootings",
+    "stops",
+    "surveys",
+    "use of force reports",
+    "vehicle pursuits",
+    "complaints and misconduct",
+    "daily activity logs",
+    "training and hiring info",
+    "personnel records",
+    "annual and monthly reports",
+    "budgets and finances",
+    "contact info and agency meta",
+    "geographic",
+    "list of data sources",
+    "policies and contracts",
+    "crime maps and reports",
+    "crime statistics",
+    "media bulletins",
+    "records request info",
+    "resources",
+    "sex offender registry",
+    "wanted persons",
+    "booking reports",
+    "court cases",
+    "incarceration records",
+]
 
 for record_type in record_types:
-    command = ['python', 'search_foia_data_db.py', '--search_for',
-               record_type]
+    command = ["python", "search_foia_data_db.py", "--search_for", record_type]
     try:
         subprocess.run(command, check=True)
     except subprocess.CalledProcessError as e:
-        print(f'An error occurred while executing the command for "{
-            record_type}": {e}')
+        print(
+            f'An error occurred while executing the command for "{
+            record_type}": {e}'
+        )
diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py
index 96cde83..02f7a4e 100644
--- a/source_collectors/muckrock/get_allegheny_foias.py
+++ b/source_collectors/muckrock/get_allegheny_foias.py
@@ -2,6 +2,7 @@
 import json
 import time
 
+
 # Function to fetch jurisdiction IDs based on town names from a text file
 def fetch_jurisdiction_ids(town_file, base_url):
     with open(town_file, "r") as file:
@@ -14,12 +15,14 @@ def fetch_jurisdiction_ids(town_file, base_url):
         response = requests.get(url)
         if response.status_code == 200:
             data = response.json()
-            for item in data.get('results', []):
-                if item['name'] in town_names:
-                    jurisdiction_ids[item['name']] = item['id']
+            for item in data.get("results", []):
+                if item["name"] in town_names:
+                    jurisdiction_ids[item["name"]] = item["id"]
             url = data.get("next")
 
-            print(f"Processed page, found {len(jurisdiction_ids)}/{len(town_names)} jurisdictions so far...")
+            print(
+                f"Processed page, found {len(jurisdiction_ids)}/{len(town_names)} jurisdictions so far..."
+            )
             time.sleep(1)  # To respect the rate limit
 
         elif response.status_code == 503:
@@ -31,6 +34,7 @@ def fetch_jurisdiction_ids(town_file, base_url):
 
     return jurisdiction_ids
 
+
 # Function to fetch FOIA data for each jurisdiction ID and save it to a JSON file
 def fetch_foia_data(jurisdiction_ids):
     all_data = []
@@ -42,7 +46,9 @@ def fetch_foia_data(jurisdiction_ids):
                 data = response.json()
                 all_data.extend(data.get("results", []))
                 url = data.get("next")
-                print(f"Fetching records for {name}, {len(all_data)} total records so far...")
+                print(
+                    f"Fetching records for {name}, {len(all_data)} total records so far..."
+                )
                 time.sleep(1)  # To respect the rate limit
             elif response.status_code == 503:
                 print(f"Error 503: Skipping page for {name}")
@@ -57,10 +63,13 @@ def fetch_foia_data(jurisdiction_ids):
 
     print(f"Saved {len(all_data)} records to foia_data_combined.json")
 
+
 # Main function to execute the script
 def main():
     town_file = "allegheny-county-towns.txt"
-    jurisdiction_url = "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126"
+    jurisdiction_url = (
+        "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126"
+    )
 
     # Fetch jurisdiction IDs based on town names
     jurisdiction_ids = fetch_jurisdiction_ids(town_file, jurisdiction_url)
@@ -69,6 +78,7 @@ def main():
     # Fetch FOIA data for each jurisdiction ID
     fetch_foia_data(jurisdiction_ids)
 
+
 # Run the main function
 if __name__ == "__main__":
     main()
diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py
index ed1db45..a0160a8 100644
--- a/source_collectors/muckrock/muck_get.py
+++ b/source_collectors/muckrock/muck_get.py
@@ -14,17 +14,23 @@
 while True:
     # Make the GET request with the search string as a query parameter
-    response = requests.get(base_url, params={"page" : page, "page_size" : per_page, "format": "json"})
+    response = requests.get(
+        base_url, params={"page": page, "page_size": per_page, "format": "json"}
+    )
 
     # Check if the request was successful
     if response.status_code == 200:
         # Parse the JSON response
         data = response.json()
 
-        if not data['results']:
+        if not data["results"]:
             break
 
-        filtered_results = [item for item in data['results'] if search_string.lower() in item['title'].lower()]
+        filtered_results = [
+            item
+            for item in data["results"]
+            if search_string.lower() in item["title"].lower()
+        ]
 
         all_results.extend(filtered_results)
 
@@ -44,7 +50,7 @@
 # Dump list into a JSON file
 json_out_file = search_string.replace(" ", "_") + ".json"
 
-with open(json_out_file, 'w') as json_file:
+with open(json_out_file, "w") as json_file:
     json.dump(all_results, json_file)
 
 print(f"List dumped into {json_out_file}")
diff --git a/source_collectors/muckrock/muckrock_ml_labeler.py b/source_collectors/muckrock/muckrock_ml_labeler.py
index dafd6de..46b6580 100644
--- a/source_collectors/muckrock/muckrock_ml_labeler.py
+++ b/source_collectors/muckrock/muckrock_ml_labeler.py
@@ -11,16 +11,22 @@
 # Load the dataset from command line argument
 parser = argparse.ArgumentParser(description="Load CSV file into a pandas DataFrame.")
-parser.add_argument('--csv_file', type=str, required=True, help="Path to the CSV file")
+parser.add_argument("--csv_file", type=str, required=True, help="Path to the CSV file")
 args = parser.parse_args()
 
 df = pd.read_csv(args.csv_file)
 
 # Combine multiple columns (e.g., 'url', 'html_title', 'h1') into a single text field for each row
-columns_to_combine = ['url_path', 'html_title', 'h1']  # Add other columns here as needed
-df['combined_text'] = df[columns_to_combine].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
+columns_to_combine = [
+    "url_path",
+    "html_title",
+    "h1",
+]  # Add other columns here as needed
+df["combined_text"] = df[columns_to_combine].apply(
+    lambda row: " ".join(row.values.astype(str)), axis=1
+)
 
 # Convert the combined text into a list
-texts = df['combined_text'].tolist()
+texts = df["combined_text"].tolist()
 
 # Tokenize the inputs
 inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
@@ -37,5 +43,5 @@
 predicted_labels = [labels[int(pred)] for pred in predictions]
 
 # Add the predicted labels to the dataframe and save
-df['predicted_label'] = predicted_labels
+df["predicted_label"] = predicted_labels
 df.to_csv("labeled_muckrock_dataset.csv", index=False)
diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py
index ff9aac6..1229059 100644
--- a/source_collectors/muckrock/search_foia_data_db.py
+++ b/source_collectors/muckrock/search_foia_data_db.py
@@ -1,4 +1,4 @@
-'''
+"""
 search_foia_data_db.py
 
 This script provides search functionality for the `foia_data.db` SQLite database. The search looks in `title`s and
@@ -16,8 +16,7 @@
 Error Handling:
     Errors encountered during database operations, JSON parsing, or file writing are printed to the console.
 
-'''
-
+"""
 
 import sqlite3
 import pandas as pd
@@ -26,37 +25,43 @@
 import os
 from typing import Union, List, Dict
 
-check_results_table_query = '''
+check_results_table_query = """
     SELECT name
     FROM sqlite_master
     WHERE (type = 'table') AND (name = 'results')
-    '''
+    """
 
-search_foia_query = '''
+search_foia_query = """
     SELECT *
     FROM results
    WHERE (title LIKE ? OR tags LIKE ?) AND (status = 'done')
-    '''
+    """
 
 
 def parser_init() -> argparse.ArgumentParser:
-    '''
+    """
     Initializes the argument parser for search_foia_data_db.py.
 
     Returns:
         argparse.ArgumentParser: The configured argument parser.
-    '''
+    """
 
     parser = argparse.ArgumentParser(
-        description='Search foia_data.db and generate a JSON file of resulting matches')
+        description="Search foia_data.db and generate a JSON file of resulting matches"
+    )
 
-    parser.add_argument('--search_for', type=str, required=True, metavar='',
-                        help='Provide a string to search foia_data.db')
+    parser.add_argument(
+        "--search_for",
+        type=str,
+        required=True,
+        metavar="",
+        help="Provide a string to search foia_data.db",
+    )
 
     return parser
 
 
 def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]:
-    '''
+    """
     Searches the foia_data.db database for FOIA request entries matching the provided search string.
 
     Args:
@@ -70,35 +75,35 @@ def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]:
     Raises:
         sqlite3.Error: If any database operation fails, prints error and returns None.
         Exception: If any unexpected error occurs, prints error and returns None.
-    '''
+    """
 
     print(f'Searching foia_data.db for "{search_string}"...')
 
     try:
-        with sqlite3.connect('foia_data.db') as conn:
+        with sqlite3.connect("foia_data.db") as conn:
 
             results_table = pd.read_sql_query(check_results_table_query, conn)
 
             if results_table.empty:
-                print('The `results` table does not exist in the database.')
+                print("The `results` table does not exist in the database.")
                 return None
 
-            params = [f'%{search_string}%', f'%{search_string}%']
+            params = [f"%{search_string}%", f"%{search_string}%"]
 
             df = pd.read_sql_query(search_foia_query, conn, params=params)
 
     except sqlite3.Error as e:
-        print(f'Sqlite error: {e}')
+        print(f"Sqlite error: {e}")
         return None
 
     except Exception as e:
-        print(f'An unexpected error occurred: {e}')
+        print(f"An unexpected error occurred: {e}")
         return None
 
     return df
 
 
 def parse_communications_column(communications) -> List[Dict]:
-    '''
+    """
     Parses a communications column value, decoding it from JSON format.
 
     Args:
@@ -110,19 +115,19 @@ def parse_communications_column(communications) -> List[Dict]:
 
     Raises:
         json.JSONDecodeError: If deserialization fails, prints error and returns empty list.
-    '''
+    """
 
     if pd.isna(communications):
         return []
 
     try:
         return json.loads(communications)
     except json.JSONDecodeError as e:
-        print(f'Error decoding JSON: {e}')
+        print(f"Error decoding JSON: {e}")
         return []
 
 
 def generate_json(df: pd.DataFrame, search_string: str) -> None:
-    '''
+    """
     Generates a JSON file from a pandas DataFrame.
 
     Args:
@@ -136,46 +141,49 @@ def generate_json(df: pd.DataFrame, search_string: str) -> None:
 
     Raises:
         Exception: If writing to JSON file operation fails, prints error and returns.
-    '''
+    """
 
-    output_json = f'{search_string.replace(' ', '_')}.json'
+    output_json = f"{search_string.replace(' ', '_')}.json"
 
     try:
-        df.to_json(output_json, orient='records', indent=4)
+        df.to_json(output_json, orient="records", indent=4)
         print(f'Matching entries written to "{output_json}"')
     except Exception as e:
-        print(f'An error occurred while writing JSON: {e}')
+        print(f"An error occurred while writing JSON: {e}")
 
 
 def main() -> None:
-    '''
+    """
     Function to search the foia_data.db database for entries matching a specified search string.
 
     Command Line Args:
         --search_for (str): A string to search for in the `title` and `tags` fields of FOIA requests.
-    '''
+    """
 
     parser = parser_init()
     args = parser.parse_args()
     search_string = args.search_for
 
-    if not os.path.exists('foia_data.db'):
-        print('foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it.')
+    if not os.path.exists("foia_data.db"):
+        print(
+            "foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it."
+        )
         return
 
     df = search_foia_db(search_string)
     if df is None:
         return
 
-    if not df['communications'].empty:
-        df['communications'] = df['communications'].apply(
-            parse_communications_column)
+    if not df["communications"].empty:
+        df["communications"] = df["communications"].apply(parse_communications_column)
 
-    print(f'Found {df.shape[0]} matching entries containing "{
-        search_string}" in the title or tags')
+    print(
+        f'Found {df.shape[0]} matching entries containing "{
+        search_string}" in the title or tags'
+    )
 
     generate_json(df, search_string)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py
index 9e61d49..66e6aca 100644
--- a/source_collectors/muckrock/search_local_foia_json.py
+++ b/source_collectors/muckrock/search_local_foia_json.py
@@ -1,38 +1,44 @@
 import json
 
 # Specify the JSON file path
-json_file = 'foia_data.json'
-search_string = 'use of force'
+json_file = "foia_data.json"
+search_string = "use of force"
 
 # Load the JSON data
-with open(json_file, 'r', encoding='utf-8') as file:
+with open(json_file, "r", encoding="utf-8") as file:
     data = json.load(file)
 
 # List to store matching entries
 matching_entries = []
 
+
 # Function to search within an entry
 def search_entry(entry):
     # Check if 'status' is 'done'
-    if entry.get('status') != 'done':
+    if entry.get("status") != "done":
         return False
-
+
     # Check if 'title' or 'tags' field contains the search string
-    title_match = 'title' in entry and search_string.lower() in entry['title'].lower()
-    tags_match = 'tags' in entry and any(search_string.lower() in tag.lower() for tag in entry['tags'])
-
+    title_match = "title" in entry and search_string.lower() in entry["title"].lower()
+    tags_match = "tags" in entry and any(
+        search_string.lower() in tag.lower() for tag in entry["tags"]
+    )
+
     return title_match or tags_match
 
+
 # Iterate through the data and collect matching entries
 for entry in data:
     if search_entry(entry):
         matching_entries.append(entry)
 
 # Output the results
-print(f"Found {len(matching_entries)} entries containing '{search_string}' in the title or tags.")
+print(
+    f"Found {len(matching_entries)} entries containing '{search_string}' in the title or tags."
+)
 
 # Optionally, write matching entries to a new JSON file
-with open('matching_entries.json', 'w', encoding='utf-8') as file:
+with open("matching_entries.json", "w", encoding="utf-8") as file:
     json.dump(matching_entries, file, indent=4)
 
 print(f"Matching entries written to 'matching_entries.json'")
diff --git a/source_collectors/muckrock/utils.py b/source_collectors/muckrock/utils.py
index ca66dc8..3d8b63d 100644
--- a/source_collectors/muckrock/utils.py
+++ b/source_collectors/muckrock/utils.py
@@ -1,18 +1,17 @@
-'''
+"""
 utils.py
 
 Provides useful functions for muckrock_tools.
 
 Functions:
 - format_filename_json_to_csv()
-'''
-
+"""
 
 import re
 
 
 def format_filename_json_to_csv(json_filename: str) -> str:
-    '''
+    """
     Converts JSON filename format to CSV filename format.
 
     Args:
@@ -21,7 +20,7 @@ def format_filename_json_to_csv(json_filename: str) -> str:
 
     Returns:
         csv_filename (str): A CSV filename string.
-    '''
-    csv_filename = re.sub(r'_(?=[^.]*$)', '-', json_filename[:-5]) + '.csv'
+    """
+    csv_filename = re.sub(r"_(?=[^.]*$)", "-", json_filename[:-5]) + ".csv"
 
     return csv_filename
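
Reviewer note (a hedged sketch, not part of the patch itself): several of the relinted call sites above, such as the logging.error(...) in create_foia_data_db.py and the print(...) calls in get_all_record_types.py and search_foia_data_db.py, keep a line break inside an f-string replacement field (the "{" ends one line and the expression continues on the next). That layout is only accepted on Python 3.12+ (PEP 701); earlier interpreters reject it with a SyntaxError. If the project needs to run on older Pythons, implicit concatenation of adjacent f-string literals produces the same message on any Python 3 release. The values below are illustrative only, not taken from the scripts:

    import logging

    page = 42          # illustrative value, not from the patch
    status_code = 503  # illustrative value, not from the patch

    # Pre-3.12-compatible equivalent of the split-brace form used in the
    # relinted files: adjacent f-string literals are concatenated at compile
    # time, so the logged message is identical.
    logging.error(
        f"Fetching page {page} failed with response code: "
        f"{status_code}"
    )

Whether to fold these onto one line, concatenate as above, or keep the 3.12-only form depends on the interpreter versions the collectors are expected to support.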