Lint added files
eddie-m-m committed Nov 16, 2024
1 parent cdbae20 commit 9d9618a
Showing 26 changed files with 616 additions and 381 deletions.
43 changes: 32 additions & 11 deletions common_crawler/argparser.py
@@ -7,6 +7,7 @@
for the Common Crawler script.
"""


def valid_common_crawl_id(common_crawl_id: str) -> bool:
"""
Validate the Common Crawl ID format.
@@ -16,7 +17,8 @@ def valid_common_crawl_id(common_crawl_id: str) -> bool:
Returns:
True if the Common Crawl ID is valid, False otherwise
"""
return re.match(r'CC-MAIN-\d{4}-\d{2}', common_crawl_id) is not None
return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None


def parse_args() -> argparse.Namespace:
"""
@@ -33,30 +35,49 @@
"""

parser = argparse.ArgumentParser(
description='Query the Common Crawl dataset and optionally save the results to a file.')
description="Query the Common Crawl dataset and optionally save the results to a file."
)
# Add the required arguments
parser.add_argument('common_crawl_id', type=str, help='The Common Crawl ID')
parser.add_argument('url', type=str, help='The URL to query')
parser.add_argument('keyword', type=str, help='The keyword to search in the url')
parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID")
parser.add_argument("url", type=str, help="The URL to query")
parser.add_argument("keyword", type=str, help="The keyword to search in the url")
# Optional arguments for the number of pages and the output file, and a flag to reset the cache
parser.add_argument('-c', '--config', type=str, default='config.ini', help='The configuration file to use')
parser.add_argument('-p', '--pages', type=int, default=1, help='The number of pages to search (default: 1)')
parser.add_argument('--reset-cache', action='store_true', default=False,
help='Reset the cache before starting the crawl')
parser.add_argument(
"-c",
"--config",
type=str,
default="config.ini",
help="The configuration file to use",
)
parser.add_argument(
"-p",
"--pages",
type=int,
default=1,
help="The number of pages to search (default: 1)",
)
parser.add_argument(
"--reset-cache",
action="store_true",
default=False,
help="Reset the cache before starting the crawl",
)

args = parser.parse_args()

# Validate the Common Crawl ID format
if not valid_common_crawl_id(args.common_crawl_id):
parser.error("Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW.")
parser.error(
"Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW."
)

# Read the configuration file
config = configparser.ConfigParser()
config.read(args.config)

# Combine parsed arguments with configuration file defaults
app_parser = argparse.ArgumentParser(parents=[parser], add_help=False)
app_parser.set_defaults(**config['DEFAULT'])
app_parser.set_defaults(**config["DEFAULT"])

app_args = app_parser.parse_args()

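The two-pass parse above (a base parser plus an app_parser that layers config-file values via set_defaults) yields the precedence: explicit command-line value over config.ini [DEFAULT] value over the argparse default. A minimal standalone sketch of that behavior; the option, config contents, and values below are invented for illustration and are not taken from this commit:

    import argparse
    import configparser

    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--pages", type=int, default=1)

    # Stand-in for config.read("config.ini") with a [DEFAULT] section.
    config = configparser.ConfigParser()
    config.read_string("[DEFAULT]\npages = 5\n")

    app_parser = argparse.ArgumentParser(parents=[parser], add_help=False)
    app_parser.set_defaults(**config["DEFAULT"])

    print(app_parser.parse_args([]).pages)           # 5: config value overrides the argparse default
    print(app_parser.parse_args(["-p", "9"]).pages)  # 9: an explicit flag still wins

configparser's [DEFAULT] section is special in that its keys are inherited by every other section, which is why it pairs naturally with set_defaults here.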
16 changes: 9 additions & 7 deletions common_crawler/cache.py
@@ -8,11 +8,13 @@
- CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results
"""


class CommonCrawlerCacheManager:
"""
A class for managing the cache of Common Crawl search results.
This class is responsible for adding, retrieving, and saving cache data.
"""

def __init__(self, file_name: str = "cache", directory=None):
"""
Initializes the CacheStorage object with a file name and directory.
@@ -41,7 +43,6 @@ def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None:
self.cache[index][url] = {}
self.cache[index][url][keyword] = last_page


def get(self, index, url, keyword) -> int:
"""
Retrieves a page number from the cache.
@@ -53,36 +54,37 @@ def get(self, index, url, keyword) -> int:
Returns: int - the last page crawled
"""
if index in self.cache and url in self.cache[index] and keyword in self.cache[index][url]:
if (
index in self.cache
and url in self.cache[index]

Check warning (GitHub Actions / Lint, flake8 via reviewdog): W503 line break before binary operator at common_crawler/cache.py:59:13
and keyword in self.cache[index][url]

Check warning (GitHub Actions / Lint, flake8 via reviewdog): W503 line break before binary operator at common_crawler/cache.py:60:13
):
return self.cache[index][url][keyword]
# The cache object does not exist. Return 0 as the default value.
return 0


def load_or_create_cache(self) -> dict:
"""
Loads the cache from the configured file path.
If the file does not exist, an empty dictionary is returned.
Returns: dict - the cache data
"""
try:
with open(self.file_path, 'r') as file:
with open(self.file_path, "r") as file:
return json.load(file)
except FileNotFoundError:
return {}


def save_cache(self) -> None:
"""
Converts the cache object into a JSON-serializable format and saves it to the configured file path.
This method ensures the cache is stored in a readable and easily reloadable format, allowing for
persistence of crawl data across sessions.
"""
# Reformat cache data for JSON serialization
with open(self.file_path, 'w') as file:
with open(self.file_path, "w") as file:
json.dump(self.cache, file, indent=4)


def reset_cache(self) -> None:
"""
Resets the cache to an empty state.
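The W503 warnings flagged above (and again in crawler.py below) come from the new Black-style line breaks, which place binary operators at the start of continuation lines. Projects that format with Black commonly suppress W503 (and E203) rather than rewrap the code. A hypothetical flake8 config illustrating that choice; it is not part of this commit:

    # .flake8 (or the [flake8] section of setup.cfg / tox.ini)
    [flake8]
    # Match Black's default line length.
    max-line-length = 88
    # E203 (whitespace before ':') and W503 (line break before binary operator)
    # are the pycodestyle checks that conflict with Black's output.
    extend-ignore = E203, W503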
41 changes: 25 additions & 16 deletions common_crawler/crawler.py
@@ -16,7 +16,6 @@
# TODO: What happens when no results are found? How does the CommonCrawlerManager handle this?



@dataclass
class CommonCrawlResult:
last_page_search: int
@@ -31,16 +30,17 @@ class CommonCrawlerManager:
It validates crawl ids, manages pagination, and aggregates results.
"""

def __init__(self, crawl_id='CC-MAIN-2023-50'):
def __init__(self, crawl_id="CC-MAIN-2023-50"):

Check warning (GitHub Actions / Lint, flake8 via reviewdog): D107 Missing docstring in __init__ at common_crawler/crawler.py:33:1
self.crawl_id = crawl_id
CC_INDEX_SERVER = 'http://index.commoncrawl.org/'
INDEX_NAME = f'{self.crawl_id}-index'
self.root_url = f'{CC_INDEX_SERVER}{INDEX_NAME}'
CC_INDEX_SERVER = "http://index.commoncrawl.org/"
INDEX_NAME = f"{self.crawl_id}-index"
self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}"

def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult:
print(
f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages,"
f" starting at page {start_page}")
f" starting at page {start_page}"
)

url_results = []

@@ -64,7 +64,9 @@ def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult:

return CommonCrawlResult(last_page, url_results)

def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]:
def search_common_crawl_index(
self, url: str, page: int = 0, max_retries: int = 20
) -> list[dict]:
"""
This method is used to search the Common Crawl index for a given URL and page number
Args:
@@ -76,9 +78,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]:
"""
encoded_url = quote_plus(url)
search_url = URLWithParameters(self.root_url)
search_url.add_parameter('url', encoded_url)
search_url.add_parameter('output', 'json')
search_url.add_parameter('page', page)
search_url.add_parameter("url", encoded_url)
search_url.add_parameter("output", "json")
search_url.add_parameter("page", page)

retries = 0
delay = 1
@@ -90,7 +92,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]:
return self.process_response(response, url, page)

retries += 1
print(f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})")
print(
f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})"
)
time.sleep(delay)

print(f"Max retries exceeded. Failed to get records for {url} on page {page}.")
@@ -106,19 +110,24 @@ def make_request(self, search_url: str) -> requests.Response:
response.raise_for_status()
return response
except requests.exceptions.RequestException as e:
if response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR and 'SlowDown' in response.text:
if (
response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR
and "SlowDown" in response.text

Check warning (GitHub Actions / Lint, flake8 via reviewdog): W503 line break before binary operator at common_crawler/crawler.py:115:17
):
return None
else:
print(f"Failed to get records: {e}")
return None

def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]:
def process_response(
self, response: requests.Response, url: str, page: int
) -> list[dict]:
"""Processes the HTTP response and returns the parsed records if successful."""
if response.status_code == HTTPStatus.OK:
records = response.text.strip().split('\n')
records = response.text.strip().split("\n")
print(f"Found {len(records)} records for {url} on page {page}")
return [json.loads(record) for record in records]
elif 'First Page is 0, Last Page is 0' in response.text:
elif "First Page is 0, Last Page is 0" in response.text:
print("No records exist in index matching the url search term")
return None
else:
@@ -127,4 +136,4 @@ def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]:

@staticmethod
def get_urls_with_keyword(records: list[dict], keyword) -> list[str]:
return [record['url'] for record in records if keyword in record['url']]
return [record["url"] for record in records if keyword in record["url"]]
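A hypothetical usage sketch of the manager after these changes, assuming common_crawler is importable as a package and that network access to index.commoncrawl.org is available; the crawl id, search term, and keyword are placeholders rather than values from this commit:

    from common_crawler.crawler import CommonCrawlerManager

    manager = CommonCrawlerManager(crawl_id="CC-MAIN-2023-50")
    # crawl() pages through the index starting at start_page and keeps only
    # the URLs containing the keyword (see get_urls_with_keyword above).
    result = manager.crawl(
        search_term="*.example.com",
        keyword="police",
        start_page=0,
        num_pages=2,
    )
    print(result.last_page_search)  # last page searched (int)
    print(result.url_results)       # field name inferred from the CommonCrawlResult(...) call in crawl()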
23 changes: 9 additions & 14 deletions common_crawler/csv_manager.py
@@ -10,12 +10,7 @@ class CSVManager:
Creates the file if it doesn't exist, and provides a method for adding new rows.
"""

def __init__(
self,
file_name: str,
headers: list[str],
directory=None
):
def __init__(self, file_name: str, headers: list[str], directory=None):

Check warning (GitHub Actions / Lint, flake8 via reviewdog): D107 Missing docstring in __init__ at common_crawler/csv_manager.py:13:1
self.file_path = get_file_path(f"{file_name}.csv", directory)
self.headers = headers
if not os.path.exists(self.file_path):
@@ -29,9 +24,9 @@ def add_row(self, row_values: list[str] | tuple[str]):
"""
if isinstance(row_values, str):
# Single values must be converted to a list format
row_values = [row_values]
row_values = [row_values]
try:
with open(self.file_path, mode='a', newline='', encoding='utf-8') as file:
with open(self.file_path, mode="a", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
writer.writerow(row_values)
except Exception as e:
@@ -45,9 +40,7 @@ def add_rows(self, results: list[list[str]]) -> None:
Returns: None
"""
for result in results:
self.add_row(
result
)
self.add_row(result)
print(f"{len(results)} URLs written to {self.file_path}")

def initialize_file(self):
@@ -59,15 +52,17 @@ def initialize_file(self):
file_exists = os.path.isfile(self.file_path)

if not file_exists:
with open(self.file_path, mode='a', newline='', encoding='utf-8') as file:
with open(self.file_path, mode="a", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
writer.writerow(self.headers)
else:
# Open and check that headers match
with open(self.file_path, mode='r', encoding='utf-8') as file:
with open(self.file_path, mode="r", encoding="utf-8") as file:
header_row = next(csv.reader(file))
if header_row != self.headers:
raise ValueError(f"Header row in {self.file_path} does not match expected headers")
raise ValueError(
f"Header row in {self.file_path} does not match expected headers"
)
print(f"CSV file initialized at {self.file_path}")

def delete_file(self):
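A short hypothetical sketch of the class as reformatted above; the file name, headers, directory, and URLs below are invented for illustration, and common_crawler is assumed to be importable as a package:

    from common_crawler.csv_manager import CSVManager

    # Writes the header row to output/urls.csv if the file does not already exist.
    csv_manager = CSVManager(file_name="urls", headers=["url"], directory="output")

    # Each inner list is one row; add_rows() simply loops over add_row().
    csv_manager.add_rows(
        [
            ["https://example.com/police-department"],
            ["https://example.com/police-records"],
        ]
    )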
(Diffs for the remaining 22 changed files are not shown here.)
