Lint added files
eddie-m-m committed Nov 16, 2024
1 parent cdbae20 commit 9d9618a
Showing 26 changed files with 616 additions and 381 deletions.
43 changes: 32 additions & 11 deletions common_crawler/argparser.py
@@ -7,6 +7,7 @@
for the Common Crawler script.
"""


def valid_common_crawl_id(common_crawl_id: str) -> bool:
"""
Validate the Common Crawl ID format.
@@ -16,7 +17,8 @@ def valid_common_crawl_id(common_crawl_id: str) -> bool:
Returns:
True if the Common Crawl ID is valid, False otherwise
"""
return re.match(r'CC-MAIN-\d{4}-\d{2}', common_crawl_id) is not None
return re.match(r"CC-MAIN-\d{4}-\d{2}", common_crawl_id) is not None


def parse_args() -> argparse.Namespace:
"""
@@ -33,30 +35,49 @@
"""

parser = argparse.ArgumentParser(
description='Query the Common Crawl dataset and optionally save the results to a file.')
description="Query the Common Crawl dataset and optionally save the results to a file."
)
# Add the required arguments
parser.add_argument('common_crawl_id', type=str, help='The Common Crawl ID')
parser.add_argument('url', type=str, help='The URL to query')
parser.add_argument('keyword', type=str, help='The keyword to search in the url')
parser.add_argument("common_crawl_id", type=str, help="The Common Crawl ID")
parser.add_argument("url", type=str, help="The URL to query")
parser.add_argument("keyword", type=str, help="The keyword to search in the url")
# Optional arguments for the number of pages and the output file, and a flag to reset the cache
parser.add_argument('-c', '--config', type=str, default='config.ini', help='The configuration file to use')
parser.add_argument('-p', '--pages', type=int, default=1, help='The number of pages to search (default: 1)')
parser.add_argument('--reset-cache', action='store_true', default=False,
help='Reset the cache before starting the crawl')
parser.add_argument(
"-c",
"--config",
type=str,
default="config.ini",
help="The configuration file to use",
)
parser.add_argument(
"-p",
"--pages",
type=int,
default=1,
help="The number of pages to search (default: 1)",
)
parser.add_argument(
"--reset-cache",
action="store_true",
default=False,
help="Reset the cache before starting the crawl",
)

args = parser.parse_args()

# Validate the Common Crawl ID format
if not valid_common_crawl_id(args.common_crawl_id):
parser.error("Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW.")
parser.error(
"Invalid Common Crawl ID format. Expected format is CC-MAIN-YYYY-WW."
)

# Read the configuration file
config = configparser.ConfigParser()
config.read(args.config)

# Combine parsed arguments with configuration file defaults
app_parser = argparse.ArgumentParser(parents=[parser], add_help=False)
app_parser.set_defaults(**config['DEFAULT'])
app_parser.set_defaults(**config["DEFAULT"])

app_args = app_parser.parse_args()

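The two-pass parse above (a base parser plus an app_parser that layers config-file values via set_defaults) yields the precedence: explicit command-line value over config.ini [DEFAULT] value over the argparse default. A minimal standalone sketch of that behavior; the option, config contents, and values below are invented for illustration and are not taken from this commit:

    import argparse
    import configparser

    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--pages", type=int, default=1)

    # Stand-in for config.read("config.ini") with a [DEFAULT] section.
    config = configparser.ConfigParser()
    config.read_string("[DEFAULT]\npages = 5\n")

    app_parser = argparse.ArgumentParser(parents=[parser], add_help=False)
    app_parser.set_defaults(**config["DEFAULT"])

    print(app_parser.parse_args([]).pages)           # 5: config value overrides the argparse default
    print(app_parser.parse_args(["-p", "9"]).pages)  # 9: an explicit flag still wins

configparser's [DEFAULT] section is special in that its keys are inherited by every other section, which is why it pairs naturally with set_defaults here.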
16 changes: 9 additions & 7 deletions common_crawler/cache.py
@@ -8,11 +8,13 @@
- CommonCrawlerCache: a class for managing the cache logic of Common Crawl search results
"""


class CommonCrawlerCacheManager:
"""
A class for managing the cache of Common Crawl search results.
This class is responsible for adding, retrieving, and saving cache data.
"""

def __init__(self, file_name: str = "cache", directory=None):
"""
Initializes the CacheStorage object with a file name and directory.
@@ -41,7 +43,6 @@ def upsert(self, index: str, url: str, keyword: str, last_page: int) -> None:
self.cache[index][url] = {}
self.cache[index][url][keyword] = last_page


def get(self, index, url, keyword) -> int:
"""
Retrieves a page number from the cache.
@@ -53,36 +54,37 @@ def get(self, index, url, keyword) -> int:
Returns: int - the last page crawled
"""
if index in self.cache and url in self.cache[index] and keyword in self.cache[index][url]:
if (
index in self.cache
and url in self.cache[index]

Check warning (GitHub Actions / Lint, flake8 via reviewdog): W503 line break before binary operator at common_crawler/cache.py:59:13
and keyword in self.cache[index][url]

Check warning (GitHub Actions / Lint, flake8 via reviewdog): W503 line break before binary operator at common_crawler/cache.py:60:13
):
return self.cache[index][url][keyword]
# The cache object does not exist. Return 0 as the default value.
return 0


def load_or_create_cache(self) -> dict:
"""
Loads the cache from the configured file path.
If the file does not exist, an empty dictionary is returned.
Returns: dict - the cache data
"""
try:
with open(self.file_path, 'r') as file:
with open(self.file_path, "r") as file:
return json.load(file)
except FileNotFoundError:
return {}


def save_cache(self) -> None:
"""
Converts the cache object into a JSON-serializable format and saves it to the configured file path.
This method ensures the cache is stored in a readable and easily reloadable format, allowing for
persistence of crawl data across sessions.
"""
# Reformat cache data for JSON serialization
with open(self.file_path, 'w') as file:
with open(self.file_path, "w") as file:
json.dump(self.cache, file, indent=4)


def reset_cache(self) -> None:
"""
Resets the cache to an empty state.
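The W503 warnings flagged above (and again in crawler.py below) come from the new Black-style line breaks, which place binary operators at the start of continuation lines. Projects that format with Black commonly suppress W503 (and E203) rather than rewrap the code. A hypothetical flake8 config illustrating that choice; it is not part of this commit:

    # .flake8 (or the [flake8] section of setup.cfg / tox.ini)
    [flake8]
    # Match Black's default line length.
    max-line-length = 88
    # E203 (whitespace before ':') and W503 (line break before binary operator)
    # are the pycodestyle checks that conflict with Black's output.
    extend-ignore = E203, W503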
41 changes: 25 additions & 16 deletions common_crawler/crawler.py
@@ -16,7 +16,6 @@
# TODO: What happens when no results are found? How does the CommonCrawlerManager handle this?



@dataclass
class CommonCrawlResult:
last_page_search: int
@@ -31,16 +30,17 @@ class CommonCrawlerManager:
It validates crawl ids, manages pagination, and aggregates results.
"""

def __init__(self, crawl_id='CC-MAIN-2023-50'):
def __init__(self, crawl_id="CC-MAIN-2023-50"):

Check warning (GitHub Actions / Lint, flake8 via reviewdog): D107 Missing docstring in __init__ at common_crawler/crawler.py:33:1
self.crawl_id = crawl_id
CC_INDEX_SERVER = 'http://index.commoncrawl.org/'
INDEX_NAME = f'{self.crawl_id}-index'
self.root_url = f'{CC_INDEX_SERVER}{INDEX_NAME}'
CC_INDEX_SERVER = "http://index.commoncrawl.org/"
INDEX_NAME = f"{self.crawl_id}-index"
self.root_url = f"{CC_INDEX_SERVER}{INDEX_NAME}"

def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult:
print(
f"Searching for {keyword} on {search_term} in {self.crawl_id} for {num_pages} pages,"
f" starting at page {start_page}")
f" starting at page {start_page}"
)

url_results = []

@@ -64,7 +64,9 @@ def crawl(self, search_term, keyword, start_page, num_pages) -> CommonCrawlResult:

return CommonCrawlResult(last_page, url_results)

def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]:
def search_common_crawl_index(
self, url: str, page: int = 0, max_retries: int = 20
) -> list[dict]:
"""
This method is used to search the Common Crawl index for a given URL and page number
Args:
@@ -76,9 +78,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]:
"""
encoded_url = quote_plus(url)
search_url = URLWithParameters(self.root_url)
search_url.add_parameter('url', encoded_url)
search_url.add_parameter('output', 'json')
search_url.add_parameter('page', page)
search_url.add_parameter("url", encoded_url)
search_url.add_parameter("output", "json")
search_url.add_parameter("page", page)

retries = 0
delay = 1
@@ -90,7 +92,9 @@ def search_common_crawl_index(self, url: str, page: int = 0, max_retries: int = 20) -> list[dict]:
return self.process_response(response, url, page)

retries += 1
print(f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})")
print(
f"Rate limit exceeded. Retrying in {delay} second(s)... (Attempt {retries}/{max_retries})"
)
time.sleep(delay)

print(f"Max retries exceeded. Failed to get records for {url} on page {page}.")
@@ -106,19 +110,24 @@ def make_request(self, search_url: str) -> requests.Response:
response.raise_for_status()
return response
except requests.exceptions.RequestException as e:
if response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR and 'SlowDown' in response.text:
if (
response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR
and "SlowDown" in response.text

Check warning (GitHub Actions / Lint, flake8 via reviewdog): W503 line break before binary operator at common_crawler/crawler.py:115:17
):
return None
else:
print(f"Failed to get records: {e}")
return None

def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]:
def process_response(
self, response: requests.Response, url: str, page: int
) -> list[dict]:
"""Processes the HTTP response and returns the parsed records if successful."""
if response.status_code == HTTPStatus.OK:
records = response.text.strip().split('\n')
records = response.text.strip().split("\n")
print(f"Found {len(records)} records for {url} on page {page}")
return [json.loads(record) for record in records]
elif 'First Page is 0, Last Page is 0' in response.text:
elif "First Page is 0, Last Page is 0" in response.text:
print("No records exist in index matching the url search term")
return None
else:
@@ -127,4 +136,4 @@ def process_response(self, response: requests.Response, url: str, page: int) -> list[dict]:

@staticmethod
def get_urls_with_keyword(records: list[dict], keyword) -> list[str]:
return [record['url'] for record in records if keyword in record['url']]
return [record["url"] for record in records if keyword in record["url"]]
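A hypothetical usage sketch of the manager after these changes, assuming common_crawler is importable as a package and that network access to index.commoncrawl.org is available; the crawl id, search term, and keyword are placeholders rather than values from this commit:

    from common_crawler.crawler import CommonCrawlerManager

    manager = CommonCrawlerManager(crawl_id="CC-MAIN-2023-50")
    # crawl() pages through the index starting at start_page and keeps only
    # the URLs containing the keyword (see get_urls_with_keyword above).
    result = manager.crawl(
        search_term="*.example.com",
        keyword="police",
        start_page=0,
        num_pages=2,
    )
    print(result.last_page_search)  # last page searched (int)
    print(result.url_results)       # field name inferred from the CommonCrawlResult(...) call in crawl()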
23 changes: 9 additions & 14 deletions common_crawler/csv_manager.py
@@ -10,12 +10,7 @@ class CSVManager:
Creates the file if it doesn't exist, and provides a method for adding new rows.
"""

def __init__(
self,
file_name: str,
headers: list[str],
directory=None
):
def __init__(self, file_name: str, headers: list[str], directory=None):

Check warning (GitHub Actions / Lint, flake8 via reviewdog): D107 Missing docstring in __init__ at common_crawler/csv_manager.py:13:1
self.file_path = get_file_path(f"{file_name}.csv", directory)
self.headers = headers
if not os.path.exists(self.file_path):
@@ -29,9 +24,9 @@ def add_row(self, row_values: list[str] | tuple[str]):
"""
if isinstance(row_values, str):
# Single values must be converted to a list format
row_values = [row_values]
row_values = [row_values]
try:
with open(self.file_path, mode='a', newline='', encoding='utf-8') as file:
with open(self.file_path, mode="a", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
writer.writerow(row_values)
except Exception as e:
@@ -45,9 +40,7 @@ def add_rows(self, results: list[list[str]]) -> None:
Returns: None
"""
for result in results:
self.add_row(
result
)
self.add_row(result)
print(f"{len(results)} URLs written to {self.file_path}")

def initialize_file(self):
@@ -59,15 +52,17 @@ def initialize_file(self):
file_exists = os.path.isfile(self.file_path)

if not file_exists:
with open(self.file_path, mode='a', newline='', encoding='utf-8') as file:
with open(self.file_path, mode="a", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
writer.writerow(self.headers)
else:
# Open and check that headers match
with open(self.file_path, mode='r', encoding='utf-8') as file:
with open(self.file_path, mode="r", encoding="utf-8") as file:
header_row = next(csv.reader(file))
if header_row != self.headers:
raise ValueError(f"Header row in {self.file_path} does not match expected headers")
raise ValueError(
f"Header row in {self.file_path} does not match expected headers"
)
print(f"CSV file initialized at {self.file_path}")

def delete_file(self):
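A short hypothetical sketch of the class as reformatted above; the file name, headers, directory, and URLs below are invented for illustration, and common_crawler is assumed to be importable as a package:

    from common_crawler.csv_manager import CSVManager

    # Writes the header row to output/urls.csv if the file does not already exist.
    csv_manager = CSVManager(file_name="urls", headers=["url"], directory="output")

    # Each inner list is one row; add_rows() simply loops over add_row().
    csv_manager.add_rows(
        [
            ["https://example.com/police-department"],
            ["https://example.com/police-records"],
        ]
    )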
(Diffs for the remaining 22 changed files are not shown here.)
