From 52bfeba1cb16f2b4793a93229a95a73d20d9fdda Mon Sep 17 00:00:00 2001 From: Mike Date: Mon, 30 Oct 2023 00:43:24 -0700 Subject: [PATCH] style --- src/webtranspose/consts.py | 1 - src/webtranspose/crawl.py | 115 +++++++++++++++++++---------------- src/webtranspose/openai.py | 12 ++-- src/webtranspose/scrape.py | 20 +++--- src/webtranspose/webt_api.py | 7 +-- 5 files changed, 82 insertions(+), 73 deletions(-) delete mode 100644 src/webtranspose/consts.py diff --git a/src/webtranspose/consts.py b/src/webtranspose/consts.py deleted file mode 100644 index 1bea4cd..0000000 --- a/src/webtranspose/consts.py +++ /dev/null @@ -1 +0,0 @@ -WEBTRANSPOSE_API_URL = "https://api.webtranspose.com/" diff --git a/src/webtranspose/crawl.py b/src/webtranspose/crawl.py index 4d7f865..9342840 100644 --- a/src/webtranspose/crawl.py +++ b/src/webtranspose/crawl.py @@ -9,6 +9,7 @@ import zipfile from datetime import datetime from fnmatch import fnmatch +from typing import Dict, List, Optional from urllib.parse import urljoin, urlparse, urlunparse import httpx @@ -20,17 +21,17 @@ class Crawl: def __init__( self, - url, - allowed_urls=[], - banned_urls=[], - n_workers=1, - max_pages=15, - render_js=False, - output_dir="webtranspose-out", - verbose=False, - api_key=None, - _created=False, - ): + url: str, + allowed_urls: List[str] = [], + banned_urls: List[str] = [], + n_workers: int = 1, + max_pages: int = 15, + render_js: bool = False, + output_dir: str = "webtranspose-out", + verbose: bool = False, + api_key: Optional[str] = None, + _created: bool = False, + ) -> None: """ Initialize the Crawl object. @@ -74,19 +75,19 @@ def __init__( @staticmethod async def crawl_worker( - name, - queue, - crawl_id, - visited_urls, - allowed_urls, - banned_urls, - output_dir, - base_url, - max_pages, - leftover_queue, - ignored_queue, - verbose, - ): + name: str, + queue: asyncio.Queue, + crawl_id: str, + visited_urls: Dict[str, str], + allowed_urls: List[str], + banned_urls: List[str], + output_dir: str, + base_url: str, + max_pages: int, + leftover_queue: asyncio.Queue, + ignored_queue: asyncio.Queue, + verbose: bool, + ) -> None: """ Worker function for crawling URLs. @@ -104,7 +105,7 @@ async def crawl_worker( :param verbose: Whether to print verbose logging messages. """ - def _lint_url(url): + def _lint_url(url: str) -> str: """ Lint the given URL by removing the fragment component. @@ -200,7 +201,7 @@ def _lint_url(url): queue.task_done() - async def create_crawl_api(self): + def create_crawl_api(self): """ Creates a Crawl on https://webtranspose.com """ @@ -219,7 +220,7 @@ async def create_crawl_api(self): self.crawl_id = out_json["crawl_id"] self.created = True - async def queue_crawl(self): + def queue_crawl(self): """ Resume crawling of Crawl object. Don't wait for it to finish crawling. """ @@ -228,21 +229,23 @@ async def queue_crawl(self): else: if not self.created: - await self.create_crawl_api() + self.create_crawl_api() queue_json = { "crawl_id": self.crawl_id, } - run_webt_api( + out = run_webt_api( queue_json, "v1/crawl/resume", self.api_key, ) - + print(out) async def crawl(self): """ Resume crawling of Crawl object. 
""" + if self.verbose: + logging.info(f"Starting crawl of {self.base_url}") if self.api_key is None: leftover_queue = asyncio.Queue() ignored_queue = asyncio.Queue() @@ -274,14 +277,18 @@ async def crawl(self): self.ignored_urls = list(ignored_queue._queue) self.to_metadata() else: - await self.queue_crawl() + self.queue_crawl() status = self.status() - while status['num_queued'] > 0 and status['num_visited'] < status['max_pages']: + while status["num_queued"] + status["num_visited"] + status["num_ignored"] == 0: + await asyncio.sleep(5) + status = self.status() + + while status["num_queued"] > 0 and status["num_visited"] < status["max_pages"]: await asyncio.sleep(5) status = self.status() return self - def get_queue(self, n=10): + def get_queue(self, n: int = 10) -> list: """ Get a list of URLs from the queue. @@ -316,7 +323,7 @@ def get_queue(self, n=10): ) return out_json["urls"] - def set_allowed_urls(self, allowed_urls): + def set_allowed_urls(self, allowed_urls: list) -> "Crawl": """ Set the allowed URLs for the crawl. @@ -341,7 +348,7 @@ def set_allowed_urls(self, allowed_urls): ) return self - def set_banned_urls(self, banned_urls): + def set_banned_urls(self, banned_urls: list) -> "Crawl": """ Set the banned URLs for the crawl. @@ -351,7 +358,7 @@ def set_banned_urls(self, banned_urls): Returns: self: The Crawl object. """ - self.banned_urls = banned_urls + self.banned_urls = banned_urls if not self.created: self.to_metadata() else: @@ -366,7 +373,7 @@ def set_banned_urls(self, banned_urls): ) return self - def get_filename(self, url): + def get_filename(self, url: str) -> str: """ Get the filename associated with a visited URL. @@ -384,7 +391,7 @@ def get_filename(self, url): except KeyError: raise ValueError(f"URL {url} not found in visited URLs") - def set_max_pages(self, max_pages): + def set_max_pages(self, max_pages: int) -> "Crawl": """ Set the maximum number of pages to crawl. @@ -409,7 +416,7 @@ def set_max_pages(self, max_pages): ) return self - def status(self): + def status(self) -> dict: """ Get the status of the Crawl object. @@ -441,8 +448,8 @@ def status(self): ) crawl_status["loc"] = "cloud" return crawl_status - - def get_ignored(self): + + def get_ignored(self) -> list: """ Get a list of ignored URLs. @@ -462,7 +469,7 @@ def get_ignored(self): ) return out_json["pages"] - def get_visited(self): + def get_visited(self) -> list: """ Get a list of visited URLs. @@ -482,7 +489,7 @@ def get_visited(self): ) return out_json["pages"] - def get_banned(self): + def get_banned(self) -> list: """ Get a list of banned URLs. @@ -539,9 +546,10 @@ def download(self): filename = urllib.parse.quote_plus(url).replace("/", "_") filepath = os.path.join(base_dir, filename) + ".json" shutil.move(json_file, filepath) + logging.info(f"The output of the crawl can be found at: {self.output_dir}") - def to_metadata(self): + def to_metadata(self) -> None: """ Save the metadata of the Crawl object to a file. """ @@ -564,7 +572,7 @@ def to_metadata(self): json.dump(metadata, file) @staticmethod - def from_metadata(crawl_id, output_dir="webtranspose-out"): + def from_metadata(crawl_id: str, output_dir: str = "webtranspose-out") -> "Crawl": """ Create a Crawl object from metadata stored in a file. @@ -596,7 +604,7 @@ def from_metadata(crawl_id, output_dir="webtranspose-out"): return crawl @staticmethod - def from_cloud(crawl_id, api_key=None): + def from_cloud(crawl_id: str, api_key: Optional[str] = None) -> "Crawl": """ Create a Crawl object from metadata stored in the cloud. 
@@ -631,7 +639,7 @@ def from_cloud(crawl_id, api_key=None): "API key not found. Please set WEBTRANSPOSE_API_KEY environment variable or pass api_key argument." ) - def status(self): + def status(self) -> dict: """ Get the status of the Crawl object. @@ -661,7 +669,7 @@ def status(self): ) return crawl_status - def __str__(self): + def __str__(self) -> str: """ Get a string representation of the Crawl object. @@ -683,7 +691,7 @@ def __str__(self): f")" ) - def __repr__(self): + def __repr__(self) -> str: """ Get a string representation of the Crawl object. @@ -705,7 +713,7 @@ def __repr__(self): f")" ) - def get_page(self, url): + def get_page(self, url: str) -> dict: """ Get the page data for a given URL. @@ -735,7 +743,7 @@ def get_page(self, url): ) return out_json - def get_child_urls(self, url): + def get_child_urls(self, url: str) -> list: """ Get the child URLs for a given URL. @@ -770,12 +778,13 @@ def get_child_urls(self, url): return out_json -def get_crawl(crawl_id, api_key=None): +def get_crawl(crawl_id: str, api_key: Optional[str] = None) -> Crawl: """ Get a Crawl object based on the crawl ID. Args: crawl_id (str): The ID of the crawl. + api_key (str, optional): The API key. Defaults to None. Returns: Crawl: The Crawl object. @@ -786,7 +795,7 @@ def get_crawl(crawl_id, api_key=None): return Crawl.from_cloud(crawl_id, api_key=api_key) -def list_crawls(loc="cloud", api_key=None): +def list_crawls(loc: str = "cloud", api_key: Optional[str] = None) -> list: """ List all available crawls. diff --git a/src/webtranspose/openai.py b/src/webtranspose/openai.py index a0a559c..71e700c 100644 --- a/src/webtranspose/openai.py +++ b/src/webtranspose/openai.py @@ -8,8 +8,8 @@ class OpenAIScraper: def __init__( self, - chunk_size=2500, - overlap_size=100, + chunk_size: int = 2500, + overlap_size: int = 100, ): """ Initialize the OpenAIScraper. @@ -24,7 +24,9 @@ def __init__( self.overlap_size = overlap_size @staticmethod - def process_html(text, chunk_size, overlap_size, encoding): + def process_html( + text: str, chunk_size: int, overlap_size: int, encoding: tiktoken.Encoding + ) -> list: """ Process the HTML text into chunks. @@ -49,7 +51,7 @@ def process_html(text, chunk_size, overlap_size, encoding): decoded_chunks = [encoding.decode(chunk) for chunk in chunks] return decoded_chunks - def scrape(self, html, schema): + def scrape(self, html: str, schema: dict) -> dict: """ Scrape the HTML text using the provided schema. @@ -106,7 +108,7 @@ def scrape(self, html, schema): return out_data - def transform_schema(self, schema): + def transform_schema(self, schema: dict) -> dict: """ Transform the schema into the format required by OpenAI. diff --git a/src/webtranspose/scrape.py b/src/webtranspose/scrape.py index 2eabd39..4c0dbe5 100644 --- a/src/webtranspose/scrape.py +++ b/src/webtranspose/scrape.py @@ -12,14 +12,14 @@ class Scraper: def __init__( self, - schema, - scraper_id=None, - name=None, - render_js=False, - verbose=False, - scraper=None, - api_key=None, - _created=False, + schema: dict, + scraper_id: str = None, + name: str = None, + render_js: bool = False, + verbose: bool = False, + scraper: OpenAIScraper = None, + api_key: str = None, + _created: bool = False, ): """ Initialize the Scraper object. @@ -141,7 +141,7 @@ def status(self): ) -def get_scraper(scraper_id, api_key=None): +def get_scraper(scraper_id, api_key: str = None): """ Get a Scraper object based on the scraper ID. 
@@ -180,7 +180,7 @@ def get_scraper(scraper_id, api_key=None): raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.") -def list_scrapers(api_key=None): +def list_scrapers(api_key: str = None): """ List all available scrapers. diff --git a/src/webtranspose/webt_api.py b/src/webtranspose/webt_api.py index fcbf71d..2262515 100644 --- a/src/webtranspose/webt_api.py +++ b/src/webtranspose/webt_api.py @@ -3,10 +3,8 @@ import requests -from .consts import WEBTRANSPOSE_API_URL - -def run_webt_api(params, api_path, api_key=None): +def run_webt_api(params: dict, api_path: str, api_key: str = None) -> dict: """ Run a WebTranspose API request. @@ -21,11 +19,12 @@ def run_webt_api(params, api_path, api_key=None): Raises: Exception: If the API request fails with a non-200 status code. """ + WEBTRANSPOSE_API_URL = "https://api.webtranspose.com/" if api_key is None: api_key = os.environ.get("WEBTRANSPOSE_API_KEY") headers = {"X-API-Key": api_key} api_endpoint = urljoin(WEBTRANSPOSE_API_URL, api_path) - response = requests.post(api_endpoint, headers=headers, json=params) + response = requests.post(api_endpoint, headers=headers, json=params, timeout=180) if response.status_code == 200: return response.json() else:
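Reviewer note, not part of the patch: a minimal sketch of how the retyped entry
points might be exercised after this change. It assumes no WEBTRANSPOSE_API_KEY
is set (so the crawl runs locally) and uses a placeholder URL; create_crawl_api()
and queue_crawl() are now plain synchronous methods, while crawl() remains a
coroutine.

    import asyncio

    from webtranspose.crawl import Crawl


    async def main() -> None:
        # Placeholder target URL; with no API key set, pages are crawled
        # locally and written under output_dir ("webtranspose-out" by default).
        crawl = Crawl("https://www.example.com", max_pages=5, verbose=True)
        await crawl.crawl()      # still async; returns once crawling completes
        print(crawl.status())    # summary dict for the finished crawl


    if __name__ == "__main__":
        asyncio.run(main())

For cloud users, run_webt_api() keeps the same call shape; the behavioural
changes there are the inlined WEBTRANSPOSE_API_URL (replacing the deleted
consts.py) and the new 180-second request timeout.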