
Commit a8458db: add new features

mike-gee committed Nov 18, 2023
1 parent 69eee86 commit a8458db
Showing 3 changed files with 144 additions and 53 deletions.
1 change: 0 additions & 1 deletion src/webtranspose/chat.py
@@ -3,7 +3,6 @@
from time import sleep
from typing import List


from .webt_api import run_webt_api


147 changes: 97 additions & 50 deletions src/webtranspose/crawl.py
@@ -9,7 +9,7 @@
import zipfile
from datetime import datetime
from fnmatch import fnmatch
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse, urlunparse

import httpx
@@ -62,6 +62,7 @@ def __init__(
)
self.output_dir = output_dir
self.visited_urls = {}
self.failed_urls = set()
self.ignored_urls = set()
self.n_workers = n_workers
if not os.path.exists(self.output_dir):
@@ -86,6 +87,7 @@ async def crawl_worker(
crawl_id: str,
visited_urls: Dict[str, str],
allowed_urls: List[str],
failed_urls: Set[str],
banned_urls: List[str],
output_dir: str,
base_url: str,
@@ -146,7 +148,13 @@ def _lint_url(url: str) -> str:
filename = urllib.parse.quote_plus(curr_url).replace("/", "_")
filepath = os.path.join(base_dir, filename) + ".json"
async with httpx.AsyncClient() as client:
page = await client.get(curr_url)
try:
page = await client.get(curr_url)
except:
failed_urls.add(curr_url)
queue.task_done()
continue

page_title = None
page_html = None
page_text = None
@@ -268,6 +276,7 @@ async def crawl(self):
self.crawl_id,
self.visited_urls,
self.allowed_urls,
self.failed_urls,
self.banned_urls,
self.output_dir,
self.base_url,
@@ -293,6 +302,11 @@ async def crawl(self):
await asyncio.sleep(5)
status = self.status()

if (status["num_failed"] > 0) and (
status["num_queued"] + status["num_visited"] + status["num_ignored"] == 0
):
raise Exception("The first page crawled failed")

while status["num_queued"] > 0 and status["num_visited"] < status["max_pages"]:
await asyncio.sleep(5)
status = self.status()
@@ -441,6 +455,7 @@ def status(self) -> dict:
"max_pages": self.max_pages,
"num_visited": len(self.visited_urls),
"num_ignored": len(self.ignored_urls),
"num_failed": len(self.failed_urls),
"num_queued": self.queue.qsize(),
"banned_urls": self.banned_urls,
"allowed_urls": self.allowed_urls,
@@ -457,6 +472,8 @@ def status(self) -> dict:
self.api_key,
)
crawl_status["loc"] = "cloud"
if self.verbose:
logging.info(f"Status of crawl {self.crawl_id}: {crawl_status}")
return crawl_status

def get_ignored(self) -> list:
@@ -479,6 +496,26 @@ def get_ignored(self) -> list:
)
return out_json["pages"]

def get_failed(self) -> list:
"""
Get a list of failed URLs.
Returns:
list: A list of failed URLs.
"""
if not self.created:
return list(self.failed_urls)

visited_json = {
"crawl_id": self.crawl_id,
}
out_json = run_webt_api(
visited_json,
"v1/crawl/get/failed",
self.api_key,
)
return out_json["pages"]

def get_visited(self) -> list:
"""
Get a list of visited URLs.
@@ -652,36 +689,6 @@ def from_cloud(crawl_id: str, api_key: Optional[str] = None) -> "Crawl":
"API key not found. Please set WEBTRANSPOSE_API_KEY environment variable or pass api_key argument."
)

def status(self) -> dict:
"""
Get the status of the Crawl object.
Returns:
dict: The status of the Crawl object.
"""
if not self.created:
return {
"crawl_id": self.crawl_id,
"n_workers": self.n_workers,
"base_url": self.base_url,
"max_pages": self.max_pages,
"num_visited": len(self.visited_urls),
"num_ignored": len(self.ignored_urls),
"num_queued": self.queue.qsize(),
"banned_urls": self.banned_urls,
"allowed_urls": self.allowed_urls,
}

status_json = {
"crawl_id": self.crawl_id,
}
crawl_status = run_webt_api(
status_json,
"v1/crawl/get",
self.api_key,
)
return crawl_status

def __str__(self) -> str:
"""
Get a string representation of the Crawl object.
@@ -692,15 +699,16 @@ def __str__(self) -> str:
status = self.status()
return (
f"WebTransposeCrawl(\n"
f"Crawl ID: {status['crawl_id']}\n"
f"Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n"
f"Base URL: {status['base_url']}\n"
f"Max Pages: {status['max_pages']}\n"
f"Number of Visited URLs: {status['num_visited']}\n"
f"Number of Ignored URLs: {status['num_ignored']}\n"
f"Number of Queued URLs: {status['num_queued']}\n"
f"Banned URLs: {status['banned_urls']}\n"
f"Allowed URLs: {status['allowed_urls']}"
f" Crawl ID: {status['crawl_id']}\n"
f" Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n"
f" Base URL: {status['base_url']}\n"
f" Max Pages: {status['max_pages']}\n"
f" Number of Visited URLs: {status['num_visited']}\n"
f" Number of Ignored URLs: {status['num_ignored']}\n"
f" Number of Queued URLs: {status['num_queued']}\n"
f" Number of Failed URLs: {status['num_failed']}\n"
f" Banned URLs: {status['banned_urls']}\n"
f" Allowed URLs: {status['allowed_urls']}"
f")"
)

@@ -714,15 +722,16 @@ def __repr__(self) -> str:
status = self.status()
return (
f"WebTransposeCrawl(\n"
f"Crawl ID: {status['crawl_id']}\n"
f"Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n"
f"Base URL: {status['base_url']}\n"
f"Max Pages: {status['max_pages']}\n"
f"Number of Visited URLs: {status['num_visited']}\n"
f"Number of Ignored URLs: {status['num_ignored']}\n"
f"Number of Queued URLs: {status['num_queued']}\n"
f"Banned URLs: {status['banned_urls']}\n"
f"Allowed URLs: {status['allowed_urls']}"
f" Crawl ID: {status['crawl_id']}\n"
f" Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n"
f" Base URL: {status['base_url']}\n"
f" Max Pages: {status['max_pages']}\n"
f" Number of Visited URLs: {status['num_visited']}\n"
f" Number of Ignored URLs: {status['num_ignored']}\n"
f" Number of Queued URLs: {status['num_queued']}\n"
f" Number of Failed URLs: {status['num_failed']}\n"
f" Banned URLs: {status['banned_urls']}\n"
f" Allowed URLs: {status['allowed_urls']}"
f")"
)

@@ -790,6 +799,22 @@ def get_child_urls(self, url: str) -> list:
)
return out_json

def retry_failed_urls(self) -> None:
"""
Queue failed URLs from a crawl.
"""
if not self.created:
logging.error("Cannot retry failed URLs for un-created crawl.")
elif self.api_key is not None:
queue_json = {
"crawl_id": self.crawl_id,
}
run_webt_api(
queue_json,
"v1/crawl/retry-failed",
self.api_key,
)


def get_crawl(crawl_id: str, api_key: Optional[str] = None) -> Crawl:
"""
@@ -836,3 +861,25 @@ def list_crawls(loc: str = "cloud", api_key: Optional[str] = None) -> list:
if filename.endswith(".json"):
crawls.append(Crawl.from_metadata(filename[:-5]))
return crawls


def retry_failed(crawl_id: str, api_key: Optional[str] = None) -> None:
"""
Queue failed URLs from a crawl.
Args:
crawl_id (str): The ID of the crawl.
api_key (str, optional): The API key. Defaults to None.
"""
if api_key is None:
api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

if api_key is not None:
queue_json = {
"crawl_id": crawl_id,
}
run_webt_api(
queue_json,
"v1/crawl/retry-failed",
api_key,
)
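
Taken together, the crawl.py changes above add failed-URL tracking to Crawl. A minimal usage sketch, not part of the diff: the crawl ID is a placeholder, it assumes an existing crawl that has already run, and it assumes the returned status dict includes the new num_failed field.

import os
from webtranspose.crawl import get_crawl, retry_failed

api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
crawl = get_crawl("my-crawl-id", api_key=api_key)  # "my-crawl-id" is a placeholder

status = crawl.status()
print(status["num_failed"])   # new counter of URLs whose request raised an error
print(crawl.get_failed())     # new: list the failed URLs themselves

# Re-queue the failed URLs, either on the instance...
crawl.retry_failed_urls()
# ...or through the new module-level helper.
retry_failed("my-crawl-id", api_key=api_key)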
49 changes: 47 additions & 2 deletions src/webtranspose/scrape.py
@@ -1,3 +1,4 @@
import json
import logging
import os
import re
@@ -60,6 +61,42 @@ def __init__(
"No Web Transpose API provided. Lite version in use...\n\nTo run the actual WebT AI Web Scraper the Web Transpose API, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com. Run cheaper with logging and advanced analytics."
)

def __str__(self) -> str:
"""
Get a string representation of the Scraper object.
Returns:
str: The string representation of the Scraper object.
"""
status = self.status()
schema = json.dumps(status["schema"], indent=4)
return (
f"WebTransposeScraper(\n"
f" Status ID: {status['scraper_id']}\n"
f" Name: {status['name']}\n"
f" Render JS: {status['render_js']}\n"
f" Schema: {schema}\n"
f")"
)

def __repr__(self) -> str:
"""
Get a string representation of the Scraper object.
Returns:
str: The string representation of the Scraper object.
"""
status = self.status()
schema = json.dumps(status["schema"], indent=4)
return (
f"WebTransposeScraper(\n"
f" Status ID: {status['scraper_id']}\n"
f" Name: {status['name']}\n"
f" Render JS: {status['render_js']}\n"
f" Schema: {schema}\n"
f")"
)

def create_scraper_api(self):
"""
Creates a Scraper on https://webtranspose.com
@@ -135,7 +172,7 @@ def status(self):
Returns:
dict: The status of the Scraper.
"""
if self.api_key is None:
if self.api_key is None or not self.created:
return {
"scraper_id": self.scraper_id,
"name": self.name,
@@ -147,11 +184,19 @@ def status(self):
get_json = {
"scraper_id": self.scraper_id,
}
return run_webt_api(
out_api = run_webt_api(
get_json,
"/v1/scraper/get",
self.api_key,
)
scraper = out_api["scraper"]
return {
"scraper_id": scraper["id"],
"name": scraper["name"],
"verbose": self.verbose,
"render_js": scraper["render_js"],
"schema": scraper["schema"],
}


def get_scraper(scraper_id, api_key: str = None):
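
A short sketch of the scrape.py additions, not part of the diff: the scraper ID is a placeholder, and it assumes get_scraper returns a Scraper instance.

import os
from webtranspose.scrape import get_scraper

scraper = get_scraper("my-scraper-id", api_key=os.environ.get("WEBTRANSPOSE_API_KEY"))
print(scraper)            # new __str__: renders the ID, name, render_js flag and JSON schema
info = scraper.status()   # status() now normalizes the cloud response into
                          # scraper_id, name, verbose, render_js, schema
print(info["schema"])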
