Fix crawling error in AsyncWebCrawler #125

Open
wants to merge 1 commit into base: main
6 changes: 4 additions & 2 deletions crawl4ai/async_crawler_strategy.py
@@ -230,7 +230,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:

try:
if self.verbose:
print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
print(f"[LOG] Crawling {url} using AsyncPlaywrightCrawlerStrategy...")

if self.use_cached_html:
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
@@ -296,6 +296,8 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
raise RuntimeError(f"Wait condition failed: {str(e)}")

html = await page.content()
if html is None:
raise ValueError(f"Failed to crawl {url}: HTML content is None")
page = await self.execute_hook('before_return_html', page, html)

if self.verbose:
@@ -404,4 +406,4 @@ async def take_screenshot(self, url: str) -> str:
img.save(buffered, format="JPEG")
return base64.b64encode(buffered.getvalue()).decode('utf-8')
finally:
await page.close()
await page.close()
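
Taken in isolation, the change to `crawl()` is a fail-fast guard: if `page.content()` yields no HTML, the strategy raises immediately instead of handing `None` back to its caller, and the exception is caught one level up in `AsyncWebCrawler.arun()` (next file) and turned into a failed `CrawlResult`. A condensed, illustrative sketch of just that guard, not the full method, which also handles caching, hooks, and screenshots:

```python
# Illustrative sketch of the new guard inside AsyncPlaywrightCrawlerStrategy.crawl()
# (condensed; surrounding cache/hook/screenshot handling omitted).
html = await page.content()
if html is None:
    # Fail fast so the caller sees a clear error instead of a None body.
    raise ValueError(f"Failed to crawl {url}: HTML content is None")
page = await self.execute_hook('before_return_html', page, html)
```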
19 changes: 11 additions & 8 deletions crawl4ai/async_webcrawler.py
@@ -45,7 +45,7 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):

async def awarmup(self):
if self.verbose:
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
print("[LOG] Warming up the AsyncWebCrawler")
await async_db_manager.ainit_db()
await self.arun(
url="https://google.com/",
@@ -55,7 +55,7 @@ async def awarmup(self):
)
self.ready = True
if self.verbose:
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
print("[LOG] AsyncWebCrawler is ready to crawl")

async def arun(
self,
@@ -108,9 +108,12 @@ async def arun(
t2 = time.time()
if verbose:
print(
f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
f"[LOG] Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
)

if html is None:
raise ValueError(f"Failed to crawl {url}: HTML content is None")

crawl_result = await self.aprocess_html(
url,
html,
@@ -133,7 +136,7 @@ async def arun(
except Exception as e:
if not hasattr(e, "msg"):
e.msg = str(e)
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
print(f"[ERROR] Failed to crawl {url}, error: {e.msg}")
return CrawlResult(url=url, html="", success=False, error_message=e.msg)

async def arun_many(
@@ -148,7 +151,7 @@ async def arun_many(
user_agent: str = None,
verbose=True,
**kwargs,
) -> List[CrawlResult]:
) -> List[CrawlResult]:
tasks = [
self.arun(
url,
@@ -198,7 +201,7 @@ async def aprocess_html(
)
if verbose:
print(
f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
f"[LOG] Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
)

if result is None:
@@ -217,7 +220,7 @@ async def aprocess_html(
if extracted_content is None and extraction_strategy and chunking_strategy:
if verbose:
print(
f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}"
f"[LOG] Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}"
)

# Check if extraction strategy is type of JsonCssExtractionStrategy
@@ -232,7 +235,7 @@ async def aprocess_html(

if verbose:
print(
f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds."
f"[LOG] Extraction done for {url}, time taken: {time.time() - t:.2f} seconds."
)

screenshot = None if not screenshot else screenshot
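
Combined with the new check in `arun()`, a fetch that produces no HTML now surfaces to callers as a failed `CrawlResult` whose `error_message` names the missing content, instead of `None` reaching `aprocess_html`. A minimal usage sketch, using the import path from the test files below and a placeholder URL:

```python
import asyncio

from crawl4ai.async_webcrawler import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Placeholder URL; any fetch that yields no HTML should now fail cleanly.
        result = await crawler.arun(
            url="https://www.nonexistentwebsite123456789.com", bypass_cache=True
        )
        if not result.success:
            # Expected to contain "Failed to crawl ...: HTML content is None".
            print(result.error_message)

asyncio.run(main())
```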
3 changes: 2 additions & 1 deletion tests/async/test_basic_crawling.py
@@ -28,6 +28,7 @@ async def test_invalid_url():
result = await crawler.arun(url=url, bypass_cache=True)
assert not result.success
assert result.error_message
assert "HTML content is None" in result.error_message

@pytest.mark.asyncio
async def test_multiple_urls():
@@ -78,4 +79,4 @@ async def test_concurrent_crawling_performance():

# Entry point for debugging
if __name__ == "__main__":
pytest.main([__file__, "-v"])
pytest.main([__file__, "-v"])
129 changes: 65 additions & 64 deletions tests/async/test_error_handling.py
@@ -1,78 +1,79 @@
# import os
# import sys
# import pytest
# import asyncio
import os
import sys
import pytest
import asyncio

# # Add the parent directory to the Python path
# parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append(parent_dir)
# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

# from crawl4ai.async_webcrawler import AsyncWebCrawler
# from crawl4ai.utils import InvalidCSSSelectorError
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.utils import InvalidCSSSelectorError

# class AsyncCrawlerWrapper:
# def __init__(self):
# self.crawler = None
class AsyncCrawlerWrapper:
def __init__(self):
self.crawler = None

# async def setup(self):
# self.crawler = AsyncWebCrawler(verbose=True)
# await self.crawler.awarmup()
async def setup(self):
self.crawler = AsyncWebCrawler(verbose=True)
await self.crawler.awarmup()

# async def cleanup(self):
# if self.crawler:
# await self.crawler.aclear_cache()
async def cleanup(self):
if self.crawler:
await self.crawler.aclear_cache()

# @pytest.fixture(scope="module")
# def crawler_wrapper():
# wrapper = AsyncCrawlerWrapper()
# asyncio.get_event_loop().run_until_complete(wrapper.setup())
# yield wrapper
# asyncio.get_event_loop().run_until_complete(wrapper.cleanup())
@pytest.fixture(scope="module")
def crawler_wrapper():
wrapper = AsyncCrawlerWrapper()
asyncio.get_event_loop().run_until_complete(wrapper.setup())
yield wrapper
asyncio.get_event_loop().run_until_complete(wrapper.cleanup())

@pytest.mark.asyncio
async def test_network_error(crawler_wrapper):
url = "https://www.nonexistentwebsite123456789.com"
result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
assert not result.success
assert "Failed to crawl" in result.error_message
assert "HTML content is None" in result.error_message

# @pytest.mark.asyncio
# async def test_network_error(crawler_wrapper):
# url = "https://www.nonexistentwebsite123456789.com"
# result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
# async def test_timeout_error(crawler_wrapper):
# # Simulating a timeout by using a very short timeout value
# url = "https://www.nbcnews.com/business"
# result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, timeout=0.001)
# assert not result.success
# assert "Failed to crawl" in result.error_message

# # @pytest.mark.asyncio
# # async def test_timeout_error(crawler_wrapper):
# # # Simulating a timeout by using a very short timeout value
# # url = "https://www.nbcnews.com/business"
# # result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, timeout=0.001)
# # assert not result.success
# # assert "timeout" in result.error_message.lower()
# assert "timeout" in result.error_message.lower()

# # @pytest.mark.asyncio
# # async def test_invalid_css_selector(crawler_wrapper):
# # url = "https://www.nbcnews.com/business"
# # with pytest.raises(InvalidCSSSelectorError):
# # await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, css_selector="invalid>>selector")
# @pytest.mark.asyncio
# async def test_invalid_css_selector(crawler_wrapper):
# url = "https://www.nbcnews.com/business"
# with pytest.raises(InvalidCSSSelectorError):
# await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, css_selector="invalid>>selector")

# # @pytest.mark.asyncio
# # async def test_js_execution_error(crawler_wrapper):
# # url = "https://www.nbcnews.com/business"
# # invalid_js = "This is not valid JavaScript code;"
# # result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, js=invalid_js)
# # assert not result.success
# # assert "JavaScript" in result.error_message
# @pytest.mark.asyncio
# async def test_js_execution_error(crawler_wrapper):
# url = "https://www.nbcnews.com/business"
# invalid_js = "This is not valid JavaScript code;"
# result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, js=invalid_js)
# assert not result.success
# assert "JavaScript" in result.error_message

# # @pytest.mark.asyncio
# # async def test_empty_page(crawler_wrapper):
# # # Use a URL that typically returns an empty page
# # url = "http://example.com/empty"
# # result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
# # assert result.success # The crawl itself should succeed
# # assert not result.markdown.strip() # The markdown content should be empty or just whitespace
# @pytest.mark.asyncio
# async def test_empty_page(crawler_wrapper):
# # Use a URL that typically returns an empty page
# url = "http://example.com/empty"
# result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
# assert result.success # The crawl itself should succeed
# assert not result.markdown.strip() # The markdown content should be empty or just whitespace

# # @pytest.mark.asyncio
# # async def test_rate_limiting(crawler_wrapper):
# # # Simulate rate limiting by making multiple rapid requests
# # url = "https://www.nbcnews.com/business"
# # results = await asyncio.gather(*[crawler_wrapper.crawler.arun(url=url, bypass_cache=True) for _ in range(10)])
# # assert any(not result.success and "rate limit" in result.error_message.lower() for result in results)
# @pytest.mark.asyncio
# async def test_rate_limiting(crawler_wrapper):
# # Simulate rate limiting by making multiple rapid requests
# url = "https://www.nbcnews.com/business"
# results = await asyncio.gather(*[crawler_wrapper.crawler.arun(url=url, bypass_cache=True) for _ in range(10)])
# assert any(not result.success and "rate limit" in result.error_message.lower() for result in results)

# # Entry point for debugging
# if __name__ == "__main__":
# pytest.main([__file__, "-v"])
# Entry point for debugging
if __name__ == "__main__":
pytest.main([__file__, "-v"])
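
Assuming pytest and pytest-asyncio are installed (the tests rely on `@pytest.mark.asyncio`), the re-enabled network-error test can be run on its own, mirroring the file's existing entry point:

```python
# Run only the re-enabled test; requires pytest and pytest-asyncio.
import pytest

if __name__ == "__main__":
    pytest.main(["tests/async/test_error_handling.py::test_network_error", "-v"])
```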