From 49b52c7cda9cc3257e567ab85a0d9e97863b6f78 Mon Sep 17 00:00:00 2001 From: Mohit Lal Date: Thu, 7 Mar 2024 15:56:00 -0500 Subject: [PATCH] bug: Crawler downloads pdfs to project root directory --- haystack/nodes/connector/crawler.py | 2 ++ test/nodes/test_connector.py | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index f5300f2aa8..bfb6f6db4a 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -137,6 +137,8 @@ def __init__( options = Options() for option in set(webdriver_options): options.add_argument(option) + if self.output_dir: + options.add_experimental_option("prefs", {"download.default_directory": str(self.output_dir)}) self.driver = selenium_webdriver.Chrome(service=Service(), options=options) diff --git a/test/nodes/test_connector.py b/test/nodes/test_connector.py index 1dba26a50c..68f31befdb 100644 --- a/test/nodes/test_connector.py +++ b/test/nodes/test_connector.py @@ -266,3 +266,15 @@ def test_crawler_custom_webdriver(): crawler = Crawler(webdriver=webdriver) assert webdriver is crawler.driver + + +@pytest.mark.integration +def test_crawler_pdf_download_location(samples_path, tmp_path): + crawler = Crawler(output_dir=tmp_path) + + file_name = "sample_pdf_1.pdf" + pdf_uri = (samples_path / "pdf" / file_name).absolute().as_uri() + documents = crawler.crawl(urls=[pdf_uri]) + assert len(documents) == 1 + assert (tmp_path / file_name).exists() + assert len(os.listdir(tmp_path)) == 2