bug: Crawler downloads PDFs to project root directory

deepset-ai · Mar 7, 2024 · 49b52c7 · 49b52c7
1 parent fd13e8e
commit 49b52c7
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 0 deletions.
diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py
@@ -137,6 +137,8 @@ def __init__(
         options = Options()
         for option in set(webdriver_options):
             options.add_argument(option)
+        if self.output_dir:
+            options.add_experimental_option("prefs", {"download.default_directory": str(self.output_dir)})
 
         self.driver = selenium_webdriver.Chrome(service=Service(), options=options)
 

diff --git a/test/nodes/test_connector.py b/test/nodes/test_connector.py
@@ -266,3 +266,15 @@ def test_crawler_custom_webdriver():
     crawler = Crawler(webdriver=webdriver)
 
     assert webdriver is crawler.driver
+
+
+@pytest.mark.integration  # marked as an integration test (presumably because Crawler() starts a real Chrome webdriver — confirm)
+def test_crawler_pdf_download_location(samples_path, tmp_path):  # checks that a crawled PDF is saved inside output_dir
+    crawler = Crawler(output_dir=tmp_path)  # output_dir is handed to Chrome as the "download.default_directory" pref

+    file_name = "sample_pdf_1.pdf"
+    pdf_uri = (samples_path / "pdf" / file_name).absolute().as_uri()  # URI form of the local sample PDF, so it can be passed as a crawl URL
+    documents = crawler.crawl(urls=[pdf_uri])
+    assert len(documents) == 1  # one URL crawled -> one document returned
+    assert (tmp_path / file_name).exists()  # the PDF itself must have been downloaded into output_dir
+    assert len(os.listdir(tmp_path)) == 2  # exactly two entries: the PDF plus, presumably, the crawler's own output file for the document — TODO confirm