Skip to content

Commit

Permalink
bug: Crawler downloads pdfs to project root directory
Browse files: browse the repository at this point in the history
  • Loading branch information
mohitlal31 committed Mar 7, 2024
1 parent fd13e8e commit 49b52c7
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 0 deletions.
2 changes: 2 additions & 0 deletions haystack/nodes/connector/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ def __init__(
options = Options()
for option in set(webdriver_options):
options.add_argument(option)
if self.output_dir:
options.add_experimental_option("prefs", {"download.default_directory": str(self.output_dir)})

self.driver = selenium_webdriver.Chrome(service=Service(), options=options)

Expand Down
12 changes: 12 additions & 0 deletions test/nodes/test_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,15 @@ def test_crawler_custom_webdriver():
crawler = Crawler(webdriver=webdriver)

assert webdriver is crawler.driver


@pytest.mark.integration
def test_crawler_pdf_download_location(samples_path, tmp_path):
    """Check that crawling a PDF URI leaves the PDF inside ``output_dir``.

    Regression test for the crawler downloading PDFs into the project root
    directory: the crawler is created with ``output_dir=tmp_path`` and the
    sample PDF is expected to show up in that directory after the crawl.
    """
    pdf_crawler = Crawler(output_dir=tmp_path)

    pdf_name = "sample_pdf_1.pdf"
    sample_pdf = samples_path / "pdf" / pdf_name
    # Crawl the local sample through a file:// URI.
    crawled = pdf_crawler.crawl(urls=[sample_pdf.absolute().as_uri()])

    assert len(crawled) == 1

    # The PDF must be stored under the crawler's output directory.
    downloaded_pdf = tmp_path / pdf_name
    assert downloaded_pdf.exists()

    # Exactly two entries are expected in the output directory
    # (presumably the PDF plus the crawler's own output file -- TODO confirm).
    assert len(os.listdir(tmp_path)) == 2

0 comments on commit 49b52c7

Please sign in to comment.