Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add fetch_archive_from_http to fetch zip or gzip archives #7806

Merged
merged 2 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 59 additions & 5 deletions haystack/utils/import_utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
import logging
import gzip
import importlib
import importlib.util
from typing import Optional, Tuple, List
from urllib.parse import urlparse, unquote
from os.path import splitext, basename
import io
import logging
import zipfile
from os.path import basename, splitext
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import unquote, urlparse

import requests

from haystack.errors import DatasetsError
from haystack.schema import Document


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -55,5 +60,54 @@ def get_filename_extension_from_url(url: str) -> Tuple[str, str]:
return file_name, archive_extension


def fetch_archive_from_http(
    url: str,
    output_dir: str,
    proxies: Optional[Dict[str, str]] = None,
    timeout: Union[float, Tuple[float, float]] = 10.0,
) -> bool:
    """
    Fetch an archive (zip or gz) from a url via http and extract content to an output directory.

    :param url: http address
    :param output_dir: local path
    :param proxies: proxies details as required by requests library
    :param timeout: How many seconds to wait for the server to send data before giving up,
        as a float, or a :ref:`(connect timeout, read timeout) <timeouts>` tuple.
        Defaults to 10 seconds.
    :return: True if a download was attempted, False if the output directory already
        contained data and the fetch was skipped.
    :raises requests.HTTPError: if the server responds with an error status code.
    """
    # Verify & prepare local directory, creating missing parent directories too.
    path = Path(output_dir)
    if not path.exists():
        path.mkdir(parents=True)

    # Refuse to overwrite existing data: the caller must clear the directory first.
    is_not_empty = len(list(path.rglob("*"))) > 0
    if is_not_empty:
        logger.info("Found data stored in '%s'. Delete this first if you really want to fetch new data.", output_dir)
        return False

    logger.info("Fetching from %s to '%s'", url, output_dir)

    file_name, archive_extension = get_filename_extension_from_url(url)
    request_data = requests.get(url, proxies=proxies, timeout=timeout)
    # Fail early on HTTP errors (e.g. 404): otherwise an HTML error page would be
    # treated as archive content and extracted/written as garbage.
    request_data.raise_for_status()

    if archive_extension == "zip":
        zip_archive = zipfile.ZipFile(io.BytesIO(request_data.content))
        zip_archive.extractall(output_dir)
    elif archive_extension == "gz" and "tar.gz" not in url:
        # Plain gzip: decompress the single member and write it under its original name.
        gzip_archive = gzip.GzipFile(fileobj=io.BytesIO(request_data.content))
        file_content = gzip_archive.read()
        with open(Path(output_dir) / file_name, "wb") as file:
            file.write(file_content)
    else:
        # tar.gz and any other extension are deliberately not handled by this helper.
        logger.warning(
            "Skipped url %s as file type is not supported here. "
            "See haystack documentation for support of more file types",
            url,
        )

    return True


def is_whisper_available():
    """Return True if the `whisper` package can be imported, False otherwise."""
    spec = importlib.util.find_spec("whisper")
    return spec is not None
4 changes: 4 additions & 0 deletions releasenotes/notes/safe-fetch-4ba829def3241eec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
Add the previously removed `fetch_archive_from_http` util function to fetch zip and gzip archives from a URL.
Loading