From ec7f3eeaae1e7a918df2239c4e402db83883b73d Mon Sep 17 00:00:00 2001 From: Juan Nunez-Iglesias Date: Mon, 19 Feb 2024 15:47:38 +0100 Subject: [PATCH] Ensure all archive members are unpacked on a second call (#365) After running Untar/Unzip with a single member passed and then running again with `members=None` we'd only check that the unzipped folder exists and then return only the single member. This ensures that we return all of the requested files by checking that they exist in the unpacked folder. Co-authored-by: Leonardo Uieda --- pooch/processors.py | 93 ++++++++++++++++++++++++---------- pooch/tests/test_processors.py | 66 +++++++++++++++++------- 2 files changed, 115 insertions(+), 44 deletions(-) diff --git a/pooch/processors.py b/pooch/processors.py index dbe1a910..16670f9c 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -8,6 +8,7 @@ """ Post-processing hooks """ +import abc import os import bz2 import gzip @@ -19,9 +20,9 @@ from .utils import get_logger -class ExtractorProcessor: # pylint: disable=too-few-public-methods +class ExtractorProcessor(abc.ABC): # pylint: disable=too-few-public-methods """ - Base class for extractions from compressed archives. + Abstract base class for extractions from compressed archives. Subclasses can be used with :meth:`pooch.Pooch.fetch` and :func:`pooch.retrieve` to unzip a downloaded data file into a folder in the @@ -34,16 +35,43 @@ class ExtractorProcessor: # pylint: disable=too-few-public-methods If None, will unpack all files in the archive. Otherwise, *members* must be a list of file names to unpack from the archive. Only these files will be unpacked. + extract_dir : str or None + If None, files will be unpacked to the default location (a folder in + the same location as the downloaded zip file, with a suffix added). + Otherwise, files will be unpacked to ``extract_dir``, which is + interpreted as a *relative path* (relative to the cache location + provided by :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch`). """ - # String appended to unpacked archive. To be implemented in subclass - suffix = None - def __init__(self, members=None, extract_dir=None): self.members = members self.extract_dir = extract_dir + @property + @abc.abstractmethod + def suffix(self): + """ + String appended to unpacked archive folder name. + Only used if extract_dir is None. + MUST BE IMPLEMENTED BY CHILD CLASSES. + """ + + @abc.abstractmethod + def _all_members(self, fname): + """ + Return all the members in the archive. + MUST BE IMPLEMENTED BY CHILD CLASSES. + """ + + @abc.abstractmethod + def _extract_file(self, fname, extract_dir): + """ + This method receives an argument for the archive to extract and the + destination path. + MUST BE IMPLEMENTED BY CHILD CLASSES. + """ + def __call__(self, fname, action, pooch): """ Extract all files from the given archive. @@ -69,27 +97,23 @@ def __call__(self, fname, action, pooch): A list of the full path to all files in the extracted archive. """ - if self.suffix is None and self.extract_dir is None: - raise NotImplementedError( - "Derived classes must define either a 'suffix' attribute or " - "an 'extract_dir' attribute." - ) if self.extract_dir is None: self.extract_dir = fname + self.suffix else: archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0] self.extract_dir = os.path.join(archive_dir, self.extract_dir) + # Get a list of everyone who is supposed to be in the unpacked folder + # so we can check if they are all there or if we need to extract new + # files. + if self.members is None or not self.members: + members = self._all_members(fname) + else: + members = self.members if ( (action in ("update", "download")) or (not os.path.exists(self.extract_dir)) - or ( - (self.members is not None) - and ( - not all( - os.path.exists(os.path.join(self.extract_dir, m)) - for m in self.members - ) - ) + or not all( + os.path.exists(os.path.join(self.extract_dir, m)) for m in members ) ): # Make sure that the folder with the extracted files exists @@ -111,13 +135,6 @@ def __call__(self, fname, action, pooch): return fnames - def _extract_file(self, fname, extract_dir): - """ - This method receives an argument for the archive to extract and the - destination path. MUST BE IMPLEMENTED BY CHILD CLASSES. - """ - raise NotImplementedError - class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods """ @@ -146,7 +163,18 @@ class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods """ - suffix = ".unzip" + @property + def suffix(self): + """ + String appended to unpacked archive folder name. + Only used if extract_dir is None. + """ + return ".unzip" + + def _all_members(self, fname): + """Return all members from a given archive.""" + with ZipFile(fname, "r") as zip_file: + return zip_file.namelist() def _extract_file(self, fname, extract_dir): """ @@ -207,7 +235,18 @@ class Untar(ExtractorProcessor): # pylint: disable=too-few-public-methods :meth:`pooch.Pooch.fetch`). """ - suffix = ".untar" + @property + def suffix(self): + """ + String appended to unpacked archive folder name. + Only used if extract_dir is None. + """ + return ".untar" + + def _all_members(self, fname): + """Return all members from a given archive.""" + with TarFile.open(fname, "r") as tar_file: + return [info.name for info in tar_file.getmembers()] def _extract_file(self, fname, extract_dir): """ diff --git a/pooch/tests/test_processors.py b/pooch/tests/test_processors.py index 578b478d..1a2a1e2a 100644 --- a/pooch/tests/test_processors.py +++ b/pooch/tests/test_processors.py @@ -14,7 +14,7 @@ import pytest from .. import Pooch -from ..processors import Unzip, Untar, ExtractorProcessor, Decompress +from ..processors import Unzip, Untar, Decompress from .utils import pooch_test_url, pooch_test_registry, check_tiny_data, capture_log @@ -97,22 +97,6 @@ def test_decompress_fails(): assert "pooch.Unzip/Untar" in exception.value.args[0] -@pytest.mark.network -def test_extractprocessor_fails(): - "The base class should be used and should fail when passed to fecth" - with TemporaryDirectory() as local_store: - # Setup a pooch in a temp dir - pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) - processor = ExtractorProcessor() - with pytest.raises(NotImplementedError) as exception: - pup.fetch("tiny-data.tar.gz", processor=processor) - assert "'suffix'" in exception.value.args[0] - processor.suffix = "tar.gz" - with pytest.raises(NotImplementedError) as exception: - pup.fetch("tiny-data.tar.gz", processor=processor) - assert not exception.value.args - - @pytest.mark.network @pytest.mark.parametrize( "target_path", [None, "some_custom_path"], ids=["default_path", "custom_path"] @@ -255,3 +239,51 @@ def _unpacking_expected_paths_and_logs(archive, members, path, name): log_lines.append(f"Extracting '{member}'") true_paths = set(true_paths) return true_paths, log_lines + + +@pytest.mark.network +@pytest.mark.parametrize( + "processor_class,extension", + [(Unzip, ".zip"), (Untar, ".tar.gz")], +) +def test_unpacking_members_then_no_members(processor_class, extension): + """ + Test that calling with valid members then without them works. + https://github.com/fatiando/pooch/issues/364 + """ + with TemporaryDirectory() as local_store: + pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) + + # Do a first fetch with an existing member + processor1 = processor_class(members=["store/tiny-data.txt"]) + filenames1 = pup.fetch("store" + extension, processor=processor1) + assert len(filenames1) == 1 + + # Do a second fetch with no members + processor2 = processor_class() + filenames2 = pup.fetch("store" + extension, processor=processor2) + assert len(filenames2) > 1 + + +@pytest.mark.network +@pytest.mark.parametrize( + "processor_class,extension", + [(Unzip, ".zip"), (Untar, ".tar.gz")], +) +def test_unpacking_wrong_members_then_no_members(processor_class, extension): + """ + Test that calling with invalid members then without them works. + https://github.com/fatiando/pooch/issues/364 + """ + with TemporaryDirectory() as local_store: + pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) + + # Do a first fetch with incorrect member + processor1 = processor_class(members=["not-a-valid-file.csv"]) + filenames1 = pup.fetch("store" + extension, processor=processor1) + assert len(filenames1) == 0 + + # Do a second fetch with no members + processor2 = processor_class() + filenames2 = pup.fetch("store" + extension, processor=processor2) + assert len(filenames2) > 0