From 749a80a70ae71db4797fe01e02b2ac0e227e9880 Mon Sep 17 00:00:00 2001
From: clearml <>
Date: Sat, 7 Dec 2024 17:20:15 +0200
Subject: [PATCH] Don't download external files from parent datasets if they
 have been modified/removed in the child dataset

---
 clearml/datasets/dataset.py | 41 +++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py
index 4ac0e001..35f49c16 100644
--- a/clearml/datasets/dataset.py
+++ b/clearml/datasets/dataset.py
@@ -552,7 +552,7 @@ def remove_files(self, dataset_path=None, recursive=True, verbose=False):
         self._dataset_link_entries = {
             k: v
             for k, v in self._dataset_link_entries.items()
             if not matches_any_wildcard(k, dataset_path, recursive=recursive)
-            and not matches_any_wildcard(v.link, dataset_path, recursive=recursive)
+            and not (matches_any_wildcard(v.link, dataset_path, recursive=recursive) or v.link == dataset_path)
         }
         removed = 0
@@ -2263,14 +2263,14 @@ def _download_dataset_archives(self):
 
     def _get_dataset_files(
         self,
-        force=False,
-        selected_chunks=None,
-        lock_target_folder=False,
-        cleanup_target_folder=True,
-        target_folder=None,
-        max_workers=None
+        force=False,  # type: bool
+        selected_chunks=None,  # type: Optional[List[int]]
+        lock_target_folder=False,  # type: bool
+        cleanup_target_folder=True,  # type: bool
+        target_folder=None,  # type: Optional[Path]
+        max_workers=None,  # type: Optional[int]
+        link_entries_of_interest=None  # type: Optional[Dict[str, LinkEntry]]
     ):
-        # type: (bool, Optional[List[int]], bool, bool, Optional[Path], Optional[int]) -> str
         """
         First, extracts the archive present on the ClearML server containing this dataset's files.
         Then, download the remote files. Note that if a remote file was added to the ClearML server, then
@@ -2287,6 +2287,8 @@ def _get_dataset_files(
         :param target_folder: If provided use the specified target folder, default, auto generate from Dataset ID.
         :param max_workers: Number of threads to be spawned when getting dataset files. Defaults to the number of
             virtual cores.
+        :param link_entries_of_interest: Download only the external files in this dictionary.
+            Useful when one doesn't want to download all the files in a parent dataset, as some files might be removed.
         :return: Path to the local storage where the data was downloaded
         """

@@ -2300,14 +2302,21 @@ def _get_dataset_files(
             max_workers=max_workers
         )
         self._download_external_files(
-            target_folder=target_folder, lock_target_folder=lock_target_folder, max_workers=max_workers
+            target_folder=target_folder,
+            lock_target_folder=lock_target_folder,
+            max_workers=max_workers,
+            link_entries_of_interest=link_entries_of_interest,
         )
         return local_folder

     def _download_external_files(
-        self, target_folder=None, lock_target_folder=False, max_workers=None
+        self,
+        target_folder=None,
+        lock_target_folder=False,
+        max_workers=None,
+        link_entries_of_interest=None
     ):
-        # (Union(Path, str), bool) -> None
+        # type: (Union[Path, str], bool, Optional[int], Optional[Dict[str, LinkEntry]]) -> None
         """
         Downloads external files in the dataset. These files will be downloaded
         at relative_path (the path relative to the target_folder). Note that
@@ -2318,6 +2327,8 @@ def _download_external_files(
         :param lock_target_folder: If True, local the target folder so the next cleanup will not delete
             Notice you should unlock it manually, or wait for the process to finish for auto unlocking.
         :param max_workers: Number of threads to be spawned when getting dataset files. Defaults to no multi-threading.
+        :param link_entries_of_interest: Download only the external files in this dictionary.
+            Useful when one doesn't want to download all the files in a parent dataset, as some files might be removed.
         """
         def _download_link(link, target_path):
             if os.path.exists(target_path):
@@ -2370,12 +2381,13 @@ def _submit_download_link(relative_path, link, target_folder, pool=None):
             )[0]
         ).as_posix()

+        link_entries_of_interest = link_entries_of_interest or self._dataset_link_entries
         if not max_workers:
-            for relative_path, link in self._dataset_link_entries.items():
+            for relative_path, link in link_entries_of_interest.items():
                 _submit_download_link(relative_path, link, target_folder)
         else:
             with ThreadPoolExecutor(max_workers=max_workers) as pool:
-                for relative_path, link in self._dataset_link_entries.items():
+                for relative_path, link in link_entries_of_interest.items():
                     _submit_download_link(relative_path, link, target_folder, pool=pool)

     def _extract_dataset_archive(
@@ -3224,7 +3236,8 @@ def _extract_parent_datasets(
                 force=force,
                 lock_target_folder=True,
                 cleanup_target_folder=False,
-                max_workers=max_workers
+                max_workers=max_workers,
+                link_entries_of_interest=self._dataset_link_entries
             ))
             ds_base_folder.touch()
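
For reviewers, a minimal sketch of the behavior this patch targets (not part of the
diff). The dataset names, project, and s3:// URL below are illustrative placeholders;
the calls are the public clearml Dataset API:

    from clearml import Dataset

    # Parent dataset tracks one external (remote) file.
    parent = Dataset.create(dataset_name="parent", dataset_project="demo")
    parent.add_external_files(source_url="s3://bucket/data/a.csv", dataset_path="data")
    parent.upload()
    parent.finalize()

    # Child dataset drops that external file. With the remove_files() change
    # above, passing the exact link URL also matches (v.link == dataset_path).
    child = Dataset.create(dataset_name="child", dataset_project="demo",
                           parent_datasets=[parent.id])
    child.remove_files("s3://bucket/data/a.csv")
    child.upload()
    child.finalize()

    # Before this patch, fetching the child re-downloaded a.csv while extracting
    # the parent, even though the child removed it; now parent extraction only
    # fetches the link entries still present in the child.
    local_copy = Dataset.get(dataset_id=child.id).get_local_copy()

Passing the child's _dataset_link_entries into the parent's _get_dataset_files() call
in _extract_parent_datasets() is what restricts the downloads: a parent's external
file is fetched only if the child still lists that link entry.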