diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index c155030c594..f66cfecfe8c 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -59,6 +59,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.compute as pc
+from fsspec.core import url_to_fs
 from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
 from multiprocess import Pool
 from tqdm.contrib.concurrent import thread_map
@@ -1504,7 +1505,7 @@ def save_to_disk(
         num_shards = num_shards if num_shards is not None else num_proc
 
         fs: fsspec.AbstractFileSystem
-        fs, _, _ = fsspec.get_fs_token_paths(dataset_path, storage_options=storage_options)
+        fs, _ = url_to_fs(dataset_path, **(storage_options or {}))
 
         if not is_remote_filesystem(fs):
             parent_cache_files_paths = {
@@ -1694,7 +1695,7 @@ def load_from_disk(
             storage_options = fs.storage_options
 
         fs: fsspec.AbstractFileSystem
-        fs, _, [dataset_path] = fsspec.get_fs_token_paths(dataset_path, storage_options=storage_options)
+        fs, dataset_path = url_to_fs(dataset_path, **(storage_options or {}))
 
         dest_dataset_path = dataset_path
         dataset_dict_json_path = posixpath.join(dest_dataset_path, config.DATASETDICT_JSON_FILENAME)
diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 3fed7e14e38..82e72a91ecc 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -24,6 +24,7 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.parquet as pq
+from fsspec.core import url_to_fs
 
 from . import config
 from .features import Features, Image, Value
@@ -327,14 +328,10 @@ def __init__(
         self._disable_nullable = disable_nullable
 
         if stream is None:
-            fs_token_paths = fsspec.get_fs_token_paths(path, storage_options=storage_options)
-            self._fs: fsspec.AbstractFileSystem = fs_token_paths[0]
-            self._path = (
-                fs_token_paths[2][0]
-                if not is_remote_filesystem(self._fs)
-                else self._fs.unstrip_protocol(fs_token_paths[2][0])
-            )
-            self.stream = self._fs.open(fs_token_paths[2][0], "wb")
+            fs, path = url_to_fs(path, **(storage_options or {}))
+            self._fs: fsspec.AbstractFileSystem = fs
+            self._path = path if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(path)
+            self.stream = self._fs.open(path, "wb")
             self._closable_stream = True
         else:
             self._fs = None
@@ -681,7 +678,7 @@ def finalize(self, metrics_query_result: dict):
         """
 
         # Beam FileSystems require the system's path separator in the older versions
-        fs, _, [parquet_path] = fsspec.get_fs_token_paths(self._parquet_path)
+        fs, parquet_path = url_to_fs(self._parquet_path)
         parquet_path = str(Path(parquet_path)) if not is_remote_filesystem(fs) else fs.unstrip_protocol(parquet_path)
         shards = fs.glob(parquet_path + "*.parquet")
diff --git a/src/datasets/builder.py b/src/datasets/builder.py
index 1389aa7a20e..30b57ff3da3 100644
--- a/src/datasets/builder.py
+++ b/src/datasets/builder.py
@@ -34,6 +34,7 @@
 
 import fsspec
 import pyarrow as pa
+from fsspec.core import url_to_fs
 from multiprocess import Pool
 from tqdm.contrib.concurrent import thread_map
 
@@ -883,7 +884,7 @@ def download_and_prepare(
         output_dir = output_dir if output_dir is not None else self._cache_dir
         # output_dir can be a remote bucket on GCS or S3 (when using BeamBasedBuilder for distributed data processing)
-        fs, _, [output_dir] = fsspec.get_fs_token_paths(output_dir, storage_options=storage_options)
+        fs, output_dir = url_to_fs(output_dir, **(storage_options or {}))
         self._fs = fs
         self._output_dir = output_dir if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(output_dir)
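A note on the migration pattern used throughout this diff: `fsspec.get_fs_token_paths` returns a `(fs, token, paths)` triple with the resolved paths wrapped in a list, while `fsspec.core.url_to_fs` returns a plain `(fs, path)` pair and takes filesystem options as keyword arguments. A minimal sketch of the difference (the local path is just for illustration):

```python
# Minimal sketch of the API difference driving this refactor.
import fsspec
from fsspec.core import url_to_fs

# Old style: three return values, resolved paths wrapped in a list.
fs, _token, paths = fsspec.get_fs_token_paths("file:///tmp/data", storage_options={})
print(type(fs).__name__, paths)  # LocalFileSystem ['/tmp/data']

# New style: two return values and a single resolved path; storage options
# are passed as **kwargs -- hence the `**(storage_options or {})` idiom above.
fs, path = url_to_fs("file:///tmp/data")
print(type(fs).__name__, path)  # LocalFileSystem /tmp/data
```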
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 752145413db..4a918595408 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -6,7 +6,7 @@
 from typing import Callable, Dict, List, Optional, Set, Tuple, Union
 
 import huggingface_hub
-from fsspec import get_fs_token_paths
+from fsspec.core import url_to_fs
 from fsspec.implementations.http import HTTPFileSystem
 from huggingface_hub import HfFileSystem
 from packaging import version
@@ -46,28 +46,48 @@ class EmptyDatasetError(FileNotFoundError):
 }
 NON_WORDS_CHARS = "-._ 0-9"
 if config.FSSPEC_VERSION < version.parse("2023.9.0"):
-    KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "{keyword}/**",
+        "{keyword}[{sep}]*/**",
+        "**[{sep}/]{keyword}/**",
+        "**[{sep}/]{keyword}[{sep}]*/**",
+    ]
 elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
-    KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"]
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "{keyword}/**/*",
+        "{keyword}[{sep}]*/**/*",
+        "**/*[{sep}/]{keyword}/**/*",
+        "**/*[{sep}/]{keyword}[{sep}]*/**/*",
+    ]
 else:
-    KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = [
-        "**/{keyword}[{sep}]*",
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
         "**/{keyword}/**",
-        "**/*[{sep}]{keyword}[{sep}]*",
-        "**/*[{sep}]{keyword}[{sep}]*/**",
         "**/{keyword}[{sep}]*/**",
         "**/*[{sep}]{keyword}/**",
+        "**/*[{sep}]{keyword}[{sep}]*/**",
     ]
 DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
-DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
+DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
     split: [
         pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
         for keyword in SPLIT_KEYWORDS[split]
-        for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
     ]
     for split in DEFAULT_SPLITS
 }
+DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
+    split: [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in SPLIT_KEYWORDS[split]
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
+    ]
+    for split in DEFAULT_SPLITS
+}
+
 DEFAULT_PATTERNS_ALL = {
     Split.TRAIN: ["**"],
@@ -75,7 +95,8 @@ class EmptyDatasetError(FileNotFoundError):
 ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
 ALL_DEFAULT_PATTERNS = [
-    DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME,
+    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
+    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
     DEFAULT_PATTERNS_ALL,
 ]
 if config.FSSPEC_VERSION < version.parse("2023.9.0"):
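The change above splits the old `KEYWORDS_IN_PATH_NAME_BASE_PATTERNS` into separate filename and directory-name families, with the dir-name patterns now tried first in `ALL_DEFAULT_PATTERNS`. A short sketch of how the base patterns expand for one split keyword (values taken from the `fsspec >= 2023.12` branch above):

```python
# How the base patterns expand for the "train" split keyword
# (fsspec >= 2023.12 branch of the definitions above).
NON_WORDS_CHARS = "-._ 0-9"
KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
    "**/{keyword}/**",
    "**/{keyword}[{sep}]*/**",
    "**/*[{sep}]{keyword}/**",
    "**/*[{sep}]{keyword}[{sep}]*/**",
]

for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS + KEYWORDS_IN_DIR_NAME_BASE_PATTERNS:
    print(pattern.format(keyword="train", sep=NON_WORDS_CHARS))
# **/train[-._ 0-9]*            <- filename like train-00000.parquet
# **/*[-._ 0-9]train[-._ 0-9]*  <- filename like my_train_file.txt
# **/train/**                   <- anything under a train/ directory
# ... and so on for the remaining dir-name patterns
```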
@@ -351,7 +372,7 @@ def resolve_pattern(
     else:
         base_path = ""
     pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
-    fs, _, _ = get_fs_token_paths(pattern, storage_options=storage_options)
+    fs, *_ = url_to_fs(pattern, **storage_options)
     fs_base_path = base_path.split("::")[0].split("://")[-1] or fs.root_marker
     fs_pattern = pattern.split("::")[0].split("://")[-1]
     files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
@@ -409,7 +430,7 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
 
     Output:
 
-        {"train": ["**"]}
+        {'train': ['**']}
 
     Input:
 
@@ -435,8 +456,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
 
     Output:
 
-        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
-         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
+        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
+         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}
 
     Input:
 
@@ -454,8 +475,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]
 
     Output:
 
-        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
-         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
+        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
+         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}
 
     Input:
 
@@ -504,7 +525,7 @@ def _get_single_origin_metadata(
     download_config: Optional[DownloadConfig] = None,
 ) -> Tuple[str]:
     data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
-    fs, _, _ = get_fs_token_paths(data_file, storage_options=storage_options)
+    fs, *_ = url_to_fs(data_file, **storage_options)
     if isinstance(fs, HfFileSystem):
         resolved_path = fs.resolve_path(data_file)
         return (resolved_path.repo_id, resolved_path.revision)
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index ab26dbbb83d..18c99dac1b8 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -12,6 +12,7 @@
 
 import fsspec
 import numpy as np
+from fsspec.core import url_to_fs
 from huggingface_hub import (
     CommitInfo,
     CommitOperationAdd,
@@ -1280,7 +1281,7 @@ def save_to_disk(
             storage_options = fs.storage_options
 
         fs: fsspec.AbstractFileSystem
-        fs, _, _ = fsspec.get_fs_token_paths(dataset_dict_path, storage_options=storage_options)
+        fs, _ = url_to_fs(dataset_dict_path, **(storage_options or {}))
 
         if num_shards is None:
             num_shards = {k: None for k in self}
@@ -1354,7 +1355,7 @@ def load_from_disk(
             storage_options = fs.storage_options
 
         fs: fsspec.AbstractFileSystem
-        fs, _, [dataset_dict_path] = fsspec.get_fs_token_paths(dataset_dict_path, storage_options=storage_options)
+        fs, dataset_dict_path = url_to_fs(dataset_dict_path, **(storage_options or {}))
         dataset_dict_json_path = posixpath.join(dataset_dict_path, config.DATASETDICT_JSON_FILENAME)
         dataset_state_json_path = posixpath.join(dataset_dict_path, config.DATASET_STATE_JSON_FILENAME)
diff --git a/src/datasets/download/streaming_download_manager.py b/src/datasets/download/streaming_download_manager.py
index 0b347216a9c..819150756fe 100644
--- a/src/datasets/download/streaming_download_manager.py
+++ b/src/datasets/download/streaming_download_manager.py
@@ -16,6 +17,7 @@
 
 import fsspec
 from aiohttp.client_exceptions import ClientError
+from fsspec.core import url_to_fs
 from huggingface_hub.utils import EntryNotFoundError
 from packaging import version
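All of the `x*` helpers below follow the same convention for fsspec chained URLs: the first `::`-separated hop is the path inside the outermost filesystem, and `url_to_fs` resolves the whole chain. A self-contained sketch (the archive path is made up for the example):

```python
# Self-contained sketch of the chained-URL convention used by the x* helpers.
import zipfile
from fsspec.core import url_to_fs

# Create a small archive so the example actually runs.
with zipfile.ZipFile("/tmp/example_archive.zip", "w") as zf:
    zf.writestr("data/train.txt", "hello")

urlpath = "zip://data/train.txt::/tmp/example_archive.zip"
main_hop, *rest_hops = urlpath.split("::")  # first hop = path inside the archive
fs, *_ = url_to_fs(urlpath)                 # fs is a ZipFileSystem over the archive
print(fs.exists(main_hop))                  # True, mirroring xexists()
```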
@@ -159,7 +160,7 @@ def xexists(urlpath: str, download_config: Optional[DownloadConfig] = None):
     else:
         urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)
         main_hop, *rest_hops = urlpath.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)
+        fs, *_ = url_to_fs(urlpath, **storage_options)
         return fs.exists(main_hop)
 
@@ -259,7 +260,7 @@ def xisfile(path, download_config: Optional[DownloadConfig] = None) -> bool:
     else:
         path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)
         main_hop, *rest_hops = path.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
+        fs, *_ = url_to_fs(path, **storage_options)
         return fs.isfile(main_hop)
 
@@ -279,7 +280,7 @@ def xgetsize(path, download_config: Optional[DownloadConfig] = None) -> int:
     else:
         path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)
         main_hop, *rest_hops = path.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
+        fs, *_ = url_to_fs(path, **storage_options)
         try:
             size = fs.size(main_hop)
         except EntryNotFoundError:
@@ -307,7 +308,7 @@ def xisdir(path, download_config: Optional[DownloadConfig] = None) -> bool:
     else:
         path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)
         main_hop, *rest_hops = path.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
+        fs, *_ = url_to_fs(path, **storage_options)
         inner_path = main_hop.split("://")[-1]
         if not inner_path.strip("/"):
             return True
@@ -546,7 +547,7 @@ def xlistdir(path: str, download_config: Optional[DownloadConfig] = None) -> List[str]:
         # globbing inside a zip in a private repo requires authentication
         path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)
         main_hop, *rest_hops = path.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
+        fs, *_ = url_to_fs(path, **storage_options)
         inner_path = main_hop.split("://")[-1]
         if inner_path.strip("/") and not fs.isdir(inner_path):
             raise FileNotFoundError(f"Directory doesn't exist: {path}")
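A note on the `inner_path` idiom recurring above: the protocol prefix of the first hop is stripped before the path is handed to the filesystem object, and an empty inner path denotes the filesystem root, which is why `xisdir` short-circuits to `True`. Hypothetical values for illustration:

```python
# The inner-path idiom from xisdir/xlistdir above, with hypothetical values.
main_hop = "zip://data/subdir"
inner_path = main_hop.split("://")[-1]
print(inner_path)                 # data/subdir
root_hop = "zip://"
print(root_hop.split("://")[-1])  # "" -> empty inner path means the archive root
```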
@@ -573,11 +574,7 @@ def xglob(urlpath, *, recursive=False, download_config: Optional[DownloadConfig] = None):
         # globbing inside a zip in a private repo requires authentication
         urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)
         main_hop, *rest_hops = urlpath.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)
-        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching
-        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
-        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
-        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
+        fs, *_ = url_to_fs(urlpath, **storage_options)
         inner_path = main_hop.split("://")[1]
         globbed_paths = fs.glob(inner_path)
         protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[-1]
@@ -603,7 +600,7 @@ def xwalk(urlpath, download_config: Optional[DownloadConfig] = None, **kwargs):
         # walking inside a zip in a private repo requires authentication
         urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)
         main_hop, *rest_hops = urlpath.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)
+        fs, *_ = url_to_fs(urlpath, **storage_options)
         inner_path = main_hop.split("://")[-1]
         if inner_path.strip("/") and not fs.isdir(inner_path):
             return []
@@ -659,11 +656,7 @@ def glob(self, pattern, download_config: Optional[DownloadConfig] = None):
                 posix_path = "::".join([main_hop, urlpath, *rest_hops[1:]])
         else:
             storage_options = None
-        fs, *_ = fsspec.get_fs_token_paths(xjoin(posix_path, pattern), storage_options=storage_options)
-        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching
-        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
-        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
-        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
+        fs, *_ = url_to_fs(xjoin(posix_path, pattern), **(storage_options or {}))
         globbed_paths = fs.glob(xjoin(main_hop, pattern))
         for globbed_path in globbed_paths:
             yield type(self)("::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops))
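The long comment deleted from `xglob` and `glob` above explained why `fs.glob` was needed on top of `get_fs_token_paths`; with `url_to_fs` there is no overlap left to explain, since it never pattern-matches and `fs.glob` does all the work. A small sketch of `fs.glob` on the in-memory filesystem:

```python
# fs.glob handles character classes and directories, which get_fs_token_paths'
# own expansion did not; with url_to_fs, fs.glob is simply the only matcher.
import fsspec

fs = fsspec.filesystem("memory")
fs.makedirs("/data/sub", exist_ok=True)
for name in ["/data/part0.txt", "/data/part1.txt", "/data/sub/part2.txt"]:
    fs.pipe(name, b"x")

print(fs.glob("/data/part[0-9].txt"))  # ['/data/part0.txt', '/data/part1.txt']
print(fs.glob("/data/*"))              # also matches the 'sub' directory
```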
diff --git a/src/datasets/filesystems/__init__.py b/src/datasets/filesystems/__init__.py
index 15aefa5f42a..c2753e3d380 100644
--- a/src/datasets/filesystems/__init__.py
+++ b/src/datasets/filesystems/__init__.py
@@ -1,6 +1,5 @@
 import importlib
 import shutil
-import threading
 import warnings
 from typing import List
 
@@ -68,19 +67,3 @@ def rename(fs: fsspec.AbstractFileSystem, src: str, dst: str):
         shutil.move(fs._strip_protocol(src), fs._strip_protocol(dst))
     else:
         fs.mv(src, dst, recursive=True)
-
-
-def _reset_fsspec_lock() -> None:
-    """
-    Clear reference to the loop and thread.
-    This is necessary otherwise HTTPFileSystem hangs in the ML training loop.
-    Only required for fsspec >= 0.9.0
-    See https://github.com/fsspec/gcsfs/issues/379
-    """
-    if hasattr(fsspec.asyn, "reset_lock"):
-        # for future fsspec>2022.05.0
-        fsspec.asyn.reset_lock()
-    else:
-        fsspec.asyn.iothread[0] = None
-        fsspec.asyn.loop[0] = None
-        fsspec.asyn.lock = threading.Lock()
diff --git a/src/datasets/info.py b/src/datasets/info.py
index 74e9a962a0c..557f5b77d3f 100644
--- a/src/datasets/info.py
+++ b/src/datasets/info.py
@@ -39,6 +39,7 @@
 from typing import ClassVar, Dict, List, Optional, Union
 
 import fsspec
+from fsspec.core import url_to_fs
 from huggingface_hub import DatasetCard, DatasetCardData
 
 from . import config
@@ -251,7 +252,7 @@ def write_to_directory(
             storage_options = fs.storage_options
 
         fs: fsspec.AbstractFileSystem
-        fs, _, _ = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options)
+        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
         with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f:
             self._dump_info(f, pretty_print=pretty_print)
         if self.license:
@@ -347,7 +348,7 @@ def from_directory(
             storage_options = fs.storage_options
 
         fs: fsspec.AbstractFileSystem
-        fs, _, _ = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options)
+        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
         logger.info(f"Loading Dataset info from {dataset_info_dir}")
         if not dataset_info_dir:
             raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
index 31329de9c31..9adbb8998e2 100644
--- a/src/datasets/iterable_dataset.py
+++ b/src/datasets/iterable_dataset.py
@@ -9,6 +9,7 @@
 from itertools import cycle, islice
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union
 
+import fsspec.asyn
 import numpy as np
 import pyarrow as pa
 
@@ -16,7 +17,6 @@
 from .arrow_dataset import Dataset, DatasetInfoMixin
 from .features import Features
 from .features.features import FeatureType, _align_features, _check_if_features_can_be_aligned, cast_to_python_objects
-from .filesystems import _reset_fsspec_lock
 from .formatting import PythonFormatter, TensorFormatter, get_format_type_from_alias, get_formatter
 from .info import DatasetInfo
 from .splits import NamedSplit
@@ -1257,8 +1257,9 @@ def n_shards(self) -> int:
 
     def _iter_pytorch(self):
         ex_iterable = self._prepare_ex_iterable_for_iteration()
-        # fix for fsspec when using multiprocess
-        _reset_fsspec_lock()
+        # Fix for fsspec when using multiprocess to avoid hanging in the ML training loop
+        # (only required for fsspec >= 0.9.0). See https://github.com/fsspec/gcsfs/issues/379
+        fsspec.asyn.reset_lock()
         # check if there aren't too many workers
         import torch.utils.data
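With `datasets` now requiring an fsspec version that always provides `fsspec.asyn.reset_lock`, the `hasattr` fallback removed above is dead code and the call can be inlined at its single use site. A sketch of the same reset applied in a user-defined PyTorch `DataLoader` worker (the `worker_init_fn` here is illustrative, not part of this diff):

```python
# Illustrative only: the same reset in a user-defined DataLoader worker_init_fn.
# Clearing fsspec's shared event-loop/lock state in each forked worker prevents
# async filesystems (e.g. HTTPFileSystem) from hanging on an inherited lock.
import fsspec.asyn

def worker_init_fn(worker_id: int) -> None:
    fsspec.asyn.reset_lock()  # see https://github.com/fsspec/gcsfs/issues/379
```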
diff --git a/src/datasets/load.py b/src/datasets/load.py
index 0245d55c323..ca8969c240c 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -35,6 +35,7 @@
 import fsspec
 import requests
 import yaml
+from fsspec.core import url_to_fs
 from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
 
 from . import config
@@ -2673,7 +2674,7 @@ def load_from_disk(
             storage_options = fs.storage_options
 
     fs: fsspec.AbstractFileSystem
-    fs, _, _ = fsspec.get_fs_token_paths(dataset_path, storage_options=storage_options)
+    fs, *_ = url_to_fs(dataset_path, **(storage_options or {}))
     if not fs.exists(dataset_path):
         raise FileNotFoundError(f"Directory {dataset_path} not found")
     if fs.isfile(posixpath.join(dataset_path, config.DATASET_INFO_FILENAME)) and fs.isfile(
diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
index b3532b96976..7c823831676 100644
--- a/src/datasets/utils/file_utils.py
+++ b/src/datasets/utils/file_utils.py
@@ -26,7 +26,7 @@
 import fsspec
 import huggingface_hub
 import requests
-from fsspec.core import strip_protocol
+from fsspec.core import strip_protocol, url_to_fs
 from fsspec.utils import can_be_local
 from huggingface_hub.utils import insecure_hashlib
 from packaging import version
@@ -315,10 +315,8 @@ def _request_with_retry(
 
 def fsspec_head(url, storage_options=None):
     _raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
-    fs, _, paths = fsspec.get_fs_token_paths(url, storage_options=storage_options)
-    if len(paths) > 1:
-        raise ValueError(f"HEAD can be called with at most one path but was called with {paths}")
-    return fs.info(paths[0])
+    fs, path = url_to_fs(url, **(storage_options or {}))
+    return fs.info(path)
 
 
 def stack_multiprocessing_download_progress_bars():
@@ -335,9 +333,7 @@ def __init__(self, tqdm_kwargs=None, *args, **kwargs):
 
 def fsspec_get(url, temp_file, storage_options=None, desc=None):
     _raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
-    fs, _, paths = fsspec.get_fs_token_paths(url, storage_options=storage_options)
-    if len(paths) > 1:
-        raise ValueError(f"GET can be called with at most one path but was called with {paths}")
+    fs, path = url_to_fs(url, **(storage_options or {}))
     callback = TqdmCallback(
         tqdm_kwargs={
             "desc": desc or "Downloading",
@@ -349,7 +345,7 @@ def fsspec_get(url, temp_file, storage_options=None, desc=None):
             else None,
         }
     )
-    fs.get_file(paths[0], temp_file.name, callback=callback)
+    fs.get_file(path, temp_file.name, callback=callback)
 
 
 def ftp_head(url, timeout=10.0):
diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index 0b4ef65ce2a..dc59bc5c322 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -635,6 +635,8 @@ def ls(self, path, detail=True, refresh=True, **kwargs):
         {"test": "my test file.txt"},
         {"test": "my-test_file.txt"},
         {"test": "test00001.txt"},
+        # split keyword in a directory name
+        {"test": "test/train.txt"},
     ],
 )
 def test_get_data_files_patterns(base_path, data_file_per_split):
diff --git a/tests/test_filesystem.py b/tests/test_filesystem.py
index 586312973ae..13d94b86881 100644
--- a/tests/test_filesystem.py
+++ b/tests/test_filesystem.py
@@ -4,6 +4,7 @@
 import fsspec
 import pytest
 from fsspec import register_implementation
+from fsspec.core import url_to_fs
 from fsspec.registry import _registry as _fsspec_registry
 
 from datasets.filesystems import COMPRESSION_FILESYSTEMS, extract_path_from_uri, is_remote_filesystem
@@ -68,7 +69,7 @@ def test_fs_isfile(protocol, zip_jsonl_path, jsonl_gz_path):
     compressed_file_path = compressed_file_paths[protocol]
     member_file_path = "dataset.jsonl"
     path = f"{protocol}://{member_file_path}::{compressed_file_path}"
-    fs, *_ = fsspec.get_fs_token_paths(path)
+    fs, *_ = url_to_fs(path)
     assert fs.isfile(member_file_path)
     assert not fs.isfile("non_existing_" + member_file_path)
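Why the `len(paths) > 1` guards in `fsspec_head`/`fsspec_get` could be dropped: `get_fs_token_paths` expands glob characters into a list of matches, whereas `url_to_fs` never globs and always returns exactly one path. A quick local-filesystem sketch:

```python
# get_fs_token_paths may expand a glob into many paths; url_to_fs never does,
# so the "at most one path" checks become unnecessary. Local paths for illustration.
import fsspec
from fsspec.core import url_to_fs

fs, _token, paths = fsspec.get_fs_token_paths("/tmp/*.txt")
print(paths)  # every matching file (possibly several, possibly none)

fs, path = url_to_fs("/tmp/*.txt")
print(path)   # the literal '/tmp/*.txt' -- always exactly one path
```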