diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index 38bb213c7..fd66335c4 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -161,7 +161,7 @@ def dask( array from ``TTrees``. """ - files = uproot._util.regularize_files(files, steps_allowed=True) + files = uproot._util.regularize_files(files, steps_allowed=True, **options) is_3arg = [len(x) == 3 for x in files] if any(is_3arg): diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 44f38e1ca..952232f35 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -287,11 +287,6 @@ def regularize_path(path): return path -# These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs) -_remote_schemes = ["root", "s3", "http", "https"] -_schemes = list({*_remote_schemes, *fsspec.available_protocols()}) - - def file_object_path_split(urlpath: str) -> tuple[str, str | None]: """ Split a path with a colon into a file path and an object-in-file path. @@ -815,7 +810,9 @@ def regularize_steps(steps): return out.tolist() -def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allowed): +def _regularize_files_inner( + files, parse_colon, counter, HasBranches, steps_allowed, **options +): files2 = regularize_path(files) maybe_steps = None @@ -830,12 +827,24 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo else: file_path, object_path = files, None + # This parses the windows drive letter as a scheme! parsed_url = urlparse(file_path) - - if parsed_url.scheme.lower() in _remote_schemes: - yield file_path, object_path, maybe_steps - + scheme = parsed_url.scheme + if "://" in file_path and scheme not in ("file", "local"): + # user specified a protocol, so we use fsspec to expand the glob and return the full paths + file_names_full = [ + file.full_name + for file in fsspec.open_files( + files, + **uproot.source.fsspec.FSSpecSource.extract_fsspec_options(options), + ) + ] + # https://github.com/fsspec/filesystem_spec/issues/1459 + # Not all protocols return the full_name attribute correctly (if they have url parameters) + for file_name_full in file_names_full: + yield file_name_full, object_path, maybe_steps else: + # no protocol, default to local file system expanded = os.path.expanduser(file_path) if _regularize_files_isglob.search(expanded) is None: yield file_path, object_path, maybe_steps @@ -885,6 +894,7 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo counter, HasBranches, steps_allowed, + **options, ): yield file_path, object_path, maybe_steps @@ -892,7 +902,7 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo for file in files: counter[0] += 1 for file_path, object_path, maybe_steps in _regularize_files_inner( - file, parse_colon, counter, HasBranches, steps_allowed + file, parse_colon, counter, HasBranches, steps_allowed, **options ): yield file_path, object_path, maybe_steps @@ -905,7 +915,7 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo ) -def regularize_files(files, steps_allowed): +def regularize_files(files, steps_allowed, **options): """ Common code for regularizing the possible file inputs accepted by uproot so they can be used by uproot internal functions. """ @@ -915,7 +925,7 @@ def regularize_files(files, steps_allowed): seen = set() counter = [0] for file_path, object_path, maybe_steps in _regularize_files_inner( - files, True, counter, HasBranches, steps_allowed + files, True, counter, HasBranches, steps_allowed, **options ): if isinstance(file_path, str): key = (counter[0], file_path, object_path) diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index a82603176..5c6325e48 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -174,7 +174,7 @@ def iterate( array from ``TTrees``. * :doc:`uproot._dask.dask`: returns an unevaluated Dask array from ``TTrees``. """ - files = uproot._util.regularize_files(files, steps_allowed=False) + files = uproot._util.regularize_files(files, steps_allowed=False, **options) decompression_executor, interpretation_executor = _regularize_executors( decompression_executor, interpretation_executor, None ) @@ -340,7 +340,7 @@ def concatenate( single concatenated array from ``TTrees``. * :doc:`uproot._dask.dask`: returns an unevaluated Dask array from ``TTrees``. """ - files = uproot._util.regularize_files(files, steps_allowed=False) + files = uproot._util.regularize_files(files, steps_allowed=False, **options) decompression_executor, interpretation_executor = _regularize_executors( decompression_executor, interpretation_executor, None ) diff --git a/src/uproot/models/TTree.py b/src/uproot/models/TTree.py index 90342cf6c..360561cdd 100644 --- a/src/uproot/models/TTree.py +++ b/src/uproot/models/TTree.py @@ -906,7 +906,6 @@ def read_members(self, chunk, cursor, context, file): uproot.classes["TTree"] = Model_TTree uproot.classes["ROOT::TIOFeatures"] = Model_ROOT_3a3a_TIOFeatures - fEntriesStruct = struct.Struct(">q") diff --git a/src/uproot/source/fsspec.py b/src/uproot/source/fsspec.py index 0773d1c47..ac36eee55 100644 --- a/src/uproot/source/fsspec.py +++ b/src/uproot/source/fsspec.py @@ -27,30 +27,29 @@ class FSSpecSource(uproot.source.chunk.Source): """ def __init__(self, file_path: str, **options): - options = dict(uproot.reading.open.defaults, **options) - storage_options = { - k: v - for k, v in options.items() - if k not in uproot.reading.open.defaults.keys() - } - - self._fs, self._file_path = fsspec.core.url_to_fs(file_path, **storage_options) + super().__init__() + self._fs, self._file_path = fsspec.core.url_to_fs( + file_path, **self.extract_fsspec_options(options) + ) # What should we do when there is a chain of filesystems? self._async_impl = self._fs.async_impl - self._executor = None self._file = None self._fh = None - self._num_requests = 0 - self._num_requested_chunks = 0 - self._num_requested_bytes = 0 - self._open() self.__enter__() + @classmethod + def extract_fsspec_options(cls, options: dict) -> dict: + uproot_default_options = dict(uproot.reading.open.defaults) + options = dict(uproot_default_options, **options) + return { + k: v for k, v in options.items() if k not in uproot_default_options.keys() + } + def _open(self): self._executor = FSSpecLoopExecutor() self._file = self._fs.open(self._file_path) diff --git a/tests/test_0692_fsspec_reading.py b/tests/test_0692_fsspec_reading.py index 18dbaab1f..2bad9136f 100644 --- a/tests/test_0692_fsspec_reading.py +++ b/tests/test_0692_fsspec_reading.py @@ -388,3 +388,102 @@ def test_issue_1035(handler): branch = tree["MuonSpectrometerTrackParticlesAuxDyn.truthParticleLink"] data = branch.array() assert len(data) == 40 + + +@pytest.mark.network +@pytest.mark.xrootd +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + None, + ], +) +def test_fsspec_globbing_xrootd(handler): + pytest.importorskip("XRootD") + pytest.importorskip("fsspec_xrootd") + iterator = uproot.iterate( + { + "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_*.root": "Events" + }, + ["PV_x"], + handler=handler, + ) + + arrays = [array for array in iterator] + # if more files are added that match the glob, this test needs to be updated + assert len(arrays) == 2 + + +@pytest.mark.network +@pytest.mark.xrootd +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + None, + ], +) +def test_fsspec_globbing_xrootd_no_files(handler): + pytest.importorskip("XRootD") + pytest.importorskip("fsspec_xrootd") + iterator = uproot.iterate( + { + "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/*/ThisFileShouldNotExist.root": "Events" + }, + ["PV_x"], + handler=handler, + ) + with pytest.raises(FileNotFoundError): + arrays = [array for array in iterator] + + +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + None, + ], +) +def test_fsspec_globbing_s3(handler): + pytest.importorskip("s3fs") + if sys.version_info < (3, 11): + pytest.skip( + "https://github.com/scikit-hep/uproot5/pull/1012", + ) + + iterator = uproot.iterate( + {"s3://pivarski-princeton/pythia_ppZee_run17emb.*.root": "PicoDst"}, + ["Event/Event.mEventId"], + anon=True, + handler=handler, + ) + + # if more files are added that match the glob, this test needs to be updated + arrays = [array for array in iterator] + assert len(arrays) == 1 + for array in arrays: + assert len(array) == 8004 + + +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + None, + ], +) +def test_fsspec_globbing_http(handler): + pytest.importorskip("aiohttp") + + # Globbing does not work with http filesystems and will return an empty list of files + # We leave this test here to be notified when this feature is added + iterator = uproot.iterate( + { + "https://github.com/scikit-hep/scikit-hep-testdata/raw/main/src/skhep_testdata/data/uproot-issue*.root": "Events" + }, + ["MET_pt"], + handler=handler, + ) + with pytest.raises(FileNotFoundError): + arrays = [array for array in iterator]