From 4e0ba04fe6bfc4ab6e3449428d00a34739286f1a Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Thu, 16 Nov 2023 12:51:04 -0600 Subject: [PATCH] feat: simplify object path split (#1028) * simplify object path split * add example from https://github.com/scikit-hep/uproot5/issues/975 * fix tests * add more test cases * test case update * remove scheme unused regex --- src/uproot/_util.py | 68 ++++++------------------- tests/test_0001_source_class.py | 8 +-- tests/test_0692_fsspec_reading.py | 2 +- tests/test_0976_path_object_split.py | 74 ++++++++++++++++++++++------ 4 files changed, 79 insertions(+), 73 deletions(-) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 1a86c4207..2017c30ad 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -12,7 +12,6 @@ import itertools import numbers import os -import pathlib import platform import re import warnings @@ -290,14 +289,10 @@ def regularize_path(path): _windows_absolute_path_pattern = re.compile(r"^[A-Za-z]:[\\/]") _windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]") +# These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs) _remote_schemes = ["root", "s3", "http", "https"] _schemes = list({*_remote_schemes, *fsspec.available_protocols()}) -_uri_scheme = re.compile("^(" + "|".join([re.escape(x) for x in _schemes]) + ")://") -_uri_scheme_chain = re.compile( - "^(" + "|".join([re.escape(x) for x in _schemes]) + ")::" -) - def file_object_path_split(urlpath: str) -> tuple[str, str | None]: """ @@ -313,54 +308,19 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]: """ urlpath: str = regularize_path(urlpath).strip() - path = urlpath - - def _split_path(path: str) -> list[str]: - parts = path.split(":") - if pathlib.PureWindowsPath(path).drive: - # Windows absolute path - assert len(parts) >= 2, f"could not split object from windows path {path}" - parts = [parts[0] + ":" + parts[1]] + parts[2:] - return parts - - if "://" not in path: - path = "file://" + path - - # replace the match of _uri_scheme_chain with "" until there is no match - while _uri_scheme_chain.match(path): - path = _uri_scheme_chain.sub("", path) - - if _uri_scheme.match(path): - # if not a local path, attempt to match a URI scheme - if path.startswith("file://"): - parsed_url_path = path[7:] - else: - parsed_url_path = urlparse(path).path - - if parsed_url_path.startswith("//"): - parsed_url_path = parsed_url_path[2:] - - parts = _split_path(parsed_url_path) - else: - # invalid scheme - scheme = path.split("://")[0] - raise ValueError( - f"Invalid URI scheme: '{scheme}://' in {path}. Available schemes: {', '.join(_schemes)}." - ) - - if len(parts) == 1: - obj = None - elif len(parts) == 2: - obj = parts[1] - # remove the object from the path (including the colon) - urlpath = urlpath[: -len(obj) - 1] - # clean badly placed slashes - obj = obj.strip().lstrip("/") - while "//" in obj: - obj = obj.replace("//", "/") - else: - raise ValueError(f"could not split object from path {path}") - + obj = None + + separator = "::" + parts = urlpath.split(separator) + object_regex = re.compile(r"(.+\.root):(.*$)") + for i, part in enumerate(reversed(parts)): + match = object_regex.match(part) + if match: + obj = re.sub(r"/+", "/", match.group(2).strip().lstrip("/")).rstrip("/") + parts[-i - 1] = match.group(1) + break + + urlpath = separator.join(parts) return urlpath, obj diff --git a/tests/test_0001_source_class.py b/tests/test_0001_source_class.py index 2ca6d98a7..be1d1186c 100644 --- a/tests/test_0001_source_class.py +++ b/tests/test_0001_source_class.py @@ -148,13 +148,13 @@ def test_colons_and_ports(): "https://example.com:443", None, ) - assert uproot._util.file_object_path_split("https://example.com:443/something") == ( - "https://example.com:443/something", + assert uproot._util.file_object_path_split("https://example.com:443/file.root") == ( + "https://example.com:443/file.root", None, ) assert uproot._util.file_object_path_split( - "https://example.com:443/something:else" - ) == ("https://example.com:443/something", "else") + "https://example.com:443/file.root:object" + ) == ("https://example.com:443/file.root", "object") @pytest.mark.parametrize("use_threads", [True, False], indirect=True) diff --git a/tests/test_0692_fsspec_reading.py b/tests/test_0692_fsspec_reading.py index fb99dd65c..052d916c6 100644 --- a/tests/test_0692_fsspec_reading.py +++ b/tests/test_0692_fsspec_reading.py @@ -208,7 +208,7 @@ def test_fsspec_zip(tmp_path): # open with fsspec with uproot.open( - f"zip://{filename}::file://{filename_zip}:Events/MET_pt" + f"zip://{filename}:Events/MET_pt::file://{filename_zip}" ) as branch: data = branch.array(library="np") assert len(data) == 40 diff --git a/tests/test_0976_path_object_split.py b/tests/test_0976_path_object_split.py index 71638a276..84043f8c7 100644 --- a/tests/test_0976_path_object_split.py +++ b/tests/test_0976_path_object_split.py @@ -64,24 +64,38 @@ ), ), ( - "ssh://user@host:22/path/to/file:object", + "ssh://user@host:22/path/to/file.root:/object//path", ( - "ssh://user@host:22/path/to/file", - "object", + "ssh://user@host:22/path/to/file.root", + "object/path", ), ), ( - "ssh://user@host:50230/path/to/file", + "ssh://user@host:22/path/to/file.root:/object//path:with:colon:in:path/something/", ( - "ssh://user@host:50230/path/to/file", + "ssh://user@host:22/path/to/file.root", + "object/path:with:colon:in:path/something", + ), + ), + ( + "ssh://user@host:50230/path/to/file.root", + ( + "ssh://user@host:50230/path/to/file.root", None, ), ), ( - "s3://bucket/path/to/file:object", + "s3://bucket/path/to/file.root:/dir////object", + ( + "s3://bucket/path/to/file.root", + "dir/object", + ), + ), + ( + "s3://bucket/path/to/file.root:", ( - "s3://bucket/path/to/file", - "object", + "s3://bucket/path/to/file.root", + "", ), ), ( @@ -98,27 +112,56 @@ None, ), ), + # https://github.com/scikit-hep/uproot5/issues/975 ( - "zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt", + "DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root:RNT:CollectionTree", + ( + "DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root", + "RNT:CollectionTree", + ), + ), + ( + "zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip", ( "zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip", "Events/MET_pt", ), ), ( - "simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt", + "simplecache::zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip", ( "simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip", "Events/MET_pt", ), ), ( - r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip:Events/MET_pt", + r"zip://uproot-issue121.root:Events/MET_pt::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip", ( r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip", "Events/MET_pt", ), ), + ( + "zip://uproot-issue121.root:Events/MET_pt::file:///some/weird/path:with:colons/file.root", + ( + "zip://uproot-issue121.root::file:///some/weird/path:with:colons/file.root", + "Events/MET_pt", + ), + ), + ( + "/some/weird/path:with:colons/file.root:Events/MET_pt", + ( + "/some/weird/path:with:colons/file.root", + "Events/MET_pt", + ), + ), + ( + "/some/weird/path:with:colons/file.root", + ( + "/some/weird/path:with:colons/file.root", + None, + ), + ), ], ) def test_url_split(input_value, expected_output): @@ -131,9 +174,12 @@ def test_url_split(input_value, expected_output): @pytest.mark.parametrize( "input_value", [ - "local/file.root://Events", + "local/file.root.zip://Events", + "local/file.roo://Events", + "local/file://Events", ], ) def test_url_split_invalid(input_value): - with pytest.raises(ValueError): - uproot._util.file_object_path_split(input_value) + path, obj = uproot._util.file_object_path_split(input_value) + assert obj is None + assert path == input_value