From 9b5544dea6865b76f838874e380da9e1654dcf27 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Sat, 23 Dec 2023 01:32:15 +0200 Subject: [PATCH] adlfs: don't append / to dirs in find() and fix glob() This makes it consistent with localfs/gcsfs/s3fs and also makes glob work after changes in https://github.com/fsspec/filesystem_spec/pull/1382 --- adlfs/spec.py | 29 +++++----------- adlfs/tests/test_spec.py | 72 ++++++++++++++++++++-------------------- pyproject.toml | 2 +- 3 files changed, 46 insertions(+), 57 deletions(-) diff --git a/adlfs/spec.py b/adlfs/spec.py index 03518c4f..9bc0053a 100644 --- a/adlfs/spec.py +++ b/adlfs/spec.py @@ -880,12 +880,7 @@ async def _details( return output - def find(self, path, withdirs=False, prefix="", **kwargs): - return sync( - self.loop, self._find, path=path, withdirs=withdirs, prefix=prefix, **kwargs - ) - - async def _find(self, path, withdirs=False, prefix="", with_parent=False, **kwargs): + async def _find(self, path, withdirs=False, prefix="", **kwargs): """List all files below path. Like posix ``find`` command without conditions. @@ -931,11 +926,8 @@ async def _find(self, path, withdirs=False, prefix="", with_parent=False, **kwar for info in infos: name = _name = info["name"] while True: - parent_dir = self._parent(_name).rstrip("/") + "/" - if ( - parent_dir not in dir_set - and parent_dir != full_path.strip("/") + "/" - ): + parent_dir = self._parent(_name).rstrip("/") + if parent_dir not in dir_set and parent_dir != full_path.strip("/"): dir_set.add(parent_dir) dirs[parent_dir] = { "name": parent_dir, @@ -960,8 +952,6 @@ async def _find(self, path, withdirs=False, prefix="", with_parent=False, **kwar files[file["name"]] = file if withdirs: - if not with_parent: - dirs.pop(target_path, None) files.update(dirs) files = {k: v for k, v in files.items() if k.startswith(target_path)} names = sorted([n for n in files.keys()]) @@ -1160,7 +1150,9 @@ async def _rm( """ if expand_path: path = await self._expand_path( - path, recursive=recursive, maxdepth=maxdepth, with_parent=True + path, + recursive=recursive, + maxdepth=maxdepth, ) elif isinstance(path, str): path = [path] @@ -1491,10 +1483,6 @@ async def _expand_path( self, path, recursive=False, maxdepth=None, skip_noexist=True, **kwargs ): """Turn one or more globs or directories into a list of all matching files""" - with_parent = kwargs.get( - "with_parent", False - ) # Sets whether to return the parent dir - if isinstance(path, list): path = [f"{p.strip('/')}" for p in path if not p.endswith("*")] else: @@ -1502,7 +1490,9 @@ async def _expand_path( path = f"{path.strip('/')}" if isinstance(path, str): out = await self._expand_path( - [path], recursive, maxdepth, with_parent=with_parent + [path], + recursive, + maxdepth, ) else: out = set() @@ -1526,7 +1516,6 @@ async def _expand_path( await self._find( glob_p, withdirs=True, - with_parent=with_parent, version_id=version_id, ) ) diff --git a/adlfs/tests/test_spec.py b/adlfs/tests/test_spec.py index 8f7ac69b..2a451043 100644 --- a/adlfs/tests/test_spec.py +++ b/adlfs/tests/test_spec.py @@ -450,35 +450,35 @@ def test_find(storage): # all files and directories assert fs.find("data/root", withdirs=True) == [ - "data/root/a/", + "data/root/a", "data/root/a/file.txt", - "data/root/a1/", + "data/root/a1", "data/root/a1/file1.txt", - "data/root/b/", + "data/root/b", "data/root/b/file.txt", - "data/root/c/", + "data/root/c", "data/root/c/file1.txt", "data/root/c/file2.txt", - "data/root/d/", + "data/root/d", "data/root/d/file_with_metadata.txt", - "data/root/e+f/", + "data/root/e+f", "data/root/e+f/file1.txt", "data/root/e+f/file2.txt", "data/root/rfile.txt", ] assert fs.find("data/root/", withdirs=True) == [ - "data/root/a/", + "data/root/a", "data/root/a/file.txt", - "data/root/a1/", + "data/root/a1", "data/root/a1/file1.txt", - "data/root/b/", + "data/root/b", "data/root/b/file.txt", - "data/root/c/", + "data/root/c", "data/root/c/file1.txt", "data/root/c/file2.txt", - "data/root/d/", + "data/root/d", "data/root/d/file_with_metadata.txt", - "data/root/e+f/", + "data/root/e+f", "data/root/e+f/file1.txt", "data/root/e+f/file2.txt", "data/root/rfile.txt", @@ -494,9 +494,9 @@ def test_find(storage): ] assert fs.find("data/root", prefix="a", withdirs=True) == [ - "data/root/a/", + "data/root/a", "data/root/a/file.txt", - "data/root/a1/", + "data/root/a1", "data/root/a1/file1.txt", ] @@ -504,7 +504,7 @@ def test_find(storage): assert_blobs_equals( list(find_results.values()), [ - {"name": "data/root/a1/", "size": 0, "type": "directory"}, + {"name": "data/root/a1", "size": 0, "type": "directory"}, { "name": "data/root/a1/file1.txt", "size": 10, @@ -551,12 +551,12 @@ def test_glob(storage): # top-level contents of a directory assert fs.glob("data/root/*") == [ - "data/root/a/", - "data/root/a1/", - "data/root/b/", - "data/root/c/", - "data/root/d/", - "data/root/e+f/", + "data/root/a", + "data/root/a1", + "data/root/b", + "data/root/c", + "data/root/d", + "data/root/e+f", "data/root/rfile.txt", ] @@ -606,36 +606,36 @@ def test_glob(storage): # all files assert fs.glob("data/root/**") == [ - "data/root/a/", + "data/root/a", "data/root/a/file.txt", - "data/root/a1/", + "data/root/a1", "data/root/a1/file1.txt", - "data/root/b/", + "data/root/b", "data/root/b/file.txt", - "data/root/c/", + "data/root/c", "data/root/c/file1.txt", "data/root/c/file2.txt", - "data/root/d/", + "data/root/d", "data/root/d/file_with_metadata.txt", - "data/root/e+f/", + "data/root/e+f", "data/root/e+f/file1.txt", "data/root/e+f/file2.txt", "data/root/rfile.txt", ] - assert fs.glob("data/roo**") == [ - "data/root/", - "data/root/a/", + assert fs.glob("data/roo*/**") == [ + "data/root", + "data/root/a", "data/root/a/file.txt", - "data/root/a1/", + "data/root/a1", "data/root/a1/file1.txt", - "data/root/b/", + "data/root/b", "data/root/b/file.txt", - "data/root/c/", + "data/root/c", "data/root/c/file1.txt", "data/root/c/file2.txt", - "data/root/d/", + "data/root/d", "data/root/d/file_with_metadata.txt", - "data/root/e+f/", + "data/root/e+f", "data/root/e+f/file1.txt", "data/root/e+f/file2.txt", "data/root/rfile.txt", @@ -1191,7 +1191,7 @@ def test_dask_parquet(storage): write_metadata_file=True, ) assert fs.glob("test/test_group3.parquet/*") == [ - "test/test_group3.parquet/A=1/", + "test/test_group3.parquet/A=1", "test/test_group3.parquet/_common_metadata", "test/test_group3.parquet/_metadata", ] diff --git a/pyproject.toml b/pyproject.toml index b83586e9..21bcbbe2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ dependencies = [ "azure-datalake-store>=0.0.46,<0.1", "azure-identity", "azure-storage-blob>=12.12.0", - "fsspec>=2023.9.0", + "fsspec>=2023.12.0", "aiohttp>=3.7.0", ]