From b6536bef07abe21d4eecdc690048c30b523c20f7 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 5 Dec 2023 10:09:44 -0500 Subject: [PATCH] Remove trailing "/" from path names in archives (#1445) * Remove trailing "/" from path names in ZIP * Fix path names in all archives * update tests --- docs/source/changelog.rst | 7 +++++++ fsspec/archive.py | 4 ++-- fsspec/implementations/libarchive.py | 5 ++--- fsspec/implementations/tar.py | 3 ++- fsspec/implementations/tests/test_archive.py | 10 +++++----- fsspec/implementations/tests/test_tar.py | 8 ++++---- fsspec/implementations/tests/test_zip.py | 15 +++++++++++++-- fsspec/implementations/zip.py | 2 +- 8 files changed, 36 insertions(+), 18 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index a2e0d5c76..45aa17150 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,13 @@ Changelog ========= +2023.12.1 +--------- + +Fixes + +- Remove trailing "/" from directory names in zipFS (#1445) + 2023.12.0 --------- diff --git a/fsspec/archive.py b/fsspec/archive.py index 9bdd8efce..cba8ad567 100644 --- a/fsspec/archive.py +++ b/fsspec/archive.py @@ -38,7 +38,7 @@ def info(self, path, **kwargs): self._get_dirs() path = self._strip_protocol(path) if path in {"", "/"} and self.dir_cache: - return {"name": "/", "type": "directory", "size": 0} + return {"name": "", "type": "directory", "size": 0} if path in self.dir_cache: return self.dir_cache[path] elif path + "/" in self.dir_cache: @@ -64,7 +64,7 @@ def ls(self, path, detail=True, **kwargs): # root directory entry ppath = p.rstrip("/").split("/", 1)[0] if ppath not in paths: - out = {"name": ppath + "/", "size": 0, "type": "directory"} + out = {"name": ppath, "size": 0, "type": "directory"} paths[ppath] = out out = sorted(paths.values(), key=lambda _: _["name"]) if detail: diff --git a/fsspec/implementations/libarchive.py b/fsspec/implementations/libarchive.py index 592e8979d..eb6f14535 100644 --- a/fsspec/implementations/libarchive.py +++ b/fsspec/implementations/libarchive.py @@ -164,8 +164,7 @@ def _get_dirs(self): continue self.dir_cache.update( { - dirname - + "/": {"name": dirname + "/", "size": 0, "type": "directory"} + dirname: {"name": dirname, "size": 0, "type": "directory"} for dirname in self._all_dirnames(set(entry.name)) } ) @@ -178,7 +177,7 @@ def _get_dirs(self): # not in all formats), so get the directories names from the files names self.dir_cache.update( { - dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"} + dirname: {"name": dirname, "size": 0, "type": "directory"} for dirname in self._all_dirnames(list_names) } ) diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py index 62bb58f84..412e5ba4d 100644 --- a/fsspec/implementations/tar.py +++ b/fsspec/implementations/tar.py @@ -106,11 +106,12 @@ def _get_dirs(self): # This enables ls to get directories as children as well as files self.dir_cache = { - dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"} + dirname: {"name": dirname, "size": 0, "type": "directory"} for dirname in self._all_dirnames(self.tar.getnames()) } for member in self.tar.getmembers(): info = member.get_info() + info["name"] = info["name"].rstrip("/") info["type"] = typemap.get(info["type"], "file") self.dir_cache[info["name"]] = info diff --git a/fsspec/implementations/tests/test_archive.py b/fsspec/implementations/tests/test_archive.py index bde42df0f..6caf790c3 100644 --- a/fsspec/implementations/tests/test_archive.py +++ b/fsspec/implementations/tests/test_archive.py @@ -276,10 +276,10 @@ def test_ls(self, scenario: ArchiveTestScenario): with scenario.provider(archive_data) as archive: fs = fsspec.filesystem(scenario.protocol, fo=archive) - assert fs.ls("", detail=False) == ["a", "b", "deeply/"] + assert fs.ls("", detail=False) == ["a", "b", "deeply"] assert fs.ls("/") == fs.ls("") - assert fs.ls("deeply", detail=False) == ["deeply/nested/"] + assert fs.ls("deeply", detail=False) == ["deeply/nested"] assert fs.ls("deeply/") == fs.ls("deeply") assert fs.ls("deeply/nested", detail=False) == ["deeply/nested/path"] @@ -293,8 +293,8 @@ def test_find(self, scenario: ArchiveTestScenario): assert fs.find("", withdirs=True) == [ "a", "b", - "deeply/", - "deeply/nested/", + "deeply", + "deeply/nested", "deeply/nested/path", ] @@ -347,7 +347,7 @@ def project(mapping, keys): # Iterate over all directories. for d in fs._all_dirnames(archive_data.keys()): lhs = project(fs.info(d), ["name", "size", "type"]) - expected = {"name": f"{d}/", "size": 0, "type": "directory"} + expected = {"name": f"{d}", "size": 0, "type": "directory"} assert lhs == expected # Iterate over all files. diff --git a/fsspec/implementations/tests/test_tar.py b/fsspec/implementations/tests/test_tar.py index 6a7aca44f..23a4c2ea5 100644 --- a/fsspec/implementations/tests/test_tar.py +++ b/fsspec/implementations/tests/test_tar.py @@ -26,7 +26,7 @@ def test_info(): lhs = fs.info(d) del lhs["chksum"] expected = { - "name": f"{d}/", + "name": f"{d}", "size": 0, "type": "directory", "devmajor": 0, @@ -234,10 +234,10 @@ def test_ls_with_folders(compression: str, tmp_path: Path): fs = TarFileSystem(fd) assert fs.find("/", withdirs=True) == [ "a.pdf", - "b/", + "b", "b/c.pdf", - "d/", - "d/e/", + "d", + "d/e", "d/e/f.pdf", "d/g.pdf", ] diff --git a/fsspec/implementations/tests/test_zip.py b/fsspec/implementations/tests/test_zip.py index cd88d9dc2..3c6c50b63 100644 --- a/fsspec/implementations/tests/test_zip.py +++ b/fsspec/implementations/tests/test_zip.py @@ -46,8 +46,8 @@ def test_not_cached(): def test_root_info(): with tempzip(archive_data) as z: fs = fsspec.filesystem("zip", fo=z) - assert fs.info("/") == {"name": "/", "type": "directory", "size": 0} - assert fs.info("") == {"name": "/", "type": "directory", "size": 0} + assert fs.info("/") == {"name": "", "type": "directory", "size": 0} + assert fs.info("") == {"name": "", "type": "directory", "size": 0} def test_write_seek(m): @@ -83,3 +83,14 @@ def test_mapper(m): # fails because this is write mode and we cannot also read mapper["a"] assert "a" in mapper # but be can list + + +def test_zip_glob_star(m): + with fsspec.open( + "zip://adir/afile::memory://out.zip", mode="wb", zip={"mode": "w"} + ) as f: + f.write(b"data") + + fs, _ = fsspec.core.url_to_fs("zip::memory://out.zip") + outfiles = fs.glob("*") + assert len(outfiles) == 1 diff --git a/fsspec/implementations/zip.py b/fsspec/implementations/zip.py index f828a6841..dae927bc0 100644 --- a/fsspec/implementations/zip.py +++ b/fsspec/implementations/zip.py @@ -83,7 +83,7 @@ def _get_dirs(self): # not read from the file. files = self.zip.infolist() self.dir_cache = { - dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"} + dirname: {"name": dirname, "size": 0, "type": "directory"} for dirname in self._all_dirnames(self.zip.namelist()) } for z in files: