From 89626e89022dc9ed62aa7b0fb8dba9427fa892b3 Mon Sep 17 00:00:00 2001 From: Ian Thomas Date: Wed, 27 Sep 2023 18:36:44 +0100 Subject: [PATCH] Implement cache size for CachingFileSystem (#1377) --- fsspec/implementations/cached.py | 24 +++++++++++++ fsspec/implementations/tests/test_cached.py | 39 +++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/fsspec/implementations/cached.py b/fsspec/implementations/cached.py index b679cce51..496bcb07a 100644 --- a/fsspec/implementations/cached.py +++ b/fsspec/implementations/cached.py @@ -128,6 +128,11 @@ def __init__( self.expiry = expiry_time self.compression = compression + # Size of cache in bytes. If None then the size is unknown and will be + # recalculated the next time cache_size() is called. On writes to the + # cache this is reset to None. + self._cache_size = None + if same_names is not None and cache_mapper is not None: raise ValueError( "Cannot specify both same_names and cache_mapper in " @@ -165,6 +170,17 @@ def _remove_tempdir(tempdir): def _mkcache(self): os.makedirs(self.storage[-1], exist_ok=True) + def cache_size(self): + """Return size of cache in bytes. + + If more than one cache directory is in use, only the size of the last + one (the writable cache directory) is returned. + """ + if self._cache_size is None: + cache_dir = self.storage[-1] + self._cache_size = filesystem("file").du(cache_dir, withdirs=True) + return self._cache_size + def load_cache(self): """Read set of stored blocks from file""" self._metadata.load() @@ -176,6 +192,7 @@ def save_cache(self): self._mkcache() self._metadata.save() self.last_cache = time.time() + self._cache_size = None def _check_cache(self): """Reload caches if time elapsed or any disappeared""" @@ -202,6 +219,7 @@ def clear_cache(self): """ rmtree(self.storage[-1]) self.load_cache() + self._cache_size = None def clear_expired_cache(self, expiry_time=None): """Remove all expired files and metadata from the cache @@ -231,6 +249,8 @@ def clear_expired_cache(self, expiry_time=None): rmtree(self.storage[-1]) self.load_cache() + self._cache_size = None + def pop_from_cache(self, path): """Remove cached version of given file @@ -242,6 +262,7 @@ def pop_from_cache(self, path): fn = self._metadata.pop_file(path) if fn is not None: os.remove(fn) + self._cache_size = None def _open( self, @@ -389,6 +410,7 @@ def __getattribute__(self, item): "__hash__", "__eq__", "to_json", + "cache_size", ]: # all the methods defined in this class. Note `open` here, since # it calls `_open`, but is actually in superclass @@ -535,6 +557,7 @@ def commit_many(self, open_files): os.remove(f.name) except FileNotFoundError: pass + self._cache_size = None def _make_local_details(self, path): hash = self._mapper(path) @@ -704,6 +727,7 @@ def _open(self, path, mode="rb", **kwargs): kwargs["mode"] = mode self._mkcache() + self._cache_size = None if self.compression: with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2: if isinstance(f, AbstractBufferedFile): diff --git a/fsspec/implementations/tests/test_cached.py b/fsspec/implementations/tests/test_cached.py index d4e05bdcf..2f7b35861 100644 --- a/fsspec/implementations/tests/test_cached.py +++ b/fsspec/implementations/tests/test_cached.py @@ -16,6 +16,7 @@ ) from fsspec.implementations.cached import CachingFileSystem, LocalTempFile from fsspec.implementations.local import make_path_posix +from fsspec.tests.conftest import win from .test_ftp import FTPFileSystem @@ -1211,3 +1212,41 @@ def test_cache_dir_auto_deleted(temp_cache, tmpdir): assert not local.exists(cache_dir) else: assert local.exists(cache_dir) + + +@pytest.mark.parametrize("protocol", ["filecache", "blockcache", "simplecache"]) +def test_cache_size(tmpdir, protocol): + if win and protocol == "blockcache": + pytest.skip("Windows file locking affects blockcache size tests") + + source = os.path.join(tmpdir, "source") + afile = os.path.join(source, "afile") + os.mkdir(source) + open(afile, "w").write("test") + + fs = fsspec.filesystem(protocol, target_protocol="file") + empty_cache_size = fs.cache_size() + + # Create cache + with fs.open(afile, "rb") as f: + assert f.read(5) == b"test" + single_file_cache_size = fs.cache_size() + assert single_file_cache_size > empty_cache_size + + # Remove cached file but leave cache metadata file + fs.pop_from_cache(afile) + if win and protocol == "filecache": + empty_cache_size < fs.cache_size() + elif protocol != "simplecache": + assert empty_cache_size < fs.cache_size() < single_file_cache_size + else: + # simplecache never stores metadata + assert fs.cache_size() == single_file_cache_size + + # Completely remove cache + fs.clear_cache() + if protocol != "simplecache": + assert fs.cache_size() == empty_cache_size + else: + # Whole cache directory has been deleted + assert fs.cache_size() == 0