Skip to content

Commit

Permalink
Implement cache size for CachingFileSystem (#1377)
Browse files Browse the repository at this point in the history
  • Loading branch information
ianthomas23 authored Sep 27, 2023
1 parent bb60983 commit 89626e8
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 0 deletions.
24 changes: 24 additions & 0 deletions fsspec/implementations/cached.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ def __init__(
self.expiry = expiry_time
self.compression = compression

# Size of cache in bytes. If None then the size is unknown and will be
# recalculated the next time cache_size() is called. On writes to the
# cache this is reset to None.
self._cache_size = None

if same_names is not None and cache_mapper is not None:
raise ValueError(
"Cannot specify both same_names and cache_mapper in "
Expand Down Expand Up @@ -165,6 +170,17 @@ def _remove_tempdir(tempdir):
def _mkcache(self):
os.makedirs(self.storage[-1], exist_ok=True)

def cache_size(self):
"""Return size of cache in bytes.
If more than one cache directory is in use, only the size of the last
one (the writable cache directory) is returned.
"""
if self._cache_size is None:
cache_dir = self.storage[-1]
self._cache_size = filesystem("file").du(cache_dir, withdirs=True)
return self._cache_size

def load_cache(self):
"""Read set of stored blocks from file"""
self._metadata.load()
Expand All @@ -176,6 +192,7 @@ def save_cache(self):
self._mkcache()
self._metadata.save()
self.last_cache = time.time()
self._cache_size = None

def _check_cache(self):
"""Reload caches if time elapsed or any disappeared"""
Expand All @@ -202,6 +219,7 @@ def clear_cache(self):
"""
rmtree(self.storage[-1])
self.load_cache()
self._cache_size = None

def clear_expired_cache(self, expiry_time=None):
"""Remove all expired files and metadata from the cache
Expand Down Expand Up @@ -231,6 +249,8 @@ def clear_expired_cache(self, expiry_time=None):
rmtree(self.storage[-1])
self.load_cache()

self._cache_size = None

def pop_from_cache(self, path):
"""Remove cached version of given file
Expand All @@ -242,6 +262,7 @@ def pop_from_cache(self, path):
fn = self._metadata.pop_file(path)
if fn is not None:
os.remove(fn)
self._cache_size = None

def _open(
self,
Expand Down Expand Up @@ -389,6 +410,7 @@ def __getattribute__(self, item):
"__hash__",
"__eq__",
"to_json",
"cache_size",
]:
# all the methods defined in this class. Note `open` here, since
# it calls `_open`, but is actually in superclass
Expand Down Expand Up @@ -535,6 +557,7 @@ def commit_many(self, open_files):
os.remove(f.name)
except FileNotFoundError:
pass
self._cache_size = None

def _make_local_details(self, path):
hash = self._mapper(path)
Expand Down Expand Up @@ -704,6 +727,7 @@ def _open(self, path, mode="rb", **kwargs):
kwargs["mode"] = mode

self._mkcache()
self._cache_size = None
if self.compression:
with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
if isinstance(f, AbstractBufferedFile):
Expand Down
39 changes: 39 additions & 0 deletions fsspec/implementations/tests/test_cached.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)
from fsspec.implementations.cached import CachingFileSystem, LocalTempFile
from fsspec.implementations.local import make_path_posix
from fsspec.tests.conftest import win

from .test_ftp import FTPFileSystem

Expand Down Expand Up @@ -1211,3 +1212,41 @@ def test_cache_dir_auto_deleted(temp_cache, tmpdir):
assert not local.exists(cache_dir)
else:
assert local.exists(cache_dir)


@pytest.mark.parametrize("protocol", ["filecache", "blockcache", "simplecache"])
def test_cache_size(tmpdir, protocol):
if win and protocol == "blockcache":
pytest.skip("Windows file locking affects blockcache size tests")

source = os.path.join(tmpdir, "source")
afile = os.path.join(source, "afile")
os.mkdir(source)
open(afile, "w").write("test")

fs = fsspec.filesystem(protocol, target_protocol="file")
empty_cache_size = fs.cache_size()

# Create cache
with fs.open(afile, "rb") as f:
assert f.read(5) == b"test"
single_file_cache_size = fs.cache_size()
assert single_file_cache_size > empty_cache_size

# Remove cached file but leave cache metadata file
fs.pop_from_cache(afile)
if win and protocol == "filecache":
empty_cache_size < fs.cache_size()
elif protocol != "simplecache":
assert empty_cache_size < fs.cache_size() < single_file_cache_size
else:
# simplecache never stores metadata
assert fs.cache_size() == single_file_cache_size

# Completely remove cache
fs.clear_cache()
if protocol != "simplecache":
assert fs.cache_size() == empty_cache_size
else:
# Whole cache directory has been deleted
assert fs.cache_size() == 0

0 comments on commit 89626e8

Please sign in to comment.