From a3dbc1cd6bb4b7c84bd7d8cef8f404b80aad2a84 Mon Sep 17 00:00:00 2001 From: Daniel Carl Jones Date: Mon, 9 Oct 2023 15:54:02 +0100 Subject: [PATCH] Implement strong consistency toggle in mountpoint-s3::fs::CacheConfig This plumbs in checks for if the filesystem should maintain strong consistency for operations like open. There is no way to configure mountpoint-s3 itself to relax the consistency model - this change only impacts internals. Signed-off-by: Daniel Carl Jones --- mountpoint-s3/src/fs.rs | 21 ++++- mountpoint-s3/src/inode.rs | 85 ++++++++++++++++--- mountpoint-s3/tests/fuse_tests/lookup_test.rs | 1 + mountpoint-s3/tests/reftests/harness.rs | 9 +- 4 files changed, 101 insertions(+), 15 deletions(-) diff --git a/mountpoint-s3/src/fs.rs b/mountpoint-s3/src/fs.rs index 2bdfe4a25..acd88a009 100644 --- a/mountpoint-s3/src/fs.rs +++ b/mountpoint-s3/src/fs.rs @@ -194,6 +194,13 @@ impl UploadState { #[derive(Debug, Clone)] pub struct CacheConfig { + /// Should the file system check S3 even when a valid cached entry may be available? + /// + /// When enabled, some operations such as `lookup` are allowed to be served from cache + /// with a short TTL since Linux filesystems behave badly when the TTL is zero. + /// For example, results from `readdir` will expire immediately, and so the kernel will + /// immediately re-lookup every entry returned from `readdir`. + pub prefer_s3: bool, /// How long the kernel will cache metadata for files pub file_ttl: Duration, /// How long the kernel will cache metadata for directories @@ -202,8 +209,9 @@ pub struct CacheConfig { impl Default for CacheConfig { fn default() -> Self { - // We want to do as little caching as possible, but Linux filesystems behave badly when the - // TTL is exactly zero. For example, results from `readdir` will expire immediately, and so + // We want to do as little caching as possible by default, + // but Linux filesystems behave badly when the TTL is exactly zero. + // For example, results from `readdir` will expire immediately, and so // the kernel will immediately re-lookup every entry returned from `readdir`. So we apply // small non-zero TTLs. The goal is to be small enough that the impact on consistency is // minimal, but large enough that a single cache miss doesn't cause a cascading effect where @@ -213,7 +221,11 @@ impl Default for CacheConfig { let file_ttl = Duration::from_millis(100); let dir_ttl = Duration::from_millis(1000); - Self { file_ttl, dir_ttl } + Self { + prefer_s3: true, + file_ttl, + dir_ttl, + } } } @@ -461,7 +473,8 @@ where pub async fn open(&self, ino: InodeNo, flags: i32) -> Result { trace!("fs:open with ino {:?} flags {:?}", ino, flags); - let lookup = self.superblock.getattr(&self.client, ino, true).await?; + let force_revalidate = self.config.cache_config.prefer_s3; + let lookup = self.superblock.getattr(&self.client, ino, force_revalidate).await?; match lookup.inode.kind() { InodeKind::Directory => return Err(InodeError::IsDirectory(lookup.inode.err()).into()), diff --git a/mountpoint-s3/src/inode.rs b/mountpoint-s3/src/inode.rs index 06d00201b..d9b07ddb4 100644 --- a/mountpoint-s3/src/inode.rs +++ b/mountpoint-s3/src/inode.rs @@ -160,6 +160,13 @@ impl Superblock { } } writing_children.remove(&ino); + + if let Ok(state) = inode.get_inode_state() { + metrics::counter!( + "metadata_cache.inode_forgotten_before_expiry", + state.stat.is_valid().into(), + ); + }; } } } @@ -174,7 +181,10 @@ impl Superblock { name: &OsStr, ) -> Result { trace!(parent=?parent_ino, ?name, "lookup"); - let lookup = self.inner.lookup(client, parent_ino, name).await?; + let lookup = self + .inner + .lookup_by_name(client, parent_ino, name, self.inner.cache_config.prefer_s3) + .await?; self.inner.remember(&lookup.inode); Ok(lookup) } @@ -197,7 +207,10 @@ impl Superblock { } } - let lookup = self.inner.lookup(client, inode.parent(), inode.name().as_ref()).await?; + let lookup = self + .inner + .lookup_by_name(client, inode.parent(), inode.name().as_ref(), true) + .await?; if lookup.inode.ino() != ino { Err(InodeError::StaleInode { remote_key: lookup.inode.full_key().to_owned(), @@ -287,7 +300,10 @@ impl Superblock { ) -> Result { trace!(parent=?dir, ?name, "create"); - let existing = self.inner.lookup(client, dir, name).await; + let existing = self + .inner + .lookup_by_name(client, dir, name, self.inner.cache_config.prefer_s3) + .await; match existing { Ok(lookup) => return Err(InodeError::FileAlreadyExists(lookup.inode.err())), Err(InodeError::FileDoesNotExist) => (), @@ -341,7 +357,10 @@ impl Superblock { parent_ino: InodeNo, name: &OsStr, ) -> Result<(), InodeError> { - let LookedUp { inode, .. } = self.inner.lookup(client, parent_ino, name).await?; + let LookedUp { inode, .. } = self + .inner + .lookup_by_name(client, parent_ino, name, self.inner.cache_config.prefer_s3) + .await?; if inode.kind() == InodeKind::File { return Err(InodeError::NotADirectory(inode.err())); @@ -407,7 +426,10 @@ impl Superblock { name: &OsStr, ) -> Result<(), InodeError> { let parent = self.inner.get(parent_ino)?; - let LookedUp { inode, .. } = self.inner.lookup(client, parent_ino, name).await?; + let LookedUp { inode, .. } = self + .inner + .lookup_by_name(client, parent_ino, name, self.inner.cache_config.prefer_s3) + .await?; if inode.kind() == InodeKind::Directory { return Err(InodeError::IsDirectory(inode.err())); @@ -475,6 +497,8 @@ impl Superblock { impl SuperblockInner { /// Retrieve the inode for the given number if it exists + /// + /// The validity of its stat field is not checked. pub fn get(&self, ino: InodeNo) -> Result { let inode = self .inodes @@ -499,14 +523,16 @@ impl SuperblockInner { } /// Lookup an inode in the parent directory with the given name. + /// /// Updates the parent inode to be in sync with the client, but does /// not add new inodes to the superblock. The caller is responsible /// for calling [`remember()`] if that is required. - pub async fn lookup( + pub async fn lookup_by_name( &self, client: &OC, parent_ino: InodeNo, name: &OsStr, + skip_cache: bool, ) -> Result { let name = name .to_str() @@ -518,14 +544,50 @@ impl SuperblockInner { return Err(InodeError::InvalidFileName(name.into())); } - // TODO use caches. if we already know about this name, we just need to revalidate the stat - // cache and then read it. - let remote = self.remote_lookup(client, parent_ino, name).await?; - let lookup = self.update_from_remote(parent_ino, name, remote)?; + let lookup = if skip_cache { + None + } else { + let lookup = self.cache_lookup(parent_ino, name); + trace!("lookup returned from cache: {:?}", lookup); + metrics::counter!("metadata_cache.cache_hit", lookup.is_some().into()); + lookup + }; + + let lookup = match lookup { + Some(lookup) => lookup, + None => { + let remote = self.remote_lookup(client, parent_ino, name).await?; + self.update_from_remote(parent_ino, name, remote)? + } + }; + lookup.inode.verify_child(parent_ino, name.as_ref())?; Ok(lookup) } + /// Lookup an [Inode] against known directory entries in the parent, + /// verifying any returned entry has not expired. + fn cache_lookup(&self, parent_ino: InodeNo, name: &str) -> Option { + let parent = self.get(parent_ino).ok()?; + + match &parent.get_inode_state().ok()?.kind_data { + InodeKindData::File { .. } => unreachable!("parent should be a directory!"), + InodeKindData::Directory { children, .. } => { + let inode = children.get(name)?; + let inode_stat = &inode.get_inode_state().ok()?.stat; + if inode_stat.is_valid() { + let lookup = LookedUp { + inode: inode.clone(), + stat: inode_stat.clone(), + }; + return Some(lookup); + } + } + }; + + None + } + /// Lookup an inode in the parent directory with the given name /// on the remote client. async fn remote_lookup( @@ -1121,6 +1183,7 @@ impl Inode { Self { inner: inner.into() } } + /// Verify [Inode] has the expected inode number and the inode content is valid for its checksum. fn verify_inode(&self, expected_ino: InodeNo) -> Result<(), InodeError> { let computed = Self::compute_checksum(self.ino(), self.full_key()); if computed == self.inner.checksum && self.ino() == expected_ino { @@ -1130,6 +1193,8 @@ impl Inode { } } + /// Verify [Inode] has the expected inode number, expected parent inode number, + /// and the inode's content is valid for its checksum. fn verify_child(&self, expected_parent: InodeNo, expected_name: &str) -> Result<(), InodeError> { let computed = Self::compute_checksum(self.ino(), self.full_key()); if computed == self.inner.checksum && self.parent() == expected_parent && self.name() == expected_name { diff --git a/mountpoint-s3/tests/fuse_tests/lookup_test.rs b/mountpoint-s3/tests/fuse_tests/lookup_test.rs index 34c42c35a..818ecad1d 100644 --- a/mountpoint-s3/tests/fuse_tests/lookup_test.rs +++ b/mountpoint-s3/tests/fuse_tests/lookup_test.rs @@ -111,6 +111,7 @@ where { let filesystem_config = S3FilesystemConfig { cache_config: CacheConfig { + prefer_s3: true, file_ttl: Duration::ZERO, dir_ttl: Duration::ZERO, }, diff --git a/mountpoint-s3/tests/reftests/harness.rs b/mountpoint-s3/tests/reftests/harness.rs index 80d063f3b..22680b7f1 100644 --- a/mountpoint-s3/tests/reftests/harness.rs +++ b/mountpoint-s3/tests/reftests/harness.rs @@ -2,11 +2,12 @@ use std::collections::{BTreeMap, HashSet}; use std::fmt::Debug; use std::path::{Component, Path, PathBuf}; use std::sync::Arc; +use std::time::Duration; use fuser::FileType; use futures::executor::ThreadPool; use futures::future::{BoxFuture, FutureExt}; -use mountpoint_s3::fs::{self, InodeNo, ReadReplier, ToErrno, FUSE_ROOT_INODE}; +use mountpoint_s3::fs::{self, CacheConfig, InodeNo, ReadReplier, ToErrno, FUSE_ROOT_INODE}; use mountpoint_s3::prefix::Prefix; use mountpoint_s3::{S3Filesystem, S3FilesystemConfig}; use mountpoint_s3_client::mock_client::{MockClient, MockObject}; @@ -885,6 +886,12 @@ mod mutations { let config = S3FilesystemConfig { readdir_size: 5, allow_delete: true, + cache_config: CacheConfig { + // We are only interested in strong consistency for the reference tests. FUSE isn't even in the loop. + prefer_s3: true, + dir_ttl: Duration::ZERO, + file_ttl: Duration::ZERO, + }, ..Default::default() }; let (client, fs) = make_test_filesystem(BUCKET_NAME, &test_prefix, config);