Allow partial repeat of readdir response (#965)
## Description of change
When a user application is interrupted during a `readdir` syscall, the kernel
resets the underlying chain of `readdir` FUSE requests to an offset that
Mountpoint considers stale. Mountpoint still completes the interrupted
`readdir` request, but the kernel partially discards the response. Since we
already cache the last response, we can use it to serve the request that
follows the interrupt (see the sketch below).

Relevant issues: #955
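
To make the mechanism concrete, here is a minimal sketch with invented types and values (`&str` entries and `usize` offsets rather than Mountpoint's real directory-entry type and `i64` FUSE offsets): a repeated request can be served from the cached previous response whenever its offset falls inside the window that response covered.

```rust
// Sketch only: not Mountpoint's actual code or types.
fn replay_from_cache<'a>(
    cached: Option<&'a (usize, Vec<&'static str>)>, // (offset of first cached entry, entries)
    offset: usize,
) -> Option<&'a [&'static str]> {
    let (last_offset, entries) = cached?;
    // The cache can serve any offset in [last_offset, last_offset + entries.len()).
    if (*last_offset..last_offset + entries.len()).contains(&offset) {
        // Skip the entries the kernel already kept before the interrupt.
        Some(&entries[offset - last_offset..])
    } else {
        None // not covered by the cache: fall through to the streaming path
    }
}

fn main() {
    let cached = Some((0usize, vec!["a", "b", "c"]));
    // After an interrupt the kernel re-requests offset 1: "b" and "c" are replayed.
    assert_eq!(replay_from_cache(cached.as_ref(), 1).unwrap(), &["b", "c"][..]);
    // An offset past the cached window cannot be served from the cache.
    assert!(replay_from_cache(cached.as_ref(), 3).is_none());
}
```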

## Does this change impact existing behavior?

This is not a breaking change. Previously an interrupt during `readdir`
resulted in an error; now the follow-up request is served correctly from the
cached response.

---

By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license and I agree to the terms of
the [Developer Certificate of Origin
(DCO)](https://developercertificate.org/).

---------

Signed-off-by: Vladislav Volodkin <[email protected]>
Signed-off-by: Vlad Volodkin <[email protected]>
Co-authored-by: Vladislav Volodkin <[email protected]>
Co-authored-by: Vlad Volodkin <[email protected]>
3 people authored Dec 10, 2024
1 parent 688ec17 commit 441a502
Showing 3 changed files with 92 additions and 6 deletions.
4 changes: 4 additions & 0 deletions mountpoint-s3/CHANGELOG.md
@@ -1,5 +1,9 @@
 ## Unreleased
 
+### Other changes
+
+* Fix an issue where an interrupt during `readdir` syscall leads to an error. ([#965](https://github.com/awslabs/mountpoint-s3/pull/965))
+
 ## v1.13.0 (December 2, 2024)
 
 ### New features
16 changes: 12 additions & 4 deletions mountpoint-s3/src/fs.rs
@@ -619,22 +619,30 @@ where
             let new_handle = self.readdir_handle(parent).await?;
             *dir_handle.handle.lock().await = new_handle;
             dir_handle.rewind_offset();
+            // drop any cached entries, as the new response may be unordered and the cache would be stale
+            *dir_handle.last_response.lock().await = None;
         }
 
         let readdir_handle = dir_handle.handle.lock().await;
 
-        if offset != dir_handle.offset() {
+        // If offset is 0 we've already restarted the request and do not use the cache; otherwise we're using the same
+        // request and it is safe to repeat the response. We do not repeat the response if a negative offset was provided.
+        if offset != dir_handle.offset() && offset > 0 {
             // POSIX allows seeking an open directory. That's a pain for us since we are streaming
             // the directory entries and don't want to keep them all in memory. But one common case
             // we've seen (https://github.com/awslabs/mountpoint-s3/issues/477) is applications that
             // request offset 0 twice in a row. So we remember the last response and, if repeated,
-            // we return it again.
+            // we return it again. The last response may also be used partially, if an interrupt occurred
+            // (https://github.com/awslabs/mountpoint-s3/issues/955) which caused entries from it to
+            // be only partially fetched by the kernel.
 
             let last_response = dir_handle.last_response.lock().await;
             if let Some((last_offset, entries)) = last_response.as_ref() {
-                if offset == *last_offset {
+                let offset = offset as usize;
+                let last_offset = *last_offset as usize;
+                if (last_offset..last_offset + entries.len()).contains(&offset) {
                     trace!(offset, "repeating readdir response");
-                    for entry in entries {
+                    for entry in entries[offset - last_offset..].iter() {
                         if reply.add(entry.clone()) {
                             break;
                         }
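A worked boundary example of the new window check, with invented numbers; note that the real code above also requires the offset to differ from the stream's current position before taking the repeat branch.

```rust
// Simplified gate for the repeat branch, mirroring the shape of the check in
// fs.rs with standalone values so the boundaries are easy to see.
fn repeat_from_cache(offset: i64, last_offset: usize, cached_len: usize) -> bool {
    offset > 0 && (last_offset..last_offset + cached_len).contains(&(offset as usize))
}

fn main() {
    assert!(repeat_from_cache(25, 25, 25)); // the first cached entry can be repeated
    assert!(repeat_from_cache(49, 25, 25)); // the last cached entry can be repeated
    assert!(!repeat_from_cache(50, 25, 25)); // past the window: not a cached repeat
    assert!(!repeat_from_cache(0, 25, 25)); // offset 0 is handled by the rewind branch
    assert!(!repeat_from_cache(-1, 25, 25)); // negative offsets are never repeated
}
```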
78 changes: 76 additions & 2 deletions mountpoint-s3/tests/fs.rs
@@ -1314,9 +1314,9 @@ async fn test_readdir_rewind_ordered() {
         .collect::<Vec<_>>();
     assert_eq!(entries.len(), 5);
 
-    // Trying to read out of order should fail (only the previous or next offsets are valid)
+    // Trying to read out of order should fail (only offsets in the range of the previous response, or the one immediately following, are valid)
     assert!(reply.entries.back().unwrap().offset > 1);
-    fs.readdirplus(FUSE_ROOT_INODE, dir_handle, 1, &mut Default::default())
+    fs.readdirplus(FUSE_ROOT_INODE, dir_handle, 6, &mut Default::default())
         .await
         .expect_err("out of order");
 
@@ -1481,6 +1481,80 @@ async fn test_readdir_rewind_with_local_files_only() {
     assert_eq!(new_entries.len(), 3); // 1 new local file + 2 dirs (. and ..) = 3 entries
 }
 
+// Check that a request with an out-of-order offset which is within the bounds of the previously cached
+// response is served correctly. This is relevant when a user application is interrupted in a readdir
+// system call, which makes the kernel partially discard the previous response and request some of its entries again:
+//
+// FUSE( 10) READDIRPLUS fh FileHandle(1), offset 0, size 4096
+// FUSE( 11) INTERRUPT unique RequestId(10)
+// FUSE( 12) READDIRPLUS fh FileHandle(1), offset 1, size 4096 <-- out-of-order offset `1`
+// FUSE( 14) READDIRPLUS fh FileHandle(1), offset 25, size 4096
+#[test_case(1, 25; "first in the beginning, second in full")]
+#[test_case(24, 25; "first in the end, second in full")]
+#[test_case(0, 26; "first in full, second in the beginning")]
+#[test_case(0, 49; "first in full, second in the end")]
+#[tokio::test]
+async fn test_readdir_repeat_response_partial(first_repeated_offset: usize, second_repeated_offset: usize) {
+    let (client, fs) = make_test_filesystem("test_readdir_repeat_response", &Default::default(), Default::default());
+
+    for i in 0..48 {
+        // "." and ".." make it a round 50 in total
+        client.add_object(&format!("foo{i}"), b"foo".into());
+    }
+
+    let dir_handle = fs.opendir(FUSE_ROOT_INODE, 0).await.unwrap().fh;
+    let max_entries = 25;
+
+    // the first request should just succeed
+    let first_response = ls(&fs, dir_handle, 0, max_entries).await;
+    assert!(first_response.len() == max_entries);
+
+    // request some entries from the first response again
+    let second_response = ls(&fs, dir_handle, first_repeated_offset as i64, max_entries).await;
+    assert_eq!(&first_response[first_repeated_offset..], &second_response[..]);
+
+    // read till the end
+    let third_response = ls(&fs, dir_handle, 25, max_entries).await;
+    assert!(third_response.len() == max_entries);
+
+    // request some entries from the last response again
+    let repeated_response = ls(&fs, dir_handle, second_repeated_offset as i64, max_entries).await;
+    assert_eq!(&third_response[second_repeated_offset - 25..], &repeated_response[..]);
+
+    // the final response must be empty, signaling EOF
+    let final_response = ls(&fs, dir_handle, 50, max_entries).await;
+    assert!(final_response.is_empty());
+}
+
+#[tokio::test]
+async fn test_readdir_repeat_response_after_rewind() {
+    let (client, fs) = make_test_filesystem("test_readdir_repeat_response", &Default::default(), Default::default());
+
+    for i in 0..73 {
+        // "." and ".." make it a round 75 in total
+        client.add_object(&format!("foo{i}"), b"foo".into());
+    }
+
+    let dir_handle = fs.opendir(FUSE_ROOT_INODE, 0).await.unwrap().fh;
+    let max_entries = 25;
+
+    // read the first response; we'll later use it as the expected result
+    let first_response = ls(&fs, dir_handle, 0, max_entries).await;
+    assert!(first_response.len() == max_entries);
+
+    // proceed in the stream, so we have a new response cached
+    let second_response = ls(&fs, dir_handle, 25, max_entries).await;
+    assert!(second_response.len() == max_entries);
+
+    // ask for offset 0, causing a rewind (new S3 request)
+    let rewinded_response = ls(&fs, dir_handle, 0, max_entries).await;
+    assert_eq!(first_response, rewinded_response);
+
+    // ask for offset 1, check that the correct cached response is used
+    let repeated_response = ls(&fs, dir_handle, 1, max_entries).await;
+    assert_eq!(&first_response[1..], repeated_response);
+}
+
 #[cfg(feature = "s3_tests")]
 #[tokio::test]
 async fn test_lookup_404_not_an_error() {
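The FUSE trace in the test comment above can also be replayed end to end against a toy model. Below is a self-contained simulation with invented names and batch sizes, not the test suite's actual `ls` helper or filesystem types:

```rust
// Toy model of the interrupted readdir flow (all data invented): serve a
// batch, replay part of it after a simulated interrupt, then continue.
struct Listing {
    names: Vec<String>,
    last: Option<(usize, Vec<String>)>, // (start offset, last batch served)
}

impl Listing {
    fn read(&mut self, offset: usize, max: usize) -> Vec<String> {
        if let Some((start, batch)) = &self.last {
            let in_window = (*start..start + batch.len()).contains(&offset);
            // `offset == start + batch.len()` is plain continuation, not a repeat.
            if offset > 0 && offset != start + batch.len() && in_window {
                return batch[offset - start..].to_vec(); // replay from the cache
            }
        }
        let batch: Vec<String> = self.names.iter().skip(offset).take(max).cloned().collect();
        self.last = Some((offset, batch.clone()));
        batch
    }
}

fn main() {
    let mut dir = Listing {
        names: (0..50).map(|i| format!("foo{i}")).collect(),
        last: None,
    };
    let first = dir.read(0, 25); // FUSE(10) READDIRPLUS offset 0
    // FUSE(11) INTERRUPT: the kernel keeps only the entry at offset 0
    let repeat = dir.read(1, 25); // FUSE(12) READDIRPLUS offset 1
    assert_eq!(&first[1..], &repeat[..]); // replayed from the cached batch
    let rest = dir.read(25, 25); // FUSE(14) READDIRPLUS offset 25
    assert_eq!(rest.len(), 25);
    assert!(dir.read(50, 25).is_empty()); // EOF
}
```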
