Allow partial repeat of readdir response (#965)
## Description of change
When a user application is interrupted during a `readdir` syscall, the kernel
resets the underlying chain of `readdir` FUSE requests to an offset that
Mountpoint considers stale. Mountpoint still completes the interrupted
`readdir` request, but the kernel partially discards the response. Since we
already cache the last response, we can use it to serve the request that
follows the interrupt (see the sketch below).

Relevant issues: #955
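
To make the mechanism concrete, here is a minimal sketch with invented types and values (`&str` entries and `usize` offsets rather than Mountpoint's real directory-entry type and `i64` FUSE offsets): a repeated request can be served from the cached previous response whenever its offset falls inside the window that response covered.

```rust
// Sketch only: not Mountpoint's actual code or types.
fn replay_from_cache<'a>(
    cached: Option<&'a (usize, Vec<&'static str>)>, // (offset of first cached entry, entries)
    offset: usize,
) -> Option<&'a [&'static str]> {
    let (last_offset, entries) = cached?;
    // The cache can serve any offset in [last_offset, last_offset + entries.len()).
    if (*last_offset..last_offset + entries.len()).contains(&offset) {
        // Skip the entries the kernel already kept before the interrupt.
        Some(&entries[offset - last_offset..])
    } else {
        None // not covered by the cache: fall through to the streaming path
    }
}

fn main() {
    let cached = Some((0usize, vec!["a", "b", "c"]));
    // After an interrupt the kernel re-requests offset 1: "b" and "c" are replayed.
    assert_eq!(replay_from_cache(cached.as_ref(), 1).unwrap(), &["b", "c"][..]);
    // An offset past the cached window cannot be served from the cache.
    assert!(replay_from_cache(cached.as_ref(), 3).is_none());
}
```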

## Does this change impact existing behavior?

This is not a breaking change. Previously an interrupt during `readdir`
resulted in an error; now the follow-up request is served correctly from the
cached response.

---

By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license and I agree to the terms of
the [Developer Certificate of Origin
(DCO)](https://developercertificate.org/).

---------

Signed-off-by: Vladislav Volodkin <[email protected]>
Signed-off-by: Vlad Volodkin <[email protected]>
Co-authored-by: Vladislav Volodkin <[email protected]>
Co-authored-by: Vlad Volodkin <[email protected]>
3 people authored Dec 10, 2024
1 parent 688ec17 commit 441a502
Showing 3 changed files with 92 additions and 6 deletions.
4 changes: 4 additions & 0 deletions mountpoint-s3/CHANGELOG.md
@@ -1,5 +1,9 @@
 ## Unreleased
 
+### Other changes
+
+* Fix an issue where an interrupt during `readdir` syscall leads to an error. ([#965](https://github.com/awslabs/mountpoint-s3/pull/965))
+
 ## v1.13.0 (December 2, 2024)
 
 ### New features
16 changes: 12 additions & 4 deletions mountpoint-s3/src/fs.rs
@@ -619,22 +619,30 @@ where
             let new_handle = self.readdir_handle(parent).await?;
             *dir_handle.handle.lock().await = new_handle;
             dir_handle.rewind_offset();
+            // drop any cached entries, as the new response may be unordered and the cache would be stale
+            *dir_handle.last_response.lock().await = None;
         }
 
         let readdir_handle = dir_handle.handle.lock().await;
 
-        if offset != dir_handle.offset() {
+        // If offset is 0 we've already restarted the request and do not use the cache; otherwise we're using the same
+        // request and it is safe to repeat the response. We do not repeat the response if a negative offset was provided.
+        if offset != dir_handle.offset() && offset > 0 {
             // POSIX allows seeking an open directory. That's a pain for us since we are streaming
             // the directory entries and don't want to keep them all in memory. But one common case
             // we've seen (https://github.com/awslabs/mountpoint-s3/issues/477) is applications that
             // request offset 0 twice in a row. So we remember the last response and, if repeated,
-            // we return it again.
+            // we return it again. The last response may also be used partially, if an interrupt occurred
+            // (https://github.com/awslabs/mountpoint-s3/issues/955) which caused entries from it to
+            // be only partially fetched by the kernel.
 
             let last_response = dir_handle.last_response.lock().await;
             if let Some((last_offset, entries)) = last_response.as_ref() {
-                if offset == *last_offset {
+                let offset = offset as usize;
+                let last_offset = *last_offset as usize;
+                if (last_offset..last_offset + entries.len()).contains(&offset) {
                     trace!(offset, "repeating readdir response");
-                    for entry in entries {
+                    for entry in entries[offset - last_offset..].iter() {
                         if reply.add(entry.clone()) {
                             break;
                         }
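A worked boundary example of the new window check, with invented numbers; note that the real code above also requires the offset to differ from the stream's current position before taking the repeat branch.

```rust
// Simplified gate for the repeat branch, mirroring the shape of the check in
// fs.rs with standalone values so the boundaries are easy to see.
fn repeat_from_cache(offset: i64, last_offset: usize, cached_len: usize) -> bool {
    offset > 0 && (last_offset..last_offset + cached_len).contains(&(offset as usize))
}

fn main() {
    assert!(repeat_from_cache(25, 25, 25)); // the first cached entry can be repeated
    assert!(repeat_from_cache(49, 25, 25)); // the last cached entry can be repeated
    assert!(!repeat_from_cache(50, 25, 25)); // past the window: not a cached repeat
    assert!(!repeat_from_cache(0, 25, 25)); // offset 0 is handled by the rewind branch
    assert!(!repeat_from_cache(-1, 25, 25)); // negative offsets are never repeated
}
```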
78 changes: 76 additions & 2 deletions mountpoint-s3/tests/fs.rs
@@ -1314,9 +1314,9 @@ async fn test_readdir_rewind_ordered() {
         .collect::<Vec<_>>();
     assert_eq!(entries.len(), 5);
 
-    // Trying to read out of order should fail (only the previous or next offsets are valid)
+    // Trying to read out of order should fail (only offsets in the range of the previous response, or the one immediately following, are valid)
     assert!(reply.entries.back().unwrap().offset > 1);
-    fs.readdirplus(FUSE_ROOT_INODE, dir_handle, 1, &mut Default::default())
+    fs.readdirplus(FUSE_ROOT_INODE, dir_handle, 6, &mut Default::default())
         .await
         .expect_err("out of order");
 
@@ -1481,6 +1481,80 @@ async fn test_readdir_rewind_with_local_files_only() {
     assert_eq!(new_entries.len(), 3); // 1 new local file + 2 dirs (. and ..) = 3 entries
 }
 
+// Check that a request with an out-of-order offset which is within the bounds of the previously cached
+// response is served correctly. This is relevant when a user application is interrupted in a readdir
+// system call, which makes the kernel partially discard the previous response and request some of its entries again:
+//
+// FUSE( 10) READDIRPLUS fh FileHandle(1), offset 0, size 4096
+// FUSE( 11) INTERRUPT unique RequestId(10)
+// FUSE( 12) READDIRPLUS fh FileHandle(1), offset 1, size 4096 <-- out-of-order offset `1`
+// FUSE( 14) READDIRPLUS fh FileHandle(1), offset 25, size 4096
+#[test_case(1, 25; "first in the beginning, second in full")]
+#[test_case(24, 25; "first in the end, second in full")]
+#[test_case(0, 26; "first in full, second in the beginning")]
+#[test_case(0, 49; "first in full, second in the end")]
+#[tokio::test]
+async fn test_readdir_repeat_response_partial(first_repeated_offset: usize, second_repeated_offset: usize) {
+    let (client, fs) = make_test_filesystem("test_readdir_repeat_response", &Default::default(), Default::default());
+
+    for i in 0..48 {
+        // "." and ".." make it a round 50 in total
+        client.add_object(&format!("foo{i}"), b"foo".into());
+    }
+
+    let dir_handle = fs.opendir(FUSE_ROOT_INODE, 0).await.unwrap().fh;
+    let max_entries = 25;
+
+    // the first request should just succeed
+    let first_response = ls(&fs, dir_handle, 0, max_entries).await;
+    assert!(first_response.len() == max_entries);
+
+    // request some entries from the first response again
+    let second_response = ls(&fs, dir_handle, first_repeated_offset as i64, max_entries).await;
+    assert_eq!(&first_response[first_repeated_offset..], &second_response[..]);
+
+    // read till the end
+    let third_response = ls(&fs, dir_handle, 25, max_entries).await;
+    assert!(third_response.len() == max_entries);
+
+    // request some entries from the last response again
+    let repeated_response = ls(&fs, dir_handle, second_repeated_offset as i64, max_entries).await;
+    assert_eq!(&third_response[second_repeated_offset - 25..], &repeated_response[..]);
+
+    // the final response must be empty, signaling EOF
+    let final_response = ls(&fs, dir_handle, 50, max_entries).await;
+    assert!(final_response.is_empty());
+}
+
+#[tokio::test]
+async fn test_readdir_repeat_response_after_rewind() {
+    let (client, fs) = make_test_filesystem("test_readdir_repeat_response", &Default::default(), Default::default());
+
+    for i in 0..73 {
+        // "." and ".." make it a round 75 in total
+        client.add_object(&format!("foo{i}"), b"foo".into());
+    }
+
+    let dir_handle = fs.opendir(FUSE_ROOT_INODE, 0).await.unwrap().fh;
+    let max_entries = 25;
+
+    // read the first response; we'll later use it as the expected result
+    let first_response = ls(&fs, dir_handle, 0, max_entries).await;
+    assert!(first_response.len() == max_entries);
+
+    // proceed in the stream, so we have a new response cached
+    let second_response = ls(&fs, dir_handle, 25, max_entries).await;
+    assert!(second_response.len() == max_entries);
+
+    // ask for offset 0, causing a rewind (new S3 request)
+    let rewinded_response = ls(&fs, dir_handle, 0, max_entries).await;
+    assert_eq!(first_response, rewinded_response);
+
+    // ask for offset 1, check that the correct cached response is used
+    let repeated_response = ls(&fs, dir_handle, 1, max_entries).await;
+    assert_eq!(&first_response[1..], repeated_response);
+}
+
 #[cfg(feature = "s3_tests")]
 #[tokio::test]
 async fn test_lookup_404_not_an_error() {
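The FUSE trace in the test comment above can also be replayed end to end against a toy model. Below is a self-contained simulation with invented names and batch sizes, not the test suite's actual `ls` helper or filesystem types:

```rust
// Toy model of the interrupted readdir flow (all data invented): serve a
// batch, replay part of it after a simulated interrupt, then continue.
struct Listing {
    names: Vec<String>,
    last: Option<(usize, Vec<String>)>, // (start offset, last batch served)
}

impl Listing {
    fn read(&mut self, offset: usize, max: usize) -> Vec<String> {
        if let Some((start, batch)) = &self.last {
            let in_window = (*start..start + batch.len()).contains(&offset);
            // `offset == start + batch.len()` is plain continuation, not a repeat.
            if offset > 0 && offset != start + batch.len() && in_window {
                return batch[offset - start..].to_vec(); // replay from the cache
            }
        }
        let batch: Vec<String> = self.names.iter().skip(offset).take(max).cloned().collect();
        self.last = Some((offset, batch.clone()));
        batch
    }
}

fn main() {
    let mut dir = Listing {
        names: (0..50).map(|i| format!("foo{i}")).collect(),
        last: None,
    };
    let first = dir.read(0, 25); // FUSE(10) READDIRPLUS offset 0
    // FUSE(11) INTERRUPT: the kernel keeps only the entry at offset 0
    let repeat = dir.read(1, 25); // FUSE(12) READDIRPLUS offset 1
    assert_eq!(&first[1..], &repeat[..]); // replayed from the cached batch
    let rest = dir.read(25, 25); // FUSE(14) READDIRPLUS offset 25
    assert_eq!(rest.len(), 25);
    assert!(dir.read(50, 25).is_empty()); // EOF
}
```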
