From 2dc631cf0711c8c4ad44a8e57ae46cb58465b859 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aurimas=20Bla=C5=BEulionis?= <0x60@pm.me>
Date: Thu, 30 Nov 2023 19:56:00 +0000
Subject: [PATCH] Resolve io_uring out of order packets

---
 .../src/native/impls/io_uring/tcp_stream.rs   | 25 +++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/mfio-rt/src/native/impls/io_uring/tcp_stream.rs b/mfio-rt/src/native/impls/io_uring/tcp_stream.rs
index 20bf15c..586f809 100644
--- a/mfio-rt/src/native/impls/io_uring/tcp_stream.rs
+++ b/mfio-rt/src/native/impls/io_uring/tcp_stream.rs
@@ -9,6 +9,7 @@ use core::task::{Context, Poll};
 
 use io_uring::{
     opcode,
+    squeue::Flags,
     types::{Fd, Fixed},
 };
 use parking_lot::Mutex;
@@ -131,7 +132,18 @@ impl StreamInner {
             }
             let queue = self.stream.write_queue();
             if !queue.is_empty() {
-                for queue in queue.chunks(*IOV_MAX) {
+                // FIXME: investigate why processing more than 3 chunks leads to out-of-order
+                // transfer of data. OOO is avoided only if we also add the link flag to the
+                // tail, which is wrong - linking the tail likely interrupts other logic,
+                // slowing us down. Taking 3 chunks seems to work fine, but 2 should be good
+                // enough.
+                //
+                // Performance wise, a better improvement here would be to enable submission of new
+                // writes, before all chunks complete - this would maximize the throughput.
+                let target_height = core::cmp::min((queue.len() + *IOV_MAX - 1) / *IOV_MAX, 2);
+                let mut tailed = false;
+                for queue in queue.chunks(*IOV_MAX).take(target_height) {
+                    debug_assert!(!tailed);
                     self.in_write += 1;
                     let entry = opcode::Writev::new(
                         Fixed(Key::Stream(idx).key() as _),
@@ -140,7 +152,16 @@ impl StreamInner {
                     )
                     .offset(!0u64)
                     .build();
-                    push_handle.try_push_op(entry, Operation::StreamWrite(idx))
+
+                    push_handle.try_push_op(
+                        if self.in_write < target_height {
+                            entry.flags(Flags::IO_HARDLINK)
+                        } else {
+                            tailed = true;
+                            entry
+                        },
+                        Operation::StreamWrite(idx),
+                    );
                 }
             }
         }