From 2f4bfae460a7026a7ec3ca23890a6999053e97da Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 3 Jun 2024 20:01:41 -0700 Subject: [PATCH 01/10] Fix byte size handling for raw binary requests where size and data are split across buffers --- src/infer_request.cc | 98 +++++++++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 28 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 0c85051ff..34322294d 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1269,47 +1269,89 @@ InferenceRequest::ValidateBytesInputs( const std::string& input_id, const Input& input, int64_t* const expected_byte_size) const { + *expected_byte_size = 0; const auto& input_dims = input.ShapeWithBatchDim(); + int64_t element_count = triton::common::GetElementCount(input_dims); int64_t element_idx = 0; - *expected_byte_size = 0; - for (size_t i = 0; i < input.Data()->BufferCount(); ++i) { - size_t content_byte_size; - TRITONSERVER_MemoryType content_memory_type; - int64_t content_memory_id; - const char* content = input.Data()->BufferAt( - i, &content_byte_size, &content_memory_type, &content_memory_id); - - while (content_byte_size >= sizeof(uint32_t)) { - if (element_idx >= element_count) { - return Status( - Status::Code::INVALID_ARG, - LogRequest() + "unexpected number of string elements " + - std::to_string(element_idx + 1) + " for inference input '" + - input_id + "', expecting " + std::to_string(element_count)); + size_t remaining_element_size = 0; + + size_t buffer_idx = 0; + const size_t buffer_count = input.Data()->BufferCount(); + + const char* buffer = nullptr; + size_t remaining_buffer_size = 0; + TRITONSERVER_MemoryType buffer_memory_type; + int64_t buffer_memory_id; + + // Validate elements until all buffers have been fully processed. + while (remaining_buffer_size || buffer_idx < buffer_count) { + // Get the next buffer if not currently processing one. + if (!remaining_buffer_size) { + // Reset remaining buffer size and pointers for next buffer. + buffer = input.Data()->BufferAt( + buffer_idx++, &remaining_buffer_size, &buffer_memory_type, + &buffer_memory_id); + *expected_byte_size += remaining_buffer_size; + + // FIXME: Handle GPU buffers + if (buffer_memory_type == TRITONSERVER_MEMORY_GPU) { + LOG_WARNING << "Validation of GPU byte size buffers is not implemented " + "yet, skipping to the next buffer."; + buffer = nullptr; + remaining_buffer_size = 0; + continue; } + } - const uint32_t len = *(reinterpret_cast(content)); - content += sizeof(uint32_t); - content_byte_size -= sizeof(uint32_t); - *expected_byte_size += sizeof(uint32_t); - - if (content_byte_size < len) { + constexpr size_t kElementSizeIndicator = sizeof(uint32_t); + // Get the next element if not currently processing one. + if (!remaining_element_size) { + // FIXME: Assume the string element's byte size indicator is not spread + // across buffer boundaries for simplicity. + if (remaining_buffer_size < kElementSizeIndicator) { return Status( Status::Code::INVALID_ARG, - LogRequest() + "incomplete string data for inference input '" + - input_id + "', expecting string of length " + - std::to_string(len) + " but only " + - std::to_string(content_byte_size) + " bytes available"); + LogRequest() + + "Element byte size indicator exceeds the end of the buffer."); } - content += len; - content_byte_size -= len; - *expected_byte_size += len; + // Start the next element and reset the remaining element size. 
+      remaining_element_size = *(reinterpret_cast<const uint32_t*>(buffer));
+      element_idx++;
+
+      // Advance pointer and remainder by the indicator size.
+      buffer += kElementSizeIndicator;
+      remaining_buffer_size -= kElementSizeIndicator;
+    }
+
+    // If the remaining buffer fits it: consume the rest of the element, proceed
+    // to the next element.
+    if (remaining_buffer_size >= remaining_element_size) {
+      buffer += remaining_element_size;
+      remaining_buffer_size -= remaining_element_size;
+      remaining_element_size = 0;
+    }
+    // Otherwise the remaining element is larger: consume the rest of the
+    // buffer, proceed to the next buffer.
+    else {
+      remaining_element_size -= remaining_buffer_size;
+      remaining_buffer_size = 0;
     }
   }
 
+  // Validate the number of processed buffers exactly match expectations.
+  if (buffer_idx != buffer_count) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        LogRequest() + "expected to process " + std::to_string(buffer_count) +
+            " buffers for inference input '" + input_id + "', only processed " +
+            std::to_string(buffer_idx));
+  }
+
+  // FIXME: If the input contains GPU buffers that get skipped, the element
+  // count will likely not match expectations.
+  // Validate the number of processed elements exactly match expectations.
   if (element_idx != element_count) {
     return Status(
         Status::Code::INVALID_ARG,

From 0a79085e98e8eecd149fa21b952885d11bb0ebd7 Mon Sep 17 00:00:00 2001
From: Ryan McCormick
Date: Mon, 3 Jun 2024 20:27:53 -0700
Subject: [PATCH 02/10] Be more explicit about unhandled case and just skip validation for GPU buffers

---
 src/infer_request.cc | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/infer_request.cc b/src/infer_request.cc
index 34322294d..fb7dc14d5 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -1186,6 +1186,11 @@ InferenceRequest::Normalize()
     if (data_type == inference::DataType::TYPE_STRING) {
       RETURN_IF_ERROR(
           ValidateBytesInputs(input_id, input, &expected_byte_size));
+      // FIXME: -1 is used as a signal to skip total byte size validation for
+      // unhandled cases in ValidateBytesInputs.
+      if (expected_byte_size == -1) {
+        return Status::Success;
+      }
     } else {
       expected_byte_size = triton::common::GetByteSize(data_type, input_dims);
     }
@@ -1294,13 +1299,15 @@ InferenceRequest::ValidateBytesInputs(
           &buffer_memory_id);
       *expected_byte_size += remaining_buffer_size;
 
-      // FIXME: Handle GPU buffers
+      // FIXME: Skip GPU buffers for now, return an expected_byte_size of -1 as
+      // a signal to skip validation.
       if (buffer_memory_type == TRITONSERVER_MEMORY_GPU) {
-        LOG_WARNING << "Validation of GPU byte size buffers is not implemented "
-                       "yet, skipping to the next buffer.";
-        buffer = nullptr;
-        remaining_buffer_size = 0;
-        continue;
+        LOG_WARNING << LogRequest()
+                    << "Validation of GPU byte size buffers is not implemented "
+                       "yet, skipping validation for input: "
+                    << input_id;
+        *expected_byte_size = -1;
+        return Status::Success;
       }
     }
@@ -1349,8 +1356,6 @@ InferenceRequest::ValidateBytesInputs(
             std::to_string(buffer_idx));
   }
 
-  // FIXME: If the input contains GPU buffers that get skipped, the element
-  // count will likely not match expectations.
   // Validate the number of processed elements exactly match expectations.
if (element_idx != element_count) { return Status( From 70220019b67109bb87809c71a5f0208e0ba5a6f2 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 4 Jun 2024 15:37:19 -0700 Subject: [PATCH 03/10] Remove warning log, don't return early --- src/infer_request.cc | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index fb7dc14d5..b280e5593 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1180,6 +1180,7 @@ InferenceRequest::Normalize() const auto& data_type = input.DType(); const auto& input_dims = input.ShapeWithBatchDim(); int64_t expected_byte_size = INT_MAX; + bool skip_byte_size_check = false; // Because Triton expects STRING type to be in special format // (prepend 4 bytes to specify string length), so need to add all the // first 4 bytes for each element to find expected byte size @@ -1188,14 +1189,15 @@ InferenceRequest::Normalize() ValidateBytesInputs(input_id, input, &expected_byte_size)); // FIXME: -1 is used as a signal to skip total byte size validation for // unhandled cases in ValidateBytesInputs. - if (expected_byte_size == -1) { - return Status::Success; - } + skip_byte_size_check = (expected_byte_size == -1); } else { expected_byte_size = triton::common::GetByteSize(data_type, input_dims); } - if ((byte_size > INT_MAX) || - (static_cast(byte_size) != expected_byte_size)) { + + bool byte_size_valid = + (byte_size > INT_MAX) || + (static_cast(byte_size) != expected_byte_size); + if (!skip_byte_size_check && byte_size_valid) { return Status( Status::Code::INVALID_ARG, LogRequest() + "input byte size mismatch for input '" + input_id + @@ -1302,10 +1304,6 @@ InferenceRequest::ValidateBytesInputs( // FIXME: Skip GPU buffers for now, return an expected_byte_size of -1 as // a signal to skip validation. if (buffer_memory_type == TRITONSERVER_MEMORY_GPU) { - LOG_WARNING << LogRequest() - << "Validation of GPU byte size buffers is not implemented " - "yet, skipping validation for input: " - << input_id; *expected_byte_size = -1; return Status::Success; } From ef9daa26866d7f1d2b3fab040b081d5e7c103818 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Tue, 4 Jun 2024 19:30:32 -0700 Subject: [PATCH 04/10] Update input calls --- src/infer_request.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index b280e5593..0ab0a2970 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1284,7 +1284,7 @@ InferenceRequest::ValidateBytesInputs( size_t remaining_element_size = 0; size_t buffer_idx = 0; - const size_t buffer_count = input.Data()->BufferCount(); + const size_t buffer_count = input.DataBufferCount(); const char* buffer = nullptr; size_t remaining_buffer_size = 0; @@ -1296,9 +1296,9 @@ InferenceRequest::ValidateBytesInputs( // Get the next buffer if not currently processing one. if (!remaining_buffer_size) { // Reset remaining buffer size and pointers for next buffer. 
- buffer = input.Data()->BufferAt( - buffer_idx++, &remaining_buffer_size, &buffer_memory_type, - &buffer_memory_id); + RETURN_IF_ERROR(input.DataBuffer( + buffer_idx++, (const void**)(&buffer), &remaining_buffer_size, + &buffer_memory_type, &buffer_memory_id)); *expected_byte_size += remaining_buffer_size; // FIXME: Skip GPU buffers for now, return an expected_byte_size of -1 as From 6a171fcef1b99406481e061c077e5af717ca747b Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 5 Jun 2024 01:23:07 -0700 Subject: [PATCH 05/10] update function ValidateBytesInputs --- src/infer_request.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 0ab0a2970..0934fc8fd 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1301,6 +1301,13 @@ InferenceRequest::ValidateBytesInputs( &buffer_memory_type, &buffer_memory_id)); *expected_byte_size += remaining_buffer_size; + if (buffer_idx > buffer_count) { + return Status( + Status::Code::INVALID_ARG, + LogRequest() + + "element strings exceed the end of the last buffer."); + } + // FIXME: Skip GPU buffers for now, return an expected_byte_size of -1 as // a signal to skip validation. if (buffer_memory_type == TRITONSERVER_MEMORY_GPU) { @@ -1318,7 +1325,7 @@ InferenceRequest::ValidateBytesInputs( return Status( Status::Code::INVALID_ARG, LogRequest() + - "Element byte size indicator exceeds the end of the buffer."); + "element byte size indicator exceeds the end of the buffer."); } // Start the next element and reset the remaining element size. @@ -1349,8 +1356,8 @@ InferenceRequest::ValidateBytesInputs( if (buffer_idx != buffer_count) { return Status( Status::Code::INVALID_ARG, - LogRequest() + "expected to process " + std::to_string(buffer_count) + - " buffers for inference input '" + input_id + "', only processed " + + LogRequest() + "expected " + std::to_string(buffer_count) + + " buffers for inference input '" + input_id + "', got " + std::to_string(buffer_idx)); } @@ -1359,7 +1366,7 @@ InferenceRequest::ValidateBytesInputs( return Status( Status::Code::INVALID_ARG, LogRequest() + "expected " + std::to_string(element_count) + - " strings for inference input '" + input_id + "', got " + + " string elements for inference input '" + input_id + "', got " + std::to_string(element_idx)); } From e8656fa87664388d81a2c08f0e73bac6be767f5d Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 5 Jun 2024 08:43:54 -0700 Subject: [PATCH 06/10] skip input byte-size checks for TensorRT --- src/infer_request.cc | 66 +++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 0934fc8fd..9d98212a0 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1180,30 +1180,40 @@ InferenceRequest::Normalize() const auto& data_type = input.DType(); const auto& input_dims = input.ShapeWithBatchDim(); int64_t expected_byte_size = INT_MAX; + + // Skip byte size validation for TensorRT backend because it breaks + // shape-size assumption bool skip_byte_size_check = false; - // Because Triton expects STRING type to be in special format - // (prepend 4 bytes to specify string length), so need to add all the - // first 4 bytes for each element to find expected byte size - if (data_type == inference::DataType::TYPE_STRING) { - RETURN_IF_ERROR( - ValidateBytesInputs(input_id, input, &expected_byte_size)); - // FIXME: -1 is used as a signal to skip total byte size validation for - // unhandled cases 
in ValidateBytesInputs. - skip_byte_size_check = (expected_byte_size == -1); - } else { - expected_byte_size = triton::common::GetByteSize(data_type, input_dims); - } + constexpr char trt_prefix[] = "tensorrt_"; + const std::string& platform = model_raw_->Config().platform(); + skip_byte_size_check |= (platform.rfind(trt_prefix) == 0); + + if (!skip_byte_size_check) { + // Because Triton expects STRING type to be in special format + // (prepend 4 bytes to specify string length), so need to add all the + // first 4 bytes for each element to find expected byte size + if (data_type == inference::DataType::TYPE_STRING) { + RETURN_IF_ERROR( + ValidateBytesInputs(input_id, input, &expected_byte_size)); + // FIXME: -1 is used as a signal to skip total byte size validation + // for unhandled cases in ValidateBytesInputs. + skip_byte_size_check |= (expected_byte_size == -1); + } else { + expected_byte_size = + triton::common::GetByteSize(data_type, input_dims); + } - bool byte_size_valid = - (byte_size > INT_MAX) || - (static_cast(byte_size) != expected_byte_size); - if (!skip_byte_size_check && byte_size_valid) { - return Status( - Status::Code::INVALID_ARG, - LogRequest() + "input byte size mismatch for input '" + input_id + - "' for model '" + ModelName() + "'. Expected " + - std::to_string(expected_byte_size) + ", got " + - std::to_string(byte_size)); + bool byte_size_valid = + (byte_size > INT_MAX) || + (static_cast(byte_size) != expected_byte_size); + if (!skip_byte_size_check && byte_size_valid) { + return Status( + Status::Code::INVALID_ARG, + LogRequest() + "input byte size mismatch for input '" + input_id + + "' for model '" + ModelName() + "'. Expected " + + std::to_string(expected_byte_size) + ", got " + + std::to_string(byte_size)); + } } } } @@ -1283,7 +1293,7 @@ InferenceRequest::ValidateBytesInputs( int64_t element_idx = 0; size_t remaining_element_size = 0; - size_t buffer_idx = 0; + size_t buffer_next_idx = 0; const size_t buffer_count = input.DataBufferCount(); const char* buffer = nullptr; @@ -1292,16 +1302,16 @@ InferenceRequest::ValidateBytesInputs( int64_t buffer_memory_id; // Validate elements until all buffers have been fully processed. - while (remaining_buffer_size || buffer_idx < buffer_count) { + while (remaining_buffer_size || buffer_next_idx < buffer_count) { // Get the next buffer if not currently processing one. if (!remaining_buffer_size) { // Reset remaining buffer size and pointers for next buffer. RETURN_IF_ERROR(input.DataBuffer( - buffer_idx++, (const void**)(&buffer), &remaining_buffer_size, + buffer_next_idx++, (const void**)(&buffer), &remaining_buffer_size, &buffer_memory_type, &buffer_memory_id)); *expected_byte_size += remaining_buffer_size; - if (buffer_idx > buffer_count) { + if (buffer_next_idx > buffer_count) { return Status( Status::Code::INVALID_ARG, LogRequest() + @@ -1353,12 +1363,12 @@ InferenceRequest::ValidateBytesInputs( } // Validate the number of processed buffers exactly match expectations. - if (buffer_idx != buffer_count) { + if (buffer_next_idx != buffer_count) { return Status( Status::Code::INVALID_ARG, LogRequest() + "expected " + std::to_string(buffer_count) + " buffers for inference input '" + input_id + "', got " + - std::to_string(buffer_idx)); + std::to_string(buffer_next_idx)); } // Validate the number of processed elements exactly match expectations. 
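
For context on what PATCH 01/10 through 06/10 are walking: Triton serializes TYPE_STRING (BYTES) tensors as a stream of elements, each a 4-byte length indicator followed by that many raw bytes, and with raw binary requests a single element's bytes can straddle two request buffers. The standalone sketch below is illustrative only; the names (ValidateBytesBuffers, kElementSizeIndicator, the vector-of-vectors buffer type) are stand-ins rather than Triton APIs. It mirrors the buffer-spanning loop introduced in PATCH 01/10 (later refined in PATCH 02-06), including that patch's stated assumption that the 4-byte indicator itself never straddles a buffer boundary.

// Standalone illustration of the buffer-spanning validation loop from PATCH
// 01/10. All names here are illustrative stand-ins, not Triton APIs.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Each buffer holds a chunk of the serialized BYTES tensor: a stream of
// [4-byte length][payload] records whose payloads may be split across chunks.
bool ValidateBytesBuffers(
    const std::vector<std::vector<char>>& buffers, int64_t expected_elements)
{
  constexpr size_t kElementSizeIndicator = sizeof(uint32_t);
  int64_t elements_seen = 0;
  size_t remaining_element_size = 0;

  size_t buffer_idx = 0;
  const char* cursor = nullptr;
  size_t remaining_buffer_size = 0;

  while (remaining_buffer_size > 0 || buffer_idx < buffers.size()) {
    // Fetch the next buffer once the current one is exhausted.
    if (remaining_buffer_size == 0) {
      cursor = buffers[buffer_idx].data();
      remaining_buffer_size = buffers[buffer_idx].size();
      ++buffer_idx;
    }

    // Start a new element if the previous one has been fully consumed.
    if (remaining_element_size == 0) {
      // Like the patch, assume the 4-byte indicator never straddles a
      // buffer boundary.
      if (remaining_buffer_size < kElementSizeIndicator) {
        return false;  // indicator exceeds the end of the buffer
      }
      uint32_t len = 0;
      std::memcpy(&len, cursor, sizeof(len));
      remaining_element_size = len;
      ++elements_seen;
      cursor += kElementSizeIndicator;
      remaining_buffer_size -= kElementSizeIndicator;
    }

    // Consume as much of the current element as this buffer holds; anything
    // left over is expected at the start of the next buffer.
    const size_t consumed =
        std::min(remaining_element_size, remaining_buffer_size);
    cursor += consumed;
    remaining_buffer_size -= consumed;
    remaining_element_size -= consumed;
  }

  return (remaining_element_size == 0) && (elements_seen == expected_elements);
}

int main()
{
  // Serialize two string elements, then split the stream so the second
  // element's payload straddles the buffer boundary.
  std::vector<char> stream;
  for (const std::string& s : {std::string("abc"), std::string("defgh")}) {
    const uint32_t len = static_cast<uint32_t>(s.size());
    const char* p = reinterpret_cast<const char*>(&len);
    stream.insert(stream.end(), p, p + sizeof(len));
    stream.insert(stream.end(), s.begin(), s.end());
  }
  std::vector<std::vector<char>> buffers;
  buffers.emplace_back(stream.begin(), stream.begin() + 12);
  buffers.emplace_back(stream.begin() + 12, stream.end());
  std::cout << std::boolalpha << ValidateBytesBuffers(buffers, 2) << "\n";  // true
  return 0;
}
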
From a40429b33684d33495bdff77225a6d9f40b7b252 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Wed, 5 Jun 2024 10:17:38 -0700 Subject: [PATCH 07/10] Rename byte_size_valid to byte_size_invalid --- src/infer_request.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 9d98212a0..e16cf34d7 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1203,10 +1203,10 @@ InferenceRequest::Normalize() triton::common::GetByteSize(data_type, input_dims); } - bool byte_size_valid = + bool byte_size_invalid = (byte_size > INT_MAX) || (static_cast(byte_size) != expected_byte_size); - if (!skip_byte_size_check && byte_size_valid) { + if (!skip_byte_size_check && byte_size_invalid) { return Status( Status::Code::INVALID_ARG, LogRequest() + "input byte size mismatch for input '" + input_id + From 6037ab85200f3314335741fa8981f5f4ae8a5bc7 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 6 Jun 2024 11:51:02 -0700 Subject: [PATCH 08/10] Minor updates --- src/infer_request.cc | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index e16cf34d7..a0a8d4b53 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1181,23 +1181,26 @@ InferenceRequest::Normalize() const auto& input_dims = input.ShapeWithBatchDim(); int64_t expected_byte_size = INT_MAX; - // Skip byte size validation for TensorRT backend because it breaks - // shape-size assumption + // FIXME: Skip byte size validation for TensorRT backend because it breaks + // shape-size assumption. See DLIS-6805 for proper fix for TRT backend + // reformat_free tensors. bool skip_byte_size_check = false; constexpr char trt_prefix[] = "tensorrt_"; const std::string& platform = model_raw_->Config().platform(); skip_byte_size_check |= (platform.rfind(trt_prefix) == 0); if (!skip_byte_size_check) { + TRITONSERVER_MemoryType input_memory_type; // Because Triton expects STRING type to be in special format // (prepend 4 bytes to specify string length), so need to add all the // first 4 bytes for each element to find expected byte size if (data_type == inference::DataType::TYPE_STRING) { - RETURN_IF_ERROR( - ValidateBytesInputs(input_id, input, &expected_byte_size)); - // FIXME: -1 is used as a signal to skip total byte size validation - // for unhandled cases in ValidateBytesInputs. - skip_byte_size_check |= (expected_byte_size == -1); + RETURN_IF_ERROR(ValidateBytesInputs( + input_id, input, &expected_byte_size, &input_memory_type)); + // FIXME: Temporarily skips byte size checks for GPU tensors. See + // DLIS-6820. 
+ skip_byte_size_check |= + (input_memory_type == TRITONSERVER_MEMORY_GPU); } else { expected_byte_size = triton::common::GetByteSize(data_type, input_dims); @@ -1284,13 +1287,14 @@ InferenceRequest::ValidateRequestInputs() Status InferenceRequest::ValidateBytesInputs( const std::string& input_id, const Input& input, - int64_t* const expected_byte_size) const + int64_t* const expected_byte_size, + TRITONSERVER_MemoryType* buffer_memory_type) const { *expected_byte_size = 0; const auto& input_dims = input.ShapeWithBatchDim(); int64_t element_count = triton::common::GetElementCount(input_dims); - int64_t element_idx = 0; + int64_t element_checked = 0; size_t remaining_element_size = 0; size_t buffer_next_idx = 0; @@ -1298,7 +1302,6 @@ InferenceRequest::ValidateBytesInputs( const char* buffer = nullptr; size_t remaining_buffer_size = 0; - TRITONSERVER_MemoryType buffer_memory_type; int64_t buffer_memory_id; // Validate elements until all buffers have been fully processed. @@ -1308,20 +1311,12 @@ InferenceRequest::ValidateBytesInputs( // Reset remaining buffer size and pointers for next buffer. RETURN_IF_ERROR(input.DataBuffer( buffer_next_idx++, (const void**)(&buffer), &remaining_buffer_size, - &buffer_memory_type, &buffer_memory_id)); + buffer_memory_type, &buffer_memory_id)); *expected_byte_size += remaining_buffer_size; - if (buffer_next_idx > buffer_count) { - return Status( - Status::Code::INVALID_ARG, - LogRequest() + - "element strings exceed the end of the last buffer."); - } - // FIXME: Skip GPU buffers for now, return an expected_byte_size of -1 as // a signal to skip validation. - if (buffer_memory_type == TRITONSERVER_MEMORY_GPU) { - *expected_byte_size = -1; + if (*buffer_memory_type == TRITONSERVER_MEMORY_GPU) { return Status::Success; } } @@ -1340,7 +1335,7 @@ InferenceRequest::ValidateBytesInputs( // Start the next element and reset the remaining element size. remaining_element_size = *(reinterpret_cast(buffer)); - element_idx++; + element_checked++; // Advance pointer and remainder by the indicator size. buffer += kElementSizeIndicator; @@ -1372,12 +1367,12 @@ InferenceRequest::ValidateBytesInputs( } // Validate the number of processed elements exactly match expectations. 
- if (element_idx != element_count) { + if (element_checked != element_count) { return Status( Status::Code::INVALID_ARG, LogRequest() + "expected " + std::to_string(element_count) + " string elements for inference input '" + input_id + "', got " + - std::to_string(element_idx)); + std::to_string(element_checked)); } return Status::Success; From 272ef2905d9a2bcaa57215794ab98aa96a35cec9 Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 6 Jun 2024 12:04:29 -0700 Subject: [PATCH 09/10] Update header --- src/infer_request.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/infer_request.h b/src/infer_request.h index c97ef8039..1dc0310cc 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -749,7 +749,8 @@ class InferenceRequest { Status ValidateBytesInputs( const std::string& input_id, const Input& input, - int64_t* const expected_byte_size) const; + int64_t* const expected_byte_size, + TRITONSERVER_MemoryType* buffer_memory_type) const; // Helpers for pending request metrics void IncrementPendingRequestCount(); From 0e3b63bda47477f2a5eede6bedd0c924bd72136c Mon Sep 17 00:00:00 2001 From: Yingge He Date: Thu, 6 Jun 2024 13:55:53 -0700 Subject: [PATCH 10/10] Remove redundant checks --- src/infer_request.cc | 36 ++++++++++++++---------------------- src/infer_request.h | 1 - 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index a0a8d4b53..4ea687538 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1176,10 +1176,7 @@ InferenceRequest::Normalize() // Note: Since we're using normalized input.ShapeWithBatchDim() here, // make sure that all the normalization is before the check. { - const size_t& byte_size = input.Data()->TotalByteSize(); const auto& data_type = input.DType(); - const auto& input_dims = input.ShapeWithBatchDim(); - int64_t expected_byte_size = INT_MAX; // FIXME: Skip byte size validation for TensorRT backend because it breaks // shape-size assumption. See DLIS-6805 for proper fix for TRT backend @@ -1195,27 +1192,27 @@ InferenceRequest::Normalize() // (prepend 4 bytes to specify string length), so need to add all the // first 4 bytes for each element to find expected byte size if (data_type == inference::DataType::TYPE_STRING) { - RETURN_IF_ERROR(ValidateBytesInputs( - input_id, input, &expected_byte_size, &input_memory_type)); + RETURN_IF_ERROR( + ValidateBytesInputs(input_id, input, &input_memory_type)); // FIXME: Temporarily skips byte size checks for GPU tensors. See // DLIS-6820. skip_byte_size_check |= (input_memory_type == TRITONSERVER_MEMORY_GPU); } else { + const auto& input_dims = input.ShapeWithBatchDim(); + int64_t expected_byte_size = INT_MAX; expected_byte_size = triton::common::GetByteSize(data_type, input_dims); - } - - bool byte_size_invalid = - (byte_size > INT_MAX) || - (static_cast(byte_size) != expected_byte_size); - if (!skip_byte_size_check && byte_size_invalid) { - return Status( - Status::Code::INVALID_ARG, - LogRequest() + "input byte size mismatch for input '" + input_id + - "' for model '" + ModelName() + "'. Expected " + - std::to_string(expected_byte_size) + ", got " + - std::to_string(byte_size)); + const size_t& byte_size = input.Data()->TotalByteSize(); + if ((byte_size > INT_MAX) || + (static_cast(byte_size) != expected_byte_size)) { + return Status( + Status::Code::INVALID_ARG, + LogRequest() + "input byte size mismatch for input '" + + input_id + "' for model '" + ModelName() + "'. 
Expected " + + std::to_string(expected_byte_size) + ", got " + + std::to_string(byte_size)); + } } } } @@ -1287,10 +1284,8 @@ InferenceRequest::ValidateRequestInputs() Status InferenceRequest::ValidateBytesInputs( const std::string& input_id, const Input& input, - int64_t* const expected_byte_size, TRITONSERVER_MemoryType* buffer_memory_type) const { - *expected_byte_size = 0; const auto& input_dims = input.ShapeWithBatchDim(); int64_t element_count = triton::common::GetElementCount(input_dims); @@ -1312,10 +1307,7 @@ InferenceRequest::ValidateBytesInputs( RETURN_IF_ERROR(input.DataBuffer( buffer_next_idx++, (const void**)(&buffer), &remaining_buffer_size, buffer_memory_type, &buffer_memory_id)); - *expected_byte_size += remaining_buffer_size; - // FIXME: Skip GPU buffers for now, return an expected_byte_size of -1 as - // a signal to skip validation. if (*buffer_memory_type == TRITONSERVER_MEMORY_GPU) { return Status::Success; } diff --git a/src/infer_request.h b/src/infer_request.h index 1dc0310cc..37da42b02 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -749,7 +749,6 @@ class InferenceRequest { Status ValidateBytesInputs( const std::string& input_id, const Input& input, - int64_t* const expected_byte_size, TRITONSERVER_MemoryType* buffer_memory_type) const; // Helpers for pending request metrics
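
Taken together, the series leaves Normalize() with a byte-size policy that can be summarized as: skip the check entirely for TensorRT platforms (DLIS-6805), skip it for STRING inputs whose buffers live in GPU memory (DLIS-6820), and otherwise compare the request's total byte size against the shape-derived expectation for fixed-size data types. The condensed sketch below uses hypothetical names and simplified enums; it is not the Triton implementation, only a summary of the decision logic under those assumptions.

// Condensed, hypothetical summary of the final byte-size check policy.
// Enum values, helper names, and element sizes are illustrative stand-ins.
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

enum class MemoryType { kCpu, kGpu };
enum class DataType { kString, kFp32 };

// Whether the request's total byte size should be compared against the
// shape-derived expectation at all.
bool ShouldCheckByteSize(
    const std::string& platform, DataType dtype, MemoryType string_input_memory)
{
  // TensorRT reformat-free tensors break the shape-implies-size assumption
  // (DLIS-6805), so the whole check is skipped for tensorrt_* platforms.
  if (platform.rfind("tensorrt_", 0) == 0) {
    return false;
  }
  // Walking GPU-resident STRING buffers is not implemented yet (DLIS-6820).
  if (dtype == DataType::kString && string_input_memory == MemoryType::kGpu) {
    return false;
  }
  return true;
}

// For fixed-size types the expectation is element size times element count;
// STRING inputs are instead validated element-by-element (ValidateBytesInputs).
int64_t ExpectedByteSize(DataType dtype, const std::vector<int64_t>& shape_with_batch)
{
  const int64_t element_size = (dtype == DataType::kFp32) ? 4 : -1;
  const int64_t element_count = std::accumulate(
      shape_with_batch.begin(), shape_with_batch.end(), int64_t{1},
      std::multiplies<int64_t>());
  return element_size * element_count;
}

int main()
{
  std::cout << std::boolalpha
            << ShouldCheckByteSize("tensorrt_plan", DataType::kFp32, MemoryType::kCpu)
            << "\n"  // false: TensorRT platform
            << ShouldCheckByteSize("onnxruntime_onnx", DataType::kString, MemoryType::kGpu)
            << "\n"  // false: GPU-resident string buffers
            << ExpectedByteSize(DataType::kFp32, {8, 3, 224, 224}) << "\n";  // 8*3*224*224*4
  return 0;
}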