triton-inference-server · pskiran1 · Jul 27, 2024 · Jun 28, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/src/infer_request.cc b/src/infer_request.cc
@@ -1015,6 +1015,19 @@ InferenceRequest::Normalize()
     for (auto& pr : original_inputs_) {
       auto& input = pr.second;
       *input.MutableShape() = input.OriginalShape();
+
+      // For a shape tensor, mark that the input is a shape tensor.
+      const inference::ModelInput* input_config;
+      RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
+      if (input_config->is_shape_tensor()) {
+        input.SetIsShapeTensor(true);
+      }
+
+      // If a tensor uses a non-linear IO format, indicate that the input uses a
+      // non-linear IO format.
+      if (input_config->is_non_linear_format_io()) {
+        input.SetIsNonLinearFormatIo(true);
+      }
     }
   } else {
     // Model does support Triton-style batching so each input tensor
@@ -1024,11 +1037,17 @@ InferenceRequest::Normalize()
     batch_size_ = 0;
     for (auto& pr : original_inputs_) {
       auto& input = pr.second;
+      const inference::ModelInput* input_config;
+      RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
+
+      // If a tensor uses a non-linear IO format, indicate that the input uses a
+      // non-linear IO format.
+      if (input_config->is_non_linear_format_io()) {
+        input.SetIsNonLinearFormatIo(true);
+      }
 
       // For a shape tensor, keep the tensor's shape as it is and mark
       // that the input is a shape tensor.
-      const inference::ModelInput* input_config;
-      RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
       if (input_config->is_shape_tensor()) {
         *input.MutableShape() = input.OriginalShape();
         input.SetIsShapeTensor(true);
@@ -1182,28 +1201,26 @@ InferenceRequest::Normalize()
     {
       const auto& data_type = input.DType();
 
-      // FIXME: Skip byte size validation for TensorRT backend because it breaks
-      // shape-size assumption. See DLIS-6805 for proper fix for TRT backend
-      // reformat_free tensors.
-      bool skip_byte_size_check = false;
-      constexpr char trt_prefix[] = "tensorrt_";
-      const std::string& platform = model_raw_->Config().platform();
-      skip_byte_size_check |= (platform.rfind(trt_prefix) == 0);
-
-      if (!skip_byte_size_check) {
+      // Non-linear IO format input byte size validation will be handled in the
+      // backend.
+      if (!input.IsNonLinearFormatIo()) {
         TRITONSERVER_MemoryType input_memory_type;
         // Because Triton expects STRING type to be in special format
         // (prepend 4 bytes to specify string length), so need to add all the
         // first 4 bytes for each element to find expected byte size
         if (data_type == inference::DataType::TYPE_STRING) {
           RETURN_IF_ERROR(
               ValidateBytesInputs(input_id, input, &input_memory_type));
+
           // FIXME: Temporarily skips byte size checks for GPU tensors. See
           // DLIS-6820.
-          skip_byte_size_check |=
-              (input_memory_type == TRITONSERVER_MEMORY_GPU);
         } else {
-          const auto& input_dims = input.ShapeWithBatchDim();
+          // Shape tensor with dynamic batching does not introduce a new
+          // dimension to the tensor but adds an additional value to the 1-D
+          // array.
+          const std::vector<int64_t>& input_dims =
+              input.IsShapeTensor() ? input.OriginalShape()
+                                    : input.ShapeWithBatchDim();
           int64_t expected_byte_size = INT_MAX;
           expected_byte_size =
               triton::common::GetByteSize(data_type, input_dims);
@@ -1506,8 +1523,8 @@ InferenceRequest::ReportStatisticsCacheHit(MetricModelReporter* metric_reporter)
 // Input
 //
 InferenceRequest::Input::Input()
-    : is_shape_tensor_(false), data_(new MemoryReference),
-      has_host_policy_specific_data_(false)
+    : is_shape_tensor_(false), is_non_linear_format_io_(false),
+      data_(new MemoryReference), has_host_policy_specific_data_(false)
 {
 }
 
@@ -1516,16 +1533,17 @@ InferenceRequest::Input::Input(
     const int64_t* shape, const uint64_t dim_count)
     : name_(name), datatype_(datatype),
       original_shape_(shape, shape + dim_count), is_shape_tensor_(false),
-      data_(new MemoryReference), has_host_policy_specific_data_(false)
+      is_non_linear_format_io_(false), data_(new MemoryReference),
+      has_host_policy_specific_data_(false)
 {
 }
 
 InferenceRequest::Input::Input(
     const std::string& name, const inference::DataType datatype,
     const std::vector<int64_t>& shape)
     : name_(name), datatype_(datatype), original_shape_(shape),
-      is_shape_tensor_(false), data_(new MemoryReference),
-      has_host_policy_specific_data_(false)
+      is_shape_tensor_(false), is_non_linear_format_io_(false),
+      data_(new MemoryReference), has_host_policy_specific_data_(false)
 {
 }
 
@@ -1546,6 +1564,14 @@ InferenceRequest::Input::SetIsShapeTensor(const bool is_shape_tensor)
   return Status::Success;
 }
 
+Status
+InferenceRequest::Input::SetIsNonLinearFormatIo(
+    const bool is_non_linear_format_io)
+{
+  is_non_linear_format_io_ = is_non_linear_format_io;
+  return Status::Success;
+}
+
 const std::shared_ptr<Memory>&
 InferenceRequest::Input::Data(const std::string& host_policy_name) const
 {

diff --git a/src/infer_request.h b/src/infer_request.h
@@ -136,9 +136,15 @@ class InferenceRequest {
     // Whether or not the input is a tensorrt shape tensor
     bool IsShapeTensor() const { return is_shape_tensor_; }
 
+    // Whether or not the input uses a non-linear IO format (false by default)
+    bool IsNonLinearFormatIo() const { return is_non_linear_format_io_; }
+
     // Set the input to be treated as a shape tensor.
     Status SetIsShapeTensor(const bool is_shape_tensor);
 
+    // Set whether the input is treated as using a non-linear IO format.
+    Status SetIsNonLinearFormatIo(const bool is_non_linear_format_io);
+
     // The data for this input.
     const std::shared_ptr<Memory>& Data() const { return data_; }
 
@@ -241,6 +247,7 @@ class InferenceRequest {
     std::vector<int64_t> shape_;
     std::vector<int64_t> shape_with_batch_dim_;
     bool is_shape_tensor_;
+    bool is_non_linear_format_io_;
     std::shared_ptr<Memory> data_;
 
     bool has_host_policy_specific_data_;

diff --git a/src/model_config_utils.cc b/src/model_config_utils.cc
@@ -1732,6 +1732,17 @@ ValidateModelInput(
         "shape tensors are only supported for TensorRT platform");
   }
 
+  if ((platform != kTensorRTPlanPlatform) && io.is_non_linear_format_io()) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Non-linear IO format is only supported for the TensorRT platform");
+  }
+
+  if (io.is_non_linear_format_io() && (io.dims_size() != 3)) {
+    return Status(
+        Status::Code::INVALID_ARG, "Non-linear IO format input require 3 dims");
+  }
+
   return Status::Success;
 }
 
@@ -1768,6 +1779,12 @@ ValidateModelOutput(
         "shape tensors are only supported for TensorRT platform");
   }
 
+  if ((platform != kTensorRTPlanPlatform) && io.is_non_linear_format_io()) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Non-linear IO format is only supported for the TensorRT platform");
+  }
+
   return Status::Success;
 }