triton-inference-server · pskiran1 · Jul 27, 2024 · Jun 28, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/src/infer_request.cc b/src/infer_request.cc
@@ -1015,6 +1015,17 @@ InferenceRequest::Normalize()
     for (auto& pr : original_inputs_) {
       auto& input = pr.second;
       *input.MutableShape() = input.OriginalShape();
+
+      const inference::ModelInput* input_config;
+      RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
+      if (input_config->is_shape_tensor()) {
+        // For a shape tensor, mark that the input is a shape tensor.
+        input.SetIsShapeTensor();
+      } else if (input_config->is_non_linear_format_io()) {
+        // The model config declares a non-linear IO format for this input,
+        // so flag the request input accordingly.
+        input.SetIsNonLinearFormatIo();
+      }
     }
   } else {
     // Model does support Triton-style batching so each input tensor
@@ -1024,15 +1035,19 @@ InferenceRequest::Normalize()
     batch_size_ = 0;
     for (auto& pr : original_inputs_) {
       auto& input = pr.second;
+      const inference::ModelInput* input_config;
+      RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
 
       // For a shape tensor, keep the tensor's shape as it is and mark
       // that the input is a shape tensor.
-      const inference::ModelInput* input_config;
-      RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
       if (input_config->is_shape_tensor()) {
         *input.MutableShape() = input.OriginalShape();
-        input.SetIsShapeTensor(true);
+        input.SetIsShapeTensor();
         continue;
+      } else if (input_config->is_non_linear_format_io()) {
+        // The model config declares a non-linear IO format for this input,
+        // so flag the request input accordingly.
+        input.SetIsNonLinearFormatIo();
       }
 
       if (input.OriginalShape().size() == 0) {
@@ -1182,28 +1197,21 @@ InferenceRequest::Normalize()
     {
       const auto& data_type = input.DType();
 
-      // FIXME: Skip byte size validation for TensorRT backend because it breaks
-      // shape-size assumption. See DLIS-6805 for proper fix for TRT backend
-      // reformat_free tensors.
-      bool skip_byte_size_check = false;
-      constexpr char trt_prefix[] = "tensorrt_";
-      const std::string& platform = model_raw_->Config().platform();
-      skip_byte_size_check |= (platform.rfind(trt_prefix) == 0);
-
-      if (!skip_byte_size_check) {
+      // Skip the byte size check here for non-linear IO format inputs; that
+      // validation will be handled in the TensorRT backend.
+      if (!input.IsNonLinearFormatIo()) {
         TRITONSERVER_MemoryType input_memory_type;
         // Because Triton expects STRING type to be in special format
         // (prepend 4 bytes to specify string length), so need to add all the
         // first 4 bytes for each element to find expected byte size
         if (data_type == inference::DataType::TYPE_STRING) {
           RETURN_IF_ERROR(
               ValidateBytesInputs(input_id, input, &input_memory_type));
+
           // FIXME: Temporarily skips byte size checks for GPU tensors. See
           // DLIS-6820.
-          skip_byte_size_check |=
-              (input_memory_type == TRITONSERVER_MEMORY_GPU);
         } else {
-          const auto& input_dims = input.ShapeWithBatchDim();
+          const std::vector<int64_t>& input_dims = input.ShapeWithBatchDim();
           int64_t expected_byte_size = INT_MAX;
           expected_byte_size =
               triton::common::GetByteSize(data_type, input_dims);
@@ -1506,7 +1514,7 @@ InferenceRequest::ReportStatisticsCacheHit(MetricModelReporter* metric_reporter)
 // Input
 //
 InferenceRequest::Input::Input()
-    : is_shape_tensor_(false), data_(new MemoryReference),
+    : tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
       has_host_policy_specific_data_(false)
 {
 }
@@ -1515,16 +1523,17 @@ InferenceRequest::Input::Input(
     const std::string& name, const inference::DataType datatype,
     const int64_t* shape, const uint64_t dim_count)
     : name_(name), datatype_(datatype),
-      original_shape_(shape, shape + dim_count), is_shape_tensor_(false),
-      data_(new MemoryReference), has_host_policy_specific_data_(false)
+      original_shape_(shape, shape + dim_count),
+      tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
+      has_host_policy_specific_data_(false)
 {
 }
 
 InferenceRequest::Input::Input(
     const std::string& name, const inference::DataType datatype,
     const std::vector<int64_t>& shape)
     : name_(name), datatype_(datatype), original_shape_(shape),
-      is_shape_tensor_(false), data_(new MemoryReference),
+      tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
       has_host_policy_specific_data_(false)
 {
 }
@@ -1540,9 +1549,16 @@ InferenceRequest::Input::SetMetadata(
 }
 
 Status
-InferenceRequest::Input::SetIsShapeTensor(const bool is_shape_tensor)
+InferenceRequest::Input::SetIsShapeTensor()
+{
+  tensor_type_ = TensorType::SHAPE_TENSOR;
+  return Status::Success;
+}
+
+Status
+InferenceRequest::Input::SetIsNonLinearFormatIo()
 {
-  is_shape_tensor_ = is_shape_tensor;
+  tensor_type_ = TensorType::NON_LINEAR;
   return Status::Success;
 }
 

diff --git a/src/infer_request.h b/src/infer_request.h
@@ -82,6 +82,8 @@ class InferenceRequest {
   // Input tensor
   class Input {
    public:
+    enum class TensorType { TENSOR, SHAPE_TENSOR, NON_LINEAR };
+
     Input();
     Input(
         const std::string& name, const inference::DataType datatype,
@@ -120,7 +122,14 @@ class InferenceRequest {
     // into batch + shape.
     const std::vector<int64_t>& ShapeWithBatchDim() const
     {
-      return shape_with_batch_dim_;
+      if (tensor_type_ == TensorType::SHAPE_TENSOR) {
+        // Shape tensor with dynamic batching does not introduce a new
+        // dimension to the tensor but adds an additional value to the 1-D
+        // array.
+        return original_shape_;
+      } else {
+        return shape_with_batch_dim_;
+      }
     }
     std::vector<int64_t>* MutableShapeWithBatchDim()
     {
@@ -134,10 +143,22 @@ class InferenceRequest {
     }
 
     // Whether or not the input is a tensorrt shape tensor
-    bool IsShapeTensor() const { return is_shape_tensor_; }
+    bool IsShapeTensor() const
+    {
+      return tensor_type_ == TensorType::SHAPE_TENSOR;
+    }
+
+    // Specifies whether the input uses a non-linear IO format
+    bool IsNonLinearFormatIo() const
+    {
+      return tensor_type_ == TensorType::NON_LINEAR;
+    }
 
     // Set the input to be treated as a shape tensor.
-    Status SetIsShapeTensor(const bool is_shape_tensor);
+    Status SetIsShapeTensor();
+
+    // Mark the input as using a non-linear IO format.
+    Status SetIsNonLinearFormatIo();
 
     // The data for this input.
     const std::shared_ptr<Memory>& Data() const { return data_; }
@@ -240,7 +261,7 @@ class InferenceRequest {
     std::vector<int64_t> original_shape_;
     std::vector<int64_t> shape_;
     std::vector<int64_t> shape_with_batch_dim_;
-    bool is_shape_tensor_;
+    TensorType tensor_type_;
     std::shared_ptr<Memory> data_;
 
     bool has_host_policy_specific_data_;

diff --git a/src/model_config_utils.cc b/src/model_config_utils.cc
@@ -1712,6 +1712,26 @@ ValidateInstanceGroup(
   return Status::Success;
 }
 
+Status
+ValidateNonLinearFormatIO(
+    const inference::ModelInput& io, const std::string& platform, bool is_input)
+{
+  if ((platform != kTensorRTPlanPlatform) && io.is_non_linear_format_io()) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Non-linear IO format is only supported for the TensorRT platform");
+  }
+
+  if (io.is_non_linear_format_io() && (io.dims_size() != 3)) {
+    std::string io_type = is_input ? "input" : "output";
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Non-linear IO format " + io_type + " requires 3 dims");
+  }
+
+  return Status::Success;
+}
+
 Status
 ValidateModelInput(
     const inference::ModelInput& io, int32_t max_batch_size,
@@ -1732,6 +1752,8 @@ ValidateModelInput(
         "shape tensors are only supported for TensorRT platform");
   }
 
+  RETURN_IF_ERROR(ValidateNonLinearFormatIO(io, platform, true /* is_input*/));
+
   return Status::Success;
 }
 
@@ -1768,6 +1790,8 @@ ValidateModelOutput(
         "shape tensors are only supported for TensorRT platform");
   }
 
+  RETURN_IF_ERROR(ValidateNonLinearFormatIO(io, platform, false /* is_input*/));
+
   return Status::Success;
 }
 

diff --git a/src/model_config_utils.h b/src/model_config_utils.h
@@ -172,6 +172,17 @@ Status ValidateInstanceGroup(
 /// is not valid.
 Status ValidateModelIOConfig(const inference::ModelConfig& config);
 
+/// Validate that Non-linear format inputs or outputs are specified correctly
+/// in a model configuration.
+/// \param io The model input or output to validate.
+/// \param platform The platform name.
+/// \param is_input Specifies whether it is an input or an output.
+/// \return The error status. A non-OK status indicates the configuration
+/// is not valid.
+Status ValidateNonLinearFormatIO(
+    const inference::ModelInput& io, const std::string& platform,
+    bool is_input);
+
 /// Validate that input is specified correctly in a model
 /// configuration.
 /// \param io The model input.