Merge branch 'main' into identity_ensemble_flag_ibhosale

indrajit96 authored Jun 10, 2024
2 parents 15f50ba + 10bcbd9 commit cd29411
Showing 8 changed files with 193 additions and 107 deletions.
2 changes: 1 addition & 1 deletion include/triton/core/tritonserver.h
@@ -91,7 +91,7 @@ struct TRITONSERVER_MetricFamily;
/// }
///
#define TRITONSERVER_API_VERSION_MAJOR 1
-#define TRITONSERVER_API_VERSION_MINOR 31
+#define TRITONSERVER_API_VERSION_MINOR 32

/// Get the TRITONBACKEND API version supported by the Triton shared
/// library. This value can be compared against the
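The bump to minor version 32 is what consumers of the core library observe at load time. Below is a minimal sketch (not part of this commit) of a runtime compatibility guard, using the existing TRITONSERVER_ApiVersion call declared in this header; the helper function name is illustrative:

#include "triton/core/tritonserver.h"

// Return true if the loaded core library is compatible with the headers this
// code was compiled against: same major version, equal-or-newer minor.
bool CoreApiVersionIsCompatible() {
  uint32_t major = 0, minor = 0;
  TRITONSERVER_Error* err = TRITONSERVER_ApiVersion(&major, &minor);
  if (err != nullptr) {
    TRITONSERVER_ErrorDelete(err);
    return false;
  }
  return (major == TRITONSERVER_API_VERSION_MAJOR) &&
         (minor >= TRITONSERVER_API_VERSION_MINOR);
}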
4 changes: 2 additions & 2 deletions python/setup.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -108,6 +108,6 @@ def get_tag(self):
zip_safe=False,
cmdclass={"bdist_wheel": bdist_wheel},
data_files=data_files,
install_requires=["numpy"],
install_requires=["numpy<2"],
extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras},
)
30 changes: 30 additions & 0 deletions src/backend_model.cc
@@ -1549,6 +1549,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
TRITONBACKEND_Response* response, const char* name, const char* value)
{
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
return nullptr; // success
@@ -1558,6 +1563,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
TRITONBACKEND_Response* response, const char* name, const int64_t value)
{
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
return nullptr; // success
@@ -1567,6 +1577,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
TRITONBACKEND_Response* response, const char* name, const bool value)
{
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
return nullptr; // success
@@ -1576,6 +1591,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetDoubleParameter(
TRITONBACKEND_Response* response, const char* name, const double value)
{
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
return nullptr; // success
@@ -1587,6 +1607,11 @@ TRITONBACKEND_ResponseOutput(
const char* name, const TRITONSERVER_DataType datatype,
const int64_t* shape, const uint32_t dims_count)
{
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
*output = nullptr;
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
std::vector<int64_t> lshape(shape, shape + dims_count);
@@ -1602,6 +1627,11 @@ TRITONBACKEND_ResponseSend(
TRITONBACKEND_Response* response, const uint32_t send_flags,
TRITONSERVER_Error* error)
{
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);

std::unique_ptr<InferenceResponse> utr(tr);
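With the guards added above, each of these entry points now reports TRITONSERVER_ERROR_INVALID_ARG for a null response handle rather than dereferencing it. A hedged call-site sketch (the scenario and function name are hypothetical; the error-handling calls are the standard tritonserver.h API):

#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"

// Demonstrates the new behavior: a null handle yields an error object
// ("response was nullptr") that must be inspected and released, instead
// of a crash inside the core library.
void DemonstrateNullResponseGuard() {
  TRITONBACKEND_Response* response = nullptr;  // e.g. creation failed upstream
  TRITONSERVER_Error* err =
      TRITONBACKEND_ResponseSetIntParameter(response, "batch_index", 0);
  if (err != nullptr) {
    const char* msg = TRITONSERVER_ErrorMessage(err);
    (void)msg;  // log via the backend's preferred mechanism
    TRITONSERVER_ErrorDelete(err);
  }
}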
1 change: 1 addition & 0 deletions src/constants.h
@@ -66,6 +66,7 @@ constexpr char kEnsemblePlatform[] = "ensemble";

constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
+constexpr char kCUDAExecutionAccelerator[] = "cuda";
constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
"auto_mixed_precision";
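The new constant names a "cuda" execution accelerator alongside the existing "tensorrt" and "openvino" entries; in Triton, names like these are matched against a model's optimization settings. A hypothetical lookup sketch (only the string values come from constants.h; the function itself is invented for illustration):

#include <cstring>

// Check whether an accelerator name parsed from a model config is one of the
// known entries; "cuda" is the value added by this commit.
bool IsKnownExecutionAccelerator(const char* name) {
  return (std::strcmp(name, "tensorrt") == 0) ||
         (std::strcmp(name, "openvino") == 0) ||
         (std::strcmp(name, "cuda") == 0) ||
         (std::strcmp(name, "gpu_io") == 0) ||
         (std::strcmp(name, "auto_mixed_precision") == 0);
}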
157 changes: 103 additions & 54 deletions src/infer_request.cc
@@ -1176,27 +1176,44 @@ InferenceRequest::Normalize()
    // Note: Since we're using normalized input.ShapeWithBatchDim() here,
    // make sure that all the normalization is before the check.
    {
-      const size_t& byte_size = input.Data()->TotalByteSize();
      const auto& data_type = input.DType();
-      const auto& input_dims = input.ShapeWithBatchDim();
-      int64_t expected_byte_size = INT_MAX;
-      // Because Triton expects STRING type to be in special format
-      // (prepend 4 bytes to specify string length), so need to add all the
-      // first 4 bytes for each element to find expected byte size
-      if (data_type == inference::DataType::TYPE_STRING) {
-        RETURN_IF_ERROR(
-            ValidateBytesInputs(input_id, input, &expected_byte_size));
-      } else {
-        expected_byte_size = triton::common::GetByteSize(data_type, input_dims);
-      }
-      if ((byte_size > INT_MAX) ||
-          (static_cast<int64_t>(byte_size) != expected_byte_size)) {
-        return Status(
-            Status::Code::INVALID_ARG,
-            LogRequest() + "input byte size mismatch for input '" + input_id +
-                "' for model '" + ModelName() + "'. Expected " +
-                std::to_string(expected_byte_size) + ", got " +
-                std::to_string(byte_size));
+
+      // FIXME: Skip byte size validation for TensorRT backend because it breaks
+      // shape-size assumption. See DLIS-6805 for proper fix for TRT backend
+      // reformat_free tensors.
+      bool skip_byte_size_check = false;
+      constexpr char trt_prefix[] = "tensorrt_";
+      const std::string& platform = model_raw_->Config().platform();
+      skip_byte_size_check |= (platform.rfind(trt_prefix) == 0);
+
+      if (!skip_byte_size_check) {
+        TRITONSERVER_MemoryType input_memory_type;
+        // Because Triton expects STRING type to be in special format
+        // (prepend 4 bytes to specify string length), so need to add all the
+        // first 4 bytes for each element to find expected byte size
+        if (data_type == inference::DataType::TYPE_STRING) {
+          RETURN_IF_ERROR(
+              ValidateBytesInputs(input_id, input, &input_memory_type));
+          // FIXME: Temporarily skips byte size checks for GPU tensors. See
+          // DLIS-6820.
+          skip_byte_size_check |=
+              (input_memory_type == TRITONSERVER_MEMORY_GPU);
+        } else {
+          const auto& input_dims = input.ShapeWithBatchDim();
+          int64_t expected_byte_size = INT_MAX;
+          expected_byte_size =
+              triton::common::GetByteSize(data_type, input_dims);
+          const size_t& byte_size = input.Data()->TotalByteSize();
+          if ((byte_size > INT_MAX) ||
+              (static_cast<int64_t>(byte_size) != expected_byte_size)) {
+            return Status(
+                Status::Code::INVALID_ARG,
+                LogRequest() + "input byte size mismatch for input '" +
+                    input_id + "' for model '" + ModelName() + "'. Expected " +
+                    std::to_string(expected_byte_size) + ", got " +
+                    std::to_string(byte_size));
+          }
+        }
      }
    }
  }
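For context on the STRING branch above: Triton's STRING/BYTES wire format stores each element as a 4-byte length indicator followed by the raw bytes, which is why byte sizes cannot be computed from shape and datatype alone. A standalone serializer sketch producing that layout (illustrative, not code from this repository; assumes a little-endian host, as the validator's reinterpret_cast does):

#include <cstdint>
#include <string>
#include <vector>

// Pack string elements into Triton's BYTES layout: a uint32_t byte-size
// indicator precedes each element's raw bytes.
std::vector<char> SerializeBytesTensor(const std::vector<std::string>& elems) {
  std::vector<char> buf;
  for (const std::string& s : elems) {
    const uint32_t len = static_cast<uint32_t>(s.size());
    const char* len_bytes = reinterpret_cast<const char*>(&len);
    buf.insert(buf.end(), len_bytes, len_bytes + sizeof(uint32_t));
    buf.insert(buf.end(), s.begin(), s.end());
  }
  return buf;
}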
@@ -1267,55 +1284,87 @@ InferenceRequest::ValidateRequestInputs()
Status
InferenceRequest::ValidateBytesInputs(
    const std::string& input_id, const Input& input,
-    int64_t* const expected_byte_size) const
+    TRITONSERVER_MemoryType* buffer_memory_type) const
{
  const auto& input_dims = input.ShapeWithBatchDim();

  int64_t element_count = triton::common::GetElementCount(input_dims);
-  int64_t element_idx = 0;
-  *expected_byte_size = 0;
-  for (size_t i = 0; i < input.Data()->BufferCount(); ++i) {
-    size_t content_byte_size;
-    TRITONSERVER_MemoryType content_memory_type;
-    int64_t content_memory_id;
-    const char* content = input.Data()->BufferAt(
-        i, &content_byte_size, &content_memory_type, &content_memory_id);
-
-    while (content_byte_size >= sizeof(uint32_t)) {
-      if (element_idx >= element_count) {
-        return Status(
-            Status::Code::INVALID_ARG,
-            LogRequest() + "unexpected number of string elements " +
-                std::to_string(element_idx + 1) + " for inference input '" +
-                input_id + "', expecting " + std::to_string(element_count));
-      }
-
-      const uint32_t len = *(reinterpret_cast<const uint32_t*>(content));
-      content += sizeof(uint32_t);
-      content_byte_size -= sizeof(uint32_t);
-      *expected_byte_size += sizeof(uint32_t);
-
-      if (content_byte_size < len) {
-        return Status(
-            Status::Code::INVALID_ARG,
-            LogRequest() + "incomplete string data for inference input '" +
-                input_id + "', expecting string of length " +
-                std::to_string(len) + " but only " +
-                std::to_string(content_byte_size) + " bytes available");
-      }
-
-      content += len;
-      content_byte_size -= len;
-      *expected_byte_size += len;
-      element_idx++;
-    }
-  }
-
-  if (element_idx != element_count) {
+
+  int64_t element_checked = 0;
+  size_t remaining_element_size = 0;
+
+  size_t buffer_next_idx = 0;
+  const size_t buffer_count = input.DataBufferCount();
+
+  const char* buffer = nullptr;
+  size_t remaining_buffer_size = 0;
+  int64_t buffer_memory_id;
+
+  // Validate elements until all buffers have been fully processed.
+  while (remaining_buffer_size || buffer_next_idx < buffer_count) {
+    // Get the next buffer if not currently processing one.
+    if (!remaining_buffer_size) {
+      // Reset remaining buffer size and pointers for next buffer.
+      RETURN_IF_ERROR(input.DataBuffer(
+          buffer_next_idx++, (const void**)(&buffer), &remaining_buffer_size,
+          buffer_memory_type, &buffer_memory_id));
+
+      if (*buffer_memory_type == TRITONSERVER_MEMORY_GPU) {
+        return Status::Success;
+      }
+    }
+
+    constexpr size_t kElementSizeIndicator = sizeof(uint32_t);
+    // Get the next element if not currently processing one.
+    if (!remaining_element_size) {
+      // FIXME: Assume the string element's byte size indicator is not spread
+      // across buffer boundaries for simplicity.
+      if (remaining_buffer_size < kElementSizeIndicator) {
+        return Status(
+            Status::Code::INVALID_ARG,
+            LogRequest() +
+                "element byte size indicator exceeds the end of the buffer.");
+      }
+
+      // Start the next element and reset the remaining element size.
+      remaining_element_size = *(reinterpret_cast<const uint32_t*>(buffer));
+      element_checked++;
+
+      // Advance pointer and remainder by the indicator size.
+      buffer += kElementSizeIndicator;
+      remaining_buffer_size -= kElementSizeIndicator;
+    }
+
+    // If the remaining buffer fits it: consume the rest of the element,
+    // proceed to the next element.
+    if (remaining_buffer_size >= remaining_element_size) {
+      buffer += remaining_element_size;
+      remaining_buffer_size -= remaining_element_size;
+      remaining_element_size = 0;
+    }
+    // Otherwise the remaining element is larger: consume the rest of the
+    // buffer, proceed to the next buffer.
+    else {
+      remaining_element_size -= remaining_buffer_size;
+      remaining_buffer_size = 0;
+    }
+  }
+
+  // Validate the number of processed buffers exactly match expectations.
+  if (buffer_next_idx != buffer_count) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        LogRequest() + "expected " + std::to_string(buffer_count) +
+            " buffers for inference input '" + input_id + "', got " +
+            std::to_string(buffer_next_idx));
+  }
+
+  // Validate the number of processed elements exactly match expectations.
+  if (element_checked != element_count) {
    return Status(
        Status::Code::INVALID_ARG,
        LogRequest() + "expected " + std::to_string(element_count) +
-            " strings for inference input '" + input_id + "', got " +
-            std::to_string(element_idx));
+            " string elements for inference input '" + input_id + "', got " +
+            std::to_string(element_checked));
  }

  return Status::Success;
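A worked trace of the case the rewritten loop handles and the old per-buffer loop could not: a single element whose payload spans two buffers (byte values are illustrative):

buffer 0 (7 bytes): 06 00 00 00 'a' 'b' 'c'
  -> the indicator is read in full (per the FIXME it may not straddle buffers),
     so remaining_element_size = 6 and element_checked = 1; the 3 payload
     bytes present are consumed, leaving remaining_element_size = 3.
buffer 1 (3 bytes): 'd' 'e' 'f'
  -> the remaining 3 payload bytes are consumed; remaining_element_size = 0.

Both final checks then pass (2 of 2 buffers processed, 1 element counted
against an element count of 1), so Status::Success is returned.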
2 changes: 1 addition & 1 deletion src/infer_request.h
@@ -749,7 +749,7 @@ class InferenceRequest {

Status ValidateBytesInputs(
const std::string& input_id, const Input& input,
-      int64_t* const expected_byte_size) const;
+      TRITONSERVER_MemoryType* buffer_memory_type) const;

// Helpers for pending request metrics
void IncrementPendingRequestCount();