diff --git a/include/triton/core/tritonserver.h b/include/triton/core/tritonserver.h
index d5ffd5370..b8e25df72 100644
--- a/include/triton/core/tritonserver.h
+++ b/include/triton/core/tritonserver.h
@@ -91,7 +91,7 @@ struct TRITONSERVER_MetricFamily;
 /// }
 ///
 #define TRITONSERVER_API_VERSION_MAJOR 1
-#define TRITONSERVER_API_VERSION_MINOR 31
+#define TRITONSERVER_API_VERSION_MINOR 32
 
 /// Get the TRITONBACKEND API version supported by the Triton shared
 /// library. This value can be compared against the
diff --git a/python/setup.py b/python/setup.py
index 61a8590c0..8bab64221 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -108,6 +108,6 @@ def get_tag(self):
     zip_safe=False,
     cmdclass={"bdist_wheel": bdist_wheel},
     data_files=data_files,
-    install_requires=["numpy"],
+    install_requires=["numpy<2"],
     extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras},
 )
diff --git a/src/backend_model.cc b/src/backend_model.cc
index d3119ac67..01962d3bf 100644
--- a/src/backend_model.cc
+++ b/src/backend_model.cc
@@ -1549,6 +1549,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ResponseSetStringParameter(
     TRITONBACKEND_Response* response, const char* name, const char* value)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
   return nullptr;  // success
@@ -1558,6 +1563,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ResponseSetIntParameter(
     TRITONBACKEND_Response* response, const char* name, const int64_t value)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
   return nullptr;  // success
@@ -1567,6 +1577,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ResponseSetBoolParameter(
     TRITONBACKEND_Response* response, const char* name, const bool value)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
   return nullptr;  // success
@@ -1576,6 +1591,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ResponseSetDoubleParameter(
     TRITONBACKEND_Response* response, const char* name, const double value)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
   return nullptr;  // success
@@ -1587,6 +1607,11 @@ TRITONBACKEND_ResponseOutput(
     const char* name, const TRITONSERVER_DataType datatype,
     const int64_t* shape, const uint32_t dims_count)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   *output = nullptr;
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   std::vector<int64_t> lshape(shape, shape + dims_count);
@@ -1602,6 +1627,11 @@
 TRITONBACKEND_ResponseSend(
     TRITONBACKEND_Response* response, const uint32_t send_flags,
     TRITONSERVER_Error* error)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   std::unique_ptr<InferenceResponse> utr(tr);
diff --git a/src/constants.h b/src/constants.h
index 7458ce575..39647e185 100644
--- a/src/constants.h
+++ b/src/constants.h
@@ -66,6 +66,7 @@ constexpr char kEnsemblePlatform[] = "ensemble";
 
 constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
 constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
+constexpr char kCUDAExecutionAccelerator[] = "cuda";
 constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
 constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
     "auto_mixed_precision";
diff --git a/src/infer_request.cc b/src/infer_request.cc
index 0c85051ff..4ea687538 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -1176,27 +1176,44 @@ InferenceRequest::Normalize()
     // Note: Since we're using normalized input.ShapeWithBatchDim() here,
     // make sure that all the normalization is before the check.
     {
-      const size_t& byte_size = input.Data()->TotalByteSize();
       const auto& data_type = input.DType();
-      const auto& input_dims = input.ShapeWithBatchDim();
-      int64_t expected_byte_size = INT_MAX;
-      // Because Triton expects STRING type to be in special format
-      // (prepend 4 bytes to specify string length), so need to add all the
-      // first 4 bytes for each element to find expected byte size
-      if (data_type == inference::DataType::TYPE_STRING) {
-        RETURN_IF_ERROR(
-            ValidateBytesInputs(input_id, input, &expected_byte_size));
-      } else {
-        expected_byte_size = triton::common::GetByteSize(data_type, input_dims);
-      }
-      if ((byte_size > INT_MAX) ||
-          (static_cast<int64_t>(byte_size) != expected_byte_size)) {
-        return Status(
-            Status::Code::INVALID_ARG,
-            LogRequest() + "input byte size mismatch for input '" + input_id +
-                "' for model '" + ModelName() + "'. Expected " +
-                std::to_string(expected_byte_size) + ", got " +
-                std::to_string(byte_size));
+
+      // FIXME: Skip byte size validation for TensorRT backend because it breaks
+      // shape-size assumption. See DLIS-6805 for proper fix for TRT backend
+      // reformat_free tensors.
+      bool skip_byte_size_check = false;
+      constexpr char trt_prefix[] = "tensorrt_";
+      const std::string& platform = model_raw_->Config().platform();
+      skip_byte_size_check |= (platform.rfind(trt_prefix) == 0);
+
+      if (!skip_byte_size_check) {
+        TRITONSERVER_MemoryType input_memory_type;
+        // Because Triton expects STRING type to be in special format
+        // (prepend 4 bytes to specify string length), so need to add all the
+        // first 4 bytes for each element to find expected byte size
+        if (data_type == inference::DataType::TYPE_STRING) {
+          RETURN_IF_ERROR(
+              ValidateBytesInputs(input_id, input, &input_memory_type));
+          // FIXME: Temporarily skips byte size checks for GPU tensors. See
+          // DLIS-6820.
+          skip_byte_size_check |=
+              (input_memory_type == TRITONSERVER_MEMORY_GPU);
+        } else {
+          const auto& input_dims = input.ShapeWithBatchDim();
+          int64_t expected_byte_size = INT_MAX;
+          expected_byte_size =
+              triton::common::GetByteSize(data_type, input_dims);
+          const size_t& byte_size = input.Data()->TotalByteSize();
+          if ((byte_size > INT_MAX) ||
+              (static_cast<int64_t>(byte_size) != expected_byte_size)) {
+            return Status(
+                Status::Code::INVALID_ARG,
+                LogRequest() + "input byte size mismatch for input '" +
+                    input_id + "' for model '" + ModelName() + "'. Expected " +
Expected " + + std::to_string(expected_byte_size) + ", got " + + std::to_string(byte_size)); + } + } } } } @@ -1267,55 +1284,87 @@ InferenceRequest::ValidateRequestInputs() Status InferenceRequest::ValidateBytesInputs( const std::string& input_id, const Input& input, - int64_t* const expected_byte_size) const + TRITONSERVER_MemoryType* buffer_memory_type) const { const auto& input_dims = input.ShapeWithBatchDim(); + int64_t element_count = triton::common::GetElementCount(input_dims); - int64_t element_idx = 0; - *expected_byte_size = 0; - for (size_t i = 0; i < input.Data()->BufferCount(); ++i) { - size_t content_byte_size; - TRITONSERVER_MemoryType content_memory_type; - int64_t content_memory_id; - const char* content = input.Data()->BufferAt( - i, &content_byte_size, &content_memory_type, &content_memory_id); - - while (content_byte_size >= sizeof(uint32_t)) { - if (element_idx >= element_count) { - return Status( - Status::Code::INVALID_ARG, - LogRequest() + "unexpected number of string elements " + - std::to_string(element_idx + 1) + " for inference input '" + - input_id + "', expecting " + std::to_string(element_count)); + int64_t element_checked = 0; + size_t remaining_element_size = 0; + + size_t buffer_next_idx = 0; + const size_t buffer_count = input.DataBufferCount(); + + const char* buffer = nullptr; + size_t remaining_buffer_size = 0; + int64_t buffer_memory_id; + + // Validate elements until all buffers have been fully processed. + while (remaining_buffer_size || buffer_next_idx < buffer_count) { + // Get the next buffer if not currently processing one. + if (!remaining_buffer_size) { + // Reset remaining buffer size and pointers for next buffer. + RETURN_IF_ERROR(input.DataBuffer( + buffer_next_idx++, (const void**)(&buffer), &remaining_buffer_size, + buffer_memory_type, &buffer_memory_id)); + + if (*buffer_memory_type == TRITONSERVER_MEMORY_GPU) { + return Status::Success; } + } - const uint32_t len = *(reinterpret_cast(content)); - content += sizeof(uint32_t); - content_byte_size -= sizeof(uint32_t); - *expected_byte_size += sizeof(uint32_t); - - if (content_byte_size < len) { + constexpr size_t kElementSizeIndicator = sizeof(uint32_t); + // Get the next element if not currently processing one. + if (!remaining_element_size) { + // FIXME: Assume the string element's byte size indicator is not spread + // across buffer boundaries for simplicity. + if (remaining_buffer_size < kElementSizeIndicator) { return Status( Status::Code::INVALID_ARG, - LogRequest() + "incomplete string data for inference input '" + - input_id + "', expecting string of length " + - std::to_string(len) + " but only " + - std::to_string(content_byte_size) + " bytes available"); + LogRequest() + + "element byte size indicator exceeds the end of the buffer."); } - content += len; - content_byte_size -= len; - *expected_byte_size += len; - element_idx++; + // Start the next element and reset the remaining element size. + remaining_element_size = *(reinterpret_cast(buffer)); + element_checked++; + + // Advance pointer and remainder by the indicator size. + buffer += kElementSizeIndicator; + remaining_buffer_size -= kElementSizeIndicator; + } + + // If the remaining buffer fits it: consume the rest of the element, proceed + // to the next element. 
+    if (remaining_buffer_size >= remaining_element_size) {
+      buffer += remaining_element_size;
+      remaining_buffer_size -= remaining_element_size;
+      remaining_element_size = 0;
+    }
+    // Otherwise the remaining element is larger: consume the rest of the
+    // buffer, proceed to the next buffer.
+    else {
+      remaining_element_size -= remaining_buffer_size;
+      remaining_buffer_size = 0;
     }
   }
 
-  if (element_idx != element_count) {
+  // Validate the number of processed buffers exactly match expectations.
+  if (buffer_next_idx != buffer_count) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        LogRequest() + "expected " + std::to_string(buffer_count) +
+            " buffers for inference input '" + input_id + "', got " +
+            std::to_string(buffer_next_idx));
+  }
+
+  // Validate the number of processed elements exactly match expectations.
+  if (element_checked != element_count) {
     return Status(
         Status::Code::INVALID_ARG,
         LogRequest() + "expected " + std::to_string(element_count) +
-            " strings for inference input '" + input_id + "', got " +
-            std::to_string(element_idx));
+            " string elements for inference input '" + input_id + "', got " +
+            std::to_string(element_checked));
   }
 
   return Status::Success;
diff --git a/src/infer_request.h b/src/infer_request.h
index c97ef8039..37da42b02 100644
--- a/src/infer_request.h
+++ b/src/infer_request.h
@@ -749,7 +749,7 @@ class InferenceRequest {
 
   Status ValidateBytesInputs(
       const std::string& input_id, const Input& input,
-      int64_t* const expected_byte_size) const;
+      TRITONSERVER_MemoryType* buffer_memory_type) const;
 
   // Helpers for pending request metrics
   void IncrementPendingRequestCount();
diff --git a/src/pinned_memory_manager.cc b/src/pinned_memory_manager.cc
index 5e472d758..3a70b0b7d 100644
--- a/src/pinned_memory_manager.cc
+++ b/src/pinned_memory_manager.cc
@@ -263,20 +263,25 @@ PinnedMemoryManager::Create(const Options& options)
   instance_.reset(new PinnedMemoryManager());
   if (options.host_policy_map_.empty()) {
     void* buffer = nullptr;
+    if (options.pinned_memory_pool_byte_size_ > 0) {
 #ifdef TRITON_ENABLE_GPU
-    auto err = cudaHostAlloc(
-        &buffer, options.pinned_memory_pool_byte_size_, cudaHostAllocPortable);
-    if (err != cudaSuccess) {
-      buffer = nullptr;
-      LOG_WARNING << "Unable to allocate pinned system memory, pinned memory "
-                     "pool will not be available: "
-                  << std::string(cudaGetErrorString(err));
-    } else if (options.pinned_memory_pool_byte_size_ != 0) {
-      LOG_INFO << "Pinned memory pool is created at '"
-               << PointerToString(buffer) << "' with size "
-               << options.pinned_memory_pool_byte_size_;
-    }
+      auto err = cudaHostAlloc(
+          &buffer, options.pinned_memory_pool_byte_size_,
+          cudaHostAllocPortable);
+      if (err != cudaSuccess) {
+        buffer = nullptr;
+        LOG_WARNING << "Unable to allocate pinned system memory, pinned memory "
+                       "pool will not be available: "
+                    << std::string(cudaGetErrorString(err));
+      } else if (options.pinned_memory_pool_byte_size_ != 0) {
+        LOG_INFO << "Pinned memory pool is created at '"
+                 << PointerToString(buffer) << "' with size "
+                 << options.pinned_memory_pool_byte_size_;
+      }
 #endif  // TRITON_ENABLE_GPU
+    } else {
+      LOG_INFO << "Pinned memory pool disabled";
+    }
     try {
       instance_->AddPinnedMemoryBuffer(
           std::shared_ptr<PinnedMemory>(
@@ -318,23 +323,28 @@ PinnedMemoryManager::Create(const Options& options)
       continue;
     }
     void* buffer = nullptr;
+    if (options.pinned_memory_pool_byte_size_ > 0) {
 #ifdef TRITON_ENABLE_GPU
-    auto err = cudaHostAlloc(
-        &buffer, options.pinned_memory_pool_byte_size_,
-        cudaHostAllocPortable);
-    if (err != cudaSuccess) {
-      buffer = nullptr;
-      LOG_WARNING << "Unable to allocate pinned system memory, pinned memory "
-                     "pool will not be available: "
-                  << std::string(cudaGetErrorString(err));
-    } else if (options.pinned_memory_pool_byte_size_ != 0) {
-      LOG_INFO << "Pinned memory pool is created at '"
-               << PointerToString(buffer) << "' with size "
-               << options.pinned_memory_pool_byte_size_;
+      auto err = cudaHostAlloc(
+          &buffer, options.pinned_memory_pool_byte_size_,
+          cudaHostAllocPortable);
+      if (err != cudaSuccess) {
+        buffer = nullptr;
+        LOG_WARNING
+            << "Unable to allocate pinned system memory, pinned memory "
+               "pool will not be available: "
+            << std::string(cudaGetErrorString(err));
+      } else if (options.pinned_memory_pool_byte_size_ != 0) {
+        LOG_INFO << "Pinned memory pool is created at '"
+                 << PointerToString(buffer) << "' with size "
+                 << options.pinned_memory_pool_byte_size_;
+      } else {
+        LOG_INFO << "Pinned memory pool disabled";
+      }
+#endif  // TRITON_ENABLE_GPU
     } else {
       LOG_INFO << "Pinned memory pool disabled";
     }
-#endif  // TRITON_ENABLE_GPU
     ResetNumaMemoryPolicy();
     try {
       instance_->AddPinnedMemoryBuffer(
@@ -348,21 +358,21 @@ PinnedMemoryManager::Create(const Options& options)
           "Failed to add Pinned Memory buffer with host policy: " +
               std::string(ex.what()));
     }
-  }
-  // If no pinned memory is allocated, add an empty entry where all allocation
-  // will be on normal system memory
-  if (instance_->pinned_memory_buffers_.empty()) {
-    try {
-      instance_->AddPinnedMemoryBuffer(
-          std::shared_ptr<PinnedMemory>(new PinnedMemory(
-              nullptr, options.pinned_memory_pool_byte_size_)),
-          0);
-    }
-    catch (const std::exception& ex) {
-      return Status(
-          Status::Code::INTERNAL,
-          "Failed to add empty Pinned Memory entry: " +
-              std::string(ex.what()));
+    // If no pinned memory is allocated, add an empty entry where all
+    // allocation will be on normal system memory
+    if (instance_->pinned_memory_buffers_.empty()) {
+      try {
+        instance_->AddPinnedMemoryBuffer(
+            std::shared_ptr<PinnedMemory>(new PinnedMemory(
+                nullptr, options.pinned_memory_pool_byte_size_)),
+            0);
+      }
+      catch (const std::exception& ex) {
+        return Status(
+            Status::Code::INTERNAL,
+            "Failed to add empty Pinned Memory entry: " +
+                std::string(ex.what()));
+      }
     }
   }
 }
diff --git a/src/server.cc b/src/server.cc
index ae87cb9c0..68b39954f 100644
--- a/src/server.cc
+++ b/src/server.cc
@@ -201,15 +201,11 @@ InferenceServer::Init()
     ready_state_ = ServerReadyState::SERVER_FAILED_TO_INITIALIZE;
     return status;
   }
-  if (pinned_memory_pool_size_ > 0) {
-    PinnedMemoryManager::Options options(pinned_memory_pool_size_);
-    status = PinnedMemoryManager::Create(options);
-    if (!status.IsOk()) {
-      ready_state_ = ServerReadyState::SERVER_FAILED_TO_INITIALIZE;
-      return status;
-    }
-  } else {
-    LOG_INFO << "Pinned memory pool disabled";
+  PinnedMemoryManager::Options options(pinned_memory_pool_size_);
+  status = PinnedMemoryManager::Create(options);
+  if (!status.IsOk()) {
+    ready_state_ = ServerReadyState::SERVER_FAILED_TO_INITIALIZE;
+    return status;
   }
 
 #ifdef TRITON_ENABLE_GPU
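
For context on the validation reworked in `src/infer_request.cc` above: a BYTES/STRING input is a flat byte stream in which every element is a 4-byte length indicator followed by that many bytes of string data (as the hunk's own comment notes). The sketch below is illustrative only — it is not part of this patch, and the helper names are hypothetical — showing that element layout and a single-contiguous-buffer version of the walk that `ValidateBytesInputs` now performs across potentially multiple buffers.

```cpp
// Illustrative sketch (not part of this patch): the length-prefixed
// BYTES/STRING layout that ValidateBytesInputs walks. Helper names here are
// hypothetical and do not exist in the Triton sources.
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Serialize a list of strings into the length-prefixed BYTES layout:
// [uint32 len][len bytes][uint32 len][len bytes]...
std::vector<char>
SerializeBytesTensor(const std::vector<std::string>& elements)
{
  std::vector<char> buffer;
  for (const auto& s : elements) {
    const uint32_t len = static_cast<uint32_t>(s.size());
    const char* len_bytes = reinterpret_cast<const char*>(&len);
    buffer.insert(buffer.end(), len_bytes, len_bytes + sizeof(len));
    buffer.insert(buffer.end(), s.begin(), s.end());
  }
  return buffer;
}

// Count the elements in one contiguous serialized buffer, mirroring the walk
// done for CPU buffers above. Returns -1 if a 4-byte indicator or its string
// data runs past the end of the buffer.
int64_t
CountBytesElements(const char* data, size_t byte_size)
{
  int64_t count = 0;
  size_t offset = 0;
  while (offset + sizeof(uint32_t) <= byte_size) {
    uint32_t len;
    std::memcpy(&len, data + offset, sizeof(len));
    offset += sizeof(len) + len;
    if (offset > byte_size) {
      return -1;  // string data truncated
    }
    ++count;
  }
  return (offset == byte_size) ? count : -1;
}
```

For an input declared with shape [N], exactly N such length-prefixed elements are expected, which is what the `element_checked != element_count` check above enforces; per the DLIS-6820 note, GPU-resident string buffers currently return early and skip this accounting.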