diff --git a/include/triton/core/tritonserver.h b/include/triton/core/tritonserver.h
index d5ffd5370..b8e25df72 100644
--- a/include/triton/core/tritonserver.h
+++ b/include/triton/core/tritonserver.h
@@ -91,7 +91,7 @@ struct TRITONSERVER_MetricFamily;
 /// }
 ///
 #define TRITONSERVER_API_VERSION_MAJOR 1
-#define TRITONSERVER_API_VERSION_MINOR 31
+#define TRITONSERVER_API_VERSION_MINOR 32
 
 /// Get the TRITONBACKEND API version supported by the Triton shared
 /// library. This value can be compared against the
diff --git a/python/setup.py b/python/setup.py
index 61a8590c0..8bab64221 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -108,6 +108,6 @@ def get_tag(self):
     zip_safe=False,
     cmdclass={"bdist_wheel": bdist_wheel},
     data_files=data_files,
-    install_requires=["numpy"],
+    install_requires=["numpy<2"],
     extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras},
 )
diff --git a/src/backend_model.cc b/src/backend_model.cc
index d3119ac67..01962d3bf 100644
--- a/src/backend_model.cc
+++ b/src/backend_model.cc
@@ -1549,6 +1549,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ResponseSetStringParameter(
     TRITONBACKEND_Response* response, const char* name, const char* value)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
   return nullptr;  // success
@@ -1558,6 +1563,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ResponseSetIntParameter(
     TRITONBACKEND_Response* response, const char* name, const int64_t value)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
   return nullptr;  // success
@@ -1567,6 +1577,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ResponseSetBoolParameter(
     TRITONBACKEND_Response* response, const char* name, const bool value)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
   return nullptr;  // success
@@ -1576,6 +1591,11 @@ TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ResponseSetDoubleParameter(
     TRITONBACKEND_Response* response, const char* name, const double value)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   RETURN_TRITONSERVER_ERROR_IF_ERROR(tr->AddParameter(name, value));
   return nullptr;  // success
@@ -1587,6 +1607,11 @@ TRITONBACKEND_ResponseOutput(
     const char* name, const TRITONSERVER_DataType datatype,
     const int64_t* shape, const uint32_t dims_count)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   *output = nullptr;
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   std::vector<int64_t> lshape(shape, shape + dims_count);
@@ -1602,6 +1627,11 @@
 TRITONBACKEND_ResponseSend(
     TRITONBACKEND_Response* response, const uint32_t send_flags,
     TRITONSERVER_Error* error)
 {
+  if (!response) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG, "response was nullptr");
+  }
+
   InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
   std::unique_ptr<InferenceResponse> utr(tr);
diff --git a/src/constants.h b/src/constants.h
index 7458ce575..39647e185 100644
--- a/src/constants.h
+++ b/src/constants.h
@@ -66,6 +66,7 @@ constexpr char kEnsemblePlatform[] = "ensemble";
 
 constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
 constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
+constexpr char kCUDAExecutionAccelerator[] = "cuda";
 constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
 constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
     "auto_mixed_precision";
diff --git a/src/infer_request.cc b/src/infer_request.cc
index 0c85051ff..4ea687538 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -1176,27 +1176,44 @@ InferenceRequest::Normalize()
     // Note: Since we're using normalized input.ShapeWithBatchDim() here,
     // make sure that all the normalization is before the check.
     {
-      const size_t& byte_size = input.Data()->TotalByteSize();
       const auto& data_type = input.DType();
-      const auto& input_dims = input.ShapeWithBatchDim();
-      int64_t expected_byte_size = INT_MAX;
-      // Because Triton expects STRING type to be in special format
-      // (prepend 4 bytes to specify string length), so need to add all the
-      // first 4 bytes for each element to find expected byte size
-      if (data_type == inference::DataType::TYPE_STRING) {
-        RETURN_IF_ERROR(
-            ValidateBytesInputs(input_id, input, &expected_byte_size));
-      } else {
-        expected_byte_size = triton::common::GetByteSize(data_type, input_dims);
-      }
-      if ((byte_size > INT_MAX) ||
-          (static_cast<int64_t>(byte_size) != expected_byte_size)) {
-        return Status(
-            Status::Code::INVALID_ARG,
-            LogRequest() + "input byte size mismatch for input '" + input_id +
-                "' for model '" + ModelName() + "'. Expected " +
-                std::to_string(expected_byte_size) + ", got " +
-                std::to_string(byte_size));
+
+      // FIXME: Skip byte size validation for TensorRT backend because it breaks
+      // shape-size assumption. See DLIS-6805 for proper fix for TRT backend
+      // reformat_free tensors.
+      bool skip_byte_size_check = false;
+      constexpr char trt_prefix[] = "tensorrt_";
+      const std::string& platform = model_raw_->Config().platform();
+      skip_byte_size_check |= (platform.rfind(trt_prefix) == 0);
+
+      if (!skip_byte_size_check) {
+        TRITONSERVER_MemoryType input_memory_type;
+        // Because Triton expects STRING type to be in special format
+        // (prepend 4 bytes to specify string length), so need to add all the
+        // first 4 bytes for each element to find expected byte size
+        if (data_type == inference::DataType::TYPE_STRING) {
+          RETURN_IF_ERROR(
+              ValidateBytesInputs(input_id, input, &input_memory_type));
+          // FIXME: Temporarily skips byte size checks for GPU tensors. See
+          // DLIS-6820.
+          skip_byte_size_check |=
+              (input_memory_type == TRITONSERVER_MEMORY_GPU);
+        } else {
+          const auto& input_dims = input.ShapeWithBatchDim();
+          int64_t expected_byte_size = INT_MAX;
+          expected_byte_size =
+              triton::common::GetByteSize(data_type, input_dims);
+          const size_t& byte_size = input.Data()->TotalByteSize();
+          if ((byte_size > INT_MAX) ||
+              (static_cast<int64_t>(byte_size) != expected_byte_size)) {
+            return Status(
+                Status::Code::INVALID_ARG,
+                LogRequest() + "input byte size mismatch for input '" +
+                    input_id + "' for model '" + ModelName() + "'. Expected " +
Expected " + + std::to_string(expected_byte_size) + ", got " + + std::to_string(byte_size)); + } + } } } } @@ -1267,55 +1284,87 @@ InferenceRequest::ValidateRequestInputs() Status InferenceRequest::ValidateBytesInputs( const std::string& input_id, const Input& input, - int64_t* const expected_byte_size) const + TRITONSERVER_MemoryType* buffer_memory_type) const { const auto& input_dims = input.ShapeWithBatchDim(); + int64_t element_count = triton::common::GetElementCount(input_dims); - int64_t element_idx = 0; - *expected_byte_size = 0; - for (size_t i = 0; i < input.Data()->BufferCount(); ++i) { - size_t content_byte_size; - TRITONSERVER_MemoryType content_memory_type; - int64_t content_memory_id; - const char* content = input.Data()->BufferAt( - i, &content_byte_size, &content_memory_type, &content_memory_id); - - while (content_byte_size >= sizeof(uint32_t)) { - if (element_idx >= element_count) { - return Status( - Status::Code::INVALID_ARG, - LogRequest() + "unexpected number of string elements " + - std::to_string(element_idx + 1) + " for inference input '" + - input_id + "', expecting " + std::to_string(element_count)); + int64_t element_checked = 0; + size_t remaining_element_size = 0; + + size_t buffer_next_idx = 0; + const size_t buffer_count = input.DataBufferCount(); + + const char* buffer = nullptr; + size_t remaining_buffer_size = 0; + int64_t buffer_memory_id; + + // Validate elements until all buffers have been fully processed. + while (remaining_buffer_size || buffer_next_idx < buffer_count) { + // Get the next buffer if not currently processing one. + if (!remaining_buffer_size) { + // Reset remaining buffer size and pointers for next buffer. + RETURN_IF_ERROR(input.DataBuffer( + buffer_next_idx++, (const void**)(&buffer), &remaining_buffer_size, + buffer_memory_type, &buffer_memory_id)); + + if (*buffer_memory_type == TRITONSERVER_MEMORY_GPU) { + return Status::Success; } + } - const uint32_t len = *(reinterpret_cast(content)); - content += sizeof(uint32_t); - content_byte_size -= sizeof(uint32_t); - *expected_byte_size += sizeof(uint32_t); - - if (content_byte_size < len) { + constexpr size_t kElementSizeIndicator = sizeof(uint32_t); + // Get the next element if not currently processing one. + if (!remaining_element_size) { + // FIXME: Assume the string element's byte size indicator is not spread + // across buffer boundaries for simplicity. + if (remaining_buffer_size < kElementSizeIndicator) { return Status( Status::Code::INVALID_ARG, - LogRequest() + "incomplete string data for inference input '" + - input_id + "', expecting string of length " + - std::to_string(len) + " but only " + - std::to_string(content_byte_size) + " bytes available"); + LogRequest() + + "element byte size indicator exceeds the end of the buffer."); } - content += len; - content_byte_size -= len; - *expected_byte_size += len; - element_idx++; + // Start the next element and reset the remaining element size. + remaining_element_size = *(reinterpret_cast(buffer)); + element_checked++; + + // Advance pointer and remainder by the indicator size. + buffer += kElementSizeIndicator; + remaining_buffer_size -= kElementSizeIndicator; + } + + // If the remaining buffer fits it: consume the rest of the element, proceed + // to the next element. 
+    if (remaining_buffer_size >= remaining_element_size) {
+      buffer += remaining_element_size;
+      remaining_buffer_size -= remaining_element_size;
+      remaining_element_size = 0;
+    }
+    // Otherwise the remaining element is larger: consume the rest of the
+    // buffer, proceed to the next buffer.
+    else {
+      remaining_element_size -= remaining_buffer_size;
+      remaining_buffer_size = 0;
     }
   }
 
-  if (element_idx != element_count) {
+  // Validate the number of processed buffers exactly match expectations.
+  if (buffer_next_idx != buffer_count) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        LogRequest() + "expected " + std::to_string(buffer_count) +
+            " buffers for inference input '" + input_id + "', got " +
+            std::to_string(buffer_next_idx));
+  }
+
+  // Validate the number of processed elements exactly match expectations.
+  if (element_checked != element_count) {
     return Status(
         Status::Code::INVALID_ARG,
         LogRequest() + "expected " + std::to_string(element_count) +
-            " strings for inference input '" + input_id + "', got " +
-            std::to_string(element_idx));
+            " string elements for inference input '" + input_id + "', got " +
+            std::to_string(element_checked));
   }
 
   return Status::Success;
diff --git a/src/infer_request.h b/src/infer_request.h
index c97ef8039..37da42b02 100644
--- a/src/infer_request.h
+++ b/src/infer_request.h
@@ -749,7 +749,7 @@ class InferenceRequest {
 
   Status ValidateBytesInputs(
       const std::string& input_id, const Input& input,
-      int64_t* const expected_byte_size) const;
+      TRITONSERVER_MemoryType* buffer_memory_type) const;
 
   // Helpers for pending request metrics
   void IncrementPendingRequestCount();
diff --git a/src/pinned_memory_manager.cc b/src/pinned_memory_manager.cc
index 5e472d758..3a70b0b7d 100644
--- a/src/pinned_memory_manager.cc
+++ b/src/pinned_memory_manager.cc
@@ -263,20 +263,25 @@ PinnedMemoryManager::Create(const Options& options)
   instance_.reset(new PinnedMemoryManager());
   if (options.host_policy_map_.empty()) {
     void* buffer = nullptr;
+    if (options.pinned_memory_pool_byte_size_ > 0) {
 #ifdef TRITON_ENABLE_GPU
-    auto err = cudaHostAlloc(
-        &buffer, options.pinned_memory_pool_byte_size_, cudaHostAllocPortable);
-    if (err != cudaSuccess) {
-      buffer = nullptr;
-      LOG_WARNING << "Unable to allocate pinned system memory, pinned memory "
-                     "pool will not be available: "
-                  << std::string(cudaGetErrorString(err));
-    } else if (options.pinned_memory_pool_byte_size_ != 0) {
-      LOG_INFO << "Pinned memory pool is created at '"
-               << PointerToString(buffer) << "' with size "
-               << options.pinned_memory_pool_byte_size_;
-    }
+      auto err = cudaHostAlloc(
+          &buffer, options.pinned_memory_pool_byte_size_,
+          cudaHostAllocPortable);
+      if (err != cudaSuccess) {
+        buffer = nullptr;
+        LOG_WARNING << "Unable to allocate pinned system memory, pinned memory "
+                       "pool will not be available: "
+                    << std::string(cudaGetErrorString(err));
+      } else if (options.pinned_memory_pool_byte_size_ != 0) {
+        LOG_INFO << "Pinned memory pool is created at '"
+                 << PointerToString(buffer) << "' with size "
+                 << options.pinned_memory_pool_byte_size_;
+      }
 #endif  // TRITON_ENABLE_GPU
+    } else {
+      LOG_INFO << "Pinned memory pool disabled";
+    }
     try {
       instance_->AddPinnedMemoryBuffer(
           std::shared_ptr<PinnedMemory>(
@@ -318,23 +323,28 @@ PinnedMemoryManager::Create(const Options& options)
       continue;
     }
     void* buffer = nullptr;
+    if (options.pinned_memory_pool_byte_size_ > 0) {
 #ifdef TRITON_ENABLE_GPU
-    auto err = cudaHostAlloc(
-        &buffer, options.pinned_memory_pool_byte_size_,
-        cudaHostAllocPortable);
-    if (err != cudaSuccess) {
-      buffer = nullptr;
-      LOG_WARNING << "Unable to allocate pinned system memory, pinned memory "
-                     "pool will not be available: "
-                  << std::string(cudaGetErrorString(err));
-    } else if (options.pinned_memory_pool_byte_size_ != 0) {
-      LOG_INFO << "Pinned memory pool is created at '"
-               << PointerToString(buffer) << "' with size "
-               << options.pinned_memory_pool_byte_size_;
+      auto err = cudaHostAlloc(
+          &buffer, options.pinned_memory_pool_byte_size_,
+          cudaHostAllocPortable);
+      if (err != cudaSuccess) {
+        buffer = nullptr;
+        LOG_WARNING
+            << "Unable to allocate pinned system memory, pinned memory "
+               "pool will not be available: "
+            << std::string(cudaGetErrorString(err));
+      } else if (options.pinned_memory_pool_byte_size_ != 0) {
+        LOG_INFO << "Pinned memory pool is created at '"
+                 << PointerToString(buffer) << "' with size "
+                 << options.pinned_memory_pool_byte_size_;
+      } else {
+        LOG_INFO << "Pinned memory pool disabled";
+      }
+#endif  // TRITON_ENABLE_GPU
     } else {
       LOG_INFO << "Pinned memory pool disabled";
     }
-#endif  // TRITON_ENABLE_GPU
     ResetNumaMemoryPolicy();
     try {
       instance_->AddPinnedMemoryBuffer(
@@ -348,21 +358,21 @@ PinnedMemoryManager::Create(const Options& options)
           "Failed to add Pinned Memory buffer with host policy: " +
               std::string(ex.what()));
     }
-  }
-  // If no pinned memory is allocated, add an empty entry where all allocation
-  // will be on normal system memory
-  if (instance_->pinned_memory_buffers_.empty()) {
-    try {
-      instance_->AddPinnedMemoryBuffer(
-          std::shared_ptr<PinnedMemory>(new PinnedMemory(
-              nullptr, options.pinned_memory_pool_byte_size_)),
-          0);
-    }
-    catch (const std::exception& ex) {
-      return Status(
-          Status::Code::INTERNAL,
-          "Failed to add empty Pinned Memory entry: " +
-              std::string(ex.what()));
+    // If no pinned memory is allocated, add an empty entry where all
+    // allocation will be on normal system memory
+    if (instance_->pinned_memory_buffers_.empty()) {
+      try {
+        instance_->AddPinnedMemoryBuffer(
+            std::shared_ptr<PinnedMemory>(new PinnedMemory(
+                nullptr, options.pinned_memory_pool_byte_size_)),
+            0);
+      }
+      catch (const std::exception& ex) {
+        return Status(
+            Status::Code::INTERNAL,
+            "Failed to add empty Pinned Memory entry: " +
+                std::string(ex.what()));
+      }
     }
   }
 }
diff --git a/src/server.cc b/src/server.cc
index ae87cb9c0..68b39954f 100644
--- a/src/server.cc
+++ b/src/server.cc
@@ -201,15 +201,11 @@ InferenceServer::Init()
     ready_state_ = ServerReadyState::SERVER_FAILED_TO_INITIALIZE;
     return status;
   }
-  if (pinned_memory_pool_size_ > 0) {
-    PinnedMemoryManager::Options options(pinned_memory_pool_size_);
-    status = PinnedMemoryManager::Create(options);
-    if (!status.IsOk()) {
-      ready_state_ = ServerReadyState::SERVER_FAILED_TO_INITIALIZE;
-      return status;
-    }
-  } else {
-    LOG_INFO << "Pinned memory pool disabled";
+  PinnedMemoryManager::Options options(pinned_memory_pool_size_);
+  status = PinnedMemoryManager::Create(options);
+  if (!status.IsOk()) {
+    ready_state_ = ServerReadyState::SERVER_FAILED_TO_INITIALIZE;
+    return status;
   }
 
 #ifdef TRITON_ENABLE_GPU
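
For context on the validation reworked in `src/infer_request.cc` above: a BYTES/STRING input is a flat byte stream in which every element is a 4-byte length indicator followed by that many bytes of string data (as the hunk's own comment notes). The sketch below is illustrative only — it is not part of this patch, and the helper names are hypothetical — showing that element layout and a single-contiguous-buffer version of the walk that `ValidateBytesInputs` now performs across potentially multiple buffers.

```cpp
// Illustrative sketch (not part of this patch): the length-prefixed
// BYTES/STRING layout that ValidateBytesInputs walks. Helper names here are
// hypothetical and do not exist in the Triton sources.
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Serialize a list of strings into the length-prefixed BYTES layout:
// [uint32 len][len bytes][uint32 len][len bytes]...
std::vector<char>
SerializeBytesTensor(const std::vector<std::string>& elements)
{
  std::vector<char> buffer;
  for (const auto& s : elements) {
    const uint32_t len = static_cast<uint32_t>(s.size());
    const char* len_bytes = reinterpret_cast<const char*>(&len);
    buffer.insert(buffer.end(), len_bytes, len_bytes + sizeof(len));
    buffer.insert(buffer.end(), s.begin(), s.end());
  }
  return buffer;
}

// Count the elements in one contiguous serialized buffer, mirroring the walk
// done for CPU buffers above. Returns -1 if a 4-byte indicator or its string
// data runs past the end of the buffer.
int64_t
CountBytesElements(const char* data, size_t byte_size)
{
  int64_t count = 0;
  size_t offset = 0;
  while (offset + sizeof(uint32_t) <= byte_size) {
    uint32_t len;
    std::memcpy(&len, data + offset, sizeof(len));
    offset += sizeof(len) + len;
    if (offset > byte_size) {
      return -1;  // string data truncated
    }
    ++count;
  }
  return (offset == byte_size) ? count : -1;
}
```

For an input declared with shape [N], exactly N such length-prefixed elements are expected, which is what the `element_checked != element_count` check above enforces; per the DLIS-6820 note, GPU-resident string buffers currently return early and skip this accounting.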