diff --git a/demos/CUDA/BlackScholes/BlackScholes.cu b/demos/CUDA/BlackScholes/BlackScholes.cu index 7f6ddcf44..5e6df2cca 100644 --- a/demos/CUDA/BlackScholes/BlackScholes.cu +++ b/demos/CUDA/BlackScholes/BlackScholes.cu @@ -194,7 +194,7 @@ double computeL1norm_Call(float* S, float* X, float* T, float* d, Greek greek) { return sum_delta / sum_ref; } -double computeL1norm_Pull(float* S, float* X, float* T, float* d, Greek greek) { +double computeL1norm_Put(float* S, float* X, float* T, float* d, Greek greek) { double delta, ref, sum_delta, sum_ref; sum_delta = 0; sum_ref = 0; @@ -337,7 +337,7 @@ int main(int argc, char** argv) { return EXIT_FAILURE; } - // Compute the derivatives of the price of the pull options + // Compute the derivatives of the price of the Put options for (int i = 0; i < OPT_N; i++) { h_CallResultCPU[i] = 0.0f; h_PutResultCPU[i] = -1.0f; @@ -356,34 +356,34 @@ int main(int argc, char** argv) { d_PutResultGPU, d_StockPrice, d_OptionStrike, d_OptionYears); // Verify delta - L1norm = computeL1norm_Pull(h_StockPrice, h_OptionStrike, h_OptionYears, - d_StockPrice, Delta); - printf("L1norm of delta for Pull option = %E\n", L1norm); + L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears, + d_StockPrice, Delta); + printf("L1norm of delta for Put option = %E\n", L1norm); if (L1norm > 1e-5) { printf("Gradient test failed: the difference between the computed and " - "the approximated theoretical delta for Pull option is larger than " + "the approximated theoretical delta for Put option is larger than " "expected\n"); return EXIT_FAILURE; } // Verify derivatives with respect to the Strike price - L1norm = computeL1norm_Pull(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionStrike, dX); - printf("L1norm of derivative of Pull w.r.t. the strike price = %E\n", L1norm); + L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionStrike, dX); + printf("L1norm of derivative of Put w.r.t. the strike price = %E\n", L1norm); if (L1norm > 1e-6) { printf("Gradient test failed: the difference between the computed and the " "approximated theoretcial derivative of " - "PUll w.r.t. the strike price is larger than expected\n"); + "Put w.r.t. the strike price is larger than expected\n"); return EXIT_FAILURE; } // Verify theta - L1norm = computeL1norm_Pull(h_StockPrice, h_OptionStrike, h_OptionYears, - d_OptionYears, Theta); - printf("L1norm of theta for Pull option = %E\n", L1norm); + L1norm = computeL1norm_Put(h_StockPrice, h_OptionStrike, h_OptionYears, + d_OptionYears, Theta); + printf("L1norm of theta for Put option = %E\n", L1norm); if (L1norm > 1e-5) { printf("Gradient test failed: the difference between the computed and the " - "approximated theoretical theta for Pull option is larger than " + "approximated theoretical theta for Put option is larger than " "expected\n"); return EXIT_FAILURE; } diff --git a/demos/CUDA/BlackScholes/BlackScholes_gold.cpp b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp index b2fee109f..6b0d9c8ef 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_gold.cpp +++ b/demos/CUDA/BlackScholes/BlackScholes_gold.cpp @@ -49,7 +49,8 @@ extern "C" double CND(double d) { double cnd = RSQRT2PI * exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); - if (d > 0) cnd = 1.0 - cnd; + if (d > 0) + cnd = 1.0 - cnd; return cnd; } @@ -57,13 +58,13 @@ extern "C" double CND(double d) { //////////////////////////////////////////////////////////////////////////////// // Black-Scholes formula for both call and put //////////////////////////////////////////////////////////////////////////////// -static void BlackScholesBodyCPU(float &callResult, float &putResult, - float Sf, // Stock price - float Xf, // Option strike - float Tf, // Option years - float Rf, // Riskless rate +static void BlackScholesBodyCPU(float& callResult, float& putResult, + float Sf, // Stock price + float Xf, // Option strike + float Tf, // Option years + float Rf, // Riskless rate float Vf // Volatility rate - ) { +) { double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf; double sqrtT = sqrt(T); @@ -81,9 +82,9 @@ static void BlackScholesBodyCPU(float &callResult, float &putResult, //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options //////////////////////////////////////////////////////////////////////////////// -extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult, - float *h_StockPrice, float *h_OptionStrike, - float *h_OptionYears, float Riskfree, +extern "C" void BlackScholesCPU(float* h_CallResult, float* h_PutResult, + float* h_StockPrice, float* h_OptionStrike, + float* h_OptionYears, float Riskfree, float Volatility, int optN) { for (int opt = 0; opt < optN; opt++) BlackScholesBodyCPU(h_CallResult[opt], h_PutResult[opt], h_StockPrice[opt], diff --git a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh index dfcf5c575..26497b8ac 100644 --- a/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh +++ b/demos/CUDA/BlackScholes/BlackScholes_kernel.cuh @@ -41,7 +41,8 @@ __device__ inline float cndGPU(float d) { float cnd = RSQRT2PI * expf(-0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); - if (d > 0) cnd = 1.0f - cnd; + if (d > 0) + cnd = 1.0f - cnd; return cnd; } @@ -49,13 +50,13 @@ __device__ inline float cndGPU(float d) { //////////////////////////////////////////////////////////////////////////////// // Black-Scholes formula for both call and put //////////////////////////////////////////////////////////////////////////////// -__device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult, - float S, // Stock price - float X, // Option strike - float T, // Option years - float R, // Riskless rate +__device__ inline void BlackScholesBodyGPU(float& CallResult, float& PutResult, + float S, // Stock price + float X, // Option strike + float T, // Option years + float R, // Riskless rate float V // Volatility rate - ) { +) { float sqrtT, expRT; float d1, d2, CNDD1, CNDD2; diff --git a/demos/CUDA/BlackScholes/helper/helper_cuda.h b/demos/CUDA/BlackScholes/helper/helper_cuda.h index 22d8eeaa2..6666e8208 100644 --- a/demos/CUDA/BlackScholes/helper/helper_cuda.h +++ b/demos/CUDA/BlackScholes/helper/helper_cuda.h @@ -50,16 +50,16 @@ // CUDA Runtime error messages #ifdef __DRIVER_TYPES_H__ -static const char *_cudaGetErrorEnum(cudaError_t error) { +static const char* _cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); } #endif #ifdef CUDA_DRIVER_API // CUDA Driver API errors -static const char *_cudaGetErrorEnum(CUresult error) { +static const char* _cudaGetErrorEnum(CUresult error) { static char unknown[] = ""; - const char *ret = NULL; + const char* ret = NULL; cuGetErrorName(error, &ret); return ret ? ret : unknown; } @@ -67,37 +67,37 @@ static const char *_cudaGetErrorEnum(CUresult error) { #ifdef CUBLAS_API_H_ // cuBLAS API errors -static const char *_cudaGetErrorEnum(cublasStatus_t error) { +static const char* _cudaGetErrorEnum(cublasStatus_t error) { switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; } return ""; @@ -106,58 +106,58 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error) { #ifdef _CUFFT_H_ // cuFFT API errors -static const char *_cudaGetErrorEnum(cufftResult error) { +static const char* _cudaGetErrorEnum(cufftResult error) { switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; - case CUFFT_NOT_SUPPORTED: - return "CUFFT_NOT_SUPPORTED"; + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; } return ""; @@ -166,34 +166,34 @@ static const char *_cudaGetErrorEnum(cufftResult error) { #ifdef CUSPARSEAPI // cuSPARSE API errors -static const char *_cudaGetErrorEnum(cusparseStatus_t error) { +static const char* _cudaGetErrorEnum(cusparseStatus_t error) { switch (error) { - case CUSPARSE_STATUS_SUCCESS: - return "CUSPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: - return "CUSPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: - return "CUSPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: - return "CUSPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: - return "CUSPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: - return "CUSPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: - return "CUSPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: - return "CUSPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; } return ""; @@ -202,32 +202,32 @@ static const char *_cudaGetErrorEnum(cusparseStatus_t error) { #ifdef CUSOLVER_COMMON_H_ // cuSOLVER API errors -static const char *_cudaGetErrorEnum(cusolverStatus_t error) { +static const char* _cudaGetErrorEnum(cusolverStatus_t error) { switch (error) { - case CUSOLVER_STATUS_SUCCESS: - return "CUSOLVER_STATUS_SUCCESS"; - case CUSOLVER_STATUS_NOT_INITIALIZED: - return "CUSOLVER_STATUS_NOT_INITIALIZED"; - case CUSOLVER_STATUS_ALLOC_FAILED: - return "CUSOLVER_STATUS_ALLOC_FAILED"; - case CUSOLVER_STATUS_INVALID_VALUE: - return "CUSOLVER_STATUS_INVALID_VALUE"; - case CUSOLVER_STATUS_ARCH_MISMATCH: - return "CUSOLVER_STATUS_ARCH_MISMATCH"; - case CUSOLVER_STATUS_MAPPING_ERROR: - return "CUSOLVER_STATUS_MAPPING_ERROR"; - case CUSOLVER_STATUS_EXECUTION_FAILED: - return "CUSOLVER_STATUS_EXECUTION_FAILED"; - case CUSOLVER_STATUS_INTERNAL_ERROR: - return "CUSOLVER_STATUS_INTERNAL_ERROR"; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSOLVER_STATUS_NOT_SUPPORTED: - return "CUSOLVER_STATUS_NOT_SUPPORTED "; - case CUSOLVER_STATUS_ZERO_PIVOT: - return "CUSOLVER_STATUS_ZERO_PIVOT"; - case CUSOLVER_STATUS_INVALID_LICENSE: - return "CUSOLVER_STATUS_INVALID_LICENSE"; + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; } return ""; @@ -236,46 +236,46 @@ static const char *_cudaGetErrorEnum(cusolverStatus_t error) { #ifdef CURAND_H_ // cuRAND API errors -static const char *_cudaGetErrorEnum(curandStatus_t error) { +static const char* _cudaGetErrorEnum(curandStatus_t error) { switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; } return ""; @@ -284,34 +284,34 @@ static const char *_cudaGetErrorEnum(curandStatus_t error) { #ifdef NVJPEGAPI // nvJPEG API errors -static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { +static const char* _cudaGetErrorEnum(nvjpegStatus_t error) { switch (error) { - case NVJPEG_STATUS_SUCCESS: - return "NVJPEG_STATUS_SUCCESS"; + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; - case NVJPEG_STATUS_NOT_INITIALIZED: - return "NVJPEG_STATUS_NOT_INITIALIZED"; + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; - case NVJPEG_STATUS_INVALID_PARAMETER: - return "NVJPEG_STATUS_INVALID_PARAMETER"; + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; - case NVJPEG_STATUS_BAD_JPEG: - return "NVJPEG_STATUS_BAD_JPEG"; + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; - case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: - return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; - case NVJPEG_STATUS_ALLOCATOR_FAILURE: - return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; - case NVJPEG_STATUS_EXECUTION_FAILED: - return "NVJPEG_STATUS_EXECUTION_FAILED"; + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; - case NVJPEG_STATUS_ARCH_MISMATCH: - return "NVJPEG_STATUS_ARCH_MISMATCH"; + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; - case NVJPEG_STATUS_INTERNAL_ERROR: - return "NVJPEG_STATUS_INTERNAL_ERROR"; + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; } return ""; @@ -320,258 +320,258 @@ static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { #ifdef NV_NPPIDEFS_H // NPP API errors -static const char *_cudaGetErrorEnum(NppStatus error) { +static const char* _cudaGetErrorEnum(NppStatus error) { switch (error) { - case NPP_NOT_SUPPORTED_MODE_ERROR: - return "NPP_NOT_SUPPORTED_MODE_ERROR"; + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; - case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; - case NPP_RESIZE_NO_OPERATION_ERROR: - return "NPP_RESIZE_NO_OPERATION_ERROR"; + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; - case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: - return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - case NPP_BAD_ARG_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; - case NPP_COEFF_ERROR: - return "NPP_COEFFICIENT_ERROR"; + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; - case NPP_RECT_ERROR: - return "NPP_RECTANGLE_ERROR"; + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; - case NPP_QUAD_ERROR: - return "NPP_QUADRANGLE_ERROR"; + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; - case NPP_MEM_ALLOC_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; - case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - case NPP_INVALID_INPUT: - return "NPP_INVALID_INPUT"; + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; - case NPP_POINTER_ERROR: - return "NPP_POINTER_ERROR"; + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; - case NPP_WARNING: - return "NPP_WARNING"; + case NPP_WARNING: + return "NPP_WARNING"; - case NPP_ODD_ROI_WARNING: - return "NPP_ODD_ROI_WARNING"; + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; #else - // These are for CUDA 5.5 or higher - case NPP_BAD_ARGUMENT_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; - case NPP_COEFFICIENT_ERROR: - return "NPP_COEFFICIENT_ERROR"; + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; - case NPP_RECTANGLE_ERROR: - return "NPP_RECTANGLE_ERROR"; + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; - case NPP_QUADRANGLE_ERROR: - return "NPP_QUADRANGLE_ERROR"; + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; - case NPP_MEMORY_ALLOCATION_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; - case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - case NPP_INVALID_HOST_POINTER_ERROR: - return "NPP_INVALID_HOST_POINTER_ERROR"; + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; - case NPP_INVALID_DEVICE_POINTER_ERROR: - return "NPP_INVALID_DEVICE_POINTER_ERROR"; + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; #endif - case NPP_LUT_NUMBER_OF_LEVELS_ERROR: - return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; - case NPP_TEXTURE_BIND_ERROR: - return "NPP_TEXTURE_BIND_ERROR"; + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; - case NPP_WRONG_INTERSECTION_ROI_ERROR: - return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; - case NPP_NOT_EVEN_STEP_ERROR: - return "NPP_NOT_EVEN_STEP_ERROR"; + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; - case NPP_INTERPOLATION_ERROR: - return "NPP_INTERPOLATION_ERROR"; + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; - case NPP_RESIZE_FACTOR_ERROR: - return "NPP_RESIZE_FACTOR_ERROR"; + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; - case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: - return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - case NPP_MEMFREE_ERR: - return "NPP_MEMFREE_ERR"; + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; - case NPP_MEMSET_ERR: - return "NPP_MEMSET_ERR"; + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; - case NPP_MEMCPY_ERR: - return "NPP_MEMCPY_ERROR"; + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; - case NPP_MIRROR_FLIP_ERR: - return "NPP_MIRROR_FLIP_ERR"; + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; #else - case NPP_MEMFREE_ERROR: - return "NPP_MEMFREE_ERROR"; + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; - case NPP_MEMSET_ERROR: - return "NPP_MEMSET_ERROR"; + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; - case NPP_MEMCPY_ERROR: - return "NPP_MEMCPY_ERROR"; + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; - case NPP_MIRROR_FLIP_ERROR: - return "NPP_MIRROR_FLIP_ERROR"; + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; #endif - case NPP_ALIGNMENT_ERROR: - return "NPP_ALIGNMENT_ERROR"; + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; - case NPP_STEP_ERROR: - return "NPP_STEP_ERROR"; + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; - case NPP_SIZE_ERROR: - return "NPP_SIZE_ERROR"; + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; - case NPP_NULL_POINTER_ERROR: - return "NPP_NULL_POINTER_ERROR"; + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; - case NPP_CUDA_KERNEL_EXECUTION_ERROR: - return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; - case NPP_NOT_IMPLEMENTED_ERROR: - return "NPP_NOT_IMPLEMENTED_ERROR"; + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; - case NPP_ERROR: - return "NPP_ERROR"; + case NPP_ERROR: + return "NPP_ERROR"; - case NPP_SUCCESS: - return "NPP_SUCCESS"; + case NPP_SUCCESS: + return "NPP_SUCCESS"; - case NPP_WRONG_INTERSECTION_QUAD_WARNING: - return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; - case NPP_MISALIGNED_DST_ROI_WARNING: - return "NPP_MISALIGNED_DST_ROI_WARNING"; + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; - case NPP_AFFINE_QUAD_INCORRECT_WARNING: - return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; - case NPP_DOUBLE_SIZE_WARNING: - return "NPP_DOUBLE_SIZE_WARNING"; + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; - case NPP_WRONG_INTERSECTION_ROI_WARNING: - return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 - /* These are 6.0 or higher */ - case NPP_LUT_PALETTE_BITSIZE_ERROR: - return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; - case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; - case NPP_QUALITY_INDEX_ERROR: - return "NPP_QUALITY_INDEX_ERROR"; + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; - case NPP_CHANNEL_ORDER_ERROR: - return "NPP_CHANNEL_ORDER_ERROR"; + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; - case NPP_ZERO_MASK_VALUE_ERROR: - return "NPP_ZERO_MASK_VALUE_ERROR"; + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; - case NPP_NUMBER_OF_CHANNELS_ERROR: - return "NPP_NUMBER_OF_CHANNELS_ERROR"; + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; - case NPP_COI_ERROR: - return "NPP_COI_ERROR"; + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; - case NPP_DIVISOR_ERROR: - return "NPP_DIVISOR_ERROR"; + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; - case NPP_CHANNEL_ERROR: - return "NPP_CHANNEL_ERROR"; + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; - case NPP_STRIDE_ERROR: - return "NPP_STRIDE_ERROR"; + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; - case NPP_ANCHOR_ERROR: - return "NPP_ANCHOR_ERROR"; + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; - case NPP_MASK_SIZE_ERROR: - return "NPP_MASK_SIZE_ERROR"; + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; - case NPP_MOMENT_00_ZERO_ERROR: - return "NPP_MOMENT_00_ZERO_ERROR"; + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; - case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: - return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; - case NPP_THRESHOLD_ERROR: - return "NPP_THRESHOLD_ERROR"; + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; - case NPP_CONTEXT_MATCH_ERROR: - return "NPP_CONTEXT_MATCH_ERROR"; + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; - case NPP_FFT_FLAG_ERROR: - return "NPP_FFT_FLAG_ERROR"; + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; - case NPP_FFT_ORDER_ERROR: - return "NPP_FFT_ORDER_ERROR"; + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; - case NPP_SCALE_RANGE_ERROR: - return "NPP_SCALE_RANGE_ERROR"; + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; - case NPP_DATA_TYPE_ERROR: - return "NPP_DATA_TYPE_ERROR"; + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; - case NPP_OUT_OFF_RANGE_ERROR: - return "NPP_OUT_OFF_RANGE_ERROR"; + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; - case NPP_DIVIDE_BY_ZERO_ERROR: - return "NPP_DIVIDE_BY_ZERO_ERROR"; + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; - case NPP_RANGE_ERROR: - return "NPP_RANGE_ERROR"; + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; - case NPP_NO_MEMORY_ERROR: - return "NPP_NO_MEMORY_ERROR"; + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; - case NPP_ERROR_RESERVED: - return "NPP_ERROR_RESERVED"; + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; - case NPP_NO_OPERATION_WARNING: - return "NPP_NO_OPERATION_WARNING"; + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; - case NPP_DIVIDE_BY_ZERO_WARNING: - return "NPP_DIVIDE_BY_ZERO_WARNING"; + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; #endif #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 - /* These are 7.0 or higher */ - case NPP_OVERFLOW_ERROR: - return "NPP_OVERFLOW_ERROR"; + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; - case NPP_CORRUPTED_DATA_ERROR: - return "NPP_CORRUPTED_DATA_ERROR"; + case NPP_CORRUPTED_DATA_ERROR: + return "NPP_CORRUPTED_DATA_ERROR"; #endif } @@ -580,8 +580,8 @@ static const char *_cudaGetErrorEnum(NppStatus error) { #endif template -void check(T result, char const *const func, const char *const file, - int const line) { +void check(T result, const char* const func, const char* const file, + const int line) { if (result) { fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), _cudaGetErrorEnum(result), func); @@ -597,7 +597,7 @@ void check(T result, char const *const func, const char *const file, // This will output the proper error string when calling cudaGetLastError #define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) -inline void __getLastCudaError(const char *errorMessage, const char *file, +inline void __getLastCudaError(const char* errorMessage, const char* file, const int line) { cudaError_t err = cudaGetLastError(); @@ -615,7 +615,7 @@ inline void __getLastCudaError(const char *errorMessage, const char *file, // but not exit program incase error detected. #define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) -inline void __printLastCudaError(const char *errorMessage, const char *file, +inline void __printLastCudaError(const char* errorMessage, const char* file, const int line) { cudaError_t err = cudaGetLastError(); @@ -644,48 +644,31 @@ inline int _ConvertSMVer2Cores(int major, int minor) { // Defines for GPU Architecture types (using the SM version to determine // the # of cores per SM typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, + int SM; // 0xMm (hexidecimal notation), M = SM Major version, // and m = SM minor version int Cores; } sSMtoCores; sSMtoCores nGpuArchCoresPerSM[] = { - {0x30, 192}, - {0x32, 192}, - {0x35, 192}, - {0x37, 192}, - {0x50, 128}, - {0x52, 128}, - {0x53, 128}, - {0x60, 64}, - {0x61, 128}, - {0x62, 128}, - {0x70, 64}, - {0x72, 64}, - {0x75, 64}, - {0x80, 64}, - {0x86, 128}, - {0x87, 128}, - {0x89, 128}, - {0x90, 128}, - {-1, -1}}; + {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128}, + {0x52, 128}, {0x53, 128}, {0x60, 64}, {0x61, 128}, {0x62, 128}, + {0x70, 64}, {0x72, 64}, {0x75, 64}, {0x80, 64}, {0x86, 128}, + {0x87, 128}, {0x89, 128}, {0x90, 128}, {-1, -1}}; int index = 0; while (nGpuArchCoresPerSM[index].SM != -1) { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) return nGpuArchCoresPerSM[index].Cores; - } index++; } // If we don't find the values, we default use the previous one // to run properly - printf( - "MapSMtoCores for SM %d.%d is undefined." - " Default to use %d Cores/SM\n", - major, minor, nGpuArchCoresPerSM[index - 1].Cores); + printf("MapSMtoCores for SM %d.%d is undefined." + " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); return nGpuArchCoresPerSM[index - 1].Cores; } @@ -693,51 +676,37 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) { // Defines for GPU Architecture types (using the SM version to determine // the GPU Arch name) typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, + int SM; // 0xMm (hexidecimal notation), M = SM Major version, // and m = SM minor version const char* name; } sSMtoArchName; sSMtoArchName nGpuArchNameSM[] = { - {0x30, "Kepler"}, - {0x32, "Kepler"}, - {0x35, "Kepler"}, - {0x37, "Kepler"}, - {0x50, "Maxwell"}, - {0x52, "Maxwell"}, - {0x53, "Maxwell"}, - {0x60, "Pascal"}, - {0x61, "Pascal"}, - {0x62, "Pascal"}, - {0x70, "Volta"}, - {0x72, "Xavier"}, - {0x75, "Turing"}, - {0x80, "Ampere"}, - {0x86, "Ampere"}, - {0x87, "Ampere"}, - {0x89, "Ada"}, - {0x90, "Hopper"}, + {0x30, "Kepler"}, {0x32, "Kepler"}, {0x35, "Kepler"}, + {0x37, "Kepler"}, {0x50, "Maxwell"}, {0x52, "Maxwell"}, + {0x53, "Maxwell"}, {0x60, "Pascal"}, {0x61, "Pascal"}, + {0x62, "Pascal"}, {0x70, "Volta"}, {0x72, "Xavier"}, + {0x75, "Turing"}, {0x80, "Ampere"}, {0x86, "Ampere"}, + {0x87, "Ampere"}, {0x89, "Ada"}, {0x90, "Hopper"}, {-1, "Graphics Device"}}; int index = 0; while (nGpuArchNameSM[index].SM != -1) { - if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) return nGpuArchNameSM[index].name; - } index++; } // If we don't find the values, we default use the previous one // to run properly - printf( - "MapSMtoArchName for SM %d.%d is undefined." - " Default to use %s\n", - major, minor, nGpuArchNameSM[index - 1].name); + printf("MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); return nGpuArchNameSM[index - 1].name; } - // end of GPU Architecture definitions +// end of GPU Architecture definitions #ifdef __CUDA_RUNTIME_H__ // General GPU Device CUDA Initialization @@ -746,15 +715,13 @@ inline int gpuDeviceInit(int devID) { checkCudaErrors(cudaGetDeviceCount(&device_count)); if (device_count == 0) { - fprintf(stderr, - "gpuDeviceInit() CUDA error: " - "no devices supporting CUDA.\n"); + fprintf(stderr, "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); exit(EXIT_FAILURE); } - if (devID < 0) { + if (devID < 0) devID = 0; - } if (devID > device_count - 1) { fprintf(stderr, "\n"); @@ -769,13 +736,15 @@ inline int gpuDeviceInit(int devID) { } int computeMode = -1, major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + checkCudaErrors( + cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); + checkCudaErrors( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors( + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); if (computeMode == cudaComputeModeProhibited) { - fprintf(stderr, - "Error: device is running in , no threads can use cudaSetDevice().\n"); + fprintf(stderr, "Error: device is running in , no threads can use cudaSetDevice().\n"); return -1; } @@ -785,7 +754,8 @@ inline int gpuDeviceInit(int devID) { } checkCudaErrors(cudaSetDevice(devID)); - printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, + _ConvertSMVer2ArchName(major, minor)); return devID; } @@ -801,9 +771,8 @@ inline int gpuGetMaxGflopsDeviceId() { checkCudaErrors(cudaGetDeviceCount(&device_count)); if (device_count == 0) { - fprintf(stderr, - "gpuGetMaxGflopsDeviceId() CUDA error:" - " no devices supporting CUDA.\n"); + fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); exit(EXIT_FAILURE); } @@ -812,35 +781,40 @@ inline int gpuGetMaxGflopsDeviceId() { while (current_device < device_count) { int computeMode = -1, major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, + current_device)); + checkCudaErrors(cudaDeviceGetAttribute( + &major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute( + &minor, cudaDevAttrComputeCapabilityMinor, current_device)); // If this GPU is not running on Compute Mode prohibited, // then we can add it to the list if (computeMode != cudaComputeModeProhibited) { - if (major == 9999 && minor == 9999) { + if (major == 9999 && minor == 9999) sm_per_multiproc = 1; - } else { - sm_per_multiproc = - _ConvertSMVer2Cores(major, minor); - } + else + sm_per_multiproc = _ConvertSMVer2Cores(major, minor); int multiProcessorCount = 0, clockRate = 0; - checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device)); - cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device); + checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, + cudaDevAttrMultiProcessorCount, + current_device)); + cudaError_t result = cudaDeviceGetAttribute( + &clockRate, cudaDevAttrClockRate, current_device); if (result != cudaSuccess) { // If cudaDevAttrClockRate attribute is not supported we // set clockRate as 1, to consider GPU with most SMs and CUDA Cores. - if(result == cudaErrorInvalidValue) { + if (result == cudaErrorInvalidValue) { clockRate = 1; - } - else { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__, - static_cast(result), _cudaGetErrorEnum(result)); + } else { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, + __LINE__, static_cast(result), + _cudaGetErrorEnum(result)); exit(EXIT_FAILURE); } } - uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + uint64_t compute_perf = + (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; if (compute_perf > max_compute_perf) { max_compute_perf = compute_perf; @@ -854,9 +828,8 @@ inline int gpuGetMaxGflopsDeviceId() { } if (devices_prohibited == device_count) { - fprintf(stderr, - "gpuGetMaxGflopsDeviceId() CUDA error:" - " all devices have compute mode prohibited.\n"); + fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); exit(EXIT_FAILURE); } @@ -864,7 +837,7 @@ inline int gpuGetMaxGflopsDeviceId() { } // Initialization code to find the best CUDA Device -inline int findCudaDevice(int argc, const char **argv) { +inline int findCudaDevice(int argc, const char** argv) { int devID = 0; // If the command-line has a device number specified, use it @@ -887,11 +860,12 @@ inline int findCudaDevice(int argc, const char **argv) { devID = gpuGetMaxGflopsDeviceId(); checkCudaErrors(cudaSetDevice(devID)); int major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - devID, _ConvertSMVer2ArchName(major, minor), major, minor); - + checkCudaErrors(cudaDeviceGetAttribute( + &major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors(cudaDeviceGetAttribute( + &minor, cudaDevAttrComputeCapabilityMinor, devID)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, + _ConvertSMVer2ArchName(major, minor), major, minor); } return devID; @@ -912,18 +886,23 @@ inline int findIntegratedGPU() { // Find the integrated GPU which is compute capable while (current_device < device_count) { int computeMode = -1, integrated = -1; - checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, + current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, + current_device)); // If GPU is integrated and is not running on Compute Mode prohibited, // then cuda can map to GLES resource if (integrated && (computeMode != cudaComputeModeProhibited)) { checkCudaErrors(cudaSetDevice(current_device)); int major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute( + &major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute( + &minor, cudaDevAttrComputeCapabilityMinor, current_device)); printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + current_device, _ConvertSMVer2ArchName(major, minor), major, + minor); return current_device; } else { @@ -934,9 +913,8 @@ inline int findIntegratedGPU() { } if (devices_prohibited == device_count) { - fprintf(stderr, - "CUDA error:" - " No GLES-CUDA Interop capable GPU found.\n"); + fprintf(stderr, "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); exit(EXIT_FAILURE); } @@ -949,25 +927,25 @@ inline bool checkCudaCapabilities(int major_version, int minor_version) { int major = 0, minor = 0; checkCudaErrors(cudaGetDevice(&dev)); - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev)); + checkCudaErrors( + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + checkCudaErrors( + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev)); if ((major > major_version) || - (major == major_version && - minor >= minor_version)) { + (major == major_version && minor >= minor_version)) { printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, _ConvertSMVer2ArchName(major, minor), major, minor); return true; } else { - printf( - " No GPU device was found that can support " - "CUDA compute capability %d.%d.\n", - major_version, minor_version); + printf(" No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); return false; } } #endif - // end of CUDA Helper Functions +// end of CUDA Helper Functions -#endif // COMMON_HELPER_CUDA_H_ +#endif // COMMON_HELPER_CUDA_H_ diff --git a/demos/CUDA/BlackScholes/helper/helper_functions.h b/demos/CUDA/BlackScholes/helper/helper_functions.h index 14dc77fc7..bd40ba43e 100644 --- a/demos/CUDA/BlackScholes/helper/helper_functions.h +++ b/demos/CUDA/BlackScholes/helper/helper_functions.h @@ -48,12 +48,12 @@ #include // includes, timer, string parsing, image helpers -#include // helper functions for image compare, dump, data comparisons -#include // helper functions for string parsing -#include // helper functions for timers +#include // helper functions for image compare, dump, data comparisons +#include // helper functions for string parsing +#include // helper functions for timers #ifndef EXIT_WAIVED #define EXIT_WAIVED 2 #endif -#endif // COMMON_HELPER_FUNCTIONS_H_ +#endif // COMMON_HELPER_FUNCTIONS_H_ diff --git a/demos/CUDA/BlackScholes/helper/helper_image.h b/demos/CUDA/BlackScholes/helper/helper_image.h index 24b10462d..33fbf1b62 100644 --- a/demos/CUDA/BlackScholes/helper/helper_image.h +++ b/demos/CUDA/BlackScholes/helper/helper_image.h @@ -61,55 +61,49 @@ const unsigned int PGMHeaderSize = 0x40; // types //! Data converter from unsigned char / unsigned byte to type T -template -struct ConverterFromUByte; +template struct ConverterFromUByte; //! Data converter from unsigned char / unsigned byte -template <> -struct ConverterFromUByte { +template <> struct ConverterFromUByte { //! Conversion operator //! @return converted value //! @param val value to convert - float operator()(const unsigned char &val) { + float operator()(const unsigned char& val) { return static_cast(val); } }; //! Data converter from unsigned char / unsigned byte to float -template <> -struct ConverterFromUByte { +template <> struct ConverterFromUByte { //! Conversion operator //! @return converted value //! @param val value to convert - float operator()(const unsigned char &val) { + float operator()(const unsigned char& val) { return static_cast(val) / 255.0f; } }; //! Data converter from unsigned char / unsigned byte to type T -template -struct ConverterToUByte; +template struct ConverterToUByte; //! Data converter from unsigned char / unsigned byte to unsigned int -template <> -struct ConverterToUByte { +template <> struct ConverterToUByte { //! Conversion operator (essentially a passthru //! @return converted value //! @param val value to convert - unsigned char operator()(const unsigned char &val) { return val; } + unsigned char operator()(const unsigned char& val) { return val; } }; //! Data converter from unsigned char / unsigned byte to unsigned int -template <> -struct ConverterToUByte { +template <> struct ConverterToUByte { //! Conversion operator //! @return converted value //! @param val value to convert - unsigned char operator()(const float &val) { + unsigned char operator()(const float& val) { return static_cast(val * 255.0f); } }; -} // namespace helper_image_internal +} // namespace helper_image_internal #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #ifndef FOPEN @@ -133,9 +127,9 @@ struct ConverterToUByte { #endif #endif -inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w, - unsigned int *h, unsigned int *channels) { - FILE *fp = NULL; +inline bool __loadPPM(const char* file, unsigned char** data, unsigned int* w, + unsigned int* h, unsigned int* channels) { + FILE* fp = NULL; if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) { std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; @@ -173,36 +167,31 @@ inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w, return false; } - if (header[0] == '#') { + if (header[0] == '#') continue; - } - if (i == 0) { + if (i == 0) i += SSCANF(header, "%u %u %u", &width, &height, &maxval); - } else if (i == 1) { + else if (i == 1) i += SSCANF(header, "%u %u", &height, &maxval); - } else if (i == 2) { + else if (i == 2) i += SSCANF(header, "%u", &maxval); - } } // check if given handle for the data is initialized if (NULL != *data) { - if (*w != width || *h != height) { + if (*w != width || *h != height) std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; - } } else { - *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height * - *channels); + *data = (unsigned char*)malloc(sizeof(unsigned char) * width * height * + *channels); *w = width; *h = height; } // read and close file - if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == - 0) { + if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == 0) std::cerr << "__LoadPPM() read data returned error." << std::endl; - } fclose(fp); @@ -210,22 +199,20 @@ inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w, } template -inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w, - unsigned int *h) { - unsigned char *idata = NULL; +inline bool sdkLoadPGM(const char* file, T** data, unsigned int* w, + unsigned int* h) { + unsigned char* idata = NULL; unsigned int channels; - if (true != __loadPPM(file, &idata, w, h, &channels)) { + if (true != __loadPPM(file, &idata, w, h, &channels)) return false; - } unsigned int size = *w * *h * channels; // initialize mem if necessary // the correct size is checked / set in loadPGMc() - if (NULL == *data) { - *data = reinterpret_cast(malloc(sizeof(T) * size)); - } + if (NULL == *data) + *data = reinterpret_cast(malloc(sizeof(T) * size)); // copy and cast data std::transform(idata, idata + size, *data, @@ -237,18 +224,18 @@ inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w, } template -inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w, - unsigned int *h) { - unsigned char *idata = 0; +inline bool sdkLoadPPM4(const char* file, T** data, unsigned int* w, + unsigned int* h) { + unsigned char* idata = 0; unsigned int channels; if (__loadPPM(file, &idata, w, h, &channels)) { // pad 4th component int size = *w * *h; // keep the original pointer - unsigned char *idata_orig = idata; - *data = reinterpret_cast(malloc(sizeof(T) * size * 4)); - unsigned char *ptr = *data; + unsigned char* idata_orig = idata; + *data = reinterpret_cast(malloc(sizeof(T) * size * 4)); + unsigned char* ptr = *data; for (int i = 0; i < size; i++) { *ptr++ = *idata++; @@ -265,7 +252,7 @@ inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w, } } -inline bool __savePPM(const char *file, unsigned char *data, unsigned int w, +inline bool __savePPM(const char* file, unsigned char* data, unsigned int w, unsigned int h, unsigned int channels) { assert(NULL != data); assert(w > 0); @@ -289,9 +276,8 @@ inline bool __savePPM(const char *file, unsigned char *data, unsigned int w, fh << w << "\n" << h << "\n" << 0xff << std::endl; - for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) { + for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) fh << data[i]; - } fh.flush(); @@ -306,10 +292,10 @@ inline bool __savePPM(const char *file, unsigned char *data, unsigned int w, } template -inline bool sdkSavePGM(const char *file, T *data, unsigned int w, +inline bool sdkSavePGM(const char* file, T* data, unsigned int w, unsigned int h) { unsigned int size = w * h; - unsigned char *idata = (unsigned char *)malloc(sizeof(unsigned char) * size); + unsigned char* idata = (unsigned char*)malloc(sizeof(unsigned char) * size); std::transform(data, data + size, idata, helper_image_internal::ConverterToUByte()); @@ -323,13 +309,13 @@ inline bool sdkSavePGM(const char *file, T *data, unsigned int w, return result; } -inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w, +inline bool sdkSavePPM4ub(const char* file, unsigned char* data, unsigned int w, unsigned int h) { // strip 4th component int size = w * h; - unsigned char *ndata = - (unsigned char *)malloc(sizeof(unsigned char) * size * 3); - unsigned char *ptr = ndata; + unsigned char* ndata = + (unsigned char*)malloc(sizeof(unsigned char) * size * 3); + unsigned char* ptr = ndata; for (int i = 0; i < size; i++) { *ptr++ = *data++; @@ -352,7 +338,7 @@ inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w, //! @param len number of data elements in data, -1 on error ////////////////////////////////////////////////////////////////////////////// template -inline bool sdkReadFile(const char *filename, T **data, unsigned int *len, +inline bool sdkReadFile(const char* filename, T** data, unsigned int* len, bool verbose) { // check input arguments assert(NULL != filename); @@ -362,7 +348,7 @@ inline bool sdkReadFile(const char *filename, T **data, unsigned int *len, std::vector data_read; // open file for reading - FILE *fh = NULL; + FILE* fh = NULL; // check if filestream is valid if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) { @@ -394,7 +380,7 @@ inline bool sdkReadFile(const char *filename, T **data, unsigned int *len, } } else { // allocate storage for the data read - *data = reinterpret_cast(malloc(sizeof(T) * data_read.size())); + *data = reinterpret_cast(malloc(sizeof(T) * data_read.size())); // store signal size *len = static_cast(data_read.size()); } @@ -414,7 +400,7 @@ inline bool sdkReadFile(const char *filename, T **data, unsigned int *len, //! @param len number of data elements in data, -1 on error ////////////////////////////////////////////////////////////////////////////// template -inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, +inline bool sdkReadFileBlocks(const char* filename, T** data, unsigned int* len, unsigned int block_num, unsigned int block_size, bool verbose) { // check input arguments @@ -422,7 +408,7 @@ inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, assert(NULL != len); // open file for reading - FILE *fh = fopen(filename, "rb"); + FILE* fh = fopen(filename, "rb"); if (fh == NULL && verbose) { std::cerr << "sdkReadFile() : Opening file failed." << std::endl; @@ -431,7 +417,7 @@ inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, // check if the given handle is already initialized // allocate storage for the data read - data[block_num] = reinterpret_cast(malloc(block_size)); + data[block_num] = reinterpret_cast(malloc(block_size)); // read all data elements fseek(fh, block_num * block_size, SEEK_SET); @@ -451,7 +437,7 @@ inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, //! @param epsilon epsilon for comparison ////////////////////////////////////////////////////////////////////////////// template -inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len, +inline bool sdkWriteFile(const char* filename, const T* data, unsigned int len, const S epsilon, bool verbose, bool append = false) { assert(NULL != filename); assert(NULL != data); @@ -476,9 +462,8 @@ inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len, // check if filestream is valid if (!fh.good()) { - if (verbose) { + if (verbose) std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; - } return false; } @@ -487,15 +472,13 @@ inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len, fh << "# " << epsilon << "\n"; // write data - for (unsigned int i = 0; (i < len) && (fh.good()); ++i) { + for (unsigned int i = 0; (i < len) && (fh.good()); ++i) fh << data[i] << ' '; - } // Check if writing succeeded if (!fh.good()) { - if (verbose) { + if (verbose) std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; - } return false; } @@ -515,7 +498,7 @@ inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len, //! @param epsilon epsilon to use for the comparison ////////////////////////////////////////////////////////////////////////////// template -inline bool compareData(const T *reference, const T *data, +inline bool compareData(const T* reference, const T* data, const unsigned int len, const S epsilon, const float threshold) { assert(epsilon >= 0); @@ -569,7 +552,7 @@ inline bool compareData(const T *reference, const T *data, //! @param epsilon threshold % of (# of bytes) for pass/fail ////////////////////////////////////////////////////////////////////////////// template -inline bool compareDataAsFloatThreshold(const T *reference, const T *data, +inline bool compareDataAsFloatThreshold(const T* reference, const T* data, const unsigned int len, const S epsilon, const float threshold) { assert(epsilon >= 0); @@ -585,15 +568,13 @@ inline bool compareDataAsFloatThreshold(const T *reference, const T *data, bool comp = (diff < max_error); result &= comp; - if (!comp) { + if (!comp) error_count++; - } } if (threshold == 0.0f) { - if (error_count) { + if (error_count) printf("total # of errors = %d\n", error_count); - } return (error_count == 0) ? true : false; } else { @@ -607,18 +588,18 @@ inline bool compareDataAsFloatThreshold(const T *reference, const T *data, } } -inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) { +inline void sdkDumpBin(void* data, unsigned int bytes, const char* filename) { printf("sdkDumpBin: <%s>\n", filename); - FILE *fp; + FILE* fp; FOPEN(fp, filename, "wb"); fwrite(data, bytes, 1, fp); fflush(fp); fclose(fp); } -inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, +inline bool sdkCompareBin2BinUint(const char* src_file, const char* ref_file, unsigned int nelements, const float epsilon, - const float threshold, char *exec_path) { + const float threshold, char* exec_path) { unsigned int *src_buffer, *ref_buffer; FILE *src_fp = NULL, *ref_fp = NULL; @@ -631,7 +612,7 @@ inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, error_count++; } - char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + char* ref_file_path = sdkFindFilePath(ref_file, exec_path); if (ref_file_path == NULL) { printf("compareBin2Bin unable to find <%s> in <%s>\n", @@ -642,42 +623,37 @@ inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, printf(" FAILED\n"); error_count++; - if (src_fp) { + if (src_fp) fclose(src_fp); - } - if (ref_fp) { + if (ref_fp) fclose(ref_fp); - } } else { if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { - printf( - "compareBin2Bin " - " unable to open ref_file: %s\n", - ref_file_path); + printf("compareBin2Bin " + " unable to open ref_file: %s\n", + ref_file_path); error_count++; } if (src_fp && ref_fp) { - src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); - ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + src_buffer = (unsigned int*)malloc(nelements * sizeof(unsigned int)); + ref_buffer = (unsigned int*)malloc(nelements * sizeof(unsigned int)); fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); - printf( - "> compareBin2Bin nelements=%d," - " epsilon=%4.2f, threshold=%4.2f\n", - nelements, epsilon, threshold); + printf("> compareBin2Bin nelements=%d," + " epsilon=%4.2f, threshold=%4.2f\n", + nelements, epsilon, threshold); printf(" src_file <%s>, size=%d bytes\n", src_file, static_cast(fsize)); printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, static_cast(fsize)); if (!compareData(ref_buffer, src_buffer, nelements, - epsilon, threshold)) { + epsilon, threshold)) error_count++; - } fclose(src_fp); fclose(ref_fp); @@ -685,28 +661,25 @@ inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, free(src_buffer); free(ref_buffer); } else { - if (src_fp) { + if (src_fp) fclose(src_fp); - } - if (ref_fp) { + if (ref_fp) fclose(ref_fp); - } } } - if (error_count == 0) { + if (error_count == 0) printf(" OK\n"); - } else { + else printf(" FAILURE: %d errors...\n", (unsigned int)error_count); - } - return (error_count == 0); // returns true if all pixels pass + return (error_count == 0); // returns true if all pixels pass } -inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, +inline bool sdkCompareBin2BinFloat(const char* src_file, const char* ref_file, unsigned int nelements, const float epsilon, - const float threshold, char *exec_path) { + const float threshold, char* exec_path) { float *src_buffer = NULL, *ref_buffer = NULL; FILE *src_fp = NULL, *ref_fp = NULL; size_t fsize = 0; @@ -718,7 +691,7 @@ inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, error_count = 1; } - char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + char* ref_file_path = sdkFindFilePath(ref_file, exec_path); if (ref_file_path == NULL) { printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, @@ -729,13 +702,11 @@ inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, printf(" FAILED\n"); error_count++; - if (src_fp) { + if (src_fp) fclose(src_fp); - } - if (ref_fp) { + if (ref_fp) fclose(ref_fp); - } } else { if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { printf("compareBin2Bin unable to open ref_file: %s\n", @@ -744,13 +715,12 @@ inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, } if (src_fp && ref_fp) { - src_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); - ref_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + src_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + ref_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); - printf( - "> compareBin2Bin nelements=%d, epsilon=%4.2f," - " threshold=%4.2f\n", - nelements, epsilon, threshold); + printf("> compareBin2Bin nelements=%d, epsilon=%4.2f," + " threshold=%4.2f\n", + nelements, epsilon, threshold); fsize = fread(src_buffer, sizeof(float), nelements, src_fp); printf(" src_file <%s>, size=%d bytes\n", src_file, static_cast(fsize * sizeof(float))); @@ -759,9 +729,8 @@ inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, static_cast(fsize * sizeof(float))); if (!compareDataAsFloatThreshold( - ref_buffer, src_buffer, nelements, epsilon, threshold)) { + ref_buffer, src_buffer, nelements, epsilon, threshold)) error_count++; - } fclose(src_fp); fclose(ref_fp); @@ -769,26 +738,23 @@ inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, free(src_buffer); free(ref_buffer); } else { - if (src_fp) { + if (src_fp) fclose(src_fp); - } - if (ref_fp) { + if (ref_fp) fclose(ref_fp); - } } } - if (error_count == 0) { + if (error_count == 0) printf(" OK\n"); - } else { + else printf(" FAILURE: %d errors...\n", (unsigned int)error_count); - } - return (error_count == 0); // returns true if all pixels pass + return (error_count == 0); // returns true if all pixels pass } -inline bool sdkCompareL2fe(const float *reference, const float *data, +inline bool sdkCompareL2fe(const float* reference, const float* data, const unsigned int len, const float epsilon) { assert(epsilon >= 0); @@ -825,24 +791,24 @@ inline bool sdkCompareL2fe(const float *reference, const float *data, return result; } -inline bool sdkLoadPPMub(const char *file, unsigned char **data, - unsigned int *w, unsigned int *h) { +inline bool sdkLoadPPMub(const char* file, unsigned char** data, + unsigned int* w, unsigned int* h) { unsigned int channels; return __loadPPM(file, data, w, h, &channels); } -inline bool sdkLoadPPM4ub(const char *file, unsigned char **data, - unsigned int *w, unsigned int *h) { - unsigned char *idata = 0; +inline bool sdkLoadPPM4ub(const char* file, unsigned char** data, + unsigned int* w, unsigned int* h) { + unsigned char* idata = 0; unsigned int channels; if (__loadPPM(file, &idata, w, h, &channels)) { // pad 4th component int size = *w * *h; // keep the original pointer - unsigned char *idata_orig = idata; - *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4); - unsigned char *ptr = *data; + unsigned char* idata_orig = idata; + *data = (unsigned char*)malloc(sizeof(unsigned char) * size * 4); + unsigned char* ptr = *data; for (int i = 0; i < size; i++) { *ptr++ = *idata++; @@ -859,7 +825,7 @@ inline bool sdkLoadPPM4ub(const char *file, unsigned char **data, } } -inline bool sdkComparePPM(const char *src_file, const char *ref_file, +inline bool sdkComparePPM(const char* src_file, const char* ref_file, const float epsilon, const float threshold, bool verboseErrors) { unsigned char *src_data, *ref_data; @@ -911,25 +877,22 @@ inline bool sdkComparePPM(const char *src_file, const char *ref_file, } if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon, - threshold) == false) { + threshold) == false) error_count = 1; - } if (error_count == 0) { - if (verboseErrors) { + if (verboseErrors) std::cerr << " OK\n\n"; - } } else { - if (verboseErrors) { + if (verboseErrors) std::cerr << " FAILURE! " << error_count << " errors...\n\n"; - } } // returns true if all pixels pass return (error_count == 0) ? true : false; } -inline bool sdkComparePGM(const char *src_file, const char *ref_file, +inline bool sdkComparePGM(const char* src_file, const char* ref_file, const float epsilon, const float threshold, bool verboseErrors) { unsigned char *src_data = 0, *ref_data = 0; @@ -980,22 +943,19 @@ inline bool sdkComparePGM(const char *src_file, const char *ref_file, << threshold * 100 << "%)\n"; if (compareData(ref_data, src_data, src_width * src_height, epsilon, - threshold) == false) { + threshold) == false) error_count = 1; - } if (error_count == 0) { - if (verboseErrors) { + if (verboseErrors) std::cerr << " OK\n\n"; - } } else { - if (verboseErrors) { + if (verboseErrors) std::cerr << " FAILURE! " << error_count << " errors...\n\n"; - } } // returns true if all pixels pass return (error_count == 0) ? true : false; } -#endif // COMMON_HELPER_IMAGE_H_ +#endif // COMMON_HELPER_IMAGE_H_ diff --git a/demos/CUDA/BlackScholes/helper/helper_string.h b/demos/CUDA/BlackScholes/helper/helper_string.h index 39a1b3805..f6c25f659 100644 --- a/demos/CUDA/BlackScholes/helper/helper_string.h +++ b/demos/CUDA/BlackScholes/helper/helper_string.h @@ -29,9 +29,9 @@ #ifndef COMMON_HELPER_STRING_H_ #define COMMON_HELPER_STRING_H_ +#include #include #include -#include #include #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) @@ -60,7 +60,7 @@ #ifndef SPRINTF #define SPRINTF sprintf_s #endif -#else // Linux Includes +#else // Linux Includes #include #include @@ -93,28 +93,27 @@ #endif // CUDA Utility Helper Functions -inline int stringRemoveDelimiter(char delimiter, const char *string) { +inline int stringRemoveDelimiter(char delimiter, const char* string) { int string_start = 0; - while (string[string_start] == delimiter) { + while (string[string_start] == delimiter) string_start++; - } - if (string_start >= static_cast(strlen(string) - 1)) { + if (string_start >= static_cast(strlen(string) - 1)) return 0; - } return string_start; } -inline int getFileExtension(char *filename, char **extension) { +inline int getFileExtension(char* filename, char** extension) { int string_length = static_cast(strlen(filename)); - while (filename[string_length--] != '.') { - if (string_length == 0) break; - } + while (filename[string_length--] != '.') + if (string_length == 0) + break; - if (string_length > 0) string_length += 2; + if (string_length > 0) + string_length += 2; if (string_length == 0) *extension = NULL; @@ -124,16 +123,16 @@ inline int getFileExtension(char *filename, char **extension) { return string_length; } -inline bool checkCmdLineFlag(const int argc, const char **argv, - const char *string_ref) { +inline bool checkCmdLineFlag(const int argc, const char** argv, + const char* string_ref) { bool bFound = false; if (argc >= 1) { for (int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; + const char* string_argv = &argv[i][string_start]; - const char *equal_pos = strchr(string_argv, '='); + const char* equal_pos = strchr(string_argv, '='); int argv_length = static_cast( equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); @@ -152,14 +151,14 @@ inline bool checkCmdLineFlag(const int argc, const char **argv, // This function wraps the CUDA Driver API into a template function template -inline bool getCmdLineArgumentValue(const int argc, const char **argv, - const char *string_ref, T *value) { +inline bool getCmdLineArgumentValue(const int argc, const char** argv, + const char* string_ref, T* value) { bool bFound = false; if (argc >= 1) { for (int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; + const char* string_argv = &argv[i][string_start]; int length = static_cast(strlen(string_ref)); if (!STRNCASECMP(string_argv, string_ref, length)) { @@ -177,15 +176,15 @@ inline bool getCmdLineArgumentValue(const int argc, const char **argv, return bFound; } -inline int getCmdLineArgumentInt(const int argc, const char **argv, - const char *string_ref) { +inline int getCmdLineArgumentInt(const int argc, const char** argv, + const char* string_ref) { bool bFound = false; int value = -1; if (argc >= 1) { for (int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; + const char* string_argv = &argv[i][string_start]; int length = static_cast(strlen(string_ref)); if (!STRNCASECMP(string_argv, string_ref, length)) { @@ -202,22 +201,21 @@ inline int getCmdLineArgumentInt(const int argc, const char **argv, } } - if (bFound) { + if (bFound) return value; - } else { + else return 0; - } } -inline float getCmdLineArgumentFloat(const int argc, const char **argv, - const char *string_ref) { +inline float getCmdLineArgumentFloat(const int argc, const char** argv, + const char* string_ref) { bool bFound = false; float value = -1; if (argc >= 1) { for (int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; + const char* string_argv = &argv[i][string_start]; int length = static_cast(strlen(string_ref)); if (!STRNCASECMP(string_argv, string_ref, length)) { @@ -234,22 +232,21 @@ inline float getCmdLineArgumentFloat(const int argc, const char **argv, } } - if (bFound) { + if (bFound) return value; - } else { + else return 0; - } } -inline bool getCmdLineArgumentString(const int argc, const char **argv, - const char *string_ref, - char **string_retval) { +inline bool getCmdLineArgumentString(const int argc, const char** argv, + const char* string_ref, + char** string_retval) { bool bFound = false; if (argc >= 1) { for (int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - char *string_argv = const_cast(&argv[i][string_start]); + char* string_argv = const_cast(&argv[i][string_start]); int length = static_cast(strlen(string_ref)); if (!STRNCASECMP(string_argv, string_ref, length)) { @@ -260,9 +257,8 @@ inline bool getCmdLineArgumentString(const int argc, const char **argv, } } - if (!bFound) { + if (!bFound) *string_retval = NULL; - } return bFound; } @@ -275,8 +271,8 @@ inline bool getCmdLineArgumentString(const int argc, const char **argv, //! @param filename name of the file //! @param executable_path optional absolute path of the executable ////////////////////////////////////////////////////////////////////////////// -inline char *sdkFindFilePath(const char *filename, - const char *executable_path) { +inline char* sdkFindFilePath(const char* filename, + const char* executable_path) { // defines a variable that is replaced with the name of the // executable @@ -284,77 +280,95 @@ inline char *sdkFindFilePath(const char *filename, // input data, or JIT source files) The origin for the relative search may be // the .exe file, a .bat file launching an .exe, a browser .exe launching the // .exe or .bat, etc - const char *searchPath[] = { - "./", // same dir - "./data/", // same dir - - "../../../../Samples//", // up 4 in tree - "../../../Samples//", // up 3 in tree - "../../Samples//", // up 2 in tree - - "../../../../Samples//data/", // up 4 in tree - "../../../Samples//data/", // up 3 in tree - "../../Samples//data/", // up 2 in tree - - "../../../../Samples/0_Introduction//", // up 4 in tree - "../../../Samples/0_Introduction//", // up 3 in tree - "../../Samples/0_Introduction//", // up 2 in tree - - "../../../../Samples/1_Utilities//", // up 4 in tree - "../../../Samples/1_Utilities//", // up 3 in tree - "../../Samples/1_Utilities//", // up 2 in tree - - "../../../../Samples/2_Concepts_and_Techniques//", // up 4 in tree - "../../../Samples/2_Concepts_and_Techniques//", // up 3 in tree - "../../Samples/2_Concepts_and_Techniques//", // up 2 in tree - - "../../../../Samples/3_CUDA_Features//", // up 4 in tree - "../../../Samples/3_CUDA_Features//", // up 3 in tree - "../../Samples/3_CUDA_Features//", // up 2 in tree - - "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree - "../../../Samples/4_CUDA_Libraries//", // up 3 in tree - "../../Samples/4_CUDA_Libraries//", // up 2 in tree - - "../../../../Samples/5_Domain_Specific//", // up 4 in tree - "../../../Samples/5_Domain_Specific//", // up 3 in tree - "../../Samples/5_Domain_Specific//", // up 2 in tree - - "../../../../Samples/6_Performance//", // up 4 in tree - "../../../Samples/6_Performance//", // up 3 in tree - "../../Samples/6_Performance//", // up 2 in tree - - "../../../../Samples/0_Introduction//data/", // up 4 in tree - "../../../Samples/0_Introduction//data/", // up 3 in tree - "../../Samples/0_Introduction//data/", // up 2 in tree - - "../../../../Samples/1_Utilities//data/", // up 4 in tree - "../../../Samples/1_Utilities//data/", // up 3 in tree - "../../Samples/1_Utilities//data/", // up 2 in tree - - "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree - "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree - "../../Samples/2_Concepts_and_Techniques//data/", // up 2 in tree - - "../../../../Samples/3_CUDA_Features//data/", // up 4 in tree - "../../../Samples/3_CUDA_Features//data/", // up 3 in tree - "../../Samples/3_CUDA_Features//data/", // up 2 in tree - - "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in tree - "../../../Samples/4_CUDA_Libraries//data/", // up 3 in tree - "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree - - "../../../../Samples/5_Domain_Specific//data/", // up 4 in tree - "../../../Samples/5_Domain_Specific//data/", // up 3 in tree - "../../Samples/5_Domain_Specific//data/", // up 2 in tree - - "../../../../Samples/6_Performance//data/", // up 4 in tree - "../../../Samples/6_Performance//data/", // up 3 in tree - "../../Samples/6_Performance//data/", // up 2 in tree - - "../../../../Common/data/", // up 4 in tree - "../../../Common/data/", // up 3 in tree - "../../Common/data/" // up 2 in tree + const char* searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples//", // up 4 in tree + "../../../Samples//", // up 3 in tree + "../../Samples//", // up 2 in tree + + "../../../../Samples//data/", // up 4 in tree + "../../../Samples//data/", // up 3 in tree + "../../Samples//data/", // up 2 in tree + + "../../../../Samples/0_Introduction//", // up 4 in tree + "../../../Samples/0_Introduction//", // up 3 in tree + "../../Samples/0_Introduction//", // up 2 in tree + + "../../../../Samples/1_Utilities//", // up 4 in tree + "../../../Samples/1_Utilities//", // up 3 in tree + "../../Samples/1_Utilities//", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//", // up + // 4 + // in + // tree + "../../../Samples/2_Concepts_and_Techniques//", // up 3 + // in + // tree + "../../Samples/2_Concepts_and_Techniques//", // up 2 in + // tree + + "../../../../Samples/3_CUDA_Features//", // up 4 in tree + "../../../Samples/3_CUDA_Features//", // up 3 in tree + "../../Samples/3_CUDA_Features//", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//", // up 3 in tree + "../../Samples/4_CUDA_Libraries//", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//", // up 4 in + // tree + "../../../Samples/5_Domain_Specific//", // up 3 in tree + "../../Samples/5_Domain_Specific//", // up 2 in tree + + "../../../../Samples/6_Performance//", // up 4 in tree + "../../../Samples/6_Performance//", // up 3 in tree + "../../Samples/6_Performance//", // up 2 in tree + + "../../../../Samples/0_Introduction//data/", // up 4 in + // tree + "../../../Samples/0_Introduction//data/", // up 3 in tree + "../../Samples/0_Introduction//data/", // up 2 in tree + + "../../../../Samples/1_Utilities//data/", // up 4 in tree + "../../../Samples/1_Utilities//data/", // up 3 in tree + "../../Samples/1_Utilities//data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//data/", // up 2 + // in + // tree + + "../../../../Samples/3_CUDA_Features//data/", // up 4 in + // tree + "../../../Samples/3_CUDA_Features//data/", // up 3 in + // tree + "../../Samples/3_CUDA_Features//data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in + // tree + "../../../Samples/4_CUDA_Libraries//data/", // up 3 in + // tree + "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//data/", // up 4 + // in + // tree + "../../../Samples/5_Domain_Specific//data/", // up 3 in + // tree + "../../Samples/5_Domain_Specific//data/", // up 2 in tree + + "../../../../Samples/6_Performance//data/", // up 4 in + // tree + "../../../Samples/6_Performance//data/", // up 3 in tree + "../../Samples/6_Performance//data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in tree }; // Extract the executable name @@ -381,7 +395,7 @@ inline char *sdkFindFilePath(const char *filename, } // Loop over all search paths and return the first hit - for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char*); ++i) { std::string path(searchPath[i]); size_t executable_name_pos = path.find(""); @@ -403,21 +417,20 @@ inline char *sdkFindFilePath(const char *filename, // Test if the file exists path.append(filename); - FILE *fp; + FILE* fp; FOPEN(fp, path.c_str(), "rb"); if (fp != NULL) { fclose(fp); // File found // returning an allocated array here for backwards compatibility reasons - char *file_path = reinterpret_cast(malloc(path.length() + 1)); + char* file_path = reinterpret_cast(malloc(path.length() + 1)); STRCPY(file_path, path.length() + 1, path.c_str()); return file_path; } - if (fp) { + if (fp) fclose(fp); - } } // File not found @@ -425,4 +438,4 @@ inline char *sdkFindFilePath(const char *filename, return 0; } -#endif // COMMON_HELPER_STRING_H_ +#endif // COMMON_HELPER_STRING_H_ diff --git a/demos/CUDA/BlackScholes/helper/helper_timer.h b/demos/CUDA/BlackScholes/helper/helper_timer.h index 5aceaf8a9..3869bf8ca 100644 --- a/demos/CUDA/BlackScholes/helper/helper_timer.h +++ b/demos/CUDA/BlackScholes/helper/helper_timer.h @@ -42,11 +42,11 @@ // Definition of the StopWatch Interface, this is used if we don't want to use // the CUT functions But rather in a self contained class interface class StopWatchInterface { - public: +public: StopWatchInterface() {} virtual ~StopWatchInterface() {} - public: +public: //! Start time measurement virtual void start() = 0; @@ -78,23 +78,17 @@ class StopWatchInterface { //! Windows specific implementation of StopWatch class StopWatchWin : public StopWatchInterface { - public: +public: //! Constructor, default StopWatchWin() - : start_time(), - end_time(), - diff_time(0.0f), - total_time(0.0f), - running(false), - clock_sessions(0), - freq(0), - freq_set(false) { + : start_time(), end_time(), diff_time(0.0f), total_time(0.0f), + running(false), clock_sessions(0), freq(0), freq_set(false) { if (!freq_set) { // helper variable LARGE_INTEGER temp; // get the tick frequency from the OS - QueryPerformanceFrequency(reinterpret_cast(&temp)); + QueryPerformanceFrequency(reinterpret_cast(&temp)); // convert to type in which it is needed freq = (static_cast(temp.QuadPart)) / 1000.0; @@ -107,7 +101,7 @@ class StopWatchWin : public StopWatchInterface { // Destructor ~StopWatchWin() {} - public: +public: //! Start time measurement inline void start(); @@ -126,7 +120,7 @@ class StopWatchWin : public StopWatchInterface { //! _stopped_ (ie finished sessions) and the current total time inline float getAverageTime(); - private: +private: // member variables //! Start of measurement @@ -160,7 +154,7 @@ class StopWatchWin : public StopWatchInterface { //! Start time measurement //////////////////////////////////////////////////////////////////////////////// inline void StopWatchWin::start() { - QueryPerformanceCounter(reinterpret_cast(&start_time)); + QueryPerformanceCounter(reinterpret_cast(&start_time)); running = true; } @@ -169,7 +163,7 @@ inline void StopWatchWin::start() { //! variable. Also increment the number of times this clock has been run. //////////////////////////////////////////////////////////////////////////////// inline void StopWatchWin::stop() { - QueryPerformanceCounter(reinterpret_cast(&end_time)); + QueryPerformanceCounter(reinterpret_cast(&end_time)); diff_time = static_cast(((static_cast(end_time.QuadPart) - static_cast(start_time.QuadPart)) / freq)); @@ -188,9 +182,8 @@ inline void StopWatchWin::reset() { total_time = 0; clock_sessions = 0; - if (running) { - QueryPerformanceCounter(reinterpret_cast(&start_time)); - } + if (running) + QueryPerformanceCounter(reinterpret_cast(&start_time)); } //////////////////////////////////////////////////////////////////////////////// @@ -205,7 +198,7 @@ inline float StopWatchWin::getTime() { if (running) { LARGE_INTEGER temp; - QueryPerformanceCounter(reinterpret_cast(&temp)); + QueryPerformanceCounter(reinterpret_cast(&temp)); retval += static_cast(((static_cast(temp.QuadPart) - static_cast(start_time.QuadPart)) / freq)); @@ -224,24 +217,21 @@ inline float StopWatchWin::getAverageTime() { #else // Declarations for Stopwatch on Linux and Mac OSX // includes, system -#include #include +#include //! Windows specific implementation of StopWatch class StopWatchLinux : public StopWatchInterface { - public: +public: //! Constructor, default StopWatchLinux() - : start_time(), - diff_time(0.0), - total_time(0.0), - running(false), + : start_time(), diff_time(0.0), total_time(0.0), running(false), clock_sessions(0) {} // Destructor virtual ~StopWatchLinux() {} - public: +public: //! Start time measurement inline void start(); @@ -260,13 +250,13 @@ class StopWatchLinux : public StopWatchInterface { //! _stopped_ (ie finished sessions) and the current total time inline float getAverageTime(); - private: +private: // helper functions //! Get difference between start time and current time inline float getDiffTime(); - private: +private: // member variables //! Start of measurement @@ -316,9 +306,8 @@ inline void StopWatchLinux::reset() { total_time = 0; clock_sessions = 0; - if (running) { + if (running) gettimeofday(&start_time, 0); - } } //////////////////////////////////////////////////////////////////////////////// @@ -331,9 +320,8 @@ inline float StopWatchLinux::getTime() { // Return the TOTAL time to date float retval = total_time; - if (running) { + if (running) retval += getDiffTime(); - } return retval; } @@ -356,7 +344,7 @@ inline float StopWatchLinux::getDiffTime() { return static_cast(1000.0 * (t_time.tv_sec - start_time.tv_sec) + (0.001 * (t_time.tv_usec - start_time.tv_usec))); } -#endif // WIN32 +#endif // WIN32 //////////////////////////////////////////////////////////////////////////////// //! Timer functionality exported @@ -366,13 +354,13 @@ inline float StopWatchLinux::getDiffTime() { //! @return true if a time has been created, otherwise false //! @param name of the new timer, 0 if the creation failed //////////////////////////////////////////////////////////////////////////////// -inline bool sdkCreateTimer(StopWatchInterface **timer_interface) { +inline bool sdkCreateTimer(StopWatchInterface** timer_interface) { // printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - *timer_interface = reinterpret_cast(new StopWatchWin()); + *timer_interface = reinterpret_cast(new StopWatchWin()); #else *timer_interface = - reinterpret_cast(new StopWatchLinux()); + reinterpret_cast(new StopWatchLinux()); #endif return (*timer_interface != NULL) ? true : false; } @@ -382,7 +370,7 @@ inline bool sdkCreateTimer(StopWatchInterface **timer_interface) { //! @return true if a time has been deleted, otherwise false //! @param name of the timer to delete //////////////////////////////////////////////////////////////////////////////// -inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) { +inline bool sdkDeleteTimer(StopWatchInterface** timer_interface) { // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); if (*timer_interface) { delete *timer_interface; @@ -396,11 +384,10 @@ inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) { //! Start the time with name \a name //! @param name name of the timer to start //////////////////////////////////////////////////////////////////////////////// -inline bool sdkStartTimer(StopWatchInterface **timer_interface) { +inline bool sdkStartTimer(StopWatchInterface** timer_interface) { // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) { + if (*timer_interface) (*timer_interface)->start(); - } return true; } @@ -409,11 +396,10 @@ inline bool sdkStartTimer(StopWatchInterface **timer_interface) { //! Stop the time with name \a name. Does not reset. //! @param name name of the timer to stop //////////////////////////////////////////////////////////////////////////////// -inline bool sdkStopTimer(StopWatchInterface **timer_interface) { +inline bool sdkStopTimer(StopWatchInterface** timer_interface) { // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) { + if (*timer_interface) (*timer_interface)->stop(); - } return true; } @@ -422,11 +408,10 @@ inline bool sdkStopTimer(StopWatchInterface **timer_interface) { //! Resets the timer's counter. //! @param name name of the timer to reset. //////////////////////////////////////////////////////////////////////////////// -inline bool sdkResetTimer(StopWatchInterface **timer_interface) { +inline bool sdkResetTimer(StopWatchInterface** timer_interface) { // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) { + if (*timer_interface) (*timer_interface)->reset(); - } return true; } @@ -438,14 +423,13 @@ inline bool sdkResetTimer(StopWatchInterface **timer_interface) { //! Excludes the current running time if the timer is currently running. //! @param name name of the timer to return the time of //////////////////////////////////////////////////////////////////////////////// -inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) { +inline float sdkGetAverageTimerValue(StopWatchInterface** timer_interface) { // printf("sdkGetAverageTimerValue called object %08x\n", (void // *)*timer_interface); - if (*timer_interface) { + if (*timer_interface) return (*timer_interface)->getAverageTime(); - } else { + else return 0.0f; - } } //////////////////////////////////////////////////////////////////////////////// @@ -453,13 +437,12 @@ inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) { //! or timer creation. //! @param name name of the timer to obtain the value of. //////////////////////////////////////////////////////////////////////////////// -inline float sdkGetTimerValue(StopWatchInterface **timer_interface) { +inline float sdkGetTimerValue(StopWatchInterface** timer_interface) { // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); - if (*timer_interface) { + if (*timer_interface) return (*timer_interface)->getTime(); - } else { + else return 0.0f; - } } -#endif // COMMON_HELPER_TIMER_H_ +#endif // COMMON_HELPER_TIMER_H_ diff --git a/demos/CUDA/TensorContraction.cu b/demos/CUDA/TensorContraction.cu index 8e1efaf9b..9d324a065 100644 --- a/demos/CUDA/TensorContraction.cu +++ b/demos/CUDA/TensorContraction.cu @@ -17,236 +17,250 @@ typedef unsigned long long int size_type; -__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDimA, const int contractDimB) { - size_type A_a, A_b, A_c, B_d, B_e, B_f; +__device__ void computeStartStep(size_type& A_start, size_type& A_step, + size_type& B_start, size_type& B_step, + const int idx, const size_type A_dim[3], + const size_type B_dim[3], + const int contractDimA, + const int contractDimB) { + size_type A_a, A_b, A_c, B_d, B_e, B_f; - switch (contractDimA) { - case 0: - A_b = idx / (A_dim[2] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3]); - A_c = (idx / (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) % A_dim[2]; - A_start = 0 + A_b * A_dim[2] + A_c; - A_step = A_dim[1] * A_dim[2]; - break; - case 1: - A_a = idx / (A_dim[2] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3]); - A_c = (idx / (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) % A_dim[2]; - A_start = A_a * A_dim[1] * A_dim[2] + 0 + A_c; - A_step = A_dim[2]; - break; - case 2: - A_a = idx / (A_dim[1] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3]); - A_b = (idx / (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) % A_dim[1]; - A_start = A_a * A_dim[1] * A_dim[2] + A_b * A_dim[2]; - A_step = 1; - break; - } + switch (contractDimA) { + case 0: + A_b = idx / (A_dim[2] * B_dim[(contractDimB + 1) % 3] * + B_dim[(contractDimB + 2) % 3]); + A_c = (idx / + (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) % + A_dim[2]; + A_start = 0 + A_b * A_dim[2] + A_c; + A_step = A_dim[1] * A_dim[2]; + break; + case 1: + A_a = idx / (A_dim[2] * B_dim[(contractDimB + 1) % 3] * + B_dim[(contractDimB + 2) % 3]); + A_c = (idx / + (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) % + A_dim[2]; + A_start = A_a * A_dim[1] * A_dim[2] + 0 + A_c; + A_step = A_dim[2]; + break; + case 2: + A_a = idx / (A_dim[1] * B_dim[(contractDimB + 1) % 3] * + B_dim[(contractDimB + 2) % 3]); + A_b = (idx / + (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) % + A_dim[1]; + A_start = A_a * A_dim[1] * A_dim[2] + A_b * A_dim[2]; + A_step = 1; + break; + } - switch (contractDimB) { - case 0: - B_e = (idx / B_dim[2]) % B_dim[1]; - B_f = idx % B_dim[2]; - B_start = 0 + B_e * B_dim[2] + B_f; - B_step = B_dim[1] * B_dim[2]; - break; - case 1: - B_d = (idx / B_dim[2]) % B_dim[0]; - B_f = idx % B_dim[2]; - B_start = B_d * B_dim[2] * B_dim[1] + 0 + B_f; - B_step = B_dim[2]; - break; - case 2: - B_d = (idx / B_dim[1]) % B_dim[0]; - B_e = idx % B_dim[1]; - B_start = B_d * B_dim[2] * B_dim[1] + B_e * B_dim[2]; - B_step = 1; - break; - } + switch (contractDimB) { + case 0: + B_e = (idx / B_dim[2]) % B_dim[1]; + B_f = idx % B_dim[2]; + B_start = 0 + B_e * B_dim[2] + B_f; + B_step = B_dim[1] * B_dim[2]; + break; + case 1: + B_d = (idx / B_dim[2]) % B_dim[0]; + B_f = idx % B_dim[2]; + B_start = B_d * B_dim[2] * B_dim[1] + 0 + B_f; + B_step = B_dim[2]; + break; + case 2: + B_d = (idx / B_dim[1]) % B_dim[0]; + B_e = idx % B_dim[1]; + B_start = B_d * B_dim[2] * B_dim[1] + B_e * B_dim[2]; + B_step = 1; + break; + } } -__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDimA, const int contractDimB) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void tensorContraction3D(float* C, const float* A, const float* B, + const size_type* A_dim, + const size_type* B_dim, + const int contractDimA, + const int contractDimB) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; - // Each thread computes one element of the output tensor - int totalElements = A_dim[(contractDimA + 1) % 3] * A_dim[(contractDimA + 2) % 3] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3]; - if (idx < totalElements) { - size_type A_start, B_start, A_step, B_step; - size_type A_a, A_b, A_c, B_d, B_e, B_f; + // Each thread computes one element of the output tensor + int totalElements = + A_dim[(contractDimA + 1) % 3] * A_dim[(contractDimA + 2) % 3] * + B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3]; + if (idx < totalElements) { + size_type A_start, B_start, A_step, B_step; + size_type A_a, A_b, A_c, B_d, B_e, B_f; - computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDimA, contractDimB); - - float sum = 0.0f; - for (int i = 0; i < A_dim[contractDimA]; i++) { // A_dim[contractDimA] == B_dim[contractDimB] - sum += A[A_start + (i * A_step)] * B[B_start + (i * B_step)]; - } + computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, + contractDimA, contractDimB); - C[idx] = sum; - } + float sum = 0.0f; + for (int i = 0; i < A_dim[contractDimA]; + i++) // A_dim[contractDimA] == B_dim[contractDimB] + sum += A[A_start + (i * A_step)] * B[B_start + (i * B_step)]; + + C[idx] = sum; + } } -void launchTensorContraction3D(float* C, const float* A, const float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) { - float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr; - - const size_type A_size = D1 * D2 * D3 * sizeof(float); - const size_type B_size = D3 * D4 * D5 * sizeof(float); - const size_type C_size = D1 * D2 * D4 * D5 * sizeof(float); - - // Allocate device memory and copy data from host to device - cudaMalloc(&d_A, A_size); - cudaMalloc(&d_B, B_size); - cudaMalloc(&d_C, C_size); - cudaMemcpy(d_A, A, A_size, cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, B_size, cudaMemcpyHostToDevice); - - const size_type A_dim[3] = {D1, D2, D3}; - const size_type B_dim[3] = {D3, D4, D5}; - - size_type *d_A_dim = nullptr, *d_B_dim = nullptr; - cudaMalloc(&d_A_dim, 3 * sizeof(size_type)); - cudaMalloc(&d_B_dim, 3 * sizeof(size_type)); - cudaMemcpy(d_A_dim, A_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice); - cudaMemcpy(d_B_dim, B_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice); - - // Launch the kernel - tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, /*contractDimA=*/2, /*contractDimB=*/0); - - // Copy the result from device to host - cudaMemcpy(C, d_C, C_size, cudaMemcpyDeviceToHost); - - // Free device memory - cudaFree(d_A); - cudaFree(d_B); - cudaFree(d_C); - cudaFree(d_A_dim); - cudaFree(d_B_dim); +void launchTensorContraction3D(float* C, const float* A, const float* B, + const size_type D1, const size_type D2, + const size_type D3, const size_type D4, + const size_type D5) { + float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr; + + const size_type A_size = D1 * D2 * D3 * sizeof(float); + const size_type B_size = D3 * D4 * D5 * sizeof(float); + const size_type C_size = D1 * D2 * D4 * D5 * sizeof(float); + + // Allocate device memory and copy data from host to device + cudaMalloc(&d_A, A_size); + cudaMalloc(&d_B, B_size); + cudaMalloc(&d_C, C_size); + cudaMemcpy(d_A, A, A_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, B_size, cudaMemcpyHostToDevice); + + const size_type A_dim[3] = {D1, D2, D3}; + const size_type B_dim[3] = {D3, D4, D5}; + + size_type *d_A_dim = nullptr, *d_B_dim = nullptr; + cudaMalloc(&d_A_dim, 3 * sizeof(size_type)); + cudaMalloc(&d_B_dim, 3 * sizeof(size_type)); + cudaMemcpy(d_A_dim, A_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice); + cudaMemcpy(d_B_dim, B_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice); + + // Launch the kernel + tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, + /*contractDimA=*/2, /*contractDimB=*/0); + + // Copy the result from device to host + cudaMemcpy(C, d_C, C_size, cudaMemcpyDeviceToHost); + + // Free device memory + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + cudaFree(d_A_dim); + cudaFree(d_B_dim); } int main() { - const size_type D1 = 2, D2 = 3, D3 = 4, D4 = 3, D5 = 2; - - float A[D1][D2][D3] = { - { {1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12} }, - { {13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24} } - }; - - float B[D3][D4][D5] = { - { {1, 2}, {3, 4}, {5, 6} }, - { {7, 8}, {9, 10}, {11, 12} }, - { {13, 14}, {15, 16}, {17, 18} }, - { {19, 20}, {21, 22}, {23, 24} } - }; - - float C[D1][D2][D4][D5] = {0}; // Result tensor - - // Compute the gradient - auto tensor_grad = clad::gradient(launchTensorContraction3D, "C, A, B"); - - // Initialize the gradient inputs - float gradC[D1][D2][D4][D5] = { - { - { {1, 1}, {1, 1}, {1, 1} }, - { {1, 1}, {1, 1}, {1, 1} }, - { {1, 1}, {1, 1}, {1, 1} } - }, - { - { {1, 1}, {1, 1}, {1, 1} }, - { {1, 1}, {1, 1}, {1, 1} }, - { {1, 1}, {1, 1}, {1, 1} } - } - }; - float gradA[D1][D2][D3] = {0}; - float gradB[D3][D4][D5] = {0}; - - // Execute tensor contraction and its gradient - tensor_grad.execute(&C[0][0][0][0], &A[0][0][0], &B[0][0][0], D1, D2, D3, D4, D5, &gradC[0][0][0][0], &gradA[0][0][0], &gradB[0][0][0]); - - // Print the result - std::cout << "Result C:\n"; - for (int i = 0; i < D1; ++i) { - for (int j = 0; j < D2; ++j) { - for (int k = 0; k < D4; ++k) { - for (int l = 0; l < D5; ++l) { - std::cout << C[i][j][k][l] << " "; - } - std::cout << "\n"; - } - std::cout << "\n"; - } + const size_type D1 = 2, D2 = 3, D3 = 4, D4 = 3, D5 = 2; + + float A[D1][D2][D3] = { + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, + {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}; + + float B[D3][D4][D5] = {{{1, 2}, {3, 4}, {5, 6}}, + {{7, 8}, {9, 10}, {11, 12}}, + {{13, 14}, {15, 16}, {17, 18}}, + {{19, 20}, {21, 22}, {23, 24}}}; + + float C[D1][D2][D4][D5] = {0}; // Result tensor + + // Compute the gradient + auto tensor_grad = clad::gradient(launchTensorContraction3D, "C, A, B"); + + // Initialize the gradient inputs + float gradC[D1][D2][D4][D5] = {{{{1, 1}, {1, 1}, {1, 1}}, + {{1, 1}, {1, 1}, {1, 1}}, + {{1, 1}, {1, 1}, {1, 1}}}, + {{{1, 1}, {1, 1}, {1, 1}}, + {{1, 1}, {1, 1}, {1, 1}}, + {{1, 1}, {1, 1}, {1, 1}}}}; + float gradA[D1][D2][D3] = {0}; + float gradB[D3][D4][D5] = {0}; + + // Execute tensor contraction and its gradient + tensor_grad.execute(&C[0][0][0][0], &A[0][0][0], &B[0][0][0], D1, D2, D3, D4, + D5, &gradC[0][0][0][0], &gradA[0][0][0], &gradB[0][0][0]); + + // Print the result + std::cout << "Result C:\n"; + for (int i = 0; i < D1; ++i) { + for (int j = 0; j < D2; ++j) { + for (int k = 0; k < D4; ++k) { + for (int l = 0; l < D5; ++l) + std::cout << C[i][j][k][l] << " "; std::cout << "\n"; + } + std::cout << "\n"; } - - std::cout << "Result C_grad w.r.t. A:\n"; - for (int i = 0; i < D1; ++i) { - for (int j = 0; j < D2; ++j) { - for (int k = 0; k < D3; ++k) { - std::cout << gradA[i][j][k] << " "; - } - std::cout << "\n"; - } - std::cout << "\n"; + std::cout << "\n"; + } + + std::cout << "Result C_grad w.r.t. A:\n"; + for (int i = 0; i < D1; ++i) { + for (int j = 0; j < D2; ++j) { + for (int k = 0; k < D3; ++k) + std::cout << gradA[i][j][k] << " "; + std::cout << "\n"; } + std::cout << "\n"; + } - std::cout << "Result C_grad w.r.t. B:\n"; - for (int i = 0; i < D3; ++i) { - for (int j = 0; j < D4; ++j) { - for (int k = 0; k < D5; ++k) { - std::cout << gradB[i][j][k] << " "; - } - std::cout << "\n"; - } - std::cout << "\n"; + std::cout << "Result C_grad w.r.t. B:\n"; + for (int i = 0; i < D3; ++i) { + for (int j = 0; j < D4; ++j) { + for (int k = 0; k < D5; ++k) + std::cout << gradB[i][j][k] << " "; + std::cout << "\n"; } + std::cout << "\n"; + } - return 0; + return 0; } // CHECK-EXEC: Result C: -// CHECK-NEXT: 130 140 -// CHECK-NEXT: 150 160 -// CHECK-NEXT: 170 180 +// CHECK-NEXT: 130 140 +// CHECK-NEXT: 150 160 +// CHECK-NEXT: 170 180 // CHECK-NEXT: -// CHECK-NEXT: 290 316 -// CHECK-NEXT: 342 368 -// CHECK-NEXT: 394 420 +// CHECK-NEXT: 290 316 +// CHECK-NEXT: 342 368 +// CHECK-NEXT: 394 420 // CHECK-NEXT: -// CHECK-NEXT: 450 492 -// CHECK-NEXT: 534 576 -// CHECK-NEXT: 618 660 +// CHECK-NEXT: 450 492 +// CHECK-NEXT: 534 576 +// CHECK-NEXT: 618 660 // CHECK-NEXT: // CHECK-NEXT: -// CHECK-NEXT: 610 668 -// CHECK-NEXT: 726 784 -// CHECK-NEXT: 842 900 +// CHECK-NEXT: 610 668 +// CHECK-NEXT: 726 784 +// CHECK-NEXT: 842 900 // CHECK-NEXT: -// CHECK-NEXT: 770 844 -// CHECK-NEXT: 918 992 -// CHECK-NEXT: 1066 1140 +// CHECK-NEXT: 770 844 +// CHECK-NEXT: 918 992 +// CHECK-NEXT: 1066 1140 // CHECK-NEXT: -// CHECK-NEXT: 930 1020 -// CHECK-NEXT: 1110 1200 -// CHECK-NEXT: 1290 1380 +// CHECK-NEXT: 930 1020 +// CHECK-NEXT: 1110 1200 +// CHECK-NEXT: 1290 1380 // CHECK-EXEC: Result C_grad w.r.t. A: -// CHECK-NEXT: 21 57 93 129 -// CHECK-NEXT: 21 57 93 129 -// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 // CHECK-NEXT: -// CHECK-NEXT: 21 57 93 129 -// CHECK-NEXT: 21 57 93 129 -// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 +// CHECK-NEXT: 21 57 93 129 // CHECK-NEXT: // CHECK-EXEC: Result C_grad w.r.t. B: -// CHECK-NEXT: 66 66 -// CHECK-NEXT: 66 66 -// CHECK-NEXT: 66 66 +// CHECK-NEXT: 66 66 +// CHECK-NEXT: 66 66 +// CHECK-NEXT: 66 66 // CHECK-NEXT: -// CHECK-NEXT: 72 72 -// CHECK-NEXT: 72 72 -// CHECK-NEXT: 72 72 +// CHECK-NEXT: 72 72 +// CHECK-NEXT: 72 72 +// CHECK-NEXT: 72 72 // CHECK-NEXT: -// CHECK-NEXT: 78 78 -// CHECK-NEXT: 78 78 -// CHECK-NEXT: 78 78 +// CHECK-NEXT: 78 78 +// CHECK-NEXT: 78 78 +// CHECK-NEXT: 78 78 // CHECK-NEXT: -// CHECK-NEXT: 84 84 -// CHECK-NEXT: 84 84 -// CHECK-NEXT: 84 84 \ No newline at end of file +// CHECK-NEXT: 84 84 +// CHECK-NEXT: 84 84 +// CHECK-NEXT: 84 84 \ No newline at end of file