From 56bc4492463cc5bfdbbb0852a3b4eabf70c0461b Mon Sep 17 00:00:00 2001
From: kchristin
Date: Sun, 3 Nov 2024 22:53:19 +0200
Subject: [PATCH] Remove tensor demo

---
 test/CUDA/tensor_demo.cu | 245 ---------------------------------------
 1 file changed, 245 deletions(-)
 delete mode 100644 test/CUDA/tensor_demo.cu

diff --git a/test/CUDA/tensor_demo.cu b/test/CUDA/tensor_demo.cu
deleted file mode 100644
index 43bcc32ee..000000000
--- a/test/CUDA/tensor_demo.cu
+++ /dev/null
@@ -1,245 +0,0 @@
-// RUN: %cladclang_cuda -I%S/../../include --cuda-path=%cudapath \
-// RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -otensor_demo.out %s
-// RUN: ./tensor_demo.out | %filecheck_exec %s
-// REQUIRES: cuda-runtime
-
-
-#include "clad/Differentiator/Differentiator.h"
-
-typedef unsigned long long int size_type;
-
-__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDimA, const int contractDimB) {
-  size_type A_a, A_b, A_c, B_d, B_e, B_f;
-
-  switch (contractDimA) {
-    case 0:
-      A_b = idx / (A_dim[2] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3]);
-      A_c = (idx / (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) % A_dim[2];
-      A_start = 0 + A_b * A_dim[2] + A_c;
-      A_step = A_dim[1] * A_dim[2];
-      break;
-    case 1:
-      A_a = idx / (A_dim[2] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3]);
-      A_c = (idx / (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) % A_dim[2];
-      A_start = A_a * A_dim[1] * A_dim[2] + 0 + A_c;
-      A_step = A_dim[2];
-      break;
-    case 2:
-      A_a = idx / (A_dim[1] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3]);
-      A_b = (idx / (B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3])) % A_dim[1];
-      A_start = A_a * A_dim[1] * A_dim[2] + A_b * A_dim[2];
-      A_step = 1;
-      break;
-  }
-
-  switch (contractDimB) {
-    case 0:
-      B_e = (idx / B_dim[2]) % B_dim[1];
-      B_f = idx % B_dim[2];
-      B_start = 0 + B_e * B_dim[2] + B_f;
-      B_step = B_dim[1] * B_dim[2];
-      break;
-    case 1:
-      B_d = (idx / B_dim[2]) % B_dim[0];
-      B_f = idx % B_dim[2];
-      B_start = B_d * B_dim[2] * B_dim[1] + 0 + B_f;
-      B_step = B_dim[2];
-      break;
-    case 2:
-      B_d = (idx / B_dim[1]) % B_dim[0];
-      B_e = idx % B_dim[1];
-      B_start = B_d * B_dim[2] * B_dim[1] + B_e * B_dim[2];
-      B_step = 1;
-      break;
-  }
-}
-
-__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDimA, const int contractDimB) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // Each thread computes one element of the output tensor
-  int totalElements = A_dim[(contractDimA + 1) % 3] * A_dim[(contractDimA + 2) % 3] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3];
-  if (idx < totalElements) {
-    size_type A_start, B_start, A_step, B_step;
-    size_type A_a, A_b, A_c, B_d, B_e, B_f;
-
-    computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDimA, contractDimB);
-
-    float sum = 0.0f;
-    for (int i = 0; i < A_dim[contractDimA]; i++) { // A_dim[contractDimA] == B_dim[contractDimB]
-      sum += A[A_start + (i * A_step)] * B[B_start + (i * B_step)];
-    }
-
-    C[idx] = sum;
-  }
-}
-
-void launchTensorContraction3D(float* C, const float* A, const float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) {
-  float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
-
-  const size_type A_size = D1 * D2 * D3 * sizeof(float);
-  const size_type B_size = D3 * D4 * D5 * sizeof(float);
-  const size_type C_size = D1 * D2 * D4 * D5 * sizeof(float);
-
-  // Allocate device memory and copy data from host to device
-  cudaMalloc(&d_A, A_size);
-  cudaMalloc(&d_B, B_size);
-  cudaMalloc(&d_C, C_size);
-  cudaMemcpy(d_A, A, A_size, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_B, B, B_size, cudaMemcpyHostToDevice);
-
-  const size_type A_dim[3] = {D1, D2, D3};
-  const size_type B_dim[3] = {D3, D4, D5};
-
-  size_type *d_A_dim = nullptr, *d_B_dim = nullptr;
-  cudaMalloc(&d_A_dim, 3 * sizeof(size_type));
-  cudaMalloc(&d_B_dim, 3 * sizeof(size_type));
-  cudaMemcpy(d_A_dim, A_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_B_dim, B_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);
-
-  // Launch the kernel
-  tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, /*contractDimA=*/2, /*contractDimB=*/0);
-
-  // Copy the result from device to host
-  cudaMemcpy(C, d_C, C_size, cudaMemcpyDeviceToHost);
-
-  // Free device memory
-  cudaFree(d_A);
-  cudaFree(d_B);
-  cudaFree(d_C);
-  cudaFree(d_A_dim);
-  cudaFree(d_B_dim);
-}
-
-int main() {
-  const size_type D1 = 2, D2 = 3, D3 = 4, D4 = 3, D5 = 2;
-
-  float A[D1][D2][D3] = {
-    {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
-    {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}
-  };
-
-  float B[D3][D4][D5] = {
-    {{1, 2}, {3, 4}, {5, 6}},
-    {{7, 8}, {9, 10}, {11, 12}},
-    {{13, 14}, {15, 16}, {17, 18}},
-    {{19, 20}, {21, 22}, {23, 24}}
-  };
-
-  float C[D1][D2][D4][D5] = {0}; // Result tensor
-
-  launchTensorContraction3D(&C[0][0][0][0], &A[0][0][0], &B[0][0][0], D1, D2, D3, D4, D5);
-
-  // Compute the gradient
-  auto tensor_grad = clad::gradient(launchTensorContraction3D, "C, A, B");
-
-  // Initialize the gradient inputs
-  float gradC[D1][D2][D4][D5] = {
-    {
-      { {1, 1}, {1, 1}, {1, 1} },
-      { {1, 1}, {1, 1}, {1, 1} },
-      { {1, 1}, {1, 1}, {1, 1} }
-    },
-    {
-      { {1, 1}, {1, 1}, {1, 1} },
-      { {1, 1}, {1, 1}, {1, 1} },
-      { {1, 1}, {1, 1}, {1, 1} }
-    }
-  };
-  float gradA[D1][D2][D3] = {0};
-  float gradB[D3][D4][D5] = {0};
-
-  // Execute tensor contraction and its gradient
-  tensor_grad.execute(&C[0][0][0][0], &A[0][0][0], &B[0][0][0], D1, D2, D3, D4, D5, &gradC[0][0][0][0], &gradA[0][0][0], &gradB[0][0][0]);
-
-  // Print the result
-  std::cout << "Result C:\n";
-  for (int i = 0; i < D1; ++i) {
-    for (int j = 0; j < D2; ++j) {
-      for (int k = 0; k < D4; ++k) {
-        for (int l = 0; l < D5; ++l) {
-          std::cout << C[i][j][k][l] << " ";
-        }
-        std::cout << "\n";
-      }
-      std::cout << "\n";
-    }
-    std::cout << "\n";
-  }
-
-  std::cout << "Result C_grad w.r.t. A:\n";
-  for (int i = 0; i < D1; ++i) {
-    for (int j = 0; j < D2; ++j) {
-      for (int k = 0; k < D3; ++k) {
-        std::cout << gradA[i][j][k] << " ";
-      }
-      std::cout << "\n";
-    }
-    std::cout << "\n";
-  }
-
-  std::cout << "Result C_grad w.r.t. B:\n";
-  for (int i = 0; i < D3; ++i) {
-    for (int j = 0; j < D4; ++j) {
-      for (int k = 0; k < D5; ++k) {
-        std::cout << gradB[i][j][k] << " ";
-      }
-      std::cout << "\n";
-    }
-    std::cout << "\n";
-  }
-
-  return 0;
-}
-
-// CHECK-EXEC: Result C:
-// CHECK-NEXT: 130 140
-// CHECK-NEXT: 150 160
-// CHECK-NEXT: 170 180
-// CHECK-NEXT:
-// CHECK-NEXT: 290 316
-// CHECK-NEXT: 342 368
-// CHECK-NEXT: 394 420
-// CHECK-NEXT:
-// CHECK-NEXT: 450 492
-// CHECK-NEXT: 534 576
-// CHECK-NEXT: 618 660
-// CHECK-NEXT:
-// CHECK-NEXT:
-// CHECK-NEXT: 610 668
-// CHECK-NEXT: 726 784
-// CHECK-NEXT: 842 900
-// CHECK-NEXT:
-// CHECK-NEXT: 770 844
-// CHECK-NEXT: 918 992
-// CHECK-NEXT: 1066 1140
-// CHECK-NEXT:
-// CHECK-NEXT: 930 1020
-// CHECK-NEXT: 1110 1200
-// CHECK-NEXT: 1290 1380
-
-// CHECK-EXEC: Result C_grad w.r.t. A:
-// CHECK-NEXT: 21 57 93 129
-// CHECK-NEXT: 21 57 93 129
-// CHECK-NEXT: 21 57 93 129
-// CHECK-NEXT:
-// CHECK-NEXT: 21 57 93 129
-// CHECK-NEXT: 21 57 93 129
-// CHECK-NEXT: 21 57 93 129
-// CHECK-NEXT:
-// CHECK-EXEC: Result C_grad w.r.t. B:
-// CHECK-NEXT: 66 66
-// CHECK-NEXT: 66 66
-// CHECK-NEXT: 66 66
-// CHECK-NEXT:
-// CHECK-NEXT: 72 72
-// CHECK-NEXT: 72 72
-// CHECK-NEXT: 72 72
-// CHECK-NEXT:
-// CHECK-NEXT: 78 78
-// CHECK-NEXT: 78 78
-// CHECK-NEXT: 78 78
-// CHECK-NEXT:
-// CHECK-NEXT: 84 84
-// CHECK-NEXT: 84 84
-// CHECK-NEXT: 84 84
\ No newline at end of file