-
Notifications
You must be signed in to change notification settings - Fork 0
/
03_cuda_to_mpi.c
83 lines (69 loc) · 2.4 KB
/
03_cuda_to_mpi.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
// clang-format off
// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
// clang-format on
// CHECK-DAG: data race
// CHECK-DAG: [Error] sync
// CHECK-SYNC-NOT: data race
// CHECK-SYNC-NOT: [Error] sync
#include "../support/gpu_mpi.h"
// Fills arr[i] = i + 1, one element per thread (1-D launch).
// On SM70+ each thread first spins in a __nanosleep delay loop proportional
// to its index, keeping the kernel in flight long enough for the host-side
// race in main() to be observable. Requires SM70+ for __nanosleep; on older
// architectures it only prints a marker instead of delaying.
__global__ void kernel(int* arr, const int N) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= N) {
    return;  // grid tail guard
  }
#if __CUDA_ARCH__ >= 700
  // Per-thread delay: thread idx sleeps idx times.
  int remaining = idx;
  while (remaining-- > 0) {
    __nanosleep(1000000U);
  }
#else
  printf(">>> __CUDA_ARCH__ !\n");
#endif
  arr[idx] = idx + 1;
}
// Test driver for the cusan data-race check (2 MPI ranks required):
// rank 0 launches `kernel` to fill d_data on the device and then hands the
// device buffer straight to a CUDA-aware MPI_Send. Unless CUSAN_SYNC is
// defined there is no cudaDeviceSynchronize between launch and send, so the
// send reads d_data while the kernel may still be writing it — the
// intentional race the tool is expected to report (see CHECK lines above).
// Rank 1 receives the buffer and checks for unwritten (zero) elements.
int main(int argc, char* argv[]) {
// Device pointers are passed directly to MPI, so CUDA-aware MPI is mandatory.
if (!has_gpu_aware_mpi()) {
printf("This example is designed for CUDA-aware MPI. Exiting.\n");
return 1;
}
const int size = 256;
// One block of `size` threads (blocksPerGrid evaluates to 1 here).
const int threadsPerBlock = size;
const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
MPI_Init(&argc, &argv);
int world_size, world_rank;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
// The sender/receiver pairing below assumes exactly ranks 0 and 1.
if (world_size != 2) {
printf("This example is designed for 2 MPI processes. Exiting.\n");
MPI_Finalize();
return 1;
}
int* d_data;
// NOTE(review): cudaMalloc result is unchecked and d_data is not zeroed;
// rank 1's zero-check presumably relies on fresh allocations reading as 0 —
// acceptable for a sanitizer fixture, not for production code.
cudaMalloc(&d_data, size * sizeof(int));
if (world_rank == 0) {
// Kernel launch is asynchronous; the writes to d_data race with the
// MPI_Send below unless the guarded sync is compiled in.
kernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, size);
#ifdef CUSAN_SYNC
cudaDeviceSynchronize(); // compile with -DCUSAN_SYNC to close the race
#endif
MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD);
} else if (world_rank == 1) {
MPI_Recv(d_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
if (world_rank == 1) {
// Copy the received buffer to the host (blocking cudaMemcpy also
// synchronizes) and verify every element was written by the kernel.
int* h_data = (int*)malloc(size * sizeof(int));
cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < size; i++) {
const int buf_v = h_data[i];
// kernel writes tid+1, so a 0 means the element was sent before the
// kernel wrote it — the observable symptom of the missing sync.
if (buf_v == 0) {
printf("[Error] sync\n");
break;
}
// printf("buf[%d] = %d (r%d)\n", i, buf_v, world_rank);
}
free(h_data);
}
cudaFree(d_data);
MPI_Finalize();
return 0;
}