-
Notifications
You must be signed in to change notification settings - Fork 0
/
maxpool_kernel.cu
134 lines (98 loc) · 4 KB
/
maxpool_kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include <iostream>
#include <random>
#include <chrono>
#include <algorithm>
#include <cmath>
#define timestamp(__var__) auto __var__ = std::chrono::system_clock::now();
inline double getDuration(std::chrono::time_point<std::chrono::system_clock> a,
std::chrono::time_point<std::chrono::system_clock> b)
{
return std::chrono::duration<double>(b - a).count();
}
using namespace std;
const int WARPS_PER_BLOCK = 16;
const int N = 232960 >> 8 << 8;
//const int N = 716847 >> 8 << 8;
//const int N = 2449029 >> 8 << 8;
//const int N = 16;
const int dim_in = 256, dim_out = 64;
__global__ void maxpool(float *, float *, unsigned int *);
int main() {
cout << "Max Pooling kernel" << endl;
cout << "N = "<< N << ", dim_in = " << dim_in << ", dim_out = " << dim_out << ", preparing data..." << endl;
float *data, *value;
unsigned int *indices;
cudaMallocManaged(&data, N * dim_in * sizeof(float));
cudaMallocManaged(&value, N * dim_out * sizeof(float));
cudaMallocManaged(&indices, N * dim_out * sizeof(unsigned int));
default_random_engine engine;
engine.seed(123);
uniform_real_distribution<float> rd(0, 1);
generate(data, data + N * dim_in, [&](){ return rd(engine); });
cout << "data ready, testing..." << endl;
unsigned int shared_mem_size = WARPS_PER_BLOCK * dim_in * sizeof(float);
cout<<"Config GridDim = "<< N / WARPS_PER_BLOCK << ", BlockDim = " << WARPS_PER_BLOCK * 32 << ", shared_mem_size = " << shared_mem_size << endl;
int times = 100;
for (int i = 0; i < times; i++) {
maxpool <<< N / WARPS_PER_BLOCK, WARPS_PER_BLOCK * 32, shared_mem_size >>> (data, value, indices);
}
cudaDeviceSynchronize();
double measured_time = 0;
for (int i = 0; i < times; i++) {
timestamp(t0);
maxpool <<< N / WARPS_PER_BLOCK, WARPS_PER_BLOCK * 32, shared_mem_size >>> (data, value, indices);
cudaDeviceSynchronize();
timestamp(t1);
measured_time += getDuration(t0, t1);
}
cout << "max-pooling time = " << measured_time / times * 1000 << " ms" <<endl;
// for (int i = 0; i < 64; i += 1) {
// cout << "value[" << i << "] = " << *(value + i) << endl;
// }
//
// for (int i = 0; i < 64; i += 1) {
// cout << "indices[" << i << "] = " << *(indices + i) << endl;
// }
cudaFree(data);
cudaFree(value);
cudaFree(indices);
return 0;
}
__global__ void maxpool(float *data, float *value, unsigned int *indices) {
extern __shared__ float buffer[];
const int warp_id = threadIdx.x / 32;
const int local_tid = threadIdx.x % 32;
const int warp_offset = WARPS_PER_BLOCK * dim_in;
const int feature_per_warp = dim_in / 32;
const int vertex_offset = warp_id * dim_in;
const int sqrt_dim_in = 16;
#pragma unroll
for (unsigned int i = 0; i < feature_per_warp; i += 1) {
buffer[warp_id * dim_in + feature_per_warp * local_tid + i] = data[blockIdx.x * warp_offset + warp_id * dim_in + feature_per_warp * local_tid + i];
}
__syncwarp();
int xx = local_tid / 4 * 2;
int yy = local_tid % 4 * 4;
unsigned int pos;
float v;
#pragma unroll
for (unsigned int i = 0; i < 2; i += 1) {
yy += 2 * i;
pos = xx * sqrt_dim_in + yy;
v = buffer[vertex_offset + pos];
if (buffer[vertex_offset + (xx + 1) * sqrt_dim_in + yy] > v) {
pos = (xx + 1) * sqrt_dim_in + yy;
v = buffer[vertex_offset + pos];
}
if (buffer[vertex_offset + xx * sqrt_dim_in + yy + 1] > v) {
pos = xx * sqrt_dim_in + yy + 1;
v = buffer[vertex_offset + pos];
}
if (buffer[vertex_offset + (xx + 1) * sqrt_dim_in + yy + 1] > v) {
pos = (xx + 1) * sqrt_dim_in + yy + 1;
v = buffer[vertex_offset + pos];
}
value[blockIdx.x * WARPS_PER_BLOCK * dim_out + warp_id * dim_out + 2 * local_tid + i] = v;
indices[blockIdx.x * WARPS_PER_BLOCK * dim_out + warp_id * dim_out + 2 * local_tid + i] = pos;
}
}