-
Notifications
You must be signed in to change notification settings - Fork 1
/
mat_nn_kokkos.hpp
92 lines (76 loc) · 3.54 KB
/
mat_nn_kokkos.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
// Kokkos implementation
#include <Kokkos_Core.hpp>
// Threads assigned to one lattice site. The kernel in this file splits a flat
// thread id by 36 into a link index (0..3) and a row/column pair of a 3x3
// matrix (4 * 3 * 3 = 36); it is also the default team size in su3_mat_nn.
#define THREADS_PER_SITE 36
// NOTE(review): not referenced in the visible part of this file - confirm
// whether NUM_TEAMS is still used elsewhere before relying on it.
#define NUM_TEAMS 1600
// Execution spaces: Kokkos' default (device) space and its default host space.
using ExecSpace = Kokkos::DefaultExecutionSpace;
using HostExecSpace = Kokkos::DefaultHostExecutionSpace;
// Rank-1 views tied to ExecSpace; used below for the d_a / d_b / d_c copies
// that the kernel reads and writes. `site` and `su3_matrix` are declared
// outside this file section.
using d_site_view = Kokkos::View<site *, ExecSpace>;
using d_su3_matrix_view = Kokkos::View<su3_matrix *, ExecSpace>;
// Rank-1 views tied to HostExecSpace; the host-side arguments of su3_mat_nn.
using h_site_view = Kokkos::View<site *, HostExecSpace>;
using h_su3_matrix_view = Kokkos::View<su3_matrix *, HostExecSpace>;
//
//******************* m_mat_nn.c (in su3.a) ****************************
// void mult_su3_nn( su3_matrix *a,*b,*c )
// matrix multiply, no adjoints
// C <- A*B
double k_mat_nn(size_t iterations, d_site_view a, d_su3_matrix_view b,
d_site_view c, int total_sites, int blocksPerGrid,
int threadsPerBlock, Profile* profile) {
using team_policy =
Kokkos::TeamPolicy<ExecSpace,
Kokkos::IndexType<size_t>>;
using member_type = team_policy::member_type;
team_policy policy(blocksPerGrid, threadsPerBlock);
Kokkos::Timer start;
auto tprofiling = Clock::now();
for (size_t iters = 0; iters < iterations + warmups; ++iters) {
if (iters == warmups) {
Kokkos::fence();
start.reset();
tprofiling = Clock::now();
}
Kokkos::parallel_for(
"k_mat_nn", policy, KOKKOS_LAMBDA(const member_type &team) {
int myThread =
team.team_size() * team.league_rank() + team.team_rank();
int mySite = myThread / 36;
if (mySite < total_sites) {
int j = (myThread % 36) / 9;
int k = (myThread % 9) / 3;
int l = myThread % 3;
Complx cc = {0.0, 0.0};
for (int m = 0; m < 3; m++)
cc += a(mySite).link[j].e[k][m] * b(j).e[m][l];
c(mySite).link[j].e[k][l] = cc;
}
});
Kokkos::fence();
}
profile->kernel_time = (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now()-tprofiling).count())/1.0e6;
return (start.seconds());
}
// su3_mat_nn: host-side driver for the Kokkos SU(3) matrix multiply.
//
// Copies the host views `a` and `b` into freshly allocated device views,
// runs k_mat_nn, and copies the device result back into `c`.
//
// Parameters:
//   a, b            - host inputs; `b` is copied into a device view of
//                     extent 4
//   c               - host output, overwritten with the device result
//   total_sites     - extent of the device copies of `a` and `c`, and the
//                     site count handed to the kernel
//   iterations      - forwarded to k_mat_nn
//   threadsPerBlock - team size; 0 selects THREADS_PER_SITE
//   use_device      - only printed here (when verbose >= 1)
//   profile         - host_to_device_time and device_to_host_time are set
//                     here (seconds, microsecond resolution); k_mat_nn also
//                     receives this pointer
// Returns the value returned by k_mat_nn.
double su3_mat_nn(h_site_view &a, h_su3_matrix_view &b, h_site_view &c,
                  size_t total_sites, size_t iterations, size_t threadsPerBlock,
                  int use_device, Profile* profile) {
  if (threadsPerBlock == 0) threadsPerBlock = THREADS_PER_SITE;

  // Number of teams needed to cover total_sites * THREADS_PER_SITE threads,
  // rounded up. Exact integer ceiling division; no floating-point fudge term.
  const size_t threads_needed = total_sites * THREADS_PER_SITE;
  const int blocksPerGrid = static_cast<int>(
      (threads_needed + threadsPerBlock - 1) / threadsPerBlock);

  // `verbose` is declared outside this function.
  if (verbose >= 1) {
    printf("Number of blocks set to %d\n", blocksPerGrid);
    printf("Threads per block set to %zu\n", threadsPerBlock);
    printf("Device number set to %d\n", use_device);
  }

  // Host -> device: allocate the device views (contents left uninitialized)
  // and upload the two inputs. The allocations are part of the timed span.
  auto tprofiling = Clock::now();
  d_site_view d_a(Kokkos::ViewAllocateWithoutInitializing("d_a"), total_sites);
  d_site_view d_c(Kokkos::ViewAllocateWithoutInitializing("d_c"), total_sites);
  d_su3_matrix_view d_b(Kokkos::ViewAllocateWithoutInitializing("d_b"), 4);
  Kokkos::deep_copy(d_a, a);
  Kokkos::deep_copy(d_b, b);
  profile->host_to_device_time =
      (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() -
                                                             tprofiling)
           .count()) /
      1.0e6;

  // k_mat_nn takes int arguments; the narrowing is made explicit here.
  double ttotal = k_mat_nn(iterations, d_a, d_b, d_c,
                           static_cast<int>(total_sites), blocksPerGrid,
                           static_cast<int>(threadsPerBlock), profile);

  // Device -> host: fetch the result.
  tprofiling = Clock::now();
  Kokkos::deep_copy(c, d_c);
  profile->device_to_host_time =
      (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() -
                                                             tprofiling)
           .count()) /
      1.0e6;
  return ttotal;
}