forked from NVIDIA/cuda-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bisect_kernel_large_multi.cuh
223 lines (185 loc) · 8.6 KB
/
bisect_kernel_large_multi.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Perform the second step of the bisection algorithm for large matrices,
 * processing the intervals that, after the first step, still contained
 * more than one eigenvalue.
 */
#ifndef _BISECT_KERNEL_LARGE_MULTI_H_
#define _BISECT_KERNEL_LARGE_MULTI_H_
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// includes, project
#include "config.h"
#include "util.h"
// additional kernel
#include "bisect_util.cu"
////////////////////////////////////////////////////////////////////////////////
//! Perform second step of bisection algorithm for large matrices for
//! intervals that after the first step contained more than one eigenvalue
//! @param g_d diagonal elements of symmetric, tridiagonal matrix
//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix
//! @param n matrix size
//! @param blocks_mult start addresses of blocks of intervals that are
//! processed by one block of threads, each of the
//! intervals contains more than one eigenvalue
//! @param blocks_mult_sum total number of eigenvalues / singleton intervals
//! in one block of intervals
//! @param g_left left limits of intervals
//! @param g_right right limits of intervals
//! @param g_left_count number of eigenvalues less than left limits
//! @param g_right_count number of eigenvalues less than right limits
//! @param g_lambda final eigenvalue
//! @param g_pos index of eigenvalue (in ascending order)
//! @param precision desired precision of eigenvalues
////////////////////////////////////////////////////////////////////////////////
__global__ void bisectKernelLarge_MultIntervals(
    float *g_d, float *g_s, const unsigned int n, unsigned int *blocks_mult,
    unsigned int *blocks_mult_sum, float *g_left, float *g_right,
    unsigned int *g_left_count, unsigned int *g_right_count, float *g_lambda,
    unsigned int *g_pos, float precision) {
  // Second bisection step: each thread block processes one block of
  // multi-eigenvalue intervals (range [blocks_mult[blockIdx.x],
  // blocks_mult[blockIdx.x + 1])), repeatedly subdividing them until every
  // interval has converged to a single eigenvalue within `precision`.
  // Expects a 1D launch with one thread block per interval block;
  // blockDim.x must be at least the maximum number of live intervals
  // (bounded by MAX_THREADS_BLOCK, which sizes the shared arrays).

  // Handle to thread block group (used for all block-wide barriers)
  cg::thread_block cta = cg::this_thread_block();
  const unsigned int tid = threadIdx.x;

  // Left and right limits of each thread's interval. Sized 2x the block
  // because one subdivision step can split every interval into two children
  // before the second chunk is compacted back.
  __shared__ float s_left[2 * MAX_THREADS_BLOCK];
  __shared__ float s_right[2 * MAX_THREADS_BLOCK];

  // Number of eigenvalues smaller than the corresponding interval limits
  __shared__ unsigned int s_left_count[2 * MAX_THREADS_BLOCK];
  __shared__ unsigned int s_right_count[2 * MAX_THREADS_BLOCK];

  // Helper array for stream compaction of the second chunk of child
  // intervals (+1 slot so an exclusive scan has room for the leading zero)
  __shared__ unsigned int s_compaction_list[2 * MAX_THREADS_BLOCK + 1];

  // Alias shifted by one element; writing flags here and keeping
  // s_compaction_list[0] == 0 turns an inclusive scan into an exclusive one
  unsigned int *s_compaction_list_exc = s_compaction_list + 1;

  // Set to 1 before each subdivision pass; any still-active thread clears it
  // (inside subdivideActiveInterval), so 1 afterwards means "all done"
  __shared__ unsigned int all_threads_converged;

  // Number of intervals (== threads with work) currently alive in this block
  __shared__ unsigned int num_threads_active;

  // Thread count used for the compaction scan; kept at the next power of two
  // >= num_threads_active so the scan's tree traversal is well-formed
  __shared__ unsigned int num_threads_compaction;

  // Nonzero when at least one thread produced a second child interval this
  // iteration, i.e. the second chunk needs compaction
  __shared__ unsigned int compact_second_chunk;

  // Parameters of the block of intervals processed by this block of threads
  __shared__ unsigned int c_block_start;
  __shared__ unsigned int c_block_end;
  __shared__ unsigned int c_block_offset_output;

  // Midpoint of the currently active interval of this thread
  float mid = 0.0f;

  // Number of eigenvalues smaller than \a mid
  unsigned int mid_count = 0;

  // Current interval parameters (per-thread registers, reloaded from shared
  // memory by subdivideActiveInterval each iteration)
  float left;
  float right;
  unsigned int left_count;
  unsigned int right_count;

  // Helper for compaction: records whether this thread generated a second
  // child interval in the current iteration
  unsigned int is_active_second = 0;

  // Initialize common start conditions; single-writer, published to the rest
  // of the block by the barrier below
  if (0 == tid) {
    c_block_start = blocks_mult[blockIdx.x];
    c_block_end = blocks_mult[blockIdx.x + 1];
    c_block_offset_output = blocks_mult_sum[blockIdx.x];

    num_threads_active = c_block_end - c_block_start;
    s_compaction_list[0] = 0;  // leading zero for the exclusive scan
    num_threads_compaction = ceilPow2(num_threads_active);

    all_threads_converged = 1;
    compact_second_chunk = 0;
  }
  cg::sync(cta);

  // Read this block's intervals from global into shared memory, one interval
  // per thread
  if (tid < num_threads_active) {
    s_left[tid] = g_left[c_block_start + tid];
    s_right[tid] = g_right[c_block_start + tid];
    s_left_count[tid] = g_left_count[c_block_start + tid];
    s_right_count[tid] = g_right_count[c_block_start + tid];
  }
  cg::sync(cta);

  // Iterate until all intervals in the block have converged
  while (true) {
    // for (int iter=0; iter < 0; iter++) {

    // Subdivide interval if currently active and not already converged;
    // loads left/right/counts into registers and computes mid. A thread
    // with unconverged work clears all_threads_converged.
    subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
                            num_threads_active, left, right, left_count,
                            right_count, mid, all_threads_converged);
    cg::sync(cta);

    // Stop if all eigenvalues have been found
    if (1 == all_threads_converged) {
      break;
    }

    // Compute number of eigenvalues smaller than mid for active and not
    // converged intervals; all threads help load the matrix from gmem, and
    // s_left / s_right serve as scratch space for that load (their interval
    // contents were already copied into registers above)
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, mid, tid,
                                                num_threads_active, s_left,
                                                s_right, (left == right), cta);
    cg::sync(cta);

    if (tid < num_threads_active) {
      // Store child intervals back to shared memory. (left == right) is used
      // throughout as the "already converged" marker for this interval.
      if (left != right) {
        // Unconverged: store the non-empty halves; a thread with two live
        // children flags compact_second_chunk and marks itself in the
        // compaction list
        storeNonEmptyIntervals(tid, num_threads_active, s_left, s_right,
                               s_left_count, s_right_count, left, mid, right,
                               left_count, mid_count, right_count, precision,
                               compact_second_chunk, s_compaction_list_exc,
                               is_active_second);
      } else {
        // Converged interval: carried along unchanged
        storeIntervalConverged(
            s_left, s_right, s_left_count, s_right_count, left, mid, right,
            left_count, mid_count, right_count, s_compaction_list_exc,
            compact_second_chunk, num_threads_active, is_active_second);
      }
    }
    cg::sync(cta);

    // Compact the second chunk of intervals if any thread generated two
    // child intervals: an exclusive scan over the flags yields dense target
    // indices, then the second children are moved into place
    if (1 == compact_second_chunk) {
      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction,
                              cta);

      compactIntervals(s_left, s_right, s_left_count, s_right_count, mid, right,
                       mid_count, right_count, s_compaction_list,
                       num_threads_active, is_active_second);
    }
    cg::sync(cta);

    // Update state variables for the next iteration (single writer)
    if (0 == tid) {
      // s_compaction_list[num_threads_active] is the scan total == number of
      // second children appended this iteration
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);

      compact_second_chunk = 0;
      all_threads_converged = 1;
    }
    cg::sync(cta);

    // Clear the compaction flags for the next iteration; each thread zeroes
    // two slots, covering all 2 * blockDim.x entries
    s_compaction_list_exc[threadIdx.x] = 0;
    s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;
    cg::sync(cta);
  }  // end until all threads converged

  // Write results back to global memory. For a converged interval s_left[tid]
  // holds the eigenvalue; s_right_count[tid] is its position (presumably the
  // eigenvalue's index in ascending order, per the header comment — the
  // helper that sets it is defined in bisect_util.cu).
  if (tid < num_threads_active) {
    unsigned int addr = c_block_offset_output + tid;

    g_lambda[addr] = s_left[tid];
    g_pos[addr] = s_right_count[tid];
  }
}
#endif // #ifndef _BISECT_KERNEL_LARGE_MULTI_H_