This repository has been archived by the owner on Mar 3, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 57
/
WarpBitonicSort.cuh
74 lines (62 loc) · 2.33 KB
/
WarpBitonicSort.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "cuda/Comparators.cuh"
#include "cuda/CudaUtils.cuh"
#include "cuda/ShuffleTypes.cuh"
namespace facebook { namespace cuda {
namespace detail {
template <typename T, typename Comparator>
__device__ __forceinline__ T shflSwap(const T x, int mask, int dir) {
T y = shfl_xor(x, mask);
return Comparator::compare(x, y) == dir ? y : x;
}
} // namespace
/// Defines a bitonic sort network to exchange 'V' according to
/// `SWAP()`'s compare and exchange mechanism across the warp, ordered
/// according to the comparator `comp`. In other words, if `comp` is
/// `GreaterThan<T>`, then lane 0 will contain the highest `val`
/// presented across the warp
///
/// See also
/// http://on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
template <typename T, typename Comparator>
__device__ T warpBitonicSort(T val) {
const int laneId = getLaneId();
// 2
val = detail::shflSwap<T, Comparator>(
val, 0x01, getBit(laneId, 1) ^ getBit(laneId, 0));
// 4
val = detail::shflSwap<T, Comparator>(
val, 0x02, getBit(laneId, 2) ^ getBit(laneId, 1));
val = detail::shflSwap<T, Comparator>(
val, 0x01, getBit(laneId, 2) ^ getBit(laneId, 0));
// 8
val = detail::shflSwap<T, Comparator>(
val, 0x04, getBit(laneId, 3) ^ getBit(laneId, 2));
val = detail::shflSwap<T, Comparator>(
val, 0x02, getBit(laneId, 3) ^ getBit(laneId, 1));
val = detail::shflSwap<T, Comparator>(
val, 0x01, getBit(laneId, 3) ^ getBit(laneId, 0));
// 16
val = detail::shflSwap<T, Comparator>(
val, 0x08, getBit(laneId, 4) ^ getBit(laneId, 3));
val = detail::shflSwap<T, Comparator>(
val, 0x04, getBit(laneId, 4) ^ getBit(laneId, 2));
val = detail::shflSwap<T, Comparator>(
val, 0x02, getBit(laneId, 4) ^ getBit(laneId, 1));
val = detail::shflSwap<T, Comparator>(
val, 0x01, getBit(laneId, 4) ^ getBit(laneId, 0));
// 32
val = detail::shflSwap<T, Comparator>(
val, 0x10, getBit(laneId, 4));
val = detail::shflSwap<T, Comparator>(
val, 0x08, getBit(laneId, 3));
val = detail::shflSwap<T, Comparator>(
val, 0x04, getBit(laneId, 2));
val = detail::shflSwap<T, Comparator>(
val, 0x02, getBit(laneId, 1));
val = detail::shflSwap<T, Comparator>(
val, 0x01, getBit(laneId, 0));
return val;
}
} } // namespace