forked from RapidsBlink/SIMDVectorizationOpt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fastavxbase64.c
191 lines (157 loc) · 7.04 KB
/
fastavxbase64.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#include "fastavxbase64.h"
#include <x86intrin.h>
#include <stdbool.h>
/**
* This code borrows from Wojciech Mula's library at
* https://github.com/WojciechMula/base64simd (published under BSD)
* as well as code from Alfred Klomp's library https://github.com/aklomp/base64 (published under BSD)
*
*/
/**
* Note : Hardware such as Knights Landing might do poorly with this AVX2 code since it relies on shuffles. Alternatives might be faster.
*/
static inline __m256i enc_reshuffle(const __m256i input) {
// translation from SSE into AVX2 of procedure
// https://github.com/WojciechMula/base64simd/blob/master/encode/unpack_bigendian.cpp
const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
10, 11, 9, 10,
7, 8, 6, 7,
4, 5, 3, 4,
1, 2, 0, 1,
14, 15, 13, 14,
11, 12, 10, 11,
8, 9, 7, 8,
5, 6, 4, 5
));
const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
return _mm256_or_si256(t1, t3);
}
static inline __m256i enc_translate(const __m256i in) {
const __m256i lut = _mm256_setr_epi8(
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0, 65, 71,
-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
__m256i mask = _mm256_cmpgt_epi8((in), _mm256_set1_epi8(25));
indices = _mm256_sub_epi8(indices, mask);
__m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
return out;
}
static inline __m256i dec_reshuffle(__m256i in) {
// inlined procedure pack_madd from https://github.com/WojciechMula/base64simd/blob/master/decode/pack.avx2.cpp
// The only difference is that elements are reversed,
// only the multiplication constants were changed.
const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140)); //_mm256_maddubs_epi16 is likely expensive
__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
// end of inlined
// Pack bytes together within 32-bit words, discarding words 3 and 7:
out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1
));
// the call to _mm256_permutevar8x32_epi32 could be replaced by a call to _mm256_storeu2_m128i but it is doubtful that it would help
return _mm256_permutevar8x32_epi32(
out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
}
size_t fast_avx2_base64_encode(char* dest, const char* str, size_t len) {
const char* const dest_orig = dest;
if(len >= 32 - 4) {
// first load is masked
__m256i inputvector = _mm256_maskload_epi32((int const*)(str - 4), _mm256_set_epi32(
0x80000000,
0x80000000,
0x80000000,
0x80000000,
0x80000000,
0x80000000,
0x80000000,
0x00000000 // we do not load the first 4 bytes
));
//////////
// Intel docs: Faults occur only due to mask-bit required memory accesses that caused the faults.
// Faults will not occur due to referencing any memory location if the corresponding mask bit for
//that memory location is 0. For example, no faults will be detected if the mask bits are all zero.
////////////
while(true) {
inputvector = enc_reshuffle(inputvector);
inputvector = enc_translate(inputvector);
_mm256_storeu_si256((__m256i *)dest, inputvector);
str += 24;
dest += 32;
len -= 24;
if(len >= 32) {
inputvector = _mm256_loadu_si256((__m256i *)(str - 4)); // no need for a mask here
// we could do a mask load as long as len >= 24
} else {
break;
}
}
}
size_t scalarret = chromium_base64_encode(dest, str, len);
if(scalarret == MODP_B64_ERROR) return MODP_B64_ERROR;
return (dest - dest_orig) + scalarret;
}
size_t fast_avx2_base64_decode(char *out, const char *src, size_t srclen) {
char* out_orig = out;
while (srclen >= 45) {
// The input consists of six character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
// # From To Add Characters
// 1 [43] [62] +19 +
// 2 [47] [63] +16 /
// 3 [48..57] [52..61] +4 0..9
// 4 [65..90] [0..25] -65 A..Z
// 5 [97..122] [26..51] -71 a..z
// (6) Everything else => invalid input
__m256i str = _mm256_loadu_si256((__m256i *)src);
// code by @aqrit from
// https://github.com/WojciechMula/base64simd/issues/3#issuecomment-271137490
// transated into AVX2
const __m256i lut_lo = _mm256_setr_epi8(
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
);
const __m256i lut_hi = _mm256_setr_epi8(
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
);
const __m256i lut_roll = _mm256_setr_epi8(
0, 16, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0,
0, 16, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0
);
const __m256i mask_2F = _mm256_set1_epi8(0x2f);
// lookup
__m256i hi_nibbles = _mm256_srli_epi32(str, 4);
__m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
hi_nibbles = _mm256_and_si256(hi_nibbles, mask_2F);
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
if (!_mm256_testz_si256(lo, hi)) {
break;
}
str = _mm256_add_epi8(str, roll);
// end of copied function
srclen -= 32;
src += 32;
// end of inlined function
// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);
_mm256_storeu_si256((__m256i *)out, str);
out += 24;
}
size_t scalarret = chromium_base64_decode(out, src, srclen);
if(scalarret == MODP_B64_ERROR) return MODP_B64_ERROR;
return (out - out_orig) + scalarret;
}