-
Notifications
You must be signed in to change notification settings - Fork 1
/
vec_detail.h
634 lines (565 loc) · 23.5 KB
/
vec_detail.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
/* Copyright © 2023-2024 GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
* Matthias Kretz <[email protected]>
*/
#ifndef PROTOTYPE_VEC_DETAIL_H_
#define PROTOTYPE_VEC_DETAIL_H_
#include "simd_config.h"
#include "simd_meta.h"
#include "constexpr_wrapper.h"
#include <cstdint>
namespace std::__detail
{
///////////////////////////////////////////////////////////////////////////////////////////////
/////////////// tools for working with gnu::vector_size types (vector builtins) ///////////////
///////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Alias for a vector builtin with given value type and total sizeof.
 *
 * \tparam _Tp     element type of the vector builtin
 * \tparam _Bytes  sizeof of the resulting vector type in bytes; it is forwarded to the
 *                 vector_size attribute and must satisfy __has_single_bit (presumably: be a
 *                 power of two — the helper is defined outside this file)
 */
template <__vectorizable _Tp, size_t _Bytes>
requires (__has_single_bit(_Bytes))
using __vec_builtin_type_bytes [[__gnu__::__vector_size__(_Bytes)]] = _Tp;
/**
 * Alias for a vector builtin with given value type \p _Tp and \p _Width.
 *
 * \p _Width counts elements, not bytes: the alias resolves to
 * __vec_builtin_type_bytes<_Tp, sizeof(_Tp) * _Width>. Use __vec_builtin_type_bytes directly
 * when the size is known in bytes.
 */
template <__vectorizable _Tp, _SimdSizeType _Width>
requires (__has_single_bit(_Width))
using __vec_builtin_type = __vec_builtin_type_bytes<_Tp, sizeof(_Tp) * _Width>;
/**
 * Constrain to any vector builtin with given value type and optional width.
 *
 * The concept holds if all of the following hold (checked in this order):
 *  - \p _Tp is not an arithmetic (scalar) type and \p _ValueType is vectorizable,
 *  - \p _Width is at least 1 and equals sizeof(_Tp) / sizeof(_ValueType),
 *  - \p _Tp is exactly the vector builtin of \p _ValueType with sizeof(_Tp) bytes,
 *  - an element of a \p _Tp lvalue can be assigned from a \p _ValueType via subscripting.
 *
 * If \p _Width is not given it defaults to sizeof(_Tp) / sizeof(_ValueType), i.e. the width
 * condition is then trivially true.
 */
template <typename _Tp, typename _ValueType,
_SimdSizeType _Width = sizeof(_Tp) / sizeof(_ValueType)>
concept __vec_builtin_of
= not __arithmetic<_Tp> and __vectorizable<_ValueType>
and _Width >= 1 and sizeof(_Tp) / sizeof(_ValueType) == _Width
and same_as<__vec_builtin_type_bytes<_ValueType, sizeof(_Tp)>, _Tp>
and requires(_Tp& __v, _ValueType __x) { __v[0] = __x; };
// Sanity checks: matching value type passes, mismatching value type and scalars fail.
static_assert( __vec_builtin_of<__vec_builtin_type<int, 4>, int>);
static_assert(not __vec_builtin_of<__vec_builtin_type<int, 4>, char>);
static_assert(not __vec_builtin_of<int, int>);
/**
 * Constrain to any vector builtin.
 *
 * Class types are rejected up front. For everything else the value type is deduced from the
 * type of subscripting a const object (`__x[0]`, cv-ref stripped) and the check is delegated to
 * __vec_builtin_of with that value type.
 */
template <typename _Tp>
concept __vec_builtin
= not std::is_class_v<_Tp> and requires(const _Tp& __x) {
requires __vec_builtin_of<_Tp, remove_cvref_t<decltype(__x[0])>>;
};
// Sanity checks: scalars are not vector builtins, gnu::vector_size types are.
static_assert(not __vec_builtin<int>);
static_assert( __vec_builtin<__vec_builtin_type<int, 4>>);
// Primary template of the trait behind __value_type_of. It is only declared here; the
// constrained partial specializations that follow in this file provide the `type` member.
template <typename _Tp>
struct __value_type_of_impl;
// Satisfied by any type that declares a nested `value_type`.
template <typename _Tp>
concept __has_value_type_member = requires { typename _Tp::value_type; };
/**
 * Alias for the value type of the given type \p _Tp.
 *
 * Accepted are vector builtins, arithmetic types, and types with a `value_type` member; the
 * result is looked up via __value_type_of_impl<_Tp>::type.
 */
template <typename _Tp>
requires __vec_builtin<_Tp> or __arithmetic<_Tp> or __has_value_type_member<_Tp>
using __value_type_of = typename __value_type_of_impl<_Tp>::type;
// Vector builtins: the value type is the cv-ref stripped type of subscripting a const vector.
template <__vec_builtin _Tp>
struct __value_type_of_impl<_Tp>
{ using type = remove_cvref_t<decltype(std::declval<const _Tp>()[0])>; };
// Arithmetic types are their own value type.
template <__arithmetic _Tp>
struct __value_type_of_impl<_Tp>
{ using type = _Tp; };
// Types with a nested value_type simply forward it.
template <__has_value_type_member _Tp>
struct __value_type_of_impl<_Tp>
{ using type = typename _Tp::value_type; };
/**
 * The width (number of value_type elements) of the given vector builtin or arithmetic type.
 *
 * Computed as sizeof(_Tp) / sizeof(__value_type_of<_Tp>); for arithmetic types the value type
 * is the type itself, so the width is 1.
 */
template <typename _Tp>
requires __vec_builtin<_Tp> or __arithmetic<_Tp>
inline constexpr _SimdSizeType __width_of = sizeof(_Tp) / sizeof(__value_type_of<_Tp>);
/**
 * Alias for a vector builtin with value type \p _Up and equal width as \p _TV.
 *
 * Only the element count is preserved; sizeof of the result differs from sizeof(_TV) whenever
 * sizeof(_Up) differs from the size of the value type of \p _TV.
 */
template <__vectorizable _Up, __vec_builtin _TV>
using __rebind_vec_builtin_t = __vec_builtin_type<_Up, __width_of<_TV>>;
/**
 * Alias for a vector builtin with value type of \p _TV transformed using \p _Trait and equal
 * width as \p _TV.
 *
 * \p _Trait is an alias template (or class template) that is applied directly to the value
 * type, i.e. the new element type is `_Trait<__value_type_of<_TV>>` (no nested `::type`).
 */
template <template <typename> class _Trait, __vec_builtin _TV>
using __transform_vec_builtin_t
= __vec_builtin_type<_Trait<__value_type_of<_TV>>, __width_of<_TV>>;
/**
 * Alias for a vector mask type matching the given \p _TV type.
 *
 * The mask type has the same width as \p _TV and uses __make_signed_int_t of the value type as
 * element type (presumably the signed integer of equal size — the trait is defined outside
 * this file).
 */
template <__vec_builtin _TV>
using __mask_vec_from = __transform_vec_builtin_t<__make_signed_int_t, _TV>;
/**
 * Constrain to vector builtins with given value_type sizeof and optionally vector type sizeof.
 *
 * \tparam _ValueTypeSize  required sizeof of the element type
 * \tparam _VecSize        required sizeof of the whole vector; defaults to sizeof(_Tp), which
 *                         makes that condition trivially true
 */
template <typename _Tp, size_t _ValueTypeSize, size_t _VecSize = sizeof(_Tp)>
concept __vec_builtin_sizeof
= __vec_builtin<_Tp> and sizeof(_Tp) == _VecSize
and sizeof(__value_type_of<_Tp>) == _ValueTypeSize;
// Sanity checks: a vector of int matches sizeof(int) elements, a scalar int never matches.
static_assert( __vec_builtin_sizeof<__vec_builtin_type<int, 4>, sizeof(int)>);
static_assert(not __vec_builtin_sizeof<int, sizeof(int)>);
// Convenience aliases for vector builtins of 16, 32, and 64 bytes. The names follow the
// pattern __v<N><type>, where the attribute argument is the total size in bytes and <N> is the
// resulting element count (e.g. __v4float is a 16-byte vector of 4 floats).

// double
using __v2double [[__gnu__::__vector_size__(16)]] = double;
using __v4double [[__gnu__::__vector_size__(32)]] = double;
using __v8double [[__gnu__::__vector_size__(64)]] = double;
// float
using __v4float [[__gnu__::__vector_size__(16)]] = float;
using __v8float [[__gnu__::__vector_size__(32)]] = float;
using __v16float [[__gnu__::__vector_size__(64)]] = float;
// char, signed char, unsigned char
using __v16char [[__gnu__::__vector_size__(16)]] = char;
using __v32char [[__gnu__::__vector_size__(32)]] = char;
using __v64char [[__gnu__::__vector_size__(64)]] = char;
using __v16schar [[__gnu__::__vector_size__(16)]] = signed char;
using __v32schar [[__gnu__::__vector_size__(32)]] = signed char;
using __v64schar [[__gnu__::__vector_size__(64)]] = signed char;
using __v16uchar [[__gnu__::__vector_size__(16)]] = unsigned char;
using __v32uchar [[__gnu__::__vector_size__(32)]] = unsigned char;
using __v64uchar [[__gnu__::__vector_size__(64)]] = unsigned char;
// fixed-width 16-bit integers
using __v8int16 [[__gnu__::__vector_size__(16)]] = int16_t;
using __v16int16 [[__gnu__::__vector_size__(32)]] = int16_t;
using __v32int16 [[__gnu__::__vector_size__(64)]] = int16_t;
using __v8uint16 [[__gnu__::__vector_size__(16)]] = uint16_t;
using __v16uint16 [[__gnu__::__vector_size__(32)]] = uint16_t;
using __v32uint16 [[__gnu__::__vector_size__(64)]] = uint16_t;
// fixed-width 32-bit integers
using __v4int32 [[__gnu__::__vector_size__(16)]] = int32_t;
using __v8int32 [[__gnu__::__vector_size__(32)]] = int32_t;
using __v16int32 [[__gnu__::__vector_size__(64)]] = int32_t;
using __v4uint32 [[__gnu__::__vector_size__(16)]] = uint32_t;
using __v8uint32 [[__gnu__::__vector_size__(32)]] = uint32_t;
using __v16uint32 [[__gnu__::__vector_size__(64)]] = uint32_t;
// fixed-width 64-bit integers
using __v2uint64 [[__gnu__::__vector_size__(16)]] = uint64_t;
using __v4uint64 [[__gnu__::__vector_size__(32)]] = uint64_t;
using __v8uint64 [[__gnu__::__vector_size__(64)]] = uint64_t;
using __v2int64 [[__gnu__::__vector_size__(16)]] = int64_t;
using __v4int64 [[__gnu__::__vector_size__(32)]] = int64_t;
using __v8int64 [[__gnu__::__vector_size__(64)]] = int64_t;
// (unsigned) long long; the element counts in the names assume an 8-byte long long
using __v2llong [[__gnu__::__vector_size__(16)]] = long long;
using __v4llong [[__gnu__::__vector_size__(32)]] = long long;
using __v8llong [[__gnu__::__vector_size__(64)]] = long long;
using __v2ullong [[__gnu__::__vector_size__(16)]] = unsigned long long;
using __v4ullong [[__gnu__::__vector_size__(32)]] = unsigned long long;
using __v8ullong [[__gnu__::__vector_size__(64)]] = unsigned long long;
/**
 * An object of given type where all bits are 1.
 *
 * Built by complementing a value-initialized (all-zero) char vector of sizeof(_V) bytes and
 * bit-casting the result to \p _V, so it works for any element type.
 */
template <__vec_builtin _V>
static inline constexpr _V _S_allbits
= __builtin_bit_cast(_V, ~__vec_builtin_type_bytes<char, sizeof(_V)>());
/**
 * An object of given type where only the sign bits are 1.
 *
 * Only available for floating-point value types. The value is the bitwise xor of a vector of
 * +1 (`_V() + 1`) and a vector of -1 (`_V() - 1`); this leaves only the sign bit set provided
 * +1 and -1 differ in nothing but the sign bit (true for IEEE 754 formats).
 *
 * NOTE(review): `__xor` is not declared in this file (the bitwise helper defined further down
 * is named `__vec_xor`) — confirm that an included header provides `__xor` for vector builtins.
 */
template <__vec_builtin _V>
requires floating_point<__value_type_of<_V>>
static inline constexpr _V _S_signmask = __xor(_V() + 1, _V() - 1);
/**
 * An object of given type where only the sign bits are 0 (complement of _S_signmask).
 *
 * Only available for floating-point value types. Computed from _S_signmask<_V> and
 * _S_allbits<_V> via `__andnot`.
 *
 * NOTE(review): `__andnot` is not declared in this file (the helper defined further down is
 * named `__vec_andnot`) — confirm that an included header provides it and that it computes
 * `~first & second`.
 */
template <__vec_builtin _V>
requires floating_point<__value_type_of<_V>>
static inline constexpr _V _S_absmask = __andnot(_S_signmask<_V>, _S_allbits<_V>);
/**
 * Read element \p __i of the vector builtin \p __v.
 *
 * Exists because Clang rejects `v[i]` in constant expressions: during constant evaluation on
 * Clang the vector is bit-cast to an array and the array is subscripted instead. Everywhere
 * else this is a plain subscript.
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr __value_type_of<_TV>
  __vec_get(_TV __v, int __i)
  {
#ifdef __clang__
    if (__builtin_is_constant_evaluated())
      {
        using _Elements = array<__value_type_of<_TV>, __width_of<_TV>>;
        const _Elements __elements = __builtin_bit_cast(_Elements, __v);
        return __elements[__i];
      }
#endif
    return __v[__i];
  }
/**
 * Assign \p __x to element \p __i of the vector builtin \p __v.
 *
 * Exists because neither Clang nor GCC allow assignment to `v[i]` in constant expressions.
 * At run time this is a plain subscript assignment. During constant evaluation the vector is
 * rebuilt instead: via an array round-trip on Clang, via an element-wise pack expansion
 * otherwise.
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr void
  __vec_set(_TV& __v, int __i, __value_type_of<_TV> __x)
  {
    if (not __builtin_is_constant_evaluated())
      {
        __v[__i] = __x;
        return;
      }
#ifdef __clang__
    using _Elements = array<__value_type_of<_TV>, __width_of<_TV>>;
    _Elements __elements = __builtin_bit_cast(_Elements, __v);
    __elements[__i] = __x;
    __v = __builtin_bit_cast(_TV, __elements);
#else
    // Rebuild the whole vector, replacing only the element at position __i.
    __v = _GLIBCXX_SIMD_INT_PACK(__width_of<_TV>, __k, {
            return _TV{(__i == __k ? __x : __v[__k])...};
          });
#endif
  }
/**
 * Permute the elements of the vector builtin \p __x.
 *
 * \tparam _Indices  one source index per result element, with the semantics of
 *                   __builtin_shufflevector; additionally an index of -1 yields a 0 element.
 *
 * The zero is obtained by shuffling with a value-initialized second operand and redirecting
 * every -1 to that operand's first element (index __width_of<_Tp>).
 */
template <int... _Indices, __vec_builtin _Tp>
  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
  __vec_permute(_Tp __x)
  {
    static_assert(__width_of<_Tp> == sizeof...(_Indices));
    _Tp __zeros = _Tp();
    return __builtin_shufflevector(__x, __zeros,
                                   (_Indices != -1 ? _Indices : __width_of<_Tp>)...);
  }
/**
 * View \p __x as \p _Total equally sized parts and return the part with index \p _Index.
 *
 * \tparam _Index    index of the first part to return
 * \tparam _Total    number of parts \p __x is divided into
 * \tparam _Combine  number of consecutive parts, starting at \p _Index, that make up the
 *                   return value (default: 1)
 *
 * Reading past the end of \p __x is rejected at compile time.
 */
template <int _Index, int _Total, int _Combine = 1, __vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
  __vec_builtin_type<__value_type_of<_TV>, __width_of<_TV> / _Total * _Combine>
  __vec_extract_part(_TV __x)
  {
    using _Tp = __value_type_of<_TV>;
    constexpr int __part_width = __width_of<_TV> / _Total;
    constexpr int __first = _Index * __part_width;
    constexpr int __result_width = _Combine * __part_width;
    static_assert((_Index + _Combine) * __part_width * sizeof(_Tp) <= sizeof(__x),
                  "out of bounds __vec_extract_part");
#ifdef __clang__
    // Element-wise construction through __vec_get instead of a shuffle.
    using _RV = __vec_builtin_type<_Tp, __result_width>;
    return _GLIBCXX_SIMD_INT_PACK(__result_width, _Pos, {
             return _RV{__vec_get(__x, __first + _Pos)...};
           });
#else
    return _GLIBCXX_SIMD_INT_PACK(__result_width, _Pos, {
             return __builtin_shufflevector(__x, __x, (__first + _Pos)...);
           });
#endif
  }
/**
 * Bitmask overload of __vec_extract_part: \p __x is an integer holding one bit per element in
 * its low \p __width bits. The mask is viewed as \p _Total equally sized parts and the bits of
 * part \p _Index (optionally \p _Combine consecutive parts) are returned in the low bits of the
 * result; all higher bits of the result are 0.
 *
 * \tparam _Index    index of the first part to return
 * \tparam _Total    number of parts the mask is divided into
 * \tparam _Combine  number of consecutive parts to return (default: 1)
 * \param  __width   number of valid mask bits in \p __x (default: all bits of \p _Tp)
 * \return the selected bits, as a value of type \p _Tp
 *
 * The previous implementation passed the integer to __builtin_shufflevector (which only accepts
 * vector operands) and compared a bit count against sizeof in its bounds check, so it could not
 * be instantiated. The extraction is a shift and mask.
 */
template <int _Index, int _Total, int _Combine = 1, integral _Tp,
          vir::constexpr_value<int> _Width = decltype(vir::cw<sizeof(_Tp) * __CHAR_BIT__>)>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr integral auto
  __vec_extract_part(_Tp __x, _Width __width = {})
  {
    constexpr int __values_per_part = __width / _Total;
    constexpr int __values_to_skip = _Index * __values_per_part;
    constexpr int __return_size = __values_per_part * _Combine;
    // (_Index + _Combine) parts never exceed _Total parts, hence never exceed __width bits.
    static_assert(_Index >= 0 and _Combine >= 1 and _Index + _Combine <= _Total,
                  "out of bounds __vec_extract_part");
    // Work on the widest unsigned type so that shifting a signed or narrow _Tp is well-defined.
    using _Up = unsigned long long;
    constexpr int __digits = sizeof(_Up) * __CHAR_BIT__;
    static_assert(__values_to_skip + __return_size <= __digits,
                  "bitmask is too wide for __vec_extract_part");
    // Mask with the low __return_size bits set (the modulo only avoids an over-wide shift in
    // the branch that is not selected).
    constexpr _Up __keep = __return_size >= __digits
                             ? ~_Up()
                             : (_Up(1) << (__return_size % __digits)) - 1;
    return static_cast<_Tp>((static_cast<_Up>(__x) >> __values_to_skip) & __keep);
  }
/**
 * Return the lowest 16 bytes (128 bits) of \p __x as a vector builtin with the same value type.
 * \p __x must be at least 32 bytes wide.
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vec_builtin_type_bytes<__value_type_of<_TV>, 16>
  __vec_lo128(_TV __x)
  {
    static_assert(sizeof(_TV) >= 32);
    // number of elements in a 16-byte vector
    constexpr int __count = 16 / sizeof(__value_type_of<_TV>);
    return _GLIBCXX_SIMD_INT_PACK(__count, _Pos, {
             return __builtin_shufflevector(__x, __x, _Pos...);
           });
  }
/**
 * Return the highest 16 bytes (128 bits) of \p __x as a vector builtin with the same value
 * type. \p __x must be at least 32 bytes wide.
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vec_builtin_type_bytes<__value_type_of<_TV>, 16>
  __vec_hi128(_TV __x)
  {
    static_assert(sizeof(_TV) >= 32);
    // number of elements in a 16-byte vector
    constexpr int __count = 16 / sizeof(__value_type_of<_TV>);
    // the result consists of the last __count elements of __x
    return _GLIBCXX_SIMD_INT_PACK(__count, _Pos, {
             return __builtin_shufflevector(__x, __x, (__width_of<_TV> - __count + _Pos)...);
           });
  }
/**
 * Return the lowest 32 bytes (256 bits) of \p __x as a vector builtin with the same value type.
 * \p __x must be at least 64 bytes wide.
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vec_builtin_type_bytes<__value_type_of<_TV>, 32>
  __vec_lo256(_TV __x)
  {
    static_assert(sizeof(_TV) >= 64);
    // number of elements in a 32-byte vector
    constexpr int __count = 32 / sizeof(__value_type_of<_TV>);
    return _GLIBCXX_SIMD_INT_PACK(__count, _Pos, {
             return __builtin_shufflevector(__x, __x, _Pos...);
           });
  }
/**
 * Return the highest 32 bytes (256 bits) of \p __x as a vector builtin with the same value
 * type. \p __x must be at least 64 bytes wide.
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vec_builtin_type_bytes<__value_type_of<_TV>, 32>
  __vec_hi256(_TV __x)
  {
    static_assert(sizeof(_TV) >= 64);
    // number of elements in a 32-byte vector
    constexpr int __count = 32 / sizeof(__value_type_of<_TV>);
    // the result consists of the last __count elements of __x
    return _GLIBCXX_SIMD_INT_PACK(__count, _Pos, {
             return __builtin_shufflevector(__x, __x, (__width_of<_TV> - __count + _Pos)...);
           });
  }
/**
 * Concatenate two vector builtins of equal type: the result has twice the width and holds the
 * elements of \p __a followed by the elements of \p __b.
 */
template <__vec_builtin _Tp>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __vec_concat(_Tp __a, _Tp __b)
  {
#ifdef __clang__
    // Element-wise construction through __vec_get instead of a shuffle.
    constexpr int __half = __width_of<_Tp>;
    using _RV = __vec_builtin_type<__value_type_of<_Tp>, __half * 2>;
    return _GLIBCXX_SIMD_INT_PACK(__half * 2, _Pos, {
             return _RV{(_Pos < __half ? __vec_get(__a, _Pos)
                                       : __vec_get(__b, _Pos - __half))...};
           });
#else
    // Shuffle indices 0..2N-1 address __a first and __b second.
    return _GLIBCXX_SIMD_INT_PACK(__width_of<_Tp> * 2, _Pos, {
             return __builtin_shufflevector(__a, __b, _Pos...);
           });
#endif
  }
/**
 * Helper for the variadic __vec_concat: end of recursion for a single leftover vector.
 * The missing partner is replaced by a value-initialized (all-zero) vector, so the result is
 * \p __a followed by zeros. \p _Offset must be 0 at this point.
 */
template <int _Offset, __vec_builtin _Tp>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __vec_concat_from_pack(_Tp __a)
  {
    static_assert(_Offset == 0);
    _Tp __zeros = _Tp{};
    return __vec_concat(__a, __zeros);
  }
/**
 * Helper for the variadic __vec_concat: view the arguments as consecutive pairs and return the
 * concatenation of the pair with index \p _Offset. Pairs in front of it are dropped
 * recursively.
 */
template <int _Offset, __vec_builtin _Tp, __vec_builtin... _More>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __vec_concat_from_pack(_Tp __a, _Tp __b, _More... __more)
  {
    if constexpr (_Offset != 0)
      // not the requested pair yet: skip __a and __b
      return __vec_concat_from_pack<_Offset - 1>(__more...);
    else
      return __vec_concat(__a, __b);
  }
/**
 * Concatenate three or more vector builtins of equal type.
 *
 * The arguments are concatenated pairwise (an unpaired last argument is padded with a zero
 * vector by __vec_concat_from_pack) and the pairs are then concatenated again, until the
 * two-argument overload ends the recursion.
 */
template <__vec_builtin _Tp, __vec_builtin... _More>
  requires (sizeof...(_More) >= 1)
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __vec_concat(_Tp __a, _Tp __b, _More... __more)
  {
    static_assert((std::is_same_v<_Tp, _More> and ...));
    // number of pairs formed from __more, rounding up
    constexpr int __more_pairs = (sizeof...(_More) + 1) / 2;
    const auto __first_pair = __vec_concat(__a, __b);
    return _GLIBCXX_SIMD_INT_PACK(__more_pairs, _Pair, {
             return __vec_concat(__first_pair, __vec_concat_from_pack<_Pair>(__more...)...);
           });
  }
/**
 * Convert \p __a to _To.
 * Prefer this function over calling __builtin_convertvector directly so that the library can
 * improve code-gen (until the relevant PRs on GCC get resolved).
 *
 * \tparam _To  the destination vector builtin type; the conversion is element-wise via
 *              __builtin_convertvector
 */
template <__vec_builtin _To, __vec_builtin _From>
_GLIBCXX_SIMD_INTRINSIC _To
__vec_convert(_From __a)
{ return __builtin_convertvector(__a, _To); }
/**
 * Convert \p __a element-wise to value type \p _To, keeping the number of elements.
 * The return type is the vector builtin of \p _To with the same width as \p _From.
 */
template <__vectorizable _To, __vec_builtin _From>
  _GLIBCXX_SIMD_INTRINSIC __rebind_vec_builtin_t<_To, _From>
  __vec_convert(_From __a)
  {
    using _Result = __rebind_vec_builtin_t<_To, _From>;
    return __builtin_convertvector(__a, _Result);
  }
/**
 * Convert two or more vector builtins and concatenate the results into one \p _To.
 *
 * Every argument is converted to a vector with the value type of \p _To whose size is
 * sizeof(_To) divided by the argument count rounded up to a power of two; the converted parts
 * are then joined with __vec_concat.
 */
template <__vec_builtin _To, __vec_builtin... _From>
  requires (sizeof...(_From) >= 2)
  _GLIBCXX_SIMD_INTRINSIC _To
  __vec_convert(_From... __pack)
  {
    constexpr auto __parts = std::__bit_ceil(sizeof...(__pack));
    using _Part = __vec_builtin_type_bytes<__value_type_of<_To>, sizeof(_To) / __parts>;
    return __vec_concat(__vec_convert<_Part>(__pack)...);
  }
/**
 * Converts __v into array<_To, N>, where N is _NParts if non-zero or otherwise deduced from _To
 * such that N * #elements(_To) <= #elements(__v). Note: this function may return fewer than all
 * converted elements.
 * \tparam _To        either an arithmetic type (element-wise conversion into array<_To, N>) or
 *                    a vector builtin type (conversion in chunks of __width_of<_To> elements)
 * \tparam _NParts    allows converting fewer or more (only last _To, to be partially filled)
 *                    than all
 * \tparam _Offset    where to start, number of elements (not Bytes or Parts)
 * \tparam _FromSize  number of elements of \p __v to consider; 0 (the default) means all
 *                    __width_of<_From> elements
 */
template <typename _To, int _NParts = 0, int _Offset = 0, int _FromSize = 0, __vec_builtin _From>
_GLIBCXX_SIMD_INTRINSIC auto
__vec_convert_all(_From __v)
{
// NOTE(review): a _FromSize equal to the full width is rejected here; callers have to pass 0
// for "all elements" — confirm that `<` (rather than `<=`) is intended.
static_assert(_FromSize < __width_of<_From>);
constexpr int __input_size = _FromSize == 0 ? __width_of<_From> : _FromSize;
if constexpr (is_arithmetic_v<_To> && _NParts != 1)
{
// Scalar destination: convert element by element, starting at _Offset.
static_assert(_Offset < __width_of<_From>);
constexpr int _Np = _NParts == 0 ? __input_size - _Offset : _NParts;
using _Rp = array<_To, _Np>;
return _GLIBCXX_SIMD_INT_PACK(_Np, _Is, {
return _Rp{(static_cast<_To>(__v[_Is + _Offset]))...};
});
}
else
{
static_assert(__vec_builtin<_To>);
if constexpr (_NParts == 1)
{
// Exactly one chunk requested: _Offset must sit on a chunk boundary so that the chunk
// can be addressed by its part index.
static_assert(_Offset % __width_of<_To> == 0);
return array<_To, 1>{
__vec_convert<_To>(__vec_extract_part<
_Offset / __width_of<_To>,
__div_roundup(__input_size, __width_of<_To>)>(__v))
};
}
else if constexpr ((__input_size - _Offset) > __width_of<_To>)
{
// More input elements than fit into one _To: produce several chunks.
constexpr size_t _NTotal = (__input_size - _Offset) / __width_of<_To>;
constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts;
// One extra chunk beyond the complete ones is only allowed for a partial remainder.
static_assert(_Np <= _NTotal
or (_Np == _NTotal + 1
and (__input_size - _Offset) % __width_of<_To> > 0));
using _Rp = array<_To, _Np>;
if constexpr (_Np == 1)
return _Rp{__vec_convert<_To>(__vec_extract_part<_Offset, __input_size,
__width_of<_To>>(__v))};
else
return _GLIBCXX_SIMD_INT_PACK(_Np, _Is, {
return _Rp {
__vec_convert<_To>(
__vec_extract_part<_Is * __width_of<_To> + _Offset, __input_size,
__width_of<_To>>(__v))...
};
});
}
else if constexpr (_Offset == 0)
// Everything fits into a single _To and nothing has to be skipped.
return array<_To, 1>{__vec_convert<_To>(__v)};
else
// Everything after _Offset fits into a single _To.
return array<_To, 1>{__vec_convert<_To>(
__vec_extract_part<_Offset, __input_size,
__input_size - _Offset>(__v))};
}
}
/**
 * Generator "ctor" for __vec_builtin types.
 *
 * Expands to a _GLIBCXX_SIMD_INT_PACK invocation that returns `_Tp code`, i.e. \p code is
 * expected to be a braced initializer list that may expand the index pack named \p pack.
 *
 * \param _Tp    the vector builtin type to construct
 * \param width  number of indices in the pack
 * \param pack   name of the index pack usable inside \p code
 * \param code   braced initializer list for \p _Tp
 */
#define _GLIBCXX_SIMD_VEC_GEN(_Tp, width, pack, code) \
_GLIBCXX_SIMD_INT_PACK(width, pack, { return _Tp code; })
/**
 * Construct a \p _Tp from a generator: element i is initialized with `__gen(vir::cw<i>)`, for
 * i in [0, _Width).
 *
 * \tparam _Width  number of elements to generate (default: all elements of \p _Tp)
 * \param  __gen   callable that is invoked with a compile-time index constant
 */
template <__vec_builtin _Tp, int _Width = __width_of<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
  __vec_generate(auto&& __gen)
  {
    return _GLIBCXX_SIMD_INT_PACK(_Width, _Is, {
             return _Tp {__gen(vir::cw<_Is>)...};
           });
  }
/**
 * Return a vector builtin whose first \p _Width elements are copies of \p __x.
 *
 * The vector width is \p _Width rounded up to a power of two. Only \p _Width initializers are
 * supplied, so any padding elements are left to the braced initialization (presumably
 * zero-initialized).
 */
template <int _Width, typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vec_builtin_type<_Tp, __bit_ceil(_Width)>
  __vec_broadcast(_Tp __x)
  {
    using _Rp = __vec_builtin_type<_Tp, __bit_ceil(_Width)>;
    // The conditional is always true for the generated indices; it is what ties the repeated
    // initializer to the index pack.
    return _GLIBCXX_SIMD_INT_PACK(_Width, __is, {
             return _Rp {(__is < _Width ? __x : _Tp())...};
           });
  }
/**
 * Bitwise XOR of two vector builtins. For floating-point value types the operation is applied
 * to the object representation (via unsigned integer vectors of equal width).
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr _TV
  __vec_xor(_TV __a, _TV __b)
  {
    using _Tp = __value_type_of<_TV>;
    if constexpr (not is_floating_point_v<_Tp>)
      return __a ^ __b;
    else
      {
        using _Bits = __vec_builtin_type<__make_unsigned_int_t<_Tp>, __width_of<_TV>>;
        const _Bits __lhs = __builtin_bit_cast(_Bits, __a);
        const _Bits __rhs = __builtin_bit_cast(_Bits, __b);
        return __builtin_bit_cast(_TV, __lhs ^ __rhs);
      }
  }
/**
 * Bitwise OR of two vector builtins. For floating-point value types the operation is applied
 * to the object representation (via unsigned integer vectors of equal width).
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr _TV
  __vec_or(_TV __a, _TV __b)
  {
    using _Tp = __value_type_of<_TV>;
    if constexpr (not is_floating_point_v<_Tp>)
      return __a | __b;
    else
      {
        using _Bits = __vec_builtin_type<__make_unsigned_int_t<_Tp>, __width_of<_TV>>;
        const _Bits __lhs = __builtin_bit_cast(_Bits, __a);
        const _Bits __rhs = __builtin_bit_cast(_Bits, __b);
        return __builtin_bit_cast(_TV, __lhs | __rhs);
      }
  }
/**
 * Bitwise AND of two vector builtins. For floating-point value types the operation is applied
 * to the object representation (via unsigned integer vectors of equal width).
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr _TV
  __vec_and(_TV __a, _TV __b)
  {
    using _Tp = __value_type_of<_TV>;
    if constexpr (not is_floating_point_v<_Tp>)
      return __a & __b;
    else
      {
        using _Bits = __vec_builtin_type<__make_unsigned_int_t<_Tp>, __width_of<_TV>>;
        const _Bits __lhs = __builtin_bit_cast(_Bits, __a);
        const _Bits __rhs = __builtin_bit_cast(_Bits, __b);
        return __builtin_bit_cast(_TV, __lhs & __rhs);
      }
  }
/**
 * Bitwise AND-NOT of two vector builtins: returns `~__a & __b`, computed on the object
 * representation (via unsigned integer vectors of equal width) for every value type.
 */
//overloaded in x86_detail.h
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr _TV
  __vec_andnot(_TV __a, _TV __b)
  {
    using _Bits
      = __vec_builtin_type<__make_unsigned_int_t<__value_type_of<_TV>>, __width_of<_TV>>;
    const _Bits __inverted = ~__builtin_bit_cast(_Bits, __a);
    const _Bits __rhs = __builtin_bit_cast(_Bits, __b);
    return __builtin_bit_cast(_TV, __inverted & __rhs);
  }
/**
 * Bitwise NOT of a vector builtin. For floating-point value types the complement is applied to
 * the object representation (via an unsigned integer vector of equal width).
 *
 * The integer vector must have exactly sizeof(_TV) bytes for the bit-casts to be valid.
 * __vec_builtin_type takes an element count, so it is instantiated with the unsigned integer
 * of the element's size and the width of \p _TV, as __vec_andnot does. The former
 * `__vec_builtin_type<unsigned, sizeof(_TV)>` named a vector four times too large, making the
 * floating-point branch ill-formed.
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr _TV
  __vec_not(_TV __a)
  {
    using _Tp = __value_type_of<_TV>;
    if constexpr (is_floating_point_v<_Tp>)
      {
        using _UV = __vec_builtin_type<__make_unsigned_int_t<_Tp>, __width_of<_TV>>;
        return __builtin_bit_cast(_TV, ~__builtin_bit_cast(_UV, __a));
      }
    else
      return ~__a;
  }
/**
 * Reinterpret the bits of \p __x as a vector builtin with value type \p _Up and equal sizeof.
 *
 * \tparam _Np  if non-zero, the required width of the result; the cast is ill-formed if a
 *              vector of \p _Np elements of \p _Up does not have the size of \p __x
 */
template <typename _Up, int _Np = 0, __vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __vec_bitcast(_TV __x)
  {
    if constexpr (_Np != 0)
      {
        using _Result = __vec_builtin_type<_Up, _Np>;
        return __builtin_bit_cast(_Result, __x);
      }
    else
      {
        using _Result = __vec_builtin_type_bytes<_Up, sizeof(__x)>;
        return __builtin_bit_cast(_Result, __x);
      }
  }
/**
 * Reinterpret the leading bytes of \p __x as the vector type \p _UV; sizeof(_UV) may be smaller
 * than sizeof(__x), in which case the remaining bytes are dropped.
 */
template <__vec_builtin _UV, __vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr _UV
  __vec_bitcast_trunc(_TV __x)
  {
    static_assert(sizeof(_UV) <= sizeof(_TV));
    if constexpr (sizeof(_UV) == sizeof(_TV))
      return __builtin_bit_cast(_UV, __x);
    else if constexpr (sizeof(_UV) > 8)
      {
        // Reinterpret with the target value type, then keep the first __width_of<_UV> elements.
        using _Wide = __vec_builtin_type_bytes<__value_type_of<_UV>, sizeof(__x)>;
        const auto __wide = __builtin_bit_cast(_Wide, __x);
        return _GLIBCXX_SIMD_INT_PACK(__width_of<_UV>, _Pos, {
                 return __builtin_shufflevector(__wide, __wide, _Pos...);
               });
      }
    else
      {
        // The whole result fits into one integer: reinterpret __x as a vector of such integers
        // and bit-cast its first element.
        using _Ip = __make_signed_int_t<_UV>;
        using _IV = __vec_builtin_type_bytes<_Ip, sizeof(__x)>;
        return __builtin_bit_cast(_UV, __builtin_bit_cast(_IV, __x)[0]);
      }
  }
/**
 * Pass \p __x through an empty inline-asm statement and return it.
 *
 * The asm declares \p __x as a read-write operand ("+") and lists the constraint alternatives
 * "v", "x", and "g", so the compiler has to treat the value as potentially modified at this
 * point. Presumably used to keep the optimizer from transforming computations across the
 * barrier; the function is not constexpr.
 */
template <__vec_builtin _TV>
_GLIBCXX_SIMD_INTRINSIC _TV
__vec_optimizer_barrier(_TV __x)
{
asm("":"+v,x,g"(__x));
return __x;
}
/**
 * Widen \p __x to a vector builtin of 16 bytes with the same value type.
 * A 16-byte input is returned unchanged; a smaller input ends up in the leading bytes of the
 * result and the remaining bytes are zero. Inputs larger than 16 bytes are rejected.
 */
template <__vec_builtin _TV>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __vec_zero_pad_to_16(_TV __x)
  {
    static_assert(sizeof(_TV) <= 16);
    if constexpr (sizeof(_TV) < 16)
      {
        // Treat the whole input as one integer and make it the first element of a 16-byte
        // integer vector; the elements without initializer are zero.
        using _Ip = __make_signed_int_t<_TV>;
        using _Padded = __vec_builtin_type_bytes<_Ip, 16>;
        using _Result = __vec_builtin_type_bytes<__value_type_of<_TV>, 16>;
        const _Padded __padded = {__builtin_bit_cast(_Ip, __x)};
        return __builtin_bit_cast(_Result, __padded);
      }
    else
      return __x;
  }
}
#endif // PROTOTYPE_VEC_DETAIL_H_