// Allow vectors of floats to use permute/blend functions of the corresponding integer vectors
#define ALLOW_FP_PERMUTE true

// Detect 64 bit mode: some compilers indicate x64 only by these macros
#if ( defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ) ) && !defined( __x86_64__ )
# define __x86_64__ 1
#endif
// Find the instruction set from compiler macros if INSTRSET is not defined
#ifndef INSTRSET
# if defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ )
#   define INSTRSET 10
# elif defined( __AVX512F__ ) || defined( __AVX512__ )
#   define INSTRSET 9
# elif defined( __AVX2__ )
#   define INSTRSET 8
# elif defined( __AVX__ )
#   define INSTRSET 7
# elif defined( __SSE4_2__ )
#   define INSTRSET 6
# elif defined( __SSE4_1__ )
#   define INSTRSET 5
# elif defined( __SSSE3__ )
#   define INSTRSET 4
# elif defined( __SSE3__ )
#   define INSTRSET 3
# elif defined( __SSE2__ ) || defined( __x86_64__ )
#   define INSTRSET 2
# elif defined( __SSE__ )
#   define INSTRSET 1
# elif defined( _M_IX86_FP )
#   define INSTRSET _M_IX86_FP
# else
#   define INSTRSET 0
# endif
#endif
// Include the appropriate header file for intrinsic functions
#if INSTRSET > 7                       // AVX2 and later
# if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
#   include <x86intrin.h>              // Gnu: includes headers for all instruction sets enabled on the command line
# else
#   include <immintrin.h>              // MS/Intel version of immintrin.h covers AVX and later
# endif
#elif INSTRSET == 7
# include <immintrin.h>                // AVX
#elif INSTRSET == 6
# include <nmmintrin.h>                // SSE4.2
#elif INSTRSET == 5
# include <smmintrin.h>                // SSE4.1
#elif INSTRSET == 4
# include <tmmintrin.h>                // SSSE3
#elif INSTRSET == 3
# include <pmmintrin.h>                // SSE3
#elif INSTRSET == 2
# include <emmintrin.h>                // SSE2
#elif INSTRSET == 1
# include <xmmintrin.h>                // SSE
#endif
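// Usage sketch (not part of the original header): the rest of the library
// selects implementations by comparing INSTRSET against these levels, e.g.
//
//     #if INSTRSET >= 8      // AVX2: 256-bit integer vectors are available
//         ... use _mm256_* integer intrinsics ...
//     #endif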
#if INSTRSET >= 8 && !defined( __FMA__ )
// Assume that all processors that have AVX2 also have FMA3
# if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
// Prevent error message in g++ when using FMA intrinsics with AVX2
#   if !defined( DISABLE_WARNING_AVX2_WITHOUT_FMA )
#     pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
#   endif
# elif !defined( __clang__ )
#   define __FMA__ 1
# endif
#endif
// AMD instruction sets
#if defined( __XOP__ ) || defined( __FMA4__ )
# ifdef __GNUC__
#   include <x86intrin.h>              // AMD XOP (Gnu)
# else
#   include <ammintrin.h>              // AMD XOP (Microsoft)
# endif
#elif defined( __SSE4A__ )             // AMD SSE4A
# include <ammintrin.h>
#endif

// FMA3 instruction set
#if defined( __FMA__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) ) && !defined( __INTEL_COMPILER )
# include <fmaintrin.h>
#endif

// FMA4 instruction set
#if defined( __FMA4__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) )
# include <fma4intrin.h>               // must have both x86intrin.h and fma4intrin.h
#endif
#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif
// Functions in instrset_detect.cpp:
int instrset_detect( void );                            // tells which instruction sets are supported
bool hasAVX512VBMI2( void );                            // true if AVX512_VBMI2 instructions are supported
// Function in physical_processors.cpp:
int physicalProcessors( int* logical_processors = 0 );  // count physical and logical processors
#ifdef VCL_NAMESPACE
}
#endif
// GCC version
#if defined( __GNUC__ ) && !defined( GCC_VERSION ) && !defined( __clang__ )
# define GCC_VERSION ( (__GNUC__)*10000 + (__GNUC_MINOR__)*100 + (__GNUC_PATCHLEVEL__) )
#endif

// Clang version
#if defined( __clang__ )
# define CLANG_VERSION ( (__clang_major__)*10000 + (__clang_minor__)*100 + (__clang_patchlevel__) )
#endif
// Fix problem with the non-overloadable macros named min and max in WinDef.h
#ifdef _MSC_VER
# if defined( _WINDEF_ ) && defined( min ) && defined( max )
#   undef min
#   undef max
# endif
#endif
#if defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 9999
# error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
#endif
// Fix overload ambiguity caused by Clang treating built-in vector types as identical
#if ( defined( __clang__ ) || defined( __apple_build_version__ ) ) && !defined( __INTEL_COMPILER )
# define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
#endif
#if defined( GCC_VERSION ) && GCC_VERSION < 99999 && !defined( __clang__ )
# define ZEXT_MISSING                  // Gcc does not have _mm256_zextsi128_si256 and similar functions
#endif
#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif
// cpuid: call the CPUID machine instruction.
// input:  functionnumber = leaf (eax), ecxleaf = subleaf (ecx)
// output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
static inline void cpuid( int output[4], int functionnumber, int ecxleaf = 0 ) {
#if defined( __GNUC__ ) || defined( __clang__ )    // use inline assembly, Gnu/AT&T syntax
    int a, b, c, d;
    __asm( "cpuid" : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) : "a"( functionnumber ), "c"( ecxleaf ) : );
    output[0] = a;
    output[1] = b;
    output[2] = c;
    output[3] = d;
#elif defined( _MSC_VER )                          // Microsoft compiler: intrinsic function for CPUID
    __cpuidex( output, functionnumber, ecxleaf );
#elif defined( _M_IX86 )                           // other 32-bit compiler: inline assembly, masm/intel syntax
    __asm {
        mov eax, functionnumber
        mov ecx, ecxleaf
        cpuid
        mov esi, output
        mov [esi], eax
        mov [esi + 4], ebx
        mov [esi + 8], ecx
        mov [esi + 12], edx
    }
#else                                              // unknown platform: cpuid not available
    for ( int i = 0; i < 4; i++ ) { output[i] = 0; }
#endif
}
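// Illustrative sketch (not part of the original header): one way a caller
// might use cpuid() to read the 12-character CPU vendor string. Leaf 0
// returns the vendor in the register order ebx, edx, ecx; the helper name
// below is hypothetical.
static inline void example_read_vendor( char vendor[13] ) {
    int abcd[4] = { 0, 0, 0, 0 };
    cpuid( abcd, 0 );                              // leaf 0: max leaf in eax, vendor in ebx/edx/ecx
    const int order[3] = { 1, 3, 2 };              // registers holding the string, in order
    for ( int r = 0; r < 3; r++ ) {
        for ( int b = 0; b < 4; b++ ) {            // each register is a little-endian 4-byte chunk
            vendor[r * 4 + b] = char( ( abcd[order[r]] >> ( 8 * b ) ) & 0xFF );
        }
    }
    vendor[12] = 0;                                // terminate string, e.g. "GenuineIntel"
}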
// vml_popcnt: count the number of set bits
#if INSTRSET >= 6                                  // SSE4.2: popcnt instruction available
static inline uint32_t vml_popcnt( uint32_t a ) {
    return (uint32_t)_mm_popcnt_u32( a );          // Intel intrinsic
}
# ifdef __x86_64__
static inline int64_t vml_popcnt( uint64_t a ) {   // 64 bit version
    return _mm_popcnt_u64( a );
}
# else                                             // 32 bit mode
static inline int64_t vml_popcnt( uint64_t a ) {
    return _mm_popcnt_u32( uint32_t( a >> 32 ) ) + _mm_popcnt_u32( uint32_t( a ) );
}
# endif
#else                                              // popcnt instruction not available
static inline uint32_t vml_popcnt( uint32_t a ) {
    uint32_t b = a - ( ( a >> 1 ) & 0x55555555 );                   // per-2-bit partial sums
    uint32_t c = ( b & 0x33333333 ) + ( ( b >> 2 ) & 0x33333333 );  // per-4-bit partial sums
    uint32_t d = ( c + ( c >> 4 ) ) & 0x0F0F0F0F;                   // per-byte partial sums
    uint32_t e = d * 0x01010101;                                    // total lands in the top byte
    return e >> 24;
}

static inline int32_t vml_popcnt( uint64_t a ) {   // 64 bit version
    return vml_popcnt( uint32_t( a >> 32 ) ) + vml_popcnt( uint32_t( a ) );
}
#endif
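// Self-check sketch (not part of the original header): a constexpr mirror of
// the fallback popcount so the SWAR bit trick can be verified at compile time.
// For a = 0xF0, b holds per-2-bit counts {0,0,2,2}, c per-nibble counts {0,4},
// d per-byte counts {4}, and the 0x01010101 multiply moves the byte total into
// the top byte. The helper name is hypothetical.
constexpr uint32_t popcnt_sketch( uint32_t a ) {
    uint32_t b = a - ( ( a >> 1 ) & 0x55555555 );
    uint32_t c = ( b & 0x33333333 ) + ( ( b >> 2 ) & 0x33333333 );
    uint32_t d = ( c + ( c >> 4 ) ) & 0x0F0F0F0F;
    return ( d * 0x01010101 ) >> 24;
}
static_assert( popcnt_sketch( 0xF0 ) == 4 && popcnt_sketch( 0xFFFFFFFF ) == 32,
               "SWAR popcount" );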
// bit_scan_forward: index of the lowest set bit; undefined for a = 0
#if defined( __GNUC__ ) || defined( __clang__ )
# if defined( __clang__ )
__attribute__( ( noinline ) )                      // fix bug in Clang: a mask register may be used for parameter a when inlined
# endif
static uint32_t bit_scan_forward( uint32_t a ) {
    uint32_t r;
    __asm( "bsfl %1, %0" : "=r"( r ) : "r"( a ) : );
    return r;
}
static inline uint32_t bit_scan_forward( uint64_t a ) {
    uint32_t lo = uint32_t( a );
    if ( lo ) return bit_scan_forward( lo );
    uint32_t hi = uint32_t( a >> 32 );
    return bit_scan_forward( hi ) + 32;
}
#else                                              // MS compiler: use the _BitScanForward intrinsic
static inline uint32_t bit_scan_forward( uint32_t a ) {
    unsigned long r;
    _BitScanForward( &r, a );                      // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
# ifdef __x86_64__
static inline uint32_t bit_scan_forward( uint64_t a ) {
    unsigned long r;
    _BitScanForward64( &r, a );
    return (uint32_t)r;
}
# else                                             // 32 bit mode
static inline uint32_t bit_scan_forward( uint64_t a ) {
    uint32_t lo = uint32_t( a );
    if ( lo ) return bit_scan_forward( lo );
    uint32_t hi = uint32_t( a >> 32 );
    return bit_scan_forward( hi ) + 32;
}
# endif
#endif
// bit_scan_reverse: index of the highest set bit; undefined for a = 0
#if defined( __GNUC__ ) || defined( __clang__ )
static inline uint32_t bit_scan_reverse( uint32_t a ) __attribute__( ( pure ) );
static inline uint32_t bit_scan_reverse( uint32_t a ) {
    uint32_t r;
    __asm( "bsrl %1, %0" : "=r"( r ) : "r"( a ) : );
    return r;
}
# ifdef __x86_64__
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t r;
    __asm( "bsrq %1, %0" : "=r"( r ) : "r"( a ) : );
    return uint32_t( r );
}
# else                                             // 32 bit mode
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t ahi = a >> 32;                        // high 32 bits
    if ( ahi == 0 )
        return bit_scan_reverse( uint32_t( a ) );
    else
        return bit_scan_reverse( uint32_t( ahi ) ) + 32;
}
# endif
#else                                              // MS compiler: use the _BitScanReverse intrinsic
static inline uint32_t bit_scan_reverse( uint32_t a ) {
    unsigned long r;
    _BitScanReverse( &r, a );
    return (uint32_t)r;
}
# ifdef __x86_64__
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    unsigned long r;
    _BitScanReverse64( &r, a );
    return (uint32_t)r;
}
# else                                             // 32 bit mode
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t ahi = a >> 32;                        // high 32 bits
    if ( ahi == 0 )
        return bit_scan_reverse( uint32_t( a ) );
    else
        return bit_scan_reverse( uint32_t( ahi ) ) + 32;
}
# endif
#endif
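// Semantics sketch (not part of the original header): bit_scan_forward returns
// the index of the lowest set bit and bit_scan_reverse the index of the highest
// set bit; for 0x1040 (bits 6 and 12 set) they return 6 and 12 respectively.
// Both are undefined for a = 0, matching the underlying BSF/BSR instructions.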
// Same function, for compile-time constants
constexpr int bit_scan_reverse_const( uint64_t const n ) {
    if ( n == 0 ) return -1;
    uint64_t a = n, b = 0, j = 64, k = 0;
    do {
        j >>= 1;                                   // halve the step
        k = (uint64_t)1 << j;                      // 2^j
        if ( a >= k ) {                            // the upper half is non-empty:
            a >>= j;                               // shift it down
            b += j;                                // and add j to the result
        }
    } while ( j > 0 );
    return int( b );
}
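// Usage sketch (not part of the original header): the function is constexpr,
// so the index of the highest set bit can be checked at compile time.
static_assert( bit_scan_reverse_const( 1 ) == 0 && bit_scan_reverse_const( 0x1040 ) == 12,
               "highest set bit of 0x1040 is bit 12" );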
// Template classes to represent compile-time integer constants as distinct types
template <int32_t n> class Const_int_t {};         // represent compile-time signed integer constant
template <uint32_t n> class Const_uint_t {};       // represent compile-time unsigned integer constant
#define const_int( n ) ( Const_int_t<n>() )        // n must be a compile-time integer constant
#define const_uint( n ) ( Const_uint_t<n>() )      // n must be a compile-time unsigned integer constant
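// Usage sketch (not part of the original header): wrapping the constant in a
// type lets a function require a compile-time constant argument:
//
//     template <uint32_t n> void rotate_by( Const_uint_t<n> ) { /* n is a template parameter */ }
//     // rotate_by( const_uint( 4 ) );   // passes n = 4 at compile time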
// Generate a vector of quiet NANs with a given payload
template <class VTYPE>
static inline VTYPE nan_vec( uint32_t payload = 0x100 ) {
    if constexpr ( ( VTYPE::elementtype() & 1 ) != 0 ) {  // double precision
        union {
            uint64_t q;
            double f;
        } ud;
        // the payload is left-justified to avoid loss when converting to float
        ud.q = 0x7FF8000000000000 | uint64_t( payload ) << 29;
        return VTYPE( ud.f );
    }
    // single precision
    union {
        uint32_t i;
        float f;
    } uf;
    uf.i = 0x7FC00000 | ( payload & 0x003FFFFF );
    return VTYPE( uf.f );
}
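// Bit-pattern sketch (not part of the original header): a quiet float NAN sets
// all exponent bits plus the quiet bit (0x7FC00000), and the payload occupies
// the remaining mantissa bits, so payload 5 encodes as 0x7FC00005.
static_assert( ( 0x7FC00000 | ( 5u & 0x003FFFFF ) ) == 0x7FC00005,
               "quiet NAN payload encoding" );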
// Template class for a list of compile-time constants, used for returning
// index lists from constexpr functions
template <typename T, int N>
class EList {
public:
    T a[N];
};
// get_inttype: return an integer of the same size as the elements of vector
// class V, with all bits set
template <typename V>
constexpr auto get_inttype() {
    constexpr int elementsize = sizeof( V ) / V::size();  // size of each vector element
    if constexpr ( elementsize >= 8 ) {
        return -int64_t( 1 );
    }
    else if constexpr ( elementsize >= 4 ) {
        return int32_t( -1 );
    }
    else if constexpr ( elementsize >= 2 ) {
        return int16_t( -1 );
    }
    else {
        return int8_t( -1 );
    }
}
// zero_mask: return a compact bit mask for zeroing, for use with AVX512 mask
// operations. Parameter a is an array of permutation indexes; index -1 means zero
template <int N>
constexpr auto zero_mask( int const ( &a )[N] ) {
    uint64_t mask = 0;
    for ( int i = 0; i < N; i++ ) {
        if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i;      // bit i set if element i is kept
    }
    if constexpr ( N <= 8 )
        return uint8_t( mask );
    else if constexpr ( N <= 16 )
        return uint16_t( mask );
    else if constexpr ( N <= 32 )
        return uint32_t( mask );
    else
        return mask;
}
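// Usage sketch (not part of the original header): index -1 marks an element to
// be zeroed, so {0, -1, 2, -1} keeps elements 0 and 2 and the mask is 0b0101.
constexpr int zero_mask_example[4] = { 0, -1, 2, -1 };    // hypothetical index list
static_assert( zero_mask( zero_mask_example ) == 0x05, "bit set where index >= 0" );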
// zero_mask_broad: return a broad element mask for zeroing.
// Parameter A is an array of permutation indexes; index -1 means zero
template <typename V>
constexpr auto zero_mask_broad( int const ( &A )[V::size()] ) {
    constexpr int N = V::size();                   // number of vector elements
    typedef decltype( get_inttype<V>() ) Etype;    // element type
    EList<Etype, N> u = { { 0 } };                 // list for return
    for ( int i = 0; i < N; i++ ) { u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0; }
    return u;                                      // return encapsulated array
}
// make_bit_mask: return a 64-bit mask, one bit per element, for use with the
// Gnu vector builtins. B bits 0-7 give the index of the selector bit within
// each element index; higher bits of B choose the value used for negative
// indexes and whether the selector bit is flipped for the lower and upper half
template <int N, int B>
constexpr uint64_t make_bit_mask( int const ( &a )[N] ) {
    uint64_t r = 0;                                // return value
    uint8_t j = uint8_t( B & 0xFF );               // index of selector bit
    uint64_t s = 0;                                // bit number i in r
    uint64_t f = 0;                                // 1 if bit not flipped
    for ( int i = 0; i < N; i++ ) {
        int ix = a[i];
        if ( ix < 0 ) {                            // -1 or V_DC
            s = ( B >> 10 ) & 1;                   // use the default bit for negative indexes
        }
        else {
            s = ( (uint32_t)ix >> j ) & 1;         // extract the selector bit from the index
            f = ( i < N / 2 ? B >> 8 : B >> 9 ) & 1;  // flip option for lower / upper half
            s ^= f ^ 1;                            // flip the bit if instructed by B
        }
        r |= uint64_t( s ) << i;                   // set bit number i in the return value
    }
    return r;
}
// make_broad_mask: expand a compact bit mask to a broad element mask
template <typename V>
constexpr auto make_broad_mask( uint64_t const m ) {
    constexpr int N = V::size();                   // number of vector elements
    typedef decltype( get_inttype<V>() ) Etype;    // element type
    EList<Etype, N> u = { { 0 } };                 // list for return
    for ( int i = 0; i < N; i++ ) { u.a[i] = ( ( m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; }
    return u;
}
// perm_mask_broad: return a broad permutation mask with the indexes converted
// to the element type of vector class V
template <typename V>
constexpr auto perm_mask_broad( int const ( &A )[V::size()] ) {
    constexpr int N = V::size();                   // number of vector elements
    typedef decltype( get_inttype<V>() ) Etype;    // element type
    EList<Etype, N> u = { { 0 } };                 // list for return
    for ( int i = 0; i < N; i++ ) { u.a[i] = Etype( A[i] ); }
    return u;
}
// perm_flags: analyze a permutation index list and return a bit field of flags
// indicating which implementation patterns fit. The perm_* flag constants and
// V_DC (don't care) are defined elsewhere in this header.
// V is the vector class; a is the array of permutation indexes
template <typename V>
constexpr uint64_t perm_flags( int const ( &a )[V::size()] ) {
    constexpr int N = V::size();                   // number of elements
    uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero;  // return value
    uint32_t i = 0;                                // loop counter
    int j = 0;                                     // loop counter
    int ix = 0;                                    // index number i
    const uint32_t nlanes = sizeof( V ) / 16;      // number of 128-bit lanes
    const uint32_t lanesize = N / nlanes;          // elements per lane
    const uint32_t elementsize = sizeof( V ) / N;  // size of each element in bytes
    uint32_t lane = 0;                             // current lane
    uint32_t rot = 999;                            // rotate left count
    int32_t broadc = 999;                          // index to broadcasted element
    uint32_t patfail = 0;                          // remember patterns that do not fit
    uint32_t addz2 = 0;                            // remember additional zeroing needs
    int32_t compresslasti = -1;                    // last index in perm_compress fit
    int32_t compresslastp = -1;                    // last position in perm_compress fit
    int32_t expandlasti = -1;                      // last index in perm_expand fit
    int32_t expandlastp = -1;                      // last position in perm_expand fit
    int lanepattern[lanesize] = { 0 };             // pattern in each lane

    for ( i = 0; i < N; i++ ) {                    // loop through indexes
        ix = a[i];                                 // current index: -1 = zero, V_DC = don't care
        if ( ix == -1 ) {
            r |= perm_zeroing;                     // zeroing requested
        }
        else if ( ix != V_DC && uint32_t( ix ) >= N ) {
            r |= perm_outofrange;                  // index out of range
        }
        if ( ix >= 0 ) {
            r &= ~perm_allzero;                    // at least one element is nonzero
            if ( ix != (int)i ) r |= perm_perm;    // needs permutation
            if ( broadc == 999 )
                broadc = ix;                       // remember broadcast index
            else if ( broadc != ix )
                broadc = 1000;                     // does not fit broadcast
        }
        // check if the pattern fits the double block size: even positions must
        // hold even indexes, odd positions the preceding index + 1
        if ( ( i & 1 ) == 0 ) {                    // even position
            int iy = a[i + 1];                     // next odd index
            if ( ix >= 0 && ( ix & 1 ) ) r &= ~perm_largeblock;
            if ( iy >= 0 && ( ( iy & 1 ) == 0 || ( ix >= 0 && iy != ix + 1 ) ) ) r &= ~perm_largeblock;
            if ( ix == -1 && iy >= 0 ) r |= perm_addz;   // additional zeroing needed
            if ( iy == -1 && ix >= 0 ) r |= perm_addz;
        }
        lane = i / lanesize;                       // current lane
        if ( lane == 0 ) {
            lanepattern[i] = ix;                   // save pattern
        }
        if ( ix >= 0 ) {                           // check if crossing lanes
            uint32_t lanei = (uint32_t)ix / lanesize;    // source lane
            if ( lanei != lane ) r |= perm_cross_lane;
        }
        if ( lane != 0 && ix >= 0 ) {              // not first lane: check if same pattern
            int j1 = i - int( lane * lanesize );   // index into lanepattern
            int jx = ix - int( lane * lanesize );  // pattern within lane
            if ( jx < 0 || jx >= (int)lanesize ) r &= ~perm_same_pattern;  // source in another lane
            if ( lanepattern[j1] < 0 ) {
                lanepattern[j1] = jx;              // pattern not known from previous lane
            }
            else if ( lanepattern[j1] != jx ) r &= ~perm_same_pattern;
        }
        if ( ix >= 0 ) {
            if ( uint32_t( ix * 2 ) != i ) {       // check if it fits zero extension
                patfail |= 1;
            }
            // check if it fits a compress pattern
            if ( ix > compresslasti && ix - compresslasti >= (int)i - compresslastp ) {
                if ( (int)i - compresslastp > 1 ) addz2 |= 2;  // compress may need additional zeroing
                compresslasti = ix;
                compresslastp = i;
            }
            else {
                patfail |= 2;                      // does not fit perm_compress
            }
            // check if it fits an expand pattern
            if ( ix > expandlasti && ix - expandlasti <= (int)i - expandlastp ) {
                if ( ix - expandlasti > 1 ) addz2 |= 4;  // expand may need additional zeroing
                expandlasti = ix;
                expandlastp = i;
            }
            else {
                patfail |= 4;                      // does not fit perm_expand
            }
        }
        else if ( ix == -1 ) {
            if ( ( i & 1 ) == 0 ) addz2 |= 1;      // zero extension needs additional zeroing
        }
    }
    if ( !( r & perm_perm ) ) return r;            // no permutation: skip further checks

    if ( !( r & perm_largeblock ) ) r &= ~perm_addz;    // remove irrelevant flag
    if ( r & perm_cross_lane ) r &= ~perm_same_pattern; // remove irrelevant flag
    if ( ( patfail & 1 ) == 0 ) {
        r |= perm_zext;                            // fits zero extension
    }
    else if ( ( patfail & 2 ) == 0 ) {
        r |= perm_compress;                        // fits compression
        if ( ( addz2 & 2 ) != 0 ) {                // check if additional zeroing is needed
            for ( j = 0; j < compresslastp; j++ ) {
                if ( a[j] == -1 ) r |= perm_addz2;
            }
        }
    }
    else if ( ( patfail & 4 ) == 0 ) {
        r |= perm_expand;                          // fits expansion
        if ( ( addz2 & 4 ) != 0 ) {                // check if additional zeroing is needed
            for ( j = 0; j < expandlastp; j++ ) {
                if ( a[j] == -1 ) r |= perm_addz2;
            }
        }
    }

    if ( r & perm_same_pattern ) {
        // same pattern in all lanes: check rotate, shift, unpack, and pshufd fits
        bool fit = true;
        for ( i = 0; i < lanesize; i++ ) {         // fit shift or rotate
            if ( lanepattern[i] >= 0 ) {
                uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize;
                if ( rot == 999 )
                    rot = rot1;                    // save rotate count
                else if ( rot != rot1 ) fit = false;
            }
        }
        rot &= lanesize - 1;                       // prevent out-of-range values
        if ( fit ) {                               // fits rotate, and possibly shift
            uint64_t rot2 = ( rot * elementsize ) & 0xF;   // rotate count in bytes
            r |= rot2 << perm_rot_count;           // save rotate count
            r |= perm_rotate;                      // allow palignr
            fit = true;                            // fit shift left
            for ( i = 0; i < lanesize - rot; i++ ) {    // low positions must be zero or don't care
                if ( lanepattern[i] >= 0 ) fit = false;
            }
            if ( fit ) {
                r |= perm_shleft;
                for ( ; i < lanesize; i++ )
                    if ( lanepattern[i] == -1 ) r |= perm_addz;  // additional zeroing needed
            }
            fit = true;                            // fit shift right
            for ( i = lanesize - (uint32_t)rot; i < lanesize; i++ ) {  // high positions must be zero or don't care
                if ( lanepattern[i] >= 0 ) fit = false;
            }
            if ( fit ) {
                r |= perm_shright;
                for ( i = 0; i < lanesize - rot; i++ ) {
                    if ( lanepattern[i] == -1 ) r |= perm_addz;  // additional zeroing needed
                }
            }
        }
        fit = true;                                // fit punpckhi
        uint32_t j2 = lanesize / 2;
        for ( i = 0; i < lanesize; i++ ) {
            if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
            if ( ( i & 1 ) != 0 ) j2++;
        }
        if ( fit ) r |= perm_punpckh;
        fit = true;                                // fit punpcklo
        j2 = 0;
        for ( i = 0; i < lanesize; i++ ) {
            if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
            if ( ( i & 1 ) != 0 ) j2++;
        }
        if ( fit ) r |= perm_punpckl;
        if ( elementsize >= 4 ) {                  // fit pshufd
            uint64_t p = 0;
            for ( i = 0; i < lanesize; i++ ) {
                if ( lanesize == 4 ) {
                    p |= ( lanepattern[i] & 3 ) << 2 * i;
                }
                else {                             // lanesize == 2
                    p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i;
                }
            }
            r |= p << perm_ipattern;
        }
    }
    else {                                         // not same pattern in all lanes
        if constexpr ( nlanes > 1 ) {              // check if it fits a big rotate
            for ( i = 0; i < N; i++ ) {
                ix = a[i];
                if ( ix >= 0 ) {
                    uint32_t rot2 = ( ix + N - i ) % N;  // rotate count
                    if ( rot == 999 ) {
                        rot = rot2;                // save rotate count
                    }
                    else if ( rot != rot2 ) {
                        rot = 1000;                // does not fit big rotate
                        break;
                    }
                }
            }
            if ( rot < N ) r |= perm_rotate_big | (uint64_t)rot << perm_rot_count;
        }
    }
    if ( broadc < 999 && ( r & ( perm_rotate | perm_shright | perm_shleft | perm_rotate_big ) ) == 0 ) {
        r |= perm_broadcast | (uint64_t)broadc << perm_rot_count;  // fits broadcast
    }
    return r;
}
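// Usage sketch (hypothetical values; the perm_* flag constants are defined
// elsewhere in this header): perm_flags classifies an index list at compile
// time. For a 4-element vector, the list {2, 3, 0, 1} is a rotate by two
// elements, and the caller branches on flags such as perm_rotate with
// if constexpr to emit the cheapest fitting instruction.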
// compress_mask: return a bit mask for a compress instruction (vpcompress).
// It is presupposed that perm_flags indicates that the compress pattern fits;
// additional zeroing is needed if perm_flags indicates perm_addz2
template <int N>
constexpr uint64_t compress_mask( int const ( &a )[N] ) {
    int ix = 0, lasti = -1, lastp = -1;            // current index, last index, last position
    uint64_t m = 0;                                // return mask
    int i = 0, j = 1;                              // loop counters
    for ( i = 0; i < N; i++ ) {
        ix = a[i];                                 // permutation index
        if ( ix >= 0 ) {
            m |= (uint64_t)1 << ix;                // mark the used source element
            for ( j = 1; j < i - lastp; j++ ) {
                m |= (uint64_t)1 << ( lasti + j ); // fill a destination gap by consuming extra source elements
            }
            lastp = i;
            lasti = ix;
        }
    }
    return m;
}
// expand_mask: return a bit mask for an expand instruction (vpexpand).
// It is presupposed that perm_flags indicates that the expand pattern fits;
// additional zeroing is needed if perm_flags indicates perm_addz2
template <int N>
constexpr uint64_t expand_mask( int const ( &a )[N] ) {
    int ix = 0, lasti = -1, lastp = -1;            // current index, last index, last position
    uint64_t m = 0;                                // return mask
    int i = 0, j = 1;                              // loop counters
    for ( i = 0; i < N; i++ ) {
        ix = a[i];                                 // permutation index
        if ( ix >= 0 ) {
            m |= (uint64_t)1 << i;                 // mark the used destination position
            for ( j = 1; j < ix - lasti; j++ ) {
                m |= (uint64_t)1 << ( lastp + j ); // consume skipped source elements via extra destination positions
            }
            lastp = i;
            lasti = ix;
        }
    }
    return m;
}
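// Worked sketch (not part of the original header): for indexes {0, 2, 3, -1}
// the compress pattern reads source elements 0, 2, 3 into positions 0, 1, 2,
// so compress_mask returns 0b1101: one bit per source element that is kept.
constexpr int compress_example[4] = { 0, 2, 3, -1 };      // hypothetical index list
static_assert( compress_mask( compress_example ) == 0x0D, "source bits 0, 2, 3" );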
// perm16_flags: return information about how to permute a vector of 16-bit
// integers with pshuflw / pshufhw. It is presupposed that perm_flags has
// indicated that such a permutation is possible.
// Bits 0-3 of the return value tell which of the four 64-bit half-to-half
// moves are used; the four 8-bit shuffle patterns follow in bits 32-63
template <typename V>
constexpr uint64_t perm16_flags( int const ( &a )[V::size()] ) {
    constexpr int N = V::size();                   // number of elements
    uint64_t retval = 0;                           // return value
    uint32_t pat[4] = { 0, 0, 0, 0 };              // permute patterns
    uint32_t i = 0;                                // loop counter
    const uint32_t lanesize = 8;                   // elements per lane
    uint32_t lane = 0;                             // current lane
    int lanepattern[lanesize] = { 0 };             // pattern in each lane

    for ( i = 0; i < N; i++ ) {
        int ix = a[i];
        lane = i / lanesize;                       // current lane
        if ( lane == 0 ) {
            lanepattern[i] = ix;                   // save pattern
        }
        else if ( ix >= 0 ) {                      // not first lane
            uint32_t j = i - lane * lanesize;      // index into lanepattern
            int jx = ix - lane * lanesize;         // pattern within lane
            if ( lanepattern[j] < 0 ) {
                lanepattern[j] = jx;               // pattern not known from previous lane
            }
        }
    }
    for ( i = 0; i < 4; i++ ) {
        // low half of the lane
        if ( lanepattern[i] >= 0 ) {
            if ( lanepattern[i] < 4 ) {            // low to low
                retval |= 1;
                pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
            }
            else {                                 // high to low
                retval |= 4;
                pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
            }
        }
        // high half of the lane
        if ( lanepattern[i + 4] >= 0 ) {
            if ( lanepattern[i + 4] < 4 ) {        // low to high
                retval |= 8;
                pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
            }
            else {                                 // high to high
                retval |= 2;
                pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
            }
        }
    }
    // combine the four patterns into the return value
    for ( i = 0; i < 4; i++ ) { retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); }
    return retval;
}
// pshufb_mask: return a broad byte mask for permutation within lanes, for use
// with the pshufb instruction. V is a vector class; oppos = 1 reads from the
// opposite 128-bit half of a big vector. Negative byte indexes make pshufb
// write zeroes
template <typename V, int oppos = 0>
constexpr auto pshufb_mask( int const ( &A )[V::size()] ) {
    constexpr uint32_t N = V::size();                   // number of vector elements
    constexpr uint32_t elementsize = sizeof( V ) / N;   // size of each vector element
    constexpr uint32_t nlanes = sizeof( V ) / 16;       // number of 128-bit lanes
    constexpr uint32_t elements_per_lane = N / nlanes;  // vector elements per lane
    EList<int8_t, sizeof( V )> u = { { 0 } };           // list for returning

    uint32_t i = 0, j = 0;                              // loop counters
    int m = 0;                                          // index into A
    int k = 0;                                          // index into u.a
    uint32_t lane = 0;

    for ( lane = 0; lane < nlanes; lane++ ) {           // loop through lanes
        for ( i = 0; i < elements_per_lane; i++ ) {     // loop through elements in lane
            int8_t p = -1;                              // byte permutation index
            int ix = A[m];
            if ( ix >= 0 ) {
                ix ^= oppos * elements_per_lane;        // flip bit if data from the opposite half is requested
            }
            ix -= int( lane * elements_per_lane );      // index relative to this lane
            if ( ix >= 0 && ix < (int)elements_per_lane ) {  // index points into the desired lane
                p = ix * elementsize;
            }
            for ( j = 0; j < elementsize; j++ ) {       // loop through bytes in element
                u.a[k++] = p < 0 ? -1 : p + j;          // store byte permutation index
            }
            m++;
        }
    }
    return u;                                           // return encapsulated array
}
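// Worked sketch (not part of the original header): for a hypothetical vector
// of four 32-bit elements in one 16-byte lane, the indexes {1, 0, -1, 3}
// expand to the byte mask { 4,5,6,7, 0,1,2,3, -1,-1,-1,-1, 12,13,14,15 }:
// each element index is scaled by the 4-byte element size, and the -1 bytes
// make pshufb write zeroes.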
// largeblock_perm: convert a permutation index list to the double block size.
// It is presupposed that perm_flags indicates perm_largeblock; additional
// zeroing must be added if perm_flags indicates perm_addz
template <int N>
constexpr EList<int, N / 2> largeblock_perm( int const ( &a )[N] ) {
    EList<int, N / 2> list = { { 0 } };            // result indexes
    int ix = 0, iy = 0, iz = 0;                    // even, odd, and combined index
    bool fit_addz = false;                         // additional zeroing needed at the lower block level
    int i = 0;
    // check if additional zeroing is needed at the current block size
    for ( i = 0; i < N; i += 2 ) {
        ix = a[i];
        iy = a[i + 1];
        if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
    }
    // loop through the index pairs
    for ( i = 0; i < N; i += 2 ) {
        ix = a[i];
        iy = a[i + 1];
        if ( ix >= 0 ) {
            iz = ix / 2;                           // half index
        }
        else if ( iy >= 0 ) {
            iz = iy / 2;
        }
        else {
            iz = ix | iy;                          // -1 or V_DC; -1 takes precedence
        }
        if ( fit_addz ) iz = V_DC;                 // don't care, because the result will be zeroed later
        list.a[i >> 1] = iz;                       // save to list
    }
    return list;
}
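// Usage sketch (not part of the original header): the four-element rotate
// {2, 3, 0, 1} becomes the two-element rotate {1, 0} at the double block size.
constexpr int largeblock_example[4] = { 2, 3, 0, 1 };     // hypothetical index list
static_assert( largeblock_perm( largeblock_example ).a[0] == 1
            && largeblock_perm( largeblock_example ).a[1] == 0,
               "permutation halves to {1, 0}" );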
// blend_flags: analyze an index list for blending two vectors and return a
// bit field of flags indicating which implementation patterns fit. Indexes
// 0 to N-1 select from a, N to 2N-1 from b. The blend_* flag constants are
// defined elsewhere in this header
template <typename V>
constexpr uint64_t blend_flags( int const ( &a )[V::size()] ) {
    constexpr int N = V::size();                   // number of elements
    uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero;  // return value
    uint32_t iu = 0;                               // loop counter
    int32_t ii = 0;                                // loop counter
    int ix = 0;                                    // index number ii
    const uint32_t nlanes = sizeof( V ) / 16;      // number of 128-bit lanes
    const uint32_t lanesize = N / nlanes;          // elements per lane
    uint32_t lane = 0;                             // current lane
    uint32_t rot = 999;                            // rotate left count
    int lanepattern[lanesize] = { 0 };             // pattern in each lane
    if ( lanesize == 2 && N <= 8 ) {
        r |= blend_shufab | blend_shufba;          // check if it fits shufpd
    }
    for ( ii = 0; ii < N; ii++ ) {                 // loop through indexes
        ix = a[ii];
        if ( ix < 0 ) {
            if ( ix == -1 ) r |= blend_zeroing;    // set to zero
            else if ( ix != V_DC ) {
                r = blend_outofrange;              // illegal index
                break;
            }
        }
        else {                                     // ix >= 0
            r &= ~blend_allzero;
            if ( ix < N ) {
                r |= blend_a;                      // data from a
                if ( ix != ii ) r |= blend_perma;  // permutation of a
            }
            else if ( ix < 2 * N ) {
                r |= blend_b;                      // data from b
                if ( ix != ii + N ) r |= blend_permb;  // permutation of b
            }
            else {
                r = blend_outofrange;              // illegal index
                break;
            }
        }
        // check if the pattern fits the double block size
        if ( ( ii & 1 ) == 0 ) {                   // even position
            int iy = a[ii + 1];                    // next odd index
            if ( ix >= 0 && ( ix & 1 ) ) r &= ~blend_largeblock;
            if ( iy >= 0 && ( ( iy & 1 ) == 0 || ( ix >= 0 && iy != ix + 1 ) ) ) r &= ~blend_largeblock;
            if ( ix == -1 && iy >= 0 ) r |= blend_addz;  // additional zeroing needed
            if ( iy == -1 && ix >= 0 ) r |= blend_addz;
        }
        lane = (uint32_t)ii / lanesize;            // current lane
        if ( lane == 0 ) {
            lanepattern[ii] = ix;                  // save pattern
        }
        if ( ix >= 0 ) {                           // check if crossing lanes
            uint32_t lanei = uint32_t( ix & ~N ) / lanesize;  // source lane
            if ( lanei != lane ) {
                r |= blend_cross_lane;
                if ( lanesize == 2 ) r &= ~( blend_shufab | blend_shufba );
            }
            if ( lanesize == 2 ) {                 // check if it fits shufpd
                if ( ( ( ( ix & N ) != 0 ) ^ ii ) & 1 )
                    r &= ~blend_shufab;
                else
                    r &= ~blend_shufba;
            }
        }
        if ( lane != 0 && ix >= 0 ) {              // not first lane: check if same pattern
            int j = ii - int( lane * lanesize );   // index into lanepattern
            int jx = ix - int( lane * lanesize );  // pattern within lane
            if ( jx < 0 || ( jx & ~N ) >= (int)lanesize ) r &= ~blend_same_pattern;
            if ( lanepattern[j] < 0 ) {
                lanepattern[j] = jx;               // pattern not known from previous lane
            }
            else if ( lanepattern[j] != jx ) r &= ~blend_same_pattern;
        }
    }
    if ( !( r & blend_largeblock ) ) r &= ~blend_addz;        // remove irrelevant flag
    if ( r & blend_cross_lane ) r &= ~blend_same_pattern;     // remove irrelevant flag
    if ( !( r & ( blend_perma | blend_permb ) ) ) return r;   // no permutation: skip further checks

    if ( r & blend_same_pattern ) {
        // same pattern in all lanes: check if it fits unpack or palignr
        r |= blend_punpcklab | blend_punpcklba | blend_punpckhab | blend_punpckhba;
        for ( iu = 0; iu < lanesize; iu++ ) {
            ix = lanepattern[iu];
            if ( ix >= 0 ) {
                if ( (uint32_t)ix != iu / 2 + ( iu & 1 ) * N ) r &= ~blend_punpcklab;
                if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpcklba;
                if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) * N ) r &= ~blend_punpckhab;
                if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpckhba;
            }
        }
        for ( iu = 0; iu < lanesize; iu++ ) {      // check if it fits palignr
            ix = lanepattern[iu];
            if ( ix >= 0 ) {
                uint32_t t = ix & ~N;
                if ( ix & N ) t += lanesize;       // elements from b counted after a
                uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 );  // rotate count
                if ( rot == 999 )
                    rot = tb;                      // save rotate count
                else if ( rot != tb ) rot = 1000;  // does not fit palignr
            }
        }
        if ( rot < 999 ) {                         // fits palignr
            if ( rot < lanesize )
                r |= blend_rotateba;
            else
                r |= blend_rotateab;
            const uint32_t elementsize = sizeof( V ) / N;
            r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) << blend_rotpattern;
        }
        if ( lanesize == 4 ) {                     // check if it fits shufps
            r |= blend_shufab | blend_shufba;
            for ( ii = 0; ii < 2; ii++ ) {         // low half must come from one source
                ix = lanepattern[ii];
                if ( ix >= 0 ) {
                    if ( ix & N ) r &= ~blend_shufab;
                    else r &= ~blend_shufba;
                }
            }
            for ( ; ii < 4; ii++ ) {               // high half must come from the other source
                ix = lanepattern[ii];
                if ( ix >= 0 ) {
                    if ( ix & N ) r &= ~blend_shufba;
                    else r &= ~blend_shufab;
                }
            }
            if ( r & ( blend_shufab | blend_shufba ) ) {    // fits shufps
                uint8_t shufpattern = 0;           // get the shuffle pattern
                for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
                r |= (uint64_t)shufpattern << blend_shufpattern;
            }
        }
    }
    else if ( nlanes > 1 ) {                       // not same pattern in all lanes
        rot = 999;                                 // check if it fits a big rotate
        for ( ii = 0; ii < N; ii++ ) {
            ix = a[ii];
            if ( ix >= 0 ) {
                uint32_t rot2 = ( ix + 2 * N - ii ) % ( 2 * N );  // rotate count
                if ( rot == 999 ) {
                    rot = rot2;                    // save rotate count
                }
                else if ( rot != rot2 ) {
                    rot = 1000;                    // does not fit big rotate
                    break;
                }
            }
        }
        if ( rot < 2 * N ) {                       // fits big rotate
            r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern;
        }
    }
    if ( lanesize == 2 && ( r & ( blend_shufab | blend_shufba ) ) ) {  // get the shufpd pattern
        for ( ii = 0; ii < N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << ( blend_shufpattern + ii ); }
    }
    return r;
}
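// Usage sketch (hypothetical values; the blend_* flag constants are defined
// elsewhere in this header): blend_flags classifies a two-vector index list at
// compile time. For a 4-element vector, the list {0, 5, 2, 7} takes even
// elements from a and odd elements from b; the resulting flags let the caller
// select a single blend instruction instead of two permutes and an OR.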
// blend_perm_indexes: split a blend index list into two permutation index
// lists, one for each source vector.
// dozero = 0: unused elements are don't care; 1: zero unused elements;
// 2: keep negative indexes in both halves
template <int N, int dozero>
constexpr EList<int, 2 * N> blend_perm_indexes( int const ( &a )[N] ) {
    EList<int, 2 * N> list = { { 0 } };            // list to return
    int u = dozero ? -1 : V_DC;                    // value for unused elements
    int j = 0;
    for ( j = 0; j < N; j++ ) {
        int ix = a[j];
        if ( ix < 0 ) {                            // zero or don't care
            if ( dozero == 2 ) {
                list.a[j] = ix;                    // keep the negative index in both halves
                list.a[j + N] = ix;
            }
            else {
                list.a[j] = u;
                list.a[j + N] = u;
            }
        }
        else if ( ix < N ) {                       // value from the first vector
            list.a[j] = ix;
            list.a[j + N] = u;
        }
        else {                                     // value from the second vector
            list.a[j] = u;
            list.a[j + N] = ix - N;
        }
    }
    return list;
}
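// Usage sketch (not part of the original header): with N = 2 and zeroing of
// unused elements, the blend indexes {0, 3} split into {0, -1} for the first
// source and {-1, 1} for the second (stored at offset N in the combined list).
constexpr int blend_perm_example[2] = { 0, 3 };           // hypothetical index list
static_assert( blend_perm_indexes<2, 1>( blend_perm_example ).a[0] == 0
            && blend_perm_indexes<2, 1>( blend_perm_example ).a[1] == -1
            && blend_perm_indexes<2, 1>( blend_perm_example ).a[2] == -1
            && blend_perm_indexes<2, 1>( blend_perm_example ).a[3] == 1,
               "split into per-source permutation indexes" );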
// largeblock_indexes: convert a blend index list to the double block size.
// It is presupposed that blend_flags indicates blend_largeblock; additional
// zeroing must be added if blend_flags indicates blend_addz
template <int N>
constexpr EList<int, N / 2> largeblock_indexes( int const ( &a )[N] ) {
    EList<int, N / 2> list = { { 0 } };            // list to return
    bool fit_addz = false;                         // additional zeroing needed at the lower block level
    int ix = 0, iy = 0, iz = 0;                    // even, odd, and combined index
    int i = 0;
    for ( i = 0; i < N; i += 2 ) {
        ix = a[i];
        iy = a[i + 1];
        if ( ix >= 0 ) {
            iz = ix / 2;                           // half index
        }
        else if ( iy >= 0 ) {
            iz = iy / 2;
        }
        else {
            iz = ix | iy;                          // -1 or V_DC
        }
        list.a[i >> 1] = iz;                       // save to list
        // check if additional zeroing is needed at the current block size
        if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
    }
    if ( fit_addz ) {                              // replace unused indexes by don't care
        for ( i = 0; i < N / 2; i++ ) {
            if ( list.a[i] < 0 ) list.a[i] = V_DC;
        }
    }
    return list;
}
// Dummy blend functions with no parameters. These allow blend_half below to
// compile even when the relevant blend2 ... blend32 overload for the element
// count in question is defined in a header that is not included
template <typename dummy> void blend2() {}
template <typename dummy> void blend4() {}
template <typename dummy> void blend8() {}
template <typename dummy> void blend16() {}
template <typename dummy> void blend32() {}
// blend_half_indexes: return an index list for selecting elements from two of
// the four half-size sources.
// N = elements in each half vector; dozero = 0: don't care, 1: zero unused
// elements, 2: keep negative indexes; src1, src2 = numbers of the two sources
template <int N, int dozero, int src1, int src2>
constexpr EList<int, N> blend_half_indexes( int const ( &a )[N] ) {
    EList<int, N> list = { { 0 } };                // list to return
    int u = dozero ? -1 : V_DC;                    // value for unused elements
    int j = 0;
    for ( j = 0; j < N; j++ ) {
        int ix = a[j];
        if ( ix < 0 ) {                            // zero or don't care
            list.a[j] = ( dozero == 2 ) ? ix : u;
        }
        else {
            int src = ix / N;                      // which of the four sources
            if ( src == src1 ) {
                list.a[j] = ix & ( N - 1 );        // index into the first source
            }
            else if ( src == src2 ) {
                list.a[j] = ( ix & ( N - 1 ) ) + N;  // index into the second source
            }
            else list.a[j] = u;
        }
    }
    return list;
}
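// Usage sketch (not part of the original header): with N = 4, zeroing, and
// sources 0 and 3, the indexes {0, 1, 14, 15} map to {0, 1, 6, 7}: elements
// from source 3 (indexes 12-15) are renumbered into the range N to 2N-1.
constexpr int blend_half_example[4] = { 0, 1, 14, 15 };   // hypothetical index list
static_assert( blend_half_indexes<4, 1, 0, 3>( blend_half_example ).a[2] == 6
            && blend_half_indexes<4, 1, 0, 3>( blend_half_example ).a[3] == 7,
               "source 3 indexes renumbered to 6, 7" );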
// selectblend: select one of the four sources (low and high half of a and b) for blending
template <typename W, int s>
static inline auto selectblend( W const a, W const b ) {
    if constexpr ( s == 0 )
        return a.get_low();
    else if constexpr ( s == 1 )
        return a.get_high();
    else if constexpr ( s == 2 )
        return b.get_low();
    else
        return b.get_high();
}
// blend_half: blend function producing a half-size vector from the four
// possible half-size sources: the low and high halves of a and b
template <typename W, int... i0>
static inline auto blend_half( W const& a, W const& b ) {
    typedef decltype( a.get_low() ) V;             // type of half-size vector
    constexpr int N = V::size();                   // number of elements in half-size vector
    static_assert( sizeof...( i0 ) == N, "wrong number of indexes in blend_half" );
    constexpr int ind[N] = { i0... };              // array of indexes

    // lambda to find which of the four possible sources are used.
    // returns an EList of up to 4 sources; the last element is the number of sources used
    auto listsources = []( int const n, int const( &ind )[N] ) constexpr {
        bool source_used[4] = { false, false, false, false };  // which sources are used
        int i = 0;
        for ( i = 0; i < n; i++ ) {
            int ix = ind[i];                       // index
            if ( ix >= 0 ) {
                int src = ix / n;                  // source used
                source_used[src & 3] = true;
            }
        }
        EList<int, 5> sources = { { 0 } };         // list of sources used
        int nsrc = 0;                              // number of sources
        for ( i = 0; i < 4; i++ ) {
            if ( source_used[i] ) { sources.a[nsrc++] = i; }
        }
        sources.a[4] = nsrc;                       // store the number of sources used
        return sources;
    };
    constexpr EList<int, 5> sources = listsources( N, ind );
    constexpr int nsrc = sources.a[4];             // number of sources used

    if constexpr ( nsrc == 0 ) {                   // no sources: return zero
        return V( 0 );
    }
    // get indexes for the first one or two sources
    constexpr int uindex = ( nsrc > 2 ) ? 1 : 2;   // zero unused elements if a second blend follows
    constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>( ind );
    V x0;
    V src0 = selectblend<W, sources.a[0]>( a, b ); // first source
    V src1 = selectblend<W, sources.a[1]>( a, b ); // second source
    if constexpr ( N == 2 ) {
        x0 = blend2<L.a[0], L.a[1]>( src0, src1 );
    }
    else if constexpr ( N == 4 ) {
        x0 = blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
    }
    else if constexpr ( N == 8 ) {
        x0 = blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
    }
    else if constexpr ( N == 16 ) {
        x0 = blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
            L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
    }
    else if constexpr ( N == 32 ) {
        x0 = blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
            L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
            L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
    }
    if constexpr ( nsrc > 2 ) {                    // blend the last two sources and combine
        constexpr EList<int, N> M = blend_half_indexes<N, uindex, sources.a[2], sources.a[3]>( ind );
        V x1;
        V src2 = selectblend<W, sources.a[2]>( a, b );  // third source
        V src3 = selectblend<W, sources.a[3]>( a, b );  // fourth source
        if constexpr ( N == 2 ) {
            x1 = blend2<M.a[0], M.a[1]>( src2, src3 );  // (fixed: was src0, src1)
        }
        else if constexpr ( N == 4 ) {
            x1 = blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
        }
        else if constexpr ( N == 8 ) {
            x1 = blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
        }
        else if constexpr ( N == 16 ) {
            x1 = blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
                M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
        }
        else if constexpr ( N == 32 ) {
            x1 = blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
                M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
                M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
        }
        x0 |= x1;                                  // combine the two partial blends; unused elements are zero
    }
    return x0;
}
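// Usage sketch (hypothetical types; the vector classes and blend8 are defined
// in other headers of this library): for a wide vector W emulated as two
// half-size vectors of 8 elements each, the low half of a two-vector blend
// can be computed as
//     blend_half<W, 0, 1, 2, 3, 16, 17, 18, 19>( a, b );
// which uses only two of the four half-size sources (a.get_low() and
// b.get_low()) and therefore needs a single 8-element blend.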