28#define ALLOW_FP_PERMUTE true
31#if ( defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ) ) && !defined( __x86_64__ )
53# if defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ )
55# elif defined( __AVX512F__ ) || defined( __AVX512__ )
57# elif defined( __AVX2__ )
59# elif defined( __AVX__ )
61# elif defined( __SSE4_2__ )
63# elif defined( __SSE4_1__ )
65# elif defined( __SSSE3__ )
67# elif defined( __SSE3__ )
69# elif defined( __SSE2__ ) || defined( __x86_64__ )
71# elif defined( __SSE__ )
73# elif defined( _M_IX86_FP )
74# define INSTRSET _M_IX86_FP
82# if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
83# include <x86intrin.h>
87# include <immintrin.h>
90# include <immintrin.h>
92# include <nmmintrin.h>
94# include <smmintrin.h>
96# include <tmmintrin.h>
98# include <pmmintrin.h>
100# include <emmintrin.h>
102# include <xmmintrin.h>
105#if INSTRSET >= 8 && !defined( __FMA__ )
107# if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
109# if !defined( DISABLE_WARNING_AVX2_WITHOUT_FMA )
110# pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
112# elif !defined( __clang__ )
118#if defined( __XOP__ ) || defined( __FMA4__ )
120# include <x86intrin.h>
122# include <ammintrin.h>
124#elif defined( __SSE4A__ )
125# include <ammintrin.h>
129#if defined( __FMA__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) ) && !defined( __INTEL_COMPILER )
130# include <fmaintrin.h>
134#if defined( __FMA4__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) )
135# include <fma4intrin.h>
147namespace VCL_NAMESPACE {
164#if defined( __GNUC__ ) && !defined( GCC_VERSION ) && !defined( __clang__ )
165# define GCC_VERSION ( (__GNUC__)*10000 + (__GNUC_MINOR__)*100 + ( __GNUC_PATCHLEVEL__ ) )
169#if defined( __clang__ )
170# define CLANG_VERSION ( (__clang_major__)*10000 + (__clang_minor__)*100 + ( __clang_patchlevel__ ) )
178# if defined( _WINDEF_ ) && defined( min ) && defined( max )
191#if defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 9999
192# error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
204#if ( defined( __clang__ ) || defined( __apple_build_version__ ) ) && !defined( __INTEL_COMPILER )
205# define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
208#if defined( GCC_VERSION ) && GCC_VERSION < 99999 && !defined( __clang__ )
213namespace VCL_NAMESPACE {
    // cpuid: query the x86 CPUID instruction for the given leaf
    // (functionnumber) and subleaf (ecxleaf), writing EAX,EBX,ECX,EDX into
    // output[0..3].
    // NOTE(review): fragment — the locals a,b,c,d, the stores into output[],
    // the #endif lines and the closing brace are missing from this extraction;
    // code below is kept byte-identical.
230 static inline void cpuid(
    int output[4],
    int functionnumber,
    int ecxleaf = 0 ) {
    // GNU/clang path: extended inline asm; outputs go to a,b,c,d
    // (declared on a line missing from this view)
232# if defined( __GNUC__ ) || defined( __clang__ )
234 __asm(
    "cpuid" :
    "=a"( a ),
    "=b"( b ),
    "=c"( c ),
    "=d"( d ) :
    "a"( functionnumber ),
    "c"( ecxleaf ) : );
    // MSVC path: intrinsic writes all four registers straight into output
240# elif defined( _MSC_VER )
241 __cpuidex( output, functionnumber, ecxleaf );
    // remnant of the 32-bit MSVC __asm{} fallback — body truncated here
245 mov eax, functionnumber
    // Population count using the hardware POPCNT instruction.
    // NOTE(review): fragment — closing braces and the preprocessor lines that
    // select 64-bit vs 32-bit mode are missing from this extraction.
262 static inline uint32_t vml_popcnt( uint32_t a ) {
263 return (uint32_t)_mm_popcnt_u32( a );
    // 64-bit mode: single 64-bit popcount instruction
266 static inline int64_t vml_popcnt( uint64_t a ) {
267 return _mm_popcnt_u64( a );
    // 32-bit mode: _mm_popcnt_u64 unavailable — sum the two 32-bit halves
270 static inline int64_t vml_popcnt( uint64_t a ) {
271 return _mm_popcnt_u32( uint32_t( a >> 32 ) ) + _mm_popcnt_u32( uint32_t( a ) );
// Generic population count for uint32_t, used when the POPCNT instruction is
// not available. Classic parallel bit-count ("Hacker's Delight" style):
// the visible arithmetic in the extracted fragment computed d (per-byte
// counts) and e (byte-sum in the top byte) but was truncated before the
// return statement — restored here.
static inline uint32_t vml_popcnt( uint32_t a ) {
    uint32_t b = a - ( ( a >> 1 ) & 0x55555555 );                 // 2-bit field counts
    uint32_t c = ( b & 0x33333333 ) + ( ( b >> 2 ) & 0x33333333 );// 4-bit field counts
    uint32_t d = ( c + ( c >> 4 ) ) & 0x0F0F0F0F;                 // per-byte counts
    uint32_t e = d * 0x01010101;                                  // sum of bytes -> top byte
    return e >> 24;
}

// Generic population count for uint64_t: sum of the two 32-bit halves.
// Returns int32_t to match the declaration visible in the fragment.
static inline int32_t vml_popcnt( uint64_t a ) {
    return vml_popcnt( uint32_t( a >> 32 ) ) + vml_popcnt( uint32_t( a ) );
}
// bit_scan_forward: index of the lowest set bit (BSF instruction).
// NOTE(review): fragment — result variable declarations, return statements,
// closing braces and several #else/#endif lines are missing from this
// extraction; code kept byte-identical.
291#if defined( __GNUC__ ) || defined( __clang__ )
293# if defined( __clang__ )
    // presumably noinline works around a clang issue here — TODO confirm
295 __attribute__( ( noinline ) )
298 bit_scan_forward( uint32_t a ) {
300 __asm(
    "bsfl %1, %0" :
    "=r"( r ) :
    "r"( a ) : );
    // 64-bit overload built from the 32-bit one: scan low half first
303 static inline uint32_t bit_scan_forward( uint64_t a ) {
304 uint32_t lo = uint32_t( a );
305 if ( lo )
    return bit_scan_forward( lo );
306 uint32_t hi = uint32_t( a >> 32 );
307 return bit_scan_forward( hi ) + 32;
// MSVC path: _BitScanForward intrinsics (result r declared on a missing line)
311static inline uint32_t bit_scan_forward( uint32_t a ) {
313 _BitScanForward( &r, a );
317static inline uint32_t bit_scan_forward( uint64_t a ) {
319 _BitScanForward64( &r, a );
// generic 64-bit fallback (32-bit MSVC): combine two 32-bit scans
323static inline uint32_t bit_scan_forward( uint64_t a ) {
324 uint32_t lo = uint32_t( a );
325 if ( lo )
    return bit_scan_forward( lo );
326 uint32_t hi = uint32_t( a >> 32 );
327 return bit_scan_forward( hi ) + 32;
// bit_scan_reverse: index of the highest set bit (BSR instruction).
// NOTE(review): fragment — result declarations, returns, braces and
// conditional-compilation lines missing from this extraction.
333#if defined( __GNUC__ ) || defined( __clang__ )
    // pure: result depends only on the argument (enables CSE by the compiler)
334 static inline uint32_t bit_scan_reverse( uint32_t a ) __attribute__( ( pure ) );
335 static inline uint32_t bit_scan_reverse( uint32_t a ) {
337 __asm(
    "bsrl %1, %0" :
    "=r"( r ) :
    "r"( a ) : );
    // 64-bit BSR, available in 64-bit mode
341 static inline uint32_t bit_scan_reverse( uint64_t a ) {
343 __asm(
    "bsrq %1, %0" :
    "=r"( r ) :
    "r"( a ) : );
    // 32-bit mode: scan high half first, fall back to low half
347 static inline uint32_t bit_scan_reverse( uint64_t a ) {
348 uint64_t ahi = a >> 32;
350 return bit_scan_reverse( uint32_t( a ) );
352 return bit_scan_reverse( uint32_t( ahi ) ) + 32;
// MSVC path: _BitScanReverse intrinsics
356static inline uint32_t bit_scan_reverse( uint32_t a ) {
358 _BitScanReverse( &r, a );
362static inline uint32_t bit_scan_reverse( uint64_t a ) {
364 _BitScanReverse64( &r, a );
// generic 64-bit fallback for 32-bit MSVC
368static inline uint32_t bit_scan_reverse( uint64_t a ) {
369 uint64_t ahi = a >> 32;
371 return bit_scan_reverse( uint32_t( a ) );
373 return bit_scan_reverse( uint32_t( ahi ) ) + 32;
// Constexpr bit_scan_reverse: index of the highest set bit of n, or -1 when
// n is zero. Usable at compile time where the asm/intrinsic versions are not.
// Signature restored from the file's symbol index
// ("constexpr int bit_scan_reverse_const(int64_t const n)" variant with
// uint64_t argument); the truncated body (locals a,b,j,k and the first loop
// step "k = 1 << j" were visible) is the standard binary search over the bit
// index: halve the step, and whenever the value still has bits at or above
// 2^j, shift down and accumulate j into the result.
constexpr int bit_scan_reverse_const( uint64_t const n ) {
    if ( n == 0 )
        return -1;
    uint64_t a = n, b = 0, j = 64, k = 0;
    do {
        j >>= 1;               // step: 32, 16, 8, 4, 2, 1
        k = (uint64_t)1 << j;
        if ( a >= k ) {        // high part non-empty: record offset j
            a >>= j;
            b += j;
        }
    } while ( j > 0 );
    return int( b );
}
    // Compile-time integer constant wrappers and their helper macros.
    // NOTE(review): "template <u / int32_t n>" below is a template parameter
    // split mid-token by the extraction ("uint32_t n"); kept byte-identical.
402 template <u
int32_t n>
404#define const_int( n ) ( Const_int_t<n>() )
405#define const_uint( n ) ( Const_uint_t<n>() )
    // nan_vec: build a vector of quiet NaNs carrying a payload in the
    // mantissa. elementtype() & 1 apparently selects double elements —
    // TODO confirm against the full file.
408 template <
class VTYPE>
409 static inline VTYPE nan_vec( uint32_t payload = 0x100 ) {
410 if constexpr ( ( VTYPE::elementtype() & 1 ) != 0 ) {
    // double: quiet-NaN exponent/mantissa head, payload shifted into mantissa
416 ud.q = 0x7FF8000000000000 | uint64_t( payload ) << 29;
417 return VTYPE( ud.f );
    // float: quiet NaN with payload in the low 22 mantissa bits
424 uf.i = 0x7FC00000 | ( payload & 0x003FFFFF );
425 return VTYPE( uf.f );
    // NOTE(review): fragment of two templates; per the file's symbol index the
    // second is get_inttype(): returns an all-ones value of the integer type
    // whose size matches the vector's element size. Missing lines (final
    // else branch, braces) are absent from this extraction.
459 template <
typename T,
int N>
466 template <
typename V>
468 constexpr int elementsize =
sizeof( V ) / V::size();
    // pick -1 in the widest integer type that fits one element
470 if constexpr ( elementsize >= 8 ) {
471 return -int64_t( 1 );
472 }
else if constexpr ( elementsize >= 4 ) {
473 return int32_t( -1 );
474 }
else if constexpr ( elementsize >= 2 ) {
475 return int16_t( -1 );
    // NOTE(review): per the symbol index this is zero_mask(): compress the
    // index array into a bit mask (bit set where a[i] >= 0, i.e. element
    // kept), returned in the smallest unsigned type that holds N bits.
488 for ( i = 0; i < N; i++ ) {
489 if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i;
491 if constexpr ( N <= 8 )
492 return uint8_t( mask );
493 else if constexpr ( N <= 16 )
494 return uint16_t( mask );
495 else if constexpr ( N <= 32 )
496 return uint32_t( mask );
    // zero_mask_broad: full-width element mask — all-ones (get_inttype)
    // where the index is kept, zero where the element is zeroed.
503 template <
typename V>
505 constexpr int N = V::size();
509 for ( i = 0; i < N; i++ ) { u.a[i] = A[i] >= 0 ?
get_inttype<V>() : 0; }
    // make_bit_mask<N,B>: build a bit mask from index list by testing bit
    // j = B & 0xFF of each index (per symbol index; interior lines missing).
522 template <
int N,
int B>
525 uint8_t j = uint8_t( B & 0xFF );
529 for ( i = 0; i < N; i++ ) {
534 s = ( (uint32_t)ix >> j ) & 1;
542 r |= uint64_t( s ) << i;
    // make_broad_mask: expand bit mask m into a per-element mask vector
    // (all-ones element where the bit is set).
549 template <
typename V>
551 constexpr int N = V::size();
555 for ( i = 0; i < N; i++ ) { u.a[i] = ( ( m >> i ) & 1 ) != 0 ?
get_inttype<V>() : 0; }
    // perm_mask_broad: element-sized copy of the index array A
    // (Etype presumably declared on a missing line — TODO confirm).
561 template <
typename V>
563 constexpr int N = V::size();
567 for ( i = 0; i < N; i++ ) { u.a[i] = Etype( A[i] ); }
    // perm_flags<V>(a): analyse a compile-time permutation index list and
    // return a word of flags (perm_addz, broadcast, compress/expand patterns,
    // in-lane rotate, pshufd pattern, cross-lane rotate, ...) used by the
    // permute implementations to pick the cheapest instruction sequence.
    // NOTE(review): large fragment — roughly half the original lines
    // (flag definitions, pattern assignments, braces) are missing from this
    // extraction; code kept byte-identical.
600 template <
typename V>
601 constexpr uint64_t
perm_flags(
int const ( &a )[V::size()] ) {
604 constexpr int N = V::size();
    // geometry: number of 128-bit lanes, elements per lane, bytes per element
609 const uint32_t nlanes =
sizeof( V ) / 16;
610 const uint32_t lanesize = N / nlanes;
611 const uint32_t elementsize =
sizeof( V ) / N;
614 int32_t broadc = 999;
615 uint32_t patfail = 0;
    // running state for compress/expand pattern detection
617 int32_t compresslasti = -1;
618 int32_t compresslastp = -1;
619 int32_t expandlasti = -1;
620 int32_t expandlastp = -1;
622 int lanepattern[lanesize] = { 0 };
    // first pass over all indexes
624 for ( i = 0; i < N; i++ ) {
629 }
else if ( ix !=
V_DC && uint32_t( ix ) >= N ) {
637 else if ( broadc != ix )
    // pairwise check for zeroing one element of a 2-element group
642 if ( ( i & 1 ) == 0 ) {
647 if ( ix == -1 && iy >= 0 ) r |=
perm_addz;
648 if ( iy == -1 && ix >= 0 ) r |=
perm_addz;
656 uint32_t lanei = (uint32_t)ix / lanesize;
    // record per-lane pattern relative to lane 0
660 if ( lane != 0 && ix >= 0 ) {
661 int j1 = i - int( lane * lanesize );
662 int jx = ix - int( lane * lanesize );
664 if ( lanepattern[j1] < 0 ) {
665 lanepattern[j1] = jx;
672 if ( uint32_t( ix * 2 ) != i ) {
    // compress pattern: indexes nondecreasing, gaps allowed
676 if ( ix > compresslasti && ix - compresslasti >= (
int)i - compresslastp ) {
677 if ( (
int)i - compresslastp > 1 ) addz2 |= 2;
    // expand pattern: mirror image of compress
684 if ( ix > expandlasti && ix - expandlasti <= (
int)i - expandlastp ) {
685 if ( ix - expandlasti > 1 ) addz2 |= 4;
691 }
else if ( ix == -1 ) {
692 if ( ( i & 1 ) == 0 ) addz2 |= 1;
    // resolve which pattern (if any) survived
699 if ( ( patfail & 1 ) == 0 ) {
702 }
else if ( ( patfail & 2 ) == 0 ) {
704 if ( ( addz2 & 2 ) != 0 ) {
705 for ( j = 0; j < compresslastp; j++ ) {
709 }
else if ( ( patfail & 4 ) == 0 ) {
711 if ( ( addz2 & 4 ) != 0 ) {
712 for ( j = 0; j < expandlastp; j++ ) {
    // detect uniform in-lane rotation
722 for ( i = 0; i < lanesize; i++ ) {
723 if ( lanepattern[i] >= 0 ) {
724 uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize;
728 if ( rot != rot1 ) fit =
false;
734 uint64_t rot2 = ( rot * elementsize ) & 0xF;
    // shift-left variant: low part must be unused
741 for ( i = 0; i < lanesize - rot; i++ ) {
742 if ( lanepattern[i] >= 0 ) fit =
false;
746 for ( ; i < lanesize; i++ )
747 if ( lanepattern[i] == -1 ) r |=
perm_addz;
    // shift-right variant: high part must be unused
751 for ( i = lanesize - (uint32_t)rot; i < lanesize;
753 if ( lanepattern[i] >= 0 ) fit =
false;
757 for ( i = 0; i < lanesize - rot; i++ ) {
758 if ( lanepattern[i] == -1 ) r |=
perm_addz;
    // detect unpack-high-like pattern (duplicated upper half indexes)
764 uint32_t j2 = lanesize / 2;
765 for ( i = 0; i < lanesize; i++ ) {
766 if ( lanepattern[i] >= 0 && lanepattern[i] != (
int)j2 ) fit =
false;
767 if ( ( i & 1 ) != 0 ) j2++;
    // detect unpack-low-like pattern
773 for ( i = 0; i < lanesize; i++ ) {
774 if ( lanepattern[i] >= 0 && lanepattern[i] != (
int)j2 ) fit =
false;
775 if ( ( i & 1 ) != 0 ) j2++;
    // build pshufd/shufps-style immediate for 4-element (or 2-element) lanes
779 if ( elementsize >= 4 ) {
781 for ( i = 0; i < lanesize; i++ ) {
782 if ( lanesize == 4 ) {
783 p |= ( lanepattern[i] & 3 ) << 2 * i;
785 p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i;
    // multi-lane: detect a whole-vector rotate across lanes
793 if constexpr ( nlanes > 1 ) {
794 for ( i = 0; i < N; i++ ) {
797 uint32_t rot2 = ( ix + N - i ) % N;
800 }
else if ( rot != rot2 ) {
    // compress_mask(a): build the bit mask for a compress-style permutation
    // (per symbol index); each kept source index sets its own bit plus the
    // bits of any skipped-over positions.
824 int ix = 0, lasti = -1, lastp = -1;
828 for ( i = 0; i < N; i++ ) {
831 m |= (uint64_t)1 << ix;
832 for ( j = 1; j < i - lastp; j++ ) {
833 m |= (uint64_t)1 << ( lasti + j );
    // expand_mask(a): same idea mirrored for an expand-style permutation —
    // bits are set at destination positions instead of source positions.
848 int ix = 0, lasti = -1, lastp = -1;
852 for ( i = 0; i < N; i++ ) {
855 m |= (uint64_t)1 << i;
856 for ( j = 1; j < ix - lasti; j++ ) {
857 m |= (uint64_t)1 << ( lastp + j );
    // perm16_flags<V>(a): analyse a 16-bit-element permutation pattern and
    // pack four pshuflw/pshufhw-style 8-bit patterns (pat[0..3]) into the
    // upper half of the returned flag word. Fragment — several lines missing.
873 template <
typename V>
877 constexpr int N = V::size();
880 uint32_t pat[4] = { 0, 0, 0, 0 };
    // lanes of 8 x 16-bit elements
883 const uint32_t lanesize = 8;
885 int lanepattern[lanesize] = { 0 };
887 for ( i = 0; i < N; i++ ) {
892 }
else if ( ix >= 0 ) {
893 uint32_t j = i - lane * lanesize;
894 int jx = ix - lane * lanesize;
895 if ( lanepattern[j] < 0 ) {
    // low half of the lane: sources from low quad -> pat[0], high quad -> pat[2]
901 for ( i = 0; i < 4; i++ ) {
903 if ( lanepattern[i] >= 0 ) {
904 if ( lanepattern[i] < 4 ) {
906 pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
909 pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
    // high half of the lane: high quad -> pat[3], low quad -> pat[1]
913 if ( lanepattern[i + 4] >= 0 ) {
914 if ( lanepattern[i + 4] < 4 ) {
916 pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
919 pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
    // pack the four 8-bit patterns into bits 32..63 of the result
924 for ( i = 0; i < 4; i++ ) { retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); }
    // pshufb_mask<V,oppos>: build the byte-shuffle control vector for PSHUFB
    // from an element index list; oppos=1 selects indexes from the opposite
    // half. Out-of-lane or negative indexes become -1 (byte zeroed by pshufb).
    // Fragment — k/p declarations and closing braces missing.
932 template <
typename V,
int oppos = 0>
937 constexpr uint32_t N = V::size();
938 constexpr uint32_t elementsize =
sizeof( V ) / N;
939 constexpr uint32_t nlanes =
sizeof( V ) / 16;
940 constexpr uint32_t elements_per_lane = N / nlanes;
    // one control byte per byte of the vector
942 EList<int8_t,
sizeof( V )> u = { { 0 } };
950 for ( lane = 0; lane < nlanes; lane++ ) {
951 for ( i = 0; i < elements_per_lane; i++ ) {
    // oppos flips the source half; then make the index lane-relative
956 ix ^= oppos * elements_per_lane;
958 ix -= int( lane * elements_per_lane );
959 if ( ix >= 0 && ix < (
int)elements_per_lane ) {
960 p = ix * elementsize;
    // emit elementsize consecutive byte selectors (or -1 to zero)
962 for ( j = 0; j < elementsize; j++ ) {
963 u.a[k++] = p < 0 ? -1 : p + j;
    // largeblock_perm (per symbol index): fold an N-element index list into
    // N/2 indexes of double-size elements; fit_addz records that a separate
    // zeroing step is needed, after which odd halves become don't-care.
978 EList<int, N / 2> list = { { 0 } };
982 bool fit_addz =
false;
    // pass 1: detect pairs where exactly one half is zeroed (-1)
986 for ( i = 0; i < N; i += 2 ) {
989 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz =
true; }
    // pass 2: emit one double-size index per pair
993 for ( i = 0; i < N; i += 2 ) {
998 }
else if ( iy >= 0 ) {
1002 if ( fit_addz ) iz =
V_DC;
    // blend_flags<V>(a): analyse a two-vector blend index list (0..N-1 from
    // the first source, N..2N-1 from the second) and return flags selecting
    // punpckl/punpckh, palignr-style rotate, shufps, or a generic pattern.
    // NOTE(review): large fragment — flag constants, many assignments and all
    // closing braces are missing from this extraction; kept byte-identical.
1034 template <
typename V>
1038 constexpr int N = V::size();
1043 const uint32_t nlanes =
sizeof( V ) / 16;
1044 const uint32_t lanesize = N / nlanes;
1047 int lanepattern[lanesize] = { 0 };
1048 if ( lanesize == 2 && N <= 8 ) {
    // first pass: classify each index
1052 for ( ii = 0; ii < N; ii++ ) {
1057 else if ( ix !=
V_DC ) {
1066 }
else if ( ix < 2 * N ) {
1076 if ( ( ii & 1 ) == 0 ) {
1084 lane = (uint32_t)ii / lanesize;
1086 lanepattern[ii] = ix;
    // same pattern required in every lane (ix & ~N strips the source bit)
1090 uint32_t lanei = uint32_t( ix & ~N ) / lanesize;
1091 if ( lanei != lane ) {
1094 if ( lanesize == 2 ) {
1096 if ( ( ( ( ix & N ) != 0 ) ^ ii ) & 1 )
1103 if ( lane != 0 && ix >= 0 ) {
1104 int j = ii - int( lane * lanesize );
1105 int jx = ix - int( lane * lanesize );
1107 if ( lanepattern[j] < 0 ) {
1108 lanepattern[j] = jx;
    // test the four punpck patterns; clear flags that do not match
1122 for ( iu = 0; iu < lanesize; iu++ ) {
1123 ix = lanepattern[iu];
1126 if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &=
~blend_punpcklba;
1127 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) * N ) r &=
~blend_punpckhab;
1128 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &=
~blend_punpckhba;
    // test for a palignr-style rotate of the two concatenated sources
1132 for ( iu = 0; iu < lanesize; iu++ ) {
1133 ix = lanepattern[iu];
1135 uint32_t t = ix & ~N;
1136 if ( ix & N ) t += lanesize;
1137 uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 );
1141 if ( rot != tb ) rot = 1000;
1146 if ( rot < lanesize ) {
1151 const uint32_t elementsize =
sizeof( V ) / N;
1152 r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) <<
blend_rotpattern;
    // shufps-style: low two elements from one source, high two from one source
1155 if ( lanesize == 4 ) {
1158 for ( ii = 0; ii < 2; ii++ ) {
1159 ix = lanepattern[ii];
1167 for ( ; ii < 4; ii++ ) {
1168 ix = lanepattern[ii];
1177 uint8_t shufpattern = 0;
1178 for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
    // multi-lane: whole-vector rotate across the two concatenated sources
1182 }
else if ( nlanes > 1 ) {
1184 for ( ii = 0; ii < N; ii++ ) {
1187 uint32_t rot2 = ( ix + 2 * N - ii ) % ( 2 * N );
1190 }
else if ( rot != rot2 ) {
1196 if ( rot < 2 * N ) {
    // record per-element source-selection bits for a generic blend
1201 for ( ii = 0; ii < N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << (
blend_shufpattern + ii ); }
    // blend_perm_indexes<N,dozero>: split a 2-source blend index list into two
    // single-source permutation lists (first N entries for source a, next N
    // for source b). dozero selects -1 (zero) vs V_DC for unused slots.
1211 template <
int N,
int dozero>
1215 int u = dozero ? -1 :
V_DC;
1218 for ( j = 0; j < N; j++ ) {
1221 if ( dozero == 2 ) {
    // index belongs to the first source
1230 }
else if ( ix < N ) {
    // index belongs to the second source: rebase into 0..N-1
1235 list.
a[j + N] = ix - N;
    // largeblock_indexes (per symbol index): reduce an N-element blend index
    // list to N/2 indexes of double-size elements; fit_addz marks patterns
    // needing an extra zeroing step, leftover slots become don't-care.
1249 EList<int, N / 2> list = { { 0 } };
1251 bool fit_addz =
false;
1257 for ( i = 0; i < N; i += 2 ) {
1262 }
else if ( iy >= 0 ) {
    // half-zeroed pair: cannot be expressed as a pure large-block move
1268 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz =
true; }
1272 for ( i = 0; i < N / 2; i++ ) {
1273 if ( list.a[i] < 0 ) list.a[i] =
V_DC;
    // NOTE(review): remnants of five template declarations whose bodies are
    // missing from this extraction (presumably forward declarations of the
    // blend/permute helpers parameterized on a dummy type — TODO confirm).
1289 template <
typename dummy>
1291 template <
typename dummy>
1293 template <
typename dummy>
1295 template <
typename dummy>
1297 template <
typename dummy>
    // blend_half_indexes<N,dozero,src1,src2>: extract, from a blend index
    // list, the sub-pattern that reads only sources src1/src2, rebased so
    // src1 maps to 0..N-1 and src2 to N..2N-1. Unused slots get -1 or V_DC.
1306 template <
int N,
int dozero,
int src1,
int src2>
1310 int u = dozero ? -1 :
V_DC;
1313 for ( j = 0; j < N; j++ ) {
1316 list.
a[j] = ( dozero == 2 ) ? ix : u;
1319 if ( src == src1 ) {
1320 list.
a[j] = ix & ( N - 1 );
1321 }
else if ( src == src2 ) {
1322 list.
a[j] = ( ix & ( N - 1 ) ) + N;
    // selectblend<W,s>: pick one of the four half-vectors of the two wide
    // inputs by compile-time selector s (0: a.low, 1: a.high, 2: b.low,
    // 3: b.high — the 0 and 2 return lines are missing from this extraction).
1331 template <
typename W,
int s>
1332 static inline auto selectblend( W
const a, W
const b ) {
1333 if constexpr (
s == 0 )
1335 else if constexpr (
s == 1 )
1336 return a.get_high();
1337 else if constexpr (
s == 2 )
1340 return b.get_high();
    // blend_half<W,i0...>: build one half of a blended wide vector. Classifies
    // which of the four source halves the indexes use, then dispatches to the
    // fixed-size blend2/4/8/16/32 with rebased index lists L (first two
    // sources) and M (remaining two). NOTE(review): large fragment — the
    // definitions of L, M, x0's declaration, several returns and all closing
    // braces are missing from this extraction; kept byte-identical.
1350 template <
typename W,
int... i0>
1352 typedef decltype( a.get_low() ) V;
1353 constexpr int N = V::size();
1354 static_assert(
sizeof...( i0 ) == N,
"wrong number of indexes in blend_half" );
1355 constexpr int ind[N] = { i0... };
    // constexpr lambda: find which of the 4 source halves are referenced;
    // sources.a[4] holds the count
1359 auto listsources = [](
int const n,
int const( &ind )[N] )
constexpr {
1360 bool source_used[4] = {
false,
false,
false,
false };
1362 for ( i = 0; i < n; i++ ) {
1366 source_used[src & 3] =
true;
1372 for ( i = 0; i < 4; i++ ) {
1373 if ( source_used[i] ) { sources.
a[nsrc++] = i; }
1375 sources.
a[4] = nsrc;
1380 constexpr int nsrc = sources.
a[4];
    // no source used: result is all zero / unchanged (body line missing)
1382 if constexpr ( nsrc == 0 ) {
1386 constexpr int uindex = ( nsrc > 2 ) ? 1 : 2;
    // blend the first (up to) two sources with index list L
1389 V src0 = selectblend<W, sources.
a[0]>( a, b );
1390 V src1 = selectblend<W, sources.
a[1]>( a, b );
1391 if constexpr ( N == 2 ) {
1392 x0 =
blend2<L.a[0], L.a[1]>( src0, src1 );
1393 }
else if constexpr ( N == 4 ) {
1394 x0 =
blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
1395 }
else if constexpr ( N == 8 ) {
1396 x0 =
blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
1397 }
else if constexpr ( N == 16 ) {
1398 x0 =
blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1399 L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
1400 }
else if constexpr ( N == 32 ) {
1401 x0 =
blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1402 L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
1403 L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
    // more than two sources: blend sources 3/4 with index list M, then
    // combine x0 and x1 (combining code missing from this extraction).
    // NOTE(review): the N == 2 branch below passes (src0, src1) where the
    // other branches pass (src2, src3) — looks like a defect in the original;
    // verify against the complete file before relying on N == 2 here.
1405 if constexpr ( nsrc > 2 ) {
1408 V src2 = selectblend<W, sources.
a[2]>( a, b );
1409 V src3 = selectblend<W, sources.
a[3]>( a, b );
1410 if constexpr ( N == 2 ) {
1411 x1 =
blend2<M.
a[0], M.
a[1]>( src0, src1 );
1412 }
else if constexpr ( N == 4 ) {
1413 x1 =
blend4<M.
a[0], M.
a[1], M.
a[2], M.
a[3]>( src2, src3 );
1414 }
else if constexpr ( N == 8 ) {
1415 x1 =
blend8<M.
a[0], M.
a[1], M.
a[2], M.
a[3], M.
a[4], M.
a[5], M.
a[6], M.
a[7]>( src2, src3 );
1416 }
else if constexpr ( N == 16 ) {
1417 x1 =
blend16<M.
a[0], M.
a[1], M.
a[2], M.
a[3], M.
a[4], M.
a[5], M.
a[6], M.
a[7], M.
a[8], M.
a[9], M.
a[10], M.
a[11],
1418 M.
a[12], M.
a[13], M.
a[14], M.
a[15]>( src2, src3 );
1419 }
else if constexpr ( N == 32 ) {
1420 x1 =
blend32<M.
a[0], M.
a[1], M.
a[2], M.
a[3], M.
a[4], M.
a[5], M.
a[6], M.
a[7], M.
a[8], M.
a[9], M.
a[10], M.
a[11],
1421 M.
a[12], M.
a[13], M.
a[14], M.
a[15], M.
a[16], M.
a[17], M.
a[18], M.
a[19], M.
a[20], M.
a[21], M.
a[22],
1422 M.
a[23], M.
a[24], M.
a[25], M.
a[26], M.
a[27], M.
a[28], M.
a[29], M.
a[30], M.
a[31]>( src2, src3 );
const int blend_rotpattern
const int blend_punpckhba
constexpr uint64_t blend_flags(int const (&a)[V::size()])
const int blend_shufpattern
const int blend_cross_lane
constexpr auto perm_mask_broad(int const (&A)[V::size()])
const int blend_largeblock
constexpr int bit_scan_reverse_const(uint64_t const n)
const int perm_rotate_big
constexpr uint64_t expand_mask(int const (&a)[N])
constexpr auto zero_mask(int const (&a)[N])
constexpr auto pshufb_mask(int const (&A)[V::size()])
bool hasAVX512VBMI2(void)
constexpr auto zero_mask_broad(int const (&A)[V::size()])
constexpr EList< int, 2 *N > blend_perm_indexes(int const (&a)[N])
const int blend_rotate_big
constexpr uint64_t perm_flags(int const (&a)[V::size()])
const int perm_cross_lane
const int blend_outofrange
constexpr uint64_t perm16_flags(int const (&a)[V::size()])
constexpr EList< int, N/2 > largeblock_perm(int const (&a)[N])
const int perm_outofrange
const int perm_largeblock
const int blend_same_pattern
int physicalProcessors(int *logical_processors=0)
constexpr EList< int, N/2 > largeblock_indexes(int const (&a)[N])
const int perm_same_pattern
constexpr uint64_t make_bit_mask(int const (&a)[N])
const int blend_punpcklab
auto blend_half(W const &a, W const &b)
const int blend_punpckhab
constexpr auto get_inttype()
constexpr auto make_broad_mask(uint64_t const m)
constexpr uint64_t compress_mask(int const (&a)[N])
int instrset_detect(void)
const int blend_punpcklba
constexpr EList< int, N > blend_half_indexes(int const (&a)[N])