24 #define INSTRSET_H 20102 29 #define ALLOW_FP_PERMUTE true 32 #if ( defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ) ) && !defined( __x86_64__ ) 33 # define __x86_64__ 1 // There are many different macros for this, decide on only one 54 # if defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ ) 56 # elif defined( __AVX512F__ ) || defined( __AVX512__ ) 58 # elif defined( __AVX2__ ) 60 # elif defined( __AVX__ ) 62 # elif defined( __SSE4_2__ ) 64 # elif defined( __SSE4_1__ ) 66 # elif defined( __SSSE3__ ) 68 # elif defined( __SSE3__ ) 70 # elif defined( __SSE2__ ) || defined( __x86_64__ ) 72 # elif defined( __SSE__ ) 74 # elif defined( _M_IX86_FP ) // Defined in MS compiler. 1: SSE, 2: SSE2 75 # define INSTRSET _M_IX86_FP 78 # endif // instruction set defines 82 #if INSTRSET > 7 // AVX2 and later 83 # if defined( __GNUC__ ) && !defined( __INTEL_COMPILER ) 84 # include <x86intrin.h> 88 # include <immintrin.h> 91 # include <immintrin.h> 93 # include <nmmintrin.h> 95 # include <smmintrin.h> 97 # include <tmmintrin.h> 99 # include <pmmintrin.h> 101 # include <emmintrin.h> 103 # include <xmmintrin.h> 106 #if INSTRSET >= 8 && !defined( __FMA__ ) 108 # if defined( __GNUC__ ) && !defined( __INTEL_COMPILER ) 110 # if !defined( DISABLE_WARNING_AVX2_WITHOUT_FMA ) 111 # pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher" 113 # elif !defined( __clang__ ) 119 #if defined( __XOP__ ) || defined( __FMA4__ ) 121 # include <x86intrin.h> 123 # include <ammintrin.h> 125 #elif defined( __SSE4A__ ) // AMD SSE4A 126 # include <ammintrin.h> 130 #if defined( __FMA__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) ) && !defined( __INTEL_COMPILER ) 131 # include <fmaintrin.h> 135 #if defined( __FMA4__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) ) 136 # include <fma4intrin.h> 142 #ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler 165 #if defined( __GNUC__ ) && !defined( GCC_VERSION ) && 
!defined( __clang__ ) 166 # define GCC_VERSION ( (__GNUC__)*10000 + (__GNUC_MINOR__)*100 + ( __GNUC_PATCHLEVEL__ ) ) 170 #if defined( __clang__ ) 171 # define CLANG_VERSION ( (__clang_major__)*10000 + (__clang_minor__)*100 + ( __clang_patchlevel__ ) ) 179 # if defined( _WINDEF_ ) && defined( min ) && defined( max ) 192 #if defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 9999 193 # error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead 205 #if ( defined( __clang__ ) || defined( __apple_build_version__ ) ) && !defined( __INTEL_COMPILER ) 206 # define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY 209 #if defined( GCC_VERSION ) && GCC_VERSION < 99999 && !defined( __clang__ ) 210 # define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions 231 static inline void cpuid(
int output[4],
int functionnumber,
int ecxleaf = 0 ) {
232 #if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax 234 __asm(
"cpuid" :
"=a"( a ),
"=b"( b ),
"=c"(
c ),
"=d"( d ) :
"a"( functionnumber ),
"c"( ecxleaf ) : );
240 #elif defined( _MSC_VER ) // Microsoft compiler, intrin.h included 241 __cpuidex(
output, functionnumber, ecxleaf );
243 #else // unknown platform. try inline assembly with masm/intel syntax 245 mov eax, functionnumber
258 #if INSTRSET >= 6 // SSE4.2 261 static inline uint32_t vml_popcnt( uint32_t a ) {
262 return (uint32_t)_mm_popcnt_u32( a );
265 static inline int64_t vml_popcnt( uint64_t a ) {
266 return _mm_popcnt_u64( a );
268 # else // 32 bit mode 269 static inline int64_t vml_popcnt( uint64_t a ) {
270 return _mm_popcnt_u32( uint32_t( a >> 32 ) ) + _mm_popcnt_u32( uint32_t( a ) );
// Software emulation of the popcnt instruction, used when SSE4.2 popcnt
// intrinsics are not enabled. Counts the number of 1-bits in a.
static inline uint32_t vml_popcnt( uint32_t a ) {
    // SWAR reduction: sum bits in pairs, then nibbles, then across all bytes.
    uint32_t pairs   = a - ( ( a >> 1 ) & 0x55555555 );
    uint32_t nibbles = ( pairs & 0x33333333 ) + ( ( pairs >> 2 ) & 0x33333333 );
    uint32_t bytes   = ( nibbles + ( nibbles >> 4 ) ) & 0x0F0F0F0F;
    // Multiply accumulates the per-byte counts into the top byte.
    return ( bytes * 0x01010101 ) >> 24;
}

// 64-bit popcount: combine the counts of the two 32-bit halves.
static inline int32_t vml_popcnt( uint64_t a ) {
    uint32_t hi = uint32_t( a >> 32 );
    uint32_t lo = uint32_t( a );
    return vml_popcnt( hi ) + vml_popcnt( lo );
}
290 #if defined( __GNUC__ ) || defined( __clang__ ) 292 # if defined( __clang__ ) // fix clang bug 294 __attribute__( ( noinline ) )
297 bit_scan_forward( uint32_t a ) {
299 __asm(
"bsfl %1, %0" :
"=r"( r ) :
"r"( a ) : );
// Index of the lowest set bit in a 64-bit value.
// Delegates to the 32-bit overload; result is undefined when a == 0
// (same as the underlying bsf instruction).
static inline uint32_t bit_scan_forward( uint64_t a ) {
    uint32_t const lower = uint32_t( a );
    if ( lower != 0 ) return bit_scan_forward( lower );
    uint32_t const upper = uint32_t( a >> 32 );
    return bit_scan_forward( upper ) + 32;
}
309 #else // other compilers 310 static inline uint32_t bit_scan_forward( uint32_t a ) {
312 _BitScanForward( &r, a );
316 static inline uint32_t bit_scan_forward( uint64_t a ) {
318 _BitScanForward64( &r, a );
// 32-bit mode: index of the lowest set bit in a 64-bit value,
// built from the 32-bit overload. Undefined when a == 0.
static inline uint32_t bit_scan_forward( uint64_t a ) {
    uint32_t const lower = uint32_t( a );
    return lower != 0 ? bit_scan_forward( lower )
                      : bit_scan_forward( uint32_t( a >> 32 ) ) + 32;
}
332 #if defined( __GNUC__ ) || defined( __clang__ ) 333 static inline uint32_t bit_scan_reverse( uint32_t a ) __attribute__( ( pure ) );
334 static inline uint32_t bit_scan_reverse( uint32_t a ) {
336 __asm(
"bsrl %1, %0" :
"=r"( r ) :
"r"( a ) : );
340 static inline uint32_t bit_scan_reverse( uint64_t a ) {
342 __asm(
"bsrq %1, %0" :
"=r"( r ) :
"r"( a ) : );
// 32-bit mode: index of the highest set bit of a 64-bit value,
// combined from two 32-bit scans. Undefined when a == 0.
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t const hi32 = a >> 32;
    if ( hi32 == 0 ) return bit_scan_reverse( uint32_t( a ) );
    return bit_scan_reverse( uint32_t( hi32 ) ) + 32;
}
355 static inline uint32_t bit_scan_reverse( uint32_t a ) {
357 _BitScanReverse( &r, a );
361 static inline uint32_t bit_scan_reverse( uint64_t a ) {
363 _BitScanReverse64( &r, a );
// 32-bit mode fallback: scan each half with the 32-bit overload.
// Undefined when a == 0.
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t const upper = a >> 32;
    return upper == 0 ? bit_scan_reverse( uint32_t( a ) )
                      : bit_scan_reverse( uint32_t( upper ) ) + 32;
}
// Compile-time position of the highest set bit in n (floor(log2(n))).
// Returns -1 when n == 0. Binary search over the bit width: the step j
// is halved each round (32, 16, 8, 4, 2, 1, 0) and accumulated into b
// whenever the remaining value reaches 2^j.
constexpr int bit_scan_reverse_const( uint64_t const n ) {
    if ( n == 0 ) return -1;
    uint64_t a = n, b = 0, j = 64, k = 0;
    do {
        j >>= 1;
        k = (uint64_t)1 << j;
        if ( a >= k ) {
            a >>= j;  // discard the low part already accounted for
            b += j;   // accumulate the bit position
        }
    } while ( j > 0 );
    return int( b );
}
401 template <u
int32_t n>
#define const_int( n )  ( Const_int_t<n>() )   // n must be compile-time integer constant
#define const_uint( n ) ( Const_uint_t<n>() )  // n must be compile-time unsigned integer constant

// Build a vector whose elements are all quiet NaN, carrying `payload` in the
// mantissa bits so the source of the NaN can be identified later.
// The (VTYPE::elementtype() & 1) test selects the double-precision path.
template <class VTYPE>
static inline VTYPE nan_vec( uint32_t payload = 0x100 ) {
    if constexpr ( ( VTYPE::elementtype() & 1 ) != 0 ) {
        // double precision element: payload is left-justified in the mantissa
        union {
            uint64_t q;
            double   f;
        } ud;
        ud.q = 0x7FF8000000000000 | uint64_t( payload ) << 29;
        return VTYPE( ud.f );
    }
    // single precision element: payload fills the low mantissa bits
    union {
        uint32_t i;
        float    f;
    } uf;
    uf.i = 0x7FC00000 | ( payload & 0x003FFFFF );
    return VTYPE( uf.f );
}
458 template <
typename T,
int N>
// Return an all-ones integer whose width matches one element of vector
// class V (sizeof(V) / V::size() bytes). The value is used as the "true"
// element of boolean masks; decltype(get_inttype<V>()) gives the element type.
template <typename V>
constexpr auto get_inttype() {
    constexpr int elementsize = sizeof( V ) / V::size();  // bytes per element
    if constexpr ( elementsize >= 8 ) {
        return -int64_t( 1 );
    } else if constexpr ( elementsize >= 4 ) {
        return int32_t( -1 );
    } else if constexpr ( elementsize >= 2 ) {
        return int16_t( -1 );
    } else {
        return int8_t( -1 );
    }
}
// Build a compact bit mask from an index array: bit i is set when a[i] >= 0
// (element kept), clear when a[i] is negative (element zeroed / don't care).
// The result is the smallest unsigned integer type that holds N bits.
template <int N>
constexpr auto zero_mask( int const ( &a )[N] ) {
    uint64_t mask = 0;
    for ( int i = 0; i < N; i++ ) {
        if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i;
    }
    if constexpr ( N <= 8 )
        return uint8_t( mask );
    else if constexpr ( N <= 16 )
        return uint16_t( mask );
    else if constexpr ( N <= 32 )
        return uint32_t( mask );
    else
        return mask;
}
502 template <
typename V>
505 typedef decltype( get_inttype<V>() ) Etype;
508 for ( i = 0; i <
N; i++ ) { u.
a[i] = A[i] >= 0 ? get_inttype<V>() : 0; }
521 template <
int N,
int B>
524 uint8_t j = uint8_t( B & 0xFF );
528 for ( i = 0; i <
N; i++ ) {
533 s = ( (uint32_t)ix >> j ) & 1;
541 r |= uint64_t(
s ) << i;
548 template <
typename V>
551 typedef decltype( get_inttype<V>() ) Etype;
554 for ( i = 0; i <
N; i++ ) { u.
a[i] = ( (
m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; }
560 template <
typename V>
563 typedef decltype( get_inttype<V>() ) Etype;
566 for ( i = 0; i <
N; i++ ) { u.
a[i] = Etype( A[i] ); }
599 template <
typename V>
608 const uint32_t nlanes =
sizeof( V ) / 16;
609 const uint32_t lanesize =
N / nlanes;
610 const uint32_t elementsize =
sizeof( V ) /
N;
613 int32_t broadc = 999;
614 uint32_t patfail = 0;
616 int32_t compresslasti = -1;
617 int32_t compresslastp = -1;
618 int32_t expandlasti = -1;
619 int32_t expandlastp = -1;
621 int lanepattern[lanesize] = {0};
623 for ( i = 0; i <
N; i++ ) {
628 }
else if ( ix !=
V_DC && uint32_t( ix ) >=
N ) {
636 else if ( broadc != ix )
641 if ( ( i & 1 ) == 0 ) {
646 if ( ix == -1 && iy >= 0 ) r |=
perm_addz;
647 if ( iy == -1 && ix >= 0 ) r |=
perm_addz;
655 uint32_t lanei = (uint32_t)ix / lanesize;
659 if ( lane != 0 && ix >= 0 ) {
660 int j1 = i - int( lane * lanesize );
661 int jx = ix - int( lane * lanesize );
663 if ( lanepattern[j1] < 0 ) {
664 lanepattern[j1] = jx;
671 if ( uint32_t( ix * 2 ) != i ) {
675 if ( ix > compresslasti && ix - compresslasti >= (
int)i - compresslastp ) {
676 if ( (
int)i - compresslastp > 1 ) addz2 |= 2;
683 if ( ix > expandlasti && ix - expandlasti <= (
int)i - expandlastp ) {
684 if ( ix - expandlasti > 1 ) addz2 |= 4;
690 }
else if ( ix == -1 ) {
691 if ( ( i & 1 ) == 0 ) addz2 |= 1;
698 if ( ( patfail & 1 ) == 0 ) {
701 }
else if ( ( patfail & 2 ) == 0 ) {
703 if ( ( addz2 & 2 ) != 0 ) {
704 for ( j = 0; j < compresslastp; j++ ) {
708 }
else if ( ( patfail & 4 ) == 0 ) {
710 if ( ( addz2 & 4 ) != 0 ) {
711 for ( j = 0; j < expandlastp; j++ ) {
721 for ( i = 0; i < lanesize; i++ ) {
722 if ( lanepattern[i] >= 0 ) {
723 uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize;
727 if ( rot != rot1 ) fit =
false;
733 uint64_t rot2 = ( rot * elementsize ) & 0xF;
735 #if INSTRSET >= 4 // SSSE3 740 for ( i = 0; i < lanesize - rot; i++ ) {
741 if ( lanepattern[i] >= 0 ) fit =
false;
745 for ( ; i < lanesize; i++ )
746 if ( lanepattern[i] == -1 ) r |=
perm_addz;
750 for ( i = lanesize - (uint32_t)rot; i < lanesize;
752 if ( lanepattern[i] >= 0 ) fit =
false;
756 for ( i = 0; i < lanesize - rot; i++ ) {
757 if ( lanepattern[i] == -1 ) r |=
perm_addz;
763 uint32_t j2 = lanesize / 2;
764 for ( i = 0; i < lanesize; i++ ) {
765 if ( lanepattern[i] >= 0 && lanepattern[i] != (
int)j2 ) fit =
false;
766 if ( ( i & 1 ) != 0 ) j2++;
772 for ( i = 0; i < lanesize; i++ ) {
773 if ( lanepattern[i] >= 0 && lanepattern[i] != (
int)j2 ) fit =
false;
774 if ( ( i & 1 ) != 0 ) j2++;
778 if ( elementsize >= 4 ) {
780 for ( i = 0; i < lanesize; i++ ) {
781 if ( lanesize == 4 ) {
782 p |= ( lanepattern[i] & 3 ) << 2 * i;
784 p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i;
792 if constexpr ( nlanes > 1 ) {
793 for ( i = 0; i <
N; i++ ) {
796 uint32_t rot2 = ( ix +
N - i ) %
N;
799 }
else if ( rot != rot2 ) {
823 int ix = 0, lasti = -1, lastp = -1;
827 for ( i = 0; i <
N; i++ ) {
830 m |= (uint64_t)1 << ix;
831 for ( j = 1; j < i - lastp; j++ ) {
832 m |= (uint64_t)1 << ( lasti + j );
847 int ix = 0, lasti = -1, lastp = -1;
851 for ( i = 0; i <
N; i++ ) {
854 m |= (uint64_t)1 << i;
855 for ( j = 1; j < ix - lasti; j++ ) {
856 m |= (uint64_t)1 << ( lastp + j );
872 template <
typename V>
879 uint32_t pat[4] = {0, 0, 0, 0};
882 const uint32_t lanesize = 8;
884 int lanepattern[lanesize] = {0};
886 for ( i = 0; i <
N; i++ ) {
891 }
else if ( ix >= 0 ) {
892 uint32_t j = i - lane * lanesize;
893 int jx = ix - lane * lanesize;
894 if ( lanepattern[j] < 0 ) {
900 for ( i = 0; i < 4; i++ ) {
902 if ( lanepattern[i] >= 0 ) {
903 if ( lanepattern[i] < 4 ) {
905 pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
908 pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
912 if ( lanepattern[i + 4] >= 0 ) {
913 if ( lanepattern[i + 4] < 4 ) {
915 pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
918 pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
923 for ( i = 0; i < 4; i++ ) {
retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); }
931 template <
typename V,
int oppos = 0>
937 constexpr uint32_t elementsize =
sizeof( V ) /
N;
938 constexpr uint32_t nlanes =
sizeof( V ) / 16;
939 constexpr uint32_t elements_per_lane =
N / nlanes;
941 EList<int8_t,
sizeof( V )> u = {{0}};
949 for ( lane = 0; lane < nlanes; lane++ ) {
950 for ( i = 0; i < elements_per_lane; i++ ) {
955 ix ^= oppos * elements_per_lane;
957 ix -= int( lane * elements_per_lane );
958 if ( ix >= 0 && ix < (
int)elements_per_lane ) {
959 p = ix * elementsize;
961 for ( j = 0; j < elementsize; j++ ) {
962 u.a[k++] = p < 0 ? -1 : p + j;
977 EList<int,
N / 2> list = {{0}};
981 bool fit_addz =
false;
985 for ( i = 0; i <
N; i += 2 ) {
988 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz =
true; }
992 for ( i = 0; i <
N; i += 2 ) {
997 }
else if ( iy >= 0 ) {
1001 if ( fit_addz ) iz =
V_DC;
1033 template <
typename V>
1042 const uint32_t nlanes =
sizeof( V ) / 16;
1043 const uint32_t lanesize =
N / nlanes;
1046 int lanepattern[lanesize] = {0};
1047 if ( lanesize == 2 &&
N <= 8 ) {
1051 for ( ii = 0; ii <
N; ii++ ) {
1056 else if ( ix !=
V_DC ) {
1065 }
else if ( ix < 2 *
N ) {
1075 if ( ( ii & 1 ) == 0 ) {
1083 lane = (uint32_t)ii / lanesize;
1085 lanepattern[ii] = ix;
1089 uint32_t lanei = uint32_t( ix & ~
N ) / lanesize;
1090 if ( lanei != lane ) {
1093 if ( lanesize == 2 ) {
1095 if ( ( ( ( ix &
N ) != 0 ) ^ ii ) & 1 )
1102 if ( lane != 0 && ix >= 0 ) {
1103 int j = ii - int( lane * lanesize );
1104 int jx = ix - int( lane * lanesize );
1106 if ( lanepattern[j] < 0 ) {
1107 lanepattern[j] = jx;
1121 for ( iu = 0; iu < lanesize; iu++ ) {
1122 ix = lanepattern[iu];
1125 if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) *
N ) r &= ~
blend_punpcklba;
1126 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) *
N ) r &= ~
blend_punpckhab;
1127 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) *
N ) r &= ~
blend_punpckhba;
1130 #if INSTRSET >= 4 // SSSE3. check if it fits palignr 1131 for ( iu = 0; iu < lanesize; iu++ ) {
1132 ix = lanepattern[iu];
1134 uint32_t t = ix & ~
N;
1135 if ( ix &
N ) t += lanesize;
1136 uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 );
1140 if ( rot != tb ) rot = 1000;
1145 if ( rot < lanesize ) {
1150 const uint32_t elementsize =
sizeof( V ) /
N;
1151 r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) <<
blend_rotpattern;
1154 if ( lanesize == 4 ) {
1157 for ( ii = 0; ii < 2; ii++ ) {
1158 ix = lanepattern[ii];
1166 for ( ; ii < 4; ii++ ) {
1167 ix = lanepattern[ii];
1176 uint8_t shufpattern = 0;
1177 for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
1181 }
else if ( nlanes > 1 ) {
1183 for ( ii = 0; ii <
N; ii++ ) {
1186 uint32_t rot2 = ( ix + 2 *
N - ii ) % ( 2 *
N );
1189 }
else if ( rot != rot2 ) {
1195 if ( rot < 2 *
N ) {
1200 for ( ii = 0; ii <
N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << (
blend_shufpattern + ii ); }
1210 template <
int N,
int dozero>
1214 int u = dozero ? -1 :
V_DC;
1217 for ( j = 0; j <
N; j++ ) {
1220 if ( dozero == 2 ) {
1229 }
else if ( ix <
N ) {
1234 list.
a[j +
N] = ix -
N;
1248 EList<int,
N / 2> list = {{0}};
1250 bool fit_addz =
false;
1256 for ( i = 0; i <
N; i += 2 ) {
1261 }
else if ( iy >= 0 ) {
1267 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz =
true; }
1271 for ( i = 0; i <
N / 2; i++ ) {
1272 if ( list.a[i] < 0 ) list.a[i] =
V_DC;
1288 template <
typename dummy>
1290 template <
typename dummy>
1292 template <
typename dummy>
1294 template <
typename dummy>
1296 template <
typename dummy>
1305 template <
int N,
int dozero,
int src1,
int src2>
1309 int u = dozero ? -1 :
V_DC;
1312 for ( j = 0; j <
N; j++ ) {
1315 list.
a[j] = ( dozero == 2 ) ? ix : u;
1318 if ( src == src1 ) {
1319 list.
a[j] = ix & (
N - 1 );
1320 }
else if ( src == src2 ) {
1321 list.
a[j] = ( ix & (
N - 1 ) ) +
N;
// Select one half of one of the two input vectors, chosen by s:
// s = 0: lower half of a, 1: upper half of a,
// s = 2: lower half of b, otherwise: upper half of b.
template <typename W, int s>
static inline auto selectblend( W const a, W const b ) {
    if constexpr ( s == 0 )
        return a.get_low();
    else if constexpr ( s == 1 )
        return a.get_high();
    else if constexpr ( s == 2 )
        return b.get_low();
    else
        return b.get_high();
}
1349 template <typename W,
int... i0>
1351 typedef decltype( a.get_low() ) V;
1353 static_assert(
sizeof...( i0 ) ==
N,
"wrong number of indexes in blend_half" );
1354 constexpr
int ind[
N] = {i0...};
1358 auto listsources = [](
int const n,
int const( &ind )[
N] ) constexpr {
1359 bool source_used[4] = {
false,
false,
false,
false};
1361 for ( i = 0; i <
n; i++ ) {
1365 source_used[src & 3] =
true;
1371 for ( i = 0; i < 4; i++ ) {
1372 if ( source_used[i] ) { sources.
a[nsrc++] = i; }
1374 sources.
a[4] = nsrc;
1379 constexpr
int nsrc = sources.a[4];
1381 if constexpr ( nsrc == 0 ) {
1385 constexpr
int uindex = ( nsrc > 2 ) ? 1 : 2;
1388 V src0 = selectblend<W, sources.a[0]>( a, b );
1389 V src1 = selectblend<W, sources.a[1]>( a, b );
1390 if constexpr (
N == 2 ) {
1391 x0 =
blend2<L.a[0], L.a[1]>( src0, src1 );
1392 }
else if constexpr (
N == 4 ) {
1393 x0 =
blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
1394 }
else if constexpr (
N == 8 ) {
1395 x0 =
blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
1396 }
else if constexpr (
N == 16 ) {
1397 x0 =
blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1398 L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
1399 }
else if constexpr (
N == 32 ) {
1400 x0 =
blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1401 L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
1402 L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
1404 if constexpr ( nsrc > 2 ) {
1407 V src2 = selectblend<W, sources.a[2]>( a, b );
1408 V src3 = selectblend<W, sources.a[3]>( a, b );
1409 if constexpr (
N == 2 ) {
1410 x1 =
blend2<M.a[0], M.a[1]>( src0, src1 );
1411 }
else if constexpr (
N == 4 ) {
1412 x1 =
blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
1413 }
else if constexpr (
N == 8 ) {
1414 x1 =
blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
1415 }
else if constexpr (
N == 16 ) {
1416 x1 =
blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1417 M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
1418 }
else if constexpr (
N == 32 ) {
1419 x1 =
blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1420 M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
1421 M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
1428 #ifdef VCL_NAMESPACE 1432 #endif // INSTRSET_H
constexpr auto size(const T &, Args &&...) noexcept
constexpr EList< int, N/2 > largeblock_indexes(int const (&a)[N])
const int perm_largeblock
constexpr auto make_broad_mask(uint64_t const m)
constexpr auto zero_mask(int const (&a)[N])
constexpr uint64_t expand_mask(int const (&a)[N])
const int perm_outofrange
bool hasAVX512VBMI2(void)
#define cpuid(func, eax, ebx, ecx, edx)
constexpr uint64_t make_bit_mask(int const (&a)[N])
const int blend_largeblock
constexpr uint64_t perm_flags(int const (&a)[V::size()])
constexpr EList< int, N > blend_half_indexes(int const (&a)[N])
const int blend_shufpattern
constexpr uint64_t blend_flags(int const (&a)[V::size()])
const int blend_rotate_big
auto blend_half(W const &a, W const &b)
constexpr uint64_t compress_mask(int const (&a)[N])
int instrset_detect(void)
const int blend_outofrange
const int blend_cross_lane
const int blend_punpckhba
const int blend_same_pattern
const int blend_punpckhab
constexpr auto pshufb_mask(int const (&A)[V::size()])
constexpr uint64_t perm16_flags(int const (&a)[V::size()])
constexpr auto zero_mask_broad(int const (&A)[V::size()])
const int perm_same_pattern
constexpr EList< int, N/2 > largeblock_perm(int const (&a)[N])
const int perm_cross_lane
constexpr EList< int, 2 *N > blend_perm_indexes(int const (&a)[N])
const int perm_rotate_big
const int blend_punpcklba
constexpr auto perm_mask_broad(int const (&A)[V::size()])
const int blend_rotpattern
const int blend_punpcklab
constexpr int bit_scan_reverse_const(uint64_t const n)
constexpr auto get_inttype()
int physicalProcessors(int *logical_processors=0)