#ifndef INSTRSET_H
#define INSTRSET_H 20102

// Allow the use of floating point permute instructions on integer vectors:
#define ALLOW_FP_PERMUTE true
// Detect 64 bit mode
#if ( defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ) ) && !defined( __x86_64__ )
#  define __x86_64__ 1 // There are many different macros for this; settle on only one
#endif
// Find the instruction set from the compiler macros if INSTRSET is not defined
#ifndef INSTRSET
#  if defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ )
#    define INSTRSET 10
#  elif defined( __AVX512F__ ) || defined( __AVX512__ )
#    define INSTRSET 9
#  elif defined( __AVX2__ )
#    define INSTRSET 8
#  elif defined( __AVX__ )
#    define INSTRSET 7
#  elif defined( __SSE4_2__ )
#    define INSTRSET 6
#  elif defined( __SSE4_1__ )
#    define INSTRSET 5
#  elif defined( __SSSE3__ )
#    define INSTRSET 4
#  elif defined( __SSE3__ )
#    define INSTRSET 3
#  elif defined( __SSE2__ ) || defined( __x86_64__ )
#    define INSTRSET 2
#  elif defined( __SSE__ )
#    define INSTRSET 1
#  elif defined( _M_IX86_FP ) // Defined in MS compiler. 1: SSE, 2: SSE2
#    define INSTRSET _M_IX86_FP
#  else
#    define INSTRSET 0
#  endif // instruction set defines
#endif // INSTRSET
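// Example (illustrative comment, not part of the original header): compiling
// with "g++ -mavx2" predefines __AVX2__, so the ladder above selects
// INSTRSET 8; dependent code can then be gated with tests such as
// "#if INSTRSET >= 8".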
// Include the appropriate header file for the intrinsic functions
#if INSTRSET > 7 // AVX2 and later
#  if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
#    include <x86intrin.h> // x86intrin.h includes header files for whatever instruction
                           // sets are specified on the compiler command line, such as
                           // xopintrin.h and fma4intrin.h
#  else
#    include <immintrin.h> // MS/Intel version of immintrin.h covers AVX and later
#  endif // __GNUC__
#elif INSTRSET == 7
#  include <immintrin.h> // AVX
#elif INSTRSET == 6
#  include <nmmintrin.h> // SSE4.2
#elif INSTRSET == 5
#  include <smmintrin.h> // SSE4.1
#elif INSTRSET == 4
#  include <tmmintrin.h> // SSSE3
#elif INSTRSET == 3
#  include <pmmintrin.h> // SSE3
#elif INSTRSET == 2
#  include <emmintrin.h> // SSE2
#else // INSTRSET == 1
#  include <xmmintrin.h> // SSE
#endif // INSTRSET
#if INSTRSET >= 8 && !defined( __FMA__ )
// Assume that all processors that have AVX2 also have FMA3
#  if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
// Prevent error message in g++ when using FMA intrinsics with AVX2:
#    if !defined( DISABLE_WARNING_AVX2_WITHOUT_FMA )
#      pragma message "It is recommended to also specify option -mfma when using -mavx2 or higher"
#    endif
#  elif !defined( __clang__ )
#    define __FMA__ 1
#  endif
#endif
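// Example (illustrative): building with "g++ -mavx2 -mfma" defines both
// __AVX2__ and __FMA__, so the block above is skipped entirely and no
// pragma message is issued.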
// AMD instruction sets
#if defined( __XOP__ ) || defined( __FMA4__ )
#  ifdef __GNUC__
#    include <x86intrin.h> // AMD XOP (Gnu)
#  else
#    include <ammintrin.h> // AMD XOP (Microsoft)
#  endif // __GNUC__
#elif defined( __SSE4A__ ) // AMD SSE4A
#  include <ammintrin.h>
#endif // __XOP__
// FMA3 instruction set
#if defined( __FMA__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) ) && !defined( __INTEL_COMPILER )
#  include <fmaintrin.h>
#endif // __FMA__

// FMA4 instruction set
#if defined( __FMA4__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) )
#  include <fma4intrin.h>
#endif // __FMA4__
#ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler
#  include <intrin.h> // defines _BitScanForward and __cpuidex
#endif // _MSC_VER
#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif
// GCC version as a single integer
#if defined( __GNUC__ ) && !defined( GCC_VERSION ) && !defined( __clang__ )
#  define GCC_VERSION ( (__GNUC__)*10000 + (__GNUC_MINOR__)*100 + (__GNUC_PATCHLEVEL__) )
#endif

// Clang version as a single integer
#if defined( __clang__ )
#  define CLANG_VERSION ( (__clang_major__)*10000 + (__clang_minor__)*100 + (__clang_patchlevel__) )
#endif
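// Example (illustrative): for gcc 7.4.0, GCC_VERSION evaluates to
// 7 * 10000 + 4 * 100 + 0 = 70400, so version tests can compare it
// against plain integers.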
// In Windows, min and max are defined as macros in WinDef.h and collide with
// overloaded function names; remove them
#if defined( _WINDEF_ ) && defined( min ) && defined( max )
#  undef min
#  undef max
#endif
#if defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 9999
#  error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
#endif
// Clang problem: the intrinsic vector types __m128, __m128i and __m128d are
// treated as identical, which causes overload ambiguity
#if ( defined( __clang__ ) || defined( __apple_build_version__ ) ) && !defined( __INTEL_COMPILER )
#  define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
#endif

#if defined( GCC_VERSION ) && GCC_VERSION < 99999 && !defined( __clang__ )
#  define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions
#endif
#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif
// cpuid instruction, for getting processor features and vendor
static inline void cpuid( int output[4], int functionnumber, int ecxleaf = 0 ) {
#if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax
    int a, b, c, d;
    __asm( "cpuid" : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) : "a"( functionnumber ), "c"( ecxleaf ) : );
    output[0] = a;
    output[1] = b;
    output[2] = c;
    output[3] = d;
#elif defined( _MSC_VER ) // Microsoft compiler, intrin.h included
    __cpuidex( output, functionnumber, ecxleaf );
#else // unknown platform. try inline assembly with masm/intel syntax
    __asm {
        mov eax, functionnumber
        mov ecx, ecxleaf
        cpuid;
        mov esi, output
        mov [esi], eax
        mov [esi + 4], ebx
        mov [esi + 8], ecx
        mov [esi + 12], edx
    }
#endif // compiler/platform
}
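// Usage sketch (illustrative comment, not part of the original header):
//     int abcd[4];
//     cpuid( abcd, 0 ); // leaf 0: highest leaf in abcd[0], vendor string in ebx:edx:ecx
//     cpuid( abcd, 1 ); // leaf 1: feature flags in abcd[2] (ecx) and abcd[3] (edx)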
// Define popcount. Gives the number of set bits
#if INSTRSET >= 6 // SSE4.2
// The popcnt instruction is not officially part of the SSE4.2 instruction set,
// but it is available in all known processors with SSE4.2
static inline uint32_t vml_popcnt( uint32_t a ) {
    return (uint32_t)_mm_popcnt_u32( a ); // Intel intrinsic. Supported by gcc and clang
}
#  ifdef __x86_64__
static inline int64_t vml_popcnt( uint64_t a ) {
    return _mm_popcnt_u64( a ); // Intel intrinsic
}
#  else // 32 bit mode
static inline int64_t vml_popcnt( uint64_t a ) {
    return _mm_popcnt_u32( uint32_t( a >> 32 ) ) + _mm_popcnt_u32( uint32_t( a ) );
}
#  endif
#else // no SSE4.2: use bit manipulation
static inline uint32_t vml_popcnt( uint32_t a ) {
    // popcnt instruction not available
    uint32_t b = a - ( ( a >> 1 ) & 0x55555555 );                  // sums of bit pairs
    uint32_t c = ( b & 0x33333333 ) + ( ( b >> 2 ) & 0x33333333 ); // sums of 4-bit groups
    uint32_t d = ( c + ( c >> 4 ) ) & 0x0F0F0F0F;                  // sums of 8-bit groups
    uint32_t e = d * 0x01010101;                                   // total count in the top byte
    return e >> 24;
}
static inline int32_t vml_popcnt( uint64_t a ) {
    return vml_popcnt( uint32_t( a >> 32 ) ) + vml_popcnt( uint32_t( a ) );
}
#endif
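// Worked example (illustrative): for a = 0xF0 the fallback computes
// b = 0xA0, c = 0x40, d = 0x04, e = 0x04040404, and e >> 24 = 4 set bits.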
// Define bit_scan_forward. Gives the index of the lowest set bit (undefined for a == 0)
#if defined( __GNUC__ ) || defined( __clang__ )
// gcc and clang have no bit_scan_forward intrinsic; use inline assembly
#  if defined( __clang__ ) // fix clang bug
// Clang uses a k register as parameter a when inlined from horizontal_find_first
__attribute__( ( noinline ) )
#  endif
static uint32_t bit_scan_forward( uint32_t a ) {
    uint32_t r;
    __asm( "bsfl %1, %0" : "=r"( r ) : "r"( a ) : );
    return r;
}
static inline uint32_t bit_scan_forward( uint64_t a ) {
    uint32_t lo = uint32_t( a );
    if ( lo ) return bit_scan_forward( lo );
    uint32_t hi = uint32_t( a >> 32 );
    return bit_scan_forward( hi ) + 32;
}
#else // other compilers
static inline uint32_t bit_scan_forward( uint32_t a ) {
    unsigned long r;
    _BitScanForward( &r, a ); // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
#  ifdef __x86_64__
static inline uint32_t bit_scan_forward( uint64_t a ) {
    unsigned long r;
    _BitScanForward64( &r, a ); // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
#  else // 32 bit mode
static inline uint32_t bit_scan_forward( uint64_t a ) {
    uint32_t lo = uint32_t( a );
    if ( lo ) return bit_scan_forward( lo );
    uint32_t hi = uint32_t( a >> 32 );
    return bit_scan_forward( hi ) + 32;
}
#  endif
#endif
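// Example (illustrative): bit_scan_forward( 0x18u ) = 3, the index of the
// lowest set bit. Like the bsf instruction, the result is undefined for a = 0.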
// Define bit_scan_reverse. Gives the index of the highest set bit = floor(log2(a))
// (undefined for a == 0)
#if defined( __GNUC__ ) || defined( __clang__ )
static inline uint32_t bit_scan_reverse( uint32_t a ) __attribute__( ( pure ) );
static inline uint32_t bit_scan_reverse( uint32_t a ) {
    uint32_t r;
    __asm( "bsrl %1, %0" : "=r"( r ) : "r"( a ) : );
    return r;
}
#  ifdef __x86_64__
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t r;
    __asm( "bsrq %1, %0" : "=r"( r ) : "r"( a ) : );
    return uint32_t( r );
}
#  else // 32 bit mode
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t ahi = a >> 32;
    if ( ahi == 0 )
        return bit_scan_reverse( uint32_t( a ) );
    else
        return bit_scan_reverse( uint32_t( ahi ) ) + 32;
}
#  endif
#else // other compilers
static inline uint32_t bit_scan_reverse( uint32_t a ) {
    unsigned long r;
    _BitScanReverse( &r, a ); // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
#  ifdef __x86_64__
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    unsigned long r;
    _BitScanReverse64( &r, a ); // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
#  else // 32 bit mode
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t ahi = a >> 32;
    if ( ahi == 0 )
        return bit_scan_reverse( uint32_t( a ) );
    else
        return bit_scan_reverse( uint32_t( ahi ) ) + 32;
}
#  endif
#endif
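// Example (illustrative): bit_scan_reverse( 0x18u ) = 4 = floor(log2(0x18)),
// the index of the highest set bit.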
// Same function, for compile-time constants
constexpr int bit_scan_reverse_const( uint64_t const n ) {
    if ( n == 0 ) return -1;
    uint64_t a = n, b = 0, j = 64, k = 0;
    do {
        j >>= 1;
        k = (uint64_t)1 << j;
        if ( a >= k ) {
            a >>= j;
            b += j;
        }
    } while ( j > 0 );
    return int( b );
}
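// Example (illustrative): bit_scan_reverse_const( 0x18 ) halves j through
// 32, 16, 8, 4, 2, 1, 0; only the j = 4 step finds a >= (1 << j), shifting a
// and adding 4 to b, so the result is 4, matching bit_scan_reverse.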
// Helper templates to represent compile-time integer constants as types
template <int32_t n> class Const_int_t {};   // represent compile-time signed integer constant
template <uint32_t n> class Const_uint_t {}; // represent compile-time unsigned integer constant
#define const_int( n ) ( Const_int_t<n>() )   // n must be compile-time integer constant
#define const_uint( n ) ( Const_uint_t<n>() ) // n must be compile-time unsigned integer constant
// Template for producing a quiet NAN with a payload
template <class VTYPE>
static inline VTYPE nan_vec( uint32_t payload = 0x100 ) {
    if constexpr ( ( VTYPE::elementtype() & 1 ) != 0 ) { // double precision
        union {
            uint64_t q;
            double f;
        } ud;
        // The payload is left-justified to avoid losing it when converting to float
        ud.q = 0x7FF8000000000000 | uint64_t( payload ) << 29;
        return VTYPE( ud.f );
    }
    // single precision; will be converted to double if necessary
    union {
        uint32_t i;
        float f;
    } uf;
    uf.i = 0x7FC00000 | ( payload & 0x003FFFFF );
    return VTYPE( uf.f );
}
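// Example (illustrative): for a vector of floats, the default payload 0x100
// produces the bit pattern 0x7FC00100, a quiet NAN carrying the payload in
// its low 22 bits.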
// Template class to represent a list of integers (used by the permute and blend helper functions)
template <typename T, int N>
struct EList {
    T a[N];
};

// get_inttype: get an integer with the value -1 whose size matches the element
// size of vector class V
template <typename V>
constexpr auto get_inttype() {
    constexpr int elementsize = sizeof( V ) / V::size(); // size of vector elements
    if constexpr ( elementsize >= 8 ) {
        return -int64_t( 1 );
    } else if constexpr ( elementsize >= 4 ) {
        return int32_t( -1 );
    } else if constexpr ( elementsize >= 2 ) {
        return int16_t( -1 );
    } else {
        return int8_t( -1 );
    }
}
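// Example (illustrative): for a vector class V with 4-byte elements (e.g.
// 8 x int32_t in 32 bytes), get_inttype<V>() returns int32_t(-1), an
// all-ones element suitable as a boolean mask value.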
// zero_mask: return a compact bit mask for zeroing.
// Parameter a is a reference to a constexpr int array of permutation indexes;
// negative indexes mark elements that must be zeroed
template <int N>
constexpr auto zero_mask( int const ( &a )[N] ) {
    uint64_t mask = 0;
    int i = 0;
    for ( i = 0; i < N; i++ ) {
        if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i; // bit i = 1 means keep element i
    }
    if constexpr ( N <= 8 )
        return uint8_t( mask );
    else if constexpr ( N <= 16 )
        return uint16_t( mask );
    else if constexpr ( N <= 32 )
        return uint32_t( mask );
    else
        return mask;
}
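// Worked example (illustrative): for N = 4 and indexes { 0, -1, 2, -1 },
// zero_mask returns uint8_t(0b0101): elements 0 and 2 are kept, elements
// 1 and 3 are zeroed.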
// zero_mask_broad: return a broad element mask for zeroing.
// Parameter A is a reference to a constexpr int array of permutation indexes
template <typename V>
constexpr auto zero_mask_broad( int const ( &A )[V::size()] ) {
    constexpr int N = V::size();                // number of vector elements
    typedef decltype( get_inttype<V>() ) Etype; // element type
    EList<Etype, N> u = { { 0 } };              // list for return
    int i = 0;
    for ( i = 0; i < N; i++ ) {
        u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0; // all-ones element if kept, zero if zeroed
    }
    return u; // return encapsulated array
}
// make_bit_mask: return a bit mask made from one selected bit of each index.
// Parameter a is a reference to a constexpr int array of permutation indexes.
// Template parameter B is a compile-time constant: bits 0-7 select which bit of
// each index to extract; bits 8-10 control flipping and the value used for
// negative (zeroing / don't care) indexes
template <int N, int B>
constexpr uint64_t make_bit_mask( int const ( &a )[N] ) {
    uint64_t r = 0;                  // return value
    uint8_t j = uint8_t( B & 0xFF ); // number of the selected bit
    uint64_t s = 0;                  // value of bit number i in r
    uint64_t f = 0;                  // 1 if the bit is not flipped
    int i = 0;
    for ( i = 0; i < N; i++ ) {
        int ix = a[i];
        if ( ix < 0 ) { // -1 or V_DC
            s = ( B >> 10 ) & 1; // bit 10 gives the value for negative indexes
        } else {
            s = ( (uint32_t)ix >> j ) & 1; // extract the selected bit
            if ( i < N / 2 ) {
                f = ( B >> 8 ) & 1; // lower half: flip option in bit 8
            } else {
                f = ( B >> 9 ) & 1; // upper half: flip option in bit 9
            }
            s ^= f ^ 1; // flip the bit if required
        }
        r |= uint64_t( s ) << i; // set bit i in the return value
    }
    return r;
}
// make_broad_mask: convert a compact bit mask m to a broad element mask.
// The return value is a broad boolean mask with element size matching vector class V
template <typename V>
constexpr auto make_broad_mask( uint64_t const m ) {
    constexpr int N = V::size();                // number of vector elements
    typedef decltype( get_inttype<V>() ) Etype; // element type
    EList<Etype, N> u = { { 0 } };              // list for return
    int i = 0;
    for ( i = 0; i < N; i++ ) {
        u.a[i] = ( ( m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; // all-ones element if bit i is set
    }
    return u; // return encapsulated array
}
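// Worked example (illustrative): make_broad_mask<V>( 0b0101 ) for a 4-element
// vector V yields the element list { -1, 0, -1, 0 }, i.e. the compact bit mask
// expanded to a broad boolean mask.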
// perm_mask_broad: return a broad permutation mask.
// Parameter A is a reference to a constexpr int array of permutation indexes
template <typename V>
constexpr auto perm_mask_broad( int const ( &A )[V::size()] ) {
    constexpr int N = V::size();                // number of vector elements
    typedef decltype( get_inttype<V>() ) Etype; // vector element type
    EList<Etype, N> u = { { 0 } };              // list for return
    int i = 0;
    for ( i = 0; i < N; i++ ) {
        u.a[i] = Etype( A[i] ); // store the index as an element of matching size
    }
    return u; // return encapsulated array
}
// perm_flags: analyze a compile-time permutation pattern and return a set of
// flag bits telling which instructions can implement it
// (excerpt; elided code is marked "...")
template <typename V> // V is a vector class
constexpr uint64_t perm_flags( int const ( &a )[V::size()] ) {
    constexpr int N = V::size(); // number of vector elements
    // ... (declarations of the return value r, loop counters i and j, indexes
    //      ix and iy, lane, rot, fit and addz2 elided)
    const uint32_t nlanes = sizeof( V ) / 16;     // number of 128-bit lanes
    const uint32_t lanesize = N / nlanes;         // elements per lane
    const uint32_t elementsize = sizeof( V ) / N; // size of each vector element
    // ...
    int32_t broadc = 999; // index of a broadcast element
    uint32_t patfail = 0; // remember patterns that do not fit
    // ...
    int32_t compresslasti = -1; // last index in perm_compress fit
    int32_t compresslastp = -1; // last position in perm_compress fit
    int32_t expandlasti = -1;   // last index in perm_expand fit
    int32_t expandlastp = -1;   // last position in perm_expand fit
    // ...
    int lanepattern[lanesize] = { 0 }; // pattern in each lane
    // ...
    for ( i = 0; i < N; i++ ) { // loop through indexes
        ix = a[i];              // current index
        // ... (handle zeroing and don't-care indexes)
        } else if ( ix != V_DC && uint32_t( ix ) >= N ) { // illegal index
        // ...
        // check if the pattern is a broadcast of a single element
        else if ( broadc != ix )
        // ...
        // check if it fits perm_largeblock
        if ( ( i & 1 ) == 0 ) { // even position
            int iy = a[i + 1];  // next odd index
            // ...
            if ( ix == -1 && iy >= 0 ) r |= perm_addz; // additional zeroing needed
            if ( iy == -1 && ix >= 0 ) r |= perm_addz;
        }
        // ...
        uint32_t lanei = (uint32_t)ix / lanesize; // source lane of index i
        // ... (detect lane crossing)
        if ( lane != 0 && ix >= 0 ) { // not the first lane
            int j1 = i - int( lane * lanesize );  // position within lane
            int jx = ix - int( lane * lanesize ); // index within lane
            // ...
            if ( lanepattern[j1] < 0 ) {
                lanepattern[j1] = jx; // pattern not yet set; set it
            // ... (compare with the pattern of the first lane)
        }
        // check if it fits the zero-extension pattern
        if ( uint32_t( ix * 2 ) != i ) {
            // ... (does not fit; set a patfail bit)
        }
        // check if it fits perm_compress
        if ( ix > compresslasti && ix - compresslasti >= (int)i - compresslastp ) {
            if ( (int)i - compresslastp > 1 ) addz2 |= 2; // perm_compress needs additional zeroing
            // ...
        }
        // check if it fits perm_expand
        if ( ix > expandlasti && ix - expandlasti <= (int)i - expandlastp ) {
            if ( ix - expandlasti > 1 ) addz2 |= 4; // perm_expand needs additional zeroing
            // ...
        } else if ( ix == -1 ) {
            if ( ( i & 1 ) == 0 ) addz2 |= 1; // zero at even position
        }
    }
    // ...
    if ( ( patfail & 1 ) == 0 ) {
        // ... (fits the zero-extension pattern)
    } else if ( ( patfail & 2 ) == 0 ) {
        // fits perm_compress
        if ( ( addz2 & 2 ) != 0 ) { // check if additional zeroing is needed
            for ( j = 0; j < compresslastp; j++ ) {
                // ...
            }
        }
    } else if ( ( patfail & 4 ) == 0 ) {
        // fits perm_expand
        if ( ( addz2 & 4 ) != 0 ) { // check if additional zeroing is needed
            for ( j = 0; j < expandlastp; j++ ) {
                // ...
            }
        }
    }
    // ...
    // check if the lane pattern fits a rotation within lanes
    for ( i = 0; i < lanesize; i++ ) {
        if ( lanepattern[i] >= 0 ) {
            uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize; // rotate count
            // ... (save the first rotate count)
            if ( rot != rot1 ) fit = false; // rotate count differs; does not fit
        }
    }
    // ...
    uint64_t rot2 = ( rot * elementsize ) & 0xF; // rotate count in bytes
#if INSTRSET >= 4 // SSSE3
    // ...
    // check if the rotation fits a shift with zeroing (one direction)
    for ( i = 0; i < lanesize - rot; i++ ) {
        if ( lanepattern[i] >= 0 ) fit = false;
    }
    // ...
    for ( ; i < lanesize; i++ )
        if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
    // ...
    // check if the rotation fits a shift with zeroing (the other direction)
    for ( i = lanesize - (uint32_t)rot; i < lanesize; i++ ) {
        if ( lanepattern[i] >= 0 ) fit = false;
    }
    // ...
    for ( i = 0; i < lanesize - rot; i++ ) {
        if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
    }
#endif
    // ...
    // check if it fits punpckhi
    uint32_t j2 = lanesize / 2;
    for ( i = 0; i < lanesize; i++ ) {
        if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
        if ( ( i & 1 ) != 0 ) j2++;
    }
    // ... (set the flag if it fits; reset fit and j2 = 0)
    // check if it fits punpcklo
    for ( i = 0; i < lanesize; i++ ) {
        if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
        if ( ( i & 1 ) != 0 ) j2++;
    }
    // ...
    // compose the pshufd pattern
    if ( elementsize >= 4 ) {
        uint64_t p = 0;
        for ( i = 0; i < lanesize; i++ ) {
            if ( lanesize == 4 ) { // 32-bit elements: 2 bits per index
                p |= ( lanepattern[i] & 3 ) << 2 * i;
            } else { // 64-bit elements: index 0 -> 0b0100, index 1 -> 0b1110 (pairs of 32-bit positions)
                p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i;
            }
        }
        // ...
    }
    // ...
    // check if it fits a rotation across lanes
    if constexpr ( nlanes > 1 ) {
        for ( i = 0; i < N; i++ ) {
            // ...
            uint32_t rot2 = ( ix + N - i ) % N; // rotate count for this element
            // ... (save the first rotate count)
            } else if ( rot != rot2 ) {
                // ... (rotate count differs; does not fit)
            }
        }
    }
    // ...
}
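// Worked example (illustrative): for 8 x 32-bit elements in two 128-bit lanes,
// the pattern { 1, 2, 3, 0, 5, 6, 7, 4 } rotates each lane left by one element,
// so the same-pattern and rotation checks above succeed and a single in-lane
// rotate/shuffle instruction can implement the whole permutation.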
// compress_mask: returns a bit mask to use for a compression instruction.
// It is presupposed that perm_flags indicates that the pattern fits perm_compress;
// additional zeroing is needed if perm_flags indicates perm_addz2
template <int N>
constexpr uint64_t compress_mask( int const ( &a )[N] ) {
    // a is a reference to a constexpr array of permutation indexes
    int ix = 0, lasti = -1, lastp = -1;
    uint64_t m = 0;
    int i = 0;
    int j = 1;
    for ( i = 0; i < N; i++ ) {
        ix = a[i]; // permutation index
        if ( ix >= 0 ) {
            m |= (uint64_t)1 << ix; // mark the used source element
            for ( j = 1; j < i - lastp; j++ ) {
                m |= (uint64_t)1 << ( lasti + j ); // mark dummy fill elements for skipped positions
            }
            lastp = i;
            lasti = ix;
        }
    }
    return m;
}
// expand_mask: returns a bit mask to use for an expansion instruction.
// It is presupposed that perm_flags indicates that the pattern fits perm_expand;
// additional zeroing is needed if perm_flags indicates perm_addz2
template <int N>
constexpr uint64_t expand_mask( int const ( &a )[N] ) {
    // a is a reference to a constexpr array of permutation indexes
    int ix = 0, lasti = -1, lastp = -1;
    uint64_t m = 0;
    int i = 0;
    int j = 1;
    for ( i = 0; i < N; i++ ) {
        ix = a[i]; // permutation index
        if ( ix >= 0 ) {
            m |= (uint64_t)1 << i; // mark the used destination position
            for ( j = 1; j < ix - lasti; j++ ) {
                m |= (uint64_t)1 << ( lastp + j ); // mark dummy fill positions for skipped indexes
            }
            lastp = i;
            lasti = ix;
        }
    }
    return m;
}
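// Worked example (illustrative): the pattern { 0, 2, 3, -1 } selects source
// elements 0, 2 and 3; compress_mask sets bits 0, 2 and 3, so a compress
// instruction packs exactly those elements toward the start of the result.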
// perm16_flags: returns information about how to permute a vector of 16-bit
// integers within 128-bit lanes using the pshuflw / pshufhw patterns; the four
// 8-bit shuffle patterns are returned in bits 32-63
// (excerpt; elided code is marked "...")
template <typename V>
constexpr uint64_t perm16_flags( int const ( &a )[V::size()] ) {
    constexpr int N = V::size(); // number of vector elements
    // ... (declarations of retval, loop counter i, index ix and lane elided)
    uint32_t pat[4] = { 0, 0, 0, 0 }; // the four shuffle patterns
    // ...
    const uint32_t lanesize = 8; // number of 16-bit elements per 128-bit lane
    // ...
    int lanepattern[lanesize] = { 0 }; // pattern in each lane
    // ...
    for ( i = 0; i < N; i++ ) { // loop through indexes
        // ... (handle zeroing indexes and lane crossing)
        } else if ( ix >= 0 ) {
            uint32_t j = i - lane * lanesize; // position within lane
            int jx = ix - lane * lanesize;    // index within lane
            if ( lanepattern[j] < 0 ) {
                // ... (pattern not yet set; set it)
            }
            // ...
        }
    }
    // ...
    for ( i = 0; i < 4; i++ ) { // loop through the four positions of each half lane
        // ...
        if ( lanepattern[i] >= 0 ) { // destination in the low half
            if ( lanepattern[i] < 4 ) { // source in the low half (pshuflw pattern)
                pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
            } else { // source in the high half
                pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
            }
        }
        if ( lanepattern[i + 4] >= 0 ) { // destination in the high half
            if ( lanepattern[i + 4] < 4 ) { // source in the low half
                pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
            } else { // source in the high half (pshufhw pattern)
                pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
            }
        }
    }
    // insert the four patterns into the return value
    for ( i = 0; i < 4; i++ ) {
        retval |= (uint64_t)pat[i] << ( 32 + i * 8 );
    }
    // ...
}
// pshufb_mask: return a broad byte mask for permutation within 128-bit lanes,
// for use with the pshufb instruction. A negative byte index produces a zero byte.
// oppos = 1 means the data come from the opposite 128-bit lane of a 256-bit vector
template <typename V, int oppos = 0>
constexpr auto pshufb_mask( int const ( &A )[V::size()] ) {
    constexpr uint32_t N = V::size();                  // number of vector elements
    constexpr uint32_t elementsize = sizeof( V ) / N;  // size of each vector element
    constexpr uint32_t nlanes = sizeof( V ) / 16;      // number of 128-bit lanes
    constexpr uint32_t elements_per_lane = N / nlanes; // number of elements per lane

    EList<int8_t, sizeof( V )> u = { { 0 } }; // list for return

    uint32_t i = 0;    // loop counters
    int j = 0;
    uint32_t lane = 0;
    int m = 0; // index into A
    int k = 0; // index into u.a

    for ( lane = 0; lane < nlanes; lane++ ) {       // loop through lanes
        for ( i = 0; i < elements_per_lane; i++ ) { // loop through elements in lane
            int8_t p = -1;                          // byte index; -1 means zero the byte
            int ix = A[m];                          // permutation index for this element
            if ( ix >= 0 ) {
                ix ^= oppos * elements_per_lane;    // flip the lane selection if data come from the opposite lane
            }
            ix -= int( lane * elements_per_lane );  // index relative to the current lane
            if ( ix >= 0 && ix < (int)elements_per_lane ) { // the index points into the desired lane
                p = ix * elementsize;               // first byte of the source element
            }
            for ( j = 0; j < (int)elementsize; j++ ) { // loop through the bytes of the element
                u.a[k++] = p < 0 ? -1 : int8_t( p + j ); // store the byte permutation index
            }
            m++;
        }
    }
    return u; // return encapsulated array
}
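// Worked example (illustrative): for 4 x 32-bit elements in one lane, the
// pattern { 1, 0, -1, 3 } expands to the byte mask
// { 4,5,6,7, 0,1,2,3, -1,-1,-1,-1, 12,13,14,15 }, where the -1 bytes make
// pshufb zero the corresponding destination bytes.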
// largeblock_perm: return indexes for replacing a permutation with a certain
// block size by a permutation with double the block size.
// It is presupposed that perm_flags indicates perm_largeblock; additional
// zeroing is required if perm_flags indicates perm_addz
template <int N>
constexpr EList<int, N / 2> largeblock_perm( int const ( &a )[N] ) {
    EList<int, N / 2> list = { { 0 } }; // result indexes
    int ix = 0;            // even index
    int iy = 0;            // odd index
    int iz = 0;            // combined index
    bool fit_addz = false; // additional zeroing is needed at the lower block level
    int i = 0;             // loop counter

    // check if additional zeroing is needed at the current block size
    for ( i = 0; i < N; i += 2 ) {
        ix = a[i];     // even index
        iy = a[i + 1]; // odd index
        if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
    }

    // loop through index pairs
    for ( i = 0; i < N; i += 2 ) {
        ix = a[i];     // even index
        iy = a[i + 1]; // odd index
        if ( ix >= 0 ) {
            iz = ix / 2; // half-size index
        } else if ( iy >= 0 ) {
            iz = iy / 2;
        } else {
            iz = ix | iy; // -1 or V_DC; -1 takes precedence
            if ( fit_addz ) iz = V_DC; // don't care, because the result will be zeroed later
        }
        list.a[i / 2] = iz; // save to list
    }
    return list;
}
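// Worked example (illustrative): with N = 4 and indexes { 2, 3, -1, -1 }, the
// pairs { 2, 3 } and { -1, -1 } map to the half-size pattern { 1, -1 }: one
// permutation of 64-bit blocks replaces a permutation of 32-bit elements.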
// blend_flags: analyze a compile-time blend pattern for two vectors a and b
// and return flag bits telling which instructions can implement it. Indexes
// 0..N-1 select from a, N..2N-1 select from b
// (excerpt; elided code is marked "...")
template <typename V> // V is the vector class
constexpr uint64_t blend_flags( int const ( &a )[V::size()] ) {
    constexpr int N = V::size(); // number of vector elements
    // ... (declarations of the return value r, loop counters ii and iu,
    //      index ix, lane, rot, etc. elided)
    const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes
    const uint32_t lanesize = N / nlanes;     // elements per lane
    // ...
    int lanepattern[lanesize] = { 0 }; // pattern in each lane
    if ( lanesize == 2 && N <= 8 ) {
        // ... (special-case analysis for two elements per lane)
    }
    for ( ii = 0; ii < N; ii++ ) { // loop through indexes
        // ... (handle zeroing indexes)
        } else if ( ix != V_DC ) { // check the validity of the index
        // ...
        } else if ( ix < 2 * N ) { // index into the second source b
        // ...
        // check if it fits the large-block pattern
        if ( ( ii & 1 ) == 0 ) { // even position
        // ...
        lane = (uint32_t)ii / lanesize; // current lane
        // ...
        lanepattern[ii] = ix; // save the pattern of the first lane
        // ...
        uint32_t lanei = uint32_t( ix & ~N ) / lanesize; // source lane
        if ( lanei != lane ) {
            // ... (data cross a lane boundary)
        }
        if ( lanesize == 2 ) { // check if it fits the pattern of the shufpd instruction
            if ( ( ( ( ix & N ) != 0 ) ^ ii ) & 1 )
            // ... (source alternation does not fit shufpd)
        }
        // ...
        if ( lane != 0 && ix >= 0 ) { // not the first lane
            int j = ii - int( lane * lanesize );  // position within lane
            int jx = ix - int( lane * lanesize ); // index within lane
            // ...
            if ( lanepattern[j] < 0 ) {
                lanepattern[j] = jx; // pattern not yet set; set it
            // ... (compare with the pattern of the first lane)
        }
    }
    // ...
    // check if the lane pattern fits the punpckl / punpckh instructions
    for ( iu = 0; iu < lanesize; iu++ ) {
        ix = lanepattern[iu];
        // ...
        if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpcklba;
        if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) * N ) r &= ~blend_punpckhab;
        if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpckhba;
    }
#if INSTRSET >= 4 // SSSE3. check if it fits palignr
    for ( iu = 0; iu < lanesize; iu++ ) {
        ix = lanepattern[iu];
        // ...
        uint32_t t = ix & ~N;        // index with the source-select bit cleared
        if ( ix & N ) t += lanesize; // position when both sources are concatenated
        uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 ); // rotate count
        // ... (save the first rotate count)
        if ( rot != tb ) rot = 1000; // rotate count differs; does not fit palignr
    }
    if ( rot < lanesize ) { // fits palignr
        // ...
        const uint32_t elementsize = sizeof( V ) / N;
        r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) << blend_rotpattern;
    }
#endif
    if ( lanesize == 4 ) { // check if it fits the pattern of the shufps instruction
        // ...
        for ( ii = 0; ii < 2; ii++ ) { // the first two elements must come from the same source
            ix = lanepattern[ii];
            // ...
        }
        for ( ; ii < 4; ii++ ) { // the last two elements must come from the other source
            ix = lanepattern[ii];
            // ...
        }
        // ...
        uint8_t shufpattern = 0; // compose the shuffle pattern
        for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
        // ...
    } else if ( nlanes > 1 ) { // check if it fits a rotation pattern across lanes
        for ( ii = 0; ii < N; ii++ ) {
            // ...
            uint32_t rot2 = ( ix + 2 * N - ii ) % ( 2 * N ); // rotate count for this element
            // ... (save the first rotate count)
            } else if ( rot != rot2 ) {
                // ... (rotate count differs; does not fit)
            }
        }
        if ( rot < 2 * N ) { // fits a rotation across lanes
            // ...
        }
    }
    // ...
    // compose the shuffle pattern from the low bit of each index
    for ( ii = 0; ii < N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << ( blend_shufpattern + ii ); }
    // ...
}
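// Worked example (illustrative): for two 4-element vectors, the blend indexes
// { 4, 0, 5, 1 } interleave b0, a0, b1, a1; every index passes the first test
// above, so the blend_punpcklba flag survives and an unpack-low instruction
// can implement the blend.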
// blend_perm_indexes: return a list of indexes for implementing a blend of two
// vectors as two permutations. N = vector size.
// dozero = 0: leave unused elements as don't care; the two permutation results must be blended
// dozero = 1: zero unused elements in each permutation; the two permutation results can be OR'ed
// dozero = 2: indexes that are -1 or V_DC are preserved
template <int N, int dozero>
constexpr EList<int, 2 * N> blend_perm_indexes( int const ( &a )[N] ) {
    // a is a reference to a constexpr array of blend indexes
    EList<int, 2 * N> list = { { 0 } }; // list to return
    int u = dozero ? -1 : V_DC;         // value to use for unused entries
    int j = 0;                          // loop counter

    for ( j = 0; j < N; j++ ) {
        int ix = a[j];  // current index
        if ( ix < 0 ) { // zero or don't care
            if ( dozero == 2 ) { // preserve the original negative index
                list.a[j] = ix;
                list.a[j + N] = ix;
            } else {
                list.a[j] = u;
                list.a[j + N] = u;
            }
        } else if ( ix < N ) { // value from the first source
            list.a[j] = ix;
            list.a[j + N] = u;
        } else { // value from the second source
            list.a[j] = u;
            list.a[j + N] = ix - N;
        }
    }
    return list;
}
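// Worked example (illustrative): N = 4, dozero = 1 and blend indexes
// { 0, 5, 1, 6 } give the permutation lists { 0, -1, 1, -1 } for the first
// source and { -1, 1, -1, 2 } for the second; the two permuted results can
// then simply be OR'ed.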
// largeblock_indexes: return indexes for replacing a blend with a certain
// block size by a blend with double the block size.
// It is presupposed that blend_flags indicates blend_largeblock; additional
// zeroing is required if blend_flags indicates blend_addz
template <int N>
constexpr EList<int, N / 2> largeblock_indexes( int const ( &a )[N] ) {
    EList<int, N / 2> list = { { 0 } }; // list to return

    bool fit_addz = false; // additional zeroing is needed at the lower block level
    int ix = 0;            // even index
    int iy = 0;            // odd index
    int iz = 0;            // combined index
    int i = 0;             // loop counter

    for ( i = 0; i < N; i += 2 ) {
        ix = a[i];     // even index
        iy = a[i + 1]; // odd index
        if ( ix >= 0 ) {
            iz = ix / 2; // half-size index
        } else if ( iy >= 0 ) {
            iz = iy / 2;
        } else {
            iz = ix | iy; // -1 or V_DC; -1 takes precedence
        }
        list.a[i / 2] = iz; // save to list
        // check if additional zeroing is needed at the current block size
        if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
    }
    // replace -1 by V_DC if additional zeroing will be applied anyway
    if ( fit_addz ) {
        for ( i = 0; i < N / 2; i++ ) {
            if ( list.a[i] < 0 ) list.a[i] = V_DC;
        }
    }
    return list;
}
// Dummy blend function templates. These prevent compile errors in the generic
// code below when the blend functions for some vector sizes are not defined
template <typename dummy> void blend2() {}
template <typename dummy> void blend4() {}
template <typename dummy> void blend8() {}
template <typename dummy> void blend16() {}
template <typename dummy> void blend32() {}
// blend_half_indexes: return a list of indexes for implementing a blend from
// multiple sources as a blend from only two of the sources.
// dozero = 0: leave unused elements as don't care
// dozero = 1: zero unused elements
// dozero = 2: indexes that are -1 or V_DC are preserved
// src1, src2: the two sources (0-3) covered by this partial blend
template <int N, int dozero, int src1, int src2>
constexpr EList<int, N> blend_half_indexes( int const ( &a )[N] ) {
    // a is a reference to a constexpr array of blend indexes
    EList<int, N> list = { { 0 } }; // list to return
    int u = dozero ? -1 : V_DC;     // value to use for unused entries
    int j = 0;                      // loop counter

    for ( j = 0; j < N; j++ ) {
        int ix = a[j];  // current index
        if ( ix < 0 ) { // zero or don't care
            list.a[j] = ( dozero == 2 ) ? ix : u;
        } else {
            int src = ix / N; // source number 0-3
            if ( src == src1 ) { // from the first of the two selected sources
                list.a[j] = ix & ( N - 1 );
            } else if ( src == src2 ) { // from the second of the two selected sources
                list.a[j] = ( ix & ( N - 1 ) ) + N;
            } else { // from a source not covered by this blend
                list.a[j] = u;
            }
        }
    }
    return list;
}
// selectblend: select one of the four half-size sources (low and high halves
// of the two full-size vectors a and b) at compile time
template <typename W, int s>
static inline auto selectblend( W const a, W const b ) {
    if constexpr ( s == 0 )
        return a.get_low();
    else if constexpr ( s == 1 )
        return a.get_high();
    else if constexpr ( s == 2 )
        return b.get_low();
    else
        return b.get_high();
}
// blend_half: implement a blend of two full-size vectors by delegating to
// half-size vectors. blend_half is called twice, to produce the low and the
// high half of the result. W is the full-size vector class; i0... are the
// blend indexes for one half of the result
template <typename W, int... i0>
auto blend_half( W const& a, W const& b ) {
    typedef decltype( a.get_low() ) V; // type of the half-size vectors
    constexpr int N = V::size();       // number of elements in a half-size vector
    static_assert( sizeof...( i0 ) == N, "wrong number of indexes in blend_half" );
    constexpr int ind[N] = { i0... }; // array of indexes

    // lambda to find which of the four half-size sources (a.low, a.high,
    // b.low, b.high) are used; the last element of the returned list is the
    // number of sources used
    auto listsources = []( int const n, int const( &ind )[N] ) constexpr {
        bool source_used[4] = { false, false, false, false }; // remember which sources are used
        int i = 0;
        for ( i = 0; i < n; i++ ) {
            int ix = ind[i]; // index
            if ( ix >= 0 ) {
                int src = ix / n;            // source number 0-3
                source_used[src & 3] = true; // mark the source as used
            }
        }
        EList<int, 5> sources = { { 0 } }; // compact list of the used sources
        int nsrc = 0;                      // number of sources used
        for ( i = 0; i < 4; i++ ) {
            if ( source_used[i] ) { sources.a[nsrc++] = i; }
        }
        sources.a[4] = nsrc; // store the number of sources
        return sources;
    };
    constexpr EList<int, 5> sources = listsources( N, ind ); // list of the used sources
    constexpr int nsrc = sources.a[4];                       // number of sources used

    if constexpr ( nsrc == 0 ) { // no sources: return a zero vector
        return V( 0 );
    }
    // zero unused elements if the results of two blends must be OR'ed together
    constexpr int uindex = ( nsrc > 2 ) ? 1 : 2;
    // indexes into the first two sources
    constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>( ind );
    V x0;
    V src0 = selectblend<W, sources.a[0]>( a, b ); // first source
    V src1 = selectblend<W, sources.a[1]>( a, b ); // second source
    if constexpr ( N == 2 ) {
        x0 = blend2<L.a[0], L.a[1]>( src0, src1 );
    } else if constexpr ( N == 4 ) {
        x0 = blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
    } else if constexpr ( N == 8 ) {
        x0 = blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
    } else if constexpr ( N == 16 ) {
        x0 = blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
                     L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
    } else if constexpr ( N == 32 ) {
        x0 = blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
                     L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
                     L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
    }
    if constexpr ( nsrc > 2 ) { // more than two sources: blend the third and fourth source the same way
        // indexes into the last two sources
        constexpr EList<int, N> M = blend_half_indexes<N, uindex, sources.a[2], sources.a[3]>( ind );
        V x1;
        V src2 = selectblend<W, sources.a[2]>( a, b ); // third source
        V src3 = selectblend<W, sources.a[3]>( a, b ); // fourth source
        if constexpr ( N == 2 ) {
            x1 = blend2<M.a[0], M.a[1]>( src2, src3 );
        } else if constexpr ( N == 4 ) {
            x1 = blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
        } else if constexpr ( N == 8 ) {
            x1 = blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
        } else if constexpr ( N == 16 ) {
            x1 = blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
                         M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
        } else if constexpr ( N == 32 ) {
            x1 = blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
                         M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
                         M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
        }
        x0 |= x1; // combine the results of the two blends; unused elements are zero
    }
    return x0;
}
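// Usage sketch (illustrative): the library's full-size blend functions call
// blend_half twice, once per half of the result; each call selects up to four
// half-size sources (the low and high halves of a and b) and OR's two partial
// blends together when more than two sources are needed.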
#ifdef VCL_NAMESPACE
}
#endif

#endif // INSTRSET_H