28 #define ALLOW_FP_PERMUTE true
31 #if ( defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ) ) && !defined( __x86_64__ )
32 # define __x86_64__ 1 // There are many different macros for this, decide on only one
53 # if defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ )
55 # elif defined( __AVX512F__ ) || defined( __AVX512__ )
57 # elif defined( __AVX2__ )
59 # elif defined( __AVX__ )
61 # elif defined( __SSE4_2__ )
63 # elif defined( __SSE4_1__ )
65 # elif defined( __SSSE3__ )
67 # elif defined( __SSE3__ )
69 # elif defined( __SSE2__ ) || defined( __x86_64__ )
71 # elif defined( __SSE__ )
73 # elif defined( _M_IX86_FP ) // Defined in MS compiler. 1: SSE, 2: SSE2
74 # define INSTRSET _M_IX86_FP
77 # endif // instruction set defines
81 #if INSTRSET > 7 // AVX2 and later
82 # if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
83 # include <x86intrin.h>
87 # include <immintrin.h>
90 # include <immintrin.h>
92 # include <nmmintrin.h>
94 # include <smmintrin.h>
96 # include <tmmintrin.h>
98 # include <pmmintrin.h>
100 # include <emmintrin.h>
102 # include <xmmintrin.h>
105 #if INSTRSET >= 8 && !defined( __FMA__ )
107 # if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
109 # if !defined( DISABLE_WARNING_AVX2_WITHOUT_FMA )
110 # pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
112 # elif !defined( __clang__ )
118 #if defined( __XOP__ ) || defined( __FMA4__ )
120 # include <x86intrin.h>
122 # include <ammintrin.h>
124 #elif defined( __SSE4A__ ) // AMD SSE4A
125 # include <ammintrin.h>
129 #if defined( __FMA__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) ) && !defined( __INTEL_COMPILER )
130 # include <fmaintrin.h>
134 #if defined( __FMA4__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) )
135 # include <fma4intrin.h>
141 #ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler
147 namespace VCL_NAMESPACE {
164 #if defined( __GNUC__ ) && !defined( GCC_VERSION ) && !defined( __clang__ )
165 # define GCC_VERSION ( (__GNUC__)*10000 + (__GNUC_MINOR__)*100 + ( __GNUC_PATCHLEVEL__ ) )
169 #if defined( __clang__ )
170 # define CLANG_VERSION ( (__clang_major__)*10000 + (__clang_minor__)*100 + ( __clang_patchlevel__ ) )
178 # if defined( _WINDEF_ ) && defined( min ) && defined( max )
191 #if defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 9999
192 # error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
204 #if ( defined( __clang__ ) || defined( __apple_build_version__ ) ) && !defined( __INTEL_COMPILER )
205 # define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
208 #if defined( GCC_VERSION ) && GCC_VERSION < 99999 && !defined( __clang__ )
209 # define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions
213 namespace VCL_NAMESPACE {
// cpuid: execute the CPUID machine instruction.
// output[0..3] receive the eax, ebx, ecx and edx register values.
// functionnumber selects the CPUID leaf (eax input), ecxleaf the sub-leaf
// (ecx input, default 0).
static inline void cpuid( int output[4], int functionnumber, int ecxleaf = 0 ) {
# if defined( __GNUC__ ) || defined( __clang__ )  // use inline assembly, Gnu/AT&T syntax
    int a, b, c, d;
    __asm( "cpuid"
           : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d )
           : "a"( functionnumber ), "c"( ecxleaf ) : );
    output[0] = a;  // copy register values to the output array
    output[1] = b;
    output[2] = c;
    output[3] = d;
# elif defined( _MSC_VER )  // Microsoft compiler, intrin.h included
    __cpuidex( output, functionnumber, ecxleaf );  // intrinsic function for CPUID
# else  // unknown platform. try inline assembly with masm/intel syntax
    // NOTE(review): this MASM branch was partly missing from the extracted
    // source; restored from the visible first instruction — verify against
    // the full file.
    __asm {
        mov eax, functionnumber
        mov ecx, ecxleaf
        cpuid;
        mov esi, output
        mov [esi], eax
        mov [esi + 4], ebx
        mov [esi + 8], ecx
        mov [esi + 12], edx
    }
# endif
}
#if INSTRSET >= 6  // SSE4.2
// vml_popcnt: count the number of 1-bits.
// The popcnt instruction is used when SSE4.2 is enabled.
static inline uint32_t vml_popcnt( uint32_t a ) {
    return (uint32_t)_mm_popcnt_u32( a );  // Intel intrinsic
}
#ifdef __x86_64__
static inline int64_t vml_popcnt( uint64_t a ) {
    return _mm_popcnt_u64( a );  // Intel intrinsic
}
#else  // 32 bit mode: split the 64-bit operand into two 32-bit counts
static inline int64_t vml_popcnt( uint64_t a ) {
    return _mm_popcnt_u32( uint32_t( a >> 32 ) ) + _mm_popcnt_u32( uint32_t( a ) );
}
#endif
#else  // no popcnt instruction available: portable parallel bit count
static inline uint32_t vml_popcnt( uint32_t a ) {
    uint32_t b = a - ( ( a >> 1 ) & 0x55555555 );                   // 2-bit partial sums
    uint32_t c = ( b & 0x33333333 ) + ( ( b >> 2 ) & 0x33333333 );  // 4-bit partial sums
    uint32_t d = ( c + ( c >> 4 ) ) & 0x0F0F0F0F;                   // 8-bit partial sums
    uint32_t e = d * 0x01010101;                                    // accumulate all bytes into the top byte
    return e >> 24;
}
// 64-bit operand: sum the counts of the two 32-bit halves
static inline int32_t vml_popcnt( uint64_t a ) {
    return vml_popcnt( uint32_t( a >> 32 ) ) + vml_popcnt( uint32_t( a ) );
}
#endif
#if defined( __GNUC__ ) || defined( __clang__ )
// bit_scan_forward: return the index of the lowest set bit.
// The result is undefined for a == 0 (bsf leaves the output unspecified).
# if defined( __clang__ )  // fix clang bug
__attribute__( ( noinline ) )
# endif
static uint32_t
bit_scan_forward( uint32_t a ) {
    uint32_t r;
    __asm( "bsfl %1, %0" : "=r"( r ) : "r"( a ) : );
    return r;
}
// 64-bit operand: scan the low half first, then the high half
static inline uint32_t bit_scan_forward( uint64_t a ) {
    uint32_t lo = uint32_t( a );
    if ( lo )
        return bit_scan_forward( lo );
    uint32_t hi = uint32_t( a >> 32 );
    return bit_scan_forward( hi ) + 32;
}
#else  // other compilers: use MS-style intrinsics from intrin.h
static inline uint32_t bit_scan_forward( uint32_t a ) {
    unsigned long r;
    _BitScanForward( &r, a );  // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
# ifdef __x86_64__
static inline uint32_t bit_scan_forward( uint64_t a ) {
    unsigned long r;
    _BitScanForward64( &r, a );  // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
# else  // 32 bit mode: scan the low half first, then the high half
static inline uint32_t bit_scan_forward( uint64_t a ) {
    uint32_t lo = uint32_t( a );
    if ( lo )
        return bit_scan_forward( lo );
    uint32_t hi = uint32_t( a >> 32 );
    return bit_scan_forward( hi ) + 32;
}
# endif
#endif
#if defined( __GNUC__ ) || defined( __clang__ )
// bit_scan_reverse: return the index of the highest set bit.
// The result is undefined for a == 0 (bsr leaves the output unspecified).
static inline uint32_t bit_scan_reverse( uint32_t a ) __attribute__( ( pure ) );
static inline uint32_t bit_scan_reverse( uint32_t a ) {
    uint32_t r;
    __asm( "bsrl %1, %0" : "=r"( r ) : "r"( a ) : );
    return r;
}
# ifdef __x86_64__
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t r;
    __asm( "bsrq %1, %0" : "=r"( r ) : "r"( a ) : );
    return uint32_t( r );
}
# else  // 32 bit mode: scan the high half first, then the low half
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t ahi = a >> 32;
    if ( ahi == 0 )
        return bit_scan_reverse( uint32_t( a ) );
    else
        return bit_scan_reverse( uint32_t( ahi ) ) + 32;
}
# endif
#else  // other compilers: use MS-style intrinsics from intrin.h
static inline uint32_t bit_scan_reverse( uint32_t a ) {
    unsigned long r;
    _BitScanReverse( &r, a );  // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
# ifdef __x86_64__
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    unsigned long r;
    _BitScanReverse64( &r, a );  // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
# else  // 32 bit mode: scan the high half first, then the low half
static inline uint32_t bit_scan_reverse( uint64_t a ) {
    uint64_t ahi = a >> 32;
    if ( ahi == 0 )
        return bit_scan_reverse( uint32_t( a ) );
    else
        return bit_scan_reverse( uint32_t( ahi ) ) + 32;
}
# endif
#endif
// Compile-time (constexpr) bit scan reverse: return the index of the highest
// set bit in n (i.e. floor(log2(n))), or -1 if n == 0.
// Uses a binary search over halving bit widths instead of a machine
// instruction so it can run at compile time.
// NOTE(review): the function header line was missing from the extracted
// source and has been restored — confirm the name against the full file.
constexpr int bit_scan_reverse_const( uint64_t const n ) {
    if ( n == 0 )
        return -1;
    uint64_t a = n, b = 0, j = 64, k = 0;
    do {
        j >>= 1;                  // halve the search width: 32, 16, 8, 4, 2, 1
        k = (uint64_t)1 << j;
        if ( a >= k ) {           // high half non-empty: shift down, record position
            a >>= j;
            b += j;
        }
    } while ( j > 0 );
    return int( b );
}
// Tag classes representing a compile-time integer constant as a distinct type,
// so a function can be overloaded on the constant's value.
template <int32_t n> class Const_int_t {};    // compile-time signed integer constant
template <uint32_t n> class Const_uint_t {};  // compile-time unsigned integer constant
#define const_int( n ) ( Const_int_t<n>() )   // n must be compile-time integer constant
#define const_uint( n ) ( Const_uint_t<n>() ) // n must be compile-time unsigned integer constant
// nan_vec: build a vector of quiet NaNs carrying 'payload' in the mantissa
// of every element.
// NOTE(review): this span is an incompletely extracted fragment — the 'ud'
// (uint64 <-> double) and 'uf' (uint32 <-> float) unions and the float-branch
// header are on lines missing from this view; verify against the full file.
408 template <
class VTYPE>
// payload defaults to 0x100; (VTYPE::elementtype() & 1) != 0 apparently
// selects the double-precision branch — TODO confirm the elementtype() encoding.
409 static inline VTYPE nan_vec( uint32_t payload = 0x100 ) {
410 if constexpr ( ( VTYPE::elementtype() & 1 ) != 0 ) {
// double: quiet-NaN pattern 0x7FF8... with the payload shifted into the mantissa
416 ud.q = 0x7FF8000000000000 | uint64_t( payload ) << 29;
417 return VTYPE( ud.f );
// float: quiet-NaN pattern 0x7FC00000 with the payload in the low 22 mantissa bits
424 uf.i = 0x7FC00000 | ( payload & 0x003FFFFF );
425 return VTYPE( uf.f );
// EList: a simple aggregate holding an array of N elements of type T.
// Used as the return type of constexpr index-generating functions, because a
// constexpr function can return an aggregate but not a raw array.
template <typename T, int N>
struct EList {
    T a[N];
};

// get_inttype: return the value -1 in the integer type whose size matches the
// element size of vector class V (8 bytes -> int64_t, 4 -> int32_t,
// 2 -> int16_t, otherwise int8_t). Only the TYPE of the returned value is of
// interest; decltype(get_inttype<V>()) gives the matching element type.
template <typename V>
constexpr auto get_inttype() {
    constexpr int elementsize = sizeof( V ) / V::size();  // size of each vector element, in bytes
    if constexpr ( elementsize >= 8 ) {
        return -int64_t( 1 );
    }
    else if constexpr ( elementsize >= 4 ) {
        return int32_t( -1 );
    }
    else if constexpr ( elementsize >= 2 ) {
        return int16_t( -1 );
    }
    else {
        return int8_t( -1 );
    }
}
// zero_mask: build a compact bit mask from a list of permutation indexes:
// bit i is set when index a[i] is non-negative (element i is kept rather than
// zeroed). The return type is the smallest unsigned integer holding N bits.
// NOTE(review): the function header and mask declaration were missing from
// the extracted source and have been restored — confirm the name against the
// full file.
template <int N>
constexpr auto zero_mask( int const ( &a )[N] ) {
    uint64_t mask = 0;
    int i = 0;
    for ( i = 0; i < N; i++ ) {
        if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i;
    }
    if constexpr ( N <= 8 )
        return uint8_t( mask );
    else if constexpr ( N <= 16 )
        return uint16_t( mask );
    else if constexpr ( N <= 32 )
        return uint32_t( mask );
    else
        return mask;
}
503 template <
typename V>
506 typedef decltype( get_inttype<V>() ) Etype;
509 for ( i = 0; i <
N; i++ ) { u.
a[i] = A[i] >= 0 ? get_inttype<V>() : 0; }
522 template <
int N,
int B>
525 uint8_t
j = uint8_t( B & 0xFF );
529 for ( i = 0; i <
N; i++ ) {
534 s = ( (uint32_t)ix >>
j ) & 1;
542 r |= uint64_t(
s ) << i;
549 template <
typename V>
552 typedef decltype( get_inttype<V>() ) Etype;
555 for ( i = 0; i <
N; i++ ) { u.
a[i] = ( (
m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; }
561 template <
typename V>
564 typedef decltype( get_inttype<V>() ) Etype;
567 for ( i = 0; i <
N; i++ ) { u.
a[i] = Etype( A[i] ); }
600 template <
typename V>
609 const uint32_t nlanes =
sizeof( V ) / 16;
610 const uint32_t lanesize =
N / nlanes;
611 const uint32_t elementsize =
sizeof( V ) /
N;
614 int32_t broadc = 999;
615 uint32_t patfail = 0;
617 int32_t compresslasti = -1;
618 int32_t compresslastp = -1;
619 int32_t expandlasti = -1;
620 int32_t expandlastp = -1;
622 int lanepattern[lanesize] = { 0 };
624 for ( i = 0; i <
N; i++ ) {
629 }
else if ( ix !=
V_DC && uint32_t( ix ) >=
N ) {
637 else if ( broadc != ix )
642 if ( ( i & 1 ) == 0 ) {
647 if ( ix == -1 && iy >= 0 ) r |=
perm_addz;
648 if ( iy == -1 && ix >= 0 ) r |=
perm_addz;
656 uint32_t lanei = (uint32_t)ix / lanesize;
660 if ( lane != 0 && ix >= 0 ) {
661 int j1 = i - int( lane * lanesize );
662 int jx = ix - int( lane * lanesize );
664 if ( lanepattern[j1] < 0 ) {
665 lanepattern[j1] = jx;
672 if ( uint32_t( ix * 2 ) != i ) {
676 if ( ix > compresslasti && ix - compresslasti >= (
int)i - compresslastp ) {
677 if ( (
int)i - compresslastp > 1 ) addz2 |= 2;
684 if ( ix > expandlasti && ix - expandlasti <= (
int)i - expandlastp ) {
685 if ( ix - expandlasti > 1 ) addz2 |= 4;
691 }
else if ( ix == -1 ) {
692 if ( ( i & 1 ) == 0 ) addz2 |= 1;
699 if ( ( patfail & 1 ) == 0 ) {
702 }
else if ( ( patfail & 2 ) == 0 ) {
704 if ( ( addz2 & 2 ) != 0 ) {
705 for (
j = 0;
j < compresslastp;
j++ ) {
709 }
else if ( ( patfail & 4 ) == 0 ) {
711 if ( ( addz2 & 4 ) != 0 ) {
712 for (
j = 0;
j < expandlastp;
j++ ) {
722 for ( i = 0; i < lanesize; i++ ) {
723 if ( lanepattern[i] >= 0 ) {
724 uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize;
728 if ( rot != rot1 ) fit =
false;
734 uint64_t rot2 = ( rot * elementsize ) & 0xF;
736 #if INSTRSET >= 4 // SSSE3
741 for ( i = 0; i < lanesize - rot; i++ ) {
742 if ( lanepattern[i] >= 0 ) fit =
false;
746 for ( ; i < lanesize; i++ )
747 if ( lanepattern[i] == -1 ) r |=
perm_addz;
751 for ( i = lanesize - (uint32_t)rot; i < lanesize;
753 if ( lanepattern[i] >= 0 ) fit =
false;
757 for ( i = 0; i < lanesize - rot; i++ ) {
758 if ( lanepattern[i] == -1 ) r |=
perm_addz;
764 uint32_t j2 = lanesize / 2;
765 for ( i = 0; i < lanesize; i++ ) {
766 if ( lanepattern[i] >= 0 && lanepattern[i] != (
int)j2 ) fit =
false;
767 if ( ( i & 1 ) != 0 ) j2++;
773 for ( i = 0; i < lanesize; i++ ) {
774 if ( lanepattern[i] >= 0 && lanepattern[i] != (
int)j2 ) fit =
false;
775 if ( ( i & 1 ) != 0 ) j2++;
779 if ( elementsize >= 4 ) {
781 for ( i = 0; i < lanesize; i++ ) {
782 if ( lanesize == 4 ) {
783 p |= ( lanepattern[i] & 3 ) << 2 * i;
785 p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i;
793 if constexpr ( nlanes > 1 ) {
794 for ( i = 0; i <
N; i++ ) {
797 uint32_t rot2 = ( ix +
N - i ) %
N;
800 }
else if ( rot != rot2 ) {
824 int ix = 0, lasti = -1, lastp = -1;
828 for ( i = 0; i <
N; i++ ) {
831 m |= (uint64_t)1 << ix;
832 for (
j = 1;
j < i - lastp;
j++ ) {
833 m |= (uint64_t)1 << ( lasti +
j );
848 int ix = 0, lasti = -1, lastp = -1;
852 for ( i = 0; i <
N; i++ ) {
855 m |= (uint64_t)1 << i;
856 for (
j = 1;
j < ix - lasti;
j++ ) {
857 m |= (uint64_t)1 << ( lastp +
j );
873 template <
typename V>
880 uint32_t pat[4] = { 0, 0, 0, 0 };
883 const uint32_t lanesize = 8;
885 int lanepattern[lanesize] = { 0 };
887 for ( i = 0; i <
N; i++ ) {
892 }
else if ( ix >= 0 ) {
893 uint32_t
j = i - lane * lanesize;
894 int jx = ix - lane * lanesize;
895 if ( lanepattern[
j] < 0 ) {
901 for ( i = 0; i < 4; i++ ) {
903 if ( lanepattern[i] >= 0 ) {
904 if ( lanepattern[i] < 4 ) {
906 pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
909 pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
913 if ( lanepattern[i + 4] >= 0 ) {
914 if ( lanepattern[i + 4] < 4 ) {
916 pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
919 pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
924 for ( i = 0; i < 4; i++ ) {
retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); }
932 template <
typename V,
int oppos = 0>
938 constexpr uint32_t elementsize =
sizeof( V ) /
N;
939 constexpr uint32_t nlanes =
sizeof( V ) / 16;
940 constexpr uint32_t elements_per_lane =
N / nlanes;
942 EList<int8_t,
sizeof( V )> u = { { 0 } };
950 for ( lane = 0; lane < nlanes; lane++ ) {
951 for ( i = 0; i < elements_per_lane; i++ ) {
956 ix ^= oppos * elements_per_lane;
958 ix -= int( lane * elements_per_lane );
959 if ( ix >= 0 && ix < (
int)elements_per_lane ) {
960 p = ix * elementsize;
962 for (
j = 0;
j < elementsize;
j++ ) {
963 u.a[k++] = p < 0 ? -1 : p +
j;
978 EList<int,
N / 2> list = { { 0 } };
982 bool fit_addz =
false;
986 for ( i = 0; i <
N; i += 2 ) {
989 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz =
true; }
993 for ( i = 0; i <
N; i += 2 ) {
998 }
else if ( iy >= 0 ) {
1002 if ( fit_addz ) iz =
V_DC;
1034 template <
typename V>
1043 const uint32_t nlanes =
sizeof( V ) / 16;
1044 const uint32_t lanesize =
N / nlanes;
1047 int lanepattern[lanesize] = { 0 };
1048 if ( lanesize == 2 &&
N <= 8 ) {
1052 for ( ii = 0; ii <
N; ii++ ) {
1057 else if ( ix !=
V_DC ) {
1066 }
else if ( ix < 2 *
N ) {
1076 if ( ( ii & 1 ) == 0 ) {
1084 lane = (uint32_t)ii / lanesize;
1086 lanepattern[ii] = ix;
1090 uint32_t lanei = uint32_t( ix & ~
N ) / lanesize;
1091 if ( lanei != lane ) {
1094 if ( lanesize == 2 ) {
1096 if ( ( ( ( ix &
N ) != 0 ) ^ ii ) & 1 )
1103 if ( lane != 0 && ix >= 0 ) {
1104 int j = ii - int( lane * lanesize );
1105 int jx = ix - int( lane * lanesize );
1107 if ( lanepattern[
j] < 0 ) {
1108 lanepattern[
j] = jx;
1122 for ( iu = 0; iu < lanesize; iu++ ) {
1123 ix = lanepattern[iu];
1126 if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) *
N ) r &= ~
blend_punpcklba;
1127 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) *
N ) r &= ~
blend_punpckhab;
1128 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) *
N ) r &= ~
blend_punpckhba;
1131 #if INSTRSET >= 4 // SSSE3. check if it fits palignr
1132 for ( iu = 0; iu < lanesize; iu++ ) {
1133 ix = lanepattern[iu];
1135 uint32_t
t = ix & ~
N;
1136 if ( ix &
N )
t += lanesize;
1137 uint32_t tb = (
t + 2 * lanesize - iu ) % ( lanesize * 2 );
1141 if ( rot != tb ) rot = 1000;
1146 if ( rot < lanesize ) {
1151 const uint32_t elementsize =
sizeof( V ) /
N;
1152 r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) <<
blend_rotpattern;
1155 if ( lanesize == 4 ) {
1158 for ( ii = 0; ii < 2; ii++ ) {
1159 ix = lanepattern[ii];
1167 for ( ; ii < 4; ii++ ) {
1168 ix = lanepattern[ii];
1177 uint8_t shufpattern = 0;
1178 for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
1182 }
else if ( nlanes > 1 ) {
1184 for ( ii = 0; ii <
N; ii++ ) {
1187 uint32_t rot2 = ( ix + 2 *
N - ii ) % ( 2 *
N );
1190 }
else if ( rot != rot2 ) {
1196 if ( rot < 2 *
N ) {
1201 for ( ii = 0; ii <
N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << (
blend_shufpattern + ii ); }
1211 template <
int N,
int dozero>
1215 int u = dozero ? -1 :
V_DC;
1218 for (
j = 0;
j <
N;
j++ ) {
1221 if ( dozero == 2 ) {
1230 }
else if ( ix <
N ) {
1235 list.
a[
j +
N] = ix -
N;
1249 EList<int,
N / 2> list = { { 0 } };
1251 bool fit_addz =
false;
1257 for ( i = 0; i <
N; i += 2 ) {
1262 }
else if ( iy >= 0 ) {
1268 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz =
true; }
1272 for ( i = 0; i <
N / 2; i++ ) {
1273 if ( list.a[i] < 0 ) list.a[i] =
V_DC;
1289 template <
typename dummy>
1291 template <
typename dummy>
1293 template <
typename dummy>
1295 template <
typename dummy>
1297 template <
typename dummy>
1306 template <
int N,
int dozero,
int src1,
int src2>
1310 int u = dozero ? -1 :
V_DC;
1313 for (
j = 0;
j <
N;
j++ ) {
1316 list.
a[
j] = ( dozero == 2 ) ? ix : u;
1319 if ( src == src1 ) {
1320 list.
a[
j] = ix & (
N - 1 );
1321 }
else if ( src == src2 ) {
1322 list.
a[
j] = ( ix & (
N - 1 ) ) +
N;
// selectblend: return one half of one of the two source vectors a and b,
// selected by the compile-time index s:
//   s == 0: low half of a,   s == 1: high half of a,
//   s == 2: low half of b,   otherwise: high half of b.
// NOTE(review): the s == 0 and s == 2 return statements were missing from the
// extracted source; restored by symmetry with the visible s == 1 / else
// branches — confirm against the full file.
template <typename W, int s>
static inline auto selectblend( W const a, W const b ) {
    if constexpr ( s == 0 )
        return a.get_low();
    else if constexpr ( s == 1 )
        return a.get_high();
    else if constexpr ( s == 2 )
        return b.get_low();
    else
        return b.get_high();
}
1350 template <typename W,
int... i0>
1352 typedef decltype( a.get_low() ) V;
1354 static_assert(
sizeof...( i0 ) ==
N,
"wrong number of indexes in blend_half" );
1355 constexpr
int ind[
N] = { i0... };
1359 auto listsources = [](
int const n,
int const( &ind )[
N] ) constexpr {
1360 bool source_used[4] = {
false,
false,
false,
false };
1362 for ( i = 0; i <
n; i++ ) {
1366 source_used[src & 3] =
true;
1372 for ( i = 0; i < 4; i++ ) {
1373 if ( source_used[i] ) { sources.
a[nsrc++] = i; }
1375 sources.
a[4] = nsrc;
1380 constexpr
int nsrc = sources.a[4];
1382 if constexpr ( nsrc == 0 ) {
1386 constexpr
int uindex = ( nsrc > 2 ) ? 1 : 2;
1389 V src0 = selectblend<W, sources.a[0]>( a, b );
1390 V src1 = selectblend<W, sources.a[1]>( a, b );
1391 if constexpr (
N == 2 ) {
1392 x0 =
blend2<
L.a[0],
L.a[1]>( src0, src1 );
1393 }
else if constexpr (
N == 4 ) {
1394 x0 =
blend4<
L.a[0],
L.a[1],
L.a[2],
L.a[3]>( src0, src1 );
1395 }
else if constexpr (
N == 8 ) {
1396 x0 =
blend8<
L.a[0],
L.a[1],
L.a[2],
L.a[3],
L.a[4],
L.a[5],
L.a[6],
L.a[7]>( src0, src1 );
1397 }
else if constexpr (
N == 16 ) {
1398 x0 =
blend16<
L.a[0],
L.a[1],
L.a[2],
L.a[3],
L.a[4],
L.a[5],
L.a[6],
L.a[7],
L.a[8],
L.a[9],
L.a[10],
L.a[11],
1399 L.a[12],
L.a[13],
L.a[14],
L.a[15]>( src0, src1 );
1400 }
else if constexpr (
N == 32 ) {
1401 x0 =
blend32<
L.a[0],
L.a[1],
L.a[2],
L.a[3],
L.a[4],
L.a[5],
L.a[6],
L.a[7],
L.a[8],
L.a[9],
L.a[10],
L.a[11],
1402 L.a[12],
L.a[13],
L.a[14],
L.a[15],
L.a[16],
L.a[17],
L.a[18],
L.a[19],
L.a[20],
L.a[21],
L.a[22],
1403 L.a[23],
L.a[24],
L.a[25],
L.a[26],
L.a[27],
L.a[28],
L.a[29],
L.a[30],
L.a[31]>( src0, src1 );
1405 if constexpr ( nsrc > 2 ) {
1408 V src2 = selectblend<W, sources.a[2]>( a, b );
1409 V src3 = selectblend<W, sources.a[3]>( a, b );
1410 if constexpr (
N == 2 ) {
1411 x1 =
blend2<M.a[0], M.a[1]>( src0, src1 );
1412 }
else if constexpr (
N == 4 ) {
1413 x1 =
blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
1414 }
else if constexpr (
N == 8 ) {
1415 x1 =
blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
1416 }
else if constexpr (
N == 16 ) {
1417 x1 =
blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1418 M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
1419 }
else if constexpr (
N == 32 ) {
1420 x1 =
blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1421 M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
1422 M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
1429 #ifdef VCL_NAMESPACE