instrset.h
1/**************************** instrset.h **********************************
2 * Author: Agner Fog
3 * Date created: 2012-05-30
4 * Last modified: 2020-11-11
5 * Version: 2.01.02
6 * Project: vector class library
7 * Description:
8 * Header file for various compiler-specific tasks as well as common
9 * macros and templates. This file contains:
10 *
11 * > Selection of the supported instruction set
12 * > Defines compiler version macros
13 * > Undefines certain macros that prevent function overloading
14 * > Helper functions that depend on instruction set, compiler, or platform
15 * > Common templates for permute, blend, etc.
16 *
17 * For instructions, see vcl_manual.pdf
18 *
19 * (c) Copyright 2012-2020 Agner Fog.
20 * Apache License version 2.0 or later.
21 ******************************************************************************/
22
23#pragma once
24
25// Allow the use of floating point permute instructions on integer vectors.
26// Some CPUs have an extra latency of 1 or 2 clock cycles for this, but
27// it may still be faster than alternative implementations:
28#define ALLOW_FP_PERMUTE true
29
30// Macro to indicate 64 bit mode
31#if ( defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ) ) && !defined( __x86_64__ )
32# define __x86_64__ 1 // There are many different macros for this, decide on only one
33#endif
34
35// The following values of INSTRSET are currently defined:
36// 2: SSE2
37// 3: SSE3
38// 4: SSSE3
39// 5: SSE4.1
40// 6: SSE4.2
41// 7: AVX
42// 8: AVX2
43// 9: AVX512F
44// 10: AVX512BW/DQ/VL
45// In the future, INSTRSET = 11 may include AVX512VBMI and AVX512VBMI2, but this
46// decision cannot be made before the market situation for CPUs with these
47// instruction sets is known (these future instruction set extensions are already
48// used in some VCL functions and tested with an emulator)
49
50// Find instruction set from compiler macros if INSTRSET is not defined.
51// Note: Most of these macros are not defined in Microsoft compilers
52#ifndef INSTRSET
53# if defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ )
54# define INSTRSET 10
55# elif defined( __AVX512F__ ) || defined( __AVX512__ )
56# define INSTRSET 9
57# elif defined( __AVX2__ )
58# define INSTRSET 8
59# elif defined( __AVX__ )
60# define INSTRSET 7
61# elif defined( __SSE4_2__ )
62# define INSTRSET 6
63# elif defined( __SSE4_1__ )
64# define INSTRSET 5
65# elif defined( __SSSE3__ )
66# define INSTRSET 4
67# elif defined( __SSE3__ )
68# define INSTRSET 3
69# elif defined( __SSE2__ ) || defined( __x86_64__ )
70# define INSTRSET 2
71# elif defined( __SSE__ )
72# define INSTRSET 1
73# elif defined( _M_IX86_FP ) // Defined in MS compiler. 1: SSE, 2: SSE2
74# define INSTRSET _M_IX86_FP
75# else
76# define INSTRSET 0
77# endif // instruction set defines
78#endif // INSTRSET
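// For example (illustrative): compiling with "g++ -O2 -mavx2" defines __AVX2__,
// so INSTRSET becomes 8 (AVX2). Because of the #ifndef guard above, INSTRSET can
// also be predefined on the command line, e.g. -DINSTRSET=2, to override the
// automatic selection.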
79
80// Include the appropriate header file for intrinsic functions
81#if INSTRSET > 7 // AVX2 and later
82# if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
83# include <x86intrin.h> // x86intrin.h includes header files for whatever instruction
84 // sets are specified on the compiler command line, such as:
85 // xopintrin.h, fma4intrin.h
86# else
87# include <immintrin.h> // MS/Intel version of immintrin.h covers AVX and later
88# endif // __GNUC__
89#elif INSTRSET == 7
90# include <immintrin.h> // AVX
91#elif INSTRSET == 6
92# include <nmmintrin.h> // SSE4.2
93#elif INSTRSET == 5
94# include <smmintrin.h> // SSE4.1
95#elif INSTRSET == 4
96# include <tmmintrin.h> // SSSE3
97#elif INSTRSET == 3
98# include <pmmintrin.h> // SSE3
99#elif INSTRSET == 2
100# include <emmintrin.h> // SSE2
101#elif INSTRSET == 1
102# include <xmmintrin.h> // SSE
103#endif // INSTRSET
104
105#if INSTRSET >= 8 && !defined( __FMA__ )
106// Assume that all processors that have AVX2 also have FMA3
107# if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
108// Prevent error message in g++ and Clang when using FMA intrinsics with avx2:
109# if !defined( DISABLE_WARNING_AVX2_WITHOUT_FMA )
110# pragma message "It is recommended to also specify option -mfma when using -mavx2 or higher"
111# endif
112# elif !defined( __clang__ )
113# define __FMA__ 1
114# endif
115#endif
116
117// AMD instruction sets
118#if defined( __XOP__ ) || defined( __FMA4__ )
119# ifdef __GNUC__
120# include <x86intrin.h> // AMD XOP (Gnu)
121# else
122# include <ammintrin.h> // AMD XOP (Microsoft)
123# endif // __GNUC__
124#elif defined( __SSE4A__ ) // AMD SSE4A
125# include <ammintrin.h>
126#endif // __XOP__
127
128// FMA3 instruction set
129#if defined( __FMA__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) ) && !defined( __INTEL_COMPILER )
130# include <fmaintrin.h>
131#endif // __FMA__
132
133// FMA4 instruction set
134#if defined( __FMA4__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) )
135# include <fma4intrin.h> // must have both x86intrin.h and fma4intrin.h, don't know why
136#endif // __FMA4__
137
138#include <stdint.h> // Define integer types with known size
139#include <stdlib.h> // define abs(int)
140
141#ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler
142# include <intrin.h> // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
143#endif // _MSC_VER
144
145// functions in instrset_detect.cpp:
146#ifdef VCL_NAMESPACE
147namespace VCL_NAMESPACE {
148#endif
149 int instrset_detect( void ); // tells which instruction sets are supported
150 bool hasFMA3( void ); // true if FMA3 instructions supported
151 bool hasFMA4( void ); // true if FMA4 instructions supported
152 bool hasXOP( void ); // true if XOP instructions supported
153 bool hasAVX512ER( void ); // true if AVX512ER instructions supported
154 bool hasAVX512VBMI( void ); // true if AVX512VBMI instructions supported
155 bool hasAVX512VBMI2( void ); // true if AVX512VBMI2 instructions supported
156#ifdef VCL_NAMESPACE
157}
158#endif
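// Example (illustrative sketch) of runtime dispatch using these functions,
// assuming instrset_detect.cpp is compiled and linked into the program:
//     if ( instrset_detect() >= 8 && hasFMA3() ) {
//         // safe to call a code path compiled for AVX2 + FMA3
//     }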
159
160// functions in physical_processors.cpp:
161int physicalProcessors( int* logical_processors = 0 );
162
163// GCC version
164#if defined( __GNUC__ ) && !defined( GCC_VERSION ) && !defined( __clang__ )
165# define GCC_VERSION ( (__GNUC__)*10000 + (__GNUC_MINOR__)*100 + ( __GNUC_PATCHLEVEL__ ) )
166#endif
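// For example, gcc 10.2.1 gives GCC_VERSION 10 * 10000 + 2 * 100 + 1 = 100201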
167
168// Clang version
169#if defined( __clang__ )
170# define CLANG_VERSION ( (__clang_major__)*10000 + (__clang_minor__)*100 + ( __clang_patchlevel__ ) )
171// Problem: The version number is not consistent across platforms
172// http://llvm.org/bugs/show_bug.cgi?id=12643
173// Apple bug 18746972
174#endif
175
176// Fix problem with non-overloadable macros named min and max in WinDef.h
177#ifdef _MSC_VER
178# if defined( _WINDEF_ ) && defined( min ) && defined( max )
179# undef min
180# undef max
181# endif
182# ifndef NOMINMAX
183# define NOMINMAX
184# endif
185#endif
186
187/* Intel compiler problem:
188The Intel compiler currently cannot compile version 2.00 of VCL. It seems to have
189a problem with constexpr function returns not being constant enough.
190*/
191#if defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 9999
192# error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
193#endif
194
195/* Clang problem:
196The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical.
197See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164
198Additional problem: The version number is not consistent across platforms. The Apple build has
199different version numbers. We have to rely on __apple_build_version__ on the Mac platform:
200http://llvm.org/bugs/show_bug.cgi?id=12643
201We will have to add switches here when - hopefully - the error has been fixed some day.
202We need different version checks with and without __apple_build_version__
203*/
204#if ( defined( __clang__ ) || defined( __apple_build_version__ ) ) && !defined( __INTEL_COMPILER )
205# define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
206#endif
207
208#if defined( GCC_VERSION ) && GCC_VERSION < 99999 && !defined( __clang__ )
209# define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions
210#endif
211
212#ifdef VCL_NAMESPACE
213namespace VCL_NAMESPACE {
214#endif
215
216 // Constant for indicating don't care in permute and blend functions.
217 // V_DC is -256 in Vector class library version 1.xx
218 // V_DC can be any value less than -1 in Vector class library version 2.00
219 constexpr int V_DC = -256;
220
221 /*****************************************************************************
222 *
223 * Helper functions that depend on instruction set, compiler, or platform
224 *
225 *****************************************************************************/
226
227 // Define interface to cpuid instruction.
228 // input: functionnumber = leaf (eax), ecxleaf = subleaf(ecx)
229 // output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
230 static inline void cpuid( int output[4], int functionnumber, int ecxleaf = 0 ) {
231#ifdef __x86_64__
232# if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax
233 int a, b, c, d;
234 __asm( "cpuid" : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) : "a"( functionnumber ), "c"( ecxleaf ) : );
235 output[0] = a;
236 output[1] = b;
237 output[2] = c;
238 output[3] = d;
239
240# elif defined( _MSC_VER ) // Microsoft compiler, intrin.h included
241 __cpuidex( output, functionnumber, ecxleaf ); // intrinsic function for CPUID
242
243# else // unknown platform. try inline assembly with masm/intel syntax
244 __asm {
245 mov eax, functionnumber
246 mov ecx, ecxleaf
247 cpuid;
248 mov esi, output
249 mov[esi], eax
250 mov[esi + 4], ebx
251 mov[esi + 8], ecx
252 mov[esi + 12], edx
253 }
254# endif // compiler/platform
255#else
256 for ( int i = 0; i < 4; i++ ) { output[i] = 0; } // no CPUID on this platform
257#endif // __x86_64__
258 }
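 // Example (illustrative): reading the CPU vendor string with leaf 0.
 // The 12 vendor characters are returned in ebx, edx, ecx - in that order:
 //     int abcd[4];
 //     char vendor[13] = { 0 };
 //     cpuid( abcd, 0 );
 //     memcpy( vendor, &abcd[1], 4 );     // ebx       (memcpy from <cstring>)
 //     memcpy( vendor + 4, &abcd[3], 4 ); // edx
 //     memcpy( vendor + 8, &abcd[2], 4 ); // ecx  -> e.g. "GenuineIntel"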
259
260// Define popcount function. Gives sum of bits
261#if INSTRSET >= 6 // SSE4.2
262 // popcnt instruction is not officially part of the SSE4.2 instruction set,
263 // but available in all known processors with SSE4.2
264 static inline uint32_t vml_popcnt( uint32_t a ) {
265 return (uint32_t)_mm_popcnt_u32( a ); // Intel intrinsic. Supported by gcc and clang
266 }
267# ifdef __x86_64__
268 static inline int64_t vml_popcnt( uint64_t a ) {
269 return _mm_popcnt_u64( a ); // Intel intrinsic.
270 }
271# else // 32 bit mode
272 static inline int64_t vml_popcnt( uint64_t a ) {
273 return _mm_popcnt_u32( uint32_t( a >> 32 ) ) + _mm_popcnt_u32( uint32_t( a ) );
274 }
275# endif
276#else // no SSE4.2
277static inline uint32_t vml_popcnt( uint32_t a ) {
278 // popcnt instruction not available
279 uint32_t b = a - ( ( a >> 1 ) & 0x55555555 );
280 uint32_t c = ( b & 0x33333333 ) + ( ( b >> 2 ) & 0x33333333 );
281 uint32_t d = ( c + ( c >> 4 ) ) & 0x0F0F0F0F;
282 uint32_t e = d * 0x01010101;
283 return e >> 24;
284}
285
286static inline int32_t vml_popcnt( uint64_t a ) {
287 return vml_popcnt( uint32_t( a >> 32 ) ) + vml_popcnt( uint32_t( a ) );
288}
289
290#endif
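// Worked example for the fallback above, a = 0xF0F0F0F0 (16 set bits):
// b = 0xA0A0A0A0 (2-bit pair counts), c = 0x40404040 (per-nibble counts),
// d = 0x04040404 (per-byte counts); the multiply by 0x01010101 sums all four
// bytes into the top byte, so e >> 24 = 4 + 4 + 4 + 4 = 16.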
291
292// Define bit-scan-forward function. Gives index to lowest set bit
293#if defined( __GNUC__ ) || defined( __clang__ )
294 // gcc and Clang have no bit_scan_forward intrinsic
295# if defined( __clang__ ) // fix clang bug
296 // Clang uses a k register as parameter a when inlined from horizontal_find_first
297 __attribute__( ( noinline ) )
298# endif
299 static uint32_t
300 bit_scan_forward( uint32_t a ) {
301 uint32_t r;
302 __asm( "bsfl %1, %0" : "=r"( r ) : "r"( a ) : );
303 return r;
304 }
305 static inline uint32_t bit_scan_forward( uint64_t a ) {
306 uint32_t lo = uint32_t( a );
307 if ( lo ) return bit_scan_forward( lo );
308 uint32_t hi = uint32_t( a >> 32 );
309 return bit_scan_forward( hi ) + 32;
310 }
311
312#else // other compilers
313static inline uint32_t bit_scan_forward( uint32_t a ) {
314 unsigned long r;
315 _BitScanForward( &r, a ); // defined in intrin.h for MS and Intel compilers
316 return r;
317}
318# ifdef __x86_64__
319static inline uint32_t bit_scan_forward( uint64_t a ) {
320 unsigned long r;
321 _BitScanForward64( &r, a ); // defined in intrin.h for MS and Intel compilers
322 return (uint32_t)r;
323}
324# else
325static inline uint32_t bit_scan_forward( uint64_t a ) {
326 uint32_t lo = uint32_t( a );
327 if ( lo ) return bit_scan_forward( lo );
328 uint32_t hi = uint32_t( a >> 32 );
329 return bit_scan_forward( hi ) + 32;
330}
331# endif
332#endif
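// For instance, bit_scan_forward(0x18) == 3 (0x18 = 0b11000; the lowest set
// bit is bit 3). Note that the result is undefined for a == 0, as for the
// underlying BSF instruction.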
333
334// Define bit-scan-reverse function. Gives index to highest set bit = floor(log2(a))
335#if defined( __GNUC__ ) || defined( __clang__ )
336 static inline uint32_t bit_scan_reverse( uint32_t a ) __attribute__( ( pure ) );
337 static inline uint32_t bit_scan_reverse( uint32_t a ) {
338 uint32_t r;
339 __asm( "bsrl %1, %0" : "=r"( r ) : "r"( a ) : );
340 return r;
341 }
342# ifdef __x86_64__
343 static inline uint32_t bit_scan_reverse( uint64_t a ) {
344 uint64_t r;
345 __asm( "bsrq %1, %0" : "=r"( r ) : "r"( a ) : );
346 return r;
347 }
348# else // 32 bit mode
349 static inline uint32_t bit_scan_reverse( uint64_t a ) {
350 uint64_t ahi = a >> 32;
351 if ( ahi == 0 )
352 return bit_scan_reverse( uint32_t( a ) );
353 else
354 return bit_scan_reverse( uint32_t( ahi ) ) + 32;
355 }
356# endif
357#else
358static inline uint32_t bit_scan_reverse( uint32_t a ) {
359 unsigned long r;
360 _BitScanReverse( &r, a ); // defined in intrin.h for MS and Intel compilers
361 return r;
362}
363# ifdef __x86_64__
364static inline uint32_t bit_scan_reverse( uint64_t a ) {
365 unsigned long r;
366 _BitScanReverse64( &r, a ); // defined in intrin.h for MS and Intel compilers
367 return r;
368}
369# else // 32 bit mode
370static inline uint32_t bit_scan_reverse( uint64_t a ) {
371 uint64_t ahi = a >> 32;
372 if ( ahi == 0 )
373 return bit_scan_reverse( uint32_t( a ) );
374 else
375 return bit_scan_reverse( uint32_t( ahi ) ) + 32;
376}
377# endif
378#endif
379
380 // Same function, for compile-time constants
381 constexpr int bit_scan_reverse_const( uint64_t const n ) {
382 if ( n == 0 ) return -1;
383 uint64_t a = n, b = 0, j = 64, k = 0;
384 do {
385 j >>= 1;
386 k = (uint64_t)1 << j;
387 if ( a >= k ) {
388 a >>= j;
389 b += j;
390 }
391 } while ( j > 0 );
392 return int( b );
393 }
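 // For instance, bit_scan_reverse(17u) == 4 == floor(log2(17)), and
 // bit_scan_reverse_const(64) == 6. Unlike the runtime versions, the
 // constexpr version returns -1 for n == 0.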
394
395 /*****************************************************************************
396 *
397 * Common templates
398 *
399 *****************************************************************************/
400
401 // Template class to represent compile-time integer constant
402 template <int32_t n>
403 class Const_int_t {}; // represent compile-time signed integer constant
404 template <uint32_t n>
405 class Const_uint_t {}; // represent compile-time unsigned integer constant
406#define const_int( n ) ( Const_int_t<n>() ) // n must be compile-time integer constant
407#define const_uint( n ) ( Const_uint_t<n>() ) // n must be compile-time unsigned integer constant
408
409 // template for producing quiet NAN
410 template <class VTYPE>
411 static inline VTYPE nan_vec( uint32_t payload = 0x100 ) {
412 if constexpr ( ( VTYPE::elementtype() & 1 ) != 0 ) { // double
413 union {
414 uint64_t q;
415 double f;
416 } ud;
417 // n is left justified to avoid loss of NAN payload when converting to float
418 ud.q = 0x7FF8000000000000 | uint64_t( payload ) << 29;
419 return VTYPE( ud.f );
420 }
421 // float will be converted to double if necessary
422 union {
423 uint32_t i;
424 float f;
425 } uf;
426 uf.i = 0x7FC00000 | ( payload & 0x003FFFFF );
427 return VTYPE( uf.f );
428 }
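 // Example (illustrative; Vec4f is defined elsewhere in this library):
 // nan_vec<Vec4f>(5) fills the vector with quiet NANs of bit pattern
 // 0x7FC00000 | 5 = 0x7FC00005, i.e. payload 5 in the low mantissa bits.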
429
430 // Test if a parameter is a compile-time constant
431 /* Unfortunately, this works only for macro parameters, not for inline function parameters.
432 I hope that some solution will appear in the future, but for now it appears to be
433 impossible to check if a function parameter is a compile-time constant.
434 This would be useful in operator / and in function pow:
435 #if defined(__GNUC__) || defined (__clang__)
436 #define is_constant(a) __builtin_constant_p(a)
437 #else
438 #define is_constant(a) false
439 #endif
440 */
441
442 /*****************************************************************************
443 *
444 * Helper functions for permute and blend functions
445 *
446 ******************************************************************************
447 Rules for constexpr functions:
448
449 > All variable declarations must include initialization
450
451 > Do not put variable declarations inside a for-clause, e.g. avoid: for (int i=0; ..
452 Instead, you have to declare the loop counter before the for-loop.
453
454 > Do not make constexpr functions that return vector types. This requires type
455 punning with a union, which is not allowed in constexpr functions under C++17.
456 It may be possible under C++20
457
458 *****************************************************************************/
459
460 // Define type for Encapsulated array to use as return type:
461 template <typename T, int N>
462 struct EList {
463 T a[N];
464 };
465
466 // get_inttype: get an integer of a size that matches the element size
467 // of vector class V with the value -1
468 template <typename V>
469 constexpr auto get_inttype() {
470 constexpr int elementsize = sizeof( V ) / V::size(); // size of vector elements
471
472 if constexpr ( elementsize >= 8 ) {
473 return -int64_t( 1 );
474 } else if constexpr ( elementsize >= 4 ) {
475 return int32_t( -1 );
476 } else if constexpr ( elementsize >= 2 ) {
477 return int16_t( -1 );
478 } else {
479 return int8_t( -1 );
480 }
481 }
482
483 // zero_mask: return a compact bit mask for zeroing using an AVX512 mask.
484 // Parameter a is a reference to a constexpr int array of permutation indexes
485 template <int N>
486 constexpr auto zero_mask( int const ( &a )[N] ) {
487 uint64_t mask = 0;
488 int i = 0;
489
490 for ( i = 0; i < N; i++ ) {
491 if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i;
492 }
493 if constexpr ( N <= 8 )
494 return uint8_t( mask );
495 else if constexpr ( N <= 16 )
496 return uint16_t( mask );
497 else if constexpr ( N <= 32 )
498 return uint32_t( mask );
499 else
500 return mask;
501 }
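 // Worked example: for N = 8 and indexes {0,-1,2,-1,4,5,-1,7}, bits
 // 0,2,4,5,7 are set and zero_mask returns uint8_t(0b10110101) = 0xB5.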
502
503 // zero_mask_broad: return a broad byte mask for zeroing.
504 // Parameter A is a reference to a constexpr int array of permutation indexes
505 template <typename V>
506 constexpr auto zero_mask_broad( int const ( &A )[V::size()] ) {
507 constexpr int N = V::size(); // number of vector elements
508 typedef decltype( get_inttype<V>() ) Etype; // element type
509 EList<Etype, N> u = { { 0 } }; // list for return
510 int i = 0;
511 for ( i = 0; i < N; i++ ) { u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0; }
512 return u; // return encapsulated array
513 }
514
515 // make_bit_mask: return a compact mask of bits from a list of N indexes:
516 // B contains options indicating how to gather the mask
517 // bit 0-7 in B indicates which bit in each index to collect
518 // bit 8 = 0x100: set 1 in the lower half of the bit mask if the indicated bit is 1.
519 // bit 8 = 0 : set 1 in the lower half of the bit mask if the indicated bit is 0.
520 // bit 9 = 0x200: set 1 in the upper half of the bit mask if the indicated bit is 1.
521 // bit 9 = 0 : set 1 in the upper half of the bit mask if the indicated bit is 0.
522 // bit 10 = 0x400: set 1 in the bit mask if the corresponding index is -1 or V_DC
523 // Parameter a is a reference to a constexpr int array of permutation indexes
524 template <int N, int B>
525 constexpr uint64_t make_bit_mask( int const ( &a )[N] ) {
526 uint64_t r = 0; // return value
527 uint8_t j = uint8_t( B & 0xFF ); // index to selected bit
528 uint64_t s = 0; // bit number i in r
529 uint64_t f = 0; // 1 if bit not flipped
530 int i = 0;
531 for ( i = 0; i < N; i++ ) {
532 int ix = a[i];
533 if ( ix < 0 ) { // -1 or V_DC
534 s = ( B >> 10 ) & 1;
535 } else {
536 s = ( (uint32_t)ix >> j ) & 1; // extract selected bit
537 if ( i < N / 2 ) {
538 f = ( B >> 8 ) & 1; // lower half
539 } else {
540 f = ( B >> 9 ) & 1; // upper half
541 }
542 s ^= f ^ 1; // flip bit if needed
543 }
544 r |= uint64_t( s ) << i; // set bit in return value
545 }
546 return r;
547 }
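 // Worked example: with B = 0x300 (collect bit 0 of each index; set 1 where
 // that bit is 1, in both halves), indexes {1,0,3,2} give the bit mask
 // 0b0101: positions 0 and 2 hold odd indexes.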
548
549 // make_broad_mask: Convert a bit mask m to a broad mask
550 // The return value will be a broad boolean mask with elementsize matching vector class V
551 template <typename V>
552 constexpr auto make_broad_mask( uint64_t const m ) {
553 constexpr int N = V::size(); // number of vector elements
554 typedef decltype( get_inttype<V>() ) Etype; // element type
555 EList<Etype, N> u = { { 0 } }; // list for returning
556 int i = 0;
557 for ( i = 0; i < N; i++ ) { u.a[i] = ( ( m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; }
558 return u; // return encapsulated array
559 }
560
561 // perm_mask_broad: return a mask for permutation by a vector register index.
562 // Parameter A is a reference to a constexpr int array of permutation indexes
563 template <typename V>
564 constexpr auto perm_mask_broad( int const ( &A )[V::size()] ) {
565 constexpr int N = V::size(); // number of vector elements
566 typedef decltype( get_inttype<V>() ) Etype; // vector element type
567 EList<Etype, N> u = { { 0 } }; // list for returning
568 int i = 0;
569 for ( i = 0; i < N; i++ ) { u.a[i] = Etype( A[i] ); }
570 return u; // return encapsulated array
571 }
572
573 // perm_flags: returns information about how a permute can be implemented.
574 // The return value is composed of these flag bits:
575 const int perm_zeroing = 1; // needs zeroing
576 const int perm_perm = 2; // permutation needed
577 const int perm_allzero = 4; // all is zero or don't care
578 const int perm_largeblock = 8; // fits permute with a larger block size (e.g. permute Vec2q instead of Vec4i)
579 const int perm_addz = 0x10; // additional zeroing needed after permute with larger block size or shift
580 const int perm_addz2 = 0x20; // additional zeroing needed after perm_zext, perm_compress, or perm_expand
581 const int perm_cross_lane = 0x40; // permutation crossing 128-bit lanes
582 const int perm_same_pattern = 0x80; // same permute pattern in all 128-bit lanes
583 const int perm_punpckh = 0x100; // permutation pattern fits punpckh instruction
584 const int perm_punpckl = 0x200; // permutation pattern fits punpckl instruction
585 const int perm_rotate =
586 0x400; // permutation pattern fits rotation within lanes. 4 bit count returned in bit perm_rot_count
587 const int perm_shright =
588 0x1000; // permutation pattern fits shift right within lanes. 4 bit count returned in bit perm_rot_count
589 const int perm_shleft =
590 0x2000; // permutation pattern fits shift left within lanes. negative count returned in bit perm_rot_count
591 const int perm_rotate_big =
592 0x4000; // permutation pattern fits rotation across lanes. 6 bit count returned in bit perm_rot_count
593 const int perm_broadcast = 0x8000; // permutation pattern fits broadcast of a single element.
594 const int perm_zext = 0x10000; // permutation pattern fits zero extension
595 const int perm_compress = 0x20000; // permutation pattern fits vpcompress instruction
596 const int perm_expand = 0x40000; // permutation pattern fits vpexpand instruction
597 const int perm_outofrange = 0x10000000; // index out of range
598 const int perm_rot_count = 32; // rotate or shift count is in bits perm_rot_count to perm_rot_count+3
599 const int perm_ipattern =
600 40; // pattern for pshufd is in bit perm_ipattern to perm_ipattern + 7 if perm_same_pattern and elementsize >= 4
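 // Example (illustrative) of decoding a perm_flags value f:
 //     int count = int( ( f >> perm_rot_count ) & 0xF ); // rotate/shift count, bits 32-35
 //     uint8_t pat = uint8_t( f >> perm_ipattern );      // pshufd pattern, bits 40-47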
601
602 template <typename V>
603 constexpr uint64_t perm_flags( int const ( &a )[V::size()] ) {
604 // a is a reference to a constexpr array of permutation indexes
605 // V is a vector class
606 constexpr int N = V::size(); // number of elements
607 uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value
608 uint32_t i = 0; // loop counter
609 int j = 0; // loop counter
610 int ix = 0; // index number i
611 const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes
612 const uint32_t lanesize = N / nlanes; // elements per lane
613 const uint32_t elementsize = sizeof( V ) / N; // size of each vector element
614 uint32_t lane = 0; // current lane
615 uint32_t rot = 999; // rotate left count
616 int32_t broadc = 999; // index to broadcasted element
617 uint32_t patfail = 0; // remember certain patterns that do not fit
618 uint32_t addz2 = 0; // remember certain patterns need extra zeroing
619 int32_t compresslasti = -1; // last index in perm_compress fit
620 int32_t compresslastp = -1; // last position in perm_compress fit
621 int32_t expandlasti = -1; // last index in perm_expand fit
622 int32_t expandlastp = -1; // last position in perm_expand fit
623
624 int lanepattern[lanesize] = { 0 }; // pattern in each lane
625
626 for ( i = 0; i < N; i++ ) { // loop through indexes
627 ix = a[i]; // current index
628 // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute.
629 if ( ix == -1 ) {
630 r |= perm_zeroing; // zeroing requested
631 } else if ( ix != V_DC && uint32_t( ix ) >= N ) {
632 r |= perm_outofrange; // index out of range
633 }
634 if ( ix >= 0 ) {
635 r &= ~perm_allzero; // not all zero
636 if ( ix != (int)i ) r |= perm_perm; // needs permutation
637 if ( broadc == 999 )
638 broadc = ix; // remember broadcast index
639 else if ( broadc != ix )
640 broadc = 1000; // does not fit broadcast
641 }
642 // check if pattern fits a larger block size:
643 // even indexes must be even, odd indexes must fit the preceding even index + 1
644 if ( ( i & 1 ) == 0 ) { // even index
645 if ( ix >= 0 && ( ix & 1 ) ) r &= ~perm_largeblock; // not even. does not fit larger block size
646 int iy = a[i + 1]; // next odd index
647 if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~perm_largeblock; // not odd. does not fit larger block size
648 if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~perm_largeblock; // does not fit preceding index + 1
649 if ( ix == -1 && iy >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
650 if ( iy == -1 && ix >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
651 }
652 lane = i / lanesize; // current lane
653 if ( lane == 0 ) { // first lane, or no pattern yet
654 lanepattern[i] = ix; // save pattern
655 }
656 // check if crossing lanes
657 if ( ix >= 0 ) {
658 uint32_t lanei = (uint32_t)ix / lanesize; // source lane
659 if ( lanei != lane ) r |= perm_cross_lane; // crossing lane
660 }
661 // check if same pattern in all lanes
662 if ( lane != 0 && ix >= 0 ) { // not first lane
663 int j1 = i - int( lane * lanesize ); // index into lanepattern
664 int jx = ix - int( lane * lanesize ); // pattern within lane
665 if ( jx < 0 || jx >= (int)lanesize ) r &= ~perm_same_pattern; // source is in another lane
666 if ( lanepattern[j1] < 0 ) {
667 lanepattern[j1] = jx; // pattern not known from previous lane
668 } else {
669 if ( lanepattern[j1] != jx ) r &= ~perm_same_pattern; // not same pattern
670 }
671 }
672 if ( ix >= 0 ) {
673 // check if pattern fits zero extension (perm_zext)
674 if ( uint32_t( ix * 2 ) != i ) {
675 patfail |= 1; // does not fit zero extension
676 }
677 // check if pattern fits compress (perm_compress)
678 if ( ix > compresslasti && ix - compresslasti >= (int)i - compresslastp ) {
679 if ( (int)i - compresslastp > 1 ) addz2 |= 2; // perm_compress may need additional zeroing
680 compresslasti = ix;
681 compresslastp = i;
682 } else {
683 patfail |= 2; // does not fit perm_compress
684 }
685 // check if pattern fits expand (perm_expand)
686 if ( ix > expandlasti && ix - expandlasti <= (int)i - expandlastp ) {
687 if ( ix - expandlasti > 1 ) addz2 |= 4; // perm_expand may need additional zeroing
688 expandlasti = ix;
689 expandlastp = i;
690 } else {
691 patfail |= 4; // does not fit perm_expand
692 }
693 } else if ( ix == -1 ) {
694 if ( ( i & 1 ) == 0 ) addz2 |= 1; // zero extension needs additional zeroing
695 }
696 }
697 if ( !( r & perm_perm ) ) return r; // more checks are superfluous
698
699 if ( !( r & perm_largeblock ) ) r &= ~perm_addz; // remove irrelevant flag
700 if ( r & perm_cross_lane ) r &= ~perm_same_pattern; // remove irrelevant flag
701 if ( ( patfail & 1 ) == 0 ) {
702 r |= perm_zext; // fits zero extension
703 if ( ( addz2 & 1 ) != 0 ) r |= perm_addz2;
704 } else if ( ( patfail & 2 ) == 0 ) {
705 r |= perm_compress; // fits compression
706 if ( ( addz2 & 2 ) != 0 ) { // check if additional zeroing needed
707 for ( j = 0; j < compresslastp; j++ ) {
708 if ( a[j] == -1 ) r |= perm_addz2;
709 }
710 }
711 } else if ( ( patfail & 4 ) == 0 ) {
712 r |= perm_expand; // fits expansion
713 if ( ( addz2 & 4 ) != 0 ) { // check if additional zeroing needed
714 for ( j = 0; j < expandlastp; j++ ) {
715 if ( a[j] == -1 ) r |= perm_addz2;
716 }
717 }
718 }
719
720 if ( r & perm_same_pattern ) {
721 // same pattern in all lanes. check if it fits specific patterns
722 bool fit = true;
723 // fit shift or rotate
724 for ( i = 0; i < lanesize; i++ ) {
725 if ( lanepattern[i] >= 0 ) {
726 uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize;
727 if ( rot == 999 ) {
728 rot = rot1;
729 } else { // check if fit
730 if ( rot != rot1 ) fit = false;
731 }
732 }
733 }
734 rot &= lanesize - 1; // prevent out of range values
735 if ( fit ) { // fits rotate, and possibly shift
736 uint64_t rot2 = ( rot * elementsize ) & 0xF; // rotate right count in bytes
737 r |= rot2 << perm_rot_count; // put shift/rotate count in output bit 16-19
738#if INSTRSET >= 4 // SSSE3
739 r |= perm_rotate; // allow palignr
740#endif
741 // fit shift left
742 fit = true;
743 for ( i = 0; i < lanesize - rot; i++ ) { // check if the first (lanesize-rot) elements are zero or don't care
744 if ( lanepattern[i] >= 0 ) fit = false;
745 }
746 if ( fit ) {
747 r |= perm_shleft;
748 for ( ; i < lanesize; i++ )
749 if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
750 }
751 // fit shift right
752 fit = true;
753 for ( i = lanesize - (uint32_t)rot; i < lanesize;
754 i++ ) { // check if the last rot elements are zero or don't care
755 if ( lanepattern[i] >= 0 ) fit = false;
756 }
757 if ( fit ) {
758 r |= perm_shright;
759 for ( i = 0; i < lanesize - rot; i++ ) {
760 if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
761 }
762 }
763 }
764 // fit punpckhi
765 fit = true;
766 uint32_t j2 = lanesize / 2;
767 for ( i = 0; i < lanesize; i++ ) {
768 if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
769 if ( ( i & 1 ) != 0 ) j2++;
770 }
771 if ( fit ) r |= perm_punpckh;
772 // fit punpcklo
773 fit = true;
774 j2 = 0;
775 for ( i = 0; i < lanesize; i++ ) {
776 if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
777 if ( ( i & 1 ) != 0 ) j2++;
778 }
779 if ( fit ) r |= perm_punpckl;
780 // fit pshufd
781 if ( elementsize >= 4 ) {
782 uint64_t p = 0;
783 for ( i = 0; i < lanesize; i++ ) {
784 if ( lanesize == 4 ) {
785 p |= ( lanepattern[i] & 3 ) << 2 * i;
786 } else { // lanesize = 2
787 p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i;
788 }
789 }
790 r |= p << perm_ipattern;
791 }
792 }
793#if INSTRSET >= 7
794 else { // not same pattern in all lanes
795 if constexpr ( nlanes > 1 ) { // Try if it fits big rotate
796 for ( i = 0; i < N; i++ ) {
797 ix = a[i];
798 if ( ix >= 0 ) {
799 uint32_t rot2 = ( ix + N - i ) % N; // rotate count
800 if ( rot == 999 ) {
801 rot = rot2; // save rotate count
802 } else if ( rot != rot2 ) {
803 rot = 1000;
804 break; // does not fit big rotate
805 }
806 }
807 }
808 if ( rot < N ) { // fits big rotate
809 r |= perm_rotate_big | (uint64_t)rot << perm_rot_count;
810 }
811 }
812 }
813#endif
814 if ( broadc < 999 && ( r & ( perm_rotate | perm_shright | perm_shleft | perm_rotate_big ) ) == 0 ) {
815 r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast
816 }
817 return r;
818 }
819
820 // compress_mask: returns a bit mask to use for compression instruction.
821 // It is presupposed that perm_flags indicates perm_compress.
822 // Additional zeroing is needed if perm_flags indicates perm_addz2
823 template <int N>
824 constexpr uint64_t compress_mask( int const ( &a )[N] ) {
825 // a is a reference to a constexpr array of permutation indexes
826 int ix = 0, lasti = -1, lastp = -1;
827 uint64_t m = 0;
828 int i = 0;
829 int j = 1; // loop counters
830 for ( i = 0; i < N; i++ ) {
831 ix = a[i]; // permutation index
832 if ( ix >= 0 ) {
833 m |= (uint64_t)1 << ix; // mask for compression source
834 for ( j = 1; j < i - lastp; j++ ) {
835 m |= (uint64_t)1 << ( lasti + j ); // dummy filling source
836 }
837 lastp = i;
838 lasti = ix;
839 }
840 }
841 return m;
842 }
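 // Worked example: for indexes {0,2,-1,-1}, compress_mask returns 0b0101;
 // vpcompress then packs source elements 0 and 2 into result positions 0 and 1.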
843
844 // expand_mask: returns a bit mask to use for expansion instruction.
845 // It is presupposed that perm_flags indicates perm_expand.
846 // Additional zeroing is needed if perm_flags indicates perm_addz2
847 template <int N>
848 constexpr uint64_t expand_mask( int const ( &a )[N] ) {
849 // a is a reference to a constexpr array of permutation indexes
850 int ix = 0, lasti = -1, lastp = -1;
851 uint64_t m = 0;
852 int i = 0;
853 int j = 1;
854 for ( i = 0; i < N; i++ ) {
855 ix = a[i]; // permutation index
856 if ( ix >= 0 ) {
857 m |= (uint64_t)1 << i; // mask for expansion destination
858 for ( j = 1; j < ix - lasti; j++ ) {
859 m |= (uint64_t)1 << ( lastp + j ); // dummy filling destination
860 }
861 lastp = i;
862 lasti = ix;
863 }
864 }
865 return m;
866 }
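 // Worked example: for indexes {-1,0,-1,1}, expand_mask returns 0b1010;
 // vpexpand then writes consecutive source elements 0 and 1 into result
 // positions 1 and 3.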
867
868 // perm16_flags: returns information about how to permute a vector of 16-bit integers
869 // Note: It is presupposed that perm_flags reports perm_same_pattern
870 // The return value is composed of these bits:
871 // 1: data from low 64 bits to low 64 bits. pattern in bit 32-39
872 // 2: data from high 64 bits to high 64 bits. pattern in bit 40-47
873 // 4: data from high 64 bits to low 64 bits. pattern in bit 48-55
874 // 8: data from low 64 bits to high 64 bits. pattern in bit 56-63
875 template <typename V>
876 constexpr uint64_t perm16_flags( int const ( &a )[V::size()] ) {
877 // a is a reference to a constexpr array of permutation indexes
878 // V is a vector class
879 constexpr int N = V::size(); // number of elements
880
881 uint64_t retval = 0; // return value
882 uint32_t pat[4] = { 0, 0, 0, 0 }; // permute patterns
883 uint32_t i = 0; // loop counter
884 int ix = 0; // index number i
885 const uint32_t lanesize = 8; // elements per lane
886 uint32_t lane = 0; // current lane
887 int lanepattern[lanesize] = { 0 }; // pattern in each lane
888
889 for ( i = 0; i < N; i++ ) {
890 ix = a[i];
891 lane = i / lanesize; // current lane
892 if ( lane == 0 ) {
893 lanepattern[i] = ix; // save pattern
894 } else if ( ix >= 0 ) { // not first lane
895 uint32_t j = i - lane * lanesize; // index into lanepattern
896 int jx = ix - lane * lanesize; // pattern within lane
897 if ( lanepattern[j] < 0 ) {
898 lanepattern[j] = jx; // pattern not known from previous lane
899 }
900 }
901 }
902 // four patterns: low2low, high2high, high2low, low2high
903 for ( i = 0; i < 4; i++ ) {
904 // loop through low pattern
905 if ( lanepattern[i] >= 0 ) {
906 if ( lanepattern[i] < 4 ) { // low2low
907 retval |= 1;
908 pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
909 } else { // high2low
910 retval |= 4;
911 pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
912 }
913 }
914 // loop through high pattern
915 if ( lanepattern[i + 4] >= 0 ) {
916 if ( lanepattern[i + 4] < 4 ) { // low2high
917 retval |= 8;
918 pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
919 } else { // high2high
920 retval |= 2;
921 pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
922 }
923 }
924 }
925 // join return data
926 for ( i = 0; i < 4; i++ ) { retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); }
927 return retval;
928 }
929
930 // pshufb_mask: return a broad byte mask for permutation within lanes
931 // for use with the pshufb instruction (_mm..._shuffle_epi8).
932 // The pshufb instruction provides fast permutation and zeroing,
933 // allowing different patterns in each lane but no crossing of lane boundaries
934 template <typename V, int oppos = 0>
935 constexpr auto pshufb_mask( int const ( &A )[V::size()] ) {
936 // Parameter A is a reference to a constexpr array of permutation indexes
937 // V is a vector class
938 // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors
939 constexpr uint32_t N = V::size(); // number of vector elements
940 constexpr uint32_t elementsize = sizeof( V ) / N; // size of each vector element
941 constexpr uint32_t nlanes = sizeof( V ) / 16; // number of 128 bit lanes in vector
942 constexpr uint32_t elements_per_lane = N / nlanes; // number of vector elements per lane
943
944 EList<int8_t, sizeof( V )> u = { { 0 } }; // list for returning
945
946 uint32_t i = 0; // loop counters
947 uint32_t j = 0;
948 int m = 0;
949 int k = 0;
950 uint32_t lane = 0;
951
952 for ( lane = 0; lane < nlanes; lane++ ) { // loop through lanes
953 for ( i = 0; i < elements_per_lane; i++ ) { // loop through elements in lane
954 // permutation index for element within lane
955 int8_t p = -1;
956 int ix = A[m];
957 if ( ix >= 0 ) {
958 ix ^= oppos * elements_per_lane; // flip bit if opposite lane
959 }
960 ix -= int( lane * elements_per_lane ); // index relative to lane
961 if ( ix >= 0 && ix < (int)elements_per_lane ) { // index points to desired lane
962 p = ix * elementsize;
963 }
964 for ( j = 0; j < elementsize; j++ ) { // loop through bytes in element
965 u.a[k++] = p < 0 ? -1 : p + j; // store byte permutation index
966 }
967 m++;
968 }
969 }
970 return u; // return encapsulated array
971 }
972
973 // largeblock_perm: return indexes for replacing a permute or blend with
974 // a certain block size by a permute or blend with the double block size.
975 // Note: it is presupposed that perm_flags() indicates perm_largeblock
976 // It is required that additional zeroing is added if perm_flags() indicates perm_addz
977 template <int N>
978 constexpr EList<int, N / 2> largeblock_perm( int const ( &a )[N] ) {
979 // Parameter a is a reference to a constexpr array of permutation indexes
980 EList<int, N / 2> list = { { 0 } }; // result indexes
981 int ix = 0; // even index
982 int iy = 0; // odd index
983 int iz = 0; // combined index
984 bool fit_addz = false; // additional zeroing needed at the lower block level
985 int i = 0; // loop counter
986
987 // check if additional zeroing is needed at current block size
988 for ( i = 0; i < N; i += 2 ) {
989 ix = a[i]; // even index
990 iy = a[i + 1]; // odd index
991 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
992 }
993
994 // loop through indexes
995 for ( i = 0; i < N; i += 2 ) {
996 ix = a[i]; // even index
997 iy = a[i + 1]; // odd index
998 if ( ix >= 0 ) {
999 iz = ix / 2; // half index
1000 } else if ( iy >= 0 ) {
1001 iz = iy / 2;
1002 } else {
1003 iz = ix | iy; // -1 or V_DC. -1 takes precedence
1004 if ( fit_addz ) iz = V_DC; // V_DC, because result will be zeroed later
1005 }
1006 list.a[i / 2] = iz; // save to list
1007 }
1008 return list;
1009 }
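 // Worked example: the 8-element pattern {2,3,0,1,6,7,4,5} consists of
 // even/odd pairs, so largeblock_perm returns {1,0,3,2}: the same permutation
 // at twice the block size (e.g. Vec4i instead of Vec8s).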
1010
1011 // blend_flags: returns information about how a blend function can be implemented
1012 // The return value is composed of these flag bits:
1013 const int blend_zeroing = 1; // needs zeroing
1014 const int blend_allzero = 2; // all is zero or don't care
1015 const int blend_largeblock = 4; // fits blend with a larger block size (e.g. blend Vec2q instead of Vec4i)
1016 const int blend_addz = 8; // additional zeroing needed after blend with larger block size or shift
1017 const int blend_a = 0x10; // has data from a
1018 const int blend_b = 0x20; // has data from b
1019 const int blend_perma = 0x40; // permutation of a needed
1020 const int blend_permb = 0x80; // permutation of b needed
1021 const int blend_cross_lane = 0x100; // permutation crossing 128-bit lanes
1022 const int blend_same_pattern = 0x200; // same permute/blend pattern in all 128-bit lanes
1023 const int blend_punpckhab = 0x1000; // pattern fits punpckh(a,b)
1024 const int blend_punpckhba = 0x2000; // pattern fits punpckh(b,a)
1025 const int blend_punpcklab = 0x4000; // pattern fits punpckl(a,b)
1026 const int blend_punpcklba = 0x8000; // pattern fits punpckl(b,a)
1027 const int blend_rotateab = 0x10000; // pattern fits palignr(a,b)
1028 const int blend_rotateba = 0x20000; // pattern fits palignr(b,a)
1029 const int blend_shufab = 0x40000; // pattern fits shufps/shufpd(a,b)
1030 const int blend_shufba = 0x80000; // pattern fits shufps/shufpd(b,a)
1031 const int blend_rotate_big = 0x100000; // pattern fits rotation across lanes. count returned in bits blend_rotpattern
1032 const int blend_outofrange = 0x10000000; // index out of range
1033 const int blend_shufpattern = 32; // pattern for shufps/shufpd is in bit blend_shufpattern to blend_shufpattern + 7
1034 const int blend_rotpattern = 40; // pattern for palignr is in bit blend_rotpattern to blend_rotpattern + 7
1035
1036 template <typename V>
1037 constexpr uint64_t blend_flags( int const ( &a )[V::size()] ) {
1038 // a is a reference to a constexpr array of permutation indexes
1039 // V is a vector class
1040 constexpr int N = V::size(); // number of elements
1041 uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value
1042 uint32_t iu = 0; // loop counter
1043 int32_t ii = 0; // loop counter
1044 int ix = 0; // index number i
1045 const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes
1046 const uint32_t lanesize = N / nlanes; // elements per lane
1047 uint32_t lane = 0; // current lane
1048 uint32_t rot = 999; // rotate left count
1049 int lanepattern[lanesize] = { 0 }; // pattern in each lane
1050 if ( lanesize == 2 && N <= 8 ) {
1051 r |= blend_shufab | blend_shufba; // check if it fits shufpd
1052 }
1053
1054 for ( ii = 0; ii < N; ii++ ) { // loop through indexes
1055 ix = a[ii]; // index
1056 if ( ix < 0 ) {
1057 if ( ix == -1 )
1058 r |= blend_zeroing; // set to zero
1059 else if ( ix != V_DC ) {
1060 r = blend_outofrange;
1061 break; // illegal index
1062 }
1063 } else { // ix >= 0
1064 r &= ~blend_allzero;
1065 if ( ix < N ) {
1066 r |= blend_a; // data from a
1067 if ( ix != ii ) r |= blend_perma; // permutation of a
1068 } else if ( ix < 2 * N ) {
1069 r |= blend_b; // data from b
1070 if ( ix != ii + N ) r |= blend_permb; // permutation of b
1071 } else {
1072 r = blend_outofrange;
1073 break; // illegal index
1074 }
1075 }
1076 // check if pattern fits a larger block size:
1077 // even indexes must be even, odd indexes must fit the preceding even index + 1
1078 if ( ( ii & 1 ) == 0 ) { // even index
1079 if ( ix >= 0 && ( ix & 1 ) ) r &= ~blend_largeblock; // not even. does not fit larger block size
1080 int iy = a[ii + 1]; // next odd index
1081 if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~blend_largeblock; // not odd. does not fit larger block size
1082 if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~blend_largeblock; // does not fit preceding index + 1
1083 if ( ix == -1 && iy >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
1084 if ( iy == -1 && ix >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
1085 }
1086 lane = (uint32_t)ii / lanesize; // current lane
1087 if ( lane == 0 ) { // first lane, or no pattern yet
1088 lanepattern[ii] = ix; // save pattern
1089 }
1090 // check if crossing lanes
1091 if ( ix >= 0 ) {
1092 uint32_t lanei = uint32_t( ix & ~N ) / lanesize; // source lane
1093 if ( lanei != lane ) {
1094 r |= blend_cross_lane; // crossing lane
1095 }
1096 if ( lanesize == 2 ) { // check if it fits shufpd
1097 if ( lanei != lane ) r &= ~( blend_shufab | blend_shufba );
1098 if ( ( ( ( ix & N ) != 0 ) ^ ii ) & 1 )
1099 r &= ~blend_shufab;
1100 else
1101 r &= ~blend_shufba;
1102 }
1103 }
1104 // check if same pattern in all lanes
1105 if ( lane != 0 && ix >= 0 ) { // not first lane
1106 int j = ii - int( lane * lanesize ); // index into lanepattern
1107 int jx = ix - int( lane * lanesize ); // pattern within lane
1108 if ( jx < 0 || ( jx & ~N ) >= (int)lanesize ) r &= ~blend_same_pattern; // source is in another lane
1109 if ( lanepattern[j] < 0 ) {
1110 lanepattern[j] = jx; // pattern not known from previous lane
1111 } else {
1112 if ( lanepattern[j] != jx ) r &= ~blend_same_pattern; // not same pattern
1113 }
1114 }
1115 }
1116 if ( !( r & blend_largeblock ) ) r &= ~blend_addz; // remove irrelevant flag
1117 if ( r & blend_cross_lane ) r &= ~blend_same_pattern; // remove irrelevant flag
1118 if ( !( r & ( blend_perma | blend_permb ) ) ) {
1119 return r; // no permutation. more checks are superfluous
1120 }
1121 if ( r & blend_same_pattern ) {
1122 // same pattern in all lanes. check if it fits unpack patterns
1123 r |= blend_punpckhab | blend_punpckhba | blend_punpcklab | blend_punpcklba; // assume fit; cleared below on mismatch
1124 for ( iu = 0; iu < lanesize; iu++ ) { // loop through lanepattern
1125 ix = lanepattern[iu];
1126 if ( ix >= 0 ) {
1127 if ( (uint32_t)ix != iu / 2 + ( iu & 1 ) * N ) r &= ~blend_punpcklab;
1128 if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpcklba;
1129 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) * N ) r &= ~blend_punpckhab;
1130 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpckhba;
1131 }
1132 }
1133#if INSTRSET >= 4 // SSSE3. check if it fits palignr
1134 for ( iu = 0; iu < lanesize; iu++ ) {
1135 ix = lanepattern[iu];
1136 if ( ix >= 0 ) {
1137 uint32_t t = ix & ~N;
1138 if ( ix & N ) t += lanesize;
1139 uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 );
1140 if ( rot == 999 ) {
1141 rot = tb;
1142 } else { // check if fit
1143 if ( rot != tb ) rot = 1000;
1144 }
1145 }
1146 }
1147 if ( rot < 999 ) { // fits palignr
1148 if ( rot < lanesize ) {
1149 r |= blend_rotateba;
1150 } else {
1151 r |= blend_rotateab;
1152 }
1153 const uint32_t elementsize = sizeof( V ) / N;
1154 r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) << blend_rotpattern;
1155 }
1156#endif
1157 if ( lanesize == 4 ) {
1158 // check if it fits shufps
1159 r |= blend_shufab | blend_shufba; // assume fit; cleared below on mismatch
1160 for ( ii = 0; ii < 2; ii++ ) {
1161 ix = lanepattern[ii];
1162 if ( ix >= 0 ) {
1163 if ( ix & N )
1164 r &= ~blend_shufab;
1165 else
1166 r &= ~blend_shufba;
1167 }
1168 }
1169 for ( ; ii < 4; ii++ ) {
1170 ix = lanepattern[ii];
1171 if ( ix >= 0 ) {
1172 if ( ix & N )
1173 r &= ~blend_shufba;
1174 else
1175 r &= ~blend_shufab;
1176 }
1177 }
1178 if ( r & ( blend_shufab | blend_shufba ) ) { // fits shufps/shufpd
1179 uint8_t shufpattern = 0; // get pattern
1180 for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
1181 r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern
1182 }
1183 }
1184 } else if ( nlanes > 1 ) { // not same pattern in all lanes
1185 rot = 999; // check if it fits big rotate
1186 for ( ii = 0; ii < N; ii++ ) {
1187 ix = a[ii];
1188 if ( ix >= 0 ) {
1189 uint32_t rot2 = ( ix + 2 * N - ii ) % ( 2 * N ); // rotate count
1190 if ( rot == 999 ) {
1191 rot = rot2; // save rotate count
1192 } else if ( rot != rot2 ) {
1193 rot = 1000;
1194 break; // does not fit big rotate
1195 }
1196 }
1197 }
1198 if ( rot < 2 * N ) { // fits big rotate
1199 r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern;
1200 }
1201 }
1202 if ( lanesize == 2 && ( r & ( blend_shufab | blend_shufba ) ) ) { // fits shufpd. Get pattern
1203 for ( ii = 0; ii < N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << ( blend_shufpattern + ii ); }
1204 }
1205 return r;
1206 }
1207
1208 // blend_perm_indexes: return an Indexlist for implementing a blend function as
1209 // two permutations. N = vector size.
1210 // dozero = 0: let unused elements be don't care. The two permutation results must be blended
1211 // dozero = 1: zero unused elements in each permutation. The two permutation results can be OR'ed
1212 // dozero = 2: indexes that are -1 or V_DC are preserved
1213 template <int N, int dozero>
1214 constexpr EList<int, 2 * N> blend_perm_indexes( int const ( &a )[N] ) {
1215 // a is a reference to a constexpr array of permutation indexes
1216 EList<int, 2 * N> list = { { 0 } }; // list to return
1217 int u = dozero ? -1 : V_DC; // value to use for unused entries
1218 int j = 0;
1219
1220 for ( j = 0; j < N; j++ ) { // loop through indexes
1221 int ix = a[j]; // current index
1222 if ( ix < 0 ) { // zero or don't care
1223 if ( dozero == 2 ) {
1224 // list.a[j] = list.a[j + N] = ix; // fails in gcc in complicated cases
1225 list.a[j] = ix;
1226 list.a[j + N] = ix;
1227 } else {
1228 // list.a[j] = list.a[j + N] = u;
1229 list.a[j] = u;
1230 list.a[j + N] = u;
1231 }
1232 } else if ( ix < N ) { // value from a
1233 list.a[j] = ix;
1234 list.a[j + N] = u;
1235 } else {
1236 list.a[j] = u; // value from b
1237 list.a[j + N] = ix - N;
1238 }
1239 }
1240 return list;
1241 }
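 // Worked example: for N = 4, dozero = 1 and blend indexes {0,5,2,7}, the
 // returned list is {0,-1,2,-1, -1,1,-1,3}: permute a by {0,-1,2,-1},
 // permute b by {-1,1,-1,3}, and OR the two results.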
1242
1243 // largeblock_indexes: return indexes for replacing a permute or blend with a
1244 // certain block size by a permute or blend with the double block size.
1245 // Note: it is presupposed that perm_flags or blend_flags indicates _largeblock
1246 // It is required that additional zeroing is added if perm_flags or blend_flags
1247 // indicates _addz
1248 template <int N>
1249 constexpr EList<int, N / 2> largeblock_indexes( int const ( &a )[N] ) {
1250 // Parameter a is a reference to a constexpr array of N permutation indexes
1251 EList<int, N / 2> list = { { 0 } }; // list to return
1252
1253 bool fit_addz = false; // additional zeroing needed at the lower block level
1254 int ix = 0; // even index
1255 int iy = 0; // odd index
1256 int iz = 0; // combined index
1257 int i = 0; // loop counter
1258
1259 for ( i = 0; i < N; i += 2 ) {
1260 ix = a[i]; // even index
1261 iy = a[i + 1]; // odd index
1262 if ( ix >= 0 ) {
1263 iz = ix / 2; // half index
1264 } else if ( iy >= 0 ) {
1265 iz = iy / 2; // half index
1266 } else
1267 iz = ix | iy; // -1 or V_DC. -1 takes precedence
1268 list.a[i / 2] = iz; // save to list
1269 // check if additional zeroing is needed at current block size
1270 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
1271 }
1272 // replace -1 by V_DC if fit_addz
1273 if ( fit_addz ) {
1274 for ( i = 0; i < N / 2; i++ ) {
1275 if ( list.a[i] < 0 ) list.a[i] = V_DC;
1276 }
1277 }
1278 return list;
1279 }
1280
1281 /****************************************************************************************
1282 *
1283 * Vector blend helper function templates
1284 *
1285 * These templates are for emulating a blend with a vector size that is not supported by
1286 * the instruction set, using multiple blends or permutations of half the vector size
1287 *
1288 ****************************************************************************************/
1289
1290 // Make dummy blend function templates to avoid error messages when the blend functions are not yet defined
1291 template <typename dummy>
1292 void blend2() {}
1293 template <typename dummy>
1294 void blend4() {}
1295 template <typename dummy>
1296 void blend8() {}
1297 template <typename dummy>
1298 void blend16() {}
1299 template <typename dummy>
1300 void blend32() {}
1301
1302 // blend_half_indexes: return an Indexlist for emulating a blend function as
1303 // blends or permutations from multiple sources
1304 // dozero = 0: let unused elements be don't care. Multiple permutation results must be blended
1305 // dozero = 1: zero unused elements in each permutation. Multiple permutation results can be OR'ed
1306 // dozero = 2: indexes that are -1 or V_DC are preserved
1307 // src1, src2: sources to blend in a partial implementation
1308 template <int N, int dozero, int src1, int src2>
1309 constexpr EList<int, N> blend_half_indexes( int const ( &a )[N] ) {
1310 // a is a reference to a constexpr array of permutation indexes
1311 EList<int, N> list = { { 0 } }; // list to return
1312 int u = dozero ? -1 : V_DC; // value to use for unused entries
1313 int j = 0; // loop counter
1314
1315 for ( j = 0; j < N; j++ ) { // loop through indexes
1316 int ix = a[j]; // current index
1317 if ( ix < 0 ) { // zero or don't care
1318 list.a[j] = ( dozero == 2 ) ? ix : u;
1319 } else {
1320 int src = ix / N; // source
1321 if ( src == src1 ) {
1322 list.a[j] = ix & ( N - 1 );
1323 } else if ( src == src2 ) {
1324 list.a[j] = ( ix & ( N - 1 ) ) + N;
1325 } else
1326 list.a[j] = u;
1327 }
1328 }
1329 return list;
1330 }
1331
1332 // selectblend: select one of four sources for blending
1333 template <typename W, int s>
1334 static inline auto selectblend( W const a, W const b ) {
1335 if constexpr ( s == 0 )
1336 return a.get_low();
1337 else if constexpr ( s == 1 )
1338 return a.get_high();
1339 else if constexpr ( s == 2 )
1340 return b.get_low();
1341 else
1342 return b.get_high();
1343 }
1344
1345 // blend_half: Emulate a blend with a vector size that is not supported
1346 // by multiple blends with half the vector size.
1347 // blend_half is called twice, to give the low and high half of the result
1348 // Parameters: W: type of full-size vector
1349 // i0...: indexes for low or high half
1350 // a, b: full size input vectors
1351 // return value: half-size vector for lower or upper part
1352 template <typename W, int... i0>
1353 auto blend_half( W const& a, W const& b ) {
1354 typedef decltype( a.get_low() ) V; // type for half-size vector
1355 constexpr int N = V::size(); // size of half-size vector
1356 static_assert( sizeof...( i0 ) == N, "wrong number of indexes in blend_half" );
1357 constexpr int ind[N] = { i0... }; // array of indexes
1358
1359 // lambda to find which of the four possible sources are used
1360 // return: EList<int, 5> containing a list of up to 4 sources. The last element is the number of sources used
1361 auto listsources = []( int const n, int const( &ind )[N] ) constexpr {
1362 bool source_used[4] = { false, false, false, false }; // list of sources used
1363 int i = 0;
1364 for ( i = 0; i < n; i++ ) {
1365 int ix = ind[i]; // index
1366 if ( ix >= 0 ) {
1367 int src = ix / n; // source used
1368 source_used[src & 3] = true;
1369 }
1370 }
1371 // return a list of sources used. The last element is the number of sources used
1372 EList<int, 5> sources = { { 0 } };
1373 int nsrc = 0; // number of sources
1374 for ( i = 0; i < 4; i++ ) {
1375 if ( source_used[i] ) { sources.a[nsrc++] = i; }
1376 }
1377 sources.a[4] = nsrc;
1378 return sources;
1379 };
1380 // list of sources used
1381 constexpr EList<int, 5> sources = listsources( N, ind );
1382 constexpr int nsrc = sources.a[4]; // number of sources used
1383
1384 if constexpr ( nsrc == 0 ) { // no sources
1385 return V( 0 );
1386 }
1387 // get indexes for the first one or two sources
1388 constexpr int uindex = ( nsrc > 2 ) ? 1 : 2; // unused elements set to zero if two blends are combined
1389 constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>( ind );
1390 V x0;
1391 V src0 = selectblend<W, sources.a[0]>( a, b ); // first source
1392 V src1 = selectblend<W, sources.a[1]>( a, b ); // second source
1393 if constexpr ( N == 2 ) {
1394 x0 = blend2<L.a[0], L.a[1]>( src0, src1 );
1395 } else if constexpr ( N == 4 ) {
1396 x0 = blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
1397 } else if constexpr ( N == 8 ) {
1398 x0 = blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
1399 } else if constexpr ( N == 16 ) {
1400 x0 = blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1401 L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
1402 } else if constexpr ( N == 32 ) {
1403 x0 = blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1404 L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
1405 L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
1406 }
1407 if constexpr ( nsrc > 2 ) { // get last one or two sources
1408 constexpr EList<int, N> M = blend_half_indexes<N, 1, sources.a[2], sources.a[3]>( ind );
1409 V x1;
1410 V src2 = selectblend<W, sources.a[2]>( a, b ); // third source
1411 V src3 = selectblend<W, sources.a[3]>( a, b ); // fourth source
1412 if constexpr ( N == 2 ) {
1413 x1 = blend2<M.a[0], M.a[1]>( src2, src3 );
1414 } else if constexpr ( N == 4 ) {
1415 x1 = blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
1416 } else if constexpr ( N == 8 ) {
1417 x1 = blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
1418 } else if constexpr ( N == 16 ) {
1419 x1 = blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1420 M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
1421 } else if constexpr ( N == 32 ) {
1422 x1 = blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1423 M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
1424 M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
1425 }
1426 x0 |= x1; // combine result of two blends. Unused elements are zero
1427 }
1428 return x0;
1429 }
1430
1431#ifdef VCL_NAMESPACE
1432}
1433#endif