instrset.h
1 /**************************** instrset.h **********************************
2  * Author: Agner Fog
3  * Date created: 2012-05-30
4  * Last modified: 2020-11-11
5  * Version: 2.01.02
6  * Project: vector class library
7  * Description:
8  * Header file for various compiler-specific tasks as well as common
9  * macros and templates. This file contains:
10  *
11  * > Selection of the supported instruction set
12  * > Defines compiler version macros
13  * > Undefines certain macros that prevent function overloading
14  * > Helper functions that depend on instruction set, compiler, or platform
15  * > Common templates for permute, blend, etc.
16  *
17  * For instructions, see vcl_manual.pdf
18  *
19  * (c) Copyright 2012-2020 Agner Fog.
20  * Apache License version 2.0 or later.
21  ******************************************************************************/
22 
23 #ifndef INSTRSET_H
24 #define INSTRSET_H 20102
25 
26 // Allow the use of floating point permute instructions on integer vectors.
27 // Some CPUs have an extra latency of 1 or 2 clock cycles for this, but
28 // it may still be faster than alternative implementations:
29 #define ALLOW_FP_PERMUTE true
30 
31 // Macro to indicate 64 bit mode
32 #if ( defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ) ) && !defined( __x86_64__ )
33 # define __x86_64__ 1 // There are many different macros for this, decide on only one
34 #endif
35 
36 // The following values of INSTRSET are currently defined:
37 // 2: SSE2
38 // 3: SSE3
39 // 4: SSSE3
40 // 5: SSE4.1
41 // 6: SSE4.2
42 // 7: AVX
43 // 8: AVX2
44 // 9: AVX512F
45 // 10: AVX512BW/DQ/VL
46 // In the future, INSTRSET = 11 may include AVX512VBMI and AVX512VBMI2, but this
47 // decision cannot be made before the market situation for CPUs with these
48 // instruction sets is known (these future instruction set extensions are already
49 // used in some VCL functions and tested with an emulator)
50 
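// Illustrative sketch (not part of the original file): application code can
// select an implementation at compile time from the INSTRSET level defined
// below, e.g.:
//
// #if INSTRSET >= 8 // compiled for AVX2 or better
// ... kernel using 256-bit integer vectors ...
// #elif INSTRSET >= 2 // at least SSE2
// ... kernel using 128-bit vectors ...
// #else
// ... scalar fallback ...
// #endif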
51 // Find instruction set from compiler macros if INSTRSET is not defined.
52 // Note: Most of these macros are not defined in Microsoft compilers
53 #ifndef INSTRSET
54 # if defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ )
55 # define INSTRSET 10
56 # elif defined( __AVX512F__ ) || defined( __AVX512__ )
57 # define INSTRSET 9
58 # elif defined( __AVX2__ )
59 # define INSTRSET 8
60 # elif defined( __AVX__ )
61 # define INSTRSET 7
62 # elif defined( __SSE4_2__ )
63 # define INSTRSET 6
64 # elif defined( __SSE4_1__ )
65 # define INSTRSET 5
66 # elif defined( __SSSE3__ )
67 # define INSTRSET 4
68 # elif defined( __SSE3__ )
69 # define INSTRSET 3
70 # elif defined( __SSE2__ ) || defined( __x86_64__ )
71 # define INSTRSET 2
72 # elif defined( __SSE__ )
73 # define INSTRSET 1
74 # elif defined( _M_IX86_FP ) // Defined in MS compiler. 1: SSE, 2: SSE2
75 # define INSTRSET _M_IX86_FP
76 # else
77 # define INSTRSET 0
78 # endif // instruction set defines
79 #endif // INSTRSET
80 
81 // Include the appropriate header file for intrinsic functions
82 #if INSTRSET > 7 // AVX2 and later
83 # if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
84 # include <x86intrin.h> // x86intrin.h includes header files for whatever instruction
85  // sets are specified on the compiler command line, such as:
86  // xopintrin.h, fma4intrin.h
87 # else
88 # include <immintrin.h> // MS/Intel version of immintrin.h covers AVX and later
89 # endif // __GNUC__
90 #elif INSTRSET == 7
91 # include <immintrin.h> // AVX
92 #elif INSTRSET == 6
93 # include <nmmintrin.h> // SSE4.2
94 #elif INSTRSET == 5
95 # include <smmintrin.h> // SSE4.1
96 #elif INSTRSET == 4
97 # include <tmmintrin.h> // SSSE3
98 #elif INSTRSET == 3
99 # include <pmmintrin.h> // SSE3
100 #elif INSTRSET == 2
101 # include <emmintrin.h> // SSE2
102 #elif INSTRSET == 1
103 # include <xmmintrin.h> // SSE
104 #endif // INSTRSET
105 
106 #if INSTRSET >= 8 && !defined( __FMA__ )
107 // Assume that all processors that have AVX2 also have FMA3
108 # if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
109 // Prevent error message in g++ and Clang when using FMA intrinsics with avx2:
110 # if !defined( DISABLE_WARNING_AVX2_WITHOUT_FMA )
111 # pragma message "It is recommended to also specify option -mfma when using -mavx2 or higher"
112 # endif
113 # elif !defined( __clang__ )
114 # define __FMA__ 1
115 # endif
116 #endif
117 
118 // AMD instruction sets
119 #if defined( __XOP__ ) || defined( __FMA4__ )
120 # ifdef __GNUC__
121 # include <x86intrin.h> // AMD XOP (Gnu)
122 # else
123 # include <ammintrin.h> // AMD XOP (Microsoft)
124 # endif // __GNUC__
125 #elif defined( __SSE4A__ ) // AMD SSE4A
126 # include <ammintrin.h>
127 #endif // __XOP__
128 
129 // FMA3 instruction set
130 #if defined( __FMA__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) ) && !defined( __INTEL_COMPILER )
131 # include <fmaintrin.h>
132 #endif // __FMA__
133 
134 // FMA4 instruction set
135 #if defined( __FMA4__ ) && ( defined( __GNUC__ ) || defined( __clang__ ) )
136 # include <fma4intrin.h> // must have both x86intrin.h and fma4intrin.h, don't know why
137 #endif // __FMA4__
138 
139 #include <stdint.h> // Define integer types with known size
140 #include <stdlib.h> // define abs(int)
141 
142 #ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler
143 # include <intrin.h> // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
144 #endif // _MSC_VER
145 
146 // functions in instrset_detect.cpp:
147 #ifdef VCL_NAMESPACE
148 namespace VCL_NAMESPACE {
149 #endif
150  int instrset_detect( void ); // tells which instruction sets are supported
151  bool hasFMA3( void ); // true if FMA3 instructions supported
152  bool hasFMA4( void ); // true if FMA4 instructions supported
153  bool hasXOP( void ); // true if XOP instructions supported
154  bool hasAVX512ER( void ); // true if AVX512ER instructions supported
155  bool hasAVX512VBMI( void ); // true if AVX512VBMI instructions supported
156  bool hasAVX512VBMI2( void ); // true if AVX512VBMI2 instructions supported
157 #ifdef VCL_NAMESPACE
158 }
159 #endif
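
// Illustrative usage (not part of the original file; assumes VCL_NAMESPACE is
// not defined, otherwise the call must be qualified): check at run time that
// the CPU supports the instruction set this translation unit was compiled
// for, to avoid illegal-instruction faults on older processors.
static inline bool vcl_instrset_supported() {
  return instrset_detect() >= INSTRSET; // detected level covers the compiled level
}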
160 
161 // functions in physical_processors.cpp:
162 int physicalProcessors( int* logical_processors = 0 );
163 
164 // GCC version
165 #if defined( __GNUC__ ) && !defined( GCC_VERSION ) && !defined( __clang__ )
166 # define GCC_VERSION ( (__GNUC__)*10000 + (__GNUC_MINOR__)*100 + ( __GNUC_PATCHLEVEL__ ) )
167 #endif
168 
169 // Clang version
170 #if defined( __clang__ )
171 # define CLANG_VERSION ( (__clang_major__)*10000 + (__clang_minor__)*100 + ( __clang_patchlevel__ ) )
172 // Problem: The version number is not consistent across platforms
173 // http://llvm.org/bugs/show_bug.cgi?id=12643
174 // Apple bug 18746972
175 #endif
176 
177 // Fix problem with non-overloadable macros named min and max in WinDef.h
178 #ifdef _MSC_VER
179 # if defined( _WINDEF_ ) && defined( min ) && defined( max )
180 # undef min
181 # undef max
182 # endif
183 # ifndef NOMINMAX
184 # define NOMINMAX
185 # endif
186 #endif
187 
188 /* Intel compiler problem:
189 The Intel compiler currently cannot compile version 2.00 of VCL. It seems to have
190 a problem with constexpr function returns not being constant enough.
191 */
192 #if defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 9999
193 # error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
194 #endif
195 
196 /* Clang problem:
197 The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical.
198 See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164
199 Additional problem: The version number is not consistent across platforms. The Apple build has
200 different version numbers. We have to rely on __apple_build_version__ on the Mac platform:
201 http://llvm.org/bugs/show_bug.cgi?id=12643
202  We will have to add switches here when - hopefully - the error has been fixed some day.
203  We need different version checks with and without __apple_build_version__
204 */
205 #if ( defined( __clang__ ) || defined( __apple_build_version__ ) ) && !defined( __INTEL_COMPILER )
206 # define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
207 #endif
208 
209 #if defined( GCC_VERSION ) && GCC_VERSION < 99999 && !defined( __clang__ )
210 # define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions
211 #endif
212 
213 #ifdef VCL_NAMESPACE
214 namespace VCL_NAMESPACE {
215 #endif
216 
217  // Constant for indicating don't care in permute and blend functions.
218  // V_DC is -256 in Vector class library version 1.xx
219  // V_DC can be any value less than -1 in Vector class library version 2.00
220  constexpr int V_DC = -256;
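
  // Illustrative use of V_DC (permute4 and the vector types are defined in
  // the vector-size headers, not in this file):
  // Vec4i b = permute4<2, V_DC, -1, 0>( a ); // b = { a[2], anything, 0, a[0] }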
221 
222  /*****************************************************************************
223  *
224  * Helper functions that depend on instruction set, compiler, or platform
225  *
226  *****************************************************************************/
227 
228  // Define interface to cpuid instruction.
229  // input: functionnumber = leaf (eax), ecxleaf = subleaf(ecx)
230  // output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
231  static inline void cpuid( int output[4], int functionnumber, int ecxleaf = 0 ) {
232 #ifdef __x86_64__
233 # if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax
234  int a, b, c, d;
235  __asm( "cpuid" : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) : "a"( functionnumber ), "c"( ecxleaf ) : );
236  output[0] = a;
237  output[1] = b;
238  output[2] = c;
239  output[3] = d;
240 
241 # elif defined( _MSC_VER ) // Microsoft compiler, intrin.h included
242  __cpuidex( output, functionnumber, ecxleaf ); // intrinsic function for CPUID
243 
244 # else // unknown platform. try inline assembly with masm/intel syntax
245  __asm {
246  mov eax, functionnumber
247  mov ecx, ecxleaf
248  cpuid;
249  mov esi, output
250  mov[esi], eax
251  mov[esi + 4], ebx
252  mov[esi + 8], ecx
253  mov[esi + 12], edx
254  }
255 # endif // compiler/platform
256 #endif // __x86_64__
257  }
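
  // Illustrative sketch (not part of the original file): reading the CPU
  // vendor string with the wrapper above. Leaf 0 returns the vendor
  // ("GenuineIntel", "AuthenticAMD", ...) in the order ebx, edx, ecx,
  // i.e. output[1], output[3], output[2]. On builds where the wrapper is
  // compiled empty, abcd stays zeroed and vendor becomes an empty string.
  static inline void cpuid_vendor( char vendor[13] ) {
    int abcd[4] = { 0, 0, 0, 0 };
    cpuid( abcd, 0 );                               // leaf 0
    const int v[3] = { abcd[1], abcd[3], abcd[2] }; // ebx, edx, ecx
    for ( int i = 0; i < 12; i++ ) vendor[i] = char( v[i / 4] >> ( ( i & 3 ) * 8 ) );
    vendor[12] = 0;
  }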
258 
259 // Define popcount function. Gives sum of bits
260 #if INSTRSET >= 6 // SSE4.2
261  // popcnt instruction is not officially part of the SSE4.2 instruction set,
262  // but available in all known processors with SSE4.2
263  static inline uint32_t vml_popcnt( uint32_t a ) {
264  return (uint32_t)_mm_popcnt_u32( a ); // Intel intrinsic. Supported by gcc and clang
265  }
266 # ifdef __x86_64__
267  static inline int64_t vml_popcnt( uint64_t a ) {
268  return _mm_popcnt_u64( a ); // Intel intrinsic.
269  }
270 # else // 32 bit mode
271  static inline int64_t vml_popcnt( uint64_t a ) {
272  return _mm_popcnt_u32( uint32_t( a >> 32 ) ) + _mm_popcnt_u32( uint32_t( a ) );
273  }
274 # endif
275 #else // no SSE4.2
276 static inline uint32_t vml_popcnt( uint32_t a ) {
277  // popcnt instruction not available
278  uint32_t b = a - ( ( a >> 1 ) & 0x55555555 );
279  uint32_t c = ( b & 0x33333333 ) + ( ( b >> 2 ) & 0x33333333 );
280  uint32_t d = ( c + ( c >> 4 ) ) & 0x0F0F0F0F;
281  uint32_t e = d * 0x01010101;
282  return e >> 24;
283 }
284 
285 static inline int32_t vml_popcnt( uint64_t a ) {
286  return vml_popcnt( uint32_t( a >> 32 ) ) + vml_popcnt( uint32_t( a ) );
287 }
288 
289 #endif
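
// The fallback above is the classic SWAR popcount: adjacent bit pairs are
// summed, then nibbles, then bytes; the multiplication by 0x01010101 adds all
// byte counts into the top byte. Example: vml_popcnt( 0xF0F0F0F0u ) == 16.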
290 
291 // Define bit-scan-forward function. Gives index to lowest set bit
292 #if defined( __GNUC__ ) || defined( __clang__ )
293  // gcc and Clang have no bit_scan_forward intrinsic
294 # if defined( __clang__ ) // fix clang bug
295  // Clang uses a k register as parameter a when inlined from horizontal_find_first
296  __attribute__( ( noinline ) )
297 # endif
298  static uint32_t
299  bit_scan_forward( uint32_t a ) {
300  uint32_t r;
301  __asm( "bsfl %1, %0" : "=r"( r ) : "r"( a ) : );
302  return r;
303  }
304  static inline uint32_t bit_scan_forward( uint64_t a ) {
305  uint32_t lo = uint32_t( a );
306  if ( lo ) return bit_scan_forward( lo );
307  uint32_t hi = uint32_t( a >> 32 );
308  return bit_scan_forward( hi ) + 32;
309  }
310 
311 #else // other compilers
312 static inline uint32_t bit_scan_forward( uint32_t a ) {
313  unsigned long r;
314  _BitScanForward( &r, a ); // defined in intrin.h for MS and Intel compilers
315  return r;
316 }
317 # ifdef __x86_64__
318 static inline uint32_t bit_scan_forward( uint64_t a ) {
319  unsigned long r;
320  _BitScanForward64( &r, a ); // defined in intrin.h for MS and Intel compilers
321  return (uint32_t)r;
322 }
323 # else
324 static inline uint32_t bit_scan_forward( uint64_t a ) {
325  uint32_t lo = uint32_t( a );
326  if ( lo ) return bit_scan_forward( lo );
327  uint32_t hi = uint32_t( a >> 32 );
328  return bit_scan_forward( hi ) + 32;
329 }
330 # endif
331 #endif
332 
333 // Define bit-scan-reverse function. Gives index to highest set bit = floor(log2(a))
334 #if defined( __GNUC__ ) || defined( __clang__ )
335  static inline uint32_t bit_scan_reverse( uint32_t a ) __attribute__( ( pure ) );
336  static inline uint32_t bit_scan_reverse( uint32_t a ) {
337  uint32_t r;
338  __asm( "bsrl %1, %0" : "=r"( r ) : "r"( a ) : );
339  return r;
340  }
341 # ifdef __x86_64__
342  static inline uint32_t bit_scan_reverse( uint64_t a ) {
343  uint64_t r;
344  __asm( "bsrq %1, %0" : "=r"( r ) : "r"( a ) : );
345  return r;
346  }
347 # else // 32 bit mode
348  static inline uint32_t bit_scan_reverse( uint64_t a ) {
349  uint64_t ahi = a >> 32;
350  if ( ahi == 0 )
351  return bit_scan_reverse( uint32_t( a ) );
352  else
353  return bit_scan_reverse( uint32_t( ahi ) ) + 32;
354  }
355 # endif
356 #else
357 static inline uint32_t bit_scan_reverse( uint32_t a ) {
358  unsigned long r;
359  _BitScanReverse( &r, a ); // defined in intrin.h for MS and Intel compilers
360  return r;
361 }
362 # ifdef __x86_64__
363 static inline uint32_t bit_scan_reverse( uint64_t a ) {
364  unsigned long r;
365  _BitScanReverse64( &r, a ); // defined in intrin.h for MS and Intel compilers
366  return r;
367 }
368 # else // 32 bit mode
369 static inline uint32_t bit_scan_reverse( uint64_t a ) {
370  uint64_t ahi = a >> 32;
371  if ( ahi == 0 )
372  return bit_scan_reverse( uint32_t( a ) );
373  else
374  return bit_scan_reverse( uint32_t( ahi ) ) + 32;
375 }
376 # endif
377 #endif
378 
379  // Same function, for compile-time constants
380  constexpr int bit_scan_reverse_const( uint64_t const n ) {
381  if ( n == 0 ) return -1;
382  uint64_t a = n, b = 0, j = 64, k = 0;
383  do {
384  j >>= 1;
385  k = (uint64_t)1 << j;
386  if ( a >= k ) {
387  a >>= j;
388  b += j;
389  }
390  } while ( j > 0 );
391  return int( b );
392  }
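
  // Illustrative checks (not part of the original file): the function can be
  // used in constant expressions.
  static_assert( bit_scan_reverse_const( 0x1000 ) == 12, "floor(log2(4096)) == 12" );
  static_assert( bit_scan_reverse_const( 1 ) == 0, "bit 0 is the highest set bit of 1" );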
393 
394  /*****************************************************************************
395  *
396  * Common templates
397  *
398  *****************************************************************************/
399 
400  // Template class to represent compile-time integer constant
401  template <int32_t n>
402  class Const_int_t {}; // represent compile-time signed integer constant
403  template <uint32_t n>
404  class Const_uint_t {}; // represent compile-time unsigned integer constant
405 #define const_int( n ) ( Const_int_t<n>() ) // n must be compile-time integer constant
406 #define const_uint( n ) ( Const_uint_t<n>() ) // n must be compile-time unsigned integer constant
407 
408  // template for producing quiet NAN
409  template <class VTYPE>
410  static inline VTYPE nan_vec( uint32_t payload = 0x100 ) {
411  if constexpr ( ( VTYPE::elementtype() & 1 ) != 0 ) { // double
412  union {
413  uint64_t q;
414  double f;
415  } ud;
416  // The payload is left-justified to avoid losing the NAN payload when converting to float
417  ud.q = 0x7FF8000000000000 | uint64_t( payload ) << 29;
418  return VTYPE( ud.f );
419  }
420  // float will be converted to double if necessary
421  union {
422  uint32_t i;
423  float f;
424  } uf;
425  uf.i = 0x7FC00000 | ( payload & 0x003FFFFF );
426  return VTYPE( uf.f );
427  }
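
  // Illustrative usage (not part of the original file; vector types such as
  // Vec4f are defined in the vector-size headers): nan_vec<Vec4f>( 5 ) returns
  // a vector of quiet NANs carrying payload 5, which can serve to identify
  // where an error was produced.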
428 
429  // Test if a parameter is a compile-time constant
430  /* Unfortunately, this works only for macro parameters, not for inline function parameters.
431  I hope that some solution will appear in the future, but for now it appears to be
432  impossible to check if a function parameter is a compile-time constant.
433  This would be useful in operator / and in function pow:
434  #if defined(__GNUC__) || defined (__clang__)
435  #define is_constant(a) __builtin_constant_p(a)
436  #else
437  #define is_constant(a) false
438  #endif
439  */
440 
441  /*****************************************************************************
442  *
443  * Helper functions for permute and blend functions
444  *
445  ******************************************************************************
446  Rules for constexpr functions:
447 
448  > All variable declarations must include initialization
449 
450  > Do not put variable declarations inside a for-clause, e.g. avoid: for (int i=0; ..
451  Instead, you have to declare the loop counter before the for-loop.
452 
453  > Do not make constexpr functions that return vector types. This requires type
454  punning with a union, which is not allowed in constexpr functions under C++17.
455  It may be possible under C++20
456 
457  *****************************************************************************/
458 
459  // Define type for Encapsulated array to use as return type:
460  template <typename T, int N>
461  struct EList {
462  T a[N];
463  };
464 
465  // get_inttype: get an integer of a size that matches the element size
466  // of vector class V with the value -1
467  template <typename V>
468  constexpr auto get_inttype() {
469  constexpr int elementsize = sizeof( V ) / V::size(); // size of vector elements
470 
471  if constexpr ( elementsize >= 8 ) {
472  return -int64_t( 1 );
473  } else if constexpr ( elementsize >= 4 ) {
474  return int32_t( -1 );
475  } else if constexpr ( elementsize >= 2 ) {
476  return int16_t( -1 );
477  } else {
478  return int8_t( -1 );
479  }
480  }
481 
482  // zero_mask: return a compact bit mask for zeroing, for use with AVX512 masked instructions.
483  // Parameter a is a reference to a constexpr int array of permutation indexes
484  template <int N>
485  constexpr auto zero_mask( int const ( &a )[N] ) {
486  uint64_t mask = 0;
487  int i = 0;
488 
489  for ( i = 0; i < N; i++ ) {
490  if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i;
491  }
492  if constexpr ( N <= 8 )
493  return uint8_t( mask );
494  else if constexpr ( N <= 16 )
495  return uint16_t( mask );
496  else if constexpr ( N <= 32 )
497  return uint32_t( mask );
498  else
499  return mask;
500  }
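
  // Illustrative check (demo_zm is a hypothetical pattern, not part of the
  // original file): indexes >= 0 become 1-bits, so elements 0 and 2 are kept
  // while elements 1 and 3 are zeroed.
  constexpr int demo_zm[4] = { 0, -1, 2, -1 };
  static_assert( zero_mask( demo_zm ) == 0x05, "compact zeroing mask is 0b0101" );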
501 
502  // zero_mask_broad: return a broad byte mask for zeroing.
503  // Parameter a is a reference to a constexpr int array of permutation indexes
504  template <typename V>
505  constexpr auto zero_mask_broad( int const ( &A )[V::size()] ) {
506  constexpr int N = V::size(); // number of vector elements
507  typedef decltype( get_inttype<V>() ) Etype; // element type
508  EList<Etype, N> u = { { 0 } }; // list for return
509  int i = 0;
510  for ( i = 0; i < N; i++ ) { u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0; }
511  return u; // return encapsulated array
512  }
513 
514  // make_bit_mask: return a compact mask of bits from a list of N indexes:
515  // B contains options indicating how to gather the mask
516  // bit 0-7 in B indicates which bit in each index to collect
517  // bit 8 = 0x100: set 1 in the lower half of the bit mask if the indicated bit is 1.
518  // bit 8 = 0 : set 1 in the lower half of the bit mask if the indicated bit is 0.
519  // bit 9 = 0x200: set 1 in the upper half of the bit mask if the indicated bit is 1.
520  // bit 9 = 0 : set 1 in the upper half of the bit mask if the indicated bit is 0.
521  // bit 10 = 0x400: set 1 in the bit mask if the corresponding index is -1 or V_DC
522  // Parameter a is a reference to a constexpr int array of permutation indexes
523  template <int N, int B>
524  constexpr uint64_t make_bit_mask( int const ( &a )[N] ) {
525  uint64_t r = 0; // return value
526  uint8_t j = uint8_t( B & 0xFF ); // index to selected bit
527  uint64_t s = 0; // bit number i in r
528  uint64_t f = 0; // 1 if bit not flipped
529  int i = 0;
530  for ( i = 0; i < N; i++ ) {
531  int ix = a[i];
532  if ( ix < 0 ) { // -1 or V_DC
533  s = ( B >> 10 ) & 1;
534  } else {
535  s = ( (uint32_t)ix >> j ) & 1; // extract selected bit
536  if ( i < N / 2 ) {
537  f = ( B >> 8 ) & 1; // lower half
538  } else {
539  f = ( B >> 9 ) & 1; // upper half
540  }
541  s ^= f ^ 1; // flip bit if needed
542  }
543  r |= uint64_t( s ) << i; // set bit in return value
544  }
545  return r;
546  }
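
  // Worked example (demo_bm is hypothetical, not part of the original file):
  // with B = 0x100, bit 0 of each index is collected; the lower half is taken
  // as-is and the upper half is inverted, so {1, 0, 1, 0} gives 0b1001 = 9.
  constexpr int demo_bm[4] = { 1, 0, 1, 0 };
  static_assert( make_bit_mask<4, 0x100>( demo_bm ) == 9, "see the bit description above" );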
547 
548  // make_broad_mask: Convert a bit mask m to a broad mask
549  // The return value will be a broad boolean mask with elementsize matching vector class V
550  template <typename V>
551  constexpr auto make_broad_mask( uint64_t const m ) {
552  constexpr int N = V::size(); // number of vector elements
553  typedef decltype( get_inttype<V>() ) Etype; // element type
554  EList<Etype, N> u = { { 0 } }; // list for returning
555  int i = 0;
556  for ( i = 0; i < N; i++ ) { u.a[i] = ( ( m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; }
557  return u; // return encapsulated array
558  }
559 
560  // perm_mask_broad: return a mask for permutation by a vector register index.
561  // Parameter A is a reference to a constexpr int array of permutation indexes
562  template <typename V>
563  constexpr auto perm_mask_broad( int const ( &A )[V::size()] ) {
564  constexpr int N = V::size(); // number of vector elements
565  typedef decltype( get_inttype<V>() ) Etype; // vector element type
566  EList<Etype, N> u = { { 0 } }; // list for returning
567  int i = 0;
568  for ( i = 0; i < N; i++ ) { u.a[i] = Etype( A[i] ); }
569  return u; // return encapsulated array
570  }
571 
572  // perm_flags: returns information about how a permute can be implemented.
573  // The return value is composed of these flag bits:
574  const int perm_zeroing = 1; // needs zeroing
575  const int perm_perm = 2; // permutation needed
576  const int perm_allzero = 4; // all is zero or don't care
577  const int perm_largeblock = 8; // fits permute with a larger block size (e.g permute Vec2q instead of Vec4i)
578  const int perm_addz = 0x10; // additional zeroing needed after permute with larger block size or shift
579  const int perm_addz2 = 0x20; // additional zeroing needed after perm_zext, perm_compress, or perm_expand
580  const int perm_cross_lane = 0x40; // permutation crossing 128-bit lanes
581  const int perm_same_pattern = 0x80; // same permute pattern in all 128-bit lanes
582  const int perm_punpckh = 0x100; // permutation pattern fits punpckh instruction
583  const int perm_punpckl = 0x200; // permutation pattern fits punpckl instruction
584  const int perm_rotate =
585  0x400; // permutation pattern fits rotation within lanes. 4 bit count returned in bit perm_rot_count
586  const int perm_shright =
587  0x1000; // permutation pattern fits shift right within lanes. 4 bit count returned in bit perm_rot_count
588  const int perm_shleft =
589  0x2000; // permutation pattern fits shift left within lanes. negative count returned in bit perm_rot_count
590  const int perm_rotate_big =
591  0x4000; // permutation pattern fits rotation across lanes. 6 bit count returned in bit perm_rot_count
592  const int perm_broadcast = 0x8000; // permutation pattern fits broadcast of a single element.
593  const int perm_zext = 0x10000; // permutation pattern fits zero extension
594  const int perm_compress = 0x20000; // permutation pattern fits vpcompress instruction
595  const int perm_expand = 0x40000; // permutation pattern fits vpexpand instruction
596  const int perm_outofrange = 0x10000000; // index out of range
597  const int perm_rot_count = 32; // rotate or shift count is in bits perm_rot_count to perm_rot_count+3
598  const int perm_ipattern =
599  40; // pattern for pshufd is in bit perm_ipattern to perm_ipattern + 7 if perm_same_pattern and elementsize >= 4
600 
601  template <typename V>
602  constexpr uint64_t perm_flags( int const ( &a )[V::size()] ) {
603  // a is a reference to a constexpr array of permutation indexes
604  // V is a vector class
605  constexpr int N = V::size(); // number of elements
606  uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value
607  uint32_t i = 0; // loop counter
608  int j = 0; // loop counter
609  int ix = 0; // index number i
610  const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes
611  const uint32_t lanesize = N / nlanes; // elements per lane
612  const uint32_t elementsize = sizeof( V ) / N; // size of each vector element
613  uint32_t lane = 0; // current lane
614  uint32_t rot = 999; // rotate left count
615  int32_t broadc = 999; // index to broadcasted element
616  uint32_t patfail = 0; // remember certain patterns that do not fit
617  uint32_t addz2 = 0; // remember certain patterns need extra zeroing
618  int32_t compresslasti = -1; // last index in perm_compress fit
619  int32_t compresslastp = -1; // last position in perm_compress fit
620  int32_t expandlasti = -1; // last index in perm_expand fit
621  int32_t expandlastp = -1; // last position in perm_expand fit
622 
623  int lanepattern[lanesize] = { 0 }; // pattern in each lane
624 
625  for ( i = 0; i < N; i++ ) { // loop through indexes
626  ix = a[i]; // current index
627  // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute.
628  if ( ix == -1 ) {
629  r |= perm_zeroing; // zeroing requested
630  } else if ( ix != V_DC && uint32_t( ix ) >= N ) {
631  r |= perm_outofrange; // index out of range
632  }
633  if ( ix >= 0 ) {
634  r &= ~perm_allzero; // not all zero
635  if ( ix != (int)i ) r |= perm_perm; // needs permutation
636  if ( broadc == 999 )
637  broadc = ix; // remember broadcast index
638  else if ( broadc != ix )
639  broadc = 1000; // does not fit broadcast
640  }
641  // check if pattern fits a larger block size:
642  // even indexes must be even, odd indexes must fit the preceding even index + 1
643  if ( ( i & 1 ) == 0 ) { // even index
644  if ( ix >= 0 && ( ix & 1 ) ) r &= ~perm_largeblock; // not even. does not fit larger block size
645  int iy = a[i + 1]; // next odd index
646  if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~perm_largeblock; // not odd. does not fit larger block size
647  if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~perm_largeblock; // does not fit preceding index + 1
648  if ( ix == -1 && iy >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
649  if ( iy == -1 && ix >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
650  }
651  lane = i / lanesize; // current lane
652  if ( lane == 0 ) { // first lane, or no pattern yet
653  lanepattern[i] = ix; // save pattern
654  }
655  // check if crossing lanes
656  if ( ix >= 0 ) {
657  uint32_t lanei = (uint32_t)ix / lanesize; // source lane
658  if ( lanei != lane ) r |= perm_cross_lane; // crossing lane
659  }
660  // check if same pattern in all lanes
661  if ( lane != 0 && ix >= 0 ) { // not first lane
662  int j1 = i - int( lane * lanesize ); // index into lanepattern
663  int jx = ix - int( lane * lanesize ); // pattern within lane
664  if ( jx < 0 || jx >= (int)lanesize ) r &= ~perm_same_pattern; // source is in another lane
665  if ( lanepattern[j1] < 0 ) {
666  lanepattern[j1] = jx; // pattern not known from previous lane
667  } else {
668  if ( lanepattern[j1] != jx ) r &= ~perm_same_pattern; // not same pattern
669  }
670  }
671  if ( ix >= 0 ) {
672  // check if pattern fits zero extension (perm_zext)
673  if ( uint32_t( ix * 2 ) != i ) {
674  patfail |= 1; // does not fit zero extension
675  }
676  // check if pattern fits compress (perm_compress)
677  if ( ix > compresslasti && ix - compresslasti >= (int)i - compresslastp ) {
678  if ( (int)i - compresslastp > 1 ) addz2 |= 2; // perm_compress may need additional zeroing
679  compresslasti = ix;
680  compresslastp = i;
681  } else {
682  patfail |= 2; // does not fit perm_compress
683  }
684  // check if pattern fits expand (perm_expand)
685  if ( ix > expandlasti && ix - expandlasti <= (int)i - expandlastp ) {
686  if ( ix - expandlasti > 1 ) addz2 |= 4; // perm_expand may need additional zeroing
687  expandlasti = ix;
688  expandlastp = i;
689  } else {
690  patfail |= 4; // does not fit perm_expand
691  }
692  } else if ( ix == -1 ) {
693  if ( ( i & 1 ) == 0 ) addz2 |= 1; // zero extension needs additional zeroing
694  }
695  }
696  if ( !( r & perm_perm ) ) return r; // more checks are superfluous
697 
698  if ( !( r & perm_largeblock ) ) r &= ~perm_addz; // remove irrelevant flag
699  if ( r & perm_cross_lane ) r &= ~perm_same_pattern; // remove irrelevant flag
700  if ( ( patfail & 1 ) == 0 ) {
701  r |= perm_zext; // fits zero extension
702  if ( ( addz2 & 1 ) != 0 ) r |= perm_addz2;
703  } else if ( ( patfail & 2 ) == 0 ) {
704  r |= perm_compress; // fits compression
705  if ( ( addz2 & 2 ) != 0 ) { // check if additional zeroing needed
706  for ( j = 0; j < compresslastp; j++ ) {
707  if ( a[j] == -1 ) r |= perm_addz2;
708  }
709  }
710  } else if ( ( patfail & 4 ) == 0 ) {
711  r |= perm_expand; // fits expansion
712  if ( ( addz2 & 4 ) != 0 ) { // check if additional zeroing needed
713  for ( j = 0; j < expandlastp; j++ ) {
714  if ( a[j] == -1 ) r |= perm_addz2;
715  }
716  }
717  }
718 
719  if ( r & perm_same_pattern ) {
720  // same pattern in all lanes. check if it fits specific patterns
721  bool fit = true;
722  // fit shift or rotate
723  for ( i = 0; i < lanesize; i++ ) {
724  if ( lanepattern[i] >= 0 ) {
725  uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize;
726  if ( rot == 999 ) {
727  rot = rot1;
728  } else { // check if fit
729  if ( rot != rot1 ) fit = false;
730  }
731  }
732  }
733  rot &= lanesize - 1; // prevent out of range values
734  if ( fit ) { // fits rotate, and possibly shift
735  uint64_t rot2 = ( rot * elementsize ) & 0xF; // rotate right count in bytes
736  r |= rot2 << perm_rot_count; // put shift/rotate count in output bit 16-19
737 #if INSTRSET >= 4 // SSSE3
738  r |= perm_rotate; // allow palignr
739 #endif
740  // fit shift left
741  fit = true;
742  for ( i = 0; i < lanesize - rot; i++ ) { // check if the first lanesize-rot elements are zero or don't care
743  if ( lanepattern[i] >= 0 ) fit = false;
744  }
745  if ( fit ) {
746  r |= perm_shleft;
747  for ( ; i < lanesize; i++ )
748  if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
749  }
750  // fit shift right
751  fit = true;
752  for ( i = lanesize - (uint32_t)rot; i < lanesize;
753  i++ ) { // check if the last rot elements are zero or don't care
754  if ( lanepattern[i] >= 0 ) fit = false;
755  }
756  if ( fit ) {
757  r |= perm_shright;
758  for ( i = 0; i < lanesize - rot; i++ ) {
759  if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
760  }
761  }
762  }
763  // fit punpckhi
764  fit = true;
765  uint32_t j2 = lanesize / 2;
766  for ( i = 0; i < lanesize; i++ ) {
767  if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
768  if ( ( i & 1 ) != 0 ) j2++;
769  }
770  if ( fit ) r |= perm_punpckh;
771  // fit punpcklo
772  fit = true;
773  j2 = 0;
774  for ( i = 0; i < lanesize; i++ ) {
775  if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
776  if ( ( i & 1 ) != 0 ) j2++;
777  }
778  if ( fit ) r |= perm_punpckl;
779  // fit pshufd
780  if ( elementsize >= 4 ) {
781  uint64_t p = 0;
782  for ( i = 0; i < lanesize; i++ ) {
783  if ( lanesize == 4 ) {
784  p |= ( lanepattern[i] & 3 ) << 2 * i;
785  } else { // lanesize = 2
786  p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i;
787  }
788  }
789  r |= p << perm_ipattern;
790  }
791  }
792 #if INSTRSET >= 7
793  else { // not same pattern in all lanes
794  if constexpr ( nlanes > 1 ) { // Try if it fits big rotate
795  for ( i = 0; i < N; i++ ) {
796  ix = a[i];
797  if ( ix >= 0 ) {
798  uint32_t rot2 = ( ix + N - i ) % N; // rotate count
799  if ( rot == 999 ) {
800  rot = rot2; // save rotate count
801  } else if ( rot != rot2 ) {
802  rot = 1000;
803  break; // does not fit big rotate
804  }
805  }
806  }
807  if ( rot < N ) { // fits big rotate
808  r |= perm_rotate_big | (uint64_t)rot << perm_rot_count;
809  }
810  }
811  }
812 #endif
813  if ( broadc < 999 && ( r & ( perm_rotate | perm_shright | perm_shleft | perm_rotate_big ) ) == 0 ) {
814  r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast
815  }
816  return r;
817  }
818 
819  // compress_mask: returns a bit mask to use for compression instruction.
820  // It is presupposed that perm_flags indicates perm_compress.
821  // Additional zeroing is needed if perm_flags indicates perm_addz2
822  template <int N>
823  constexpr uint64_t compress_mask( int const ( &a )[N] ) {
824  // a is a reference to a constexpr array of permutation indexes
825  int ix = 0, lasti = -1, lastp = -1;
826  uint64_t m = 0;
827  int i = 0;
828  int j = 1; // loop counters
829  for ( i = 0; i < N; i++ ) {
830  ix = a[i]; // permutation index
831  if ( ix >= 0 ) {
832  m |= (uint64_t)1 << ix; // mask for compression source
833  for ( j = 1; j < i - lastp; j++ ) {
834  m |= (uint64_t)1 << ( lasti + j ); // dummy filling source
835  }
836  lastp = i;
837  lasti = ix;
838  }
839  }
840  return m;
841  }
842 
843  // expand_mask: returns a bit mask to use for expansion instruction.
844  // It is presupposed that perm_flags indicates perm_expand.
845  // Additional zeroing is needed if perm_flags indicates perm_addz2
846  template <int N>
847  constexpr uint64_t expand_mask( int const ( &a )[N] ) {
848  // a is a reference to a constexpr array of permutation indexes
849  int ix = 0, lasti = -1, lastp = -1;
850  uint64_t m = 0;
851  int i = 0;
852  int j = 1;
853  for ( i = 0; i < N; i++ ) {
854  ix = a[i]; // permutation index
855  if ( ix >= 0 ) {
856  m |= (uint64_t)1 << i; // mask for expansion destination
857  for ( j = 1; j < ix - lasti; j++ ) {
858  m |= (uint64_t)1 << ( lastp + j ); // dummy filling destination
859  }
860  lastp = i;
861  lasti = ix;
862  }
863  }
864  return m;
865  }
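
  // Worked examples (the demo arrays are hypothetical, not part of the
  // original file). For compression the mask marks the source elements to
  // gather: pattern {1, 3, -1, -1} selects elements 1 and 3, mask 0b1010.
  // For expansion the mask marks the destination positions: pattern
  // {-1, 0, 1, -1} writes into positions 1 and 2, mask 0b0110.
  constexpr int demo_comp[4] = { 1, 3, -1, -1 };
  constexpr int demo_exp[4]  = { -1, 0, 1, -1 };
  static_assert( compress_mask( demo_comp ) == 0x0A, "gather source elements 1 and 3" );
  static_assert( expand_mask( demo_exp ) == 0x06, "scatter into positions 1 and 2" );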
866 
867  // perm16_flags: returns information about how to permute a vector of 16-bit integers
868  // Note: It is presupposed that perm_flags reports perm_same_pattern
869  // The return value is composed of these bits:
870  // 1: data from low 64 bits to low 64 bits. pattern in bit 32-39
871  // 2: data from high 64 bits to high 64 bits. pattern in bit 40-47
872  // 4: data from high 64 bits to low 64 bits. pattern in bit 48-55
873  // 8: data from low 64 bits to high 64 bits. pattern in bit 56-63
874  template <typename V>
875  constexpr uint64_t perm16_flags( int const ( &a )[V::size()] ) {
876  // a is a reference to a constexpr array of permutation indexes
877  // V is a vector class
878  constexpr int N = V::size(); // number of elements
879 
880  uint64_t retval = 0; // return value
881  uint32_t pat[4] = { 0, 0, 0, 0 }; // permute patterns
882  uint32_t i = 0; // loop counter
883  int ix = 0; // index number i
884  const uint32_t lanesize = 8; // elements per lane
885  uint32_t lane = 0; // current lane
886  int lanepattern[lanesize] = { 0 }; // pattern in each lane
887 
888  for ( i = 0; i < N; i++ ) {
889  ix = a[i];
890  lane = i / lanesize; // current lane
891  if ( lane == 0 ) {
892  lanepattern[i] = ix; // save pattern
893  } else if ( ix >= 0 ) { // not first lane
894  uint32_t j = i - lane * lanesize; // index into lanepattern
895  int jx = ix - lane * lanesize; // pattern within lane
896  if ( lanepattern[j] < 0 ) {
897  lanepattern[j] = jx; // pattern not known from previous lane
898  }
899  }
900  }
901  // four patterns: low2low, high2high, high2low, low2high
902  for ( i = 0; i < 4; i++ ) {
903  // loop through low pattern
904  if ( lanepattern[i] >= 0 ) {
905  if ( lanepattern[i] < 4 ) { // low2low
906  retval |= 1;
907  pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
908  } else { // high2low
909  retval |= 4;
910  pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
911  }
912  }
913  // loop through high pattern
914  if ( lanepattern[i + 4] >= 0 ) {
915  if ( lanepattern[i + 4] < 4 ) { // low2high
916  retval |= 8;
917  pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
918  } else { // high2high
919  retval |= 2;
920  pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
921  }
922  }
923  }
924  // join return data
925  for ( i = 0; i < 4; i++ ) { retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); }
926  return retval;
927  }
928 
929  // pshufb_mask: return a broad byte mask for permutation within lanes
930  // for use with the pshufb instruction (_mm..._shuffle_epi8).
931  // The pshufb instruction provides fast permutation and zeroing,
932  // allowing different patterns in each lane but no crossing of lane boundaries
933  template <typename V, int oppos = 0>
934  constexpr auto pshufb_mask( int const ( &A )[V::size()] ) {
935  // Parameter a is a reference to a constexpr array of permutation indexes
936  // V is a vector class
937  // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors
938  constexpr uint32_t N = V::size(); // number of vector elements
939  constexpr uint32_t elementsize = sizeof( V ) / N; // size of each vector element
940  constexpr uint32_t nlanes = sizeof( V ) / 16; // number of 128 bit lanes in vector
941  constexpr uint32_t elements_per_lane = N / nlanes; // number of vector elements per lane
942 
943  EList<int8_t, sizeof( V )> u = { { 0 } }; // list for returning
944 
945  uint32_t i = 0; // loop counters
946  uint32_t j = 0;
947  int m = 0;
948  int k = 0;
949  uint32_t lane = 0;
950 
951  for ( lane = 0; lane < nlanes; lane++ ) { // loop through lanes
952  for ( i = 0; i < elements_per_lane; i++ ) { // loop through elements in lane
953  // permutation index for element within lane
954  int8_t p = -1;
955  int ix = A[m];
956  if ( ix >= 0 ) {
957  ix ^= oppos * elements_per_lane; // flip bit if opposite lane
958  }
959  ix -= int( lane * elements_per_lane ); // index relative to lane
960  if ( ix >= 0 && ix < (int)elements_per_lane ) { // index points to desired lane
961  p = ix * elementsize;
962  }
963  for ( j = 0; j < elementsize; j++ ) { // loop through bytes in element
964  u.a[k++] = p < 0 ? -1 : p + j; // store byte permutation index
965  }
966  m++;
967  }
968  }
969  return u; // return encapsulated array
970  }
971 
972  // largeblock_perm: return indexes for replacing a permute or blend with
973  // a certain block size by a permute or blend with the double block size.
974  // Note: it is presupposed that perm_flags() indicates perm_largeblock
975  // It is required that additional zeroing is added if perm_flags() indicates perm_addz
976  template <int N>
977  constexpr EList<int, N / 2> largeblock_perm( int const ( &a )[N] ) {
978  // Parameter a is a reference to a constexpr array of permutation indexes
979  EList<int, N / 2> list = { { 0 } }; // result indexes
980  int ix = 0; // even index
981  int iy = 0; // odd index
982  int iz = 0; // combined index
983  bool fit_addz = false; // additional zeroing needed at the lower block level
984  int i = 0; // loop counter
985 
986  // check if additional zeroing is needed at current block size
987  for ( i = 0; i < N; i += 2 ) {
988  ix = a[i]; // even index
989  iy = a[i + 1]; // odd index
990  if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
991  }
992 
993  // loop through indexes
994  for ( i = 0; i < N; i += 2 ) {
995  ix = a[i]; // even index
996  iy = a[i + 1]; // odd index
997  if ( ix >= 0 ) {
998  iz = ix / 2; // half index
999  } else if ( iy >= 0 ) {
1000  iz = iy / 2;
1001  } else {
1002  iz = ix | iy; // -1 or V_DC. -1 takes precedence
1003  if ( fit_addz ) iz = V_DC; // V_DC, because result will be zeroed later
1004  }
1005  list.a[i / 2] = iz; // save to list
1006  }
1007  return list;
1008  }
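
  // Worked example (demo_lbp is hypothetical, not part of the original file):
  // swapping the two halves of a 4-element vector, {2, 3, 0, 1}, becomes the
  // 2-element permutation {1, 0} at the doubled block size.
  constexpr int demo_lbp[4] = { 2, 3, 0, 1 };
  static_assert( largeblock_perm( demo_lbp ).a[0] == 1 && largeblock_perm( demo_lbp ).a[1] == 0,
                 "pairs (2,3) and (0,1) map to block indexes 1 and 0" );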
1009 
1010  // blend_flags: returns information about how a blend function can be implemented
1011  // The return value is composed of these flag bits:
1012  const int blend_zeroing = 1; // needs zeroing
1013  const int blend_allzero = 2; // all is zero or don't care
1014  const int blend_largeblock = 4; // fits blend with a larger block size (e.g permute Vec2q instead of Vec4i)
1015  const int blend_addz = 8; // additional zeroing needed after blend with larger block size or shift
1016  const int blend_a = 0x10; // has data from a
1017  const int blend_b = 0x20; // has data from b
1018  const int blend_perma = 0x40; // permutation of a needed
1019  const int blend_permb = 0x80; // permutation of b needed
1020  const int blend_cross_lane = 0x100; // permutation crossing 128-bit lanes
1021  const int blend_same_pattern = 0x200; // same permute/blend pattern in all 128-bit lanes
1022  const int blend_punpckhab = 0x1000; // pattern fits punpckh(a,b)
1023  const int blend_punpckhba = 0x2000; // pattern fits punpckh(b,a)
1024  const int blend_punpcklab = 0x4000; // pattern fits punpckl(a,b)
1025  const int blend_punpcklba = 0x8000; // pattern fits punpckl(b,a)
1026  const int blend_rotateab = 0x10000; // pattern fits palignr(a,b)
1027  const int blend_rotateba = 0x20000; // pattern fits palignr(b,a)
1028  const int blend_shufab = 0x40000; // pattern fits shufps/shufpd(a,b)
1029  const int blend_shufba = 0x80000; // pattern fits shufps/shufpd(b,a)
1030  const int blend_rotate_big = 0x100000; // pattern fits rotation across lanes. count returned in bits blend_rotpattern
1031  const int blend_outofrange = 0x10000000; // index out of range
1032  const int blend_shufpattern = 32; // pattern for shufps/shufpd is in bit blend_shufpattern to blend_shufpattern + 7
1033  const int blend_rotpattern = 40; // pattern for palignr is in bit blend_rotpattern to blend_rotpattern + 7
1034 
1035  template <typename V>
1036  constexpr uint64_t blend_flags( int const ( &a )[V::size()] ) {
1037  // a is a reference to a constexpr array of permutation indexes
1038  // V is a vector class
1039  constexpr int N = V::size(); // number of elements
1040  uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value
1041  uint32_t iu = 0; // loop counter
1042  int32_t ii = 0; // loop counter
1043  int ix = 0; // index number i
1044  const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes
1045  const uint32_t lanesize = N / nlanes; // elements per lane
1046  uint32_t lane = 0; // current lane
1047  uint32_t rot = 999; // rotate left count
1048  int lanepattern[lanesize] = { 0 }; // pattern in each lane
1049  if ( lanesize == 2 && N <= 8 ) {
1050  r |= blend_shufab | blend_shufba; // check if it fits shufpd
1051  }
1052 
1053  for ( ii = 0; ii < N; ii++ ) { // loop through indexes
1054  ix = a[ii]; // index
1055  if ( ix < 0 ) {
1056  if ( ix == -1 )
1057  r |= blend_zeroing; // set to zero
1058  else if ( ix != V_DC ) {
1059  r = blend_outofrange;
1060  break; // illegal index
1061  }
1062  } else { // ix >= 0
1063  r &= ~blend_allzero;
1064  if ( ix < N ) {
1065  r |= blend_a; // data from a
1066  if ( ix != ii ) r |= blend_perma; // permutation of a
1067  } else if ( ix < 2 * N ) {
1068  r |= blend_b; // data from b
1069  if ( ix != ii + N ) r |= blend_permb; // permutation of b
1070  } else {
1071  r = blend_outofrange;
1072  break; // illegal index
1073  }
1074  }
1075  // check if pattern fits a larger block size:
1076  // even indexes must be even, odd indexes must fit the preceding even index + 1
1077  if ( ( ii & 1 ) == 0 ) { // even index
1078  if ( ix >= 0 && ( ix & 1 ) ) r &= ~blend_largeblock; // not even. does not fit larger block size
1079  int iy = a[ii + 1]; // next odd index
1080  if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~blend_largeblock; // not odd. does not fit larger block size
1081  if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~blend_largeblock; // does not fit preceding index + 1
1082  if ( ix == -1 && iy >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
1083  if ( iy == -1 && ix >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
1084  }
1085  lane = (uint32_t)ii / lanesize; // current lane
1086  if ( lane == 0 ) { // first lane, or no pattern yet
1087  lanepattern[ii] = ix; // save pattern
1088  }
1089  // check if crossing lanes
1090  if ( ix >= 0 ) {
1091  uint32_t lanei = uint32_t( ix & ~N ) / lanesize; // source lane
1092  if ( lanei != lane ) {
1093  r |= blend_cross_lane; // crossing lane
1094  }
1095  if ( lanesize == 2 ) { // check if it fits shufpd
1096  if ( lanei != lane ) r &= ~( blend_shufab | blend_shufba );
1097  if ( ( ( ( ix & N ) != 0 ) ^ ii ) & 1 )
1098  r &= ~blend_shufab;
1099  else
1100  r &= ~blend_shufba;
1101  }
1102  }
1103  // check if same pattern in all lanes
1104  if ( lane != 0 && ix >= 0 ) { // not first lane
1105  int j = ii - int( lane * lanesize ); // index into lanepattern
1106  int jx = ix - int( lane * lanesize ); // pattern within lane
1107  if ( jx < 0 || ( jx & ~N ) >= (int)lanesize ) r &= ~blend_same_pattern; // source is in another lane
1108  if ( lanepattern[j] < 0 ) {
1109  lanepattern[j] = jx; // pattern not known from previous lane
1110  } else {
1111  if ( lanepattern[j] != jx ) r &= ~blend_same_pattern; // not same pattern
1112  }
1113  }
1114  }
1115  if ( !( r & blend_largeblock ) ) r &= ~blend_addz; // remove irrelevant flag
1116  if ( r & blend_cross_lane ) r &= ~blend_same_pattern; // remove irrelevant flag
1117  if ( !( r & ( blend_perma | blend_permb ) ) ) {
1118  return r; // no permutation. more checks are superfluous
1119  }
1120  if ( r & blend_same_pattern ) {
1121  // same pattern in all lanes. check if it fits unpack patterns
1123  for ( iu = 0; iu < lanesize; iu++ ) { // loop through lanepattern
1124  ix = lanepattern[iu];
1125  if ( ix >= 0 ) {
1126  if ( (uint32_t)ix != iu / 2 + ( iu & 1 ) * N ) r &= ~blend_punpcklab;
1127  if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpcklba;
1128  if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) * N ) r &= ~blend_punpckhab;
1129  if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpckhba;
1130  }
1131  }
1132 #if INSTRSET >= 4 // SSSE3. check if it fits palignr
1133  for ( iu = 0; iu < lanesize; iu++ ) {
1134  ix = lanepattern[iu];
1135  if ( ix >= 0 ) {
1136  uint32_t t = ix & ~N;
1137  if ( ix & N ) t += lanesize;
1138  uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 );
1139  if ( rot == 999 ) {
1140  rot = tb;
1141  } else { // check if fit
1142  if ( rot != tb ) rot = 1000;
1143  }
1144  }
1145  }
1146  if ( rot < 999 ) { // fits palignr
1147  if ( rot < lanesize ) {
1148  r |= blend_rotateba;
1149  } else {
1150  r |= blend_rotateab;
1151  }
1152  const uint32_t elementsize = sizeof( V ) / N;
1153  r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) << blend_rotpattern;
1154  }
1155 #endif
1156  if ( lanesize == 4 ) {
1157  // check if it fits shufps
1158  r |= blend_shufab | blend_shufba;
1159  for ( ii = 0; ii < 2; ii++ ) {
1160  ix = lanepattern[ii];
1161  if ( ix >= 0 ) {
1162  if ( ix & N )
1163  r &= ~blend_shufab;
1164  else
1165  r &= ~blend_shufba;
1166  }
1167  }
1168  for ( ; ii < 4; ii++ ) {
1169  ix = lanepattern[ii];
1170  if ( ix >= 0 ) {
1171  if ( ix & N )
1172  r &= ~blend_shufba;
1173  else
1174  r &= ~blend_shufab;
1175  }
1176  }
1177  if ( r & ( blend_shufab | blend_shufba ) ) { // fits shufps/shufpd
1178  uint8_t shufpattern = 0; // get pattern
1179  for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
1180  r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern
1181  }
1182  }
1183  } else if ( nlanes > 1 ) { // not same pattern in all lanes
1184  rot = 999; // check if it fits big rotate
1185  for ( ii = 0; ii < N; ii++ ) {
1186  ix = a[ii];
1187  if ( ix >= 0 ) {
1188  uint32_t rot2 = ( ix + 2 * N - ii ) % ( 2 * N ); // rotate count
1189  if ( rot == 999 ) {
1190  rot = rot2; // save rotate count
1191  } else if ( rot != rot2 ) {
1192  rot = 1000;
1193  break; // does not fit big rotate
1194  }
1195  }
1196  }
1197  if ( rot < 2 * N ) { // fits big rotate
1198  r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern;
1199  }
1200  }
1201  if ( lanesize == 2 && ( r & ( blend_shufab | blend_shufba ) ) ) { // fits shufpd. Get pattern
1202  for ( ii = 0; ii < N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << ( blend_shufpattern + ii ); }
1203  }
1204  return r;
1205  }
1206 
1207  // blend_perm_indexes: return an Indexlist for implementing a blend function as
1208  // two permutations. N = vector size.
1209  // dozero = 0: let unused elements be don't care. The two permutation results must be blended
1210  // dozero = 1: zero unused elements in each permutation. The two permutation results can be OR'ed
1211  // dozero = 2: indexes that are -1 or V_DC are preserved
1212  template <int N, int dozero>
1213  constexpr EList<int, 2 * N> blend_perm_indexes( int const ( &a )[N] ) {
1214  // a is a reference to a constexpr array of permutation indexes
1215  EList<int, 2 * N> list = { { 0 } }; // list to return
1216  int u = dozero ? -1 : V_DC; // value to use for unused entries
1217  int j = 0;
1218 
1219  for ( j = 0; j < N; j++ ) { // loop through indexes
1220  int ix = a[j]; // current index
1221  if ( ix < 0 ) { // zero or don't care
1222  if ( dozero == 2 ) {
1223  // list.a[j] = list.a[j + N] = ix; // fails in gcc in complicated cases
1224  list.a[j] = ix;
1225  list.a[j + N] = ix;
1226  } else {
1227  // list.a[j] = list.a[j + N] = u;
1228  list.a[j] = u;
1229  list.a[j + N] = u;
1230  }
1231  } else if ( ix < N ) { // value from a
1232  list.a[j] = ix;
1233  list.a[j + N] = u;
1234  } else {
1235  list.a[j] = u; // value from b
1236  list.a[j + N] = ix - N;
1237  }
1238  }
1239  return list;
1240  }
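
  // Worked example (demo_bpi is hypothetical, not part of the original file):
  // blending {0, 5, 2, 7} from two 4-element vectors a and b splits into the
  // permutation {0, -1, 2, -1} of a and {-1, 1, -1, 3} of b (dozero = 1);
  // the two permutation results can then be OR'ed.
  constexpr int demo_bpi[4] = { 0, 5, 2, 7 };
  static_assert( blend_perm_indexes<4, 1>( demo_bpi ).a[0] == 0 &&
                 blend_perm_indexes<4, 1>( demo_bpi ).a[5] == 1,
                 "index 5 selects element 1 of b" );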
1241 
1242  // largeblock_indexes: return indexes for replacing a permute or blend with a
1243  // certain block size by a permute or blend with the double block size.
1244  // Note: it is presupposed that perm_flags or blend_flags indicates _largeblock
1245  // It is required that additional zeroing is added if perm_flags or blend_flags
1246  // indicates _addz
1247  template <int N>
1248  constexpr EList<int, N / 2> largeblock_indexes( int const ( &a )[N] ) {
1249  // Parameter a is a reference to a constexpr array of N permutation indexes
1250  EList<int, N / 2> list = { { 0 } }; // list to return
1251 
1252  bool fit_addz = false; // additional zeroing needed at the lower block level
1253  int ix = 0; // even index
1254  int iy = 0; // odd index
1255  int iz = 0; // combined index
1256  int i = 0; // loop counter
1257 
1258  for ( i = 0; i < N; i += 2 ) {
1259  ix = a[i]; // even index
1260  iy = a[i + 1]; // odd index
1261  if ( ix >= 0 ) {
1262  iz = ix / 2; // half index
1263  } else if ( iy >= 0 ) {
1264  iz = iy / 2; // half index
1265  } else
1266  iz = ix | iy; // -1 or V_DC. -1 takes precedence
1267  list.a[i / 2] = iz; // save to list
1268  // check if additional zeroing is needed at current block size
1269  if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
1270  }
1271  // replace -1 by V_DC if fit_addz
1272  if ( fit_addz ) {
1273  for ( i = 0; i < N / 2; i++ ) {
1274  if ( list.a[i] < 0 ) list.a[i] = V_DC;
1275  }
1276  }
1277  return list;
1278  }
1279 
1280  /****************************************************************************************
1281  *
1282  * Vector blend helper function templates
1283  *
1284  * These templates are for emulating a blend with a vector size that is not supported by
1285  * the instruction set, using multiple blends or permutations of half the vector size
1286  *
1287  ****************************************************************************************/
1288 
1289  // Make dummy blend function templates to avoid error messages when the blend functions are not yet defined
1290  template <typename dummy>
1291  void blend2() {}
1292  template <typename dummy>
1293  void blend4() {}
1294  template <typename dummy>
1295  void blend8() {}
1296  template <typename dummy>
1297  void blend16() {}
1298  template <typename dummy>
1299  void blend32() {}
1300 
1301  // blend_half_indexes: return an Indexlist for emulating a blend function as
1302  // blends or permutations from multiple sources
1303  // dozero = 0: let unused elements be don't care. Multiple permutation results must be blended
1304  // dozero = 1: zero unused elements in each permutation. Multiple permutation results can be OR'ed
1305  // dozero = 2: indexes that are -1 or V_DC are preserved
1306  // src1, src2: sources to blend in a partial implementation
1307  template <int N, int dozero, int src1, int src2>
1308  constexpr EList<int, N> blend_half_indexes( int const ( &a )[N] ) {
1309  // a is a reference to a constexpr array of permutation indexes
1310  EList<int, N> list = { { 0 } }; // list to return
1311  int u = dozero ? -1 : V_DC; // value to use for unused entries
1312  int j = 0; // loop counter
1313 
1314  for ( j = 0; j < N; j++ ) { // loop through indexes
1315  int ix = a[j]; // current index
1316  if ( ix < 0 ) { // zero or don't care
1317  list.a[j] = ( dozero == 2 ) ? ix : u;
1318  } else {
1319  int src = ix / N; // source
1320  if ( src == src1 ) {
1321  list.a[j] = ix & ( N - 1 );
1322  } else if ( src == src2 ) {
1323  list.a[j] = ( ix & ( N - 1 ) ) + N;
1324  } else
1325  list.a[j] = u;
1326  }
1327  }
1328  return list;
1329  }
1330 
1331  // selectblend: select one of four sources for blending
1332  template <typename W, int s>
1333  static inline auto selectblend( W const a, W const b ) {
1334  if constexpr ( s == 0 )
1335  return a.get_low();
1336  else if constexpr ( s == 1 )
1337  return a.get_high();
1338  else if constexpr ( s == 2 )
1339  return b.get_low();
1340  else
1341  return b.get_high();
1342  }
1343 
1344  // blend_half: Emulate a blend with a vector size that is not supported
1345  // by multiple blends with half the vector size.
1346  // blend_half is called twice, to give the low and high half of the result
1347  // Parameters: W: type of full-size vector
1348  // i0...: indexes for low or high half
1349  // a, b: full size input vectors
1350  // return value: half-size vector for lower or upper part
1351  template <typename W, int... i0>
1352  auto blend_half( W const& a, W const& b ) {
1353  typedef decltype( a.get_low() ) V; // type for half-size vector
1354  constexpr int N = V::size(); // size of half-size vector
1355  static_assert( sizeof...( i0 ) == N, "wrong number of indexes in blend_half" );
1356  constexpr int ind[N] = { i0... }; // array of indexes
1357 
1358  // lambda to find which of the four possible sources are used
1359  // return: EList<int, 5> containing a list of up to 4 sources. The last element is the number of sources used
1360  auto listsources = []( int const n, int const( &ind )[N] ) constexpr {
1361  bool source_used[4] = { false, false, false, false }; // list of sources used
1362  int i = 0;
1363  for ( i = 0; i < n; i++ ) {
1364  int ix = ind[i]; // index
1365  if ( ix >= 0 ) {
1366  int src = ix / n; // source used
1367  source_used[src & 3] = true;
1368  }
1369  }
1370  // return a list of sources used. The last element is the number of sources used
1371  EList<int, 5> sources = { { 0 } };
1372  int nsrc = 0; // number of sources
1373  for ( i = 0; i < 4; i++ ) {
1374  if ( source_used[i] ) { sources.a[nsrc++] = i; }
1375  }
1376  sources.a[4] = nsrc;
1377  return sources;
1378  };
1379  // list of sources used
1380  constexpr EList<int, 5> sources = listsources( N, ind );
1381  constexpr int nsrc = sources.a[4]; // number of sources used
1382 
1383  if constexpr ( nsrc == 0 ) { // no sources
1384  return V( 0 );
1385  }
1386  // get indexes for the first one or two sources
1387  constexpr int uindex = ( nsrc > 2 ) ? 1 : 2; // unused elements set to zero if two blends are combined
1388  constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>( ind );
1389  V x0;
1390  V src0 = selectblend<W, sources.a[0]>( a, b ); // first source
1391  V src1 = selectblend<W, sources.a[1]>( a, b ); // second source
1392  if constexpr ( N == 2 ) {
1393  x0 = blend2<L.a[0], L.a[1]>( src0, src1 );
1394  } else if constexpr ( N == 4 ) {
1395  x0 = blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
1396  } else if constexpr ( N == 8 ) {
1397  x0 = blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
1398  } else if constexpr ( N == 16 ) {
1399  x0 = blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1400  L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
1401  } else if constexpr ( N == 32 ) {
1402  x0 = blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1403  L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
1404  L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
1405  }
1406  if constexpr ( nsrc > 2 ) { // get last one or two sources
1407  constexpr EList<int, N> M = blend_half_indexes<N, 1, sources.a[2], sources.a[3]>( ind );
1408  V x1;
1409  V src2 = selectblend<W, sources.a[2]>( a, b ); // third source
1410  V src3 = selectblend<W, sources.a[3]>( a, b ); // fourth source
1411  if constexpr ( N == 2 ) {
 1412  x1 = blend2<M.a[0], M.a[1]>( src2, src3 ); // fixed: this branch must blend the third and fourth sources, as the other sizes do
1413  } else if constexpr ( N == 4 ) {
1414  x1 = blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
1415  } else if constexpr ( N == 8 ) {
1416  x1 = blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
1417  } else if constexpr ( N == 16 ) {
1418  x1 = blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1419  M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
1420  } else if constexpr ( N == 32 ) {
1421  x1 = blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1422  M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
1423  M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
1424  }
1425  x0 |= x1; // combine result of two blends. Unused elements are zero
1426  }
1427  return x0;
1428  }
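  // Usage sketch (editorial illustration, assuming VCL types such as Vec16i
  // whose halves are Vec8i): an unsupported 16-element blend can be emulated
  // by calling blend_half once per half of the result, e.g.
  //   Vec8i lo = blend_half<Vec16i, i0, i1, i2, i3, i4, i5, i6, i7>( a, b );
  //   Vec8i hi = blend_half<Vec16i, i8, i9, i10, i11, i12, i13, i14, i15>( a, b );
  //   Vec16i result( lo, hi ); // concatenate the two half-size results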
1429 
1430 #ifdef VCL_NAMESPACE
1431 }
1432 #endif
1433 
1434 #endif // INSTRSET_H
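// Editorial usage sketch (not part of instrset.h): the INSTRSET macro fixes the
// instruction set at compile time, while instrset_detect() (declared by the VCL
// library in this header, defined in instrset_detect.cpp) reports what the
// running CPU supports. A minimal startup check might compare the two:
//
//   #include "instrset.h"
//   int main() {
//       if ( instrset_detect() < INSTRSET ) {
//           // binary was compiled for a newer instruction set than this CPU has
//           return 1;
//       }
//       return 0; // safe to run the vectorized code
//   }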