The Gaudi Framework  v33r2 (a6f0ec87)
instrset.h File Reference
#include <stdint.h>
#include <stdlib.h>
Include dependency graph for instrset.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  Const_int_t< n >
 
class  Const_uint_t< n >
 
struct  EList< T, N >
 

Macros

#define INSTRSET_H   20102
 
#define ALLOW_FP_PERMUTE   true
 
#define INSTRSET   0
 
#define const_int(n)   ( Const_int_t<n>() )
 
#define const_uint(n)   ( Const_uint_t<n>() )
 

Functions

int instrset_detect (void)
 
bool hasFMA3 (void)
 
bool hasFMA4 (void)
 
bool hasXOP (void)
 
bool hasAVX512ER (void)
 
bool hasAVX512VBMI (void)
 
bool hasAVX512VBMI2 (void)
 
int physicalProcessors (int *logical_processors=0)
 
constexpr int bit_scan_reverse_const (uint64_t const n)
 
template<typename V >
constexpr auto get_inttype ()
 
template<int N>
constexpr auto zero_mask (int const (&a)[N])
 
template<typename V >
constexpr auto zero_mask_broad (int const (&A)[V::size()])
 
template<int N, int B>
constexpr uint64_t make_bit_mask (int const (&a)[N])
 
template<typename V >
constexpr auto make_broad_mask (uint64_t const m)
 
template<typename V >
constexpr auto perm_mask_broad (int const (&A)[V::size()])
 
template<typename V >
constexpr uint64_t perm_flags (int const (&a)[V::size()])
 
template<int N>
constexpr uint64_t compress_mask (int const (&a)[N])
 
template<int N>
constexpr uint64_t expand_mask (int const (&a)[N])
 
template<typename V >
constexpr uint64_t perm16_flags (int const (&a)[V::size()])
 
template<typename V , int oppos = 0>
constexpr auto pshufb_mask (int const (&A)[V::size()])
 
template<int N>
constexpr EList< int, N/2 > largeblock_perm (int const (&a)[N])
 
template<typename V >
constexpr uint64_t blend_flags (int const (&a)[V::size()])
 
template<int N, int dozero>
constexpr EList< int, 2 *N > blend_perm_indexes (int const (&a)[N])
 
template<int N>
constexpr EList< int, N/2 > largeblock_indexes (int const (&a)[N])
 
template<typename dummy >
void blend2 ()
 
template<typename dummy >
void blend4 ()
 
template<typename dummy >
void blend8 ()
 
template<typename dummy >
void blend16 ()
 
template<typename dummy >
void blend32 ()
 
template<int N, int dozero, int src1, int src2>
constexpr EList< int, N > blend_half_indexes (int const (&a)[N])
 
template<typename W , int... i0>
auto blend_half (W const &a, W const &b)
 

Variables

constexpr int V_DC = -256
 
const int perm_zeroing = 1
 
const int perm_perm = 2
 
const int perm_allzero = 4
 
const int perm_largeblock = 8
 
const int perm_addz = 0x10
 
const int perm_addz2 = 0x20
 
const int perm_cross_lane = 0x40
 
const int perm_same_pattern = 0x80
 
const int perm_punpckh = 0x100
 
const int perm_punpckl = 0x200
 
const int perm_rotate
 
const int perm_shright
 
const int perm_shleft
 
const int perm_rotate_big
 
const int perm_broadcast = 0x8000
 
const int perm_zext = 0x10000
 
const int perm_compress = 0x20000
 
const int perm_expand = 0x40000
 
const int perm_outofrange = 0x10000000
 
const int perm_rot_count = 32
 
const int perm_ipattern
 
const int blend_zeroing = 1
 
const int blend_allzero = 2
 
const int blend_largeblock = 4
 
const int blend_addz = 8
 
const int blend_a = 0x10
 
const int blend_b = 0x20
 
const int blend_perma = 0x40
 
const int blend_permb = 0x80
 
const int blend_cross_lane = 0x100
 
const int blend_same_pattern = 0x200
 
const int blend_punpckhab = 0x1000
 
const int blend_punpckhba = 0x2000
 
const int blend_punpcklab = 0x4000
 
const int blend_punpcklba = 0x8000
 
const int blend_rotateab = 0x10000
 
const int blend_rotateba = 0x20000
 
const int blend_shufab = 0x40000
 
const int blend_shufba = 0x80000
 
const int blend_rotate_big = 0x100000
 
const int blend_outofrange = 0x10000000
 
const int blend_shufpattern = 32
 
const int blend_rotpattern = 40
 

Macro Definition Documentation

◆ ALLOW_FP_PERMUTE

#define ALLOW_FP_PERMUTE   true

Definition at line 29 of file instrset.h.

◆ const_int

#define const_int(n)   ( Const_int_t<n>() )

Definition at line 403 of file instrset.h.

◆ const_uint

#define const_uint(n)   ( Const_uint_t<n>() )

Definition at line 404 of file instrset.h.

◆ INSTRSET

#define INSTRSET   0

Definition at line 77 of file instrset.h.

◆ INSTRSET_H

#define INSTRSET_H   20102

Definition at line 24 of file instrset.h.

Function Documentation

◆ bit_scan_reverse_const()

constexpr int bit_scan_reverse_const ( uint64_t const  n)

Definition at line 378 of file instrset.h.

378  {
379  if ( n == 0 ) return -1;
380  uint64_t a = n, b = 0, j = 64, k = 0;
381  do {
382  j >>= 1;
383  k = (uint64_t)1 << j;
384  if ( a >= k ) {
385  a >>= j;
386  b += j;
387  }
388  } while ( j > 0 );
389  return int( b );
390  }

◆ blend16()

template<typename dummy >
void blend16 ( )

Definition at line 1295 of file instrset.h.

1295 {}

◆ blend2()

template<typename dummy >
void blend2 ( )

Definition at line 1289 of file instrset.h.

1289 {}

◆ blend32()

template<typename dummy >
void blend32 ( )

Definition at line 1297 of file instrset.h.

1297 {}

◆ blend4()

template<typename dummy >
void blend4 ( )

Definition at line 1291 of file instrset.h.

1291 {}

◆ blend8()

template<typename dummy >
void blend8 ( )

Definition at line 1293 of file instrset.h.

1293 {}

◆ blend_flags()

template<typename V >
constexpr uint64_t blend_flags ( int const (&a)[V::size()])

Definition at line 1034 of file instrset.h.

1034  {
1035  // a is a reference to a constexpr array of permutation indexes
1036  // V is a vector class
1037  constexpr int N = V::size(); // number of elements
1038  uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value
1039  uint32_t iu = 0; // loop counter
1040  int32_t ii = 0; // loop counter
1041  int ix = 0; // index number i
1042  const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes
1043  const uint32_t lanesize = N / nlanes; // elements per lane
1044  uint32_t lane = 0; // current lane
1045  uint32_t rot = 999; // rotate left count
1046  int lanepattern[lanesize] = {0}; // pattern in each lane
1047  if ( lanesize == 2 && N <= 8 ) {
1048  r |= blend_shufab | blend_shufba; // check if it fits shufpd
1049  }
1050 
1051  for ( ii = 0; ii < N; ii++ ) { // loop through indexes
1052  ix = a[ii]; // index
1053  if ( ix < 0 ) {
1054  if ( ix == -1 )
1055  r |= blend_zeroing; // set to zero
1056  else if ( ix != V_DC ) {
1057  r = blend_outofrange;
1058  break; // illegal index
1059  }
1060  } else { // ix >= 0
1061  r &= ~blend_allzero;
1062  if ( ix < N ) {
1063  r |= blend_a; // data from a
1064  if ( ix != ii ) r |= blend_perma; // permutation of a
1065  } else if ( ix < 2 * N ) {
1066  r |= blend_b; // data from b
1067  if ( ix != ii + N ) r |= blend_permb; // permutation of b
1068  } else {
1069  r = blend_outofrange;
1070  break; // illegal index
1071  }
1072  }
1073  // check if pattern fits a larger block size:
1074  // even indexes must be even, odd indexes must fit the preceding even index + 1
1075  if ( ( ii & 1 ) == 0 ) { // even index
1076  if ( ix >= 0 && ( ix & 1 ) ) r &= ~blend_largeblock; // not even. does not fit larger block size
1077  int iy = a[ii + 1]; // next odd index
1078  if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~blend_largeblock; // not odd. does not fit larger block size
1079  if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~blend_largeblock; // does not fit preceding index + 1
1080  if ( ix == -1 && iy >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
1081  if ( iy == -1 && ix >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
1082  }
1083  lane = (uint32_t)ii / lanesize; // current lane
1084  if ( lane == 0 ) { // first lane, or no pattern yet
1085  lanepattern[ii] = ix; // save pattern
1086  }
1087  // check if crossing lanes
1088  if ( ix >= 0 ) {
1089  uint32_t lanei = uint32_t( ix & ~N ) / lanesize; // source lane
1090  if ( lanei != lane ) {
1091  r |= blend_cross_lane; // crossing lane
1092  }
1093  if ( lanesize == 2 ) { // check if it fits shufpd
1094  if ( lanei != lane ) r &= ~( blend_shufab | blend_shufba );
1095  if ( ( ( ( ix & N ) != 0 ) ^ ii ) & 1 )
1096  r &= ~blend_shufab;
1097  else
1098  r &= ~blend_shufba;
1099  }
1100  }
1101  // check if same pattern in all lanes
1102  if ( lane != 0 && ix >= 0 ) { // not first lane
1103  int j = ii - int( lane * lanesize ); // index into lanepattern
1104  int jx = ix - int( lane * lanesize ); // pattern within lane
1105  if ( jx < 0 || ( jx & ~N ) >= (int)lanesize ) r &= ~blend_same_pattern; // source is in another lane
1106  if ( lanepattern[j] < 0 ) {
1107  lanepattern[j] = jx; // pattern not known from previous lane
1108  } else {
1109  if ( lanepattern[j] != jx ) r &= ~blend_same_pattern; // not same pattern
1110  }
1111  }
1112  }
1113  if ( !( r & blend_largeblock ) ) r &= ~blend_addz; // remove irrelevant flag
1114  if ( r & blend_cross_lane ) r &= ~blend_same_pattern; // remove irrelevant flag
1115  if ( !( r & ( blend_perma | blend_permb ) ) ) {
1116  return r; // no permutation. more checks are superfluous
1117  }
1118  if ( r & blend_same_pattern ) {
1119  // same pattern in all lanes. check if it fits unpack patterns
1120  r |= blend_punpckhab | blend_punpckhba | blend_punpcklab | blend_punpcklba;
1121  for ( iu = 0; iu < lanesize; iu++ ) { // loop through lanepattern
1122  ix = lanepattern[iu];
1123  if ( ix >= 0 ) {
1124  if ( (uint32_t)ix != iu / 2 + ( iu & 1 ) * N ) r &= ~blend_punpcklab;
1125  if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpcklba;
1126  if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) * N ) r &= ~blend_punpckhab;
1127  if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpckhba;
1128  }
1129  }
1130 #if INSTRSET >= 4 // SSSE3. check if it fits palignr
1131  for ( iu = 0; iu < lanesize; iu++ ) {
1132  ix = lanepattern[iu];
1133  if ( ix >= 0 ) {
1134  uint32_t t = ix & ~N;
1135  if ( ix & N ) t += lanesize;
1136  uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 );
1137  if ( rot == 999 ) {
1138  rot = tb;
1139  } else { // check if fit
1140  if ( rot != tb ) rot = 1000;
1141  }
1142  }
1143  }
1144  if ( rot < 999 ) { // fits palignr
1145  if ( rot < lanesize ) {
1146  r |= blend_rotateba;
1147  } else {
1148  r |= blend_rotateab;
1149  }
1150  const uint32_t elementsize = sizeof( V ) / N;
1151  r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) << blend_rotpattern;
1152  }
1153 #endif
1154  if ( lanesize == 4 ) {
1155  // check if it fits shufps
1156  r |= blend_shufab | blend_shufba;
1157  for ( ii = 0; ii < 2; ii++ ) {
1158  ix = lanepattern[ii];
1159  if ( ix >= 0 ) {
1160  if ( ix & N )
1161  r &= ~blend_shufab;
1162  else
1163  r &= ~blend_shufba;
1164  }
1165  }
1166  for ( ; ii < 4; ii++ ) {
1167  ix = lanepattern[ii];
1168  if ( ix >= 0 ) {
1169  if ( ix & N )
1170  r &= ~blend_shufba;
1171  else
1172  r &= ~blend_shufab;
1173  }
1174  }
1175  if ( r & ( blend_shufab | blend_shufba ) ) { // fits shufps/shufpd
1176  uint8_t shufpattern = 0; // get pattern
1177  for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
1178  r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern
1179  }
1180  }
1181  } else if ( nlanes > 1 ) { // not same pattern in all lanes
1182  rot = 999; // check if it fits big rotate
1183  for ( ii = 0; ii < N; ii++ ) {
1184  ix = a[ii];
1185  if ( ix >= 0 ) {
1186  uint32_t rot2 = ( ix + 2 * N - ii ) % ( 2 * N ); // rotate count
1187  if ( rot == 999 ) {
1188  rot = rot2; // save rotate count
1189  } else if ( rot != rot2 ) {
1190  rot = 1000;
1191  break; // does not fit big rotate
1192  }
1193  }
1194  }
1195  if ( rot < 2 * N ) { // fits big rotate
1196  r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern;
1197  }
1198  }
1199  if ( lanesize == 2 && ( r & ( blend_shufab | blend_shufba ) ) ) { // fits shufpd. Get pattern
1200  for ( ii = 0; ii < N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << ( blend_shufpattern + ii ); }
1201  }
1202  return r;
1203  }
constexpr auto size(const T &, Args &&...) noexcept
const int blend_permb
Definition: instrset.h:1017
const int blend_shufab
Definition: instrset.h:1026
const int blend_largeblock
Definition: instrset.h:1012
const int blend_allzero
Definition: instrset.h:1011
int N
Definition: IOTest.py:110
const int blend_rotateba
Definition: instrset.h:1025
const int blend_shufpattern
Definition: instrset.h:1030
const int blend_rotate_big
Definition: instrset.h:1028
const int blend_a
Definition: instrset.h:1014
const int blend_shufba
Definition: instrset.h:1027
const int blend_b
Definition: instrset.h:1015
const int blend_outofrange
Definition: instrset.h:1029
constexpr int V_DC
Definition: instrset.h:220
const int blend_cross_lane
Definition: instrset.h:1018
const int blend_punpckhba
Definition: instrset.h:1021
const int blend_same_pattern
Definition: instrset.h:1019
const int blend_punpckhab
Definition: instrset.h:1020
const int blend_perma
Definition: instrset.h:1016
const int blend_addz
Definition: instrset.h:1013
const int blend_zeroing
Definition: instrset.h:1010
const int blend_punpcklba
Definition: instrset.h:1023
const int blend_rotpattern
Definition: instrset.h:1031
const int blend_punpcklab
Definition: instrset.h:1022
const int blend_rotateab
Definition: instrset.h:1024

◆ blend_half()

template<typename W , int... i0>
auto blend_half ( W const &  a,
W const &  b 
)

Definition at line 1350 of file instrset.h.

1350  {
1351  typedef decltype( a.get_low() ) V; // type for half-size vector
1352  constexpr int N = V::size(); // size of half-size vector
1353  static_assert( sizeof...( i0 ) == N, "wrong number of indexes in blend_half" );
1354  constexpr int ind[N] = {i0...}; // array of indexes
1355 
1356  // lambda to find which of the four possible sources are used
1357  // return: EList<int, 5> containing a list of up to 4 sources. The last element is the number of sources used
1358  auto listsources = []( int const n, int const( &ind )[N] ) constexpr {
1359  bool source_used[4] = {false, false, false, false}; // list of sources used
1360  int i = 0;
1361  for ( i = 0; i < n; i++ ) {
1362  int ix = ind[i]; // index
1363  if ( ix >= 0 ) {
1364  int src = ix / n; // source used
1365  source_used[src & 3] = true;
1366  }
1367  }
1368  // return a list of sources used. The last element is the number of sources used
1369  EList<int, 5> sources = {{0}};
1370  int nsrc = 0; // number of sources
1371  for ( i = 0; i < 4; i++ ) {
1372  if ( source_used[i] ) { sources.a[nsrc++] = i; }
1373  }
1374  sources.a[4] = nsrc;
1375  return sources;
1376  };
1377  // list of sources used
1378  constexpr EList<int, 5> sources = listsources( N, ind );
1379  constexpr int nsrc = sources.a[4]; // number of sources used
1380 
1381  if constexpr ( nsrc == 0 ) { // no sources
1382  return V( 0 );
1383  }
1384  // get indexes for the first one or two sources
1385  constexpr int uindex = ( nsrc > 2 ) ? 1 : 2; // unused elements set to zero if two blends are combined
1386  constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>( ind );
1387  V x0;
1388  V src0 = selectblend<W, sources.a[0]>( a, b ); // first source
1389  V src1 = selectblend<W, sources.a[1]>( a, b ); // second source
1390  if constexpr ( N == 2 ) {
1391  x0 = blend2<L.a[0], L.a[1]>( src0, src1 );
1392  } else if constexpr ( N == 4 ) {
1393  x0 = blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
1394  } else if constexpr ( N == 8 ) {
1395  x0 = blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
1396  } else if constexpr ( N == 16 ) {
1397  x0 = blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1398  L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
1399  } else if constexpr ( N == 32 ) {
1400  x0 = blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1401  L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
1402  L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
1403  }
1404  if constexpr ( nsrc > 2 ) { // get last one or two sources
1405  constexpr EList<int, N> M = blend_half_indexes<N, 1, sources.a[2], sources.a[3]>( ind );
1406  V x1;
1407  V src2 = selectblend<W, sources.a[2]>( a, b ); // third source
1408  V src3 = selectblend<W, sources.a[3]>( a, b ); // fourth source
1409  if constexpr ( N == 2 ) {
1410  x1 = blend2<M.a[0], M.a[1]>( src0, src1 );
1411  } else if constexpr ( N == 4 ) {
1412  x1 = blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
1413  } else if constexpr ( N == 8 ) {
1414  x1 = blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
1415  } else if constexpr ( N == 16 ) {
1416  x1 = blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1417  M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
1418  } else if constexpr ( N == 32 ) {
1419  x1 = blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1420  M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
1421  M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
1422  }
1423  x0 |= x1; // combine result of two blends. Unused elements are zero
1424  }
1425  return x0;
1426  }
constexpr auto size(const T &, Args &&...) noexcept
void blend8()
Definition: instrset.h:1293
void blend16()
Definition: instrset.h:1295
void blend32()
Definition: instrset.h:1297
int N
Definition: IOTest.py:110
constexpr EList< int, N > blend_half_indexes(int const (&a)[N])
Definition: instrset.h:1306
T a[N]
Definition: instrset.h:460
void blend2()
Definition: instrset.h:1289
void blend4()
Definition: instrset.h:1291

◆ blend_half_indexes()

template<int N, int dozero, int src1, int src2>
constexpr EList<int, N> blend_half_indexes ( int const (&a)[N])

Definition at line 1306 of file instrset.h.

1306  {
1307  // a is a reference to a constexpr array of permutation indexes
1308  EList<int, N> list = {{0}}; // list to return
1309  int u = dozero ? -1 : V_DC; // value to use for unused entries
1310  int j = 0; // loop counter
1311 
1312  for ( j = 0; j < N; j++ ) { // loop through indexes
1313  int ix = a[j]; // current index
1314  if ( ix < 0 ) { // zero or don't care
1315  list.a[j] = ( dozero == 2 ) ? ix : u;
1316  } else {
1317  int src = ix / N; // source
1318  if ( src == src1 ) {
1319  list.a[j] = ix & ( N - 1 );
1320  } else if ( src == src2 ) {
1321  list.a[j] = ( ix & ( N - 1 ) ) + N;
1322  } else
1323  list.a[j] = u;
1324  }
1325  }
1326  return list;
1327  }
int N
Definition: IOTest.py:110
constexpr int V_DC
Definition: instrset.h:220
T a[N]
Definition: instrset.h:460

◆ blend_perm_indexes()

template<int N, int dozero>
constexpr EList<int, 2 * N> blend_perm_indexes ( int const (&a)[N])

Definition at line 1211 of file instrset.h.

1211  {
1212  // a is a reference to a constexpr array of permutation indexes
1213  EList<int, 2 * N> list = {{0}}; // list to return
1214  int u = dozero ? -1 : V_DC; // value to use for unused entries
1215  int j = 0;
1216 
1217  for ( j = 0; j < N; j++ ) { // loop through indexes
1218  int ix = a[j]; // current index
1219  if ( ix < 0 ) { // zero or don't care
1220  if ( dozero == 2 ) {
1221  // list.a[j] = list.a[j + N] = ix; // fails in gcc in complicated cases
1222  list.a[j] = ix;
1223  list.a[j + N] = ix;
1224  } else {
1225  // list.a[j] = list.a[j + N] = u;
1226  list.a[j] = u;
1227  list.a[j + N] = u;
1228  }
1229  } else if ( ix < N ) { // value from a
1230  list.a[j] = ix;
1231  list.a[j + N] = u;
1232  } else {
1233  list.a[j] = u; // value from b
1234  list.a[j + N] = ix - N;
1235  }
1236  }
1237  return list;
1238  }
int N
Definition: IOTest.py:110
constexpr int V_DC
Definition: instrset.h:220
T a[N]
Definition: instrset.h:460

◆ compress_mask()

template<int N>
constexpr uint64_t compress_mask ( int const (&a)[N])

Definition at line 821 of file instrset.h.

821  {
822  // a is a reference to a constexpr array of permutation indexes
823  int ix = 0, lasti = -1, lastp = -1;
824  uint64_t m = 0;
825  int i = 0;
826  int j = 1; // loop counters
827  for ( i = 0; i < N; i++ ) {
828  ix = a[i]; // permutation index
829  if ( ix >= 0 ) {
830  m |= (uint64_t)1 << ix; // mask for compression source
831  for ( j = 1; j < i - lastp; j++ ) {
832  m |= (uint64_t)1 << ( lasti + j ); // dummy filling source
833  }
834  lastp = i;
835  lasti = ix;
836  }
837  }
838  return m;
839  }
int N
Definition: IOTest.py:110
constexpr double m

◆ expand_mask()

template<int N>
constexpr uint64_t expand_mask ( int const (&a)[N])

Definition at line 845 of file instrset.h.

845  {
846  // a is a reference to a constexpr array of permutation indexes
847  int ix = 0, lasti = -1, lastp = -1;
848  uint64_t m = 0;
849  int i = 0;
850  int j = 1;
851  for ( i = 0; i < N; i++ ) {
852  ix = a[i]; // permutation index
853  if ( ix >= 0 ) {
854  m |= (uint64_t)1 << i; // mask for expansion destination
855  for ( j = 1; j < ix - lasti; j++ ) {
856  m |= (uint64_t)1 << ( lastp + j ); // dummy filling destination
857  }
858  lastp = i;
859  lasti = ix;
860  }
861  }
862  return m;
863  }
int N
Definition: IOTest.py:110
constexpr double m

◆ get_inttype()

template<typename V >
constexpr auto get_inttype ( )

Definition at line 466 of file instrset.h.

466  {
467  constexpr int elementsize = sizeof( V ) / V::size(); // size of vector elements
468 
469  if constexpr ( elementsize >= 8 ) {
470  return -int64_t( 1 );
471  } else if constexpr ( elementsize >= 4 ) {
472  return int32_t( -1 );
473  } else if constexpr ( elementsize >= 2 ) {
474  return int16_t( -1 );
475  } else {
476  return int8_t( -1 );
477  }
478  }
constexpr auto size(const T &, Args &&...) noexcept

◆ hasAVX512ER()

bool hasAVX512ER ( void  )

Definition at line 142 of file instrset_detect.cpp.

142  {
143  if ( instrset_detect() < 9 ) return false; // must have AVX512F
144  int abcd[4]; // cpuid results
145  cpuid( abcd, 7 ); // call cpuid function 7
146  return ( ( abcd[1] & ( 1 << 27 ) ) != 0 ); // ebx bit 27 indicates AVX512ER
147  }
int instrset_detect(void)
#define cpuid(func, eax, ebx, ecx, edx)

◆ hasAVX512VBMI()

bool hasAVX512VBMI ( void  )

Definition at line 150 of file instrset_detect.cpp.

150  {
151  if ( instrset_detect() < 10 ) return false; // must have AVX512BW
152  int abcd[4]; // cpuid results
153  cpuid( abcd, 7 ); // call cpuid function 7
154  return ( ( abcd[2] & ( 1 << 1 ) ) != 0 ); // ecx bit 1 indicates AVX512VBMI
155  }
int instrset_detect(void)
#define cpuid(func, eax, ebx, ecx, edx)

◆ hasAVX512VBMI2()

bool hasAVX512VBMI2 ( void  )

Definition at line 158 of file instrset_detect.cpp.

158  {
159  if ( instrset_detect() < 10 ) return false; // must have AVX512BW
160  int abcd[4]; // cpuid results
161  cpuid( abcd, 7 ); // call cpuid function 7
162  return ( ( abcd[2] & ( 1 << 6 ) ) != 0 ); // ecx bit 6 indicates AVX512VBMI2
163  }
int instrset_detect(void)
#define cpuid(func, eax, ebx, ecx, edx)

◆ hasFMA3()

bool hasFMA3 ( void  )

Definition at line 110 of file instrset_detect.cpp.

110  {
111  if ( instrset_detect() < 7 ) return false; // must have AVX
112  int abcd[4]; // cpuid results
113  cpuid( abcd, 1 ); // call cpuid function 1
114  return ( ( abcd[2] & ( 1 << 12 ) ) != 0 ); // ecx bit 12 indicates FMA3
115  }
int instrset_detect(void)
#define cpuid(func, eax, ebx, ecx, edx)

◆ hasFMA4()

bool hasFMA4 ( void  )

Definition at line 118 of file instrset_detect.cpp.

118  {
119  if ( instrset_detect() < 7 ) return false; // must have AVX
120  int abcd[4]; // cpuid results
121  cpuid( abcd, 0x80000001 ); // call cpuid function 0x80000001
122  return ( ( abcd[2] & ( 1 << 16 ) ) != 0 ); // ecx bit 16 indicates FMA4
123  }
int instrset_detect(void)
#define cpuid(func, eax, ebx, ecx, edx)

◆ hasXOP()

bool hasXOP ( void  )

Definition at line 126 of file instrset_detect.cpp.

126  {
127  if ( instrset_detect() < 7 ) return false; // must have AVX
128  int abcd[4]; // cpuid results
129  cpuid( abcd, 0x80000001 ); // call cpuid function 0x80000001
130  return ( ( abcd[2] & ( 1 << 11 ) ) != 0 ); // ecx bit 11 indicates XOP
131  }
int instrset_detect(void)
#define cpuid(func, eax, ebx, ecx, edx)

◆ instrset_detect()

int instrset_detect ( void  )

Definition at line 63 of file instrset_detect.cpp.

63  {
64 
65  static int iset = -1; // remember value for next call
66  if ( iset >= 0 ) {
67  return iset; // called before
68  }
69  iset = 0; // default value
70  int abcd[4] = {0, 0, 0, 0}; // cpuid results
71  cpuid( abcd, 0 ); // call cpuid function 0
72  if ( abcd[0] == 0 ) return iset; // no further cpuid function supported
73  cpuid( abcd, 1 ); // call cpuid function 1 for feature flags
74  if ( ( abcd[3] & ( 1 << 0 ) ) == 0 ) return iset; // no floating point
75  if ( ( abcd[3] & ( 1 << 23 ) ) == 0 ) return iset; // no MMX
76  if ( ( abcd[3] & ( 1 << 15 ) ) == 0 ) return iset; // no conditional move
77  if ( ( abcd[3] & ( 1 << 24 ) ) == 0 ) return iset; // no FXSAVE
78  if ( ( abcd[3] & ( 1 << 25 ) ) == 0 ) return iset; // no SSE
79  iset = 1; // 1: SSE supported
80  if ( ( abcd[3] & ( 1 << 26 ) ) == 0 ) return iset; // no SSE2
81  iset = 2; // 2: SSE2 supported
82  if ( ( abcd[2] & ( 1 << 0 ) ) == 0 ) return iset; // no SSE3
83  iset = 3; // 3: SSE3 supported
84  if ( ( abcd[2] & ( 1 << 9 ) ) == 0 ) return iset; // no SSSE3
85  iset = 4; // 4: SSSE3 supported
86  if ( ( abcd[2] & ( 1 << 19 ) ) == 0 ) return iset; // no SSE4.1
87  iset = 5; // 5: SSE4.1 supported
88  if ( ( abcd[2] & ( 1 << 23 ) ) == 0 ) return iset; // no POPCNT
89  if ( ( abcd[2] & ( 1 << 20 ) ) == 0 ) return iset; // no SSE4.2
90  iset = 6; // 6: SSE4.2 supported
91  if ( ( abcd[2] & ( 1 << 27 ) ) == 0 ) return iset; // no OSXSAVE
92  if ( ( xgetbv( 0 ) & 6 ) != 6 ) return iset; // AVX not enabled in O.S.
93  if ( ( abcd[2] & ( 1 << 28 ) ) == 0 ) return iset; // no AVX
94  iset = 7; // 7: AVX supported
95  cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags
96  if ( ( abcd[1] & ( 1 << 5 ) ) == 0 ) return iset; // no AVX2
97  iset = 8;
98  if ( ( abcd[1] & ( 1 << 16 ) ) == 0 ) return iset; // no AVX512
99  cpuid( abcd, 0xD ); // call cpuid leaf 0xD for feature flags
100  if ( ( abcd[0] & 0x60 ) != 0x60 ) return iset; // no AVX512
101  iset = 9;
102  cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags
103  if ( ( abcd[1] & ( 1 << 31 ) ) == 0 ) return iset; // no AVX512VL
104  if ( ( abcd[1] & 0x40020000 ) != 0x40020000 ) return iset; // no AVX512BW, AVX512DQ
105  iset = 10;
106  return iset;
107  }
#define cpuid(func, eax, ebx, ecx, edx)

◆ largeblock_indexes()

template<int N>
constexpr EList<int, N / 2> largeblock_indexes ( int const (&a)[N])

Definition at line 1246 of file instrset.h.

1246  {
1247  // Parameter a is a reference to a constexpr array of N permutation indexes
1248  EList<int, N / 2> list = {{0}}; // list to return
1249 
1250  bool fit_addz = false; // additional zeroing needed at the lower block level
1251  int ix = 0; // even index
1252  int iy = 0; // odd index
1253  int iz = 0; // combined index
1254  int i = 0; // loop counter
1255 
1256  for ( i = 0; i < N; i += 2 ) {
1257  ix = a[i]; // even index
1258  iy = a[i + 1]; // odd index
1259  if ( ix >= 0 ) {
1260  iz = ix / 2; // half index
1261  } else if ( iy >= 0 ) {
1262  iz = iy / 2; // half index
1263  } else
1264  iz = ix | iy; // -1 or V_DC. -1 takes precedence
1265  list.a[i / 2] = iz; // save to list
1266  // check if additional zeroing is needed at current block size
1267  if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
1268  }
1269  // replace -1 by V_DC if fit_addz
1270  if ( fit_addz ) {
1271  for ( i = 0; i < N / 2; i++ ) {
1272  if ( list.a[i] < 0 ) list.a[i] = V_DC;
1273  }
1274  }
1275  return list;
1276  }
int N
Definition: IOTest.py:110
constexpr int V_DC
Definition: instrset.h:220

◆ largeblock_perm()

template<int N>
constexpr EList<int, N / 2> largeblock_perm ( int const (&a)[N])

Definition at line 975 of file instrset.h.

975  {
976  // Parameter a is a reference to a constexpr array of permutation indexes
977  EList<int, N / 2> list = {{0}}; // result indexes
978  int ix = 0; // even index
979  int iy = 0; // odd index
980  int iz = 0; // combined index
981  bool fit_addz = false; // additional zeroing needed at the lower block level
982  int i = 0; // loop counter
983 
984  // check if additional zeroing is needed at current block size
985  for ( i = 0; i < N; i += 2 ) {
986  ix = a[i]; // even index
987  iy = a[i + 1]; // odd index
988  if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
989  }
990 
991  // loop through indexes
992  for ( i = 0; i < N; i += 2 ) {
993  ix = a[i]; // even index
994  iy = a[i + 1]; // odd index
995  if ( ix >= 0 ) {
996  iz = ix / 2; // half index
997  } else if ( iy >= 0 ) {
998  iz = iy / 2;
999  } else {
1000  iz = ix | iy; // -1 or V_DC. -1 takes precedence
1001  if ( fit_addz ) iz = V_DC; // V_DC, because result will be zeroed later
1002  }
1003  list.a[i / 2] = iz; // save to list
1004  }
1005  return list;
1006  }
int N
Definition: IOTest.py:110
constexpr int V_DC
Definition: instrset.h:220

◆ make_bit_mask()

template<int N, int B>
constexpr uint64_t make_bit_mask ( int const (&a)[N])

Definition at line 522 of file instrset.h.

522  {
523  uint64_t r = 0; // return value
524  uint8_t j = uint8_t( B & 0xFF ); // index to selected bit
525  uint64_t s = 0; // bit number i in r
526  uint64_t f = 0; // 1 if bit not flipped
527  int i = 0;
528  for ( i = 0; i < N; i++ ) {
529  int ix = a[i];
530  if ( ix < 0 ) { // -1 or V_DC
531  s = ( B >> 10 ) & 1;
532  } else {
533  s = ( (uint32_t)ix >> j ) & 1; // extract selected bit
534  if ( i < N / 2 ) {
535  f = ( B >> 8 ) & 1; // lower half
536  } else {
537  f = ( B >> 9 ) & 1; // upper half
538  }
539  s ^= f ^ 1; // flip bit if needed
540  }
541  r |= uint64_t( s ) << i; // set bit in return value
542  }
543  return r;
544  }
int N
Definition: IOTest.py:110
string s
Definition: gaudirun.py:328

◆ make_broad_mask()

template<typename V >
constexpr auto make_broad_mask ( uint64_t const  m)

Definition at line 549 of file instrset.h.

549  {
550  constexpr int N = V::size(); // number of vector elements
551  typedef decltype( get_inttype<V>() ) Etype; // element type
552  EList<Etype, N> u = {{0}}; // list for returning
553  int i = 0;
554  for ( i = 0; i < N; i++ ) { u.a[i] = ( ( m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; }
555  return u; // return encapsulated array
556  }
constexpr auto size(const T &, Args &&...) noexcept
int N
Definition: IOTest.py:110
constexpr double m

◆ perm16_flags()

template<typename V >
constexpr uint64_t perm16_flags ( int const (&a)[V::size()])

Definition at line 873 of file instrset.h.

873  {
874  // a is a reference to a constexpr array of permutation indexes
875  // V is a vector class
876  constexpr int N = V::size(); // number of elements
877 
878  uint64_t retval = 0; // return value
879  uint32_t pat[4] = {0, 0, 0, 0}; // permute patterns
880  uint32_t i = 0; // loop counter
881  int ix = 0; // index number i
882  const uint32_t lanesize = 8; // elements per lane
883  uint32_t lane = 0; // current lane
884  int lanepattern[lanesize] = {0}; // pattern in each lane
885 
886  for ( i = 0; i < N; i++ ) {
887  ix = a[i];
888  lane = i / lanesize; // current lane
889  if ( lane == 0 ) {
890  lanepattern[i] = ix; // save pattern
891  } else if ( ix >= 0 ) { // not first lane
892  uint32_t j = i - lane * lanesize; // index into lanepattern
893  int jx = ix - lane * lanesize; // pattern within lane
894  if ( lanepattern[j] < 0 ) {
895  lanepattern[j] = jx; // pattern not known from previous lane
896  }
897  }
898  }
899  // four patterns: low2low, high2high, high2low, low2high
900  for ( i = 0; i < 4; i++ ) {
901  // loop through low pattern
902  if ( lanepattern[i] >= 0 ) {
903  if ( lanepattern[i] < 4 ) { // low2low
904  retval |= 1;
905  pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
906  } else { // high2low
907  retval |= 4;
908  pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
909  }
910  }
911  // loop through high pattern
912  if ( lanepattern[i + 4] >= 0 ) {
913  if ( lanepattern[i + 4] < 4 ) { // low2high
914  retval |= 8;
915  pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
916  } else { // high2high
917  retval |= 2;
918  pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
919  }
920  }
921  }
922  // join return data
923  for ( i = 0; i < 4; i++ ) { retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); }
924  return retval;
925  }
constexpr auto size(const T &, Args &&...) noexcept
int N
Definition: IOTest.py:110

◆ perm_flags()

template<typename V >
constexpr uint64_t perm_flags (int const (&a)[V::size()])

Definition at line 600 of file instrset.h.

// perm_flags: analyzes the permutation index array a for vector class V and returns a 64-bit
// combination of perm_* flags describing which special cases the permutation fits.
// An index of -1 requests zeroing, V_DC means don't care, a non-negative value selects a source element.
// In addition to the flag bits, the return value carries
//   - the shift/rotate count or the broadcast index, shifted left by perm_rot_count
//   - the pshufd pattern (only when elementsize >= 4), shifted left by perm_ipattern
// If the result does not have perm_perm set, the function returns early with only the flags
// collected in the first loop.
600  {
601  // a is a reference to a constexpr array of permutation indexes
602  // V is a vector class
603  constexpr int N = V::size(); // number of elements
604  uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value
605  uint32_t i = 0; // loop counter
606  int j = 0; // loop counter
607  int ix = 0; // permutation index a[i]
608  const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes
609  const uint32_t lanesize = N / nlanes; // elements per lane
610  const uint32_t elementsize = sizeof( V ) / N; // size of each vector element
611  uint32_t lane = 0; // current lane
612  uint32_t rot = 999; // rotate left count
613  int32_t broadc = 999; // index to broadcasted element
614  uint32_t patfail = 0; // remember certain patterns that do not fit
615  uint32_t addz2 = 0; // remember certain patterns need extra zeroing
616  int32_t compresslasti = -1; // last index in perm_compress fit
617  int32_t compresslastp = -1; // last position in perm_compress fit
618  int32_t expandlasti = -1; // last index in perm_expand fit
619  int32_t expandlastp = -1; // last position in perm_expand fit
620 
621  int lanepattern[lanesize] = {0}; // pattern in each lane
622 
623  for ( i = 0; i < N; i++ ) { // loop through indexes
624  ix = a[i]; // current index
625  // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute.
626  if ( ix == -1 ) {
627  r |= perm_zeroing; // zeroing requested
628  } else if ( ix != V_DC && uint32_t( ix ) >= N ) {
629  r |= perm_outofrange; // index out of range
630  }
631  if ( ix >= 0 ) {
632  r &= ~perm_allzero; // not all zero
633  if ( ix != (int)i ) r |= perm_perm; // needs permutation
634  if ( broadc == 999 )
635  broadc = ix; // remember broadcast index
636  else if ( broadc != ix )
637  broadc = 1000; // does not fit broadcast
638  }
639  // check if pattern fits a larger block size:
640  // even indexes must be even, odd indexes must fit the preceding even index + 1
641  if ( ( i & 1 ) == 0 ) { // even index
642  if ( ix >= 0 && ( ix & 1 ) ) r &= ~perm_largeblock; // not even. does not fit larger block size
643  int iy = a[i + 1]; // next odd index (NOTE(review): reads a[i + 1]; assumes N is even - confirm)
644  if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~perm_largeblock; // not odd. does not fit larger block size
645  if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~perm_largeblock; // does not fit preceding index + 1
646  if ( ix == -1 && iy >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
647  if ( iy == -1 && ix >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
648  }
649  lane = i / lanesize; // current lane
650  if ( lane == 0 ) { // first lane: its indexes become the reference pattern
651  lanepattern[i] = ix; // save pattern
652  }
653  // check if crossing lanes
654  if ( ix >= 0 ) {
655  uint32_t lanei = (uint32_t)ix / lanesize; // source lane
656  if ( lanei != lane ) r |= perm_cross_lane; // crossing lane
657  }
658  // check if same pattern in all lanes
659  if ( lane != 0 && ix >= 0 ) { // not first lane
660  int j1 = i - int( lane * lanesize ); // index into lanepattern
661  int jx = ix - int( lane * lanesize ); // pattern within lane
662  if ( jx < 0 || jx >= (int)lanesize ) r &= ~perm_same_pattern; // source is in another lane
663  if ( lanepattern[j1] < 0 ) {
664  lanepattern[j1] = jx; // pattern not known from previous lane
665  } else {
666  if ( lanepattern[j1] != jx ) r &= ~perm_same_pattern; // not same pattern
667  }
668  }
669  if ( ix >= 0 ) {
670  // check if pattern fits zero extension (perm_zext)
671  if ( uint32_t( ix * 2 ) != i ) {
672  patfail |= 1; // does not fit zero extension
673  }
674  // check if pattern fits compress (perm_compress)
675  if ( ix > compresslasti && ix - compresslasti >= (int)i - compresslastp ) {
676  if ( (int)i - compresslastp > 1 ) addz2 |= 2; // perm_compress may need additional zeroing
677  compresslasti = ix;
678  compresslastp = i;
679  } else {
680  patfail |= 2; // does not fit perm_compress
681  }
682  // check if pattern fits expand (perm_expand)
683  if ( ix > expandlasti && ix - expandlasti <= (int)i - expandlastp ) {
684  if ( ix - expandlasti > 1 ) addz2 |= 4; // perm_expand may need additional zeroing
685  expandlasti = ix;
686  expandlastp = i;
687  } else {
688  patfail |= 4; // does not fit perm_expand
689  }
690  } else if ( ix == -1 ) {
691  if ( ( i & 1 ) == 0 ) addz2 |= 1; // zero extension needs additional zeroing
692  }
693  }
694  if ( !( r & perm_perm ) ) return r; // more checks are superfluous
695 
696  if ( !( r & perm_largeblock ) ) r &= ~perm_addz; // remove irrelevant flag
697  if ( r & perm_cross_lane ) r &= ~perm_same_pattern; // remove irrelevant flag
698  if ( ( patfail & 1 ) == 0 ) {
699  r |= perm_zext; // fits zero extension
700  if ( ( addz2 & 1 ) != 0 ) r |= perm_addz2;
701  } else if ( ( patfail & 2 ) == 0 ) {
702  r |= perm_compress; // fits compression
703  if ( ( addz2 & 2 ) != 0 ) { // check if additional zeroing needed
704  for ( j = 0; j < compresslastp; j++ ) {
705  if ( a[j] == -1 ) r |= perm_addz2;
706  }
707  }
708  } else if ( ( patfail & 4 ) == 0 ) {
709  r |= perm_expand; // fits expansion
710  if ( ( addz2 & 4 ) != 0 ) { // check if additional zeroing needed
711  for ( j = 0; j < expandlastp; j++ ) {
712  if ( a[j] == -1 ) r |= perm_addz2;
713  }
714  }
715  }
716 
717  if ( r & perm_same_pattern ) {
718  // same pattern in all lanes. check if it fits specific patterns
719  bool fit = true;
720  // fit shift or rotate
721  for ( i = 0; i < lanesize; i++ ) {
722  if ( lanepattern[i] >= 0 ) {
723  uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize;
724  if ( rot == 999 ) {
725  rot = rot1;
726  } else { // check if fit
727  if ( rot != rot1 ) fit = false;
728  }
729  }
730  }
731  rot &= lanesize - 1; // prevent out of range values
732  if ( fit ) { // fits rotate, and possibly shift
733  uint64_t rot2 = ( rot * elementsize ) & 0xF; // rotate right count in bytes
734  r |= rot2 << perm_rot_count; // put the 4-bit shift/rotate count at bit position perm_rot_count (bits 32-35)
735 #if INSTRSET >= 4 // SSSE3
736  r |= perm_rotate; // allow palignr
737 #endif
738  // fit shift left
739  fit = true;
740  for ( i = 0; i < lanesize - rot; i++ ) { // check if first (lanesize - rot) elements are zero or don't care
741  if ( lanepattern[i] >= 0 ) fit = false;
742  }
743  if ( fit ) {
744  r |= perm_shleft;
745  for ( ; i < lanesize; i++ )
746  if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
747  }
748  // fit shift right
749  fit = true;
750  for ( i = lanesize - (uint32_t)rot; i < lanesize;
751  i++ ) { // check if last rot elements are zero or don't care
752  if ( lanepattern[i] >= 0 ) fit = false;
753  }
754  if ( fit ) {
755  r |= perm_shright;
756  for ( i = 0; i < lanesize - rot; i++ ) {
757  if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
758  }
759  }
760  }
761  // fit punpckhi
762  fit = true;
763  uint32_t j2 = lanesize / 2;
764  for ( i = 0; i < lanesize; i++ ) {
765  if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
766  if ( ( i & 1 ) != 0 ) j2++;
767  }
768  if ( fit ) r |= perm_punpckh;
769  // fit punpcklo
770  fit = true;
771  j2 = 0;
772  for ( i = 0; i < lanesize; i++ ) {
773  if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
774  if ( ( i & 1 ) != 0 ) j2++;
775  }
776  if ( fit ) r |= perm_punpckl;
777  // fit pshufd
778  if ( elementsize >= 4 ) {
779  uint64_t p = 0;
780  for ( i = 0; i < lanesize; i++ ) {
781  if ( lanesize == 4 ) {
782  p |= ( lanepattern[i] & 3 ) << 2 * i;
783  } else { // lanesize = 2
784  p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i;
785  }
786  }
787  r |= p << perm_ipattern;
788  }
789  }
790 #if INSTRSET >= 7
791  else { // not same pattern in all lanes
792  if constexpr ( nlanes > 1 ) { // Try if it fits big rotate
793  for ( i = 0; i < N; i++ ) {
794  ix = a[i];
795  if ( ix >= 0 ) {
796  uint32_t rot2 = ( ix + N - i ) % N; // rotate count
797  if ( rot == 999 ) {
798  rot = rot2; // save rotate count
799  } else if ( rot != rot2 ) {
800  rot = 1000;
801  break; // does not fit big rotate
802  }
803  }
804  }
805  if ( rot < N ) { // fits big rotate
806  r |= perm_rotate_big | (uint64_t)rot << perm_rot_count;
807  }
808  }
809  }
810 #endif
811  if ( broadc < 999 && ( r & ( perm_rotate | perm_shright | perm_shleft | perm_rotate_big ) ) == 0 ) {
812  r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast
813  }
814  return r;
815  }
const int perm_rot_count
Definition: instrset.h:595
constexpr auto size(const T &, Args &&...) noexcept
const int perm_largeblock
Definition: instrset.h:575
const int perm_perm
Definition: instrset.h:573
const int perm_outofrange
Definition: instrset.h:594
const int perm_ipattern
Definition: instrset.h:596
const int perm_punpckl
Definition: instrset.h:581
const int perm_allzero
Definition: instrset.h:574
const int perm_compress
Definition: instrset.h:592
int N
Definition: IOTest.py:110
const int perm_punpckh
Definition: instrset.h:580
const int perm_addz2
Definition: instrset.h:577
const int perm_shright
Definition: instrset.h:584
const int perm_addz
Definition: instrset.h:576
const int perm_zext
Definition: instrset.h:591
const int perm_broadcast
Definition: instrset.h:590
const int perm_expand
Definition: instrset.h:593
constexpr int V_DC
Definition: instrset.h:220
const int perm_shleft
Definition: instrset.h:586
const int perm_zeroing
Definition: instrset.h:572
const int perm_rotate
Definition: instrset.h:582
const int perm_same_pattern
Definition: instrset.h:579
const int perm_cross_lane
Definition: instrset.h:578
const int perm_rotate_big
Definition: instrset.h:588

◆ perm_mask_broad()

template<typename V >
constexpr auto perm_mask_broad (int const (&A)[V::size()])

Definition at line 561 of file instrset.h.

// perm_mask_broad: copies the index array A into an EList whose element type is
// decltype( get_inttype<V>() ), converting each index with Etype( A[i] ), and returns the list.
561  {
562  constexpr int N = V::size(); // number of vector elements
563  typedef decltype( get_inttype<V>() ) Etype; // vector element type
564  EList<Etype, N> u = {{0}}; // list for returning
565  int i = 0;
566  for ( i = 0; i < N; i++ ) { u.a[i] = Etype( A[i] ); } // convert each index to the element type
567  return u; // return encapsulated array
568  }
constexpr auto size(const T &, Args &&...) noexcept
int N
Definition: IOTest.py:110

◆ physicalProcessors()

int physicalProcessors ( int *  logical_processors = 0)

◆ pshufb_mask()

template<typename V , int oppos = 0>
constexpr auto pshufb_mask (int const (&A)[V::size()])

Definition at line 932 of file instrset.h.

// pshufb_mask: builds a list of sizeof( V ) byte permutation indexes from the element index array A.
// For each element, the index is made relative to the element's own 128-bit lane (after optionally
// flipping it to the opposite lane when oppos = 1). If the result lies inside the lane, the bytes of
// the element get consecutive byte indexes starting at ix * elementsize; otherwise they all get -1.
932  {
933  // Parameter A is a reference to a constexpr array of permutation indexes
934  // V is a vector class
935  // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors
936  constexpr uint32_t N = V::size(); // number of vector elements
937  constexpr uint32_t elementsize = sizeof( V ) / N; // size of each vector element
938  constexpr uint32_t nlanes = sizeof( V ) / 16; // number of 128 bit lanes in vector
939  constexpr uint32_t elements_per_lane = N / nlanes; // number of vector elements per lane
940 
941  EList<int8_t, sizeof( V )> u = {{0}}; // list for returning
942 
943  uint32_t i = 0; // loop counters
944  uint32_t j = 0;
945  int m = 0; // running index into A
946  int k = 0; // running index into the output byte list
947  uint32_t lane = 0;
948 
949  for ( lane = 0; lane < nlanes; lane++ ) { // loop through lanes
950  for ( i = 0; i < elements_per_lane; i++ ) { // loop through elements in lane
951  // permutation index for element within lane
952  int8_t p = -1;
953  int ix = A[m];
954  if ( ix >= 0 ) {
955  ix ^= oppos * elements_per_lane; // flip bit if opposite lane
956  }
957  ix -= int( lane * elements_per_lane ); // index relative to lane
958  if ( ix >= 0 && ix < (int)elements_per_lane ) { // index points to desired lane
959  p = ix * elementsize;
960  }
961  for ( j = 0; j < elementsize; j++ ) { // loop through bytes in element
962  u.a[k++] = p < 0 ? -1 : p + j; // store byte permutation index
963  }
964  m++;
965  }
966  }
967  return u; // return encapsulated array
968  }
constexpr auto size(const T &, Args &&...) noexcept
int N
Definition: IOTest.py:110
constexpr double m

◆ zero_mask()

template<int N>
constexpr auto zero_mask (int const (&a)[N])

Definition at line 483 of file instrset.h.

// zero_mask: builds a bit mask from the index array a in which bit i is set when a[i] >= 0
// and clear when a[i] is negative. The mask is returned as uint8_t, uint16_t, uint32_t or
// uint64_t depending on N.
483  {
484  uint64_t mask = 0;
485  int i = 0;
486 
487  for ( i = 0; i < N; i++ ) {
488  if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i; // bit i = 1 for non-negative indexes
489  }
490  if constexpr ( N <= 8 ) // return type is chosen from the number of elements
491  return uint8_t( mask );
492  else if constexpr ( N <= 16 )
493  return uint16_t( mask );
494  else if constexpr ( N <= 32 )
495  return uint32_t( mask );
496  else
497  return mask;
498  }
int N
Definition: IOTest.py:110

◆ zero_mask_broad()

template<typename V >
constexpr auto zero_mask_broad (int const (&A)[V::size()])

Definition at line 503 of file instrset.h.

// zero_mask_broad: builds an EList with one entry per vector element; entry i is set to
// get_inttype<V>() when A[i] >= 0 and to 0 otherwise.
// NOTE(review): the value returned by get_inttype<V>() is not visible here. If it is a zero of the
// element type, both branches of the conditional yield 0 and the mask would be all zero - confirm
// that a non-zero mask value is produced for non-negative indexes.
503  {
504  constexpr int N = V::size(); // number of vector elements
505  typedef decltype( get_inttype<V>() ) Etype; // element type
506  EList<Etype, N> u = {{0}}; // list for return
507  int i = 0;
508  for ( i = 0; i < N; i++ ) { u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0; }
509  return u; // return encapsulated array
510  }
constexpr auto size(const T &, Args &&...) noexcept
int N
Definition: IOTest.py:110

Variable Documentation

◆ blend_a

const int blend_a = 0x10

Definition at line 1014 of file instrset.h.

◆ blend_addz

const int blend_addz = 8

Definition at line 1013 of file instrset.h.

◆ blend_allzero

const int blend_allzero = 2

Definition at line 1011 of file instrset.h.

◆ blend_b

const int blend_b = 0x20

Definition at line 1015 of file instrset.h.

◆ blend_cross_lane

const int blend_cross_lane = 0x100

Definition at line 1018 of file instrset.h.

◆ blend_largeblock

const int blend_largeblock = 4

Definition at line 1012 of file instrset.h.

◆ blend_outofrange

const int blend_outofrange = 0x10000000

Definition at line 1029 of file instrset.h.

◆ blend_perma

const int blend_perma = 0x40

Definition at line 1016 of file instrset.h.

◆ blend_permb

const int blend_permb = 0x80

Definition at line 1017 of file instrset.h.

◆ blend_punpckhab

const int blend_punpckhab = 0x1000

Definition at line 1020 of file instrset.h.

◆ blend_punpckhba

const int blend_punpckhba = 0x2000

Definition at line 1021 of file instrset.h.

◆ blend_punpcklab

const int blend_punpcklab = 0x4000

Definition at line 1022 of file instrset.h.

◆ blend_punpcklba

const int blend_punpcklba = 0x8000

Definition at line 1023 of file instrset.h.

◆ blend_rotate_big

const int blend_rotate_big = 0x100000

Definition at line 1028 of file instrset.h.

◆ blend_rotateab

const int blend_rotateab = 0x10000

Definition at line 1024 of file instrset.h.

◆ blend_rotateba

const int blend_rotateba = 0x20000

Definition at line 1025 of file instrset.h.

◆ blend_rotpattern

const int blend_rotpattern = 40

Definition at line 1031 of file instrset.h.

◆ blend_same_pattern

const int blend_same_pattern = 0x200

Definition at line 1019 of file instrset.h.

◆ blend_shufab

const int blend_shufab = 0x40000

Definition at line 1026 of file instrset.h.

◆ blend_shufba

const int blend_shufba = 0x80000

Definition at line 1027 of file instrset.h.

◆ blend_shufpattern

const int blend_shufpattern = 32

Definition at line 1030 of file instrset.h.

◆ blend_zeroing

const int blend_zeroing = 1

Definition at line 1010 of file instrset.h.

◆ perm_addz

const int perm_addz = 0x10

Definition at line 576 of file instrset.h.

◆ perm_addz2

const int perm_addz2 = 0x20

Definition at line 577 of file instrset.h.

◆ perm_allzero

const int perm_allzero = 4

Definition at line 574 of file instrset.h.

◆ perm_broadcast

const int perm_broadcast = 0x8000

Definition at line 590 of file instrset.h.

◆ perm_compress

const int perm_compress = 0x20000

Definition at line 592 of file instrset.h.

◆ perm_cross_lane

const int perm_cross_lane = 0x40

Definition at line 578 of file instrset.h.

◆ perm_expand

const int perm_expand = 0x40000

Definition at line 593 of file instrset.h.

◆ perm_ipattern

const int perm_ipattern = 40

Definition at line 596 of file instrset.h.

◆ perm_largeblock

const int perm_largeblock = 8

Definition at line 575 of file instrset.h.

◆ perm_outofrange

const int perm_outofrange = 0x10000000

Definition at line 594 of file instrset.h.

◆ perm_perm

const int perm_perm = 2

Definition at line 573 of file instrset.h.

◆ perm_punpckh

const int perm_punpckh = 0x100

Definition at line 580 of file instrset.h.

◆ perm_punpckl

const int perm_punpckl = 0x200

Definition at line 581 of file instrset.h.

◆ perm_rot_count

const int perm_rot_count = 32

Definition at line 595 of file instrset.h.

◆ perm_rotate

const int perm_rotate = 0x400

Definition at line 582 of file instrset.h.

◆ perm_rotate_big

const int perm_rotate_big = 0x4000

Definition at line 588 of file instrset.h.

◆ perm_same_pattern

const int perm_same_pattern = 0x80

Definition at line 579 of file instrset.h.

◆ perm_shleft

const int perm_shleft = 0x2000

Definition at line 586 of file instrset.h.

◆ perm_shright

const int perm_shright = 0x1000

Definition at line 584 of file instrset.h.

◆ perm_zeroing

const int perm_zeroing = 1

Definition at line 572 of file instrset.h.

◆ perm_zext

const int perm_zext = 0x10000

Definition at line 591 of file instrset.h.

◆ V_DC

constexpr int V_DC = -256

Definition at line 220 of file instrset.h.