The Gaudi Framework  master (82fdf313)
Loading...
Searching...
No Matches
instrset.h File Reference
#include <stdint.h>
#include <stdlib.h>
Include dependency graph for instrset.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  Const_int_t< n >
 
class  Const_uint_t< n >
 
struct  EList< T, N >
 

Macros

#define ALLOW_FP_PERMUTE   true
 
#define INSTRSET   0
 
#define const_int(n)
 
#define const_uint(n)
 

Functions

int instrset_detect (void)
 
bool hasFMA3 (void)
 
bool hasFMA4 (void)
 
bool hasXOP (void)
 
bool hasAVX512ER (void)
 
bool hasAVX512VBMI (void)
 
bool hasAVX512VBMI2 (void)
 
int physicalProcessors (int *logical_processors=0)
 
constexpr int bit_scan_reverse_const (uint64_t const n)
 
template<typename V>
constexpr auto get_inttype ()
 
template<int N>
constexpr auto zero_mask (int const (&a)[N])
 
template<typename V>
constexpr auto zero_mask_broad (int const (&A)[V::size()])
 
template<int N, int B>
constexpr uint64_t make_bit_mask (int const (&a)[N])
 
template<typename V>
constexpr auto make_broad_mask (uint64_t const m)
 
template<typename V>
constexpr auto perm_mask_broad (int const (&A)[V::size()])
 
template<typename V>
constexpr uint64_t perm_flags (int const (&a)[V::size()])
 
template<int N>
constexpr uint64_t compress_mask (int const (&a)[N])
 
template<int N>
constexpr uint64_t expand_mask (int const (&a)[N])
 
template<typename V>
constexpr uint64_t perm16_flags (int const (&a)[V::size()])
 
template<typename V, int oppos = 0>
constexpr auto pshufb_mask (int const (&A)[V::size()])
 
template<int N>
constexpr EList< int, N/2 > largeblock_perm (int const (&a)[N])
 
template<typename V>
constexpr uint64_t blend_flags (int const (&a)[V::size()])
 
template<int N, int dozero>
constexpr EList< int, 2 *N > blend_perm_indexes (int const (&a)[N])
 
template<int N>
constexpr EList< int, N/2 > largeblock_indexes (int const (&a)[N])
 
template<typename dummy>
void blend2 ()
 
template<typename dummy>
void blend4 ()
 
template<typename dummy>
void blend8 ()
 
template<typename dummy>
void blend16 ()
 
template<typename dummy>
void blend32 ()
 
template<int N, int dozero, int src1, int src2>
constexpr EList< int, N > blend_half_indexes (int const (&a)[N])
 
template<typename W, int... i0>
auto blend_half (W const &a, W const &b)
 

Variables

constexpr int V_DC = -256
 
const int perm_zeroing = 1
 
const int perm_perm = 2
 
const int perm_allzero = 4
 
const int perm_largeblock = 8
 
const int perm_addz = 0x10
 
const int perm_addz2 = 0x20
 
const int perm_cross_lane = 0x40
 
const int perm_same_pattern = 0x80
 
const int perm_punpckh = 0x100
 
const int perm_punpckl = 0x200
 
const int perm_rotate
 
const int perm_shright
 
const int perm_shleft
 
const int perm_rotate_big
 
const int perm_broadcast = 0x8000
 
const int perm_zext = 0x10000
 
const int perm_compress = 0x20000
 
const int perm_expand = 0x40000
 
const int perm_outofrange = 0x10000000
 
const int perm_rot_count = 32
 
const int perm_ipattern
 
const int blend_zeroing = 1
 
const int blend_allzero = 2
 
const int blend_largeblock = 4
 
const int blend_addz = 8
 
const int blend_a = 0x10
 
const int blend_b = 0x20
 
const int blend_perma = 0x40
 
const int blend_permb = 0x80
 
const int blend_cross_lane = 0x100
 
const int blend_same_pattern = 0x200
 
const int blend_punpckhab = 0x1000
 
const int blend_punpckhba = 0x2000
 
const int blend_punpcklab = 0x4000
 
const int blend_punpcklba = 0x8000
 
const int blend_rotateab = 0x10000
 
const int blend_rotateba = 0x20000
 
const int blend_shufab = 0x40000
 
const int blend_shufba = 0x80000
 
const int blend_rotate_big = 0x100000
 
const int blend_outofrange = 0x10000000
 
const int blend_shufpattern = 32
 
const int blend_rotpattern = 40
 

Macro Definition Documentation

◆ ALLOW_FP_PERMUTE

#define ALLOW_FP_PERMUTE   true

Definition at line 28 of file instrset.h.

◆ const_int

#define const_int ( n)
Value:

Definition at line 404 of file instrset.h.

◆ const_uint

#define const_uint ( n)
Value:

Definition at line 405 of file instrset.h.

◆ INSTRSET

#define INSTRSET   0

Definition at line 76 of file instrset.h.

Function Documentation

◆ bit_scan_reverse_const()

int bit_scan_reverse_const ( uint64_t const n)
constexpr

Definition at line 379 of file instrset.h.

379 {
380 if ( n == 0 ) return -1;
381 uint64_t a = n, b = 0, j = 64, k = 0;
382 do {
383 j >>= 1;
384 k = (uint64_t)1 << j;
385 if ( a >= k ) {
386 a >>= j;
387 b += j;
388 }
389 } while ( j > 0 );
390 return int( b );
391 }

◆ blend16()

template<typename dummy>
void blend16 ( )

Definition at line 1296 of file instrset.h.

1296{}

◆ blend2()

template<typename dummy>
void blend2 ( )

Definition at line 1290 of file instrset.h.

1290{}

◆ blend32()

template<typename dummy>
void blend32 ( )

Definition at line 1298 of file instrset.h.

1298{}

◆ blend4()

template<typename dummy>
void blend4 ( )

Definition at line 1292 of file instrset.h.

1292{}

◆ blend8()

template<typename dummy>
void blend8 ( )

Definition at line 1294 of file instrset.h.

1294{}

◆ blend_flags()

template<typename V>
uint64_t blend_flags ( int const (&a)[V::size()])
constexpr

Definition at line 1035 of file instrset.h.

1035 {
1036 // a is a reference to a constexpr array of permutation indexes
1037 // V is a vector class
1038 constexpr int N = V::size(); // number of elements
1039 uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value
1040 uint32_t iu = 0; // loop counter
1041 int32_t ii = 0; // loop counter
1042 int ix = 0; // index number i
1043 const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes
1044 const uint32_t lanesize = N / nlanes; // elements per lane
1045 uint32_t lane = 0; // current lane
1046 uint32_t rot = 999; // rotate left count
1047 int lanepattern[lanesize] = { 0 }; // pattern in each lane
1048 if ( lanesize == 2 && N <= 8 ) {
1049 r |= blend_shufab | blend_shufba; // check if it fits shufpd
1050 }
1051
1052 for ( ii = 0; ii < N; ii++ ) { // loop through indexes
1053 ix = a[ii]; // index
1054 if ( ix < 0 ) {
1055 if ( ix == -1 )
1056 r |= blend_zeroing; // set to zero
1057 else if ( ix != V_DC ) {
1058 r = blend_outofrange;
1059 break; // illegal index
1060 }
1061 } else { // ix >= 0
1062 r &= ~blend_allzero;
1063 if ( ix < N ) {
1064 r |= blend_a; // data from a
1065 if ( ix != ii ) r |= blend_perma; // permutation of a
1066 } else if ( ix < 2 * N ) {
1067 r |= blend_b; // data from b
1068 if ( ix != ii + N ) r |= blend_permb; // permutation of b
1069 } else {
1070 r = blend_outofrange;
1071 break; // illegal index
1072 }
1073 }
1074 // check if pattern fits a larger block size:
1075 // even indexes must be even, odd indexes must fit the preceding even index + 1
1076 if ( ( ii & 1 ) == 0 ) { // even index
1077 if ( ix >= 0 && ( ix & 1 ) ) r &= ~blend_largeblock; // not even. does not fit larger block size
1078 int iy = a[ii + 1]; // next odd index
1079 if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~blend_largeblock; // not odd. does not fit larger block size
1080 if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~blend_largeblock; // does not fit preceding index + 1
1081 if ( ix == -1 && iy >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
1082 if ( iy == -1 && ix >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
1083 }
1084 lane = (uint32_t)ii / lanesize; // current lane
1085 if ( lane == 0 ) { // first lane, or no pattern yet
1086 lanepattern[ii] = ix; // save pattern
1087 }
1088 // check if crossing lanes
1089 if ( ix >= 0 ) {
1090 uint32_t lanei = uint32_t( ix & ~N ) / lanesize; // source lane
1091 if ( lanei != lane ) {
1092 r |= blend_cross_lane; // crossing lane
1093 }
1094 if ( lanesize == 2 ) { // check if it fits shufpd
1095 if ( lanei != lane ) r &= ~( blend_shufab | blend_shufba );
1096 if ( ( ( ( ix & N ) != 0 ) ^ ii ) & 1 )
1097 r &= ~blend_shufab;
1098 else
1099 r &= ~blend_shufba;
1100 }
1101 }
1102 // check if same pattern in all lanes
1103 if ( lane != 0 && ix >= 0 ) { // not first lane
1104 int j = ii - int( lane * lanesize ); // index into lanepattern
1105 int jx = ix - int( lane * lanesize ); // pattern within lane
1106 if ( jx < 0 || ( jx & ~N ) >= (int)lanesize ) r &= ~blend_same_pattern; // source is in another lane
1107 if ( lanepattern[j] < 0 ) {
1108 lanepattern[j] = jx; // pattern not known from previous lane
1109 } else {
1110 if ( lanepattern[j] != jx ) r &= ~blend_same_pattern; // not same pattern
1111 }
1112 }
1113 }
1114 if ( !( r & blend_largeblock ) ) r &= ~blend_addz; // remove irrelevant flag
1115 if ( r & blend_cross_lane ) r &= ~blend_same_pattern; // remove irrelevant flag
1116 if ( !( r & ( blend_perma | blend_permb ) ) ) {
1117 return r; // no permutation. more checks are superfluous
1118 }
1119 if ( r & blend_same_pattern ) {
1120 // same pattern in all lanes. check if it fits unpack patterns
1122 for ( iu = 0; iu < lanesize; iu++ ) { // loop through lanepattern
1123 ix = lanepattern[iu];
1124 if ( ix >= 0 ) {
1125 if ( (uint32_t)ix != iu / 2 + ( iu & 1 ) * N ) r &= ~blend_punpcklab;
1126 if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpcklba;
1127 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) * N ) r &= ~blend_punpckhab;
1128 if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpckhba;
1129 }
1130 }
1131#if INSTRSET >= 4 // SSSE3. check if it fits palignr
1132 for ( iu = 0; iu < lanesize; iu++ ) {
1133 ix = lanepattern[iu];
1134 if ( ix >= 0 ) {
1135 uint32_t t = ix & ~N;
1136 if ( ix & N ) t += lanesize;
1137 uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 );
1138 if ( rot == 999 ) {
1139 rot = tb;
1140 } else { // check if fit
1141 if ( rot != tb ) rot = 1000;
1142 }
1143 }
1144 }
1145 if ( rot < 999 ) { // fits palignr
1146 if ( rot < lanesize ) {
1147 r |= blend_rotateba;
1148 } else {
1149 r |= blend_rotateab;
1150 }
1151 const uint32_t elementsize = sizeof( V ) / N;
1152 r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) << blend_rotpattern;
1153 }
1154#endif
1155 if ( lanesize == 4 ) {
1156 // check if it fits shufps
1158 for ( ii = 0; ii < 2; ii++ ) {
1159 ix = lanepattern[ii];
1160 if ( ix >= 0 ) {
1161 if ( ix & N )
1162 r &= ~blend_shufab;
1163 else
1164 r &= ~blend_shufba;
1165 }
1166 }
1167 for ( ; ii < 4; ii++ ) {
1168 ix = lanepattern[ii];
1169 if ( ix >= 0 ) {
1170 if ( ix & N )
1171 r &= ~blend_shufba;
1172 else
1173 r &= ~blend_shufab;
1174 }
1175 }
1176 if ( r & ( blend_shufab | blend_shufba ) ) { // fits shufps/shufpd
1177 uint8_t shufpattern = 0; // get pattern
1178 for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
1179 r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern
1180 }
1181 }
1182 } else if ( nlanes > 1 ) { // not same pattern in all lanes
1183 rot = 999; // check if it fits big rotate
1184 for ( ii = 0; ii < N; ii++ ) {
1185 ix = a[ii];
1186 if ( ix >= 0 ) {
1187 uint32_t rot2 = ( ix + 2 * N - ii ) % ( 2 * N ); // rotate count
1188 if ( rot == 999 ) {
1189 rot = rot2; // save rotate count
1190 } else if ( rot != rot2 ) {
1191 rot = 1000;
1192 break; // does not fit big rotate
1193 }
1194 }
1195 }
1196 if ( rot < 2 * N ) { // fits big rotate
1197 r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern;
1198 }
1199 }
1200 if ( lanesize == 2 && ( r & ( blend_shufab | blend_shufba ) ) ) { // fits shufpd. Get pattern
1201 for ( ii = 0; ii < N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << ( blend_shufpattern + ii ); }
1202 }
1203 return r;
1204 }
const int blend_rotpattern
Definition instrset.h:1032
const int blend_permb
Definition instrset.h:1018
const int blend_punpckhba
Definition instrset.h:1022
const int blend_shufpattern
Definition instrset.h:1031
const int blend_rotateab
Definition instrset.h:1025
const int blend_cross_lane
Definition instrset.h:1019
const int blend_rotateba
Definition instrset.h:1026
const int blend_largeblock
Definition instrset.h:1013
const int blend_a
Definition instrset.h:1015
const int blend_addz
Definition instrset.h:1014
const int blend_shufba
Definition instrset.h:1028
const int blend_rotate_big
Definition instrset.h:1029
const int blend_perma
Definition instrset.h:1017
const int blend_outofrange
Definition instrset.h:1030
const int blend_same_pattern
Definition instrset.h:1020
const int blend_punpcklab
Definition instrset.h:1023
const int blend_allzero
Definition instrset.h:1012
const int blend_punpckhab
Definition instrset.h:1021
const int blend_zeroing
Definition instrset.h:1011
constexpr int V_DC
Definition instrset.h:219
const int blend_punpcklba
Definition instrset.h:1024
const int blend_shufab
Definition instrset.h:1027
const int blend_b
Definition instrset.h:1016
int N
Definition IOTest.py:112

◆ blend_half()

template<typename W, int... i0>
auto blend_half ( W const & a,
W const & b )

Definition at line 1351 of file instrset.h.

1351 {
1352 typedef decltype( a.get_low() ) V; // type for half-size vector
1353 constexpr int N = V::size(); // size of half-size vector
1354 static_assert( sizeof...( i0 ) == N, "wrong number of indexes in blend_half" );
1355 constexpr int ind[N] = { i0... }; // array of indexes
1356
1357 // lambda to find which of the four possible sources are used
1358 // return: EList<int, 5> containing a list of up to 4 sources. The last element is the number of sources used
1359 auto listsources = []( int const n, int const( &ind )[N] ) constexpr {
1360 bool source_used[4] = { false, false, false, false }; // list of sources used
1361 int i = 0;
1362 for ( i = 0; i < n; i++ ) {
1363 int ix = ind[i]; // index
1364 if ( ix >= 0 ) {
1365 int src = ix / n; // source used
1366 source_used[src & 3] = true;
1367 }
1368 }
1369 // return a list of sources used. The last element is the number of sources used
1370 EList<int, 5> sources = { { 0 } };
1371 int nsrc = 0; // number of sources
1372 for ( i = 0; i < 4; i++ ) {
1373 if ( source_used[i] ) { sources.a[nsrc++] = i; }
1374 }
1375 sources.a[4] = nsrc;
1376 return sources;
1377 };
1378 // list of sources used
1379 constexpr EList<int, 5> sources = listsources( N, ind );
1380 constexpr int nsrc = sources.a[4]; // number of sources used
1381
1382 if constexpr ( nsrc == 0 ) { // no sources
1383 return V( 0 );
1384 }
1385 // get indexes for the first one or two sources
1386 constexpr int uindex = ( nsrc > 2 ) ? 1 : 2; // unused elements set to zero if two blends are combined
1387 constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>( ind );
1388 V x0;
1389 V src0 = selectblend<W, sources.a[0]>( a, b ); // first source
1390 V src1 = selectblend<W, sources.a[1]>( a, b ); // second source
1391 if constexpr ( N == 2 ) {
1392 x0 = blend2<L.a[0], L.a[1]>( src0, src1 );
1393 } else if constexpr ( N == 4 ) {
1394 x0 = blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
1395 } else if constexpr ( N == 8 ) {
1396 x0 = blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
1397 } else if constexpr ( N == 16 ) {
1398 x0 = blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1399 L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
1400 } else if constexpr ( N == 32 ) {
1401 x0 = blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
1402 L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
1403 L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
1404 }
1405 if constexpr ( nsrc > 2 ) { // get last one or two sources
1406 constexpr EList<int, N> M = blend_half_indexes<N, 1, sources.a[2], sources.a[3]>( ind );
1407 V x1;
1408 V src2 = selectblend<W, sources.a[2]>( a, b ); // third source
1409 V src3 = selectblend<W, sources.a[3]>( a, b ); // fourth source
1410 if constexpr ( N == 2 ) {
1411 x1 = blend2<M.a[0], M.a[1]>( src0, src1 );
1412 } else if constexpr ( N == 4 ) {
1413 x1 = blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
1414 } else if constexpr ( N == 8 ) {
1415 x1 = blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
1416 } else if constexpr ( N == 16 ) {
1417 x1 = blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1418 M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
1419 } else if constexpr ( N == 32 ) {
1420 x1 = blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
1421 M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
1422 M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
1423 }
1424 x0 |= x1; // combine result of two blends. Unused elements are zero
1425 }
1426 return x0;
1427 }
void blend8()
Definition instrset.h:1294
void blend2()
Definition instrset.h:1290
void blend32()
Definition instrset.h:1298
void blend16()
Definition instrset.h:1296
void blend4()
Definition instrset.h:1292
constexpr EList< int, N > blend_half_indexes(int const (&a)[N])
Definition instrset.h:1307
constexpr double L
T a[N]
Definition instrset.h:461

◆ blend_half_indexes()

template<int N, int dozero, int src1, int src2>
EList< int, N > blend_half_indexes ( int const (&a)[N])
constexpr

Definition at line 1307 of file instrset.h.

1307 {
1308 // a is a reference to a constexpr array of permutation indexes
1309 EList<int, N> list = { { 0 } }; // list to return
1310 int u = dozero ? -1 : V_DC; // value to use for unused entries
1311 int j = 0; // loop counter
1312
1313 for ( j = 0; j < N; j++ ) { // loop through indexes
1314 int ix = a[j]; // current index
1315 if ( ix < 0 ) { // zero or don't care
1316 list.a[j] = ( dozero == 2 ) ? ix : u;
1317 } else {
1318 int src = ix / N; // source
1319 if ( src == src1 ) {
1320 list.a[j] = ix & ( N - 1 );
1321 } else if ( src == src2 ) {
1322 list.a[j] = ( ix & ( N - 1 ) ) + N;
1323 } else
1324 list.a[j] = u;
1325 }
1326 }
1327 return list;
1328 }

◆ blend_perm_indexes()

template<int N, int dozero>
EList< int, 2 *N > blend_perm_indexes ( int const (&a)[N])
constexpr

Definition at line 1212 of file instrset.h.

1212 {
1213 // a is a reference to a constexpr array of permutation indexes
1214 EList<int, 2 * N> list = { { 0 } }; // list to return
1215 int u = dozero ? -1 : V_DC; // value to use for unused entries
1216 int j = 0;
1217
1218 for ( j = 0; j < N; j++ ) { // loop through indexes
1219 int ix = a[j]; // current index
1220 if ( ix < 0 ) { // zero or don't care
1221 if ( dozero == 2 ) {
1222 // list.a[j] = list.a[j + N] = ix; // fails in gcc in complicated cases
1223 list.a[j] = ix;
1224 list.a[j + N] = ix;
1225 } else {
1226 // list.a[j] = list.a[j + N] = u;
1227 list.a[j] = u;
1228 list.a[j + N] = u;
1229 }
1230 } else if ( ix < N ) { // value from a
1231 list.a[j] = ix;
1232 list.a[j + N] = u;
1233 } else {
1234 list.a[j] = u; // value from b
1235 list.a[j + N] = ix - N;
1236 }
1237 }
1238 return list;
1239 }

◆ compress_mask()

template<int N>
uint64_t compress_mask ( int const (&a)[N])
constexpr

Definition at line 822 of file instrset.h.

822 {
823 // a is a reference to a constexpr array of permutation indexes
824 int ix = 0, lasti = -1, lastp = -1;
825 uint64_t m = 0;
826 int i = 0;
827 int j = 1; // loop counters
828 for ( i = 0; i < N; i++ ) {
829 ix = a[i]; // permutation index
830 if ( ix >= 0 ) {
831 m |= (uint64_t)1 << ix; // mask for compression source
832 for ( j = 1; j < i - lastp; j++ ) {
833 m |= (uint64_t)1 << ( lasti + j ); // dummy filling source
834 }
835 lastp = i;
836 lasti = ix;
837 }
838 }
839 return m;
840 }
constexpr double m

◆ expand_mask()

template<int N>
uint64_t expand_mask ( int const (&a)[N])
constexpr

Definition at line 846 of file instrset.h.

846 {
847 // a is a reference to a constexpr array of permutation indexes
848 int ix = 0, lasti = -1, lastp = -1;
849 uint64_t m = 0;
850 int i = 0;
851 int j = 1;
852 for ( i = 0; i < N; i++ ) {
853 ix = a[i]; // permutation index
854 if ( ix >= 0 ) {
855 m |= (uint64_t)1 << i; // mask for expansion destination
856 for ( j = 1; j < ix - lasti; j++ ) {
857 m |= (uint64_t)1 << ( lastp + j ); // dummy filling destination
858 }
859 lastp = i;
860 lasti = ix;
861 }
862 }
863 return m;
864 }

◆ get_inttype()

template<typename V>
auto get_inttype ( )
constexpr

Definition at line 467 of file instrset.h.

467 {
468 constexpr int elementsize = sizeof( V ) / V::size(); // size of vector elements
469
470 if constexpr ( elementsize >= 8 ) {
471 return -int64_t( 1 );
472 } else if constexpr ( elementsize >= 4 ) {
473 return int32_t( -1 );
474 } else if constexpr ( elementsize >= 2 ) {
475 return int16_t( -1 );
476 } else {
477 return int8_t( -1 );
478 }
479 }

◆ hasAVX512ER()

bool hasAVX512ER ( void )

Definition at line 142 of file instrset_detect.cpp.

142 {
143 if ( instrset_detect() < 9 ) return false; // must have AVX512F
144 int abcd[4]; // cpuid results
145 cpuid( abcd, 7 ); // call cpuid function 7
146 return ( ( abcd[1] & ( 1 << 27 ) ) != 0 ); // ebx bit 27 indicates AVX512ER
147 }
int instrset_detect(void)

◆ hasAVX512VBMI()

bool hasAVX512VBMI ( void )

Definition at line 150 of file instrset_detect.cpp.

150 {
151 if ( instrset_detect() < 10 ) return false; // must have AVX512BW
152 int abcd[4]; // cpuid results
153 cpuid( abcd, 7 ); // call cpuid function 7
154 return ( ( abcd[2] & ( 1 << 1 ) ) != 0 ); // ecx bit 1 indicates AVX512VBMI
155 }

◆ hasAVX512VBMI2()

bool hasAVX512VBMI2 ( void )

Definition at line 158 of file instrset_detect.cpp.

158 {
159 if ( instrset_detect() < 10 ) return false; // must have AVX512BW
160 int abcd[4]; // cpuid results
161 cpuid( abcd, 7 ); // call cpuid function 7
162 return ( ( abcd[2] & ( 1 << 6 ) ) != 0 ); // ecx bit 6 indicates AVX512VBMI2
163 }

◆ hasFMA3()

bool hasFMA3 ( void )

Definition at line 110 of file instrset_detect.cpp.

110 {
111 if ( instrset_detect() < 7 ) return false; // must have AVX
112 int abcd[4]; // cpuid results
113 cpuid( abcd, 1 ); // call cpuid function 1
114 return ( ( abcd[2] & ( 1 << 12 ) ) != 0 ); // ecx bit 12 indicates FMA3
115 }

◆ hasFMA4()

bool hasFMA4 ( void )

Definition at line 118 of file instrset_detect.cpp.

118 {
119 if ( instrset_detect() < 7 ) return false; // must have AVX
120 int abcd[4]; // cpuid results
121 cpuid( abcd, 0x80000001 ); // call cpuid function 0x80000001
122 return ( ( abcd[2] & ( 1 << 16 ) ) != 0 ); // ecx bit 16 indicates FMA4
123 }

◆ hasXOP()

bool hasXOP ( void )

Definition at line 126 of file instrset_detect.cpp.

126 {
127 if ( instrset_detect() < 7 ) return false; // must have AVX
128 int abcd[4]; // cpuid results
129 cpuid( abcd, 0x80000001 ); // call cpuid function 0x80000001
130 return ( ( abcd[2] & ( 1 << 11 ) ) != 0 ); // ecx bit 11 indicates XOP
131 }

◆ instrset_detect()

int instrset_detect ( void )

Definition at line 63 of file instrset_detect.cpp.

63 {
64
65 static int iset = -1; // remember value for next call
66 if ( iset >= 0 ) {
67 return iset; // called before
68 }
69 iset = 0; // default value
70 int abcd[4] = { 0, 0, 0, 0 }; // cpuid results
71 cpuid( abcd, 0 ); // call cpuid function 0
72 if ( abcd[0] == 0 ) return iset; // no further cpuid function supported
73 cpuid( abcd, 1 ); // call cpuid function 1 for feature flags
74 if ( ( abcd[3] & ( 1 << 0 ) ) == 0 ) return iset; // no floating point
75 if ( ( abcd[3] & ( 1 << 23 ) ) == 0 ) return iset; // no MMX
76 if ( ( abcd[3] & ( 1 << 15 ) ) == 0 ) return iset; // no conditional move
77 if ( ( abcd[3] & ( 1 << 24 ) ) == 0 ) return iset; // no FXSAVE
78 if ( ( abcd[3] & ( 1 << 25 ) ) == 0 ) return iset; // no SSE
79 iset = 1; // 1: SSE supported
80 if ( ( abcd[3] & ( 1 << 26 ) ) == 0 ) return iset; // no SSE2
81 iset = 2; // 2: SSE2 supported
82 if ( ( abcd[2] & ( 1 << 0 ) ) == 0 ) return iset; // no SSE3
83 iset = 3; // 3: SSE3 supported
84 if ( ( abcd[2] & ( 1 << 9 ) ) == 0 ) return iset; // no SSSE3
85 iset = 4; // 4: SSSE3 supported
86 if ( ( abcd[2] & ( 1 << 19 ) ) == 0 ) return iset; // no SSE4.1
87 iset = 5; // 5: SSE4.1 supported
88 if ( ( abcd[2] & ( 1 << 23 ) ) == 0 ) return iset; // no POPCNT
89 if ( ( abcd[2] & ( 1 << 20 ) ) == 0 ) return iset; // no SSE4.2
90 iset = 6; // 6: SSE4.2 supported
91 if ( ( abcd[2] & ( 1 << 27 ) ) == 0 ) return iset; // no OSXSAVE
92 if ( ( xgetbv( 0 ) & 6 ) != 6 ) return iset; // AVX not enabled in O.S.
93 if ( ( abcd[2] & ( 1 << 28 ) ) == 0 ) return iset; // no AVX
94 iset = 7; // 7: AVX supported
95 cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags
96 if ( ( abcd[1] & ( 1 << 5 ) ) == 0 ) return iset; // no AVX2
97 iset = 8;
98 if ( ( abcd[1] & ( 1 << 16 ) ) == 0 ) return iset; // no AVX512
99 cpuid( abcd, 0xD ); // call cpuid leaf 0xD for feature flags
100 if ( ( abcd[0] & 0x60 ) != 0x60 ) return iset; // no AVX512
101 iset = 9;
102 cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags
103 if ( ( abcd[1] & ( 1 << 31 ) ) == 0 ) return iset; // no AVX512VL
104 if ( ( abcd[1] & 0x40020000 ) != 0x40020000 ) return iset; // no AVX512BW, AVX512DQ
105 iset = 10;
106 return iset;
107 }

◆ largeblock_indexes()

template<int N>
EList< int, N/2 > largeblock_indexes ( int const (&a)[N])
constexpr

Definition at line 1247 of file instrset.h.

1247 {
1248 // Parameter a is a reference to a constexpr array of N permutation indexes
1249 EList<int, N / 2> list = { { 0 } }; // list to return
1250
1251 bool fit_addz = false; // additional zeroing needed at the lower block level
1252 int ix = 0; // even index
1253 int iy = 0; // odd index
1254 int iz = 0; // combined index
1255 int i = 0; // loop counter
1256
1257 for ( i = 0; i < N; i += 2 ) {
1258 ix = a[i]; // even index
1259 iy = a[i + 1]; // odd index
1260 if ( ix >= 0 ) {
1261 iz = ix / 2; // half index
1262 } else if ( iy >= 0 ) {
1263 iz = iy / 2; // half index
1264 } else
1265 iz = ix | iy; // -1 or V_DC. -1 takes precedence
1266 list.a[i / 2] = iz; // save to list
1267 // check if additional zeroing is needed at current block size
1268 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
1269 }
1270 // replace -1 by V_DC if fit_addz
1271 if ( fit_addz ) {
1272 for ( i = 0; i < N / 2; i++ ) {
1273 if ( list.a[i] < 0 ) list.a[i] = V_DC;
1274 }
1275 }
1276 return list;
1277 }

◆ largeblock_perm()

template<int N>
EList< int, N/2 > largeblock_perm ( int const (&a)[N])
constexpr

Definition at line 976 of file instrset.h.

976 {
977 // Parameter a is a reference to a constexpr array of permutation indexes
978 EList<int, N / 2> list = { { 0 } }; // result indexes
979 int ix = 0; // even index
980 int iy = 0; // odd index
981 int iz = 0; // combined index
982 bool fit_addz = false; // additional zeroing needed at the lower block level
983 int i = 0; // loop counter
984
985 // check if additional zeroing is needed at current block size
986 for ( i = 0; i < N; i += 2 ) {
987 ix = a[i]; // even index
988 iy = a[i + 1]; // odd index
989 if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
990 }
991
992 // loop through indexes
993 for ( i = 0; i < N; i += 2 ) {
994 ix = a[i]; // even index
995 iy = a[i + 1]; // odd index
996 if ( ix >= 0 ) {
997 iz = ix / 2; // half index
998 } else if ( iy >= 0 ) {
999 iz = iy / 2;
1000 } else {
1001 iz = ix | iy; // -1 or V_DC. -1 takes precedence
1002 if ( fit_addz ) iz = V_DC; // V_DC, because result will be zeroed later
1003 }
1004 list.a[i / 2] = iz; // save to list
1005 }
1006 return list;
1007 }

◆ make_bit_mask()

template<int N, int B>
uint64_t make_bit_mask ( int const (&a)[N])
constexpr

Definition at line 523 of file instrset.h.

523 {
524 uint64_t r = 0; // return value
525 uint8_t j = uint8_t( B & 0xFF ); // index to selected bit
526 uint64_t s = 0; // bit number i in r
527 uint64_t f = 0; // 1 if bit not flipped
528 int i = 0;
529 for ( i = 0; i < N; i++ ) {
530 int ix = a[i];
531 if ( ix < 0 ) { // -1 or V_DC
532 s = ( B >> 10 ) & 1;
533 } else {
534 s = ( (uint32_t)ix >> j ) & 1; // extract selected bit
535 if ( i < N / 2 ) {
536 f = ( B >> 8 ) & 1; // lower half
537 } else {
538 f = ( B >> 9 ) & 1; // upper half
539 }
540 s ^= f ^ 1; // flip bit if needed
541 }
542 r |= uint64_t( s ) << i; // set bit in return value
543 }
544 return r;
545 }

◆ make_broad_mask()

template<typename V>
auto make_broad_mask ( uint64_t const m)
constexpr

Definition at line 550 of file instrset.h.

550 {
551 constexpr int N = V::size(); // number of vector elements
552 typedef decltype( get_inttype<V>() ) Etype; // element type
553 EList<Etype, N> u = { { 0 } }; // list for returning
554 int i = 0;
555 for ( i = 0; i < N; i++ ) { u.a[i] = ( ( m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; }
556 return u; // return encapsulated array
557 }
constexpr auto get_inttype()
Definition instrset.h:467

◆ perm16_flags()

template<typename V>
uint64_t perm16_flags ( int const (&a)[V::size()])
constexpr

Definition at line 874 of file instrset.h.

874 {
875 // a is a reference to a constexpr array of permutation indexes
876 // V is a vector class
877 constexpr int N = V::size(); // number of elements
878
879 uint64_t retval = 0; // return value
880 uint32_t pat[4] = { 0, 0, 0, 0 }; // permute patterns
881 uint32_t i = 0; // loop counter
882 int ix = 0; // index number i
883 const uint32_t lanesize = 8; // elements per lane
884 uint32_t lane = 0; // current lane
885 int lanepattern[lanesize] = { 0 }; // pattern in each lane
886
887 for ( i = 0; i < N; i++ ) {
888 ix = a[i];
889 lane = i / lanesize; // current lane
890 if ( lane == 0 ) {
891 lanepattern[i] = ix; // save pattern
892 } else if ( ix >= 0 ) { // not first lane
893 uint32_t j = i - lane * lanesize; // index into lanepattern
894 int jx = ix - lane * lanesize; // pattern within lane
895 if ( lanepattern[j] < 0 ) {
896 lanepattern[j] = jx; // pattern not known from previous lane
897 }
898 }
899 }
900 // four patterns: low2low, high2high, high2low, low2high
901 for ( i = 0; i < 4; i++ ) {
902 // loop through low pattern
903 if ( lanepattern[i] >= 0 ) {
904 if ( lanepattern[i] < 4 ) { // low2low
905 retval |= 1;
906 pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
907 } else { // high2low
908 retval |= 4;
909 pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
910 }
911 }
912 // loop through high pattern
913 if ( lanepattern[i + 4] >= 0 ) {
914 if ( lanepattern[i + 4] < 4 ) { // low2high
915 retval |= 8;
916 pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
917 } else { // high2high
918 retval |= 2;
919 pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
920 }
921 }
922 }
923 // join return data
924 for ( i = 0; i < 4; i++ ) { retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); }
925 return retval;
926 }

◆ perm_flags()

template<typename V>
uint64_t perm_flags ( int const (&a)[V::size()] )
constexpr

Definition at line 601 of file instrset.h.

601 { // Analyzes the index array a and returns a word of perm_* flag bits, plus packed counts/patterns, telling which permutation methods fit
602 // a is a reference to a constexpr array of permutation indexes
603 // V is a vector class
604 constexpr int N = V::size(); // number of elements
605 uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value; these three flags start set and are cleared when disproved
606 uint32_t i = 0; // loop counter
607 int j = 0; // loop counter
608 int ix = 0; // index number i
609 const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes
610 const uint32_t lanesize = N / nlanes; // elements per lane
611 const uint32_t elementsize = sizeof( V ) / N; // size of each vector element
612 uint32_t lane = 0; // current lane
613 uint32_t rot = 999; // rotate count: (source - destination) modulo lanesize, or modulo N for the big rotate; 999 = not determined yet
614 int32_t broadc = 999; // index to broadcasted element; 999 = none seen yet, 1000 = does not fit broadcast
615 uint32_t patfail = 0; // remember certain patterns that do not fit: bit 0 = zero extension, bit 1 = compress, bit 2 = expand
616 uint32_t addz2 = 0; // remember certain patterns need extra zeroing: bit 0 = zero extension, bit 1 = compress, bit 2 = expand
617 int32_t compresslasti = -1; // last index in perm_compress fit
618 int32_t compresslastp = -1; // last position in perm_compress fit
619 int32_t expandlasti = -1; // last index in perm_expand fit
620 int32_t expandlastp = -1; // last position in perm_expand fit
621
622 int lanepattern[lanesize] = { 0 }; // pattern in each lane, relative to the start of the lane
623
624 for ( i = 0; i < N; i++ ) { // loop through indexes
625 ix = a[i]; // current index
626 // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute.
627 if ( ix == -1 ) {
628 r |= perm_zeroing; // zeroing requested
629 } else if ( ix != V_DC && uint32_t( ix ) >= N ) {
630 r |= perm_outofrange; // index out of range (the unsigned cast also catches negative values other than -1 and V_DC)
631 }
632 if ( ix >= 0 ) {
633 r &= ~perm_allzero; // not all zero
634 if ( ix != (int)i ) r |= perm_perm; // needs permutation
635 if ( broadc == 999 )
636 broadc = ix; // remember broadcast index
637 else if ( broadc != ix )
638 broadc = 1000; // does not fit broadcast
639 }
640 // check if pattern fits a larger block size:
641 // even indexes must be even, odd indexes must fit the preceding even index + 1
642 if ( ( i & 1 ) == 0 ) { // even index
643 if ( ix >= 0 && ( ix & 1 ) ) r &= ~perm_largeblock; // not even. does not fit larger block size
644 int iy = a[i + 1]; // next odd index; assumes N is even so that i + 1 < N - TODO confirm
645 if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~perm_largeblock; // not odd. does not fit larger block size
646 if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~perm_largeblock; // does not fit preceding index + 1
647 if ( ix == -1 && iy >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
648 if ( iy == -1 && ix >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
649 }
650 lane = i / lanesize; // current lane
651 if ( lane == 0 ) { // first lane, or no pattern yet
652 lanepattern[i] = ix; // save pattern
653 }
654 // check if crossing lanes
655 if ( ix >= 0 ) {
656 uint32_t lanei = (uint32_t)ix / lanesize; // source lane
657 if ( lanei != lane ) r |= perm_cross_lane; // crossing lane
658 }
659 // check if same pattern in all lanes
660 if ( lane != 0 && ix >= 0 ) { // not first lane
661 int j1 = i - int( lane * lanesize ); // index into lanepattern
662 int jx = ix - int( lane * lanesize ); // pattern within lane
663 if ( jx < 0 || jx >= (int)lanesize ) r &= ~perm_same_pattern; // source is in another lane
664 if ( lanepattern[j1] < 0 ) {
665 lanepattern[j1] = jx; // pattern not known from previous lane
666 } else {
667 if ( lanepattern[j1] != jx ) r &= ~perm_same_pattern; // not same pattern
668 }
669 }
670 if ( ix >= 0 ) {
671 // check if pattern fits zero extension (perm_zext)
672 if ( uint32_t( ix * 2 ) != i ) {
673 patfail |= 1; // does not fit zero extension
674 }
675 // check if pattern fits compress (perm_compress)
676 if ( ix > compresslasti && ix - compresslasti >= (int)i - compresslastp ) { // indexes increase at least as fast as positions
677 if ( (int)i - compresslastp > 1 ) addz2 |= 2; // perm_compress may need additional zeroing
678 compresslasti = ix;
679 compresslastp = i;
680 } else {
681 patfail |= 2; // does not fit perm_compress
682 }
683 // check if pattern fits expand (perm_expand)
684 if ( ix > expandlasti && ix - expandlasti <= (int)i - expandlastp ) { // indexes increase no faster than positions
685 if ( ix - expandlasti > 1 ) addz2 |= 4; // perm_expand may need additional zeroing
686 expandlasti = ix;
687 expandlastp = i;
688 } else {
689 patfail |= 4; // does not fit perm_expand
690 }
691 } else if ( ix == -1 ) {
692 if ( ( i & 1 ) == 0 ) addz2 |= 1; // zero extension needs additional zeroing
693 }
694 }
695 if ( !( r & perm_perm ) ) return r; // more checks are superfluous
696
697 if ( !( r & perm_largeblock ) ) r &= ~perm_addz; // remove irrelevant flag
698 if ( r & perm_cross_lane ) r &= ~perm_same_pattern; // remove irrelevant flag
699 if ( ( patfail & 1 ) == 0 ) { // zext, compress and expand are mutually exclusive here, tested in this order
700 r |= perm_zext; // fits zero extension
701 if ( ( addz2 & 1 ) != 0 ) r |= perm_addz2;
702 } else if ( ( patfail & 2 ) == 0 ) {
703 r |= perm_compress; // fits compression
704 if ( ( addz2 & 2 ) != 0 ) { // check if additional zeroing needed
705 for ( j = 0; j < compresslastp; j++ ) { // only positions before the last matched position are inspected
706 if ( a[j] == -1 ) r |= perm_addz2;
707 }
708 }
709 } else if ( ( patfail & 4 ) == 0 ) {
710 r |= perm_expand; // fits expansion
711 if ( ( addz2 & 4 ) != 0 ) { // check if additional zeroing needed
712 for ( j = 0; j < expandlastp; j++ ) { // only positions before the last matched position are inspected
713 if ( a[j] == -1 ) r |= perm_addz2;
714 }
715 }
716 }
717
718 if ( r & perm_same_pattern ) {
719 // same pattern in all lanes. check if it fits specific patterns
720 bool fit = true;
721 // fit shift or rotate
722 for ( i = 0; i < lanesize; i++ ) {
723 if ( lanepattern[i] >= 0 ) {
724 uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize; // (source - destination) modulo lanesize
725 if ( rot == 999 ) {
726 rot = rot1; // first count found
727 } else { // check if fit
728 if ( rot != rot1 ) fit = false; // elements disagree on the count
729 }
730 }
731 }
732 rot &= lanesize - 1; // prevent out of range values
733 if ( fit ) { // fits rotate, and possibly shift
734 uint64_t rot2 = ( rot * elementsize ) & 0xF; // rotate right count in bytes, masked to 4 bits
735 r |= rot2 << perm_rot_count; // put shift/rotate count in the output field starting at bit perm_rot_count
736#if INSTRSET >= 4 // SSSE3
737 r |= perm_rotate; // allow palignr
738#endif
739 // fit shift left
740 fit = true;
741 for ( i = 0; i < lanesize - rot; i++ ) { // check if first (lanesize-rot) elements are zero or don't care
742 if ( lanepattern[i] >= 0 ) fit = false;
743 }
744 if ( fit ) {
745 r |= perm_shleft;
746 for ( ; i < lanesize; i++ ) // continue with the remaining elements
747 if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
748 }
749 // fit shift right
750 fit = true;
751 for ( i = lanesize - (uint32_t)rot; i < lanesize;
752 i++ ) { // check if last rot elements are zero or don't care
753 if ( lanepattern[i] >= 0 ) fit = false;
754 }
755 if ( fit ) {
756 r |= perm_shright;
757 for ( i = 0; i < lanesize - rot; i++ ) { // the elements in front of the last rot elements
758 if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
759 }
760 }
761 }
762 // fit punpckhi
763 fit = true;
764 uint32_t j2 = lanesize / 2; // expected source: lanesize/2, lanesize/2, lanesize/2+1, lanesize/2+1, ...
765 for ( i = 0; i < lanesize; i++ ) {
766 if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
767 if ( ( i & 1 ) != 0 ) j2++; // advance the expected source after every odd position
768 }
769 if ( fit ) r |= perm_punpckh;
770 // fit punpcklo
771 fit = true;
772 j2 = 0; // expected source: 0, 0, 1, 1, ...
773 for ( i = 0; i < lanesize; i++ ) {
774 if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
775 if ( ( i & 1 ) != 0 ) j2++; // advance the expected source after every odd position
776 }
777 if ( fit ) r |= perm_punpckl;
778 // fit pshufd
779 if ( elementsize >= 4 ) {
780 uint64_t p = 0; // pattern bits collected for the perm_ipattern field
781 for ( i = 0; i < lanesize; i++ ) {
782 if ( lanesize == 4 ) {
783 p |= ( lanepattern[i] & 3 ) << 2 * i; // 2 bits per element
784 } else { // lanesize = 2
785 p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i; // 4 bits per element: index 0 gives 0100b, index 1 gives 1110b
786 }
787 }
788 r |= p << perm_ipattern; // store the pattern in the output field starting at bit perm_ipattern
789 }
790 }
791#if INSTRSET >= 7
792 else { // not same pattern in all lanes
793 if constexpr ( nlanes > 1 ) { // Try if it fits big rotate
794 for ( i = 0; i < N; i++ ) {
795 ix = a[i];
796 if ( ix >= 0 ) {
797 uint32_t rot2 = ( ix + N - i ) % N; // rotate count
798 if ( rot == 999 ) {
799 rot = rot2; // save rotate count
800 } else if ( rot != rot2 ) {
801 rot = 1000; // marker value, not below N
802 break; // does not fit big rotate
803 }
804 }
805 }
806 if ( rot < N ) { // fits big rotate
807 r |= perm_rotate_big | (uint64_t)rot << perm_rot_count; // count is in elements, stored at bit perm_rot_count
808 }
809 }
810 }
811#endif
812 if ( broadc < 999 && ( r & ( perm_rotate | perm_shright | perm_shleft | perm_rotate_big ) ) == 0 ) { // only when the count field is not used by a rotate or shift
813 r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast; the broadcast index is stored at bit perm_rot_count
814 }
815 return r;
816 }
const int perm_compress
Definition instrset.h:593
const int perm_shleft
Definition instrset.h:587
const int perm_addz2
Definition instrset.h:578
const int perm_punpckl
Definition instrset.h:582
const int perm_rotate_big
Definition instrset.h:589
const int perm_broadcast
Definition instrset.h:591
const int perm_cross_lane
Definition instrset.h:579
const int perm_addz
Definition instrset.h:577
const int perm_zeroing
Definition instrset.h:573
const int perm_outofrange
Definition instrset.h:595
const int perm_largeblock
Definition instrset.h:576
const int perm_rotate
Definition instrset.h:583
const int perm_shright
Definition instrset.h:585
const int perm_punpckh
Definition instrset.h:581
const int perm_same_pattern
Definition instrset.h:580
const int perm_zext
Definition instrset.h:592
const int perm_perm
Definition instrset.h:574
const int perm_expand
Definition instrset.h:594
const int perm_allzero
Definition instrset.h:575
const int perm_ipattern
Definition instrset.h:597
const int perm_rot_count
Definition instrset.h:596

◆ perm_mask_broad()

template<typename V>
auto perm_mask_broad ( int const (&A)[V::size()] )
constexpr

Definition at line 562 of file instrset.h.

562 { // Converts the int index array A into an EList whose elements have the type returned by get_inttype<V>()
563 constexpr int N = V::size(); // number of vector elements
564 typedef decltype( get_inttype<V>() ) Etype; // vector element type
565 EList<Etype, N> u = { { 0 } }; // list for returning
566 int i = 0; // loop counter
567 for ( i = 0; i < N; i++ ) { u.a[i] = Etype( A[i] ); } // plain value conversion of each index, negative values included
568 return u; // return encapsulated array
569 }

◆ physicalProcessors()

int physicalProcessors ( int * logical_processors = 0)

◆ pshufb_mask()

template<typename V, int oppos = 0>
auto pshufb_mask ( int const (&A)[V::size()] )
constexpr

Definition at line 933 of file instrset.h.

933 { // Builds a table with one int8_t byte index per byte of V, lane by lane (presumably the control mask of a pshufb-style byte shuffle - confirm against callers)
934 // Parameter A is a reference to a constexpr array of permutation indexes
935 // V is a vector class
936 // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors
937 constexpr uint32_t N = V::size(); // number of vector elements
938 constexpr uint32_t elementsize = sizeof( V ) / N; // size of each vector element
939 constexpr uint32_t nlanes = sizeof( V ) / 16; // number of 128 bit lanes in vector
940 constexpr uint32_t elements_per_lane = N / nlanes; // number of vector elements per lane
941
942 EList<int8_t, sizeof( V )> u = { { 0 } }; // list for returning
943
944 uint32_t i = 0; // loop counters
945 uint32_t j = 0;
946 int m = 0; // running element index into A
947 int k = 0; // running byte index into u.a
948 uint32_t lane = 0; // current lane
949
950 for ( lane = 0; lane < nlanes; lane++ ) { // loop through lanes
951 for ( i = 0; i < elements_per_lane; i++ ) { // loop through elements in lane
952 // permutation index for element within lane
953 int8_t p = -1; // -1 = no source byte in the desired lane
954 int ix = A[m]; // current index
955 if ( ix >= 0 ) {
956 ix ^= oppos * elements_per_lane; // flip bit if opposite lane
957 }
958 ix -= int( lane * elements_per_lane ); // index relative to lane
959 if ( ix >= 0 && ix < (int)elements_per_lane ) { // index points to desired lane
960 p = ix * elementsize; // byte offset of the source element within the lane
961 }
962 for ( j = 0; j < elementsize; j++ ) { // loop through bytes in element
963 u.a[k++] = p < 0 ? -1 : p + j; // store byte permutation index; -1 when the index is negative or points outside the desired lane
964 }
965 m++;
966 }
967 }
968 return u; // return encapsulated array
969 }

◆ zero_mask()

template<int N>
auto zero_mask ( int const (&a)[N] )
constexpr

Definition at line 484 of file instrset.h.

484 { // Returns a bit mask in which bit i is set when a[i] is non-negative and clear when a[i] is negative
485 uint64_t mask = 0; // accumulated mask, one bit per array element
486 int i = 0; // loop counter
487
488 for ( i = 0; i < N; i++ ) {
489 if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i; // 64-bit shift, so positions up to 63 are representable
490 }
491 if constexpr ( N <= 8 ) // the return type is the narrowest of uint8_t/uint16_t/uint32_t/uint64_t with at least N bits
492 return uint8_t( mask );
493 else if constexpr ( N <= 16 )
494 return uint16_t( mask );
495 else if constexpr ( N <= 32 )
496 return uint32_t( mask );
497 else
498 return mask; // uint64_t
499 }

◆ zero_mask_broad()

template<typename V>
auto zero_mask_broad ( int const (&A)[V::size()] )
constexpr

Definition at line 504 of file instrset.h.

504 { // Returns an EList with one element per index in A: the value of get_inttype<V>() where A[i] >= 0, and 0 where A[i] is negative
505 constexpr int N = V::size(); // number of vector elements
506 typedef decltype( get_inttype<V>() ) Etype; // element type
507 EList<Etype, N> u = { { 0 } }; // list for return
508 int i = 0; // loop counter
509 for ( i = 0; i < N; i++ ) { u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0; } // presumably get_inttype<V>() yields an all-ones value of the element type - confirm against its definition
510 return u; // return encapsulated array
511 }

Variable Documentation

◆ blend_a

const int blend_a = 0x10

Definition at line 1015 of file instrset.h.

◆ blend_addz

const int blend_addz = 8

Definition at line 1014 of file instrset.h.

◆ blend_allzero

const int blend_allzero = 2

Definition at line 1012 of file instrset.h.

◆ blend_b

const int blend_b = 0x20

Definition at line 1016 of file instrset.h.

◆ blend_cross_lane

const int blend_cross_lane = 0x100

Definition at line 1019 of file instrset.h.

◆ blend_largeblock

const int blend_largeblock = 4

Definition at line 1013 of file instrset.h.

◆ blend_outofrange

const int blend_outofrange = 0x10000000

Definition at line 1030 of file instrset.h.

◆ blend_perma

const int blend_perma = 0x40

Definition at line 1017 of file instrset.h.

◆ blend_permb

const int blend_permb = 0x80

Definition at line 1018 of file instrset.h.

◆ blend_punpckhab

const int blend_punpckhab = 0x1000

Definition at line 1021 of file instrset.h.

◆ blend_punpckhba

const int blend_punpckhba = 0x2000

Definition at line 1022 of file instrset.h.

◆ blend_punpcklab

const int blend_punpcklab = 0x4000

Definition at line 1023 of file instrset.h.

◆ blend_punpcklba

const int blend_punpcklba = 0x8000

Definition at line 1024 of file instrset.h.

◆ blend_rotate_big

const int blend_rotate_big = 0x100000

Definition at line 1029 of file instrset.h.

◆ blend_rotateab

const int blend_rotateab = 0x10000

Definition at line 1025 of file instrset.h.

◆ blend_rotateba

const int blend_rotateba = 0x20000

Definition at line 1026 of file instrset.h.

◆ blend_rotpattern

const int blend_rotpattern = 40

Definition at line 1032 of file instrset.h.

◆ blend_same_pattern

const int blend_same_pattern = 0x200

Definition at line 1020 of file instrset.h.

◆ blend_shufab

const int blend_shufab = 0x40000

Definition at line 1027 of file instrset.h.

◆ blend_shufba

const int blend_shufba = 0x80000

Definition at line 1028 of file instrset.h.

◆ blend_shufpattern

const int blend_shufpattern = 32

Definition at line 1031 of file instrset.h.

◆ blend_zeroing

const int blend_zeroing = 1

Definition at line 1011 of file instrset.h.

◆ perm_addz

const int perm_addz = 0x10

Definition at line 577 of file instrset.h.

◆ perm_addz2

const int perm_addz2 = 0x20

Definition at line 578 of file instrset.h.

◆ perm_allzero

const int perm_allzero = 4

Definition at line 575 of file instrset.h.

◆ perm_broadcast

const int perm_broadcast = 0x8000

Definition at line 591 of file instrset.h.

◆ perm_compress

const int perm_compress = 0x20000

Definition at line 593 of file instrset.h.

◆ perm_cross_lane

const int perm_cross_lane = 0x40

Definition at line 579 of file instrset.h.

◆ perm_expand

const int perm_expand = 0x40000

Definition at line 594 of file instrset.h.

◆ perm_ipattern

const int perm_ipattern
Initial value:
=
40

Definition at line 597 of file instrset.h.

◆ perm_largeblock

const int perm_largeblock = 8

Definition at line 576 of file instrset.h.

◆ perm_outofrange

const int perm_outofrange = 0x10000000

Definition at line 595 of file instrset.h.

◆ perm_perm

const int perm_perm = 2

Definition at line 574 of file instrset.h.

◆ perm_punpckh

const int perm_punpckh = 0x100

Definition at line 581 of file instrset.h.

◆ perm_punpckl

const int perm_punpckl = 0x200

Definition at line 582 of file instrset.h.

◆ perm_rot_count

const int perm_rot_count = 32

Definition at line 596 of file instrset.h.

◆ perm_rotate

const int perm_rotate
Initial value:
=
0x400

Definition at line 583 of file instrset.h.

◆ perm_rotate_big

const int perm_rotate_big
Initial value:
=
0x4000

Definition at line 589 of file instrset.h.

◆ perm_same_pattern

const int perm_same_pattern = 0x80

Definition at line 580 of file instrset.h.

◆ perm_shleft

const int perm_shleft
Initial value:
=
0x2000

Definition at line 587 of file instrset.h.

◆ perm_shright

const int perm_shright
Initial value:
=
0x1000

Definition at line 585 of file instrset.h.

◆ perm_zeroing

const int perm_zeroing = 1

Definition at line 573 of file instrset.h.

◆ perm_zext

const int perm_zext = 0x10000

Definition at line 592 of file instrset.h.

◆ V_DC

int V_DC = -256
constexpr

Definition at line 219 of file instrset.h.