[x64] Add `GFNI`-based optimization for `VECTOR_SH{R,L}_V128(Int8)`
In the `Int8` case of `VECTOR_SH{R,L}_V128`, when the shift amounts are constant and identical across all lanes, a single `gf2p8affineqb` instruction can be emitted that performs the int8 shift using GF(2^8) affine arithmetic. More info here: https://wunkolo.github.io/post/2020/11/gf2p8affineqb-int8-shifting/ Also fixes the iteration type used in `VECTOR_SHA_V128` when detecting whether all of the SIMD lanes hold the same value (it was iterating `u16` and not `u8`).
This commit is contained in:
parent
7418011ab5
commit
bd9a290b30
|
@ -731,6 +731,25 @@ struct VECTOR_SHL_V128
|
|||
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
if (i.src2.is_constant) {
|
||||
if (e.IsFeatureEnabled(kX64EmitGFNI)) {
|
||||
const auto& shamt = i.src2.constant();
|
||||
bool all_same = true;
|
||||
for (size_t n = 0; n < 16 - n; ++n) {
|
||||
if (shamt.u8[n] != shamt.u8[n + 1]) {
|
||||
all_same = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (all_same) {
|
||||
// Every count is the same, so we can use gf2p8affineqb.
|
||||
const uint8_t shift_amount = shamt.u8[0];
|
||||
const uint64_t shift_matrix =
|
||||
0x0102040810204080 >> (shift_amount * 8);
|
||||
e.vgf2p8affineqb(i.dest, i.src1,
|
||||
e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
|
@ -920,6 +939,25 @@ struct VECTOR_SHR_V128
|
|||
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
|
||||
// TODO(benvanik): native version (with shift magic).
|
||||
if (i.src2.is_constant) {
|
||||
if (e.IsFeatureEnabled(kX64EmitGFNI)) {
|
||||
const auto& shamt = i.src2.constant();
|
||||
bool all_same = true;
|
||||
for (size_t n = 0; n < 16 - n; ++n) {
|
||||
if (shamt.u8[n] != shamt.u8[n + 1]) {
|
||||
all_same = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (all_same) {
|
||||
// Every count is the same, so we can use gf2p8affineqb.
|
||||
const uint8_t shift_amount = shamt.u8[0];
|
||||
const uint64_t shift_matrix = 0x0102040810204080
|
||||
<< (shift_amount * 8);
|
||||
e.vgf2p8affineqb(i.dest, i.src1,
|
||||
e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||
} else {
|
||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||
|
@ -1087,8 +1125,8 @@ struct VECTOR_SHA_V128
|
|||
if (e.IsFeatureEnabled(kX64EmitGFNI)) {
|
||||
const auto& shamt = i.src2.constant();
|
||||
bool all_same = true;
|
||||
for (size_t n = 0; n < 8 - n; ++n) {
|
||||
if (shamt.u16[n] != shamt.u16[n + 1]) {
|
||||
for (size_t n = 0; n < 16 - n; ++n) {
|
||||
if (shamt.u8[n] != shamt.u8[n + 1]) {
|
||||
all_same = false;
|
||||
break;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue