Disable most XOP code by default; either the manual is wrong about the shifts or we are assembling them incorrectly. Will return to it later and fix.

Comparisons and select done via XOP are fine, though.
This commit is contained in:
chss95cs@gmail.com 2022-08-21 12:32:33 -07:00
parent b26c6ee1b8
commit b5ef3453c7
5 changed files with 255 additions and 103 deletions
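
The shifts the message above distrusts are the XOP VPSHL*/VPSHA*/VPROT* forms. As a point of reference, a minimal scalar sketch of the per-byte behavior those paths assume, based on one reading of the AMD XOP manual (exactly the reading the message calls into question, so treat it as an assumption):

#include <cstdint>

// Assumed per-lane behavior for the byte forms; counts are already in -7..7
// because the emitter masks them before use. This is a reading of the AMD
// reference, not something the commit verifies (hence the new cvars
// defaulting to false).
static uint8_t expect_vpshlb(uint8_t v, int8_t n) {  // logical: sign of n picks direction
  return n >= 0 ? uint8_t(v << n) : uint8_t(v >> -n);
}
static int8_t expect_vpshab(int8_t v, int8_t n) {    // arithmetic: negative n sign-extends
  return n >= 0 ? int8_t(uint8_t(v) << n) : int8_t(v >> -n);
}
static uint8_t expect_vprotb(uint8_t v, int8_t n) {  // rotate: negative n rotates right
  unsigned r = uint8_t(n) & 7u;
  return uint8_t((v << r) | (v >> ((8u - r) & 7u)));
}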

View File

@@ -1030,8 +1030,13 @@ static const vec128_t xmm_consts[] = {
/*
XMMF16PackLCPI6
*/
vec128i(0x8000)
vec128i(0x8000),
/* XMMXOPByteShiftMask,*/
vec128b(7),
/*XMMXOPWordShiftMask*/
vec128s(15),
/*XMMXOPDwordShiftMask*/
vec128i(31)
};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
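
For reference, the three constants added above are per-lane shift-count masks: the guest vector shifts only honor the low log2(element-width) bits of each lane's count, so the XOP paths AND the count vector with width-1 before issuing the shift. A one-lane scalar picture (a sketch, not code from the commit):

#include <cstdint>

uint8_t  mask_count_b(uint8_t n)  { return n & 7; }   // XMMXOPByteShiftMask  = vec128b(7)
uint16_t mask_count_w(uint16_t n) { return n & 15; }  // XMMXOPWordShiftMask  = vec128s(15)
uint32_t mask_count_d(uint32_t n) { return n & 31; }  // XMMXOPDwordShiftMask = vec128i(31)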

View File

@@ -167,7 +167,11 @@ enum XmmConst {
XMMF16PackLCPI3,
XMMF16PackLCPI4,
XMMF16PackLCPI5,
XMMF16PackLCPI6
XMMF16PackLCPI6,
XMMXOPByteShiftMask,
XMMXOPWordShiftMask,
XMMXOPDwordShiftMask,
};
using amdfx::xopcompare_e;
using Xbyak::Xmm;
@@ -383,7 +387,30 @@ class X64Emitter : public Xbyak::CodeGenerator {
DEFINECOMPARE(vpcomud);
DEFINECOMPARE(vpcomq);
DEFINECOMPARE(vpcomuq);
#undef DEFINECOMPARE
#undef DEFINECOMPARE
#define DEFINESHIFTER(name) \
void name(Xmm dest, Xmm src1, Xmm src2) { \
auto xop_bytes = \
amdfx::operations::name(dest.getIdx(), src1.getIdx(), src2.getIdx()); \
EmitXOP(xop_bytes); \
}
DEFINESHIFTER(vprotb)
DEFINESHIFTER(vprotw)
DEFINESHIFTER(vprotd)
DEFINESHIFTER(vprotq)
DEFINESHIFTER(vpshab)
DEFINESHIFTER(vpshaw)
DEFINESHIFTER(vpshad)
DEFINESHIFTER(vpshaq)
DEFINESHIFTER(vpshlb)
DEFINESHIFTER(vpshlw)
DEFINESHIFTER(vpshld)
DEFINESHIFTER(vpshlq)
protected:
void* Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function = nullptr);
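
For readability, DEFINESHIFTER(vprotb) above expands to a thin wrapper like the following: the amdfx helper builds the XOP byte sequence for the three register indices and EmitXOP writes it into the code stream.

void vprotb(Xmm dest, Xmm src1, Xmm src2) {
  auto xop_bytes =
      amdfx::operations::vprotb(dest.getIdx(), src1.getIdx(), src2.getIdx());
  EmitXOP(xop_bytes);
}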

View File

@@ -19,6 +19,16 @@
#include "xenia/base/cvar.h"
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
DEFINE_bool(xop_rotates, false, "rotate via xop", "X64");
DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64");
DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64");
DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64");
DEFINE_bool(xop_compares, true, "compare via xop", "X64");
namespace xe {
namespace cpu {
namespace backend {
@@ -407,7 +417,7 @@ struct VECTOR_COMPARE_SGE_V128
static void Emit(X64Emitter& e, const EmitArgType& i) {
EmitAssociativeBinaryXmmOp(
e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
if (e.IsFeatureEnabled(kX64EmitXOP)) {
if (cvars::xop_compares && e.IsFeatureEnabled(kX64EmitXOP)) {
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcomb(dest, src1, src2, xopcompare_e::GTE);
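
The compare path stays on by default (xop_compares is true). One lane of the vpcomb GTE case shown above behaves like this scalar sketch: a signed compare producing a full-lane mask.

#include <cstdint>

uint8_t expect_vpcomb_gte(int8_t a, int8_t b) {
  return (a >= b) ? 0xFF : 0x00;  // all-ones lane on true, all-zeros on false
}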
@@ -775,23 +785,52 @@ static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
// Store result and return it.
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}
static XmmConst GetShiftmaskForType(unsigned typ) {
if (typ == INT8_TYPE) {
return XMMXOPByteShiftMask;
} else if (typ == INT16_TYPE) {
return XMMXOPWordShiftMask;
} else {
return XMMXOPDwordShiftMask;
}
}
struct VECTOR_SHL_V128
: Sequence<VECTOR_SHL_V128, I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
if (cvars::xop_left_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vpand(e.xmm2, src2,
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
switch (i.instr->flags) {
case INT8_TYPE:
e.vpshlb(i.dest, src1, e.xmm2);
break;
case INT16_TYPE:
e.vpshlw(i.dest, src1, e.xmm2);
break;
case INT32_TYPE:
e.vpshld(i.dest, src1, e.xmm2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
}
}
}
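
Composed end to end for one int8 lane, the new left-shift path above amounts to the following (a sketch only; xop_left_shifts keeps it disabled by default):

#include <cstdint>

uint8_t xop_shl_lane(uint8_t value, uint8_t raw_count) {
  uint8_t count = raw_count & 7;    // vpand with XMMXOPByteShiftMask
  return uint8_t(value << count);   // vpshlb: count is non-negative, so shift left
}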
@@ -1061,19 +1100,45 @@ static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
struct VECTOR_SHR_V128
: Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
if (cvars::xop_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vpand(e.xmm2, src2,
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
e.vpcmpeqb(e.xmm3, e.xmm3);
switch (i.instr->flags) {
case INT8_TYPE:
e.vpsignb(e.xmm2, e.xmm3);
e.vpshlb(i.dest, src1, e.xmm2);
break;
case INT16_TYPE:
e.vpsignw(e.xmm2, e.xmm3);
e.vpshlw(i.dest, src1, e.xmm2);
break;
case INT32_TYPE:
e.vpsignd(e.xmm2, e.xmm3);
e.vpshld(i.dest, src1, e.xmm2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
}
}
}
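
The logical right-shift path never encodes a "shift right" instruction directly: vpcmpeqb(xmm3, xmm3) builds an all-ones vector, vpsign* negates each masked count against it, and vpshl* with a negative count is expected to shift right. One int8 lane as a sketch (this relies on the count-sign reading the commit message doubts):

#include <cstdint>

uint8_t xop_shr_lane(uint8_t value, uint8_t raw_count) {
  int8_t count = int8_t(raw_count & 7);  // vpand with the byte shift mask
  int8_t negated = -count;               // vpsignb against the all-ones vector
  // Expected vpshlb(value, negated) result when negated <= 0:
  return uint8_t(value >> -negated);
}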
@@ -1244,19 +1309,45 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128);
struct VECTOR_SHA_V128
: Sequence<VECTOR_SHA_V128, I<OPCODE_VECTOR_SHA, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
if (cvars::xop_arithmetic_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vpand(e.xmm2, src2,
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
e.vpcmpeqb(e.xmm3, e.xmm3);
switch (i.instr->flags) {
case INT8_TYPE:
e.vpsignb(e.xmm2, e.xmm3);
e.vpshab(i.dest, src1, e.xmm2);
break;
case INT16_TYPE:
e.vpsignw(e.xmm2, e.xmm3);
e.vpshaw(i.dest, src1, e.xmm2);
break;
case INT32_TYPE:
e.vpsignd(e.xmm2, e.xmm3);
e.vpshad(i.dest, src1, e.xmm2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
EmitInt8(e, i);
break;
case INT16_TYPE:
EmitInt16(e, i);
break;
case INT32_TYPE:
EmitInt32(e, i);
break;
default:
assert_always();
break;
}
}
}
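
The arithmetic path is the same shape but goes through vpsha*, where a negative count is expected to sign-extend instead of zero-fill. One int8 lane:

#include <cstdint>

int8_t xop_sar_lane(int8_t value, uint8_t raw_count) {
  int8_t count = int8_t(raw_count & 7);  // mask, then negate via vpsignb as above
  return int8_t(value >> count);         // expected vpshab(value, -count) result
}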
@@ -1432,55 +1523,29 @@ struct VECTOR_ROTATE_LEFT_V128
: Sequence<VECTOR_ROTATE_LEFT_V128,
I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
switch (i.instr->flags) {
case INT8_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT16_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT32_TYPE: {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vprolvd(i.dest, i.src1, i.src2);
} else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
Xmm temp = i.dest;
if (i.dest == i.src1 || i.dest == i.src2) {
temp = e.xmm2;
}
// Shift left (to get high bits):
if (i.src2.is_constant) {
e.LoadConstantXmm(temp, i.src2.constant());
e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
} else {
e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
}
e.vpsllvd(e.xmm1, i.src1, e.xmm0);
// Shift right (to get low bits):
e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
e.vpsubd(temp, e.xmm0);
e.vpsrlvd(i.dest, i.src1, temp);
// Merge:
e.vpor(i.dest, e.xmm1);
} else {
// TODO(benvanik): non-AVX2 native version.
if (cvars::xop_rotates && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.vpand(e.xmm2, src2,
e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
switch (i.instr->flags) {
case INT8_TYPE:
e.vprotb(i.dest, src1, e.xmm2);
break;
case INT16_TYPE:
e.vprotw(i.dest, src1, e.xmm2);
break;
case INT32_TYPE:
e.vprotd(i.dest, src1, e.xmm2);
break;
}
} else {
switch (i.instr->flags) {
case INT8_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1),
e.StashConstantXmm(1, i.src2.constant()));
@@ -1489,14 +1554,63 @@ struct VECTOR_ROTATE_LEFT_V128
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT16_TYPE:
// TODO(benvanik): native version (with shift magic).
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1),
e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
e.vmovaps(i.dest, e.xmm0);
break;
case INT32_TYPE: {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vprolvd(i.dest, i.src1, i.src2);
} else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
Xmm temp = i.dest;
if (i.dest == i.src1 || i.dest == i.src2) {
temp = e.xmm2;
}
// Shift left (to get high bits):
if (i.src2.is_constant) {
e.LoadConstantXmm(temp, i.src2.constant());
e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
} else {
e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
}
e.vpsllvd(e.xmm1, i.src1, e.xmm0);
// Shift right (to get low bits):
e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
e.vpsubd(temp, e.xmm0);
e.vpsrlvd(i.dest, i.src1, temp);
// Merge:
e.vpor(i.dest, e.xmm1);
} else {
// TODO(benvanik): non-AVX2 native version.
if (i.src2.is_constant) {
e.lea(e.GetNativeParam(1),
e.StashConstantXmm(1, i.src2.constant()));
} else {
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
}
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
e.CallNativeSafe(
reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
e.vmovaps(i.dest, e.xmm0);
}
break;
}
break;
default:
assert_always();
break;
}
default:
assert_always();
break;
}
}
};
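
For the rotate path, vprot* takes the masked count directly (no negation step), and the old scalar-emulation and AVX2/AVX-512 int32 paths remain as the else branch. One int8 lane of the XOP path as a sketch:

#include <cstdint>

uint8_t xop_rotl_lane(uint8_t value, uint8_t raw_count) {
  unsigned count = raw_count & 7u;  // vpand with XMMXOPByteShiftMask
  return uint8_t((value << count) | (value >> ((8u - count) & 7u)));
}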

View File

@@ -781,7 +781,7 @@ struct SELECT_V128_V128
} else if (mayblend == PermittedBlend::Ps) {
e.vblendvps(i.dest, src2, src3, src1);
} else {
if (1 && e.IsFeatureEnabled(kX64EmitXOP)) {
if (1 && e.IsFeatureEnabled(kX64EmitXOP)) {
XELOGCPU("Doing vpcmov!!");
e.vpcmov(i.dest, src3, src2, src1);
} else {
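
vpcmov, which the commit leaves enabled, is a pure bitwise select: each result bit comes from one source where the selector bit is set and from the other where it is clear. Which source maps to "set" below follows one reading of the emitter call above, so treat the operand order as an assumption.

#include <cstdint>

uint32_t pcmov_bits(uint32_t taken_when_set, uint32_t taken_when_clear,
                    uint32_t selector) {
  return (taken_when_set & selector) | (taken_when_clear & ~selector);
}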

View File

@@ -452,7 +452,7 @@ Affected: FX, FEX, VX, OX (if Rc = 1)
*/
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}
@@ -469,7 +469,10 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- !abs(frB)
Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
f.StoreFPR(i.X.RT, v);
f.UpdateFPSCR(v, i.X.Rc);
//f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
//todo
}
return 0;
}
@@ -477,7 +480,10 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- ¬ frB[0] || frB[1-63]
Value* v = f.Neg(f.LoadFPR(i.X.RB));
f.StoreFPR(i.X.RT, v);
f.UpdateFPSCR(v, i.X.Rc);
//f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
//todo
}
return 0;
}