Disable most XOP code by default; either the manual is wrong about the shifts or we are assembling them incorrectly. Will come back to this later and fix it.

Comparisons and select done via XOP are fine, though.
chss95cs@gmail.com 2022-08-21 12:32:33 -07:00
parent b26c6ee1b8
commit b5ef3453c7
5 changed files with 255 additions and 103 deletions
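Note on the disabled shift paths: every XOP sequence below follows the same shape — mask the per-element shift count down to the element width (the new XMMXOPByteShiftMask/XMMXOPWordShiftMask/XMMXOPDwordShiftMask constants), negate the masked count with vpcmpeqb + vpsign* when a right shift is wanted, then issue a single vprot*/vpshl*/vpsha*. A rough scalar sketch of what the byte lane is expected to compute, assuming vpshlb/vpshab shift left on a non-negative count and right on a negative one (the reading of the XOP reference this commit is second-guessing); the helper names are illustrative only, not part of the commit:

#include <cstdint>

// Hypothetical per-lane model of the emitted XOP sequences (sketch only).
static uint8_t xop_shl_byte(uint8_t value, uint8_t raw_count) {
  uint8_t count = raw_count & 7;                // vpand with vec128b(7)
  return static_cast<uint8_t>(value << count);  // vpshlb, count >= 0: left
}

static uint8_t xop_shr_byte(uint8_t value, uint8_t raw_count) {
  int8_t count = static_cast<int8_t>(raw_count & 7);  // vpand with vec128b(7)
  count = static_cast<int8_t>(-count);                // vpcmpeqb + vpsignb
  // vpshlb with a negative count: logical right shift by |count|.
  return static_cast<uint8_t>(value >> (-count));
}

static int8_t xop_sar_byte(int8_t value, uint8_t raw_count) {
  int8_t count = static_cast<int8_t>(raw_count & 7);  // vpand with vec128b(7)
  count = static_cast<int8_t>(-count);                // vpcmpeqb + vpsignb
  // vpshab with a negative count: arithmetic right shift by |count|.
  return static_cast<int8_t>(value >> (-count));
}

Even when the new cvars are turned back on (e.g. --xop_left_shifts=true, assuming the usual command-line cvar override), the sequences still sit behind kX64EmitXOP, so they only ever run on AMD CPUs that actually report XOP.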

View File

@@ -1030,8 +1030,13 @@ static const vec128_t xmm_consts[] = {
     /*
       XMMF16PackLCPI6
     */
-    vec128i(0x8000)
+    vec128i(0x8000),
+    /* XMMXOPByteShiftMask,*/
+    vec128b(7),
+    /*XMMXOPWordShiftMask*/
+    vec128s(15),
+    /*XMMXOPDwordShiftMask*/
+    vec128i(31)
 };
 void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {

View File

@@ -167,7 +167,11 @@ enum XmmConst {
   XMMF16PackLCPI3,
   XMMF16PackLCPI4,
   XMMF16PackLCPI5,
-  XMMF16PackLCPI6
+  XMMF16PackLCPI6,
+  XMMXOPByteShiftMask,
+  XMMXOPWordShiftMask,
+  XMMXOPDwordShiftMask,
 };
 using amdfx::xopcompare_e;
 using Xbyak::Xmm;
@@ -383,7 +387,30 @@ class X64Emitter : public Xbyak::CodeGenerator {
   DEFINECOMPARE(vpcomud);
   DEFINECOMPARE(vpcomq);
   DEFINECOMPARE(vpcomuq);
 #undef DEFINECOMPARE
+
+#define DEFINESHIFTER(name) \
+  void name(Xmm dest, Xmm src1, Xmm src2) { \
+    auto xop_bytes = \
+        amdfx::operations::name(dest.getIdx(), src1.getIdx(), src2.getIdx()); \
+    EmitXOP(xop_bytes); \
+  }
+
+  DEFINESHIFTER(vprotb)
+  DEFINESHIFTER(vprotw)
+  DEFINESHIFTER(vprotd)
+  DEFINESHIFTER(vprotq)
+
+  DEFINESHIFTER(vpshab)
+  DEFINESHIFTER(vpshaw)
+  DEFINESHIFTER(vpshad)
+  DEFINESHIFTER(vpshaq)
+
+  DEFINESHIFTER(vpshlb)
+  DEFINESHIFTER(vpshlw)
+  DEFINESHIFTER(vpshld)
+  DEFINESHIFTER(vpshlq)
+
  protected:
   void* Emplace(const EmitFunctionInfo& func_info,
                 GuestFunction* function = nullptr);

View File

@@ -19,6 +19,16 @@
 #include "xenia/base/cvar.h"
 #include "xenia/cpu/backend/x64/x64_stack_layout.h"
+DEFINE_bool(xop_rotates, false, "rotate via xop", "X64");
+
+DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64");
+
+DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64");
+
+DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64");
+
+DEFINE_bool(xop_compares, true, "compare via xop", "X64");
+
 namespace xe {
 namespace cpu {
 namespace backend {
@@ -407,7 +417,7 @@ struct VECTOR_COMPARE_SGE_V128
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     EmitAssociativeBinaryXmmOp(
         e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
-          if (e.IsFeatureEnabled(kX64EmitXOP)) {
+          if (cvars::xop_compares && e.IsFeatureEnabled(kX64EmitXOP)) {
             switch (i.instr->flags) {
               case INT8_TYPE:
                 e.vpcomb(dest, src1, src2, xopcompare_e::GTE);
@@ -775,23 +785,52 @@ static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
   // Store result and return it.
   return _mm_load_si128(reinterpret_cast<__m128i*>(value));
 }
+static XmmConst GetShiftmaskForType(unsigned typ) {
+  if (typ == INT8_TYPE) {
+    return XMMXOPByteShiftMask;
+  } else if (typ == INT16_TYPE) {
+    return XMMXOPWordShiftMask;
+  } else {
+    return XMMXOPDwordShiftMask;
+  }
+}
 struct VECTOR_SHL_V128
     : Sequence<VECTOR_SHL_V128, I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_left_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpshlb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpshlw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpshld(i.dest, src1, e.xmm2);
+          break;
+      }
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
@@ -1061,19 +1100,45 @@ static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
 struct VECTOR_SHR_V128
     : Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      e.vpcmpeqb(e.xmm3, e.xmm3);
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpsignb(e.xmm2, e.xmm3);
+          e.vpshlb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpsignw(e.xmm2, e.xmm3);
+          e.vpshlw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpsignd(e.xmm2, e.xmm3);
+          e.vpshld(i.dest, src1, e.xmm2);
+          break;
+      }
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
@@ -1244,19 +1309,45 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128);
 struct VECTOR_SHA_V128
     : Sequence<VECTOR_SHA_V128, I<OPCODE_VECTOR_SHA, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_arithmetic_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      e.vpcmpeqb(e.xmm3, e.xmm3);
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpsignb(e.xmm2, e.xmm3);
+          e.vpshab(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpsignw(e.xmm2, e.xmm3);
+          e.vpshaw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpsignd(e.xmm2, e.xmm3);
+          e.vpshad(i.dest, src1, e.xmm2);
+          break;
+      }
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
@@ -1432,55 +1523,29 @@ struct VECTOR_ROTATE_LEFT_V128
     : Sequence<VECTOR_ROTATE_LEFT_V128,
                I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        // TODO(benvanik): native version (with shift magic).
-        if (i.src2.is_constant) {
-          e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
-        } else {
-          e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
-        }
-        e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
-        e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
-        e.vmovaps(i.dest, e.xmm0);
-        break;
-      case INT16_TYPE:
-        // TODO(benvanik): native version (with shift magic).
-        if (i.src2.is_constant) {
-          e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
-        } else {
-          e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
-        }
-        e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
-        e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
-        e.vmovaps(i.dest, e.xmm0);
-        break;
-      case INT32_TYPE: {
-        if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
-          e.vprolvd(i.dest, i.src1, i.src2);
-        } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
-          Xmm temp = i.dest;
-          if (i.dest == i.src1 || i.dest == i.src2) {
-            temp = e.xmm2;
-          }
-          // Shift left (to get high bits):
-          if (i.src2.is_constant) {
-            e.LoadConstantXmm(temp, i.src2.constant());
-            e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
-          } else {
-            e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
-          }
-          e.vpsllvd(e.xmm1, i.src1, e.xmm0);
-          // Shift right (to get low bits):
-          e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
-          e.vpsubd(temp, e.xmm0);
-          e.vpsrlvd(i.dest, i.src1, temp);
-          // Merge:
-          e.vpor(i.dest, e.xmm1);
-        } else {
-          // TODO(benvanik): non-AVX2 native version.
+    if (cvars::xop_rotates && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vprotb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vprotw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vprotd(i.dest, src1, e.xmm2);
+          break;
+      }
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          // TODO(benvanik): native version (with shift magic).
           if (i.src2.is_constant) {
             e.lea(e.GetNativeParam(1),
                   e.StashConstantXmm(1, i.src2.constant()));
@@ -1489,14 +1554,63 @@ struct VECTOR_ROTATE_LEFT_V128
           }
           e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
           e.CallNativeSafe(
-              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
+              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
           e.vmovaps(i.dest, e.xmm0);
-        }
-        break;
-      }
-      default:
-        assert_always();
-        break;
+          break;
+        case INT16_TYPE:
+          // TODO(benvanik): native version (with shift magic).
+          if (i.src2.is_constant) {
+            e.lea(e.GetNativeParam(1),
+                  e.StashConstantXmm(1, i.src2.constant()));
+          } else {
+            e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
+          }
+          e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
+          e.CallNativeSafe(
+              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
+          e.vmovaps(i.dest, e.xmm0);
+          break;
+        case INT32_TYPE: {
+          if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+            e.vprolvd(i.dest, i.src1, i.src2);
+          } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
+            Xmm temp = i.dest;
+            if (i.dest == i.src1 || i.dest == i.src2) {
+              temp = e.xmm2;
+            }
+            // Shift left (to get high bits):
+            if (i.src2.is_constant) {
+              e.LoadConstantXmm(temp, i.src2.constant());
+              e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
+            } else {
+              e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
+            }
+            e.vpsllvd(e.xmm1, i.src1, e.xmm0);
+            // Shift right (to get low bits):
+            e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
+            e.vpsubd(temp, e.xmm0);
+            e.vpsrlvd(i.dest, i.src1, temp);
+            // Merge:
+            e.vpor(i.dest, e.xmm1);
+          } else {
+            // TODO(benvanik): non-AVX2 native version.
+            if (i.src2.is_constant) {
+              e.lea(e.GetNativeParam(1),
+                    e.StashConstantXmm(1, i.src2.constant()));
+            } else {
+              e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
+            }
+            e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
+            e.CallNativeSafe(
+                reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
+            e.vmovaps(i.dest, e.xmm0);
+          }
+          break;
+        }
+        default:
+          assert_always();
+          break;
+      }
     }
   }
 };

View File

@@ -781,7 +781,7 @@ struct SELECT_V128_V128
     } else if (mayblend == PermittedBlend::Ps) {
       e.vblendvps(i.dest, src2, src3, src1);
     } else {
       if (1 && e.IsFeatureEnabled(kX64EmitXOP)) {
         XELOGCPU("Doing vpcmov!!");
         e.vpcmov(i.dest, src3, src2, src1);
       } else {

View File

@@ -452,7 +452,7 @@ Affected: FX, FEX, VX, OX (if Rc = 1)
   */
   // f.UpdateFPSCR(v, i.X.Rc);
   if (i.X.Rc) {
+    // todo
   }
   return 0;
 }
@@ -469,7 +469,10 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- !abs(frB)
   Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  //f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    //todo
+  }
   return 0;
 }
@@ -477,7 +480,10 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- ¬ frB[0] || frB[1-63]
   Value* v = f.Neg(f.LoadFPR(i.X.RB));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  //f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    //todo
+  }
   return 0;
 }