diff --git a/src/xenia/cpu/backend/x64/x64_amdfx_extensions.h b/src/xenia/cpu/backend/x64/x64_amdfx_extensions.h
new file mode 100644
index 000000000..947589a8e
--- /dev/null
+++ b/src/xenia/cpu/backend/x64/x64_amdfx_extensions.h
@@ -0,0 +1,334 @@
+#ifndef XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
+#define XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+namespace xe {
+namespace cpu {
+namespace backend {
+namespace x64 {
+namespace amdfx {
+enum xopcodemap_e : unsigned char {
+  XOPCODE_HAS_IMMBYTE = 0x8,
+  XOPCODE_NO_IMMBYTE = 0x9
+};
+
+// base opcodes, without their size specified
+enum xopcode_e : unsigned char {
+  xop_VFRCZPD = 0x81,
+  xop_VFRCZPS = 0x80,
+  xop_VFRCZSD = 0x83,
+  xop_VFRCZSS = 0x82,
+  xop_VPCMOV = 0xA2,
+  xop_VPCOMB = 0xCC,
+  xop_VPCOMD = 0xCE,
+  xop_VPCOMQ = 0xCF,
+  xop_VPCOMUB = 0xEC,
+  xop_VPCOMUD = 0xEE,
+  xop_VPCOMUQ = 0xEF,
+  xop_VPCOMUW = 0xED,
+  xop_VPCOMW = 0xCD,
+  xop_VPERMIL2PD = 0x49,
+  xop_VPERMIL2PS = 0x48,
+  xop_VPHADDBD = 0xC2,
+  xop_VPHADDBQ = 0xC3,
+  xop_VPHADDBW = 0xC1,
+  xop_VPHADDDQ = 0xCB,
+  xop_VPHADDUBD = 0xD2,
+  xop_VPHADDUBQ = 0xD3,
+  xop_VPHADDUBW = 0xD1,
+  xop_VPHADDUDQ = 0xDB,
+  xop_VPHADDUWD = 0xD6,
+  xop_VPHADDUWQ = 0xD7,
+  xop_VPHADDWD = 0xC6,
+  xop_VPHADDWQ = 0xC7,
+  xop_VPHSUBBW = 0xE1,
+  xop_VPHSUBDQ = 0xE3,
+  xop_VPHSUBWD = 0xE2,
+  xop_VPMACSDD = 0x9E,
+  xop_VPMACSDQH = 0x9F,
+  xop_VPMACSDQL = 0x97,
+  xop_VPMACSSDD = 0x8E,
+  xop_VPMACSSDQH = 0x8F,
+  xop_VPMACSSDQL = 0x87,
+  xop_VPMACSSWD = 0x86,
+  xop_VPMACSSWW = 0x85,
+  xop_VPMACSWD = 0x96,
+  xop_VPMACSWW = 0x95,
+  xop_VPMADCSSWD = 0xA6,
+  xop_VPMADCSWD = 0xB6,
+  xop_VPPERM = 0xA3,
+  xop_VPROTB = 0x90,
+  xop_VPROTBI = 0xC0,  // imm version
+  xop_VPROTD = 0x92,
+  xop_VPROTDI = 0xC2,
+  xop_VPROTQ = 0x93,
+  xop_VPROTQI = 0xC3,
+  xop_VPROTW = 0x91,
+  xop_VPROTWI = 0xC1,
+  xop_VPSHAB = 0x98,
+  xop_VPSHAD = 0x9A,
+  xop_VPSHAQ = 0x9B,
+  xop_VPSHAW = 0x99,
+  xop_VPSHLB = 0x94,
+  xop_VPSHLD = 0x96,
+  xop_VPSHLQ = 0x97,
+  xop_VPSHLW = 0x95,
+};
+
+enum xop_iop_e : unsigned char {
+  XOP_BYTE = 0,
+  XOP_WORD = 1,
+  XOP_DOUBLEWORD = 2,
+  XOP_QUADWORD = 3
+};
+
+enum xop_fop_e : unsigned char {
+  XOP_PS = 0,
+  XOP_PD = 1,
+  XOP_SS = 2,
+  XOP_SD = 3
+};
+class xop_byte1_t {
+ public:
+  union {
+    // informative names
+    struct {
+      /*
+        A five-bit field selecting the opcode map (see xopcodemap_e above).
+      */
+      unsigned char opcode_map_select : 5;
+      /*
+        This bit provides a one-bit extension of either the ModRM.r/m
+        field, to specify a GPR or XMM register, or of the SIB base field, to
+        specify a GPR. This permits access to 16 registers. In 32-bit protected
+        and compatibility modes, this bit is ignored. This bit is the
+        bit-inverted equivalent of the REX.B bit and is available only in the
+        3-byte prefix format.
+      */
+      unsigned char inv_1bit_ext_modrm_or_sib : 1;
+      /*
+        This bit provides a one-bit extension of the SIB.index field in
+        64-bit mode, permitting access to 16 YMM/XMM and GPR registers. In
+        32-bit protected and compatibility modes, this bit must be set to 1.
+        This bit is the bit-inverted equivalent of the REX.X bit.
+      */
+      unsigned char inv_1bit_ext_sib_index : 1;
+      /*
+        This bit provides a one-bit extension of the ModRM.reg field in
+        64-bit mode, permitting access to all 16 YMM/XMM and GPR registers. In
+        32-bit protected and compatibility modes, this bit must be set to 1.
+        This bit is the bit-inverted equivalent of the REX.R bit.
+      */
+      unsigned char inv_1bit_ext_modrm_reg_field : 1;
+    };
+    // amd manual names
+    struct {
+      unsigned char mmmmm : 5;
+      unsigned char B : 1;
+      unsigned char X : 1;
+      unsigned char R : 1;
+    };
+    unsigned char encoded;
+  };
+};
+
+class xop_byte2_t {
+ public:
+  union {
+    struct {
+      unsigned char
+          implied_66f2f3_ext : 2;  // 0 = none implied, 1 = 66, 2 = F3, 3 = F2
+      unsigned char vector_length : 1;
+      unsigned char source_or_dest_reg_specifier_inverted_1s_compl : 4;
+      unsigned char scalar_reg_size_override_special : 1;
+    };
+    // amd manual names
+
+    struct {
+      unsigned char pp : 2;  // 0 = no implied prefix, 1 = 66, 2 = F3, 3 = F2
+      unsigned char L : 1;
+      unsigned char vvvv : 4;  // src1 for four operand form
+      unsigned char W : 1;
+    };
+    unsigned char encoded;
+  };
+};
+
+class xop_opcode_byte_t {
+ public:
+  union {
+    struct {
+      xop_fop_e float_datatype : 2;
+      unsigned char __unused0 : 6;
+    };
+
+    struct {
+      xop_iop_e int_datatype : 2;
+      unsigned char __unused1 : 6;
+    };
+
+    struct {
+      unsigned char oes : 2;
+      unsigned char opcode : 6;
+    };
+    unsigned char encoded;
+  };
+};
+
+class modrm_byte_t {
+ public:
+  union {
+    struct {
+      unsigned char rm : 3;
+      unsigned char mod : 5;  // reg + mod packed together; reg holds the
+                              // dest register in the 4-operand forms
+    };
+    unsigned char encoded;
+  };
+};
+
+#pragma pack(push, 1)
+class xop_t {
+ public:
+  unsigned char imm_8F;  // always 0x8F
+  xop_byte1_t byte1;
+  xop_byte2_t byte2;
+  xop_opcode_byte_t opcode;
+  modrm_byte_t modrm;
+  unsigned char imm8;
+
+  xop_t() : imm_8F(0x8F) {
+    byte1.encoded = 0;
+    byte2.encoded = 0;
+    opcode.encoded = 0;
+    modrm.encoded = 0;
+  }
+
+  unsigned AssembledSize() const {
+    if (byte1.opcode_map_select == XOPCODE_NO_IMMBYTE) {
+      return 5;
+    } else {
+      return 6;
+    }
+  }
+
+  template <typename TCall>
+  void ForeachByte(TCall&& cb) {
+    cb(imm_8F);
+    cb(byte1.encoded);
+    cb(byte2.encoded);
+    cb(opcode.encoded);
+    cb(modrm.encoded);
+    if (AssembledSize() == 6) {
+      cb(imm8);
+    }
+  }
+};
+#pragma pack(pop)
+
+static void xop_set_fouroperand_form(xop_t& xop, unsigned xmmidx_dest,
+                                     unsigned xmmidx_src1, unsigned xmmidx_src2,
+                                     unsigned xmmidx_src3, xopcode_e opcode,
+                                     bool has_immbyte = true) {
+  xop.opcode.encoded = opcode;
+  xop.byte1.encoded = 0xe8;
+  if (has_immbyte) {
+    xop.byte1.opcode_map_select = XOPCODE_HAS_IMMBYTE;
+  } else {
+    xop.byte1.opcode_map_select = XOPCODE_NO_IMMBYTE;
+  }
+  xop.imm8 = xmmidx_src3 << 4;
+
+  xop.modrm.rm = xmmidx_src2 & 0b111;
+  xop.byte1.inv_1bit_ext_modrm_reg_field = (xmmidx_dest >> 3) ^ 1;
+  xop.byte1.inv_1bit_ext_modrm_or_sib = (xmmidx_src2 >> 3) ^ 1;
+  xop.byte2.vvvv = ~xmmidx_src1;
+  xop.modrm.encoded |= 0xC0;
+  xop.modrm.mod |= xmmidx_dest & 0b111;
+}
+
+enum class xopcompare_e : uint32_t {
+  LT = 0b000,
+  LTE = 0b001,
+  GT = 0b010,
+  GTE = 0b011,
+  EQ = 0b100,
+  NEQ = 0b101,
+  FALSEY = 0b110,  // there doesn't seem to be much in the way of
+                   // documentation for these two
+  TRUTHEY = 0b111
+};
+
+namespace operations {
+#define SIMPLE_FOUROPERAND(funcname, opcode)                                  \
+  static xop_t funcname(unsigned destidx, unsigned src1idx, unsigned src2idx, \
+                        unsigned src3idx) {                                   \
+    xop_t result{};                                                           \
+    xop_set_fouroperand_form(result, destidx, src1idx, src2idx, src3idx,      \
+                             opcode, true);                                   \
+    return result;                                                            \
+  }
+
+SIMPLE_FOUROPERAND(vpcmov, xop_VPCMOV)
+
+SIMPLE_FOUROPERAND(vpperm, xop_VPPERM)
+
+#define COMPAREFUNC(name, opcode)                                        \
+  static xop_t name(unsigned dst, unsigned src1, unsigned src2,          \
+                    xopcompare_e imm8) {                                 \
+    xop_t xop;                                                           \
+    xop_set_fouroperand_form(xop, dst, src1, src2, 0, opcode, true);     \
+    xop.imm8 = static_cast<unsigned char>(static_cast<uint32_t>(imm8));  \
+    return xop;                                                          \
+  }
+
+COMPAREFUNC(vpcomb, xop_VPCOMB)
+COMPAREFUNC(vpcomub, xop_VPCOMUB)
+COMPAREFUNC(vpcomw, xop_VPCOMW)
+COMPAREFUNC(vpcomuw, xop_VPCOMUW)
+COMPAREFUNC(vpcomd, xop_VPCOMD)
+COMPAREFUNC(vpcomud, xop_VPCOMUD)
+COMPAREFUNC(vpcomq, xop_VPCOMQ)
+COMPAREFUNC(vpcomuq, xop_VPCOMUQ)
+
+#define SIMPLE_THREEOPERAND(funcname, opcode)                              \
+  static xop_t funcname(unsigned destidx, unsigned src1idx,                \
+                        unsigned src2idx) {                                \
+    xop_t result{};                                                        \
+    xop_set_fouroperand_form(result, destidx, src1idx, src2idx, 0, opcode, \
+                             false);                                       \
+    return result;                                                         \
+  }
+
+SIMPLE_THREEOPERAND(vprotb, xop_VPROTB)
+SIMPLE_THREEOPERAND(vprotw, xop_VPROTW)
+SIMPLE_THREEOPERAND(vprotd, xop_VPROTD)
+SIMPLE_THREEOPERAND(vprotq, xop_VPROTQ)
+
+SIMPLE_THREEOPERAND(vpshab, xop_VPSHAB)
+SIMPLE_THREEOPERAND(vpshaw, xop_VPSHAW)
+SIMPLE_THREEOPERAND(vpshad, xop_VPSHAD)
+SIMPLE_THREEOPERAND(vpshaq, xop_VPSHAQ)
+
+SIMPLE_THREEOPERAND(vpshlb, xop_VPSHLB)
+SIMPLE_THREEOPERAND(vpshlw, xop_VPSHLW)
+SIMPLE_THREEOPERAND(vpshld, xop_VPSHLD)
+SIMPLE_THREEOPERAND(vpshlq, xop_VPSHLQ)
+
+#undef SIMPLE_THREEOPERAND
+#undef SIMPLE_FOUROPERAND
+#undef COMPAREFUNC
+}  // namespace operations
+
+}  // namespace amdfx
+}  // namespace x64
+}  // namespace backend
+}  // namespace cpu
+}  // namespace xe
+
+#endif  // XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_
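Annotation, not part of the patch: the encoder above seeds byte1 with 0xE8 so the inverted B/X/R extension bits start out as "no extension" before the map select is ORed in. A minimal harness for eyeballing its output, assuming the header compiles on its own and resolves against src/xenia/cpu/backend/x64/; the register indices and main() are illustrative only:

```cpp
#include <cstdio>

#include "x64_amdfx_extensions.h"  // assumed include path

int main() {
  using namespace xe::cpu::backend::x64;
  // vpcmov xmm1, xmm2, xmm3, xmm4 - arbitrary register indices.
  amdfx::xop_t op = amdfx::operations::vpcmov(1, 2, 3, 4);
  // Four-operand XOP ops live on opcode map 8 and carry an imm8 holding the
  // fourth register, so this should report 6 bytes starting with 0x8F.
  std::printf("%u bytes:", op.AssembledSize());
  op.ForeachByte([](unsigned char b) { std::printf(" %02X", b); });
  std::printf("\n");
  return 0;
}
```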
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index ccd6e969a..d1394d202 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -143,6 +143,12 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
       feature_flags_ |= kX64EmitTBM;
     }
   }
+  if (amd_flags & (1U << 11)) {
+    if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
+      feature_flags_ |= kX64EmitXOP;
+      XELOGCPU("CPU supports XOP!\n");
+    }
+  }
   if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
     bool is_zennish = cpu_.displayFamily >= 0x17;
     /*
@@ -1024,8 +1030,13 @@ static const vec128_t xmm_consts[] = {
     /* XMMF16PackLCPI6 */
-    vec128i(0x8000)
-
+    vec128i(0x8000),
+    /* XMMXOPByteShiftMask */
+    vec128b(7),
+    /* XMMXOPWordShiftMask */
+    vec128s(15),
+    /* XMMXOPDwordShiftMask */
+    vec128i(31)
 };

 void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
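Annotation, not part of the patch: the `amd_flags & (1U << 11)` test above corresponds to the XOP feature flag, CPUID Fn8000_0001h ECX[11]; xenia reads it through xbyak's Cpu helper. A standalone probe of the same bit, sketched with GCC/Clang's `<cpuid.h>`:

```cpp
#include <cpuid.h>
#include <cstdio>

static bool HasXOP() {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid(0x80000001u, &eax, &ebx, &ecx, &edx)) {
    return false;  // extended leaf not available
  }
  return (ecx >> 11) & 1;  // CPUID.80000001h:ECX[11] = XOP
}

int main() { std::printf("XOP: %s\n", HasXOP() ? "yes" : "no"); }
```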
#include "third_party/xbyak/xbyak/xbyak.h" #include "third_party/xbyak/xbyak/xbyak_util.h" - +#include "x64_amdfx_extensions.h" namespace xe { namespace cpu { class Processor; @@ -167,8 +167,14 @@ enum XmmConst { XMMF16PackLCPI3, XMMF16PackLCPI4, XMMF16PackLCPI5, - XMMF16PackLCPI6 + XMMF16PackLCPI6, + XMMXOPByteShiftMask, + XMMXOPWordShiftMask, + XMMXOPDwordShiftMask, + }; +using amdfx::xopcompare_e; +using Xbyak::Xmm; // X64Backend specific Instr->runtime_flags enum : uint32_t { INSTR_X64_FLAGS_ELIMINATED = @@ -351,6 +357,60 @@ class X64Emitter : public Xbyak::CodeGenerator { void EmitProfilerEpilogue(); + void EmitXOP(amdfx::xop_t xoperation) { + xoperation.ForeachByte([this](uint8_t b) { this->db(b); }); + } + + void vpcmov(Xmm dest, Xmm src1, Xmm src2, Xmm selector) { + auto xop_bytes = amdfx::operations::vpcmov( + dest.getIdx(), src1.getIdx(), src2.getIdx(), selector.getIdx()); + EmitXOP(xop_bytes); + } + + void vpperm(Xmm dest, Xmm src1, Xmm src2, Xmm selector) { + auto xop_bytes = amdfx::operations::vpperm( + dest.getIdx(), src1.getIdx(), src2.getIdx(), selector.getIdx()); + EmitXOP(xop_bytes); + } + +#define DEFINECOMPARE(name) \ + void name(Xmm dest, Xmm src1, Xmm src2, xopcompare_e compareop) { \ + auto xop_bytes = amdfx::operations::name(dest.getIdx(), src1.getIdx(), \ + src2.getIdx(), compareop); \ + EmitXOP(xop_bytes); \ + } + DEFINECOMPARE(vpcomb); + DEFINECOMPARE(vpcomub); + DEFINECOMPARE(vpcomw); + DEFINECOMPARE(vpcomuw); + DEFINECOMPARE(vpcomd); + DEFINECOMPARE(vpcomud); + DEFINECOMPARE(vpcomq); + DEFINECOMPARE(vpcomuq); +#undef DEFINECOMPARE + +#define DEFINESHIFTER(name) \ + void name(Xmm dest, Xmm src1, Xmm src2) { \ + auto xop_bytes = \ + amdfx::operations::name(dest.getIdx(), src1.getIdx(), src2.getIdx()); \ + EmitXOP(xop_bytes); \ + } + + DEFINESHIFTER(vprotb) + DEFINESHIFTER(vprotw) + DEFINESHIFTER(vprotd) + DEFINESHIFTER(vprotq) + + DEFINESHIFTER(vpshab) + DEFINESHIFTER(vpshaw) + DEFINESHIFTER(vpshad) + DEFINESHIFTER(vpshaq) + + DEFINESHIFTER(vpshlb) + DEFINESHIFTER(vpshlw) + DEFINESHIFTER(vpshld) + DEFINESHIFTER(vpshlq) + protected: void* Emplace(const EmitFunctionInfo& func_info, GuestFunction* function = nullptr); diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 3d9a5f797..46eb285cf 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -19,6 +19,16 @@ #include "xenia/base/cvar.h" #include "xenia/cpu/backend/x64/x64_stack_layout.h" +DEFINE_bool(xop_rotates, false, "rotate via xop", "X64"); + +DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64"); + +DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64"); + +DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64"); + +DEFINE_bool(xop_compares, true, "compare via xop", "X64"); + namespace xe { namespace cpu { namespace backend { @@ -143,6 +153,7 @@ struct VECTOR_DENORMFLUSH e.vandps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123 e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01 + // todo: xop vpcmov here e.vandps(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros // must keep their signs @@ -406,26 +417,44 @@ struct VECTOR_COMPARE_SGE_V128 static void Emit(X64Emitter& e, const EmitArgType& i) { EmitAssociativeBinaryXmmOp( e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpeqb(e.xmm0, src1, src2); - e.vpcmpgtb(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case 
-            case INT16_TYPE:
-              e.vpcmpeqw(e.xmm0, src1, src2);
-              e.vpcmpgtw(dest, src1, src2);
-              e.vpor(dest, e.xmm0);
-              break;
-            case INT32_TYPE:
-              e.vpcmpeqd(e.xmm0, src1, src2);
-              e.vpcmpgtd(dest, src1, src2);
-              e.vpor(dest, e.xmm0);
-              break;
-            case FLOAT32_TYPE:
-              e.ChangeMxcsrMode(MXCSRMode::Vmx);
-              e.vcmpgeps(dest, src1, src2);
-              break;
+          if (cvars::xop_compares && e.IsFeatureEnabled(kX64EmitXOP)) {
+            switch (i.instr->flags) {
+              case INT8_TYPE:
+                e.vpcomb(dest, src1, src2, xopcompare_e::GTE);
+                break;
+              case INT16_TYPE:
+                e.vpcomw(dest, src1, src2, xopcompare_e::GTE);
+                break;
+              case INT32_TYPE:
+                e.vpcomd(dest, src1, src2, xopcompare_e::GTE);
+                break;
+              case FLOAT32_TYPE:
+                e.ChangeMxcsrMode(MXCSRMode::Vmx);
+                e.vcmpgeps(dest, src1, src2);
+                break;
+            }
+          } else {
+            switch (i.instr->flags) {
+              case INT8_TYPE:
+                e.vpcmpeqb(e.xmm0, src1, src2);
+                e.vpcmpgtb(dest, src1, src2);
+                e.vpor(dest, e.xmm0);
+                break;
+              case INT16_TYPE:
+                e.vpcmpeqw(e.xmm0, src1, src2);
+                e.vpcmpgtw(dest, src1, src2);
+                e.vpor(dest, e.xmm0);
+                break;
+              case INT32_TYPE:
+                e.vpcmpeqd(e.xmm0, src1, src2);
+                e.vpcmpgtd(dest, src1, src2);
+                e.vpor(dest, e.xmm0);
+                break;
+              case FLOAT32_TYPE:
+                e.ChangeMxcsrMode(MXCSRMode::Vmx);
+                e.vcmpgeps(dest, src1, src2);
+                break;
+            }
          }
        });
   }
@@ -600,6 +629,7 @@ struct VECTOR_ADD
       // overflowed (only need to check one input)
       // if (src1 > res) then overflowed
       // http://locklessinc.com/articles/sat_arithmetic/
+      // chrispy: todo - add xop stuff here
       e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32));
       e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32));
       e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0);
@@ -755,23 +785,52 @@ static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
   // Store result and return it.
   return _mm_load_si128(reinterpret_cast<__m128i*>(value));
 }
-
+static XmmConst GetShiftmaskForType(unsigned typ) {
+  if (typ == INT8_TYPE) {
+    return XMMXOPByteShiftMask;
+  } else if (typ == INT16_TYPE) {
+    return XMMXOPWordShiftMask;
+  } else {
+    return XMMXOPDwordShiftMask;
+  }
+}
 struct VECTOR_SHL_V128
     : Sequence<VECTOR_SHL_V128, I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_left_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpshlb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpshlw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpshld(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
@@ -1041,19 +1100,45 @@ static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
 struct VECTOR_SHR_V128
     : Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      e.vpcmpeqb(e.xmm3, e.xmm3);
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpsignb(e.xmm2, e.xmm3);
+          e.vpshlb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpsignw(e.xmm2, e.xmm3);
+          e.vpshlw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpsignd(e.xmm2, e.xmm3);
+          e.vpshld(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
@@ -1224,19 +1309,45 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128);
 struct VECTOR_SHA_V128
     : Sequence<VECTOR_SHA_V128, I<OPCODE_VECTOR_SHA, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_arithmetic_right_shifts &&
+        e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      e.vpcmpeqb(e.xmm3, e.xmm3);
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpsignb(e.xmm2, e.xmm3);
+          e.vpshab(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpsignw(e.xmm2, e.xmm3);
+          e.vpshaw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpsignd(e.xmm2, e.xmm3);
+          e.vpshad(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
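Annotation, not part of the patch: VPSHL/VPSHA read a per-lane *signed* shift count, shifting left for positive lanes and right for negative ones, which is why the VECTOR_SHR and VECTOR_SHA paths above negate their counts (vpcmpeqb builds an all-ones vector, i.e. -1 in every lane, and vpsign* multiplies the masked counts by that sign). A scalar model of one byte lane, assuming the counts were already masked to the element width by the XMMXOP*ShiftMask constants:

```cpp
#include <cstdint>

// One lane of VPSHLB: positive count shifts left, negative shifts right
// (logical). VECTOR_SHR by n thus becomes vpshlb with count = -(n & 7).
uint8_t vpshlb_lane(uint8_t v, int8_t count) {
  return count >= 0 ? uint8_t(uint8_t(v) << count) : uint8_t(v >> -count);
}

// One lane of VPSHAB: same idea, but right shifts replicate the sign bit
// (">>" on a negative int8_t is arithmetic on the targets xenia builds for).
int8_t vpshab_lane(int8_t v, int8_t count) {
  return count >= 0 ? int8_t(uint8_t(v) << count) : int8_t(v >> -count);
}
```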
@@ -1412,55 +1523,29 @@ struct VECTOR_ROTATE_LEFT_V128
     : Sequence<VECTOR_ROTATE_LEFT_V128,
                I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        // TODO(benvanik): native version (with shift magic).
-        if (i.src2.is_constant) {
-          e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
-        } else {
-          e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
-        }
-        e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
-        e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
-        e.vmovaps(i.dest, e.xmm0);
-        break;
-      case INT16_TYPE:
-        // TODO(benvanik): native version (with shift magic).
-        if (i.src2.is_constant) {
-          e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
-        } else {
-          e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
-        }
-        e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
-        e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
-        e.vmovaps(i.dest, e.xmm0);
-        break;
-      case INT32_TYPE: {
-        if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
-          e.vprolvd(i.dest, i.src1, i.src2);
-        } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
-          Xmm temp = i.dest;
-          if (i.dest == i.src1 || i.dest == i.src2) {
-            temp = e.xmm2;
-          }
-          // Shift left (to get high bits):
-          if (i.src2.is_constant) {
-            e.LoadConstantXmm(temp, i.src2.constant());
-            e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
-          } else {
-            e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
-          }
-          e.vpsllvd(e.xmm1, i.src1, e.xmm0);
-          // Shift right (to get low bits):
-          e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
-          e.vpsubd(temp, e.xmm0);
-          e.vpsrlvd(i.dest, i.src1, temp);
-          // Merge:
-          e.vpor(i.dest, e.xmm1);
-        } else {
-          // TODO(benvanik): non-AVX2 native version.
+    if (cvars::xop_rotates && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vprotb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vprotw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vprotd(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          // TODO(benvanik): native version (with shift magic).
           if (i.src2.is_constant) {
             e.lea(e.GetNativeParam(1),
                   e.StashConstantXmm(1, i.src2.constant()));
@@ -1469,14 +1554,63 @@ struct VECTOR_ROTATE_LEFT_V128
           }
           e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
           e.CallNativeSafe(
-              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
+              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
           e.vmovaps(i.dest, e.xmm0);
+          break;
+        case INT16_TYPE:
+          // TODO(benvanik): native version (with shift magic).
+          if (i.src2.is_constant) {
+            e.lea(e.GetNativeParam(1),
+                  e.StashConstantXmm(1, i.src2.constant()));
+          } else {
+            e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
+          }
+          e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
+          e.CallNativeSafe(
+              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
+          e.vmovaps(i.dest, e.xmm0);
+          break;
+        case INT32_TYPE: {
+          if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+            e.vprolvd(i.dest, i.src1, i.src2);
+          } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
+            Xmm temp = i.dest;
+            if (i.dest == i.src1 || i.dest == i.src2) {
+              temp = e.xmm2;
+            }
+            // Shift left (to get high bits):
+            if (i.src2.is_constant) {
+              e.LoadConstantXmm(temp, i.src2.constant());
+              e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
+            } else {
+              e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
+            }
+            e.vpsllvd(e.xmm1, i.src1, e.xmm0);
+            // Shift right (to get low bits):
+            e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
+            e.vpsubd(temp, e.xmm0);
+            e.vpsrlvd(i.dest, i.src1, temp);
+            // Merge:
+            e.vpor(i.dest, e.xmm1);
+          } else {
+            // TODO(benvanik): non-AVX2 native version.
+            if (i.src2.is_constant) {
+              e.lea(e.GetNativeParam(1),
+                    e.StashConstantXmm(1, i.src2.constant()));
+            } else {
+              e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
+            }
+            e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
+            e.CallNativeSafe(
+                reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
+            e.vmovaps(i.dest, e.xmm0);
+          }
+          break;
         }
-        break;
+        default:
+          assert_always();
+          break;
       }
-      default:
-        assert_always();
-        break;
     }
   }
 };
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index d5dad5cd7..48e340d42 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -50,10 +50,10 @@ DEFINE_bool(no_round_to_single, false,
             "Not for users, breaks games. Skip rounding double values to "
             "single precision and back",
             "CPU");
-DEFINE_bool(
-    inline_loadclock, false,
-    "Directly read cached guest clock without calling the LoadClock method (it gets repeatedly updated by calls from other threads)",
-    "CPU");
+DEFINE_bool(inline_loadclock, false,
+            "Directly read cached guest clock without calling the LoadClock "
+            "method (it gets repeatedly updated by calls from other threads)",
+            "CPU");
 namespace xe {
 namespace cpu {
 namespace backend {
@@ -549,7 +549,7 @@ struct MAX_F64 : Sequence<MAX_F64, I<OPCODE_MAX, F64Op, F64Op, F64Op>> {
 struct MAX_V128 : Sequence<MAX_V128, I<OPCODE_MAX, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     e.ChangeMxcsrMode(MXCSRMode::Vmx);
-    //if 0 and -0, return 0! opposite of minfp
+    // if 0 and -0, return 0! opposite of minfp
     auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
     auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
     e.vmaxps(e.xmm2, src1, src2);
@@ -781,11 +781,15 @@ struct SELECT_V128_V128
     } else if (mayblend == PermittedBlend::Ps) {
       e.vblendvps(i.dest, src2, src3, src1);
     } else {
-      //ideally we would have an xop path here...
-      // src1 ? src2 : src3;
-      e.vpandn(e.xmm3, src1, src2);
-      e.vpand(i.dest, src1, src3);
-      e.vpor(i.dest, i.dest, e.xmm3);
+      if (e.IsFeatureEnabled(kX64EmitXOP)) {
+        e.vpcmov(i.dest, src3, src2, src1);
+      } else {
+        // src1 ? src2 : src3;
+
+        e.vpandn(e.xmm3, src1, src2);
+        e.vpand(i.dest, src1, src3);
+        e.vpor(i.dest, i.dest, e.xmm3);
+      }
     }
   }
 };
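Annotation, not part of the patch: VPCMOV is a pure bitwise select, which is why the single instruction in SELECT_V128_V128 can replace the three-instruction vpandn/vpand/vpor fallback. A per-bit model:

```cpp
#include <cstdint>

// VPCMOV dest, a, b, selector: selector bits that are 1 take the bit from a,
// bits that are 0 take it from b.
uint64_t vpcmov_bits(uint64_t a, uint64_t b, uint64_t selector) {
  return (a & selector) | (b & ~selector);
}

// So e.vpcmov(i.dest, src3, src2, src1) computes
// (src3 & src1) | (src2 & ~src1) - exactly what the removed
// vpandn/vpand/vpor sequence produced.
```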
diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc
index a5100cff6..e94b570dc 100644
--- a/src/xenia/cpu/compiler/passes/simplification_pass.cc
+++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc
@@ -84,7 +84,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) {
     iter_result |= EliminateConversions(builder);
     iter_result |= SimplifyAssignments(builder);
     iter_result |= SimplifyBasicArith(builder);
-
+    iter_result |= SimplifyVectorOps(builder);
     result |= iter_result;
   } while (iter_result);
   return true;
@@ -1393,6 +1393,65 @@ bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
   return result;
 }

+static bool CouldEverProduceDenormal(hir::Instr* i) {
+  if (!i) {
+    return false;
+  }
+  Opcode denflushed_opcode = i->GetOpcodeNum();
+
+  if (denflushed_opcode == OPCODE_VECTOR_DENORMFLUSH) {
+    return false;
+  } else if (denflushed_opcode == OPCODE_UNPACK) {
+    // todo: more unpack operations likely cannot produce denormals
+    if (i->flags == PACK_TYPE_FLOAT16_4 || i->flags == PACK_TYPE_FLOAT16_2) {
+      return false;  // the xenos half-float format does not support denormals
+    }
+  } else if (denflushed_opcode == OPCODE_VECTOR_CONVERT_I2F) {
+    return false;
+  }
+  return true;  // todo: recurse, check values for min/max, abs, and others
+}
+
+bool SimplificationPass::SimplifyVectorOps(hir::Instr* i,
+                                           hir::HIRBuilder* builder) {
+  Opcode opc = i->GetOpcodeNum();
+  /*
+    if the input to an unconditional denormal flush is the output of an
+    unconditional denormal flush, it is a pointless instruction and should be
+    eliminated
+  */
+  if (opc == OPCODE_VECTOR_DENORMFLUSH) {
+    hir::Instr* denflushed_def = i->src1.value->GetDefSkipAssigns();
+
+    if (denflushed_def) {
+      if (!CouldEverProduceDenormal(denflushed_def)) {
+        i->opcode = &OPCODE_ASSIGN_info;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) {
+  bool result = false;
+  auto block = builder->first_block();
+  while (block) {
+    auto i = block->instr_head;
+    while (i) {
+      bool looks_vectory = false;
+
+      i->VisitValueOperands([&looks_vectory](Value* val, uint32_t idx) {
+        if (val->type == VEC128_TYPE) {
+          looks_vectory = true;
+        }
+      });
+      result |= SimplifyVectorOps(i, builder);
+      i = i->next;
+    }
+    block = block->next;
+  }
+  return result;
+}
 /*
 todo: add load-store simplification pass
diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h
index 078187eb1..66d2a26a2 100644
--- a/src/xenia/cpu/compiler/passes/simplification_pass.h
+++ b/src/xenia/cpu/compiler/passes/simplification_pass.h
@@ -35,6 +35,9 @@ class SimplificationPass : public ConditionalGroupSubpass {
   // handles simple multiplication/addition rules
   bool SimplifyBasicArith(hir::HIRBuilder* builder);
+
+  bool SimplifyVectorOps(hir::HIRBuilder* builder);
+  bool SimplifyVectorOps(hir::Instr* i, hir::HIRBuilder* builder);
   bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
   bool SimplifyAddWithSHL(hir::Instr* i, hir::HIRBuilder* builder);
   bool SimplifyAddToSelf(hir::Instr* i, hir::HIRBuilder* builder);
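Annotation, not part of the patch: the FLOAT16 early-out in CouldEverProduceDenormal rests on a range fact rather than anything XOP-specific - the smallest positive half-precision value (a subnormal) is 2^-24, far above the float32 normal threshold of 2^-126, so no fp16-to-fp32 unpack result can be a single-precision denormal. A quick self-check:

```cpp
#include <cassert>
#include <cmath>

int main() {
  // Smallest positive fp16 subnormal is 2^-24; it converts to a *normal*
  // float32, so FLOAT16_2/FLOAT16_4 unpacks never produce denormals.
  float smallest_half = std::ldexp(1.0f, -24);
  assert(std::fpclassify(smallest_half) == FP_NORMAL);
  return 0;
}
```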
diff --git a/src/xenia/cpu/function.h b/src/xenia/cpu/function.h
index 57969cfda..856fdbe1c 100644
--- a/src/xenia/cpu/function.h
+++ b/src/xenia/cpu/function.h
@@ -31,10 +31,11 @@ struct SourceMapEntry {
   uint32_t hir_offset;   // Block ordinal (16b) | Instr ordinal (16b)
   uint32_t code_offset;  // Offset from emitted code start.
 };
+enum class SaveRestoreType : uint8_t { NONE, GPR, VMX, FPR };

 class Function : public Symbol {
  public:
-  enum class Behavior {
+  enum class Behavior : uint8_t {
     kDefault = 0,
     kProlog,
     kEpilog,
@@ -53,6 +54,20 @@ class Function : public Symbol {
   void set_behavior(Behavior value) { behavior_ = value; }
   bool is_guest() const { return behavior_ != Behavior::kBuiltin; }

+  void SetSaverest(SaveRestoreType type, bool is_rest, uint8_t index) {
+    saverest_type_ = type;
+    is_restore_ = is_rest;
+    saverest_index_ = index;
+  }
+
+  bool IsSaverest() const { return saverest_type_ != SaveRestoreType::NONE; }
+
+  SaveRestoreType SaverestType() const { return saverest_type_; }
+  unsigned SaverestIndex() const { return saverest_index_; }
+
+  bool IsSave() const { return IsSaverest() && is_restore_ == 0; }
+  bool IsRestore() const { return IsSaverest() && is_restore_; }
+
   bool ContainsAddress(uint32_t address) const {
     if (!address_ || !end_address_) {
       return false;
@@ -71,7 +86,11 @@ class Function : public Symbol {
   Function(Module* module, uint32_t address);

   uint32_t end_address_ = 0;
+
   Behavior behavior_ = Behavior::kDefault;
+  SaveRestoreType saverest_type_ = SaveRestoreType::NONE;
+  uint8_t is_restore_ = 0;
+  uint8_t saverest_index_ = 0;
 };

 class BuiltinFunction : public Function {
diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc
index df5b72375..fda6812b4 100644
--- a/src/xenia/cpu/hir/hir_builder.cc
+++ b/src/xenia/cpu/hir/hir_builder.cc
@@ -1023,13 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) {

 Value* HIRBuilder::Convert(Value* value, TypeName target_type,
                            RoundMode round_mode) {
-  if (value->type == target_type) {
-    return value;
-  } else if (value->IsConstant()) {
-    Value* dest = CloneValue(value);
-    dest->Convert(target_type, round_mode);
-    return dest;
-  }
   Instr* i =
       AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type));
@@ -1041,11 +1034,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type,

 Value* HIRBuilder::Round(Value* value, RoundMode round_mode) {
   ASSERT_FLOAT_OR_VECTOR_TYPE(value);
-  if (value->IsConstant()) {
-    Value* dest = CloneValue(value);
-    dest->Round(round_mode);
-    return dest;
-  }
   Instr* i =
       AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type));
@@ -1295,7 +1283,7 @@ void HIRBuilder::SetNJM(Value* value) {

 Value* HIRBuilder::Max(Value* value1, Value* value2) {
   ASSERT_TYPES_EQUAL(value1, value2);

-  if (value1->type != VEC128_TYPE && value1->IsConstant() &&
+  if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
       value2->IsConstant()) {
     return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value2 : value1;
   }
@@ -1323,7 +1311,7 @@ Value* HIRBuilder::VectorMax(Value* value1, Value* value2, TypeName part_type,

 Value* HIRBuilder::Min(Value* value1, Value* value2) {
   ASSERT_TYPES_EQUAL(value1, value2);

-  if (value1->type != VEC128_TYPE && value1->IsConstant() &&
+  if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
       value2->IsConstant()) {
     return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value1 : value2;
   }
@@ -1351,8 +1339,9 @@ Value* HIRBuilder::VectorMin(Value* value1, Value* value2, TypeName part_type,

 Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) {
   assert_true(cond->type == INT8_TYPE || cond->type == VEC128_TYPE);  // for now
   ASSERT_TYPES_EQUAL(value1, value2);
-
-  if (cond->IsConstant()) {
+  // chrispy: this was being done with V128 too, which was breaking stuff,
+  // obviously, because that should be an element-by-element select
+  if (cond->IsConstant() && IsScalarIntegralType(cond->type)) {
     return cond->IsConstantTrue() ? value1 : value2;
   }
@@ -1518,7 +1507,8 @@ Value* HIRBuilder::Add(Value* value1, Value* value2,
   ASSERT_TYPES_EQUAL(value1, value2);

   // TODO(benvanik): optimize when flags set.
-  if (!arithmetic_flags) {
+
+  if (!arithmetic_flags && IsScalarIntegralType(value1->type)) {
     if (value1->IsConstantZero()) {
       return value2;
     } else if (value2->IsConstantZero()) {
diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
index 71d323f2e..c491ad09a 100644
--- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
@@ -442,7 +442,18 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- abs(frB)
   Value* v = f.Abs(f.LoadFPR(i.X.RB));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  /*
+    The contents of frB with bit 0 cleared are placed into frD. Note that the
+    fabs instruction treats NaNs just like any other kind of value; that is,
+    the sign bit of a NaN may be altered by fabs. This instruction does not
+    alter the FPSCR.
+    Other registers altered:
+    • Condition Register (CR1 field): FX, FEX, VX, OX (if Rc = 1)
+  */
+  // f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    // todo
+  }
   return 0;
 }

@@ -458,7 +469,10 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- !abs(frB)
   Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  // f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    // todo
+  }
   return 0;
 }

@@ -466,7 +480,10 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- ¬ frB[0] || frB[1-63]
   Value* v = f.Neg(f.LoadFPR(i.X.RB));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  // f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    // todo
+  }
   return 0;
 }

diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc
index 7ccf3f71b..6e0d13178 100644
--- a/src/xenia/cpu/xex_module.cc
+++ b/src/xenia/cpu/xex_module.cc
@@ -1598,6 +1598,8 @@ bool XexModule::FindSaveRest() {
       // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveGprLr;
       function->set_behavior(Function::Behavior::kProlog);
       function->set_status(Symbol::Status::kDeclared);
+      function->SetSaverest(cpu::SaveRestoreType::GPR, false, n);
+
       address += 4;
     }
     address = gplr_start + 20 * 4;
@@ -1612,6 +1614,7 @@ bool XexModule::FindSaveRest() {
      // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestGprLr;
       function->set_behavior(Function::Behavior::kEpilogReturn);
       function->set_status(Symbol::Status::kDeclared);
+      function->SetSaverest(cpu::SaveRestoreType::GPR, true, n);
       address += 4;
     }
   }
@@ -1628,6 +1631,8 @@ bool XexModule::FindSaveRest() {
       // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveFpr;
       function->set_behavior(Function::Behavior::kProlog);
       function->set_status(Symbol::Status::kDeclared);
+
+      function->SetSaverest(cpu::SaveRestoreType::FPR, false, n);
       address += 4;
     }
     address = fpr_start + (18 * 4) + (1 * 4);
@@ -1642,6 +1647,7 @@ bool XexModule::FindSaveRest() {
       // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestFpr;
       function->set_behavior(Function::Behavior::kEpilog);
       function->set_status(Symbol::Status::kDeclared);
+      function->SetSaverest(cpu::SaveRestoreType::FPR, true, n);
       address += 4;
     }
   }
@@ -1662,6 +1668,7 @@ bool XexModule::FindSaveRest() {
       // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
       function->set_behavior(Function::Behavior::kProlog);
       function->set_status(Symbol::Status::kDeclared);
+      function->SetSaverest(cpu::SaveRestoreType::VMX, false, n);
       address += 2 * 4;
     }
     address += 4;
@@ -1675,6 +1682,7 @@ bool XexModule::FindSaveRest() {
       // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx;
       function->set_behavior(Function::Behavior::kProlog);
       function->set_status(Symbol::Status::kDeclared);
+      function->SetSaverest(cpu::SaveRestoreType::VMX, false, n);
       address += 2 * 4;
     }
     address = vmx_start + (18 * 2 * 4) + (1 * 4) + (64 * 2 * 4) + (1 * 4);
@@ -1688,6 +1696,7 @@ bool XexModule::FindSaveRest() {
       // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
       function->set_behavior(Function::Behavior::kEpilog);
       function->set_status(Symbol::Status::kDeclared);
+      function->SetSaverest(cpu::SaveRestoreType::VMX, true, n);
       address += 2 * 4;
     }
     address += 4;
@@ -1701,6 +1710,7 @@ bool XexModule::FindSaveRest() {
       // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx;
       function->set_behavior(Function::Behavior::kEpilog);
       function->set_status(Symbol::Status::kDeclared);
+      function->SetSaverest(cpu::SaveRestoreType::VMX, true, n);
       address += 2 * 4;
     }
   }