diff --git a/src/xenia/cpu/backend/x64/x64_amdfx_extensions.h b/src/xenia/cpu/backend/x64/x64_amdfx_extensions.h new file mode 100644 index 000000000..947589a8e --- /dev/null +++ b/src/xenia/cpu/backend/x64/x64_amdfx_extensions.h @@ -0,0 +1,334 @@ +#ifndef XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_ +#define XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_ + +#include +#include +#include + +namespace xe { +namespace cpu { +namespace backend { +namespace x64 { +namespace amdfx { +enum xopcodemap_e : unsigned char { + XOPCODE_HAS_IMMBYTE = 0x8, + XOPCODE_NO_IMMBYTE = 0x9 +}; + +// base opcodes, without their size specified +enum xopcode_e : unsigned char { + xop_VFRCZPD = 0x81, + xop_VFRCZPS = 0x80, + xop_VFRCZSD = 0x83, + xop_VFRCZSS = 0x82, + xop_VPCMOV = 0xA2, + xop_VPCOMB = 0xCC, + xop_VPCOMD = 0xCE, + xop_VPCOMQ = 0xCF, + xop_VPCOMUB = 0xEC, + xop_VPCOMUD = 0xEE, + xop_VPCOMUQ = 0xEF, + xop_VPCOMUW = 0xED, + xop_VPCOMW = 0xCD, + xop_VPERMIL2PD = 0x49, + xop_VPERMIL2PS = 0x48, + xop_VPHADDBD = 0xC2, + xop_VPHADDBQ = 0xC3, + xop_VPHADDBW = 0xC1, + xop_VPHADDDQ = 0xCB, + xop_VPHADDUBD = 0xD2, + xop_VPHADDUBQ = 0xD3, + xop_VPHADDUBW = 0xD1, + xop_VPHADDUDQ = 0xDB, + xop_VPHADDUWD = 0xD6, + xop_VPHADDUWQ = 0xD7, + xop_VPHADDWD = 0xC6, + xop_VPHADDWQ = 0xC7, + xop_VPHSUBBW = 0xE1, + xop_VPHSUBDQ = 0xE3, + xop_VPHSUBWD = 0xE2, + xop_VPMACSDD = 0x9E, + xop_VPMACSDQH = 0x9F, + xop_VPMACSDQL = 0x97, + xop_VPMACSSDD = 0x8E, + xop_VPMACSSDQH = 0x8F, + xop_VPMACSSDQL = 0x87, + xop_VPMACSSWD = 0x86, + xop_VPMACSSWW = 0x85, + xop_VPMACSWD = 0x96, + xop_VPMACSWW = 0x95, + xop_VPMADCSSWD = 0xA6, + xop_VPMADCSWD = 0xB6, + xop_VPPERM = 0xA3, + xop_VPROTB = 0x90, + xop_VPROTBI = 0xC0, // imm version + xop_VPROTD = 0x92, + xop_VPROTDI = 0xC2, + xop_VPROTQ = 0x93, + xop_VPROTQI = 0xC3, + xop_VPROTW = 0x91, + xop_VPROTWI = 0xC1, + xop_VPSHAB = 0x98, + xop_VPSHAD = 0x9A, + xop_VPSHAQ = 0x9B, + xop_VPSHAW = 0x99, + xop_VPSHLB = 0x94, + xop_VPSHLD = 0x96, + xop_VPSHLQ = 0x97, + xop_VPSHLW 
= 0x95, + +}; + +enum xop_iop_e : unsigned char { + XOP_BYTE = 0, + XOP_WORD = 1, + XOP_DOUBLEWORD = 2, + XOP_QUADWORD = 3 +}; + +enum xop_fop_e : unsigned char { + XOP_PS = 0, + XOP_PD = 1, + XOP_SS = 2, + XOP_SD = 3 +}; +class xop_byte1_t { + public: + union { + // informative names + struct { + /* + A five bit field encoding a one- or two-byte opcode prefix. + */ + unsigned char opcode_map_select : 5; + /* + This bit provides a one-bit extension of either the ModRM.r/m + field to specify a GPR or XMM register or to the SIB base field to + specify a GPR. This permits access to 16 registers. In 32-bit protected + and compatibility modes, this bit is ignored. This bit is the + bit-inverted equivalent of the REX.B bit and is available only in the + 3-byte prefix format. + */ + unsigned char inv_1bit_ext_modrm_or_sib : 1; + /* + This bit provides a one bit extension of the SIB.index field in + 64-bit mode, permitting access to 16 YMM/XMM and GPR registers. In + 32-bit protected and compatibility modes, this bit must be set to 1. + This bit is the bit-inverted equivalent of the REX.X bit + */ + unsigned char inv_1bit_ext_sib_index : 1; + /* + This bit provides a one bit extension of the ModRM.reg field in + 64-bit mode, permitting access to all 16 YMM/XMM and GPR registers. In + 32-bit protected and compatibility modes, this bit must be set to 1. + This bit is the bit-inverted equivalent of the REX.R bit. 
+ */ + unsigned char inv_1bit_ext_modrm_reg_field : 1; + }; + // amd manual names + struct { + unsigned char mmmmm : 5; + unsigned char B : 1; + unsigned char X : 1; + unsigned char R : 1; + }; + unsigned char encoded; + }; +}; + +class xop_byte2_t { + public: + union { + struct { + unsigned char + implied_66f2f3_ext : 2; // 0 = no implied, 1 = 66, 2 = F3, 3 = F2 + unsigned char vector_length : 1; + unsigned char source_or_dest_reg_specifier_inverted_1s_compl : 4; + unsigned char scalar_reg_size_override_special : 1; + }; + // amd manual names + + struct { + unsigned char pp : 2; // 0 = no implied, 1 = 66, 2 = F3, 3 = F2 (same + // bits as implied_66f2f3_ext above) + unsigned char L : 1; + unsigned char vvvv : 4; // src1 for four operand form + unsigned char W : 1; + }; + unsigned char encoded; + }; +}; + +class xop_opcode_byte_t { + public: + union { + struct { + xop_fop_e float_datatype : 2; + unsigned char __unused0 : 6; + }; + + struct { + xop_iop_e int_datatype : 2; + unsigned char __unused1 : 6; + }; + + struct { + unsigned char oes : 2; + unsigned char opcode : 6; + }; + unsigned char encoded; + }; +}; + +class modrm_byte_t { + public: + union { + struct { + unsigned char rm : 3; + unsigned char mod : 5; // ModRM.reg and ModRM.mod together; 4 opnd form dest reg + }; + unsigned char encoded; + }; +}; + +#pragma pack(push, 1) +class xop_t { + public: + unsigned char imm_8F; // always 0x8F + xop_byte1_t byte1; + xop_byte2_t byte2; + xop_opcode_byte_t opcode; + modrm_byte_t modrm; + unsigned char imm8; + + xop_t() : imm_8F(0x8F) { + byte1.encoded = 0; + byte2.encoded = 0; + opcode.encoded = 0; + modrm.encoded = 0; + } + + unsigned AssembledSize() const { + if (byte1.opcode_map_select == XOPCODE_NO_IMMBYTE) { + return 5; + } else { + return 6; + } + } + + template <typename TCall> + void ForeachByte(TCall&& cb) { + cb(imm_8F); + cb(byte1.encoded); + cb(byte2.encoded); + cb(opcode.encoded); + cb(modrm.encoded); + if (AssembledSize() == 6) { + cb(imm8); + } + } +}; +#pragma pack(pop) + +static void xop_set_fouroperand_form(xop_t& xop, unsigned 
xmmidx_dest, + unsigned xmmidx_src1, unsigned xmmidx_src2, + unsigned xmmidx_src3, xopcode_e opcode, + bool has_immbyte = true) { + xop.opcode.encoded = opcode; + xop.byte1.encoded = 0xe8; + if (has_immbyte) { + xop.byte1.opcode_map_select = XOPCODE_HAS_IMMBYTE; + } else { + xop.byte1.opcode_map_select = XOPCODE_NO_IMMBYTE; + } + xop.imm8 = xmmidx_src3 << 4; + + xop.modrm.rm = xmmidx_src2 & 0b111; + xop.byte1.inv_1bit_ext_modrm_reg_field = (xmmidx_dest >> 3) ^ 1; + xop.byte1.inv_1bit_ext_modrm_or_sib = (xmmidx_src2 >> 3) ^ 1; + xop.byte2.vvvv = ~xmmidx_src1; + xop.modrm.encoded |= 0xC0; + xop.modrm.mod |= xmmidx_dest & 0b111; +} + +enum class xopcompare_e : uint32_t { + LT = 0b000, + LTE = 0b001, + GT = 0b010, + GTE = 0b011, + EQ = 0b100, + NEQ = 0b101, + FALSEY = 0b110, // there doesnt seem to be much in the way of documentation + // for these two + TRUTHEY = 0b111 +}; + +namespace operations { +#define SIMPLE_FOUROPERAND(funcname, opcode) \ + static xop_t funcname(unsigned destidx, unsigned src1idx, unsigned src2idx, \ + unsigned src3idx) { \ + xop_t result{}; \ + xop_set_fouroperand_form(result, destidx, src1idx, src2idx, src3idx, \ + opcode, true); \ + return result; \ + } + +SIMPLE_FOUROPERAND(vpcmov, xop_VPCMOV) + +SIMPLE_FOUROPERAND(vpperm, xop_VPPERM) + +#define COMPAREFUNC(name, opcode) \ + static xop_t name(unsigned dst, unsigned src1, unsigned src2, \ + xopcompare_e imm8) { \ + xop_t xop; \ + xop_set_fouroperand_form(xop, dst, src1, src2, 0, opcode, true); \ + xop.imm8 = static_cast(static_cast(imm8)); \ + return xop; \ + } + +COMPAREFUNC(vpcomb, xop_VPCOMB) +COMPAREFUNC(vpcomub, xop_VPCOMUB) +COMPAREFUNC(vpcomw, xop_VPCOMW) +COMPAREFUNC(vpcomuw, xop_VPCOMUW) +COMPAREFUNC(vpcomd, xop_VPCOMD) +COMPAREFUNC(vpcomud, xop_VPCOMUD) +COMPAREFUNC(vpcomq, xop_VPCOMQ) +COMPAREFUNC(vpcomuq, xop_VPCOMUQ) + +#define SIMPLE_THREEOPERAND(funcname, opcode) \ + static xop_t funcname(unsigned destidx, unsigned src1idx, \ + unsigned src2idx) { \ + xop_t result{}; \ + 
xop_set_fouroperand_form(result, destidx, src1idx, src2idx, 0, opcode, \ + false); \ + return result; \ + } + +SIMPLE_THREEOPERAND(vprotb, xop_VPROTB) +SIMPLE_THREEOPERAND(vprotw, xop_VPROTW) +SIMPLE_THREEOPERAND(vprotd, xop_VPROTD) +SIMPLE_THREEOPERAND(vprotq, xop_VPROTQ) + +SIMPLE_THREEOPERAND(vpshab, xop_VPSHAB) +SIMPLE_THREEOPERAND(vpshaw, xop_VPSHAW) +SIMPLE_THREEOPERAND(vpshad, xop_VPSHAD) +SIMPLE_THREEOPERAND(vpshaq, xop_VPSHAQ) + + +SIMPLE_THREEOPERAND(vpshlb, xop_VPSHLB) +SIMPLE_THREEOPERAND(vpshlw, xop_VPSHLW) +SIMPLE_THREEOPERAND(vpshld, xop_VPSHLD) +SIMPLE_THREEOPERAND(vpshlq, xop_VPSHLQ) + +#undef SIMPLE_THREEOPERAND +#undef SIMPLE_FOUROPERAND +#undef COMPAREFUNC +} // namespace operations + +} // namespace amdfx +} // namespace x64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_X64_X64_AMDFX_EXTENSIONS_H_ diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 48cfa9909..c7773f08f 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -23,7 +23,7 @@ // NOTE: must be included last as it expects windows.h to already be included. 
#include "third_party/xbyak/xbyak/xbyak.h" #include "third_party/xbyak/xbyak/xbyak_util.h" - +#include "x64_amdfx_extensions.h" namespace xe { namespace cpu { class Processor; @@ -169,6 +169,8 @@ enum XmmConst { XMMF16PackLCPI5, XMMF16PackLCPI6 }; +using amdfx::xopcompare_e; +using Xbyak::Xmm; // X64Backend specific Instr->runtime_flags enum : uint32_t { INSTR_X64_FLAGS_ELIMINATED = @@ -351,6 +353,37 @@ class X64Emitter : public Xbyak::CodeGenerator { void EmitProfilerEpilogue(); + void EmitXOP(amdfx::xop_t xoperation) { + xoperation.ForeachByte([this](uint8_t b) { this->db(b); }); + } + + void vpcmov(Xmm dest, Xmm src1, Xmm src2, Xmm selector) { + auto xop_bytes = amdfx::operations::vpcmov( + dest.getIdx(), src1.getIdx(), src2.getIdx(), selector.getIdx()); + EmitXOP(xop_bytes); + } + + void vpperm(Xmm dest, Xmm src1, Xmm src2, Xmm selector) { + auto xop_bytes = amdfx::operations::vpperm( + dest.getIdx(), src1.getIdx(), src2.getIdx(), selector.getIdx()); + EmitXOP(xop_bytes); + } + +#define DEFINECOMPARE(name) \ + void name(Xmm dest, Xmm src1, Xmm src2, xopcompare_e compareop) { \ + auto xop_bytes = amdfx::operations::name(dest.getIdx(), src1.getIdx(), \ + src2.getIdx(), compareop); \ + EmitXOP(xop_bytes); \ + } + DEFINECOMPARE(vpcomb); + DEFINECOMPARE(vpcomub); + DEFINECOMPARE(vpcomw); + DEFINECOMPARE(vpcomuw); + DEFINECOMPARE(vpcomd); + DEFINECOMPARE(vpcomud); + DEFINECOMPARE(vpcomq); + DEFINECOMPARE(vpcomuq); + #undef DEFINECOMPARE protected: void* Emplace(const EmitFunctionInfo& func_info, GuestFunction* function = nullptr); diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 3d9a5f797..385b8e741 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -406,26 +406,44 @@ struct VECTOR_COMPARE_SGE_V128 static void Emit(X64Emitter& e, const EmitArgType& i) { EmitAssociativeBinaryXmmOp( e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - switch 
(i.instr->flags) { - case INT8_TYPE: - e.vpcmpeqb(e.xmm0, src1, src2); - e.vpcmpgtb(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case INT16_TYPE: - e.vpcmpeqw(e.xmm0, src1, src2); - e.vpcmpgtw(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case INT32_TYPE: - e.vpcmpeqd(e.xmm0, src1, src2); - e.vpcmpgtd(dest, src1, src2); - e.vpor(dest, e.xmm0); - break; - case FLOAT32_TYPE: - e.ChangeMxcsrMode(MXCSRMode::Vmx); - e.vcmpgeps(dest, src1, src2); - break; + if (e.IsFeatureEnabled(kX64EmitXOP)) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcomb(dest, src1, src2, xopcompare_e::GTE); + break; + case INT16_TYPE: + e.vpcomw(dest, src1, src2, xopcompare_e::GTE); + break; + case INT32_TYPE: + e.vpcomd(dest, src1, src2, xopcompare_e::GTE); + break; + case FLOAT32_TYPE: + e.ChangeMxcsrMode(MXCSRMode::Vmx); + e.vcmpgeps(dest, src1, src2); + break; + } + } else { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(e.xmm0, src1, src2); + e.vpcmpgtb(dest, src1, src2); + e.vpor(dest, e.xmm0); + break; + case INT16_TYPE: + e.vpcmpeqw(e.xmm0, src1, src2); + e.vpcmpgtw(dest, src1, src2); + e.vpor(dest, e.xmm0); + break; + case INT32_TYPE: + e.vpcmpeqd(e.xmm0, src1, src2); + e.vpcmpgtd(dest, src1, src2); + e.vpor(dest, e.xmm0); + break; + case FLOAT32_TYPE: + e.ChangeMxcsrMode(MXCSRMode::Vmx); + e.vcmpgeps(dest, src1, src2); + break; + } } }); } @@ -439,52 +457,68 @@ struct VECTOR_COMPARE_UGT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy - switch (i.instr->flags) { - case INT8_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); - break; - case INT16_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); - break; - case INT32_TYPE: - sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); - break; - case FLOAT32_TYPE: - e.ChangeMxcsrMode(MXCSRMode::Vmx); - sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); - break; - default: - assert_always(); - break; - } - if (i.src1.is_constant) { - // 
TODO(benvanik): make this constant. - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - e.vpxor(e.xmm0, sign_addr); + if (i.instr->flags != FLOAT32_TYPE && e.IsFeatureEnabled(kX64EmitXOP)) { + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcomub(i.dest, src1, src2, xopcompare_e::GT); + break; + case INT16_TYPE: + e.vpcomuw(i.dest, src1, src2, xopcompare_e::GT); + break; + case INT32_TYPE: + e.vpcomud(i.dest, src1, src2, xopcompare_e::GT); + break; + } } else { - e.vpxor(e.xmm0, i.src1, sign_addr); - } - if (i.src2.is_constant) { - // TODO(benvanik): make this constant. - e.LoadConstantXmm(e.xmm1, i.src2.constant()); - e.vpxor(e.xmm1, sign_addr); - } else { - e.vpxor(e.xmm1, i.src2, sign_addr); - } - switch (i.instr->flags) { - case INT8_TYPE: - e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); - break; - case INT16_TYPE: - e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); - break; - case INT32_TYPE: - e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); - break; - case FLOAT32_TYPE: - e.vcmpgtps(i.dest, e.xmm0, e.xmm1); - break; + Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy + switch (i.instr->flags) { + case INT8_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI8); + break; + case INT16_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI16); + break; + case INT32_TYPE: + sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); + break; + case FLOAT32_TYPE: + e.ChangeMxcsrMode(MXCSRMode::Vmx); + sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); + break; + default: + assert_always(); + break; + } + if (i.src1.is_constant) { + // TODO(benvanik): make this constant. + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + e.vpxor(e.xmm0, sign_addr); + } else { + e.vpxor(e.xmm0, i.src1, sign_addr); + } + if (i.src2.is_constant) { + // TODO(benvanik): make this constant. 
+ e.LoadConstantXmm(e.xmm1, i.src2.constant()); + e.vpxor(e.xmm1, sign_addr); + } else { + e.vpxor(e.xmm1, i.src2, sign_addr); + } + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpgtb(i.dest, e.xmm0, e.xmm1); + break; + case INT16_TYPE: + e.vpcmpgtw(i.dest, e.xmm0, e.xmm1); + break; + case INT32_TYPE: + e.vpcmpgtd(i.dest, e.xmm0, e.xmm1); + break; + case FLOAT32_TYPE: + e.vcmpgtps(i.dest, e.xmm0, e.xmm1); + break; + } } } }; diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index d5dad5cd7..8952ca771 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -50,10 +50,10 @@ DEFINE_bool(no_round_to_single, false, "Not for users, breaks games. Skip rounding double values to " "single precision and back", "CPU"); -DEFINE_bool( - inline_loadclock, false, - "Directly read cached guest clock without calling the LoadClock method (it gets repeatedly updated by calls from other threads)", - "CPU"); +DEFINE_bool(inline_loadclock, false, + "Directly read cached guest clock without calling the LoadClock " + "method (it gets repeatedly updated by calls from other threads)", + "CPU"); namespace xe { namespace cpu { namespace backend { @@ -549,7 +549,7 @@ struct MAX_F64 : Sequence> { struct MAX_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.ChangeMxcsrMode(MXCSRMode::Vmx); - //if 0 and -0, return 0! opposite of minfp + // if 0 and -0, return 0! opposite of minfp auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); e.vmaxps(e.xmm2, src1, src2); @@ -781,11 +781,15 @@ struct SELECT_V128_V128 } else if (mayblend == PermittedBlend::Ps) { e.vblendvps(i.dest, src2, src3, src1); } else { - //ideally we would have an xop path here... - // src1 ? 
src2 : src3; - e.vpandn(e.xmm3, src1, src2); - e.vpand(i.dest, src1, src3); - e.vpor(i.dest, i.dest, e.xmm3); + if (e.IsFeatureEnabled(kX64EmitXOP)) { + // vpcmov takes its first source where the selector bit is set, so src3 goes first to match the fallback below. + e.vpcmov(i.dest, src3, src2, src1); + } else { + // per bit: src1 ? src3 : src2; + e.vpandn(e.xmm3, src1, src2); + e.vpand(i.dest, src1, src3); + e.vpor(i.dest, i.dest, e.xmm3); + } } } }; diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index a5100cff6..e94b570dc 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -84,7 +84,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { iter_result |= EliminateConversions(builder); iter_result |= SimplifyAssignments(builder); iter_result |= SimplifyBasicArith(builder); - + iter_result |= SimplifyVectorOps(builder); result |= iter_result; } while (iter_result); return true; @@ -1393,6 +1393,65 @@ bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) { return result; } +static bool CouldEverProduceDenormal(hir::Instr* i) { + if (!i) { + return false; + } + Opcode denflushed_opcode = i->GetOpcodeNum(); + + if (denflushed_opcode == OPCODE_VECTOR_DENORMFLUSH) { + return false; + } else if (denflushed_opcode == OPCODE_UNPACK) { + // todo: more unpack operations likely cannot produce denormals + if (i->flags == PACK_TYPE_FLOAT16_4 || i->flags == PACK_TYPE_FLOAT16_2) { + return false; // xenos half float format does not support denormals + } + } else if (denflushed_opcode == OPCODE_VECTOR_CONVERT_I2F) { + return false; + } + return true; // todo: recurse, check values for min/max, abs, and others +} + +bool SimplificationPass::SimplifyVectorOps(hir::Instr* i, + hir::HIRBuilder* builder) { + Opcode opc = i->GetOpcodeNum(); + /* + if the input to an unconditional denormal flush is an output of an + unconditional denormal flush, it is a pointless instruction and should be + eliminated + */ 
+ if (opc == OPCODE_VECTOR_DENORMFLUSH) { + hir::Instr* denflushed_def = i->src1.value->GetDefSkipAssigns(); + + if (denflushed_def) { + if (!CouldEverProduceDenormal(denflushed_def)) { + i->opcode = &OPCODE_ASSIGN_info; + return true; + } + } + } + return false; +} +bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) { + bool result = false; + auto block = builder->first_block(); + while (block) { + auto i = block->instr_head; + while (i) { + bool looks_vectory = false; + + i->VisitValueOperands([&looks_vectory](Value* val, uint32_t idx) { + if (val->type == VEC128_TYPE) { + looks_vectory = true; + } + }); + result |= SimplifyVectorOps(i, builder); + i = i->next; + } + block = block->next; + } + return result; +} /* todo: add load-store simplification pass diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index 078187eb1..66d2a26a2 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -35,6 +35,9 @@ class SimplificationPass : public ConditionalGroupSubpass { // handles simple multiplication/addition rules bool SimplifyBasicArith(hir::HIRBuilder* builder); + +bool SimplifyVectorOps(hir::HIRBuilder* builder); + bool SimplifyVectorOps(hir::Instr* i, hir::HIRBuilder* builder); bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder); bool SimplifyAddWithSHL(hir::Instr* i, hir::HIRBuilder* builder); bool SimplifyAddToSelf(hir::Instr* i, hir::HIRBuilder* builder); diff --git a/src/xenia/cpu/function.h b/src/xenia/cpu/function.h index 57969cfda..856fdbe1c 100644 --- a/src/xenia/cpu/function.h +++ b/src/xenia/cpu/function.h @@ -31,10 +31,11 @@ struct SourceMapEntry { uint32_t hir_offset; // Block ordinal (16b) | Instr ordinal (16b) uint32_t code_offset; // Offset from emitted code start. 
}; +enum class SaveRestoreType : uint8_t { NONE, GPR, VMX, FPR }; class Function : public Symbol { public: - enum class Behavior { + enum class Behavior : uint8_t { kDefault = 0, kProlog, kEpilog, @@ -53,6 +54,20 @@ class Function : public Symbol { void set_behavior(Behavior value) { behavior_ = value; } bool is_guest() const { return behavior_ != Behavior::kBuiltin; } + void SetSaverest(SaveRestoreType type, bool is_rest, uint8_t index) { + saverest_type_ = type; + is_restore_ = is_rest; + saverest_index_ = index; + } + + bool IsSaverest() const { return saverest_type_ != SaveRestoreType::NONE; } + + SaveRestoreType SaverestType() const { return saverest_type_; } + unsigned SaverestIndex() const { return saverest_index_; } + + bool IsSave() const { return IsSaverest() && is_restore_ == 0; } + bool IsRestore() const { return IsSaverest() && is_restore_; } + bool ContainsAddress(uint32_t address) const { if (!address_ || !end_address_) { return false; @@ -71,7 +86,11 @@ class Function : public Symbol { Function(Module* module, uint32_t address); uint32_t end_address_ = 0; + Behavior behavior_ = Behavior::kDefault; + SaveRestoreType saverest_type_ = SaveRestoreType::NONE; + uint8_t is_restore_ = 0; + uint8_t saverest_index_ = 0; }; class BuiltinFunction : public Function { diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index 7ccf3f71b..6e0d13178 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -1598,6 +1598,8 @@ bool XexModule::FindSaveRest() { // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveGprLr; function->set_behavior(Function::Behavior::kProlog); function->set_status(Symbol::Status::kDeclared); + function->SetSaverest(cpu::SaveRestoreType::GPR, false, n); + address += 4; } address = gplr_start + 20 * 4; @@ -1612,6 +1614,7 @@ bool XexModule::FindSaveRest() { // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestGprLr; function->set_behavior(Function::Behavior::kEpilogReturn); 
function->set_status(Symbol::Status::kDeclared); + function->SetSaverest(cpu::SaveRestoreType::GPR, true, n); address += 4; } } @@ -1628,6 +1631,8 @@ bool XexModule::FindSaveRest() { // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveFpr; function->set_behavior(Function::Behavior::kProlog); function->set_status(Symbol::Status::kDeclared); + + function->SetSaverest(cpu::SaveRestoreType::FPR, false, n); address += 4; } address = fpr_start + (18 * 4) + (1 * 4); @@ -1642,6 +1647,7 @@ bool XexModule::FindSaveRest() { // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestFpr; function->set_behavior(Function::Behavior::kEpilog); function->set_status(Symbol::Status::kDeclared); + function->SetSaverest(cpu::SaveRestoreType::FPR, true, n); address += 4; } } @@ -1662,6 +1668,7 @@ bool XexModule::FindSaveRest() { // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx; function->set_behavior(Function::Behavior::kProlog); function->set_status(Symbol::Status::kDeclared); + function->SetSaverest(cpu::SaveRestoreType::VMX, false, n); address += 2 * 4; } address += 4; @@ -1675,6 +1682,7 @@ bool XexModule::FindSaveRest() { // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagSaveVmx; function->set_behavior(Function::Behavior::kProlog); function->set_status(Symbol::Status::kDeclared); + function->SetSaverest(cpu::SaveRestoreType::VMX, false, n); address += 2 * 4; } address = vmx_start + (18 * 2 * 4) + (1 * 4) + (64 * 2 * 4) + (1 * 4); @@ -1688,6 +1696,7 @@ bool XexModule::FindSaveRest() { // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx; function->set_behavior(Function::Behavior::kEpilog); function->set_status(Symbol::Status::kDeclared); + function->SetSaverest(cpu::SaveRestoreType::VMX, true, n); address += 2 * 4; } address += 4; @@ -1701,6 +1710,7 @@ bool XexModule::FindSaveRest() { // TODO(benvanik): set flags fn->flags |= FunctionSymbol::kFlagRestVmx; function->set_behavior(Function::Behavior::kEpilog); 
function->set_status(Symbol::Status::kDeclared); + function->SetSaverest(cpu::SaveRestoreType::VMX, true, n); address += 2 * 4; } }