diff --git a/src/xenia/cpu/backend/backend.h b/src/xenia/cpu/backend/backend.h index aa9097602..fce3410d7 100644 --- a/src/xenia/cpu/backend/backend.h +++ b/src/xenia/cpu/backend/backend.h @@ -67,6 +67,7 @@ class Backend { // up until the start of ctx may be used by the backend to store whatever data // they want virtual void InitializeBackendContext(void* ctx) {} + virtual void SetGuestRoundingMode(void* ctx, unsigned int mode){}; protected: Processor* processor_ = nullptr; diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 87ee4f76a..7d15d0e63 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -689,8 +689,7 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() { #endif } void X64Backend::InitializeBackendContext(void* ctx) { - X64BackendContext* bctx = reinterpret_cast( - reinterpret_cast(ctx) - sizeof(X64BackendContext)); + X64BackendContext* bctx = BackendContextForGuestContext(ctx); bctx->ResolveFunction_Ptr = reinterpret_cast(&ResolveFunction); bctx->mxcsr_fpu = DEFAULT_FPU_MXCSR; // idk if this is right, check on rgh what the @@ -700,6 +699,18 @@ void X64Backend::InitializeBackendContext(void* ctx) { // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png bctx->Ox1000 = 0x1000; } +const uint32_t mxcsr_table[8] = { + 0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80, +}; + +void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) { + X64BackendContext* bctx = BackendContextForGuestContext(ctx); + + uint32_t control = mode & 7; + _mm_setcsr(mxcsr_table[control]); + bctx->mxcsr_fpu = mxcsr_table[control]; + ((ppc::PPCContext*)ctx)->fpscr.bits.rn = control; +} } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index 1026202fe..a87cdc102 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -37,9 +37,10 @@ typedef void (*ResolveFunctionThunk)(); // negatively index the membase reg) struct X64BackendContext { void* ResolveFunction_Ptr; // cached pointer to resolvefunction - unsigned int mxcsr_fpu; //currently, the way we implement rounding mode affects both vmx and the fpu + unsigned int mxcsr_fpu; // currently, the way we implement rounding mode + // affects both vmx and the fpu unsigned int mxcsr_vmx; - unsigned int flags; //bit 0 = 0 if mxcsr is fpu, else it is vmx + unsigned int flags; // bit 0 = 0 if mxcsr is fpu, else it is vmx unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted // add of it by... 2 bytes lol }; @@ -48,7 +49,7 @@ constexpr unsigned int DEFAULT_VMX_MXCSR = 0x0040 | (_MM_MASK_MASK); // default rounding mode for vmx constexpr unsigned int DEFAULT_FPU_MXCSR = 0x1F80; - +extern const uint32_t mxcsr_table[8]; class X64Backend : public Backend { public: static const uint32_t kForceReturnAddress = 0x9FFF0000u; @@ -85,6 +86,12 @@ class X64Backend : public Backend { void UninstallBreakpoint(Breakpoint* breakpoint) override; virtual void InitializeBackendContext(void* ctx) override; + X64BackendContext* BackendContextForGuestContext(void* ctx) { + return reinterpret_cast( + reinterpret_cast(ctx) - sizeof(X64BackendContext)); + } + virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override; + private: static bool ExceptionCallbackThunk(Exception* ex, void* data); bool ExceptionCallback(Exception* ex); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 129ecc0d3..e481788c3 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -50,6 +50,13 @@ DEFINE_bool(resolve_rel32_guest_calls, true, "Experimental optimization, directly call already resolved " "functions via x86 rel32 call/jmp", "CPU"); + +DEFINE_bool(enable_incorrect_roundingmode_behavior, false, + "Disables the FPU/VMX MXCSR sharing workaround, potentially " + "causing incorrect rounding behavior and denormal handling in VMX " + "code. The workaround may cause reduced CPU performance but is a " + "more accurate emulation", + "x64"); namespace xe { namespace cpu { namespace backend { @@ -1374,13 +1381,13 @@ Xbyak::Label& X64Emitter::NewCachedLabel() { return *tmp; } -template +template static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) { auto flags = e.GetBackendFlagsPtr(); if (switching_to_fpu) { e.btr(flags, 0); // bit 0 set to 0 = is fpu mode } else { - e.bts(flags, 0); // bit 0 set to 1 = is vmx mode + e.bts(flags, 0); // bit 0 set to 1 = is vmx mode } Xbyak::Label& come_back = e.NewCachedLabel(); @@ -1391,20 +1398,24 @@ static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) { e.LoadFpuMxcsrDirect(); } else { e.LoadVmxMxcsrDirect(); - } + } e.jmp(come_back, X64Emitter::T_NEAR); }); if (switching_to_fpu) { e.jc(reload_bailout, X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode. } else { - e.jnc(reload_bailout, - X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode. + e.jnc( + reload_bailout, + X64Emitter::T_NEAR); // if carry flag was set, we were VMX mxcsr mode. } e.L(come_back); } bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) { + if (cvars::enable_incorrect_roundingmode_behavior) { + return false; // no MXCSR mode handling! + } if (new_mode == mxcsr_mode_) { return false; } @@ -1420,21 +1431,21 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) { ChangeMxcsrModeDynamicHelper(*this); } else { assert_unhandled_case(new_mode); - } - } else { //even if already set, we still need to update flags to reflect our mode + } + } else { // even if already set, we still need to update flags to reflect + // our mode if (new_mode == MXCSRMode::Fpu) { btr(GetBackendFlagsPtr(), 0); } else if (new_mode == MXCSRMode::Vmx) { bts(GetBackendFlagsPtr(), 0); } else { assert_unhandled_case(new_mode); - } - } + } + } } else { mxcsr_mode_ = new_mode; if (!already_set) { if (new_mode == MXCSRMode::Fpu) { - LoadFpuMxcsrDirect(); btr(GetBackendFlagsPtr(), 0); return true; diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 7bd306ad0..95f5332b6 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -23,6 +23,10 @@ DEFINE_bool( elide_e0_check, false, "Eliminate e0 check on some memory accesses, like to r13(tls) or r1(sp)", "CPU"); +DEFINE_bool(enable_rmw_context_merging, false, + "Permit merging read-modify-write HIR instr sequences together " + "into x86 instructions that use a memory operand.", + "x64"); namespace xe { namespace cpu { @@ -88,6 +92,9 @@ struct LoadModStoreContext : public LoadModStore { }; static bool GetLoadModStoreContext(const hir::Instr* loadinsn, LoadModStoreContext* out) { + if (!cvars::enable_rmw_context_merging) { + return false; + } if (!GetLoadModStore(loadinsn, out)) { return false; } diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 0ccd7d441..e99628728 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -360,10 +360,6 @@ struct CONVERT_I64_F64 struct CONVERT_F32_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Fpu); - // TODO(benvanik): saturation check? cvtt* (trunc?) - // e.vcvtsi2ss(i.dest, GetInputRegOrConstant(e, i.src1, e.xmm0)); - assert_impossible_sequence(CONVERT_F32_I32); } }; @@ -428,26 +424,7 @@ EMITTER_OPCODE_TABLE(OPCODE_TO_SINGLE, TOSINGLE_F64_F64); // ============================================================================ struct ROUND_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { -#if 1 assert_impossible_sequence(ROUND_F32); -#else - // likely dead code - e.ChangeMxcsrMode(MXCSRMode::Fpu); - switch (i.instr->flags) { - case ROUND_TO_ZERO: - e.vroundss(i.dest, i.src1, 0b00000011); - break; - case ROUND_TO_NEAREST: - e.vroundss(i.dest, i.src1, 0b00000000); - break; - case ROUND_TO_MINUS_INFINITY: - e.vroundss(i.dest, i.src1, 0b00000001); - break; - case ROUND_TO_POSITIVE_INFINITY: - e.vroundss(i.dest, i.src1, 0b00000010); - break; - } -#endif } }; struct ROUND_F64 : Sequence> { @@ -547,11 +524,7 @@ EMITTER_OPCODE_TABLE(OPCODE_CONTEXT_BARRIER, CONTEXT_BARRIER); // ============================================================================ struct MAX_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Fpu); - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmaxss(dest, src1, src2); - }); + assert_impossible_sequence(MAX_F32); } }; struct MAX_F64 : Sequence> { @@ -594,56 +567,22 @@ struct MIN_I8 : Sequence> { }; struct MIN_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryOp( - e, i, - [](X64Emitter& e, const Reg16& dest_src, const Reg16& src) { - e.cmp(dest_src, src); - e.cmovg(dest_src.cvt32(), src.cvt32()); - }, - [](X64Emitter& e, const Reg16& dest_src, int32_t constant) { - e.mov(e.ax, constant); - e.cmp(dest_src, e.ax); - e.cmovg(dest_src.cvt32(), e.eax); - }); + assert_impossible_sequence(MIN_I16); } }; struct MIN_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryOp( - e, i, - [](X64Emitter& e, const Reg32& dest_src, const Reg32& src) { - e.cmp(dest_src, src); - e.cmovg(dest_src, src); - }, - [](X64Emitter& e, const Reg32& dest_src, int32_t constant) { - e.mov(e.eax, constant); - e.cmp(dest_src, e.eax); - e.cmovg(dest_src, e.eax); - }); + assert_impossible_sequence(MIN_I32); } }; struct MIN_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryOp( - e, i, - [](X64Emitter& e, const Reg64& dest_src, const Reg64& src) { - e.cmp(dest_src, src); - e.cmovg(dest_src, src); - }, - [](X64Emitter& e, const Reg64& dest_src, int64_t constant) { - e.mov(e.rax, constant); - e.cmp(dest_src, e.rax); - e.cmovg(dest_src, e.rax); - }); + assert_impossible_sequence(MIN_I64); } }; struct MIN_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Fpu); - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vminss(dest, src1, src2); - }); + assert_impossible_sequence(MIN_F32); } }; struct MIN_F64 : Sequence> { @@ -736,26 +675,7 @@ struct SELECT_I64 struct SELECT_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Fpu); - // TODO(benvanik): find a shorter sequence. - // dest = src1 != 0 ? src2 : src3 - e.movzx(e.eax, i.src1); - e.vmovd(e.xmm1, e.eax); - e.vxorps(e.xmm0, e.xmm0); - e.vpcmpeqd(e.xmm0, e.xmm1); - - Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; - if (i.src2.is_constant) { - e.LoadConstantXmm(src2, i.src2.constant()); - } - e.vpandn(e.xmm1, e.xmm0, src2); - - Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - e.vpand(i.dest, e.xmm0, src3); - e.vpor(i.dest, e.xmm1); + assert_impossible_sequence(SELECT_F32); } }; struct SELECT_F64 @@ -785,30 +705,7 @@ struct SELECT_F64 struct SELECT_V128_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Vmx); - // TODO(benvanik): find a shorter sequence. - // dest = src1 != 0 ? src2 : src3 - /* - chrispy: this is dead code, this sequence is never emitted - */ - e.movzx(e.eax, i.src1); - e.vmovd(e.xmm1, e.eax); - e.vpbroadcastd(e.xmm1, e.xmm1); - e.vxorps(e.xmm0, e.xmm0); - e.vpcmpeqd(e.xmm0, e.xmm1); - - Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; - if (i.src2.is_constant) { - e.LoadConstantXmm(src2, i.src2.constant()); - } - e.vpandn(e.xmm1, e.xmm0, src2); - - Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - e.vpand(i.dest, e.xmm0, src3); - e.vpor(i.dest, e.xmm1); + assert_impossible_sequence(SELECT_V128_I8); } }; @@ -1012,9 +909,7 @@ EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32, // ============================================================================ struct IS_NAN_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Fpu); - e.vucomiss(i.src1, i.src1); - e.setp(i.dest); + assert_impossible_sequence(IS_NAN_F32); } }; @@ -1418,49 +1313,25 @@ struct ADD_I64 : Sequence> { }; struct ADD_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { -#if 1 - assert_impossible_sequence(ADD_F32); -#else - e.ChangeMxcsrMode(MXCSRMode::Fpu); - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vaddss(dest, src1, src2); - }); -#endif } }; struct ADD_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.ChangeMxcsrMode(MXCSRMode::Fpu); -#if 0 - EmitCommutativeBinaryXmmOp( - e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vaddsd(dest, src1, src2); - }); -#else + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); e.vaddsd(i.dest, src1, src2); - -#endif } }; struct ADD_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.ChangeMxcsrMode(MXCSRMode::Vmx); -#if 0 - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vaddps(dest, src1, src2); - }); -#else + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); e.vaddps(i.dest, src1, src2); - -#endif } }; EMITTER_OPCODE_TABLE(OPCODE_ADD, ADD_I8, ADD_I16, ADD_I32, ADD_I64, ADD_F32, @@ -1560,16 +1431,7 @@ struct SUB_I64 : Sequence> { }; struct SUB_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - #if 1 assert_impossible_sequence(SUB_F32); - #else - assert_true(!i.instr->flags); - e.ChangeMxcsrMode(MXCSRMode::Fpu); - EmitAssociativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vsubss(dest, src1, src2); - }); - #endif } }; struct SUB_F64 : Sequence> { @@ -1579,7 +1441,6 @@ struct SUB_F64 : Sequence> { Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); e.vsubsd(i.dest, src1, src2); - } }; struct SUB_V128 : Sequence> { @@ -1601,112 +1462,12 @@ EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, // We exploit mulx here to avoid creating too much register pressure. struct MUL_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - #if 1 assert_impossible_sequence(MUL_I8); - #else - if (i.src1.is_constant || i.src2.is_constant) { - uint64_t cval = - i.src1.is_constant ? i.src1.constant() : i.src2.constant(); - - if (cval < (1ull << 32)) { - auto& whichevs = i.src1.is_constant ? i.src2 : i.src1; - - e.imul(i.dest, whichevs, (int)cval); - return; - } - } - - if (e.IsFeatureEnabled(kX64EmitBMI2)) { - // mulx: $1:$2 = EDX * $3 - - // TODO(benvanik): place src2 in edx? - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - e.movzx(e.edx, i.src2); - e.mov(e.eax, static_cast(i.src1.constant())); - e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); - } else if (i.src2.is_constant) { - e.movzx(e.edx, i.src1); - e.mov(e.eax, static_cast(i.src2.constant())); - e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); - } else { - e.movzx(e.edx, i.src2); - e.mulx(e.edx, i.dest.reg().cvt32(), i.src1.reg().cvt32()); - } - } else { - // x86 mul instruction - // AH:AL = AL * $1; - - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - e.mov(e.al, i.src1.constant()); - e.mul(i.src2); - e.mov(i.dest, e.al); - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - e.mov(e.al, i.src2.constant()); - e.mul(i.src1); - e.mov(i.dest, e.al); - } else { - e.movzx(e.al, i.src1); - e.mul(i.src2); - e.mov(i.dest, e.al); - } - } - #endif } }; struct MUL_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant || i.src2.is_constant) { - uint64_t cval = - i.src1.is_constant ? i.src1.constant() : i.src2.constant(); - - if (cval < (1ull << 32)) { - auto& whichevs = i.src1.is_constant ? i.src2 : i.src1; - - e.imul(i.dest, whichevs, (int)cval); - return; - } - } - - if (e.IsFeatureEnabled(kX64EmitBMI2)) { - // mulx: $1:$2 = EDX * $3 - - // TODO(benvanik): place src2 in edx? - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - e.movzx(e.edx, i.src2); - e.mov(e.ax, static_cast(i.src1.constant())); - e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); - } else if (i.src2.is_constant) { - e.movzx(e.edx, i.src1); - e.mov(e.ax, static_cast(i.src2.constant())); - e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); - } else { - e.movzx(e.edx, i.src2); - e.mulx(e.edx, i.dest.reg().cvt32(), i.src1.reg().cvt32()); - } - } else { - // x86 mul instruction - // DX:AX = AX * $1; - - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); - e.mov(e.ax, i.src1.constant()); - e.mul(i.src2); - e.movzx(i.dest, e.ax); - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - e.mov(e.ax, i.src2.constant()); - e.mul(i.src1); - e.movzx(i.dest, e.ax); - } else { - e.movzx(e.ax, i.src1); - e.mul(i.src2); - e.movzx(i.dest, e.ax); - } - } + assert_impossible_sequence(MUL_I8); } }; struct MUL_I32 : Sequence> { @@ -1719,18 +1480,6 @@ struct MUL_I32 : Sequence> { } } - if (i.src1.is_constant || i.src2.is_constant) { - uint64_t cval = - i.src1.is_constant ? i.src1.constant() : i.src2.constant(); - - if (cval < (1ull << 32)) { - auto& whichevs = i.src1.is_constant ? i.src2 : i.src1; - - e.imul(i.dest, whichevs, (int)cval); - return; - } - } - if (e.IsFeatureEnabled(kX64EmitBMI2)) { // mulx: $1:$2 = EDX * $3 @@ -1782,18 +1531,6 @@ struct MUL_I64 : Sequence> { } } - if (i.src1.is_constant || i.src2.is_constant) { - uint64_t cval = - i.src1.is_constant ? i.src1.constant() : i.src2.constant(); - - if (cval < (1ull << 32)) { - auto& whichevs = i.src1.is_constant ? i.src2 : i.src1; - - e.imul(i.dest, whichevs, (int)cval); - return; - } - } - if (e.IsFeatureEnabled(kX64EmitBMI2)) { // mulx: $1:$2 = RDX * $3 @@ -1835,19 +1572,7 @@ struct MUL_I64 : Sequence> { }; struct MUL_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - #if 1 - - assert_impossible_sequence(MUL_F32); - - #else - assert_true(!i.instr->flags); - - e.ChangeMxcsrMode(MXCSRMode::Fpu); - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmulss(dest, src1, src2); - }); - #endif + assert_impossible_sequence(MUL_F32); } }; struct MUL_F64 : Sequence> { @@ -1857,8 +1582,7 @@ struct MUL_F64 : Sequence> { Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); - e.vmulsd(i.dest, src1, src2); - + e.vmulsd(i.dest, src1, src2); } }; struct MUL_V128 : Sequence> { @@ -1890,49 +1614,7 @@ struct MUL_HI_I16 struct MUL_HI_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - if (e.IsFeatureEnabled(kX64EmitBMI2)) { - // TODO(benvanik): place src1 in eax? still need to sign extend - e.mov(e.edx, i.src1); - if (i.src2.is_constant) { - e.mov(e.eax, i.src2.constant()); - e.mulx(i.dest, e.edx, e.eax); - } else { - e.mulx(i.dest, e.edx, i.src2); - } - } else { - // x86 mul instruction - // EDX:EAX = EAX * $1; - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); // can't multiply 2 constants - e.mov(e.eax, i.src1.constant()); - e.mul(i.src2); - e.mov(i.dest, e.edx); - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); // can't multiply 2 constants - e.mov(e.eax, i.src2.constant()); - e.mul(i.src1); - e.mov(i.dest, e.edx); - } else { - e.mov(e.eax, i.src1); - e.mul(i.src2); - e.mov(i.dest, e.edx); - } - } - } else { - if (i.src1.is_constant) { - e.mov(e.eax, i.src1.constant()); - } else { - e.mov(e.eax, i.src1); - } - if (i.src2.is_constant) { - e.mov(e.edx, i.src2.constant()); - e.imul(e.edx); - } else { - e.imul(i.src2); - } - e.mov(i.dest, e.edx); - } + assert_impossible_sequence(MUL_HI_I32); } }; struct MUL_HI_I64 @@ -2005,23 +1687,38 @@ struct DIV_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { Xbyak::Label skip; e.inLocalLabel(); - + e.xor_(e.eax, + e.eax); // need to make sure that we're zeroed if its divide by zero if (i.src2.is_constant) { assert_true(!i.src1.is_constant); - e.mov(e.ecx, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.mov(e.ecx, i.src2.constant()); e.mov(e.eax, i.src1); // Zero upper bits. e.xor_(e.edx, e.edx); e.div(e.ecx); } else { + e.mov(e.ecx, i.src2.constant()); + if (i.src2.constant() == -1) { // we might have signed overflow, so + // check src1 for 0x80000000 at runtime + e.cmp(i.src1, 1); + + e.jo(skip, CodeGenerator::T_SHORT); + } e.mov(e.eax, i.src1); + e.cdq(); // edx:eax = sign-extend eax e.idiv(e.ecx); } + } else { // Skip if src2 is zero. e.test(i.src2, i.src2); + // branches are assumed not taken, so a newly executed divide instruction + // that divides by 0 will probably end up speculatively executing the + // divide instruction :/ hopefully no games rely on divide by zero + // behavior e.jz(skip, CodeGenerator::T_SHORT); if (i.instr->flags & ARITHMETIC_UNSIGNED) { @@ -2034,11 +1731,31 @@ struct DIV_I32 : Sequence> { e.xor_(e.edx, e.edx); e.div(i.src2); } else { + // check for signed overflow + if (i.src1.is_constant) { + if (i.src1.constant() != (1 << 31)) { + // we're good, overflow is impossible + } else { + e.cmp(i.src2, -1); // otherwise, if src2 is -1 then we have + // overflow + e.jz(skip, CodeGenerator::T_SHORT); + } + } else { + e.xor_(e.ecx, e.ecx); + e.cmp(i.src1, 1); //== 0x80000000 + e.seto(e.cl); + e.cmp(i.src2, -1); + e.setz(e.ch); + e.cmp(e.ecx, 0x0101); + e.jz(skip, CodeGenerator::T_SHORT); + } + if (i.src1.is_constant) { e.mov(e.eax, i.src1.constant()); } else { e.mov(e.eax, i.src1); } + e.cdq(); // edx:eax = sign-extend eax e.idiv(i.src2); } @@ -2053,16 +1770,26 @@ struct DIV_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { Xbyak::Label skip; e.inLocalLabel(); - + e.xor_(e.eax, + e.eax); // need to make sure that we're zeroed if its divide by zero if (i.src2.is_constant) { assert_true(!i.src1.is_constant); - e.mov(e.rcx, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.mov(e.rcx, i.src2.constant()); e.mov(e.rax, i.src1); // Zero upper bits. - e.xor_(e.rdx, e.rdx); + e.xor_(e.edx, e.edx); e.div(e.rcx); } else { + if (i.src2.constant() == + -1LL) { // we might have signed overflow, so + // check src1 for 0x80000000 at runtime + e.cmp(i.src1, 1); + + e.jo(skip, CodeGenerator::T_SHORT); + } + e.mov(e.rcx, i.src2.constant()); e.mov(e.rax, i.src1); e.cqo(); // rdx:rax = sign-extend rax e.idiv(e.rcx); @@ -2079,9 +1806,28 @@ struct DIV_I64 : Sequence> { e.mov(e.rax, i.src1); } // Zero upper bits. - e.xor_(e.rdx, e.rdx); + e.xor_(e.edx, e.edx); e.div(i.src2); } else { + // check for signed overflow + if (i.src1.is_constant) { + if (i.src1.constant() != (1 << 31)) { + // we're good, overflow is impossible + } else { + e.cmp(i.src2, -1); // otherwise, if src2 is -1 then we have + // overflow + e.jz(skip, CodeGenerator::T_SHORT); + } + } else { + e.xor_(e.ecx, e.ecx); + e.cmp(i.src1, 1); //== 0x80000000 + e.seto(e.cl); + e.cmp(i.src2, -1); + e.setz(e.ch); + e.cmp(e.ecx, 0x0101); + e.jz(skip, CodeGenerator::T_SHORT); + } + if (i.src1.is_constant) { e.mov(e.rax, i.src1.constant()); } else { @@ -2099,28 +1845,17 @@ struct DIV_I64 : Sequence> { }; struct DIV_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - #if 1 - assert_impossible_sequence(DIV_F32) - #else - assert_true(!i.instr->flags); - e.ChangeMxcsrMode(MXCSRMode::Fpu); - EmitAssociativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vdivss(dest, src1, src2); - }); - #endif + assert_impossible_sequence(DIV_F32); } }; struct DIV_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(!i.instr->flags); e.ChangeMxcsrMode(MXCSRMode::Fpu); - Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); e.vdivsd(i.dest, src1, src2); - } }; struct DIV_V128 : Sequence> { @@ -3104,12 +2839,15 @@ struct SHL_I64 : Sequence> { struct SHL_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): native version (with shift magic). + + auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); + if (i.src2.is_constant) { e.mov(e.GetNativeParam(1), i.src2.constant()); } else { e.mov(e.GetNativeParam(1), i.src2); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); e.CallNativeSafe(reinterpret_cast(EmulateShlV128)); e.vmovaps(i.dest, e.xmm0); } @@ -3180,51 +2918,15 @@ struct SHR_I64 : Sequence> { }; struct SHR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - /* - godbolt link: - https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGIMwAcpK4AMngMmAByPgBGmMQgAGwaAJykAA6oCoRODB7evv5BmdmOAmER0SxxCclpdpgOuUIETMQE%2BT5%2BgbaY9mUMLW0EFVGx8Umptq3tnYU9CjMj4WPVE3UAlLaoXsTI7BzmAMzhyN5YANQmR26qAYnhBMThAHQI19gmGgCCx6fnmCuNxihA%2BX1%2BZhODDOXku1zci3wgjeYJ%2B4L%2BVAuGnRkLwmKwNAi6AgAH0SQBxSJyNxkjbgi4XAgAT3SmAJDI5nIutAEwG5vO5tGuVh%2BDOZrPZXgY2WARP5RnlfK8tCFRxF3wZxwJKwuZMeiUkisV9KukO1EV1JMeRzMF0eJq1mEJgL1gi4iQuCgQJAIDrNTp1roIAQZyAQbT9R3NgIAst8ANLYEIhCAMHwbC5plimo7HC7JyPRi4AMRjABUSQbTWYVeYzDijn08Rdo8SSTGhDSAGrYABKdNFjJZbKdXK5QartbVJvFI8xUplconhuVqvVmv9zouccTydT6czPhzebwBsLAYtpYrVbrAEkz2Z62jIU38Re2RdSSSLAB5Xshb5IgAERpEkBw1IcJVHMcOWXQVhRnYdJWlPBZQ/ODVwQwdHS3HckxTLMMyzY9ITtM9sM3HUr0rQ06xCOsGz6JRI3iYgSGrKUAGsGFQAB3BgLjQFh0joeIGOfRsGHwKhwVnZDFw/R4Li8e1px%2BOTRwXVC5TDNplN04gsO%2BDT5xQtD0E9b12mUr0fSMkzlLMuUeQVZVeSM2SkOgmDBPDYgOUeAJ7K8zEGQUiyDI5bJBCCtTjJCxzwt8vSGRUmLgqg0KfNs6y7TdRIMrnKLtI/HKCDCx53UK%2BSSossrUsqgq4ocny8vKgLBBtarvKSpTis6%2BtmoSrTzLazk0oILqhsywVWq5fVJG6zEVTmzlooIM9pqK1dVoawRNvVcEAHojouZRhjwMRaCZFt3ws2cFBeC4ywQTAbraQEvCUCzeNegSCFe26hJE%2Bh/PQVBMAUTNUHK7i%2BOO07DCZAHwj5JgYh2cqAcBWcLkwVR9nScrCCh7IAC9MBeBsi2/ABNMtsD24NqffXUAHU/yApmqokmmgI53suYmrredZkkAEUBaFhaG2bMAwFbUkO27PtwJwwMQh/SJyU17XLUqwJGKkvF0R%2BE6LkiAQAFpFkMdA2gsjHPEwQxIMhp6Xrei4Ppsj9fsYRlAawYHRP80QGB48qvswBHA8BW2pId6snaFR83YuOJRGji5UExbHPTwCmLhYPBFhYJgCDDDOvCxwGSmyGJ6FjgA3MQvEh73iEBARrqxb2pIuLgnqETBATEBRUAuMAOF/H8Qmn9O4h5XiqfUhLAt1WeQi4Ja2vdTefznwb1Qc61bW/Q%2BQkWrb2QWg%2B59iw6JLxKTRxJNnb2An82aEElPJmjeFh6afBvqORqFwpa7zPhcfmnMoEDXzFrck8Dypb2FDBc2Xh0isj2EwJQFwt52ihl9LwV0bqGhiMjSGRtpL/yKnfSWcC4oYlfpiMkyB0jeAUJwr6dDb6CAzqgTw6Cxzm14oCXihgsaT2zinPKOddgXDcBcdIbFgDEFYAoGhJszanSEKgNggkBDN0YHgRg%2Bxi5MGQGxKGRBLGcUBOkC6YhvbIH2AoJQUMGB4H2IZUWW4AJCArJ/ICEBVAZGGCSWcGYOQQHJpgXOYSNhHXiYkpx7QonDgzFbQeatcRvmdG2OmDMSSc1VqaAqZgPRkiASUspvYgRAWuFzGpt5yQkmwMBW8gEGwMiLJrNmJIQlhIiRk6JHJAnBOAiM9JBBMmsjyUcPprMAASbSVlDOmeE2Z8zMAxOxBJJiMcJLLK3Gs8kGzhnbMieM/M3wgmbNCdcsZWTem3QCd/R5MyblZI5AciEklaH%2BJ1LU7ADARmZhiZ%2BAAVFAYp2BoV0iqUk6wDANiLKLFLcF4TIWxNhaSKWiLzCJBRZYNFGLWawMFti0guKYVwqpUBIlyLVBIosOS02AL%2Bk/lBUkhkoKaUDK%2BeE%2BF6KWYfKlnyiBnNBWfKuaQd%2BnMxXAotJrRlfLGWysGfKkkjLlVctWbeXlrL%2BXAJpecy5WyFWgv1erC0azJUmuldSkZFrhUKqlrayi9rbzqpNZq116z3W6s9RSrcoKuBSoIWaiFuTWrm0oQQQEXBPxoClI4BUVA2LZg0GGkFwCzBRoFbGsweaLSgqOEWmNOKLhHDLYCUFkgq0MxpQySQ9bo0MwAKzNrBbGrtHbQUkqdZ2vtNbEiDuAQAdl7a2i4U7J0MwCLO2NARF3YBSCumtKR11cA0FK4tOK927sjU6w9tKuBcF3YWs91aL2lvFfmhmXBK23pbRCl9u6m1vrHRe9tj7y3AK4D2n9rbgMdqlqeqFWLY1XoA4CKWN7oMypLVC0Rp0UbEB%2BiQCyuc445xiNoRoBBaUjSJPB51QFX3IZdTWutFGpbfpo0BOd/6VUIc5iB5jc6B0Mc5sO7jsaJ18cFjOkdMGa0Ls5ebHivEC6jXLtYrIn584KFYICGINcLi8UIAgeTAl8ZJpQgIDtQhz10vpRAQKzKBOoq9VGVmQgJO0rRXiqAjUbOkvZfZosQgA04tc5Zs%2BnnWV2bVuxi4QhNbGpiWZu9Qr5WBR845gZnMpVOZQ%2BEhLVrGrJa3FFn8fqMx%2Bec9lp55ABp5Z1EINZMWGRxffeEt1iWYpVYtDV28jrYvOeazl/KbXAQdaK5F/zpBevlbPgNyLEao0Nd/QyODEW5tIY5HNudD6lsVtm%2BZ2tpnG3bbvW2vbwCuOrZ27xzbwCBNncOxcYTl2GZiahWt2NUmHvYGXSOl7Na10Ubm5ur7O2d1/Yjfup132L25pB0BqD9XzOXuO8%2Blb03DtcA2wa/LEbqNw9R/R97Uh0vw7Yxj6rEbTso8axei7JP2uQdm85hbpnEP08y7Si46O7WDaltj%2BrDPdt/cYyz2jbPiec8i1Lcn4vWcMmp2LjLgtru8%2Bl3dpnnMnurb52934uiLjkkYPECuY8VFMDwP5PDqAcF20epFz0rQpJQ34P5ae4Vp4UbJEIZQ3xby9ndGSCACBUIIFpcvGJUArP9YZP7wPGZ4TwgZGuq4U7lEQAmgniAIeO3u8997m0fuA/ACD/yXiof3OVcj/nhAMebhx/dDHpPn4Jq1/T3xKbWeve9gNHnwPweW%2BR9Lxtdt5fo9AjcHHm0dfk/C1Lc34vmeSQe/b2jgIXeC89%2BL5%2BfvS%2BMxR4L1X0fNw7uD5MPXlPC0Ngz9bySbPPvEgr8LyH2JUBG8Ts/BXvfceLgJ%2BP5PpLn4M9u6v3b1zxJB33v17z71PzL1APfwP1r0Tx/36wvzn2v07xAIrzXyhTDwmgNG3zfxHzH1LXgIb0myQIAOvyXzvwwIgMb0CHPzwNjwPxwKIMgIH3P3/zRB%2BC%2BlRmN2QAcXQCiXxhJDQBwwUCiUaUSlqjag8h%2BEWGIC8AcAuBMWQDMDEOP3XA5CoB5ArguBxSZA8inSaWYWfkxB3h%2BHBi8EbkBGDgMXSC/Dvi7gUGVBIxbB2EsO9l%2BzRCnXUKUmbmPguHNmIBSBNCDH3mblzDVH8NOmIEvRNB8OvgsEiIuGICCkHB8K7XQQCL3WCKtHykUKagSMyNMIgnMLcJiCsU4iEPDHCAyNOhMC7QsG4WsA0HeC7S5jqIsCtj3RaKaUHBKPoEUMfkSPaMaMsDMGaLqLaPqOsC6ImM5QZGbhDGaXcKMgZGbAgHcMaSWI0BeA0AuHAk1C8JNHmNtC2JWMTx6IgiOQdEOMHHmKWSWIdTSyYF%2Bzik5DWM/EeMFggGeJjyqSxFUCnWLGLFzU2KOC5n%2BTHGJWJQ32blojBIuDWXVR%2BNpWbi7XELVUlWRI%2BN9UxK/z%2BI0FUCBKJIzHli2In2/0QSRLXQzH2I5DUKOI5F8PEM6I0DMB3lePmkxHWIgBmx%2BIqX%2BOPVBPBL2IZIOPULHHBlFLpJuIgh8lhIuGhSWOPilM5ERMlQWKry5lhLOJ8neNRJHz7lpJ8npNuNanlO/yWK4C8B1NajVLSw1PEO1I5ONIMJVMZPuPhM%2BNCQ1JtLHH1MVPhOVNNLHCtitl8N9OlIuJ8l%2BlEk/E/BmwdOhIJMFOaS2JFOdK5AxPtK/3hNRIjOPyjK5Gbg9CWLCP5IJKBOwGLAjK9IgETNzPyKlPeOeINO2N2KNK5FrPrK1JLPrwJICA0EHIjLKN4MqJNwElLMfilNrJHIqN0nCE1IRM62zN%2BI9H7MHOLCIIJKOGLGwGxAzILIZKuNNJNNlI5FnLHPCHEOeJrOXK%2BIvPnNcweLvNCT5KTLuA3K3NUB3L3IjKZKWKfycnQhyIICb1rLfPxIBKBJBLBCOEZkHxyT3UfBtMPNNJbKWIfKqIYDONQoglhRDU5gVI2AcKcMdKDIgi7gIF2AEhvOYVdMOWNhkh%2BD6kcJiBJACDMHqMSBSCOH3RCI9GhSYC4FpSUiYDMBEoESYCOAkvKiYGkDGiYC7Rku9kSGUqYC11PNWIEQWJqKSKyNSIERItoF9AiICNzAMvKmbiyNMqiPMogh8JiJsqSKOCKK0ssrR10uIF4tiO0pfU8rMCCIssUKkH8pSO1wggWOvJ3nqPrFaLOPeO%2BJOIP0TJPykoNKEvaNzFaIn0/DkvSuiosEWmyuYMUqBEZgyvqPSOKopO%2BJLLgu9gKoKmqtSqnTKoavaKnQmJpOuPFLtC5O9iSuUUisQpFODIuGABhkngAgsCTBJACptKst2KWIqosCysMN6ubI9KGr3QuBGo7IZAmrsWmtmvmozKspTWWoKqKvWoZObLhO2pTT2qlMOqmu%2BBmpCDmqCLOrRyivaKqputNObLRI%2BGCpOMsEHlGrPIOsmruXes%2BoWpfV%2Bq4tmIzObLqsZisvuPBpTX2vGphuOo%2BtOt6qsrhK5hWs6ritRv6vUrapJtNASJxuevxrepOq%2Bo2upqSoxpDFxrEsdKnMBupo9Ixp2p5o9LJoKrWptLutpsvUhrHDysuvaOuqlupuBuSp%2Bp5rVvFr%2BpRvZrjKYDqu2qWR5rqu1uRspr1vWJprjzpp5tasVvqIpoBrPIStKpyTEozGhVyo9HdrrQVNytavdoQs/DppyXOs9pDsRrDsFK9rrJ%2BrDpzMZltA7NPIJs%2BpAClKMvKlOIPPorPOPLPM0q5BWtiraPhLEojKzvEPZN6oZBWrWuvPCNrvavqOuuvPiKlJWv%2BuvPSIzLrsaomOvMbLGvJsHuWs6tzsLNwoZALrFIZKrstJwrzoZEouoqSMhicP0IuPBDkQUA0R0xTyAoskwBiBNSLs1E4KPsUPEJPtUH5KgAuur1gOyVySMjkVPsdKnjABzvVDIw/GiK4AsiWNvrKpBuTJkjilXuIAEg/uhIAfQC3uk0VMvqShcj5DQaSNoB0wYBYEPtQYFAwfCHSBrlIyvrYmcL/osnLgUE4jr16soaNAwbOBdi7iAfhMAt6kEA2CIZrjpCMLPOYY%2BiAaTyWMEdYYYLcCmifnCs5CgYEjJE0zoAzRJHIcN3oCoAHwgDEbZFpXIaRUSGoc4kQY4O%2BAYaUiTUWEjQYYwcIYYGIYIDofBDkcwewdwZ4ecOTKYAvT4cOjzuhSOk5TMYEXBkICujwckKUhxQMi9teiZE03QHQCZEcdNLkTDEaE4kIHEKoHHhjiprCQNJ3UT2btDFel4MyaWKeHbgjIZBOmcaJoZOno5GhVificSfELSbKZMukbeKoC0dKYyYcaKbdIUZUmMvCBJClC7isXDEsIgB8YaeXo5Gcd4rZvRDzttkcGQAuE0NQG0PDFoCoDMB2YrnCdGh0OSbPOcZJBYBYCEObgIA8RJCoBtFJBubuYIHSAQBJHSAUFedueQHueyFz2yCXw4Yia4dZTFToouI2Z8QkNGnqmOYIDMH2d6aRe2YufnswCougdtwrh8SELwRMrcAYbKg%2BD%2BfeZBcCBJCBcLWuf%2BfuZ%2Be%2BYQApaUDmQ8WoC2F1BjBjBJGLF7B/DkGAhJEiB/A6QAA1lEKT2xeX%2BXBXhWywxWAAtPsH8dFeZzwi49F1QRgAQEkVFo5rQggU5uUc5oZkx8aK%2BhQWUcQu%2B98gcoc/hy1pKfGdIcQvlaEiAFy/A2AqHKmz8eWV1i5nyZxxEEAEAEEOZUQRYeEJF8l61icp%2B5RWvFCvO26gRZAYSQxm%2B1QN11M9hlyiRlMnc3alNEALEG0k6V13agCzN9IbNz19k/fNwY9DVs8hh9etAaURwKipNNhrmO%2BnJLRrNvBTie%2Bpt%2BEVt6Qs89FlgJwvAThMxfyJY2F5AQlmNm4ONuCqAOthtv4wgmqiAAAP13dHdXNyTbbmI0KNcEnuaiWEnEINaRYgC7i7cWEIBrjZDbdkexbXoBbmQIAfa9rnbGcXbEn4bWe3pfEYs5QxArZMfpDJBYCXx0N1YYH1bEEOefaQ6XzwDNfPv8OQaho5DfbtweYZP8YZJw8CDw9RZJFdYEEYDmQbfhPpZpexa4Ho5EndC9a4DkB9eTb9d6uo4CDw8XFEJY5aTebZc48wG45JXXMHLkG/dgmudw7xlzcY8EBJGDi7jYG08k91Gk44647wB48fubcHiE6o7U5o%2B2aNfo804iG09OD06Y%2BudHfELY5k9M5eb44E/Z3oyddU%2BQ7s4pjYi7j5CWO8%2BxfC9QBpbM8CDmZtJE7w9dahmi7ebtgS9BZxTo4Y%2Bc%2BY7PZS9s9E9uh4mIHLloAk888y/%2BeEkwAAEdTOeP0vaU4vIvL2LWQv1OsAKuquauaGSQbQvO3mpQnFeCeQWuSU%2BuSABuPOaHaVZvKvx4FvaGajmwKyiTaVM4DFAQtuiSiSnoIgRwLJHcM56ApJm5uRDc/EZGOQq3c2hTa0LOa8ISeu7Ou5GuVIHDHO5kNEjBMA1vRv6vWRmu5OzOSU2uLh8unP3PDGVOGRUuNP0gdPGg3ODPR2Pw6uSRsuqXl0vufvIY/uVHDBgAgfDHaUCv3PdPMB9OiuaHEfLRQuyudMAZHPMhCvcf0BtAvo%2B2QeaWLDpuIVXW0frE6f4esf0AmeROjEGATF2g2QSQ2fvQa4SeAfyfufefFgBf/3PnvnfmVeOetO5kmAee%2Bev3p3ORkfxOBe8fEvPtaVxO1uuuzybftIFARucfxvyipuIeeOOvMBgAnePefHIjy4HELhtfypxOr2kfSu8PCfTdifDNSfAevfWOxu7Hff4v/eSVA/g/17vvk/RDU%2BNeKfiuMzkey%2ByegekWcjxf6e7epIcukuk/fua/0/xL7Pdm5lqeXPoQMeGf1vgv4%2BWe8Ok1iBnhLCVHMBHh4hVFsWhDjF4h%2Becfo2PnRD8etGV/Ff%2BCVfMZ1fa%2Bte%2Bew%2BTpvRe4vRR5iZBA%2BhaBS5GB8E8Abm2RLok1L%2BvBgByfFgPwUYoZtC2OcfZnup0n7T96ATzXPP3zmQ0BVAWCO3gk186FpQBeAGfl3Hn7EBF%2BUbXfn2yp6qB/ux/evq5wl5cNp2NncfpYiwBo9%2BuW/WqCSDi4dwcelhK7qZ2XTICZ%2BzzFQlAKeZ4BYB6QEPrKE97d9lu83QxsNwfSj916OLeRm8w3748DeEAcuJQOEE0DzIdA%2BIBDAUCu9YOedI6ER05BEBwY5bZ/qolQAmJA4pccEJR1FDI8dWPEdDkixRaYcvwLPDFua3u5j91O9fdGINzHY48fOEPF5gSUBKHcgSXgV3tbwT76ZRBPzAXv4JEiBCPyg5JIdZ1NLI9TB8QHZrxDT7k9YhJnAIYWgJKSBghe5QcmEJK7kCpQWAYgJkOyGAg/BeQ%2BIQUO/IBAHWSQsoVX0iHpDqhy8Wfo4WMq5DQKwvIIUSXaHCdIhi4UQfvHqGDC8%2BEARTikLd6lc8WGSQzK4CV7l8BhnHH5hAESBdou0GgSQDsSoAy9IhovWnvTwz5SdbmcQvAH5344SNAuJw8gaLyIH08phmfa4Q0Mh4QB/ODw49Fbw%2B5lcC%2BAwuLq32XRPD1OkbSOJF114NdweIkZdAX3a7qDOu5QlDlQFx4xBvoTzEgEIQfY49su2wqgLSjY6yCHe8grwViJd7hDAReHDEd4KV7z91ELidfkSyZbb96RWIpXo7jxHpAmeugt0lYiopkJaUaaWgBZGzglwGAR0cuHfXCA/8zeOcTEHW36F2I6wbAYgOTzrB5QJmr8boXxFqFHRxmXQmoeXwo4BNUhkQyODkRJAmiehGw9fsJGMqIDqAmI7EUyI0S0BaUdog0eXwhF2dxmlQjIfaNr6wjOEffRoa6IZH8EPRYgUhlUNNG19/RZXY0Qr0TGA8BeqgXEdv2tGpjgxvo2vjt0IDQig%2BNIjwYECL5E9S%2BeAtYfwQdEfDeRwAOZESLdHcjcRdbJ3ksD%2B61jahZYq4Ah1uLmw0haYnobiKDH6i5MOPbMQ73Dy2iRxBYwHhJV1EJiQxgPPscj3b4p8axr8OscfzQG38MBlFbHg2I37siZxm46sUZiwC7i1xaIuzhiP3GT9MBx4q4eu035gjOWAI8sWV3OFMd4gzffgtvwvHdidxtQ2fugOfHoBcBqPX8ZVi/HAC7OsEvtjiOIBH8MxfgoXvkNdGPiF%2BR4qCUDCH7xB1xSwwwDDFeioSmAvEKgMqHAkHjMBuvIlnIO2FIS2xqEzvuTz7FkD1O%2BzExKIWtEwClecA9foy1EKfNvikccibjyok0ScJh47FiSJ5Z8sBWQrICCKzFbYBJWRBGVkpPlaqTFWJIFVgK35FHRsY%2BCH2JgOZDjUbECgWlEyGoSWjyBwE1YaBPL7vDXxE3TiKX0jFOTtx14sCbJMgnIiIupYu8WVx9G8Q3JRnW5h5K8m3DC0eYicbUKCkkAQpHQioXqMyGRS2OMUl0YGIymrjyeyU1EWlPU7hTfcY4/KXxCylvNpxoLMqapUcgriqpwmCDg5PU6utXO1U25vb1BY%2BSrx6w2vu6GgkN8h%2BKbCQdX1zbcCdW/bKKdz34KzCeJxPfiTwMEl8CUenUlqb1ROiyZPQfQYjJ6FZDIBLotARQm3GOSLDyB3CRoMdJJCtxvAdQhsT1KS71TvR84rIVnSZ5XSjpYgW6WdIF61SkuX0m6XdPbgkisuLfbfqLwEn4SJho7EkLXmInkDoZKEmlodOOkZcGxTA9AM3BYEQAoZK0mGWjJ%2BkgzMAr0/MRFNvwVSmpFMvsc42Rk8igZ48HCpq3BDYFJAYhEDgPiICgs2ZYhC3MJWUgLQ%2BZ6QDnDKXGhCyo4j9LmBAH5nllVAVABWYrIVlM9eZ7ld1hbjBprl5ZSsxWSrIlmMh3WXgFNF7Ssp6z9o7M8qLxDFoqY5ZOs3WTaVVkXBfB7DbOnBRBrj54Jss2CiAi5hSMGSrs6WbLJNlXp6aTsm0s7OlnZ1IKds5WQ7P1kRT3WrskBIzA9kSCNZoDN2fCT9mmkA5DeY2TnBFm/EEiRjDMhHJtneza04g7pks1/a4sZZ6QfORrKLnaYLq4NEudI1wrHBXAOicKg5FkLyFZK9/WUHgh%2BEv4lCFBDLmLMci5A3Ba0SpgoQI4MhtWEZdFnoT7o99tCZMFeTe14iptnac8uQgvKnlvEBoYhHgRGTPhny15zdS%2BcTC3nrzb5doXeZPRtL9yj5zdKfGIS8CqAIyn8qONfKlJ/zlI98j%2BY/K8DPzeqh5DMuiw4FK1daDJPooCCoBngYq8CwGjFDPlBR6iVUC2gyVTxfysFFgHBfvI6igVb8xMHBRYCwUkLuY5C5SJQuoVS1T5xMLqK3TQXtswFrCwquwpPkD4z5KCiwKXTjnmyv5AioRRmTfnZ1j5HIR2cvF/n6yK8e8pes7RNByILcBuIgIZDBLNId%2B3bRQo0AnnQkYgGYUjrPJrlSCi8z3GIC8GXgVJoSFeKxW8EDzKLVFQiE6eoo0SaLSSYJXRTr3HnUstEfxYxXLx14EdnGti7xVzGsW2KLS9eBxZEozhOKC8LiwcP4uXxQwPFFcEgHUWwC%2BLLKBigJUYpMUCBQl0itJTS29jiFoUf/G0kwCSV6YclGFepbUpsV8QrgXaH2YkrkUZlnGTAFxRyXKWBKPQmSzRY0p8WmLBlRSsxQyGqUB4MZXMWZaXHaWMwYgNpZxosq0QckoF6hQZQXI0XZLkUEygpekqmWmKCOuypgFUpqUZk6lDiqEvCWsWKKblrSycVUiaXdLeqvS/pTsuOU0toSIyg5U1FCRHLkAhioJcGyaZ/8rlcy/kqsp6W1yBIGy75SaF2UArDISePJfotBWFLwVISqRcTV%2BX/9oVFg55XcpEYPLmlzymJeSqiUvK1lCK72MitSWEr/lluUZTSsxWTLcVBHDZcSsnnKI4VnyhlUiq2WuloWWKigl0zmKrsEJZXbACB0NxCAEAxALsKCxt57ABZ%2BCmPnsAPbSLzY3wWgCwCyCDyTp8ou3O4k7jxxwwLAXOQLM0zlQ7BVsUjlIlXhnlzYXoVgMTAy71ENApAJ2ngsChiEPVNq8QnvWUJyyJ6vVMQNpBHlwFylllM6Sl2k6aKgeQEufnRMopvj4QqXAMrkuJQkyuWYauDAeUhCqImAwAcuIJAMCuQGAbEFUCSB5AW4GS53cPMTGrq90EieHMBuglDJ4AIVnIEmS8C8CEKu1HS/4g3QAqDrh17RUdYzG3Jj1M5jMYNYMwtJjVdoPkOsmdKHWEKrMu1ONDkvHULrLOEAEMDkmXVQtIFizeYluunU%2Brllh61oo6RvWEKlqY6%2BdY%2BornLr6VFitjloUAnnj01T4zNRv2zUJ9c1EAfNWdIvXXFt61gyIRY0n5zM3BXEz7hvX6HTC2JCIuYdrIvS0pu%2B0lC4PJSUp3ZaUU6WlEEAuBpArOF6AWWjgvQEapAVOJns43lXKhFVyq1VW3zQ3OEpM0LPhJpCvqRRE0mAJQmITJAVwngKAz9n7jrLEZpxFMCAIaC9qpJemjwdVm2xt4IAvACs0GM4PU5FrSMga7VcoUxYyEdV9yoCEEIWq/KY%2BWmnTUDyzr0kxVBK7Fcvls3abNCqa5VU5poUSrqW7m%2BzTSwQBChDizmvBVfX67RrxC%2B5Xqi2qUhdr4SwGdBAlrnU9qrYeAUgJFvv7WBrA/arkF6A83gCs626mdfevfVl0uYWWvANU0siFavNxAErfUSq0LquYs6w9evLCh2bPNQW2gI1vZRzdo1LWz8Hh0Qox43135JRaaQK2Bbitt6iwK%2BrnUTaP18JZrRIq63gCvQDWubc1uW3CkD1TdBktNu61ehet22gbdlt23/EByNpYTaJts3EBptL42aUdvAEQ9l0BmhUgFOA1EtQN4/cDcShe31boNKTKhHdssjVc7NT27zutqB5va4kGq2lNCi%2B1L8QNNwHNeSwB0w6etwOs8ptse1sNk5K7czWtoh3abjxzbACqes9A6rEZ6nbQKgBWAzS8dkO4RsohO347v1a9aHXVtxn07GdiO5HVGx%2B1o6wNGOqpIDt6FOEcdncn4Mj2bh714ZemuzkWry2hsYdxAeHaCrw2XsDCHALYLQE4BdpeAfgDgFoFICoBOAo%2BSwNYE9A7A9gdQyEDwFIAEBNAeurYJxBACSBEgLwSQCkD3QpANARwAcgFUSA9ADdHASQMbtd3m7OAvABQCAF9Uu7Tdeu0gHAFgBIAQ4oMcgJQEz0TAzgZPKJHIQYCcQ%2BASjeIPHogAxBo9IIZgMQCZCcAndQkenj%2BAYDXRo9WAcuEYHEDJ7SA%2BAV9o4F4nR78YjQT9g3t4Dz9w9Zu%2B/jEA0R16PAWAaPZJpYBj6tgmhctQoC7BmJeIP4VkCbqd38BBAIgMQOwEY0H75ASgNQNHt0DCVq1xgHLZYH0AoD49kALYBbgGDx6OAVsH8Nkx167Vy4ewd4GCTJjy94gtoG2AQAQZglZQ1pa3aMSYBx69pTQZwHjKkhzA/AwlUICsCqA1A9AJQHIAIDQO4Gsg%2BBhgKMGwMTBhKDQJA4MCWCEHKDiBgYEMHaBkHxgCQSg7Qc8BdA9AtsZg1gdYMSAtgE8XYPsD0BPBR4K%2B/QIbqj096LdHAO4IkCthVh89CoCAJU2L3b4rdVgB/RcFwCEB2IxwAWR4GEihwcwV6XgEnq0DopSAr0M3hMDmakAPdXaMwC8CnQvpJARwJw5ICKEcUihkhiPdIbN2yG49Ce53a7q2Bp7EAIATGPY2z079jDoMSIOpk4CqAqwLABQM3C2ZWUUgLwKQJ%2BA/iRBsAGwXgK/00V4B0AegM/UfvECn7ZAigFQOoB73X7SAvEDROkAkPh6jdmW6PbIZ/A1x7GSonQvcEUOGhlDfIVQ0XtoafgjDIMf8QYaKOhHk9Vhj3dsSnQpAu0RQ9w1Ol91skqkRwaQOHsj1dGZDse2wCEYsNu6/DZgAI7wCCMLHLDWwBXtkGcCSAgAA%3D%3D%3D - */ - /* - todo: this is a naive version, we can do far more optimizations for - constant src2 - */ - bool consts2 = false; - - if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - } else { - e.vmovdqa(e.xmm0, i.src1); - } + // TODO(benvanik): native version (with shift magic). if (i.src2.is_constant) { - consts2 = true; - e.mov(e.r8d, i.src2.constant() & 7); - e.mov(e.eax, 8 - (i.src2.constant() & 7)); + e.mov(e.GetNativeParam(1), i.src2.constant()); } else { - e.movzx(e.r8d, i.src2); - e.and_(e.r8d, 7); + e.mov(e.GetNativeParam(1), i.src2); } - - e.vpshufd(e.xmm1, e.xmm0, 27); - e.vpcmpeqd(e.xmm3, e.xmm3, e.xmm3); - e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMVSRShlByteshuf)); - if (!consts2) { - e.mov(e.eax, 8); - } - e.vmovd(e.xmm2, e.r8d); - if (!consts2) { - e.sub(e.eax, e.r8d); - } - e.vpsrlw(e.xmm1, e.xmm1, e.xmm2); - e.vpsrlw(e.xmm2, e.xmm3, e.xmm2); - e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMVSRMask)); - e.vpand(e.xmm1, e.xmm1, e.xmm2); - e.vmovd(e.xmm2, e.eax); - e.vpsllw(e.xmm0, e.xmm0, e.xmm2); - e.vpsllw(e.xmm2, e.xmm3, e.xmm2); - e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMZero)); - e.vpand(e.xmm0, e.xmm0, e.xmm2); - e.vpor(e.xmm0, e.xmm0, e.xmm1); - e.vpshufd(i.dest, e.xmm0, 27); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); + e.vmovaps(i.dest, e.xmm0); } static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) { // Almost all instances are shamt = 1, but non-constant. @@ -3444,9 +3146,7 @@ EMITTER_OPCODE_TABLE(OPCODE_CNTLZ, CNTLZ_I8, CNTLZ_I16, CNTLZ_I32, CNTLZ_I64); // OPCODE_SET_ROUNDING_MODE // ============================================================================ // Input: FPSCR (PPC format) -static const uint32_t mxcsr_table[] = { - 0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80, -}; + struct SET_ROUNDING_MODE_I32 : Sequence> { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index d83e40e28..0ce6a7f57 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -10,11 +10,13 @@ #ifndef XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_ #define XENIA_CPU_BACKEND_X64_X64_SEQUENCES_H_ +#include "xenia/base/logging.h" #include "xenia/cpu/hir/instr.h" #include -#define assert_impossible_sequence(name) \ - assert_always("impossible sequence hit" #name); +#define assert_impossible_sequence(name) \ + assert_always("impossible sequence hit" #name); \ + XELOGE("impossible sequence hit: {}", #name) namespace xe { namespace cpu { diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index a5fa40a04..f7d882279 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -20,7 +20,9 @@ DEFINE_bool(inline_mmio_access, true, "Inline constant MMIO loads and stores.", "CPU"); -DEFINE_bool(permit_float_constant_evaluation, false, "Allow float constant evaluation, may produce incorrect results and break games math", +DEFINE_bool(permit_float_constant_evaluation, false, + "Allow float constant evaluation, may produce incorrect results " + "and break games math", "CPU"); namespace xe { @@ -85,8 +87,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { if (i->dest) { might_be_floatop |= i->dest->MaybeFloaty(); } - - bool should_skip_because_of_float = + + bool should_skip_because_of_float = might_be_floatop && !cvars::permit_float_constant_evaluation; auto v = i->dest; @@ -557,6 +559,12 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0); i->Remove(); result = true; + } else if (!i->src2.value->MaybeFloaty() && + i->src2.value->IsConstantZero()) { + // division by 0 == 0 every time, + v->set_zero(i->src2.value->type); + i->Remove(); + result = true; } else if (i->src2.value->IsConstant()) { // Division by one = no-op. Value* src1 = i->src1.value; @@ -672,29 +680,33 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { } break; case OPCODE_SHL: - if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { - v->set_from(i->src1.value); - v->Shl(i->src2.value); - i->Remove(); - result = true; - } else if (i->src2.value->IsConstantZero()) { - auto src1 = i->src1.value; - i->Replace(&OPCODE_ASSIGN_info, 0); - i->set_src1(src1); - result = true; + if (i->dest->type != VEC128_TYPE) { + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->Shl(i->src2.value); + i->Remove(); + result = true; + } else if (i->src2.value->IsConstantZero()) { + auto src1 = i->src1.value; + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(src1); + result = true; + } } break; case OPCODE_SHR: - if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { - v->set_from(i->src1.value); - v->Shr(i->src2.value); - i->Remove(); - result = true; - } else if (i->src2.value->IsConstantZero()) { - auto src1 = i->src1.value; - i->Replace(&OPCODE_ASSIGN_info, 0); - i->set_src1(src1); - result = true; + if (i->dest->type != VEC128_TYPE) { + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->Shr(i->src2.value); + i->Remove(); + result = true; + } else if (i->src2.value->IsConstantZero()) { + auto src1 = i->src1.value; + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(src1); + result = true; + } } break; case OPCODE_SHA: @@ -729,7 +741,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } break; - +#if 1 case OPCODE_PERMUTE: { if (i->src1.value->IsConstant() && i->src2.value->IsConstant() && i->src3.value->IsConstant() && @@ -756,6 +768,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { break; } +#endif case OPCODE_INSERT: if (i->src1.value->IsConstant() && i->src2.value->IsConstant() && i->src3.value->IsConstant()) { diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 1b84e417c..4c02789b8 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -83,6 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { iter_result |= SimplifyBitArith(builder); iter_result |= EliminateConversions(builder); iter_result |= SimplifyAssignments(builder); + iter_result |= SimplifyBasicArith(builder); result |= iter_result; } while (iter_result); @@ -1228,6 +1229,91 @@ Value* SimplificationPass::CheckValue(Value* value, bool& result) { return value; } +bool SimplificationPass::SimplifyAddArith(hir::Instr* i, + hir::HIRBuilder* builder) { + /* + example: (x <<1 ) + x == (x*3) + + */ + auto [shlinsn, addend] = + i->BinaryValueArrangeByDefiningOpcode(&OPCODE_SHL_info); + if (!shlinsn) { + return false; + } + Instr* shift_insn = shlinsn->def; + + Value* shift = shift_insn->src2.value; + + // if not a constant shift, we cant combine to a multiply + if (!shift->IsConstant()) { + return false; + } + + Value* shouldbeaddend = shift_insn->src1.value; + + if (!shouldbeaddend->IsEqual(addend)) { + return false; + } + + uint64_t multiplier = 1ULL << shift->constant.u8; + + multiplier++; + + hir::Value* oldvalue = shouldbeaddend; + + i->Replace(&OPCODE_MUL_info, ARITHMETIC_UNSIGNED); + i->set_src1(oldvalue); + + // this sequence needs to be broken out into some kind of LoadConstant(type, + // raw_value) method of hirbuilder + auto constmul = builder->AllocValue(oldvalue->type); + // could cause problems on big endian targets... + constmul->flags |= VALUE_IS_CONSTANT; + constmul->constant.u64 = multiplier; + + i->set_src2(constmul); + + return true; +} + +bool SimplificationPass::SimplifySubArith(hir::Instr* i, + hir::HIRBuilder* builder) { + return false; +} +bool SimplificationPass::SimplifyBasicArith(hir::Instr* i, + hir::HIRBuilder* builder) { + if (!i->dest) { + return false; + } + if (i->dest->MaybeFloaty()) { + return false; + } + + hir::Opcode op = i->GetOpcodeNum(); + + switch (op) { + case OPCODE_ADD: { + return SimplifyAddArith(i, builder); + } + case OPCODE_SUB: { + return SimplifySubArith(i, builder); + } + } + return false; +} +bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) { + bool result = false; + auto block = builder->first_block(); + while (block) { + auto i = block->instr_head; + while (i) { + result |= SimplifyBasicArith(i, builder); + i = i->next; + } + block = block->next; + } + return result; +} } // namespace passes } // namespace compiler } // namespace cpu diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index 3e3fa9c46..8a5d3ee4c 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -32,6 +32,13 @@ class SimplificationPass : public ConditionalGroupSubpass { bool SimplifyAssignments(hir::HIRBuilder* builder); hir::Value* CheckValue(hir::Value* value, bool& result); bool SimplifyBitArith(hir::HIRBuilder* builder); + + // handles simple multiplication/addition rules + bool SimplifyBasicArith(hir::HIRBuilder* builder); + bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder); + + bool SimplifyAddArith(hir::Instr* i, hir::HIRBuilder* builder); + bool SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder); // handle either or or xor with 0 bool CheckOrXorZero(hir::Instr* i); bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder); diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 337622215..47f629227 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -79,6 +79,10 @@ class Instr { void MoveBefore(Instr* other); void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags); void Remove(); + const OpcodeInfo* GetOpcodeInfo() const { return opcode; } + // if opcode is null, we have bigger problems + Opcode GetOpcodeNum() const { return GetOpcodeInfo()->num; } + template std::pair BinaryValueArrangeByPredicateExclusive( TPredicate&& pred) { @@ -86,12 +90,13 @@ class Instr { auto src2_value = src2.value; if (!src1_value || !src2_value) return {nullptr, nullptr}; - if (!opcode) return {nullptr, nullptr}; // impossible! + if (!GetOpcodeInfo()) return {nullptr, nullptr}; // impossible! // check if binary opcode taking two values. we dont care if the dest is a // value - if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr}; + if (!IsOpcodeBinaryValue(GetOpcodeInfo()->signature)) + return {nullptr, nullptr}; if (pred(src1_value)) { if (pred(src2_value)) { @@ -119,7 +124,7 @@ if both are constant, return nullptr, nullptr std::pair BinaryValueArrangeByDefiningOpcode( const OpcodeInfo* op_ptr) { return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) { - return value->def && value->def->opcode == op_ptr; + return value->def && value->def->GetOpcodeInfo() == op_ptr; }); } @@ -143,7 +148,7 @@ if both are constant, return nullptr, nullptr */ template void VisitValueOperands(TCallable&& call_for_values) { - uint32_t signature = opcode->signature; + uint32_t signature = GetOpcodeInfo()->signature; OpcodeSignatureType t_dest, t_src1, t_src2, t_src3; diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index f6e76f99d..473dc5d90 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -199,7 +199,7 @@ void Value::Truncate(TypeName target_type) { return; } } -//WARNING: this does not handle rounding flags at all! +// WARNING: this does not handle rounding flags at all! void Value::Convert(TypeName target_type, RoundMode round_mode) { switch (type) { case FLOAT32_TYPE: @@ -428,35 +428,57 @@ void Value::MulHi(Value* other, bool is_unsigned) { } } +template +static T PPCUDiv(T numer, T denom) { + if (!denom) { + return 0; + } else { + return numer / denom; + } +} +template +static T PPCIDiv(T numer, T denom) { + if (!denom) { + return 0; + } else if (numer == static_cast(1LL << ((sizeof(T) * CHAR_BIT) - 1)) && + !~denom) { // if numer is signbit and denom is all ones, signed + // oflow + return 0; + } else { + return numer / denom; + } +} + +// warning : we tolerate division by 0 in x64_sequences, but here we do not void Value::Div(Value* other, bool is_unsigned) { assert_true(type == other->type); switch (type) { case INT8_TYPE: if (is_unsigned) { - constant.i8 /= uint8_t(other->constant.i8); + constant.i8 = PPCUDiv(constant.i8, other->constant.i8); } else { - constant.i8 /= other->constant.i8; + constant.i8 = PPCIDiv(constant.i8, other->constant.i8); } break; case INT16_TYPE: if (is_unsigned) { - constant.i16 /= uint16_t(other->constant.i16); + constant.i16 = PPCUDiv(constant.i16, other->constant.i16); } else { - constant.i16 /= other->constant.i16; + constant.i16 = PPCIDiv(constant.i16, other->constant.i16); } break; case INT32_TYPE: if (is_unsigned) { - constant.i32 /= uint32_t(other->constant.i32); + constant.i32 = PPCUDiv(constant.i32, other->constant.i32); } else { - constant.i32 /= other->constant.i32; + constant.i32 = PPCIDiv(constant.i32, other->constant.i32); } break; case INT64_TYPE: if (is_unsigned) { - constant.i64 /= uint64_t(other->constant.i64); + constant.i64 = PPCUDiv(constant.i64, other->constant.i64); } else { - constant.i64 /= other->constant.i64; + constant.i64 = PPCIDiv(constant.i64, other->constant.i64); } break; case FLOAT32_TYPE: diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index b62e28498..5719357a4 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -364,12 +364,11 @@ int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) { // is this the right format? - //todo: what mtvscr does with the unused bits is implementation defined, figure out what it does - + // todo: what mtvscr does with the unused bits is implementation defined, + // figure out what it does Value* v = f.LoadVR(i.VX128_1.RB); - Value* has_njm_value = f.Extract(v, (uint8_t)3, INT32_TYPE); f.SetNJM(f.IsTrue(f.And(has_njm_value, f.LoadConstantInt32(65536)))); @@ -1824,9 +1823,38 @@ int InstrEmit_vsum4ubs(PPCHIRBuilder& f, const InstrData& i) { return 1; } +static Value* vkpkx_in_low(PPCHIRBuilder& f, Value* input) { + // truncate from argb8888 to 1 bit alpha, 5 bit red, 5 bit green, 5 bit blue + auto ShrU32Vec = [&f](Value* input, unsigned shift) { + return f.VectorShr(input, f.LoadConstantVec128(vec128i(shift)), INT32_TYPE); + }; + auto AndU32Vec = [&f](Value* input, unsigned msk) { + return f.And(input, f.LoadConstantVec128(vec128i(msk))); + }; + auto tmp1 = AndU32Vec(ShrU32Vec(input, 9), 0xFC00); + auto tmp2 = AndU32Vec(ShrU32Vec(input, 6), 0x3E0); + auto tmp3 = AndU32Vec(ShrU32Vec(input, 3), 0x1F); + return f.Or(tmp3, f.Or(tmp1, tmp2)); +} + int InstrEmit_vpkpx(PPCHIRBuilder& f, const InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + // I compared the results of this against over a million randomly generated + // sets of inputs and all compared equal + + Value* src1 = f.LoadVR(i.VX.VA); + + Value* src2 = f.LoadVR(i.VX.VB); + + Value* pck1 = vkpkx_in_low(f, src1); + Value* pck2 = vkpkx_in_low(f, src2); + + Value* result = f.Pack( + pck1, pck2, + PACK_TYPE_16_IN_32 | PACK_TYPE_IN_UNSIGNED | PACK_TYPE_OUT_UNSIGNED); + + f.StoreVR(i.VX.VD, result); + + return 0; } int InstrEmit_vpkshss_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, diff --git a/src/xenia/cpu/ppc/ppc_emit_alu.cc b/src/xenia/cpu/ppc/ppc_emit_alu.cc index 6c9fd9120..fe4fe016d 100644 --- a/src/xenia/cpu/ppc/ppc_emit_alu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_alu.cc @@ -336,10 +336,14 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) { XEINSTRNOTIMPLEMENTED(); return 1; } + Value* ratrunc = + f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE); + + Value* rbtrunc = + f.SignExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE); + + Value* v = f.Sha(f.Mul(ratrunc, rbtrunc), 32); - Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), - f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)), - INT64_TYPE); f.StoreGPR(i.XO.RT, v); if (i.XO.Rc) { f.UpdateCR(0, v); @@ -355,10 +359,13 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) { return 1; } - Value* v = f.ZeroExtend( - f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), - f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED), - INT64_TYPE); + Value* ratrunc = + f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE); + + Value* rbtrunc = + f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE); + + Value* v = f.Shr(f.Mul(ratrunc, rbtrunc, ARITHMETIC_UNSIGNED), 32); f.StoreGPR(i.XO.RT, v); if (i.XO.Rc) { f.UpdateCR(0, v); diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc index e12caa9bc..5723c6bfd 100644 --- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc @@ -89,8 +89,10 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) { // frD <- 1.0 / (frB) - Value* v = f.Recip(f.LoadFPR(i.A.FRB)); - v = f.ToSingle(v); + // this actually does seem to require single precision, oddly + // more research is needed + Value* v = f.Recip(f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE)); + v = f.Convert(v, FLOAT64_TYPE); // f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; diff --git a/src/xenia/cpu/ppc/ppc_emit_memory.cc b/src/xenia/cpu/ppc/ppc_emit_memory.cc index ee347682c..7e7636adb 100644 --- a/src/xenia/cpu/ppc/ppc_emit_memory.cc +++ b/src/xenia/cpu/ppc/ppc_emit_memory.cc @@ -11,9 +11,17 @@ #include #include "xenia/base/assert.h" +#include "xenia/base/cvar.h" #include "xenia/cpu/ppc/ppc_context.h" #include "xenia/cpu/ppc/ppc_hir_builder.h" +DEFINE_bool( + disable_prefetch_and_cachecontrol, false, + "Disables translating ppc prefetch/cache flush instructions to host " + "prefetch/cacheflush instructions. This may improve performance as these " + "instructions were written with the Xbox 360's cache in mind, and modern " + "processors do their own automatic prefetching.", + "CPU"); namespace xe { namespace cpu { namespace ppc { @@ -1080,28 +1088,36 @@ int InstrEmit_stfsx(PPCHIRBuilder& f, const InstrData& i) { // https://randomascii.wordpress.com/2018/01/07/finding-a-cpu-design-bug-in-the-xbox-360/ int InstrEmit_dcbf(PPCHIRBuilder& f, const InstrData& i) { - Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - f.CacheControl(ea, 128, - CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH); + if (!cvars::disable_prefetch_and_cachecontrol) { + Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); + f.CacheControl(ea, 128, + CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH); + } return 0; } int InstrEmit_dcbst(PPCHIRBuilder& f, const InstrData& i) { - Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE); + if (!cvars::disable_prefetch_and_cachecontrol) { + Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); + f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE); + } return 0; } int InstrEmit_dcbt(PPCHIRBuilder& f, const InstrData& i) { - Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH); + if (!cvars::disable_prefetch_and_cachecontrol) { + Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); + f.CacheControl(ea, 128, CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH); + } return 0; } int InstrEmit_dcbtst(PPCHIRBuilder& f, const InstrData& i) { - Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - f.CacheControl(ea, 128, - CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE); + if (!cvars::disable_prefetch_and_cachecontrol) { + Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); + f.CacheControl(ea, 128, + CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE); + } return 0; } diff --git a/src/xenia/cpu/ppc/ppc_frontend.h b/src/xenia/cpu/ppc/ppc_frontend.h index d2a41c76c..27cc562fe 100644 --- a/src/xenia/cpu/ppc/ppc_frontend.h +++ b/src/xenia/cpu/ppc/ppc_frontend.h @@ -55,7 +55,9 @@ class PPCFrontend { PPCBuiltins builtins_ = {0}; TypePool translator_pool_; }; - +// Checks the state of the global lock and sets scratch to the current MSR +// value. +void CheckGlobalLock(PPCContext* ppc_context, void* arg0, void* arg1); } // namespace ppc } // namespace cpu } // namespace xe diff --git a/src/xenia/kernel/util/shim_utils.h b/src/xenia/kernel/util/shim_utils.h index 579305a72..8a0411b0b 100644 --- a/src/xenia/kernel/util/shim_utils.h +++ b/src/xenia/kernel/util/shim_utils.h @@ -192,6 +192,21 @@ class ParamBase : public Param { T value_; }; +class ContextParam : public Param { + public: + ContextParam() : Param(), ctx_(nullptr) {} + ContextParam(PPCContext* value) : Param(), ctx_(value) {} + ContextParam(Init& init) : Param(init), ctx_(init.ppc_context) {} + + operator PPCContext*() const { return ctx_; } + PPCContext* value() const { return ctx_; } + + PPCContext* operator->() const { return ctx_; } + + protected: + PPCContext* ctx_; +}; + class PointerParam : public ParamBase { public: PointerParam(Init& init) : ParamBase(init) { @@ -370,6 +385,7 @@ using int_result_t = shim::ResultBase; using dword_result_t = shim::ResultBase; using pointer_result_t = shim::ResultBase; using X_HRESULT_result_t = shim::ResultBase; +using ppc_context_t = shim::ContextParam; // Exported from kernel_state.cc. KernelState* kernel_state(); @@ -422,6 +438,9 @@ inline void AppendParam(StringBuffer* string_buffer, lpdouble_t param) { string_buffer->AppendFormat("({:G})", param.value()); } } +inline void AppendParam(StringBuffer* string_buffer, ppc_context_t param) { + string_buffer->Append("ContextArg"); +} inline void AppendParam(StringBuffer* string_buffer, lpstring_t param) { string_buffer->AppendFormat("{:08X}", param.guest_address()); if (param) { diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_misc.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_misc.cc index 7a100665a..1c29c5e57 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_misc.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_misc.cc @@ -8,12 +8,13 @@ */ #include "xenia/base/logging.h" +#include "xenia/cpu/ppc/ppc_frontend.h" +#include "xenia/cpu/processor.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/util/shim_utils.h" #include "xenia/kernel/xboxkrnl/xboxkrnl_private.h" #include "xenia/kernel/xthread.h" #include "xenia/xbox.h" - namespace xe { namespace kernel { namespace xboxkrnl { @@ -22,6 +23,94 @@ void KeEnableFpuExceptions_entry(dword_t enabled) { // TODO(benvanik): can we do anything about exceptions? } DECLARE_XBOXKRNL_EXPORT1(KeEnableFpuExceptions, kNone, kStub); +#if 0 +struct __declspec(align(8)) fpucontext_ptr_t { + char unknown_data[158]; + __int16 field_9E; + char field_A0[2272]; + unsigned __int64 saved_FPSCR; + double saved_fpu_regs[32]; +}; +#pragma pack(push, 1) +struct __declspec(align(1)) r13_struct_t { + char field_0[6]; + __int16 field_6; + char field_8[2]; + char field_A; + char field_B[5]; + int field_10; + char field_14[315]; + char field_14F; + unsigned int field_150; + char field_154[427]; + char field_2FF; + char field_300; +}; +#pragma pack(pop) + + +static uint64_t Do_mfmsr(ppc_context_t& ctx) { + auto frontend = ctx->thread_state->processor()->frontend(); + cpu::ppc::CheckGlobalLock( + ctx, reinterpret_cast(&xe::global_critical_region::mutex()), + reinterpret_cast(&frontend->builtins()->global_lock_count)); + return ctx->scratch; +} + +void KeSaveFloatingPointState_entry(ppc_context_t& ctx) { + xe::Memory* memory = ctx->thread_state->memory(); + unsigned int r13 = static_cast(ctx->r[13]); + + + + + r13_struct_t* st = memory->TranslateVirtual(r13); + /* + lwz r10, 0x150(r13) + lbz r11, 0xA(r13) + tweqi r10, 0 + twnei r11, 0 + */ + + unsigned int r10 = st->field_150; + unsigned char r11 = st->field_A; + + if (r10 == 0 || r11 != 0) { + //trap! + } + + //should do mfmsr here + + unsigned int r3 = xe::load_and_swap(&st->field_10); + + //too much work to do the mfmsr/mtmsr stuff right now + int to_store = -2049; + xe::store_and_swap(&st->field_10, (unsigned int)to_store); + xe::store_and_swap(&st->field_6, (short)to_store); + + + + if (r3 != ~0u) { + fpucontext_ptr_t* fpucontext = + memory->TranslateVirtual(r3); + xe::store_and_swap(&fpucontext->saved_FPSCR, ctx->fpscr.value); + + for (unsigned int i = 0; i < 32; ++i) { + xe::store_and_swap(&fpucontext->saved_fpu_regs[i], ctx->f[i]); + } + xe::store_and_swap(&fpucontext->field_9E, 0xD7FF); + } + ctx->processor->backend()->SetGuestRoundingMode(ctx.value(), 0); + ctx->fpscr.value = 0; + st->field_A = 1; + + xe::store_and_swap(&st->field_10, r13 + 0x300); + ctx->r[3] = r3; + +} + +DECLARE_XBOXKRNL_EXPORT1(KeSaveFloatingPointState, kNone, kImplemented); +#endif } // namespace xboxkrnl } // namespace kernel