From 968f656d96b3ca9c14d6467423df77d3583f7d18 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sun, 31 Jul 2022 08:56:36 -0700
Subject: [PATCH] Add separate VMX/fpu mxcsr

Add support for constant operands for most fpu instructions.
Remove constant folding for most fpu code and half float.
---
 src/xenia/cpu/backend/x64/x64_backend.cc     |   6 +
 src/xenia/cpu/backend/x64/x64_backend.h      |   8 +
 src/xenia/cpu/backend/x64/x64_emitter.cc     | 108 ++-
 src/xenia/cpu/backend/x64/x64_emitter.h      |  24 +-
 src/xenia/cpu/backend/x64/x64_op.h           |  24 +
 src/xenia/cpu/backend/x64/x64_seq_control.cc |   7 +
 src/xenia/cpu/backend/x64/x64_seq_vector.cc  | 120 +++-
 src/xenia/cpu/backend/x64/x64_sequences.cc   | 655 ++++++++----------
 .../passes/constant_propagation_pass.cc      | 166 ++---
 src/xenia/cpu/hir/hir_builder.cc             |  14 +-
 src/xenia/cpu/hir/hir_builder.h              |   2 +-
 src/xenia/cpu/hir/opcodes.h                  |   2 +
 src/xenia/cpu/hir/opcodes.inl                |  32 +-
 src/xenia/cpu/hir/value.cc                   |  68 +-
 src/xenia/cpu/hir/value.h                    |   9 +-
 src/xenia/cpu/ppc/ppc_emit_altivec.cc        |   9 +
 src/xenia/cpu/ppc/ppc_emit_fpu.cc            |   5 +-
 src/xenia/emulator.cc                        |  39 +-
 18 files changed, 687 insertions(+), 611 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc
index 77448f15c..87ee4f76a 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.cc
+++ b/src/xenia/cpu/backend/x64/x64_backend.cc
@@ -692,6 +692,12 @@ void X64Backend::InitializeBackendContext(void* ctx) {
   X64BackendContext* bctx = reinterpret_cast<X64BackendContext*>(
       reinterpret_cast<intptr_t>(ctx) - sizeof(X64BackendContext));
   bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
+  bctx->mxcsr_fpu =
+      DEFAULT_FPU_MXCSR;  // TODO: not sure this is right; check on an RGH
+                          // what the PPC rounding mode is at startup
+  bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
+  bctx->flags = 0;
+  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
   bctx->Ox1000 = 0x1000;
 }
 }  // namespace x64

diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h
index 332cbc196..1026202fe 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.h
+++ b/src/xenia/cpu/backend/x64/x64_backend.h
@@ -37,9 +37,17 @@ typedef void (*ResolveFunctionThunk)();
 // negatively index the membase reg)
 struct X64BackendContext {
   void* ResolveFunction_Ptr;  // cached pointer to resolvefunction
+  unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
+                           // affects both vmx and the fpu
+  unsigned int mxcsr_vmx;
+  unsigned int flags;  // bit 0 = 0 if mxcsr is fpu, else it is vmx
   unsigned int Ox1000;  // constant 0x1000 so we can shrink each tail emitted
                         // add of it by... 2 bytes lol
 };
+constexpr unsigned int DEFAULT_VMX_MXCSR =
+    0x8000 |          // flush to zero
+    0x0040 |          // denormals are zero
+    (_MM_MASK_MASK);  // all exceptions masked; round to nearest
+
+constexpr unsigned int DEFAULT_FPU_MXCSR = 0x1F80;

 class X64Backend : public Backend {
  public:

diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index a212d5fe6..129ecc0d3 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -320,6 +320,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
   // Body.
   auto block = builder->first_block();
   while (block) {
+    ForgetMxcsrMode();  // at start of block, mxcsr mode is undefined
+
     // Mark block labels.
     auto label = block->label_head;
     while (label) {

@@ -490,6 +492,7 @@ uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {

 void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
   assert_not_null(function);
+  ForgetMxcsrMode();
   auto fn = static_cast<X64Function*>(function);
   // Resolve address to the function to call and store in rax.

@@ -564,6 +567,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {

 void X64Emitter::CallIndirect(const hir::Instr* instr,
                               const Xbyak::Reg64& reg) {
+  ForgetMxcsrMode();
   // Check if return.
   if (instr->flags & hir::CALL_POSSIBLE_RETURN) {
     cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);

@@ -617,6 +621,7 @@ uint64_t UndefinedCallExtern(void* raw_context, uint64_t function_ptr) {
   return 0;
 }
 void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
+  ForgetMxcsrMode();
   bool undefined = true;
   if (function->behavior() == Function::Behavior::kBuiltin) {
     auto builtin_function = static_cast<const BuiltinFunction*>(function);

@@ -696,11 +701,13 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) {
 }

 // Important: If you change these, you must update the thunks in x64_backend.cc!
-Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; }
-Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; }
+Xbyak::Reg64 X64Emitter::GetContextReg() const { return rsi; }
+Xbyak::Reg64 X64Emitter::GetMembaseReg() const { return rdi; }

 void X64Emitter::ReloadMembase() {
-  mov(GetMembaseReg(), qword[GetContextReg() + 8]);  // membase
+  mov(GetMembaseReg(),
+      qword[GetContextReg() +
+            offsetof(ppc::PPCContext, virtual_membase)]);  // membase
 }

 // Len  Assembly  Byte Sequence

@@ -917,7 +924,7 @@ static const vec128_t xmm_consts[] = {
     /* XMMQNaN */ vec128i(0x7FC00000u),
     /* XMMInt127 */ vec128i(0x7Fu),
     /* XMM2To32 */ vec128f(0x1.0p32f),
-    /* xmminf */ vec128i(0x7f800000),
+    /* XMMFloatInf */ vec128i(0x7f800000),
     /* XMMIntsToBytes*/
     v128_setr_bytes(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

@@ -938,9 +945,7 @@ static const vec128_t xmm_consts[] = {
     /*XMMVSRShlByteshuf*/
     v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
     // XMMVSRMask
-    vec128b(1)
-
-};
+    vec128b(1)};

 void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
   for (auto& vec : xmm_consts) {

@@ -1347,7 +1352,7 @@ SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) {
   return SimdDomain::DONTCARE;
 }

-Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) {
+Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) const {
   /*
       index context ptr negatively to get to backend ctx field
   */

@@ -1368,6 +1373,93 @@ Xbyak::Label& X64Emitter::NewCachedLabel() {
   label_cache_.push_back(tmp);
   return *tmp;
 }
+
+template <bool switching_to_fpu>
+static void ChangeMxcsrModeDynamicHelper(X64Emitter& e) {
+  auto flags = e.GetBackendFlagsPtr();
+  if (switching_to_fpu) {
+    e.btr(flags, 0);  // bit 0 set to 0 = is fpu mode
+  } else {
+    e.bts(flags, 0);  // bit 0 set to 1 = is vmx mode
+  }
+  Xbyak::Label& come_back = e.NewCachedLabel();
+
+  Xbyak::Label& reload_bailout =
+      e.AddToTail([&come_back](X64Emitter& e, Xbyak::Label& thislabel) {
+        e.L(thislabel);
+        if (switching_to_fpu) {
+          e.LoadFpuMxcsrDirect();
+        } else {
+          e.LoadVmxMxcsrDirect();
+        }
+        e.jmp(come_back, X64Emitter::T_NEAR);
+      });
+  if (switching_to_fpu) {
+    e.jc(reload_bailout,
+         X64Emitter::T_NEAR);  // carry flag set = we were in VMX mxcsr mode,
+                               // so the fpu mxcsr must be loaded
+  } else {
+    e.jnc(reload_bailout,
+          X64Emitter::T_NEAR);  // carry flag clear = we were in FPU mxcsr
+                                // mode, so the vmx mxcsr must be loaded
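+    // Note: bts/btr copy the previous value of the tested bit into CF before
+    // writing it, so a single instruction both updates the mode flag and
+    // reveals whether the mode actually changed. The vldmxcsr reload lives in
+    // an out-of-line tail block (AddToTail), so the common case where the
+    // mode is already correct costs only the bit test plus an untaken branch.
+    // Roughly, the fast path emitted when switching to the fpu mode is:
+    //   btr dword [backend_ctx.flags], 0   ; CF = old bit 0
+    //   jc  reload_bailout                 ; was vmx -> vldmxcsr mxcsr_fpu
+    // come_back: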
+  }
+  e.L(come_back);
+}
+
+bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
+  if (new_mode == mxcsr_mode_) {
+    return false;
+  }
+  assert_true(new_mode != MXCSRMode::Unknown);
+
+  if (mxcsr_mode_ == MXCSRMode::Unknown) {
+    // check the mode dynamically
+    mxcsr_mode_ = new_mode;
+    if (!already_set) {
+      if (new_mode == MXCSRMode::Fpu) {
+        ChangeMxcsrModeDynamicHelper<true>(*this);
+      } else if (new_mode == MXCSRMode::Vmx) {
+        ChangeMxcsrModeDynamicHelper<false>(*this);
+      } else {
+        assert_unhandled_case(new_mode);
+      }
+    } else {  // even if already set, we still need to update the flags to
+              // reflect our mode
+      if (new_mode == MXCSRMode::Fpu) {
+        btr(GetBackendFlagsPtr(), 0);
+      } else if (new_mode == MXCSRMode::Vmx) {
+        bts(GetBackendFlagsPtr(), 0);
+      } else {
+        assert_unhandled_case(new_mode);
+      }
+    }
+  } else {
+    mxcsr_mode_ = new_mode;
+    if (!already_set) {
+      if (new_mode == MXCSRMode::Fpu) {
+        LoadFpuMxcsrDirect();
+        btr(GetBackendFlagsPtr(), 0);
+        return true;
+      } else if (new_mode == MXCSRMode::Vmx) {
+        LoadVmxMxcsrDirect();
+        bts(GetBackendFlagsPtr(), 0);
+        return true;
+      } else {
+        assert_unhandled_case(new_mode);
+      }
+    }
+  }
+  return false;
+}
+void X64Emitter::LoadFpuMxcsrDirect() {
+  vldmxcsr(GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)));
+}
+void X64Emitter::LoadVmxMxcsrDirect() {
+  vldmxcsr(GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx)));
+}
+Xbyak::Address X64Emitter::GetBackendFlagsPtr() const {
+  Xbyak::Address pt = GetBackendCtxPtr(offsetof(X64BackendContext, flags));
+  pt.setBit(32);
+  return pt;
+}
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu

diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h
index 528326088..93a7babaf 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@@ -65,6 +65,12 @@ enum class SimdDomain : uint32_t {
   // CONFLICTING means its used in multiple domains)
 };

+enum class MXCSRMode : uint32_t {
+  Unknown,
+  Fpu,
+  Vmx
+};
+
 static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
   if (dom1 == dom2) {
     return dom1;
   }

@@ -283,8 +289,8 @@ class X64Emitter : public Xbyak::CodeGenerator {

   Xbyak::Reg64 GetNativeParam(uint32_t param);

-  Xbyak::Reg64 GetContextReg();
-  Xbyak::Reg64 GetMembaseReg();
+  Xbyak::Reg64 GetContextReg() const;
+  Xbyak::Reg64 GetMembaseReg() const;
   bool CanUseMembaseLow32As0() const { return may_use_membase32_as_zero_reg_; }
   void ReloadMembase();

@@ -295,7 +301,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
   void MovMem64(const Xbyak::RegExp& addr, uint64_t v);

   Xbyak::Address GetXmmConstPtr(XmmConst id);
-  Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx);
+  Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx) const;

   void LoadConstantXmm(Xbyak::Xmm dest, float v);
   void LoadConstantXmm(Xbyak::Xmm dest, double v);

@@ -304,6 +310,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
   Xbyak::Address StashConstantXmm(int index, float v);
   Xbyak::Address StashConstantXmm(int index, double v);
   Xbyak::Address StashConstantXmm(int index, const vec128_t& v);
+  Xbyak::Address GetBackendFlagsPtr() const;
   void* FindByteConstantOffset(unsigned bytevalue);
   void* FindWordConstantOffset(unsigned wordvalue);
   void* FindDwordConstantOffset(unsigned bytevalue);

@@ -319,6 +326,16 @@ class X64Emitter : public Xbyak::CodeGenerator {
   size_t stack_size() const { return stack_size_; }
   SimdDomain DeduceSimdDomain(const hir::Value* for_value);

+  void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; }
+
+  /*
+    Returns true if it had to load mxcsr. DOT_PRODUCT can use this to skip
+    clearing the overflow flag, as it will never be set in the vmx fpscr.
+  */
+  bool ChangeMxcsrMode(MXCSRMode new_mode,
+                       bool already_set = false);  // already_set means the
+                                                   // caller already did the
+                                                   // vldmxcsr; used for
+                                                   // SET_ROUNDING_MODE
+
+  void LoadFpuMxcsrDirect();  // unsafe, does not change mxcsr_mode_
+  void LoadVmxMxcsrDirect();  // unsafe, does not change mxcsr_mode_

  protected:
   void* Emplace(const EmitFunctionInfo& func_info,
                 GuestFunction* function = nullptr);

@@ -359,6 +376,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
   std::vector<Xbyak::Label*> label_cache_;  // for creating labels that need
                                             // to be referenced much later by
                                             // tail emitters
+  MXCSRMode mxcsr_mode_ = MXCSRMode::Unknown;
 };

 }  // namespace x64

diff --git a/src/xenia/cpu/backend/x64/x64_op.h b/src/xenia/cpu/backend/x64/x64_op.h
index 1f3e38cc6..745603032 100644
--- a/src/xenia/cpu/backend/x64/x64_op.h
+++ b/src/xenia/cpu/backend/x64/x64_op.h
@@ -616,7 +616,31 @@ struct Sequence {
     }
   }
 };
+template <typename T>
+static Xmm GetInputRegOrConstant(X64Emitter& e, const T& input,
+                                 Xmm xmm_to_use_if_const) {
+  if (input.is_constant) {
+    using constant_type = std::remove_reference_t<decltype(input.constant())>;
+    if constexpr (std::is_integral_v<constant_type>) {
+      vec128_t input_constant = vec128b(0);
+      if constexpr (sizeof(constant_type) == 4) {
+        input_constant.i32[0] = input.constant();
+
+      } else if constexpr (sizeof(constant_type) == 8) {
+        input_constant.low = input.constant();
+      } else {
+        assert_unhandled_case(sizeof(constant_type));
+      }
+      e.LoadConstantXmm(xmm_to_use_if_const, input_constant);
+    } else {
+      e.LoadConstantXmm(xmm_to_use_if_const, input.constant());
+    }
+    return xmm_to_use_if_const;
+  } else {
+    return input;
+  }
+}
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu

diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc
index 0df9d3255..dc5fa7d3d 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_control.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc
@@ -257,6 +257,7 @@ struct CALL_TRUE_I8
     e.jz(skip);
     e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
     e.L(skip);
+    e.ForgetMxcsrMode();
   }
 };
 struct CALL_TRUE_I16
@@ -268,6 +269,7 @@
     e.jz(skip);
     e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
     e.L(skip);
+    e.ForgetMxcsrMode();
   }
 };
 struct CALL_TRUE_I32
@@ -279,6 +281,7 @@
     e.jz(skip);
     e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
     e.L(skip);
+    e.ForgetMxcsrMode();
   }
 };
 struct CALL_TRUE_I64
@@ -290,6 +293,7 @@
     e.jz(skip);
     e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
     e.L(skip);
+    e.ForgetMxcsrMode();
   }
 };
 struct CALL_TRUE_F32
@@ -301,6 +305,7 @@
     e.jz(skip);
     e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
     e.L(skip);
+    e.ForgetMxcsrMode();
   }
 };

 struct CALL_TRUE_F64
@@ -313,6 +318,7 @@
     e.jz(skip);
     e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
     e.L(skip);
+    e.ForgetMxcsrMode();
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16,
@@ -326,6 +332,7 @@ struct CALL_INDIRECT
     : Sequence<CALL_INDIRECT, I<OPCODE_CALL_INDIRECT, VoidOp, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     e.CallIndirect(i.instr, i.src1);
+    e.ForgetMxcsrMode();
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT, CALL_INDIRECT);

diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index 7c55300db..846eda234 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -16,7 +16,13 @@
 // For OPCODE_PACK/OPCODE_UNPACK
 #include
"third_party/half/include/half.hpp" +#include "xenia/base/cvar.h" +#include "xenia/cpu/backend/x64/x64_stack_layout.h" +DEFINE_bool(use_extended_range_half, true, + "Emulate extended range half-precision, may be slower on games " + "that use it heavily", + "CPU"); namespace xe { namespace cpu { namespace backend { @@ -31,6 +37,8 @@ struct VECTOR_CONVERT_I2F : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); // flags = ARITHMETIC_UNSIGNED if (i.instr->flags & ARITHMETIC_UNSIGNED) { // Round manually to (1.stored mantissa bits * 2^31) or to 2^32 to the @@ -46,8 +54,8 @@ struct VECTOR_CONVERT_I2F // be 4294967296.0f. // xmm0 = src + 0b01111111 + ((src >> 8) & 1) // (xmm1 also used to launch reg + mem early and to require it late) - e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMInt127)); - e.vpslld(e.xmm0, i.src1, 31 - 8); + e.vpaddd(e.xmm1, src1, e.GetXmmConstPtr(XMMInt127)); + e.vpslld(e.xmm0, src1, 31 - 8); e.vpsrld(e.xmm0, e.xmm0, 31); e.vpaddd(e.xmm0, e.xmm0, e.xmm1); // xmm0 = (0xFF800000 | 23 explicit mantissa bits), or 0 if overflowed @@ -63,13 +71,13 @@ struct VECTOR_CONVERT_I2F // Convert from signed integer to float. // xmm1 = [0x00000000, 0x7FFFFFFF] case result - e.vcvtdq2ps(e.xmm1, i.src1); + e.vcvtdq2ps(e.xmm1, src1); // Merge the two ways depending on whether the number is >= 0x80000000 // (has high bit set). - e.vblendvps(i.dest, e.xmm1, e.xmm0, i.src1); + e.vblendvps(i.dest, e.xmm1, e.xmm0, src1); } else { - e.vcvtdq2ps(i.dest, i.src1); + e.vcvtdq2ps(i.dest, src1); } } }; @@ -82,9 +90,11 @@ struct VECTOR_CONVERT_F2I : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); if (i.instr->flags & ARITHMETIC_UNSIGNED) { // clamp to min 0 - e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero)); + e.vmaxps(e.xmm0, src1, e.GetXmmConstPtr(XMMZero)); // xmm1 = mask of values >= (unsigned)INT_MIN e.vcmpgeps(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); @@ -108,14 +118,14 @@ struct VECTOR_CONVERT_F2I e.vpor(i.dest, i.dest, e.xmm0); } else { // xmm2 = NaN mask - e.vcmpunordps(e.xmm2, i.src1, i.src1); + e.vcmpunordps(e.xmm2, src1, src1); // convert packed floats to packed dwords - e.vcvttps2dq(e.xmm0, i.src1); + e.vcvttps2dq(e.xmm0, src1); // (high bit) xmm1 = dest is indeterminate and i.src1 >= 0 e.vpcmpeqd(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMIntMin)); - e.vpandn(e.xmm1, i.src1, e.xmm1); + e.vpandn(e.xmm1, src1, e.xmm1); // saturate positive values e.vblendvps(i.dest, e.xmm0, e.GetXmmConstPtr(XMMIntMax), e.xmm1); @@ -131,6 +141,7 @@ struct VECTOR_DENORMFLUSH : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); e.vxorps(e.xmm1, e.xmm1, e.xmm1); // 0.25 P0123 e.vandps(e.xmm0, i.src1, @@ -352,6 +363,7 @@ struct VECTOR_COMPARE_EQ_V128 e.vpcmpeqd(dest, src1, src2); break; case FLOAT32_TYPE: + e.ChangeMxcsrMode(MXCSRMode::Vmx); e.vcmpeqps(dest, src1, src2); break; } @@ -380,6 +392,7 @@ struct VECTOR_COMPARE_SGT_V128 e.vpcmpgtd(dest, src1, src2); break; case FLOAT32_TYPE: + e.ChangeMxcsrMode(MXCSRMode::Vmx); e.vcmpgtps(dest, src1, src2); break; } @@ -414,6 +427,7 @@ struct VECTOR_COMPARE_SGE_V128 e.vpor(dest, e.xmm0); break; case FLOAT32_TYPE: + e.ChangeMxcsrMode(MXCSRMode::Vmx); e.vcmpgeps(dest, src1, src2); break; } @@ -441,6 +455,7 @@ struct VECTOR_COMPARE_UGT_V128 sign_addr = e.GetXmmConstPtr(XMMSignMaskI32); break; case 
FLOAT32_TYPE:
+        e.ChangeMxcsrMode(MXCSRMode::Vmx);
         sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
         break;
       default:

@@ -498,6 +513,7 @@ struct VECTOR_COMPARE_UGE_V128
         sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
         break;
       case FLOAT32_TYPE:
+        e.ChangeMxcsrMode(MXCSRMode::Vmx);
         sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
         break;
     }

@@ -620,6 +636,7 @@ struct VECTOR_ADD
       case FLOAT32_TYPE:
         assert_false(is_unsigned);
         assert_false(saturate);
+        e.ChangeMxcsrMode(MXCSRMode::Vmx);
         e.vaddps(dest, src1, src2);
         break;
       default:

@@ -711,6 +728,7 @@ struct VECTOR_SUB
         }
         break;
       case FLOAT32_TYPE:
+        e.ChangeMxcsrMode(MXCSRMode::Vmx);
         e.vsubps(dest, src1, src2);
         break;
       default:

@@ -2003,6 +2021,7 @@ EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE, SWIZZLE);
 // ============================================================================
 struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.ChangeMxcsrMode(MXCSRMode::Vmx);
     switch (i.instr->flags & PACK_TYPE_MODE) {
       case PACK_TYPE_D3DCOLOR:
         EmitD3DCOLOR(e, i);
         break;

@@ -2062,9 +2081,14 @@ struct PACK
     alignas(16) uint16_t b[8];
     _mm_store_ps(a, src1);
     std::memset(b, 0, sizeof(b));
-
-    for (int i = 0; i < 2; i++) {
-      b[7 - i] = half_float::detail::float2half<std::round_toward_zero>(a[i]);
+    if (!cvars::use_extended_range_half) {
+      for (int i = 0; i < 2; i++) {
+        b[7 - i] =
+            half_float::detail::float2half<std::round_toward_zero>(a[i]);
+      }
+    } else {
+      for (int i = 0; i < 2; i++) {
+        b[7 - i] = float_to_xenos_half(a[i]);
+      }
     }

     return _mm_load_si128(reinterpret_cast<__m128i*>(b));
   }

@@ -2074,7 +2098,7 @@
     // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx

     // dest = [(src1.x | src1.y), 0, 0, 0]
-    if (e.IsFeatureEnabled(kX64EmitF16C)) {
+    if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
       Xmm src;
       if (i.src1.is_constant) {
         src = i.dest;

@@ -2101,10 +2125,15 @@
     alignas(16) uint16_t b[8];
     _mm_store_ps(a, src1);
     std::memset(b, 0, sizeof(b));
-
-    for (int i = 0; i < 4; i++) {
-      b[7 - (i ^ 2)] =
-          half_float::detail::float2half<std::round_toward_zero>(a[i]);
+    if (!cvars::use_extended_range_half) {
+      for (int i = 0; i < 4; i++) {
+        b[7 - (i ^ 2)] =
+            half_float::detail::float2half<std::round_toward_zero>(a[i]);
+      }
+    } else {
+      for (int i = 0; i < 4; i++) {
+        b[7 - (i ^ 2)] = float_to_xenos_half(a[i]);
+      }
     }

     return _mm_load_si128(reinterpret_cast<__m128i*>(b));
   }

@@ -2113,7 +2142,7 @@
     assert_true(i.src2.value->IsConstantZero());
     // dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0]
-    if (e.IsFeatureEnabled(kX64EmitF16C)) {
+    if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) {
       Xmm src;
       if (i.src1.is_constant) {
         src = i.dest;

@@ -2420,6 +2449,7 @@ EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK);
 // ============================================================================
 struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.ChangeMxcsrMode(MXCSRMode::Vmx);
     switch (i.instr->flags & PACK_TYPE_MODE) {
       case PACK_TYPE_D3DCOLOR:
         EmitD3DCOLOR(e, i);
         break;

@@ -2478,10 +2508,15 @@
     alignas(16) float b[4];
     _mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
-    for (int i = 0; i < 2; i++) {
-      b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]);
+    if (!cvars::use_extended_range_half) {
+      for (int i = 0; i < 2; i++) {
+        b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]);
+      }
+    } else {
+      for (int i = 0; i < 2; i++) {
+        b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]);
+      }
     }
-    // Constants, or something
     b[2] = 0.f;
     b[3] = 1.f;

@@ -2501,7 +2536,9 @@
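// The float_to_xenos_half/xenos_half_to_float helpers used in the scalar
// PACK/UNPACK paths above are defined elsewhere in the tree. For reference,
// a minimal sketch of the decode direction, assuming the Xenos format is
// IEEE-half-like (1 sign / 5 exponent / 10 mantissa bits) except that the
// all-ones exponent encodes ordinary values instead of Inf/NaN, which is what
// gives the extended range (the function name here is hypothetical):
//
//   static float xenos_half_to_float_sketch(uint16_t h) {
//     uint32_t exp = (h >> 10) & 0x1F, man = h & 0x3FF;
//     float mag = exp == 0
//                     ? std::ldexp(static_cast<float>(man), -24)  // denormal
//                     : std::ldexp(1.0f + man * (1.0f / 1024.0f),
//                                  static_cast<int>(exp) - 15);
//     return (h & 0x8000) ? -mag : mag;  // exp == 31 is just a large value
//   }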
struct UNPACK : Sequence> { // Also zero out the high end. // TODO(benvanik): special case constant unpacks that just get 0/1/etc. - if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (e.IsFeatureEnabled(kX64EmitF16C) && + !cvars::use_extended_range_half) { // todo: can use cvtph and bit logic + // to implement Xmm src; if (i.src1.is_constant) { src = i.dest; @@ -2534,16 +2571,21 @@ struct UNPACK : Sequence> { alignas(16) uint16_t a[8]; alignas(16) float b[4]; _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - - for (int i = 0; i < 4; i++) { - b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); + if (!cvars::use_extended_range_half) { + for (int i = 0; i < 4; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); + } + } else { + for (int i = 0; i < 4; i++) { + b[i] = xenos_half_to_float(a[VEC128_W(4 + i)]); + } } return _mm_load_ps(b); } static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] - if (e.IsFeatureEnabled(kX64EmitF16C)) { + if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) { Xmm src; if (i.src1.is_constant) { src = i.dest; @@ -2805,6 +2847,32 @@ struct UNPACK : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK); +struct SET_NJM_I8 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr_vmx = e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx)); + + addr_vmx.setBit(32); + if (i.src1.is_constant) { + if (i.src1.constant() == 0) { + // turn off daz/flush2z + e.mov(addr_vmx, _MM_MASK_MASK); + + } else { + e.mov(addr_vmx, DEFAULT_VMX_MXCSR); + } + + } else { + e.test(i.src1, i.src1); + e.mov(e.edx, DEFAULT_VMX_MXCSR); + e.mov(e.eax, _MM_MASK_MASK); + + e.cmove(e.edx, e.eax); + e.mov(addr_vmx, e.edx); + } + e.ChangeMxcsrMode(MXCSRMode::Vmx); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SET_NJM, SET_NJM_I8); } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index bbd5f6d21..0ccd7d441 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -137,7 +137,12 @@ struct ASSIGN_F64 : Sequence> { }; struct ASSIGN_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vmovaps(i.dest, i.src1); + SimdDomain domain = e.DeduceSimdDomain(i.src1.value); + if (domain == SimdDomain::INTEGER) { + e.vmovdqa(i.dest, i.src1); + } else { + e.vmovaps(i.dest, i.src1); + } } }; EMITTER_OPCODE_TABLE(OPCODE_ASSIGN, ASSIGN_I8, ASSIGN_I16, ASSIGN_I32, @@ -304,38 +309,44 @@ EMITTER_OPCODE_TABLE(OPCODE_TRUNCATE, TRUNCATE_I8_I16, TRUNCATE_I8_I32, struct CONVERT_I32_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); // TODO(benvanik): saturation check? cvtt* (trunc?) + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); if (i.instr->flags == ROUND_TO_ZERO) { - e.vcvttss2si(i.dest, i.src1); + e.vcvttss2si(i.dest, src1); } else { - e.vcvtss2si(i.dest, i.src1); + e.vcvtss2si(i.dest, src1); } } }; struct CONVERT_I32_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); // Intel returns 0x80000000 if the double value does not fit within an int32 // PPC saturates the value instead. // So, we can clamp the double value to (double)0x7FFFFFFF. 
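+    // (Only the upper bound needs a clamp: for negative overflow, cvtsd2si
+    // already returns the "integer indefinite" value 0x80000000, which is
+    // exactly PPC's negative saturation result. One caveat: vminsd returns
+    // its second operand when the first is NaN, so NaN inputs take the
+    // clamped 0x7FFFFFFF path here.)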
- e.vminsd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMIntMaxPD)); + e.vminsd(e.xmm1, GetInputRegOrConstant(e, i.src1, e.xmm0), + e.GetXmmConstPtr(XMMIntMaxPD)); if (i.instr->flags == ROUND_TO_ZERO) { - e.vcvttsd2si(i.dest, e.xmm0); + e.vcvttsd2si(i.dest, e.xmm1); } else { - e.vcvtsd2si(i.dest, e.xmm0); + e.vcvtsd2si(i.dest, e.xmm1); } } }; struct CONVERT_I64_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); e.xor_(e.eax, e.eax); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); - e.vcomisd(i.src1, e.GetXmmConstPtr(XmmConst::XMMZero)); + e.vcomisd(src1, e.GetXmmConstPtr(XmmConst::XMMZero)); if (i.instr->flags == ROUND_TO_ZERO) { - e.vcvttsd2si(i.dest, i.src1); + e.vcvttsd2si(i.dest, src1); } else { - e.vcvtsd2si(i.dest, i.src1); + e.vcvtsd2si(i.dest, src1); } // cf set if less than e.setnc(e.cl); @@ -349,28 +360,40 @@ struct CONVERT_I64_F64 struct CONVERT_F32_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); // TODO(benvanik): saturation check? cvtt* (trunc?) - e.vcvtsi2ss(i.dest, i.src1); + // e.vcvtsi2ss(i.dest, GetInputRegOrConstant(e, i.src1, e.xmm0)); + + assert_impossible_sequence(CONVERT_F32_I32); } }; struct CONVERT_F32_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); // TODO(benvanik): saturation check? cvtt* (trunc?) - e.vcvtsd2ss(i.dest, i.src1); + e.vcvtsd2ss(i.dest, GetInputRegOrConstant(e, i.src1, e.xmm0)); } }; struct CONVERT_F64_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); + + Reg64 input = i.src1; + if (i.src1.is_constant) { + input = e.rax; + e.mov(input, (uintptr_t)i.src1.constant()); + } // TODO(benvanik): saturation check? cvtt* (trunc?) - e.vcvtsi2sd(i.dest, i.src1); + e.vcvtsi2sd(i.dest, input); } }; struct CONVERT_F64_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vcvtss2sd(i.dest, i.src1); + e.ChangeMxcsrMode(MXCSRMode::Fpu); + e.vcvtss2sd(i.dest, GetInputRegOrConstant(e, i.src1, e.xmm0)); } }; EMITTER_OPCODE_TABLE(OPCODE_CONVERT, CONVERT_I32_F32, CONVERT_I32_F64, @@ -380,19 +403,21 @@ EMITTER_OPCODE_TABLE(OPCODE_CONVERT, CONVERT_I32_F32, CONVERT_I32_F64, struct TOSINGLE_F64_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - /* todo: - manually round, honestly might be faster than this. 
this sequence takes > - 6 cycles on zen 2 we can also get closer to the correct behavior by - manually rounding: - https://randomascii.wordpress.com/2019/03/20/exercises-in-emulation-xbox-360s-fma-instruction/ + e.ChangeMxcsrMode(MXCSRMode::Fpu); + + Xmm srcreg = GetInputRegOrConstant(e, i.src1, e.xmm1); - */ if (cvars::no_round_to_single) { - if (i.dest != i.src1) { - e.vmovapd(i.dest, i.src1); + if (i.dest != i.src1 || i.src1.is_constant) { + e.vmovapd(i.dest, srcreg); } + } else { - e.vcvtsd2ss(e.xmm0, i.src1); + /* + i compared the results for this cvtss/cvtsd to results generated + on actual hardware, it looks good to me + */ + e.vcvtsd2ss(e.xmm0, srcreg); e.vcvtss2sd(i.dest, e.xmm0); } } @@ -403,6 +428,11 @@ EMITTER_OPCODE_TABLE(OPCODE_TO_SINGLE, TOSINGLE_F64_F64); // ============================================================================ struct ROUND_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { +#if 1 + assert_impossible_sequence(ROUND_F32); +#else + // likely dead code + e.ChangeMxcsrMode(MXCSRMode::Fpu); switch (i.instr->flags) { case ROUND_TO_ZERO: e.vroundss(i.dest, i.src1, 0b00000011); @@ -417,40 +447,46 @@ struct ROUND_F32 : Sequence> { e.vroundss(i.dest, i.src1, 0b00000010); break; } +#endif } }; struct ROUND_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); switch (i.instr->flags) { case ROUND_TO_ZERO: - e.vroundsd(i.dest, i.src1, 0b00000011); + e.vroundsd(i.dest, src1, 0b00000011); break; case ROUND_TO_NEAREST: - e.vroundsd(i.dest, i.src1, 0b00000000); + e.vroundsd(i.dest, src1, 0b00000000); break; case ROUND_TO_MINUS_INFINITY: - e.vroundsd(i.dest, i.src1, 0b00000001); + e.vroundsd(i.dest, src1, 0b00000001); break; case ROUND_TO_POSITIVE_INFINITY: - e.vroundsd(i.dest, i.src1, 0b00000010); + e.vroundsd(i.dest, src1, 0b00000010); break; } } }; struct ROUND_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // likely dead code + e.ChangeMxcsrMode(MXCSRMode::Vmx); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); switch (i.instr->flags) { case ROUND_TO_ZERO: - e.vroundps(i.dest, i.src1, 0b00000011); + e.vroundps(i.dest, src1, 0b00000011); break; case ROUND_TO_NEAREST: - e.vroundps(i.dest, i.src1, 0b00000000); + e.vroundps(i.dest, src1, 0b00000000); break; case ROUND_TO_MINUS_INFINITY: - e.vroundps(i.dest, i.src1, 0b00000001); + e.vroundps(i.dest, src1, 0b00000001); break; case ROUND_TO_POSITIVE_INFINITY: - e.vroundps(i.dest, i.src1, 0b00000010); + e.vroundps(i.dest, src1, 0b00000010); break; } } @@ -511,6 +547,7 @@ EMITTER_OPCODE_TABLE(OPCODE_CONTEXT_BARRIER, CONTEXT_BARRIER); // ============================================================================ struct MAX_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vmaxss(dest, src1, src2); @@ -519,6 +556,7 @@ struct MAX_F32 : Sequence> { }; struct MAX_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vmaxsd(dest, src1, src2); @@ -527,6 +565,7 @@ struct MAX_F64 : Sequence> { }; struct MAX_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm 
src1, Xmm src2) { e.vmaxps(dest, src1, src2); @@ -600,6 +639,7 @@ struct MIN_I64 : Sequence> { }; struct MIN_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vminss(dest, src1, src2); @@ -608,6 +648,7 @@ struct MIN_F32 : Sequence> { }; struct MIN_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vminsd(dest, src1, src2); @@ -616,6 +657,7 @@ struct MIN_F64 : Sequence> { }; struct MIN_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vminps(dest, src1, src2); @@ -694,6 +736,7 @@ struct SELECT_I64 struct SELECT_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); // TODO(benvanik): find a shorter sequence. // dest = src1 != 0 ? src2 : src3 e.movzx(e.eax, i.src1); @@ -718,6 +761,7 @@ struct SELECT_F32 struct SELECT_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); // dest = src1 != 0 ? src2 : src3 e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); @@ -741,6 +785,7 @@ struct SELECT_F64 struct SELECT_V128_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); // TODO(benvanik): find a shorter sequence. // dest = src1 != 0 ? src2 : src3 /* @@ -967,6 +1012,7 @@ EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32, // ============================================================================ struct IS_NAN_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); e.vucomiss(i.src1, i.src1); e.setp(i.dest); } @@ -974,6 +1020,7 @@ struct IS_NAN_F32 : Sequence> { struct IS_NAN_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); e.vucomisd(i.src1, i.src1); e.setp(i.dest); } @@ -1074,6 +1121,7 @@ struct COMPARE_EQ_I64 struct COMPARE_EQ_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); if (!HasPrecedingCmpOfSameValues(i.instr)) { EmitCommutativeBinaryXmmOp( e, i, @@ -1087,6 +1135,7 @@ struct COMPARE_EQ_F32 struct COMPARE_EQ_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); if (!HasPrecedingCmpOfSameValues(i.instr)) { EmitCommutativeBinaryXmmOp( e, i, @@ -1181,6 +1230,7 @@ struct COMPARE_NE_I64 struct COMPARE_NE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); if (!HasPrecedingCmpOfSameValues(i.instr)) { e.vcomiss(i.src1, i.src2); } @@ -1190,6 +1240,7 @@ struct COMPARE_NE_F32 struct COMPARE_NE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); if (!HasPrecedingCmpOfSameValues(i.instr)) { e.vcomisd(i.src1, i.src2); } @@ -1267,6 +1318,7 @@ EMITTER_ASSOCIATIVE_COMPARE_XX(UGE, setae, setbe); : Sequence> { \ static void Emit(X64Emitter& e, const EmitArgType& i) { \ + e.ChangeMxcsrMode(MXCSRMode::Fpu); \ if (!HasPrecedingCmpOfSameValues(i.instr)) { \ e.vcomiss(i.src1, i.src2); \ } \ @@ -1282,6 +1334,7 @@ 
EMITTER_ASSOCIATIVE_COMPARE_XX(UGE, setae, setbe); : Sequence> { \ static void Emit(X64Emitter& e, const EmitArgType& i) { \ + e.ChangeMxcsrMode(MXCSRMode::Fpu); \ if (!HasPrecedingCmpOfSameValues(i.instr)) { \ if (i.src1.is_constant) { \ e.LoadConstantXmm(e.xmm0, i.src1.constant()); \ @@ -1365,26 +1418,49 @@ struct ADD_I64 : Sequence> { }; struct ADD_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { +#if 1 + + assert_impossible_sequence(ADD_F32); +#else + e.ChangeMxcsrMode(MXCSRMode::Fpu); EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vaddss(dest, src1, src2); }); +#endif } }; struct ADD_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp(e, i, + e.ChangeMxcsrMode(MXCSRMode::Fpu); +#if 0 + EmitCommutativeBinaryXmmOp( + e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vaddsd(dest, src1, src2); }); +#else + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + e.vaddsd(i.dest, src1, src2); + +#endif } }; struct ADD_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); +#if 0 EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vaddps(dest, src1, src2); }); +#else + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + e.vaddps(i.dest, src1, src2); + +#endif } }; EMITTER_OPCODE_TABLE(OPCODE_ADD, ADD_I8, ADD_I16, ADD_I32, ADD_I64, ADD_F32, @@ -1484,29 +1560,35 @@ struct SUB_I64 : Sequence> { }; struct SUB_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + #if 1 + assert_impossible_sequence(SUB_F32); + #else assert_true(!i.instr->flags); + e.ChangeMxcsrMode(MXCSRMode::Fpu); EmitAssociativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vsubss(dest, src1, src2); }); + #endif } }; struct SUB_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { assert_true(!i.instr->flags); - EmitAssociativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vsubsd(dest, src1, src2); - }); + e.ChangeMxcsrMode(MXCSRMode::Fpu); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + e.vsubsd(i.dest, src1, src2); + } }; struct SUB_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { assert_true(!i.instr->flags); - EmitAssociativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vsubps(dest, src1, src2); - }); + e.ChangeMxcsrMode(MXCSRMode::Vmx); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + e.vsubps(i.dest, src1, src2); } }; EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, @@ -1519,6 +1601,9 @@ EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, // We exploit mulx here to avoid creating too much register pressure. struct MUL_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + #if 1 + assert_impossible_sequence(MUL_I8); + #else if (i.src1.is_constant || i.src2.is_constant) { uint64_t cval = i.src1.is_constant ? 
i.src1.constant() : i.src2.constant(); @@ -1568,6 +1653,7 @@ struct MUL_I8 : Sequence> { e.mov(i.dest, e.al); } } + #endif } }; struct MUL_I16 : Sequence> { @@ -1749,29 +1835,39 @@ struct MUL_I64 : Sequence> { }; struct MUL_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + #if 1 + + assert_impossible_sequence(MUL_F32); + + #else assert_true(!i.instr->flags); + + e.ChangeMxcsrMode(MXCSRMode::Fpu); EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vmulss(dest, src1, src2); }); + #endif } }; struct MUL_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { assert_true(!i.instr->flags); - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmulsd(dest, src1, src2); - }); + e.ChangeMxcsrMode(MXCSRMode::Fpu); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + e.vmulsd(i.dest, src1, src2); + } }; struct MUL_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { assert_true(!i.instr->flags); - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmulps(dest, src1, src2); - }); + e.ChangeMxcsrMode(MXCSRMode::Vmx); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + e.vmulps(i.dest, src1, src2); } }; EMITTER_OPCODE_TABLE(OPCODE_MUL, MUL_I8, MUL_I16, MUL_I32, MUL_I64, MUL_F32, @@ -2003,20 +2099,28 @@ struct DIV_I64 : Sequence> { }; struct DIV_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + #if 1 + assert_impossible_sequence(DIV_F32) + #else assert_true(!i.instr->flags); + e.ChangeMxcsrMode(MXCSRMode::Fpu); EmitAssociativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vdivss(dest, src1, src2); }); + #endif } }; struct DIV_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); - EmitAssociativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vdivsd(dest, src1, src2); - }); + e.ChangeMxcsrMode(MXCSRMode::Fpu); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + e.vdivsd(i.dest, src1, src2); + } }; struct DIV_V128 : Sequence> { @@ -2047,49 +2151,20 @@ struct MUL_ADD_F32 struct MUL_ADD_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // FMA extension + e.ChangeMxcsrMode(MXCSRMode::Fpu); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp(e, i, - [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { - Xmm src3 = - i.src3.is_constant ? 
e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmadd213sd(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmadd213sd(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmadd231sd(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovsd(i.dest, src1); - e.vfmadd213sd(i.dest, src2, src3); - } - }); + // todo: this is garbage + e.vmovapd(e.xmm3, src1); + e.vfmadd213sd(e.xmm3, src2, src3); + e.vmovapd(i.dest, e.xmm3); } else { - Xmm src3; - if (i.src3.is_constant) { - src3 = e.xmm1; - e.LoadConstantXmm(src3, i.src3.constant()); - } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - src3 = i.src3; - if (i.dest == i.src3) { - e.vmovsd(e.xmm1, i.src3); - src3 = e.xmm1; - } - } - - // Multiply operation is commutative. - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmulsd(dest, src1, src2); // $0 = $1 * $2 - }); - - e.vaddsd(i.dest, i.dest, src3); // $0 = $1 + $2 + // todo: might need to use x87 in this case... + e.vmulsd(e.xmm3, src1, src2); + e.vaddsd(i.dest, e.xmm3, src3); } } }; @@ -2097,57 +2172,20 @@ struct MUL_ADD_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): the vfmadd sequence produces slightly different results - // than vmul+vadd and it'd be nice to know why. Until we know, it's - // disabled so tests pass. - // chrispy: reenabled, i have added the DAZ behavior that was missing + e.ChangeMxcsrMode(MXCSRMode::Vmx); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp(e, i, - [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { - Xmm src3 = - i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmadd213ps(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmadd213ps(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmadd231ps(i.dest, src1, src2); - } else { - // Dest not equal to anything - // e.vmovdqa(i.dest, - // src1); - // chrispy: vmovdqa was a domain pipeline - // hazard - e.vmovaps(i.dest, src1); - e.vfmadd213ps(i.dest, src2, src3); - } - }); + // todo: this is garbage + e.vmovaps(e.xmm3, src1); + e.vfmadd213ps(e.xmm3, src2, src3); + e.vmovaps(i.dest, e.xmm3); } else { - Xmm src3; - if (i.src3.is_constant) { - src3 = e.xmm1; - e.LoadConstantXmm(src3, i.src3.constant()); - } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - src3 = i.src3; - if (i.dest == i.src3) { - // e.vmovdqa(e.xmm1, i.src3); - e.vmovaps(e.xmm1, i.src3); - src3 = e.xmm1; - } - } - - // Multiply operation is commutative. - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmulps(dest, src1, src2); // $0 = $1 * $2 - }); - - e.vaddps(i.dest, i.dest, src3); // $0 = $1 + $2 + // todo: might need to use x87 in this case... 
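+        // (Re the x87 note: without FMA hardware, the mul+add pair rounds
+        // twice where the guest's fused multiply-add rounds only once; x87's
+        // 80-bit registers keep enough extra mantissa bits to shrink, though
+        // not fully eliminate, that double-rounding difference for f32.)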
+ e.vmulps(e.xmm3, src1, src2); + e.vaddps(i.dest, e.xmm3, src3); } } }; @@ -2168,98 +2206,26 @@ EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128); struct MUL_SUB_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // FMA extension - if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp(e, i, - [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { - Xmm src3 = - i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmsub213ss(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmsub213ss(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmsub231ss(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovss(i.dest, src1); - e.vfmsub213ss(i.dest, src2, src3); - } - }); - } else { - Xmm src3; - if (i.src3.is_constant) { - src3 = e.xmm1; - e.LoadConstantXmm(src3, i.src3.constant()); - } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - src3 = i.src3; - if (i.dest == i.src3) { - e.vmovss(e.xmm1, i.src3); - src3 = e.xmm1; - } - } - - // Multiply operation is commutative. - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmulss(dest, src1, src2); // $0 = $1 * $2 - }); - - e.vsubss(i.dest, i.dest, src3); // $0 = $1 - $2 - } + assert_impossible_sequence(MUL_SUB_F32); } }; struct MUL_SUB_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // FMA extension + e.ChangeMxcsrMode(MXCSRMode::Fpu); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp(e, i, - [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { - Xmm src3 = - i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmsub213sd(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmsub213sd(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmsub231sd(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovsd(i.dest, src1); - e.vfmsub213sd(i.dest, src2, src3); - } - }); + // todo: this is garbage + e.vmovapd(e.xmm3, src1); + e.vfmsub213sd(e.xmm3, src2, src3); + e.vmovapd(i.dest, e.xmm3); } else { - Xmm src3; - if (i.src3.is_constant) { - src3 = e.xmm1; - e.LoadConstantXmm(src3, i.src3.constant()); - } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - src3 = i.src3; - if (i.dest == i.src3) { - e.vmovsd(e.xmm1, i.src3); - src3 = e.xmm1; - } - } - - // Multiply operation is commutative. - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmulsd(dest, src1, src2); // $0 = $1 * $2 - }); - - e.vsubsd(i.dest, i.dest, src3); // $0 = $1 - $2 + // todo: might need to use x87 in this case... 
+ e.vmulsd(e.xmm3, src1, src2); + e.vsubsd(i.dest, e.xmm3, src3); } } }; @@ -2267,49 +2233,20 @@ struct MUL_SUB_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // FMA extension + e.ChangeMxcsrMode(MXCSRMode::Vmx); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp(e, i, - [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { - Xmm src3 = - i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmsub213ps(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmsub213ps(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmsub231ps(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovdqa(i.dest, src1); - e.vfmsub213ps(i.dest, src2, src3); - } - }); + // todo: this is garbage + e.vmovaps(e.xmm3, src1); + e.vfmsub213ps(e.xmm3, src2, src3); + e.vmovaps(i.dest, e.xmm3); } else { - Xmm src3; - if (i.src3.is_constant) { - src3 = e.xmm1; - e.LoadConstantXmm(src3, i.src3.constant()); - } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - src3 = i.src3; - if (i.dest == i.src3) { - e.vmovdqa(e.xmm1, i.src3); - src3 = e.xmm1; - } - } - - // Multiply operation is commutative. - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmulps(dest, src1, src2); // $0 = $1 * $2 - }); - - e.vsubps(i.dest, i.dest, src3); // $0 = $1 - $2 + // todo: might need to use x87 in this case... + e.vmulps(e.xmm3, src1, src2); + e.vsubps(i.dest, e.xmm3, src3); } } }; @@ -2346,17 +2283,20 @@ struct NEG_I64 : Sequence> { }; struct NEG_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS)); } }; struct NEG_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); e.vxorpd(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPD)); } }; struct NEG_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { assert_true(!i.instr->flags); + e.ChangeMxcsrMode(MXCSRMode::Vmx); e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS)); } }; @@ -2368,16 +2308,19 @@ EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32, // ============================================================================ struct ABS_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; struct ABS_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); e.vandpd(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD)); } }; struct ABS_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); } }; @@ -2388,17 +2331,21 @@ EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128); // ============================================================================ struct SQRT_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vsqrtss(i.dest, i.src1); + e.ChangeMxcsrMode(MXCSRMode::Fpu); + + e.vsqrtss(i.dest, GetInputRegOrConstant(e, i.src1, e.xmm0)); } }; struct 
SQRT_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vsqrtsd(i.dest, i.src1); + e.ChangeMxcsrMode(MXCSRMode::Fpu); + e.vsqrtsd(i.dest, GetInputRegOrConstant(e, i.src1, e.xmm0)); } }; struct SQRT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vsqrtps(i.dest, i.src1); + e.ChangeMxcsrMode(MXCSRMode::Vmx); + e.vsqrtps(i.dest, GetInputRegOrConstant(e, i.src1, e.xmm0)); } }; EMITTER_OPCODE_TABLE(OPCODE_SQRT, SQRT_F32, SQRT_F64, SQRT_V128); @@ -2410,33 +2357,40 @@ EMITTER_OPCODE_TABLE(OPCODE_SQRT, SQRT_F32, SQRT_F64, SQRT_V128); // < 1.5*2^-12 ≈ 1/2730 for vrsqrtps. struct RSQRT_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); + if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { - e.vrsqrt14ss(i.dest, i.src1, i.src1); + e.vrsqrt14ss(i.dest, src1, src1); } else { e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne)); - e.vsqrtss(e.xmm1, i.src1, i.src1); + e.vsqrtss(e.xmm1, src1, src1); e.vdivss(i.dest, e.xmm0, e.xmm1); } } }; struct RSQRT_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { - e.vrsqrt14sd(i.dest, i.src1, i.src1); + e.vrsqrt14sd(i.dest, src1, src1); } else { e.vmovapd(e.xmm0, e.GetXmmConstPtr(XMMOnePD)); - e.vsqrtsd(e.xmm1, i.src1, i.src1); + e.vsqrtsd(e.xmm1, src1, src1); e.vdivsd(i.dest, e.xmm0, e.xmm1); } } }; struct RSQRT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { - e.vrsqrt14ps(i.dest, i.src1); + e.vrsqrt14ps(i.dest, src1); } else { e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne)); - e.vsqrtps(e.xmm1, i.src1); + e.vsqrtps(e.xmm1, src1); e.vdivps(i.dest, e.xmm0, e.xmm1); } } @@ -2451,31 +2405,37 @@ EMITTER_OPCODE_TABLE(OPCODE_RSQRT, RSQRT_F32, RSQRT_F64, RSQRT_V128); // spawning, breaks cactus collision as well as flickering grass in 5454082B struct RECIP_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { - e.vrcp14ss(i.dest, i.src1, i.src1); + e.vrcp14ss(i.dest, src1, src1); } else { e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne)); - e.vdivss(i.dest, e.xmm0, i.src1); + e.vdivss(i.dest, e.xmm0, src1); } } }; struct RECIP_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { - e.vrcp14sd(i.dest, i.src1, i.src1); + e.vrcp14sd(i.dest, src1, src1); } else { e.vmovapd(e.xmm0, e.GetXmmConstPtr(XMMOnePD)); - e.vdivsd(i.dest, e.xmm0, i.src1); + e.vdivsd(i.dest, e.xmm0, src1); } } }; struct RECIP_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { - e.vrcp14ps(i.dest, i.src1); + e.vrcp14ps(i.dest, src1); } else { e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne)); - e.vdivps(i.dest, e.xmm0, i.src1); + e.vdivps(i.dest, e.xmm0, src1); } } }; @@ -2487,31 +2447,13 @@ EMITTER_OPCODE_TABLE(OPCODE_RECIP, RECIP_F32, RECIP_F64, RECIP_V128); // 
TODO(benvanik): use approx here: // https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html struct POW2_F32 : Sequence> { - static __m128 EmulatePow2(void*, __m128 src) { - float src_value; - _mm_store_ss(&src_value, src); - float result = std::exp2(src_value); - return _mm_load_ss(&result); - } static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_always(); - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulatePow2)); - e.vmovaps(i.dest, e.xmm0); + assert_impossible_sequence(POW2_F32); } }; struct POW2_F64 : Sequence> { - static __m128d EmulatePow2(void*, __m128d src) { - double src_value; - _mm_store_sd(&src_value, src); - double result = std::exp2(src_value); - return _mm_load_sd(&result); - } static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_always(); - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulatePow2)); - e.vmovaps(i.dest, e.xmm0); + assert_impossible_sequence(POW2_F64); } }; struct POW2_V128 : Sequence> { @@ -2524,7 +2466,10 @@ struct POW2_V128 : Sequence> { return _mm_load_ps(values); } static void Emit(X64Emitter& e, const EmitArgType& i) { - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + e.ChangeMxcsrMode(MXCSRMode::Vmx); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); + e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } @@ -2538,39 +2483,13 @@ EMITTER_OPCODE_TABLE(OPCODE_POW2, POW2_F32, POW2_F64, POW2_V128); // https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html // TODO(benvanik): this emulated fn destroys all xmm registers! don't do it! struct LOG2_F32 : Sequence> { - static __m128 EmulateLog2(void*, __m128 src) { - float src_value; - _mm_store_ss(&src_value, src); - float result = std::log2(src_value); - return _mm_load_ss(&result); - } static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_always(); - if (i.src1.is_constant) { - e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant())); - } else { - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - } - e.CallNativeSafe(reinterpret_cast(EmulateLog2)); - e.vmovaps(i.dest, e.xmm0); + assert_impossible_sequence(LOG2_F32); } }; struct LOG2_F64 : Sequence> { - static __m128d EmulateLog2(void*, __m128d src) { - double src_value; - _mm_store_sd(&src_value, src); - double result = std::log2(src_value); - return _mm_load_sd(&result); - } static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_always(); - if (i.src1.is_constant) { - e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant())); - } else { - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - } - e.CallNativeSafe(reinterpret_cast(EmulateLog2)); - e.vmovaps(i.dest, e.xmm0); + assert_impossible_sequence(LOG2_F64); } }; struct LOG2_V128 : Sequence> { @@ -2583,11 +2502,11 @@ struct LOG2_V128 : Sequence> { return _mm_load_ps(values); } static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src1.is_constant) { - e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant())); - } else { - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - } + e.ChangeMxcsrMode(MXCSRMode::Vmx); + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + + e.lea(e.GetNativeParam(0), e.StashXmm(0, src1)); + e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } @@ -2601,6 +2520,7 @@ struct DOT_PRODUCT_3_V128 : Sequence> { static void Emit(X64Emitter& e, const 
@@ -2601,6 +2520,7 @@ struct DOT_PRODUCT_3_V128 : Sequence<DOT_PRODUCT_3_V128, I<OPCODE_DOT_PRODUCT_3, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.ChangeMxcsrMode(MXCSRMode::Vmx);
     // todo: add fast_dot_product path that just checks for infinity instead
     // of using mxcsr
     auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
@@ -2716,6 +2636,7 @@ struct DOT_PRODUCT_4_V128 : Sequence<DOT_PRODUCT_4_V128, I<OPCODE_DOT_PRODUCT_4, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.ChangeMxcsrMode(MXCSRMode::Vmx);
     // todo: add fast_dot_product path that just checks for infinity instead
     // of using mxcsr
     auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64];
@@ -3469,46 +3390,12 @@ EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP, BYTE_SWAP_I16, BYTE_SWAP_I32,
 // ============================================================================
 struct CNTLZ_I8 : Sequence<CNTLZ_I8, I<OPCODE_CNTLZ, I8Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    if (e.IsFeatureEnabled(kX64EmitLZCNT)) {
-      // No 8bit lzcnt, so do 16 and sub 8.
-      e.movzx(i.dest.reg().cvt16(), i.src1);
-      e.lzcnt(i.dest.reg().cvt16(), i.dest.reg().cvt16());
-      e.sub(i.dest, 8);
-    } else {
-      Xbyak::Label end;
-      e.inLocalLabel();
-
-      e.bsr(e.rax, i.src1);  // ZF set if i.src1 is 0
-      e.mov(i.dest, 0x8);
-      e.jz(end);
-
-      e.xor_(e.rax, 0x7);
-      e.mov(i.dest, e.rax);
-
-      e.L(end);
-      e.outLocalLabel();
-    }
+    assert_impossible_sequence(CNTLZ_I8);
   }
 };
 struct CNTLZ_I16 : Sequence<CNTLZ_I16, I<OPCODE_CNTLZ, I8Op, I16Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    if (e.IsFeatureEnabled(kX64EmitLZCNT)) {
-      // LZCNT: searches $2 until MSB 1 found, stores idx (from last bit) in $1
-      e.lzcnt(i.dest.reg().cvt32(), i.src1);
-    } else {
-      Xbyak::Label end;
-      e.inLocalLabel();
-
-      e.bsr(e.rax, i.src1);  // ZF set if i.src1 is 0
-      e.mov(i.dest, 0x10);
-      e.jz(end);
-
-      e.xor_(e.rax, 0x0F);
-      e.mov(i.dest, e.rax);
-
-      e.L(end);
-      e.outLocalLabel();
-    }
+    assert_impossible_sequence(CNTLZ_I16);
   }
 };
 struct CNTLZ_I32 : Sequence<CNTLZ_I32, I<OPCODE_CNTLZ, I8Op, I32Op>> {
@@ -3564,10 +3451,26 @@ struct SET_ROUNDING_MODE_I32 : Sequence<SET_ROUNDING_MODE_I32, I<OPCODE_SET_ROUNDING_MODE, VoidOp, I32Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.mov(e.rcx, i.src1);
-    e.and_(e.rcx, 0x7);
-    e.mov(e.rax, uintptr_t(mxcsr_table));
-    e.vldmxcsr(e.ptr[e.rax + e.rcx * 4]);
+    // removed the AND with 7 and hoisted it into the InstrEmit_'s that
+    // generate OPCODE_SET_ROUNDING_MODE so that it can be constant folded and
+    // backends don't have to worry about it
+    if (i.src1.is_constant) {
+      e.mov(e.eax, mxcsr_table[i.src1.constant()]);
+      e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64], e.eax);
+      e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax);
+      e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]);
+
+    } else {
+      e.mov(e.ecx, i.src1);
+
+      e.mov(e.rax, uintptr_t(mxcsr_table));
+      e.mov(e.edx, e.ptr[e.rax + e.rcx * 4]);
+      // previously missing: keep the saved mxcsr_fpu in sync with what is
+      // actually loaded
+      e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.edx);
+
+      e.vldmxcsr(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)));
+    }
+    e.ChangeMxcsrMode(MXCSRMode::Fpu, true);
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32);
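Both branches above reduce to a table load plus vldmxcsr: mxcsr_table, defined earlier in x64_sequences.cc outside this hunk, holds prebuilt MXCSR images indexed by the guest rounding bits. A sketch of the mapping under the standard MXCSR layout (RC field in bits 13-14, 0x1F80 = round-to-nearest with all exceptions masked); the exact table values are assumptions here:

    // PPC FPSCR[RN] -> MXCSR image. Eight entries because the index
    // presumably also carries the non-IEEE (flush-to-zero) bit.
    static const uint32_t mxcsr_table_sketch[8] = {
        0x1F80,  // 0b00: round to nearest
        0x7F80,  // 0b01: round toward zero      (RC = 0b11)
        0x5F80,  // 0b10: round toward +infinity (RC = 0b10)
        0x3F80,  // 0b11: round toward -infinity (RC = 0b01)
        0x9F80, 0xFF80, 0xDF80, 0xBF80,  // same four modes with FTZ set
    };

The closing ChangeMxcsrMode(MXCSRMode::Fpu, true) then records that the FPU image is the live one, the second argument presumably meaning the mxcsr register itself has already been loaded.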
diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
index b6985b1d8..a5fa40a04 100644
--- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
+++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
@@ -20,6 +20,9 @@
 DEFINE_bool(inline_mmio_access, true, "Inline constant MMIO loads and stores.",
             "CPU");
 
+DEFINE_bool(permit_float_constant_evaluation, false,
+            "Allow float constant evaluation; may produce incorrect results and break games' math",
+            "CPU");
+
 namespace xe {
 namespace cpu {
 namespace compiler {
@@ -68,8 +71,24 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
   result = false;
   auto block = builder->first_block();
   while (block) {
-    auto i = block->instr_head;
-    while (i) {
+    for (auto i = block->instr_head; i; i = i->next) {
+      if (((i->opcode->flags & OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) != 0) &&
+          !cvars::permit_float_constant_evaluation) {
+        continue;
+      }
+      bool might_be_floatop = false;
+
+      i->VisitValueOperands(
+          [&might_be_floatop](Value* current_opnd, uint32_t opnd_index) {
+            might_be_floatop |= current_opnd->MaybeFloaty();
+          });
+      if (i->dest) {
+        might_be_floatop |= i->dest->MaybeFloaty();
+      }
+
+      bool should_skip_because_of_float =
+          might_be_floatop && !cvars::permit_float_constant_evaluation;
+
       auto v = i->dest;
       switch (i->opcode->num) {
         case OPCODE_DEBUG_BREAK_TRUE:
@@ -452,7 +471,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
           break;
         case OPCODE_ADD:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
+              !should_skip_because_of_float) {
             v->set_from(i->src1.value);
             v->Add(i->src2.value);
             i->Remove();
@@ -481,7 +501,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
           }
           break;
         case OPCODE_SUB:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          if (i->src1.value->IsConstant() && i->src2.value->IsConstant() &&
+              !should_skip_because_of_float) {
             v->set_from(i->src1.value);
             v->Sub(i->src2.value);
             i->Remove();
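The new should_skip_because_of_float predicate exists because folding a float op at compile time bakes in whatever rounding mode is active when the fold happens, while the guest switches modes at runtime (see the mtfsf/mtfsfi changes later in this patch). A standalone illustration, not code from the pass:

    #include <cfenv>
    #include <cstdio>
    // 1.0f / 3.0f is inexact, so the last mantissa bit of the quotient
    // depends on the rounding mode in effect when the divide executes:
    //   FE_TONEAREST  -> 0x3EAAAAAB (0.333333343...)
    //   FE_TOWARDZERO -> 0x3EAAAAAA (0.333333313...)
    int main() {
      std::fesetround(FE_TOWARDZERO);
      volatile float a = 1.0f, b = 3.0f;  // volatile defeats host-side folding
      std::printf("%.9g\n", a / b);       // prints 0.333333313
    }

With the cvar left at its default of false, any instruction whose dest or operands MaybeFloaty() keeps its runtime evaluation.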
@@ -489,32 +510,34 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
           }
           break;
         case OPCODE_MUL:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
-            v->set_from(i->src1.value);
-            v->Mul(i->src2.value);
-            i->Remove();
-            result = true;
-          } else if (i->src1.value->IsConstant() ||
-                     i->src2.value->IsConstant()) {
-            // Reorder the sources to make things simpler.
-            // s1 = non-const, s2 = const
-            auto s1 =
-                i->src1.value->IsConstant() ? i->src2.value : i->src1.value;
-            auto s2 =
-                i->src1.value->IsConstant() ? i->src1.value : i->src2.value;
-
-            // Multiplication by one = no-op
-            if (s2->type != VEC128_TYPE && s2->IsConstantOne()) {
-              i->Replace(&OPCODE_ASSIGN_info, 0);
-              i->set_src1(s1);
+          if (!should_skip_because_of_float) {
+            if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+              v->set_from(i->src1.value);
+              v->Mul(i->src2.value);
+              i->Remove();
               result = true;
-            } else if (s2->type == VEC128_TYPE) {
-              auto& c = s2->constant;
-              if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f &&
-                  c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) {
+            } else if (i->src1.value->IsConstant() ||
+                       i->src2.value->IsConstant()) {
+              // Reorder the sources to make things simpler.
+              // s1 = non-const, s2 = const
+              auto s1 =
+                  i->src1.value->IsConstant() ? i->src2.value : i->src1.value;
+              auto s2 =
+                  i->src1.value->IsConstant() ? i->src1.value : i->src2.value;
+
+              // Multiplication by one = no-op
+              if (s2->type != VEC128_TYPE && s2->IsConstantOne()) {
                 i->Replace(&OPCODE_ASSIGN_info, 0);
                 i->set_src1(s1);
                 result = true;
+              } else if (s2->type == VEC128_TYPE) {
+                auto& c = s2->constant;
+                if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f &&
+                    c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) {
+                  i->Replace(&OPCODE_ASSIGN_info, 0);
+                  i->set_src1(s1);
+                  result = true;
+                }
               }
             }
           }
@@ -528,75 +551,32 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
           }
           break;
         case OPCODE_DIV:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
-            v->set_from(i->src1.value);
-            v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
-            i->Remove();
-            result = true;
-          } else if (i->src2.value->IsConstant()) {
-            // Division by one = no-op.
-            Value* src1 = i->src1.value;
-            if (i->src2.value->type != VEC128_TYPE &&
-                i->src2.value->IsConstantOne()) {
-              i->Replace(&OPCODE_ASSIGN_info, 0);
-              i->set_src1(src1);
+          if (!should_skip_because_of_float) {
+            if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+              v->set_from(i->src1.value);
+              v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
+              i->Remove();
              result = true;
-            } else if (i->src2.value->type == VEC128_TYPE) {
-              auto& c = i->src2.value->constant;
-              if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f &&
-                  c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) {
+            } else if (i->src2.value->IsConstant()) {
+              // Division by one = no-op.
+              Value* src1 = i->src1.value;
+              if (i->src2.value->type != VEC128_TYPE &&
+                  i->src2.value->IsConstantOne()) {
                 i->Replace(&OPCODE_ASSIGN_info, 0);
                 i->set_src1(src1);
                 result = true;
+              } else if (i->src2.value->type == VEC128_TYPE) {
+                auto& c = i->src2.value->constant;
+                if (c.v128.f32[0] == 1.f && c.v128.f32[1] == 1.f &&
+                    c.v128.f32[2] == 1.f && c.v128.f32[3] == 1.f) {
+                  i->Replace(&OPCODE_ASSIGN_info, 0);
+                  i->set_src1(src1);
+                  result = true;
+                }
              }
            }
          }
           break;
-        case OPCODE_MUL_ADD:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
-            if (i->src3.value->IsConstant()) {
-              v->set_from(i->src1.value);
-              Value::MulAdd(v, i->src1.value, i->src2.value, i->src3.value);
-              i->Remove();
-              result = true;
-            } else {
-              // Multiply part is constant.
-              Value* mul = builder->AllocValue();
-              mul->set_from(i->src1.value);
-              mul->Mul(i->src2.value);
-
-              Value* add = i->src3.value;
-              i->Replace(&OPCODE_ADD_info, 0);
-              i->set_src1(mul);
-              i->set_src2(add);
-
-              result = true;
-            }
-          }
-          break;
-        case OPCODE_MUL_SUB:
-          if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
-            // Multiply part is constant.
-            if (i->src3.value->IsConstant()) {
-              v->set_from(i->src1.value);
-              Value::MulSub(v, i->src1.value, i->src2.value, i->src3.value);
-              i->Remove();
-              result = true;
-            } else {
-              // Multiply part is constant.
-              Value* mul = builder->AllocValue();
-              mul->set_from(i->src1.value);
-              mul->Mul(i->src2.value);
-
-              Value* add = i->src3.value;
-              i->Replace(&OPCODE_SUB_info, 0);
-              i->set_src1(mul);
-              i->set_src2(add);
-
-              result = true;
-            }
-          }
-          break;
         case OPCODE_MAX:
           if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
             v->set_from(i->src1.value);
@@ -925,7 +905,8 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
             result = true;
           }
           break;
-        case OPCODE_VECTOR_DENORMFLUSH:
+        case OPCODE_VECTOR_DENORMFLUSH:  // this one is okay to constant
+                                         // evaluate, since it is just bit math
           if (i->src1.value->IsConstant()) {
             v->set_from(i->src1.value);
             v->DenormalFlush();
@@ -933,19 +914,10 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
             result = true;
           }
           break;
-        case OPCODE_TO_SINGLE:
-          if (i->src1.value->IsConstant()) {
-            v->set_from(i->src1.value);
-            v->ToSingle();
-            i->Remove();
-            result = true;
-          }
-          break;
         default:
           // Ignored.
           break;
       }
-      i = i->next;
     }
 
     block = block->next;
diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc
index 03e73ca1b..760b7fc2c 100644
--- a/src/xenia/cpu/hir/hir_builder.cc
+++ b/src/xenia/cpu/hir/hir_builder.cc
@@ -1287,7 +1287,11 @@ void HIRBuilder::SetRoundingMode(Value* value) {
   Instr* i = AppendInstr(OPCODE_SET_ROUNDING_MODE_info, 0);
   i->set_src1(value);
 }
-
+void HIRBuilder::SetNJM(Value* value) {
+  ASSERT_INTEGER_TYPE(value);
+  Instr* i = AppendInstr(OPCODE_SET_NJM_info, 0);
+  i->set_src1(value);
+}
 Value* HIRBuilder::Max(Value* value1, Value* value2) {
   ASSERT_TYPES_EQUAL(value1, value2);
@@ -1632,7 +1636,7 @@ Value* HIRBuilder::Div(Value* value1, Value* value2,
 Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
   ASSERT_TYPES_EQUAL(value1, value2);
   ASSERT_TYPES_EQUAL(value1, value3);
-
+#if 0
   bool c1 = value1->IsConstant();
   bool c2 = value2->IsConstant();
   if (c1 && c2) {
@@ -1640,7 +1644,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
     dest->Mul(value2);
     return Add(dest, value3);
   }
-
+#endif
   Instr* i = AppendInstr(OPCODE_MUL_ADD_info, 0, AllocValue(value1->type));
   i->set_src1(value1);
   i->set_src2(value2);
@@ -1651,7 +1655,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
 Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
   ASSERT_TYPES_EQUAL(value1, value2);
   ASSERT_TYPES_EQUAL(value1, value3);
-
+#if 0
   bool c1 = value1->IsConstant();
   bool c2 = value2->IsConstant();
   if (c1 && c2) {
@@ -1659,7 +1663,7 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
     dest->Mul(value2);
     return Sub(dest, value3);
   }
-
+#endif
   Instr* i = AppendInstr(OPCODE_MUL_SUB_info, 0, AllocValue(value1->type));
   i->set_src1(value1);
   i->set_src2(value2);
diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h
index be08dbc98..05cb14d34 100644
--- a/src/xenia/cpu/hir/hir_builder.h
+++ b/src/xenia/cpu/hir/hir_builder.h
@@ -264,7 +264,7 @@ class HIRBuilder {
                      Value* new_value);
   Value* AtomicAdd(Value* address, Value* value);
   Value* AtomicSub(Value* address, Value* value);
-
+  void SetNJM(Value* value);
  protected:
   void DumpValue(StringBuffer* str, Value* value);
   void DumpOp(StringBuffer* str, OpcodeSignatureType sig_type, Instr::Op* op);
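Rounding-mode sensitivity aside, the removed MUL_ADD/MUL_SUB folding had a second defect: Value::MulAdd computed round(a*b) + c with two roundings, while the backend emits a fused instruction that rounds once, matching the PPC fmadd family being emulated. A standalone illustration, not code from the patch:

    #include <cassert>
    #include <cmath>
    int main() {
      double a = 1.0 + 0x1p-52, b = 1.0 - 0x1p-52, c = -1.0;
      // exact product a*b = 1 - 2^-104, which rounds to exactly 1.0
      double folded = a * b + c;         // round(a*b) + c  ==  0.0
      // (assumes the compiler does not itself contract this into an fma)
      double fused = std::fma(a, b, c);  // round(a*b + c) == -2^-104
      assert(folded == 0.0 && fused == -0x1p-104);
    }

Hence the folding is deleted from the pass, #if 0'd in HIRBuilder, and the opcodes are tagged OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING in the opcodes.inl hunk below.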
diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h
index 93b3e7e62..2f7676861 100644
--- a/src/xenia/cpu/hir/opcodes.h
+++ b/src/xenia/cpu/hir/opcodes.h
@@ -284,6 +284,7 @@ enum Opcode {
   OPCODE_TO_SINGLE,  // i could not find a decent name to assign to this opcode,
                      // as we already have OPCODE_ROUND. round double to float (
                      // ppc "single" fpu instruction result rounding behavior )
+  OPCODE_SET_NJM,
   __OPCODE_MAX_VALUE,  // Keep at end.
 };
@@ -295,6 +296,7 @@ enum OpcodeFlags {
   OPCODE_FLAG_IGNORE = (1 << 5),
   OPCODE_FLAG_HIDE = (1 << 6),
   OPCODE_FLAG_PAIRED_PREV = (1 << 7),
+  OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING = (1 << 8)
 };
 
 enum OpcodeSignatureType {
diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl
index be06171f0..b68e9158b 100644
--- a/src/xenia/cpu/hir/opcodes.inl
+++ b/src/xenia/cpu/hir/opcodes.inl
@@ -151,25 +151,25 @@ DEFINE_OPCODE(
 DEFINE_OPCODE(
     OPCODE_CONVERT,
     "convert",
     OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_ROUND,
     "round",
     OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_VECTOR_CONVERT_I2F,
     "vector_convert_i2f",
     OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_VECTOR_CONVERT_F2I,
     "vector_convert_f2i",
     OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_LOAD_VECTOR_SHL,
@@ -456,13 +456,13 @@ DEFINE_OPCODE(
     OPCODE_MUL_ADD,
     "mul_add",
     OPCODE_SIG_V_V_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_MUL_SUB,
     "mul_sub",
     OPCODE_SIG_V_V_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_NEG,
@@ -480,43 +480,43 @@ DEFINE_OPCODE(
     OPCODE_SQRT,
     "sqrt",
     OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_RSQRT,
     "rsqrt",
     OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_RECIP,
     "recip",
     OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_POW2,
     "pow2",
     OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_LOG2,
     "log2",
     OPCODE_SIG_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_DOT_PRODUCT_3,
     "dot_product_3",
     OPCODE_SIG_V_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_DOT_PRODUCT_4,
     "dot_product_4",
     OPCODE_SIG_V_V_V,
-    0)
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
 
 DEFINE_OPCODE(
     OPCODE_AND,
@@ -685,5 +685,11 @@ DEFINE_OPCODE(
     OPCODE_TO_SINGLE,
     "to_single",
     OPCODE_SIG_V_V,
+    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING
+)
+DEFINE_OPCODE(
+    OPCODE_SET_NJM,
+    "set_njm",
+    OPCODE_SIG_X_V,
     0
 )
\ No newline at end of file
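set_njm takes the void-result signature (OPCODE_SIG_X_V) since it only mutates mode state. The backend sequence for OPCODE_SET_NJM is not in this excerpt, but VMX non-Java mode lines up directly with the MXCSR denormal controls, so the correspondence is presumably along these lines (constants from x64_backend.h):

    // FTZ (bit 15) flushes denormal results, DAZ (bit 6) flushes denormal
    // inputs. NJ=1 means VMX flushes denormals, which is exactly what
    // DEFAULT_VMX_MXCSR's 0x8000 | 0x0040 encodes; NJ=0 keeps IEEE denormals.
    constexpr uint32_t kMxcsrFtzDaz = 0x8000u | 0x0040u;
    uint32_t vmx_mxcsr_nj1 = DEFAULT_VMX_MXCSR;                  // flush to zero
    uint32_t vmx_mxcsr_nj0 = DEFAULT_VMX_MXCSR & ~kMxcsrFtzDaz;  // keep denormals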
diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc
index a1e6fc2ea..f6e76f99d 100644
--- a/src/xenia/cpu/hir/value.cc
+++ b/src/xenia/cpu/hir/value.cc
@@ -199,7 +199,7 @@ void Value::Truncate(TypeName target_type) {
       return;
     }
   }
-
+// WARNING: this does not handle rounding flags at all!
 void Value::Convert(TypeName target_type, RoundMode round_mode) {
   switch (type) {
     case FLOAT32_TYPE:
@@ -401,7 +401,7 @@ void Value::MulHi(Value* other, bool is_unsigned) {
                                    32);
       }
       break;
-    case INT64_TYPE:
+    case INT64_TYPE: {
 #if XE_COMPILER_MSVC
       if (is_unsigned) {
         constant.i64 = __umulh(constant.i64, other->constant.i64);
@@ -409,17 +409,19 @@ void Value::MulHi(Value* other, bool is_unsigned) {
         constant.i64 = __mulh(constant.i64, other->constant.i64);
       }
 #else
+      unsigned __int128 product;
       if (is_unsigned) {
-        constant.i64 = static_cast<uint64_t>(
-            static_cast<unsigned __int128>(constant.i64) *
-            static_cast<unsigned __int128>(other->constant.i64));
+        product = static_cast<unsigned __int128>(constant.i64) *
+                  static_cast<unsigned __int128>(other->constant.i64);
       } else {
-        constant.i64 =
-            static_cast<uint64_t>(static_cast<__int128>(constant.i64) *
-                                  static_cast<__int128>(other->constant.i64));
+        product = static_cast<unsigned __int128>(
+            static_cast<__int128>(constant.i64) *
+            static_cast<__int128>(other->constant.i64));
       }
+      constant.i64 = static_cast<uint64_t>(product >> 64);
 #endif  // XE_COMPILER_MSVC
       break;
+    }
     default:
       assert_unhandled_case(type);
       break;
@@ -495,52 +497,6 @@ void Value::Max(Value* other) {
   }
 }
 
-void Value::MulAdd(Value* dest, Value* value1, Value* value2, Value* value3) {
-  switch (dest->type) {
-    case VEC128_TYPE:
-      for (int i = 0; i < 4; i++) {
-        dest->constant.v128.f32[i] =
-            (value1->constant.v128.f32[i] * value2->constant.v128.f32[i]) +
-            value3->constant.v128.f32[i];
-      }
-      break;
-    case FLOAT32_TYPE:
-      dest->constant.f32 =
-          (value1->constant.f32 * value2->constant.f32) + value3->constant.f32;
-      break;
-    case FLOAT64_TYPE:
-      dest->constant.f64 =
-          (value1->constant.f64 * value2->constant.f64) + value3->constant.f64;
-      break;
-    default:
-      assert_unhandled_case(dest->type);
-      break;
-  }
-}
-
-void Value::MulSub(Value* dest, Value* value1, Value* value2, Value* value3) {
-  switch (dest->type) {
-    case VEC128_TYPE:
-      for (int i = 0; i < 4; i++) {
-        dest->constant.v128.f32[i] =
-            (value1->constant.v128.f32[i] * value2->constant.v128.f32[i]) -
-            value3->constant.v128.f32[i];
-      }
-      break;
-    case FLOAT32_TYPE:
-      dest->constant.f32 =
-          (value1->constant.f32 * value2->constant.f32) - value3->constant.f32;
-      break;
-    case FLOAT64_TYPE:
-      dest->constant.f64 =
-          (value1->constant.f64 * value2->constant.f64) - value3->constant.f64;
-      break;
-    default:
-      assert_unhandled_case(dest->type);
-      break;
-  }
-}
-
 void Value::Neg() {
   switch (type) {
     case INT8_TYPE:
@@ -1643,11 +1599,7 @@ void Value::DenormalFlush() {
     constant.v128.u32[i] = current_element;
   }
 }
-void Value::ToSingle() {
-  assert_true(type == FLOAT64_TYPE);
-  constant.f64 = static_cast<double>(static_cast<float>(constant.f64));
-}
 void Value::CountLeadingZeros(const Value* other) {
   switch (other->type) {
     case INT8_TYPE:
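The MulHi hunk above also fixes an actual bug in the non-MSVC path: the old code truncated the 128-bit product to its low 64 bits (that is all the removed static_cast did) instead of extracting the high half. A quick standalone check of the corrected behavior:

    #include <cassert>
    #include <cstdint>
    int main() {
      // (2^64-1)^2 = 2^128 - 2^65 + 1
      unsigned __int128 p =
          (unsigned __int128)UINT64_MAX * (unsigned __int128)UINT64_MAX;
      assert((uint64_t)(p >> 64) == 0xFFFFFFFFFFFFFFFEull);  // high half (fixed)
      assert((uint64_t)p == 1ull);  // low half: what the old code returned
    }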
diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h
index d878f29cd..4cb7ee17d 100644
--- a/src/xenia/cpu/hir/value.h
+++ b/src/xenia/cpu/hir/value.h
@@ -563,8 +563,7 @@ class Value {
   void MulHi(Value* other, bool is_unsigned);
   void Div(Value* other, bool is_unsigned);
   void Max(Value* other);
-  static void MulAdd(Value* dest, Value* value1, Value* value2, Value* value3);
-  static void MulSub(Value* dest, Value* value1, Value* value2, Value* value3);
+
   void Neg();
   void Abs();
   void Sqrt();
@@ -603,7 +602,6 @@ class Value {
                             bool saturate);
   void ByteSwap();
   void DenormalFlush();
-  void ToSingle();
   void CountLeadingZeros(const Value* other);
   bool Compare(Opcode opcode, Value* other);
   hir::Instr* GetDefSkipAssigns();
@@ -615,7 +613,10 @@ class Value {
   // returns true if every single use is as an operand to a single instruction
   // (add var2, var1, var1)
   bool AllUsesByOneInsn() const;
-
+  // the "maybe" is here because this includes vec128, which is untyped data
+  // that can be treated as float or int depending on the context
+  bool MaybeFloaty() const {
+    return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE;
+  }
+
  private:
   static bool CompareInt8(Opcode opcode, Value* a, Value* b);
   static bool CompareInt16(Opcode opcode, Value* a, Value* b);
diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
index 40d3f32cd..b62e28498 100644
--- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
@@ -364,7 +364,16 @@ int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) {
 int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
   // is this the right format?
+  // todo: what mtvscr does with the unused bits is implementation-defined;
+  // figure out what it does
+
   Value* v = f.LoadVR(i.VX128_1.RB);
+
+  Value* has_njm_value = f.Extract(v, (uint8_t)3, INT32_TYPE);
+
+  f.SetNJM(f.IsTrue(f.And(has_njm_value, f.LoadConstantInt32(65536))));
+
   f.StoreContext(offsetof(PPCContext, vscr_vec), v);
   return 0;
 }
diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
index 872ee1ff4..e12caa9bc 100644
--- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
@@ -382,7 +382,6 @@ int InstrEmit_mtfsfx(PPCHIRBuilder& f, const InstrData& i) {
     return 1;
   } else {
     assert_zero(i.XFL.W);
-
     // Store under control of mask.
     // Expand the mask from 8 bits -> 32 bits.
     uint32_t mask = 0;
@@ -402,7 +401,7 @@ int InstrEmit_mtfsfx(PPCHIRBuilder& f, const InstrData& i) {
 
     // Update the system rounding mode.
     if (mask & 0x7) {
-      f.SetRoundingMode(v);
+      f.SetRoundingMode(f.And(v, f.LoadConstantInt32(7)));
     }
   }
   if (i.XFL.Rc) {
@@ -425,7 +424,7 @@ int InstrEmit_mtfsfix(PPCHIRBuilder& f, const InstrData& i) {
 
   // Update the system rounding mode.
   if (mask & 0x7) {
-    f.SetRoundingMode(fpscr);
+    f.SetRoundingMode(f.And(fpscr, f.LoadConstantInt32(7)));
   }
 
   if (i.X.Rc) {
diff --git a/src/xenia/emulator.cc b/src/xenia/emulator.cc
index da9e059d0..e9a671c87 100644
--- a/src/xenia/emulator.cc
+++ b/src/xenia/emulator.cc
@@ -64,9 +64,13 @@ DEFINE_string(
     "or the module specified by the game. Leave blank to launch the default "
     "module.",
     "General");
+DEFINE_bool(allow_game_relative_writes, false,
+            "Not useful to non-developers. Allows code to write to paths "
+            "relative to game://. Used for "
+            "generating test data to compare with original hardware.",
+            "General");
 
 namespace xe {
-
 using namespace xe::literals;
 
 Emulator::GameConfigLoadCallback::GameConfigLoadCallback(Emulator& emulator)
@@ -282,7 +286,8 @@ const std::unique_ptr<vfs::Device> Emulator::CreateVfsDeviceBasedOnPath(
   auto extension = xe::utf8::lower_ascii(xe::path_to_utf8(path.extension()));
   if (extension == ".xex" || extension == ".elf" || extension == ".exe") {
     auto parent_path = path.parent_path();
-    return std::make_unique<vfs::HostPathDevice>(mount_path, parent_path, true);
+    return std::make_unique<vfs::HostPathDevice>(
+        mount_path, parent_path, !cvars::allow_game_relative_writes);
   } else {
     return std::make_unique<vfs::DiscImageDevice>(mount_path, path);
   }
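On the magic numbers in the mtvscr hunk above: VSCR sits in the last 32-bit element of the vector register, which is why the code extracts element 3 as INT32_TYPE, and 65536 is 1 << 16, the NJ (non-Java mode) bit of that word. Spelled out (illustrative, not code from the patch):

    constexpr uint32_t kVscrNJ = 1u << 16;  // 65536, the And mask above
    // the emitted HIR amounts to: SetNJM((vscr_word & kVscrNJ) != 0)

The mtfsfx/mtfsfix changes follow the same division of labor as the backend's new SET_ROUNDING_MODE sequence: the AND with 7 now happens where FPSCR is read, so a constant FPSCR value folds down to a constant mxcsr_table index.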
     return processor()->OnUnhandledException(ex);
   }
@@ -823,8 +828,8 @@ static std::string format_version(xex2_version version) {
 X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
                                   const std::string_view module_path) {
-  // Making changes to the UI (setting the icon) and executing game config load
-  // callbacks which expect to be called from the UI thread.
+  // Making changes to the UI (setting the icon) and executing game config
+  // load callbacks which expect to be called from the UI thread.
   assert_true(display_window_->app_context().IsInUIThread());
 
   // Setup NullDevices for raw HDD partition accesses
@@ -832,12 +837,12 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
   // By using a NullDevice that just returns success to all IO requests it
   // should allow games to believe cache/raw disk was accessed successfully
 
-  // NOTE: this should probably be moved to xenia_main.cc, but right now we need
-  // to register the \Device\Harddisk0\ NullDevice _after_ the
+  // NOTE: this should probably be moved to xenia_main.cc, but right now we
+  // need to register the \Device\Harddisk0\ NullDevice _after_ the
   // \Device\Harddisk0\Partition1 HostPathDevice, otherwise requests to
-  // Partition1 will go to this. Registering during CompleteLaunch allows us to
-  // make sure any HostPathDevices are ready beforehand.
-  // (see comment above cache:\ device registration for more info about why)
+  // Partition1 will go to this. Registering during CompleteLaunch allows us
+  // to make sure any HostPathDevices are ready beforehand. (see comment above
+  // cache:\ device registration for more info about why)
   auto null_paths = {std::string("\\Partition0"), std::string("\\Cache0"),
                      std::string("\\Cache1")};
   auto null_device =
@@ -900,8 +905,8 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
 
   if (module->title_id()) {
     auto title_id = fmt::format("{:08X}", module->title_id());
-    // Load the per-game configuration file and make sure updates are handled by
-    // the callbacks.
+    // Load the per-game configuration file and make sure updates are handled
+    // by the callbacks.
     config::LoadGameConfig(title_id);
     assert_true(game_config_load_callback_loop_next_index_ == SIZE_MAX);
     game_config_load_callback_loop_next_index_ = 0;
@@ -934,10 +939,10 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
     }
   }
 
-  // Initializing the shader storage in a blocking way so the user doesn't miss
-  // the initial seconds - for instance, sound from an intro video may start
-  // playing before the video can be seen if doing this in parallel with the
-  // main thread.
+  // Initializing the shader storage in a blocking way so the user doesn't
+  // miss the initial seconds - for instance, sound from an intro video may
+  // start playing before the video can be seen if doing this in parallel with
+  // the main thread.
   on_shader_storage_initialization(true);
   graphics_system_->InitializeShaderStorage(cache_root_, title_id_.value(),
                                             true);