From 5afe3a8ae23d3d4d9e241149e0e416fcac7c1c4c Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Fri, 24 Jan 2020 20:18:26 -0800
Subject: [PATCH] Fixed poor precision in RECIP, made multiplication codegen
 less wasteful

---
 src/xenia/cpu/backend/x64/x64_emitter.cc   | 54 +++++++---------
 src/xenia/cpu/backend/x64/x64_emitter.h    |  4 +-
 src/xenia/cpu/backend/x64/x64_sequences.cc | 74 ++++++++++++++++++++--
 3 files changed, 92 insertions(+), 40 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index fb30906b6..3094b6d48 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -55,13 +55,13 @@ static const size_t kStashOffset = 32;
 // static const size_t kStashOffsetHigh = 32 + 32;
 
 const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = {
-    Xbyak::Operand::RBX, Xbyak::Operand::R10, Xbyak::Operand::R11,
-    Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14,
-    Xbyak::Operand::R15,
+    Xbyak::Operand::RBX, Xbyak::Operand::R10, Xbyak::Operand::R11,
+    Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14,
+    Xbyak::Operand::R15,
 };
 
 const uint32_t X64Emitter::xmm_reg_map_[X64Emitter::XMM_COUNT] = {
-    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
 };
 
 X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
@@ -210,9 +210,8 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
 
     // Record call history value into slot (guest addr in RDX).
     mov(dword[Xbyak::RegExp(uint32_t(uint64_t(
-            low_address(&trace_header->function_caller_history)))) +
-          rax * 4],
-        edx);
+                low_address(&trace_header->function_caller_history)))) +
+            rax * 4], edx);
 
     // Calling thread. Load ax with thread ID.
     EmitGetCurrentThreadId();
@@ -737,9 +736,8 @@ static const vec128_t xmm_consts[] = {
     /* XMMIntMax */ vec128i(INT_MAX),
     /* XMMIntMaxPD */ vec128d(INT_MAX),
     /* XMMPosIntMinPS */ vec128f((float)0x80000000u),
-    /* XMMQNaN */ vec128i(0x7FC00000u),
-    /*XMMSelectTableBase */vec128i(0),
-    /*XMMSelectTableLast*/ vec128i(-1)
+    /* XMMQNaN */ vec128i(0x7FC00000u),
+    /* XMMOneDouble */ vec128d(1.0)
 };
 
 // First location to try and place constants.
@@ -778,17 +776,14 @@ void X64Emitter::FreeConstData(uintptr_t data) {
   memory::DeallocFixed(reinterpret_cast<void*>(data), 0,
                        memory::DeallocationType::kRelease);
 }
-uintptr_t X64Emitter::GetXmmRawAddress(XmmConst id) {
-  return backend_->emitter_data() + sizeof(vec128_t) * id;
-}
+
 Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
   // Load through fixed constant table setup by PlaceConstData.
   // It's important that the pointer is not signed, as it will be
   // sign-extended.
-  return ptr[GetXmmRawAddress(id)];
+  return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
+                                     sizeof(vec128_t) * id)];
 }
-
-
 // Implies possible StashXmm(0, ...)!
 void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
   // https://www.agner.org/optimize/optimizing_assembly.pdf
@@ -802,10 +797,10 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
   } else {
     for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
-        if(xmm_consts[i] == v) {
-            vmovapd(dest, GetXmmConstPtr((XmmConst)i));
-            return;
-        }
+      if (xmm_consts[i] == v) {
+        vmovapd(dest, GetXmmConstPtr((XmmConst)i));
+        return;
+      }
     }
     // TODO(benvanik): see what other common values are.
     // TODO(benvanik): build constant table - 99% are reused.
 
@@ -830,15 +825,13 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
 
     // TODO(benvanik): see what other common values are.
     // TODO(benvanik): build constant table - 99% are reused.
-
     unsigned raw_bits = *reinterpret_cast<uint32_t*>(&v);
     for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
-
-        if(xmm_consts[i].u32[0] == raw_bits) {
-            vmovss(dest, GetXmmConstPtr((XmmConst)i));
-            return;
-        }
+      if (xmm_consts[i].u32[0] == raw_bits) {
+        vmovss(dest, GetXmmConstPtr((XmmConst)i));
+        return;
+      }
     }
     mov(eax, x.i);
@@ -865,11 +858,10 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
 
     uint64_t raw_bits = *reinterpret_cast<uint64_t*>(&v);
     for (unsigned i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
-
-        if(xmm_consts[i].u64[0] == raw_bits) {
-            vmovsd(dest, GetXmmConstPtr((XmmConst)i));
-            return;
-        }
+      if (xmm_consts[i].u64[0] == raw_bits) {
+        vmovsd(dest, GetXmmConstPtr((XmmConst)i));
+        return;
+      }
     }
     mov(rax, x.i);
     vmovq(dest, rax);
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h
index 4f4af11f6..a81f5e2b2 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@@ -114,8 +114,7 @@ enum XmmConst {
   XMMIntMaxPD,
   XMMPosIntMinPS,
   XMMQNaN,
-  XMMSelectTableBase,
-  XMMSelectTableLast,
+  XMMOneDouble
 };
 
 // Unfortunately due to the design of xbyak we have to pass this to the ctor.
@@ -212,7 +211,6 @@ class X64Emitter : public Xbyak::CodeGenerator {
   void MovMem64(const Xbyak::RegExp& addr, uint64_t v);
 
   Xbyak::Address GetXmmConstPtr(XmmConst id);
-  uintptr_t GetXmmRawAddress(XmmConst id);
   void LoadConstantXmm(Xbyak::Xmm dest, float v);
   void LoadConstantXmm(Xbyak::Xmm dest, double v);
   void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v);
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index 40f2ddf47..0fbe1663e 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -175,7 +175,7 @@ struct ZERO_EXTEND_I32_I8
 struct ZERO_EXTEND_I64_I8
     : Sequence<ZERO_EXTEND_I64_I8, I<OPCODE_ZERO_EXTEND, I64Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.movzx(i.dest, i.src1);
+    e.movzx(i.dest.reg().cvt32(), i.src1);
   }
 };
 struct ZERO_EXTEND_I32_I16
@@ -187,7 +187,7 @@ struct ZERO_EXTEND_I32_I16
 struct ZERO_EXTEND_I64_I16
     : Sequence<ZERO_EXTEND_I64_I16, I<OPCODE_ZERO_EXTEND, I64Op, I16Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.movzx(i.dest.reg().cvt32(), i.src1);
+    e.movzx(i.dest.reg().cvt32(), i.src1);
  }
 };
 struct ZERO_EXTEND_I64_I32
@@ -1323,6 +1323,19 @@ EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32,
 
 // We exploit mulx here to avoid creating too much register pressure.
 struct MUL_I8 : Sequence<MUL_I8, I<OPCODE_MUL, I8Op, I8Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant || i.src2.is_constant) {
+
+
+      uint64_t cval = i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = EDX * $3
 
@@ -1364,6 +1377,19 @@ struct MUL_I16 : Sequence<MUL_I16, I<OPCODE_MUL, I16Op, I16Op, I16Op>> {
 
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant || i.src2.is_constant) {
+
+
+      uint64_t cval = i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = EDX * $3
 
@@ -1412,6 +1438,20 @@ struct MUL_I32 : Sequence<MUL_I32, I<OPCODE_MUL, I32Op, I32Op, I32Op>> {
         return;
       }
     }
+
+    if (i.src1.is_constant || i.src2.is_constant) {
+
+
+      uint64_t cval = i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = EDX * $3
 
@@ -1462,6 +1502,21 @@ struct MUL_I64 : Sequence<MUL_I64, I<OPCODE_MUL, I64Op, I64Op, I64Op>> {
         return;
       }
     }
+
+    if (i.src1.is_constant || i.src2.is_constant) {
+
+
+      uint64_t cval = i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
+
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = RDX * $3
 
@@ -2470,9 +2525,14 @@ struct RSQRT_F32 : Sequence<RSQRT_F32, I<OPCODE_RSQRT, F32Op, F32Op>> {
 };
 struct RSQRT_F64 : Sequence<RSQRT_F64, I<OPCODE_RSQRT, F64Op, F64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vcvtsd2ss(i.dest, i.src1);
+    /*e.vcvtsd2ss(i.dest, i.src1);
     e.vrsqrtss(i.dest, i.dest);
-    e.vcvtss2sd(i.dest, i.dest);
+    e.vcvtss2sd(i.dest, i.dest);*/
+
+    e.vmovsd(e.xmm0, e.GetXmmConstPtr(XmmConst::XMMOneDouble));
+    e.vsqrtsd(i.dest, i.src1);
+    e.vdivsd(i.dest, e.xmm0, i.dest);
+
   }
 };
 struct RSQRT_V128 : Sequence<RSQRT_V128, I<OPCODE_RSQRT, V128Op, V128Op>> {
@@ -2492,9 +2552,11 @@ struct RECIP_F32 : Sequence<RECIP_F32, I<OPCODE_RECIP, F32Op, F32Op>> {
 };
 struct RECIP_F64 : Sequence<RECIP_F64, I<OPCODE_RECIP, F64Op, F64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vcvtsd2ss(i.dest, i.src1);
+    /*e.vcvtsd2ss(i.dest, i.src1);
     e.vrcpss(i.dest, i.dest);
-    e.vcvtss2sd(i.dest, i.dest);
+    e.vcvtss2sd(i.dest, i.dest);*/
+    e.vmovsd(e.xmm0, e.GetXmmConstPtr(XmmConst::XMMOneDouble));
+    e.vdivsd(i.dest, e.xmm0, i.src1);
   }
 };
 struct RECIP_V128 : Sequence<RECIP_V128, I<OPCODE_RECIP, V128Op, V128Op>> {
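
Why the RECIP_F64/RSQRT_F64 hunks matter: vrcpss and vrsqrtss are fast hardware
approximations that guarantee only about 12 bits of relative accuracy (Intel
documents a maximum relative error of 1.5 * 2^-12), so the old
double-to-float, approximate, back-to-double sequence discarded most of a
double's 52 mantissa bits. The replacement pays for a real vsqrtsd/vdivsd
against the new XMMOneDouble constant and returns the correctly rounded
IEEE-754 result. The sketch below is not part of the patch; it is a standalone
host-side illustration using the SSE intrinsics that correspond to the
instructions the old and new RECIP_F64 sequences emit, and it shows the size
of the error being fixed:

// Standalone comparison of the two OPCODE_RECIP strategies for F64.
// Old path: double -> float -> rcpss (~12-bit approximation) -> double.
// New path: a correctly rounded 1.0 / x, i.e. what vdivsd computes.
#include <immintrin.h>
#include <cmath>
#include <cstdio>

int main() {
  double x = 3.0;
  __m128 s = _mm_set_ss(static_cast<float>(x));    // vcvtsd2ss
  float approx = _mm_cvtss_f32(_mm_rcp_ss(s));     // vrcpss
  double old_recip = static_cast<double>(approx);  // vcvtss2sd
  double new_recip = 1.0 / x;                      // vdivsd
  std::printf("old: %.17g\nnew: %.17g\nrel err: %g\n", old_recip, new_recip,
              std::fabs(old_recip - new_recip) / new_recip);
}

The MUL_* hunks apply the same cost reasoning in the other direction: when one
operand is a known constant, a single "imul dest, src, imm32" avoids staging
the constant through RDX for mulx. Two caveats worth noting about the shared
constant path as written: the immediate form of imul only exists for
16/32/64-bit operands (there is no 8-bit encoding for the MUL_I8 case), and
imm32 is sign-extended, so in the 64-bit case a constant in [2^31, 2^32)
passes the "cval < (1ull << 32)" guard yet multiplies as a negative value; a
(1ull << 31) bound would be the conservative check.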