From 9b6f2859a44ddb65f8e3693d7fdb194c7aecabb0 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 5 Jun 2019 23:23:22 +0200 Subject: [PATCH 1/6] DSPJitUtil: Micro-optimize increment_addr_reg 2 bytes shorter. Before: 0: 41 0f b7 c2 movzx eax,r10w 4: 41 89 c5 mov r13d,eax 7: 83 c0 01 add eax,0x1 After: 0: 45 0f b7 ea movzx r13d,r10w 4: 41 8d 45 01 lea eax,[r13+0x1] --- Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp b/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp index 630fc249d5..397d45b1b9 100644 --- a/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp +++ b/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp @@ -317,12 +317,11 @@ void DSPEmitter::increment_addr_reg(int reg) m_gpr.PutReg(DSP_REG_WR0 + reg, false); const OpArg ar_reg = m_gpr.GetReg(DSP_REG_AR0 + reg); - MOVZX(32, 16, EAX, ar_reg); - X64Reg tmp1 = m_gpr.GetFreeXReg(); + MOVZX(32, 16, tmp1, ar_reg); + // u32 nar = ar + 1; - MOV(32, R(tmp1), R(EAX)); - ADD(32, R(EAX), Imm8(1)); + LEA(32, EAX, MDisp(tmp1, 1)); // if ((nar ^ ar) > ((wr | 1) << 1)) // nar -= wr + 1; From 832e320f0824d37057dcb9deea3807d8f18043f7 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 5 Jun 2019 23:25:39 +0200 Subject: [PATCH 2/6] DSPJitUtil: Make round_long_acc branchless Also no longer clobbers RCX. 
--- Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp b/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp index 397d45b1b9..5899387df9 100644 --- a/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp +++ b/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp @@ -707,22 +707,13 @@ void DSPEmitter::set_long_prod() } // Returns s64 in RAX -// Clobbers RCX void DSPEmitter::round_long_acc(X64Reg long_acc) { // if (prod & 0x10000) prod = (prod + 0x8000) & ~0xffff; - TEST(32, R(long_acc), Imm32(0x10000)); - FixupBranch jump = J_CC(CC_Z); - ADD(64, R(long_acc), Imm32(0x8000)); - MOV(64, R(ECX), Imm64(~0xffff)); - AND(64, R(long_acc), R(RCX)); - FixupBranch _ret = J(); // else prod = (prod + 0x7fff) & ~0xffff; - SetJumpTarget(jump); - ADD(64, R(long_acc), Imm32(0x7fff)); - MOV(64, R(RCX), Imm64(~0xffff)); - AND(64, R(long_acc), R(RCX)); - SetJumpTarget(_ret); + BT(32, R(long_acc), Imm8(16)); + ADC(64, R(long_acc), Imm32(0x7FFF)); + XOR(16, R(long_acc), R(long_acc)); // return prod; } From a3744c3c26276434106435f13f86779b5d5308ce Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 5 Jun 2019 23:33:02 +0200 Subject: [PATCH 3/6] DSPJitUtil: Use round_long_acc in get_long_prod_round_prodl Identical, except that this used a temporary register as scratch instead of RCX. round_long_acc now no longer needs scratch, so we can deduplicate the logic. 
--- Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp b/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp index 5899387df9..00bfe715a3 100644 --- a/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp +++ b/Source/Core/Core/DSP/Jit/x64/DSPJitUtil.cpp @@ -665,28 +665,12 @@ void DSPEmitter::get_long_prod(X64Reg long_prod) } // Returns s64 in RAX -// Clobbers RCX void DSPEmitter::get_long_prod_round_prodl(X64Reg long_prod) { // s64 prod = dsp_get_long_prod(); get_long_prod(long_prod); - X64Reg tmp = m_gpr.GetFreeXReg(); - // if (prod & 0x10000) prod = (prod + 0x8000) & ~0xffff; - TEST(32, R(long_prod), Imm32(0x10000)); - FixupBranch jump = J_CC(CC_Z); - ADD(64, R(long_prod), Imm32(0x8000)); - MOV(64, R(tmp), Imm64(~0xffff)); - AND(64, R(long_prod), R(tmp)); - FixupBranch _ret = J(); - // else prod = (prod + 0x7fff) & ~0xffff; - SetJumpTarget(jump); - ADD(64, R(long_prod), Imm32(0x7fff)); - MOV(64, R(tmp), Imm64(~0xffff)); - AND(64, R(long_prod), R(tmp)); - SetJumpTarget(_ret); - // return prod; - m_gpr.PutXReg(tmp); + round_long_acc(long_prod); } // For accurate emulation, this is wrong - but the real prod registers behave From 3951d238d92e1c4b2c7db9f553272272ecc7691b Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 5 Jun 2019 23:36:14 +0200 Subject: [PATCH 4/6] DSPJitArithmetic: Precompute addi immediate Compute immediate value at compile time. 
--- Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp b/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp index 8e54bd60aa..46c44617ef 100644 --- a/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp +++ b/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp @@ -701,9 +701,7 @@ void DSPEmitter::addi(const UDSPInstruction opc) // s64 imm = (s16)dsp_fetch_code(); s16 imm = dsp_imem_read(m_compile_pc + 1); // imm <<= 16; - MOV(16, R(RDX), Imm16(imm)); - MOVSX(64, 16, RDX, R(RDX)); - SHL(64, R(RDX), Imm8(16)); + MOV(64, R(RDX), Imm32(imm << 16)); // s64 res = acc + imm; ADD(64, R(RAX), R(RDX)); // dsp_set_long_acc(areg, res); From 5b5886160e4568067ffeb92431f2d9c2fba72636 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 5 Jun 2019 23:36:30 +0200 Subject: [PATCH 5/6] DSPJitArithmetic: Precompute addis immediate Compute immediate value at compile time. --- Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp b/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp index 46c44617ef..7d421b50a7 100644 --- a/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp +++ b/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp @@ -735,9 +735,8 @@ void DSPEmitter::addis(const UDSPInstruction opc) MOV(64, R(RAX), R(tmp1)); // s64 imm = (s8)(u8)opc; // imm <<= 16; - MOV(8, R(RDX), Imm8((u8)opc)); - MOVSX(64, 8, RDX, R(RDX)); - SHL(64, R(RDX), Imm8(16)); + s32 imm = static_cast<u8>(opc) << 24 >> 8; + MOV(64, R(RDX), Imm32(imm)); // s64 res = acc + imm; ADD(64, R(RAX), R(RDX)); // dsp_set_long_acc(dreg, res); From e9d4869965825d31cfb380cebbc27b3166d973ba Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 5 Jun 2019 23:58:44 +0200 Subject: [PATCH 6/6] Jit64AsmCommon: Micro-optimize GenFrsqrte Save 2 bytes by testing register against itself and branching on the sign flag. 
Before: 0: 48 0f ba e0 3f bt rax,0x3f After: 0: 48 85 c0 test rax,rax --- Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp index 81b4b27204..70224162f0 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp @@ -101,8 +101,8 @@ void CommonAsmRoutines::GenFrsqrte() MOVQ_xmm(XMM0, R(RSCRATCH)); RET(); SetJumpTarget(inf); - BT(64, R(RSCRATCH), Imm8(63)); - FixupBranch negative = J_CC(CC_C); + TEST(64, R(RSCRATCH), R(RSCRATCH)); + FixupBranch negative = J_CC(CC_S); XORPD(XMM0, R(XMM0)); RET();