From 2c68f6bfc591d159cd132e851bd43ce0e7f038aa Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Tue, 1 Sep 2015 16:22:44 -0500
Subject: [PATCH 1/2] [AArch64] Implement Fiora's preemptive paired loadstore
 optimization.

This provides a decent speedup in pretty much everything that touches
paired loadstores, because in most cases they turn out to be regular,
non-quantizing float loadstores.
---
 Source/Core/Core/PowerPC/JitArm64/Jit.cpp     |  26 +++++
 .../PowerPC/JitArm64/JitArm64_BackPatch.cpp   |  22 +++-
 .../JitArm64/JitArm64_LoadStorePaired.cpp     | 104 ++++++++++++------
 .../Core/PowerPC/JitArmCommon/BackPatch.h     |  27 +++--
 4 files changed, 131 insertions(+), 48 deletions(-)
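The trick, in brief: a GQR value of exactly zero selects type 0
(single-precision float) and scale 0 for both loads and stores, so
psq_l/psq_st degenerate into plain big-endian float loadstores. Below is a
minimal C++ sketch modeling the emitted guard and the GQR field layout
implied by the UBFM bit ranges used in this patch. It is illustrative only;
the helper names are not part of Dolphin's API:

	#include <cstdint>
	#include <cstdio>

	// GQR fields as the JIT extracts them with UBFM (LSB bit numbering).
	static uint32_t LoadType(uint32_t gqr)   { return (gqr >> 16) & 0x7; }  // bits 16-18
	static uint32_t LoadScale(uint32_t gqr)  { return (gqr >> 24) & 0x3F; } // bits 24-29
	static uint32_t StoreType(uint32_t gqr)  { return gqr & 0x7; }          // bits 0-2
	static uint32_t StoreScale(uint32_t gqr) { return (gqr >> 8) & 0x3F; }  // bits 8-13

	// Model of the guard emitted at the top of a block that uses exactly one
	// GQR which was zero at compile time: reload the live value and verify it
	// is still zero. The emitted code does this with one LDR plus one branch;
	// on failure it jumps to far code that flags the block
	// (EXCEPTIONS_PAIRED_QUANTIZE) and recompiles it without the assumption.
	static bool AssumptionStillHolds(uint32_t live_gqr)
	{
		return live_gqr == 0;
	}

	int main()
	{
		uint32_t gqr = 0;
		std::printf("ld type=%u scale=%u, st type=%u scale=%u, fast path: %s\n",
		            LoadType(gqr), LoadScale(gqr),
		            StoreType(gqr), StoreScale(gqr),
		            AssumptionStillHolds(gqr) ? "yes" : "no");
		return 0;
	}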
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index b165020a43..df8cd0dd86 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -7,6 +7,7 @@
 #include "Common/PerformanceCounter.h"
 
 #include "Core/PatchEngine.h"
+#include "Core/PowerPC/JitInterface.h"
 #include "Core/PowerPC/Profiler.h"
 #include "Core/PowerPC/JitArm64/Jit.h"
 #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
@@ -351,6 +352,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
 
 	js.isLastInstruction = false;
 	js.firstFPInstructionFound = false;
+	js.assumeNoPairedQuantize = false;
 	js.blockStart = em_address;
 	js.fifoBytesThisBlock = 0;
 	js.downcountAmount = 0;
@@ -396,6 +398,30 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
 		// get start tic
 		BeginTimeProfile(b);
 	}
+
+	if (code_block.m_gqr_used.Count() == 1 && js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
+	{
+		int gqr = *code_block.m_gqr_used.begin();
+		if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
+		{
+			LDR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(spr[SPR_GQR0]) + gqr * 4);
+			FixupBranch no_fail = B(CC_EQ);
+			FixupBranch fail = B();
+			SwitchToFarCode();
+			SetJumpTarget(fail);
+			MOVI2R(W0, js.blockStart);
+			STR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(pc));
+			MOVI2R(W0, (u32)JitInterface::ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
+			MOVI2R(X1, (u64)&JitInterface::CompileExceptionCheck);
+			BLR(X1);
+			MOVI2R(X1, (u64)asm_routines.dispatcher);
+			BR(X1);
+			SwitchToNearCode();
+			SetJumpTarget(no_fail);
+			js.assumeNoPairedQuantize = true;
+		}
+	}
+
 	const u8 *normalEntry = GetCodePtr();
 	b->normalEntry = normalEntry;
 
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
index 37420a2cf0..73bd148e70 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@@ -51,7 +51,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 
 	{
 		if (flags & BackPatchInfo::FLAG_STORE &&
-		    flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64 | BackPatchInfo::FLAG_SIZE_F32I))
+		    flags & BackPatchInfo::FLAG_MASK_FLOAT)
 		{
 			if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
@@ -64,6 +64,12 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 			m_float_emit.REV32(8, D0, RS);
 			m_float_emit.STR(32, D0, X28, addr);
 		}
+		else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
+		{
+			m_float_emit.FCVTN(32, D0, RS);
+			m_float_emit.REV32(8, D0, D0);
+			m_float_emit.STR(64, Q0, X28, addr);
+		}
 		else
 		{
 			m_float_emit.REV64(8, Q0, RS);
@@ -71,7 +77,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 			m_float_emit.STR(64, Q0, X28, addr);
 		}
 	}
 	else if (flags & BackPatchInfo::FLAG_LOAD &&
-	         flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+	         flags & BackPatchInfo::FLAG_MASK_FLOAT)
 	{
 		if (flags & BackPatchInfo::FLAG_SIZE_F32)
 		{
@@ -166,7 +172,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 	m_float_emit.ABI_PushRegisters(fprs_to_push, X30);
 
 	if (flags & BackPatchInfo::FLAG_STORE &&
-	    flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64 | BackPatchInfo::FLAG_SIZE_F32I))
+	    flags & BackPatchInfo::FLAG_MASK_FLOAT)
 	{
 		if (flags & BackPatchInfo::FLAG_SIZE_F32)
 		{
@@ -181,6 +187,14 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 			MOVI2R(X30, (u64)&PowerPC::Write_U32);
 			BLR(X30);
 		}
+		else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
+		{
+			m_float_emit.FCVTN(32, D0, RS);
+			m_float_emit.UMOV(64, X0, D0, 0);
+			ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
+			MOVI2R(X30, (u64)PowerPC::Write_U64);
+			BLR(X30);
+		}
 		else
 		{
 			MOVI2R(X30, (u64)&PowerPC::Write_U64);
@@ -190,7 +204,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 
 	}
 	else if (flags & BackPatchInfo::FLAG_LOAD &&
-	         flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+	         flags & BackPatchInfo::FLAG_MASK_FLOAT)
 	{
 		if (flags & BackPatchInfo::FLAG_SIZE_F32)
 		{
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
index 1ee2ce6f6a..f3a26028aa 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@@ -38,8 +38,7 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 	ARM64Reg scale_reg = W0;
 	ARM64Reg addr_reg = W1;
 	ARM64Reg type_reg = W2;
-
-	LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+	ARM64Reg VS;
 
 	if (inst.RA || update) // Always uses the register on update
 	{
@@ -53,21 +52,43 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 		MOVI2R(addr_reg, (u32)offset);
 	}
 
-	UBFM(type_reg, scale_reg, 16, 18); // Type
-	UBFM(scale_reg, scale_reg, 24, 29); // Scale
-
 	if (update)
 	{
 		gpr.BindToRegister(inst.RA, REG_REG);
 		MOV(arm_addr, addr_reg);
 	}
 
-	MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
-	LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
-	BLR(X30);
+	if (js.assumeNoPairedQuantize)
+	{
+		VS = fpr.RW(inst.RS, REG_REG);
+		if (!inst.W)
+		{
+			ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28);
+			m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
+			m_float_emit.REV32(8, VS, VS);
+		}
+		else
+		{
+			m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28);
+			m_float_emit.REV32(8, VS, VS);
+
+		}
+		m_float_emit.FCVTL(64, VS, VS);
+	}
+	else
+	{
+		LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+		UBFM(type_reg, scale_reg, 16, 18); // Type
+		UBFM(scale_reg, scale_reg, 24, 29); // Scale
+
+		MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
+		LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
+		BLR(X30);
+
+		VS = fpr.RW(inst.RS, REG_REG);
+		m_float_emit.FCVTL(64, VS, D0);
+	}
 
-	ARM64Reg VS = fpr.RW(inst.RS, REG_REG);
-	m_float_emit.FCVTL(64, VS, D0);
 	if (inst.W)
 	{
 		m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double
@@ -106,11 +127,9 @@ void JitArm64::psq_st(UGeckoInstruction inst)
 	BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
 
 	// Wipe the registers we are using as temporaries
-	gprs_in_use &= BitSet32(~0x40000007);
+	gprs_in_use &= BitSet32(~7);
 	fprs_in_use &= BitSet32(~3);
 
-	LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
-
 	if (inst.RA || update) // Always uses the register on update
 	{
 		if (offset >= 0)
@@ -123,40 +142,59 @@ void JitArm64::psq_st(UGeckoInstruction inst)
 		MOVI2R(addr_reg, (u32)offset);
 	}
 
-	UBFM(type_reg, scale_reg, 0, 2); // Type
-	UBFM(scale_reg, scale_reg, 8, 13); // Scale
-
 	if (update)
 	{
 		gpr.BindToRegister(inst.RA, REG_REG);
 		MOV(arm_addr, addr_reg);
 	}
 
-	m_float_emit.FCVTN(32, D0, VS);
-
-	// Inline address check
+	if (js.assumeNoPairedQuantize)
 	{
+		u32 flags = BackPatchInfo::FLAG_STORE;
+		flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
+		EmitBackpatchRoutine(flags,
+		                     jo.fastmem,
+		                     jo.fastmem,
+		                     VS, EncodeRegTo64(addr_reg),
+		                     gprs_in_use,
+		                     fprs_in_use);
+	}
+	else
+	{
+		if (inst.W)
+			m_float_emit.FCVT(32, 64, D0, VS);
+		else
+			m_float_emit.FCVTN(32, D0, VS);
+
+		LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+		UBFM(type_reg, scale_reg, 0, 2); // Type
+		UBFM(scale_reg, scale_reg, 8, 13); // Scale
+
+		// Inline address check
 		TST(addr_reg, 6, 1);
-		FixupBranch argh = B(CC_NEQ);
+		FixupBranch pass = B(CC_EQ);
+		FixupBranch fail = B();
+
+		SwitchToFarCode();
+		SetJumpTarget(fail);
+		// Slow
+		MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
+		LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
+
+		ABI_PushRegisters(gprs_in_use);
+		m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
+		BLR(EncodeRegTo64(type_reg));
+		m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
+		ABI_PopRegisters(gprs_in_use);
+		FixupBranch continue1 = B();
+		SwitchToNearCode();
+		SetJumpTarget(pass);
 
 		// Fast
 		MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
 		LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
 		BLR(EncodeRegTo64(type_reg));
 
-		FixupBranch continue1 = B();
-		SetJumpTarget(argh);
-
-		// Slow
-		MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
-		LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
-
-		ABI_PushRegisters(gprs_in_use);
-		m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
-		BLR(EncodeRegTo64(type_reg));
-		m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
-		ABI_PopRegisters(gprs_in_use);
-
 		SetJumpTarget(continue1);
 	}
 
diff --git a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
index b7f68b0b23..c712918883 100644
--- a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
+++ b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
@@ -9,17 +9,22 @@ struct BackPatchInfo
 {
 	enum
 	{
-		FLAG_STORE     = (1 << 0),
-		FLAG_LOAD      = (1 << 1),
-		FLAG_SIZE_8    = (1 << 2),
-		FLAG_SIZE_16   = (1 << 3),
-		FLAG_SIZE_32   = (1 << 4),
-		FLAG_SIZE_F32  = (1 << 5),
-		FLAG_SIZE_F64  = (1 << 6),
-		FLAG_REVERSE   = (1 << 7),
-		FLAG_EXTEND    = (1 << 8),
-		FLAG_SIZE_F32I = (1 << 9),
-		FLAG_ZERO_256  = (1 << 10),
+		FLAG_STORE      = (1 << 0),
+		FLAG_LOAD       = (1 << 1),
+		FLAG_SIZE_8     = (1 << 2),
+		FLAG_SIZE_16    = (1 << 3),
+		FLAG_SIZE_32    = (1 << 4),
+		FLAG_SIZE_F32   = (1 << 5),
+		FLAG_SIZE_F32X2 = (1 << 6),
+		FLAG_SIZE_F64   = (1 << 7),
+		FLAG_REVERSE    = (1 << 8),
+		FLAG_EXTEND     = (1 << 9),
+		FLAG_SIZE_F32I  = (1 << 10),
+		FLAG_ZERO_256   = (1 << 11),
+		FLAG_MASK_FLOAT = FLAG_SIZE_F32 |
+		                  FLAG_SIZE_F32X2 |
+		                  FLAG_SIZE_F64 |
+		                  FLAG_SIZE_F32I,
 	};
 
 	static u32 GetFlagSize(u32 flags)
From 791c7d5a845658e98a73c5be261c8ecda277a9dc Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Tue, 1 Sep 2015 17:10:53 -0500
Subject: [PATCH 2/2] [AArch64] Clean up bogus vector FCVT{N,L} instruction
 usage.

Replace them with their scalar variant, the FCVT instruction.

FCVT{N,L} have 8 cycles of latency on the Cortex-A57, while FCVT has 5
cycles of latency and slightly higher throughput. On the A72 all three of
these instructions will have 3 cycles of latency, while FCVT{N,L} will have
half the throughput.
---
 Source/Core/Core/PowerPC/JitArm64/Jit.cpp                | 2 +-
 Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp | 4 ++--
 .../Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp     | 6 +++---
 .../Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp   | 9 ++++++---
 4 files changed, 12 insertions(+), 9 deletions(-)
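The distinction being exploited: FCVTN/FCVTL are SIMD instructions that
narrow or widen every lane of a vector, while FCVT converts a single scalar,
so the scalar form is the cheaper choice whenever only one lane of the
result is consumed. Note that the Jit.cpp hunk below also swaps the
flag-dependent B(CC_EQ) for CBZ(W0), since the preceding LDR does not set
condition flags. A minimal compiler-intrinsics illustration of the
vector/scalar split follows; the function names here are ours, not
Dolphin's:

	// Build for AArch64, e.g.: clang++ -O2 -c fcvt_demo.cpp
	#include <arm_neon.h>

	// Narrows both double lanes to two singles; compiles to FCVTN.
	float32x2_t NarrowPair(float64x2_t v) { return vcvt_f32_f64(v); }

	// Widens two singles to two doubles; compiles to FCVTL.
	float64x2_t WidenPair(float32x2_t v) { return vcvt_f64_f32(v); }

	// Converts one double to one single; compiles to scalar FCVT.
	float NarrowOne(double d) { return static_cast<float>(d); }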
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index df8cd0dd86..1a5a8e36a9 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -405,7 +405,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
 		if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
 		{
 			LDR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(spr[SPR_GQR0]) + gqr * 4);
-			FixupBranch no_fail = B(CC_EQ);
+			FixupBranch no_fail = CBZ(W0);
 			FixupBranch fail = B();
 			SwitchToFarCode();
 			SetJumpTarget(fail);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
index 73bd148e70..5708348e21 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@@ -83,7 +83,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 		{
 			m_float_emit.LDR(32, EncodeRegToDouble(RS), X28, addr);
 			m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
-			m_float_emit.FCVTL(64, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
+			m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
 		}
 		else
 		{
@@ -211,7 +211,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 			MOVI2R(X30, (u64)&PowerPC::Read_U32);
 			BLR(X30);
 			m_float_emit.INS(32, RS, 0, X0);
-			m_float_emit.FCVTL(64, RS, RS);
+			m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
 		}
 		else
 		{
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index efcc61173d..ec506964cc 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -335,8 +335,8 @@ void JitArm64::frspx(UGeckoInstruction inst)
 	ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
 	ARM64Reg VD = fpr.RW(d, REG_DUP);
 
-	m_float_emit.FCVTN(32, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
-	m_float_emit.FCVTL(64, EncodeRegToDouble(VD), EncodeRegToDouble(VD));
+	m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
+	m_float_emit.FCVT(64, 32, EncodeRegToDouble(VD), EncodeRegToDouble(VD));
 }
 
 void JitArm64::fcmpx(UGeckoInstruction inst)
@@ -441,7 +441,7 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
 	m_float_emit.MOVI(64, EncodeRegToDouble(V0), 0xFFFF000000000000ULL);
 	m_float_emit.BIC(16, EncodeRegToDouble(V0), 0x7);
 
-	m_float_emit.FCVTN(32, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
+	m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
 	m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z);
 	m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0));
 	fpr.Unlock(V0);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
index f3a26028aa..bd2574d2c7 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@@ -66,14 +66,14 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 			ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28);
 			m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
 			m_float_emit.REV32(8, VS, VS);
+			m_float_emit.FCVTL(64, VS, VS);
 		}
 		else
 		{
 			m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28);
 			m_float_emit.REV32(8, VS, VS);
-
+			m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), EncodeRegToDouble(VS));
 		}
-		m_float_emit.FCVTL(64, VS, VS);
 	}
 	else
 	{
@@ -86,7 +86,10 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 		BLR(X30);
 
 		VS = fpr.RW(inst.RS, REG_REG);
-		m_float_emit.FCVTL(64, VS, D0);
+		if (!inst.W)
+			m_float_emit.FCVTL(64, VS, D0);
+		else
+			m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), D0);
 	}
 
 	if (inst.W)