From 816d056657885e869a1bd550a88fe92e52533d92 Mon Sep 17 00:00:00 2001 From: Fiora Date: Tue, 19 Aug 2014 21:52:09 -0700 Subject: [PATCH 1/2] JIT: implement timer support in mfspr Faster, of course, since we avoid the interpreter, but it also means we can get a more accurate timer in long blocks by adding the offset from the start of the block to the retrieved timer. I don't know if this will actually fix any issues, but it's more correct and a nearly-free improvement. --- Source/Core/Core/HW/SystemTimers.cpp | 25 ----------------- Source/Core/Core/HW/SystemTimers.h | 25 +++++++++++++++++ .../PowerPC/Jit64/Jit_SystemRegisters.cpp | 27 ++++++++++++++++--- 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/Source/Core/Core/HW/SystemTimers.cpp b/Source/Core/Core/HW/SystemTimers.cpp index b82ec4c4ee..53452a8d65 100644 --- a/Source/Core/Core/HW/SystemTimers.cpp +++ b/Source/Core/Core/HW/SystemTimers.cpp @@ -70,31 +70,6 @@ namespace SystemTimers static u32 CPU_CORE_CLOCK = 486000000u; // 486 mhz (its not 485, stop bugging me!) -/* -GameCube MHz -flipper <-> ARAM bus: 81 (DSP) -gekko <-> flipper bus: 162 -flipper <-> 1T-SRAM bus: 324 -gekko: 486 - -These contain some guesses: -Wii MHz -hollywood <-> GDDR3 RAM bus: ??? no idea really -broadway <-> hollywood bus: 243 -hollywood <-> 1T-SRAM bus: 486 -broadway: 729 -*/ -// Ratio of TB and Decrementer to clock cycles. -// TB clk is 1/4 of BUS clk. And it seems BUS clk is really 1/3 of CPU clk. -// So, ratio is 1 / (1/4 * 1/3 = 1/12) = 12. -// note: ZWW is ok and faster with TIMER_RATIO=8 though. -// !!! POSSIBLE STABLE PERF BOOST HACK THERE !!! 
- -enum -{ - TIMER_RATIO = 12 -}; - static int et_Dec; static int et_VI; static int et_SI; diff --git a/Source/Core/Core/HW/SystemTimers.h b/Source/Core/Core/HW/SystemTimers.h index e0f6d3447f..a98d8085bf 100644 --- a/Source/Core/Core/HW/SystemTimers.h +++ b/Source/Core/Core/HW/SystemTimers.h @@ -9,6 +9,31 @@ namespace SystemTimers { +/* +GameCube MHz +flipper <-> ARAM bus: 81 (DSP) +gekko <-> flipper bus: 162 +flipper <-> 1T-SRAM bus: 324 +gekko: 486 + +These contain some guesses: +Wii MHz +hollywood <-> GDDR3 RAM bus: ??? no idea really +broadway <-> hollywood bus: 243 +hollywood <-> 1T-SRAM bus: 486 +broadway: 729 +*/ +// Ratio of TB and Decrementer to clock cycles. +// TB clk is 1/4 of BUS clk. And it seems BUS clk is really 1/3 of CPU clk. +// So, ratio is 1 / (1/4 * 1/3 = 1/12) = 12. +// note: ZWW is ok and faster with TIMER_RATIO=8 though. +// !!! POSSIBLE STABLE PERF BOOST HACK THERE !!! + +enum +{ + TIMER_RATIO = 12 +}; + u32 GetTicksPerSecond(); void PreInit(); void Init(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index 703341ba8a..3005970453 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -166,10 +166,31 @@ void Jit64::mfspr(UGeckoInstruction inst) int d = inst.RD; switch (iIndex) { - case SPR_WPAR: - case SPR_DEC: case SPR_TL: case SPR_TU: + { + // TODO: we really only need to call GetFakeTimeBase once per JIT block; this matters because + // typical use of this instruction is to call it three times, e.g. mftbu/mftbl/mftbu/cmpw/bne + // to deal with possible timer wraparound. This makes the second two (out of three) completely + // redundant for the JIT. 
+ u32 registersInUse = CallerSavedRegistersInUse(); + u32 offset = js.downcountAmount / SystemTimers::TIMER_RATIO; + ABI_PushRegistersAndAdjustStack(registersInUse, false); + ABI_CallFunction((void *)&SystemTimers::GetFakeTimeBase); + ABI_PopRegistersAndAdjustStack(registersInUse, false); + // The timer can change within a long block, so add in any difference + if (offset > 0) + ADD(64, R(RAX), Imm32(offset)); + MOV(64, M(&TL), R(RAX)); + gpr.Lock(d); + gpr.BindToRegister(d, false); + if (iIndex == SPR_TU) + SHR(64, R(RAX), Imm8(32)); + MOV(32, gpr.R(d), R(EAX)); + break; + } + case SPR_WPAR: + case SPR_DEC: case SPR_PMC1: case SPR_PMC2: case SPR_PMC3: @@ -179,9 +200,9 @@ void Jit64::mfspr(UGeckoInstruction inst) gpr.Lock(d); gpr.BindToRegister(d, false); MOV(32, gpr.R(d), M(&PowerPC::ppcState.spr[iIndex])); - gpr.UnlockAll(); break; } + gpr.UnlockAll(); } void Jit64::mtmsr(UGeckoInstruction inst) From 6875d911f11ae8a3fee8f8fdc34844d9bc5c2018 Mon Sep 17 00:00:00 2001 From: Fiora Date: Wed, 20 Aug 2014 00:42:04 -0700 Subject: [PATCH 2/2] JIT: merge paired timebase reads where possible Combined with the previous patch, ~1% faster overall on F-Zero GX. --- .../PowerPC/Jit64/Jit_SystemRegisters.cpp | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index 3005970453..af2fe33bb8 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -182,11 +182,33 @@ void Jit64::mfspr(UGeckoInstruction inst) if (offset > 0) ADD(64, R(RAX), Imm32(offset)); MOV(64, M(&TL), R(RAX)); - gpr.Lock(d); - gpr.BindToRegister(d, false); - if (iIndex == SPR_TU) + // Two calls of TU/TL next to each other are extremely common in typical usage, so merge them + // if we can. 
+ u32 nextIndex = (js.next_inst.SPRU << 5) | (js.next_inst.SPRL & 0x1F); + // Be careful; the actual opcode is for mftb (371), not mfspr (339) + if (js.next_inst.OPCD == 31 && js.next_inst.SUBOP10 == 371 && (nextIndex == SPR_TU || nextIndex == SPR_TL)) + { + int n = js.next_inst.RD; + js.downcountAmount++; + js.skipnext = true; + gpr.Lock(d, n); + if (iIndex == SPR_TL) + MOV(32, gpr.R(d), R(EAX)); + if (nextIndex == SPR_TL) + MOV(32, gpr.R(n), R(EAX)); SHR(64, R(RAX), Imm8(32)); - MOV(32, gpr.R(d), R(EAX)); + if (iIndex == SPR_TU) + MOV(32, gpr.R(d), R(EAX)); + if (nextIndex == SPR_TU) + MOV(32, gpr.R(n), R(EAX)); + } + else + { + gpr.Lock(d); + if (iIndex == SPR_TU) + SHR(64, R(RAX), Imm8(32)); + MOV(32, gpr.R(d), R(EAX)); + } break; } case SPR_WPAR: