From 103e73030fcb882ba3bacc36d5f30480c12df73e Mon Sep 17 00:00:00 2001 From: skidau Date: Sat, 28 Apr 2012 20:42:45 +1000 Subject: [PATCH 1/3] Added preliminary support for the Gekko CPU Performance Monitor. Fixes Harry Potter and the Prisoner of Azkaban. --- Source/Core/Core/Src/PowerPC/Gekko.h | 95 ++++++++++++++----- Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp | 11 +++ .../Src/PowerPC/Jit64/Jit_SystemRegisters.cpp | 4 + .../Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp | 11 +++ .../Core/Core/Src/PowerPC/JitCommon/JitBase.h | 2 + Source/Core/Core/Src/PowerPC/PowerPC.cpp | 73 ++++++++++++++ Source/Core/Core/Src/PowerPC/PowerPC.h | 4 + 7 files changed, 175 insertions(+), 25 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/Gekko.h b/Source/Core/Core/Src/PowerPC/Gekko.h index 1ce6fae4cd..8615acd5ed 100644 --- a/Source/Core/Core/Src/PowerPC/Gekko.h +++ b/Source/Core/Core/Src/PowerPC/Gekko.h @@ -181,7 +181,7 @@ union UGeckoInstruction u32 : 11; u32 CRBB : 5; u32 CRBA : 5; - u32 CRBD : 5; + u32 CRBD : 5; u32 : 6; }; @@ -235,9 +235,9 @@ union UGeckoInstruction }; struct { - u32 : 17; - u32 FM : 8; - u32 : 7; + u32 : 17; + u32 FM : 8; + u32 : 7; }; // paired @@ -247,8 +247,8 @@ union UGeckoInstruction u32 Ix : 3; u32 Wx : 1; u32 : 1; - u32 I : 3; - u32 W : 1; + u32 I : 3; + u32 W : 1; u32 : 16; }; @@ -319,7 +319,7 @@ union UReg_XER u32 Hex; UReg_XER(u32 _hex) { Hex = _hex; } - UReg_XER() { Hex = 0; } + UReg_XER() { Hex = 0; } }; // Machine State Register @@ -351,7 +351,7 @@ union UReg_MSR u32 Hex; UReg_MSR(u32 _hex) { Hex = _hex; } - UReg_MSR() { Hex = 0; } + UReg_MSR() { Hex = 0; } }; // Floating Point Status and Control Register @@ -487,13 +487,47 @@ union UReg_SPR1 u32 Hex; struct { - u32 htaborg : 16; - u32 : 7; - u32 htabmask : 9; + u32 htaborg : 16; + u32 : 7; + u32 htabmask : 9; }; }; +// MMCR0 - Monitor Mode Control Register 0 format +union UReg_MMCR0 +{ + u32 Hex; + struct + { + u32 PMC2SELECT : 6; + u32 PMC1SELECT : 7; + u32 PMCTRIGGER : 1; + u32 PMCINTCONTROL : 
1; + u32 PMC1INTCONTROL : 1; + u32 THRESHOLD : 6; + u32 INTONBITTRANS : 1; + u32 RTCSELECT : 2; + u32 DISCOUNT : 1; + u32 ENINT : 1; + u32 DMR : 1; + u32 DMS : 1; + u32 DU : 1; + u32 DP : 1; + u32 DIS : 1; + }; +}; +// MMCR1 - Monitor Mode Control Register 1 format +union UReg_MMCR1 +{ + u32 Hex; + struct + { + u32 : 22; + u32 PMC4SELECT : 5; + u32 PMC3SELECT : 5; + }; +}; // Write Pipe Address Register union UReg_WPAR @@ -516,7 +550,7 @@ union UReg_DMAU struct { u32 DMA_LEN_U : 5; - u32 MEM_ADDR : 27; + u32 MEM_ADDR : 27; }; u32 Hex; @@ -533,7 +567,7 @@ union UReg_DMAL u32 DMA_T : 1; u32 DMA_LEN_L : 2; u32 DMA_LD : 1; - u32 LC_ADDR : 27; + u32 LC_ADDR : 27; }; u32 Hex; @@ -545,11 +579,11 @@ union UReg_BAT_Up { struct { - u32 VP : 1; - u32 VS : 1; + u32 VP : 1; + u32 VS : 1; u32 BL : 11; // Block length (aka block size mask) u32 : 4; - u32 BEPI : 15; + u32 BEPI : 15; }; u32 Hex; @@ -561,8 +595,8 @@ union UReg_BAT_Lo { struct { - u32 PP : 2; - u32 : 1; + u32 PP : 2; + u32 : 1; u32 WIMG : 4; u32 : 10; u32 BRPN : 15; // Physical Block Number @@ -586,7 +620,7 @@ union UReg_PTE u64 WIMG : 4; u64 C : 1; u64 R : 1; - u64 : 3; + u64 : 3; u64 RPN : 20; }; @@ -623,16 +657,16 @@ enum // Special purpose register indices enum { - SPR_XER = 1, + SPR_XER = 1, SPR_LR = 8, SPR_CTR = 9, SPR_DSISR = 18, - SPR_DAR = 19, + SPR_DAR = 19, SPR_DEC = 22, - SPR_SDR = 25, + SPR_SDR = 25, SPR_SRR0 = 26, SPR_SRR1 = 27, - SPR_TL = 268, + SPR_TL = 268, SPR_TU = 269, SPR_TL_W = 284, SPR_TU_W = 285, @@ -669,7 +703,17 @@ enum SPR_ECID_U = 924, SPR_ECID_M = 925, SPR_ECID_L = 926, - SPR_L2CR = 1017 + SPR_L2CR = 1017, + + SPR_UMMCR0 = 936, + SPR_MMCR0 = 952, + SPR_PMC1 = 953, + SPR_PMC2 = 954, + + SPR_UMMCR1 = 940, + SPR_MMCR1 = 956, + SPR_PMC3 = 957, + SPR_PMC4 = 958, }; // Exceptions @@ -679,8 +723,9 @@ enum #define EXCEPTION_DSI 0x00000008 #define EXCEPTION_ISI 0x00000010 #define EXCEPTION_ALIGNMENT 0x00000020 -#define EXCEPTION_FPU_UNAVAILABLE 0x00000040 +#define EXCEPTION_FPU_UNAVAILABLE 
0x00000040 #define EXCEPTION_PROGRAM 0x00000080 +#define EXCEPTION_PERFORMANCE_MONITOR 0x00000100 inline s32 SignExt16(s16 x) {return (s32)(s16)x;} inline s32 SignExt26(u32 x) {return x & 0x2000000 ? (s32)(x | 0xFC000000) : (s32)(x);} diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp index bd72d1e79c..5db67c2cd7 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp @@ -295,6 +295,11 @@ void Jit64::Cleanup() { if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); + + CMP(32, M(&MMCR0), Imm32(0)); + FixupBranch mmcr0 = J_CC(CC_Z); + ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst); + SetJumpTarget(mmcr0); } void Jit64::WriteExit(u32 destination, int exit_num) @@ -626,6 +631,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc WriteExceptionExit(); SetJumpTarget(noMemException); } + + if (opinfo->flags & FL_LOADSTORE) + ++jit->js.numLoadStoreInst; + + if (opinfo->flags & FL_USE_FPU) + ++jit->js.numFloatingPointInst; } #if defined(_DEBUG) || defined(DEBUGFAST) diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp index 52c2ef3eee..833a623c49 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -93,6 +93,10 @@ void Jit64::mfspr(UGeckoInstruction inst) case SPR_DEC: case SPR_TL: case SPR_TU: + case SPR_PMC1: + case SPR_PMC2: + case SPR_PMC3: + case SPR_PMC4: Default(inst); return; default: diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp index d462ff8a35..c944522c4d 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp +++ 
b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp @@ -390,6 +390,11 @@ void JitIL::Cleanup() { if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); + + CMP(32, M(&MMCR0), Imm32(0)); + FixupBranch mmcr0 = J_CC(CC_Z); + ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst); + SetJumpTarget(mmcr0); } void JitIL::WriteExit(u32 destination, int exit_num) @@ -663,6 +668,12 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc { ibuild.EmitFPExceptionCheckEnd(ibuild.EmitIntConst(ops[i].address)); } + + if (opinfo->flags & FL_LOADSTORE) + ++jit->js.numLoadStoreInst; + + if (opinfo->flags & FL_USE_FPU) + ++jit->js.numFloatingPointInst; } } diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/Src/PowerPC/JitCommon/JitBase.h index a953a820be..05c10c6caa 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitBase.h @@ -57,6 +57,8 @@ protected: int blockSize; int instructionNumber; int downcountAmount; + u32 numLoadStoreInst; + u32 numFloatingPointInst; bool isLastInstruction; bool forceUnsafeLoad; diff --git a/Source/Core/Core/Src/PowerPC/PowerPC.cpp b/Source/Core/Core/Src/PowerPC/PowerPC.cpp index 2ae520faab..4a7b7e11ac 100644 --- a/Source/Core/Core/Src/PowerPC/PowerPC.cpp +++ b/Source/Core/Core/Src/PowerPC/PowerPC.cpp @@ -289,6 +289,68 @@ void Stop() Host_UpdateDisasmDialog(); } +void UpdatePerformanceMonitor(u32 cycles, u32 num_load_stores, u32 num_fp_inst) +{ + switch (MMCR0.PMC1SELECT) + { + case 0: // No change + break; + case 1: // Processor cycles + PowerPC::ppcState.spr[SPR_PMC1] += cycles; + break; + default: + break; + } + + switch (MMCR0.PMC2SELECT) + { + case 0: // No change + break; + case 1: // Processor cycles + PowerPC::ppcState.spr[SPR_PMC2] += cycles; + break; + case 11: // Number of loads and stores completed 
+ PowerPC::ppcState.spr[SPR_PMC2] += num_load_stores; + break; + default: + break; + } + + switch (MMCR1.PMC3SELECT) + { + case 0: // No change + break; + case 1: // Processor cycles + PowerPC::ppcState.spr[SPR_PMC3] += cycles; + break; + case 11: // Number of FPU instructions completed + PowerPC::ppcState.spr[SPR_PMC3] += num_fp_inst; + break; + default: + break; + } + + switch (MMCR1.PMC4SELECT) + { + case 0: // No change + break; + case 1: // Processor cycles + PowerPC::ppcState.spr[SPR_PMC4] += cycles; + break; + default: + break; + } + + if (MMCR0.PMC1INTCONTROL && (PowerPC::ppcState.spr[SPR_PMC1] & 0x80000000) != 0) // 0x80000000 = bit 31, the counter's sign bit + PowerPC::ppcState.Exceptions |= EXCEPTION_PERFORMANCE_MONITOR; + if (MMCR0.PMCINTCONTROL && (PowerPC::ppcState.spr[SPR_PMC2] & 0x80000000) != 0) + PowerPC::ppcState.Exceptions |= EXCEPTION_PERFORMANCE_MONITOR; + if (MMCR0.PMCINTCONTROL && (PowerPC::ppcState.spr[SPR_PMC3] & 0x80000000) != 0) + PowerPC::ppcState.Exceptions |= EXCEPTION_PERFORMANCE_MONITOR; + if (MMCR0.PMCINTCONTROL && (PowerPC::ppcState.spr[SPR_PMC4] & 0x80000000) != 0) + PowerPC::ppcState.Exceptions |= EXCEPTION_PERFORMANCE_MONITOR; +} + void CheckExceptions() { // Make sure we are checking against the latest EXI status.
This is required @@ -405,6 +467,17 @@ void CheckExceptions() _dbg_assert_msg_(POWERPC, (SRR1 & 0x02) != 0, "EXTERNAL_INT unrecoverable???"); } + else if (exceptions & EXCEPTION_PERFORMANCE_MONITOR) + { + SRR0 = NPC; + SRR1 = MSR & 0x87C0FFFF; + MSR |= (MSR >> 16) & 1; + MSR &= ~0x04EF36; + NPC = 0x80000F00; + + INFO_LOG(POWERPC, "EXCEPTION_PERFORMANCE_MONITOR"); + Common::AtomicAnd(ppcState.Exceptions, ~EXCEPTION_PERFORMANCE_MONITOR); + } else if (exceptions & EXCEPTION_DECREMENTER) { SRR0 = NPC; diff --git a/Source/Core/Core/Src/PowerPC/PowerPC.h b/Source/Core/Core/Src/PowerPC/PowerPC.h index 54690ac6b5..f72e873779 100644 --- a/Source/Core/Core/Src/PowerPC/PowerPC.h +++ b/Source/Core/Core/Src/PowerPC/PowerPC.h @@ -116,12 +116,16 @@ void ExpandCR(); void OnIdle(u32 _uThreadAddr); void OnIdleIL(); +void UpdatePerformanceMonitor(u32 cycles, u32 num_load_stores, u32 num_fp_inst); + // Easy register access macros. #define HID0 ((UReg_HID0&)PowerPC::ppcState.spr[SPR_HID0]) #define HID2 ((UReg_HID2&)PowerPC::ppcState.spr[SPR_HID2]) #define HID4 ((UReg_HID4&)PowerPC::ppcState.spr[SPR_HID4]) #define DMAU (*(UReg_DMAU*)&PowerPC::ppcState.spr[SPR_DMAU]) #define DMAL (*(UReg_DMAL*)&PowerPC::ppcState.spr[SPR_DMAL]) +#define MMCR0 ((UReg_MMCR0&)PowerPC::ppcState.spr[SPR_MMCR0]) +#define MMCR1 ((UReg_MMCR1&)PowerPC::ppcState.spr[SPR_MMCR1]) #define PC PowerPC::ppcState.pc #define NPC PowerPC::ppcState.npc #define FPSCR ((UReg_FPSCR&)PowerPC::ppcState.fpscr) From 853d12b42c19a4febef0fa19f4d88ae7bf5f0367 Mon Sep 17 00:00:00 2001 From: skidau Date: Sat, 28 Apr 2012 22:47:55 +1000 Subject: [PATCH 2/3] Changed the performance monitor check to a compile time check instead of run-time. 
--- Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp | 7 +++---- Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp index 5db67c2cd7..57fd1e4366 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp @@ -296,10 +296,9 @@ void Jit64::Cleanup() if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); - CMP(32, M(&MMCR0), Imm32(0)); - FixupBranch mmcr0 = J_CC(CC_Z); - ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst); - SetJumpTarget(mmcr0); + // SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time. + if (MMCR0.Hex || MMCR1.Hex) + ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst); } void Jit64::WriteExit(u32 destination, int exit_num) diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp index c944522c4d..34608c7b46 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp @@ -391,10 +391,9 @@ void JitIL::Cleanup() if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); - CMP(32, M(&MMCR0), Imm32(0)); - FixupBranch mmcr0 = J_CC(CC_Z); - ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst); - SetJumpTarget(mmcr0); + // SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time. 
+ if (MMCR0.Hex || MMCR1.Hex) + ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst); } void JitIL::WriteExit(u32 destination, int exit_num) From 15d3c451598d6dcde0d2a2e8b88ef9ba42ca8221 Mon Sep 17 00:00:00 2001 From: skidau Date: Sun, 29 Apr 2012 00:10:20 +1000 Subject: [PATCH 3/3] Reset the performance counters at the start of each block. --- Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp | 2 ++ Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp index 57fd1e4366..037cec634a 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp @@ -445,6 +445,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.curBlock = b; js.block_flags = 0; js.cancel = false; + jit->js.numLoadStoreInst = 0; + jit->js.numFloatingPointInst = 0; // Analyze the block, collect all instructions it is made of (including inlining, // if that is enabled), reorder instructions for optimal performance, and join joinable instructions. diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp index 34608c7b46..2447bc3ac0 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp @@ -548,6 +548,8 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.fifoBytesThisBlock = 0; js.curBlock = b; js.cancel = false; + jit->js.numLoadStoreInst = 0; + jit->js.numFloatingPointInst = 0; // Analyze the block, collect all instructions it is made of (including inlining, // if that is enabled), reorder instructions for optimal performance, and join joinable instructions.