From c5381bae66a885fd11ce2b4e37119bb3a53d988e Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 31 Aug 2014 15:35:44 -0700 Subject: [PATCH] JIT: add "far code emitter" to reduce L1I cache pressure --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 43 +++++++++++-------- Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp | 7 +-- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 32 ++++++++++++++ 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 5c5d9f0558..e0b1fdffd7 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -132,8 +132,6 @@ ps_adds1 */ -static int CODE_SIZE = 1024*1024*32; - void Jit64::Init() { jo.optimizeStack = true; @@ -169,6 +167,7 @@ void Jit64::Init() trampolines.Init(); AllocCodeSpace(CODE_SIZE); + farcode.Init(js.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE); blocks.Init(); asm_routines.Init(); @@ -183,6 +182,7 @@ void Jit64::ClearCache() { blocks.Clear(); trampolines.ClearCodeSpace(); + farcode.ClearCodeSpace(); ClearCodeSpace(); } @@ -193,6 +193,7 @@ void Jit64::Shutdown() blocks.Shutdown(); trampolines.Shutdown(); asm_routines.Shutdown(); + farcode.Shutdown(); } // This is only called by FallBackToInterpreter() in this file. It will execute an instruction with the interpreter functions. 
@@ -372,7 +373,8 @@ void Jit64::Trace() void STACKALIGN Jit64::Jit(u32 em_address) { - if (GetSpaceLeft() < 0x10000 || blocks.IsFull() || SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache) + if (GetSpaceLeft() < 0x10000 || farcode.GetSpaceLeft() < 0x10000 || blocks.IsFull() || + SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache) { ClearCache(); } @@ -525,12 +527,13 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc { if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound) { - gpr.Flush(); - fpr.Flush(); - //This instruction uses FPU - needs to add FP exception bailout TEST(32, PPCSTATE(msr), Imm32(1 << 13)); // Test FP enabled bit - FixupBranch b1 = J_CC(CC_NZ, true); + FixupBranch b1 = J_CC(CC_Z, true); + SwitchToFarCode(); + SetJumpTarget(b1); + gpr.Flush(FLUSH_MAINTAIN_STATE); + fpr.Flush(FLUSH_MAINTAIN_STATE); // If a FPU exception occurs, the exception handler will read // from PC. Update PC with the latest value in case that happens. @@ -538,32 +541,34 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc OR(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE)); WriteExceptionExit(); - SetJumpTarget(b1); - + SwitchToNearCode(); js.firstFPInstructionFound = true; } // Add an external exception check if the instruction writes to the FIFO. 
if (jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end()) { - gpr.Flush(); - fpr.Flush(); - TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_ISI | EXCEPTION_PROGRAM | EXCEPTION_SYSCALL | EXCEPTION_FPU_UNAVAILABLE | EXCEPTION_DSI | EXCEPTION_ALIGNMENT)); - FixupBranch clearInt = J_CC(CC_NZ, true); + FixupBranch clearInt = J_CC(CC_NZ); TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_EXTERNAL_INT)); - FixupBranch noExtException = J_CC(CC_Z, true); + FixupBranch extException = J_CC(CC_NZ, true); + SwitchToFarCode(); + SetJumpTarget(extException); TEST(32, PPCSTATE(msr), Imm32(0x0008000)); FixupBranch noExtIntEnable = J_CC(CC_Z, true); TEST(32, M((void *)&ProcessorInterface::m_InterruptCause), Imm32(ProcessorInterface::INT_CAUSE_CP | ProcessorInterface::INT_CAUSE_PE_TOKEN | ProcessorInterface::INT_CAUSE_PE_FINISH)); FixupBranch noCPInt = J_CC(CC_Z, true); + gpr.Flush(FLUSH_MAINTAIN_STATE); + fpr.Flush(FLUSH_MAINTAIN_STATE); + MOV(32, PPCSTATE(pc), Imm32(ops[i].address)); WriteExternalExceptionExit(); + SwitchToNearCode(); + SetJumpTarget(noCPInt); SetJumpTarget(noExtIntEnable); - SetJumpTarget(noExtException); SetJumpTarget(clearInt); } @@ -585,9 +590,11 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc if (js.memcheck && (opinfo->flags & FL_LOADSTORE)) { - // In case we are about to jump to the dispatcher, flush regs TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); - FixupBranch noMemException = J_CC(CC_Z, true); + FixupBranch memException = J_CC(CC_NZ, true); + + SwitchToFarCode(); + SetJumpTarget(memException); gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); @@ -596,7 +603,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc // from PC. Update PC with the latest value in case that happens. 
MOV(32, PPCSTATE(pc), Imm32(ops[i].address)); WriteExceptionExit(); - SetJumpTarget(noMemException); + SwitchToNearCode(); } if (opinfo->flags & FL_LOADSTORE) diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp index 077f4b7a1f..da8612c046 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp @@ -241,8 +241,6 @@ namespace JitILProfiler } }; -static int CODE_SIZE = 1024*1024*32; - void JitIL::Init() { jo.optimizeStack = true; @@ -273,6 +271,7 @@ void JitIL::Init() trampolines.Init(); AllocCodeSpace(CODE_SIZE); + farcode.Init(js.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE); blocks.Init(); asm_routines.Init(); @@ -306,6 +305,7 @@ void JitIL::Shutdown() blocks.Shutdown(); trampolines.Shutdown(); asm_routines.Shutdown(); + farcode.Shutdown(); } @@ -504,7 +504,8 @@ void JitIL::Trace() void STACKALIGN JitIL::Jit(u32 em_address) { - if (GetSpaceLeft() < 0x10000 || blocks.IsFull() || SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache) + if (GetSpaceLeft() < 0x10000 || farcode.GetSpaceLeft() < 0x10000 || blocks.IsFull() || + SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache) { ClearCache(); } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 164ef03f0f..1eab197fda 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -32,10 +32,42 @@ namespace MMIO { class Mapping; } #define PPCSTATE_SRR0 PPCSTATE(spr[SPR_SRR0]) #define PPCSTATE_SRR1 PPCSTATE(spr[SPR_SRR1]) +// Separate code space for blocks of code we don't want polluting the instruction cache, e.g. rarely taken +// exception branches. Init() allocates the space with the given size and Shutdown() frees it. +class FarCodeCache : public Gen::X64CodeBlock +{ +public: + void Init(int size) { AllocCodeSpace(size); } + void Shutdown() { FreeCodeSpace(); } +}; + // Like XCodeBlock but has some utilities for memory access. 
class EmuCodeBlock : public Gen::X64CodeBlock { public: + static const int CODE_SIZE = 1024 * 1024 * 32; + + // A bit of a hack: with the MMU on, vastly more code (mostly exception handling) ends up in the + // far cache, so FARCODE_SIZE_MMU gives it much more space than the default FARCODE_SIZE. + static const int FARCODE_SIZE = 1024 * 1024 * 8; + static const int FARCODE_SIZE_MMU = 1024 * 1024 * 48; + + FarCodeCache farcode; + u8* nearcode; // Near-code write pointer: saved by SwitchToFarCode(), restored by SwitchToNearCode(). + + // Switch emission between near and far code. Only one near pointer is saved, so a second SwitchToFarCode() before SwitchToNearCode() overwrites it. + void SwitchToFarCode() + { + nearcode = GetWritableCodePtr(); + SetCodePtr(farcode.GetWritableCodePtr()); + } + + void SwitchToNearCode() + { + farcode.SetCodePtr(GetWritableCodePtr()); + SetCodePtr(nearcode); + } + void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src); void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);