From cbc66f9467ecb1ac2d555e3fa844b858ccfa0d00 Mon Sep 17 00:00:00 2001 From: "dok.slade" Date: Sun, 25 Jul 2010 15:37:56 +0000 Subject: [PATCH] More performance work on the PowerPC JIT compiler: * Merge some loops in PPCAnalyst::Flatten() * Put ppcState.cr and ppcState.cr_fast[] into a single C++ union. This allows quick access to the whole CR register without needing to merge the cr_fast array. The implemented solution assumes the host system is little-endian, but that already seems to be assumed in many places in the code... * Inline the call to computeRc: it now costs a few more memory bytes per JITed instruction but it removes the CPU overhead of the CALL. This also made it possible to remove some unneeded MOVs. * Jit64::GenerateCarry() doesn't need a temporary register anymore * Fix what seems to be a bug in PowerPC instruction RFI: the wrong bit was cleared in MSR git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5970 8ced0084-cf51-0410-be5f-012b33b47a6e --- .../Interpreter/Interpreter_Branch.cpp | 2 +- Source/Core/Core/Src/PowerPC/Jit64/Jit.h | 3 +- Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp | 29 -- .../Core/Src/PowerPC/Jit64/Jit_Branch.cpp | 12 +- .../Core/Src/PowerPC/Jit64/Jit_Integer.cpp | 123 ++++----- .../Src/PowerPC/Jit64/Jit_SystemRegisters.cpp | 36 +-- .../Core/Src/PowerPC/Jit64IL/JitILAsm.cpp | 14 - .../Core/Src/PowerPC/JitCommon/JitAsmCommon.h | 1 - Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp | 252 ++++++++---------- Source/Core/Core/Src/PowerPC/PPCAnalyst.h | 14 + Source/Core/Core/Src/PowerPC/PowerPC.cpp | 20 +- Source/Core/Core/Src/PowerPC/PowerPC.h | 16 +- 12 files changed, 211 insertions(+), 311 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Branch.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Branch.cpp index 213f84c9c3..0141482abc 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Branch.cpp +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Branch.cpp @@ -117,7 +117,7 @@ void 
rfi(UGeckoInstruction _inst) const int mask = 0x87C0FFFF; MSR = (MSR & ~mask) | (SRR1 & mask); //MSR[13] is set to 0. - MSR &= 0xFFFDFFFF; + MSR &= 0xFFFBFFFF; // Here we should check if there are pending exceptions, and if their corresponding enable bits are set // if above is true, we'd do: //PowerPC::CheckExceptions(); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h index cbc0387243..5e3161cb76 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h @@ -136,7 +136,8 @@ public: void WriteCallInterpreter(UGeckoInstruction _inst); void Cleanup(); - void GenerateCarry(Gen::X64Reg temp_reg); + void GenerateCarry(); + void ComputeRC(const Gen::OpArg & arg); void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg)); typedef u32 (*Operation)(u32 a, u32 b); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp index adc8dc6d75..67e8145999 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp @@ -222,35 +222,6 @@ void Jit64AsmRoutineManager::Generate() void Jit64AsmRoutineManager::GenerateCommon() { - // USES_CR - computeRc = AlignCode16(); - CMP(32, R(EAX), Imm8(0)); - FixupBranch pLesser = J_CC(CC_L); - FixupBranch pGreater = J_CC(CC_G); - MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0 - FixupBranch continue1 = J(); - - SetJumpTarget(pGreater); - MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0 - FixupBranch continue2 = J(); - - SetJumpTarget(pLesser); - MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0 - - SetJumpTarget(continue1); - SetJumpTarget(continue2); - - // cr[0] |= SPR_XER & 1 - - /*MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER])); - - AND(32, R(EAX), Imm32(1)); - - MOVSX(32, 8, ECX, M(&PowerPC::ppcState.cr_fast[0])); - OR(32, R(ECX), R(EAX)); - MOV(8, 
M(&PowerPC::ppcState.cr_fast[0]), R(ECX));*/ - RET(); - fifoDirectWrite8 = AlignCode4(); GenFifoWrite(8); fifoDirectWrite16 = AlignCode4(); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp index dd2fd15ff7..292caeda39 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp @@ -66,7 +66,7 @@ void Jit64::rfi(UGeckoInstruction inst) AND(32, R(ECX), Imm32(mask)); OR(32, R(EAX), R(ECX)); // MSR &= 0xFFFDFFFF; //TODO: VERIFY - AND(32, R(EAX), Imm32(0xFFFDFFFF)); + AND(32, R(EAX), Imm32(0xFFFBFFFF)); MOV(32, M(&MSR), R(EAX)); // NPC = SRR0; MOV(32, R(EAX), M(&SRR0)); @@ -246,7 +246,7 @@ void Jit64::bcctrx(UGeckoInstruction inst) MOV(32, R(EAX), Imm32(js.compilerPC + 4)); FixupBranch b = J_CC(branch, false); MOV(32, R(EAX), M(&CTR)); - MOV(32, M(&PC), R(EAX)); + //MOV(32, M(&PC), R(EAX)); => Already done in WriteExitDestInEAX() if (inst.LK_3) MOV(32, M(&LR), Imm32(js.compilerPC + 4)); // LR = PC + 4; // Would really like to continue the block here, but it ends. TODO. @@ -274,7 +274,7 @@ void Jit64::bclrx(UGeckoInstruction inst) AND(32, M(&CR), Imm32(~(0xFF000000))); #endif MOV(32, R(EAX), M(&LR)); - MOV(32, M(&PC), R(EAX)); + //MOV(32, M(&PC), R(EAX)); => Already done in WriteExitDestInEAX() if (inst.LK_3) MOV(32, M(&LR), Imm32(js.compilerPC + 4)); // LR = PC + 4; WriteExitDestInEAX(0); @@ -295,15 +295,15 @@ void Jit64::bclrx(UGeckoInstruction inst) branch = CC_Z; else branch = CC_NZ; - MOV(32, R(EAX), Imm32(js.compilerPC + 4)); FixupBranch b = J_CC(branch, false); MOV(32, R(EAX), M(&LR)); - MOV(32, M(&PC), R(EAX)); + //MOV(32, M(&PC), R(EAX)); => Already done in WriteExitDestInEAX() if (inst.LK_3) MOV(32, M(&LR), Imm32(js.compilerPC + 4)); // LR = PC + 4; + WriteExitDestInEAX(0); // Would really like to continue the block here, but it ends. TODO. 
SetJumpTarget(b); - WriteExitDestInEAX(0); + WriteExit(js.compilerPC + 4, 1); return; } // Call interpreter diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp index 98e2df6474..ccc4c29293 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp @@ -25,12 +25,32 @@ #include "JitAsm.h" // Assumes that the flags were just set through an addition. -void Jit64::GenerateCarry(Gen::X64Reg temp_reg) { +void Jit64::GenerateCarry() { // USES_XER - SETcc(CC_C, R(temp_reg)); + FixupBranch pNoCarry = J_CC(CC_NC); + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(1 << 29)); + FixupBranch pContinue = J(); + SetJumpTarget(pNoCarry); AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~(1 << 29))); - SHL(32, R(temp_reg), Imm8(29)); - OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(temp_reg)); + SetJumpTarget(pContinue); +} + +void Jit64::ComputeRC(const Gen::OpArg & arg) { + CMP(32, arg, Imm8(0)); + FixupBranch pLesser = J_CC(CC_L); + FixupBranch pGreater = J_CC(CC_G); + MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0 + FixupBranch continue1 = J(); + + SetJumpTarget(pGreater); + MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0 + FixupBranch continue2 = J(); + + SetJumpTarget(pLesser); + MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0 + + SetJumpTarget(continue1); + SetJumpTarget(continue2); } u32 Add(u32 a, u32 b) {return a + b;} @@ -55,7 +75,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void gpr.LoadToX64(d, false); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; if (carry) - GenerateCarry(EAX); + GenerateCarry(); } } else @@ -64,7 +84,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void MOV(32, gpr.R(d), gpr.R(a)); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + 
_inst.SIMM_16; if (carry) - GenerateCarry(EAX); + GenerateCarry(); } } else if (doop == Add) @@ -81,8 +101,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void if (Rc) { // Todo - special case immediates. - MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(d)); } gpr.UnlockAll(); } @@ -281,8 +300,7 @@ void Jit64::orx(UGeckoInstruction inst) if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(a)); } } @@ -311,8 +329,7 @@ void Jit64::xorx(UGeckoInstruction inst) if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(a)); } } @@ -334,7 +351,7 @@ void Jit64::andx(UGeckoInstruction inst) if (inst.Rc) { // result is already in eax - CALL((u8*)asm_routines.computeRc); + ComputeRC(R(EAX)); } } @@ -351,8 +368,7 @@ void Jit64::extsbx(UGeckoInstruction inst) MOV(32, R(EAX), gpr.R(s)); MOVSX(32, 8, gpr.RX(a), R(AL)); // watch out for ah and friends if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(a)); } } @@ -368,8 +384,7 @@ void Jit64::extshx(UGeckoInstruction inst) // as the 32-bit register. 
MOVSX(32, 16, gpr.RX(a), gpr.R(s)); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(a)); } } @@ -378,7 +393,6 @@ void Jit64::subfic(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(Integer) int a = inst.RA, d = inst.RD; - gpr.FlushLockX(ECX); gpr.Lock(a, d); gpr.LoadToX64(d, a == d, true); int imm = inst.SIMM_16; @@ -386,9 +400,8 @@ void Jit64::subfic(UGeckoInstruction inst) NOT(32, R(EAX)); ADD(32, R(EAX), Imm32(imm + 1)); MOV(32, gpr.R(d), R(EAX)); - GenerateCarry(ECX); + GenerateCarry(); gpr.UnlockAll(); - gpr.UnlockAllX(); // This instruction has no RC flag } @@ -421,7 +434,7 @@ void Jit64::subfcx(UGeckoInstruction inst) gpr.UnlockAll(); if (inst.OE) PanicAlert("OE: subfcx"); if (inst.Rc) { - CALL((u8*)asm_routines.computeRc); + ComputeRC(R(EAX)); } } @@ -464,7 +477,7 @@ void Jit64::subfex(UGeckoInstruction inst) gpr.UnlockAllX(); if (inst.OE) PanicAlert("OE: subfex"); if (inst.Rc) { - CALL((u8*)asm_routines.computeRc); + ComputeRC(R(EAX)); } } @@ -486,7 +499,7 @@ void Jit64::subfx(UGeckoInstruction inst) if (inst.OE) PanicAlert("OE: subfx"); if (inst.Rc) { // result is already in eax - CALL((u8*)asm_routines.computeRc); + ComputeRC(R(EAX)); } } @@ -519,8 +532,7 @@ void Jit64::mullwx(UGeckoInstruction inst) } gpr.UnlockAll(); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(d)); } } @@ -543,14 +555,9 @@ void Jit64::mulhwux(UGeckoInstruction inst) MUL(32, gpr.R(b)); gpr.UnlockAll(); gpr.UnlockAllX(); - if (inst.Rc) { - MOV(32, R(EAX), R(EDX)); - MOV(32, gpr.R(d), R(EDX)); - // result is already in eax - CALL((u8*)asm_routines.computeRc); - } else { - MOV(32, gpr.R(d), R(EDX)); - } + MOV(32, gpr.R(d), R(EDX)); + if (inst.Rc) + ComputeRC(R(EDX)); } void Jit64::divwux(UGeckoInstruction inst) @@ -581,7 +588,7 @@ void Jit64::divwux(UGeckoInstruction inst) gpr.UnlockAll(); gpr.UnlockAllX(); if (inst.Rc) { - CALL((u8*)asm_routines.computeRc); + ComputeRC(R(EAX)); } 
} @@ -604,8 +611,7 @@ void Jit64::addx(UGeckoInstruction inst) } if (inst.Rc) { - MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(d)); } gpr.UnlockAll(); } @@ -616,8 +622,7 @@ void Jit64::addx(UGeckoInstruction inst) ADD(32, gpr.R(d), gpr.R(b)); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(d)); } gpr.UnlockAll(); } @@ -628,8 +633,7 @@ void Jit64::addx(UGeckoInstruction inst) ADD(32, gpr.R(d), gpr.R(a)); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(d)); } gpr.UnlockAll(); } @@ -640,8 +644,7 @@ void Jit64::addx(UGeckoInstruction inst) ADD(32, gpr.R(d), gpr.R(d)); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(d)); } gpr.UnlockAll(); } @@ -653,8 +656,7 @@ void Jit64::addx(UGeckoInstruction inst) ADD(32, gpr.R(d), gpr.R(d)); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(d)); } gpr.UnlockAll(); } @@ -671,7 +673,6 @@ void Jit64::addex(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(Integer) int a = inst.RA, b = inst.RB, d = inst.RD; - gpr.FlushLockX(ECX); gpr.Lock(a, b, d); if (d != a && d != b) gpr.LoadToX64(d, false); @@ -682,12 +683,11 @@ void Jit64::addex(UGeckoInstruction inst) MOV(32, R(EAX), gpr.R(a)); ADC(32, R(EAX), gpr.R(b)); MOV(32, gpr.R(d), R(EAX)); - GenerateCarry(ECX); + GenerateCarry(); gpr.UnlockAll(); - gpr.UnlockAllX(); if (inst.Rc) { - CALL((u8*)asm_routines.computeRc); + ComputeRC(R(EAX)); } } @@ -741,8 +741,7 @@ void Jit64::rlwinmx(UGeckoInstruction inst) if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(a)); } } @@ -775,8 +774,7 @@ void Jit64::rlwimix(UGeckoInstruction inst) gpr.UnlockAll(); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(a)); } } @@ -804,8 +802,7 @@ void Jit64::rlwnmx(UGeckoInstruction 
inst) gpr.UnlockAllX(); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(R(EAX)); } } @@ -823,8 +820,7 @@ void Jit64::negx(UGeckoInstruction inst) gpr.UnlockAll(); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(d)); } } @@ -850,8 +846,7 @@ void Jit64::srwx(UGeckoInstruction inst) gpr.UnlockAllX(); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(R(EAX)); } } @@ -877,8 +872,7 @@ void Jit64::slwx(UGeckoInstruction inst) gpr.UnlockAllX(); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(R(EAX)); } } @@ -922,8 +916,7 @@ void Jit64::srawx(UGeckoInstruction inst) gpr.UnlockAllX(); if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(a)); } } @@ -965,8 +958,7 @@ void Jit64::srawix(UGeckoInstruction inst) } if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(a)); } } @@ -993,8 +985,7 @@ void Jit64::cntlzwx(UGeckoInstruction inst) if (inst.Rc) { - MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)asm_routines.computeRc); + ComputeRC(gpr.R(a)); // TODO: Check PPC manual too } } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp index 40990a7f52..a32ccb90d6 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -145,13 +145,8 @@ void Jit64::mfcr(UGeckoInstruction inst) // USES_CR int d = inst.RD; gpr.LoadToX64(d, false, true); - MOV(8, R(EAX), M(&PowerPC::ppcState.cr_fast[0])); - SHL(32, R(EAX), Imm8(4)); - for (int i = 1; i < 7; i++) { - OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[i])); - SHL(32, R(EAX), Imm8(4)); - } - OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[7])); + MOV(32, R(EAX), M(&PowerPC::ppcState.cr_fast_u32)); + BSWAP(32, EAX); MOV(32, 
gpr.R(d), R(EAX)); } @@ -160,33 +155,22 @@ void Jit64::mtcrf(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(SystemRegisters) - // USES_CR - u32 mask = 0; u32 crm = inst.CRM; if (crm == 0xFF) { - gpr.FlushLockX(ECX); MOV(32, R(EAX), gpr.R(inst.RS)); - for (int i = 0; i < 8; i++) { - MOV(32, R(ECX), R(EAX)); - SHR(32, R(ECX), Imm8(28 - (i * 4))); - AND(32, R(ECX), Imm32(0xF)); - MOV(8, M(&PowerPC::ppcState.cr_fast[i]), R(ECX)); - } - gpr.UnlockAllX(); - } else { - Default(inst); - return; - - // TODO: translate this to work in new CR model. + BSWAP(32, EAX); + MOV(32, M(&PowerPC::ppcState.cr_fast_u32), R(EAX)); + } + else if (crm != 0) { + u32 mask = 0; for (int i = 0; i < 8; i++) { if (crm & (1 << i)) mask |= 0xF << (i*4); } + MOV(32, R(EAX), gpr.R(inst.RS)); - MOV(32, R(ECX), M(&PowerPC::ppcState.cr)); AND(32, R(EAX), Imm32(mask)); - AND(32, R(ECX), Imm32(~mask)); - OR(32, R(EAX), R(ECX)); - MOV(32, M(&PowerPC::ppcState.cr), R(EAX)); + BSWAP(32, EAX); + MOV(32, M(&PowerPC::ppcState.cr_fast_u32), R(EAX)); } } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitILAsm.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitILAsm.cpp index 820bfcd67d..2f3903ce89 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitILAsm.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitILAsm.cpp @@ -221,20 +221,6 @@ void JitILAsmRoutineManager::Generate() void JitILAsmRoutineManager::GenerateCommon() { - // USES_CR - computeRc = AlignCode16(); - CMP(32, R(EAX), Imm8(0)); - FixupBranch pLesser = J_CC(CC_L); - FixupBranch pGreater = J_CC(CC_G); - MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0 - RET(); - SetJumpTarget(pGreater); - MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0 - RET(); - SetJumpTarget(pLesser); - MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0 - RET(); - fifoDirectWrite8 = AlignCode4(); GenFifoWrite(8); fifoDirectWrite16 = AlignCode4(); diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.h 
b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.h index a00a31ddd9..65788fea3b 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.h +++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.h @@ -45,7 +45,6 @@ public: const u8 *dispatcherPcInEAX; const u8 *fpException; - const u8 *computeRc; const u8 *testExceptions; const u8 *dispatchPcInEAX; const u8 *doTiming; diff --git a/Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp index 2a00a18cf9..d9fc6f7a0f 100644 --- a/Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp @@ -296,6 +296,13 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc gpa->any = true; fpa->any = false; + for (int i = 0; i < 32; i++) + { + gpa->firstRead[i] = -1; + gpa->firstWrite[i] = -1; + gpa->numReads[i] = 0; + gpa->numWrites[i] = 0; + } u32 blockstart = address; int maxsize = blockSize; @@ -307,7 +314,8 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc CodeOp *code = buffer->codebuffer; bool foundExit = false; - // Flatten! (Currently just copies, following branches is disabled) + // Do analysis of the code, look for dependencies etc + int numSystemInstructions = 0; for (int i = 0; i < maxsize; i++) { num_inst++; @@ -326,6 +334,106 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc if (opinfo) numCycles += opinfo->numCyclesMinusOne + 1; _assert_msg_(POWERPC, opinfo != 0, "Invalid Op - Error flattening %08x op %08x", address + i*4, inst.hex); + + code[i].wantsCR0 = false; + code[i].wantsCR1 = false; + code[i].wantsPS1 = false; + + int flags = opinfo->flags; + + if (flags & FL_USE_FPU) + fpa->any = true; + + if (flags & FL_TIMER) + gpa->anyTimer = true; + + // Does the instruction output CR0? 
+ if (flags & FL_RC_BIT) + code[i].outputCR0 = inst.hex & 1; //todo fix + else if ((flags & FL_SET_CRn) && inst.CRFD == 0) + code[i].outputCR0 = true; + else + code[i].outputCR0 = (flags & FL_SET_CR0) ? true : false; + + // Does the instruction output CR1? + if (flags & FL_RC_BIT_F) + code[i].outputCR1 = inst.hex & 1; //todo fix + else if ((flags & FL_SET_CRn) && inst.CRFD == 1) + code[i].outputCR1 = true; + else + code[i].outputCR1 = (flags & FL_SET_CR1) ? true : false; + + int numOut = 0; + int numIn = 0; + if (flags & FL_OUT_A) + { + code[i].regsOut[numOut++] = inst.RA; + gpa->SetOutputRegister(inst.RA, i); + } + if (flags & FL_OUT_D) + { + code[i].regsOut[numOut++] = inst.RD; + gpa->SetOutputRegister(inst.RD, i); + } + if (flags & FL_OUT_S) + { + code[i].regsOut[numOut++] = inst.RS; + gpa->SetOutputRegister(inst.RS, i); + } + if ((flags & FL_IN_A) || ((flags & FL_IN_A0) && inst.RA != 0)) + { + code[i].regsIn[numIn++] = inst.RA; + gpa->SetInputRegister(inst.RA, i); + } + if (flags & FL_IN_B) + { + code[i].regsIn[numIn++] = inst.RB; + gpa->SetInputRegister(inst.RB, i); + } + if (flags & FL_IN_C) + { + code[i].regsIn[numIn++] = inst.RC; + gpa->SetInputRegister(inst.RC, i); + } + if (flags & FL_IN_S) + { + code[i].regsIn[numIn++] = inst.RS; + gpa->SetInputRegister(inst.RS, i); + } + + // Set remaining register slots as unused (-1) + for (int j = numIn; j < 3; j++) + code[i].regsIn[j] = -1; + for (int j = numOut; j < 2; j++) + code[i].regsOut[j] = -1; + for (int j = 0; j < 3; j++) + code[i].fregsIn[j] = -1; + code[i].fregOut = -1; + + switch (opinfo->type) + { + case OPTYPE_INTEGER: + case OPTYPE_LOAD: + case OPTYPE_STORE: + break; + case OPTYPE_FPU: + break; + case OPTYPE_LOADFP: + break; + case OPTYPE_BRANCH: + if (code[i].inst.hex == 0x4e800020) + { + // For analysis purposes, we can assume that blr eats flags. 
+ code[i].outputCR0 = true; + code[i].outputCR1 = true; + } + break; + case OPTYPE_SYSTEM: + case OPTYPE_SYSTEMFP: + numSystemInstructions++; + break; + } + bool follow = false; u32 destination; if (inst.OPCD == 18 && blockSize > 1) @@ -362,146 +470,6 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc NOTICE_LOG(POWERPC, "Analyzer ERROR - Function %08x too big, size is 0x%08x", blockstart, address-blockstart); st->numCycles = numCycles; - // Do analysis of the code, look for dependencies etc - int numSystemInstructions = 0; - for (int i = 0; i < 32; i++) - { - gpa->firstRead[i] = -1; - gpa->firstWrite[i] = -1; - gpa->numReads[i] = 0; - gpa->numWrites[i] = 0; - } - - gpa->any = true; - for (int i = 0; i < num_inst; i++) - { - UGeckoInstruction inst = code[i].inst; - - code[i].wantsCR0 = false; - code[i].wantsCR1 = false; - code[i].wantsPS1 = false; - - const GekkoOPInfo *opinfo = code[i].opinfo; - _assert_msg_(POWERPC, opinfo != 0, "Invalid Op - Error scanning %08x op %08x",address+i*4,inst.hex); - int flags = opinfo->flags; - - if (flags & FL_USE_FPU) - fpa->any = true; - - if (flags & FL_TIMER) - gpa->anyTimer = true; - - // Does the instruction output CR0? - if (flags & FL_RC_BIT) - code[i].outputCR0 = inst.hex & 1; //todo fix - else if ((flags & FL_SET_CRn) && inst.CRFD == 0) - code[i].outputCR0 = true; - else - code[i].outputCR0 = (flags & FL_SET_CR0) ? true : false; - - // Does the instruction output CR1? - if (flags & FL_RC_BIT_F) - code[i].outputCR1 = inst.hex & 1; //todo fix - else if ((flags & FL_SET_CRn) && inst.CRFD == 1) - code[i].outputCR1 = true; - else - code[i].outputCR1 = (flags & FL_SET_CR1) ? 
true : false; - - for (int j = 0; j < 3; j++) - { - code[i].fregsIn[j] = -1; - code[i].regsIn[j] = -1; - } - for (int j = 0; j < 2; j++) - code[i].regsOut[j] = -1; - - code[i].fregOut = -1; - - int numOut = 0; - int numIn = 0; - if (flags & FL_OUT_A) - { - code[i].regsOut[numOut++] = inst.RA; - gpa->numWrites[inst.RA]++; - } - if (flags & FL_OUT_D) - { - code[i].regsOut[numOut++] = inst.RD; - gpa->numWrites[inst.RD]++; - } - if (flags & FL_OUT_S) - { - code[i].regsOut[numOut++] = inst.RS; - gpa->numWrites[inst.RS]++; - } - if ((flags & FL_IN_A) || ((flags & FL_IN_A0) && inst.RA != 0)) - { - code[i].regsIn[numIn++] = inst.RA; - gpa->numReads[inst.RA]++; - } - if (flags & FL_IN_B) - { - code[i].regsIn[numIn++] = inst.RB; - gpa->numReads[inst.RB]++; - } - if (flags & FL_IN_C) - { - code[i].regsIn[numIn++] = inst.RC; - gpa->numReads[inst.RC]++; - } - if (flags & FL_IN_S) - { - code[i].regsIn[numIn++] = inst.RS; - gpa->numReads[inst.RS]++; - } - - switch (opinfo->type) - { - case OPTYPE_INTEGER: - case OPTYPE_LOAD: - case OPTYPE_STORE: - break; - case OPTYPE_FPU: - break; - case OPTYPE_LOADFP: - break; - case OPTYPE_BRANCH: - if (code[i].inst.hex == 0x4e800020) - { - // For analysis purposes, we can assume that blr eats flags. 
- code[i].outputCR0 = true; - code[i].outputCR1 = true; - } - break; - case OPTYPE_SYSTEM: - case OPTYPE_SYSTEMFP: - numSystemInstructions++; - break; - } - - for (int j = 0; j < numIn; j++) - { - int r = code[i].regsIn[j]; - if (r < 0 || r > 31) - PanicAlert("wtf"); - if (gpa->firstRead[r] == -1) - gpa->firstRead[r] = (short)(i); - gpa->lastRead[r] = (short)(i); - gpa->numReads[r]++; - } - - for (int j = 0; j < numOut; j++) - { - int r = code[i].regsOut[j]; - if (r < 0 || r > 31) - PanicAlert("wtf"); - if (gpa->firstWrite[r] == -1) - gpa->firstWrite[r] = (short)(i); - gpa->lastWrite[r] = (short)(i); - gpa->numWrites[r]++; - } - } - // Instruction Reordering Pass if (blockSize > 1) { @@ -530,7 +498,7 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc bool wantsCR0 = true; bool wantsCR1 = true; bool wantsPS1 = true; - for (int i = num_inst; i; i--) + for (int i = num_inst - 1; i >= 0; i--) { if (code[i].outputCR0) wantsCR0 = false; diff --git a/Source/Core/Core/Src/PowerPC/PPCAnalyst.h b/Source/Core/Core/Src/PowerPC/PPCAnalyst.h index 5a41f6fda7..d19774e3e8 100644 --- a/Source/Core/Core/Src/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/Src/PowerPC/PPCAnalyst.h @@ -77,6 +77,20 @@ struct BlockRegStats int GetUseRange(int reg) { return max(lastRead[reg], lastWrite[reg]) - min(firstRead[reg], firstWrite[reg]);} + + inline void SetInputRegister(int reg, short opindex) { + if (firstRead[reg] == -1) + firstRead[reg] = (short)(opindex); + lastRead[reg] = (short)(opindex); + numReads[reg]++; + } + + inline void SetOutputRegister(int reg, short opindex) { + if (firstWrite[reg] == -1) + firstWrite[reg] = (short)(opindex); + lastWrite[reg] = (short)(opindex); + numWrites[reg]++; + } }; diff --git a/Source/Core/Core/Src/PowerPC/PowerPC.cpp b/Source/Core/Core/Src/PowerPC/PowerPC.cpp index 2bc9771638..f3f28ed3cb 100644 --- a/Source/Core/Core/Src/PowerPC/PowerPC.cpp +++ b/Source/Core/Core/Src/PowerPC/PowerPC.cpp @@ -48,24 +48,6 @@ BreakPoints 
breakpoints; MemChecks memchecks; PPCDebugInterface debug_interface; -void CompactCR() -{ - u32 new_cr = ppcState.cr_fast[0] << 28; - for (int i = 1; i < 8; i++) - { - new_cr |= ppcState.cr_fast[i] << (28 - i * 4); - } - ppcState.cr = new_cr; -} - -void ExpandCR() -{ - for (int i = 0; i < 8; i++) - { - ppcState.cr_fast[i] = (ppcState.cr >> (28 - i * 4)) & 0xF; - } -} - void DoState(PointerWrap &p) { p.Do(ppcState); @@ -97,7 +79,7 @@ void ResetRegisters() ppcState.spr[SPR_ECID_M] = 0x1840c00d; ppcState.spr[SPR_ECID_L] = 0x82bb08e8; - ppcState.cr = 0; + ppcState.cr_fast_u32 = 0; ppcState.fpscr = 0; ppcState.pc = 0; ppcState.npc = 0; diff --git a/Source/Core/Core/Src/PowerPC/PowerPC.h b/Source/Core/Core/Src/PowerPC/PowerPC.h index 22c413afd7..f5b55c127b 100644 --- a/Source/Core/Core/Src/PowerPC/PowerPC.h +++ b/Source/Core/Core/Src/PowerPC/PowerPC.h @@ -49,8 +49,14 @@ struct GC_ALIGNED64(PowerPCState) u32 pc; // program counter u32 npc; - u32 cr; // flags - u8 cr_fast[8]; // Possibly reorder to 0, 2, 4, 8, 1, 3, 5, 7 so that we can make Compact and Expand super fast? + // flags + u32 cr_old; // Not used anymore (only there to maintain backward compatibility with previous save states) + #pragma pack(push,1) + union { + u8 cr_fast[8]; // Possibly reorder to 0, 2, 4, 8, 1, 3, 5, 7 so that we can make Compact and Expand super fast? + u32 cr_fast_u32; // Warning: This is reversed CR on little-endian systems + }; + #pragma pack(pop) u32 msr; // machine specific register u32 fpscr; // floating point flags/status bits @@ -168,13 +174,11 @@ inline void SetCRBit(int bit, int value) { // SetCR and GetCR are fairly slow. Should be avoided if possible. inline void SetCR(u32 new_cr) { - PowerPC::ppcState.cr = new_cr; - PowerPC::ExpandCR(); + PowerPC::ppcState.cr_fast_u32 = Common::swap32(new_cr); } inline u32 GetCR() { - PowerPC::CompactCR(); - return PowerPC::ppcState.cr; + return Common::swap32(PowerPC::ppcState.cr_fast_u32); } // SetCarry/GetCarry may speed up soon.