From 866d4e6bc8626246c528e736dca01ec273fdc03f Mon Sep 17 00:00:00 2001 From: hrydgard Date: Mon, 15 Dec 2008 19:22:34 +0000 Subject: [PATCH] Consolidate some compare instructions in JIT, preparations for separate CR flag storage, misc other cleanup in cpu core. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1547 8ced0084-cf51-0410-be5f-012b33b47a6e --- Source/Core/Core/Src/HW/Memmap.cpp | 3 +- .../Interpreter/Interpreter_Integer.cpp | 22 ++- .../Interpreter/Interpreter_LoadStore.cpp | 40 ++--- .../Interpreter/Interpreter_Paired.cpp | 1 - .../Interpreter_SystemRegisters.cpp | 5 +- Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp | 4 + Source/Core/Core/Src/PowerPC/Jit64/Jit.h | 7 +- Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp | 9 +- .../Core/Src/PowerPC/Jit64/Jit_Branch.cpp | 19 ++- .../Src/PowerPC/Jit64/Jit_FloatingPoint.cpp | 6 +- .../Core/Src/PowerPC/Jit64/Jit_Integer.cpp | 159 ++++++++---------- .../Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp | 10 ++ Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.h | 3 + Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp | 42 ++--- Source/Core/Core/Src/PowerPC/PPCAnalyst.h | 2 - Source/Core/Core/Src/PowerPC/PPCTables.cpp | 8 +- Source/Core/Core/Src/PowerPC/PowerPC.h | 128 ++++++++------ 17 files changed, 240 insertions(+), 228 deletions(-) diff --git a/Source/Core/Core/Src/HW/Memmap.cpp b/Source/Core/Core/Src/HW/Memmap.cpp index d7552822d5..bc30c9a443 100644 --- a/Source/Core/Core/Src/HW/Memmap.cpp +++ b/Source/Core/Core/Src/HW/Memmap.cpp @@ -672,7 +672,8 @@ bool Init() else InitHWMemFuncs(); - LOG(MEMMAP, "Memory system initialized. RAM at %p (0x80000000 @ %p)", base, base + 0x80000000); + LOG(MEMMAP, "Memory system initialized. RAM at %p (mirrors at 0 @ %p, 0x80000000 @ %p , 0xC0000000 @ %p)", + m_pRAM, m_pPhysicalRAM, m_pVirtualCachedRAM, m_pVirtualUncachedRAM); m_IsInitialized = true; return true; } diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Integer.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Integer.cpp index 6e30a0d5e9..bdbfef0fe6 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Integer.cpp +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Integer.cpp @@ -117,7 +117,7 @@ void andis_rc(UGeckoInstruction _inst) void cmpi(UGeckoInstruction _inst) { - Helper_UpdateCRx(_inst.CRFD, m_GPR[_inst.RA]-_inst.SIMM_16); + Helper_UpdateCRx(_inst.CRFD, m_GPR[_inst.RA] - _inst.SIMM_16); } void cmpli(UGeckoInstruction _inst) @@ -128,7 +128,7 @@ void cmpli(UGeckoInstruction _inst) if (a < b) f = 0x8; else if (a > b) f = 0x4; else f = 0x2; //equals - if (XER.SO) f = 0x1; + if (GetXER_SO()) f |= 0x1; SetCRField(_inst.CRFD, f); } @@ -151,13 +151,12 @@ void subfic(UGeckoInstruction _inst) { /* u32 rra = ~m_GPR[_inst.RA]; s32 immediate = (s16)_inst.SIMM_16 + 1; - // #define CALC_XER_CA(X,Y) (((X) + (Y) < X) ? SET_XER_CA : CLEAR_XER_CA) if ((rra + immediate) < rra) - XER.CA = 1; + SetCarry(1); else - XER.CA = 0; + SetCarry(0); m_GPR[_inst.RD] = rra - immediate; */ @@ -227,11 +226,10 @@ void cmp(UGeckoInstruction _inst) s32 a = (s32)m_GPR[_inst.RA]; s32 b = (s32)m_GPR[_inst.RB]; int fTemp = 0x8; // a < b - - // if (a < b) fTemp = 0x8; else - if (a > b) fTemp = 0x4; + // if (a < b) fTemp = 0x8; else + if (a > b) fTemp = 0x4; else if (a == b) fTemp = 0x2; - if (XER.SO) PanicAlert("cmp getting overflow flag"); // fTemp |= 0x1 + if (GetXER_SO()) PanicAlert("cmp getting overflow flag"); // fTemp |= 0x1 SetCRField(_inst.CRFD, fTemp); } @@ -241,10 +239,10 @@ void cmpl(UGeckoInstruction _inst) u32 b = m_GPR[_inst.RB]; u32 fTemp = 0x8; // a < b - // if (a < b) fTemp = 0x8;else - if (a > b) fTemp = 0x4; + // if (a < b) fTemp = 0x8;else + if (a > b) fTemp = 0x4; else if (a == b) fTemp = 0x2; - if (XER.SO) PanicAlert("cmpl getting overflow flag"); // fTemp |= 0x1; + if (GetXER_SO()) PanicAlert("cmpl getting overflow flag"); // fTemp |= 0x1; SetCRField(_inst.CRFD, fTemp); } diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_LoadStore.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_LoadStore.cpp index 3eab82e23f..02571262df 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_LoadStore.cpp +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_LoadStore.cpp @@ -28,6 +28,11 @@ namespace Interpreter { +// TODO: These should really be in the save state, although it's unlikely to matter much. +// They are for lwarx and its friend stwcxd. +static bool g_bReserve = false; +static u32 g_reserveAddr; + u32 Helper_Get_EA(const UGeckoInstruction _inst) { return _inst.RA ? (m_GPR[_inst.RA] + _inst.SIMM_16) : _inst.SIMM_16; @@ -581,37 +586,32 @@ void stwbrx(UGeckoInstruction _inst) // The following two instructions are for SMP communications. On a single -// CPU, they cannot fail unless an interrupt happens in between, which usually -// won't happen with the JIT. -bool g_bReserve = false; -u32 g_reserveAddr; +// CPU, they cannot fail unless an interrupt happens in between. void lwarx(UGeckoInstruction _inst) { - u32 uAddress = Helper_Get_EA_X(_inst); - + u32 uAddress = Helper_Get_EA_X(_inst); m_GPR[_inst.RD] = Memory::Read_U32(uAddress); - g_bReserve = true; - g_reserveAddr = uAddress; + + g_bReserve = true; + g_reserveAddr = uAddress; } void stwcxd(UGeckoInstruction _inst) { - // Stores Word Conditional indeXed - - u32 uAddress; - - if(g_bReserve) { + // Stores Word Conditional indeXed + u32 uAddress; + if (g_bReserve) { uAddress = Helper_Get_EA_X(_inst); - if(uAddress == g_reserveAddr) { + if (uAddress == g_reserveAddr) { Memory::Write_U32(m_GPR[_inst.RS], uAddress); - g_bReserve = false; - SetCRField(0, 2 | XER.SO); - return; - } - } + g_bReserve = false; + SetCRField(0, 2 | GetXER_SO()); + return; + } + } - SetCRField(0, XER.SO); + SetCRField(0, GetXER_SO()); } void stwux(UGeckoInstruction _inst) diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Paired.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Paired.cpp index 46ff9f45c3..90e1e458ba 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Paired.cpp +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_Paired.cpp @@ -229,7 +229,6 @@ void ps_cmpu1(UGeckoInstruction _inst) if (fa < fb) compareResult = 8; else if (fa > fb) compareResult = 4; else compareResult = 2; - SetCRField(_inst.CRFD, compareResult); } diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp index 426f5f0667..ae1f41fe23 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp @@ -226,8 +226,9 @@ void mtfsfx(UGeckoInstruction _inst) void mcrxr(UGeckoInstruction _inst) { - SetCRField(_inst.CRFD, XER.Hex >> 28); - XER.Hex &= ~0xF0000000; // clear 0-3 + // USES_XER + SetCRField(_inst.CRFD, PowerPC::ppcState.spr[SPR_XER] >> 28); + PowerPC::ppcState.spr[SPR_XER] &= ~0xF0000000; // clear 0-3 } void mfcr(UGeckoInstruction _inst) diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp index 347feb25b1..caf9b9f6d8 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp @@ -385,6 +385,7 @@ namespace Jit64 js.instructionNumber = i; if (i == (int)size - 1) { js.isLastInstruction = true; + js.next_inst = 0; if (Profiler::g_ProfileBlocks) { // CAUTION!!! push on stack regs you use, do your stuff, then pop PROFILER_VPUSH; @@ -394,6 +395,9 @@ namespace Jit64 PROFILER_ADD_DIFF_LARGE_INTEGER(&b.ticCounter, &b.ticStop, &b.ticStart); PROFILER_VPOP; } + } else { + // help peephole optimizations + js.next_inst = ops[i + 1].inst; } // const GekkoOpInfo *info = GetOpInfo(); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h index dfd29ff755..beb287620c 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h @@ -49,6 +49,7 @@ namespace Jit64 { u32 compilerPC; u32 blockStart; + UGeckoInstruction next_inst; // for easy peephole opt. int blockSize; int instructionNumber; int downcountAmount; @@ -142,10 +143,8 @@ namespace Jit64 void fcmpx(UGeckoInstruction inst); void fmrx(UGeckoInstruction inst); - void cmpli(UGeckoInstruction inst); - void cmpi(UGeckoInstruction inst); - void cmpl(UGeckoInstruction inst); - void cmp(UGeckoInstruction inst); + void cmpXi(UGeckoInstruction inst); + void cmpX(UGeckoInstruction inst); void cntlzwx(UGeckoInstruction inst); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp index d43a7edd1d..17b8663ef1 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp @@ -324,18 +324,19 @@ void GenFifoXmm64Write() void GenerateCommon() { + // USES_CR computeRc = AlignCode16(); - AND(32, M(&CR), Imm32(0x0FFFFFFF)); + AND(32, M(&PowerPC::ppcState.cr), Imm32(0x0FFFFFFF)); CMP(32, R(EAX), Imm8(0)); FixupBranch pLesser = J_CC(CC_L); FixupBranch pGreater = J_CC(CC_G); - OR(32, M(&CR), Imm32(0x20000000)); // _x86Reg == 0 + OR(32, M(&PowerPC::ppcState.cr), Imm32(0x20000000)); // _x86Reg == 0 RET(); SetJumpTarget(pGreater); - OR(32, M(&CR), Imm32(0x40000000)); // _x86Reg > 0 + OR(32, M(&PowerPC::ppcState.cr), Imm32(0x40000000)); // _x86Reg > 0 RET(); SetJumpTarget(pLesser); - OR(32, M(&CR), Imm32(0x80000000)); // _x86Reg < 0 + OR(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000)); // _x86Reg < 0 RET(); fifoDirectWrite8 = AlignCode4(); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp index 6ea665b68f..081029b83b 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp @@ -108,6 +108,7 @@ namespace Jit64 // variants of this instruction. void bcx(UGeckoInstruction inst) { + // USES_CR _assert_msg_(DYNA_REC, js.isLastInstruction, "bcx not last instruction of block"); gpr.Flush(FLUSH_ALL); @@ -124,7 +125,7 @@ namespace Jit64 if ((inst.BO & 16) == 0) // Test a CR bit { - TEST(32, M(&CR), Imm32(0x80000000 >> inst.BI)); + TEST(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000 >> inst.BI)); if (inst.BO & 8) // Conditional branch branch = CC_NZ; else @@ -181,14 +182,14 @@ namespace Jit64 { skip = J_CC(branch); } - u32 destination; - if (inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - if(inst.AA) - destination = SignExt16(inst.BD << 2); - else - destination = js.compilerPC + SignExt16(inst.BD << 2); - WriteExit(destination, 0); + u32 destination; + if (inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + if(inst.AA) + destination = SignExt16(inst.BD << 2); + else + destination = js.compilerPC + SignExt16(inst.BD << 2); + WriteExit(destination, 0); if (inst.BO != 20) { SetJumpTarget(skip); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp index e056d8e2e3..5fafea7e78 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -205,8 +205,8 @@ namespace Jit64 { fpr.LoadToX64(a, true); } - - AND(32, M(&CR), Imm32(~(0xF0000000 >> shift))); + // USES_CR + AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> shift))); if (ordered) COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b)); else @@ -226,7 +226,7 @@ namespace Jit64 SetJumpTarget(continue1); SetJumpTarget(continue2); SHR(32, R(EAX), Imm8(shift)); - OR(32, M(&CR), R(EAX)); + OR(32, M(&PowerPC::ppcState.cr), R(EAX)); fpr.UnlockAll(); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp index 3ec04f8d6d..39c6ebd3f6 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp @@ -24,6 +24,7 @@ #include "JitCache.h" #include "JitRegCache.h" #include "JitAsm.h" +#include "Jit_Util.h" // #define INSTRUCTION_START Default(inst); return; #define INSTRUCTION_START @@ -32,10 +33,11 @@ namespace Jit64 { // Assumes that the flags were just set through an addition. void GenerateCarry(X64Reg temp_reg) { + // USES_XER SETcc(CC_C, R(temp_reg)); - AND(32, M(&XER), Imm32(~(1 << 29))); + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~(1 << 29))); SHL(32, R(temp_reg), Imm8(29)); - OR(32, M(&XER), R(temp_reg)); + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(temp_reg)); } typedef u32 (*Operation)(u32 a, u32 b); @@ -133,26 +135,49 @@ namespace Jit64 } } + /* + + if (js.next_inst.OPCD == 16) { // bcx + if (!js.next_inst.LK && (js.next_inst.BO & BO_DONT_DECREMENT_FLAG)) + { + // it's clear there's plenty of opportunity. + //PanicAlert("merge"); + } + } + */ + // unsigned - void cmpli(UGeckoInstruction inst) + void cmpXi(UGeckoInstruction inst) { - // Should check if the next intruction is a branch - if it is, merge the two. This can save - // a whole bunch of instructions and cycles, especially if we aggressively bubble down compares - // towards branches. + // USES_CR #ifdef JIT_OFF_OPTIONS if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) {Default(inst); return;} // turn off from debugger #endif + // Should check if the next intruction is a branch - if it is, merge the two. This can save + // a whole bunch of instructions and cycles, especially if we aggressively bubble down compares + // towards branches. INSTRUCTION_START; int a = inst.RA; - u32 uimm = inst.UIMM; int crf = inst.CRFD; int shift = crf * 4; + Gen::CCFlags less_than, greater_than; + OpArg comparand; + if (inst.OPCD == 10) { + less_than = CC_B; + greater_than = CC_A; + comparand = Imm32(inst.UIMM); + } else { + less_than = CC_L; + greater_than = CC_G; + comparand = Imm32((s32)(s16)inst.UIMM); + } + gpr.KillImmediate(a); // todo, optimize instead, but unlikely to make a difference - AND(32, M(&CR), Imm32(~(0xF0000000 >> (crf*4)))); - CMP(32, gpr.R(a), Imm32(uimm)); - FixupBranch pLesser = J_CC(CC_B); - FixupBranch pGreater = J_CC(CC_A); + AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4)))); + CMP(32, gpr.R(a), comparand); + FixupBranch pLesser = J_CC(less_than); + FixupBranch pGreater = J_CC(greater_than); MOV(32, R(EAX), Imm32(0x20000000 >> shift)); // _x86Reg == 0 FixupBranch continue1 = J(); @@ -165,44 +190,17 @@ namespace Jit64 MOV(32, R(EAX), Imm32(0x80000000 >> shift));// _x86Reg < 0 SetJumpTarget(continue1); SetJumpTarget(continue2); - OR(32, M(&CR), R(EAX)); + OR(32, M(&PowerPC::ppcState.cr), R(EAX)); + + // TODO: Add extra code at the end for the "taken" case. Jump to it from the matching branches. + // Since it's the last block, some liberties can be taken. + // don't forget to flush registers AFTER the cmp BEFORE the jmp. Flushing doesn't affect flags. } // signed - void cmpi(UGeckoInstruction inst) - { -#ifdef JIT_OFF_OPTIONS - if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) - {Default(inst); return;} // turn off from debugger -#endif - INSTRUCTION_START; - int a = inst.RA; - s32 simm = (s32)(s16)inst.UIMM; - int crf = inst.CRFD; - int shift = crf * 4; - gpr.KillImmediate(a); // todo, optimize instead, but unlikely to make a difference - AND(32, M(&CR), Imm32(~(0xF0000000 >> (crf*4)))); - CMP(32, gpr.R(a), Imm32(simm)); - FixupBranch pLesser = J_CC(CC_L); - FixupBranch pGreater = J_CC(CC_G); - // _x86Reg == 0 - MOV(32, R(EAX), Imm32(0x20000000 >> shift)); - FixupBranch continue1 = J(); - // _x86Reg > 0 - SetJumpTarget(pGreater); - MOV(32, R(EAX), Imm32(0x40000000 >> shift)); - FixupBranch continue2 = J(); - // _x86Reg < 0 - SetJumpTarget(pLesser); - MOV(32, R(EAX), Imm32(0x80000000 >> shift)); - SetJumpTarget(continue1); - SetJumpTarget(continue2); - OR(32, M(&CR), R(EAX)); - } - - // signed - void cmp(UGeckoInstruction inst) + void cmpX(UGeckoInstruction inst) { + // USES_CR #ifdef JIT_OFF_OPTIONS if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) {Default(inst); return;} // turn off from debugger @@ -212,12 +210,21 @@ namespace Jit64 int b = inst.RB; int crf = inst.CRFD; int shift = crf * 4; + Gen::CCFlags less_than, greater_than; + Gen::OpArg comparand = gpr.R(b); + if (inst.SUBOP10 == 32) { + less_than = CC_B; + greater_than = CC_A; + } else { + less_than = CC_L; + greater_than = CC_G; + } gpr.Lock(a, b); gpr.LoadToX64(a, true, false); - AND(32, M(&CR), Imm32(~(0xF0000000 >> (crf*4)))); - CMP(32, gpr.R(a), gpr.R(b)); - FixupBranch pLesser = J_CC(CC_L); - FixupBranch pGreater = J_CC(CC_G); + AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4)))); + CMP(32, gpr.R(a), comparand); + FixupBranch pLesser = J_CC(less_than); + FixupBranch pGreater = J_CC(greater_than); // _x86Reg == 0 MOV(32, R(EAX), Imm32(0x20000000 >> shift)); FixupBranch continue1 = J(); @@ -230,41 +237,7 @@ namespace Jit64 MOV(32, R(EAX), Imm32(0x80000000 >> shift)); SetJumpTarget(continue1); SetJumpTarget(continue2); - OR(32, M(&CR), R(EAX)); - gpr.UnlockAll(); - } - - // unsigned - void cmpl(UGeckoInstruction inst) - { -#ifdef JIT_OFF_OPTIONS - if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) - {Default(inst); return;} // turn off from debugger -#endif - INSTRUCTION_START; - int a = inst.RA; - int b = inst.RB; - int crf = inst.CRFD; - int shift = crf * 4; - gpr.Lock(a, b); - gpr.LoadToX64(a, true, false); - AND(32, M(&CR), Imm32(~(0xF0000000 >> (crf*4)))); - CMP(32, gpr.R(a), gpr.R(b)); - FixupBranch pLesser = J_CC(CC_B); - FixupBranch pGreater = J_CC(CC_A); - // _x86Reg == 0 - MOV(32, R(EAX), Imm32(0x20000000 >> shift)); - FixupBranch continue1 = J(); - // _x86Reg > 0 - SetJumpTarget(pGreater); - MOV(32, R(EAX), Imm32(0x40000000 >> shift)); - FixupBranch continue2 = J(); - // _x86Reg < 0 - SetJumpTarget(pLesser); - MOV(32, R(EAX), Imm32(0x80000000 >> shift)); - SetJumpTarget(continue1); - SetJumpTarget(continue2); - OR(32, M(&CR), R(EAX)); + OR(32, M(&PowerPC::ppcState.cr), R(EAX)); gpr.UnlockAll(); } @@ -652,6 +625,7 @@ namespace Jit64 // This can be optimized void addex(UGeckoInstruction inst) { + // USES_XER #ifdef JIT_OFF_OPTIONS if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) {Default(inst); return;} // turn off from debugger @@ -664,7 +638,7 @@ namespace Jit64 gpr.LoadToX64(d, false); else gpr.LoadToX64(d, true); - MOV(32, R(EAX), M(&XER)); + MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER])); SHR(32, R(EAX), Imm8(30)); // shift the carry flag out into the x86 carry flag MOV(32, R(EAX), gpr.R(a)); ADC(32, R(EAX), gpr.R(b)); @@ -895,6 +869,7 @@ namespace Jit64 void srawx(UGeckoInstruction inst) { + // USES_XER #ifdef JIT_OFF_OPTIONS if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) {Default(inst); return;} // turn off from debugger @@ -919,17 +894,17 @@ namespace Jit64 CMP(32, R(EAX), Imm32(-1)); SETcc(CC_L, R(EAX)); SAR(32, gpr.R(a), R(ECX)); - AND(32, M(&XER), Imm32(~(1 << 29))); + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~(1 << 29))); SHL(32, R(EAX), Imm8(29)); - OR(32, M(&XER), R(EAX)); + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); FixupBranch end = J(); SetJumpTarget(topBitSet); MOV(32, R(EAX), gpr.R(s)); SAR(32, R(EAX), Imm8(31)); MOV(32, gpr.R(a), R(EAX)); - AND(32, M(&XER), Imm32(~(1 << 29))); + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~(1 << 29))); AND(32, R(EAX), Imm32(1<<29)); - OR(32, M(&XER), R(EAX)); + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); SetJumpTarget(end); gpr.UnlockAll(); gpr.UnlockAllX(); @@ -961,11 +936,11 @@ namespace Jit64 FixupBranch nocarry1 = J_CC(CC_GE); TEST(32, R(EAX), Imm32((u32)0xFFFFFFFF >> (32 - amount))); // were any 1s shifted out? FixupBranch nocarry2 = J_CC(CC_Z); - OR(32, M(&XER), Imm32(XER_CA_MASK)); //XER.CA = 1 + JitSetCA(); FixupBranch carry = J(false); SetJumpTarget(nocarry1); SetJumpTarget(nocarry2); - AND(32, M(&XER), Imm32(~XER_CA_MASK)); //XER.CA = 0 + JitClearCA(); SetJumpTarget(carry); gpr.UnlockAll(); } @@ -973,7 +948,7 @@ namespace Jit64 { Default(inst); return; gpr.Lock(a, s); - AND(32, M(&XER), Imm32(~XER_CA_MASK)); //XER.CA = 0 + JitClearCA(); gpr.LoadToX64(a, a == s, true); if (a != s) MOV(32, gpr.R(a), gpr.R(s)); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp index 699ef09881..763515599e 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp @@ -37,6 +37,16 @@ namespace Jit64 { +void JitClearCA() +{ + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 +} + +void JitSetCA() +{ + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 +} + void UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend) { #ifdef _M_IX86 diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.h b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.h index b06f749101..49e168a4b8 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.h @@ -33,4 +33,7 @@ void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address); void ForceSinglePrecisionS(X64Reg xmm); void ForceSinglePrecisionP(X64Reg xmm); +void JitClearCA(); +void JitSetCA(); + } // namespace diff --git a/Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp index 59620b762d..369d8f4e41 100644 --- a/Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/Src/PowerPC/PPCAnalyst.cpp @@ -285,19 +285,10 @@ void FixUpInternalBranches(CodeOp *code, int begin, int end) } } -void ShuffleUp(CodeOp *code, int first, int last) -{ - CodeOp temp = code[first]; - for (int i = first; i < last; i++) - code[i] = code[i + 1]; - code[last] = temp; -} - // IMPORTANT - CURRENTLY ASSUMES THAT A IS A COMPARE bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b) { - // Disabled for now - return false; + return false; // Currently deactivated in SVN. const GekkoOPInfo *a_info = GetOpInfo(a.inst); const GekkoOPInfo *b_info = GetOpInfo(b.inst); @@ -308,7 +299,6 @@ bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b) if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.hex & 1)) return false; - // 10 cmpi, 11 cmpli - we got a compare! switch (b.inst.OPCD) { case 16: @@ -323,20 +313,34 @@ bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b) // For now, only integer ops acceptable. switch (b_info->type) { case OPTYPE_INTEGER: + case OPTYPE_LOAD: + case OPTYPE_STORE: + case OPTYPE_LOADFP: + case OPTYPE_STOREFP: break; default: return false; } // Check that we have no register collisions. + // That is, check that none of b's outputs matches any of a's inputs, + // and that none of a's outputs matches any of b's inputs. + // The latter does not apply if a is a cmp, of course, but doesn't hurt to check. bool no_swap = false; for (int j = 0; j < 3; j++) { - int regIn = a.regsIn[j]; - if (regIn < 0) - continue; - if (b.regsOut[0] == regIn || - b.regsOut[1] == regIn) + int regInA = a.regsIn[j]; + int regInB = b.regsIn[j]; + if (regInA >= 0 && + b.regsOut[0] == regInA || + b.regsOut[1] == regInA) + { + // reg collision! don't swap + return false; + } + if (regInB >= 0 && + a.regsOut[0] == regInB || + a.regsOut[1] == regInB) { // reg collision! don't swap return false; @@ -346,6 +350,7 @@ bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b) return true; } +// Does not yet perform inlining - although there are plans for that. CodeOp *Flatten(u32 address, int &realsize, BlockStats &st, BlockRegStats &gpa, BlockRegStats &fpa) { int numCycles = 0; @@ -623,9 +628,8 @@ CodeOp *Flatten(u32 address, int &realsize, BlockStats &st, BlockRegStats &gpa, } } - - //Scan for CR0 dependency - //assume next block wants CR0 to be safe + // Scan for CR0 dependency + // assume next block wants CR0 to be safe bool wantsCR0 = true; bool wantsCR1 = true; bool wantsPS1 = true; diff --git a/Source/Core/Core/Src/PowerPC/PPCAnalyst.h b/Source/Core/Core/Src/PowerPC/PPCAnalyst.h index 3e965d423e..3c6de7702d 100644 --- a/Source/Core/Core/Src/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/Src/PowerPC/PPCAnalyst.h @@ -79,8 +79,6 @@ struct BlockRegStats void Init(); void Shutdown(); -void ShuffleUp(CodeOp *code, int first, int last); - CodeOp *Flatten(u32 address, int &realsize, BlockStats &st, BlockRegStats &gpa, BlockRegStats &fpa); void LogFunctionCall(u32 addr); diff --git a/Source/Core/Core/Src/PowerPC/PPCTables.cpp b/Source/Core/Core/Src/PowerPC/PPCTables.cpp index 027f593049..6681e5fa7c 100644 --- a/Source/Core/Core/Src/PowerPC/PPCTables.cpp +++ b/Source/Core/Core/Src/PowerPC/PPCTables.cpp @@ -141,8 +141,8 @@ GekkoOPTemplate primarytable[] = {7, Interpreter::mulli, Jit64::mulli, {"mulli", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_RC_BIT, 2}}, {8, Interpreter::subfic, Jit64::subfic, {"subfic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}}, - {10, Interpreter::cmpli, Jit64::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}}, - {11, Interpreter::cmpi, Jit64::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}}, + {10, Interpreter::cmpli, Jit64::cmpXi, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}}, + {11, Interpreter::cmpi, Jit64::cmpXi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}}, {12, Interpreter::addic, Jit64::reg_imm, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}}, {13, Interpreter::addic_rc, Jit64::reg_imm, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}}, {14, Interpreter::addi, Jit64::reg_imm, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}}, @@ -283,8 +283,8 @@ GekkoOPTemplate table31[] = {412, Interpreter::orcx, Jit64::Default, {"orcx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_SB | FL_RC_BIT}}, {476, Interpreter::nandx, Jit64::Default, {"nandx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_SB | FL_RC_BIT}}, {284, Interpreter::eqvx, Jit64::Default, {"eqvx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_SB | FL_RC_BIT}}, - {0, Interpreter::cmp, Jit64::cmp, {"cmp", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, - {32, Interpreter::cmpl, Jit64::cmpl, {"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, + {0, Interpreter::cmp, Jit64::cmpX, {"cmp", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, + {32, Interpreter::cmpl, Jit64::cmpX, {"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {26, Interpreter::cntlzwx, Jit64::cntlzwx, {"cntlzwx",OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {922, Interpreter::extshx, Jit64::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {954, Interpreter::extsbx, Jit64::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, diff --git a/Source/Core/Core/Src/PowerPC/PowerPC.h b/Source/Core/Core/Src/PowerPC/PowerPC.h index 2ac455211f..097e571347 100644 --- a/Source/Core/Core/Src/PowerPC/PowerPC.h +++ b/Source/Core/Core/Src/PowerPC/PowerPC.h @@ -25,77 +25,75 @@ class PointerWrap; namespace PowerPC { - enum CoreMode - { - MODE_INTERPRETER, - MODE_JIT, - }; - // This contains the entire state of the emulated PowerPC "Gekko" CPU. - struct GC_ALIGNED64(PowerPCState) - { - u32 mojs[128]; // Try to isolate the regs from other variables in the cache. - u32 gpr[32]; // General purpose registers. r1 = stack pointer. +enum CoreMode +{ + MODE_INTERPRETER, + MODE_JIT, +}; - // The paired singles are strange : PS0 is stored in the full 64 bits of each FPR - // but ps calculations are only done in 32-bit precision, and PS1 is only 32 bits. - // Since we want to use SIMD, SSE2 is the only viable alternative - 2x double. - u64 ps[32][2]; +// This contains the entire state of the emulated PowerPC "Gekko" CPU. +struct GC_ALIGNED64(PowerPCState) +{ + u32 mojs[128]; // Try to isolate the regs from other variables in the cache. + u32 gpr[32]; // General purpose registers. r1 = stack pointer. - u32 pc; // program counter - u32 npc; + // The paired singles are strange : PS0 is stored in the full 64 bits of each FPR + // but ps calculations are only done in 32-bit precision, and PS1 is only 32 bits. + // Since we want to use SIMD, SSE2 is the only viable alternative - 2x double. + u64 ps[32][2]; - u32 cr; // flags - u32 msr; // machine specific register - u32 fpscr; // floating point flags/status bits + u32 pc; // program counter + u32 npc; - // Exception management. - u32 Exceptions; + u32 cr; // flags + u32 msr; // machine specific register + u32 fpscr; // floating point flags/status bits - u32 sr[16]; // Segment registers. Unused. + // Exception management. + u32 Exceptions; - u32 DebugCount; - - // special purpose registers - controlls quantizers, DMA, and lots of other misc extensions. - // also for power management, but we don't care about that. - u32 spr[1024]; - }; + u32 sr[16]; // Segment registers. Unused. - enum CPUState - { - CPU_RUNNING = 0, - CPU_RUNNINGDEBUG = 1, - CPU_STEPPING = 2, - CPU_POWERDOWN = 3, - }; + u32 DebugCount; + + // special purpose registers - controlls quantizers, DMA, and lots of other misc extensions. + // also for power management, but we don't care about that. + u32 spr[1024]; +}; - extern PowerPCState ppcState; - extern volatile CPUState state; // Execution engines should poll this to know when to exit. +enum CPUState +{ + CPU_RUNNING = 0, + CPU_RUNNINGDEBUG = 1, + CPU_STEPPING = 2, + CPU_POWERDOWN = 3, +}; - void Init(); - void Shutdown(); - void DoState(PointerWrap &p); +extern PowerPCState ppcState; +extern volatile CPUState state; // Execution engines should poll this to know when to exit. - void SetMode(CoreMode _coreType); +void Init(); +void Shutdown(); +void DoState(PointerWrap &p); - void SingleStep(); - void CheckExceptions(); - void RunLoop(); - void Start(); - void Pause(); - void Stop(); +void SetMode(CoreMode _coreType); - void OnIdle(u32 _uThreadAddr); -} +void SingleStep(); +void CheckExceptions(); +void RunLoop(); +void Start(); +void Pause(); +void Stop(); -// Easy register access macros. +void OnIdle(u32 _uThreadAddr); + + // Easy register access macros. #define HID2 ((UReg_HID2&)PowerPC::ppcState.spr[SPR_HID2]) #define DMAU (*(UReg_DMAU*)&PowerPC::ppcState.spr[SPR_DMAU]) #define DMAL (*(UReg_DMAL*)&PowerPC::ppcState.spr[SPR_DMAL]) -#define XER ((UReg_XER&)PowerPC::ppcState.spr[SPR_XER]) #define PC PowerPC::ppcState.pc #define NPC PowerPC::ppcState.npc -#define CR PowerPC::ppcState.cr #define FPSCR ((UReg_FPSCR&)PowerPC::ppcState.fpscr) #define MSR PowerPC::ppcState.msr #define GPR(n) PowerPC::ppcState.gpr[n] @@ -121,11 +119,13 @@ namespace PowerPC #define riPS0(i) (*(u64*)(&PowerPC::ppcState.ps[i][0])) #define riPS1(i) (*(u64*)(&PowerPC::ppcState.ps[i][1])) +} // namespace // Wrappers to make it easier to in the future completely replace the storage of CR and Carry bits // to something more x86-friendly. These are not used 100% consistently yet - and if we do this, we // need the corresponding stuff on the JIT side too. +// These are intended to stay fast, probably become faster, and are not likely to slow down much if at all. inline void SetCRField(int cr_field, int value) { PowerPC::ppcState.cr = (PowerPC::ppcState.cr & (~(0xF0000000 >> (cr_field * 4)))) | (value << ((7 - cr_field) * 4)); } @@ -135,9 +135,10 @@ inline u32 GetCRField(int cr_field) { } inline u32 GetCRBit(int bit) { - return (CR >> (31 - bit)) & 1; + return (PowerPC::ppcState.cr >> (31 - bit)) & 1; } +// SetCR and GetCR may become fairly slow soon. Should be avoided if possible. inline void SetCR(u32 new_cr) { PowerPC::ppcState.cr = new_cr; } @@ -146,12 +147,29 @@ inline u32 GetCR() { return PowerPC::ppcState.cr; } +// SetCarry/GetCarry may speed up soon. inline void SetCarry(int ca) { - XER.CA = ca; + ((UReg_XER&)PowerPC::ppcState.spr[SPR_XER]).CA = ca; } inline int GetCarry() { - return XER.CA; + return ((UReg_XER&)PowerPC::ppcState.spr[SPR_XER]).CA; +} + +inline UReg_XER GetXER() { + return ((UReg_XER&)PowerPC::ppcState.spr[SPR_XER]); +} + +inline void SetXER(UReg_XER new_xer) { + ((UReg_XER&)PowerPC::ppcState.spr[SPR_XER]) = new_xer; +} + +inline int GetXER_SO() { + return ((UReg_XER&)PowerPC::ppcState.spr[SPR_XER]).SO; +} + +inline void SetXER_SO(int value) { + ((UReg_XER&)PowerPC::ppcState.spr[SPR_XER]).SO = value; } #endif