From a44c421d01b717af5a748e66af4c2a5776987ef0 Mon Sep 17 00:00:00 2001 From: hrydgard Date: Mon, 15 Dec 2008 20:41:59 +0000 Subject: [PATCH] Somewhat faster CR flag storage. Doesn't really make that much of a difference - but opens a possibility to merge cmp instructions with their following conditional branches in an efficient way. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1549 8ced0084-cf51-0410-be5f-012b33b47a6e --- Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp | 9 +++--- .../Core/Src/PowerPC/Jit64/Jit_Branch.cpp | 2 +- .../Src/PowerPC/Jit64/Jit_FloatingPoint.cpp | 12 +++---- .../Core/Src/PowerPC/Jit64/Jit_Integer.cpp | 20 +++++------- .../Src/PowerPC/Jit64/Jit_SystemRegisters.cpp | 32 ++++++++++++++----- Source/Core/Core/Src/PowerPC/PowerPC.cpp | 16 ++++++++++ Source/Core/Core/Src/PowerPC/PowerPC.h | 15 ++++++--- 7 files changed, 68 insertions(+), 38 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp index 17b8663ef1..0e4fc0a32b 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp @@ -326,19 +326,18 @@ void GenerateCommon() { // USES_CR computeRc = AlignCode16(); - AND(32, M(&PowerPC::ppcState.cr), Imm32(0x0FFFFFFF)); CMP(32, R(EAX), Imm8(0)); FixupBranch pLesser = J_CC(CC_L); FixupBranch pGreater = J_CC(CC_G); - OR(32, M(&PowerPC::ppcState.cr), Imm32(0x20000000)); // _x86Reg == 0 + MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0 RET(); SetJumpTarget(pGreater); - OR(32, M(&PowerPC::ppcState.cr), Imm32(0x40000000)); // _x86Reg > 0 + MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0 RET(); SetJumpTarget(pLesser); - OR(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000)); // _x86Reg < 0 + MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0 RET(); - + fifoDirectWrite8 = AlignCode4(); GenFifoWrite(8); fifoDirectWrite16 = AlignCode4(); diff --git 
a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp index 081029b83b..4a192ac4f2 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Branch.cpp @@ -125,7 +125,7 @@ namespace Jit64 if ((inst.BO & 16) == 0) // Test a CR bit { - TEST(32, M(&PowerPC::ppcState.cr), Imm32(0x80000000 >> inst.BI)); + TEST(8, M(&PowerPC::ppcState.cr_fast[inst.BI >> 2]), Imm8(8 >> (inst.BI & 3))); if (inst.BO & 8) // Conditional branch branch = CC_NZ; else diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp index 5fafea7e78..51f0f2d153 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -202,11 +202,9 @@ namespace Jit64 fpr.Lock(a,b); if (a != b) - { fpr.LoadToX64(a, true); - } + // USES_CR - AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> shift))); if (ordered) COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b)); else @@ -214,19 +212,17 @@ namespace Jit64 FixupBranch pLesser = J_CC(CC_B); FixupBranch pGreater = J_CC(CC_A); // _x86Reg == 0 - MOV(32, R(EAX), Imm32(0x20000000)); + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); FixupBranch continue1 = J(); // _x86Reg > 0 SetJumpTarget(pGreater); - MOV(32, R(EAX), Imm32(0x40000000)); + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); FixupBranch continue2 = J(); // _x86Reg < 0 SetJumpTarget(pLesser); - MOV(32, R(EAX), Imm32(0x80000000)); + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); SetJumpTarget(continue1); SetJumpTarget(continue2); - SHR(32, R(EAX), Imm8(shift)); - OR(32, M(&PowerPC::ppcState.cr), R(EAX)); fpr.UnlockAll(); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp index 39c6ebd3f6..8fedb90f7b 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp +++ 
b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp @@ -174,23 +174,21 @@ namespace Jit64 } gpr.KillImmediate(a); // todo, optimize instead, but unlikely to make a difference - AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4)))); CMP(32, gpr.R(a), comparand); FixupBranch pLesser = J_CC(less_than); FixupBranch pGreater = J_CC(greater_than); - MOV(32, R(EAX), Imm32(0x20000000 >> shift)); // _x86Reg == 0 + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); // _x86Reg == 0 FixupBranch continue1 = J(); SetJumpTarget(pGreater); - MOV(32, R(EAX), Imm32(0x40000000 >> shift)); // _x86Reg > 0 + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); // _x86Reg > 0 FixupBranch continue2 = J(); SetJumpTarget(pLesser); - MOV(32, R(EAX), Imm32(0x80000000 >> shift));// _x86Reg < 0 + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); // _x86Reg < 0 SetJumpTarget(continue1); SetJumpTarget(continue2); - OR(32, M(&PowerPC::ppcState.cr), R(EAX)); // TODO: Add extra code at the end for the "taken" case. Jump to it from the matching branches. // Since it's the last block, some liberties can be taken. 
@@ -221,23 +219,21 @@ namespace Jit64 } gpr.Lock(a, b); gpr.LoadToX64(a, true, false); - AND(32, M(&PowerPC::ppcState.cr), Imm32(~(0xF0000000 >> (crf*4)))); CMP(32, gpr.R(a), comparand); FixupBranch pLesser = J_CC(less_than); FixupBranch pGreater = J_CC(greater_than); // _x86Reg == 0 - MOV(32, R(EAX), Imm32(0x20000000 >> shift)); + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); // _x86Reg == 0 FixupBranch continue1 = J(); - // _x86Reg > 0 + SetJumpTarget(pGreater); - MOV(32, R(EAX), Imm32(0x40000000 >> shift)); + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); // _x86Reg > 0 FixupBranch continue2 = J(); - // _x86Reg < 0 + SetJumpTarget(pLesser); - MOV(32, R(EAX), Imm32(0x80000000 >> shift)); + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); // _x86Reg < 0 SetJumpTarget(continue1); SetJumpTarget(continue2); - OR(32, M(&PowerPC::ppcState.cr), R(EAX)); gpr.UnlockAll(); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp index 68c9474146..ec31b8f4d7 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -163,20 +163,39 @@ namespace Jit64 // USES_CR int d = inst.RD; gpr.LoadToX64(d, false, true); - MOV(32, gpr.R(d), M(&PowerPC::ppcState.cr)); + MOV(8, R(EAX), M(&PowerPC::ppcState.cr_fast[0])); + SHL(32, R(EAX), Imm8(4)); + for (int i = 1; i < 7; i++) { + OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[i])); + SHL(32, R(EAX), Imm8(4)); + } + OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[7])); + MOV(32, gpr.R(d), R(EAX)); } void mtcrf(UGeckoInstruction inst) { + //Default(inst); + //return; + // USES_CR u32 mask = 0; u32 crm = inst.CRM; - gpr.FlushLockX(ECX); if (crm == 0xFF) { + gpr.FlushLockX(ECX); MOV(32, R(EAX), gpr.R(inst.RS)); - MOV(32, M(&PowerPC::ppcState.cr), R(EAX)); + for (int i = 0; i < 8; i++) { + MOV(32, R(ECX), R(EAX)); + SHR(32, R(ECX), Imm8(28 - (i * 4))); + AND(32, R(ECX), 
Imm32(0xF)); + MOV(8, M(&PowerPC::ppcState.cr_fast[i]), R(ECX)); + } + gpr.UnlockAllX(); } else { - //TODO: use lookup table? probably not worth it + Default(inst); + return; + + // TODO: translate this to work in new CR model. for (int i = 0; i < 8; i++) { if (crm & (1 << i)) mask |= 0xF << (i*4); @@ -188,9 +207,6 @@ namespace Jit64 OR(32, R(EAX), R(ECX)); MOV(32, M(&PowerPC::ppcState.cr), R(EAX)); } - gpr.UnlockAllX(); } - -} - +} // namespace diff --git a/Source/Core/Core/Src/PowerPC/PowerPC.cpp b/Source/Core/Core/Src/PowerPC/PowerPC.cpp index fe44e31597..e9a2d5b05f 100644 --- a/Source/Core/Core/Src/PowerPC/PowerPC.cpp +++ b/Source/Core/Core/Src/PowerPC/PowerPC.cpp @@ -42,6 +42,22 @@ volatile CPUState state = CPU_STEPPING; static CoreMode mode; +void CompactCR() +{ + ppcState.cr = 0; + for (int i = 0; i < 8; i++) { + ppcState.cr |= ppcState.cr_fast[i] << (28 - i * 4); + } +} + +void ExpandCR() +{ + for (int i = 0; i < 8; i++) { + ppcState.cr_fast[i] = (ppcState.cr >> (28 - i * 4)) & 0xF; + } +} + + void DoState(PointerWrap &p) { p.Do(ppcState); diff --git a/Source/Core/Core/Src/PowerPC/PowerPC.h b/Source/Core/Core/Src/PowerPC/PowerPC.h index 097e571347..838898669e 100644 --- a/Source/Core/Core/Src/PowerPC/PowerPC.h +++ b/Source/Core/Core/Src/PowerPC/PowerPC.h @@ -46,7 +46,9 @@ struct GC_ALIGNED64(PowerPCState) u32 pc; // program counter u32 npc; - u32 cr; // flags + u32 cr; // flags + u8 cr_fast[8]; // Possibly reorder to 0, 2, 4, 6, 1, 3, 5, 7 so that we can make Compact and Expand super fast? + u32 msr; // machine specific register u32 fpscr; // floating point flags/status bits @@ -86,6 +88,9 @@ void Start(); void Pause(); void Stop(); +void CompactCR(); +void ExpandCR(); + void OnIdle(u32 _uThreadAddr); // Easy register access macros. @@ -127,23 +132,25 @@ void OnIdle(u32 _uThreadAddr); // These are intended to stay fast, probably become faster, and are not likely to slow down much if at all. 
inline void SetCRField(int cr_field, int value) { - PowerPC::ppcState.cr = (PowerPC::ppcState.cr & (~(0xF0000000 >> (cr_field * 4)))) | (value << ((7 - cr_field) * 4)); + PowerPC::ppcState.cr_fast[cr_field] = value; } inline u32 GetCRField(int cr_field) { - return (PowerPC::ppcState.cr >> (4 * cr_field)) & 0xF; + return PowerPC::ppcState.cr_fast[cr_field]; } inline u32 GetCRBit(int bit) { - return (PowerPC::ppcState.cr >> (31 - bit)) & 1; + return (PowerPC::ppcState.cr_fast[bit >> 2] >> (3 - (bit & 3))) & 1; } // SetCR and GetCR may become fairly slow soon. Should be avoided if possible. inline void SetCR(u32 new_cr) { PowerPC::ppcState.cr = new_cr; + PowerPC::ExpandCR(); } inline u32 GetCR() { + PowerPC::CompactCR(); return PowerPC::ppcState.cr; }