More performance work on the PowerPC JIT compiler:
* Merge some loops in PPCAnalyst::Flatten() * Put ppcState.cr and ppcState.cr_fast[] into a single C++ union. This allows quick access to the whole CR register without needing to merge the cr_fast array. The implemented solution assumes the host system is little-endian, but that already seems to be assumed in many places in the code. * Inline the call to computeRc: it now costs a few more bytes of memory per JITed instruction, but it removes the CPU overhead of the CALL. This also made it possible to remove some unneeded MOVs. * Jit64::GenerateCarry() no longer needs a temporary register * Fix what seems to be a bug in the PowerPC RFI instruction: the wrong bit was cleared in MSR git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5970 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
15b741706d
commit
cbc66f9467
|
@ -117,7 +117,7 @@ void rfi(UGeckoInstruction _inst)
|
|||
const int mask = 0x87C0FFFF;
|
||||
MSR = (MSR & ~mask) | (SRR1 & mask);
|
||||
//MSR[13] is set to 0.
|
||||
MSR &= 0xFFFDFFFF;
|
||||
MSR &= 0xFFFBFFFF;
|
||||
// Here we should check if there are pending exceptions, and if their corresponding enable bits are set
|
||||
// if above is true, we'd do:
|
||||
//PowerPC::CheckExceptions();
|
||||
|
|
|
@ -136,7 +136,8 @@ public:
|
|||
void WriteCallInterpreter(UGeckoInstruction _inst);
|
||||
void Cleanup();
|
||||
|
||||
void GenerateCarry(Gen::X64Reg temp_reg);
|
||||
void GenerateCarry();
|
||||
void ComputeRC(const Gen::OpArg & arg);
|
||||
|
||||
void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
|
||||
typedef u32 (*Operation)(u32 a, u32 b);
|
||||
|
|
|
@ -222,35 +222,6 @@ void Jit64AsmRoutineManager::Generate()
|
|||
|
||||
void Jit64AsmRoutineManager::GenerateCommon()
|
||||
{
|
||||
// USES_CR
|
||||
computeRc = AlignCode16();
|
||||
CMP(32, R(EAX), Imm8(0));
|
||||
FixupBranch pLesser = J_CC(CC_L);
|
||||
FixupBranch pGreater = J_CC(CC_G);
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0
|
||||
FixupBranch continue1 = J();
|
||||
|
||||
SetJumpTarget(pGreater);
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0
|
||||
FixupBranch continue2 = J();
|
||||
|
||||
SetJumpTarget(pLesser);
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0
|
||||
|
||||
SetJumpTarget(continue1);
|
||||
SetJumpTarget(continue2);
|
||||
|
||||
// cr[0] |= SPR_XER & 1
|
||||
|
||||
/*MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER]));
|
||||
|
||||
AND(32, R(EAX), Imm32(1));
|
||||
|
||||
MOVSX(32, 8, ECX, M(&PowerPC::ppcState.cr_fast[0]));
|
||||
OR(32, R(ECX), R(EAX));
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), R(ECX));*/
|
||||
RET();
|
||||
|
||||
fifoDirectWrite8 = AlignCode4();
|
||||
GenFifoWrite(8);
|
||||
fifoDirectWrite16 = AlignCode4();
|
||||
|
|
|
@ -66,7 +66,7 @@ void Jit64::rfi(UGeckoInstruction inst)
|
|||
AND(32, R(ECX), Imm32(mask));
|
||||
OR(32, R(EAX), R(ECX));
|
||||
// MSR &= 0xFFFDFFFF; //TODO: VERIFY
|
||||
AND(32, R(EAX), Imm32(0xFFFDFFFF));
|
||||
AND(32, R(EAX), Imm32(0xFFFBFFFF));
|
||||
MOV(32, M(&MSR), R(EAX));
|
||||
// NPC = SRR0;
|
||||
MOV(32, R(EAX), M(&SRR0));
|
||||
|
@ -246,7 +246,7 @@ void Jit64::bcctrx(UGeckoInstruction inst)
|
|||
MOV(32, R(EAX), Imm32(js.compilerPC + 4));
|
||||
FixupBranch b = J_CC(branch, false);
|
||||
MOV(32, R(EAX), M(&CTR));
|
||||
MOV(32, M(&PC), R(EAX));
|
||||
//MOV(32, M(&PC), R(EAX)); => Already done in WriteExitDestInEAX()
|
||||
if (inst.LK_3)
|
||||
MOV(32, M(&LR), Imm32(js.compilerPC + 4)); // LR = PC + 4;
|
||||
// Would really like to continue the block here, but it ends. TODO.
|
||||
|
@ -274,7 +274,7 @@ void Jit64::bclrx(UGeckoInstruction inst)
|
|||
AND(32, M(&CR), Imm32(~(0xFF000000)));
|
||||
#endif
|
||||
MOV(32, R(EAX), M(&LR));
|
||||
MOV(32, M(&PC), R(EAX));
|
||||
//MOV(32, M(&PC), R(EAX)); => Already done in WriteExitDestInEAX()
|
||||
if (inst.LK_3)
|
||||
MOV(32, M(&LR), Imm32(js.compilerPC + 4)); // LR = PC + 4;
|
||||
WriteExitDestInEAX(0);
|
||||
|
@ -295,15 +295,15 @@ void Jit64::bclrx(UGeckoInstruction inst)
|
|||
branch = CC_Z;
|
||||
else
|
||||
branch = CC_NZ;
|
||||
MOV(32, R(EAX), Imm32(js.compilerPC + 4));
|
||||
FixupBranch b = J_CC(branch, false);
|
||||
MOV(32, R(EAX), M(&LR));
|
||||
MOV(32, M(&PC), R(EAX));
|
||||
//MOV(32, M(&PC), R(EAX)); => Already done in WriteExitDestInEAX()
|
||||
if (inst.LK_3)
|
||||
MOV(32, M(&LR), Imm32(js.compilerPC + 4)); // LR = PC + 4;
|
||||
WriteExitDestInEAX(0);
|
||||
// Would really like to continue the block here, but it ends. TODO.
|
||||
SetJumpTarget(b);
|
||||
WriteExitDestInEAX(0);
|
||||
WriteExit(js.compilerPC + 4, 1);
|
||||
return;
|
||||
}
|
||||
// Call interpreter
|
||||
|
|
|
@ -25,12 +25,32 @@
|
|||
#include "JitAsm.h"
|
||||
|
||||
// Assumes that the flags were just set through an addition.
|
||||
void Jit64::GenerateCarry(Gen::X64Reg temp_reg) {
|
||||
void Jit64::GenerateCarry() {
|
||||
// USES_XER
|
||||
SETcc(CC_C, R(temp_reg));
|
||||
FixupBranch pNoCarry = J_CC(CC_NC);
|
||||
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(1 << 29));
|
||||
FixupBranch pContinue = J();
|
||||
SetJumpTarget(pNoCarry);
|
||||
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~(1 << 29)));
|
||||
SHL(32, R(temp_reg), Imm8(29));
|
||||
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(temp_reg));
|
||||
SetJumpTarget(pContinue);
|
||||
}
|
||||
|
||||
// Emits inline x86 code that updates ppcState.cr_fast[0] from a 32-bit value.
// `arg` is compared (signed, via CC_L / CC_G) against zero and exactly one of
// three byte values is stored into cr_fast[0]:
//   0x8 when arg < 0, 0x4 when arg > 0, 0x2 when arg == 0.
// XER is not read here, so no other bit of cr_fast[0] is ever set by this code.
// The emitted CMP modifies the host flags.
void Jit64::ComputeRC(const Gen::OpArg & arg) {
|
||||
CMP(32, arg, Imm8(0));
|
||||
FixupBranch pLesser = J_CC(CC_L);
|
||||
FixupBranch pGreater = J_CC(CC_G);
|
||||
// Fall-through: neither less nor greater, so arg == 0
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // arg == 0
|
||||
FixupBranch continue1 = J();
|
||||
|
||||
SetJumpTarget(pGreater);
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // arg > 0
|
||||
FixupBranch continue2 = J();
|
||||
|
||||
SetJumpTarget(pLesser);
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // arg < 0
|
||||
|
||||
// All three paths rejoin here
SetJumpTarget(continue1);
|
||||
SetJumpTarget(continue2);
|
||||
}
|
||||
|
||||
u32 Add(u32 a, u32 b) {return a + b;}
|
||||
|
@ -55,7 +75,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void
|
|||
gpr.LoadToX64(d, false);
|
||||
(this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
|
||||
if (carry)
|
||||
GenerateCarry(EAX);
|
||||
GenerateCarry();
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -64,7 +84,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void
|
|||
MOV(32, gpr.R(d), gpr.R(a));
|
||||
(this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
|
||||
if (carry)
|
||||
GenerateCarry(EAX);
|
||||
GenerateCarry();
|
||||
}
|
||||
}
|
||||
else if (doop == Add)
|
||||
|
@ -81,8 +101,7 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void
|
|||
if (Rc)
|
||||
{
|
||||
// Todo - special case immediates.
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(d));
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -281,8 +300,7 @@ void Jit64::orx(UGeckoInstruction inst)
|
|||
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(a));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -311,8 +329,7 @@ void Jit64::xorx(UGeckoInstruction inst)
|
|||
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(a));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -334,7 +351,7 @@ void Jit64::andx(UGeckoInstruction inst)
|
|||
|
||||
if (inst.Rc) {
|
||||
// result is already in eax
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(R(EAX));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -351,8 +368,7 @@ void Jit64::extsbx(UGeckoInstruction inst)
|
|||
MOV(32, R(EAX), gpr.R(s));
|
||||
MOVSX(32, 8, gpr.RX(a), R(AL)); // watch out for ah and friends
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(a));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -368,8 +384,7 @@ void Jit64::extshx(UGeckoInstruction inst)
|
|||
// as the 32-bit register.
|
||||
MOVSX(32, 16, gpr.RX(a), gpr.R(s));
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(a));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -378,7 +393,6 @@ void Jit64::subfic(UGeckoInstruction inst)
|
|||
INSTRUCTION_START
|
||||
JITDISABLE(Integer)
|
||||
int a = inst.RA, d = inst.RD;
|
||||
gpr.FlushLockX(ECX);
|
||||
gpr.Lock(a, d);
|
||||
gpr.LoadToX64(d, a == d, true);
|
||||
int imm = inst.SIMM_16;
|
||||
|
@ -386,9 +400,8 @@ void Jit64::subfic(UGeckoInstruction inst)
|
|||
NOT(32, R(EAX));
|
||||
ADD(32, R(EAX), Imm32(imm + 1));
|
||||
MOV(32, gpr.R(d), R(EAX));
|
||||
GenerateCarry(ECX);
|
||||
GenerateCarry();
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
// This instruction has no RC flag
|
||||
}
|
||||
|
||||
|
@ -421,7 +434,7 @@ void Jit64::subfcx(UGeckoInstruction inst)
|
|||
gpr.UnlockAll();
|
||||
if (inst.OE) PanicAlert("OE: subfcx");
|
||||
if (inst.Rc) {
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(R(EAX));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -464,7 +477,7 @@ void Jit64::subfex(UGeckoInstruction inst)
|
|||
gpr.UnlockAllX();
|
||||
if (inst.OE) PanicAlert("OE: subfex");
|
||||
if (inst.Rc) {
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(R(EAX));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -486,7 +499,7 @@ void Jit64::subfx(UGeckoInstruction inst)
|
|||
if (inst.OE) PanicAlert("OE: subfx");
|
||||
if (inst.Rc) {
|
||||
// result is already in eax
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(R(EAX));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -519,8 +532,7 @@ void Jit64::mullwx(UGeckoInstruction inst)
|
|||
}
|
||||
gpr.UnlockAll();
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(d));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -543,14 +555,9 @@ void Jit64::mulhwux(UGeckoInstruction inst)
|
|||
MUL(32, gpr.R(b));
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), R(EDX));
|
||||
MOV(32, gpr.R(d), R(EDX));
|
||||
// result is already in eax
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
} else {
|
||||
MOV(32, gpr.R(d), R(EDX));
|
||||
}
|
||||
if (inst.Rc)
|
||||
ComputeRC(R(EDX));
|
||||
}
|
||||
|
||||
void Jit64::divwux(UGeckoInstruction inst)
|
||||
|
@ -581,7 +588,7 @@ void Jit64::divwux(UGeckoInstruction inst)
|
|||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
if (inst.Rc) {
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(R(EAX));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -604,8 +611,7 @@ void Jit64::addx(UGeckoInstruction inst)
|
|||
}
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(d));
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -616,8 +622,7 @@ void Jit64::addx(UGeckoInstruction inst)
|
|||
ADD(32, gpr.R(d), gpr.R(b));
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(d));
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -628,8 +633,7 @@ void Jit64::addx(UGeckoInstruction inst)
|
|||
ADD(32, gpr.R(d), gpr.R(a));
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(d));
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -640,8 +644,7 @@ void Jit64::addx(UGeckoInstruction inst)
|
|||
ADD(32, gpr.R(d), gpr.R(d));
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(d));
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -653,8 +656,7 @@ void Jit64::addx(UGeckoInstruction inst)
|
|||
ADD(32, gpr.R(d), gpr.R(d));
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(d));
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -671,7 +673,6 @@ void Jit64::addex(UGeckoInstruction inst)
|
|||
INSTRUCTION_START
|
||||
JITDISABLE(Integer)
|
||||
int a = inst.RA, b = inst.RB, d = inst.RD;
|
||||
gpr.FlushLockX(ECX);
|
||||
gpr.Lock(a, b, d);
|
||||
if (d != a && d != b)
|
||||
gpr.LoadToX64(d, false);
|
||||
|
@ -682,12 +683,11 @@ void Jit64::addex(UGeckoInstruction inst)
|
|||
MOV(32, R(EAX), gpr.R(a));
|
||||
ADC(32, R(EAX), gpr.R(b));
|
||||
MOV(32, gpr.R(d), R(EAX));
|
||||
GenerateCarry(ECX);
|
||||
GenerateCarry();
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
if (inst.Rc)
|
||||
{
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(R(EAX));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -741,8 +741,7 @@ void Jit64::rlwinmx(UGeckoInstruction inst)
|
|||
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(a));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -775,8 +774,7 @@ void Jit64::rlwimix(UGeckoInstruction inst)
|
|||
gpr.UnlockAll();
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(a));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -804,8 +802,7 @@ void Jit64::rlwnmx(UGeckoInstruction inst)
|
|||
gpr.UnlockAllX();
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(R(EAX));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -823,8 +820,7 @@ void Jit64::negx(UGeckoInstruction inst)
|
|||
gpr.UnlockAll();
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(d));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -850,8 +846,7 @@ void Jit64::srwx(UGeckoInstruction inst)
|
|||
gpr.UnlockAllX();
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(R(EAX));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -877,8 +872,7 @@ void Jit64::slwx(UGeckoInstruction inst)
|
|||
gpr.UnlockAllX();
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(R(EAX));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -922,8 +916,7 @@ void Jit64::srawx(UGeckoInstruction inst)
|
|||
gpr.UnlockAllX();
|
||||
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(a));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -965,8 +958,7 @@ void Jit64::srawix(UGeckoInstruction inst)
|
|||
}
|
||||
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(a));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -993,8 +985,7 @@ void Jit64::cntlzwx(UGeckoInstruction inst)
|
|||
|
||||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
ComputeRC(gpr.R(a));
|
||||
// TODO: Check PPC manual too
|
||||
}
|
||||
}
|
||||
|
|
|
@ -145,13 +145,8 @@ void Jit64::mfcr(UGeckoInstruction inst)
|
|||
// USES_CR
|
||||
int d = inst.RD;
|
||||
gpr.LoadToX64(d, false, true);
|
||||
MOV(8, R(EAX), M(&PowerPC::ppcState.cr_fast[0]));
|
||||
SHL(32, R(EAX), Imm8(4));
|
||||
for (int i = 1; i < 7; i++) {
|
||||
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[i]));
|
||||
SHL(32, R(EAX), Imm8(4));
|
||||
}
|
||||
OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[7]));
|
||||
MOV(32, R(EAX), M(&PowerPC::ppcState.cr_fast_u32));
|
||||
BSWAP(32, EAX);
|
||||
MOV(32, gpr.R(d), R(EAX));
|
||||
}
|
||||
|
||||
|
@ -160,33 +155,22 @@ void Jit64::mtcrf(UGeckoInstruction inst)
|
|||
INSTRUCTION_START
|
||||
JITDISABLE(SystemRegisters)
|
||||
|
||||
// USES_CR
|
||||
u32 mask = 0;
|
||||
u32 crm = inst.CRM;
|
||||
if (crm == 0xFF) {
|
||||
gpr.FlushLockX(ECX);
|
||||
MOV(32, R(EAX), gpr.R(inst.RS));
|
||||
for (int i = 0; i < 8; i++) {
|
||||
MOV(32, R(ECX), R(EAX));
|
||||
SHR(32, R(ECX), Imm8(28 - (i * 4)));
|
||||
AND(32, R(ECX), Imm32(0xF));
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[i]), R(ECX));
|
||||
BSWAP(32, EAX);
|
||||
MOV(32, M(&PowerPC::ppcState.cr_fast_u32), R(EAX));
|
||||
}
|
||||
gpr.UnlockAllX();
|
||||
} else {
|
||||
Default(inst);
|
||||
return;
|
||||
|
||||
// TODO: translate this to work in new CR model.
|
||||
else if (crm != 0) {
|
||||
u32 mask = 0;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
if (crm & (1 << i))
|
||||
mask |= 0xF << (i*4);
|
||||
}
|
||||
|
||||
MOV(32, R(EAX), gpr.R(inst.RS));
|
||||
MOV(32, R(ECX), M(&PowerPC::ppcState.cr));
|
||||
AND(32, R(EAX), Imm32(mask));
|
||||
AND(32, R(ECX), Imm32(~mask));
|
||||
OR(32, R(EAX), R(ECX));
|
||||
MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
|
||||
BSWAP(32, EAX);
|
||||
MOV(32, M(&PowerPC::ppcState.cr_fast_u32), R(EAX));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -221,20 +221,6 @@ void JitILAsmRoutineManager::Generate()
|
|||
|
||||
void JitILAsmRoutineManager::GenerateCommon()
|
||||
{
|
||||
// USES_CR
|
||||
computeRc = AlignCode16();
|
||||
CMP(32, R(EAX), Imm8(0));
|
||||
FixupBranch pLesser = J_CC(CC_L);
|
||||
FixupBranch pGreater = J_CC(CC_G);
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0
|
||||
RET();
|
||||
SetJumpTarget(pGreater);
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0
|
||||
RET();
|
||||
SetJumpTarget(pLesser);
|
||||
MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0
|
||||
RET();
|
||||
|
||||
fifoDirectWrite8 = AlignCode4();
|
||||
GenFifoWrite(8);
|
||||
fifoDirectWrite16 = AlignCode4();
|
||||
|
|
|
@ -45,7 +45,6 @@ public:
|
|||
const u8 *dispatcherPcInEAX;
|
||||
|
||||
const u8 *fpException;
|
||||
const u8 *computeRc;
|
||||
const u8 *testExceptions;
|
||||
const u8 *dispatchPcInEAX;
|
||||
const u8 *doTiming;
|
||||
|
|
|
@ -296,6 +296,13 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc
|
|||
|
||||
gpa->any = true;
|
||||
fpa->any = false;
|
||||
for (int i = 0; i < 32; i++)
|
||||
{
|
||||
gpa->firstRead[i] = -1;
|
||||
gpa->firstWrite[i] = -1;
|
||||
gpa->numReads[i] = 0;
|
||||
gpa->numWrites[i] = 0;
|
||||
}
|
||||
|
||||
u32 blockstart = address;
|
||||
int maxsize = blockSize;
|
||||
|
@ -307,7 +314,8 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc
|
|||
CodeOp *code = buffer->codebuffer;
|
||||
bool foundExit = false;
|
||||
|
||||
// Flatten! (Currently just copies, following branches is disabled)
|
||||
// Do analysis of the code, look for dependencies etc
|
||||
int numSystemInstructions = 0;
|
||||
for (int i = 0; i < maxsize; i++)
|
||||
{
|
||||
num_inst++;
|
||||
|
@ -326,6 +334,106 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc
|
|||
if (opinfo)
|
||||
numCycles += opinfo->numCyclesMinusOne + 1;
|
||||
_assert_msg_(POWERPC, opinfo != 0, "Invalid Op - Error flattening %08x op %08x", address + i*4, inst.hex);
|
||||
|
||||
code[i].wantsCR0 = false;
|
||||
code[i].wantsCR1 = false;
|
||||
code[i].wantsPS1 = false;
|
||||
|
||||
int flags = opinfo->flags;
|
||||
|
||||
if (flags & FL_USE_FPU)
|
||||
fpa->any = true;
|
||||
|
||||
if (flags & FL_TIMER)
|
||||
gpa->anyTimer = true;
|
||||
|
||||
// Does the instruction output CR0?
|
||||
if (flags & FL_RC_BIT)
|
||||
code[i].outputCR0 = inst.hex & 1; //todo fix
|
||||
else if ((flags & FL_SET_CRn) && inst.CRFD == 0)
|
||||
code[i].outputCR0 = true;
|
||||
else
|
||||
code[i].outputCR0 = (flags & FL_SET_CR0) ? true : false;
|
||||
|
||||
// Does the instruction output CR1?
|
||||
if (flags & FL_RC_BIT_F)
|
||||
code[i].outputCR1 = inst.hex & 1; //todo fix
|
||||
else if ((flags & FL_SET_CRn) && inst.CRFD == 1)
|
||||
code[i].outputCR1 = true;
|
||||
else
|
||||
code[i].outputCR1 = (flags & FL_SET_CR1) ? true : false;
|
||||
|
||||
int numOut = 0;
|
||||
int numIn = 0;
|
||||
if (flags & FL_OUT_A)
|
||||
{
|
||||
code[i].regsOut[numOut++] = inst.RA;
|
||||
gpa->SetOutputRegister(inst.RA, i);
|
||||
}
|
||||
if (flags & FL_OUT_D)
|
||||
{
|
||||
code[i].regsOut[numOut++] = inst.RD;
|
||||
gpa->SetOutputRegister(inst.RD, i);
|
||||
}
|
||||
if (flags & FL_OUT_S)
|
||||
{
|
||||
code[i].regsOut[numOut++] = inst.RS;
|
||||
gpa->SetOutputRegister(inst.RS, i);
|
||||
}
|
||||
if ((flags & FL_IN_A) || ((flags & FL_IN_A0) && inst.RA != 0))
|
||||
{
|
||||
code[i].regsIn[numIn++] = inst.RA;
|
||||
gpa->SetInputRegister(inst.RA, i);
|
||||
}
|
||||
if (flags & FL_IN_B)
|
||||
{
|
||||
code[i].regsIn[numIn++] = inst.RB;
|
||||
gpa->SetInputRegister(inst.RB, i);
|
||||
}
|
||||
if (flags & FL_IN_C)
|
||||
{
|
||||
code[i].regsIn[numIn++] = inst.RC;
|
||||
gpa->SetInputRegister(inst.RC, i);
|
||||
}
|
||||
if (flags & FL_IN_S)
|
||||
{
|
||||
code[i].regsIn[numIn++] = inst.RS;
|
||||
gpa->SetInputRegister(inst.RS, i);
|
||||
}
|
||||
|
||||
// Set remaining register slots as unused (-1)
|
||||
for (int j = numIn; j < 3; j++)
|
||||
code[i].regsIn[j] = -1;
|
||||
for (int j = numOut; j < 2; j++)
|
||||
code[i].regsOut[j] = -1;
|
||||
for (int j = 0; j < 3; j++)
|
||||
code[i].fregsIn[j] = -1;
|
||||
code[i].fregOut = -1;
|
||||
|
||||
switch (opinfo->type)
|
||||
{
|
||||
case OPTYPE_INTEGER:
|
||||
case OPTYPE_LOAD:
|
||||
case OPTYPE_STORE:
|
||||
break;
|
||||
case OPTYPE_FPU:
|
||||
break;
|
||||
case OPTYPE_LOADFP:
|
||||
break;
|
||||
case OPTYPE_BRANCH:
|
||||
if (code[i].inst.hex == 0x4e800020)
|
||||
{
|
||||
// For analysis purposes, we can assume that blr eats flags.
|
||||
code[i].outputCR0 = true;
|
||||
code[i].outputCR1 = true;
|
||||
}
|
||||
break;
|
||||
case OPTYPE_SYSTEM:
|
||||
case OPTYPE_SYSTEMFP:
|
||||
numSystemInstructions++;
|
||||
break;
|
||||
}
|
||||
|
||||
bool follow = false;
|
||||
u32 destination;
|
||||
if (inst.OPCD == 18 && blockSize > 1)
|
||||
|
@ -362,146 +470,6 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc
|
|||
NOTICE_LOG(POWERPC, "Analyzer ERROR - Function %08x too big, size is 0x%08x", blockstart, address-blockstart);
|
||||
st->numCycles = numCycles;
|
||||
|
||||
// Do analysis of the code, look for dependencies etc
|
||||
int numSystemInstructions = 0;
|
||||
for (int i = 0; i < 32; i++)
|
||||
{
|
||||
gpa->firstRead[i] = -1;
|
||||
gpa->firstWrite[i] = -1;
|
||||
gpa->numReads[i] = 0;
|
||||
gpa->numWrites[i] = 0;
|
||||
}
|
||||
|
||||
gpa->any = true;
|
||||
for (int i = 0; i < num_inst; i++)
|
||||
{
|
||||
UGeckoInstruction inst = code[i].inst;
|
||||
|
||||
code[i].wantsCR0 = false;
|
||||
code[i].wantsCR1 = false;
|
||||
code[i].wantsPS1 = false;
|
||||
|
||||
const GekkoOPInfo *opinfo = code[i].opinfo;
|
||||
_assert_msg_(POWERPC, opinfo != 0, "Invalid Op - Error scanning %08x op %08x",address+i*4,inst.hex);
|
||||
int flags = opinfo->flags;
|
||||
|
||||
if (flags & FL_USE_FPU)
|
||||
fpa->any = true;
|
||||
|
||||
if (flags & FL_TIMER)
|
||||
gpa->anyTimer = true;
|
||||
|
||||
// Does the instruction output CR0?
|
||||
if (flags & FL_RC_BIT)
|
||||
code[i].outputCR0 = inst.hex & 1; //todo fix
|
||||
else if ((flags & FL_SET_CRn) && inst.CRFD == 0)
|
||||
code[i].outputCR0 = true;
|
||||
else
|
||||
code[i].outputCR0 = (flags & FL_SET_CR0) ? true : false;
|
||||
|
||||
// Does the instruction output CR1?
|
||||
if (flags & FL_RC_BIT_F)
|
||||
code[i].outputCR1 = inst.hex & 1; //todo fix
|
||||
else if ((flags & FL_SET_CRn) && inst.CRFD == 1)
|
||||
code[i].outputCR1 = true;
|
||||
else
|
||||
code[i].outputCR1 = (flags & FL_SET_CR1) ? true : false;
|
||||
|
||||
for (int j = 0; j < 3; j++)
|
||||
{
|
||||
code[i].fregsIn[j] = -1;
|
||||
code[i].regsIn[j] = -1;
|
||||
}
|
||||
for (int j = 0; j < 2; j++)
|
||||
code[i].regsOut[j] = -1;
|
||||
|
||||
code[i].fregOut = -1;
|
||||
|
||||
int numOut = 0;
|
||||
int numIn = 0;
|
||||
if (flags & FL_OUT_A)
|
||||
{
|
||||
code[i].regsOut[numOut++] = inst.RA;
|
||||
gpa->numWrites[inst.RA]++;
|
||||
}
|
||||
if (flags & FL_OUT_D)
|
||||
{
|
||||
code[i].regsOut[numOut++] = inst.RD;
|
||||
gpa->numWrites[inst.RD]++;
|
||||
}
|
||||
if (flags & FL_OUT_S)
|
||||
{
|
||||
code[i].regsOut[numOut++] = inst.RS;
|
||||
gpa->numWrites[inst.RS]++;
|
||||
}
|
||||
if ((flags & FL_IN_A) || ((flags & FL_IN_A0) && inst.RA != 0))
|
||||
{
|
||||
code[i].regsIn[numIn++] = inst.RA;
|
||||
gpa->numReads[inst.RA]++;
|
||||
}
|
||||
if (flags & FL_IN_B)
|
||||
{
|
||||
code[i].regsIn[numIn++] = inst.RB;
|
||||
gpa->numReads[inst.RB]++;
|
||||
}
|
||||
if (flags & FL_IN_C)
|
||||
{
|
||||
code[i].regsIn[numIn++] = inst.RC;
|
||||
gpa->numReads[inst.RC]++;
|
||||
}
|
||||
if (flags & FL_IN_S)
|
||||
{
|
||||
code[i].regsIn[numIn++] = inst.RS;
|
||||
gpa->numReads[inst.RS]++;
|
||||
}
|
||||
|
||||
switch (opinfo->type)
|
||||
{
|
||||
case OPTYPE_INTEGER:
|
||||
case OPTYPE_LOAD:
|
||||
case OPTYPE_STORE:
|
||||
break;
|
||||
case OPTYPE_FPU:
|
||||
break;
|
||||
case OPTYPE_LOADFP:
|
||||
break;
|
||||
case OPTYPE_BRANCH:
|
||||
if (code[i].inst.hex == 0x4e800020)
|
||||
{
|
||||
// For analysis purposes, we can assume that blr eats flags.
|
||||
code[i].outputCR0 = true;
|
||||
code[i].outputCR1 = true;
|
||||
}
|
||||
break;
|
||||
case OPTYPE_SYSTEM:
|
||||
case OPTYPE_SYSTEMFP:
|
||||
numSystemInstructions++;
|
||||
break;
|
||||
}
|
||||
|
||||
for (int j = 0; j < numIn; j++)
|
||||
{
|
||||
int r = code[i].regsIn[j];
|
||||
if (r < 0 || r > 31)
|
||||
PanicAlert("wtf");
|
||||
if (gpa->firstRead[r] == -1)
|
||||
gpa->firstRead[r] = (short)(i);
|
||||
gpa->lastRead[r] = (short)(i);
|
||||
gpa->numReads[r]++;
|
||||
}
|
||||
|
||||
for (int j = 0; j < numOut; j++)
|
||||
{
|
||||
int r = code[i].regsOut[j];
|
||||
if (r < 0 || r > 31)
|
||||
PanicAlert("wtf");
|
||||
if (gpa->firstWrite[r] == -1)
|
||||
gpa->firstWrite[r] = (short)(i);
|
||||
gpa->lastWrite[r] = (short)(i);
|
||||
gpa->numWrites[r]++;
|
||||
}
|
||||
}
|
||||
|
||||
// Instruction Reordering Pass
|
||||
if (blockSize > 1)
|
||||
{
|
||||
|
@ -530,7 +498,7 @@ u32 Flatten(u32 address, int *realsize, BlockStats *st, BlockRegStats *gpa, Bloc
|
|||
bool wantsCR0 = true;
|
||||
bool wantsCR1 = true;
|
||||
bool wantsPS1 = true;
|
||||
for (int i = num_inst; i; i--)
|
||||
for (int i = num_inst - 1; i >= 0; i--)
|
||||
{
|
||||
if (code[i].outputCR0)
|
||||
wantsCR0 = false;
|
||||
|
|
|
@ -77,6 +77,20 @@ struct BlockRegStats
|
|||
// Returns the span, in instruction indices, between the earliest recorded
// first access (read or write) and the latest recorded last access of `reg`.
// NOTE(review): firstRead/firstWrite appear to use -1 as "never accessed";
// if so, min() yields -1 for a register that was only read or only written
// and the result is one larger than the real span - confirm with callers.
int GetUseRange(int reg) {
|
||||
return max(lastRead[reg], lastWrite[reg]) -
|
||||
min(firstRead[reg], firstWrite[reg]);}
|
||||
|
||||
// Records that the instruction at index `opindex` reads register `reg`:
// sets firstRead[reg] if it still holds -1, always overwrites lastRead[reg],
// and increments numReads[reg].
// `reg` is used as an array index without any bounds check.
inline void SetInputRegister(int reg, short opindex) {
|
||||
// -1 marks "no read recorded yet"
if (firstRead[reg] == -1)
|
||||
firstRead[reg] = (short)(opindex);
|
||||
lastRead[reg] = (short)(opindex);
|
||||
numReads[reg]++;
|
||||
}
|
||||
|
||||
// Records that the instruction at index `opindex` writes register `reg`:
// sets firstWrite[reg] if it still holds -1, always overwrites lastWrite[reg],
// and increments numWrites[reg].
// `reg` is used as an array index without any bounds check.
inline void SetOutputRegister(int reg, short opindex) {
|
||||
// -1 marks "no write recorded yet"
if (firstWrite[reg] == -1)
|
||||
firstWrite[reg] = (short)(opindex);
|
||||
lastWrite[reg] = (short)(opindex);
|
||||
numWrites[reg]++;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -48,24 +48,6 @@ BreakPoints breakpoints;
|
|||
MemChecks memchecks;
|
||||
PPCDebugInterface debug_interface;
|
||||
|
||||
// Rebuilds ppcState.cr from the eight per-field bytes in ppcState.cr_fast:
// cr_fast[i] is shifted into bit position (28 - 4*i), so cr_fast[0] ends up
// in the top nibble and cr_fast[7] in the bottom nibble.
// The entries are OR-ed in without masking, so this relies on each cr_fast[i]
// holding a value that fits in 4 bits.
void CompactCR()
|
||||
{
|
||||
u32 new_cr = ppcState.cr_fast[0] << 28;
|
||||
for (int i = 1; i < 8; i++)
|
||||
{
|
||||
new_cr |= ppcState.cr_fast[i] << (28 - i * 4);
|
||||
}
|
||||
ppcState.cr = new_cr;
|
||||
}
|
||||
|
||||
// Splits ppcState.cr into eight 4-bit fields, one per byte of
// ppcState.cr_fast: cr_fast[i] receives the nibble at bit position
// (28 - 4*i), so the top nibble of cr goes to cr_fast[0].
void ExpandCR()
|
||||
{
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
ppcState.cr_fast[i] = (ppcState.cr >> (28 - i * 4)) & 0xF;
|
||||
}
|
||||
}
|
||||
|
||||
void DoState(PointerWrap &p)
|
||||
{
|
||||
p.Do(ppcState);
|
||||
|
@ -97,7 +79,7 @@ void ResetRegisters()
|
|||
ppcState.spr[SPR_ECID_M] = 0x1840c00d;
|
||||
ppcState.spr[SPR_ECID_L] = 0x82bb08e8;
|
||||
|
||||
ppcState.cr = 0;
|
||||
ppcState.cr_fast_u32 = 0;
|
||||
ppcState.fpscr = 0;
|
||||
ppcState.pc = 0;
|
||||
ppcState.npc = 0;
|
||||
|
|
|
@ -49,8 +49,14 @@ struct GC_ALIGNED64(PowerPCState)
|
|||
u32 pc; // program counter
|
||||
u32 npc;
|
||||
|
||||
u32 cr; // flags
|
||||
// flags
|
||||
u32 cr_old; // Not used anymore (only there to maintain backward compatibility with previous save states)
|
||||
#pragma pack(push,1)
|
||||
union {
|
||||
u8 cr_fast[8]; // Possibly reorder to 0, 2, 4, 8, 1, 3, 5, 7 so that we can make Compact and Expand super fast?
|
||||
u32 cr_fast_u32; // Warning: This is reversed CR on little-endian systems
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
u32 msr; // machine specific register
|
||||
u32 fpscr; // floating point flags/status bits
|
||||
|
@ -168,13 +174,11 @@ inline void SetCRBit(int bit, int value) {
|
|||
|
||||
// SetCR and GetCR are fairly slow. Should be avoided if possible.
|
||||
inline void SetCR(u32 new_cr) {
|
||||
PowerPC::ppcState.cr = new_cr;
|
||||
PowerPC::ExpandCR();
|
||||
PowerPC::ppcState.cr_fast_u32 = Common::swap32(new_cr);
|
||||
}
|
||||
|
||||
inline u32 GetCR() {
|
||||
PowerPC::CompactCR();
|
||||
return PowerPC::ppcState.cr;
|
||||
return Common::swap32(PowerPC::ppcState.cr_fast_u32);
|
||||
}
|
||||
|
||||
// SetCarry/GetCarry may speed up soon.
|
||||
|
|
Loading…
Reference in New Issue