JIT64: optimize carry calculations

Omit carry (XER.CA) calculations whose result is overwritten later in the
block before it is read. This is very common with srawix and similar instructions.
This commit is contained in:
Fiora 2014-08-21 13:56:18 -07:00
parent a40278b1c4
commit 3aa40dab00
9 changed files with 119 additions and 86 deletions

View File

@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] =
{10, Interpreter::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
{11, Interpreter::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
{12, Interpreter::addic, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}},
{13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}},
{13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}},
{14, Interpreter::addi, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
{15, Interpreter::addis, {"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] =
{922, Interpreter::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{954, Interpreter::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{536, Interpreter::srwx, {"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{24, Interpreter::slwx, {"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{54, Interpreter::dcbst, {"dcbst", OPTYPE_DCACHE, 0, 5, 0, 0, 0}},
@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] =
{339, Interpreter::mfspr, {"mfspr", OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}},
{467, Interpreter::mtspr, {"mtspr", OPTYPE_SPR, 0, 2, 0, 0, 0}},
{371, Interpreter::mftb, {"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}},
{512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, 0, 1, 0, 0, 0}},
{512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}},
{595, Interpreter::mfsr, {"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},
{659, Interpreter::mfsrin, {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},

View File

@ -100,7 +100,7 @@ public:
void GenerateConstantOverflow(bool overflow);
void GenerateConstantOverflow(s64 val);
void GenerateOverflow();
void FinalizeCarryOverflow(bool oe, bool inv = false);
void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false);
void ComputeRC(const Gen::OpArg & arg);
// use to extract bytes from a register using the regcache. offset is in bytes.

View File

@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] =
{922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
{824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
{24, &Jit64::slwx}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{54, &Jit64::dcbst}, //"dcbst", OPTYPE_DCACHE, 0, 4}},

View File

@ -45,7 +45,7 @@ void Jit64::GenerateOverflow()
}
// Assumes CA,OV are clear
void Jit64::FinalizeCarryOverflow(bool oe, bool inv)
void Jit64::FinalizeCarryOverflow(bool ca, bool oe, bool inv)
{
// USES_XER
if (oe)
@ -53,15 +53,17 @@ void Jit64::FinalizeCarryOverflow(bool oe, bool inv)
// this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both
// sides of the branch.
FixupBranch jno = J_CC(CC_NO);
JitSetCAIf(inv ? CC_NC : CC_C);
if (ca)
JitSetCAIf(inv ? CC_NC : CC_C);
//XER[OV/SO] = 1
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK));
FixupBranch exit = J();
SetJumpTarget(jno);
JitSetCAIf(inv ? CC_NC : CC_C);
if (ca)
JitSetCAIf(inv ? CC_NC : CC_C);
SetJumpTarget(exit);
}
else
else if (ca)
{
// Do carry
JitSetCAIf(inv ? CC_NC : CC_C);
@ -129,10 +131,10 @@ static u32 Xor(u32 a, u32 b)
void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry)
{
gpr.Lock(d, a);
carry &= js.op->wantsCA;
if (a || binary || carry) // yeh nasty special case addic
{
if (carry)
JitClearCAOV(false);
JitClearCAOV(carry, false);
if (gpr.R(a).IsImm() && !carry)
{
gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value));
@ -749,34 +751,38 @@ void Jit64::subfic(UGeckoInstruction inst)
{
if (imm == 0)
{
JitClearCAOV(false);
JitClearCAOV(js.op->wantsCA, false);
// Flags act exactly like subtracting from 0
NEG(32, gpr.R(d));
// Output carry is inverted
JitSetCAIf(CC_NC);
if (js.op->wantsCA)
JitSetCAIf(CC_NC);
}
else if (imm == -1)
{
// CA is always set in this case
JitSetCA();
if (js.op->wantsCA)
JitSetCA();
NOT(32, gpr.R(d));
}
else
{
JitClearCAOV(false);
JitClearCAOV(js.op->wantsCA, false);
NOT(32, gpr.R(d));
ADD(32, gpr.R(d), Imm32(imm+1));
// Output carry is normal
JitSetCAIf(CC_C);
if (js.op->wantsCA)
JitSetCAIf(CC_C);
}
}
else
{
JitClearCAOV(false);
JitClearCAOV(js.op->wantsCA, false);
MOV(32, gpr.R(d), Imm32(imm));
SUB(32, gpr.R(d), gpr.R(a));
// Output carry is inverted
JitSetCAIf(CC_NC);
if (js.op->wantsCA)
JitSetCAIf(CC_NC);
}
gpr.UnlockAll();
// This instruction has no RC flag
@ -789,8 +795,7 @@ void Jit64::subfcx(UGeckoInstruction inst)
int a = inst.RA, b = inst.RB, d = inst.RD;
gpr.Lock(a, b, d);
gpr.BindToRegister(d, (d == a || d == b), true);
JitClearCAOV(inst.OE);
JitClearCAOV(js.op->wantsCA, inst.OE);
if (d == b)
{
SUB(32, gpr.R(d), gpr.R(a));
@ -808,7 +813,7 @@ void Jit64::subfcx(UGeckoInstruction inst)
}
if (inst.Rc)
ComputeRC(gpr.R(d));
FinalizeCarryOverflow(inst.OE, true);
FinalizeCarryOverflow(js.op->wantsCA, inst.OE, true);
gpr.UnlockAll();
}
@ -842,7 +847,7 @@ void Jit64::subfex(UGeckoInstruction inst)
NOT(32, gpr.R(d));
ADC(32, gpr.R(d), gpr.R(b));
}
FinalizeCarryOverflow(inst.OE, invertedCarry);
FinalizeCarryOverflow(js.op->wantsCA, inst.OE, invertedCarry);
if (inst.Rc)
ComputeRC(gpr.R(d));
@ -863,7 +868,7 @@ void Jit64::subfmex(UGeckoInstruction inst)
MOV(32, gpr.R(d), gpr.R(a));
NOT(32, gpr.R(d));
ADC(32, gpr.R(d), Imm32(0xFFFFFFFF));
FinalizeCarryOverflow(inst.OE);
FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
if (inst.Rc)
ComputeRC(gpr.R(d));
gpr.UnlockAll();
@ -884,7 +889,7 @@ void Jit64::subfzex(UGeckoInstruction inst)
MOV(32, gpr.R(d), gpr.R(a));
NOT(32, gpr.R(d));
ADC(32, gpr.R(d), Imm8(0));
FinalizeCarryOverflow(inst.OE);
FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
if (inst.Rc)
ComputeRC(gpr.R(d));
@ -1375,7 +1380,7 @@ void Jit64::addex(UGeckoInstruction inst)
MOV(32, gpr.R(d), gpr.R(a));
ADC(32, gpr.R(d), gpr.R(b));
}
FinalizeCarryOverflow(inst.OE);
FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
if (inst.Rc)
ComputeRC(gpr.R(d));
gpr.UnlockAll();
@ -1392,9 +1397,9 @@ void Jit64::addcx(UGeckoInstruction inst)
int operand = ((d == a) ? b : a);
gpr.Lock(a, b, d);
gpr.BindToRegister(d, true);
JitClearCAOV(inst.OE);
JitClearCAOV(js.op->wantsCA, inst.OE);
ADD(32, gpr.R(d), gpr.R(operand));
FinalizeCarryOverflow(inst.OE);
FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
if (inst.Rc)
ComputeRC(gpr.R(d));
gpr.UnlockAll();
@ -1403,10 +1408,10 @@ void Jit64::addcx(UGeckoInstruction inst)
{
gpr.Lock(a, b, d);
gpr.BindToRegister(d, false);
JitClearCAOV(inst.OE);
JitClearCAOV(js.op->wantsCA, inst.OE);
MOV(32, gpr.R(d), gpr.R(a));
ADD(32, gpr.R(d), gpr.R(b));
FinalizeCarryOverflow(inst.OE);
FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
if (inst.Rc)
ComputeRC(gpr.R(d));
gpr.UnlockAll();
@ -1426,7 +1431,7 @@ void Jit64::addmex(UGeckoInstruction inst)
if (d != a)
MOV(32, gpr.R(d), gpr.R(a));
ADC(32, gpr.R(d), Imm32(0xFFFFFFFF));
FinalizeCarryOverflow(inst.OE);
FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
if (inst.Rc)
ComputeRC(gpr.R(d));
gpr.UnlockAll();
@ -1445,7 +1450,7 @@ void Jit64::addzex(UGeckoInstruction inst)
if (d != a)
MOV(32, gpr.R(d), gpr.R(a));
ADC(32, gpr.R(d), Imm8(0));
FinalizeCarryOverflow(inst.OE);
FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
if (inst.Rc)
ComputeRC(gpr.R(d));
gpr.UnlockAll();
@ -1826,16 +1831,23 @@ void Jit64::srawx(UGeckoInstruction inst)
gpr.Lock(a, s, b);
gpr.FlushLockX(ECX);
gpr.BindToRegister(a, (a == s || a == b), true);
JitClearCAOV(false);
JitClearCAOV(js.op->wantsCA, false);
MOV(32, R(ECX), gpr.R(b));
if (a != s)
MOV(32, gpr.R(a), gpr.R(s));
SHL(64, gpr.R(a), Imm8(32));
SAR(64, gpr.R(a), R(ECX));
MOV(32, R(EAX), gpr.R(a));
SHR(64, gpr.R(a), Imm8(32));
TEST(32, gpr.R(a), R(EAX));
JitSetCAIf(CC_NZ);
if (js.op->wantsCA)
{
MOV(32, R(EAX), gpr.R(a));
SHR(64, gpr.R(a), Imm8(32));
TEST(32, gpr.R(a), R(EAX));
JitSetCAIf(CC_NZ);
}
else
{
SHR(64, gpr.R(a), Imm8(32));
}
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc)
@ -1853,33 +1865,42 @@ void Jit64::srawix(UGeckoInstruction inst)
{
gpr.Lock(a, s);
gpr.BindToRegister(a, a == s, true);
JitClearCAOV(false);
MOV(32, R(EAX), gpr.R(s));
if (a != s)
MOV(32, gpr.R(a), R(EAX));
// some optimized common cases that can be done in slightly fewer ops
if (amount == 31)
if (!js.op->wantsCA)
{
SAR(32, gpr.R(a), Imm8(31));
NEG(32, R(EAX)); // EAX = input == INT_MIN ? INT_MIN : -input;
AND(32, R(EAX), Imm32(0x80000000)); // EAX = input < 0 && input != INT_MIN ? 0 : 0x80000000
SHR(32, R(EAX), Imm8(31 - XER_CA_SHIFT));
XOR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = (input < 0 && input != INT_MIN)
}
else if (amount == 1)
{
SHR(32, R(EAX), Imm8(31)); // sign
AND(32, R(EAX), gpr.R(a)); // (sign && carry)
SAR(32, gpr.R(a), Imm8(1));
SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
if (a != s)
MOV(32, gpr.R(a), gpr.R(s));
SAR(32, gpr.R(a), Imm8(amount));
}
else
{
SAR(32, gpr.R(a), Imm8(amount));
SHL(32, R(EAX), Imm8(32 - amount));
TEST(32, R(EAX), gpr.R(a));
JitSetCAIf(CC_NZ);
JitClearCAOV(true, false);
MOV(32, R(EAX), gpr.R(s));
if (a != s)
MOV(32, gpr.R(a), R(EAX));
// some optimized common cases that can be done in slightly fewer ops
if (amount == 31)
{
SAR(32, gpr.R(a), Imm8(31));
NEG(32, R(EAX)); // EAX = input == INT_MIN ? INT_MIN : -input;
AND(32, R(EAX), Imm32(0x80000000)); // EAX = input < 0 && input != INT_MIN ? 0 : 0x80000000
SHR(32, R(EAX), Imm8(31 - XER_CA_SHIFT));
XOR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = (input < 0 && input != INT_MIN)
}
else if (amount == 1)
{
SHR(32, R(EAX), Imm8(31)); // sign
AND(32, R(EAX), gpr.R(a)); // (sign && carry)
SAR(32, gpr.R(a), Imm8(1));
SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
}
else
{
SAR(32, gpr.R(a), Imm8(amount));
SHL(32, R(EAX), Imm8(32 - amount));
TEST(32, R(EAX), gpr.R(a));
JitSetCAIf(CC_NZ);
}
}
}
else
@ -1888,7 +1909,7 @@ void Jit64::srawix(UGeckoInstruction inst)
FALLBACK_IF(true);
gpr.Lock(a, s);
JitClearCAOV(false);
JitClearCAOV(js.op->wantsCA, false);
gpr.BindToRegister(a, a == s, true);
if (a != s)

View File

@ -1110,7 +1110,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
Jit->JitSetCA();
FixupBranch cont = Jit->J();
Jit->SetJumpTarget(nocarry);
Jit->JitClearCAOV(false);
Jit->JitClearCAOV(true, false);
Jit->SetJumpTarget(cont);
regNormalRegClear(RI, I);
break;

View File

@ -824,10 +824,10 @@ void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); //XER.CA = 1
}
void EmuCodeBlock::JitClearCAOV(bool oe)
void EmuCodeBlock::JitClearCAOV(bool ca, bool oe)
{
if (oe)
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK)); //XER.CA, XER.OV = 0
else
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
if (mask == 0xFFFFFFFF)
return;
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(mask));
}

View File

@ -53,7 +53,7 @@ public:
void JitGetAndClearCAOV(bool oe);
void JitSetCA();
void JitSetCAIf(Gen::CCFlags conditionCode);
void JitClearCAOV(bool oe);
void JitClearCAOV(bool ca, bool oe);
void ForceSinglePrecisionS(Gen::X64Reg xmm);
void ForceSinglePrecisionP(Gen::X64Reg xmm);

View File

@ -430,7 +430,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
{
code->wantsCR0 = false;
code->wantsCR1 = false;
code->wantsPS1 = false;
if (opinfo->flags & FL_USE_FPU)
block->m_fpa->any = true;
@ -458,6 +457,15 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;
code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
// mfspr/mtspr can affect/use XER, so be super careful here
if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr
code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
int numOut = 0;
int numIn = 0;
if (opinfo->flags & FL_OUT_A)
@ -715,26 +723,30 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
block->m_broken = true;
}
// Scan for CR0 dependency
// assume next block wants flags to be safe
// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
// wants flags, to be safe.
bool wantsCR0 = true;
bool wantsCR1 = true;
bool wantsPS1 = true;
bool wantsFPRF = true;
bool wantsCA = true;
for (int i = block->m_num_instructions - 1; i >= 0; i--)
{
wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
code[i].wantsCR0 = wantsCR0;
code[i].wantsCR1 = wantsCR1;
code[i].wantsPS1 = wantsPS1;
bool opWantsCR0 = code[i].wantsCR0;
bool opWantsCR1 = code[i].wantsCR1;
bool opWantsFPRF = code[i].wantsFPRF;
bool opWantsCA = code[i].wantsCA;
wantsCR0 |= opWantsCR0 || code[i].canEndBlock;
wantsCR1 |= opWantsCR1 || code[i].canEndBlock;
wantsFPRF |= opWantsFPRF || code[i].canEndBlock;
wantsCA |= opWantsCA || code[i].canEndBlock;
code[i].wantsCR0 = wantsCR0;
code[i].wantsCR1 = wantsCR1;
code[i].wantsFPRF = wantsFPRF;
wantsCR0 &= !code[i].outputCR0;
wantsCR1 &= !code[i].outputCR1;
wantsPS1 &= !code[i].outputPS1;
wantsFPRF &= !code[i].outputFPRF;
code[i].wantsCA = wantsCA;
wantsCR0 &= !code[i].outputCR0 || opWantsCR0;
wantsCR1 &= !code[i].outputCR1 || opWantsCR1;
wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
wantsCA &= !code[i].outputCA || opWantsCA;
}
return address;
}

View File

@ -33,12 +33,12 @@ struct CodeOp //16B
bool isBranchTarget;
bool wantsCR0;
bool wantsCR1;
bool wantsPS1;
bool wantsFPRF;
bool wantsCA;
bool outputCR0;
bool outputCR1;
bool outputPS1;
bool outputFPRF;
bool outputCA;
bool canEndBlock;
bool skip; // followed BL-s for example
};