Merge pull request #1350 from FioraAeterna/integeropts

Various smallish JIT optimizations
This commit is contained in:
Ryan Houdek 2014-11-02 20:13:20 -06:00
commit 204598a082
10 changed files with 127 additions and 25 deletions

View File

@ -285,8 +285,8 @@ static GekkoOPTemplate table31_2[] =
{522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
{234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{491, Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
{1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
{459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
@ -295,14 +295,14 @@ static GekkoOPTemplate table31_2[] =
{11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
{235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
{747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
{104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
{104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_RC_BIT, 1, 0, 0, 0}},
{40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
{552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
{8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
{136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
};
static GekkoOPTemplate table59[] =

View File

@ -414,10 +414,10 @@ void Jit64::WriteBLRExit()
bool disturbed = Cleanup();
if (disturbed)
MOV(32, R(RSCRATCH), PPCSTATE(pc));
MOV(32, R(RSCRATCH2), Imm32(js.downcountAmount));
CMP(64, R(RSCRATCH), MDisp(RSP, 8));
MOV(32, R(RSCRATCH), Imm32(js.downcountAmount));
J_CC(CC_NE, asm_routines.dispatcherMispredictedBLR);
SUB(32, PPCSTATE(downcount), R(RSCRATCH));
SUB(32, PPCSTATE(downcount), R(RSCRATCH2));
RET();
}

View File

@ -137,6 +137,7 @@ public:
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
// Clobbers RDX.
void SetCRFieldBit(int field, int bit, Gen::X64Reg in);
void ClearCRFieldBit(int field, int bit);
// Generates a branch that will check if a given bit of a CR register part
// is set or not.

View File

@ -48,17 +48,18 @@ void Jit64AsmRoutineManager::Generate()
ABI_PopRegistersAndAdjustStack({}, 0);
FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time
dispatcherMispredictedBLR = GetCodePtr();
AND(32, PPCSTATE(pc), Imm32(0xFFFFFFFC));
#if 0 // debug mispredicts
MOV(32, R(ABI_PARAM1), MDisp(RSP, 8)); // guessed_pc
ABI_PushRegistersAndAdjustStack(1 << RSCRATCH, 0);
ABI_PushRegistersAndAdjustStack(1 << RSCRATCH2, 0);
CALL(reinterpret_cast<void *>(&ReportMispredict));
ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0);
ABI_PopRegistersAndAdjustStack(1 << RSCRATCH2, 0);
#endif
ResetStack();
SUB(32, PPCSTATE(downcount), R(RSCRATCH));
SUB(32, PPCSTATE(downcount), R(RSCRATCH2));
dispatcher = GetCodePtr();
// The result of slice decrementation should be in flags if somebody jumped here

View File

@ -229,7 +229,11 @@ void Jit64::bclrx(UGeckoInstruction inst)
#endif
MOV(32, R(RSCRATCH), PPCSTATE_LR);
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
// When the BLR optimization is enabled we don't have to mask off the low two bits of LR here, because
// WriteBLRExit handles it for us. Specifically, since we only ever push divisible-by-four instruction
// addresses onto the stack, if the return address matches, we're already good. If it doesn't match,
// the mispredicted-BLR code handles the fixup.
if (!m_enable_blr_optimization)
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
if (inst.LK)
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));

View File

@ -276,6 +276,18 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
int a = inst.FA;
int b = inst.FB;
int crf = inst.CRFD;
int output[4] = { CR_SO, CR_EQ, CR_GT, CR_LT };
// Merge neighboring fcmp and cror (the primary use of cror).
UGeckoInstruction next = js.next_inst;
if (next.OPCD == 19 && next.SUBOP10 == 449 && (next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
{
js.skipnext = true;
js.downcountAmount++;
int dst = 3 - (next.CRBD & 3);
output[3 - (next.CRBA & 3)] |= 1 << dst;
output[3 - (next.CRBB & 3)] |= 1 << dst;
}
fpr.Lock(a, b);
fpr.BindToRegister(b, true, false);
@ -315,14 +327,14 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
pGreater = J_CC(CC_B);
}
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_EQ)));
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_EQ_BIT])));
if (fprf)
OR(32, PPCSTATE(fpscr), Imm32(CR_EQ << FPRF_SHIFT));
continue1 = J();
SetJumpTarget(pNaN);
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_SO)));
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_SO_BIT])));
if (fprf)
OR(32, PPCSTATE(fpscr), Imm32(CR_SO << FPRF_SHIFT));
@ -331,13 +343,13 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
continue2 = J();
SetJumpTarget(pGreater);
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_GT)));
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_GT_BIT])));
if (fprf)
OR(32, PPCSTATE(fpscr), Imm32(CR_GT << FPRF_SHIFT));
continue3 = J();
SetJumpTarget(pLesser);
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_LT)));
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_LT_BIT])));
if (fprf)
OR(32, PPCSTATE(fpscr), Imm32(CR_LT << FPRF_SHIFT));
}

View File

@ -153,7 +153,18 @@ void Jit64::ComputeRC(const Gen::OpArg & arg, bool needs_test, bool needs_sext)
else
{
if (needs_test)
{
TEST(32, arg, arg);
}
else
{
// If an operand to the cmp/rc op we're merging with the branch isn't used anymore, it'd be
// better to flush it here so that we don't have to flush it on both sides of the branch.
// We don't want to do this if a test is needed though, because it would interrupt macro-op
// fusion.
for (int j : js.op->gprInUse)
gpr.StoreFromRegister(j);
}
DoMergedBranchCondition();
}
}
@ -355,7 +366,8 @@ void Jit64::DoMergedBranch()
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx
{
MOV(32, R(RSCRATCH), M(&LR));
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
if (!m_enable_blr_optimization)
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
if (js.next_inst.LK)
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
WriteBLRExit();
@ -544,7 +556,16 @@ void Jit64::cmpXX(UGeckoInstruction inst)
MOV(64, PPCSTATE(cr_val[crf]), R(input));
// Place the comparison next to the branch for macro-op fusion
if (merge_branch)
TEST(64, R(input), R(input));
{
// We only need to do a 32-bit compare, since the flags set will be the same as a sign-extended
// result.
// We should also test against gpr.R(a) if it's bound, since that's one less cycle of latency
// (the CPU doesn't have to wait for the movsxd to finish to resolve the branch).
if (gpr.R(a).IsSimpleReg())
TEST(32, gpr.R(a), gpr.R(a));
else
TEST(32, R(input), R(input));
}
}
else
{
@ -1007,20 +1028,30 @@ void Jit64::mulhwXx(UGeckoInstruction inst)
else
gpr.SetImmediate32(d, (u32)((gpr.R(a).offset * gpr.R(b).offset) >> 32));
}
else
else if (sign)
{
gpr.Lock(a, b, d);
// no register choice
gpr.FlushLockX(EDX, EAX);
gpr.BindToRegister(d, (d == a || d == b), true);
gpr.BindToRegister(d, d == a || d == b, true);
MOV(32, R(EAX), gpr.R(a));
gpr.KillImmediate(b, true, false);
if (sign)
IMUL(32, gpr.R(b));
else
MUL(32, gpr.R(b));
IMUL(32, gpr.R(b));
MOV(32, gpr.R(d), R(EDX));
}
else
{
// Not faster for signed because we'd need two movsx.
gpr.Lock(a, b, d);
// We need to bind everything to registers since the top 32 bits need to be zero.
int src = d == b ? a : b;
gpr.BindToRegister(d, d == a || d == b, true);
gpr.BindToRegister(src, true, false);
if (d != a && d != b)
MOV(32, gpr.R(d), gpr.R(a));
IMUL(64, gpr.RX(d), gpr.R(src));
SHR(64, gpr.R(d), Imm8(32));
}
if (inst.Rc)
ComputeRC(gpr.R(d));
gpr.UnlockAll();

View File

@ -89,6 +89,42 @@ void Jit64::SetCRFieldBit(int field, int bit, Gen::X64Reg in)
MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2));
}
// Emits code that clears one bit (SO/EQ/GT/LT) of CR field `field` in its 64-bit
// cr_val[field] representation. Equivalent to SetCRFieldBit with a constant input of 0,
// but needs no input register. Clobbers RSCRATCH2.
//
// Encoding relied on below: SO is bit 61 and LT is bit 62 (stored directly), while EQ and
// GT are stored inverted (bit 0 / bit 63 hold !EQ / !GT). Clearing EQ or GT therefore
// means *setting* the corresponding stored bit.
void Jit64::ClearCRFieldBit(int field, int bit)
{
	MOV(64, R(RSCRATCH2), PPCSTATE(cr_val[field]));

	// If the stored value is entirely zero, the unconditional "set bit 32" at the end would
	// change what the value encodes for GT (presumably GT is derived from the value as a
	// whole - confirm against SetCRFieldBit). Set bit 63 first so GT stays clear. Not needed
	// when we're writing the GT bit ourselves.
	if (bit != CR_GT_BIT)
	{
		TEST(64, R(RSCRATCH2), R(RSCRATCH2));
		FixupBranch dont_clear_gt = J_CC(CC_NZ);
		BTS(64, R(RSCRATCH2), Imm8(63));
		SetJumpTarget(dont_clear_gt);
	}

	switch (bit)
	{
	case CR_SO_BIT: // clear bit 61
		BTR(64, R(RSCRATCH2), Imm8(61));
		break;

	case CR_EQ_BIT: // clear low 32 bits, then set bit 0 (stored as !EQ, so 1 == EQ clear)
		SHR(64, R(RSCRATCH2), Imm8(32));
		SHL(64, R(RSCRATCH2), Imm8(32));
		BTS(64, R(RSCRATCH2), Imm8(0));
		break;

	case CR_GT_BIT: // set bit 63 (stored as !GT, so 1 == GT clear)
		BTS(64, R(RSCRATCH2), Imm8(63));
		break;

	case CR_LT_BIT: // clear bit 62
		BTR(64, R(RSCRATCH2), Imm8(62));
		break;
	}

	// Same final step as the original: always set bit 32 before storing the value back.
	BTS(64, R(RSCRATCH2), Imm8(32));
	MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2));
}
FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set)
{
switch (bit)
@ -472,6 +508,13 @@ void Jit64::crXXX(UGeckoInstruction inst)
JITDISABLE(bJITSystemRegistersOff);
_dbg_assert_msg_(DYNA_REC, inst.OPCD == 19, "Invalid crXXX");
// Special case: crclr
if (inst.CRBA == inst.CRBB && inst.CRBA == inst.CRBD && inst.SUBOP10 == 193)
{
ClearCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3));
return;
}
// TODO(delroth): Potential optimizations could be applied here. For
// instance, if the two CR bits being loaded are the same, two loads are
// not required.

View File

@ -406,6 +406,11 @@ static bool isCarryOp(const CodeOp& a)
return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
}
// Returns true if the op is a cror: primary opcode 19 with extended opcode 449.
static bool isCror(const CodeOp& a)
{
	if (a.inst.OPCD != 19)
		return false;

	return a.inst.SUBOP10 == 449;
}
void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
{
// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
@ -426,7 +431,7 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r
CodeOp &b = code[i + increment];
// Reorder integer compares, rlwinm., and carry-affecting ops
// (if we add more merged branch instructions, add them here!)
if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0)))
if ((type == REORDER_CROR && isCror(a)) || (type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0)))
{
// once we're next to a carry instruction, don't move away!
if (type == REORDER_CARRY && i != start)
@ -454,6 +459,10 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r
void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
{
// Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more
// picky about this, but cror seems to almost solely be used for this purpose in real code.
// Additionally, the other boolean ops seem to almost never be used.
ReorderInstructionsCore(instructions, code, true, REORDER_CROR);
// For carry, bubble instructions *towards* each other; one direction often isn't enough
// to get pairs like addc/adde next to each other.
if (HasOption(OPTION_CARRY_MERGE))

View File

@ -156,7 +156,8 @@ private:
// Selects which class of instruction ReorderInstructionsCore bubbles in a given pass:
//   REORDER_CARRY - ops matched by isCarryOp
//   REORDER_CMP   - ops matched by isCmp, or ops that output CR0
//   REORDER_CROR  - ops matched by isCror
enum ReorderType
{
REORDER_CARRY,
REORDER_CMP
REORDER_CMP,
REORDER_CROR
};
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);