Merge pull request #1350 from FioraAeterna/integeropts
Various smallish JIT optimizations
commit 204598a082
@@ -285,8 +285,8 @@ static GekkoOPTemplate table31_2[] =
     {522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
     {138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
     {650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
-    {234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-    {202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+    {234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+    {202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
     {491, Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
     {1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
     {459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
@@ -295,14 +295,14 @@ static GekkoOPTemplate table31_2[] =
     {11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
     {235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
     {747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
-    {104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
+    {104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_RC_BIT, 1, 0, 0, 0}},
     {40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
     {552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
     {8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
     {520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
     {136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-    {232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-    {200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+    {232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+    {200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 };
 
 static GekkoOPTemplate table59[] =
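The table change above narrows addme/addze, neg, and subfme/subfze from FL_IN_AB to FL_IN_A: these instructions encode no rB operand, so telling the analyzer they read rB only created false input dependencies. A standalone sketch of how such flags typically drive input tracking (the enum values and helper below are illustrative assumptions, not Dolphin's actual definitions):

#include <cstdint>

// Hypothetical flag values; the only property this sketch relies on is that
// FL_IN_AB combines the two single-register input flags.
enum : uint32_t
{
    FL_IN_A  = 1u << 0,
    FL_IN_B  = 1u << 1,
    FL_IN_AB = FL_IN_A | FL_IN_B,
};

struct DecodedInst
{
    int ra, rb;  // register fields from the instruction word
};

// Bitmask of GPRs an instruction reads, derived purely from its table flags.
// With FL_IN_A instead of FL_IN_AB, addme/addze/neg/subfme/subfze no longer
// pin whatever register happens to be named by their (unused) rB field.
uint32_t InputRegisters(uint32_t flags, const DecodedInst& inst)
{
    uint32_t regs = 0;
    if (flags & FL_IN_A)
        regs |= 1u << inst.ra;
    if (flags & FL_IN_B)
        regs |= 1u << inst.rb;
    return regs;
}

int main()
{
    DecodedInst addme{ /*ra=*/3, /*rb=*/7 };  // rb is meaningless for addme
    return InputRegisters(FL_IN_A, addme) == (1u << 3) ? 0 : 1;
}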
@@ -414,10 +414,10 @@ void Jit64::WriteBLRExit()
     bool disturbed = Cleanup();
     if (disturbed)
         MOV(32, R(RSCRATCH), PPCSTATE(pc));
+    MOV(32, R(RSCRATCH2), Imm32(js.downcountAmount));
     CMP(64, R(RSCRATCH), MDisp(RSP, 8));
-    MOV(32, R(RSCRATCH), Imm32(js.downcountAmount));
     J_CC(CC_NE, asm_routines.dispatcherMispredictedBLR);
-    SUB(32, PPCSTATE(downcount), R(RSCRATCH));
+    SUB(32, PPCSTATE(downcount), R(RSCRATCH2));
     RET();
 }
 
@@ -137,6 +137,7 @@ public:
     void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
     // Clobbers RDX.
     void SetCRFieldBit(int field, int bit, Gen::X64Reg in);
+    void ClearCRFieldBit(int field, int bit);
 
     // Generates a branch that will check if a given bit of a CR register part
     // is set or not.
@@ -48,17 +48,18 @@ void Jit64AsmRoutineManager::Generate()
     ABI_PopRegistersAndAdjustStack({}, 0);
     FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time
     dispatcherMispredictedBLR = GetCodePtr();
+    AND(32, PPCSTATE(pc), Imm32(0xFFFFFFFC));
 
 #if 0 // debug mispredicts
     MOV(32, R(ABI_PARAM1), MDisp(RSP, 8)); // guessed_pc
-    ABI_PushRegistersAndAdjustStack(1 << RSCRATCH, 0);
+    ABI_PushRegistersAndAdjustStack(1 << RSCRATCH2, 0);
     CALL(reinterpret_cast<void *>(&ReportMispredict));
-    ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0);
+    ABI_PopRegistersAndAdjustStack(1 << RSCRATCH2, 0);
 #endif
 
     ResetStack();
 
-    SUB(32, PPCSTATE(downcount), R(RSCRATCH));
+    SUB(32, PPCSTATE(downcount), R(RSCRATCH2));
 
     dispatcher = GetCodePtr();
     // The result of slice decrementation should be in flags if somebody jumped here
@@ -229,7 +229,11 @@ void Jit64::bclrx(UGeckoInstruction inst)
 #endif
 
     MOV(32, R(RSCRATCH), PPCSTATE_LR);
-    AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
+    // We don't have to do this because WriteBLRExit handles it for us. Specifically, since we only ever push
+    // divisible-by-four instruction addresses onto the stack, if the return address matches, we're already
+    // good. If it doesn't match, the mispredicted-BLR code handles the fixup.
+    if (!m_enable_blr_optimization)
+        AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
     if (inst.LK)
         MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
 
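The bclrx, WriteBLRExit, and dispatcher hunks above all touch the same mechanism: the JIT keeps a shadow stack of guessed return addresses so a blr can fall straight through instead of returning to the dispatcher. A minimal sketch of that idea (plain C++ with hypothetical names, not Dolphin's implementation):

#include <cstdint>
#include <vector>

// Shadow return stack: bl pushes the (already 4-byte-aligned) address of the
// following instruction; blr pops it and compares against LR.
struct ReturnStack
{
    std::vector<uint32_t> guesses;

    void OnCall(uint32_t return_pc)
    {
        guesses.push_back(return_pc & ~3u);  // only aligned addresses ever go in
    }

    // True  -> prediction hit: keep running straight-line JIT code.
    // False -> mispredict: the slow path masks pc itself (the AND added to the
    //          mispredicted-BLR handler above) and re-enters the dispatcher.
    bool OnReturn(uint32_t lr)
    {
        if (guesses.empty())
            return false;
        uint32_t guess = guesses.back();
        guesses.pop_back();
        return guess == lr;
    }
};

int main()
{
    ReturnStack rs;
    rs.OnCall(0x80001234);                   // bl: the next instruction is 0x80001234
    return rs.OnReturn(0x80001234) ? 0 : 1;  // blr with a matching LR takes the fast path
}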
@@ -276,6 +276,18 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
     int a = inst.FA;
     int b = inst.FB;
     int crf = inst.CRFD;
+    int output[4] = { CR_SO, CR_EQ, CR_GT, CR_LT };
+
+    // Merge neighboring fcmp and cror (the primary use of cror).
+    UGeckoInstruction next = js.next_inst;
+    if (next.OPCD == 19 && next.SUBOP10 == 449 && (next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
+    {
+        js.skipnext = true;
+        js.downcountAmount++;
+        int dst = 3 - (next.CRBD & 3);
+        output[3 - (next.CRBA & 3)] |= 1 << dst;
+        output[3 - (next.CRBB & 3)] |= 1 << dst;
+    }
 
     fpr.Lock(a, b);
     fpr.BindToRegister(b, true, false);
@@ -315,14 +327,14 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
         pGreater = J_CC(CC_B);
     }
 
-    MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_EQ)));
+    MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_EQ_BIT])));
     if (fprf)
         OR(32, PPCSTATE(fpscr), Imm32(CR_EQ << FPRF_SHIFT));
 
     continue1 = J();
 
     SetJumpTarget(pNaN);
-    MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_SO)));
+    MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_SO_BIT])));
     if (fprf)
         OR(32, PPCSTATE(fpscr), Imm32(CR_SO << FPRF_SHIFT));
 
@@ -331,13 +343,13 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
     continue2 = J();
 
     SetJumpTarget(pGreater);
-    MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_GT)));
+    MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_GT_BIT])));
     if (fprf)
         OR(32, PPCSTATE(fpscr), Imm32(CR_GT << FPRF_SHIFT));
     continue3 = J();
 
     SetJumpTarget(pLesser);
-    MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_LT)));
+    MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_LT_BIT])));
     if (fprf)
         OR(32, PPCSTATE(fpscr), Imm32(CR_LT << FPRF_SHIFT));
 }
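The FloatCompare hunks above implement the fcmp+cror merge: output[] maps each compare outcome to the CR field value it should produce, and a neighboring cror within the same field simply ORs an extra bit into the relevant outcomes, so the cror itself never has to be emitted. A standalone worked example of that table manipulation (the enum values follow the usual PowerPC bit assignments; this is an illustration, not Dolphin code):

#include <cstdio>

// PPC CR field values and the corresponding bit indices (SO is the least significant bit).
enum { CR_SO = 1, CR_EQ = 2, CR_GT = 4, CR_LT = 8 };
enum { CR_SO_BIT = 0, CR_EQ_BIT = 1, CR_GT_BIT = 2, CR_LT_BIT = 3 };

int main()
{
    // Same starting point as the patch: each compare outcome initially sets only its own bit.
    int output[4] = { CR_SO, CR_EQ, CR_GT, CR_LT };

    // Example follow-up instruction: cror eq, lt, eq within the compared field,
    // a common way games turn "less than" into "less than or equal".
    int dst = CR_EQ_BIT, srcA = CR_LT_BIT, srcB = CR_EQ_BIT;
    output[srcA] |= 1 << dst;
    output[srcB] |= 1 << dst;

    // The "less than" outcome now also sets EQ; FloatCompare stores output[...]
    // instead of the bare CR_LT/CR_GT/CR_EQ/CR_SO value and skips the cror.
    printf("LT outcome -> 0x%X (CR_LT | CR_EQ)\n", output[CR_LT_BIT]);
    printf("EQ outcome -> 0x%X (CR_EQ)\n", output[CR_EQ_BIT]);
    printf("GT outcome -> 0x%X (CR_GT)\n", output[CR_GT_BIT]);
    return 0;
}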
@@ -153,7 +153,18 @@ void Jit64::ComputeRC(const Gen::OpArg & arg, bool needs_test, bool needs_sext)
     else
     {
         if (needs_test)
+        {
             TEST(32, arg, arg);
+        }
+        else
+        {
+            // If an operand to the cmp/rc op we're merging with the branch isn't used anymore, it'd be
+            // better to flush it here so that we don't have to flush it on both sides of the branch.
+            // We don't want to do this if a test is needed though, because it would interrupt macro-op
+            // fusion.
+            for (int j : js.op->gprInUse)
+                gpr.StoreFromRegister(j);
+        }
         DoMergedBranchCondition();
     }
 }
@@ -355,7 +366,8 @@ void Jit64::DoMergedBranch()
     else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx
     {
         MOV(32, R(RSCRATCH), M(&LR));
-        AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
+        if (!m_enable_blr_optimization)
+            AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
         if (js.next_inst.LK)
             MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
         WriteBLRExit();
@@ -544,7 +556,16 @@ void Jit64::cmpXX(UGeckoInstruction inst)
         MOV(64, PPCSTATE(cr_val[crf]), R(input));
         // Place the comparison next to the branch for macro-op fusion
         if (merge_branch)
-            TEST(64, R(input), R(input));
+        {
+            // We only need to do a 32-bit compare, since the flags set will be the same as a sign-extended
+            // result.
+            // We should also test against gpr.R(a) if it's bound, since that's one less cycle of latency
+            // (the CPU doesn't have to wait for the movsxd to finish to resolve the branch).
+            if (gpr.R(a).IsSimpleReg())
+                TEST(32, gpr.R(a), gpr.R(a));
+            else
+                TEST(32, R(input), R(input));
+        }
     }
     else
     {
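The comment in the hunk above rests on a simple fact: once a value is the sign extension of its low 32 bits, testing only those 32 bits produces the same ZF/SF a 64-bit test would. A quick standalone check of that claim (illustrative C++, not part of the patch):

#include <cassert>
#include <cstdint>

int main()
{
    const int32_t cases[] = { 0, 1, -1, INT32_MIN, INT32_MAX, 123456, -123456 };
    for (int32_t r : cases)
    {
        int64_t sext = r;                             // what cr_val holds after a 32-bit op
        bool zf64 = (sext == 0), sf64 = (sext < 0);   // flags a 64-bit TEST would set
        bool zf32 = (r == 0),    sf32 = (r < 0);      // flags a 32-bit TEST sets
        assert(zf64 == zf32 && sf64 == sf32);         // identical, so the narrow test is safe
    }
    return 0;
}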
@@ -1007,20 +1028,30 @@ void Jit64::mulhwXx(UGeckoInstruction inst)
         else
             gpr.SetImmediate32(d, (u32)((gpr.R(a).offset * gpr.R(b).offset) >> 32));
     }
-    else
+    else if (sign)
     {
         gpr.Lock(a, b, d);
         // no register choice
         gpr.FlushLockX(EDX, EAX);
-        gpr.BindToRegister(d, (d == a || d == b), true);
+        gpr.BindToRegister(d, d == a || d == b, true);
         MOV(32, R(EAX), gpr.R(a));
         gpr.KillImmediate(b, true, false);
-        if (sign)
-            IMUL(32, gpr.R(b));
-        else
-            MUL(32, gpr.R(b));
+        IMUL(32, gpr.R(b));
         MOV(32, gpr.R(d), R(EDX));
     }
+    else
+    {
+        // Not faster for signed because we'd need two movsx.
+        gpr.Lock(a, b, d);
+        // We need to bind everything to registers since the top 32 bits need to be zero.
+        int src = d == b ? a : b;
+        gpr.BindToRegister(d, d == a || d == b, true);
+        gpr.BindToRegister(src, true, false);
+        if (d != a && d != b)
+            MOV(32, gpr.R(d), gpr.R(a));
+        IMUL(64, gpr.RX(d), gpr.R(src));
+        SHR(64, gpr.R(d), Imm8(32));
+    }
     if (inst.Rc)
         ComputeRC(gpr.R(d));
     gpr.UnlockAll();
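The new unsigned path above relies on two facts: with both inputs zero-extended, the full product fits in 64 bits, and a signed 64-bit IMUL still leaves the correct low 64 bits even when the product exceeds the signed range, since signed and unsigned multiplication differ only in the flags they set. A small standalone check of the worst case (illustrative, not Dolphin code):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main()
{
    // Worst case for mulhwu: both operands 0xFFFFFFFF. The 64-bit product has bit 63
    // set, i.e. it "overflows" a signed multiply, yet the bit pattern is still exact.
    const uint64_t a = 0xFFFFFFFFu;
    const uint64_t b = 0xFFFFFFFFu;

    uint64_t product = a * b;                 // what IMUL(64, rd, src) leaves in rd
    assert(product == 0xFFFFFFFE00000001ull);

    uint32_t high = uint32_t(product >> 32);  // SHR(64, rd, Imm8(32))
    assert(high == 0xFFFFFFFEu);              // the PowerPC mulhwu result

    printf("mulhwu(0xFFFFFFFF, 0xFFFFFFFF) = 0x%08X\n", high);
    return 0;
}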
@@ -89,6 +89,42 @@ void Jit64::SetCRFieldBit(int field, int bit, Gen::X64Reg in)
     MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2));
 }
 
+void Jit64::ClearCRFieldBit(int field, int bit)
+{
+    MOV(64, R(RSCRATCH2), PPCSTATE(cr_val[field]));
+
+    if (bit != CR_GT_BIT)
+    {
+        TEST(64, R(RSCRATCH2), R(RSCRATCH2));
+        FixupBranch dont_clear_gt = J_CC(CC_NZ);
+        BTS(64, R(RSCRATCH2), Imm8(63));
+        SetJumpTarget(dont_clear_gt);
+    }
+
+    switch (bit)
+    {
+    case CR_SO_BIT: // set bit 61 to input
+        BTR(64, R(RSCRATCH2), Imm8(61));
+        break;
+
+    case CR_EQ_BIT: // clear low 32 bits, set bit 0 to !input
+        SHR(64, R(RSCRATCH2), Imm8(32));
+        SHL(64, R(RSCRATCH2), Imm8(32));
+        break;
+
+    case CR_GT_BIT: // set bit 63 to !input
+        BTR(64, R(RSCRATCH2), Imm8(63));
+        break;
+
+    case CR_LT_BIT: // set bit 62 to input
+        BTR(64, R(RSCRATCH2), Imm8(62));
+        break;
+    }
+
+    BTS(64, R(RSCRATCH2), Imm8(32));
+    MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2));
+}
+
 FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set)
 {
     switch (bit)
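The bit positions used above (61 for SO, 62 for LT, 63 for GT, the low 32 bits for EQ) come from the JIT's per-field CR representation, in which each of the eight CR fields is stored as one 64-bit value so that a sign-extended Rc result is already close to a valid encoding. A reconstruction of that encoding as plain C++ (inferred from the comments in this diff; names mirror the patch, but the functions are illustrative, not copied from Dolphin):

#include <cassert>
#include <cstdint>

enum { CR_SO = 1, CR_EQ = 2, CR_GT = 4, CR_LT = 8 };

// 4-bit PPC CR field -> 64-bit internal value.
// SO lives in bit 61, LT in bit 62; GT means "positive as a signed 64-bit value";
// EQ means "low 32 bits are zero". Bit 32 is kept set so the value is never 0.
uint64_t PPCCRToInternal(uint32_t field)
{
    uint64_t v = 1ull << 32;
    v |= uint64_t(!!(field & CR_SO)) << 61;
    v |= uint64_t(!(field & CR_GT)) << 63;   // bit 63 set forces the value negative (GT clear)
    v |= uint64_t(!!(field & CR_LT)) << 62;
    v |= uint64_t(!(field & CR_EQ));         // any low bit set means EQ clear
    return v;
}

// The reverse mapping, as a host-side reference.
uint32_t InternalToPPCCR(uint64_t v)
{
    uint32_t field = 0;
    if (v & (1ull << 61)) field |= CR_SO;
    if (uint32_t(v) == 0) field |= CR_EQ;
    if (int64_t(v) > 0)   field |= CR_GT;
    if (v & (1ull << 62)) field |= CR_LT;
    return field;
}

int main()
{
    const uint32_t fields[] = { CR_SO, CR_EQ, CR_GT, CR_LT };
    for (uint32_t f : fields)
        assert(InternalToPPCCR(PPCCRToInternal(f)) == f);

    // A sign-extended 32-bit Rc result already behaves correctly for LT/GT/EQ:
    assert(InternalToPPCCR(uint64_t(int64_t(-5))) & CR_LT);
    assert(InternalToPPCCR(0) == CR_EQ);
    assert(InternalToPPCCR(7) == CR_GT);
    return 0;
}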
@@ -472,6 +508,13 @@ void Jit64::crXXX(UGeckoInstruction inst)
     JITDISABLE(bJITSystemRegistersOff);
     _dbg_assert_msg_(DYNA_REC, inst.OPCD == 19, "Invalid crXXX");
 
+    // Special case: crclr
+    if (inst.CRBA == inst.CRBB && inst.CRBA == inst.CRBD && inst.SUBOP10 == 193)
+    {
+        ClearCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3));
+        return;
+    }
+
     // TODO(delroth): Potential optimizations could be applied here. For
     // instance, if the two CR bits being loaded are the same, two loads are
     // not required.
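Why the check above means "crclr": extended opcode 193 under primary opcode 19 is crxor, and crxor with all three operand fields equal computes crbD = crbD ^ crbD, which is always 0, so the whole instruction reduces to clearing a single CR bit. A trivial standalone confirmation of the identity (illustrative only):

#include <cassert>

int main()
{
    const int bits[] = { 0, 1 };
    for (int bit : bits)
        assert((bit ^ bit) == 0);  // crxor crbD, crbD, crbD always clears crbD
    return 0;
}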
@@ -406,6 +406,11 @@ static bool isCarryOp(const CodeOp& a)
     return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
 }
 
+static bool isCror(const CodeOp& a)
+{
+    return a.inst.OPCD == 19 && a.inst.SUBOP10 == 449;
+}
+
 void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
 {
     // Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
@@ -426,7 +431,7 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
         CodeOp &b = code[i + increment];
         // Reorder integer compares, rlwinm., and carry-affecting ops
         // (if we add more merged branch instructions, add them here!)
-        if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0)))
+        if ((type == REORDER_CROR && isCror(a)) || (type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0)))
         {
             // once we're next to a carry instruction, don't move away!
             if (type == REORDER_CARRY && i != start)
@@ -454,6 +459,10 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
 
 void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
 {
+    // Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more
+    // picky about this, but cror seems to almost solely be used for this purpose in real code.
+    // Additionally, the other boolean ops seem to almost never be used.
+    ReorderInstructionsCore(instructions, code, true, REORDER_CROR);
     // For carry, bubble instructions *towards* each other; one direction often isn't enough
     // to get pairs like addc/adde next to each other.
     if (HasOption(OPTION_CARRY_MERGE))
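The reorder pass gives FloatCompare a chance to see the cror as js.next_inst by bubbling it upward until it sits directly behind the compare that feeds it. A deliberately simplified sketch of that bubbling idea (hypothetical types, no dependency checking; the real pass is more careful):

#include <cstddef>
#include <utility>
#include <vector>

struct Op
{
    bool is_fcmp = false;
    bool is_cror = false;
    bool swappable = true;  // the real analyzer checks register and CR dependencies here
};

// Move each cror up past unrelated instructions until it is adjacent to an fcmp,
// so the JIT later sees the pair back-to-back and can merge them.
void BubbleCrorUpwards(std::vector<Op>& code)
{
    for (std::size_t i = 1; i < code.size(); ++i)
    {
        std::size_t j = i;
        while (j > 0 && code[j].is_cror && code[j - 1].swappable && !code[j - 1].is_fcmp)
        {
            std::swap(code[j], code[j - 1]);
            --j;
        }
    }
}

int main()
{
    // fcmp, <unrelated>, cror  ->  fcmp, cror, <unrelated>
    std::vector<Op> block = { { true, false, false }, { false, false, true }, { false, true, true } };
    BubbleCrorUpwards(block);
    return (block[0].is_fcmp && block[1].is_cror) ? 0 : 1;
}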
|
|
@ -156,7 +156,8 @@ private:
|
|||
enum ReorderType
|
||||
{
|
||||
REORDER_CARRY,
|
||||
REORDER_CMP
|
||||
REORDER_CMP,
|
||||
REORDER_CROR
|
||||
};
|
||||
|
||||
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
|
||||
|
|