Merge pull request #1350 from FioraAeterna/integeropts
Various smallish JIT optimizations
This commit is contained in:
commit
204598a082
|
@ -285,8 +285,8 @@ static GekkoOPTemplate table31_2[] =
|
||||||
{522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
{650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
||||||
{234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{491, Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
{491, Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
||||||
{1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
|
{1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
|
||||||
{459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
{459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
||||||
|
@ -295,14 +295,14 @@ static GekkoOPTemplate table31_2[] =
|
||||||
{11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
{11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
||||||
{235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
{235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
||||||
{747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
|
{747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
|
||||||
{104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
{104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
{40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
{552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
||||||
{8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
{520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
||||||
{136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
};
|
};
|
||||||
|
|
||||||
static GekkoOPTemplate table59[] =
|
static GekkoOPTemplate table59[] =
|
||||||
|
|
|
@ -414,10 +414,10 @@ void Jit64::WriteBLRExit()
|
||||||
bool disturbed = Cleanup();
|
bool disturbed = Cleanup();
|
||||||
if (disturbed)
|
if (disturbed)
|
||||||
MOV(32, R(RSCRATCH), PPCSTATE(pc));
|
MOV(32, R(RSCRATCH), PPCSTATE(pc));
|
||||||
|
MOV(32, R(RSCRATCH2), Imm32(js.downcountAmount));
|
||||||
CMP(64, R(RSCRATCH), MDisp(RSP, 8));
|
CMP(64, R(RSCRATCH), MDisp(RSP, 8));
|
||||||
MOV(32, R(RSCRATCH), Imm32(js.downcountAmount));
|
|
||||||
J_CC(CC_NE, asm_routines.dispatcherMispredictedBLR);
|
J_CC(CC_NE, asm_routines.dispatcherMispredictedBLR);
|
||||||
SUB(32, PPCSTATE(downcount), R(RSCRATCH));
|
SUB(32, PPCSTATE(downcount), R(RSCRATCH2));
|
||||||
RET();
|
RET();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -137,6 +137,7 @@ public:
|
||||||
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
|
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
|
||||||
// Clobbers RDX.
|
// Clobbers RDX.
|
||||||
void SetCRFieldBit(int field, int bit, Gen::X64Reg in);
|
void SetCRFieldBit(int field, int bit, Gen::X64Reg in);
|
||||||
|
void ClearCRFieldBit(int field, int bit);
|
||||||
|
|
||||||
// Generates a branch that will check if a given bit of a CR register part
|
// Generates a branch that will check if a given bit of a CR register part
|
||||||
// is set or not.
|
// is set or not.
|
||||||
|
|
|
@ -48,17 +48,18 @@ void Jit64AsmRoutineManager::Generate()
|
||||||
ABI_PopRegistersAndAdjustStack({}, 0);
|
ABI_PopRegistersAndAdjustStack({}, 0);
|
||||||
FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time
|
FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time
|
||||||
dispatcherMispredictedBLR = GetCodePtr();
|
dispatcherMispredictedBLR = GetCodePtr();
|
||||||
|
AND(32, PPCSTATE(pc), Imm32(0xFFFFFFFC));
|
||||||
|
|
||||||
#if 0 // debug mispredicts
|
#if 0 // debug mispredicts
|
||||||
MOV(32, R(ABI_PARAM1), MDisp(RSP, 8)); // guessed_pc
|
MOV(32, R(ABI_PARAM1), MDisp(RSP, 8)); // guessed_pc
|
||||||
ABI_PushRegistersAndAdjustStack(1 << RSCRATCH, 0);
|
ABI_PushRegistersAndAdjustStack(1 << RSCRATCH2, 0);
|
||||||
CALL(reinterpret_cast<void *>(&ReportMispredict));
|
CALL(reinterpret_cast<void *>(&ReportMispredict));
|
||||||
ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0);
|
ABI_PopRegistersAndAdjustStack(1 << RSCRATCH2, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ResetStack();
|
ResetStack();
|
||||||
|
|
||||||
SUB(32, PPCSTATE(downcount), R(RSCRATCH));
|
SUB(32, PPCSTATE(downcount), R(RSCRATCH2));
|
||||||
|
|
||||||
dispatcher = GetCodePtr();
|
dispatcher = GetCodePtr();
|
||||||
// The result of slice decrementation should be in flags if somebody jumped here
|
// The result of slice decrementation should be in flags if somebody jumped here
|
||||||
|
|
|
@ -229,6 +229,10 @@ void Jit64::bclrx(UGeckoInstruction inst)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
MOV(32, R(RSCRATCH), PPCSTATE_LR);
|
MOV(32, R(RSCRATCH), PPCSTATE_LR);
|
||||||
|
// We don't have to do this because WriteBLRExit handles it for us. Specifically, since we only ever push
|
||||||
|
// divisible-by-four instruction addresses onto the stack, if the return address matches, we're already
|
||||||
|
// good. If it doesn't match, the mispredicted-BLR code handles the fixup.
|
||||||
|
if (!m_enable_blr_optimization)
|
||||||
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
|
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
|
||||||
if (inst.LK)
|
if (inst.LK)
|
||||||
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
|
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
|
||||||
|
|
|
@ -276,6 +276,18 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
|
||||||
int a = inst.FA;
|
int a = inst.FA;
|
||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
int crf = inst.CRFD;
|
int crf = inst.CRFD;
|
||||||
|
int output[4] = { CR_SO, CR_EQ, CR_GT, CR_LT };
|
||||||
|
|
||||||
|
// Merge neighboring fcmp and cror (the primary use of cror).
|
||||||
|
UGeckoInstruction next = js.next_inst;
|
||||||
|
if (next.OPCD == 19 && next.SUBOP10 == 449 && (next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
|
||||||
|
{
|
||||||
|
js.skipnext = true;
|
||||||
|
js.downcountAmount++;
|
||||||
|
int dst = 3 - (next.CRBD & 3);
|
||||||
|
output[3 - (next.CRBA & 3)] |= 1 << dst;
|
||||||
|
output[3 - (next.CRBB & 3)] |= 1 << dst;
|
||||||
|
}
|
||||||
|
|
||||||
fpr.Lock(a, b);
|
fpr.Lock(a, b);
|
||||||
fpr.BindToRegister(b, true, false);
|
fpr.BindToRegister(b, true, false);
|
||||||
|
@ -315,14 +327,14 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
|
||||||
pGreater = J_CC(CC_B);
|
pGreater = J_CC(CC_B);
|
||||||
}
|
}
|
||||||
|
|
||||||
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_EQ)));
|
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_EQ_BIT])));
|
||||||
if (fprf)
|
if (fprf)
|
||||||
OR(32, PPCSTATE(fpscr), Imm32(CR_EQ << FPRF_SHIFT));
|
OR(32, PPCSTATE(fpscr), Imm32(CR_EQ << FPRF_SHIFT));
|
||||||
|
|
||||||
continue1 = J();
|
continue1 = J();
|
||||||
|
|
||||||
SetJumpTarget(pNaN);
|
SetJumpTarget(pNaN);
|
||||||
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_SO)));
|
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_SO_BIT])));
|
||||||
if (fprf)
|
if (fprf)
|
||||||
OR(32, PPCSTATE(fpscr), Imm32(CR_SO << FPRF_SHIFT));
|
OR(32, PPCSTATE(fpscr), Imm32(CR_SO << FPRF_SHIFT));
|
||||||
|
|
||||||
|
@ -331,13 +343,13 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
|
||||||
continue2 = J();
|
continue2 = J();
|
||||||
|
|
||||||
SetJumpTarget(pGreater);
|
SetJumpTarget(pGreater);
|
||||||
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_GT)));
|
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_GT_BIT])));
|
||||||
if (fprf)
|
if (fprf)
|
||||||
OR(32, PPCSTATE(fpscr), Imm32(CR_GT << FPRF_SHIFT));
|
OR(32, PPCSTATE(fpscr), Imm32(CR_GT << FPRF_SHIFT));
|
||||||
continue3 = J();
|
continue3 = J();
|
||||||
|
|
||||||
SetJumpTarget(pLesser);
|
SetJumpTarget(pLesser);
|
||||||
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_LT)));
|
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_LT_BIT])));
|
||||||
if (fprf)
|
if (fprf)
|
||||||
OR(32, PPCSTATE(fpscr), Imm32(CR_LT << FPRF_SHIFT));
|
OR(32, PPCSTATE(fpscr), Imm32(CR_LT << FPRF_SHIFT));
|
||||||
}
|
}
|
||||||
|
|
|
@ -153,7 +153,18 @@ void Jit64::ComputeRC(const Gen::OpArg & arg, bool needs_test, bool needs_sext)
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (needs_test)
|
if (needs_test)
|
||||||
|
{
|
||||||
TEST(32, arg, arg);
|
TEST(32, arg, arg);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// If an operand to the cmp/rc op we're merging with the branch isn't used anymore, it'd be
|
||||||
|
// better to flush it here so that we don't have to flush it on both sides of the branch.
|
||||||
|
// We don't want to do this if a test is needed though, because it would interrupt macro-op
|
||||||
|
// fusion.
|
||||||
|
for (int j : js.op->gprInUse)
|
||||||
|
gpr.StoreFromRegister(j);
|
||||||
|
}
|
||||||
DoMergedBranchCondition();
|
DoMergedBranchCondition();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -355,6 +366,7 @@ void Jit64::DoMergedBranch()
|
||||||
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx
|
else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx
|
||||||
{
|
{
|
||||||
MOV(32, R(RSCRATCH), M(&LR));
|
MOV(32, R(RSCRATCH), M(&LR));
|
||||||
|
if (!m_enable_blr_optimization)
|
||||||
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
|
AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
|
||||||
if (js.next_inst.LK)
|
if (js.next_inst.LK)
|
||||||
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
|
MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
|
||||||
|
@ -544,7 +556,16 @@ void Jit64::cmpXX(UGeckoInstruction inst)
|
||||||
MOV(64, PPCSTATE(cr_val[crf]), R(input));
|
MOV(64, PPCSTATE(cr_val[crf]), R(input));
|
||||||
// Place the comparison next to the branch for macro-op fusion
|
// Place the comparison next to the branch for macro-op fusion
|
||||||
if (merge_branch)
|
if (merge_branch)
|
||||||
TEST(64, R(input), R(input));
|
{
|
||||||
|
// We only need to do a 32-bit compare, since the flags set will be the same as a sign-extended
|
||||||
|
// result.
|
||||||
|
// We should also test against gpr.R(a) if it's bound, since that's one less cycle of latency
|
||||||
|
// (the CPU doesn't have to wait for the movsxd to finish to resolve the branch).
|
||||||
|
if (gpr.R(a).IsSimpleReg())
|
||||||
|
TEST(32, gpr.R(a), gpr.R(a));
|
||||||
|
else
|
||||||
|
TEST(32, R(input), R(input));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -1007,20 +1028,30 @@ void Jit64::mulhwXx(UGeckoInstruction inst)
|
||||||
else
|
else
|
||||||
gpr.SetImmediate32(d, (u32)((gpr.R(a).offset * gpr.R(b).offset) >> 32));
|
gpr.SetImmediate32(d, (u32)((gpr.R(a).offset * gpr.R(b).offset) >> 32));
|
||||||
}
|
}
|
||||||
else
|
else if (sign)
|
||||||
{
|
{
|
||||||
gpr.Lock(a, b, d);
|
gpr.Lock(a, b, d);
|
||||||
// no register choice
|
// no register choice
|
||||||
gpr.FlushLockX(EDX, EAX);
|
gpr.FlushLockX(EDX, EAX);
|
||||||
gpr.BindToRegister(d, (d == a || d == b), true);
|
gpr.BindToRegister(d, d == a || d == b, true);
|
||||||
MOV(32, R(EAX), gpr.R(a));
|
MOV(32, R(EAX), gpr.R(a));
|
||||||
gpr.KillImmediate(b, true, false);
|
gpr.KillImmediate(b, true, false);
|
||||||
if (sign)
|
|
||||||
IMUL(32, gpr.R(b));
|
IMUL(32, gpr.R(b));
|
||||||
else
|
|
||||||
MUL(32, gpr.R(b));
|
|
||||||
MOV(32, gpr.R(d), R(EDX));
|
MOV(32, gpr.R(d), R(EDX));
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Not faster for signed because we'd need two movsx.
|
||||||
|
gpr.Lock(a, b, d);
|
||||||
|
// We need to bind everything to registers since the top 32 bits need to be zero.
|
||||||
|
int src = d == b ? a : b;
|
||||||
|
gpr.BindToRegister(d, d == a || d == b, true);
|
||||||
|
gpr.BindToRegister(src, true, false);
|
||||||
|
if (d != a && d != b)
|
||||||
|
MOV(32, gpr.R(d), gpr.R(a));
|
||||||
|
IMUL(64, gpr.RX(d), gpr.R(src));
|
||||||
|
SHR(64, gpr.R(d), Imm8(32));
|
||||||
|
}
|
||||||
if (inst.Rc)
|
if (inst.Rc)
|
||||||
ComputeRC(gpr.R(d));
|
ComputeRC(gpr.R(d));
|
||||||
gpr.UnlockAll();
|
gpr.UnlockAll();
|
||||||
|
|
|
@ -89,6 +89,42 @@ void Jit64::SetCRFieldBit(int field, int bit, Gen::X64Reg in)
|
||||||
MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2));
|
MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Jit64::ClearCRFieldBit(int field, int bit)
|
||||||
|
{
|
||||||
|
MOV(64, R(RSCRATCH2), PPCSTATE(cr_val[field]));
|
||||||
|
|
||||||
|
if (bit != CR_GT_BIT)
|
||||||
|
{
|
||||||
|
TEST(64, R(RSCRATCH2), R(RSCRATCH2));
|
||||||
|
FixupBranch dont_clear_gt = J_CC(CC_NZ);
|
||||||
|
BTS(64, R(RSCRATCH2), Imm8(63));
|
||||||
|
SetJumpTarget(dont_clear_gt);
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (bit)
|
||||||
|
{
|
||||||
|
case CR_SO_BIT: // set bit 61 to input
|
||||||
|
BTR(64, R(RSCRATCH2), Imm8(61));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CR_EQ_BIT: // clear low 32 bits, set bit 0 to !input
|
||||||
|
SHR(64, R(RSCRATCH2), Imm8(32));
|
||||||
|
SHL(64, R(RSCRATCH2), Imm8(32));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CR_GT_BIT: // set bit 63 to !input
|
||||||
|
BTR(64, R(RSCRATCH2), Imm8(63));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CR_LT_BIT: // set bit 62 to input
|
||||||
|
BTR(64, R(RSCRATCH2), Imm8(62));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
BTS(64, R(RSCRATCH2), Imm8(32));
|
||||||
|
MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2));
|
||||||
|
}
|
||||||
|
|
||||||
FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set)
|
FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set)
|
||||||
{
|
{
|
||||||
switch (bit)
|
switch (bit)
|
||||||
|
@ -472,6 +508,13 @@ void Jit64::crXXX(UGeckoInstruction inst)
|
||||||
JITDISABLE(bJITSystemRegistersOff);
|
JITDISABLE(bJITSystemRegistersOff);
|
||||||
_dbg_assert_msg_(DYNA_REC, inst.OPCD == 19, "Invalid crXXX");
|
_dbg_assert_msg_(DYNA_REC, inst.OPCD == 19, "Invalid crXXX");
|
||||||
|
|
||||||
|
// Special case: crclr
|
||||||
|
if (inst.CRBA == inst.CRBB && inst.CRBA == inst.CRBD && inst.SUBOP10 == 193)
|
||||||
|
{
|
||||||
|
ClearCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO(delroth): Potential optimizations could be applied here. For
|
// TODO(delroth): Potential optimizations could be applied here. For
|
||||||
// instance, if the two CR bits being loaded are the same, two loads are
|
// instance, if the two CR bits being loaded are the same, two loads are
|
||||||
// not required.
|
// not required.
|
||||||
|
|
|
@ -406,6 +406,11 @@ static bool isCarryOp(const CodeOp& a)
|
||||||
return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
|
return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool isCror(const CodeOp& a)
|
||||||
|
{
|
||||||
|
return a.inst.OPCD == 19 && a.inst.SUBOP10 == 449;
|
||||||
|
}
|
||||||
|
|
||||||
void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
|
void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
|
||||||
{
|
{
|
||||||
// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
|
// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
|
||||||
|
@ -426,7 +431,7 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r
|
||||||
CodeOp &b = code[i + increment];
|
CodeOp &b = code[i + increment];
|
||||||
// Reorder integer compares, rlwinm., and carry-affecting ops
|
// Reorder integer compares, rlwinm., carry-affecting ops, and cror
|
||||||
// (if we add more merged branch instructions, add them here!)
|
// (if we add more merged branch instructions, add them here!)
|
||||||
if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0)))
|
if ((type == REORDER_CROR && isCror(a)) || (type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0)))
|
||||||
{
|
{
|
||||||
// once we're next to a carry instruction, don't move away!
|
// once we're next to a carry instruction, don't move away!
|
||||||
if (type == REORDER_CARRY && i != start)
|
if (type == REORDER_CARRY && i != start)
|
||||||
|
@ -454,6 +459,10 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r
|
||||||
|
|
||||||
void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
|
void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
|
||||||
{
|
{
|
||||||
|
// Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more
|
||||||
|
// picky about this, but cror seems to almost solely be used for this purpose in real code.
|
||||||
|
// Additionally, the other boolean ops seem to almost never be used.
|
||||||
|
ReorderInstructionsCore(instructions, code, true, REORDER_CROR);
|
||||||
// For carry, bubble instructions *towards* each other; one direction often isn't enough
|
// For carry, bubble instructions *towards* each other; one direction often isn't enough
|
||||||
// to get pairs like addc/adde next to each other.
|
// to get pairs like addc/adde next to each other.
|
||||||
if (HasOption(OPTION_CARRY_MERGE))
|
if (HasOption(OPTION_CARRY_MERGE))
|
||||||
|
|
|
@ -156,7 +156,8 @@ private:
|
||||||
enum ReorderType
|
enum ReorderType
|
||||||
{
|
{
|
||||||
REORDER_CARRY,
|
REORDER_CARRY,
|
||||||
REORDER_CMP
|
REORDER_CMP,
|
||||||
|
REORDER_CROR
|
||||||
};
|
};
|
||||||
|
|
||||||
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
|
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
|
||||||
|
|
Loading…
Reference in New Issue