Merge pull request #1350 from FioraAeterna/integeropts

Various smallish JIT optimizations
2014-11-02 20:13:20 -06:00 · 2014-11-02 20:13:20 -06:00 · 204598a082
parent 347e8e8157 fb0960f0ee
commit 204598a082
10 changed files with 127 additions and 25 deletions
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@ -285,8 +285,8 @@ static GekkoOPTemplate table31_2[] =
 	{522,  Interpreter::addcx,       {"addcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{138,  Interpreter::addex,       {"addex",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{650,  Interpreter::addex,       {"addeox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
-	{234,  Interpreter::addmex,      {"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{234,  Interpreter::addmex,      {"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{202,  Interpreter::addzex,      {"addzex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{202,  Interpreter::addzex,      {"addzex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{491,  Interpreter::divwx,       {"divwx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
 	{1003, Interpreter::divwx,       {"divwox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
 	{459,  Interpreter::divwux,      {"divwux",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
@ -295,14 +295,14 @@ static GekkoOPTemplate table31_2[] =
 	{11,   Interpreter::mulhwux,     {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
 	{235,  Interpreter::mullwx,      {"mullwx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
 	{747,  Interpreter::mullwx,      {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
-	{104,  Interpreter::negx,        {"negx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
+	{104,  Interpreter::negx,        {"negx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_RC_BIT, 1, 0, 0, 0}},
 	{40,   Interpreter::subfx,       {"subfx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
 	{552,  Interpreter::subfx,       {"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{8,    Interpreter::subfcx,      {"subfcx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{520,  Interpreter::subfcx,      {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{136,  Interpreter::subfex,      {"subfex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{232,  Interpreter::subfmex,     {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{232,  Interpreter::subfmex,     {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{200,  Interpreter::subfzex,     {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{200,  Interpreter::subfzex,     {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 };
 static GekkoOPTemplate table59[] =
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@ -414,10 +414,10 @@ void Jit64::WriteBLRExit()
 	bool disturbed = Cleanup();
 	if (disturbed)
 		MOV(32, R(RSCRATCH), PPCSTATE(pc));
 	MOV(32, R(RSCRATCH2), Imm32(js.downcountAmount));
 	CMP(64, R(RSCRATCH), MDisp(RSP, 8));
 	MOV(32, R(RSCRATCH), Imm32(js.downcountAmount));
 	J_CC(CC_NE, asm_routines.dispatcherMispredictedBLR);
-	SUB(32, PPCSTATE(downcount), R(RSCRATCH));
+	SUB(32, PPCSTATE(downcount), R(RSCRATCH2));
 	RET();
 }
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@ -137,6 +137,7 @@ public:
 	void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
 	// Clobbers RDX.
 	void SetCRFieldBit(int field, int bit, Gen::X64Reg in);
 	void ClearCRFieldBit(int field, int bit);
 	// Generates a branch that will check if a given bit of a CR register part
 	// is set or not.
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@ -48,17 +48,18 @@ void Jit64AsmRoutineManager::Generate()
 		ABI_PopRegistersAndAdjustStack({}, 0);
 		FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time
 		dispatcherMispredictedBLR = GetCodePtr();
 		AND(32, PPCSTATE(pc), Imm32(0xFFFFFFFC));
 		#if 0 // debug mispredicts
 		MOV(32, R(ABI_PARAM1), MDisp(RSP, 8)); // guessed_pc
-		ABI_PushRegistersAndAdjustStack(1 << RSCRATCH, 0);
+		ABI_PushRegistersAndAdjustStack(1 << RSCRATCH2, 0);
 		CALL(reinterpret_cast<void *>(&ReportMispredict));
-		ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0);
+		ABI_PopRegistersAndAdjustStack(1 << RSCRATCH2, 0);
 		#endif
 		ResetStack();
-		SUB(32, PPCSTATE(downcount), R(RSCRATCH));
+		SUB(32, PPCSTATE(downcount), R(RSCRATCH2));
 		dispatcher = GetCodePtr();
 			// The result of slice decrementation should be in flags if somebody jumped here
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
@ -229,7 +229,11 @@ void Jit64::bclrx(UGeckoInstruction inst)
 #endif
 	MOV(32, R(RSCRATCH), PPCSTATE_LR);
-	AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
+	// We don't have to do this because WriteBLRExit handles it for us. Specifically, since we only ever push
 	// divisible-by-four instruction addresses onto the stack, if the return address matches, we're already
 	// good. If it doesn't match, the mispredicted-BLR code handles the fixup.
 	if (!m_enable_blr_optimization)
 		AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
 	if (inst.LK)
 		MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -276,6 +276,18 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
 	int a = inst.FA;
 	int b = inst.FB;
 	int crf = inst.CRFD;
 	int output[4] = { CR_SO, CR_EQ, CR_GT, CR_LT };
 	// Merge neighboring fcmp and cror (the primary use of cror).
 	UGeckoInstruction next = js.next_inst;
 	if (next.OPCD == 19 && next.SUBOP10 == 449 && (next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
 	{
 		js.skipnext = true;
 		js.downcountAmount++;
 		int dst = 3 - (next.CRBD & 3);
 		output[3 - (next.CRBA & 3)] |= 1 << dst;
 		output[3 - (next.CRBB & 3)] |= 1 << dst;
 	}
 	fpr.Lock(a, b);
 	fpr.BindToRegister(b, true, false);
@ -315,14 +327,14 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
 		pGreater = J_CC(CC_B);
 	}
-	MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_EQ)));
+	MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_EQ_BIT])));
 	if (fprf)
 		OR(32, PPCSTATE(fpscr), Imm32(CR_EQ << FPRF_SHIFT));
 	continue1 = J();
 	SetJumpTarget(pNaN);
-	MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_SO)));
+	MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_SO_BIT])));
 	if (fprf)
 		OR(32, PPCSTATE(fpscr), Imm32(CR_SO << FPRF_SHIFT));
@ -331,13 +343,13 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
 		continue2 = J();
 		SetJumpTarget(pGreater);
-		MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_GT)));
+		MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_GT_BIT])));
 		if (fprf)
 			OR(32, PPCSTATE(fpscr), Imm32(CR_GT << FPRF_SHIFT));
 		continue3 = J();
 		SetJumpTarget(pLesser);
-		MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_LT)));
+		MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_LT_BIT])));
 		if (fprf)
 			OR(32, PPCSTATE(fpscr), Imm32(CR_LT << FPRF_SHIFT));
 	}
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@ -153,7 +153,18 @@ void Jit64::ComputeRC(const Gen::OpArg & arg, bool needs_test, bool needs_sext)
 		else
 		{
 			if (needs_test)
 			{
 				TEST(32, arg, arg);
 			}
 			else
 			{
 				// If an operand to the cmp/rc op we're merging with the branch isn't used anymore, it'd be
 				// better to flush it here so that we don't have to flush it on both sides of the branch.
 				// We don't want to do this if a test is needed though, because it would interrupt macro-op
 				// fusion.
 				for (int j : js.op->gprInUse)
 					gpr.StoreFromRegister(j);
 			}
 			DoMergedBranchCondition();
 		}
 	}
@ -355,7 +366,8 @@ void Jit64::DoMergedBranch()
 	else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx
 	{
 		MOV(32, R(RSCRATCH), M(&LR));
-		AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
+		if (!m_enable_blr_optimization)
 			AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
 		if (js.next_inst.LK)
 			MOV(32, M(&LR), Imm32(js.next_compilerPC + 4));
 		WriteBLRExit();
@ -544,7 +556,16 @@ void Jit64::cmpXX(UGeckoInstruction inst)
 			MOV(64, PPCSTATE(cr_val[crf]), R(input));
 			// Place the comparison next to the branch for macro-op fusion
 			if (merge_branch)
-				TEST(64, R(input), R(input));
+			{
 				// We only need to do a 32-bit compare, since the flags set will be the same as a sign-extended
 				// result.
 				// We should also test against gpr.R(a) if it's bound, since that's one less cycle of latency
 				// (the CPU doesn't have to wait for the movsxd to finish to resolve the branch).
 				if (gpr.R(a).IsSimpleReg())
 					TEST(32, gpr.R(a), gpr.R(a));
 				else
 					TEST(32, R(input), R(input));
 			}
 		}
 		else
 		{
@ -1007,20 +1028,30 @@ void Jit64::mulhwXx(UGeckoInstruction inst)
 		else
 			gpr.SetImmediate32(d, (u32)((gpr.R(a).offset * gpr.R(b).offset) >> 32));
 	}
-	else
+	else if (sign)
 	{
 		gpr.Lock(a, b, d);
 		// no register choice
 		gpr.FlushLockX(EDX, EAX);
-		gpr.BindToRegister(d, (d == a || d == b), true);
+		gpr.BindToRegister(d, d == a || d == b, true);
 		MOV(32, R(EAX), gpr.R(a));
 		gpr.KillImmediate(b, true, false);
-		if (sign)
+		IMUL(32, gpr.R(b));
 			IMUL(32, gpr.R(b));
 		else
 			MUL(32, gpr.R(b));
 		MOV(32, gpr.R(d), R(EDX));
 	}
 	else
 	{
 		// Not faster for signed because we'd need two movsx.
 		gpr.Lock(a, b, d);
 		// We need to bind everything to registers since the top 32 bits need to be zero.
 		int src = d == b ? a : b;
 		gpr.BindToRegister(d, d == a || d == b, true);
 		gpr.BindToRegister(src, true, false);
 		if (d != a && d != b)
 			MOV(32, gpr.R(d), gpr.R(a));
 		IMUL(64, gpr.RX(d), gpr.R(src));
 		SHR(64, gpr.R(d), Imm8(32));
 	}
 	if (inst.Rc)
 		ComputeRC(gpr.R(d));
 	gpr.UnlockAll();
--- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp
@ -89,6 +89,42 @@ void Jit64::SetCRFieldBit(int field, int bit, Gen::X64Reg in)
 	MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2));
 }
 // Emits code that clears a single condition bit of CR field `field` in the
 // JIT's 64-bit cr_val representation.
 //   field: index into cr_val[].
 //   bit:   one of CR_SO_BIT, CR_EQ_BIT, CR_GT_BIT, CR_LT_BIT.
 // Uses RSCRATCH2 as a temporary: the field is loaded into it, modified and
 // written back.
 void Jit64::ClearCRFieldBit(int field, int bit)
 {
 	MOV(64, R(RSCRATCH2), PPCSTATE(cr_val[field]));
 	// When the stored value is entirely zero, set bit 63 before doing anything else.
 	// NOTE(review): presumably this keeps the unconditional "set bit 32" at the end
 	// from turning an all-zero value into one that reads back with GT set - confirm
 	// against the reader side (JumpIfCRFieldBit / GetCRFieldBit).
 	if (bit != CR_GT_BIT)
 	{
 		TEST(64, R(RSCRATCH2), R(RSCRATCH2));
 		FixupBranch dont_clear_gt = J_CC(CC_NZ);
 		BTS(64, R(RSCRATCH2), Imm8(63));
 		SetJumpTarget(dont_clear_gt);
 	}
 	switch (bit)
 	{
 	case CR_SO_BIT:  // clear bit 61
 		BTR(64, R(RSCRATCH2), Imm8(61));
 		break;
 	case CR_EQ_BIT:  // clear low 32 bits, then set bit 0 (bit 0 holds !EQ, and EQ is being cleared)
 		SHR(64, R(RSCRATCH2), Imm8(32));
 		SHL(64, R(RSCRATCH2), Imm8(32));
 		// Without this the low word would be left at zero, i.e. bit 0 == 0 instead of !0.
 		BTS(64, R(RSCRATCH2), Imm8(0));
 		break;
 	case CR_GT_BIT:  // set bit 63 (bit 63 holds !GT, and GT is being cleared)
 		BTS(64, R(RSCRATCH2), Imm8(63));
 		break;
 	case CR_LT_BIT:  // clear bit 62
 		BTR(64, R(RSCRATCH2), Imm8(62));
 		break;
 	}
 	// Bit 32 is always set before the value is stored back.
 	BTS(64, R(RSCRATCH2), Imm8(32));
 	MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2));
 }
 FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set)
 {
 	switch (bit)
@ -472,6 +508,13 @@ void Jit64::crXXX(UGeckoInstruction inst)
 	JITDISABLE(bJITSystemRegistersOff);
 	_dbg_assert_msg_(DYNA_REC, inst.OPCD == 19, "Invalid crXXX");
 	// Special case: crclr
 	if (inst.CRBA == inst.CRBB && inst.CRBA == inst.CRBD && inst.SUBOP10 == 193)
 	{
 		ClearCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3));
 		return;
 	}
 	// TODO(delroth): Potential optimizations could be applied here. For
 	// instance, if the two CR bits being loaded are the same, two loads are
 	// not required.
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@ -406,6 +406,11 @@ static bool isCarryOp(const CodeOp& a)
 	return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
 }
 // Returns true when the op's instruction word has primary opcode 19 and
 // extended opcode 449 (the encoding this file treats as cror).
 static bool isCror(const CodeOp& a)
 {
 	if (a.inst.OPCD != 19)
 		return false;
 	return a.inst.SUBOP10 == 449;
 }
 void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
 {
 	// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
@ -426,7 +431,7 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r
 			CodeOp &b = code[i + increment];
 			// Reorder integer compares, rlwinm., and carry-affecting ops
 			// (if we add more merged branch instructions, add them here!)
-			if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0)))
+			if ((type == REORDER_CROR && isCror(a)) || (type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0)))
 			{
 				// once we're next to a carry instruction, don't move away!
 				if (type == REORDER_CARRY && i != start)
@ -454,6 +459,10 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r
 void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
 {
 	// Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more
 	// picky about this, but cror seems to almost solely be used for this purpose in real code.
 	// Additionally, the other boolean ops seem to almost never be used.
 	ReorderInstructionsCore(instructions, code, true, REORDER_CROR);
 	// For carry, bubble instructions *towards* each other; one direction often isn't enough
 	// to get pairs like addc/adde next to each other.
 	if (HasOption(OPTION_CARRY_MERGE))
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@ -156,7 +156,8 @@ private:
 	enum ReorderType
 	{
 		REORDER_CARRY,
-		REORDER_CMP
+		REORDER_CMP,
 		REORDER_CROR
 	};
 	void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);