PPCAnalyst: refactor, add carry op reordering and non-cmp reordering

Tries as hard as possible to push carry-using operations (like addc and adde) next to each other. Refactor the instruction reordering to be more flexible and allow multiple passes. 353 -> 192 x86 instructions on a carry-heavy code block in Pokemon Puzzle. 12% faster overall in Pokemon Puzzle; probably less in typical games (Virtual Console games seem to be carry-heavy for some reason; maybe a different compiler?)
2014-09-07 08:30:11 -07:00 · 2014-09-07 08:30:11 -07:00 · 54129a8ca5
parent 45d84605a9
commit 54129a8ca5
5 changed files with 93 additions and 39 deletions
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@ -280,26 +280,26 @@ static GekkoOPTemplate table31[] =
 static GekkoOPTemplate table31_2[] =
 {
 	{266,  Interpreter::addx,        {"addx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
-	{778,  Interpreter::addx,        {"addox",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
+	{778,  Interpreter::addx,        {"addox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
 	{10,   Interpreter::addcx,       {"addcx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{522,  Interpreter::addcx,       {"addcox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{522,  Interpreter::addcx,       {"addcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{138,  Interpreter::addex,       {"addex",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{650,  Interpreter::addex,       {"addeox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{650,  Interpreter::addex,       {"addeox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{234,  Interpreter::addmex,      {"addmex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{202,  Interpreter::addzex,      {"addzex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{491,  Interpreter::divwx,       {"divwx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
-	{1003, Interpreter::divwx,       {"divwox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
+	{1003, Interpreter::divwx,       {"divwox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
 	{459,  Interpreter::divwux,      {"divwux",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
-	{971,  Interpreter::divwux,      {"divwuox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
+	{971,  Interpreter::divwux,      {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
 	{75,   Interpreter::mulhwx,      {"mulhwx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
 	{11,   Interpreter::mulhwux,     {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
 	{235,  Interpreter::mullwx,      {"mullwx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
-	{747,  Interpreter::mullwx,      {"mullwox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
+	{747,  Interpreter::mullwx,      {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
 	{104,  Interpreter::negx,        {"negx",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
 	{40,   Interpreter::subfx,       {"subfx",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
-	{552,  Interpreter::subfx,       {"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
+	{552,  Interpreter::subfx,       {"subox",   OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{8,    Interpreter::subfcx,      {"subfcx",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
-	{520,  Interpreter::subfcx,      {"subfcox",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
+	{520,  Interpreter::subfcx,      {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
 	{136,  Interpreter::subfex,      {"subfex",  OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{232,  Interpreter::subfmex,     {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
 	{200,  Interpreter::subfzex,     {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@ -48,7 +48,7 @@ static GekkoOPTemplate primarytable[] =
 	{10, &Jit64::cmpXX},                 //"cmpli",    OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
 	{11, &Jit64::cmpXX},                 //"cmpi",     OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
 	{12, &Jit64::reg_imm},               //"addic",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
-	{13, &Jit64::reg_imm},               //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}},
+	{13, &Jit64::reg_imm},               //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0}},
 	{14, &Jit64::reg_imm},               //"addi",     OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
 	{15, &Jit64::reg_imm},               //"addis",    OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},

--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@ -213,14 +213,17 @@ static void AnalyzeFunction2(Symbol *func)
 	func->flags = flags;
 }

-// IMPORTANT - CURRENTLY ASSUMES THAT A IS A COMPARE
 static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
 {
+	const GekkoOPInfo *a_info = a.opinfo;
 	const GekkoOPInfo *b_info = b.opinfo;
+	int a_flags = a_info->flags;
 	int b_flags = b_info->flags;
-	if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL))
+	if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE))
 		return false;
-	if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.hex & 1))
+	if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc))
+		return false;
+	if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA)))
 		return false;

 	switch (b.inst.OPCD)
@ -250,20 +253,16 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
 	{
 		int regInA = a.regsIn[j];
 		int regInB = b.regsIn[j];
-		if (regInA >= 0 &&
-			(b.regsOut[0] == regInA ||
-			 b.regsOut[1] == regInA))
-		{
-			// reg collision! don't swap
+		// register collision: b outputs to one of a's inputs
+		if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA))
 			return false;
-		}
-		if (regInB >= 0 &&
-			(a.regsOut[0] == regInB ||
-			 a.regsOut[1] == regInB))
-		{
-			// reg collision! don't swap
+		// register collision: a outputs to one of b's inputs
+		if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB))
 			return false;
-		}
+		// register collision: b outputs to one of a's outputs (overwriting it)
+		for (int k = 0; k < 2; k++)
+			if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1]))
+				return false;
 	}

 	return true;
@ -403,29 +402,76 @@ void FindFunctions(u32 startAddr, u32 endAddr, PPCSymbolDB *func_db)
 		leafSize, niceSize, unniceSize);
 }

-void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
+static bool isCmp(const CodeOp& a)
 {
-	// Instruction Reordering Pass
-	// Bubble down compares towards branches, so that they can be merged.
-	// -2: -1 for the pair, -1 for not swapping with the final instruction which is probably the branch.
-	for (u32 i = 0; i < (instructions - 2); ++i)
+	return (a.inst.OPCD == 10 || a.inst.OPCD == 11) || (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32));
+}
+
+static bool isRlwinm_rc(const CodeOp& a)
+{
+	return a.inst.OPCD == 21 && a.inst.Rc;
+}
+
+static bool isCarryOp(const CodeOp& a)
+{
+	return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
+}
+
+void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
+{
+	// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
+	// multiple passes.
+	while (true)
 	{
-		CodeOp &a = code[i];
-		CodeOp &b = code[i + 1];
-		// All integer compares can be reordered.
-		if ((a.inst.OPCD == 10 || a.inst.OPCD == 11) ||
-			(a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32)))
+		// Instruction Reordering Pass
+		// Carry pass: bubble carry-using instructions as close to each other as possible, so we can avoid
+		// storing the carry flag.
+		// Compare pass: bubble compare instructions next to branches, so they can be merged.
+		bool swapped = false;
+		int increment = reverse ? -1 : 1;
+		int start = reverse ? instructions - 1 : 0;
+		int end = reverse ? 0 : instructions - 1;
+		for (int i = start; i != end; i += increment)
 		{
-			// Got a compare instruction.
-			if (CanSwapAdjacentOps(a, b))
+			CodeOp &a = code[i];
+			CodeOp &b = code[i + increment];
+			// Reorder integer compares, rlwinm., and carry-affecting ops
+			// (if we add more merged branch instructions, add them here!)
+			if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || isRlwinm_rc(a))))
 			{
-				// Alright, let's bubble it down!
-				std::swap(a, b);
+				// once we're next to a carry instruction, don't move away!
+				if (type == REORDER_CARRY && i != start)
+				{
+					// if we read the CA flag, and the previous instruction sets it, don't move away.
+					if (!reverse && (a.opinfo->flags & FL_READ_CA) && (code[i - increment].opinfo->flags & FL_SET_CA))
+						continue;
+					// if we set the CA flag, and the next instruction reads it, don't move away.
+					if (reverse && (a.opinfo->flags & FL_SET_CA) && (code[i - increment].opinfo->flags & FL_READ_CA))
+						continue;
+				}
+
+				if (CanSwapAdjacentOps(a, b))
+				{
+					// Alright, let's bubble it!
+					std::swap(a, b);
+					swapped = true;
+				}
 			}
 		}
+		if (!swapped)
+			return;
 	}
 }

+void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
+{
+	// For carry, bubble instructions *towards* each other; one direction often isn't enough
+	// to get pairs like addc/adde next to each other.
+	ReorderInstructionsCore(instructions, code, true, REORDER_CARRY);
+	ReorderInstructionsCore(instructions, code, false, REORDER_CARRY);
+	ReorderInstructionsCore(instructions, code, false, REORDER_CMP);
+}
+
 void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index)
 {
 	code->wantsCR0 = false;
@ -463,7 +509,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	// We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag!
 	// If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't
 	// leave it in flags.
-	code->wantsCAInFlags = code->wantsCA && code->outputCA && code->inst.SUBOP10 != 512;
+	code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER;

 	// mfspr/mtspr can affect/use XER, so be super careful here
 	// we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@ -144,6 +144,13 @@ class PPCAnalyzer
 {
 private:

+	enum ReorderType
+	{
+		REORDER_CARRY,
+		REORDER_CMP
+	};
+
+	void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
 	void ReorderInstructions(u32 instructions, CodeOp *code);
 	void SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index);

--- a/Source/Core/Core/PowerPC/PPCTables.h
+++ b/Source/Core/Core/PowerPC/PPCTables.h
@ -38,6 +38,7 @@ enum
 	FL_LOADSTORE       = (1<<19),
 	FL_SET_FPRF        = (1<<20),
 	FL_READ_FPRF       = (1<<21),
+	FL_SET_OE          = (1<<22),
 };

 enum