PPCAnalyst: refactor, add carry op reordering and non-cmp reordering

Tries as hard as possible to push carry-using operations (like addc and adde)
next to each other. Refactor the instruction reordering to be more flexible
and allow multiple passes.

353 -> 192 x86 instructions on a carry-heavy code block in Pokemon Puzzle.
12% faster overall in Pokemon Puzzle; probably less in typical games (Virtual
Console games seem to be carry-heavy for some reason; maybe a different
compiler?)
This commit is contained in:
Fiora 2014-09-07 08:30:11 -07:00
parent 45d84605a9
commit 54129a8ca5
5 changed files with 93 additions and 39 deletions

View File

@ -280,26 +280,26 @@ static GekkoOPTemplate table31[] =
static GekkoOPTemplate table31_2[] =
{
{266, Interpreter::addx, {"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
{778, Interpreter::addx, {"addox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
{778, Interpreter::addx, {"addox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
{10, Interpreter::addcx, {"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
{234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{491, Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
{1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
{1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
{459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
{971, Interpreter::divwux, {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
{971, Interpreter::divwux, {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
{75, Interpreter::mulhwx, {"mulhwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
{11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
{235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
{747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
{747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
{104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
{40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
{552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
{552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
{8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
{136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},

View File

@ -48,7 +48,7 @@ static GekkoOPTemplate primarytable[] =
{10, &Jit64::cmpXX}, //"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
{11, &Jit64::cmpXX}, //"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
{12, &Jit64::reg_imm}, //"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
{13, &Jit64::reg_imm}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}},
{13, &Jit64::reg_imm}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0}},
{14, &Jit64::reg_imm}, //"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
{15, &Jit64::reg_imm}, //"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},

View File

@ -213,14 +213,17 @@ static void AnalyzeFunction2(Symbol *func)
func->flags = flags;
}
// IMPORTANT - CURRENTLY ASSUMES THAT A IS A COMPARE
static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
{
const GekkoOPInfo *a_info = a.opinfo;
const GekkoOPInfo *b_info = b.opinfo;
int a_flags = a_info->flags;
int b_flags = b_info->flags;
if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL))
if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE))
return false;
if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.hex & 1))
if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc))
return false;
if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA)))
return false;
switch (b.inst.OPCD)
@ -250,20 +253,16 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
{
int regInA = a.regsIn[j];
int regInB = b.regsIn[j];
if (regInA >= 0 &&
(b.regsOut[0] == regInA ||
b.regsOut[1] == regInA))
{
// reg collision! don't swap
// register collision: b outputs to one of a's inputs
if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA))
return false;
}
if (regInB >= 0 &&
(a.regsOut[0] == regInB ||
a.regsOut[1] == regInB))
{
// reg collision! don't swap
// register collision: a outputs to one of b's inputs
if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB))
return false;
}
// register collision: b outputs to one of a's outputs (overwriting it)
for (int k = 0; k < 2; k++)
if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1]))
return false;
}
return true;
@ -403,29 +402,76 @@ void FindFunctions(u32 startAddr, u32 endAddr, PPCSymbolDB *func_db)
leafSize, niceSize, unniceSize);
}
void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
static bool isCmp(const CodeOp& a)
{
// Instruction Reordering Pass
// Bubble down compares towards branches, so that they can be merged.
// -2: -1 for the pair, -1 for not swapping with the final instruction which is probably the branch.
for (u32 i = 0; i < (instructions - 2); ++i)
return (a.inst.OPCD == 10 || a.inst.OPCD == 11) || (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32));
}
static bool isRlwinm_rc(const CodeOp& a)
{
return a.inst.OPCD == 21 && a.inst.Rc;
}
static bool isCarryOp(const CodeOp& a)
{
return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
}
void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
{
// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
// multiple passes.
while (true)
{
CodeOp &a = code[i];
CodeOp &b = code[i + 1];
// All integer compares can be reordered.
if ((a.inst.OPCD == 10 || a.inst.OPCD == 11) ||
(a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32)))
// Instruction Reordering Pass
// Carry pass: bubble carry-using instructions as close to each other as possible, so we can avoid
// storing the carry flag.
// Compare pass: bubble compare instructions next to branches, so they can be merged.
bool swapped = false;
int increment = reverse ? -1 : 1;
int start = reverse ? instructions - 1 : 0;
int end = reverse ? 0 : instructions - 1;
for (int i = start; i != end; i += increment)
{
// Got a compare instruction.
if (CanSwapAdjacentOps(a, b))
CodeOp &a = code[i];
CodeOp &b = code[i + increment];
// Reorder integer compares, rlwinm., and carry-affecting ops
// (if we add more merged branch instructions, add them here!)
if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || isRlwinm_rc(a))))
{
// Alright, let's bubble it down!
std::swap(a, b);
// once we're next to a carry instruction, don't move away!
if (type == REORDER_CARRY && i != start)
{
// if we read the CA flag, and the previous instruction sets it, don't move away.
if (!reverse && (a.opinfo->flags & FL_READ_CA) && (code[i - increment].opinfo->flags & FL_SET_CA))
continue;
// if we set the CA flag, and the next instruction reads it, don't move away.
if (reverse && (a.opinfo->flags & FL_SET_CA) && (code[i - increment].opinfo->flags & FL_READ_CA))
continue;
}
if (CanSwapAdjacentOps(a, b))
{
// Alright, let's bubble it!
std::swap(a, b);
swapped = true;
}
}
}
if (!swapped)
return;
}
}
void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
{
// For carry, bubble instructions *towards* each other; one direction often isn't enough
// to get pairs like addc/adde next to each other.
ReorderInstructionsCore(instructions, code, true, REORDER_CARRY);
ReorderInstructionsCore(instructions, code, false, REORDER_CARRY);
ReorderInstructionsCore(instructions, code, false, REORDER_CMP);
}
void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index)
{
code->wantsCR0 = false;
@ -463,7 +509,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
// We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag!
// If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't
// leave it in flags.
code->wantsCAInFlags = code->wantsCA && code->outputCA && code->inst.SUBOP10 != 512;
code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER;
// mfspr/mtspr can affect/use XER, so be super careful here
// we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag

View File

@ -144,6 +144,13 @@ class PPCAnalyzer
{
private:
enum ReorderType
{
REORDER_CARRY,
REORDER_CMP
};
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
void ReorderInstructions(u32 instructions, CodeOp *code);
void SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index);

View File

@ -38,6 +38,7 @@ enum
FL_LOADSTORE = (1<<19),
FL_SET_FPRF = (1<<20),
FL_READ_FPRF = (1<<21),
FL_SET_OE = (1<<22),
};
enum