PPCAnalyst: refactor, add carry op reordering and non-cmp reordering
Try as hard as possible to move carry-using operations (such as addc and adde) next to each other, so that the carry flag does not have to be stored between them. Refactor the instruction reordering to be more flexible and to allow multiple passes. This reduces a carry-heavy code block in Pokemon Puzzle from 353 to 192 x86 instructions and makes Pokemon Puzzle 12% faster overall; the gain is probably smaller in typical games (Virtual Console games seem to be carry-heavy for some reason; maybe they were built with a different compiler?).
This commit is contained in:
parent
45d84605a9
commit
54129a8ca5
|
@ -284,22 +284,22 @@ static GekkoOPTemplate table31_2[] =
|
|||
{10, Interpreter::addcx, {"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
||||
{234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{491, Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
||||
{1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
||||
{1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
|
||||
{459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
||||
{971, Interpreter::divwux, {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
||||
{971, Interpreter::divwux, {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
|
||||
{75, Interpreter::mulhwx, {"mulhwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
||||
{11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
||||
{235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
||||
{747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
||||
{747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
|
||||
{104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
||||
{8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
||||
{136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
|
|
|
@ -48,7 +48,7 @@ static GekkoOPTemplate primarytable[] =
|
|||
{10, &Jit64::cmpXX}, //"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
|
||||
{11, &Jit64::cmpXX}, //"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
|
||||
{12, &Jit64::reg_imm}, //"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
|
||||
{13, &Jit64::reg_imm}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}},
|
||||
{13, &Jit64::reg_imm}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0}},
|
||||
{14, &Jit64::reg_imm}, //"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
|
||||
{15, &Jit64::reg_imm}, //"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
|
||||
|
||||
|
|
|
@ -213,14 +213,17 @@ static void AnalyzeFunction2(Symbol *func)
|
|||
func->flags = flags;
|
||||
}
|
||||
|
||||
// IMPORTANT - CURRENTLY ASSUMES THAT A IS A COMPARE
|
||||
static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
|
||||
{
|
||||
const GekkoOPInfo *a_info = a.opinfo;
|
||||
const GekkoOPInfo *b_info = b.opinfo;
|
||||
int a_flags = a_info->flags;
|
||||
int b_flags = b_info->flags;
|
||||
if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL))
|
||||
if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE))
|
||||
return false;
|
||||
if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.hex & 1))
|
||||
if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc))
|
||||
return false;
|
||||
if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA)))
|
||||
return false;
|
||||
|
||||
switch (b.inst.OPCD)
|
||||
|
@ -250,20 +253,16 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
|
|||
{
|
||||
int regInA = a.regsIn[j];
|
||||
int regInB = b.regsIn[j];
|
||||
if (regInA >= 0 &&
|
||||
(b.regsOut[0] == regInA ||
|
||||
b.regsOut[1] == regInA))
|
||||
{
|
||||
// reg collision! don't swap
|
||||
// register collision: b outputs to one of a's inputs
|
||||
if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA))
|
||||
return false;
|
||||
}
|
||||
if (regInB >= 0 &&
|
||||
(a.regsOut[0] == regInB ||
|
||||
a.regsOut[1] == regInB))
|
||||
{
|
||||
// reg collision! don't swap
|
||||
// register collision: a outputs to one of b's inputs
|
||||
if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB))
|
||||
return false;
|
||||
// register collision: b outputs to one of a's outputs (overwriting it)
|
||||
for (int k = 0; k < 2; k++)
|
||||
if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1]))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -403,27 +402,74 @@ void FindFunctions(u32 startAddr, u32 endAddr, PPCSymbolDB *func_db)
|
|||
leafSize, niceSize, unniceSize);
|
||||
}
|
||||
|
||||
void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
|
||||
static bool isCmp(const CodeOp& a)
|
||||
{
|
||||
return (a.inst.OPCD == 10 || a.inst.OPCD == 11) || (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32));
|
||||
}
|
||||
|
||||
static bool isRlwinm_rc(const CodeOp& a)
|
||||
{
|
||||
return a.inst.OPCD == 21 && a.inst.Rc;
|
||||
}
|
||||
|
||||
static bool isCarryOp(const CodeOp& a)
|
||||
{
|
||||
return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
|
||||
}
|
||||
|
||||
// Repeatedly bubbles matching instructions through a block until a full sweep
// performs no swap.
//
// instructions: number of entries in `code`.
// code:         the block's ops; reordered in place.
// reverse:      false sweeps from index 0 upward, moving a matching op towards the
//               end of the block; true sweeps from the last index downward, moving
//               it towards the start.
// type:         REORDER_CARRY moves ops accepted by isCarryOp(); REORDER_CMP moves
//               ops accepted by isCmp() or isRlwinm_rc().
void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
{
	// A swap needs two ops. Without this guard, `instructions - 1` wraps around for an
	// empty block and the sweep would index outside of `code`.
	if (instructions < 2)
		return;

	// The sweep bounds do not change between passes, so compute them once.
	const int increment = reverse ? -1 : 1;
	const int last = static_cast<int>(instructions) - 1;
	const int start = reverse ? last : 0;
	const int end = reverse ? 0 : last;

	// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
	// multiple passes.
	while (true)
	{
		// Instruction Reordering Pass
		// Carry pass: bubble carry-using instructions as close to each other as possible, so we can avoid
		// storing the carry flag.
		// Compare pass: bubble compare instructions next to branches, so they can be merged.
		bool swapped = false;
		for (int i = start; i != end; i += increment)
		{
			CodeOp &a = code[i];
			CodeOp &b = code[i + increment];
			// Reorder integer compares, rlwinm., and carry-affecting ops
			// (if we add more merged branch instructions, add them here!)
			if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || isRlwinm_rc(a))))
			{
				// once we're next to a carry instruction, don't move away!
				// (i != start keeps code[i - increment] inside the array.)
				if (type == REORDER_CARRY && i != start)
				{
					// if we read the CA flag, and the previous instruction sets it, don't move away.
					if (!reverse && (a.opinfo->flags & FL_READ_CA) && (code[i - increment].opinfo->flags & FL_SET_CA))
						continue;
					// if we set the CA flag, and the next instruction reads it, don't move away.
					if (reverse && (a.opinfo->flags & FL_SET_CA) && (code[i - increment].opinfo->flags & FL_READ_CA))
						continue;
				}

				if (CanSwapAdjacentOps(a, b))
				{
					// Alright, let's bubble it!
					std::swap(a, b);
					swapped = true;
				}
			}
		}
		if (!swapped)
			return;
	}
}
|
||||
|
||||
// Runs all reordering passes over a block's ops in a fixed order: two carry
// passes (one per direction) followed by one compare pass.
void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
{
	// Values for the `reverse` argument of ReorderInstructionsCore.
	const bool towardsStart = true;
	const bool towardsEnd = false;

	// Carry ops are bubbled *towards* each other: a single direction often isn't
	// enough to get pairs like addc/adde next to each other.
	ReorderInstructionsCore(instructions, code, towardsStart, REORDER_CARRY);
	ReorderInstructionsCore(instructions, code, towardsEnd, REORDER_CARRY);

	// The compare pass runs last, in the forward direction only.
	ReorderInstructionsCore(instructions, code, towardsEnd, REORDER_CMP);
}
|
||||
|
||||
void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index)
|
||||
|
@ -463,7 +509,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
|
|||
// We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag!
|
||||
// If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't
|
||||
// leave it in flags.
|
||||
code->wantsCAInFlags = code->wantsCA && code->outputCA && code->inst.SUBOP10 != 512;
|
||||
code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER;
|
||||
|
||||
// mfspr/mtspr can affect/use XER, so be super careful here
|
||||
// we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag
|
||||
|
|
|
@ -144,6 +144,13 @@ class PPCAnalyzer
|
|||
{
|
||||
private:
|
||||
|
||||
// Selects which kind of instruction a ReorderInstructionsCore pass moves.
enum ReorderType
|
||||
{
|
||||
// Move ops for which isCarryOp() is true.
REORDER_CARRY,
|
||||
// Move ops for which isCmp() or isRlwinm_rc() is true.
REORDER_CMP
|
||||
};
|
||||
|
||||
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
|
||||
void ReorderInstructions(u32 instructions, CodeOp *code);
|
||||
void SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index);
|
||||
|
||||
|
|
|
@ -38,6 +38,7 @@ enum
|
|||
FL_LOADSTORE = (1<<19),
|
||||
FL_SET_FPRF = (1<<20),
|
||||
FL_READ_FPRF = (1<<21),
|
||||
FL_SET_OE = (1<<22),
|
||||
};
|
||||
|
||||
enum
|
||||
|
|
Loading…
Reference in New Issue