PPCAnalyst: refactor, add carry op reordering and non-cmp reordering
Tries as hard as possible to push carry-using operations (like addc and adde) next to each other. Refactors the instruction reordering to be more flexible and to allow multiple passes. Reduces a carry-heavy code block in Pokemon Puzzle from 353 to 192 x86 instructions. 12% faster overall in Pokemon Puzzle; the gain is probably smaller in typical games (Virtual Console games seem to be carry-heavy for some reason; maybe they were built with a different compiler?).
This commit is contained in:
parent
45d84605a9
commit
54129a8ca5
|
@ -280,26 +280,26 @@ static GekkoOPTemplate table31[] =
|
||||||
static GekkoOPTemplate table31_2[] =
|
static GekkoOPTemplate table31_2[] =
|
||||||
{
|
{
|
||||||
{266, Interpreter::addx, {"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
{266, Interpreter::addx, {"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{778, Interpreter::addx, {"addox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
{778, Interpreter::addx, {"addox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{10, Interpreter::addcx, {"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{10, Interpreter::addcx, {"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
||||||
{234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{491, Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
{491, Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
||||||
{1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
{1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
|
||||||
{459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
{459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
||||||
{971, Interpreter::divwux, {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}},
|
{971, Interpreter::divwux, {"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}},
|
||||||
{75, Interpreter::mulhwx, {"mulhwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
{75, Interpreter::mulhwx, {"mulhwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
||||||
{11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
{11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
||||||
{235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
{235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
||||||
{747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}},
|
{747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}},
|
||||||
{104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
{104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
{40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}},
|
{552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
||||||
{8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}},
|
||||||
{136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
{200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
{200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||||
|
|
|
@ -48,7 +48,7 @@ static GekkoOPTemplate primarytable[] =
|
||||||
{10, &Jit64::cmpXX}, //"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
|
{10, &Jit64::cmpXX}, //"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
|
||||||
{11, &Jit64::cmpXX}, //"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
|
{11, &Jit64::cmpXX}, //"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
|
||||||
{12, &Jit64::reg_imm}, //"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
|
{12, &Jit64::reg_imm}, //"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
|
||||||
{13, &Jit64::reg_imm}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}},
|
{13, &Jit64::reg_imm}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0}},
|
||||||
{14, &Jit64::reg_imm}, //"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
|
{14, &Jit64::reg_imm}, //"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
|
||||||
{15, &Jit64::reg_imm}, //"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
|
{15, &Jit64::reg_imm}, //"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
|
||||||
|
|
||||||
|
|
|
@ -213,14 +213,17 @@ static void AnalyzeFunction2(Symbol *func)
|
||||||
func->flags = flags;
|
func->flags = flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
// IMPORTANT - CURRENTLY ASSUMES THAT A IS A COMPARE
|
|
||||||
static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
|
static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
|
||||||
{
|
{
|
||||||
|
const GekkoOPInfo *a_info = a.opinfo;
|
||||||
const GekkoOPInfo *b_info = b.opinfo;
|
const GekkoOPInfo *b_info = b.opinfo;
|
||||||
|
int a_flags = a_info->flags;
|
||||||
int b_flags = b_info->flags;
|
int b_flags = b_info->flags;
|
||||||
if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL))
|
if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE))
|
||||||
return false;
|
return false;
|
||||||
if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.hex & 1))
|
if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc))
|
||||||
|
return false;
|
||||||
|
if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA)))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
switch (b.inst.OPCD)
|
switch (b.inst.OPCD)
|
||||||
|
@ -250,20 +253,16 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
|
||||||
{
|
{
|
||||||
int regInA = a.regsIn[j];
|
int regInA = a.regsIn[j];
|
||||||
int regInB = b.regsIn[j];
|
int regInB = b.regsIn[j];
|
||||||
if (regInA >= 0 &&
|
// register collision: b outputs to one of a's inputs
|
||||||
(b.regsOut[0] == regInA ||
|
if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA))
|
||||||
b.regsOut[1] == regInA))
|
|
||||||
{
|
|
||||||
// reg collision! don't swap
|
|
||||||
return false;
|
return false;
|
||||||
}
|
// register collision: a outputs to one of b's inputs
|
||||||
if (regInB >= 0 &&
|
if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB))
|
||||||
(a.regsOut[0] == regInB ||
|
|
||||||
a.regsOut[1] == regInB))
|
|
||||||
{
|
|
||||||
// reg collision! don't swap
|
|
||||||
return false;
|
return false;
|
||||||
}
|
// register collision: b outputs to one of a's outputs (overwriting it)
|
||||||
|
for (int k = 0; k < 2; k++)
|
||||||
|
if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1]))
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -403,29 +402,76 @@ void FindFunctions(u32 startAddr, u32 endAddr, PPCSymbolDB *func_db)
|
||||||
leafSize, niceSize, unniceSize);
|
leafSize, niceSize, unniceSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
|
static bool isCmp(const CodeOp& a)
|
||||||
{
|
{
|
||||||
// Instruction Reordering Pass
|
return (a.inst.OPCD == 10 || a.inst.OPCD == 11) || (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32));
|
||||||
// Bubble down compares towards branches, so that they can be merged.
|
}
|
||||||
// -2: -1 for the pair, -1 for not swapping with the final instruction which is probably the branch.
|
|
||||||
for (u32 i = 0; i < (instructions - 2); ++i)
|
static bool isRlwinm_rc(const CodeOp& a)
|
||||||
|
{
|
||||||
|
return a.inst.OPCD == 21 && a.inst.Rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool isCarryOp(const CodeOp& a)
|
||||||
|
{
|
||||||
|
return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER;
|
||||||
|
}
|
||||||
|
|
||||||
|
void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type)
|
||||||
|
{
|
||||||
|
// Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do
|
||||||
|
// multiple passes.
|
||||||
|
while (true)
|
||||||
{
|
{
|
||||||
CodeOp &a = code[i];
|
// Instruction Reordering Pass
|
||||||
CodeOp &b = code[i + 1];
|
// Carry pass: bubble carry-using instructions as close to each other as possible, so we can avoid
|
||||||
// All integer compares can be reordered.
|
// storing the carry flag.
|
||||||
if ((a.inst.OPCD == 10 || a.inst.OPCD == 11) ||
|
// Compare pass: bubble compare instructions next to branches, so they can be merged.
|
||||||
(a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32)))
|
bool swapped = false;
|
||||||
|
int increment = reverse ? -1 : 1;
|
||||||
|
int start = reverse ? instructions - 1 : 0;
|
||||||
|
int end = reverse ? 0 : instructions - 1;
|
||||||
|
for (int i = start; i != end; i += increment)
|
||||||
{
|
{
|
||||||
// Got a compare instruction.
|
CodeOp &a = code[i];
|
||||||
if (CanSwapAdjacentOps(a, b))
|
CodeOp &b = code[i + increment];
|
||||||
|
// Reorder integer compares, rlwinm., and carry-affecting ops
|
||||||
|
// (if we add more merged branch instructions, add them here!)
|
||||||
|
if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || isRlwinm_rc(a))))
|
||||||
{
|
{
|
||||||
// Alright, let's bubble it down!
|
// once we're next to a carry instruction, don't move away!
|
||||||
std::swap(a, b);
|
if (type == REORDER_CARRY && i != start)
|
||||||
|
{
|
||||||
|
// if we read the CA flag, and the previous instruction sets it, don't move away.
|
||||||
|
if (!reverse && (a.opinfo->flags & FL_READ_CA) && (code[i - increment].opinfo->flags & FL_SET_CA))
|
||||||
|
continue;
|
||||||
|
// if we set the CA flag, and the next instruction reads it, don't move away.
|
||||||
|
if (reverse && (a.opinfo->flags & FL_SET_CA) && (code[i - increment].opinfo->flags & FL_READ_CA))
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (CanSwapAdjacentOps(a, b))
|
||||||
|
{
|
||||||
|
// Alright, let's bubble it!
|
||||||
|
std::swap(a, b);
|
||||||
|
swapped = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (!swapped)
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code)
|
||||||
|
{
|
||||||
|
// For carry, bubble instructions *towards* each other; one direction often isn't enough
|
||||||
|
// to get pairs like addc/adde next to each other.
|
||||||
|
ReorderInstructionsCore(instructions, code, true, REORDER_CARRY);
|
||||||
|
ReorderInstructionsCore(instructions, code, false, REORDER_CARRY);
|
||||||
|
ReorderInstructionsCore(instructions, code, false, REORDER_CMP);
|
||||||
|
}
|
||||||
|
|
||||||
void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index)
|
void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index)
|
||||||
{
|
{
|
||||||
code->wantsCR0 = false;
|
code->wantsCR0 = false;
|
||||||
|
@ -463,7 +509,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
|
||||||
// We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag!
|
// We're going to try to avoid storing carry in XER if we can avoid it -- keep it in the x86 carry flag!
|
||||||
// If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't
|
// If the instruction reads CA but doesn't write it, we still need to store CA in XER; we can't
|
||||||
// leave it in flags.
|
// leave it in flags.
|
||||||
code->wantsCAInFlags = code->wantsCA && code->outputCA && code->inst.SUBOP10 != 512;
|
code->wantsCAInFlags = code->wantsCA && code->outputCA && opinfo->type == OPTYPE_INTEGER;
|
||||||
|
|
||||||
// mfspr/mtspr can affect/use XER, so be super careful here
|
// mfspr/mtspr can affect/use XER, so be super careful here
|
||||||
// we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag
|
// we need to note specifically that mfspr needs CA in XER, not in the x86 carry flag
|
||||||
|
|
|
@ -144,6 +144,13 @@ class PPCAnalyzer
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
enum ReorderType
|
||||||
|
{
|
||||||
|
REORDER_CARRY,
|
||||||
|
REORDER_CMP
|
||||||
|
};
|
||||||
|
|
||||||
|
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
|
||||||
void ReorderInstructions(u32 instructions, CodeOp *code);
|
void ReorderInstructions(u32 instructions, CodeOp *code);
|
||||||
void SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index);
|
void SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInfo *opinfo, u32 index);
|
||||||
|
|
||||||
|
|
|
@ -38,6 +38,7 @@ enum
|
||||||
FL_LOADSTORE = (1<<19),
|
FL_LOADSTORE = (1<<19),
|
||||||
FL_SET_FPRF = (1<<20),
|
FL_SET_FPRF = (1<<20),
|
||||||
FL_READ_FPRF = (1<<21),
|
FL_READ_FPRF = (1<<21),
|
||||||
|
FL_SET_OE = (1<<22),
|
||||||
};
|
};
|
||||||
|
|
||||||
enum
|
enum
|
||||||
|
|
Loading…
Reference in New Issue