A tiny bit more JIT WIP work.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1847 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
4acda0096b
commit
018cb993e3
|
@ -90,16 +90,15 @@ integer code are more aggressively combining blocks and dead condition
|
|||
register elimination, which should be very helpful for small blocks.
|
||||
|
||||
TODO (in no particular order):
|
||||
Floating-point JIT (both paired and unpaired)
|
||||
(very large win for FP code, no effect for integer code)
|
||||
Inter-block dead condition register elimination (Likely significant win
|
||||
combined with optimized conditions)
|
||||
Optimize conditions for conditional branches.
|
||||
General dead register elimination.
|
||||
Inter-block inlining.
|
||||
Track down issues with new JIT + dual-core mode (I think I'm going to
|
||||
need help with this one; I'm not very familiar with the
|
||||
dual-core code.)
|
||||
JIT for misc remaining FP instructions
|
||||
JIT for bcctrx
|
||||
Misc optimizations for FP instructions
|
||||
Inter-block dead register elimination; this seems likely to have large
|
||||
performance benefits, although I'm not completely sure.
|
||||
Inter-block inlining; also likely to have large performance benefits.
|
||||
The tricky parts are deciding which blocks to inline, and that the
|
||||
IR can't really deal with branches whose destination is in the
|
||||
middle of a generated block.
|
||||
Specialized slw/srw/sraw; I think there are some tricks that could
|
||||
have a non-trivial effect, and there are significantly shorter
|
||||
implementations for 64-bit involving abusing 64-bit shifts.
|
||||
|
@ -111,15 +110,19 @@ Scheduling to reduce register pressure: PowerPC compilers like to push
|
|||
instruction reordering.
|
||||
Common subexpression elimination
|
||||
Optimize load/store of sum using complex addressing (partially implemented)
|
||||
Implement idle-skipping
|
||||
Loop optimizations (loop-carried registers, LICM); not sure how much
|
||||
this will help on top of dead register elimination
|
||||
Fold loads (both register and memory) into arithmetic operations
|
||||
Loop optimizations (loop-carried registers, LICM)
|
||||
Fold register loads into arithmetic operations
|
||||
Code refactoring/cleanup
|
||||
Investigate performance of the JIT itself; this doesn't affect
|
||||
framerates significantly, but it does take a visible amount
|
||||
of time for a complicated piece of code like a video decoder
|
||||
to compile.
|
||||
Fix profiled loads/stores to work safely. On 32-bit, one solution is to
|
||||
use a spare segment register, and expand the backpatch solution
|
||||
to work in all the relevant situations. On 64-bit, the existing
|
||||
fast memory solution should basically work. An alternative
|
||||
would be to figure out a heuristic for what loads actually
|
||||
vary their "type", and special-case them.
|
||||
|
||||
*/
|
||||
|
||||
|
@ -464,6 +467,12 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) {
|
|||
if (branchValue == 2)
|
||||
return FoldBranchCond(EmitICmpEq(getOp1(getOp1(Op1)),
|
||||
getOp2(getOp1(Op1))), Op2);
|
||||
if (branchValue == 4)
|
||||
return FoldBranchCond(EmitICmpSgt(getOp1(getOp1(Op1)),
|
||||
getOp2(getOp1(Op1))), Op2);
|
||||
if (branchValue == 8)
|
||||
return FoldBranchCond(EmitICmpSlt(getOp1(getOp1(Op1)),
|
||||
getOp2(getOp1(Op1))), Op2);
|
||||
}
|
||||
if (getOpcode(*Op1) == Xor &&
|
||||
isImm(*getOp2(Op1))) {
|
||||
|
@ -475,10 +484,15 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) {
|
|||
unsigned innerBranchValue =
|
||||
GetImmValue(getOp2(XOp1));
|
||||
if (branchValue == innerBranchValue) {
|
||||
if (branchValue == 4) {
|
||||
if (branchValue == 2)
|
||||
return FoldBranchCond(EmitICmpNe(getOp1(getOp1(XOp1)),
|
||||
getOp2(getOp1(XOp1))), Op2);
|
||||
if (branchValue == 4)
|
||||
return FoldBranchCond(EmitICmpSle(getOp1(getOp1(XOp1)),
|
||||
getOp2(getOp1(XOp1))), Op2);
|
||||
}
|
||||
if (branchValue == 8)
|
||||
return FoldBranchCond(EmitICmpSge(getOp1(getOp1(XOp1)),
|
||||
getOp2(getOp1(XOp1))), Op2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -493,6 +507,9 @@ InstLoc IRBuilder::FoldICmp(unsigned Opcode, InstLoc Op1, InstLoc Op2) {
|
|||
case ICmpEq:
|
||||
result = GetImmValue(Op1) == GetImmValue(Op2);
|
||||
break;
|
||||
case ICmpNe:
|
||||
result = GetImmValue(Op1) != GetImmValue(Op2);
|
||||
break;
|
||||
case ICmpUgt:
|
||||
result = GetImmValue(Op1) > GetImmValue(Op2);
|
||||
break;
|
||||
|
@ -1285,9 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
regMarkUse(RI, I, getOp1(I), 1);
|
||||
break;
|
||||
case BranchCond: {
|
||||
unsigned CondOpcode = getOpcode(*getOp1(I));
|
||||
if ((CondOpcode == ICmpEq ||
|
||||
CondOpcode == ICmpSle) &&
|
||||
if (isICmp(*getOp1(I)) &&
|
||||
isImm(*getOp2(getOp1(I)))) {
|
||||
regMarkUse(RI, I, getOp1(getOp1(I)), 1);
|
||||
} else {
|
||||
|
@ -1904,20 +1919,24 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
case BlockEnd:
|
||||
break;
|
||||
case BranchCond: {
|
||||
if (getOpcode(*getOp1(I)) == ICmpEq &&
|
||||
if (isICmp(*getOp1(I)) &&
|
||||
isImm(*getOp2(getOp1(I)))) {
|
||||
Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))),
|
||||
Imm32(RI.Build->GetImmValue(getOp2(getOp1(I)))));
|
||||
FixupBranch cont = Jit->J_CC(CC_NZ);
|
||||
regWriteExit(RI, getOp2(I));
|
||||
Jit->SetJumpTarget(cont);
|
||||
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||
regClearInst(RI, getOp1(getOp1(I)));
|
||||
} else if (getOpcode(*getOp1(I)) == ICmpSle &&
|
||||
isImm(*getOp2(getOp1(I)))) {
|
||||
Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))),
|
||||
Imm32(RI.Build->GetImmValue(getOp2(getOp1(I)))));
|
||||
FixupBranch cont = Jit->J_CC(CC_G);
|
||||
CCFlags flag;
|
||||
switch (getOpcode(*getOp1(I))) {
|
||||
case ICmpEq: flag = CC_NE; break;
|
||||
case ICmpNe: flag = CC_E; break;
|
||||
case ICmpUgt: flag = CC_BE; break;
|
||||
case ICmpUlt: flag = CC_AE; break;
|
||||
case ICmpUge: flag = CC_L; break;
|
||||
case ICmpUle: flag = CC_A; break;
|
||||
case ICmpSgt: flag = CC_LE; break;
|
||||
case ICmpSlt: flag = CC_GE; break;
|
||||
case ICmpSge: flag = CC_L; break;
|
||||
case ICmpSle: flag = CC_G; break;
|
||||
}
|
||||
FixupBranch cont = Jit->J_CC(flag);
|
||||
regWriteExit(RI, getOp2(I));
|
||||
Jit->SetJumpTarget(cont);
|
||||
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||
|
|
|
@ -217,6 +217,10 @@ namespace IREmitter {
|
|||
return getOpcode(i) >= CInt16 && getOpcode(i) <= CInt32;
|
||||
}
|
||||
|
||||
// Tells whether instruction `i` is one of the integer-compare instructions:
// non-zero when its opcode falls inside the inclusive opcode range
// [ICmpEq, ICmpSle], zero otherwise.
unsigned inline isICmp(Inst i) {
	const unsigned opcode = getOpcode(i);
	return ICmpEq <= opcode && opcode <= ICmpSle;
}
|
||||
|
||||
unsigned inline isFResult(Inst i) {
|
||||
return getOpcode(i) > FResult_Start &&
|
||||
getOpcode(i) < FResult_End;
|
||||
|
@ -329,12 +333,21 @@ namespace IREmitter {
|
|||
// Produces the instruction for an ICmpEq comparison of op1 and op2 by
// forwarding to FoldBiOp with the ICmpEq opcode and returning its result.
// NOTE(review): presumably an integer "equal" compare - confirm against the opcode list.
InstLoc EmitICmpEq(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(ICmpEq, op1, op2);
|
||||
}
|
||||
// Produces the instruction for an ICmpNe comparison of op1 and op2 by
// forwarding to FoldBiOp with the ICmpNe opcode and returning its result.
// NOTE(review): presumably an integer "not equal" compare - confirm against the opcode list.
InstLoc EmitICmpNe(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(ICmpNe, op1, op2);
|
||||
}
|
||||
// Produces the instruction for an ICmpUgt comparison of op1 and op2 by
// forwarding to FoldBiOp with the ICmpUgt opcode and returning its result.
// NOTE(review): presumably an unsigned "greater than" compare - confirm against the opcode list.
InstLoc EmitICmpUgt(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(ICmpUgt, op1, op2);
|
||||
}
|
||||
// Produces the instruction for an ICmpSgt comparison of op1 and op2 by
// forwarding to FoldBiOp with the ICmpSgt opcode and returning its result.
// NOTE(review): presumably a signed "greater than" compare - confirm against the opcode list.
InstLoc EmitICmpSgt(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(ICmpSgt, op1, op2);
|
||||
}
|
||||
// Produces the instruction for an ICmpSlt comparison of op1 and op2 by
// forwarding to FoldBiOp with the ICmpSlt opcode and returning its result.
// NOTE(review): presumably a signed "less than" compare - confirm against the opcode list.
InstLoc EmitICmpSlt(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(ICmpSlt, op1, op2);
|
||||
}
|
||||
// Produces the instruction for an ICmpSge comparison of op1 and op2 by
// forwarding to FoldBiOp with the ICmpSge opcode and returning its result.
// NOTE(review): presumably a signed "greater or equal" compare - confirm against the opcode list.
InstLoc EmitICmpSge(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(ICmpSge, op1, op2);
|
||||
}
|
||||
// Produces the instruction for an ICmpSle comparison of op1 and op2 by
// forwarding to FoldBiOp with the ICmpSle opcode and returning its result.
// NOTE(review): presumably a signed "less or equal" compare - confirm against the opcode list.
InstLoc EmitICmpSle(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(ICmpSle, op1, op2);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue