A tiny bit more JIT WIP work.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1847 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
magumagu9 2009-01-11 01:26:58 +00:00
parent 4acda0096b
commit 018cb993e3
2 changed files with 62 additions and 30 deletions

View File

@ -90,16 +90,15 @@ integer code are more aggressively combining blocks and dead condition
register elimination, which should be very helpful for small blocks.
TODO (in no particular order):
Floating-point JIT (both paired and unpaired)
(very large win for FP code, no effect for integer code)
Inter-block dead condition register elimination (Likely significant win
combined with optimized conditions)
Optimize conditions for conditional branches.
General dead register elimination.
Inter-block inlining.
Track down issues with new JIT + dual-core mode (I think I'm going to
need help with this one; I'm not very familiar with the
dual-core code.)
JIT for misc remaining FP instructions
JIT for bcctrx
Misc optimizations for FP instructions
Inter-block dead register elimination; this seems likely to have large
performance benefits, although I'm not completely sure.
Inter-block inlining; also likely to have large performance benefits.
The tricky parts are deciding which blocks to inline, and that the
IR can't really deal with branches whose destination is in the
middle of a generated block.
Specialized slw/srw/sraw; I think there are some tricks that could
have a non-trivial effect, and there are significantly shorter
implementations for 64-bit involving abusing 64-bit shifts.
@ -111,15 +110,19 @@ Scheduling to reduce register pressure: PowerPC compilers like to push
instruction reordering.
Common subexpression elimination
Optimize load/store of sum using complex addressing (partially implemented)
Implement idle-skipping
Loop optimizations (loop-carried registers, LICM); not sure how much
this will help on top of dead register elimination
Fold loads (both register and memory) into arithmetic operations
Loop optimizations (loop-carried registers, LICM)
Fold register loads into arithmetic operations
Code refactoring/cleanup
Investigate performance of the JIT itself; this doesn't affect
framerates significantly, but it does take a visible amount
of time for a complicated piece of code like a video decoder
to compile.
Fix profiled loads/stores to work safely. On 32-bit, one solution is to
use a spare segment register, and expand the backpatch solution
to work in all the relevant situations. On 64-bit, the existing
fast memory solution should basically work. An alternative
would be to figure out a heuristic for what loads actually
vary their "type", and special-case them.
*/
@ -464,6 +467,12 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) {
if (branchValue == 2)
return FoldBranchCond(EmitICmpEq(getOp1(getOp1(Op1)),
getOp2(getOp1(Op1))), Op2);
if (branchValue == 4)
return FoldBranchCond(EmitICmpSgt(getOp1(getOp1(Op1)),
getOp2(getOp1(Op1))), Op2);
if (branchValue == 8)
return FoldBranchCond(EmitICmpSlt(getOp1(getOp1(Op1)),
getOp2(getOp1(Op1))), Op2);
}
if (getOpcode(*Op1) == Xor &&
isImm(*getOp2(Op1))) {
@ -475,10 +484,15 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) {
unsigned innerBranchValue =
GetImmValue(getOp2(XOp1));
if (branchValue == innerBranchValue) {
if (branchValue == 4) {
if (branchValue == 2)
return FoldBranchCond(EmitICmpNe(getOp1(getOp1(XOp1)),
getOp2(getOp1(XOp1))), Op2);
if (branchValue == 4)
return FoldBranchCond(EmitICmpSle(getOp1(getOp1(XOp1)),
getOp2(getOp1(XOp1))), Op2);
}
if (branchValue == 8)
return FoldBranchCond(EmitICmpSge(getOp1(getOp1(XOp1)),
getOp2(getOp1(XOp1))), Op2);
}
}
}
@ -493,6 +507,9 @@ InstLoc IRBuilder::FoldICmp(unsigned Opcode, InstLoc Op1, InstLoc Op2) {
case ICmpEq:
result = GetImmValue(Op1) == GetImmValue(Op2);
break;
case ICmpNe:
result = GetImmValue(Op1) != GetImmValue(Op2);
break;
case ICmpUgt:
result = GetImmValue(Op1) > GetImmValue(Op2);
break;
@ -1285,9 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
regMarkUse(RI, I, getOp1(I), 1);
break;
case BranchCond: {
unsigned CondOpcode = getOpcode(*getOp1(I));
if ((CondOpcode == ICmpEq ||
CondOpcode == ICmpSle) &&
if (isICmp(*getOp1(I)) &&
isImm(*getOp2(getOp1(I)))) {
regMarkUse(RI, I, getOp1(getOp1(I)), 1);
} else {
@ -1904,20 +1919,24 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case BlockEnd:
break;
case BranchCond: {
if (getOpcode(*getOp1(I)) == ICmpEq &&
if (isICmp(*getOp1(I)) &&
isImm(*getOp2(getOp1(I)))) {
Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))),
Imm32(RI.Build->GetImmValue(getOp2(getOp1(I)))));
FixupBranch cont = Jit->J_CC(CC_NZ);
regWriteExit(RI, getOp2(I));
Jit->SetJumpTarget(cont);
if (RI.IInfo[I - RI.FirstI] & 4)
regClearInst(RI, getOp1(getOp1(I)));
} else if (getOpcode(*getOp1(I)) == ICmpSle &&
isImm(*getOp2(getOp1(I)))) {
Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))),
Imm32(RI.Build->GetImmValue(getOp2(getOp1(I)))));
FixupBranch cont = Jit->J_CC(CC_G);
// Pick the x86 condition under which the conditional jump below SKIPS the
// exit, i.e. the logical negation of the IR comparison held in getOp1(I).
// Unsigned comparisons need the above/below conditions; signed comparisons
// need the greater/less conditions.
// NOTE(review): there is no default case, so `flag` is only assigned for the
// opcodes listed here; this relies on the enclosing isICmp() guard admitting
// nothing else -- confirm against the opcode enum.
CCFlags flag;
switch (getOpcode(*getOp1(I))) {
case ICmpEq: flag = CC_NE; break;
case ICmpNe: flag = CC_E; break;
case ICmpUgt: flag = CC_BE; break;
case ICmpUlt: flag = CC_AE; break;
// !(a >= b) for unsigned operands is "below"; the signed CC_L used here
// previously gave the wrong answer when the operands differ in the top bit.
case ICmpUge: flag = CC_B; break;
case ICmpUle: flag = CC_A; break;
case ICmpSgt: flag = CC_LE; break;
case ICmpSlt: flag = CC_GE; break;
case ICmpSge: flag = CC_L; break;
case ICmpSle: flag = CC_G; break;
}
FixupBranch cont = Jit->J_CC(flag);
regWriteExit(RI, getOp2(I));
Jit->SetJumpTarget(cont);
if (RI.IInfo[I - RI.FirstI] & 4)

View File

@ -217,6 +217,10 @@ namespace IREmitter {
return getOpcode(i) >= CInt16 && getOpcode(i) <= CInt32;
}
// Returns 1 when the opcode of `i` lies in the inclusive range
// [ICmpEq, ICmpSle], and 0 otherwise.
unsigned inline isICmp(Inst i) {
	// Anything below the first comparison opcode is rejected up front.
	if (getOpcode(i) < ICmpEq)
		return 0;
	return getOpcode(i) <= ICmpSle;
}
unsigned inline isFResult(Inst i) {
return getOpcode(i) > FResult_Start &&
getOpcode(i) < FResult_End;
@ -329,12 +333,21 @@ namespace IREmitter {
// Requests an ICmpEq instruction over op1 and op2 from FoldBiOp and returns
// the instruction location it hands back.
InstLoc EmitICmpEq(InstLoc op1, InstLoc op2) {
	InstLoc cmp = FoldBiOp(ICmpEq, op1, op2);
	return cmp;
}
// Requests an ICmpNe instruction over op1 and op2 from FoldBiOp and returns
// the instruction location it hands back.
InstLoc EmitICmpNe(InstLoc op1, InstLoc op2) {
	InstLoc cmp = FoldBiOp(ICmpNe, op1, op2);
	return cmp;
}
// Requests an ICmpUgt instruction over op1 and op2 from FoldBiOp and returns
// the instruction location it hands back.
InstLoc EmitICmpUgt(InstLoc op1, InstLoc op2) {
	InstLoc cmp = FoldBiOp(ICmpUgt, op1, op2);
	return cmp;
}
// Requests an ICmpSgt instruction over op1 and op2 from FoldBiOp and returns
// the instruction location it hands back.
InstLoc EmitICmpSgt(InstLoc op1, InstLoc op2) {
	InstLoc cmp = FoldBiOp(ICmpSgt, op1, op2);
	return cmp;
}
// Requests an ICmpSlt instruction over op1 and op2 from FoldBiOp and returns
// the instruction location it hands back.
InstLoc EmitICmpSlt(InstLoc op1, InstLoc op2) {
	InstLoc cmp = FoldBiOp(ICmpSlt, op1, op2);
	return cmp;
}
// Requests an ICmpSge instruction over op1 and op2 from FoldBiOp and returns
// the instruction location it hands back.
InstLoc EmitICmpSge(InstLoc op1, InstLoc op2) {
	InstLoc cmp = FoldBiOp(ICmpSge, op1, op2);
	return cmp;
}
// Requests an ICmpSle instruction over op1 and op2 from FoldBiOp and returns
// the instruction location it hands back.
InstLoc EmitICmpSle(InstLoc op1, InstLoc op2) {
	InstLoc cmp = FoldBiOp(ICmpSle, op1, op2);
	return cmp;
}