From 018cb993e30027b0261f96037f56fac107492743 Mon Sep 17 00:00:00 2001 From: magumagu9 Date: Sun, 11 Jan 2009 01:26:58 +0000 Subject: [PATCH] A tiny bit more JIT WIP work. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1847 8ced0084-cf51-0410-be5f-012b33b47a6e --- Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp | 79 +++++++++++++-------- Source/Core/Core/Src/PowerPC/Jit64IL/IR.h | 13 ++++ 2 files changed, 62 insertions(+), 30 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp index 2ae2c5cbae..007079338e 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp @@ -90,16 +90,15 @@ integer code are more aggresively combining blocks and dead condition register elimination, which should be very helpful for small blocks. TODO (in no particular order): -Floating-point JIT (both paired and unpaired) - (very large win for FP code, no effect for integer code) -Inter-block dead condition register elimination (Likely significant win - combined with optimized conditions) -Optimize conditions for conditional branches. -General dead register elimination. -Inter-block inlining. -Track down issues with new JIT + dual-core mode (I think I'm going to - need help with this one; I'm not very familiar with the - dual-core code.) +JIT for misc remaining FP instructions +JIT for bcctrx +Misc optimizations for FP instructions +Inter-block dead register elimination; this seems likely to have large + performance benefits, although I'm not completely sure. +Inter-block inlining; also likely to have large performance benefits. + The tricky parts are deciding which blocks to inline, and that the + IR can't really deal with branches whose destination is in the + the middle of a generated block. 
Specialized slw/srw/sraw; I think there are some tricks that could have a non-trivial effect, and there are significantly shorter implementations for 64-bit involving abusing 64-bit shifts. @@ -111,15 +110,19 @@ Scheduling to reduce register pressure: PowerPC compilers like to push instruction reordering. Common subexpression elimination Optimize load/store of sum using complex addressing (partially implemented) -Implement idle-skipping -Loop optimizations (loop-carried registers, LICM); not sure how much - this will help on top of dead register elimination -Fold loads (both register and memory) into arithmetic operations +Loop optimizations (loop-carried registers, LICM) +Fold register loads into arithmetic operations Code refactoring/cleanup Investigate performance of the JIT itself; this doesn't affect framerates significantly, but it does take a visible amount of time for a complicated piece of code like a video decoder to compile. +Fix profiled loads/stores to work safely. On 32-bit, one solution is to + use a spare segment register, and expand the backpatch solution + to work in all the relevant situations. On 64-bit, the existing + fast memory solution should basically work. An alternative + would be to figure out a heuristic for what loads actually + vary their "type", and special-case them. 
*/ @@ -464,6 +467,12 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) { if (branchValue == 2) return FoldBranchCond(EmitICmpEq(getOp1(getOp1(Op1)), getOp2(getOp1(Op1))), Op2); + if (branchValue == 4) + return FoldBranchCond(EmitICmpSgt(getOp1(getOp1(Op1)), + getOp2(getOp1(Op1))), Op2); + if (branchValue == 8) + return FoldBranchCond(EmitICmpSlt(getOp1(getOp1(Op1)), + getOp2(getOp1(Op1))), Op2); } if (getOpcode(*Op1) == Xor && isImm(*getOp2(Op1))) { @@ -475,10 +484,15 @@ InstLoc IRBuilder::FoldBranchCond(InstLoc Op1, InstLoc Op2) { unsigned innerBranchValue = GetImmValue(getOp2(XOp1)); if (branchValue == innerBranchValue) { - if (branchValue == 4) { + if (branchValue == 2) + return FoldBranchCond(EmitICmpNe(getOp1(getOp1(XOp1)), + getOp2(getOp1(XOp1))), Op2); + if (branchValue == 4) return FoldBranchCond(EmitICmpSle(getOp1(getOp1(XOp1)), getOp2(getOp1(XOp1))), Op2); - } + if (branchValue == 8) + return FoldBranchCond(EmitICmpSge(getOp1(getOp1(XOp1)), + getOp2(getOp1(XOp1))), Op2); } } } @@ -493,6 +507,9 @@ InstLoc IRBuilder::FoldICmp(unsigned Opcode, InstLoc Op1, InstLoc Op2) { case ICmpEq: result = GetImmValue(Op1) == GetImmValue(Op2); break; + case ICmpNe: + result = GetImmValue(Op1) != GetImmValue(Op2); + break; case ICmpUgt: result = GetImmValue(Op1) > GetImmValue(Op2); break; @@ -1285,9 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { regMarkUse(RI, I, getOp1(I), 1); break; case BranchCond: { - unsigned CondOpcode = getOpcode(*getOp1(I)); - if ((CondOpcode == ICmpEq || - CondOpcode == ICmpSle) && + if (isICmp(*getOp1(I)) && isImm(*getOp2(getOp1(I)))) { regMarkUse(RI, I, getOp1(getOp1(I)), 1); } else { @@ -1904,20 +1919,24 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case BlockEnd: break; case BranchCond: { - if (getOpcode(*getOp1(I)) == ICmpEq && + if (isICmp(*getOp1(I)) && isImm(*getOp2(getOp1(I)))) { Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))), 
Imm32(RI.Build->GetImmValue(getOp2(getOp1(I))))); - FixupBranch cont = Jit->J_CC(CC_NZ); - regWriteExit(RI, getOp2(I)); - Jit->SetJumpTarget(cont); - if (RI.IInfo[I - RI.FirstI] & 4) - regClearInst(RI, getOp1(getOp1(I))); - } else if (getOpcode(*getOp1(I)) == ICmpSle && - isImm(*getOp2(getOp1(I)))) { - Jit->CMP(32, regLocForInst(RI, getOp1(getOp1(I))), - Imm32(RI.Build->GetImmValue(getOp2(getOp1(I))))); - FixupBranch cont = Jit->J_CC(CC_G); + CCFlags flag; + switch (getOpcode(*getOp1(I))) { + case ICmpEq: flag = CC_NE; break; + case ICmpNe: flag = CC_E; break; + case ICmpUgt: flag = CC_BE; break; + case ICmpUlt: flag = CC_AE; break; + case ICmpUge: flag = CC_B; break; + case ICmpUle: flag = CC_A; break; + case ICmpSgt: flag = CC_LE; break; + case ICmpSlt: flag = CC_GE; break; + case ICmpSge: flag = CC_L; break; + case ICmpSle: flag = CC_G; break; + } + FixupBranch cont = Jit->J_CC(flag); regWriteExit(RI, getOp2(I)); Jit->SetJumpTarget(cont); if (RI.IInfo[I - RI.FirstI] & 4) regClearInst(RI, getOp1(getOp1(I))); diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h index 58f768dcd1..27edb78c5c 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h @@ -216,6 +216,10 @@ namespace IREmitter { unsigned inline isImm(Inst i) { return getOpcode(i) >= CInt16 && getOpcode(i) <= CInt32; } + + unsigned inline isICmp(Inst i) { + return getOpcode(i) >= ICmpEq && getOpcode(i) <= ICmpSle; + } unsigned inline isFResult(Inst i) { return getOpcode(i) > FResult_Start && @@ -329,12 +333,21 @@ namespace IREmitter { InstLoc EmitICmpEq(InstLoc op1, InstLoc op2) { return FoldBiOp(ICmpEq, op1, op2); } + InstLoc EmitICmpNe(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpNe, op1, op2); + } InstLoc EmitICmpUgt(InstLoc op1, InstLoc op2) { return FoldBiOp(ICmpUgt, op1, op2); } InstLoc EmitICmpSgt(InstLoc op1, InstLoc op2) { return FoldBiOp(ICmpSgt, op1, op2); } + InstLoc EmitICmpSlt(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpSlt, 
op1, op2); + } + InstLoc EmitICmpSge(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpSge, op1, op2); + } InstLoc EmitICmpSle(InstLoc op1, InstLoc op2) { return FoldBiOp(ICmpSle, op1, op2); }