From ce2b4bead95a305353a02ee938ff14f4eafd11f2 Mon Sep 17 00:00:00 2001 From: magumagu9 Date: Tue, 6 Jan 2009 07:35:06 +0000 Subject: [PATCH] And a bit more WIP JIT work, mostly floating-point improvements. It should be getting to the point where the performance is reasonably comparable to the current JIT on most workloads. I'm not sure of the exact comparisons because I haven't done much in terms of checking the performance versus the current JIT on floating-point heavy stuff. The floating point stuff is still relatively unoptimized, and there are still a few relatively common functions that aren't being compiled yet, but the most critical work is done. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1807 8ced0084-cf51-0410-be5f-012b33b47a6e --- Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp | 173 +++++++++++++++++- Source/Core/Core/Src/PowerPC/Jit64IL/IR.h | 44 +++++ .../Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp | 83 +++------ .../PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp | 125 ++----------- .../Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp | 25 +-- 5 files changed, 262 insertions(+), 188 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp index 791e7f377c..356718cc73 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp @@ -305,6 +305,10 @@ InstLoc IRBuilder::FoldUOp(unsigned Opcode, InstLoc Op1, unsigned extra) { if (getOpcode(*Op1) == ExpandPackedToMReg) return getOp1(Op1); } + if (Opcode == DoubleToSingle) { + if (getOpcode(*Op1) == DupSingleToMReg) + return getOp1(Op1); + } return EmitUOp(Opcode, Op1, extra); } @@ -590,6 +594,11 @@ static OpArg fregLocForSlot(RegInfo& RI, unsigned slot) { return M(&FSlotSet[slot*16]); } +// Used for accessing the top half of a spilled double +static OpArg fregLocForSlotPlusFour(RegInfo& RI, unsigned slot) { + return M(&FSlotSet[slot*16+4]); +} + static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) { unsigned newSpill = ++RI.numFSpills; RI.IInfo[I - RI.FirstI] |= newSpill << 16; @@ -679,6 +688,16 @@ static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) { return loc.GetSimpleReg(); } +static X64Reg fregEnsureInReg(RegInfo& RI, InstLoc I) { + OpArg loc = fregLocForInst(RI, I); + if (!loc.IsSimpleReg()) { + X64Reg newReg = fregFindFreeReg(RI); + RI.Jit->MOVAPD(newReg, loc); + loc = R(newReg); + } + return loc.GetSimpleReg(); +} + static void regSpillCallerSaved(RegInfo& RI) { regSpill(RI, EDX); regSpill(RI, ECX); @@ -1044,6 +1063,8 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case ExpandPackedToMReg: case CompactMRegToPacked: case FPNeg: + case FSNeg: + case FDNeg: if (thisUsed) regMarkUse(RI, I, getOp1(I), 1); break; @@ -1052,6 +1073,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case Load32: regMarkMemAddress(RI, I, getOp1(I), 1); break; + case LoadDouble: case LoadSingle: case LoadPaired: regMarkUse(RI, I, getOp1(I), 1); @@ -1086,9 +1108,17 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case ICmpSgt: case FSMul: case FSAdd: + case FSSub: + case FDMul: + case FDAdd: + case FDSub: case FPAdd: case FPMul: case FPSub: + case FPMerge00: + case FPMerge01: + case FPMerge10: + case FPMerge11: case InsertDoubleInMReg: if (thisUsed) { regMarkUse(RI, I, getOp1(I), 1); @@ -1104,6 +1134,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { regMarkMemAddress(RI, I, getOp2(I), 2); break; case StoreSingle: + case StoreDouble: case StorePaired: regMarkUse(RI, I, getOp1(I), 1); regMarkUse(RI, I, getOp2(I), 2); @@ -1417,6 +1448,21 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { regNormalRegClear(RI, I); break; } + case LoadDouble: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + Jit->ADD(32, R(ECX), Imm8(4)); + RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false); + Jit->MOVD_xmm(reg, R(ECX)); + Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false); + Jit->MOVD_xmm(XMM0, R(ECX)); + Jit->PUNPCKLDQ(reg, R(XMM0)); + RI.fregs[reg] = I; + regNormalRegClear(RI, I); + break; + } case LoadPaired: { if (!thisUsed) break; regSpill(RI, EAX); @@ -1449,6 +1495,30 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { regClearInst(RI, getOp2(I)); break; } + case StoreDouble: { + regSpill(RI, EAX); + // FIXME: Use 64-bit where possible + // FIXME: Use unsafe write with pshufb where possible + unsigned fspill = fregGetSpill(RI, getOp1(I)); + if (!fspill) { + // Force the value to spill, so we can use + // memory operations to load it + fspill = fregCreateSpill(RI, getOp1(I)); + X64Reg reg = fregLocForInst(RI, getOp1(I)).GetSimpleReg(); + RI.Jit->MOVAPD(fregLocForSlot(RI, fspill), reg); + } + Jit->MOV(32, R(EAX), fregLocForSlotPlusFour(RI, fspill)); + Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I))); + RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0); + Jit->MOV(32, R(EAX), fregLocForSlot(RI, fspill)); + Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I))); + RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 4); + if (RI.IInfo[I - RI.FirstI] & 4) + fregClearInst(RI, getOp1(I)); + if (RI.IInfo[I - RI.FirstI] & 8) + regClearInst(RI, getOp2(I)); + break; + } case StorePaired: { regSpill(RI, EAX); regSpill(RI, EDX); @@ -1501,6 +1571,28 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { fregNormalRegClear(RI, I); break; } + case FSNeg: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + static const u32 GC_ALIGNED16(ssSignBits[4]) = + {0x80000000}; + Jit->PXOR(reg, M((void*)&ssSignBits)); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case FDNeg: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + static const u64 GC_ALIGNED16(ssSignBits[2]) = + {0x8000000000000000ULL}; + Jit->PXOR(reg, M((void*)&ssSignBits)); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } case FPNeg: { if (!thisUsed) break; X64Reg reg = fregFindFreeReg(RI); @@ -1522,8 +1614,8 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { } case StoreFReg: { unsigned ppcreg = *I >> 16; - Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I))); - Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), XMM0); + Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), + fregEnsureInReg(RI, getOp1(I))); fregNormalRegClear(RI, I); break; } @@ -1553,6 +1645,42 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { fregNormalRegClear(RI, I); break; } + case FSSub: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + Jit->SUBSS(reg, fregLocForInst(RI, getOp2(I))); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case FDMul: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + Jit->MULSD(reg, fregLocForInst(RI, getOp2(I))); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case FDAdd: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + Jit->ADDSD(reg, fregLocForInst(RI, getOp2(I))); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case FDSub: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + Jit->SUBSD(reg, fregLocForInst(RI, getOp2(I))); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } case FPAdd: { if (!thisUsed) break; X64Reg reg = fregFindFreeReg(RI); @@ -1580,6 +1708,47 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { fregNormalRegClear(RI, I); break; } + case FPMerge00: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + Jit->PUNPCKLDQ(reg, fregLocForInst(RI, getOp2(I))); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case FPMerge01: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + // Note reversed operands! + Jit->MOVAPD(reg, fregLocForInst(RI, getOp2(I))); + Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I))); + Jit->MOVSS(reg, R(XMM0)); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case FPMerge10: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp2(I))); + Jit->MOVSS(reg, R(XMM0)); + Jit->SHUFPS(reg, R(reg), 0xF1); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case FPMerge11: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + Jit->PUNPCKLDQ(reg, fregLocForInst(RI, getOp2(I))); + Jit->SHUFPD(reg, R(reg), 0x1); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } case CInt32: case CInt16: { if (!thisUsed) break; diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h index 4db1100791..92f30d8081 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h @@ -157,13 +157,24 @@ namespace IREmitter { LoadFReg, FSMul, FSAdd, + FSSub, + FSNeg, FPAdd, FPMul, FPSub, FPNeg, + FDMul, + FDAdd, + FDSub, + FDNeg, + FPMerge00, + FPMerge01, + FPMerge10, + FPMerge11, FResult_End, StorePaired, StoreSingle, + StoreDouble, StoreFReg, // "Trinary" operators @@ -380,6 +391,9 @@ namespace IREmitter { InstLoc EmitStoreSingle(InstLoc value, InstLoc addr) { return FoldBiOp(StoreSingle, value, addr); } + InstLoc EmitStoreDouble(InstLoc value, InstLoc addr) { + return FoldBiOp(StoreDouble, value, addr); + } InstLoc EmitStorePaired(InstLoc value, InstLoc addr, unsigned quantReg) { return FoldBiOp(StorePaired, value, addr, quantReg); } @@ -410,6 +424,24 @@ namespace IREmitter { InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) { return FoldBiOp(FSAdd, op1, op2); } + InstLoc EmitFSSub(InstLoc op1, InstLoc op2) { + return FoldBiOp(FSSub, op1, op2); + } + InstLoc EmitFSNeg(InstLoc op1) { + return FoldUOp(FSNeg, op1); + } + InstLoc EmitFDMul(InstLoc op1, InstLoc op2) { + return FoldBiOp(FDMul, op1, op2); + } + InstLoc EmitFDAdd(InstLoc op1, InstLoc op2) { + return FoldBiOp(FDAdd, op1, op2); + } + InstLoc EmitFDSub(InstLoc op1, InstLoc op2) { + return FoldBiOp(FDSub, op1, op2); + } + InstLoc EmitFDNeg(InstLoc op1) { + return FoldUOp(FDNeg, op1); + } InstLoc EmitFPAdd(InstLoc op1, InstLoc op2) { return FoldBiOp(FPAdd, op1, op2); } @@ -419,6 +451,18 @@ namespace IREmitter { InstLoc EmitFPSub(InstLoc op1, InstLoc op2) { return FoldBiOp(FPSub, op1, op2); } + InstLoc EmitFPMerge00(InstLoc op1, InstLoc op2) { + return FoldBiOp(FPMerge00, op1, op2); + } + InstLoc EmitFPMerge01(InstLoc op1, InstLoc op2) { + return FoldBiOp(FPMerge01, op1, op2); + } + InstLoc EmitFPMerge10(InstLoc op1, InstLoc op2) { + return FoldBiOp(FPMerge10, op1, op2); + } + InstLoc EmitFPMerge11(InstLoc op1, InstLoc op2) { + return FoldBiOp(FPMerge11, op1, op2); + } InstLoc EmitFPNeg(InstLoc op1) { return FoldUOp(FPNeg, op1); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp index f64a5e1556..8837cc86ff 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp @@ -28,31 +28,40 @@ void Jit64::fp_arith_s(UGeckoInstruction inst) { - if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) { + if (inst.Rc || inst.OPCD != 59 || (inst.SUBOP5 != 25 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21)) { Default(inst); return; } IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA); - val = ibuild.EmitDoubleToSingle(val); - bool dupe = inst.OPCD == 59; switch (inst.SUBOP5) { case 25: //mul - val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC))); + val = ibuild.EmitFDMul(val, ibuild.EmitLoadFReg(inst.FC)); + break; case 18: //div case 20: //sub + val = ibuild.EmitFDSub(val, ibuild.EmitLoadFReg(inst.FB)); + break; case 21: //add + val = ibuild.EmitFDAdd(val, ibuild.EmitLoadFReg(inst.FB)); + break; case 23: //sel case 24: //res default: _assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!"); } - val = ibuild.EmitDupSingleToMReg(val); + + if (inst.OPCD == 59) { + val = ibuild.EmitDoubleToSingle(val); + val = ibuild.EmitDupSingleToMReg(val); + } else { + val = ibuild.EmitInsertDoubleInMReg(val, ibuild.EmitLoadFReg(inst.FD)); + } ibuild.EmitStoreFReg(val, inst.FD); } void Jit64::fmaddXX(UGeckoInstruction inst) { - if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 29) { + if (inst.Rc || inst.OPCD != 59) { Default(inst); return; } @@ -61,7 +70,12 @@ IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA); val = ibuild.EmitDoubleToSingle(val); val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC))); - val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB))); + if (inst.SUBOP5 & 1) + val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB))); + else + val = ibuild.EmitFSSub(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB))); + if (inst.SUBOP5 & 2) + val = ibuild.EmitFSNeg(val); val = ibuild.EmitDupSingleToMReg(val); ibuild.EmitStoreFReg(val, inst.FD); } @@ -78,57 +92,6 @@ void Jit64::fcmpx(UGeckoInstruction inst) { - if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; - if (jo.fpAccurateFlags) - { - Default(inst); - return; - } - bool ordered = inst.SUBOP10 == 32; - /* - double fa = rPS0(_inst.FA); - double fb = rPS0(_inst.FB); - u32 compareResult; - - if(IsNAN(fa) || IsNAN(fb)) compareResult = 1; - else if(fa < fb) compareResult = 8; - else if(fa > fb) compareResult = 4; - else compareResult = 2; - - FPSCR.FPRF = compareResult; - CR = (CR & (~(0xf0000000 >> (_inst.CRFD * 4)))) | (compareResult << ((7 - _inst.CRFD) * 4)); -*/ - int a = inst.FA; - int b = inst.FB; - int crf = inst.CRFD; - int shift = crf * 4; - //FPSCR - //XOR(32,R(EAX),R(EAX)); - - fpr.Lock(a,b); - if (a != b) - fpr.LoadToX64(a, true); - - // USES_CR - if (ordered) - COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b)); - else - UCOMISD(fpr.R(a).GetSimpleReg(), fpr.R(b)); - FixupBranch pLesser = J_CC(CC_B); - FixupBranch pGreater = J_CC(CC_A); - // _x86Reg == 0 - MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); - FixupBranch continue1 = J(); - // _x86Reg > 0 - SetJumpTarget(pGreater); - MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); - FixupBranch continue2 = J(); - // _x86Reg < 0 - SetJumpTarget(pLesser); - MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); - SetJumpTarget(continue1); - SetJumpTarget(continue2); - fpr.UnlockAll(); + Default(inst); + return; } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp index fca175b33c..0c13c4e05d 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp @@ -65,121 +65,26 @@ void Jit64::lfs(UGeckoInstruction inst) void Jit64::lfd(UGeckoInstruction inst) { - if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; - - int d = inst.RD; - int a = inst.RA; - if (!a) - { - Default(inst); - return; - } - s32 offset = (s32)(s16)inst.SIMM_16; - gpr.FlushLockX(ABI_PARAM1); - gpr.Lock(a); - MOV(32, R(ABI_PARAM1), gpr.R(a)); - // TODO - optimize. This has to load the previous value - upper double should stay unmodified. - fpr.LoadToX64(d, true); - fpr.Lock(d); - X64Reg xd = fpr.RX(d); - if (cpu_info.bSSSE3) { -#ifdef _M_X64 - MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset)); -#else - AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK)); - MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset)); -#endif - PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe)); - MOVSD(xd, R(XMM0)); - } else { -#ifdef _M_X64 - MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset)); - BSWAP(64, EAX); - MOV(64, M(&temp64), R(EAX)); - MOVSD(XMM0, M(&temp64)); - MOVSD(xd, R(XMM0)); -#else - AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK)); - MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset)); - BSWAP(32, EAX); - MOV(32, M((void*)((u32)&temp64+4)), R(EAX)); - MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4)); - BSWAP(32, EAX); - MOV(32, M(&temp64), R(EAX)); - MOVSD(XMM0, M(&temp64)); - MOVSD(xd, R(XMM0)); -#if 0 - // Alternate implementation; possibly faster - AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK)); - MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset)); - PSHUFLW(XMM0, R(XMM0), 0x1B); - PSRLW(XMM0, 8); - MOVSD(xd, R(XMM0)); - MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset)); - PSHUFLW(XMM0, R(XMM0), 0x1B); - PSLLW(XMM0, 8); - POR(xd, R(XMM0)); -#endif -#endif - } - gpr.UnlockAll(); - gpr.UnlockAllX(); - fpr.UnlockAll(); + IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), val; + if (inst.RA) + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + val = ibuild.EmitLoadFReg(inst.RD); + val = ibuild.EmitInsertDoubleInMReg(ibuild.EmitLoadDouble(addr), val); + ibuild.EmitStoreFReg(val, inst.RD); + return; } void Jit64::stfd(UGeckoInstruction inst) { - if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; - - int s = inst.RS; - int a = inst.RA; - if (!a) - { - Default(inst); - return; - } - s32 offset = (s32)(s16)inst.SIMM_16; - gpr.FlushLockX(ABI_PARAM1); - gpr.Lock(a); - fpr.Lock(s); - MOV(32, R(ABI_PARAM1), gpr.R(a)); -#ifdef _M_IX86 - AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK)); -#endif - if (cpu_info.bSSSE3) { - MOVAPD(XMM0, fpr.R(s)); - PSHUFB(XMM0, M((void *)bswapShuffle1x8)); -#ifdef _M_X64 - MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, offset), XMM0); -#else - MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base + offset), XMM0); -#endif - } else { -#ifdef _M_X64 - fpr.LoadToX64(s, true, false); - MOVSD(M(&temp64), fpr.RX(s)); - MOV(64, R(EAX), M(&temp64)); - BSWAP(64, EAX); - MOV(64, MComplex(RBX, ABI_PARAM1, SCALE_1, offset), R(EAX)); -#else - fpr.LoadToX64(s, true, false); - MOVSD(M(&temp64), fpr.RX(s)); - MOV(32, R(EAX), M(&temp64)); - BSWAP(32, EAX); - MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4), R(EAX)); - MOV(32, R(EAX), M((void*)((u32)&temp64 + 4))); - BSWAP(32, EAX); - MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset), R(EAX)); -#endif - } - gpr.UnlockAll(); - gpr.UnlockAllX(); - fpr.UnlockAll(); + IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), + val = ibuild.EmitLoadFReg(inst.RS); + if (inst.RA) + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + if (inst.OPCD & 1) + ibuild.EmitStoreGReg(addr, inst.RA); + ibuild.EmitStoreDouble(val, addr); + return; } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp index b0b2a3cf58..b34a14c2d2 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp @@ -270,42 +270,35 @@ //TODO: find easy cases and optimize them, do a breakout like ps_arith void Jit64::ps_mergeXX(UGeckoInstruction inst) { - if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; if (inst.Rc) { Default(inst); return; } - int d = inst.FD; - int a = inst.FA; - int b = inst.FB; - fpr.Lock(a,b,d); - MOVAPD(XMM0, fpr.R(a)); + IREmitter::InstLoc val = ibuild.EmitCompactMRegToPacked(ibuild.EmitLoadFReg(inst.FA)), + rhs = ibuild.EmitCompactMRegToPacked(ibuild.EmitLoadFReg(inst.FB)); + switch (inst.SUBOP10) { case 528: - UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf + val = ibuild.EmitFPMerge00(val, rhs); break; //00 case 560: - SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here + val = ibuild.EmitFPMerge01(val, rhs); break; //01 case 592: - SHUFPD(XMM0, fpr.R(b), 1); + val = ibuild.EmitFPMerge10(val, rhs); break; //10 case 624: - UNPCKHPD(XMM0, fpr.R(b)); + val = ibuild.EmitFPMerge11(val, rhs); break; //11 default: _assert_msg_(DYNA_REC, 0, "ps_merge - invalid op"); } - fpr.LoadToX64(d, false); - MOVAPD(fpr.RX(d), Gen::R(XMM0)); - fpr.UnlockAll(); + val = ibuild.EmitExpandPackedToMReg(val); + ibuild.EmitStoreFReg(val, inst.FD); } - //TODO: add optimized cases void Jit64::ps_maddXX(UGeckoInstruction inst) { if (inst.Rc || (inst.SUBOP5 != 28 && inst.SUBOP5 != 29 && inst.SUBOP5 != 30)) {