And a bit more WIP JIT work, mostly floating-point improvements.

It should be getting to the point where the performance is reasonably
comparable to the current JIT on most workloads. I'm not sure of the
exact comparison, because I haven't done much checking of the
performance versus the current JIT on floating-point-heavy code. The
floating-point code is still relatively unoptimized, and there are still a
few relatively common functions that aren't being compiled yet, but the most
critical work is done.



git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1807 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
magumagu9 2009-01-06 07:35:06 +00:00
parent acae6e6b69
commit ce2b4bead9
5 changed files with 262 additions and 188 deletions

View File

@ -305,6 +305,10 @@ InstLoc IRBuilder::FoldUOp(unsigned Opcode, InstLoc Op1, unsigned extra) {
if (getOpcode(*Op1) == ExpandPackedToMReg)
return getOp1(Op1);
}
if (Opcode == DoubleToSingle) {
if (getOpcode(*Op1) == DupSingleToMReg)
return getOp1(Op1);
}
return EmitUOp(Opcode, Op1, extra);
}
@ -590,6 +594,11 @@ static OpArg fregLocForSlot(RegInfo& RI, unsigned slot) {
return M(&FSlotSet[slot*16]);
}
// Memory operand for the top half of a spilled double: addresses FSlotSet
// four elements past the start of the given slot (slot*16 + 4).
// RI is accepted for signature parity with fregLocForSlot and is not read.
static OpArg fregLocForSlotPlusFour(RegInfo& RI, unsigned slot) {
    const unsigned index = slot*16+4;
    return M(&FSlotSet[index]);
}
static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) {
unsigned newSpill = ++RI.numFSpills;
RI.IInfo[I - RI.FirstI] |= newSpill << 16;
@ -679,6 +688,16 @@ static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) {
return loc.GetSimpleReg();
}
// Returns an XMM register holding the value of instruction I.
// If the value's current location is already a plain register, that register
// is returned unchanged. Otherwise a free FP register is obtained and a MOVAPD
// from the current location into it is emitted.
// Note that the new register is not recorded in RI.fregs here.
static X64Reg fregEnsureInReg(RegInfo& RI, InstLoc I) {
    OpArg where = fregLocForInst(RI, I);
    if (where.IsSimpleReg())
        return where.GetSimpleReg();

    // Value lives elsewhere: copy it into a freshly allocated register.
    X64Reg scratch = fregFindFreeReg(RI);
    RI.Jit->MOVAPD(scratch, where);
    where = R(scratch);
    return where.GetSimpleReg();
}
static void regSpillCallerSaved(RegInfo& RI) {
regSpill(RI, EDX);
regSpill(RI, ECX);
@ -1044,6 +1063,8 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case ExpandPackedToMReg:
case CompactMRegToPacked:
case FPNeg:
case FSNeg:
case FDNeg:
if (thisUsed)
regMarkUse(RI, I, getOp1(I), 1);
break;
@ -1052,6 +1073,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case Load32:
regMarkMemAddress(RI, I, getOp1(I), 1);
break;
case LoadDouble:
case LoadSingle:
case LoadPaired:
regMarkUse(RI, I, getOp1(I), 1);
@ -1086,9 +1108,17 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case ICmpSgt:
case FSMul:
case FSAdd:
case FSSub:
case FDMul:
case FDAdd:
case FDSub:
case FPAdd:
case FPMul:
case FPSub:
case FPMerge00:
case FPMerge01:
case FPMerge10:
case FPMerge11:
case InsertDoubleInMReg:
if (thisUsed) {
regMarkUse(RI, I, getOp1(I), 1);
@ -1104,6 +1134,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
regMarkMemAddress(RI, I, getOp2(I), 2);
break;
case StoreSingle:
case StoreDouble:
case StorePaired:
regMarkUse(RI, I, getOp1(I), 1);
regMarkUse(RI, I, getOp2(I), 2);
@ -1417,6 +1448,21 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
regNormalRegClear(RI, I);
break;
}
case LoadDouble: {
// Emits code that loads a 64-bit value as two 32-bit words and assembles it
// in the low 64 bits of a newly allocated FP register. Nothing is emitted
// when the result of this instruction is unused.
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
// First word: load 32 bits from (operand 1 + 4) and place them in the low
// dword of `reg`. ECX is used as both the address and the data register.
// NOTE(review): ECX is not spilled in this case; presumably it is reserved
// as a scratch register here. Confirm against the allocator.
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->ADD(32, R(ECX), Imm8(4));
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
Jit->MOVD_xmm(reg, R(ECX));
// Second word: load 32 bits from operand 1 itself and stage them in XMM0,
// which is used as a temporary.
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
Jit->MOVD_xmm(XMM0, R(ECX));
// Interleave the low dwords so that reg = { word@(addr+4), word@addr, ... }:
// the word at the lower address ends up as the upper half of the double.
Jit->PUNPCKLDQ(reg, R(XMM0));
// Record that `reg` now holds the value of this instruction.
RI.fregs[reg] = I;
// The address operand is an integer value, hence the integer-register
// variant of the clear helper (presumably releases it on last use; confirm).
regNormalRegClear(RI, I);
break;
}
case LoadPaired: {
if (!thisUsed) break;
regSpill(RI, EAX);
@ -1449,6 +1495,30 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
regClearInst(RI, getOp2(I));
break;
}
case StoreDouble: {
// Emits code that stores the double in operand 1 to the address in
// operand 2 as two separate 32-bit writes, reading the value back from
// its FP spill slot.
// EAX is spilled first because it carries each 32-bit half below.
regSpill(RI, EAX);
// FIXME: Use 64-bit where possible
// FIXME: Use unsafe write with pshufb where possible
unsigned fspill = fregGetSpill(RI, getOp1(I));
if (!fspill) {
// Force the value to spill, so we can use
// memory operations to load it
fspill = fregCreateSpill(RI, getOp1(I));
X64Reg reg = fregLocForInst(RI, getOp1(I)).GetSimpleReg();
RI.Jit->MOVAPD(fregLocForSlot(RI, fspill), reg);
}
// Top half of the spilled double (slot + 4) is written at offset 0 of
// the destination address.
Jit->MOV(32, R(EAX), fregLocForSlotPlusFour(RI, fspill));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0);
// Bottom half (slot + 0) is written at offset 4. The address is reloaded
// into ECX before the second write.
// NOTE(review): presumably because the first write may clobber ECX; confirm.
Jit->MOV(32, R(EAX), fregLocForSlot(RI, fspill));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 4);
// Bits 4 and 8 of this instruction's IInfo word gate clearing of operand 1
// (FP) and operand 2 (integer) respectively.
// NOTE(review): these look like per-operand "last use" flags; confirm.
if (RI.IInfo[I - RI.FirstI] & 4)
fregClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
regClearInst(RI, getOp2(I));
break;
}
case StorePaired: {
regSpill(RI, EAX);
regSpill(RI, EDX);
@ -1501,6 +1571,28 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
fregNormalRegClear(RI, I);
break;
}
case FSNeg: {
    // Scalar single negate: copy operand 1 into a fresh FP register and
    // XOR it with a sign-bit mask. Skipped entirely when the result is unused.
    if (thisUsed) {
        // Only element 0 is initialised explicitly; the remaining three
        // elements are zero, so only bit 31 of the lowest dword is flipped.
        static const u32 GC_ALIGNED16(ssSignBits[4]) =
            {0x80000000};
        const X64Reg dst = fregFindFreeReg(RI);
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp1(I)));
        Jit->PXOR(dst, M((void*)&ssSignBits));
        RI.fregs[dst] = I;
        fregNormalRegClear(RI, I);
    }
    break;
}
case FDNeg: {
    // Scalar double negate: copy operand 1 into a fresh FP register and
    // XOR it with a sign-bit mask. Skipped entirely when the result is unused.
    if (thisUsed) {
        // Only element 0 is initialised explicitly; element 1 is zero, so
        // only bit 63 of the low quadword is flipped.
        static const u64 GC_ALIGNED16(ssSignBits[2]) =
            {0x8000000000000000ULL};
        const X64Reg dst = fregFindFreeReg(RI);
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp1(I)));
        Jit->PXOR(dst, M((void*)&ssSignBits));
        RI.fregs[dst] = I;
        fregNormalRegClear(RI, I);
    }
    break;
}
case FPNeg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
@ -1522,8 +1614,8 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
}
case StoreFReg: {
unsigned ppcreg = *I >> 16;
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), XMM0);
Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]),
fregEnsureInReg(RI, getOp1(I)));
fregNormalRegClear(RI, I);
break;
}
@ -1553,6 +1645,42 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
fregNormalRegClear(RI, I);
break;
}
case FSSub: {
    // Scalar single subtract: dst = op1, then SUBSS dst, op2.
    // No code is emitted when the result is unused.
    if (thisUsed) {
        const X64Reg dst = fregFindFreeReg(RI);
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp1(I)));
        Jit->SUBSS(dst, fregLocForInst(RI, getOp2(I)));
        RI.fregs[dst] = I;  // dst now holds this instruction's value
        fregNormalRegClear(RI, I);
    }
    break;
}
case FDMul: {
    // Scalar double multiply: dst = op1, then MULSD dst, op2.
    // No code is emitted when the result is unused.
    if (thisUsed) {
        const X64Reg dst = fregFindFreeReg(RI);
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp1(I)));
        Jit->MULSD(dst, fregLocForInst(RI, getOp2(I)));
        RI.fregs[dst] = I;  // dst now holds this instruction's value
        fregNormalRegClear(RI, I);
    }
    break;
}
case FDAdd: {
    // Scalar double add: dst = op1, then ADDSD dst, op2.
    // No code is emitted when the result is unused.
    if (thisUsed) {
        const X64Reg dst = fregFindFreeReg(RI);
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp1(I)));
        Jit->ADDSD(dst, fregLocForInst(RI, getOp2(I)));
        RI.fregs[dst] = I;  // dst now holds this instruction's value
        fregNormalRegClear(RI, I);
    }
    break;
}
case FDSub: {
    // Scalar double subtract: dst = op1, then SUBSD dst, op2.
    // No code is emitted when the result is unused.
    if (thisUsed) {
        const X64Reg dst = fregFindFreeReg(RI);
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp1(I)));
        Jit->SUBSD(dst, fregLocForInst(RI, getOp2(I)));
        RI.fregs[dst] = I;  // dst now holds this instruction's value
        fregNormalRegClear(RI, I);
    }
    break;
}
case FPAdd: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
@ -1580,6 +1708,47 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
fregNormalRegClear(RI, I);
break;
}
case FPMerge00: {
    // Packed-single merge producing { op1[0], op2[0] } in the two low lanes:
    // PUNPCKLDQ interleaves the low dwords of dst (= op1) and op2.
    // No code is emitted when the result is unused.
    if (thisUsed) {
        const X64Reg dst = fregFindFreeReg(RI);
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp1(I)));
        Jit->PUNPCKLDQ(dst, fregLocForInst(RI, getOp2(I)));
        RI.fregs[dst] = I;
        fregNormalRegClear(RI, I);
    }
    break;
}
case FPMerge01: {
    // Packed-single merge producing { op1[0], op2[1] } in the two low lanes.
    // No code is emitted when the result is unused.
    if (thisUsed) {
        const X64Reg dst = fregFindFreeReg(RI);
        // Operands are deliberately taken in reverse order: start from op2
        // (which supplies lane 1), then overwrite lane 0 with op1's lane 0.
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp2(I)));
        Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
        // Register-to-register MOVSS replaces only the lowest dword of dst.
        Jit->MOVSS(dst, R(XMM0));
        RI.fregs[dst] = I;
        fregNormalRegClear(RI, I);
    }
    break;
}
case FPMerge10: {
    // Packed-single merge producing { op1[1], op2[0] } in the two low lanes.
    // No code is emitted when the result is unused.
    if (thisUsed) {
        const X64Reg dst = fregFindFreeReg(RI);
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp1(I)));
        Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp2(I)));
        // dst = { op2[0], op1[1], ... } after replacing the lowest dword.
        Jit->MOVSS(dst, R(XMM0));
        // 0xF1 selects lanes (1, 0, 3, 3): swaps the two low lanes.
        Jit->SHUFPS(dst, R(dst), 0xF1);
        RI.fregs[dst] = I;
        fregNormalRegClear(RI, I);
    }
    break;
}
case FPMerge11: {
    // Packed-single merge producing { op1[1], op2[1] } in the two low lanes.
    // No code is emitted when the result is unused.
    if (thisUsed) {
        const X64Reg dst = fregFindFreeReg(RI);
        Jit->MOVAPD(dst, fregLocForInst(RI, getOp1(I)));
        // dst = { op1[0], op2[0], op1[1], op2[1] } after the interleave.
        Jit->PUNPCKLDQ(dst, fregLocForInst(RI, getOp2(I)));
        // Swap the two quadwords so { op1[1], op2[1] } lands in the low half.
        Jit->SHUFPD(dst, R(dst), 0x1);
        RI.fregs[dst] = I;
        fregNormalRegClear(RI, I);
    }
    break;
}
case CInt32:
case CInt16: {
if (!thisUsed) break;

View File

@ -157,13 +157,24 @@ namespace IREmitter {
LoadFReg,
FSMul,
FSAdd,
FSSub,
FSNeg,
FPAdd,
FPMul,
FPSub,
FPNeg,
FDMul,
FDAdd,
FDSub,
FDNeg,
FPMerge00,
FPMerge01,
FPMerge10,
FPMerge11,
FResult_End,
StorePaired,
StoreSingle,
StoreDouble,
StoreFReg,
// "Trinary" operators
@ -380,6 +391,9 @@ namespace IREmitter {
InstLoc EmitStoreSingle(InstLoc value, InstLoc addr) {
return FoldBiOp(StoreSingle, value, addr);
}
// Builds a StoreDouble instruction (value, addr) through FoldBiOp and
// returns the resulting instruction location.
InstLoc EmitStoreDouble(InstLoc value, InstLoc addr) {
    InstLoc result = FoldBiOp(StoreDouble, value, addr);
    return result;
}
InstLoc EmitStorePaired(InstLoc value, InstLoc addr, unsigned quantReg) {
return FoldBiOp(StorePaired, value, addr, quantReg);
}
@ -410,6 +424,24 @@ namespace IREmitter {
InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) {
return FoldBiOp(FSAdd, op1, op2);
}
// Builds an FSSub instruction (op1, op2) through FoldBiOp and returns the
// resulting instruction location.
InstLoc EmitFSSub(InstLoc op1, InstLoc op2) {
    InstLoc result = FoldBiOp(FSSub, op1, op2);
    return result;
}
// Builds an FSNeg instruction on op1 through FoldUOp and returns the
// resulting instruction location.
InstLoc EmitFSNeg(InstLoc op1) {
    InstLoc result = FoldUOp(FSNeg, op1);
    return result;
}
// Builds an FDMul instruction (op1, op2) through FoldBiOp and returns the
// resulting instruction location.
InstLoc EmitFDMul(InstLoc op1, InstLoc op2) {
    InstLoc result = FoldBiOp(FDMul, op1, op2);
    return result;
}
// Builds an FDAdd instruction (op1, op2) through FoldBiOp and returns the
// resulting instruction location.
InstLoc EmitFDAdd(InstLoc op1, InstLoc op2) {
    InstLoc result = FoldBiOp(FDAdd, op1, op2);
    return result;
}
// Builds an FDSub instruction (op1, op2) through FoldBiOp and returns the
// resulting instruction location.
InstLoc EmitFDSub(InstLoc op1, InstLoc op2) {
    InstLoc result = FoldBiOp(FDSub, op1, op2);
    return result;
}
// Builds an FDNeg instruction on op1 through FoldUOp and returns the
// resulting instruction location.
InstLoc EmitFDNeg(InstLoc op1) {
    InstLoc result = FoldUOp(FDNeg, op1);
    return result;
}
InstLoc EmitFPAdd(InstLoc op1, InstLoc op2) {
return FoldBiOp(FPAdd, op1, op2);
}
@ -419,6 +451,18 @@ namespace IREmitter {
InstLoc EmitFPSub(InstLoc op1, InstLoc op2) {
return FoldBiOp(FPSub, op1, op2);
}
// Builds an FPMerge00 instruction (op1, op2) through FoldBiOp and returns
// the resulting instruction location.
InstLoc EmitFPMerge00(InstLoc op1, InstLoc op2) {
    InstLoc result = FoldBiOp(FPMerge00, op1, op2);
    return result;
}
// Builds an FPMerge01 instruction (op1, op2) through FoldBiOp and returns
// the resulting instruction location.
InstLoc EmitFPMerge01(InstLoc op1, InstLoc op2) {
    InstLoc result = FoldBiOp(FPMerge01, op1, op2);
    return result;
}
// Builds an FPMerge10 instruction (op1, op2) through FoldBiOp and returns
// the resulting instruction location.
InstLoc EmitFPMerge10(InstLoc op1, InstLoc op2) {
    InstLoc result = FoldBiOp(FPMerge10, op1, op2);
    return result;
}
// Builds an FPMerge11 instruction (op1, op2) through FoldBiOp and returns
// the resulting instruction location.
InstLoc EmitFPMerge11(InstLoc op1, InstLoc op2) {
    InstLoc result = FoldBiOp(FPMerge11, op1, op2);
    return result;
}
InstLoc EmitFPNeg(InstLoc op1) {
return FoldUOp(FPNeg, op1);
}

View File

@ -28,31 +28,40 @@
void Jit64::fp_arith_s(UGeckoInstruction inst)
{
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) {
if (inst.Rc || inst.OPCD != 59 || (inst.SUBOP5 != 25 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21)) {
Default(inst); return;
}
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
val = ibuild.EmitDoubleToSingle(val);
bool dupe = inst.OPCD == 59;
switch (inst.SUBOP5)
{
case 25: //mul
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
val = ibuild.EmitFDMul(val, ibuild.EmitLoadFReg(inst.FC));
break;
case 18: //div
case 20: //sub
val = ibuild.EmitFDSub(val, ibuild.EmitLoadFReg(inst.FB));
break;
case 21: //add
val = ibuild.EmitFDAdd(val, ibuild.EmitLoadFReg(inst.FB));
break;
case 23: //sel
case 24: //res
default:
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
}
if (inst.OPCD == 59) {
val = ibuild.EmitDoubleToSingle(val);
val = ibuild.EmitDupSingleToMReg(val);
} else {
val = ibuild.EmitInsertDoubleInMReg(val, ibuild.EmitLoadFReg(inst.FD));
}
ibuild.EmitStoreFReg(val, inst.FD);
}
void Jit64::fmaddXX(UGeckoInstruction inst)
{
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 29) {
if (inst.Rc || inst.OPCD != 59) {
Default(inst); return;
}
@ -61,7 +70,12 @@
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
val = ibuild.EmitDoubleToSingle(val);
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
if (inst.SUBOP5 & 1)
val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
else
val = ibuild.EmitFSSub(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
if (inst.SUBOP5 & 2)
val = ibuild.EmitFSNeg(val);
val = ibuild.EmitDupSingleToMReg(val);
ibuild.EmitStoreFReg(val, inst.FD);
}
@ -77,58 +91,7 @@
}
void Jit64::fcmpx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (jo.fpAccurateFlags)
{
Default(inst);
return;
}
bool ordered = inst.SUBOP10 == 32;
/*
double fa = rPS0(_inst.FA);
double fb = rPS0(_inst.FB);
u32 compareResult;
if(IsNAN(fa) || IsNAN(fb)) compareResult = 1;
else if(fa < fb) compareResult = 8;
else if(fa > fb) compareResult = 4;
else compareResult = 2;
FPSCR.FPRF = compareResult;
CR = (CR & (~(0xf0000000 >> (_inst.CRFD * 4)))) | (compareResult << ((7 - _inst.CRFD) * 4));
*/
int a = inst.FA;
int b = inst.FB;
int crf = inst.CRFD;
int shift = crf * 4;
//FPSCR
//XOR(32,R(EAX),R(EAX));
fpr.Lock(a,b);
if (a != b)
fpr.LoadToX64(a, true);
// USES_CR
if (ordered)
COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
else
UCOMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
FixupBranch pLesser = J_CC(CC_B);
FixupBranch pGreater = J_CC(CC_A);
// _x86Reg == 0
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
FixupBranch continue1 = J();
// _x86Reg > 0
SetJumpTarget(pGreater);
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
FixupBranch continue2 = J();
// _x86Reg < 0
SetJumpTarget(pLesser);
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
SetJumpTarget(continue1);
SetJumpTarget(continue2);
fpr.UnlockAll();
}

View File

@ -65,122 +65,27 @@ void Jit64::lfs(UGeckoInstruction inst)
void Jit64::lfd(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int d = inst.RD;
int a = inst.RA;
if (!a)
{
Default(inst);
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), val;
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
val = ibuild.EmitLoadFReg(inst.RD);
val = ibuild.EmitInsertDoubleInMReg(ibuild.EmitLoadDouble(addr), val);
ibuild.EmitStoreFReg(val, inst.RD);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
MOV(32, R(ABI_PARAM1), gpr.R(a));
// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
fpr.LoadToX64(d, true);
fpr.Lock(d);
X64Reg xd = fpr.RX(d);
if (cpu_info.bSSSE3) {
#ifdef _M_X64
MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
MOVSD(xd, R(XMM0));
} else {
#ifdef _M_X64
MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
BSWAP(64, EAX);
MOV(64, M(&temp64), R(EAX));
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
BSWAP(32, EAX);
MOV(32, M((void*)((u32)&temp64+4)), R(EAX));
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
#if 0
// Alternate implementation; possibly faster
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
PSHUFLW(XMM0, R(XMM0), 0x1B);
PSRLW(XMM0, 8);
MOVSD(xd, R(XMM0));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
PSHUFLW(XMM0, R(XMM0), 0x1B);
PSLLW(XMM0, 8);
POR(xd, R(XMM0));
#endif
#endif
}
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::stfd(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int s = inst.RS;
int a = inst.RA;
if (!a)
{
Default(inst);
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16),
val = ibuild.EmitLoadFReg(inst.RS);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
if (inst.OPCD & 1)
ibuild.EmitStoreGReg(addr, inst.RA);
ibuild.EmitStoreDouble(val, addr);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
fpr.Lock(s);
MOV(32, R(ABI_PARAM1), gpr.R(a));
#ifdef _M_IX86
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
#endif
if (cpu_info.bSSSE3) {
MOVAPD(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void *)bswapShuffle1x8));
#ifdef _M_X64
MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, offset), XMM0);
#else
MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base + offset), XMM0);
#endif
} else {
#ifdef _M_X64
fpr.LoadToX64(s, true, false);
MOVSD(M(&temp64), fpr.RX(s));
MOV(64, R(EAX), M(&temp64));
BSWAP(64, EAX);
MOV(64, MComplex(RBX, ABI_PARAM1, SCALE_1, offset), R(EAX));
#else
fpr.LoadToX64(s, true, false);
MOVSD(M(&temp64), fpr.RX(s));
MOV(32, R(EAX), M(&temp64));
BSWAP(32, EAX);
MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4), R(EAX));
MOV(32, R(EAX), M((void*)((u32)&temp64 + 4)));
BSWAP(32, EAX);
MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset), R(EAX));
#endif
}
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
void Jit64::stfs(UGeckoInstruction inst)

View File

@ -270,42 +270,35 @@
//TODO: find easy cases and optimize them, do a breakout like ps_arith
void Jit64::ps_mergeXX(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int a = inst.FA;
int b = inst.FB;
fpr.Lock(a,b,d);
MOVAPD(XMM0, fpr.R(a));
IREmitter::InstLoc val = ibuild.EmitCompactMRegToPacked(ibuild.EmitLoadFReg(inst.FA)),
rhs = ibuild.EmitCompactMRegToPacked(ibuild.EmitLoadFReg(inst.FB));
switch (inst.SUBOP10)
{
case 528:
UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf
val = ibuild.EmitFPMerge00(val, rhs);
break; //00
case 560:
SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here
val = ibuild.EmitFPMerge01(val, rhs);
break; //01
case 592:
SHUFPD(XMM0, fpr.R(b), 1);
val = ibuild.EmitFPMerge10(val, rhs);
break; //10
case 624:
UNPCKHPD(XMM0, fpr.R(b));
val = ibuild.EmitFPMerge11(val, rhs);
break; //11
default:
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
}
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), Gen::R(XMM0));
fpr.UnlockAll();
val = ibuild.EmitExpandPackedToMReg(val);
ibuild.EmitStoreFReg(val, inst.FD);
}
//TODO: add optimized cases
void Jit64::ps_maddXX(UGeckoInstruction inst)
{
if (inst.Rc || (inst.SUBOP5 != 28 && inst.SUBOP5 != 29 && inst.SUBOP5 != 30)) {