diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 84bc430f3b..59b764389c 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -1286,9 +1286,7 @@ void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) {
 }
 
 void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) {
-	if (arg.IsSimpleReg())
-		PanicAlert("Emitter: MOVQ_xmm doesn't support single registers as destination");
-	if (src > 7)
+	if (src > 7 || arg.IsSimpleReg())
 	{
 		// Alternate encoding
 		// This does not display correctly in MSVC's debugger, it thinks it's a MOVD
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
index 2a8c3d6072..0184205f1f 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -88,7 +88,7 @@ static GekkoOPTemplate primarytable[] =
 	{51, &Jit64::FallBackToInterpreter}, //"lfdu",  OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
 
 	{52, &Jit64::stfs},                  //"stfs",  OPTYPE_STOREFP, FL_IN_A}},
-	{53, &Jit64::stfs},                  //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
+	{53, &Jit64::FallBackToInterpreter}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
 	{54, &Jit64::stfd},                  //"stfd",  OPTYPE_STOREFP, FL_IN_A}},
 	{55, &Jit64::FallBackToInterpreter}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
index b580cd2205..03c5a88cd0 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
@@ -314,38 +314,6 @@ void Jit64::stX(UGeckoInstruction inst)
 		}
 	}
 
-	// Optimized stack access?
-	if (accessSize == 32 && !gpr.R(a).IsImm() && a == 1 && js.st.isFirstBlockOfFunction && jo.optimizeStack)
-	{
-		gpr.FlushLockX(ABI_PARAM1);
-		MOV(32, R(ABI_PARAM1), gpr.R(a));
-		MOV(32, R(EAX), gpr.R(s));
-		SwapAndStore(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), EAX);
-		if (update && offset)
-		{
-			gpr.Lock(a);
-			gpr.KillImmediate(a, true, true);
-			ADD(32, gpr.R(a), Imm32(offset));
-			gpr.UnlockAll();
-		}
-		gpr.UnlockAllX();
-		return;
-	}
-
-	/* // TODO - figure out why Beyond Good and Evil hates this
-	#if defined(_WIN32) && _M_X86_64
-	if (accessSize == 32 && !update)
-	{
-		// Fast and daring - requires 64-bit
-		MOV(32, R(EAX), gpr.R(s));
-		gpr.BindToRegister(a, true, false);
-		SwapAndStore(32, MComplex(RBX, gpr.RX(a), SCALE_1, (u32)offset), EAX);
-		return;
-	}
-	#endif*/
-
-	//Still here? Do regular path.
-
 	gpr.FlushLockX(ECX, EDX);
 	gpr.Lock(s, a);
 	MOV(32, R(EDX), gpr.R(a));
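Note: both removed fast paths in stX() wrote straight through the RBX-based fastmem view with SwapAndStore, bypassing the MMU and exception handling that SafeWriteRegToReg provides; the regular path below is now the only path. As a standalone illustration (toy code, not Dolphin's emitter API; Swap32 and the ram array are made-up names), this is the store semantics every path must preserve, since GameCube/Wii memory is big-endian:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint32_t Swap32(uint32_t v)
    {
        return (v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24);
    }

    int main()
    {
        uint8_t ram[8] = {};               // stand-in for the emulated memory view
        uint32_t value = 0x12345678;       // guest register value
        uint32_t swapped = Swap32(value);  // what SwapAndStore/BSWAP produce
        std::memcpy(ram, &swapped, 4);
        std::printf("%02x %02x %02x %02x\n", ram[0], ram[1], ram[2], ram[3]); // 12 34 56 78
    }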
@@ -415,15 +383,16 @@ void Jit64::lmw(UGeckoInstruction inst)
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreOff);
 
+	// TODO: This doesn't handle rollback on DSI correctly
 	gpr.FlushLockX(ECX);
-	MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
+	MOV(32, R(ECX), Imm32((u32)(s32)inst.SIMM_16));
 	if (inst.RA)
-		ADD(32, R(EAX), gpr.R(inst.RA));
+		ADD(32, R(ECX), gpr.R(inst.RA));
 	for (int i = inst.RD; i < 32; i++)
 	{
-		LoadAndSwap(32, ECX, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
+		SafeLoadToReg(EAX, R(ECX), 32, (i - inst.RD) * 4, RegistersInUse(), false);
 		gpr.BindToRegister(i, false, true);
-		MOV(32, gpr.R(i), R(ECX));
+		MOV(32, gpr.R(i), R(EAX));
 	}
 	gpr.UnlockAllX();
 }
@@ -433,14 +402,16 @@ void Jit64::stmw(UGeckoInstruction inst)
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreOff);
 
+	// TODO: This doesn't handle rollback on DSI correctly
 	gpr.FlushLockX(ECX);
-	MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
-	if (inst.RA)
-		ADD(32, R(EAX), gpr.R(inst.RA));
 	for (int i = inst.RD; i < 32; i++)
 	{
+		if (inst.RA)
+			MOV(32, R(EAX), gpr.R(inst.RA));
+		else
+			XOR(32, R(EAX), R(EAX));
 		MOV(32, R(ECX), gpr.R(i));
-		SwapAndStore(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), ECX);
+		SafeWriteRegToReg(ECX, EAX, 32, (i - inst.RD) * 4 + (u32)(s32)inst.SIMM_16, RegistersInUse());
 	}
 	gpr.UnlockAllX();
 }
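Note: lmw/stmw now issue one SafeLoadToReg/SafeWriteRegToReg per register instead of raw fastmem accesses, with the new TODOs admitting that a DSI partway through still leaves earlier iterations committed. An interpreter-style sketch of what lmw computes per iteration, EA = (RA ? r[RA] : 0) + SIMM16 + 4*(i - RD) (toy code; ReadU32BigEndian and the ram array are made-up stand-ins for Dolphin's memory accessors):

    #include <cstdint>

    static uint8_t g_ram[128];  // toy stand-in for emulated RAM

    static uint32_t ReadU32BigEndian(uint32_t ea)
    {
        const uint8_t* p = g_ram + ea;
        return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
               (uint32_t(p[2]) << 8) | p[3];
    }

    static void lmw(uint32_t* gpr, int rd, int ra, int16_t simm)
    {
        uint32_t base = (ra ? gpr[ra] : 0) + (int32_t)simm;
        for (int i = rd; i < 32; i++)
            gpr[i] = ReadU32BigEndian(base + 4 * (i - rd));
    }

    int main()
    {
        uint32_t gpr[32] = {};
        g_ram[3] = 0x2A;     // big-endian word 0x0000002A at EA 0
        lmw(gpr, 29, 1, 0);  // loads r29..r31
        return gpr[29] == 0x2A ? 0 : 1;
    }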
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
index 57f3b6a2e5..328523ae80 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@@ -2,9 +2,6 @@
 // Licensed under GPLv2
 // Refer to the license.txt file included.
 
-// TODO(ector): Tons of pshufb optimization of the loads/stores, for SSSE3+, possibly SSE4, only.
-// Should give a very noticeable speed boost to paired single heavy code.
-
 #include "Common/Common.h"
 #include "Common/CPUDetect.h"
 
@@ -12,20 +9,8 @@
 #include "Core/PowerPC/Jit64/JitAsm.h"
 #include "Core/PowerPC/Jit64/JitRegCache.h"
 
-namespace {
-
-// pshufb todo: MOVQ
-const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15};
-const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
-
-u64 GC_ALIGNED16(temp64);
-
-}
-
 // TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
 // and pshufb could help a lot.
-// Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves.
 
 void Jit64::lfs(UGeckoInstruction inst)
 {
@@ -40,12 +25,11 @@ void Jit64::lfs(UGeckoInstruction inst)
 
 	SafeLoadToReg(EAX, gpr.R(a), 32, offset, RegistersInUse(), false);
 
-	MEMCHECK_START
-
 	fpr.Lock(d);
-	fpr.BindToRegister(d, false);
-	ConvertSingleToDouble(fpr.RX(d), EAX, true);
+	fpr.BindToRegister(d, js.memcheck);
 
+	MEMCHECK_START
+	ConvertSingleToDouble(fpr.RX(d), EAX, true);
 	MEMCHECK_END
 
 	fpr.UnlockAll();
@@ -56,61 +40,23 @@ void Jit64::lfd(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreFloatingOff);
-	FALLBACK_IF(js.memcheck || !inst.RA);
+	FALLBACK_IF(!inst.RA);
 
 	int d = inst.RD;
 	int a = inst.RA;
 	s32 offset = (s32)(s16)inst.SIMM_16;
-	gpr.FlushLockX(ABI_PARAM1);
-	gpr.Lock(a);
-	MOV(32, R(ABI_PARAM1), gpr.R(a));
-	// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
+
+	SafeLoadToReg(RAX, gpr.R(a), 64, offset, RegistersInUse(), false);
+
 	fpr.Lock(d);
 	fpr.BindToRegister(d, true);
-	X64Reg xd = fpr.RX(d);
-	if (cpu_info.bSSSE3)
-	{
-#if _M_X86_64
-		MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
-#else
-		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
-		MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
-#endif
-		PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
-		MOVSD(xd, R(XMM0));
-	} else {
-#if _M_X86_64
-		LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
-		MOV(64, M(&temp64), R(EAX));
+	MEMCHECK_START
+	MOVQ_xmm(XMM0, R(RAX));
+	MOVSD(fpr.RX(d), R(XMM0));
+	MEMCHECK_END
 
-		MEMCHECK_START
-
-		MOVSD(XMM0, M(&temp64));
-		MOVSD(xd, R(XMM0));
-
-		MEMCHECK_END
-#else
-		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
-		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
-		BSWAP(32, EAX);
-		MOV(32, M((void*)((u8 *)&temp64+4)), R(EAX));
-
-		MEMCHECK_START
-
-		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
-		BSWAP(32, EAX);
-		MOV(32, M(&temp64), R(EAX));
-		MOVSD(XMM0, M(&temp64));
-		MOVSD(xd, R(XMM0));
-
-		MEMCHECK_END
-#endif
-	}
-
-	gpr.UnlockAll();
-	gpr.UnlockAllX();
 	fpr.UnlockAll();
 }
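Note: lfd collapses the SSSE3, x86-32, and temp64 bounce-buffer variants into a single 64-bit SafeLoadToReg followed by MOVQ into XMM0; the register-to-register MOVSD then writes only the low quadword, so the destination's upper double stays unmodified, which is exactly what the removed TODO worried about. A standalone sketch of the 64-bit byteswap a big-endian lfd load needs, equivalent to what BSWAP(64, reg) produces after the widened load (toy code, not Dolphin's emitter):

    #include <cstdint>

    static uint64_t Swap64(uint64_t v)
    {
        v = ((v & 0x00FF00FF00FF00FFULL) << 8)  | ((v >> 8)  & 0x00FF00FF00FF00FFULL);
        v = ((v & 0x0000FFFF0000FFFFULL) << 16) | ((v >> 16) & 0x0000FFFF0000FFFFULL);
        return (v << 32) | (v >> 32);
    }

    int main()
    {
        return Swap64(0x0102030405060708ULL) == 0x0807060504030201ULL ? 0 : 1;
    }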
@@ -119,146 +65,49 @@ void Jit64::stfd(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreFloatingOff);
-	FALLBACK_IF(js.memcheck || !inst.RA);
+	FALLBACK_IF(!inst.RA);
 
 	int s = inst.RS;
 	int a = inst.RA;
 
-	u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
-	if (Core::g_CoreStartupParameter.bMMU ||
-		Core::g_CoreStartupParameter.bTLBHack) {
-		mem_mask |= Memory::ADDR_MASK_MEM1;
-	}
-#ifdef ENABLE_MEM_CHECK
-	if (Core::g_CoreStartupParameter.bEnableDebugging)
-	{
-		mem_mask |= Memory::EXRAM_MASK;
-	}
-#endif
-
 	gpr.FlushLockX(ABI_PARAM1);
-	gpr.Lock(a);
-	fpr.Lock(s);
-	gpr.BindToRegister(a, true, false);
+	MOV(32, R(ABI_PARAM1), gpr.R(a));
+
+	if (fpr.R(s).IsSimpleReg())
+		MOVQ_xmm(R(RAX), fpr.RX(s));
+	else
+		MOV(64, R(RAX), fpr.R(s));
 
 	s32 offset = (s32)(s16)inst.SIMM_16;
-	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
-	TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
-	FixupBranch safe = J_CC(CC_NZ);
+	SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, RegistersInUse());
 
-	// Fast routine
-	if (cpu_info.bSSSE3) {
-		MOVAPD(XMM0, fpr.R(s));
-		PSHUFB(XMM0, M((void*)bswapShuffle1x8));
-#if _M_X86_64
-		MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
-#else
-		AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
-		MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
-#endif
-	} else {
-		MOVAPD(XMM0, fpr.R(s));
-		MOVD_xmm(R(EAX), XMM0);
-		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);
-
-		PSRLQ(XMM0, 32);
-		MOVD_xmm(R(EAX), XMM0);
-		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
-	}
-	FixupBranch exit = J(true);
-	SetJumpTarget(safe);
-
-	// Safe but slow routine
-	MOVAPD(XMM0, fpr.R(s));
-	PSRLQ(XMM0, 32);
-	MOVD_xmm(R(EAX), XMM0);
-	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));
-
-	MOVAPD(XMM0, fpr.R(s));
-	MOVD_xmm(R(EAX), XMM0);
-	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
-	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());
-
-	SetJumpTarget(exit);
-
-	gpr.UnlockAll();
 	gpr.UnlockAllX();
-	fpr.UnlockAll();
 }
 
-// In Release on 32bit build,
-// this seemed to cause a problem with PokePark2
-// at start after talking to first pokemon,
-// you run and smash a box, then he goes on about
-// following him and then you cant do anything.
-// I have enabled interpreter for this function
-// in the mean time.
-// Parlane
 void Jit64::stfs(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreFloatingOff);
+	FALLBACK_IF(!inst.RA);
 
-	bool update = inst.OPCD & 1;
 	int s = inst.RS;
 	int a = inst.RA;
 	s32 offset = (s32)(s16)inst.SIMM_16;
 
-	FALLBACK_IF(!a || update);
-
 	fpr.BindToRegister(s, true, false);
 	ConvertDoubleToSingle(XMM0, fpr.RX(s));
-
-	if (gpr.R(a).IsImm())
-	{
-		u32 addr = (u32)(gpr.R(a).offset + offset);
-		if (Memory::IsRAMAddress(addr))
-		{
-			if (cpu_info.bSSSE3) {
-				PSHUFB(XMM0, M((void *)bswapShuffle1x4));
-				WriteFloatToConstRamAddress(XMM0, addr);
-				return;
-			}
-		}
-		else if (addr == 0xCC008000)
-		{
-			// Float directly to write gather pipe! Fun!
-			CALL((void*)asm_routines.fifoDirectWriteFloat);
-			// TODO
-			js.fifoBytesThisBlock += 4;
-			return;
-		}
-	}
-
-	gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
-	gpr.Lock(a);
-	MOV(32, R(ABI_PARAM2), gpr.R(a));
-	ADD(32, R(ABI_PARAM2), Imm32(offset));
-	if (update && offset)
-	{
-		// We must flush immediate values from the following register because
-		// it may take another value at runtime if no MMU exception has been raised
-		gpr.KillImmediate(a, true, true);
-
-		MEMCHECK_START
-
-		MOV(32, gpr.R(a), R(ABI_PARAM2));
-
-		MEMCHECK_END
-	}
-
-	SafeWriteFloatToReg(XMM0, ABI_PARAM2, RegistersInUse());
-	gpr.UnlockAll();
-	gpr.UnlockAllX();
+	gpr.FlushLockX(ABI_PARAM1);
+	MOV(32, R(ABI_PARAM1), gpr.R(a));
+	SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, RegistersInUse());
 	fpr.UnlockAll();
+	gpr.UnlockAllX();
 }
 
-
 void Jit64::stfsx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreFloatingOff);
 
-	// We can take a shortcut here - it's not likely that a hardware access would use this instruction.
 	gpr.FlushLockX(ABI_PARAM1);
 	MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
 	if (inst.RA)
@@ -268,14 +117,11 @@ void Jit64::stfsx(UGeckoInstruction inst)
 	fpr.Lock(s);
 	fpr.BindToRegister(s, true, false);
 	ConvertDoubleToSingle(XMM0, fpr.RX(s));
-	MOVD_xmm(R(EAX), XMM0);
-	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse());
-
-	gpr.UnlockAllX();
+	SafeWriteF32ToReg(XMM0, ABI_PARAM1, 0, RegistersInUse());
 	fpr.UnlockAll();
+	gpr.UnlockAllX();
 }
 
-
 void Jit64::lfsx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
@@ -283,30 +129,17 @@ void Jit64::lfsx(UGeckoInstruction inst)
 
 	MOV(32, R(EAX), gpr.R(inst.RB));
 	if (inst.RA)
-	{
 		ADD(32, R(EAX), gpr.R(inst.RA));
-	}
+
+	SafeLoadToReg(EAX, R(EAX), 32, 0, RegistersInUse(), false);
+
 	fpr.Lock(inst.RS);
-	fpr.BindToRegister(inst.RS, false);
-	X64Reg s = fpr.RX(inst.RS);
-	if (cpu_info.bSSSE3 && !js.memcheck) {
-#if _M_X86_32
-		AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
-		MOVD_xmm(XMM0, MDisp(EAX, (u32)Memory::base));
-#else
-		MOVD_xmm(XMM0, MComplex(RBX, EAX, SCALE_1, 0));
-#endif
-		PSHUFB(XMM0, M((void *)bswapShuffle1x4));
-		ConvertSingleToDouble(s, XMM0);
-	} else {
-		SafeLoadToReg(EAX, R(EAX), 32, 0, RegistersInUse(), false);
+	fpr.BindToRegister(inst.RS, js.memcheck);
 
-		MEMCHECK_START
+	MEMCHECK_START
+	ConvertSingleToDouble(fpr.RX(inst.RS), EAX, true);
+	MEMCHECK_END
 
-		ConvertSingleToDouble(s, EAX, true);
-
-		MEMCHECK_END
-	}
 	fpr.UnlockAll();
+	gpr.UnlockAllX();
 }
-
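Note: stfs and stfsx now funnel through the new SafeWriteF32ToReg helper and lose the const-address and gather-pipe special cases, while stfsu goes back to the interpreter via the table change above, since the inline update path is gone. A standalone sketch of what the helper stores, the float's raw 32-bit pattern; memcpy is the portable equivalent of MOVD_xmm before the 32-bit safe write (toy code, not Dolphin's API):

    #include <cstdint>
    #include <cstring>

    static uint32_t FloatBits(float f)
    {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));  // MOVD_xmm(R(EAX), xmm_value)
        return bits;                           // then SafeWriteRegToReg(..., 32, ...)
    }

    int main()
    {
        return FloatBits(1.0f) == 0x3F800000u ? 0 : 1;
    }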
diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
index c4a5ed44d7..c1f1aacd40 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -1118,7 +1118,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
 		if (!thisUsed) break;
 		X64Reg reg = fregFindFreeReg(RI);
 		Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
-		RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
+		RI.Jit->SafeLoadToReg(ECX, R(ECX), 32, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
 		Jit->MOVD_xmm(reg, R(ECX));
 		RI.fregs[reg] = I;
 		regNormalRegClear(RI, I);
@@ -1127,30 +1127,10 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
 	case LoadDouble: {
 		if (!thisUsed) break;
 		X64Reg reg = fregFindFreeReg(RI);
-		if (cpu_info.bSSSE3) {
-			static const u32 GC_ALIGNED16(maskSwapa64_1[4]) =
-			{0x04050607L, 0x00010203L, 0xFFFFFFFFL, 0xFFFFFFFFL};
-#if _M_X86_64
-			// TODO: Remove regEnsureInReg() and use ECX
-			X64Reg address = regEnsureInReg(RI, getOp1(I));
-			Jit->MOVQ_xmm(reg, MComplex(RBX, address, SCALE_1, 0));
-#else
-			X64Reg address = regBinLHSReg(RI, I);
-			Jit->AND(32, R(address), Imm32(Memory::MEMVIEW32_MASK));
-			Jit->MOVQ_xmm(reg, MDisp(address, (u32)Memory::base));
-#endif
-			Jit->PSHUFB(reg, M((void*)maskSwapa64_1));
-		} else {
-			const OpArg loc = regLocForInst(RI, getOp1(I));
-			Jit->MOV(32, R(ECX), loc);
-			Jit->ADD(32, R(ECX), Imm8(4));
-			RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
-			Jit->MOVD_xmm(reg, R(ECX));
-			Jit->MOV(32, R(ECX), loc);
-			RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
-			Jit->MOVD_xmm(XMM0, R(ECX));
-			Jit->PUNPCKLDQ(reg, R(XMM0));
-		}
+		const OpArg loc = regLocForInst(RI, getOp1(I));
+		Jit->MOV(32, R(ECX), loc);
+		RI.Jit->SafeLoadToReg(RCX, R(ECX), 64, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
+		Jit->MOVQ_xmm(reg, R(RCX));
 		RI.fregs[reg] = I;
 		regNormalRegClear(RI, I);
 		break;
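Note: the JitIL LoadSingle/LoadDouble cases now share SafeLoadToReg with SAFE_LOADSTORE_NO_FASTMEM; the 64-bit variant leaves the byteswapped double's bits in RCX, and MOVQ_xmm moves them into the XMM register. A standalone sketch of that final bit-level move; memcpy is the portable equivalent (toy code, not Dolphin's API):

    #include <cstdint>
    #include <cstring>

    static double DoubleFromBits(uint64_t bits)
    {
        double d;
        std::memcpy(&d, &bits, sizeof(d));  // MOVQ_xmm(reg, R(RCX))
        return d;
    }

    int main()
    {
        return DoubleFromBits(0x3FF0000000000000ULL) == 1.0 ? 0 : 1;
    }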
@@ -1196,67 +1176,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
 	}
 	case StoreDouble: {
 		regSpill(RI, EAX);
-		// Please fix the following code
-		// if SafeWriteRegToReg() is modified.
-		u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
-		if (Core::g_CoreStartupParameter.bMMU ||
-			Core::g_CoreStartupParameter.bTLBHack) {
-			mem_mask |= Memory::ADDR_MASK_MEM1;
-		}
-#ifdef ENABLE_MEM_CHECK
-		if (Core::g_CoreStartupParameter.bEnableDebugging)
-		{
-			mem_mask |= Memory::EXRAM_MASK;
-		}
-#endif
-		Jit->TEST(32, regLocForInst(RI, getOp2(I)), Imm32(mem_mask));
-		FixupBranch safe = Jit->J_CC(CC_NZ);
-		// Fast routine
-		if (cpu_info.bSSSE3) {
-			static const u32 GC_ALIGNED16(maskSwapa64_1[4]) =
-			{0x04050607L, 0x00010203L, 0xFFFFFFFFL, 0xFFFFFFFFL};
-			X64Reg value = fregBinLHSRegWithMov(RI, I);
-			Jit->PSHUFB(value, M((void*)maskSwapa64_1));
-			Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
-#if _M_X86_64
-			Jit->MOVQ_xmm(MComplex(RBX, ECX, SCALE_1, 0), value);
-#else
-			Jit->AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
-			Jit->MOVQ_xmm(MDisp(ECX, (u32)Memory::base), value);
-#endif
-		} else {
-			regSpill(RI, EAX);
-			OpArg loc = fregLocForInst(RI, getOp1(I));
-			if (!loc.IsSimpleReg() || !(RI.IInfo[I - RI.FirstI] & 4)) {
-				Jit->MOVAPD(XMM0, loc);
-				loc = R(XMM0);
-			}
-			Jit->MOVD_xmm(R(EAX), loc.GetSimpleReg());
-			Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
-			RI.Jit->UnsafeWriteRegToReg(EAX, ECX, 32, 4);
-
-			Jit->PSRLQ(loc.GetSimpleReg(), 32);
-			Jit->MOVD_xmm(R(EAX), loc.GetSimpleReg());
-			Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
-			RI.Jit->UnsafeWriteRegToReg(EAX, ECX, 32, 0);
-		}
-		FixupBranch exit = Jit->J(true);
-		Jit->SetJumpTarget(safe);
-		// Safe but slow routine
-		OpArg value = fregLocForInst(RI, getOp1(I));
-		OpArg address = regLocForInst(RI, getOp2(I));
-		Jit->MOVAPD(XMM0, value);
-		Jit->PSRLQ(XMM0, 32);
-		Jit->MOVD_xmm(R(EAX), XMM0);
-		Jit->MOV(32, R(ECX), address);
-		RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
-
-		Jit->MOVAPD(XMM0, value);
-		Jit->MOVD_xmm(R(EAX), XMM0);
-		Jit->MOV(32, R(ECX), address);
-		RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 4, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
-		Jit->SetJumpTarget(exit);
+		OpArg value = fregLocForInst(RI, getOp1(I));
+		OpArg address = regLocForInst(RI, getOp2(I));
+		Jit->MOVAPD(XMM0, value);
+		Jit->MOVQ_xmm(R(RAX), XMM0);
+		Jit->MOV(32, R(ECX), address);
+		RI.Jit->SafeWriteRegToReg(RAX, ECX, 64, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
 
 		if (RI.IInfo[I - RI.FirstI] & 4)
 			fregClearInst(RI, getOp1(I));
diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.h b/Source/Core/Core/PowerPC/Jit64IL/JitIL.h
index c824bffb00..eeb75f76fc 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.h
+++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.h
@@ -93,8 +93,6 @@ public:
 	void WriteCallInterpreter(UGeckoInstruction _inst);
 	void Cleanup();
 
-	void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
-	void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
 	void GenerateCarry(Gen::X64Reg temp_reg);
 
 	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
index 2e3d5701aa..403ed9e6b0 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
@@ -266,7 +266,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 
 	// Easy!
 	const u8* storeSingleFloat = AlignCode4();
-	SafeWriteFloatToReg(XMM0, ECX, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
+	SafeWriteF32ToReg(XMM0, ECX, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	RET();
 	/*
 	if (cpu_info.bSSSE3) {
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index fa4b2959ab..58348d19cb 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -101,7 +101,7 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
 		if (accessSize == 8 && signExtend)
 			MOVSX(32, accessSize, reg_value, MComplex(RBX, opAddress.GetSimpleReg(), SCALE_1, offset));
 		else
-			MOVZX(32, accessSize, reg_value, MComplex(RBX, opAddress.GetSimpleReg(), SCALE_1, offset));
+			MOVZX(64, accessSize, reg_value, MComplex(RBX, opAddress.GetSimpleReg(), SCALE_1, offset));
 	}
 	else
 	{
@@ -110,7 +110,7 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
 		if (accessSize == 8 && signExtend)
 			MOVSX(32, accessSize, reg_value, MComplex(RBX, reg_value, SCALE_1, offset));
 		else
-			MOVZX(32, accessSize, reg_value, MComplex(RBX, reg_value, SCALE_1, offset));
+			MOVZX(64, accessSize, reg_value, MComplex(RBX, reg_value, SCALE_1, offset));
 	}
 #else
 	if (opAddress.IsImm())
@@ -151,6 +151,10 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
 	case 32:
 		BSWAP(32, reg_value);
 		break;
+
+	case 64:
+		BSWAP(64, reg_value);
+		break;
 	}
 
 	return result;
@@ -272,6 +276,8 @@ void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
 	}
 }
 
+// Always clobbers EAX. Preserves the address.
+// Preserves the value if the load fails and js.memcheck is enabled.
 void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags)
 {
 	if (!jit->js.memcheck)
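Note: UnsafeLoadToReg is widened to 64-bit accesses; a MOVZX with a 64-bit destination keeps the 8/16-bit cases zero-extended into the full register so the BSWAP(64) that can now follow operates on defined upper bits (presumably the emitter lowers the 32/64-bit cases to plain MOVs, since 32-bit writes already zero-extend on x86-64). A standalone sketch of the zero-extend-then-byteswap semantics for each access size (toy code, mirroring the MOVZX + BSWAP pair):

    #include <cstdint>
    #include <cstring>

    template <typename T>
    static uint64_t LoadZXSwap(const uint8_t* mem)
    {
        T v;
        std::memcpy(&v, mem, sizeof(v));        // MOVZX(64, accessSize, ...) load
        uint64_t r = 0;
        for (size_t i = 0; i < sizeof(T); i++)  // BSWAP(accessSize, reg)
            r = (r << 8) | ((v >> (8 * i)) & 0xFF);
        return r;                               // zero-extended to 64 bits
    }

    int main()
    {
        const uint8_t mem[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        return LoadZXSwap<uint32_t>(mem) == 0x01020304u &&
               LoadZXSwap<uint64_t>(mem) == 0x0102030405060708ULL ? 0 : 1;
    }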
@@ -325,7 +331,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress
 		{
 			UnsafeLoadToReg(reg_value, opAddress, accessSize, offset, signExtend);
 		}
-		else if (!Core::g_CoreStartupParameter.bMMU && MMIO::IsMMIOAddress(address))
+		else if (!Core::g_CoreStartupParameter.bMMU && MMIO::IsMMIOAddress(address) && accessSize != 64)
 		{
 			MMIOLoadToReg(Memory::mmio_mapping, reg_value, registersInUse,
 			              address, accessSize, signExtend);
@@ -335,6 +341,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress
 			ABI_PushRegistersAndAdjustStack(registersInUse, false);
 			switch (accessSize)
 			{
+			case 64: ABI_CallFunctionC((void *)&Memory::Read_U64, address); break;
 			case 32: ABI_CallFunctionC((void *)&Memory::Read_U32, address); break;
 			case 16: ABI_CallFunctionC((void *)&Memory::Read_U16_ZX, address); break;
 			case 8:  ABI_CallFunctionC((void *)&Memory::Read_U8_ZX, address); break;
@@ -350,7 +357,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress
 			}
 			else if (reg_value != EAX)
 			{
-				MOVZX(32, accessSize, reg_value, R(EAX));
+				MOVZX(64, accessSize, reg_value, R(EAX));
 			}
 
 			MEMCHECK_END
@@ -372,6 +379,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress
 		ABI_PushRegistersAndAdjustStack(registersInUse, false);
 		switch (accessSize)
 		{
+		case 64: ABI_CallFunctionA((void *)&Memory::Read_U64, addr_loc); break;
 		case 32: ABI_CallFunctionA((void *)&Memory::Read_U32, addr_loc); break;
 		case 16: ABI_CallFunctionA((void *)&Memory::Read_U16_ZX, addr_loc); break;
 		case 8:  ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc); break;
@@ -387,7 +395,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress
 		}
 		else if (reg_value != EAX)
 		{
-			MOVZX(32, accessSize, reg_value, R(EAX));
+			MOVZX(64, accessSize, reg_value, R(EAX));
 		}
 
 		MEMCHECK_END
@@ -490,6 +498,7 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
 		ABI_PushRegistersAndAdjustStack(registersInUse, noProlog);
 		switch (accessSize)
 		{
+		case 64: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U64) : ((void *)&Memory::Write_U64_Swap), reg_value, reg_addr, false); break;
 		case 32: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), reg_value, reg_addr, false); break;
 		case 16: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), reg_value, reg_addr, false); break;
 		case 8:  ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, false); break;
@@ -501,43 +510,12 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
 	SetJumpTarget(exit);
 }
 
-void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr, u32 registersInUse, int flags)
+// Destroys both arg registers and EAX
+void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, u32 registersInUse, int flags)
 {
-	// FIXME
-	if (false && cpu_info.bSSSE3) {
-		// This path should be faster but for some reason it causes errors so I've disabled it.
-		u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
-
-		if (Core::g_CoreStartupParameter.bMMU || Core::g_CoreStartupParameter.bTLBHack)
-			mem_mask |= Memory::ADDR_MASK_MEM1;
-
-#ifdef ENABLE_MEM_CHECK
-		if (Core::g_CoreStartupParameter.bEnableDebugging)
-			mem_mask |= Memory::EXRAM_MASK;
-#endif
-		TEST(32, R(reg_addr), Imm32(mem_mask));
-		FixupBranch argh = J_CC(CC_Z);
-		MOVSS(M(&float_buffer), xmm_value);
-		LoadAndSwap(32, EAX, M(&float_buffer));
-		MOV(32, M(&PC), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
-		ABI_PushRegistersAndAdjustStack(registersInUse, false);
-		ABI_CallFunctionRR((void *)&Memory::Write_U32, EAX, reg_addr);
-		ABI_PopRegistersAndAdjustStack(registersInUse, false);
-		FixupBranch arg2 = J();
-		SetJumpTarget(argh);
-		PSHUFB(xmm_value, M((void *)pbswapShuffle1x4));
-#if _M_X86_64
-		MOVD_xmm(MComplex(RBX, reg_addr, SCALE_1, 0), xmm_value);
-#else
-		AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK));
-		MOVD_xmm(MDisp(reg_addr, (u32)Memory::base), xmm_value);
-#endif
-		SetJumpTarget(arg2);
-	} else {
-		MOVSS(M(&float_buffer), xmm_value);
-		MOV(32, R(EAX), M(&float_buffer));
-		SafeWriteRegToReg(EAX, reg_addr, 32, 0, registersInUse, flags);
-	}
+	// TODO: PSHUFB might be faster if fastmem supported MOVSS.
+	MOVD_xmm(R(EAX), xmm_value);
+	SafeWriteRegToReg(EAX, reg_addr, 32, offset, registersInUse, flags);
 }
 
 void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap)
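Note: the new 64-bit slow path in SafeWriteRegToReg dispatches on whether the value still needs swapping; as the ternary above reads, Write_U64 is called when the register holds a host-order value, and Write_U64_Swap when it is already byteswapped. A standalone toy sketch of that distinction, assuming Dolphin's convention that the *_Swap variants take a pre-swapped value (the function bodies and ram array here are illustrative, not Dolphin's):

    #include <cstdint>
    #include <cstring>

    static uint8_t g_ram[16];  // toy stand-in for emulated memory

    static void Write_U64(uint64_t v, uint32_t ea)       // swaps to big-endian
    {
        for (int i = 0; i < 8; i++)
            g_ram[ea + i] = uint8_t(v >> (56 - 8 * i));
    }

    static void Write_U64_Swap(uint64_t v, uint32_t ea)  // value already swapped
    {
        std::memcpy(g_ram + ea, &v, 8);
    }

    int main()
    {
        Write_U64(0x0102030405060708ULL, 0);
        return g_ram[0] == 0x01 && g_ram[7] == 0x08 ? 0 : 1;
    }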
@@ -555,16 +533,6 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg a
 #endif
 }
 
-void EmuCodeBlock::WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address)
-{
-#if _M_X86_64
-	MOV(32, R(RAX), Imm32(address));
-	MOVSS(MComplex(RBX, RAX, 1, 0), xmm_reg);
-#else
-	MOVSS(M((void*)((u32)Memory::base + (address & Memory::MEMVIEW32_MASK))), xmm_reg);
-#endif
-}
-
 void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm) {
 	// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
 	if (jit->jo.accurateSinglePrecision)
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index 24fa76a536..85a3320d0a 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -47,11 +47,9 @@ public:
 	void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags = 0);
 	void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0);
 
-	// Trashes both inputs and EAX.
-	void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, u32 registersInUse, int flags = 0);
+	void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
 
 	void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
-	void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
 	void JitClearCA();
 	void JitSetCA();
 	void JitClearCAOV(bool oe);
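Note: with WriteFloatToConstRamAddress gone, every floating-point store converges on the pattern this diff establishes for stfd: grab the double's 64 raw bits into a GPR, then issue one byteswapped 64-bit safe store instead of two 32-bit halves. A standalone recap of that end-to-end semantics (toy code, not Dolphin's emitter; StoreDoubleBigEndian and the ram array are made-up names):

    #include <cstdint>
    #include <cstring>

    static uint8_t g_ram[16];

    static void StoreDoubleBigEndian(double d, uint32_t ea)
    {
        uint64_t bits;
        std::memcpy(&bits, &d, sizeof(bits));  // MOVQ_xmm(R(RAX), xmm)
        for (int i = 0; i < 8; i++)            // SafeWriteRegToReg(..., 64, ...)
            g_ram[ea + i] = uint8_t(bits >> (56 - 8 * i));
    }

    int main()
    {
        StoreDoubleBigEndian(1.0, 0);
        return g_ram[0] == 0x3F && g_ram[1] == 0xF0 ? 0 : 1;
    }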