Merge pull request #443 from magumagu/loadstore-cleanup

Loadstore cleanup
This commit is contained in:
Lioncash 2014-06-26 21:32:59 -04:00
commit bd377b9580
9 changed files with 79 additions and 387 deletions

View File

@ -1286,9 +1286,7 @@ void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) {
}
void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) {
if (arg.IsSimpleReg())
PanicAlert("Emitter: MOVQ_xmm doesn't support single registers as destination");
if (src > 7)
if (src > 7 || arg.IsSimpleReg())
{
// Alternate encoding
// This does not display correctly in MSVC's debugger, it thinks it's a MOVD

View File

@ -88,7 +88,7 @@ static GekkoOPTemplate primarytable[] =
{51, &Jit64::FallBackToInterpreter}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
{52, &Jit64::stfs}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
{53, &Jit64::stfs}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{53, &Jit64::FallBackToInterpreter}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{54, &Jit64::stfd}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
{55, &Jit64::FallBackToInterpreter}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},

View File

@ -314,38 +314,6 @@ void Jit64::stX(UGeckoInstruction inst)
}
}
// Optimized stack access?
if (accessSize == 32 && !gpr.R(a).IsImm() && a == 1 && js.st.isFirstBlockOfFunction && jo.optimizeStack)
{
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(a));
MOV(32, R(EAX), gpr.R(s));
SwapAndStore(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), EAX);
if (update && offset)
{
gpr.Lock(a);
gpr.KillImmediate(a, true, true);
ADD(32, gpr.R(a), Imm32(offset));
gpr.UnlockAll();
}
gpr.UnlockAllX();
return;
}
/* // TODO - figure out why Beyond Good and Evil hates this
#if defined(_WIN32) && _M_X86_64
if (accessSize == 32 && !update)
{
// Fast and daring - requires 64-bit
MOV(32, R(EAX), gpr.R(s));
gpr.BindToRegister(a, true, false);
SwapAndStore(32, MComplex(RBX, gpr.RX(a), SCALE_1, (u32)offset), EAX);
return;
}
#endif*/
//Still here? Do regular path.
gpr.FlushLockX(ECX, EDX);
gpr.Lock(s, a);
MOV(32, R(EDX), gpr.R(a));
@ -415,15 +383,16 @@ void Jit64::lmw(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff);
// TODO: This doesn't handle rollback on DSI correctly
gpr.FlushLockX(ECX);
MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
MOV(32, R(ECX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
ADD(32, R(ECX), gpr.R(inst.RA));
for (int i = inst.RD; i < 32; i++)
{
LoadAndSwap(32, ECX, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
SafeLoadToReg(EAX, R(ECX), 32, (i - inst.RD) * 4, RegistersInUse(), false);
gpr.BindToRegister(i, false, true);
MOV(32, gpr.R(i), R(ECX));
MOV(32, gpr.R(i), R(EAX));
}
gpr.UnlockAllX();
}
@ -433,14 +402,16 @@ void Jit64::stmw(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff);
// TODO: This doesn't handle rollback on DSI correctly
gpr.FlushLockX(ECX);
MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
for (int i = inst.RD; i < 32; i++)
{
if (inst.RA)
MOV(32, R(EAX), gpr.R(inst.RA));
else
XOR(32, R(EAX), R(EAX));
MOV(32, R(ECX), gpr.R(i));
SwapAndStore(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), ECX);
SafeWriteRegToReg(ECX, EAX, 32, (i - inst.RD) * 4 + (u32)(s32)inst.SIMM_16, RegistersInUse());
}
gpr.UnlockAllX();
}

View File

@ -2,9 +2,6 @@
// Licensed under GPLv2
// Refer to the license.txt file included.
// TODO(ector): Tons of pshufb optimization of the loads/stores, for SSSE3+, possibly SSE4, only.
// Should give a very noticeable speed boost to paired single heavy code.
#include "Common/Common.h"
#include "Common/CPUDetect.h"
@ -12,20 +9,8 @@
#include "Core/PowerPC/Jit64/JitAsm.h"
#include "Core/PowerPC/Jit64/JitRegCache.h"
namespace {
// pshufb todo: MOVQ
const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
u64 GC_ALIGNED16(temp64);
}
// TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
// and pshufb could help a lot.
// Also add hacks for things like an lfs followed by an stfs of the same register, that is, simple memory moves.
void Jit64::lfs(UGeckoInstruction inst)
{
@ -40,12 +25,11 @@ void Jit64::lfs(UGeckoInstruction inst)
SafeLoadToReg(EAX, gpr.R(a), 32, offset, RegistersInUse(), false);
MEMCHECK_START
fpr.Lock(d);
fpr.BindToRegister(d, false);
ConvertSingleToDouble(fpr.RX(d), EAX, true);
fpr.BindToRegister(d, js.memcheck);
MEMCHECK_START
ConvertSingleToDouble(fpr.RX(d), EAX, true);
MEMCHECK_END
fpr.UnlockAll();
@ -56,61 +40,23 @@ void Jit64::lfd(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
FALLBACK_IF(js.memcheck || !inst.RA);
FALLBACK_IF(!inst.RA);
int d = inst.RD;
int a = inst.RA;
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
MOV(32, R(ABI_PARAM1), gpr.R(a));
// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
SafeLoadToReg(RAX, gpr.R(a), 64, offset, RegistersInUse(), false);
fpr.Lock(d);
fpr.BindToRegister(d, true);
X64Reg xd = fpr.RX(d);
if (cpu_info.bSSSE3)
{
#if _M_X86_64
MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
MOVSD(xd, R(XMM0));
} else {
#if _M_X86_64
LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
MOV(64, M(&temp64), R(EAX));
MEMCHECK_START
MOVQ_xmm(XMM0, R(RAX));
MOVSD(fpr.RX(d), R(XMM0));
MEMCHECK_END
MEMCHECK_START
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
MEMCHECK_END
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
BSWAP(32, EAX);
MOV(32, M((void*)((u8 *)&temp64+4)), R(EAX));
MEMCHECK_START
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
MEMCHECK_END
#endif
}
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
@ -119,146 +65,49 @@ void Jit64::stfd(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
FALLBACK_IF(js.memcheck || !inst.RA);
FALLBACK_IF(!inst.RA);
int s = inst.RS;
int a = inst.RA;
u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
if (Core::g_CoreStartupParameter.bMMU ||
Core::g_CoreStartupParameter.bTLBHack) {
mem_mask |= Memory::ADDR_MASK_MEM1;
}
#ifdef ENABLE_MEM_CHECK
if (Core::g_CoreStartupParameter.bEnableDebugging)
{
mem_mask |= Memory::EXRAM_MASK;
}
#endif
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
fpr.Lock(s);
gpr.BindToRegister(a, true, false);
MOV(32, R(ABI_PARAM1), gpr.R(a));
if (fpr.R(s).IsSimpleReg())
MOVQ_xmm(R(RAX), fpr.RX(s));
else
MOV(64, R(RAX), fpr.R(s));
s32 offset = (s32)(s16)inst.SIMM_16;
LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
FixupBranch safe = J_CC(CC_NZ);
SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, RegistersInUse());
// Fast routine
if (cpu_info.bSSSE3) {
MOVAPD(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void*)bswapShuffle1x8));
#if _M_X86_64
MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
#endif
} else {
MOVAPD(XMM0, fpr.R(s));
MOVD_xmm(R(EAX), XMM0);
UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);
PSRLQ(XMM0, 32);
MOVD_xmm(R(EAX), XMM0);
UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
}
FixupBranch exit = J(true);
SetJumpTarget(safe);
// Safe but slow routine
MOVAPD(XMM0, fpr.R(s));
PSRLQ(XMM0, 32);
MOVD_xmm(R(EAX), XMM0);
SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));
MOVAPD(XMM0, fpr.R(s));
MOVD_xmm(R(EAX), XMM0);
LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());
SetJumpTarget(exit);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
// In Release builds on 32-bit,
// this seemed to cause a problem with PokePark 2:
// at the start, after talking to the first Pokemon,
// you run and smash a box, then he goes on about
// following him, and then you can't do anything.
// I have enabled the interpreter for this function
// in the meantime.
// Parlane
void Jit64::stfs(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
FALLBACK_IF(!inst.RA);
bool update = inst.OPCD & 1;
int s = inst.RS;
int a = inst.RA;
s32 offset = (s32)(s16)inst.SIMM_16;
FALLBACK_IF(!a || update);
fpr.BindToRegister(s, true, false);
ConvertDoubleToSingle(XMM0, fpr.RX(s));
if (gpr.R(a).IsImm())
{
u32 addr = (u32)(gpr.R(a).offset + offset);
if (Memory::IsRAMAddress(addr))
{
if (cpu_info.bSSSE3) {
PSHUFB(XMM0, M((void *)bswapShuffle1x4));
WriteFloatToConstRamAddress(XMM0, addr);
return;
}
}
else if (addr == 0xCC008000)
{
// Float directly to write gather pipe! Fun!
CALL((void*)asm_routines.fifoDirectWriteFloat);
// TODO
js.fifoBytesThisBlock += 4;
return;
}
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
MOV(32, R(ABI_PARAM2), gpr.R(a));
ADD(32, R(ABI_PARAM2), Imm32(offset));
if (update && offset)
{
// We must flush immediate values from the following register because
// it may take another value at runtime if no MMU exception has been raised
gpr.KillImmediate(a, true, true);
MEMCHECK_START
MOV(32, gpr.R(a), R(ABI_PARAM2));
MEMCHECK_END
}
SafeWriteFloatToReg(XMM0, ABI_PARAM2, RegistersInUse());
gpr.UnlockAll();
gpr.UnlockAllX();
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(a));
SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, RegistersInUse());
fpr.UnlockAll();
gpr.UnlockAllX();
}
void Jit64::stfsx(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
// We can take a shortcut here - it's not likely that a hardware access would use this instruction.
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
if (inst.RA)
@ -268,14 +117,11 @@ void Jit64::stfsx(UGeckoInstruction inst)
fpr.Lock(s);
fpr.BindToRegister(s, true, false);
ConvertDoubleToSingle(XMM0, fpr.RX(s));
MOVD_xmm(R(EAX), XMM0);
SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse());
gpr.UnlockAllX();
SafeWriteF32ToReg(XMM0, ABI_PARAM1, 0, RegistersInUse());
fpr.UnlockAll();
gpr.UnlockAllX();
}
void Jit64::lfsx(UGeckoInstruction inst)
{
INSTRUCTION_START
@ -283,30 +129,17 @@ void Jit64::lfsx(UGeckoInstruction inst)
MOV(32, R(EAX), gpr.R(inst.RB));
if (inst.RA)
{
ADD(32, R(EAX), gpr.R(inst.RA));
}
SafeLoadToReg(EAX, R(EAX), 32, 0, RegistersInUse(), false);
fpr.Lock(inst.RS);
fpr.BindToRegister(inst.RS, false);
X64Reg s = fpr.RX(inst.RS);
if (cpu_info.bSSSE3 && !js.memcheck) {
#if _M_X86_32
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVD_xmm(XMM0, MDisp(EAX, (u32)Memory::base));
#else
MOVD_xmm(XMM0, MComplex(RBX, EAX, SCALE_1, 0));
#endif
PSHUFB(XMM0, M((void *)bswapShuffle1x4));
ConvertSingleToDouble(s, XMM0);
} else {
SafeLoadToReg(EAX, R(EAX), 32, 0, RegistersInUse(), false);
fpr.BindToRegister(inst.RS, js.memcheck);
MEMCHECK_START
MEMCHECK_START
ConvertSingleToDouble(fpr.RX(inst.RS), EAX, true);
MEMCHECK_END
ConvertSingleToDouble(s, EAX, true);
MEMCHECK_END
}
fpr.UnlockAll();
gpr.UnlockAllX();
}

View File

@ -1118,7 +1118,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
RI.Jit->SafeLoadToReg(ECX, R(ECX), 32, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVD_xmm(reg, R(ECX));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
@ -1127,30 +1127,10 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
case LoadDouble: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
if (cpu_info.bSSSE3) {
static const u32 GC_ALIGNED16(maskSwapa64_1[4]) =
{0x04050607L, 0x00010203L, 0xFFFFFFFFL, 0xFFFFFFFFL};
#if _M_X86_64
// TODO: Remove regEnsureInReg() and use ECX
X64Reg address = regEnsureInReg(RI, getOp1(I));
Jit->MOVQ_xmm(reg, MComplex(RBX, address, SCALE_1, 0));
#else
X64Reg address = regBinLHSReg(RI, I);
Jit->AND(32, R(address), Imm32(Memory::MEMVIEW32_MASK));
Jit->MOVQ_xmm(reg, MDisp(address, (u32)Memory::base));
#endif
Jit->PSHUFB(reg, M((void*)maskSwapa64_1));
} else {
const OpArg loc = regLocForInst(RI, getOp1(I));
Jit->MOV(32, R(ECX), loc);
Jit->ADD(32, R(ECX), Imm8(4));
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
Jit->MOVD_xmm(reg, R(ECX));
Jit->MOV(32, R(ECX), loc);
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
Jit->MOVD_xmm(XMM0, R(ECX));
Jit->PUNPCKLDQ(reg, R(XMM0));
}
const OpArg loc = regLocForInst(RI, getOp1(I));
Jit->MOV(32, R(ECX), loc);
RI.Jit->SafeLoadToReg(RCX, R(ECX), 64, 0, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVQ_xmm(reg, R(RCX));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
@ -1196,67 +1176,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
}
case StoreDouble: {
regSpill(RI, EAX);
// Please fix the following code
// if SafeWriteRegToReg() is modified.
u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
if (Core::g_CoreStartupParameter.bMMU ||
Core::g_CoreStartupParameter.bTLBHack) {
mem_mask |= Memory::ADDR_MASK_MEM1;
}
#ifdef ENABLE_MEM_CHECK
if (Core::g_CoreStartupParameter.bEnableDebugging)
{
mem_mask |= Memory::EXRAM_MASK;
}
#endif
Jit->TEST(32, regLocForInst(RI, getOp2(I)), Imm32(mem_mask));
FixupBranch safe = Jit->J_CC(CC_NZ);
// Fast routine
if (cpu_info.bSSSE3) {
static const u32 GC_ALIGNED16(maskSwapa64_1[4]) =
{0x04050607L, 0x00010203L, 0xFFFFFFFFL, 0xFFFFFFFFL};
X64Reg value = fregBinLHSRegWithMov(RI, I);
Jit->PSHUFB(value, M((void*)maskSwapa64_1));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
#if _M_X86_64
Jit->MOVQ_xmm(MComplex(RBX, ECX, SCALE_1, 0), value);
#else
Jit->AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
Jit->MOVQ_xmm(MDisp(ECX, (u32)Memory::base), value);
#endif
} else {
regSpill(RI, EAX);
OpArg loc = fregLocForInst(RI, getOp1(I));
if (!loc.IsSimpleReg() || !(RI.IInfo[I - RI.FirstI] & 4)) {
Jit->MOVAPD(XMM0, loc);
loc = R(XMM0);
}
Jit->MOVD_xmm(R(EAX), loc.GetSimpleReg());
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
RI.Jit->UnsafeWriteRegToReg(EAX, ECX, 32, 4);
Jit->PSRLQ(loc.GetSimpleReg(), 32);
Jit->MOVD_xmm(R(EAX), loc.GetSimpleReg());
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
RI.Jit->UnsafeWriteRegToReg(EAX, ECX, 32, 0);
}
FixupBranch exit = Jit->J(true);
Jit->SetJumpTarget(safe);
// Safe but slow routine
OpArg value = fregLocForInst(RI, getOp1(I));
OpArg address = regLocForInst(RI, getOp2(I));
Jit->MOVAPD(XMM0, value);
Jit->PSRLQ(XMM0, 32);
Jit->MOVD_xmm(R(EAX), XMM0);
Jit->MOV(32, R(ECX), address);
RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->MOVAPD(XMM0, value);
Jit->MOVD_xmm(R(EAX), XMM0);
Jit->MOV(32, R(ECX), address);
RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 4, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
Jit->SetJumpTarget(exit);
OpArg value = fregLocForInst(RI, getOp1(I));
OpArg address = regLocForInst(RI, getOp2(I));
Jit->MOVAPD(XMM0, value);
Jit->MOVQ_xmm(R(RAX), XMM0);
Jit->MOV(32, R(ECX), address);
RI.Jit->SafeWriteRegToReg(RAX, ECX, 64, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
if (RI.IInfo[I - RI.FirstI] & 4)
fregClearInst(RI, getOp1(I));

View File

@ -93,8 +93,6 @@ public:
void WriteCallInterpreter(UGeckoInstruction _inst);
void Cleanup();
void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
void GenerateCarry(Gen::X64Reg temp_reg);
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));

View File

@ -266,7 +266,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
// Easy!
const u8* storeSingleFloat = AlignCode4();
SafeWriteFloatToReg(XMM0, ECX, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
SafeWriteF32ToReg(XMM0, ECX, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
/*
if (cpu_info.bSSSE3) {

View File

@ -101,7 +101,7 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
if (accessSize == 8 && signExtend)
MOVSX(32, accessSize, reg_value, MComplex(RBX, opAddress.GetSimpleReg(), SCALE_1, offset));
else
MOVZX(32, accessSize, reg_value, MComplex(RBX, opAddress.GetSimpleReg(), SCALE_1, offset));
MOVZX(64, accessSize, reg_value, MComplex(RBX, opAddress.GetSimpleReg(), SCALE_1, offset));
}
else
{
@ -110,7 +110,7 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
if (accessSize == 8 && signExtend)
MOVSX(32, accessSize, reg_value, MComplex(RBX, reg_value, SCALE_1, offset));
else
MOVZX(32, accessSize, reg_value, MComplex(RBX, reg_value, SCALE_1, offset));
MOVZX(64, accessSize, reg_value, MComplex(RBX, reg_value, SCALE_1, offset));
}
#else
if (opAddress.IsImm())
@ -151,6 +151,10 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
case 32:
BSWAP(32, reg_value);
break;
case 64:
BSWAP(64, reg_value);
break;
}
return result;
@ -272,6 +276,8 @@ void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
}
}
// Always clobbers EAX. Preserves the address.
// Preserves the value if the load fails and js.memcheck is enabled.
void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags)
{
if (!jit->js.memcheck)
@ -325,7 +331,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
{
UnsafeLoadToReg(reg_value, opAddress, accessSize, offset, signExtend);
}
else if (!Core::g_CoreStartupParameter.bMMU && MMIO::IsMMIOAddress(address))
else if (!Core::g_CoreStartupParameter.bMMU && MMIO::IsMMIOAddress(address) && accessSize != 64)
{
MMIOLoadToReg(Memory::mmio_mapping, reg_value, registersInUse,
address, accessSize, signExtend);
@ -335,6 +341,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
ABI_PushRegistersAndAdjustStack(registersInUse, false);
switch (accessSize)
{
case 64: ABI_CallFunctionC((void *)&Memory::Read_U64, address); break;
case 32: ABI_CallFunctionC((void *)&Memory::Read_U32, address); break;
case 16: ABI_CallFunctionC((void *)&Memory::Read_U16_ZX, address); break;
case 8: ABI_CallFunctionC((void *)&Memory::Read_U8_ZX, address); break;
@ -350,7 +357,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
}
else if (reg_value != EAX)
{
MOVZX(32, accessSize, reg_value, R(EAX));
MOVZX(64, accessSize, reg_value, R(EAX));
}
MEMCHECK_END
@ -372,6 +379,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
ABI_PushRegistersAndAdjustStack(registersInUse, false);
switch (accessSize)
{
case 64: ABI_CallFunctionA((void *)&Memory::Read_U64, addr_loc); break;
case 32: ABI_CallFunctionA((void *)&Memory::Read_U32, addr_loc); break;
case 16: ABI_CallFunctionA((void *)&Memory::Read_U16_ZX, addr_loc); break;
case 8: ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc); break;
@ -387,7 +395,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
}
else if (reg_value != EAX)
{
MOVZX(32, accessSize, reg_value, R(EAX));
MOVZX(64, accessSize, reg_value, R(EAX));
}
MEMCHECK_END
@ -490,6 +498,7 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
ABI_PushRegistersAndAdjustStack(registersInUse, noProlog);
switch (accessSize)
{
case 64: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U64) : ((void *)&Memory::Write_U64_Swap), reg_value, reg_addr, false); break;
case 32: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), reg_value, reg_addr, false); break;
case 16: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), reg_value, reg_addr, false); break;
case 8: ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, false); break;
@ -501,43 +510,12 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
SetJumpTarget(exit);
}
void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr, u32 registersInUse, int flags)
// Destroys both arg registers and EAX
void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, u32 registersInUse, int flags)
{
// FIXME
if (false && cpu_info.bSSSE3) {
// This path should be faster but for some reason it causes errors so I've disabled it.
u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
if (Core::g_CoreStartupParameter.bMMU || Core::g_CoreStartupParameter.bTLBHack)
mem_mask |= Memory::ADDR_MASK_MEM1;
#ifdef ENABLE_MEM_CHECK
if (Core::g_CoreStartupParameter.bEnableDebugging)
mem_mask |= Memory::EXRAM_MASK;
#endif
TEST(32, R(reg_addr), Imm32(mem_mask));
FixupBranch argh = J_CC(CC_Z);
MOVSS(M(&float_buffer), xmm_value);
LoadAndSwap(32, EAX, M(&float_buffer));
MOV(32, M(&PC), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
ABI_PushRegistersAndAdjustStack(registersInUse, false);
ABI_CallFunctionRR((void *)&Memory::Write_U32, EAX, reg_addr);
ABI_PopRegistersAndAdjustStack(registersInUse, false);
FixupBranch arg2 = J();
SetJumpTarget(argh);
PSHUFB(xmm_value, M((void *)pbswapShuffle1x4));
#if _M_X86_64
MOVD_xmm(MComplex(RBX, reg_addr, SCALE_1, 0), xmm_value);
#else
AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK));
MOVD_xmm(MDisp(reg_addr, (u32)Memory::base), xmm_value);
#endif
SetJumpTarget(arg2);
} else {
MOVSS(M(&float_buffer), xmm_value);
MOV(32, R(EAX), M(&float_buffer));
SafeWriteRegToReg(EAX, reg_addr, 32, 0, registersInUse, flags);
}
// TODO: PSHUFB might be faster if fastmem supported MOVSS.
MOVD_xmm(R(EAX), xmm_value);
SafeWriteRegToReg(EAX, reg_addr, 32, offset, registersInUse, flags);
}
void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap)
@ -555,16 +533,6 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 a
#endif
}
void EmuCodeBlock::WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address)
{
#if _M_X86_64
MOV(32, R(RAX), Imm32(address));
MOVSS(MComplex(RBX, RAX, 1, 0), xmm_reg);
#else
MOVSS(M((void*)((u32)Memory::base + (address & Memory::MEMVIEW32_MASK))), xmm_reg);
#endif
}
void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm) {
// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
if (jit->jo.accurateSinglePrecision)

View File

@ -47,11 +47,9 @@ public:
void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags = 0);
void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0);
// Trashes both inputs and EAX.
void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, u32 registersInUse, int flags = 0);
void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
void JitClearCA();
void JitSetCA();
void JitClearCAOV(bool oe);