Merge pull request #173 from delroth/movbe
Optimize memory access on Haswell by using MOVBE when possible.
Commit a823edcc5b
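Note: MOVBE performs a byte-swapping load or store in a single instruction. That maps directly onto the JIT's most common memory pattern: the emulated PowerPC is big-endian, so every guest memory access on a little-endian x86 host needs a byte swap. A minimal before/after sketch using the emitter calls from this diff (illustrative, not an excerpt):

```cpp
// Without MOVBE: load, then swap (two instructions; stores swap then store).
MOV(32, R(EAX), MComplex(RBX, RCX, SCALE_1, 0));
BSWAP(32, EAX);

// With MOVBE (Atom / Haswell and later): one instruction does both.
MOVBE(32, R(EAX), MComplex(RBX, RCX, SCALE_1, 0));
```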
@@ -44,6 +44,7 @@ struct CPUInfo
 	bool bAES;
 	// FXSAVE/FXRSTOR
 	bool bFXSR;
+	bool bMOVBE;
 	// This flag indicates that the hardware supports some mode
 	// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
 	// TODO: ARM
@@ -159,6 +159,7 @@ void CPUInfo::Detect()
 		if ((cpu_id[2] >> 9) & 1) bSSSE3 = true;
 		if ((cpu_id[2] >> 19) & 1) bSSE4_1 = true;
 		if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true;
+		if ((cpu_id[2] >> 22) & 1) bMOVBE = true;
 		if ((cpu_id[2] >> 25) & 1) bAES = true;
 
 		// To check DAZ support, we first need to check FXSAVE support.
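Note: bit 22 of ECX from CPUID leaf 1 is the architecturally defined MOVBE feature flag, alongside the SSE4/AES bits tested on the neighboring lines. A self-contained check under GCC/Clang (a sketch, not part of the diff):

```cpp
#include <cpuid.h>
#include <cstdio>

int main()
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    // Leaf 1 returns the feature flags; ECX bit 22 = MOVBE,
    // matching the test added above.
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && ((ecx >> 22) & 1))
        std::puts("MOVBE supported");
    else
        std::puts("MOVBE not supported");
    return 0;
}
```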
@@ -263,6 +264,7 @@ std::string CPUInfo::Summarize()
 	if (bAVX) sum += ", AVX";
 	if (bFMA) sum += ", FMA";
 	if (bAES) sum += ", AES";
+	if (bMOVBE) sum += ", MOVBE";
 	if (bLongMode) sum += ", 64-bit support";
 	return sum;
 }
@@ -804,6 +804,38 @@ void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
 	src.WriteRest(this);
 }
 
+void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src)
+{
+	_assert_msg_(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
+	if (bits == 8)
+	{
+		MOV(bits, dest, src);
+		return;
+	}
+
+	if (bits == 16)
+		Write8(0x66);
+
+	if (dest.IsSimpleReg())
+	{
+		_assert_msg_(DYNA_REC, !src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem");
+		src.WriteRex(this, bits, bits, dest.GetSimpleReg());
+		Write8(0x0F); Write8(0x38); Write8(0xF0);
+		src.WriteRest(this, 0, dest.GetSimpleReg());
+	}
+	else if (src.IsSimpleReg())
+	{
+		_assert_msg_(DYNA_REC, !dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem");
+		dest.WriteRex(this, bits, bits, src.GetSimpleReg());
+		Write8(0x0F); Write8(0x38); Write8(0xF1);
+		dest.WriteRest(this, 0, src.GetSimpleReg());
+	}
+	else
+	{
+		_assert_msg_(DYNA_REC, 0, "MOVBE: Not loading or storing to mem");
+	}
+}
+
 void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
 {
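Note: the opcode bytes match the Intel SDM encodings — 0F 38 F0 /r for the load form (register destination), 0F 38 F1 /r for the store form, with the usual 0x66 prefix selecting 16-bit operand size. MOVBE has no 8-bit and no register-to-register form, hence the plain-MOV fallback and the assertions. A hedged sketch of what the emitter should produce, assuming a zero displacement is encoded with mod=00:

```cpp
// ModRM 0x03 encodes reg=EAX with a [rbx] memory operand.
MOVBE(32, R(EAX), MDisp(RBX, 0));  // 0F 38 F0 03    (movbe eax, [rbx])
MOVBE(32, MDisp(RBX, 0), R(EAX));  // 0F 38 F1 03    (movbe [rbx], eax)
MOVBE(16, R(EAX), MDisp(RBX, 0));  // 66 0F 38 F0 03 (movbe ax, [rbx])
```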
@@ -427,6 +427,9 @@ public:
 	void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
 	void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
+	// Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
+	void MOVBE(int dbits, const OpArg& dest, const OpArg& src);
+
 	// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
 	void STMXCSR(OpArg memloc);
 	void LDMXCSR(OpArg memloc);
@@ -314,8 +314,7 @@ void Jit64::stX(UGeckoInstruction inst)
 		else if (Memory::IsRAMAddress(addr))
 		{
 			MOV(32, R(EAX), gpr.R(s));
-			BSWAP(accessSize, EAX);
-			WriteToConstRamAddress(accessSize, R(EAX), addr);
+			WriteToConstRamAddress(accessSize, EAX, addr, true);
 			if (update)
 				gpr.SetImmediate32(a, addr);
 			return;
@@ -344,10 +343,10 @@ void Jit64::stX(UGeckoInstruction inst)
 		gpr.FlushLockX(ABI_PARAM1);
 		MOV(32, R(ABI_PARAM1), gpr.R(a));
 		MOV(32, R(EAX), gpr.R(s));
-		BSWAP(32, EAX);
 #if _M_X86_64
-		MOV(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), R(EAX));
+		SwapAndStore(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), EAX);
 #else
+		BSWAP(32, EAX);
 		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
 		MOV(accessSize, MDisp(ABI_PARAM1, (u32)Memory::base + (u32)offset), R(EAX));
 #endif
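Note: this is the transformation repeated across the remaining hunks — an explicit BSWAP plus MOV pair collapses into one SwapAndStore (or LoadAndSwap) helper call, which picks the best sequence for the host at JIT-compile time. Roughly, for a store (illustrative):

```cpp
// What the helper emits, depending on the host CPU:
//   MOVBE host: movbe [rbx+param1+off], eax          (one instruction)
//   fallback:   bswap eax ; mov [rbx+param1+off], eax
SwapAndStore(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), EAX);
```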
@@ -456,8 +455,7 @@ void Jit64::lmw(UGeckoInstruction inst)
 	ADD(32, R(EAX), gpr.R(inst.RA));
 	for (int i = inst.RD; i < 32; i++)
 	{
-		MOV(32, R(ECX), MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
-		BSWAP(32, ECX);
+		LoadAndSwap(32, ECX, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
 		gpr.BindToRegister(i, false, true);
 		MOV(32, gpr.R(i), R(ECX));
 	}
@@ -481,8 +479,7 @@ void Jit64::stmw(UGeckoInstruction inst)
 	for (int i = inst.RD; i < 32; i++)
 	{
 		MOV(32, R(ECX), gpr.R(i));
-		BSWAP(32, ECX);
-		MOV(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), R(ECX));
+		SwapAndStore(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), ECX);
 	}
 	gpr.UnlockAllX();
 #else
|
|
@ -96,8 +96,7 @@ void Jit64::lfd(UGeckoInstruction inst)
|
|||
MOVSD(xd, R(XMM0));
|
||||
} else {
|
||||
#if _M_X86_64
|
||||
MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
|
||||
BSWAP(64, EAX);
|
||||
LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
|
||||
MOV(64, M(&temp64), R(EAX));
|
||||
|
||||
MEMCHECK_START
|
||||
|
|
|
@@ -21,15 +21,11 @@ void CommonAsmRoutines::GenFifoWrite(int size)
 	PUSH(ESI);
 	if (size != 32)
 		PUSH(EDX);
-	BSWAP(size, ABI_PARAM1);
 	MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
 	MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-	if (size != 32) {
-		MOV(32, R(EDX), R(ABI_PARAM1));
-		MOV(size, MComplex(RAX, RSI, 1, 0), R(EDX));
-	} else {
-		MOV(size, MComplex(RAX, RSI, 1, 0), R(ABI_PARAM1));
-	}
+
+	SwapAndStore(size, MComplex(RAX, RSI, 1, 0), ABI_PARAM1);
+
 	ADD(32, R(ESI), Imm8(size >> 3));
 	MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
 	if (size != 32)
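Note one caller-visible detail: on hosts without MOVBE, SwapAndStore byte-swaps the source register in place before storing, while the MOVBE path leaves it untouched. Callers such as this FIFO routine should therefore treat the register as clobbered after the call (a sketch of the contract, not diff content):

```cpp
SwapAndStore(size, MComplex(RAX, RSI, 1, 0), ABI_PARAM1);
// ABI_PARAM1 may now hold the byte-swapped value; reload it before reuse.
```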
@@ -45,10 +41,9 @@ void CommonAsmRoutines::GenFifoFloatWrite()
 	PUSH(EDX);
 	MOVSS(M(&temp32), XMM0);
 	MOV(32, R(EDX), M(&temp32));
-	BSWAP(32, EDX);
 	MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
 	MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-	MOV(32, MComplex(RAX, RSI, 1, 0), R(EDX));
+	SwapAndStore(32, MComplex(RAX, RSI, 1, 0), EDX);
 	ADD(32, R(ESI), Imm8(4));
 	MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
 	POP(EDX);
@@ -150,8 +145,7 @@ void CommonAsmRoutines::GenQuantizedStores()
 	TEST(32, R(ECX), Imm32(0x0C000000));
 	FixupBranch too_complex = J_CC(CC_NZ, true);
 	MOV(64, R(RAX), M(&psTemp[0]));
-	BSWAP(64, RAX);
-	MOV(64, MComplex(RBX, RCX, SCALE_1, 0), R(RAX));
+	SwapAndStore(64, MComplex(RBX, RCX, SCALE_1, 0), RAX);
 	FixupBranch skip_complex = J(true);
 	SetJumpTarget(too_complex);
 	ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
@@ -371,8 +365,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 		PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
 	} else {
 #if _M_X86_64
-		MOV(64, R(RCX), MComplex(RBX, RCX, 1, 0));
-		BSWAP(64, RCX);
+		LoadAndSwap(64, RCX, MComplex(RBX, RCX, 1, 0));
 		ROL(64, R(RCX), Imm8(32));
 		MOVQ_xmm(XMM0, R(RCX));
 #else
@@ -16,6 +16,32 @@ using namespace Gen;
 static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 static u32 GC_ALIGNED16(float_buffer);
 
+void EmuCodeBlock::LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src)
+{
+	if (cpu_info.bMOVBE)
+	{
+		MOVBE(size, R(dst), src);
+	}
+	else
+	{
+		MOV(size, R(dst), src);
+		BSWAP(size, dst);
+	}
+}
+
+void EmuCodeBlock::SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src)
+{
+	if (cpu_info.bMOVBE)
+	{
+		MOVBE(size, dst, R(src));
+	}
+	else
+	{
+		BSWAP(size, src);
+		MOV(size, dst, R(src));
+	}
+}
+
 void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
 {
 #if _M_X86_64
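Note: both code paths in these helpers have the same observable effect; only the instruction count differs. What LoadAndSwap(32, ...) and SwapAndStore(32, ...) compute, written as plain C++ (a hedged model, not emitted code):

```cpp
#include <cstdint>

// Big-endian 32-bit load from guest memory, host-endian result.
static inline uint32_t load_be32(const uint8_t* p)
{
    return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
           (uint32_t(p[2]) << 8)  |  uint32_t(p[3]);
}

// Big-endian 32-bit store: the inverse operation.
static inline void store_be32(uint8_t* p, uint32_t v)
{
    p[0] = uint8_t(v >> 24); p[1] = uint8_t(v >> 16);
    p[2] = uint8_t(v >> 8);  p[3] = uint8_t(v);
}
```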
@@ -513,12 +539,15 @@ void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr, u32 re
 	}
 }
 
-void EmuCodeBlock::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
+void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap)
 {
 #if _M_X86_64
-	MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
+	if (swap)
+		SwapAndStore(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
+	else
+		MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), R(arg));
 #else
-	MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), arg);
+	MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), R(arg));
 #endif
 }
 
@@ -25,6 +25,9 @@ namespace MMIO { class Mapping; }
 class EmuCodeBlock : public Gen::X64CodeBlock
 {
 public:
+	void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
+	void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
+
 	void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
 	void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset);
 	// these return the address of the MOV, for backpatching
@@ -47,7 +50,7 @@ public:
 	// Trashes both inputs and EAX.
 	void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, u32 registersInUse, int flags = 0);
 
-	void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
+	void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
 	void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
 	void JitClearCA();
 	void JitSetCA();