Merge pull request #173 from delroth/movbe

Optimize memory access on Haswell by using MOVBE when possible.
Pierre Bourdon 2014-04-11 23:44:46 +02:00
commit a823edcc5b
9 changed files with 86 additions and 27 deletions


@@ -44,6 +44,7 @@ struct CPUInfo
     bool bAES;
     // FXSAVE/FXRSTOR
     bool bFXSR;
+    bool bMOVBE;
     // This flag indicates that the hardware supports some mode
     // in which denormal inputs _and_ outputs are automatically set to (signed) zero.
     // TODO: ARM


@@ -159,6 +159,7 @@ void CPUInfo::Detect()
     if ((cpu_id[2] >> 9) & 1) bSSSE3 = true;
     if ((cpu_id[2] >> 19) & 1) bSSE4_1 = true;
     if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true;
+    if ((cpu_id[2] >> 22) & 1) bMOVBE = true;
     if ((cpu_id[2] >> 25) & 1) bAES = true;
     // To check DAZ support, we first need to check FXSAVE support.
@@ -263,6 +264,7 @@ std::string CPUInfo::Summarize()
     if (bAVX) sum += ", AVX";
     if (bFMA) sum += ", FMA";
     if (bAES) sum += ", AES";
+    if (bMOVBE) sum += ", MOVBE";
     if (bLongMode) sum += ", 64-bit support";
     return sum;
 }
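For context, both hunks hang off the standard CPUID protocol: leaf 1 is queried once and ECX bit 22 reports MOVBE support. The same check as a minimal standalone sketch, assuming a GCC/Clang toolchain (<cpuid.h>; MSVC would use __cpuid from <intrin.h> instead):

    #include <cpuid.h>

    // Equivalent of the Detect() line above: CPUID.(EAX=1):ECX[22]
    // is the MOVBE feature flag.
    static bool DetectMOVBE()
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return false; // leaf 1 unavailable
        return (ecx >> 22) & 1;
    }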


@@ -804,6 +804,38 @@ void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
     src.WriteRest(this);
 }
 
+void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src)
+{
+    _assert_msg_(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
+
+    if (bits == 8)
+    {
+        MOV(bits, dest, src);
+        return;
+    }
+
+    if (bits == 16)
+        Write8(0x66);
+
+    if (dest.IsSimpleReg())
+    {
+        _assert_msg_(DYNA_REC, !src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem");
+        src.WriteRex(this, bits, bits, dest.GetSimpleReg());
+        Write8(0x0F); Write8(0x38); Write8(0xF0);
+        src.WriteRest(this, 0, dest.GetSimpleReg());
+    }
+    else if (src.IsSimpleReg())
+    {
+        _assert_msg_(DYNA_REC, !dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem");
+        dest.WriteRex(this, bits, bits, src.GetSimpleReg());
+        Write8(0x0F); Write8(0x38); Write8(0xF1);
+        dest.WriteRest(this, 0, src.GetSimpleReg());
+    }
+    else
+    {
+        _assert_msg_(DYNA_REC, 0, "MOVBE: Not loading or storing to mem");
+    }
+}
+
 void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
 {
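The two opcodes emitted above are the documented MOVBE encodings: 0F 38 F0 /r for the load form (MOVBE r, m) and 0F 38 F1 /r for the store form (MOVBE m, r). The 0x66 prefix handles 16-bit operands and WriteRex() adds REX.W for 64-bit ones; there is no register-register or immediate form, which is exactly what the asserts enforce. Semantically, the 32-bit load form amounts to this host-side sketch (plain C++; __builtin_bswap32 is the GCC/Clang builtin, MSVC's _byteswap_ulong is the equivalent):

    #include <cstdint>
    #include <cstring>

    // Model of MOVBE(32, R(dst), mem): a 32-bit load whose bytes are
    // reversed in flight, i.e. the MOV+BSWAP pair fused into one instruction.
    static uint32_t movbe_load32(const void* src)
    {
        uint32_t v;
        std::memcpy(&v, src, sizeof(v)); // MOV dst, [mem]
        return __builtin_bswap32(v);     // BSWAP dst
    }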


@@ -427,6 +427,9 @@ public:
     void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
     void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
+    // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
+    void MOVBE(int dbits, const OpArg& dest, const OpArg& src);
+
     // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
     void STMXCSR(OpArg memloc);
     void LDMXCSR(OpArg memloc);


@@ -314,8 +314,7 @@ void Jit64::stX(UGeckoInstruction inst)
     else if (Memory::IsRAMAddress(addr))
     {
         MOV(32, R(EAX), gpr.R(s));
-        BSWAP(accessSize, EAX);
-        WriteToConstRamAddress(accessSize, R(EAX), addr);
+        WriteToConstRamAddress(accessSize, EAX, addr, true);
         if (update)
             gpr.SetImmediate32(a, addr);
         return;
@@ -344,10 +343,10 @@ void Jit64::stX(UGeckoInstruction inst)
     gpr.FlushLockX(ABI_PARAM1);
     MOV(32, R(ABI_PARAM1), gpr.R(a));
     MOV(32, R(EAX), gpr.R(s));
-    BSWAP(32, EAX);
 #if _M_X86_64
-    MOV(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), R(EAX));
+    SwapAndStore(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), EAX);
 #else
+    BSWAP(32, EAX);
     AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
     MOV(accessSize, MDisp(ABI_PARAM1, (u32)Memory::base + (u32)offset), R(EAX));
 #endif
@@ -456,8 +455,7 @@ void Jit64::lmw(UGeckoInstruction inst)
     ADD(32, R(EAX), gpr.R(inst.RA));
     for (int i = inst.RD; i < 32; i++)
     {
-        MOV(32, R(ECX), MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
-        BSWAP(32, ECX);
+        LoadAndSwap(32, ECX, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
         gpr.BindToRegister(i, false, true);
         MOV(32, gpr.R(i), R(ECX));
     }
@@ -481,8 +479,7 @@ void Jit64::stmw(UGeckoInstruction inst)
     for (int i = inst.RD; i < 32; i++)
     {
         MOV(32, R(ECX), gpr.R(i));
-        BSWAP(32, ECX);
-        MOV(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), R(ECX));
+        SwapAndStore(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), ECX);
     }
     gpr.UnlockAllX();
 #else
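lmw and stmw move a block of guest registers to or from big-endian guest memory, so each word costs one swap per iteration, which is why folding the swap into the memory access pays off here. Roughly what one lmw step computes, as a sketch in plain C++ (ram, ea and gpr are illustrative stand-ins, not Dolphin names):

    // One lmw (load multiple word) iteration per register: fetch a
    // big-endian 32-bit word and swap it into host order. LoadAndSwap
    // compiles this to one MOVBE on Atom/Haswell, MOV+BSWAP elsewhere.
    for (int i = rd; i < 32; i++)
    {
        uint32_t be_word;
        std::memcpy(&be_word, ram + ea + (i - rd) * 4, sizeof(be_word));
        gpr[i] = __builtin_bswap32(be_word);
    }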


@@ -96,8 +96,7 @@ void Jit64::lfd(UGeckoInstruction inst)
         MOVSD(xd, R(XMM0));
     } else {
 #if _M_X86_64
-        MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
-        BSWAP(64, EAX);
+        LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
         MOV(64, M(&temp64), R(EAX));
 
         MEMCHECK_START


@@ -21,15 +21,11 @@ void CommonAsmRoutines::GenFifoWrite(int size)
     PUSH(ESI);
     if (size != 32)
         PUSH(EDX);
-    BSWAP(size, ABI_PARAM1);
     MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
     MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-    if (size != 32) {
-        MOV(32, R(EDX), R(ABI_PARAM1));
-        MOV(size, MComplex(RAX, RSI, 1, 0), R(EDX));
-    } else {
-        MOV(size, MComplex(RAX, RSI, 1, 0), R(ABI_PARAM1));
-    }
+    SwapAndStore(size, MComplex(RAX, RSI, 1, 0), ABI_PARAM1);
     ADD(32, R(ESI), Imm8(size >> 3));
     MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
     if (size != 32)
@@ -45,10 +41,9 @@ void CommonAsmRoutines::GenFifoFloatWrite()
     PUSH(EDX);
     MOVSS(M(&temp32), XMM0);
     MOV(32, R(EDX), M(&temp32));
-    BSWAP(32, EDX);
     MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
     MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-    MOV(32, MComplex(RAX, RSI, 1, 0), R(EDX));
+    SwapAndStore(32, MComplex(RAX, RSI, 1, 0), EDX);
     ADD(32, R(ESI), Imm8(4));
     MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
     POP(EDX);
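Both fifo writers append one big-endian value to the GPFifo gather pipe and advance the byte count. A rough host-side model of the emitted 32-bit case; the buffer/count pair is passed in explicitly here rather than assuming the exact types of GPFifo::m_gatherPipe and m_gatherPipeCount:

    // What the generated fifo write does for size == 32: swap the value
    // to big-endian, append it at the current gather-pipe offset, and
    // bump the count. SwapAndStore performs the swap+store in one step.
    static void fifo_write32(uint8_t* gather_pipe, uint32_t& count, uint32_t value)
    {
        const uint32_t be = __builtin_bswap32(value);
        std::memcpy(gather_pipe + count, &be, sizeof(be));
        count += sizeof(be);
    }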
@@ -150,8 +145,7 @@ void CommonAsmRoutines::GenQuantizedStores()
     TEST(32, R(ECX), Imm32(0x0C000000));
     FixupBranch too_complex = J_CC(CC_NZ, true);
     MOV(64, R(RAX), M(&psTemp[0]));
-    BSWAP(64, RAX);
-    MOV(64, MComplex(RBX, RCX, SCALE_1, 0), R(RAX));
+    SwapAndStore(64, MComplex(RBX, RCX, SCALE_1, 0), RAX);
     FixupBranch skip_complex = J(true);
     SetJumpTarget(too_complex);
     ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
@@ -371,8 +365,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
         PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
     } else {
 #if _M_X86_64
-        MOV(64, R(RCX), MComplex(RBX, RCX, 1, 0));
-        BSWAP(64, RCX);
+        LoadAndSwap(64, RCX, MComplex(RBX, RCX, 1, 0));
         ROL(64, R(RCX), Imm8(32));
         MOVQ_xmm(XMM0, R(RCX));
 #else
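The ROL by 32 is what makes the 64-bit swap correct for paired loads: the quadword holds two big-endian 32-bit values, so a full 64-bit byte swap fixes each word's byte order but also exchanges the two halves, and the rotate puts them back. The arithmetic, as a quick sketch:

    // Two big-endian u32s packed in one u64: bswap64 corrects each word's
    // bytes but swaps their positions; a 32-bit rotate restores the order.
    static uint64_t load_be_pair(const void* src)
    {
        uint64_t v;
        std::memcpy(&v, src, sizeof(v));
        v = __builtin_bswap64(v);     // LoadAndSwap(64, ...)
        return (v << 32) | (v >> 32); // ROL(64, ..., Imm8(32))
    }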


@@ -16,6 +16,32 @@ using namespace Gen;
 static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 static u32 GC_ALIGNED16(float_buffer);
 
+void EmuCodeBlock::LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src)
+{
+    if (cpu_info.bMOVBE)
+    {
+        MOVBE(size, R(dst), src);
+    }
+    else
+    {
+        MOV(size, R(dst), src);
+        BSWAP(size, dst);
+    }
+}
+
+void EmuCodeBlock::SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src)
+{
+    if (cpu_info.bMOVBE)
+    {
+        MOVBE(size, dst, R(src));
+    }
+    else
+    {
+        BSWAP(size, src);
+        MOV(size, dst, R(src));
+    }
+}
+
 void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
 {
 #if _M_X86_64
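These two helpers are the heart of the change: every call site above funnels through them, so the MOVBE-versus-fallback decision lives in exactly one place. One behavioral detail worth keeping in mind at call sites (a hypothetical usage sketch; the register choices are illustrative): the fallback path of SwapAndStore BSWAPs src in place before storing, so the source register must be treated as clobbered on both paths to stay portable.

    // Hypothetical call sites inside a code generator:
    LoadAndSwap(32, EAX, MComplex(RBX, RCX, SCALE_1, 0));  // dst = swapped load
    SwapAndStore(32, MComplex(RBX, RCX, SCALE_1, 0), EAX); // EAX must be
                                                           // considered trashed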
@@ -513,12 +539,15 @@ void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr, u32 registersInUse, int flags)
     }
 }
 
-void EmuCodeBlock::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
+void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap)
 {
 #if _M_X86_64
-    MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
+    if (swap)
+        SwapAndStore(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
+    else
+        MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), R(arg));
 #else
-    MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), arg);
+    MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), R(arg));
 #endif
 }


@@ -25,6 +25,9 @@ namespace MMIO { class Mapping; }
 class EmuCodeBlock : public Gen::X64CodeBlock
 {
 public:
+    void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
+    void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
+
     void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
     void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset);
     // these return the address of the MOV, for backpatching
@@ -47,7 +50,7 @@ public:
     // Trashes both inputs and EAX.
     void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, u32 registersInUse, int flags = 0);
-    void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
+    void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
     void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
     void JitClearCA();
     void JitSetCA();