From 7ab820c6f81cd82aa5679fd9a9417ae667cdd38e Mon Sep 17 00:00:00 2001 From: Fiora Date: Sat, 26 Jul 2014 18:46:09 -0700 Subject: [PATCH] JIT: Various JitAsmCommon optimizations Use some SSE4 instructions on CPUs that support them. Use float instructions instead of int where appropriate (it's a cycle faster on CPUs with arithmetic unit forwarding penalties). --- .../PowerPC/Jit64/Jit_LoadStorePaired.cpp | 32 ++- .../Core/PowerPC/JitCommon/JitAsmCommon.cpp | 242 ++++++++++-------- 2 files changed, 163 insertions(+), 111 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp index 5f87d22ecb..b207de9ad4 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -22,9 +22,8 @@ void Jit64::psq_st(UGeckoInstruction inst) JITDISABLE(bJITLoadStorePairedOff); FALLBACK_IF(!inst.RA); - bool update = inst.OPCD == 61; - - int offset = inst.SIMM_12; + s32 offset = inst.SIMM_12; + bool update = inst.OPCD == 61 && offset; int a = inst.RA; int s = inst.RS; @@ -32,9 +31,16 @@ void Jit64::psq_st(UGeckoInstruction inst) if (update) gpr.BindToRegister(a, true, true); fpr.BindToRegister(s, true, false); - MOV(32, R(RSCRATCH_EXTRA), gpr.R(a)); - if (offset) - ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset)); + if (offset && gpr.R(a).IsSimpleReg()) + { + LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset)); + } + else + { + MOV(32, R(RSCRATCH_EXTRA), gpr.R(a)); + if (offset) + ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset)); + } // In memcheck mode, don't update the address until the exception check if (update && offset && !js.memcheck) MOV(32, gpr.R(a), R(RSCRATCH_EXTRA)); @@ -46,7 +52,7 @@ void Jit64::psq_st(UGeckoInstruction inst) AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + inst.I])); MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); - // FIXME: Fix ModR/M encoding to allow [RSCRATCH2*4+disp32] without a base register! 
+ // FIXME: Fix ModR/M encoding to allow [RSCRATCH2*8+disp32] without a base register! if (inst.W) { // One value @@ -77,18 +83,24 @@ void Jit64::psq_l(UGeckoInstruction inst) JITDISABLE(bJITLoadStorePairedOff); FALLBACK_IF(!inst.RA); - bool update = inst.OPCD == 57; - int offset = inst.SIMM_12; + s32 offset = inst.SIMM_12; + bool update = inst.OPCD == 57 && offset; int a = inst.RA; int s = inst.RS; gpr.FlushLockX(RSCRATCH_EXTRA); gpr.BindToRegister(a, true, update && offset); fpr.BindToRegister(s, false, true); - if (offset) + if (offset && gpr.R(a).IsSimpleReg()) + { LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset)); + } else + { MOV(32, R(RSCRATCH_EXTRA), gpr.R(a)); + if (offset) + ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset)); + } // In memcheck mode, don't update the address until the exception check if (update && offset && !js.memcheck) MOV(32, gpr.R(a), R(RSCRATCH_EXTRA)); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp index 8a5e7dcfe5..f76acc6ba7 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp @@ -184,47 +184,63 @@ static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, static const float GC_ALIGNED16(m_quantizeTableS[]) = { - (1ULL << 0), (1ULL << 1), (1ULL << 2), (1ULL << 3), - (1ULL << 4), (1ULL << 5), (1ULL << 6), (1ULL << 7), - (1ULL << 8), (1ULL << 9), (1ULL << 10), (1ULL << 11), - (1ULL << 12), (1ULL << 13), (1ULL << 14), (1ULL << 15), - (1ULL << 16), (1ULL << 17), (1ULL << 18), (1ULL << 19), - (1ULL << 20), (1ULL << 21), (1ULL << 22), (1ULL << 23), - (1ULL << 24), (1ULL << 25), (1ULL << 26), (1ULL << 27), - (1ULL << 28), (1ULL << 29), (1ULL << 30), (1ULL << 31), - 1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1ULL << 30), 1.0 / (1ULL << 29), - 1.0 / (1ULL << 28), 1.0 / (1ULL << 27), 1.0 / (1ULL << 26), 1.0 / (1ULL << 25), - 1.0 / (1ULL << 24), 1.0 / (1ULL << 23), 1.0 / (1ULL << 
22), 1.0 / (1ULL << 21), - 1.0 / (1ULL << 20), 1.0 / (1ULL << 19), 1.0 / (1ULL << 18), 1.0 / (1ULL << 17), - 1.0 / (1ULL << 16), 1.0 / (1ULL << 15), 1.0 / (1ULL << 14), 1.0 / (1ULL << 13), - 1.0 / (1ULL << 12), 1.0 / (1ULL << 11), 1.0 / (1ULL << 10), 1.0 / (1ULL << 9), - 1.0 / (1ULL << 8), 1.0 / (1ULL << 7), 1.0 / (1ULL << 6), 1.0 / (1ULL << 5), - 1.0 / (1ULL << 4), 1.0 / (1ULL << 3), 1.0 / (1ULL << 2), 1.0 / (1ULL << 1), + (1ULL << 0), (1ULL << 0), (1ULL << 1), (1ULL << 1), (1ULL << 2), (1ULL << 2), (1ULL << 3), (1ULL << 3), + (1ULL << 4), (1ULL << 4), (1ULL << 5), (1ULL << 5), (1ULL << 6), (1ULL << 6), (1ULL << 7), (1ULL << 7), + (1ULL << 8), (1ULL << 8), (1ULL << 9), (1ULL << 9), (1ULL << 10), (1ULL << 10), (1ULL << 11), (1ULL << 11), + (1ULL << 12), (1ULL << 12), (1ULL << 13), (1ULL << 13), (1ULL << 14), (1ULL << 14), (1ULL << 15), (1ULL << 15), + (1ULL << 16), (1ULL << 16), (1ULL << 17), (1ULL << 17), (1ULL << 18), (1ULL << 18), (1ULL << 19), (1ULL << 19), + (1ULL << 20), (1ULL << 20), (1ULL << 21), (1ULL << 21), (1ULL << 22), (1ULL << 22), (1ULL << 23), (1ULL << 23), + (1ULL << 24), (1ULL << 24), (1ULL << 25), (1ULL << 25), (1ULL << 26), (1ULL << 26), (1ULL << 27), (1ULL << 27), + (1ULL << 28), (1ULL << 28), (1ULL << 29), (1ULL << 29), (1ULL << 30), (1ULL << 30), (1ULL << 31), (1ULL << 31), + 1.0 / (1ULL << 32), 1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1ULL << 31), + 1.0 / (1ULL << 30), 1.0 / (1ULL << 30), 1.0 / (1ULL << 29), 1.0 / (1ULL << 29), + 1.0 / (1ULL << 28), 1.0 / (1ULL << 28), 1.0 / (1ULL << 27), 1.0 / (1ULL << 27), + 1.0 / (1ULL << 26), 1.0 / (1ULL << 26), 1.0 / (1ULL << 25), 1.0 / (1ULL << 25), + 1.0 / (1ULL << 24), 1.0 / (1ULL << 24), 1.0 / (1ULL << 23), 1.0 / (1ULL << 23), + 1.0 / (1ULL << 22), 1.0 / (1ULL << 22), 1.0 / (1ULL << 21), 1.0 / (1ULL << 21), + 1.0 / (1ULL << 20), 1.0 / (1ULL << 20), 1.0 / (1ULL << 19), 1.0 / (1ULL << 19), + 1.0 / (1ULL << 18), 1.0 / (1ULL << 18), 1.0 / (1ULL << 17), 1.0 / (1ULL << 17), + 1.0 / (1ULL << 16), 1.0 
/ (1ULL << 16), 1.0 / (1ULL << 15), 1.0 / (1ULL << 15), + 1.0 / (1ULL << 14), 1.0 / (1ULL << 14), 1.0 / (1ULL << 13), 1.0 / (1ULL << 13), + 1.0 / (1ULL << 12), 1.0 / (1ULL << 12), 1.0 / (1ULL << 11), 1.0 / (1ULL << 11), + 1.0 / (1ULL << 10), 1.0 / (1ULL << 10), 1.0 / (1ULL << 9), 1.0 / (1ULL << 9), + 1.0 / (1ULL << 8), 1.0 / (1ULL << 8), 1.0 / (1ULL << 7), 1.0 / (1ULL << 7), + 1.0 / (1ULL << 6), 1.0 / (1ULL << 6), 1.0 / (1ULL << 5), 1.0 / (1ULL << 5), + 1.0 / (1ULL << 4), 1.0 / (1ULL << 4), 1.0 / (1ULL << 3), 1.0 / (1ULL << 3), + 1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1), }; static const float GC_ALIGNED16(m_dequantizeTableS[]) = { - 1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3), - 1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / (1ULL << 6), 1.0 / (1ULL << 7), - 1.0 / (1ULL << 8), 1.0 / (1ULL << 9), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11), - 1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15), - 1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19), - 1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23), - 1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27), - 1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31), - (1ULL << 32), (1ULL << 31), (1ULL << 30), (1ULL << 29), - (1ULL << 28), (1ULL << 27), (1ULL << 26), (1ULL << 25), - (1ULL << 24), (1ULL << 23), (1ULL << 22), (1ULL << 21), - (1ULL << 20), (1ULL << 19), (1ULL << 18), (1ULL << 17), - (1ULL << 16), (1ULL << 15), (1ULL << 14), (1ULL << 13), - (1ULL << 12), (1ULL << 11), (1ULL << 10), (1ULL << 9), - (1ULL << 8), (1ULL << 7), (1ULL << 6), (1ULL << 5), - (1ULL << 4), (1ULL << 3), (1ULL << 2), (1ULL << 1), + 1.0 / (1ULL << 0), 1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1), + 1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3), 1.0 / (1ULL << 3), + 1.0 / (1ULL << 4), 1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / 
(1ULL << 5), + 1.0 / (1ULL << 6), 1.0 / (1ULL << 6), 1.0 / (1ULL << 7), 1.0 / (1ULL << 7), + 1.0 / (1ULL << 8), 1.0 / (1ULL << 8), 1.0 / (1ULL << 9), 1.0 / (1ULL << 9), + 1.0 / (1ULL << 10), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11), 1.0 / (1ULL << 11), + 1.0 / (1ULL << 12), 1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 13), + 1.0 / (1ULL << 14), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15), 1.0 / (1ULL << 15), + 1.0 / (1ULL << 16), 1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 17), + 1.0 / (1ULL << 18), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19), 1.0 / (1ULL << 19), + 1.0 / (1ULL << 20), 1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 21), + 1.0 / (1ULL << 22), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23), 1.0 / (1ULL << 23), + 1.0 / (1ULL << 24), 1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 25), + 1.0 / (1ULL << 26), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27), 1.0 / (1ULL << 27), + 1.0 / (1ULL << 28), 1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 29), + 1.0 / (1ULL << 30), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31), 1.0 / (1ULL << 31), + (1ULL << 32), (1ULL << 32), (1ULL << 31), (1ULL << 31), (1ULL << 30), (1ULL << 30), (1ULL << 29), (1ULL << 29), + (1ULL << 28), (1ULL << 28), (1ULL << 27), (1ULL << 27), (1ULL << 26), (1ULL << 26), (1ULL << 25), (1ULL << 25), + (1ULL << 24), (1ULL << 24), (1ULL << 23), (1ULL << 23), (1ULL << 22), (1ULL << 22), (1ULL << 21), (1ULL << 21), + (1ULL << 20), (1ULL << 20), (1ULL << 19), (1ULL << 19), (1ULL << 18), (1ULL << 18), (1ULL << 17), (1ULL << 17), + (1ULL << 16), (1ULL << 16), (1ULL << 15), (1ULL << 15), (1ULL << 14), (1ULL << 14), (1ULL << 13), (1ULL << 13), + (1ULL << 12), (1ULL << 12), (1ULL << 11), (1ULL << 11), (1ULL << 10), (1ULL << 10), (1ULL << 9), (1ULL << 9), + (1ULL << 8), (1ULL << 8), (1ULL << 7), (1ULL << 7), (1ULL << 6), (1ULL << 6), (1ULL << 5), (1ULL << 5), + (1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1), }; static float 
GC_ALIGNED16(psTemp[4]); -static const float GC_ALIGNED16(m_65535) = 65535.0f; +static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; static const float GC_ALIGNED16(m_32767) = 32767.0f; static const float GC_ALIGNED16(m_m32768) = -32768.0f; static const float GC_ALIGNED16(m_255) = 255.0f; @@ -273,14 +289,11 @@ void CommonAsmRoutines::GenQuantizedStores() RET(); const u8* storePairedU8 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - PUNPCKLDQ(XMM1, R(XMM1)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULPS(XMM0, R(XMM1)); #ifdef QUANTIZE_OVERFLOW_SAFE - MOVSS(XMM1, M((void *)&m_65535)); - PUNPCKLDQ(XMM1, R(XMM1)); - MINPS(XMM0, R(XMM1)); + MINPS(XMM0, M((void *)&m_65535)); #endif CVTTPS2DQ(XMM0, R(XMM0)); PACKSSDW(XMM0, R(XMM0)); @@ -291,14 +304,11 @@ void CommonAsmRoutines::GenQuantizedStores() RET(); const u8* storePairedS8 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - PUNPCKLDQ(XMM1, R(XMM1)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULPS(XMM0, R(XMM1)); #ifdef QUANTIZE_OVERFLOW_SAFE - MOVSS(XMM1, M((void *)&m_65535)); - PUNPCKLDQ(XMM1, R(XMM1)); - MINPS(XMM0, R(XMM1)); + MINPS(XMM0, M((void *)&m_65535)); #endif CVTTPS2DQ(XMM0, R(XMM0)); PACKSSDW(XMM0, R(XMM0)); @@ -310,41 +320,47 @@ void CommonAsmRoutines::GenQuantizedStores() RET(); const u8* storePairedU16 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - PUNPCKLDQ(XMM1, R(XMM1)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULPS(XMM0, R(XMM1)); - // PACKUSDW is available only in SSE4 - PXOR(XMM1, R(XMM1)); - MAXPS(XMM0, R(XMM1)); - MOVSS(XMM1, M((void *)&m_65535)); - PUNPCKLDQ(XMM1, R(XMM1)); - MINPS(XMM0, R(XMM1)); + 
if (cpu_info.bSSE4_1) + { +#ifdef QUANTIZE_OVERFLOW_SAFE + MINPS(XMM0, M((void *)&m_65535)); +#endif + CVTTPS2DQ(XMM0, R(XMM0)); + PACKUSDW(XMM0, R(XMM0)); + MOVD_xmm(R(RSCRATCH), XMM0); + BSWAP(32, RSCRATCH); + ROL(32, R(RSCRATCH), Imm8(16)); + } + else + { + XORPS(XMM1, R(XMM1)); + MAXPS(XMM0, R(XMM1)); + MINPS(XMM0, M((void *)&m_65535)); - CVTTPS2DQ(XMM0, R(XMM0)); - MOVQ_xmm(M(psTemp), XMM0); - // place ps[0] into the higher word, ps[1] into the lower - // so no need in ROL after BSWAP - MOVZX(32, 16, RSCRATCH, M((char*)psTemp + 0)); - SHL(32, R(RSCRATCH), Imm8(16)); - MOV(16, R(RSCRATCH), M((char*)psTemp + 4)); + CVTTPS2DQ(XMM0, R(XMM0)); + MOVQ_xmm(M(psTemp), XMM0); + // place ps[0] into the higher word, ps[1] into the lower + // so no need in ROL after BSWAP + MOVZX(32, 16, RSCRATCH, M((char*)psTemp + 0)); + SHL(32, R(RSCRATCH), Imm8(16)); + MOV(16, R(RSCRATCH), M((char*)psTemp + 4)); + BSWAP(32, RSCRATCH); + } - BSWAP(32, RSCRATCH); SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); RET(); const u8* storePairedS16 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - // SHUFPS or UNPCKLPS might be a better choice here. The last one might just be an alias though. 
- PUNPCKLDQ(XMM1, R(XMM1)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULPS(XMM0, R(XMM1)); #ifdef QUANTIZE_OVERFLOW_SAFE - MOVSS(XMM1, M((void *)&m_65535)); - PUNPCKLDQ(XMM1, R(XMM1)); - MINPS(XMM0, R(XMM1)); + MINPS(XMM0, M((void *)&m_65535)); #endif CVTTPS2DQ(XMM0, R(XMM0)); PACKSSDW(XMM0, R(XMM0)); @@ -395,10 +411,10 @@ void CommonAsmRoutines::GenQuantizedSingleStores() }*/ const u8* storeSingleU8 = AlignCode4(); // Used by MKWii - SHR(32, R(RSCRATCH2), Imm8(6)); + SHR(32, R(RSCRATCH2), Imm8(5)); MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULSS(XMM0, R(XMM1)); - PXOR(XMM1, R(XMM1)); + XORPS(XMM1, R(XMM1)); MAXSS(XMM0, R(XMM1)); MINSS(XMM0, M((void *)&m_255)); CVTTSS2SI(RSCRATCH, R(XMM0)); @@ -406,7 +422,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() RET(); const u8* storeSingleS8 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(6)); + SHR(32, R(RSCRATCH2), Imm8(5)); MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULSS(XMM0, R(XMM1)); MAXSS(XMM0, M((void *)&m_m128)); @@ -416,10 +432,10 @@ void CommonAsmRoutines::GenQuantizedSingleStores() RET(); const u8* storeSingleU16 = AlignCode4(); // Used by MKWii - SHR(32, R(RSCRATCH2), Imm8(6)); + SHR(32, R(RSCRATCH2), Imm8(5)); MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULSS(XMM0, R(XMM1)); - PXOR(XMM1, R(XMM1)); + XORPS(XMM1, R(XMM1)); MAXSS(XMM0, R(XMM1)); MINSS(XMM0, M((void *)&m_65535)); CVTTSS2SI(RSCRATCH, R(XMM0)); @@ -427,7 +443,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() RET(); const u8* storeSingleS16 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(6)); + SHR(32, R(RSCRATCH2), Imm8(5)); MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULSS(XMM0, R(XMM1)); MAXSS(XMM0, M((void *)&m_m32768)); @@ -507,13 +523,19 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); } MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - PXOR(XMM1, R(XMM1)); - 
PUNPCKLBW(XMM0, R(XMM1)); - PUNPCKLWD(XMM0, R(XMM1)); + if (cpu_info.bSSE4_1) + { + PMOVZXBD(XMM0, R(XMM0)); + } + else + { + PXOR(XMM1, R(XMM1)); + PUNPCKLBW(XMM0, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + } CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - PUNPCKLDQ(XMM1, R(XMM1)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULPS(XMM0, R(XMM1)); RET(); @@ -524,7 +546,7 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better? - SHR(32, R(RSCRATCH2), Imm8(6)); + SHR(32, R(RSCRATCH2), Imm8(5)); MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULSS(XMM0, R(XMM1)); UNPCKLPS(XMM0, M((void*)m_one)); @@ -542,13 +564,19 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); } MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - PUNPCKLBW(XMM0, R(XMM0)); - PUNPCKLWD(XMM0, R(XMM0)); - PSRAD(XMM0, 24); + if (cpu_info.bSSE4_1) + { + PMOVSXBD(XMM0, R(XMM0)); + } + else + { + PUNPCKLBW(XMM0, R(XMM0)); + PUNPCKLWD(XMM0, R(XMM0)); + PSRAD(XMM0, 24); + } CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - PUNPCKLDQ(XMM1, R(XMM1)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULPS(XMM0, R(XMM1)); RET(); @@ -559,7 +587,7 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(6)); + SHR(32, R(RSCRATCH2), Imm8(5)); MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULSS(XMM0, R(XMM1)); UNPCKLPS(XMM0, M((void*)m_one)); @@ -573,12 +601,18 @@ void 
CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - PXOR(XMM1, R(XMM1)); - PUNPCKLWD(XMM0, R(XMM1)); + if (cpu_info.bSSE4_1) + { + PMOVZXWD(XMM0, R(XMM0)); + } + else + { + PXOR(XMM1, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + } CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - PUNPCKLDQ(XMM1, R(XMM1)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULPS(XMM0, R(XMM1)); RET(); @@ -589,7 +623,7 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(6)); + SHR(32, R(RSCRATCH2), Imm8(5)); MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULSS(XMM0, R(XMM1)); UNPCKLPS(XMM0, M((void*)m_one)); @@ -602,12 +636,18 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - PUNPCKLWD(XMM0, R(XMM0)); - PSRAD(XMM0, 16); + if (cpu_info.bSSE4_1) + { + PMOVSXWD(XMM0, R(XMM0)); + } + else + { + PUNPCKLWD(XMM0, R(XMM0)); + PSRAD(XMM0, 16); + } CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - PUNPCKLDQ(XMM1, R(XMM1)); + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULPS(XMM0, R(XMM1)); RET(); @@ -618,7 +658,7 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(6)); + SHR(32, R(RSCRATCH2), Imm8(5)); MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULSS(XMM0, R(XMM1)); 
UNPCKLPS(XMM0, M((void*)m_one));