diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 53a846237f..d444f0f834 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -206,6 +206,7 @@ public:
   void fmaddXX(UGeckoInstruction inst);
   void fsign(UGeckoInstruction inst);
+  void fselx(UGeckoInstruction inst);
   void stX(UGeckoInstruction inst); //stw sth stb
   void rlwinmx(UGeckoInstruction inst);
   void rlwimix(UGeckoInstruction inst);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
index a9c932b8ca..0007915089 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -358,7 +358,7 @@ static GekkoOPTemplate table63_2[] =
   {20, &Jit64::fp_arith}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}},
   {21, &Jit64::fp_arith}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}},
   {22, &Jit64::FallBackToInterpreter}, //"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F}},
-  {23, &Jit64::FallBackToInterpreter}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}},
+  {23, &Jit64::fselx}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}},
   {25, &Jit64::fp_arith}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}},
   {26, &Jit64::frsqrtex}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
   {28, &Jit64::fmaddXX}, //"fmsubx", OPTYPE_FPU, FL_RC_BIT_F}},
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 68d5472ff8..a20dd17e89 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -10,8 +10,8 @@
 using namespace Gen;
 
-static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
-static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
+static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
+static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
 
 void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
@@ -77,16 +77,7 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
   if (single)
   {
     ForceSinglePrecisionS(fpr.RX(d));
-    if (cpu_info.bSSE3)
-    {
-      MOVDDUP(fpr.RX(d), fpr.R(d));
-    }
-    else
-    {
-      if (!fpr.R(d).IsSimpleReg(fpr.RX(d)))
-        MOVQ_xmm(fpr.RX(d), fpr.R(d));
-      UNPCKLPD(fpr.RX(d), R(fpr.RX(d)));
-    }
+    MOVDDUP(fpr.RX(d), fpr.R(d));
   }
   SetFPRFIfNeeded(inst, fpr.RX(d));
   fpr.UnlockAll();
@@ -136,29 +127,29 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   int d = inst.FD;
 
   fpr.Lock(a, b, c, d);
-  MOVSD(XMM0, fpr.R(c));
-  if (single_precision)
-    Force25BitPrecision(XMM0, XMM1);
-  switch (inst.SUBOP5)
+
+  // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
+  if (inst.SUBOP5 == 30) //nmsub
   {
-  case 28: //msub
+    MOVSD(XMM1, fpr.R(c));
+    if (single_precision)
+      Force25BitPrecision(XMM1, XMM0);
+    MULSD(XMM1, fpr.R(a));
+    MOVSD(XMM0, fpr.R(b));
+    SUBSD(XMM0, R(XMM1));
+  }
+  else
+  {
+    MOVSD(XMM0, fpr.R(c));
+    if (single_precision)
+      Force25BitPrecision(XMM0, XMM1);
     MULSD(XMM0, fpr.R(a));
-    SUBSD(XMM0, fpr.R(b));
-    break;
-  case 29: //madd
-    MULSD(XMM0, fpr.R(a));
-    ADDSD(XMM0, fpr.R(b));
-    break;
-  case 30: //nmsub
-    MULSD(XMM0, fpr.R(a));
-    SUBSD(XMM0, fpr.R(b));
-    PXOR(XMM0, M((void*)&psSignBits2));
-    break;
-  case 31: //nmadd
-    MULSD(XMM0, fpr.R(a));
-    ADDSD(XMM0, fpr.R(b));
-    PXOR(XMM0, M((void*)&psSignBits2));
-    break;
+    if (inst.SUBOP5 == 28) //msub
+      SUBSD(XMM0, fpr.R(b));
+    else //(n)madd
+      ADDSD(XMM0, fpr.R(b));
+    if (inst.SUBOP5 == 31) //nmadd
+      PXOR(XMM0, M((void*)&psSignBits2));
   }
   fpr.BindToRegister(d, false);
   //YES it is necessary to dupe the result :(
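Note: a scalar reference for the four double-precision forms the fmaddXX block above now emits, ignoring the single-precision 25-bit rounding step. This is an illustrative sketch, not code from the patch; the function name is hypothetical. The nmsub path is computed as (b - a*c) rather than -(a*c - b); the two forms agree except possibly in the sign of an exact-zero result.

// Hypothetical reference mirroring the emitted MULSD/ADDSD/SUBSD sequences.
static double FmaddXXReference(int subop5, double a, double b, double c)
{
  switch (subop5)
  {
  case 28: return a * c - b;     // msub:  MULSD then SUBSD
  case 29: return a * c + b;     // madd:  MULSD then ADDSD
  case 30: return b - a * c;     // nmsub: product subtracted from b
  case 31: return -(a * c + b);  // nmadd: madd, then PXOR with psSignBits2 flips the sign
  default: return 0.0;
  }
}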
@@ -186,23 +177,59 @@ void Jit64::fsign(UGeckoInstruction inst)
   int b = inst.FB;
   fpr.Lock(b, d);
   fpr.BindToRegister(d, true, true);
-  MOVSD(XMM0, fpr.R(b));
+
+  if (d != b)
+    MOVSD(fpr.RX(d), fpr.R(b));
 
   switch (inst.SUBOP10)
   {
   case 40: // fnegx
-    PXOR(XMM0, M((void*)&psSignBits2));
+    // We can cheat and not worry about clobbering the top half by using masks
+    // that don't modify the top half.
+    PXOR(fpr.RX(d), M((void*)&psSignBits2));
     break;
   case 264: // fabsx
-    PAND(XMM0, M((void*)&psAbsMask2));
+    PAND(fpr.RX(d), M((void*)&psAbsMask2));
     break;
   case 136: // fnabs
-    POR(XMM0, M((void*)&psSignBits2));
+    POR(fpr.RX(d), M((void*)&psSignBits2));
     break;
   default:
     PanicAlert("fsign bleh");
     break;
   }
-  MOVSD(fpr.R(d), XMM0);
+  fpr.UnlockAll();
+}
+
+void Jit64::fselx(UGeckoInstruction inst)
+{
+  INSTRUCTION_START
+  JITDISABLE(bJITFloatingPointOff);
+  FALLBACK_IF(inst.Rc);
+
+  int d = inst.FD;
+  int a = inst.FA;
+  int b = inst.FB;
+  int c = inst.FC;
+
+  fpr.Lock(a, b, c, d);
+  MOVSD(XMM0, fpr.R(a));
+  PXOR(XMM1, R(XMM1));
+  // XMM0 = XMM0 < 0 ? all 1s : all 0s
+  CMPSD(XMM0, R(XMM1), LT);
+  if (cpu_info.bSSE4_1)
+  {
+    MOVSD(XMM1, fpr.R(c));
+    BLENDVPD(XMM1, fpr.R(b));
+  }
+  else
+  {
+    MOVSD(XMM1, R(XMM0));
+    PAND(XMM0, fpr.R(b));
+    PANDN(XMM1, fpr.R(c));
+    POR(XMM1, R(XMM0));
+  }
+  fpr.BindToRegister(d, false);
+  MOVSD(fpr.RX(d), R(XMM1));
   fpr.UnlockAll();
 }
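Note: fsel picks frC when frA >= 0.0 and frB otherwise. The new fselx emits either BLENDVPD keyed off the (a < 0) compare mask, or the classic SSE2 AND/ANDN/OR select. A rough scalar sketch of the same bitmask-select identity, mirroring the emitted sequence (hypothetical helper, not Dolphin code):

#include <cstdint>
#include <cstring>

static double FselReference(double a, double b, double c)
{
  uint64_t bb, cb;
  std::memcpy(&bb, &b, sizeof(bb));
  std::memcpy(&cb, &c, sizeof(cb));
  uint64_t mask = (a < 0.0) ? ~0ULL : 0ULL;   // CMPSD(XMM0, 0, LT)
  uint64_t rb = (mask & bb) | (~mask & cb);   // PAND / PANDN / POR (or BLENDVPD on the mask's sign bit)
  double r;
  std::memcpy(&r, &rb, sizeof(r));
  return r;                                    // a >= 0 selects c, a < 0 selects b
}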
@@ -220,14 +247,22 @@ void Jit64::fmrx(UGeckoInstruction inst)
 
   fpr.Lock(b, d);
 
-  // We don't need to load d, but if it is loaded, we need to mark it as dirty.
   if (fpr.IsBound(d))
+  {
+    // We don't need to load d, but if it is loaded, we need to mark it as dirty.
     fpr.BindToRegister(d);
-
-  // b needs to be in a register because "MOVSD reg, mem" sets the upper bits (64+) to zero and we don't want that.
-  fpr.BindToRegister(b, true, false);
-
-  MOVSD(fpr.R(d), fpr.RX(b));
+    // We have to use MOVLPD if b isn't loaded because "MOVSD reg, mem" sets the upper bits (64+)
+    // to zero and we don't want that.
+    if (!fpr.R(b).IsSimpleReg())
+      MOVLPD(fpr.RX(d), fpr.R(b));
+    else
+      MOVSD(fpr.R(d), fpr.RX(b));
+  }
+  else
+  {
+    fpr.BindToRegister(b, true, false);
+    MOVSD(fpr.R(d), fpr.RX(b));
+  }
 
   fpr.UnlockAll();
 }
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
index 5f87d22ecb..b207de9ad4 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
@@ -22,9 +22,8 @@ void Jit64::psq_st(UGeckoInstruction inst)
   JITDISABLE(bJITLoadStorePairedOff);
   FALLBACK_IF(!inst.RA);
 
-  bool update = inst.OPCD == 61;
-
-  int offset = inst.SIMM_12;
+  s32 offset = inst.SIMM_12;
+  bool update = inst.OPCD == 61 && offset;
   int a = inst.RA;
   int s = inst.RS;
 
@@ -32,9 +31,16 @@
   if (update)
     gpr.BindToRegister(a, true, true);
   fpr.BindToRegister(s, true, false);
-  MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
-  if (offset)
-    ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
+  if (offset && gpr.R(a).IsSimpleReg())
+  {
+    LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset));
+  }
+  else
+  {
+    MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
+    if (offset)
+      ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
+  }
   // In memcheck mode, don't update the address until the exception check
   if (update && offset && !js.memcheck)
     MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
@@ -46,7 +52,7 @@
   AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + inst.I]));
   MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
 
-  // FIXME: Fix ModR/M encoding to allow [RSCRATCH2*4+disp32] without a base register!
+  // FIXME: Fix ModR/M encoding to allow [RSCRATCH2*8+disp32] without a base register!
   if (inst.W)
   {
     // One value
@@ -77,18 +83,24 @@ void Jit64::psq_l(UGeckoInstruction inst)
   JITDISABLE(bJITLoadStorePairedOff);
   FALLBACK_IF(!inst.RA);
 
-  bool update = inst.OPCD == 57;
-  int offset = inst.SIMM_12;
+  s32 offset = inst.SIMM_12;
+  bool update = inst.OPCD == 57 && offset;
   int a = inst.RA;
   int s = inst.RS;
 
   gpr.FlushLockX(RSCRATCH_EXTRA);
   gpr.BindToRegister(a, true, update && offset);
   fpr.BindToRegister(s, false, true);
-  if (offset)
+  if (offset && gpr.R(a).IsSimpleReg())
+  {
     LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset));
+  }
   else
+  {
     MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
+    if (offset)
+      ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
+  }
   // In memcheck mode, don't update the address until the exception check
   if (update && offset && !js.memcheck)
     MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
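Note: for psq_l/psq_st the effective address is (rA) + displacement; when rA is already live in a host register the patch folds the add into a single LEA instead of MOV + ADD, leaving rA untouched. A minimal sketch of the address math under that assumption (illustrative name only):

#include <cstdint>

// What the emitted LEA / MOV+ADD sequence computes for a nonzero SIMM_12 displacement.
static uint32_t PsqEffectiveAddress(uint32_t ra, int32_t simm12)
{
  return ra + static_cast<uint32_t>(simm12);  // 32-bit wrap-around; the offset is sign-extended
}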
#include "Common/CommonTypes.h" +#include "Common/CPUDetect.h" #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/JitRegCache.h" using namespace Gen; -// TODO -// ps_madds0 -// ps_muls0 -// ps_madds1 -// cmppd, andpd, andnpd, or -// lfsx, ps_merge01 etc - static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; @@ -36,9 +30,6 @@ void Jit64::ps_mr(UGeckoInstruction inst) void Jit64::ps_sel(UGeckoInstruction inst) { - // we can't use (V)BLENDVPD here because it just looks at the sign bit - // but we need -0 = +0 - INSTRUCTION_START JITDISABLE(bJITPairedOff); FALLBACK_IF(inst.Rc); @@ -49,16 +40,26 @@ void Jit64::ps_sel(UGeckoInstruction inst) int c = inst.FC; fpr.Lock(a, b, c, d); - MOVAPD(XMM0, fpr.R(a)); - PXOR(XMM1, R(XMM1)); - // XMM0 = XMM0 < 0 ? all 1s : all 0s - CMPPD(XMM0, R(XMM1), LT); - MOVAPD(XMM1, R(XMM0)); - PAND(XMM0, fpr.R(b)); - PANDN(XMM1, fpr.R(c)); - POR(XMM0, R(XMM1)); + + if (cpu_info.bSSE4_1) + { + PXOR(XMM0, R(XMM0)); + CMPPD(XMM0, fpr.R(a), LT); // XMM0 = XMM0 >= 0 ? all 1s : all 0s + MOVAPD(XMM1, fpr.R(b)); + BLENDVPD(XMM1, fpr.R(c)); + } + else + { + MOVAPD(XMM1, fpr.R(a)); + PXOR(XMM0, R(XMM0)); + CMPPD(XMM1, R(XMM0), LT); // XMM0 = XMM0 < 0 ? all 1s : all 0s + MOVAPD(XMM0, R(XMM1)); + PAND(XMM1, fpr.R(b)); + PANDN(XMM0, fpr.R(c)); + POR(XMM1, R(XMM0)); + } fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), R(XMM0)); + MOVAPD(fpr.RX(d), R(XMM1)); fpr.UnlockAll(); } @@ -98,20 +99,6 @@ void Jit64::ps_sign(UGeckoInstruction inst) fpr.UnlockAll(); } -//add a, b, c - -//mov a, b -//add a, c -//we need: -/* -psq_l -psq_stu -*/ - -/* -add a,b,a -*/ - //There's still a little bit more optimization that can be squeezed out of this void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) { @@ -152,7 +139,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X6 MOVAPD(XMM0, fpr.R(b)); fpr.BindToRegister(d, false); MOVAPD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), Gen::R(XMM0)); + (this->*op)(fpr.RX(d), R(XMM0)); } } else @@ -204,32 +191,26 @@ void Jit64::ps_sum(UGeckoInstruction inst) int b = inst.FB; int c = inst.FC; fpr.Lock(a,b,c,d); - fpr.BindToRegister(d, d == a || d == b || d == c, true); switch (inst.SUBOP5) { case 10: - // ps_sum0, do the sum in upper subregisters, merge uppers - MOVDDUP(XMM0, fpr.R(a)); - MOVAPD(XMM1, fpr.R(b)); - ADDPD(XMM0, R(XMM1)); - UNPCKHPD(XMM0, fpr.R(c)); //merge - MOVAPD(fpr.R(d), XMM0); + MOVDDUP(XMM0, fpr.R(a)); // {a.ps0, a.ps0} + ADDPD(XMM0, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1} + UNPCKHPD(XMM0, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1} break; case 11: - // ps_sum1, do the sum in lower subregisters, merge lowers - MOVAPD(XMM0, fpr.R(a)); - MOVAPD(XMM1, fpr.R(b)); - SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower - ADDPD(XMM0, R(XMM1)); // sum lowers - MOVAPD(XMM1, fpr.R(c)); - UNPCKLPD(XMM1, R(XMM0)); // merge - MOVAPD(fpr.R(d), XMM1); + MOVDDUP(XMM1, fpr.R(a)); // {a.ps0, a.ps0} + ADDPD(XMM1, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1} + MOVAPD(XMM0, fpr.R(c)); + SHUFPD(XMM0, R(XMM1), 2); // {c.ps0, a.ps0 + b.ps1} break; default: PanicAlert("ps_sum WTF!!!"); } - ForceSinglePrecisionP(fpr.RX(d)); - SetFPRFIfNeeded(inst, fpr.RX(d)); + ForceSinglePrecisionP(XMM0); + SetFPRFIfNeeded(inst, XMM0); + fpr.BindToRegister(d, false); + MOVAPD(fpr.RX(d), R(XMM0)); fpr.UnlockAll(); } @@ -244,37 
@@ -244,37 +225,28 @@ void Jit64::ps_muls(UGeckoInstruction inst)
   int a = inst.FA;
   int c = inst.FC;
   fpr.Lock(a, c, d);
-  fpr.BindToRegister(d, d == a || d == c, true);
   switch (inst.SUBOP5)
   {
   case 12:
-    // Single multiply scalar high
-    // TODO - faster version for when regs are different
-    MOVDDUP(XMM1, fpr.R(c));
-    Force25BitPrecision(XMM1, XMM0);
-    MOVAPD(XMM0, fpr.R(a));
-    MULPD(XMM0, R(XMM1));
-    MOVAPD(fpr.R(d), XMM0);
+    MOVDDUP(XMM0, fpr.R(c));
     break;
   case 13:
-    // TODO - faster version for when regs are different
-    MOVAPD(XMM1, fpr.R(c));
-    Force25BitPrecision(XMM1, XMM0);
-    MOVAPD(XMM0, fpr.R(a));
-    SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
-    MULPD(XMM0, R(XMM1));
-    MOVAPD(fpr.R(d), XMM0);
+    MOVAPD(XMM0, fpr.R(c));
+    SHUFPD(XMM0, R(XMM0), 3);
     break;
   default:
     PanicAlert("ps_muls WTF!!!");
   }
-  ForceSinglePrecisionP(fpr.RX(d));
-  SetFPRFIfNeeded(inst, fpr.RX(d));
+  Force25BitPrecision(XMM0, XMM1);
+  MULPD(XMM0, fpr.R(a));
+  ForceSinglePrecisionP(XMM0);
+  SetFPRFIfNeeded(inst, XMM0);
+  fpr.BindToRegister(d, false);
+  MOVAPD(fpr.RX(d), R(XMM0));
   fpr.UnlockAll();
 }
 
-//TODO: find easy cases and optimize them, do a breakout like ps_arith
 void Jit64::ps_mergeXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
@@ -305,7 +277,7 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst)
     _assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
   }
   fpr.BindToRegister(d, false);
-  MOVAPD(fpr.RX(d), Gen::R(XMM0));
+  MOVAPD(fpr.RX(d), R(XMM0));
   fpr.UnlockAll();
 }
 
@@ -373,8 +345,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
     return;
   }
   fpr.BindToRegister(d, false);
-  MOVAPD(fpr.RX(d), Gen::R(XMM0));
-  ForceSinglePrecisionP(fpr.RX(d));
-  SetFPRFIfNeeded(inst, fpr.RX(d));
+  ForceSinglePrecisionP(XMM0);
+  SetFPRFIfNeeded(inst, XMM0);
+  MOVAPD(fpr.RX(d), R(XMM0));
   fpr.UnlockAll();
 }
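Note: both ps_muls forms above now share one tail: broadcast the chosen frC lane, round it with Force25BitPrecision (the JIT applies this to the frC operand of paired multiplies, presumably to model the hardware's reduced-precision multiplier input), then MULPD against frA. A standalone scalar sketch of rounding a double to 25 mantissa bits, assuming simple round-half-up and ignoring NaN/infinity edge cases; this is not Dolphin's actual routine:

#include <cstdint>
#include <cstring>

static double Round25BitMantissa(double x)
{
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  // Keep 25 of the 52 mantissa bits: add half of the dropped range, then truncate the low 27 bits.
  bits = (bits + (1ULL << 26)) & ~((1ULL << 27) - 1);
  std::memcpy(&x, &bits, sizeof(x));
  return x;
}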
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
index 8a5e7dcfe5..f76acc6ba7 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
@@ -184,47 +184,63 @@ static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8,
 
 static const float GC_ALIGNED16(m_quantizeTableS[]) = {
-  (1ULL << 0), (1ULL << 1), (1ULL << 2), (1ULL << 3),
-  (1ULL << 4), (1ULL << 5), (1ULL << 6), (1ULL << 7),
-  (1ULL << 8), (1ULL << 9), (1ULL << 10), (1ULL << 11),
-  (1ULL << 12), (1ULL << 13), (1ULL << 14), (1ULL << 15),
-  (1ULL << 16), (1ULL << 17), (1ULL << 18), (1ULL << 19),
-  (1ULL << 20), (1ULL << 21), (1ULL << 22), (1ULL << 23),
-  (1ULL << 24), (1ULL << 25), (1ULL << 26), (1ULL << 27),
-  (1ULL << 28), (1ULL << 29), (1ULL << 30), (1ULL << 31),
-  1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1ULL << 30), 1.0 / (1ULL << 29),
-  1.0 / (1ULL << 28), 1.0 / (1ULL << 27), 1.0 / (1ULL << 26), 1.0 / (1ULL << 25),
-  1.0 / (1ULL << 24), 1.0 / (1ULL << 23), 1.0 / (1ULL << 22), 1.0 / (1ULL << 21),
-  1.0 / (1ULL << 20), 1.0 / (1ULL << 19), 1.0 / (1ULL << 18), 1.0 / (1ULL << 17),
-  1.0 / (1ULL << 16), 1.0 / (1ULL << 15), 1.0 / (1ULL << 14), 1.0 / (1ULL << 13),
-  1.0 / (1ULL << 12), 1.0 / (1ULL << 11), 1.0 / (1ULL << 10), 1.0 / (1ULL << 9),
-  1.0 / (1ULL << 8), 1.0 / (1ULL << 7), 1.0 / (1ULL << 6), 1.0 / (1ULL << 5),
-  1.0 / (1ULL << 4), 1.0 / (1ULL << 3), 1.0 / (1ULL << 2), 1.0 / (1ULL << 1),
+  (1ULL << 0), (1ULL << 0), (1ULL << 1), (1ULL << 1), (1ULL << 2), (1ULL << 2), (1ULL << 3), (1ULL << 3),
+  (1ULL << 4), (1ULL << 4), (1ULL << 5), (1ULL << 5), (1ULL << 6), (1ULL << 6), (1ULL << 7), (1ULL << 7),
+  (1ULL << 8), (1ULL << 8), (1ULL << 9), (1ULL << 9), (1ULL << 10), (1ULL << 10), (1ULL << 11), (1ULL << 11),
+  (1ULL << 12), (1ULL << 12), (1ULL << 13), (1ULL << 13), (1ULL << 14), (1ULL << 14), (1ULL << 15), (1ULL << 15),
+  (1ULL << 16), (1ULL << 16), (1ULL << 17), (1ULL << 17), (1ULL << 18), (1ULL << 18), (1ULL << 19), (1ULL << 19),
+  (1ULL << 20), (1ULL << 20), (1ULL << 21), (1ULL << 21), (1ULL << 22), (1ULL << 22), (1ULL << 23), (1ULL << 23),
+  (1ULL << 24), (1ULL << 24), (1ULL << 25), (1ULL << 25), (1ULL << 26), (1ULL << 26), (1ULL << 27), (1ULL << 27),
+  (1ULL << 28), (1ULL << 28), (1ULL << 29), (1ULL << 29), (1ULL << 30), (1ULL << 30), (1ULL << 31), (1ULL << 31),
+  1.0 / (1ULL << 32), 1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1ULL << 31),
+  1.0 / (1ULL << 30), 1.0 / (1ULL << 30), 1.0 / (1ULL << 29), 1.0 / (1ULL << 29),
+  1.0 / (1ULL << 28), 1.0 / (1ULL << 28), 1.0 / (1ULL << 27), 1.0 / (1ULL << 27),
+  1.0 / (1ULL << 26), 1.0 / (1ULL << 26), 1.0 / (1ULL << 25), 1.0 / (1ULL << 25),
+  1.0 / (1ULL << 24), 1.0 / (1ULL << 24), 1.0 / (1ULL << 23), 1.0 / (1ULL << 23),
+  1.0 / (1ULL << 22), 1.0 / (1ULL << 22), 1.0 / (1ULL << 21), 1.0 / (1ULL << 21),
+  1.0 / (1ULL << 20), 1.0 / (1ULL << 20), 1.0 / (1ULL << 19), 1.0 / (1ULL << 19),
+  1.0 / (1ULL << 18), 1.0 / (1ULL << 18), 1.0 / (1ULL << 17), 1.0 / (1ULL << 17),
+  1.0 / (1ULL << 16), 1.0 / (1ULL << 16), 1.0 / (1ULL << 15), 1.0 / (1ULL << 15),
+  1.0 / (1ULL << 14), 1.0 / (1ULL << 14), 1.0 / (1ULL << 13), 1.0 / (1ULL << 13),
+  1.0 / (1ULL << 12), 1.0 / (1ULL << 12), 1.0 / (1ULL << 11), 1.0 / (1ULL << 11),
+  1.0 / (1ULL << 10), 1.0 / (1ULL << 10), 1.0 / (1ULL << 9), 1.0 / (1ULL << 9),
+  1.0 / (1ULL << 8), 1.0 / (1ULL << 8), 1.0 / (1ULL << 7), 1.0 / (1ULL << 7),
+  1.0 / (1ULL << 6), 1.0 / (1ULL << 6), 1.0 / (1ULL << 5), 1.0 / (1ULL << 5),
+  1.0 / (1ULL << 4), 1.0 / (1ULL << 4), 1.0 / (1ULL << 3), 1.0 / (1ULL << 3),
+  1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1),
 };
 
 static const float GC_ALIGNED16(m_dequantizeTableS[]) = {
-  1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3),
-  1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / (1ULL << 6), 1.0 / (1ULL << 7),
-  1.0 / (1ULL << 8), 1.0 / (1ULL << 9), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11),
-  1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15),
-  1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19),
-  1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23),
-  1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27),
-  1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31),
-  (1ULL << 32), (1ULL << 31), (1ULL << 30), (1ULL << 29),
-  (1ULL << 28), (1ULL << 27), (1ULL << 26), (1ULL << 25),
-  (1ULL << 24), (1ULL << 23), (1ULL << 22), (1ULL << 21),
-  (1ULL << 20), (1ULL << 19), (1ULL << 18), (1ULL << 17),
-  (1ULL << 16), (1ULL << 15), (1ULL << 14), (1ULL << 13),
-  (1ULL << 12), (1ULL << 11), (1ULL << 10), (1ULL << 9),
-  (1ULL << 8), (1ULL << 7), (1ULL << 6), (1ULL << 5),
-  (1ULL << 4), (1ULL << 3), (1ULL << 2), (1ULL << 1),
+  1.0 / (1ULL << 0), 1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1),
+  1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3), 1.0 / (1ULL << 3),
+  1.0 / (1ULL << 4), 1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / (1ULL << 5),
+  1.0 / (1ULL << 6), 1.0 / (1ULL << 6), 1.0 / (1ULL << 7), 1.0 / (1ULL << 7),
+  1.0 / (1ULL << 8), 1.0 / (1ULL << 8), 1.0 / (1ULL << 9), 1.0 / (1ULL << 9),
+  1.0 / (1ULL << 10), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11), 1.0 / (1ULL << 11),
+  1.0 / (1ULL << 12), 1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 13),
+  1.0 / (1ULL << 14), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15), 1.0 / (1ULL << 15),
+  1.0 / (1ULL << 16), 1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 17),
+  1.0 / (1ULL << 18), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19), 1.0 / (1ULL << 19),
+  1.0 / (1ULL << 20), 1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 21),
+  1.0 / (1ULL << 22), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23), 1.0 / (1ULL << 23),
+  1.0 / (1ULL << 24), 1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 25),
+  1.0 / (1ULL << 26), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27), 1.0 / (1ULL << 27),
+  1.0 / (1ULL << 28), 1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 29),
+  1.0 / (1ULL << 30), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31), 1.0 / (1ULL << 31),
+  (1ULL << 32), (1ULL << 32), (1ULL << 31), (1ULL << 31), (1ULL << 30), (1ULL << 30), (1ULL << 29), (1ULL << 29),
+  (1ULL << 28), (1ULL << 28), (1ULL << 27), (1ULL << 27), (1ULL << 26), (1ULL << 26), (1ULL << 25), (1ULL << 25),
+  (1ULL << 24), (1ULL << 24), (1ULL << 23), (1ULL << 23), (1ULL << 22), (1ULL << 22), (1ULL << 21), (1ULL << 21),
+  (1ULL << 20), (1ULL << 20), (1ULL << 19), (1ULL << 19), (1ULL << 18), (1ULL << 18), (1ULL << 17), (1ULL << 17),
+  (1ULL << 16), (1ULL << 16), (1ULL << 15), (1ULL << 15), (1ULL << 14), (1ULL << 14), (1ULL << 13), (1ULL << 13),
+  (1ULL << 12), (1ULL << 12), (1ULL << 11), (1ULL << 11), (1ULL << 10), (1ULL << 10), (1ULL << 9), (1ULL << 9),
+  (1ULL << 8), (1ULL << 8), (1ULL << 7), (1ULL << 7), (1ULL << 6), (1ULL << 6), (1ULL << 5), (1ULL << 5),
+  (1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
 };
 
 static float GC_ALIGNED16(psTemp[4]);
-static const float GC_ALIGNED16(m_65535) = 65535.0f;
+static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
 static const float GC_ALIGNED16(m_32767) = 32767.0f;
 static const float GC_ALIGNED16(m_m32768) = -32768.0f;
 static const float GC_ALIGNED16(m_255) = 255.0f;
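Note: the quantization tables now store every scale twice, so a single 8-byte MOVQ load at index*8 already yields {scale, scale} for both lanes and the old MOVSS + PUNPCKLDQ broadcast disappears; the index shift accordingly drops from 6 to 5 as the per-entry stride doubles from 4 to 8 bytes. A hypothetical builder showing the same layout (the patch spells the table out literally; this loop is only an illustration):

// 32 power-of-two quantize scales followed by 32 reciprocal scales, each stored twice (128 floats total).
static void BuildInterleavedQuantizeTable(float* table)
{
  for (int i = 0; i < 32; i++)
    table[2 * i + 0] = table[2 * i + 1] = static_cast<float>(1ULL << i);
  for (int i = 0; i < 32; i++)
    table[2 * (32 + i) + 0] = table[2 * (32 + i) + 1] =
        1.0f / static_cast<float>(1ULL << (32 - i));
}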
@@ -273,14 +289,11 @@ void CommonAsmRoutines::GenQuantizedStores()
   RET();
 
   const u8* storePairedU8 = AlignCode4();
-  SHR(32, R(RSCRATCH2), Imm8(6));
-  MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
-  PUNPCKLDQ(XMM1, R(XMM1));
+  SHR(32, R(RSCRATCH2), Imm8(5));
+  MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
   MULPS(XMM0, R(XMM1));
 #ifdef QUANTIZE_OVERFLOW_SAFE
-  MOVSS(XMM1, M((void *)&m_65535));
-  PUNPCKLDQ(XMM1, R(XMM1));
-  MINPS(XMM0, R(XMM1));
+  MINPS(XMM0, M((void *)&m_65535));
 #endif
   CVTTPS2DQ(XMM0, R(XMM0));
   PACKSSDW(XMM0, R(XMM0));
@@ -291,14 +304,11 @@ void CommonAsmRoutines::GenQuantizedStores()
   RET();
 
   const u8* storePairedS8 = AlignCode4();
-  SHR(32, R(RSCRATCH2), Imm8(6));
-  MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
-  PUNPCKLDQ(XMM1, R(XMM1));
+  SHR(32, R(RSCRATCH2), Imm8(5));
+  MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
   MULPS(XMM0, R(XMM1));
 #ifdef QUANTIZE_OVERFLOW_SAFE
-  MOVSS(XMM1, M((void *)&m_65535));
-  PUNPCKLDQ(XMM1, R(XMM1));
-  MINPS(XMM0, R(XMM1));
+  MINPS(XMM0, M((void *)&m_65535));
 #endif
   CVTTPS2DQ(XMM0, R(XMM0));
   PACKSSDW(XMM0, R(XMM0));
@@ -310,41 +320,47 @@ void CommonAsmRoutines::GenQuantizedStores()
   RET();
 
   const u8* storePairedU16 = AlignCode4();
-  SHR(32, R(RSCRATCH2), Imm8(6));
-  MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
-  PUNPCKLDQ(XMM1, R(XMM1));
+  SHR(32, R(RSCRATCH2), Imm8(5));
+  MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
   MULPS(XMM0, R(XMM1));
 
-  // PACKUSDW is available only in SSE4
-  PXOR(XMM1, R(XMM1));
-  MAXPS(XMM0, R(XMM1));
-  MOVSS(XMM1, M((void *)&m_65535));
-  PUNPCKLDQ(XMM1, R(XMM1));
-  MINPS(XMM0, R(XMM1));
+  if (cpu_info.bSSE4_1)
+  {
+#ifdef QUANTIZE_OVERFLOW_SAFE
+    MINPS(XMM0, M((void *)&m_65535));
+#endif
+    CVTTPS2DQ(XMM0, R(XMM0));
+    PACKUSDW(XMM0, R(XMM0));
+    MOVD_xmm(R(RSCRATCH), XMM0);
+    BSWAP(32, RSCRATCH);
+    ROL(32, R(RSCRATCH), Imm8(16));
+  }
+  else
+  {
+    XORPS(XMM1, R(XMM1));
+    MAXPS(XMM0, R(XMM1));
+    MINPS(XMM0, M((void *)&m_65535));
 
-  CVTTPS2DQ(XMM0, R(XMM0));
-  MOVQ_xmm(M(psTemp), XMM0);
-  // place ps[0] into the higher word, ps[1] into the lower
-  // so no need in ROL after BSWAP
-  MOVZX(32, 16, RSCRATCH, M((char*)psTemp + 0));
-  SHL(32, R(RSCRATCH), Imm8(16));
-  MOV(16, R(RSCRATCH), M((char*)psTemp + 4));
+    CVTTPS2DQ(XMM0, R(XMM0));
+    MOVQ_xmm(M(psTemp), XMM0);
+    // place ps[0] into the higher word, ps[1] into the lower
+    // so no need in ROL after BSWAP
+    MOVZX(32, 16, RSCRATCH, M((char*)psTemp + 0));
+    SHL(32, R(RSCRATCH), Imm8(16));
+    MOV(16, R(RSCRATCH), M((char*)psTemp + 4));
+    BSWAP(32, RSCRATCH);
+  }
 
-  BSWAP(32, RSCRATCH);
   SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
   RET();
 
   const u8* storePairedS16 = AlignCode4();
-  SHR(32, R(RSCRATCH2), Imm8(6));
-  MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
-  // SHUFPS or UNPCKLPS might be a better choice here. The last one might just be an alias though.
-  PUNPCKLDQ(XMM1, R(XMM1));
+  SHR(32, R(RSCRATCH2), Imm8(5));
+  MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
   MULPS(XMM0, R(XMM1));
 #ifdef QUANTIZE_OVERFLOW_SAFE
-  MOVSS(XMM1, M((void *)&m_65535));
-  PUNPCKLDQ(XMM1, R(XMM1));
-  MINPS(XMM0, R(XMM1));
+  MINPS(XMM0, M((void *)&m_65535));
 #endif
   CVTTPS2DQ(XMM0, R(XMM0));
   PACKSSDW(XMM0, R(XMM0));
@@ -395,10 +411,10 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
   }*/
 
   const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
-  SHR(32, R(RSCRATCH2), Imm8(6));
+  SHR(32, R(RSCRATCH2), Imm8(5));
   MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
   MULSS(XMM0, R(XMM1));
-  PXOR(XMM1, R(XMM1));
+  XORPS(XMM1, R(XMM1));
   MAXSS(XMM0, R(XMM1));
   MINSS(XMM0, M((void *)&m_255));
   CVTTSS2SI(RSCRATCH, R(XMM0));
@@ -406,7 +422,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
   RET();
 
   const u8* storeSingleS8 = AlignCode4();
-  SHR(32, R(RSCRATCH2), Imm8(6));
+  SHR(32, R(RSCRATCH2), Imm8(5));
   MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
   MULSS(XMM0, R(XMM1));
   MAXSS(XMM0, M((void *)&m_m128));
@@ -416,10 +432,10 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
   RET();
 
   const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
-  SHR(32, R(RSCRATCH2), Imm8(6));
+  SHR(32, R(RSCRATCH2), Imm8(5));
   MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
   MULSS(XMM0, R(XMM1));
-  PXOR(XMM1, R(XMM1));
+  XORPS(XMM1, R(XMM1));
   MAXSS(XMM0, R(XMM1));
   MINSS(XMM0, M((void *)&m_65535));
   CVTTSS2SI(RSCRATCH, R(XMM0));
@@ -427,7 +443,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
   RET();
 
   const u8* storeSingleS16 = AlignCode4();
-  SHR(32, R(RSCRATCH2), Imm8(6));
+  SHR(32, R(RSCRATCH2), Imm8(5));
   MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
   MULSS(XMM0, R(XMM1));
   MAXSS(XMM0, M((void *)&m_m32768));
@@ -507,13 +523,19 @@ void CommonAsmRoutines::GenQuantizedLoads()
     UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
   }
   MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
-  PXOR(XMM1, R(XMM1));
-  PUNPCKLBW(XMM0, R(XMM1));
-  PUNPCKLWD(XMM0, R(XMM1));
+  if (cpu_info.bSSE4_1)
+  {
+    PMOVZXBD(XMM0, R(XMM0));
+  }
+  else
+  {
+    PXOR(XMM1, R(XMM1));
+    PUNPCKLBW(XMM0, R(XMM1));
+    PUNPCKLWD(XMM0, R(XMM1));
+  }
   CVTDQ2PS(XMM0, R(XMM0));
-  SHR(32, R(RSCRATCH2), Imm8(6));
-  MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
-  PUNPCKLDQ(XMM1, R(XMM1));
+  SHR(32, R(RSCRATCH2), Imm8(5));
+  MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
   MULPS(XMM0, R(XMM1));
   RET();
 
@@ -524,7 +546,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
   UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
   MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
   CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better?
-  SHR(32, R(RSCRATCH2), Imm8(6));
+  SHR(32, R(RSCRATCH2), Imm8(5));
   MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
   MULSS(XMM0, R(XMM1));
   UNPCKLPS(XMM0, M((void*)m_one));
@@ -542,13 +564,19 @@ void CommonAsmRoutines::GenQuantizedLoads()
     UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
   }
   MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
-  PUNPCKLBW(XMM0, R(XMM0));
-  PUNPCKLWD(XMM0, R(XMM0));
-  PSRAD(XMM0, 24);
+  if (cpu_info.bSSE4_1)
+  {
+    PMOVSXBD(XMM0, R(XMM0));
+  }
+  else
+  {
+    PUNPCKLBW(XMM0, R(XMM0));
+    PUNPCKLWD(XMM0, R(XMM0));
+    PSRAD(XMM0, 24);
+  }
   CVTDQ2PS(XMM0, R(XMM0));
-  SHR(32, R(RSCRATCH2), Imm8(6));
-  MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
-  PUNPCKLDQ(XMM1, R(XMM1));
+  SHR(32, R(RSCRATCH2), Imm8(5));
+  MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
   MULPS(XMM0, R(XMM1));
   RET();
 
@@ -559,7 +587,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
   UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
   MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
   CVTDQ2PS(XMM0, R(XMM0));
-  SHR(32, R(RSCRATCH2), Imm8(6));
+  SHR(32, R(RSCRATCH2), Imm8(5));
   MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
   MULSS(XMM0, R(XMM1));
   UNPCKLPS(XMM0, M((void*)m_one));
@@ -573,12 +601,18 @@ void CommonAsmRoutines::GenQuantizedLoads()
   UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
   ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
   MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
-  PXOR(XMM1, R(XMM1));
-  PUNPCKLWD(XMM0, R(XMM1));
+  if (cpu_info.bSSE4_1)
+  {
+    PMOVZXWD(XMM0, R(XMM0));
+  }
+  else
+  {
+    PXOR(XMM1, R(XMM1));
+    PUNPCKLWD(XMM0, R(XMM1));
+  }
   CVTDQ2PS(XMM0, R(XMM0));
-  SHR(32, R(RSCRATCH2), Imm8(6));
-  MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
-  PUNPCKLDQ(XMM1, R(XMM1));
+  SHR(32, R(RSCRATCH2), Imm8(5));
+  MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
   MULPS(XMM0, R(XMM1));
   RET();
 
@@ -589,7 +623,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
   UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
   MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
   CVTDQ2PS(XMM0, R(XMM0));
-  SHR(32, R(RSCRATCH2), Imm8(6));
+  SHR(32, R(RSCRATCH2), Imm8(5));
   MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
   MULSS(XMM0, R(XMM1));
   UNPCKLPS(XMM0, M((void*)m_one));
@@ -602,12 +636,18 @@ void CommonAsmRoutines::GenQuantizedLoads()
   UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
   ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
   MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
-  PUNPCKLWD(XMM0, R(XMM0));
-  PSRAD(XMM0, 16);
+  if (cpu_info.bSSE4_1)
+  {
+    PMOVSXWD(XMM0, R(XMM0));
+  }
+  else
+  {
+    PUNPCKLWD(XMM0, R(XMM0));
+    PSRAD(XMM0, 16);
+  }
   CVTDQ2PS(XMM0, R(XMM0));
-  SHR(32, R(RSCRATCH2), Imm8(6));
-  MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
-  PUNPCKLDQ(XMM1, R(XMM1));
+  SHR(32, R(RSCRATCH2), Imm8(5));
+  MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
   MULPS(XMM0, R(XMM1));
   RET();
 
@@ -618,7 +658,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
   UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
   MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
   CVTDQ2PS(XMM0, R(XMM0));
-  SHR(32, R(RSCRATCH2), Imm8(6));
+  SHR(32, R(RSCRATCH2), Imm8(5));
   MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
   MULSS(XMM0, R(XMM1));
   UNPCKLPS(XMM0, M((void*)m_one));
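Note: the GenQuantizedLoads changes replace the SSE2 unpack sequences with single SSE4.1 widening moves when the CPU supports them. An intrinsics sketch of the equivalent conversions, for illustration only (this is not the emitter code used by the routines above):

#include <emmintrin.h> // SSE2
#include <smmintrin.h> // SSE4.1

static __m128i WidenU8ToS32_SSE41(__m128i v)  { return _mm_cvtepu8_epi32(v); }  // PMOVZXBD
static __m128i WidenS8ToS32_SSE41(__m128i v)  { return _mm_cvtepi8_epi32(v); }  // PMOVSXBD
static __m128i WidenU16ToS32_SSE41(__m128i v) { return _mm_cvtepu16_epi32(v); } // PMOVZXWD
static __m128i WidenS16ToS32_SSE41(__m128i v) { return _mm_cvtepi16_epi32(v); } // PMOVSXWD

// SSE2 fallback matching the emitted PUNPCKLBW/PUNPCKLWD/PSRAD path for signed bytes:
// replicate each byte across its dword, then arithmetic-shift right to sign-extend.
static __m128i WidenS8ToS32_SSE2(__m128i v)
{
  v = _mm_unpacklo_epi8(v, v);
  v = _mm_unpacklo_epi16(v, v);
  return _mm_srai_epi32(v, 24);
}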