Merge pull request #686 from FioraAeterna/fiora
JIT: Optimize JitAsmCommon, Float, and PS implementations
This commit is contained in:
commit
2c233c4976
|
@ -206,6 +206,7 @@ public:
|
||||||
|
|
||||||
void fmaddXX(UGeckoInstruction inst);
|
void fmaddXX(UGeckoInstruction inst);
|
||||||
void fsign(UGeckoInstruction inst);
|
void fsign(UGeckoInstruction inst);
|
||||||
|
void fselx(UGeckoInstruction inst);
|
||||||
void stX(UGeckoInstruction inst); //stw sth stb
|
void stX(UGeckoInstruction inst); //stw sth stb
|
||||||
void rlwinmx(UGeckoInstruction inst);
|
void rlwinmx(UGeckoInstruction inst);
|
||||||
void rlwimix(UGeckoInstruction inst);
|
void rlwimix(UGeckoInstruction inst);
|
||||||
|
|
|
@ -358,7 +358,7 @@ static GekkoOPTemplate table63_2[] =
|
||||||
{20, &Jit64::fp_arith}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{20, &Jit64::fp_arith}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{21, &Jit64::fp_arith}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{21, &Jit64::fp_arith}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{22, &Jit64::FallBackToInterpreter}, //"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{22, &Jit64::FallBackToInterpreter}, //"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{23, &Jit64::FallBackToInterpreter}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{23, &Jit64::fselx}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{25, &Jit64::fp_arith}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{25, &Jit64::fp_arith}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{26, &Jit64::frsqrtex}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
|
{26, &Jit64::frsqrtex}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{28, &Jit64::fmaddXX}, //"fmsubx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{28, &Jit64::fmaddXX}, //"fmsubx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
|
|
|
@ -10,8 +10,8 @@
|
||||||
|
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
|
||||||
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
|
||||||
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
||||||
|
|
||||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
|
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
|
||||||
|
@ -77,17 +77,8 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
|
||||||
if (single)
|
if (single)
|
||||||
{
|
{
|
||||||
ForceSinglePrecisionS(fpr.RX(d));
|
ForceSinglePrecisionS(fpr.RX(d));
|
||||||
if (cpu_info.bSSE3)
|
|
||||||
{
|
|
||||||
MOVDDUP(fpr.RX(d), fpr.R(d));
|
MOVDDUP(fpr.RX(d), fpr.R(d));
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
if (!fpr.R(d).IsSimpleReg(fpr.RX(d)))
|
|
||||||
MOVQ_xmm(fpr.RX(d), fpr.R(d));
|
|
||||||
UNPCKLPD(fpr.RX(d), R(fpr.RX(d)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
@ -136,29 +127,29 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
|
|
||||||
|
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||||
|
if (inst.SUBOP5 == 30) //nmsub
|
||||||
|
{
|
||||||
|
MOVSD(XMM1, fpr.R(c));
|
||||||
|
if (single_precision)
|
||||||
|
Force25BitPrecision(XMM1, XMM0);
|
||||||
|
MULSD(XMM1, fpr.R(a));
|
||||||
|
MOVSD(XMM0, fpr.R(b));
|
||||||
|
SUBSD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
MOVSD(XMM0, fpr.R(c));
|
MOVSD(XMM0, fpr.R(c));
|
||||||
if (single_precision)
|
if (single_precision)
|
||||||
Force25BitPrecision(XMM0, XMM1);
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
switch (inst.SUBOP5)
|
|
||||||
{
|
|
||||||
case 28: //msub
|
|
||||||
MULSD(XMM0, fpr.R(a));
|
MULSD(XMM0, fpr.R(a));
|
||||||
|
if (inst.SUBOP5 == 28) //msub
|
||||||
SUBSD(XMM0, fpr.R(b));
|
SUBSD(XMM0, fpr.R(b));
|
||||||
break;
|
else //(n)madd
|
||||||
case 29: //madd
|
|
||||||
MULSD(XMM0, fpr.R(a));
|
|
||||||
ADDSD(XMM0, fpr.R(b));
|
ADDSD(XMM0, fpr.R(b));
|
||||||
break;
|
if (inst.SUBOP5 == 31) //nmadd
|
||||||
case 30: //nmsub
|
|
||||||
MULSD(XMM0, fpr.R(a));
|
|
||||||
SUBSD(XMM0, fpr.R(b));
|
|
||||||
PXOR(XMM0, M((void*)&psSignBits2));
|
PXOR(XMM0, M((void*)&psSignBits2));
|
||||||
break;
|
|
||||||
case 31: //nmadd
|
|
||||||
MULSD(XMM0, fpr.R(a));
|
|
||||||
ADDSD(XMM0, fpr.R(b));
|
|
||||||
PXOR(XMM0, M((void*)&psSignBits2));
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
//YES it is necessary to dupe the result :(
|
//YES it is necessary to dupe the result :(
|
||||||
|
@ -186,23 +177,59 @@ void Jit64::fsign(UGeckoInstruction inst)
|
||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
fpr.Lock(b, d);
|
fpr.Lock(b, d);
|
||||||
fpr.BindToRegister(d, true, true);
|
fpr.BindToRegister(d, true, true);
|
||||||
MOVSD(XMM0, fpr.R(b));
|
|
||||||
|
if (d != b)
|
||||||
|
MOVSD(fpr.RX(d), fpr.R(b));
|
||||||
switch (inst.SUBOP10)
|
switch (inst.SUBOP10)
|
||||||
{
|
{
|
||||||
case 40: // fnegx
|
case 40: // fnegx
|
||||||
PXOR(XMM0, M((void*)&psSignBits2));
|
// We can cheat and not worry about clobbering the top half by using masks
|
||||||
|
// that don't modify the top half.
|
||||||
|
PXOR(fpr.RX(d), M((void*)&psSignBits2));
|
||||||
break;
|
break;
|
||||||
case 264: // fabsx
|
case 264: // fabsx
|
||||||
PAND(XMM0, M((void*)&psAbsMask2));
|
PAND(fpr.RX(d), M((void*)&psAbsMask2));
|
||||||
break;
|
break;
|
||||||
case 136: // fnabs
|
case 136: // fnabs
|
||||||
POR(XMM0, M((void*)&psSignBits2));
|
POR(fpr.RX(d), M((void*)&psSignBits2));
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
PanicAlert("fsign bleh");
|
PanicAlert("fsign bleh");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
MOVSD(fpr.R(d), XMM0);
|
fpr.UnlockAll();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Jit64::fselx(UGeckoInstruction inst)
|
||||||
|
{
|
||||||
|
INSTRUCTION_START
|
||||||
|
JITDISABLE(bJITFloatingPointOff);
|
||||||
|
FALLBACK_IF(inst.Rc);
|
||||||
|
|
||||||
|
int d = inst.FD;
|
||||||
|
int a = inst.FA;
|
||||||
|
int b = inst.FB;
|
||||||
|
int c = inst.FC;
|
||||||
|
|
||||||
|
fpr.Lock(a, b, c, d);
|
||||||
|
MOVSD(XMM0, fpr.R(a));
|
||||||
|
PXOR(XMM1, R(XMM1));
|
||||||
|
// XMM0 = XMM0 < 0 ? all 1s : all 0s
|
||||||
|
CMPSD(XMM0, R(XMM1), LT);
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
MOVSD(XMM1, fpr.R(c));
|
||||||
|
BLENDVPD(XMM1, fpr.R(b));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MOVSD(XMM1, R(XMM0));
|
||||||
|
PAND(XMM0, fpr.R(b));
|
||||||
|
PANDN(XMM1, fpr.R(c));
|
||||||
|
POR(XMM1, R(XMM0));
|
||||||
|
}
|
||||||
|
fpr.BindToRegister(d, false);
|
||||||
|
MOVSD(fpr.RX(d), R(XMM1));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -220,14 +247,22 @@ void Jit64::fmrx(UGeckoInstruction inst)
|
||||||
|
|
||||||
fpr.Lock(b, d);
|
fpr.Lock(b, d);
|
||||||
|
|
||||||
// We don't need to load d, but if it is loaded, we need to mark it as dirty.
|
|
||||||
if (fpr.IsBound(d))
|
if (fpr.IsBound(d))
|
||||||
|
{
|
||||||
|
// We don't need to load d, but if it is loaded, we need to mark it as dirty.
|
||||||
fpr.BindToRegister(d);
|
fpr.BindToRegister(d);
|
||||||
|
// We have to use MOVLPD if b isn't loaded because "MOVSD reg, mem" sets the upper bits (64+)
|
||||||
// b needs to be in a register because "MOVSD reg, mem" sets the upper bits (64+) to zero and we don't want that.
|
// to zero and we don't want that.
|
||||||
fpr.BindToRegister(b, true, false);
|
if (!fpr.R(b).IsSimpleReg())
|
||||||
|
MOVLPD(fpr.RX(d), fpr.R(b));
|
||||||
|
else
|
||||||
MOVSD(fpr.R(d), fpr.RX(b));
|
MOVSD(fpr.R(d), fpr.RX(b));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fpr.BindToRegister(b, true, false);
|
||||||
|
MOVSD(fpr.R(d), fpr.RX(b));
|
||||||
|
}
|
||||||
|
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,9 +22,8 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
||||||
JITDISABLE(bJITLoadStorePairedOff);
|
JITDISABLE(bJITLoadStorePairedOff);
|
||||||
FALLBACK_IF(!inst.RA);
|
FALLBACK_IF(!inst.RA);
|
||||||
|
|
||||||
bool update = inst.OPCD == 61;
|
s32 offset = inst.SIMM_12;
|
||||||
|
bool update = inst.OPCD == 61 && offset;
|
||||||
int offset = inst.SIMM_12;
|
|
||||||
int a = inst.RA;
|
int a = inst.RA;
|
||||||
int s = inst.RS;
|
int s = inst.RS;
|
||||||
|
|
||||||
|
@ -32,9 +31,16 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
||||||
if (update)
|
if (update)
|
||||||
gpr.BindToRegister(a, true, true);
|
gpr.BindToRegister(a, true, true);
|
||||||
fpr.BindToRegister(s, true, false);
|
fpr.BindToRegister(s, true, false);
|
||||||
|
if (offset && gpr.R(a).IsSimpleReg())
|
||||||
|
{
|
||||||
|
LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
|
MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
|
||||||
if (offset)
|
if (offset)
|
||||||
ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
|
ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
|
||||||
|
}
|
||||||
// In memcheck mode, don't update the address until the exception check
|
// In memcheck mode, don't update the address until the exception check
|
||||||
if (update && offset && !js.memcheck)
|
if (update && offset && !js.memcheck)
|
||||||
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
||||||
|
@ -46,7 +52,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
||||||
AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + inst.I]));
|
AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + inst.I]));
|
||||||
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
||||||
|
|
||||||
// FIXME: Fix ModR/M encoding to allow [RSCRATCH2*4+disp32] without a base register!
|
// FIXME: Fix ModR/M encoding to allow [RSCRATCH2*8+disp32] without a base register!
|
||||||
if (inst.W)
|
if (inst.W)
|
||||||
{
|
{
|
||||||
// One value
|
// One value
|
||||||
|
@ -77,18 +83,24 @@ void Jit64::psq_l(UGeckoInstruction inst)
|
||||||
JITDISABLE(bJITLoadStorePairedOff);
|
JITDISABLE(bJITLoadStorePairedOff);
|
||||||
FALLBACK_IF(!inst.RA);
|
FALLBACK_IF(!inst.RA);
|
||||||
|
|
||||||
bool update = inst.OPCD == 57;
|
s32 offset = inst.SIMM_12;
|
||||||
int offset = inst.SIMM_12;
|
bool update = inst.OPCD == 57 && offset;
|
||||||
int a = inst.RA;
|
int a = inst.RA;
|
||||||
int s = inst.RS;
|
int s = inst.RS;
|
||||||
|
|
||||||
gpr.FlushLockX(RSCRATCH_EXTRA);
|
gpr.FlushLockX(RSCRATCH_EXTRA);
|
||||||
gpr.BindToRegister(a, true, update && offset);
|
gpr.BindToRegister(a, true, update && offset);
|
||||||
fpr.BindToRegister(s, false, true);
|
fpr.BindToRegister(s, false, true);
|
||||||
if (offset)
|
if (offset && gpr.R(a).IsSimpleReg())
|
||||||
|
{
|
||||||
LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset));
|
LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset));
|
||||||
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
|
MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
|
||||||
|
if (offset)
|
||||||
|
ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
|
||||||
|
}
|
||||||
// In memcheck mode, don't update the address until the exception check
|
// In memcheck mode, don't update the address until the exception check
|
||||||
if (update && offset && !js.memcheck)
|
if (update && offset && !js.memcheck)
|
||||||
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
||||||
|
|
|
@ -3,19 +3,13 @@
|
||||||
// Refer to the license.txt file included.
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
|
#include "Common/CPUDetect.h"
|
||||||
|
|
||||||
#include "Core/PowerPC/Jit64/Jit.h"
|
#include "Core/PowerPC/Jit64/Jit.h"
|
||||||
#include "Core/PowerPC/Jit64/JitRegCache.h"
|
#include "Core/PowerPC/Jit64/JitRegCache.h"
|
||||||
|
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
// TODO
|
|
||||||
// ps_madds0
|
|
||||||
// ps_muls0
|
|
||||||
// ps_madds1
|
|
||||||
// cmppd, andpd, andnpd, or
|
|
||||||
// lfsx, ps_merge01 etc
|
|
||||||
|
|
||||||
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
||||||
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
||||||
|
|
||||||
|
@ -36,9 +30,6 @@ void Jit64::ps_mr(UGeckoInstruction inst)
|
||||||
|
|
||||||
void Jit64::ps_sel(UGeckoInstruction inst)
|
void Jit64::ps_sel(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
// we can't use (V)BLENDVPD here because it just looks at the sign bit
|
|
||||||
// but we need -0 = +0
|
|
||||||
|
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(bJITPairedOff);
|
JITDISABLE(bJITPairedOff);
|
||||||
FALLBACK_IF(inst.Rc);
|
FALLBACK_IF(inst.Rc);
|
||||||
|
@ -49,16 +40,26 @@ void Jit64::ps_sel(UGeckoInstruction inst)
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
|
||||||
PXOR(XMM1, R(XMM1));
|
if (cpu_info.bSSE4_1)
|
||||||
// XMM0 = XMM0 < 0 ? all 1s : all 0s
|
{
|
||||||
CMPPD(XMM0, R(XMM1), LT);
|
PXOR(XMM0, R(XMM0));
|
||||||
MOVAPD(XMM1, R(XMM0));
|
CMPPD(XMM0, fpr.R(a), LT); // XMM0 = XMM0 >= 0 ? all 1s : all 0s
|
||||||
PAND(XMM0, fpr.R(b));
|
MOVAPD(XMM1, fpr.R(b));
|
||||||
PANDN(XMM1, fpr.R(c));
|
BLENDVPD(XMM1, fpr.R(c));
|
||||||
POR(XMM0, R(XMM1));
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MOVAPD(XMM1, fpr.R(a));
|
||||||
|
PXOR(XMM0, R(XMM0));
|
||||||
|
CMPPD(XMM1, R(XMM0), LT); // XMM0 = XMM0 < 0 ? all 1s : all 0s
|
||||||
|
MOVAPD(XMM0, R(XMM1));
|
||||||
|
PAND(XMM1, fpr.R(b));
|
||||||
|
PANDN(XMM0, fpr.R(c));
|
||||||
|
POR(XMM1, R(XMM0));
|
||||||
|
}
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
MOVAPD(fpr.RX(d), R(XMM0));
|
MOVAPD(fpr.RX(d), R(XMM1));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,20 +99,6 @@ void Jit64::ps_sign(UGeckoInstruction inst)
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
//add a, b, c
|
|
||||||
|
|
||||||
//mov a, b
|
|
||||||
//add a, c
|
|
||||||
//we need:
|
|
||||||
/*
|
|
||||||
psq_l
|
|
||||||
psq_stu
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
add a,b,a
|
|
||||||
*/
|
|
||||||
|
|
||||||
//There's still a little bit more optimization that can be squeezed out of this
|
//There's still a little bit more optimization that can be squeezed out of this
|
||||||
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
|
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
|
||||||
{
|
{
|
||||||
|
@ -152,7 +139,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X6
|
||||||
MOVAPD(XMM0, fpr.R(b));
|
MOVAPD(XMM0, fpr.R(b));
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
MOVAPD(fpr.RX(d), fpr.R(a));
|
MOVAPD(fpr.RX(d), fpr.R(a));
|
||||||
(this->*op)(fpr.RX(d), Gen::R(XMM0));
|
(this->*op)(fpr.RX(d), R(XMM0));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -204,32 +191,26 @@ void Jit64::ps_sum(UGeckoInstruction inst)
|
||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
fpr.Lock(a,b,c,d);
|
fpr.Lock(a,b,c,d);
|
||||||
fpr.BindToRegister(d, d == a || d == b || d == c, true);
|
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 10:
|
case 10:
|
||||||
// ps_sum0, do the sum in upper subregisters, merge uppers
|
MOVDDUP(XMM0, fpr.R(a)); // {a.ps0, a.ps0}
|
||||||
MOVDDUP(XMM0, fpr.R(a));
|
ADDPD(XMM0, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
|
||||||
MOVAPD(XMM1, fpr.R(b));
|
UNPCKHPD(XMM0, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1}
|
||||||
ADDPD(XMM0, R(XMM1));
|
|
||||||
UNPCKHPD(XMM0, fpr.R(c)); //merge
|
|
||||||
MOVAPD(fpr.R(d), XMM0);
|
|
||||||
break;
|
break;
|
||||||
case 11:
|
case 11:
|
||||||
// ps_sum1, do the sum in lower subregisters, merge lowers
|
MOVDDUP(XMM1, fpr.R(a)); // {a.ps0, a.ps0}
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
ADDPD(XMM1, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
|
||||||
MOVAPD(XMM1, fpr.R(b));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower
|
SHUFPD(XMM0, R(XMM1), 2); // {c.ps0, a.ps0 + b.ps1}
|
||||||
ADDPD(XMM0, R(XMM1)); // sum lowers
|
|
||||||
MOVAPD(XMM1, fpr.R(c));
|
|
||||||
UNPCKLPD(XMM1, R(XMM0)); // merge
|
|
||||||
MOVAPD(fpr.R(d), XMM1);
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
PanicAlert("ps_sum WTF!!!");
|
PanicAlert("ps_sum WTF!!!");
|
||||||
}
|
}
|
||||||
ForceSinglePrecisionP(fpr.RX(d));
|
ForceSinglePrecisionP(XMM0);
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
SetFPRFIfNeeded(inst, XMM0);
|
||||||
|
fpr.BindToRegister(d, false);
|
||||||
|
MOVAPD(fpr.RX(d), R(XMM0));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -244,37 +225,28 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||||
int a = inst.FA;
|
int a = inst.FA;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
fpr.Lock(a, c, d);
|
fpr.Lock(a, c, d);
|
||||||
fpr.BindToRegister(d, d == a || d == c, true);
|
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 12:
|
case 12:
|
||||||
// Single multiply scalar high
|
MOVDDUP(XMM0, fpr.R(c));
|
||||||
// TODO - faster version for when regs are different
|
|
||||||
MOVDDUP(XMM1, fpr.R(c));
|
|
||||||
Force25BitPrecision(XMM1, XMM0);
|
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
|
||||||
MULPD(XMM0, R(XMM1));
|
|
||||||
MOVAPD(fpr.R(d), XMM0);
|
|
||||||
break;
|
break;
|
||||||
case 13:
|
case 13:
|
||||||
// TODO - faster version for when regs are different
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
MOVAPD(XMM1, fpr.R(c));
|
SHUFPD(XMM0, R(XMM0), 3);
|
||||||
Force25BitPrecision(XMM1, XMM0);
|
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
|
||||||
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
|
|
||||||
MULPD(XMM0, R(XMM1));
|
|
||||||
MOVAPD(fpr.R(d), XMM0);
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
PanicAlert("ps_muls WTF!!!");
|
PanicAlert("ps_muls WTF!!!");
|
||||||
}
|
}
|
||||||
ForceSinglePrecisionP(fpr.RX(d));
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
MULPD(XMM0, fpr.R(a));
|
||||||
|
ForceSinglePrecisionP(XMM0);
|
||||||
|
SetFPRFIfNeeded(inst, XMM0);
|
||||||
|
fpr.BindToRegister(d, false);
|
||||||
|
MOVAPD(fpr.RX(d), R(XMM0));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//TODO: find easy cases and optimize them, do a breakout like ps_arith
|
|
||||||
void Jit64::ps_mergeXX(UGeckoInstruction inst)
|
void Jit64::ps_mergeXX(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
|
@ -305,7 +277,7 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst)
|
||||||
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
|
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
|
||||||
}
|
}
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
MOVAPD(fpr.RX(d), Gen::R(XMM0));
|
MOVAPD(fpr.RX(d), R(XMM0));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -373,8 +345,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
MOVAPD(fpr.RX(d), Gen::R(XMM0));
|
ForceSinglePrecisionP(XMM0);
|
||||||
ForceSinglePrecisionP(fpr.RX(d));
|
SetFPRFIfNeeded(inst, XMM0);
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
MOVAPD(fpr.RX(d), R(XMM0));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
|
@ -184,47 +184,63 @@ static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8,
|
||||||
|
|
||||||
static const float GC_ALIGNED16(m_quantizeTableS[]) =
|
static const float GC_ALIGNED16(m_quantizeTableS[]) =
|
||||||
{
|
{
|
||||||
(1ULL << 0), (1ULL << 1), (1ULL << 2), (1ULL << 3),
|
(1ULL << 0), (1ULL << 0), (1ULL << 1), (1ULL << 1), (1ULL << 2), (1ULL << 2), (1ULL << 3), (1ULL << 3),
|
||||||
(1ULL << 4), (1ULL << 5), (1ULL << 6), (1ULL << 7),
|
(1ULL << 4), (1ULL << 4), (1ULL << 5), (1ULL << 5), (1ULL << 6), (1ULL << 6), (1ULL << 7), (1ULL << 7),
|
||||||
(1ULL << 8), (1ULL << 9), (1ULL << 10), (1ULL << 11),
|
(1ULL << 8), (1ULL << 8), (1ULL << 9), (1ULL << 9), (1ULL << 10), (1ULL << 10), (1ULL << 11), (1ULL << 11),
|
||||||
(1ULL << 12), (1ULL << 13), (1ULL << 14), (1ULL << 15),
|
(1ULL << 12), (1ULL << 12), (1ULL << 13), (1ULL << 13), (1ULL << 14), (1ULL << 14), (1ULL << 15), (1ULL << 15),
|
||||||
(1ULL << 16), (1ULL << 17), (1ULL << 18), (1ULL << 19),
|
(1ULL << 16), (1ULL << 16), (1ULL << 17), (1ULL << 17), (1ULL << 18), (1ULL << 18), (1ULL << 19), (1ULL << 19),
|
||||||
(1ULL << 20), (1ULL << 21), (1ULL << 22), (1ULL << 23),
|
(1ULL << 20), (1ULL << 20), (1ULL << 21), (1ULL << 21), (1ULL << 22), (1ULL << 22), (1ULL << 23), (1ULL << 23),
|
||||||
(1ULL << 24), (1ULL << 25), (1ULL << 26), (1ULL << 27),
|
(1ULL << 24), (1ULL << 24), (1ULL << 25), (1ULL << 25), (1ULL << 26), (1ULL << 26), (1ULL << 27), (1ULL << 27),
|
||||||
(1ULL << 28), (1ULL << 29), (1ULL << 30), (1ULL << 31),
|
(1ULL << 28), (1ULL << 28), (1ULL << 29), (1ULL << 29), (1ULL << 30), (1ULL << 30), (1ULL << 31), (1ULL << 31),
|
||||||
1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1ULL << 30), 1.0 / (1ULL << 29),
|
1.0 / (1ULL << 32), 1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1ULL << 31),
|
||||||
1.0 / (1ULL << 28), 1.0 / (1ULL << 27), 1.0 / (1ULL << 26), 1.0 / (1ULL << 25),
|
1.0 / (1ULL << 30), 1.0 / (1ULL << 30), 1.0 / (1ULL << 29), 1.0 / (1ULL << 29),
|
||||||
1.0 / (1ULL << 24), 1.0 / (1ULL << 23), 1.0 / (1ULL << 22), 1.0 / (1ULL << 21),
|
1.0 / (1ULL << 28), 1.0 / (1ULL << 28), 1.0 / (1ULL << 27), 1.0 / (1ULL << 27),
|
||||||
1.0 / (1ULL << 20), 1.0 / (1ULL << 19), 1.0 / (1ULL << 18), 1.0 / (1ULL << 17),
|
1.0 / (1ULL << 26), 1.0 / (1ULL << 26), 1.0 / (1ULL << 25), 1.0 / (1ULL << 25),
|
||||||
1.0 / (1ULL << 16), 1.0 / (1ULL << 15), 1.0 / (1ULL << 14), 1.0 / (1ULL << 13),
|
1.0 / (1ULL << 24), 1.0 / (1ULL << 24), 1.0 / (1ULL << 23), 1.0 / (1ULL << 23),
|
||||||
1.0 / (1ULL << 12), 1.0 / (1ULL << 11), 1.0 / (1ULL << 10), 1.0 / (1ULL << 9),
|
1.0 / (1ULL << 22), 1.0 / (1ULL << 22), 1.0 / (1ULL << 21), 1.0 / (1ULL << 21),
|
||||||
1.0 / (1ULL << 8), 1.0 / (1ULL << 7), 1.0 / (1ULL << 6), 1.0 / (1ULL << 5),
|
1.0 / (1ULL << 20), 1.0 / (1ULL << 20), 1.0 / (1ULL << 19), 1.0 / (1ULL << 19),
|
||||||
1.0 / (1ULL << 4), 1.0 / (1ULL << 3), 1.0 / (1ULL << 2), 1.0 / (1ULL << 1),
|
1.0 / (1ULL << 18), 1.0 / (1ULL << 18), 1.0 / (1ULL << 17), 1.0 / (1ULL << 17),
|
||||||
|
1.0 / (1ULL << 16), 1.0 / (1ULL << 16), 1.0 / (1ULL << 15), 1.0 / (1ULL << 15),
|
||||||
|
1.0 / (1ULL << 14), 1.0 / (1ULL << 14), 1.0 / (1ULL << 13), 1.0 / (1ULL << 13),
|
||||||
|
1.0 / (1ULL << 12), 1.0 / (1ULL << 12), 1.0 / (1ULL << 11), 1.0 / (1ULL << 11),
|
||||||
|
1.0 / (1ULL << 10), 1.0 / (1ULL << 10), 1.0 / (1ULL << 9), 1.0 / (1ULL << 9),
|
||||||
|
1.0 / (1ULL << 8), 1.0 / (1ULL << 8), 1.0 / (1ULL << 7), 1.0 / (1ULL << 7),
|
||||||
|
1.0 / (1ULL << 6), 1.0 / (1ULL << 6), 1.0 / (1ULL << 5), 1.0 / (1ULL << 5),
|
||||||
|
1.0 / (1ULL << 4), 1.0 / (1ULL << 4), 1.0 / (1ULL << 3), 1.0 / (1ULL << 3),
|
||||||
|
1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1),
|
||||||
};
|
};
|
||||||
|
|
||||||
static const float GC_ALIGNED16(m_dequantizeTableS[]) =
|
static const float GC_ALIGNED16(m_dequantizeTableS[]) =
|
||||||
{
|
{
|
||||||
1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3),
|
1.0 / (1ULL << 0), 1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1),
|
||||||
1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / (1ULL << 6), 1.0 / (1ULL << 7),
|
1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3), 1.0 / (1ULL << 3),
|
||||||
1.0 / (1ULL << 8), 1.0 / (1ULL << 9), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11),
|
1.0 / (1ULL << 4), 1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / (1ULL << 5),
|
||||||
1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15),
|
1.0 / (1ULL << 6), 1.0 / (1ULL << 6), 1.0 / (1ULL << 7), 1.0 / (1ULL << 7),
|
||||||
1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19),
|
1.0 / (1ULL << 8), 1.0 / (1ULL << 8), 1.0 / (1ULL << 9), 1.0 / (1ULL << 9),
|
||||||
1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23),
|
1.0 / (1ULL << 10), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11), 1.0 / (1ULL << 11),
|
||||||
1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27),
|
1.0 / (1ULL << 12), 1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 13),
|
||||||
1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31),
|
1.0 / (1ULL << 14), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15), 1.0 / (1ULL << 15),
|
||||||
(1ULL << 32), (1ULL << 31), (1ULL << 30), (1ULL << 29),
|
1.0 / (1ULL << 16), 1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 17),
|
||||||
(1ULL << 28), (1ULL << 27), (1ULL << 26), (1ULL << 25),
|
1.0 / (1ULL << 18), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19), 1.0 / (1ULL << 19),
|
||||||
(1ULL << 24), (1ULL << 23), (1ULL << 22), (1ULL << 21),
|
1.0 / (1ULL << 20), 1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 21),
|
||||||
(1ULL << 20), (1ULL << 19), (1ULL << 18), (1ULL << 17),
|
1.0 / (1ULL << 22), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23), 1.0 / (1ULL << 23),
|
||||||
(1ULL << 16), (1ULL << 15), (1ULL << 14), (1ULL << 13),
|
1.0 / (1ULL << 24), 1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 25),
|
||||||
(1ULL << 12), (1ULL << 11), (1ULL << 10), (1ULL << 9),
|
1.0 / (1ULL << 26), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27), 1.0 / (1ULL << 27),
|
||||||
(1ULL << 8), (1ULL << 7), (1ULL << 6), (1ULL << 5),
|
1.0 / (1ULL << 28), 1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 29),
|
||||||
(1ULL << 4), (1ULL << 3), (1ULL << 2), (1ULL << 1),
|
1.0 / (1ULL << 30), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31), 1.0 / (1ULL << 31),
|
||||||
|
(1ULL << 32), (1ULL << 32), (1ULL << 31), (1ULL << 31), (1ULL << 30), (1ULL << 30), (1ULL << 29), (1ULL << 29),
|
||||||
|
(1ULL << 28), (1ULL << 28), (1ULL << 27), (1ULL << 27), (1ULL << 26), (1ULL << 26), (1ULL << 25), (1ULL << 25),
|
||||||
|
(1ULL << 24), (1ULL << 24), (1ULL << 23), (1ULL << 23), (1ULL << 22), (1ULL << 22), (1ULL << 21), (1ULL << 21),
|
||||||
|
(1ULL << 20), (1ULL << 20), (1ULL << 19), (1ULL << 19), (1ULL << 18), (1ULL << 18), (1ULL << 17), (1ULL << 17),
|
||||||
|
(1ULL << 16), (1ULL << 16), (1ULL << 15), (1ULL << 15), (1ULL << 14), (1ULL << 14), (1ULL << 13), (1ULL << 13),
|
||||||
|
(1ULL << 12), (1ULL << 12), (1ULL << 11), (1ULL << 11), (1ULL << 10), (1ULL << 10), (1ULL << 9), (1ULL << 9),
|
||||||
|
(1ULL << 8), (1ULL << 8), (1ULL << 7), (1ULL << 7), (1ULL << 6), (1ULL << 6), (1ULL << 5), (1ULL << 5),
|
||||||
|
(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
|
||||||
};
|
};
|
||||||
|
|
||||||
static float GC_ALIGNED16(psTemp[4]);
|
static float GC_ALIGNED16(psTemp[4]);
|
||||||
|
|
||||||
static const float GC_ALIGNED16(m_65535) = 65535.0f;
|
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
|
||||||
static const float GC_ALIGNED16(m_32767) = 32767.0f;
|
static const float GC_ALIGNED16(m_32767) = 32767.0f;
|
||||||
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
|
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
|
||||||
static const float GC_ALIGNED16(m_255) = 255.0f;
|
static const float GC_ALIGNED16(m_255) = 255.0f;
|
||||||
|
@ -273,14 +289,11 @@ void CommonAsmRoutines::GenQuantizedStores()
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storePairedU8 = AlignCode4();
|
const u8* storePairedU8 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
MOVSS(XMM1, M((void *)&m_65535));
|
MINPS(XMM0, M((void *)&m_65535));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MINPS(XMM0, R(XMM1));
|
|
||||||
#endif
|
#endif
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
PACKSSDW(XMM0, R(XMM0));
|
||||||
|
@ -291,14 +304,11 @@ void CommonAsmRoutines::GenQuantizedStores()
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storePairedS8 = AlignCode4();
|
const u8* storePairedS8 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
MOVSS(XMM1, M((void *)&m_65535));
|
MINPS(XMM0, M((void *)&m_65535));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MINPS(XMM0, R(XMM1));
|
|
||||||
#endif
|
#endif
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
PACKSSDW(XMM0, R(XMM0));
|
||||||
|
@ -310,17 +320,26 @@ void CommonAsmRoutines::GenQuantizedStores()
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storePairedU16 = AlignCode4();
|
const u8* storePairedU16 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
|
|
||||||
// PACKUSDW is available only in SSE4
|
if (cpu_info.bSSE4_1)
|
||||||
PXOR(XMM1, R(XMM1));
|
{
|
||||||
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
|
MINPS(XMM0, M((void *)&m_65535));
|
||||||
|
#endif
|
||||||
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
|
PACKUSDW(XMM0, R(XMM0));
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
BSWAP(32, RSCRATCH);
|
||||||
|
ROL(32, R(RSCRATCH), Imm8(16));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
XORPS(XMM1, R(XMM1));
|
||||||
MAXPS(XMM0, R(XMM1));
|
MAXPS(XMM0, R(XMM1));
|
||||||
MOVSS(XMM1, M((void *)&m_65535));
|
MINPS(XMM0, M((void *)&m_65535));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MINPS(XMM0, R(XMM1));
|
|
||||||
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
MOVQ_xmm(M(psTemp), XMM0);
|
MOVQ_xmm(M(psTemp), XMM0);
|
||||||
|
@ -329,22 +348,19 @@ void CommonAsmRoutines::GenQuantizedStores()
|
||||||
MOVZX(32, 16, RSCRATCH, M((char*)psTemp + 0));
|
MOVZX(32, 16, RSCRATCH, M((char*)psTemp + 0));
|
||||||
SHL(32, R(RSCRATCH), Imm8(16));
|
SHL(32, R(RSCRATCH), Imm8(16));
|
||||||
MOV(16, R(RSCRATCH), M((char*)psTemp + 4));
|
MOV(16, R(RSCRATCH), M((char*)psTemp + 4));
|
||||||
|
|
||||||
BSWAP(32, RSCRATCH);
|
BSWAP(32, RSCRATCH);
|
||||||
|
}
|
||||||
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storePairedS16 = AlignCode4();
|
const u8* storePairedS16 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
// SHUFPS or UNPCKLPS might be a better choice here. The last one might just be an alias though.
|
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
MOVSS(XMM1, M((void *)&m_65535));
|
MINPS(XMM0, M((void *)&m_65535));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MINPS(XMM0, R(XMM1));
|
|
||||||
#endif
|
#endif
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
PACKSSDW(XMM0, R(XMM0));
|
||||||
|
@ -395,10 +411,10 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||||
}*/
|
}*/
|
||||||
|
|
||||||
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
PXOR(XMM1, R(XMM1));
|
XORPS(XMM1, R(XMM1));
|
||||||
MAXSS(XMM0, R(XMM1));
|
MAXSS(XMM0, R(XMM1));
|
||||||
MINSS(XMM0, M((void *)&m_255));
|
MINSS(XMM0, M((void *)&m_255));
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
CVTTSS2SI(RSCRATCH, R(XMM0));
|
||||||
|
@ -406,7 +422,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storeSingleS8 = AlignCode4();
|
const u8* storeSingleS8 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
MAXSS(XMM0, M((void *)&m_m128));
|
MAXSS(XMM0, M((void *)&m_m128));
|
||||||
|
@ -416,10 +432,10 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
PXOR(XMM1, R(XMM1));
|
XORPS(XMM1, R(XMM1));
|
||||||
MAXSS(XMM0, R(XMM1));
|
MAXSS(XMM0, R(XMM1));
|
||||||
MINSS(XMM0, M((void *)&m_65535));
|
MINSS(XMM0, M((void *)&m_65535));
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
CVTTSS2SI(RSCRATCH, R(XMM0));
|
||||||
|
@ -427,7 +443,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storeSingleS16 = AlignCode4();
|
const u8* storeSingleS16 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
MAXSS(XMM0, M((void *)&m_m32768));
|
MAXSS(XMM0, M((void *)&m_m32768));
|
||||||
|
@ -507,13 +523,19 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
||||||
}
|
}
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVZXBD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
PXOR(XMM1, R(XMM1));
|
PXOR(XMM1, R(XMM1));
|
||||||
PUNPCKLBW(XMM0, R(XMM1));
|
PUNPCKLBW(XMM0, R(XMM1));
|
||||||
PUNPCKLWD(XMM0, R(XMM1));
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
|
@ -524,7 +546,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better?
|
CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better?
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
UNPCKLPS(XMM0, M((void*)m_one));
|
UNPCKLPS(XMM0, M((void*)m_one));
|
||||||
|
@ -542,13 +564,19 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
||||||
}
|
}
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVSXBD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
PUNPCKLBW(XMM0, R(XMM0));
|
PUNPCKLBW(XMM0, R(XMM0));
|
||||||
PUNPCKLWD(XMM0, R(XMM0));
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
PSRAD(XMM0, 24);
|
PSRAD(XMM0, 24);
|
||||||
|
}
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
|
@ -559,7 +587,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
UNPCKLPS(XMM0, M((void*)m_one));
|
UNPCKLPS(XMM0, M((void*)m_one));
|
||||||
|
@ -573,12 +601,18 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
||||||
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVZXWD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
PXOR(XMM1, R(XMM1));
|
PXOR(XMM1, R(XMM1));
|
||||||
PUNPCKLWD(XMM0, R(XMM1));
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
|
@ -589,7 +623,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
UNPCKLPS(XMM0, M((void*)m_one));
|
UNPCKLPS(XMM0, M((void*)m_one));
|
||||||
|
@ -602,12 +636,18 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
||||||
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVSXWD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
PUNPCKLWD(XMM0, R(XMM0));
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
PSRAD(XMM0, 16);
|
PSRAD(XMM0, 16);
|
||||||
|
}
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
|
@ -618,7 +658,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH2), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
UNPCKLPS(XMM0, M((void*)m_one));
|
UNPCKLPS(XMM0, M((void*)m_one));
|
||||||
|
|
Loading…
Reference in New Issue