Refactor the paired load/store code
Simplification/reduction of duplicated code. Detect other constant GQR values and inline loads (a 5-10% speedup), and dispatch stores directly to the AOT-generated methods.
This commit is contained in:
parent 6b01eca3a0
commit 4aa5291f54
@@ -672,27 +672,20 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
js.skipInstructions = 0;
js.carryFlagSet = false;
js.carryFlagInverted = false;
js.assumeNoPairedQuantize = false;
js.constantGqr.clear();

// If the block only uses one GQR and the GQR is zero at compile time, make a guess that the block
// never uses quantized loads/stores. Many paired-heavy games use largely float loads and stores,
// Assume that GQR values don't change often at runtime. Many paired-heavy games use largely float
// loads and stores,
// which are significantly faster when inlined (especially in MMU mode, where this lets them use
// fastmem).
// Insert a check that the GQR is still zero at the start of the block in case our guess turns out
// wrong.
// TODO: support any other constant GQR value, not merely zero/unquantized: we can optimize
// quantized
// loadstores too, it'd just be more code.
if (code_block.m_gqr_used.Count() == 1 &&
js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
if (js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
{
int gqr = *code_block.m_gqr_used.begin();
if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
// If there are GQRs used but not set, we'll treat those as constant and optimize them
BitSet8 gqr_static = ComputeStaticGQRs(code_block);
if (gqr_static)
{
CMP(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm8(0));
FixupBranch failure = J_CC(CC_NZ, true);
SwitchToFarCode();
SetJumpTarget(failure);
const u8* target = GetCodePtr();
MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionC((void*)&JitInterface::CompileExceptionCheck,
@@ -700,7 +693,16 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
ABI_PopRegistersAndAdjustStack({}, 0);
JMP(asm_routines.dispatcher, true);
SwitchToNearCode();
js.assumeNoPairedQuantize = true;

// Insert a check that the GQRs are still the value we expect at
// the start of the block in case our guess turns out wrong.
for (int gqr : gqr_static)
{
u32 value = GQR(gqr);
js.constantGqr[gqr] = value;
CMP_or_TEST(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm32(value));
J_CC(CC_NZ, target);
}
}
}
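The far-code stub above is shared by all of the GQR checks that follow: each guard costs one CMP_or_TEST plus a normally-untaken branch per constant GQR. A rough C++ sketch of what the emitted guard amounts to at block entry (names mirror the diff; the far-code plumbing is elided):

    for (const auto& entry : js.constantGqr)
    {
      // entry.first = GQR index, entry.second = the value seen at compile time
      if (GQR(entry.first) != entry.second)
      {
        // branch taken: far code sets pc to js.blockStart, calls
        // JitInterface::CompileExceptionCheck, and jumps back to the dispatcher
      }
    }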
@@ -947,7 +949,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
return normalEntry;
}

BitSet32 Jit64::CallerSavedRegistersInUse()
BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const
{
return cb.m_gqr_used & ~cb.m_gqr_modified;
}

BitSet32 Jit64::CallerSavedRegistersInUse() const
{
BitSet32 result;
for (int i = 0; i < NUMXREGS; i++)

@@ -64,7 +64,8 @@
void Jit(u32 em_address) override;
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);

BitSet32 CallerSavedRegistersInUse();
BitSet32 CallerSavedRegistersInUse() const;
BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;

JitBlockCache* GetBlockCache() override { return &blocks; }
void Trace();
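ComputeStaticGQRs is a pure set difference over the block-analysis bitsets: any GQR the block reads but never writes can be treated as a compile-time constant. A minimal sketch with made-up values:

    BitSet8 used(0b00000110);      // block reads GQR1 and GQR2
    BitSet8 modified(0b00000100);  // block writes GQR2
    BitSet8 gqr_static = used & ~modified;  // only GQR1 remains: read but never written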
@@ -35,8 +35,12 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);

auto it = js.constantGqr.find(i);
bool gqrIsConstant = it != js.constantGqr.end();
u32 gqrValue = gqrIsConstant ? it->second & 0xffff : 0;

gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)
if (gqrIsConstant && gqrValue == 0)
{
int storeOffset = 0;
gpr.BindToRegister(a, true, update);
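Note the `& 0xffff` here versus the `>> 16` in psq_lXX further down: a GQR packs two independent configurations, and each instruction only consumes its own half. A small illustration of the split:

    u32 gqr = it->second;        // full 32-bit GQR value recorded at compile time
    u32 st_half = gqr & 0xffff;  // ST_SCALE/ST_TYPE in the low 16 bits, used by psq_st
    u32 ld_half = gqr >> 16;     // LD_SCALE/LD_TYPE in the high 16 bits, used by psq_l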
@@ -125,25 +129,68 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
// In memcheck mode, don't update the address until the exception check
if (update && !jo.memcheck)
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
// Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
// Hence, we need to mask out the unused bits. The layout of the GQR register is
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
// 0b0011111100000111, or 0x3F07.
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + i]));
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));

if (w)
if (gqrIsConstant)
{
// One value
CVTSD2SS(XMM0, fpr.R(s));
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
// Paired stores don't yield any real change in performance right now, but if we can
// improve fastmem support this might change
//#define INLINE_PAIRED_STORES
#ifdef INLINE_PAIRED_STORES
if (w)
{
// One value
CVTSD2SS(XMM0, fpr.R(s));
GenQuantizedStore(true, static_cast<EQuantizeType>(gqrValue & 0x7), (gqrValue & 0x3F00) >> 8);
}
else
{
// Pair of values
CVTPD2PS(XMM0, fpr.R(s));
GenQuantizedStore(false, static_cast<EQuantizeType>(gqrValue & 0x7),
(gqrValue & 0x3F00) >> 8);
}
#else
// We know what GQR is here, so we can load RSCRATCH2 and call into the store method directly
// with just the scale bits.
int type = gqrValue & 0x7;
MOV(32, R(RSCRATCH2), Imm32(gqrValue & 0x3F00));

if (w)
{
// One value
CVTSD2SS(XMM0, fpr.R(s));
CALL(asm_routines.singleStoreQuantized[type]);
}
else
{
// Pair of values
CVTPD2PS(XMM0, fpr.R(s));
CALL(asm_routines.pairedStoreQuantized[type]);
}
#endif
}
else
{
// Pair of values
CVTPD2PS(XMM0, fpr.R(s));
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
// Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
// Hence, we need to mask out the unused bits. The layout of the GQR register is
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
// 0b0011111100000111, or 0x3F07.
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + i]));
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));

if (w)
{
// One value
CVTSD2SS(XMM0, fpr.R(s));
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
}
else
{
// Pair of values
CVTPD2PS(XMM0, fpr.R(s));
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
}
}

if (update && jo.memcheck)
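A worked example of the 0x3F07 mask described in the comment above, using a hypothetical store half of 0xEDC7 (garbage in the unused U bits, as Dirt 2 produces):

      0xEDC7 = 0b1110110111000111
    & 0x3F07 = 0b0011111100000111
    ------------------------------
      0x2D07 = 0b0010110100000111   // SCALE = 0b101101 (bits 8-13), TYPE = 0b111 (S16)

The MOVZX of the low byte then isolates TYPE for the table dispatch, while RSCRATCH2 keeps SCALE in its second byte for the quantization-table lookup.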
@@ -173,8 +220,13 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);

auto it = js.constantGqr.find(i);
bool gqrIsConstant = it != js.constantGqr.end();
u32 gqrValue = gqrIsConstant ? it->second >> 16 : 0;

gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)

if (gqrIsConstant && gqrValue == 0)
{
s32 loadOffset = 0;
gpr.BindToRegister(a, true, update);

@@ -302,16 +354,24 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
// In memcheck mode, don't update the address until the exception check
if (update && !jo.memcheck)
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
MOV(32, R(RSCRATCH2), Imm32(0x3F07));

// Get the high part of the GQR register
OpArg gqr = PPCSTATE(spr[SPR_GQR0 + i]);
gqr.AddMemOffset(2);
if (gqrIsConstant)
{
GenQuantizedLoad(w == 1, static_cast<EQuantizeType>(gqrValue & 0x7), (gqrValue & 0x3F00) >> 8);
}
else
{
MOV(32, R(RSCRATCH2), Imm32(0x3F07));

AND(32, R(RSCRATCH2), gqr);
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
// Get the high part of the GQR register
OpArg gqr = PPCSTATE(spr[SPR_GQR0 + i]);
gqr.AddMemOffset(2);

CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8])));
AND(32, R(RSCRATCH2), gqr);
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));

CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8])));
}

MemoryExceptionCheck();
CVTPS2PD(fpr.RX(s), R(XMM0));
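The non-constant path dispatches through a 16-entry table: entries 0-7 are the paired (two-value) load routines, entries 8-15 the single-value ones, each indexed by TYPE. The CALLptr above computes, in effect:

    // RSCRATCH holds TYPE after the MOVZX; w selects the single-load half of the table
    const u8* handler = asm_routines.pairedLoadQuantized[w * 8 + type];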
@@ -10,6 +10,7 @@
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
#include "Core/HW/GPFifo.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/JitCommon/JitBase.h"
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Core/PowerPC/PowerPC.h"

@@ -219,438 +220,416 @@ alignas(16) static const float m_255 = 255.0f;
alignas(16) static const float m_127 = 127.0f;
alignas(16) static const float m_m128 = -128.0f;

#define QUANTIZE_OVERFLOW_SAFE
// Sizes of the various quantized store types
constexpr std::array<u8, 8> sizes{{32, 0, 0, 0, 8, 16, 8, 16}};
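The table is indexed by EQuantizeType, the three-bit TYPE field of a GQR half (indices 1-3 are the invalid encodings). For reference, the enum values this assumes, matching the table assignments later in this file:

    enum EQuantizeType : u32
    {
      QUANTIZE_FLOAT = 0,     // sizes[0] = 32 bits per element
      QUANTIZE_INVALID1 = 1,  // sizes[1..3] = 0
      QUANTIZE_INVALID2 = 2,
      QUANTIZE_INVALID3 = 3,
      QUANTIZE_U8 = 4,        // sizes[4] = 8
      QUANTIZE_U16 = 5,       // sizes[5] = 16
      QUANTIZE_S8 = 6,        // sizes[6] = 8
      QUANTIZE_S16 = 7,       // sizes[7] = 16
    };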
// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of
// int32 range
// while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games
// but it potentially can cause problems, so we need some clamping

// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores()
{
const void* start = GetCodePtr();

const u8* storePairedIllegal = AlignCode4();
UD2();

const u8* storePairedFloat = AlignCode4();
if (cpu_info.bSSSE3)
{
PSHUFB(XMM0, M((void*)pbswapShuffle2x4));
MOVQ_xmm(R(RSCRATCH), XMM0);
}
else
{
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
BSWAP(64, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

RET();

const u8* storePairedU8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

RET();

const u8* storePairedS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);

SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

RET();

const u8* storePairedU16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));

if (cpu_info.bSSE4_1)
{
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKUSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
}
else
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
MINPS(XMM0, M(m_65535));

CVTTPS2DQ(XMM0, R(XMM0));
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
}

SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

RET();

const u8* storePairedS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

RET();

JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore");

pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));

pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
pairedStoreQuantized[3] = storePairedIllegal;
pairedStoreQuantized[4] = storePairedU8;
pairedStoreQuantized[5] = storePairedU16;
pairedStoreQuantized[6] = storePairedS8;
pairedStoreQuantized[7] = storePairedS16;
for (int type = 0; type < 8; type++)
pairedStoreQuantized[type] = GenQuantizedStoreRuntime(false, static_cast<EQuantizeType>(type));
}

// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
const void* start = GetCodePtr();

const u8* storeSingleIllegal = AlignCode4();
UD2();

// Easy!
const u8* storeSingleFloat = AlignCode4();
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

const u8* storeSingleS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

const u8* storeSingleS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedSingleStore");

singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));

singleStoreQuantized[0] = storeSingleFloat;
singleStoreQuantized[1] = storeSingleIllegal;
singleStoreQuantized[2] = storeSingleIllegal;
singleStoreQuantized[3] = storeSingleIllegal;
singleStoreQuantized[4] = storeSingleU8;
singleStoreQuantized[5] = storeSingleU16;
singleStoreQuantized[6] = storeSingleS8;
singleStoreQuantized[7] = storeSingleS16;
for (int type = 0; type < 8; type++)
singleStoreQuantized[type] = GenQuantizedStoreRuntime(true, static_cast<EQuantizeType>(type));
}

const u8* CommonAsmRoutines::GenQuantizedStoreRuntime(bool single, EQuantizeType type)
{
const void* start = GetCodePtr();
const u8* load = AlignCode4();
GenQuantizedStore(single, type, -1);
RET();
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore_%i_%i", type, single);

return load;
}

void CommonAsmRoutines::GenQuantizedLoads()
{
const void* start = GetCodePtr();

const u8* loadPairedIllegal = AlignCode4();
UD2();

// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
const u8* loadPairedFloatTwo = AlignCode4();
if (jit->jo.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
RET();

const u8* loadPairedFloatOne = AlignCode4();
if (jit->jo.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
RET();

const u8* loadPairedU8Two = AlignCode4();
if (jit->jo.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();

const u8* loadPairedU8One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();

const u8* loadPairedS8Two = AlignCode4();
if (jit->jo.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();

const u8* loadPairedS8One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();

const u8* loadPairedU16Two = AlignCode4();
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();

const u8* loadPairedU16One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();

const u8* loadPairedS16Two = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();

const u8* loadPairedS16One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();

JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad");

pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));

pairedLoadQuantized[0] = loadPairedFloatTwo;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8Two;
pairedLoadQuantized[5] = loadPairedU16Two;
pairedLoadQuantized[6] = loadPairedS8Two;
pairedLoadQuantized[7] = loadPairedS16Two;

pairedLoadQuantized[8] = loadPairedFloatOne;
pairedLoadQuantized[9] = loadPairedIllegal;
pairedLoadQuantized[10] = loadPairedIllegal;
pairedLoadQuantized[11] = loadPairedIllegal;
pairedLoadQuantized[12] = loadPairedU8One;
pairedLoadQuantized[13] = loadPairedU16One;
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
for (int type = 0; type < 8; type++)
pairedLoadQuantized[type] = GenQuantizedLoadRuntime(false, static_cast<EQuantizeType>(type));
for (int type = 0; type < 8; type++)
pairedLoadQuantized[type + 8] = GenQuantizedLoadRuntime(true, static_cast<EQuantizeType>(type));
}

const u8* CommonAsmRoutines::GenQuantizedLoadRuntime(bool single, EQuantizeType type)
{
const void* start = GetCodePtr();
const u8* load = AlignCode4();
GenQuantizedLoad(single, type, -1);
RET();
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad_%i_%i", type, single);

return load;
}

void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type, int quantize)
{
// In: one or two single floats in XMM0, if quantize is -1, a quantization factor in RSCRATCH2

int size = sizes[type] * (single ? 1 : 2);
bool isInline = quantize != -1;

// illegal
if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
{
UD2();
return;
}

if (type == QUANTIZE_FLOAT)
{
GenQuantizedStoreFloat(single, isInline);
}
else if (single)
{
if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
}
else if (quantize > 0)
{
MULSS(XMM0, M(&m_quantizeTableS[quantize * 2]));
}

switch (type)
{
case QUANTIZE_U8:
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
break;
case QUANTIZE_S8:
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
break;
case QUANTIZE_U16:
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
break;
case QUANTIZE_S16:
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
break;
default:
break;
}

CVTTSS2SI(RSCRATCH, R(XMM0));
}
else
{
if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
}
else if (quantize > 0)
{
MOVQ_xmm(XMM1, M(&m_quantizeTableS[quantize * 2]));
MULPS(XMM0, R(XMM1));
}

bool hasPACKUSDW = cpu_info.bSSE4_1;

// Special case: if we don't have PACKUSDW we need to clamp to zero as well so the shuffle
// below can work
if (type == QUANTIZE_U16 && !hasPACKUSDW)
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
}

// According to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value
// is out of int32 range while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games but it potentially can
// cause problems, so we need some clamping
MINPS(XMM0, M(m_65535));
CVTTPS2DQ(XMM0, R(XMM0));

switch (type)
{
case QUANTIZE_U8:
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
break;
case QUANTIZE_S8:
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
break;
case QUANTIZE_U16:
if (hasPACKUSDW)
{
PACKUSDW(XMM0, R(XMM0)); // AAAABBBB CCCCDDDD ... -> AABBCCDD ...
MOVD_xmm(R(RSCRATCH), XMM0); // AABBCCDD ... -> AABBCCDD
BSWAP(32, RSCRATCH); // AABBCCDD -> DDCCBBAA
ROL(32, R(RSCRATCH), Imm8(16)); // DDCCBBAA -> BBAADDCC
}
else
{
// We don't have PACKUSDW so we'll shuffle instead (assumes 32-bit values >= 0 and < 65536)
PSHUFLW(XMM0, R(XMM0), 2); // AABB0000 CCDD0000 ... -> CCDDAABB ...
MOVD_xmm(R(RSCRATCH), XMM0); // CCDDAABB ... -> CCDDAABB
BSWAP(32, RSCRATCH); // CCDDAABB -> BBAADDCC
}
break;
case QUANTIZE_S16:
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
break;
default:
break;
}
}

int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
if (!single)
flags |= SAFE_LOADSTORE_NO_SWAP;

SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, size, 0, QUANTIZED_REGS_TO_SAVE, flags);
}
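A note on the quantize == -1 path in both halves of this function: the scale arrives in RSCRATCH2 still shifted left by 8 (bits 8-13 of the 0x3F07-masked GQR half), and each quantization-table entry is a pair of floats, 8 bytes wide. That is why the runtime paths start with SHR by 5:

    u32 rscratch2 = (scale << 8) | type;  // the masked GQR half as the caller leaves it
    u32 offset = rscratch2 >> 5;          // TYPE shifts out; equals scale * 8, i.e. scale * sizeof(float[2])
    // MULSS/MOVQ_xmm then index m_quantizeTableS at this byte offset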
void QuantizedMemoryRoutines::GenQuantizedStoreFloat(bool single, bool isInline)
{
if (single)
{
// Easy!
MOVD_xmm(R(RSCRATCH), XMM0);
}
else
{
if (cpu_info.bSSSE3)
{
PSHUFB(XMM0, M(pbswapShuffle2x4));
MOVQ_xmm(R(RSCRATCH), XMM0);
}
else
{
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
BSWAP(64, RSCRATCH);
}
}
}

void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type, int quantize)
{
// Note that this method assumes that inline methods know the value of quantize ahead of
// time. The methods generated AOT assume that the quantize flag is placed in RSCRATCH2 in
// the second lowest byte, i.e. 0x0000xx00

int size = sizes[type] * (single ? 1 : 2);
bool isInline = quantize != -1;

// illegal
if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
{
UD2();
return;
}

// Floats don't use quantization and can generate more optimal code
if (type == QUANTIZE_FLOAT)
{
GenQuantizedLoadFloat(single, isInline);
return;
}

bool extend = single && (type == QUANTIZE_S8 || type == QUANTIZE_S16);

if (jit->jo.memcheck)
{
BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE_LOAD;
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
if (!single && (type == QUANTIZE_U8 || type == QUANTIZE_S8))
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
}
else
{
switch (type)
{
case QUANTIZE_U8:
case QUANTIZE_S8:
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
break;
case QUANTIZE_U16:
case QUANTIZE_S16:
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
break;
default:
break;
}
}

if (single)
{
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));

if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
}
else if (quantize > 0)
{
MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2]));
}
UNPCKLPS(XMM0, M(m_one));
}
else
{
switch (type)
{
case QUANTIZE_U8:
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
break;
case QUANTIZE_S8:
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
break;
case QUANTIZE_U16:
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
break;
case QUANTIZE_S16:
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
break;
default:
break;
}
CVTDQ2PS(XMM0, R(XMM0));

if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
}
else if (quantize > 0)
{
MOVQ_xmm(XMM1, M(&m_dequantizeTableS[quantize * 2]));
MULPS(XMM0, R(XMM1));
}
}

return;
}
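The two callers follow the quantize-parameter convention directly (illustrative values; the real calls are in psq_lXX and GenQuantizedLoadRuntime above):

    GenQuantizedLoad(false, QUANTIZE_S16, 13);  // inline JIT path: paired load, scale 13 baked in
    GenQuantizedLoad(true, QUANTIZE_U8, -1);    // AOT stub: single load, scale taken from RSCRATCH2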
void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
{
int size = single ? 32 : 64;
bool extend = false;

if (jit->jo.memcheck)
{
BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE;
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
}

if (single)
{
if (jit->jo.memcheck)
{
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle1x4));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
}

UNPCKLPS(XMM0, M(m_one));
}
else
{
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
if (jit->jo.memcheck)
{
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
}
}

@@ -7,16 +7,31 @@
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/Jit_Util.h"

class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
{
protected:
void GenQuantizedLoads();
void GenQuantizedStores();
void GenQuantizedSingleStores();
enum EQuantizeType : u32;

class QuantizedMemoryRoutines : public EmuCodeBlock
{
public:
void GenQuantizedLoad(bool single, EQuantizeType type, int quantize);
void GenQuantizedStore(bool single, EQuantizeType type, int quantize);

private:
void GenQuantizedLoadFloat(bool single, bool isInline);
void GenQuantizedStoreFloat(bool single, bool isInline);
};

class CommonAsmRoutines : public CommonAsmRoutinesBase, public QuantizedMemoryRoutines
{
public:
void GenFifoWrite(int size);
void GenFrsqrte();
void GenFres();
void GenMfcr();

protected:
const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
void GenQuantizedLoads();
void GenQuantizedStores();
void GenQuantizedSingleStores();
};
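The net effect of this header shuffle is that the quantized load/store emitters live in their own base class, so both the AOT stub generator and (via JitBase.h below) the per-block JIT can emit them:

    EmuCodeBlock <- QuantizedMemoryRoutines <- CommonAsmRoutines (with CommonAsmRoutinesBase)
    EmuCodeBlock <- QuantizedMemoryRoutines <- Jitx86Base (with JitBase)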
@@ -8,6 +8,7 @@
//#define JIT_LOG_GPR // Enables logging of the PPC general purpose regs
//#define JIT_LOG_FPR // Enables logging of the PPC floating point regs

#include <map>
#include <unordered_set>

#include "Common/CommonTypes.h"

@@ -88,6 +89,7 @@
int revertFprLoad;

bool assumeNoPairedQuantize;
std::map<u8, u32> constantGqr;
bool firstFPInstructionFound;
bool isLastInstruction;
int skipInstructions;

@@ -130,7 +132,7 @@
virtual bool HandleStackFault() { return false; }
};

class Jitx86Base : public JitBase, public EmuCodeBlock
class Jitx86Base : public JitBase, public QuantizedMemoryRoutines
{
protected:
bool BackPatch(u32 emAddress, SContext* ctx);