Refactor the paired load/store code
Simplifies the paired load/store code and reduces duplication. Also detects constant GQR values other than zero and inlines the corresponding loads (5-10% speedup), and dispatches stores directly to the matching AOT methods.
This commit is contained in:
parent
6b01eca3a0
commit
4aa5291f54
|
@ -672,27 +672,20 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
|
||||||
js.skipInstructions = 0;
|
js.skipInstructions = 0;
|
||||||
js.carryFlagSet = false;
|
js.carryFlagSet = false;
|
||||||
js.carryFlagInverted = false;
|
js.carryFlagInverted = false;
|
||||||
js.assumeNoPairedQuantize = false;
|
js.constantGqr.clear();
|
||||||
|
|
||||||
// If the block only uses one GQR and the GQR is zero at compile time, make a guess that the block
|
// Assume that GQR values don't change often at runtime. Many paired-heavy games use largely float
|
||||||
// never uses quantized loads/stores. Many paired-heavy games use largely float loads and stores,
|
// loads and stores,
|
||||||
// which are significantly faster when inlined (especially in MMU mode, where this lets them use
|
// which are significantly faster when inlined (especially in MMU mode, where this lets them use
|
||||||
// fastmem).
|
// fastmem).
|
||||||
// Insert a check that the GQR is still zero at the start of the block in case our guess turns out
|
if (js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
|
||||||
// wrong.
|
|
||||||
// TODO: support any other constant GQR value, not merely zero/unquantized: we can optimize
|
|
||||||
// quantized
|
|
||||||
// loadstores too, it'd just be more code.
|
|
||||||
if (code_block.m_gqr_used.Count() == 1 &&
|
|
||||||
js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
|
|
||||||
{
|
{
|
||||||
int gqr = *code_block.m_gqr_used.begin();
|
// If there are GQRs used but not set, we'll treat those as constant and optimize them
|
||||||
if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
|
BitSet8 gqr_static = ComputeStaticGQRs(code_block);
|
||||||
|
if (gqr_static)
|
||||||
{
|
{
|
||||||
CMP(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm8(0));
|
|
||||||
FixupBranch failure = J_CC(CC_NZ, true);
|
|
||||||
SwitchToFarCode();
|
SwitchToFarCode();
|
||||||
SetJumpTarget(failure);
|
const u8* target = GetCodePtr();
|
||||||
MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
|
MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
|
||||||
ABI_PushRegistersAndAdjustStack({}, 0);
|
ABI_PushRegistersAndAdjustStack({}, 0);
|
||||||
ABI_CallFunctionC((void*)&JitInterface::CompileExceptionCheck,
|
ABI_CallFunctionC((void*)&JitInterface::CompileExceptionCheck,
|
||||||
|
@ -700,7 +693,16 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
|
||||||
ABI_PopRegistersAndAdjustStack({}, 0);
|
ABI_PopRegistersAndAdjustStack({}, 0);
|
||||||
JMP(asm_routines.dispatcher, true);
|
JMP(asm_routines.dispatcher, true);
|
||||||
SwitchToNearCode();
|
SwitchToNearCode();
|
||||||
js.assumeNoPairedQuantize = true;
|
|
||||||
|
// Insert a check that the GQRs are still the value we expect at
|
||||||
|
// the start of the block in case our guess turns out wrong.
|
||||||
|
for (int gqr : gqr_static)
|
||||||
|
{
|
||||||
|
u32 value = GQR(gqr);
|
||||||
|
js.constantGqr[gqr] = value;
|
||||||
|
CMP_or_TEST(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm32(value));
|
||||||
|
J_CC(CC_NZ, target);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -947,7 +949,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
|
||||||
return normalEntry;
|
return normalEntry;
|
||||||
}
|
}
|
||||||
|
|
||||||
BitSet32 Jit64::CallerSavedRegistersInUse()
|
BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const
|
||||||
|
{
|
||||||
|
return cb.m_gqr_used & ~cb.m_gqr_modified;
|
||||||
|
}
|
||||||
|
|
||||||
|
BitSet32 Jit64::CallerSavedRegistersInUse() const
|
||||||
{
|
{
|
||||||
BitSet32 result;
|
BitSet32 result;
|
||||||
for (int i = 0; i < NUMXREGS; i++)
|
for (int i = 0; i < NUMXREGS; i++)
|
||||||
|
|
|
@ -64,7 +64,8 @@ public:
|
||||||
void Jit(u32 em_address) override;
|
void Jit(u32 em_address) override;
|
||||||
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
|
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
|
||||||
|
|
||||||
BitSet32 CallerSavedRegistersInUse();
|
BitSet32 CallerSavedRegistersInUse() const;
|
||||||
|
BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;
|
||||||
|
|
||||||
JitBlockCache* GetBlockCache() override { return &blocks; }
|
JitBlockCache* GetBlockCache() override { return &blocks; }
|
||||||
void Trace();
|
void Trace();
|
||||||
|
|
|
@ -35,8 +35,12 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
|
||||||
int w = indexed ? inst.Wx : inst.W;
|
int w = indexed ? inst.Wx : inst.W;
|
||||||
FALLBACK_IF(!a);
|
FALLBACK_IF(!a);
|
||||||
|
|
||||||
|
auto it = js.constantGqr.find(i);
|
||||||
|
bool gqrIsConstant = it != js.constantGqr.end();
|
||||||
|
u32 gqrValue = gqrIsConstant ? it->second & 0xffff : 0;
|
||||||
|
|
||||||
gpr.Lock(a, b);
|
gpr.Lock(a, b);
|
||||||
if (js.assumeNoPairedQuantize)
|
if (gqrIsConstant && gqrValue == 0)
|
||||||
{
|
{
|
||||||
int storeOffset = 0;
|
int storeOffset = 0;
|
||||||
gpr.BindToRegister(a, true, update);
|
gpr.BindToRegister(a, true, update);
|
||||||
|
@ -125,25 +129,68 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
|
||||||
// In memcheck mode, don't update the address until the exception check
|
// In memcheck mode, don't update the address until the exception check
|
||||||
if (update && !jo.memcheck)
|
if (update && !jo.memcheck)
|
||||||
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
||||||
// Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
|
|
||||||
// Hence, we need to mask out the unused bits. The layout of the GQR register is
|
|
||||||
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
|
|
||||||
// 0b0011111100000111, or 0x3F07.
|
|
||||||
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
|
|
||||||
AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + i]));
|
|
||||||
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
|
||||||
|
|
||||||
if (w)
|
if (gqrIsConstant)
|
||||||
{
|
{
|
||||||
// One value
|
// Paired stores don't yield any real change in performance right now, but if we can
|
||||||
CVTSD2SS(XMM0, fpr.R(s));
|
// improve fastmem support this might change
|
||||||
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
|
//#define INLINE_PAIRED_STORES
|
||||||
|
#ifdef INLINE_PAIRED_STORES
|
||||||
|
if (w)
|
||||||
|
{
|
||||||
|
// One value
|
||||||
|
CVTSD2SS(XMM0, fpr.R(s));
|
||||||
|
GenQuantizedStore(true, static_cast<EQuantizeType>(gqrValue & 0x7), (gqrValue & 0x3F00) >> 8);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Pair of values
|
||||||
|
CVTPD2PS(XMM0, fpr.R(s));
|
||||||
|
GenQuantizedStore(false, static_cast<EQuantizeType>(gqrValue & 0x7),
|
||||||
|
(gqrValue & 0x3F00) >> 8);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// We know what GQR is here, so we can load RSCRATCH2 and call into the store method directly
|
||||||
|
// with just the scale bits.
|
||||||
|
int type = gqrValue & 0x7;
|
||||||
|
MOV(32, R(RSCRATCH2), Imm32(gqrValue & 0x3F00));
|
||||||
|
|
||||||
|
if (w)
|
||||||
|
{
|
||||||
|
// One value
|
||||||
|
CVTSD2SS(XMM0, fpr.R(s));
|
||||||
|
CALL(asm_routines.singleStoreQuantized[type]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Pair of values
|
||||||
|
CVTPD2PS(XMM0, fpr.R(s));
|
||||||
|
CALL(asm_routines.pairedStoreQuantized[type]);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Pair of values
|
// Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
|
||||||
CVTPD2PS(XMM0, fpr.R(s));
|
// Hence, we need to mask out the unused bits. The layout of the GQR register is
|
||||||
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
|
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
|
||||||
|
// 0b0011111100000111, or 0x3F07.
|
||||||
|
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
|
||||||
|
AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + i]));
|
||||||
|
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
||||||
|
|
||||||
|
if (w)
|
||||||
|
{
|
||||||
|
// One value
|
||||||
|
CVTSD2SS(XMM0, fpr.R(s));
|
||||||
|
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Pair of values
|
||||||
|
CVTPD2PS(XMM0, fpr.R(s));
|
||||||
|
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (update && jo.memcheck)
|
if (update && jo.memcheck)
|
||||||
|
@ -173,8 +220,13 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
|
||||||
int w = indexed ? inst.Wx : inst.W;
|
int w = indexed ? inst.Wx : inst.W;
|
||||||
FALLBACK_IF(!a);
|
FALLBACK_IF(!a);
|
||||||
|
|
||||||
|
auto it = js.constantGqr.find(i);
|
||||||
|
bool gqrIsConstant = it != js.constantGqr.end();
|
||||||
|
u32 gqrValue = gqrIsConstant ? it->second >> 16 : 0;
|
||||||
|
|
||||||
gpr.Lock(a, b);
|
gpr.Lock(a, b);
|
||||||
if (js.assumeNoPairedQuantize)
|
|
||||||
|
if (gqrIsConstant && gqrValue == 0)
|
||||||
{
|
{
|
||||||
s32 loadOffset = 0;
|
s32 loadOffset = 0;
|
||||||
gpr.BindToRegister(a, true, update);
|
gpr.BindToRegister(a, true, update);
|
||||||
|
@ -302,16 +354,24 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
|
||||||
// In memcheck mode, don't update the address until the exception check
|
// In memcheck mode, don't update the address until the exception check
|
||||||
if (update && !jo.memcheck)
|
if (update && !jo.memcheck)
|
||||||
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
||||||
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
|
|
||||||
|
|
||||||
// Get the high part of the GQR register
|
if (gqrIsConstant)
|
||||||
OpArg gqr = PPCSTATE(spr[SPR_GQR0 + i]);
|
{
|
||||||
gqr.AddMemOffset(2);
|
GenQuantizedLoad(w == 1, static_cast<EQuantizeType>(gqrValue & 0x7), (gqrValue & 0x3F00) >> 8);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
|
||||||
|
|
||||||
AND(32, R(RSCRATCH2), gqr);
|
// Get the high part of the GQR register
|
||||||
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
OpArg gqr = PPCSTATE(spr[SPR_GQR0 + i]);
|
||||||
|
gqr.AddMemOffset(2);
|
||||||
|
|
||||||
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8])));
|
AND(32, R(RSCRATCH2), gqr);
|
||||||
|
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
||||||
|
|
||||||
|
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8])));
|
||||||
|
}
|
||||||
|
|
||||||
MemoryExceptionCheck();
|
MemoryExceptionCheck();
|
||||||
CVTPS2PD(fpr.RX(s), R(XMM0));
|
CVTPS2PD(fpr.RX(s), R(XMM0));
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
#include "Common/x64ABI.h"
|
#include "Common/x64ABI.h"
|
||||||
#include "Common/x64Emitter.h"
|
#include "Common/x64Emitter.h"
|
||||||
#include "Core/HW/GPFifo.h"
|
#include "Core/HW/GPFifo.h"
|
||||||
|
#include "Core/PowerPC/Gekko.h"
|
||||||
#include "Core/PowerPC/JitCommon/JitBase.h"
|
#include "Core/PowerPC/JitCommon/JitBase.h"
|
||||||
#include "Core/PowerPC/JitCommon/Jit_Util.h"
|
#include "Core/PowerPC/JitCommon/Jit_Util.h"
|
||||||
#include "Core/PowerPC/PowerPC.h"
|
#include "Core/PowerPC/PowerPC.h"
|
||||||
|
@ -219,438 +220,416 @@ alignas(16) static const float m_255 = 255.0f;
|
||||||
alignas(16) static const float m_127 = 127.0f;
|
alignas(16) static const float m_127 = 127.0f;
|
||||||
alignas(16) static const float m_m128 = -128.0f;
|
alignas(16) static const float m_m128 = -128.0f;
|
||||||
|
|
||||||
#define QUANTIZE_OVERFLOW_SAFE
|
// Sizes of the various quantized store types
|
||||||
|
constexpr std::array<u8, 8> sizes{{32, 0, 0, 0, 8, 16, 8, 16}};
|
||||||
|
|
||||||
// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of
|
|
||||||
// int32 range
|
|
||||||
// while it's OK for large negatives, it isn't for positives
|
|
||||||
// I don't know whether the overflow actually happens in any games
|
|
||||||
// but it potentially can cause problems, so we need some clamping
|
|
||||||
|
|
||||||
// See comment in header for in/outs.
|
|
||||||
void CommonAsmRoutines::GenQuantizedStores()
|
void CommonAsmRoutines::GenQuantizedStores()
|
||||||
{
|
{
|
||||||
const void* start = GetCodePtr();
|
|
||||||
|
|
||||||
const u8* storePairedIllegal = AlignCode4();
|
|
||||||
UD2();
|
|
||||||
|
|
||||||
const u8* storePairedFloat = AlignCode4();
|
|
||||||
if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
PSHUFB(XMM0, M((void*)pbswapShuffle2x4));
|
|
||||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
|
||||||
ROL(64, R(RSCRATCH), Imm8(32));
|
|
||||||
BSWAP(64, RSCRATCH);
|
|
||||||
}
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storePairedU8 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
#endif
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
|
||||||
PACKUSWB(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storePairedS8 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
#endif
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
|
||||||
PACKSSWB(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storePairedU16 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
#endif
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PACKUSDW(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
BSWAP(32, RSCRATCH);
|
|
||||||
ROL(32, R(RSCRATCH), Imm8(16));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
XORPS(XMM1, R(XMM1));
|
|
||||||
MAXPS(XMM0, R(XMM1));
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
BSWAP(32, RSCRATCH);
|
|
||||||
}
|
|
||||||
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storePairedS16 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
#endif
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
BSWAP(32, RSCRATCH);
|
|
||||||
ROL(32, R(RSCRATCH), Imm8(16));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore");
|
|
||||||
|
|
||||||
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
||||||
ReserveCodeSpace(8 * sizeof(u8*));
|
ReserveCodeSpace(8 * sizeof(u8*));
|
||||||
|
|
||||||
pairedStoreQuantized[0] = storePairedFloat;
|
for (int type = 0; type < 8; type++)
|
||||||
pairedStoreQuantized[1] = storePairedIllegal;
|
pairedStoreQuantized[type] = GenQuantizedStoreRuntime(false, static_cast<EQuantizeType>(type));
|
||||||
pairedStoreQuantized[2] = storePairedIllegal;
|
|
||||||
pairedStoreQuantized[3] = storePairedIllegal;
|
|
||||||
pairedStoreQuantized[4] = storePairedU8;
|
|
||||||
pairedStoreQuantized[5] = storePairedU16;
|
|
||||||
pairedStoreQuantized[6] = storePairedS8;
|
|
||||||
pairedStoreQuantized[7] = storePairedS16;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// See comment in header for in/outs.
|
// See comment in header for in/outs.
|
||||||
void CommonAsmRoutines::GenQuantizedSingleStores()
|
void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||||
{
|
{
|
||||||
const void* start = GetCodePtr();
|
|
||||||
|
|
||||||
const u8* storeSingleIllegal = AlignCode4();
|
|
||||||
UD2();
|
|
||||||
|
|
||||||
// Easy!
|
|
||||||
const u8* storeSingleFloat = AlignCode4();
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
XORPS(XMM1, R(XMM1));
|
|
||||||
MAXSS(XMM0, R(XMM1));
|
|
||||||
MINSS(XMM0, M(&m_255));
|
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storeSingleS8 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MAXSS(XMM0, M(&m_m128));
|
|
||||||
MINSS(XMM0, M(&m_127));
|
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
XORPS(XMM1, R(XMM1));
|
|
||||||
MAXSS(XMM0, R(XMM1));
|
|
||||||
MINSS(XMM0, M(m_65535));
|
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storeSingleS16 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MAXSS(XMM0, M(&m_m32768));
|
|
||||||
MINSS(XMM0, M(&m_32767));
|
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
|
|
||||||
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedSingleStore");
|
|
||||||
|
|
||||||
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
||||||
ReserveCodeSpace(8 * sizeof(u8*));
|
ReserveCodeSpace(8 * sizeof(u8*));
|
||||||
|
|
||||||
singleStoreQuantized[0] = storeSingleFloat;
|
for (int type = 0; type < 8; type++)
|
||||||
singleStoreQuantized[1] = storeSingleIllegal;
|
singleStoreQuantized[type] = GenQuantizedStoreRuntime(true, static_cast<EQuantizeType>(type));
|
||||||
singleStoreQuantized[2] = storeSingleIllegal;
|
}
|
||||||
singleStoreQuantized[3] = storeSingleIllegal;
|
|
||||||
singleStoreQuantized[4] = storeSingleU8;
|
const u8* CommonAsmRoutines::GenQuantizedStoreRuntime(bool single, EQuantizeType type)
|
||||||
singleStoreQuantized[5] = storeSingleU16;
|
{
|
||||||
singleStoreQuantized[6] = storeSingleS8;
|
const void* start = GetCodePtr();
|
||||||
singleStoreQuantized[7] = storeSingleS16;
|
const u8* load = AlignCode4();
|
||||||
|
GenQuantizedStore(single, type, -1);
|
||||||
|
RET();
|
||||||
|
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore_%i_%i", type, single);
|
||||||
|
|
||||||
|
return load;
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommonAsmRoutines::GenQuantizedLoads()
|
void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
{
|
{
|
||||||
const void* start = GetCodePtr();
|
|
||||||
|
|
||||||
const u8* loadPairedIllegal = AlignCode4();
|
|
||||||
UD2();
|
|
||||||
|
|
||||||
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
|
|
||||||
// don't need hardware access handling. This will definitely crash if paired loads occur
|
|
||||||
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
|
|
||||||
// for a good reason, or merely because no game does this.
|
|
||||||
// If we find something that actually does do this, maybe this should be changed. How
|
|
||||||
// much of a performance hit would it be?
|
|
||||||
const u8* loadPairedFloatTwo = AlignCode4();
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
{
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
|
|
||||||
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
}
|
|
||||||
else if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
|
|
||||||
PSHUFB(XMM0, M(pbswapShuffle2x4));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
|
|
||||||
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
|
|
||||||
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
}
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedFloatOne = AlignCode4();
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
{
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
}
|
|
||||||
else if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
|
|
||||||
PSHUFB(XMM0, M(pbswapShuffle1x4));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
}
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedU8Two = AlignCode4();
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
{
|
|
||||||
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
|
||||||
}
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
PMOVZXBD(XMM0, R(XMM0));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PXOR(XMM1, R(XMM1));
|
|
||||||
PUNPCKLBW(XMM0, R(XMM1));
|
|
||||||
PUNPCKLWD(XMM0, R(XMM1));
|
|
||||||
}
|
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedU8One = AlignCode4();
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
|
|
||||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedS8Two = AlignCode4();
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
{
|
|
||||||
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
|
||||||
}
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
PMOVSXBD(XMM0, R(XMM0));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PUNPCKLBW(XMM0, R(XMM0));
|
|
||||||
PUNPCKLWD(XMM0, R(XMM0));
|
|
||||||
PSRAD(XMM0, 24);
|
|
||||||
}
|
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedS8One = AlignCode4();
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
|
|
||||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedU16Two = AlignCode4();
|
|
||||||
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
|
||||||
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
PMOVZXWD(XMM0, R(XMM0));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PXOR(XMM1, R(XMM1));
|
|
||||||
PUNPCKLWD(XMM0, R(XMM1));
|
|
||||||
}
|
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedU16One = AlignCode4();
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
|
|
||||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedS16Two = AlignCode4();
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
|
||||||
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
PMOVSXWD(XMM0, R(XMM0));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PUNPCKLWD(XMM0, R(XMM0));
|
|
||||||
PSRAD(XMM0, 16);
|
|
||||||
}
|
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedS16One = AlignCode4();
|
|
||||||
if (jit->jo.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true,
|
|
||||||
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
|
|
||||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad");
|
|
||||||
|
|
||||||
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
||||||
ReserveCodeSpace(16 * sizeof(u8*));
|
ReserveCodeSpace(16 * sizeof(u8*));
|
||||||
|
|
||||||
pairedLoadQuantized[0] = loadPairedFloatTwo;
|
for (int type = 0; type < 8; type++)
|
||||||
pairedLoadQuantized[1] = loadPairedIllegal;
|
pairedLoadQuantized[type] = GenQuantizedLoadRuntime(false, static_cast<EQuantizeType>(type));
|
||||||
pairedLoadQuantized[2] = loadPairedIllegal;
|
for (int type = 0; type < 8; type++)
|
||||||
pairedLoadQuantized[3] = loadPairedIllegal;
|
pairedLoadQuantized[type + 8] = GenQuantizedLoadRuntime(true, static_cast<EQuantizeType>(type));
|
||||||
pairedLoadQuantized[4] = loadPairedU8Two;
|
}
|
||||||
pairedLoadQuantized[5] = loadPairedU16Two;
|
|
||||||
pairedLoadQuantized[6] = loadPairedS8Two;
|
const u8* CommonAsmRoutines::GenQuantizedLoadRuntime(bool single, EQuantizeType type)
|
||||||
pairedLoadQuantized[7] = loadPairedS16Two;
|
{
|
||||||
|
const void* start = GetCodePtr();
|
||||||
pairedLoadQuantized[8] = loadPairedFloatOne;
|
const u8* load = AlignCode4();
|
||||||
pairedLoadQuantized[9] = loadPairedIllegal;
|
GenQuantizedLoad(single, type, -1);
|
||||||
pairedLoadQuantized[10] = loadPairedIllegal;
|
RET();
|
||||||
pairedLoadQuantized[11] = loadPairedIllegal;
|
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad_%i_%i", type, single);
|
||||||
pairedLoadQuantized[12] = loadPairedU8One;
|
|
||||||
pairedLoadQuantized[13] = loadPairedU16One;
|
return load;
|
||||||
pairedLoadQuantized[14] = loadPairedS8One;
|
}
|
||||||
pairedLoadQuantized[15] = loadPairedS16One;
|
|
||||||
|
void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type, int quantize)
|
||||||
|
{
|
||||||
|
// In: one or two single floats in XMM0, if quantize is -1, a quantization factor in RSCRATCH2
|
||||||
|
|
||||||
|
int size = sizes[type] * (single ? 1 : 2);
|
||||||
|
bool isInline = quantize != -1;
|
||||||
|
|
||||||
|
// illegal
|
||||||
|
if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
|
||||||
|
{
|
||||||
|
UD2();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (type == QUANTIZE_FLOAT)
|
||||||
|
{
|
||||||
|
GenQuantizedStoreFloat(single, isInline);
|
||||||
|
}
|
||||||
|
else if (single)
|
||||||
|
{
|
||||||
|
if (quantize == -1)
|
||||||
|
{
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
}
|
||||||
|
else if (quantize > 0)
|
||||||
|
{
|
||||||
|
MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2]));
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (type)
|
||||||
|
{
|
||||||
|
case QUANTIZE_U8:
|
||||||
|
XORPS(XMM1, R(XMM1));
|
||||||
|
MAXSS(XMM0, R(XMM1));
|
||||||
|
MINSS(XMM0, M(&m_255));
|
||||||
|
break;
|
||||||
|
case QUANTIZE_S8:
|
||||||
|
MAXSS(XMM0, M(&m_m128));
|
||||||
|
MINSS(XMM0, M(&m_127));
|
||||||
|
break;
|
||||||
|
case QUANTIZE_U16:
|
||||||
|
XORPS(XMM1, R(XMM1));
|
||||||
|
MAXSS(XMM0, R(XMM1));
|
||||||
|
MINSS(XMM0, M(m_65535));
|
||||||
|
break;
|
||||||
|
case QUANTIZE_S16:
|
||||||
|
MAXSS(XMM0, M(&m_m32768));
|
||||||
|
MINSS(XMM0, M(&m_32767));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
CVTTSS2SI(RSCRATCH, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (quantize == -1)
|
||||||
|
{
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
else if (quantize > 0)
|
||||||
|
{
|
||||||
|
MOVQ_xmm(XMM1, M(&m_quantizeTableS[quantize * 2]));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool hasPACKUSDW = cpu_info.bSSE4_1;
|
||||||
|
|
||||||
|
// Special case: if we don't have PACKUSDW we need to clamp to zero as well so the shuffle
|
||||||
|
// below can work
|
||||||
|
if (type == QUANTIZE_U16 && !hasPACKUSDW)
|
||||||
|
{
|
||||||
|
XORPS(XMM1, R(XMM1));
|
||||||
|
MAXPS(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
|
||||||
|
// According to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value
|
||||||
|
// is out of int32 range while it's OK for large negatives, it isn't for positives
|
||||||
|
// I don't know whether the overflow actually happens in any games but it potentially can
|
||||||
|
// cause problems, so we need some clamping
|
||||||
|
MINPS(XMM0, M(m_65535));
|
||||||
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
|
|
||||||
|
switch (type)
|
||||||
|
{
|
||||||
|
case QUANTIZE_U8:
|
||||||
|
PACKSSDW(XMM0, R(XMM0));
|
||||||
|
PACKUSWB(XMM0, R(XMM0));
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
break;
|
||||||
|
case QUANTIZE_S8:
|
||||||
|
PACKSSDW(XMM0, R(XMM0));
|
||||||
|
PACKSSWB(XMM0, R(XMM0));
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
break;
|
||||||
|
case QUANTIZE_U16:
|
||||||
|
if (hasPACKUSDW)
|
||||||
|
{
|
||||||
|
PACKUSDW(XMM0, R(XMM0)); // AAAABBBB CCCCDDDD ... -> AABBCCDD ...
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0); // AABBCCDD ... -> AABBCCDD
|
||||||
|
BSWAP(32, RSCRATCH); // AABBCCDD -> DDCCBBAA
|
||||||
|
ROL(32, R(RSCRATCH), Imm8(16)); // DDCCBBAA -> BBAADDCC
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// We don't have PACKUSDW so we'll shuffle instead (assumes 32-bit values >= 0 and < 65536)
|
||||||
|
PSHUFLW(XMM0, R(XMM0), 2); // AABB0000 CCDD0000 ... -> CCDDAABB ...
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0); // CCDDAABB ... -> CCDDAABB
|
||||||
|
BSWAP(32, RSCRATCH); // CCDDAABB -> BBAADDCC
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case QUANTIZE_S16:
|
||||||
|
PACKSSDW(XMM0, R(XMM0));
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
BSWAP(32, RSCRATCH);
|
||||||
|
ROL(32, R(RSCRATCH), Imm8(16));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
|
||||||
|
if (!single)
|
||||||
|
flags |= SAFE_LOADSTORE_NO_SWAP;
|
||||||
|
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, size, 0, QUANTIZED_REGS_TO_SAVE, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
void QuantizedMemoryRoutines::GenQuantizedStoreFloat(bool single, bool isInline)
|
||||||
|
{
|
||||||
|
if (single)
|
||||||
|
{
|
||||||
|
// Easy!
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (cpu_info.bSSSE3)
|
||||||
|
{
|
||||||
|
PSHUFB(XMM0, M(pbswapShuffle2x4));
|
||||||
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
|
ROL(64, R(RSCRATCH), Imm8(32));
|
||||||
|
BSWAP(64, RSCRATCH);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type, int quantize)
|
||||||
|
{
|
||||||
|
// Note that this method assumes that inline methods know the value of quantize ahead of
|
||||||
|
// time. The methods generated AOT assume that the quantize flag is placed in RSCRATCH in
|
||||||
|
// the second lowest byte, ie: 0x0000xx00
|
||||||
|
|
||||||
|
int size = sizes[type] * (single ? 1 : 2);
|
||||||
|
bool isInline = quantize != -1;
|
||||||
|
|
||||||
|
// illegal
|
||||||
|
if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
|
||||||
|
{
|
||||||
|
UD2();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Floats don't use quantization and can generate more optimal code
|
||||||
|
if (type == QUANTIZE_FLOAT)
|
||||||
|
{
|
||||||
|
GenQuantizedLoadFloat(single, isInline);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool extend = single && (type == QUANTIZE_S8 || type == QUANTIZE_S16);
|
||||||
|
|
||||||
|
if (jit->jo.memcheck)
|
||||||
|
{
|
||||||
|
BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE_LOAD;
|
||||||
|
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
|
||||||
|
if (!single && (type == QUANTIZE_U8 || type == QUANTIZE_S8))
|
||||||
|
{
|
||||||
|
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
|
||||||
|
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
switch (type)
|
||||||
|
{
|
||||||
|
case QUANTIZE_U8:
|
||||||
|
case QUANTIZE_S8:
|
||||||
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
|
||||||
|
break;
|
||||||
|
case QUANTIZE_U16:
|
||||||
|
case QUANTIZE_S16:
|
||||||
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (single)
|
||||||
|
{
|
||||||
|
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
|
||||||
|
if (quantize == -1)
|
||||||
|
{
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
}
|
||||||
|
else if (quantize > 0)
|
||||||
|
{
|
||||||
|
MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2]));
|
||||||
|
}
|
||||||
|
UNPCKLPS(XMM0, M(m_one));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
switch (type)
|
||||||
|
{
|
||||||
|
case QUANTIZE_U8:
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVZXBD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PXOR(XMM1, R(XMM1));
|
||||||
|
PUNPCKLBW(XMM0, R(XMM1));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case QUANTIZE_S8:
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVSXBD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PUNPCKLBW(XMM0, R(XMM0));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
|
PSRAD(XMM0, 24);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case QUANTIZE_U16:
|
||||||
|
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVZXWD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PXOR(XMM1, R(XMM1));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case QUANTIZE_S16:
|
||||||
|
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVSXWD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
|
PSRAD(XMM0, 16);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
|
|
||||||
|
if (quantize == -1)
|
||||||
|
{
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
else if (quantize > 0)
|
||||||
|
{
|
||||||
|
MOVQ_xmm(XMM1, M(&m_dequantizeTableS[quantize * 2]));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
|
||||||
|
{
|
||||||
|
int size = single ? 32 : 64;
|
||||||
|
bool extend = false;
|
||||||
|
|
||||||
|
if (jit->jo.memcheck)
|
||||||
|
{
|
||||||
|
BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE;
|
||||||
|
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (single)
|
||||||
|
{
|
||||||
|
if (jit->jo.memcheck)
|
||||||
|
{
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
}
|
||||||
|
else if (cpu_info.bSSSE3)
|
||||||
|
{
|
||||||
|
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
|
||||||
|
PSHUFB(XMM0, M(pbswapShuffle1x4));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
}
|
||||||
|
|
||||||
|
UNPCKLPS(XMM0, M(m_one));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
|
||||||
|
// don't need hardware access handling. This will definitely crash if paired loads occur
|
||||||
|
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
|
||||||
|
// for a good reason, or merely because no game does this.
|
||||||
|
// If we find something that actually does do this, maybe this should be changed. How
|
||||||
|
// much of a performance hit would it be?
|
||||||
|
if (jit->jo.memcheck)
|
||||||
|
{
|
||||||
|
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
|
||||||
|
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
}
|
||||||
|
else if (cpu_info.bSSSE3)
|
||||||
|
{
|
||||||
|
MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
|
||||||
|
PSHUFB(XMM0, M(pbswapShuffle2x4));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
|
||||||
|
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
|
||||||
|
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,16 +7,31 @@
|
||||||
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
|
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
|
||||||
#include "Core/PowerPC/JitCommon/Jit_Util.h"
|
#include "Core/PowerPC/JitCommon/Jit_Util.h"
|
||||||
|
|
||||||
class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
|
enum EQuantizeType : u32;
|
||||||
{
|
|
||||||
protected:
|
|
||||||
void GenQuantizedLoads();
|
|
||||||
void GenQuantizedStores();
|
|
||||||
void GenQuantizedSingleStores();
|
|
||||||
|
|
||||||
|
// Code emitters for quantized loads and stores. Both CommonAsmRoutines and Jitx86Base derive
// from this class.
class QuantizedMemoryRoutines : public EmuCodeBlock
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
// Emits a load of one (single == true) or two values of the given type into XMM0.
// quantize == -1 selects the runtime-scaled variant; any other value is a scale index that
// is known when the code is generated.
void GenQuantizedLoad(bool single, EQuantizeType type, int quantize);
|
||||||
|
// Store counterpart of GenQuantizedLoad: takes the float(s) from XMM0 and uses the same
// meaning for the three arguments.
void GenQuantizedStore(bool single, EQuantizeType type, int quantize);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Special cases for QUANTIZE_FLOAT, called from the two public emitters above.
void GenQuantizedLoadFloat(bool single, bool isInline);
|
||||||
|
void GenQuantizedStoreFloat(bool single, bool isInline);
|
||||||
|
};
|
||||||
|
|
||||||
|
class CommonAsmRoutines : public CommonAsmRoutinesBase, public QuantizedMemoryRoutines
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
void GenFifoWrite(int size);
|
void GenFifoWrite(int size);
|
||||||
void GenFrsqrte();
|
void GenFrsqrte();
|
||||||
void GenFres();
|
void GenFres();
|
||||||
void GenMfcr();
|
void GenMfcr();
|
||||||
|
|
||||||
|
protected:
|
||||||
|
const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
|
||||||
|
const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
|
||||||
|
void GenQuantizedLoads();
|
||||||
|
void GenQuantizedStores();
|
||||||
|
void GenQuantizedSingleStores();
|
||||||
};
|
};
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
//#define JIT_LOG_GPR // Enables logging of the PPC general purpose regs
|
//#define JIT_LOG_GPR // Enables logging of the PPC general purpose regs
|
||||||
//#define JIT_LOG_FPR // Enables logging of the PPC floating point regs
|
//#define JIT_LOG_FPR // Enables logging of the PPC floating point regs
|
||||||
|
|
||||||
|
#include <map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
|
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
|
@ -88,6 +89,7 @@ protected:
|
||||||
int revertFprLoad;
|
int revertFprLoad;
|
||||||
|
|
||||||
bool assumeNoPairedQuantize;
|
bool assumeNoPairedQuantize;
|
||||||
|
std::map<u8, u32> constantGqr;
|
||||||
bool firstFPInstructionFound;
|
bool firstFPInstructionFound;
|
||||||
bool isLastInstruction;
|
bool isLastInstruction;
|
||||||
int skipInstructions;
|
int skipInstructions;
|
||||||
|
@ -130,7 +132,7 @@ public:
|
||||||
virtual bool HandleStackFault() { return false; }
|
virtual bool HandleStackFault() { return false; }
|
||||||
};
|
};
|
||||||
|
|
||||||
class Jitx86Base : public JitBase, public EmuCodeBlock
|
class Jitx86Base : public JitBase, public QuantizedMemoryRoutines
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
bool BackPatch(u32 emAddress, SContext* ctx);
|
bool BackPatch(u32 emAddress, SContext* ctx);
|
||||||
|
|
Loading…
Reference in New Issue