Merge pull request #3454 from mmastrac/gqr_fixes

JIT perf improvements for quantized loads/stores
This commit is contained in:
Markus Wick 2016-06-27 18:31:50 +02:00 committed by GitHub
commit ddc9e414ee
6 changed files with 525 additions and 461 deletions

View File

@ -672,27 +672,20 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
js.skipInstructions = 0;
js.carryFlagSet = false;
js.carryFlagInverted = false;
js.assumeNoPairedQuantize = false;
js.constantGqr.clear();
// If the block only uses one GQR and the GQR is zero at compile time, make a guess that the block
// never uses quantized loads/stores. Many paired-heavy games use largely float loads and stores,
// Assume that GQR values don't change often at runtime. Many paired-heavy games use largely float
// loads and stores,
// which are significantly faster when inlined (especially in MMU mode, where this lets them use
// fastmem).
// Insert a check that the GQR is still zero at the start of the block in case our guess turns out
// wrong.
// TODO: support any other constant GQR value, not merely zero/unquantized: we can optimize
// quantized
// loadstores too, it'd just be more code.
if (code_block.m_gqr_used.Count() == 1 &&
js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
if (js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
{
int gqr = *code_block.m_gqr_used.begin();
if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
// If there are GQRs used but not set, we'll treat those as constant and optimize them
BitSet8 gqr_static = ComputeStaticGQRs(code_block);
if (gqr_static)
{
CMP(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm8(0));
FixupBranch failure = J_CC(CC_NZ, true);
SwitchToFarCode();
SetJumpTarget(failure);
const u8* target = GetCodePtr();
MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionC((void*)&JitInterface::CompileExceptionCheck,
@ -700,7 +693,16 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
ABI_PopRegistersAndAdjustStack({}, 0);
JMP(asm_routines.dispatcher, true);
SwitchToNearCode();
js.assumeNoPairedQuantize = true;
// Insert a check that the GQRs are still the value we expect at
// the start of the block in case our guess turns out wrong.
for (int gqr : gqr_static)
{
u32 value = GQR(gqr);
js.constantGqr[gqr] = value;
CMP_or_TEST(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm32(value));
J_CC(CC_NZ, target);
}
}
}
@ -947,7 +949,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
return normalEntry;
}
BitSet32 Jit64::CallerSavedRegistersInUse()
// Returns the set of GQRs that the block reads but never writes. The caller treats these
// as compile-time constants for the block being compiled.
BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const
{
  // Everything the block does NOT modify...
  const auto unmodified = ~cb.m_gqr_modified;
  // ...restricted to the registers it actually uses.
  return cb.m_gqr_used & unmodified;
}
BitSet32 Jit64::CallerSavedRegistersInUse() const
{
BitSet32 result;
for (int i = 0; i < NUMXREGS; i++)

View File

@ -64,7 +64,8 @@ public:
void Jit(u32 em_address) override;
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC);
BitSet32 CallerSavedRegistersInUse();
BitSet32 CallerSavedRegistersInUse() const;
BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;
JitBlockCache* GetBlockCache() override { return &blocks; }
void Trace();

View File

@ -35,8 +35,12 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);
auto it = js.constantGqr.find(i);
bool gqrIsConstant = it != js.constantGqr.end();
u32 gqrValue = gqrIsConstant ? it->second & 0xffff : 0;
gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)
if (gqrIsConstant && gqrValue == 0)
{
int storeOffset = 0;
gpr.BindToRegister(a, true, update);
@ -125,25 +129,68 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
// In memcheck mode, don't update the address until the exception check
if (update && !jo.memcheck)
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
// Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
// Hence, we need to mask out the unused bits. The layout of the GQR register is
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
// 0b0011111100000111, or 0x3F07.
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + i]));
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
if (w)
if (gqrIsConstant)
{
// One value
CVTSD2SS(XMM0, fpr.R(s));
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
// Paired stores don't yield any real change in performance right now, but if we can
// improve fastmem support this might change
//#define INLINE_PAIRED_STORES
#ifdef INLINE_PAIRED_STORES
if (w)
{
// One value
CVTSD2SS(XMM0, fpr.R(s));
GenQuantizedStore(true, static_cast<EQuantizeType>(gqrValue & 0x7), (gqrValue & 0x3F00) >> 8);
}
else
{
// Pair of values
CVTPD2PS(XMM0, fpr.R(s));
GenQuantizedStore(false, static_cast<EQuantizeType>(gqrValue & 0x7),
(gqrValue & 0x3F00) >> 8);
}
#else
// We know what GQR is here, so we can load RSCRATCH2 and call into the store method directly
// with just the scale bits.
int type = gqrValue & 0x7;
MOV(32, R(RSCRATCH2), Imm32(gqrValue & 0x3F00));
if (w)
{
// One value
CVTSD2SS(XMM0, fpr.R(s));
CALL(asm_routines.singleStoreQuantized[type]);
}
else
{
// Pair of values
CVTPD2PS(XMM0, fpr.R(s));
CALL(asm_routines.pairedStoreQuantized[type]);
}
#endif
}
else
{
// Pair of values
CVTPD2PS(XMM0, fpr.R(s));
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
// Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
// Hence, we need to mask out the unused bits. The layout of the GQR register is
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
// 0b0011111100000111, or 0x3F07.
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + i]));
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
if (w)
{
// One value
CVTSD2SS(XMM0, fpr.R(s));
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
}
else
{
// Pair of values
CVTPD2PS(XMM0, fpr.R(s));
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
}
}
if (update && jo.memcheck)
@ -173,8 +220,13 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);
auto it = js.constantGqr.find(i);
bool gqrIsConstant = it != js.constantGqr.end();
u32 gqrValue = gqrIsConstant ? it->second >> 16 : 0;
gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)
if (gqrIsConstant && gqrValue == 0)
{
s32 loadOffset = 0;
gpr.BindToRegister(a, true, update);
@ -302,16 +354,24 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
// In memcheck mode, don't update the address until the exception check
if (update && !jo.memcheck)
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
// Get the high part of the GQR register
OpArg gqr = PPCSTATE(spr[SPR_GQR0 + i]);
gqr.AddMemOffset(2);
if (gqrIsConstant)
{
GenQuantizedLoad(w == 1, static_cast<EQuantizeType>(gqrValue & 0x7), (gqrValue & 0x3F00) >> 8);
}
else
{
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
AND(32, R(RSCRATCH2), gqr);
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
// Get the high part of the GQR register
OpArg gqr = PPCSTATE(spr[SPR_GQR0 + i]);
gqr.AddMemOffset(2);
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8])));
AND(32, R(RSCRATCH2), gqr);
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8])));
}
MemoryExceptionCheck();
CVTPS2PD(fpr.RX(s), R(XMM0));

View File

@ -10,6 +10,7 @@
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
#include "Core/HW/GPFifo.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/JitCommon/JitBase.h"
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Core/PowerPC/PowerPC.h"
@ -219,438 +220,416 @@ alignas(16) static const float m_255 = 255.0f;
alignas(16) static const float m_127 = 127.0f;
alignas(16) static const float m_m128 = -128.0f;
#define QUANTIZE_OVERFLOW_SAFE
// Width in bits of a single element for each quantize type, indexed by the 3-bit type
// value (0 = float, 4 = u8, 5 = u16, 6 = s8, 7 = s16). Types 1-3 are invalid and have
// size 0. Used by both the quantized load and store generators, which double the value
// for paired accesses.
constexpr std::array<u8, 8> sizes{{32, 0, 0, 0, 8, 16, 8, 16}};
// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of
// int32 range
// while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games
// but it potentially can cause problems, so we need some clamping
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores()
{
const void* start = GetCodePtr();
const u8* storePairedIllegal = AlignCode4();
UD2();
const u8* storePairedFloat = AlignCode4();
if (cpu_info.bSSSE3)
{
PSHUFB(XMM0, M((void*)pbswapShuffle2x4));
MOVQ_xmm(R(RSCRATCH), XMM0);
}
else
{
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
BSWAP(64, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
if (cpu_info.bSSE4_1)
{
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKUSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
}
else
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
MINPS(XMM0, M(m_65535));
CVTTPS2DQ(XMM0, R(XMM0));
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore");
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
pairedStoreQuantized[3] = storePairedIllegal;
pairedStoreQuantized[4] = storePairedU8;
pairedStoreQuantized[5] = storePairedU16;
pairedStoreQuantized[6] = storePairedS8;
pairedStoreQuantized[7] = storePairedS16;
for (int type = 0; type < 8; type++)
pairedStoreQuantized[type] = GenQuantizedStoreRuntime(false, static_cast<EQuantizeType>(type));
}
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
const void* start = GetCodePtr();
const u8* storeSingleIllegal = AlignCode4();
UD2();
// Easy!
const u8* storeSingleFloat = AlignCode4();
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedSingleStore");
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
singleStoreQuantized[0] = storeSingleFloat;
singleStoreQuantized[1] = storeSingleIllegal;
singleStoreQuantized[2] = storeSingleIllegal;
singleStoreQuantized[3] = storeSingleIllegal;
singleStoreQuantized[4] = storeSingleU8;
singleStoreQuantized[5] = storeSingleU16;
singleStoreQuantized[6] = storeSingleS8;
singleStoreQuantized[7] = storeSingleS16;
for (int type = 0; type < 8; type++)
singleStoreQuantized[type] = GenQuantizedStoreRuntime(true, static_cast<EQuantizeType>(type));
}
// Emits one callable quantized-store routine for the given type and returns its entry
// point. The routine is generated with quantize == -1, i.e. it takes the scale at run time,
// and ends with a RET.
const u8* CommonAsmRoutines::GenQuantizedStoreRuntime(bool single, EQuantizeType type)
{
  // The registered range begins before the alignment padding; the returned entry point is
  // the aligned address.
  const void* const region_begin = GetCodePtr();
  const u8* const entry_point = AlignCode4();

  GenQuantizedStore(single, type, -1);
  RET();

  JitRegister::Register(region_begin, GetCodePtr(), "JIT_QuantizedStore_%i_%i", type, single);
  return entry_point;
}
void CommonAsmRoutines::GenQuantizedLoads()
{
const void* start = GetCodePtr();
const u8* loadPairedIllegal = AlignCode4();
UD2();
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
const u8* loadPairedFloatTwo = AlignCode4();
if (jit->jo.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
RET();
const u8* loadPairedFloatOne = AlignCode4();
if (jit->jo.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
RET();
const u8* loadPairedU8Two = AlignCode4();
if (jit->jo.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU8One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS8Two = AlignCode4();
if (jit->jo.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS8One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedU16Two = AlignCode4();
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU16One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS16Two = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS16One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad");
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
pairedLoadQuantized[0] = loadPairedFloatTwo;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8Two;
pairedLoadQuantized[5] = loadPairedU16Two;
pairedLoadQuantized[6] = loadPairedS8Two;
pairedLoadQuantized[7] = loadPairedS16Two;
pairedLoadQuantized[8] = loadPairedFloatOne;
pairedLoadQuantized[9] = loadPairedIllegal;
pairedLoadQuantized[10] = loadPairedIllegal;
pairedLoadQuantized[11] = loadPairedIllegal;
pairedLoadQuantized[12] = loadPairedU8One;
pairedLoadQuantized[13] = loadPairedU16One;
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
for (int type = 0; type < 8; type++)
pairedLoadQuantized[type] = GenQuantizedLoadRuntime(false, static_cast<EQuantizeType>(type));
for (int type = 0; type < 8; type++)
pairedLoadQuantized[type + 8] = GenQuantizedLoadRuntime(true, static_cast<EQuantizeType>(type));
}
// Emits one callable quantized-load routine for the given type and returns its entry
// point. The routine is generated with quantize == -1, i.e. it takes the scale at run time,
// and ends with a RET.
const u8* CommonAsmRoutines::GenQuantizedLoadRuntime(bool single, EQuantizeType type)
{
  // The registered range begins before the alignment padding; the returned entry point is
  // the aligned address.
  const void* const region_begin = GetCodePtr();
  const u8* const entry_point = AlignCode4();

  GenQuantizedLoad(single, type, -1);
  RET();

  JitRegister::Register(region_begin, GetCodePtr(), "JIT_QuantizedLoad_%i_%i", type, single);
  return entry_point;
}
void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type, int quantize)
{
// In: one or two single floats in XMM0, if quantize is -1, a quantization factor in RSCRATCH2
int size = sizes[type] * (single ? 1 : 2);
bool isInline = quantize != -1;
// illegal
if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
{
UD2();
return;
}
if (type == QUANTIZE_FLOAT)
{
GenQuantizedStoreFloat(single, isInline);
}
else if (single)
{
if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
}
else if (quantize > 0)
{
MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2]));
}
switch (type)
{
case QUANTIZE_U8:
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
break;
case QUANTIZE_S8:
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
break;
case QUANTIZE_U16:
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
break;
case QUANTIZE_S16:
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
break;
default:
break;
}
CVTTSS2SI(RSCRATCH, R(XMM0));
}
else
{
if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
}
else if (quantize > 0)
{
MOVQ_xmm(XMM1, M(&m_quantizeTableS[quantize * 2]));
MULPS(XMM0, R(XMM1));
}
bool hasPACKUSDW = cpu_info.bSSE4_1;
// Special case: if we don't have PACKUSDW we need to clamp to zero as well so the shuffle
// below can work
if (type == QUANTIZE_U16 && !hasPACKUSDW)
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
}
// According to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value
// is out of int32 range while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games but it potentially can
// cause problems, so we need some clamping
MINPS(XMM0, M(m_65535));
CVTTPS2DQ(XMM0, R(XMM0));
switch (type)
{
case QUANTIZE_U8:
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
break;
case QUANTIZE_S8:
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
break;
case QUANTIZE_U16:
if (hasPACKUSDW)
{
PACKUSDW(XMM0, R(XMM0)); // AAAABBBB CCCCDDDD ... -> AABBCCDD ...
MOVD_xmm(R(RSCRATCH), XMM0); // AABBCCDD ... -> AABBCCDD
BSWAP(32, RSCRATCH); // AABBCCDD -> DDCCBBAA
ROL(32, R(RSCRATCH), Imm8(16)); // DDCCBBAA -> BBAADDCC
}
else
{
// We don't have PACKUSDW so we'll shuffle instead (assumes 32-bit values >= 0 and < 65536)
PSHUFLW(XMM0, R(XMM0), 2); // AABB0000 CCDD0000 ... -> CCDDAABB ...
MOVD_xmm(R(RSCRATCH), XMM0); // CCDDAABB ... -> CCDDAABB
BSWAP(32, RSCRATCH); // CCDDAABB -> BBAADDCC
}
break;
case QUANTIZE_S16:
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
break;
default:
break;
}
}
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
if (!single)
flags |= SAFE_LOADSTORE_NO_SWAP;
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, size, 0, QUANTIZED_REGS_TO_SAVE, flags);
}
// Moves the float value(s) in XMM0 into RSCRATCH ready to be written by the caller.
// A single float is copied as-is; a pair is reordered/byte-swapped here (the caller then
// writes it with the no-swap flag). isInline is currently not consulted.
void QuantizedMemoryRoutines::GenQuantizedStoreFloat(bool single, bool isInline)
{
  if (single)
  {
    // Nothing to rearrange for a lone float.
    MOVD_xmm(R(RSCRATCH), XMM0);
    return;
  }

  if (cpu_info.bSSSE3)
  {
    // Shuffle inside the vector register, then move the 64-bit result out.
    PSHUFB(XMM0, M(pbswapShuffle2x4));
    MOVQ_xmm(R(RSCRATCH), XMM0);
  }
  else
  {
    // Without PSHUFB: move out first, then rotate the halves and byte-swap in the GPR.
    MOVQ_xmm(R(RSCRATCH), XMM0);
    ROL(64, R(RSCRATCH), Imm8(32));
    BSWAP(64, RSCRATCH);
  }
}
void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type, int quantize)
{
// Note that this method assumes that inline methods know the value of quantize ahead of
// time. The methods generated AOT assume that the quantize flag is placed in RSCRATCH in
// the second lowest byte, ie: 0x0000xx00
int size = sizes[type] * (single ? 1 : 2);
bool isInline = quantize != -1;
// illegal
if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
{
UD2();
return;
}
// Floats don't use quantization and can generate more optimal code
if (type == QUANTIZE_FLOAT)
{
GenQuantizedLoadFloat(single, isInline);
return;
}
bool extend = single && (type == QUANTIZE_S8 || type == QUANTIZE_S16);
if (jit->jo.memcheck)
{
BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE_LOAD;
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
if (!single && (type == QUANTIZE_U8 || type == QUANTIZE_S8))
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
}
else
{
switch (type)
{
case QUANTIZE_U8:
case QUANTIZE_S8:
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
break;
case QUANTIZE_U16:
case QUANTIZE_S16:
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
break;
default:
break;
}
}
if (single)
{
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
}
else if (quantize > 0)
{
MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2]));
}
UNPCKLPS(XMM0, M(m_one));
}
else
{
switch (type)
{
case QUANTIZE_U8:
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
break;
case QUANTIZE_S8:
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
break;
case QUANTIZE_U16:
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
break;
case QUANTIZE_S16:
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
break;
default:
break;
}
CVTDQ2PS(XMM0, R(XMM0));
if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
}
else if (quantize > 0)
{
MOVQ_xmm(XMM1, M(&m_dequantizeTableS[quantize * 2]));
MULPS(XMM0, R(XMM1));
}
}
return;
}
void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
{
int size = single ? 32 : 64;
bool extend = false;
if (jit->jo.memcheck)
{
BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE;
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
}
if (single)
{
if (jit->jo.memcheck)
{
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle1x4));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
}
UNPCKLPS(XMM0, M(m_one));
}
else
{
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
if (jit->jo.memcheck)
{
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
}
}

View File

@ -7,16 +7,31 @@
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/Jit_Util.h"
class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
{
protected:
void GenQuantizedLoads();
void GenQuantizedStores();
void GenQuantizedSingleStores();
enum EQuantizeType : u32;
// Code-emission helpers for quantized loads and stores, built on EmuCodeBlock.
// Both CommonAsmRoutines and Jitx86Base derive from this class.
class QuantizedMemoryRoutines : public EmuCodeBlock
{
public:
// Emit a quantized load/store of the given type. `single` selects one element instead of
// a pair. `quantize` is the scale known at JIT time, or -1 to emit the runtime form that
// reads the scale from RSCRATCH2.
void GenQuantizedLoad(bool single, EQuantizeType type, int quantize);
void GenQuantizedStore(bool single, EQuantizeType type, int quantize);
private:
// Float (unquantized) special cases used by the two generators above.
void GenQuantizedLoadFloat(bool single, bool isInline);
void GenQuantizedStoreFloat(bool single, bool isInline);
};
// Shared assembly routines. The quantized load/store emitters are inherited from
// QuantizedMemoryRoutines.
class CommonAsmRoutines : public CommonAsmRoutinesBase, public QuantizedMemoryRoutines
{
public:
void GenFifoWrite(int size);
void GenFrsqrte();
void GenFres();
void GenMfcr();
protected:
// Emit a single runtime-scaled (quantize == -1) load/store routine terminated by RET and
// return its entry point.
const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
// Build the dispatch tables (pairedLoadQuantized, pairedStoreQuantized,
// singleStoreQuantized) that are indexed by quantize type.
void GenQuantizedLoads();
void GenQuantizedStores();
void GenQuantizedSingleStores();
};

View File

@ -8,6 +8,7 @@
//#define JIT_LOG_GPR // Enables logging of the PPC general purpose regs
//#define JIT_LOG_FPR // Enables logging of the PPC floating point regs
#include <map>
#include <unordered_set>
#include "Common/CommonTypes.h"
@ -88,6 +89,7 @@ protected:
int revertFprLoad;
bool assumeNoPairedQuantize;
std::map<u8, u32> constantGqr;
bool firstFPInstructionFound;
bool isLastInstruction;
int skipInstructions;
@ -130,7 +132,7 @@ public:
virtual bool HandleStackFault() { return false; }
};
class Jitx86Base : public JitBase, public EmuCodeBlock
class Jitx86Base : public JitBase, public QuantizedMemoryRoutines
{
protected:
bool BackPatch(u32 emAddress, SContext* ctx);