From 4aa5291f545a44a987a931d8b09ba33497772f28 Mon Sep 17 00:00:00 2001 From: Matt Mastracci Date: Sun, 28 Feb 2016 14:33:53 -0700 Subject: [PATCH] Refactor the paired load/store code Simplification/reduction of duplicated code. Detect other constant GQR values and inline loads (5-10% speedup) and do direct dispatch to AOT methods for stores. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 41 +- Source/Core/Core/PowerPC/Jit64/Jit.h | 3 +- .../PowerPC/Jit64/Jit_LoadStorePaired.cpp | 106 ++- .../PowerPC/Jit64Common/Jit64AsmCommon.cpp | 805 +++++++++--------- .../Core/PowerPC/Jit64Common/Jit64AsmCommon.h | 27 +- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 4 +- 6 files changed, 525 insertions(+), 461 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 291d2667d7..f6d3b3849d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -672,27 +672,20 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc js.skipInstructions = 0; js.carryFlagSet = false; js.carryFlagInverted = false; - js.assumeNoPairedQuantize = false; + js.constantGqr.clear(); - // If the block only uses one GQR and the GQR is zero at compile time, make a guess that the block - // never uses quantized loads/stores. Many paired-heavy games use largely float loads and stores, + // Assume that GQR values don't change often at runtime. Many paired-heavy games use largely float + // loads and stores, // which are significantly faster when inlined (especially in MMU mode, where this lets them use // fastmem). - // Insert a check that the GQR is still zero at the start of the block in case our guess turns out - // wrong. - // TODO: support any other constant GQR value, not merely zero/unquantized: we can optimize - // quantized - // loadstores too, it'd just be more code. - if (code_block.m_gqr_used.Count() == 1 && - js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end()) + if (js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end()) { - int gqr = *code_block.m_gqr_used.begin(); - if (!code_block.m_gqr_modified[gqr] && !GQR(gqr)) + // If there are GQRs used but not set, we'll treat those as constant and optimize them + BitSet8 gqr_static = ComputeStaticGQRs(code_block); + if (gqr_static) { - CMP(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm8(0)); - FixupBranch failure = J_CC(CC_NZ, true); SwitchToFarCode(); - SetJumpTarget(failure); + const u8* target = GetCodePtr(); MOV(32, PPCSTATE(pc), Imm32(js.blockStart)); ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunctionC((void*)&JitInterface::CompileExceptionCheck, @@ -700,7 +693,16 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc ABI_PopRegistersAndAdjustStack({}, 0); JMP(asm_routines.dispatcher, true); SwitchToNearCode(); - js.assumeNoPairedQuantize = true; + + // Insert a check that the GQRs are still the value we expect at + // the start of the block in case our guess turns out wrong. 
+ for (int gqr : gqr_static) + { + u32 value = GQR(gqr); + js.constantGqr[gqr] = value; + CMP_or_TEST(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm32(value)); + J_CC(CC_NZ, target); + } } } @@ -947,7 +949,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc return normalEntry; } -BitSet32 Jit64::CallerSavedRegistersInUse() +BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const +{ + return cb.m_gqr_used & ~cb.m_gqr_modified; +} + +BitSet32 Jit64::CallerSavedRegistersInUse() const { BitSet32 result; for (int i = 0; i < NUMXREGS; i++) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 4d006f4d12..990bbd6e17 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -64,7 +64,8 @@ public: void Jit(u32 em_address) override; const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC); - BitSet32 CallerSavedRegistersInUse(); + BitSet32 CallerSavedRegistersInUse() const; + BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const; JitBlockCache* GetBlockCache() override { return &blocks; } void Trace(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp index f5223347f9..a1d1a223c9 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -35,8 +35,12 @@ void Jit64::psq_stXX(UGeckoInstruction inst) int w = indexed ? inst.Wx : inst.W; FALLBACK_IF(!a); + auto it = js.constantGqr.find(i); + bool gqrIsConstant = it != js.constantGqr.end(); + u32 gqrValue = gqrIsConstant ? it->second & 0xffff : 0; + gpr.Lock(a, b); - if (js.assumeNoPairedQuantize) + if (gqrIsConstant && gqrValue == 0) { int storeOffset = 0; gpr.BindToRegister(a, true, update); @@ -125,25 +129,68 @@ void Jit64::psq_stXX(UGeckoInstruction inst) // In memcheck mode, don't update the address until the exception check if (update && !jo.memcheck) MOV(32, gpr.R(a), R(RSCRATCH_EXTRA)); - // Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code. - // Hence, we need to mask out the unused bits. The layout of the GQR register is - // UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with - // 0b0011111100000111, or 0x3F07. - MOV(32, R(RSCRATCH2), Imm32(0x3F07)); - AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + i])); - MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); - if (w) + if (gqrIsConstant) { - // One value - CVTSD2SS(XMM0, fpr.R(s)); - CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized)); +// Paired stores don't yield any real change in performance right now, but if we can +// improve fastmem support this might change +//#define INLINE_PAIRED_STORES +#ifdef INLINE_PAIRED_STORES + if (w) + { + // One value + CVTSD2SS(XMM0, fpr.R(s)); + GenQuantizedStore(true, static_cast(gqrValue & 0x7), (gqrValue & 0x3F00) >> 8); + } + else + { + // Pair of values + CVTPD2PS(XMM0, fpr.R(s)); + GenQuantizedStore(false, static_cast(gqrValue & 0x7), + (gqrValue & 0x3F00) >> 8); + } +#else + // We know what GQR is here, so we can load RSCRATCH2 and call into the store method directly + // with just the scale bits. 
+ int type = gqrValue & 0x7; + MOV(32, R(RSCRATCH2), Imm32(gqrValue & 0x3F00)); + + if (w) + { + // One value + CVTSD2SS(XMM0, fpr.R(s)); + CALL(asm_routines.singleStoreQuantized[type]); + } + else + { + // Pair of values + CVTPD2PS(XMM0, fpr.R(s)); + CALL(asm_routines.pairedStoreQuantized[type]); + } +#endif } else { - // Pair of values - CVTPD2PS(XMM0, fpr.R(s)); - CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized)); + // Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code. + // Hence, we need to mask out the unused bits. The layout of the GQR register is + // UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with + // 0b0011111100000111, or 0x3F07. + MOV(32, R(RSCRATCH2), Imm32(0x3F07)); + AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + i])); + MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); + + if (w) + { + // One value + CVTSD2SS(XMM0, fpr.R(s)); + CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized)); + } + else + { + // Pair of values + CVTPD2PS(XMM0, fpr.R(s)); + CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized)); + } } if (update && jo.memcheck) @@ -173,8 +220,13 @@ void Jit64::psq_lXX(UGeckoInstruction inst) int w = indexed ? inst.Wx : inst.W; FALLBACK_IF(!a); + auto it = js.constantGqr.find(i); + bool gqrIsConstant = it != js.constantGqr.end(); + u32 gqrValue = gqrIsConstant ? it->second >> 16 : 0; + gpr.Lock(a, b); - if (js.assumeNoPairedQuantize) + + if (gqrIsConstant && gqrValue == 0) { s32 loadOffset = 0; gpr.BindToRegister(a, true, update); @@ -302,16 +354,24 @@ void Jit64::psq_lXX(UGeckoInstruction inst) // In memcheck mode, don't update the address until the exception check if (update && !jo.memcheck) MOV(32, gpr.R(a), R(RSCRATCH_EXTRA)); - MOV(32, R(RSCRATCH2), Imm32(0x3F07)); - // Get the high part of the GQR register - OpArg gqr = PPCSTATE(spr[SPR_GQR0 + i]); - gqr.AddMemOffset(2); + if (gqrIsConstant) + { + GenQuantizedLoad(w == 1, static_cast(gqrValue & 0x7), (gqrValue & 0x3F00) >> 8); + } + else + { + MOV(32, R(RSCRATCH2), Imm32(0x3F07)); - AND(32, R(RSCRATCH2), gqr); - MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); + // Get the high part of the GQR register + OpArg gqr = PPCSTATE(spr[SPR_GQR0 + i]); + gqr.AddMemOffset(2); - CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8]))); + AND(32, R(RSCRATCH2), gqr); + MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); + + CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8]))); + } MemoryExceptionCheck(); CVTPS2PD(fpr.RX(s), R(XMM0)); diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp index 160b3fb05e..38d0e4a6e5 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp @@ -10,6 +10,7 @@ #include "Common/x64ABI.h" #include "Common/x64Emitter.h" #include "Core/HW/GPFifo.h" +#include "Core/PowerPC/Gekko.h" #include "Core/PowerPC/JitCommon/JitBase.h" #include "Core/PowerPC/JitCommon/Jit_Util.h" #include "Core/PowerPC/PowerPC.h" @@ -219,438 +220,416 @@ alignas(16) static const float m_255 = 255.0f; alignas(16) static const float m_127 = 127.0f; alignas(16) static const float m_m128 = -128.0f; -#define QUANTIZE_OVERFLOW_SAFE +// Sizes of the various quantized store types +constexpr std::array sizes{{32, 0, 0, 0, 8, 16, 8, 16}}; -// according to Intel Docs CVTPS2DQ writes 0x80000000 if the 
source floating point value is out of -// int32 range -// while it's OK for large negatives, it isn't for positives -// I don't know whether the overflow actually happens in any games -// but it potentially can cause problems, so we need some clamping - -// See comment in header for in/outs. void CommonAsmRoutines::GenQuantizedStores() { - const void* start = GetCodePtr(); - - const u8* storePairedIllegal = AlignCode4(); - UD2(); - - const u8* storePairedFloat = AlignCode4(); - if (cpu_info.bSSSE3) - { - PSHUFB(XMM0, M((void*)pbswapShuffle2x4)); - MOVQ_xmm(R(RSCRATCH), XMM0); - } - else - { - MOVQ_xmm(R(RSCRATCH), XMM0); - ROL(64, R(RSCRATCH), Imm8(32)); - BSWAP(64, RSCRATCH); - } - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - const u8* storePairedU8 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULPS(XMM0, R(XMM1)); -#ifdef QUANTIZE_OVERFLOW_SAFE - MINPS(XMM0, M(m_65535)); -#endif - CVTTPS2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - PACKUSWB(XMM0, R(XMM0)); - MOVD_xmm(R(RSCRATCH), XMM0); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - const u8* storePairedS8 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULPS(XMM0, R(XMM1)); -#ifdef QUANTIZE_OVERFLOW_SAFE - MINPS(XMM0, M(m_65535)); -#endif - CVTTPS2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - PACKSSWB(XMM0, R(XMM0)); - MOVD_xmm(R(RSCRATCH), XMM0); - - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - const u8* storePairedU16 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULPS(XMM0, R(XMM1)); - - if (cpu_info.bSSE4_1) - { -#ifdef QUANTIZE_OVERFLOW_SAFE - MINPS(XMM0, M(m_65535)); -#endif - CVTTPS2DQ(XMM0, R(XMM0)); - PACKUSDW(XMM0, R(XMM0)); - MOVD_xmm(R(RSCRATCH), XMM0); - BSWAP(32, RSCRATCH); - ROL(32, R(RSCRATCH), Imm8(16)); - } - else - { - XORPS(XMM1, R(XMM1)); - MAXPS(XMM0, R(XMM1)); - MINPS(XMM0, M(m_65535)); - - CVTTPS2DQ(XMM0, R(XMM0)); - PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____ - MOVD_xmm(R(RSCRATCH), XMM0); - BSWAP(32, RSCRATCH); - } - - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - const u8* storePairedS16 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULPS(XMM0, R(XMM1)); -#ifdef QUANTIZE_OVERFLOW_SAFE - MINPS(XMM0, M(m_65535)); -#endif - CVTTPS2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - MOVD_xmm(R(RSCRATCH), XMM0); - BSWAP(32, RSCRATCH); - ROL(32, R(RSCRATCH), Imm8(16)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - - RET(); - - JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore"); - pairedStoreQuantized = reinterpret_cast(const_cast(AlignCode16())); ReserveCodeSpace(8 * sizeof(u8*)); - pairedStoreQuantized[0] = storePairedFloat; - pairedStoreQuantized[1] = storePairedIllegal; - pairedStoreQuantized[2] = 
storePairedIllegal; - pairedStoreQuantized[3] = storePairedIllegal; - pairedStoreQuantized[4] = storePairedU8; - pairedStoreQuantized[5] = storePairedU16; - pairedStoreQuantized[6] = storePairedS8; - pairedStoreQuantized[7] = storePairedS16; + for (int type = 0; type < 8; type++) + pairedStoreQuantized[type] = GenQuantizedStoreRuntime(false, static_cast(type)); } // See comment in header for in/outs. void CommonAsmRoutines::GenQuantizedSingleStores() { - const void* start = GetCodePtr(); - - const u8* storeSingleIllegal = AlignCode4(); - UD2(); - - // Easy! - const u8* storeSingleFloat = AlignCode4(); - MOVD_xmm(R(RSCRATCH), XMM0); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - const u8* storeSingleU8 = AlignCode4(); // Used by MKWii - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - XORPS(XMM1, R(XMM1)); - MAXSS(XMM0, R(XMM1)); - MINSS(XMM0, M(&m_255)); - CVTTSS2SI(RSCRATCH, R(XMM0)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - const u8* storeSingleS8 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MAXSS(XMM0, M(&m_m128)); - MINSS(XMM0, M(&m_127)); - CVTTSS2SI(RSCRATCH, R(XMM0)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - const u8* storeSingleU16 = AlignCode4(); // Used by MKWii - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - XORPS(XMM1, R(XMM1)); - MAXSS(XMM0, R(XMM1)); - MINSS(XMM0, M(m_65535)); - CVTTSS2SI(RSCRATCH, R(XMM0)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - const u8* storeSingleS16 = AlignCode4(); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MAXSS(XMM0, M(&m_m32768)); - MINSS(XMM0, M(&m_32767)); - CVTTSS2SI(RSCRATCH, R(XMM0)); - SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, - SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM); - RET(); - - JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedSingleStore"); - singleStoreQuantized = reinterpret_cast(const_cast(AlignCode16())); ReserveCodeSpace(8 * sizeof(u8*)); - singleStoreQuantized[0] = storeSingleFloat; - singleStoreQuantized[1] = storeSingleIllegal; - singleStoreQuantized[2] = storeSingleIllegal; - singleStoreQuantized[3] = storeSingleIllegal; - singleStoreQuantized[4] = storeSingleU8; - singleStoreQuantized[5] = storeSingleU16; - singleStoreQuantized[6] = storeSingleS8; - singleStoreQuantized[7] = storeSingleS16; + for (int type = 0; type < 8; type++) + singleStoreQuantized[type] = GenQuantizedStoreRuntime(true, static_cast(type)); +} + +const u8* CommonAsmRoutines::GenQuantizedStoreRuntime(bool single, EQuantizeType type) +{ + const void* start = GetCodePtr(); + const u8* load = AlignCode4(); + GenQuantizedStore(single, type, -1); + RET(); + JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore_%i_%i", type, single); + + return load; } void CommonAsmRoutines::GenQuantizedLoads() { - const void* start = GetCodePtr(); - - const u8* loadPairedIllegal = AlignCode4(); - UD2(); - - // FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e. 
- // don't need hardware access handling. This will definitely crash if paired loads occur - // from non-RAM areas, but as far as I know, this never happens. I don't know if this is - // for a good reason, or merely because no game does this. - // If we find something that actually does do this, maybe this should be changed. How - // much of a performance hit would it be? - const u8* loadPairedFloatTwo = AlignCode4(); - if (jit->jo.memcheck) - { - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); - MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); - } - else if (cpu_info.bSSSE3) - { - MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA)); - PSHUFB(XMM0, M(pbswapShuffle2x4)); - } - else - { - LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA)); - ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); - MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); - } - RET(); - - const u8* loadPairedFloatOne = AlignCode4(); - if (jit->jo.memcheck) - { - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - UNPCKLPS(XMM0, M(m_one)); - } - else if (cpu_info.bSSSE3) - { - MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA)); - PSHUFB(XMM0, M(pbswapShuffle1x4)); - UNPCKLPS(XMM0, M(m_one)); - } - else - { - LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA)); - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - UNPCKLPS(XMM0, M(m_one)); - } - RET(); - - const u8* loadPairedU8Two = AlignCode4(); - if (jit->jo.memcheck) - { - // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); - } - else - { - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); - } - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - if (cpu_info.bSSE4_1) - { - PMOVZXBD(XMM0, R(XMM0)); - } - else - { - PXOR(XMM1, R(XMM1)); - PUNPCKLBW(XMM0, R(XMM1)); - PUNPCKLWD(XMM0, R(XMM1)); - } - CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULPS(XMM0, R(XMM1)); - RET(); - - const u8* loadPairedU8One = AlignCode4(); - if (jit->jo.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx - CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - UNPCKLPS(XMM0, M(m_one)); - RET(); - - const u8* loadPairedS8Two = AlignCode4(); - if (jit->jo.memcheck) - { - // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); - } - else - { - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); - } - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - if (cpu_info.bSSE4_1) - { - PMOVSXBD(XMM0, R(XMM0)); - } - else - { - PUNPCKLBW(XMM0, R(XMM0)); - PUNPCKLWD(XMM0, R(XMM0)); - PSRAD(XMM0, 24); - } - CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, 
(u32)(u64)m_dequantizeTableS)); - MULPS(XMM0, R(XMM1)); - RET(); - - const u8* loadPairedS8One = AlignCode4(); - if (jit->jo.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true); - CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - UNPCKLPS(XMM0, M(m_one)); - RET(); - - const u8* loadPairedU16Two = AlignCode4(); - // TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice - if (jit->jo.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); - ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - if (cpu_info.bSSE4_1) - { - PMOVZXWD(XMM0, R(XMM0)); - } - else - { - PXOR(XMM1, R(XMM1)); - PUNPCKLWD(XMM0, R(XMM1)); - } - CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULPS(XMM0, R(XMM1)); - RET(); - - const u8* loadPairedU16One = AlignCode4(); - if (jit->jo.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false); - CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - UNPCKLPS(XMM0, M(m_one)); - RET(); - - const u8* loadPairedS16Two = AlignCode4(); - if (jit->jo.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); - ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); - MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); - if (cpu_info.bSSE4_1) - { - PMOVSXWD(XMM0, R(XMM0)); - } - else - { - PUNPCKLWD(XMM0, R(XMM0)); - PSRAD(XMM0, 16); - } - CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULPS(XMM0, R(XMM1)); - RET(); - - const u8* loadPairedS16One = AlignCode4(); - if (jit->jo.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, - SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); - else - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true); - CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); - SHR(32, R(RSCRATCH2), Imm8(5)); - MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - UNPCKLPS(XMM0, M(m_one)); - RET(); - - JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad"); - pairedLoadQuantized = reinterpret_cast(const_cast(AlignCode16())); ReserveCodeSpace(16 * sizeof(u8*)); - pairedLoadQuantized[0] = loadPairedFloatTwo; - pairedLoadQuantized[1] = loadPairedIllegal; - pairedLoadQuantized[2] = loadPairedIllegal; - pairedLoadQuantized[3] = loadPairedIllegal; - pairedLoadQuantized[4] = loadPairedU8Two; - pairedLoadQuantized[5] = loadPairedU16Two; - pairedLoadQuantized[6] = loadPairedS8Two; - pairedLoadQuantized[7] = loadPairedS16Two; - - pairedLoadQuantized[8] = loadPairedFloatOne; - pairedLoadQuantized[9] = loadPairedIllegal; - pairedLoadQuantized[10] = loadPairedIllegal; - 
pairedLoadQuantized[11] = loadPairedIllegal; - pairedLoadQuantized[12] = loadPairedU8One; - pairedLoadQuantized[13] = loadPairedU16One; - pairedLoadQuantized[14] = loadPairedS8One; - pairedLoadQuantized[15] = loadPairedS16One; + for (int type = 0; type < 8; type++) + pairedLoadQuantized[type] = GenQuantizedLoadRuntime(false, static_cast(type)); + for (int type = 0; type < 8; type++) + pairedLoadQuantized[type + 8] = GenQuantizedLoadRuntime(true, static_cast(type)); +} + +const u8* CommonAsmRoutines::GenQuantizedLoadRuntime(bool single, EQuantizeType type) +{ + const void* start = GetCodePtr(); + const u8* load = AlignCode4(); + GenQuantizedLoad(single, type, -1); + RET(); + JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad_%i_%i", type, single); + + return load; +} + +void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type, int quantize) +{ + // In: one or two single floats in XMM0, if quantize is -1, a quantization factor in RSCRATCH2 + + int size = sizes[type] * (single ? 1 : 2); + bool isInline = quantize != -1; + + // illegal + if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3) + { + UD2(); + return; + } + + if (type == QUANTIZE_FLOAT) + { + GenQuantizedStoreFloat(single, isInline); + } + else if (single) + { + if (quantize == -1) + { + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + } + else if (quantize > 0) + { + MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2])); + } + + switch (type) + { + case QUANTIZE_U8: + XORPS(XMM1, R(XMM1)); + MAXSS(XMM0, R(XMM1)); + MINSS(XMM0, M(&m_255)); + break; + case QUANTIZE_S8: + MAXSS(XMM0, M(&m_m128)); + MINSS(XMM0, M(&m_127)); + break; + case QUANTIZE_U16: + XORPS(XMM1, R(XMM1)); + MAXSS(XMM0, R(XMM1)); + MINSS(XMM0, M(m_65535)); + break; + case QUANTIZE_S16: + MAXSS(XMM0, M(&m_m32768)); + MINSS(XMM0, M(&m_32767)); + break; + default: + break; + } + + CVTTSS2SI(RSCRATCH, R(XMM0)); + } + else + { + if (quantize == -1) + { + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); + MULPS(XMM0, R(XMM1)); + } + else if (quantize > 0) + { + MOVQ_xmm(XMM1, M(&m_quantizeTableS[quantize * 2])); + MULPS(XMM0, R(XMM1)); + } + + bool hasPACKUSDW = cpu_info.bSSE4_1; + + // Special case: if we don't have PACKUSDW we need to clamp to zero as well so the shuffle + // below can work + if (type == QUANTIZE_U16 && !hasPACKUSDW) + { + XORPS(XMM1, R(XMM1)); + MAXPS(XMM0, R(XMM1)); + } + + // According to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value + // is out of int32 range while it's OK for large negatives, it isn't for positives + // I don't know whether the overflow actually happens in any games but it potentially can + // cause problems, so we need some clamping + MINPS(XMM0, M(m_65535)); + CVTTPS2DQ(XMM0, R(XMM0)); + + switch (type) + { + case QUANTIZE_U8: + PACKSSDW(XMM0, R(XMM0)); + PACKUSWB(XMM0, R(XMM0)); + MOVD_xmm(R(RSCRATCH), XMM0); + break; + case QUANTIZE_S8: + PACKSSDW(XMM0, R(XMM0)); + PACKSSWB(XMM0, R(XMM0)); + MOVD_xmm(R(RSCRATCH), XMM0); + break; + case QUANTIZE_U16: + if (hasPACKUSDW) + { + PACKUSDW(XMM0, R(XMM0)); // AAAABBBB CCCCDDDD ... -> AABBCCDD ... + MOVD_xmm(R(RSCRATCH), XMM0); // AABBCCDD ... 
-> AABBCCDD + BSWAP(32, RSCRATCH); // AABBCCDD -> DDCCBBAA + ROL(32, R(RSCRATCH), Imm8(16)); // DDCCBBAA -> BBAADDCC + } + else + { + // We don't have PACKUSDW so we'll shuffle instead (assumes 32-bit values >= 0 and < 65536) + PSHUFLW(XMM0, R(XMM0), 2); // AABB0000 CCDD0000 ... -> CCDDAABB ... + MOVD_xmm(R(RSCRATCH), XMM0); // CCDDAABB ... -> CCDDAABB + BSWAP(32, RSCRATCH); // CCDDAABB -> BBAADDCC + } + break; + case QUANTIZE_S16: + PACKSSDW(XMM0, R(XMM0)); + MOVD_xmm(R(RSCRATCH), XMM0); + BSWAP(32, RSCRATCH); + ROL(32, R(RSCRATCH), Imm8(16)); + break; + default: + break; + } + } + + int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG; + if (!single) + flags |= SAFE_LOADSTORE_NO_SWAP; + + SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, size, 0, QUANTIZED_REGS_TO_SAVE, flags); +} + +void QuantizedMemoryRoutines::GenQuantizedStoreFloat(bool single, bool isInline) +{ + if (single) + { + // Easy! + MOVD_xmm(R(RSCRATCH), XMM0); + } + else + { + if (cpu_info.bSSSE3) + { + PSHUFB(XMM0, M(pbswapShuffle2x4)); + MOVQ_xmm(R(RSCRATCH), XMM0); + } + else + { + MOVQ_xmm(R(RSCRATCH), XMM0); + ROL(64, R(RSCRATCH), Imm8(32)); + BSWAP(64, RSCRATCH); + } + } +} + +void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type, int quantize) +{ + // Note that this method assumes that inline methods know the value of quantize ahead of + // time. The methods generated AOT assume that the quantize flag is placed in RSCRATCH in + // the second lowest byte, ie: 0x0000xx00 + + int size = sizes[type] * (single ? 1 : 2); + bool isInline = quantize != -1; + + // illegal + if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3) + { + UD2(); + return; + } + + // Floats don't use quantization and can generate more optimal code + if (type == QUANTIZE_FLOAT) + { + GenQuantizedLoadFloat(single, isInline); + return; + } + + bool extend = single && (type == QUANTIZE_S8 || type == QUANTIZE_S16); + + if (jit->jo.memcheck) + { + BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE_LOAD; + int flags = isInline ? 
0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG; + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags); + if (!single && (type == QUANTIZE_U8 || type == QUANTIZE_S8)) + { + // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice + ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); + } + } + else + { + switch (type) + { + case QUANTIZE_U8: + case QUANTIZE_S8: + UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend); + break; + case QUANTIZE_U16: + case QUANTIZE_S16: + UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend); + break; + default: + break; + } + } + + if (single) + { + CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); + + if (quantize == -1) + { + SHR(32, R(RSCRATCH2), Imm8(5)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + } + else if (quantize > 0) + { + MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2])); + } + UNPCKLPS(XMM0, M(m_one)); + } + else + { + switch (type) + { + case QUANTIZE_U8: + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + if (cpu_info.bSSE4_1) + { + PMOVZXBD(XMM0, R(XMM0)); + } + else + { + PXOR(XMM1, R(XMM1)); + PUNPCKLBW(XMM0, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + } + break; + case QUANTIZE_S8: + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + if (cpu_info.bSSE4_1) + { + PMOVSXBD(XMM0, R(XMM0)); + } + else + { + PUNPCKLBW(XMM0, R(XMM0)); + PUNPCKLWD(XMM0, R(XMM0)); + PSRAD(XMM0, 24); + } + break; + case QUANTIZE_U16: + ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + if (cpu_info.bSSE4_1) + { + PMOVZXWD(XMM0, R(XMM0)); + } + else + { + PXOR(XMM1, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + } + break; + case QUANTIZE_S16: + ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + if (cpu_info.bSSE4_1) + { + PMOVSXWD(XMM0, R(XMM0)); + } + else + { + PUNPCKLWD(XMM0, R(XMM0)); + PSRAD(XMM0, 16); + } + break; + default: + break; + } + CVTDQ2PS(XMM0, R(XMM0)); + + if (quantize == -1) + { + SHR(32, R(RSCRATCH2), Imm8(5)); + MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); + MULPS(XMM0, R(XMM1)); + } + else if (quantize > 0) + { + MOVQ_xmm(XMM1, M(&m_dequantizeTableS[quantize * 2])); + MULPS(XMM0, R(XMM1)); + } + } + + return; +} + +void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline) +{ + int size = single ? 32 : 64; + bool extend = false; + + if (jit->jo.memcheck) + { + BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE; + int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG; + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags); + } + + if (single) + { + if (jit->jo.memcheck) + { + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + } + else if (cpu_info.bSSSE3) + { + MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA)); + PSHUFB(XMM0, M(pbswapShuffle1x4)); + } + else + { + LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA)); + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + } + + UNPCKLPS(XMM0, M(m_one)); + } + else + { + // FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e. + // don't need hardware access handling. This will definitely crash if paired loads occur + // from non-RAM areas, but as far as I know, this never happens. I don't know if this is + // for a good reason, or merely because no game does this. + // If we find something that actually does do this, maybe this should be changed. How + // much of a performance hit would it be? 
+ if (jit->jo.memcheck) + { + ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); + MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); + } + else if (cpu_info.bSSSE3) + { + MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA)); + PSHUFB(XMM0, M(pbswapShuffle2x4)); + } + else + { + LoadAndSwap(64, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA)); + ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); + MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); + } + } } diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h index 62f38b5c8e..fc9f1d8bea 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h @@ -7,16 +7,31 @@ #include "Core/PowerPC/JitCommon/JitAsmCommon.h" #include "Core/PowerPC/JitCommon/Jit_Util.h" -class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock -{ -protected: - void GenQuantizedLoads(); - void GenQuantizedStores(); - void GenQuantizedSingleStores(); +enum EQuantizeType : u32; +class QuantizedMemoryRoutines : public EmuCodeBlock +{ +public: + void GenQuantizedLoad(bool single, EQuantizeType type, int quantize); + void GenQuantizedStore(bool single, EQuantizeType type, int quantize); + +private: + void GenQuantizedLoadFloat(bool single, bool isInline); + void GenQuantizedStoreFloat(bool single, bool isInline); +}; + +class CommonAsmRoutines : public CommonAsmRoutinesBase, public QuantizedMemoryRoutines +{ public: void GenFifoWrite(int size); void GenFrsqrte(); void GenFres(); void GenMfcr(); + +protected: + const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type); + const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type); + void GenQuantizedLoads(); + void GenQuantizedStores(); + void GenQuantizedSingleStores(); }; diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 0af9282645..245f73df66 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -8,6 +8,7 @@ //#define JIT_LOG_GPR // Enables logging of the PPC general purpose regs //#define JIT_LOG_FPR // Enables logging of the PPC floating point regs +#include #include #include "Common/CommonTypes.h" @@ -88,6 +89,7 @@ protected: int revertFprLoad; bool assumeNoPairedQuantize; + std::map constantGqr; bool firstFPInstructionFound; bool isLastInstruction; int skipInstructions; @@ -130,7 +132,7 @@ public: virtual bool HandleStackFault() { return false; } }; -class Jitx86Base : public JitBase, public EmuCodeBlock +class Jitx86Base : public JitBase, public QuantizedMemoryRoutines { protected: bool BackPatch(u32 emAddress, SContext* ctx);
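
Note (editor's illustration, not part of the patch): the optimization above hinges on decoding a compile-time-constant GQR value into its TYPE and SCALE fields. The sketch below shows that decoding under the assumptions stated in the Jit_LoadStorePaired.cpp comments — each 16-bit half of the GQR is laid out as UU[SCALE]UUUUU[TYPE] and must be masked with 0x3F07, the low half drives quantized stores (psq_st*), and the high half drives dequantized loads (psq_l*). The struct and helper names here are hypothetical and exist only for illustration.

    // Illustrative sketch of constant-GQR field decoding (hypothetical names).
    #include <cstdint>

    struct GqrFields
    {
      uint32_t type;   // EQuantizeType index: 0 = float, 4 = u8, 5 = u16, 6 = s8, 7 = s16
      uint32_t scale;  // 6-bit quantization/dequantization scale
    };

    static GqrFields DecodeGqrHalf(uint32_t half)
    {
      // Mask out the unused bits first (0x3F07); some games (e.g. Dirt 2) set them incorrectly.
      half &= 0x3F07;
      return {half & 0x7, (half & 0x3F00) >> 8};
    }

    // For a constant GQR value cached in js.constantGqr:
    //   store side (psq_st*): DecodeGqrHalf(value & 0xffff)
    //   load side  (psq_l*):  DecodeGqrHalf(value >> 16)

The decoded type selects the entry in pairedStoreQuantized / singleStoreQuantized (or the inline generator), and the scale indexes the quantization tables, matching what the constant-GQR paths in psq_stXX and psq_lXX do above.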