JIT: optimize for the common case of unquantized psq_l/st

Optimistically assume used GQRs are 0 in blocks that only use one GQR, and
bail at the start of the block and recompile if that assumption fails.

Many games use almost entirely unquantized stores (e.g. Rebel Strike, Sonic
Colors), so this will likely be a big performance improvement across the board
for games with heavy use of paired singles.
This commit is contained in:
Fiora 2015-01-04 04:20:59 -08:00
parent e32d63c43d
commit 8237004448
12 changed files with 275 additions and 17 deletions

View File

@ -21,6 +21,18 @@ static inline int CountSetBits(T v)
v = (v + (v >> 4)) & (T)~(T)0/255*15;
return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
}
static inline int LeastSignificantSetBit(u8 val)
{
unsigned long index;
_BitScanForward(&index, val);
return (int)index;
}
static inline int LeastSignificantSetBit(u16 val)
{
unsigned long index;
_BitScanForward(&index, val);
return (int)index;
}
static inline int LeastSignificantSetBit(u32 val)
{
unsigned long index;
@ -34,8 +46,12 @@ static inline int LeastSignificantSetBit(u64 val)
return (int)index;
}
#else
static inline int CountSetBits(u8 val) { return __builtin_popcount(val); }
static inline int CountSetBits(u16 val) { return __builtin_popcount(val); }
static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
static inline int LeastSignificantSetBit(u8 val) { return __builtin_ctz(val); }
static inline int LeastSignificantSetBit(u16 val) { return __builtin_ctz(val); }
static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
#endif
@ -163,5 +179,7 @@ public:
}
typedef BS::BitSet<u8> BitSet8;
typedef BS::BitSet<u16> BitSet16;
typedef BS::BitSet<u32> BitSet32;
typedef BS::BitSet<u64> BitSet64;

View File

@ -137,10 +137,10 @@ static GekkoOPTemplate table4_2[] =
static GekkoOPTemplate table4_3[] =
{
{6, Interpreter::psq_lx, {"psq_lx", OPTYPE_PS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{7, Interpreter::psq_stx, {"psq_stx", OPTYPE_PS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{38, Interpreter::psq_lux, {"psq_lux", OPTYPE_PS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{39, Interpreter::psq_stux, {"psq_stux", OPTYPE_PS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{6, Interpreter::psq_lx, {"psq_lx", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{7, Interpreter::psq_stx, {"psq_stx", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{38, Interpreter::psq_lux, {"psq_lux", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
{39, Interpreter::psq_stux, {"psq_stux", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
};
static GekkoOPTemplate table19[] =

View File

@ -15,6 +15,7 @@
#include "Core/PatchEngine.h"
#include "Core/HLE/HLE.h"
#include "Core/HW/ProcessorInterface.h"
#include "Core/PowerPC/JitInterface.h"
#include "Core/PowerPC/Profiler.h"
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/Jit64_Tables.h"
@ -605,6 +606,35 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
js.skipnext = false;
js.carryFlagSet = false;
js.carryFlagInverted = false;
js.assumeNoPairedQuantize = false;
// If the block only uses one GQR and the GQR is zero at compile time, make a guess that the block
// never uses quantized loads/stores. Many paired-heavy games use largely float loads and stores,
// which are significantly faster when inlined (especially in MMU mode, where this lets them use
// fastmem).
// Insert a check that the GQR is still zero at the start of the block in case our guess turns out
// wrong.
// TODO: support any other constant GQR value, not merely zero/unquantized: we can optimize quantized
// loadstores too, it'd just be more code.
if (code_block.m_gqr_used.Count() == 1 && js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
{
int gqr = *code_block.m_gqr_used.begin();
if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
{
CMP(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm8(0));
FixupBranch failure = J_CC(CC_NZ, true);
SwitchToFarCode();
SetJumpTarget(failure);
MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionC((void *)&JitInterface::CompileExceptionCheck, (u32)JitInterface::ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
ABI_PopRegistersAndAdjustStack({}, 0);
JMP(asm_routines.dispatcher, true);
SwitchToNearCode();
js.assumeNoPairedQuantize = true;
}
}
// Translate instructions
for (u32 i = 0; i < code_block.m_num_instructions; i++)
{

View File

@ -11,6 +11,7 @@
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/JitAsm.h"
#include "Core/PowerPC/Jit64/JitRegCache.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
using namespace Gen;
@ -20,7 +21,6 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
FALLBACK_IF(!inst.RA);
s32 offset = inst.SIMM_12;
bool indexed = inst.OPCD == 4;
@ -30,12 +30,75 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
int s = inst.FS;
int i = indexed ? inst.Ix : inst.I;
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);
gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)
{
int storeOffset = 0;
gpr.BindToRegister(a, true, update);
X64Reg addr = gpr.RX(a);
if (update && js.memcheck)
{
addr = RSCRATCH2;
MOV(32, R(addr), gpr.R(a));
}
if (indexed)
{
if (update)
{
ADD(32, R(addr), gpr.R(b));
}
else
{
addr = RSCRATCH2;
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
}
else
{
MOV(32, R(addr), gpr.R(b));
if (a)
ADD(32, R(addr), gpr.R(a));
}
}
}
else
{
if (update)
ADD(32, R(addr), Imm32(offset));
else
storeOffset = offset;
}
fpr.Lock(s);
if (w)
{
CVTSD2SS(XMM0, fpr.R(s));
MOVD_xmm(R(RSCRATCH), XMM0);
}
else
{
CVTPD2PS(XMM0, fpr.R(s));
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
}
BitSet32 registersInUse = CallerSavedRegistersInUse();
if (update && js.memcheck)
registersInUse[addr] = true;
SafeWriteRegToReg(RSCRATCH, addr, w ? 32 : 64, storeOffset, registersInUse);
MemoryExceptionCheck();
if (update && js.memcheck)
MOV(32, gpr.R(a), R(addr));
gpr.UnlockAll();
fpr.UnlockAll();
return;
}
gpr.FlushLockX(RSCRATCH_EXTRA);
if (update)
gpr.BindToRegister(a, true, true);
fpr.BindToRegister(s, true, false);
if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && (indexed || offset))
{
if (indexed)
@ -92,7 +155,6 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
FALLBACK_IF(!inst.RA);
s32 offset = inst.SIMM_12;
bool indexed = inst.OPCD == 4;
@ -102,8 +164,116 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
int s = inst.FS;
int i = indexed ? inst.Ix : inst.I;
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);
gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)
{
s32 loadOffset = 0;
gpr.BindToRegister(a, true, update);
X64Reg addr = gpr.RX(a);
if (update && js.memcheck)
{
addr = RSCRATCH2;
MOV(32, R(addr), gpr.R(a));
}
if (indexed)
{
if (update)
{
ADD(32, R(addr), gpr.R(b));
}
else
{
addr = RSCRATCH2;
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
}
else
{
MOV(32, R(addr), gpr.R(b));
if (a)
ADD(32, R(addr), gpr.R(a));
}
}
}
else
{
if (update)
ADD(32, R(addr), Imm32(offset));
else
loadOffset = offset;
}
fpr.Lock(s);
if (js.memcheck)
{
fpr.StoreFromRegister(s);
js.revertFprLoad = s;
}
fpr.BindToRegister(s, false);
// Let's mirror the JitAsmCommon code and assume all non-MMU loads go to RAM.
if (!js.memcheck)
{
if (w)
{
if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
MOVD_xmm(XMM0, R(RSCRATCH));
UNPCKLPS(XMM0, M(m_one));
}
}
else
{
if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
ROL(64, R(RSCRATCH), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH));
}
}
CVTPS2PD(fpr.RX(s), R(XMM0));
}
else
{
BitSet32 registersInUse = CallerSavedRegistersInUse();
registersInUse[fpr.RX(s) << 16] = false;
if (update)
registersInUse[addr] = true;
SafeLoadToReg(RSCRATCH, R(addr), w ? 32 : 64, loadOffset, registersInUse, false);
MemoryExceptionCheck();
if (w)
{
MOVD_xmm(XMM0, R(RSCRATCH));
UNPCKLPS(XMM0, M(m_one));
}
else
{
ROL(64, R(RSCRATCH), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH));
}
CVTPS2PD(fpr.RX(s), R(XMM0));
if (update)
MOV(32, gpr.R(a), R(addr));
}
gpr.UnlockAll();
fpr.UnlockAll();
return;
}
gpr.FlushLockX(RSCRATCH_EXTRA);
gpr.BindToRegister(a, true, update);
fpr.BindToRegister(s, false, true);

View File

@ -191,8 +191,8 @@ void CommonAsmRoutines::GenMfcr()
// Safe + Fast Quantizers, originally from JITIL by magumagu
static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
static const float GC_ALIGNED16(m_quantizeTableS[]) =
{
@ -257,7 +257,7 @@ static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;
static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };
#define QUANTIZE_OVERFLOW_SAFE

View File

@ -6,6 +6,10 @@
#include "Core/PowerPC/JitCommon/Jit_Util.h"
extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
extern const float GC_ALIGNED16(m_one[]);
class CommonAsmRoutinesBase
{
public:

View File

@ -84,6 +84,7 @@ protected:
int revertGprLoad;
int revertFprLoad;
bool assumeNoPairedQuantize;
bool firstFPInstructionFound;
bool isLastInstruction;
bool memcheck;
@ -104,6 +105,7 @@ protected:
JitBlock *curBlock;
std::unordered_set<u32> fifoWriteAddresses;
std::unordered_set<u32> pairedQuantizeAddresses;
};
PPCAnalyst::CodeBlock code_block;

View File

@ -65,6 +65,7 @@ using namespace Gen;
Core::DisplayMessage("Clearing code cache.", 3000);
#endif
jit->js.fifoWriteAddresses.clear();
jit->js.pairedQuantizeAddresses.clear();
for (int i = 0; i < num_blocks; i++)
{
DestroyBlock(i, false);
@ -311,7 +312,10 @@ using namespace Gen;
if (!forced)
{
for (u32 i = address; i < address + length; i += 4)
{
jit->js.fifoWriteAddresses.erase(i);
jit->js.pairedQuantizeAddresses.erase(i);
}
}
}
}

View File

@ -240,20 +240,26 @@ namespace JitInterface
case ExceptionType::EXCEPTIONS_FIFO_WRITE:
exception_addresses = &jit->js.fifoWriteAddresses;
break;
case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE:
exception_addresses = &jit->js.pairedQuantizeAddresses;
break;
}
if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end()))
{
int optype = GetOpInfo(Memory::ReadUnchecked_U32(PC))->type;
if (optype == OPTYPE_STORE || optype == OPTYPE_STOREFP || (optype == OPTYPE_STOREPS))
if (type == ExceptionType::EXCEPTIONS_FIFO_WRITE)
{
// Check in case the code has been replaced since: do we need to do this?
int optype = GetOpInfo(Memory::ReadUnchecked_U32(PC))->type;
if (optype != OPTYPE_STORE && optype != OPTYPE_STOREFP && (optype != OPTYPE_STOREPS))
return;
}
exception_addresses->insert(PC);
// Invalidate the JIT block so that it gets recompiled with the external exception check included.
jit->GetBlockCache()->InvalidateICache(PC, 4, true);
}
}
}
void Shutdown()
{

View File

@ -13,7 +13,8 @@ namespace JitInterface
{
enum class ExceptionType
{
EXCEPTIONS_FIFO_WRITE
EXCEPTIONS_FIFO_WRITE,
EXCEPTIONS_PAIRED_QUANTIZE
};
void DoState(PointerWrap &p);

View File

@ -638,6 +638,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
block->m_broken = false;
block->m_memory_exception = false;
block->m_num_instructions = 0;
block->m_gqr_used = BitSet8(0);
if (address == 0)
{
@ -865,6 +866,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
// Forward scan, for flags that need the other direction for calculation.
BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
BitSet8 gqrUsed, gqrModified;
for (u32 i = 0; i < block->m_num_instructions; i++)
{
code[i].fprIsSingle = fprIsSingle;
@ -903,7 +905,22 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
if (!strncmp(code[i].opinfo->opname, "mtfs", 4))
fprIsStoreSafe = BitSet32(0);
}
if (code[i].opinfo->type == OPTYPE_STOREPS || code[i].opinfo->type == OPTYPE_LOADPS)
{
int gqr = code[i].inst.OPCD == 4 ? code[i].inst.Ix : code[i].inst.I;
gqrUsed[gqr] = true;
}
if (code[i].inst.OPCD == 31 && code[i].inst.SUBOP10 == 467) // mtspr
{
int gqr = ((code[i].inst.SPRU << 5) | code[i].inst.SPRL) - SPR_GQR0;
if (gqr >= 0 && gqr <= 7)
gqrModified[gqr] = true;
}
}
block->m_gqr_used = gqrUsed;
block->m_gqr_modified = gqrModified;
return address;
}

View File

@ -154,6 +154,12 @@ struct CodeBlock
// Did we have a memory_exception?
bool m_memory_exception;
// Which GQRs this block uses, if any.
BitSet8 m_gqr_used;
// Which GQRs this block modifies, if any.
BitSet8 m_gqr_modified;
};
class PPCAnalyzer