Merge pull request #1830 from FioraAeterna/gqropts

JIT: optimize for the common case of unquantized psq_l/st
commit 33047c9536
Committed by Dolphin Bot on 2015-01-11 02:01:45 +01:00
12 changed files with 275 additions and 17 deletions
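
Background, not part of the diff: psq_l/psq_st are the Gekko's paired-single
quantized load/store instructions; which (de)quantization they apply is chosen
at runtime by one of eight GQR registers. When the selected GQR is zero, the
transfer is plain single-precision floats, and that is the case this PR inlines.
A rough interpreter-style model of unquantized psq_l, with hypothetical
ReadU32BE/SetPS0/SetPS1 helpers standing in for the emulator's own:

#include <cstdint>
#include <cstring>
typedef uint32_t u32;

// Sketch only: psq_l with GQR == 0 (load type = single float, scale = 0).
static void psq_l_unquantized(u32 addr, int frD, bool w)
{
	u32 bits = ReadU32BE(addr);             // guest memory is big-endian
	float ps0;
	std::memcpy(&ps0, &bits, sizeof(ps0));
	SetPS0(frD, (double)ps0);
	if (w)
	{
		SetPS1(frD, 1.0);                   // W=1: one value, ps1 forced to 1.0
	}
	else
	{
		bits = ReadU32BE(addr + 4);         // W=0: a pair of floats
		float ps1;
		std::memcpy(&ps1, &bits, sizeof(ps1));
		SetPS1(frD, (double)ps1);
	}
}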

View File

@@ -21,6 +21,18 @@ static inline int CountSetBits(T v)
v = (v + (v >> 4)) & (T)~(T)0/255*15;
return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
}
static inline int LeastSignificantSetBit(u8 val)
{
unsigned long index;
_BitScanForward(&index, val);
return (int)index;
}
static inline int LeastSignificantSetBit(u16 val)
{
unsigned long index;
_BitScanForward(&index, val);
return (int)index;
}
static inline int LeastSignificantSetBit(u32 val)
{
unsigned long index;
@@ -34,8 +46,12 @@ static inline int LeastSignificantSetBit(u64 val)
return (int)index;
}
#else
static inline int CountSetBits(u8 val) { return __builtin_popcount(val); }
static inline int CountSetBits(u16 val) { return __builtin_popcount(val); }
static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
static inline int LeastSignificantSetBit(u8 val) { return __builtin_ctz(val); }
static inline int LeastSignificantSetBit(u16 val) { return __builtin_ctz(val); }
static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
#endif
@@ -163,5 +179,7 @@ public:
}
typedef BS::BitSet<u8> BitSet8;
typedef BS::BitSet<u16> BitSet16;
typedef BS::BitSet<u32> BitSet32;
typedef BS::BitSet<u64> BitSet64;
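
The BitSet8/BitSet64 typedefs (together with the new 8- and 16-bit
LeastSignificantSetBit overloads that back BitSet iteration) exist so the
analyzer below can track GQR usage as one bit per GQR. A minimal usage sketch,
assuming Dolphin's BitSet API (operator[], Count(), iteration over set-bit
indices):

BitSet8 gqrUsed;                  // one bit per GQR0..GQR7
gqrUsed[3] = true;                // the block reads GQR3
if (gqrUsed.Count() == 1)
{
	int gqr = *gqrUsed.begin();   // iterator yields set-bit indices: 3
	// ...specialize the block against this single GQR...
}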

View File

@@ -137,10 +137,10 @@ static GekkoOPTemplate table4_2[] =
static GekkoOPTemplate table4_3[] =
{
- {6, Interpreter::psq_lx, {"psq_lx", OPTYPE_PS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
- {7, Interpreter::psq_stx, {"psq_stx", OPTYPE_PS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
- {38, Interpreter::psq_lux, {"psq_lux", OPTYPE_PS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
- {39, Interpreter::psq_stux, {"psq_stux", OPTYPE_PS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+ {6, Interpreter::psq_lx, {"psq_lx", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+ {7, Interpreter::psq_stx, {"psq_stx", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+ {38, Interpreter::psq_lux, {"psq_lux", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+ {39, Interpreter::psq_stux, {"psq_stux", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
};
static GekkoOPTemplate table19[] =

View File

@@ -15,6 +15,7 @@
#include "Core/PatchEngine.h"
#include "Core/HLE/HLE.h"
#include "Core/HW/ProcessorInterface.h"
#include "Core/PowerPC/JitInterface.h"
#include "Core/PowerPC/Profiler.h"
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/Jit64_Tables.h"
@@ -605,6 +606,35 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
js.skipnext = false;
js.carryFlagSet = false;
js.carryFlagInverted = false;
js.assumeNoPairedQuantize = false;
// If the block only uses one GQR and that GQR is zero at compile time, guess that the block
// never uses quantized loads/stores. Many paired-heavy games mostly use plain float loads and
// stores, which are significantly faster when inlined (especially in MMU mode, where this lets
// them use fastmem).
// Insert a check that the GQR is still zero at the start of the block in case our guess turns
// out to be wrong.
// TODO: support any other constant GQR value, not merely zero/unquantized: we can optimize
// quantized loads/stores too, it'd just be more code.
if (code_block.m_gqr_used.Count() == 1 && js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
{
int gqr = *code_block.m_gqr_used.begin();
if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
{
CMP(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm8(0));
FixupBranch failure = J_CC(CC_NZ, true);
SwitchToFarCode();
SetJumpTarget(failure);
MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionC((void *)&JitInterface::CompileExceptionCheck, (u32)JitInterface::ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
ABI_PopRegistersAndAdjustStack({}, 0);
JMP(asm_routines.dispatcher, true);
SwitchToNearCode();
js.assumeNoPairedQuantize = true;
}
}
// Translate instructions
for (u32 i = 0; i < code_block.m_num_instructions; i++)
{
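
At runtime, the guard emitted above behaves roughly like this pseudo-C++
(illustrative control flow, not the actual emitter calls):

// Prologue of a block compiled with assumeNoPairedQuantize:
if (ppcState.spr[SPR_GQR0 + gqr] != 0)   // the compile-time guess was wrong
{
	ppcState.pc = blockStart;
	// Blacklist blockStart in pairedQuantizeAddresses and invalidate the
	// block; the recompile then uses the generic quantized routines.
	JitInterface::CompileExceptionCheck(ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
	goto dispatcher;
}
// Fast path: every psq_l/psq_st below is inlined as a plain float load/store.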

View File

@@ -11,6 +11,7 @@
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/JitAsm.h"
#include "Core/PowerPC/Jit64/JitRegCache.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
using namespace Gen;
@@ -20,7 +21,6 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
- FALLBACK_IF(!inst.RA);
s32 offset = inst.SIMM_12;
bool indexed = inst.OPCD == 4;
@@ -30,12 +30,75 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
int s = inst.FS;
int i = indexed ? inst.Ix : inst.I;
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);
gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)
{
int storeOffset = 0;
gpr.BindToRegister(a, true, update);
X64Reg addr = gpr.RX(a);
if (update && js.memcheck)
{
addr = RSCRATCH2;
MOV(32, R(addr), gpr.R(a));
}
if (indexed)
{
if (update)
{
ADD(32, R(addr), gpr.R(b));
}
else
{
addr = RSCRATCH2;
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
}
else
{
MOV(32, R(addr), gpr.R(b));
if (a)
ADD(32, R(addr), gpr.R(a));
}
}
}
else
{
if (update)
ADD(32, R(addr), Imm32(offset));
else
storeOffset = offset;
}
fpr.Lock(s);
if (w)
{
CVTSD2SS(XMM0, fpr.R(s));
MOVD_xmm(R(RSCRATCH), XMM0);
}
else
{
CVTPD2PS(XMM0, fpr.R(s));
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
}
BitSet32 registersInUse = CallerSavedRegistersInUse();
if (update && js.memcheck)
registersInUse[addr] = true;
SafeWriteRegToReg(RSCRATCH, addr, w ? 32 : 64, storeOffset, registersInUse);
MemoryExceptionCheck();
if (update && js.memcheck)
MOV(32, gpr.R(a), R(addr));
gpr.UnlockAll();
fpr.UnlockAll();
return;
}
gpr.FlushLockX(RSCRATCH_EXTRA);
if (update)
gpr.BindToRegister(a, true, true);
fpr.BindToRegister(s, true, false);
if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && (indexed || offset))
{
if (indexed)
@@ -92,7 +155,6 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
- FALLBACK_IF(!inst.RA);
s32 offset = inst.SIMM_12;
bool indexed = inst.OPCD == 4;
@@ -102,8 +164,116 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
int s = inst.FS;
int i = indexed ? inst.Ix : inst.I;
int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(!a);
gpr.Lock(a, b);
if (js.assumeNoPairedQuantize)
{
s32 loadOffset = 0;
gpr.BindToRegister(a, true, update);
X64Reg addr = gpr.RX(a);
if (update && js.memcheck)
{
addr = RSCRATCH2;
MOV(32, R(addr), gpr.R(a));
}
if (indexed)
{
if (update)
{
ADD(32, R(addr), gpr.R(b));
}
else
{
addr = RSCRATCH2;
if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
{
LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
}
else
{
MOV(32, R(addr), gpr.R(b));
if (a)
ADD(32, R(addr), gpr.R(a));
}
}
}
else
{
if (update)
ADD(32, R(addr), Imm32(offset));
else
loadOffset = offset;
}
fpr.Lock(s);
if (js.memcheck)
{
fpr.StoreFromRegister(s);
js.revertFprLoad = s;
}
fpr.BindToRegister(s, false);
// Let's mirror the JitAsmCommon code and assume all non-MMU loads go to RAM.
if (!js.memcheck)
{
if (w)
{
if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
MOVD_xmm(XMM0, R(RSCRATCH));
UNPCKLPS(XMM0, M(m_one));
}
}
else
{
if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
ROL(64, R(RSCRATCH), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH));
}
}
CVTPS2PD(fpr.RX(s), R(XMM0));
}
else
{
BitSet32 registersInUse = CallerSavedRegistersInUse();
registersInUse[fpr.RX(s) + 16] = false;  // XMM n lives at bit 16 + n of the bitset
if (update)
registersInUse[addr] = true;
SafeLoadToReg(RSCRATCH, R(addr), w ? 32 : 64, loadOffset, registersInUse, false);
MemoryExceptionCheck();
if (w)
{
MOVD_xmm(XMM0, R(RSCRATCH));
UNPCKLPS(XMM0, M(m_one));
}
else
{
ROL(64, R(RSCRATCH), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH));
}
CVTPS2PD(fpr.RX(s), R(XMM0));
if (update)
MOV(32, gpr.R(a), R(addr));
}
gpr.UnlockAll();
fpr.UnlockAll();
return;
}
gpr.FlushLockX(RSCRATCH_EXTRA);
gpr.BindToRegister(a, true, update);
fpr.BindToRegister(s, false, true);
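
A note on the non-SSSE3 pair load above: LoadAndSwap(64, ...) byteswaps all
eight bytes at once, which fixes each float's endianness but leaves ps0 in the
high half of the register and ps1 in the low half; the ROL by 32 swaps them
back. In scalar terms (sketch, with hypothetical Load64/Bswap64 helpers):

u64 v = Bswap64(Load64(p));  // each float now native-endian, halves swapped
v = (v << 32) | (v >> 32);   // ROL 64,32: ps0 back in the low half
// MOVQ of v into an XMM puts ps0 in lane 0 and ps1 in lane 1; CVTPS2PD then
// widens both singles to doubles. The store path is the mirror image:
// CVTPD2PS narrows, and its ROL pre-swaps the halves so that the single
// 64-bit byteswap in SafeWriteRegToReg emits the two floats big-endian.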

View File

@@ -191,8 +191,8 @@ void CommonAsmRoutines::GenMfcr()
// Safe + Fast Quantizers, originally from JITIL by magumagu
- static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
- static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
+ const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+ const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
static const float GC_ALIGNED16(m_quantizeTableS[]) =
{
@@ -257,7 +257,7 @@ static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;
- static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
+ const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };
#define QUANTIZE_OVERFLOW_SAFE
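
These tables lose their static qualifier because the inlined fast path in
Jit_LoadStorePaired.cpp now references them directly. m_one serves the W=1
case: UNPCKLPS interleaves the low lanes of {v,0,0,0} and {1,0,0,0} into
{v,1,0,0}, pairing the loaded single with 1.0 in one instruction. The same
trick in intrinsics form (illustration only):

#include <xmmintrin.h>

// Pair a loaded single with 1.0f, as the inlined W=1 load does with m_one.
static __m128 PairWithOne(float v)
{
	__m128 x   = _mm_set_ss(v);      // { v, 0, 0, 0 }
	__m128 one = _mm_set_ss(1.0f);   // { 1, 0, 0, 0 }, the role of m_one
	return _mm_unpacklo_ps(x, one);  // { v, 1, 0, 0 }
}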

View File

@@ -6,6 +6,10 @@
#include "Core/PowerPC/JitCommon/Jit_Util.h"
extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
extern const float GC_ALIGNED16(m_one[]);
class CommonAsmRoutinesBase
{
public:

View File

@@ -84,6 +84,7 @@ protected:
int revertGprLoad;
int revertFprLoad;
bool assumeNoPairedQuantize;
bool firstFPInstructionFound;
bool isLastInstruction;
bool memcheck;
@@ -104,6 +105,7 @@
JitBlock *curBlock;
std::unordered_set<u32> fifoWriteAddresses;
std::unordered_set<u32> pairedQuantizeAddresses;
};
PPCAnalyst::CodeBlock code_block;

View File

@@ -65,6 +65,7 @@ using namespace Gen;
Core::DisplayMessage("Clearing code cache.", 3000);
#endif
jit->js.fifoWriteAddresses.clear();
jit->js.pairedQuantizeAddresses.clear();
for (int i = 0; i < num_blocks; i++)
{
DestroyBlock(i, false);
@@ -311,7 +312,10 @@ using namespace Gen;
if (!forced)
{
for (u32 i = address; i < address + length; i += 4)
{
jit->js.fifoWriteAddresses.erase(i);
jit->js.pairedQuantizeAddresses.erase(i);
}
}
}
}

View File

@@ -240,20 +240,26 @@ namespace JitInterface
case ExceptionType::EXCEPTIONS_FIFO_WRITE:
exception_addresses = &jit->js.fifoWriteAddresses;
break;
case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE:
exception_addresses = &jit->js.pairedQuantizeAddresses;
break;
}
if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end()))
{
- int optype = GetOpInfo(Memory::ReadUnchecked_U32(PC))->type;
- if (optype == OPTYPE_STORE || optype == OPTYPE_STOREFP || (optype == OPTYPE_STOREPS))
+ if (type == ExceptionType::EXCEPTIONS_FIFO_WRITE)
+ {
+ // Check in case the code has been replaced since: do we need to do this?
+ int optype = GetOpInfo(Memory::ReadUnchecked_U32(PC))->type;
+ if (optype != OPTYPE_STORE && optype != OPTYPE_STOREFP && (optype != OPTYPE_STOREPS))
+ return;
+ }
exception_addresses->insert(PC);
// Invalidate the JIT block so that it gets recompiled with the external exception check included.
jit->GetBlockCache()->InvalidateICache(PC, 4, true);
}
}
}
void Shutdown()
{

View File

@@ -13,7 +13,8 @@ namespace JitInterface
{
enum class ExceptionType
{
- EXCEPTIONS_FIFO_WRITE
+ EXCEPTIONS_FIFO_WRITE,
+ EXCEPTIONS_PAIRED_QUANTIZE
};
void DoState(PointerWrap &p);

View File

@@ -638,6 +638,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
block->m_broken = false;
block->m_memory_exception = false;
block->m_num_instructions = 0;
block->m_gqr_used = BitSet8(0);
if (address == 0)
{
@@ -865,6 +866,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
// Forward scan, for flags that need the other direction for calculation.
BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
BitSet8 gqrUsed, gqrModified;
for (u32 i = 0; i < block->m_num_instructions; i++)
{
code[i].fprIsSingle = fprIsSingle;
@@ -903,7 +905,22 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
if (!strncmp(code[i].opinfo->opname, "mtfs", 4))
fprIsStoreSafe = BitSet32(0);
}
if (code[i].opinfo->type == OPTYPE_STOREPS || code[i].opinfo->type == OPTYPE_LOADPS)
{
int gqr = code[i].inst.OPCD == 4 ? code[i].inst.Ix : code[i].inst.I;
gqrUsed[gqr] = true;
}
if (code[i].inst.OPCD == 31 && code[i].inst.SUBOP10 == 467) // mtspr
{
int gqr = ((code[i].inst.SPRU << 5) | code[i].inst.SPRL) - SPR_GQR0;
if (gqr >= 0 && gqr <= 7)
gqrModified[gqr] = true;
}
}
block->m_gqr_used = gqrUsed;
block->m_gqr_modified = gqrModified;
return address;
}
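
For reference, the mtspr decode above works because PowerPC encodes the 10-bit
SPR number with its two 5-bit halves swapped in the instruction word, so the
real number is (SPRU << 5) | SPRL. The Gekko GQRs are SPRs 912-919 (SPR_GQR0 =
912), hence the 0..7 range check. A standalone sketch:

// mtspr GQR3, rS carries SPR 915 (= 912 + 3) with its halves swapped.
static int DecodeGqrIndex(unsigned spru, unsigned sprl)
{
	int spr = (int)((spru << 5) | sprl);  // undo the halves swap
	return spr - 912;                     // SPR_GQR0; yields 0..7 for GQR writes
}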

View File

@@ -154,6 +154,12 @@ struct CodeBlock
// Did we have a memory_exception?
bool m_memory_exception;
// Which GQRs this block uses, if any.
BitSet8 m_gqr_used;
// Which GQRs this block modifies, if any.
BitSet8 m_gqr_modified;
};
class PPCAnalyzer