[AArch64] Implement Fiora's preemptive paired loadstore optimization.
This provides a decent speed up in pretty much everything that touches pair loadstores because in most cases they are just regular non-quantizing float loadstores that happen.
This commit is contained in:
parent
e01428935f
commit
2c68f6bfc5
|
@ -7,6 +7,7 @@
|
|||
#include "Common/PerformanceCounter.h"
|
||||
|
||||
#include "Core/PatchEngine.h"
|
||||
#include "Core/PowerPC/JitInterface.h"
|
||||
#include "Core/PowerPC/Profiler.h"
|
||||
#include "Core/PowerPC/JitArm64/Jit.h"
|
||||
#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
|
||||
|
@ -351,6 +352,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
|
|||
|
||||
js.isLastInstruction = false;
|
||||
js.firstFPInstructionFound = false;
|
||||
js.assumeNoPairedQuantize = false;
|
||||
js.blockStart = em_address;
|
||||
js.fifoBytesThisBlock = 0;
|
||||
js.downcountAmount = 0;
|
||||
|
@ -396,6 +398,30 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
|
|||
// get start tic
|
||||
BeginTimeProfile(b);
|
||||
}
|
||||
|
||||
if (code_block.m_gqr_used.Count() == 1 && js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
|
||||
{
|
||||
int gqr = *code_block.m_gqr_used.begin();
|
||||
if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
|
||||
{
|
||||
LDR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(spr[SPR_GQR0]) + gqr * 4);
|
||||
FixupBranch no_fail = B(CC_EQ);
|
||||
FixupBranch fail = B();
|
||||
SwitchToFarCode();
|
||||
SetJumpTarget(fail);
|
||||
MOVI2R(W0, js.blockStart);
|
||||
STR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(pc));
|
||||
MOVI2R(W0, (u32)JitInterface::ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
|
||||
MOVI2R(X1, (u64)&JitInterface::CompileExceptionCheck);
|
||||
BLR(X1);
|
||||
MOVI2R(X1, (u64)asm_routines.dispatcher);
|
||||
BR(X1);
|
||||
SwitchToNearCode();
|
||||
SetJumpTarget(no_fail);
|
||||
js.assumeNoPairedQuantize = true;
|
||||
}
|
||||
}
|
||||
|
||||
const u8 *normalEntry = GetCodePtr();
|
||||
b->normalEntry = normalEntry;
|
||||
|
||||
|
|
|
@ -51,7 +51,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
|
|||
{
|
||||
|
||||
if (flags & BackPatchInfo::FLAG_STORE &&
|
||||
flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64 | BackPatchInfo::FLAG_SIZE_F32I))
|
||||
flags & BackPatchInfo::FLAG_MASK_FLOAT)
|
||||
{
|
||||
if (flags & BackPatchInfo::FLAG_SIZE_F32)
|
||||
{
|
||||
|
@ -64,6 +64,12 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
|
|||
m_float_emit.REV32(8, D0, RS);
|
||||
m_float_emit.STR(32, D0, X28, addr);
|
||||
}
|
||||
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
|
||||
{
|
||||
m_float_emit.FCVTN(32, D0, RS);
|
||||
m_float_emit.REV32(8, D0, D0);
|
||||
m_float_emit.STR(64, Q0, X28, addr);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_float_emit.REV64(8, Q0, RS);
|
||||
|
@ -71,7 +77,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
|
|||
}
|
||||
}
|
||||
else if (flags & BackPatchInfo::FLAG_LOAD &&
|
||||
flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
|
||||
flags & BackPatchInfo::FLAG_MASK_FLOAT)
|
||||
{
|
||||
if (flags & BackPatchInfo::FLAG_SIZE_F32)
|
||||
{
|
||||
|
@ -166,7 +172,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
|
|||
m_float_emit.ABI_PushRegisters(fprs_to_push, X30);
|
||||
|
||||
if (flags & BackPatchInfo::FLAG_STORE &&
|
||||
flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64 | BackPatchInfo::FLAG_SIZE_F32I))
|
||||
flags & BackPatchInfo::FLAG_MASK_FLOAT)
|
||||
{
|
||||
if (flags & BackPatchInfo::FLAG_SIZE_F32)
|
||||
{
|
||||
|
@ -181,6 +187,14 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
|
|||
MOVI2R(X30, (u64)&PowerPC::Write_U32);
|
||||
BLR(X30);
|
||||
}
|
||||
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
|
||||
{
|
||||
m_float_emit.FCVTN(32, D0, RS);
|
||||
m_float_emit.UMOV(64, X0, D0, 0);
|
||||
ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
|
||||
MOVI2R(X30, (u64)PowerPC::Write_U64);
|
||||
BLR(X30);
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVI2R(X30, (u64)&PowerPC::Write_U64);
|
||||
|
@ -190,7 +204,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
|
|||
|
||||
}
|
||||
else if (flags & BackPatchInfo::FLAG_LOAD &&
|
||||
flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
|
||||
flags & BackPatchInfo::FLAG_MASK_FLOAT)
|
||||
{
|
||||
if (flags & BackPatchInfo::FLAG_SIZE_F32)
|
||||
{
|
||||
|
|
|
@ -38,8 +38,7 @@ void JitArm64::psq_l(UGeckoInstruction inst)
|
|||
ARM64Reg scale_reg = W0;
|
||||
ARM64Reg addr_reg = W1;
|
||||
ARM64Reg type_reg = W2;
|
||||
|
||||
LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
|
||||
ARM64Reg VS;
|
||||
|
||||
if (inst.RA || update) // Always uses the register on update
|
||||
{
|
||||
|
@ -53,21 +52,43 @@ void JitArm64::psq_l(UGeckoInstruction inst)
|
|||
MOVI2R(addr_reg, (u32)offset);
|
||||
}
|
||||
|
||||
UBFM(type_reg, scale_reg, 16, 18); // Type
|
||||
UBFM(scale_reg, scale_reg, 24, 29); // Scale
|
||||
|
||||
if (update)
|
||||
{
|
||||
gpr.BindToRegister(inst.RA, REG_REG);
|
||||
MOV(arm_addr, addr_reg);
|
||||
}
|
||||
|
||||
MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
|
||||
LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
|
||||
BLR(X30);
|
||||
if (js.assumeNoPairedQuantize)
|
||||
{
|
||||
VS = fpr.RW(inst.RS, REG_REG);
|
||||
if (!inst.W)
|
||||
{
|
||||
ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28);
|
||||
m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
|
||||
m_float_emit.REV32(8, VS, VS);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28);
|
||||
m_float_emit.REV32(8, VS, VS);
|
||||
|
||||
}
|
||||
m_float_emit.FCVTL(64, VS, VS);
|
||||
}
|
||||
else
|
||||
{
|
||||
LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
|
||||
UBFM(type_reg, scale_reg, 16, 18); // Type
|
||||
UBFM(scale_reg, scale_reg, 24, 29); // Scale
|
||||
|
||||
MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
|
||||
LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
|
||||
BLR(X30);
|
||||
|
||||
VS = fpr.RW(inst.RS, REG_REG);
|
||||
m_float_emit.FCVTL(64, VS, D0);
|
||||
}
|
||||
|
||||
ARM64Reg VS = fpr.RW(inst.RS, REG_REG);
|
||||
m_float_emit.FCVTL(64, VS, D0);
|
||||
if (inst.W)
|
||||
{
|
||||
m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double
|
||||
|
@ -106,11 +127,9 @@ void JitArm64::psq_st(UGeckoInstruction inst)
|
|||
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
|
||||
|
||||
// Wipe the registers we are using as temporaries
|
||||
gprs_in_use &= BitSet32(~0x40000007);
|
||||
gprs_in_use &= BitSet32(~7);
|
||||
fprs_in_use &= BitSet32(~3);
|
||||
|
||||
LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
|
||||
|
||||
if (inst.RA || update) // Always uses the register on update
|
||||
{
|
||||
if (offset >= 0)
|
||||
|
@ -123,40 +142,59 @@ void JitArm64::psq_st(UGeckoInstruction inst)
|
|||
MOVI2R(addr_reg, (u32)offset);
|
||||
}
|
||||
|
||||
UBFM(type_reg, scale_reg, 0, 2); // Type
|
||||
UBFM(scale_reg, scale_reg, 8, 13); // Scale
|
||||
|
||||
if (update)
|
||||
{
|
||||
gpr.BindToRegister(inst.RA, REG_REG);
|
||||
MOV(arm_addr, addr_reg);
|
||||
}
|
||||
|
||||
m_float_emit.FCVTN(32, D0, VS);
|
||||
|
||||
// Inline address check
|
||||
if (js.assumeNoPairedQuantize)
|
||||
{
|
||||
u32 flags = BackPatchInfo::FLAG_STORE;
|
||||
flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
|
||||
EmitBackpatchRoutine(flags,
|
||||
jo.fastmem,
|
||||
jo.fastmem,
|
||||
VS, EncodeRegTo64(addr_reg),
|
||||
gprs_in_use,
|
||||
fprs_in_use);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (inst.W)
|
||||
m_float_emit.FCVT(32, 64, D0, VS);
|
||||
else
|
||||
m_float_emit.FCVTN(32, D0, VS);
|
||||
|
||||
LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
|
||||
UBFM(type_reg, scale_reg, 0, 2); // Type
|
||||
UBFM(scale_reg, scale_reg, 8, 13); // Scale
|
||||
|
||||
// Inline address check
|
||||
TST(addr_reg, 6, 1);
|
||||
FixupBranch argh = B(CC_NEQ);
|
||||
FixupBranch pass = B(CC_EQ);
|
||||
FixupBranch fail = B();
|
||||
|
||||
SwitchToFarCode();
|
||||
SetJumpTarget(fail);
|
||||
// Slow
|
||||
MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
|
||||
LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
|
||||
|
||||
ABI_PushRegisters(gprs_in_use);
|
||||
m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
|
||||
BLR(EncodeRegTo64(type_reg));
|
||||
m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
|
||||
ABI_PopRegisters(gprs_in_use);
|
||||
FixupBranch continue1 = B();
|
||||
SwitchToNearCode();
|
||||
SetJumpTarget(pass);
|
||||
|
||||
// Fast
|
||||
MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
|
||||
LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
|
||||
BLR(EncodeRegTo64(type_reg));
|
||||
|
||||
FixupBranch continue1 = B();
|
||||
SetJumpTarget(argh);
|
||||
|
||||
// Slow
|
||||
MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
|
||||
LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
|
||||
|
||||
ABI_PushRegisters(gprs_in_use);
|
||||
m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
|
||||
BLR(EncodeRegTo64(type_reg));
|
||||
m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
|
||||
ABI_PopRegisters(gprs_in_use);
|
||||
|
||||
SetJumpTarget(continue1);
|
||||
}
|
||||
|
||||
|
|
|
@ -9,17 +9,22 @@ struct BackPatchInfo
|
|||
{
|
||||
enum
|
||||
{
|
||||
FLAG_STORE = (1 << 0),
|
||||
FLAG_LOAD = (1 << 1),
|
||||
FLAG_SIZE_8 = (1 << 2),
|
||||
FLAG_SIZE_16 = (1 << 3),
|
||||
FLAG_SIZE_32 = (1 << 4),
|
||||
FLAG_SIZE_F32 = (1 << 5),
|
||||
FLAG_SIZE_F64 = (1 << 6),
|
||||
FLAG_REVERSE = (1 << 7),
|
||||
FLAG_EXTEND = (1 << 8),
|
||||
FLAG_SIZE_F32I = (1 << 9),
|
||||
FLAG_ZERO_256 = (1 << 10),
|
||||
FLAG_STORE = (1 << 0),
|
||||
FLAG_LOAD = (1 << 1),
|
||||
FLAG_SIZE_8 = (1 << 2),
|
||||
FLAG_SIZE_16 = (1 << 3),
|
||||
FLAG_SIZE_32 = (1 << 4),
|
||||
FLAG_SIZE_F32 = (1 << 5),
|
||||
FLAG_SIZE_F32X2 = (1 << 6),
|
||||
FLAG_SIZE_F64 = (1 << 7),
|
||||
FLAG_REVERSE = (1 << 8),
|
||||
FLAG_EXTEND = (1 << 9),
|
||||
FLAG_SIZE_F32I = (1 << 10),
|
||||
FLAG_ZERO_256 = (1 << 11),
|
||||
FLAG_MASK_FLOAT = FLAG_SIZE_F32 |
|
||||
FLAG_SIZE_F32X2 |
|
||||
FLAG_SIZE_F64 |
|
||||
FLAG_SIZE_F32I,
|
||||
};
|
||||
|
||||
static u32 GetFlagSize(u32 flags)
|
||||
|
|
Loading…
Reference in New Issue