Merge pull request #2186 from Sonicadvance1/aarch64_optimize_paired_slowmem

[AArch64] Optimize slowmem paired stores.
Ryan Houdek 2015-03-15 14:37:21 -05:00
commit 5e0b9179db
3 changed files with 219 additions and 212 deletions


@@ -2618,6 +2618,7 @@ void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
   bool b64Bit = Is64Bit(Rd);
   _assert_msg_(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __FUNCTION__);
   _assert_msg_(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __FUNCTION__);
+  _assert_msg_(DYNA_REC, !b64Bit && size != 32, "%s doesn't support 32bit move to 32bit register. Use UMOV!", __FUNCTION__);
   u32 imm5 = 0;
   if (size == 8)
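
The added assert tightens SMOV to the lane moves that actually need sign extension. A hedged usage sketch of what it now permits and rejects (these operand combinations are the ones this commit itself uses):

// SMOV sign-extends a vector lane into a GPR; 8- and 16-bit lanes into a 32-bit
// destination are fine, while a plain 32-bit lane to a 32-bit register is redirected to UMOV.
float_emit.SMOV(8, W0, Q0, 0);    // ok: sign-extend byte lane 0 into W0
float_emit.SMOV(16, W0, Q0, 0);   // ok: sign-extend halfword lane 0 into W0
float_emit.UMOV(32, W0, Q0, 0);   // 32-bit lane to a 32-bit register: use UMOV instead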


@@ -94,9 +94,18 @@ void JitArm64::psq_st(UGeckoInstruction inst)
   fpr.Lock(Q0, Q1);
   ARM64Reg arm_addr = gpr.R(inst.RA);
+  ARM64Reg VS = fpr.R(inst.RS);
   ARM64Reg scale_reg = W0;
   ARM64Reg addr_reg = W1;
-  ARM64Reg type_reg = gpr.GetReg();
+  ARM64Reg type_reg = W2;
+  BitSet32 gprs_in_use = gpr.GetCallerSavedUsed();
+  BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
+  // Wipe the registers we are using as temporaries
+  gprs_in_use &= BitSet32(~0x40000007);
+  fprs_in_use &= BitSet32(~3);
   LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
@@ -118,13 +127,35 @@ void JitArm64::psq_st(UGeckoInstruction inst)
   if (update)
     MOV(arm_addr, addr_reg);
-  ARM64Reg VS = fpr.R(inst.RS);
   m_float_emit.FCVTN(32, D0, VS);
-  MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
-  LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
-  BLR(X30);
-  gpr.Unlock(W0, W1, W2, W30, type_reg);
+  // Inline address check
+  {
+    TST(addr_reg, 6, 1);
+    FixupBranch argh = B(CC_NEQ);
+    // Fast
+    MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
+    LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
+    BLR(EncodeRegTo64(type_reg));
+    FixupBranch continue1 = B();
+    SetJumpTarget(argh);
+    // Slow
+    MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
+    LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
+    ABI_PushRegisters(gprs_in_use);
+    m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
+    BLR(EncodeRegTo64(type_reg));
+    m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
+    ABI_PushRegisters(gprs_in_use);
+    SetJumpTarget(continue1);
+  }
+  gpr.Unlock(W0, W1, W2, W30);
   fpr.Unlock(Q0, Q1);
 }
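
For readability, here is a hedged sketch (ordinary C++, not emitter calls) of the control flow the block above emits: an inline address check picks either the fast routine from the lower half of pairedStoreQuantized or its slowmem twin 16 entries higher, and only the slow path pays for saving the live caller-saved registers. The address_check_mask and the push/pop helpers are illustrative stand-ins for the emitted TST and ABI_PushRegisters/ABI_PopRegisters sequences.

#include <cstdint>

using StoreRoutine = void (*)();

void DispatchPairedStore(const StoreRoutine* pairedStoreQuantized, uint32_t addr,
                         uint32_t type, uint32_t W, uint32_t address_check_mask,
                         void (*push_volatiles)(), void (*pop_volatiles)())
{
    if ((addr & address_check_mask) == 0)
    {
        pairedStoreQuantized[W * 8 + type]();        // fast path: fastmem store, ends in RET
    }
    else
    {
        push_volatiles();                            // save live caller-saved GPRs/FPRs
        pairedStoreQuantized[16 + W * 8 + type]();   // slow path: routine tail-calls PowerPC::Write_*
        pop_volatiles();
    }
}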


@@ -107,7 +107,6 @@ void JitArm64AsmRoutineManager::GenerateCommon()
   ARM64Reg addr_reg = X1;
   ARM64Reg scale_reg = X0;
   ARM64FloatEmitter float_emit(this);
-  const u32 GPR_CALLER_SAVE = 0x6007FFFF;
   const u8* loadPairedIllegal = GetCodePtr();
   BRK(100);
@@ -263,36 +262,27 @@ void JitArm64AsmRoutineManager::GenerateCommon()
   // Stores
   const u8* storePairedIllegal = GetCodePtr();
   BRK(0x101);
-  const u8* storePairedFloat = GetCodePtr();
+  const u8* storePairedFloat;
+  const u8* storePairedFloatSlow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    storePairedFloat = GetCodePtr();
     float_emit.REV32(8, D0, D0);
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(64, Q0, 0, addr_reg, SP);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedFloatSlow = GetCodePtr();
     float_emit.UMOV(64, X0, Q0, 0);
     ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
-    MOVI2R(X30, (u64)PowerPC::Write_U64);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U64);
+    BR(X2);
   }
-  const u8* storePairedU8 = GetCodePtr();
-  {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
+  const u8* storePairedU8;
+  const u8* storePairedU8Slow;
+  {
+    auto emit_quantize = [this, &float_emit, scale_reg]()
+    {
     MOVI2R(X2, (u64)&m_quantizeTableS);
     ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
     float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -300,30 +290,26 @@ void JitArm64AsmRoutineManager::GenerateCommon()
     float_emit.FCVTZU(32, D0, D0);
     float_emit.XTN(16, D0, D0);
     float_emit.XTN(8, D0, D0);
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    };
+    storePairedU8 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(16, Q0, 0, addr_reg, SP);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedU8Slow = GetCodePtr();
+    emit_quantize();
     float_emit.UMOV(16, W0, Q0, 0);
     REV16(W0, W0);
-    MOVI2R(X30, (u64)PowerPC::Write_U16);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U16);
+    BR(X2);
   }
-  const u8* storePairedS8 = GetCodePtr();
+  const u8* storePairedS8;
+  const u8* storePairedS8Slow;
+  {
+    auto emit_quantize = [this, &float_emit, scale_reg]()
     {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
     MOVI2R(X2, (u64)&m_quantizeTableS);
     ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
     float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -331,31 +317,27 @@ void JitArm64AsmRoutineManager::GenerateCommon()
     float_emit.FCVTZS(32, D0, D0);
     float_emit.XTN(16, D0, D0);
     float_emit.XTN(8, D0, D0);
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    };
+    storePairedS8 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(16, Q0, 0, addr_reg, SP);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedS8Slow = GetCodePtr();
+    emit_quantize();
     float_emit.UMOV(16, W0, Q0, 0);
     REV16(W0, W0);
-    MOVI2R(X30, (u64)PowerPC::Write_U16);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U16);
+    BR(X2);
   }
-  const u8* storePairedU16 = GetCodePtr();
+  const u8* storePairedU16;
+  const u8* storePairedU16Slow;
+  {
+    auto emit_quantize = [this, &float_emit, scale_reg]()
     {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
     MOVI2R(X2, (u64)&m_quantizeTableS);
     ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
     float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -363,29 +345,26 @@ void JitArm64AsmRoutineManager::GenerateCommon()
     float_emit.FCVTZU(32, D0, D0);
     float_emit.XTN(16, D0, D0);
     float_emit.REV16(8, D0, D0);
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    };
+    storePairedU16 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(32, Q0, 0, addr_reg, SP);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedU16Slow = GetCodePtr();
+    emit_quantize();
     float_emit.REV32(8, D0, D0);
     float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)PowerPC::Write_U32);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U32);
+    BR(X2);
   }
-  const u8* storePairedS16 = GetCodePtr(); // Used by Viewtiful Joe's intro movie
+  const u8* storePairedS16; // Used by Viewtiful Joe's intro movie
+  const u8* storePairedS16Slow;
+  {
+    auto emit_quantize = [this, &float_emit, scale_reg]()
     {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
     MOVI2R(X2, (u64)&m_quantizeTableS);
     ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
     float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -393,54 +372,41 @@ void JitArm64AsmRoutineManager::GenerateCommon()
     float_emit.FCVTZS(32, D0, D0);
     float_emit.XTN(16, D0, D0);
     float_emit.REV16(8, D0, D0);
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    };
+    storePairedS16 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(32, Q0, 0, addr_reg, SP);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedS16Slow = GetCodePtr();
+    emit_quantize();
     float_emit.REV32(8, D0, D0);
     float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)PowerPC::Write_U32);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U32);
+    BR(X2);
   }
-  const u8* storeSingleFloat = GetCodePtr();
+  const u8* storeSingleFloat;
+  const u8* storeSingleFloatSlow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    storeSingleFloat = GetCodePtr();
     float_emit.REV32(8, D0, D0);
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.STR(32, INDEX_UNSIGNED, D0, addr_reg, 0);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storeSingleFloatSlow = GetCodePtr();
     float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U32);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)&PowerPC::Write_U32);
+    BR(X2);
   }
-  const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
+  const u8* storeSingleU8; // Used by MKWii
+  const u8* storeSingleU8Slow;
+  {
+    auto emit_quantize = [this, &float_emit, scale_reg]()
     {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
     MOVI2R(X2, (u64)&m_quantizeTableS);
     ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
     float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -448,28 +414,25 @@ void JitArm64AsmRoutineManager::GenerateCommon()
     float_emit.FCVTZU(32, D0, D0);
     float_emit.XTN(16, D0, D0);
     float_emit.XTN(8, D0, D0);
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    };
+    storeSingleU8 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(8, Q0, 0, addr_reg);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
-    float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U8);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    storeSingleU8Slow = GetCodePtr();
+    emit_quantize();
+    float_emit.UMOV(8, W0, Q0, 0);
+    MOVI2R(X2, (u64)&PowerPC::Write_U8);
+    BR(X2);
   }
-  const u8* storeSingleS8 = GetCodePtr();
+  const u8* storeSingleS8;
+  const u8* storeSingleS8Slow;
+  {
+    auto emit_quantize = [this, &float_emit, scale_reg]()
     {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
     MOVI2R(X2, (u64)&m_quantizeTableS);
     ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
     float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -477,85 +440,77 @@ void JitArm64AsmRoutineManager::GenerateCommon()
     float_emit.FCVTZS(32, D0, D0);
     float_emit.XTN(16, D0, D0);
     float_emit.XTN(8, D0, D0);
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    };
+    storeSingleS8 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(8, Q0, 0, addr_reg);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
-    float_emit.SMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U8);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    storeSingleS8Slow = GetCodePtr();
+    emit_quantize();
+    float_emit.SMOV(8, W0, Q0, 0);
+    MOVI2R(X2, (u64)&PowerPC::Write_U8);
+    BR(X2);
   }
-  const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
+  const u8* storeSingleU16; // Used by MKWii
+  const u8* storeSingleU16Slow;
+  {
+    auto emit_quantize = [this, &float_emit, scale_reg]()
     {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
     MOVI2R(X2, (u64)&m_quantizeTableS);
     ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
     float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
     float_emit.FMUL(32, D0, D0, D1);
     float_emit.FCVTZU(32, D0, D0);
     float_emit.XTN(16, D0, D0);
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    };
+    storeSingleU16 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.REV16(8, D0, D0);
     float_emit.ST1(16, Q0, 0, addr_reg);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
-    float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U16);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    storeSingleU16Slow = GetCodePtr();
+    emit_quantize();
+    float_emit.UMOV(16, W0, Q0, 0);
+    MOVI2R(X2, (u64)&PowerPC::Write_U16);
+    BR(X2);
   }
-  const u8* storeSingleS16 = GetCodePtr();
+  const u8* storeSingleS16;
+  const u8* storeSingleS16Slow;
+  {
+    auto emit_quantize = [this, &float_emit, scale_reg]()
     {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
     MOVI2R(X2, (u64)&m_quantizeTableS);
     ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
     float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
     float_emit.FMUL(32, D0, D0, D1);
     float_emit.FCVTZS(32, D0, D0);
     float_emit.XTN(16, D0, D0);
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    };
+    storeSingleS16 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.REV16(8, D0, D0);
     float_emit.ST1(16, Q0, 0, addr_reg);
     RET(X30);
-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
-    float_emit.SMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U16);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    storeSingleS16Slow = GetCodePtr();
+    emit_quantize();
+    float_emit.SMOV(16, W0, Q0, 0);
+    MOVI2R(X2, (u64)&PowerPC::Write_U16);
+    BR(X2);
   }
   pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
-  ReserveCodeSpace(16 * sizeof(u8*));
+  ReserveCodeSpace(32 * sizeof(u8*));
+  // Fast
   pairedStoreQuantized[0] = storePairedFloat;
   pairedStoreQuantized[1] = storePairedIllegal;
   pairedStoreQuantized[2] = storePairedIllegal;
@@ -573,4 +528,24 @@ void JitArm64AsmRoutineManager::GenerateCommon()
   pairedStoreQuantized[13] = storeSingleU16;
   pairedStoreQuantized[14] = storeSingleS8;
   pairedStoreQuantized[15] = storeSingleS16;
+  // Slow
+  pairedStoreQuantized[16] = storePairedFloatSlow;
+  pairedStoreQuantized[17] = storePairedIllegal;
+  pairedStoreQuantized[18] = storePairedIllegal;
+  pairedStoreQuantized[19] = storePairedIllegal;
+  pairedStoreQuantized[20] = storePairedU8Slow;
+  pairedStoreQuantized[21] = storePairedU16Slow;
+  pairedStoreQuantized[22] = storePairedS8Slow;
+  pairedStoreQuantized[23] = storePairedS16Slow;
+  pairedStoreQuantized[24] = storeSingleFloatSlow;
+  pairedStoreQuantized[25] = storePairedIllegal;
+  pairedStoreQuantized[26] = storePairedIllegal;
+  pairedStoreQuantized[27] = storePairedIllegal;
+  pairedStoreQuantized[28] = storeSingleU8Slow;
+  pairedStoreQuantized[29] = storeSingleU16Slow;
+  pairedStoreQuantized[30] = storeSingleS8Slow;
+  pairedStoreQuantized[31] = storeSingleS16Slow;
 }
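
The table doubles from 16 to 32 entries: the fast fastmem routines stay in slots 0-15 and end in RET, while their slowmem counterparts sit exactly 16 slots higher and tail-call PowerPC::Write_* with BR, leaving register saving to the psq_st call site. A hedged summary of the indexing (the helper below is illustrative, not part of the commit; indices follow the assignments above):

#include <cstdint>

// 0-7 paired stores, 8-15 single stores, +16 for the slowmem mirror;
// the GQR store type occupies the low three bits of the index.
inline uint32_t PairedStoreIndex(bool slowmem, bool single_store, uint32_t gqr_type /* 0-7 */)
{
    return (slowmem ? 16u : 0u) + (single_store ? 8u : 0u) + gqr_type;
}
// e.g. PairedStoreIndex(false, false, 4) == 4   -> storePairedU8
//      PairedStoreIndex(true,  true,  7) == 31  -> storeSingleS16Slow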