From f6511c3ba56cf6136e878265f8eef7bdc9f910b9 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 8 Mar 2015 12:29:45 -0500
Subject: [PATCH 1/2] [AArch64] Add an assert to SMOV in the emitter.

SMOV doesn't have an encoding for moving a 32bit element to a 32bit GPR.
One should use UMOV if they want that.
---
 Source/Core/Common/Arm64Emitter.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index 69a680f7e3..e0f4c4b432 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -2618,6 +2618,7 @@ void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
   bool b64Bit = Is64Bit(Rd);
   _assert_msg_(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __FUNCTION__);
   _assert_msg_(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __FUNCTION__);
+  _assert_msg_(DYNA_REC, b64Bit || size != 32, "%s doesn't support a 32bit move to a 32bit register. Use UMOV!", __FUNCTION__);
   u32 imm5 = 0;

   if (size == 8)

From 7f50cc0873a0caf684360ba82c02f77de330eec6 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 8 Mar 2015 12:30:41 -0500
Subject: [PATCH 2/2] [AArch64] Optimize slowmem paired stores.

This came up from the discussion we were having earlier about how dumping
half a kilobyte of VFP registers to the stack is insanity. That was due to me
basically copying what I did for ARMv7's paired loadstores, where the impact
is smaller since we only use the bottom 64 bits of the VFP registers.

So I decided to think about how to improve upon this, since I got called out
on my terrible code.

The solution I have come up with: instead of jumping to the common ASM routine
and having it check whether it needs to take the fastmem or slowmem route,
inline that check into the JIT block and jump to either a fastmem or a slowmem
handler. Fairly simple, and it allows us to flush only the registers that are
actually required.

This should give a reasonable increase in performance for games that hit the
slowmem path a lot.
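Note for reviewers: per psq_st, the emitted control flow now looks roughly like
the plain C++ sketch below. FastStore/SlowStore and the Spill/Restore stubs are
illustrative placeholders for the pairedStoreQuantized handlers and
ABI_PushRegisters/ABI_PopRegisters; they are not real functions in the tree.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the real handlers in
// asm_routines.pairedStoreQuantized: entries [0..15] are the fastmem
// versions, entries [16..31] the slowmem versions added by this patch.
static void FastStore(uint32_t addr) { std::printf("fastmem store @ %08x\n", addr); }
static void SlowStore(uint32_t addr) { std::printf("slowmem store @ %08x\n", addr); }

// These model ABI_PushRegisters/ABI_PopRegisters on the caller-saved
// registers that are actually live in the block.
static void SpillLiveCallerSaved() {}
static void RestoreLiveCallerSaved() {}

// Rough shape of the code each psq_st block now emits inline, instead of
// jumping to a shared routine that performs this check itself.
void EmittedPairedStore(uint32_t addr, bool addr_is_fastmem)
{
  if (addr_is_fastmem)
  {
    // Fast path: nothing is spilled, the handler writes through Memory::logical_base.
    FastStore(addr);
  }
  else
  {
    // Slow path: spill only the live caller-saved registers around the
    // call into PowerPC::Write_UXX, then restore them.
    SpillLiveCallerSaved();
    SlowStore(addr);
    RestoreLiveCallerSaved();
  }
}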
---
 .../JitArm64/JitArm64_LoadStorePaired.cpp    |  43 +-
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 387 ++++++++----------
 2 files changed, 218 insertions(+), 212 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
index 4c6b565d5d..d0e033d371 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@@ -94,9 +94,18 @@ void JitArm64::psq_st(UGeckoInstruction inst)
   fpr.Lock(Q0, Q1);

   ARM64Reg arm_addr = gpr.R(inst.RA);
+  ARM64Reg VS = fpr.R(inst.RS);
+
   ARM64Reg scale_reg = W0;
   ARM64Reg addr_reg = W1;
-  ARM64Reg type_reg = gpr.GetReg();
+  ARM64Reg type_reg = W2;
+
+  BitSet32 gprs_in_use = gpr.GetCallerSavedUsed();
+  BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
+
+  // Wipe the registers we are using as temporaries
+  gprs_in_use &= BitSet32(~0x40000007);
+  fprs_in_use &= BitSet32(~3);

   LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));

@@ -118,13 +127,35 @@ void JitArm64::psq_st(UGeckoInstruction inst)
   if (update)
     MOV(arm_addr, addr_reg);

-  ARM64Reg VS = fpr.R(inst.RS);
   m_float_emit.FCVTN(32, D0, VS);
-  MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
-  LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
-  BLR(X30);
-
-  gpr.Unlock(W0, W1, W2, W30, type_reg);
+  // Inline address check
+  {
+    TST(addr_reg, 6, 1);
+    FixupBranch argh = B(CC_NEQ);
+
+    // Fast
+    MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
+    LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
+    BLR(EncodeRegTo64(type_reg));
+
+    FixupBranch continue1 = B();
+    SetJumpTarget(argh);
+
+    // Slow
+    MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
+    LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
+
+    ABI_PushRegisters(gprs_in_use);
+    m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
+    BLR(EncodeRegTo64(type_reg));
+    m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
+    ABI_PopRegisters(gprs_in_use);
+
+    SetJumpTarget(continue1);
+  }
+
+  gpr.Unlock(W0, W1, W2, W30);
   fpr.Unlock(Q0, Q1);
 }

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index e82367873b..053d7ebf05 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -107,7 +107,6 @@ void JitArm64AsmRoutineManager::GenerateCommon()
   ARM64Reg addr_reg = X1;
   ARM64Reg scale_reg = X0;
   ARM64FloatEmitter float_emit(this);
-  const u32 GPR_CALLER_SAVE = 0x6007FFFF;

   const u8* loadPairedIllegal = GetCodePtr();
   BRK(100);
@@ -263,299 +262,255 @@ void JitArm64AsmRoutineManager::GenerateCommon()

   // Stores
   const u8* storePairedIllegal = GetCodePtr();
   BRK(0x101);
-  const u8* storePairedFloat = GetCodePtr();
+  const u8* storePairedFloat;
+  const u8* storePairedFloatSlow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
-
+    storePairedFloat = GetCodePtr();
     float_emit.REV32(8, D0, D0);
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(64, Q0, 0, addr_reg, SP);
     RET(X30);

-    SetJumpTarget(argh);
-
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedFloatSlow = GetCodePtr();
     float_emit.UMOV(64, X0, Q0, 0);
     ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
-    MOVI2R(X30, (u64)PowerPC::Write_U64);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U64);
+    BR(X2);
   }
-  const u8* storePairedU8 = GetCodePtr();
+
+  const u8* storePairedU8;
+  const u8* storePairedU8Slow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
-
-    MOVI2R(X2, (u64)&m_quantizeTableS);
-    ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
-    float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
-    float_emit.FMUL(32, D0, D0, D1, 0);
-    float_emit.FCVTZU(32, D0, D0);
-    float_emit.XTN(16, D0, D0);
-    float_emit.XTN(8, D0, D0);
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    auto emit_quantize = [this, &float_emit, scale_reg]()
+    {
+      MOVI2R(X2, (u64)&m_quantizeTableS);
+      ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+      float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+      float_emit.FMUL(32, D0, D0, D1, 0);
+      float_emit.FCVTZU(32, D0, D0);
+      float_emit.XTN(16, D0, D0);
+      float_emit.XTN(8, D0, D0);
+    };
+    storePairedU8 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(16, Q0, 0, addr_reg, SP);
     RET(X30);

-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedU8Slow = GetCodePtr();
+    emit_quantize();
     float_emit.UMOV(16, W0, Q0, 0);
     REV16(W0, W0);
-    MOVI2R(X30, (u64)PowerPC::Write_U16);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U16);
+    BR(X2);
   }
-  const u8* storePairedS8 = GetCodePtr();
+  const u8* storePairedS8;
+  const u8* storePairedS8Slow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
-
-    MOVI2R(X2, (u64)&m_quantizeTableS);
-    ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
-    float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
-    float_emit.FMUL(32, D0, D0, D1, 0);
-    float_emit.FCVTZS(32, D0, D0);
-    float_emit.XTN(16, D0, D0);
-    float_emit.XTN(8, D0, D0);
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    auto emit_quantize = [this, &float_emit, scale_reg]()
+    {
+      MOVI2R(X2, (u64)&m_quantizeTableS);
+      ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+      float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+      float_emit.FMUL(32, D0, D0, D1, 0);
+      float_emit.FCVTZS(32, D0, D0);
+      float_emit.XTN(16, D0, D0);
+      float_emit.XTN(8, D0, D0);
+    };
+    storePairedS8 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(16, Q0, 0, addr_reg, SP);
     RET(X30);

-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedS8Slow = GetCodePtr();
+    emit_quantize();
     float_emit.UMOV(16, W0, Q0, 0);
     REV16(W0, W0);
-    MOVI2R(X30, (u64)PowerPC::Write_U16);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U16);
+    BR(X2);
   }
-  const u8* storePairedU16 = GetCodePtr();
+  const u8* storePairedU16;
+  const u8* storePairedU16Slow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
+    auto emit_quantize = [this, &float_emit, scale_reg]()
+    {
+      MOVI2R(X2, (u64)&m_quantizeTableS);
+      ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+      float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+      float_emit.FMUL(32, D0, D0, D1, 0);
+      float_emit.FCVTZU(32, D0, D0);
+      float_emit.XTN(16, D0, D0);
+      float_emit.REV16(8, D0, D0);
+    };

-    MOVI2R(X2, (u64)&m_quantizeTableS);
-    ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
-    float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
-    float_emit.FMUL(32, D0, D0, D1, 0);
-    float_emit.FCVTZU(32, D0, D0);
-    float_emit.XTN(16, D0, D0);
-    float_emit.REV16(8, D0, D0);
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    storePairedU16 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(32, Q0, 0, addr_reg, SP);
     RET(X30);

-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedU16Slow = GetCodePtr();
+    emit_quantize();
     float_emit.REV32(8, D0, D0);
     float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)PowerPC::Write_U32);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U32);
+    BR(X2);
   }
-  const u8* storePairedS16 = GetCodePtr(); // Used by Viewtiful Joe's intro movie
+  const u8* storePairedS16; // Used by Viewtiful Joe's intro movie
+  const u8* storePairedS16Slow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
+    auto emit_quantize = [this, &float_emit, scale_reg]()
+    {
+      MOVI2R(X2, (u64)&m_quantizeTableS);
+      ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+      float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+      float_emit.FMUL(32, D0, D0, D1, 0);
+      float_emit.FCVTZS(32, D0, D0);
+      float_emit.XTN(16, D0, D0);
+      float_emit.REV16(8, D0, D0);
+    };

-    MOVI2R(X2, (u64)&m_quantizeTableS);
-    ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
-    float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
-    float_emit.FMUL(32, D0, D0, D1, 0);
-    float_emit.FCVTZS(32, D0, D0);
-    float_emit.XTN(16, D0, D0);
-    float_emit.REV16(8, D0, D0);
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    storePairedS16 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(32, Q0, 0, addr_reg, SP);
     RET(X30);

-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storePairedS16Slow = GetCodePtr();
+    emit_quantize();
     float_emit.REV32(8, D0, D0);
     float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)PowerPC::Write_U32);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)PowerPC::Write_U32);
+    BR(X2);
   }
-  const u8* storeSingleFloat = GetCodePtr();
+  const u8* storeSingleFloat;
+  const u8* storeSingleFloatSlow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
-
+    storeSingleFloat = GetCodePtr();
     float_emit.REV32(8, D0, D0);
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.STR(32, INDEX_UNSIGNED, D0, addr_reg, 0);
     RET(X30);

-    SetJumpTarget(argh);
-
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
+    storeSingleFloatSlow = GetCodePtr();
     float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U32);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    MOVI2R(X2, (u64)&PowerPC::Write_U32);
+    BR(X2);
   }
-  const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
+  const u8* storeSingleU8; // Used by MKWii
+  const u8* storeSingleU8Slow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
+    auto emit_quantize = [this, &float_emit, scale_reg]()
+    {
+      MOVI2R(X2, (u64)&m_quantizeTableS);
+      ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+      float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+      float_emit.FMUL(32, D0, D0, D1);
+      float_emit.FCVTZU(32, D0, D0);
+      float_emit.XTN(16, D0, D0);
+      float_emit.XTN(8, D0, D0);
+    };

-    MOVI2R(X2, (u64)&m_quantizeTableS);
-    ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
-    float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
-    float_emit.FMUL(32, D0, D0, D1);
-    float_emit.FCVTZU(32, D0, D0);
-    float_emit.XTN(16, D0, D0);
-    float_emit.XTN(8, D0, D0);
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    storeSingleU8 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(8, Q0, 0, addr_reg);
     RET(X30);

-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
-    float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U8);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    storeSingleU8Slow = GetCodePtr();
+    emit_quantize();
+    float_emit.UMOV(8, W0, Q0, 0);
+    MOVI2R(X2, (u64)&PowerPC::Write_U8);
+    BR(X2);
   }
-  const u8* storeSingleS8 = GetCodePtr();
+  const u8* storeSingleS8;
+  const u8* storeSingleS8Slow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
+    auto emit_quantize = [this, &float_emit, scale_reg]()
+    {
+      MOVI2R(X2, (u64)&m_quantizeTableS);
+      ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+      float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+      float_emit.FMUL(32, D0, D0, D1);
+      float_emit.FCVTZS(32, D0, D0);
+      float_emit.XTN(16, D0, D0);
+      float_emit.XTN(8, D0, D0);
+    };

-    MOVI2R(X2, (u64)&m_quantizeTableS);
-    ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
-    float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
-    float_emit.FMUL(32, D0, D0, D1);
-    float_emit.FCVTZS(32, D0, D0);
-    float_emit.XTN(16, D0, D0);
-    float_emit.XTN(8, D0, D0);
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    storeSingleS8 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.ST1(8, Q0, 0, addr_reg);
     RET(X30);

-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
-    float_emit.SMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U8);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    storeSingleS8Slow = GetCodePtr();
+    emit_quantize();
+    float_emit.SMOV(8, W0, Q0, 0);
+    MOVI2R(X2, (u64)&PowerPC::Write_U8);
+    BR(X2);
   }
-  const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
+  const u8* storeSingleU16; // Used by MKWii
+  const u8* storeSingleU16Slow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
+    auto emit_quantize = [this, &float_emit, scale_reg]()
+    {
+      MOVI2R(X2, (u64)&m_quantizeTableS);
+      ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+      float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+      float_emit.FMUL(32, D0, D0, D1);
+      float_emit.FCVTZU(32, D0, D0);
+      float_emit.XTN(16, D0, D0);
+    };

-    MOVI2R(X2, (u64)&m_quantizeTableS);
-    ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
-    float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
-    float_emit.FMUL(32, D0, D0, D1);
-    float_emit.FCVTZU(32, D0, D0);
-    float_emit.XTN(16, D0, D0);
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    storeSingleU16 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.REV16(8, D0, D0);
     float_emit.ST1(16, Q0, 0, addr_reg);
     RET(X30);

-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
-    float_emit.UMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U16);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    storeSingleU16Slow = GetCodePtr();
+    emit_quantize();
+    float_emit.UMOV(16, W0, Q0, 0);
+    MOVI2R(X2, (u64)&PowerPC::Write_U16);
+    BR(X2);
   }
-  const u8* storeSingleS16 = GetCodePtr();
+  const u8* storeSingleS16;
+  const u8* storeSingleS16Slow;
   {
-    BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
-    BitSet32 fprs(~3); // All except Q0/Q1
+    auto emit_quantize = [this, &float_emit, scale_reg]()
+    {
+      MOVI2R(X2, (u64)&m_quantizeTableS);
+      ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+      float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+      float_emit.FMUL(32, D0, D0, D1);
+      float_emit.FCVTZS(32, D0, D0);
+      float_emit.XTN(16, D0, D0);
+    };

-    MOVI2R(X2, (u64)&m_quantizeTableS);
-    ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
-    float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
-    float_emit.FMUL(32, D0, D0, D1);
-    float_emit.FCVTZS(32, D0, D0);
-    float_emit.XTN(16, D0, D0);
-
-    TST(DecodeReg(addr_reg), 6, 1);
-    FixupBranch argh = B(CC_NEQ);
+    storeSingleS16 = GetCodePtr();
+    emit_quantize();
     MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
     float_emit.REV16(8, D0, D0);
     float_emit.ST1(16, Q0, 0, addr_reg);
     RET(X30);

-    SetJumpTarget(argh);
-    ABI_PushRegisters(gprs);
-    float_emit.ABI_PushRegisters(fprs, X3);
-    float_emit.SMOV(32, W0, Q0, 0);
-    MOVI2R(X30, (u64)&PowerPC::Write_U16);
-    BLR(X30);
-    float_emit.ABI_PopRegisters(fprs, X3);
-    ABI_PopRegisters(gprs);
-    RET(X30);
+    storeSingleS16Slow = GetCodePtr();
+    emit_quantize();
+    float_emit.SMOV(16, W0, Q0, 0);
+    MOVI2R(X2, (u64)&PowerPC::Write_U16);
+    BR(X2);
   }

   pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
-  ReserveCodeSpace(16 * sizeof(u8*));
+  ReserveCodeSpace(32 * sizeof(u8*));

+  // Fast
   pairedStoreQuantized[0] = storePairedFloat;
   pairedStoreQuantized[1] = storePairedIllegal;
   pairedStoreQuantized[2] = storePairedIllegal;
@@ -573,4 +528,24 @@ void JitArm64AsmRoutineManager::GenerateCommon()
   pairedStoreQuantized[13] = storeSingleU16;
   pairedStoreQuantized[14] = storeSingleS8;
   pairedStoreQuantized[15] = storeSingleS16;
+
+  // Slow
+  pairedStoreQuantized[16] = storePairedFloatSlow;
+  pairedStoreQuantized[17] = storePairedIllegal;
+  pairedStoreQuantized[18] = storePairedIllegal;
+  pairedStoreQuantized[19] = storePairedIllegal;
+  pairedStoreQuantized[20] = storePairedU8Slow;
+  pairedStoreQuantized[21] = storePairedU16Slow;
+  pairedStoreQuantized[22] = storePairedS8Slow;
+  pairedStoreQuantized[23] = storePairedS16Slow;
+
+  pairedStoreQuantized[24] = storeSingleFloatSlow;
+  pairedStoreQuantized[25] = storePairedIllegal;
+  pairedStoreQuantized[26] = storePairedIllegal;
+  pairedStoreQuantized[27] = storePairedIllegal;
+  pairedStoreQuantized[28] = storeSingleU8Slow;
+  pairedStoreQuantized[29] = storeSingleU16Slow;
+  pairedStoreQuantized[30] = storeSingleS8Slow;
+  pairedStoreQuantized[31] = storeSingleS16Slow;
+
 }
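For reference, pairedStoreQuantized now holds 32 entries: the existing fastmem
handlers at [0..15] (paired stores first, single stores at +8) and the new
slowmem handlers mirrored at +16, which is what the 16 + inst.W * 8 lookup in
psq_st indexes. A small sketch of that indexing follows; the StoreType values
below simply mirror the table slots used above and are written out here only
for illustration.

#include <cstddef>

// Assumed mapping of quantized-store types to table slots, matching the
// entries assigned above (1..3 are left as illegal entries).
enum StoreType : size_t
{
  TYPE_FLOAT = 0,
  TYPE_U8    = 4,
  TYPE_U16   = 5,
  TYPE_S8    = 6,
  TYPE_S16   = 7,
};

// Index of the fastmem handler: paired stores in [0..7], single stores in [8..15].
constexpr size_t FastIndex(bool single_store, StoreType type)
{
  return (single_store ? 8 : 0) + type;
}

// The slowmem handlers added by this patch mirror the fast ones at +16,
// matching the 16 + inst.W * 8 lookup emitted in psq_st.
constexpr size_t SlowIndex(bool single_store, StoreType type)
{
  return 16 + FastIndex(single_store, type);
}

static_assert(FastIndex(false, TYPE_FLOAT) == 0,  "paired float, fast");
static_assert(SlowIndex(false, TYPE_FLOAT) == 16, "paired float, slow");
static_assert(FastIndex(true,  TYPE_S16)   == 15, "single s16, fast");
static_assert(SlowIndex(true,  TYPE_S16)   == 31, "single s16, slow");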