From 4262d2199a9ba632de2ad0dde300824a22b46b85 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 18 Jan 2015 16:28:22 -0600
Subject: [PATCH] [AArch64] Implements paired loadstores.

---
 Source/Core/Core/CMakeLists.txt               |   1 +
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |   4 +
 .../JitArm64/JitArm64_LoadStorePaired.cpp     | 130 ++++
 .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp |   8 +-
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp  | 420 ++++++++++++++++++
 5 files changed, 559 insertions(+), 4 deletions(-)
 create mode 100644 Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp

diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt
index 4d2fa68d7b..788269a729 100644
--- a/Source/Core/Core/CMakeLists.txt
+++ b/Source/Core/Core/CMakeLists.txt
@@ -230,6 +230,7 @@ elseif(_M_ARM_64)
 	PowerPC/JitArm64/JitArm64_LoadStore.cpp
 	PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
 	PowerPC/JitArm64/JitArm64_Paired.cpp
+	PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
 	PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
 	PowerPC/JitArm64/JitArm64_Tables.cpp)
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 94e9e945eb..87e09ce176 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -168,6 +168,10 @@ public:
 	void ps_sum0(UGeckoInstruction inst);
 	void ps_sum1(UGeckoInstruction inst);
 
+	// Loadstore paired
+	void psq_l(UGeckoInstruction inst);
+	void psq_st(UGeckoInstruction inst);
+
 private:
 	Arm64GPRCache gpr;
 	Arm64FPRCache fpr;
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
new file mode 100644
index 0000000000..4c6b565d5d
--- /dev/null
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@@ -0,0 +1,130 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "Common/Arm64Emitter.h"
+#include "Common/Common.h"
+#include "Common/StringUtil.h"
+
+#include "Core/Core.h"
+#include "Core/CoreTiming.h"
+#include "Core/PowerPC/PowerPC.h"
+#include "Core/PowerPC/PPCTables.h"
+#include "Core/PowerPC/JitArm64/Jit.h"
+#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
+#include "Core/PowerPC/JitArm64/JitAsm.h"
+
+using namespace Arm64Gen;
+
+void JitArm64::psq_l(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITLoadStorePairedOff);
+	FALLBACK_IF(js.memcheck || !SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem);
+
+	// X30 is LR
+	// X0 contains the scale
+	// X1 is the address
+	// X2 is a temporary
+	// Q0 is the return register
+	// Q1 is a temporary
+	bool update = inst.OPCD == 57;
+	s32 offset = inst.SIMM_12;
+
+	gpr.Lock(W0, W1, W2, W30);
+	fpr.Lock(Q0, Q1);
+
+	ARM64Reg arm_addr = gpr.R(inst.RA);
+	ARM64Reg scale_reg = W0;
+	ARM64Reg addr_reg = W1;
+	ARM64Reg type_reg = W2;
+
+	LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+
+	if (inst.RA || update) // Always uses the register on update
+	{
+		if (offset >= 0)
+			ADD(addr_reg, gpr.R(inst.RA), offset);
+		else
+			SUB(addr_reg, gpr.R(inst.RA), std::abs(offset));
+	}
+	else
+	{
+		MOVI2R(addr_reg, (u32)offset);
+	}
+
+	UBFM(type_reg, scale_reg, 16, 18); // Type
+	UBFM(scale_reg, scale_reg, 24, 29); // Scale
+
+	if (update)
+		MOV(arm_addr, addr_reg);
+
+	MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
+	LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
+	BLR(X30);
+
+	fpr.BindToRegister(inst.RS, false);
+	ARM64Reg VS = fpr.R(inst.RS);
+	m_float_emit.FCVTL(64, EncodeRegToDouble(VS), D0);
+	if (inst.W)
+	{
+		m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double
+		m_float_emit.INS(64, VS, 1, Q0, 0);
+	}
+
+	gpr.Unlock(W0, W1, W2, W30);
+	fpr.Unlock(Q0, Q1);
+}
+
+void JitArm64::psq_st(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITLoadStorePairedOff);
+	FALLBACK_IF(js.memcheck || !SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem);
+
+	// X30 is LR
+	// X0 contains the scale
+	// X1 is the address
+	// Q0 is the store register
+
+	bool update = inst.OPCD == 61;
+	s32 offset = inst.SIMM_12;
+
+	gpr.Lock(W0, W1, W2, W30);
+	fpr.Lock(Q0, Q1);
+
+	ARM64Reg arm_addr = gpr.R(inst.RA);
+	ARM64Reg scale_reg = W0;
+	ARM64Reg addr_reg = W1;
+	ARM64Reg type_reg = gpr.GetReg();
+
+	LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+
+	if (inst.RA || update) // Always uses the register on update
+	{
+		if (offset >= 0)
+			ADD(addr_reg, gpr.R(inst.RA), offset);
+		else
+			SUB(addr_reg, gpr.R(inst.RA), std::abs(offset));
+	}
+	else
+	{
+		MOVI2R(addr_reg, (u32)offset);
+	}
+
+	UBFM(type_reg, scale_reg, 0, 2); // Type
+	UBFM(scale_reg, scale_reg, 8, 13); // Scale
+
+	if (update)
+		MOV(arm_addr, addr_reg);
+
+	ARM64Reg VS = fpr.R(inst.RS);
+	m_float_emit.FCVTN(32, D0, VS);
+	MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
+	LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
+	BLR(X30);
+
+	gpr.Unlock(W0, W1, W2, W30, type_reg);
+	fpr.Unlock(Q0, Q1);
+}
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
index f1087a27c8..b5997e9f7f 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
@@ -94,10 +94,10 @@ static GekkoOPTemplate primarytable[] =
 	{54, &JitArm64::stfXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
 	{55, &JitArm64::stfXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
 
-	{56, &JitArm64::FallBackToInterpreter}, //"psq_l", OPTYPE_PS, FL_IN_A}},
-	{57, &JitArm64::FallBackToInterpreter}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
-	{60, &JitArm64::FallBackToInterpreter}, //"psq_st", OPTYPE_PS, FL_IN_A}},
-	{61, &JitArm64::FallBackToInterpreter}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
+	{56, &JitArm64::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}},
+	{57, &JitArm64::psq_l}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
+	{60, &JitArm64::psq_st}, //"psq_st", OPTYPE_PS, FL_IN_A}},
+	{61, &JitArm64::psq_st}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
 
 	//missing: 0, 5, 6, 9, 22, 30, 62, 58
 	{0, &JitArm64::FallBackToInterpreter}, //"unknown_instruction", OPTYPE_UNKNOWN, 0}},
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 600de5a703..029bca364a 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -7,6 +7,7 @@
 #include "Core/PowerPC/PowerPC.h"
 #include "Core/PowerPC/JitArm64/Jit.h"
 #include "Core/PowerPC/JitArm64/JitAsm.h"
+#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
 #include "Core/PowerPC/JitCommon/JitCache.h"
 
 using namespace Arm64Gen;
@@ -89,9 +90,428 @@ void JitArm64AsmRoutineManager::Generate()
 	ABI_PopRegisters(regs_to_save);
 	RET(X30);
 
+	GenerateCommon();
+
 	FlushIcache();
 }
 
 void JitArm64AsmRoutineManager::GenerateCommon()
 {
+	// X0 is the scale
+	// X1 is address
+	// X2 is a temporary on stores
+	// X30 is LR
+	// Q0 is the return for loads
+	//    is the register for stores
+	// Q1 is a temporary
+	ARM64Reg addr_reg = X1;
+	ARM64Reg scale_reg = X0;
+	ARM64FloatEmitter float_emit(this);
+	const u32 GPR_CALLER_SAVE = 0x6007FFFF;
+
+	const u8* loadPairedIllegal = GetCodePtr();
+	BRK(100);
+	const u8* loadPairedFloatTwo = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LD1(32, 1, D0, addr_reg);
+		float_emit.REV32(8, D0, D0);
+		RET(X30);
+	}
+	const u8* loadPairedU8Two = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LDR(16, INDEX_UNSIGNED, D0, addr_reg, 0);
+		float_emit.UXTL(8, D0, D0);
+		float_emit.UXTL(16, D0, D0);
+		float_emit.UCVTF(32, D0, D0);
+
+		MOVI2R(addr_reg, (u64)&m_dequantizeTableS);
+		ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		RET(X30);
+	}
+	const u8* loadPairedS8Two = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LDR(16, INDEX_UNSIGNED, D0, addr_reg, 0);
+		float_emit.SXTL(8, D0, D0);
+		float_emit.SXTL(16, D0, D0);
+		float_emit.SCVTF(32, D0, D0);
+
+		MOVI2R(addr_reg, (u64)&m_dequantizeTableS);
+		ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		RET(X30);
+	}
+	const u8* loadPairedU16Two = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LD1(16, 1, D0, addr_reg);
+		float_emit.REV16(8, D0, D0);
+		float_emit.UXTL(16, D0, D0);
+		float_emit.UCVTF(32, D0, D0);
+
+		MOVI2R(addr_reg, (u64)&m_dequantizeTableS);
+		ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		RET(X30);
+	}
+	const u8* loadPairedS16Two = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LD1(16, 1, D0, addr_reg);
+		float_emit.REV16(8, D0, D0);
+		float_emit.SXTL(16, D0, D0);
+		float_emit.SCVTF(32, D0, D0);
+
+		MOVI2R(addr_reg, (u64)&m_dequantizeTableS);
+		ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		RET(X30);
+	}
+
+	const u8* loadPairedFloatOne = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LDR(32, INDEX_UNSIGNED, D0, addr_reg, 0);
+		float_emit.REV32(8, D0, D0);
+		RET(X30);
+	}
+	const u8* loadPairedU8One = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LDR(8, INDEX_UNSIGNED, D0, addr_reg, 0);
+		float_emit.UXTL(8, D0, D0);
+		float_emit.UXTL(16, D0, D0);
+		float_emit.UCVTF(32, D0, D0);
+
+		MOVI2R(addr_reg, (u64)&m_dequantizeTableS);
+		ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		RET(X30);
+	}
+	const u8* loadPairedS8One = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LDR(8, INDEX_UNSIGNED, D0, addr_reg, 0);
+		float_emit.SXTL(8, D0, D0);
+		float_emit.SXTL(16, D0, D0);
+		float_emit.SCVTF(32, D0, D0);
+
+		MOVI2R(addr_reg, (u64)&m_dequantizeTableS);
+		ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		RET(X30);
+	}
+	const u8* loadPairedU16One = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LDR(16, INDEX_UNSIGNED, D0, addr_reg, 0);
+		float_emit.REV16(8, D0, D0);
+		float_emit.UXTL(16, D0, D0);
+		float_emit.UCVTF(32, D0, D0);
+
+		MOVI2R(addr_reg, (u64)&m_dequantizeTableS);
+		ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		RET(X30);
+	}
+	const u8* loadPairedS16One = GetCodePtr();
+	{
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.LDR(16, INDEX_UNSIGNED, D0, addr_reg, 0);
+		float_emit.REV16(8, D0, D0);
+		float_emit.SXTL(16, D0, D0);
+		float_emit.SCVTF(32, D0, D0);
+
+		MOVI2R(addr_reg, (u64)&m_dequantizeTableS);
+		ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		RET(X30);
+	}
+
+	pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
+	ReserveCodeSpace(16 * sizeof(u8*));
+
+	pairedLoadQuantized[0] = loadPairedFloatTwo;
+	pairedLoadQuantized[1] = loadPairedIllegal;
+	pairedLoadQuantized[2] = loadPairedIllegal;
+	pairedLoadQuantized[3] = loadPairedIllegal;
+	pairedLoadQuantized[4] = loadPairedU8Two;
+	pairedLoadQuantized[5] = loadPairedU16Two;
+	pairedLoadQuantized[6] = loadPairedS8Two;
+	pairedLoadQuantized[7] = loadPairedS16Two;
+
+	pairedLoadQuantized[8] = loadPairedFloatOne;
+	pairedLoadQuantized[9] = loadPairedIllegal;
+	pairedLoadQuantized[10] = loadPairedIllegal;
+	pairedLoadQuantized[11] = loadPairedIllegal;
+	pairedLoadQuantized[12] = loadPairedU8One;
+	pairedLoadQuantized[13] = loadPairedU16One;
+	pairedLoadQuantized[14] = loadPairedS8One;
+	pairedLoadQuantized[15] = loadPairedS16One;
+
+	// Stores
+	const u8* storePairedIllegal = GetCodePtr();
+	BRK(0x101);
+	const u8* storePairedFloat = GetCodePtr();
+	{
+		BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
+		BitSet32 fprs(~3); // All except Q0/Q1
+
+		TST(DecodeReg(addr_reg), 6, 1);
+		FixupBranch argh = B(CC_NEQ);
+
+		float_emit.REV32(8, D0, D0);
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.ST1(32, Q0, 0, addr_reg, SP);
+		float_emit.ST1(32, Q0, 1, addr_reg, SP);
+		RET(X30);
+
+		SetJumpTarget(argh);
+
+		ABI_PushRegisters(gprs);
+		float_emit.ABI_PushRegisters(fprs);
+		float_emit.UMOV(64, X0, Q0, 0);
+		ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
+		MOVI2R(X30, (u64)Memory::Write_U64);
+		BLR(X30);
+		float_emit.ABI_PopRegisters(fprs);
+		ABI_PopRegisters(gprs);
+		RET(X30);
+	}
+	const u8* storePairedU8 = GetCodePtr();
+	const u8* storePairedS8 = GetCodePtr();
+	{
+		BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
+		BitSet32 fprs(~3); // All except Q0/Q1
+
+		MOVI2R(X2, (u64)&m_quantizeTableS);
+		ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		float_emit.FCVTZU(32, D0, D0);
+		float_emit.XTN(16, D0, D0);
+		float_emit.XTN(8, D0, D0);
+
+		TST(DecodeReg(addr_reg), 6, 1);
+		FixupBranch argh = B(CC_NEQ);
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.ST1(8, Q0, 0, addr_reg, SP);
+		float_emit.ST1(8, Q0, 1, addr_reg, SP);
+		RET(X30);
+
+		SetJumpTarget(argh);
+		ABI_PushRegisters(gprs);
+		float_emit.ABI_PushRegisters(fprs);
+		float_emit.UMOV(16, W0, Q0, 0);
+		REV16(W0, W0);
+		MOVI2R(X30, (u64)Memory::Write_U16);
+		BLR(X30);
+		float_emit.ABI_PopRegisters(fprs);
+		ABI_PopRegisters(gprs);
+		RET(X30);
+	}
+
+	const u8* storePairedU16 = GetCodePtr();
+	const u8* storePairedS16 = GetCodePtr();
+	{
+		BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
+		BitSet32 fprs(~3); // All except Q0/Q1
+
+		MOVI2R(X2, (u64)&m_quantizeTableS);
+		ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LD1R(32, D1, scale_reg);
+		float_emit.FMUL(32, D0, D0, D1);
+		float_emit.FCVTZU(32, D0, D0);
+		float_emit.XTN(16, D0, D0);
+
+		TST(DecodeReg(addr_reg), 6, 1);
+		FixupBranch argh = B(CC_NEQ);
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.ST1(16, Q0, 0, addr_reg, SP);
+		float_emit.ST1(16, Q0, 1, addr_reg, SP);
+		RET(X30);
+
+		SetJumpTarget(argh);
+		ABI_PushRegisters(gprs);
+		float_emit.ABI_PushRegisters(fprs);
+		float_emit.UMOV(32, W0, Q0, 0);
+		REV32(W0, W0);
+		MOVI2R(X30, (u64)Memory::Write_U32);
+		BLR(X30);
+		float_emit.ABI_PopRegisters(fprs);
+		ABI_PopRegisters(gprs);
+		RET(X30);
+	}
+
+	const u8* storeSingleFloat = GetCodePtr();
+	{
+		BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
+		BitSet32 fprs(~3); // All except Q0/Q1
+
+		TST(DecodeReg(addr_reg), 6, 1);
+		FixupBranch argh = B(CC_NEQ);
+
+		float_emit.REV32(8, D0, D0);
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.STR(32, INDEX_UNSIGNED, D0, addr_reg, 0);
+		RET(X30);
+
+		SetJumpTarget(argh);
+
+		ABI_PushRegisters(gprs);
+		float_emit.ABI_PushRegisters(fprs);
+		float_emit.UMOV(32, W0, Q0, 0);
+		MOVI2R(X30, (u64)&Memory::Write_U32);
+		BLR(X30);
+		float_emit.ABI_PopRegisters(fprs);
+		ABI_PopRegisters(gprs);
+		RET(X30);
+	}
+	const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
+	{
+		BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
+		BitSet32 fprs(~3); // All except Q0/Q1
+
+		MOVI2R(X2, (u64)&m_quantizeTableS);
+		ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+		float_emit.FMUL(32, D0, D0, D1);
+		float_emit.FCVTZU(32, D0, D0);
+		float_emit.XTN(16, D0, D0);
+		float_emit.XTN(8, D0, D0);
+
+		TST(DecodeReg(addr_reg), 6, 1);
+		FixupBranch argh = B(CC_NEQ);
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.ST1(8, Q0, 0, addr_reg);
+		RET(X30);
+
+		SetJumpTarget(argh);
+		ABI_PushRegisters(gprs);
+		float_emit.ABI_PushRegisters(fprs);
+		float_emit.UMOV(32, W0, Q0, 0);
+		MOVI2R(X30, (u64)&Memory::Write_U8);
+		BLR(X30);
+		float_emit.ABI_PopRegisters(fprs);
+		ABI_PopRegisters(gprs);
+		RET(X30);
+	}
+	const u8* storeSingleS8 = GetCodePtr();
+	{
+		BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
+		BitSet32 fprs(~3); // All except Q0/Q1
+
+		MOVI2R(X2, (u64)&m_quantizeTableS);
+		ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+		float_emit.FMUL(32, D0, D0, D1);
+		float_emit.FCVTZS(32, D0, D0);
+		float_emit.XTN(16, D0, D0);
+		float_emit.XTN(8, D0, D0);
+
+		TST(DecodeReg(addr_reg), 6, 1);
+		FixupBranch argh = B(CC_NEQ);
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.ST1(8, Q0, 0, addr_reg);
+		RET(X30);
+
+		SetJumpTarget(argh);
+		ABI_PushRegisters(gprs);
+		float_emit.ABI_PushRegisters(fprs);
+		float_emit.SMOV(32, W0, Q0, 0);
+		MOVI2R(X30, (u64)&Memory::Write_U8);
+		BLR(X30);
+		float_emit.ABI_PopRegisters(fprs);
+		ABI_PopRegisters(gprs);
+		RET(X30);
+	}
+	const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
+	{
+		BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
+		BitSet32 fprs(~3); // All except Q0/Q1
+
+		MOVI2R(X2, (u64)&m_quantizeTableS);
+		ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+		float_emit.FMUL(32, D0, D0, D1);
+		float_emit.FCVTZU(32, D0, D0);
+		float_emit.XTN(16, D0, D0);
+
+		TST(DecodeReg(addr_reg), 6, 1);
+		FixupBranch argh = B(CC_NEQ);
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.ST1(16, Q0, 0, addr_reg);
+		RET(X30);
+
+		SetJumpTarget(argh);
+		ABI_PushRegisters(gprs);
+		float_emit.ABI_PushRegisters(fprs);
+		float_emit.UMOV(32, W0, Q0, 0);
+		MOVI2R(X30, (u64)&Memory::Write_U16);
+		BLR(X30);
+		float_emit.ABI_PopRegisters(fprs);
+		ABI_PopRegisters(gprs);
+		RET(X30);
+	}
+	const u8* storeSingleS16 = GetCodePtr();
+	{
+		BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
+		BitSet32 fprs(~3); // All except Q0/Q1
+
+		MOVI2R(X2, (u64)&m_quantizeTableS);
+		ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
+		float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
+		float_emit.FMUL(32, D0, D0, D1);
+		float_emit.FCVTZS(32, D0, D0);
+		float_emit.XTN(16, D0, D0);
+
+		TST(DecodeReg(addr_reg), 6, 1);
+		FixupBranch argh = B(CC_NEQ);
+		MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32);
+		float_emit.ST1(16, Q0, 0, addr_reg);
+		RET(X30);
+
+		SetJumpTarget(argh);
+		ABI_PushRegisters(gprs);
+		float_emit.ABI_PushRegisters(fprs);
+		float_emit.SMOV(32, W0, Q0, 0);
+
+		MOVI2R(X30, (u64)&Memory::Write_U16);
+		BLR(X30);
+		float_emit.ABI_PopRegisters(fprs);
+		ABI_PopRegisters(gprs);
+		RET(X30);
+	}
+
+	pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
+	ReserveCodeSpace(16 * sizeof(u8*));
+
+	pairedStoreQuantized[0] = storePairedFloat;
+	pairedStoreQuantized[1] = storePairedIllegal;
+	pairedStoreQuantized[2] = storePairedIllegal;
+	pairedStoreQuantized[3] = storePairedIllegal;
+	pairedStoreQuantized[4] = storePairedU8;
+	pairedStoreQuantized[5] = storePairedU16;
+	pairedStoreQuantized[6] = storePairedS8;
+	pairedStoreQuantized[7] = storePairedS16;
+
+	pairedStoreQuantized[8] = storeSingleFloat;
+	pairedStoreQuantized[9] = storePairedIllegal;
+	pairedStoreQuantized[10] = storePairedIllegal;
+	pairedStoreQuantized[11] = storePairedIllegal;
+	pairedStoreQuantized[12] = storeSingleU8;
+	pairedStoreQuantized[13] = storeSingleU16;
+	pairedStoreQuantized[14] = storeSingleS8;
+	pairedStoreQuantized[15] = storeSingleS16;
 }
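
Reviewer note: the GQR decode and table dispatch that psq_l/psq_st emit can be summarized in scalar C++. This is a minimal sketch for reference only, not code from the patch; the value of gqr, the variable W, and all names in the sketch are invented for illustration. The bit positions mirror the UBFM calls above, and the left-shift by 3 mirrors the 8-byte stride applied to the scale index before the table lookup.

	// Illustrative scalar equivalent of the field extraction and the
	// pairedLoadQuantized/pairedStoreQuantized index computed by the JIT.
	#include <cstdint>
	#include <cstdio>

	int main()
	{
		uint32_t gqr = 0x00070007; // hypothetical GQR value for illustration
		int W = 0;                 // inst.W: 0 = paired, 1 = single value

		// psq_l: UBFM(type, gqr, 16, 18) and UBFM(scale, gqr, 24, 29)
		uint32_t ld_type  = (gqr >> 16) & 0x7;
		uint32_t ld_scale = (gqr >> 24) & 0x3F;

		// psq_st: UBFM(type, gqr, 0, 2) and UBFM(scale, gqr, 8, 13)
		uint32_t st_type  = gqr & 0x7;
		uint32_t st_scale = (gqr >> 8) & 0x3F;

		// The emitted code starts at &table[inst.W * 8] and indexes by type,
		// so the effective slot in the 16-entry table is:
		uint32_t load_slot  = W * 8 + ld_type;
		uint32_t store_slot = W * 8 + st_type;

		// The scale index is shifted left by 3 (8-byte stride) before the
		// LD1R against m_dequantizeTableS/m_quantizeTableS.
		printf("load slot %u (scale byte offset %u), store slot %u (scale byte offset %u)\n",
		       load_slot, ld_scale << 3, store_slot, st_scale << 3);
		return 0;
	}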
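The repeated MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32) in the backend routines is the fastmem addressing trick: the guest effective address sits in the low 32 bits of addr_reg, and MOVK overwrites only bits 32-47 with the upper half of Memory::base, yielding a host pointer in a single instruction. A hedged sketch of that arithmetic, assuming the base pointer's low 32 bits are zero (fake_base and HostPointer are invented names, not Dolphin API):

	#include <cstdint>

	static uint8_t* fake_base; // stand-in for Memory::base; assumed 4 GiB-aligned

	inline uint8_t* HostPointer(uint32_t guest_addr)
	{
		uint64_t host = guest_addr; // low 32 bits hold the guest effective address
		// Equivalent of MOVK(..., SHIFT_32): replace bits 32-47 with the same
		// bits of the base pointer, leaving all other bits untouched.
		host &= ~0x0000FFFF00000000ULL;
		host |= ((((uint64_t)(uintptr_t)fake_base) >> 32) & 0xFFFF) << 32;
		return (uint8_t*)host;
	}

This is why the routines can dereference addr_reg directly on the fast path, while the TST/B(CC_NEQ) pair diverts addresses that fail the mask test to the Memory::Write_* slow path.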