From 44405e2ec2018e48a11083899d3f26206a2fa531 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 18 Jan 2015 16:20:26 -0600 Subject: [PATCH 1/3] Expose the JIT quantize and dequantize arrays to all the JITs. Removes the ARMv7 arrays that were being used, and lets it use the common one instead. --- Source/Core/Core/CMakeLists.txt | 2 +- .../JitArm32/JitArm_LoadStorePaired.cpp | 8 ++-- Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp | 40 ------------------- .../Core/PowerPC/JitCommon/JitAsmCommon.cpp | 4 +- .../Core/PowerPC/JitCommon/JitAsmCommon.h | 2 + 5 files changed, 9 insertions(+), 47 deletions(-) diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt index 7cbf4509f6..4d2fa68d7b 100644 --- a/Source/Core/Core/CMakeLists.txt +++ b/Source/Core/Core/CMakeLists.txt @@ -165,6 +165,7 @@ set(SRCS ActionReplay.cpp PowerPC/Interpreter/Interpreter_Paired.cpp PowerPC/Interpreter/Interpreter_SystemRegisters.cpp PowerPC/Interpreter/Interpreter_Tables.cpp + PowerPC/JitCommon/JitAsmCommon.cpp PowerPC/JitCommon/JitBase.cpp PowerPC/JitCommon/JitCache.cpp PowerPC/JitILCommon/IR.cpp @@ -195,7 +196,6 @@ if(_M_X86) PowerPC/Jit64/Jit_Paired.cpp PowerPC/Jit64/JitRegCache.cpp PowerPC/Jit64/Jit_SystemRegisters.cpp - PowerPC/JitCommon/JitAsmCommon.cpp PowerPC/JitCommon/JitBackpatch.cpp PowerPC/JitCommon/Jit_Util.cpp PowerPC/JitCommon/TrampolineCache.cpp) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp index e0e485903a..dd68e747a4 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp @@ -33,7 +33,7 @@ void JitArm::psq_l(UGeckoInstruction inst) UBFX(R12, R11, 16, 3); // Type LSL(R12, R12, 2); UBFX(R11, R11, 24, 6); // Scale - LSL(R11, R11, 2); + LSL(R11, R11, 3); Operand2 off; if (TryMakeOperand2(offset, off)) @@ -84,7 +84,7 @@ void JitArm::psq_lx(UGeckoInstruction inst) UBFX(R12, R11, 16, 3); // Type LSL(R12, R12, 2); UBFX(R11, R11, 24, 6); // Scale - LSL(R11, R11, 2); + LSL(R11, R11, 3); if (inst.RA || update) // Always uses the register on update { @@ -136,7 +136,7 @@ void JitArm::psq_st(UGeckoInstruction inst) UBFX(R12, R11, 0, 3); // Type LSL(R12, R12, 2); UBFX(R11, R11, 8, 6); // Scale - LSL(R11, R11, 2); + LSL(R11, R11, 3); Operand2 off; if (TryMakeOperand2(offset, off)) @@ -187,7 +187,7 @@ void JitArm::psq_stx(UGeckoInstruction inst) UBFX(R12, R11, 0, 3); // Type LSL(R12, R12, 2); UBFX(R11, R11, 8, 6); // Scale - LSL(R11, R11, 2); + LSL(R11, R11, 3); if (inst.RA || update) // Always uses the register on update { diff --git a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp index b0966d4f58..ff65bf6393 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp @@ -27,46 +27,6 @@ using namespace ArmGen; JitArmAsmRoutineManager asm_routines; -static const float GC_ALIGNED16(m_quantizeTableS[]) = -{ - (1 << 0), (1 << 1), (1 << 2), (1 << 3), - (1 << 4), (1 << 5), (1 << 6), (1 << 7), - (1 << 8), (1 << 9), (1 << 10), (1 << 11), - (1 << 12), (1 << 13), (1 << 14), (1 << 15), - (1 << 16), (1 << 17), (1 << 18), (1 << 19), - (1 << 20), (1 << 21), (1 << 22), (1 << 23), - (1 << 24), (1 << 25), (1 << 26), (1 << 27), - (1 << 28), (1 << 29), (1 << 30), (1 << 31), - 1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29), - 1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25), - 1.0 / (1 << 24), 1.0 / (1 << 
23), 1.0 / (1 << 22), 1.0 / (1 << 21), - 1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17), - 1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13), - 1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9), - 1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5), - 1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1), -}; - -static const float GC_ALIGNED16(m_dequantizeTableS[]) = -{ - 1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3), - 1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7), - 1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11), - 1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15), - 1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19), - 1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23), - 1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27), - 1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31), - (1ULL << 32), (1 << 31), (1 << 30), (1 << 29), - (1 << 28), (1 << 27), (1 << 26), (1 << 25), - (1 << 24), (1 << 23), (1 << 22), (1 << 21), - (1 << 20), (1 << 19), (1 << 18), (1 << 17), - (1 << 16), (1 << 15), (1 << 14), (1 << 13), - (1 << 12), (1 << 11), (1 << 10), (1 << 9), - (1 << 8), (1 << 7), (1 << 6), (1 << 5), - (1 << 4), (1 << 3), (1 << 2), (1 << 1), -}; - static void WriteDual32(u32 value1, u32 value2, u32 address) { Memory::Write_U32(value1, address); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp index c7b3995510..7514167c86 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp @@ -194,7 +194,7 @@ void CommonAsmRoutines::GenMfcr() const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 }; -static const float GC_ALIGNED16(m_quantizeTableS[]) = +const float GC_ALIGNED16(m_quantizeTableS[]) = { (1ULL << 0), (1ULL << 0), (1ULL << 1), (1ULL << 1), (1ULL << 2), (1ULL << 2), (1ULL << 3), (1ULL << 3), (1ULL << 4), (1ULL << 4), (1ULL << 5), (1ULL << 5), (1ULL << 6), (1ULL << 6), (1ULL << 7), (1ULL << 7), @@ -222,7 +222,7 @@ static const float GC_ALIGNED16(m_quantizeTableS[]) = 1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1), }; -static const float GC_ALIGNED16(m_dequantizeTableS[]) = +const float GC_ALIGNED16(m_dequantizeTableS[]) = { 1.0 / (1ULL << 0), 1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1), 1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3), 1.0 / (1ULL << 3), diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h index b41bc26875..c872865fa7 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h @@ -9,6 +9,8 @@ extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]); extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]); extern const float GC_ALIGNED16(m_one[]); +extern const float GC_ALIGNED16(m_quantizeTableS[]); +extern const float GC_ALIGNED16(m_dequantizeTableS[]); class CommonAsmRoutinesBase { From 8d5947efac07d5a1c2fb4d8e04c630f0b40f6cf5 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 18 Jan 2015 16:25:40 -0600 Subject: [PATCH 2/3] [AArch64] Emitter improvements. Adds a bunch of new instructions to the emitter. 
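
A rough usage sketch of a few of the new forms (illustrative only; the
register choices and surrounding setup are not taken from a real call
site):

    // Inside an ARM64XEmitter-derived routine:
    ARM64FloatEmitter float_emit(this);

    // Register-offset load. With ArithOption's index flag set, X2 is
    // scaled by the access size (LSL #2 for a 32-bit load).
    LDR(W0, X1, ArithOption(X2, true));

    // Load one 32-bit element into lane 0 of D0, convert it to float,
    // then move lane 0 back out to a GPR.
    float_emit.LD1(32, D0, 0, X1);
    float_emit.SCVTF(32, D0, D0);
    float_emit.UMOV(32, W0, D0, 0);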
--- Source/Core/Common/Arm64Emitter.cpp | 530 ++++++++++++++++++++++++++-- Source/Core/Common/Arm64Emitter.h | 85 ++++- 2 files changed, 570 insertions(+), 45 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index d5e082aa2a..39f62d21eb 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -250,10 +250,12 @@ void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2 void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + bool b64Bit = Is64Bit(Rd); + Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); Rm = DecodeReg(Rm); - Write32((flags << 29) | (ArithEnc[instenc] << 21) | \ + Write32((b64Bit << 31) | (flags << 29) | (ArithEnc[instenc] << 21) | \ (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? 1 << 21 : 0) | (Rm << 16) | Option.GetData() | (Rn << 5) | Rd); } @@ -342,10 +344,12 @@ void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, AR void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + bool b64Bit = Is64Bit(Rd); + Rd = DecodeReg(Rd); Rm = DecodeReg(Rm); Rn = DecodeReg(Rn); - Write32((LogicalEnc[instenc][0] << 29) | (0x50 << 21) | (LogicalEnc[instenc][1] << 21) | \ + Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x50 << 21) | (LogicalEnc[instenc][1] << 21) | \ Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd); } @@ -457,14 +461,14 @@ void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 (immr << 16) | (imms << 10) | (Rn << 5) | Rd); } -void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { Rt = DecodeReg(Rt); Rn = DecodeReg(Rn); - Rm = DecodeReg(Rm); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); - Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (Rm << 16) | \ - (extend << 13) | (1 << 11) | (Rn << 5) | Rt); + Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | \ + Rm.GetData() | (1 << 11) | (Rn << 5) | Rt); } void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd) @@ -1158,6 +1162,14 @@ void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn) SBFM(Rd, Rn, 0, 31); } +void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn) +{ + UBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn) +{ + UBFM(Rd, Rn, 0, 15); +} // Load Register (Literal) void ARM64XEmitter::LDR(ARM64Reg Rt, u32 imm) @@ -1363,49 +1375,49 @@ void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) } // Load/Store register (register offset) -void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { - EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm); } -void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { - EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm); } -void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { bool b64Bit = Is64Bit(Rt); - 
EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm); } -void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { - EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm); } -void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { - EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm); } -void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { bool b64Bit = Is64Bit(Rt); - EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm); } -void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { bool b64Bit = Is64Bit(Rt); - EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm); } -void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { bool b64Bit = Is64Bit(Rt); - EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm); } -void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { - EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm); } -void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend) +void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { - EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm, extend); + EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm); } // Address of label/page PC-relative @@ -1679,7 +1691,6 @@ void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn) { - _assert_msg_(DYNA_REC, Rn <= SP, "%s only supports VFP registers!", __FUNCTION__); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); @@ -1709,6 +1720,18 @@ void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, (S << 12) | (size << 10) | (Rn << 5) | Rt); } +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + _assert_msg_(DYNA_REC, IsSingle(Rt), "%s doesn't support singles!", __FUNCTION__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0b11011 << 23) | (L << 22) | (R << 21) | (Rm << 16) | \ + (opcode << 13) | (S << 12) | (size << 10) | (Rn << 5) | Rt); +} + void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { _assert_msg_(DYNA_REC, IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); @@ -1774,6 +1797,49 @@ void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, (1 << 11) | (Rn << 5) | Rd); } +void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm) +{ + _assert_msg_(DYNA_REC, 
IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); + + bool is_double = !IsSingle(Rd); + + Rd = DecodeReg(Rd); + + Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (is_double << 22) | (type << 22) | \ + (imm << 13) | (1 << 12) | (imm5 << 5) | Rd); +} + +void ARM64FloatEmitter::EmitShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + bool quad = IsQuad(Rd); + + _assert_msg_(DYNA_REC, !immh, "%s bad encoding! Can't have zero immh", __FUNCTION__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (U << 29) | (0b1111 << 24) | (immh << 19) | (immb << 16) | \ + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} +void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | \ + (encoded_size << 10) | (Rn << 5) | Rt); +} + void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); @@ -1784,17 +1850,251 @@ void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s } // Loadstore single structure +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = index & 4; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + + } + else if (size == 16) + { + S = index & 2; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + + } + else if (size == 32) + { + S = index & 1; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = index & 4; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + + } + else if (size == 16) + { + S = index & 2; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + + } + else if (size == 32) + { + S = index & 1; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) { EmitLoadStoreSingleStructure(1, 0, 0b110, 0, size >> 4, Rt, 
Rn); } +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = index & 4; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + + } + else if (size == 16) + { + S = index & 2; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + + } + else if (size == 32) + { + S = index & 1; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = index & 4; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + + } + else if (size == 16) + { + S = index & 2; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + + } + else if (size == 32) + { + S = index & 1; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +// Loadstore multiple structure +void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + _assert_msg_(DYNA_REC, count == 0 || count > 4, "%s must have a count of 1 to 4 registers!", __FUNCTION__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); +} + // Scalar - 2 Source void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { Emit2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); } +// Scalar floating point immediate +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, u32 imm) +{ + EmitScalarImm(0, 0, 0, 0, Rd, imm); +} + // Vector void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { @@ -1843,6 +2143,18 @@ void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) { Emit2RegMisc(0, size >> 6, 0b10111, Rd, Rn); } +void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(0, dest_size >> 5, 0b10110, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(0, 2 | (size >> 6), 0b11011, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(1, 2 | (size >> 6), 0b11011, Rd, Rn); +} void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EmitThreeSame(1, size >> 6, 0b11111, Rd, Rn, Rm); @@ -1873,7 +2185,7 @@ void ARM64FloatEmitter::ORR(ARM64Reg 
Rd, ARM64Reg Rn, ARM64Reg Rm) } void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn) { - Emit2RegMisc(0, 1 | (size >> 4), 0, Rd, Rn); + Emit2RegMisc(0, size >> 4, 1, Rd, Rn); } void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn) { @@ -1883,6 +2195,18 @@ void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn) { Emit2RegMisc(0, size >> 4, 0, Rd, Rn); } +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(0, size >> 6, 0b11101, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(1, size >> 6, 0b11101, Rd, Rn); +} +void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(0, dest_size >> 4, 0b10010, Rd, Rn); +} // Move void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) @@ -1960,6 +2284,62 @@ void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 ind EmitCopy(1, 1, imm5, imm4, Rd, Rn); } +void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + _assert_msg_(DYNA_REC, Rd > SP, "%s destination must be a GPR!", __FUNCTION__); + _assert_msg_(DYNA_REC, b64Bit && size != 64, "%s must have a size of 64 when destination is 64bit!", __FUNCTION__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(b64Bit, 0, imm5, 0b0111, Rd, Rn); +} +void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + _assert_msg_(DYNA_REC, Rd > SP, "%s destination must be a GPR!", __FUNCTION__); + _assert_msg_(DYNA_REC, size == 64, "%s doesn't support 64bit destination. Use UMOV!", __FUNCTION__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + + EmitCopy(b64Bit, 0, imm5, 0b0101, Rd, Rn); +} + // One source void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn) { @@ -2000,6 +2380,26 @@ void ARM64FloatEmitter::FMOV(u8 size, bool top, ARM64Reg Rd, ARM64Reg Rn) EmitConversion(sf, 0, type, rmode, IsVector(Rd) ? 
0b111 : 0b110, Rd, Rn); } +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 0b010, Rd, Rn); +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 0b011, Rd, Rn); +} + void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) { EmitCompare(0, 0, 0, 0, Rn, Rm); @@ -2080,6 +2480,80 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) EmitPermute(size, 0b111, Rd, Rn, Rm); } +// Shift by immediate +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + _assert_msg_(DYNA_REC, shift >= src_size, "%s shift amount must less than the element size!", __FUNCTION__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3);; + } + EmitShiftImm(0, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + _assert_msg_(DYNA_REC, shift >= src_size, "%s shift amount must less than the element size!", __FUNCTION__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3);; + } + EmitShiftImm(1, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + _assert_msg_(DYNA_REC, shift >= dest_size, "%s shift amount must less than the element size!", __FUNCTION__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (dest_size == 8) + { + immh = 1; + } + else if (dest_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (dest_size == 32) + { + immh = 4 | ((shift >> 3) & 3);; + } + EmitShiftImm(1, immh, immb, 0b10000, Rd, Rn); +} + +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SSHLL(src_size, Rd, Rn, 0); +} + +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + USHLL(src_size, Rd, Rn, 0); +} + void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers) { for (auto it : registers) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index f6e09e6615..086c813aee 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -82,6 +82,8 @@ inline bool IsQuad(ARM64Reg reg) { return (reg & 0xC0) == 0xC0; } inline bool IsVector(ARM64Reg reg) { return (reg & 0xC0) != 0; } inline ARM64Reg DecodeReg(ARM64Reg reg) { return (ARM64Reg)(reg & 0x1F); } inline ARM64Reg EncodeRegTo64(ARM64Reg reg) { return (ARM64Reg)(reg | 0x20); } +inline ARM64Reg EncodeRegToDouble(ARM64Reg reg) { return (ARM64Reg)((reg & ~0xC0) | 0x80); } +inline ARM64Reg EncodeRegToQuad(ARM64Reg reg) { return (ARM64Reg)(reg | 0xC0); } enum OpType { @@ -217,10 +219,24 @@ private: u32 m_shift; public: - ArithOption(ARM64Reg Rd) + ArithOption(ARM64Reg Rd, bool index = false) { + // Indexed registers are a certain feature of AARch64 + // On Loadstore instructions that use a register offset + // We can have the register as an index + // If we are indexing then the offset register will + // be shifted to the left so we are indexing at intervals + // of the size of what we are loading + // 8-bit: Index does nothing + // 16-bit: 
Index LSL 1 + // 32-bit: Index LSL 2 + // 64-bit: Index LSL 3 + if (index) + m_shift = 4; + else + m_shift = 0; + m_destReg = Rd; - m_shift = 0; m_type = TYPE_EXTENDEDREG; if (Is64Bit(Rd)) { @@ -256,18 +272,20 @@ public: { return m_type; } + ARM64Reg GetReg() + { + return m_destReg; + } u32 GetData() const { switch (m_type) { case TYPE_EXTENDEDREG: - return (m_width == WIDTH_64BIT ? (1 << 31) : 0) | - (m_extend << 13) | + return (m_extend << 13) | (m_shift << 10); break; case TYPE_SHIFTEDREG: - return (m_width == WIDTH_64BIT ? (1 << 31) : 0) | - (m_shifttype << 22) | + return (m_shifttype << 22) | (m_shift << 10); break; default: @@ -309,7 +327,7 @@ private: void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size); void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos); void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); - void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend); + void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd); void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); @@ -505,6 +523,8 @@ public: void SXTB(ARM64Reg Rd, ARM64Reg Rn); void SXTH(ARM64Reg Rd, ARM64Reg Rn); void SXTW(ARM64Reg Rd, ARM64Reg Rn); + void UXTB(ARM64Reg Rd, ARM64Reg Rn); + void UXTH(ARM64Reg Rd, ARM64Reg Rn); // Load Register (Literal) void LDR(ARM64Reg Rt, u32 imm); @@ -551,16 +571,16 @@ public: void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); // Load/Store register (register offset) - void STRB(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); - void LDRB(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); - void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); - void STRH(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); - void LDRH(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); - void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); - void STR(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); - void LDR(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); - void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); - void PRFM(ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm, ExtendType extend = EXTEND_LSL); + void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); // Load/Store pair void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); @@ -588,11 +608,21 @@ public: void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); // Loadstore single structure + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); 
void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + + // Loadstore multiple structure + void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); // Scalar - 2 Source void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + // Scalar floating point immediate + void FMOV(ARM64Reg Rd, u32 imm); + // Vector void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); @@ -600,6 +630,9 @@ public: void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn); @@ -610,17 +643,24 @@ public: void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn); void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn); void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); // Move void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn); void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn); void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2); + void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); // One source void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); // Conversion between float and integer void FMOV(u8 size, bool top, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn); // Float comparison void FCMP(ARM64Reg Rn, ARM64Reg Rm); @@ -647,6 +687,13 @@ public: void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + // Shift by immediate + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + // ABI related void ABI_PushRegisters(BitSet32 registers); void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0)); @@ -662,11 +709,15 @@ private: void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); void Emit2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm); void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm); + void EmitShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + 
void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); }; class ARM64CodeBlock : public CodeBlock From 4262d2199a9ba632de2ad0dde300824a22b46b85 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 18 Jan 2015 16:28:22 -0600 Subject: [PATCH 3/3] [AArch64] Implements paired loadstores. --- Source/Core/Core/CMakeLists.txt | 1 + Source/Core/Core/PowerPC/JitArm64/Jit.h | 4 + .../JitArm64/JitArm64_LoadStorePaired.cpp | 130 ++++++ .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 8 +- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 420 ++++++++++++++++++ 5 files changed, 559 insertions(+), 4 deletions(-) create mode 100644 Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt index 4d2fa68d7b..788269a729 100644 --- a/Source/Core/Core/CMakeLists.txt +++ b/Source/Core/Core/CMakeLists.txt @@ -230,6 +230,7 @@ elseif(_M_ARM_64) PowerPC/JitArm64/JitArm64_LoadStore.cpp PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp PowerPC/JitArm64/JitArm64_Paired.cpp + PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp PowerPC/JitArm64/JitArm64_SystemRegisters.cpp PowerPC/JitArm64/JitArm64_Tables.cpp) endif() diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 94e9e945eb..87e09ce176 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -168,6 +168,10 @@ public: void ps_sum0(UGeckoInstruction inst); void ps_sum1(UGeckoInstruction inst); + // Loadstore paired + void psq_l(UGeckoInstruction inst); + void psq_st(UGeckoInstruction inst); + private: Arm64GPRCache gpr; Arm64FPRCache fpr; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp new file mode 100644 index 0000000000..4c6b565d5d --- /dev/null +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -0,0 +1,130 @@ +// Copyright 2014 Dolphin Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. 
+ +#include "Common/Arm64Emitter.h" +#include "Common/Common.h" +#include "Common/StringUtil.h" + +#include "Core/Core.h" +#include "Core/CoreTiming.h" +#include "Core/PowerPC/PowerPC.h" +#include "Core/PowerPC/PPCTables.h" +#include "Core/PowerPC/JitArm64/Jit.h" +#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h" +#include "Core/PowerPC/JitArm64/JitAsm.h" + +using namespace Arm64Gen; + +void JitArm64::psq_l(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITLoadStorePairedOff); + FALLBACK_IF(js.memcheck || !SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem); + + // X30 is LR + // X0 contains the scale + // X1 is the address + // X2 is a temporary + // Q0 is the return register + // Q1 is a temporary + bool update = inst.OPCD == 57; + s32 offset = inst.SIMM_12; + + gpr.Lock(W0, W1, W2, W30); + fpr.Lock(Q0, Q1); + + ARM64Reg arm_addr = gpr.R(inst.RA); + ARM64Reg scale_reg = W0; + ARM64Reg addr_reg = W1; + ARM64Reg type_reg = W2; + + LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I])); + + if (inst.RA || update) // Always uses the register on update + { + if (offset >= 0) + ADD(addr_reg, gpr.R(inst.RA), offset); + else + SUB(addr_reg, gpr.R(inst.RA), std::abs(offset)); + } + else + { + MOVI2R(addr_reg, (u32)offset); + } + + UBFM(type_reg, scale_reg, 16, 18); // Type + UBFM(scale_reg, scale_reg, 24, 29); // Scale + + if (update) + MOV(arm_addr, addr_reg); + + MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]); + LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true)); + BLR(X30); + + fpr.BindToRegister(inst.RS, false); + ARM64Reg VS = fpr.R(inst.RS); + m_float_emit.FCVTL(64, EncodeRegToDouble(VS), D0); + if (inst.W) + { + m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double + m_float_emit.INS(64, VS, 1, Q0, 0); + } + + gpr.Unlock(W0, W1, W2, W30); + fpr.Unlock(Q0, Q1); +} + +void JitArm64::psq_st(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITLoadStorePairedOff); + FALLBACK_IF(js.memcheck || !SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem); + + // X30 is LR + // X0 contains the scale + // X1 is the address + // Q0 is the store register + + bool update = inst.OPCD == 61; + s32 offset = inst.SIMM_12; + + gpr.Lock(W0, W1, W2, W30); + fpr.Lock(Q0, Q1); + + ARM64Reg arm_addr = gpr.R(inst.RA); + ARM64Reg scale_reg = W0; + ARM64Reg addr_reg = W1; + ARM64Reg type_reg = gpr.GetReg(); + + LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I])); + + if (inst.RA || update) // Always uses the register on update + { + if (offset >= 0) + ADD(addr_reg, gpr.R(inst.RA), offset); + else + SUB(addr_reg, gpr.R(inst.RA), std::abs(offset)); + } + else + { + MOVI2R(addr_reg, (u32)offset); + } + + UBFM(type_reg, scale_reg, 0, 2); // Type + UBFM(scale_reg, scale_reg, 8, 13); // Scale + + if (update) + MOV(arm_addr, addr_reg); + + ARM64Reg VS = fpr.R(inst.RS); + m_float_emit.FCVTN(32, D0, VS); + MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]); + LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true)); + BLR(X30); + + gpr.Unlock(W0, W1, W2, W30, type_reg); + fpr.Unlock(Q0, Q1); +} + diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp index f1087a27c8..b5997e9f7f 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp @@ -94,10 +94,10 @@ static GekkoOPTemplate primarytable[] = {54, &JitArm64::stfXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}}, {55, &JitArm64::stfXX}, 
//"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, - {56, &JitArm64::FallBackToInterpreter}, //"psq_l", OPTYPE_PS, FL_IN_A}}, - {57, &JitArm64::FallBackToInterpreter}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}}, - {60, &JitArm64::FallBackToInterpreter}, //"psq_st", OPTYPE_PS, FL_IN_A}}, - {61, &JitArm64::FallBackToInterpreter}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}}, + {56, &JitArm64::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}}, + {57, &JitArm64::psq_l}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}}, + {60, &JitArm64::psq_st}, //"psq_st", OPTYPE_PS, FL_IN_A}}, + {61, &JitArm64::psq_st}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}}, //missing: 0, 5, 6, 9, 22, 30, 62, 58 {0, &JitArm64::FallBackToInterpreter}, //"unknown_instruction", OPTYPE_UNKNOWN, 0}}, diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 600de5a703..029bca364a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -7,6 +7,7 @@ #include "Core/PowerPC/PowerPC.h" #include "Core/PowerPC/JitArm64/Jit.h" #include "Core/PowerPC/JitArm64/JitAsm.h" +#include "Core/PowerPC/JitCommon/JitAsmCommon.h" #include "Core/PowerPC/JitCommon/JitCache.h" using namespace Arm64Gen; @@ -89,9 +90,428 @@ void JitArm64AsmRoutineManager::Generate() ABI_PopRegisters(regs_to_save); RET(X30); + GenerateCommon(); + FlushIcache(); } void JitArm64AsmRoutineManager::GenerateCommon() { + // X0 is the scale + // X1 is address + // X2 is a temporary on stores + // X30 is LR + // Q0 is the return for loads + // is the register for stores + // Q1 is a temporary + ARM64Reg addr_reg = X1; + ARM64Reg scale_reg = X0; + ARM64FloatEmitter float_emit(this); + const u32 GPR_CALLER_SAVE = 0x6007FFFF; + + const u8* loadPairedIllegal = GetCodePtr(); + BRK(100); + const u8* loadPairedFloatTwo = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LD1(32, 1, D0, addr_reg); + float_emit.REV32(8, D0, D0); + RET(X30); + } + const u8* loadPairedU8Two = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LDR(16, INDEX_UNSIGNED, D0, addr_reg, 0); + float_emit.UXTL(8, D0, D0); + float_emit.UXTL(16, D0, D0); + float_emit.UCVTF(32, D0, D0); + + MOVI2R(addr_reg, (u64)&m_dequantizeTableS); + ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + RET(X30); + } + const u8* loadPairedS8Two = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LDR(16, INDEX_UNSIGNED, D0, addr_reg, 0); + float_emit.SXTL(8, D0, D0); + float_emit.SXTL(16, D0, D0); + float_emit.SCVTF(32, D0, D0); + + MOVI2R(addr_reg, (u64)&m_dequantizeTableS); + ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + RET(X30); + } + const u8* loadPairedU16Two = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LD1(16, 1, D0, addr_reg); + float_emit.REV16(8, D0, D0); + float_emit.UXTL(16, D0, D0); + float_emit.UCVTF(32, D0, D0); + + MOVI2R(addr_reg, (u64)&m_dequantizeTableS); + ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + RET(X30); + } + const u8* loadPairedS16Two = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LD1(16, 
1, D0, addr_reg); + float_emit.REV16(8, D0, D0); + float_emit.SXTL(16, D0, D0); + float_emit.SCVTF(32, D0, D0); + + MOVI2R(addr_reg, (u64)&m_dequantizeTableS); + ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + RET(X30); + } + + const u8* loadPairedFloatOne = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LDR(32, INDEX_UNSIGNED, D0, addr_reg, 0); + float_emit.REV32(8, D0, D0); + RET(X30); + } + const u8* loadPairedU8One = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LDR(8, INDEX_UNSIGNED, D0, addr_reg, 0); + float_emit.UXTL(8, D0, D0); + float_emit.UXTL(16, D0, D0); + float_emit.UCVTF(32, D0, D0); + + MOVI2R(addr_reg, (u64)&m_dequantizeTableS); + ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + RET(X30); + } + const u8* loadPairedS8One = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LDR(8, INDEX_UNSIGNED, D0, addr_reg, 0); + float_emit.SXTL(8, D0, D0); + float_emit.SXTL(16, D0, D0); + float_emit.SCVTF(32, D0, D0); + + MOVI2R(addr_reg, (u64)&m_dequantizeTableS); + ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + RET(X30); + } + const u8* loadPairedU16One = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LDR(16, INDEX_UNSIGNED, D0, addr_reg, 0); + float_emit.REV16(8, D0, D0); + float_emit.UXTL(16, D0, D0); + float_emit.UCVTF(32, D0, D0); + + MOVI2R(addr_reg, (u64)&m_dequantizeTableS); + ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + RET(X30); + } + const u8* loadPairedS16One = GetCodePtr(); + { + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.LDR(16, INDEX_UNSIGNED, D0, addr_reg, 0); + float_emit.REV16(8, D0, D0); + float_emit.SXTL(16, D0, D0); + float_emit.SCVTF(32, D0, D0); + + MOVI2R(addr_reg, (u64)&m_dequantizeTableS); + ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + RET(X30); + } + + pairedLoadQuantized = reinterpret_cast(const_cast(AlignCode16())); + ReserveCodeSpace(16 * sizeof(u8*)); + + pairedLoadQuantized[0] = loadPairedFloatTwo; + pairedLoadQuantized[1] = loadPairedIllegal; + pairedLoadQuantized[2] = loadPairedIllegal; + pairedLoadQuantized[3] = loadPairedIllegal; + pairedLoadQuantized[4] = loadPairedU8Two; + pairedLoadQuantized[5] = loadPairedU16Two; + pairedLoadQuantized[6] = loadPairedS8Two; + pairedLoadQuantized[7] = loadPairedS16Two; + + pairedLoadQuantized[8] = loadPairedFloatOne; + pairedLoadQuantized[9] = loadPairedIllegal; + pairedLoadQuantized[10] = loadPairedIllegal; + pairedLoadQuantized[11] = loadPairedIllegal; + pairedLoadQuantized[12] = loadPairedU8One; + pairedLoadQuantized[13] = loadPairedU16One; + pairedLoadQuantized[14] = loadPairedS8One; + pairedLoadQuantized[15] = loadPairedS16One; + + // Stores + const u8* storePairedIllegal = GetCodePtr(); + BRK(0x101); + const u8* storePairedFloat = GetCodePtr(); + { + BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2 + BitSet32 fprs(~3); // All except Q0/Q1 + + TST(DecodeReg(addr_reg), 6, 1); + FixupBranch argh 
= B(CC_NEQ); + + float_emit.REV32(8, D0, D0); + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.ST1(32, Q0, 0, addr_reg, SP); + float_emit.ST1(32, Q0, 1, addr_reg, SP); + RET(X30); + + SetJumpTarget(argh); + + ABI_PushRegisters(gprs); + float_emit.ABI_PushRegisters(fprs); + float_emit.UMOV(64, X0, Q0, 0); + ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32)); + MOVI2R(X30, (u64)Memory::Write_U64); + BLR(X30); + float_emit.ABI_PopRegisters(fprs); + ABI_PopRegisters(gprs); + RET(X30); + } + const u8* storePairedU8 = GetCodePtr(); + const u8* storePairedS8 = GetCodePtr(); + { + BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2 + BitSet32 fprs(~3); // All except Q0/Q1 + + MOVI2R(X2, (u64)&m_quantizeTableS); + ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + float_emit.FCVTZU(32, D0, D0); + float_emit.XTN(16, D0, D0); + float_emit.XTN(8, D0, D0); + + TST(DecodeReg(addr_reg), 6, 1); + FixupBranch argh = B(CC_NEQ); + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.ST1(8, Q0, 0, addr_reg, SP); + float_emit.ST1(8, Q0, 1, addr_reg, SP); + RET(X30); + + SetJumpTarget(argh); + ABI_PushRegisters(gprs); + float_emit.ABI_PushRegisters(fprs); + float_emit.UMOV(16, W0, Q0, 0); + REV16(W0, W0); + MOVI2R(X30, (u64)Memory::Write_U16); + BLR(X30); + float_emit.ABI_PopRegisters(fprs); + ABI_PopRegisters(gprs); + RET(X30); + } + + const u8* storePairedU16 = GetCodePtr(); + const u8* storePairedS16 = GetCodePtr(); + { + BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2 + BitSet32 fprs(~3); // All except Q0/Q1 + + MOVI2R(X2, (u64)&m_quantizeTableS); + ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LD1R(32, D1, scale_reg); + float_emit.FMUL(32, D0, D0, D1); + float_emit.FCVTZU(32, D0, D0); + float_emit.XTN(16, D0, D0); + + TST(DecodeReg(addr_reg), 6, 1); + FixupBranch argh = B(CC_NEQ); + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.ST1(16, Q0, 0, addr_reg, SP); + float_emit.ST1(16, Q0, 1, addr_reg, SP); + RET(X30); + + SetJumpTarget(argh); + ABI_PushRegisters(gprs); + float_emit.ABI_PushRegisters(fprs); + float_emit.UMOV(32, W0, Q0, 0); + REV32(W0, W0); + MOVI2R(X30, (u64)Memory::Write_U32); + BLR(X30); + float_emit.ABI_PopRegisters(fprs); + ABI_PopRegisters(gprs); + RET(X30); + } + + const u8* storeSingleFloat = GetCodePtr(); + { + BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2 + BitSet32 fprs(~3); // All except Q0/Q1 + + TST(DecodeReg(addr_reg), 6, 1); + FixupBranch argh = B(CC_NEQ); + + float_emit.REV32(8, D0, D0); + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.STR(32, INDEX_UNSIGNED, D0, addr_reg, 0); + RET(X30); + + SetJumpTarget(argh); + + ABI_PushRegisters(gprs); + float_emit.ABI_PushRegisters(fprs); + float_emit.UMOV(32, W0, Q0, 0); + MOVI2R(X30, (u64)&Memory::Write_U32); + BLR(X30); + float_emit.ABI_PopRegisters(fprs); + ABI_PopRegisters(gprs); + RET(X30); + } + const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii + { + BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2 + BitSet32 fprs(~3); // All except Q0/Q1 + + MOVI2R(X2, (u64)&m_quantizeTableS); + ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1); + float_emit.FCVTZU(32, D0, D0); + float_emit.XTN(16, D0, D0); + float_emit.XTN(8, D0, D0); + + TST(DecodeReg(addr_reg), 6, 1); + 
FixupBranch argh = B(CC_NEQ); + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.ST1(8, Q0, 0, addr_reg); + RET(X30); + + SetJumpTarget(argh); + ABI_PushRegisters(gprs); + float_emit.ABI_PushRegisters(fprs); + float_emit.UMOV(32, W0, Q0, 0); + MOVI2R(X30, (u64)&Memory::Write_U8); + BLR(X30); + float_emit.ABI_PopRegisters(fprs); + ABI_PopRegisters(gprs); + RET(X30); + } + const u8* storeSingleS8 = GetCodePtr(); + { + BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2 + BitSet32 fprs(~3); // All except Q0/Q1 + + MOVI2R(X2, (u64)&m_quantizeTableS); + ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1); + float_emit.FCVTZS(32, D0, D0); + float_emit.XTN(16, D0, D0); + float_emit.XTN(8, D0, D0); + + TST(DecodeReg(addr_reg), 6, 1); + FixupBranch argh = B(CC_NEQ); + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.ST1(8, Q0, 0, addr_reg); + RET(X30); + + SetJumpTarget(argh); + ABI_PushRegisters(gprs); + float_emit.ABI_PushRegisters(fprs); + float_emit.SMOV(32, W0, Q0, 0); + MOVI2R(X30, (u64)&Memory::Write_U8); + BLR(X30); + float_emit.ABI_PopRegisters(fprs); + ABI_PopRegisters(gprs); + RET(X30); + } + const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii + { + BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2 + BitSet32 fprs(~3); // All except Q0/Q1 + + MOVI2R(X2, (u64)&m_quantizeTableS); + ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1); + float_emit.FCVTZU(32, D0, D0); + float_emit.XTN(16, D0, D0); + + TST(DecodeReg(addr_reg), 6, 1); + FixupBranch argh = B(CC_NEQ); + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.ST1(16, Q0, 0, addr_reg); + RET(X30); + + SetJumpTarget(argh); + ABI_PushRegisters(gprs); + float_emit.ABI_PushRegisters(fprs); + float_emit.UMOV(32, W0, Q0, 0); + MOVI2R(X30, (u64)&Memory::Write_U16); + BLR(X30); + float_emit.ABI_PopRegisters(fprs); + ABI_PopRegisters(gprs); + RET(X30); + } + const u8* storeSingleS16 = GetCodePtr(); + { + BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2 + BitSet32 fprs(~3); // All except Q0/Q1 + + MOVI2R(X2, (u64)&m_quantizeTableS); + ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1); + float_emit.FCVTZS(32, D0, D0); + float_emit.XTN(16, D0, D0); + + TST(DecodeReg(addr_reg), 6, 1); + FixupBranch argh = B(CC_NEQ); + MOVK(addr_reg, ((u64)Memory::base >> 32) & 0xFFFF, SHIFT_32); + float_emit.ST1(16, Q0, 0, addr_reg); + RET(X30); + + SetJumpTarget(argh); + ABI_PushRegisters(gprs); + float_emit.ABI_PushRegisters(fprs); + float_emit.SMOV(32, W0, Q0, 0); + + MOVI2R(X30, (u64)&Memory::Write_U16); + BLR(X30); + float_emit.ABI_PopRegisters(fprs); + ABI_PopRegisters(gprs); + RET(X30); + } + + pairedStoreQuantized = reinterpret_cast(const_cast(AlignCode16())); + ReserveCodeSpace(16 * sizeof(u8*)); + + pairedStoreQuantized[0] = storePairedFloat; + pairedStoreQuantized[1] = storePairedIllegal; + pairedStoreQuantized[2] = storePairedIllegal; + pairedStoreQuantized[3] = storePairedIllegal; + pairedStoreQuantized[4] = storePairedU8; + pairedStoreQuantized[5] = storePairedU16; + pairedStoreQuantized[6] = storePairedS8; + pairedStoreQuantized[7] = storePairedS16; + + pairedStoreQuantized[8] = storeSingleFloat; + pairedStoreQuantized[9] = 
storePairedIllegal; + pairedStoreQuantized[10] = storePairedIllegal; + pairedStoreQuantized[11] = storePairedIllegal; + pairedStoreQuantized[12] = storeSingleU8; + pairedStoreQuantized[13] = storeSingleU16; + pairedStoreQuantized[14] = storeSingleS8; + pairedStoreQuantized[15] = storeSingleS16; }
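
The 16-entry pairedLoadQuantized/pairedStoreQuantized tables are laid out
so that entries 0-7 handle the paired forms and entries 8-15 the
single-value (W=1) forms, with the GQR type selecting the entry within
each half. The call sequence used by psq_l earlier in this patch:

    MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
    LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true)); // X30 = table[type]
    BLR(X30);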