From 99b34efe2d923c4fd6fbfdb051833d2af6ea2136 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 4 Feb 2020 18:29:52 +0100 Subject: [PATCH] move ARM64 JIT backend here --- CMakeLists.txt | 2 +- src/ARM.h | 9 +- src/ARMJIT.cpp | 4 + src/ARMJIT_A64/ARMJIT_ALU.cpp | 837 +++++ src/ARMJIT_A64/ARMJIT_Branch.cpp | 452 +++ src/ARMJIT_A64/ARMJIT_Compiler.cpp | 707 +++++ src/ARMJIT_A64/ARMJIT_Compiler.h | 234 ++ src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 848 +++++ src/ARM_InstrInfo.cpp | 7 +- src/CMakeLists.txt | 27 +- src/dolphin/Align.h | 24 + src/dolphin/Arm64Emitter.cpp | 4466 +++++++++++++++++++++++++++ src/dolphin/Arm64Emitter.h | 1152 +++++++ src/dolphin/ArmCommon.h | 27 + src/dolphin/BitUtils.h | 254 ++ src/dolphin/Compat.h | 12 + src/dolphin/MathUtil.cpp | 13 + src/dolphin/MathUtil.h | 121 + 18 files changed, 9188 insertions(+), 8 deletions(-) create mode 100644 src/ARMJIT_A64/ARMJIT_ALU.cpp create mode 100644 src/ARMJIT_A64/ARMJIT_Branch.cpp create mode 100644 src/ARMJIT_A64/ARMJIT_Compiler.cpp create mode 100644 src/ARMJIT_A64/ARMJIT_Compiler.h create mode 100644 src/ARMJIT_A64/ARMJIT_LoadStore.cpp create mode 100644 src/dolphin/Align.h create mode 100644 src/dolphin/Arm64Emitter.cpp create mode 100644 src/dolphin/Arm64Emitter.h create mode 100644 src/dolphin/ArmCommon.h create mode 100644 src/dolphin/BitUtils.h create mode 100644 src/dolphin/MathUtil.cpp create mode 100644 src/dolphin/MathUtil.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e53c607..6729e739 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,7 @@ detect_architecture("__i386__" x86) detect_architecture("__arm__" ARM) detect_architecture("__aarch64__" ARM64) -if (ARCHITECTURE STREQUAL x86_64) +if (ARCHITECTURE STREQUAL x86_64 OR ARCHITECTURE STREQUAL ARM64) option(ENABLE_JIT "Enable x64 JIT recompiler" ON) endif() diff --git a/src/ARM.h b/src/ARM.h index 96dd8577..7ef19388 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -254,10 +254,14 @@ public: u32 DTCMSetting, ITCMSetting; - u8 ITCM[0x8000]; + // for aarch64 JIT they need to go up here + // to be addressable by a 12-bit immediate u32 ITCMSize; - u8 DTCM[0x4000]; u32 DTCMBase, DTCMSize; + s32 RegionCodeCycles; + + u8 ITCM[0x8000]; + u8 DTCM[0x4000]; u8 ICache[0x2000]; u32 ICacheTags[64*4]; @@ -282,7 +286,6 @@ public: // code/16N/32N/32S u8 MemTimings[0x100000][4]; - s32 RegionCodeCycles; u8* CurICacheLine; bool (*GetMemRegion)(u32 addr, bool write, NDS::MemRegion* region); diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 8fd77083..561fabbb 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -6,7 +6,11 @@ #include "Config.h" #include "ARMJIT_Internal.h" +#if defined(__x86_64__) #include "ARMJIT_x64/ARMJIT_Compiler.h" +#else +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#endif #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp new file mode 100644 index 00000000..0fe6a970 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -0,0 +1,837 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +void Compiler::Comp_RegShiftReg(int op, bool S, Op2& op2, ARM64Reg rs) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + UBFX(W1, rs, 0, 8); + + if (!S) + { + if (op == 3) + RORV(W0, op2.Reg.Rm, W1); + else + { + CMP(W1, 32); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GE); + ASRV(W0, op2.Reg.Rm, W1); + } + else + { + if (op == 0) + LSLV(W0, op2.Reg.Rm, W1); + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W0, WZR, W0, CC_GE); + } + } + } + else + { + MOV(W0, op2.Reg.Rm); + FixupBranch zero = CBZ(W1); + + SUB(W1, W1, 1); + if (op == 3) + { + RORV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + CMP(W1, 31); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GT); + ASRV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + if (op == 0) + { + LSLV(W0, op2.Reg.Rm, W1); + UBFX(W1, W0, 31, 1); + } + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W1, WZR, op ? W0 : W1, CC_GT); + BFI(RCPSR, W1, 29, 1); + CSEL(W0, WZR, W0, CC_GE); + } + } + + MOV(W0, W0, ArithOption(W0, (ShiftType)op, 1)); + SetJumpTarget(zero); + } + op2 = Op2(W0, ST_LSL, 0); +} + +void Compiler::Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, ARM64Reg tmp) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + switch (op) + { + case 0: // LSL + if (S && amount) + { + UBFX(tmp, op2.Reg.Rm, 32 - amount, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_LSL, amount); + return; + case 1: // LSR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + if (amount == 0) + { + op2 = Op2(0); + return; + } + op2 = Op2(op2.Reg.Rm, ST_LSR, amount); + return; + case 2: // ASR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ASR, amount ? amount : 31); + return; + case 3: // ROR + if (amount == 0) + { + UBFX(tmp, RCPSR, 29, 1); + LSL(tmp, tmp, 31); + if (S) + BFI(RCPSR, op2.Reg.Rm, 29, 1); + ORR(tmp, tmp, op2.Reg.Rm, ArithOption(tmp, ST_LSR, 1)); + + op2 = Op2(tmp, ST_LSL, 0); + } + else + { + if (S) + { + UBFX(tmp, op2.Reg.Rm, amount - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ROR, amount); + } + return; + } +} + +void Compiler::Comp_RetriveFlags(bool retriveCV) +{ + if (CurInstr.SetFlags) + CPSRDirty = true; + + if (CurInstr.SetFlags & 0x4) + { + CSET(W0, CC_EQ); + BFI(RCPSR, W0, 30, 1); + } + if (CurInstr.SetFlags & 0x8) + { + CSET(W0, CC_MI); + BFI(RCPSR, W0, 31, 1); + } + if (retriveCV) + { + if (CurInstr.SetFlags & 0x2) + { + CSET(W0, CC_CS); + BFI(RCPSR, W0, 29, 1); + } + if (CurInstr.SetFlags & 0x1) + { + CSET(W0, CC_VS); + BFI(RCPSR, W0, 28, 1); + } + } +} + +void Compiler::Comp_Logical(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (S && !CurInstr.SetFlags) + S = false; + + switch (op) + { + case 0x0: // AND + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, op2.Imm, W0); + else + ANDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, op2.Imm, W0); + else + AND(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x1: // EOR + if (op2.IsImm) + EORI2R(rd, rn, op2.Imm, W0); + else + EOR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xC: // ORR + if (op2.IsImm) + ORRI2R(rd, rn, op2.Imm, W0); + else + ORR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xE: // BIC + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, ~op2.Imm, W0); + else + BICS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, ~op2.Imm, W0); + else + BIC(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + if (S && !CurInstr.SetFlags) + S = false; + + bool CVInGP = false; + switch (op) + { + case 0x2: // SUB + if (S) + { + if (op2.IsImm) + SUBSI2R(rd, rn, op2.Imm, W0); + else + SUBS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + { + MOVI2R(W2, op2.Imm); + SUBI2R(rd, rn, op2.Imm, W0); + } + else + SUB(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x3: // RSB + if (op2.IsZero()) + { + op2 = Op2(WZR); + } + else if (op2.IsImm) + { + MOVI2R(W1, op2.Imm); + op2 = Op2(W1); + } + else if (op2.Reg.ShiftAmount != 0) + { + MOV(W1, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W1); + } + + if (S) + SUBS(rd, op2.Reg.Rm, rn); + else + SUB(rd, op2.Reg.Rm, rn); + break; + case 0x4: // ADD + if (S) + { + if (op2.IsImm) + ADDSI2R(rd, rn, op2.Imm, W0); + else + ADDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ADDI2R(rd, rn, op2.Imm, W0); + else + ADD(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x5: // ADC + UBFX(W2, RCPSR, 29, 1); + if (S) + { + CVInGP = true; + ADDS(W1, rn, W2); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm, W0); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, rn, W2); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm, W0); + else + ADD(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x6: // SBC + UBFX(W2, RCPSR, 29, 1); + // W1 = -op2 - 1 + if (op2.IsImm) + MOVI2R(W1, ~op2.Imm); + else + ORN(W1, WZR, op2.Reg.Rm, op2.ToArithOption()); + if (S) + { + CVInGP = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + ADDS(rd, rn, W1); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + ADD(rd, rn, W1); + } + break; + case 0x7: // RSC + UBFX(W2, RCPSR, 29, 1); + // W1 = -rn - 1 + MVN(W1, rn); + if (S) + { + CVInGP = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm); + else + ADD(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + { + if (CVInGP) + { + BFI(RCPSR, W2, 29, 1); + BFI(RCPSR, W3, 28, 1); + } + Comp_RetriveFlags(!CVInGP); + } +} + +void Compiler::Comp_Compare(int op, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + switch (op) + { + case 0x8: // TST + if (op2.IsImm) + TSTI2R(rn, op2.Imm, W0); + else + ANDS(WZR, rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0x9: // TEQ + if (op2.IsImm) + EORI2R(W0, rn, op2.Imm, W0); + else + EOR(W0, rn, op2.Reg.Rm, op2.ToArithOption()); + TST(W0, W0); + break; + case 0xA: // CMP + if (op2.IsImm) + CMPI2R(rn, op2.Imm, W0); + else + CMP(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0xB: // CMN + if (op2.IsImm) + ADDSI2R(WZR, rn, op2.Imm, W0); + else + CMN(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + } + + Comp_RetriveFlags(op >= 0xA); +} + +// also counts cycles! +void Compiler::A_Comp_GetOp2(bool S, Op2& op2) +{ + if (CurInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + op2 = Op2(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); + } + else + { + int op = (CurInstr.Instr >> 5) & 0x3; + op2.Reg.Rm = MapReg(CurInstr.A_Reg(0)); + if (CurInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + if (CurInstr.A_Reg(0) == 15) + { + ADD(W0, op2.Reg.Rm, 4); + op2.Reg.Rm = W0; + } + Comp_RegShiftReg(op, S, op2, rs); + } + else + { + Comp_AddCycles_C(); + + int amount = (CurInstr.Instr >> 7) & 0x1F; + Comp_RegShiftImm(op, amount, S, op2); + } + } +} + +void Compiler::A_Comp_ALUCmpOp() +{ + u32 op = (CurInstr.Instr >> 21) & 0xF; + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(op <= 0x9, op2); + + Comp_Compare(op, rn, op2); +} + +void Compiler::A_Comp_ALUMovOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + Op2 op2; + A_Comp_GetOp2(S, op2); + + if (op == 0xF) // MVN + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm); + MOVI2R(rd, ~op2.Imm); + } + else + ORN(rd, WZR, op2.Reg.Rm, op2.ToArithOption()); + } + else // MOV + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm); + MOVI2R(rd, op2.Imm); + } + else + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + + if (S) + { + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_ALUTriOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + bool logical = (1 << op) & 0xF303; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(S && logical, op2); + + if (op2.IsImm && op2.Imm == 0) + op2 = Op2(WZR, ST_LSL, 0); + + if (logical) + Comp_Logical(op, S, rd, rn, op2); + else + Comp_Arithmetic(op, S, rd, rn, op2); + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_Clz() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + + CLZ(rd, rm); + + assert(Num == 0); +} + +void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg rs, ARM64Reg rn) +{ + if (Num == 0) + { + Comp_AddCycles_CI(S ? 3 : 1); + } + else + { + CLZ(W0, rs); + CLS(W1, rs); + CMP(W0, W1); + CSEL(W0, W0, W1, CC_GT); + Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (mla) + MADD(rd, rm, rs, rn); + else + MUL(rd, rm, rs); + + if (S && FlagsNZNeeded()) + { + TST(rd, rd); + Comp_RetriveFlags(false); + } +} + +void Compiler::A_Comp_Mul_Long() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); + + if (Num == 0) + { + Comp_AddCycles_CI(S ? 3 : 1); + } + else + { + CLZ(W0, rs); + CLS(W1, rs); + CMP(W0, W1); + CSEL(W0, W0, W1, CC_GT); + Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (add) + { + MOV(W0, rn); + BFI(X0, EncodeRegTo64(rd), 32, 32); + if (sign) + SMADDL(EncodeRegTo64(rn), rm, rs, X0); + else + UMADDL(EncodeRegTo64(rn), rm, rs, X0); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + else + { + if (sign) + SMULL(EncodeRegTo64(rn), rm, rs); + else + UMULL(EncodeRegTo64(rn), rm, rs); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::A_Comp_Mul() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + + bool S = CurInstr.Instr & (1 << 20); + bool mla = CurInstr.Instr & (1 << 21); + ARM64Reg rn = INVALID_REG; + if (mla) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_Mul_Mla(S, mla, rd, rm, rs, rn); +} + +void Compiler::T_Comp_ShiftImm() +{ + Comp_AddCycles_C(); + + u32 op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + Op2 op2; + op2.Reg.Rm = MapReg(CurInstr.T_Reg(3)); + Comp_RegShiftImm(op, amount, true, op2); + if (op2.IsImm) + MOVI2R(rd, op2.Imm); + else + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + + Comp_RetriveFlags(false); +} + +void Compiler::T_Comp_AddSub_() +{ + Comp_AddCycles_C(); + + Op2 op2; + if (CurInstr.Instr & (1 << 10)) + op2 = Op2((CurInstr.Instr >> 6) & 0x7); + else + op2 = Op2(MapReg(CurInstr.T_Reg(6))); + + Comp_Arithmetic( + CurInstr.Instr & (1 << 9) ? 0x2 : 0x4, + true, + MapReg(CurInstr.T_Reg(0)), + MapReg(CurInstr.T_Reg(3)), + op2); +} + +void Compiler::T_Comp_ALUImm8() +{ + Comp_AddCycles_C(); + + u32 imm = CurInstr.Instr & 0xFF; + int op = (CurInstr.Instr >> 11) & 0x3; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + + switch (op) + { + case 0: + MOVI2R(rd, imm); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + case 1: + Comp_Compare(0xA, rd, Op2(imm)); + break; + case 2: + case 3: + Comp_Arithmetic(op == 2 ? 0x4 : 0x2, true, rd, rd, Op2(imm)); + break; + } +} + +void Compiler::T_Comp_ALU() +{ + int op = (CurInstr.Instr >> 6) & 0xF; + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.T_Reg(3)); + + if ((op >= 0x2 && op <= 0x4) || op == 0x7) + Comp_AddCycles_CI(1); + else + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + Comp_Logical(0x0, true, rd, rd, Op2(rs)); + break; + case 0x1: + Comp_Logical(0x1, true, rd, rd, Op2(rs)); + break; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + Op2 op2; + op2.Reg.Rm = rd; + Comp_RegShiftReg(op == 0x7 ? 3 : (op - 0x2), true, op2, rs); + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + break; + case 0x5: + Comp_Arithmetic(0x5, true, rd, rd, Op2(rs)); + break; + case 0x6: + Comp_Arithmetic(0x6, true, rd, rd, Op2(rs)); + break; + case 0x8: + Comp_Compare(0x8, rd, Op2(rs)); + break; + case 0x9: + Comp_Arithmetic(0x3, true, rd, rs, Op2(0)); + break; + case 0xA: + Comp_Compare(0xA, rd, Op2(rs)); + break; + case 0xB: + Comp_Compare(0xB, rd, Op2(rs)); + break; + case 0xC: + Comp_Logical(0xC, true, rd, rd, Op2(rs)); + break; + case 0xD: + Comp_Mul_Mla(true, false, rd, rd, rs, INVALID_REG); + break; + case 0xE: + Comp_Logical(0xE, true, rd, rd, Op2(rs)); + break; + case 0xF: + MVN(rd, rs); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rs = MapReg((CurInstr.Instr >> 3) & 0xF); + + u32 op = (CurInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0: + Comp_Arithmetic(0x4, false, rdMapped, rdMapped, Op2(rs)); + break; + case 1: + Comp_Compare(0xA, rdMapped, rs); + return; + case 2: + MOV(rdMapped, rs); + break; + } + + if (rd == 15) + { + Comp_JumpTo(rdMapped, false, false); + } +} + +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + ARM64Reg sp = MapReg(13); + u32 offset = (CurInstr.Instr & 0x7F) << 2; + if (CurInstr.Instr & (1 << 7)) + SUB(sp, sp, offset); + else + ADD(sp, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + ARM64Reg sp = MapReg(13); + ADD(rd, sp, offset); + } + else + MOVI2R(rd, (R15 & ~2) + offset); +} + +} \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp new file mode 100644 index 00000000..542f0b73 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -0,0 +1,452 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +// hack +const int kCodeCacheTiming = 3; + +namespace ARMJIT +{ + +template +void jumpToTrampoline(T* cpu, u32 addr, bool changeCPSR) +{ + cpu->JumpTo(addr, changeCPSR); +} + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + // it's not completely safe to assume stuff like, which instructions to preload + // we'll see how it works out + + IrregularCycles = true; + + u32 newPC; + u32 cycles = 0; + bool setupRegion = false; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + ORRI2R(RCPSR, RCPSR, 0x20); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + ANDI2R(RCPSR, RCPSR, ~0x20); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 oldregion = R15 >> 24; + u32 newregion = addr >> 24; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; + cpu9->RegionCodeCycles = regionCodeCycles; + + MOVI2R(W0, regionCodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + setupRegion = newregion != oldregion; + if (setupRegion) + cpu9->SetupCodeMem(addr); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + cpu9->CodeRead32(addr-2, true) >> 16; + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + if (setupRegion) + cpu9->SetupCodeMem(R15); + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + MOVI2R(W0, codeRegion); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeRegion)); + MOVI2R(W0, codeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; + } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; + } + + if (Exit) + { + MOVI2R(W0, newPC); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + } + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + ADD(RCycles, RCycles, cycles); +} + + +void* Compiler::Gen_JumpTo9(int kind) +{ + AlignCode16(); + void* res = GetRXPtr(); + + MOVI2R(W2, kCodeCacheTiming); + // W1 - code cycles non branch + // W2 - branch code cycles + LSR(W1, W0, 12); + LSL(W1, W1, 2); + ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); + LDRB(W1, RCPU, W1); + + LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + CMP(W0, W3); + FixupBranch outsideITCM = B(CC_LO); + MOVI2R(W1, 1); + MOVI2R(W2, 1); + SetJumpTarget(outsideITCM); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + ANDI2R(W0, W0, ~3); + + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ADD(W3, W0, 4); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + ADD(W1, W1, W2); + ADD(RCycles, RCycles, W1); + + RET(); + } + if (kind == 0 || kind == 2) + { + if (kind == 0) + { + SetJumpTarget(switchToThumb); + + ORRI2R(RCPSR, RCPSR, 0x20); + } + + ANDI2R(W0, W0, ~1); + + ADD(W3, W0, 2); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + FixupBranch halfwordLoc = TBZ(W0, 1); + ADD(W1, W1, W2); + ADD(RCycles, RCycles, W1); + RET(); + + SetJumpTarget(halfwordLoc); + ADD(RCycles, RCycles, W2); + RET(); + } + + return res; +} + +void* Compiler::Gen_JumpTo7(int kind) +{ + void* res = GetRXPtr(); + + LSR(W1, W0, 24); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeRegion)); + LSR(W1, W0, 15); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeCycles)); + + MOVP2R(X2, NDS::ARM7MemTimings); + LDR(W3, X2, ArithOption(W1, true)); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + UBFX(W2, W3, 0, 8); + UBFX(W3, W3, 8, 8); + ADD(W2, W3, W2); + ADD(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~3); + + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ADD(W3, W0, 4); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + RET(); + } + if (kind == 0 || kind == 2) + { + if (kind == 0) + { + SetJumpTarget(switchToThumb); + + ORRI2R(RCPSR, RCPSR, 0x20); + } + + UBFX(W2, W3, 16, 8); + UBFX(W3, W3, 24, 8); + ADD(W2, W3, W2); + ADD(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~1); + + ADD(W3, W0, 2); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + RET(); + } + + return res; +} + +void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR) +{ + IrregularCycles = true; + + if (!restoreCPSR) + { + if (switchThumb) + CPSRDirty = true; + MOV(W0, addr); + BL((Num ? JumpToFuncs7 : JumpToFuncs9)[switchThumb ? 0 : (Thumb + 1)]); + } + else + { + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); + bool previouslyDirty = CPSRDirty; + SaveCPSR(); + + if (restoreCPSR) + { + if (Thumb || CurInstr.Cond() >= 0xE) + RegCache.Flush(); + else + { + // the ugly way... + // we only save them, to load and save them again + for (int reg : hiRegsLoaded) + SaveReg(reg, RegCache.Mapping[reg]); + } + } + + if (switchThumb) + MOV(W1, addr); + else + { + if (Thumb) + ORRI2R(W1, addr, 1); + else + ANDI2R(W1, addr, ~1); + } + MOV(X0, RCPU); + MOVI2R(W2, restoreCPSR); + if (Num == 0) + QuickCallFunction(X3, jumpToTrampoline); + else + QuickCallFunction(X3, jumpToTrampoline); + + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) + { + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } + + if (previouslyDirty) + LoadCPSR(); + CPSRDirty = previouslyDirty; + } +} + +void Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOVI2R(MapReg(14), R15 - 4); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + ARM64Reg rn = MapReg(CurInstr.A_Reg(0)); + MOV(W0, rn); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOVI2R(MapReg(14), R15 - 4); + Comp_JumpTo(W0, true); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + Comp_BranchSpecialBehaviour(); + + FixupBranch skipFailed = B(); + SetJumpTarget(skipExecute); + Comp_AddCycles_C(true); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + SaveCPSR(false); + RegCache.PrepareExit(); + + ADD(W0, RCycles, ConstantCycles); + ABI_PopRegisters(SavedRegs); + RET(); + } + + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + + if (link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(W0, MapReg(CurInstr.A_Reg(3))); + MOVI2R(MapReg(14), R15 - 1); + Comp_JumpTo(W0, true); + } + else + { + ARM64Reg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn, true); + } +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOVI2R(MapReg(14), R15 + offset); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + ARM64Reg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + ADD(W0, lr, offset); + MOVI2R(lr, (R15 - 2) | 1); + Comp_JumpTo(W0, Num == 0 && !(CurInstr.Instr & (1 << 12))); +} + +void Compiler::T_Comp_BL_Merged() +{ + Comp_AddCycles_C(); + + R15 += 2; + + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) + target |= 1; + + MOVI2R(MapReg(14), (R15 - 2) | 1); + + Comp_JumpTo(target); +} + +} \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp new file mode 100644 index 00000000..89d00298 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -0,0 +1,707 @@ +#include "ARMJIT_Compiler.h" + +#include "../ARMInterpreter.h" + +#include "../ARMJIT_Internal.h" + +#ifdef __SWITCH__ +#include "../switch/compat_switch.h" + +extern char __start__; +#endif + +#include + +using namespace Arm64Gen; + + +namespace ARMJIT +{ + +/* + + Recompiling classic ARM to ARMv8 code is at the same time + easier and trickier than compiling to a less related architecture + like x64. At one hand you can translate a lot of instructions directly. + But at the same time, there are a ton of exceptions, like for + example ADD and SUB can't have a RORed second operand on ARMv8. + */ + +template <> +const ARM64Reg RegisterCache::NativeRegAllocOrder[] = + {W19, W20, W21, W22, W23, W24, W25, W26}; +template <> +const int RegisterCache::NativeRegsAvailable = 8; + +const int JitMemSize = 16 * 1024 * 1024; + +void Compiler::MovePC() +{ + ADD(MapReg(15), MapReg(15), Thumb ? 2 : 4); +} + +Compiler::Compiler() +{ +#ifdef __SWITCH__ + JitRWBase = memalign(0x1000, JitMemSize); + + JitRXStart = (u8*)&__start__ - JitMemSize - 0x1000; + JitRWStart = virtmemReserve(JitMemSize); + MemoryInfo info = {0}; + u32 pageInfo = {0}; + int i = 0; + while (JitRXStart != NULL) + { + svcQueryMemory(&info, &pageInfo, (u64)JitRXStart); + if (info.type != MemType_Unmapped) + JitRXStart = (void*)((u8*)info.addr - JitMemSize - 0x1000); + else + break; + if (i++ > 8) + { + printf("couldn't find unmapped place for jit memory\n"); + JitRXStart = NULL; + } + } + + assert(JitRXStart != NULL); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize, Perm_Rx)); + assert(succeded); + succeded = R_SUCCEEDED(svcMapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + + SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); + JitMemUseableSize = JitMemSize; + Reset(); +#endif + + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 2; j++) + { + MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j); + } + } + MemFunc7[0][0] = (void*)NDS::ARM7Read8; + MemFunc7[1][0] = (void*)NDS::ARM7Read16; + MemFunc7[2][0] = (void*)NDS::ARM7Read32; + MemFunc7[0][1] = (void*)NDS::ARM7Write8; + MemFunc7[1][1] = (void*)NDS::ARM7Write16; + MemFunc7[2][1] = (void*)NDS::ARM7Write32; + + for (int i = 0; i < 2; i++) + { + for (int j = 0; j < 2; j++) + { + MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j); + MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j); + } + } + + for (int i = 0; i < 3; i++) + { + JumpToFuncs9[i] = Gen_JumpTo9(i); + JumpToFuncs7[i] = Gen_JumpTo7(i); + } + + /* + W0 - mode + W1 - reg num + W3 - in/out value of reg + */ + { + ReadBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + CMP(W0, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + FixupBranch notEverything = B(CC_LT); + CMP(W0, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W0, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W0, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W0, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + RET(); + SetJumpTarget(irq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + RET(); + SetJumpTarget(svc); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + RET(); + SetJumpTarget(abt); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT)); + RET(); + SetJumpTarget(und); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + RET(); + } + { + WriteBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + CMP(W0, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + FixupBranch notEverything = B(CC_LT); + CMP(W0, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W0, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W0, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W0, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + MOVI2R(W4, 0); + RET(); + + SetJumpTarget(fiq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(irq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(svc); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(abt); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(und); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + MOVI2R(W4, 1); + RET(); + } + + //FlushIcache(); + + JitMemUseableSize -= GetCodeOffset(); + SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); +} + +Compiler::~Compiler() +{ +#ifdef __SWITCH__ + if (JitRWStart != NULL) + { + bool succeded = R_SUCCEEDED(svcUnmapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + virtmemFree(JitRWStart, JitMemSize); + succeded = R_SUCCEEDED(svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + free(JitRWBase); + } +#endif +} + +void Compiler::LoadReg(int reg, ARM64Reg nativeReg) +{ + if (reg == 15) + MOVI2R(nativeReg, R15); + else + LDR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::SaveReg(int reg, ARM64Reg nativeReg) +{ + STR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + LDR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); +} + +void Compiler::SaveCPSR(bool markClean) +{ + if (CPSRDirty) + { + STR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); + CPSRDirty = CPSRDirty && !markClean; + } +} + +FixupBranch Compiler::CheckCondition(u32 cond) +{ + if (cond >= 0x8) + { + LSR(W1, RCPSR, 28); + MOVI2R(W2, 1); + LSLV(W2, W2, W1); + ANDI2R(W2, W2, ARM::ConditionTable[cond], W3); + + return CBZ(W2); + } + else + { + u8 bit = (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))); + + if (cond & 1) + return TBNZ(RCPSR, bit); + else + return TBZ(RCPSR, bit); + } +} + +#define F(x) &Compiler::A_Comp_##x +const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // EOR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SUB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADD + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SBC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ORR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MOV + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // BIC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MVN + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // TST + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // TEQ + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMP + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMN + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // Mul + F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, NULL, NULL, NULL, NULL, + // ARMv5 exclusives + F(Clz), NULL, NULL, NULL, NULL, + + // STR + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // STRB + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // LDR + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // LDRB + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // STRH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRD + NULL, NULL, NULL, NULL, + // STRD + NULL, NULL, NULL, NULL, + // LDRH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRSB + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRSH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // Swap + NULL, NULL, + // LDM, STM + F(LDM_STM), F(LDM_STM), + // Branch + F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), + // Special + NULL, NULL, NULL, NULL, NULL, NULL, NULL +}; +#undef F +#define F(x) &Compiler::T_Comp_##x +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = +{ + // Shift imm + F(ShiftImm), F(ShiftImm), F(ShiftImm), + // Add/sub tri operand + F(AddSub_), F(AddSub_), F(AddSub_), F(AddSub_), + // 8 bit imm + F(ALUImm8), F(ALUImm8), F(ALUImm8), F(ALUImm8), + // ALU + F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), + F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), + // ALU hi reg + F(ALU_HiReg), F(ALU_HiReg), F(ALU_HiReg), + // PC/SP relative ops + F(RelAddr), F(RelAddr), F(AddSP), + // LDR PC rel + F(LoadPCRel), + // LDR/STR reg offset + F(MemReg), F(MemReg), F(MemReg), F(MemReg), + // LDR/STR sign extended, half + F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), + // LDR/STR imm offset + F(MemImm), F(MemImm), F(MemImm), F(MemImm), + // LDR/STR half imm offset + F(MemImmHalf), F(MemImmHalf), + // LDR/STR sp rel + F(MemSPRel), F(MemSPRel), + // PUSH/POP + F(PUSH_POP), F(PUSH_POP), + // LDMIA, STMIA + F(LDMIA_STMIA), F(LDMIA_STMIA), + // Branch + F(BCOND), F(BranchXchangeReg), F(BranchXchangeReg), F(B), F(BL_LONG_1), F(BL_LONG_2), + // Unk, SVC + NULL, NULL, + F(BL_Merged) +}; + +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; +} + +void Compiler::Comp_BranchSpecialBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + { + MOVI2R(W0, 1); + STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop)); + } + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + SaveCPSR(false); + RegCache.PrepareExit(); + ADD(W0, RCycles, ConstantCycles); + ABI_PopRegisters(SavedRegs); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +{ + if (JitMemUseableSize - GetCodeOffset() < 1024 * 16) + { + printf("JIT memory full, resetting...\n"); + ResetBlockCache(); + } + + JitBlockEntry res = (JitBlockEntry)GetRXPtr(); + + Thumb = thumb; + Num = cpu->Num; + CurCPU = cpu; + ConstantCycles = 0; + RegCache = RegisterCache(this, instrs, instrsCount, true); + + //printf("compiling block at %x\n", R15 - (Thumb ? 2 : 4)); + const u32 ALL_CALLEE_SAVED = 0x7FF80000; + + SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED)); + + //if (Num == 1) + { + ABI_PushRegisters(SavedRegs); + + MOVP2R(RCPU, CurCPU); + MOVI2R(RCycles, 0); + + LoadCPSR(); + } + + for (int i = 0; i < instrsCount; i++) + { + CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; + + CompileFunc comp = Thumb + ? T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + + Exit = i == (instrsCount - 1) || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + + //printf("%x instr %x regs: r%x w%x n%x flags: %x %x %x\n", R15, CurInstr.Instr, CurInstr.Info.SrcRegs, CurInstr.Info.DstRegs, CurInstr.Info.ReadFlags, CurInstr.Info.NotStrictlyNeeded, CurInstr.Info.WriteFlags, CurInstr.SetFlags); + + bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + { + MOVI2R(W0, R15); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + if (comp == NULL) + { + MOVI2R(W0, CurInstr.Instr); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CurInstr)); + } + if (Num == 0) + { + MOVI2R(W0, (s32)CurInstr.CodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + } + } + + if (comp == NULL) + { + SaveCPSR(); + RegCache.Flush(); + } + else + RegCache.Prepare(Thumb, i); + + if (Thumb) + { + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretTHUMB[CurInstr.Info.Kind]); + } + else + (this->*comp)(); + } + else + { + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + if (comp) + (this->*comp)(); + else + { + MOV(X0, RCPU); + QuickCallFunction(X1, ARMInterpreter::A_BLX_IMM); + } + } + else if (cond == 0xF) + Comp_AddCycles_C(); + else + { + IrregularCycles = false; + + FixupBranch skipExecute; + if (cond < 0xE) + skipExecute = CheckCondition(cond); + + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretARM[CurInstr.Info.Kind]); + } + else + { + (this->*comp)(); + } + + Comp_BranchSpecialBehaviour(); + + if (cond < 0xE) + { + if (IrregularCycles) + { + FixupBranch skipNop = B(); + SetJumpTarget(skipExecute); + + Comp_AddCycles_C(); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + SaveCPSR(false); + RegCache.PrepareExit(); + ADD(W0, RCycles, ConstantCycles); + ABI_PopRegisters(SavedRegs); + RET(); + } + + SetJumpTarget(skipNop); + } + else + SetJumpTarget(skipExecute); + } + + } + } + + if (comp == NULL) + LoadCPSR(); + } + + RegCache.Flush(); + + //if (Num == 1) + { + SaveCPSR(); + + ADD(W0, RCycles, ConstantCycles); + + ABI_PopRegisters(SavedRegs); + } + //else + // ADD(RCycles, RCycles, ConstantCycles); + + RET(); + + FlushIcache(); + + //printf("finished\n"); + + return res; +} + +void Compiler::Reset() +{ + SetCodePtr(0); + + const u32 brk_0 = 0xD4200000; + + for (int i = 0; i < JitMemUseableSize / 4; i++) + *(((u32*)GetRWPtr()) + i) = brk_0; +} + +void Compiler::Comp_AddCycles_C(bool nonConst) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); + + if (!nonConst && !CurInstr.Info.Branches()) + ConstantCycles += cycles; + else + ADD(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 numI) +{ + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; + + if (Thumb || CurInstr.Cond() >= 0xE) + ConstantCycles += cycles; + else + ADD(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) +{ + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + c; + + ADD(RCycles, RCycles, numI, shift); + if (Thumb || CurInstr.Cond() >= 0xE) + ConstantCycles += c; + else + ADD(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(RCycles, RCycles, cycles); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles) + ADD(RCycles, RCycles, cycles); + else + ConstantCycles += cycles; +} + +} \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h new file mode 100644 index 00000000..7e135075 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -0,0 +1,234 @@ +#ifndef ARMJIT_COMPILER_H +#define ARMJIT_COMPILER_H + +#include "../ARM.h" +#include "../ARMJIT.h" + +#include "../dolphin/Arm64Emitter.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMJIT_RegisterCache.h" + +namespace ARMJIT +{ + +const Arm64Gen::ARM64Reg RCPSR = Arm64Gen::W27; +const Arm64Gen::ARM64Reg RCycles = Arm64Gen::W28; +const Arm64Gen::ARM64Reg RCPU = Arm64Gen::X29; + +struct Op2 +{ + Op2() + {} + + Op2(Arm64Gen::ARM64Reg rm) : IsImm(false) + { + Reg.Rm = rm; + Reg.ShiftType = Arm64Gen::ST_LSL; + Reg.ShiftAmount = 0; + } + + Op2(u32 imm) : IsImm(true), Imm(imm) + {} + + Op2(Arm64Gen::ARM64Reg rm, Arm64Gen::ShiftType st, int amount) : IsImm(false) + { + Reg.Rm = rm; + Reg.ShiftType = st; + Reg.ShiftAmount = amount; + } + + Arm64Gen::ArithOption ToArithOption() + { + assert(!IsImm); + return Arm64Gen::ArithOption(Reg.Rm, Reg.ShiftType, Reg.ShiftAmount); + } + + bool IsSimpleReg() + { return !IsImm && !Reg.ShiftAmount && Reg.ShiftType == Arm64Gen::ST_LSL; } + bool ImmFits12Bit() + { return IsImm && (Imm & 0xFFF == Imm); } + bool IsZero() + { return IsImm && !Imm; } + + bool IsImm; + union + { + struct + { + Arm64Gen::ARM64Reg Rm; + Arm64Gen::ShiftType ShiftType; + int ShiftAmount; + } Reg; + u32 Imm; + }; +}; + +class Compiler : Arm64Gen::ARM64XEmitter +{ +public: + typedef void (Compiler::*CompileFunc)(); + + Compiler(); + ~Compiler(); + + Arm64Gen::ARM64Reg MapReg(int reg) + { + assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG); + return RegCache.Mapping[reg]; + } + + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + + bool CanCompile(bool thumb, u16 kind); + + bool FlagsNZNeeded() + { + return CurInstr.SetFlags & 0xC; + } + + void Reset(); + + void Comp_AddCycles_C(bool forceNonConst = false); + void Comp_AddCycles_CI(u32 numI); + void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift); + void Comp_AddCycles_CD(); + void Comp_AddCycles_CDI(); + + void MovePC(); + + void LoadReg(int reg, Arm64Gen::ARM64Reg nativeReg); + void SaveReg(int reg, Arm64Gen::ARM64Reg nativeReg); + + void LoadCPSR(); + void SaveCPSR(bool markClean = true); + + void A_Comp_ALUTriOp(); + void A_Comp_ALUMovOp(); + void A_Comp_ALUCmpOp(); + + void A_Comp_Mul(); + void A_Comp_Mul_Long(); + + void A_Comp_Clz(); + + void A_Comp_MemWB(); + void A_Comp_MemHD(); + + void A_Comp_LDM_STM(); + + void A_Comp_BranchImm(); + void A_Comp_BranchXchangeReg(); + + + void T_Comp_ShiftImm(); + void T_Comp_AddSub_(); + void T_Comp_ALUImm8(); + void T_Comp_ALU(); + void T_Comp_ALU_HiReg(); + void T_Comp_AddSP(); + void T_Comp_RelAddr(); + + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + void T_Comp_LoadPCRel(); + void T_Comp_MemSPRel(); + + void T_Comp_LDMIA_STMIA(); + void T_Comp_PUSH_POP(); + + void T_Comp_BCOND(); + void T_Comp_B(); + void T_Comp_BranchXchangeReg(); + void T_Comp_BL_LONG_1(); + void T_Comp_BL_LONG_2(); + void T_Comp_BL_Merged(); + + s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + + void Comp_Mul_Mla(bool S, bool mla, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rm, Arm64Gen::ARM64Reg rs, Arm64Gen::ARM64Reg rn); + + void Comp_Compare(int op, Arm64Gen::ARM64Reg rn, Op2 op2); + void Comp_Logical(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2); + void Comp_Arithmetic(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2); + + void Comp_RetriveFlags(bool retriveCV); + + Arm64Gen::FixupBranch CheckCondition(u32 cond); + + void Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR = false); + void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); + + void A_Comp_GetOp2(bool S, Op2& op2); + + void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0); + void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); + + void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags); + + void* Gen_MemoryRoutine9(int size, bool store); + + void* Gen_MemoryRoutine9Seq(bool store, bool preinc); + void* Gen_MemoryRoutine7Seq(bool store, bool preinc); + + // 0 = switch mode, 1 = stay arm, 2 = stay thumb + void* Gen_JumpTo9(int kind); + void* Gen_JumpTo7(int kind); + + void Comp_BranchSpecialBehaviour(); + + bool Exit; + + FetchedInstr CurInstr; + bool Thumb; + u32 R15; + u32 Num; + ARM* CurCPU; + u32 ConstantCycles; + u32 CodeRegion; + + BitSet32 SavedRegs; + + u32 JitMemUseableSize; + + void* ReadBanked, *WriteBanked; + + // [size][store] + void* MemFunc9[3][2]; + void* MemFunc7[3][2]; + + // [store][pre increment] + void* MemFuncsSeq9[2][2]; + // "[code in main ram] + void* MemFuncsSeq7[2][2]; + + void* JumpToFuncs9[3]; + void* JumpToFuncs7[3]; + + RegisterCache RegCache; + + bool CPSRDirty = false; + + bool IrregularCycles = false; + +#ifdef __SWITCH__ + void* JitRWBase; + void* JitRWStart; + void* JitRXStart; +#endif +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp new file mode 100644 index 00000000..a5d0e3f3 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -0,0 +1,848 @@ +#include "ARMJIT_Compiler.h" + +#include "../Config.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +// W0 - address +// (if store) W1 - value to store +// W2 - code cycles +void* Compiler::Gen_MemoryRoutine9(int size, bool store) +{ + AlignCode16(); + void* res = GetRXPtr(); + + u32 addressMask; + switch (size) + { + case 32: addressMask = ~3; break; + case 16: addressMask = ~1; break; + case 8: addressMask = ~0; break; + } + + LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase)); + LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize)); + SUB(W3, W0, W3); + CMP(W3, W4); + FixupBranch insideDTCM = B(CC_LO); + + UBFX(W4, W0, 24, 8); + CMP(W4, 0x02); + FixupBranch outsideMainRAM = B(CC_NEQ); + ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1)); + MOVP2R(X4, NDS::MainRAM); + if (!store && size == 32) + { + LDR(W3, X3, X4); + ANDI2R(W0, W0, 3); + LSL(W0, W0, 3); + RORV(W0, W3, W0); + } + else if (store) + STRGeneric(size, W1, X3, X4); + else + LDRGeneric(size, false, W0, X3, X4); + RET(); + + SetJumpTarget(outsideMainRAM); + + LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + CMP(W0, W3); + FixupBranch insideITCM = B(CC_LO); + + if (store) + { + if (size > 8) + ANDI2R(W0, W0, addressMask); + + switch (size) + { + case 32: QuickTailCall(X4, NDS::ARM9Write32); break; + case 16: QuickTailCall(X4, NDS::ARM9Write16); break; + case 8: QuickTailCall(X4, NDS::ARM9Write8); break; + } + } + else + { + if (size == 32) + ABI_PushRegisters({0, 30}); + if (size > 8) + ANDI2R(W0, W0, addressMask); + + switch (size) + { + case 32: QuickCallFunction(X4, NDS::ARM9Read32); break; + case 16: QuickTailCall (X4, NDS::ARM9Read16); break; + case 8: QuickTailCall (X4, NDS::ARM9Read8 ); break; + } + if (size == 32) + { + ABI_PopRegisters({1, 30}); + ANDI2R(W1, W1, 3); + LSL(W1, W1, 3); + RORV(W0, W0, W1); + RET(); + } + } + + SetJumpTarget(insideDTCM); + ANDI2R(W3, W3, 0x3FFF & addressMask); + ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4); + if (!store && size == 32) + { + ANDI2R(W4, W0, 3); + LDR(W0, RCPU, W3); + LSL(W4, W4, 3); + RORV(W0, W0, W4); + } + else if (store) + STRGeneric(size, W1, RCPU, W3); + else + LDRGeneric(size, false, W0, RCPU, W3); + + RET(); + + SetJumpTarget(insideITCM); + ANDI2R(W3, W0, 0x7FFF & addressMask); + if (store) + { + LSR(W0, W3, 8); + ADDI2R(W0, W0, ExeMemRegionOffsets[exeMem_ITCM], W4); + MOVP2R(X4, CodeRanges); + ADD(X4, X4, X0, ArithOption(X0, ST_LSL, 4)); + static_assert(sizeof(AddressRange) == 16); + LDR(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); + FixupBranch null = CBZ(W4); + ABI_PushRegisters({1, 3, 30}); + QuickCallFunction(X4, InvalidateByAddr); + ABI_PopRegisters({1, 3, 30}); + SetJumpTarget(null); + } + ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4); + if (!store && size == 32) + { + ANDI2R(W4, W0, 3); + LDR(W0, RCPU, W3); + LSL(W4, W4, 3); + RORV(W0, W0, W4); + } + else if (store) + STRGeneric(size, W1, RCPU, W3); + else + LDRGeneric(size, false, W0, RCPU, W3); + RET(); + + return res; +} + +/* + W0 - base address + X1 - stack space + W2 - values count +*/ +void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc) +{ + AlignCode16(); + void* res = GetRXPtr(); + + void* loopStart = GetRXPtr(); + SUB(W2, W2, 1); + + if (preinc) + ADD(W0, W0, 4); + + LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase)); + LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize)); + SUB(W4, W0, W4); + CMP(W4, W5); + FixupBranch insideDTCM = B(CC_LO); + + LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize)); + CMP(W0, W4); + FixupBranch insideITCM = B(CC_LO); + + ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once + if (store) + { + LDR(X1, X1, ArithOption(X2, true)); + QuickCallFunction(X4, NDS::ARM9Write32); + + ABI_PopRegisters({0, 1, 2, 30}); + } + else + { + QuickCallFunction(X4, NDS::ARM9Read32); + MOV(W4, W0); + + ABI_PopRegisters({0, 1, 2, 30}); + + STR(X4, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + + SetJumpTarget(insideDTCM); + + ANDI2R(W4, W4, ~3 & 0x3FFF); + ADDI2R(X4, X4, offsetof(ARMv5, DTCM)); + if (store) + { + LDR(X5, X1, ArithOption(X2, true)); + STR(W5, RCPU, X4); + } + else + { + LDR(W5, RCPU, X4); + STR(X5, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + + SetJumpTarget(insideITCM); + + ANDI2R(W4, W0, ~3 & 0x7FFF); + + if (store) + { + LSR(W6, W4, 8); + ADDI2R(W6, W6, ExeMemRegionOffsets[exeMem_ITCM], W5); + MOVP2R(X5, CodeRanges); + ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); + static_assert(sizeof(AddressRange) == 16); + LDR(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); + FixupBranch null = CBZ(W5); + ABI_PushRegisters({0, 1, 2, 4, 30}); + MOV(W0, W6); + QuickCallFunction(X5, InvalidateByAddr); + ABI_PopRegisters({0, 1, 2, 4, 30}); + SetJumpTarget(null); + } + + ADDI2R(W4, W4, offsetof(ARMv5, ITCM), W5); + if (store) + { + LDR(X5, X1, ArithOption(X2, true)); + STR(W5, RCPU, X4); + } + else + { + LDR(W5, RCPU, X4); + STR(X5, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + return res; +} + +void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc) +{ + AlignCode16(); + void* res = GetRXPtr(); + + void* loopStart = GetRXPtr(); + SUB(W2, W2, 1); + + if (preinc) + ADD(W0, W0, 4); + + ABI_PushRegisters({0, 1, 2, 30}); + if (store) + { + LDR(X1, X1, ArithOption(X2, true)); + QuickCallFunction(X4, NDS::ARM7Write32); + ABI_PopRegisters({0, 1, 2, 30}); + } + else + { + QuickCallFunction(X4, NDS::ARM7Read32); + MOV(W4, W0); + ABI_PopRegisters({0, 1, 2, 30}); + STR(X4, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + + return res; +} + +void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + { + CurCPU->DataRead16(addr & ~0x1, &val); + if (signExtend) + val = ((s32)val << 16) >> 16; + } + else + { + CurCPU->DataRead8(addr, &val); + if (signExtend) + val = ((s32)val << 24) >> 24; + } + CurCPU->R[15] = tmpR15; + + MOVI2R(MapReg(rd), val); + + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); +} + +void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) +{ + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; + + if (flags & memop_Store) + Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + + if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) + { + Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr); + return; + } + } + + { + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memFunc = Num == 0 + ? MemFunc9[size >> 4][!!(flags & memop_Store)] + : MemFunc7[size >> 4][!!((flags & memop_Store))]; + + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + MOVP2R(X0, ptr); + if (flags & memop_Store) + STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0); + else + { + LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0); + if (size == 32 && addr & ~0x3) + ROR_(rdMapped, rdMapped, (addr & 0x3) << 3); + } + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + ARM64Reg finalAddr = W0; + if (flags & memop_Post) + { + finalAddr = rnMapped; + MOV(W0, rnMapped); + } + + if (flags & memop_Store) + MOV(W1, rdMapped); + + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might become an immediate + if (offset.IsImm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) + { + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } + + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } + + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); + + if (inlinePreparation) + { + if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4) + ANDI2R(rdMapped, W0, 3); + if (size > 8) + ANDI2R(W0, W0, addressMask); + } + QuickCallFunction(X2, memFunc); + if (!(flags & memop_Store)) + { + if (inlinePreparation && !(flags & memop_Store) && size == 32) + { + if (constLocalROR32 == 4) + { + LSL(rdMapped, rdMapped, 3); + RORV(rdMapped, W0, rdMapped); + } + else if (constLocalROR32 > 0) + ROR_(rdMapped, W0, constLocalROR32 << 3); + else + MOV(rdMapped, W0); + } + else if (flags & memop_SignExtend) + { + if (size == 16) + SXTH(rdMapped, W0); + else if (size == 8) + SXTB(rdMapped, W0); + else + assert("What's wrong with you?"); + } + else + MOV(rdMapped, W0); + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 0, false); + } + } + } +} + +void Compiler::A_Comp_MemWB() +{ + Op2 offset; + if (CurInstr.Instr & (1 << 25)) + offset = Op2(MapReg(CurInstr.A_Reg(0)), (ShiftType)((CurInstr.Instr >> 5) & 0x3), (CurInstr.Instr >> 7) & 0x1F); + else + offset = Op2(CurInstr.Instr & 0xFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, byte ? 8 : 32, flags); +} + +void Compiler::A_Comp_MemHD() +{ + bool load = CurInstr.Instr & (1 << 20); + bool signExtend; + int op = (CurInstr.Instr >> 5) & 0x3; + int size; + + if (load) + { + signExtend = op >= 2; + size = op == 2 ? 8 : 16; + } + else + { + size = 16; + signExtend = false; + } + + Op2 offset; + if (CurInstr.Instr & (1 << 22)) + offset = Op2((CurInstr.Instr & 0xF) | ((CurInstr.Instr >> 4) & 0xF0)); + else + offset = Op2(MapReg(CurInstr.A_Reg(0))); + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); +} + +void Compiler::T_Comp_MemReg() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + bool byte = op & 0x1; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), + Op2(MapReg(CurInstr.T_Reg(6))), byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemImm() +{ + int op = (CurInstr.Instr >> 11) & 0x3; + bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), + byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemRegHalf() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 16 : 8; + bool signExtend = op & 1; + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(MapReg(CurInstr.T_Reg(6))), + size, flags); +} + +void Compiler::T_Comp_MemImmHalf() +{ + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16, + load ? 0 : memop_Store); +} + +void Compiler::T_Comp_LoadPCRel() +{ + u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + + if (Config::JIT_LiteralOptimisations) + { + Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr); + Comp_AddCycles_CDI(); + } + else + { + bool negative = addr < R15; + u32 abs = negative ? R15 - addr : addr - R15; + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? memop_SubtractOffset : 0); + } +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, load ? 0 : memop_Store); +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + IrregularCycles = true; + + int regsCount = regs.Count(); + + if (regsCount == 0) + return 0; // actually not the right behaviour TODO: fix me + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + Comp_AddCycles_CD(); + + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W0, RCPSR, 0, 5); + + int i = regsCount - 1; + + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && reg >= 8 && reg < 15) + { + if (RegCache.Mapping[reg] != INVALID_REG) + MOV(W3, MapReg(reg)); + else + LoadReg(reg, W3); + MOVI2R(W1, reg - 8); + BL(ReadBanked); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3; + ARM64Reg second = W4; + + if (RegCache.Mapping[reg] != INVALID_REG) + first = MapReg(reg); + else + LoadReg(reg, W3); + + if (RegCache.Mapping[*nextReg] != INVALID_REG) + second = MapReg(*nextReg); + else + LoadReg(*nextReg, W4); + + STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + + i--; + it++; + } + else if (RegCache.Mapping[reg] != INVALID_REG) + STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + else + { + LoadReg(reg, W3); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + i--; + it++; + } + } + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + preinc ^= true; + } + else + MOV(W0, MapReg(rn)); + ADD(X1, SP, 0); + MOVI2R(W2, regsCount); + + BL(Num ? MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]); + + if (!store) + { + Comp_AddCycles_CDI(); + + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W0, RCPSR, 0, 5); + + int i = regsCount - 1; + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && reg >= 8 && reg < 15) + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + MOVI2R(W1, reg - 8); + BL(WriteBanked); + FixupBranch alreadyWritten = CBNZ(W4); + if (RegCache.Mapping[reg] != INVALID_REG) + { + MOV(MapReg(reg), W3); + RegCache.DirtyRegs |= 1 << reg; + } + else + SaveReg(reg, W3); + SetJumpTarget(alreadyWritten); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3, second = W4; + + if (RegCache.Mapping[reg] != INVALID_REG) + { + first = MapReg(reg); + if (reg != 15) + RegCache.DirtyRegs |= 1 << reg; + } + if (RegCache.Mapping[*nextReg] != INVALID_REG) + { + second = MapReg(*nextReg); + if (*nextReg != 15) + RegCache.DirtyRegs |= 1 << *nextReg; + } + + LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + + if (first == W3) + SaveReg(reg, W3); + if (second == W4) + SaveReg(*nextReg, W4); + + it++; + i--; + } + else if (RegCache.Mapping[reg] != INVALID_REG) + { + ARM64Reg mapped = MapReg(reg); + LDR(INDEX_UNSIGNED, mapped, SP, i * 8); + + if (reg != 15) + RegCache.DirtyRegs |= 1 << reg; + } + else + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + SaveReg(reg, W3); + } + + it++; + i--; + } + } + ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + + if (!store && regs[15]) + { + ARM64Reg mapped = MapReg(15); + Comp_JumpTo(mapped, Num == 0, usermode); + } + + return regsCount * 4 * (decrement ? -1 : 1); +} + +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; + if (writeback) + { + if (offset > 0) + ADD(rn, rn, offset); + else + SUB(rn, rn, -offset); + } +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + ARM64Reg sp = MapReg(13); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); + + if (offset > 0) + ADD(sp, sp, offset); + else + SUB(sp, sp, -offset); +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + ARM64Reg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + u32 regsCount = regs.Count(); + + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + { + if (offset > 0) + ADD(rb, rb, offset); + else + SUB(rb, rb, -offset); + } +} + +} \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 08e2f0a6..b8847731 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -2,6 +2,8 @@ #include +#include "Config.h" + namespace ARMInstrInfo { @@ -363,7 +365,11 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SpecialKind = special_WriteMem; if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; res.SpecialKind = special_LoadLiteral; + } if (res.Kind == tk_LDMIA || res.Kind == tk_POP) { @@ -417,7 +423,6 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 cp = ((instr >> 8) & 0xF); if ((num == 0 && cp != 15) || (num == 1 && cp != 14)) { - printf("happens\n"); data = A_UNK; res.Kind = ak_UNK; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bfc0ad97..fce9e490 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,10 +60,31 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_Branch.cpp dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp ) + + if (ARCHITECTURE STREQUAL x86_64) + target_sources(core PRIVATE + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp + ) + endif() + if (ARCHITECTURE STREQUAL ARM64) + target_sources(core PRIVATE + dolphin/Arm64Emitter.cpp + dolphin/MathUtil.cpp + + ARMJIT_A64/ARMJIT_Compiler.cpp + ARMJIT_A64/ARMJIT_ALU.cpp + ARMJIT_A64/ARMJIT_LoadStore.cpp + ARMJIT_A64/ARMJIT_Branch.cpp + ) + endif() endif() if (WIN32) diff --git a/src/dolphin/Align.h b/src/dolphin/Align.h new file mode 100644 index 00000000..40c45766 --- /dev/null +++ b/src/dolphin/Align.h @@ -0,0 +1,24 @@ +// This file is under the public domain. + +#pragma once + +#include +#include + +namespace Common +{ +template +constexpr T AlignUp(T value, size_t size) +{ + static_assert(std::is_unsigned(), "T must be an unsigned value."); + return static_cast(value + (size - value % size) % size); +} + +template +constexpr T AlignDown(T value, size_t size) +{ + static_assert(std::is_unsigned(), "T must be an unsigned value."); + return static_cast(value - value % size); +} + +} // namespace Common diff --git a/src/dolphin/Arm64Emitter.cpp b/src/dolphin/Arm64Emitter.cpp new file mode 100644 index 00000000..dbcf4257 --- /dev/null +++ b/src/dolphin/Arm64Emitter.cpp @@ -0,0 +1,4466 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include +#include +#include +#include +#include + +#include "Align.h" +#include "Arm64Emitter.h" +#include "Assert.h" +#include "BitUtils.h" +#include "../types.h" +#include "MathUtil.h" + +namespace Arm64Gen +{ +namespace +{ +const int kWRegSizeInBits = 32; +const int kXRegSizeInBits = 64; + +// The below few functions are taken from V8. +int CountLeadingZeros(uint64_t value, int width) +{ + // TODO(jbramley): Optimize this for ARM64 hosts. + int count = 0; + uint64_t bit_test = 1ULL << (width - 1); + while ((count < width) && ((bit_test & value) == 0)) + { + count++; + bit_test >>= 1; + } + return count; +} + +uint64_t LargestPowerOf2Divisor(uint64_t value) +{ + return value & -(int64_t)value; +} + +// For ADD/SUB +bool IsImmArithmetic(uint64_t input, u32* val, bool* shift) +{ + if (input < 4096) + { + *val = input; + *shift = false; + return true; + } + else if ((input & 0xFFF000) == input) + { + *val = input >> 12; + *shift = true; + return true; + } + return false; +} + +// For AND/TST/ORR/EOR etc +bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s, + unsigned int* imm_r) +{ + // DCHECK((n != NULL) && (imm_s != NULL) && (imm_r != NULL)); + // DCHECK((width == kWRegSizeInBits) || (width == kXRegSizeInBits)); + + bool negate = false; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate. So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + + if (value & 1) + { + // If the low bit is 1, negate the value, and set a flag to remember that we + // did (so that we can adjust the return values appropriately). + negate = true; + value = ~value; + } + + if (width == kWRegSizeInBits) + { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // The most-significant 32 bits may not be zero (ie. negate is true) so + // shift the value left before duplicating it. + value <<= kWRegSizeInBits; + value |= value >> kWRegSizeInBits; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + uint64_t a = LargestPowerOf2Divisor(value); + uint64_t value_plus_a = value + a; + uint64_t b = LargestPowerOf2Divisor(value_plus_a); + uint64_t value_plus_a_minus_b = value_plus_a - b; + uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b); + + int d, clz_a, out_n; + uint64_t mask; + + if (c != 0) + { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + int clz_c = CountLeadingZeros(c, kXRegSizeInBits); + d = clz_a - clz_c; + mask = ((UINT64_C(1) << d) - 1); + out_n = 0; + } + else + { + // Handle degenerate cases. + // + // If any of those 'find lowest set bit' operations didn't find a set bit at + // all, then the word will have been zero thereafter, so in particular the + // last lowest_set_bit operation will have returned zero. So we can test for + // all the special case conditions in one go by seeing if c is zero. + if (a == 0) + { + // The input was zero (or all 1 bits, which will come to here too after we + // inverted it at the start of the function), for which we just return + // false. + return false; + } + else + { + // Otherwise, if c was zero but a was not, then there's just one stretch + // of set bits in our word, meaning that we have the trivial case of + // d == 64 and only one 'repetition'. Set up all the same variables as in + // the general case above, and set the N bit in the output. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + d = 64; + mask = ~UINT64_C(0); + out_n = 1; + } + } + + // If the repeat period d is not a power of two, it can't be encoded. + if (!MathUtil::IsPow2(d)) + return false; + + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + if (((b - a) & ~mask) != 0) + return false; + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). + static const std::array multipliers = {{ + 0x0000000000000001UL, + 0x0000000100000001UL, + 0x0001000100010001UL, + 0x0101010101010101UL, + 0x1111111111111111UL, + 0x5555555555555555UL, + }}; + + int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57; + + // Ensure that the index to the multipliers array is within bounds. + DEBUG_ASSERT((multiplier_idx >= 0) && (static_cast(multiplier_idx) < multipliers.size())); + + uint64_t multiplier = multipliers[multiplier_idx]; + uint64_t candidate = (b - a) * multiplier; + + // The candidate pattern doesn't match our input value, so fail. + if (value != candidate) + return false; + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). + int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits); + int s = clz_a - clz_b; + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + int r; + if (negate) + { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + s = d - s; + r = (clz_b + 1) & (d - 1); + } + else + { + r = (clz_a + 1) & (d - 1); + } + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (-d << 1) with our computed s to form imms. + *n = out_n; + *imm_s = ((-d << 1) | (s - 1)) & 0x3f; + *imm_r = r; + + return true; +} + +float FPImm8ToFloat(u8 bits) +{ + const u32 sign = bits >> 7; + const u32 bit6 = (bits >> 6) & 1; + const u32 exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3); + const u32 mantissa = (bits & 0xF) << 19; + const u32 f = (sign << 31) | (exp << 23) | mantissa; + + return Common::BitCast(f); +} + +bool FPImm8FromFloat(float value, u8* imm_out) +{ + const u32 f = Common::BitCast(value); + const u32 mantissa4 = (f & 0x7FFFFF) >> 19; + const u32 exponent = (f >> 23) & 0xFF; + const u32 sign = f >> 31; + + if ((exponent >> 7) == ((exponent >> 6) & 1)) + return false; + + const u8 imm8 = (sign << 7) | ((!(exponent >> 7)) << 6) | ((exponent & 3) << 4) | mantissa4; + const float new_float = FPImm8ToFloat(imm8); + if (new_float == value) + *imm_out = imm8; + else + return false; + + return true; +} +} // Anonymous namespace + +void ARM64XEmitter::SetCodePtrUnsafe(ptrdiff_t ptr) +{ + m_code = ptr; +} + +void ARM64XEmitter::SetCodePtr(ptrdiff_t ptr) +{ + SetCodePtrUnsafe(ptr); + m_lastCacheFlushEnd = ptr; +} + +void ARM64XEmitter::SetCodeBase(u8* rwbase, u8* rxbase) +{ + m_code = 0; + m_lastCacheFlushEnd = 0; + m_rwbase = rwbase; + m_rxbase = rxbase; +} + +ptrdiff_t ARM64XEmitter::GetCodeOffset() +{ + return m_code; +} + +const u8* ARM64XEmitter::GetRWPtr() +{ + return m_rwbase + m_code; +} + +u8* ARM64XEmitter::GetWriteableRWPtr() +{ + return m_rwbase + m_code; +} + +void* ARM64XEmitter::GetRXPtr() +{ + return m_rxbase + m_code; +} + +void ARM64XEmitter::ReserveCodeSpace(u32 bytes) +{ + for (u32 i = 0; i < bytes / 4; i++) + BRK(0); +} + +ptrdiff_t ARM64XEmitter::AlignCode16() +{ + int c = int((u64)m_code & 15); + if (c) + ReserveCodeSpace(16 - c); + return m_code; +} + +ptrdiff_t ARM64XEmitter::AlignCodePage() +{ + int c = int((u64)m_code & 4095); + if (c) + ReserveCodeSpace(4096 - c); + return m_code; +} + +void ARM64XEmitter::Write32(u32 value) +{ + std::memcpy(m_rwbase + m_code, &value, sizeof(u32)); + m_code += sizeof(u32); +} + +void ARM64XEmitter::FlushIcache() +{ + FlushIcacheSection(m_rxbase + m_lastCacheFlushEnd, m_rxbase + m_code); + m_lastCacheFlushEnd = m_code; +} + +void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end) +{ + if (start == end) + return; + +#if defined(IOS) + // Header file says this is equivalent to: sys_icache_invalidate(start, end - start); + sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start); +#else + // Don't rely on GCC's __clear_cache implementation, as it caches + // icache/dcache cache line sizes, that can vary between cores on + // big.LITTLE architectures. + u64 addr, ctr_el0; + static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff; + size_t isize, dsize; + + __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0)); + isize = 4 << ((ctr_el0 >> 0) & 0xf); + dsize = 4 << ((ctr_el0 >> 16) & 0xf); + + // use the global minimum cache line size + icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize; + dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize; + + addr = (u64)start & ~(u64)(dsize - 1); + for (; addr < (u64)end; addr += dsize) + // use "civac" instead of "cvau", as this is the suggested workaround for + // Cortex-A53 errata 819472, 826319, 827319 and 824069. + __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory"); + __asm__ volatile("dsb ish" : : : "memory"); + + addr = (u64)start & ~(u64)(isize - 1); + for (; addr < (u64)end; addr += isize) + __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory"); + + __asm__ volatile("dsb ish" : : : "memory"); + __asm__ volatile("isb" : : : "memory"); +#endif +} + +// Exception generation +static const u32 ExcEnc[][3] = { + {0, 0, 1}, // SVC + {0, 0, 2}, // HVC + {0, 0, 3}, // SMC + {1, 0, 0}, // BRK + {2, 0, 0}, // HLT + {5, 0, 1}, // DCPS1 + {5, 0, 2}, // DCPS2 + {5, 0, 3}, // DCPS3 +}; + +// Arithmetic generation +static const u32 ArithEnc[] = { + 0x058, // ADD + 0x258, // SUB +}; + +// Conditional Select +static const u32 CondSelectEnc[][2] = { + {0, 0}, // CSEL + {0, 1}, // CSINC + {1, 0}, // CSINV + {1, 1}, // CSNEG +}; + +// Data-Processing (1 source) +static const u32 Data1SrcEnc[][2] = { + {0, 0}, // RBIT + {0, 1}, // REV16 + {0, 2}, // REV32 + {0, 3}, // REV64 + {0, 4}, // CLZ + {0, 5}, // CLS +}; + +// Data-Processing (2 source) +static const u32 Data2SrcEnc[] = { + 0x02, // UDIV + 0x03, // SDIV + 0x08, // LSLV + 0x09, // LSRV + 0x0A, // ASRV + 0x0B, // RORV + 0x10, // CRC32B + 0x11, // CRC32H + 0x12, // CRC32W + 0x14, // CRC32CB + 0x15, // CRC32CH + 0x16, // CRC32CW + 0x13, // CRC32X (64bit Only) + 0x17, // XRC32CX (64bit Only) +}; + +// Data-Processing (3 source) +static const u32 Data3SrcEnc[][2] = { + {0, 0}, // MADD + {0, 1}, // MSUB + {1, 0}, // SMADDL (64Bit Only) + {1, 1}, // SMSUBL (64Bit Only) + {2, 0}, // SMULH (64Bit Only) + {5, 0}, // UMADDL (64Bit Only) + {5, 1}, // UMSUBL (64Bit Only) + {6, 0}, // UMULH (64Bit Only) +}; + +// Logical (shifted register) +static const u32 LogicalEnc[][2] = { + {0, 0}, // AND + {0, 1}, // BIC + {1, 0}, // OOR + {1, 1}, // ORN + {2, 0}, // EOR + {2, 1}, // EON + {3, 0}, // ANDS + {3, 1}, // BICS +}; + +// Load/Store Exclusive +static const u32 LoadStoreExcEnc[][5] = { + {0, 0, 0, 0, 0}, // STXRB + {0, 0, 0, 0, 1}, // STLXRB + {0, 0, 1, 0, 0}, // LDXRB + {0, 0, 1, 0, 1}, // LDAXRB + {0, 1, 0, 0, 1}, // STLRB + {0, 1, 1, 0, 1}, // LDARB + {1, 0, 0, 0, 0}, // STXRH + {1, 0, 0, 0, 1}, // STLXRH + {1, 0, 1, 0, 0}, // LDXRH + {1, 0, 1, 0, 1}, // LDAXRH + {1, 1, 0, 0, 1}, // STLRH + {1, 1, 1, 0, 1}, // LDARH + {2, 0, 0, 0, 0}, // STXR + {3, 0, 0, 0, 0}, // (64bit) STXR + {2, 0, 0, 0, 1}, // STLXR + {3, 0, 0, 0, 1}, // (64bit) STLXR + {2, 0, 0, 1, 0}, // STXP + {3, 0, 0, 1, 0}, // (64bit) STXP + {2, 0, 0, 1, 1}, // STLXP + {3, 0, 0, 1, 1}, // (64bit) STLXP + {2, 0, 1, 0, 0}, // LDXR + {3, 0, 1, 0, 0}, // (64bit) LDXR + {2, 0, 1, 0, 1}, // LDAXR + {3, 0, 1, 0, 1}, // (64bit) LDAXR + {2, 0, 1, 1, 0}, // LDXP + {3, 0, 1, 1, 0}, // (64bit) LDXP + {2, 0, 1, 1, 1}, // LDAXP + {3, 0, 1, 1, 1}, // (64bit) LDAXP + {2, 1, 0, 0, 1}, // STLR + {3, 1, 0, 0, 1}, // (64bit) STLR + {2, 1, 1, 0, 1}, // LDAR + {3, 1, 1, 0, 1}, // (64bit) LDAR +}; + +void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | (((u32)distance << 5) & 0xFFFFE0) | Rt); +} + +void ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x3FFF && distance < 0x3FFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | (bits << 19) | + (((u32)distance << 5) & 0x7FFE0) | Rt); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr) +{ + s64 distance = (s64)ptr - s64(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x2000000LL && distance <= 0x1FFFFFFLL, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF)); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn) +{ + Rn = DecodeReg(Rn); + Write32((0x6B << 25) | (opc << 21) | (op2 << 16) | (op3 << 10) | (Rn << 5) | op4); +} + +void ARM64XEmitter::EncodeExceptionInst(u32 instenc, u32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: Exception instruction too large immediate: %d", + __func__, imm); + + Write32((0xD4 << 24) | (ExcEnc[instenc][0] << 21) | (imm << 5) | (ExcEnc[instenc][1] << 2) | + ExcEnc[instenc][2]); +} + +void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt) +{ + Write32((0x354 << 22) | (op0 << 19) | (op1 << 16) | (CRn << 12) | (CRm << 8) | (op2 << 5) | Rt); +} + +void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm, ArithOption Option) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((b64Bit << 31) | (flags << 29) | (ArithEnc[instenc] << 21) | + (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? (1 << 21) : 0) | (Rm << 16) | + Option.GetData() | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0xD0 << 21) | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + bool b64Bit = Is64Bit(Rn); + + ASSERT_MSG(DYNA_REC, !(imm & ~0x1F), "%s: too large immediate: %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (imm << 16) | (cond << 12) | + (1 << 11) | (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rm); + + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (Rm << 16) | (cond << 12) | + (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (CondSelectEnc[instenc][0] << 30) | (0xD4 << 21) | (Rm << 16) | + (cond << 12) | (CondSelectEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x2D6 << 21) | (Data1SrcEnc[instenc][0] << 16) | + (Data1SrcEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x0D6 << 21) | (Rm << 16) | (Data2SrcEnc[instenc] << 10) | (Rn << 5) | + Rd); +} + +void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Ra = DecodeReg(Ra); + Write32((b64Bit << 31) | (0xD8 << 21) | (Data3SrcEnc[instenc][0] << 21) | (Rm << 16) | + (Data3SrcEnc[instenc][1] << 15) | (Ra << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Shift) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) | + (LogicalEnc[instenc][1] << 21) | Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + ASSERT_MSG(DYNA_REC, !(imm & 0xFFFFF), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + if (b64Bit && bitop != 0x2) // LDRSW(0x2) uses 64bit reg, doesn't have 64bit bit set + bitop |= 0x1; + Write32((bitop << 30) | (bVec << 26) | (0x18 << 24) | (imm << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, + ARM64Reg Rt) +{ + Rs = DecodeReg(Rs); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Rt = DecodeReg(Rt); + Write32((LoadStoreExcEnc[instenc][0] << 30) | (0x8 << 24) | (LoadStoreExcEnc[instenc][1] << 23) | + (LoadStoreExcEnc[instenc][2] << 22) | (LoadStoreExcEnc[instenc][3] << 21) | (Rs << 16) | + (LoadStoreExcEnc[instenc][4] << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool b128Bit = IsQuad(Rt); + bool bVec = IsVector(Rt); + + if (b128Bit) + imm >>= 4; + else if (b64Bit) + imm >>= 3; + else + imm >>= 2; + + ASSERT_MSG(DYNA_REC, !(imm & ~0xF), "%s: offset too large %d", __func__, imm); + + u32 opc = 0; + if (b128Bit) + opc = 2; + else if (b64Bit && bVec) + opc = 1; + else if (b64Bit && !bVec) + opc = 2; + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Write32((opc << 30) | (bVec << 26) | (op << 22) | (imm << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + u32 offset = imm & 0x1FF; + + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (offset << 12) | (op2 << 10) | (Rn << 5) | + Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + if (size == 64) + imm >>= 3; + else if (size == 32) + imm >>= 2; + else if (size == 16) + imm >>= 1; + + ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED): offset must be positive %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (imm << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: immediate out of range: %d", __func__, imm); + + Rd = DecodeReg(Rd); + Write32((b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd); +} + +void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 29) | (0x26 << 23) | (b64Bit << 22) | (immr << 16) | + (imms << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | Rm.GetData() | + (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, + ARM64Reg Rd) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s: immediate too large: %x", __func__, imm); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0x11 << 24) | (shift << 22) | (imm << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, + int n) +{ + // Sometimes Rd is fixed to SP, but can still be 32bit or 64bit. + // Use Rn to determine bitness here. + bool b64Bit = Is64Bit(Rn); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | (immr << 16) | (imms << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm) +{ + bool b64Bit = Is64Bit(Rt); + u32 type_encode = 0; + + switch (type) + { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (b64Bit) + { + op |= 0b10; + imm >>= 3; + } + else + { + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} +void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm) +{ + Rd = DecodeReg(Rd); + + Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | ((imm & 0x1FFFFC) << 3) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, + imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +static constexpr bool IsInRangeImm19(s64 distance) +{ + return (distance >= -0x40000 && distance <= 0x3FFFF); +} + +static constexpr bool IsInRangeImm14(s64 distance) +{ + return (distance >= -0x2000 && distance <= 0x1FFF); +} + +static constexpr bool IsInRangeImm26(s64 distance) +{ + return (distance >= -0x2000000 && distance <= 0x1FFFFFF); +} + +static constexpr u32 MaskImm19(s64 distance) +{ + return distance & 0x7FFFF; +} + +static constexpr u32 MaskImm14(s64 distance) +{ + return distance & 0x3FFF; +} + +static constexpr u32 MaskImm26(s64 distance) +{ + return distance & 0x3FFFFFF; +} + +// FixupBranch branching +void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch) +{ + bool Not = false; + u32 inst = 0; + s64 distance = (s64)(m_code - branch.ptr); + distance >>= 2; + + switch (branch.type) + { + case 1: // CBNZ + Not = true; + case 0: // CBZ + { + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + bool b64Bit = Is64Bit(branch.reg); + ARM64Reg reg = DecodeReg(branch.reg); + inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg; + } + break; + case 2: // B (conditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond; + break; + case 4: // TBNZ + Not = true; + case 3: // TBZ + { + ASSERT_MSG(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + ARM64Reg reg = DecodeReg(branch.reg); + inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) | + (MaskImm14(distance) << 5) | reg; + } + break; + case 5: // B (uncoditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x5 << 26) | MaskImm26(distance); + break; + case 6: // BL (unconditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x25 << 26) | MaskImm26(distance); + break; + } + + std::memcpy(m_rwbase + branch.ptr, &inst, sizeof(inst)); +} + +FixupBranch ARM64XEmitter::CBZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 0; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::CBNZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 1; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B(CCFlags cond) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 2; + branch.cond = cond; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 3; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 4; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 5; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::BL() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 6; + HINT(HINT_NOP); + return branch; +} + +// Compare and Branch +void ARM64XEmitter::CBZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(0, Rt, ptr); +} +void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(1, Rt, ptr); +} + +// Conditional Branch +void ARM64XEmitter::B(CCFlags cond, const void* ptr) +{ + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), + "%s: Received too large distance: %p->%p %" PRIi64 " %" PRIx64, __func__, m_execcode, ptr, + distance, distance); + Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond); +} + +// Test and Branch +void ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(0, Rt, bits, ptr); +} +void ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(1, Rt, bits, ptr); +} + +// Unconditional Branch +void ARM64XEmitter::B(const void* ptr) +{ + EncodeUnconditionalBranchInst(0, ptr); +} +void ARM64XEmitter::BL(const void* ptr) +{ + EncodeUnconditionalBranchInst(1, ptr); +} + +void ARM64XEmitter::QuickCallFunction(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code, + // func); + MOVI2R(scratchreg, (uintptr_t)func); + BLR(scratchreg); + } + else + { + BL(func); + } +} + +void ARM64XEmitter::QuickTailCall(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code, + // func); + MOVI2R(scratchreg, (uintptr_t)func); + BR(scratchreg); + } + else + { + B(func); + } +} + +// Unconditional Branch (register) +void ARM64XEmitter::BR(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(0, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::BLR(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(1, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::RET(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(2, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::ERET() +{ + EncodeUnconditionalBranchInst(4, 0x1F, 0, 0, SP); +} +void ARM64XEmitter::DRPS() +{ + EncodeUnconditionalBranchInst(5, 0x1F, 0, 0, SP); +} + +// Exception generation +void ARM64XEmitter::SVC(u32 imm) +{ + EncodeExceptionInst(0, imm); +} + +void ARM64XEmitter::HVC(u32 imm) +{ + EncodeExceptionInst(1, imm); +} + +void ARM64XEmitter::SMC(u32 imm) +{ + EncodeExceptionInst(2, imm); +} + +void ARM64XEmitter::BRK(u32 imm) +{ + EncodeExceptionInst(3, imm); +} + +void ARM64XEmitter::HLT(u32 imm) +{ + EncodeExceptionInst(4, imm); +} + +void ARM64XEmitter::DCPS1(u32 imm) +{ + EncodeExceptionInst(5, imm); +} + +void ARM64XEmitter::DCPS2(u32 imm) +{ + EncodeExceptionInst(6, imm); +} + +void ARM64XEmitter::DCPS3(u32 imm) +{ + EncodeExceptionInst(7, imm); +} + +// System +void ARM64XEmitter::_MSR(PStateField field, u8 imm) +{ + u32 op1 = 0, op2 = 0; + switch (field) + { + case FIELD_SPSel: + op1 = 0; + op2 = 5; + break; + case FIELD_DAIFSet: + op1 = 3; + op2 = 6; + break; + case FIELD_DAIFClr: + op1 = 3; + op2 = 7; + break; + default: + ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a imm move to"); + break; + } + EncodeSystemInst(0, op1, 4, imm, op2, WSP); +} + +static void GetSystemReg(PStateField field, int& o0, int& op1, int& CRn, int& CRm, int& op2) +{ + switch (field) + { + case FIELD_NZCV: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 2; + op2 = 0; + break; + case FIELD_FPCR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 0; + break; + case FIELD_FPSR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 1; + break; + case FIELD_PMCR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 6; + op2 = 0; + break; + case FIELD_PMCCNTR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 7; + op2 = 0; + break; + default: + ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a register move from/to"); + break; + } +} + +void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MSR: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MRS: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::CNTVCT(Arm64Gen::ARM64Reg Rt) +{ + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "CNTVCT: Rt must be 64-bit"); + + // MRS , CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt + EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt)); +} + +void ARM64XEmitter::HINT(SystemHint op) +{ + EncodeSystemInst(0, 3, 2, 0, op, WSP); +} +void ARM64XEmitter::CLREX() +{ + EncodeSystemInst(0, 3, 3, 0, 2, WSP); +} +void ARM64XEmitter::DSB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 4, WSP); +} +void ARM64XEmitter::DMB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 5, WSP); +} +void ARM64XEmitter::ISB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 6, WSP); +} + +// Add/Subtract (extended register) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(0, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(0, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(1, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(1, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm) +{ + CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm) +{ + CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +// Add/Subtract (with carry) +void ARM64XEmitter::ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(0, false, Rd, Rn, Rm); +} +void ARM64XEmitter::ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(0, true, Rd, Rn, Rm); +} +void ARM64XEmitter::SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(1, false, Rd, Rn, Rm); +} +void ARM64XEmitter::SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(1, true, Rd, Rn, Rm); +} + +// Conditional Compare (immediate) +void ARM64XEmitter::CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareImmInst(0, Rn, imm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareImmInst(1, Rn, imm, nzcv, cond); +} + +// Conditiona Compare (register) +void ARM64XEmitter::CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareRegInst(0, Rn, Rm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareRegInst(1, Rn, Rm, nzcv, cond); +} + +// Conditional Select +void ARM64XEmitter::CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(0, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(1, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(2, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(3, Rd, Rn, Rm, cond); +} + +// Data-Processing 1 source +void ARM64XEmitter::RBIT(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(0, Rd, Rn); +} +void ARM64XEmitter::REV16(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(1, Rd, Rn); +} +void ARM64XEmitter::REV32(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(2, Rd, Rn); +} +void ARM64XEmitter::REV64(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(3, Rd, Rn); +} +void ARM64XEmitter::CLZ(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(4, Rd, Rn); +} +void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(5, Rd, Rn); +} + +// Data-Processing 2 source +void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(0, Rd, Rn, Rm); +} +void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(1, Rd, Rn, Rm); +} +void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(2, Rd, Rn, Rm); +} +void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(3, Rd, Rn, Rm); +} +void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(4, Rd, Rn, Rm); +} +void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(5, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(6, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(7, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(8, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(9, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(10, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(11, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(12, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(13, Rd, Rn, Rm); +} + +// Data-Processing 3 source +void ARM64XEmitter::MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(2, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + SMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(3, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(4, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(5, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + UMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(6, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(7, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, SP); +} + +// Logical (shifted register) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(0, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(1, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(2, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(3, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(4, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(5, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(6, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(7, Rd, Rn, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm) +{ + if (IsGPR(Rd) && IsGPR(Rm)) + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); + else + ASSERT_MSG(DYNA_REC, false, "Non-GPRs not supported in MOV"); +} +void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm) +{ + ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); +} +void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, (bits - shift) & (bits - 1), bits - shift - 1); +} +void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + SBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + EXTR(Rd, Rm, Rm, shift); +} + +// Logical (immediate) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert); +} + +// Add/subtract (immediate) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? SP : WSP); +} + +// Data Processing (Immediate) +void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + EncodeMOVWideInst(2, Rd, imm, pos); +} +void ARM64XEmitter::MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + EncodeMOVWideInst(0, Rd, imm, pos); +} +void ARM64XEmitter::MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + EncodeMOVWideInst(3, Rd, imm, pos); +} + +// Bitfield move +void ARM64XEmitter::BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + EncodeBitfieldMOVInst(1, Rd, Rn, immr, imms); +} +void ARM64XEmitter::SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + EncodeBitfieldMOVInst(0, Rd, Rn, immr, imms); +} +void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms); +} + +void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) +{ + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG(DYNA_REC, (lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) +{ + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG(DYNA_REC, (lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift) +{ + bool sf = Is64Bit(Rd); + bool N = sf; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rm << 5) | Rd); +} +void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn) +{ + SBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn) +{ + SBFM(Rd, Rn, 0, 15); +} +void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "%s requires 64bit register as destination", __func__); + SBFM(Rd, Rn, 0, 31); +} +void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn) +{ + UBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn) +{ + UBFM(Rd, Rn, 0, 15); +} + +// Load Register (Literal) +void ARM64XEmitter::LDR(ARM64Reg Rt, u32 imm) +{ + EncodeLoadRegisterInst(0, Rt, imm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, u32 imm) +{ + EncodeLoadRegisterInst(2, Rt, imm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, u32 imm) +{ + EncodeLoadRegisterInst(3, Rt, imm); +} + +// Load/Store pair +void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm); +} + +// Load/Store Exclusive +void ARM64XEmitter::STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(0, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(1, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(2, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(3, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(4, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(5, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(6, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(7, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(8, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(9, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(10, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(11, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(12 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(14 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(16 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(18 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(20 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(22 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(24 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(26 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(28 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(30 + Is64Bit(Rt), SP, SP, Rt, Rn); +} + +// Load/Store no-allocate pair (offset) +void ARM64XEmitter::STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA0, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA1, Rt, Rt2, Rn, imm); +} + +// Load/Store register (immediate post-indexed) +// XXX: Most of these support vectors +void ARM64XEmitter::STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E4, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E5, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E6 : 0x0E7, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E2 : 0x0E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E4, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E5, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E6 : 0x1E7, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E2 : 0x1E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E4 : 0x2E4, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E0 : 0x2E0, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E5 : 0x2E5, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E1 : 0x2E1, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x2E6, Rt, Rn, imm, 32); + else + EncodeLoadStoreIndexedInst(0x2E2, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} + +// Load/Store register (register offset) +void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm); +} + +// Load/Store register (unscaled offset) +void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !Is64Bit(Rt), "%s must have a 64bit destination register!", __func__); + EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm); +} + +void ARM64XEmitter::LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + switch (size | signExtend) + { + case 32: LDR (Rt, Rn, Rm); break; + case 33: LDRSW(Rt, Rn, Rm); break; + case 16: LDRH (Rt, Rn, Rm); break; + case 17: LDRSH(Rt, Rn, Rm); break; + case 8: LDRB (Rt, Rn, Rm); break; + case 9: LDRSB(Rt, Rn, Rm); break; + default: PanicAlert("LDRGeneric(reg): invalid size %d", size); break; + } +} +void ARM64XEmitter::STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + switch (size) + { + case 32: STR (Rt, Rn, Rm); break; + case 16: STRH (Rt, Rn, Rm); break; + case 8: STRB (Rt, Rn, Rm); break; + default: PanicAlert("STRGeneric(reg): invalid size %d", size); break; + } +} + +void ARM64XEmitter::LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + switch (size | signExtend) + { + case 32: LDR (type, Rt, Rn, imm); break; + case 33: LDRSW(type, Rt, Rn, imm); break; + case 16: LDRH (type, Rt, Rn, imm); break; + case 17: LDRSH(type, Rt, Rn, imm); break; + case 8: LDRB (type, Rt, Rn, imm); break; + case 9: LDRSB(type, Rt, Rn, imm); break; + default: PanicAlert("LDRGeneric(imm): invalid size %d", size); break; + } +} +void ARM64XEmitter::STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + switch (size) + { + case 32: STR (type, Rt, Rn, imm); break; + case 16: STRH (type, Rt, Rn, imm); break; + case 8: STRB (type, Rt, Rn, imm); break; + default: PanicAlert("STRGeneric(imm): invalid size %d", size); break; + } +} + +// Address of label/page PC-relative +void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm) +{ + EncodeAddressInst(0, Rd, imm); +} +void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm) +{ + EncodeAddressInst(1, Rd, imm >> 12); +} + +// Wrapper around MOVZ+MOVK (and later MOVN) +void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize) +{ + unsigned int parts = Is64Bit(Rd) ? 4 : 2; + BitSet32 upload_part(0); + + // Always start with a movz! Kills the dependency on the register. + bool use_movz = true; + + if (!imm) + { + // Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks + // clearer in disasm too. + MOVZ(Rd, 0, SHIFT_0); + return; + } + + if ((Is64Bit(Rd) && imm == std::numeric_limits::max()) || + (!Is64Bit(Rd) && imm == std::numeric_limits::max())) + { + // Max unsigned value (or if signed, -1) + // Set to ~ZR + ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP; + ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0)); + return; + } + + // TODO: Make some more systemic use of MOVN, but this will take care of most cases. + // Small negative integer. Use MOVN + if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm) + { + MOVN(Rd, ~imm, SHIFT_0); + return; + } + + // XXX: Use MOVN when possible. + // XXX: Optimize more + // XXX: Support rotating immediates to save instructions + if (optimize) + { + for (unsigned int i = 0; i < parts; ++i) + { + if ((imm >> (i * 16)) & 0xFFFF) + upload_part[i] = 1; + } + } + + u64 aligned_pc = (u64)(m_rxbase + m_code) & ~0xFFF; +s64 aligned_offset = (s64)imm - (s64)aligned_pc; + // The offset for ADR/ADRP is an s32, so make sure it can be represented in that + if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL) + { + // Immediate we are loading is within 4GB of our aligned range + // Most likely a address that we can load in one or two instructions + if (!(std::abs(aligned_offset) & 0xFFF)) + { + // Aligned ADR + ADRP(Rd, (s32)aligned_offset); + return; + } + else + { + // If the address is within 1MB of PC we can load it in a single instruction still + s64 offset = (s64)imm - (s64)(m_rxbase + m_code); + if (offset >= -0xFFFFF && offset <= 0xFFFFF) + { + ADR(Rd, (s32)offset); + return; + } + else + { + ADRP(Rd, (s32)(aligned_offset & ~0xFFF)); + ADD(Rd, Rd, imm & 0xFFF); + return; + } + } + } + + for (unsigned i = 0; i < parts; ++i) + { + if (use_movz && upload_part[i]) + { + MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i); + use_movz = false; + } + else + { + if (upload_part[i] || !optimize) + MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i); + } + } +} + +bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2) +{ + // TODO: Also optimize for performance, not just for code size. + ptrdiff_t start_offset = GetCodeOffset(); + + MOVI2R(Rd, imm1); + int size1 = GetCodeOffset() - start_offset; + + SetCodePtrUnsafe(start_offset); + + MOVI2R(Rd, imm2); + int size2 = GetCodeOffset() - start_offset; + + SetCodePtrUnsafe(start_offset); + + bool element = size1 > size2; + + MOVI2R(Rd, element ? imm2 : imm1); + + return element; +} + +void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers) +{ + int num_regs = registers.Count(); + int stack_size = (num_regs + (num_regs & 1)) * 8; + auto it = registers.begin(); + + if (!num_regs) + return; + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. + // Only update the SP on the last write to avoid the dependency between those stores. + + // The first push must adjust the SP, else a context switch may invalidate everything below SP. + if (num_regs & 1) + { + STR(INDEX_PRE, (ARM64Reg)(X0 + *it++), SP, -stack_size); + } + else + { + ARM64Reg first_reg = (ARM64Reg)(X0 + *it++); + ARM64Reg second_reg = (ARM64Reg)(X0 + *it++); + STP(INDEX_PRE, first_reg, second_reg, SP, -stack_size); + } + + // Fast store for all other registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) + { + ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++); + ARM64Reg even_reg = (ARM64Reg)(X0 + *it++); + STP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1)); + } + + ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__); +} + +void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask) +{ + int num_regs = registers.Count(); + int stack_size = (num_regs + (num_regs & 1)) * 8; + auto it = registers.begin(); + + if (!num_regs) + return; + + // We must adjust the SP in the end, so load the first (two) registers at least. + ARM64Reg first = (ARM64Reg)(X0 + *it++); + ARM64Reg second; + if (!(num_regs & 1)) + second = (ARM64Reg)(X0 + *it++); + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. + // Only update the SP on the last load to avoid the dependency between those loads. + + // Fast load for all but the first (two) registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) + { + ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++); + ARM64Reg even_reg = (ARM64Reg)(X0 + *it++); + LDP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1)); + } + + // Post loading the first (two) registers. + if (num_regs & 1) + LDR(INDEX_POST, first, SP, stack_size); + else + LDP(INDEX_POST, first, second, SP, stack_size); + + ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__); +} + +// Float Emitter +void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, + ARM64Reg Rn, s32 imm) +{ + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + u32 encoded_size = 0; + u32 encoded_imm = 0; + + if (size == 8) + encoded_size = 0; + else if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + else if (size == 128) + encoded_size = 0; + + if (type == INDEX_UNSIGNED) + { + ASSERT_MSG(DYNA_REC, !(imm & ((size - 1) >> 3)), + "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! (%d) (%p)", __func__, + imm, m_emit->GetCodePtr()); + ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!", + __func__); + if (size == 16) + imm >>= 1; + else if (size == 32) + imm >>= 2; + else if (size == 64) + imm >>= 3; + else if (size == 128) + imm >>= 4; + encoded_imm = (imm & 0xFFF); + } + else + { + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), + "%s immediate offset must be within range of -256 to 256!", __func__); + encoded_imm = (imm & 0x1FF) << 2; + if (type == INDEX_POST) + encoded_imm |= 1; + else + encoded_imm |= 3; + } + + Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) | + (size == 128 ? (1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (Rm << 16) | (opcode << 12) | + (1 << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rd); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (Rm << 16) | + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (op << 29) | (0b111 << 25) | (imm5 << 16) | (imm4 << 11) | (1 << 10) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (0b1101 << 24) | (L << 22) | (R << 21) | (opcode << 13) | (S << 12) | + (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | (opcode << 13) | + (S << 12) | (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, + ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, Rn <= SP, "%s only supports GPR as source!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | (opcode << 16) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, + bool sign) +{ + DEBUG_ASSERT_MSG(DYNA_REC, IsScalar(Rn), "fcvts: Rn must be floating point"); + if (IsGPR(Rd)) + { + // Use the encoding that transfers the result to a GPR. + bool sf = Is64Bit(Rd); + int type = IsDouble(Rn) ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = (sign ? 1 : 0); + int rmode = 0; + switch (round) + { + case ROUND_A: + rmode = 0; + opcode |= 4; + break; + case ROUND_P: + rmode = 1; + break; + case ROUND_M: + rmode = 2; + break; + case ROUND_Z: + rmode = 3; + break; + case ROUND_N: + rmode = 0; + break; + } + EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn); + } + else + { + // Use the encoding (vector, single) that keeps the result in the fp register. + int sz = IsDouble(Rn); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = 0; + switch (round) + { + case ROUND_A: + opcode = 0x1C; + break; + case ROUND_N: + opcode = 0x1A; + break; + case ROUND_M: + opcode = 0x1B; + break; + case ROUND_P: + opcode = 0x1A; + sz |= 2; + break; + case ROUND_Z: + opcode = 0x1B; + sz |= 2; + break; + } + Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) | + (Rn << 5) | Rd); + } +} + +void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, false); +} + +void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, true); +} + +void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, + u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | (rmode << 19) | + (opcode << 16) | (scale << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rn), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rn); + + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (op << 14) | + (1 << 13) | (Rn << 5) | opcode2); +} + +void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (cond << 12) | + (3 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + + bool quad = IsQuad(Rd); + + u32 encoded_size = 0; + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + bool is_double = !IsSingle(Rd); + + Rd = DecodeReg(Rd); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | (imm8 << 13) | + (1 << 12) | (imm5 << 5) | Rd); +} + +void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, immh, "%s bad encoding! Can't have zero immh", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, + ARM64Reg Rn) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | (encoded_size << 10) | (Rn << 5) | + Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | + (encoded_size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, + ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | (Rm << 16) | + (opcode << 12) | (H << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, + imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, + ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + u32 type_encode = 0; + u32 opc = 0; + + switch (type) + { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (size == 128) + { + ASSERT_MSG(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __func__, imm); + opc = 2; + imm >>= 4; + } + else if (size == 64) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __func__, imm); + opc = 1; + imm >>= 3; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __func__, imm); + opc = 0; + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((opc << 30) | (0b1011 << 26) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + ASSERT_MSG(DYNA_REC, Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, + "%s must contain an extended reg as Rm!", __func__); + + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + if (load) + encoded_op |= 1; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((encoded_size << 30) | (encoded_op << 22) | (0b111100001 << 21) | (decoded_Rm << 16) | + Rm.GetData() | (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) +{ + union + { + u8 hex; + struct + { + unsigned defgh : 5; + unsigned abc : 3; + }; + } v; + v.hex = abcdefgh; + Rd = DecodeReg(Rd); + Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.abc << 16) | (cmode << 12) | (o2 << 11) | + (1 << 10) | (v.defgh << 5) | Rd); +} + +void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); +} +void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm); +} + +// Loadstore unscaled +void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 1; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 1; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 1; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 1; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 3; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} +void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} + +// Loadstore single structure +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +// Loadstore multiple structure +void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); +} +void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm); +} + +// Scalar - 1 Source +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top) +{ + if (IsScalar(Rd) && IsScalar(Rn)) + { + EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn); + } + else + { + ASSERT_MSG(DYNA_REC, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads"); + int rmode = 0; + int opcode = 6; + int sf = 0; + if (IsSingle(Rd) && !Is64Bit(Rn) && !top) + { + // GPR to scalar single + opcode |= 1; + } + else if (!Is64Bit(Rd) && IsSingle(Rn) && !top) + { + // Scalar single to GPR - defaults are correct + } + else + { + // TODO + ASSERT_MSG(DYNA_REC, 0, "FMOV: Unhandled case"); + } + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((sf << 31) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd); + } +} + +// Loadstore paired +void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm); +} +void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm); +} + +// Loadstore register offset +void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn); +} +void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn); +} +void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn); +} + +// Scalar - 2 Source +void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0); +} +void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1); +} +void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2); +} +void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3); +} + +void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra, int opcode) +{ + int type = isDouble ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Ra = DecodeReg(Ra); + int o1 = opcode >> 1; + int o0 = opcode & 1; + m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | (Ra << 10) | + (Rn << 5) | Rd); +} + +// Scalar floating point immediate +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8) +{ + EmitScalarImm(0, 0, 0, 0, Rd, imm8); +} + +// Vector +void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 0, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 1, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn); +} +void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn); +} +void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn); +} +void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10010, Rd, Rn); +} +void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10010, Rd, Rn); +} + +// Move +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + imm5 = 1; + else if (size == 16) + imm5 = 2; + else if (size == 32) + imm5 = 4; + else if (size == 64) + imm5 = 8; + + EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(1, 0, imm5, 3, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2) +{ + u32 imm5 = 0, imm4 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index1 << 1; + imm4 = index2; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index1 << 2; + imm4 = index2 << 1; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index1 << 3; + imm4 = index2 << 2; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index1 << 4; + imm4 = index2 << 3; + } + + EmitCopy(1, 1, imm5, imm4, Rd, Rn); +} + +void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, !(b64Bit && size != 64), + "%s must have a size of 64 when destination is 64bit!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(b64Bit, 0, imm5, 7, Rd, Rn); +} +void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + + EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn); +} + +// One source +void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 dst_encoding = 0; + u32 src_encoding = 0; + + if (size_to == 16) + dst_encoding = 3; + else if (size_to == 32) + dst_encoding = 0; + else if (size_to == 64) + dst_encoding = 1; + + if (size_from == 16) + src_encoding = 3; + else if (size_from == 32) + src_encoding = 0; + else if (size_from == 64) + src_encoding = 1; + + Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn); +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = false; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + EmitConversion(sf, 0, type, 0, 2, Rd, Rn); + } +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = true; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 3, Rd, Rn); + } +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0, Rn, Rm); +} +void ARM64FloatEmitter::FCMP(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 8, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0x10, Rn, Rm); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 0x18, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); +} + +void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EmitCondSelect(0, 0, cond, Rd, Rn, Rm); +} + +// Permute +void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b010, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b011, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b111, Rd, Rn, Rm); +} + +// Shift by immediate +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, true); +} +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, true); +} + +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < dest_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (dest_size == 8) + { + immh = 1; + } + else if (dest_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (dest_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10000, Rd, Rn); +} + +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + SSHLL(src_size, Rd, Rn, 0, upper); +} + +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + USHLL(src_size, Rd, Rn, 0, upper); +} + +// vector x indexed element +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm); +} + +// Modified Immediate +void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 0; + u8 op = 0; + u8 abcdefgh = imm & 0xFF; + if (size == 8) + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size8) doesn't support shift!", __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size8) only supports 8bit values!", __func__); + } + else if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __func__); + switch (shift) + { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } + else // 64 + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size64) doesn't support shift!", __func__); + + op = 1; + cmode = 0xE; + abcdefgh = 0; + for (int i = 0; i < 8; ++i) + { + u8 tmp = (imm >> (i << 3)) & 0xFF; + ASSERT_MSG(DYNA_REC, tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __func__); + if (tmp == 0xFF) + abcdefgh |= (1 << i); + } + } + EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh); +} + +void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 1; + u8 op = 1; + if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + switch (shift) + { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } + else + { + ASSERT_MSG(DYNA_REC, false, "%s only supports size of {16, 32}!", __func__); + } + EncodeModImm(Q, op, cmode, 0, Rd, imm); +} + +void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp) +{ + bool bundled_loadstore = false; + + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + if (count > 1) + { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) + { + int num_regs = registers.Count(); + m_emit->SUB(SP, SP, num_regs * 16); + m_emit->ADD(tmp, SP, 0); + std::vector island_regs; + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + + // 0 = true + // 1 < 4 && registers[i + 1] true! + // 2 < 4 && registers[i + 2] true! + // 3 < 4 && registers[i + 3] true! + // 4 < 4 && registers[i + 4] false! + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp); + + i += count - 1; + } + + // Handle island registers + std::vector pair_regs; + for (auto& it : island_regs) + { + pair_regs.push_back(it); + if (pair_regs.size() == 2) + { + STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_POST, pair_regs[0], tmp, 16); + } + else + { + std::vector pair_regs; + for (auto it : registers) + { + pair_regs.push_back((ARM64Reg)(Q0 + it)); + if (pair_regs.size() == 2) + { + STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_PRE, pair_regs[0], SP, -16); + } +} +void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp) +{ + bool bundled_loadstore = false; + int num_regs = registers.Count(); + + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + if (count > 1) + { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) + { + // The temporary register is only used to indicate that we can use this code path + std::vector island_regs; + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP); + + i += count - 1; + } + + // Handle island registers + std::vector pair_regs; + for (auto& it : island_regs) + { + pair_regs.push_back(it); + if (pair_regs.size() == 2) + { + LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + LDR(128, INDEX_POST, pair_regs[0], SP, 16); + } + else + { + bool odd = num_regs % 2; + std::vector pair_regs; + for (int i = 31; i >= 0; --i) + { + if (!registers[i]) + continue; + + if (odd) + { + // First load must be a regular LDR if odd + odd = false; + LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16); + } + else + { + pair_regs.push_back((ARM64Reg)(Q0 + i)); + if (pair_regs.size() == 2) + { + LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32); + pair_regs.clear(); + } + } + } + } +} + +void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (!Is64Bit(Rn)) + imm &= 0xFFFFFFFF; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + AND(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + AND(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ORR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ORRI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ORR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + EOR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "EORI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + EOR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ANDS(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDSI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ANDS(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, + bool flags) +{ + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, imm, shift); + break; + case 1: + ADDS(Rd, Rn, imm, shift); + break; + case 2: + SUB(Rd, Rn, imm, shift); + break; + case 3: + SUBS(Rd, Rn, imm, shift); + break; + } +} + +void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch) +{ + bool has_scratch = scratch != INVALID_REG; + u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL; + bool neg_neg = negative ? false : true; + + // Fast paths, aarch64 immediate instructions + // Try them all first + if (imm <= 0xFFF) + { + AddImmediate(Rd, Rn, imm, false, negative, flags); + return; + } + if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm >> 12, true, negative, flags); + return; + } + if (imm_neg <= 0xFFF) + { + AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags); + return; + } + if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags); + return; + } + + // ADD+ADD is slower than MOVK+ADD, but inplace. + // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD. + // As this splits the addition in two parts, this must not be done on setting flags. + if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u) + { + AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false); + AddImmediate(Rd, Rd, imm >> 12, true, negative, false); + return; + } + if (!flags && (imm_neg >= 0x10000u || !has_scratch) && imm_neg < 0x1000000u) + { + AddImmediate(Rd, Rn, imm_neg & 0xFFF, false, neg_neg, false); + AddImmediate(Rd, Rd, imm_neg >> 12, true, neg_neg, false); + return; + } + + ASSERT_MSG(DYNA_REC, has_scratch, + "ADDI2R - failed to construct arithmetic immediate value from %08x, need scratch", + (u32)imm); + + negative ^= MOVI2R2(scratch, imm, imm_neg); + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, scratch); + break; + case 1: + ADDS(Rd, Rn, scratch); + break; + case 2: + SUB(Rd, Rn, scratch); + break; + case 3: + SUBS(Rd, Rn, scratch); + break; + } +} + +void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, false, scratch); +} + +void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, true, scratch); +} + +void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, false, scratch); +} + +void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, true, scratch); +} + +void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch); +} + +bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + ADD(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + SUB(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + CMP(Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + AND(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + ORR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + EOR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} + +void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) +{ + ASSERT_MSG(DYNA_REC, !IsDouble(Rd), "MOVI2F does not yet support double precision"); + uint8_t imm8; + if (value == 0.0) + { + FMOV(Rd, IsDouble(Rd) ? ZR : WZR); + if (negate) + FNEG(Rd, Rd); + // TODO: There are some other values we could generate with the float-imm instruction, like + // 1.0... + } + else if (FPImm8FromFloat(value, &imm8)) + { + FMOV(Rd, imm8); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "Failed to find a way to generate FP immediate %f without scratch", value); + if (negate) + value = -value; + + const u32 ival = Common::BitCast(value); + m_emit->MOVI2R(scratch, ival); + FMOV(Rd, scratch); + } +} + +// TODO: Quite a few values could be generated easily using the MOVI instruction and friends. +void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) +{ + // TODO: Make it work with more element sizes + // TODO: Optimize - there are shorter solution for many values + ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd)); + MOVI2F(s, value, scratch); + DUP(32, Rd, Rd, 0); +} + +} // namespace Arm64Gen diff --git a/src/dolphin/Arm64Emitter.h b/src/dolphin/Arm64Emitter.h new file mode 100644 index 00000000..4cb9ff7c --- /dev/null +++ b/src/dolphin/Arm64Emitter.h @@ -0,0 +1,1152 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include + +#include "ArmCommon.h" +#include "Assert.h" +#include "BitSet.h" +#include "Compat.h" + +namespace Arm64Gen +{ +// X30 serves a dual purpose as a link register +// Encoded as +// Types: +// 000 - 32bit GPR +// 001 - 64bit GPR +// 010 - VFP single precision +// 100 - VFP double precision +// 110 - VFP quad precision +enum ARM64Reg +{ + // 32bit registers + W0 = 0, + W1, + W2, + W3, + W4, + W5, + W6, + W7, + W8, + W9, + W10, + W11, + W12, + W13, + W14, + W15, + W16, + W17, + W18, + W19, + W20, + W21, + W22, + W23, + W24, + W25, + W26, + W27, + W28, + W29, + W30, + + WSP, // 32bit stack pointer + + // 64bit registers + X0 = 0x20, + X1, + X2, + X3, + X4, + X5, + X6, + X7, + X8, + X9, + X10, + X11, + X12, + X13, + X14, + X15, + X16, + X17, + X18, + X19, + X20, + X21, + X22, + X23, + X24, + X25, + X26, + X27, + X28, + X29, + X30, + + SP, // 64bit stack pointer + + // VFP single precision registers + S0 = 0x40, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + S8, + S9, + S10, + S11, + S12, + S13, + S14, + S15, + S16, + S17, + S18, + S19, + S20, + S21, + S22, + S23, + S24, + S25, + S26, + S27, + S28, + S29, + S30, + S31, + + // VFP Double Precision registers + D0 = 0x80, + D1, + D2, + D3, + D4, + D5, + D6, + D7, + D8, + D9, + D10, + D11, + D12, + D13, + D14, + D15, + D16, + D17, + D18, + D19, + D20, + D21, + D22, + D23, + D24, + D25, + D26, + D27, + D28, + D29, + D30, + D31, + + // ASIMD Quad-Word registers + Q0 = 0xC0, + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, + Q9, + Q10, + Q11, + Q12, + Q13, + Q14, + Q15, + Q16, + Q17, + Q18, + Q19, + Q20, + Q21, + Q22, + Q23, + Q24, + Q25, + Q26, + Q27, + Q28, + Q29, + Q30, + Q31, + + // For PRFM(prefetch memory) encoding + // This is encoded in the Rt register + // Data preload + PLDL1KEEP = 0, + PLDL1STRM, + PLDL2KEEP, + PLDL2STRM, + PLDL3KEEP, + PLDL3STRM, + // Instruction preload + PLIL1KEEP = 8, + PLIL1STRM, + PLIL2KEEP, + PLIL2STRM, + PLIL3KEEP, + PLIL3STRM, + // Prepare for store + PLTL1KEEP = 16, + PLTL1STRM, + PLTL2KEEP, + PLTL2STRM, + PLTL3KEEP, + PLTL3STRM, + + WZR = WSP, + ZR = SP, + + INVALID_REG = 0xFFFFFFFF +}; + +constexpr bool Is64Bit(ARM64Reg reg) +{ + return (reg & 0x20) != 0; +} +constexpr bool IsSingle(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x40; +} +constexpr bool IsDouble(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x80; +} +constexpr bool IsScalar(ARM64Reg reg) +{ + return IsSingle(reg) || IsDouble(reg); +} +constexpr bool IsQuad(ARM64Reg reg) +{ + return (reg & 0xC0) == 0xC0; +} +constexpr bool IsVector(ARM64Reg reg) +{ + return (reg & 0xC0) != 0; +} +constexpr bool IsGPR(ARM64Reg reg) +{ + return static_cast(reg) < 0x40; +} + +constexpr ARM64Reg DecodeReg(ARM64Reg reg) +{ + return static_cast(reg & 0x1F); +} +constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg) +{ + return static_cast(reg | 0x20); +} +constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg) +{ + return static_cast(DecodeReg(reg) + S0); +} +constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg) +{ + return static_cast((reg & ~0xC0) | 0x80); +} +constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg) +{ + return static_cast(reg | 0xC0); +} + +enum OpType +{ + TYPE_IMM = 0, + TYPE_REG, + TYPE_IMMSREG, + TYPE_RSR, + TYPE_MEM +}; + +enum ShiftType +{ + ST_LSL = 0, + ST_LSR = 1, + ST_ASR = 2, + ST_ROR = 3, +}; + +enum IndexType +{ + INDEX_UNSIGNED, + INDEX_POST, + INDEX_PRE, + INDEX_SIGNED, // used in LDP/STP +}; + +enum ShiftAmount +{ + SHIFT_0 = 0, + SHIFT_16 = 1, + SHIFT_32 = 2, + SHIFT_48 = 3, +}; + +enum RoundingMode +{ + ROUND_A, // round to nearest, ties to away + ROUND_M, // round towards -inf + ROUND_N, // round to nearest, ties to even + ROUND_P, // round towards +inf + ROUND_Z, // round towards zero +}; + +struct FixupBranch +{ + ptrdiff_t ptr; + // Type defines + // 0 = CBZ (32bit) + // 1 = CBNZ (32bit) + // 2 = B (conditional) + // 3 = TBZ + // 4 = TBNZ + // 5 = B (unconditional) + // 6 = BL (unconditional) + u32 type; + + // Used with B.cond + CCFlags cond; + + // Used with TBZ/TBNZ + u8 bit; + + // Used with Test/Compare and Branch + ARM64Reg reg; +}; + +enum PStateField +{ + FIELD_SPSel = 0, + FIELD_DAIFSet, + FIELD_DAIFClr, + FIELD_NZCV, // The only system registers accessible from EL0 (user space) + FIELD_PMCR_EL0, + FIELD_PMCCNTR_EL0, + FIELD_FPCR = 0x340, + FIELD_FPSR = 0x341, +}; + +enum SystemHint +{ + HINT_NOP = 0, + HINT_YIELD, + HINT_WFE, + HINT_WFI, + HINT_SEV, + HINT_SEVL, +}; + +enum BarrierType +{ + OSHLD = 1, + OSHST = 2, + OSH = 3, + NSHLD = 5, + NSHST = 6, + NSH = 7, + ISHLD = 9, + ISHST = 10, + ISH = 11, + LD = 13, + ST = 14, + SY = 15, +}; + +class ArithOption +{ +public: + enum WidthSpecifier + { + WIDTH_DEFAULT, + WIDTH_32BIT, + WIDTH_64BIT, + }; + + enum ExtendSpecifier + { + EXTEND_UXTB = 0x0, + EXTEND_UXTH = 0x1, + EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */ + EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */ + EXTEND_SXTB = 0x4, + EXTEND_SXTH = 0x5, + EXTEND_SXTW = 0x6, + EXTEND_SXTX = 0x7, + }; + + enum TypeSpecifier + { + TYPE_EXTENDEDREG, + TYPE_IMM, + TYPE_SHIFTEDREG, + }; + +private: + ARM64Reg m_destReg; + WidthSpecifier m_width; + ExtendSpecifier m_extend; + TypeSpecifier m_type; + ShiftType m_shifttype; + u32 m_shift; + +public: + ArithOption(ARM64Reg Rd, bool index = false) + { + // Indexed registers are a certain feature of AARch64 + // On Loadstore instructions that use a register offset + // We can have the register as an index + // If we are indexing then the offset register will + // be shifted to the left so we are indexing at intervals + // of the size of what we are loading + // 8-bit: Index does nothing + // 16-bit: Index LSL 1 + // 32-bit: Index LSL 2 + // 64-bit: Index LSL 3 + if (index) + m_shift = 4; + else + m_shift = 0; + + m_destReg = Rd; + m_type = TYPE_EXTENDEDREG; + if (Is64Bit(Rd)) + { + m_width = WIDTH_64BIT; + m_extend = EXTEND_UXTX; + } + else + { + m_width = WIDTH_32BIT; + m_extend = EXTEND_UXTW; + } + m_shifttype = ST_LSL; + } + ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift) + { + m_destReg = Rd; + m_shift = shift; + m_shifttype = shift_type; + m_type = TYPE_SHIFTEDREG; + if (Is64Bit(Rd)) + { + m_width = WIDTH_64BIT; + if (shift == 64) + m_shift = 0; + } + else + { + m_width = WIDTH_32BIT; + if (shift == 32) + m_shift = 0; + } + } + TypeSpecifier GetType() const { return m_type; } + ARM64Reg GetReg() const { return m_destReg; } + u32 GetData() const + { + switch (m_type) + { + case TYPE_EXTENDEDREG: + return (m_extend << 13) | (m_shift << 10); + break; + case TYPE_SHIFTEDREG: + return (m_shifttype << 22) | (m_shift << 10); + break; + default: + DEBUG_ASSERT_MSG(DYNA_REC, false, "Invalid type in GetData"); + break; + } + return 0; + } +}; + +class ARM64XEmitter +{ + friend class ARM64FloatEmitter; + +private: + ptrdiff_t m_code; + ptrdiff_t m_lastCacheFlushEnd; + u8* m_rwbase; + u8* m_rxbase; + + void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags); + void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr); + void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr); + void EncodeUnconditionalBranchInst(u32 op, const void* ptr); + void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn); + void EncodeExceptionInst(u32 instenc, u32 imm); + void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt); + void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Option); + void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn); + void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm); + void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt); + void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size); + void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos); + void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd); + void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n); + void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm); + void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); + void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + +protected: + // TODO: make this less ugly + // used for Switch where memory is executable and writeable and different addresses + // we need to take this for relative addressing in account + + void Write32(u32 value); + +public: + ARM64XEmitter() : m_code(0), m_lastCacheFlushEnd(0), m_rwbase(nullptr), m_rxbase(nullptr) {} + ARM64XEmitter(u8* rwbase, u8* rxbase, ptrdiff_t offset) + { + m_rwbase = rwbase; + m_rxbase = rxbase; + m_code = offset; + m_lastCacheFlushEnd = offset; + } + + virtual ~ARM64XEmitter() {} + void SetCodePtr(ptrdiff_t ptr); + void SetCodePtrUnsafe(ptrdiff_t ptr); + void SetCodeBase(u8* rwbase, u8* rxbase); + void ReserveCodeSpace(u32 bytes); + ptrdiff_t AlignCode16(); + ptrdiff_t AlignCodePage(); + ptrdiff_t GetCodeOffset(); + const u8* GetRWPtr(); + u8* GetWriteableRWPtr(); + void* GetRXPtr(); + void FlushIcache(); + void FlushIcacheSection(u8* start, u8* end); + + // FixupBranch branching + void SetJumpTarget(FixupBranch const& branch); + FixupBranch CBZ(ARM64Reg Rt); + FixupBranch CBNZ(ARM64Reg Rt); + FixupBranch B(CCFlags cond); + FixupBranch TBZ(ARM64Reg Rt, u8 bit); + FixupBranch TBNZ(ARM64Reg Rt, u8 bit); + FixupBranch B(); + FixupBranch BL(); + + // Compare and Branch + void CBZ(ARM64Reg Rt, const void* ptr); + void CBNZ(ARM64Reg Rt, const void* ptr); + + // Conditional Branch + void B(CCFlags cond, const void* ptr); + + // Test and Branch + void TBZ(ARM64Reg Rt, u8 bits, const void* ptr); + void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr); + + // Unconditional Branch + void B(const void* ptr); + void BL(const void* ptr); + + // Unconditional Branch (register) + void BR(ARM64Reg Rn); + void BLR(ARM64Reg Rn); + void RET(ARM64Reg Rn = X30); + void ERET(); + void DRPS(); + + // Exception generation + void SVC(u32 imm); + void HVC(u32 imm); + void SMC(u32 imm); + void BRK(u32 imm); + void HLT(u32 imm); + void DCPS1(u32 imm); + void DCPS2(u32 imm); + void DCPS3(u32 imm); + + // System + void _MSR(PStateField field, u8 imm); + void _MSR(PStateField field, ARM64Reg Rt); + void MRS(ARM64Reg Rt, PStateField field); + void CNTVCT(ARM64Reg Rt); + + void HINT(SystemHint op); + void CLREX(); + void DSB(BarrierType type); + void DMB(BarrierType type); + void ISB(BarrierType type); + + // Add/Subtract (Extended/Shifted register) + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMN(ARM64Reg Rn, ARM64Reg Rm); + void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMP(ARM64Reg Rn, ARM64Reg Rm); + void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + + // Add/Subtract (with carry) + void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Conditional Compare (immediate) + void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + + // Conditional Compare (register) + void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + + // Conditional Select + void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Aliases + void CSET(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void CSETM(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); } + // Data-Processing 1 source + void RBIT(ARM64Reg Rd, ARM64Reg Rn); + void REV16(ARM64Reg Rd, ARM64Reg Rn); + void REV32(ARM64Reg Rd, ARM64Reg Rn); + void REV64(ARM64Reg Rd, ARM64Reg Rn); + void CLZ(ARM64Reg Rd, ARM64Reg Rn); + void CLS(ARM64Reg Rd, ARM64Reg Rn); + + // Data-Processing 2 source + void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Data-Processing 3 source + void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Logical (shifted register) + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + + // Wrap the above for saner syntax + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + // Convenience wrappers around ORR. These match the official convenience syntax. + void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift); + void MOV(ARM64Reg Rd, ARM64Reg Rm); + void MVN(ARM64Reg Rd, ARM64Reg Rm); + + // Convenience wrappers around UBFM/EXTR. + void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift); + + // Logical (immediate) + void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, ARM64Reg Rm) { ANDS(Is64Bit(Rn) ? ZR : WZR, Rn, Rm); } + // Add/subtract (immediate) + void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void CMP(ARM64Reg Rn, u32 imm, bool shift = false); + + // Data Processing (Immediate) + void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + + // Bitfield move + void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + + // Extract register (ROR with two inputs, if same then faster on A67) + void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift); + + // Aliases + void SXTB(ARM64Reg Rd, ARM64Reg Rn); + void SXTH(ARM64Reg Rd, ARM64Reg Rn); + void SXTW(ARM64Reg Rd, ARM64Reg Rn); + void UXTB(ARM64Reg Rd, ARM64Reg Rn); + void UXTH(ARM64Reg Rd, ARM64Reg Rn); + + void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { UBFM(Rd, Rn, lsb, lsb + width - 1); } + // Load Register (Literal) + void LDR(ARM64Reg Rt, u32 imm); + void LDRSW(ARM64Reg Rt, u32 imm); + void PRFM(ARM64Reg Rt, u32 imm); + + // Load/Store Exclusive + void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRB(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRB(ARM64Reg Rt, ARM64Reg Rn); + void STLRB(ARM64Reg Rt, ARM64Reg Rn); + void LDARB(ARM64Reg Rt, ARM64Reg Rn); + void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRH(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRH(ARM64Reg Rt, ARM64Reg Rn); + void STLRH(ARM64Reg Rt, ARM64Reg Rn); + void LDARH(ARM64Reg Rt, ARM64Reg Rn); + void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDXR(ARM64Reg Rt, ARM64Reg Rn); + void LDAXR(ARM64Reg Rt, ARM64Reg Rn); + void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLR(ARM64Reg Rt, ARM64Reg Rn); + void LDAR(ARM64Reg Rt, ARM64Reg Rn); + + // Load/Store no-allocate pair (offset) + void STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + + // Load/Store register (immediate indexed) + void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store register (register offset) + void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Load/Store register (unscaled offset) + void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store pair + void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + void LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + void LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Address of label/page PC-relative + void ADR(ARM64Reg Rd, s32 imm); + void ADRP(ARM64Reg Rd, s32 imm); + + // Wrapper around MOVZ+MOVK + void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true); + bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2); + template + void MOVP2R(ARM64Reg Rd, P* ptr) + { + ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers"); + MOVI2R(Rd, (uintptr_t)ptr); + } + + // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no need to pass a scratch + // register. + void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) + { + ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch); + } + void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch); + void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryCMPI2R(ARM64Reg Rn, u32 imm); + + bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + + // ABI related + void ABI_PushRegisters(BitSet32 registers); + void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0)); + + // Utility to generate a call to a std::function object. + // + // Unfortunately, calling operator() directly is undefined behavior in C++ + // (this method might be a thunk in the case of multi-inheritance) so we + // have to go through a trampoline function. + template + static T CallLambdaTrampoline(const std::function* f, Args... args) + { + return (*f)(args...); + } + + // This function expects you to have set up the state. + // Overwrites X0 and X30 + template + ARM64Reg ABI_SetupLambda(const std::function* f) + { + auto trampoline = &ARM64XEmitter::CallLambdaTrampoline; + MOVI2R(X30, (uintptr_t)trampoline); + MOVI2R(X0, (uintptr_t) const_cast((const void*)f)); + return X30; + } + + void QuickTailCall(ARM64Reg scratchreg, const void* func); + template + void QuickTailCall(ARM64Reg scratchreg, T func) + { + QuickTailCall(scratchreg, (const void*)func); + } + + // Plain function call + void QuickCallFunction(ARM64Reg scratchreg, const void* func); + template + void QuickCallFunction(ARM64Reg scratchreg, T func) + { + QuickCallFunction(scratchreg, (const void*)func); + } +}; + +class ARM64FloatEmitter +{ +public: + ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {} + void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore unscaled + void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore single structure + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + + // Loadstore multiple structure + void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + + // Loadstore paired + void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + // Loadstore register offset + void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Scalar - 1 Source + void FABS(ARM64Reg Rd, ARM64Reg Rn); + void FNEG(ARM64Reg Rd, ARM64Reg Rn); + void FSQRT(ARM64Reg Rd, ARM64Reg Rn); + void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP + + // Scalar - 2 Source + void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar - 3 Source. Note - the accumulator is last on ARM! + void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + + // Scalar floating point immediate + void FMOV(ARM64Reg Rd, uint8_t imm8); + + // Vector + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void NOT(ARM64Reg Rd, ARM64Reg Rn); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); } + void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + + // Move + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2); + void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + + // One source + void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); + + // Scalar convert float to int, in a lot of variants. + // Note that the scalar version of this operation has two encodings, one that goes to an integer + // register + // and one that outputs to a scalar fp register. + void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + + // Scalar convert int to float. No rounding mode specifier necessary. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn); + + // Scalar fixed point to float. scale is the number of fractional bits. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + + // Float comparison + void FCMP(ARM64Reg Rn, ARM64Reg Rm); + void FCMP(ARM64Reg Rn); + void FCMPE(ARM64Reg Rn, ARM64Reg Rm); + void FCMPE(ARM64Reg Rn); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + + // Conditional select + void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Permute + void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Shift by immediate + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + + // vector x indexed element + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + + // Modified Immediate + void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0); + void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0); + + void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false); + void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG); + + // ABI related + void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + +private: + ARM64XEmitter* m_emit; + inline void Write32(u32 value) { m_emit->Write32(value); } + // Emitting functions + void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); + void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn, ARM64Reg Rm); + void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, + ARM64Reg Rd, ARM64Reg Rn); + void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm); + void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8); + void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); + void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm); + void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign); + void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, + int opcode); + void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm); + void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh); + + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); +}; + +} \ No newline at end of file diff --git a/src/dolphin/ArmCommon.h b/src/dolphin/ArmCommon.h new file mode 100644 index 00000000..6d82e9d7 --- /dev/null +++ b/src/dolphin/ArmCommon.h @@ -0,0 +1,27 @@ +// Copyright 2014 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include "../types.h" + +enum CCFlags +{ + CC_EQ = 0, // Equal + CC_NEQ, // Not equal + CC_CS, // Carry Set + CC_CC, // Carry Clear + CC_MI, // Minus (Negative) + CC_PL, // Plus + CC_VS, // Overflow + CC_VC, // No Overflow + CC_HI, // Unsigned higher + CC_LS, // Unsigned lower or same + CC_GE, // Signed greater than or equal + CC_LT, // Signed less than + CC_GT, // Signed greater than + CC_LE, // Signed less than or equal + CC_AL, // Always (unconditional) 14 + CC_HS = CC_CS, // Alias of CC_CS Unsigned higher or same + CC_LO = CC_CC, // Alias of CC_CC Unsigned lower +}; +const u32 NO_COND = 0xE0000000; diff --git a/src/dolphin/BitUtils.h b/src/dolphin/BitUtils.h new file mode 100644 index 00000000..8b64a925 --- /dev/null +++ b/src/dolphin/BitUtils.h @@ -0,0 +1,254 @@ +// Copyright 2017 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include +#include + +namespace Common +{ +/// +/// Retrieves the size of a type in bits. +/// +/// @tparam T Type to get the size of. +/// +/// @return the size of the type in bits. +/// +template +constexpr size_t BitSize() noexcept +{ + return sizeof(T) * CHAR_BIT; +} + +/// +/// Extracts a bit from a value. +/// +/// @param src The value to extract a bit from. +/// @param bit The bit to extract. +/// +/// @tparam T The type of the value. +/// +/// @return The extracted bit. +/// +template +constexpr T ExtractBit(const T src, const size_t bit) noexcept +{ + return (src >> bit) & static_cast(1); +} + +/// +/// Extracts a bit from a value. +/// +/// @param src The value to extract a bit from. +/// +/// @tparam bit The bit to extract. +/// @tparam T The type of the value. +/// +/// @return The extracted bit. +/// +template +constexpr T ExtractBit(const T src) noexcept +{ + static_assert(bit < BitSize(), "Specified bit must be within T's bit width."); + + return ExtractBit(src, bit); +} + +/// +/// Extracts a range of bits from a value. +/// +/// @param src The value to extract the bits from. +/// @param begin The beginning of the bit range. This is inclusive. +/// @param end The ending of the bit range. This is inclusive. +/// +/// @tparam T The type of the value. +/// @tparam Result The returned result type. This is the unsigned analog +/// of a signed type if a signed type is passed as T. +/// +/// @return The extracted bits. +/// +template > +constexpr Result ExtractBits(const T src, const size_t begin, const size_t end) noexcept +{ + return static_cast(((static_cast(src) << ((BitSize() - 1) - end)) >> + (BitSize() - end + begin - 1))); +} + +/// +/// Extracts a range of bits from a value. +/// +/// @param src The value to extract the bits from. +/// +/// @tparam begin The beginning of the bit range. This is inclusive. +/// @tparam end The ending of the bit range. This is inclusive. +/// @tparam T The type of the value. +/// @tparam Result The returned result type. This is the unsigned analog +/// of a signed type if a signed type is passed as T. +/// +/// @return The extracted bits. +/// +template > +constexpr Result ExtractBits(const T src) noexcept +{ + static_assert(begin < end, "Beginning bit must be less than the ending bit."); + static_assert(begin < BitSize(), "Beginning bit is larger than T's bit width."); + static_assert(end < BitSize(), "Ending bit is larger than T's bit width."); + + return ExtractBits(src, begin, end); +} + +/// +/// Rotates a value left (ROL). +/// +/// @param value The value to rotate. +/// @param amount The number of bits to rotate the value. +/// @tparam T An unsigned type. +/// +/// @return The rotated value. +/// +template +constexpr T RotateLeft(const T value, size_t amount) noexcept +{ + static_assert(std::is_unsigned(), "Can only rotate unsigned types left."); + + amount %= BitSize(); + + if (amount == 0) + return value; + + return static_cast((value << amount) | (value >> (BitSize() - amount))); +} + +/// +/// Rotates a value right (ROR). +/// +/// @param value The value to rotate. +/// @param amount The number of bits to rotate the value. +/// @tparam T An unsigned type. +/// +/// @return The rotated value. +/// +template +constexpr T RotateRight(const T value, size_t amount) noexcept +{ + static_assert(std::is_unsigned(), "Can only rotate unsigned types right."); + + amount %= BitSize(); + + if (amount == 0) + return value; + + return static_cast((value >> amount) | (value << (BitSize() - amount))); +} + +/// +/// Verifies whether the supplied value is a valid bit mask of the form 0b00...0011...11. +/// Both edge cases of all zeros and all ones are considered valid masks, too. +/// +/// @param mask The mask value to test for validity. +/// +/// @tparam T The type of the value. +/// +/// @return A bool indicating whether the mask is valid. +/// +template +constexpr bool IsValidLowMask(const T mask) noexcept +{ + static_assert(std::is_integral::value, "Mask must be an integral type."); + static_assert(std::is_unsigned::value, "Signed masks can introduce hard to find bugs."); + + // Can be efficiently determined without looping or bit counting. It's the counterpart + // to https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 + // and doesn't require special casing either edge case. + return (mask & (mask + 1)) == 0; +} + +/// +/// Reinterpret objects of one type as another by bit-casting between object representations. +/// +/// @remark This is the example implementation of std::bit_cast which is to be included +/// in C++2a. See http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0476r2.html +/// for more details. The only difference is this variant is not constexpr, +/// as the mechanism for bit_cast requires a compiler built-in to have that quality. +/// +/// @param source The source object to convert to another representation. +/// +/// @tparam To The type to reinterpret source as. +/// @tparam From The initial type representation of source. +/// +/// @return The representation of type From as type To. +/// +/// @pre Both To and From types must be the same size +/// @pre Both To and From types must satisfy the TriviallyCopyable concept. +/// +template +inline To BitCast(const From& source) noexcept +{ + static_assert(sizeof(From) == sizeof(To), + "BitCast source and destination types must be equal in size."); + static_assert(std::is_trivially_copyable(), + "BitCast source type must be trivially copyable."); + static_assert(std::is_trivially_copyable(), + "BitCast destination type must be trivially copyable."); + + std::aligned_storage_t storage; + std::memcpy(&storage, &source, sizeof(storage)); + return reinterpret_cast(storage); +} + +template +class BitCastPtrType +{ +public: + static_assert(std::is_trivially_copyable(), + "BitCastPtr source type must be trivially copyable."); + static_assert(std::is_trivially_copyable(), + "BitCastPtr destination type must be trivially copyable."); + + explicit BitCastPtrType(PtrType* ptr) : m_ptr(ptr) {} + + // Enable operator= only for pointers to non-const data + template + inline typename std::enable_if() && !std::is_const()>::type + operator=(const S& source) + { + std::memcpy(m_ptr, &source, sizeof(source)); + } + + inline operator T() const + { + T result; + std::memcpy(&result, m_ptr, sizeof(result)); + return result; + } + +private: + PtrType* m_ptr; +}; + +// Provides an aliasing-safe alternative to reinterpret_cast'ing pointers to structs +// Conversion constructor and operator= provided for a convenient syntax. +// Usage: MyStruct s = BitCastPtr(some_ptr); +// BitCastPtr(some_ptr) = s; +template +inline auto BitCastPtr(PtrType* ptr) noexcept -> BitCastPtrType +{ + return BitCastPtrType{ptr}; +} + +template +void SetBit(T& value, size_t bit_number, bool bit_value) +{ + static_assert(std::is_unsigned(), "SetBit is only sane on unsigned types."); + + if (bit_value) + value |= (T{1} << bit_number); + else + value &= ~(T{1} << bit_number); +} + +} // namespace Common diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h index f2f52a5c..787d5050 100644 --- a/src/dolphin/Compat.h +++ b/src/dolphin/Compat.h @@ -61,3 +61,15 @@ { \ printf(fmt "\n", ## __VA_ARGS__); \ } while (false) + +#if __cplusplus < 201703L +// cheat +namespace std +{ +template +T clamp(const T& v, const T& lo, const T& hi) +{ + return v < lo ? lo : (v > hi ? hi : v); +} +} +#endif \ No newline at end of file diff --git a/src/dolphin/MathUtil.cpp b/src/dolphin/MathUtil.cpp new file mode 100644 index 00000000..70f2ede1 --- /dev/null +++ b/src/dolphin/MathUtil.cpp @@ -0,0 +1,13 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include "MathUtil.h" + +#include + +// Calculate sum of a float list +float MathFloatVectorSum(const std::vector& Vec) +{ + return std::accumulate(Vec.begin(), Vec.end(), 0.0f); +} diff --git a/src/dolphin/MathUtil.h b/src/dolphin/MathUtil.h new file mode 100644 index 00000000..b1dbbaec --- /dev/null +++ b/src/dolphin/MathUtil.h @@ -0,0 +1,121 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include + +#include "Compat.h" + +#include "../types.h" + +#ifdef _MSC_VER +#include +#endif + +namespace MathUtil +{ +constexpr double TAU = 6.2831853071795865; +constexpr double PI = TAU / 2; + +template +constexpr auto Sign(const T& val) -> decltype((T{} < val) - (val < T{})) +{ + return (T{} < val) - (val < T{}); +} + +template +constexpr auto Lerp(const T& x, const T& y, const F& a) -> decltype(x + (y - x) * a) +{ + return x + (y - x) * a; +} + +template +constexpr bool IsPow2(T imm) +{ + return imm > 0 && (imm & (imm - 1)) == 0; +} + +constexpr u32 NextPowerOf2(u32 value) +{ + --value; + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + ++value; + + return value; +} + +template +struct Rectangle +{ + T left{}; + T top{}; + T right{}; + T bottom{}; + + constexpr Rectangle() = default; + + constexpr Rectangle(T theLeft, T theTop, T theRight, T theBottom) + : left(theLeft), top(theTop), right(theRight), bottom(theBottom) + { + } + + constexpr bool operator==(const Rectangle& r) const + { + return left == r.left && top == r.top && right == r.right && bottom == r.bottom; + } + + T GetWidth() const { return abs(right - left); } + T GetHeight() const { return abs(bottom - top); } + // If the rectangle is in a coordinate system with a lower-left origin, use + // this Clamp. + void ClampLL(T x1, T y1, T x2, T y2) + { + left = std::clamp(left, x1, x2); + right = std::clamp(right, x1, x2); + top = std::clamp(top, y2, y1); + bottom = std::clamp(bottom, y2, y1); + } + + // If the rectangle is in a coordinate system with an upper-left origin, + // use this Clamp. + void ClampUL(T x1, T y1, T x2, T y2) + { + left = std::clamp(left, x1, x2); + right = std::clamp(right, x1, x2); + top = std::clamp(top, y1, y2); + bottom = std::clamp(bottom, y1, y2); + } +}; + +} // namespace MathUtil + +float MathFloatVectorSum(const std::vector&); + +// Rounds down. 0 -> undefined +inline int IntLog2(u64 val) +{ +#if defined(__GNUC__) + return 63 - __builtin_clzll(val); + +#elif defined(_MSC_VER) + unsigned long result = ULONG_MAX; + _BitScanReverse64(&result, val); + return result; + +#else + int result = -1; + while (val != 0) + { + val >>= 1; + ++result; + } + return result; +#endif +}