From 52a532370a7525f9c859990fa271d844d0b3387c Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 14:24:29 -0600 Subject: [PATCH 01/12] [AArch64] Implement FPR Cache. --- .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 142 ++++++++++++++---- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 56 ++++--- 2 files changed, 146 insertions(+), 52 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index fc7fc952d6..0abcb29abb 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -10,6 +10,7 @@ using namespace Arm64Gen; void Arm64RegCache::Init(ARM64XEmitter *emitter) { m_emit = emitter; + m_float_emit.reset(new ARM64FloatEmitter(m_emit)); GetAllocationOrder(); } @@ -56,6 +57,23 @@ void Arm64RegCache::UnlockRegister(ARM64Reg host_reg) reg->Unlock(); } +void Arm64RegCache::FlushMostStaleRegister() +{ + u32 most_stale_preg = 0; + u32 most_stale_amount = 0; + for (u32 i = 0; i < 32; ++i) + { + u32 last_used = m_guest_registers[i].GetLastUsed(); + if (last_used > most_stale_amount && + m_guest_registers[i].GetType() == REG_REG) + { + most_stale_preg = i; + most_stale_amount = last_used; + } + } + FlushRegister(most_stale_preg, false); +} + // GPR Cache void Arm64GPRCache::Start(PPCAnalyst::BlockRegStats &stats) { @@ -212,23 +230,6 @@ void Arm64GPRCache::GetAllocationOrder() m_host_registers.push_back(HostReg(reg)); } -void Arm64GPRCache::FlushMostStaleRegister() -{ - u32 most_stale_preg = 0; - u32 most_stale_amount = 0; - for (u32 i = 0; i < 32; ++i) - { - u32 last_used = m_guest_registers[i].GetLastUsed(); - if (last_used > most_stale_amount && - m_guest_registers[i].GetType() == REG_REG) - { - most_stale_preg = i; - most_stale_amount = last_used; - } - } - FlushRegister(most_stale_preg, false); -} - BitSet32 Arm64GPRCache::GetCallerSavedUsed() { BitSet32 registers(0); @@ -254,35 +255,120 @@ void 
Arm64GPRCache::FlushByHost(ARM64Reg host_reg) // FPR Cache void Arm64FPRCache::Flush(FlushMode mode, PPCAnalyst::CodeOp* op) { - // XXX: Flush our stuff + for (int i = 0; i < 32; ++i) + { + bool flush = true; + if (mode == FLUSH_INTERPRETER) + { + if (!(op->regsOut[i] || op->regsIn[i])) + { + // This interpreted instruction doesn't use this register + flush = false; + } + } + + if (m_guest_registers[i].GetType() == REG_REG) + { + // Has to be flushed if it isn't in a callee saved register + ARM64Reg host_reg = m_guest_registers[i].GetReg(); + if (flush || !IsCalleeSaved(host_reg)) + FlushRegister(i, mode == FLUSH_MAINTAIN_STATE); + } + } } ARM64Reg Arm64FPRCache::R(u32 preg) { - // XXX: return a host reg holding a guest register + OpArg& reg = m_guest_registers[preg]; + IncrementAllUsed(); + reg.ResetLastUsed(); + + switch (reg.GetType()) + { + case REG_REG: // already in a reg + return reg.GetReg(); + break; + case REG_NOTLOADED: // Register isn't loaded at /all/ + { + ARM64Reg host_reg = GetReg(); + reg.LoadToReg(host_reg); + m_float_emit->LDR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + return host_reg; + } + break; + default: + _dbg_assert_msg_(DYNA_REC, false, "Invalid OpArg Type!"); + break; + } + // We've got an issue if we end up here + return INVALID_REG; +} + +void Arm64FPRCache::BindToRegister(u32 preg, bool do_load) +{ + OpArg& reg = m_guest_registers[preg]; + + if (reg.GetType() == REG_NOTLOADED) + { + ARM64Reg host_reg = GetReg(); + reg.LoadToReg(host_reg); + if (do_load) + m_float_emit->LDR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + } } void Arm64FPRCache::GetAllocationOrder() { const std::vector allocation_order = { - D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, - D11, D12, D13, D14, D15, D16, D17, D18, D19, - D20, D21, D22, D23, D24, D25, D26, D27, D28, - D29, D30, D31, + // Callee saved + Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15, + + // Caller saved + Q16, Q17, Q18, Q19, Q20, Q21, Q22, Q23, + Q24, Q25, Q26, 
Q27, Q28, Q29, Q30, Q31, + Q7, Q6, Q5, Q4, Q3, Q2, Q1, Q0 }; for (ARM64Reg reg : allocation_order) m_host_registers.push_back(HostReg(reg)); } -void Arm64FPRCache::FlushMostStaleRegister() -{ - // XXX: Flush a register -} - void Arm64FPRCache::FlushByHost(ARM64Reg host_reg) { // XXX: Scan guest registers and flush if found } +bool Arm64FPRCache::IsCalleeSaved(ARM64Reg reg) +{ + static std::vector<ARM64Reg> callee_regs = + { + Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15, INVALID_REG, + }; + return std::find(callee_regs.begin(), callee_regs.end(), EncodeRegTo64(reg)) != callee_regs.end(); +} + +void Arm64FPRCache::FlushRegister(u32 preg, bool maintain_state) +{ + OpArg& reg = m_guest_registers[preg]; + if (reg.GetType() == REG_REG) + { + ARM64Reg host_reg = reg.GetReg(); + + m_float_emit->STR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + if (!maintain_state) + { + UnlockRegister(host_reg); + reg.Flush(); + } + } +} + +BitSet32 Arm64FPRCache::GetCallerSavedUsed() +{ + BitSet32 registers(0); + for (auto& it : m_host_registers) + if (it.IsLocked()) + registers[it.GetReg() - Q0] = 1; + return registers; +} diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index d032a16f82..115fad0720 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -119,7 +119,7 @@ private: class Arm64RegCache { public: - Arm64RegCache() : m_emit(nullptr), m_reg_stats(nullptr) {}; + Arm64RegCache() : m_emit(nullptr), m_float_emit(nullptr), m_reg_stats(nullptr) {}; virtual ~Arm64RegCache() {}; void Init(ARM64XEmitter *emitter); @@ -133,10 +133,14 @@ public: // Will dump an immediate to the host register as well virtual ARM64Reg R(u32 reg) = 0; + virtual BitSet32 GetCallerSavedUsed() = 0; + // Returns a temporary register for use // Requires unlocking after done ARM64Reg GetReg(); + void StoreRegister(u32 preg) { FlushRegister(preg, false); } + //
Locks a register so a cache cannot use it // Useful for function calls template @@ -166,7 +170,7 @@ protected: virtual void GetAllocationOrder() = 0; // Flushes the most stale register - virtual void FlushMostStaleRegister() = 0; + void FlushMostStaleRegister(); // Lock a register void LockRegister(ARM64Reg host_reg); @@ -177,15 +181,31 @@ protected: // Flushes a guest register by host provided virtual void FlushByHost(ARM64Reg host_reg) = 0; + virtual void FlushRegister(u32 preg, bool maintain_state) = 0; + // Get available host registers u32 GetUnlockedRegisterCount(); + void IncrementAllUsed() + { + for (auto& reg : m_guest_registers) + reg.IncrementLastUsed(); + } + // Code emitter ARM64XEmitter *m_emit; + // Float emitter + std::unique_ptr m_float_emit; + // Host side registers that hold the host registers in order of use std::vector m_host_registers; + // Our guest GPRs + // PowerPC has 32 GPRs + // PowerPC also has 32 paired FPRs + OpArg m_guest_registers[32]; + // Register stats for the current block PPCAnalyst::BlockRegStats *m_reg_stats; }; @@ -215,34 +235,20 @@ public: void BindToRegister(u32 preg, bool do_load); - void StoreRegister(u32 preg) { FlushRegister(preg, false); } - - BitSet32 GetCallerSavedUsed(); + BitSet32 GetCallerSavedUsed() override; protected: // Get the order of the host registers void GetAllocationOrder(); - // Flushes the most stale register - void FlushMostStaleRegister(); - // Flushes a guest register by host provided void FlushByHost(ARM64Reg host_reg) override; - // Our guest GPRs - // PowerPC has 32 GPRs - OpArg m_guest_registers[32]; + void FlushRegister(u32 preg, bool maintain_state) override; private: bool IsCalleeSaved(ARM64Reg reg); - void IncrementAllUsed() - { - for (auto& reg : m_guest_registers) - reg.IncrementLastUsed(); - } - - void FlushRegister(u32 preg, bool maintain_state); }; class Arm64FPRCache : public Arm64RegCache @@ -256,17 +262,19 @@ public: // Will dump an immediate to the host register as well ARM64Reg 
R(u32 preg); + void BindToRegister(u32 preg, bool do_load); + + BitSet32 GetCallerSavedUsed() override; + protected: // Get the order of the host registers void GetAllocationOrder(); - // Flushes the most stale register - void FlushMostStaleRegister(); - // Flushes a guest register by host provided void FlushByHost(ARM64Reg host_reg) override; - // Our guest FPRs - // Gekko has 32 paired registers(32x2) - OpArg m_guest_registers[32][2]; + void FlushRegister(u32 preg, bool maintain_state) override; + +private: + bool IsCalleeSaved(ARM64Reg reg); }; From 74de345b51d1022045b8339e3141b510d629b5e2 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 14:27:46 -0600 Subject: [PATCH 02/12] [AArch64] Fix loads with update. Update register wasn't being loaded into the cache prior to pushing the address into it. Adds float push and pop routines around the calls that need it as well. --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 4 +++- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 17 +++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index b0c4689bff..f4c0adae74 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -25,7 +25,7 @@ static_assert((PPCSTATE_OFF(ps[0][0]) % 8) == 0, "LDR(64bit VFP) requires FPRs t class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock { public: - JitArm64() : code_buffer(32000) {} + JitArm64() : code_buffer(32000), m_float_emit(this) {} ~JitArm64() {} void Init(); @@ -112,6 +112,8 @@ private: PPCAnalyst::CodeBuffer code_buffer; + ARM64FloatEmitter m_float_emit; + // The key is the backpatch flags std::map m_backpatch_info; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 58c1523897..c0a3176307 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++
b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -41,6 +41,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o off_reg = gpr.R(offsetReg); BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); + BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); BitSet32 ignore_mask(0); regs_in_use[W0] = 0; regs_in_use[W30] = 0; @@ -114,25 +115,24 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o if (is_immediate) MOVI2R(XA, imm_addr); + if (update) + MOV(gpr.R(addr), addr_reg); + if (is_immediate && Memory::IsRAMAddress(imm_addr)) { EmitBackpatchRoutine(this, flags, true, false, dest_reg, XA); - - if (update) - MOVI2R(up_reg, imm_addr); } else { - if (update) - MOV(up_reg, addr_reg); - // Has a chance of being backpatched which will destroy our state // push and pop everything in this instance ABI_PushRegisters(regs_in_use); + m_float_emit.ABI_PushRegisters(fprs_in_use); EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, dest_reg, XA); + m_float_emit.ABI_PopRegisters(fprs_in_use); ABI_PopRegisters(regs_in_use, ignore_mask); } @@ -155,6 +155,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s reg_dest = gpr.R(dest); BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); + BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); regs_in_use[W0] = 0; regs_in_use[W1] = 0; regs_in_use[W30] = 0; @@ -237,10 +238,12 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s // Has a chance of being backpatched which will destroy our state // push and pop everything in this instance ABI_PushRegisters(regs_in_use); + m_float_emit.ABI_PushRegisters(fprs_in_use); EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, RS, XA); + m_float_emit.ABI_PopRegisters(fprs_in_use); 
ABI_PopRegisters(regs_in_use); } @@ -321,8 +324,6 @@ void JitArm64::lXX(UGeckoInstruction inst) break; } - FALLBACK_IF(update); - SafeLoadToReg(d, update ? a : (a ? a : -1), offsetReg, flags, offset, update); // LWZ idle skipping From b3201be95f5a5dc7ee3c76e8fd1d7333ea7f9785 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 14:31:07 -0600 Subject: [PATCH 03/12] [AArch64] Add some static_asserts. Makes sure we have a few PPCState values within range of STRB's range. --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index f4c0adae74..ebef27173e 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -21,6 +21,8 @@ // Some asserts to make sure we will be able to load everything static_assert(PPCSTATE_OFF(spr[1023]) <= 16380, "LDR(32bit) can't reach the last SPR"); static_assert((PPCSTATE_OFF(ps[0][0]) % 8) == 0, "LDR(64bit VFP) requires FPRs to be 8 byte aligned"); +static_assert(PPCSTATE_OFF(xer_ca) < 4096, "STRB can't store xer_ca!"); +static_assert(PPCSTATE_OFF(xer_so_ov) < 4096, "STRB can't store xer_so_ov!"); class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock { From f1443bec1e8b09d0e94099b434bc45d0f843a280 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 14:32:50 -0600 Subject: [PATCH 04/12] [AArch64] Add a memory dump routine. Allows me to easily disassemble a block of code from the Nexus 9 by dumping it to logcat. 
--- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 8 ++++++++ Source/Core/Core/PowerPC/JitArm64/Jit.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 6f1b087854..f664df6cbf 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -179,6 +179,14 @@ void JitArm64::WriteExitDestInR(ARM64Reg Reg) BR(EncodeRegTo64(Reg)); } +void JitArm64::DumpCode(const u8* start, const u8* end) +{ + std::string output = ""; + for (u8* code = (u8*)start; code < end; code += 4) + output += StringFromFormat("%08x", Common::swap32(*(u32*)code)); + WARN_LOG(DYNA_REC, "Code dump from %p to %p:\n%s", start, end, output.c_str()); +} + void JitArm64::Run() { CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode; diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index ebef27173e..45b2f9bf91 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -116,6 +116,9 @@ private: ARM64FloatEmitter m_float_emit; + // Dump a memory range of code + void DumpCode(const u8* start, const u8* end); + // The key is the backpatch flags std::map m_backpatch_info; From 3fe0b5b969b8b86c423414fbb8a689fe150dce2e Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 14:38:14 -0600 Subject: [PATCH 05/12] [AArch64] Add floating point backpatching support. 
--- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 171 +++++++++++++++++- 1 file changed, 167 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 0b813e4b72..ff185214ad 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -29,7 +29,8 @@ static void DoBacktrace(uintptr_t access_address, SContext* ctx) for (u64 pc = (ctx->CTX_PC - 32); pc < (ctx->CTX_PC + 32); pc += 16) { pc_memory += StringFromFormat("%08x%08x%08x%08x", - *(u32*)pc, *(u32*)(pc + 4), *(u32*)(pc + 8), *(u32*)(pc + 12)); + Common::swap32(*(u32*)pc), Common::swap32(*(u32*)(pc + 4)), + Common::swap32(*(u32*)(pc + 8)), Common::swap32(*(u32*)(pc + 12))); ERROR_LOG(DYNA_REC, "0x%016lx: %08x %08x %08x %08x", pc, *(u32*)pc, *(u32*)(pc + 4), *(u32*)(pc + 8), *(u32*)(pc + 12)); @@ -51,10 +52,34 @@ bool JitArm64::DisasmLoadStore(const u8* ptr, u32* flags, ARM64Reg* reg) *flags |= BackPatchInfo::FLAG_SIZE_8; else if (size == 1) // 16-bit *flags |= BackPatchInfo::FLAG_SIZE_16; - else // 32-bit + else if (size == 2) // 32-bit *flags |= BackPatchInfo::FLAG_SIZE_32; + else if (size == 3) // 64-bit + *flags |= BackPatchInfo::FLAG_SIZE_F64; - if (op == 0xE5) // Load + if (op == 0xF5) // NEON LDR + { + if (size == 2) // 32-bit float + { + *flags &= ~BackPatchInfo::FLAG_SIZE_32; + *flags |= BackPatchInfo::FLAG_SIZE_F32; + } + *flags |= BackPatchInfo::FLAG_LOAD; + *reg = (ARM64Reg)(inst & 0x1F); + return true; + } + else if (op == 0xF4) // NEON STR + { + if (size == 2) // 32-bit float + { + *flags &= ~BackPatchInfo::FLAG_SIZE_32; + *flags |= BackPatchInfo::FLAG_SIZE_F32; + } + *flags |= BackPatchInfo::FLAG_STORE; + *reg = (ARM64Reg)(inst & 0x1F); + return true; + } + else if (op == 0xE5) // Load { *flags |= BackPatchInfo::FLAG_LOAD; *reg = (ARM64Reg)(inst & 0x1F); @@ -90,10 +115,38 @@ u32 
JitArm64::EmitBackpatchRoutine(ARM64XEmitter* emit, u32 flags, bool fastmem, if (flags & BackPatchInfo::FLAG_STORE && flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64)) { + ARM64FloatEmitter float_emit(emit); + if (flags & BackPatchInfo::FLAG_SIZE_F32) + { + float_emit.FCVT(32, 64, Q0, RS); + float_emit.REV32(8, D0, D0); + trouble_offset = (emit->GetCodePtr() - code_base) / 4; + float_emit.STR(32, INDEX_UNSIGNED, D0, addr, 0); + } + else + { + float_emit.REV64(8, Q0, RS); + trouble_offset = (emit->GetCodePtr() - code_base) / 4; + float_emit.STR(64, INDEX_UNSIGNED, Q0, addr, 0); + } } else if (flags & BackPatchInfo::FLAG_LOAD && flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64)) { + ARM64FloatEmitter float_emit(emit); + trouble_offset = (emit->GetCodePtr() - code_base) / 4; + if (flags & BackPatchInfo::FLAG_SIZE_F32) + { + float_emit.LD1R(32, RS, addr); + float_emit.REV64(8, RS, RS); + float_emit.FCVTL(64, RS, RS); + } + else + { + float_emit.LDR(64, INDEX_UNSIGNED, Q0, addr, 0); + float_emit.REV64(8, Q0, Q0); + float_emit.INS(64, RS, 0, Q0, 0); + } } else if (flags & BackPatchInfo::FLAG_STORE) { @@ -143,10 +196,39 @@ u32 JitArm64::EmitBackpatchRoutine(ARM64XEmitter* emit, u32 flags, bool fastmem, if (flags & BackPatchInfo::FLAG_STORE && flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64)) { + ARM64FloatEmitter float_emit(emit); + if (flags & BackPatchInfo::FLAG_SIZE_F32) + { + float_emit.FCVT(32, 64, Q0, RS); + float_emit.FMOV(32, false, W0, Q0); + emit->MOVI2R(X30, (u64)&Memory::Write_U32); + emit->BLR(X30); + } + else + { + emit->MOVI2R(X30, (u64)&Memory::Write_F64); + float_emit.DUP(64, Q0, RS); + emit->BLR(X30); + } + } else if (flags & BackPatchInfo::FLAG_LOAD && flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64)) { + ARM64FloatEmitter float_emit(emit); + if (flags & BackPatchInfo::FLAG_SIZE_F32) + { + emit->MOVI2R(X30, (u64)&Memory::Read_U32); + emit->BLR(X30); + float_emit.DUP(32, 
RS, X0); + float_emit.FCVTL(64, RS, RS); + } + else + { + emit->MOVI2R(X30, (u64)&Memory::Read_F64); + emit->BLR(X30); + float_emit.INS(64, RS, 0, X0); + } } else if (flags & BackPatchInfo::FLAG_STORE) { @@ -245,7 +327,8 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx) ctx->CTX_PC = new_pc; // Wipe the top bits of the addr_register - if (flags & BackPatchInfo::FLAG_STORE) + if (flags & BackPatchInfo::FLAG_STORE && + !(flags & BackPatchInfo::FLAG_SIZE_F64)) ctx->CTX_REG(1) &= 0xFFFFFFFFUll; else ctx->CTX_REG(0) &= 0xFFFFFFFFUll; @@ -382,6 +465,46 @@ void JitArm64::InitBackpatch() SetCodePtr(code_base); + m_backpatch_info[flags] = info; + } + // 32bit float + { + flags = + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_F32; + EmitBackpatchRoutine(this, flags, false, false, Q0, X1); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, Q0, X1); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 64bit float + { + flags = + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_F64; + EmitBackpatchRoutine(this, flags, false, false, Q0, X1); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, Q0, X1); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + m_backpatch_info[flags] = info; } } @@ -446,6 +569,46 @@ void JitArm64::InitBackpatch() SetCodePtr(code_base); + m_backpatch_info[flags] = info; + } + // 32bit float + { + flags = + BackPatchInfo::FLAG_STORE | + BackPatchInfo::FLAG_SIZE_F32; + EmitBackpatchRoutine(this, flags, false, false, Q0, X1); + code_end = 
GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, Q0, X1); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } + // 64bit float + { + flags = + BackPatchInfo::FLAG_STORE | + BackPatchInfo::FLAG_SIZE_F64; + EmitBackpatchRoutine(this, flags, false, false, Q0, X1); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, Q0, X1); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + m_backpatch_info[flags] = info; } } From b1169a9773b68ead04d18419cb146c3a382e09af Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 14:39:41 -0600 Subject: [PATCH 06/12] [AArch64] Flush FPRs if they won't be used for the rest of the block. --- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index f664df6cbf..6c44411349 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -302,6 +302,8 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB // If we have a register that will never be used again, flush it. for (int j : ~ops[i].gprInUse) gpr.StoreRegister(j); + for (int j : ~ops[i].fprInUse) + fpr.StoreRegister(j); if (js.memcheck && (opinfo->flags & FL_LOADSTORE)) { From be0d552d542f725b34c2501147c8614790ef5dce Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 14:55:28 -0600 Subject: [PATCH 07/12] [AArch64] Enable the full range of integer loadstores.
--- .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp index 04ed7e7be6..13e02bfa4a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp @@ -208,24 +208,24 @@ static GekkoOPTemplate table31[] = {1014, &JitArm64::FallBackToInterpreter}, //"dcbz", OPTYPE_DCACHE, 0, 4}}, //load word - {23, &JitArm64::FallBackToInterpreter}, //"lwzx", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, - {55, &JitArm64::FallBackToInterpreter}, //"lwzux", OPTYPE_LOAD, FL_OUT_D | FL_OUT_A | FL_IN_A | FL_IN_B}}, + {23, &JitArm64::lXX}, //"lwzx", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, + {55, &JitArm64::lXX}, //"lwzux", OPTYPE_LOAD, FL_OUT_D | FL_OUT_A | FL_IN_A | FL_IN_B}}, //load halfword - {279, &JitArm64::FallBackToInterpreter}, //"lhzx", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, - {311, &JitArm64::FallBackToInterpreter}, //"lhzux", OPTYPE_LOAD, FL_OUT_D | FL_OUT_A | FL_IN_A | FL_IN_B}}, + {279, &JitArm64::lXX}, //"lhzx", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, + {311, &JitArm64::lXX}, //"lhzux", OPTYPE_LOAD, FL_OUT_D | FL_OUT_A | FL_IN_A | FL_IN_B}}, //load halfword signextend - {343, &JitArm64::FallBackToInterpreter}, //"lhax", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, - {375, &JitArm64::FallBackToInterpreter}, //"lhaux", OPTYPE_LOAD, FL_OUT_D | FL_OUT_A | FL_IN_A | FL_IN_B}}, + {343, &JitArm64::lXX}, //"lhax", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, + {375, &JitArm64::lXX}, //"lhaux", OPTYPE_LOAD, FL_OUT_D | FL_OUT_A | FL_IN_A | FL_IN_B}}, //load byte - {87, &JitArm64::FallBackToInterpreter}, //"lbzx", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, - {119, &JitArm64::FallBackToInterpreter}, //"lbzux", OPTYPE_LOAD, FL_OUT_D | FL_OUT_A | FL_IN_A | FL_IN_B}}, + {87, &JitArm64::lXX}, //"lbzx", OPTYPE_LOAD, 
FL_OUT_D | FL_IN_A0 | FL_IN_B}}, + {119, &JitArm64::lXX}, //"lbzux", OPTYPE_LOAD, FL_OUT_D | FL_OUT_A | FL_IN_A | FL_IN_B}}, //load byte reverse - {534, &JitArm64::FallBackToInterpreter}, //"lwbrx", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, - {790, &JitArm64::FallBackToInterpreter}, //"lhbrx", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, + {534, &JitArm64::lXX}, //"lwbrx", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, + {790, &JitArm64::lXX}, //"lhbrx", OPTYPE_LOAD, FL_OUT_D | FL_IN_A0 | FL_IN_B}}, // Conditional load/store (Wii SMP) {150, &JitArm64::FallBackToInterpreter}, //"stwcxd", OPTYPE_STORE, FL_EVIL | FL_SET_CR0}}, @@ -236,16 +236,16 @@ static GekkoOPTemplate table31[] = {597, &JitArm64::FallBackToInterpreter}, //"lswi", OPTYPE_LOAD, FL_EVIL | FL_IN_AB | FL_OUT_D}}, //store word - {151, &JitArm64::FallBackToInterpreter}, //"stwx", OPTYPE_STORE, FL_IN_A0 | FL_IN_B}}, - {183, &JitArm64::FallBackToInterpreter}, //"stwux", OPTYPE_STORE, FL_OUT_A | FL_IN_A | FL_IN_B}}, + {151, &JitArm64::stX}, //"stwx", OPTYPE_STORE, FL_IN_A0 | FL_IN_B}}, + {183, &JitArm64::stX}, //"stwux", OPTYPE_STORE, FL_OUT_A | FL_IN_A | FL_IN_B}}, //store halfword - {407, &JitArm64::FallBackToInterpreter}, //"sthx", OPTYPE_STORE, FL_IN_A0 | FL_IN_B}}, - {439, &JitArm64::FallBackToInterpreter}, //"sthux", OPTYPE_STORE, FL_OUT_A | FL_IN_A | FL_IN_B}}, + {407, &JitArm64::stX}, //"sthx", OPTYPE_STORE, FL_IN_A0 | FL_IN_B}}, + {439, &JitArm64::stX}, //"sthux", OPTYPE_STORE, FL_OUT_A | FL_IN_A | FL_IN_B}}, //store byte - {215, &JitArm64::FallBackToInterpreter}, //"stbx", OPTYPE_STORE, FL_IN_A0 | FL_IN_B}}, - {247, &JitArm64::FallBackToInterpreter}, //"stbux", OPTYPE_STORE, FL_OUT_A | FL_IN_A | FL_IN_B}}, + {215, &JitArm64::stX}, //"stbx", OPTYPE_STORE, FL_IN_A0 | FL_IN_B}}, + {247, &JitArm64::stX}, //"stbux", OPTYPE_STORE, FL_OUT_A | FL_IN_A | FL_IN_B}}, //store bytereverse {662, &JitArm64::FallBackToInterpreter}, //"stwbrx", OPTYPE_STORE, FL_IN_A0 | FL_IN_B}}, From 
0dd3804cf77c5edef79aa12e4e82770814f07a7d Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 14:57:55 -0600 Subject: [PATCH 08/12] [AArch64] Implement 13 integer instructions. --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 11 + .../PowerPC/JitArm64/JitArm64_Integer.cpp | 279 ++++++++++++++++++ .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 28 +- 3 files changed, 304 insertions(+), 14 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 45b2f9bf91..d3e51ee168 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -82,6 +82,7 @@ public: // Integer void arith_imm(UGeckoInstruction inst); void boolX(UGeckoInstruction inst); + void addx(UGeckoInstruction inst); void extsXx(UGeckoInstruction inst); void cntlzwx(UGeckoInstruction inst); void negx(UGeckoInstruction inst); @@ -89,6 +90,14 @@ public: void cmpl(UGeckoInstruction inst); void cmpi(UGeckoInstruction inst); void cmpli(UGeckoInstruction inst); + void rlwinmx(UGeckoInstruction inst); + void srawix(UGeckoInstruction inst); + void mullwx(UGeckoInstruction inst); + void addic(UGeckoInstruction inst); + void mulli(UGeckoInstruction inst); + void addzex(UGeckoInstruction inst); + void subfx(UGeckoInstruction inst); + void addcx(UGeckoInstruction inst); // System Registers void mtmsr(UGeckoInstruction inst); @@ -144,6 +153,8 @@ private: void ComputeRC(Arm64Gen::ARM64Reg reg, int crf = 0); void ComputeRC(u32 imm, int crf = 0); + void ComputeCarry(bool Carry); + void ComputeCarry(); typedef u32 (*Operation)(u32, u32); void reg_imm(u32 d, u32 a, bool binary, u32 value, Operation do_op, void (ARM64XEmitter::*op)(Arm64Gen::ARM64Reg, Arm64Gen::ARM64Reg, Arm64Gen::ARM64Reg, ArithOption), bool Rc = false); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp index a1a7ffa005..14da56ec7c 100644 --- 
a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
@@ -39,6 +39,34 @@ void JitArm64::ComputeRC(u32 imm, int crf)
 	gpr.Unlock(WA);
 }
 
+// Stores a carry value that is known at compile time (1 if Carry, else 0)
+// into the xer_ca byte of the PowerPC state.
+void JitArm64::ComputeCarry(bool Carry)
+{
+	if (Carry)
+	{
+		ARM64Reg WA = gpr.GetReg();
+		MOVI2R(WA, 1);
+		STRB(INDEX_UNSIGNED, WA, X29, PPCSTATE_OFF(xer_ca));
+		gpr.Unlock(WA);
+		return;
+	}
+
+	// WSP is used as the zero register here (register 31 reads as zero in a store).
+	STRB(INDEX_UNSIGNED, WSP, X29, PPCSTATE_OFF(xer_ca));
+}
+
+// Stores the host carry flag left by the previously emitted flag-setting
+// instruction (ADDS/SUBS) into xer_ca.
+void JitArm64::ComputeCarry()
+{
+	ARM64Reg WA = gpr.GetReg();
+	// WA = 0 if the carry flag is clear, otherwise 0 + 1.
+	CSINC(WA, WSP, WSP, CC_CC);
+	STRB(INDEX_UNSIGNED, WA, X29, PPCSTATE_OFF(xer_ca));
+	gpr.Unlock(WA);
+}
+
 // Following static functions are used in conjunction with reg_imm
 static u32 Add(u32 a, u32 b)
 {
@@ -245,6 +273,32 @@ void JitArm64::boolX(UGeckoInstruction inst)
 	}
 }
 
+// addx: rD = rA + rB; CR0 is updated when Rc is set. OE forms go to the interpreter.
+void JitArm64::addx(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITIntegerOff);
+	FALLBACK_IF(inst.OE);
+
+	int a = inst.RA, b = inst.RB, d = inst.RD;
+
+	if (gpr.IsImm(a) && gpr.IsImm(b))
+	{
+		s32 i = (s32)gpr.GetImm(a), j = (s32)gpr.GetImm(b);
+		gpr.SetImmediate(d, i + j);
+		if (inst.Rc)
+			ComputeRC(gpr.GetImm(d), 0);
+	}
+	else
+	{
+		// Bind rD first; its old value is only loaded when it is also a source.
+		gpr.BindToRegister(d, d == a || d == b);
+		ADD(gpr.R(d), gpr.R(a), gpr.R(b));
+		if (inst.Rc)
+			ComputeRC(gpr.R(d), 0);
+	}
+}
+
 void JitArm64::extsXx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
@@ -415,3 +469,261 @@ void JitArm64::cmpli(UGeckoInstruction inst)
 	FALLBACK_IF(true);
 }
 
+// rlwinmx: rA = rotl(rS, SH) & Helper_Mask(MB, ME); CR0 is updated when Rc is set.
+void JitArm64::rlwinmx(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITIntegerOff);
+
+	u32 mask = Helper_Mask(inst.MB,inst.ME);
+	if (gpr.IsImm(inst.RS))
+	{
+		gpr.SetImmediate(inst.RA, _rotl(gpr.GetImm(inst.RS), inst.SH) & mask);
+		if (inst.Rc)
+			ComputeRC(gpr.GetImm(inst.RA), 0);
+		return;
+	}
+
+	gpr.BindToRegister(inst.RA, inst.RA == inst.RS);
+
+	ARM64Reg WA = gpr.GetReg();
+	// A left rotate by SH is a right rotate by (32 - SH). The "& 31" keeps the
+	// amount inside 0..31 when SH is 0 (a rotate by 32 equals a rotate by 0).
+	ArithOption Shift(gpr.R(inst.RS), ST_ROR, (32 - inst.SH) & 31);
+	MOVI2R(WA, mask);
+	AND(gpr.R(inst.RA), WA, gpr.R(inst.RS), Shift);
+	gpr.Unlock(WA);
+
+	if (inst.Rc)
+		ComputeRC(gpr.R(inst.RA), 0);
+}
+
+// srawix: rA = rS >> SH (arithmetic). XER[CA] is set when rS is negative and
+// any 1 bits are shifted out; CR0 is updated when Rc is set.
+void JitArm64::srawix(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITIntegerOff);
+
+	int a = inst.RA;
+	int s = inst.RS;
+	int amount = inst.SH;
+
+	if (gpr.IsImm(s))
+	{
+		s32 imm = (s32)gpr.GetImm(s);
+		gpr.SetImmediate(a, imm >> amount);
+
+		// Shift as unsigned: left-shifting a negative signed value is undefined.
+		if (amount != 0 && (imm < 0) && ((u32)imm << (32 - amount)))
+			ComputeCarry(true);
+		else
+			ComputeCarry(false);
+
+		if (inst.Rc)
+			ComputeRC(gpr.GetImm(a), 0);
+	}
+	else if (amount != 0)
+	{
+		gpr.BindToRegister(a, a == s);
+		ARM64Reg RA = gpr.R(a);
+		ARM64Reg RS = gpr.R(s);
+		ARM64Reg WA = gpr.GetReg();
+
+		// WA = the bits shifted out of rS, moved to the top; RA = the shifted result.
+		ORR(WA, WSP, RS, ArithOption(RS, ST_LSL, 32 - amount));
+		ORR(RA, WSP, RS, ArithOption(RS, ST_ASR, amount));
+		if (inst.Rc)
+			ComputeRC(RA, 0);
+
+		// CA = ((WA & RA) != 0): sign bits of the result overlap the lost bits.
+		ANDS(WSP, WA, RA, ArithOption(RA, ST_LSL, 0));
+		CSINC(WA, WSP, WSP, CC_EQ);
+		STRB(INDEX_UNSIGNED, WA, X29, PPCSTATE_OFF(xer_ca));
+		gpr.Unlock(WA);
+	}
+	else
+	{
+		gpr.BindToRegister(a, a == s);
+		ARM64Reg RA = gpr.R(a);
+		ARM64Reg RS = gpr.R(s);
+		MOV(RA, RS);
+		STRB(INDEX_UNSIGNED, WSP, X29, PPCSTATE_OFF(xer_ca));
+		if (inst.Rc)
+			ComputeRC(RA, 0);
+	}
+}
+
+// addic / addic.: rD = rA + SIMM, XER[CA] = carry out of the addition.
+// Primary opcode 13 is the record form, which also updates CR0.
+void JitArm64::addic(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITIntegerOff);
+
+	int a = inst.RA, d = inst.RD;
+	bool rc = inst.OPCD == 13;
+	s32 simm = inst.SIMM_16;
+	u32 imm = (u32)simm;
+
+	if (gpr.IsImm(a))
+	{
+		u32 i = gpr.GetImm(a);
+		gpr.SetImmediate(d, i + imm);
+
+		bool has_carry = Interpreter::Helper_Carry(i, imm);
+		ComputeCarry(has_carry);
+		if (rc)
+			ComputeRC(gpr.GetImm(d), 0);
+	}
+	else
+	{
+		gpr.BindToRegister(d, d == a);
+		if (imm < 4096)
+		{
+			ADDS(gpr.R(d), gpr.R(a), imm);
+		}
+		else if (simm > -4096 && simm < 0)
+		{
+			// Subtracting |simm| gives the same result and carry flag as adding (u32)simm.
+			SUBS(gpr.R(d), gpr.R(a), std::abs(simm));
+		}
+		else
+		{
+			ARM64Reg WA = gpr.GetReg();
+			MOVI2R(WA, imm);
+			ADDS(gpr.R(d), gpr.R(a), WA);
+			gpr.Unlock(WA);
+		}
+
+		ComputeCarry();
+		if (rc)
+			ComputeRC(gpr.R(d), 0);
+	}
+}
+
+// mulli: rD = rA * SIMM.
+void JitArm64::mulli(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITIntegerOff);
+	// No OE fallback here: mulli is a D-form instruction, so that bit is part of SIMM.
+
+	int a = inst.RA, d = inst.RD;
+
+	if (gpr.IsImm(a))
+	{
+		s32 i = (s32)gpr.GetImm(a);
+		gpr.SetImmediate(d, i * inst.SIMM_16);
+	}
+	else
+	{
+		gpr.BindToRegister(d, d == a);
+		ARM64Reg WA = gpr.GetReg();
+		MOVI2R(WA, (u32)(s32)inst.SIMM_16);
+		MUL(gpr.R(d), gpr.R(a), WA);
+		gpr.Unlock(WA);
+	}
+}
+
+// mullwx: rD = rA * rB (32-bit result); CR0 is updated when Rc is set.
+void JitArm64::mullwx(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITIntegerOff);
+	FALLBACK_IF(inst.OE);
+
+	int a = inst.RA, b = inst.RB, d = inst.RD;
+
+	if (gpr.IsImm(a) && gpr.IsImm(b))
+	{
+		s32 i = (s32)gpr.GetImm(a), j = (s32)gpr.GetImm(b);
+		gpr.SetImmediate(d, i * j);
+		if (inst.Rc)
+			ComputeRC(gpr.GetImm(d), 0);
+	}
+	else
+	{
+		gpr.BindToRegister(d, d == a || d == b);
+		MUL(gpr.R(d), gpr.R(a), gpr.R(b));
+		if (inst.Rc)
+			ComputeRC(gpr.R(d), 0);
+	}
+}
+
+// addzex: rD = rA + XER[CA], XER[CA] = carry out; CR0 is updated when Rc is set.
+void JitArm64::addzex(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITIntegerOff);
+	FALLBACK_IF(inst.OE);
+
+	int a = inst.RA, d = inst.RD;
+
+	gpr.BindToRegister(d, d == a);
+	ARM64Reg WA = gpr.GetReg();
+	LDRB(INDEX_UNSIGNED, WA, X29, PPCSTATE_OFF(xer_ca));
+	// ADDS leaves the carry out of rA + CA in the host carry flag for ComputeCarry().
+	// (Comparing the result against 0 instead would always leave the carry flag set.)
+	ADDS(gpr.R(d), gpr.R(a), WA);
+	gpr.Unlock(WA);
+
+	ComputeCarry();
+	if (inst.Rc)
+		ComputeRC(gpr.R(d), 0);
+}
+
+// subfx: rD = rB - rA; CR0 is updated when Rc is set.
+void JitArm64::subfx(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITIntegerOff);
+	FALLBACK_IF(inst.OE);
+
+	int a = inst.RA, b = inst.RB, d = inst.RD;
+
+	if (gpr.IsImm(a) && gpr.IsImm(b))
+	{
+		u32 i = gpr.GetImm(a), j = gpr.GetImm(b);
+		gpr.SetImmediate(d, j - i);
+		if (inst.Rc)
+			ComputeRC(gpr.GetImm(d), 0);
+	}
+	else
+	{
+		gpr.BindToRegister(d, d == a || d == b);
+		SUB(gpr.R(d), gpr.R(b), gpr.R(a));
+		if (inst.Rc)
+			ComputeRC(gpr.R(d), 0);
+	}
+}
+
+// addcx: rD = rA + rB with XER[CA] = carry out; CR0 is updated when Rc is set.
+void JitArm64::addcx(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITIntegerOff);
+	FALLBACK_IF(inst.OE);
+
+	int a = inst.RA, b = inst.RB, d = inst.RD;
+
+	if (gpr.IsImm(a) && gpr.IsImm(b))
+	{
+		u32 i = gpr.GetImm(a), j = gpr.GetImm(b);
+		gpr.SetImmediate(d, i + j);
+
+		bool has_carry = Interpreter::Helper_Carry(i, j);
+		ComputeCarry(has_carry);
+		if (inst.Rc)
+			ComputeRC(gpr.GetImm(d), 0);
+	}
+	else
+	{
+		gpr.BindToRegister(d, d == a || d == b);
+		ADDS(gpr.R(d), gpr.R(a), gpr.R(b));
+
+		ComputeCarry();
+		if (inst.Rc)
+			ComputeRC(gpr.R(d), 0);
+	}
+}
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
index 13e02bfa4a..fae0a3bdc9 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
@@ -45,17 +45,17 @@ static GekkoOPTemplate primarytable[] =
 	{3, &JitArm64::twx}, //"twi", OPTYPE_SYSTEM, FL_ENDBLOCK}},
 	{17, &JitArm64::sc}, //"sc", OPTYPE_SYSTEM, FL_ENDBLOCK, 1}},
 
-	{7, &JitArm64::FallBackToInterpreter}, //"mulli", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_RC_BIT, 2}},
+	{7, &JitArm64::mulli}, //"mulli", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_RC_BIT, 2}},
 	{8, &JitArm64::FallBackToInterpreter}, //"subfic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
 	{10, &JitArm64::cmpli}, //"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
 	{11, &JitArm64::cmpi}, //"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}},
-	{12, &JitArm64::FallBackToInterpreter}, //"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
-	{13, &JitArm64::FallBackToInterpreter}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}},
+	{12, &JitArm64::addic}, //"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}},
+	{13, &JitArm64::addic}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}},
 	{14, &JitArm64::arith_imm}, //"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
 	{15, &JitArm64::arith_imm}, //"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0}},
 
 	{20, &JitArm64::FallBackToInterpreter}, //"rlwimix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_A | FL_IN_S | FL_RC_BIT}},
-	{21, &JitArm64::FallBackToInterpreter}, //"rlwinmx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
+	{21, &JitArm64::rlwinmx}, //"rlwinmx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
 	{23, &JitArm64::FallBackToInterpreter}, //"rlwnmx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_IN_B
| FL_RC_BIT}}, {24, &JitArm64::arith_imm}, //"ori", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S}}, @@ -196,7 +196,7 @@ static GekkoOPTemplate table31[] = {954, &JitArm64::extsXx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {536, &JitArm64::FallBackToInterpreter}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {792, &JitArm64::FallBackToInterpreter}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, - {824, &JitArm64::FallBackToInterpreter}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, + {824, &JitArm64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {24, &JitArm64::FallBackToInterpreter}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {54, &JitArm64::FallBackToInterpreter}, //"dcbst", OPTYPE_DCACHE, 0, 4}}, @@ -294,25 +294,25 @@ static GekkoOPTemplate table31[] = static GekkoOPTemplate table31_2[] = { - {266, &JitArm64::FallBackToInterpreter}, //"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, - {778, &JitArm64::FallBackToInterpreter}, //"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, - {10, &JitArm64::FallBackToInterpreter}, //"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, - {522, &JitArm64::FallBackToInterpreter}, //"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, + {266, &JitArm64::addx}, //"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, + {778, &JitArm64::addx}, //"addx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, + {10, &JitArm64::addcx}, //"addcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, + {522, &JitArm64::addcx}, //"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}}, {138, &JitArm64::FallBackToInterpreter}, //"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, {650, &JitArm64::FallBackToInterpreter}, //"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA 
| FL_SET_CA | FL_RC_BIT}}, {234, &JitArm64::FallBackToInterpreter}, //"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, - {202, &JitArm64::FallBackToInterpreter}, //"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, + {202, &JitArm64::addzex}, //"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}}, {491, &JitArm64::FallBackToInterpreter}, //"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}}, {1003, &JitArm64::FallBackToInterpreter}, //"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}}, {459, &JitArm64::FallBackToInterpreter}, //"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}}, {971, &JitArm64::FallBackToInterpreter}, //"divwuox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 39}}, {75, &JitArm64::FallBackToInterpreter}, //"mulhwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 4}}, {11, &JitArm64::FallBackToInterpreter}, //"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 4}}, - {235, &JitArm64::FallBackToInterpreter}, //"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 4}}, - {747, &JitArm64::FallBackToInterpreter}, //"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 4}}, + {235, &JitArm64::mullwx}, //"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 4}}, + {747, &JitArm64::mullwx}, //"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 4}}, {104, &JitArm64::negx}, //"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, - {40, &JitArm64::FallBackToInterpreter}, //"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, - {552, &JitArm64::FallBackToInterpreter}, //"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, + {40, &JitArm64::subfx}, //"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, + {552, &JitArm64::subfx}, //"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT}}, {8, &JitArm64::FallBackToInterpreter}, //"subfcx", OPTYPE_INTEGER, 
FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
 	{520, &JitArm64::FallBackToInterpreter}, //"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT}},
 	{136, &JitArm64::FallBackToInterpreter}, //"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT}},

From 602702fdcb1b7d80be59e8deeb09e1a186d4fa9f Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Wed, 7 Jan 2015 14:59:41 -0600
Subject: [PATCH 09/12] [AArch64] Implement three system register instructions.

---
 Source/Core/Core/PowerPC/JitArm64/Jit.h | 3 +
 .../JitArm64/JitArm64_SystemRegisters.cpp | 87 +++++++++++++++++++
 .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 6 +-
 3 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index d3e51ee168..bff5061432 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -108,6 +108,9 @@ public:
 	void mfsrin(UGeckoInstruction inst);
 	void mtsrin(UGeckoInstruction inst);
 	void twx(UGeckoInstruction inst);
+	void mfspr(UGeckoInstruction inst);
+	void mftb(UGeckoInstruction inst);
+	void mtspr(UGeckoInstruction inst);
 
 	// LoadStore
 	void icbi(UGeckoInstruction inst);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
index ea6ea6c46a..01df60015f 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
@@ -196,3 +196,90 @@ void JitArm64::twx(UGeckoInstruction inst)
 		WriteExit(js.compilerPC + 4);
 	}
 }
+// mfspr: loads spr[iIndex] from the PowerPC state into rD; XER, WPAR, DEC, TL and TU go through FALLBACK_IF instead.
+void JitArm64::mfspr(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITSystemRegistersOff);
+	// SPR number: SPRU supplies the upper bits, the low five bits of SPRL the rest.
+	u32 iIndex = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
+	switch (iIndex)
+	{
+	case SPR_XER:
+	case SPR_WPAR:
+	case SPR_DEC:
+	case SPR_TL:
+	case SPR_TU:
+		FALLBACK_IF(true); // presumably returns after falling back, so default is not reached from here - TODO confirm
+	default:
+		gpr.BindToRegister(inst.RD, false); // false: presumably skips loading rD's old value, which the LDR below overwrites
+		ARM64Reg RD = gpr.R(inst.RD);
+		LDR(INDEX_UNSIGNED, RD, X29, PPCSTATE_OFF(spr) + iIndex * 4); // offset of spr[iIndex], 4 bytes per entry
+		break;
+	}
+}
+// mftb: forwards to mfspr, so it is compiled by the same code path.
+void JitArm64::mftb(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITSystemRegistersOff);
+	mfspr(inst);
+}
+// mtspr: stores rD into spr[iIndex] for the SPRs that break out of the switch below.
+void JitArm64::mtspr(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITSystemRegistersOff);
+
+	u32 iIndex = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
+
+	switch (iIndex)
+	{
+	case SPR_DMAU:
+
+	case SPR_SPRG0:
+	case SPR_SPRG1:
+	case SPR_SPRG2:
+	case SPR_SPRG3:
+
+	case SPR_SRR0:
+	case SPR_SRR1:
+		// These are safe to do the easy way, see the bottom of this function.
+		break;
+
+	case SPR_LR:
+	case SPR_CTR:
+	case SPR_GQR0:
+	case SPR_GQR0 + 1:
+	case SPR_GQR0 + 2:
+	case SPR_GQR0 + 3:
+	case SPR_GQR0 + 4:
+	case SPR_GQR0 + 5:
+	case SPR_GQR0 + 6:
+	case SPR_GQR0 + 7:
+		// These are safe to do the easy way, see the bottom of this function.
+		break;
+	case SPR_XER:
+	{
+		FALLBACK_IF(true); // NOTE(review): if this macro returns, the rest of this case never runs - confirm
+		ARM64Reg RD = gpr.R(inst.RD);
+		ARM64Reg WA = gpr.GetReg();
+		ARM64Reg mask = gpr.GetReg();
+		MOVI2R(mask, 0xFF7F);
+		AND(WA, RD, mask, ArithOption(mask, ST_LSL, 0));
+		STRH(INDEX_UNSIGNED, WA, X29, PPCSTATE_OFF(xer_stringctrl));
+		UBFM(WA, RD, XER_CA_SHIFT, XER_CA_SHIFT);
+		STRB(INDEX_UNSIGNED, WA, X29, PPCSTATE_OFF(xer_ca));
+		UBFM(WA, RD, XER_OV_SHIFT, 31); // Same as WA = RD >> XER_OV_SHIFT
+		STRB(INDEX_UNSIGNED, WA, X29, PPCSTATE_OFF(xer_so_ov));
+		gpr.Unlock(WA, mask);
+	}
+	break;
+	default:
+		FALLBACK_IF(true); // every SPR that is not listed above
+	}
+
+	// OK, this is easy.
+	ARM64Reg RD = gpr.R(inst.RD);
+	STR(INDEX_UNSIGNED, RD, X29, PPCSTATE_OFF(spr) + iIndex * 4);
+}
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
index fae0a3bdc9..84721f58ec 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
@@ -272,9 +272,9 @@ static GekkoOPTemplate table31[] =
 	{146, &JitArm64::mtmsr}, //"mtmsr", OPTYPE_SYSTEM, FL_ENDBLOCK}},
 	{210, &JitArm64::mtsr}, //"mtsr", OPTYPE_SYSTEM, 0}},
 	{242, &JitArm64::mtsrin}, //"mtsrin", OPTYPE_SYSTEM, 0}},
-	{339, &JitArm64::FallBackToInterpreter}, //"mfspr", OPTYPE_SPR, FL_OUT_D}},
-	{467, &JitArm64::FallBackToInterpreter}, //"mtspr", OPTYPE_SPR, 0, 2}},
-	{371, &JitArm64::FallBackToInterpreter}, //"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER}},
+	{339, &JitArm64::mfspr}, //"mfspr", OPTYPE_SPR, FL_OUT_D}},
+	{467, &JitArm64::mtspr}, //"mtspr", OPTYPE_SPR, 0, 2}},
+	{371, &JitArm64::mftb}, //"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER}},
 	{512, &JitArm64::FallBackToInterpreter}, //"mcrxr", OPTYPE_SYSTEM, 0}},
 	{595, &JitArm64::mfsr}, //"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 2}},
 	{659, &JitArm64::mfsrin}, //"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 2}},

From 6dff4421d371d557d7831cd356c43f4958181273 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Wed, 7 Jan 2015 15:02:11 -0600
Subject: [PATCH 10/12] [AArch64] Implement 24 paired instructions.
---
 Source/Core/Core/CMakeLists.txt | 1 +
 Source/Core/Core/PowerPC/JitArm64/Jit.h | 26 +
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 513 ++++++++++++++++++
 .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 48 +-
 4 files changed, 564 insertions(+), 24 deletions(-)
 create mode 100644 Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp

diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt
index 3af53428ac..0eddc09a24 100644
--- a/Source/Core/Core/CMakeLists.txt
+++ b/Source/Core/Core/CMakeLists.txt
@@ -227,6 +227,7 @@ elseif(_M_ARM_64)
 		PowerPC/JitArm64/JitArm64_Branch.cpp
 		PowerPC/JitArm64/JitArm64_Integer.cpp
 		PowerPC/JitArm64/JitArm64_LoadStore.cpp
+		PowerPC/JitArm64/JitArm64_Paired.cpp
 		PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
 		PowerPC/JitArm64/JitArm64_Tables.cpp)
 endif()
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index bff5061432..4e19f3b2e4 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -117,6 +117,32 @@ public:
 	void lXX(UGeckoInstruction inst);
 	void stX(UGeckoInstruction inst);
 
+	// Paired
+	void ps_abs(UGeckoInstruction inst);
+	void ps_add(UGeckoInstruction inst);
+	void ps_div(UGeckoInstruction inst);
+	void ps_madd(UGeckoInstruction inst);
+	void ps_madds0(UGeckoInstruction inst);
+	void ps_madds1(UGeckoInstruction inst);
+	void ps_merge00(UGeckoInstruction inst);
+	void ps_merge01(UGeckoInstruction inst);
+	void ps_merge10(UGeckoInstruction inst);
+	void ps_merge11(UGeckoInstruction inst);
+	void ps_mr(UGeckoInstruction inst);
+	void ps_msub(UGeckoInstruction inst);
+	void ps_mul(UGeckoInstruction inst);
+	void ps_muls0(UGeckoInstruction inst);
+	void ps_muls1(UGeckoInstruction inst);
+	void ps_nabs(UGeckoInstruction inst);
+	void ps_nmadd(UGeckoInstruction inst);
+	void ps_nmsub(UGeckoInstruction inst);
+	void ps_neg(UGeckoInstruction inst);
+	void ps_res(UGeckoInstruction inst);
+	void ps_sel(UGeckoInstruction inst);
+	void ps_sub(UGeckoInstruction inst);
+	void ps_sum0(UGeckoInstruction inst);
+	void ps_sum1(UGeckoInstruction inst);
+
 private:
 	Arm64GPRCache gpr;
 	Arm64FPRCache fpr;
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
new file mode 100644
index 0000000000..e90ee63c64
--- /dev/null
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -0,0 +1,513 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "Common/Arm64Emitter.h"
+#include "Common/Common.h"
+#include "Common/StringUtil.h"
+
+#include "Core/Core.h"
+#include "Core/CoreTiming.h"
+#include "Core/PowerPC/PowerPC.h"
+#include "Core/PowerPC/PPCTables.h"
+#include "Core/PowerPC/JitArm64/Jit.h"
+#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
+#include "Core/PowerPC/JitArm64/JitAsm.h"
+
+using namespace Arm64Gen;
+
+// ps_abs: fD = |fB| for both lanes. Record (Rc) forms go to the interpreter.
+void JitArm64::ps_abs(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == b);
+
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.FABS(64, VD, VB);
+}
+
+// ps_add: fD = fA + fB for both lanes.
+void JitArm64::ps_add(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.FADD(64, VD, VA, VB);
+}
+
+// ps_div: fD = fA / fB for both lanes.
+void JitArm64::ps_div(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.FDIV(64, VD, VA, VB);
+}
+
+// ps_madd: fD = fA * fC + fB for both lanes, emitted as a separate multiply and add.
+void JitArm64::ps_madd(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FADD(64, VD, V0, VB);
+
+	fpr.Unlock(V0);
+}
+
+// ps_madds0: fD = fA * fC[0] + fB; lane 0 of fC is duplicated into both lanes first.
+void JitArm64::ps_madds0(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.DUP(64, V0, VC, 0);
+	m_float_emit.FMUL(64, V0, V0, VA);
+	m_float_emit.FADD(64, VD, V0, VB);
+
+	fpr.Unlock(V0);
+}
+
+// ps_madds1: fD = fA * fC[1] + fB; lane 1 of fC is duplicated into both lanes first.
+void JitArm64::ps_madds1(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.DUP(64, V0, VC, 1);
+	m_float_emit.FMUL(64, V0, V0, VA);
+	m_float_emit.FADD(64, VD, V0, VB);
+
+	fpr.Unlock(V0);
+}
+
+// ps_merge00: fD = {fA[0], fB[0]}.
+void JitArm64::ps_merge00(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.TRN1(64, VD, VA, VB);
+}
+
+// ps_merge01: fD = {fA[0], fB[1]}.
+void JitArm64::ps_merge01(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.INS(64, VD, 0, VA, 0);
+	m_float_emit.INS(64, VD, 1, VB, 1);
+}
+
+// ps_merge10: fD = {fA[1], fB[0]}; built in a scratch register when fD is also a source.
+void JitArm64::ps_merge10(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	if (d != a && d != b)
+	{
+		m_float_emit.INS(64, VD, 0, VA, 1);
+		m_float_emit.INS(64, VD, 1, VB, 0);
+	}
+	else
+	{
+		ARM64Reg V0 = fpr.GetReg();
+		m_float_emit.INS(64, V0, 0, VA, 1);
+		m_float_emit.INS(64, V0, 1, VB, 0);
+		m_float_emit.ORR(VD, V0, V0);
+		fpr.Unlock(V0);
+	}
+}
+
+// ps_merge11: fD = {fA[1], fB[1]}.
+void JitArm64::ps_merge11(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.TRN2(64, VD, VA, VB);
+}
+
+// ps_mr: fD = fB; nothing is emitted when both name the same register.
+void JitArm64::ps_mr(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 b = inst.FB, d = inst.FD;
+
+	if (d == b)
+		return;
+
+	fpr.BindToRegister(d, false);
+
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.ORR(VD, VB, VB);
+}
+
+// ps_mul: fD = fA * fC for both lanes.
+void JitArm64::ps_mul(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.FMUL(64, VD, VA, VC);
+}
+
+// ps_muls0: fD = fA * fC[0] for both lanes.
+void JitArm64::ps_muls0(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.DUP(64, V0, VC, 0);
+	m_float_emit.FMUL(64, VD, VA, V0);
+	fpr.Unlock(V0);
+}
+
+// ps_muls1: fD = fA * fC[1] for both lanes.
+void JitArm64::ps_muls1(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.DUP(64, V0, VC, 1);
+	m_float_emit.FMUL(64, VD, VA, V0);
+	fpr.Unlock(V0);
+}
+
+// ps_msub: fD = fA * fC - fB for both lanes.
+void JitArm64::ps_msub(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FSUB(64, VD, V0, VB);
+
+	fpr.Unlock(V0);
+}
+
+// ps_nabs: fD = -|fB| for both lanes.
+void JitArm64::ps_nabs(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == b);
+
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.FABS(64, VD, VB);
+	m_float_emit.FNEG(64, VD, VD);
+}
+
+// ps_neg: fD = -fB for both lanes.
+void JitArm64::ps_neg(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == b);
+
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.FNEG(64, VD, VB);
+}
+
+// ps_nmadd: fD = -(fA * fC + fB) for both lanes.
+void JitArm64::ps_nmadd(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FADD(64, VD, V0, VB);
+	m_float_emit.FNEG(64, VD, VD);
+
+	fpr.Unlock(V0);
+}
+
+// ps_nmsub: fD = -(fA * fC - fB) for both lanes.
+void JitArm64::ps_nmsub(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FSUB(64, VD, V0, VB);
+	m_float_emit.FNEG(64, VD, VD);
+
+	fpr.Unlock(V0);
+}
+
+// ps_res: fD = estimate of 1 / fB for both lanes.
+// Always handled by the interpreter: FRSQRTE is a reciprocal *square root*
+// estimate, so it cannot be used here; a native version needs a reciprocal estimate.
+void JitArm64::ps_res(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(true);
+}
+
+// ps_sel: per lane, fD = (fA >= 0) ? fC : fB.
+void JitArm64::ps_sel(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+
+	if (d != a && d != b && d != c)
+	{
+		m_float_emit.FCMGE(64, VD, VA);
+		m_float_emit.BSL(VD, VC, VB);
+	}
+	else
+	{
+		ARM64Reg V0 = fpr.GetReg();
+		m_float_emit.FCMGE(64, V0, VA);
+		m_float_emit.BSL(V0, VC, VB);
+		m_float_emit.ORR(VD, V0, V0);
+		fpr.Unlock(V0);
+	}
+}
+
+// ps_sub: fD = fA - fB for both lanes.
+void JitArm64::ps_sub(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	m_float_emit.FSUB(64, VD, VA, VB);
+}
+
+// ps_sum0: fD = {fA[0] + fB[1], fC[1]}; when fD is fC only lane 0 is rewritten.
+void JitArm64::ps_sum0(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.DUP(64, V0, VB, 1);
+	if (d != c)
+	{
+		m_float_emit.FADD(64, VD, V0, VA);
+		m_float_emit.INS(64, VD, 1, VC, 1);
+	}
+	else
+	{
+		m_float_emit.FADD(64, V0, V0, VA);
+		m_float_emit.INS(64, VD, 0, V0, 0);
+	}
+
+	fpr.Unlock(V0);
+}
+
+// ps_sum1: fD = {fC[0], fA[0] + fB[1]}; when fD is fC only lane 1 is rewritten.
+void JitArm64::ps_sum1(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITPairedOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.DUP(64, V0, VA, 0);
+	if (d != c)
+	{
+		m_float_emit.FADD(64, VD, V0, VB);
+		m_float_emit.INS(64, VD, 0, VC, 0);
+	}
+	else
+	{
+		m_float_emit.FADD(64, V0, V0, VB);
+		m_float_emit.INS(64, VD, 1, V0, 1);
+	}
+
+	fpr.Unlock(V0);
+}
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
index 84721f58ec..eb0d5fb141 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
@@ -114,39 +114,39 @@ static GekkoOPTemplate table4[] =
 {	//SUBOP10
 	{0, &JitArm64::FallBackToInterpreter}, //"ps_cmpu0", OPTYPE_PS, FL_SET_CRn}},
 	{32, &JitArm64::FallBackToInterpreter}, //"ps_cmpo0", OPTYPE_PS, FL_SET_CRn}},
-	{40, &JitArm64::FallBackToInterpreter}, //"ps_neg", OPTYPE_PS, FL_RC_BIT}},
-	{136, &JitArm64::FallBackToInterpreter}, //"ps_nabs", OPTYPE_PS, FL_RC_BIT}},
-	{264, &JitArm64::FallBackToInterpreter}, //"ps_abs", OPTYPE_PS, FL_RC_BIT}},
+	{40, &JitArm64::ps_neg}, //"ps_neg", OPTYPE_PS, FL_RC_BIT}},
+	{136, &JitArm64::ps_nabs}, //"ps_nabs", OPTYPE_PS,
FL_RC_BIT}}, + {264, &JitArm64::ps_abs}, //"ps_abs", OPTYPE_PS, FL_RC_BIT}}, {64, &JitArm64::FallBackToInterpreter}, //"ps_cmpu1", OPTYPE_PS, FL_RC_BIT}}, - {72, &JitArm64::FallBackToInterpreter}, //"ps_mr", OPTYPE_PS, FL_RC_BIT}}, + {72, &JitArm64::ps_mr}, //"ps_mr", OPTYPE_PS, FL_RC_BIT}}, {96, &JitArm64::FallBackToInterpreter}, //"ps_cmpo1", OPTYPE_PS, FL_RC_BIT}}, - {528, &JitArm64::FallBackToInterpreter}, //"ps_merge00", OPTYPE_PS, FL_RC_BIT}}, - {560, &JitArm64::FallBackToInterpreter}, //"ps_merge01", OPTYPE_PS, FL_RC_BIT}}, - {592, &JitArm64::FallBackToInterpreter}, //"ps_merge10", OPTYPE_PS, FL_RC_BIT}}, - {624, &JitArm64::FallBackToInterpreter}, //"ps_merge11", OPTYPE_PS, FL_RC_BIT}}, + {528, &JitArm64::ps_merge00}, //"ps_merge00", OPTYPE_PS, FL_RC_BIT}}, + {560, &JitArm64::ps_merge01}, //"ps_merge01", OPTYPE_PS, FL_RC_BIT}}, + {592, &JitArm64::ps_merge10}, //"ps_merge10", OPTYPE_PS, FL_RC_BIT}}, + {624, &JitArm64::ps_merge11}, //"ps_merge11", OPTYPE_PS, FL_RC_BIT}}, {1014, &JitArm64::FallBackToInterpreter}, //"dcbz_l", OPTYPE_SYSTEM, 0}}, }; static GekkoOPTemplate table4_2[] = { - {10, &JitArm64::FallBackToInterpreter}, //"ps_sum0", OPTYPE_PS, 0}}, - {11, &JitArm64::FallBackToInterpreter}, //"ps_sum1", OPTYPE_PS, 0}}, - {12, &JitArm64::FallBackToInterpreter}, //"ps_muls0", OPTYPE_PS, 0}}, - {13, &JitArm64::FallBackToInterpreter}, //"ps_muls1", OPTYPE_PS, 0}}, - {14, &JitArm64::FallBackToInterpreter}, //"ps_madds0", OPTYPE_PS, 0}}, - {15, &JitArm64::FallBackToInterpreter}, //"ps_madds1", OPTYPE_PS, 0}}, - {18, &JitArm64::FallBackToInterpreter}, //"ps_div", OPTYPE_PS, 0, 16}}, - {20, &JitArm64::FallBackToInterpreter}, //"ps_sub", OPTYPE_PS, 0}}, - {21, &JitArm64::FallBackToInterpreter}, //"ps_add", OPTYPE_PS, 0}}, - {23, &JitArm64::FallBackToInterpreter}, //"ps_sel", OPTYPE_PS, 0}}, - {24, &JitArm64::FallBackToInterpreter}, //"ps_res", OPTYPE_PS, 0}}, - {25, &JitArm64::FallBackToInterpreter}, //"ps_mul", OPTYPE_PS, 0}}, + {10, &JitArm64::ps_sum0}, 
//"ps_sum0", OPTYPE_PS, 0}}, + {11, &JitArm64::ps_sum1}, //"ps_sum1", OPTYPE_PS, 0}}, + {12, &JitArm64::ps_muls0}, //"ps_muls0", OPTYPE_PS, 0}}, + {13, &JitArm64::ps_muls1}, //"ps_muls1", OPTYPE_PS, 0}}, + {14, &JitArm64::ps_madds0}, //"ps_madds0", OPTYPE_PS, 0}}, + {15, &JitArm64::ps_madds1}, //"ps_madds1", OPTYPE_PS, 0}}, + {18, &JitArm64::ps_div}, //"ps_div", OPTYPE_PS, 0, 16}}, + {20, &JitArm64::ps_sub}, //"ps_sub", OPTYPE_PS, 0}}, + {21, &JitArm64::ps_add}, //"ps_add", OPTYPE_PS, 0}}, + {23, &JitArm64::ps_sel}, //"ps_sel", OPTYPE_PS, 0}}, + {24, &JitArm64::ps_res}, //"ps_res", OPTYPE_PS, 0}}, + {25, &JitArm64::ps_mul}, //"ps_mul", OPTYPE_PS, 0}}, {26, &JitArm64::FallBackToInterpreter}, //"ps_rsqrte", OPTYPE_PS, 0, 1}}, - {28, &JitArm64::FallBackToInterpreter}, //"ps_msub", OPTYPE_PS, 0}}, - {29, &JitArm64::FallBackToInterpreter}, //"ps_madd", OPTYPE_PS, 0}}, - {30, &JitArm64::FallBackToInterpreter}, //"ps_nmsub", OPTYPE_PS, 0}}, - {31, &JitArm64::FallBackToInterpreter}, //"ps_nmadd", OPTYPE_PS, 0}}, + {28, &JitArm64::ps_msub}, //"ps_msub", OPTYPE_PS, 0}}, + {29, &JitArm64::ps_madd}, //"ps_madd", OPTYPE_PS, 0}}, + {30, &JitArm64::ps_nmsub}, //"ps_nmsub", OPTYPE_PS, 0}}, + {31, &JitArm64::ps_nmadd}, //"ps_nmadd", OPTYPE_PS, 0}}, }; From 7370473eb33d5f30ded2977966707c3283c80c6f Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 15:04:26 -0600 Subject: [PATCH 11/12] [AArch64] Implement 19 floating point instructions --- Source/Core/Core/CMakeLists.txt | 1 + Source/Core/Core/PowerPC/JitArm64/Jit.h | 21 + .../JitArm64/JitArm64_FloatingPoint.cpp | 376 ++++++++++++++++++ .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 38 +- 4 files changed, 417 insertions(+), 19 deletions(-) create mode 100644 Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt index 0eddc09a24..61462124dd 100644 --- a/Source/Core/Core/CMakeLists.txt +++ b/Source/Core/Core/CMakeLists.txt @@ -225,6 
+225,7 @@ elseif(_M_ARM_64) PowerPC/JitArm64/JitArm64_RegCache.cpp PowerPC/JitArm64/JitArm64_BackPatch.cpp PowerPC/JitArm64/JitArm64_Branch.cpp + PowerPC/JitArm64/JitArm64_FloatingPoint.cpp PowerPC/JitArm64/JitArm64_Integer.cpp PowerPC/JitArm64/JitArm64_LoadStore.cpp PowerPC/JitArm64/JitArm64_Paired.cpp diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 4e19f3b2e4..c979633ae2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -117,6 +117,27 @@ public: void lXX(UGeckoInstruction inst); void stX(UGeckoInstruction inst); + // Floating point + void fabsx(UGeckoInstruction inst); + void faddsx(UGeckoInstruction inst); + void faddx(UGeckoInstruction inst); + void fmaddsx(UGeckoInstruction inst); + void fmaddx(UGeckoInstruction inst); + void fmrx(UGeckoInstruction inst); + void fmsubsx(UGeckoInstruction inst); + void fmsubx(UGeckoInstruction inst); + void fmulsx(UGeckoInstruction inst); + void fmulx(UGeckoInstruction inst); + void fnabsx(UGeckoInstruction inst); + void fnegx(UGeckoInstruction inst); + void fnmaddsx(UGeckoInstruction inst); + void fnmaddx(UGeckoInstruction inst); + void fnmsubsx(UGeckoInstruction inst); + void fnmsubx(UGeckoInstruction inst); + void fselx(UGeckoInstruction inst); + void fsubsx(UGeckoInstruction inst); + void fsubx(UGeckoInstruction inst); + // Paired void ps_abs(UGeckoInstruction inst); void ps_add(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp new file mode 100644 index 0000000000..a670edccc8 --- /dev/null +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -0,0 +1,376 @@ +// Copyright 2014 Dolphin Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. 
+
+#include "Common/Arm64Emitter.h"
+#include "Common/Common.h"
+#include "Common/StringUtil.h"
+
+#include "Core/Core.h"
+#include "Core/CoreTiming.h"
+#include "Core/PowerPC/PowerPC.h"
+#include "Core/PowerPC/PPCTables.h"
+#include "Core/PowerPC/JitArm64/Jit.h"
+#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
+#include "Core/PowerPC/JitArm64/JitAsm.h"
+
+using namespace Arm64Gen;
+
+void JitArm64::fabsx(UGeckoInstruction inst) // Emits VD[0] = |VB[0]|; element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, true); // always load FD: INS below rewrites element 0 only, element 1 must hold the guest value
+	ARM64Reg VB = fpr.R(inst.FB);
+	ARM64Reg VD = fpr.R(inst.FD);
+	ARM64Reg V0 = fpr.GetReg(); // scratch register, released by fpr.Unlock(V0)
+
+	m_float_emit.FABS(64, V0, VB);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+
+	fpr.Unlock(V0);
+}
+
+void JitArm64::faddsx(UGeckoInstruction inst) // Emits VD = VA + VB, then copies element 0 of the result into element 1.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, inst.FD == inst.FA || inst.FD == inst.FB); // both elements are overwritten, so a load is only needed when FD is a source
+	ARM64Reg VA = fpr.R(inst.FA);
+	ARM64Reg VB = fpr.R(inst.FB);
+	ARM64Reg VD = fpr.R(inst.FD);
+
+	m_float_emit.FADD(64, VD, VA, VB);
+	m_float_emit.INS(64, VD, 1, VD, 0);
+}
+
+void JitArm64::faddx(UGeckoInstruction inst) // Emits VD[0] = VA[0] + VB[0]; element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, true); // always load FD: only element 0 is rewritten below
+	ARM64Reg VA = fpr.R(inst.FA);
+	ARM64Reg VB = fpr.R(inst.FB);
+	ARM64Reg VD = fpr.R(inst.FD);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FADD(64, V0, VA, VB);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fmaddsx(UGeckoInstruction inst) // Emits VA * VC + VB (separate FMUL and FADD, not fused) and duplicates element 0 into both elements of VD.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c); // DUP overwrites all of VD, so a load is only needed when FD is a source
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FADD(64, V0, V0, VB);
+	m_float_emit.DUP(64, VD, V0, 0);
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fmaddx(UGeckoInstruction inst) // Emits VD[0] = VA[0] * VC[0] + VB[0] (not fused); element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, true); // always load FD: only element 0 is rewritten below
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FADD(64, V0, V0, VB);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fmrx(UGeckoInstruction inst) // Emits VD[0] = VB[0]; element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, true); // always load FD: only element 0 is rewritten below
+	ARM64Reg VB = fpr.R(inst.FB);
+	ARM64Reg VD = fpr.R(inst.FD);
+
+	m_float_emit.INS(64, VD, 0, VB, 0);
+}
+
+void JitArm64::fmsubsx(UGeckoInstruction inst) // Emits VA * VC - VB (not fused) and duplicates element 0 into both elements of VD.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FSUB(64, V0, V0, VB);
+	m_float_emit.DUP(64, VD, V0, 0);
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fmsubx(UGeckoInstruction inst) // Emits VD[0] = VA[0] * VC[0] - VB[0] (not fused); element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, true); // always load FD: only element 0 is rewritten below
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FSUB(64, V0, V0, VB);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fmulsx(UGeckoInstruction inst) // Emits VD = VA * VC, then copies element 0 of the result into element 1.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, inst.FD == inst.FA || inst.FD == inst.FC);
+	ARM64Reg VA = fpr.R(inst.FA);
+	ARM64Reg VC = fpr.R(inst.FC);
+	ARM64Reg VD = fpr.R(inst.FD);
+
+	m_float_emit.FMUL(64, VD, VA, VC);
+	m_float_emit.INS(64, VD, 1, VD, 0);
+}
+
+void JitArm64::fmulx(UGeckoInstruction inst) // Emits VD[0] = VA[0] * VC[0]; element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, true); // always load FD: only element 0 is rewritten below
+	ARM64Reg VA = fpr.R(inst.FA);
+	ARM64Reg VC = fpr.R(inst.FC);
+	ARM64Reg VD = fpr.R(inst.FD);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fnabsx(UGeckoInstruction inst) // Emits VD[0] = -|VB[0]|; element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, true); // always load FD: only element 0 is rewritten below
+	ARM64Reg VB = fpr.R(inst.FB);
+	ARM64Reg VD = fpr.R(inst.FD);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FABS(64, V0, VB);
+	m_float_emit.FNEG(64, V0, V0);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fnegx(UGeckoInstruction inst) // Emits VD[0] = -VB[0]; element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, true); // always load FD: only element 0 is rewritten below
+	ARM64Reg VB = fpr.R(inst.FB);
+	ARM64Reg VD = fpr.R(inst.FD);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FNEG(64, V0, VB);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fnmaddsx(UGeckoInstruction inst) // Emits -(VA * VC + VB) (not fused) and duplicates element 0 into both elements of VD.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FADD(64, V0, V0, VB);
+	m_float_emit.FNEG(64, V0, V0);
+	m_float_emit.DUP(64, VD, V0, 0);
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fnmaddx(UGeckoInstruction inst) // Emits VD[0] = -(VA[0] * VC[0] + VB[0]) (not fused); element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, true); // always load FD: only element 0 is rewritten below
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FADD(64, V0, V0, VB);
+	m_float_emit.FNEG(64, V0, V0);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fnmsubsx(UGeckoInstruction inst) // Emits -(VA * VC - VB) (not fused) and duplicates element 0 into both elements of VD.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, d == a || d == b || d == c);
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FSUB(64, V0, V0, VB);
+	m_float_emit.FNEG(64, V0, V0);
+	m_float_emit.DUP(64, VD, V0, 0);
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fnmsubx(UGeckoInstruction inst) // Emits VD[0] = -(VA[0] * VC[0] - VB[0]) (not fused); element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
+	fpr.BindToRegister(d, true); // always load FD: only element 0 is rewritten below
+
+	ARM64Reg VA = fpr.R(a);
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VC = fpr.R(c);
+	ARM64Reg VD = fpr.R(d);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FMUL(64, V0, VA, VC);
+	m_float_emit.FSUB(64, V0, V0, VB);
+	m_float_emit.FNEG(64, V0, V0);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fselx(UGeckoInstruction inst) // Emits VD[0] = (VA compared against zero is GE) ? VC[0] : VB[0]; element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+	// FD is always loaded: the INS at the end rewrites element 0 only,
+	// so element 1 of the host register has to hold the guest value
+	// even when FD is not one of the source operands.
+	fpr.BindToRegister(inst.FD, true);
+
+	ARM64Reg V0 = fpr.GetReg();
+	ARM64Reg VD = fpr.R(inst.FD);
+	ARM64Reg VA = fpr.R(inst.FA);
+	ARM64Reg VB = fpr.R(inst.FB);
+	ARM64Reg VC = fpr.R(inst.FC); // FC names a floating-point register, so it comes from the FPR cache (was gpr.R)
+
+	m_float_emit.FCMPE(VA);
+	m_float_emit.FCSEL(V0, VC, VB, CC_GE);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+
+	fpr.Unlock(V0);
+}
+
+void JitArm64::fsubsx(UGeckoInstruction inst) // Emits VD = VA - VB, then copies element 0 of the result into element 1.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, inst.FD == inst.FA || inst.FD == inst.FB);
+	ARM64Reg VA = fpr.R(inst.FA);
+	ARM64Reg VB = fpr.R(inst.FB);
+	ARM64Reg VD = fpr.R(inst.FD);
+
+	m_float_emit.FSUB(64, VD, VA, VB);
+	m_float_emit.INS(64, VD, 1, VD, 0);
+}
+
+void JitArm64::fsubx(UGeckoInstruction inst) // Emits VD[0] = VA[0] - VB[0]; element 1 of VD is left as it was.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	fpr.BindToRegister(inst.FD, true); // always load FD: only element 0 is rewritten below
+	ARM64Reg VA = fpr.R(inst.FA);
+	ARM64Reg VB = fpr.R(inst.FB);
+	ARM64Reg VD = fpr.R(inst.FD);
+	ARM64Reg V0 = fpr.GetReg();
+
+	m_float_emit.FSUB(64, V0, VA, VB);
+	m_float_emit.INS(64, VD, 0, V0, 0);
+
+	fpr.Unlock(V0);
+}
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
index eb0d5fb141..3d67cb4d03 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
@@ -323,27 +323,27 @@ static GekkoOPTemplate table31_2[] =
 static GekkoOPTemplate table59[] =
 {
 	{18, &JitArm64::FallBackToInterpreter}, //{"fdivsx", OPTYPE_FPU, FL_RC_BIT_F, 16}},
-	{20, &JitArm64::FallBackToInterpreter}, //"fsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
-	{21, &JitArm64::FallBackToInterpreter}, //"faddsx", OPTYPE_FPU, FL_RC_BIT_F}},
+	{20, &JitArm64::fsubsx}, //"fsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
+	{21, &JitArm64::faddsx}, //"faddsx", OPTYPE_FPU, FL_RC_BIT_F}},
 //	{22, &JitArm64::FallBackToInterpreter}, //"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F}},
 	{24, &JitArm64::FallBackToInterpreter}, //"fresx", OPTYPE_FPU,
FL_RC_BIT_F}}, - {25, &JitArm64::FallBackToInterpreter}, //"fmulsx", OPTYPE_FPU, FL_RC_BIT_F}}, - {28, &JitArm64::FallBackToInterpreter}, //"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, - {29, &JitArm64::FallBackToInterpreter}, //"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F}}, - {30, &JitArm64::FallBackToInterpreter}, //"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, - {31, &JitArm64::FallBackToInterpreter}, //"fnmaddsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {25, &JitArm64::fmulsx}, //"fmulsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {28, &JitArm64::fmsubsx}, //"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {29, &JitArm64::fmaddsx}, //"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {30, &JitArm64::fnmsubsx}, //"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {31, &JitArm64::fnmaddsx}, //"fnmaddsx", OPTYPE_FPU, FL_RC_BIT_F}}, }; static GekkoOPTemplate table63[] = { - {264, &JitArm64::FallBackToInterpreter}, //"fabsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {264, &JitArm64::fabsx}, //"fabsx", OPTYPE_FPU, FL_RC_BIT_F}}, {32, &JitArm64::FallBackToInterpreter}, //"fcmpo", OPTYPE_FPU, FL_RC_BIT_F}}, {0, &JitArm64::FallBackToInterpreter}, //"fcmpu", OPTYPE_FPU, FL_RC_BIT_F}}, {14, &JitArm64::FallBackToInterpreter}, //"fctiwx", OPTYPE_FPU, FL_RC_BIT_F}}, {15, &JitArm64::FallBackToInterpreter}, //"fctiwzx", OPTYPE_FPU, FL_RC_BIT_F}}, - {72, &JitArm64::FallBackToInterpreter}, //"fmrx", OPTYPE_FPU, FL_RC_BIT_F}}, - {136, &JitArm64::FallBackToInterpreter}, //"fnabsx", OPTYPE_FPU, FL_RC_BIT_F}}, - {40, &JitArm64::FallBackToInterpreter}, //"fnegx", OPTYPE_FPU, FL_RC_BIT_F}}, + {72, &JitArm64::fmrx}, //"fmrx", OPTYPE_FPU, FL_RC_BIT_F}}, + {136, &JitArm64::fnabsx}, //"fnabsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {40, &JitArm64::fnegx}, //"fnegx", OPTYPE_FPU, FL_RC_BIT_F}}, {12, &JitArm64::FallBackToInterpreter}, //"frspx", OPTYPE_FPU, FL_RC_BIT_F}}, {64, &JitArm64::FallBackToInterpreter}, //"mcrfs", OPTYPE_SYSTEMFP, 0}}, @@ -357,16 +357,16 @@ static GekkoOPTemplate table63[] = static GekkoOPTemplate table63_2[] = { {18, &JitArm64::FallBackToInterpreter}, //"fdivx", 
OPTYPE_FPU, FL_RC_BIT_F, 30}}, - {20, &JitArm64::FallBackToInterpreter}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}}, - {21, &JitArm64::FallBackToInterpreter}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}}, + {20, &JitArm64::fsubx}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}}, + {21, &JitArm64::faddx}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}}, {22, &JitArm64::FallBackToInterpreter}, //"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F}}, - {23, &JitArm64::FallBackToInterpreter}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}}, - {25, &JitArm64::FallBackToInterpreter}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}}, + {23, &JitArm64::fselx}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}}, + {25, &JitArm64::fmulx}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}}, {26, &JitArm64::FallBackToInterpreter}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}}, - {28, &JitArm64::FallBackToInterpreter}, //"fmsubx", OPTYPE_FPU, FL_RC_BIT_F}}, - {29, &JitArm64::FallBackToInterpreter}, //"fmaddx", OPTYPE_FPU, FL_RC_BIT_F}}, - {30, &JitArm64::FallBackToInterpreter}, //"fnmsubx", OPTYPE_FPU, FL_RC_BIT_F}}, - {31, &JitArm64::FallBackToInterpreter}, //"fnmaddx", OPTYPE_FPU, FL_RC_BIT_F}}, + {28, &JitArm64::fmsubx}, //"fmsubx", OPTYPE_FPU, FL_RC_BIT_F}}, + {29, &JitArm64::fmaddx}, //"fmaddx", OPTYPE_FPU, FL_RC_BIT_F}}, + {30, &JitArm64::fnmsubx}, //"fnmsubx", OPTYPE_FPU, FL_RC_BIT_F}}, + {31, &JitArm64::fnmaddx}, //"fnmaddx", OPTYPE_FPU, FL_RC_BIT_F}}, }; From 5a28883f9ea65136f9747009d01b5a57e04d4e51 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 7 Jan 2015 15:09:07 -0600 Subject: [PATCH 12/12] [AArch64] Implements 15 floating loadstores. 
---
 Source/Core/Core/CMakeLists.txt | 1 +
 Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 16 +
 Source/Core/Core/PowerPC/JitArm64/Jit.h | 4 +
 .../JitArm64/JitArm64_LoadStoreFloating.cpp | 395 ++++++++++++++++++
 .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp | 32 +-
 5 files changed, 432 insertions(+), 16 deletions(-)
 create mode 100644 Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp

diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt
index 61462124dd..7cbf4509f6 100644
--- a/Source/Core/Core/CMakeLists.txt
+++ b/Source/Core/Core/CMakeLists.txt
@@ -228,6 +228,7 @@ elseif(_M_ARM_64)
 		PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
 		PowerPC/JitArm64/JitArm64_Integer.cpp
 		PowerPC/JitArm64/JitArm64_LoadStore.cpp
+		PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
 		PowerPC/JitArm64/JitArm64_Paired.cpp
 		PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
 		PowerPC/JitArm64/JitArm64_Tables.cpp)
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index 6c44411349..29cbb62e1d 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -16,6 +16,7 @@ void JitArm64::Init()
 {
 	AllocCodeSpace(CODE_SIZE);
 	jo.enableBlocklink = true;
+	jo.optimizeGatherPipe = true;
 	gpr.Init(this);
 	fpr.Init(this);
 
@@ -289,6 +290,21 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
 			js.next_compilerPC = ops[i + 1].address;
 		}
 
+		if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
+		{
+			js.fifoBytesThisBlock -= 32;
+
+			gpr.Lock(W30);
+			BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
+			regs_in_use[W30] = 0;
+
+			ABI_PushRegisters(regs_in_use);
+			MOVI2R(X30, (u64)&GPFifo::CheckGatherPipe);
+			BLR(X30);
+			ABI_PopRegisters(regs_in_use);
+			gpr.Unlock(W30);
+		}
+
+		if (!ops[i].skip)
 		{
 			if (js.memcheck && (opinfo->flags & FL_USE_FPU))
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index c979633ae2..94e9e945eb 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -117,6 +117,10 @@ public:
 	void lXX(UGeckoInstruction inst);
 	void stX(UGeckoInstruction inst);
 
+	// LoadStore floating point
+	void lfXX(UGeckoInstruction inst);
+	void stfXX(UGeckoInstruction inst);
+
 	// Floating point
 	void fabsx(UGeckoInstruction inst);
 	void faddsx(UGeckoInstruction inst);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
new file mode 100644
index 0000000000..49c40a905a
--- /dev/null
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
@@ -0,0 +1,395 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "Common/Arm64Emitter.h"
+#include "Common/Common.h"
+
+#include "Core/Core.h"
+#include "Core/CoreTiming.h"
+#include "Core/PowerPC/PowerPC.h"
+#include "Core/PowerPC/PPCTables.h"
+#include "Core/PowerPC/JitArm64/Jit.h"
+#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
+#include "Core/PowerPC/JitArm64/JitAsm.h"
+
+using namespace Arm64Gen;
+
+void JitArm64::lfXX(UGeckoInstruction inst) // Emits one floating-point load (lfs/lfd, plain, update and indexed forms) into FD through EmitBackpatchRoutine.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITLoadStoreFloatingOff);
+
+	u32 a = inst.RA, b = inst.RB;
+
+	s32 offset = inst.SIMM_16;
+	u32 flags = BackPatchInfo::FLAG_LOAD;
+	bool update = false; // set for the "u" forms, which copy the computed address back into RA
+	s32 offset_reg = -1; // -1: address is RA + SIMM_16; otherwise the guest register used as index
+
+	switch (inst.OPCD)
+	{
+		case 31:
+			switch (inst.SUBOP10)
+			{
+				case 567: // lfsux
+					flags |= BackPatchInfo::FLAG_SIZE_F32;
+					update = true;
+					offset_reg = b;
+				break;
+				case 535: // lfsx
+					flags |= BackPatchInfo::FLAG_SIZE_F32;
+					offset_reg = b;
+				break;
+				case 631: // lfdux
+					flags |= BackPatchInfo::FLAG_SIZE_F64;
+					update = true;
+					offset_reg = b;
+				break;
+				case 599: // lfdx
+					flags |= BackPatchInfo::FLAG_SIZE_F64;
+					offset_reg = b;
+				break;
+			}
+		break;
+		case 49: // lfsu
+			flags |= BackPatchInfo::FLAG_SIZE_F32;
+			update = true;
+		break;
+		case 48: // lfs
+			flags |= BackPatchInfo::FLAG_SIZE_F32;
+		break;
+		case 51: // lfdu
+			flags |= BackPatchInfo::FLAG_SIZE_F64;
+			update = true;
+		break;
+		case 50: // lfd
+			flags |= BackPatchInfo::FLAG_SIZE_F64;
+		break;
+	}
+
+	u32 imm_addr = 0;
+	bool is_immediate = false; // true when the address is known at JIT time and stored in imm_addr
+
+	ARM64Reg VD = fpr.R(inst.FD);
+	ARM64Reg addr_reg = W0;
+
+	gpr.Lock(W0, W30);
+	fpr.Lock(Q0);
+
+	if (update)
+	{
+		// Always uses RA
+		if (gpr.IsImm(a) && offset_reg == -1)
+		{
+			is_immediate = true;
+			imm_addr = offset + gpr.GetImm(a);
+		}
+		else if (gpr.IsImm(a) && offset_reg != -1 && gpr.IsImm(offset_reg))
+		{
+			is_immediate = true;
+			imm_addr = gpr.GetImm(a) + gpr.GetImm(offset_reg);
+		}
+		else
+		{
+			if (offset_reg == -1)
+			{
+				MOVI2R(addr_reg, offset);
+				ADD(addr_reg, addr_reg, gpr.R(a));
+			}
+			else
+			{
+				ADD(addr_reg, gpr.R(offset_reg), gpr.R(a));
+			}
+		}
+	}
+	else
+	{
+		if (offset_reg == -1)
+		{
+			if (a && gpr.IsImm(a))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(a) + offset;
+			}
+			else if (a)
+			{
+				MOVI2R(addr_reg, offset);
+				ADD(addr_reg, addr_reg, gpr.R(a));
+			}
+			else
+			{
+				is_immediate = true; // RA field of 0 in a non-update form: the offset alone is the address
+				imm_addr = offset;
+			}
+		}
+		else
+		{
+			if (a && gpr.IsImm(a) && gpr.IsImm(offset_reg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(a) + gpr.GetImm(offset_reg);
+			}
+			else if (!a && gpr.IsImm(offset_reg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(offset_reg);
+			}
+			else if (a)
+			{
+				ADD(addr_reg, gpr.R(a), gpr.R(offset_reg));
+			}
+			else
+			{
+				MOV(addr_reg, gpr.R(offset_reg));
+			}
+		}
+	}
+
+	ARM64Reg XA = EncodeRegTo64(addr_reg);
+
+	if (is_immediate)
+		MOVI2R(XA, imm_addr);
+
+	if (update)
+		MOV(gpr.R(a), addr_reg);
+
+	BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
+	BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
+	BitSet32 fpr_ignore_mask(0);
+	regs_in_use[W0] = 0;
+	regs_in_use[W30] = 0;
+	fprs_in_use[0] = 0; // Q0
+	fpr_ignore_mask[VD - Q0] = 1; // passed to the FPR pop below; presumably keeps the freshly loaded VD from being restored over
+
+	if (is_immediate && Memory::IsRAMAddress(imm_addr))
+	{
+		EmitBackpatchRoutine(this, flags, true, false, VD, XA);
+	}
+	else
+	{
+		// Has a chance of being backpatched which will destroy our state
+		// push and pop everything in this instance
+		ABI_PushRegisters(regs_in_use);
+		m_float_emit.ABI_PushRegisters(fprs_in_use);
+		EmitBackpatchRoutine(this, flags,
+			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
+			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
+			VD, XA);
+		m_float_emit.ABI_PopRegisters(fprs_in_use, fpr_ignore_mask);
+		ABI_PopRegisters(regs_in_use);
+	}
+
+	gpr.Unlock(W0, W30);
+	fpr.Unlock(Q0);
+}
+
+void JitArm64::stfXX(UGeckoInstruction inst) // Emits one floating-point store (stfs/stfd, plain, update and indexed forms) of FS, with an inline path for the gather pipe address range.
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITLoadStoreFloatingOff);
+
+	u32 a = inst.RA, b = inst.RB;
+
+	s32 offset = inst.SIMM_16;
+	u32 flags = BackPatchInfo::FLAG_STORE;
+	bool update = false; // set for the "u" forms, which copy the computed address back into RA
+	s32 offset_reg = -1; // -1: address is RA + SIMM_16; otherwise the guest register used as index
+
+	switch (inst.OPCD)
+	{
+		case 31:
+			switch (inst.SUBOP10)
+			{
+				case 663: // stfsx
+					flags |= BackPatchInfo::FLAG_SIZE_F32;
+					offset_reg = b;
+				break;
+				case 695: // stfsux
+					flags |= BackPatchInfo::FLAG_SIZE_F32;
+					update = true; // update form, same as stfdux/stfsu: RA must receive the address
+					offset_reg = b;
+				break;
+				case 727: // stfdx
+					flags |= BackPatchInfo::FLAG_SIZE_F64;
+					offset_reg = b;
+				break;
+				case 759: // stfdux
+					flags |= BackPatchInfo::FLAG_SIZE_F64;
+					update = true;
+					offset_reg = b;
+				break;
+			}
+		break;
+		case 53: // stfsu
+			flags |= BackPatchInfo::FLAG_SIZE_F32;
+			update = true;
+		break;
+		case 52: // stfs
+			flags |= BackPatchInfo::FLAG_SIZE_F32;
+		break;
+		case 55: // stfdu
+			flags |= BackPatchInfo::FLAG_SIZE_F64;
+			update = true;
+		break;
+		case 54: // stfd
+			flags |= BackPatchInfo::FLAG_SIZE_F64;
+		break;
+	}
+
+	u32 imm_addr = 0;
+	bool is_immediate = false; // true when the address is known at JIT time and stored in imm_addr
+
+	ARM64Reg V0 = fpr.R(inst.FS);
+	ARM64Reg addr_reg;
+	if (flags & BackPatchInfo::FLAG_SIZE_F64)
+		addr_reg = W0;
+	else
+		addr_reg = W1;
+
+	gpr.Lock(W0, W1, W30);
+	fpr.Lock(Q0);
+
+	if (update)
+	{
+		// Always uses RA
+		if (gpr.IsImm(a) && offset_reg == -1)
+		{
+			is_immediate = true;
+			imm_addr = offset + gpr.GetImm(a);
+		}
+		else if (gpr.IsImm(a) && offset_reg != -1 && gpr.IsImm(offset_reg))
+		{
+			is_immediate = true;
+			imm_addr = gpr.GetImm(a) + gpr.GetImm(offset_reg);
+		}
+		else
+		{
+			if (offset_reg == -1)
+			{
+				MOVI2R(addr_reg, offset);
+				ADD(addr_reg, addr_reg, gpr.R(a));
+			}
+			else
+			{
+				ADD(addr_reg, gpr.R(offset_reg), gpr.R(a));
+			}
+		}
+	}
+	else
+	{
+		if (offset_reg == -1)
+		{
+			if (a && gpr.IsImm(a))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(a) + offset;
+			}
+			else if (a)
+			{
+				MOVI2R(addr_reg, offset);
+				ADD(addr_reg, addr_reg, gpr.R(a));
+			}
+			else
+			{
+				is_immediate = true; // RA field of 0 in a non-update form: the offset alone is the address
+				imm_addr = offset;
+			}
+		}
+		else
+		{
+			if (a && gpr.IsImm(a) && gpr.IsImm(offset_reg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(a) + gpr.GetImm(offset_reg);
+			}
+			else if (!a && gpr.IsImm(offset_reg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(offset_reg);
+			}
+			else if (a)
+			{
+				ADD(addr_reg, gpr.R(a), gpr.R(offset_reg));
+			}
+			else
+			{
+				MOV(addr_reg, gpr.R(offset_reg));
+			}
+		}
+	}
+
+	ARM64Reg XA = EncodeRegTo64(addr_reg);
+
+	if (is_immediate)
+		MOVI2R(XA, imm_addr);
+
+	if (update)
+		MOV(gpr.R(a), addr_reg);
+
+	BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
+	BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
+	regs_in_use[W0] = 0;
+	regs_in_use[W1] = 0;
+	regs_in_use[W30] = 0;
+	fprs_in_use[0] = 0; // Q0
+
+	if (is_immediate)
+	{
+		if ((imm_addr & 0xFFFFF000) == 0xCC008000 && jit->jo.optimizeGatherPipe)
+		{
+			int accessSize;
+			if (flags & BackPatchInfo::FLAG_SIZE_F64)
+				accessSize = 64;
+			else
+				accessSize = 32;
+
+			MOVI2R(X30, (u64)&GPFifo::m_gatherPipeCount);
+			MOVI2R(X1, (u64)GPFifo::m_gatherPipe);
+			LDR(INDEX_UNSIGNED, W0, X30, 0); // W0 = current m_gatherPipeCount
+			ADD(X1, X1, X0); // X1 = m_gatherPipe + count, the write position
+			if (accessSize == 64)
+			{
+				m_float_emit.REV64(8, Q0, V0); // byte-reverse the value into the scratch Q0 before storing
+				m_float_emit.STR(64, INDEX_UNSIGNED, Q0, X1, 0);
+			}
+			else if (accessSize == 32)
+			{
+				m_float_emit.FCVT(32, 64, Q0, V0); // narrow the double to single precision first
+				m_float_emit.REV32(8, D0, D0);
+				m_float_emit.STR(32, INDEX_UNSIGNED, D0, X1, 0);
+			}
+			ADD(W0, W0, accessSize >> 3); // advance the count by the number of bytes written
+			STR(INDEX_UNSIGNED, W0, X30, 0);
+			jit->js.fifoBytesThisBlock += accessSize >> 3;
+
+		}
+		else if (Memory::IsRAMAddress(imm_addr))
+		{
+			EmitBackpatchRoutine(this, flags, true, false, V0, XA);
+		}
+		else
+		{
+			ABI_PushRegisters(regs_in_use);
+			m_float_emit.ABI_PushRegisters(fprs_in_use);
+			EmitBackpatchRoutine(this, flags, false, false, V0, XA);
+			m_float_emit.ABI_PopRegisters(fprs_in_use);
+			ABI_PopRegisters(regs_in_use);
+		}
+	}
+	else
+	{
+		// Has a chance of being backpatched which will destroy our state
+		// push and pop everything in this instance
+		ABI_PushRegisters(regs_in_use);
+		m_float_emit.ABI_PushRegisters(fprs_in_use);
+		EmitBackpatchRoutine(this, flags,
+			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
+			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
+			V0, XA);
+		m_float_emit.ABI_PopRegisters(fprs_in_use);
+		ABI_PopRegisters(regs_in_use);
+	}
+	gpr.Unlock(W0, W1, W30);
+	fpr.Unlock(Q0);
+}
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
index 3d67cb4d03..f1087a27c8 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
@@ -84,15 +84,15 @@ static GekkoOPTemplate primarytable[] =
 	{46, &JitArm64::FallBackToInterpreter}, //"lmw", OPTYPE_SYSTEM, FL_EVIL, 10}},
 	{47, &JitArm64::FallBackToInterpreter}, //"stmw", OPTYPE_SYSTEM, FL_EVIL, 10}},
 
-	{48, &JitArm64::FallBackToInterpreter}, //"lfs", OPTYPE_LOADFP, FL_IN_A}},
-	{49, &JitArm64::FallBackToInterpreter}, //"lfsu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
-	{50, &JitArm64::FallBackToInterpreter}, //"lfd", OPTYPE_LOADFP, FL_IN_A}},
-	{51, &JitArm64::FallBackToInterpreter}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
+	{48, &JitArm64::lfXX}, //"lfs", OPTYPE_LOADFP, FL_IN_A}},
+	{49, &JitArm64::lfXX}, //"lfsu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
+	{50, &JitArm64::lfXX}, //"lfd", OPTYPE_LOADFP, FL_IN_A}},
+	{51, &JitArm64::lfXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
 
-	{52, &JitArm64::FallBackToInterpreter},
//"stfs", OPTYPE_STOREFP, FL_IN_A}}, - {53, &JitArm64::FallBackToInterpreter}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, - {54, &JitArm64::FallBackToInterpreter}, //"stfd", OPTYPE_STOREFP, FL_IN_A}}, - {55, &JitArm64::FallBackToInterpreter}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, + {52, &JitArm64::stfXX}, //"stfs", OPTYPE_STOREFP, FL_IN_A}}, + {53, &JitArm64::stfXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, + {54, &JitArm64::stfXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}}, + {55, &JitArm64::stfXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, {56, &JitArm64::FallBackToInterpreter}, //"psq_l", OPTYPE_PS, FL_IN_A}}, {57, &JitArm64::FallBackToInterpreter}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}}, @@ -255,15 +255,15 @@ static GekkoOPTemplate table31[] = {725, &JitArm64::FallBackToInterpreter}, //"stswi", OPTYPE_STORE, FL_EVIL}}, // fp load/store - {535, &JitArm64::FallBackToInterpreter}, //"lfsx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}}, - {567, &JitArm64::FallBackToInterpreter}, //"lfsux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}}, - {599, &JitArm64::FallBackToInterpreter}, //"lfdx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}}, - {631, &JitArm64::FallBackToInterpreter}, //"lfdux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}}, + {535, &JitArm64::lfXX}, //"lfsx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}}, + {567, &JitArm64::lfXX}, //"lfsux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}}, + {599, &JitArm64::lfXX}, //"lfdx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}}, + {631, &JitArm64::lfXX}, //"lfdux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}}, - {663, &JitArm64::FallBackToInterpreter}, //"stfsx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, - {695, &JitArm64::FallBackToInterpreter}, //"stfsux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}}, - {727, &JitArm64::FallBackToInterpreter}, //"stfdx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, - {759, &JitArm64::FallBackToInterpreter}, //"stfdux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}}, + {663, &JitArm64::stfXX}, //"stfsx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, + {695, 
&JitArm64::stfXX}, //"stfsux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}}, + {727, &JitArm64::stfXX}, //"stfdx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, + {759, &JitArm64::stfXX}, //"stfdux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}}, {983, &JitArm64::FallBackToInterpreter}, //"stfiwx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, {19, &JitArm64::FallBackToInterpreter}, //"mfcr", OPTYPE_SYSTEM, FL_OUT_D}},