From 181f16c5f048e99b7576423ffec63a97b9bf0a48 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sat, 15 Nov 2014 08:16:36 +0000
Subject: [PATCH] Reimplements fastmem for ARMv7 floating point loadstores.

This implements a new system for fastmem backpatching on ARMv7 that is
far less of a headache to deal with.

This also implements stfs under the default loadstore path; not sure
why it was off on its own in the first place.

I'll be moving the rest of the loadstore methods over to this new
system in a few days.
---
 Source/Core/Core/PowerPC/JitArm32/Jit.h       |   1 -
 .../PowerPC/JitArm32/JitArm_BackPatch.cpp     | 142 +++++++++++-------
 .../JitArm32/JitArm_LoadStoreFloating.cpp     | 142 ++++++++----------
 .../Core/PowerPC/JitArm32/JitArm_Tables.cpp   |   2 +-
 4 files changed, 149 insertions(+), 138 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.h b/Source/Core/Core/PowerPC/JitArm32/Jit.h
index d4885d5229..7021113d73 100644
--- a/Source/Core/Core/PowerPC/JitArm32/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm32/Jit.h
@@ -205,7 +205,6 @@ public:
 	// Floating point loadStore
 	void lfXX(UGeckoInstruction _inst);
 	void stfXX(UGeckoInstruction _inst);
-	void stfs(UGeckoInstruction _inst);
 
 	// Paired Singles
 	void ps_add(UGeckoInstruction _inst);
diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp
index b3c77cdab4..1ffc609609 100644
--- a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp
@@ -17,7 +17,7 @@ using namespace ArmGen;
 // 1) It's really necessary. We don't know anything about the context.
 // 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
 //    that many of them in a typical program/game.
-static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store)
+static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store, bool *new_system)
 {
 	u8 op = (inst >> 20) & 0xFF;
 	rD = (ARMReg)((inst >> 12) & 0xF);
@@ -61,8 +61,23 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
 	}
 	break;
 	default:
-		printf("Op is 0x%02x\n", op);
-		return false;
+	{
+		// Could be a floating point loadstore
+		u8 op2 = (inst >> 24) & 0xF;
+		switch (op2)
+		{
+		case 0xD: // VLDR/VSTR
+			*new_system = true;
+			break;
+		case 0x4: // VST1/VLD1
+			*new_system = true;
+			break;
+		default:
+			printf("Op is 0x%02x\n", op);
+			return false;
+			break;
+		}
+	}
 	}
 	return true;
 }
@@ -70,10 +85,7 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
 bool JitArm::HandleFault(uintptr_t access_address, SContext* ctx)
 {
 	if (access_address < (uintptr_t)Memory::base)
-	{
-		PanicAlertT("Exception handler - access below memory space. %08llx%08llx",
-			access_address >> 32, access_address);
-	}
+		PanicAlertT("Exception handler - access below memory space. 0x%08x", access_address);
 	return BackPatch(ctx);
 }
 
@@ -87,66 +99,90 @@ bool JitArm::BackPatch(SContext* ctx)
 	ARMReg rD;
 	u8 accessSize;
 	bool Store;
+	bool new_system = false;
 
-	if (!DisamLoadStore(Value, rD, accessSize, Store))
+	if (!DisamLoadStore(Value, rD, accessSize, Store, &new_system))
 	{
 		printf("Invalid backpatch at location 0x%08lx(0x%08x)\n", ctx->CTX_PC, Value);
 		exit(0);
 	}
 
-	if (Store)
+	if (new_system)
 	{
-		const u32 ARMREGOFFSET = 4 * 5;
-		ARMXEmitter emitter(codePtr - ARMREGOFFSET);
-		switch (accessSize)
-		{
-			case 8: // 8bit
-				emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
-				return 0;
-			break;
-			case 16: // 16bit
-				emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
-				return 0;
-			break;
-			case 32: // 32bit
-				emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
-			break;
-		}
-		emitter.PUSH(4, R0, R1, R2, R3); // 3
-		emitter.MOV(R0, rD); // Value - 4
-		emitter.MOV(R1, R10); // Addr - 5
-		emitter.BL(R14); // 6
-		emitter.POP(4, R0, R1, R2, R3); // 7
-		u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
-		ctx->CTX_PC = newPC;
+		// The new system is a lot easier to backpatch than the old one.
+		// Instead of overwriting code and having to get the NOP padding right,
+		// we emit both the fast and slow paths and branch over the slow path each time.
+		// To patch, we search backwards until we find the second branch instruction,
+		// replace it with a NOP, and set the new PC to it.
+		// This makes us fall into the slow path, which then branches over the fast path.
+
+		// Run backwards until we find the branch we want to NOP
+		for (int branches = 2; branches > 0; ctx->CTX_PC -= 4)
+			if ((*(u32*)ctx->CTX_PC & 0x0F000000) == 0x0A000000) // B
+				--branches;
+
+		ctx->CTX_PC += 4;
+		ARMXEmitter emitter((u8*)ctx->CTX_PC);
+		emitter.NOP(1);
 		emitter.FlushIcache();
 		return true;
 	}
 	else
 	{
-		const u32 ARMREGOFFSET = 4 * 4;
-		ARMXEmitter emitter(codePtr - ARMREGOFFSET);
-		switch (accessSize)
+		if (Store)
 		{
-			case 8: // 8bit
-				emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
-			break;
-			case 16: // 16bit
-				emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
-			break;
-			case 32: // 32bit
-				emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
-			break;
+			const u32 ARMREGOFFSET = 4 * 5;
+			ARMXEmitter emitter(codePtr - ARMREGOFFSET);
+			switch (accessSize)
+			{
+			case 8: // 8bit
+				emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
+				return 0;
+				break;
+			case 16: // 16bit
+				emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
+				return 0;
+				break;
+			case 32: // 32bit
+				emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
+				break;
+			}
+			emitter.PUSH(4, R0, R1, R2, R3); // 3
+			emitter.MOV(R0, rD); // Value - 4
+			emitter.MOV(R1, R10); // Addr - 5
+			emitter.BL(R14); // 6
+			emitter.POP(4, R0, R1, R2, R3); // 7
+			u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
+			ctx->CTX_PC = newPC;
+			emitter.FlushIcache();
+			return true;
+		}
+		else
+		{
+			const u32 ARMREGOFFSET = 4 * 4;
+			ARMXEmitter emitter(codePtr - ARMREGOFFSET);
+			switch (accessSize)
+			{
+			case 8: // 8bit
+				emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
+				break;
+			case 16: // 16bit
+				emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
+				break;
+			case 32: // 32bit
+				emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
+				break;
+			}
+			emitter.PUSH(4, R0, R1, R2, R3); // 3
+			emitter.MOV(R0, R10); // 4
+			emitter.BL(R14); // 5
+			emitter.MOV(R14, R0); // 6
+			emitter.POP(4, R0, R1, R2, R3); // 7
+			emitter.MOV(rD, R14); // 8
+			ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
+			emitter.FlushIcache();
+			return true;
 		}
-		emitter.PUSH(4, R0, R1, R2, R3); // 3
-		emitter.MOV(R0, R10); // 4
-		emitter.BL(R14); // 5
-		emitter.MOV(R14, R0); // 6
-		emitter.POP(4, R0, R1, R2, R3); // 7
-		emitter.MOV(rD, R14); // 8
-		ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
-		emitter.FlushIcache();
-		return true;
 	}
 	return 0;
 }
diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp
index e9588b1c62..1e5021dc3f 100644
--- a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp
@@ -77,9 +77,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
 		break;
 	}
 
-	ARMReg v0 = fpr.R0(inst.FD), v1;
+	ARMReg v0 = fpr.R0(inst.FD, false), v1;
 	if (single)
-		v1 = fpr.R1(inst.FD);
+		v1 = fpr.R1(inst.FD, false);
 
 	if (update)
 	{
@@ -134,28 +134,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
 	if (update)
 		MOV(RA, rB);
 
-	if (false)
-	{
-		Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
-		BIC(rB, rB, mask); // 1
-		MOVI2R(rA, (u32)Memory::base, false); // 2-3
-		ADD(rB, rB, rA); // 4
-
-		NEONXEmitter nemit(this);
-		if (single)
-		{
-			VLDR(S0, rB, 0);
-			nemit.VREV32(I_8, D0, D0); // Byte swap to result
-			VCVT(v0, S0, 0);
-			VCVT(v1, S0, 0);
-		}
-		else
-		{
-			VLDR(v0, rB, 0);
-			nemit.VREV64(I_8, v0, v0); // Byte swap to result
-		}
-	}
-	else
+	// This branch gets changed to a NOP when the fast path fails
+	FixupBranch fast_path = B();
+	FixupBranch slow_out;
 	{
 		PUSH(4, R0, R1, R2, R3);
 		MOV(R0, rB);
@@ -163,9 +144,7 @@ void JitArm::lfXX(UGeckoInstruction inst)
 		{
 			MOVI2R(rA, (u32)&Memory::Read_U32);
 			BL(rA);
-
 			VMOV(S0, R0);
-
 			VCVT(v0, S0, 0);
 			VCVT(v1, S0, 0);
 		}
@@ -181,7 +160,34 @@ void JitArm::lfXX(UGeckoInstruction inst)
 #endif
 		}
 		POP(4, R0, R1, R2, R3);
+		slow_out = B();
 	}
+	SetJumpTarget(fast_path);
+	{
+		Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
+		ARMReg rC = gpr.GetReg();
+		BIC(rC, rB, mask);
+		MOVI2R(rA, (u32)Memory::base);
+		ADD(rC, rC, rA);
+
+		NEONXEmitter nemit(this);
+		if (single)
+		{
+			nemit.VLD1(F_32, D0, rC);
+			nemit.VREV32(I_8, D0, D0); // Byte swap to result
+			VCVT(v0, S0, 0);
+			VCVT(v1, S0, 0);
+		}
+		else
+		{
+			nemit.VLD1(I_64, v0, rC);
+			nemit.VREV64(I_8, v0, v0); // Byte swap to result
+		}
+		gpr.Unlock(rC);
+	}
+
+	SetJumpTarget(slow_out);
+
 	gpr.Unlock(rA, rB);
 	SetJumpTarget(DoNotLoad);
 }
@@ -302,36 +308,17 @@ void JitArm::stfXX(UGeckoInstruction inst)
 		SetCC();
 	}
 
-	if (false)
-	{
-		Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
-		BIC(rB, rB, mask); // 1
-		MOVI2R(rA, (u32)Memory::base, false); // 2-3
-		ADD(rB, rB, rA); // 4
-
-		NEONXEmitter nemit(this);
-		if (single)
-		{
-			VCVT(S0, v0, 0);
-			nemit.VREV32(I_8, D0, D0);
-			VSTR(S0, rB, 0);
-		}
-		else
-		{
-			nemit.VREV64(I_8, D0, v0);
-			VSTR(D0, rB, 0);
-		}
-	}
-	else
+	// This branch gets changed to a NOP when the fast path fails
+	FixupBranch fast_path = B();
+	FixupBranch slow_out;
 	{
 		PUSH(4, R0, R1, R2, R3);
 		if (single)
 		{
-			MOVI2R(rA, (u32)&Memory::Write_U32);
+			MOV(R1, rB);
 			VCVT(S0, v0, 0);
 			VMOV(R0, S0);
-			MOV(R1, rB);
-
+			MOVI2R(rA, (u32)&Memory::Write_U32);
 			BL(rA);
 		}
 		else
@@ -347,43 +334,32 @@ void JitArm::stfXX(UGeckoInstruction inst)
 			BL(rA);
 		}
 		POP(4, R0, R1, R2, R3);
+		slow_out = B();
 	}
-	gpr.Unlock(rA, rB);
-}
-
-// Some games use stfs as a way to quickly write to the gatherpipe and other hardware areas.
-// Keep it as a safe store until this can get optimized.
-// Look at the JIT64 implementation to see how it is done
-void JitArm::stfs(UGeckoInstruction inst)
-{
-	INSTRUCTION_START
-	JITDISABLE(bJITLoadStoreFloatingOff);
-
-	ARMReg rA = gpr.GetReg();
-	ARMReg rB = gpr.GetReg();
-	ARMReg v0 = fpr.R0(inst.FS);
-	VCVT(S0, v0, 0);
-
-	if (inst.RA)
+	SetJumpTarget(fast_path);
 	{
-		MOVI2R(rB, inst.SIMM_16);
-		ARMReg RA = gpr.R(inst.RA);
-		ADD(rB, rB, RA);
-	}
-	else
-	{
-		MOVI2R(rB, (u32)inst.SIMM_16);
+		Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
+		ARMReg rC = gpr.GetReg();
+		BIC(rC, rB, mask);
+		MOVI2R(rA, (u32)Memory::base);
+		ADD(rC, rC, rA);
+
+		NEONXEmitter nemit(this);
+		if (single)
+		{
+			VCVT(S0, v0, 0);
+			nemit.VREV32(I_8, D0, D0);
+			VSTR(S0, rC, 0);
+		}
+		else
+		{
+			nemit.VREV64(I_8, D0, v0);
+			VSTR(D0, rC, 0);
+		}
+		gpr.Unlock(rC);
 	}
-	MOVI2R(rA, (u32)&Memory::Write_U32);
-	PUSH(4, R0, R1, R2, R3);
-	VMOV(R0, S0);
-	MOV(R1, rB);
-
-	BL(rA);
-
-	POP(4, R0, R1, R2, R3);
+
+	SetJumpTarget(slow_out);
 	gpr.Unlock(rA, rB);
 }
diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp
index 76e79da67f..198a73fba3 100644
--- a/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp
@@ -89,7 +89,7 @@ static GekkoOPTemplate primarytable[] =
 	{50, &JitArm::lfXX}, //"lfd",  OPTYPE_LOADFP, FL_IN_A}},
 	{51, &JitArm::lfXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
 
-	{52, &JitArm::stfs},  //"stfs",  OPTYPE_STOREFP, FL_IN_A}},
+	{52, &JitArm::stfXX}, //"stfs",  OPTYPE_STOREFP, FL_IN_A}},
 	{53, &JitArm::stfXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
 	{54, &JitArm::stfXX}, //"stfd",  OPTYPE_STOREFP, FL_IN_A}},
 	{55, &JitArm::stfXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
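
Note for anyone reviewing: the control flow the new loadstore paths emit is
easier to see outside of the JIT. Below is a minimal, self-contained C++
sketch, not Dolphin's ArmEmitter API: the Emitter type, its B()/SetJumpTarget()
fixup scheme, and the placeholder instruction words are hypothetical stand-ins,
and the branch encoding only models the condition/opcode bits that BackPatch()
actually tests. It lays out the guard branch, the slow path, the slow_out
branch, and the fast path in the same order as lfXX/stfXX.

#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified ARM encoding: only the condition/opcode byte matters here.
static const uint32_t OP_B = 0xEA000000; // unconditional B (AL condition)

struct Emitter
{
	std::vector<uint32_t> code;

	// Emit a branch with an unresolved target; return its index as a fixup.
	size_t B() { code.push_back(OP_B); return code.size() - 1; }

	// Resolve a fixup to branch to the current emission point. ARM stores a
	// signed word offset relative to the branch's own address plus 8 bytes.
	void SetJumpTarget(size_t fixup)
	{
		int32_t offset = (int32_t)(code.size() - fixup) - 2;
		code[fixup] = OP_B | ((uint32_t)offset & 0x00FFFFFF);
	}

	void Emit(uint32_t placeholder) { code.push_back(placeholder); }
};

int main()
{
	Emitter e;

	size_t fast_path = e.B(); // guard branch; the fault handler NOPs this
	e.Emit(0x00000001);       // slow path: call Memory::Read_U32/Write_U32
	e.Emit(0x00000002);
	size_t slow_out = e.B();  // after the slow path, jump over the fast path
	e.SetJumpTarget(fast_path);
	e.Emit(0x00000003);       // fast path: direct access through Memory::base
	e.Emit(0x00000004);
	e.SetJumpTarget(slow_out); // both paths converge here

	for (size_t i = 0; i < e.code.size(); ++i)
		printf("%02zx: 0x%08x\n", i * 4, (unsigned)e.code[i]);
	return 0;
}

Compiling and running this prints the guard branch at offset 0 targeting the
fast path at offset 0x10. Once the handler rewrites the guard to a NOP,
execution falls into the slow path instead and the branch at offset 0x0c skips
the fast path, which is exactly the state BackPatch() leaves the block in.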
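
The handler side of the scheme can be simulated the same way. This is the
backwards branch search from BackPatch(), pulled out into a runnable C++
simulation over a plain array; the code buffer contents, the fault offset, and
the MOV R0, R0 NOP encoding are assumptions for illustration, not necessarily
what Dolphin's emitter produces.

#include <cstdint>
#include <cstdio>

static const uint32_t OP_NOP = 0xE1A00000; // MOV R0, R0, a classic ARM NOP

// Matches any B instruction regardless of condition field, the same test the
// patch uses: (insn & 0x0F000000) == 0x0A000000.
static bool IsBranch(uint32_t insn)
{
	return (insn & 0x0F000000) == 0x0A000000;
}

// Walk backwards from the faulting instruction to the second branch (the
// guard in front of the slow path), NOP it, and return the new PC. 'pc' is a
// byte offset into 'code', standing in for ctx->CTX_PC.
static uintptr_t BackPatchNewSystem(uint32_t* code, uintptr_t pc)
{
	for (int branches = 2; branches > 0; pc -= 4)
		if (IsBranch(code[pc / 4]))
			--branches;

	pc += 4;               // the loop update runs once past the found branch
	code[pc / 4] = OP_NOP; // now we fall through into the slow path
	return pc;             // resume at the NOP
}

int main()
{
	// Fake block: [guard B] [slow x2] [B slow_out] [fast x2]
	uint32_t code[] = {
		0xEA000002,             // guard branch over the slow path
		0x00000001, 0x00000002, // slow path
		0xEA000001,             // slow_out: branch over the fast path
		0x00000003, 0x00000004, // fast path
	};

	uintptr_t fault_pc = 4 * 4; // pretend the first fast-path insn faulted
	uintptr_t new_pc = BackPatchNewSystem(code, fault_pc);
	printf("resume at 0x%02zx, insn there is 0x%08x\n",
	       (size_t)new_pc, (unsigned)code[new_pc / 4]);
	return 0;
}

Searching backwards from the fault, the first branch found is slow_out and the
second is the guard, which is why the loop counts two. The final pc -= 4 from
the loop update overshoots by one instruction, which the pc += 4 afterwards
undoes, just like the CTX_PC adjustment in the real handler.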