diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.h b/Source/Core/Core/PowerPC/JitArm32/Jit.h index d4885d5229..7021113d73 100644 --- a/Source/Core/Core/PowerPC/JitArm32/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm32/Jit.h @@ -205,7 +205,6 @@ public: // Floating point loadStore void lfXX(UGeckoInstruction _inst); void stfXX(UGeckoInstruction _inst); - void stfs(UGeckoInstruction _inst); // Paired Singles void ps_add(UGeckoInstruction _inst); diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp index b3c77cdab4..1ffc609609 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp @@ -17,7 +17,7 @@ using namespace ArmGen; // 1) It's really necessary. We don't know anything about the context. // 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be // that many of them in a typical program/game. -static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store) +static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store, bool *new_system) { u8 op = (inst >> 20) & 0xFF; rD = (ARMReg)((inst >> 12) & 0xF); @@ -61,8 +61,23 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto } break; default: - printf("Op is 0x%02x\n", op); - return false; + { + // Could be a floating point loadstore + u8 op2 = (inst >> 24) & 0xF; + switch (op2) + { + case 0xD: // VLDR/VSTR + *new_system = true; + break; + case 0x4: // VST1/VLD1 + *new_system = true; + break; + default: + printf("Op is 0x%02x\n", op); + return false; + break; + } + } } return true; } @@ -70,10 +85,7 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto bool JitArm::HandleFault(uintptr_t access_address, SContext* ctx) { if (access_address < (uintptr_t)Memory::base) - { - PanicAlertT("Exception handler - access below memory space. %08llx%08llx", - access_address >> 32, access_address); - } + PanicAlertT("Exception handler - access below memory space. 0x%08x", access_address); return BackPatch(ctx); } @@ -87,66 +99,90 @@ bool JitArm::BackPatch(SContext* ctx) ARMReg rD; u8 accessSize; bool Store; + bool new_system = false; - if (!DisamLoadStore(Value, rD, accessSize, Store)) + if (!DisamLoadStore(Value, rD, accessSize, Store, &new_system)) { printf("Invalid backpatch at location 0x%08lx(0x%08x)\n", ctx->CTX_PC, Value); exit(0); } - if (Store) + if (new_system) { - const u32 ARMREGOFFSET = 4 * 5; - ARMXEmitter emitter(codePtr - ARMREGOFFSET); - switch (accessSize) - { - case 8: // 8bit - emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2 - return 0; - break; - case 16: // 16bit - emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2 - return 0; - break; - case 32: // 32bit - emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2 - break; - } - emitter.PUSH(4, R0, R1, R2, R3); // 3 - emitter.MOV(R0, rD); // Value - 4 - emitter.MOV(R1, R10); // Addr- 5 - emitter.BL(R14); // 6 - emitter.POP(4, R0, R1, R2, R3); // 7 - u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4); - ctx->CTX_PC = newPC; + // The new system is a lot easier to backpatch than the old crap. + // Instead of backpatching over code and making sure we NOP pad and other crap + // We emit both the slow and fast path and branch over the slow path each time + // We search backwards until we find the second branch instruction + // Then proceed to replace it with a NOP and set that to the new PC. + // This ensures that we run the slow path and then branch over the fast path. + + // Run backwards until we find the branch we want to NOP + for (int branches = 2; branches > 0; ctx->CTX_PC -= 4) + if ((*(u32*)ctx->CTX_PC & 0x0F000000) == 0x0A000000) // B + --branches; + + ctx->CTX_PC += 4; + ARMXEmitter emitter((u8*)ctx->CTX_PC); + emitter.NOP(1); emitter.FlushIcache(); return true; } else { - const u32 ARMREGOFFSET = 4 * 4; - ARMXEmitter emitter(codePtr - ARMREGOFFSET); - switch (accessSize) + if (Store) { - case 8: // 8bit - emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2 - break; - case 16: // 16bit - emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2 - break; - case 32: // 32bit - emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2 - break; + const u32 ARMREGOFFSET = 4 * 5; + ARMXEmitter emitter(codePtr - ARMREGOFFSET); + switch (accessSize) + { + case 8: // 8bit + emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2 + return 0; + break; + case 16: // 16bit + emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2 + return 0; + break; + case 32: // 32bit + emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2 + break; + } + emitter.PUSH(4, R0, R1, R2, R3); // 3 + emitter.MOV(R0, rD); // Value - 4 + emitter.MOV(R1, R10); // Addr- 5 + emitter.BL(R14); // 6 + emitter.POP(4, R0, R1, R2, R3); // 7 + u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4); + ctx->CTX_PC = newPC; + emitter.FlushIcache(); + return true; + } + else + { + const u32 ARMREGOFFSET = 4 * 4; + ARMXEmitter emitter(codePtr - ARMREGOFFSET); + switch (accessSize) + { + case 8: // 8bit + emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2 + break; + case 16: // 16bit + emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2 + break; + case 32: // 32bit + emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2 + break; + } + emitter.PUSH(4, R0, R1, R2, R3); // 3 + emitter.MOV(R0, R10); // 4 + emitter.BL(R14); // 5 + emitter.MOV(R14, R0); // 6 + emitter.POP(4, R0, R1, R2, R3); // 7 + emitter.MOV(rD, R14); // 8 + ctx->CTX_PC -= ARMREGOFFSET + (4 * 4); + emitter.FlushIcache(); + return true; } - emitter.PUSH(4, R0, R1, R2, R3); // 3 - emitter.MOV(R0, R10); // 4 - emitter.BL(R14); // 5 - emitter.MOV(R14, R0); // 6 - emitter.POP(4, R0, R1, R2, R3); // 7 - emitter.MOV(rD, R14); // 8 - ctx->CTX_PC -= ARMREGOFFSET + (4 * 4); - emitter.FlushIcache(); - return true; } return 0; } diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp index e9588b1c62..1e5021dc3f 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp @@ -77,9 +77,9 @@ void JitArm::lfXX(UGeckoInstruction inst) break; } - ARMReg v0 = fpr.R0(inst.FD), v1; + ARMReg v0 = fpr.R0(inst.FD, false), v1; if (single) - v1 = fpr.R1(inst.FD); + v1 = fpr.R1(inst.FD, false); if (update) { @@ -134,28 +134,9 @@ void JitArm::lfXX(UGeckoInstruction inst) if (update) MOV(RA, rB); - if (false) - { - Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) - BIC(rB, rB, mask); // 1 - MOVI2R(rA, (u32)Memory::base, false); // 2-3 - ADD(rB, rB, rA); // 4 - - NEONXEmitter nemit(this); - if (single) - { - VLDR(S0, rB, 0); - nemit.VREV32(I_8, D0, D0); // Byte swap to result - VCVT(v0, S0, 0); - VCVT(v1, S0, 0); - } - else - { - VLDR(v0, rB, 0); - nemit.VREV64(I_8, v0, v0); // Byte swap to result - } - } - else + // This branch gets changed to a NOP when the fastpath fails + FixupBranch fast_path = B(); + FixupBranch slow_out; { PUSH(4, R0, R1, R2, R3); MOV(R0, rB); @@ -163,9 +144,7 @@ void JitArm::lfXX(UGeckoInstruction inst) { MOVI2R(rA, (u32)&Memory::Read_U32); BL(rA); - VMOV(S0, R0); - VCVT(v0, S0, 0); VCVT(v1, S0, 0); } @@ -181,7 +160,34 @@ void JitArm::lfXX(UGeckoInstruction inst) #endif } POP(4, R0, R1, R2, R3); + slow_out = B(); } + SetJumpTarget(fast_path); + { + Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) + ARMReg rC = gpr.GetReg(); + BIC(rC, rB, mask); + MOVI2R(rA, (u32)Memory::base); + ADD(rC, rC, rA); + + NEONXEmitter nemit(this); + if (single) + { + nemit.VLD1(F_32, D0, rC); + nemit.VREV32(I_8, D0, D0); // Byte swap to result + VCVT(v0, S0, 0); + VCVT(v1, S0, 0); + } + else + { + nemit.VLD1(I_64, v0, rC); + nemit.VREV64(I_8, v0, v0); // Byte swap to result + } + gpr.Unlock(rC); + } + + SetJumpTarget(slow_out); + gpr.Unlock(rA, rB); SetJumpTarget(DoNotLoad); } @@ -302,36 +308,17 @@ void JitArm::stfXX(UGeckoInstruction inst) SetCC(); } - if (false) - { - Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) - BIC(rB, rB, mask); // 1 - MOVI2R(rA, (u32)Memory::base, false); // 2-3 - ADD(rB, rB, rA); // 4 - - NEONXEmitter nemit(this); - if (single) - { - VCVT(S0, v0, 0); - nemit.VREV32(I_8, D0, D0); - VSTR(S0, rB, 0); - } - else - { - nemit.VREV64(I_8, D0, v0); - VSTR(D0, rB, 0); - } - } - else + // This branch gets changed to a NOP when the fastpath fails + FixupBranch fast_path = B(); + FixupBranch slow_out; { PUSH(4, R0, R1, R2, R3); if (single) { - MOVI2R(rA, (u32)&Memory::Write_U32); + MOV(R1, rB); VCVT(S0, v0, 0); VMOV(R0, S0); - MOV(R1, rB); - + MOVI2R(rA, (u32)&Memory::Write_U32); BL(rA); } else @@ -347,43 +334,32 @@ void JitArm::stfXX(UGeckoInstruction inst) BL(rA); } POP(4, R0, R1, R2, R3); + slow_out = B(); } - gpr.Unlock(rA, rB); -} - -// Some games use stfs as a way to quickly write to the gatherpipe and other hardware areas. -// Keep it as a safe store until this can get optimized. -// Look at the JIT64 implementation to see how it is done - -void JitArm::stfs(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITLoadStoreFloatingOff); - - ARMReg rA = gpr.GetReg(); - ARMReg rB = gpr.GetReg(); - ARMReg v0 = fpr.R0(inst.FS); - VCVT(S0, v0, 0); - - if (inst.RA) + SetJumpTarget(fast_path); { - MOVI2R(rB, inst.SIMM_16); - ARMReg RA = gpr.R(inst.RA); - ADD(rB, rB, RA); - } - else - { - MOVI2R(rB, (u32)inst.SIMM_16); + Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) + ARMReg rC = gpr.GetReg(); + BIC(rC, rB, mask); + MOVI2R(rA, (u32)Memory::base); + ADD(rC, rC, rA); + + NEONXEmitter nemit(this); + if (single) + { + VCVT(S0, v0, 0); + nemit.VREV32(I_8, D0, D0); + VSTR(S0, rC, 0); + } + else + { + nemit.VREV64(I_8, D0, v0); + VSTR(D0, rC, 0); + } + gpr.Unlock(rC); } - MOVI2R(rA, (u32)&Memory::Write_U32); - PUSH(4, R0, R1, R2, R3); - VMOV(R0, S0); - MOV(R1, rB); - - BL(rA); - - POP(4, R0, R1, R2, R3); + SetJumpTarget(slow_out); gpr.Unlock(rA, rB); } diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp index 76e79da67f..198a73fba3 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp @@ -89,7 +89,7 @@ static GekkoOPTemplate primarytable[] = {50, &JitArm::lfXX}, //"lfd", OPTYPE_LOADFP, FL_IN_A}}, {51, &JitArm::lfXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}}, - {52, &JitArm::stfs}, //"stfs", OPTYPE_STOREFP, FL_IN_A}}, + {52, &JitArm::stfXX}, //"stfs", OPTYPE_STOREFP, FL_IN_A}}, {53, &JitArm::stfXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, {54, &JitArm::stfXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}}, {55, &JitArm::stfXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},