Reimplements fastmem for ARMv7 floating point loadstores.

This implements a new system for fastmem backpatching on ARMv7 that is less of a mindfsck to deal with: instead of patching over the emitted code in place, both the fast and slow paths are emitted up front and the fault handler only has to NOP out a single branch.
This also implements stfs under the default loadstore path; I'm not sure why it was implemented separately in the first place.

I'll be moving the rest of the loadstore methods over to this new way in a few days.
Ryan Houdek 2014-11-15 08:16:36 +00:00
parent e47bfc2788
commit 181f16c5f0
4 changed files with 149 additions and 138 deletions
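
For context: the fastmem scheme these files implement has the JIT emit a raw load/store against a single mapped view of guest memory, and only fall back to the Memory::Read_*/Write_* helpers when that access faults. Below is a rough, standalone illustration of what the fast path boils down to (mask the guest address into the view, add the base, byte-swap because the guest is big-endian). The buffer, mask value, and function names are invented for the example and are not Dolphin's actual memory layout or API; __builtin_bswap32 assumes GCC/Clang.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Stand-in for the mapped view at Memory::base; size and mask are toy values.
    static std::vector<uint8_t> view(64 * 1024);
    static const uint32_t kToyMask = 0xFFFF;

    // What the emitted fast path amounts to: mask, add base, access, byte swap.
    static uint32_t FastRead32(uint32_t guest_addr)
    {
        uint32_t value;
        std::memcpy(&value, view.data() + (guest_addr & kToyMask), sizeof(value));
        return __builtin_bswap32(value); // guest (PowerPC) memory is big-endian
    }

    static void FastWrite32(uint32_t value, uint32_t guest_addr)
    {
        value = __builtin_bswap32(value);
        std::memcpy(view.data() + (guest_addr & kToyMask), &value, sizeof(value));
    }

    int main()
    {
        FastWrite32(0xDEADBEEF, 0x80001000);
        printf("0x%08X\n", FastRead32(0x80001000)); // prints 0xDEADBEEF
    }
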


@@ -205,7 +205,6 @@ public:
     // Floating point loadStore
     void lfXX(UGeckoInstruction _inst);
     void stfXX(UGeckoInstruction _inst);
-    void stfs(UGeckoInstruction _inst);

     // Paired Singles
     void ps_add(UGeckoInstruction _inst);


@@ -17,7 +17,7 @@ using namespace ArmGen;
 // 1) It's really necessary. We don't know anything about the context.
 // 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
 //    that many of them in a typical program/game.
-static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store)
+static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store, bool *new_system)
 {
     u8 op = (inst >> 20) & 0xFF;
     rD = (ARMReg)((inst >> 12) & 0xF);
@@ -61,8 +61,23 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
         }
         break;
     default:
-        printf("Op is 0x%02x\n", op);
-        return false;
+    {
+        // Could be a floating point loadstore
+        u8 op2 = (inst >> 24) & 0xF;
+        switch (op2)
+        {
+        case 0xD: // VLDR/VSTR
+            *new_system = true;
+            break;
+        case 0x4: // VST1/VLD1
+            *new_system = true;
+            break;
+        default:
+            printf("Op is 0x%02x\n", op);
+            return false;
+            break;
+        }
+    }
     }
     return true;
 }
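
As a side note on the DisamLoadStore() change above: the backpatcher only sees the raw ARM instruction word that faulted, so the new path is selected purely from bits [27:24] of that word. A minimal standalone sketch of that classification, mirroring the check in the diff (the test words below are synthetic, with only the relevant nibble populated, not full real encodings):

    #include <cstdint>
    #include <cstdio>

    // Mirrors the new default case in DisamLoadStore(): treat the fault as a
    // VFP/NEON loadstore (and thus use the new backpatch path) when bits
    // [27:24] of the faulting instruction are 0xD (VLDR/VSTR) or 0x4 (VLD1/VST1).
    static bool IsNewSystemLoadStore(uint32_t inst)
    {
        uint8_t op2 = (inst >> 24) & 0xF;
        return op2 == 0xD || op2 == 0x4;
    }

    int main()
    {
        printf("%d\n", IsNewSystemLoadStore(0xDu << 24)); // 1
        printf("%d\n", IsNewSystemLoadStore(0x4u << 24)); // 1
        printf("%d\n", IsNewSystemLoadStore(0x5u << 24)); // 0
    }
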
@@ -70,10 +85,7 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
 bool JitArm::HandleFault(uintptr_t access_address, SContext* ctx)
 {
     if (access_address < (uintptr_t)Memory::base)
-    {
-        PanicAlertT("Exception handler - access below memory space. %08llx%08llx",
-            access_address >> 32, access_address);
-    }
+        PanicAlertT("Exception handler - access below memory space. 0x%08x", access_address);
     return BackPatch(ctx);
 }
@@ -87,66 +99,90 @@ bool JitArm::BackPatch(SContext* ctx)
     ARMReg rD;
     u8 accessSize;
     bool Store;
+    bool new_system = false;

-    if (!DisamLoadStore(Value, rD, accessSize, Store))
+    if (!DisamLoadStore(Value, rD, accessSize, Store, &new_system))
     {
         printf("Invalid backpatch at location 0x%08lx(0x%08x)\n", ctx->CTX_PC, Value);
         exit(0);
     }

-    if (Store)
+    if (new_system)
     {
-        const u32 ARMREGOFFSET = 4 * 5;
-        ARMXEmitter emitter(codePtr - ARMREGOFFSET);
-        switch (accessSize)
-        {
-            case 8: // 8bit
-                emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
-                return 0;
-            break;
-            case 16: // 16bit
-                emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
-                return 0;
-            break;
-            case 32: // 32bit
-                emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
-            break;
-        }
-        emitter.PUSH(4, R0, R1, R2, R3); // 3
-        emitter.MOV(R0, rD); // Value - 4
-        emitter.MOV(R1, R10); // Addr- 5
-        emitter.BL(R14); // 6
-        emitter.POP(4, R0, R1, R2, R3); // 7
-        u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
-        ctx->CTX_PC = newPC;
+        // The new system is a lot easier to backpatch than the old crap.
+        // Instead of backpatching over code and making sure we NOP pad and other crap
+        // We emit both the slow and fast path and branch over the slow path each time
+        // We search backwards until we find the second branch instruction
+        // Then proceed to replace it with a NOP and set that to the new PC.
+        // This ensures that we run the slow path and then branch over the fast path.
+
+        // Run backwards until we find the branch we want to NOP
+        for (int branches = 2; branches > 0; ctx->CTX_PC -= 4)
+            if ((*(u32*)ctx->CTX_PC & 0x0F000000) == 0x0A000000) // B
+                --branches;
+
+        ctx->CTX_PC += 4;
+        ARMXEmitter emitter((u8*)ctx->CTX_PC);
+        emitter.NOP(1);
         emitter.FlushIcache();
         return true;
     }
     else
     {
-        const u32 ARMREGOFFSET = 4 * 4;
-        ARMXEmitter emitter(codePtr - ARMREGOFFSET);
-        switch (accessSize)
-        {
-            case 8: // 8bit
-                emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
-            break;
-            case 16: // 16bit
-                emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
-            break;
-            case 32: // 32bit
-                emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
-            break;
-        }
-        emitter.PUSH(4, R0, R1, R2, R3); // 3
-        emitter.MOV(R0, R10); // 4
-        emitter.BL(R14); // 5
-        emitter.MOV(R14, R0); // 6
-        emitter.POP(4, R0, R1, R2, R3); // 7
-        emitter.MOV(rD, R14); // 8
-        ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
-        emitter.FlushIcache();
-        return true;
+        if (Store)
+        {
+            const u32 ARMREGOFFSET = 4 * 5;
+            ARMXEmitter emitter(codePtr - ARMREGOFFSET);
+            switch (accessSize)
+            {
+                case 8: // 8bit
+                    emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
+                    return 0;
+                break;
+                case 16: // 16bit
+                    emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
+                    return 0;
+                break;
+                case 32: // 32bit
+                    emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
+                break;
+            }
+            emitter.PUSH(4, R0, R1, R2, R3); // 3
+            emitter.MOV(R0, rD); // Value - 4
+            emitter.MOV(R1, R10); // Addr- 5
+            emitter.BL(R14); // 6
+            emitter.POP(4, R0, R1, R2, R3); // 7
+            u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
+            ctx->CTX_PC = newPC;
+            emitter.FlushIcache();
+            return true;
+        }
+        else
+        {
+            const u32 ARMREGOFFSET = 4 * 4;
+            ARMXEmitter emitter(codePtr - ARMREGOFFSET);
+            switch (accessSize)
+            {
+                case 8: // 8bit
+                    emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
+                break;
+                case 16: // 16bit
+                    emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
+                break;
+                case 32: // 32bit
+                    emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
+                break;
+            }
+            emitter.PUSH(4, R0, R1, R2, R3); // 3
+            emitter.MOV(R0, R10); // 4
+            emitter.BL(R14); // 5
+            emitter.MOV(R14, R0); // 6
+            emitter.POP(4, R0, R1, R2, R3); // 7
+            emitter.MOV(rD, R14); // 8
+            ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
+            emitter.FlushIcache();
+            return true;
+        }
     }
     return 0;
 }
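
To make the new BackPatch() path above concrete, here is a small host-side simulation of its patching step: the emitted block is laid out as "B fast_path; slow path; B slow_out; fast path", so when the fast path faults we scan backwards for the second branch (any word whose bits [27:24] are 0xA) and overwrite it with a NOP, after which execution runs the slow path and then branches over the fast path. The array below is just words, not real emitted code, and 0xE1A00000 (mov r0, r0) is used as a conventional ARM NOP.

    #include <cstdint>
    #include <cstdio>

    static bool IsB(uint32_t word)
    {
        return (word & 0x0F000000) == 0x0A000000; // B<cond>, same check as the diff
    }

    int main()
    {
        const uint32_t kNop = 0xE1A00000; // mov r0, r0
        const uint32_t kB   = 0xEA000000; // unconditional B (offset irrelevant here)

        // [0] = B over the slow path, [1..2] = slow path, [3] = B over the fast path,
        // [4] = the fast-path access that faulted.
        uint32_t code[] = { kB, 0x11111111, 0x22222222, kB, 0x33333333 };

        int pc = 4; // index of the faulting instruction
        for (int branches = 2; branches > 0; --pc)
            if (IsB(code[pc]))
                --branches;
        ++pc; // the loop overshoots by one word, hence the CTX_PC += 4 in the diff

        code[pc] = kNop; // the branch that skipped the slow path is now a NOP
        printf("patched index %d\n", pc); // prints 0
    }
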


@@ -77,9 +77,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
         break;
     }

-    ARMReg v0 = fpr.R0(inst.FD), v1;
+    ARMReg v0 = fpr.R0(inst.FD, false), v1;
     if (single)
-        v1 = fpr.R1(inst.FD);
+        v1 = fpr.R1(inst.FD, false);

     if (update)
     {
@@ -134,28 +134,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
     if (update)
         MOV(RA, rB);

-    if (false)
-    {
-        Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
-        BIC(rB, rB, mask); // 1
-        MOVI2R(rA, (u32)Memory::base, false); // 2-3
-        ADD(rB, rB, rA); // 4
-
-        NEONXEmitter nemit(this);
-        if (single)
-        {
-            VLDR(S0, rB, 0);
-            nemit.VREV32(I_8, D0, D0); // Byte swap to result
-            VCVT(v0, S0, 0);
-            VCVT(v1, S0, 0);
-        }
-        else
-        {
-            VLDR(v0, rB, 0);
-            nemit.VREV64(I_8, v0, v0); // Byte swap to result
-        }
-    }
-    else
+    // This branch gets changed to a NOP when the fastpath fails
+    FixupBranch fast_path = B();
+    FixupBranch slow_out;
     {
         PUSH(4, R0, R1, R2, R3);
         MOV(R0, rB);
@@ -163,9 +144,7 @@ void JitArm::lfXX(UGeckoInstruction inst)
         {
             MOVI2R(rA, (u32)&Memory::Read_U32);
             BL(rA);
-
             VMOV(S0, R0);
-
             VCVT(v0, S0, 0);
             VCVT(v1, S0, 0);
         }
@@ -181,7 +160,34 @@ void JitArm::lfXX(UGeckoInstruction inst)
 #endif
         }
         POP(4, R0, R1, R2, R3);
+        slow_out = B();
     }
+
+    SetJumpTarget(fast_path);
+    {
+        Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
+        ARMReg rC = gpr.GetReg();
+        BIC(rC, rB, mask);
+        MOVI2R(rA, (u32)Memory::base);
+        ADD(rC, rC, rA);
+
+        NEONXEmitter nemit(this);
+        if (single)
+        {
+            nemit.VLD1(F_32, D0, rC);
+            nemit.VREV32(I_8, D0, D0); // Byte swap to result
+            VCVT(v0, S0, 0);
+            VCVT(v1, S0, 0);
+        }
+        else
+        {
+            nemit.VLD1(I_64, v0, rC);
+            nemit.VREV64(I_8, v0, v0); // Byte swap to result
+        }
+        gpr.Unlock(rC);
+    }
+    SetJumpTarget(slow_out);
+
     gpr.Unlock(rA, rB);
     SetJumpTarget(DoNotLoad);
 }
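
The control-flow pattern lfXX() now emits — a forward branch whose target is fixed up later, the slow path, a branch over the fast path, then the fast path — is easiest to see with a toy emitter. This sketch is a simplified stand-in for ArmGen's FixupBranch/SetJumpTarget mechanics, not the real ARMXEmitter API:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Toy emitter: B() emits a branch with an unresolved target and returns a
    // handle; SetJumpTarget() later points that branch at the current position.
    struct ToyEmitter
    {
        std::vector<std::string> code;
        using Fixup = size_t;

        Fixup B() { code.push_back("b <unresolved>"); return code.size() - 1; }
        void SetJumpTarget(Fixup f) { code[f] = "b " + std::to_string(code.size()); }
        void Emit(const std::string& s) { code.push_back(s); }
    };

    int main()
    {
        ToyEmitter e;
        ToyEmitter::Fixup fast_path = e.B();      // taken unless backpatched to a NOP
        e.Emit("push / call Memory helper / pop"); // slow path
        ToyEmitter::Fixup slow_out = e.B();        // jump over the fast path afterwards
        e.SetJumpTarget(fast_path);
        e.Emit("vld1/vst1 against Memory::base");  // fast path
        e.SetJumpTarget(slow_out);

        for (size_t i = 0; i < e.code.size(); ++i)
            printf("%zu: %s\n", i, e.code[i].c_str());
    }

Running it prints the emitted layout (0: b 3, 1: slow path, 2: b 4, 3: fast path), which is exactly the shape BackPatch() relies on when it NOPs out the first branch.
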
@@ -302,36 +308,17 @@ void JitArm::stfXX(UGeckoInstruction inst)
         SetCC();
     }

-    if (false)
-    {
-        Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
-        BIC(rB, rB, mask); // 1
-        MOVI2R(rA, (u32)Memory::base, false); // 2-3
-        ADD(rB, rB, rA); // 4
-
-        NEONXEmitter nemit(this);
-        if (single)
-        {
-            VCVT(S0, v0, 0);
-            nemit.VREV32(I_8, D0, D0);
-            VSTR(S0, rB, 0);
-        }
-        else
-        {
-            nemit.VREV64(I_8, D0, v0);
-            VSTR(D0, rB, 0);
-        }
-    }
-    else
+    // This branch gets changed to a NOP when the fastpath fails
+    FixupBranch fast_path = B();
+    FixupBranch slow_out;
     {
         PUSH(4, R0, R1, R2, R3);
         if (single)
         {
-            MOVI2R(rA, (u32)&Memory::Write_U32);
+            MOV(R1, rB);
             VCVT(S0, v0, 0);
             VMOV(R0, S0);
-            MOV(R1, rB);
+            MOVI2R(rA, (u32)&Memory::Write_U32);
             BL(rA);
         }
         else
@@ -347,43 +334,32 @@ void JitArm::stfXX(UGeckoInstruction inst)
             BL(rA);
         }
         POP(4, R0, R1, R2, R3);
+        slow_out = B();
     }
-    gpr.Unlock(rA, rB);
-}
-
-// Some games use stfs as a way to quickly write to the gatherpipe and other hardware areas.
-// Keep it as a safe store until this can get optimized.
-// Look at the JIT64 implementation to see how it is done
-void JitArm::stfs(UGeckoInstruction inst)
-{
-    INSTRUCTION_START
-    JITDISABLE(bJITLoadStoreFloatingOff);
-
-    ARMReg rA = gpr.GetReg();
-    ARMReg rB = gpr.GetReg();
-    ARMReg v0 = fpr.R0(inst.FS);
-    VCVT(S0, v0, 0);
-
-    if (inst.RA)
-    {
-        MOVI2R(rB, inst.SIMM_16);
-        ARMReg RA = gpr.R(inst.RA);
-        ADD(rB, rB, RA);
-    }
-    else
-    {
-        MOVI2R(rB, (u32)inst.SIMM_16);
-    }
-
-    MOVI2R(rA, (u32)&Memory::Write_U32);
-    PUSH(4, R0, R1, R2, R3);
-    VMOV(R0, S0);
-    MOV(R1, rB);
-    BL(rA);
-    POP(4, R0, R1, R2, R3);
+    SetJumpTarget(fast_path);
+    {
+        Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
+        ARMReg rC = gpr.GetReg();
+        BIC(rC, rB, mask);
+        MOVI2R(rA, (u32)Memory::base);
+        ADD(rC, rC, rA);
+
+        NEONXEmitter nemit(this);
+        if (single)
+        {
+            VCVT(S0, v0, 0);
+            nemit.VREV32(I_8, D0, D0);
+            VSTR(S0, rC, 0);
+        }
+        else
+        {
+            nemit.VREV64(I_8, D0, v0);
+            VSTR(D0, rC, 0);
+        }
+        gpr.Unlock(rC);
+    }
+    SetJumpTarget(slow_out);
+
     gpr.Unlock(rA, rB);
 }
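
One more note on the store fast path above: the NEON VREV32/VREV64 before VSTR is a byte swap to guest (big-endian) order, with stfs additionally converting the double-precision source register down to single via VCVT. A plain C++ illustration of the equivalent data transformation — not the emitted ARM code, and __builtin_bswap* assumes GCC/Clang:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // stfs: convert to single precision, then byte-swap 32 bits before the store.
    static uint32_t SwappedSingle(double value)
    {
        float as_single = static_cast<float>(value); // VCVT.F32.F64
        uint32_t bits;
        std::memcpy(&bits, &as_single, sizeof(bits));
        return __builtin_bswap32(bits);              // VREV32
    }

    // stfd: byte-swap the full 64-bit double before the store.
    static uint64_t SwappedDouble(double value)
    {
        uint64_t bits;
        std::memcpy(&bits, &value, sizeof(bits));
        return __builtin_bswap64(bits);              // VREV64
    }

    int main()
    {
        printf("stfs bytes: 0x%08X\n", SwappedSingle(1.0)); // 0x0000803F
        printf("stfd bytes: 0x%016llX\n", (unsigned long long)SwappedDouble(1.0));
    }
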


@@ -89,7 +89,7 @@ static GekkoOPTemplate primarytable[] =
     {50, &JitArm::lfXX},  //"lfd",   OPTYPE_LOADFP, FL_IN_A}},
     {51, &JitArm::lfXX},  //"lfdu",  OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
-    {52, &JitArm::stfs},  //"stfs",  OPTYPE_STOREFP, FL_IN_A}},
+    {52, &JitArm::stfXX}, //"stfs",  OPTYPE_STOREFP, FL_IN_A}},
     {53, &JitArm::stfXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
     {54, &JitArm::stfXX}, //"stfd",  OPTYPE_STOREFP, FL_IN_A}},
     {55, &JitArm::stfXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},