Merge pull request #1551 from Sonicadvance1/armv7-float-fastmem

Reimplements fastmem for ARMv7 floating point loadstores.
This commit is contained in:
Ryan Houdek 2014-11-15 15:32:17 -06:00
commit 4ce1b33e55
4 changed files with 149 additions and 138 deletions

View File

@ -205,7 +205,6 @@ public:
// Floating point loadStore // Floating point loadStore
void lfXX(UGeckoInstruction _inst); void lfXX(UGeckoInstruction _inst);
void stfXX(UGeckoInstruction _inst); void stfXX(UGeckoInstruction _inst);
void stfs(UGeckoInstruction _inst);
// Paired Singles // Paired Singles
void ps_add(UGeckoInstruction _inst); void ps_add(UGeckoInstruction _inst);

View File

@ -17,7 +17,7 @@ using namespace ArmGen;
// 1) It's really necessary. We don't know anything about the context. // 1) It's really necessary. We don't know anything about the context.
// 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be // 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
// that many of them in a typical program/game. // that many of them in a typical program/game.
static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store) static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store, bool *new_system)
{ {
u8 op = (inst >> 20) & 0xFF; u8 op = (inst >> 20) & 0xFF;
rD = (ARMReg)((inst >> 12) & 0xF); rD = (ARMReg)((inst >> 12) & 0xF);
@ -61,8 +61,23 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
} }
break; break;
default: default:
printf("Op is 0x%02x\n", op); {
return false; // Could be a floating point loadstore
u8 op2 = (inst >> 24) & 0xF;
switch (op2)
{
case 0xD: // VLDR/VSTR
*new_system = true;
break;
case 0x4: // VST1/VLD1
*new_system = true;
break;
default:
printf("Op is 0x%02x\n", op);
return false;
break;
}
}
} }
return true; return true;
} }
@ -70,10 +85,7 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
bool JitArm::HandleFault(uintptr_t access_address, SContext* ctx) bool JitArm::HandleFault(uintptr_t access_address, SContext* ctx)
{ {
if (access_address < (uintptr_t)Memory::base) if (access_address < (uintptr_t)Memory::base)
{ PanicAlertT("Exception handler - access below memory space. 0x%08x", access_address);
PanicAlertT("Exception handler - access below memory space. %08llx%08llx",
access_address >> 32, access_address);
}
return BackPatch(ctx); return BackPatch(ctx);
} }
@ -87,66 +99,90 @@ bool JitArm::BackPatch(SContext* ctx)
ARMReg rD; ARMReg rD;
u8 accessSize; u8 accessSize;
bool Store; bool Store;
bool new_system = false;
if (!DisamLoadStore(Value, rD, accessSize, Store)) if (!DisamLoadStore(Value, rD, accessSize, Store, &new_system))
{ {
printf("Invalid backpatch at location 0x%08lx(0x%08x)\n", ctx->CTX_PC, Value); printf("Invalid backpatch at location 0x%08lx(0x%08x)\n", ctx->CTX_PC, Value);
exit(0); exit(0);
} }
if (Store) if (new_system)
{ {
const u32 ARMREGOFFSET = 4 * 5; // The new system is a lot easier to backpatch than the old crap.
ARMXEmitter emitter(codePtr - ARMREGOFFSET); // Instead of backpatching over code and making sure we NOP pad and other crap
switch (accessSize) // We emit both the slow and fast path and branch over the slow path each time
{ // We search backwards until we find the second branch instruction
case 8: // 8bit // Then proceed to replace it with a NOP and set that to the new PC.
emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2 // This ensures that we run the slow path and then branch over the fast path.
return 0;
break; // Run backwards until we find the branch we want to NOP
case 16: // 16bit for (int branches = 2; branches > 0; ctx->CTX_PC -= 4)
emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2 if ((*(u32*)ctx->CTX_PC & 0x0F000000) == 0x0A000000) // B
return 0; --branches;
break;
case 32: // 32bit ctx->CTX_PC += 4;
emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2 ARMXEmitter emitter((u8*)ctx->CTX_PC);
break; emitter.NOP(1);
}
emitter.PUSH(4, R0, R1, R2, R3); // 3
emitter.MOV(R0, rD); // Value - 4
emitter.MOV(R1, R10); // Addr- 5
emitter.BL(R14); // 6
emitter.POP(4, R0, R1, R2, R3); // 7
u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
ctx->CTX_PC = newPC;
emitter.FlushIcache(); emitter.FlushIcache();
return true; return true;
} }
else else
{ {
const u32 ARMREGOFFSET = 4 * 4; if (Store)
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
switch (accessSize)
{ {
case 8: // 8bit const u32 ARMREGOFFSET = 4 * 5;
emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2 ARMXEmitter emitter(codePtr - ARMREGOFFSET);
break; switch (accessSize)
case 16: // 16bit {
emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2 case 8: // 8bit
break; emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
case 32: // 32bit return 0;
emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2 break;
break; case 16: // 16bit
emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
return 0;
break;
case 32: // 32bit
emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
break;
}
emitter.PUSH(4, R0, R1, R2, R3); // 3
emitter.MOV(R0, rD); // Value - 4
emitter.MOV(R1, R10); // Addr- 5
emitter.BL(R14); // 6
emitter.POP(4, R0, R1, R2, R3); // 7
u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
ctx->CTX_PC = newPC;
emitter.FlushIcache();
return true;
}
else
{
const u32 ARMREGOFFSET = 4 * 4;
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
switch (accessSize)
{
case 8: // 8bit
emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
break;
case 16: // 16bit
emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
break;
case 32: // 32bit
emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
break;
}
emitter.PUSH(4, R0, R1, R2, R3); // 3
emitter.MOV(R0, R10); // 4
emitter.BL(R14); // 5
emitter.MOV(R14, R0); // 6
emitter.POP(4, R0, R1, R2, R3); // 7
emitter.MOV(rD, R14); // 8
ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
emitter.FlushIcache();
return true;
} }
emitter.PUSH(4, R0, R1, R2, R3); // 3
emitter.MOV(R0, R10); // 4
emitter.BL(R14); // 5
emitter.MOV(R14, R0); // 6
emitter.POP(4, R0, R1, R2, R3); // 7
emitter.MOV(rD, R14); // 8
ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
emitter.FlushIcache();
return true;
} }
return 0; return 0;
} }

View File

@ -77,9 +77,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
break; break;
} }
ARMReg v0 = fpr.R0(inst.FD), v1; ARMReg v0 = fpr.R0(inst.FD, false), v1;
if (single) if (single)
v1 = fpr.R1(inst.FD); v1 = fpr.R1(inst.FD, false);
if (update) if (update)
{ {
@ -134,28 +134,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
if (update) if (update)
MOV(RA, rB); MOV(RA, rB);
if (false) // This branch gets changed to a NOP when the fastpath fails
{ FixupBranch fast_path = B();
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) FixupBranch slow_out;
BIC(rB, rB, mask); // 1
MOVI2R(rA, (u32)Memory::base, false); // 2-3
ADD(rB, rB, rA); // 4
NEONXEmitter nemit(this);
if (single)
{
VLDR(S0, rB, 0);
nemit.VREV32(I_8, D0, D0); // Byte swap to result
VCVT(v0, S0, 0);
VCVT(v1, S0, 0);
}
else
{
VLDR(v0, rB, 0);
nemit.VREV64(I_8, v0, v0); // Byte swap to result
}
}
else
{ {
PUSH(4, R0, R1, R2, R3); PUSH(4, R0, R1, R2, R3);
MOV(R0, rB); MOV(R0, rB);
@ -163,9 +144,7 @@ void JitArm::lfXX(UGeckoInstruction inst)
{ {
MOVI2R(rA, (u32)&Memory::Read_U32); MOVI2R(rA, (u32)&Memory::Read_U32);
BL(rA); BL(rA);
VMOV(S0, R0); VMOV(S0, R0);
VCVT(v0, S0, 0); VCVT(v0, S0, 0);
VCVT(v1, S0, 0); VCVT(v1, S0, 0);
} }
@ -181,7 +160,34 @@ void JitArm::lfXX(UGeckoInstruction inst)
#endif #endif
} }
POP(4, R0, R1, R2, R3); POP(4, R0, R1, R2, R3);
slow_out = B();
} }
SetJumpTarget(fast_path);
{
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
ARMReg rC = gpr.GetReg();
BIC(rC, rB, mask);
MOVI2R(rA, (u32)Memory::base);
ADD(rC, rC, rA);
NEONXEmitter nemit(this);
if (single)
{
nemit.VLD1(F_32, D0, rC);
nemit.VREV32(I_8, D0, D0); // Byte swap to result
VCVT(v0, S0, 0);
VCVT(v1, S0, 0);
}
else
{
nemit.VLD1(I_64, v0, rC);
nemit.VREV64(I_8, v0, v0); // Byte swap to result
}
gpr.Unlock(rC);
}
SetJumpTarget(slow_out);
gpr.Unlock(rA, rB); gpr.Unlock(rA, rB);
SetJumpTarget(DoNotLoad); SetJumpTarget(DoNotLoad);
} }
@ -302,36 +308,17 @@ void JitArm::stfXX(UGeckoInstruction inst)
SetCC(); SetCC();
} }
if (false) // This branch gets changed to a NOP when the fastpath fails
{ FixupBranch fast_path = B();
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) FixupBranch slow_out;
BIC(rB, rB, mask); // 1
MOVI2R(rA, (u32)Memory::base, false); // 2-3
ADD(rB, rB, rA); // 4
NEONXEmitter nemit(this);
if (single)
{
VCVT(S0, v0, 0);
nemit.VREV32(I_8, D0, D0);
VSTR(S0, rB, 0);
}
else
{
nemit.VREV64(I_8, D0, v0);
VSTR(D0, rB, 0);
}
}
else
{ {
PUSH(4, R0, R1, R2, R3); PUSH(4, R0, R1, R2, R3);
if (single) if (single)
{ {
MOVI2R(rA, (u32)&Memory::Write_U32); MOV(R1, rB);
VCVT(S0, v0, 0); VCVT(S0, v0, 0);
VMOV(R0, S0); VMOV(R0, S0);
MOV(R1, rB); MOVI2R(rA, (u32)&Memory::Write_U32);
BL(rA); BL(rA);
} }
else else
@ -347,43 +334,32 @@ void JitArm::stfXX(UGeckoInstruction inst)
BL(rA); BL(rA);
} }
POP(4, R0, R1, R2, R3); POP(4, R0, R1, R2, R3);
slow_out = B();
} }
gpr.Unlock(rA, rB); SetJumpTarget(fast_path);
}
// Some games use stfs as a way to quickly write to the gatherpipe and other hardware areas.
// Keep it as a safe store until this can get optimized.
// Look at the JIT64 implementation to see how it is done
void JitArm::stfs(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
ARMReg rA = gpr.GetReg();
ARMReg rB = gpr.GetReg();
ARMReg v0 = fpr.R0(inst.FS);
VCVT(S0, v0, 0);
if (inst.RA)
{ {
MOVI2R(rB, inst.SIMM_16); Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
ARMReg RA = gpr.R(inst.RA); ARMReg rC = gpr.GetReg();
ADD(rB, rB, RA); BIC(rC, rB, mask);
} MOVI2R(rA, (u32)Memory::base);
else ADD(rC, rC, rA);
{
MOVI2R(rB, (u32)inst.SIMM_16); NEONXEmitter nemit(this);
if (single)
{
VCVT(S0, v0, 0);
nemit.VREV32(I_8, D0, D0);
VSTR(S0, rC, 0);
}
else
{
nemit.VREV64(I_8, D0, v0);
VSTR(D0, rC, 0);
}
gpr.Unlock(rC);
} }
MOVI2R(rA, (u32)&Memory::Write_U32); SetJumpTarget(slow_out);
PUSH(4, R0, R1, R2, R3);
VMOV(R0, S0);
MOV(R1, rB);
BL(rA);
POP(4, R0, R1, R2, R3);
gpr.Unlock(rA, rB); gpr.Unlock(rA, rB);
} }

View File

@ -89,7 +89,7 @@ static GekkoOPTemplate primarytable[] =
{50, &JitArm::lfXX}, //"lfd", OPTYPE_LOADFP, FL_IN_A}}, {50, &JitArm::lfXX}, //"lfd", OPTYPE_LOADFP, FL_IN_A}},
{51, &JitArm::lfXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}}, {51, &JitArm::lfXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
{52, &JitArm::stfs}, //"stfs", OPTYPE_STOREFP, FL_IN_A}}, {52, &JitArm::stfXX}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
{53, &JitArm::stfXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, {53, &JitArm::stfXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{54, &JitArm::stfXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}}, {54, &JitArm::stfXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
{55, &JitArm::stfXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, {55, &JitArm::stfXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},