Merge pull request #1551 from Sonicadvance1/armv7-float-fastmem

Reimplements fastmem for ARMv7 floating point loadstores.
This commit is contained in:
Ryan Houdek 2014-11-15 15:32:17 -06:00
commit 4ce1b33e55
4 changed files with 149 additions and 138 deletions

View File

@ -205,7 +205,6 @@ public:
// Floating point loadStore
void lfXX(UGeckoInstruction _inst);
void stfXX(UGeckoInstruction _inst);
void stfs(UGeckoInstruction _inst);
// Paired Singles
void ps_add(UGeckoInstruction _inst);

View File

@ -17,7 +17,7 @@ using namespace ArmGen;
// 1) It's really necessary. We don't know anything about the context.
// 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
// that many of them in a typical program/game.
static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store)
static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store, bool *new_system)
{
u8 op = (inst >> 20) & 0xFF;
rD = (ARMReg)((inst >> 12) & 0xF);
@ -61,8 +61,23 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
}
break;
default:
printf("Op is 0x%02x\n", op);
return false;
{
// Could be a floating point loadstore
u8 op2 = (inst >> 24) & 0xF;
switch (op2)
{
case 0xD: // VLDR/VSTR
*new_system = true;
break;
case 0x4: // VST1/VLD1
*new_system = true;
break;
default:
printf("Op is 0x%02x\n", op);
return false;
break;
}
}
}
return true;
}
@ -70,10 +85,7 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
bool JitArm::HandleFault(uintptr_t access_address, SContext* ctx)
{
if (access_address < (uintptr_t)Memory::base)
{
PanicAlertT("Exception handler - access below memory space. %08llx%08llx",
access_address >> 32, access_address);
}
PanicAlertT("Exception handler - access below memory space. 0x%08x", access_address);
return BackPatch(ctx);
}
@ -87,66 +99,90 @@ bool JitArm::BackPatch(SContext* ctx)
ARMReg rD;
u8 accessSize;
bool Store;
bool new_system = false;
if (!DisamLoadStore(Value, rD, accessSize, Store))
if (!DisamLoadStore(Value, rD, accessSize, Store, &new_system))
{
printf("Invalid backpatch at location 0x%08lx(0x%08x)\n", ctx->CTX_PC, Value);
exit(0);
}
if (Store)
if (new_system)
{
const u32 ARMREGOFFSET = 4 * 5;
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
switch (accessSize)
{
case 8: // 8bit
emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
return 0;
break;
case 16: // 16bit
emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
return 0;
break;
case 32: // 32bit
emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
break;
}
emitter.PUSH(4, R0, R1, R2, R3); // 3
emitter.MOV(R0, rD); // Value - 4
emitter.MOV(R1, R10); // Addr- 5
emitter.BL(R14); // 6
emitter.POP(4, R0, R1, R2, R3); // 7
u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
ctx->CTX_PC = newPC;
// The new system is much easier to backpatch than the old one.
// Instead of patching over emitted code and NOP-padding it, the JIT emits both the
// slow path and the fast path, preceded by a branch that skips over the slow path.
// Searching backwards from the faulting instruction, the second branch found is that skip.
// It is replaced with a NOP and the PC is set to its address, so execution resumes there,
// runs the slow path, and then branches over the fast path.
// Run backwards until we find the branch we want to NOP
for (int branches = 2; branches > 0; ctx->CTX_PC -= 4)
if ((*(u32*)ctx->CTX_PC & 0x0F000000) == 0x0A000000) // B
--branches;
ctx->CTX_PC += 4;
ARMXEmitter emitter((u8*)ctx->CTX_PC);
emitter.NOP(1);
emitter.FlushIcache();
return true;
}
else
{
const u32 ARMREGOFFSET = 4 * 4;
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
switch (accessSize)
if (Store)
{
case 8: // 8bit
emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
break;
case 16: // 16bit
emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
break;
case 32: // 32bit
emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
break;
const u32 ARMREGOFFSET = 4 * 5;
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
switch (accessSize)
{
case 8: // 8bit
emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
return 0;
break;
case 16: // 16bit
emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
return 0;
break;
case 32: // 32bit
emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
break;
}
emitter.PUSH(4, R0, R1, R2, R3); // 3
emitter.MOV(R0, rD); // Value - 4
emitter.MOV(R1, R10); // Addr- 5
emitter.BL(R14); // 6
emitter.POP(4, R0, R1, R2, R3); // 7
u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
ctx->CTX_PC = newPC;
emitter.FlushIcache();
return true;
}
else
{
const u32 ARMREGOFFSET = 4 * 4;
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
switch (accessSize)
{
case 8: // 8bit
emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
break;
case 16: // 16bit
emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
break;
case 32: // 32bit
emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
break;
}
emitter.PUSH(4, R0, R1, R2, R3); // 3
emitter.MOV(R0, R10); // 4
emitter.BL(R14); // 5
emitter.MOV(R14, R0); // 6
emitter.POP(4, R0, R1, R2, R3); // 7
emitter.MOV(rD, R14); // 8
ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
emitter.FlushIcache();
return true;
}
emitter.PUSH(4, R0, R1, R2, R3); // 3
emitter.MOV(R0, R10); // 4
emitter.BL(R14); // 5
emitter.MOV(R14, R0); // 6
emitter.POP(4, R0, R1, R2, R3); // 7
emitter.MOV(rD, R14); // 8
ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
emitter.FlushIcache();
return true;
}
return 0;
}

View File

@ -77,9 +77,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
break;
}
ARMReg v0 = fpr.R0(inst.FD), v1;
ARMReg v0 = fpr.R0(inst.FD, false), v1;
if (single)
v1 = fpr.R1(inst.FD);
v1 = fpr.R1(inst.FD, false);
if (update)
{
@ -134,28 +134,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
if (update)
MOV(RA, rB);
if (false)
{
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
BIC(rB, rB, mask); // 1
MOVI2R(rA, (u32)Memory::base, false); // 2-3
ADD(rB, rB, rA); // 4
NEONXEmitter nemit(this);
if (single)
{
VLDR(S0, rB, 0);
nemit.VREV32(I_8, D0, D0); // Byte swap to result
VCVT(v0, S0, 0);
VCVT(v1, S0, 0);
}
else
{
VLDR(v0, rB, 0);
nemit.VREV64(I_8, v0, v0); // Byte swap to result
}
}
else
// This branch gets changed to a NOP when the fastpath fails
FixupBranch fast_path = B();
FixupBranch slow_out;
{
PUSH(4, R0, R1, R2, R3);
MOV(R0, rB);
@ -163,9 +144,7 @@ void JitArm::lfXX(UGeckoInstruction inst)
{
MOVI2R(rA, (u32)&Memory::Read_U32);
BL(rA);
VMOV(S0, R0);
VCVT(v0, S0, 0);
VCVT(v1, S0, 0);
}
@ -181,7 +160,34 @@ void JitArm::lfXX(UGeckoInstruction inst)
#endif
}
POP(4, R0, R1, R2, R3);
slow_out = B();
}
SetJumpTarget(fast_path);
{
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
ARMReg rC = gpr.GetReg();
BIC(rC, rB, mask);
MOVI2R(rA, (u32)Memory::base);
ADD(rC, rC, rA);
NEONXEmitter nemit(this);
if (single)
{
nemit.VLD1(F_32, D0, rC);
nemit.VREV32(I_8, D0, D0); // Byte swap to result
VCVT(v0, S0, 0);
VCVT(v1, S0, 0);
}
else
{
nemit.VLD1(I_64, v0, rC);
nemit.VREV64(I_8, v0, v0); // Byte swap to result
}
gpr.Unlock(rC);
}
SetJumpTarget(slow_out);
gpr.Unlock(rA, rB);
SetJumpTarget(DoNotLoad);
}
@ -302,36 +308,17 @@ void JitArm::stfXX(UGeckoInstruction inst)
SetCC();
}
if (false)
{
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
BIC(rB, rB, mask); // 1
MOVI2R(rA, (u32)Memory::base, false); // 2-3
ADD(rB, rB, rA); // 4
NEONXEmitter nemit(this);
if (single)
{
VCVT(S0, v0, 0);
nemit.VREV32(I_8, D0, D0);
VSTR(S0, rB, 0);
}
else
{
nemit.VREV64(I_8, D0, v0);
VSTR(D0, rB, 0);
}
}
else
// This branch gets changed to a NOP when the fastpath fails
FixupBranch fast_path = B();
FixupBranch slow_out;
{
PUSH(4, R0, R1, R2, R3);
if (single)
{
MOVI2R(rA, (u32)&Memory::Write_U32);
MOV(R1, rB);
VCVT(S0, v0, 0);
VMOV(R0, S0);
MOV(R1, rB);
MOVI2R(rA, (u32)&Memory::Write_U32);
BL(rA);
}
else
@ -347,43 +334,32 @@ void JitArm::stfXX(UGeckoInstruction inst)
BL(rA);
}
POP(4, R0, R1, R2, R3);
slow_out = B();
}
gpr.Unlock(rA, rB);
}
// Some games use stfs as a way to quickly write to the gatherpipe and other hardware areas.
// Keep it as a safe store until this can get optimized.
// Look at the JIT64 implementation to see how it is done
void JitArm::stfs(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStoreFloatingOff);
ARMReg rA = gpr.GetReg();
ARMReg rB = gpr.GetReg();
ARMReg v0 = fpr.R0(inst.FS);
VCVT(S0, v0, 0);
if (inst.RA)
SetJumpTarget(fast_path);
{
MOVI2R(rB, inst.SIMM_16);
ARMReg RA = gpr.R(inst.RA);
ADD(rB, rB, RA);
}
else
{
MOVI2R(rB, (u32)inst.SIMM_16);
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
ARMReg rC = gpr.GetReg();
BIC(rC, rB, mask);
MOVI2R(rA, (u32)Memory::base);
ADD(rC, rC, rA);
NEONXEmitter nemit(this);
if (single)
{
VCVT(S0, v0, 0);
nemit.VREV32(I_8, D0, D0);
VSTR(S0, rC, 0);
}
else
{
nemit.VREV64(I_8, D0, v0);
VSTR(D0, rC, 0);
}
gpr.Unlock(rC);
}
MOVI2R(rA, (u32)&Memory::Write_U32);
PUSH(4, R0, R1, R2, R3);
VMOV(R0, S0);
MOV(R1, rB);
BL(rA);
POP(4, R0, R1, R2, R3);
SetJumpTarget(slow_out);
gpr.Unlock(rA, rB);
}

View File

@ -89,7 +89,7 @@ static GekkoOPTemplate primarytable[] =
{50, &JitArm::lfXX}, //"lfd", OPTYPE_LOADFP, FL_IN_A}},
{51, &JitArm::lfXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
{52, &JitArm::stfs}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
{52, &JitArm::stfXX}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
{53, &JitArm::stfXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{54, &JitArm::stfXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
{55, &JitArm::stfXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},