Merge pull request #1551 from Sonicadvance1/armv7-float-fastmem
Reimplements fastmem for ARMv7 floating-point loads and stores.
This commit is contained in:
commit
4ce1b33e55
|
@ -205,7 +205,6 @@ public:
|
|||
// Floating point loadStore
|
||||
void lfXX(UGeckoInstruction _inst);
|
||||
void stfXX(UGeckoInstruction _inst);
|
||||
void stfs(UGeckoInstruction _inst);
|
||||
|
||||
// Paired Singles
|
||||
void ps_add(UGeckoInstruction _inst);
|
||||
|
|
|
@ -17,7 +17,7 @@ using namespace ArmGen;
|
|||
// 1) It's really necessary. We don't know anything about the context.
|
||||
// 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
|
||||
// that many of them in a typical program/game.
|
||||
static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store)
|
||||
static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store, bool *new_system)
|
||||
{
|
||||
u8 op = (inst >> 20) & 0xFF;
|
||||
rD = (ARMReg)((inst >> 12) & 0xF);
|
||||
|
@ -61,8 +61,23 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
|
|||
}
|
||||
break;
|
||||
default:
|
||||
printf("Op is 0x%02x\n", op);
|
||||
return false;
|
||||
{
|
||||
// Could be a floating point loadstore
|
||||
u8 op2 = (inst >> 24) & 0xF;
|
||||
switch (op2)
|
||||
{
|
||||
case 0xD: // VLDR/VSTR
|
||||
*new_system = true;
|
||||
break;
|
||||
case 0x4: // VST1/VLD1
|
||||
*new_system = true;
|
||||
break;
|
||||
default:
|
||||
printf("Op is 0x%02x\n", op);
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -70,10 +85,7 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
|
|||
bool JitArm::HandleFault(uintptr_t access_address, SContext* ctx)
|
||||
{
|
||||
if (access_address < (uintptr_t)Memory::base)
|
||||
{
|
||||
PanicAlertT("Exception handler - access below memory space. %08llx%08llx",
|
||||
access_address >> 32, access_address);
|
||||
}
|
||||
PanicAlertT("Exception handler - access below memory space. 0x%08x", access_address);
|
||||
return BackPatch(ctx);
|
||||
}
|
||||
|
||||
|
@ -87,66 +99,90 @@ bool JitArm::BackPatch(SContext* ctx)
|
|||
ARMReg rD;
|
||||
u8 accessSize;
|
||||
bool Store;
|
||||
bool new_system = false;
|
||||
|
||||
if (!DisamLoadStore(Value, rD, accessSize, Store))
|
||||
if (!DisamLoadStore(Value, rD, accessSize, Store, &new_system))
|
||||
{
|
||||
printf("Invalid backpatch at location 0x%08lx(0x%08x)\n", ctx->CTX_PC, Value);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (Store)
|
||||
if (new_system)
|
||||
{
|
||||
const u32 ARMREGOFFSET = 4 * 5;
|
||||
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
|
||||
switch (accessSize)
|
||||
{
|
||||
case 8: // 8bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
|
||||
return 0;
|
||||
break;
|
||||
case 16: // 16bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
|
||||
return 0;
|
||||
break;
|
||||
case 32: // 32bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
|
||||
break;
|
||||
}
|
||||
emitter.PUSH(4, R0, R1, R2, R3); // 3
|
||||
emitter.MOV(R0, rD); // Value - 4
|
||||
emitter.MOV(R1, R10); // Addr- 5
|
||||
emitter.BL(R14); // 6
|
||||
emitter.POP(4, R0, R1, R2, R3); // 7
|
||||
u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
|
||||
ctx->CTX_PC = newPC;
|
||||
// The new system is a lot easier to backpatch than the old crap.
|
||||
// Instead of backpatching over code and making sure we NOP pad and other crap
|
||||
// We emit both the slow and fast path and branch over the slow path each time
|
||||
// We search backwards until we find the second branch instruction
|
||||
// Then proceed to replace it with a NOP and set that to the new PC.
|
||||
// This ensures that we run the slow path and then branch over the fast path.
|
||||
|
||||
// Run backwards until we find the branch we want to NOP
|
||||
for (int branches = 2; branches > 0; ctx->CTX_PC -= 4)
|
||||
if ((*(u32*)ctx->CTX_PC & 0x0F000000) == 0x0A000000) // B
|
||||
--branches;
|
||||
|
||||
ctx->CTX_PC += 4;
|
||||
ARMXEmitter emitter((u8*)ctx->CTX_PC);
|
||||
emitter.NOP(1);
|
||||
emitter.FlushIcache();
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
const u32 ARMREGOFFSET = 4 * 4;
|
||||
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
|
||||
switch (accessSize)
|
||||
if (Store)
|
||||
{
|
||||
case 8: // 8bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
|
||||
break;
|
||||
case 16: // 16bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
|
||||
break;
|
||||
case 32: // 32bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
|
||||
break;
|
||||
const u32 ARMREGOFFSET = 4 * 5;
|
||||
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
|
||||
switch (accessSize)
|
||||
{
|
||||
case 8: // 8bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
|
||||
return 0;
|
||||
break;
|
||||
case 16: // 16bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
|
||||
return 0;
|
||||
break;
|
||||
case 32: // 32bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
|
||||
break;
|
||||
}
|
||||
emitter.PUSH(4, R0, R1, R2, R3); // 3
|
||||
emitter.MOV(R0, rD); // Value - 4
|
||||
emitter.MOV(R1, R10); // Addr- 5
|
||||
emitter.BL(R14); // 6
|
||||
emitter.POP(4, R0, R1, R2, R3); // 7
|
||||
u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
|
||||
ctx->CTX_PC = newPC;
|
||||
emitter.FlushIcache();
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
const u32 ARMREGOFFSET = 4 * 4;
|
||||
ARMXEmitter emitter(codePtr - ARMREGOFFSET);
|
||||
switch (accessSize)
|
||||
{
|
||||
case 8: // 8bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
|
||||
break;
|
||||
case 16: // 16bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
|
||||
break;
|
||||
case 32: // 32bit
|
||||
emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
|
||||
break;
|
||||
}
|
||||
emitter.PUSH(4, R0, R1, R2, R3); // 3
|
||||
emitter.MOV(R0, R10); // 4
|
||||
emitter.BL(R14); // 5
|
||||
emitter.MOV(R14, R0); // 6
|
||||
emitter.POP(4, R0, R1, R2, R3); // 7
|
||||
emitter.MOV(rD, R14); // 8
|
||||
ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
|
||||
emitter.FlushIcache();
|
||||
return true;
|
||||
}
|
||||
emitter.PUSH(4, R0, R1, R2, R3); // 3
|
||||
emitter.MOV(R0, R10); // 4
|
||||
emitter.BL(R14); // 5
|
||||
emitter.MOV(R14, R0); // 6
|
||||
emitter.POP(4, R0, R1, R2, R3); // 7
|
||||
emitter.MOV(rD, R14); // 8
|
||||
ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
|
||||
emitter.FlushIcache();
|
||||
return true;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -77,9 +77,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
|
|||
break;
|
||||
}
|
||||
|
||||
ARMReg v0 = fpr.R0(inst.FD), v1;
|
||||
ARMReg v0 = fpr.R0(inst.FD, false), v1;
|
||||
if (single)
|
||||
v1 = fpr.R1(inst.FD);
|
||||
v1 = fpr.R1(inst.FD, false);
|
||||
|
||||
if (update)
|
||||
{
|
||||
|
@ -134,28 +134,9 @@ void JitArm::lfXX(UGeckoInstruction inst)
|
|||
if (update)
|
||||
MOV(RA, rB);
|
||||
|
||||
if (false)
|
||||
{
|
||||
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
|
||||
BIC(rB, rB, mask); // 1
|
||||
MOVI2R(rA, (u32)Memory::base, false); // 2-3
|
||||
ADD(rB, rB, rA); // 4
|
||||
|
||||
NEONXEmitter nemit(this);
|
||||
if (single)
|
||||
{
|
||||
VLDR(S0, rB, 0);
|
||||
nemit.VREV32(I_8, D0, D0); // Byte swap to result
|
||||
VCVT(v0, S0, 0);
|
||||
VCVT(v1, S0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
VLDR(v0, rB, 0);
|
||||
nemit.VREV64(I_8, v0, v0); // Byte swap to result
|
||||
}
|
||||
}
|
||||
else
|
||||
// This branch gets changed to a NOP when the fastpath fails
|
||||
FixupBranch fast_path = B();
|
||||
FixupBranch slow_out;
|
||||
{
|
||||
PUSH(4, R0, R1, R2, R3);
|
||||
MOV(R0, rB);
|
||||
|
@ -163,9 +144,7 @@ void JitArm::lfXX(UGeckoInstruction inst)
|
|||
{
|
||||
MOVI2R(rA, (u32)&Memory::Read_U32);
|
||||
BL(rA);
|
||||
|
||||
VMOV(S0, R0);
|
||||
|
||||
VCVT(v0, S0, 0);
|
||||
VCVT(v1, S0, 0);
|
||||
}
|
||||
|
@ -181,7 +160,34 @@ void JitArm::lfXX(UGeckoInstruction inst)
|
|||
#endif
|
||||
}
|
||||
POP(4, R0, R1, R2, R3);
|
||||
slow_out = B();
|
||||
}
|
||||
SetJumpTarget(fast_path);
|
||||
{
|
||||
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
|
||||
ARMReg rC = gpr.GetReg();
|
||||
BIC(rC, rB, mask);
|
||||
MOVI2R(rA, (u32)Memory::base);
|
||||
ADD(rC, rC, rA);
|
||||
|
||||
NEONXEmitter nemit(this);
|
||||
if (single)
|
||||
{
|
||||
nemit.VLD1(F_32, D0, rC);
|
||||
nemit.VREV32(I_8, D0, D0); // Byte swap to result
|
||||
VCVT(v0, S0, 0);
|
||||
VCVT(v1, S0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
nemit.VLD1(I_64, v0, rC);
|
||||
nemit.VREV64(I_8, v0, v0); // Byte swap to result
|
||||
}
|
||||
gpr.Unlock(rC);
|
||||
}
|
||||
|
||||
SetJumpTarget(slow_out);
|
||||
|
||||
gpr.Unlock(rA, rB);
|
||||
SetJumpTarget(DoNotLoad);
|
||||
}
|
||||
|
@ -302,36 +308,17 @@ void JitArm::stfXX(UGeckoInstruction inst)
|
|||
SetCC();
|
||||
}
|
||||
|
||||
if (false)
|
||||
{
|
||||
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
|
||||
BIC(rB, rB, mask); // 1
|
||||
MOVI2R(rA, (u32)Memory::base, false); // 2-3
|
||||
ADD(rB, rB, rA); // 4
|
||||
|
||||
NEONXEmitter nemit(this);
|
||||
if (single)
|
||||
{
|
||||
VCVT(S0, v0, 0);
|
||||
nemit.VREV32(I_8, D0, D0);
|
||||
VSTR(S0, rB, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
nemit.VREV64(I_8, D0, v0);
|
||||
VSTR(D0, rB, 0);
|
||||
}
|
||||
}
|
||||
else
|
||||
// This branch gets changed to a NOP when the fastpath fails
|
||||
FixupBranch fast_path = B();
|
||||
FixupBranch slow_out;
|
||||
{
|
||||
PUSH(4, R0, R1, R2, R3);
|
||||
if (single)
|
||||
{
|
||||
MOVI2R(rA, (u32)&Memory::Write_U32);
|
||||
MOV(R1, rB);
|
||||
VCVT(S0, v0, 0);
|
||||
VMOV(R0, S0);
|
||||
MOV(R1, rB);
|
||||
|
||||
MOVI2R(rA, (u32)&Memory::Write_U32);
|
||||
BL(rA);
|
||||
}
|
||||
else
|
||||
|
@ -347,43 +334,32 @@ void JitArm::stfXX(UGeckoInstruction inst)
|
|||
BL(rA);
|
||||
}
|
||||
POP(4, R0, R1, R2, R3);
|
||||
slow_out = B();
|
||||
}
|
||||
gpr.Unlock(rA, rB);
|
||||
}
|
||||
|
||||
// Some games use stfs as a way to quickly write to the gatherpipe and other hardware areas.
|
||||
// Keep it as a safe store until this can get optimized.
|
||||
// Look at the JIT64 implementation to see how it is done
|
||||
|
||||
void JitArm::stfs(UGeckoInstruction inst)
|
||||
{
|
||||
INSTRUCTION_START
|
||||
JITDISABLE(bJITLoadStoreFloatingOff);
|
||||
|
||||
ARMReg rA = gpr.GetReg();
|
||||
ARMReg rB = gpr.GetReg();
|
||||
ARMReg v0 = fpr.R0(inst.FS);
|
||||
VCVT(S0, v0, 0);
|
||||
|
||||
if (inst.RA)
|
||||
SetJumpTarget(fast_path);
|
||||
{
|
||||
MOVI2R(rB, inst.SIMM_16);
|
||||
ARMReg RA = gpr.R(inst.RA);
|
||||
ADD(rB, rB, RA);
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVI2R(rB, (u32)inst.SIMM_16);
|
||||
Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
|
||||
ARMReg rC = gpr.GetReg();
|
||||
BIC(rC, rB, mask);
|
||||
MOVI2R(rA, (u32)Memory::base);
|
||||
ADD(rC, rC, rA);
|
||||
|
||||
NEONXEmitter nemit(this);
|
||||
if (single)
|
||||
{
|
||||
VCVT(S0, v0, 0);
|
||||
nemit.VREV32(I_8, D0, D0);
|
||||
VSTR(S0, rC, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
nemit.VREV64(I_8, D0, v0);
|
||||
VSTR(D0, rC, 0);
|
||||
}
|
||||
gpr.Unlock(rC);
|
||||
}
|
||||
|
||||
MOVI2R(rA, (u32)&Memory::Write_U32);
|
||||
PUSH(4, R0, R1, R2, R3);
|
||||
VMOV(R0, S0);
|
||||
MOV(R1, rB);
|
||||
|
||||
BL(rA);
|
||||
|
||||
POP(4, R0, R1, R2, R3);
|
||||
SetJumpTarget(slow_out);
|
||||
|
||||
gpr.Unlock(rA, rB);
|
||||
}
|
||||
|
|
|
@ -89,7 +89,7 @@ static GekkoOPTemplate primarytable[] =
|
|||
{50, &JitArm::lfXX}, //"lfd", OPTYPE_LOADFP, FL_IN_A}},
|
||||
{51, &JitArm::lfXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}},
|
||||
|
||||
{52, &JitArm::stfs}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
|
||||
{52, &JitArm::stfXX}, //"stfs", OPTYPE_STOREFP, FL_IN_A}},
|
||||
{53, &JitArm::stfXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
|
||||
{54, &JitArm::stfXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
|
||||
{55, &JitArm::stfXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
|
||||
|
|
Loading…
Reference in New Issue