Merge pull request #1230 from FioraAeterna/constaddr

JIT: improve handling of stores with a known address
commit 0515ab852e, merged by skidau on 2014-11-05 12:40:38 +11:00
7 changed files with 194 additions and 161 deletions
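The core of the change is the new EmuCodeBlock::WriteToConstAddress helper (Jit_Util.cpp, below): when a store's effective address has been constant-folded at compile time, the emitter picks one of three code paths instead of generating the generic address-translation sequence. A minimal sketch of that dispatch, with stand-in parameters (optimizeGatherPipe and isRAM model jit->jo.optimizeGatherPipe and Memory::IsRAMAddress from the diff; this is illustrative, not Dolphin code):

```cpp
#include <cstdint>

enum class ConstStorePath { GatherPipe, DirectRAM, SlowCall };

ConstStorePath ClassifyConstStore(uint32_t address, int accessSize,
                                  bool optimizeGatherPipe, bool isRAM)
{
	// Stores into the gather pipe's 4 KiB page (0xCC008000) go through the
	// direct FIFO routines; only 8/16/32-bit writes are supported there.
	if ((address & 0xFFFFF000) == 0xCC008000 && optimizeGatherPipe && accessSize <= 32)
		return ConstStorePath::GatherPipe;
	// Plain RAM collapses to a single MOV relative to the fastmem base register.
	if (isRAM)
		return ConstStorePath::DirectRAM;
	// Everything else takes the C++ fallback (Memory::Write_*), which is also
	// the only path that can raise a DSI exception -- hence the bool return.
	return ConstStorePath::SlowCall;
}
```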

View File

@@ -186,8 +186,6 @@ void Jit64AsmRoutineManager::GenerateCommon()
 	GenFifoWrite(16);
 	fifoDirectWrite32 = AlignCode4();
 	GenFifoWrite(32);
-	fifoDirectWriteFloat = AlignCode4();
-	GenFifoFloatWrite();
 	frsqrte = AlignCode4();
 	GenFrsqrte();
 	fres = AlignCode4();

View File

@@ -334,98 +334,54 @@ void Jit64::stX(UGeckoInstruction inst)
 	int s = inst.RS;
 	int a = inst.RA;
-
-	bool update = inst.OPCD & 1;
-
 	s32 offset = (s32)(s16)inst.SIMM_16;
-	if (a || !update)
-	{
-		int accessSize;
-		switch (inst.OPCD & ~1)
-		{
-		case 36: // stw
-			accessSize = 32;
-			break;
-		case 44: // sth
-			accessSize = 16;
-			break;
-		case 38: // stb
-			accessSize = 8;
-			break;
-		default:
-			_assert_msg_(DYNA_REC, 0, "stX: Invalid access size.");
-			return;
-		}
+	bool update = (inst.OPCD & 1) && offset;
+	FALLBACK_IF(update);
+
+	if (!a && update)
+		PanicAlert("Invalid stX");
+
+	int accessSize;
+	switch (inst.OPCD & ~1)
+	{
+	case 36: // stw
+		accessSize = 32;
+		break;
+	case 44: // sth
+		accessSize = 16;
+		break;
+	case 38: // stb
+		accessSize = 8;
+		break;
+	default:
+		_assert_msg_(DYNA_REC, 0, "stX: Invalid access size.");
+		return;
+	}
 
-		if ((a == 0) || gpr.R(a).IsImm())
-		{
-			// If we already know the address through constant folding, we can do some
-			// fun tricks...
-			u32 addr = ((a == 0) ? 0 : (u32)gpr.R(a).offset);
-			addr += offset;
-
-			if ((addr & 0xFFFFF000) == 0xCC008000 && jo.optimizeGatherPipe)
-			{
-				// Helps external systems know which instruction triggered the write
-				MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
-
-				MOV(32, R(RSCRATCH2), gpr.R(s));
-				if (update)
-					gpr.SetImmediate32(a, addr);
-
-				// No need to protect these, they don't touch any state
-				// question - should we inline them instead? Pro: Lose a CALL Con: Code bloat
-				switch (accessSize)
-				{
-				case 8:
-					CALL((void *)asm_routines.fifoDirectWrite8);
-					break;
-				case 16:
-					CALL((void *)asm_routines.fifoDirectWrite16);
-					break;
-				case 32:
-					CALL((void *)asm_routines.fifoDirectWrite32);
-					break;
-				}
-				js.fifoBytesThisBlock += accessSize >> 3;
-				gpr.UnlockAllX();
-				return;
-			}
-			else if (Memory::IsRAMAddress(addr))
-			{
-				MOV(32, R(RSCRATCH), gpr.R(s));
-				WriteToConstRamAddress(accessSize, RSCRATCH, addr, true);
-				if (update)
-					gpr.SetImmediate32(a, addr);
-				return;
-			}
-			else
-			{
-				// Helps external systems know which instruction triggered the write
-				MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
-
-				BitSet32 registersInUse = CallerSavedRegistersInUse();
-				ABI_PushRegistersAndAdjustStack(registersInUse, 0);
-				switch (accessSize)
-				{
-				case 32:
-					ABI_CallFunctionAC(true ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), gpr.R(s), addr);
-					break;
-				case 16:
-					ABI_CallFunctionAC(true ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), gpr.R(s), addr);
-					break;
-				case 8:
-					ABI_CallFunctionAC((void *)&Memory::Write_U8, gpr.R(s), addr);
-					break;
-				}
-				ABI_PopRegistersAndAdjustStack(registersInUse, 0);
-				if (update)
-					gpr.SetImmediate32(a, addr);
-				return;
-			}
-		}
-
+	// If we already know the address of the write
+	if (!a || gpr.R(a).IsImm())
+	{
+		u32 addr = (a ? (u32)gpr.R(a).offset : 0) + offset;
+		bool exception = WriteToConstAddress(accessSize, gpr.R(s), addr, CallerSavedRegistersInUse());
+
+		if (update)
+		{
+			if (!js.memcheck || !exception)
+			{
+				gpr.SetImmediate32(a, addr);
+			}
+			else
+			{
+				gpr.KillImmediate(a, true, true);
+				MEMCHECK_START(false)
+				ADD(32, gpr.R(a), Imm32((u32)offset));
+				MEMCHECK_END
+			}
+		}
+	}
+	else
+	{
 		gpr.Lock(a, s);
-		gpr.BindToRegister(a, true, false);
+		gpr.BindToRegister(a, true, update);
 		if (gpr.R(s).IsImm())
 		{
 			SafeWriteRegToReg(gpr.R(s), gpr.RX(a), accessSize, offset, CallerSavedRegistersInUse(), SAFE_LOADSTORE_CLOBBER_RSCRATCH_INSTEAD_OF_ADDR);

@@ -446,21 +402,14 @@ void Jit64::stX(UGeckoInstruction inst)
 			SafeWriteRegToReg(reg_value, gpr.RX(a), accessSize, offset, CallerSavedRegistersInUse(), SAFE_LOADSTORE_CLOBBER_RSCRATCH_INSTEAD_OF_ADDR);
 		}
 
-		if (update && offset)
+		if (update)
 		{
 			MEMCHECK_START(false)
-			gpr.KillImmediate(a, true, true);
 			ADD(32, gpr.R(a), Imm32((u32)offset));
 			MEMCHECK_END
 		}
-
-		gpr.UnlockAll();
-	}
-	else
-	{
-		PanicAlert("Invalid stX");
 	}
+	gpr.UnlockAll();
 }
 
 void Jit64::stXx(UGeckoInstruction inst)

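With that restructuring, an update store whose base register is already an immediate costs no address arithmetic at runtime. A worked example under assumed values (hypothetical guest instruction; plain C++ standing in for the JIT's register cache):

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical guest instruction: stwu r0, 8(r3), with r3 constant-folded to
// 0xCC008000. The JIT computes the effective address at compile time; since
// the gather-pipe and RAM paths cannot raise a DSI, the updated r3 is folded
// back in as a new immediate too (gpr.SetImmediate32 in the diff).
int main()
{
	uint32_t r3 = 0xCC008000;  // known register value (immediate in the RA slot)
	int32_t simm = 8;          // sign-extended SIMM_16
	uint32_t addr = r3 + simm; // folded at compile time: 0xCC008008
	bool exception = false;    // only the Memory::Write_* fallback returns true
	bool memcheck = false;     // js.memcheck off in this example
	if (!memcheck || !exception)
		r3 = addr;             // update folded, no ADD emitted
	std::printf("store to %08X, r3 now %08X\n", addr, r3);
	return 0;
}
```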
View File

@@ -101,11 +101,50 @@ void Jit64::stfXXX(UGeckoInstruction inst)
 	int s = inst.RS;
 	int a = inst.RA;
 	int b = inst.RB;
+	s32 imm = (s16)inst.SIMM_16;
+	int accessSize = single ? 32 : 64;
 
-	FALLBACK_IF((!indexed && !a) || (update && js.memcheck && a == b));
+	FALLBACK_IF(update && js.memcheck && a == b);
+
+	if (single)
+	{
+		fpr.BindToRegister(s, true, false);
+		ConvertDoubleToSingle(XMM0, fpr.RX(s));
+		MOVD_xmm(R(RSCRATCH), XMM0);
+	}
+	else
+	{
+		if (fpr.R(s).IsSimpleReg())
+			MOVQ_xmm(R(RSCRATCH), fpr.RX(s));
+		else
+			MOV(64, R(RSCRATCH), fpr.R(s));
+	}
+
+	if (!indexed && (!a || gpr.R(a).IsImm()))
+	{
+		u32 addr = (a ? (u32)gpr.R(a).offset : 0) + imm;
+		bool exception = WriteToConstAddress(accessSize, R(RSCRATCH), addr, CallerSavedRegistersInUse());
+
+		if (update)
+		{
+			if (!js.memcheck || !exception)
+			{
+				gpr.SetImmediate32(a, addr);
+			}
+			else
+			{
+				gpr.KillImmediate(a, true, true);
+				MEMCHECK_START(false)
+				ADD(32, gpr.R(a), Imm32((u32)imm));
+				MEMCHECK_END
+			}
+		}
+
+		fpr.UnlockAll();
+		gpr.UnlockAll();
+		return;
+	}
 
 	s32 offset = 0;
-	s32 imm = (s16)inst.SIMM_16;
 	if (indexed)
 	{
 		if (update)

@@ -140,21 +179,8 @@ void Jit64::stfXXX(UGeckoInstruction inst)
 		MOV(32, R(RSCRATCH2), gpr.R(a));
 	}
 
-	if (single)
-	{
-		fpr.BindToRegister(s, true, false);
-		ConvertDoubleToSingle(XMM0, fpr.RX(s));
-		SafeWriteF32ToReg(XMM0, RSCRATCH2, offset, CallerSavedRegistersInUse());
-		fpr.UnlockAll();
-	}
-	else
-	{
-		if (fpr.R(s).IsSimpleReg())
-			MOVQ_xmm(R(RSCRATCH), fpr.RX(s));
-		else
-			MOV(64, R(RSCRATCH), fpr.R(s));
-		SafeWriteRegToReg(RSCRATCH, RSCRATCH2, 64, offset, CallerSavedRegistersInUse());
-	}
+	SafeWriteRegToReg(RSCRATCH, RSCRATCH2, accessSize, offset, CallerSavedRegistersInUse());
 
 	if (js.memcheck && update)
 	{

@@ -162,6 +188,8 @@ void Jit64::stfXXX(UGeckoInstruction inst)
 		SUB(32, gpr.R(a), indexed ? gpr.R(b) : Imm32(imm));
 		MEMCHECK_END
 	}
+
+	fpr.UnlockAll();
 	gpr.UnlockAll();
 	gpr.UnlockAllX();
 }

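stfXXX now moves the value into RSCRATCH once, up front, so the new constant-address path and the existing register path share a single integer store routine; the dedicated SafeWriteF32ToReg helper becomes unnecessary and is deleted from Jit_Util.cpp below. An illustrative scalar equivalent of the ConvertDoubleToSingle + MOVD_xmm pair (a sketch; the real ConvertDoubleToSingle also handles PPC-specific denormal details that a plain cast ignores):

```cpp
#include <cstdint>
#include <cstring>

// Narrow the double to a float, then reinterpret its bits as a u32 so the
// store can travel the ordinary integer write path, byteswap included.
uint32_t FloatBitsForStore(double value)
{
	float narrowed = static_cast<float>(value);  // models ConvertDoubleToSingle
	uint32_t bits;
	std::memcpy(&bits, &narrowed, sizeof(bits)); // models MOVD_xmm(R(RSCRATCH), XMM0)
	return bits;
}
```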
View File

@@ -22,31 +22,13 @@ static int temp32;
 void CommonAsmRoutines::GenFifoWrite(int size)
 {
-	// Assume value in RSCRATCH2
-	PUSH(ESI);
-	MOV(32, R(RSCRATCH), Imm32((u32)(u64)GPFifo::m_gatherPipe));
-	MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-
-	SwapAndStore(size, MComplex(RSCRATCH, ESI, 1, 0), RSCRATCH2);
-
-	ADD(32, R(ESI), Imm8(size >> 3));
-	MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
-	POP(ESI);
-	RET();
-}
-
-void CommonAsmRoutines::GenFifoFloatWrite()
-{
-	// Assume value in XMM0
-	PUSH(ESI);
-	MOVSS(M(&temp32), XMM0);
-	MOV(32, R(RSCRATCH2), M(&temp32));
-	MOV(32, R(RSCRATCH), Imm32((u32)(u64)GPFifo::m_gatherPipe));
-	MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-	SwapAndStore(32, MComplex(RSCRATCH, RSI, 1, 0), RSCRATCH2);
-	ADD(32, R(ESI), Imm8(4));
-	MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
-	POP(ESI);
+	// Assume value in RSCRATCH
+	u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe;
+	_assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!");
+	MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount));
+	SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH);
+	ADD(32, R(RSCRATCH2), Imm8(size >> 3));
+	MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2));
 	RET();
 }

@@ -173,8 +155,8 @@ void CommonAsmRoutines::GenFres()
 // Safe + Fast Quantizers, originally from JITIL by magumagu
 
-static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
+static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
 
 static const float GC_ALIGNED16(m_quantizeTableS[]) =
 {

@@ -386,7 +368,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	// Easy!
 	const u8* storeSingleFloat = AlignCode4();
-	SafeWriteF32ToReg(XMM0, RSCRATCH_EXTRA, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
+	MOVD_xmm(R(RSCRATCH), XMM0);
+	SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	RET();
 
 	/*
 	if (cpu_info.bSSSE3)

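The rewritten GenFifoWrite drops the ESI save/restore and the separate base-pointer load: because the gather pipe is asserted to sit below 2 GiB, its address can be baked into the store as a 32-bit displacement and indexed by the count register alone. A rough C++ model of what the emitted code does for a 32-bit write (stand-in globals for GPFifo::m_gatherPipe and m_gatherPipeCount; GCC/Clang byteswap builtin; alignment and aliasing glossed over, as in the real emitted code):

```cpp
#include <cstdint>

static uint8_t m_gatherPipe[64 * 1024]; // size is illustrative only
static uint32_t m_gatherPipeCount = 0;

inline void FifoWrite32(uint32_t value)
{
	uint32_t count = m_gatherPipeCount;            // MOV RSCRATCH2, [m_gatherPipeCount]
	uint32_t swapped = __builtin_bswap32(value);   // the byteswap half of SwapAndStore
	*reinterpret_cast<uint32_t*>(&m_gatherPipe[count]) = swapped; // store at [disp32 + count]
	m_gatherPipeCount = count + sizeof(value);     // ADD Imm8(size >> 3); MOV back
}
```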
View File

@@ -13,7 +13,6 @@ public:
 	const u8 *fifoDirectWrite8;
 	const u8 *fifoDirectWrite16;
 	const u8 *fifoDirectWrite32;
-	const u8 *fifoDirectWriteFloat;
 
 	const u8 *enterCode;

View File

@@ -422,6 +422,16 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
 	}
 }
 
+static OpArg SwapImmediate(int accessSize, OpArg reg_value)
+{
+	if (accessSize == 32)
+		return Imm32(Common::swap32((u32)reg_value.offset));
+	else if (accessSize == 16)
+		return Imm16(Common::swap16((u16)reg_value.offset));
+	else
+		return Imm8((u8)reg_value.offset);
+}
+
 u8 *EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, bool swap)
 {
 	u8* result = GetWritableCodePtr();

@@ -429,14 +439,7 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, bool swap)
 	if (reg_value.IsImm())
 	{
 		if (swap)
-		{
-			if (accessSize == 32)
-				reg_value = Imm32(Common::swap32((u32)reg_value.offset));
-			else if (accessSize == 16)
-				reg_value = Imm16(Common::swap16((u16)reg_value.offset));
-			else
-				reg_value = Imm8((u8)reg_value.offset);
-		}
+			reg_value = SwapImmediate(accessSize, reg_value);
 		MOV(accessSize, dest, reg_value);
 	}
 	else if (swap)

@@ -461,6 +464,68 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, bool swap)
 	return result;
 }
 
+void EmuCodeBlock::UnsafeWriteGatherPipe(int accessSize)
+{
+	// No need to protect these, they don't touch any state
+	// question - should we inline them instead? Pro: Lose a CALL Con: Code bloat
+	switch (accessSize)
+	{
+	case 8:
+		CALL((void *)jit->GetAsmRoutines()->fifoDirectWrite8);
+		break;
+	case 16:
+		CALL((void *)jit->GetAsmRoutines()->fifoDirectWrite16);
+		break;
+	case 32:
+		CALL((void *)jit->GetAsmRoutines()->fifoDirectWrite32);
+		break;
+	}
+	jit->js.fifoBytesThisBlock += accessSize >> 3;
+}
+
+bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address, BitSet32 registersInUse)
+{
+	// If we already know the address through constant folding, we can do some
+	// fun tricks...
+	if ((address & 0xFFFFF000) == 0xCC008000 && jit->jo.optimizeGatherPipe && accessSize <= 32)
+	{
+		if (!arg.IsSimpleReg() || arg.GetSimpleReg() != RSCRATCH)
+			MOV(32, R(RSCRATCH), arg);
+		UnsafeWriteGatherPipe(accessSize);
+		return false;
+	}
+	else if (Memory::IsRAMAddress(address))
+	{
+		WriteToConstRamAddress(accessSize, arg, address);
+		return false;
+	}
+	else
+	{
+		// Helps external systems know which instruction triggered the write
+		MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
+
+		ABI_PushRegistersAndAdjustStack(registersInUse, 0);
+		switch (accessSize)
+		{
+		case 64:
+			ABI_CallFunctionAC((void *)&Memory::Write_U64, arg, address);
+			break;
+		case 32:
+			ABI_CallFunctionAC((void *)&Memory::Write_U32, arg, address);
+			break;
+		case 16:
+			ABI_CallFunctionAC((void *)&Memory::Write_U16, arg, address);
+			break;
+		case 8:
+			ABI_CallFunctionAC((void *)&Memory::Write_U8, arg, address);
+			break;
+		}
+		ABI_PopRegistersAndAdjustStack(registersInUse, 0);
+		return true;
+	}
+}
+
 void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags)
 {
 	// set the correct immediate format

@@ -565,20 +630,30 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags)
 	SetJumpTarget(exit);
 }
 
-// Destroys the same as SafeWrite plus RSCRATCH. TODO: see if we can avoid temporaries here
-void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, BitSet32 registersInUse, int flags)
+void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address, bool swap)
 {
-	// TODO: PSHUFB might be faster if fastmem supported MOVSS.
-	MOVD_xmm(R(RSCRATCH), xmm_value);
-	SafeWriteRegToReg(RSCRATCH, reg_addr, 32, offset, registersInUse, flags);
-}
+	X64Reg reg;
+	if (arg.IsImm())
+	{
+		arg = SwapImmediate(accessSize, arg);
+		MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), arg);
+		return;
+	}
 
-void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap)
-{
-	if (swap)
-		SwapAndStore(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), arg);
+	if (!arg.IsSimpleReg() || (!cpu_info.bMOVBE && swap && arg.GetSimpleReg() != RSCRATCH))
+	{
+		MOV(accessSize, R(RSCRATCH), arg);
+		reg = RSCRATCH;
+	}
 	else
-		MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(arg));
+	{
+		reg = arg.GetSimpleReg();
+	}
+
+	if (swap)
+		SwapAndStore(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), reg);
+	else
+		MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(reg));
 }
 
 void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm)

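SwapImmediate is the compile-time half of the byteswap story: immediate operands are swapped while the code is being emitted, so the generated MOV stores guest-big-endian bytes with no runtime BSWAP at all. A self-contained model of the transformation on plain integers (GCC/Clang builtins standing in for Common::swap32/swap16; 8-bit values pass through unchanged):

```cpp
#include <cstdint>
#include <cassert>

uint32_t SwapImm(int accessSize, uint32_t value)
{
	if (accessSize == 32)
		return __builtin_bswap32(value);
	else if (accessSize == 16)
		return __builtin_bswap16(static_cast<uint16_t>(value));
	else
		return static_cast<uint8_t>(value);
}

int main()
{
	// A guest big-endian 0x12345678 lands in host memory as 0x78563412.
	assert(SwapImm(32, 0x12345678) == 0x78563412);
	assert(SwapImm(16, 0x1234) == 0x3412);
	assert(SwapImm(8, 0x12) == 0x12);
	return 0;
}
```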
View File

@@ -87,6 +87,7 @@ public:
 		return UnsafeWriteRegToReg(R(reg_value), reg_addr, accessSize, offset, swap);
 	}
 	u8 *UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, bool signExtend);
+	void UnsafeWriteGatherPipe(int accessSize);
 
 	// Generate a load/write from the MMIO handler for a given address. Only
 	// call for known addresses in MMIO range (MMIO::IsMMIOAddress).

@@ -116,9 +117,9 @@ public:
 		return swap && !cpu_info.bMOVBE && accessSize > 8;
 	}
 
-	void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, BitSet32 registersInUse, int flags = 0);
-
-	void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
+	void WriteToConstRamAddress(int accessSize, Gen::OpArg arg, u32 address, bool swap = true);
+	// returns true if an exception could have been caused
+	bool WriteToConstAddress(int accessSize, Gen::OpArg arg, u32 address, BitSet32 registersInUse);
 
 	void JitGetAndClearCAOV(bool oe);
 	void JitSetCA();
 	void JitSetCAIf(Gen::CCFlags conditionCode);
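Two header-level details worth noting: WriteToConstRamAddress now accepts any Gen::OpArg (immediates included) and its swap default flips from false to true, matching the dominant call pattern, while WriteToConstAddress reports whether the slow path ran so callers under memcheck know if a DSI may have occurred. A hedged sketch of the resulting call shapes (illustrative arguments only, not lines from the diff):

```cpp
// Before: register operand only, byteswap opt-in.
//   WriteToConstRamAddress(32, RSCRATCH, addr, true);
// After: any OpArg, byteswap on by default; immediates are pre-swapped at
// compile time via SwapImmediate and stored with a single MOV.
//   WriteToConstRamAddress(32, R(RSCRATCH), addr);
//   WriteToConstRamAddress(32, Imm32(value), addr);
// The bool result feeds the update logic in stX/stfXXX:
//   bool exception = WriteToConstAddress(32, gpr.R(s), addr, registersInUse);
```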