Connor McLaughlin 2021-05-10 03:07:19 +10:00
parent 9f6c3c8d44
commit 5f947655f5
9 changed files with 133 additions and 83 deletions

View File

@@ -76,7 +76,7 @@ u32 m_ram_code_page_count = 0;
u8* g_ram = nullptr; // 2MB RAM
u32 g_ram_size = 0;
u32 g_ram_mask = 0;
-u8 g_bios[BIOS_SIZE]{}; // 512K BIOS ROM
+alignas(HOST_PAGE_SIZE) u8 g_bios[BIOS_SIZE]{}; // 512K BIOS ROM
static std::array<TickCount, 3> m_exp1_access_time = {};
static std::array<TickCount, 3> m_exp2_access_time = {};
@@ -315,9 +315,11 @@ static ALWAYS_INLINE u32 FastmemAddressToLUTPageIndex(u32 address)
return address >> 12;
}
-static ALWAYS_INLINE_RELEASE void SetLUTFastmemPage(u32 address, u8* ptr, bool writable)
+static ALWAYS_INLINE_RELEASE void SetLUTFastmemPage(u32 address, u8* ptr, bool writable, u32 read_ticks)
{
-m_fastmem_lut[FastmemAddressToLUTPageIndex(address)] = ptr;
+DebugAssert((reinterpret_cast<uintptr_t>(ptr) & HOST_PAGE_OFFSET_MASK) == 0);
+m_fastmem_lut[FastmemAddressToLUTPageIndex(address)] =
+reinterpret_cast<u8*>(reinterpret_cast<uintptr_t>(ptr) | static_cast<uintptr_t>(read_ticks));
m_fastmem_lut[FASTMEM_LUT_NUM_PAGES + FastmemAddressToLUTPageIndex(address)] = writable ? ptr : nullptr;
}
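Note on the tagged LUT entries: because g_ram, g_bios (now alignas(HOST_PAGE_SIZE)) and g_scratchpad are page-aligned, the low bits of every LUT pointer are zero and can carry the per-page read tick count. A minimal host-side sketch of decoding such an entry (names follow the diff; using HOST_PAGE_OFFSET_MASK as the tag mask is an assumption -- the AArch32 emitter below masks with 0xFF):

// Sketch only, not code from this commit.
u8* const entry = m_fastmem_lut[FastmemAddressToLUTPageIndex(address)];
const u32 read_ticks = static_cast<u32>(reinterpret_cast<uintptr_t>(entry) & HOST_PAGE_OFFSET_MASK);
u8* const page = reinterpret_cast<u8*>(reinterpret_cast<uintptr_t>(entry) & ~static_cast<uintptr_t>(HOST_PAGE_OFFSET_MASK));
u32 value;
std::memcpy(&value, page + (address & HOST_PAGE_OFFSET_MASK), sizeof(value)); // load from the backing host page
CPU::g_state.pending_ticks += read_ticks;                                     // charge the access time encoded in the entry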
@@ -451,21 +453,25 @@ void UpdateFastmemViews(CPUFastmemMode mode)
for (u32 address = 0; address < g_ram_size; address += HOST_PAGE_SIZE)
{
SetLUTFastmemPage(base_address + address, &g_ram[address],
-!m_ram_code_bits[FastmemAddressToLUTPageIndex(address)]);
+!m_ram_code_bits[FastmemAddressToLUTPageIndex(address)], RAM_READ_TICKS);
}
};
+auto MapScratchpad = [](u32 base_address) { SetLUTFastmemPage(base_address, CPU::g_scratchpad.data(), true, 0); };
// KUSEG - cached
MapRAM(0x00000000);
MapRAM(0x00200000);
MapRAM(0x00400000);
MapRAM(0x00600000);
+MapScratchpad(0x1F800000);
// KSEG0 - cached
MapRAM(0x80000000);
MapRAM(0x80200000);
MapRAM(0x80400000);
MapRAM(0x80600000);
+MapScratchpad(0x8F800000);
// KSEG1 - uncached
MapRAM(0xA0000000);
@@ -490,7 +496,7 @@ bool CanUseFastmemForAddress(VirtualMemoryAddress address)
#endif
case CPUFastmemMode::LUT:
-return (paddr < g_ram_size);
+return (paddr < g_ram_size) || ((paddr & CPU::DCACHE_LOCATION_MASK) == CPU::DCACHE_LOCATION);
case CPUFastmemMode::Disabled:
default:
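With the scratchpad mapped into the LUT, addresses in the D-cache window now also qualify for fastmem. A worked example of the new check, assuming the usual DuckStation constants (DCACHE_LOCATION = 0x1F800000, DCACHE_LOCATION_MASK = 0xFFFFFC00 for the 1KB scratchpad; neither value is shown in this diff):

// Sketch only: a hypothetical scratchpad address passing the LUT-mode check.
const PhysicalMemoryAddress paddr = 0x1F8000C0; // somewhere inside the scratchpad
const bool ok = (paddr < g_ram_size) || ((paddr & CPU::DCACHE_LOCATION_MASK) == CPU::DCACHE_LOCATION);
// ok == true; previously only the (paddr < g_ram_size) half existed, so scratchpad accesses always took the slow path.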
@@ -548,7 +554,7 @@ void SetCodePageFastmemProtection(u32 page_index, bool writable)
// mirrors...
const u32 ram_address = page_index * HOST_PAGE_SIZE;
for (u32 mirror_start : m_fastmem_ram_mirrors)
-SetLUTFastmemPage(mirror_start + ram_address, &g_ram[ram_address], writable);
+SetLUTFastmemPage(mirror_start + ram_address, &g_ram[ram_address], writable, RAM_READ_TICKS);
}
}
@@ -576,7 +582,7 @@ void ClearRAMCodePageFlags()
{
const u32 addr = (i * HOST_PAGE_SIZE);
for (u32 mirror_start : m_fastmem_ram_mirrors)
-SetLUTFastmemPage(mirror_start + addr, &g_ram[addr], true);
+SetLUTFastmemPage(mirror_start + addr, &g_ram[addr], true, RAM_READ_TICKS);
}
}
}
@@ -664,7 +670,7 @@ u8* GetMemoryRegionPointer(MemoryRegion region)
return nullptr;
case MemoryRegion::Scratchpad:
-return CPU::g_state.dcache.data();
+return CPU::g_scratchpad.data();
case MemoryRegion::BIOS:
return g_bios;
@@ -1451,30 +1457,30 @@ ALWAYS_INLINE static TickCount DoScratchpadAccess(PhysicalMemoryAddress address,
if constexpr (size == MemoryAccessSize::Byte)
{
if constexpr (type == MemoryAccessType::Read)
-value = ZeroExtend32(g_state.dcache[cache_offset]);
+value = ZeroExtend32(g_scratchpad[cache_offset]);
else
-g_state.dcache[cache_offset] = Truncate8(value);
+g_scratchpad[cache_offset] = Truncate8(value);
}
else if constexpr (size == MemoryAccessSize::HalfWord)
{
if constexpr (type == MemoryAccessType::Read)
{
u16 temp;
-std::memcpy(&temp, &g_state.dcache[cache_offset], sizeof(temp));
+std::memcpy(&temp, &g_scratchpad[cache_offset], sizeof(temp));
value = ZeroExtend32(temp);
}
else
{
u16 temp = Truncate16(value);
-std::memcpy(&g_state.dcache[cache_offset], &temp, sizeof(temp));
+std::memcpy(&g_scratchpad[cache_offset], &temp, sizeof(temp));
}
}
else if constexpr (size == MemoryAccessSize::Word)
{
if constexpr (type == MemoryAccessType::Read)
-std::memcpy(&value, &g_state.dcache[cache_offset], sizeof(value));
+std::memcpy(&value, &g_scratchpad[cache_offset], sizeof(value));
else
-std::memcpy(&g_state.dcache[cache_offset], &value, sizeof(value));
+std::memcpy(&g_scratchpad[cache_offset], &value, sizeof(value));
}
return 0;
@@ -1924,7 +1930,7 @@ void* GetDirectReadMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize
if (read_ticks)
*read_ticks = 0;
-return &g_state.dcache[paddr & DCACHE_OFFSET_MASK];
+return &g_scratchpad[paddr & DCACHE_OFFSET_MASK];
}
if (paddr >= BIOS_BASE && paddr < (BIOS_BASE + BIOS_SIZE))
@@ -1955,7 +1961,7 @@ void* GetDirectWriteMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize
#endif
if ((paddr & DCACHE_LOCATION_MASK) == DCACHE_LOCATION)
-return &g_state.dcache[paddr & DCACHE_OFFSET_MASK];
+return &g_scratchpad[paddr & DCACHE_OFFSET_MASK];
return nullptr;
}

View File

@@ -46,7 +46,7 @@ static T DoMemoryRead(PhysicalMemoryAddress address)
if ((address & CPU::DCACHE_LOCATION_MASK) == CPU::DCACHE_LOCATION &&
(address & CPU::DCACHE_OFFSET_MASK) < CPU::DCACHE_SIZE)
{
-std::memcpy(&result, &CPU::g_state.dcache[address & CPU::DCACHE_OFFSET_MASK], sizeof(result));
+std::memcpy(&result, &CPU::g_scratchpad[address & CPU::DCACHE_OFFSET_MASK], sizeof(result));
return result;
}
@@ -74,7 +74,7 @@ static void DoMemoryWrite(PhysicalMemoryAddress address, T value)
if ((address & CPU::DCACHE_LOCATION_MASK) == CPU::DCACHE_LOCATION &&
(address & CPU::DCACHE_OFFSET_MASK) < CPU::DCACHE_SIZE)
{
-std::memcpy(&CPU::g_state.dcache[address & CPU::DCACHE_OFFSET_MASK], &value, sizeof(value));
+std::memcpy(&CPU::g_scratchpad[address & CPU::DCACHE_OFFSET_MASK], &value, sizeof(value));
return;
}

View File

@@ -25,6 +25,7 @@ static void Branch(u32 target);
static void FlushPipeline();
State g_state;
+alignas(HOST_PAGE_SIZE) std::array<u8, DCACHE_SIZE> g_scratchpad;
bool g_using_interpreter = false;
bool TRACE_EXECUTION = false;
@@ -164,7 +165,7 @@ bool DoState(StateWrapper& sw)
sw.Do(&g_state.next_load_delay_reg);
sw.Do(&g_state.next_load_delay_value);
sw.Do(&g_state.cache_control.bits);
-sw.DoBytes(g_state.dcache.data(), g_state.dcache.size());
+sw.DoBytes(g_scratchpad.data(), g_scratchpad.size());
if (!GTE::DoState(sw))
return false;

View File

@@ -81,7 +81,6 @@ struct State
u8* fastmem_base = nullptr;
// data cache (used as scratchpad)
-std::array<u8, DCACHE_SIZE> dcache = {};
std::array<u32, ICACHE_LINES> icache_tags = {};
std::array<u8, ICACHE_SIZE> icache_data = {};
@@ -90,6 +89,7 @@
};
extern State g_state;
+extern std::array<u8, DCACHE_SIZE> g_scratchpad;
extern bool g_using_interpreter;
void Initialize();

View File

@@ -2868,7 +2868,7 @@ CodeGenerator::SpeculativeValue CodeGenerator::SpeculativeReadMemory(VirtualMemo
if ((phys_addr & DCACHE_LOCATION_MASK) == DCACHE_LOCATION)
{
u32 scratchpad_offset = phys_addr & DCACHE_OFFSET_MASK;
-std::memcpy(&value, &CPU::g_state.dcache[scratchpad_offset], sizeof(value));
+std::memcpy(&value, &CPU::g_scratchpad[scratchpad_offset], sizeof(value));
return value;
}

View File

@@ -1203,9 +1203,10 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size,
}
m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), 12);
-m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), HOST_PAGE_OFFSET_MASK);
+m_emit->ubfx(GetHostReg32(RARG2), GetHostReg32(address_reg), 0, 12); // offset = addr & 0xfff
m_emit->ldr(GetHostReg32(RARG1),
a32::MemOperand(GetHostReg32(fastmem_base), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
+m_emit->bic(GetHostReg32(RARG1), GetHostReg32(RARG1), 0xFF); // ptr &= ~cycles
switch (size)
{
@@ -1249,10 +1250,12 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
address_reg = address.host_reg;
}
-m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), 12);
-m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), HOST_PAGE_OFFSET_MASK);
+m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), 12); // page = addr >> 12
+m_emit->ubfx(GetHostReg32(RARG2), GetHostReg32(address_reg), 0, 12); // offset = addr & 0xfff
m_emit->ldr(GetHostReg32(RARG1),
a32::MemOperand(GetHostReg32(fastmem_base), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
+m_emit->and_(GetHostReg32(RSCRATCH), GetHostReg32(RARG1), 0xFF); // scratch = ptr & cycles
+m_emit->bic(GetHostReg32(RARG1), GetHostReg32(RARG1), 0xFF); // ptr &= ~cycles
m_register_cache.InhibitAllocation();
bpi.host_pc = GetCurrentNearCodePointer();
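In C-like terms, the fast path the AArch32 emitter now produces is roughly the following (a sketch of the emitted logic, not code in the tree; addr and fastmem_lut stand in for the registers, and the add of the extracted ticks into pending_ticks happens after the lines shown in this hunk):

const u32 page = addr >> 12;                                      // lsr  RARG1, addr, #12
const u32 offset = addr & 0xFFFu;                                 // ubfx RARG2, addr, #0, #12
uintptr_t entry = reinterpret_cast<uintptr_t>(fastmem_lut[page]); // ldr  RARG1, [fastmem_base, RARG1, lsl #2]
const u32 ticks = static_cast<u32>(entry & 0xFFu);                // and  RSCRATCH, RARG1, #0xFF
entry &= ~static_cast<uintptr_t>(0xFFu);                          // bic  RARG1, RARG1, #0xFF
const u32 value = *reinterpret_cast<const u32*>(entry + offset);  // the guest load itself (word-sized case)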
@@ -1285,16 +1288,8 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
bpi.host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode();
-// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
-DebugAssert(m_delayed_cycles_add > 0);
-EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
-m_delayed_cycles_add += Bus::RAM_READ_TICKS;
EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
-EmitAddCPUStructField(offsetof(State, pending_ticks),
-Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
// restore fastmem base state for the next instruction
if (old_store_fastmem_base)
fastmem_base = GetFastmemStoreBase();
@@ -1409,7 +1404,7 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
// TODO: if this gets backpatched, these instructions are wasted
m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), 12);
-m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), HOST_PAGE_OFFSET_MASK);
+m_emit->ubfx(GetHostReg32(RARG2), GetHostReg32(address_reg), 0, 12);
m_emit->ldr(GetHostReg32(RARG1),
a32::MemOperand(GetHostReg32(fastmem_base), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load

View File

@@ -1368,6 +1368,7 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size,
m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), 12);
m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), HOST_PAGE_OFFSET_MASK);
m_emit->ldr(GetHostReg64(RARG1), a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a64::LSL, 3));
+m_emit->and_(GetHostReg64(RARG1), GetHostReg64(RARG1), ~static_cast<u64>(HOST_PAGE_OFFSET_MASK));
switch (size)
{
@@ -1434,12 +1435,37 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
UnreachableCode();
break;
}
+bpi.host_code_size = static_cast<u32>(
+static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
+// generate slowmem fallback
+bpi.host_slowmem_pc = GetCurrentFarCodePointer();
+SwitchToFarCode();
+// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
+DebugAssert(m_delayed_cycles_add > 0);
+EmitAddCPUStructField(offsetof(State, pending_ticks),
+Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
+m_delayed_cycles_add += Bus::RAM_READ_TICKS;
+EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
+EmitAddCPUStructField(offsetof(State, pending_ticks),
+Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
+// return to the block code
+EmitBranch(GetCurrentNearCodePointer(), false);
+SwitchToNearCode();
}
else
{
m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), 12);
m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), HOST_PAGE_OFFSET_MASK);
m_emit->ldr(GetHostReg64(RARG1), a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a64::LSL, 3));
+m_emit->and_(GetHostReg64(RARG3), GetHostReg64(RARG1), static_cast<u64>(HOST_PAGE_OFFSET_MASK));
+m_emit->and_(GetHostReg64(RARG1), GetHostReg64(RARG1), ~static_cast<u64>(HOST_PAGE_OFFSET_MASK));
bpi.host_pc = GetCurrentNearCodePointer();
@@ -1461,29 +1487,24 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
UnreachableCode();
break;
}
+EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromHostReg(&m_register_cache, RARG3, RegSize_32));
+bpi.host_code_size = static_cast<u32>(
+static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
+// generate slowmem fallback
+bpi.host_slowmem_pc = GetCurrentFarCodePointer();
+SwitchToFarCode();
+EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
+// return to the block code
+EmitBranch(GetCurrentNearCodePointer(), false);
+SwitchToNearCode();
}
-bpi.host_code_size = static_cast<u32>(
-static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
-// generate slowmem fallback
-bpi.host_slowmem_pc = GetCurrentFarCodePointer();
-SwitchToFarCode();
-// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
-DebugAssert(m_delayed_cycles_add > 0);
-EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
-m_delayed_cycles_add += Bus::RAM_READ_TICKS;
-EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
-EmitAddCPUStructField(offsetof(State, pending_ticks),
-Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
-// return to the block code
-EmitBranch(GetCurrentNearCodePointer(), false);
-SwitchToNearCode();
m_register_cache.UninhibitAllocation();
m_block->loadstore_backpatch_info.push_back(bpi);

View File

@@ -1838,6 +1838,7 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size,
m_emit->shr(GetHostReg32(RARG1), 12);
m_emit->and_(GetHostReg32(RARG2), HOST_PAGE_OFFSET_MASK);
m_emit->mov(GetHostReg64(RARG1), m_emit->qword[GetFastmemBasePtrReg() + GetHostReg64(RARG1) * 8]);
+m_emit->and_(GetHostReg64(RARG1), ~static_cast<u32>(HOST_PAGE_OFFSET_MASK));
switch (size)
{
@@ -1926,6 +1927,35 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
}
break;
}
+// insert nops, we need at least 5 bytes for a relative jump
+const u32 fastmem_size =
+static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc));
+const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0);
+for (u32 i = 0; i < nops; i++)
+m_emit->nop();
+bpi.host_code_size = static_cast<u32>(
+static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
+// generate slowmem fallback
+m_far_emitter.align(16);
+bpi.host_slowmem_pc = GetCurrentFarCodePointer();
+SwitchToFarCode();
+// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
+DebugAssert(m_delayed_cycles_add > 0);
+EmitAddCPUStructField(offsetof(State, pending_ticks),
+Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
+m_delayed_cycles_add += Bus::RAM_READ_TICKS;
+EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
+EmitAddCPUStructField(offsetof(State, pending_ticks),
+Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
+// return to the block code
+m_emit->jmp(GetCurrentNearCodePointer());
}
else
{
@@ -1937,6 +1967,9 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
m_emit->shr(GetHostReg32(RARG1), 12);
m_emit->and_(GetHostReg32(RARG2), HOST_PAGE_OFFSET_MASK);
m_emit->mov(GetHostReg64(RARG1), m_emit->qword[GetFastmemBasePtrReg() + GetHostReg64(RARG1) * 8]);
+m_emit->mov(GetHostReg32(RARG3), GetHostReg32(RARG1));
+m_emit->and_(GetHostReg64(RARG1), ~static_cast<u32>(HOST_PAGE_OFFSET_MASK));
+m_emit->and_(GetHostReg32(RARG3), static_cast<u32>(HOST_PAGE_OFFSET_MASK));
bpi.host_pc = GetCurrentNearCodePointer();
switch (size)
@@ -1953,36 +1986,30 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
m_emit->mov(GetHostReg32(result.host_reg), m_emit->dword[GetHostReg64(RARG1) + GetHostReg64(RARG2)]);
break;
}
+m_emit->add(m_emit->dword[GetHostReg64(RCPUPTR) + offsetof(State, pending_ticks)], GetHostReg32(RARG3));
+// insert nops, we need at least 5 bytes for a relative jump
+const u32 fastmem_size =
+static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc));
+const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0);
+for (u32 i = 0; i < nops; i++)
+m_emit->nop();
+bpi.host_code_size = static_cast<u32>(
+static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
+// generate slowmem fallback
+m_far_emitter.align(16);
+bpi.host_slowmem_pc = GetCurrentFarCodePointer();
+SwitchToFarCode();
+EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
+// return to the block code
+m_emit->jmp(GetCurrentNearCodePointer());
}
-// insert nops, we need at least 5 bytes for a relative jump
-const u32 fastmem_size =
-static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc));
-const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0);
-for (u32 i = 0; i < nops; i++)
-m_emit->nop();
-bpi.host_code_size = static_cast<u32>(
-static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
-// generate slowmem fallback
-m_far_emitter.align(16);
-bpi.host_slowmem_pc = GetCurrentFarCodePointer();
-SwitchToFarCode();
-// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
-DebugAssert(m_delayed_cycles_add > 0);
-EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
-m_delayed_cycles_add += Bus::RAM_READ_TICKS;
-EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
-EmitAddCPUStructField(offsetof(State, pending_ticks),
-Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
-// return to the block code
-m_emit->jmp(GetCurrentNearCodePointer());
SwitchToNearCode();
m_register_cache.UninhibitAllocation();
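One reading of the delayed-cycles pair kept in the fallbacks above (add m_delayed_cycles_add, bump it by Bus::RAM_READ_TICKS, then subtract it after the slowmem call): pending_ticks is brought up to date before the C++ handler runs, and afterwards both the flushed cycles and the RAM cost the fast path had already been charged are backed out, since the slow handler accounts for the real access time itself. As arithmetic (D = m_delayed_cycles_add on entry; this is an interpretation, not taken from the commit message):

// pending_ticks += D;                     // flush the block's accrued cycles before calling out
// pending_ticks += slowmem_access_ticks;  // whatever the slow handler charges for the access
// pending_ticks -= (D + RAM_READ_TICKS);  // undo the flush and the pre-charged RAM read cost
// net: slowmem_access_ticks - RAM_READ_TICKS on top of the block's static accounting.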

View File

@@ -1401,7 +1401,7 @@ static T DoMemoryRead(PhysicalMemoryAddress address)
if ((address & CPU::DCACHE_LOCATION_MASK) == CPU::DCACHE_LOCATION &&
(address & CPU::DCACHE_OFFSET_MASK) < CPU::DCACHE_SIZE)
{
-std::memcpy(&result, &CPU::g_state.dcache[address & CPU::DCACHE_OFFSET_MASK], sizeof(result));
+std::memcpy(&result, &CPU::g_scratchpad[address & CPU::DCACHE_OFFSET_MASK], sizeof(result));
return result;
}