wince: 32-bit virtual mem space

use fast mem read/write for x64 and arm64 dynarecs
Flyinghead 2019-04-29 18:23:00 +02:00
parent 693a6c97f3
commit 810b8a59da
17 changed files with 1024 additions and 317 deletions
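The change reserves a 4 GiB region of host address space (vmem32_base) that mirrors the SH4's 32-bit virtual address space. With the full MMU enabled (Windows CE games), the x64 and arm64 dynarecs can then turn a guest load/store into a single base+offset host access instead of calling the ReadMem/WriteMem helpers; pages are materialized lazily from the TLB when an access faults, and accesses that cannot be backed directly are rewritten back to the slow path. A minimal sketch of the idea (illustration only, not code from this commit):

#include <cstdint>

extern uint8_t *vmem32_base;   // 4 GiB host reservation, mostly inaccessible

// What the emitted fast path boils down to: one load off vmem32_base. If the
// page is not mapped yet, the host fault handler consults the SH4 TLB and
// either maps the backing RAM/VRAM/ARAM page into the reservation so the
// access can retry, raises a guest MMU exception, or has the call site
// rewritten to the slow, handler-based path.
static inline uint32_t read32_fast(uint32_t guest_addr)
{
    return *reinterpret_cast<uint32_t *>(vmem32_base + guest_addr);
}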

View File

@ -288,6 +288,10 @@
#define FEAT_HAS_SOFTREND BUILD_COMPILER == COMPILER_VC //GCC wants us to enable sse4 globally to enable intrins
#endif
#if HOST_CPU == CPU_X64 || HOST_CPU == CPU_ARM64
#define HOST_64BIT_CPU
#endif
#define RAM_SIZE_MAX (32*1024*1024)
#define VRAM_SIZE_MAX (16*1024*1024)
#define ARAM_SIZE_MAX (8*1024*1024)

core/hw/mem/vmem32.cpp (new file, 396 lines)
View File

@ -0,0 +1,396 @@
/*
* vmem32.cpp
*
* Created on: Apr 11, 2019
* Author: Flyinghead
*/
#include <unordered_set>
#include "build.h"
#include "vmem32.h"
#include "_vmem.h"
#if HOST_OS == OS_WINDOWS
#include <Windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h> /* For mode constants */
#include <fcntl.h> /* For O_* constants */
#include <unistd.h>
#include <errno.h>
#ifdef _ANDROID
#include <linux/ashmem.h>
#endif
#endif
#ifndef MAP_NOSYNC
#define MAP_NOSYNC 0
#endif
#include "types.h"
#include "hw/sh4/dyna/ngen.h"
#include "hw/sh4/modules/mmu.h"
extern bool VramLockedWriteOffset(size_t offset);
extern cMutex vramlist_lock;
#if HOST_OS == OS_WINDOWS
extern HANDLE mem_handle;
#else
extern int vmem_fd;
#endif
#define VMEM32_ERROR_NOT_MAPPED 0x100
// FIXME stolen from _vmem.cpp
#define MAP_RAM_START_OFFSET 0
#define MAP_VRAM_START_OFFSET (MAP_RAM_START_OFFSET+RAM_SIZE)
#define MAP_ARAM_START_OFFSET (MAP_VRAM_START_OFFSET+VRAM_SIZE)
static const u64 VMEM32_SIZE = 0x100000000L;
static const u64 KERNEL_SPACE = 0x80000000L;
static const u64 AREA7_ADDRESS = 0x7C000000L;
#define VRAM_PROT_SEGMENT (1024 * 1024) // vram protection regions are grouped by 1MB segment
u8* vmem32_base;
unordered_set<u32> vram_mapped_pages;
vector<vram_block*> vram_blocks[VRAM_SIZE / VRAM_PROT_SEGMENT];
// stats
u64 vmem32_page_faults;
u64 vmem32_flush;
static void* vmem32_map_buffer(u32 dst, u32 addrsz, u32 offset, u32 size, bool write)
{
void* ptr;
void* rv;
//printf("MAP32 %08X w/ %d\n",dst,offset);
u32 map_times = addrsz / size;
#if HOST_OS == OS_WINDOWS
rv = MapViewOfFileEx(mem_handle, FILE_MAP_READ | (write ? FILE_MAP_WRITE : 0), 0, offset, size, &vmem32_base[dst]);
if (rv == NULL)
return NULL;
for (u32 i = 1; i < map_times; i++)
{
dst += size;
ptr = MapViewOfFileEx(mem_handle, FILE_MAP_READ | (write ? FILE_MAP_WRITE : 0), 0, offset, size, &vmem32_base[dst]);
if (ptr == NULL)
return NULL;
}
#else
u32 prot = PROT_READ | (write ? PROT_WRITE : 0);
rv = mmap(&vmem32_base[dst], size, prot, MAP_SHARED | MAP_NOSYNC | MAP_FIXED, vmem_fd, offset);
if (MAP_FAILED == rv)
{
printf("MAP1 failed %d\n", errno);
return NULL;
}
for (u32 i = 1; i < map_times; i++)
{
dst += size;
ptr = mmap(&vmem32_base[dst], size, prot , MAP_SHARED | MAP_NOSYNC | MAP_FIXED, vmem_fd, offset);
if (MAP_FAILED == ptr)
{
printf("MAP2 failed %d\n", errno);
return NULL;
}
}
#endif
return rv;
}
static void vmem32_unmap_buffer(u32 start, u64 end)
{
#if HOST_OS == OS_LINUX
mmap(&vmem32_base[start], end - start, PROT_NONE, MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0);
#elif HOST_OS == OS_WINDOWS
VirtualAlloc(&vmem32_base[start], end - start, MEM_RESERVE, PAGE_NOACCESS);
#else
#error Unsupported OS
#endif
}
static void vmem32_protect_buffer(u32 start, u32 size)
{
verify((start & PAGE_MASK) == 0);
#if HOST_OS == OS_LINUX
mprotect(&vmem32_base[start], size, PROT_READ);
#elif HOST_OS == OS_WINDOWS
DWORD old;
VirtualProtect(vmem32_base + start, size, PAGE_READONLY, &old);
#else
#error Unsupported OS
#endif
}
static void vmem32_unprotect_buffer(u32 start, u32 size)
{
verify((start & PAGE_MASK) == 0);
#if HOST_OS == OS_LINUX
mprotect(&vmem32_base[start], size, PROT_READ | PROT_WRITE);
#elif HOST_OS == OS_WINDOWS
DWORD old;
VirtualProtect(vmem32_base + start, size, PAGE_READWRITE, &old);
#else
#error Unsupported OS
#endif
}
void vmem32_protect_vram(vram_block *block)
{
if (vmem32_base == NULL)
return;
for (int i = block->start / VRAM_PROT_SEGMENT; i <= block->end / VRAM_PROT_SEGMENT; i++)
{
vram_blocks[i].push_back(block);
}
}
void vmem32_unprotect_vram(vram_block *block)
{
if (vmem32_base == NULL)
return;
for (int page = block->start / VRAM_PROT_SEGMENT; page <= block->end / VRAM_PROT_SEGMENT; page++)
{
for (int i = 0; i < vram_blocks[page].size(); i++)
if (vram_blocks[page][i] == block)
{
vram_blocks[page].erase(vram_blocks[page].begin() + i);
break;
}
}
}
static bool vmem32_map_areas()
{
// Aica ram
vmem32_map_buffer(0x80800000, 0x00800000, MAP_ARAM_START_OFFSET, ARAM_SIZE, true); // P1
vmem32_map_buffer(0x82800000, ARAM_SIZE, MAP_ARAM_START_OFFSET, ARAM_SIZE, true);
vmem32_map_buffer(0xA0800000, 0x00800000, MAP_ARAM_START_OFFSET, ARAM_SIZE, true); // P2
vmem32_map_buffer(0xA2800000, ARAM_SIZE, MAP_ARAM_START_OFFSET, ARAM_SIZE, true);
// Vram
// Note: this should be mapped read/write but doesn't seem to be used
vmem32_map_buffer(0x84000000, 0x01000000, MAP_VRAM_START_OFFSET, VRAM_SIZE, false); // P1
vmem32_map_buffer(0x86000000, 0x01000000, MAP_VRAM_START_OFFSET, VRAM_SIZE, false);
vmem32_map_buffer(0xA4000000, 0x01000000, MAP_VRAM_START_OFFSET, VRAM_SIZE, false); // P2
vmem32_map_buffer(0xA6000000, 0x01000000, MAP_VRAM_START_OFFSET, VRAM_SIZE, false);
// System ram
vmem32_map_buffer(0x8C000000, 0x04000000, MAP_RAM_START_OFFSET, RAM_SIZE, true); // P1
vmem32_map_buffer(0xAC000000, 0x04000000, MAP_RAM_START_OFFSET, RAM_SIZE, true); // P2
return true;
}
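vmem32_map_buffer fills a guest window with repeated mappings of the same backing region (map_times = addrsz / size), so the hardware mirroring of RAM, VRAM and ARAM falls out of the page tables: with Dreamcast sizes the 64 MiB system-RAM windows at 0x8C000000/0xAC000000 hold four images of the 16 MiB RAM, and the 8 MiB ARAM windows at 0x80800000/0xA0800000 hold four images of the 2 MiB ARAM. A small self-contained illustration of that arithmetic (not part of the file):

#include <cassert>
#include <cstdint>

// Offset into the backing region for an address inside a mirrored window.
// Every image of the backing region placed at dst, dst+size, dst+2*size, ...
// makes (addr & (size - 1)) land on the same backing byte.
static uint32_t mirror_offset(uint32_t addr_in_window, uint32_t backing_size)
{
    return addr_in_window & (backing_size - 1);   // backing_size is a power of two
}

int main()
{
    const uint32_t RAM = 16 * 1024 * 1024;        // Dreamcast system RAM
    // 0x8C000004 and its mirror 0x8D000004 hit the same RAM byte.
    assert(mirror_offset(0x00000004, RAM) == mirror_offset(0x01000004, RAM));
    return 0;
}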
static const u32 page_sizes[] = { 1024, 4 * 1024, 64 * 1024, 1024 * 1024 };
static u32 vmem32_paddr_to_offset(u32 address)
{
u32 low_addr = address & 0x1FFFFFFF;
switch ((address >> 26) & 7)
{
case 0: // area 0
// Aica ram
if (low_addr >= 0x00800000 && low_addr < 0x00800000 + 0x00800000)
{
return ((low_addr - 0x00800000) & (ARAM_SIZE - 1)) + MAP_ARAM_START_OFFSET;
}
else if (low_addr >= 0x02800000 && low_addr < 0x02800000 + 0x00800000)
{
return low_addr - 0x02800000 + MAP_ARAM_START_OFFSET;
}
break;
case 1: // area 1
// Vram
if (low_addr >= 0x04000000 && low_addr < 0x04000000 + 0x01000000)
{
return ((low_addr - 0x04000000) & (VRAM_SIZE - 1)) + MAP_VRAM_START_OFFSET;
}
else if (low_addr >= 0x06000000 && low_addr < 0x06000000 + 0x01000000)
{
return ((low_addr - 0x06000000) & (VRAM_SIZE - 1)) + MAP_VRAM_START_OFFSET;
}
break;
case 3: // area 3
// System ram
if (low_addr >= 0x0C000000 && low_addr < 0x0C000000 + 0x04000000)
{
return ((low_addr - 0x0C000000) & (RAM_SIZE - 1)) + MAP_RAM_START_OFFSET;
}
break;
//case 4:
// TODO vram?
//break;
default:
break;
}
// Unmapped address
return -1;
}
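The translation above folds a physical address into the single backing allocation laid out as RAM | VRAM | ARAM (the MAP_*_START_OFFSET constants borrowed from _vmem.cpp). A tiny worked example, assuming Dreamcast sizes (16 MiB RAM, 8 MiB VRAM), illustration only:

#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t RAM_SIZE  = 16 * 1024 * 1024;
    const uint32_t VRAM_SIZE = 8 * 1024 * 1024;
    const uint32_t MAP_RAM_START_OFFSET  = 0;
    const uint32_t MAP_VRAM_START_OFFSET = MAP_RAM_START_OFFSET + RAM_SIZE;

    // Physical 0x0D000004 is a mirror of RAM byte 4 (area 3, folded by RAM_SIZE - 1).
    assert((((0x0D000004 & 0x1FFFFFFF) - 0x0C000000) & (RAM_SIZE - 1)) + MAP_RAM_START_OFFSET == 4);
    // Physical 0x04000010 is VRAM byte 0x10, which lives right after RAM in the backing allocation.
    assert((((0x04000010 & 0x1FFFFFFF) - 0x04000000) & (VRAM_SIZE - 1)) + MAP_VRAM_START_OFFSET
            == RAM_SIZE + 0x10);
    return 0;
}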
static u32 vmem32_map_mmu(u32 address, bool write)
{
#ifndef NO_MMU
u32 pa;
const TLB_Entry *entry;
u32 rc = mmu_full_lookup<false>(address, &entry, pa);
if (rc == MMU_ERROR_NONE)
{
//0X & User mode-> protection violation
//if ((entry->Data.PR >> 1) == 0 && p_sh4rcb->cntx.sr.MD == 0)
// return MMU_ERROR_PROTECTED;
//if (write)
//{
// if ((entry->Data.PR & 1) == 0)
// return MMU_ERROR_PROTECTED;
// if (entry->Data.D == 0)
// return MMU_ERROR_FIRSTWRITE;
//}
u32 page_size = page_sizes[entry->Data.SZ1 * 2 + entry->Data.SZ0];
if (page_size == 1024)
return VMEM32_ERROR_NOT_MAPPED;
u32 vpn = (entry->Address.VPN << 10) & ~(page_size - 1);
u32 ppn = (entry->Data.PPN << 10) & ~(page_size - 1);
u32 offset = vmem32_paddr_to_offset(ppn);
if (offset == -1)
return VMEM32_ERROR_NOT_MAPPED;
if (offset >= MAP_VRAM_START_OFFSET && offset < MAP_VRAM_START_OFFSET + VRAM_SIZE)
{
// Check vram protected regions
u32 start = offset - MAP_VRAM_START_OFFSET;
if (!vram_mapped_pages.insert(vpn).second)
{
// page has been mapped already: vram locked write
vmem32_unprotect_buffer(address & ~PAGE_MASK, PAGE_SIZE);
u32 addr_offset = start + (address & (page_size - 1));
VramLockedWriteOffset(addr_offset);
return MMU_ERROR_NONE;
}
verify(vmem32_map_buffer(vpn, page_size, offset, page_size, (entry->Data.PR & 1) != 0) != NULL);
u32 end = start + page_size;
const vector<vram_block *>& blocks = vram_blocks[start / VRAM_PROT_SEGMENT];
vramlist_lock.Lock();
for (int i = blocks.size() - 1; i >= 0; i--)
{
if (blocks[i]->start < end && blocks[i]->end >= start)
{
u32 prot_start = max(start, blocks[i]->start);
u32 prot_size = min(end, blocks[i]->end + 1) - prot_start;
prot_size += prot_start % PAGE_SIZE;
prot_start &= ~PAGE_MASK;
vmem32_protect_buffer(vpn + (prot_start & (page_size - 1)), prot_size);
}
}
vramlist_lock.Unlock();
}
else
// Not vram
verify(vmem32_map_buffer(vpn, page_size, offset, page_size, (entry->Data.PR & 1) != 0) != NULL);
return MMU_ERROR_NONE;
}
#else
u32 rc = MMU_ERROR_PROTECTED;
#endif
return rc;
}
static u32 vmem32_map_address(u32 address, bool write)
{
u32 area = address >> 29;
switch (area)
{
case 3: // P0/U0
if (address >= AREA7_ADDRESS)
// area 7: unmapped
return VMEM32_ERROR_NOT_MAPPED;
/* no break */
case 0:
case 1:
case 2:
case 6: // P3
return vmem32_map_mmu(address, write);
default:
break;
}
return VMEM32_ERROR_NOT_MAPPED;
}
#if !defined(NO_MMU) && defined(HOST_64BIT_CPU)
bool vmem32_handle_signal(void *fault_addr, bool write)
{
if ((u8*)fault_addr < vmem32_base || (u8*)fault_addr >= vmem32_base + VMEM32_SIZE)
return false;
vmem32_page_faults++;
u32 guest_addr = (u8*)fault_addr - vmem32_base;
u32 rv = vmem32_map_address(guest_addr, write);
//printf("vmem32_handle_signal handled signal %s @ %p -> %08x rv=%d\n", write ? "W" : "R", fault_addr, guest_addr, rv);
if (rv == MMU_ERROR_NONE)
return true;
if (rv == VMEM32_ERROR_NOT_MAPPED)
return false;
p_sh4rcb->cntx.pc = p_sh4rcb->cntx.exception_pc;
DoMMUException(guest_addr, rv, write ? MMU_TT_DWRITE : MMU_TT_DREAD);
ngen_HandleException();
// not reached
return true;
}
#endif
void vmem32_flush_mmu()
{
vmem32_flush++;
vram_mapped_pages.clear();
vmem32_unmap_buffer(0, KERNEL_SPACE);
// TODO flush P3?
}
bool vmem32_init()
{
if (!_nvmem_enabled())
return false;
#ifdef HOST_64BIT_CPU
#if HOST_OS == OS_LINUX
void* rv = mmap(0, VMEM32_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
verify(rv != MAP_FAILED);
munmap(rv, VMEM32_SIZE);
vmem32_base = (u8*)rv;
#elif HOST_OS == OS_WINDOWS
void* rv = (u8 *)VirtualAlloc(0, VMEM32_SIZE, MEM_RESERVE, PAGE_NOACCESS);
if (rv != NULL)
VirtualFree(rv, 0, MEM_RELEASE);
vmem32_base = (u8*)rv;
#else
#error Unsupported OS
#endif
vmem32_unmap_buffer(0, VMEM32_SIZE);
printf("vmem32_init: allocated %zx bytes from %p to %p\n", VMEM32_SIZE, vmem32_base, vmem32_base + VMEM32_SIZE);
if (!vmem32_map_areas())
{
vmem32_term();
return false;
}
#endif
return true;
}
void vmem32_term()
{
if (vmem32_base != NULL)
{
munmap(vmem32_base, VMEM32_SIZE);
vmem32_base = NULL;
}
}

core/hw/mem/vmem32.h (new file, 11 lines)
View File

@ -0,0 +1,11 @@
#include "types.h"
bool vmem32_init();
void vmem32_term();
bool vmem32_handle_signal(void *fault_addr, bool write);
void vmem32_flush_mmu();
void vmem32_protect_vram(vram_block *block);
void vmem32_unprotect_vram(vram_block *block);
static inline bool vmem32_enabled() {
return !settings.dynarec.disable_vmem32;
}
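The rest of the commit wires this small API in at three places: reicast_init reserves the space after the normal nvmem setup, the platform fault handlers give vmem32 first refusal on access faults, and the CCN module flushes the mappings when the guest switches address space or invalidates the TLB. A condensed usage sketch (the function names here are placeholders, not the real call sites):

// Condensed usage sketch, not literal code from any single file in this commit.
#include <cstdio>
#include "hw/mem/vmem32.h"

void emu_start()
{
    // after the usual _vmem / nvmem setup...
    if (!vmem32_init())                    // reserve the 4 GiB guest window
        printf("Failed to alloc 32-bit mem space\n");
}

bool host_fault(void *fault_addr, bool write)
{
    // give vmem32 first refusal; on success the faulting access is simply retried
    if (vmem32_handle_signal(fault_addr, write))
        return true;
    return false;                          // fall through to VRAM / SMC / rewrite handling
}

void guest_address_space_changed()
{
    if (vmem32_enabled())
        vmem32_flush_mmu();                // lazily created mappings are per-ASID
}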

View File

@ -88,12 +88,11 @@ void clear_temp_cache(bool full)
void recSh4_ClearCache()
{
printf("recSh4:Dynarec Cache clear at %08X free space %d\n",curr_pc, emit_FreeSpace());
LastAddr=LastAddr_min;
bm_Reset();
smc_hotspots.clear();
clear_temp_cache(true);
printf("recSh4:Dynarec Cache clear at %08X\n",curr_pc);
}
void recSh4_Run()
@ -279,7 +278,7 @@ DynarecCodeEntryPtr rdv_CompilePC(u32 blockcheck_failures)
emit_ptr_limit = (u32 *)(TempCodeCache + TEMP_CODE_SIZE);
rbi->temp_block = true;
}
bool do_opts=((rbi->addr&0x3FFFFFFF)>0x0C010100);
bool do_opts = !rbi->temp_block; //((rbi->addr&0x3FFFFFFF)>0x0C010100);
rbi->staging_runs=do_opts?100:-100;
ngen_Compile(rbi,DoCheck(rbi->addr),(pc&0xFFFFFF)==0x08300 || (pc&0xFFFFFF)==0x10000,false,do_opts);
verify(rbi->code!=0);

View File

@ -100,6 +100,7 @@ extern void (*ngen_FailedToFindBlock)();
void ngen_mainloop(void* cntx);
void ngen_GetFeatures(ngen_features* dst);
void ngen_HandleException();
//Canonical callback interface
enum CanonicalParamType

View File

@ -124,6 +124,8 @@ void Sh4_int_Skip()
}
}
extern u8 *vmem32_base;
void Sh4_int_Reset(bool Manual)
{
if (sh4_int_bCpuRun)
@ -148,6 +150,8 @@ void Sh4_int_Reset(bool Manual)
old_fpscr=fpscr;
UpdateFPSCR();
p_sh4rcb->cntx.vmem32_base = vmem32_base;
//Any more registers have default value ?
printf("Sh4 Reset\n");
}

View File

@ -2066,7 +2066,7 @@ sh4op(i0000_nnnn_0110_1010)
{
u32 n = GetN(op);
r[n] = fpscr.full;
UpdateFPSCR();
//UpdateFPSCR();
}
//sts.l FPSCR,@-<REG_N>

View File

@ -8,6 +8,7 @@
#include "../sh4_core.h"
#include "hw/pvr/pvr_mem.h"
#include "hw/mem/_vmem.h"
#include "hw/mem/vmem32.h"
#include "mmu.h"
//Types
@ -41,6 +42,16 @@ void CCN_QACR_write(u32 addr, u32 value)
}
}
void CCN_PTEH_write(u32 addr, u32 value)
{
CCN_PTEH_type temp;
temp.reg_data = value;
if (temp.ASID != CCN_PTEH.ASID && vmem32_enabled())
vmem32_flush_mmu();
CCN_PTEH = temp;
}
void CCN_MMUCR_write(u32 addr, u32 value)
{
CCN_MMUCR_type temp;
@ -52,6 +63,8 @@ void CCN_MMUCR_write(u32 addr, u32 value)
{
//sh4_cpu.ResetCache();
mmu_flush_table();
if (vmem32_enabled())
vmem32_flush_mmu();
temp.TI = 0;
}
@ -99,7 +112,7 @@ static u32 CCN_PRR_read(u32 addr)
void ccn_init()
{
//CCN PTEH 0xFF000000 0x1F000000 32 Undefined Undefined Held Held Iclk
sh4_rio_reg(CCN,CCN_PTEH_addr,RIO_DATA,32);
sh4_rio_reg(CCN,CCN_PTEH_addr,RIO_WF,32,0,&CCN_PTEH_write);
//CCN PTEL 0xFF000004 0x1F000004 32 Undefined Undefined Held Held Iclk
sh4_rio_reg(CCN,CCN_PTEL_addr,RIO_DATA,32);

View File

@ -282,6 +282,9 @@ struct Sh4Context
int sh4_sched_next;
u32 interrupt_pend;
u32 exception_pc;
u8 *vmem32_base;
};
u64 raw[64-8];
};

View File

@ -29,6 +29,7 @@
#endif
#include <unistd.h>
#include "hw/sh4/dyna/blockmanager.h"
#include "hw/mem/vmem32.h"
#include "linux/context.h"
@ -48,7 +49,7 @@ void sigill_handler(int sn, siginfo_t * si, void *segfault_ctx) {
context_from_segfault(&ctx, segfault_ctx);
unat pc = (unat)ctx.pc;
bool dyna_cde = (pc>(unat)CodeCache) && (pc<(unat)(CodeCache + CODE_SIZE));
bool dyna_cde = (pc>(unat)CodeCache) && (pc<(unat)(CodeCache + CODE_SIZE + TEMP_CODE_SIZE));
printf("SIGILL @ %lx -> %p was not in vram, dynacode:%d\n", pc, si->si_addr, dyna_cde);
@ -64,12 +65,21 @@ void fault_handler (int sn, siginfo_t * si, void *segfault_ctx)
context_from_segfault(&ctx, segfault_ctx);
bool dyna_cde = ((unat)ctx.pc>(unat)CodeCache) && ((unat)ctx.pc<(unat)(CodeCache + CODE_SIZE));
bool dyna_cde = ((unat)ctx.pc>(unat)CodeCache) && ((unat)ctx.pc<(unat)(CodeCache + CODE_SIZE + TEMP_CODE_SIZE));
//ucontext_t* ctx=(ucontext_t*)ctxr;
//printf("mprot hit @ ptr 0x%08X @@ code: %08X, %d\n",si->si_addr,ctx->uc_mcontext.arm_pc,dyna_cde);
#if !defined(NO_MMU) && defined(HOST_64BIT_CPU)
#if HOST_CPU == CPU_ARM64
u32 op = *(u32*)ctx.pc;
bool write = (op & 0x00400000) == 0;
#elif HOST_CPU == CPU_X64
bool write = false; // TODO?
#endif
if (vmem32_handle_signal(si->si_addr, write))
return;
#endif
if (VramLockedWrite((u8*)si->si_addr) || BM_LockedWrite((u8*)si->si_addr))
return;
#if FEAT_SHREC == DYNAREC_JIT
@ -91,7 +101,10 @@ void fault_handler (int sn, siginfo_t * si, void *segfault_ctx)
context_to_segfault(&ctx, segfault_ctx);
}
#elif HOST_CPU == CPU_X64
//x64 has no rewrite support
else if (dyna_cde && ngen_Rewrite((unat&)ctx.pc, 0, 0))
{
context_to_segfault(&ctx, segfault_ctx);
}
#elif HOST_CPU == CPU_ARM64
else if (dyna_cde && ngen_Rewrite(ctx.pc, 0, 0))
{

View File

@ -6,6 +6,7 @@
#include "oslib/oslib.h"
#include "oslib/audiostream.h"
#include "hw/mem/_vmem.h"
#include "hw/mem/vmem32.h"
#include "stdclass.h"
#include "cfg/cfg.h"
@ -140,7 +141,9 @@ void LoadSpecialSettings()
extra_depth_game = false;
full_mmu_game = false;
if (reios_windows_ce)
if (reios_windows_ce
// Half-life
|| !strncmp("MK-51035", reios_product_number, 8))
{
printf("Enabling Full MMU and Extra depth scaling for Windows CE game\n");
settings.rend.ExtraDepthScale = 0.1;
@ -275,6 +278,13 @@ int reicast_init(int argc, char* argv[])
printf("Failed to alloc mem\n");
return -1;
}
#ifdef HOST_64BIT_CPU
if (!vmem32_init())
{
printf("Failed to alloc 32-bit mem space\n");
return -1;
}
#endif
if (ParseCommandLine(argc, argv))
{
return 69;
@ -461,6 +471,7 @@ void InitSettings()
settings.dynarec.idleskip = true;
settings.dynarec.unstable_opt = false;
settings.dynarec.safemode = true;
settings.dynarec.disable_vmem32 = false;
settings.dreamcast.cable = 3; // TV composite
settings.dreamcast.region = 3; // default
settings.dreamcast.broadcast = 4; // default
@ -534,6 +545,7 @@ void LoadSettings(bool game_specific)
settings.dynarec.idleskip = cfgLoadBool(config_section, "Dynarec.idleskip", settings.dynarec.idleskip);
settings.dynarec.unstable_opt = cfgLoadBool(config_section, "Dynarec.unstable-opt", settings.dynarec.unstable_opt);
settings.dynarec.safemode = cfgLoadBool(config_section, "Dynarec.safe-mode", settings.dynarec.safemode);
settings.dynarec.disable_vmem32 = cfgLoadBool(config_section, "Dynarec.DisableVmem32", settings.dynarec.disable_vmem32);
//disable_nvmem can't be loaded, because nvmem init is before cfg load
settings.dreamcast.cable = cfgLoadInt(config_section, "Dreamcast.Cable", settings.dreamcast.cable);
settings.dreamcast.region = cfgLoadInt(config_section, "Dreamcast.Region", settings.dreamcast.region);
@ -670,6 +682,7 @@ void SaveSettings()
cfgSaveBool("config", "Dynarec.unstable-opt", settings.dynarec.unstable_opt);
if (!safemode_game || !settings.dynarec.safemode)
cfgSaveBool("config", "Dynarec.safe-mode", settings.dynarec.safemode);
cfgSaveBool("config", "Dynarec.DisableVmem32", settings.dynarec.disable_vmem32);
cfgSaveInt("config", "Dreamcast.Language", settings.dreamcast.language);
cfgSaveBool("config", "aica.LimitFPS", settings.aica.LimitFPS);
cfgSaveBool("config", "aica.NoBatch", settings.aica.NoBatch);

View File

@ -39,6 +39,7 @@ using namespace vixl::aarch64;
#include "hw/sh4/dyna/ngen.h"
#include "hw/sh4/sh4_mem.h"
#include "hw/sh4/sh4_rom.h"
#include "hw/mem/vmem32.h"
#include "arm64_regalloc.h"
#undef do_sqw_nommu
@ -185,8 +186,8 @@ void ngen_mainloop(void* v_cntx)
"stp x29, x30, [sp, #144] \n\t"
"stp %[cntx], %[cycle_counter], [sp, #-16]! \n\t" // Push context, cycle_counter address
"mov w27, %[_SH4_TIMESLICE] \n\t"
"str w27, [%[cycle_counter]] \n\t"
"mov w1, %[_SH4_TIMESLICE] \n\t"
"str w1, [%[cycle_counter]] \n\t"
"mov x0, %[jmp_env] \n\t" // SETJMP
"bl setjmp \n\t"
@ -195,15 +196,17 @@ void ngen_mainloop(void* v_cntx)
"ldr x28, [sp] \n\t" // Set context
// w29 is next_pc
"ldr w29, [x28, %[pc]] \n\t"
// x27 is vmem32_base
"ldr x27, [x28, %[vmem32_base]] \n\t"
"b no_update \n"
".hidden intc_sched \n\t"
".globl intc_sched \n\t"
"intc_sched: \n\t"
"ldr x27, [sp, #8] \n\t" // &cycle_counter
"ldr w0, [x27] \n\t" // cycle_counter
"ldr x1, [sp, #8] \n\t" // &cycle_counter
"ldr w0, [x1] \n\t" // cycle_counter
"add w0, w0, %[_SH4_TIMESLICE] \n\t"
"str w0, [x27] \n\t"
"str w0, [x1] \n\t"
"mov x29, lr \n\t" // Trashing pc here but it will be reset at the end of the block or in DoInterrupts
"bl UpdateSystem \n\t"
"mov lr, x29 \n\t"
@ -260,7 +263,8 @@ void ngen_mainloop(void* v_cntx)
[RCB_SIZE] "i" (sizeof(Sh4RCB) >> 16),
[SH4CTX_SIZE] "i" (sizeof(Sh4Context)),
[jmp_env] "r"(reinterpret_cast<uintptr_t>(jmp_env)),
[cycle_counter] "r"(reinterpret_cast<uintptr_t>(&cycle_counter))
[cycle_counter] "r"(reinterpret_cast<uintptr_t>(&cycle_counter)),
[vmem32_base] "i"(offsetof(Sh4Context, vmem32_base))
: "memory"
);
}
@ -476,10 +480,10 @@ public:
regalloc.DoAlloc(block);
// scheduler
Mov(x27, reinterpret_cast<uintptr_t>(&cycle_counter));
Ldr(w0, MemOperand(x27));
Mov(x1, reinterpret_cast<uintptr_t>(&cycle_counter));
Ldr(w0, MemOperand(x1));
Subs(w0, w0, block->guest_cycles);
Str(w0, MemOperand(x27));
Str(w0, MemOperand(x1));
Label cycles_remaining;
B(&cycles_remaining, pl);
GenCallRuntime(intc_sched);
@ -568,11 +572,11 @@ public:
break;
case shop_readm:
GenReadMemory(op, i);
GenReadMemory(op, i, optimise);
break;
case shop_writem:
GenWriteMemory(op, i);
GenWriteMemory(op, i, optimise);
break;
case shop_sync_sr:
@ -1073,10 +1077,10 @@ public:
void GenWriteMemorySlow(const shil_opcode& op)
{
Instruction *start_instruction = GetCursorAddress<Instruction *>();
if (mmu_enabled())
Mov(*call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc
Instruction *start_instruction = GetCursorAddress<Instruction *>();
u32 size = op.flags & 0x7f;
switch (size)
{
@ -1117,7 +1121,10 @@ public:
void InitializeRewrite(RuntimeBlockInfo *block, size_t opid)
{
regalloc.DoAlloc(block);
this->block = block;
// with full mmu, all regs are flushed before mem ops
if (!mmu_enabled())
regalloc.DoAlloc(block);
regalloc.current_opid = opid;
}
@ -1308,14 +1315,14 @@ private:
B(&code_label, cond);
}
void GenReadMemory(const shil_opcode& op, size_t opid)
void GenReadMemory(const shil_opcode& op, size_t opid, bool optimise)
{
if (GenReadMemoryImmediate(op))
return;
GenMemAddr(op, call_regs[0]);
if (GenReadMemoryFast(op, opid))
if (optimise && GenReadMemoryFast(op, opid))
return;
GenReadMemorySlow(op);
@ -1431,59 +1438,104 @@ private:
bool GenReadMemoryFast(const shil_opcode& op, size_t opid)
{
// Direct memory access. Need to handle SIGSEGV and rewrite block as needed. See ngen_Rewrite()
if (!_nvmem_enabled() || mmu_enabled())
if (!_nvmem_enabled() || (mmu_enabled() && !vmem32_enabled()))
return false;
Instruction *start_instruction = GetCursorAddress<Instruction *>();
// WARNING: the rewrite code relies on having two ops before the memory access
const XRegister* base_reg;
const XRegister* offset_reg;
// WARNING: the rewrite code relies on having two ops before the memory access (3 when mmu is enabled)
// Update ngen_Rewrite (and perhaps read_memory_rewrite_size) if adding or removing code
Add(w1, *call_regs[0], sizeof(Sh4Context), LeaveFlags);
Bfc(w1, 29, 3); // addr &= ~0xE0000000
if (!mmu_enabled())
{
Add(w1, *call_regs[0], sizeof(Sh4Context), LeaveFlags);
Bfc(w1, 29, 3); // addr &= ~0xE0000000
base_reg = &x28;
offset_reg = &x1;
}
else
{
u32 exception_pc = block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0);
// 3 ops before memory access
Mov(w8, exception_pc & 0xFFFF);
Movk(w8, exception_pc >> 16, 16);
Str(w8, sh4_context_mem_operand(&p_sh4rcb->cntx.exception_pc));
base_reg = &x27;
offset_reg = call_regs64[0];
}
//printf("direct read memory access opid %d pc %p code addr %08x\n", opid, GetCursorAddress<void *>(), this->block->addr);
this->block->memory_accesses[GetCursorAddress<void *>()] = (u32)opid;
u32 size = op.flags & 0x7f;
switch(size)
if (regalloc.IsAllocAny(op.rd))
{
case 1:
Ldrsb(regalloc.MapRegister(op.rd), MemOperand(x28, x1, SXTW));
break;
switch(size)
{
case 1:
Ldrsb(regalloc.MapRegister(op.rd), MemOperand(*base_reg, *offset_reg));
break;
case 2:
Ldrsh(regalloc.MapRegister(op.rd), MemOperand(x28, x1, SXTW));
break;
case 2:
Ldrsh(regalloc.MapRegister(op.rd), MemOperand(*base_reg, *offset_reg));
break;
case 4:
if (!op.rd.is_r32f())
Ldr(regalloc.MapRegister(op.rd), MemOperand(x28, x1));
else
Ldr(regalloc.MapVRegister(op.rd), MemOperand(x28, x1));
break;
case 4:
if (!op.rd.is_r32f())
Ldr(regalloc.MapRegister(op.rd), MemOperand(*base_reg, *offset_reg));
else
Ldr(regalloc.MapVRegister(op.rd), MemOperand(*base_reg, *offset_reg));
break;
case 8:
Ldr(x1, MemOperand(x28, x1));
break;
}
case 8:
Ldr(x1, MemOperand(*base_reg, *offset_reg));
break;
}
if (size == 8)
{
if (size == 8)
{
#ifdef EXPLODE_SPANS
verify(op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1));
Fmov(regalloc.MapVRegister(op.rd, 0), w1);
Lsr(x1, x1, 32);
Fmov(regalloc.MapVRegister(op.rd, 1), w1);
verify(op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1));
Fmov(regalloc.MapVRegister(op.rd, 0), w1);
Lsr(x1, x1, 32);
Fmov(regalloc.MapVRegister(op.rd, 1), w1);
#else
Str(x1, sh4_context_mem_operand(op.rd.reg_ptr()));
Str(x1, sh4_context_mem_operand(op.rd.reg_ptr()));
#endif
}
}
else
{
switch(size)
{
case 1:
Ldrsb(w1, MemOperand(*base_reg, *offset_reg));
break;
case 2:
Ldrsh(w1, MemOperand(*base_reg, *offset_reg));
break;
case 4:
Ldr(w1, MemOperand(*base_reg, *offset_reg));
break;
case 8:
Ldr(x1, MemOperand(*base_reg, *offset_reg));
break;
}
if (size == 8)
Str(x1, sh4_context_mem_operand(op.rd.reg_ptr()));
else
Str(w1, sh4_context_mem_operand(op.rd.reg_ptr()));
}
EnsureCodeSize(start_instruction, read_memory_rewrite_size);
return true;
}
void GenWriteMemory(const shil_opcode& op, size_t opid)
void GenWriteMemory(const shil_opcode& op, size_t opid, bool optimise)
{
GenMemAddr(op, call_regs[0]);
@ -1502,7 +1554,7 @@ private:
shil_param_to_host_reg(op.rs2, *call_regs64[1]);
#endif
}
if (GenWriteMemoryFast(op, opid))
if (optimise && GenWriteMemoryFast(op, opid))
return;
GenWriteMemorySlow(op);
@ -1511,15 +1563,31 @@ private:
bool GenWriteMemoryFast(const shil_opcode& op, size_t opid)
{
// Direct memory access. Need to handle SIGSEGV and rewrite block as needed. See ngen_Rewrite()
if (!_nvmem_enabled() || mmu_enabled())
if (!_nvmem_enabled() || (mmu_enabled() && !vmem32_enabled()))
return false;
Instruction *start_instruction = GetCursorAddress<Instruction *>();
// WARNING: the rewrite code relies on having two ops before the memory access
const XRegister* base_reg;
const XRegister* offset_reg;
// WARNING: the rewrite code relies on having two ops before the memory access (3 when mmu is enabled)
// Update ngen_Rewrite (and perhaps write_memory_rewrite_size) if adding or removing code
Add(w7, *call_regs[0], sizeof(Sh4Context), LeaveFlags);
Bfc(w7, 29, 3); // addr &= ~0xE0000000
if (!mmu_enabled())
{
Add(w7, *call_regs[0], sizeof(Sh4Context), LeaveFlags);
Bfc(w7, 29, 3); // addr &= ~0xE0000000
base_reg = &x28;
offset_reg = &x7;
}
else
{
u32 exception_pc = block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0);
Mov(w8, exception_pc & 0xFFFF);
Movk(w8, exception_pc >> 16, 16);
Str(w8, sh4_context_mem_operand(&p_sh4rcb->cntx.exception_pc));
base_reg = &x27;
offset_reg = call_regs64[0];
}
//printf("direct write memory access opid %d pc %p code addr %08x\n", opid, GetCursorAddress<void *>(), this->block->addr);
this->block->memory_accesses[GetCursorAddress<void *>()] = (u32)opid;
@ -1528,19 +1596,19 @@ private:
switch(size)
{
case 1:
Strb(w1, MemOperand(x28, x7, SXTW));
Strb(w1, MemOperand(*base_reg, *offset_reg));
break;
case 2:
Strh(w1, MemOperand(x28, x7, SXTW));
Strh(w1, MemOperand(*base_reg, *offset_reg));
break;
case 4:
Str(w1, MemOperand(x28, x7));
Str(w1, MemOperand(*base_reg, *offset_reg));
break;
case 8:
Str(x1, MemOperand(x28, x7));
Str(x1, MemOperand(*base_reg, *offset_reg));
break;
}
EnsureCodeSize(start_instruction, write_memory_rewrite_size);
@ -1699,7 +1767,7 @@ private:
RuntimeBlockInfo* block = NULL;
const int read_memory_rewrite_size = 6; // worst case for u64: add, bfc, ldr, fmov, lsr, fmov
// FIXME rewrite size per read/write size?
const int write_memory_rewrite_size = 3;
const int write_memory_rewrite_size = 4;
};
static Arm64Assembler* compiler;
@ -1755,7 +1823,7 @@ bool ngen_Rewrite(unat& host_pc, unat, unat)
u32 opid = it->second;
verify(opid < block->oplist.size());
const shil_opcode& op = block->oplist[opid];
Arm64Assembler *assembler = new Arm64Assembler(code_ptr - 2); // Skip the 2 preceding ops (bic, add)
Arm64Assembler *assembler = new Arm64Assembler(code_ptr - 2 - (mmu_enabled() ? 1 : 0)); // Skip the 2 preceding ops (add, bfc), or the 3 exception_pc ops (mov, movk, str) when the MMU is on
assembler->InitializeRewrite(block, opid);
if (op.op == shop_readm)
assembler->GenReadMemorySlow(op);
@ -1763,11 +1831,16 @@ bool ngen_Rewrite(unat& host_pc, unat, unat)
assembler->GenWriteMemorySlow(op);
assembler->Finalize(true);
delete assembler;
host_pc = (unat)(code_ptr - 2);
host_pc = (unat)(code_ptr - 2 - (mmu_enabled() ? 1 : 0));
return true;
}
void ngen_HandleException()
{
longjmp(jmp_env, 1);
}
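The rewrite above backs up over the fixed-size setup emitted before each recorded fast access: two A64 instructions (add + bfc) without the MMU, three (mov/movk/str of exception_pc) with it, 4 bytes each; EnsureCodeSize then pads the fast sequence to read_memory_rewrite_size/write_memory_rewrite_size so the slow path re-emitted in place cannot spill into the following op. A small sketch of the patch-start arithmetic (illustration only):

#include <cstdint>

// Where the rewrite must start re-emitting, given the instruction address that
// was recorded in block->memory_accesses (the access itself, after the setup).
static uint32_t *rewrite_start(uint32_t *recorded_access, bool mmu_on)
{
    const int setup_ops = mmu_on ? 3 : 2;   // a64 instructions are 4 bytes each
    return recorded_access - setup_ops;     // pointer arithmetic steps in whole instructions
}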
u32 DynaRBI::Relink()
{
if (mmu_enabled())

View File

@ -19,6 +19,7 @@
#include "hw/sh4/sh4_core.h"
#include "hw/sh4/sh4_mem.h"
#include "hw/sh4/sh4_rom.h"
#include "hw/mem/vmem32.h"
#include "emitter/x86_emitter.h"
#include "profiler/profiler.h"
#include "oslib/oslib.h"
@ -130,7 +131,6 @@ WIN32_ONLY( ".seh_pushreg %r14 \n\t")
"lea " _U "jmp_env(%rip), %rdi \n\t"
#endif
"call " _U "setjmp \n\t"
// "testl %rax, %rax \n\t"
"1: \n\t" // run_loop
"movq " _U "p_sh4rcb(%rip), %rax \n\t"
@ -219,13 +219,8 @@ static void ngen_blockcheckfail(u32 pc) {
rdv_BlockCheckFail(pc);
}
static u32 exception_raised;
template<typename T>
static T ReadMemNoEx(u32 addr, u32 pc)
static void handle_mem_exception(u32 exception_raised, u32 pc)
{
#ifndef NO_MMU
T rv = mmu_ReadMemNoEx<T>(addr, &exception_raised);
if (exception_raised)
{
if (pc & 1)
@ -233,8 +228,19 @@ static T ReadMemNoEx(u32 addr, u32 pc)
spc = pc - 1;
else
spc = pc;
cycle_counter += CPU_RATIO * 2; // probably more is needed but no easy way to find out
longjmp(jmp_env, 1);
}
}
template<typename T>
static T ReadMemNoEx(u32 addr, u32 pc)
{
#ifndef NO_MMU
u32 exception_raised;
T rv = mmu_ReadMemNoEx<T>(addr, &exception_raised);
handle_mem_exception(exception_raised, pc);
return rv;
#else
// not used
@ -246,32 +252,30 @@ template<typename T>
static void WriteMemNoEx(u32 addr, T data, u32 pc)
{
#ifndef NO_MMU
exception_raised = mmu_WriteMemNoEx<T>(addr, data);
if (exception_raised)
{
if (pc & 1)
// Delay slot
spc = pc - 1;
else
spc = pc;
longjmp(jmp_env, 1);
}
u32 exception_raised = mmu_WriteMemNoEx<T>(addr, data);
handle_mem_exception(exception_raised, pc);
#endif
}
static void handle_sh4_exception(SH4ThrownException& ex, u32 pc)
{
if (pc & 1)
{
// Delay slot
AdjustDelaySlotException(ex);
pc--;
}
Do_Exception(pc, ex.expEvn, ex.callVect);
cycle_counter += CPU_RATIO * 4; // probably more is needed
longjmp(jmp_env, 1);
}
static void interpreter_fallback(u16 op, OpCallFP *oph, u32 pc)
{
try {
oph(op);
} catch (SH4ThrownException& ex) {
if (pc & 1)
{
// Delay slot
AdjustDelaySlotException(ex);
pc--;
}
Do_Exception(pc, ex.expEvn, ex.callVect);
longjmp(jmp_env, 1);
handle_sh4_exception(ex, pc);
}
}
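All of these helpers unwind the same way: ngen_mainloop setjmps on jmp_env once before entering generated code, and handle_mem_exception / handle_sh4_exception deliver the guest exception (rewinding delay-slot PCs and charging a few extra cycles) and then longjmp back so the dispatcher resumes at the exception vector. A stripped-down model of that control flow (hypothetical stand-in names, not the emitter's code):

#include <csetjmp>
#include <cstdint>

static jmp_buf jmp_env_model;
static uint32_t next_pc_model;

void run_block_at(uint32_t pc);            // stand-in for dispatching a compiled block
void deliver_guest_exception(uint32_t pc); // stand-in for Do_Exception: updates next_pc_model

static void mainloop_model()
{
    setjmp(jmp_env_model);                 // (re)entry point after any guest exception
    for (;;)
        run_block_at(next_pc_model);
}

static void memory_exception_model(uint32_t pc_of_faulting_op)
{
    deliver_guest_exception(pc_of_faulting_op);
    longjmp(jmp_env_model, 1);             // abandon the interrupted block
}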
@ -279,16 +283,8 @@ static void do_sqw_mmu_no_ex(u32 addr, u32 pc)
{
try {
do_sqw_mmu(addr);
exception_raised = 0;
} catch (SH4ThrownException& ex) {
if (pc & 1)
{
// Delay slot
AdjustDelaySlotException(ex);
pc--;
}
Do_Exception(pc, ex.expEvn, ex.callVect);
exception_raised = 1;
handle_sh4_exception(ex, pc);
}
}
@ -300,7 +296,9 @@ static void do_sqw_nommu_local(u32 addr, u8* sqb)
class BlockCompiler : public Xbyak::CodeGenerator
{
public:
BlockCompiler() : Xbyak::CodeGenerator(emit_FreeSpace(), emit_GetCCPtr()), regalloc(this)
BlockCompiler() : BlockCompiler((u8 *)emit_GetCCPtr()) {}
BlockCompiler(u8 *code_ptr) : Xbyak::CodeGenerator(emit_FreeSpace(), code_ptr), regalloc(this)
{
#if HOST_OS == OS_WINDOWS
call_regs.push_back(ecx);
@ -333,17 +331,11 @@ public:
void compile(RuntimeBlockInfo* block, bool force_checks, bool reset, bool staging, bool optimise)
{
//printf("X86_64 compiling %08x to %p\n", block->addr, emit_GetCCPtr());
current_opid = -1;
if (force_checks) {
CheckBlock(block);
}
regalloc.DoAlloc(block);
sub(dword[rip + &cycle_counter], block->guest_cycles);
#ifdef PROFILING
mov(rax, (uintptr_t)&guest_cpu_cycles);
mov(ecx, block->guest_cycles);
add(qword[rax], rcx);
#endif
#ifdef _WIN32
sub(rsp, 0x28); // 32-byte shadow space + 8 byte alignment
#else
@ -364,6 +356,13 @@ public:
jmp(exit_block, T_NEAR);
L(fpu_enabled);
}
sub(dword[rip + &cycle_counter], block->guest_cycles);
#ifdef PROFILING
mov(rax, (uintptr_t)&guest_cpu_cycles);
mov(ecx, block->guest_cycles);
add(qword[rax], rcx);
#endif
regalloc.DoAlloc(block);
for (current_opid = 0; current_opid < block->oplist.size(); current_opid++)
{
@ -440,98 +439,7 @@ public:
break;
case shop_readm:
{
u32 size = op.flags & 0x7f;
bool immediate_address = op.rs1.is_imm();
u32 addr = op.rs1._imm;
if (immediate_address && mmu_enabled())
{
if ((op.rs1._imm >> 12) != (block->vaddr >> 12))
{
// When full mmu is on, only consider addresses in the same 4k page
immediate_address = false;
}
else
{
u32 paddr;
u32 rv;
if (size == 2)
rv = mmu_data_translation<MMU_TT_DREAD, u16>(addr, paddr);
else if (size == 4)
rv = mmu_data_translation<MMU_TT_DREAD, u32>(addr, paddr);
else
die("Invalid immediate size");
if (rv != MMU_ERROR_NONE)
immediate_address = false;
else
addr = paddr;
}
}
if (immediate_address)
{
bool isram = false;
void* ptr = _vmem_read_const(addr, isram, size);
if (isram)
{
// Immediate pointer to RAM: super-duper fast access
mov(rax, reinterpret_cast<uintptr_t>(ptr));
switch (size)
{
case 2:
if (regalloc.IsAllocg(op.rd))
movsx(regalloc.MapRegister(op.rd), word[rax]);
else
{
movsx(eax, word[rax]);
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(dword[rcx], eax);
}
break;
case 4:
if (regalloc.IsAllocg(op.rd))
mov(regalloc.MapRegister(op.rd), dword[rax]);
else if (regalloc.IsAllocf(op.rd))
movd(regalloc.MapXRegister(op.rd), dword[rax]);
else
{
mov(eax, dword[rax]);
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(dword[rcx], eax);
}
break;
default:
die("Invalid immediate size");
break;
}
}
else
{
// Not RAM: the returned pointer is a memory handler
mov(call_regs[0], addr);
switch(size)
{
case 2:
GenCall((void (*)())ptr);
movsx(ecx, ax);
break;
case 4:
GenCall((void (*)())ptr);
mov(ecx, eax);
break;
default:
die("Invalid immediate size");
break;
}
host_reg_to_shil_param(op.rd, ecx);
}
}
else
if (!GenReadMemImmediate(op, block))
{
// Not an immediate address
shil_param_to_host_reg(op.rs1, call_regs[0]);
@ -547,47 +455,10 @@ public:
add(call_regs[0], dword[rax]);
}
}
if (mmu_enabled())
mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc
if (size == 1) {
if (!mmu_enabled())
GenCall(ReadMem8);
else
GenCall(ReadMemNoEx<u8>);
movsx(ecx, al);
}
else if (size == 2) {
if (!mmu_enabled())
GenCall(ReadMem16);
else
GenCall(ReadMemNoEx<u16>);
movsx(ecx, ax);
}
else if (size == 4) {
if (!mmu_enabled())
GenCall(ReadMem32);
else
GenCall(ReadMemNoEx<u32>);
mov(ecx, eax);
}
else if (size == 8) {
if (!mmu_enabled())
GenCall(ReadMem64);
else
GenCall(ReadMemNoEx<u64>);
mov(rcx, rax);
}
else {
die("1..8 bytes");
}
// if (mmu_enabled())
// {
// test(dword[(void *)&exception_raised], 1);
// jnz(exit_block, T_NEAR);
// }
if (!optimise || !GenReadMemoryFast(op, block))
GenReadMemorySlow(op, block);
u32 size = op.flags & 0x7f;
if (size != 8)
host_reg_to_shil_param(op.rd, ecx);
else {
@ -606,12 +477,10 @@ public:
}
}
}
}
break;
break;
case shop_writem:
{
u32 size = op.flags & 0x7f;
shil_param_to_host_reg(op.rs1, call_regs[0]);
if (!op.rs3.is_null())
{
@ -626,6 +495,7 @@ public:
}
}
u32 size = op.flags & 0x7f;
if (size != 8)
shil_param_to_host_reg(op.rs2, call_regs[1]);
else {
@ -644,42 +514,8 @@ public:
mov(call_regs64[1], qword[rax]);
}
}
if (mmu_enabled())
mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc
if (size == 1) {
if (!mmu_enabled())
GenCall(WriteMem8);
else
GenCall(WriteMemNoEx<u8>);
}
else if (size == 2) {
if (!mmu_enabled())
GenCall(WriteMem16);
else
GenCall(WriteMemNoEx<u16>);
}
else if (size == 4) {
if (!mmu_enabled())
GenCall(WriteMem32);
else
GenCall(WriteMemNoEx<u32>);
}
else if (size == 8) {
if (!mmu_enabled())
GenCall(WriteMem64);
else
GenCall(WriteMemNoEx<u64>);
}
else {
die("1..8 bytes");
}
// if (mmu_enabled())
// {
// test(dword[(void *)&exception_raised], 1);
// jnz(exit_block, T_NEAR);
// }
if (!optimise || !GenWriteMemoryFast(op, block))
GenWriteMemorySlow(op, block);
}
break;
@ -729,8 +565,8 @@ public:
mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); \
if (op.rs2.is_imm()) \
natop(regalloc.MapRegister(op.rd), op.rs2._imm); \
else if (op.rs2.is_reg()) \
natop(regalloc.MapRegister(op.rd), Xbyak::Reg8(regalloc.MapRegister(op.rs2).getIdx()));
else \
die("Unsupported operand");
case shop_shl:
SHIFT_OP(shl)
break;
@ -925,9 +761,6 @@ public:
mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc
GenCall(do_sqw_mmu_no_ex);
test(dword[(void *)&exception_raised], 1);
jnz(exit_block, T_NEAR);
}
else
{
@ -1163,6 +996,7 @@ public:
regalloc.OpEnd(&op);
}
regalloc.Cleanup();
current_opid = -1;
mov(rax, (size_t)&next_pc);
@ -1242,6 +1076,118 @@ public:
emit_Skip(getSize());
}
void GenReadMemorySlow(const shil_opcode& op, RuntimeBlockInfo* block)
{
const u8 *start_addr = getCurr();
if (mmu_enabled())
mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc
u32 size = op.flags & 0x7f;
switch (size) {
case 1:
if (!mmu_enabled())
GenCall(ReadMem8);
else
GenCall(ReadMemNoEx<u8>);
movsx(ecx, al);
break;
case 2:
if (!mmu_enabled())
GenCall(ReadMem16);
else
GenCall(ReadMemNoEx<u16>);
movsx(ecx, ax);
break;
case 4:
if (!mmu_enabled())
GenCall(ReadMem32);
else
GenCall(ReadMemNoEx<u32>);
mov(ecx, eax);
break;
case 8:
if (!mmu_enabled())
GenCall(ReadMem64);
else
GenCall(ReadMemNoEx<u64>);
mov(rcx, rax);
break;
default:
die("1..8 bytes");
}
if (mmu_enabled())
{
Xbyak::Label quick_exit;
if (getCurr() - start_addr <= read_mem_op_size - 6)
jmp(quick_exit, T_NEAR);
while (getCurr() - start_addr < read_mem_op_size)
nop();
L(quick_exit);
verify(getCurr() - start_addr == read_mem_op_size);
}
}
void GenWriteMemorySlow(const shil_opcode& op, RuntimeBlockInfo* block)
{
const u8 *start_addr = getCurr();
if (mmu_enabled())
mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc
u32 size = op.flags & 0x7f;
switch (size) {
case 1:
if (!mmu_enabled())
GenCall(WriteMem8);
else
GenCall(WriteMemNoEx<u8>);
break;
case 2:
if (!mmu_enabled())
GenCall(WriteMem16);
else
GenCall(WriteMemNoEx<u16>);
break;
case 4:
if (!mmu_enabled())
GenCall(WriteMem32);
else
GenCall(WriteMemNoEx<u32>);
break;
case 8:
if (!mmu_enabled())
GenCall(WriteMem64);
else
GenCall(WriteMemNoEx<u64>);
break;
default:
die("1..8 bytes");
}
if (mmu_enabled())
{
Xbyak::Label quick_exit;
if (getCurr() - start_addr <= write_mem_op_size - 6)
jmp(quick_exit, T_NEAR);
while (getCurr() - start_addr < write_mem_op_size)
nop();
L(quick_exit);
verify(getCurr() - start_addr == write_mem_op_size);
}
}
void InitializeRewrite(RuntimeBlockInfo *block, size_t opid)
{
// shouldn't be necessary since all regs are flushed before mem access when mmu is enabled
//regalloc.DoAlloc(block);
regalloc.current_opid = opid;
}
void FinalizeRewrite()
{
ready();
}
void ngen_CC_Start(const shil_opcode& op)
{
CC_pars.clear();
@ -1346,16 +1292,188 @@ private:
typedef void (BlockCompiler::*X64BinaryOp)(const Xbyak::Operand&, const Xbyak::Operand&);
typedef void (BlockCompiler::*X64BinaryFOp)(const Xbyak::Xmm&, const Xbyak::Operand&);
bool GenReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* block)
{
if (!op.rs1.is_imm())
return false;
u32 size = op.flags & 0x7f;
u32 addr = op.rs1._imm;
if (mmu_enabled())
{
if ((addr >> 12) != (block->vaddr >> 12))
// When full mmu is on, only consider addresses in the same 4k page
return false;
u32 paddr;
u32 rv;
if (size == 2)
rv = mmu_data_translation<MMU_TT_DREAD, u16>(addr, paddr);
else if (size == 4)
rv = mmu_data_translation<MMU_TT_DREAD, u32>(addr, paddr);
else
die("Invalid immediate size");
if (rv != MMU_ERROR_NONE)
return false;
addr = paddr;
}
bool isram = false;
void* ptr = _vmem_read_const(addr, isram, size);
if (isram)
{
// Immediate pointer to RAM: super-duper fast access
mov(rax, reinterpret_cast<uintptr_t>(ptr));
switch (size)
{
case 2:
if (regalloc.IsAllocg(op.rd))
movsx(regalloc.MapRegister(op.rd), word[rax]);
else
{
movsx(eax, word[rax]);
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(dword[rcx], eax);
}
break;
case 4:
if (regalloc.IsAllocg(op.rd))
mov(regalloc.MapRegister(op.rd), dword[rax]);
else if (regalloc.IsAllocf(op.rd))
movd(regalloc.MapXRegister(op.rd), dword[rax]);
else
{
mov(eax, dword[rax]);
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(dword[rcx], eax);
}
break;
default:
die("Invalid immediate size");
break;
}
}
else
{
// Not RAM: the returned pointer is a memory handler
mov(call_regs[0], addr);
switch(size)
{
case 2:
GenCall((void (*)())ptr);
movsx(ecx, ax);
break;
case 4:
GenCall((void (*)())ptr);
mov(ecx, eax);
break;
default:
die("Invalid immediate size");
break;
}
host_reg_to_shil_param(op.rd, ecx);
}
return true;
}
bool GenReadMemoryFast(const shil_opcode& op, RuntimeBlockInfo* block)
{
if (!mmu_enabled() || !vmem32_enabled())
return false;
const u8 *start_addr = getCurr();
mov(rax, (uintptr_t)&p_sh4rcb->cntx.exception_pc);
mov(dword[rax], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0));
mov(rax, (uintptr_t)p_sh4rcb->cntx.vmem32_base);
u32 size = op.flags & 0x7f;
verify(getCurr() - start_addr == 26);
block->memory_accesses[(void*)getCurr()] = (u32)current_opid;
switch (size)
{
case 1:
movsx(ecx, byte[rax + call_regs64[0]]);
break;
case 2:
movsx(ecx, word[rax + call_regs64[0]]);
break;
case 4:
mov(ecx, dword[rax + call_regs64[0]]);
break;
case 8:
mov(rcx, qword[rax + call_regs64[0]]);
break;
default:
die("1..8 bytes");
}
while (getCurr() - start_addr < read_mem_op_size)
nop();
verify(getCurr() - start_addr == read_mem_op_size);
return true;
}
bool GenWriteMemoryFast(const shil_opcode& op, RuntimeBlockInfo* block)
{
if (!mmu_enabled() || !vmem32_enabled())
return false;
const u8 *start_addr = getCurr();
mov(rax, (uintptr_t)&p_sh4rcb->cntx.exception_pc);
mov(dword[rax], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0));
mov(rax, (uintptr_t)p_sh4rcb->cntx.vmem32_base);
u32 size = op.flags & 0x7f;
verify(getCurr() - start_addr == 26);
block->memory_accesses[(void*)getCurr()] = (u32)current_opid;
switch (size)
{
case 1:
mov(byte[rax + call_regs64[0] + 0], Xbyak::Reg8(call_regs[1].getIdx(), call_regs[1] == edi || call_regs[1] == esi));
break;
case 2:
mov(word[rax + call_regs64[0]], Xbyak::Reg16(call_regs[1].getIdx()));
break;
case 4:
mov(dword[rax + call_regs64[0]], call_regs[1]);
break;
case 8:
mov(qword[rax + call_regs64[0]], call_regs64[1]);
break;
default:
die("1..8 bytes");
}
while (getCurr() - start_addr < write_mem_op_size)
nop();
verify(getCurr() - start_addr == write_mem_op_size);
return true;
}
void CheckBlock(RuntimeBlockInfo* block) {
mov(call_regs[0], block->addr);
// if (mmu_enabled() && block->asid != 0xFFFFFFFF)
// {
// mov(rax, (uintptr_t)&CCN_PTEH.reg_data);
// cmp(byte[rax], block->asid);
// jne(reinterpret_cast<const void*>(&ngen_blockcheckfail));
// }
// FIXME Neither of these tests should be necessary
// FIXME This test shouldn't be necessary
// However the decoder makes various assumptions about the current PC value, which are simply not
// true in a virtualized memory model. So this can only work if virtual and phy addresses are the
// same at compile and run times.
@ -1424,10 +1542,10 @@ private:
void GenCall(Ret(*function)(Params...))
{
#ifndef _WIN32
bool xmm8_mapped = regalloc.IsMapped(xmm8, current_opid);
bool xmm9_mapped = regalloc.IsMapped(xmm9, current_opid);
bool xmm10_mapped = regalloc.IsMapped(xmm10, current_opid);
bool xmm11_mapped = regalloc.IsMapped(xmm11, current_opid);
bool xmm8_mapped = current_opid != -1 && regalloc.IsMapped(xmm8, current_opid);
bool xmm9_mapped = current_opid != -1 && regalloc.IsMapped(xmm9, current_opid);
bool xmm10_mapped = current_opid != -1 && regalloc.IsMapped(xmm10, current_opid);
bool xmm11_mapped = current_opid != -1 && regalloc.IsMapped(xmm11, current_opid);
// Need to save xmm registers as they are not preserved in linux/mach
int offset = 0;
@ -1587,11 +1705,15 @@ private:
static const u32 float_sign_mask;
static const u32 float_abs_mask;
static const f32 cvtf2i_pos_saturation;
static const u32 read_mem_op_size;
static const u32 write_mem_op_size;
};
const u32 BlockCompiler::float_sign_mask = 0x80000000;
const u32 BlockCompiler::float_abs_mask = 0x7fffffff;
const f32 BlockCompiler::cvtf2i_pos_saturation = 2147483520.0f; // IEEE 754: 0x4effffff;
const u32 BlockCompiler::read_mem_op_size = 30;
const u32 BlockCompiler::write_mem_op_size = 30;
void X64RegAlloc::Preload(u32 reg, Xbyak::Operand::Code nreg)
{
@ -1641,4 +1763,47 @@ void ngen_CC_Call(shil_opcode* op, void* function)
void ngen_CC_Finish(shil_opcode* op)
{
}
bool ngen_Rewrite(unat& host_pc, unat, unat)
{
if (!mmu_enabled() || !vmem32_enabled())
return false;
//printf("ngen_Rewrite pc %p\n", host_pc);
RuntimeBlockInfo *block = bm_GetBlock((void *)host_pc);
if (block == NULL)
{
printf("ngen_Rewrite: Block at %p not found\n", (void *)host_pc);
return false;
}
u8 *code_ptr = (u8*)host_pc;
auto it = block->memory_accesses.find(code_ptr);
if (it == block->memory_accesses.end())
{
printf("ngen_Rewrite: memory access at %p not found (%lu entries)\n", code_ptr, block->memory_accesses.size());
return false;
}
u32 opid = it->second;
verify(opid < block->oplist.size());
const shil_opcode& op = block->oplist[opid];
BlockCompiler *assembler = new BlockCompiler(code_ptr - 26);
assembler->InitializeRewrite(block, opid);
if (op.op == shop_readm)
assembler->GenReadMemorySlow(op, block);
else
assembler->GenWriteMemorySlow(op, block);
assembler->FinalizeRewrite();
verify(block->host_code_size >= assembler->getSize());
delete assembler;
block->memory_accesses.erase(it);
host_pc = (unat)(code_ptr - 26);
return true;
}
void ngen_HandleException()
{
longjmp(jmp_env, 1);
}
#endif
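On x64 the fast and slow memory accessors are kept the same size on purpose: GenReadMemoryFast/GenWriteMemoryFast emit a 26-byte prologue (mov rax, imm64 for &exception_pc: 10 bytes, the dword store of the guest PC: 6 bytes, mov rax, imm64 for vmem32_base: 10 bytes) followed by the access, nop-padded to read_mem_op_size/write_mem_op_size (30 bytes); the MMU flavour of GenReadMemorySlow/GenWriteMemorySlow pads itself to the same footprint. That invariant is what lets ngen_Rewrite above drop the slow path straight over a faulting fast path at code_ptr - 26 without moving any code. A minimal model of the padding invariant (illustration only, not xbyak code):

#include <cassert>
#include <cstdint>
#include <cstring>

// Every memory op occupies exactly op_size bytes, so a slow-path sequence can
// later be written over a fast-path one in place.
static void emit_padded(uint8_t *dst, const uint8_t *body, size_t body_len, size_t op_size)
{
    assert(body_len <= op_size);
    std::memcpy(dst, body, body_len);
    std::memset(dst + body_len, 0x90, op_size - body_len);   // 0x90 = x86 nop
}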

View File

@ -7,6 +7,8 @@
#include "TexCache.h"
#include "hw/pvr/pvr_regs.h"
#include "hw/mem/_vmem.h"
#include "hw/mem/vmem32.h"
#include "hw/sh4/modules/mmu.h"
#include "deps/xbrz/xbrz.h"
#include "deps/xxhash/xxhash.h"
@ -213,6 +215,8 @@ vram_block* libCore_vramlock_Lock(u32 start_offset64,u32 end_offset64,void* user
if (_nvmem_enabled() && VRAM_SIZE == 0x800000) {
vram.LockRegion(block->start + VRAM_SIZE, block->len);
}
if (mmu_enabled())
vmem32_protect_vram(block);
vramlock_list_add(block);
@ -222,11 +226,8 @@ vram_block* libCore_vramlock_Lock(u32 start_offset64,u32 end_offset64,void* user
return block;
}
bool VramLockedWrite(u8* address)
bool VramLockedWriteOffset(size_t offset)
{
size_t offset=address-vram.data;
if (offset<VRAM_SIZE)
{
@ -268,6 +269,16 @@ bool VramLockedWrite(u8* address)
return false;
}
bool VramLockedWrite(u8* address)
{
size_t offset=address-vram.data;
if (offset < 0x01000000)
return VramLockedWriteOffset(offset & (VRAM_SIZE - 1));
else
return false;
}
//unlocks mem
//also frees the handle
void libCore_vramlock_Unlock_block(vram_block* block)
@ -284,6 +295,8 @@ void libCore_vramlock_Unlock_block_wb(vram_block* block)
msgboxf("Error : block end is after vram , skipping unlock",MBX_OK);
else
{
if (mmu_enabled())
vmem32_unprotect_vram(block);
vramlock_list_remove(block);
//more work needed
free(block);

View File

@ -613,19 +613,5 @@ template void texture_VQ<convBMP_TW<pp_565>, u16>(PixelBuffer<u16>* pb,u8* p_in,
#define tex1555_VQ32 texture_VQ<conv1555_TW32<pp_8888>, u32>
#define tex4444_VQ32 texture_VQ<conv4444_TW32<pp_8888>, u32>
#define Is_64_Bit(addr) ((addr &0x1000000)==0)
//vram_block, vramLockCBFP on plugin headers
u32 vramlock_ConvAddrtoOffset64(u32 Address);
u32 vramlock_ConvOffset32toOffset64(u32 offset32);
void vramlock_Unlock_block(vram_block* block);
vram_block* vramlock_Lock_32(u32 start_offset32,u32 end_offset32,void* userdata);
vram_block* vramlock_Lock_64(u32 start_offset64,u32 end_offset64,void* userdata);
void vram_LockedWrite(u32 offset64);
void DePosterize(u32* source, u32* dest, int width, int height);
void UpscalexBRZ(int factor, u32* source, u32* dest, int width, int height, bool has_alpha);

View File

@ -736,6 +736,7 @@ struct settings_t
bool unstable_opt;
bool safemode;
bool disable_nvmem;
bool disable_vmem32;
} dynarec;
struct

View File

@ -1,6 +1,7 @@
#include "oslib\oslib.h"
#include "oslib\audiostream.h"
#include "imgread\common.h"
#include "hw\mem\vmem32.h"
#include "xinput_gamepad.h"
#include "win_keyboard.h"
@ -141,6 +142,11 @@ LONG ExeptionHandler(EXCEPTION_POINTERS *ExceptionInfo)
u8* address=(u8*)pExceptionRecord->ExceptionInformation[1];
//printf("[EXC] During access to : 0x%X\n", address);
#if !defined(NO_MMU) && defined(HOST_64BIT_CPU)
bool write = false; // TODO?
if (vmem32_handle_signal(address, write))
return EXCEPTION_CONTINUE_EXECUTION;
#endif
if (VramLockedWrite(address))
{
@ -152,7 +158,8 @@ LONG ExeptionHandler(EXCEPTION_POINTERS *ExceptionInfo)
return EXCEPTION_CONTINUE_EXECUTION;
}
#endif
#if FEAT_SHREC == DYNAREC_JIT && HOST_CPU == CPU_X86
#if FEAT_SHREC == DYNAREC_JIT
#if HOST_CPU == CPU_X86
else if ( ngen_Rewrite((unat&)ep->ContextRecord->Eip,*(unat*)ep->ContextRecord->Esp,ep->ContextRecord->Eax) )
{
//remove the call from call stack
@ -161,6 +168,11 @@ LONG ExeptionHandler(EXCEPTION_POINTERS *ExceptionInfo)
ep->ContextRecord->Ecx=ep->ContextRecord->Eax;
return EXCEPTION_CONTINUE_EXECUTION;
}
#elif HOST_CPU == CPU_X64
else if (dyna_cde && ngen_Rewrite((unat&)ep->ContextRecord->Rip, 0, 0))
{
return EXCEPTION_CONTINUE_EXECUTION;
}
#endif
else
{
@ -576,7 +588,7 @@ _In_opt_ PVOID Context
// (DWORD)((u8 *)__gnat_SEH_error_handler - CodeCache);
/* Set its scope to the entire program. */
Table[0].BeginAddress = 0;// (CodeCache - (u8*)__ImageBase);
Table[0].EndAddress = /*(CodeCache - (u8*)__ImageBase) +*/ CODE_SIZE;
Table[0].EndAddress = /*(CodeCache - (u8*)__ImageBase) +*/ CODE_SIZE + TEMP_CODE_SIZE;
Table[0].UnwindData = (DWORD)((u8 *)unwind_info - CodeCache);
printf("TABLE CALLBACK\n");
//for (;;);
@ -605,13 +617,13 @@ void setup_seh() {
//(DWORD)((u8 *)__gnat_SEH_error_handler - CodeCache);
/* Set its scope to the entire program. */
Table[0].BeginAddress = 0;// (CodeCache - (u8*)__ImageBase);
Table[0].EndAddress = /*(CodeCache - (u8*)__ImageBase) +*/ CODE_SIZE;
Table[0].EndAddress = /*(CodeCache - (u8*)__ImageBase) +*/ CODE_SIZE + TEMP_CODE_SIZE;
Table[0].UnwindData = (DWORD)((u8 *)unwind_info - CodeCache);
/* Register the unwind information. */
RtlAddFunctionTable(Table, 1, (DWORD64)CodeCache);
#endif
//verify(RtlInstallFunctionTableCallback((unat)CodeCache | 0x3, (DWORD64)CodeCache, CODE_SIZE, seh_callback, 0, 0));
//verify(RtlInstallFunctionTableCallback((unat)CodeCache | 0x3, (DWORD64)CodeCache, CODE_SIZE + TEMP_CODE_SIZE, seh_callback, 0, 0));
}
#endif