Pseudonym Crazy-Opts Projects Inc presents: Improved block managers for EE/IOP recompilers! Basic asm optimizations combined with a technique of replacing NULL/invalid pointer checks with a direct link to a JIT dispatcher. A collaborative effort on IRC produced this gem, which improves speed on all counts, and even makes the Linux .S files uber-simple and clean. (No more wondering whether they're broken or not!)

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@687 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-03-05 18:13:14 +00:00
parent ce3a2c7d32
commit 482505d241
10 changed files with 631 additions and 744 deletions

View File

@ -35,7 +35,7 @@ void so_exit(void);
void recRecompile( u32 startpc );
// aR3000A.S
void psxRecRecompile(u32 startpc);
void iopRecRecompile(u32 startpc);
}
// Linux specific
@ -46,21 +46,23 @@ PCSX2_ALIGNED16( u8 _mmx_backup[8*4] );
extern "C"
{
// aVUzerorec.S
void* SuperVUGetProgram(u32 startpc, int vuindex);
void SuperVUCleanupProgram(u32 startpc, int vuindex);
void svudispfn();
// aR3000A.S
void psxDispatcher();
void psxDispatcherClear();
void psxDispatcherReg();
void iopJITCompile();
void iopDispatcher();
void iopDispatcherClear();
void iopDispatcherReg();
// aR5900-32.S
void JITCompile();
void Dispatcher();
void DispatcherClear();
void DispatcherReg();
}
#endif
#endif

View File

@ -1437,10 +1437,6 @@
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\Linux\ConfigDlg.h"
>
</File>
<File
RelativePath="..\..\Linux\CpuDlg.cpp"
>
@ -1497,10 +1493,6 @@
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\Linux\DebugDlg.h"
>
</File>
<File
RelativePath="..\..\Linux\HacksDlg.cpp"
>
@ -1585,10 +1577,6 @@
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\Linux\LnxMain.h"
>
</File>
<File
RelativePath="..\..\Linux\LnxMisc.cpp"
>
@ -1645,10 +1633,6 @@
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\Linux\LnxSysExec.h"
>
</File>
<File
RelativePath="..\..\Linux\LnxThreads.cpp"
>
@ -1677,10 +1661,6 @@
/>
</FileConfiguration>
</File>
<File
RelativePath="..\..\Linux\memzero.h"
>
</File>
<File
RelativePath="..\..\Linux\Pref.cpp"
>
@ -1709,6 +1689,58 @@
/>
</FileConfiguration>
</File>
<Filter
Name="asm"
>
<File
RelativePath="..\..\x86\aMicroVU.S"
>
</File>
<File
RelativePath="..\..\x86\aR3000A.S"
>
</File>
<File
RelativePath="..\..\x86\ix86-32\aR5900-32.S"
>
</File>
<File
RelativePath="..\..\x86\aVif.S"
>
</File>
<File
RelativePath="..\..\x86\aVUzerorec.S"
>
</File>
<File
RelativePath="..\..\x86\fast_routines.S"
>
</File>
</Filter>
<Filter
Name="Include"
>
<File
RelativePath="..\..\Linux\ConfigDlg.h"
>
</File>
<File
RelativePath="..\..\Linux\DebugDlg.h"
>
</File>
<File
RelativePath="..\..\Linux\LnxMain.h"
>
</File>
<File
RelativePath="..\..\Linux\LnxSysExec.h"
>
</File>
<File
RelativePath="..\..\Linux\memzero.h"
>
</File>
</Filter>
</Filter>
</Filter>
<Filter

View File

@ -16,6 +16,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#define _SECURE_SCL 0
#include "PrecompiledHeader.h"
#include "BaseblockEx.h"
@ -66,7 +68,6 @@ void BASEBLOCKS::Add(BASEBLOCKEX* pex)
}
assert( imin == blocks.size() || blocks[imin]->startpc > pex->startpc );
if( imin > 0 ) assert( blocks[imin-1]->startpc < pex->startpc );
blocks.insert(blocks.begin()+imin, pex);
return;
@ -77,10 +78,15 @@ void BASEBLOCKS::Add(BASEBLOCKEX* pex)
int BASEBLOCKS::Get(u32 startpc)
{
switch(blocks.size()) {
case 0:
return -1;
case 1:
return 0;
case 2:
return blocks.front()->startpc < startpc;
if (blocks.front()->startpc + blocks.front()->size*4 <= startpc)
return -1;
else
return 0;
/*case 2:
return blocks.front()->startpc < startpc;*/
default:
{
@ -94,8 +100,12 @@ int BASEBLOCKS::Get(u32 startpc)
else imin = imid+1;
}
assert( blocks[imin]->startpc == startpc );
return imin;
//assert( blocks[imin]->startpc == startpc );
if (startpc < blocks[imin]->startpc ||
startpc >= blocks[imin]->startpc + blocks[imin]->size*4)
return -1;
else
return imin;
}
}
}
@ -130,7 +140,11 @@ void AddBaseBlockEx(BASEBLOCKEX* pex, int cpu)
BASEBLOCKEX* GetBaseBlockEx(u32 startpc, int cpu)
{
return s_vecBaseBlocksEx[cpu].blocks[s_vecBaseBlocksEx[cpu].Get(startpc)];
int i = s_vecBaseBlocksEx[cpu].Get(startpc);
if (i < 0)
return 0;
else
return s_vecBaseBlocksEx[cpu].blocks[i];
}
void RemoveBaseBlockEx(BASEBLOCKEX* pex, int cpu)

View File

@ -20,7 +20,6 @@
#define _BASEBLOCKEX_H_
// used to keep block information
#define BLOCKTYPE_STARTPC 4 // startpc offset
#define BLOCKTYPE_DELAYSLOT 1 // if bit set, delay slot
// Every potential jump point in the PS2's addressable memory has a BASEBLOCK
@ -28,28 +27,22 @@
// addressable memory. Yay!
struct BASEBLOCK
{
u32 m_pFnptr : 28;
u32 uType : 4;
u32 startpc;
u32 m_pFnptr;
u32 startpc : 30;
u32 uType : 2;
const uptr GetFnptr() const { return ((u32)m_pFnptr)<<4; }
void SetFnptr( uptr ptr )
{
// 16 byte alignments only, please!
jASSUME( (ptr & 0xf) == 0 );
m_pFnptr = ptr>>4;
}
const __inline uptr GetFnptr() const { return m_pFnptr; }
void __inline SetFnptr( uptr ptr ) { m_pFnptr = ptr; }
const __inline uptr GetStartPC() const { return startpc << 2; }
void __inline SetStartPC( uptr pc ) { startpc = pc >> 2; }
};
// extra block info (only valid for start of fn)
// The only "important" piece of information is size. Startpc is used as a debug/check
// var to make sure the baseblock is sane. (and it's used for some FFX hack involving
// a big snake in a sewer, but no one knows if the hack is relevant anymore).
struct BASEBLOCKEX
{
u16 size; // size in dwords
u16 size; // size in dwords
u16 dummy;
u32 startpc; // for debugging?
u32 startpc;
#ifdef PCSX2_DEVBUILD
u32 visited; // number of times called
@ -60,7 +53,10 @@ struct BASEBLOCKEX
// This is an asinine macro that bases indexing on sizeof(BASEBLOCK) for no reason. (air)
#define GET_BLOCKTYPE(b) ((b)->Type)
#define PC_GETBLOCK_(x, reclut) ((BASEBLOCK*)(reclut[((u32)(x)) >> 16] + (sizeof(BASEBLOCK)/4)*((x) & 0xffff)))
// x * (sizeof(BASEBLOCK) / 4) sacrifices safety for speed compared to
// x / 4 * sizeof(BASEBLOCK) or a higher level approach.
#define PC_GETBLOCK_(x, reclut) ((BASEBLOCK*)(reclut[((u32)(x)) >> 16] + (x)*(sizeof(BASEBLOCK)/4)))
#define RECLUT_SETPAGE(reclut, page, p) do { reclut[page] = (uptr)(p) - ((page) << 14) * sizeof(BASEBLOCK); } while (0)
// This is needed because the PC_GETBLOCK_ macro above bases its indexing on sizeof(BASEBLOCK).
C_ASSERT( sizeof(BASEBLOCK) == 8 );

View File

@ -1,185 +1,126 @@
// iR3000A.c assembly routines
// zerofrog(@gmail.com)
.intel_syntax
.extern psxRegs
.extern psxRecLUT
.extern psxRecRecompile
.extern b440
.extern b440table
// iR3000a.c assembly routines
.intel_syntax noprefix
//////////////////////////////////////////////////////////////////////////
// Note that iR3000A.S and iR5900.S asm code is now identical. Only some
// function names and the following two defines should ever differ:
#define REGINFO psxRegs
#define PCOFFSET 0x208 // this must always match what Pcsx2 displays at startup
//////////////////////////////////////////////////////////////////////////
// Preprocessor Mess!
.extern REGINFO
.extern iopRecRecompile
#define BLOCKTYPE_STARTPC 4 // startpc offset
#define BLOCKTYPE_DELAYSLOT 1 // if bit set, delay slot
#define BASEBLOCK_SIZE 2 // in dwords
#define PCOFFSET 0x208
#define REG_PC %ecx
#define REG_BLOCK %esi
//////////////////////////////////////////////////////////////////////////
// The address for all cleared blocks. It recompiles the current pc and then
// dispatches to the recompiled block address.
.global JITCompile
iopJITCompile:
// jumped to when invalid psxpc address
.globl psxDispatcher
psxDispatcher:
// EDX contains the current psxpc to jump to, stack contains the jump addr to modify
push %edx
mov esi, dword ptr [REGINFO + PCOFFSET]
push esi
call iopRecRecompile
add esp, 4
mov ebx, esi
shr esi, 16
mov ecx, dword ptr [recLUT+esi*4]
jmp dword ptr [ecx+ebx*2]
// calc PSX_GETBLOCK
// ((BASEBLOCK*)(recLUT[((u32)(x)) >> 16] + (sizeof(BASEBLOCK)/4)*((x) & 0xffff)))
mov %eax, dword ptr [psxRegs + PCOFFSET]
mov REG_BLOCK, %eax
mov REG_PC, %eax
shr %eax, 16
and REG_BLOCK, 0xffff
shl %eax, 2
add %eax, [psxRecLUT]
shl REG_BLOCK, 1
add REG_BLOCK, dword ptr [%eax]
// check if startpc&PSX_MEMMASK == psxRegs.pc&PSX_MEMMASK
mov %eax, REG_PC
mov %edx, [REG_BLOCK+BLOCKTYPE_STARTPC]
//and %eax, PSX_MEMMASK // remove higher bits
//and %edx, PSX_MEMMASK
cmp %eax, %edx
je psxDispatcher_CheckPtr
//////////////////////////////////////////////////////////////////////////
// Recompiles the next block, and links the old block directly to it.
// This is a one-shot execution for any block which uses it. Once the block
// has been statically linked to the new block, this function will be bypassed
//
// edx - jump address to modify
.globl iopDispatcher
iopDispatcher:
// recompile
push REG_BLOCK
push REG_PC // psxpc
call psxRecRecompile
add %esp, 4 // pop old param
pop REG_BLOCK
psxDispatcher_CheckPtr:
mov REG_BLOCK, dword ptr [REG_BLOCK]
# calc PC_GETBLOCK
# ((BASEBLOCK*)(reclut[((u32)(x)) >> 16] + (x)*(sizeof(BASEBLOCK)/4)))
#ifdef _DEBUG
test REG_BLOCK, REG_BLOCK
jnz psxDispatcher_CallFn
// throw an exception
int 10
psxDispatcher_CallFn:
#endif
//and REG_BLOCK, 0x0fffffff
shl REG_BLOCK, 4
mov %edx, REG_BLOCK
pop %ecx // x86Ptr to mod
sub %edx, %ecx
sub %edx, 4
mov dword ptr [%ecx], %edx
mov eax, dword ptr [REGINFO + PCOFFSET]
mov ebx, eax
shr eax, 16
mov ecx, dword ptr [recLUT+eax*4]
mov eax, dword ptr [ecx+ebx*2]
jmp REG_BLOCK
cmp eax, offset JITCompile
je Dispatch_notcompiled
lea ebx, [eax-4]
sub ebx, edx
mov dword ptr [edx], ebx
jmp eax
.globl psxDispatcherClear
psxDispatcherClear:
// %EDX contains the current psxpc
mov dword ptr [psxRegs + PCOFFSET], %edx
.align 16
Dispatch_notcompiled:
mov esi, edx
lea edi, [ecx+ebx*2]
push ebx
call iopRecRecompile
add esp, 4
// calc PSX_GETBLOCK
// ((BASEBLOCK*)(recLUT[((u32)(x)) >> 16] + (sizeof(BASEBLOCK)/4)*((x) & 0xffff)))
mov %eax, %edx
mov REG_BLOCK, %edx
shr %eax, 16
and REG_BLOCK, 0xffff
shl %eax, 2
add %eax, [psxRecLUT]
shl REG_BLOCK, 1
add REG_BLOCK, dword ptr [%eax];
// check if startpc&PSX_MEMMASK == psxRegs.pc&PSX_MEMMASK
mov %eax, %edx
mov REG_PC, %edx
mov %edx, [REG_BLOCK+BLOCKTYPE_STARTPC]
//and %eax, PSX_MEMMASK // remove higher bits
//and %edx, PSX_MEMMASK
cmp %eax, %edx
jne psxDispatcherClear_Recompile
mov eax, dword ptr [edi]
lea ebx, [eax-4]
sub ebx, esi
mov dword ptr [esi], ebx
jmp eax
add %esp, 4 // ignore stack
mov %eax, dword ptr [REG_BLOCK]
#ifdef _DEBUG
test %eax, %eax
jnz psxDispatcherClear_CallFn
// throw an exception
int 10
psxDispatcherClear_CallFn:
#endif
//and %eax, 0x0fffffff
shl %eax, 4
jmp %eax
psxDispatcherClear_Recompile:
push REG_BLOCK
push REG_PC
call psxRecRecompile
add %esp, 4 // pop old param
pop REG_BLOCK
mov %eax, dword ptr [REG_BLOCK]
//////////////////////////////////////////////////////////////////////////
// edx - baseblock->startpc
// stack - x86Ptr
.globl iopDispatcherClear
iopDispatcherClear:
mov [REGINFO + PCOFFSET], edx
pop %ecx // old fnptr
# calc PC_GETBLOCK
# ((BASEBLOCK*)(reclut[((u32)(x)) >> 16] + (x)*(sizeof(BASEBLOCK)/4)))
//and %eax, 0x0fffffff
shl %eax, 4
mov byte ptr [%ecx], 0xe9 // jmp32
mov %edx, %eax
sub %edx, %ecx
sub %edx, 5
mov dword ptr [%ecx+1], %edx
mov ebx, edx
shr edx, 16
mov ecx, dword ptr [recLUT+edx*4]
mov eax, dword ptr [ecx+ebx*2]
jmp %eax
cmp eax, offset JITCompile
je Clear_notcompiled
add esp, 4
jmp eax
// called when jumping to variable psxpc address
.globl psxDispatcherReg
psxDispatcherReg:
.align 16
Clear_notcompiled:
lea edi, [ecx+ebx*2]
push ebx
call iopRecRecompile
add esp, 4
mov eax, dword ptr [edi]
//s_pDispatchBlock = PSX_GETBLOCK(psxRegs.pc);
mov %edx, dword ptr [psxRegs+PCOFFSET]
mov %ecx, %edx
pop ecx
mov byte ptr [ecx], 0xe9 // jmp32
lea ebx, [eax-5]
sub ebx, ecx
mov dword ptr [ecx+1], ebx
shr %edx, 14
and %edx, 0xfffffffc
add %edx, [psxRecLUT]
mov %edx, dword ptr [%edx]
jmp eax
mov %eax, %ecx
and %eax, 0xfffc
// %edx += 2*%eax
shl %eax, 1
add %edx, %eax
// check if startpc == psxRegs.pc
mov %eax, %ecx
cmp %eax, dword ptr [%edx+BLOCKTYPE_STARTPC]
jne psxDispatcherReg_recomp
mov %eax, dword ptr [%edx]
//////////////////////////////////////////////////////////////////////////
// called when jumping to variable pc address.
#ifdef _DEBUG
test %eax, %eax
jnz psxDispatcherReg_CallFn2
// throw an exception
int 10
psxDispatcherReg_CallFn2:
#endif
//and %eax, 0x0fffffff
shl %eax, 4
jmp %eax // fnptr
.globl iopDispatcherReg
iopDispatcherReg:
psxDispatcherReg_recomp:
// changed this to use push/pop instead (faster on cores) - air
push %edx
push %ecx
call psxRecRecompile
pop %ecx
pop %edx
mov %eax, dword ptr [%edx]
//and %eax, 0x0fffffff
shl %eax, 4
jmp %eax // fnptr
mov eax, dword ptr [REGINFO + PCOFFSET]
mov ebx, eax
shr eax, 16
mov ecx, dword ptr [recLUT+eax*4]
jmp dword ptr [ecx+ebx*2]

View File

@ -44,7 +44,7 @@ extern void zeroEx();
u32 g_psxMaxRecMem = 0;
u32 s_psxrecblocks[] = {0};
uptr *psxRecLUT;
uptr psxRecLUT[0x10000];
#define PSX_NUMBLOCKS (1<<12)
#define MAPBASE 0x48000000
@ -71,10 +71,6 @@ static u32 s_nInstCacheSize = 0;
static BASEBLOCK* s_pCurBlock = NULL;
static BASEBLOCKEX* s_pCurBlockEx = NULL;
#if defined(_MSC_VER) && !defined(__x86_64__)
static BASEBLOCK* s_pDispatchBlock = NULL;
#endif
static u32 s_nEndBlock = 0; // what psxpc the current block ends
static u32 s_nNextBlock = 0; // next free block in recBlocks
@ -92,6 +88,8 @@ void psxRecompileNextInstruction(int delayslot);
extern void (*rpsxBSC[64])();
void rpsxpropBSC(EEINST* prev, EEINST* pinst);
static void iopInitRecLUT(BASEBLOCK* base, int count);
#ifdef _DEBUG
u32 psxdump = 0;
#else
@ -101,9 +99,8 @@ u32 psxdump = 0;
#define PSX_GETBLOCK(x) PC_GETBLOCK_(x, psxRecLUT)
#define PSXREC_CLEARM(mem) { \
if ((mem) < g_psxMaxRecMem && psxRecLUT[(mem) >> 16]) { \
BASEBLOCK* p = PSX_GETBLOCK(mem); \
if( *(u32*)p ) psxRecClearMem(p); \
if ((mem) < g_psxMaxRecMem && (psxRecLUT[(mem) >> 16] + mem)) { \
psxRecClearMem(mem); \
} \
} \
@ -114,7 +111,7 @@ BASEBLOCKEX* PSX_GETBLOCKEX(BASEBLOCK* p)
// return pex;
// otherwise, use the sorted list
return GetBaseBlockEx(p->startpc, 1);
return GetBaseBlockEx(p->GetStartPC(), 1);
}
////////////////////////////////////////////////////
@ -546,9 +543,6 @@ static void recAlloc()
if( recMem == NULL )
throw Exception::OutOfMemory( "R3000a Init > failed to allocate memory for the recompiler." );
if( psxRecLUT == NULL )
psxRecLUT = (uptr*) malloc(0x010000 * sizeof(uptr));
// Goal: Allocate BASEBLOCKs for every possible branch target in IOP memory.
// Any 4-byte aligned address makes a valid branch target as per MIPS design (all instructions are
// always 4 bytes long).
@ -586,31 +580,33 @@ void recResetIOP()
DbgCon::Status( "iR3000A > Resetting recompiler memory and structures!" );
memzero_ptr<0x010000 * sizeof(uptr)>( psxRecLUT );
memzero_ptr<sizeof(psxRecLUT)>( psxRecLUT );
memset_8<0xcd,RECMEM_SIZE>( recMem );
memzero_ptr<m_recBlockAllocSize>( m_recBlockAlloc );
memzero_ptr<PSX_NUMBLOCKS*sizeof(BASEBLOCKEX)>(recBlocks);
iopInitRecLUT((BASEBLOCK*)m_recBlockAlloc,
(((Ps2MemSize::IopRam + Ps2MemSize::Rom + Ps2MemSize::Rom1) / 4)));
// We're only mapping 20 pages here in 4 places.
// 0x80 comes from : (Ps2MemSize::IopRam / 0x10000) * 4
for (int i=0; i<0x80; i++)
{
psxRecLUT[i + 0x0000] = (uptr)&recRAM[(i & 0x1f) << 14];
psxRecLUT[i + 0x8000] = (uptr)&recRAM[(i & 0x1f) << 14];
psxRecLUT[i + 0xa000] = (uptr)&recRAM[(i & 0x1f) << 14];
RECLUT_SETPAGE(psxRecLUT, i + 0x0000, &recRAM[(i & 0x1f) << 14]);
RECLUT_SETPAGE(psxRecLUT, i + 0x8000, &recRAM[(i & 0x1f) << 14]);
RECLUT_SETPAGE(psxRecLUT, i + 0xa000, &recRAM[(i & 0x1f) << 14]);
}
for (int i=0; i<(Ps2MemSize::Rom / 0x10000); i++)
{
psxRecLUT[i + 0x1fc0] = (uptr)&recROM[i << 14];
psxRecLUT[i + 0x9fc0] = (uptr)&recROM[i << 14];
psxRecLUT[i + 0xbfc0] = (uptr)&recROM[i << 14];
RECLUT_SETPAGE(psxRecLUT, i + 0x1fc0, &recROM[i << 14]);
RECLUT_SETPAGE(psxRecLUT, i + 0x9fc0, &recROM[i << 14]);
RECLUT_SETPAGE(psxRecLUT, i + 0xbfc0, &recROM[i << 14]);
}
for (int i=0; i<(Ps2MemSize::Rom1 / 0x10000); i++)
{
psxRecLUT[i + 0x1e00] = (uptr)&recROM1[i << 14];
psxRecLUT[i + 0x9e00] = (uptr)&recROM1[i << 14];
psxRecLUT[i + 0xbe00] = (uptr)&recROM1[i << 14];
RECLUT_SETPAGE(psxRecLUT, i + 0x1e00, &recROM1[i << 14]);
RECLUT_SETPAGE(psxRecLUT, i + 0x9e00, &recROM1[i << 14]);
RECLUT_SETPAGE(psxRecLUT, i + 0xbe00, &recROM1[i << 14]);
}
if( s_pInstCache )
@ -630,7 +626,6 @@ static void recShutdown()
SafeSysMunmap(recMem, RECMEM_SIZE);
safe_aligned_free( m_recBlockAlloc );
safe_free(psxRecLUT);
safe_free( s_pInstCache );
s_nInstCacheSize = 0;
}
@ -644,7 +639,7 @@ static __forceinline void R3000AExecute()
pblock = PSX_GETBLOCK(psxRegs.pc);
if ( !pblock->GetFnptr() || (pblock->startpc&PSX_MEMMASK) != (psxRegs.pc&PSX_MEMMASK) ) {
if ( !pblock->GetFnptr() || (pblock->GetStartPC()&PSX_MEMMASK) != (psxRegs.pc&PSX_MEMMASK) ) {
psxRecRecompile(psxRegs.pc);
}
@ -691,119 +686,111 @@ u32 g_psxlastpc = 0;
static u32 g_temp;
// jumped to when invalid psxpc address
__declspec(naked) void psxDispatcher()
// The address for all cleared blocks. It recompiles the current pc and then
// dispatches to the recompiled block address.
static __declspec(naked) void iopJITCompile()
{
// EDX contains the current psxpc to jump to, stack contains the jump addr to modify
__asm push edx
//jASSUME( psxRegs.pc <= PSX_MEMMASK );
s_pDispatchBlock = PSX_GETBLOCK( psxRegs.pc );
if( s_pDispatchBlock->startpc != psxRegs.pc )
psxRecRecompile(psxRegs.pc);
__asm
{
mov eax, s_pDispatchBlock
mov eax, dword ptr [eax]
__asm {
mov esi, dword ptr [psxRegs.pc]
push esi
call iopRecRecompile
add esp, 4
mov ebx, esi
shr esi, 16
mov ecx, dword ptr [psxRecLUT+esi*4]
jmp dword ptr [ecx+ebx*2]
}
}
#ifdef _DEBUG
__asm mov g_temp, eax
assert( g_temp );
#endif
// jumped to when an immediate branch (EE side) hasn't been statically linked yet.
// Block is compiled if needed, and the link is made.
// EDX contains the jump addr to modify
static __declspec(naked) void iopDispatcher()
{
__asm {
mov eax, dword ptr [psxRegs.pc]
mov ebx, eax
shr eax, 16
mov ecx, dword ptr [psxRecLUT+eax*4]
mov eax, dword ptr [ecx+ebx*2]
cmp eax, offset iopJITCompile
je notcompiled
lea ebx, [eax-4]
sub ebx, edx
mov dword ptr [edx], ebx
jmp eax
// Modify the prev block's jump address, and jump to the new block:
__asm
{
shl eax,4
pop ecx // x86Ptr[0] to mod
mov edx, eax
sub edx, ecx
sub edx, 4
mov dword ptr [ecx], edx
align 16
notcompiled:
mov esi, edx
lea edi, [ecx+ebx*2]
push ebx
call iopRecRecompile
add esp, 4
mov eax, dword ptr [edi]
lea ebx, [eax-4]
sub ebx, esi
mov dword ptr [esi], ebx
jmp eax
}
}
__declspec(naked) void psxDispatcherClear()
// edx - baseblock->GetStartPC()
// stack - x86Ptr[0]
static __declspec(naked) void iopDispatcherClear()
{
// EDX contains the current psxpc
__asm mov psxRegs.pc, edx
__asm push edx
__asm {
mov [psxRegs.pc], edx
mov ebx, edx
shr edx, 16
mov ecx, dword ptr [psxRecLUT+edx*4]
mov eax, dword ptr [ecx+ebx*2]
cmp eax, iopJITCompile
je notcompiled
add esp, 4
jmp eax
//jASSUME( psxRegs.pc <= PSX_MEMMASK );
align 16
notcompiled:
lea edi, [ecx+ebx*2]
push ebx
call iopRecRecompile
add esp, 4
mov eax, dword ptr [edi]
// calc PSX_GETBLOCK
s_pDispatchBlock = PSX_GETBLOCK(psxRegs.pc);
if( s_pDispatchBlock->startpc == psxRegs.pc ) {
assert( s_pDispatchBlock->GetFnptr() != 0 );
// already modded the code, jump to the new place
__asm
{
pop edx
add esp, 4 // ignore stack
mov eax, s_pDispatchBlock
mov eax, dword ptr [eax]
//and eax, 0x0fffffff
shl eax,4
jmp eax
}
}
__asm
{
call psxRecRecompile
add esp, 4 // pop old param
mov eax, s_pDispatchBlock
mov eax, dword ptr [eax]
pop ecx // old fnptr
//and eax, 0x0fffffff
shl eax,4
pop ecx
mov byte ptr [ecx], 0xe9 // jmp32
mov edx, eax
sub edx, ecx
sub edx, 5
mov dword ptr [ecx+1], edx
lea ebx, [eax-5]
sub ebx, ecx
mov dword ptr [ecx+1], ebx
jmp eax
}
}
// called when jumping to variable psxpc address
__declspec(naked) void psxDispatcherReg()
static __declspec(naked) void iopDispatcherReg()
{
//jASSUME( psxRegs.pc <= PSX_MEMMASK );
s_pDispatchBlock = PSX_GETBLOCK( psxRegs.pc );
if( s_pDispatchBlock->startpc != psxRegs.pc )
psxRecRecompile(psxRegs.pc);
__asm
{
mov eax, s_pDispatchBlock
mov eax, dword ptr [eax]
}
#ifdef _DEBUG
__asm mov g_temp, eax
assert( g_temp );
#endif
__asm
{
shl eax, 4
jmp eax
__asm {
mov eax, dword ptr [psxRegs.pc]
mov ebx, eax
shr eax, 16
mov ecx, dword ptr [psxRecLUT+eax*4]
jmp dword ptr [ecx+ebx*2]
}
}
#endif // _MSC_VER
static void iopInitRecLUT(BASEBLOCK* base, int count)
{
	// Resets `count` consecutive BASEBLOCK entries beginning at `base`: every entry
	// is pointed at the iopJITCompile stub, and its start pc and type are cleared.
	for (BASEBLOCK* entry = base; count > 0; ++entry, --count)
	{
		entry->SetFnptr((uptr)iopJITCompile);
		entry->SetStartPC(0);
		entry->uType = 0;
	}
}
static void recExecute()
{
// note: this function is currently never used.
@ -827,7 +814,7 @@ static s32 recExecuteBlock( s32 eeCycles )
push edi
push ebp
call psxDispatcherReg
call iopDispatcherReg
pop ebp
pop edi
@ -869,57 +856,44 @@ static void recClear(u32 Addr, u32 Size)
void rpsxMemConstClear(u32 mem)
{
// NOTE! This assumes recLUT never changes its mapping
if( !psxRecLUT[mem>>16] )
if( !(psxRecLUT[mem>>16] + mem) )
return;
CMP32ItoM((uptr)PSX_GETBLOCK(mem), 0);
j8Ptr[6] = JE8(0);
_callFunctionArg1((uptr)psxRecClearMem, MEM_CONSTTAG, (uptr)PSX_GETBLOCK(mem));
_callFunctionArg1((uptr)psxRecClearMem, MEM_CONSTTAG, mem);
x86SetJ8(j8Ptr[6]);
}
void psxRecClearMem(BASEBLOCK* p)
void psxRecClearMem(u32 pc)
{
BASEBLOCKEX* pexblock;
BASEBLOCK* pstart;
int lastdelay;
assert( p != NULL );
BASEBLOCK* p;
p= PSX_GETBLOCK(pc);
pc = p->GetStartPC();
if (!pc)
return;
pexblock = GetBaseBlockEx(pc, 1);
if (!pexblock)
return;
pstart = PSX_GETBLOCK(pexblock->startpc);
if( p->uType & BLOCKTYPE_DELAYSLOT ) {
psxRecClearMem(p-1);
if( p->GetFnptr() == 0 )
return;
}
assert( p->GetFnptr() != 0 );
assert( p->startpc );
for (int i = 0; i < pexblock->size; i++) {
x86Ptr[0] = (u8*)pstart[i].GetFnptr();
if (x86Ptr[0] == (u8*)iopJITCompile)
continue;
x86Ptr[0] = (u8*)p->GetFnptr();
// there is a small problem: mem can be ored with 0xa<<28 or 0x8<<28, and don't know which
MOV32ItoR(EDX, p->startpc);
assert( (uptr)x86Ptr[0] <= 0xffffffff );
PUSH32I((uptr)x86Ptr[0]);
JMP32((uptr)psxDispatcherClear - ( (uptr)x86Ptr[0] + 5 ));
assert( x86Ptr[0] == (u8*)p->GetFnptr() + IOP_MIN_BLOCK_BYTES );
pstart = PSX_GETBLOCK(p->startpc);
pexblock = PSX_GETBLOCKEX(pstart);
assert( pexblock->startpc == pstart->startpc );
// don't delete if last is delay
lastdelay = pexblock->size;
if( pstart[pexblock->size-1].uType & BLOCKTYPE_DELAYSLOT ) {
assert( pstart[pexblock->size-1].GetFnptr() != pstart->GetFnptr() );
if( pstart[pexblock->size-1].GetFnptr() != 0 ) {
pstart[pexblock->size-1].uType = 0;
--lastdelay;
}
// there is a small problem: mem can be ored with 0xa<<28 or 0x8<<28, and don't know which
MOV32ItoR(EDX, pexblock->startpc + i*4);
assert( (uptr)x86Ptr[0] <= 0xffffffff );
PUSH32I((uptr)x86Ptr[0]); // will be replaced by JMP32
JMP32((uptr)iopDispatcherClear - ( (uptr)x86Ptr[0] + 5 ));
}
memset(pstart, 0, lastdelay*sizeof(BASEBLOCK));
iopInitRecLUT(pstart, pexblock->size);
RemoveBaseBlockEx(pexblock, 1);
pexblock->size = 0;
@ -950,7 +924,7 @@ void psxSetBranchReg(u32 reg)
_psxFlushCall(FLUSH_EVERYTHING);
iPsxBranchTest(0xffffffff, 1);
JMP32((uptr)psxDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
}
void psxSetBranchImm( u32 imm )
@ -966,7 +940,7 @@ void psxSetBranchImm( u32 imm )
MOV32ItoR(EDX, 0);
ptr = (u32*)(x86Ptr[0]-4);
*ptr = (uptr)JMP32((uptr)psxDispatcher - ( (uptr)x86Ptr[0] + 5 ));
*ptr = (uptr)JMP32((uptr)iopDispatcher - ( (uptr)x86Ptr[0] + 5 ));
}
//fixme : this is all a huge hack, we base the counter advancements on the average an opcode should take (wtf?)
@ -1007,7 +981,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch)
if( newpc != 0xffffffff )
{
CMP32ItoM((uptr)&psxRegs.pc, newpc);
JNE32((uptr)psxDispatcherReg - ( (uptr)x86Ptr[0] + 6 ));
JNE32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 6 ));
}
// Skip branch jump target here:
@ -1043,7 +1017,7 @@ void rpsxSYSCALL()
ADD32ItoM((uptr)&psxRegs.cycle, psxScaleBlockCycles() );
SUB32ItoM((uptr)&psxCycleEE, psxScaleBlockCycles()*8 );
JMP32((uptr)psxDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
// jump target for skipping blockCycle updates
x86SetJ8(j8Ptr[0]);
@ -1063,7 +1037,7 @@ void rpsxBREAK()
j8Ptr[0] = JE8(0);
ADD32ItoM((uptr)&psxRegs.cycle, psxScaleBlockCycles() );
SUB32ItoM((uptr)&psxCycleEE, psxScaleBlockCycles()*8 );
JMP32((uptr)psxDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
x86SetJ8(j8Ptr[0]);
//if (!psxbranch) psxbranch = 2;
@ -1073,8 +1047,8 @@ u32 psxRecompileCodeSafe(u32 temppc)
{
BASEBLOCK* pblock = PSX_GETBLOCK(temppc);
if( pblock->GetFnptr() != 0 && pblock->startpc != s_pCurBlock->startpc ) {
if( psxpc == pblock->startpc )
if( pblock->GetFnptr() != (uptr)iopJITCompile && pblock->GetStartPC() != s_pCurBlock->GetStartPC() ) {
if( psxpc == pblock->GetStartPC() )
return 0;
}
@ -1088,32 +1062,26 @@ void psxRecompileNextInstruction(int delayslot)
BASEBLOCK* pblock = PSX_GETBLOCK(psxpc);
// need *ppblock != s_pCurBlock because of branches
if( pblock->GetFnptr() != 0 && pblock->startpc != s_pCurBlock->startpc ) {
if( !delayslot && psxpc == pblock->startpc ) {
if( pblock->GetFnptr() != (uptr)iopJITCompile && pblock->GetStartPC() != s_pCurBlock->GetStartPC() )
{
if( !delayslot && psxpc == pblock->GetStartPC() )
{
// code already in place, so jump to it and exit recomp
assert( PSX_GETBLOCKEX(pblock)->startpc == pblock->startpc );
assert( PSX_GETBLOCKEX(pblock)->startpc == pblock->GetStartPC() );
_psxFlushCall(FLUSH_EVERYTHING);
MOV32ItoM((uptr)&psxRegs.pc, psxpc);
// if( pexblock->pOldFnptr ) {
// // code already in place, so jump to it and exit recomp
// JMP32((uptr)pexblock->pOldFnptr - ((uptr)x86Ptr[0] + 5));
// branch = 3;
// return;
// }
JMP32((uptr)pblock->GetFnptr() - ((uptr)x86Ptr[0] + 5));
psxbranch = 3;
return;
}
else {
if( !(delayslot && pblock->startpc == psxpc) ) {
else
{
if( !(delayslot && pblock->GetStartPC() == psxpc) )
{
u8* oldX86 = x86Ptr[0];
//__Log("clear block %x\n", pblock->startpc);
psxRecClearMem(pblock);
//__Log("clear block %x\n", pblock->GetStartPC());
psxRecClearMem(psxpc);
x86Ptr[0] = oldX86;
if( delayslot )
SysPrintf("delay slot %x\n", psxpc);
@ -1223,7 +1191,7 @@ static void printfn()
}
#endif
void psxRecRecompile(u32 startpc)
void iopRecRecompile(u32 startpc)
{
u32 i;
u32 branchTo;
@ -1246,13 +1214,13 @@ void psxRecRecompile(u32 startpc)
s_pCurBlock = PSX_GETBLOCK(startpc);
if( s_pCurBlock->GetFnptr() ) {
if( s_pCurBlock->GetFnptr() != (uptr)iopJITCompile ) {
// clear if already taken
assert( s_pCurBlock->startpc < startpc );
psxRecClearMem(s_pCurBlock);
assert( s_pCurBlock->GetStartPC() < startpc );
psxRecClearMem(startpc);
}
if( s_pCurBlock->startpc == startpc ) {
if( s_pCurBlock->GetStartPC() == startpc ) {
s_pCurBlockEx = PSX_GETBLOCKEX(s_pCurBlock);
assert( s_pCurBlockEx->startpc == startpc );
}
@ -1282,7 +1250,7 @@ void psxRecRecompile(u32 startpc)
psxbranch = 0;
s_pCurBlock->startpc = startpc;
s_pCurBlock->SetStartPC(startpc);
s_pCurBlock->SetFnptr( (uptr)x86Ptr[0] );
s_psxBlockCycles = 0;
@ -1305,9 +1273,9 @@ void psxRecRecompile(u32 startpc)
while(1) {
BASEBLOCK* pblock = PSX_GETBLOCK(i);
if( pblock->GetFnptr() != 0 && pblock->startpc != s_pCurBlock->startpc ) {
if( pblock->GetFnptr() != (uptr)iopJITCompile && pblock->GetStartPC() != s_pCurBlock->GetStartPC() ) {
if( i == pblock->startpc ) {
if( i == pblock->GetStartPC() ) {
// branch = 3
willbranch3 = 1;
s_nEndBlock = i;
@ -1407,16 +1375,25 @@ StartRecomp:
assert( (psxpc-startpc)>>2 <= 0xffff );
s_pCurBlockEx->size = (psxpc-startpc)>>2;
for(i = 1; i <= (u32)s_pCurBlockEx->size-1; ++i) {
if (!s_pCurBlock[i].GetStartPC())
s_pCurBlock[i].SetStartPC( startpc );
}
// This is just wrong, right? How can setting a jump to any point in this block
// to jump to the beginning of the block possibly be right?
#ifdef ZERO_TOLERANCE
for(i = 1; i < (u32)s_pCurBlockEx->size-1; ++i) {
s_pCurBlock[i].SetFnptr( s_pCurBlock->GetFnptr() );
s_pCurBlock[i].startpc = s_pCurBlock->startpc;
s_pCurBlock[i].SetStartPC( p_CurBlock->startpc );
}
// don't overwrite if delay slot
if( i < (u32)s_pCurBlockEx->size && !(s_pCurBlock[i].uType & BLOCKTYPE_DELAYSLOT) ) {
s_pCurBlock[i].SetFnptr( s_pCurBlock->GetFnptr() );
s_pCurBlock[i].startpc = s_pCurBlock->startpc;
s_pCurBlock[i].SetFnptr(0);
s_pCurBlock[i].SetStartPC(0);
}
#endif
// set the block ptr
AddBaseBlockEx(s_pCurBlockEx, 1);
@ -1429,7 +1406,7 @@ StartRecomp:
iPsxBranchTest(0xffffffff, 1);
JMP32((uptr)psxDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
JMP32((uptr)iopDispatcherReg - ( (uptr)x86Ptr[0] + 5 ));
}
else {
assert( psxbranch != 3 );
@ -1472,23 +1449,23 @@ StartRecomp:
s_pCurBlock = PSX_GETBLOCK(psxpc);
assert( ptr != NULL );
if( s_pCurBlock->startpc != psxpc ){
psxRecRecompile(psxpc);
if( s_pCurBlock->GetStartPC() != psxpc ){
iopRecRecompile(psxpc);
}
// could have reset
if( pcurblock->startpc == startpc ) {
assert( pcurblock->GetFnptr() );
assert( s_pCurBlock->startpc == nEndBlock );
if( pcurblock->GetStartPC() == startpc ) {
assert( pcurblock->GetFnptr() != (uptr)iopJITCompile );
assert( s_pCurBlock->GetStartPC() == nEndBlock );
*ptr = (u32)((uptr)s_pCurBlock->GetFnptr() - ( (uptr)ptr + 4 ));
}
else {
psxRecRecompile(startpc);
assert( pcurblock->GetFnptr() != 0 );
iopRecRecompile(startpc);
assert( pcurblock->GetFnptr() != (uptr)iopJITCompile );
}
}
else
assert( s_pCurBlock->GetFnptr() != 0 );
assert( s_pCurBlock->GetFnptr() != (uptr)iopJITCompile );
}
R3000Acpu psxRec = {

View File

@ -38,7 +38,7 @@ static const int psxInstCycles_Load = 0;
#define PSX_HI XMMGPR_HI
#define PSX_LO XMMGPR_LO
extern uptr *psxRecLUT;
extern uptr psxRecLUT[];
u8 _psxLoadWritesRs(u32 tempcode);
u8 _psxIsLoadStore(u32 tempcode);
@ -70,7 +70,7 @@ void psxLoadBranchState();
void psxSetBranchReg(u32 reg);
void psxSetBranchImm( u32 imm );
void psxRecompileNextInstruction(int delayslot);
void psxRecClearMem(BASEBLOCK* p);
void psxRecClearMem(u32 p);
////////////////////////////////////////////////////////////////////
// IOP Constant Propagation Defines, Vars, and API - From here down!

View File

@ -51,7 +51,7 @@
extern u32 pc;
extern int branch;
extern uptr* recLUT;
extern uptr recLUT[];
extern u32 maxrecmem;
extern u32 pc; // recompiler pc (also used by the SuperVU! .. why? (air))
@ -92,7 +92,7 @@ extern GPR_reg64 s_ConstGPRreg;
// Used to clear recompiled code blocks during memory/dma write operations.
void recClearMem(BASEBLOCK* p);
void recClearMem(u32 pc);
void REC_CLEARM( u32 mem );
// used when processing branches

View File

@ -1,169 +1,127 @@
// iR5900.c assembly routines
// zerofrog(@gmail.com)
.intel_syntax
.extern cpuRegs
// iR5900.c assembly routines
.intel_syntax noprefix
//////////////////////////////////////////////////////////////////////////
// Note that iR3000A.S and iR5900.S asm code is now identical. Only some
// function names and the following two defines should ever differ:
#define REGINFO cpuRegs
#define PCOFFSET 0x2a8 // this must always match what Pcsx2 displays at startup
//////////////////////////////////////////////////////////////////////////
// Preprocessor Mess!
.extern REGINFO
.extern recRecompile
#define BLOCKTYPE_STARTPC 4 // startpc offset
#define BLOCKTYPE_DELAYSLOT 1 // if bit set, delay slot
#define BASEBLOCK_SIZE 2 // in dwords
#define PCOFFSET 0x2a8 // this must always match what Pcsx2 displays at startup
#define REG_BLOCK %esi
//////////////////////////////////////////////////////////////////////////
// The address for all cleared blocks. It recompiles the current pc and then
// dispatches to the recompiled block address.
.global JITCompile
JITCompile:
// Load the current pc from REGINFO + PCOFFSET and pass it on the stack as the
// single argument to recRecompile.
mov esi, dword ptr [REGINFO + PCOFFSET]
push esi
call recRecompile
// Discard the argument pushed above.
add esp, 4
// Locate the block entry for that pc as recLUT[pc >> 16] + pc*2 and jump through
// the dword stored there.
// NOTE(review): this reuses esi after the call, so it assumes recRecompile
// preserves esi - confirm against the calling convention in use.
mov ebx, esi
shr esi, 16
mov ecx, dword ptr [recLUT+esi*4]
jmp dword ptr [ecx+ebx*2]
//////////////////////////////////////////////////////////////////////////
// Recompiles the next block, and links the old block directly to it.
// This is a on-shot execution for ny block which uses it. Once the block
// This is a one-shot execution for any block which uses it. Once the block
// has been statically linked to the new block, this function will be bypassed
//
// edx - jump address to modify
.globl Dispatcher
Dispatcher:
# EDX contains the jump addr to modify
push %edx
# calc PC_GETBLOCK
# ((BASEBLOCK*)(recLUT[((u32)(x)) >> 16] + (sizeof(BASEBLOCK)/4)*((x) & 0xffff)))
# ((BASEBLOCK*)(reclut[((u32)(x)) >> 16] + (x)*(sizeof(BASEBLOCK)/4)))
mov %eax,dword ptr [cpuRegs+PCOFFSET]
mov %ecx,%eax // ecx is the BLOCK address
mov %esi,%eax // esi is the PC address (leave unmodified!)
shr %eax,0x10
and %ecx,0xFFFF
mov %edx,dword ptr [recLUT]
mov %eax,dword ptr [%edx+%eax*4]
lea %ecx,[%eax+%ecx*2]
mov eax, dword ptr [REGINFO + PCOFFSET]
mov ebx, eax
shr eax, 16
mov ecx, dword ptr [recLUT+eax*4]
mov eax, dword ptr [ecx+ebx*2]
// check if startpc == cpuRegs.pc
//and %ecx, 0x5fffffff // remove higher bits
cmp %esi, dword ptr [%ecx+BLOCKTYPE_STARTPC]
je Dispatcher_CheckPtr
cmp eax, offset JITCompile
je Dispatch_notcompiled
lea ebx, [eax-4]
sub ebx, edx
mov dword ptr [edx], ebx
jmp eax
// recompile
push %ecx
push %esi // pc
.align 16
Dispatch_notcompiled:
mov esi, edx
lea edi, [ecx+ebx*2]
push ebx
call recRecompile
add %esp, 4
pop %ecx // ecx is now the REG_BLOCK
Dispatcher_CheckPtr:
mov %eax, dword ptr [%ecx]
add esp, 4
#ifdef _DEBUG
test %eax, %eax
jnz Dispatcher_CallFn
// throw an exception
int 10
Dispatcher_CallFn:
#endif
//and %eax, 0x0fffffff
shl %eax, 4
pop %ecx // x86Ptr to mod
mov %edx, %eax
sub %edx, %ecx
sub %edx, 4
mov dword ptr [%ecx], %edx
mov eax, dword ptr [edi]
lea ebx, [eax-4]
sub ebx, esi
mov dword ptr [esi], ebx
jmp eax
jmp %eax
//////////////////////////////////////////////////////////////////////////
// edx - baseblock->startpc
// stack - x86Ptr
.globl DispatcherClear
DispatcherClear:
// EDX contains the current pc
mov dword ptr [cpuRegs + PCOFFSET], %edx
mov [REGINFO + PCOFFSET], edx
// calc PC_GETBLOCK
# ((BASEBLOCK*)(recLUT[((u32)(x)) >> 16] + (sizeof(BASEBLOCK)/4)*((x) & 0xffff)))
mov %eax, %edx
mov REG_BLOCK, %edx
shr %eax, 16
and REG_BLOCK, 0xffff
shl %eax, 2
add %eax, dword ptr [recLUT]
shl REG_BLOCK, 1
add REG_BLOCK, dword ptr [%eax]
# calc PC_GETBLOCK
# ((BASEBLOCK*)(reclut[((u32)(x)) >> 16] + (x)*(sizeof(BASEBLOCK)/4)))
cmp %edx, dword ptr [REG_BLOCK + 4]
jne DispatcherClear_Recompile
add %esp, 4 // ignore stack
mov %eax, dword ptr [REG_BLOCK]
#ifdef _DEBUG
test %eax, %eax
jnz DispatcherClear_CallFn
# throw an exception
int 10
DispatcherClear_CallFn:
#endif
mov ebx, edx
shr edx, 16
mov ecx, dword ptr [recLUT+edx*4]
mov eax, dword ptr [ecx+ebx*2]
//and %eax, 0x0fffffff
shl %eax, 4
jmp %eax
cmp eax, offset JITCompile
je Clear_notcompiled
add esp, 4
jmp eax
DispatcherClear_Recompile:
push REG_BLOCK
push %edx
.align 16
Clear_notcompiled:
lea edi, [ecx+ebx*2]
push ebx
call recRecompile
add %esp, 4 // pop old param
pop REG_BLOCK
mov %eax, dword ptr [REG_BLOCK]
add esp, 4
mov eax, dword ptr [edi]
pop %ecx // old fnptr
pop ecx
mov byte ptr [ecx], 0xe9 // jmp32
lea ebx, [eax-5]
sub ebx, ecx
mov dword ptr [ecx+1], ebx
//and %eax, 0x0fffffff
shl %eax, 4
mov byte ptr [%ecx], 0xe9 // jmp32
mov %edx, %eax
sub %edx, %ecx
sub %edx, 5
mov dword ptr [%ecx+1], %edx
jmp eax
jmp %eax
//////////////////////////////////////////////////////////////////////////
// called when jumping to variable pc address
// This is basically the same as Dispatcher but without the part at the end
// that modifies the block's jmp instruction. (ie, no static block linking)
// called when jumping to variable pc address.
.globl DispatcherReg
DispatcherReg:
mov %eax,dword ptr [cpuRegs+PCOFFSET]
mov %ecx,%eax // ecx will be the BLOCK
mov %esi,%eax // esi is the PC address (leave unmodified!)
shr %eax,0x10
and %ecx,0xFFFF
mov %edx,dword ptr [recLUT]
mov %eax,dword ptr [%edx+%eax*4]
lea %ecx,[%eax+%ecx*2]
// check if startpc == cpuRegs.pc
//and %ecx, 0x5fffffff // remove higher bits
cmp %esi, dword ptr [%ecx+BLOCKTYPE_STARTPC]
je DispatcherReg_CheckPtr
// recompile
push %ecx // block
push %esi // pc
call recRecompile
add %esp, 4
pop %ecx // block
DispatcherReg_CheckPtr:
mov %eax, dword ptr [%ecx]
#ifdef _DEBUG
test %eax, %eax
jnz DispatcherReg_CallFn
// throw an exception
int 10
DispatcherReg_CallFn:
#endif
//and %eax, 0x0fffffff
shl %eax, 4
jmp %eax
mov eax, dword ptr [REGINFO + PCOFFSET]
mov ebx, eax
shr eax, 16
mov ecx, dword ptr [recLUT+eax*4]
jmp dword ptr [ecx+ebx*2]

View File

@ -52,7 +52,7 @@ using namespace R5900;
bool g_EEFreezeRegs = false;
u32 maxrecmem = 0;
uptr *recLUT = NULL;
uptr recLUT[0x10000];
u32 s_nBlockCycles = 0; // cycles of current block recompiling
//u8* dyna_block_discard_recmem=0;
@ -84,7 +84,6 @@ static u32 s_nInstCacheSize = 0;
static BASEBLOCK* s_pCurBlock = NULL;
static BASEBLOCKEX* s_pCurBlockEx = NULL;
const BASEBLOCK* s_pDispatchBlock = NULL;
static u32 s_nEndBlock = 0; // what pc the current block ends
static u32 s_nHasDelay = 0;
@ -112,15 +111,16 @@ static u32 dumplog = 0;
#endif
static void iBranchTest(u32 newpc, bool noDispatch=false);
static void InitRecLUT(BASEBLOCK* base, int count);
BASEBLOCKEX* PC_GETBLOCKEX(BASEBLOCK* p)
BASEBLOCKEX* PC_GETBLOCKEX(u32 pc)
{
// BASEBLOCKEX* pex = *(BASEBLOCKEX**)(p+1);
// if( pex >= recBlocks && pex < recBlocks+EE_NUMBLOCKS )
// return pex;
// otherwise, use the sorted list
return GetBaseBlockEx(p->startpc, 0);
return GetBaseBlockEx(pc, 0);
}
////////////////////////////////////////////////////
@ -480,9 +480,6 @@ static void recAlloc()
if ( !( cpucaps.hasStreamingSIMD2Extensions ) )
throw Exception::HardwareDeficiency( _( "Processor doesn't support SSE2" ) );
if( recLUT == NULL )
recLUT = (uptr*) _aligned_malloc( 0x010000 * sizeof(uptr), 16 );
if( recMem == NULL )
{
// Note: the VUrec depends on being able to grab an allocation below the 0x10000000 line,
@ -539,6 +536,9 @@ void recResetEE( void )
memset_8<0xcd, REC_CACHEMEM>(recMem);
memzero_ptr<m_recBlockAllocSize>( m_recBlockAlloc );
memzero_ptr<EE_NUMBLOCKS*sizeof(BASEBLOCKEX)>(recBlocks);
InitRecLUT((BASEBLOCK*)m_recBlockAlloc,
(((Ps2MemSize::Base + Ps2MemSize::Rom + Ps2MemSize::Rom1) / 4)));
if( s_pInstCache )
memset( s_pInstCache, 0, sizeof(EEINST)*s_nInstCacheSize );
@ -552,32 +552,34 @@ void recResetEE( void )
__asm__("emms");
#endif
memzero_ptr<0x010000 * sizeof(uptr)>( recLUT );
memzero_ptr<sizeof recLUT>( recLUT );
for ( int i = 0x0000; i < 0x0200; i++ )
{
recLUT[ i + 0x0000 ] = (uptr)&recRAM[ i << 14 ];
recLUT[ i + 0x2000 ] = (uptr)&recRAM[ i << 14 ];
recLUT[ i + 0x3000 ] = (uptr)&recRAM[ i << 14 ];
RECLUT_SETPAGE(recLUT, i + 0x0000, &recRAM[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0x2000, &recRAM[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0x3000, &recRAM[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0x8000, &recRAM[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0xa000, &recRAM[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0xb000, &recRAM[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0xc000, &recRAM[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0xd000, &recRAM[ i << 14 ]);
}
for ( int i = 0x0000; i < 0x0040; i++ )
{
recLUT[ i + 0x1fc0 ] = (uptr)&recROM[ i << 14 ];
recLUT[ i + 0x9fc0 ] = (uptr)&recROM[ i << 14 ];
recLUT[ i + 0xbfc0 ] = (uptr)&recROM[ i << 14 ];
RECLUT_SETPAGE(recLUT, i + 0x1fc0, &recROM[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0x9fc0, &recROM[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0xbfc0, &recROM[ i << 14 ]);
}
for ( int i = 0x0000; i < 0x0004; i++ )
{
recLUT[ i + 0x1e00 ] = (uptr)&recROM1[ i << 14 ];
recLUT[ i + 0x9e00 ] = (uptr)&recROM1[ i << 14 ];
recLUT[ i + 0xbe00 ] = (uptr)&recROM1[ i << 14 ];
RECLUT_SETPAGE(recLUT, i + 0x1e00, &recROM1[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0x9e00, &recROM1[ i << 14 ]);
RECLUT_SETPAGE(recLUT, i + 0xbe00, &recROM1[ i << 14 ]);
}
memcpy_fast( recLUT + 0x8000, recLUT, 0x2000 * sizeof(uptr) );
memcpy_fast( recLUT + 0xa000, recLUT, 0x2000 * sizeof(uptr) );
// drk||Raziel says this is useful but I'm not sure why. Something to do with forward jumps.
// Anyways, it causes random crashing for some reason, possibly because of memory
// corruption elsewhere in the recs. I can't reproduce the problem here though,
@ -604,7 +606,6 @@ static void recShutdown( void )
ResetBaseBlockEx(0);
SafeSysMunmap( recMem, REC_CACHEMEM );
safe_aligned_free( recLUT );
safe_aligned_free( m_recBlockAlloc );
recRAM = recROM = recROM1 = NULL;
recBlocks = NULL;
@ -645,110 +646,100 @@ u32 g_EEDispatchTemp;
#ifdef _MSC_VER
// jumped to when invalid pc address
// The address for all cleared blocks. It recompiles the current pc and then
// dispatches to the recompiled block address.
static __declspec(naked) void JITCompile()
{
__asm {
// esi = cpuRegs.pc
mov esi, dword ptr [cpuRegs.pc]
// recRecompile(pc): one stack argument, popped by the caller after the call
push esi
call recRecompile
add esp, 4
// NOTE(review): esi is read again after the call, so this assumes recRecompile
// preserves esi (callee-saved under cdecl) -- confirm.
mov ebx, esi
// ecx = recLUT[pc >> 16]
shr esi, 16
mov ecx, dword ptr [recLUT+esi*4]
// indirect jump through the dword stored at ecx + pc*2
jmp dword ptr [ecx+ebx*2]
}
}
// jumped to when an immediate branch (EE side) hasn't been statically linked yet.
// Block is compiled if needed, and the link is made.
// EDX contains the jump addr to modify
static __naked void Dispatcher()
{
// EDX contains the jump addr to modify
__asm push edx
// calc PC_GETBLOCK
s_pDispatchBlock = PC_GETBLOCK(cpuRegs.pc);
if( s_pDispatchBlock->startpc != cpuRegs.pc )
recRecompile(cpuRegs.pc);
__asm
{
mov eax, s_pDispatchBlock
mov eax, dword ptr [eax]
}
#ifdef _DEBUG
__asm mov g_EEDispatchTemp, eax
assert( g_EEDispatchTemp );
#endif
// Modify the prev block's jump address, and jump to the new block:
__asm {
shl eax, 4
pop ecx // x86Ptr[0] to mod
mov edx, eax
sub edx, ecx
sub edx, 4
mov dword ptr [ecx], edx
mov eax, dword ptr [cpuRegs.pc]
mov ebx, eax
shr eax, 16
mov ecx, dword ptr [recLUT+eax*4]
mov eax, dword ptr [ecx+ebx*2]
cmp eax, offset JITCompile
je notcompiled
lea ebx, [eax-4]
sub ebx, edx
mov dword ptr [edx], ebx
jmp eax
align 16
notcompiled:
mov esi, edx
lea edi, [ecx+ebx*2]
push ebx
call recRecompile
add esp, 4
mov eax, dword ptr [edi]
lea ebx, [eax-4]
sub ebx, esi
mov dword ptr [esi], ebx
jmp eax
}
}
// edx - baseblock->startpc
// edx - baseblock->GetStartPC()
// stack - x86Ptr[0]
static __naked void DispatcherClear()
{
// EDX contains the current pc
__asm mov cpuRegs.pc, edx
__asm push edx
// calc PC_GETBLOCK
s_pDispatchBlock = PC_GETBLOCK(cpuRegs.pc);
if( s_pDispatchBlock != NULL && s_pDispatchBlock->startpc == cpuRegs.pc )
{
assert( s_pDispatchBlock->GetFnptr() != 0 );
// already modded the code, jump to the new place
__asm {
pop edx
mov eax, s_pDispatchBlock
add esp, 4 // ignore stack
mov eax, dword ptr [eax]
shl eax, 4
jmp eax
}
}
__asm {
mov [cpuRegs.pc], edx
mov ebx, edx
shr edx, 16
mov ecx, dword ptr [recLUT+edx*4]
mov eax, dword ptr [ecx+ebx*2]
cmp eax, offset JITCompile
je notcompiled
add esp, 4
jmp eax
align 16
notcompiled:
lea edi, [ecx+ebx*2]
push ebx
call recRecompile
add esp, 4 // pop old param
mov eax, s_pDispatchBlock
mov eax, dword ptr [eax]
add esp, 4
mov eax, dword ptr [edi]
pop ecx // old fnptr
shl eax, 4
pop ecx
mov byte ptr [ecx], 0xe9 // jmp32
mov edx, eax
sub edx, ecx
sub edx, 5
mov dword ptr [ecx+1], edx
lea ebx, [eax-5]
sub ebx, ecx
mov dword ptr [ecx+1], ebx
jmp eax
}
}
// called when jumping to variable pc address
static __naked void DispatcherReg()
static void __naked DispatcherReg()
{
s_pDispatchBlock = PC_GETBLOCK(cpuRegs.pc);
if( s_pDispatchBlock->startpc != cpuRegs.pc )
recRecompile(cpuRegs.pc);
__asm
{
mov eax, s_pDispatchBlock
mov eax, dword ptr [eax]
}
#ifdef _DEBUG
__asm mov g_EEDispatchTemp, eax
assert( g_EEDispatchTemp );
#endif
__asm {
shl eax, 4
jmp eax
mov eax, dword ptr [cpuRegs.pc]
mov ebx, eax
shr eax, 16
mov ecx, dword ptr [recLUT+eax*4]
jmp dword ptr [ecx+ebx*2]
}
}
@ -889,21 +880,44 @@ void recBREAK( void ) {
} } } // end namespace R5900::Dynarec::OpcodeImpl
////////////////////////////////////////////////////
// Forwards `mem` to recClearMem() when both of the following hold:
//   - mem is below maxrecmem, and
//   - recLUT[mem >> 16] + mem is non-zero.
// NOTE(review): the second test looks like an "is this page mapped" check, but
// that depends on how the recLUT entries are biased -- confirm.
static void REC_CLEARM( u32 mem )
{
	if (mem >= maxrecmem)
		return;

	// Only read the table once the range check has passed.
	const uptr biasedEntry = recLUT[mem >> 16] + mem;
	if (biasedEntry != 0)
		recClearMem(mem);
}
// Calls REC_CLEARM on Size addresses, starting at Addr and advancing by
// 4 bytes per iteration.
void recClear( u32 Addr, u32 Size )
{
u32 i;
for(i = 0; i < Size; ++i, Addr+=4) {
for(i = 0; i < Size; ++i, Addr+=4)
REC_CLEARM(Addr);
}
// Resets `count` consecutive BASEBLOCK entries, beginning at `base`, so that
// every one of them is mapped to the JIT recompiler by default: the function
// pointer becomes JITCompile, and the start pc and uType are zeroed.
// A non-positive `count` touches nothing.
static void InitRecLUT(BASEBLOCK* base, int count)
{
	for (BASEBLOCK* blk = base; count > 0; ++blk, --count)
	{
		blk->SetFnptr((uptr)JITCompile);
		blk->SetStartPC(0);
		blk->uType = 0;
	}
}
static const int EE_MIN_BLOCK_BYTES = 15;
void recClearMem(BASEBLOCK* p)
void recClearMem(u32 pc)
{
BASEBLOCKEX* pexblock;
BASEBLOCK* pstart;
int lastdelay;
BASEBLOCK* p;
p= PC_GETBLOCK(pc);
pc = p->GetStartPC();
if (!pc)
return;
pexblock = PC_GETBLOCKEX(pc);
if (!pexblock)
return;
pstart = PC_GETBLOCK(pexblock->startpc);
// necessary since recompiler doesn't call femms/emms
#ifdef __INTEL_COMPILER
@ -918,63 +932,26 @@ void recClearMem(BASEBLOCK* p)
__asm__("emms");
#endif
#endif
assert( p != NULL );
if( p->uType & BLOCKTYPE_DELAYSLOT ) {
recClearMem(p-1);
if( p->GetFnptr() == 0 )
return;
for (int i = 0; i < pexblock->size; i++)
{
x86Ptr[0] = (u8*)pstart[i].GetFnptr();
if (x86Ptr[0] == (u8*)JITCompile)
continue;
// there is a small problem: mem can be ored with 0xa<<28 or 0x8<<28, and don't know which
MOV32ItoR(EDX, pexblock->startpc + i*4);
PUSH32I((u32)x86Ptr[0]); // will be replaced by JMP32
JMP32((u32)DispatcherClear - ( (u32)x86Ptr[0] + 5 ));
}
assert( p->GetFnptr() != 0 );
assert( p->startpc );
x86Ptr[0] = (u8*)p->GetFnptr();
// there is a small problem: mem can be ored with 0xa<<28 or 0x8<<28, and don't know which
MOV32ItoR(EDX, p->startpc);
PUSH32I((u32)x86Ptr[0]); // will be replaced by JMP32
JMP32((u32)DispatcherClear - ( (u32)x86Ptr[0] + 5 ));
assert( x86Ptr[0] == (u8*)p->GetFnptr() + EE_MIN_BLOCK_BYTES );
pstart = PC_GETBLOCK(p->startpc);
pexblock = PC_GETBLOCKEX(pstart);
assert( pexblock->startpc == pstart->startpc );
if( pexblock->startpc != pstart->startpc ) {
// some bug with ffx after beating a big snake in sewers
RemoveBaseBlockEx(pexblock, 0);
pexblock->size = 0;
pexblock->startpc = 0;
return;
}
// don't delete if last is delay
lastdelay = pexblock->size;
if( pstart[pexblock->size-1].uType & BLOCKTYPE_DELAYSLOT ) {
assert( pstart[pexblock->size-1].GetFnptr() != pstart->GetFnptr() );
if( pstart[pexblock->size-1].GetFnptr() != 0 ) {
pstart[pexblock->size-1].uType = 0;
--lastdelay;
}
}
memset(pstart, 0, lastdelay*sizeof(BASEBLOCK));
InitRecLUT(pstart, pexblock->size);
RemoveBaseBlockEx(pexblock, 0);
pexblock->size = 0;
pexblock->startpc = 0;
}
void REC_CLEARM( u32 mem )
{
if ((mem) < maxrecmem && recLUT[(mem) >> 16]) {
BASEBLOCK* p = PC_GETBLOCK(mem);
if( *(u32*)p ) recClearMem(p);
}
}
// check for end of bios
void CheckForBIOSEnd()
{
@ -1283,8 +1260,8 @@ u32 recompileCodeSafe(u32 temppc)
{
BASEBLOCK* pblock = PC_GETBLOCK(temppc);
if( pblock->GetFnptr() != 0 && pblock->startpc != s_pCurBlock->startpc ) {
if( pc == pblock->startpc )
if( pblock->GetFnptr() != (uptr)JITCompile && pblock->GetStartPC() != s_pCurBlock->GetStartPC() ) {
if( pc == pblock->GetStartPC() )
return 0;
}
@ -1299,32 +1276,26 @@ void recompileNextInstruction(int delayslot)
BASEBLOCK* pblock = PC_GETBLOCK(pc);
// need *ppblock != s_pCurBlock because of branches
if( pblock->GetFnptr() != 0 && pblock->startpc != s_pCurBlock->startpc ) {
if( !delayslot && pc == pblock->startpc ) {
if( pblock->GetFnptr() != (uptr)JITCompile && pblock->GetStartPC() != s_pCurBlock->GetStartPC() )
{
if( !delayslot && pc == pblock->GetStartPC() )
{
// code already in place, so jump to it and exit recomp
assert( PC_GETBLOCKEX(pblock)->startpc == pblock->startpc );
assert( PC_GETBLOCKEX(pc)->startpc == pblock->GetStartPC() );
iFlushCall(FLUSH_EVERYTHING);
MOV32ItoM((uptr)&cpuRegs.pc, pc);
// if( pexblock->pOldFnptr ) {
// // code already in place, so jump to it and exit recomp
// JMP32((u32)pexblock->pOldFnptr - ((u32)x86Ptr[0] + 5));
// branch = 3;
// return;
// }
MOV32ItoM((uptr)&cpuRegs.pc, pc);
JMP32((uptr)pblock->GetFnptr() - ((uptr)x86Ptr[0] + 5));
branch = 3;
return;
}
else {
if( !(delayslot && pblock->startpc == pc) ) {
else
{
if( !(delayslot && pblock->GetStartPC() == pc) )
{
u8* oldX86 = x86Ptr[0];
//__Log("clear block %x\n", pblock->startpc);
recClearMem(pblock);
//__Log("clear block %x\n", pblock->GetStartPC());
recClearMem(pc);
x86Ptr[0] = oldX86;
if( delayslot )
Console::Notice("delay slot %x", params pc);
@ -1332,8 +1303,10 @@ void recompileNextInstruction(int delayslot)
}
}
#if 1
if( delayslot )
pblock->uType = BLOCKTYPE_DELAYSLOT;
#endif
s_pCode = (int *)PSM( pc );
assert(s_pCode);
@ -1368,14 +1341,6 @@ void recompileNextInstruction(int delayslot)
g_pCurInstInfo++;
// reorder register priorities
// for(i = 0; i < X86REGS; ++i) {
// if( x86regs[i].inuse ) {
// if( count > 0 ) mmxregs[i].counter = 1000-count;
// else mmxregs[i].counter = 0;
// }
// }
for(i = 0; i < MMXREGS; ++i) {
if( mmxregs[i].inuse ) {
assert( MMX_ISGPR(mmxregs[i].reg) );
@ -1541,14 +1506,14 @@ void recRecompile( const u32 startpc )
s_pCurBlock = PC_GETBLOCK(startpc);
if( s_pCurBlock->GetFnptr() ) {
if( s_pCurBlock->GetFnptr() != (uptr)JITCompile ) {
// clear if already taken
assert( s_pCurBlock->startpc < startpc );
recClearMem(s_pCurBlock);
assert( s_pCurBlock->GetStartPC() < startpc );
recClearMem(startpc);
}
if( s_pCurBlock->startpc == startpc ) {
s_pCurBlockEx = PC_GETBLOCKEX(s_pCurBlock);
if( s_pCurBlock->GetStartPC() == startpc ) {
s_pCurBlockEx = PC_GETBLOCKEX(startpc);
assert( s_pCurBlockEx->startpc == startpc );
}
else {
@ -1575,7 +1540,7 @@ void recRecompile( const u32 startpc )
x86Align(16);
recPtr = x86Ptr[0];
s_pCurBlock->SetFnptr( (uptr)x86Ptr[0] );
s_pCurBlock->startpc = startpc;
s_pCurBlock->SetStartPC(startpc);
branch = 0;
@ -1612,9 +1577,9 @@ void recRecompile( const u32 startpc )
while(1) {
BASEBLOCK* pblock = PC_GETBLOCK(i);
if( pblock->GetFnptr() != 0 && pblock->startpc != s_pCurBlock->startpc ) {
if( pblock->GetFnptr() != (uptr)JITCompile && pblock->GetStartPC() != s_pCurBlock->GetStartPC() ) {
if( i == pblock->startpc ) {
if( i == pblock->GetStartPC() ) {
// branch = 3
willbranch3 = 1;
s_nEndBlock = i;
@ -1943,29 +1908,32 @@ StartRecomp:
assert( (pc-startpc)>>2 <= 0xffff );
s_pCurBlockEx->size = (pc-startpc)>>2;
for(i = 1; i <= (u32)s_pCurBlockEx->size-1; ++i) {
if (!s_pCurBlock[i].GetStartPC())
s_pCurBlock[i].SetStartPC( startpc );
}
// This is just wrong, right? How can setting a jump to any point in this block
// to jump to the beginning of the block possibly be right? -pseudonym
// - Jumping to the beginning of the block will work fine so long as the registers
// are flushed first before the jump is made. Of course that's how all static
// links work so I still don't see the point of any complication for it -air
#ifdef ZERO_TOLERANCE
for(i = 1; i < (u32)s_pCurBlockEx->size-1; ++i) {
s_pCurBlock[i].SetFnptr( s_pCurBlock->GetFnptr() );
s_pCurBlock[i].startpc = s_pCurBlock->startpc;
s_pCurBlock[i].SetStartPC( p_CurBlock->startpc );
}
// don't overwrite if delay slot
if( i < (u32)s_pCurBlockEx->size && !(s_pCurBlock[i].uType & BLOCKTYPE_DELAYSLOT) ) {
s_pCurBlock[i].SetFnptr( s_pCurBlock->GetFnptr() );
s_pCurBlock[i].startpc = s_pCurBlock->startpc;
s_pCurBlock[i].SetFnptr(0);
s_pCurBlock[i].SetStartPC(0);
}
#endif
// set the block ptr
AddBaseBlockEx(s_pCurBlockEx, 0);
// if( p[1].startpc == p[0].startpc + 4 ) {
// assert( p[1].GetFnptr() != 0 );
// // already fn in place, so add to list
// AddBaseBlockEx(s_pCurBlockEx, 0);
// }
// else
// *(BASEBLOCKEX**)(p+1) = pex;
// }
//PC_SETBLOCKEX(s_pCurBlock, s_pCurBlockEx);
if( !(pc&0x10000000) )
maxrecmem = std::max( (pc&~0xa0000000), maxrecmem );
@ -2007,7 +1975,6 @@ StartRecomp:
}
}
assert( x86Ptr[0] >= (u8*)s_pCurBlock->GetFnptr() + EE_MIN_BLOCK_BYTES );
assert( x86Ptr[0] < recMem+REC_CACHEMEM );
assert( recStackPtr < recStack+RECSTACK_SIZE );
assert( x86FpuState == 0 );
@ -2021,18 +1988,18 @@ StartRecomp:
u32 nEndBlock = s_nEndBlock;
s_pCurBlock = PC_GETBLOCK(pc);
assert( ptr != NULL );
if( s_pCurBlock->startpc != pc )
if( s_pCurBlock->GetStartPC() != pc )
recRecompile(pc);
if( pcurblock->startpc == startpc ) {
assert( pcurblock->GetFnptr() );
assert( s_pCurBlock->startpc == nEndBlock );
if( pcurblock->GetStartPC() == startpc ) {
assert( pcurblock->GetFnptr() != (uptr)JITCompile );
assert( s_pCurBlock->GetStartPC() == nEndBlock );
*ptr = s_pCurBlock->GetFnptr() - ( (u32)ptr + 4 );
}
else {
recRecompile(startpc);
assert( pcurblock->GetFnptr() != 0 );
assert( pcurblock->GetFnptr() != (uptr)JITCompile );
}
}
}