/* PCSX2 - PS2 Emulator for PCs * Copyright (C) 2002-2009 PCSX2 Dev Team * * PCSX2 is free software: you can redistribute it and/or modify it under the terms * of the GNU Lesser General Public License as published by the Free Software Found- * ation, either version 3 of the License, or (at your option) any later version. * * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with PCSX2. * If not, see <http://www.gnu.org/licenses/>. */ /* * ix86 core v0.9.1 * * Original Authors (v0.6.2 and prior): * linuzappz * alexey silinov * goldfinger * zerofrog(@gmail.com) * * Authors of v0.9.1: * Jake.Stine(@gmail.com) * cottonvibes(@gmail.com) * sudonim(1@gmail.com) */ #include "PrecompiledHeader.h" #include "internal.h" // defined in tools.cpp //extern __aligned16 u64 g_globalXMMData[2*iREGCNT_XMM]; #include "tools.h" // ------------------------------------------------------------------------ // Notes on Thread Local Storage: // * TLS is pretty simple, and "just works" from a programmer perspective, with only // some minor additional computational overhead (see performance notes below). // // * MSVC and GCC handle TLS differently internally, but behavior to the programmer is // generally identical. // // Performance Considerations: // * GCC's implementation involves an extra dereference from normal storage (possibly // applies to x86-32 only -- x86-64 is untested). // // * MSVC's implementation involves *two* extra dereferences from normal storage because // it has to look up the TLS heap pointer from the Windows Thread Storage Area. 
(in // generated ASM code, this dereference is denoted by access to the fs:[2ch] address), // // * However, in either case, the optimizer usually optimizes it to a register so the // extra overhead is minimal over a series of instructions. // // MSVC Notes: // * Important!! the Full Optimization [/Ox] option effectively disables TLS optimizations // in MSVC 2008 and earlier, causing generally significant code bloat. Not tested in // VC2010 yet. // // * VC2010 generally does a superior job of optimizing TLS across inlined functions and // class methods, compared to predecessors. // __tls_emit u8* x86Ptr; __tls_emit XMMSSEType g_xmmtypes[iREGCNT_XMM] = { XMMT_INT }; namespace x86Emitter { template void xWrite( u8 val ); template void xWrite( u16 val ); template void xWrite( u32 val ); template void xWrite( u64 val ); template void xWrite( u128 val ); __forceinline void xWrite8( u8 val ) { xWrite( val ); } __forceinline void xWrite16( u16 val ) { xWrite( val ); } __forceinline void xWrite32( u32 val ) { xWrite( val ); } __forceinline void xWrite64( u64 val ) { xWrite( val ); } // Empty initializers are due to frivolously pointless GCC errors (it demands the // objects be initialized even though they have no actual variable members). 
const xAddressIndexer ptr = { }; const xAddressIndexer ptr128 = { }; const xAddressIndexer ptr64 = { }; const xAddressIndexer ptr32 = { }; const xAddressIndexer ptr16 = { }; const xAddressIndexer ptr8 = { }; // ------------------------------------------------------------------------ const xRegisterEmpty xEmptyReg = { }; const xRegisterSSE xmm0( 0 ), xmm1( 1 ), xmm2( 2 ), xmm3( 3 ), xmm4( 4 ), xmm5( 5 ), xmm6( 6 ), xmm7( 7 ); const xRegisterMMX mm0( 0 ), mm1( 1 ), mm2( 2 ), mm3( 3 ), mm4( 4 ), mm5( 5 ), mm6( 6 ), mm7( 7 ); const xAddressReg eax( 0 ), ebx( 3 ), ecx( 1 ), edx( 2 ), esp( 4 ), ebp( 5 ), esi( 6 ), edi( 7 ); const xRegister16 ax( 0 ), bx( 3 ), cx( 1 ), dx( 2 ), sp( 4 ), bp( 5 ), si( 6 ), di( 7 ); const xRegister8 al( 0 ), dl( 2 ), bl( 3 ), ah( 4 ), ch( 5 ), dh( 6 ), bh( 7 ); const xRegisterCL cl; const char *const x86_regnames_gpr8[8] = { "al", "cl", "dl", "bl", "ah", "ch", "dh", "bh" }; const char *const x86_regnames_gpr16[8] = { "ax", "cx", "dx", "bx", "sp", "bp", "si", "di" }; const char *const x86_regnames_gpr32[8] = { "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi" }; const char *const x86_regnames_sse[8] = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" }; const char *const x86_regnames_mmx[8] = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" }; const char* xRegisterBase::GetName() { if( Id == xRegId_Invalid ) return "invalid"; if( Id == xRegId_Empty ) return "empty"; // bad error? Return a "big" error string. Might break formatting of register tables // but that's the least of your worries if you see this baby. 
if( Id >= 8 || Id <= -3 ) return "!Register index out of range!"; switch( GetOperandSize() ) { case 1: return x86_regnames_gpr8[ Id ]; case 2: return x86_regnames_gpr16[ Id ]; case 4: return x86_regnames_gpr32[ Id ]; case 8: return x86_regnames_mmx[ Id ]; case 16: return x86_regnames_sse[ Id ]; } return "oops?"; } ////////////////////////////////////////////////////////////////////////////////////////// // Performance note: VC++ wants to use byte/word register form for the following // ModRM/SibSB constructors when we use xWrite, and furthermore unrolls the // the shift using a series of ADDs for the following results: // add cl,cl // add cl,cl // add cl,cl // or cl,bl // add cl,cl // ... etc. // // This is unquestionably bad optimization by Core2 standard, an generates tons of // register aliases and false dependencies. (although may have been ideal for early- // brand P4s with a broken barrel shifter?). The workaround is to do our own manual // x86Ptr access and update using a u32 instead of u8. Thanks to little endianness, // the same end result is achieved and no false dependencies are generated. The draw- // back is that it clobbers 3 bytes past the end of the write, which could cause a // headache for someone who himself is doing some kind of headache-inducing amount of // recompiler SMC. So we don't do a work-around, and just hope for the compiler to // stop sucking someday instead. 
:) // // (btw, I know this isn't a critical performance item by any means, but it's // annoying simply because it *should* be an easy thing to optimize) static __forceinline void ModRM( uint mod, uint reg, uint rm ) { xWrite8( (mod << 6) | (reg << 3) | rm ); } static __forceinline void SibSB( u32 ss, u32 index, u32 base ) { xWrite8( (ss << 6) | (index << 3) | base ); } void EmitSibMagic( uint regfield, const void* address ) { ModRM( 0, regfield, ModRm_UseDisp32 ); xWrite( (s32)address ); } ////////////////////////////////////////////////////////////////////////////////////////// // emitter helpers for xmm instruction with prefixes, most of which are using // the basic opcode format (items inside braces denote optional or conditional // emission): // // [Prefix] / 0x0f / [OpcodePrefix] / Opcode / ModRM+[SibSB] // // Prefixes are typically 0x66, 0xf2, or 0xf3. OpcodePrefixes are either 0x38 or // 0x3a [and other value will result in assertion failue]. // __emitinline void xOpWrite0F( u8 prefix, u16 opcode, int instId, const ModSibBase& sib ) { SimdPrefix( prefix, opcode ); EmitSibMagic( instId, sib ); } __emitinline void xOpWrite0F( u8 prefix, u16 opcode, int instId, const void* data ) { SimdPrefix( prefix, opcode ); EmitSibMagic( instId, data ); } __emitinline void xOpWrite0F( u16 opcode, int instId, const ModSibBase& sib ) { xOpWrite0F( 0, opcode, instId, sib ); } ////////////////////////////////////////////////////////////////////////////////////////// // returns TRUE if this instruction requires SIB to be encoded, or FALSE if the // instruction ca be encoded as ModRm alone. static __forceinline bool NeedsSibMagic( const ModSibBase& info ) { // no registers? no sibs! 
// (ModSibBase::Reduce always places a register in Index, and optionally leaves // Base empty if only register is specified) if( info.Index.IsEmpty() ) return false; // A scaled register needs a SIB if( info.Scale != 0 ) return true; // two registers needs a SIB if( !info.Base.IsEmpty() ) return true; return false; } ////////////////////////////////////////////////////////////////////////////////////////// // Conditionally generates Sib encoding information! // // regfield - register field to be written to the ModRm. This is either a register specifier // or an opcode extension. In either case, the instruction determines the value for us. // void EmitSibMagic( uint regfield, const ModSibBase& info ) { pxAssertDev( regfield < 8, "Invalid x86 register identifier." ); int displacement_size = (info.Displacement == 0) ? 0 : ( ( info.IsByteSizeDisp() ) ? 1 : 2 ); if( !NeedsSibMagic( info ) ) { // Use ModRm-only encoding, with the rm field holding an index/base register, if // one has been specified. If neither register is specified then use Disp32 form, // which is encoded as "EBP w/o displacement" (which is why EBP must always be // encoded *with* a displacement of 0, if it would otherwise not have one). if( info.Index.IsEmpty() ) { EmitSibMagic( regfield, (void*)info.Displacement ); return; } else { if( info.Index == ebp && displacement_size == 0 ) displacement_size = 1; // forces [ebp] to be encoded as [ebp+0]! ModRM( displacement_size, regfield, info.Index.Id ); } } else { // In order to encode "just" index*scale (and no base), we have to encode // it as a special [index*scale + displacement] form, which is done by // specifying EBP as the base register and setting the displacement field // to zero. (same as ModRm w/o SIB form above, basically, except the // ModRm_UseDisp flag is specified in the SIB instead of the ModRM field). 
if( info.Base.IsEmpty() ) { ModRM( 0, regfield, ModRm_UseSib ); SibSB( info.Scale, info.Index.Id, ModRm_UseDisp32 ); xWrite( info.Displacement ); return; } else { if( info.Base == ebp && displacement_size == 0 ) displacement_size = 1; // forces [ebp] to be encoded as [ebp+0]! ModRM( displacement_size, regfield, ModRm_UseSib ); SibSB( info.Scale, info.Index.Id, info.Base.Id ); } } if( displacement_size != 0 ) { if( displacement_size == 1 ) xWrite( info.Displacement ); else xWrite( info.Displacement ); } } // Writes a ModRM byte for "Direct" register access forms, which is used for all // instructions taking a form of [reg,reg]. void EmitSibMagic( uint reg1, const xRegisterBase& reg2 ) { xWrite8( (Mod_Direct << 6) | (reg1 << 3) | reg2.Id ); } void EmitSibMagic( const xRegisterBase& reg1, const xRegisterBase& reg2 ) { xWrite8( (Mod_Direct << 6) | (reg1.Id << 3) | reg2.Id ); } void EmitSibMagic( const xRegisterBase& reg1, const void* src ) { EmitSibMagic( reg1.Id, src ); } void EmitSibMagic( const xRegisterBase& reg1, const ModSibBase& sib ) { EmitSibMagic( reg1.Id, sib ); } // -------------------------------------------------------------------------------------- // xSetPtr / xAlignPtr / xGetPtr / xAdvancePtr // -------------------------------------------------------------------------------------- // Assigns the current emitter buffer target address. // This is provided instead of using x86Ptr directly, since we may in the future find // a need to change the storage class system for the x86Ptr 'under the hood.' __emitinline void xSetPtr( void* ptr ) { x86Ptr = (u8*)ptr; } // Retrieves the current emitter buffer target address. // This is provided instead of using x86Ptr directly, since we may in the future find // a need to change the storage class system for the x86Ptr 'under the hood.' 
/* NOTE(review): this span of the file is corrupted in this copy.  All newlines
   were collapsed, and a large run of code was eaten by an angle-bracket-stripping
   pass: everything between "for( uint i=0; i" (inside xAdvancePtr's fill loop)
   and the trailing "( src.Displacement )" (the displacement writes at the end of
   EmitLeaMagic) is missing -- that gap swallowed the rest of xAdvancePtr and
   essentially all of EmitLeaMagic's body.  The xWrite calls that survive have
   also lost their <s8>/<s32> template arguments.  The text is preserved
   byte-for-byte below; restore this region from version control rather than
   hand-reconstructing it. */
__emitinline u8* xGetPtr() { return x86Ptr; } __emitinline void xAlignPtr( uint bytes ) { // forward align x86Ptr = (u8*)( ( (uptr)x86Ptr + bytes - 1) & ~(bytes - 1) ); } // Performs best-case alignment for the target CPU, for use prior to starting a new // function. This is not meant to be used prior to jump targets, since it doesn't // add padding (additionally, speed benefit from jump alignment is minimal, and often // a loss). __emitinline void xAlignCallTarget() { // Core2/i7 CPUs prefer unaligned addresses. Checking for SSSE3 is a decent filter. // (also align in debug modes for disasm convenience) if( IsDebugBuild || !x86caps.hasSupplementalStreamingSIMD3Extensions ) { // - P4's and earlier prefer 16 byte alignment. // - AMD Athlons and Phenoms prefer 8 byte alignment, but I don't have an easy // heuristic for it yet. // - AMD Phenom IIs are unknown (either prefer 8 byte, or unaligned). xAlignPtr( 16 ); } } __emitinline u8* xGetAlignedCallTarget() { xAlignCallTarget(); return x86Ptr; } __emitinline void xAdvancePtr( uint bytes ) { if( IsDevBuild ) { // common debugger courtesy: advance with INT3 as filler. 
for( uint i=0; i( src.Displacement ); else xWrite( src.Displacement ); } } __emitinline void xLEA( xRegister32 to, const ModSibBase& src, bool preserve_flags ) { EmitLeaMagic( to, src, preserve_flags ); } __emitinline void xLEA( xRegister16 to, const ModSibBase& src, bool preserve_flags ) { xWrite8( 0x66 ); EmitLeaMagic( to, src, preserve_flags ); } // ===================================================================================================== // TEST / INC / DEC // ===================================================================================================== void xImpl_Test::operator()( const xRegister8& to, const xRegister8& from ) const { xWrite8( 0x84 ); EmitSibMagic( from, to ); } void xImpl_Test::operator()( const xRegister16& to, const xRegister16& from ) const { to.prefix16(); xWrite8( 0x85 ); EmitSibMagic( from, to ); } void xImpl_Test::operator()( const xRegister32& to, const xRegister32& from ) const { xWrite8( 0x85 ); EmitSibMagic( from, to ); } void xImpl_Test::operator()( const ModSib32orLess& dest, int imm ) const { dest.prefix16(); xWrite8( dest.Is8BitOp() ? 0xf6 : 0xf7 ); EmitSibMagic( 0, dest ); dest.xWriteImm( imm ); } void xImpl_Test::operator()( const xRegisterInt& to, int imm ) const { to.prefix16(); if( to.IsAccumulator() ) xWrite8( to.Is8BitOp() ? 0xa8 : 0xa9 ); else { xWrite8( to.Is8BitOp() ? 0xf6 : 0xf7 ); EmitSibMagic( 0, to ); } to.xWriteImm( imm ); } void xImpl_BitScan::operator()( const xRegister32& to, const xRegister32& from ) const { xOpWrite0F( Opcode, to, from ); } void xImpl_BitScan::operator()( const xRegister16& to, const xRegister16& from ) const { xOpWrite0F( 0x66, Opcode, to, from ); } void xImpl_BitScan::operator()( const xRegister16or32& to, const ModSibBase& sibsrc ) const { xOpWrite0F( (to->GetOperandSize() == 2) ? 0x66 : 0x00, Opcode, to, sibsrc ); } void xImpl_IncDec::operator()( const xRegisterInt& to ) const { if( to.Is8BitOp() ) { xWrite8( 0xfe ); EmitSibMagic( isDec ? 
1 : 0, to ); } else { to.prefix16(); xWrite8( (isDec ? 0x48 : 0x40) | to.Id ); } } void xImpl_IncDec::operator()( const ModSib32orLess& to ) const { to.prefix16(); xWrite8( to.Is8BitOp() ? 0xfe : 0xff ); EmitSibMagic( isDec ? 1 : 0, to ); } void xImpl_DwordShift::operator()( const xRegister32& to, const xRegister32& from, const xRegisterCL& /* clreg */ ) const { xOpWrite0F( OpcodeBase+1, to, from ); } void xImpl_DwordShift::operator()( const xRegister16& to, const xRegister16& from, const xRegisterCL& /* clreg */ ) const { xOpWrite0F( 0x66, OpcodeBase+1, to, from ); } void xImpl_DwordShift::operator()( const xRegister32& to, const xRegister32& from, u8 shiftcnt ) const { if( shiftcnt != 0 ) xOpWrite0F( OpcodeBase, to, from ); } void xImpl_DwordShift::operator()( const xRegister16& to, const xRegister16& from, u8 shiftcnt ) const { if( shiftcnt != 0 ) xOpWrite0F( 0x66, OpcodeBase, to, from ); } void xImpl_DwordShift::operator()( const ModSibBase& dest, const xRegister16or32& from, const xRegisterCL& /* clreg */ ) const { xOpWrite0F( (from->GetOperandSize() == 2) ? 0x66 : 0x00, OpcodeBase, from, dest ); } void xImpl_DwordShift::operator()( const ModSibBase& dest, const xRegister16or32& from, u8 shiftcnt ) const { if( shiftcnt != 0 ) xOpWrite0F( (from->GetOperandSize() == 2) ? 0x66 : 0x00, OpcodeBase, from, dest, shiftcnt ); } const xImpl_Test xTEST = { }; const xImpl_BitScan xBSF = { 0xbc }; const xImpl_BitScan xBSR = { 0xbd }; const xImpl_IncDec xINC = { false }; const xImpl_IncDec xDEC = { true }; const xImpl_DwordShift xSHLD = { 0xa4 }; const xImpl_DwordShift xSHRD = { 0xac }; ////////////////////////////////////////////////////////////////////////////////////////// // Push / Pop Emitters // // Note: pushad/popad implementations are intentionally left out. The instructions are // invalid in x64, and are super slow on x32. Use multiple Push/Pop instructions instead. 
__emitinline void xPOP( const ModSibBase& from ) { xWrite8( 0x8f ); EmitSibMagic( 0, from ); } __emitinline void xPUSH( const ModSibBase& from ) { xWrite8( 0xff ); EmitSibMagic( 6, from ); } __forceinline void xPOP( xRegister32 from ) { xWrite8( 0x58 | from.Id ); } __forceinline void xPUSH( u32 imm ) { xWrite8( 0x68 ); xWrite32( imm ); } __forceinline void xPUSH( xRegister32 from ) { xWrite8( 0x50 | from.Id ); } // pushes the EFLAGS register onto the stack __forceinline void xPUSHFD() { xWrite8( 0x9C ); } // pops the EFLAGS register from the stack __forceinline void xPOPFD() { xWrite8( 0x9D ); } ////////////////////////////////////////////////////////////////////////////////////////// // __forceinline void xLEAVE() { xWrite8( 0xC9 ); } __forceinline void xRET() { xWrite8( 0xC3 ); } __forceinline void xCBW() { xWrite16( 0x9866 ); } __forceinline void xCWD() { xWrite8( 0x98 ); } __forceinline void xCDQ() { xWrite8( 0x99 ); } __forceinline void xCWDE() { xWrite8( 0x98 ); } __forceinline void xLAHF() { xWrite8( 0x9f ); } __forceinline void xSAHF() { xWrite8( 0x9e ); } __forceinline void xSTC() { xWrite8( 0xF9 ); } __forceinline void xCLC() { xWrite8( 0xF8 ); } // NOP 1-byte __forceinline void xNOP() { xWrite8(0x90); } __emitinline void xBSWAP( const xRegister32& to ) { xWrite8( 0x0F ); xWrite8( 0xC8 | to.Id ); } __emitinline void xStoreReg( const xRegisterSSE& src ) { xMOVDQA( &XMMRegisters::data[src.Id*2], src ); } __emitinline void xRestoreReg( const xRegisterSSE& dest ) { xMOVDQA( dest, &XMMRegisters::data[dest.Id*2] ); } }