From 71e2dc2fb4d0454f094dfc0039f0f9c0a62a61fd Mon Sep 17 00:00:00 2001
From: "Jake.Stine"
Date: Sun, 8 Mar 2009 20:10:09 +0000
Subject: [PATCH] Many small bugfixes and optimizations:

* Fixed Memcard init so that Memcard1 isn't default in both slots (oops!)
* Fixed Memcard path logic so that cards outside your pcsx2 folder can be browsed/selected.
* Fixed CDVD-to-BIOS time sync (I simply forgot a function call!)
* Optimized yuv2rgb_sse2, by using Mod/RM form instructions.
* Win32: Same optimization applied to FreezeXMMRegs and FreezeMMXRegs (linux already had this optimization)

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@719 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 common/include/PS2Etypes.h     |  14 ++-
 pcsx2/CDVD.cpp                 |   4 +-
 pcsx2/IPU/yuv2rgb.cpp          | 154 +++++++++++++++++++----------
 pcsx2/IPU/yuv2rgb.h            |   4 +-
 pcsx2/PrecompiledHeader.h      |  14 +++
 pcsx2/System.cpp               |   2 +-
 pcsx2/Vif.cpp                  |  17 +---
 pcsx2/windows/McdConfigDlg.cpp |   8 +-
 pcsx2/x86/ix86/ix86_tools.cpp  | 173 ++++++++++++++++++---------------
 9 files changed, 230 insertions(+), 160 deletions(-)

diff --git a/common/include/PS2Etypes.h b/common/include/PS2Etypes.h
index e285db8ede..258deaaded 100644
--- a/common/include/PS2Etypes.h
+++ b/common/include/PS2Etypes.h
@@ -31,13 +31,7 @@
 #define ArraySize(x) (sizeof(x)/sizeof((x)[0]))
 #endif
 
-#ifdef __LINUX__
-#define CALLBACK
-#else
-#define CALLBACK __stdcall
-#endif
-
-
+//////////////////////////////////////////////////////////////////////////////////////////
 // jASSUME - give hints to the optimizer
 // This is primarily useful for the default case switch optimizer, which enables VC to
 // generate more compact switches.
@@ -68,8 +62,9 @@
 		default: \
 			break; \
 	}
 
+//////////////////////////////////////////////////////////////////////////////////////////
+// Basic Atomic Types
 
-// Basic types
 #if defined(_MSC_VER)
 
 typedef __int8 s8;
@@ -92,6 +87,7 @@ typedef unsigned int uint;
 #define PCSX2_ALIGNED16_EXTERN(x) extern __declspec(align(16)) x
 
 #define __naked __declspec(naked)
+#define CALLBACK __stdcall
 
 #else // _MSC_VER
 
@@ -140,6 +136,7 @@ typedef union _LARGE_INTEGER
 #define _inline __inline__ __attribute__((unused))
 #define __forceinline __attribute__((always_inline,unused))
 #define __naked		// GCC lacks the naked specifier
+#define CALLBACK	// CALLBACK is win32-specific mess
 
 #endif // __LINUX__
 
@@ -164,6 +161,7 @@ typedef s32 sptr;
 #endif
 #endif
 
+//////////////////////////////////////////////////////////////////////////////////////////
 // A rough-and-ready cross platform 128-bit datatype, Non-SSE style.
 #ifdef __cplusplus
 struct u128
diff --git a/pcsx2/CDVD.cpp b/pcsx2/CDVD.cpp
index 1747d8aa18..35c76b4387 100644
--- a/pcsx2/CDVD.cpp
+++ b/pcsx2/CDVD.cpp
@@ -692,7 +692,7 @@ __forceinline void cdvdGetDiskType()
 // gets value for start lsn of layer1
 // returns: 1 if on dual layer disc
 //          0 if not on dual layer disc
-s32 cdvdReadDvdDualInfo(s32* dualType, u32* layer1Start)
+static s32 cdvdReadDvdDualInfo(s32* dualType, u32* layer1Start)
 {
 	u8 toc[2064];
 	*dualType = 0;
@@ -754,6 +754,8 @@ void cdvdReset()
 	cdvd.RTC.day = 25;
 	cdvd.RTC.month = 5;
 	cdvd.RTC.year = 7; //2007
+
+	cdvdSetSystemTime( cdvd );
 }
 
 struct Freeze_v10Compat
diff --git a/pcsx2/IPU/yuv2rgb.cpp b/pcsx2/IPU/yuv2rgb.cpp
index fde1419912..c9131474e5 100644
--- a/pcsx2/IPU/yuv2rgb.cpp
+++ b/pcsx2/IPU/yuv2rgb.cpp
@@ -21,35 +21,74 @@
 
 #include "PrecompiledHeader.h"
 
-#include "System.h"
+#include "Misc.h"
 #include "IPU.h"
 #include "yuv2rgb.h"
 
 // Everything below is bit accurate to the IPU specification (except maybe rounding).
 // Know the specification before you touch it.
 
-PCSX2_ALIGNED16(u16 C_bias[8])  = {0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000};
-PCSX2_ALIGNED16(u8 Y_bias[16])  = {16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
-#define SSE_COEFFICIENTS(name, x) \
-	PCSX2_ALIGNED16(u16 name[8]) = {x<<2,x<<2,x<<2,x<<2,x<<2,x<<2,x<<2,x<<2};
-SSE_COEFFICIENTS(Y_coefficients, 0x95);		// 1.1640625
-SSE_COEFFICIENTS(RCr_coefficients, 0xcc);	// 1.59375
-SSE_COEFFICIENTS(GCr_coefficients, (-0x68));	// -0.8125
-SSE_COEFFICIENTS(GCb_coefficients, (-0x32));	// -0.390625
-SSE_COEFFICIENTS(BCb_coefficients, 0x102);	// 2.015625
-PCSX2_ALIGNED16(u16 Y_mask[8]) = {0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00};
-// Specifying round off instead of round down as everywhere else
-// implies that this is right
-PCSX2_ALIGNED16(u16 round_1bit[8]) = {1,1,1,1,1,1,1,1};
-PCSX2_ALIGNED16(u16 yuv2rgb_temp[3][8]);
+#define SSE_COEFFICIENTS(x) \
+	{(x)<<2,(x)<<2,(x)<<2,(x)<<2,(x)<<2,(x)<<2,(x)<<2,(x)<<2}
+
+struct SSE2_Tables
+{
+	u16 C_bias[8];			// offset -64
+	u8 Y_bias[16];			// offset -48
+	u16 Y_mask[8];			// offset -32
+	u16 round_1bit[8];		// offset -16
+
+	u16 Y_coefficients[8];		// offset 0
+	u16 GCr_coefficients[8];	// offset 16
+	u16 GCb_coefficients[8];	// offset 32
+	u16 RCr_coefficients[8];	// offset 48
+	u16 BCb_coefficients[8];	// offset 64
+};
+
+#define C_BIAS		(-64)
+#define Y_BIAS		(-48)
+#define Y_MASK		(-32)
+#define ROUND_1BIT	(-16)
+
+#define Y_COEFF		0
+#define GCr_COEFF	16
+#define GCb_COEFF	32
+#define RCr_COEFF	48
+#define BCb_COEFF	64
+
+static PCSX2_ALIGNED16(const SSE2_Tables sse2_tables) =
+{
+	{0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000},	// c_bias
+	{16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16},		// y_bias
+	{0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00},	// y_mask
+
+	// Specifying round off instead of round down as everywhere else
+	// implies that this is right
+	{1,1,1,1,1,1,1,1},		// round_1bit
+
+	SSE_COEFFICIENTS(0x95),		// 1.1640625 [Y_coefficients]
+	SSE_COEFFICIENTS(-0x68),	// -0.8125 [GCr_coefficients]
+	SSE_COEFFICIENTS(-0x32),	// -0.390625 [GCb_coefficients]
+	SSE_COEFFICIENTS(0xcc),		// 1.59375 [RCr_coefficients]
+	SSE_COEFFICIENTS(0x102),	// 2.015625 [BCb_coefficients]
+};
+
+static PCSX2_ALIGNED16(u16 yuv2rgb_temp[3][8]);
 
 // This could potentially be improved for SSE4
-void yuv2rgb_sse2(void)
+__releaseinline void yuv2rgb_sse2(void)
 {
+	FreezeXMMRegs(1);
+
 #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
 	__asm {
 		mov eax, 1
-		mov esi, 0
-		mov edi, 0
+		xor esi, esi
+		xor edi, edi
+
+		// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
+		// This saves 2-3 bytes per instruction where these are used. :)
+		mov ecx, offset yuv2rgb_temp
+		mov edx, offset sse2_tables+64;
 
 		align 16
 tworows:
@@ -65,29 +104,29 @@ tworows:
 		// unfortunately I don't think this will matter despite being
 		// technically potentially a little faster, but this is
 		// equivalent to an add or sub
-		pxor xmm2, xmmword ptr [C_bias] // xmm2 <-- 8 x (Cb - 128) << 8
-		pxor xmm0, xmmword ptr [C_bias] // xmm0 <-- 8 x (Cr - 128) << 8
+		pxor xmm2, xmmword ptr [edx+C_BIAS] // xmm2 <-- 8 x (Cb - 128) << 8
+		pxor xmm0, xmmword ptr [edx+C_BIAS] // xmm0 <-- 8 x (Cr - 128) << 8
 
 		movaps xmm1, xmm0
 		movaps xmm3, xmm2
 
-		pmulhw xmm1, xmmword ptr [GCr_coefficients]
-		pmulhw xmm3, xmmword ptr [GCb_coefficients]
-		pmulhw xmm0, xmmword ptr [RCr_coefficients]
-		pmulhw xmm2, xmmword ptr [BCb_coefficients]
+		pmulhw xmm1, xmmword ptr [edx+GCr_COEFF]
+		pmulhw xmm3, xmmword ptr [edx+GCb_COEFF]
+		pmulhw xmm0, xmmword ptr [edx+RCr_COEFF]
+		pmulhw xmm2, xmmword ptr [edx+BCb_COEFF]
 		paddsw xmm1, xmm3
 
 		// store for the next line; looking at the code above
 		// compared to the code below, I have to wonder whether
 		// this was worth the hassle
-		movaps xmmword ptr [yuv2rgb_temp], xmm0
-		movaps xmmword ptr [yuv2rgb_temp+16], xmm1
-		movaps xmmword ptr [yuv2rgb_temp+32], xmm2
+		movaps xmmword ptr [ecx], xmm0
+		movaps xmmword ptr [ecx+16], xmm1
+		movaps xmmword ptr [ecx+32], xmm2
 		jmp ihatemsvc
 
 		align 16
 onerow:
-		movaps xmm0, xmmword ptr [yuv2rgb_temp]
-		movaps xmm1, xmmword ptr [yuv2rgb_temp+16]
-		movaps xmm2, xmmword ptr [yuv2rgb_temp+32]
+		movaps xmm0, xmmword ptr [ecx]
+		movaps xmm1, xmmword ptr [ecx+16]
+		movaps xmm2, xmmword ptr [ecx+32]
 
 		// If masm directives worked properly in inline asm, I'd be using them,
 		// but I'm not inclined to write ~70 line #defines to simulate them.
@@ -100,13 +139,13 @@ ihatemsvc:
 		movaps xmm5, xmm2
 
 		movaps xmm6, xmmword ptr [mb8+edi]
-		psubusb xmm6, xmmword ptr [Y_bias]
+		psubusb xmm6, xmmword ptr [edx+Y_BIAS]
 		movaps xmm7, xmm6
 		psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
-		pand xmm7, xmmword ptr [Y_mask] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
+		pand xmm7, xmmword ptr [edx+Y_MASK] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
 
-		pmulhuw xmm6, xmmword ptr [Y_coefficients]
-		pmulhuw xmm7, xmmword ptr [Y_coefficients]
+		pmulhuw xmm6, xmmword ptr [edx+Y_COEFF]
+		pmulhuw xmm7, xmmword ptr [edx+Y_COEFF]
 
 		paddsw xmm0, xmm6
 		paddsw xmm3, xmm7
@@ -116,7 +155,7 @@ ihatemsvc:
 		paddsw xmm5, xmm7
 
 		// round
-		movaps xmm6, xmmword ptr [round_1bit]
+		movaps xmm6, xmmword ptr [edx+ROUND_1BIT]
 		paddw xmm0, xmm6
 		paddw xmm1, xmm6
 		paddw xmm2, xmm6
@@ -176,8 +215,13 @@ ihatemsvc:
 	asm(
 		".intel_syntax noprefix\n"
 		"mov eax, 1\n"
-		"mov esi, 0\n"
-		"mov edi, 0\n"
+		"xor esi, esi\n"
+		"xor edi, edi\n"
+
+		// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
+		// This saves 2-3 bytes per instruction where these are used. :)
+		"mov ecx, offset yuv2rgb_temp\n"
+		"mov edx, offset sse2_tables+64\n"
 
 		".align 16\n"
 		"tworows:\n"
@@ -193,29 +237,29 @@ ihatemsvc:
 		// unfortunately I don't think this will matter despite being
 		// technically potentially a little faster, but this is
 		// equivalent to an add or sub
-		"pxor xmm2, xmmword ptr [C_bias]\n" // xmm2 <-- 8 x (Cb - 128) << 8
-		"pxor xmm0, xmmword ptr [C_bias]\n" // xmm0 <-- 8 x (Cr - 128) << 8
+		"pxor xmm2, xmmword ptr [edx+C_BIAS]\n" // xmm2 <-- 8 x (Cb - 128) << 8
+		"pxor xmm0, xmmword ptr [edx+C_BIAS]\n" // xmm0 <-- 8 x (Cr - 128) << 8
 
 		"movaps xmm1, xmm0\n"
 		"movaps xmm3, xmm2\n"
 
-		"pmulhw xmm1, xmmword ptr [GCr_coefficients]\n"
-		"pmulhw xmm3, xmmword ptr [GCb_coefficients]\n"
-		"pmulhw xmm0, xmmword ptr [RCr_coefficients]\n"
-		"pmulhw xmm2, xmmword ptr [BCb_coefficients]\n"
+		"pmulhw xmm1, xmmword ptr [edx+GCr_COEFF]\n"
+		"pmulhw xmm3, xmmword ptr [edx+GCb_COEFF]\n"
+		"pmulhw xmm0, xmmword ptr [edx+RCr_COEFF]\n"
+		"pmulhw xmm2, xmmword ptr [edx+BCb_COEFF]\n"
 		"paddsw xmm1, xmm3\n"
 
 		// store for the next line; looking at the code above
 		// compared to the code below, I have to wonder whether
 		// this was worth the hassle
-		"movaps xmmword ptr [yuv2rgb_temp], xmm0\n"
-		"movaps xmmword ptr [yuv2rgb_temp+16], xmm1\n"
-		"movaps xmmword ptr [yuv2rgb_temp+32], xmm2\n"
+		"movaps xmmword ptr [ecx], xmm0\n"
+		"movaps xmmword ptr [ecx+16], xmm1\n"
+		"movaps xmmword ptr [ecx+32], xmm2\n"
 		"jmp ihategcctoo\n"
 
 		".align 16\n"
 		"onerow:\n"
-		"movaps xmm0, xmmword ptr [yuv2rgb_temp]\n"
-		"movaps xmm1, xmmword ptr [yuv2rgb_temp+16]\n"
-		"movaps xmm2, xmmword ptr [yuv2rgb_temp+32]\n"
+		"movaps xmm0, xmmword ptr [ecx]\n"
+		"movaps xmm1, xmmword ptr [ecx+16]\n"
+		"movaps xmm2, xmmword ptr [ecx+32]\n"
 
 		"ihategcctoo:\n"
 		"movaps xmm3, xmm0\n"
@@ -223,13 +267,13 @@ ihatemsvc:
 		"movaps xmm5, xmm2\n"
 
 		"movaps xmm6, xmmword ptr [mb8+edi]\n"
-		"psubusb xmm6, xmmword ptr [Y_bias]\n"
+		"psubusb xmm6, xmmword ptr [edx+Y_BIAS]\n"
 		"movaps xmm7, xmm6\n"
 		"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
-		"pand xmm7, xmmword ptr [Y_mask]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
+		"pand xmm7, xmmword ptr [edx+Y_MASK]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
 
-		"pmulhuw xmm6, xmmword ptr [Y_coefficients]\n"
-		"pmulhuw xmm7, xmmword ptr [Y_coefficients]\n"
+		"pmulhuw xmm6, xmmword ptr [edx+Y_COEFF]\n"
+		"pmulhuw xmm7, xmmword ptr [edx+Y_COEFF]\n"
 
 		"paddsw xmm0, xmm6\n"
 		"paddsw xmm3, xmm7\n"
@@ -239,7 +283,7 @@ ihatemsvc:
 		"paddsw xmm5, xmm7\n"
 
 		// round
-		"movaps xmm6, xmmword ptr [round_1bit]\n"
+		"movaps xmm6, xmmword ptr [edx+ROUND_1BIT]\n"
 		"paddw xmm0, xmm6\n"
 		"paddw xmm1, xmm6\n"
 		"paddw xmm2, xmm6\n"
@@ -299,6 +343,8 @@ ihatemsvc:
 #else
 #error Unsupported compiler
 #endif
+
+	FreezeXMMRegs(0);
 }
 
 void yuv2rgb_init(void)
diff --git a/pcsx2/IPU/yuv2rgb.h b/pcsx2/IPU/yuv2rgb.h
index 438175b383..0e07bd1c27 100644
--- a/pcsx2/IPU/yuv2rgb.h
+++ b/pcsx2/IPU/yuv2rgb.h
@@ -18,5 +18,5 @@
 
 #pragma once
 
-void yuv2rgb_sse2(void);
-void yuv2rgb_init(void);
+extern void yuv2rgb_sse2(void);
+extern void yuv2rgb_init(void);
diff --git a/pcsx2/PrecompiledHeader.h b/pcsx2/PrecompiledHeader.h
index bc7ffce50a..8bf5cdb6bd 100644
--- a/pcsx2/PrecompiledHeader.h
+++ b/pcsx2/PrecompiledHeader.h
@@ -115,10 +115,24 @@ static __forceinline u32 timeGetTime()
 #	define __unused
 #endif
 
+//////////////////////////////////////////////////////////////////////////////////////////
+// Forceinline macro that is enabled for RELEASE/PUBLIC builds ONLY. (non-inline in devel)
+// This is useful because forceinline can make certain types of debugging problematic, since
+// functions that look like they should be called won't trigger breakpoints, as their code is
+// inlined. Henceforth, use __releaseinline for things which we want inlined on public/release
+// builds but *not* in devel builds.
+
+#ifdef PCSX2_DEVBUILD
+#	define __releaseinline
+#else
+#	define __releaseinline __forceinline
+#endif
+
 //////////////////////////////////////////////////////////////////////////////////////////
 // Emitter Instance Identifiers.  If you add a new emitter, do it here also.
 // Note: Currently most of the instances map back to 0, since existing dynarec code all
 // shares iCore and must therefore all share the same emitter instance.
+// (note: these don't really belong here per-se, but it's an easy spot to use for now)
 enum
 {
 	EmitterId_R5900 = 0,
diff --git a/pcsx2/System.cpp b/pcsx2/System.cpp
index ec2bd1ec41..a043907ea1 100644
--- a/pcsx2/System.cpp
+++ b/pcsx2/System.cpp
@@ -357,7 +357,7 @@ void SysExecute()
 	catch( R5900Exception::BaseExcept& ex )
 	{
 		Console::Error( ex.cMessage() );
-		Console::Error( fmt_string( "(EE) PC: 0x%8.8x  \tCycle:0x8.8x", ex.cpuState.pc, ex.cpuState.cycle ).c_str() );
+		Console::Error( fmt_string( "(EE) PC: 0x%8.8x  \tCycle: 0x%8.8x", ex.cpuState.pc, ex.cpuState.cycle ).c_str() );
 	}
 }
 
diff --git a/pcsx2/Vif.cpp b/pcsx2/Vif.cpp
index f0e696079c..373fb37651 100644
--- a/pcsx2/Vif.cpp
+++ b/pcsx2/Vif.cpp
@@ -57,16 +57,7 @@ __forceinline static int _limit( int a, int max )
 		_vifRegs->offset++; \
 	}
 
-// Forceinline macro that is enabled for RELEASE/PUBLIC builds ONLY.
-// This is useful because forceinline can make certain types of debugging problematic since
-// functions that look like they should be called won't breakpoint since their code is inlined.
-#ifdef PCSX2_DEVBUILD
-#	define __pub_inline
-#else
-#	define __pub_inline __forceinline
-#endif
-
-static __pub_inline void writeX( u32 *dest, u32 data ) {
+static __releaseinline void writeX( u32 *dest, u32 data ) {
 	if (_vifRegs->code & 0x10000000) {
 		switch ( _vif->cl ) {
 			case 0:  n = (_vifRegs->mask) & 0x3; break;
@@ -105,7 +96,7 @@ static __pub_inline void writeX( u32 *dest, u32 data ) {
 //	VIF_LOG("writeX %8.8x : Mode %d, r0 = %x, data %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r0,data);
 }
 
-static __pub_inline void writeY( u32 *dest, u32 data ) {
+static __releaseinline void writeY( u32 *dest, u32 data ) {
 	if (_vifRegs->code & 0x10000000) {
 		switch ( _vif->cl ) {
 			case 0:  n = (_vifRegs->mask >> 2) & 0x3; break;
@@ -144,7 +135,7 @@ static __pub_inline void writeY( u32 *dest, u32 data ) {
 //	VIF_LOG("writeY %8.8x : Mode %d, r1 = %x, data %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r1,data);
 }
 
-static __pub_inline void writeZ( u32 *dest, u32 data ) {
+static __releaseinline void writeZ( u32 *dest, u32 data ) {
 	if (_vifRegs->code & 0x10000000) {
 		switch ( _vif->cl ) {
 			case 0:  n = (_vifRegs->mask >> 4) & 0x3; break;
@@ -183,7 +174,7 @@ static __pub_inline void writeZ( u32 *dest, u32 data ) {
 //	VIF_LOG("writeZ %8.8x : Mode %d, r2 = %x, data %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r2,data);
 }
 
-static __pub_inline void writeW( u32 *dest, u32 data ) {
+static __releaseinline void writeW( u32 *dest, u32 data ) {
 	if (_vifRegs->code & 0x10000000) {
 		switch ( _vif->cl ) {
 			case 0:  n = (_vifRegs->mask >> 6) & 0x3; break;
diff --git a/pcsx2/windows/McdConfigDlg.cpp b/pcsx2/windows/McdConfigDlg.cpp
index 0d87ed5b7a..c78c1075aa 100644
--- a/pcsx2/windows/McdConfigDlg.cpp
+++ b/pcsx2/windows/McdConfigDlg.cpp
@@ -54,6 +54,8 @@ void DlgItem_GetText( HWND hwnd, int dlgId, string& dest )
 	}
 }
 
+// strips path information so that absolute paths are reduced to relative paths
+// where appropriate.
 static const char* _stripPathInfo( const char* src )
 {
 	const char* retval = src;
@@ -65,7 +67,9 @@ static const char* _stripPathInfo( const char* src )
 		workingfold++;
 	}
 
-	if( *retval == 0 ) return src;
+	// If a difference is found before we reach the end of our pcsx2 working folder, it
+	// means we need to use the fully absolute path from the user.
+	if( *workingfold != 0 ) return src;
 
 	while( (*retval != 0) && (*retval == '\\') ) retval++;
 
@@ -246,7 +250,7 @@ void IniFile::MemcardSettings( PcsxConfig& conf )
 		Path::Combine( g_WorkingFolder, m_Default_MemcardsDir[0] ) );
 
 	Entry( "Slot2_Path", conf.Mcd[1].Filename,
-		Path::Combine( g_WorkingFolder, m_Default_MemcardsDir[0] ) );
+		Path::Combine( g_WorkingFolder, m_Default_MemcardsDir[1] ) );
 
 	Entry( "Slot1_Enabled", conf.Mcd[0].Enabled, true );
 	Entry( "Slot2_Enabled", conf.Mcd[1].Enabled, true );
diff --git a/pcsx2/x86/ix86/ix86_tools.cpp b/pcsx2/x86/ix86/ix86_tools.cpp
index 46117b00cd..daf7bc97d7 100644
--- a/pcsx2/x86/ix86/ix86_tools.cpp
+++ b/pcsx2/x86/ix86/ix86_tools.cpp
@@ -67,7 +67,7 @@ void SetCPUState(u32 sseMXCSR, u32 sseVUMXCSR)
 }
 
 /////////////////////////////////////////////////////////////////////
-//
+// MMX Register Freezing
 
 #ifndef __INTEL_COMPILER
 extern "C"
 {
@@ -86,28 +86,31 @@ __forceinline void FreezeMMXRegs_(int save)
 #ifdef _MSC_VER
 		__asm {
-			movntq mmword ptr [g_globalMMXData + 0], mm0
-			movntq mmword ptr [g_globalMMXData + 8], mm1
-			movntq mmword ptr [g_globalMMXData + 16], mm2
-			movntq mmword ptr [g_globalMMXData + 24], mm3
-			movntq mmword ptr [g_globalMMXData + 32], mm4
-			movntq mmword ptr [g_globalMMXData + 40], mm5
-			movntq mmword ptr [g_globalMMXData + 48], mm6
-			movntq mmword ptr [g_globalMMXData + 56], mm7
+			mov ecx, offset g_globalMMXData
+			movntq mmword ptr [ecx+0], mm0
+			movntq mmword ptr [ecx+8], mm1
+			movntq mmword ptr [ecx+16], mm2
+			movntq mmword ptr [ecx+24], mm3
+			movntq mmword ptr [ecx+32], mm4
+			movntq mmword ptr [ecx+40], mm5
+			movntq mmword ptr [ecx+48], mm6
+			movntq mmword ptr [ecx+56], mm7
 			emms
 		}
 #else
-		__asm__(".intel_syntax noprefix\n"
-		"movq [%0+0x00], mm0\n"
-		"movq [%0+0x08], mm1\n"
-		"movq [%0+0x10], mm2\n"
-		"movq [%0+0x18], mm3\n"
-		"movq [%0+0x20], mm4\n"
-		"movq [%0+0x28], mm5\n"
-		"movq [%0+0x30], mm6\n"
-		"movq [%0+0x38], mm7\n"
-		"emms\n"
-		".att_syntax\n" : : "r"(g_globalMMXData) );
+		__asm__(
+			".intel_syntax noprefix\n"
+			"movq [%0+0x00], mm0\n"
+			"movq [%0+0x08], mm1\n"
+			"movq [%0+0x10], mm2\n"
+			"movq [%0+0x18], mm3\n"
+			"movq [%0+0x20], mm4\n"
+			"movq [%0+0x28], mm5\n"
+			"movq [%0+0x30], mm6\n"
+			"movq [%0+0x38], mm7\n"
+			"emms\n"
+			".att_syntax\n" : : "r"(g_globalMMXData)
+		);
 #endif
 	}
@@ -123,40 +126,44 @@ __forceinline void FreezeMMXRegs_(int save)
 #ifdef _MSC_VER
 		__asm {
-			movq mm0, mmword ptr [g_globalMMXData + 0]
-			movq mm1, mmword ptr [g_globalMMXData + 8]
-			movq mm2, mmword ptr [g_globalMMXData + 16]
-			movq mm3, mmword ptr [g_globalMMXData + 24]
-			movq mm4, mmword ptr [g_globalMMXData + 32]
-			movq mm5, mmword ptr [g_globalMMXData + 40]
-			movq mm6, mmword ptr [g_globalMMXData + 48]
-			movq mm7, mmword ptr [g_globalMMXData + 56]
+			mov ecx, offset g_globalMMXData
+			movq mm0, mmword ptr [ecx+0]
+			movq mm1, mmword ptr [ecx+8]
+			movq mm2, mmword ptr [ecx+16]
+			movq mm3, mmword ptr [ecx+24]
+			movq mm4, mmword ptr [ecx+32]
+			movq mm5, mmword ptr [ecx+40]
+			movq mm6, mmword ptr [ecx+48]
+			movq mm7, mmword ptr [ecx+56]
 			emms
 		}
 #else
-		__asm__(".intel_syntax noprefix\n"
-		"movq mm0, [%0+0x00]\n"
-		"movq mm1, [%0+0x08]\n"
-		"movq mm2, [%0+0x10]\n"
-		"movq mm3, [%0+0x18]\n"
-		"movq mm4, [%0+0x20]\n"
-		"movq mm5, [%0+0x28]\n"
-		"movq mm6, [%0+0x30]\n"
-		"movq mm7, [%0+0x38]\n"
-		"emms\n"
-		".att_syntax\n" : : "r"(g_globalMMXData) );
+		__asm__(
+			".intel_syntax noprefix\n"
+			"movq mm0, [%0+0x00]\n"
+			"movq mm1, [%0+0x08]\n"
+			"movq mm2, [%0+0x10]\n"
+			"movq mm3, [%0+0x18]\n"
+			"movq mm4, [%0+0x20]\n"
+			"movq mm5, [%0+0x28]\n"
+			"movq mm6, [%0+0x30]\n"
+			"movq mm7, [%0+0x38]\n"
+			"emms\n"
+			".att_syntax\n" : : "r"(g_globalMMXData)
+		);
 #endif
 	}
 }
 
 //////////////////////////////////////////////////////////////////////
-
+// XMM Register Freezing
 
 __forceinline void FreezeXMMRegs_(int save)
 {
 	//SysPrintf("FreezeXMMRegs_(%d); [%d]\n", save, g_globalXMMSaved);
 	assert( g_EEFreezeRegs );
 
-	if( save ) {
+	if( save )
+	{
 		g_globalXMMSaved++;
 		if( g_globalXMMSaved > 1 ){
 			//SysPrintf("XMM Already saved\n");
@@ -166,31 +173,35 @@ __forceinline void FreezeXMMRegs_(int save)
 #ifdef _MSC_VER
 		__asm {
-			movaps xmmword ptr [g_globalXMMData + 0x00], xmm0
-			movaps xmmword ptr [g_globalXMMData + 0x10], xmm1
-			movaps xmmword ptr [g_globalXMMData + 0x20], xmm2
-			movaps xmmword ptr [g_globalXMMData + 0x30], xmm3
-			movaps xmmword ptr [g_globalXMMData + 0x40], xmm4
-			movaps xmmword ptr [g_globalXMMData + 0x50], xmm5
-			movaps xmmword ptr [g_globalXMMData + 0x60], xmm6
-			movaps xmmword ptr [g_globalXMMData + 0x70], xmm7
+			mov ecx, offset g_globalXMMData
+			movaps xmmword ptr [ecx+0x00], xmm0
+			movaps xmmword ptr [ecx+0x10], xmm1
+			movaps xmmword ptr [ecx+0x20], xmm2
+			movaps xmmword ptr [ecx+0x30], xmm3
+			movaps xmmword ptr [ecx+0x40], xmm4
+			movaps xmmword ptr [ecx+0x50], xmm5
+			movaps xmmword ptr [ecx+0x60], xmm6
+			movaps xmmword ptr [ecx+0x70], xmm7
 		}
 #else
-		__asm__(".intel_syntax noprefix\n"
-		"movaps [%0+0x00], xmm0\n"
-		"movaps [%0+0x10], xmm1\n"
-		"movaps [%0+0x20], xmm2\n"
-		"movaps [%0+0x30], xmm3\n"
-		"movaps [%0+0x40], xmm4\n"
-		"movaps [%0+0x50], xmm5\n"
-		"movaps [%0+0x60], xmm6\n"
-		"movaps [%0+0x70], xmm7\n"
-		".att_syntax\n" : : "r"(g_globalXMMData) );
+		__asm__(
+			".intel_syntax noprefix\n"
+			"movaps [%0+0x00], xmm0\n"
+			"movaps [%0+0x10], xmm1\n"
+			"movaps [%0+0x20], xmm2\n"
+			"movaps [%0+0x30], xmm3\n"
+			"movaps [%0+0x40], xmm4\n"
+			"movaps [%0+0x50], xmm5\n"
+			"movaps [%0+0x60], xmm6\n"
+			"movaps [%0+0x70], xmm7\n"
+			".att_syntax\n" : : "r"(g_globalXMMData)
+		);
 #endif // _MSC_VER
 	}
-	else {
+	else
+	{
 		if( g_globalXMMSaved==0 )
 		{
 			//SysPrintf("XMM Regs not saved!\n");
@@ -202,28 +213,32 @@ __forceinline void FreezeXMMRegs_(int save)
 		if( g_globalXMMSaved > 0 ) return;
 
 #ifdef _MSC_VER
-		__asm {
-			movaps xmm0, xmmword ptr [g_globalXMMData + 0x00]
-			movaps xmm1, xmmword ptr [g_globalXMMData + 0x10]
-			movaps xmm2, xmmword ptr [g_globalXMMData + 0x20]
-			movaps xmm3, xmmword ptr [g_globalXMMData + 0x30]
-			movaps xmm4, xmmword ptr [g_globalXMMData + 0x40]
-			movaps xmm5, xmmword ptr [g_globalXMMData + 0x50]
-			movaps xmm6, xmmword ptr [g_globalXMMData + 0x60]
-			movaps xmm7, xmmword ptr [g_globalXMMData + 0x70]
+		__asm
+		{
+			mov ecx, offset g_globalXMMData
+			movaps xmm0, xmmword ptr [ecx+0x00]
+			movaps xmm1, xmmword ptr [ecx+0x10]
+			movaps xmm2, xmmword ptr [ecx+0x20]
+			movaps xmm3, xmmword ptr [ecx+0x30]
+			movaps xmm4, xmmword ptr [ecx+0x40]
+			movaps xmm5, xmmword ptr [ecx+0x50]
+			movaps xmm6, xmmword ptr [ecx+0x60]
+			movaps xmm7, xmmword ptr [ecx+0x70]
 		}
 #else
-		__asm__(".intel_syntax noprefix\n"
-		"movaps xmm0, [%0+0x00]\n"
-		"movaps xmm1, [%0+0x10]\n"
-		"movaps xmm2, [%0+0x20]\n"
-		"movaps xmm3, [%0+0x30]\n"
-		"movaps xmm4, [%0+0x40]\n"
-		"movaps xmm5, [%0+0x50]\n"
-		"movaps xmm6, [%0+0x60]\n"
-		"movaps xmm7, [%0+0x70]\n"
-		".att_syntax\n" : : "r"(g_globalXMMData) );
+		__asm__(
+			".intel_syntax noprefix\n"
+			"movaps xmm0, [%0+0x00]\n"
+			"movaps xmm1, [%0+0x10]\n"
+			"movaps xmm2, [%0+0x20]\n"
+			"movaps xmm3, [%0+0x30]\n"
+			"movaps xmm4, [%0+0x40]\n"
+			"movaps xmm5, [%0+0x50]\n"
+			"movaps xmm6, [%0+0x60]\n"
+			"movaps xmm7, [%0+0x70]\n"
+			".att_syntax\n" : : "r"(g_globalXMMData)
+		);
 #endif // _MSC_VER
 	}
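
For reference, here is the Mod/RM size win used throughout this patch, in isolation --
a minimal sketch, assuming 32-bit MSVC inline asm (demo_table and demo_modrm are
hypothetical names, not part of the patch). An absolute-address operand has to encode
the full table address as a 4-byte disp32 in every instruction, while a base-register
operand with a small offset uses the disp8 Mod/RM form instead, paying a single 5-byte
"mov reg, imm32" up front to save the 2-3 bytes per instruction the comments mention:

	static PCSX2_ALIGNED16(u8 demo_table[32]);

	static void demo_modrm()
	{
		__asm
		{
			// Absolute form: each memop encodes the table address as a
			// 4-byte disp32 (Mod/RM mod=00, r/m=101) -- 7 bytes per movaps.
			movaps xmm0, xmmword ptr [demo_table]
			movaps xmm1, xmmword ptr [demo_table+16]

			// Base-register form: pay the 5-byte "mov ecx, imm32" once, then
			// each memop needs at most a 1-byte disp8 -- 3-4 bytes per movaps.
			mov ecx, offset demo_table
			movaps xmm0, xmmword ptr [ecx]
			movaps xmm1, xmmword ptr [ecx+16]
		}
	}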