Many small bugfixes and optimizations:

* Fixed Memcard init so that Memcard1 isn't the default in both slots (oops!)
* Fixed Memcard path logic so that cards outside your pcsx2 folder can be browsed/selected.
* Fixed CDVD-to-BIOS time sync (I simply forgot a function call!)
* Optimized yuv2rgb_sse2 by using Mod/RM-form instructions (sketched below).
* Win32: applied the same optimization to FreezeXMMRegs and FreezeMMXRegs (Linux already had it).

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@719 96395faa-99c1-11dd-bbfe-3dabce05a288
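
As an aside on the Mod/RM item above, here is a minimal sketch of the encoding win, assuming 32-bit MSVC inline asm (the dialect this tree uses) and a made-up `coeff` table. Naming a static table directly in a memory operand forces a 4-byte absolute displacement into every instruction; parking its address in a register pays those 4 bytes once, after which each operand needs either no displacement or a single disp8 byte.

__declspec(align(16)) static unsigned short coeff[2][8];

static void absolute_form()
{
    __asm {
        movdqa xmm0, xmmword ptr [coeff]      // [disp32]: 4 address bytes in the operand
        pmulhw xmm0, xmmword ptr [coeff+16]   // [disp32] again: 4 more bytes
    }
}

static void base_pointer_form()
{
    __asm {
        mov    ecx, offset coeff              // pay the 32-bit address once
        movdqa xmm0, xmmword ptr [ecx]        // [reg]: no displacement byte
        pmulhw xmm0, xmmword ptr [ecx+16]     // [reg+disp8]: a single displacement byte
    }
}

The yuv2rgb and FreezeXMM/MMX hunks below apply the same pattern, using ecx/edx as the base registers.
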
Author: Jake.Stine
Date:   2009-03-08 20:10:09 +00:00
Parent: 25df8958b2
Commit: 71e2dc2fb4
9 changed files with 230 additions and 160 deletions

View File

@ -31,13 +31,7 @@
#define ArraySize(x) (sizeof(x)/sizeof((x)[0]))
#endif
#ifdef __LINUX__
#define CALLBACK
#else
#define CALLBACK __stdcall
#endif
//////////////////////////////////////////////////////////////////////////////////////////
// jASSUME - give hints to the optimizer
// This is primarily useful for the default case switch optimizer, which enables VC to
// generate more compact switches.
@ -68,8 +62,9 @@ default: \
break; \
}
//////////////////////////////////////////////////////////////////////////////////////////
// Basic Atomic Types
// Basic types
#if defined(_MSC_VER)
typedef __int8 s8;
@ -92,6 +87,7 @@ typedef unsigned int uint;
#define PCSX2_ALIGNED16_EXTERN(x) extern __declspec(align(16)) x
#define __naked __declspec(naked)
#define CALLBACK __stdcall
#else // _MSC_VER
@ -140,6 +136,7 @@ typedef union _LARGE_INTEGER
#define _inline __inline__ __attribute__((unused))
#define __forceinline __attribute__((always_inline,unused))
#define __naked // GCC lacks the naked specifier
#define CALLBACK // CALLBACK is win32-specific mess
#endif // __LINUX__
@ -164,6 +161,7 @@ typedef s32 sptr;
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////
// A rough-and-ready cross platform 128-bit datatype, Non-SSE style.
#ifdef __cplusplus
struct u128
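
For illustration of the optimizer hint described in that header (hedged: the full macro body isn't visible in this hunk, and the mapping to MSVC's __assume is assumed), promising the compiler that a switch default is unreachable lets VC drop the range check and emit a denser jump table:

#if defined(_MSC_VER)
#   define jASSUME(exp) __assume(exp)
#else
#   define jASSUME(exp) ((void)sizeof(exp))   // harmless placeholder on other compilers
#endif

static int lookup_example( int op )   // caller guarantees op is 0..3
{
    switch( op )
    {
        case 0: return 11;
        case 1: return 22;
        case 2: return 33;
        case 3: return 44;

        default:
            jASSUME(0);   // tell VC this path is unreachable
            return 0;
    }
}
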

View File

@ -692,7 +692,7 @@ __forceinline void cdvdGetDiskType()
// gets value for start lsn of layer1
// returns: 1 if on dual layer disc
// 0 if not on dual layer disc
s32 cdvdReadDvdDualInfo(s32* dualType, u32* layer1Start)
static s32 cdvdReadDvdDualInfo(s32* dualType, u32* layer1Start)
{
u8 toc[2064];
*dualType = 0;
@ -754,6 +754,8 @@ void cdvdReset()
cdvd.RTC.day = 25;
cdvd.RTC.month = 5;
cdvd.RTC.year = 7; //2007
cdvdSetSystemTime( cdvd );
}
struct Freeze_v10Compat

View File

@ -21,35 +21,74 @@
#include "PrecompiledHeader.h"
#include "System.h"
#include "Misc.h"
#include "IPU.h"
#include "yuv2rgb.h"
// Everything below is bit accurate to the IPU specification (except maybe rounding).
// Know the specification before you touch it.
PCSX2_ALIGNED16(u16 C_bias[8]) = {0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000};
PCSX2_ALIGNED16(u8 Y_bias[16]) = {16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
#define SSE_COEFFICIENTS(name, x) \
PCSX2_ALIGNED16(u16 name[8]) = {x<<2,x<<2,x<<2,x<<2,x<<2,x<<2,x<<2,x<<2};
SSE_COEFFICIENTS(Y_coefficients, 0x95); // 1.1640625
SSE_COEFFICIENTS(RCr_coefficients, 0xcc); // 1.59375
SSE_COEFFICIENTS(GCr_coefficients, (-0x68)); // -0.8125
SSE_COEFFICIENTS(GCb_coefficients, (-0x32)); // -0.390625
SSE_COEFFICIENTS(BCb_coefficients, 0x102); // 2.015625
PCSX2_ALIGNED16(u16 Y_mask[8]) = {0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00};
#define SSE_COEFFICIENTS(x) \
{(x)<<2,(x)<<2,(x)<<2,(x)<<2,(x)<<2,(x)<<2,(x)<<2,(x)<<2}
struct SSE2_Tables
{
u16 C_bias[8]; // offset -64
u8 Y_bias[16]; // offset -48
u16 Y_mask[8]; // offset -32
u16 round_1bit[8]; // offset -16
u16 Y_coefficients[8]; // offset 0
u16 GCr_coefficients[8];// offset 16
u16 GCb_coefficients[8];// offset 32
u16 RCr_coefficients[8];// offset 48
u16 BCb_coefficients[8];// offset 64
};
#define C_BIAS (-64)
#define Y_BIAS (-48)
#define Y_MASK (-32)
#define ROUND_1BIT (-16)
#define Y_COEFF 0
#define GCr_COEFF 16
#define GCb_COEFF 32
#define RCr_COEFF 48
#define BCb_COEFF 64
static PCSX2_ALIGNED16(const SSE2_Tables sse2_tables) =
{
{0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}, // c_bias
{16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16}, // y_bias
{0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00}, // y_mask
// Specifying round off instead of round down as everywhere else
// implies that this is right
PCSX2_ALIGNED16(u16 round_1bit[8]) = {1,1,1,1,1,1,1,1};
PCSX2_ALIGNED16(u16 yuv2rgb_temp[3][8]);
{1,1,1,1,1,1,1,1}, // round_1bit
SSE_COEFFICIENTS(0x95), // 1.1640625 [Y_coefficients]
SSE_COEFFICIENTS(-0x68), // -0.8125 [GCr_coefficients]
SSE_COEFFICIENTS(-0x32), // -0.390625 [GCb_coefficients]
SSE_COEFFICIENTS(0xcc), // 1.59375 [RCr_coefficients]
SSE_COEFFICIENTS(0x102), // 2.015625 [BCb_coefficients]
};
static PCSX2_ALIGNED16(u16 yuv2rgb_temp[3][8]);
// This could potentially be improved for SSE4
void yuv2rgb_sse2(void)
__releaseinline void yuv2rgb_sse2(void)
{
FreezeXMMRegs(1);
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
__asm {
mov eax, 1
mov esi, 0
mov edi, 0
xor esi, esi
xor edi, edi
// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
// This saves 2-3 bytes per instruction where these are used. :)
mov ecx, offset yuv2rgb_temp
mov edx, offset sse2_tables+64;
align 16
tworows:
@ -65,29 +104,29 @@ tworows:
// unfortunately I don't think this will matter despite being
// technically potentially a little faster, but this is
// equivalent to an add or sub
pxor xmm2, xmmword ptr [C_bias] // xmm2 <-- 8 x (Cb - 128) << 8
pxor xmm0, xmmword ptr [C_bias] // xmm0 <-- 8 x (Cr - 128) << 8
pxor xmm2, xmmword ptr [edx+C_BIAS] // xmm2 <-- 8 x (Cb - 128) << 8
pxor xmm0, xmmword ptr [edx+C_BIAS] // xmm0 <-- 8 x (Cr - 128) << 8
movaps xmm1, xmm0
movaps xmm3, xmm2
pmulhw xmm1, xmmword ptr [GCr_coefficients]
pmulhw xmm3, xmmword ptr [GCb_coefficients]
pmulhw xmm0, xmmword ptr [RCr_coefficients]
pmulhw xmm2, xmmword ptr [BCb_coefficients]
pmulhw xmm1, xmmword ptr [edx+GCr_COEFF]
pmulhw xmm3, xmmword ptr [edx+GCb_COEFF]
pmulhw xmm0, xmmword ptr [edx+RCr_COEFF]
pmulhw xmm2, xmmword ptr [edx+BCb_COEFF]
paddsw xmm1, xmm3
// store for the next line; looking at the code above
// compared to the code below, I have to wonder whether
// this was worth the hassle
movaps xmmword ptr [yuv2rgb_temp], xmm0
movaps xmmword ptr [yuv2rgb_temp+16], xmm1
movaps xmmword ptr [yuv2rgb_temp+32], xmm2
movaps xmmword ptr [ecx], xmm0
movaps xmmword ptr [ecx+16], xmm1
movaps xmmword ptr [ecx+32], xmm2
jmp ihatemsvc
align 16
onerow:
movaps xmm0, xmmword ptr [yuv2rgb_temp]
movaps xmm1, xmmword ptr [yuv2rgb_temp+16]
movaps xmm2, xmmword ptr [yuv2rgb_temp+32]
movaps xmm0, xmmword ptr [ecx]
movaps xmm1, xmmword ptr [ecx+16]
movaps xmm2, xmmword ptr [ecx+32]
// If masm directives worked properly in inline asm, I'd be using them,
// but I'm not inclined to write ~70 line #defines to simulate them.
@ -100,13 +139,13 @@ ihatemsvc:
movaps xmm5, xmm2
movaps xmm6, xmmword ptr [mb8+edi]
psubusb xmm6, xmmword ptr [Y_bias]
psubusb xmm6, xmmword ptr [edx+Y_BIAS]
movaps xmm7, xmm6
psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
pand xmm7, xmmword ptr [Y_mask] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
pand xmm7, xmmword ptr [edx+Y_MASK] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
pmulhuw xmm6, xmmword ptr [Y_coefficients]
pmulhuw xmm7, xmmword ptr [Y_coefficients]
pmulhuw xmm6, xmmword ptr [edx+Y_COEFF]
pmulhuw xmm7, xmmword ptr [edx+Y_COEFF]
paddsw xmm0, xmm6
paddsw xmm3, xmm7
@ -116,7 +155,7 @@ ihatemsvc:
paddsw xmm5, xmm7
// round
movaps xmm6, xmmword ptr [round_1bit]
movaps xmm6, xmmword ptr [edx+ROUND_1BIT]
paddw xmm0, xmm6
paddw xmm1, xmm6
paddw xmm2, xmm6
@ -176,8 +215,13 @@ ihatemsvc:
asm(
".intel_syntax noprefix\n"
"mov eax, 1\n"
"mov esi, 0\n"
"mov edi, 0\n"
"xor esi, esi\n"
"xor edi, edi\n"
// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
// This saves 2-3 bytes per instruction where these are used. :)
"mov ecx, offset yuv2rgb_temp\n"
"mov edx, offset sse2_tables+64\n"
".align 16\n"
"tworows:\n"
@ -193,29 +237,29 @@ ihatemsvc:
// unfortunately I don't think this will matter despite being
// technically potentially a little faster, but this is
// equivalent to an add or sub
"pxor xmm2, xmmword ptr [C_bias]\n" // xmm2 <-- 8 x (Cb - 128) << 8
"pxor xmm0, xmmword ptr [C_bias]\n" // xmm0 <-- 8 x (Cr - 128) << 8
"pxor xmm2, xmmword ptr [edx+C_BIAS]\n" // xmm2 <-- 8 x (Cb - 128) << 8
"pxor xmm0, xmmword ptr [edx+C_BIAS]\n" // xmm0 <-- 8 x (Cr - 128) << 8
"movaps xmm1, xmm0\n"
"movaps xmm3, xmm2\n"
"pmulhw xmm1, xmmword ptr [GCr_coefficients]\n"
"pmulhw xmm3, xmmword ptr [GCb_coefficients]\n"
"pmulhw xmm0, xmmword ptr [RCr_coefficients]\n"
"pmulhw xmm2, xmmword ptr [BCb_coefficients]\n"
"pmulhw xmm1, xmmword ptr [edx+GCr_COEFF]\n"
"pmulhw xmm3, xmmword ptr [edx+GCb_COEFF]\n"
"pmulhw xmm0, xmmword ptr [edx+RCr_COEFF]\n"
"pmulhw xmm2, xmmword ptr [edx+BCb_COEFF]\n"
"paddsw xmm1, xmm3\n"
// store for the next line; looking at the code above
// compared to the code below, I have to wonder whether
// this was worth the hassle
"movaps xmmword ptr [yuv2rgb_temp], xmm0\n"
"movaps xmmword ptr [yuv2rgb_temp+16], xmm1\n"
"movaps xmmword ptr [yuv2rgb_temp+32], xmm2\n"
"movaps xmmword ptr [ecx], xmm0\n"
"movaps xmmword ptr [ecx+16], xmm1\n"
"movaps xmmword ptr [ecx+32], xmm2\n"
"jmp ihategcctoo\n"
".align 16\n"
"onerow:\n"
"movaps xmm0, xmmword ptr [yuv2rgb_temp]\n"
"movaps xmm1, xmmword ptr [yuv2rgb_temp+16]\n"
"movaps xmm2, xmmword ptr [yuv2rgb_temp+32]\n"
"movaps xmm0, xmmword ptr [ecx]\n"
"movaps xmm1, xmmword ptr [ecx+16]\n"
"movaps xmm2, xmmword ptr [ecx+32]\n"
"ihategcctoo:\n"
"movaps xmm3, xmm0\n"
@ -223,13 +267,13 @@ ihatemsvc:
"movaps xmm5, xmm2\n"
"movaps xmm6, xmmword ptr [mb8+edi]\n"
"psubusb xmm6, xmmword ptr [Y_bias]\n"
"psubusb xmm6, xmmword ptr [edx+Y_BIAS]\n"
"movaps xmm7, xmm6\n"
"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
"pand xmm7, xmmword ptr [Y_mask]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
"pand xmm7, xmmword ptr [edx+Y_MASK]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
"pmulhuw xmm6, xmmword ptr [Y_coefficients]\n"
"pmulhuw xmm7, xmmword ptr [Y_coefficients]\n"
"pmulhuw xmm6, xmmword ptr [edx+Y_COEFF]\n"
"pmulhuw xmm7, xmmword ptr [edx+Y_COEFF]\n"
"paddsw xmm0, xmm6\n"
"paddsw xmm3, xmm7\n"
@ -239,7 +283,7 @@ ihatemsvc:
"paddsw xmm5, xmm7\n"
// round
"movaps xmm6, xmmword ptr [round_1bit]\n"
"movaps xmm6, xmmword ptr [edx+ROUND_1BIT]\n"
"paddw xmm0, xmm6\n"
"paddw xmm1, xmm6\n"
"paddw xmm2, xmm6\n"
@ -299,6 +343,8 @@ ihatemsvc:
#else
#error Unsupported compiler
#endif
FreezeXMMRegs(0);
}
void yuv2rgb_init(void)
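
The gist of the change above: every constant table moves into one struct so that a single base register (edx, parked 64 bytes in) reaches all of them with signed 8-bit displacements. A hedged C++11 sketch, with field names taken from the hunk and static_asserts of my own, checking that the hand-written offset macros line up with the actual layout:

#include <cstddef>
#include <cstdint>

struct SSE2_Tables
{
    uint16_t C_bias[8];             // offset   0  ->  edx-64
    uint8_t  Y_bias[16];            // offset  16  ->  edx-48
    uint16_t Y_mask[8];             // offset  32  ->  edx-32
    uint16_t round_1bit[8];         // offset  48  ->  edx-16
    uint16_t Y_coefficients[8];     // offset  64  ->  edx+0
    uint16_t GCr_coefficients[8];   // offset  80  ->  edx+16
    uint16_t GCb_coefficients[8];   // offset  96  ->  edx+32
    uint16_t RCr_coefficients[8];   // offset 112  ->  edx+48
    uint16_t BCb_coefficients[8];   // offset 128  ->  edx+64
};

// edx = &sse2_tables + 64, so every table falls inside the signed disp8 range
// [-64, +64] and the xmm memory operands keep the short Mod/RM encoding.
static_assert( offsetof(SSE2_Tables, C_bias)           ==   0, "C_BIAS     is edx-64" );
static_assert( offsetof(SSE2_Tables, Y_bias)           ==  16, "Y_BIAS     is edx-48" );
static_assert( offsetof(SSE2_Tables, Y_mask)           ==  32, "Y_MASK     is edx-32" );
static_assert( offsetof(SSE2_Tables, round_1bit)       ==  48, "ROUND_1BIT is edx-16" );
static_assert( offsetof(SSE2_Tables, Y_coefficients)   ==  64, "Y_COEFF    is edx+0"  );
static_assert( offsetof(SSE2_Tables, GCr_coefficients) ==  80, "GCr_COEFF  is edx+16" );
static_assert( offsetof(SSE2_Tables, GCb_coefficients) ==  96, "GCb_COEFF  is edx+32" );
static_assert( offsetof(SSE2_Tables, RCr_coefficients) == 112, "RCr_COEFF  is edx+48" );
static_assert( offsetof(SSE2_Tables, BCb_coefficients) == 128, "BCb_COEFF  is edx+64" );

// For reference, the coefficient comments are the raw value over 128:
// 0x95/128 = 1.1640625, 0xcc/128 = 1.59375, 0x68/128 = 0.8125,
// 0x32/128 = 0.390625, 0x102/128 = 2.015625. SSE_COEFFICIENTS stores them <<2
// to line the products up for the pmulhw/pmulhuw high-half multiplies.
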

View File

@ -18,5 +18,5 @@
#pragma once
void yuv2rgb_sse2(void);
void yuv2rgb_init(void);
extern void yuv2rgb_sse2(void);
extern void yuv2rgb_init(void);

View File

@ -115,10 +115,24 @@ static __forceinline u32 timeGetTime()
# define __unused
#endif
//////////////////////////////////////////////////////////////////////////////////////////
// Forceinline macro that is enabled for RELEASE/PUBLIC builds ONLY. (non-inline in devel)
// This is useful because forceinline can make certain types of debugging problematic since
// functions that look like they should be called won't breakpoint since their code is inlined.
// Henceforth, use release_inline for things which we want inlined on public/release builds but
// *not* in devel builds.
#ifdef PCSX2_DEVBUILD
# define __releaseinline
#else
# define __releaseinline __forceinline
#endif
//////////////////////////////////////////////////////////////////////////////////////////
// Emitter Instance Identifiers. If you add a new emitter, do it here also.
// Note: Currently most of the instances map back to 0, since existing dynarec code all
// shares iCore and must therefore all share the same emitter instance.
// (note: these don't really belong here per-se, but it's an easy spot to use for now)
enum
{
EmitterId_R5900 = 0,
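
To make the intent of __releaseinline concrete, a minimal sketch (writeX_sketch is a made-up stand-in for the Vif write helpers further down, and __forceinline is assumed to be this tree's definition): devel builds leave the function out-of-line so breakpoints on it still land, while release/public builds force-inline it.

#ifdef PCSX2_DEVBUILD
#   define __releaseinline                   // devel: normal call, debuggable
#else
#   define __releaseinline __forceinline     // release/public: always inlined
#endif

static __releaseinline void writeX_sketch( unsigned int* dest, unsigned int data )
{
    *dest = data;   // tiny hot-path helper; inlining it only pays off in release builds
}
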

View File

@ -357,7 +357,7 @@ void SysExecute()
catch( R5900Exception::BaseExcept& ex )
{
Console::Error( ex.cMessage() );
Console::Error( fmt_string( "(EE) PC: 0x%8.8x \tCycle:0x8.8x", ex.cpuState.pc, ex.cpuState.cycle ).c_str() );
Console::Error( fmt_string( "(EE) PC: 0x%8.8x \tCycle: 0x%8.8x", ex.cpuState.pc, ex.cpuState.cycle ).c_str() );
}
}

View File

@ -57,16 +57,7 @@ __forceinline static int _limit( int a, int max )
_vifRegs->offset++; \
}
// Forceinline macro that is enabled for RELEASE/PUBLIC builds ONLY.
// This is useful because forceinline can make certain types of debugging problematic since
// functions that look like they should be called won't breakpoint since their code is inlined.
#ifdef PCSX2_DEVBUILD
# define __pub_inline
#else
# define __pub_inline __forceinline
#endif
static __pub_inline void writeX( u32 *dest, u32 data ) {
static __releaseinline void writeX( u32 *dest, u32 data ) {
if (_vifRegs->code & 0x10000000) {
switch ( _vif->cl ) {
case 0: n = (_vifRegs->mask) & 0x3; break;
@ -105,7 +96,7 @@ static __pub_inline void writeX( u32 *dest, u32 data ) {
// VIF_LOG("writeX %8.8x : Mode %d, r0 = %x, data %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r0,data);
}
static __pub_inline void writeY( u32 *dest, u32 data ) {
static __releaseinline void writeY( u32 *dest, u32 data ) {
if (_vifRegs->code & 0x10000000) {
switch ( _vif->cl ) {
case 0: n = (_vifRegs->mask >> 2) & 0x3; break;
@ -144,7 +135,7 @@ static __pub_inline void writeY( u32 *dest, u32 data ) {
// VIF_LOG("writeY %8.8x : Mode %d, r1 = %x, data %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r1,data);
}
static __pub_inline void writeZ( u32 *dest, u32 data ) {
static __releaseinline void writeZ( u32 *dest, u32 data ) {
if (_vifRegs->code & 0x10000000) {
switch ( _vif->cl ) {
case 0: n = (_vifRegs->mask >> 4) & 0x3; break;
@ -183,7 +174,7 @@ static __pub_inline void writeZ( u32 *dest, u32 data ) {
// VIF_LOG("writeZ %8.8x : Mode %d, r2 = %x, data %8.8x\n", *dest,_vifRegs->mode,_vifRegs->r2,data);
}
static __pub_inline void writeW( u32 *dest, u32 data ) {
static __releaseinline void writeW( u32 *dest, u32 data ) {
if (_vifRegs->code & 0x10000000) {
switch ( _vif->cl ) {
case 0: n = (_vifRegs->mask >> 6) & 0x3; break;

View File

@ -54,6 +54,8 @@ void DlgItem_GetText( HWND hwnd, int dlgId, string& dest )
}
}
// strips path information so that absolute paths are reduced to relative paths
// where appropriate.
static const char* _stripPathInfo( const char* src )
{
const char* retval = src;
@ -65,7 +67,9 @@ static const char* _stripPathInfo( const char* src )
workingfold++;
}
if( *retval == 0 ) return src;
// If a difference is found before we reach the end of our pcsx2 working folder, it
// means we need to use the fully absolute path form the user.
if( *workingfold != 0 ) return src;
while( (*retval != 0) && (*retval == '\\') ) retval++;
@ -246,7 +250,7 @@ void IniFile::MemcardSettings( PcsxConfig& conf )
Path::Combine( g_WorkingFolder, m_Default_MemcardsDir[0] ) );
Entry( "Slot2_Path", conf.Mcd[1].Filename,
Path::Combine( g_WorkingFolder, m_Default_MemcardsDir[0] ) );
Path::Combine( g_WorkingFolder, m_Default_MemcardsDir[1] ) );
Entry( "Slot1_Enabled", conf.Mcd[0].Enabled, true );
Entry( "Slot2_Enabled", conf.Mcd[1].Enabled, true );

View File

@ -67,7 +67,7 @@ void SetCPUState(u32 sseMXCSR, u32 sseVUMXCSR)
}
/////////////////////////////////////////////////////////////////////
//
// MMX Register Freezing
#ifndef __INTEL_COMPILER
extern "C"
{
@ -86,18 +86,20 @@ __forceinline void FreezeMMXRegs_(int save)
#ifdef _MSC_VER
__asm {
movntq mmword ptr [g_globalMMXData + 0], mm0
movntq mmword ptr [g_globalMMXData + 8], mm1
movntq mmword ptr [g_globalMMXData + 16], mm2
movntq mmword ptr [g_globalMMXData + 24], mm3
movntq mmword ptr [g_globalMMXData + 32], mm4
movntq mmword ptr [g_globalMMXData + 40], mm5
movntq mmword ptr [g_globalMMXData + 48], mm6
movntq mmword ptr [g_globalMMXData + 56], mm7
mov ecx, offset g_globalMMXData
movntq mmword ptr [ecx+0], mm0
movntq mmword ptr [ecx+8], mm1
movntq mmword ptr [ecx+16], mm2
movntq mmword ptr [ecx+24], mm3
movntq mmword ptr [ecx+32], mm4
movntq mmword ptr [ecx+40], mm5
movntq mmword ptr [ecx+48], mm6
movntq mmword ptr [ecx+56], mm7
emms
}
#else
__asm__(".intel_syntax noprefix\n"
__asm__(
".intel_syntax noprefix\n"
"movq [%0+0x00], mm0\n"
"movq [%0+0x08], mm1\n"
"movq [%0+0x10], mm2\n"
@ -107,7 +109,8 @@ __forceinline void FreezeMMXRegs_(int save)
"movq [%0+0x30], mm6\n"
"movq [%0+0x38], mm7\n"
"emms\n"
".att_syntax\n" : : "r"(g_globalMMXData) );
".att_syntax\n" : : "r"(g_globalMMXData)
);
#endif
}
@ -123,18 +126,20 @@ __forceinline void FreezeMMXRegs_(int save)
#ifdef _MSC_VER
__asm {
movq mm0, mmword ptr [g_globalMMXData + 0]
movq mm1, mmword ptr [g_globalMMXData + 8]
movq mm2, mmword ptr [g_globalMMXData + 16]
movq mm3, mmword ptr [g_globalMMXData + 24]
movq mm4, mmword ptr [g_globalMMXData + 32]
movq mm5, mmword ptr [g_globalMMXData + 40]
movq mm6, mmword ptr [g_globalMMXData + 48]
movq mm7, mmword ptr [g_globalMMXData + 56]
mov ecx, offset g_globalMMXData
movq mm0, mmword ptr [ecx+0]
movq mm1, mmword ptr [ecx+8]
movq mm2, mmword ptr [ecx+16]
movq mm3, mmword ptr [ecx+24]
movq mm4, mmword ptr [ecx+32]
movq mm5, mmword ptr [ecx+40]
movq mm6, mmword ptr [ecx+48]
movq mm7, mmword ptr [ecx+56]
emms
}
#else
__asm__(".intel_syntax noprefix\n"
__asm__(
".intel_syntax noprefix\n"
"movq mm0, [%0+0x00]\n"
"movq mm1, [%0+0x08]\n"
"movq mm2, [%0+0x10]\n"
@ -144,19 +149,21 @@ __forceinline void FreezeMMXRegs_(int save)
"movq mm6, [%0+0x30]\n"
"movq mm7, [%0+0x38]\n"
"emms\n"
".att_syntax\n" : : "r"(g_globalMMXData) );
".att_syntax\n" : : "r"(g_globalMMXData)
);
#endif
}
}
//////////////////////////////////////////////////////////////////////
// XMM Register Freezing
__forceinline void FreezeXMMRegs_(int save)
{
//SysPrintf("FreezeXMMRegs_(%d); [%d]\n", save, g_globalXMMSaved);
assert( g_EEFreezeRegs );
if( save ) {
if( save )
{
g_globalXMMSaved++;
if( g_globalXMMSaved > 1 ){
//SysPrintf("XMM Already saved\n");
@ -166,18 +173,20 @@ __forceinline void FreezeXMMRegs_(int save)
#ifdef _MSC_VER
__asm {
movaps xmmword ptr [g_globalXMMData + 0x00], xmm0
movaps xmmword ptr [g_globalXMMData + 0x10], xmm1
movaps xmmword ptr [g_globalXMMData + 0x20], xmm2
movaps xmmword ptr [g_globalXMMData + 0x30], xmm3
movaps xmmword ptr [g_globalXMMData + 0x40], xmm4
movaps xmmword ptr [g_globalXMMData + 0x50], xmm5
movaps xmmword ptr [g_globalXMMData + 0x60], xmm6
movaps xmmword ptr [g_globalXMMData + 0x70], xmm7
mov ecx, offset g_globalXMMData
movaps xmmword ptr [ecx+0x00], xmm0
movaps xmmword ptr [ecx+0x10], xmm1
movaps xmmword ptr [ecx+0x20], xmm2
movaps xmmword ptr [ecx+0x30], xmm3
movaps xmmword ptr [ecx+0x40], xmm4
movaps xmmword ptr [ecx+0x50], xmm5
movaps xmmword ptr [ecx+0x60], xmm6
movaps xmmword ptr [ecx+0x70], xmm7
}
#else
__asm__(".intel_syntax noprefix\n"
__asm__(
".intel_syntax noprefix\n"
"movaps [%0+0x00], xmm0\n"
"movaps [%0+0x10], xmm1\n"
"movaps [%0+0x20], xmm2\n"
@ -187,10 +196,12 @@ __forceinline void FreezeXMMRegs_(int save)
"movaps [%0+0x60], xmm6\n"
"movaps [%0+0x70], xmm7\n"
".att_syntax\n" : : "r"(g_globalXMMData) );
);
#endif // _MSC_VER
}
else {
else
{
if( g_globalXMMSaved==0 )
{
//SysPrintf("XMM Regs not saved!\n");
@ -202,19 +213,22 @@ __forceinline void FreezeXMMRegs_(int save)
if( g_globalXMMSaved > 0 ) return;
#ifdef _MSC_VER
__asm {
movaps xmm0, xmmword ptr [g_globalXMMData + 0x00]
movaps xmm1, xmmword ptr [g_globalXMMData + 0x10]
movaps xmm2, xmmword ptr [g_globalXMMData + 0x20]
movaps xmm3, xmmword ptr [g_globalXMMData + 0x30]
movaps xmm4, xmmword ptr [g_globalXMMData + 0x40]
movaps xmm5, xmmword ptr [g_globalXMMData + 0x50]
movaps xmm6, xmmword ptr [g_globalXMMData + 0x60]
movaps xmm7, xmmword ptr [g_globalXMMData + 0x70]
__asm
{
mov ecx, offset g_globalXMMData
movaps xmm0, xmmword ptr [ecx+0x00]
movaps xmm1, xmmword ptr [ecx+0x10]
movaps xmm2, xmmword ptr [ecx+0x20]
movaps xmm3, xmmword ptr [ecx+0x30]
movaps xmm4, xmmword ptr [ecx+0x40]
movaps xmm5, xmmword ptr [ecx+0x50]
movaps xmm6, xmmword ptr [ecx+0x60]
movaps xmm7, xmmword ptr [ecx+0x70]
}
#else
__asm__(".intel_syntax noprefix\n"
__asm__(
".intel_syntax noprefix\n"
"movaps xmm0, [%0+0x00]\n"
"movaps xmm1, [%0+0x10]\n"
"movaps xmm2, [%0+0x20]\n"
@ -224,6 +238,7 @@ __forceinline void FreezeXMMRegs_(int save)
"movaps xmm6, [%0+0x60]\n"
"movaps xmm7, [%0+0x70]\n"
".att_syntax\n" : : "r"(g_globalXMMData) );
);
#endif // _MSC_VER
}