mirror of https://github.com/PCSX2/pcsx2.git
Refactoring:
* Added __fi and __ri, which are abbreviations for __forceinline and __releaseinline.
* Added some static qualifiers to functions in mVU, MMI ops, and others where appropriate.
* Removed some unnecessary __fastcall qualifiers (since GCC gets funny sometimes when you combine __fastcall and inlining).
* Made _1mb, _16mb, _1gb values common to all emulation code (moved from newVif/mvu to Common.h) -- they're useful! :)

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3624 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent dceb9a78bb
commit 8375b0a989
@ -41,24 +41,9 @@
# define ArraySize(x) (sizeof(x)/sizeof((x)[0]))
#endif

-//////////////////////////////////////////////////////////////////////////////////////////
-// __releaseinline -- a forceinline macro that is enabled for RELEASE/PUBLIC builds ONLY.
-// This is useful because forceinline can make certain types of debugging problematic since
-// functions that look like they should be called won't breakpoint since their code is
-// inlined, and it can make stack traces confusing or near useless.
-//
-// Use __releaseinline for things which are generally large functions where trace debugging
-// from Devel builds is likely useful; but which should be inlined in an optimized Release
-// environment.
-//
-#ifdef PCSX2_DEVBUILD
-# define __releaseinline
-#else
-# define __releaseinline __forceinline
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// jASSUME - give hints to the optimizer
+// --------------------------------------------------------------------------------------
+// jASSUME - give hints to the optimizer [obsolete, use pxAssume() instead]
+// --------------------------------------------------------------------------------------
// This is primarily useful for the default case switch optimizer, which enables VC to
// generate more compact switches.
//

@ -83,7 +68,9 @@
# endif
#endif

-//////////////////////////////////////////////////////////////////////////////////////////
+// --------------------------------------------------------------------------------------
// C_ASSERT
+// --------------------------------------------------------------------------------------
// compile-time assertion; usable at static variable define level.
// (typically used to confirm the correct sizeof() for struct types where size
// restaints must be enforced).

@ -92,9 +79,9 @@
# define C_ASSERT(e) typedef char __C_ASSERT__[(e)?1:-1]
#endif

-//////////////////////////////////////////////////////////////////////////////////////////
+// --------------------------------------------------------------------------------------
// Dev / Debug conditionals - Consts for using if() statements instead of uglier #ifdef.
-//
+// --------------------------------------------------------------------------------------
// Note: Using if() optimizes nicely in Devel and Release builds, but will generate extra
// code overhead in debug builds (since debug neither inlines, nor optimizes out const-
// level conditionals). Normally not a concern, but if you stick if( IsDevbuild ) in
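
Usage sketch for the two facilities above (illustrative only, not part of the diff; the struct and the logging call are hypothetical, and the build const is assumed to follow the IsDevbuild naming the comment refers to):

	struct SomeDmaTag { u32 addr; u32 qwc; u64 pad; };
	C_ASSERT( sizeof(SomeDmaTag) == 16 );	// compile-time size guard, usable at file scope

	// Plain if() instead of an #ifdef block; the branch folds away in Release:
	if( IsDevBuild )
		Console.WriteLn( "dev-only diagnostic" );
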
@ -146,9 +133,9 @@
# define pxReleaseCode(code) code
#endif

-//////////////////////////////////////////////////////////////////////////////////////////
+// --------------------------------------------------------------------------------------
// __aligned / __aligned16 / __pagealigned
-//
+// --------------------------------------------------------------------------------------
// GCC Warning! The GCC linker (LD) typically fails to assure alignment of class members.
// If you want alignment to be assured, the variable must either be a member of a struct
// or a static global.

@ -167,9 +154,9 @@
#define PCSX2_PAGESIZE 0x1000
static const int __pagesize = PCSX2_PAGESIZE;

-//////////////////////////////////////////////////////////////////////////////////////////
+// --------------------------------------------------------------------------------------
// Structure Packing (__packed)
-//
+// --------------------------------------------------------------------------------------
// Current Method:
// Use a combination of embedded compiler-specific #pragma mess in conjunction with a
// __packed macro. The former appeases the MSVC gods, the latter appeases the GCC gods.

@ -296,8 +283,28 @@ static const int __pagesize = PCSX2_PAGESIZE;
#endif // end GCC-specific section.

#ifndef THE_UNBEARABLE_LIGHTNESS_OF_BEING_GCC_4_4_0
-# define __nooptimization
+# define __nooptimization // Pretty sure this is obsolete now, since we fixed __asm contraints and stuff. --air
#endif

+// --------------------------------------------------------------------------------------
+// __releaseinline / __ri -- a forceinline macro that is enabled for RELEASE/PUBLIC builds ONLY.
+// --------------------------------------------------------------------------------------
+// This is useful because forceinline can make certain types of debugging problematic since
+// functions that look like they should be called won't breakpoint since their code is
+// inlined, and it can make stack traces confusing or near useless.
+//
+// Use __releaseinline for things which are generally large functions where trace debugging
+// from Devel builds is likely useful; but which should be inlined in an optimized Release
+// environment.
+//
+#ifdef PCSX2_DEVBUILD
+# define __releaseinline
+#else
+# define __releaseinline __forceinline
+#endif
+
+#define __ri __releaseinline
+#define __fi __forceinline
+
+
#endif
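
A minimal usage sketch for the two new abbreviations (illustrative; both function names are hypothetical, not from the commit):

	// Tiny hot-path helper: force-inlined in every build type.
	__fi u32 UpperHalf( u64 val )
	{
		return (u32)(val >> 32);
	}

	// Bigger function: a normal call in Devel builds (breakpoints and stack
	// traces keep working), force-inlined only in Release/Public builds.
	__ri void RecompileBlock( u32 startpc )
	{
		// ... large body elided ...
	}
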
@ -80,18 +80,18 @@ namespace Threading
// EnumToString(value);
//
#define ImplementEnumOperators( enumName ) \
-	static __forceinline enumName& operator++ ( enumName& src ) { src = (enumName)((int)src+1); return src; } \
-	static __forceinline enumName& operator-- ( enumName& src ) { src = (enumName)((int)src-1); return src; } \
-	static __forceinline enumName operator++ ( enumName& src, int ) { enumName orig = src; src = (enumName)((int)src+1); return orig; } \
-	static __forceinline enumName operator-- ( enumName& src, int ) { enumName orig = src; src = (enumName)((int)src-1); return orig; } \
+	static __fi enumName& operator++ ( enumName& src ) { src = (enumName)((int)src+1); return src; } \
+	static __fi enumName& operator-- ( enumName& src ) { src = (enumName)((int)src-1); return src; } \
+	static __fi enumName operator++ ( enumName& src, int ) { enumName orig = src; src = (enumName)((int)src+1); return orig; } \
+	static __fi enumName operator-- ( enumName& src, int ) { enumName orig = src; src = (enumName)((int)src-1); return orig; } \
 \
-	static __forceinline bool operator< ( const enumName& left, const pxEnumEnd_t& ) { return (int)left < enumName##_COUNT; } \
-	static __forceinline bool operator!=( const enumName& left, const pxEnumEnd_t& ) { return (int)left != enumName##_COUNT; } \
-	static __forceinline bool operator==( const enumName& left, const pxEnumEnd_t& ) { return (int)left == enumName##_COUNT; } \
+	static __fi bool operator< ( const enumName& left, const pxEnumEnd_t& ) { return (int)left < enumName##_COUNT; } \
+	static __fi bool operator!=( const enumName& left, const pxEnumEnd_t& ) { return (int)left != enumName##_COUNT; } \
+	static __fi bool operator==( const enumName& left, const pxEnumEnd_t& ) { return (int)left == enumName##_COUNT; } \
 \
-	static __forceinline bool EnumIsValid( enumName id ) { \
+	static __fi bool EnumIsValid( enumName id ) { \
		return ((int)id >= enumName##_FIRST) && ((int)id < enumName##_COUNT); } \
-	static __forceinline bool EnumAssert( enumName id ) { \
+	static __fi bool EnumAssert( enumName id ) { \
		return pxAssert( EnumIsValid(id) ); } \
 \
	extern const wxChar* EnumToString( enumName id )
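
Sketch of how the macro is consumed (hypothetical enum, not from this commit; it assumes the enumName##_FIRST / enumName##_COUNT naming convention the macro body relies on, plus the pxEnumEnd sentinel of type pxEnumEnd_t from the same header):

	enum MenuId
	{
		MenuId_FIRST = 0,
		MenuId_Exit = MenuId_FIRST,
		MenuId_About,
		MenuId_COUNT
	};

	ImplementEnumOperators( MenuId );

	// enables natural iteration:
	//   for( MenuId id = MenuId_FIRST; id != pxEnumEnd; ++id ) { ... }
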
@ -63,7 +63,7 @@ typename EventSource<ListenerType>::ListenerIterator EventSource<ListenerType>::


template< typename ListenerType >
-__forceinline void EventSource<ListenerType>::_DispatchRaw( ListenerIterator iter, const ListenerIterator& iend, const EvtParams& evtparams )
+__fi void EventSource<ListenerType>::_DispatchRaw( ListenerIterator iter, const ListenerIterator& iend, const EvtParams& evtparams )
{
	while( iter != iend )
	{

@ -20,7 +20,7 @@
// memset16, etc.

template< u32 data, typename T >
-static __forceinline void memset32( T& obj )
+static __fi void memset32( T& obj )
{
	// this function works on 32-bit aligned lengths of data only.
	// If the data length is not a factor of 32 bits, the C++ optimizing compiler will

@ -34,19 +34,19 @@ static __forceinline void memset32( T& obj )
}

template< uint size >
-static __forceinline void memzero_ptr( void* dest )
+static __fi void memzero_ptr( void* dest )
{
	memset( dest, 0, size );
}

template< typename T >
-static __forceinline void memzero( T& obj )
+static __fi void memzero( T& obj )
{
	memset( &obj, 0, sizeof( T ) );
}

template< u8 data, typename T >
-static __forceinline void memset8( T& obj )
+static __fi void memset8( T& obj )
{
	// Aligned sizes use the optimized 32 bit inline memset. Unaligned sizes use memset.
	if( (sizeof(T) & 0x3) != 0 )

@ -56,7 +56,7 @@ static __forceinline void memset8( T& obj )
}

template< u16 data, typename T >
-static __forceinline void memset16( T& obj )
+static __fi void memset16( T& obj )
{
	if( (sizeof(T) & 0x3) != 0 )
		_memset16_unaligned( &obj, data, sizeof( T ) );

@ -67,7 +67,7 @@ static __forceinline void memset16( T& obj )

// An optimized memset for 8 bit destination data.
template< u8 data, size_t bytes >
-static __forceinline void memset_8( void *dest )
+static __fi void memset_8( void *dest )
{
	if( bytes == 0 ) return;

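
Usage sketch for the templated clears above (illustrative; the struct is hypothetical):

	struct SomeRegs { u32 stat, mark, cycle, mode; };

	SomeRegs regs;
	memzero( regs );		// zero-clear, size deduced from the type
	memset8<0xcc>( regs );		// fill with a compile-time byte pattern
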
@ -60,7 +60,7 @@

// This is an implementation of the memzero_ptr fast memset routine (for zero-clears only).
template< size_t _bytes >
-static __forceinline void memzero_ptr( void *dest )
+static __fi void memzero_ptr( void *dest )
{
	if( MZFbytes == 0 ) return;

@ -247,7 +247,7 @@ static __forceinline void memzero_ptr( void *dest )

// An optimized memset for 8 bit destination data.
template< u8 data, size_t _bytes >
-static __forceinline void memset_8( void *dest )
+static __fi void memset_8( void *dest )
{
	if( MZFbytes == 0 ) return;

@ -374,7 +374,7 @@ static __forceinline void memset_8( void *dest )
}

template< u16 data, size_t _bytes >
-static __forceinline void memset_16( void *dest )
+static __fi void memset_16( void *dest )
{
	if( MZFbytes == 0 ) return;

@ -462,7 +462,7 @@ static __forceinline void memset_16( void *dest )
}

template< u32 data, size_t MZFbytes >
-static __forceinline void memset_32( void *dest )
+static __fi void memset_32( void *dest )
{
	if( MZFbytes == 0 ) return;

@ -547,28 +547,28 @@ static __forceinline void memset_32( void *dest )
// Structures, static arrays, etc. No need to include sizeof() crap, this does it automatically
// for you!
template< typename T >
-static __forceinline void memzero( T& object )
+static __fi void memzero( T& object )
{
	memzero_ptr<sizeof(T)>( &object );
}

// This method clears an object with the given 8 bit value.
template< u8 data, typename T >
-static __forceinline void memset8( T& object )
+static __fi void memset8( T& object )
{
	memset_8<data, sizeof(T)>( &object );
}

// This method clears an object with the given 16 bit value.
template< u16 data, typename T >
-static __forceinline void memset16( T& object )
+static __fi void memset16( T& object )
{
	memset_16<data, sizeof(T)>( &object );
}

// This method clears an object with the given 32 bit value.
template< u32 data, typename T >
-static __forceinline void memset32( T& object )
+static __fi void memset32( T& object )
{
	memset_32<data, sizeof(T)>( &object );
}
@ -138,12 +138,12 @@ struct pxStretchType
	}
};

-static __forceinline wxSizerFlags pxProportion( int prop )
+static __fi wxSizerFlags pxProportion( int prop )
{
	return wxSizerFlags( prop );
}

-static __forceinline wxSizerFlags pxBorder( int dir=wxALL, int pad=pxSizerFlags::StdPadding )
+static __fi wxSizerFlags pxBorder( int dir=wxALL, int pad=pxSizerFlags::StdPadding )
{
	return wxSizerFlags().Border( dir, pad );
}
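
Usage sketch for the two wxSizerFlags helpers (illustrative; the sizer and window variables are hypothetical):

	wxBoxSizer& sizer( *new wxBoxSizer( wxVERTICAL ) );
	sizer.Add( somePanel, pxProportion(1) );	// same as wxSizerFlags( 1 )
	sizer.Add( someLabel, pxBorder() );		// wxALL border at StdPadding
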
@ -22,7 +22,7 @@ namespace x86Emitter {
#ifdef __GNUG__
// GCC has a bug that causes the templated function handler for Jmp/Call emitters to generate
// bad asm code. (error is something like "7#*_uber_379s_mangled_$&02_name is already defined!")
-// Using GCC's always_inline attribute fixes it. This differs from __forceinline in that it
+// Using GCC's always_inline attribute fixes it. This differs from __fi in that it
// inlines *even in debug builds* which is (usually) undesirable.
// ... except when it avoids compiler bugs.
# define __always_inline_tmpl_fail __attribute__((always_inline))

@ -45,7 +45,7 @@ struct xImpl_JmpCall

// Special form for calling functions. This form automatically resolves the
// correct displacement based on the size of the instruction being generated.
-template< typename T > __forceinline __always_inline_tmpl_fail
+template< typename T > __fi __always_inline_tmpl_fail
void operator()( T* func ) const
{
	if( isJmp )

@ -37,7 +37,7 @@
// definitions in the .h file because of inter-dependencies with other classes.
// (score one for C++!!)
//
-// In order for MSVC to work correctly with __forceinline on class members,
+// In order for MSVC to work correctly with __fi on class members,
// however, we need to include these methods into all source files which might
// reference them. Without this MSVC generates linker errors. Or, in other words,
// global optimization fails to resolve the externals and junk.
@ -50,51 +50,51 @@ namespace x86Emitter
// x86Register Method Implementations (inlined!)
// --------------------------------------------------------------------------------------

-__forceinline xAddressInfo xAddressReg::operator+( const xAddressReg& right ) const
+__fi xAddressInfo xAddressReg::operator+( const xAddressReg& right ) const
{
	pxAssertMsg( Id != -1, "Uninitialized x86 register." );
	return xAddressInfo( *this, right );
}

-__forceinline xAddressInfo xAddressReg::operator+( const xAddressInfo& right ) const
+__fi xAddressInfo xAddressReg::operator+( const xAddressInfo& right ) const
{
	pxAssertMsg( Id != -1, "Uninitialized x86 register." );
	return right + *this;
}

-__forceinline xAddressInfo xAddressReg::operator+( s32 right ) const
+__fi xAddressInfo xAddressReg::operator+( s32 right ) const
{
	pxAssertMsg( Id != -1, "Uninitialized x86 register." );
	return xAddressInfo( *this, right );
}

-__forceinline xAddressInfo xAddressReg::operator+( const void* right ) const
+__fi xAddressInfo xAddressReg::operator+( const void* right ) const
{
	pxAssertMsg( Id != -1, "Uninitialized x86 register." );
	return xAddressInfo( *this, (s32)right );
}

// ------------------------------------------------------------------------
-__forceinline xAddressInfo xAddressReg::operator-( s32 right ) const
+__fi xAddressInfo xAddressReg::operator-( s32 right ) const
{
	pxAssertMsg( Id != -1, "Uninitialized x86 register." );
	return xAddressInfo( *this, -right );
}

-__forceinline xAddressInfo xAddressReg::operator-( const void* right ) const
+__fi xAddressInfo xAddressReg::operator-( const void* right ) const
{
	pxAssertMsg( Id != -1, "Uninitialized x86 register." );
	return xAddressInfo( *this, -(s32)right );
}

// ------------------------------------------------------------------------
-__forceinline xAddressInfo xAddressReg::operator*( u32 right ) const
+__fi xAddressInfo xAddressReg::operator*( u32 right ) const
{
	pxAssertMsg( Id != -1, "Uninitialized x86 register." );
	return xAddressInfo( xEmptyReg, *this, right );
}

-__forceinline xAddressInfo xAddressReg::operator<<( u32 shift ) const
+__fi xAddressInfo xAddressReg::operator<<( u32 shift ) const
{
	pxAssertMsg( Id != -1, "Uninitialized x86 register." );
	return xAddressInfo( xEmptyReg, *this, 1<<shift );

@ -185,30 +185,30 @@ namespace x86Emitter
// the target (efficient!)
//

-template< typename T > __forceinline void xJE( T* func ) { xJcc( Jcc_Equal, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJZ( T* func ) { xJcc( Jcc_Zero, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJNE( T* func ) { xJcc( Jcc_NotEqual, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJNZ( T* func ) { xJcc( Jcc_NotZero, (void*)(uptr)func ); }
+template< typename T > __fi void xJE( T* func ) { xJcc( Jcc_Equal, (void*)(uptr)func ); }
+template< typename T > __fi void xJZ( T* func ) { xJcc( Jcc_Zero, (void*)(uptr)func ); }
+template< typename T > __fi void xJNE( T* func ) { xJcc( Jcc_NotEqual, (void*)(uptr)func ); }
+template< typename T > __fi void xJNZ( T* func ) { xJcc( Jcc_NotZero, (void*)(uptr)func ); }

-template< typename T > __forceinline void xJO( T* func ) { xJcc( Jcc_Overflow, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJNO( T* func ) { xJcc( Jcc_NotOverflow, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJC( T* func ) { xJcc( Jcc_Carry, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJNC( T* func ) { xJcc( Jcc_NotCarry, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJS( T* func ) { xJcc( Jcc_Signed, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJNS( T* func ) { xJcc( Jcc_Unsigned, (void*)(uptr)func ); }
+template< typename T > __fi void xJO( T* func ) { xJcc( Jcc_Overflow, (void*)(uptr)func ); }
+template< typename T > __fi void xJNO( T* func ) { xJcc( Jcc_NotOverflow, (void*)(uptr)func ); }
+template< typename T > __fi void xJC( T* func ) { xJcc( Jcc_Carry, (void*)(uptr)func ); }
+template< typename T > __fi void xJNC( T* func ) { xJcc( Jcc_NotCarry, (void*)(uptr)func ); }
+template< typename T > __fi void xJS( T* func ) { xJcc( Jcc_Signed, (void*)(uptr)func ); }
+template< typename T > __fi void xJNS( T* func ) { xJcc( Jcc_Unsigned, (void*)(uptr)func ); }

-template< typename T > __forceinline void xJPE( T* func ) { xJcc( Jcc_ParityEven, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJPO( T* func ) { xJcc( Jcc_ParityOdd, (void*)(uptr)func ); }
+template< typename T > __fi void xJPE( T* func ) { xJcc( Jcc_ParityEven, (void*)(uptr)func ); }
+template< typename T > __fi void xJPO( T* func ) { xJcc( Jcc_ParityOdd, (void*)(uptr)func ); }

-template< typename T > __forceinline void xJL( T* func ) { xJcc( Jcc_Less, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJLE( T* func ) { xJcc( Jcc_LessOrEqual, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJG( T* func ) { xJcc( Jcc_Greater, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJGE( T* func ) { xJcc( Jcc_GreaterOrEqual, (void*)(uptr)func ); }
+template< typename T > __fi void xJL( T* func ) { xJcc( Jcc_Less, (void*)(uptr)func ); }
+template< typename T > __fi void xJLE( T* func ) { xJcc( Jcc_LessOrEqual, (void*)(uptr)func ); }
+template< typename T > __fi void xJG( T* func ) { xJcc( Jcc_Greater, (void*)(uptr)func ); }
+template< typename T > __fi void xJGE( T* func ) { xJcc( Jcc_GreaterOrEqual, (void*)(uptr)func ); }

-template< typename T > __forceinline void xJB( T* func ) { xJcc( Jcc_Below, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJBE( T* func ) { xJcc( Jcc_BelowOrEqual, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJA( T* func ) { xJcc( Jcc_Above, (void*)(uptr)func ); }
-template< typename T > __forceinline void xJAE( T* func ) { xJcc( Jcc_AboveOrEqual, (void*)(uptr)func ); }
+template< typename T > __fi void xJB( T* func ) { xJcc( Jcc_Below, (void*)(uptr)func ); }
+template< typename T > __fi void xJBE( T* func ) { xJcc( Jcc_BelowOrEqual, (void*)(uptr)func ); }
+template< typename T > __fi void xJA( T* func ) { xJcc( Jcc_Above, (void*)(uptr)func ); }
+template< typename T > __fi void xJAE( T* func ) { xJcc( Jcc_AboveOrEqual, (void*)(uptr)func ); }

// ------------------------------------------------------------------------
// Forward Jump Helpers (act as labels!)
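
Call-site sketch for the templated jump helpers (illustrative; the handler is hypothetical, and it assumes the emitter's usual eax/xCMP names):

	extern void OnBranchTaken();

	xCMP( eax, 0 );
	xJNZ( OnBranchTaken );	// displacement resolved automatically from the target
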
@ -21,7 +21,7 @@
// Legacy Helper Macros and Functions (depreciated)
//------------------------------------------------------------------

-#define emitterT __forceinline
+#define emitterT __fi

using x86Emitter::xWrite8;
using x86Emitter::xWrite16;

@ -80,7 +80,7 @@ extern const char* xGetRegName( int regid, int operandSize );
//------------------------------------------------------------------
// templated version of is_s8 is required, so that u16's get correct sign extension treatment.
template< typename T >
-static __forceinline bool is_s8( T imm ) { return (s8)imm == (s32)imm; }
+static __fi bool is_s8( T imm ) { return (s8)imm == (s32)imm; }

template< typename T > void xWrite( T val );

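
Why the template matters -- a quick sketch of the sign-extension behavior:

	is_s8( (s32)-1 );	// true:  -1 round-trips through (s8)
	is_s8( (u16)0xFFFF );	// false: 65535 is not a sign-extended byte
	is_s8( (s32)127 );	// true:  fits s8 exactly
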
@ -114,14 +114,14 @@ template< typename T > void xWrite( T val );
//
// In the case of (Reg, Imm) forms, the inlining is up to the discreation of the compiler.
//
-// Note: I *intentionally* use __forceinline directly for most single-line class members,
+// Note: I *intentionally* use __fi directly for most single-line class members,
// when needed. There's no point in using __emitline in these cases since the debugger
// can't trace into single-line functions anyway.
//
#ifdef PCSX2_DEVBUILD
# define __emitinline
#else
-# define __emitinline __forceinline
+# define __emitinline __fi
#endif

// ModRM 'mod' field enumeration. Provided mostly for reference:

@ -535,15 +535,15 @@
	xAddressVoid& Add( const xAddressReg& src );
	xAddressVoid& Add( const xAddressVoid& src );

-	__forceinline xAddressVoid operator+( const xAddressReg& right ) const { return xAddressVoid( *this ).Add( right ); }
-	__forceinline xAddressVoid operator+( const xAddressVoid& right ) const { return xAddressVoid( *this ).Add( right ); }
-	__forceinline xAddressVoid operator+( s32 imm ) const { return xAddressVoid( *this ).Add( imm ); }
-	__forceinline xAddressVoid operator-( s32 imm ) const { return xAddressVoid( *this ).Add( -imm ); }
-	__forceinline xAddressVoid operator+( const void* addr ) const { return xAddressVoid( *this ).Add( (uptr)addr ); }
+	__fi xAddressVoid operator+( const xAddressReg& right ) const { return xAddressVoid( *this ).Add( right ); }
+	__fi xAddressVoid operator+( const xAddressVoid& right ) const { return xAddressVoid( *this ).Add( right ); }
+	__fi xAddressVoid operator+( s32 imm ) const { return xAddressVoid( *this ).Add( imm ); }
+	__fi xAddressVoid operator-( s32 imm ) const { return xAddressVoid( *this ).Add( -imm ); }
+	__fi xAddressVoid operator+( const void* addr ) const { return xAddressVoid( *this ).Add( (uptr)addr ); }

-	__forceinline void operator+=( const xAddressReg& right ) { Add( right ); }
-	__forceinline void operator+=( s32 imm ) { Add( imm ); }
-	__forceinline void operator-=( s32 imm ) { Add( -imm ); }
+	__fi void operator+=( const xAddressReg& right ) { Add( right ); }
+	__fi void operator+=( s32 imm ) { Add( imm ); }
+	__fi void operator-=( s32 imm ) { Add( -imm ); }
};

// --------------------------------------------------------------------------------------

@ -584,13 +584,13 @@
	xAddressInfo<BaseType>& Add( const xAddressReg& src ) { _parent::Add(src); return *this; }
	xAddressInfo<BaseType>& Add( const xAddressInfo<BaseType>& src ) { _parent::Add(src); return *this; }

-	__forceinline xAddressInfo<BaseType> operator+( const xAddressReg& right ) const { return xAddressInfo( *this ).Add( right ); }
-	__forceinline xAddressInfo<BaseType> operator+( const xAddressInfo<BaseType>& right ) const { return xAddressInfo( *this ).Add( right ); }
-	__forceinline xAddressInfo<BaseType> operator+( s32 imm ) const { return xAddressInfo( *this ).Add( imm ); }
-	__forceinline xAddressInfo<BaseType> operator-( s32 imm ) const { return xAddressInfo( *this ).Add( -imm ); }
-	__forceinline xAddressInfo<BaseType> operator+( const void* addr ) const { return xAddressInfo( *this ).Add( (uptr)addr ); }
+	__fi xAddressInfo<BaseType> operator+( const xAddressReg& right ) const { return xAddressInfo( *this ).Add( right ); }
+	__fi xAddressInfo<BaseType> operator+( const xAddressInfo<BaseType>& right ) const { return xAddressInfo( *this ).Add( right ); }
+	__fi xAddressInfo<BaseType> operator+( s32 imm ) const { return xAddressInfo( *this ).Add( imm ); }
+	__fi xAddressInfo<BaseType> operator-( s32 imm ) const { return xAddressInfo( *this ).Add( -imm ); }
+	__fi xAddressInfo<BaseType> operator+( const void* addr ) const { return xAddressInfo( *this ).Add( (uptr)addr ); }

-	__forceinline void operator+=( const xAddressInfo<BaseType>& right ) { Add( right ); }
+	__fi void operator+=( const xAddressInfo<BaseType>& right ) { Add( right ); }
};

typedef xAddressInfo<u128> xAddress128;

@ -599,25 +599,25 @@
typedef xAddressInfo<u16> xAddress16;
typedef xAddressInfo<u8> xAddress8;

-static __forceinline xAddressVoid operator+( const void* addr, const xAddressVoid& right )
+static __fi xAddressVoid operator+( const void* addr, const xAddressVoid& right )
{
	return right + addr;
}

-static __forceinline xAddressVoid operator+( s32 addr, const xAddressVoid& right )
+static __fi xAddressVoid operator+( s32 addr, const xAddressVoid& right )
{
	return right + addr;
}

template< typename OperandType >
-static __forceinline xAddressInfo<OperandType> operator+( const void* addr, const xAddressInfo<OperandType>& right )
+static __fi xAddressInfo<OperandType> operator+( const void* addr, const xAddressInfo<OperandType>& right )
{
	//return xAddressInfo<OperandType>( (sptr)addr ).Add( reg );
	return right + addr;
}

template< typename OperandType >
-static __forceinline xAddressInfo<OperandType> operator+( s32 addr, const xAddressInfo<OperandType>& right )
+static __fi xAddressInfo<OperandType> operator+( s32 addr, const xAddressInfo<OperandType>& right )
{
	return right + addr;
}

@ -691,8 +691,8 @@
	return xAddressVoid( Base, Index, Scale, Displacement );
}

-	__forceinline xIndirectVoid operator+( const s32 imm ) const { return xIndirectVoid( *this ).Add( imm ); }
-	__forceinline xIndirectVoid operator-( const s32 imm ) const { return xIndirectVoid( *this ).Add( -imm ); }
+	__fi xIndirectVoid operator+( const s32 imm ) const { return xIndirectVoid( *this ).Add( imm ); }
+	__fi xIndirectVoid operator-( const s32 imm ) const { return xIndirectVoid( *this ).Add( -imm ); }

protected:
	void Reduce();

@ -717,8 +717,8 @@
	return *this;
}

-	__forceinline xIndirect<OperandType> operator+( const s32 imm ) const { return xIndirect( *this ).Add( imm ); }
-	__forceinline xIndirect<OperandType> operator-( const s32 imm ) const { return xIndirect( *this ).Add( -imm ); }
+	__fi xIndirect<OperandType> operator+( const s32 imm ) const { return xIndirect( *this ).Add( imm ); }
+	__fi xIndirect<OperandType> operator-( const s32 imm ) const { return xIndirect( *this ).Add( -imm ); }

	bool operator==( const xIndirect<OperandType>& src ) const
	{

@ -963,12 +963,12 @@
	}
};

-static __forceinline xAddressVoid operator+( const void* addr, const xAddressReg& reg )
+static __fi xAddressVoid operator+( const void* addr, const xAddressReg& reg )
{
	return reg + (sptr)addr;
}

-static __forceinline xAddressVoid operator+( s32 addr, const xAddressReg& reg )
+static __fi xAddressVoid operator+( s32 addr, const xAddressReg& reg )
{
	return reg + (sptr)addr;
}

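
These overloads are what let address expressions compose naturally at emitter call sites -- sketch, assuming the emitter's usual register and ptr32 names:

	// base=ebx, index=ecx, scale=4, displacement=0x10:
	xMOV( eax, ptr32[ ebx + ecx*4 + 0x10 ] );
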
@ -60,7 +60,7 @@ void* __fastcall pcsx2_aligned_realloc(void* handle, size_t size, size_t align)
	return newbuf;
}

-__forceinline void pcsx2_aligned_free(void* pmem)
+__fi void pcsx2_aligned_free(void* pmem)
{
	if( pmem == NULL ) return;
	AlignedMallocHeader* header = (AlignedMallocHeader*)((uptr)pmem - headsize);

@ -73,7 +73,7 @@ __forceinline void pcsx2_aligned_free(void* pmem)

// Special unaligned memset used when all other optimized memsets fail (it's called from
// memzero_obj and stuff).
-__forceinline void _memset16_unaligned( void* dest, u16 data, size_t size )
+__fi void _memset16_unaligned( void* dest, u16 data, size_t size )
{
	pxAssume( (size & 0x1) == 0 );

@ -82,7 +82,7 @@ __forceinline void _memset16_unaligned( void* dest, u16 data, size_t size )
		*dst = data;
}

-__forceinline void HostSys::Munmap( void* base, u32 size )
+__fi void HostSys::Munmap( void* base, u32 size )
{
	Munmap( (uptr)base, size );
}

@ -108,7 +108,7 @@ const IConsoleWriter ConsoleWriter_Null =
// --------------------------------------------------------------------------------------

#ifdef __LINUX__
-static __forceinline const wxChar* GetLinuxConsoleColor(ConsoleColors color)
+static __fi const wxChar* GetLinuxConsoleColor(ConsoleColors color)
{
	switch(color)
	{

@ -36,7 +36,7 @@ static wxString GetTranslation( const wxChar* msg )
#ifdef PCSX2_DEVBUILD
# define DEVASSERT_INLINE __noinline
#else
-# define DEVASSERT_INLINE __forceinline
+# define DEVASSERT_INLINE __fi
#endif

// Using a threadlocal assertion guard. Separate threads can assert at the same time.

@ -123,7 +123,7 @@ DEVASSERT_INLINE void pxOnAssert( const DiagnosticOrigin& origin, const wxChar*
	if( trapit ) { pxTrap(); }
}

-__forceinline void pxOnAssert( const DiagnosticOrigin& origin, const char* msg)
+__fi void pxOnAssert( const DiagnosticOrigin& origin, const char* msg)
{
	pxOnAssert( origin, fromUTF8(msg) );
}
@ -141,7 +141,7 @@ public:
static bool buffer_is_avail = false;
static GlobalBufferManager< BaseTlsVariable< FastFormatBuffers< char > > > m_buffer_tls(buffer_is_avail);

-static __releaseinline void format_that_ascii_mess( SafeArray<char>& buffer, uint writepos, const char* fmt, va_list argptr )
+static __ri void format_that_ascii_mess( SafeArray<char>& buffer, uint writepos, const char* fmt, va_list argptr )
{
	while( true )
	{

@ -171,7 +171,7 @@ static __releaseinline void format_that_ascii_mess( SafeArray<char>& buffer, uin
	// though it'd be kinda nice if we did.
}

-static __releaseinline void format_that_unicode_mess( SafeArray<char>& buffer, uint writepos, const wxChar* fmt, va_list argptr)
+static __ri void format_that_unicode_mess( SafeArray<char>& buffer, uint writepos, const wxChar* fmt, va_list argptr)
{
	while( true )
	{
@ -16,7 +16,7 @@
#include "PrecompiledHeader.h"
#include <wx/gdicmn.h> // for wxPoint/wxRect stuff

-__forceinline wxString fromUTF8( const char* src )
+__fi wxString fromUTF8( const char* src )
{
	// IMPORTANT: We cannot use wxString::FromUTF8 because it *stupidly* relies on a C++ global instance of
	// wxMBConvUTF8(). C++ initializes and destroys these globals at random, so any object constructor or

@ -30,7 +30,7 @@ __forceinline wxString fromUTF8( const char* src )
	return wxString( src, wxMBConvUTF8() );
}

-__forceinline wxString fromAscii( const char* src )
+__fi wxString fromAscii( const char* src )
{
	return wxString::FromAscii( src );
}
@ -145,7 +145,7 @@ bool Threading::_WaitGui_RecursionGuard( const wxChar* name )
	return true;
}

-__forceinline void Threading::Timeslice()
+__fi void Threading::Timeslice()
{
	sched_yield();
}

@ -774,57 +774,57 @@ void Threading::WaitEvent::Wait()
// --------------------------------------------------------------------------------------
// define some overloads for InterlockedExchanges for commonly used types, like u32 and s32.

-__forceinline bool Threading::AtomicBitTestAndReset( volatile u32& bitset, u8 bit )
+__fi bool Threading::AtomicBitTestAndReset( volatile u32& bitset, u8 bit )
{
	return _interlockedbittestandreset( (volatile long*)& bitset, bit ) != 0;
}

-__forceinline u32 Threading::AtomicExchange( volatile u32& Target, u32 value )
+__fi u32 Threading::AtomicExchange( volatile u32& Target, u32 value )
{
	return _InterlockedExchange( (volatile long*)&Target, value );
}

-__forceinline u32 Threading::AtomicExchangeAdd( volatile u32& Target, u32 value )
+__fi u32 Threading::AtomicExchangeAdd( volatile u32& Target, u32 value )
{
	return _InterlockedExchangeAdd( (volatile long*)&Target, value );
}

-__forceinline u32 Threading::AtomicIncrement( volatile u32& Target )
+__fi u32 Threading::AtomicIncrement( volatile u32& Target )
{
	return _InterlockedExchangeAdd( (volatile long*)&Target, 1 );
}

-__forceinline u32 Threading::AtomicDecrement( volatile u32& Target )
+__fi u32 Threading::AtomicDecrement( volatile u32& Target )
{
	return _InterlockedExchangeAdd( (volatile long*)&Target, -1 );
}

-__forceinline s32 Threading::AtomicExchange( volatile s32& Target, s32 value )
+__fi s32 Threading::AtomicExchange( volatile s32& Target, s32 value )
{
	return _InterlockedExchange( (volatile long*)&Target, value );
}

-__forceinline s32 Threading::AtomicExchangeAdd( volatile s32& Target, s32 value )
+__fi s32 Threading::AtomicExchangeAdd( volatile s32& Target, s32 value )
{
	return _InterlockedExchangeAdd( (volatile long*)&Target, value );
}

-__forceinline s32 Threading::AtomicExchangeSub( volatile s32& Target, s32 value )
+__fi s32 Threading::AtomicExchangeSub( volatile s32& Target, s32 value )
{
	return _InterlockedExchangeAdd( (volatile long*)&Target, -value );
}

-__forceinline s32 Threading::AtomicIncrement( volatile s32& Target )
+__fi s32 Threading::AtomicIncrement( volatile s32& Target )
{
	return _InterlockedExchangeAdd( (volatile long*)&Target, 1 );
}

-__forceinline s32 Threading::AtomicDecrement( volatile s32& Target )
+__fi s32 Threading::AtomicDecrement( volatile s32& Target )
{
	return _InterlockedExchangeAdd( (volatile long*)&Target, -1 );
}

-__forceinline void* Threading::_AtomicExchangePointer( volatile uptr& target, uptr value )
+__fi void* Threading::_AtomicExchangePointer( volatile uptr& target, uptr value )
{
#ifdef _M_AMD64 // high-level atomic ops, please leave these 64 bit checks in place.
	return (void*)_InterlockedExchange64( &(volatile s64&)target, value );

@ -833,7 +833,7 @@ __forceinline void* Threading::_AtomicExchangePointer( volatile uptr& target, up
#endif
}

-__forceinline void* Threading::_AtomicCompareExchangePointer( volatile uptr& target, uptr value, uptr comparand )
+__fi void* Threading::_AtomicCompareExchangePointer( volatile uptr& target, uptr value, uptr comparand )
{
#ifdef _M_AMD64 // high-level atomic ops, please leave these 64 bit checks in place.
	return (void*)_InterlockedCompareExchange64( &(volatile s64&)target, value );

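
Usage sketch for the atomic overloads (illustrative; the ref-counted struct is hypothetical):

	struct RefCountedThing
	{
		volatile s32 m_refs;

		void AddRef() { Threading::AtomicIncrement( m_refs ); }
		void Release()
		{
			// AtomicDecrement wraps _InterlockedExchangeAdd, so it returns the
			// *previous* value; the last owner therefore sees 1 here.
			if( Threading::AtomicDecrement( m_refs ) == 1 )
				delete this;
		}
	};
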
@ -24,24 +24,24 @@

#else

-__forceinline void Threading::Sleep( int ms )
+__fi void Threading::Sleep( int ms )
{
	::Sleep( ms );
}

// For use in spin/wait loops, Acts as a hint to Intel CPUs and should, in theory
// improve performance and reduce cpu power consumption.
-__forceinline void Threading::SpinWait()
+__fi void Threading::SpinWait()
{
	__asm pause;
}

-__forceinline void Threading::StoreFence()
+__fi void Threading::StoreFence()
{
	__asm sfence;
}

-__forceinline void Threading::EnableHiresScheduler()
+__fi void Threading::EnableHiresScheduler()
{
	// This improves accuracy of Sleep() by some amount, and only adds a negligible amount of
	// overhead on modern CPUs. Typically desktops are already set pretty low, but laptops in

@ -52,7 +52,7 @@ __forceinline void Threading::EnableHiresScheduler()
	timeBeginPeriod( 1 );
}

-__forceinline void Threading::DisableHiresScheduler()
+__fi void Threading::DisableHiresScheduler()
{
	timeEndPeriod( 1 );
}
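
The sort of loop SpinWait is meant for -- a sketch, with a hypothetical flag:

	// Busy-wait for another thread to publish its result; the embedded PAUSE
	// keeps the sibling hyperthread responsive and trims power while spinning.
	while( !done_flag )
		Threading::SpinWait();
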
@ -234,7 +234,7 @@ $memcpy_final:
}

// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
-__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
+__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
	// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
	// registers will improve copy performance, because they won't. Use of XMMs is only

@ -29,7 +29,7 @@ __aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];

// this version uses SSE intrinsics to perform an inline copy. MSVC disasm shows pretty
// decent code generation on whole, but it hasn't been benchmarked at all yet --air
-__forceinline void memcpy_vibes(void * dest, const void * src, int size) {
+__fi void memcpy_vibes(void * dest, const void * src, int size) {

	float (*destxmm)[4] = (float(*)[4])dest, (*srcxmm)[4] = (float(*)[4])src;
	size_t count = size & ~15, extra = size & 15;

@ -110,7 +110,7 @@ void gen_memcpy_vibes() {
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}

-__forceinline void memcpy_vibes(void * dest, const void * src, int size) {
+__fi void memcpy_vibes(void * dest, const void * src, int size) {
	int offset = ((size & 0xf) - 7) << 4;
	_memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
}

@ -150,7 +150,7 @@ void gen_memcpy_vibes() {
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}

-__forceinline void memcpy_vibes(void * dest, const void * src, int size) {
+__fi void memcpy_vibes(void * dest, const void * src, int size) {
	_memcpy_vibes[size](dest, src);
}

@ -163,7 +163,7 @@ __forceinline void memcpy_vibes(void * dest, const void * src, int size) {

// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest need to be aligned.
-__forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
+__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
	// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
	// registers will improve copy performance, because they won't. Use of XMMs is only
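
Call-site sketch for the quadword copy (illustrative; buffers hypothetical). The count is in 128-bit units, so byte lengths are divided by 16:

	u8 src[16 * 64], dst[16 * 64];
	memcpy_amd_qwc( dst, src, sizeof(src) / 16 );	// 64 QWCs = 1024 bytes
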
@ -180,7 +180,7 @@ void xForwardJumpBase::_setTarget( uint opsize ) const
}

// returns the inverted conditional type for this Jcc condition. Ie, JNS will become JS.
-__forceinline JccComparisonType xInvertCond( JccComparisonType src )
+__fi JccComparisonType xInvertCond( JccComparisonType src )
{
	pxAssert( src != Jcc_Unknown );
	if( Jcc_Unconditional == src ) return Jcc_Unconditional;

@ -134,57 +134,57 @@ const xImplSimd_DestRegSSE xPTEST = { 0x66,0x1738 };
// nature of the functions. (so if a function expects an m32, you must use (u32*) or ptr32[]).
//

-__forceinline void xCVTDQ2PD( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0xe6 ); }
-__forceinline void xCVTDQ2PD( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0xf3, 0xe6 ); }
-__forceinline void xCVTDQ2PS( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x00, 0x5b ); }
-__forceinline void xCVTDQ2PS( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0x00, 0x5b ); }
+__fi void xCVTDQ2PD( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0xe6 ); }
+__fi void xCVTDQ2PD( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0xf3, 0xe6 ); }
+__fi void xCVTDQ2PS( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x00, 0x5b ); }
+__fi void xCVTDQ2PS( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0x00, 0x5b ); }

-__forceinline void xCVTPD2DQ( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf2, 0xe6 ); }
-__forceinline void xCVTPD2DQ( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0xf2, 0xe6 ); }
-__forceinline void xCVTPD2PI( const xRegisterMMX& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0x2d ); }
-__forceinline void xCVTPD2PI( const xRegisterMMX& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0x2d ); }
-__forceinline void xCVTPD2PS( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0x5a ); }
-__forceinline void xCVTPD2PS( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0x5a ); }
+__fi void xCVTPD2DQ( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf2, 0xe6 ); }
+__fi void xCVTPD2DQ( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0xf2, 0xe6 ); }
+__fi void xCVTPD2PI( const xRegisterMMX& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0x2d ); }
+__fi void xCVTPD2PI( const xRegisterMMX& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0x2d ); }
+__fi void xCVTPD2PS( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0x5a ); }
+__fi void xCVTPD2PS( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0x5a ); }

-__forceinline void xCVTPI2PD( const xRegisterSSE& to, const xRegisterMMX& from ) { OpWriteSSE( 0x66, 0x2a ); }
-__forceinline void xCVTPI2PD( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0x66, 0x2a ); }
-__forceinline void xCVTPI2PS( const xRegisterSSE& to, const xRegisterMMX& from ) { OpWriteSSE( 0x00, 0x2a ); }
-__forceinline void xCVTPI2PS( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0x00, 0x2a ); }
+__fi void xCVTPI2PD( const xRegisterSSE& to, const xRegisterMMX& from ) { OpWriteSSE( 0x66, 0x2a ); }
+__fi void xCVTPI2PD( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0x66, 0x2a ); }
+__fi void xCVTPI2PS( const xRegisterSSE& to, const xRegisterMMX& from ) { OpWriteSSE( 0x00, 0x2a ); }
+__fi void xCVTPI2PS( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0x00, 0x2a ); }

-__forceinline void xCVTPS2DQ( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0x5b ); }
-__forceinline void xCVTPS2DQ( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0x5b ); }
-__forceinline void xCVTPS2PD( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x00, 0x5a ); }
-__forceinline void xCVTPS2PD( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0x00, 0x5a ); }
-__forceinline void xCVTPS2PI( const xRegisterMMX& to, const xRegisterSSE& from ) { OpWriteSSE( 0x00, 0x2d ); }
-__forceinline void xCVTPS2PI( const xRegisterMMX& to, const xIndirect64& from ) { OpWriteSSE( 0x00, 0x2d ); }
+__fi void xCVTPS2DQ( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0x5b ); }
+__fi void xCVTPS2DQ( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0x5b ); }
+__fi void xCVTPS2PD( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x00, 0x5a ); }
+__fi void xCVTPS2PD( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0x00, 0x5a ); }
+__fi void xCVTPS2PI( const xRegisterMMX& to, const xRegisterSSE& from ) { OpWriteSSE( 0x00, 0x2d ); }
+__fi void xCVTPS2PI( const xRegisterMMX& to, const xIndirect64& from ) { OpWriteSSE( 0x00, 0x2d ); }

-__forceinline void xCVTSD2SI( const xRegister32& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf2, 0x2d ); }
-__forceinline void xCVTSD2SI( const xRegister32& to, const xIndirect64& from ) { OpWriteSSE( 0xf2, 0x2d ); }
-__forceinline void xCVTSD2SS( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf2, 0x5a ); }
-__forceinline void xCVTSD2SS( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0xf2, 0x5a ); }
-__forceinline void xCVTSI2SD( const xRegisterMMX& to, const xRegister32& from ) { OpWriteSSE( 0xf2, 0x2a ); }
-__forceinline void xCVTSI2SD( const xRegisterMMX& to, const xIndirect32& from ) { OpWriteSSE( 0xf2, 0x2a ); }
-__forceinline void xCVTSI2SS( const xRegisterSSE& to, const xRegister32& from ) { OpWriteSSE( 0xf3, 0x2a ); }
-__forceinline void xCVTSI2SS( const xRegisterSSE& to, const xIndirect32& from ) { OpWriteSSE( 0xf3, 0x2a ); }
+__fi void xCVTSD2SI( const xRegister32& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf2, 0x2d ); }
+__fi void xCVTSD2SI( const xRegister32& to, const xIndirect64& from ) { OpWriteSSE( 0xf2, 0x2d ); }
+__fi void xCVTSD2SS( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf2, 0x5a ); }
+__fi void xCVTSD2SS( const xRegisterSSE& to, const xIndirect64& from ) { OpWriteSSE( 0xf2, 0x5a ); }
+__fi void xCVTSI2SD( const xRegisterMMX& to, const xRegister32& from ) { OpWriteSSE( 0xf2, 0x2a ); }
+__fi void xCVTSI2SD( const xRegisterMMX& to, const xIndirect32& from ) { OpWriteSSE( 0xf2, 0x2a ); }
+__fi void xCVTSI2SS( const xRegisterSSE& to, const xRegister32& from ) { OpWriteSSE( 0xf3, 0x2a ); }
+__fi void xCVTSI2SS( const xRegisterSSE& to, const xIndirect32& from ) { OpWriteSSE( 0xf3, 0x2a ); }

-__forceinline void xCVTSS2SD( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0x5a ); }
-__forceinline void xCVTSS2SD( const xRegisterSSE& to, const xIndirect32& from ) { OpWriteSSE( 0xf3, 0x5a ); }
-__forceinline void xCVTSS2SI( const xRegister32& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0x2d ); }
-__forceinline void xCVTSS2SI( const xRegister32& to, const xIndirect32& from ) { OpWriteSSE( 0xf3, 0x2d ); }
+__fi void xCVTSS2SD( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0x5a ); }
+__fi void xCVTSS2SD( const xRegisterSSE& to, const xIndirect32& from ) { OpWriteSSE( 0xf3, 0x5a ); }
+__fi void xCVTSS2SI( const xRegister32& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0x2d ); }
+__fi void xCVTSS2SI( const xRegister32& to, const xIndirect32& from ) { OpWriteSSE( 0xf3, 0x2d ); }

-__forceinline void xCVTTPD2DQ( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0xe6 ); }
-__forceinline void xCVTTPD2DQ( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0xe6 ); }
-__forceinline void xCVTTPD2PI( const xRegisterMMX& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0x2c ); }
-__forceinline void xCVTTPD2PI( const xRegisterMMX& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0x2c ); }
-__forceinline void xCVTTPS2DQ( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0x5b ); }
-__forceinline void xCVTTPS2DQ( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0xf3, 0x5b ); }
-__forceinline void xCVTTPS2PI( const xRegisterMMX& to, const xRegisterSSE& from ) { OpWriteSSE( 0x00, 0x2c ); }
-__forceinline void xCVTTPS2PI( const xRegisterMMX& to, const xIndirect64& from ) { OpWriteSSE( 0x00, 0x2c ); }
+__fi void xCVTTPD2DQ( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0xe6 ); }
+__fi void xCVTTPD2DQ( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0xe6 ); }
+__fi void xCVTTPD2PI( const xRegisterMMX& to, const xRegisterSSE& from ) { OpWriteSSE( 0x66, 0x2c ); }
+__fi void xCVTTPD2PI( const xRegisterMMX& to, const xIndirect128& from ) { OpWriteSSE( 0x66, 0x2c ); }
+__fi void xCVTTPS2DQ( const xRegisterSSE& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0x5b ); }
+__fi void xCVTTPS2DQ( const xRegisterSSE& to, const xIndirect128& from ) { OpWriteSSE( 0xf3, 0x5b ); }
+__fi void xCVTTPS2PI( const xRegisterMMX& to, const xRegisterSSE& from ) { OpWriteSSE( 0x00, 0x2c ); }
+__fi void xCVTTPS2PI( const xRegisterMMX& to, const xIndirect64& from ) { OpWriteSSE( 0x00, 0x2c ); }

-__forceinline void xCVTTSD2SI( const xRegister32& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf2, 0x2c ); }
-__forceinline void xCVTTSD2SI( const xRegister32& to, const xIndirect64& from ) { OpWriteSSE( 0xf2, 0x2c ); }
-__forceinline void xCVTTSS2SI( const xRegister32& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0x2c ); }
-__forceinline void xCVTTSS2SI( const xRegister32& to, const xIndirect32& from ) { OpWriteSSE( 0xf3, 0x2c ); }
+__fi void xCVTTSD2SI( const xRegister32& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf2, 0x2c ); }
+__fi void xCVTTSD2SI( const xRegister32& to, const xIndirect64& from ) { OpWriteSSE( 0xf2, 0x2c ); }
+__fi void xCVTTSS2SI( const xRegister32& to, const xRegisterSSE& from ) { OpWriteSSE( 0xf3, 0x2c ); }
+__fi void xCVTTSS2SI( const xRegister32& to, const xIndirect32& from ) { OpWriteSSE( 0xf3, 0x2c ); }


// ------------------------------------------------------------------------

@ -452,7 +452,7 @@ const xImplSimd_PMinMax xPMAX =
// SIMD Shuffle/Pack (Shuffle puck?)
// =====================================================================================================

-__forceinline void xImplSimd_Shuffle::_selector_assertion_check( u8 selector ) const
+__fi void xImplSimd_Shuffle::_selector_assertion_check( u8 selector ) const
{
	pxAssertMsg( (selector & ~3) == 0,
		"Invalid immediate operand on SSE Shuffle: Upper 6 bits of the SSE Shuffle-PD Selector are reserved and must be zero."
@ -684,43 +684,43 @@ const xImplSimd_DestRegSSE xMOVSHDUP = { 0xf3,0x16 };
|
|||
// * MOVD has valid forms for MMX and XMM registers.
|
||||
//
|
||||
|
||||
__forceinline void xMOVDZX( const xRegisterSSE& to, const xRegister32& from ) { xOpWrite0F( 0x66, 0x6e, to, from ); }
|
||||
__forceinline void xMOVDZX( const xRegisterSSE& to, const xIndirectVoid& src ) { xOpWrite0F( 0x66, 0x6e, to, src ); }
|
||||
__fi void xMOVDZX( const xRegisterSSE& to, const xRegister32& from ) { xOpWrite0F( 0x66, 0x6e, to, from ); }
|
||||
__fi void xMOVDZX( const xRegisterSSE& to, const xIndirectVoid& src ) { xOpWrite0F( 0x66, 0x6e, to, src ); }
|
||||
|
||||
__forceinline void xMOVDZX( const xRegisterMMX& to, const xRegister32& from ) { xOpWrite0F( 0x6e, to, from ); }
|
||||
__forceinline void xMOVDZX( const xRegisterMMX& to, const xIndirectVoid& src ) { xOpWrite0F( 0x6e, to, src ); }
|
||||
__fi void xMOVDZX( const xRegisterMMX& to, const xRegister32& from ) { xOpWrite0F( 0x6e, to, from ); }
|
||||
__fi void xMOVDZX( const xRegisterMMX& to, const xIndirectVoid& src ) { xOpWrite0F( 0x6e, to, src ); }
|
||||
|
||||
__forceinline void xMOVD( const xRegister32& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0x7e, from, to ); }
|
||||
__forceinline void xMOVD( const xIndirectVoid& dest, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0x7e, from, dest ); }
|
||||
__fi void xMOVD( const xRegister32& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0x7e, from, to ); }
|
||||
__fi void xMOVD( const xIndirectVoid& dest, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0x7e, from, dest ); }
|
||||
|
||||
__forceinline void xMOVD( const xRegister32& to, const xRegisterMMX& from ) { xOpWrite0F( 0x7e, from, to ); }
|
||||
__forceinline void xMOVD( const xIndirectVoid& dest, const xRegisterMMX& from ) { xOpWrite0F( 0x7e, from, dest ); }
|
||||
__fi void xMOVD( const xRegister32& to, const xRegisterMMX& from ) { xOpWrite0F( 0x7e, from, to ); }
|
||||
__fi void xMOVD( const xIndirectVoid& dest, const xRegisterMMX& from ) { xOpWrite0F( 0x7e, from, dest ); }
|
||||
|
||||
|
||||
// Moves from XMM to XMM, with the *upper 64 bits* of the destination register
// being cleared to zero.
-__forceinline void xMOVQZX( const xRegisterSSE& to, const xRegisterSSE& from ) { xOpWrite0F( 0xf3, 0x7e, to, from ); }
+__fi void xMOVQZX( const xRegisterSSE& to, const xRegisterSSE& from ) { xOpWrite0F( 0xf3, 0x7e, to, from ); }

// Moves from XMM to XMM, with the *upper 64 bits* of the destination register
// being cleared to zero.
-__forceinline void xMOVQZX( const xRegisterSSE& to, const xIndirectVoid& src ) { xOpWrite0F( 0xf3, 0x7e, to, src ); }
+__fi void xMOVQZX( const xRegisterSSE& to, const xIndirectVoid& src ) { xOpWrite0F( 0xf3, 0x7e, to, src ); }

// Moves from XMM to XMM, with the *upper 64 bits* of the destination register
// being cleared to zero.
-__forceinline void xMOVQZX( const xRegisterSSE& to, const void* src ) { xOpWrite0F( 0xf3, 0x7e, to, src ); }
+__fi void xMOVQZX( const xRegisterSSE& to, const void* src ) { xOpWrite0F( 0xf3, 0x7e, to, src ); }

// Moves lower quad of XMM to ptr64 (no bits are cleared)
-__forceinline void xMOVQ( const xIndirectVoid& dest, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0xd6, from, dest ); }
+__fi void xMOVQ( const xIndirectVoid& dest, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0xd6, from, dest ); }

-__forceinline void xMOVQ( const xRegisterMMX& to, const xRegisterMMX& from ) { if( to != from ) xOpWrite0F( 0x6f, to, from ); }
-__forceinline void xMOVQ( const xRegisterMMX& to, const xIndirectVoid& src ) { xOpWrite0F( 0x6f, to, src ); }
-__forceinline void xMOVQ( const xIndirectVoid& dest, const xRegisterMMX& from ) { xOpWrite0F( 0x7f, from, dest ); }
+__fi void xMOVQ( const xRegisterMMX& to, const xRegisterMMX& from ) { if( to != from ) xOpWrite0F( 0x6f, to, from ); }
+__fi void xMOVQ( const xRegisterMMX& to, const xIndirectVoid& src ) { xOpWrite0F( 0x6f, to, src ); }
+__fi void xMOVQ( const xIndirectVoid& dest, const xRegisterMMX& from ) { xOpWrite0F( 0x7f, from, dest ); }

// This form of xMOVQ is Intel's adeptly named 'MOVQ2DQ'
-__forceinline void xMOVQ( const xRegisterSSE& to, const xRegisterMMX& from ) { xOpWrite0F( 0xf3, 0xd6, to, from ); }
+__fi void xMOVQ( const xRegisterSSE& to, const xRegisterMMX& from ) { xOpWrite0F( 0xf3, 0xd6, to, from ); }

// This form of xMOVQ is Intel's adeptly named 'MOVDQ2Q'
-__forceinline void xMOVQ( const xRegisterMMX& to, const xRegisterSSE& from )
+__fi void xMOVQ( const xRegisterMMX& to, const xRegisterSSE& from )
{
	// Manual implementation of this form of MOVQ, since its parameters are unique in a way
	// that breaks the template inference of writeXMMop();

@@ -733,9 +733,9 @@ __forceinline void xMOVQ( const xRegisterMMX& to, const xRegisterSSE& from )
//

#define IMPLEMENT_xMOVS( ssd, prefix ) \
-	__forceinline void xMOV##ssd( const xRegisterSSE& to, const xRegisterSSE& from ) { if( to != from ) xOpWrite0F( prefix, 0x10, to, from ); } \
-	__forceinline void xMOV##ssd##ZX( const xRegisterSSE& to, const xIndirectVoid& from ) { xOpWrite0F( prefix, 0x10, to, from ); } \
-	__forceinline void xMOV##ssd( const xIndirectVoid& to, const xRegisterSSE& from ) { xOpWrite0F( prefix, 0x11, from, to ); }
+	__fi void xMOV##ssd( const xRegisterSSE& to, const xRegisterSSE& from ) { if( to != from ) xOpWrite0F( prefix, 0x10, to, from ); } \
+	__fi void xMOV##ssd##ZX( const xRegisterSSE& to, const xIndirectVoid& from ) { xOpWrite0F( prefix, 0x10, to, from ); } \
+	__fi void xMOV##ssd( const xIndirectVoid& to, const xRegisterSSE& from ) { xOpWrite0F( prefix, 0x11, from, to ); }

IMPLEMENT_xMOVS( SS, 0xf3 )
IMPLEMENT_xMOVS( SD, 0xf2 )
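// For reference, IMPLEMENT_xMOVS( SS, 0xf3 ) expands to:
//
//     __fi void xMOVSS( const xRegisterSSE& to, const xRegisterSSE& from )    { if( to != from ) xOpWrite0F( 0xf3, 0x10, to, from ); }
//     __fi void xMOVSSZX( const xRegisterSSE& to, const xIndirectVoid& from ) { xOpWrite0F( 0xf3, 0x10, to, from ); }
//     __fi void xMOVSS( const xIndirectVoid& to, const xRegisterSSE& from )   { xOpWrite0F( 0xf3, 0x11, from, to ); }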
@@ -744,31 +744,31 @@ IMPLEMENT_xMOVS( SD, 0xf2 )
// Non-temporal movs only support a register as a target (ie, load form only, no stores)
//

-__forceinline void xMOVNTDQA( const xRegisterSSE& to, const xIndirectVoid& from )
+__fi void xMOVNTDQA( const xRegisterSSE& to, const xIndirectVoid& from )
{
	xWrite32( 0x2A380f66 );
	EmitSibMagic( to.Id, from );
}

-__forceinline void xMOVNTDQA( const xIndirectVoid& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0xe7, from, to ); }
+__fi void xMOVNTDQA( const xIndirectVoid& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0xe7, from, to ); }

-__forceinline void xMOVNTPD( const xIndirectVoid& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0x2b, from, to ); }
-__forceinline void xMOVNTPS( const xIndirectVoid& to, const xRegisterSSE& from ) { xOpWrite0F( 0x2b, from, to ); }
+__fi void xMOVNTPD( const xIndirectVoid& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0x2b, from, to ); }
+__fi void xMOVNTPS( const xIndirectVoid& to, const xRegisterSSE& from ) { xOpWrite0F( 0x2b, from, to ); }

-__forceinline void xMOVNTQ( const xIndirectVoid& to, const xRegisterMMX& from ) { xOpWrite0F( 0xe7, from, to ); }
+__fi void xMOVNTQ( const xIndirectVoid& to, const xRegisterMMX& from ) { xOpWrite0F( 0xe7, from, to ); }

// ------------------------------------------------------------------------

-__forceinline void xMOVMSKPS( const xRegister32& to, const xRegisterSSE& from) { xOpWrite0F( 0x50, to, from ); }
-__forceinline void xMOVMSKPD( const xRegister32& to, const xRegisterSSE& from) { xOpWrite0F( 0x66, 0x50, to, from, true ); }
+__fi void xMOVMSKPS( const xRegister32& to, const xRegisterSSE& from) { xOpWrite0F( 0x50, to, from ); }
+__fi void xMOVMSKPD( const xRegister32& to, const xRegisterSSE& from) { xOpWrite0F( 0x66, 0x50, to, from, true ); }

// xMASKMOV:
// Selectively write bytes from mm1/xmm1 to memory location using the byte mask in mm2/xmm2.
// The default memory location is specified by DS:EDI.  The most significant bit in each byte
// of the mask operand determines whether the corresponding byte in the source operand is
// written to the corresponding byte location in memory.
-__forceinline void xMASKMOV( const xRegisterSSE& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0xf7, to, from ); }
-__forceinline void xMASKMOV( const xRegisterMMX& to, const xRegisterMMX& from ) { xOpWrite0F( 0xf7, to, from ); }
+__fi void xMASKMOV( const xRegisterSSE& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0xf7, to, from ); }
+__fi void xMASKMOV( const xRegisterMMX& to, const xRegisterMMX& from ) { xOpWrite0F( 0xf7, to, from ); }

// xPMOVMSKB:
// Creates a mask made up of the most significant bit of each byte of the source
@@ -778,15 +778,15 @@ __forceinline void xMASKMOV( const xRegisterMMX& to, const xRegisterMMX& from )
// When operating on a 64-bit (MMX) source, the byte mask is 8 bits; when operating on
// 128-bit (SSE) source, the byte mask is 16-bits.
//
-__forceinline void xPMOVMSKB( const xRegister32& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0xd7, to, from ); }
-__forceinline void xPMOVMSKB( const xRegister32& to, const xRegisterMMX& from ) { xOpWrite0F( 0xd7, to, from ); }
+__fi void xPMOVMSKB( const xRegister32& to, const xRegisterSSE& from ) { xOpWrite0F( 0x66, 0xd7, to, from ); }
+__fi void xPMOVMSKB( const xRegister32& to, const xRegisterMMX& from ) { xOpWrite0F( 0xd7, to, from ); }

// [SSSE-3] Concatenates dest and source operands into an intermediate composite,
// shifts the composite at byte granularity to the right by a constant immediate,
// and extracts the right-aligned result into the destination.
//
-__forceinline void xPALIGNR( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 ) { xOpWrite0F( 0x66, 0x0f3a, to, from, imm8 ); }
-__forceinline void xPALIGNR( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm8 ) { xOpWrite0F( 0x0f3a, to, from, imm8 ); }
+__fi void xPALIGNR( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 ) { xOpWrite0F( 0x66, 0x0f3a, to, from, imm8 ); }
+__fi void xPALIGNR( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm8 ) { xOpWrite0F( 0x0f3a, to, from, imm8 ); }

// --------------------------------------------------------------------------------------
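// Worked example (assumes the usual eax/xmm1 aliases): after
//     xPMOVMSKB( eax, xmm1 );
// bit N of eax holds the sign bit of byte N of xmm1, so a source register
// whose 16 bytes all have their high bit set yields eax == 0xFFFF.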
@@ -826,14 +826,14 @@ __emitinline void xEXTRACTPS( const xIndirect32& dest, const xRegisterSSE& from,
// Converts from MMX register mode to FPU register mode.  The cpu enters MMX register mode
// whenever MMX instructions are run, and if FPU instructions are run without using EMMS,
// the FPU results will be invalid.
-__forceinline void xEMMS() { xWrite16( 0x770F ); }
+__fi void xEMMS() { xWrite16( 0x770F ); }

// [3DNow] Same as EMMS, but an AMD special version which may (or may not) leave MMX regs
// in an undefined state (which is fine, since presumably you're done using them anyway).
// This instruction is thus faster than EMMS on K8s, but all newer AMD cpus use the same
// logic for either EMMS or FEMMS.
// Conclusion: Obsolete.  Just use EMMS instead.
-__forceinline void xFEMMS() { xWrite16( 0x0E0F ); }
+__fi void xFEMMS() { xWrite16( 0x0E0F ); }

// Store Streaming SIMD Extension Control/Status to Mem32.
@@ -72,22 +72,22 @@ template void xWrite<u32>( u32 val );
template void xWrite<u64>( u64 val );
template void xWrite<u128>( u128 val );

-__forceinline void xWrite8( u8 val )
+__fi void xWrite8( u8 val )
{
	xWrite( val );
}

-__forceinline void xWrite16( u16 val )
+__fi void xWrite16( u16 val )
{
	xWrite( val );
}

-__forceinline void xWrite32( u32 val )
+__fi void xWrite32( u32 val )
{
	xWrite( val );
}

-__forceinline void xWrite64( u64 val )
+__fi void xWrite64( u64 val )
{
	xWrite( val );
}
@@ -213,12 +213,12 @@ const char* xRegisterBase::GetName()
// (btw, I know this isn't a critical performance item by any means, but it's
// annoying simply because it *should* be an easy thing to optimize)

-static __forceinline void ModRM( uint mod, uint reg, uint rm )
+static __fi void ModRM( uint mod, uint reg, uint rm )
{
	xWrite8( (mod << 6) | (reg << 3) | rm );
}

-static __forceinline void SibSB( u32 ss, u32 index, u32 base )
+static __fi void SibSB( u32 ss, u32 index, u32 base )
{
	xWrite8( (ss << 6) | (index << 3) | base );
}
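// Bit-layout check (plain arithmetic): ModRM( 3, 2, 1 ) writes
// (3<<6)|(2<<3)|1 = 0xD1 -- mod=11b, reg=010b, rm=001b -- and SibSB( 2, 1, 5 )
// writes (2<<6)|(1<<3)|5 = 0x8D -- ss=10b (scale *4), index=001b, base=101b.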
@@ -260,7 +260,7 @@ __emitinline void xOpWrite0F( u16 opcode, int instId, const xIndirectVoid& sib )
//////////////////////////////////////////////////////////////////////////////////////////
// returns TRUE if this instruction requires SIB to be encoded, or FALSE if the
// instruction can be encoded as ModRm alone.
-static __forceinline bool NeedsSibMagic( const xIndirectVoid& info )
+static __fi bool NeedsSibMagic( const xIndirectVoid& info )
{
	// no registers? no sibs!
	// (xIndirectVoid::Reduce always places a register in Index, and optionally leaves
@@ -952,37 +952,37 @@ __emitinline void xPUSH( const xIndirectVoid& from )
	EmitSibMagic( 6, from );
}

-__forceinline void xPOP( xRegister32 from ) { xWrite8( 0x58 | from.Id ); }
+__fi void xPOP( xRegister32 from ) { xWrite8( 0x58 | from.Id ); }

-__forceinline void xPUSH( u32 imm ) { xWrite8( 0x68 ); xWrite32( imm ); }
-__forceinline void xPUSH( xRegister32 from ) { xWrite8( 0x50 | from.Id ); }
+__fi void xPUSH( u32 imm ) { xWrite8( 0x68 ); xWrite32( imm ); }
+__fi void xPUSH( xRegister32 from ) { xWrite8( 0x50 | from.Id ); }
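// Encoding sketch (assumes the usual eax/ecx aliases): xPUSH( eax ) emits 0x50
// (0x50|0), xPOP( ecx ) emits 0x59 (0x58|1), and xPUSH( 0x10 ) emits 68 10 00 00 00.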
// pushes the EFLAGS register onto the stack
-__forceinline void xPUSHFD() { xWrite8( 0x9C ); }
+__fi void xPUSHFD() { xWrite8( 0x9C ); }
// pops the EFLAGS register from the stack
-__forceinline void xPOPFD() { xWrite8( 0x9D ); }
+__fi void xPOPFD() { xWrite8( 0x9D ); }

//////////////////////////////////////////////////////////////////////////////////////////
//

-__forceinline void xLEAVE() { xWrite8( 0xC9 ); }
-__forceinline void xRET() { xWrite8( 0xC3 ); }
-__forceinline void xCBW() { xWrite16( 0x9866 ); }
-__forceinline void xCWD() { xWrite8( 0x98 ); }
-__forceinline void xCDQ() { xWrite8( 0x99 ); }
-__forceinline void xCWDE() { xWrite8( 0x98 ); }
+__fi void xLEAVE() { xWrite8( 0xC9 ); }
+__fi void xRET() { xWrite8( 0xC3 ); }
+__fi void xCBW() { xWrite16( 0x9866 ); }
+__fi void xCWD() { xWrite8( 0x98 ); }
+__fi void xCDQ() { xWrite8( 0x99 ); }
+__fi void xCWDE() { xWrite8( 0x98 ); }

-__forceinline void xLAHF() { xWrite8( 0x9f ); }
-__forceinline void xSAHF() { xWrite8( 0x9e ); }
+__fi void xLAHF() { xWrite8( 0x9f ); }
+__fi void xSAHF() { xWrite8( 0x9e ); }

-__forceinline void xSTC() { xWrite8( 0xF9 ); }
-__forceinline void xCLC() { xWrite8( 0xF8 ); }
+__fi void xSTC() { xWrite8( 0xF9 ); }
+__fi void xCLC() { xWrite8( 0xF8 ); }

// NOP 1-byte
-__forceinline void xNOP() { xWrite8(0x90); }
+__fi void xNOP() { xWrite8(0x90); }

-__forceinline void xINT( u8 imm )
+__fi void xINT( u8 imm )
{
	if (imm == 3)
		xWrite8(0xcc);

@@ -993,7 +993,7 @@ __forceinline void xINT( u8 imm )
	}
}

-__forceinline void xINTO() { xWrite8(0xce); }
+__fi void xINTO() { xWrite8(0xce); }

__emitinline void xBSWAP( const xRegister32& to )
{
@@ -37,7 +37,7 @@ wxString DiscSerial;

static cdvdStruct cdvd;

-static __forceinline void SetResultSize(u8 size)
+static __fi void SetResultSize(u8 size)
{
	cdvd.ResultC = size;
	cdvd.ResultP = 0;

@@ -308,7 +308,7 @@ s32 cdvdWriteConfig(const u8* config)
static MutexRecursive Mutex_NewDiskCB;

// Sets ElfCRC to the CRC of the game bound to the CDVD plugin.
-static __forceinline ElfObject* loadElf( const wxString filename )
+static __fi ElfObject* loadElf( const wxString filename )
{
	if (filename.StartsWith(L"host"))
		return new ElfObject(filename.After(':'), Path::GetFileSize(filename.After(':')));

@@ -338,7 +338,7 @@ static __forceinline ElfObject* loadElf( const wxString filename )
	return new ElfObject(filename, file);
}

-static __forceinline void _reloadElfInfo(wxString elfpath)
+static __fi void _reloadElfInfo(wxString elfpath)
{
	ScopedPtr<ElfObject> elfptr;

@@ -417,7 +417,7 @@ void cdvdReloadElfInfo(wxString elfoverride)
	}
}

-static __forceinline s32 StrToS32(const wxString& str, int base = 10)
+static __fi s32 StrToS32(const wxString& str, int base = 10)
{
	long l;
	str.ToLong(&l, base);

@@ -540,7 +540,7 @@ s32 cdvdGetTrayStatus()
// cdvdNewDiskCB() can update its status as well...

// Modified by (efp) - 16/01/2006
-static __forceinline void cdvdGetDiskType()
+static __fi void cdvdGetDiskType()
{
	cdvd.Type = DoCDVDdetectDiskType();
}
@@ -741,7 +741,7 @@ int cdvdReadSector() {
}

// inlined due to being referenced in only one place.
-__forceinline void cdvdActionInterrupt()
+__fi void cdvdActionInterrupt()
{
	switch( cdvd.Action )
	{

@@ -786,7 +786,7 @@ __forceinline void cdvdActionInterrupt()
}

// inlined due to being referenced in only one place.
-__forceinline void cdvdReadInterrupt()
+__fi void cdvdReadInterrupt()
{
	//Console.WriteLn("cdvdReadInterrupt %x %x %x %x %x", cpuRegs.interrupt, cdvd.Readed, cdvd.Reading, cdvd.nSectors, (HW_DMA3_BCR_H16 * HW_DMA3_BCR_L16) *4);

@@ -983,7 +983,7 @@ void cdvdVsync() {
	cdvd.RTC.year = 0;
}

-static __forceinline u8 cdvdRead18(void) // SDATAOUT
+static __fi u8 cdvdRead18(void) // SDATAOUT
{
	u8 ret = 0;
@@ -1348,7 +1348,7 @@ static void cdvdWrite04(u8 rt) { // NCOMMAND
	cdvd.ParamC = 0;
}

-static __forceinline void cdvdWrite05(u8 rt) { // NDATAIN
+static __fi void cdvdWrite05(u8 rt) { // NDATAIN
	CDVD_LOG("cdvdWrite05(NDataIn) %x", rt);

	if (cdvd.ParamP < 32) {

@@ -1357,12 +1357,12 @@ static __forceinline void cdvdWrite05(u8 rt) { // NDATAIN
	}
}

-static __forceinline void cdvdWrite06(u8 rt) { // HOWTO
+static __fi void cdvdWrite06(u8 rt) { // HOWTO
	CDVD_LOG("cdvdWrite06(HowTo) %x", rt);
	cdvd.HowTo = rt;
}

-static __forceinline void cdvdWrite07(u8 rt) // BREAK
+static __fi void cdvdWrite07(u8 rt) // BREAK
{
	CDVD_LOG("cdvdWrite07(Break) %x", rt);

@@ -1386,21 +1386,21 @@ static __forceinline void cdvdWrite07(u8 rt) // BREAK
	//cdvd.nCommand = 0;
}

-static __forceinline void cdvdWrite08(u8 rt) { // INTR_STAT
+static __fi void cdvdWrite08(u8 rt) { // INTR_STAT
	CDVD_LOG("cdvdWrite08(IntrReason) = ACK(%x)", rt);
	cdvd.PwOff &= ~rt;
}

-static __forceinline void cdvdWrite0A(u8 rt) { // STATUS
+static __fi void cdvdWrite0A(u8 rt) { // STATUS
	CDVD_LOG("cdvdWrite0A(Status) %x", rt);
}

-static __forceinline void cdvdWrite0F(u8 rt) { // TYPE
+static __fi void cdvdWrite0F(u8 rt) { // TYPE
	CDVD_LOG("cdvdWrite0F(Type) %x", rt);
	DevCon.WriteLn("*PCSX2*: CDVD TYPE %x", rt);
}

-static __forceinline void cdvdWrite14(u8 rt) { // PS1 MODE??
+static __fi void cdvdWrite14(u8 rt) { // PS1 MODE??
	u32 cycle = psxRegs.cycle;

	if (rt == 0xFE)

@@ -1414,7 +1414,7 @@ static __forceinline void cdvdWrite14(u8 rt) { // PS1 MODE??
	psxRegs.cycle = cycle;
}

-static __forceinline void fail_pol_cal()
+static __fi void fail_pol_cal()
{
	Console.Error("[MG] ERROR - Make sure the file is already decrypted!!!");
	cdvd.Result[0] = 0x80;

@@ -2025,7 +2025,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
	cdvd.ParamC = 0;
}

-static __forceinline void cdvdWrite17(u8 rt) { // SDATAIN
+static __fi void cdvdWrite17(u8 rt) { // SDATAIN
	CDVD_LOG("cdvdWrite17(SDataIn) %x", rt);

	if (cdvd.ParamP < 32) {

@@ -2034,12 +2034,12 @@ static __forceinline void cdvdWrite17(u8 rt) { // SDATAIN
	}
}

-static __forceinline void cdvdWrite18(u8 rt) { // SDATAOUT
+static __fi void cdvdWrite18(u8 rt) { // SDATAOUT
	CDVD_LOG("cdvdWrite18(SDataOut) %x", rt);
	Console.WriteLn("*PCSX2* SDATAOUT");
}

-static __forceinline void cdvdWrite3A(u8 rt) { // DEC-SET
+static __fi void cdvdWrite3A(u8 rt) { // DEC-SET
	CDVD_LOG("cdvdWrite3A(DecSet) %x", rt);
	cdvd.decSet = rt;
	Console.WriteLn("DecSet Write: %02X", cdvd.decSet);
@@ -23,7 +23,7 @@
#define btoi(b) ((b)/16*10 + (b)%16) /* BCD to u_char */
#define itob(i) ((i)/10*16 + (i)%10) /* u_char to BCD */

-static __forceinline s32 msf_to_lsn(u8 *Time)
+static __fi s32 msf_to_lsn(u8 *Time)
{
	u32 lsn;

@@ -33,7 +33,7 @@ static __forceinline s32 msf_to_lsn(u8 *Time)
	return lsn;
}

-static __forceinline s32 msf_to_lba(u8 m, u8 s, u8 f)
+static __fi s32 msf_to_lba(u8 m, u8 s, u8 f)
{
	u32 lsn;
	lsn = f;

@@ -42,7 +42,7 @@ static __forceinline s32 msf_to_lba(u8 m, u8 s, u8 f)
	return lsn;
}

-static __forceinline void lsn_to_msf(u8 *Time, s32 lsn)
+static __fi void lsn_to_msf(u8 *Time, s32 lsn)
{
	u8 m, s, f;

@@ -56,7 +56,7 @@ static __forceinline void lsn_to_msf(u8 *Time, s32 lsn)
	Time[2] = itob(f);
}

-static __forceinline void lba_to_msf(s32 lba, u8* m, u8* s, u8* f)
+static __fi void lba_to_msf(s32 lba, u8* m, u8* s, u8* f)
{
	lba += 150;
	*m = lba / (60 * 75);
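// Worked example (standard CD addressing: 75 frames per second, 150-frame pregap):
//     lba_to_msf( 0, &m, &s, &f )  ->  m=0, s=2, f=0   (0+150 frames = exactly two seconds)
//     msf_to_lba( 0, 2, 0 )        ->  0, reversing the bias above
// and for the BCD helpers: itob(59) == 0x59, btoi(0x59) == 59.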
@@ -94,28 +94,28 @@ u32 cdReadTime;// = ((PSXCLK / 75) / BIAS);

static void AddIrqQueue(u8 irq, u32 ecycle);

-static __forceinline void StartReading(u32 type) {
+static __fi void StartReading(u32 type) {
	cdr.Reading = type;
	cdr.FirstSector = 1;
	cdr.Readed = 0xff;
	AddIrqQueue(READ_ACK, 0x800);
}

-static __forceinline void StopReading() {
+static __fi void StopReading() {
	if (cdr.Reading) {
		cdr.Reading = 0;
		psxRegs.interrupt &= ~(1<<IopEvt_CdromRead);
	}
}

-static __forceinline void StopCdda() {
+static __fi void StopCdda() {
	if (cdr.Play) {
		cdr.StatP&=~0x80;
		cdr.Play = 0;
	}
}

-static __forceinline void SetResultSize(u8 size) {
+static __fi void SetResultSize(u8 size) {
	cdr.ResultP = 0;
	cdr.ResultC = size;
	cdr.ResultReady = 1;
@@ -20,7 +20,7 @@
u32 s_iLastCOP0Cycle = 0;
u32 s_iLastPERFCycle[2] = { 0, 0 };

-__releaseinline void UpdateCP0Status() {
+__ri void UpdateCP0Status() {
	//currently the 2 memory modes are not implemented. Given this function is called so much,
	//it's commented out for now. Only the interrupt test is needed. (rama)

@@ -162,7 +162,7 @@ void WriteTLB(int i)
// count.  But only mode 1 (instruction counter) has been found to be used by games thus far.
//

-static __forceinline bool PERF_ShouldCountEvent( uint evt )
+static __fi bool PERF_ShouldCountEvent( uint evt )
{
	switch( evt )
	{

@@ -213,7 +213,7 @@ void COP0_DiagnosticPCCR()
		Console.Warning( "PERF/PCR1 Unsupported Update Event Mode = 0x%x", cpuRegs.PERF.n.pccr.b.Event1 );
}
extern int branch;
-__forceinline void COP0_UpdatePCCR()
+__fi void COP0_UpdatePCCR()
{
	//if( cpuRegs.CP0.n.Status.b.ERL || !cpuRegs.PERF.n.pccr.b.CTE ) return;
@@ -43,12 +43,12 @@ void writeCache128(u32 mem, u64 *value);
u8 *readCache(u32 mem);

// Fixme - these two functions do nothing, and the cache code relies on these two functions.
-static __forceinline u32 getMemR(s32 mem)
+static __fi u32 getMemR(s32 mem)
{
	return 0;//memLUTR[mem >> 12];
}

-static __forceinline u32 getMemW(s32 mem)
+static __fi u32 getMemW(s32 mem)
{
	return 0;//memLUTW[mem>>12];
}
@@ -17,6 +17,15 @@

#include "Pcsx2Defs.h"

+static const s64 _1mb = 0x100000;
+static const s64 _8mb = _1mb * 8;
+static const s64 _16mb = _1mb * 16;
+static const s64 _256mb = _1mb * 256;
+static const s64 _1gb = _256mb * 4;
+
+static const u32 BIAS = 2; // Bus is half of the actual ps2 speed
+static const u32 PS2CLK = 294912000; //hz /* 294.912 mhz */
+
#include "System.h"
#include "Memory.h"
#include "R5900.h"

@@ -26,9 +35,6 @@
#include "SaveState.h"
#include "DebugTools/Debug.h"

-static const u32 BIAS = 2; // Bus is half of the actual ps2 speed
-static const u32 PS2CLK = 294912000; //hz /* 294.912 mhz */
-
extern wxString ShiftJIS_ConvertString( const char* src );
extern wxString ShiftJIS_ConvertString( const char* src, int maxlen );
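// The shorthand sizes, spelled out (plain arithmetic):
//     _1mb   = 0x100000     (1,048,576 bytes)
//     _8mb   = 0x800000
//     _16mb  = 0x1000000
//     _256mb = 0x10000000
//     _1gb   = 0x40000000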
@@ -54,7 +54,7 @@ void rcntReset(int index) {
// Updates the state of the nextCounter value (if needed) to serve
// any pending events for the given counter.
// Call this method after any modifications to the state of a counter.
-static __forceinline void _rcntSet( int cntidx )
+static __fi void _rcntSet( int cntidx )
{
	s32 c;
	jASSUME( cntidx <= 4 ); // rcntSet isn't valid for h/vsync counters.

@@ -106,7 +106,7 @@ static __forceinline void _rcntSet( int cntidx )
}

-static __forceinline void cpuRcntSet()
+static __fi void cpuRcntSet()
{
	int i;

@@ -286,7 +286,7 @@ void frameLimitReset()
// Framelimiter - Measures the delta time between calls and stalls until a
// certain amount of time passes if such time hasn't passed yet.
// See the GS FrameSkip function for details on why this is here and not in the GS.
-static __forceinline void frameLimit()
+static __fi void frameLimit()
{
	// 999 means the user would rather just have framelimiting turned off...
	if( !EmuConfig.GS.FrameLimitEnable ) return;

@@ -331,7 +331,7 @@ static __forceinline void frameLimit()
	// starting this frame, it'll just sleep longer the next to make up for it. :)
}

-static __forceinline void VSyncStart(u32 sCycle)
+static __fi void VSyncStart(u32 sCycle)
{
	GetCoreThread().VsyncInThread();
	Cpu->CheckExecutionState();

@@ -380,7 +380,7 @@ static __forceinline void VSyncStart(u32 sCycle)
	// Should no longer be required (Refraction)
}

-static __forceinline void VSyncEnd(u32 sCycle)
+static __fi void VSyncEnd(u32 sCycle)
{
	EECNT_LOG( "///////// EE COUNTER VSYNC END (frame: %d) \\\\\\\\\\\\\\\\\\\\", g_FrameCount );

@@ -404,7 +404,7 @@ static u32 hsc=0;
static int vblankinc = 0;
#endif

-__forceinline void rcntUpdate_hScanline()
+__fi void rcntUpdate_hScanline()
{
	if( !cpuTestCycle( hsyncCounter.sCycle, hsyncCounter.CycleT ) ) return;

@@ -441,7 +441,7 @@ __forceinline void rcntUpdate_hScanline()
	}
}

-__forceinline void rcntUpdate_vSync()
+__fi void rcntUpdate_vSync()
{
	s32 diff = (cpuRegs.cycle - vsyncCounter.sCycle);
	if( diff < vsyncCounter.CycleT ) return;

@@ -478,7 +478,7 @@ __forceinline void rcntUpdate_vSync()
	}
}

-static __forceinline void _cpuTestTarget( int i )
+static __fi void _cpuTestTarget( int i )
{
	if (counters[i].count < counters[i].target) return;

@@ -497,7 +497,7 @@ static __forceinline void _cpuTestTarget( int i )
	else counters[i].target |= EECNT_FUTURE_TARGET;
}

-static __forceinline void _cpuTestOverflow( int i )
+static __fi void _cpuTestOverflow( int i )
{
	if (counters[i].count <= 0xffff) return;

@@ -516,7 +516,7 @@ static __forceinline void _cpuTestOverflow( int i )
// forceinline note: this method is called from two locations, but one
// of them is the interpreter, which doesn't count. ;) So might as
// well forceinline it!
-__forceinline void rcntUpdate()
+__fi void rcntUpdate()
{
	rcntUpdate_vSync();

@@ -550,7 +550,7 @@ __forceinline void rcntUpdate()
	cpuRcntSet();
}

-static __forceinline void _rcntSetGate( int index )
+static __fi void _rcntSetGate( int index )
{
	if (counters[index].mode.EnableGate)
	{

@@ -575,7 +575,7 @@ static __forceinline void _rcntSetGate( int index )
}

// mode - 0 means hblank source, 8 means vblank source.
-__forceinline void rcntStartGate(bool isVblank, u32 sCycle)
+__fi void rcntStartGate(bool isVblank, u32 sCycle)
{
	int i;

@@ -636,7 +636,7 @@ __forceinline void rcntStartGate(bool isVblank, u32 sCycle)
}

// mode - 0 means hblank signal, 8 means vblank signal.
-__forceinline void rcntEndGate(bool isVblank , u32 sCycle)
+__fi void rcntEndGate(bool isVblank , u32 sCycle)
{
	int i;

@@ -677,7 +677,7 @@ __forceinline void rcntEndGate(bool isVblank , u32 sCycle)
	// rcntUpdate, since we're being called from there anyway.
}

-__forceinline void rcntWmode(int index, u32 value)
+__fi void rcntWmode(int index, u32 value)
{
	if(counters[index].mode.IsCounting) {
		if(counters[index].mode.ClockSource != 0x3) {

@@ -711,7 +711,7 @@ __forceinline void rcntWmode(int index, u32 value)
	_rcntSet( index );
}

-__forceinline void rcntWcount(int index, u32 value)
+__fi void rcntWcount(int index, u32 value)
{
	EECNT_LOG("EE Counter[%d] writeCount = %x, oldcount=%x, target=%x", index, value, counters[index].count, counters[index].target );

@@ -737,7 +737,7 @@ __forceinline void rcntWcount(int index, u32 value)
	_rcntSet( index );
}

-__forceinline void rcntWtarget(int index, u32 value)
+__fi void rcntWtarget(int index, u32 value)
{
	EECNT_LOG("EE Counter[%d] writeTarget = %x", index, value);

@@ -766,13 +766,13 @@ __forceinline void rcntWtarget(int index, u32 value)
	_rcntSet( index );
}

-__forceinline void rcntWhold(int index, u32 value)
+__fi void rcntWhold(int index, u32 value)
{
	EECNT_LOG("EE Counter[%d] Hold Write = %x", index, value);
	counters[index].hold = value;
}

-__forceinline u32 rcntRcount(int index)
+__fi u32 rcntRcount(int index)
{
	u32 ret;

@@ -787,7 +787,7 @@ __forceinline u32 rcntRcount(int index)
	return ret;
}

-__forceinline u32 rcntCycle(int index)
+__fi u32 rcntCycle(int index)
{
	if (counters[index].mode.IsCounting && (counters[index].mode.ClockSource != 0x3))
		return counters[index].count + ((cpuRegs.cycle - counters[index].sCycleT) / counters[index].rate);
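// Reading the rcntCycle formula above: a counter last synced at sCycleT with
// rate 2 and a stored count of 100 reads back as 100 + (cpuRegs.cycle - sCycleT) / 2,
// i.e. the saved count plus however many ticks have elapsed since the last write/update.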
pcsx2/Dmac.h (18 lines changed)

@@ -215,9 +215,9 @@ union tDMA_QWC {
	wxString desc() const { return wxsFormat(L"QWC: 0x%x", _u32); }
	tDMA_TAG tag() { return (tDMA_TAG)_u32; }
};
-static __forceinline void setDmacStat(u32 num);
-static __forceinline tDMA_TAG *dmaGetAddr(u32 addr, bool write);
-static __forceinline void throwBusError(const char *s);
+static void setDmacStat(u32 num);
+static tDMA_TAG *dmaGetAddr(u32 addr, bool write);
+static void throwBusError(const char *s);

struct DMACh {
	tDMA_CHCR chcr;

@@ -374,7 +374,7 @@ union tDMAC_QUEUE
	bool empty() const { return (_u16 == 0); }
};

-static __forceinline const wxChar* ChcrName(u32 addr)
+static __fi const wxChar* ChcrName(u32 addr)
{
	switch (addr)
	{

@@ -393,7 +393,7 @@ static __forceinline const wxChar* ChcrName(u32 addr)
}

// Believe it or not, making this const can generate compiler warnings in gcc.
-static __forceinline int ChannelNumber(u32 addr)
+static __fi int ChannelNumber(u32 addr)
{
	switch (addr)
	{

@@ -607,19 +607,19 @@ struct INTCregisters
#define dmacRegs ((DMACregisters*)(PS2MEM_HW+0xE000))
#define intcRegs ((INTCregisters*)(PS2MEM_HW+0xF000))

-static __forceinline void throwBusError(const char *s)
+static __fi void throwBusError(const char *s)
{
	Console.Error("%s BUSERR", s);
	dmacRegs->stat.BEIS = true;
}

-static __forceinline void setDmacStat(u32 num)
+static __fi void setDmacStat(u32 num)
{
	dmacRegs->stat.set_flags(1 << num);
}

// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits)
-static __forceinline tDMA_TAG *SPRdmaGetAddr(u32 addr, bool write)
+static __fi tDMA_TAG *SPRdmaGetAddr(u32 addr, bool write)
{
	// if (addr & 0xf) { DMA_LOG("*PCSX2*: DMA address not 128bit aligned: %8.8x", addr); }

@@ -653,7 +653,7 @@ static __forceinline tDMA_TAG *SPRdmaGetAddr(u32 addr, bool write)
}

// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits)
-static __forceinline tDMA_TAG *dmaGetAddr(u32 addr, bool write)
+static __ri tDMA_TAG *dmaGetAddr(u32 addr, bool write)
{
	// if (addr & 0xf) { DMA_LOG("*PCSX2*: DMA address not 128bit aligned: %8.8x", addr); }
	if (DMA_TAG(addr).SPR) return (tDMA_TAG*)&psS[addr & 0x3ff0];
pcsx2/GS.cpp (22 lines changed)

@@ -79,7 +79,7 @@ void gsReset()
	gsGIFReset();
}

-static __forceinline void gsCSRwrite( const tGS_CSR& csr )
+static __fi void gsCSRwrite( const tGS_CSR& csr )
{
	if (csr.RESET) {

@@ -137,7 +137,7 @@ static __forceinline void gsCSRwrite( const tGS_CSR& csr )
	if(csr.EDWINT) CSRreg.EDWINT = false;
}

-static __forceinline void IMRwrite(u32 value)
+static __fi void IMRwrite(u32 value)
{
	GSIMR = (value & 0x1f00)|0x6000;

@@ -161,7 +161,7 @@ static __forceinline void IMRwrite(u32 value)
	}
}

-__forceinline void gsWrite8(u32 mem, u8 value)
+__fi void gsWrite8(u32 mem, u8 value)
{
	switch (mem)
	{

@@ -189,7 +189,7 @@ __forceinline void gsWrite8(u32 mem, u8 value)
	GIF_LOG("GS write 8 at %8.8lx with data %8.8lx", mem, value);
}

-static __forceinline void _gsSMODEwrite( u32 mem, u32 value )
+static __fi void _gsSMODEwrite( u32 mem, u32 value )
{
	switch (mem)
	{

@@ -206,7 +206,7 @@ static __forceinline void _gsSMODEwrite( u32 mem, u32 value )
//////////////////////////////////////////////////////////////////////////
// GS Write 16 bit

-__forceinline void gsWrite16(u32 mem, u16 value)
+__fi void gsWrite16(u32 mem, u16 value)
{
	GIF_LOG("GS write 16 at %8.8lx with data %8.8lx", mem, value);

@@ -236,7 +236,7 @@ __forceinline void gsWrite16(u32 mem, u16 value)
//////////////////////////////////////////////////////////////////////////
// GS Write 32 bit

-__forceinline void gsWrite32(u32 mem, u32 value)
+__fi void gsWrite32(u32 mem, u32 value)
{
	pxAssume( (mem & 3) == 0 );
	GIF_LOG("GS write 32 at %8.8lx with data %8.8lx", mem, value);

@@ -353,25 +353,25 @@ void __fastcall gsWrite128_generic( u32 mem, const mem128_t* value )
	writeTo[1] = value[1];
}

-__forceinline u8 gsRead8(u32 mem)
+__fi u8 gsRead8(u32 mem)
{
	GIF_LOG("GS read 8 from %8.8lx value: %8.8lx", mem, *(u8*)PS2GS_BASE(mem));
	return *(u8*)PS2GS_BASE(mem);
}

-__forceinline u16 gsRead16(u32 mem)
+__fi u16 gsRead16(u32 mem)
{
	GIF_LOG("GS read 16 from %8.8lx value: %8.8lx", mem, *(u16*)PS2GS_BASE(mem));
	return *(u16*)PS2GS_BASE(mem);
}

-__forceinline u32 gsRead32(u32 mem)
+__fi u32 gsRead32(u32 mem)
{
	GIF_LOG("GS read 32 from %8.8lx value: %8.8lx", mem, *(u32*)PS2GS_BASE(mem));
	return *(u32*)PS2GS_BASE(mem);
}

-__forceinline u64 gsRead64(u32 mem)
+__fi u64 gsRead64(u32 mem)
{
	// fixme - PS2GS_BASE(mem+4) = (g_RealGSMem+(mem + 4 & 0x13ff))
	GIF_LOG("GS read 64 from %8.8lx value: %8.8lx_%8.8lx", mem, *(u32*)PS2GS_BASE(mem+4), *(u32*)PS2GS_BASE(mem) );

@@ -402,7 +402,7 @@ void gsIrq() {
// functions are performed by the EE, which itself uses thread sleep logic to avoid spin
// waiting as much as possible (maximizes CPU resource availability for the GS).

-__forceinline void gsFrameSkip()
+__fi void gsFrameSkip()
{
	static int consec_skipped = 0;
	static int consec_drawn = 0;

@@ -61,7 +61,7 @@ void GSGIFTRANSFER3(u32 *pMem, u32 size) {
	GSgifTransfer3(pMem, size);
}

-__forceinline void GSVSYNC(void) {
+__fi void GSVSYNC(void) {
	if( g_SaveGSStream == 2 ) {
		u32 type = GSRUN_VSYNC;
		g_fGSSave->Freeze( type );
@@ -40,7 +40,7 @@ __aligned16 u8 Path1Buffer[0x1000000];
u32 Path1WritePos = 0;
u32 Path1ReadPos = 0;

-static __forceinline void clearFIFOstuff(bool full)
+static __fi void clearFIFOstuff(bool full)
{
	if (full)
		CSRreg.FIFO = CSR_FIFO_FULL;

@@ -93,7 +93,7 @@ void gsPath1Interrupt()

extern bool SIGNAL_IMR_Pending;

-__forceinline void gsInterrupt()
+__fi void gsInterrupt()
{
	GIF_LOG("gsInterrupt: %8.8x", cpuRegs.cycle);

@@ -182,7 +182,7 @@ int _GIFchain()
	return WRITERING_DMA(pMem, gif->qwc);
}

-static __forceinline void GIFchain()
+static __fi void GIFchain()
{
	// qwc check now done outside this function
	// Voodoocycles

@@ -190,7 +190,7 @@ static __forceinline void GIFchain()
	/*if (gif->qwc)*/ gscycles+= ( _GIFchain() * BIAS); /* guessing */
}

-static __forceinline bool checkTieBit(tDMA_TAG* &ptag)
+static __fi bool checkTieBit(tDMA_TAG* &ptag)
{
	if (gif->chcr.TIE && ptag->IRQ)
	{

@@ -202,7 +202,7 @@ static __forceinline bool checkTieBit(tDMA_TAG* &ptag)
	return false;
}

-static __forceinline tDMA_TAG* ReadTag()
+static __fi tDMA_TAG* ReadTag()
{
	tDMA_TAG* ptag = dmaGetAddr(gif->tadr, false); //Set memory pointer to TADR

@@ -215,7 +215,7 @@ static __forceinline tDMA_TAG* ReadTag()
	return ptag;
}

-static __forceinline tDMA_TAG* ReadTag2()
+static __fi tDMA_TAG* ReadTag2()
{
	tDMA_TAG* ptag = dmaGetAddr(gif->tadr, false); //Set memory pointer to TADR

@@ -443,7 +443,7 @@ void dmaGIF()
}

// called from only one location, so forceinline it:
-static __forceinline bool mfifoGIFrbTransfer()
+static __fi bool mfifoGIFrbTransfer()
{
	u32 mfifoqwc = min(gifqwc, (u32)gif->qwc);
	u32 *src;

@@ -492,7 +492,7 @@ static __forceinline bool mfifoGIFrbTransfer()
}

// called from only one location, so forceinline it:
-static __forceinline bool mfifoGIFchain()
+static __fi bool mfifoGIFchain()
{
	/* Is QWC = 0? if so there is nothing to transfer */
	if (gif->qwc == 0) return true;
@@ -76,7 +76,7 @@ void hwReset()
	vif1Reset();
}

-__forceinline void intcInterrupt()
+__fi void intcInterrupt()
{
	if ((psHu32(INTC_STAT)) == 0) {
		//DevCon.Warning("*PCSX2*: intcInterrupt already cleared");

@@ -97,7 +97,7 @@ __forceinline void intcInterrupt()
	cpuException(0x400, cpuRegs.branch);
}

-__forceinline void dmacInterrupt()
+__fi void dmacInterrupt()
{
	if( ((psHu16(DMAC_STAT + 2) & psHu16(DMAC_STAT)) == 0 ) &&
		( psHu16(DMAC_STAT) & 0x8000) == 0 )

@@ -130,7 +130,7 @@ void hwDmacIrq(int n)
}

// Write 'size' bytes to memory address 'addr' from 'data'.
-__releaseinline bool hwMFIFOWrite(u32 addr, const u128* data, uint qwc)
+__ri bool hwMFIFOWrite(u32 addr, const u128* data, uint qwc)
{
	// all FIFO addresses should always be QWC-aligned.
	pxAssume((dmacRegs->rbor.ADDR & 15) == 0);

@@ -158,7 +158,7 @@ __releaseinline bool hwMFIFOWrite(u32 addr, const u128* data, uint qwc)
	return true;
}

-__releaseinline bool hwDmacSrcChainWithStack(DMACh *dma, int id) {
+__ri bool hwDmacSrcChainWithStack(DMACh *dma, int id) {
	switch (id) {
		case TAG_REFE: // Refe - Transfer Packet According to ADDR field
			//End Transfer
@@ -21,7 +21,7 @@

using namespace R5900;

-static __forceinline void IntCHackCheck()
+static __fi void IntCHackCheck()
{
	// Sanity check: To protect from accidentally "rewinding" the cyclecount
	// on the few times nextBranchCycle can be behind our current cycle.

@@ -32,7 +32,7 @@ static __forceinline void IntCHackCheck()
/////////////////////////////////////////////////////////////////////////
// Hardware READ 8 bit

-__forceinline mem8_t hwRead8(u32 mem)
+__fi mem8_t hwRead8(u32 mem)
{
	u8 ret;

@@ -150,7 +150,7 @@ __forceinline mem8_t hwRead8(u32 mem)
/////////////////////////////////////////////////////////////////////////
// Hardware READ 16 bit

-__forceinline mem16_t hwRead16(u32 mem)
+__fi mem16_t hwRead16(u32 mem)
{
	u16 ret;
	const u16 masked_mem = mem & 0xffff;

@@ -293,7 +293,7 @@ mem32_t __fastcall hwRead32_page_01(u32 mem)

// Reads hardware registers for page 15 (0x0F).
// This is used internally to produce two inline versions, one with INTC_HACK, and one without.
-static __forceinline mem32_t __hwRead32_page_0F( u32 mem, bool intchack )
+static __fi mem32_t __hwRead32_page_0F( u32 mem, bool intchack )
{
	// *Performance Warning* This function is called -A-LOT. Be wary when making changes. It
	// could impact FPS significantly.
@@ -58,7 +58,7 @@ static void StartQueuedDMA()
	if (QueuedDMA.SPR1) { DMA_LOG("Resuming DMA for SPR1"); QueuedDMA.SPR1 = !QuickDmaExec(dmaSPR1, D9_CHCR); }
}

-static _f void DmaExec( void (*func)(), u32 mem, u32 value )
+static __ri void DmaExec( void (*func)(), u32 mem, u32 value )
{
	DMACh *reg = &psH_DMACh(mem);
	tDMA_CHCR chcr(value);

@@ -145,7 +145,7 @@ static _f void DmaExec( void (*func)(), u32 mem, u32 value )

// DmaExec8 should only be called for the second byte of CHCR.
// Testing Note: dark cloud 2 uses 8 bit DMAs register writes.
-static _f void DmaExec8( void (*func)(), u32 mem, u8 value )
+static __fi void DmaExec8( void (*func)(), u32 mem, u8 value )
{
	pxAssumeMsg( (mem & 0xf) == 1, "DmaExec8 should only be called for the second byte of CHCR" );

@@ -154,7 +154,7 @@ static _f void DmaExec8( void (*func)(), u32 mem, u8 value )
	DmaExec( func, mem & ~0xf, (u32)value<<8 );
}

-static _f void DmaExec16( void (*func)(), u32 mem, u16 value )
+static __fi void DmaExec16( void (*func)(), u32 mem, u16 value )
{
	DmaExec( func, mem, (u32)value );
}

@@ -418,7 +418,7 @@ void hwWrite8(u32 mem, u8 value)
	}
}

-__forceinline void hwWrite16(u32 mem, u16 value)
+__ri void hwWrite16(u32 mem, u16 value)
{
	if( mem >= IPU_CMD && mem < D0_CHCR )
		Console.Warning( "hwWrite16 to %x", mem );
|
@ -68,7 +68,7 @@ __aligned16 decoder_t decoder;
|
|||
__aligned16 u8 _readbits[80]; //local buffer (ring buffer)
|
||||
u8* readbits = _readbits; // always can decrement by one 1qw
|
||||
|
||||
__forceinline void IPUProcessInterrupt()
|
||||
__fi void IPUProcessInterrupt()
|
||||
{
|
||||
if (ipuRegs->ctrl.BUSY && g_BP.IFC) IPUWorker();
|
||||
}
|
||||
|
@ -219,7 +219,7 @@ void tIPU_CMD_CSC::log_from_RGB32() const
|
|||
}
|
||||
|
||||
|
||||
__forceinline u32 ipuRead32(u32 mem)
|
||||
__fi u32 ipuRead32(u32 mem)
|
||||
{
|
||||
// Note: It's assumed that mem's input value is always in the 0x10002000 page
|
||||
// of memory (if not, it's probably bad code).
|
||||
|
@ -255,7 +255,7 @@ __forceinline u32 ipuRead32(u32 mem)
|
|||
return *(u32*)(((u8*)ipuRegs) + mem);
|
||||
}
|
||||
|
||||
__forceinline u64 ipuRead64(u32 mem)
|
||||
__fi u64 ipuRead64(u32 mem)
|
||||
{
|
||||
// Note: It's assumed that mem's input value is always in the 0x10002000 page
|
||||
// of memory (if not, it's probably bad code).
|
||||
|
@ -307,7 +307,7 @@ void ipuSoftReset()
|
|||
//g_BP.bufferhasnew = 0;
|
||||
}
|
||||
|
||||
__forceinline void ipuWrite32(u32 mem, u32 value)
|
||||
__fi void ipuWrite32(u32 mem, u32 value)
|
||||
{
|
||||
// Note: It's assumed that mem's input value is always in the 0x10002000 page
|
||||
// of memory (if not, it's probably bad code).
|
||||
|
@ -346,7 +346,7 @@ __forceinline void ipuWrite32(u32 mem, u32 value)
|
|||
}
|
||||
}
|
||||
|
||||
__forceinline void ipuWrite64(u32 mem, u64 value)
|
||||
__fi void ipuWrite64(u32 mem, u64 value)
|
||||
{
|
||||
// Note: It's assumed that mem's input value is always in the 0x10002000 page
|
||||
// of memory (if not, it's probably bad code).
|
||||
|
@ -420,7 +420,7 @@ static BOOL ipuIDEC(u32 val, bool resume)
|
|||
|
||||
static int s_bdec = 0;
|
||||
|
||||
static __forceinline BOOL ipuBDEC(u32 val, bool resume)
|
||||
static __fi BOOL ipuBDEC(u32 val, bool resume)
|
||||
{
|
||||
tIPU_CMD_BDEC bdec(val);
|
||||
|
||||
|
@ -514,7 +514,7 @@ static BOOL __fastcall ipuVDEC(u32 val)
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
static __forceinline BOOL ipuFDEC(u32 val)
|
||||
static __fi BOOL ipuFDEC(u32 val)
|
||||
{
|
||||
if (!getBits32((u8*)&ipuRegs->cmd.DATA, 0)) return FALSE;
|
||||
|
||||
|
@ -691,7 +691,7 @@ static void ipuSETTH(u32 val)
|
|||
///////////////////////
|
||||
// IPU Worker Thread //
|
||||
///////////////////////
|
||||
__forceinline void IPU_INTERRUPT() //dma
|
||||
__fi void IPU_INTERRUPT() //dma
|
||||
{
|
||||
hwIntcIrq(INTC_IPU);
|
||||
}
|
||||
|
@ -901,7 +901,7 @@ void IPUWorker()
|
|||
// Buffer reader
|
||||
|
||||
// move the readbits queue
|
||||
__forceinline void inc_readbits()
|
||||
__fi void inc_readbits()
|
||||
{
|
||||
readbits += 16;
|
||||
if (readbits >= _readbits + 64)
|
||||
|
@ -914,7 +914,7 @@ __forceinline void inc_readbits()
|
|||
}
|
||||
|
||||
// returns the pointer of readbits moved by 1 qword
|
||||
__forceinline u8* next_readbits()
|
||||
__fi u8* next_readbits()
|
||||
{
|
||||
return readbits + 16;
|
||||
}
|
||||
|
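// Ring-buffer sketch (inferred from the code above): _readbits is an 80-byte
// buffer consumed one qword (16 bytes) at a time; inc_readbits() wraps the
// pointer once it passes _readbits + 64, so next_readbits() can always peek a
// full qword ahead of the current read position.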
@@ -1070,7 +1070,7 @@ u8 __fastcall getBits32(u8 *address, u32 advance)
	return 1;
}

-__forceinline u8 __fastcall getBits16(u8 *address, u32 advance)
+__fi u8 __fastcall getBits16(u8 *address, u32 advance)
{
	u32 mask;
	u8* readpos;

@@ -1127,7 +1127,7 @@ u8 __fastcall getBits8(u8 *address, u32 advance)
void Skl_YUV_To_RGB32_MMX(u8 *RGB, const int Dst_BpS, const u8 *Y, const u8 *U, const u8 *V,
	const int Src_BpS, const int Width, const int Height);

-__forceinline void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn)
+__fi void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn)
{
	int i;
	u8* p = (u8*)&rgb32;

@@ -1161,7 +1161,7 @@ __forceinline void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn)
	}
}

-__forceinline void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte)
+__fi void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte)
{
	int i, j;
	for (i = 0; i < 16; ++i)

@@ -1176,12 +1176,12 @@ __forceinline void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& r
	}
}

-__forceinline void ipu_vq(macroblock_rgb16& rgb16, u8* indx4)
+__fi void ipu_vq(macroblock_rgb16& rgb16, u8* indx4)
{
	Console.Error("IPU: VQ not implemented");
}

-__forceinline void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16)
+__fi void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16)
{
	const u8 *s = (const u8*)&mb8;
	s16 *d = (s16*)&mb16;

@@ -1193,7 +1193,7 @@ __forceinline void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16)

-static __forceinline bool ipuDmacPartialChain(tDMA_TAG tag)
+static __fi bool ipuDmacPartialChain(tDMA_TAG tag)
{
	switch (tag.ID)
	{

@@ -1211,7 +1211,7 @@ static __forceinline bool ipuDmacPartialChain(tDMA_TAG tag)
extern void gsInterrupt();
extern void vif1Interrupt();

-static __forceinline void ipuDmacSrcChain()
+static __fi void ipuDmacSrcChain()
{
	switch (IPU1Status.ChainMode)

@@ -1243,7 +1243,7 @@ static __forceinline void ipuDmacSrcChain()
	}
}

-static __forceinline bool WaitGSPaths()
+static __fi bool WaitGSPaths()
{
	if(CHECK_IPUWAITHACK)
	{

@@ -1268,7 +1268,7 @@ static __forceinline bool WaitGSPaths()
	return true;
}

-static __forceinline int IPU1chain() {
+static __fi int IPU1chain() {
	int totalqwc = 0;

@@ -1304,7 +1304,7 @@ static __forceinline int IPU1chain() {
	return totalqwc;
}

-//static __forceinline bool WaitGSPaths()
+//static __fi bool WaitGSPaths()
//{
//	//Wait for all GS paths to be clear
//	if (GSTransferStatus._u32 != 0x2a)

@@ -1524,7 +1524,7 @@ int IPU0dma()
	return readsize;
}

-__forceinline void dmaIPU0() // fromIPU
+__fi void dmaIPU0() // fromIPU
{
	if (ipu0dma->pad != 0)
	{

@@ -1539,7 +1539,7 @@ __forceinline void dmaIPU0() // fromIPU
	if (ipuRegs->ctrl.BUSY) IPUWorker();
}

-__forceinline void dmaIPU1() // toIPU
+__fi void dmaIPU1() // toIPU
{
	IPU_LOG("IPU1DMAStart QWC %x, MADR %x, CHCR %x, TADR %x", ipu1dma->qwc, ipu1dma->madr, ipu1dma->chcr._u32, ipu1dma->tadr);
@@ -27,7 +27,7 @@
#define IPU_INT_TO( cycles ) if(!(cpuRegs.interrupt & (1<<4))) CPU_INT( DMAC_TO_IPU, cycles )
#define IPU_INT_FROM( cycles ) CPU_INT( DMAC_FROM_IPU, cycles )

-#define IPU_FORCEINLINE __forceinline
+#define IPU_FORCEINLINE __fi

struct IPUStatus {
	bool InProgress;

@@ -168,7 +168,7 @@ void IPU_Fifo_Output::readsingle(void *value)
	}
}

-__forceinline bool decoder_t::ReadIpuData(u128* out)
+__fi bool decoder_t::ReadIpuData(u128* out)
{
	if(decoder.ipu0_data == 0) return false;
	_mm_store_ps((float*)out, _mm_load_ps((float*)GetIpuDataPtr()));
@@ -66,7 +66,7 @@ do { \
} while (0)
#endif

-static __forceinline void idct_row (s16 * const block)
+static __fi void idct_row (s16 * const block)
{
	int d0, d1, d2, d3;
	int a0, a1, a2, a3, b0, b1, b2, b3;

@@ -119,7 +119,7 @@ static __forceinline void idct_row (s16 * const block)
	block[7] = (a0 - b0) >> 8;
}

-static __forceinline void idct_col (s16 * const block)
+static __fi void idct_col (s16 * const block)
{
	int d0, d1, d2, d3;
	int a0, a1, a2, a3, b0, b1, b2, b3;

@@ -160,7 +160,7 @@ static __forceinline void idct_col (s16 * const block)
	block[8*7] = (a0 - b0) >> 17;
}

-__releaseinline void mpeg2_idct_copy(s16 * block, u8 * dest, const int stride)
+__ri void mpeg2_idct_copy(s16 * block, u8 * dest, const int stride)
{
	int i;

@@ -189,7 +189,7 @@ __releaseinline void mpeg2_idct_copy(s16 * block, u8 * dest, const int stride)

// stride = increment for dest in 16-bit units (typically either 8 [128 bits] or 16 [256 bits]).
-__releaseinline void mpeg2_idct_add (const int last, s16 * block, s16 * dest, const int stride)
+__ri void mpeg2_idct_add (const int last, s16 * block, s16 * dest, const int stride)
{
	// on the IPU, stride is always assured to be multiples of QWC (bottom 3 bits are 0).
@@ -164,7 +164,7 @@ intra:
	}
}

-static __forceinline int get_quantizer_scale()
+static __fi int get_quantizer_scale()
{
	int quantizer_scale_code;

@@ -176,7 +176,7 @@ static __forceinline int get_quantizer_scale()
	return quantizer_scale_code << 1;
}

-static __forceinline int get_coded_block_pattern()
+static __fi int get_coded_block_pattern()
{
	const CBPtab * tab;
	u16 code = UBITS(16);

@@ -190,7 +190,7 @@ static __forceinline int get_coded_block_pattern()
	return tab->cbp;
}

-int __forceinline get_motion_delta(const int f_code)
+int __fi get_motion_delta(const int f_code)
{
	int delta;
	int sign;

@@ -219,7 +219,7 @@ int __forceinline get_motion_delta(const int f_code)
	return (delta ^ sign) - sign;
}

-int __forceinline get_dmv()
+int __fi get_dmv()
{
	const DMVtab * tab;

@@ -261,7 +261,7 @@ int get_macroblock_address_increment()
	return mba->mba + 1;
}

-static __forceinline int get_luma_dc_dct_diff()
+static __fi int get_luma_dc_dct_diff()
{
	int size;
	int dc_diff;

@@ -297,7 +297,7 @@ static __forceinline int get_luma_dc_dct_diff()
	return dc_diff;
}

-static __forceinline int get_chroma_dc_dct_diff()
+static __fi int get_chroma_dc_dct_diff()
{
	int size;
	int dc_diff;

@@ -336,7 +336,7 @@ do { \
	val = (((s32)val) >> 31) ^ 2047; \
} while (0)

-static __forceinline bool get_intra_block()
+static __fi bool get_intra_block()
{
	int i;
	int j;

@@ -488,7 +488,7 @@ static __forceinline bool get_intra_block()
	return true;
}

-static __forceinline bool get_non_intra_block(int * last)
+static __fi bool get_non_intra_block(int * last)
{
	int i;
	int j;

@@ -629,7 +629,7 @@ static __forceinline bool get_non_intra_block(int * last)
	return true;
}

-static __forceinline bool slice_intra_DCT(const int cc, u8 * const dest, const int stride, const bool skip)
+static __fi bool slice_intra_DCT(const int cc, u8 * const dest, const int stride, const bool skip)
{
	if (!skip || ipu_cmd.pos[3])
	{

@@ -659,7 +659,7 @@ static __forceinline bool slice_intra_DCT(const int cc, u8 * const dest, const i
	return true;
}

-static __forceinline bool slice_non_intra_DCT(s16 * const dest, const int stride, const bool skip)
+static __fi bool slice_non_intra_DCT(s16 * const dest, const int stride, const bool skip)
{
	int last;

@@ -678,7 +678,7 @@ static __forceinline bool slice_non_intra_DCT(s16 * const dest, const int stride
	return true;
}

-void __forceinline finishmpeg2sliceIDEC()
+void __fi finishmpeg2sliceIDEC()
{
	ipuRegs->ctrl.SCD = 0;
	coded_block_pattern = decoder.coded_block_pattern;
@@ -34,7 +34,7 @@
//static u8 dword[8];
//static u8 qword[16];

-static __forceinline int GETWORD()
+static __fi int GETWORD()
{
	static u8 data[2];

@@ -56,7 +56,7 @@ static __forceinline int GETWORD()
	return 1;
}

-static __forceinline int bitstream_init ()
+static __fi int bitstream_init ()
{
	if (!getBits32((u8*)&decoder.bitstream_buf, 1))
	{

@@ -72,7 +72,7 @@ static __forceinline int bitstream_init ()
}

/* remove num valid bits from bit_buf */
-static __forceinline void DUMPBITS(int num)
+static __fi void DUMPBITS(int num)
{
	decoder.bitstream_buf <<= num;
	decoder.bitstream_bits += num;

@@ -85,7 +85,7 @@ static __forceinline void DUMPBITS(int num)
#define SBITS(num) (((s32)decoder.bitstream_buf) >> (32 - (num)))

/* Get bits from bitstream */
-static __forceinline u32 GETBITS(int num)
+static __fi u32 GETBITS(int num)
{
	u16 retVal = UBITS(num);
	DUMPBITS(num);
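// How the pair cooperates (UBITS assumed to return the top 'num' bits, as the
// SBITS define above suggests): GETBITS(num) peeks num bits from the top of
// decoder.bitstream_buf, then DUMPBITS(num) shifts them out and advances the
// bit counter -- e.g. GETBITS(4) on a buffer whose top nibble is 1011 returns 0xB.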
@ -112,7 +112,7 @@ static const __aligned16 SSE2_Tables sse2_tables =
|
|||
static __aligned16 u16 yuv2rgb_temp[3][8];
|
||||
|
||||
// This could potentially be improved for SSE4
|
||||
__releaseinline void yuv2rgb_sse2(void)
|
||||
__ri void yuv2rgb_sse2(void)
|
||||
{
|
||||
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
|
||||
__asm {
|
||||
|
|
|
@ -77,7 +77,7 @@ static void execI()
|
|||
opcode.interpret();
|
||||
}
|
||||
|
||||
static __forceinline void _doBranch_shared(u32 tar)
|
||||
static __fi void _doBranch_shared(u32 tar)
|
||||
{
|
||||
branch2 = cpuRegs.branch = 1;
|
||||
execI();
|
||||
|
|
|
@@ -107,7 +107,7 @@ public:
 		fd = hostfd;
 	}
 
-	static __forceinline int translate_error(int err)
+	static __fi int translate_error(int err)
 	{
 		if (err >= 0)
 			return err;
@@ -194,7 +194,7 @@ static void __fastcall _rcntTestTarget( int i )
 }
 
 
-static __forceinline void _rcntTestOverflow( int i )
+static __fi void _rcntTestOverflow( int i )
 {
 	u64 maxTarget = ( i < 3 ) ? 0xffff : 0xfffffffful;
 	if( psxCounters[i].count <= maxTarget ) return;
@@ -538,7 +538,7 @@ void psxRcntWcount32(int index, u32 value)
 
 //////////////////////////////////////////////////////////////////////////////////////////
 //
-__forceinline void psxRcntWmode16( int index, u32 value )
+__fi void psxRcntWmode16( int index, u32 value )
 {
 	PSXCNT_LOG( "IOP Counter[%d] writeMode = 0x%04X", index, value );
 
@@ -599,7 +599,7 @@ __forceinline void psxRcntWmode16( int index, u32 value )
 
 //////////////////////////////////////////////////////////////////////////////////////////
 //
-__forceinline void psxRcntWmode32( int index, u32 value )
+__fi void psxRcntWmode32( int index, u32 value )
 {
 	PSXCNT_LOG( "IOP Counter[%d] writeMode = 0x%04x", index, value );
 
@@ -261,10 +261,10 @@ struct DmaHandlerInfo
 	DmaIHandler Interrupt;
 	DmaSHandler Start;
 
-	__forceinline u32& REG_MADR(void) const { return psxHu32(DmacRegisterBase + 0x0); }
-	__forceinline u32& REG_BCR(void) const { return psxHu32(DmacRegisterBase + 0x4); }
-	__forceinline u32& REG_CHCR(void) const { return psxHu32(DmacRegisterBase + 0x8); }
-	__forceinline u32& REG_TADR(void) const { return psxHu32(DmacRegisterBase + 0xC); }
+	__fi u32& REG_MADR(void) const { return psxHu32(DmacRegisterBase + 0x0); }
+	__fi u32& REG_BCR(void) const { return psxHu32(DmacRegisterBase + 0x4); }
+	__fi u32& REG_CHCR(void) const { return psxHu32(DmacRegisterBase + 0x8); }
+	__fi u32& REG_TADR(void) const { return psxHu32(DmacRegisterBase + 0xC); }
 };
 
 #define MEM_BASE1 0x1f801080
@@ -452,7 +452,7 @@ void IopDmaStart(int channel)
 // IopDmaProcessChannel: Called from IopDmaUpdate (below) to process a dma channel
 
 template<int channel>
-static void __releaseinline IopDmaProcessChannel(int elapsed, int& MinDelay)
+static void __ri IopDmaProcessChannel(int elapsed, int& MinDelay)
 {
 	// Hopefully the compiler would be able to optimize the whole function away if this doesn't pass.
 	if(!(IopDmaHandlers[channel].DirectionFlags&_E__))
@@ -37,7 +37,7 @@ void psxHwReset() {
 	//sio2Reset();
 }
 
-__forceinline u8 psxHw4Read8(u32 add)
+__fi u8 psxHw4Read8(u32 add)
 {
 	u16 mem = add & 0xFF;
 	u8 ret = cdvdRead(mem);
@@ -45,7 +45,7 @@ __forceinline u8 psxHw4Read8(u32 add)
 	return ret;
 }
 
-__forceinline void psxHw4Write8(u32 add, u8 value)
+__fi void psxHw4Write8(u32 add, u8 value)
 {
 	u8 mem = (u8)add;	// only lower 8 bits are relevant (cdvd regs mirror across the page)
 	cdvdWrite(mem, value);
@@ -30,7 +30,7 @@ extern const uptr *psxMemRLUT;
 // Hacky!  This should really never be used, ever, since it bypasses the iop's Hardware
 // Register handler and SPU/DEV/USB maps.
 template<typename T>
-static __forceinline T* iopVirtMemW( u32 mem )
+static __fi T* iopVirtMemW( u32 mem )
 {
 	return (psxMemWLUT[(mem) >> 16] == 0) ? NULL : (T*)(psxMemWLUT[(mem) >> 16] + ((mem) & 0xffff));
 }
@@ -42,14 +42,14 @@ static __forceinline T* iopVirtMemW( u32 mem )
 // TLB should be using iopMemRead/Write instead for each individual access.  That ensures
 // correct handling of page boundary crossings.
 template<typename T>
-static __forceinline const T* iopVirtMemR( u32 mem )
+static __fi const T* iopVirtMemR( u32 mem )
 {
 	mem &= 0x1fffffff;
 	return (psxMemRLUT[mem >> 16] == 0) ? NULL : (const T*)(psxMemRLUT[mem >> 16] + (mem & 0xffff));
 }
 
 // Obtains a pointer to the IOP's physical mapping (bypasses the TLB)
-static __forceinline u8* iopPhysMem( u32 addr )
+static __fi u8* iopPhysMem( u32 addr )
 {
 	return &psxM[addr & 0x1fffff];
 }
104	pcsx2/MMI.cpp
@@ -145,7 +145,7 @@ namespace MMI {
 
 //*****************MMI OPCODES*********************************
 
-__forceinline void _PLZCW(int n)
+static __fi void _PLZCW(int n)
 {
 	// This function counts the number of "like" bits in the source register, starting
 	// with the MSB and working its way down, and returns the result MINUS ONE.
@@ -171,7 +171,7 @@ void PLZCW() {
 	_PLZCW (1);
 }
 
-__forceinline void PMFHL_CLAMP(u16 dst, u16 src)
+__fi void PMFHL_CLAMP(u16 dst, u16 src)
 {
 	if ((int)src > (int)0x00007fff)
 		dst = 0x7fff;
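
As the comment in _PLZCW above describes, the op counts how many leading bits match the sign bit and returns that count minus one. A standalone sketch of that semantic (illustrative helper, not the file's implementation):

    #include <cstdint>

    // Count leading bits of 'val' that equal its sign bit, minus one.
    // e.g. plzcw(0) == 31: 32 leading "like" bits including bit 31, minus one.
    static uint32_t plzcw(int32_t val)
    {
    	uint32_t count = 0;
    	const int32_t sign = val >> 31;      // 0 or -1: the sign bit, replicated
    	for (int bit = 30; bit >= 0; bit--)  // bit 31 is the reference, so start at 30
    	{
    		if (((val >> bit) & 1) != (sign & 1))
    			break;
    		count++;
    	}
    	return count;
    }
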
@@ -254,7 +254,7 @@ void PMTHL() {
 	cpuRegs.HI.UL[2] = cpuRegs.GPR.r[_Rs_].UL[3];
 }
 
-__forceinline void _PSLLH(int n)
+static __fi void _PSLLH(int n)
 {
 	cpuRegs.GPR.r[_Rd_].US[n] = cpuRegs.GPR.r[_Rt_].US[n] << ( _Sa_ & 0xf );
 }
@@ -266,7 +266,7 @@ void PSLLH() {
 	_PSLLH(4); _PSLLH(5); _PSLLH(6); _PSLLH(7);
 }
 
-__forceinline void _PSRLH(int n)
+static __fi void _PSRLH(int n)
 {
 	cpuRegs.GPR.r[_Rd_].US[n] = cpuRegs.GPR.r[_Rt_].US[n] >> ( _Sa_ & 0xf );
 }
@@ -278,7 +278,7 @@ void PSRLH () {
 	_PSRLH(4); _PSRLH(5); _PSRLH(6); _PSRLH(7);
 }
 
-__forceinline void _PSRAH(int n)
+static __fi void _PSRAH(int n)
 {
 	cpuRegs.GPR.r[_Rd_].US[n] = cpuRegs.GPR.r[_Rt_].SS[n] >> ( _Sa_ & 0xf );
 }
@@ -290,7 +290,7 @@ void PSRAH() {
 	_PSRAH(4); _PSRAH(5); _PSRAH(6); _PSRAH(7);
 }
 
-__forceinline void _PSLLW(int n)
+static __fi void _PSLLW(int n)
 {
 	cpuRegs.GPR.r[_Rd_].UL[n] = cpuRegs.GPR.r[_Rt_].UL[n] << _Sa_;
 }
@@ -301,7 +301,7 @@ void PSLLW() {
 	_PSLLW(0); _PSLLW(1); _PSLLW(2); _PSLLW(3);
 }
 
-__forceinline void _PSRLW(int n)
+static __fi void _PSRLW(int n)
 {
 	cpuRegs.GPR.r[_Rd_].UL[n] = cpuRegs.GPR.r[_Rt_].UL[n] >> _Sa_;
 }
@@ -312,7 +312,7 @@ void PSRLW() {
 	_PSRLW(0); _PSRLW(1); _PSRLW(2); _PSRLW(3);
 }
 
-__forceinline void _PSRAW(int n)
+static __fi void _PSRAW(int n)
 {
 	cpuRegs.GPR.r[_Rd_].UL[n] = cpuRegs.GPR.r[_Rt_].SL[n] >> _Sa_;
 }
@@ -326,7 +326,7 @@ void PSRAW() {
 //*****************END OF MMI OPCODES**************************
 //*************************MMI0 OPCODES************************
 
-__forceinline void _PADDW(int n)
+static __fi void _PADDW(int n)
 {
 	cpuRegs.GPR.r[_Rd_].UL[n] = cpuRegs.GPR.r[_Rs_].UL[n] + cpuRegs.GPR.r[_Rt_].UL[n];
 }
@@ -337,7 +337,7 @@ void PADDW() {
 	_PADDW(0); _PADDW(1); _PADDW(2); _PADDW(3);
 }
 
-__forceinline void _PSUBW(int n)
+static __fi void _PSUBW(int n)
 {
 	cpuRegs.GPR.r[_Rd_].UL[n] = cpuRegs.GPR.r[_Rs_].UL[n] - cpuRegs.GPR.r[_Rt_].UL[n];
 }
@@ -348,7 +348,7 @@ void PSUBW() {
 	_PSUBW(0); _PSUBW(1); _PSUBW(2); _PSUBW(3);
 }
 
-__forceinline void _PCGTW(int n)
+static __fi void _PCGTW(int n)
 {
 	if (cpuRegs.GPR.r[_Rs_].SL[n] > cpuRegs.GPR.r[_Rt_].SL[n])
 		cpuRegs.GPR.r[_Rd_].UL[n] = 0xFFFFFFFF;
@@ -362,7 +362,7 @@ void PCGTW() {
 	_PCGTW(0); _PCGTW(1); _PCGTW(2); _PCGTW(3);
 }
 
-__forceinline void _PMAXW(int n)
+static __fi void _PMAXW(int n)
 {
 	if (cpuRegs.GPR.r[_Rs_].SL[n] > cpuRegs.GPR.r[_Rt_].SL[n])
 		cpuRegs.GPR.r[_Rd_].UL[n] = cpuRegs.GPR.r[_Rs_].UL[n];
@@ -376,7 +376,7 @@ void PMAXW() {
 	_PMAXW(0); _PMAXW(1); _PMAXW(2); _PMAXW(3);
 }
 
-__forceinline void _PADDH(int n)
+static __fi void _PADDH(int n)
 {
 	cpuRegs.GPR.r[_Rd_].US[n] = cpuRegs.GPR.r[_Rs_].US[n] + cpuRegs.GPR.r[_Rt_].US[n];
 }
@@ -388,7 +388,7 @@ void PADDH() {
 	_PADDH(4); _PADDH(5); _PADDH(6); _PADDH(7);
 }
 
-__forceinline void _PSUBH(int n)
+static __fi void _PSUBH(int n)
 {
 	cpuRegs.GPR.r[_Rd_].US[n] = cpuRegs.GPR.r[_Rs_].US[n] - cpuRegs.GPR.r[_Rt_].US[n];
 }
@@ -400,7 +400,7 @@ void PSUBH() {
 	_PSUBH(4); _PSUBH(5); _PSUBH(6); _PSUBH(7);
 }
 
-__forceinline void _PCGTH(int n)
+static __fi void _PCGTH(int n)
 {
 	if (cpuRegs.GPR.r[_Rs_].SS[n] > cpuRegs.GPR.r[_Rt_].SS[n])
 		cpuRegs.GPR.r[_Rd_].US[n] = 0xFFFF;
@@ -415,7 +415,7 @@ void PCGTH() {
 	_PCGTH(4); _PCGTH(5); _PCGTH(6); _PCGTH(7);
 }
 
-__forceinline void _PMAXH(int n)
+static __fi void _PMAXH(int n)
 {
 	if (cpuRegs.GPR.r[_Rs_].SS[n] > cpuRegs.GPR.r[_Rt_].SS[n])
 		cpuRegs.GPR.r[_Rd_].US[n] = cpuRegs.GPR.r[_Rs_].US[n];
@@ -430,7 +430,7 @@ void PMAXH() {
 	_PMAXH(4); _PMAXH(5); _PMAXH(6); _PMAXH(7);
 }
 
-__forceinline void _PADDB(int n)
+static __fi void _PADDB(int n)
 {
 	cpuRegs.GPR.r[_Rd_].SC[n] = cpuRegs.GPR.r[_Rs_].SC[n] + cpuRegs.GPR.r[_Rt_].SC[n];
 }
@@ -443,7 +443,7 @@ void PADDB() {
 		_PADDB( i );
 }
 
-__forceinline void _PSUBB(int n)
+static __fi void _PSUBB(int n)
 {
 	cpuRegs.GPR.r[_Rd_].SC[n] = cpuRegs.GPR.r[_Rs_].SC[n] - cpuRegs.GPR.r[_Rt_].SC[n];
 }
@@ -456,7 +456,7 @@ void PSUBB() {
 		_PSUBB( i );
 }
 
-__forceinline void _PCGTB(int n)
+static __fi void _PCGTB(int n)
 {
 	if (cpuRegs.GPR.r[_Rs_].SC[n] > cpuRegs.GPR.r[_Rt_].SC[n])
 		cpuRegs.GPR.r[_Rd_].UC[n] = 0xFF;
@@ -472,7 +472,7 @@ void PCGTB() {
 		_PCGTB( i );
 }
 
-__forceinline void _PADDSW(int n)
+static __fi void _PADDSW(int n)
 {
 	s64 sTemp64;
 
@@ -491,7 +491,7 @@ void PADDSW() {
 	_PADDSW(0); _PADDSW(1); _PADDSW(2); _PADDSW(3);
 }
 
-__forceinline void _PSUBSW(int n)
+static __fi void _PSUBSW(int n)
 {
 	s64 sTemp64;
 
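
_PADDSW/_PSUBSW above widen to an s64 temporary before clamping. A minimal sketch of the same 32-bit saturating-add pattern (illustrative, not the file's exact code):

    #include <cstdint>
    #include <limits>

    // Saturating 32-bit add: widen to 64 bits, then clamp to the s32 range,
    // mirroring the s64 sTemp64 pattern used by _PADDSW above.
    static int32_t add32_saturate(int32_t a, int32_t b)
    {
    	int64_t sum = (int64_t)a + (int64_t)b;
    	if (sum > std::numeric_limits<int32_t>::max())
    		return std::numeric_limits<int32_t>::max();  // positive overflow clamps to max
    	if (sum < std::numeric_limits<int32_t>::min())
    		return std::numeric_limits<int32_t>::min();  // negative overflow clamps to min
    	return (int32_t)sum;
    }
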
@@ -538,7 +538,7 @@ void PPACW() {
 	cpuRegs.GPR.r[_Rd_].UL[3] = Rs.UL[2];
 }
 
-__forceinline void _PADDSH(int n)
+__fi void _PADDSH(int n)
 {
 	s32 sTemp32;
 	sTemp32 = (s32)cpuRegs.GPR.r[_Rs_].SS[n] + (s32)cpuRegs.GPR.r[_Rt_].SS[n];
@@ -558,7 +558,7 @@ void PADDSH() {
 	_PADDSH(4); _PADDSH(5); _PADDSH(6); _PADDSH(7);
 }
 
-__forceinline void _PSUBSH(int n)
+__fi void _PSUBSH(int n)
 {
 	s32 sTemp32;
 	sTemp32 = (s32)cpuRegs.GPR.r[_Rs_].SS[n] - (s32)cpuRegs.GPR.r[_Rt_].SS[n];
@@ -610,7 +610,7 @@ void PPACH() {
 	cpuRegs.GPR.r[_Rd_].US[7] = Rs.US[6];
 }
 
-__forceinline void _PADDSB(int n)
+__fi void _PADDSB(int n)
 {
 	s16 sTemp16;
 	sTemp16 = (s16)cpuRegs.GPR.r[_Rs_].SC[n] + (s16)cpuRegs.GPR.r[_Rt_].SC[n];
@@ -631,7 +631,7 @@ void PADDSB() {
 		_PADDSB(i);
 }
 
-static __forceinline void _PSUBSB( u8 n )
+static __fi void _PSUBSB( u8 n )
 {
 	s16 sTemp16;
 	sTemp16 = (s16)cpuRegs.GPR.r[_Rs_].SC[n] - (s16)cpuRegs.GPR.r[_Rt_].SC[n];
@@ -706,7 +706,7 @@ void PPACB() {
 	cpuRegs.GPR.r[_Rd_].UC[15] = Rs.UC[14];
 }
 
-__forceinline void _PEXT5(int n)
+__fi void _PEXT5(int n)
 {
 	cpuRegs.GPR.r[_Rd_].UL[n] =
 		((cpuRegs.GPR.r[_Rt_].UL[n] & 0x0000001F) <<  3) |
@@ -721,7 +721,7 @@ void PEXT5() {
 	_PEXT5(0); _PEXT5(1); _PEXT5(2); _PEXT5(3);
 }
 
-__forceinline void _PPAC5(int n)
+__fi void _PPAC5(int n)
 {
 	cpuRegs.GPR.r[_Rd_].UL[n] =
 		((cpuRegs.GPR.r[_Rt_].UL[n] >>  3) & 0x0000001F) |
@@ -739,7 +739,7 @@ void PPAC5() {
 //***END OF MMI0 OPCODES******************************************
 //**********MMI1 OPCODES**************************************
 
-__forceinline void _PABSW(int n)
+static __fi void _PABSW(int n)
 {
 	if (cpuRegs.GPR.r[_Rt_].UL[n] == 0x80000000)
 		cpuRegs.GPR.r[_Rd_].UL[n] = 0x7fffffff; //clamp
@@ -755,7 +755,7 @@ void PABSW() {
 	_PABSW(0); _PABSW(1); _PABSW(2); _PABSW(3);
 }
 
-__forceinline void _PCEQW(int n)
+static __fi void _PCEQW(int n)
 {
 	if (cpuRegs.GPR.r[_Rs_].UL[n] == cpuRegs.GPR.r[_Rt_].UL[n])
 		cpuRegs.GPR.r[_Rd_].UL[n] = 0xFFFFFFFF;
@@ -769,7 +769,7 @@ void PCEQW() {
 	_PCEQW(0); _PCEQW(1); _PCEQW(2); _PCEQW(3);
 }
 
-static __forceinline void _PMINW( u8 n )
+static __fi void _PMINW( u8 n )
 {
 	if (cpuRegs.GPR.r[_Rs_].SL[n] < cpuRegs.GPR.r[_Rt_].SL[n])
 		cpuRegs.GPR.r[_Rd_].SL[n] = cpuRegs.GPR.r[_Rs_].SL[n];
@@ -790,7 +790,7 @@ void PADSBH() {
 	_PADDH(4); _PADDH(5); _PADDH(6); _PADDH(7);
 }
 
-__forceinline void _PABSH(int n)
+static __fi void _PABSH(int n)
 {
 	if (cpuRegs.GPR.r[_Rt_].US[n] == 0x8000)
 		cpuRegs.GPR.r[_Rd_].US[n] = 0x7fff; //clamp
@@ -807,7 +807,7 @@ void PABSH() {
 	_PABSH(4); _PABSH(5); _PABSH(6); _PABSH(7);
 }
 
-static __forceinline void _PCEQH( u8 n )
+static __fi void _PCEQH( u8 n )
 {
 	if (cpuRegs.GPR.r[_Rs_].US[n] == cpuRegs.GPR.r[_Rt_].US[n])
 		cpuRegs.GPR.r[_Rd_].US[n] = 0xFFFF;
@@ -822,7 +822,7 @@ void PCEQH() {
 	_PCEQH(4); _PCEQH(5); _PCEQH(6); _PCEQH(7);
 }
 
-static __forceinline void _PMINH( u8 n )
+static __fi void _PMINH( u8 n )
 {
 	if (cpuRegs.GPR.r[_Rs_].SS[n] < cpuRegs.GPR.r[_Rt_].SS[n])
 		cpuRegs.GPR.r[_Rd_].US[n] = cpuRegs.GPR.r[_Rs_].US[n];
@@ -837,7 +837,7 @@ void PMINH() {
 	_PMINH(4); _PMINH(5); _PMINH(6); _PMINH(7);
 }
 
-__forceinline void _PCEQB(int n)
+__fi void _PCEQB(int n)
 {
 	if (cpuRegs.GPR.r[_Rs_].UC[n] == cpuRegs.GPR.r[_Rt_].UC[n])
 		cpuRegs.GPR.r[_Rd_].UC[n] = 0xFF;
@@ -853,7 +853,7 @@ void PCEQB() {
 		_PCEQB(i);
 }
 
-__forceinline void _PADDUW(int n)
+__fi void _PADDUW(int n)
 {
 	s64 tmp;
 	tmp = (s64)cpuRegs.GPR.r[_Rs_].UL[n] + (s64)cpuRegs.GPR.r[_Rt_].UL[n];
@@ -870,7 +870,7 @@ void PADDUW () {
 	_PADDUW(0); _PADDUW(1); _PADDUW(2); _PADDUW(3);
 }
 
-__forceinline void _PSUBUW(int n)
+__fi void _PSUBUW(int n)
 {
 	s64 sTemp64;
 	sTemp64 = (s64)cpuRegs.GPR.r[_Rs_].UL[n] - (s64)cpuRegs.GPR.r[_Rt_].UL[n];
@@ -899,7 +899,7 @@ void PEXTUW() {
 	cpuRegs.GPR.r[_Rd_].UL[3] = Rs.UL[3];
 }
 
-__forceinline void _PADDUH(int n)
+__fi void _PADDUH(int n)
 {
 	s32 sTemp32;
 	sTemp32 = (s32)cpuRegs.GPR.r[_Rs_].US[n] + (s32)cpuRegs.GPR.r[_Rt_].US[n];
@@ -917,7 +917,7 @@ void PADDUH() {
 	_PADDUH(4); _PADDUH(5); _PADDUH(6); _PADDUH(7);
 }
 
-__forceinline void _PSUBUH(int n)
+__fi void _PSUBUH(int n)
 {
 	s32 sTemp32;
 	sTemp32 = (s32)cpuRegs.GPR.r[_Rs_].US[n] - (s32)cpuRegs.GPR.r[_Rt_].US[n];
@@ -952,7 +952,7 @@ void PEXTUH() {
 	cpuRegs.GPR.r[_Rd_].US[7] = Rs.US[7];
 }
 
-__forceinline void _PADDUB(int n)
+__fi void _PADDUB(int n)
 {
 	u16 Temp16;
 	Temp16 = (u16)cpuRegs.GPR.r[_Rs_].UC[n] + (u16)cpuRegs.GPR.r[_Rt_].UC[n];
@@ -971,7 +971,7 @@ void PADDUB() {
 		_PADDUB(i);
 }
 
-__forceinline void _PSUBUB(int n) {
+__fi void _PSUBUB(int n) {
 	s16 sTemp16;
 	sTemp16 = (s16)cpuRegs.GPR.r[_Rs_].UC[n] - (s16)cpuRegs.GPR.r[_Rt_].UC[n];
 
@@ -1060,7 +1060,7 @@ void QFSRV() { // JayteeMaster: changed a bit to avoid screw up
 
 //*********MMI2 OPCODES***************************************
 
-__forceinline void _PMADDW(int dd, int ss)
+static __fi void _PMADDW(int dd, int ss)
 {
 	s64 temp = (s64)((s64)cpuRegs.LO.SL[ss] | ((s64)cpuRegs.HI.SL[ss] << 32)) +
 		((s64)cpuRegs.GPR.r[_Rs_].SL[ss] * (s64)cpuRegs.GPR.r[_Rt_].SL[ss]);
@@ -1094,7 +1094,7 @@ void PSRLVW() {
 		(cpuRegs.GPR.r[_Rs_].UL[2] & 0x1F));
 }
 
-__forceinline void _PMSUBW(int dd, int ss)
+__fi void _PMSUBW(int dd, int ss)
 {
 	s64 temp = (s64)((s64)cpuRegs.LO.SL[ss] | ((s64)cpuRegs.HI.SL[ss] << 32)) -
 		((s64)cpuRegs.GPR.r[_Rs_].SL[ss] * (s64)cpuRegs.GPR.r[_Rt_].SL[ss]);
@@ -1140,7 +1140,7 @@ void PINTH() {
 	cpuRegs.GPR.r[_Rd_].US[7] = Rs.US[7];
 }
 
-__forceinline void _PMULTW(int dd, int ss)
+__fi void _PMULTW(int dd, int ss)
 {
 	s64 temp = (s64)cpuRegs.GPR.r[_Rs_].SL[ss] * (s64)cpuRegs.GPR.r[_Rt_].SL[ss];
 
@@ -1155,7 +1155,7 @@ void PMULTW() {
 	_PMULTW(1, 2);
 }
 
-__forceinline void _PDIVW(int dd, int ss)
+__fi void _PDIVW(int dd, int ss)
 {
 	if (cpuRegs.GPR.r[_Rs_].UL[ss] == 0x80000000 && cpuRegs.GPR.r[_Rt_].UL[ss] == 0xffffffff)
 	{
@@ -1229,7 +1229,7 @@ void PMADDH() { // JayteeMaster: changed a bit to avoid screw up
 }
 
 // JayteeMaster: changed a bit to avoid screw up
-__forceinline void _PHMADH_LO(int dd, int n)
+__fi void _PHMADH_LO(int dd, int n)
 {
 	s32 firsttemp = (s32)cpuRegs.GPR.r[_Rs_].SS[n+1] * (s32)cpuRegs.GPR.r[_Rt_].SS[n+1];
 	s32 temp = firsttemp + (s32)cpuRegs.GPR.r[_Rs_].SS[n] * (s32)cpuRegs.GPR.r[_Rt_].SS[n];
@@ -1238,7 +1238,7 @@ __forceinline void _PHMADH_LO(int dd, int n)
 	cpuRegs.LO.UL[dd+1] = firsttemp;
 }
 
-__forceinline void _PHMADH_HI(int dd, int n)
+__fi void _PHMADH_HI(int dd, int n)
 {
 	s32 firsttemp = (s32)cpuRegs.GPR.r[_Rs_].SS[n+1] * (s32)cpuRegs.GPR.r[_Rt_].SS[n+1];
 	s32 temp = firsttemp + (s32)cpuRegs.GPR.r[_Rs_].SS[n] * (s32)cpuRegs.GPR.r[_Rt_].SS[n];
@@ -1314,7 +1314,7 @@ void PMSUBH() { // JayteeMaster: changed a bit to avoid screw up
 }
 
 // JayteeMaster: changed a bit to avoid screw up
-__forceinline void _PHMSBH_LO(int dd, int n, int rdd)
+static __fi void _PHMSBH_LO(int dd, int n, int rdd)
 {
 	s32 firsttemp = (s32)cpuRegs.GPR.r[_Rs_].SS[n+1] * (s32)cpuRegs.GPR.r[_Rt_].SS[n+1];
 	s32 temp = firsttemp - (s32)cpuRegs.GPR.r[_Rs_].SS[n] * (s32)cpuRegs.GPR.r[_Rt_].SS[n];
@@ -1322,7 +1322,7 @@ __forceinline void _PHMSBH_LO(int dd, int n, int rdd)
 	cpuRegs.LO.UL[dd] = temp;
 	cpuRegs.LO.UL[dd+1] = ~firsttemp;
 }
-__forceinline void _PHMSBH_HI(int dd, int n, int rdd)
+static __fi void _PHMSBH_HI(int dd, int n, int rdd)
 {
 	s32 firsttemp = (s32)cpuRegs.GPR.r[_Rs_].SS[n+1] * (s32)cpuRegs.GPR.r[_Rt_].SS[n+1];
 	s32 temp = firsttemp - (s32)cpuRegs.GPR.r[_Rs_].SS[n] * (s32)cpuRegs.GPR.r[_Rt_].SS[n];
@@ -1415,7 +1415,7 @@ void PMULTH() { // JayteeMaster: changed a bit to avoid screw up
 	}
 }
 
-__forceinline void _PDIVBW(int n)
+__fi void _PDIVBW(int n)
 {
 	if (cpuRegs.GPR.r[_Rs_].UL[n] == 0x80000000 && cpuRegs.GPR.r[_Rt_].US[0] == 0xffff)
 	{
@@ -1466,7 +1466,7 @@ void PROT3W() {
 
 //*************************MMI3 OPCODES************************
 
-__forceinline void _PMADDUW(int dd, int ss)
+static __fi void _PMADDUW(int dd, int ss)
 {
 	u64 tempu = (u64)((u64)cpuRegs.LO.UL[ss] | ((u64)cpuRegs.HI.UL[ss] << 32)) + \
 		((u64)cpuRegs.GPR.r[_Rs_].UL[ss] * (u64)cpuRegs.GPR.r[_Rt_].UL[ss]);
@@ -1517,7 +1517,7 @@ void PINTEH() {
 	cpuRegs.GPR.r[_Rd_].US[7] = Rs.US[6];
 }
 
-__forceinline void _PMULTUW(int dd, int ss)
+__fi void _PMULTUW(int dd, int ss)
 {
 	u64 tempu = (u64)cpuRegs.GPR.r[_Rs_].UL[ss] * (u64)cpuRegs.GPR.r[_Rt_].UL[ss];
 
@@ -1533,7 +1533,7 @@ void PMULTUW() {
 	_PMULTUW(1, 2);
 }
 
-__forceinline void _PDIVUW(int dd, int ss)
+__fi void _PDIVUW(int dd, int ss)
 {
 	if (cpuRegs.GPR.r[_Rt_].UL[ss] != 0) {
 		cpuRegs.LO.SD[dd] = (s32)(cpuRegs.GPR.r[_Rs_].UL[ss] / cpuRegs.GPR.r[_Rt_].UL[ss]);
@@ -737,7 +737,7 @@ void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size )
 	PrepDataPacket( (MTGS_RingCommand)pathidx, size );
 }
 
-__forceinline void SysMtgsThread::_FinishSimplePacket()
+__fi void SysMtgsThread::_FinishSimplePacket()
 {
 	uint future_writepos = (m_WritePos+1) & RingBufferMask;
 	pxAssert( future_writepos != volatize(m_ReadPos) );
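
_FinishSimplePacket above advances a power-of-two ring buffer: `(pos+1) & mask` wraps without a branch, and write==read is reserved to mean "empty", which is why the assert rejects a write that would land on the reader. A minimal sketch of the same invariant (simplified names; single-producer/single-consumer details omitted):

    #include <cassert>
    #include <cstddef>

    // Power-of-two ring buffer indexing, as in _FinishSimplePacket above.
    // Capacity must be a power of two so that "& Mask" performs the wrap.
    struct RingIndex {
    	static const size_t Size = 1024;   // must be a power of two
    	static const size_t Mask = Size - 1;
    	size_t read  = 0;
    	size_t write = 0;

    	bool empty() const { return read == write; }

    	void advanceWrite() {
    		size_t next = (write + 1) & Mask;  // wrap without a branch
    		assert(next != read);              // full: writing would overtake the reader
    		write = next;
    	}
    };
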
@@ -469,7 +469,7 @@ static void __fastcall _ext_memWrite128(u32 mem, const mem128_t *value)
 typedef void __fastcall ClearFunc_t( u32 addr, u32 qwc );
 
 template<int vunum>
-static __forceinline void ClearVuFunc( u32 addr, u32 size )
+static __fi void ClearVuFunc( u32 addr, u32 size )
 {
 	if( vunum==0 )
 		CpuVU0->Clear(addr,size);
@@ -928,7 +928,7 @@ void mmap_MarkCountedRamPage( u32 paddr )
 // offset - offset of address relative to psM.
 // All recompiled blocks belonging to the page are cleared, and any new blocks recompiled
 // from code residing in this page will use manual protection.
-static __forceinline void mmap_ClearCpuBlock( uint offset )
+static __fi void mmap_ClearCpuBlock( uint offset )
 {
 	int rampage = offset >> 12;
 
@@ -246,7 +246,7 @@ const wxChar *const tbl_GamefixNames[] =
 	L"OPHFlag"
 };
 
-const __forceinline wxChar* EnumToString( GamefixId id )
+const __fi wxChar* EnumToString( GamefixId id )
 {
 	return tbl_GamefixNames[id];
 }
@@ -108,7 +108,7 @@ void __fastcall psxException(u32 code, u32 bd)
 	}*/
 }
 
-__forceinline void psxSetNextBranch( u32 startCycle, s32 delta )
+__fi void psxSetNextBranch( u32 startCycle, s32 delta )
 {
 	// typecast the conditional to signed so that things don't blow up
 	// if startCycle is greater than our next branch cycle.
@@ -117,12 +117,12 @@ __forceinline void psxSetNextBranch( u32 startCycle, s32 delta )
 		g_psxNextBranchCycle = startCycle + delta;
 }
 
-__forceinline void psxSetNextBranchDelta( s32 delta )
+__fi void psxSetNextBranchDelta( s32 delta )
 {
 	psxSetNextBranch( psxRegs.cycle, delta );
 }
 
-__forceinline int psxTestCycle( u32 startCycle, s32 delta )
+__fi int psxTestCycle( u32 startCycle, s32 delta )
 {
 	// typecast the conditional to signed so that things don't explode
 	// if the startCycle is ahead of our current cpu cycle.
@@ -130,7 +130,7 @@ __forceinline int psxTestCycle( u32 startCycle, s32 delta )
 	return (int)(psxRegs.cycle - startCycle) >= delta;
 }
 
-__forceinline void PSX_INT( IopEventId n, s32 ecycle )
+__fi void PSX_INT( IopEventId n, s32 ecycle )
 {
 	// Generally speaking games shouldn't throw ints that haven't been cleared yet.
 	// It's usually indicative of something amiss in our emulation, so uncomment this
@@ -161,7 +161,7 @@ __forceinline void PSX_INT( IopEventId n, s32 ecycle )
 	}
 }
 
-static __forceinline void IopTestEvent( IopEventId n, void (*callback)() )
+static __fi void IopTestEvent( IopEventId n, void (*callback)() )
 {
 	if( !(psxRegs.interrupt & (1 << n)) ) return;
 
@@ -174,7 +174,7 @@ static __forceinline void IopTestEvent( IopEventId n, void (*callback)() )
 		psxSetNextBranch( psxRegs.sCycle[n], psxRegs.eCycle[n] );
 }
 
-static __forceinline void sifHackInterrupt()
+static __fi void sifHackInterrupt()
 {
 	// No reason -- just that sometimes the SIF fell asleep, and this wakes it up.
 
@@ -186,7 +186,7 @@ static __forceinline void sifHackInterrupt()
 	//PSX_INT( IopEvt_SIFhack, 128 );
 }
 
-static __forceinline void _psxTestInterrupts()
+static __fi void _psxTestInterrupts()
 {
 	IopTestEvent(IopEvt_SIF0,	sif0Interrupt);	// SIF0
 	IopTestEvent(IopEvt_SIF1,	sif1Interrupt);	// SIF1
@@ -211,7 +211,7 @@ static __forceinline void _psxTestInterrupts()
 	}
 }
 
-__releaseinline void psxBranchTest()
+__ri void psxBranchTest()
 {
 	if( psxTestCycle( psxNextsCounter, psxNextCounter ) )
 	{
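
The signed cast in psxTestCycle (and cpuTestCycle below) matters because cycle counters wrap: `(int)(cycle - startCycle)` stays correct across the u32 rollover as long as the measured window fits in 31 bits. A small standalone demonstration with hypothetical values:

    #include <cstdint>
    #include <cstdio>

    // Wrap-safe "have 'delta' cycles elapsed since 'startCycle'?" test,
    // in the style of psxTestCycle/cpuTestCycle above.
    static bool cyclesElapsed(uint32_t now, uint32_t startCycle, int32_t delta)
    {
    	// Unsigned subtraction wraps modulo 2^32; the signed cast then reads
    	// the result as a distance, so a counter rollover between the two
    	// samples does not produce a bogus huge value.
    	return (int32_t)(now - startCycle) >= delta;
    }

    int main()
    {
    	uint32_t start = 0xFFFFFFF0u;   // counter about to wrap
    	uint32_t now   = 0x00000010u;   // 0x20 cycles later, after wrapping
    	printf("%d\n", cyclesElapsed(now, start, 0x20));  // prints 1
    	printf("%d\n", cyclesElapsed(now, start, 0x21));  // prints 0
    }
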
@@ -125,7 +125,7 @@ void psxJALR()
 ///////////////////////////////////////////
 // These macros are used to assemble the repassembler functions
 
-static __forceinline void execI()
+static __fi void execI()
 {
 	psxRegs.code = iopMemRead32(psxRegs.pc);
 
@@ -90,7 +90,7 @@ void cpuReset()
 	LastELF = L"";
 }
 
-__releaseinline void cpuException(u32 code, u32 bd)
+__ri void cpuException(u32 code, u32 bd)
 {
 	bool errLevel2, checkStatus;
 	u32 offset;
@@ -201,7 +201,7 @@ void cpuTlbMissW(u32 addr, u32 bd) {
 	cpuTlbMiss(addr, bd, EXC_CODE_TLBS);
 }
 
-__forceinline void _cpuTestMissingINTC() {
+__fi void _cpuTestMissingINTC() {
 	if (cpuRegs.CP0.n.Status.val & 0x400 &&
 		psHu32(INTC_STAT) & psHu32(INTC_MASK)) {
 		if ((cpuRegs.interrupt & (1 << 30)) == 0) {
@@ -210,7 +210,7 @@ __forceinline void _cpuTestMissingINTC() {
 	}
 }
 
-__forceinline void _cpuTestMissingDMAC() {
+__fi void _cpuTestMissingDMAC() {
 	if (cpuRegs.CP0.n.Status.val & 0x800 &&
 		(psHu16(0xe012) & psHu16(0xe010) ||
 		 psHu16(0xe010) & 0x8000)) {
@@ -229,7 +229,7 @@ void cpuTestMissingHwInts() {
 }
 
 // sets a branch test to occur some time from an arbitrary starting point.
-__forceinline void cpuSetNextBranch( u32 startCycle, s32 delta )
+__fi void cpuSetNextBranch( u32 startCycle, s32 delta )
 {
 	// typecast the conditional to signed so that things don't blow up
 	// if startCycle is greater than our next branch cycle.
@@ -241,14 +241,14 @@ __forceinline void cpuSetNextBranch( u32 startCycle, s32 delta )
 }
 
 // sets a branch to occur some time from the current cycle
-__forceinline void cpuSetNextBranchDelta( s32 delta )
+__fi void cpuSetNextBranchDelta( s32 delta )
 {
 	cpuSetNextBranch( cpuRegs.cycle, delta );
 }
 
 // tests the cpu cycle agaisnt the given start and delta values.
 // Returns true if the delta time has passed.
-__forceinline int cpuTestCycle( u32 startCycle, s32 delta )
+__fi int cpuTestCycle( u32 startCycle, s32 delta )
 {
 	// typecast the conditional to signed so that things don't explode
 	// if the startCycle is ahead of our current cpu cycle.
@@ -257,18 +257,18 @@ __forceinline int cpuTestCycle( u32 startCycle, s32 delta )
 }
 
 // tells the EE to run the branch test the next time it gets a chance.
-__forceinline void cpuSetBranch()
+__fi void cpuSetBranch()
 {
 	g_nextBranchCycle = cpuRegs.cycle;
 }
 
-__forceinline void cpuClearInt( uint i )
+__fi void cpuClearInt( uint i )
 {
 	jASSUME( i < 32 );
 	cpuRegs.interrupt &= ~(1 << i);
 }
 
-static __forceinline void TESTINT( u8 n, void (*callback)() )
+static __fi void TESTINT( u8 n, void (*callback)() )
 {
 	if( !(cpuRegs.interrupt & (1 << n)) ) return;
 
@@ -281,7 +281,7 @@ static __forceinline void TESTINT( u8 n, void (*callback)() )
 		cpuSetNextBranch( cpuRegs.sCycle[n], cpuRegs.eCycle[n] );
 }
 
-static __forceinline void _cpuTestInterrupts()
+static __fi void _cpuTestInterrupts()
 {
 	if (!dmacRegs->ctrl.DMAE || psHu8(DMAC_ENABLER+2) == 1)
 	{
@@ -315,7 +315,7 @@ static __forceinline void _cpuTestInterrupts()
 	}
 }
 
-static __forceinline void _cpuTestTIMR()
+static __fi void _cpuTestTIMR()
 {
 	cpuRegs.CP0.n.Count += cpuRegs.cycle-s_iLastCOP0Cycle;
 	s_iLastCOP0Cycle = cpuRegs.cycle;
@@ -333,7 +333,7 @@ static __forceinline void _cpuTestTIMR()
 	}
 }
 
-static __forceinline void _cpuTestPERF()
+static __fi void _cpuTestPERF()
 {
 	// Perfs are updated when read by games (COP0's MFC0/MTC0 instructions), so we need
 	// only update them at semi-regular intervals to keep cpuRegs.cycle from wrapping
@@ -361,7 +361,7 @@ u32 g_nextBranchCycle = 0;
 
 // Shared portion of the branch test, called from both the Interpreter
 // and the recompiler.  (moved here to help alleviate redundant code)
-__forceinline void _cpuBranchTest_Shared()
+__fi void _cpuBranchTest_Shared()
 {
 	ScopedBool etest(eeEventTestIsActive);
 	g_nextBranchCycle = cpuRegs.cycle + eeWaitCycles;
@@ -481,7 +481,7 @@ __forceinline void _cpuBranchTest_Shared()
 		if( cpuIntsEnabled(0x800) ) TESTINT(31, dmacInterrupt);
 }
 
-__releaseinline void cpuTestINTCInts()
+__ri void cpuTestINTCInts()
 {
 	// Check the internal Event System -- if one's already scheduled then don't bother:
 	if( cpuRegs.interrupt & (1 << 30) ) return;
@@ -507,7 +507,7 @@ __releaseinline void cpuTestINTCInts()
 	}
 }
 
-__forceinline void cpuTestDMACInts()
+__fi void cpuTestDMACInts()
 {
 	// Check the internal Event System -- if one's already scheduled then don't bother:
 	if ( cpuRegs.interrupt & (1 << 31) ) return;
@@ -534,20 +534,20 @@ __forceinline void cpuTestDMACInts()
 	}
 }
 
-__forceinline void cpuTestTIMRInts() {
+__fi void cpuTestTIMRInts() {
 	if ((cpuRegs.CP0.n.Status.val & 0x10007) == 0x10001) {
 		_cpuTestPERF();
 		_cpuTestTIMR();
 	}
 }
 
-__forceinline void cpuTestHwInts() {
+__fi void cpuTestHwInts() {
 	cpuTestINTCInts();
 	cpuTestDMACInts();
 	cpuTestTIMRInts();
 }
 
-__forceinline void CPU_INT( EE_EventType n, s32 ecycle)
+__fi void CPU_INT( EE_EventType n, s32 ecycle)
 {
 	if( n != 2 && cpuRegs.interrupt & (1<<n) ){ //2 is Gif, and every path 3 masking game triggers this :/
 		DevCon.Warning( "***** EE > Twice-thrown int on IRQ %d", n );
@@ -24,7 +24,7 @@
 #include "R5900Exceptions.h"
 
 
-static __forceinline s64 _add64_Overflow( s64 x, s64 y )
+static __fi s64 _add64_Overflow( s64 x, s64 y )
 {
 	const s64 result = x + y;
 
@@ -43,7 +43,7 @@ static __forceinline s64 _add64_Overflow( s64 x, s64 y )
 	return result;
 }
 
-static __forceinline s64 _add32_Overflow( s32 x, s32 y )
+static __fi s64 _add32_Overflow( s32 x, s32 y )
 {
 	GPR_reg64 result;  result.SD[0] = (s64)x + y;
 
@@ -84,7 +84,7 @@ int _SPR0chain()
 	return (spr0->qwc); // bus is 1/2 the ee speed
 }
 
-__forceinline void SPR0chain()
+__fi void SPR0chain()
 {
 	CPU_INT(DMAC_FROM_SPR, _SPR0chain() / BIAS);
 	spr0->qwc = 0;
@@ -132,7 +132,7 @@ void _SPR0interleave()
 	spr0->qwc = 0;
 }
 
-static __forceinline void _dmaSPR0()
+static __fi void _dmaSPR0()
 {
 	if (dmacRegs->ctrl.STS == STS_fromSPR)
 	{
@@ -273,7 +273,7 @@ void dmaSPR0()   // fromSPR
 	SPRFROMinterrupt();
 }
 
-__forceinline static void SPR1transfer(const void* data, int qwc)
+__fi static void SPR1transfer(const void* data, int qwc)
 {
 	memcpy_qwc(&psSu128(spr1->sadr), data, qwc);
 	spr1->sadr += qwc * 16;
@@ -294,7 +294,7 @@ int _SPR1chain()
 	return (spr1->qwc);
 }
 
-__forceinline void SPR1chain()
+__fi void SPR1chain()
 {
 	CPU_INT(DMAC_TO_SPR, _SPR1chain() / BIAS);
 	spr1->qwc = 0;
@@ -26,7 +26,7 @@ void sifInit()
 	memzero(sif1);
 }
 
-__forceinline void dmaSIF2()
+__fi void dmaSIF2()
 {
 	SIF_LOG(wxString(L"dmaSIF2" + sif2dma->cmq_to_str()).To8BitData());
 
@@ -24,7 +24,7 @@ _sif sif0;
 
 static bool done = false;
 
-static __forceinline void Sif0Init()
+static __fi void Sif0Init()
 {
 	SIF_LOG("SIF0 DMA start...");
 	done = false;
@@ -33,7 +33,7 @@ static __forceinline void Sif0Init()
 }
 
 // Write from Fifo to EE.
-static __forceinline bool WriteFifoToEE()
+static __fi bool WriteFifoToEE()
 {
 	const int readSize = min((s32)sif0dma->qwc, sif0.fifo.size >> 2);
 
@@ -62,7 +62,7 @@ static __forceinline bool WriteFifoToEE()
 }
 
 // Write IOP to Fifo.
-static __forceinline bool WriteIOPtoFifo()
+static __fi bool WriteIOPtoFifo()
 {
 	// There's some data ready to transfer into the fifo..
 	const int writeSize = min(sif0.iop.counter, sif0.fifo.free());
@@ -80,7 +80,7 @@ static __forceinline bool WriteIOPtoFifo()
 }
 
 // Read Fifo into an ee tag, transfer it to sif0dma, and process it.
-static __forceinline bool ProcessEETag()
+static __fi bool ProcessEETag()
 {
 	static __aligned16 u32 tag[4];
 
@@ -121,7 +121,7 @@ static __forceinline bool ProcessEETag()
 }
 
 // Read Fifo into an iop tag, and transfer it to hw_dma(9). And presumably process it.
-static __forceinline bool ProcessIOPTag()
+static __fi bool ProcessIOPTag()
 {
 	// Process DMA tag at hw_dma(9).tadr
 	sif0.iop.data = *(sifData *)iopPhysMem(hw_dma(9).tadr);
@@ -141,7 +141,7 @@ static __forceinline bool ProcessIOPTag()
 }
 
 // Stop transferring ee, and signal an interrupt.
-static __forceinline void EndEE()
+static __fi void EndEE()
 {
 	SIF_LOG("Sif0: End EE");
 	sif0.ee.end = false;
@@ -156,7 +156,7 @@ static __forceinline void EndEE()
 }
 
 // Stop transferring iop, and signal an interrupt.
-static __forceinline void EndIOP()
+static __fi void EndIOP()
 {
 	SIF_LOG("Sif0: End IOP");
 	sif0data = 0;
@@ -175,7 +175,7 @@ static __forceinline void EndIOP()
 }
 
 // Handle the EE transfer.
-static __forceinline void HandleEETransfer()
+static __fi void HandleEETransfer()
 {
 	if(sif0dma->chcr.STR == false)
 	{
@@ -253,7 +253,7 @@ static __forceinline void HandleEETransfer()
 	// SIF - 8 = 0 (pos=12)
 	// SIF0 DMA end...
 
-static __forceinline void HandleIOPTransfer()
+static __fi void HandleIOPTransfer()
 {
 	if (sif0.iop.counter <= 0) // If there's no more to transfer
 	{
@@ -280,13 +280,13 @@ static __forceinline void HandleIOPTransfer()
 	}
 }
 
-static __forceinline void Sif0End()
+static __fi void Sif0End()
 {
 	SIF_LOG("SIF0 DMA end...");
 }
 
 // Transfer IOP to EE, putting data in the fifo as an intermediate step.
-__forceinline void SIF0Dma()
+__fi void SIF0Dma()
 {
 	int BusyCheck = 0;
 	Sif0Init();
@@ -317,19 +317,19 @@ __forceinline void SIF0Dma()
 	Sif0End();
 }
 
-__forceinline void sif0Interrupt()
+__fi void sif0Interrupt()
 {
 	HW_DMA9_CHCR &= ~0x01000000;
 	psxDmaInterrupt2(2);
 }
 
-__forceinline void EEsif0Interrupt()
+__fi void EEsif0Interrupt()
 {
 	hwDmacIrq(DMAC_SIF0);
 	sif0dma->chcr.STR = false;
 }
 
-__forceinline void dmaSIF0()
+__fi void dmaSIF0()
 {
 	SIF_LOG(wxString(L"dmaSIF0" + sif0dma->cmqt_to_str()).To8BitData());
 
@@ -24,7 +24,7 @@ _sif sif1;
 
 static bool done = false;
 
-static __forceinline void Sif1Init()
+static __fi void Sif1Init()
 {
 	SIF_LOG("SIF1 DMA start...");
 	done = false;
@@ -33,7 +33,7 @@ static __forceinline void Sif1Init()
 }
 
 // Write from the EE to Fifo.
-static __forceinline bool WriteEEtoFifo()
+static __fi bool WriteEEtoFifo()
 {
 	// There's some data ready to transfer into the fifo..
 
@@ -59,7 +59,7 @@ static __forceinline bool WriteEEtoFifo()
 }
 
 // Read from the fifo and write to IOP
-static __forceinline bool WriteFifoToIOP()
+static __fi bool WriteFifoToIOP()
 {
 	// If we're reading something, continue to do so.
 
@@ -78,7 +78,7 @@ static __forceinline bool WriteFifoToIOP()
 }
 
 // Get a tag and process it.
-static __forceinline bool ProcessEETag()
+static __fi bool ProcessEETag()
 {
 	// Chain mode
 	tDMA_TAG *ptag;
@@ -142,7 +142,7 @@ static __forceinline bool ProcessEETag()
 }
 
 // Write fifo to data, and put it in IOP.
-static __forceinline bool SIFIOPReadTag()
+static __fi bool SIFIOPReadTag()
 {
 	// Read a tag.
 	sif1.fifo.read((u32*)&sif1.iop.data, 4);
@@ -160,7 +160,7 @@ static __forceinline bool SIFIOPReadTag()
 }
 
 // Stop processing EE, and signal an interrupt.
-static __forceinline void EndEE()
+static __fi void EndEE()
 {
 	sif1.ee.end = false;
 	sif1.ee.busy = false;
@@ -180,7 +180,7 @@ static __forceinline void EndEE()
 }
 
 // Stop processing IOP, and signal an interrupt.
-static __forceinline void EndIOP()
+static __fi void EndIOP()
 {
 	sif1data = 0;
 	sif1.iop.end = false;
@@ -201,7 +201,7 @@ static __forceinline void EndIOP()
 }
 
 // Handle the EE transfer.
-static __forceinline void HandleEETransfer()
+static __fi void HandleEETransfer()
 {
 	if(sif1dma->chcr.STR == false)
 	{
@@ -248,7 +248,7 @@ static __forceinline void HandleEETransfer()
 }
 
 // Handle the IOP transfer.
-static __forceinline void HandleIOPTransfer()
+static __fi void HandleIOPTransfer()
 {
 	if (sif1.iop.counter > 0)
 	{
@@ -274,13 +274,13 @@ static __forceinline void HandleIOPTransfer()
 	}
 }
 
-static __forceinline void Sif1End()
+static __fi void Sif1End()
 {
 	SIF_LOG("SIF1 DMA end...");
 }
 
// Transfer EE to IOP, putting data in the fifo as an intermediate step.
-__forceinline void SIF1Dma()
+__fi void SIF1Dma()
 {
 	int BusyCheck = 0;
 	Sif1Init();
@@ -313,13 +313,13 @@ __forceinline void SIF1Dma()
 	Sif1End();
 }
 
-__forceinline void sif1Interrupt()
+__fi void sif1Interrupt()
 {
 	HW_DMA10_CHCR &= ~0x01000000; //reset TR flag
 	psxDmaInterrupt2(3);
 }
 
-__forceinline void EEsif1Interrupt()
+__fi void EEsif1Interrupt()
 {
 	hwDmacIrq(DMAC_SIF1);
 	sif1dma->chcr.STR = false;
@@ -327,7 +327,7 @@ __forceinline void EEsif1Interrupt()
 
 // Do almost exactly the same thing as psxDma10 in IopDma.cpp.
 // Main difference is this checks for iop, where psxDma10 checks for ee.
-__forceinline void dmaSIF1()
+__fi void dmaSIF1()
 {
 	SIF_LOG(wxString(L"dmaSIF1" + sif1dma->cmqt_to_str()).To8BitData());
 
@@ -36,14 +36,14 @@ static int m_ForceEjectionTimeout[2];
 
 #ifdef SIO_INLINE_IRQS
 #define SIO_INT() sioInterrupt()
-#define SIO_FORCEINLINE __forceinline
+#define SIO_FORCEINLINE __fi
 #else
-__forceinline void SIO_INT()
+__fi void SIO_INT()
 {
 	if( !(psxRegs.interrupt & (1<<IopEvt_SIO)) )
 		PSX_INT(IopEvt_SIO, 64 ); // PSXCLK/250000);
 }
-#define SIO_FORCEINLINE __forceinline
+#define SIO_FORCEINLINE __fi
 #endif
 
 // Currently only check if pad wants mtap to be active.
@@ -60,7 +60,7 @@ void COP2_Unknown()
 
 //****************************************************************************
 
-__forceinline void _vu0run(bool breakOnMbit, bool addCycles) {
+__fi void _vu0run(bool breakOnMbit, bool addCycles) {
 
 	if (!(VU0.VI[REG_VPU_STAT].UL & 1)) return;
 
@@ -179,13 +179,13 @@ void CTC2() {
 //---------------------------------------------------------------------------------------
 
 
-__forceinline void SYNCMSFLAGS()
+__fi void SYNCMSFLAGS()
 {
 	VU0.VI[REG_STATUS_FLAG].UL = VU0.statusflag;
 	VU0.VI[REG_MAC_FLAG].UL = VU0.macflag;
 }
 
-__forceinline void SYNCFDIV()
+__fi void SYNCFDIV()
 {
 	VU0.VI[REG_Q].UL = VU0.q.UL;
 	VU0.VI[REG_STATUS_FLAG].UL = VU0.statusflag;
@@ -41,7 +41,7 @@ void vuUpdateDI(VURegs * VU) {
 //	VU->statusflag|= (Flag_D | (VU0.VI[REG_STATUS_FLAG].US[0] >> 5)) << 11;
 }
 
-static __releaseinline u32 VU_MAC_UPDATE( int shift, VURegs * VU, float f )
+static __ri u32 VU_MAC_UPDATE( int shift, VURegs * VU, float f )
 {
 	u32 v = *(u32*)&f;
 	int exp = (v >> 23) & 0xff;
@@ -72,47 +72,47 @@ static __releaseinline u32 VU_MAC_UPDATE( int shift, VURegs * VU, float f )
 	}
 }
 
-__forceinline u32 VU_MACx_UPDATE(VURegs * VU, float x)
+__fi u32 VU_MACx_UPDATE(VURegs * VU, float x)
 {
 	return VU_MAC_UPDATE(3, VU, x);
 }
 
-__forceinline u32 VU_MACy_UPDATE(VURegs * VU, float y)
+__fi u32 VU_MACy_UPDATE(VURegs * VU, float y)
 {
 	return VU_MAC_UPDATE(2, VU, y);
 }
 
-__forceinline u32 VU_MACz_UPDATE(VURegs * VU, float z)
+__fi u32 VU_MACz_UPDATE(VURegs * VU, float z)
 {
 	return VU_MAC_UPDATE(1, VU, z);
 }
 
-__forceinline u32 VU_MACw_UPDATE(VURegs * VU, float w)
+__fi u32 VU_MACw_UPDATE(VURegs * VU, float w)
 {
 	return VU_MAC_UPDATE(0, VU, w);
 }
 
-__forceinline void VU_MACx_CLEAR(VURegs * VU)
+__fi void VU_MACx_CLEAR(VURegs * VU)
 {
 	VU->macflag&= ~(0x1111<<3);
 }
 
-__forceinline void VU_MACy_CLEAR(VURegs * VU)
+__fi void VU_MACy_CLEAR(VURegs * VU)
 {
 	VU->macflag&= ~(0x1111<<2);
 }
 
-__forceinline void VU_MACz_CLEAR(VURegs * VU)
+__fi void VU_MACz_CLEAR(VURegs * VU)
 {
 	VU->macflag&= ~(0x1111<<1);
 }
 
-__forceinline void VU_MACw_CLEAR(VURegs * VU)
+__fi void VU_MACw_CLEAR(VURegs * VU)
 {
 	VU->macflag&= ~(0x1111<<0);
 }
 
-__releaseinline void VU_STAT_UPDATE(VURegs * VU) {
+__ri void VU_STAT_UPDATE(VURegs * VU) {
 	int newflag = 0 ;
 	if (VU->macflag & 0x000F) newflag = 0x1;
 	if (VU->macflag & 0x00F0) newflag |= 0x2;
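
The VU_MAC*_CLEAR masks above encode the MAC flag layout: four 4-bit groups stacked from the LSB up, one bit per vector field inside each group, with w/z/y/x at bit offsets 0-3, so `0x1111 << n` selects one field across all four groups. A sketch of that layout as implied by the masks and by how VU_STAT_UPDATE begins (illustrative helpers, assumed group names; not part of the commit):

    #include <cstdint>

    // MAC flag layout implied by VU_MACx_CLEAR..VU_MACw_CLEAR above:
    // bits  0-3 : one flag group for fields w,z,y,x
    // bits  4-7 : next group, same field order
    // bits  8-11, 12-15: remaining two groups
    enum MacField { FIELD_W = 0, FIELD_Z = 1, FIELD_Y = 2, FIELD_X = 3 };

    // Clear every group's bit for one vector field, as VU_MAC*_CLEAR does.
    static void macClearField(uint32_t& macflag, MacField f)
    {
    	macflag &= ~(0x1111u << f);
    }

    // Summarize groups into status bits the same way VU_STAT_UPDATE begins:
    // one status bit per group, set if any field in that group is flagged.
    static uint32_t macStatusBits(uint32_t macflag)
    {
    	uint32_t stat = 0;
    	if (macflag & 0x000F) stat |= 0x1;
    	if (macflag & 0x00F0) stat |= 0x2;
    	return stat;
    }
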
@@ -45,7 +45,7 @@
 
 static __aligned16 VECTOR RDzero;
 
-static __releaseinline void __fastcall _vuFMACflush(VURegs * VU) {
+static __ri void _vuFMACflush(VURegs * VU) {
 	int i;
 
 	for (i=0; i<8; i++) {
@@ -62,7 +62,7 @@ static __releaseinline void __fastcall _vuFMACflush(VURegs * VU) {
 	}
 }
 
-static __releaseinline void __fastcall _vuFDIVflush(VURegs * VU) {
+static __ri void _vuFDIVflush(VURegs * VU) {
 	if (VU->fdiv.enable == 0) return;
 
 	if ((VU->cycle - VU->fdiv.sCycle) >= VU->fdiv.Cycle) {
@@ -74,7 +74,7 @@ static __releaseinline void __fastcall _vuFDIVflush(VURegs * VU) {
 	}
 }
 
-static __releaseinline void __fastcall _vuEFUflush(VURegs * VU) {
+static __ri void _vuEFUflush(VURegs * VU) {
 	if (VU->efu.enable == 0) return;
 
 	if ((VU->cycle - VU->efu.sCycle) >= VU->efu.Cycle) {
@@ -140,7 +140,7 @@ void _vuFlushAll(VURegs* VU)
 	} while(nRepeat);
 }
 
-__forceinline void _vuTestPipes(VURegs * VU) {
+__fi void _vuTestPipes(VURegs * VU) {
 	_vuFMACflush(VU);
 	_vuFDIVflush(VU);
 	_vuEFUflush(VU);
@@ -169,7 +169,7 @@ static void __fastcall _vuFMACTestStall(VURegs * VU, int reg, int xyzw) {
 	_vuTestPipes(VU);
 }
 
-static __releaseinline void __fastcall _vuFMACAdd(VURegs * VU, int reg, int xyzw) {
+static __ri void __fastcall _vuFMACAdd(VURegs * VU, int reg, int xyzw) {
 	int i;
 
 	/* find a free fmac pipe */
@@ -192,7 +192,7 @@ static __releaseinline void __fastcall _vuFMACAdd(VURegs * VU, int reg, int xyzw
 	VU->fmac[i].clipflag = VU->clipflag;
 }
 
-static __releaseinline void __fastcall _vuFDIVAdd(VURegs * VU, int cycles) {
+static __ri void __fastcall _vuFDIVAdd(VURegs * VU, int cycles) {
 	VUM_LOG("adding FDIV pipe");
 
 	VU->fdiv.enable = 1;
@@ -202,7 +202,7 @@ static __releaseinline void __fastcall _vuFDIVAdd(VURegs * VU, int cycles) {
 	VU->fdiv.statusflag = VU->statusflag;
 }
 
-static __releaseinline void __fastcall _vuEFUAdd(VURegs * VU, int cycles) {
+static __ri void __fastcall _vuEFUAdd(VURegs * VU, int cycles) {
 //	VUM_LOG("adding EFU pipe\n");
 
 	VU->efu.enable = 1;
@@ -211,7 +211,7 @@ static __releaseinline void __fastcall _vuEFUAdd(VURegs * VU, int cycles) {
 	VU->efu.reg.F = VU->p.F;
 }
 
-static __releaseinline void __fastcall _vuFlushFDIV(VURegs * VU) {
+static __ri void __fastcall _vuFlushFDIV(VURegs * VU) {
 	int cycle;
 
 	if (VU->fdiv.enable == 0) return;
@@ -225,7 +225,7 @@ static __releaseinline void __fastcall _vuFlushFDIV(VURegs * VU) {
 	VU->VI[REG_STATUS_FLAG].UL = VU->fdiv.statusflag;
 }
 
-static __releaseinline void __fastcall _vuFlushEFU(VURegs * VU) {
+static __ri void __fastcall _vuFlushEFU(VURegs * VU) {
 	int cycle;
 
 	if (VU->efu.enable == 0) return;
@@ -238,7 +238,7 @@ static __releaseinline void __fastcall _vuFlushEFU(VURegs * VU) {
 	VU->VI[REG_P].UL = VU->efu.reg.UL;
 }
 
-__forceinline void _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
+static __fi void _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	if (VUregsn->VFread0) {
 		_vuFMACTestStall(VU, VUregsn->VFread0, VUregsn->VFr0xyzw);
 	}
@@ -247,7 +247,7 @@ __forceinline void _vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	}
 }
 
-__forceinline void _vuAddFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
+static __fi void _vuAddFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	if (VUregsn->VFwrite) {
 		_vuFMACAdd(VU, VUregsn->VFwrite, VUregsn->VFwxyzw);
 	} else
@@ -258,36 +258,36 @@ __forceinline void _vuAddFMACStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	}
 }
 
-__forceinline void _vuTestFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
+static __fi void _vuTestFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
//	_vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn);
 	_vuFlushFDIV(VU);
 }
 
-__forceinline void _vuAddFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
+static __fi void _vuAddFDIVStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	if (VUregsn->VIwrite & (1 << REG_Q)) {
 		_vuFDIVAdd(VU, VUregsn->cycles);
 	}
 }
 
 
-__forceinline void _vuTestEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
+static __fi void _vuTestEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
//	_vuTestFMACStalls(VURegs * VU, _VURegsNum *VUregsn);
 	_vuFlushEFU(VU);
 }
 
-__forceinline void _vuAddEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
+static __fi void _vuAddEFUStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	if (VUregsn->VIwrite & (1 << REG_P)) {
 		_vuEFUAdd(VU, VUregsn->cycles);
 	}
 }
 
-__forceinline void _vuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
+__fi void _vuTestUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	switch (VUregsn->pipe) {
 		case VUPIPE_FMAC: _vuTestFMACStalls(VU, VUregsn); break;
 	}
 }
 
-__forceinline void _vuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
+__fi void _vuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	switch (VUregsn->pipe) {
 		case VUPIPE_FMAC: _vuTestFMACStalls(VU, VUregsn); break;
 		case VUPIPE_FDIV: _vuTestFDIVStalls(VU, VUregsn); break;
@@ -295,13 +295,13 @@ __forceinline void _vuTestLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	}
 }
 
-__forceinline void _vuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
+__fi void _vuAddUpperStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	switch (VUregsn->pipe) {
 		case VUPIPE_FMAC: _vuAddFMACStalls(VU, VUregsn); break;
 	}
 }
 
-__forceinline void _vuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
+__fi void _vuAddLowerStalls(VURegs * VU, _VURegsNum *VUregsn) {
 	switch (VUregsn->pipe) {
 		case VUPIPE_FMAC: _vuAddFMACStalls(VU, VUregsn); break;
 		case VUPIPE_FDIV: _vuAddFDIVStalls(VU, VUregsn); break;
@@ -332,7 +332,7 @@ static float __fastcall vuDouble(u32 f)
 	return *(float*)&f;
 }
 #else
-static __forceinline float vuDouble(u32 f)
+static __fi float vuDouble(u32 f)
 {
 	return *(float*)&f;
 }
@@ -1577,7 +1577,7 @@ void _vuMR32(VURegs * VU) {
 // Load / Store Instructions (VU Interpreter)
 // --------------------------------------------------------------------------------------
 
-__forceinline u32* GET_VU_MEM(VURegs* VU, u32 addr)		// non-static, also used by sVU for now.
+__fi u32* GET_VU_MEM(VURegs* VU, u32 addr)		// non-static, also used by sVU for now.
 {
 	if( VU == g_pVU1 ) return (u32*)(VU1.Mem+(addr&0x3fff));
 	if( addr >= 0x4000 ) return (u32*)(VU0.Mem+(addr&0x43f0)); // get VF and VI regs (they're mapped to 0x4xx0 in VU0 mem!)
@@ -86,7 +86,7 @@ void SaveStateBase::vif1Freeze()
 extern bool _chainVIF0();
 extern bool _VIF0chain();
 
-_f void vif0FBRST(u32 value) {
+__fi void vif0FBRST(u32 value) {
 	VIF_LOG("VIF0_FBRST write32 0x%8.8x", value);
 
 	if (value & 0x1) // Reset Vif.
@@ -147,7 +147,7 @@ _f void vif0FBRST(u32 value) {
 	}
 }
 
-_f void vif1FBRST(u32 value) {
+__fi void vif1FBRST(u32 value) {
 	VIF_LOG("VIF1_FBRST write32 0x%8.8x", value);
 
 	if (FBRST(value).RST) // Reset Vif.
@@ -241,7 +241,7 @@ _f void vif1FBRST(u32 value) {
 	}
 }
 
-_f void vif1STAT(u32 value) {
+__fi void vif1STAT(u32 value) {
 	VIF_LOG("VIF1_STAT write32 0x%8.8x", value);
 
 	/* Only FDR bit is writable, so mask the rest */
@@ -223,8 +223,6 @@ extern VIFregisters *vifRegs;
 #define GetVifX (idx ? (vif1) : (vif0))
 #define vifXch  (idx ? (vif1ch) : (vif0ch))
 #define vifXRegs (idx ? (vif1Regs) : (vif0Regs))
-#define _f __forceinline
-#define _ri __releaseinline
 
 extern void dmaVIF0();
 extern void dmaVIF1();
@@ -21,7 +21,7 @@
 
 // Run VU0 until finish, don't add cycles to EE
 // because its vif stalling not the EE core...
-__forceinline void vif0FLUSH()
+__fi void vif0FLUSH()
 {
 	if(g_packetsizeonvu > vif0.vifpacketsize && g_vu0Cycles > 0)
 	{
@@ -78,7 +78,7 @@ bool _VIF0chain()
 	return VIF0transfer(pMem, vif0ch->qwc * 4);
 }
 
-__forceinline void vif0SetupTransfer()
+__fi void vif0SetupTransfer()
 {
 	tDMA_TAG *ptag;
 
@@ -138,7 +138,7 @@ __forceinline void vif0SetupTransfer()
 	}
 }
 
-__forceinline void vif0Interrupt()
+__fi void vif0Interrupt()
 {
 	VIF_LOG("vif0Interrupt: %8.8x", cpuRegs.cycle);
 
@@ -22,7 +22,7 @@
 #include "newVif.h"
 
 
-__forceinline void vif1FLUSH()
+__fi void vif1FLUSH()
 {
 	if(g_packetsizeonvu > vif1.vifpacketsize && g_vu1Cycles > 0)
 	{
@@ -180,7 +180,7 @@ bool _VIF1chain()
 	return VIF1transfer(pMem, vif1ch->qwc * 4);
 }
 
-__forceinline void vif1SetupTransfer()
+__fi void vif1SetupTransfer()
 {
 	tDMA_TAG *ptag;
 	DMACh& vif1c = (DMACh&)PS2MEM_HW[0x9000];
@@ -340,7 +340,7 @@ bool CheckPath2GIF(EE_EventType channel)
 	}
 	return true;
 }
-__forceinline void vif1Interrupt()
+__fi void vif1Interrupt()
 {
 	VIF_LOG("vif1Interrupt: %8.8x", cpuRegs.cycle);
 
@@ -36,7 +36,7 @@ static u32 qwctag(u32 mask)
 	return (dmacRegs->rbor.ADDR + (mask & dmacRegs->rbsr.RMSK));
 }
 
-static __forceinline bool mfifoVIF1rbTransfer()
+static __fi bool mfifoVIF1rbTransfer()
 {
 	u32 maddr = dmacRegs->rbor.ADDR;
 	u32 msize = dmacRegs->rbor.ADDR + dmacRegs->rbsr.RMSK + 16;
@@ -93,7 +93,7 @@ static __forceinline bool mfifoVIF1rbTransfer()
 	return ret;
 }
 
-static __forceinline bool mfifo_VIF1chain()
+static __fi bool mfifo_VIF1chain()
 {
 	bool ret;
 
@@ -32,12 +32,12 @@ vifOp(vifCode_Null);
 // Vif0/Vif1 Misc Functions
 //------------------------------------------------------------------
 
-static _f void vifFlush(int idx) {
+static __fi void vifFlush(int idx) {
 	if (!idx) vif0FLUSH();
 	else	  vif1FLUSH();
 }
 
-static _f void vuExecMicro(int idx, u32 addr) {
+static __fi void vuExecMicro(int idx, u32 addr) {
 	VURegs* VU = nVif[idx].VU;
 	int startcycles = 0;
 	//vifFlush(idx);
@@ -116,7 +116,7 @@ vifOp(vifCode_Base) {
 
 extern bool SIGNAL_IMR_Pending;
 
-template<int idx> _f int _vifCode_Direct(int pass, const u8* data, bool isDirectHL) {
+template<int idx> __fi int _vifCode_Direct(int pass, const u8* data, bool isDirectHL) {
 	pass1 {
 		vif1Only();
 		int vifImm    = (u16)vif1Regs->code;
@@ -301,7 +301,7 @@ vifOp(vifCode_Mark) {
 	return 0;
 }
 
-static _f void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
+static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
 	VURegs& VUx = idx ? VU1 : VU0;
 	pxAssume(VUx.Micro > 0);
 
@@ -423,7 +423,7 @@ vifOp(vifCode_Offset) {
 	return 0;
 }
 
-template<int idx> static _f int _vifCode_STColRow(const u32* data, u32* pmem1, u32* pmem2) {
+template<int idx> static __fi int _vifCode_STColRow(const u32* data, u32* pmem1, u32* pmem2) {
 	vifStruct& vifX = GetVifX;
 
 	int ret = min(4 - vifX.tag.addr, vifX.vifpacketsize);
@@ -98,7 +98,7 @@ typedef FnType_VifCmdHandler* Fnptr_VifCmdHandler;
 
 extern const __aligned16 Fnptr_VifCmdHandler vifCmdHandler[2][128];
 
-__forceinline static int _limit(int a, int max)
+__fi static int _limit(int a, int max)
 {
 	return ((a > max) ? max : a);
 }
@@ -101,7 +101,7 @@ _vifT void vifTransferLoop(u32* &data) {
 	if (pSize) vifX.vifstalled = true;
 }
 
-_vifT _f bool vifTransfer(u32 *data, int size) {
+_vifT static __fi bool vifTransfer(u32 *data, int size) {
 	vifStruct& vifX = GetVifX;
 
 	// irqoffset necessary to add up the right qws, or else will spin (spiderman)
@@ -25,7 +25,7 @@ enum UnpackOffset {
 	OFFSET_W = 3
 };

-static __forceinline u32 setVifRowRegs(u32 reg, u32 data) {
+static __fi u32 setVifRowRegs(u32 reg, u32 data) {
 	switch (reg) {
 		case 0: vifRegs->r0 = data; break;
 		case 1: vifRegs->r1 = data; break;
@@ -36,7 +36,7 @@ static __forceinline u32 setVifRowRegs(u32 reg, u32 data) {
 	return data;
 }

-static __forceinline u32 getVifRowRegs(u32 reg) {
+static __fi u32 getVifRowRegs(u32 reg) {
 	switch (reg) {
 		case 0: return vifRegs->r0; break;
 		case 1: return vifRegs->r1; break;
@@ -47,7 +47,7 @@ static __forceinline u32 getVifRowRegs(u32 reg) {
 	return 0; // unreachable...
 }

-static __forceinline u32 getVifColRegs(u32 reg) {
+static __fi u32 getVifColRegs(u32 reg) {
 	switch (reg) {
 		case 0: return vifRegs->c0; break;
 		case 1: return vifRegs->c1; break;
@@ -58,7 +58,7 @@ static __forceinline u32 getVifColRegs(u32 reg) {
 }

 template< bool doMask >
-static __releaseinline void writeXYZW(u32 offnum, u32 &dest, u32 data) {
+static __ri void writeXYZW(u32 offnum, u32 &dest, u32 data) {
 	u32 vifRowReg = getVifRowRegs(offnum);
 	int n = 0;

@@ -89,7 +89,7 @@ static __releaseinline void writeXYZW(u32 offnum, u32 &dest, u32 data) {
 }

 template < bool doMask, class T >
-static __forceinline void __fastcall UNPACK_S(u32 *dest, const T *data, int size)
+static __fi void __fastcall UNPACK_S(u32 *dest, const T *data, int size)
 {
 	//S-# will always be a complete packet, no matter what. So we can skip the offset bits
 	writeXYZW<doMask>(OFFSET_X, *dest++, *data);
@@ -99,7 +99,7 @@ static __forceinline void __fastcall UNPACK_S(u32 *dest, const T *data, int size)
 }

 template <bool doMask, class T>
-static __forceinline void __fastcall UNPACK_V2(u32 *dest, const T *data, int size)
+static __ri void __fastcall UNPACK_V2(u32 *dest, const T *data, int size)
 {
 	if (vifRegs->offset == OFFSET_X)
 	{
@@ -135,7 +135,7 @@ static __forceinline void __fastcall UNPACK_V2(u32 *dest, const T *data, int size)
 }

 template <bool doMask, class T>
-static __forceinline void __fastcall UNPACK_V3(u32 *dest, const T *data, int size)
+static __ri void __fastcall UNPACK_V3(u32 *dest, const T *data, int size)
 {
 	if(vifRegs->offset == OFFSET_X)
 	{
@@ -177,7 +177,7 @@ static __forceinline void __fastcall UNPACK_V3(u32 *dest, const T *data, int size)
 }

 template <bool doMask, class T>
-static __forceinline void __fastcall UNPACK_V4(u32 *dest, const T *data , int size)
+static __fi void __fastcall UNPACK_V4(u32 *dest, const T *data , int size)
 {
 	while (size > 0)
 	{
@@ -190,7 +190,7 @@ static __forceinline void __fastcall UNPACK_V4(u32 *dest, const T *data , int size)
 }

 template< bool doMask >
-static __releaseinline void __fastcall UNPACK_V4_5(u32 *dest, const u32 *data, int size)
+static __ri void __fastcall UNPACK_V4_5(u32 *dest, const u32 *data, int size)
 {
 	//As with S-#, this will always be a complete packet
 	writeXYZW<doMask>(OFFSET_X, *dest++, ((*data & 0x001f) << 3));
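The unpack routines above are all templated on `doMask`, so the masked and unmasked variants are separate instantiations and the unmasked fast path carries no runtime test. A simplified stand-in for the dispatch inside `writeXYZW` (the mask source and merge rule here are assumptions, sketched for illustration only):

```cpp
#include <cstdint>

// Compile-time selection: when doMask is false the branch vanishes entirely.
template <bool doMask>
static inline void writeElem(uint32_t& dest, uint32_t data, uint32_t maskBits)
{
	if (doMask)
		dest = (dest & ~maskBits) | (data & maskBits); // masked merge
	else
		dest = data;                                   // plain store
}
```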
@@ -805,12 +805,12 @@ void Pcsx2App::CleanUp()
 	_parent::CleanUp();
 }

-__forceinline wxString AddAppName( const wxChar* fmt )
+__fi wxString AddAppName( const wxChar* fmt )
 {
 	return wxsFormat( fmt, pxGetAppName().c_str() );
 }

-__forceinline wxString AddAppName( const char* fmt )
+__fi wxString AddAppName( const char* fmt )
 {
 	return wxsFormat( fromUTF8(fmt), pxGetAppName().c_str() );
 }

@@ -886,7 +886,7 @@ void Pcsx2App::SysExecute( CDVD_SourceType cdvdsrc, const wxString& elf_override )
 // Thread Safety: The state of the system can change in parallel to execution of the
 // main thread. If you need to perform an extended length activity on the execution
 // state (such as saving it), you *must* suspend the Corethread first!
-__forceinline bool SysHasValidState()
+__fi bool SysHasValidState()
 {
 	return CoreThread.HasActiveMachine();
 }
@@ -113,7 +113,7 @@ struct GifPathStruct
 	const GIFRegHandler Handlers[0x100-0x60]; // handlers for 0x60->0x100
 	GIFPath path[3];

-	__forceinline GIFPath& operator[]( int idx ) { return path[idx]; }
+	__fi GIFPath& operator[]( int idx ) { return path[idx]; }
 };

@@ -249,13 +249,13 @@ GIFPath::GIFPath() : tag()
 	Reset();
 }

-__forceinline void GIFPath::Reset()
+__fi void GIFPath::Reset()
 {
 	memzero(*this);
 	const_cast<GIFTAG&>(tag).EOP = 1;
 }

-__forceinline bool GIFPath::StepReg()
+__fi bool GIFPath::StepReg()
 {
 	if (++curreg >= numregs) {
 		curreg = 0;
@@ -266,13 +266,13 @@ __forceinline bool GIFPath::StepReg()
 	return true;
 }

-__forceinline u8 GIFPath::GetReg() { return regs[curreg]; }
+__fi u8 GIFPath::GetReg() { return regs[curreg]; }

 // Unpack the registers - registers are stored as a sequence of 4 bit values in the
 // upper 64 bits of the GIFTAG. That sucks for us when handling partialized GIF packets
 // coming in from paths 2 and 3, so we unpack them into an 8 bit array here.
 //
-__forceinline void GIFPath::PrepPackedRegs()
+__fi void GIFPath::PrepPackedRegs()
 {
 	// Only unpack registers if we're starting a new pack. Otherwise the unpacked
 	// array should have already been initialized by a previous partial transfer.
@@ -292,7 +292,7 @@ __forceinline void GIFPath::PrepPackedRegs()


 template< bool Aligned >
-__forceinline void GIFPath::SetTag(const void* mem)
+__fi void GIFPath::SetTag(const void* mem)
 {
 	_mm_store_ps( (float*)&tag, Aligned ? _mm_load_ps((const float*)mem) : _mm_loadu_ps((const float*)mem) );

@@ -300,7 +300,7 @@ __forceinline void GIFPath::SetTag(const void* mem)
 	curreg = 0;
 }

-__forceinline bool GIFPath::IsActive() const
+__fi bool GIFPath::IsActive() const
 {
 	return (nloop != 0) || !tag.EOP;
 }
@@ -312,7 +312,7 @@ void SaveStateBase::gifPathFreeze()
 }


-static __forceinline void gsHandler(const u8* pMem)
+static __fi void gsHandler(const u8* pMem)
 {
 	const int reg = pMem[8];

@@ -382,7 +382,7 @@ static __forceinline void gsHandler(const u8* pMem)
 // size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the
 // path does not terminate (EOP) within the specified size, it is assumed that the path must
 // loop around to the start of VU memory and continue processing.
-__forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size)
+__fi int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size)
 {
 	u32 startSize = size; // Start Size

@@ -529,7 +529,7 @@ __forceinline int GIFPath::ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size)
 	return size;
 }

-__releaseinline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
+__ri void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
 {
 	uint endpos = destStart + len;
 	if( endpos < destSize )
@@ -547,7 +547,7 @@ __releaseinline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len )
 	}
 }

-__releaseinline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
+__ri void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
 {
 	uint endpos = srcStart + len;
 	if( endpos < srcSize )
@@ -576,7 +576,7 @@ __releaseinline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len )
 // path does not terminate (EOP) within the specified size, it is assumed that the path must
 // loop around to the start of VU memory and continue processing.
 template< GIF_PATH pathidx, bool Aligned >
-__forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size)
+__fi int GIFPath::CopyTag(const u128* pMem128, u32 size)
 {
 	uint& ringpos = GetMTGS().m_packet_writepos;
 	const uint original_ringpos = ringpos;
@@ -874,7 +874,7 @@ __forceinline int GIFPath::CopyTag(const u128* pMem128, u32 size)
 // size - max size of incoming data stream, in qwc (simd128). If the path is PATH1, and the
 // path does not terminate (EOP) within the specified size, it is assumed that the path must
 // loop around to the start of VU memory and continue processing.
-__forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size)
+__fi int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size)
 {
 	switch( pathidx )
 	{
@@ -900,7 +900,7 @@ __forceinline int GIFPath_CopyTag(GIF_PATH pathidx, const u128* pMem, u32 size)
 // Quick version for queuing PATH1 data.
 // This version calculates the real length of the packet data only. It does not process
 // IRQs or DMA status updates.
-__forceinline int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size)
+__fi int GIFPath_ParseTagQuick(GIF_PATH pathidx, const u8* pMem, u32 size)
 {
 	int retSize = s_gifPath[pathidx].ParseTagQuick(pathidx, pMem, size);
 	return retSize;
@@ -915,7 +915,7 @@ void GIFPath_Reset()

 // This is a hackfix tool provided for "canceling" the contents of the GIFpath when
 // invalid GIFdma states are encountered (typically needed for PATH3 only).
-__forceinline void GIFPath_Clear( GIF_PATH pathidx )
+__fi void GIFPath_Clear( GIF_PATH pathidx )
 {
 	memzero(s_gifPath.path[pathidx]);
 	s_gifPath.path[pathidx].Reset();
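`MemCopy_WrappedDest`/`MemCopy_WrappedSrc` above copy into and out of the MTGS ring buffer; only their opening lines are visible in the hunks. A hedged sketch of the split-copy pattern those lines begin (the `u128`/`uint` stand-ins are illustrative, not the project's typedefs):

```cpp
#include <cstdint>
#include <cstring>

struct u128 { uint64_t lo, hi; };  // illustrative 128-bit element
typedef unsigned int uint;

// Copy in one piece if it fits, otherwise copy up to the ring's end and wrap.
void memCopy_WrappedDest_sketch(const u128* src, u128* destBase,
                                uint& destStart, uint destSize, uint len)
{
	uint endpos = destStart + len;
	if (endpos < destSize) {
		std::memcpy(&destBase[destStart], src, len * sizeof(u128));
		destStart += len;
	}
	else {
		uint firstLen = destSize - destStart;               // up to the ring's end
		std::memcpy(&destBase[destStart], src, firstLen * sizeof(u128));
		destStart = endpos % destSize;                      // wrapped remainder
		std::memcpy(destBase, src + firstLen, destStart * sizeof(u128));
	}
}
```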
@@ -114,7 +114,7 @@ mem8_t __fastcall iopHwRead8_Page8( u32 addr )
 //////////////////////////////////////////////////////////////////////////////////////////
 //
 template< typename T >
-static __forceinline T _HwRead_16or32_Page1( u32 addr )
+static __fi T _HwRead_16or32_Page1( u32 addr )
 {
 	// all addresses are assumed to be prefixed with 0x1f801xxx:
 	jASSUME( (addr >> 12) == 0x1f801 );

@@ -30,7 +30,7 @@ using namespace Internal;
 //////////////////////////////////////////////////////////////////////////////////////////
 //
 template< typename T >
-static __forceinline void _generic_write( u32 addr, T val )
+static __fi void _generic_write( u32 addr, T val )
 {
 	//int bitsize = (sizeof(T) == 1) ? 8 : ( (sizeof(T) == 2) ? 16 : 32 );
 	IopHwTraceLog<T>( addr, val, "Write" );
@@ -44,7 +44,7 @@ void __fastcall iopHwWrite32_generic( u32 addr, mem32_t val ) { _generic_write<m
 //////////////////////////////////////////////////////////////////////////////////////////
 //
 template< typename T >
-static __forceinline T _generic_read( u32 addr )
+static __fi T _generic_read( u32 addr )
 {
 	//int bitsize = (sizeof(T) == 1) ? 8 : ( (sizeof(T) == 2) ? 16 : 32 );

@@ -157,7 +157,7 @@ void __fastcall iopHwWrite8_Page8( u32 addr, mem8_t val )
 // Templated handler for both 32 and 16 bit write operations, to Page 1 registers.
 //
 template< typename T >
-static __forceinline void _HwWrite_16or32_Page1( u32 addr, T val )
+static __fi void _HwWrite_16or32_Page1( u32 addr, T val )
 {
 	// all addresses are assumed to be prefixed with 0x1f801xxx:
 	pxAssert( (addr >> 12) == 0x1f801 );
@@ -38,7 +38,7 @@ namespace Internal {
 //

 template< typename T>
-static __releaseinline const char* _log_GetIopHwName( u32 addr, T val )
+static __ri const char* _log_GetIopHwName( u32 addr, T val )
 {
 	switch( addr )
 	{
@@ -200,7 +200,7 @@ static __releaseinline const char* _log_GetIopHwName( u32 addr, T val )
 }

 template< typename T>
-static __releaseinline void IopHwTraceLog( u32 addr, T val, const char* modestr )
+static __ri void IopHwTraceLog( u32 addr, T val, const char* modestr )
 {
 	if( !EmuConfig.Trace.IOP.m_EnableRegisters ) return;
@@ -65,7 +65,7 @@ vtlbHandler UnmappedPhyHandler1;

 // Interpreted VTLB lookup for 8, 16, and 32 bit accesses
 template<int DataSize,typename DataType>
-__forceinline DataType __fastcall MemOp_r0(u32 addr)
+__fi DataType __fastcall MemOp_r0(u32 addr)
 {
 	u32 vmv=vtlbdata.vmap[addr>>VTLB_PAGE_BITS];
 	s32 ppf=addr+vmv;
@@ -94,7 +94,7 @@ __forceinline DataType __fastcall MemOp_r0(u32 addr)
 // ------------------------------------------------------------------------
 // Interpreterd VTLB lookup for 64 and 128 bit accesses.
 template<int DataSize,typename DataType>
-__forceinline void __fastcall MemOp_r1(u32 addr, DataType* data)
+__fi void MemOp_r1(u32 addr, DataType* data)
 {
 	u32 vmv=vtlbdata.vmap[addr>>VTLB_PAGE_BITS];
 	s32 ppf=addr+vmv;
@@ -125,7 +125,7 @@ __forceinline void __fastcall MemOp_r1(u32 addr, DataType* data)

 // ------------------------------------------------------------------------
 template<int DataSize,typename DataType>
-__forceinline void __fastcall MemOp_w0(u32 addr, DataType data)
+__fi void MemOp_w0(u32 addr, DataType data)
 {
 	u32 vmv=vtlbdata.vmap[addr>>VTLB_PAGE_BITS];
 	s32 ppf=addr+vmv;
@@ -153,7 +153,7 @@ __forceinline void __fastcall MemOp_w0(u32 addr, DataType data)

 // ------------------------------------------------------------------------
 template<int DataSize,typename DataType>
-__forceinline void __fastcall MemOp_w1(u32 addr,const DataType* data)
+__fi void MemOp_w1(u32 addr,const DataType* data)
 {
 	verify(DataSize==128 || DataSize==64);
 	u32 vmv=vtlbdata.vmap[addr>>VTLB_PAGE_BITS];
@@ -230,7 +230,7 @@ void __fastcall vtlb_memWrite128(u32 mem, const mem128_t *value)
 //

 // Generates a tlbMiss Exception
-static __forceinline void vtlb_Miss(u32 addr,u32 mode)
+static __ri void vtlb_Miss(u32 addr,u32 mode)
 {
 	if( IsDevBuild )
 		Cpu->ThrowCpuException( R5900Exception::TLBMiss( addr, !!mode ) );
@@ -241,7 +241,7 @@ static __forceinline void vtlb_Miss(u32 addr,u32 mode)
 // BusError exception: more serious than a TLB miss. If properly emulated the PS2 kernel
 // itself would invoke a diagnostic/assertion screen that displays the cpu state at the
 // time of the exception.
-static __forceinline void vtlb_BusError(u32 addr,u32 mode)
+static __ri void vtlb_BusError(u32 addr,u32 mode)
 {
 	// Throwing exceptions isn't reliable *yet* because memory ops don't flush
 	// the PC prior to invoking the indirect handlers.
@@ -297,17 +297,17 @@ template<u32 saddr>
 void __fastcall vtlbUnmappedPWrite128(u32 addr,const mem128_t* data) { vtlb_BusError(addr|saddr,1); }

 ///// VTLB mapping errors (unmapped address spaces)
-mem8_t __fastcall vtlbDefaultPhyRead8(u32 addr) { Console.Error("vtlbDefaultPhyRead8: 0x%X",addr); verify(false); return -1; }
-mem16_t __fastcall vtlbDefaultPhyRead16(u32 addr) { Console.Error("vtlbDefaultPhyRead16: 0x%X",addr); verify(false); return -1; }
-mem32_t __fastcall vtlbDefaultPhyRead32(u32 addr) { Console.Error("vtlbDefaultPhyRead32: 0x%X",addr); verify(false); return -1; }
-void __fastcall vtlbDefaultPhyRead64(u32 addr,mem64_t* data) { Console.Error("vtlbDefaultPhyRead64: 0x%X",addr); verify(false); }
-void __fastcall vtlbDefaultPhyRead128(u32 addr,mem128_t* data) { Console.Error("vtlbDefaultPhyRead128: 0x%X",addr); verify(false); }
+static mem8_t __fastcall vtlbDefaultPhyRead8(u32 addr) { Console.Error("vtlbDefaultPhyRead8: 0x%X",addr); verify(false); return -1; }
+static mem16_t __fastcall vtlbDefaultPhyRead16(u32 addr) { Console.Error("vtlbDefaultPhyRead16: 0x%X",addr); verify(false); return -1; }
+static mem32_t __fastcall vtlbDefaultPhyRead32(u32 addr) { Console.Error("vtlbDefaultPhyRead32: 0x%X",addr); verify(false); return -1; }
+static void __fastcall vtlbDefaultPhyRead64(u32 addr,mem64_t* data) { Console.Error("vtlbDefaultPhyRead64: 0x%X",addr); verify(false); }
+static void __fastcall vtlbDefaultPhyRead128(u32 addr,mem128_t* data) { Console.Error("vtlbDefaultPhyRead128: 0x%X",addr); verify(false); }

-void __fastcall vtlbDefaultPhyWrite8(u32 addr,mem8_t data) { Console.Error("vtlbDefaultPhyWrite8: 0x%X",addr); verify(false); }
-void __fastcall vtlbDefaultPhyWrite16(u32 addr,mem16_t data) { Console.Error("vtlbDefaultPhyWrite16: 0x%X",addr); verify(false); }
-void __fastcall vtlbDefaultPhyWrite32(u32 addr,mem32_t data) { Console.Error("vtlbDefaultPhyWrite32: 0x%X",addr); verify(false); }
-void __fastcall vtlbDefaultPhyWrite64(u32 addr,const mem64_t* data) { Console.Error("vtlbDefaultPhyWrite64: 0x%X",addr); verify(false); }
-void __fastcall vtlbDefaultPhyWrite128(u32 addr,const mem128_t* data) { Console.Error("vtlbDefaultPhyWrite128: 0x%X",addr); verify(false); }
+static void __fastcall vtlbDefaultPhyWrite8(u32 addr,mem8_t data) { Console.Error("vtlbDefaultPhyWrite8: 0x%X",addr); verify(false); }
+static void __fastcall vtlbDefaultPhyWrite16(u32 addr,mem16_t data) { Console.Error("vtlbDefaultPhyWrite16: 0x%X",addr); verify(false); }
+static void __fastcall vtlbDefaultPhyWrite32(u32 addr,mem32_t data) { Console.Error("vtlbDefaultPhyWrite32: 0x%X",addr); verify(false); }
+static void __fastcall vtlbDefaultPhyWrite64(u32 addr,const mem64_t* data) { Console.Error("vtlbDefaultPhyWrite64: 0x%X",addr); verify(false); }
+static void __fastcall vtlbDefaultPhyWrite128(u32 addr,const mem128_t* data) { Console.Error("vtlbDefaultPhyWrite128: 0x%X",addr); verify(false); }


 // ===========================================================================================
@@ -436,7 +436,7 @@ void vtlb_Mirror(u32 new_region,u32 start,u32 size)
 	}
 }

-__forceinline void* vtlb_GetPhyPtr(u32 paddr)
+__fi void* vtlb_GetPhyPtr(u32 paddr)
 {
 	if (paddr>=VTLB_PMAP_SZ || vtlbdata.pmap[paddr>>VTLB_PAGE_BITS]<0)
 		return NULL;
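The `MemOp_*` bodies above all open the same way: one table load and one add. The table stores, per guest page, the difference between host and guest addresses, so a mapped access needs no branch at all; handler pages are encoded so the sum goes negative. A demo-scale restatement of the mapped fast path (all names here are stand-ins, and the handler slow path is omitted):

```cpp
#include <cstdint>

static const int kPageBits = 12;
static intptr_t vmapDemo[1 << 20]; // one entry per 4KB page of a 32-bit space

// Map a guest page: store (host - guest) so that guest + entry == host.
static void mapPageDemo(uint32_t guestPage, void* hostMem)
{
	vmapDemo[guestPage] = (intptr_t)hostMem - (intptr_t)(guestPage << kPageBits);
}

template <typename T>
static T demoRead(uint32_t addr)
{
	intptr_t ppf = addr + vmapDemo[addr >> kPageBits];
	return *reinterpret_cast<T*>(ppf); // direct host-memory access, no branch
}
```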
@@ -78,7 +78,7 @@ public:
 	int LastIndex (u32 startpc) const;
 	BASEBLOCKEX* GetByX86(uptr ip);

-	__forceinline int Index (u32 startpc) const
+	__fi int Index (u32 startpc) const
 	{
 		int idx = LastIndex(startpc);
 		// fixme: I changed the parenthesis to be unambiguous, but this needs to be checked to see if ((x or y or z) and w)
@@ -91,19 +91,19 @@ public:
 			return idx;
 	}

-	__forceinline BASEBLOCKEX* operator[](int idx)
+	__fi BASEBLOCKEX* operator[](int idx)
 	{
 		if (idx < 0 || idx >= (int)blocks.size())
 			return 0;
 		return &blocks[idx];
 	}

-	__forceinline BASEBLOCKEX* Get(u32 startpc)
+	__fi BASEBLOCKEX* Get(u32 startpc)
 	{
 		return (*this)[Index(startpc)];
 	}

-	__forceinline void Remove(int idx)
+	__fi void Remove(int idx)
 	{
 		//u32 startpc = blocks[idx].startpc;
 		std::pair<linkiter_t, linkiter_t> range = links.equal_range(blocks[idx].startpc);
@@ -127,7 +127,7 @@ public:

 	void Link(u32 pc, s32* jumpptr);

-	__forceinline void Reset()
+	__fi void Reset()
 	{
 		blocks.clear();
 		links.clear();
@@ -51,7 +51,7 @@ void _initXMMregs() {
 	s_xmmchecknext = 0;
 }

-__forceinline void* _XMMGetAddr(int type, int reg, VURegs *VU)
+__fi void* _XMMGetAddr(int type, int reg, VURegs *VU)
 {
 	switch (type) {
 		case XMMTYPE_VFREG:

@@ -100,7 +100,7 @@
 #define X86TYPE_VU1 0x80

 //#define X86_ISVI(type) ((type&~X86TYPE_VU1) == X86TYPE_VI)
-static __forceinline int X86_ISVI(int type)
+static __fi int X86_ISVI(int type)
 {
 	return ((type&~X86TYPE_VU1) == X86TYPE_VI);
 }
@@ -233,12 +233,12 @@ extern u32 _recIsRegWritten(EEINST* pinst, int size, u8 xmmtype, u8 reg);
 extern u32 _recIsRegUsed(EEINST* pinst, int size, u8 xmmtype, u8 reg);
 extern void _recFillRegister(EEINST& pinst, int type, int reg, int write);

-static __forceinline bool EEINST_ISLIVE64(u32 reg) { return !!(g_pCurInstInfo->regs[reg] & (EEINST_LIVE0)); }
-static __forceinline bool EEINST_ISLIVEXMM(u32 reg) { return !!(g_pCurInstInfo->regs[reg] & (EEINST_LIVE0|EEINST_LIVE2)); }
-static __forceinline bool EEINST_ISLIVE2(u32 reg) { return !!(g_pCurInstInfo->regs[reg] & EEINST_LIVE2); }
+static __fi bool EEINST_ISLIVE64(u32 reg) { return !!(g_pCurInstInfo->regs[reg] & (EEINST_LIVE0)); }
+static __fi bool EEINST_ISLIVEXMM(u32 reg) { return !!(g_pCurInstInfo->regs[reg] & (EEINST_LIVE0|EEINST_LIVE2)); }
+static __fi bool EEINST_ISLIVE2(u32 reg) { return !!(g_pCurInstInfo->regs[reg] & EEINST_LIVE2); }

-static __forceinline bool FPUINST_ISLIVE(u32 reg) { return !!(g_pCurInstInfo->fpuregs[reg] & EEINST_LIVE0); }
-static __forceinline bool FPUINST_LASTUSE(u32 reg) { return !!(g_pCurInstInfo->fpuregs[reg] & EEINST_LASTUSE); }
+static __fi bool FPUINST_ISLIVE(u32 reg) { return !!(g_pCurInstInfo->fpuregs[reg] & EEINST_LIVE0); }
+static __fi bool FPUINST_LASTUSE(u32 reg) { return !!(g_pCurInstInfo->fpuregs[reg] & EEINST_LASTUSE); }

 extern u32 g_recWriteback; // used for jumps (VUrec mess!)

@@ -277,17 +277,17 @@ void SetFPUstate();
 #define MMX_COP0 96
 #define MMX_TEMP 0x7f

-static __forceinline bool MMX_IS32BITS(s32 x)
+static __fi bool MMX_IS32BITS(s32 x)
 {
 	return (((x >= MMX_FPU) && (x < MMX_COP0 + 32)) || (x == MMX_FPUACC));
 }

-static __forceinline bool MMX_ISGPR(s32 x)
+static __fi bool MMX_ISGPR(s32 x)
 {
 	return ((x >= MMX_GPR) && (x < MMX_GPR + 34));
 }

-static __forceinline bool MMX_ISGPR(u32 x)
+static __fi bool MMX_ISGPR(u32 x)
 {
 	return (x < MMX_GPR + 34);
 }
@@ -340,7 +340,7 @@ REC_FPUFUNC(RSQRT_S);
 //------------------------------------------------------------------

 static __aligned16 u64 FPU_FLOAT_TEMP[2];
-__forceinline void fpuFloat4(int regd) { // +NaN -> +fMax, -NaN -> -fMax, +Inf -> +fMax, -Inf -> -fMax
+__fi void fpuFloat4(int regd) { // +NaN -> +fMax, -NaN -> -fMax, +Inf -> +fMax, -Inf -> -fMax
 	int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
 	if (t1reg >= 0) {
 		SSE_MOVSS_XMM_to_XMM(t1reg, regd);
@@ -363,20 +363,20 @@ __forceinline void fpuFloat4(int regd) { // +NaN -> +fMax, -NaN -> -fMax, +Inf -> +fMax, -Inf -> -fMax
 	}
 }

-__forceinline void fpuFloat(int regd) { // +/-NaN -> +fMax, +Inf -> +fMax, -Inf -> -fMax
+__fi void fpuFloat(int regd) { // +/-NaN -> +fMax, +Inf -> +fMax, -Inf -> -fMax
 	if (CHECK_FPU_OVERFLOW) {
 		SSE_MINSS_M32_to_XMM(regd, (uptr)&g_maxvals[0]); // MIN() must be before MAX()! So that NaN's become +Maximum
 		SSE_MAXSS_M32_to_XMM(regd, (uptr)&g_minvals[0]);
 	}
 }

-__forceinline void fpuFloat2(int regd) { // +NaN -> +fMax, -NaN -> -fMax, +Inf -> +fMax, -Inf -> -fMax
+__fi void fpuFloat2(int regd) { // +NaN -> +fMax, -NaN -> -fMax, +Inf -> +fMax, -Inf -> -fMax
 	if (CHECK_FPU_OVERFLOW) {
 		fpuFloat4(regd);
 	}
 }

-__forceinline void fpuFloat3(int regd) {
+__fi void fpuFloat3(int regd) {
 	// This clamp function is used in the recC_xx opcodes
 	// Rule of Rose needs clamping or else it crashes (minss or maxss both fix the crash)
 	// Tekken 5 has disappearing characters unless preserving NaN sign (fpuFloat4() preserves NaN sign).
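The `fpuFloat*` helpers above clamp NaN and infinity before SSE compares. Restated with plain intrinsics, assuming `g_maxvals`/`g_minvals` hold +/-FLT_MAX (an assumption; only the ordering rule is visible in the hunk). MINSS returns its second operand when the first is NaN, which is exactly why MIN must come before MAX: NaN and +Inf both collapse to +fMax, then MAXSS catches -Inf.

```cpp
#include <xmmintrin.h>

static inline float clampForFPU(float x)
{
	__m128 v = _mm_set_ss(x);
	v = _mm_min_ss(v, _mm_set_ss( 3.402823466e38f)); // NaN/+Inf -> +fMax
	v = _mm_max_ss(v, _mm_set_ss(-3.402823466e38f)); // -Inf     -> -fMax
	return _mm_cvtss_f32(v);
}
```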
@@ -901,7 +901,7 @@ static __noinline s32 recExecuteBlock( s32 eeCycles )
 }

 // Returns the offset to the next instruction after any cleared memory
-static __forceinline u32 psxRecClearMem(u32 pc)
+static __fi u32 psxRecClearMem(u32 pc)
 {
 	BASEBLOCK* pblock;

@@ -948,7 +948,7 @@ static __forceinline u32 psxRecClearMem(u32 pc)
 	return upperextent - pc;
 }

-static __forceinline void recClearIOP(u32 Addr, u32 Size)
+static __fi void recClearIOP(u32 Addr, u32 Size)
 {
 	u32 pc = Addr;
 	while (pc < Addr + Size*4)
@@ -1008,7 +1008,7 @@ void psxSetBranchImm( u32 imm )
 	recBlocks.Link(HWADDR(imm), xJcc32());
 }

-static __forceinline u32 psxScaleBlockCycles()
+static __fi u32 psxScaleBlockCycles()
 {
 	return s_psxBlockCycles;
 }
@@ -466,7 +466,7 @@ void _initMMXregs()
 	s_mmxchecknext = 0;
 }

-__forceinline void* _MMXGetAddr(int reg)
+__fi void* _MMXGetAddr(int reg)
 {
 	pxAssert( reg != MMX_TEMP );

@@ -817,7 +817,7 @@ void R5900::Dynarec::OpcodeImpl::recBREAK( void )
 }

 // Clears the recLUT table so that all blocks are mapped to the JIT recompiler by default.
-static __releaseinline void ClearRecLUT(BASEBLOCK* base, int count)
+static __ri void ClearRecLUT(BASEBLOCK* base, int count)
 {
 	for (int i = 0; i < count; i++)
 		base[i].SetFnptr((uptr)JITCompile);
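`ClearRecLUT` above resets every block entry to `JITCompile`. The idiom: an uncompiled block's entry already points at the recompiler stub, so dispatch never needs a "compiled yet?" check -- executing an uncompiled block simply falls into the compiler, which then patches the entry. A generic restatement with a hypothetical function-pointer table (the real code stores the pointer inside `BASEBLOCK`):

```cpp
#include <cstddef>

typedef void (*BlockFn)();

static void jitCompileStub() { /* compile the block, patch the table entry, re-dispatch */ }

static void clearRecLUT(BlockFn* base, size_t count)
{
	for (size_t i = 0; i < count; i++)
		base[i] = jitCompileStub; // default: fall into the compiler
}
```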
@@ -63,14 +63,14 @@ const __aligned(32) mVU_Globals mVUglob = {
 // Micro VU - Main Functions
 //------------------------------------------------------------------

-_f void mVUthrowHardwareDeficiency(const wxChar* extFail, int vuIndex) {
+__fi void mVUthrowHardwareDeficiency(const wxChar* extFail, int vuIndex) {
 	throw Exception::HardwareDeficiency()
 		.SetDiagMsg(wxsFormat(L"microVU%d recompiler init failed: %s is not available.", vuIndex, extFail))
 		.SetUserMsg(wxsFormat(_("%s Extensions not found. microVU requires a host CPU with MMX, SSE, and SSE2 extensions."), extFail ));
 }

 // Only run this once per VU! ;)
-_f void mVUinit(VURegs* vuRegsPtr, int vuIndex) {
+__fi void mVUinit(VURegs* vuRegsPtr, int vuIndex) {

 	if(!x86caps.hasMultimediaExtensions) mVUthrowHardwareDeficiency( L"MMX", vuIndex );
 	if(!x86caps.hasStreamingSIMDExtensions) mVUthrowHardwareDeficiency( L"SSE", vuIndex );
@@ -110,7 +110,7 @@ _f void mVUinit(VURegs* vuRegsPtr, int vuIndex) {
 }

 // Resets Rec Data
-_f void mVUreset(mV) {
+__fi void mVUreset(mV) {

 	// Clear All Program Data
 	//memset(&mVU->prog, 0, sizeof(mVU->prog));
@@ -146,7 +146,7 @@ _f void mVUreset(mV) {
 }

 // Free Allocated Resources
-_f void mVUclose(mV) {
+__fi void mVUclose(mV) {

 	if (mVU->dispCache) { HostSys::Munmap(mVU->dispCache, mVUdispCacheSize); mVU->dispCache = NULL; }
 	if (mVU->cache) { HostSys::Munmap(mVU->cache, mVU->cacheSize); mVU->cache = NULL; }
@@ -194,7 +194,7 @@ void mVUresizeCache(mV, u32 size) {
 }

 // Clears Block Data in specified range
-_f void mVUclear(mV, u32 addr, u32 size) {
+__fi void mVUclear(mV, u32 addr, u32 size) {
 	if (!mVU->prog.cleared) {
 		memzero(mVU->prog.lpState); // Clear pipeline state
 		mVU->prog.cleared = 1; // Next execution searches/creates a new microprogram
@@ -210,12 +210,12 @@ _f void mVUclear(mV, u32 addr, u32 size) {
 //------------------------------------------------------------------

 // Finds and Ages/Kills Programs if they haven't been used in a while.
-_f void mVUvsyncUpdate(mV) {
+__fi void mVUvsyncUpdate(mV) {
 	//mVU->prog.curFrame++;
 }

 // Deletes a program
-_mVUt _f void mVUdeleteProg(microProgram*& prog) {
+_mVUt __fi void mVUdeleteProg(microProgram*& prog) {
 	microVU* mVU = mVUx;
 	for (u32 i = 0; i < (mVU->progSize / 2); i++) {
 		safe_delete(prog->block[i]);
@@ -225,7 +225,7 @@ _mVUt _f void mVUdeleteProg(microProgram*& prog) {
 }

 // Creates a new Micro Program
-_mVUt _f microProgram* mVUcreateProg(int startPC) {
+_mVUt __fi microProgram* mVUcreateProg(int startPC) {
 	microVU* mVU = mVUx;
 	microProgram* prog = (microProgram*)_aligned_malloc(sizeof(microProgram), 64);
 	memzero_ptr<sizeof(microProgram)>(prog);
@@ -242,7 +242,7 @@ _mVUt _f microProgram* mVUcreateProg(int startPC) {
 }

 // Caches Micro Program
-_mVUt _f void mVUcacheProg(microProgram& prog) {
+_mVUt __fi void mVUcacheProg(microProgram& prog) {
 	microVU* mVU = mVUx;
 	if (!vuIndex) memcpy_const(prog.data, mVU->regs->Micro, 0x1000);
 	else memcpy_const(prog.data, mVU->regs->Micro, 0x4000);
@@ -250,7 +250,7 @@ _mVUt _f void mVUcacheProg(microProgram& prog) {
 }

 // Compare partial program by only checking compiled ranges...
-_mVUt _f bool mVUcmpPartial(microProgram& prog) {
+_mVUt __fi bool mVUcmpPartial(microProgram& prog) {
 	microVU* mVU = mVUx;
 	deque<microRange>::const_iterator it(prog.ranges->begin());
 	for ( ; it != prog.ranges->end(); ++it) {
@@ -263,7 +263,7 @@ _mVUt _f bool mVUcmpPartial(microProgram& prog) {
 }

 // Compare Cached microProgram to mVU->regs->Micro
-_mVUt _f bool mVUcmpProg(microProgram& prog, const bool cmpWholeProg) {
+_mVUt __fi bool mVUcmpProg(microProgram& prog, const bool cmpWholeProg) {
 	microVU* mVU = mVUx;
 	if ((cmpWholeProg && !memcmp_mmx((u8*)prog.data, mVU->regs->Micro, mVU->microMemSize))
 	|| (!cmpWholeProg && mVUcmpPartial<vuIndex>(prog))) {
@@ -276,7 +276,7 @@ _mVUt _f bool mVUcmpProg(microProgram& prog, const bool cmpWholeProg) {
 }

 // Searches for Cached Micro Program and sets prog.cur to it (returns entry-point to program)
-_mVUt _f void* mVUsearchProg(u32 startPC, uptr pState) {
+_mVUt __fi void* mVUsearchProg(u32 startPC, uptr pState) {
 	microVU* mVU = mVUx;
 	microProgramQuick& quick = mVU->prog.quick[startPC/8];
 	microProgramList* list = mVU->prog.prog [startPC/8];
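`mVUcmpPartial`/`mVUcmpProg` above validate a cached microprogram either with one whole-memory `memcmp_mmx` or by walking only the compiled ranges. A generic restatement of the range walk (the `Range` type is an assumed stand-in for `microRange`):

```cpp
#include <cstdint>
#include <cstring>
#include <deque>

struct Range { int start, end; }; // byte span of a compiled region

static bool cmpCompiledRanges(const std::deque<Range>& ranges,
                              const uint8_t* cached, const uint8_t* current)
{
	for (std::deque<Range>::const_iterator it = ranges.begin(); it != ranges.end(); ++it)
		if (std::memcmp(cached + it->start, current + it->start, it->end - it->start))
			return false; // a compiled range changed -> treat as a different program
	return true;
}
```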
@@ -78,7 +78,7 @@ public:
 		}
 		return thisBlock;
 	}
-	__releaseinline microBlock* search(microRegInfo* pState) {
+	__ri microBlock* search(microRegInfo* pState) {
 		microBlockLink* linkI = &blockList;
 		if (pState->needExactMatch) { // Needs Detailed Search (Exact Match of Pipeline State)
 			for (int i = 0; i <= listI; i++) {
@@ -204,27 +204,27 @@ extern __aligned16 microVU microVU1;
 int mVUdebugNow = 0;

 // Main Functions
-_f void mVUinit(VURegs*, int);
-_f void mVUreset(mV);
-_f void mVUclose(mV);
-_f void mVUclear(mV, u32, u32);
-void mVUresizeCache(mV, u32);
-_f void* mVUblockFetch(microVU* mVU, u32 startPC, uptr pState);
-_mVUt void* __fastcall mVUcompileJIT(u32 startPC, uptr pState);
+extern void mVUinit(VURegs*, int);
+extern void mVUreset(mV);
+extern void mVUclose(mV);
+extern void mVUclear(mV, u32, u32);
+extern void mVUresizeCache(mV, u32);
+extern void* mVUblockFetch(microVU* mVU, u32 startPC, uptr pState);
+_mVUt extern void* __fastcall mVUcompileJIT(u32 startPC, uptr pState);

 // Prototypes for Linux
-void __fastcall mVUcleanUpVU0();
-void __fastcall mVUcleanUpVU1();
+extern void __fastcall mVUcleanUpVU0();
+extern void __fastcall mVUcleanUpVU1();
 mVUop(mVUopU);
 mVUop(mVUopL);

 // Private Functions
-_mVUt _f void mVUcacheProg (microProgram& prog);
-_mVUt _f void mVUdeleteProg(microProgram*& prog);
-_mVUt _f void* mVUsearchProg(u32 startPC, uptr pState);
-_mVUt _f microProgram* mVUfindLeastUsedProg();
-void* __fastcall mVUexecuteVU0(u32 startPC, u32 cycles);
-void* __fastcall mVUexecuteVU1(u32 startPC, u32 cycles);
+_mVUt extern void mVUcacheProg (microProgram& prog);
+_mVUt extern void mVUdeleteProg(microProgram*& prog);
+_mVUt extern void* mVUsearchProg(u32 startPC, uptr pState);
+_mVUt extern microProgram* mVUfindLeastUsedProg();
+extern void* __fastcall mVUexecuteVU0(u32 startPC, u32 cycles);
+extern void* __fastcall mVUexecuteVU1(u32 startPC, u32 cycles);

 // recCall Function Pointer
 typedef void (__fastcall *mVUrecCall)(u32, u32);
@@ -23,14 +23,14 @@
 // Flag Allocators
 //------------------------------------------------------------------

-_f static const x32& getFlagReg(uint fInst)
+__fi static const x32& getFlagReg(uint fInst)
 {
 	static const x32* const gprF_crap[4] = { &gprF0, &gprF1, &gprF2, &gprF3 };
 	pxAssume(fInst < 4);
 	return *gprF_crap[fInst];
 }

-_f void setBitSFLAG(const x32& reg, const x32& regT, int bitTest, int bitSet)
+__fi void setBitSFLAG(const x32& reg, const x32& regT, int bitTest, int bitSet)
 {
 	xTEST(regT, bitTest);
 	xForwardJZ8 skip;
@@ -38,7 +38,7 @@ _f void setBitSFLAG(const x32& reg, const x32& regT, int bitTest, int bitSet)
 	skip.SetTarget();
 }

-_f void setBitFSEQ(const x32& reg, int bitX)
+__fi void setBitFSEQ(const x32& reg, int bitX)
 {
 	xTEST(reg, bitX);
 	xForwardJump8 skip(Jcc_Zero);
@@ -46,18 +46,18 @@ _f void setBitFSEQ(const x32& reg, int bitX)
 	skip.SetTarget();
 }

-_f void mVUallocSFLAGa(const x32& reg, int fInstance)
+__fi void mVUallocSFLAGa(const x32& reg, int fInstance)
 {
 	xMOV(reg, getFlagReg(fInstance));
 }

-_f void mVUallocSFLAGb(const x32& reg, int fInstance)
+__fi void mVUallocSFLAGb(const x32& reg, int fInstance)
 {
 	xMOV(getFlagReg(fInstance), reg);
 }

 // Normalize Status Flag
-_f void mVUallocSFLAGc(const x32& reg, const x32& regT, int fInstance)
+__ri void mVUallocSFLAGc(const x32& reg, const x32& regT, int fInstance)
 {
 	xXOR(reg, reg);
 	mVUallocSFLAGa(regT, fInstance);
@@ -71,7 +71,7 @@ _f void mVUallocSFLAGc(const x32& reg, const x32& regT, int fInstance)
 }

 // Denormalizes Status Flag
-_f void mVUallocSFLAGd(u32* memAddr, bool setAllflags) {
+__ri void mVUallocSFLAGd(u32* memAddr, bool setAllflags) {

 	// Cannot use EBP (gprF1) here; as this function is used by mVU0 macro and
 	// the EErec needs EBP preserved.
@@ -101,25 +101,25 @@ _f void mVUallocSFLAGd(u32* memAddr, bool setAllflags) {
 	}
 }

-_f void mVUallocMFLAGa(mV, const x32& reg, int fInstance)
+__fi void mVUallocMFLAGa(mV, const x32& reg, int fInstance)
 {
 	xMOVZX(reg, ptr16[&mVU->macFlag[fInstance]]);
 }

-_f void mVUallocMFLAGb(mV, const x32& reg, int fInstance)
+__fi void mVUallocMFLAGb(mV, const x32& reg, int fInstance)
 {
 	//xAND(reg, 0xffff);
 	if (fInstance < 4) xMOV(ptr32[&mVU->macFlag[fInstance]], reg); // microVU
 	else xMOV(ptr32[&mVU->regs->VI[REG_MAC_FLAG].UL], reg); // macroVU
 }

-_f void mVUallocCFLAGa(mV, const x32& reg, int fInstance)
+__fi void mVUallocCFLAGa(mV, const x32& reg, int fInstance)
 {
 	if (fInstance < 4) xMOV(reg, ptr32[&mVU->clipFlag[fInstance]]); // microVU
 	else xMOV(reg, ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL]); // macroVU
 }

-_f void mVUallocCFLAGb(mV, const x32& reg, int fInstance)
+__fi void mVUallocCFLAGb(mV, const x32& reg, int fInstance)
 {
 	if (fInstance < 4) xMOV(ptr32[&mVU->clipFlag[fInstance]], reg); // microVU
 	else xMOV(ptr32[&mVU->regs->VI[REG_CLIP_FLAG].UL], reg); // macroVU
@@ -129,7 +129,7 @@ _f void mVUallocCFLAGb(mV, const x32& reg, int fInstance)
 // VI Reg Allocators
 //------------------------------------------------------------------

-_f void mVUallocVIa(mV, const x32& GPRreg, int _reg_, bool signext = false)
+__ri void mVUallocVIa(mV, const x32& GPRreg, int _reg_, bool signext = false)
 {
 	if (!_reg_)
 		xXOR(GPRreg, GPRreg);
@@ -140,7 +140,7 @@ _f void mVUallocVIa(mV, const x32& GPRreg, int _reg_, bool signext = false)
 		xMOVZX(GPRreg, ptr16[&mVU->regs->VI[_reg_].UL]);
 }

-_f void mVUallocVIb(mV, const x32& GPRreg, int _reg_)
+__ri void mVUallocVIb(mV, const x32& GPRreg, int _reg_)
 {
 	if (mVUlow.backupVI) { // Backs up reg to memory (used when VI is modified b4 a branch)
 		xMOVZX(gprT3, ptr16[&mVU->regs->VI[_reg_].UL]);
@@ -154,19 +154,19 @@ _f void mVUallocVIb(mV, const x32& GPRreg, int _reg_)
 // P/Q Reg Allocators
 //------------------------------------------------------------------

-_f void getPreg(mV, const xmm& reg)
+__fi void getPreg(mV, const xmm& reg)
 {
 	mVUunpack_xyzw(reg, xmmPQ, (2 + mVUinfo.readP));
 	/*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2(reg, xmmT1, 15);*/
 }

-_f void getQreg(const xmm& reg, int qInstance)
+__fi void getQreg(const xmm& reg, int qInstance)
 {
 	mVUunpack_xyzw(reg, xmmPQ, qInstance);
 	/*if (CHECK_VU_EXTRA_OVERFLOW) mVUclamp2<vuIndex>(reg, xmmT1, 15);*/
 }

-_f void writeQreg(const xmm& reg, int qInstance)
+__ri void writeQreg(const xmm& reg, int qInstance)
 {
 	if (qInstance) {
 		if (!x86caps.hasStreamingSIMD4Extensions) {
@@ -118,7 +118,7 @@
 // FMAC1 - Normal FMAC Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeFMAC1(mV, int Fd, int Fs, int Ft) {
+__fi void mVUanalyzeFMAC1(mV, int Fd, int Fs, int Ft) {
 	sFLAG.doFlag = 1;
 	analyzeReg1(Fs, mVUup.VF_read[0]);
 	analyzeReg1(Ft, mVUup.VF_read[1]);
@@ -129,7 +129,7 @@ _f void mVUanalyzeFMAC1(mV, int Fd, int Fs, int Ft) {
 // FMAC2 - ABS/FTOI/ITOF Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeFMAC2(mV, int Fs, int Ft) {
+__fi void mVUanalyzeFMAC2(mV, int Fs, int Ft) {
 	analyzeReg1(Fs, mVUup.VF_read[0]);
 	analyzeReg2(Ft, mVUup.VF_write, 0);
 }
@@ -138,7 +138,7 @@ _f void mVUanalyzeFMAC2(mV, int Fs, int Ft) {
 // FMAC3 - BC(xyzw) FMAC Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeFMAC3(mV, int Fd, int Fs, int Ft) {
+__fi void mVUanalyzeFMAC3(mV, int Fd, int Fs, int Ft) {
 	sFLAG.doFlag = 1;
 	analyzeReg1(Fs, mVUup.VF_read[0]);
 	analyzeReg3(Ft, mVUup.VF_read[1]);
@@ -149,7 +149,7 @@ _f void mVUanalyzeFMAC3(mV, int Fd, int Fs, int Ft) {
 // FMAC4 - Clip FMAC Opcode
 //------------------------------------------------------------------

-_f void mVUanalyzeFMAC4(mV, int Fs, int Ft) {
+__fi void mVUanalyzeFMAC4(mV, int Fs, int Ft) {
 	cFLAG.doFlag = 1;
 	analyzeReg1(Fs, mVUup.VF_read[0]);
 	analyzeReg4(Ft, mVUup.VF_read[1]);
@@ -159,20 +159,20 @@ _f void mVUanalyzeFMAC4(mV, int Fs, int Ft) {
 // IALU - IALU Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeIALU1(mV, int Id, int Is, int It) {
+__fi void mVUanalyzeIALU1(mV, int Id, int Is, int It) {
 	if (!Id) { mVUlow.isNOP = 1; }
 	analyzeVIreg1(Is, mVUlow.VI_read[0]);
 	analyzeVIreg1(It, mVUlow.VI_read[1]);
 	analyzeVIreg2(Id, mVUlow.VI_write, 1);
 }

-_f void mVUanalyzeIALU2(mV, int Is, int It) {
+__fi void mVUanalyzeIALU2(mV, int Is, int It) {
 	if (!It) { mVUlow.isNOP = 1; }
 	analyzeVIreg1(Is, mVUlow.VI_read[0]);
 	analyzeVIreg2(It, mVUlow.VI_write, 1);
 }

-_f void mVUanalyzeIADDI(mV, int Is, int It, s16 imm) {
+__fi void mVUanalyzeIADDI(mV, int Is, int It, s16 imm) {
 	mVUanalyzeIALU2(mVU, Is, It);
 	if (!Is) { setConstReg(It, imm); }
 }
@@ -181,7 +181,7 @@ _f void mVUanalyzeIADDI(mV, int Is, int It, s16 imm) {
 // MR32 - MR32 Opcode
 //------------------------------------------------------------------

-_f void mVUanalyzeMR32(mV, int Fs, int Ft) {
+__fi void mVUanalyzeMR32(mV, int Fs, int Ft) {
 	if (!Ft) { mVUlow.isNOP = 1; }
 	analyzeReg6(Fs, mVUlow.VF_read[0]);
 	analyzeReg2(Ft, mVUlow.VF_write, 1);
@@ -191,7 +191,7 @@ _f void mVUanalyzeMR32(mV, int Fs, int Ft) {
 // FDIV - DIV/SQRT/RSQRT Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeFDIV(mV, int Fs, int Fsf, int Ft, int Ftf, u8 xCycles) {
+__fi void mVUanalyzeFDIV(mV, int Fs, int Fsf, int Ft, int Ftf, u8 xCycles) {
 	analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]);
 	analyzeReg5(Ft, Ftf, mVUlow.VF_read[1]);
 	analyzeQreg(xCycles);
@@ -201,12 +201,12 @@ _f void mVUanalyzeFDIV(mV, int Fs, int Fsf, int Ft, int Ftf, u8 xCycles) {
 // EFU - EFU Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeEFU1(mV, int Fs, int Fsf, u8 xCycles) {
+__fi void mVUanalyzeEFU1(mV, int Fs, int Fsf, u8 xCycles) {
 	analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]);
 	analyzePreg(xCycles);
 }

-_f void mVUanalyzeEFU2(mV, int Fs, u8 xCycles) {
+__fi void mVUanalyzeEFU2(mV, int Fs, u8 xCycles) {
 	analyzeReg1(Fs, mVUlow.VF_read[0]);
 	analyzePreg(xCycles);
 }
@@ -215,7 +215,7 @@ _f void mVUanalyzeEFU2(mV, int Fs, u8 xCycles) {
 // MFP - MFP Opcode
 //------------------------------------------------------------------

-_f void mVUanalyzeMFP(mV, int Ft) {
+__fi void mVUanalyzeMFP(mV, int Ft) {
 	if (!Ft) { mVUlow.isNOP = 1; }
 	analyzeReg2(Ft, mVUlow.VF_write, 1);
 }
@@ -224,7 +224,7 @@ _f void mVUanalyzeMFP(mV, int Ft) {
 // MOVE - MOVE Opcode
 //------------------------------------------------------------------

-_f void mVUanalyzeMOVE(mV, int Fs, int Ft) {
+__fi void mVUanalyzeMOVE(mV, int Fs, int Ft) {
 	if (!Ft || (Ft == Fs)) { mVUlow.isNOP = 1; }
 	analyzeReg1(Fs, mVUlow.VF_read[0]);
 	analyzeReg2(Ft, mVUlow.VF_write, 1);
@@ -234,7 +234,7 @@ _f void mVUanalyzeMOVE(mV, int Fs, int Ft) {
 // LQx - LQ/LQD/LQI Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeLQ(mV, int Ft, int Is, bool writeIs) {
+__fi void mVUanalyzeLQ(mV, int Ft, int Is, bool writeIs) {
 	analyzeVIreg1(Is, mVUlow.VI_read[0]);
 	analyzeReg2 (Ft, mVUlow.VF_write, 1);
 	if (!Ft) { if (writeIs && Is) { mVUlow.noWriteVF = 1; } else { mVUlow.isNOP = 1; } }
@@ -245,7 +245,7 @@ _f void mVUanalyzeLQ(mV, int Ft, int Is, bool writeIs) {
 // SQx - SQ/SQD/SQI Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeSQ(mV, int Fs, int It, bool writeIt) {
+__fi void mVUanalyzeSQ(mV, int Fs, int It, bool writeIt) {
 	analyzeReg1 (Fs, mVUlow.VF_read[0]);
 	analyzeVIreg1(It, mVUlow.VI_read[0]);
 	if (writeIt) { analyzeVIreg2(It, mVUlow.VI_write, 1); }
@@ -255,12 +255,12 @@ _f void mVUanalyzeSQ(mV, int Fs, int It, bool writeIt) {
 // R*** - R Reg Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeR1(mV, int Fs, int Fsf) {
+__fi void mVUanalyzeR1(mV, int Fs, int Fsf) {
 	analyzeReg5(Fs, Fsf, mVUlow.VF_read[0]);
 	analyzeRreg();
 }

-_f void mVUanalyzeR2(mV, int Ft, bool canBeNOP) {
+__fi void mVUanalyzeR2(mV, int Ft, bool canBeNOP) {
 	if (!Ft) { if (canBeNOP) { mVUlow.isNOP = 1; } else { mVUlow.noWriteVF = 1; } }
 	analyzeReg2(Ft, mVUlow.VF_write, 1);
 	analyzeRreg();
@@ -269,7 +269,7 @@ _f void mVUanalyzeR2(mV, int Ft, bool canBeNOP) {
 //------------------------------------------------------------------
 // Sflag - Status Flag Opcodes
 //------------------------------------------------------------------
-_f void flagSet(mV, bool setMacFlag) {
+__ri void flagSet(mV, bool setMacFlag) {
 	int curPC = iPC;
 	for (int i = mVUcount, j = 0; i > 0; i--, j++) {
 		j += mVUstall;
@@ -283,7 +283,7 @@ _f void flagSet(mV, bool setMacFlag) {
 	iPC = curPC;
 }

-_f void mVUanalyzeSflag(mV, int It) {
+__ri void mVUanalyzeSflag(mV, int It) {
 	mVUlow.readFlags = 1;
 	analyzeVIreg2(It, mVUlow.VI_write, 1);
 	if (!It) { mVUlow.isNOP = 1; }
@@ -295,7 +295,7 @@ _f void mVUanalyzeSflag(mV, int It) {
 	}
 }

-_f void mVUanalyzeFSSET(mV) {
+__ri void mVUanalyzeFSSET(mV) {
 	mVUlow.isFSSET = 1;
 	mVUlow.readFlags = 1;
 }
@@ -304,7 +304,7 @@ _f void mVUanalyzeFSSET(mV) {
 // Mflag - Mac Flag Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeMflag(mV, int Is, int It) {
+__ri void mVUanalyzeMflag(mV, int Is, int It) {
 	mVUlow.readFlags = 1;
 	analyzeVIreg1(Is, mVUlow.VI_read[0]);
 	analyzeVIreg2(It, mVUlow.VI_write, 1);
@@ -320,7 +320,7 @@ _f void mVUanalyzeMflag(mV, int Is, int It) {
 // Cflag - Clip Flag Opcodes
 //------------------------------------------------------------------

-_f void mVUanalyzeCflag(mV, int It) {
+__fi void mVUanalyzeCflag(mV, int It) {
 	mVUinfo.swapOps = 1;
 	mVUlow.readFlags = 1;
 	if (mVUcount < 4) { mVUpBlock->pState.needExactMatch |= 4; }
@@ -331,7 +331,7 @@ _f void mVUanalyzeCflag(mV, int It) {
 // XGkick
 //------------------------------------------------------------------

-_f void mVUanalyzeXGkick(mV, int Fs, int xCycles) {
+__fi void mVUanalyzeXGkick(mV, int Fs, int xCycles) {
 	analyzeVIreg1(Fs, mVUlow.VI_read[0]);
 	analyzeXGkick1();
 	analyzeXGkick2(xCycles);
@@ -347,7 +347,7 @@ _f void mVUanalyzeXGkick(mV, int Fs, int xCycles) {
 // Branches - Branch Opcodes
 //------------------------------------------------------------------

-_f void analyzeBranchVI(mV, int xReg, bool &infoVar) {
+static void analyzeBranchVI(mV, int xReg, bool &infoVar) {
 	if (!xReg) return;
 	int i, j = 0;
 	int iEnd = 4;
@@ -390,7 +390,7 @@ _f void analyzeBranchVI(mV, int xReg, bool &infoVar) {

 /*
 // Dead Code... the old version of analyzeBranchVI()
-_f void analyzeBranchVI(mV, int xReg, bool &infoVar) {
+__fi void analyzeBranchVI(mV, int xReg, bool &infoVar) {
 	if (!xReg) return;
 	int i;
 	int iEnd = aMin(5, (mVUcount+1));
@@ -427,7 +427,7 @@ _f void analyzeBranchVI(mV, int xReg, bool &infoVar) {
 */

 // Branch in Branch Delay-Slots
-_f int mVUbranchCheck(mV) {
+__ri int mVUbranchCheck(mV) {
 	if (!mVUcount) return 0;
 	incPC(-2);
 	if (mVUlow.branch) {
@@ -443,14 +443,14 @@ _f int mVUbranchCheck(mV) {
 	return 0;
 }

-_f void mVUanalyzeCondBranch1(mV, int Is) {
+__fi void mVUanalyzeCondBranch1(mV, int Is) {
 	analyzeVIreg1(Is, mVUlow.VI_read[0]);
 	if (!mVUstall && !mVUbranchCheck(mVU)) {
 		analyzeBranchVI(mVU, Is, mVUlow.memReadIs);
 	}
 }

-_f void mVUanalyzeCondBranch2(mV, int Is, int It) {
+__fi void mVUanalyzeCondBranch2(mV, int Is, int It) {
 	analyzeVIreg1(Is, mVUlow.VI_read[0]);
 	analyzeVIreg1(It, mVUlow.VI_read[1]);
 	if (!mVUstall && !mVUbranchCheck(mVU)) {
@@ -459,7 +459,7 @@ _f void mVUanalyzeCondBranch2(mV, int Is, int It) {
 	}
 }

-_f void mVUanalyzeNormBranch(mV, int It, bool isBAL) {
+__fi void mVUanalyzeNormBranch(mV, int It, bool isBAL) {
 	mVUbranchCheck(mVU);
 	if (isBAL) {
 		analyzeVIreg2(It, mVUlow.VI_write, 1);
@@ -467,7 +467,7 @@ _f void mVUanalyzeNormBranch(mV, int It, bool isBAL) {
 	}
 }

-_f void mVUanalyzeJump(mV, int Is, int It, bool isJALR) {
+__ri void mVUanalyzeJump(mV, int Is, int It, bool isJALR) {
 	mVUbranchCheck(mVU);
 	mVUlow.branch = (isJALR) ? 10 : 9;
 	if (mVUconstReg[Is].isValid && CHECK_VU_CONSTPROP) {
@ -31,19 +31,19 @@
|
|||
// Messages Called at Execution Time...
|
||||
//------------------------------------------------------------------
|
||||
|
||||
void __fastcall mVUbadOp0(mV) { Console.Error("microVU0 Warning: Exiting... Block started with illegal opcode. [%04x] [%x]", xPC, mVU->prog.cur); }
|
||||
void __fastcall mVUbadOp1(mV) { Console.Error("microVU1 Warning: Exiting... Block started with illegal opcode. [%04x] [%x]", xPC, mVU->prog.cur); }
|
||||
void __fastcall mVUwarning0(mV) { Console.Error("microVU0 Warning: Exiting from Possible Infinite Loop [%04x] [%x]", xPC, mVU->prog.cur); }
|
||||
void __fastcall mVUwarning1(mV) { Console.Error("microVU1 Warning: Exiting from Possible Infinite Loop [%04x] [%x]", xPC, mVU->prog.cur); }
|
||||
void __fastcall mVUprintPC1(u32 PC) { Console.WriteLn("Block Start PC = 0x%04x", PC); }
|
||||
void __fastcall mVUprintPC2(u32 PC) { Console.WriteLn("Block End PC = 0x%04x", PC); }
|
||||
static void __fastcall mVUbadOp0(mV) { Console.Error("microVU0 Warning: Exiting... Block started with illegal opcode. [%04x] [%x]", xPC, mVU->prog.cur); }
|
||||
static void __fastcall mVUbadOp1(mV) { Console.Error("microVU1 Warning: Exiting... Block started with illegal opcode. [%04x] [%x]", xPC, mVU->prog.cur); }
|
||||
static void __fastcall mVUwarning0(mV) { Console.Error("microVU0 Warning: Exiting from Possible Infinite Loop [%04x] [%x]", xPC, mVU->prog.cur); }
|
||||
static void __fastcall mVUwarning1(mV) { Console.Error("microVU1 Warning: Exiting from Possible Infinite Loop [%04x] [%x]", xPC, mVU->prog.cur); }
|
||||
static void __fastcall mVUprintPC1(u32 PC) { Console.WriteLn("Block Start PC = 0x%04x", PC); }
|
||||
static void __fastcall mVUprintPC2(u32 PC) { Console.WriteLn("Block End PC = 0x%04x", PC); }
|
||||
|
||||
//------------------------------------------------------------------
|
||||
// Helper Functions
|
||||
//------------------------------------------------------------------
|
||||
|
||||
// Used by mVUsetupRange
|
||||
_f void mVUcheckIsSame(mV) {
|
||||
static __fi void mVUcheckIsSame(mV) {
|
||||
if (mVU->prog.isSame == -1) {
|
||||
mVU->prog.isSame = !memcmp_mmx((u8*)mVUcurProg.data, mVU->regs->Micro, mVU->microMemSize);
|
||||
}
|
||||
|
@ -55,7 +55,7 @@ _f void mVUcheckIsSame(mV) {
|
|||
}
|
||||
|
||||
// Sets up microProgram PC ranges based on whats been recompiled
|
||||
void mVUsetupRange(microVU* mVU, s32 pc, bool isStartPC) {
|
||||
static void mVUsetupRange(microVU* mVU, s32 pc, bool isStartPC) {
|
||||
deque<microRange>*& ranges = mVUcurProg.ranges;
|
||||
pc &= mVU->microMemSize - 8;
|
||||
|
||||
|
@ -106,7 +106,7 @@ void mVUsetupRange(microVU* mVU, s32 pc, bool isStartPC) {
|
|||
}
|
||||
}
|
||||
|
||||
_f void startLoop(mV) {
|
||||
static __fi void startLoop(mV) {
|
||||
if (curI & _Mbit_) { Console.WriteLn(Color_Green, "microVU%d: M-bit set!", getIndex); }
|
||||
if (curI & _Dbit_) { DevCon.WriteLn (Color_Green, "microVU%d: D-bit set!", getIndex); }
|
||||
if (curI & _Tbit_) { DevCon.WriteLn (Color_Green, "microVU%d: T-bit set!", getIndex); }
|
||||
|
@ -114,7 +114,7 @@ _f void startLoop(mV) {
|
|||
memzero(mVUregsTemp);
|
||||
}
|
||||
|
||||
void doIbit(mV) {
|
||||
static void doIbit(mV) {
|
||||
if (mVUup.iBit) {
|
||||
incPC(-1);
|
||||
u32 tempI;
|
||||
|
@ -131,7 +131,7 @@ void doIbit(mV) {
|
|||
}
|
||||
}
|
||||
|
||||
void doSwapOp(mV) {
|
||||
static void doSwapOp(mV) {
|
||||
if (mVUinfo.backupVF && !mVUlow.noWriteVF) {
|
||||
DevCon.WriteLn(Color_Green, "microVU%d: Backing Up VF Reg [%04x]", getIndex, xPC);
|
||||
|
||||
|
@ -161,7 +161,7 @@ void doSwapOp(mV) {
|
|||
}
|
||||
|
||||
// If 1st op in block is a bad opcode, then don't compile rest of block (Dawn of Mana Level 2)
|
||||
_f void mVUcheckBadOp(mV) {
|
||||
static __fi void mVUcheckBadOp(mV) {
|
||||
if (mVUinfo.isBadOp && mVUcount == 0) {
|
||||
mVUinfo.isEOB = true;
|
||||
Console.Warning("microVU Warning: First Instruction of block contains illegal opcode...");
|
||||
|
@ -169,7 +169,7 @@ _f void mVUcheckBadOp(mV) {
|
|||
}
|
||||
|
||||
// Prints msg when exiting block early if 1st op was a bad opcode (Dawn of Mana Level 2)
|
||||
_f void handleBadOp(mV, int count) {
|
||||
static __fi void handleBadOp(mV, int count) {
|
||||
if (mVUinfo.isBadOp && count == 0) {
|
||||
xMOV(gprT2, (uptr)mVU);
|
||||
if (!isVU1) xCALL(mVUbadOp0);
|
||||
|
@ -177,7 +177,7 @@ _f void handleBadOp(mV, int count) {
|
|||
}
|
||||
}
|
||||
|
||||
_f void branchWarning(mV) {
|
||||
static __ri void branchWarning(mV) {
|
||||
incPC(-2);
|
||||
if (mVUup.eBit && mVUbranch) {
|
||||
incPC(2);
|
||||
@@ -193,14 +193,14 @@ _f void branchWarning(mV) {
    }
}

-_f void eBitPass1(mV, int& branch) {
+static __fi void eBitPass1(mV, int& branch) {
    if (mVUregs.blockType != 1) {
        branch = 1;
        mVUup.eBit = 1;
    }
}

-_f void eBitWarning(mV) {
+static __ri void eBitWarning(mV) {
    if (mVUpBlock->pState.blockType == 1) Console.Error("microVU%d Warning: Branch, E-bit, Branch! [%04x]", mVU->index, xPC);
    if (mVUpBlock->pState.blockType == 2) Console.Error("microVU%d Warning: Branch, Branch, Branch! [%04x]", mVU->index, xPC);
    incPC(2);
@@ -212,7 +212,7 @@ _f void eBitWarning(mV) {
}

// Optimizes the End Pipeline State Removing Unnecessary Info
-_f void mVUoptimizePipeState(mV) {
+static __fi void mVUoptimizePipeState(mV) {
    for (int i = 0; i < 32; i++) {
        optimizeReg(mVUregs.VF[i].x);
        optimizeReg(mVUregs.VF[i].y);
@@ -227,7 +227,7 @@ _f void mVUoptimizePipeState(mV) {
    mVUregs.r = 0; // There are no stalls on the R-reg, so it's safe to discard the info
}

-_f void mVUincCycles(mV, int x) {
+static __fi void mVUincCycles(mV, int x) {
    mVUcycles += x;
    for (int z = 31; z > 0; z--) {
        calcCycles(mVUregs.VF[z].x, x);
@@ -300,12 +300,12 @@ void mVUsetCycles(mV) {
}

// vu0 is allowed to exit early, so are dev builds (for inf loops)
-_f bool doEarlyExit(microVU* mVU) {
+static __fi bool doEarlyExit(microVU* mVU) {
    return IsDevBuild || !isVU1;
}

// Saves Pipeline State for resuming from early exits
-_f void mVUsavePipelineState(microVU* mVU) {
+static __fi void mVUsavePipelineState(microVU* mVU) {
    u32* lpS = (u32*)&mVU->prog.lpState.vi15;
    for (int i = 0; i < (sizeof(microRegInfo)-4)/4; i++, lpS++) {
        xMOV(ptr32[lpS], lpS[0]);
@@ -313,7 +313,7 @@ _f void mVUsavePipelineState(microVU* mVU) {
}

// Prints Start/End PC of blocks executed, for debugging...
-void mVUdebugPrintBlocks(microVU* mVU, bool isEndPC) {
+static void mVUdebugPrintBlocks(microVU* mVU, bool isEndPC) {
    if (mVUdebugNow) {
        xMOV(gprT2, xPC);
        if (isEndPC) xCALL(mVUprintPC2);
@@ -322,7 +322,7 @@ void mVUdebugPrintBlocks(microVU* mVU, bool isEndPC) {
}

// Test cycles to see if we need to exit-early...
-void mVUtestCycles(microVU* mVU) {
+static void mVUtestCycles(microVU* mVU) {
    iPC = mVUstartPC;
    if (doEarlyExit(mVU)) {
        xCMP(ptr32[&mVU->cycles], 0);
@@ -348,7 +348,7 @@ void mVUtestCycles(microVU* mVU) {
}

// Initialize VI Constants (vi15 propagates through blocks)
-_f void mVUinitConstValues(microVU* mVU) {
+static __fi void mVUinitConstValues(microVU* mVU) {
    for (int i = 0; i < 16; i++) {
        mVUconstReg[i].isValid = 0;
        mVUconstReg[i].regValue = 0;
@@ -358,7 +358,7 @@ _f void mVUinitConstValues(microVU* mVU) {
}

// Initialize Variables
-_f void mVUinitFirstPass(microVU* mVU, uptr pState, u8* thisPtr) {
+static __fi void mVUinitFirstPass(microVU* mVU, uptr pState, u8* thisPtr) {
    mVUstartPC = iPC; // Block Start PC
    mVUbranch = 0;    // Branch Type
    mVUcount = 0;     // Number of instructions run
@@ -466,14 +466,14 @@ void* mVUcompile(microVU* mVU, u32 startPC, uptr pState) {
}

// Returns the entry point of the block (compiles it if not found)
-_f void* mVUentryGet(microVU* mVU, microBlockManager* block, u32 startPC, uptr pState) {
+__fi void* mVUentryGet(microVU* mVU, microBlockManager* block, u32 startPC, uptr pState) {
    microBlock* pBlock = block->search((microRegInfo*)pState);
    if (pBlock) return pBlock->x86ptrStart;
    else return mVUcompile(mVU, startPC, pState);
}

// Search for Existing Compiled Block (if found, return x86ptr; else, compile and return x86ptr)
-_f void* mVUblockFetch(microVU* mVU, u32 startPC, uptr pState) {
+__fi void* mVUblockFetch(microVU* mVU, u32 startPC, uptr pState) {

    if (startPC > mVU->microMemSize-8) { DevCon.Error("microVU%d: invalid startPC [%04x]", mVU->index, startPC); }
    startPC &= mVU->microMemSize-8;
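mVUentryGet() above is a classic lookup-or-compile dispatcher: search the block manager for a block already compiled under the same pipeline state, otherwise recompile. A self-contained sketch of that pattern in plain C++ (BlockCache, the u64 state key, and the compile callback are hypothetical illustrations, not the PCSX2 API):

    #include <unordered_map>
    #include <cstdint>

    // Hypothetical stand-in for microBlockManager: maps a pipeline-state key
    // to the x86 entry point of a previously compiled block.
    struct BlockCache {
        std::unordered_map<std::uint64_t, void*> blocks;

        void* entryGet(std::uint64_t stateKey, void* (*compile)(std::uint64_t)) {
            std::unordered_map<std::uint64_t, void*>::iterator it = blocks.find(stateKey);
            if (it != blocks.end()) return it->second; // cache hit: reuse the block
            void* x86ptr = compile(stateKey);          // cache miss: recompile
            blocks[stateKey] = x86ptr;
            return x86ptr;
        }
    };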

@@ -19,7 +19,7 @@
#pragma once

// Sets FDIV Flags at the proper time
-_f void mVUdivSet(mV) {
+__fi void mVUdivSet(mV) {
    if (mVUinfo.doDivFlag) {
        if (!sFLAG.doFlag) { xMOV(getFlagReg(sFLAG.write), getFlagReg(sFLAG.lastWrite)); }
        xAND(getFlagReg(sFLAG.write), 0xfff3ffff);
@@ -29,7 +29,7 @@ _f void mVUdivSet(mV) {

// Optimizes out unneeded status flag updates
// This can safely be done when there is an FSSET opcode
-_f void mVUstatusFlagOp(mV) {
+__fi void mVUstatusFlagOp(mV) {
    int curPC = iPC;
    int i = mVUcount;
    bool runLoop = 1;
@@ -77,7 +77,7 @@ int sortFlag(int* fFlag, int* bFlag, int cycles) {
#define sHackCond (mVUsFlagHack && !sFLAG.doNonSticky)

// Note: Flag handling is 'very' complex; it requires full knowledge of how microVU recs work, so don't touch!
-_f void mVUsetFlags(mV, microFlagCycles& mFC) {
+__fi void mVUsetFlags(mV, microFlagCycles& mFC) {

    int endPC = iPC;
    u32 aCount = 1; // Amount of instructions needed to get valid mac flag instances for block linking
@@ -164,7 +164,7 @@ _f void mVUsetFlags(mV, microFlagCycles& mFC) {
#define shuffleClip ((bClip[3]<<6)|(bClip[2]<<4)|(bClip[1]<<2)|bClip[0])
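shuffleClip packs four 2-bit source-lane selectors into one SHUFPS/PSHUFD-style immediate: bits 1:0 choose what lands in lane 0, bits 3:2 lane 1, and so on. A small sketch, plus the identity case (which matches the 0xe4 default of the shuffleSS macro later in this diff):

    // Builds an SSE shuffle immediate from four 2-bit lane selectors,
    // mirroring the shuffleClip macro above.
    static unsigned shuffleImm(unsigned l0, unsigned l1, unsigned l2, unsigned l3) {
        return (l3 << 6) | (l2 << 4) | (l1 << 2) | l0;
    }
    // shuffleImm(0, 1, 2, 3) == 0xe4: every lane keeps its own value (identity).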

// Recompiles Code for Proper Flags on Block Linkings
-_f void mVUsetupFlags(mV, microFlagCycles& mFC) {
+__fi void mVUsetupFlags(mV, microFlagCycles& mFC) {

    if (__Status) {
        int bStatus[4];
@@ -283,7 +283,7 @@ void mVUflagPass(mV, u32 startPC, u32 sCount = 0, u32 found = 0) {
#define branchType3 else // Conditional Branch

// Checks if the first ~4 instructions of a block will read flags
-_f void mVUsetFlagInfo(mV) {
+__fi void mVUsetFlagInfo(mV) {
    branchType1 { incPC(-1); mVUflagPass(mVU, branchAddr); incPC(1); }
    branchType2 { // This case can possibly be turned off via a hack for a small speedup...
        if (!mVUlow.constJump.isValid || !CHECK_VU_CONSTPROP) { mVUregs.needExactMatch |= 0x7; }

@@ -24,7 +24,7 @@
//------------------------------------------------------------------

// Test if Vector is +/- Zero
-_f static void testZero(const xmm& xmmReg, const xmm& xmmTemp, const x32& gprTemp)
+static __fi void testZero(const xmm& xmmReg, const xmm& xmmTemp, const x32& gprTemp)
{
    xXOR.PS(xmmTemp, xmmTemp);
    xCMPEQ.SS(xmmTemp, xmmReg);
@@ -36,7 +36,7 @@ _f static void testZero(const xmm& xmmReg, const xmm& xmmTemp, const x32& gprTemp)
}

// Test if Vector is Negative (Sets Flags and Makes Positive)
-_f static void testNeg(mV, const xmm& xmmReg, const x32& gprTemp)
+static __fi void testNeg(mV, const xmm& xmmReg, const x32& gprTemp)
{
    xMOVMSKPS(gprTemp, xmmReg);
    xTEST(gprTemp, 1);
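testZero() zeroes a temporary with XORPS and compares it to the operand with CMPEQSS; since IEEE-754 treats +0.0 and -0.0 as equal, a single compare catches both signs. A sketch of the same check using plain intrinsics instead of the emitter DSL:

    #include <xmmintrin.h>

    // True if the low lane of reg holds +0.0f or -0.0f.
    static bool isZeroSS(__m128 reg) {
        __m128 zero = _mm_xor_ps(reg, reg);    // xXOR.PS: build a zero
        __m128 eq   = _mm_cmpeq_ss(zero, reg); // xCMPEQ.SS: low-lane compare
        return (_mm_movemask_ps(eq) & 1) != 0; // all-ones low lane => bit 0 set
    }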
@@ -156,7 +156,7 @@ mVUop(mVU_RSQRT) {
}

// ToDo: Can Be Optimized Further? (takes approximately (~115 cycles + mem access time) on a c2d)
-_f static void mVU_EATAN_(mV, const xmm& PQ, const xmm& Fs, const xmm& t1, const xmm& t2) {
+static __fi void mVU_EATAN_(mV, const xmm& PQ, const xmm& Fs, const xmm& t1, const xmm& t2) {
    xMOVSS(PQ, Fs);
    xMUL.SS(PQ, ptr32[mVUglob.T1]);
    xMOVAPS(t2, Fs);
@@ -272,7 +272,7 @@ mVUop(mVU_EEXP) {
}

// sumXYZ(): PQ.x = x^2 + y^2 + z^2
-_f void mVU_sumXYZ(mV, const xmm& PQ, const xmm& Fs) {
+static __fi void mVU_sumXYZ(mV, const xmm& PQ, const xmm& Fs) {
    if( x86caps.hasStreamingSIMD4Extensions ) {
        xDP.PS(Fs, Fs, 0x71);
        xMOVSS(PQ, Fs);
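With SSE4.1, mVU_sumXYZ() gets the whole x^2 + y^2 + z^2 reduction from a single DPPS: in the 0x71 immediate the high nibble (0x7) selects lanes x, y, z as multiply inputs and the low nibble (0x1) writes the sum to lane x only. An intrinsics sketch of that path:

    #include <smmintrin.h> // SSE4.1

    // PQ.x = Fs.x^2 + Fs.y^2 + Fs.z^2, mirroring xDP.PS(Fs, Fs, 0x71) above.
    static float sumXYZ_sse4(__m128 Fs) {
        return _mm_cvtss_f32(_mm_dp_ps(Fs, Fs, 0x71));
    }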
@@ -995,7 +995,7 @@ mVUop(mVU_RINIT) {
    pass3 { mVUlog("RINIT R, vf%02d%s", _Fs_, _Fsf_String); }
}

-_f void mVU_RGET_(mV, const x32& Rreg) {
+static __fi void mVU_RGET_(mV, const x32& Rreg) {
    if (!mVUlow.noWriteVF) {
        const xmm& Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
        xMOVDZX(Ft, Rreg);
@@ -1139,7 +1139,7 @@ void __fastcall mVU_XGKICK_(u32 addr) {
    }
}

-_f void mVU_XGKICK_DELAY(mV, bool memVI) {
+static __fi void mVU_XGKICK_DELAY(mV, bool memVI) {
    mVUbackupRegs(mVU);
    if (memVI) xMOV(gprT2, ptr32[&mVU->VIxgkick]);
    else mVUallocVIa(mVU, gprT2, _Is_);

@@ -127,7 +127,7 @@ typedef Fntype_mVUrecInst* Fnptr_mVUrecInst;

// Recursive Inline
#ifndef __LINUX__
-#define __recInline __releaseinline
+#define __recInline __ri
#else
#define __recInline inline
#endif

@@ -209,7 +209,6 @@ typedef u32 (__fastcall *mVUCall)(void*, void*);
#define Rmem &mVU->regs->VI[REG_R].UL
#define aWrap(x, m) ((x > m) ? 0 : x)
#define shuffleSS(x) ((x==1)?(0x27):((x==2)?(0xc6):((x==4)?(0xe1):(0xe4))))
-#define _1mb (0x100000)
#define clampE CHECK_VU_EXTRA_OVERFLOW
#define elif else if
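_1mb disappears here because the commit makes the size constants common to all emulation code. A sketch of the shared definitions, assuming they keep the value of the newVif constant removed at the end of this diff (the commit message names _1mb, _16mb, _1gb and Common.h; the exact form there is assumed):

    typedef long long s64; // PCSX2's signed 64-bit typedef, shown for self-containment

    static const s64 _1mb  = 0x100000;   // value from the removed newVif definition
    static const s64 _16mb = _1mb * 16;
    static const s64 _1gb  = _1mb * 1024;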

@@ -214,7 +214,7 @@ void mVUmergeRegs(const xmm& dest, const xmm& src, int xyzw, bool modXYZW)
//------------------------------------------------------------------

// Transforms the Address in gprReg to a valid VU0/VU1 Address
-_f void mVUaddrFix(mV, const x32& gprReg)
+__fi void mVUaddrFix(mV, const x32& gprReg)
{
    if (isVU1) {
        xAND(gprReg, 0x3ff); // wrap around
@@ -233,14 +233,14 @@ _f void mVUaddrFix(mV, const x32& gprReg)
}

// Backup Volatile Regs (EAX, ECX, EDX, MM0~7, XMM0~7, are all volatile according to 32bit Win/Linux ABI)
-_f void mVUbackupRegs(microVU* mVU)
+__fi void mVUbackupRegs(microVU* mVU)
{
    mVU->regAlloc->flushAll();
    xMOVAPS(ptr128[mVU->xmmPQb], xmmPQ);
}

// Restore Volatile Regs
-_f void mVUrestoreRegs(microVU* mVU)
+__fi void mVUrestoreRegs(microVU* mVU)
{
    xMOVAPS(xmmPQ, ptr128[mVU->xmmPQb]);
}
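The backup/restore pair exists because, as the comment above notes, the 32-bit calling conventions treat the XMM registers as caller-saved: any call into C++ may clobber xmmPQ, which carries live P/Q results, while flushAll() takes care of the allocator-managed registers. A minimal spill/reload sketch in intrinsics (pqBackup stands in for the mVU->xmmPQb buffer; unaligned store/load used so the sketch needs no alignment attribute):

    #include <xmmintrin.h>

    static float pqBackup[4]; // stand-in for the mVU->xmmPQb buffer

    static void backupPQ(__m128 pq) { _mm_storeu_ps(pqBackup, pq); } // spill before the call
    static __m128 restorePQ() { return _mm_loadu_ps(pqBackup); }     // reload afterwards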

@@ -114,7 +114,7 @@ enum clampModes {
};

// Prints Opcode to MicroProgram Logs
-void mVU_printOP(microVU* mVU, int opCase, const char* opName, bool isACC) {
+static void mVU_printOP(microVU* mVU, int opCase, const char* opName, bool isACC) {
    mVUlog(opName);
    opCase1 { if (isACC) { mVUlogACC(); } else { mVUlogFd(); } mVUlogFt(); }
    opCase2 { if (isACC) { mVUlogACC(); } else { mVUlogFd(); } mVUlogBC(); }
@@ -123,7 +123,7 @@ void mVU_printOP(microVU* mVU, int opCase, const char* opName, bool isACC) {
}

// Sets Up Pass1 Info for Normal, BC, I, and Q Cases
-void setupPass1(microVU* mVU, int opCase, bool isACC, bool noFlagUpdate) {
+static void setupPass1(microVU* mVU, int opCase, bool isACC, bool noFlagUpdate) {
    opCase1 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); }
    opCase2 { mVUanalyzeFMAC3(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); }
    opCase3 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); }
@@ -132,7 +132,7 @@ void setupPass1(microVU* mVU, int opCase, bool isACC, bool noFlagUpdate) {
}

// Safer to force 0 as the result for X minus X than to do actual subtraction
-bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) {
+static bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) {
    opCase1 {
        if ((opType == 1) && (_Ft_ == _Fs_)) {
            const xmm& Fs = mVU->regAlloc->allocReg(-1, isACC ? 32 : _Fd_, _X_Y_Z_W);
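The special case above exists presumably because games zero a register with SUB vf, vfX, vfX, and doing the subtraction for real can misbehave on x86: if a lane has overflowed to infinity, IEEE arithmetic gives inf - inf = NaN, whereas the PS2's VUs have no inf/NaN representation and would produce 0. A tiny runnable demonstration of the hazard:

    #include <cstdio>
    #include <limits>

    int main() {
        float inf = std::numeric_limits<float>::infinity();
        // On IEEE hardware this prints nan; forcing the result to 0 instead
        // matches what the VU would compute for x - x.
        std::printf("inf - inf = %f\n", inf - inf);
        return 0;
    }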
@@ -146,7 +146,7 @@ bool doSafeSub(microVU* mVU, int opCase, int opType, bool isACC) {
}

// Sets Up Ft Reg for Normal, BC, I, and Q Cases
-void setupFtReg(microVU* mVU, xmm& Ft, xmm& tempFt, int opCase) {
+static void setupFtReg(microVU* mVU, xmm& Ft, xmm& tempFt, int opCase) {
    opCase1 {
        if (_XYZW_SS2) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W); tempFt = Ft; }
        else if (clampE) { Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf); tempFt = Ft; }
@@ -167,7 +167,7 @@ void setupFtReg(microVU* mVU, xmm& Ft, xmm& tempFt, int opCase) {
}

// Normal FMAC Opcodes
-void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, const char* opName, int clampType) {
+static void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, const char* opName, int clampType) {
    pass1 { setupPass1(mVU, opCase, isACC, ((opType == 3) || (opType == 4))); }
    pass2 {
        if (doSafeSub(mVU, opCase, opType, isACC)) return;
@@ -205,7 +205,7 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, const char* opName, int clampType) {
}

// MADDA/MSUBA Opcodes
-void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* opName, int clampType) {
+static void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* opName, int clampType) {
    pass1 { setupPass1(mVU, opCase, 1, 0); }
    pass2 {
        xmm Fs, Ft, ACC, tempFt;
@@ -246,7 +246,7 @@ void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, const char* opName, int clampType) {
}

// MADD Opcodes
-void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
+static void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
    pass1 { setupPass1(mVU, opCase, 0, 0); }
    pass2 {
        xmm Fs, Ft, ACC, tempFt;
@@ -277,7 +277,7 @@ void mVU_FMACc(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
}

// MSUB Opcodes
-void mVU_FMACd(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
+static void mVU_FMACd(microVU* mVU, int recPass, int opCase, const char* opName, int clampType) {
    pass1 { setupPass1(mVU, opCase, 0, 0); }
    pass2 {
        xmm Fs, Ft, Fd, tempFt;

@@ -21,10 +21,8 @@
#include "x86emitter/x86emitter.h"
using namespace x86Emitter;

-static const s64 _1mb = 0x100000;
#define aMax(x, y) std::max(x,y)
#define aMin(x, y) std::min(x,y)
-#define _f __forceinline

// newVif_HashBucket.h uses this typedef, so it has to be declared first.
typedef u32 (__fastcall *nVifCall)(void*, const void*);