Improved SSE detection:

* SSE3 detection via cpuid implemented.  [fixes Linux -- the force_sse3 option should no longer be needed!]

 * Instruction tests are now done for SSE3, SSSE3, and SSE4.1 to confirm the cpuid results (I doubt this is necessary, but the old code did it for SSE3, so I figured I'd keep it and log the results any time an inconsistency is detected).

 * SSE4.2 and SSE4a detection added.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1086 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-04-29 10:10:33 +00:00
parent 6e82c77e92
commit 41079415fa
3 changed files with 94 additions and 38 deletions

View File

@ -135,13 +135,15 @@ void SysDetect()
"\t%sDetected SSE2\n" "\t%sDetected SSE2\n"
"\t%sDetected SSE3\n" "\t%sDetected SSE3\n"
"\t%sDetected SSSE3\n" "\t%sDetected SSSE3\n"
"\t%sDetected SSE4.1\n", params "\t%sDetected SSE4.1\n"
"\t%sDetected SSE4.2\n", params
cpucaps.hasMultimediaExtensions ? "" : "Not ", cpucaps.hasMultimediaExtensions ? "" : "Not ",
cpucaps.hasStreamingSIMDExtensions ? "" : "Not ", cpucaps.hasStreamingSIMDExtensions ? "" : "Not ",
cpucaps.hasStreamingSIMD2Extensions ? "" : "Not ", cpucaps.hasStreamingSIMD2Extensions ? "" : "Not ",
cpucaps.hasStreamingSIMD3Extensions ? "" : "Not ", cpucaps.hasStreamingSIMD3Extensions ? "" : "Not ",
cpucaps.hasSupplementalStreamingSIMD3Extensions ? "" : "Not ", cpucaps.hasSupplementalStreamingSIMD3Extensions ? "" : "Not ",
cpucaps.hasStreamingSIMD4Extensions ? "" : "Not " cpucaps.hasStreamingSIMD4Extensions ? "" : "Not ",
cpucaps.hasStreamingSIMD4Extensions2 ? "" : "Not "
); );
if ( cpuinfo.x86ID[0] == 'A' ) //AMD cpu if ( cpuinfo.x86ID[0] == 'A' ) //AMD cpu
@ -150,10 +152,12 @@ void SysDetect()
WriteLn( WriteLn(
"\t%sDetected MMX2\n" "\t%sDetected MMX2\n"
"\t%sDetected 3DNOW\n" "\t%sDetected 3DNOW\n"
"\t%sDetected 3DNOW2\n", params "\t%sDetected 3DNOW2\n"
"\t%sDetected SSE4a\n", params
cpucaps.hasMultimediaExtensionsExt ? "" : "Not ", cpucaps.hasMultimediaExtensionsExt ? "" : "Not ",
cpucaps.has3DNOWInstructionExtensions ? "" : "Not ", cpucaps.has3DNOWInstructionExtensions ? "" : "Not ",
cpucaps.has3DNOWInstructionExtensionsExt ? "" : "Not " cpucaps.has3DNOWInstructionExtensionsExt ? "" : "Not ",
cpucaps.hasStreamingSIMD4ExtensionsA ? "" : "Not "
); );
} }

View File

@ -23,6 +23,8 @@
#include "RedtapeWindows.h" #include "RedtapeWindows.h"
using namespace x86Emitter;
#if defined (_MSC_VER) && _MSC_VER >= 1400 #if defined (_MSC_VER) && _MSC_VER >= 1400
extern "C" extern "C"
@ -148,31 +150,29 @@ u64 GetCPUTick( void )
#endif #endif
} }
//////////////////////////////////////////////////////////////////////////////////////////
// Note: This function doesn't support GCC/Linux. Looking online it seems the only // Note: This function doesn't support GCC/Linux. Looking online it seems the only
// way to simulate the Microsoft SEH model is to use unix signals, and the 'sigaction' // way to simulate the Microsoft SEH model is to use unix signals, and the 'sigaction'
// function specifically. Maybe a project for a linux developer at a later date. :) // function specifically. Maybe a project for a linux developer at a later date. :)
void cpudetectSSE3(void* pfnCallSSE3)
{
cpucaps.hasStreamingSIMD3Extensions = 1;
#ifdef _MSC_VER #ifdef _MSC_VER
static bool _test_instruction( void* pfnCall )
{
__try { __try {
((void (*)())pfnCallSSE3)(); ((void (*)())pfnCall)();
} }
__except(EXCEPTION_EXECUTE_HANDLER) { __except(EXCEPTION_EXECUTE_HANDLER) {
cpucaps.hasStreamingSIMD3Extensions = 0; return false;
}
return true;
} }
#else // linux
#ifdef PCSX2_FORCESSE3 static char* bool_to_char( bool testcond )
cpucaps.hasStreamingSIMD3Extensions = 1; {
#else return testcond ? "true" : "false";
// exception handling doesn't work, so disable for x86 builds of linux
cpucaps.hasStreamingSIMD3Extensions = 0;
#endif
#endif
} }
#endif
#if defined __LINUX__ #if defined __LINUX__
#include <sys/time.h> #include <sys/time.h>
@ -180,6 +180,8 @@ void cpudetectSSE3(void* pfnCallSSE3)
#endif #endif
//////////////////////////////////////////////////////////////////////////////////////////
//
s64 CPUSpeedHz( unsigned int time ) s64 CPUSpeedHz( unsigned int time )
{ {
s64 timeStart, s64 timeStart,
@ -200,6 +202,7 @@ s64 CPUSpeedHz( unsigned int time )
{ {
timeStart = timeGetTime( ); timeStart = timeGetTime( );
} }
for(;;) for(;;)
{ {
timeStop = timeGetTime( ); timeStop = timeGetTime( );
@ -294,6 +297,7 @@ void cpudetectInit()
if ( iCpuId( 0x80000001, regs ) != -1 ) if ( iCpuId( 0x80000001, regs ) != -1 )
{ {
x86_64_12BITBRANDID = regs[1] & 0xfff; x86_64_12BITBRANDID = regs[1] & 0xfff;
cpuinfo.x86EFlags2 = regs[ 2 ];
cpuinfo.x86EFlags = regs[ 3 ]; cpuinfo.x86EFlags = regs[ 3 ];
} }
@ -364,40 +368,85 @@ void cpudetectInit()
cpucaps.hasMultiThreading = ( cpuinfo.x86Flags >> 28 ) & 1; cpucaps.hasMultiThreading = ( cpuinfo.x86Flags >> 28 ) & 1;
cpucaps.hasThermalMonitor = ( cpuinfo.x86Flags >> 29 ) & 1; cpucaps.hasThermalMonitor = ( cpuinfo.x86Flags >> 29 ) & 1;
cpucaps.hasIntel64BitArchitecture = ( cpuinfo.x86Flags >> 30 ) & 1; cpucaps.hasIntel64BitArchitecture = ( cpuinfo.x86Flags >> 30 ) & 1;
//that is only for AMDs //that is only for AMDs
cpucaps.hasMultimediaExtensionsExt = ( cpuinfo.x86EFlags >> 22 ) & 1; //mmx2 cpucaps.hasMultimediaExtensionsExt = ( cpuinfo.x86EFlags >> 22 ) & 1; //mmx2
cpucaps.hasAMD64BitArchitecture = ( cpuinfo.x86EFlags >> 29 ) & 1; //64bit cpu cpucaps.hasAMD64BitArchitecture = ( cpuinfo.x86EFlags >> 29 ) & 1; //64bit cpu
cpucaps.has3DNOWInstructionExtensionsExt = ( cpuinfo.x86EFlags >> 30 ) & 1; //3dnow+ cpucaps.has3DNOWInstructionExtensionsExt = ( cpuinfo.x86EFlags >> 30 ) & 1; //3dnow+
cpucaps.has3DNOWInstructionExtensions = ( cpuinfo.x86EFlags >> 31 ) & 1; //3dnow cpucaps.has3DNOWInstructionExtensions = ( cpuinfo.x86EFlags >> 31 ) & 1; //3dnow
cpucaps.hasStreamingSIMD4ExtensionsA = ( cpuinfo.x86EFlags2 >> 6 ) & 1; //INSERTQ / EXTRQ / MOVNT
cpuinfo.cpuspeed = (u32)(CPUSpeedHz( 1000 ) / 1000000);
// --> SSE 4.1 detection <-- cpuinfo.cpuspeed = (u32)(CPUSpeedHz( 600 ) / 1000000);
// We don't care about the small subset of CPUs using SSE4 (which is also hard to
// detect, in addition to being of limited use due to the abbreviated instruction set).
// So we'll just leave it at SSE 4.1. SSE4 cpu detection is ignored.
cpucaps.hasStreamingSIMD4Extensions = ( cpuinfo.x86Flags2 >> 19 ) & 1; //sse4.1 // --> SSE3 / SSSE3 / SSE4.1 / SSE 4.2 detection <--
// --> SSSE3 detection <--
cpucaps.hasStreamingSIMD3Extensions = ( cpuinfo.x86Flags2 >> 0 ) & 1; //sse3
cpucaps.hasSupplementalStreamingSIMD3Extensions = ( cpuinfo.x86Flags2 >> 9 ) & 1; //ssse3 cpucaps.hasSupplementalStreamingSIMD3Extensions = ( cpuinfo.x86Flags2 >> 9 ) & 1; //ssse3
cpucaps.hasStreamingSIMD4Extensions = ( cpuinfo.x86Flags2 >> 19 ) & 1; //sse4.1
cpucaps.hasStreamingSIMD4Extensions2 = ( cpuinfo.x86Flags2 >> 20 ) & 1; //sse4.2
// --> SSE3 detection <-- // Can the SSE3 / SSE4.1 bits be trusted? Using an instruction test is a very "complete"
// These instructions may not be recognized by some compilers, or may not have // approach to ensuring the bit is accurate, and at least one reported case of a Q9550 not
// intrinsic equivalents available. So we use our own ix86 emitter to generate // having SSE 4.1 set but still supporting it properly is fixed by this --air
// some code and run it that way. :)
#ifdef _MSC_VER
u8* recSSE = (u8*)HostSys::Mmap( NULL, 0x1000 ); u8* recSSE = (u8*)HostSys::Mmap( NULL, 0x1000 );
if( recSSE != NULL ) if( recSSE != NULL )
{ {
x86SetPtr(recSSE); xSetPtr( recSSE );
SSE3_MOVSLDUP_XMM_to_XMM(XMM0, XMM0); xMOVSLDUP( xmm1, xmm0 );
RET(); RET();
cpudetectSSE3(recSSE);
u8* funcSSSE3 = xGetPtr();
xPABS.W( xmm0, xmm1 );
RET();
u8* funcSSE41 = xGetPtr();
xBLEND.VPD( xmm1, xmm0 );
RET();
bool sse3_result = _test_instruction( recSSE ); // sse3
bool ssse3_result = _test_instruction( funcSSSE3 );
bool sse41_result = _test_instruction( funcSSE41 );
HostSys::Munmap( recSSE, 0x1000 ); HostSys::Munmap( recSSE, 0x1000 );
// Test for and log any irregularities here.
// We take the instruction test result over cpuid since (in theory) it should be a
// more reliable gauge of the cpu's actual ability.
if( sse3_result != cpucaps.hasStreamingSIMD3Extensions )
{
Console::Notice( "SSE3 Detection Inconsistency: cpuid=%s, test_result=%s",
params bool_to_char( cpucaps.hasStreamingSIMD3Extensions ), bool_to_char( sse3_result ) );
cpucaps.hasStreamingSIMD3Extensions = sse3_result;
} }
else { Console::Error("Error: Failed to allocate memory for SSE3 State detection."); }
if( ssse3_result != cpucaps.hasSupplementalStreamingSIMD3Extensions )
{
Console::Notice( "SSSE3 Detection Inconsistency: cpuid=%s, test_result=%s",
params bool_to_char( cpucaps.hasSupplementalStreamingSIMD3Extensions ), bool_to_char( ssse3_result ) );
cpucaps.hasSupplementalStreamingSIMD3Extensions = ssse3_result;
}
if( sse41_result != cpucaps.hasStreamingSIMD4Extensions )
{
Console::Notice( "SSE4 Detection Inconsistency: cpuid=%s, test_result=%s",
params bool_to_char( cpucaps.hasStreamingSIMD4Extensions ), bool_to_char( sse41_result ) );
cpucaps.hasStreamingSIMD4Extensions = sse41_result;
}
}
else
Console::Notice(
"Notice: Could not allocate memory for SSE3/4 detection.\n"
"\tRelying on CPUID results. [this is not an error]"
);
#endif
////////////////////////////////////// //////////////////////////////////////
// Core Counting! // Core Counting!

View File

@ -54,12 +54,14 @@ struct CAPABILITIES
u32 hasStreamingSIMD3Extensions; u32 hasStreamingSIMD3Extensions;
u32 hasSupplementalStreamingSIMD3Extensions; u32 hasSupplementalStreamingSIMD3Extensions;
u32 hasStreamingSIMD4Extensions; u32 hasStreamingSIMD4Extensions;
u32 hasStreamingSIMD4Extensions2;
// AMD-specific CPU Features // AMD-specific CPU Features
u32 hasMultimediaExtensionsExt; u32 hasMultimediaExtensionsExt;
u32 hasAMD64BitArchitecture; u32 hasAMD64BitArchitecture;
u32 has3DNOWInstructionExtensionsExt; u32 has3DNOWInstructionExtensionsExt;
u32 has3DNOWInstructionExtensions; u32 has3DNOWInstructionExtensions;
u32 hasStreamingSIMD4ExtensionsA;
}; };
extern CAPABILITIES cpucaps; extern CAPABILITIES cpucaps;
@ -73,6 +75,7 @@ struct CPUINFO
u32 x86Flags; // Feature Flags u32 x86Flags; // Feature Flags
u32 x86Flags2; // More Feature Flags u32 x86Flags2; // More Feature Flags
u32 x86EFlags; // Extended Feature Flags u32 x86EFlags; // Extended Feature Flags
u32 x86EFlags2; // Extended Feature Flags pg2
u32 PhysicalCores; u32 PhysicalCores;
u32 LogicalCores; u32 LogicalCores;