vif: replace sse cmp code with standard cmp

Standard instructions are faster to execute; besides, the CPU can optimize the cmp/jne pair.

SSE

  e0:	add    ecx,0x10
  e3:	cmp    eax,0x7
  e6:	jg     1b0 <void dVifUnpack<0>(unsigned char const*, bool)+0x1b0>
enter_loop:
  ec:	vpcmpeqd xmm0,xmm1,XMMWORD PTR [ecx]
  f0:	vmovmskps eax,xmm0
  f4:	cmp    eax,0x7
  f7:	jne    e0 <void dVifUnpack<0>(unsigned char const*, bool)+0xe0>

Standard cmp

  d8:	add    eax,0x10
  db:	mov    esi,DWORD PTR [eax+0xc]
  de:	test   esi,esi
  e0:	je     190 <void dVifUnpack<0>(unsigned char const*, bool)+0x190>
enter_loop:
  e6:	cmp    ecx,DWORD PTR [eax+0x4]
  e9:	jne    d8 <void dVifUnpack<0>(unsigned char const*, bool)+0xd8>
  eb:	cmp    DWORD PTR [eax+0x8],ebx
  ee:	jne    d8 <void dVifUnpack<0>(unsigned char const*, bool)+0xd8>

v2: use a reference instead of a pointer for the find parameter
This commit is contained in:
Gregory Hainaut 2016-12-15 19:53:22 +01:00
parent 2320efeb55
commit 7a33cda122
2 changed files with 10 additions and 15 deletions

View File

@ -286,7 +286,7 @@ _vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, u8 num, bool isFill) {
_vifT __fi uptr dVifCompile(nVifBlock& key) { _vifT __fi uptr dVifCompile(nVifBlock& key) {
nVifStruct& v = nVif[idx]; nVifStruct& v = nVif[idx];
nVifBlock* block = v.vifBlocks.find(&key); nVifBlock* block = v.vifBlocks.find(key);
// Cache hit // Cache hit
if (likely(block != nullptr)) if (likely(block != nullptr))

View File

@ -64,23 +64,18 @@ public:
~HashBucket() throw() { clear(); } ~HashBucket() throw() { clear(); }
__fi nVifBlock* find(nVifBlock* dataPtr) { __fi nVifBlock* find(const nVifBlock& dataPtr) {
const __m128i* chainpos = (__m128i*)m_bucket[dataPtr->hash_key]; nVifBlock* chainpos = m_bucket[dataPtr.hash_key];
const __m128i data128( _mm_load_si128((__m128i*)dataPtr) ); while (true) {
if (chainpos->key0 == dataPtr.key0 && chainpos->key1 == dataPtr.key1)
int result; return chainpos;
do {
// This inline SSE code is generally faster than using emitter code, since it inlines nicely. --air
result = _mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( data128, _mm_load_si128(chainpos) ) ) );
// startPtr doesn't match (aka not nullptr) hence 4th bit must be 0
if (result == 0x7) return (nVifBlock*)chainpos;
chainpos += sizeof(nVifBlock) / sizeof(__m128i);
} while(result < 0x8);
if (chainpos->startPtr == 0)
return nullptr; return nullptr;
chainpos++;
}
} }
void add(const nVifBlock& dataPtr) { void add(const nVifBlock& dataPtr) {