Reverted because of some processor and performance issues. I will continue the SSSE3/SSE4.1 work in a separate branch.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5123 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
nodchip 2010-02-24 23:58:48 +00:00
parent 1b43900f30
commit 609151c6e8
4 changed files with 30 additions and 189 deletions

View File

@ -16,7 +16,6 @@
// http://code.google.com/p/dolphin-emu/ // http://code.google.com/p/dolphin-emu/
#include <cmath> #include <cmath>
#include <nmmintrin.h>
#include "Common.h" #include "Common.h"
//#include "VideoCommon.h" // to get debug logs //#include "VideoCommon.h" // to get debug logs
@ -397,11 +396,7 @@ inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr)
} }
} }
template<bool SSSE3> inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr)
inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr);
template<>
inline void decodebytesC8_To_Raw16<false>(u16* dst, const u8* src, int tlutaddr)
{ {
u16* tlut = (u16*)(texMem + tlutaddr); u16* tlut = (u16*)(texMem + tlutaddr);
for (int x = 0; x < 8; x++) for (int x = 0; x < 8; x++)
@ -411,29 +406,6 @@ inline void decodebytesC8_To_Raw16<false>(u16* dst, const u8* src, int tlutaddr)
} }
} }
static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L);
// SSSE3 specialization: looks up eight palette entries for the eight index
// bytes at `src`, byte-swaps each 16-bit entry, and writes the eight results
// to `dst` with a single non-temporal 16-byte store.
//   dst      - destination for eight u16 values. _mm_stream_si128 requires a
//              16-byte-aligned address; NOTE(review): confirm every caller
//              guarantees this alignment.
//   src      - eight palette indices (one byte each).
//   tlutaddr - byte offset of the palette inside texMem.
template<>
inline void decodebytesC8_To_Raw16<true>(u16* dst, const u8* src, int tlutaddr)
{
	u16* tlut = (u16*)(texMem + tlutaddr);

	// Build the vector of eight 16-bit palette entries in one step so that no
	// uninitialized register is ever read. _mm_set_epi16 takes its arguments
	// from the highest lane (7) down to the lowest lane (0).
	const __m128i a = _mm_set_epi16(
		(short)tlut[src[7]], (short)tlut[src[6]],
		(short)tlut[src[5]], (short)tlut[src[4]],
		(short)tlut[src[3]], (short)tlut[src[2]],
		(short)tlut[src[1]], (short)tlut[src[0]]);

	// Swap the two bytes of every 16-bit lane at once (vector form of
	// Common::swap16()).
	const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16);

	// Non-temporal store: write the result without polluting the caches.
	_mm_stream_si128((__m128i*)dst, b);
}
inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr) inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr)
{ {
@ -940,7 +912,6 @@ PC_TexFormat TexDecoder_DirectDecode_real(u8 *dst, const u8 *src, int width, int
//TODO: to save memory, don't blindly convert everything to argb8888 //TODO: to save memory, don't blindly convert everything to argb8888
//also ARGB order needs to be swapped later, to accommodate modern hardware better //also ARGB order needs to be swapped later, to accommodate modern hardware better
//need to add DXT support too //need to add DXT support too
static const __m128i kMaskSwap32 = _mm_set_epi32(0x0C0D0E0FL, 0x08090A0BL, 0x04050607L, 0x00010203L);
PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
{ {
switch (texformat) switch (texformat)
@ -994,18 +965,10 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
} }
else else
{ {
if (cpu_info.bSSSE3) { for (int y = 0; y < height; y += 4)
for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 8)
for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 4; iy++, src += 8)
for (int iy = 0; iy < 4; iy++, src += 8) decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
decodebytesC8_To_Raw16<true>((u16*)dst + (y + iy) * width + x, src, tlutaddr);
} else {
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC8_To_Raw16<false>((u16*)dst + (y + iy) * width + x, src, tlutaddr);
}
} }
return GetPCFormatFromTLUTFormat(tlutfmt); return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_IA4: case GX_TF_IA4:
@ -1071,76 +1034,13 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
return PC_TEX_FMT_BGRA32; return PC_TEX_FMT_BGRA32;
case GX_TF_RGBA8: // speed critical case GX_TF_RGBA8: // speed critical
{ {
if (cpu_info.bSSE4_1) { for (int y = 0; y < height; y += 4)
for (int y = 0; y < height; y += 4) { for (int x = 0; x < width; x += 4)
__m128i* p = (__m128i*)(src + y * width * 4); {
for (int x = 0; x < width; x += 4) { for (int iy = 0; iy < 4; iy++)
// Load 64-bytes at once. decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
const __m128i a0 = _mm_stream_load_si128(p++); src += 64;
const __m128i a1 = _mm_stream_load_si128(p++); }
const __m128i a2 = _mm_stream_load_si128(p++);
const __m128i a3 = _mm_stream_load_si128(p++);
// Shuffle 16-bit integers by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
// apply Common::swap32() by _mm_shuffle_epi8() and
// store them by _mm_stream_si128().
// See decodebytesARGB8_4() about the idea.
const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
}
}
} else if (cpu_info.bSSSE3) {
// SSSE3 can not use _mm_stream_load_si128().
// Use _mm_load_si128() instead of _mm_stream_load_si128().
for (int y = 0; y < height; y += 4) {
__m128i* p = (__m128i*)(src + y * width * 4);
for (int x = 0; x < width; x += 4) {
const __m128i a0 = _mm_load_si128(p++);
const __m128i a1 = _mm_load_si128(p++);
const __m128i a2 = _mm_load_si128(p++);
const __m128i a3 = _mm_load_si128(p++);
const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
}
}
} else {
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
{
for (int iy = 0; iy < 4; iy++)
decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
src += 64;
}
}
} }
return PC_TEX_FMT_BGRA32; return PC_TEX_FMT_BGRA32;
case GX_TF_CMPR: // speed critical case GX_TF_CMPR: // speed critical

View File

@ -16,10 +16,8 @@
// http://code.google.com/p/dolphin-emu/ // http://code.google.com/p/dolphin-emu/
#include <assert.h> #include <assert.h>
#include <nmmintrin.h>
#include "Common.h" #include "Common.h"
#include "CPUDetect.h"
#include "VideoCommon.h" #include "VideoCommon.h"
#include "VideoConfig.h" #include "VideoConfig.h"
#include "Profiler.h" #include "Profiler.h"
@ -72,7 +70,7 @@ int colIndex;
TVtxAttr* pVtxAttr; TVtxAttr* pVtxAttr;
int colElements[2]; int colElements[2];
float posScale; float posScale;
__declspec(align(16)) float tcScale[8]; float tcScale[8];
using namespace Gen; using namespace Gen;
@ -635,31 +633,9 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
pVtxAttr = &m_VtxAttr; pVtxAttr = &m_VtxAttr;
posScale = 1.0f / float(1 << m_VtxAttr.PosFrac); posScale = 1.0f / float(1 << m_VtxAttr.PosFrac);
if (m_NativeFmt->m_components & VB_HAS_UVALL) { if (m_NativeFmt->m_components & VB_HAS_UVALL)
if (cpu_info.bSSE4_1) { for (int i = 0; i < 8; i++)
__m128i a0; tcScale[i] = 1.0f / float(1 << m_VtxAttr.texCoord[i].Frac);
a0 = _mm_insert_epi32(a0, 1 << m_VtxAttr.texCoord[0].Frac, 0);
a0 = _mm_insert_epi32(a0, 1 << m_VtxAttr.texCoord[1].Frac, 1);
a0 = _mm_insert_epi32(a0, 1 << m_VtxAttr.texCoord[2].Frac, 2);
a0 = _mm_insert_epi32(a0, 1 << m_VtxAttr.texCoord[3].Frac, 3);
const __m128 b0 = _mm_cvtepi32_ps(a0);
const __m128 c0 = _mm_rcp_ps(b0);
_mm_stream_ps(&tcScale[0], c0);
__m128i a1;
a1 = _mm_insert_epi32(a1, 1 << m_VtxAttr.texCoord[4].Frac, 0);
a1 = _mm_insert_epi32(a1, 1 << m_VtxAttr.texCoord[5].Frac, 1);
a1 = _mm_insert_epi32(a1, 1 << m_VtxAttr.texCoord[6].Frac, 2);
a1 = _mm_insert_epi32(a1, 1 << m_VtxAttr.texCoord[7].Frac, 3);
const __m128 b1 = _mm_cvtepi32_ps(a1);
const __m128 c1 = _mm_rcp_ps(b1);
_mm_stream_ps(&tcScale[4], c1);
} else {
for (int i = 0; i < 8; i++) {
tcScale[i] = 1.0f / float(1 << m_VtxAttr.texCoord[i].Frac);
}
}
}
for (int i = 0; i < 2; i++) for (int i = 0; i < 2; i++)
colElements[i] = m_VtxAttr.color[i].Elements; colElements[i] = m_VtxAttr.color[i].Elements;

View File

@ -18,9 +18,7 @@
#ifndef VERTEXLOADER_POSITION_H #ifndef VERTEXLOADER_POSITION_H
#define VERTEXLOADER_POSITION_H #define VERTEXLOADER_POSITION_H
#include <nmmintrin.h>
#include "Common.h" #include "Common.h"
#include "CPUDetect.h"
#include "VideoCommon.h" #include "VideoCommon.h"
#include "VertexLoader.h" #include "VertexLoader.h"
#include "VertexLoader_Position.h" #include "VertexLoader_Position.h"
@ -151,34 +149,18 @@ inline void Pos_ReadIndex_Short(int Index)
VertexManager::s_pCurBufferPointer += 12; VertexManager::s_pCurBufferPointer += 12;
} }
static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
template<bool three> template<bool three>
inline void Pos_ReadIndex_Float(int Index) inline void Pos_ReadIndex_Float(int Index)
{ {
const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION])); const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]));
((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
if (cpu_info.bSSE4_1) { ((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
const __m128i a = _mm_loadu_si128((__m128i*)pData); if (three)
__m128i b = _mm_shuffle_epi8(a, kMaskSwap32); ((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]);
if (!three) { else
b = _mm_insert_epi32(b, 0, 2); ((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f;
} LOG_VTX();
u8* p = VertexManager::s_pCurBufferPointer; VertexManager::s_pCurBufferPointer += 12;
_mm_storeu_si128((__m128i*)p, b);
LOG_VTX();
p += 12;
VertexManager::s_pCurBufferPointer = p;
} else {
((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
if (three)
((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]);
else
((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f;
LOG_VTX();
VertexManager::s_pCurBufferPointer += 12;
}
} }
// ============================================================================== // ==============================================================================

View File

@ -18,9 +18,7 @@
#ifndef VERTEXLOADER_TEXCOORD_H #ifndef VERTEXLOADER_TEXCOORD_H
#define VERTEXLOADER_TEXCOORD_H #define VERTEXLOADER_TEXCOORD_H
#include <nmmintrin.h>
#include "Common.h" #include "Common.h"
#include "CPUDetect.h"
#include "VideoCommon.h" #include "VideoCommon.h"
#include "VertexLoader.h" #include "VertexLoader.h"
#include "VertexLoader_Position.h" #include "VertexLoader_Position.h"
@ -310,30 +308,15 @@ void LOADERDECL TexCoord_ReadIndex16_Float1()
VertexManager::s_pCurBufferPointer += 4; VertexManager::s_pCurBufferPointer += 4;
tcIndex++; tcIndex++;
} }
void LOADERDECL TexCoord_ReadIndex16_Float2()
static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
void LOADERDECL TexCoord_ReadIndex16_Float2()
{ {
u16 Index = DataReadU16(); u16 Index = DataReadU16();
const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
if (cpu_info.bSSSE3) { ((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
const __m128i a = _mm_loadl_epi64((__m128i*)pData); LOG_TEX2();
const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32); VertexManager::s_pCurBufferPointer += 8;
u8* p = VertexManager::s_pCurBufferPointer; tcIndex++;
_mm_storel_epi64((__m128i*)p, b);
LOG_TEX2();
p += 8;
VertexManager::s_pCurBufferPointer = p;
tcIndex++;
} else {
((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
LOG_TEX2();
VertexManager::s_pCurBufferPointer += 8;
tcIndex++;
}
} }
#endif #endif