Use aligned loads/stores for DataReadU32xN. Revert James' temporary fix.
This should provide some speedup. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6812 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
d78be40be9
commit
3f5d1afc6a
|
@ -82,6 +82,7 @@ private:
|
||||||
#define GC_ALIGNED16(x) __declspec(align(16)) x
|
#define GC_ALIGNED16(x) __declspec(align(16)) x
|
||||||
#define GC_ALIGNED32(x) __declspec(align(32)) x
|
#define GC_ALIGNED32(x) __declspec(align(32)) x
|
||||||
#define GC_ALIGNED64(x) __declspec(align(64)) x
|
#define GC_ALIGNED64(x) __declspec(align(64)) x
|
||||||
|
#define GC_ALIGNED128(x) __declspec(align(128)) x
|
||||||
#define GC_ALIGNED16_DECL(x) __declspec(align(16)) x
|
#define GC_ALIGNED16_DECL(x) __declspec(align(16)) x
|
||||||
#define GC_ALIGNED64_DECL(x) __declspec(align(64)) x
|
#define GC_ALIGNED64_DECL(x) __declspec(align(64)) x
|
||||||
|
|
||||||
|
@ -129,6 +130,7 @@ private:
|
||||||
#define GC_ALIGNED16(x) __attribute__((aligned(16))) x
|
#define GC_ALIGNED16(x) __attribute__((aligned(16))) x
|
||||||
#define GC_ALIGNED32(x) __attribute__((aligned(32))) x
|
#define GC_ALIGNED32(x) __attribute__((aligned(32))) x
|
||||||
#define GC_ALIGNED64(x) __attribute__((aligned(64))) x
|
#define GC_ALIGNED64(x) __attribute__((aligned(64))) x
|
||||||
|
#define GC_ALIGNED128(x) __attribute__((aligned(128))) x
|
||||||
#define GC_ALIGNED16_DECL(x) __attribute__((aligned(16))) x
|
#define GC_ALIGNED16_DECL(x) __attribute__((aligned(16))) x
|
||||||
#define GC_ALIGNED64_DECL(x) __attribute__((aligned(64))) x
|
#define GC_ALIGNED64_DECL(x) __attribute__((aligned(64))) x
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -305,7 +305,7 @@ u8 AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
|
||||||
u32 Cmd2 = DataReadU32();
|
u32 Cmd2 = DataReadU32();
|
||||||
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
|
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
|
||||||
u32 xf_address = Cmd2 & 0xFFFF;
|
u32 xf_address = Cmd2 & 0xFFFF;
|
||||||
u32 data_buffer[16];
|
GC_ALIGNED128(u32 data_buffer[16]);
|
||||||
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
||||||
LoadXFReg(transfer_size, xf_address, data_buffer);
|
LoadXFReg(transfer_size, xf_address, data_buffer);
|
||||||
INCSTAT(stats.thisFrame.numXFLoads);
|
INCSTAT(stats.thisFrame.numXFLoads);
|
||||||
|
@ -455,10 +455,10 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
|
||||||
ReferencedDataRegion* NewRegion = new ReferencedDataRegion;
|
ReferencedDataRegion* NewRegion = new ReferencedDataRegion;
|
||||||
NewRegion->MustClean = true;
|
NewRegion->MustClean = true;
|
||||||
NewRegion->size = transfer_size * 4;
|
NewRegion->size = transfer_size * 4;
|
||||||
NewRegion->start_address = (u8*) new u8[NewRegion->size];
|
NewRegion->start_address = (u8*) new u8[NewRegion->size+0xf]; // alignment
|
||||||
NewRegion->hash = 0;
|
NewRegion->hash = 0;
|
||||||
dl->InsertRegion(NewRegion);
|
dl->InsertRegion(NewRegion);
|
||||||
u32 *data_buffer = (u32*)NewRegion->start_address;
|
u32 *data_buffer = (u32*)(u8*)(((size_t)NewRegion->start_address+0xf)&~0xf);
|
||||||
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
||||||
LoadXFReg(transfer_size, xf_address, data_buffer);
|
LoadXFReg(transfer_size, xf_address, data_buffer);
|
||||||
INCSTAT(stats.thisFrame.numXFLoads);
|
INCSTAT(stats.thisFrame.numXFLoads);
|
||||||
|
|
|
@ -83,35 +83,29 @@ const __m128i mask4 = _mm_set_epi8(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
|
||||||
template<unsigned int N>
|
template<unsigned int N>
|
||||||
void DataReadU32xN_SSSE3(u32 *bufx16)
|
void DataReadU32xN_SSSE3(u32 *bufx16)
|
||||||
{
|
{
|
||||||
__m128i* store = (__m128i *)bufx16;
|
memcpy(bufx16, g_pVideoData, sizeof(u32) * N);
|
||||||
__m128i* load = (__m128i *)g_pVideoData;
|
__m128i* buf = (__m128i *)bufx16;
|
||||||
switch(N)
|
switch(N)
|
||||||
{
|
{
|
||||||
case 13: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 13: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 9: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 9: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 5: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 5: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 1: // 1 U32 left:
|
case 1: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask1));
|
||||||
((u32 *)store)[0] = Common::swap32(((u32 *)load)[0]);
|
|
||||||
break;
|
break;
|
||||||
case 14: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 14: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 10: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 10: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 6: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 6: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 2: // 2 U32s left:
|
case 2: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask2));
|
||||||
((u32 *)store)[0] = Common::swap32(((u32 *)load)[0]);
|
|
||||||
((u32 *)store)[1] = Common::swap32(((u32 *)load)[1]);
|
|
||||||
break;
|
break;
|
||||||
case 15: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 15: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 11: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 11: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 7: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 7: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 3: // 3 U32s left:
|
case 3: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask3));
|
||||||
((u32 *)store)[0] = Common::swap32(((u32 *)load)[0]);
|
|
||||||
((u32 *)store)[1] = Common::swap32(((u32 *)load)[1]);
|
|
||||||
((u32 *)store)[2] = Common::swap32(((u32 *)load)[2]);
|
|
||||||
break;
|
break;
|
||||||
case 16: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 16: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 12: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 12: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 8: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
case 8: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++;
|
||||||
case 4: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask4));
|
case 4: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
g_pVideoData += (sizeof(u32) * N);
|
g_pVideoData += (sizeof(u32) * N);
|
||||||
|
|
|
@ -364,8 +364,7 @@ static void DecodeSemiNop()
|
||||||
u32 Cmd2 = DataReadU32();
|
u32 Cmd2 = DataReadU32();
|
||||||
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
|
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
|
||||||
u32 address = Cmd2 & 0xFFFF;
|
u32 address = Cmd2 & 0xFFFF;
|
||||||
// TODO - speed this up. pshufb?
|
GC_ALIGNED128(u32 data_buffer[16]);
|
||||||
u32 data_buffer[16];
|
|
||||||
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
||||||
LoadXFReg(transfer_size, address, data_buffer);
|
LoadXFReg(transfer_size, address, data_buffer);
|
||||||
INCSTAT(stats.thisFrame.numXFLoads);
|
INCSTAT(stats.thisFrame.numXFLoads);
|
||||||
|
|
Loading…
Reference in New Issue