Used SSSE3 instructions to swap16 and memcpy the DSP DMA transfers for a speed up
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6822 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
22d66f51a8
commit
559fb7434e
|
@ -33,6 +33,11 @@
|
||||||
#include "DSPAccelerator.h"
|
#include "DSPAccelerator.h"
|
||||||
#include "DSPInterpreter.h"
|
#include "DSPInterpreter.h"
|
||||||
#include "DSPHWInterface.h"
|
#include "DSPHWInterface.h"
|
||||||
|
#include "CPUDetect.h"
|
||||||
|
|
||||||
|
#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
|
||||||
|
#include <tmmintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
void gdsp_do_dma();
|
void gdsp_do_dma();
|
||||||
|
|
||||||
|
@ -264,17 +269,29 @@ void gdsp_idma_out(u16 dsp_addr, u32 addr, u32 size)
|
||||||
ERROR_LOG(DSPLLE, "*** idma_out IRAM_DSP (0x%04x) -> RAM (0x%08x) : size (0x%08x)", dsp_addr / 2, addr, size);
|
ERROR_LOG(DSPLLE, "*** idma_out IRAM_DSP (0x%04x) -> RAM (0x%08x) : size (0x%08x)", dsp_addr / 2, addr, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
// Byte-shuffle control for _mm_shuffle_epi8: output byte 0 takes input byte 1,
// output byte 1 takes input byte 0, and so on, i.e. the two bytes of each of
// the eight 16-bit lanes are exchanged (eight swap16 operations at once).
// Guarded by the same condition as the <tmmintrin.h> include so that builds
// without SSSE3 intrinsics never see this definition.
static const __m128i s_mask = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L);
#endif
|
||||||
|
|
||||||
// TODO: These should eat clock cycles.
|
// TODO: These should eat clock cycles.
|
||||||
void gdsp_ddma_in(u16 dsp_addr, u32 addr, u32 size)
|
void gdsp_ddma_in(u16 dsp_addr, u32 addr, u32 size)
|
||||||
{
|
{
|
||||||
u8* dst = ((u8*)g_dsp.dram);
|
u8* dst = ((u8*)g_dsp.dram);
|
||||||
|
|
||||||
|
#if _M_SSE >= 0x301
|
||||||
|
if (cpu_info.bSSSE3 && !(size % 16))
|
||||||
|
{
|
||||||
|
for (u32 i = 0; i < size; i += 16)
|
||||||
|
{
|
||||||
|
_mm_store_si128((__m128i *)&dst[dsp_addr + i], _mm_shuffle_epi8(_mm_load_si128((__m128i *)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]), s_mask));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
for (u32 i = 0; i < size; i += 2)
|
for (u32 i = 0; i < size; i += 2)
|
||||||
{
|
{
|
||||||
*(u16*)&dst[dsp_addr + i] = Common::swap16(*(const u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]);
|
*(u16*)&dst[dsp_addr + i] = Common::swap16(*(const u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
INFO_LOG(DSPLLE, "*** ddma_in RAM (0x%08x) -> DRAM_DSP (0x%04x) : size (0x%08x)", addr, dsp_addr / 2, size);
|
INFO_LOG(DSPLLE, "*** ddma_in RAM (0x%08x) -> DRAM_DSP (0x%04x) : size (0x%08x)", addr, dsp_addr / 2, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -283,10 +300,22 @@ void gdsp_ddma_out(u16 dsp_addr, u32 addr, u32 size)
|
||||||
{
|
{
|
||||||
const u8* src = ((const u8*)g_dsp.dram);
|
const u8* src = ((const u8*)g_dsp.dram);
|
||||||
|
|
||||||
|
#if _M_SSE >= 0x301
|
||||||
|
if (cpu_info.bSSSE3 && !(size % 16))
|
||||||
|
{
|
||||||
|
for (u32 i = 0; i < size; i += 16)
|
||||||
|
{
|
||||||
|
_mm_store_si128((__m128i *)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF], _mm_shuffle_epi8(_mm_load_si128((__m128i *)&src[dsp_addr + i]), s_mask));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
for (u32 i = 0; i < size; i += 2)
|
for (u32 i = 0; i < size; i += 2)
|
||||||
{
|
{
|
||||||
*(u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF] = Common::swap16(*(const u16*)&src[dsp_addr + i]);
|
*(u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF] = Common::swap16(*(const u16*)&src[dsp_addr + i]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
INFO_LOG(DSPLLE, "*** ddma_out DRAM_DSP (0x%04x) -> RAM (0x%08x) : size (0x%08x)", dsp_addr / 2, addr, size);
|
INFO_LOG(DSPLLE, "*** ddma_out DRAM_DSP (0x%04x) -> RAM (0x%08x) : size (0x%08x)", dsp_addr / 2, addr, size);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue