GregMiscellaneous: zzogl-pg:
* Redid the WriteCLUT_T16_I4_CSM1_sse2 function (more generic, faster, cleaner)
* Created WriteCLUT_T16_I8_CSM1_sse2 based on WriteCLUT_T16_I4_CSM1_sse2
* Changed some clut buffer offsets; this probably impacts compatibility

git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3920 96395faa-99c1-11dd-bbfe-3dabce05a288
commit 0b7bccaa17
parent d84d8e8a2a
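The rewritten core treats each 32-bit CLUT word as a pair of 16-bit entries: CSA banks 0-15 own the low halfword, banks 16-31 the high halfword. A minimal scalar sketch of the merge the new SSE2 core performs, assuming that layout (helper names are illustrative, not from the commit):

	// Scalar equivalent of the masked merge done per u32 of the CLUT:
	// one halfword of clutWord is kept, the other is replaced from vmWord.
	static inline u32 merge_low16(u32 clutWord, u32 vmWord)   // CSA 0-15
	{
		return (clutWord & 0xFFFF0000u) | (vmWord & 0x0000FFFFu);
	}
	static inline u32 merge_high16(u32 clutWord, u32 vmWord)  // CSA 16-31
	{
		return (clutWord & 0x0000FFFFu) | (vmWord & 0xFFFF0000u);
	}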
@@ -1688,6 +1688,7 @@ int memcmp_clut16(u16* pSavedBuffer, u16* pClutBuffer, int clutsize)
 	return 0;
 }
 
+#if 0
 bool ZeroGS::CMemoryTarget::ValidateClut(const tex0Info& tex0)
 {
 	FUNCLOG
@@ -1720,6 +1721,7 @@ bool ZeroGS::CMemoryTarget::ValidateClut(const tex0Info& tex0)
 
 	return true;
 }
+#endif
 
 bool ZeroGS::CMemoryTarget::ValidateTex(const tex0Info& tex0, int starttex, int endtex, bool bDeleteBadTex)
 {
@@ -626,6 +626,7 @@ extern "C" void __fastcall WriteCLUT_T32_I8_CSM1_sse2(u32* vm, u32* clut)
 	}
 }
 
+
 extern "C" void __fastcall WriteCLUT_T32_I4_CSM1_sse2(u32* vm, u32* clut)
 {
 	__m128i* src = (__m128i*)vm;
@@ -642,13 +643,89 @@ extern "C" void __fastcall WriteCLUT_T32_I4_CSM1_sse2(u32* vm, u32* clut)
 	_mm_store_si128(&dst[3], _mm_unpackhi_epi64(r2, r3));
 }
 
+static const __aligned16 int s_clut_16bits_mask[4] = { 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff };
 static const __aligned16 int s_clut16mask2[4] = { 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff };
 static const __aligned16 int s_clut16mask[8] = { 0xffff0000, 0xffff0000, 0xffff0000, 0xffff0000,
 	0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
 };
 
+template<bool CSA_0_15>
+__forceinline void WriteCLUT_T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
+{
+	// CSA 0-15:
+	// Replace lower 16 bits of clut with lower 16 bits of vm
+	// CSA 16-31:
+	// Replace higher 16 bits of clut with higher 16 bits of vm
+
+	__m128i vm_0;
+	__m128i vm_1;
+	__m128i vm_2;
+	__m128i vm_3;
+	__m128i clut_0;
+	__m128i clut_1;
+	__m128i clut_2;
+	__m128i clut_3;
+
+	__m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask);
+
+	// load new data & remove useless part
+	if (CSA_0_15) {
+		// Remove higher 16 bits
+		vm_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm));   // 9 8 1 0
+		vm_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
+		vm_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
+		vm_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
+	} else {
+		// Remove lower 16 bits
+		vm_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm));
+		vm_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+1));
+		vm_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+2));
+		vm_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+3));
+	}
+
+	// Unswizzle the data
+	__m128i row_0 = _mm_unpacklo_epi32(vm_0, vm_1); // 3 2 1 0
+	__m128i row_1 = _mm_unpacklo_epi32(vm_2, vm_3); // 7 6 5 4
+	__m128i row_2 = _mm_unpackhi_epi32(vm_0, vm_1); // 11 10 9 8
+	__m128i row_3 = _mm_unpackhi_epi32(vm_2, vm_3); // 15 14 13 12
+
+	// load old data & remove useless part
+	if (CSA_0_15) {
+		// Remove lower 16 bits
+		clut_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut));
+		clut_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
+		clut_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
+		clut_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
+	} else {
+		// Remove higher 16 bits
+		clut_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut));
+		clut_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
+		clut_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
+		clut_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
+	}
+
+	// Merge old & new data
+	clut_0 = _mm_or_si128(clut_0, row_0);
+	clut_1 = _mm_or_si128(clut_1, row_1);
+	clut_2 = _mm_or_si128(clut_2, row_2);
+	clut_3 = _mm_or_si128(clut_3, row_3);
+
+	_mm_store_si128((__m128i*)clut, clut_0);
+	_mm_store_si128((__m128i*)clut+1, clut_1);
+	_mm_store_si128((__m128i*)clut+2, clut_2);
+	_mm_store_si128((__m128i*)clut+3, clut_3);
+}
+
+extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
+{
+	if ((u32)clut & 0x0F) {
+		WriteCLUT_T16_I4_CSM1_core_sse2<false>(vm, clut);
+	} else {
+		WriteCLUT_T16_I4_CSM1_core_sse2<true>(vm, clut);
+	}
+}
+
-extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
+extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2_old(u32* vm, u32* clut)
 {
 #define YET_ANOTHER_INTRINSIC
 #ifdef YET_ANOTHER_INTRINSIC
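The and/andnot/or triple in the new core is the standard SSE2 select-by-mask idiom; SSE2 predates the blend instructions of SSE4.1, so the selection is built from three logic ops. A self-contained sketch of the idiom, separate from the code above:

	#include <emmintrin.h>

	// result = (mask & a) | (~mask & b), lane-wise. With s_clut_16bits_mask
	// this keeps the low halfword of a and the high halfword of b in every
	// 32-bit lane, which is exactly the merge the core performs.
	static inline __m128i select_by_mask(__m128i mask, __m128i a, __m128i b)
	{
		return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
	}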
@@ -677,7 +754,7 @@ extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
 
 	// Note: MSVC complains about direct c-cast...
 	// vm2 = (__m128i)_mm_shuffle_ps((__m128)vm2, (__m128)vm3, 0x88);
-	__m128 vm2_f = (_mm_shuffle_ps((__m128&)vm2, (__m128&)vm3, 0x88)); // 14 12 10 8 6 4 2 0
+	__m128 vm2_f = (_mm_shuffle_ps((__m128&)vm2, (__m128&)vm3, 0x88));
 	vm2 = (__m128i&)vm2_f;
 	vm2 = _mm_shuffle_epi32(vm2, 0xD8);
 
@@ -997,6 +1074,30 @@ End:
 #endif
 }
 
+__forceinline void WriteCLUT_T16_I8_CSM1_sse2(u32* vm, u32 csa)
+{
+	// update the right clut column (csa < 16)
+	u32* clut = (u32*)(g_pbyGSClut + 64*(csa & 15));
+	// u32 csa_right = (csa < 16) ? 16 - csa : 0;
+	u32 csa_right = 16 - csa;
+
+	for(int i = csa_right; i > 0; --i) {
+		WriteCLUT_T16_I4_CSM1_core_sse2<true>(vm, clut);
+		vm += 16; // go down one column
+		clut += 16;
+	}
+
+	// update the left clut column
+	clut = (u32*)(g_pbyGSClut);
+	u32 csa_left = (csa >= 16) ? 16 : csa;
+
+	for(int i = csa_left; i > 0; --i) {
+		WriteCLUT_T16_I4_CSM1_core_sse2<false>(vm, clut);
+		vm += 16; // go down one column
+		clut += 16;
+	}
+}
+
 #endif // ZEROGS_SSE2
 
 void __fastcall WriteCLUT_T16_I8_CSM1_c(u32* _vm, u32* _clut)
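With 16-bit palettes stored this way, each CSA row occupies 64 bytes of g_pbyGSClut rather than 32, which is exactly the offset fix applied below in texClutWrite. A hedged helper showing the addressing the new offsets imply (the 4*i stride is my reading of the layout; the base computation mirrors the diff):

	// Address of 16-bit CLUT entry i in bank csa (illustrative, not from the commit).
	static inline u16* clut16_entry(u8* clutBase, u32 csa, u32 i)
	{
		return (u16*)(clutBase + 64 * (csa & 15)   // one 64-byte row per CSA
		              + ((csa >= 16) ? 2 : 0)      // high halfword for banks 16-31
		              + 4 * i);                    // entries advance one u32 slot each
	}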
@@ -854,9 +854,6 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp)
 
 	bool bRet = false;
 
-	// FIXME code generated by intrinsics is the same as the linux asm.
-	// However there is no "cmp %%esi, 0x90" equivalent in the windows asm !!!
-	// So control flow must be check
 #define TEST_THIS
 #ifdef TEST_THIS
 	while(entries != 0) {
@@ -899,14 +896,17 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp)
 	}
 #endif
 
+		// go to the next memory block
+		src += 32;
+
+		// go back to the previous memory block then down one memory column
 		if (entries & 0x10) {
-			src -= 56; // go back and down one column
+			src -= (64-8);
 		}
-
-		src += 32; // go to the right block
 
+		// In case the previous operation (down one column) crosses the block boundary,
+		// go to the next block
 		if (entries == 0x90) {
-			src += 32; // skip whole block
+			src += 32;
 		}
 
 		dst += 8;
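A hedged decomposition of the traversal above, tying the arithmetic to the diff's own comments (treating 64 u32 words as one memory block and 8 u32 words as one column step; illustrative only):

	src += 32;             // advance to the next memory block
	if (entries & 0x10)
		src += -64 + 8;    // i.e. -(64-8): back one whole block, then down one column
	if (entries == 0x90)
		src += 32;         // the column step crossed a block boundary: skip to the next block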
@@ -1150,7 +1150,7 @@ void ZeroGS::texClutWrite(int ctx)
 		case PSMCT16:
 		{
 			u16* src = (u16*)g_pbyGSMemory + tex0.cbp * 128;
-			u16 *dst = (u16*)(g_pbyGSClut + 32 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0));
+			u16 *dst = (u16*)(g_pbyGSClut + 64 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0));
 
 			for (int i = 0; i < entries; ++i)
 			{
@@ -1167,7 +1167,7 @@ void ZeroGS::texClutWrite(int ctx)
 		case PSMCT16S:
 		{
 			u16* src = (u16*)g_pbyGSMemory + tex0.cbp * 128;
-			u16 *dst = (u16*)(g_pbyGSClut + 32 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0));
+			u16 *dst = (u16*)(g_pbyGSClut + 64 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0));
 
 			for (int i = 0; i < entries; ++i)
 			{
@@ -1221,7 +1221,7 @@ void ZeroGS::texClutWrite(int ctx)
 					break;
 
 				default:
-					WriteCLUT_T16_I4_CSM1(src, (u32*)(g_pbyGSClut + 32*(tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0)));
+					WriteCLUT_T16_I4_CSM1(src, (u32*)(g_pbyGSClut + 64*(tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0)));
 					break;
 			}
 		}
@@ -1236,7 +1236,11 @@ void ZeroGS::texClutWrite(int ctx)
 
 				default:
 					// sse2 for 256 is more complicated, so use regular
-					WriteCLUT_T16_I8_CSM1_c(src, (u32*)(g_pbyGSClut + 32*(tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0)));
+#ifdef ZEROGS_SSE2
+					WriteCLUT_T16_I8_CSM1_sse2(src, tex0.csa);
+#else
+					WriteCLUT_T16_I8_CSM1_c(src, (u32*)(g_pbyGSClut + 64*(tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0)));
+#endif
 					break;
 			}
 
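The call-site change above completes the wiring: with ZEROGS_SSE2 defined, texClutWrite now hands the raw csa to the new writer instead of a precomputed destination pointer, presumably because a 256-entry CLUT can span both halfword banks and the writer must split the copy itself. A hedged trace of the split the two loops perform, based on my reading of WriteCLUT_T16_I8_CSM1_sse2:

	// For csa = 4:
	//   loop 1: vm rows 0..11  -> low  halfwords of CLUT rows 4..15 (core_sse2<true>)
	//   loop 2: vm rows 12..15 -> high halfwords of CLUT rows 0..3  (core_sse2<false>)
	// For csa = 0, loop 1 covers all 16 rows and loop 2 executes zero times.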