mirror of https://github.com/PCSX2/pcsx2.git
GregMiscellaneous: zzogl-pg:
* Fix previous commit :) git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3921 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
0b7bccaa17
commit
e3a2569b53
|
@ -649,14 +649,9 @@ static const __aligned16 int s_clut16mask[8] = { 0xffff0000, 0xffff0000, 0xffff0
|
||||||
0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
|
0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
|
||||||
};
|
};
|
||||||
|
|
||||||
template<bool CSA_0_15>
|
template<bool CSA_0_15, bool HIGH_16BITS_VM>
|
||||||
__forceinline void WriteCLUT_T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
|
void __fastcall WriteCLUT_T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
|
||||||
{
|
{
|
||||||
// CSA 0-15
|
|
||||||
// Replace lower 16 bits of clut with lower 16 bits of vm
|
|
||||||
// CSA 16-31
|
|
||||||
// Replace higher 16 bits of clut with higher 16 bits of vm
|
|
||||||
|
|
||||||
__m128i vm_0;
|
__m128i vm_0;
|
||||||
__m128i vm_1;
|
__m128i vm_1;
|
||||||
__m128i vm_2;
|
__m128i vm_2;
|
||||||
|
@ -668,19 +663,47 @@ __forceinline void WriteCLUT_T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
|
||||||
|
|
||||||
__m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask);
|
__m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask);
|
||||||
|
|
||||||
// load new data & remove useless part
|
// Note:
|
||||||
if(CSA_0_15) {
|
// !HIGH_16BITS_VM
|
||||||
|
// CSA in 0-15 -> Replace lower 16 bits of clut0 with lower 16 bits of vm
|
||||||
|
// CSA in 16-31 -> Replace higher 16 bits of clut0 with lower 16 bits of vm
|
||||||
|
|
||||||
|
// HIGH_16BITS_VM
|
||||||
|
// CSA in 0-15 -> Replace lower 16 bits of clut0 with higher 16 bits of vm
|
||||||
|
// CSA in 16-31 -> Replace higher 16 bits of clut0 with higher 16 bits of vm
|
||||||
|
|
||||||
|
if(HIGH_16BITS_VM && CSA_0_15) {
|
||||||
|
// move high to low
|
||||||
|
vm_0 = _mm_load_si128((__m128i*)vm); // 9 8 1 0
|
||||||
|
vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
|
||||||
|
vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
|
||||||
|
vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
|
||||||
|
vm_0 = _mm_srli_epi32(vm_0, 16);
|
||||||
|
vm_1 = _mm_srli_epi32(vm_1, 16);
|
||||||
|
vm_2 = _mm_srli_epi32(vm_2, 16);
|
||||||
|
vm_3 = _mm_srli_epi32(vm_3, 16);
|
||||||
|
} else if(HIGH_16BITS_VM && !CSA_0_15) {
|
||||||
|
// Remove lower 16 bits
|
||||||
|
vm_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm)); // 9 8 1 0
|
||||||
|
vm_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
|
||||||
|
vm_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
|
||||||
|
vm_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
|
||||||
|
} else if(!HIGH_16BITS_VM && CSA_0_15) {
|
||||||
// Remove higher 16 bits
|
// Remove higher 16 bits
|
||||||
vm_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm)); // 9 8 1 0
|
vm_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm)); // 9 8 1 0
|
||||||
vm_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
|
vm_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
|
||||||
vm_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
|
vm_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
|
||||||
vm_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
|
vm_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
|
||||||
} else {
|
} else if(!HIGH_16BITS_VM && !CSA_0_15) {
|
||||||
// Remove lower 16 bits
|
// move low to high
|
||||||
vm_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm));
|
vm_0 = _mm_load_si128((__m128i*)vm); // 9 8 1 0
|
||||||
vm_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+1));
|
vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
|
||||||
vm_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+2));
|
vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
|
||||||
vm_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+3));
|
vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
|
||||||
|
vm_0 = _mm_slli_epi32(vm_0, 16);
|
||||||
|
vm_1 = _mm_slli_epi32(vm_1, 16);
|
||||||
|
vm_2 = _mm_slli_epi32(vm_2, 16);
|
||||||
|
vm_3 = _mm_slli_epi32(vm_3, 16);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unsizzle the data
|
// Unsizzle the data
|
||||||
|
@ -719,9 +742,11 @@ __forceinline void WriteCLUT_T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
|
||||||
extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
|
extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
|
||||||
{
|
{
|
||||||
if ((u32)clut & 0x0F) {
|
if ((u32)clut & 0x0F) {
|
||||||
WriteCLUT_T16_I4_CSM1_core_sse2<false>(vm, clut);
|
// CSA 16-31 && low 16bits vm
|
||||||
|
WriteCLUT_T16_I4_CSM1_core_sse2<false, false>(vm, clut);
|
||||||
} else {
|
} else {
|
||||||
WriteCLUT_T16_I4_CSM1_core_sse2<true>(vm, clut);
|
// CSA 0-15 && low 16bits vm
|
||||||
|
WriteCLUT_T16_I4_CSM1_core_sse2<true, false>(vm, clut);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1078,23 +1103,36 @@ __forceinline void WriteCLUT_T16_I8_CSM1_sse2(u32* vm, u32 csa)
|
||||||
{
|
{
|
||||||
// update the right clut column (csa < 16)
|
// update the right clut column (csa < 16)
|
||||||
u32* clut = (u32*)(g_pbyGSClut + 64*(csa & 15));
|
u32* clut = (u32*)(g_pbyGSClut + 64*(csa & 15));
|
||||||
// u32 csa_right = (csa < 16) ? 16 - csa : 0;
|
u32 csa_right = (csa < 16) ? 16 - csa : 0;
|
||||||
u32 csa_right = 16 - csa;
|
|
||||||
|
|
||||||
for(int i = csa_right; i > 0 ; --i) {
|
for(int i = (csa_right/2); i > 0 ; --i) {
|
||||||
WriteCLUT_T16_I4_CSM1_core_sse2<true>(vm, clut);
|
WriteCLUT_T16_I4_CSM1_core_sse2<true,false>(vm, clut);
|
||||||
vm += 16; // go down one column
|
|
||||||
clut += 16;
|
clut += 16;
|
||||||
|
WriteCLUT_T16_I4_CSM1_core_sse2<true,true>(vm, clut);
|
||||||
|
clut += 16;
|
||||||
|
vm += 16; // go down one column
|
||||||
}
|
}
|
||||||
|
|
||||||
// update the left clut column
|
// update the left clut column
|
||||||
clut = (u32*)(g_pbyGSClut);
|
|
||||||
u32 csa_left = (csa >= 16) ? 16 : csa;
|
u32 csa_left = (csa >= 16) ? 16 : csa;
|
||||||
|
|
||||||
for(int i = csa_left; i > 0 ; --i) {
|
// In case csa_right is odd (so csa_left is also odd), we cross the clut column
|
||||||
WriteCLUT_T16_I4_CSM1_core_sse2<false>(vm, clut);
|
if(csa_right & 0x1) {
|
||||||
vm += 16; // go down one column
|
WriteCLUT_T16_I4_CSM1_core_sse2<true,false>(vm, clut);
|
||||||
|
// go back to the base before processing left clut column
|
||||||
|
clut = (u32*)(g_pbyGSClut);
|
||||||
|
WriteCLUT_T16_I4_CSM1_core_sse2<false,true>(vm, clut);
|
||||||
|
} else if(csa_right != 0) {
|
||||||
|
// go back to the base before processing left clut column
|
||||||
|
clut = (u32*)(g_pbyGSClut);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i = (csa_left/2); i > 0 ; --i) {
|
||||||
|
WriteCLUT_T16_I4_CSM1_core_sse2<false,false>(vm, clut);
|
||||||
clut += 16;
|
clut += 16;
|
||||||
|
WriteCLUT_T16_I4_CSM1_core_sse2<false,true>(vm, clut);
|
||||||
|
clut += 16;
|
||||||
|
vm += 16; // go down one column
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue