GregMiscellaneous: zzogl-pg:

* make code more consistent
* Use some sse2 for 16 bits texture


git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3943 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut@gmail.com 2010-10-18 17:39:12 +00:00
parent e71401068e
commit b20c1021e8
5 changed files with 217 additions and 84 deletions

View File

@ -25,6 +25,19 @@
#include <emmintrin.h>
#endif
// Local Clut buffer:
// It supports both 32-bit and 16-bit color formats. The size of the buffer is 1 KByte.
// The 16-bit entries are arranged in 2 columns. Each row holds one 32-bit color.
// 256 0
// 271 1
// ... ..
// 510 254
// 511 255
//
// CSA -> clut buffer offset:
// 16 bits format: CSA < 32 <=> 16 entries, 16 half-rows of the buffer (for example 0 to 15)
// 32 bits format: CSA < 16 <=> 16 entries, 16 full rows of the buffer (for example 256|0 to 271|15)
static const __aligned16 int s_clut_16bits_mask[4] = { 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff };
template <class T>
@ -472,48 +485,115 @@ __forceinline void GSMem_to_ClutBuffer(tex0Info &tex0)
* Clut buffer -> local C array (linear)
* *****************************************************************/
template <class T>
__forceinline void ClutBuffer_to_Array(T* dst, T* clut, u32 clutsize) {}
__forceinline void ClutBuffer_to_Array(T* dst, u32 csa, u32 clutsize) {}
template <>
__forceinline void ClutBuffer_to_Array<u32>(u32* dst, u32* clut, u32 clutsize)
__forceinline void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
{
ZZLog::Error_Log("Fill 32b clut");
memcpy_amd((u8*)dst, (u8*)clut, clutsize);
u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
memcpy_amd((u8*)dst, clut, clutsize);
}
template <>
__forceinline void ClutBuffer_to_Array<u16>(u16* dst, u16* clut, u32 clutsize)
__forceinline void ClutBuffer_to_Array<u16>(u16* dst, u32 csa, u32 clutsize)
{
ZZLog::Error_Log("Fill 16b clut");
int left = ((u32)clut & 2) ? 0 : (((u32)clut & 0x3ff) / 2) + clutsize - 512;
u16* clut = (u16*)GetClutBufferAddress<u32>(csa); // Keep aligned version for sse2
if (left > 0) clutsize -= left;
while (clutsize > 0)
{
dst[0] = clut[0];
dst++;
clut += 2;
clutsize -= 2;
// which side to copy
u32 clutsize_right;
u32 clutsize_left;
if (csa < 16) {
clutsize_right = min(clutsize, (16-csa)*64);
clutsize_left = clutsize - clutsize_right;
} else {
clutsize_right = 0;
clutsize_left = clutsize;
}
if (left > 0)
while (clutsize_right > 0)
{
clut = GetClutBufferAddress<u16>(16);
#ifdef ZEROGS_SSE4
// only lower 16 bits of dword are valid
__m128i clut_0 = _mm_load_si128((__m128i*)clut);
__m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
__m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
__m128i clut_3 = _mm_load_si128((__m128i*)clut+3);
while (left > 0)
{
dst[0] = clut[0];
left -= 2;
clut += 2;
dst++;
}
clut_0 = _mm_shufflelo_epi16(clut_0, 0x32);
clut_1 = _mm_shufflelo_epi16(clut_1, 0x32);
clut_2 = _mm_shufflelo_epi16(clut_2, 0x32);
clut_3 = _mm_shufflelo_epi16(clut_3, 0x32);
clut_0 = _mm_shufflehi_epi16(clut_0, 0xD8); // - - 3 2 1 0 - -
clut_1 = _mm_shufflehi_epi16(clut_1, 0xD8);
clut_2 = _mm_shufflehi_epi16(clut_2, 0xD8);
clut_3 = _mm_shufflehi_epi16(clut_3, 0xD8);
clut_0 = _mm_srli_si128(clut_0, 4);
clut_1 = _mm_srli_si128(clut_1, 4);
clut_2 = _mm_srli_si128(clut_2, 4);
clut_3 = _mm_srli_si128(clut_3, 4);
_mm_store_si128((__m128i*)dst, _mm_unpacklo_epi64(clut_0, clut_1));
_mm_store_si128((__m128i*)dst+1, _mm_unpacklo_epi64(clut_2, clut_3));
#else
for(int i = 0; i < 16; ++i)
dst[i] = clut[2*i];
#endif
dst += 16;
clut += 32;
clutsize_right -= 32;
}
if(csa < 16) {
// go back to the base before processing the left clut column
clut = (u16*)GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
}
while (clutsize_left > 0)
{
#ifdef ZEROGS_SSE2
// only higher 16 bits of dword are valid
__m128i clut_0 = _mm_load_si128((__m128i*)clut);
__m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
__m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
__m128i clut_3 = _mm_load_si128((__m128i*)clut+3);
clut_0 = _mm_shufflelo_epi16(clut_0, 0xD8);
clut_1 = _mm_shufflelo_epi16(clut_1, 0xD8);
clut_2 = _mm_shufflelo_epi16(clut_2, 0xD8);
clut_3 = _mm_shufflelo_epi16(clut_3, 0xD8);
clut_0 = _mm_shufflehi_epi16(clut_0, 0x63); // - - 3 2 1 0 - -
clut_1 = _mm_shufflehi_epi16(clut_1, 0x63);
clut_2 = _mm_shufflehi_epi16(clut_2, 0x63);
clut_3 = _mm_shufflehi_epi16(clut_3, 0x63);
clut_0 = _mm_srli_si128(clut_0, 4);
clut_1 = _mm_srli_si128(clut_1, 4);
clut_2 = _mm_srli_si128(clut_2, 4);
clut_3 = _mm_srli_si128(clut_3, 4);
_mm_store_si128((__m128i*)dst, _mm_unpacklo_epi64(clut_0, clut_1));
_mm_store_si128((__m128i*)dst+1, _mm_unpacklo_epi64(clut_2, clut_3));
#else
// Note +1 because we change higher 16 bits
for(int i = 0; i < 16; ++i)
dst[i] = clut[2*i];
#endif
dst += 16;
clut += 32;
clutsize_left -= 32;
}
}
/* *****************************************************************
* Compare: Clut buffer <-> Local Memory
* *****************************************************************/
// false -> identical
// true -> different
template <class T>
__forceinline bool Cmp_ClutBuffer_GSMem(T* GSmem, u32 csa, u32 clutsize);
@ -563,17 +643,17 @@ __forceinline bool Cmp_ClutBuffer_GSMem<u32>(u32* GSmem, u32 csa, u32 clutsize)
_GSmem += 32;
// go back to the previous memory block then down one memory column
if (clutsize & 0x10) {
if (clutsize & 0x40) {
_GSmem -= (64-8);
}
// In case previous operation (down one column) cross the block boundary
// Go to the next block
if (clutsize == 0x90) {
if (clutsize == 0x240) {
_GSmem += 32;
}
clut += 8;
clutsize -= 16;
clutsize -= 64;
}
return false;
@ -589,59 +669,120 @@ __forceinline bool Cmp_ClutBuffer_GSMem<u16>(u16* GSmem, u32 csa, u32 clutsize)
/* *****************************************************************
* Compare: Clut buffer <-> local C array (linear)
* *****************************************************************/
// false -> identical
// true -> different
template <class T>
__forceinline bool Cmp_ClutBuffer_SavedClut(T* saved_clut, T* clut, u32 clutsize);
__forceinline bool Cmp_ClutBuffer_SavedClut(T* saved_clut, u32 csa, u32 clutsize);
template <>
__forceinline bool Cmp_ClutBuffer_SavedClut<u32>(u32* saved_clut, u32* clut, u32 clutsize)
__forceinline bool Cmp_ClutBuffer_SavedClut<u32>(u32* saved_clut, u32 csa, u32 clutsize)
{
u32* clut = GetClutBufferAddress<u32>(csa);
return memcmp_mmx(saved_clut, clut, clutsize);
}
template <>
__forceinline bool Cmp_ClutBuffer_SavedClut<u16>(u16* saved_clut, u16* clut, u32 clutsize)
__forceinline bool Cmp_ClutBuffer_SavedClut<u16>(u16* saved_clut, u32 csa, u32 clutsize)
{
assert((clutsize&31) == 0);
// left > 0 only when csa < 16
int left = 0;
if (((u32)clut & 2) == 0)
{
left = (((u32)clut & 0x3ff) / 2) + clutsize - 512;
clutsize -= left;
}
#ifdef ZEROGS_SSE2
__m128i zero_128 = _mm_setzero_si128();
#endif
u16* clut = (u16*)GetClutBufferAddress<u32>(csa); // Keep aligned version for sse2
while (clutsize > 0)
{
// which side to cmp
u32 clutsize_right;
u32 clutsize_left;
if (csa < 16) {
clutsize_right = min(clutsize, (16-csa)*64);
clutsize_left = clutsize - clutsize_right;
} else {
clutsize_right = 0;
clutsize_left = clutsize;
}
while (clutsize_right > 0)
{
#ifdef ZEROGS_SSE2
// only lower 16 bits of dword are valid
__m128i clut_0 = _mm_load_si128((__m128i*)clut);
__m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
__m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
__m128i clut_3 = _mm_load_si128((__m128i*)clut+3);
// value must be converted to 32 bits
__m128i saved_clut_0 = _mm_load_si128((__m128i*)saved_clut);
__m128i saved_clut_1 = _mm_load_si128((__m128i*)saved_clut+1);
__m128i result = _mm_cmpeq_epi16(_mm_unpacklo_epi16(saved_clut_0, zero_128), clut_0);
__m128i result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(saved_clut_0, zero_128), clut_1);
result = _mm_and_si128(result, result_tmp);
result_tmp = _mm_cmpeq_epi16(_mm_unpacklo_epi16(saved_clut_1, zero_128), clut_2);
result = _mm_and_si128(result, result_tmp);
result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(saved_clut_1, zero_128), clut_3);
result = _mm_and_si128(result, result_tmp);
u32 result_int = _mm_movemask_epi8(result);
// only lower 16bits must be checked
if ((result_int&0x3333) != 0x3333)
return true;
#else
for (int i = 0; i < 16; ++i)
{
if (saved_clut[i] != clut[2*i]) return 1;
}
if (saved_clut[i] != clut[2*i]) return true;
#endif
clutsize -= 32;
saved_clut += 16;
clut += 32;
clutsize_right -= 32;
}
if (left > 0)
{
clut = (u16*)(g_pbyGSClut + 2);
if(csa < 16) {
// go back to the base before processing the left clut column
clut = (u16*)GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
}
while (left > 0)
{
for (int i = 0; i < 16; ++i)
{
if (saved_clut[i] != clut[2*i]) return 1;
}
while (clutsize_left > 0)
{
#ifdef ZEROGS_SSE2
// only higher 16 bits of dword are valid
__m128i clut_0 = _mm_load_si128((__m128i*)clut);
__m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
__m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
__m128i clut_3 = _mm_load_si128((__m128i*)clut+3);
left -= 32;
// value must be converted to 32 bits (with 0 in lower 16 bits)
__m128i saved_clut_0 = _mm_load_si128((__m128i*)saved_clut);
__m128i saved_clut_1 = _mm_load_si128((__m128i*)saved_clut+1);
saved_clut += 16;
clut += 32;
}
}
__m128i result = _mm_cmpeq_epi16(_mm_unpacklo_epi16(zero_128, saved_clut_0), clut_0);
__m128i result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(zero_128, saved_clut_0), clut_1);
result = _mm_and_si128(result, result_tmp);
return 0;
result_tmp = _mm_cmpeq_epi16(_mm_unpacklo_epi16(zero_128, saved_clut_1), clut_2);
result = _mm_and_si128(result, result_tmp);
result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(zero_128, saved_clut_1), clut_3);
result = _mm_and_si128(result, result_tmp);
u32 result_int = _mm_movemask_epi8(result);
// only higher 16bits must be checked
if ((result_int&0xCCCC) != 0xCCCC)
return true;
#else
// Note +1 because we change higher 16 bits
for (int i = 0; i < 16; ++i)
if (saved_clut[i] != clut[2*i+1]) return true;
#endif
saved_clut += 16;
clut += 32;
clutsize_left -= 32;
}
return false;
}
@ -653,7 +794,6 @@ __forceinline bool Cmp_ClutBuffer_SavedClut<u16>(u16* saved_clut, u16* clut, u32
template <class T>
__forceinline void Build_Clut_Texture(u32 psm, u32 height, T* pclut, u8* psrc, T* pdst)
{
ZZLog::Error_Log("Build clut texture");
switch (psm)
{
case PSMT8:

View File

@ -21,10 +21,10 @@
#define CLUT_H_INCLUDED
extern void GSMem_to_ClutBuffer(tex0Info &tex0);
template <class T> extern void ClutBuffer_to_Array(T* dst, T* clut, u32 clutsize);
template <class T> extern void ClutBuffer_to_Array(T* dst, u32 csa, u32 clutsize);
template <class T> extern void Build_Clut_Texture(u32 psm, u32 height, T* pclut, u8* psrc, T* pdst);
template <class T> extern bool Cmp_ClutBuffer_GSMem(T* GSmem, u32 csa, u32 clutsize);
template <class T> extern bool Cmp_ClutBuffer_SavedClut(T* saved_clut, T* clut, u32 clutsize);
template <class T> extern bool Cmp_ClutBuffer_SavedClut(T* saved_clut, u32 csa, u32 clutsize);
#endif // CLUT_H_INCLUDED

View File

@ -1731,7 +1731,7 @@ inline list<CMemoryTarget>::iterator CMemoryTargetMngr::DestroyTargetIter(list<C
// Not same format -> 1
// Same format, not same data (clut only) -> 2
// identical -> 0
int CMemoryTargetMngr::CompareTarget(list<CMemoryTarget>::iterator& it, const tex0Info& tex0, int clutsize, int nClutOffset)
int CMemoryTargetMngr::CompareTarget(list<CMemoryTarget>::iterator& it, const tex0Info& tex0, int clutsize)
{
if (PSMT_ISCLUT(it->psm) != PSMT_ISCLUT(tex0.psm))
return 1;
@ -1743,10 +1743,10 @@ int CMemoryTargetMngr::CompareTarget(list<CMemoryTarget>::iterator& it, const te
return 1;
if (PSMT_IS32BIT(tex0.cpsm)) {
if (Cmp_ClutBuffer_SavedClut<u32>((u32*)&it->clut[0], (u32*)(g_pbyGSClut + nClutOffset), clutsize))
if (Cmp_ClutBuffer_SavedClut<u32>((u32*)&it->clut[0], tex0.csa, clutsize))
return 2;
} else {
if (Cmp_ClutBuffer_SavedClut<u16>((u16*)&it->clut[0], (u16*)(g_pbyGSClut + nClutOffset), clutsize))
if (Cmp_ClutBuffer_SavedClut<u16>((u16*)&it->clut[0], tex0.csa, clutsize))
return 2;
}
@ -1758,9 +1758,8 @@ int CMemoryTargetMngr::CompareTarget(list<CMemoryTarget>::iterator& it, const te
return 0;
}
void CMemoryTargetMngr::GetClutVariables(int& nClutOffset, int& clutsize, const tex0Info& tex0)
void CMemoryTargetMngr::GetClutVariables(int& clutsize, const tex0Info& tex0)
{
nClutOffset = 0;
clutsize = 0;
if (PSMT_ISCLUT(tex0.psm))
@ -1768,15 +1767,9 @@ void CMemoryTargetMngr::GetClutVariables(int& nClutOffset, int& clutsize, const
int entries = PSMT_IS8CLUT(tex0.psm) ? 256 : 16;
if (PSMT_IS32BIT(tex0.cpsm))
{
nClutOffset = 64 * tex0.csa;
clutsize = min(entries, 256 - tex0.csa * 16) * 4;
}
else
{
nClutOffset = 64 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0);
clutsize = min(entries, 512 - tex0.csa * 16) * 2;
}
}
}
@ -1793,7 +1786,7 @@ void CMemoryTargetMngr::GetMemAddress(int& start, int& end, const tex0Info& tex
}
CMemoryTarget* CMemoryTargetMngr::SearchExistTarget(int start, int end, int nClutOffset, int clutsize, const tex0Info& tex0, int forcevalidate)
CMemoryTarget* CMemoryTargetMngr::SearchExistTarget(int start, int end, int clutsize, const tex0Info& tex0, int forcevalidate)
{
for (list<CMemoryTarget>::iterator it = listTargets.begin(); it != listTargets.end();)
{
@ -1801,7 +1794,7 @@ CMemoryTarget* CMemoryTargetMngr::SearchExistTarget(int start, int end, int nClu
if (it->starty <= start && it->starty + it->height >= end)
{
int res = CompareTarget(it, tex0, clutsize, nClutOffset);
int res = CompareTarget(it, tex0, clutsize);
if (res == 1)
{
@ -1905,12 +1898,12 @@ CMemoryTarget* CMemoryTargetMngr::ClearedTargetsSearch(int fmt, int widthmult, i
CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forcevalidate)
{
FUNCLOG
int start, end, nClutOffset, clutsize;
int start, end, clutsize;
GetClutVariables(nClutOffset, clutsize, tex0);
GetClutVariables(clutsize, tex0);
GetMemAddress(start, end, tex0);
CMemoryTarget* it = SearchExistTarget(start, end, nClutOffset, clutsize, tex0, forcevalidate);
CMemoryTarget* it = SearchExistTarget(start, end, clutsize, tex0, forcevalidate);
if (it != NULL) return it;
@ -2006,13 +1999,13 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
if (PSMT_IS32BIT(tex0.cpsm))
{
u32* pclut = (u32*) & targ->clut[0];
ClutBuffer_to_Array<u32>(pclut, (u32*)(g_pbyGSClut + nClutOffset), clutsize);
ClutBuffer_to_Array<u32>(pclut, tex0.csa, clutsize);
Build_Clut_Texture<u32>(tex0.psm, targ->height, pclut, psrc, (u32*)ptexdata);
}
else
{
u16* pclut = (u16*) & targ->clut[0];
ClutBuffer_to_Array<u16>(pclut, (u16*)(g_pbyGSClut + nClutOffset), clutsize);
ClutBuffer_to_Array<u16>(pclut, tex0.csa, clutsize);
Build_Clut_Texture<u16>(tex0.psm, targ->height, pclut, psrc, (u16*)ptexdata);
}

View File

@ -440,9 +440,9 @@ class CMemoryTargetMngr
CMemoryTargetMngr() : curstamp(0) {}
CMemoryTarget* GetMemoryTarget(const tex0Info& tex0, int forcevalidate); // pcbp is pointer to start of clut
CMemoryTarget* SearchExistTarget(int start, int end, int nClutOffset, int clutsize, const tex0Info& tex0, int forcevalidate);
CMemoryTarget* SearchExistTarget(int start, int end, int clutsize, const tex0Info& tex0, int forcevalidate);
CMemoryTarget* ClearedTargetsSearch(int fmt, int widthmult, int channels, int height);
int CompareTarget(list<CMemoryTarget>::iterator& it, const tex0Info& tex0, int clutsize, int nClutOffset);
int CompareTarget(list<CMemoryTarget>::iterator& it, const tex0Info& tex0, int clutsize);
void Destroy(); // destroy all targs
@ -455,7 +455,7 @@ class CMemoryTargetMngr
private:
list<CMemoryTarget>::iterator DestroyTargetIter(list<CMemoryTarget>::iterator& it);
void GetClutVariables(int& nClutOffset, int& clutsize, const tex0Info& tex0);
void GetClutVariables(int& clutsize, const tex0Info& tex0);
void GetMemAddress(int& start, int& end, const tex0Info& tex0);
};

View File

@ -526,11 +526,11 @@ bool CheckChangeInClut(u32 highdword, u32 psm)
u8* GSMem = g_pbyGSMemory + cbp * 256;
if (PSMT_IS32BIT(cpsm))
return Cmp_ClutBuffer_GSMem<u32>((u32*)GSMem, csa, entries);
return Cmp_ClutBuffer_GSMem<u32>((u32*)GSMem, csa, entries*4);
else {
// Mana Khemia triggers this.
//ZZLog::Error_Log("16 bit clut not supported.");
return Cmp_ClutBuffer_GSMem<u16>((u16*)GSMem, csa, entries);
return Cmp_ClutBuffer_GSMem<u16>((u16*)GSMem, csa, entries*2);
}
}