GregMiscellaneous: zzogl-pg:

* Regroup the CLUT core functions into one big file. Note: the Code::Blocks project still needs to be updated. And I hope the templates are friendly to the MS compiler :) git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3931 96395faa-99c1-11dd-bbfe-3dabce05a288
2010-10-17 15:49:34 +00:00 · 2010-10-17 15:49:34 +00:00 · 97cd280684
parent 01c171e9e7
commit 97cd280684
8 changed files with 242 additions and 763 deletions
--- a/plugins/zzogl-pg/opengl/CMakeLists.txt
+++ b/plugins/zzogl-pg/opengl/CMakeLists.txt
@ -46,6 +46,7 @@ endif(CMAKE_BUILD_TYPE STREQUAL Release)

 # zzogl sources
 set(zzoglSources
+    Clut.cpp
    GifTransfer.cpp
    GLWin32.cpp
    GLWinX11.cpp
@ -77,6 +78,7 @@ set(zzoglSources

 # zzogl headers
 set(zzoglHeaders
+    Clut.h
    common.h
    CRC.h
    GifTransfer.h
--- a/plugins/zzogl-pg/opengl/ZZLog.h
+++ b/plugins/zzogl-pg/opengl/ZZLog.h
@ -110,7 +110,7 @@ static bool SPAM_PASS;
 	if( err != GL_NO_ERROR ) \
 	{ \
 		ZZLog::Error_Log("%s:%d: gl error %s (0x%x)", __FILE__, (int)__LINE__, error_name(err), err); \
-		HandleGLError(); \
+		/* HandleGLError();*/ \
 	} \
 }
 #else
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
@ -26,6 +26,7 @@
 #include "zerogs.h"
 #include "targets.h"
 #include "ZZoglShaders.h"
+#include "Clut.h"

 #ifdef ZEROGS_SSE2
 #include <emmintrin.h>
@ -1642,87 +1643,6 @@ void CMemoryTargetMngr::Destroy()
 	listClearedTargets.clear();
 }

-int memcmp_clut16(u16* pSavedBuffer, u16* pClutBuffer, int clutsize)
-{
-	FUNCLOG
-	assert((clutsize&31) == 0);
-
-	// left > 0 only when csa < 16
-	int left = 0;
-	if (((u32)(uptr)pClutBuffer & 2) == 0)
-	{
-		left = (((u32)(uptr)pClutBuffer & 0x3ff) / 2) + clutsize - 512;
-		clutsize -= left;
-	}
-
-	while (clutsize > 0)
-	{
-		for (int i = 0; i < 16; ++i)
-		{
-			if (pSavedBuffer[i] != pClutBuffer[2*i]) return 1;
-		}
-
-		clutsize -= 32;
-		pSavedBuffer += 16;
-		pClutBuffer += 32;
-	}
-
-	if (left > 0)
-	{
-		pClutBuffer = (u16*)(g_pbyGSClut + 2);
-
-		while (left > 0)
-		{
-			for (int i = 0; i < 16; ++i)
-			{
-				if (pSavedBuffer[i] != pClutBuffer[2*i]) return 1;
-			}
-
-			left -= 32;
-
-			pSavedBuffer += 16;
-			pClutBuffer += 32;
-		}
-	}
-
-	return 0;
-}
-
-#if 0
-bool CMemoryTarget::ValidateClut(const tex0Info& tex0)
-{
-	FUNCLOG
-	assert(tex0.psm == psm && PSMT_ISCLUT(psm) && cpsm == tex0.cpsm);
-
-	int nClutOffset = 0, clutsize = 0;
-	int entries = PSMT_IS8CLUT(tex0.psm) ? 256 : 16;
-
-	if (PSMT_IS32BIT(tex0.cpsm))   // 32 bit
-	{
-		nClutOffset = 64 * tex0.csa;
-		clutsize = min(entries, 256 - tex0.csa * 16) * 4;
-	}
-	else
-	{
-		nClutOffset = 32 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0);
-		clutsize = min(entries, 512 - tex0.csa * 16) * 2;
-	}
-
-	assert(clutsize == clut.size());
-
-	if (PSMT_IS32BIT(cpsm))
-	{
-		if (memcmp_mmx(&clut[0], g_pbyGSClut + nClutOffset, clutsize)) return false;
-	}
-	else
-	{
-		if (memcmp_clut16((u16*)&clut[0], (u16*)(g_pbyGSClut + nClutOffset), clutsize)) return false;
-	}
-
-	return true;
-}
-#endif
-
 bool CMemoryTarget::ValidateTex(const tex0Info& tex0, int starttex, int endtex, bool bDeleteBadTex)
 {
 	FUNCLOG
@ -1783,113 +1703,6 @@ bool CMemoryTarget::ValidateTex(const tex0Info& tex0, int starttex, int endtex,
 	return false;
 }

-// used to build clut textures (note that this is for both 16 and 32 bit cluts)
-template <class T>
-static __forceinline void BuildClut(u32 psm, u32 height, T* pclut, u8* psrc, T* pdst)
-{
-	switch (psm)
-	{
-		case PSMT8:
-			for (u32 i = 0; i < height; ++i)
-			{
-				for (int j = 0; j < GPU_TEXWIDTH / 2; ++j)
-				{
-					pdst[0] = pclut[psrc[0]];
-					pdst[1] = pclut[psrc[1]];
-					pdst[2] = pclut[psrc[2]];
-					pdst[3] = pclut[psrc[3]];
-					pdst[4] = pclut[psrc[4]];
-					pdst[5] = pclut[psrc[5]];
-					pdst[6] = pclut[psrc[6]];
-					pdst[7] = pclut[psrc[7]];
-					pdst += 8;
-					psrc += 8;
-				}
-			}
-			break;
-
-		case PSMT4:
-			for (u32 i = 0; i < height; ++i)
-			{
-				for (int j = 0; j < GPU_TEXWIDTH; ++j)
-				{
-					pdst[0] = pclut[psrc[0] & 15];
-					pdst[1] = pclut[psrc[0] >> 4];
-					pdst[2] = pclut[psrc[1] & 15];
-					pdst[3] = pclut[psrc[1] >> 4];
-					pdst[4] = pclut[psrc[2] & 15];
-					pdst[5] = pclut[psrc[2] >> 4];
-					pdst[6] = pclut[psrc[3] & 15];
-					pdst[7] = pclut[psrc[3] >> 4];
-
-					pdst += 8;
-					psrc += 4;
-				}
-			}
-			break;
-
-		case PSMT8H:
-			for (u32 i = 0; i < height; ++i)
-			{
-				for (int j = 0; j < GPU_TEXWIDTH / 8; ++j)
-				{
-					pdst[0] = pclut[psrc[3]];
-					pdst[1] = pclut[psrc[7]];
-					pdst[2] = pclut[psrc[11]];
-					pdst[3] = pclut[psrc[15]];
-					pdst[4] = pclut[psrc[19]];
-					pdst[5] = pclut[psrc[23]];
-					pdst[6] = pclut[psrc[27]];
-					pdst[7] = pclut[psrc[31]];
-					pdst += 8;
-					psrc += 32;
-				}
-			}
-			break;
-
-		case PSMT4HH:
-			for (u32 i = 0; i < height; ++i)
-			{
-				for (int j = 0; j < GPU_TEXWIDTH / 8; ++j)
-				{
-					pdst[0] = pclut[psrc[3] >> 4];
-					pdst[1] = pclut[psrc[7] >> 4];
-					pdst[2] = pclut[psrc[11] >> 4];
-					pdst[3] = pclut[psrc[15] >> 4];
-					pdst[4] = pclut[psrc[19] >> 4];
-					pdst[5] = pclut[psrc[23] >> 4];
-					pdst[6] = pclut[psrc[27] >> 4];
-					pdst[7] = pclut[psrc[31] >> 4];
-					pdst += 8;
-					psrc += 32;
-				}
-			}
-			break;
-
-		case PSMT4HL:
-			for (u32 i = 0; i < height; ++i)
-			{
-				for (int j = 0; j < GPU_TEXWIDTH / 8; ++j)
-				{
-					pdst[0] = pclut[psrc[3] & 15];
-					pdst[1] = pclut[psrc[7] & 15];
-					pdst[2] = pclut[psrc[11] & 15];
-					pdst[3] = pclut[psrc[15] & 15];
-					pdst[4] = pclut[psrc[19] & 15];
-					pdst[5] = pclut[psrc[23] & 15];
-					pdst[6] = pclut[psrc[27] & 15];
-					pdst[7] = pclut[psrc[31] & 15];
-					pdst += 8;
-					psrc += 32;
-				}
-			}
-			break;
-
-		default:
-			assert(0);
-	}
-}
-
 #define TARGET_THRESH 0x500

 extern int g_MaxTexWidth, g_MaxTexHeight; // Maximum height & width of supported texture.
@ -1926,10 +1739,10 @@ int CMemoryTargetMngr::CompareTarget(list<CMemoryTarget>::iterator& it, const te
 			return 1;

 		if	(PSMT_IS32BIT(tex0.cpsm)) {
-			if (memcmp_mmx(&it->clut[0], g_pbyGSClut + nClutOffset, clutsize))
+			if (Cmp_ClutBuffer_SavedClut<u32>((u32*)&it->clut[0], (u32*)(g_pbyGSClut + nClutOffset), clutsize))
 				return 2;
 		} else {
-			if (memcmp_clut16((u16*)&it->clut[0], (u16*)(g_pbyGSClut + nClutOffset), clutsize))
+			if (Cmp_ClutBuffer_SavedClut<u16>((u16*)&it->clut[0], (u16*)(g_pbyGSClut + nClutOffset), clutsize))
 				return 2;
 		}

@ -2136,38 +1949,9 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
 		targ->clut.resize(clutsize);

 		if (PSMT_IS32BIT(tex0.cpsm))
-		{
-			memcpy_amd(&targ->clut[0], g_pbyGSClut + nClutOffset, clutsize);
-		}
+            ClutBuffer_to_Array<u32>((u32*)&targ->clut[0], (u32*)(g_pbyGSClut + nClutOffset), clutsize);
        else
-		{
-			u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);
-			u16* pclut = (u16*) & targ->clut[0];
-			int left = ((u32)nClutOffset & 2) ? 0 : ((nClutOffset & 0x3ff) / 2) + clutsize - 512;
-
-			if (left > 0) clutsize -= left;
-
-			while (clutsize > 0)
-			{
-				pclut[0] = pClutBuffer[0];
-				pclut++;
-				pClutBuffer += 2;
-				clutsize -= 2;
-			}
-
-			if (left > 0)
-			{
-				pClutBuffer = (u16*)(g_pbyGSClut + 2);
-
-				while (left > 0)
-				{
-					pclut[0] = pClutBuffer[0];
-					left -= 2;
-					pClutBuffer += 2;
-					pclut++;
-				}
-			}
-		}
+            ClutBuffer_to_Array<u16>((u16*)&targ->clut[0], (u16*)(g_pbyGSClut + nClutOffset), clutsize);
 	}

 	if (targ->ptex != NULL)
@ -2226,14 +2010,14 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
 			u32* pclut = (u32*) & targ->clut[0];
 			u32* pdst = (u32*)ptexdata;

-			BuildClut<u32>(tex0.psm, targ->height, pclut, psrc, pdst);
+			Build_Clut_Texture<u32>(tex0.psm, targ->height, pclut, psrc, pdst);
 		}
 		else
 		{
 			u16* pclut = (u16*) & targ->clut[0];
 			u16* pdst = (u16*)ptexdata;

-			BuildClut<u16>(tex0.psm, targ->height, pclut, psrc, pdst);
+			Build_Clut_Texture<u16>(tex0.psm, targ->height, pclut, psrc, pdst);
 		}
 	}
 	else
--- a/plugins/zzogl-pg/opengl/x86.cpp
+++ b/plugins/zzogl-pg/opengl/x86.cpp
@ -594,6 +594,7 @@ void __fastcall Frame16SwizzleBlock16ZA4_c(u16* dst, Vector_16F* src, int srcpit
 //  }
 //}

+#if 0
 extern "C" void __fastcall WriteCLUT_T32_I8_CSM1_sse2(u32* vm, u32* clut)
 {
 	__m128i* src = (__m128i*)vm;
@ -1137,9 +1138,11 @@ __forceinline void  WriteCLUT_T16_I8_CSM1_sse2(u32* vm, u32 csa)
        vm += 16; // go down one column
    }
 }
+#endif

 #endif // ZEROGS_SSE2

+#if 0
 void __fastcall WriteCLUT_T16_I8_CSM1_c(u32* _vm, u32* _clut)
 {
 	const static u32 map[] =
@ -1251,6 +1254,8 @@ void __fastcall WriteCLUT_T32_I4_CSM1_c(u32* vm, u32* clut)
 	dst[7] = src[7];
 }

+#endif
+
 void SSE2_UnswizzleZ16Target(u16* dst, u16* src, int iters)
 {

--- a/plugins/zzogl-pg/opengl/zerogs.cpp
+++ b/plugins/zzogl-pg/opengl/zerogs.cpp
@ -33,10 +33,7 @@
 #include "GLWin.h"
 #include "ZZoglShaders.h"
 #include "ZZKick.h"
-
-#ifdef ZEROGS_SSE2
-#include <emmintrin.h>
-#endif
+#include "Clut.h"

 //----------------------- Defines

@ -518,217 +515,6 @@ void ExtWrite()
 //	case 7: ASSERT(0); return false;
 //	default: __assume(0);

-bool IsDirty(u32 highdword, u32 psm, int cld, int cbp)
-{
-	int cpsm = ZZOglGet_cpsm_TexBits(highdword);
-	int csm = ZZOglGet_csm_TexBits(highdword);
-
-	if (cpsm > 1 || csm)
-	{
-		// Mana Khemia triggers this.
-        //ZZLog::Error_Log("16 bit clut not supported.");
-		return true;
-	}
-
-	int csa = ZZOglGet_csa_TexBits(highdword);
-
-	int entries = PSMT_IS8CLUT(psm) ? 256 : 16;
-
-	u64* src = (u64*)(g_pbyGSMemory + cbp * 256);
-	u64* dst = (u64*)(g_pbyGSClut + 64 * csa);
-
-	bool bRet = false;
-
-#define TEST_THIS
-#ifdef TEST_THIS
-    while(entries != 0) {
-#ifdef ZEROGS_SSE2
-        // Note: local memory datas are swizzles
-        __m128i src_0 = _mm_load_si128((__m128i*)src);   // 9  8  1 0
-        __m128i src_1 = _mm_load_si128((__m128i*)src+1); // 11 10 3 2
-        __m128i src_2 = _mm_load_si128((__m128i*)src+2); // 13 12 5 4
-        __m128i src_3 = _mm_load_si128((__m128i*)src+3); // 15 14 7 6
-
-        __m128i dst_0 = _mm_load_si128((__m128i*)dst);
-        __m128i dst_1 = _mm_load_si128((__m128i*)dst+1);
-        __m128i dst_2 = _mm_load_si128((__m128i*)dst+2);
-        __m128i dst_3 = _mm_load_si128((__m128i*)dst+3);
-
-        __m128i result = _mm_cmpeq_epi32(_mm_unpacklo_epi64(src_0, src_1), dst_0);
-
-        __m128i result_tmp = _mm_cmpeq_epi32(_mm_unpacklo_epi64(src_2, src_3), dst_1);
-        result = _mm_and_si128(result, result_tmp);
-
-        result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(src_0, src_1), dst_2);
-        result = _mm_and_si128(result, result_tmp);
-
-        result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(src_2, src_3), dst_3);
-        result = _mm_and_si128(result, result_tmp);
-
-        u32 result_int = _mm_movemask_epi8(result);
-        if (result_int != 0xFFFF) {
-            bRet = true;
-            break;
-        }
-#else
-        // I see no point to keep an mmx version. SSE2 versions is probably faster.
-        // Keep a slow portable C version for reference/debug
-        // Note: local memory datas are swizzles
-        if (dst[0] != src[0] || dst[1] != src[2] || dst[2] != src[4] || dst[3] != src[6]
-                || dst[4] != src[1] || dst[5] != src[3] || dst[6] != src[5] || dst[7] != src[7]) {
-            bRet = true;
-            break;
-        }
-#endif
-
-        // go to the next memory block
-        src += 32;
-
-        // go back to the previous memory block then down one memory column
-        if (entries & 0x10) {
-            src -= (64-8);
-        }
-        // In case previous operation (down one column) cross the block boundary
-        // Go to the next block
-        if (entries == 0x90) {
-            src += 32;
-        }
-
-        dst += 8;
-        entries -= 16;
-    }
-#else
-
-	// do a fast test with MMX
-#ifdef _MSC_VER
-	int storeebx;
-	__asm
-	{
-		mov storeebx, ebx
-		mov edx, dst
-		mov ecx, src
-		mov ebx, entries
-
-Start:
-		movq mm0, [edx]
-		movq mm1, [edx+8]
-		pcmpeqd mm0, [ecx]
-		pcmpeqd mm1, [ecx+16]
-
-		movq mm2, [edx+16]
-		movq mm3, [edx+24]
-		pcmpeqd mm2, [ecx+32]
-		pcmpeqd mm3, [ecx+48]
-
-		pand mm0, mm1
-		pand mm2, mm3
-		movq mm4, [edx+32]
-		movq mm5, [edx+40]
-		pcmpeqd mm4, [ecx+8]
-		pcmpeqd mm5, [ecx+24]
-
-		pand mm0, mm2
-		pand mm4, mm5
-		movq mm6, [edx+48]
-		movq mm7, [edx+56]
-		pcmpeqd mm6, [ecx+40]
-		pcmpeqd mm7, [ecx+56]
-
-		pand mm0, mm4
-		pand mm6, mm7
-		pand mm0, mm6
-
-		pmovmskb eax, mm0
-		cmp eax, 0xff
-		je Continue
-		mov bRet, 1
-		jmp Return
-
-Continue:
-		cmp ebx, 16
-		jle Return
-
-		test ebx, 0x10
-		jz AddEcx
-		sub ecx, 448 // go back and down one column,
-
-AddEcx:
-		add ecx, 256 // go to the right block
-
-
-		jne Continue1
-		add ecx, 256 // skip whole block
-
-Continue1:
-		add edx, 64
-		sub ebx, 16
-		jmp Start
-
-Return:
-		emms
-		mov ebx, storeebx
-	}
-
-#else // linux
-	// do a fast test with MMX
-	__asm__(
-		".intel_syntax\n"
-		"Start:\n"
-		"movq %%mm0, [%%ecx]\n"
-		"movq %%mm1, [%%ecx+8]\n"
-		"pcmpeqd %%mm0, [%%edx]\n"
-		"pcmpeqd %%mm1, [%%edx+16]\n"
-		"movq %%mm2, [%%ecx+16]\n"
-		"movq %%mm3, [%%ecx+24]\n"
-		"pcmpeqd %%mm2, [%%edx+32]\n"
-		"pcmpeqd %%mm3, [%%edx+48]\n"
-		"pand %%mm0, %%mm1\n"
-		"pand %%mm2, %%mm3\n"
-		"movq %%mm4, [%%ecx+32]\n"
-		"movq %%mm5, [%%ecx+40]\n"
-		"pcmpeqd %%mm4, [%%edx+8]\n"
-		"pcmpeqd %%mm5, [%%edx+24]\n"
-		"pand %%mm0, %%mm2\n"
-		"pand %%mm4, %%mm5\n"
-		"movq %%mm6, [%%ecx+48]\n"
-		"movq %%mm7, [%%ecx+56]\n"
-		"pcmpeqd %%mm6, [%%edx+40]\n"
-		"pcmpeqd %%mm7, [%%edx+56]\n"
-		"pand %%mm0, %%mm4\n"
-		"pand %%mm6, %%mm7\n"
-		"pand %%mm0, %%mm6\n"
-		"pmovmskb %%eax, %%mm0\n"
-		"cmp %%eax, 0xff\n"
-		"je Continue\n"
-		".att_syntax\n"
-		"movb $1, %0\n"
-		".intel_syntax\n"
-		"jmp Return\n"
-		"Continue:\n"
-		"cmp %%esi, 16\n"
-		"jle Return\n"
-		"test %%esi, 0x10\n"
-		"jz AddEcx\n"
-		"sub %%edx, 448\n" // go back and down one column
-		"AddEcx:\n"
-		"add %%edx, 256\n" // go to the right block
-		"cmp %%esi, 0x90\n"
-		"jne Continue1\n"
-		"add %%edx, 256\n" // skip whole block
-		"Continue1:\n"
-		"add %%ecx, 64\n"
-		"sub %%esi, 16\n"
-		"jmp Start\n"
-		"Return:\n"
-		"emms\n"
-
-	".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "S"(entries) : "eax", "memory");
-
-#endif // _WIN32
-#endif
-	return bRet;
-}
-
 // cld state:
 // 000 - clut data is not loaded; data in the temp buffer is stored
 // 001 - clut data is always loaded.
@ -769,16 +555,29 @@ bool CheckChangeInClut(u32 highdword, u32 psm)
 			if (gs.cbp[1] == cbp) return false;
 			break;

-			//case 4: return gs.cbp[0] != cbp;
-			//case 5: return gs.cbp[1] != cbp;
-
-			// default: load
-
 		default:
 			break;
 	}

-	return IsDirty(highdword, psm, cld, cbp);
+    // Compare the cache with current memory
+
+    // CSM2 is not supported
+    if (ZZOglGet_csm_TexBits(highdword))
+		return true;
+
+	int cpsm = ZZOglGet_cpsm_TexBits(highdword);
+	int csa = ZZOglGet_csa_TexBits(highdword);
+	int entries = PSMT_IS8CLUT(psm) ? 256 : 16;
+
+	u8* GSMem = g_pbyGSMemory + cbp * 256;
+
+    if (PSMT_IS32BIT(cpsm))
+        return Cmp_ClutBuffer_GSMem<u32>((u32*)GSMem, csa, entries);
+    else {
+		// Mana Khemia triggers this.
+        //ZZLog::Error_Log("16 bit clut not supported.");
+		return Cmp_ClutBuffer_GSMem<u16>((u16*)GSMem, csa, entries);
+    }
 }

 void texClutWrite(int ctx)
@ -823,118 +622,7 @@ void texClutWrite(int ctx)

 	Flush(!ctx);

-	int entries = PSMT_IS8CLUT(tex0.psm) ? 256 : 16;
-
-	if (tex0.csm)
-	{
-		switch (tex0.cpsm)
-		{
-				// 16bit psm
-				// eggomania uses non16bit textures for csm2
-
-			case PSMCT16:
-			{
-				u16* src = (u16*)g_pbyGSMemory + tex0.cbp * 128;
-				u16 *dst = (u16*)(g_pbyGSClut + 64 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0));
-
-				for (int i = 0; i < entries; ++i)
-				{
-					*dst = src[getPixelAddress16_0(gs.clut.cou+i, gs.clut.cov, gs.clut.cbw)];
-					dst += 2;
-
-					// check for wrapping
-
-					if (((u32)(uptr)dst & 0x3ff) == 0) dst = (u16*)(g_pbyGSClut + 2);
-				}
-				break;
-			}
-
-			case PSMCT16S:
-			{
-				u16* src = (u16*)g_pbyGSMemory + tex0.cbp * 128;
-				u16 *dst = (u16*)(g_pbyGSClut + 64 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0));
-
-				for (int i = 0; i < entries; ++i)
-				{
-					*dst = src[getPixelAddress16S_0(gs.clut.cou+i, gs.clut.cov, gs.clut.cbw)];
-					dst += 2;
-
-					// check for wrapping
-
-					if (((u32)(uptr)dst & 0x3ff) == 0) dst = (u16*)(g_pbyGSClut + 2);
-				}
-				break;
-			}
-
-			case PSMCT32:
-			case PSMCT24:
-			{
-				u32* src = (u32*)g_pbyGSMemory + tex0.cbp * 64;
-				u32 *dst = (u32*)(g_pbyGSClut + 64 * tex0.csa);
-
-				// check if address exceeds src
-
-				if (src + getPixelAddress32_0(gs.clut.cou + entries - 1, gs.clut.cov, gs.clut.cbw) >= (u32*)g_pbyGSMemory + 0x00100000)
-					ZZLog::Error_Log("texClutWrite out of bounds.");
-				else
-					for (int i = 0; i < entries; ++i)
-					{
-						*dst = src[getPixelAddress32_0(gs.clut.cou+i, gs.clut.cov, gs.clut.cbw)];
-						dst++;
-					}
-				break;
-			}
-
-			default:
-			{
-				//ZZLog::Debug_Log("Unknown cpsm: %x (%x).", tex0.cpsm, tex0.psm);
-				break;
-			}
-		}
-	}
-	else
-	{
-		u32* src = (u32*)(g_pbyGSMemory + 256 * tex0.cbp);
-		
-		if (entries == 16)
-		{
-			switch (tex0.cpsm)
-			{
-				case PSMCT24:
-				case PSMCT32:
-					WriteCLUT_T32_I4_CSM1(src, (u32*)(g_pbyGSClut + 64 * tex0.csa));
-					break;
-
-				default:
-#ifdef ZEROGS_SSE2
-					WriteCLUT_T16_I4_CSM1_sse2(src, tex0.csa);
-#else
-					WriteCLUT_T16_I4_CSM1_c(src, (u32*)(g_pbyGSClut + 64*(tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0)));
-#endif
-					break;
-			}
-		}
-		else
-		{
-			switch (tex0.cpsm)
-			{
-				case PSMCT24:
-				case PSMCT32:
-					WriteCLUT_T32_I8_CSM1(src, (u32*)(g_pbyGSClut + 64 * tex0.csa));
-					break;
-
-				default:
-					// sse2 for 256 is more complicated, so use regular
-#ifdef ZEROGS_SSE2
-					WriteCLUT_T16_I8_CSM1_sse2(src, tex0.csa);
-#else
-					WriteCLUT_T16_I8_CSM1_c(src, (u32*)(g_pbyGSClut + 64*(tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0)));
-#endif
-					break;
-			}
-
-		}
-	}
+    // Write the memory to clut buffer
+    GSMem_to_ClutBuffer(tex0);
 }

-