From 6a96e46920776e1697feb2f52c07320f835634a5 Mon Sep 17 00:00:00 2001
From: "gregory.hainaut@gmail.com"
 <gregory.hainaut@gmail.com@96395faa-99c1-11dd-bbfe-3dabce05a288>
Date: Mon, 20 Dec 2010 19:57:50 +0000
Subject: [PATCH] zzogl: * slightly enlarge the hack window (better for
 screenshots, not too big for small screens) * Use generic clut function in
 FlushDecodeClut * Various cleanups and comments

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4113 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 plugins/zzogl-pg/opengl/Linux/Linux.cpp |   3 +-
 plugins/zzogl-pg/opengl/ZZoglCreate.cpp |   2 +-
 plugins/zzogl-pg/opengl/ZZoglFlush.cpp  |  57 ++-------
 plugins/zzogl-pg/opengl/ZZoglShoots.cpp |  10 +-
 plugins/zzogl-pg/opengl/targets.cpp     | 149 ++++++++++++------------
 5 files changed, 93 insertions(+), 128 deletions(-)
diff --git a/plugins/zzogl-pg/opengl/Linux/Linux.cpp b/plugins/zzogl-pg/opengl/Linux/Linux.cpp
index df1e87f63c..cddd200b23 100644
--- a/plugins/zzogl-pg/opengl/Linux/Linux.cpp
+++ b/plugins/zzogl-pg/opengl/Linux/Linux.cpp
@@ -234,7 +234,8 @@ void DisplayAdvancedDialog()
 				 
 	dialog = gtk_dialog_new();
 	gtk_window_set_title(GTK_WINDOW(dialog), "ZZOgl PG Advanced Config");
-	gtk_window_set_default_size(GTK_WINDOW(dialog), 600, 600);
+	// A good value for the height would be 1000 instead of 800, but I'm afraid that some people still use small screens...
+	gtk_window_set_default_size(GTK_WINDOW(dialog), 600, 800);
 	gtk_window_set_modal(GTK_WINDOW(dialog), true);
 	
 	advanced_box = gtk_vbox_new(false, 5);
diff --git a/plugins/zzogl-pg/opengl/ZZoglCreate.cpp b/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
index 2107c41b4b..279986e93f 100644
--- a/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
@@ -188,7 +188,7 @@ inline bool CreateImportantCheck()
 
 	if (!IsGLExt("GL_EXT_framebuffer_object"))
 	{
-		ZZLog::Error_Log("*********\nZZogl: ERROR: Need GL_EXT_framebufer_object for multiple render targets\nZZogl: *********");
+		ZZLog::Error_Log("*********\nZZogl: ERROR: Need GL_EXT_framebuffer_object for multiple render targets\nZZogl: *********");
 		bSuccess = false;
 	}
 
diff --git a/plugins/zzogl-pg/opengl/ZZoglFlush.cpp b/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
index 731f0fcadc..0c7b888a5f 100644
--- a/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
@@ -26,6 +26,7 @@
 #include "targets.h"
 #include "ZZoglFlushHack.h"
 #include "ZZoglShaders.h"
+#include "ZZClut.h"
 #include <math.h>
 
 //------------------ Defines
@@ -337,14 +338,9 @@ inline void VisualBufferMessage(int context)
 					 curvb.tex0.th, curvb.tex0.tcc, curvb.tex0.tfx, curvb.tex0.cbp,
 					 curvb.tex0.cpsm, curvb.tex0.csm, curvb.tex0.csa, curvb.tex0.cld);
 	char* Name;
-//	if (g_bSaveTex) {
-//		if (g_bSaveTex == 1)
 	Name = NamedSaveTex(&curvb.tex0, 1);
-//		else
-//			Name = NamedSaveTex(&curvb.tex0, 0);
 	ZZLog::Error_Log("TGA name '%s'.", Name);
 	free(Name);
-//	}
 	ZZLog::Debug_Log("buffer %ld.\n", BufferNumber);
 #endif
 }
@@ -730,57 +726,19 @@ inline void FlushDecodeClut(VB& curvb, GLuint& ptexclut)
 
 	if (ptexclut != 0)
 	{
-
-		int nClutOffset = 0, clutsize;
+		int clutsize;
 		int entries = PSMT_IS8CLUT(curvb.tex0.psm) ? 256 : 16;
 
 		if (curvb.tex0.csm && curvb.tex0.csa)
 			ZZLog::Debug_Log("ERROR, csm1.");
 
-		if (PSMT_IS32BIT(curvb.tex0.cpsm))   // 32 bit
-		{
-			nClutOffset = 64 * curvb.tex0.csa;
+		if (PSMT_IS32BIT(curvb.tex0.cpsm)) {
 			clutsize = min(entries, 256 - curvb.tex0.csa * 16) * 4;
-		}
-		else
-		{
-			nClutOffset = 64 * (curvb.tex0.csa & 15) + (curvb.tex0.csa >= 16 ? 2 : 0);
+		    ClutBuffer_to_Array<u32>((u32*)&data[0], curvb.tex0.csa, clutsize);
+        } else {
 			clutsize = min(entries, 512 - curvb.tex0.csa * 16) * 2;
-		}
-
-		if (PSMT_IS32BIT(curvb.tex0.cpsm))   // 32 bit
-		{
-			memcpy_amd(&data[0], g_pbyGSClut + nClutOffset, clutsize);
-		}
-		else
-		{
-			u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);
-			u16* pclut = (u16*) & data[0];
-			int left = ((u32)nClutOffset & 2) ? 0 : ((nClutOffset & 0x3ff) / 2) + clutsize - 512;
-
-			if (left > 0) clutsize -= left;
-
-			while (clutsize > 0)
-			{
-				pclut[0] = pClutBuffer[0];
-				pclut++;
-				pClutBuffer += 2;
-				clutsize -= 2;
-			}
-
-			if (left > 0)
-			{
-				pClutBuffer = (u16*)(g_pbyGSClut + 2);
-
-				while (left > 0)
-				{
-					pclut[0] = pClutBuffer[0];
-					left -= 2;
-					pClutBuffer += 2;
-					pclut++;
-				}
-			}
-		}
+		    ClutBuffer_to_Array<u16>((u16*)&data[0], curvb.tex0.csa, clutsize);
+        }
 
 		GLenum tempType = PSMT_ISHALF_STORAGE(curvb.tex0) ? GL_UNSIGNED_SHORT_5_5_5_1 : GL_UNSIGNED_BYTE;
 		Texture2D(4, 256, 1, GL_RGBA, tempType, &data[0]);
@@ -987,6 +945,7 @@ inline FRAGMENTSHADER* FlushMadeNewTarget(VB& curvb, int exactcolor, int context
 	// save the texture
 	if (g_bSaveTex)
 	{
+        // FIXME: I suspect one of these g_bSaveTex tests uses the wrong variable
 		if (g_bSaveTex == 1)
 		{
 			SaveTex(&curvb.tex0, 1);
diff --git a/plugins/zzogl-pg/opengl/ZZoglShoots.cpp b/plugins/zzogl-pg/opengl/ZZoglShoots.cpp
index 7455f0fd6e..eb174e9ef0 100644
--- a/plugins/zzogl-pg/opengl/ZZoglShoots.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglShoots.cpp
@@ -395,6 +395,8 @@ SaveTex(tex0Info* ptex, int usevid)
 		glBindTexture(GL_TEXTURE_RECTANGLE_NV, pmemtarg->ptex->tex);
 		srcdata.resize(4 * pmemtarg->texW * pmemtarg->texH);
 
+        // FIXME: strangely, this function call seems to crash pcsx2 on Atelier Iris 1
+        // Note: fmt is GL_UNSIGNED_SHORT_1_5_5_5_REV
 		glGetTexImage(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA, pmemtarg->fmt, &srcdata[0]);
 
 		u32 offset = MemorySize(pmemtarg->realy);
@@ -613,6 +615,9 @@ SaveTex(tex0Info* ptex, int usevid)
 
 	snprintf(Name, TGA_FILE_NAME_MAX_LENGTH, "Tex.%d.tga", TexNumber);
 	SaveTGA(Name, ptex->tw, ptex->th, &data[0]);
+
+	TexNumber++;
+	if (TexNumber > MAX_NUMBER_SAVED_TGA) TexNumber = 0;
 }
 
 
@@ -621,13 +626,10 @@ SaveTex(tex0Info* ptex, int usevid)
 char* NamedSaveTex(tex0Info* ptex, int usevid)
 {
 	SaveTex(ptex, usevid);
+
 	char* Name = (char*)malloc(TGA_FILE_NAME_MAX_LENGTH);
 	snprintf(Name, TGA_FILE_NAME_MAX_LENGTH, "Tex.%d.tga", TexNumber);
 
-	TexNumber++;
-
-	if (TexNumber > MAX_NUMBER_SAVED_TGA) TexNumber = 0;
-
 	return Name;
 }
 
diff --git a/plugins/zzogl-pg/opengl/targets.cpp b/plugins/zzogl-pg/opengl/targets.cpp
index b956b4cea5..8b134a72ef 100644
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
@@ -475,6 +475,9 @@ void CRenderTarget::Update(int context, CRenderTarget* pdepth)
 		texframe.tw = fbw;
 		texframe.th = fbh;
 		texframe.psm = psm;
+        // FIXME: some fields are not initialized...
+        // in particular the clut-related ones
+        assert(!PSMT_ISCLUT(psm));
 
 		// write color and zero out stencil buf, always 0 context!
 		// force bilinear if using AA
@@ -966,6 +969,9 @@ void CDepthTarget::Update(int context, CRenderTarget* prndr)
 	texframe.tw = fbw;
 	texframe.th = fbh;
 	texframe.psm = psm;
+    // FIXME: some fields are not initialized...
+    // in particular the clut-related ones
+    assert(!PSMT_ISCLUT(psm));
 
 	DisableAllgl();
 
@@ -2017,96 +2023,93 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
 
         assert(targ->clutsize > 0);
 	}
-	else
-	{
-		if (tex0.psm == PSMT16Z || tex0.psm == PSMT16SZ)
-		{
-			ptexdata = (u8*)_aligned_malloc(4 * targ->texH * targ->texW, 16);
-			has_data = true;
+	else if (tex0.psm == PSMT16Z || tex0.psm == PSMT16SZ)
+    {
+        ptexdata = (u8*)_aligned_malloc(4 * targ->texH * targ->texW, 16);
+        has_data = true;
 
-			// needs to be 8 bit, use xmm for unpacking
-			u16* dst = (u16*)ptexdata;
-			u16* src = (u16*)(MemoryAddress(targ->realy));
+        // needs to be 8 bit, use xmm for unpacking
+        u16* dst = (u16*)ptexdata;
+        u16* src = (u16*)(MemoryAddress(targ->realy));
 
 #ifdef ZEROGS_SSE2
-			assert(((u32)(uptr)dst) % 16 == 0);
-            // FIXME Uncomment to test intrinsic versions (instead of asm)
-            // perf improvement vs asm:
-            // 1/ gcc updates both pointer with 1 addition
-            // 2/ Bypass the cache for the store
+        assert(((u32)(uptr)dst) % 16 == 0);
+        // FIXME Uncomment to test intrinsic versions (instead of asm)
+        // perf improvement vs asm:
+        // 1/ gcc updates both pointer with 1 addition
+        // 2/ Bypass the cache for the store
 #define NEW_INTRINSIC_VERSION
 #ifdef NEW_INTRINSIC_VERSION
 
-            __m128i zero_128 = _mm_setzero_si128();
-            // NOTE: future performance improvement
-            // SSE4.1 support uncacheable load 128bits. Maybe it can
-            // avoid some cache pollution
-            // NOTE2: I create multiple _n variable to mimic the previous ASM behavior
-            // but I'm not sure there are real gains.
-			for (int i = targ->height * GPU_TEXWIDTH/16 ; i > 0 ; --i)
-            {
-                // Convert 16 bits pixels to 32bits (zero extended)
-                // Batch 64 bytes (32 pixels) at once.
-                __m128i pixels_1 = _mm_load_si128((__m128i*)src);
-                __m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
-                __m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
-                __m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
+        __m128i zero_128 = _mm_setzero_si128();
+        // NOTE: future performance improvement
+        // SSE4.1 support uncacheable load 128bits. Maybe it can
+        // avoid some cache pollution
+        // NOTE2: I create multiple _n variable to mimic the previous ASM behavior
+        // but I'm not sure there are real gains.
+        for (int i = targ->height * GPU_TEXWIDTH/16 ; i > 0 ; --i)
+        {
+            // Convert 16 bits pixels to 32bits (zero extended)
+            // Batch 64 bytes (32 pixels) at once.
+            __m128i pixels_1 = _mm_load_si128((__m128i*)src);
+            __m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
+            __m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
+            __m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
 
-                __m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
-                __m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
-                __m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
-                __m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
+            __m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
+            __m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
+            __m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
+            __m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
 
-                // Note: bypass cache
-                _mm_stream_si128((__m128i*)dst, pix_low_1);
-                _mm_stream_si128((__m128i*)(dst+8), pix_high_1);
-                _mm_stream_si128((__m128i*)(dst+16), pix_low_2);
-                _mm_stream_si128((__m128i*)(dst+24), pix_high_2);
+            // Note: bypass cache
+            _mm_stream_si128((__m128i*)dst, pix_low_1);
+            _mm_stream_si128((__m128i*)(dst+8), pix_high_1);
+            _mm_stream_si128((__m128i*)(dst+16), pix_low_2);
+            _mm_stream_si128((__m128i*)(dst+24), pix_high_2);
 
-                __m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
-                __m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
-                __m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
-                __m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
+            __m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
+            __m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
+            __m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
+            __m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
 
-                // Note: bypass cache
-                _mm_stream_si128((__m128i*)(dst+32), pix_low_3);
-                _mm_stream_si128((__m128i*)(dst+40), pix_high_3);
-                _mm_stream_si128((__m128i*)(dst+48), pix_low_4);
-                _mm_stream_si128((__m128i*)(dst+56), pix_high_4);
+            // Note: bypass cache
+            _mm_stream_si128((__m128i*)(dst+32), pix_low_3);
+            _mm_stream_si128((__m128i*)(dst+40), pix_high_3);
+            _mm_stream_si128((__m128i*)(dst+48), pix_low_4);
+            _mm_stream_si128((__m128i*)(dst+56), pix_high_4);
 
-                src += 32;
-                dst += 64;
-            }
-            // It is advise to use a fence instruction after non temporal move (mm_stream) instruction...
-            // store fence insures that previous store are finish before execute new one.
-            _mm_sfence();
+            src += 32;
+            dst += 64;
+        }
+        // It is advised to use a fence instruction after non-temporal move (mm_stream) instructions...
+        // The store fence ensures that previous stores are finished before executing new ones.
+        _mm_sfence();
 #else
-			SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
+        SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
 #endif
 #else // ZEROGS_SSE2
 
-			for (int i = 0; i < targ->height; ++i)
-			{
-				for (int j = 0; j < GPU_TEXWIDTH; ++j)
-				{
-					dst[0] = src[0];
-					dst[1] = 0;
-					dst[2] = src[1];
-					dst[3] = 0;
-					dst += 4;
-					src += 2;
-				}
-			}
+        for (int i = 0; i < targ->height; ++i)
+        {
+            for (int j = 0; j < GPU_TEXWIDTH; ++j)
+            {
+                dst[0] = src[0];
+                dst[1] = 0;
+                dst[2] = src[1];
+                dst[3] = 0;
+                dst += 4;
+                src += 2;
+            }
+        }
 
 #endif // ZEROGS_SSE2
-		}
-		else
-		{
-			ptexdata = targ->ptex->memptr;
-			// We really don't want to deallocate memptr. As a reminder...
-			has_data = false;
-		}
-	}
+    }
+    else
+    {
+        ptexdata = targ->ptex->memptr;
+        // We really don't want to deallocate memptr. As a reminder...
+        has_data = false;
+    }
 
 	// create the texture
 	GL_REPORT_ERRORD();