mirror of https://github.com/PCSX2/pcsx2.git
zzogl:
* Slightly increase the size of the hack window (better for screenshots, but not too big for small screens) * Use the generic CLUT function in FlushDecodeClut * Various cleanups and comments git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4113 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
39780dcc10
commit
6a96e46920
|
@ -234,7 +234,8 @@ void DisplayAdvancedDialog()
|
|||
|
||||
dialog = gtk_dialog_new();
|
||||
gtk_window_set_title(GTK_WINDOW(dialog), "ZZOgl PG Advanced Config");
|
||||
gtk_window_set_default_size(GTK_WINDOW(dialog), 600, 600);
|
||||
// A good value for the heigh will be 1000 instead of 800 but I'm afraid that some people still uses small screen...
|
||||
gtk_window_set_default_size(GTK_WINDOW(dialog), 600, 800);
|
||||
gtk_window_set_modal(GTK_WINDOW(dialog), true);
|
||||
|
||||
advanced_box = gtk_vbox_new(false, 5);
|
||||
|
|
|
@ -188,7 +188,7 @@ inline bool CreateImportantCheck()
|
|||
|
||||
if (!IsGLExt("GL_EXT_framebuffer_object"))
|
||||
{
|
||||
ZZLog::Error_Log("*********\nZZogl: ERROR: Need GL_EXT_framebufer_object for multiple render targets\nZZogl: *********");
|
||||
ZZLog::Error_Log("*********\nZZogl: ERROR: Need GL_EXT_framebuffer_object for multiple render targets\nZZogl: *********");
|
||||
bSuccess = false;
|
||||
}
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "targets.h"
|
||||
#include "ZZoglFlushHack.h"
|
||||
#include "ZZoglShaders.h"
|
||||
#include "ZZClut.h"
|
||||
#include <math.h>
|
||||
|
||||
//------------------ Defines
|
||||
|
@ -337,14 +338,9 @@ inline void VisualBufferMessage(int context)
|
|||
curvb.tex0.th, curvb.tex0.tcc, curvb.tex0.tfx, curvb.tex0.cbp,
|
||||
curvb.tex0.cpsm, curvb.tex0.csm, curvb.tex0.csa, curvb.tex0.cld);
|
||||
char* Name;
|
||||
// if (g_bSaveTex) {
|
||||
// if (g_bSaveTex == 1)
|
||||
Name = NamedSaveTex(&curvb.tex0, 1);
|
||||
// else
|
||||
// Name = NamedSaveTex(&curvb.tex0, 0);
|
||||
ZZLog::Error_Log("TGA name '%s'.", Name);
|
||||
free(Name);
|
||||
// }
|
||||
ZZLog::Debug_Log("buffer %ld.\n", BufferNumber);
|
||||
#endif
|
||||
}
|
||||
|
@ -730,57 +726,19 @@ inline void FlushDecodeClut(VB& curvb, GLuint& ptexclut)
|
|||
|
||||
if (ptexclut != 0)
|
||||
{
|
||||
|
||||
int nClutOffset = 0, clutsize;
|
||||
int clutsize;
|
||||
int entries = PSMT_IS8CLUT(curvb.tex0.psm) ? 256 : 16;
|
||||
|
||||
if (curvb.tex0.csm && curvb.tex0.csa)
|
||||
ZZLog::Debug_Log("ERROR, csm1.");
|
||||
|
||||
if (PSMT_IS32BIT(curvb.tex0.cpsm)) // 32 bit
|
||||
{
|
||||
nClutOffset = 64 * curvb.tex0.csa;
|
||||
if (PSMT_IS32BIT(curvb.tex0.cpsm)) {
|
||||
clutsize = min(entries, 256 - curvb.tex0.csa * 16) * 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
nClutOffset = 64 * (curvb.tex0.csa & 15) + (curvb.tex0.csa >= 16 ? 2 : 0);
|
||||
ClutBuffer_to_Array<u32>((u32*)&data[0], curvb.tex0.csa, clutsize);
|
||||
} else {
|
||||
clutsize = min(entries, 512 - curvb.tex0.csa * 16) * 2;
|
||||
}
|
||||
|
||||
if (PSMT_IS32BIT(curvb.tex0.cpsm)) // 32 bit
|
||||
{
|
||||
memcpy_amd(&data[0], g_pbyGSClut + nClutOffset, clutsize);
|
||||
}
|
||||
else
|
||||
{
|
||||
u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);
|
||||
u16* pclut = (u16*) & data[0];
|
||||
int left = ((u32)nClutOffset & 2) ? 0 : ((nClutOffset & 0x3ff) / 2) + clutsize - 512;
|
||||
|
||||
if (left > 0) clutsize -= left;
|
||||
|
||||
while (clutsize > 0)
|
||||
{
|
||||
pclut[0] = pClutBuffer[0];
|
||||
pclut++;
|
||||
pClutBuffer += 2;
|
||||
clutsize -= 2;
|
||||
}
|
||||
|
||||
if (left > 0)
|
||||
{
|
||||
pClutBuffer = (u16*)(g_pbyGSClut + 2);
|
||||
|
||||
while (left > 0)
|
||||
{
|
||||
pclut[0] = pClutBuffer[0];
|
||||
left -= 2;
|
||||
pClutBuffer += 2;
|
||||
pclut++;
|
||||
}
|
||||
}
|
||||
}
|
||||
ClutBuffer_to_Array<u16>((u16*)&data[0], curvb.tex0.csa, clutsize);
|
||||
}
|
||||
|
||||
GLenum tempType = PSMT_ISHALF_STORAGE(curvb.tex0) ? GL_UNSIGNED_SHORT_5_5_5_1 : GL_UNSIGNED_BYTE;
|
||||
Texture2D(4, 256, 1, GL_RGBA, tempType, &data[0]);
|
||||
|
@ -987,6 +945,7 @@ inline FRAGMENTSHADER* FlushMadeNewTarget(VB& curvb, int exactcolor, int context
|
|||
// save the texture
|
||||
if (g_bSaveTex)
|
||||
{
|
||||
// FIXME: I suspect one of g_bSaveTex test variable is wrong
|
||||
if (g_bSaveTex == 1)
|
||||
{
|
||||
SaveTex(&curvb.tex0, 1);
|
||||
|
|
|
@ -395,6 +395,8 @@ SaveTex(tex0Info* ptex, int usevid)
|
|||
glBindTexture(GL_TEXTURE_RECTANGLE_NV, pmemtarg->ptex->tex);
|
||||
srcdata.resize(4 * pmemtarg->texW * pmemtarg->texH);
|
||||
|
||||
// FIXME strangely this function call seem to crash pcsx2 on atelier of iris 1
|
||||
// Note: fmt is GL_UNSIGNED_SHORT_1_5_5_5_REV
|
||||
glGetTexImage(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA, pmemtarg->fmt, &srcdata[0]);
|
||||
|
||||
u32 offset = MemorySize(pmemtarg->realy);
|
||||
|
@ -613,6 +615,9 @@ SaveTex(tex0Info* ptex, int usevid)
|
|||
|
||||
snprintf(Name, TGA_FILE_NAME_MAX_LENGTH, "Tex.%d.tga", TexNumber);
|
||||
SaveTGA(Name, ptex->tw, ptex->th, &data[0]);
|
||||
|
||||
TexNumber++;
|
||||
if (TexNumber > MAX_NUMBER_SAVED_TGA) TexNumber = 0;
|
||||
}
|
||||
|
||||
|
||||
|
@ -621,13 +626,10 @@ SaveTex(tex0Info* ptex, int usevid)
|
|||
char* NamedSaveTex(tex0Info* ptex, int usevid)
|
||||
{
|
||||
SaveTex(ptex, usevid);
|
||||
|
||||
char* Name = (char*)malloc(TGA_FILE_NAME_MAX_LENGTH);
|
||||
snprintf(Name, TGA_FILE_NAME_MAX_LENGTH, "Tex.%d.tga", TexNumber);
|
||||
|
||||
TexNumber++;
|
||||
|
||||
if (TexNumber > MAX_NUMBER_SAVED_TGA) TexNumber = 0;
|
||||
|
||||
return Name;
|
||||
}
|
||||
|
||||
|
|
|
@ -475,6 +475,9 @@ void CRenderTarget::Update(int context, CRenderTarget* pdepth)
|
|||
texframe.tw = fbw;
|
||||
texframe.th = fbh;
|
||||
texframe.psm = psm;
|
||||
// FIXME some field are not initialized...
|
||||
// in particular the clut related one
|
||||
assert(!PSMT_ISCLUT(psm));
|
||||
|
||||
// write color and zero out stencil buf, always 0 context!
|
||||
// force bilinear if using AA
|
||||
|
@ -966,6 +969,9 @@ void CDepthTarget::Update(int context, CRenderTarget* prndr)
|
|||
texframe.tw = fbw;
|
||||
texframe.th = fbh;
|
||||
texframe.psm = psm;
|
||||
// FIXME some field are not initialized...
|
||||
// in particular the clut related one
|
||||
assert(!PSMT_ISCLUT(psm));
|
||||
|
||||
DisableAllgl();
|
||||
|
||||
|
@ -2017,96 +2023,93 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
|
|||
|
||||
assert(targ->clutsize > 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (tex0.psm == PSMT16Z || tex0.psm == PSMT16SZ)
|
||||
{
|
||||
ptexdata = (u8*)_aligned_malloc(4 * targ->texH * targ->texW, 16);
|
||||
has_data = true;
|
||||
else if (tex0.psm == PSMT16Z || tex0.psm == PSMT16SZ)
|
||||
{
|
||||
ptexdata = (u8*)_aligned_malloc(4 * targ->texH * targ->texW, 16);
|
||||
has_data = true;
|
||||
|
||||
// needs to be 8 bit, use xmm for unpacking
|
||||
u16* dst = (u16*)ptexdata;
|
||||
u16* src = (u16*)(MemoryAddress(targ->realy));
|
||||
// needs to be 8 bit, use xmm for unpacking
|
||||
u16* dst = (u16*)ptexdata;
|
||||
u16* src = (u16*)(MemoryAddress(targ->realy));
|
||||
|
||||
#ifdef ZEROGS_SSE2
|
||||
assert(((u32)(uptr)dst) % 16 == 0);
|
||||
// FIXME Uncomment to test intrinsic versions (instead of asm)
|
||||
// perf improvement vs asm:
|
||||
// 1/ gcc updates both pointer with 1 addition
|
||||
// 2/ Bypass the cache for the store
|
||||
assert(((u32)(uptr)dst) % 16 == 0);
|
||||
// FIXME Uncomment to test intrinsic versions (instead of asm)
|
||||
// perf improvement vs asm:
|
||||
// 1/ gcc updates both pointer with 1 addition
|
||||
// 2/ Bypass the cache for the store
|
||||
#define NEW_INTRINSIC_VERSION
|
||||
#ifdef NEW_INTRINSIC_VERSION
|
||||
|
||||
__m128i zero_128 = _mm_setzero_si128();
|
||||
// NOTE: future performance improvement
|
||||
// SSE4.1 support uncacheable load 128bits. Maybe it can
|
||||
// avoid some cache pollution
|
||||
// NOTE2: I create multiple _n variable to mimic the previous ASM behavior
|
||||
// but I'm not sure there are real gains.
|
||||
for (int i = targ->height * GPU_TEXWIDTH/16 ; i > 0 ; --i)
|
||||
{
|
||||
// Convert 16 bits pixels to 32bits (zero extended)
|
||||
// Batch 64 bytes (32 pixels) at once.
|
||||
__m128i pixels_1 = _mm_load_si128((__m128i*)src);
|
||||
__m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
|
||||
__m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
|
||||
__m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
|
||||
__m128i zero_128 = _mm_setzero_si128();
|
||||
// NOTE: future performance improvement
|
||||
// SSE4.1 support uncacheable load 128bits. Maybe it can
|
||||
// avoid some cache pollution
|
||||
// NOTE2: I create multiple _n variable to mimic the previous ASM behavior
|
||||
// but I'm not sure there are real gains.
|
||||
for (int i = targ->height * GPU_TEXWIDTH/16 ; i > 0 ; --i)
|
||||
{
|
||||
// Convert 16 bits pixels to 32bits (zero extended)
|
||||
// Batch 64 bytes (32 pixels) at once.
|
||||
__m128i pixels_1 = _mm_load_si128((__m128i*)src);
|
||||
__m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
|
||||
__m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
|
||||
__m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
|
||||
|
||||
__m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
|
||||
__m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
|
||||
__m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
|
||||
__m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
|
||||
__m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
|
||||
__m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
|
||||
__m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
|
||||
__m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
|
||||
|
||||
// Note: bypass cache
|
||||
_mm_stream_si128((__m128i*)dst, pix_low_1);
|
||||
_mm_stream_si128((__m128i*)(dst+8), pix_high_1);
|
||||
_mm_stream_si128((__m128i*)(dst+16), pix_low_2);
|
||||
_mm_stream_si128((__m128i*)(dst+24), pix_high_2);
|
||||
// Note: bypass cache
|
||||
_mm_stream_si128((__m128i*)dst, pix_low_1);
|
||||
_mm_stream_si128((__m128i*)(dst+8), pix_high_1);
|
||||
_mm_stream_si128((__m128i*)(dst+16), pix_low_2);
|
||||
_mm_stream_si128((__m128i*)(dst+24), pix_high_2);
|
||||
|
||||
__m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
|
||||
__m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
|
||||
__m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
|
||||
__m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
|
||||
__m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
|
||||
__m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
|
||||
__m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
|
||||
__m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
|
||||
|
||||
// Note: bypass cache
|
||||
_mm_stream_si128((__m128i*)(dst+32), pix_low_3);
|
||||
_mm_stream_si128((__m128i*)(dst+40), pix_high_3);
|
||||
_mm_stream_si128((__m128i*)(dst+48), pix_low_4);
|
||||
_mm_stream_si128((__m128i*)(dst+56), pix_high_4);
|
||||
// Note: bypass cache
|
||||
_mm_stream_si128((__m128i*)(dst+32), pix_low_3);
|
||||
_mm_stream_si128((__m128i*)(dst+40), pix_high_3);
|
||||
_mm_stream_si128((__m128i*)(dst+48), pix_low_4);
|
||||
_mm_stream_si128((__m128i*)(dst+56), pix_high_4);
|
||||
|
||||
src += 32;
|
||||
dst += 64;
|
||||
}
|
||||
// It is advise to use a fence instruction after non temporal move (mm_stream) instruction...
|
||||
// store fence insures that previous store are finish before execute new one.
|
||||
_mm_sfence();
|
||||
src += 32;
|
||||
dst += 64;
|
||||
}
|
||||
// It is advise to use a fence instruction after non temporal move (mm_stream) instruction...
|
||||
// store fence insures that previous store are finish before execute new one.
|
||||
_mm_sfence();
|
||||
#else
|
||||
SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
|
||||
SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
|
||||
#endif
|
||||
#else // ZEROGS_SSE2
|
||||
|
||||
for (int i = 0; i < targ->height; ++i)
|
||||
{
|
||||
for (int j = 0; j < GPU_TEXWIDTH; ++j)
|
||||
{
|
||||
dst[0] = src[0];
|
||||
dst[1] = 0;
|
||||
dst[2] = src[1];
|
||||
dst[3] = 0;
|
||||
dst += 4;
|
||||
src += 2;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < targ->height; ++i)
|
||||
{
|
||||
for (int j = 0; j < GPU_TEXWIDTH; ++j)
|
||||
{
|
||||
dst[0] = src[0];
|
||||
dst[1] = 0;
|
||||
dst[2] = src[1];
|
||||
dst[3] = 0;
|
||||
dst += 4;
|
||||
src += 2;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // ZEROGS_SSE2
|
||||
}
|
||||
else
|
||||
{
|
||||
ptexdata = targ->ptex->memptr;
|
||||
// We really don't want to deallocate memptr. As a reminder...
|
||||
has_data = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ptexdata = targ->ptex->memptr;
|
||||
// We really don't want to deallocate memptr. As a reminder...
|
||||
has_data = false;
|
||||
}
|
||||
|
||||
// create the texture
|
||||
GL_REPORT_ERRORD();
|
||||
|
|
Loading…
Reference in New Issue