/* ZZ Open GL graphics plugin * Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com * Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ //-------------------------- Includes #if defined(_WIN32) # include # include "resource.h" #endif #include #include "GS.h" #include "Mem.h" #include "x86.h" #include "zerogs.h" #include "targets.h" #include "GLWin.h" #include "ZZoglShaders.h" #ifdef ZEROGS_SSE2 #include #endif //----------------------- Defines //-------------------------- Typedefs typedef void (APIENTRYP _PFNSWAPINTERVAL)(int); //-------------------------- Extern variables extern u32 g_nGenVars, g_nTexVars, g_nAlphaVars, g_nResolve; extern char *libraryName; extern int g_nFrame, g_nRealFrame; //extern int s_nFullscreen; //-------------------------- Variables primInfo *prim; inline u32 FtoDW(float f) { return (*((u32*)&f)); } int g_nDepthUpdateCount = 0; // Consts const GLenum primtype[8] = { GL_POINTS, GL_LINES, GL_LINES, GL_TRIANGLES, GL_TRIANGLES, GL_TRIANGLES, GL_TRIANGLES, 0xffffffff }; static const int PRIMMASK = 0x0e; // for now ignore 0x10 (AA) PFNGLISRENDERBUFFEREXTPROC glIsRenderbufferEXT = NULL; PFNGLBINDRENDERBUFFEREXTPROC glBindRenderbufferEXT = NULL; PFNGLDELETERENDERBUFFERSEXTPROC glDeleteRenderbuffersEXT = NULL; PFNGLGENRENDERBUFFERSEXTPROC glGenRenderbuffersEXT = NULL; PFNGLRENDERBUFFERSTORAGEEXTPROC glRenderbufferStorageEXT = NULL; PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC glGetRenderbufferParameterivEXT = NULL; PFNGLISFRAMEBUFFEREXTPROC glIsFramebufferEXT = NULL; PFNGLBINDFRAMEBUFFEREXTPROC glBindFramebufferEXT = NULL; PFNGLDELETEFRAMEBUFFERSEXTPROC glDeleteFramebuffersEXT = NULL; PFNGLGENFRAMEBUFFERSEXTPROC glGenFramebuffersEXT = NULL; PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC glCheckFramebufferStatusEXT = NULL; PFNGLFRAMEBUFFERTEXTURE1DEXTPROC glFramebufferTexture1DEXT = NULL; PFNGLFRAMEBUFFERTEXTURE2DEXTPROC glFramebufferTexture2DEXT = NULL; PFNGLFRAMEBUFFERTEXTURE3DEXTPROC glFramebufferTexture3DEXT = NULL; PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC glFramebufferRenderbufferEXT = NULL; PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC glGetFramebufferAttachmentParameterivEXT = NULL; PFNGLGENERATEMIPMAPEXTPROC glGenerateMipmapEXT = NULL; PFNGLDRAWBUFFERSPROC glDrawBuffers = NULL; ///////////////////// // graphics resources bool s_bTexFlush = false; int s_nLastResolveReset = 0; int s_nResolveCounts[30] = {0}; // resolve counts for last 30 frames //////////////////// // State parameters int nBackbufferWidth, nBackbufferHeight; // ZZ // = float4( 255.0 /256.0f, 255.0/65536.0f, 255.0f/(65535.0f*256.0f), 1.0f/(65536.0f*65536.0f)); // float4 g_vdepth = float4( 65536.0f*65536.0f, 256.0f*65536.0f, 65536.0f, 256.0f); extern CRangeManager s_RangeMngr; // manages overwritten memory int s_nNewWidth = -1, s_nNewHeight = -1; void ChangeDeviceSize(int nNewWidth, int nNewHeight); void ProcessMessages(); void RenderCustom(float fAlpha); // intro anim /////////////////////// // Method Prototypes // /////////////////////// void KickPoint(); void KickLine(); void KickTriangle(); void KickTriangleFan(); void KickSprite(); void KickDummy(); void ResolveInRange(int start, int end); void ExtWrite(); void ResetRenderTarget(int index) { FBTexture(index); } DrawFn drawfn[8] = { KickDummy, KickDummy, KickDummy, KickDummy, KickDummy, KickDummy, KickDummy, KickDummy }; // does one time only initializing/destruction class ZeroGSInit { public: ZeroGSInit() { const u32 mem_size = MEMORY_END + 0x10000; // leave some room for out of range accesses (saves on the checks) // clear g_pbyGSMemory = (u8*)_aligned_malloc(mem_size, 1024); memset(g_pbyGSMemory, 0, mem_size); g_pbyGSClut = (u8*)_aligned_malloc(256 * 8, 1024); // need 512 alignment! memset(g_pbyGSClut, 0, 256*8); memset(&GLWin, 0, sizeof(GLWin)); } ~ZeroGSInit() { _aligned_free(g_pbyGSMemory); g_pbyGSMemory = NULL; _aligned_free(g_pbyGSClut); g_pbyGSClut = NULL; } }; static ZeroGSInit s_ZeroGSInit; #ifndef GL_FRAMEBUFFER_INCOMPLETE_DUPLICATE_ATTACHMENT_EXT #define GL_FRAMEBUFFER_INCOMPLETE_DUPLICATE_ATTACHMENT_EXT 0x8CD8 #endif void HandleGLError() { FUNCLOG // check the error status of this framebuffer */ GLenum error = glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT); // if error != GL_FRAMEBUFFER_COMPLETE_EXT, there's an error of some sort if (error != 0) { int w = 0; int h = 0; GLint fmt; glGetRenderbufferParameterivEXT(GL_COLOR_ATTACHMENT0_EXT, GL_RENDERBUFFER_INTERNAL_FORMAT_EXT, &fmt); glGetRenderbufferParameterivEXT(GL_COLOR_ATTACHMENT0_EXT, GL_RENDERBUFFER_WIDTH_EXT, &w); glGetRenderbufferParameterivEXT(GL_COLOR_ATTACHMENT0_EXT, GL_RENDERBUFFER_HEIGHT_EXT, &h); switch (error) { case GL_FRAMEBUFFER_COMPLETE_EXT: break; case GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT_EXT: ZZLog::Error_Log("Error! missing a required image/buffer attachment!"); break; case GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT: ZZLog::Error_Log("Error! has no images/buffers attached!"); break; // case GL_FRAMEBUFFER_INCOMPLETE_DUPLICATE_ATTACHMENT_EXT: // ZZLog::Error_Log("Error! has an image/buffer attached in multiple locations!"); // break; case GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_EXT: ZZLog::Error_Log("Error! has mismatched image/buffer dimensions!"); break; case GL_FRAMEBUFFER_INCOMPLETE_FORMATS_EXT: ZZLog::Error_Log("Error! colorbuffer attachments have different types!"); break; case GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER_EXT: ZZLog::Error_Log("Error! trying to draw to non-attached color buffer!"); break; case GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER_EXT: ZZLog::Error_Log("Error! trying to read from a non-attached color buffer!"); break; case GL_FRAMEBUFFER_UNSUPPORTED_EXT: ZZLog::Error_Log("Error! format is not supported by current graphics card/driver!"); break; default: ZZLog::Error_Log("*UNKNOWN ERROR* reported from glCheckFramebufferStatusEXT(0x%x)!", error); break; } } } void ZZGSStateReset() { FUNCLOG icurctx = -1; for (int i = 0; i < 2; ++i) { vb[i].Destroy(); memset(&vb[i], 0, sizeof(VB)); vb[i].tex0.tw = 1; vb[i].tex0.th = 1; vb[i].scissor.x1 = 639; vb[i].scissor.y1 = 479; vb[i].tex0.tbw = 64; vb[i].Init(VB_BUFFERSIZE); } s_RangeMngr.Clear(); g_MemTargs.Destroy(); s_RTs.Destroy(); s_DepthRTs.Destroy(); s_BitwiseTextures.Destroy(); vb[0].ictx = 0; vb[1].ictx = 1; } void ZZReset() { FUNCLOG s_RTs.ResolveAll(); s_DepthRTs.ResolveAll(); vb[0].nCount = 0; vb[1].nCount = 0; memset(s_nResolveCounts, 0, sizeof(s_nResolveCounts)); s_nLastResolveReset = 0; icurctx = -1; g_vsprog = g_psprog = 0; ZZGSStateReset(); ZZDestroy(0); drawfn[0] = KickDummy; drawfn[1] = KickDummy; drawfn[2] = KickDummy; drawfn[3] = KickDummy; drawfn[4] = KickDummy; drawfn[5] = KickDummy; drawfn[6] = KickDummy; drawfn[7] = KickDummy; } void ZZGSReset() { FUNCLOG memset(&gs, 0, sizeof(gs)); ZZGSStateReset(); gs.prac = 1; prim = &gs._prim[0]; gs.nTriFanVert = -1; gs.imageTransfer = -1; gs.q = 1; } void ZZGSSoftReset(u32 mask) { FUNCLOG if (mask & 1) memset(&gs.path[0], 0, sizeof(gs.path[0])); if (mask & 2) memset(&gs.path[1], 0, sizeof(gs.path[1])); if (mask & 4) memset(&gs.path[2], 0, sizeof(gs.path[2])); gs.imageTransfer = -1; gs.q = 1; gs.nTriFanVert = -1; } void ChangeWindowSize(int nNewWidth, int nNewHeight) { FUNCLOG nBackbufferWidth = max(nNewWidth, 16); nBackbufferHeight = max(nNewHeight, 16); if (!(conf.fullscreen())) { conf.width = nNewWidth; conf.height = nNewHeight; } } void SetChangeDeviceSize(int nNewWidth, int nNewHeight) { FUNCLOG s_nNewWidth = nNewWidth; s_nNewHeight = nNewHeight; if (!(conf.fullscreen())) { conf.width = nNewWidth; conf.height = nNewHeight; } } void ChangeDeviceSize(int nNewWidth, int nNewHeight) { FUNCLOG //int oldscreen = s_nFullscreen; int oldwidth = nBackbufferWidth, oldheight = nBackbufferHeight; if (!ZZCreate(nNewWidth&~7, nNewHeight&~7)) { ZZLog::Error_Log("Failed to recreate, changing to old device."); if (ZZCreate(oldwidth, oldheight)) { SysMessage("Failed to create device, exiting..."); exit(0); } } for (int i = 0; i < 2; ++i) { vb[i].bNeedFrameCheck = vb[i].bNeedZCheck = 1; vb[i].CheckFrame(0); } assert(vb[0].pBufferData != NULL && vb[1].pBufferData != NULL); } void SetAA(int mode) { FUNCLOG float f = 1.0f; // need to flush all targets s_RTs.ResolveAll(); s_RTs.Destroy(); s_DepthRTs.ResolveAll(); s_DepthRTs.Destroy(); AA.x = AA.y = 0; // This is code for x0, x2, x4, x8 and x16 anti-aliasing. if (mode > 0) { // ( 1, 0 ) ; ( 1, 1 ) ; ( 2, 1 ) ; ( 2, 2 ) // it's used as a binary shift, so x >> AA.x, y >> AA.y AA.x = (mode + 1) / 2; AA.y = mode / 2; f = 2.0f; } memset(s_nResolveCounts, 0, sizeof(s_nResolveCounts)); s_nLastResolveReset = 0; vb[0].prndr = NULL; vb[0].pdepth = NULL; vb[1].prndr = NULL; vb[1].pdepth = NULL; vb[0].bNeedFrameCheck = vb[0].bNeedZCheck = 1; vb[1].bNeedFrameCheck = vb[1].bNeedZCheck = 1; glPointSize(f); } void Prim() { FUNCLOG VB& curvb = vb[prim->ctxt]; if (curvb.CheckPrim()) Flush(prim->ctxt); curvb.curprim._val = prim->_val; curvb.curprim.prim = prim->prim; } extern u32 ptexLogo; extern int nLogoWidth, nLogoHeight; void RenderCustom(float fAlpha) { FUNCLOG GL_REPORT_ERROR(); fAlpha = 1; glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); // switch to the backbuffer DisableAllgl() ; SetShaderCaller("RenderCustom"); glViewport(0, 0, nBackbufferWidth, nBackbufferHeight); // play custom animation glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT); // tex coords float4 v = float4(1 / 32767.0f, 1 / 32767.0f, 0, 0); ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltPos, v, "g_fBitBltPos"); v.x = (float)nLogoWidth; v.y = (float)nLogoHeight; ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_fBitBltTex"); v.x = v.y = v.z = v.w = fAlpha; ZZshSetParameter4fv(ppsBaseTexture.prog, ppsBaseTexture.sOneColor, v, "g_fOneColor"); if (conf.wireframe()) glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); // inside vhDCb[0]'s target area, so render that region only ZZshGLSetTextureParameter(ppsBaseTexture.prog, ppsBaseTexture.sFinal, ptexLogo, "Logo"); glBindBuffer(GL_ARRAY_BUFFER, vboRect); SET_STREAM(); ZZshSetVertexShader(pvsBitBlt.prog); ZZshSetPixelShader(ppsBaseTexture.prog); DrawTriangleArray(); // restore if (conf.wireframe()) glPolygonMode(GL_FRONT_AND_BACK, GL_LINE); ProcessMessages(); GLWin.SwapGLBuffers(); glEnable(GL_SCISSOR_TEST); glEnable(GL_STENCIL_TEST); vb[0].bSyncVars = 0; vb[1].bSyncVars = 0; GL_REPORT_ERROR(); } ////////////////////////// // Internal Definitions // ////////////////////////// __forceinline void MOVZ(VertexGPU *p, u32 gsz, const VB& curvb) { p->z = (curvb.zprimmask == 0xffff) ? min((u32)0xffff, gsz) : gsz; } __forceinline void MOVFOG(VertexGPU *p, Vertex gsf) { p->f = ((s16)(gsf).f << 7) | 0x7f; } int Values[100] = {0, }; inline void SET_VERTEX(VertexGPU *p, int Index, const VB& curvb) { int index = Index; p->x = ((((int)gs.gsvertex[index].x - curvb.offset.x) >> 1) & 0xffff); p->y = ((((int)gs.gsvertex[index].y - curvb.offset.y) >> 1) & 0xffff); p->f = ((s16)gs.gsvertex[index].f << 7) | 0x7f; MOVZ(p, gs.gsvertex[index].z, curvb); p->rgba = prim->iip ? gs.gsvertex[index].rgba : gs.rgba; // This code is somehow incorrect // if ((gs.texa.aem) && ((p->rgba & 0xffffff ) == 0)) // p->rgba = 0; if (conf.settings().texa) { u32 B = ((p->rgba & 0xfe000000) >> 1) + (0x01000000 * curvb.fba.fba) ; p->rgba = (p->rgba & 0xffffff) + B; } if (prim->tme) { if (prim->fst) { p->s = (float)gs.gsvertex[index].u * fiTexWidth[prim->ctxt]; p->t = (float)gs.gsvertex[index].v * fiTexHeight[prim->ctxt]; p->q = 1; } else { p->s = gs.gsvertex[index].s; p->t = gs.gsvertex[index].t; p->q = gs.gsvertex[index].q; } } } static __forceinline void OUTPUT_VERT(VertexGPU vert, u32 id) { #ifdef WRITE_PRIM_LOGS ZZLog::Prim_Log("%c%d(%d): xyzf=(%4d,%4d,0x%x,%3d), rgba=0x%8.8x, stq = (%2.5f,%2.5f,%2.5f)\n", id == 0 ? '*' : ' ', id, prim->prim, vert.x / 8, vert.y / 8, vert.z, vert.f / 128, vert.rgba, Clamp(vert.s, -10, 10), Clamp(vert.t, -10, 10), Clamp(vert.q, -10, 10)); #endif } void KickPoint() { FUNCLOG assert(gs.primC >= 1); VB& curvb = vb[prim->ctxt]; curvb.FlushTexData(); if ((vb[!prim->ctxt].nCount > 0) && (vb[prim->ctxt].gsfb.fbp == vb[!prim->ctxt].gsfb.fbp)) { assert(vb[prim->ctxt].nCount == 0); Flush(!prim->ctxt); } curvb.NotifyWrite(1); int last = gs.primNext(2); VertexGPU* p = curvb.pBufferData + curvb.nCount; SET_VERTEX(&p[0], last, curvb); curvb.nCount++; OUTPUT_VERT(p[0], 0); } void KickLine() { FUNCLOG assert(gs.primC >= 2); VB& curvb = vb[prim->ctxt]; curvb.FlushTexData(); if ((vb[!prim->ctxt].nCount > 0) && (vb[prim->ctxt].gsfb.fbp == vb[!prim->ctxt].gsfb.fbp)) { assert(vb[prim->ctxt].nCount == 0); Flush(!prim->ctxt); } curvb.NotifyWrite(2); int next = gs.primNext(); int last = gs.primNext(2); VertexGPU* p = curvb.pBufferData + curvb.nCount; SET_VERTEX(&p[0], next, curvb); SET_VERTEX(&p[1], last, curvb); curvb.nCount += 2; OUTPUT_VERT(p[0], 0); OUTPUT_VERT(p[1], 1); } void KickTriangle() { FUNCLOG assert(gs.primC >= 3); VB& curvb = vb[prim->ctxt]; curvb.FlushTexData(); if ((vb[!prim->ctxt].nCount > 0) && (vb[prim->ctxt].gsfb.fbp == vb[!prim->ctxt].gsfb.fbp)) { assert(vb[prim->ctxt].nCount == 0); Flush(!prim->ctxt); } curvb.NotifyWrite(3); VertexGPU* p = curvb.pBufferData + curvb.nCount; SET_VERTEX(&p[0], 0, curvb); SET_VERTEX(&p[1], 1, curvb); SET_VERTEX(&p[2], 2, curvb); curvb.nCount += 3; OUTPUT_VERT(p[0], 0); OUTPUT_VERT(p[1], 1); OUTPUT_VERT(p[2], 2); } void KickTriangleFan() { FUNCLOG assert(gs.primC >= 3); VB& curvb = vb[prim->ctxt]; curvb.FlushTexData(); if ((vb[!prim->ctxt].nCount > 0) && (vb[prim->ctxt].gsfb.fbp == vb[!prim->ctxt].gsfb.fbp)) { assert(vb[prim->ctxt].nCount == 0); Flush(!prim->ctxt); } curvb.NotifyWrite(3); VertexGPU* p = curvb.pBufferData + curvb.nCount; SET_VERTEX(&p[0], 0, curvb); SET_VERTEX(&p[1], 1, curvb); SET_VERTEX(&p[2], 2, curvb); curvb.nCount += 3; // add 1 to skip the first vertex if (gs.primIndex == gs.nTriFanVert) gs.primIndex = gs.primNext(); OUTPUT_VERT(p[0], 0); OUTPUT_VERT(p[1], 1); OUTPUT_VERT(p[2], 2); } void SetKickVertex(VertexGPU *p, Vertex v, int next, const VB& curvb) { SET_VERTEX(p, next, curvb); MOVZ(p, v.z, curvb); MOVFOG(p, v); } void KickSprite() { FUNCLOG assert(gs.primC >= 2); VB& curvb = vb[prim->ctxt]; curvb.FlushTexData(); if ((vb[!prim->ctxt].nCount > 0) && (vb[prim->ctxt].gsfb.fbp == vb[!prim->ctxt].gsfb.fbp)) { assert(vb[prim->ctxt].nCount == 0); Flush(!prim->ctxt); } curvb.NotifyWrite(6); int next = gs.primNext(); int last = gs.primNext(2); // sprite is too small and AA shows lines (tek4, Mana Khemia) gs.gsvertex[last].x += (4 * AA.x); gs.gsvertex[last].y += (4 * AA.y); // might be bad sprite (KH dialog text) //if( gs.gsvertex[next].x == gs.gsvertex[last].x || gs.gsvertex[next].y == gs.gsvertex[last].y ) //return; VertexGPU* p = curvb.pBufferData + curvb.nCount; SetKickVertex(&p[0], gs.gsvertex[last], next, curvb); SetKickVertex(&p[3], gs.gsvertex[last], next, curvb); SetKickVertex(&p[1], gs.gsvertex[last], last, curvb); SetKickVertex(&p[4], gs.gsvertex[last], last, curvb); SetKickVertex(&p[2], gs.gsvertex[last], next, curvb); p[2].s = p[1].s; p[2].x = p[1].x; SetKickVertex(&p[5], gs.gsvertex[last], last, curvb); p[5].s = p[0].s; p[5].x = p[0].x; curvb.nCount += 6; OUTPUT_VERT(p[0], 0); OUTPUT_VERT(p[1], 1); } void KickDummy() { FUNCLOG //ZZLog::Greg_Log("Kicking bad primitive: %.8x\n", *(u32*)prim); } void SetFogColor(u32 fog) { FUNCLOG // Always set the fog color, even if it was already set. // if (gs.fogcol != fog) // { gs.fogcol = fog; FlushBoth(); SetShaderCaller("SetFogColor"); float4 v; // set it immediately v.SetColor(gs.fogcol); ZZshSetParameter4fv(g_fparamFogColor, v, "g_fParamFogColor"); // } } void SetFogColor(GIFRegFOGCOL* fog) { FUNCLOG SetShaderCaller("SetFogColor"); float4 v; v.x = fog->FCR / 255.0f; v.y = fog->FCG / 255.0f; v.z = fog->FCB / 255.0f; ZZshSetParameter4fv(g_fparamFogColor, v, "g_fParamFogColor"); } void ExtWrite() { FUNCLOG ZZLog::Warn_Log("A hollow voice says 'EXTWRITE'! Nothing happens."); // use local DISPFB, EXTDATA, EXTBUF, and PMODE // int bpp, start, end; // tex0Info texframe; // bpp = 4; // if( texframe.psm == PSMT16S ) bpp = 3; // else if (PSMT_ISHALF(texframe.psm)) bpp = 2; // // // get the start and end addresses of the buffer // GetRectMemAddress(start, end, texframe.psm, 0, 0, texframe.tw, texframe.th, texframe.tbp0, texframe.tbw); } //////////// // Caches // //////////// // case 0: return false; // case 1: break; // case 2: m_CBP[0] = TEX0.CBP; break; // case 3: m_CBP[1] = TEX0.CBP; break; // case 4: if(m_CBP[0] == TEX0.CBP) return false; m_CBP[0] = TEX0.CBP; break; // case 5: if(m_CBP[1] == TEX0.CBP) return false; m_CBP[1] = TEX0.CBP; break; // case 6: ASSERT(0); return false; // ffx2 menu // case 7: ASSERT(0); return false; // default: __assume(0); bool IsDirty(u32 highdword, u32 psm, int cld, int cbp) { int cpsm = ZZOglGet_cpsm_TexBits(highdword); int csm = ZZOglGet_csm_TexBits(highdword); if (cpsm > 1 || csm) { // Mana Khemia triggers this. //ZZLog::Error_Log("16 bit clut not supported."); return true; } int csa = ZZOglGet_csa_TexBits(highdword); int entries = PSMT_IS8CLUT(psm) ? 256 : 16; u64* src = (u64*)(g_pbyGSMemory + cbp * 256); u64* dst = (u64*)(g_pbyGSClut + 64 * csa); bool bRet = false; #define TEST_THIS #ifdef TEST_THIS while(entries != 0) { #ifdef ZEROGS_SSE2 // Note: local memory datas are swizzles __m128i src_0 = _mm_load_si128((__m128i*)src); // 9 8 1 0 __m128i src_1 = _mm_load_si128((__m128i*)src+1); // 11 10 3 2 __m128i src_2 = _mm_load_si128((__m128i*)src+2); // 13 12 5 4 __m128i src_3 = _mm_load_si128((__m128i*)src+3); // 15 14 7 6 __m128i dst_0 = _mm_load_si128((__m128i*)dst); __m128i dst_1 = _mm_load_si128((__m128i*)dst+1); __m128i dst_2 = _mm_load_si128((__m128i*)dst+2); __m128i dst_3 = _mm_load_si128((__m128i*)dst+3); __m128i result = _mm_cmpeq_epi32(_mm_unpacklo_epi64(src_0, src_1), dst_0); __m128i result_tmp = _mm_cmpeq_epi32(_mm_unpacklo_epi64(src_2, src_3), dst_1); result = _mm_and_si128(result, result_tmp); result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(src_0, src_1), dst_2); result = _mm_and_si128(result, result_tmp); result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(src_2, src_3), dst_3); result = _mm_and_si128(result, result_tmp); u32 result_int = _mm_movemask_epi8(result); if (result_int != 0xFFFF) { bRet = true; break; } #else // I see no point to keep an mmx version. SSE2 versions is probably faster. // Keep a slow portable C version for reference/debug // Note: local memory datas are swizzles if (dst[0] != src[0] || dst[1] != src[2] || dst[2] != src[4] || dst[3] != src[6] || dst[4] != src[1] || dst[5] != src[3] || dst[6] != src[5] || dst[7] != src[7]) { bRet = true; break; } #endif // go to the next memory block src += 32; // go back to the previous memory block then down one memory column if (entries & 0x10) { src -= (64-8); } // In case previous operation (down one column) cross the block boundary // Go to the next block if (entries == 0x90) { src += 32; } dst += 8; entries -= 16; } #else // do a fast test with MMX #ifdef _MSC_VER int storeebx; __asm { mov storeebx, ebx mov edx, dst mov ecx, src mov ebx, entries Start: movq mm0, [edx] movq mm1, [edx+8] pcmpeqd mm0, [ecx] pcmpeqd mm1, [ecx+16] movq mm2, [edx+16] movq mm3, [edx+24] pcmpeqd mm2, [ecx+32] pcmpeqd mm3, [ecx+48] pand mm0, mm1 pand mm2, mm3 movq mm4, [edx+32] movq mm5, [edx+40] pcmpeqd mm4, [ecx+8] pcmpeqd mm5, [ecx+24] pand mm0, mm2 pand mm4, mm5 movq mm6, [edx+48] movq mm7, [edx+56] pcmpeqd mm6, [ecx+40] pcmpeqd mm7, [ecx+56] pand mm0, mm4 pand mm6, mm7 pand mm0, mm6 pmovmskb eax, mm0 cmp eax, 0xff je Continue mov bRet, 1 jmp Return Continue: cmp ebx, 16 jle Return test ebx, 0x10 jz AddEcx sub ecx, 448 // go back and down one column, AddEcx: add ecx, 256 // go to the right block jne Continue1 add ecx, 256 // skip whole block Continue1: add edx, 64 sub ebx, 16 jmp Start Return: emms mov ebx, storeebx } #else // linux // do a fast test with MMX __asm__( ".intel_syntax\n" "Start:\n" "movq %%mm0, [%%ecx]\n" "movq %%mm1, [%%ecx+8]\n" "pcmpeqd %%mm0, [%%edx]\n" "pcmpeqd %%mm1, [%%edx+16]\n" "movq %%mm2, [%%ecx+16]\n" "movq %%mm3, [%%ecx+24]\n" "pcmpeqd %%mm2, [%%edx+32]\n" "pcmpeqd %%mm3, [%%edx+48]\n" "pand %%mm0, %%mm1\n" "pand %%mm2, %%mm3\n" "movq %%mm4, [%%ecx+32]\n" "movq %%mm5, [%%ecx+40]\n" "pcmpeqd %%mm4, [%%edx+8]\n" "pcmpeqd %%mm5, [%%edx+24]\n" "pand %%mm0, %%mm2\n" "pand %%mm4, %%mm5\n" "movq %%mm6, [%%ecx+48]\n" "movq %%mm7, [%%ecx+56]\n" "pcmpeqd %%mm6, [%%edx+40]\n" "pcmpeqd %%mm7, [%%edx+56]\n" "pand %%mm0, %%mm4\n" "pand %%mm6, %%mm7\n" "pand %%mm0, %%mm6\n" "pmovmskb %%eax, %%mm0\n" "cmp %%eax, 0xff\n" "je Continue\n" ".att_syntax\n" "movb $1, %0\n" ".intel_syntax\n" "jmp Return\n" "Continue:\n" "cmp %%esi, 16\n" "jle Return\n" "test %%esi, 0x10\n" "jz AddEcx\n" "sub %%edx, 448\n" // go back and down one column "AddEcx:\n" "add %%edx, 256\n" // go to the right block "cmp %%esi, 0x90\n" "jne Continue1\n" "add %%edx, 256\n" // skip whole block "Continue1:\n" "add %%ecx, 64\n" "sub %%esi, 16\n" "jmp Start\n" "Return:\n" "emms\n" ".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "S"(entries) : "eax", "memory"); #endif // _WIN32 #endif return bRet; } // cld state: // 000 - clut data is not loaded; data in the temp buffer is stored // 001 - clut data is always loaded. // 010 - clut data is always loaded; cbp0 = cbp. // 011 - clut data is always loadedl cbp1 = cbp. // 100 - cbp0 is compared with cbp. if different, clut data is loaded. // 101 - cbp1 is compared with cbp. if different, clut data is loaded. // GSdx sets cbp0 & cbp1 when checking for clut changes. ZeroGS sets them in texClutWrite. bool CheckChangeInClut(u32 highdword, u32 psm) { FUNCLOG int cld = ZZOglGet_cld_TexBits(highdword); int cbp = ZZOglGet_cbp_TexBits(highdword); // processing the CLUT after tex0/2 are written //ZZLog::Error_Log("high == 0x%x; cld == %d", highdword, cld); switch (cld) { case 0: return false; case 1: break; case 2: break; case 3: break; case 4: if (gs.cbp[0] == cbp) return false; break; case 5: if (gs.cbp[1] == cbp) return false; break; //case 4: return gs.cbp[0] != cbp; //case 5: return gs.cbp[1] != cbp; // default: load default: break; } return IsDirty(highdword, psm, cld, cbp); } void texClutWrite(int ctx) { FUNCLOG s_bTexFlush = false; tex0Info& tex0 = vb[ctx].tex0; assert(PSMT_ISCLUT(tex0.psm)); // processing the CLUT after tex0/2 are written switch (tex0.cld) { case 0: return; case 1: break; // tex0.cld is usually 1. case 2: gs.cbp[0] = tex0.cbp; break; case 3: gs.cbp[1] = tex0.cbp; break; case 4: if (gs.cbp[0] == tex0.cbp) return; gs.cbp[0] = tex0.cbp; break; case 5: if (gs.cbp[1] == tex0.cbp) return; gs.cbp[1] = tex0.cbp; break; default: //ZZLog::Debug_Log("cld isn't 0-5!"); break; } Flush(!ctx); int entries = PSMT_IS8CLUT(tex0.psm) ? 256 : 16; if (tex0.csm) { switch (tex0.cpsm) { // 16bit psm // eggomania uses non16bit textures for csm2 case PSMCT16: { u16* src = (u16*)g_pbyGSMemory + tex0.cbp * 128; u16 *dst = (u16*)(g_pbyGSClut + 64 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0)); for (int i = 0; i < entries; ++i) { *dst = src[getPixelAddress16_0(gs.clut.cou+i, gs.clut.cov, gs.clut.cbw)]; dst += 2; // check for wrapping if (((u32)(uptr)dst & 0x3ff) == 0) dst = (u16*)(g_pbyGSClut + 2); } break; } case PSMCT16S: { u16* src = (u16*)g_pbyGSMemory + tex0.cbp * 128; u16 *dst = (u16*)(g_pbyGSClut + 64 * (tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0)); for (int i = 0; i < entries; ++i) { *dst = src[getPixelAddress16S_0(gs.clut.cou+i, gs.clut.cov, gs.clut.cbw)]; dst += 2; // check for wrapping if (((u32)(uptr)dst & 0x3ff) == 0) dst = (u16*)(g_pbyGSClut + 2); } break; } case PSMCT32: case PSMCT24: { u32* src = (u32*)g_pbyGSMemory + tex0.cbp * 64; u32 *dst = (u32*)(g_pbyGSClut + 64 * tex0.csa); // check if address exceeds src if (src + getPixelAddress32_0(gs.clut.cou + entries - 1, gs.clut.cov, gs.clut.cbw) >= (u32*)g_pbyGSMemory + 0x00100000) ZZLog::Error_Log("texClutWrite out of bounds."); else for (int i = 0; i < entries; ++i) { *dst = src[getPixelAddress32_0(gs.clut.cou+i, gs.clut.cov, gs.clut.cbw)]; dst++; } break; } default: { //ZZLog::Debug_Log("Unknown cpsm: %x (%x).", tex0.cpsm, tex0.psm); break; } } } else { u32* src = (u32*)(g_pbyGSMemory + 256 * tex0.cbp); if (entries == 16) { switch (tex0.cpsm) { case PSMCT24: case PSMCT32: WriteCLUT_T32_I4_CSM1(src, (u32*)(g_pbyGSClut + 64 * tex0.csa)); break; default: #ifdef ZEROGS_SSE2 WriteCLUT_T16_I4_CSM1_sse2(src, tex0.csa); #else WriteCLUT_T16_I4_CSM1_c(src, (u32*)(g_pbyGSClut + 64*(tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0))); #endif break; } } else { switch (tex0.cpsm) { case PSMCT24: case PSMCT32: WriteCLUT_T32_I8_CSM1(src, (u32*)(g_pbyGSClut + 64 * tex0.csa)); break; default: // sse2 for 256 is more complicated, so use regular #ifdef ZEROGS_SSE2 WriteCLUT_T16_I8_CSM1_sse2(src, tex0.csa); #else WriteCLUT_T16_I8_CSM1_c(src, (u32*)(g_pbyGSClut + 64*(tex0.csa & 15) + (tex0.csa >= 16 ? 2 : 0))); #endif break; } } } }