/* ZZ Open GL graphics plugin
 * Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
 * Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include <stdlib.h>
#include <string.h>

#include "GS.h"
#include "Mem.h"
#include "targets.h"
#include "ZZClut.h"
#include "Util.h"

extern int g_TransferredToGPU;
extern int VALIDATE_THRESH;
extern u32 TEXDESTROY_THRESH;

#define FORCE_TEXDESTROY_THRESH (3) // destroy texture after FORCE_TEXDESTROY_THRESH frames

void CMemoryTargetMngr::Destroy()
{
	FUNCLOG
	listTargets.clear();
	listClearedTargets.clear();
}

bool CMemoryTarget::ValidateTex(const tex0Info& tex0, int starttex, int endtex, bool bDeleteBadTex)
{
	FUNCLOG

	if (clearmaxy == 0) return true;

	int checkstarty = max(starttex, clearminy);
	int checkendy = min(endtex, clearmaxy);

	if (checkstarty >= checkendy) return true;

	if (validatecount++ > VALIDATE_THRESH)
	{
		height = 0;
		return false;
	}

	// lock and compare
	assert(ptex != NULL && ptex->memptr != NULL);

	int result = memcmp_mmx(ptex->memptr + MemorySize(checkstarty - realy), MemoryAddress(checkstarty), MemorySize(checkendy - checkstarty));

	if (result == 0)
	{
		clearmaxy = 0;
		return true;
	}

	if (!bDeleteBadTex) return false;

	// delete the clearminy..clearmaxy range (not the checkstarty..checkendy range)
	//int newstarty = 0;
	if (clearminy <= starty)
	{
		if (clearmaxy < starty + height)
		{
			// preserve the end
			height = starty + height - clearmaxy;
			starty = clearmaxy;
			assert(height > 0);
		}
		else
		{
			// destroy
			height = 0;
		}
	}
	else
	{
		// the beginning can be preserved
		height = clearminy - starty;
	}

	clearmaxy = 0;

	assert((starty >= realy) && ((starty + height) <= (realy + realheight)));

	return false;
}
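// NOTE: illustrative sketch of the trimming logic above; TrimSpan is a
// hypothetical standalone model, not a function of this plugin. The trimming
// is one-dimensional interval subtraction: a CMemoryTarget stores a single
// contiguous y-span, so when the dirty range [clearminy, clearmaxy) overlaps
// the valid range [starty, starty + height), only one side can survive.
//
//	static void TrimSpan(int& starty, int& height, int clearminy, int clearmaxy)
//	{
//		if (clearminy <= starty)
//		{
//			if (clearmaxy < starty + height)
//			{
//				height = starty + height - clearmaxy;	// keep the tail
//				starty = clearmaxy;
//			}
//			else
//				height = 0;	// fully covered: destroy
//		}
//		else
//			height = clearminy - starty;	// keep the head
//	}
//
// E.g. starty = 10, height = 20: dirty [5, 15) keeps rows [15, 30);
// dirty [15, 25) keeps rows [10, 15); dirty [5, 35) destroys the target.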
#define TARGET_THRESH 0x500

extern int g_MaxTexWidth, g_MaxTexHeight; // Maximum height & width of supported texture.

//#define SORT_TARGETS
inline list<CMemoryTarget>::iterator CMemoryTargetMngr::DestroyTargetIter(list<CMemoryTarget>::iterator& it)
{
	// find the target and destroy
	list<CMemoryTarget>::iterator itprev = it;
	++it;
	listClearedTargets.splice(listClearedTargets.end(), listTargets, itprev);

	if (listClearedTargets.size() > TEXDESTROY_THRESH)
	{
		listClearedTargets.pop_front();
	}

	return it;
}

// Compare target to current texture info
// Not same format -> 1
// Same format, not same data (clut only) -> 2
// identical -> 0
int CMemoryTargetMngr::CompareTarget(list<CMemoryTarget>::iterator& it, const tex0Info& tex0, int clutsize)
{
	if (PSMT_ISCLUT(it->psm) != PSMT_ISCLUT(tex0.psm)) return 1;

	if (PSMT_ISCLUT(tex0.psm))
	{
		if (it->psm != tex0.psm || it->cpsm != tex0.cpsm || it->clutsize != clutsize) return 1;

		if (PSMT_IS32BIT(tex0.cpsm))
		{
			if (Cmp_ClutBuffer_SavedClut((u32*)&it->clut[0], tex0.csa, clutsize)) return 2;
		}
		else
		{
			if (Cmp_ClutBuffer_SavedClut((u16*)&it->clut[0], tex0.csa, clutsize)) return 2;
		}
	}
	else
	{
		if (PSMT_IS16BIT(tex0.psm) != PSMT_IS16BIT(it->psm)) return 1;
	}

	return 0;
}

void CMemoryTargetMngr::GetClutVariables(int& clutsize, const tex0Info& tex0)
{
	clutsize = 0;

	if (PSMT_ISCLUT(tex0.psm))
	{
		int entries = PSMT_IS8CLUT(tex0.psm) ? 256 : 16;

		if (PSMT_IS32BIT(tex0.cpsm))
			clutsize = min(entries, 256 - tex0.csa * 16) * 4;
		else
			clutsize = min(entries, 512 - tex0.csa * 16) * 2;
	}
}

void CMemoryTargetMngr::GetMemAddress(int& start, int& end, const tex0Info& tex0)
{
	int nbStart, nbEnd;
	GetRectMemAddressZero(nbStart, nbEnd, tex0.psm, tex0.tw, tex0.th, tex0.tbp0, tex0.tbw);
	assert(nbStart < nbEnd);
	nbEnd = min(nbEnd, MEMORY_END);

	start = nbStart / (4 * GPU_TEXWIDTH);
	end = (nbEnd + GPU_TEXWIDTH * 4 - 1) / (4 * GPU_TEXWIDTH);
	assert(start < end);
}
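// NOTE: illustrative arithmetic for GetMemAddress above (not original
// commentary). Byte addresses are converted into rows of 4 * GPU_TEXWIDTH
// bytes, with the end rounded up. Assuming GPU_TEXWIDTH = 512 (one row =
// 0x2000 bytes), a texture occupying bytes 0x4000..0x9000 yields:
//
//	start = 0x4000 / 0x2000 = 2
//	end   = (0x9000 + 0x1FFF) / 0x2000 = 5
//
// i.e. the half-open row range [2, 5), which still covers the partial last row.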
CMemoryTarget* CMemoryTargetMngr::SearchExistTarget(int start, int end, int clutsize, const tex0Info& tex0, int forcevalidate)
{
	for (list<CMemoryTarget>::iterator it = listTargets.begin(); it != listTargets.end();)
	{
		if (it->starty <= start && it->starty + it->height >= end)
		{
			int res = CompareTarget(it, tex0, clutsize);

			if (res == 1)
			{
				if (it->validatecount++ > VALIDATE_THRESH)
				{
					it = DestroyTargetIter(it);

					if (listTargets.size() == 0) break;
				}
				else
					++it;

				continue;
			}
			else if (res == 2)
			{
				++it;
				continue;
			}

			if (forcevalidate) //&& listTargets.size() < TARGET_THRESH)
			{
				// Do more validation checking: delete the target if it has not been used for a while.
				if (!it->ValidateTex(tex0, start, end, curstamp > it->usedstamp + FORCE_TEXDESTROY_THRESH))
				{
					if (it->height <= 0)
					{
						it = DestroyTargetIter(it);

						if (listTargets.size() == 0) break;
					}
					else
						++it;

					continue;
				}
			}

			it->usedstamp = curstamp;
			it->validatecount = 0;

			return &(*it);
		}
#ifdef SORT_TARGETS
		else if (it->starty >= end) break;
#endif

		++it;
	}

	return NULL;
}

CMemoryTarget* CMemoryTargetMngr::ClearedTargetsSearch(u32 fmt, int widthmult, int channels, int height)
{
	CMemoryTarget* targ = NULL;

	if (listClearedTargets.size() > 0)
	{
		list<CMemoryTarget>::iterator itbest = listClearedTargets.begin();

		while (itbest != listClearedTargets.end())
		{
			if ((height == itbest->realheight) && (itbest->fmt == fmt) && (itbest->widthmult == widthmult) && (itbest->channels == channels))
			{
				// check channels
				if (PIXELS_PER_WORD(itbest->psm) == channels) break;
			}

			++itbest;
		}

		if (itbest != listClearedTargets.end())
		{
			listTargets.splice(listTargets.end(), listClearedTargets, itbest);
			targ = &listTargets.back();
			targ->validatecount = 0;
		}
		else
		{
			// create a new target
			listTargets.push_back(CMemoryTarget());
			targ = &listTargets.back();
		}
	}
	else
	{
		listTargets.push_back(CMemoryTarget());
		targ = &listTargets.back();
	}

	return targ;
}
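// NOTE: design aside (editorial, not original commentary). ClearedTargetsSearch
// only recycles a dead target when every allocation-relevant field matches
// (realheight, fmt, widthmult, channels), so the recycled entry's GL texture
// and memptr buffer can be reused as-is; splice() just relinks the list node
// between listClearedTargets and listTargets without copying the CMemoryTarget
// or touching its ref-counted TEXTURE pointer.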
CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forcevalidate)
{
	FUNCLOG
	int start, end, clutsize;

	GetClutVariables(clutsize, tex0);
	GetMemAddress(start, end, tex0);

	CMemoryTarget* it = SearchExistTarget(start, end, clutsize, tex0, forcevalidate);

	if (it != NULL) return it;

	// couldn't find one, so create it
	CMemoryTarget* targ;

	u32 fmt;
	u32 internal_fmt;

	if (PSMT_ISHALF_STORAGE(tex0))
	{
		// RGBA_5551 storage format
		fmt = GL_UNSIGNED_SHORT_1_5_5_5_REV;
		internal_fmt = GL_RGB5_A1;
	}
	else
	{
		// RGBA_8888 storage format
		fmt = GL_UNSIGNED_BYTE;
		internal_fmt = GL_RGBA;
	}

	int widthmult = 1, channels = 1;

	// If the texture is too big to fit into a single GPU texture. Pretty rare on modern cards.
	if ((g_MaxTexHeight < 4096) && (end - start > g_MaxTexHeight))
	{
		// In this rare case we make a texture of half the height and twice the width.
		ZZLog::Debug_Log("Making a half height texture (start - end == 0x%x)", (end - start));
		widthmult = 2;
	}

	channels = PIXELS_PER_WORD(tex0.psm);

	targ = ClearedTargetsSearch(fmt, widthmult, channels, end - start);

	if (targ->ptex != NULL)
	{
		assert(end - start <= targ->realheight && targ->fmt == fmt && targ->widthmult == widthmult);

		// good enough, so init
		targ->realy = targ->starty = start;
		targ->usedstamp = curstamp;
		targ->psm = tex0.psm;
		targ->cpsm = tex0.cpsm;
		targ->height = end - start;
	}
	else
	{
		// not initialized yet
		targ->fmt = fmt;
		targ->realy = targ->starty = start;
		targ->realheight = targ->height = end - start;
		targ->usedstamp = curstamp;
		targ->psm = tex0.psm;
		targ->cpsm = tex0.cpsm;
		targ->widthmult = widthmult;
		targ->channels = channels;
		targ->texH = (targ->realheight + widthmult - 1) / widthmult;
		targ->texW = GPU_TEXWIDTH * widthmult * channels;

		// alloc the mem
		targ->ptex = new CMemoryTarget::TEXTURE();
		targ->ptex->ref = 1;
	}

#if defined(ZEROGS_DEVBUILD)
	g_TransferredToGPU += MemorySize(channels * targ->height);
#endif

	// fill with data
	if (targ->ptex->memptr == NULL)
	{
		targ->ptex->memptr = (u8*)_aligned_malloc(MemorySize(targ->realheight), 16);
		assert(targ->ptex->ref > 0);
	}

	memcpy(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));

	__aligned16 u8* ptexdata = NULL;
	bool has_data = false;

	if (PSMT_ISCLUT(tex0.psm))
	{
		assert(clutsize > 0);

		// Local clut parameter
		targ->cpsm = tex0.cpsm;

		// Allocate a local clut array
		targ->clutsize = clutsize;

		if (targ->clut == NULL)
		{
			targ->clut = (u8*)_aligned_malloc(clutsize, 16);
		}
		else
		{
			// In case this happens: realloc would be better, but it would need to
			// come from the utility library first.
			// _aligned_realloc is brought in from ScopedAlloc.h now. --arcum42
			_aligned_free(targ->clut);
			targ->clut = (u8*)_aligned_malloc(clutsize, 16);
		}

		// texture parameter
		ptexdata = (u8*)_aligned_malloc(CLUT_PIXEL_SIZE(tex0.cpsm) * targ->texH * targ->texW, 16);
		has_data = true;

		u8* psrc = (u8*)(MemoryAddress(targ->realy));

		// Fill a local clut, then build the real texture from it.
		if (PSMT_IS32BIT(tex0.cpsm))
		{
			ClutBuffer_to_Array((u32*)targ->clut, tex0.csa, clutsize);
			Build_Clut_Texture(tex0.psm, targ->height, (u32*)targ->clut, psrc, (u32*)ptexdata);
		}
		else
		{
			ClutBuffer_to_Array((u16*)targ->clut, tex0.csa, clutsize);
			Build_Clut_Texture(tex0.psm, targ->height, (u16*)targ->clut, psrc, (u16*)ptexdata);
		}

		assert(targ->clutsize > 0);
	}
	else if (tex0.psm == PSMT16Z || tex0.psm == PSMT16SZ)
	{
		ptexdata = (u8*)_aligned_malloc(4 * targ->texH * targ->texW, 16);
		has_data = true;

		// needs to be 8 bit, use xmm for unpacking
		u16* dst = (u16*)ptexdata;
		u16* src = (u16*)(MemoryAddress(targ->realy));
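		// NOTE: illustrative lane diagram for the SSE2 path below (editorial,
		// not original commentary). Each 16-bit Z value is zero-extended to a
		// 32-bit texel by interleaving with a zero register; for one 128-bit
		// load of eight u16 values p0..p7:
		//
		//	_mm_unpacklo_epi16(p, 0) -> [p0, 0, p1, 0, p2, 0, p3, 0]  (= p0..p3 as u32)
		//	_mm_unpackhi_epi16(p, 0) -> [p4, 0, p5, 0, p6, 0, p7, 0]  (= p4..p7 as u32)
		//
		// Two unpacks per load double the data size, which is why each
		// iteration below reads 64 bytes from src but writes 128 bytes to dst.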
#ifdef ZEROGS_SSE2
		assert(((u32)(uptr)dst) % 16 == 0);

		__m128i zero_128 = _mm_setzero_si128();
		// NOTE: future performance improvement: SSE4.1 supports an uncacheable
		// 128-bit load. Maybe it can avoid some cache pollution.
		// NOTE2: I create multiple _n variables to mimic the previous ASM behavior,
		// but I'm not sure there are real gains.
		for (int i = targ->height * GPU_TEXWIDTH / 16; i > 0; --i)
		{
			// Convert 16-bit pixels to 32-bit (zero extended).
			// Batch 64 bytes (32 pixels) at once.
			__m128i pixels_1 = _mm_load_si128((__m128i*)src);
			__m128i pixels_2 = _mm_load_si128((__m128i*)(src + 8));
			__m128i pixels_3 = _mm_load_si128((__m128i*)(src + 16));
			__m128i pixels_4 = _mm_load_si128((__m128i*)(src + 24));

			__m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
			__m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
			__m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
			__m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);

			// Note: bypass the cache
			_mm_stream_si128((__m128i*)dst, pix_low_1);
			_mm_stream_si128((__m128i*)(dst + 8), pix_high_1);
			_mm_stream_si128((__m128i*)(dst + 16), pix_low_2);
			_mm_stream_si128((__m128i*)(dst + 24), pix_high_2);

			__m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
			__m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
			__m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
			__m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);

			// Note: bypass the cache
			_mm_stream_si128((__m128i*)(dst + 32), pix_low_3);
			_mm_stream_si128((__m128i*)(dst + 40), pix_high_3);
			_mm_stream_si128((__m128i*)(dst + 48), pix_low_4);
			_mm_stream_si128((__m128i*)(dst + 56), pix_high_4);

			src += 32;
			dst += 64;
		}

		// It is advised to use a fence instruction after non-temporal moves
		// (_mm_stream): the store fence ensures previous stores are finished
		// before new ones execute.
		_mm_sfence();
#else // ZEROGS_SSE2
		for (int i = 0; i < targ->height; ++i)
		{
			for (int j = 0; j < GPU_TEXWIDTH; ++j)
			{
				dst[0] = src[0]; dst[1] = 0;
				dst[2] = src[1]; dst[3] = 0;
				dst += 4;
				src += 2;
			}
		}
#endif // ZEROGS_SSE2
	}
	else
	{
		// We really don't want to deallocate memptr. As a reminder...
		ptexdata = targ->ptex->memptr;
		has_data = false;
	}

	// create the texture
	GL_REPORT_ERRORD();

	assert(ptexdata != NULL);

	if (targ->ptex->tex == 0) glGenTextures(1, &targ->ptex->tex);

	glBindTexture(GL_TEXTURE_RECTANGLE_NV, targ->ptex->tex);

	TextureRect(internal_fmt, targ->texW, targ->texH, GL_RGBA, fmt, ptexdata);

	while (glGetError() != GL_NO_ERROR)
	{
		// release resources until creation succeeds
		if (listClearedTargets.size() > 0)
		{
			listClearedTargets.pop_front();
		}
		else
		{
			if (listTargets.size() == 0)
			{
				ZZLog::Error_Log("Failed to create %dx%d texture.", targ->texW, targ->texH);
				channels = 1;
				if (has_data) _aligned_free(ptexdata);
				return NULL;
			}

			DestroyOldest();
		}

		TextureRect(internal_fmt, targ->texW, targ->texH, GL_RGBA, fmt, ptexdata);
	}

	setRectWrap(GL_CLAMP);

	if (has_data) _aligned_free(ptexdata);

	assert(tex0.psm != 0xd);

	return targ;
}

void CMemoryTargetMngr::ClearRange(int nbStartY, int nbEndY)
{
	FUNCLOG
	int starty = nbStartY / (4 * GPU_TEXWIDTH);
	int endy = (nbEndY + 4 * GPU_TEXWIDTH - 1) / (4 * GPU_TEXWIDTH);

	for (list<CMemoryTarget>::iterator it = listTargets.begin(); it != listTargets.end();)
	{
		if (it->starty < endy && (it->starty + it->height) > starty)
		{
			// intersects: reduce the valid texture mem (or delete the texture entirely)
			// there are 4 cases
			int miny = max(it->starty, starty);
			int maxy = min(it->starty + it->height, endy);
			assert(miny < maxy);

			if (it->clearmaxy == 0)
			{
				it->clearminy = miny;
				it->clearmaxy = maxy;
			}
			else
			{
				if (it->clearminy > miny) it->clearminy = miny;
				if (it->clearmaxy < maxy) it->clearmaxy = maxy;
			}
		}

		++it;
	}
}
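// NOTE: illustrative example for ClearRange above (editorial, not original
// commentary). Dirty ranges are merged into a single conservative superset per
// target: clearing rows [3, 5) and later rows [8, 9) on one target leaves
// clearminy = 3, clearmaxy = 9, so ValidateTex will recheck rows 3..8 even
// though rows 5..7 were never written. Cheap to track, at the cost of an
// occasional unnecessary memcmp.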
void CMemoryTargetMngr::DestroyCleared()
{
	FUNCLOG

	for (list<CMemoryTarget>::iterator it = listClearedTargets.begin(); it != listClearedTargets.end();)
	{
		if (it->usedstamp < curstamp - (FORCE_TEXDESTROY_THRESH - 1))
		{
			it = listClearedTargets.erase(it);
			continue;
		}

		++it;
	}

	if ((curstamp % FORCE_TEXDESTROY_THRESH) == 0)
	{
		// purge old targets every FORCE_TEXDESTROY_THRESH frames
		for (list<CMemoryTarget>::iterator it = listTargets.begin(); it != listTargets.end();)
		{
			if (it->usedstamp < curstamp - FORCE_TEXDESTROY_THRESH)
			{
				it = listTargets.erase(it);
				continue;
			}

			++it;
		}
	}

	++curstamp;
}

void CMemoryTargetMngr::DestroyOldest()
{
	FUNCLOG

	if (listTargets.size() == 0) return;

	list<CMemoryTarget>::iterator it, itbest;
	it = itbest = listTargets.begin();

	while (it != listTargets.end())
	{
		if (it->usedstamp < itbest->usedstamp) itbest = it;
		++it;
	}

	listTargets.erase(itbest);
}
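// NOTE: hypothetical usage sketch; the g_MemTargs instance name and call
// pattern below are assumptions, not part of this file. A frame loop fetches
// targets while drawing and lets the manager age them out once per frame:
//
//	CMemoryTarget* targ = g_MemTargs.GetMemoryTarget(tex0, 1);
//	if (targ == NULL)
//	{
//		// out of video memory even after evicting every target: skip the draw
//	}
//	// ... bind targ->ptex->tex and render ...
//	g_MemTargs.DestroyCleared();	// once per frame; also advances curstamp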