pcsx2/plugins/zzogl-pg/opengl/ZZMemoryTargets.cpp

618 lines
16 KiB
C++

/* ZZ Open GL graphics plugin
* Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
* Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#include <stdlib.h>
#include <math.h>
#include "GS.h"
#include "Mem.h"
#include "targets.h"
#include "ZZClut.h"
#include "Util.h"
extern int g_TransferredToGPU;
extern int VALIDATE_THRESH;
extern u32 TEXDESTROY_THRESH;
#define FORCE_TEXDESTROY_THRESH (3) // destroy texture after FORCE_TEXDESTROY_THRESH frames
void CMemoryTargetMngr::Destroy()
{
FUNCLOG
listTargets.clear();
listClearedTargets.clear();
}
bool CMemoryTarget::ValidateTex(const tex0Info& tex0, int starttex, int endtex, bool bDeleteBadTex)
{
FUNCLOG
if (clearmaxy == 0) return true;
int checkstarty = max(starttex, clearminy);
int checkendy = min(endtex, clearmaxy);
if (checkstarty >= checkendy) return true;
if (validatecount++ > VALIDATE_THRESH)
{
height = 0;
return false;
}
// lock and compare
assert(ptex != NULL && ptex->memptr != NULL);
int result = memcmp_mmx(ptex->memptr + MemorySize(checkstarty-realy), MemoryAddress(checkstarty), MemorySize(checkendy-checkstarty));
if (result == 0)
{
clearmaxy = 0;
return true;
}
if (!bDeleteBadTex) return false;
// delete clearminy, clearmaxy range (not the checkstarty, checkendy range)
//int newstarty = 0;
if (clearminy <= starty)
{
if (clearmaxy < starty + height)
{
// preserve end
height = starty + height - clearmaxy;
starty = clearmaxy;
assert(height > 0);
}
else
{
// destroy
height = 0;
}
}
else
{
// beginning can be preserved
height = clearminy - starty;
}
clearmaxy = 0;
assert((starty >= realy) && ((starty + height) <= (realy + realheight)));
return false;
}
#define TARGET_THRESH 0x500
extern int g_MaxTexWidth, g_MaxTexHeight; // Maximum height & width of supported texture.
//#define SORT_TARGETS
inline list<CMemoryTarget>::iterator CMemoryTargetMngr::DestroyTargetIter(list<CMemoryTarget>::iterator& it)
{
// find the target and destroy
list<CMemoryTarget>::iterator itprev = it;
++it;
listClearedTargets.splice(listClearedTargets.end(), listTargets, itprev);
if (listClearedTargets.size() > TEXDESTROY_THRESH)
{
listClearedTargets.pop_front();
}
return it;
}
// Compare target to current texture info
// Not same format -> 1
// Same format, not same data (clut only) -> 2
// identical -> 0
int CMemoryTargetMngr::CompareTarget(list<CMemoryTarget>::iterator& it, const tex0Info& tex0, int clutsize)
{
if (PSMT_ISCLUT(it->psm) != PSMT_ISCLUT(tex0.psm))
return 1;
if (PSMT_ISCLUT(tex0.psm)) {
if (it->psm != tex0.psm || it->cpsm != tex0.cpsm || it->clutsize != clutsize)
return 1;
if (PSMT_IS32BIT(tex0.cpsm)) {
if (Cmp_ClutBuffer_SavedClut<u32>((u32*)&it->clut[0], tex0.csa, clutsize))
return 2;
} else {
if (Cmp_ClutBuffer_SavedClut<u16>((u16*)&it->clut[0], tex0.csa, clutsize))
return 2;
}
} else {
if (PSMT_IS16BIT(tex0.psm) != PSMT_IS16BIT(it->psm))
return 1;
}
return 0;
}
void CMemoryTargetMngr::GetClutVariables(int& clutsize, const tex0Info& tex0)
{
clutsize = 0;
if (PSMT_ISCLUT(tex0.psm))
{
int entries = PSMT_IS8CLUT(tex0.psm) ? 256 : 16;
if (PSMT_IS32BIT(tex0.cpsm))
clutsize = min(entries, 256 - tex0.csa * 16) * 4;
else
clutsize = min(entries, 512 - tex0.csa * 16) * 2;
}
}
void CMemoryTargetMngr::GetMemAddress(int& start, int& end, const tex0Info& tex0)
{
int nbStart, nbEnd;
GetRectMemAddressZero(nbStart, nbEnd, tex0.psm, tex0.tw, tex0.th, tex0.tbp0, tex0.tbw);
assert(nbStart < nbEnd);
nbEnd = min(nbEnd, MEMORY_END);
start = nbStart / (4 * GPU_TEXWIDTH);
end = (nbEnd + GPU_TEXWIDTH * 4 - 1) / (4 * GPU_TEXWIDTH);
assert(start < end);
}
CMemoryTarget* CMemoryTargetMngr::SearchExistTarget(int start, int end, int clutsize, const tex0Info& tex0, int forcevalidate)
{
for (list<CMemoryTarget>::iterator it = listTargets.begin(); it != listTargets.end();)
{
if (it->starty <= start && it->starty + it->height >= end)
{
int res = CompareTarget(it, tex0, clutsize);
if (res == 1)
{
if (it->validatecount++ > VALIDATE_THRESH)
{
it = DestroyTargetIter(it);
if (listTargets.size() == 0) break;
}
else
++it;
continue;
}
else if (res == 2)
{
++it;
continue;
}
if (forcevalidate) //&& listTargets.size() < TARGET_THRESH ) {
{
// do more validation checking. delete if not been used for a while
if (!it->ValidateTex(tex0, start, end, curstamp > it->usedstamp + FORCE_TEXDESTROY_THRESH))
{
if (it->height <= 0)
{
it = DestroyTargetIter(it);
if (listTargets.size() == 0) break;
}
else
++it;
continue;
}
}
it->usedstamp = curstamp;
it->validatecount = 0;
return &(*it);
}
#ifdef SORT_TARGETS
else if (it->starty >= end) break;
#endif
++it;
}
return NULL;
}
CMemoryTarget* CMemoryTargetMngr::ClearedTargetsSearch(u32 fmt, int widthmult, int channels, int height)
{
CMemoryTarget* targ = NULL;
if (listClearedTargets.size() > 0)
{
list<CMemoryTarget>::iterator itbest = listClearedTargets.begin();
while (itbest != listClearedTargets.end())
{
if ((height == itbest->realheight) && (itbest->fmt == fmt) && (itbest->widthmult == widthmult) && (itbest->channels == channels))
{
// check channels
if (PIXELS_PER_WORD(itbest->psm) == channels) break;
}
++itbest;
}
if (itbest != listClearedTargets.end())
{
listTargets.splice(listTargets.end(), listClearedTargets, itbest);
targ = &listTargets.back();
targ->validatecount = 0;
}
else
{
// create a new
listTargets.push_back(CMemoryTarget());
targ = &listTargets.back();
}
}
else
{
listTargets.push_back(CMemoryTarget());
targ = &listTargets.back();
}
return targ;
}
CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forcevalidate)
{
FUNCLOG
int start, end, clutsize;
GetClutVariables(clutsize, tex0);
GetMemAddress(start, end, tex0);
CMemoryTarget* it = SearchExistTarget(start, end, clutsize, tex0, forcevalidate);
if (it != NULL) return it;
// couldn't find so create
CMemoryTarget* targ;
u32 fmt;
u32 internal_fmt;
if (PSMT_ISHALF_STORAGE(tex0)) {
// RGBA_5551 storage format
fmt = GL_UNSIGNED_SHORT_1_5_5_5_REV;
internal_fmt = GL_RGB5_A1;
} else {
// RGBA_8888 storage format
fmt = GL_UNSIGNED_BYTE;
internal_fmt = GL_RGBA;
}
int widthmult = 1, channels = 1;
// If our texture is too big and could not be placed in 1 GPU texture. Pretty rare in modern cards.
if ((g_MaxTexHeight < 4096) && (end - start > g_MaxTexHeight))
{
// In this rare case we made a texture of half height and place it on the screen.
ZZLog::Debug_Log("Making a half height texture (start - end == 0x%x)", (end-start));
widthmult = 2;
}
channels = PIXELS_PER_WORD(tex0.psm);
targ = ClearedTargetsSearch(fmt, widthmult, channels, end - start);
if (targ->ptex != NULL)
{
assert(end - start <= targ->realheight && targ->fmt == fmt && targ->widthmult == widthmult);
// good enough, so init
targ->realy = targ->starty = start;
targ->usedstamp = curstamp;
targ->psm = tex0.psm;
targ->cpsm = tex0.cpsm;
targ->height = end - start;
} else {
// not initialized yet
targ->fmt = fmt;
targ->realy = targ->starty = start;
targ->realheight = targ->height = end - start;
targ->usedstamp = curstamp;
targ->psm = tex0.psm;
targ->cpsm = tex0.cpsm;
targ->widthmult = widthmult;
targ->channels = channels;
targ->texH = (targ->realheight + widthmult - 1)/widthmult;
targ->texW = GPU_TEXWIDTH * widthmult * channels;
// alloc the mem
targ->ptex = new CMemoryTarget::TEXTURE();
targ->ptex->ref = 1;
}
#if defined(ZEROGS_DEVBUILD)
g_TransferredToGPU += MemorySize(channels * targ->height);
#endif
// fill with data
if (targ->ptex->memptr == NULL)
{
targ->ptex->memptr = (u8*)_aligned_malloc(MemorySize(targ->realheight), 16);
assert(targ->ptex->ref > 0);
}
memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
__aligned16 u8* ptexdata = NULL;
bool has_data = false;
if (PSMT_ISCLUT(tex0.psm))
{
assert(clutsize > 0);
// Local clut parameter
targ->cpsm = tex0.cpsm;
// Allocate a local clut array
targ->clutsize = clutsize;
if(targ->clut == NULL)
targ->clut = (u8*)_aligned_malloc(clutsize, 16);
else {
// In case it could occured
// realloc would be better but you need to get it from libutilies first
// _aligned_realloc is brought in from ScopedAlloc.h now. --arcum42
_aligned_free(targ->clut);
targ->clut = (u8*)_aligned_malloc(clutsize, 16);
}
// texture parameter
ptexdata = (u8*)_aligned_malloc(CLUT_PIXEL_SIZE(tex0.cpsm) * targ->texH * targ->texW, 16);
has_data = true;
u8* psrc = (u8*)(MemoryAddress(targ->realy));
// Fill a local clut then build the real texture
if (PSMT_IS32BIT(tex0.cpsm))
{
ClutBuffer_to_Array<u32>((u32*)targ->clut, tex0.csa, clutsize);
Build_Clut_Texture<u32>(tex0.psm, targ->height, (u32*)targ->clut, psrc, (u32*)ptexdata);
}
else
{
ClutBuffer_to_Array<u16>((u16*)targ->clut, tex0.csa, clutsize);
Build_Clut_Texture<u16>(tex0.psm, targ->height, (u16*)targ->clut, psrc, (u16*)ptexdata);
}
assert(targ->clutsize > 0);
}
else if (tex0.psm == PSMT16Z || tex0.psm == PSMT16SZ)
{
ptexdata = (u8*)_aligned_malloc(4 * targ->texH * targ->texW, 16);
has_data = true;
// needs to be 8 bit, use xmm for unpacking
u16* dst = (u16*)ptexdata;
u16* src = (u16*)(MemoryAddress(targ->realy));
#ifdef ZEROGS_SSE2
assert(((u32)(uptr)dst) % 16 == 0);
__m128i zero_128 = _mm_setzero_si128();
// NOTE: future performance improvement
// SSE4.1 support uncacheable load 128bits. Maybe it can
// avoid some cache pollution
// NOTE2: I create multiple _n variable to mimic the previous ASM behavior
// but I'm not sure there are real gains.
for (int i = targ->height * GPU_TEXWIDTH/16 ; i > 0 ; --i)
{
// Convert 16 bits pixels to 32bits (zero extended)
// Batch 64 bytes (32 pixels) at once.
__m128i pixels_1 = _mm_load_si128((__m128i*)src);
__m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
__m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
__m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
__m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
__m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
__m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
__m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
// Note: bypass cache
_mm_stream_si128((__m128i*)dst, pix_low_1);
_mm_stream_si128((__m128i*)(dst+8), pix_high_1);
_mm_stream_si128((__m128i*)(dst+16), pix_low_2);
_mm_stream_si128((__m128i*)(dst+24), pix_high_2);
__m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
__m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
__m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
__m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
// Note: bypass cache
_mm_stream_si128((__m128i*)(dst+32), pix_low_3);
_mm_stream_si128((__m128i*)(dst+40), pix_high_3);
_mm_stream_si128((__m128i*)(dst+48), pix_low_4);
_mm_stream_si128((__m128i*)(dst+56), pix_high_4);
src += 32;
dst += 64;
}
// It is advise to use a fence instruction after non temporal move (mm_stream) instruction...
// store fence insures that previous store are finish before execute new one.
_mm_sfence();
#else // ZEROGS_SSE2
for (int i = 0; i < targ->height; ++i)
{
for (int j = 0; j < GPU_TEXWIDTH; ++j)
{
dst[0] = src[0];
dst[1] = 0;
dst[2] = src[1];
dst[3] = 0;
dst += 4;
src += 2;
}
}
#endif // ZEROGS_SSE2
}
else
{
ptexdata = targ->ptex->memptr;
// We really don't want to deallocate memptr. As a reminder...
has_data = false;
}
// create the texture
GL_REPORT_ERRORD();
assert(ptexdata != NULL);
if (targ->ptex->tex == 0) glGenTextures(1, &targ->ptex->tex);
glBindTexture(GL_TEXTURE_RECTANGLE_NV, targ->ptex->tex);
TextureRect(internal_fmt, targ->texW, targ->texH, GL_RGBA, fmt, ptexdata);
while (glGetError() != GL_NO_ERROR)
{
// release resources until can create
if (listClearedTargets.size() > 0)
{
listClearedTargets.pop_front();
}
else
{
if (listTargets.size() == 0)
{
ZZLog::Error_Log("Failed to create %dx%x texture.", targ->texW, targ->texH);
channels = 1;
if (has_data) _aligned_free(ptexdata);
return NULL;
}
DestroyOldest();
}
TextureRect(internal_fmt, targ->texW, targ->texH, GL_RGBA, fmt, ptexdata);
}
setRectWrap(GL_CLAMP);
if (has_data) _aligned_free(ptexdata);
assert(tex0.psm != 0xd);
return targ;
}
void CMemoryTargetMngr::ClearRange(int nbStartY, int nbEndY)
{
FUNCLOG
int starty = nbStartY / (4 * GPU_TEXWIDTH);
int endy = (nbEndY + 4 * GPU_TEXWIDTH - 1) / (4 * GPU_TEXWIDTH);
for (list<CMemoryTarget>::iterator it = listTargets.begin(); it != listTargets.end();)
{
if (it->starty < endy && (it->starty + it->height) > starty)
{
// intersects, reduce valid texture mem (or totally delete texture)
// there are 4 cases
int miny = max(it->starty, starty);
int maxy = min(it->starty + it->height, endy);
assert(miny < maxy);
if (it->clearmaxy == 0)
{
it->clearminy = miny;
it->clearmaxy = maxy;
}
else
{
if (it->clearminy > miny) it->clearminy = miny;
if (it->clearmaxy < maxy) it->clearmaxy = maxy;
}
}
++it;
}
}
void CMemoryTargetMngr::DestroyCleared()
{
FUNCLOG
for (list<CMemoryTarget>::iterator it = listClearedTargets.begin(); it != listClearedTargets.end();)
{
if (it->usedstamp < curstamp - (FORCE_TEXDESTROY_THRESH -1))
{
it = listClearedTargets.erase(it);
continue;
}
++it;
}
if ((curstamp % FORCE_TEXDESTROY_THRESH) == 0)
{
// purge old targets every FORCE_TEXDESTROY_THRESH frames
for (list<CMemoryTarget>::iterator it = listTargets.begin(); it != listTargets.end();)
{
if (it->usedstamp < curstamp - FORCE_TEXDESTROY_THRESH)
{
it = listTargets.erase(it);
continue;
}
++it;
}
}
++curstamp;
}
void CMemoryTargetMngr::DestroyOldest()
{
FUNCLOG
if (listTargets.size() == 0)
return;
list<CMemoryTarget>::iterator it, itbest;
it = itbest = listTargets.begin();
while (it != listTargets.end())
{
if (it->usedstamp < itbest->usedstamp) itbest = it;
++it;
}
listTargets.erase(itbest);
}