zzogl-pg: Merge back GregMiscellaneous branch (3867)

* Various cleanups
* Replace ASM with intrinsics (much more portable)
* Various performance tuning (expect 10%-20% speedup ^_^ )


git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3868 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut@gmail.com 2010-10-03 12:35:57 +00:00
parent c875caec15
commit c7a929a530
27 changed files with 2006 additions and 972 deletions

View File

@ -205,6 +205,8 @@ inline bool PSMT_HAS_SHARED_BITS (int fpsm, int tpsm) {
return (SUM == 0x15 || SUM == 0x1D || SUM == 0x2C || SUM == 0x30);
}
// Size in bytes of one CLUT entry for the CLUT storage mode `cpsm`:
// modes <= 1 are treated as 32-bit color (4 bytes), everything else as 16-bit (2 bytes).
inline int CLUT_PIXEL_SIZE(int cpsm)
{
	if (cpsm <= 1)
		return 4;

	return 2;
}
//----------------------- Data from registers -----------------------
@ -542,7 +544,9 @@ typedef struct
extern GSinternal gs;
static __forceinline u16 RGBA32to16(u32 c)
// Note the function is used in a template parameter so it must be declared extern
// Note2: In this case extern is not compatible with __forceinline so just inline it...
extern inline u16 RGBA32to16(u32 c)
{
return (u16)((((c) & 0x000000f8) >> 3) |
(((c) & 0x0000f800) >> 6) |
@ -558,6 +562,7 @@ static __forceinline u32 RGBA16to32(u16 c)
(((c) & 0x8000) ? 0xff000000 : 0);
}
#if 0
// converts float16 [0,1] to BYTE [0,255] (assumes value is in range, otherwise will take lower 8bits)
// f is a u16
static __forceinline u16 Float16ToBYTE(u16 f)
@ -603,6 +608,7 @@ static __forceinline u16 Float16ToALPHA(u16 f)
// used for Z values
#define Float16ToARGB_Z(f) COLOR_ARGB((u32)Float16ToBYTE_2(f.w), Float16ToBYTE_2(f.x), Float16ToBYTE_2(f.y), Float16ToBYTE_2(f.z))
#define Float16ToARGB16_Z(f) ((Float16ToBYTE_2(f.y)<<8)|Float16ToBYTE_2(f.z))
#endif
inline float Clamp(float fx, float fmin, float fmax)

View File

@ -38,6 +38,7 @@ using namespace std;
#include "targets.h"
#include "ZZoglShaders.h"
#include "ZZoglFlushHack.h"
#include "ZZoglFlushHack.h"
#ifdef _MSC_VER
#pragma warning(disable:4244)
@ -68,7 +69,7 @@ extern const char* pbilinear[];
// statistics
u32 g_nGenVars = 0, g_nTexVars = 0, g_nAlphaVars = 0, g_nResolve = 0;
#define VER 2
#define VER 3
const unsigned char zgsversion = PS2E_GS_VERSION;
unsigned char zgsrevision = 0; // revision and build gives plugin version
unsigned char zgsbuild = VER;
@ -143,6 +144,7 @@ void ReportHacks(gameHacks hacks)
if (hacks.quick_resolve_1) ZZLog::WriteLn("'Quick resolve 1' enabled.");
if (hacks.no_quick_resolve) ZZLog::WriteLn("'No Quick resolve' hack enabled.");
if (hacks.no_target_clut) ZZLog::WriteLn("'No target clut' hack enabled.");
if (hacks.no_stencil) ZZLog::WriteLn("'No stencil' hack enabled.");
if (hacks.vss_hack_off) ZZLog::WriteLn("VSS hack enabled.");
if (hacks.no_depth_resolve) ZZLog::WriteLn("'No depth resolve' hack enabled.");
if (hacks.full_16_bit_res) ZZLog::WriteLn("'Full 16 bit resolution' hack enabled.");
@ -151,7 +153,7 @@ void ReportHacks(gameHacks hacks)
if (hacks.no_alpha_test) ZZLog::WriteLn("'No alpha test' hack enabled.");
if (hacks.disable_mrt_depth) ZZLog::WriteLn("'Disable mrt depth' hack enabled.");
if (hacks.args_32_bit) ZZLog::WriteLn("'Args 32 bit' hack enabled.");
if (hacks.path3) ZZLog::WriteLn("'Path3' hack enabled.");
//if (hacks.path3) ZZLog::WriteLn("'Path3' hack enabled.");
if (hacks.parallel_context) ZZLog::WriteLn("'Parallel context' hack enabled.");
if (hacks.xenosaga_spec) ZZLog::WriteLn("'Xenosaga spec' hack enabled.");
if (hacks.partial_pointers) ZZLog::WriteLn("'Partial pointers' hack enabled.");
@ -382,6 +384,7 @@ void CALLBACK GSclose()
SaveStateFile = NULL;
SaveStateExists = true; // default value
g_LastCRC = 0;
}
void CALLBACK GSirqCallback(void (*callback)())

View File

@ -88,8 +88,6 @@ template<int index> void _GSgifTransfer(const u32 *pMem, u32 size)
pMem += 4;
size--;
if ((conf.settings().path3) && (index == 2) && path->eop) nPath3Hack = 1;
// eeuser 7.2.2. GIFtag: "... when NLOOP is 0, the GIF does not output anything, and
// values other than the EOP field are disregarded."
if (path->nloop > 0)

View File

@ -79,6 +79,11 @@
static vector<u8> s_vTempBuffer, s_vTransferCache;
static int gs_imageEnd = 0;
// From the start of monster labs. In all 3 cases, psm == 0.
// ZZogl-PG: GetRectMemAddress(0x3f4000, 0x404000, 0x0, 0x0, 0x0, 0x100, 0x40, 0x3f40, 0x100);
// ZZogl-PG: GetRectMemAddress(0x3f8000, 0x408000, 0x0, 0x0, 0x0, 0x100, 0x40, 0x3f80, 0x100);
// ZZogl-PG: GetRectMemAddress(0x3fc000, 0x40c000, 0x0, 0x0, 0x0, 0x100, 0x40, 0x3fc0, 0x100);
void GetRectMemAddress(int& start, int& end, int psm, int x, int y, int w, int h, int bp, int bw)
{
FUNCLOG
@ -158,7 +163,7 @@
if (end > MEMORY_END)
{
ZZLog::Warn_Log("Host local out of bounds!");
ZZLog::Warn_Log("Init host local out of bounds! (end == 0x%x)", end);
//gs.imageTransfer = -1;
end = MEMORY_END;
}
@ -180,7 +185,6 @@
GetRectMemAddress(start, end, gs.dstbuf.psm, gs.imageX, gs.imageY, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw);
assert(start < gs_imageEnd);
end = gs_imageEnd;
// sometimes games can decompress to alpha channel of render target only, in this case
@ -434,20 +438,20 @@ __forceinline void _TransferLocalLocal_4()
write = gdp((j2+3)%2048, i2%2048, gs.dstbuf.bw);
pDstBuf[write] = (pDstBuf[write]&0x0f)|(pSrcBuf[read]&0xf0);
read = gsp((j+2)%2048, i%2048, gs.srcbuf.bw);
write = gdp((j2+2)%2048, i2%2048, gs.dstbuf.bw);
read = gsp((j+4)%2048, i%2048, gs.srcbuf.bw);
write = gdp((j2+4)%2048, i2%2048, gs.dstbuf.bw);
pDstBuf[write] = (pDstBuf[write]&0xf0)|(pSrcBuf[read]&0x0f);
read = gsp((j+3)%2048, i%2048, gs.srcbuf.bw);
write = gdp((j2+3)%2048, i2%2048, gs.dstbuf.bw);
read = gsp((j+5)%2048, i%2048, gs.srcbuf.bw);
write = gdp((j2+5)%2048, i2%2048, gs.dstbuf.bw);
pDstBuf[write] = (pDstBuf[write]&0x0f)|(pSrcBuf[read]&0xf0);
read = gsp((j+2)%2048, i%2048, gs.srcbuf.bw);
write = gdp((j2+2)%2048, i2%2048, gs.dstbuf.bw);
read = gsp((j+6)%2048, i%2048, gs.srcbuf.bw);
write = gdp((j2+6)%2048, i2%2048, gs.dstbuf.bw);
pDstBuf[write] = (pDstBuf[write]&0xf0)|(pSrcBuf[read]&0x0f);
read = gsp((j+3)%2048, i%2048, gs.srcbuf.bw);
write = gdp((j2+3)%2048, i2%2048, gs.dstbuf.bw);
read = gsp((j+7)%2048, i%2048, gs.srcbuf.bw);
write = gdp((j2+7)%2048, i2%2048, gs.dstbuf.bw);
pDstBuf[write] = (pDstBuf[write]&0x0f)|(pSrcBuf[read]&0xf0);
}
}

View File

@ -132,28 +132,35 @@ void CreateGameHackTable(GtkWidget *treeview, gameHacks hacks)
mapConfOpts.clear();
add_map_entry(GAME_TEXTURETARGS, "00000001", "Tex Target checking - 00000001\nLego Racers");
add_map_entry(GAME_AUTORESET, "00000002", "Auto reset targs - 00000002\nShadow Hearts, Samurai Warriors. Use when game is slow and toggling AA fixes it.");
add_map_entry(GAME_NOTARGETRESOLVE, "00000010", "No target resolves - 00000010\nStops all resolving of targets. Try this first for really slow games. Dark Cloud 1");
add_map_entry(GAME_EXACTCOLOR, "00000020", "Exact color testing - 00000020\nFixes overbright or shadow/black artifacts (Crash 'n Burn).");
add_map_entry(GAME_NOCOLORCLAMP, "00000040", "No color clamping - 00000040\nSpeeds up games, but might be too bright or too dim.");
add_map_entry(GAME_NOALPHAFAIL, "00000100", "Alpha Fail hack - 00000100\nFor Sonic Unleashed, Shadow the Hedgehog, Ghost in the Shell. Remove vertical stripes or other coloring artefacts. Break Persona 4 and MGS3");
add_map_entry(GAME_AUTORESET, "00000002", "Auto reset targs - 00000002\nUse when game is slow and toggling AA fixes it. Samurai Warriors. (Automatically on for Shadow Hearts)");
add_map_entry(GAME_INTERLACE2X, "00000004", "Interlace 2X - 00000004\nFixes 2x bigger screen. Gradius 3.");
//GAME_TEXAHACK (still implemented)
add_map_entry(GAME_NOTARGETRESOLVE, "00000010", "No target resolves - 00000010\nStops all resolving of targets. Try this first for really slow games. (Automatically on for Dark Cloud 1.)");
add_map_entry(GAME_EXACTCOLOR, "00000020", "Exact color testing - 00000020\nFixes overbright or shadow/black artifacts. Crash 'n Burn.");
//add_map_entry(GAME_NOCOLORCLAMP, "00000040", "No color clamping - 00000040\nSpeeds up games, but might be too bright or too dim.");
//GAME_FFXHACK
add_map_entry(GAME_NOALPHAFAIL, "00000100", "Alpha Fail hack - 00000100\nRemove vertical stripes or other coloring artifacts. Breaks Persona 4 and MGS3. (Automatically on for Sonic Unleashed, Shadow the Hedgehog, & Ghost in the Shell.)");
add_map_entry(GAME_NODEPTHUPDATE, "00000200", "Disable depth updates - 00000200");
add_map_entry(GAME_QUICKRESOLVE1, "00000400", "Resolve Hack #1 - 00000400\nKingdom Hearts. Speeds some games.");
add_map_entry(GAME_NOQUICKRESOLVE, "00000800", "Resolve Hack #2 - 00000800\nShadow Hearts, Urbz. Destroy FFX");
add_map_entry(GAME_QUICKRESOLVE1, "00000400", "Resolve Hack #1 - 00000400\n Speeds some games. Kingdom Hearts.");
add_map_entry(GAME_NOQUICKRESOLVE, "00000800", "Resolve Hack #2 - 00000800\nShadow Hearts, Urbz. Destroys FFX.");
add_map_entry(GAME_NOTARGETCLUT, "00001000", "No target CLUT - 00001000\nResident Evil 4, or foggy scenes.");
add_map_entry(GAME_NOSTENCIL, "00002000", "Disable stencil buffer - 00002000\nUsually safe to do for simple scenes. Harvest Moon");
add_map_entry(GAME_NOSTENCIL, "00002000", "Disable stencil buffer - 00002000\nUsually safe to do for simple scenes. Harvest Moon.");
//GAME_VSSHACKOFF (still implemented)
add_map_entry(GAME_NODEPTHRESOLVE, "00008000", "No depth resolve - 00008000\nMight give z buffer artifacts.");
add_map_entry(GAME_FULL16BITRES, "00010000", "Full 16 bit resolution - 00010000\nUse when half the screen is missing.");
add_map_entry(GAME_RESOLVEPROMOTED, "00020000", "Resolve Hack #3 - 00020000\nNeopets");
add_map_entry(GAME_FASTUPDATE, "00040000", "Fast Update - 00040000\nOkami. Speeds some games. Needs for Sonic Unleashed");
add_map_entry(GAME_FASTUPDATE, "00040000", "Fast Update - 00040000\n Speeds some games. Needed for Sonic Unleashed. Okami.");
add_map_entry(GAME_NOALPHATEST, "00080000", "Disable alpha testing - 00080000");
add_map_entry(GAME_DISABLEMRTDEPTH, "00100000", "Enable Multiple RTs - 00100000");
add_map_entry(GAME_XENOSPECHACK, "01000000", "Specular Highlights - 01000000\nMakes Xenosaga and Okage graphics faster by removing highlights");
add_map_entry(GAME_PARTIALPOINTERS, "02000000", "Partial targets - 02000000");
//GAME_32BITTARGS
//GAME_PATH3HACK
//GAME_DOPARALLELCTX
add_map_entry(GAME_XENOSPECHACK, "01000000", "Specular Highlights - 01000000\nMakes graphics faster by removing highlights. (Automatically on for Xenosaga, Okami, & Okage.)");
//add_map_entry(GAME_PARTIALPOINTERS, "02000000", "Partial targets - 02000000");
add_map_entry(GAME_PARTIALDEPTH, "04000000", "Partial depth - 04000000");
add_map_entry(GAME_GUSTHACK, "10000000", "Gust fix, made gustgame more clean and fast - 10000000");
add_map_entry(GAME_NOLOGZ, "20000000", "No logarithmic Z, could decrease number of Z-artefacts - 20000000");
add_map_entry(GAME_INTERLACE2X, "00000004", "Interlace 2X - 00000004\nFixes 2x bigger screen (Gradius 3).");
//GAME_REGETHACK (commented out in code)
add_map_entry(GAME_GUSTHACK, "10000000", "Gust fix - 10000000. Makes gust games cleaner and faster. (Automatically on for most Gust games)");
add_map_entry(GAME_NOLOGZ, "20000000", "No logarithmic Z - 20000000. Could decrease number of Z-artifacts.");
add_map_entry(GAME_AUTOSKIPDRAW, "40000000", "Remove blur effect on some games\nSlow games.");
for (map<string, confOptsStruct>::iterator it = mapConfOpts.begin(); it != mapConfOpts.end(); ++it)
@ -255,7 +262,7 @@ void DisplayDialog()
GtkWidget *option_frame, *option_box;
GtkWidget *log_check;
GtkWidget *int_label, *int_box, *int_holder;
GtkWidget *bilinear_check;
GtkWidget *bilinear_label, *bilinear_box, *bilinear_holder;
GtkWidget *aa_label, *aa_box, *aa_holder;
GtkWidget *snap_label, *snap_box, *snap_holder;
GtkWidget *fullscreen_label, *widescreen_check;
@ -293,9 +300,17 @@ void DisplayDialog()
gtk_box_pack_start(GTK_BOX(int_holder), int_label, false, false, 2);
gtk_box_pack_start(GTK_BOX(int_holder), int_box, false, false, 2);
bilinear_label = gtk_label_new("Bilinear Filtering:");
bilinear_box = gtk_combo_box_new_text();
bilinear_check = gtk_check_button_new_with_label("Bilinear Filtering");
gtk_widget_set_tooltip_text(bilinear_check, "Best quality is off. Turn on for speed. Toggled by pressing Shift + F5 when running.");
gtk_combo_box_append_text(GTK_COMBO_BOX(bilinear_box), "Off");
gtk_combo_box_append_text(GTK_COMBO_BOX(bilinear_box), "Normal");
gtk_combo_box_append_text(GTK_COMBO_BOX(bilinear_box), "Forced");
gtk_combo_box_set_active(GTK_COMBO_BOX(bilinear_box), conf.bilinear);
gtk_widget_set_tooltip_text(bilinear_box, "Best quality is off. Turn on for speed. Toggled by pressing Shift + F5 when running.");
bilinear_holder = gtk_hbox_new(false, 5);
gtk_box_pack_start(GTK_BOX(bilinear_holder), bilinear_label, false, false, 2);
gtk_box_pack_start(GTK_BOX(bilinear_holder), bilinear_box, false, false, 2);
aa_label = gtk_label_new("Anti-Aliasing:");
aa_box = gtk_combo_box_new_text();
@ -352,7 +367,7 @@ void DisplayDialog()
gtk_frame_set_shadow_type(GTK_FRAME(option_frame), GTK_SHADOW_NONE);
gtk_box_pack_start(GTK_BOX(option_box), log_check, false, false, 2);
gtk_box_pack_start(GTK_BOX(option_box), bilinear_check, false, false, 2);
gtk_box_pack_start(GTK_BOX(option_box), bilinear_holder, false, false, 2);
gtk_box_pack_start(GTK_BOX(option_box), int_holder, false, false, 2);
gtk_box_pack_start(GTK_BOX(option_box), aa_holder, false, false, 2);
gtk_box_pack_start(GTK_BOX(option_box), snap_holder, false, false, 2);
@ -370,7 +385,6 @@ void DisplayDialog()
gtk_box_pack_start(GTK_BOX(main_box), option_frame, false, false, 2);
gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(log_check), conf.log);
gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(bilinear_check), conf.bilinear);
gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(widescreen_check), (conf.widescreen()));
gtk_container_add(GTK_CONTAINER(GTK_DIALOG(dialog)->vbox), main_frame);
@ -390,8 +404,10 @@ void DisplayDialog()
if (gtk_combo_box_get_active(GTK_COMBO_BOX(aa_box)) != -1)
conf.aa = gtk_combo_box_get_active(GTK_COMBO_BOX(aa_box));
if (gtk_combo_box_get_active(GTK_COMBO_BOX(bilinear_box)) != -1)
conf.bilinear = gtk_combo_box_get_active(GTK_COMBO_BOX(bilinear_box));
conf.log = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(log_check));
conf.bilinear = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(bilinear_check));
fake_options.widescreen = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(widescreen_check));
fake_options.tga_snap = gtk_combo_box_get_active(GTK_COMBO_BOX(snap_box));
@ -445,7 +461,7 @@ void SysMessage(const char *fmt, ...)
void CALLBACK GSabout()
{
SysMessage("ZZOgl PG: by Zeydlitz (PG version worked on by arcum42). Based off of ZeroGS, by zerofrog.");
SysMessage("ZZOgl PG: by Zeydlitz (PG version worked on by arcum42, gregory, and the pcsx2 development team). Based off of ZeroGS, by zerofrog.");
}
s32 CALLBACK GStest()

View File

@ -152,6 +152,7 @@
<Unit filename="../../ZZoglFlush.cpp" />
<Unit filename="../../ZZoglFlushHack.cpp" />
<Unit filename="../../ZZoglFlushHack.h" />
<Unit filename="../../ZZoglMath.h" />
<Unit filename="../../ZZoglSave.cpp" />
<Unit filename="../../ZZoglShaders.cpp" />
<Unit filename="../../ZZoglShaders.h" />
@ -171,7 +172,6 @@
<Unit filename="../../x86.h" />
<Unit filename="../../zerogs.cpp" />
<Unit filename="../../zerogs.h" />
<Unit filename="../../zerogsmath.h" />
<Unit filename="../../zpipe.cpp" />
<Unit filename="../../zpipe.h" />
<Extensions>

View File

@ -241,237 +241,136 @@ void TransferLocalHost24Z(void* pbyMem, u32 nQWordSize) {FUNCLOG}
// Local->host transfer entry point named for the 16Z format. The body holds only the FUNCLOG
// macro (presumably a function-entry trace -- confirm against its definition); neither
// pbyMem nor nQWordSize is read, so no pixel data is transferred here.
void TransferLocalHost16Z(void* pbyMem, u32 nQWordSize) {FUNCLOG}
// Local->host transfer entry point named for the 16SZ format. The body holds only the FUNCLOG
// macro (presumably a function-entry trace -- confirm against its definition); neither
// pbyMem nor nQWordSize is read, so no pixel data is transferred here.
void TransferLocalHost16SZ(void* pbyMem, u32 nQWordSize) {FUNCLOG}
// FILL_BLOCK(psm, psmcol): float-format block/bilinear table builder.
//
// Token-pastes `psm` / `psmcol` to select g_pageTable<psm>, g_blockTable<psm> and
// g_columnTable<psmcol>. Relies on names already declared at the expansion site:
// b (a BLOCK with width/height/colwidth/colheight/mult/ox/oy set), i, j, psrcf, psrcw,
// psrcv, vBlockData and vBilinearData.
//
// Steps:
//  1. Point b's page/block/column table pointers at the selected globals and assert that
//     the page table holds exactly b.width * b.height entries.
//  2. For every (i, j) in the page compute u = blockTable entry * 64 * b.mult + columnTable
//     entry, store it in the page table, and store u / (GPU_TEXWIDTH * b.mult) as a float in
//     vBlockData at offset (b.ox, b.oy) with a row stride of BLOCK_TEXWIDTH.
//  3. For every (i, j) write a float4 into vBilinearData holding the value at (i, j) and its
//     right, lower and lower-right neighbours; the neighbour indices wrap around at b.width
//     and b.height.
//
// psrcw is assigned but not otherwise used by this variant.
#define FILL_BLOCK(psm, psmcol) \
{ \
b.pageTable = &g_pageTable##psm[0][0]; \
b.blockTable = &g_blockTable##psm[0][0]; \
b.columnTable = &g_columnTable##psmcol[0][0]; \
\
assert( sizeof(g_pageTable##psm) == b.width * b.height * sizeof(g_pageTable##psm[0][0]) ); \
\
psrcf = (float*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
psrcw = (u16*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
\
for(i = 0; i < b.height; ++i) \
{ \
u32 i_width = i*BLOCK_TEXWIDTH; \
for(j = 0; j < b.width; ++j) \
{ \
/* fill the table */ \
u32 u = g_blockTable##psm[(i / b.colheight)][(j / b.colwidth)] * 64 * b.mult + g_columnTable##psmcol[i%b.colheight][j%b.colwidth]; \
b.pageTable[i * b.width + j] = u; \
psrcf[i_width + j] = (float)(u) / (float)(GPU_TEXWIDTH * b.mult); \
} \
} \
\
psrcv = (float4*)&vBilinearData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
\
for(i = 0; i < b.height; ++i) \
{ \
u32 i_width = i*BLOCK_TEXWIDTH; \
u32 i_width2 = ((i+1)%b.height)*BLOCK_TEXWIDTH; \
for(j = 0; j < b.width; ++j) \
{ \
u32 temp = ((j + 1) % b.width); \
float4* pv = &psrcv[i_width + j]; \
pv->x = psrcf[i_width + j]; \
pv->y = psrcf[i_width + temp]; \
pv->z = psrcf[i_width2 + j]; \
pv->w = psrcf[i_width2 + temp]; \
} \
} \
}
// FILL_BLOCK_NF(psm, psmcol): non-float ("NF") block table builder.
//
// Token-pastes `psm` / `psmcol` to select g_pageTable<psm>, g_blockTable<psm> and
// g_columnTable<psmcol>. Relies on names already declared at the expansion site:
// b (a BLOCK with width/height/colwidth/colheight/mult/ox/oy set), i, j, psrcf, psrcw
// and vBlockData.
//
// Points b's table pointers at the selected globals, asserts that the page table holds
// exactly b.width * b.height entries, then for every (i, j) computes
// u = blockTable entry * 64 * b.mult + columnTable entry, stores it in the page table and
// writes it (narrowed to u16) into vBlockData at offset (b.ox, b.oy) with a row stride of
// BLOCK_TEXWIDTH. psrcf is assigned but not otherwise used, and no bilinear data is produced.
#define FILL_BLOCK_NF(psm, psmcol) \
{ \
b.pageTable = &g_pageTable##psm[0][0]; \
b.blockTable = &g_blockTable##psm[0][0]; \
b.columnTable = &g_columnTable##psmcol[0][0]; \
\
assert( sizeof(g_pageTable##psm) == b.width * b.height * sizeof(g_pageTable##psm[0][0]) ); \
\
psrcf = (float*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
psrcw = (u16*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
\
for(i = 0; i < b.height; ++i) \
{ \
u32 i_width = i*BLOCK_TEXWIDTH; \
for(j = 0; j < b.width; ++j) \
{ \
/* fill the table */ \
u32 u = g_blockTable##psm[(i / b.colheight)][(j / b.colwidth)] * 64 * b.mult + g_columnTable##psmcol[i%b.colheight][j%b.colwidth]; \
b.pageTable[i * b.width + j] = u; \
psrcw[i_width + j] = u; \
} \
} \
}
void FillBlocksNF(vector<char>& vBlockData, vector<char>& vBilinearData)
void fill_block(BLOCK b, vector<char>& vBlockData, vector<char>& vBilinearData, int floatfmt)
{
FUNCLOG
vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 2);
int i, j;
BLOCK b;
float* psrcf = NULL;
float* psrcf = (float*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH;
u16* psrcw = NULL;
if (!floatfmt)
psrcw = (u16*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH;
memset(m_Blocks, 0, sizeof(m_Blocks));
for(int i = 0; i < b.height; ++i)
{
u32 i_width = i*BLOCK_TEXWIDTH;
for(int j = 0; j < b.width; ++j)
{
/* fill the table */
u32 bt = b.blockTable[(i / b.colheight)*(b.width/b.colwidth) + (j / b.colwidth)];
u32 ct = b.columnTable[(i%b.colheight)*b.colwidth + (j%b.colwidth)];
u32 u = bt * 64 * b.mult + ct;
b.pageTable[i * b.width + j] = u;
if (floatfmt)
psrcf[i_width + j] = (float)(u) / (float)(GPU_TEXWIDTH * b.mult);
else
psrcw[i_width + j] = u;
// 32
b.SetDim(64, 32, 0, 0, 1);
FILL_BLOCK_NF(32, 32);
m_Blocks[PSMCT32] = b;
m_Blocks[PSMCT32].SetFun(PSMCT32);
// 24 (same as 32 except write/readPixel are different)
m_Blocks[PSMCT24] = b;
m_Blocks[PSMCT24].SetFun(PSMCT24);
// 8H (same as 32 except write/readPixel are different)
m_Blocks[PSMT8H] = b;
m_Blocks[PSMT8H].SetFun(PSMT8H);
m_Blocks[PSMT4HL] = b;
m_Blocks[PSMT4HL].SetFun(PSMT4HL);
m_Blocks[PSMT4HH] = b;
m_Blocks[PSMT4HH].SetFun(PSMT4HH);
// 32z
b.SetDim(64, 32, 64, 0, 1);
FILL_BLOCK_NF(32Z, 32);
m_Blocks[PSMT32Z] = b;
m_Blocks[PSMT32Z].SetFun(PSMT32Z);
// 24Z (same as 32Z except write/readPixel are different)
m_Blocks[PSMT24Z] = b;
m_Blocks[PSMT24Z].SetFun(PSMT24Z);
// 16
b.SetDim(64, 64, 0, 32, 2);
FILL_BLOCK_NF(16, 16);
m_Blocks[PSMCT16] = b;
m_Blocks[PSMCT16].SetFun(PSMCT16);
// 16s
b.SetDim(64, 64, 64, 32, 2);
FILL_BLOCK_NF(16S, 16);
m_Blocks[PSMCT16S] = b;
m_Blocks[PSMCT16S].SetFun(PSMCT16S);
// 16z
b.SetDim(64, 64, 0, 96, 2);
FILL_BLOCK_NF(16Z, 16);
m_Blocks[PSMT16Z] = b;
m_Blocks[PSMT16Z].SetFun(PSMT16Z);
// 16sz
b.SetDim(64, 64, 64, 96, 2);
FILL_BLOCK_NF(16SZ, 16);
m_Blocks[PSMT16SZ] = b;
m_Blocks[PSMT16SZ].SetFun(PSMT16SZ);
// 8
b.SetDim(128, 64, 0, 160, 4);
FILL_BLOCK_NF(8, 8);
m_Blocks[PSMT8] = b;
m_Blocks[PSMT8].SetFun(PSMT8);
// 4
b.SetDim(128, 128, 0, 224, 8);
FILL_BLOCK_NF(4, 4);
m_Blocks[PSMT4] = b;
m_Blocks[PSMT4].SetFun(PSMT4);
}
}
if (floatfmt) {
float4* psrcv = (float4*)&vBilinearData[0] + b.ox + b.oy * BLOCK_TEXWIDTH;
void FillBlocksF(vector<char>& vBlockData, vector<char>& vBilinearData)
for(int i = 0; i < b.height; ++i)
{
FUNCLOG
vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 4);
vBilinearData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * sizeof(float4));
int i, j;
BLOCK b;
float* psrcf = NULL;
u16* psrcw = NULL;
float4* psrcv = NULL;
memset(m_Blocks, 0, sizeof(m_Blocks));
// 32
b.SetDim(64, 32, 0, 0, 1);
FILL_BLOCK(32, 32);
m_Blocks[PSMCT32] = b;
m_Blocks[PSMCT32].SetFun(PSMCT32);
// 24 (same as 32 except write/readPixel are different)
m_Blocks[PSMCT24] = b;
m_Blocks[PSMCT24].SetFun(PSMCT24);
// 8H (same as 32 except write/readPixel are different)
m_Blocks[PSMT8H] = b;
m_Blocks[PSMT8H].SetFun(PSMT8H);
m_Blocks[PSMT4HL] = b;
m_Blocks[PSMT4HL].SetFun(PSMT4HL);
m_Blocks[PSMT4HH] = b;
m_Blocks[PSMT4HH].SetFun(PSMT4HH);
// 32z
b.SetDim(64, 32, 64, 0, 1);
FILL_BLOCK(32Z, 32);
m_Blocks[PSMT32Z] = b;
m_Blocks[PSMT32Z].SetFun(PSMT32Z);
// 24Z (same as 32Z except write/readPixel are different)
m_Blocks[PSMT24Z] = b;
m_Blocks[PSMT24Z].SetFun(PSMT24Z);
// 16
b.SetDim(64, 64, 0, 32, 2);
FILL_BLOCK(16, 16);
m_Blocks[PSMCT16] = b;
m_Blocks[PSMCT16].SetFun(PSMCT16);
// 16s
b.SetDim(64, 64, 64, 32, 2);
FILL_BLOCK(16S, 16);
m_Blocks[PSMCT16S] = b;
m_Blocks[PSMCT16S].SetFun(PSMCT16S);
// 16z
b.SetDim(64, 64, 0, 96, 2);
FILL_BLOCK(16Z, 16);
m_Blocks[PSMT16Z] = b;
m_Blocks[PSMT16Z].SetFun(PSMT16Z);
// 16sz
b.SetDim(64, 64, 64, 96, 2);
FILL_BLOCK(16SZ, 16);
m_Blocks[PSMT16SZ] = b;
m_Blocks[PSMT16SZ].SetFun(PSMT16SZ);
// 8
b.SetDim(128, 64, 0, 160, 4);
FILL_BLOCK(8, 8);
m_Blocks[PSMT8] = b;
m_Blocks[PSMT8].SetFun(PSMT8);
// 4
b.SetDim(128, 128, 0, 224, 8);
FILL_BLOCK(4, 4);
m_Blocks[PSMT4] = b;
m_Blocks[PSMT4].SetFun(PSMT4);
u32 i_width = i*BLOCK_TEXWIDTH;
u32 i_width2 = ((i+1)%b.height)*BLOCK_TEXWIDTH;
for(int j = 0; j < b.width; ++j)
{
u32 temp = ((j + 1) % b.width);
float4* pv = &psrcv[i_width + j];
pv->x = psrcf[i_width + j];
pv->y = psrcf[i_width + temp];
pv->z = psrcf[i_width2 + j];
pv->w = psrcf[i_width2 + temp];
}
}
}
}
void BLOCK::FillBlocks(vector<char>& vBlockData, vector<char>& vBilinearData, int floatfmt)
{
FUNCLOG
if (floatfmt)
FillBlocksF(vBlockData, vBilinearData);
else
FillBlocksNF(vBlockData, vBilinearData);
if (floatfmt) {
vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 4);
vBilinearData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * sizeof(float4));
} else {
vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 2);
}
BLOCK b;
memset(m_Blocks, 0, sizeof(m_Blocks));
// 32
b.SetDim(64, 32, 0, 0, 1);
b.SetTable(PSMCT32);
fill_block(b, vBlockData, vBilinearData, floatfmt);
m_Blocks[PSMCT32] = b;
m_Blocks[PSMCT32].SetFun(PSMCT32);
// 24 (same as 32 except write/readPixel are different)
m_Blocks[PSMCT24] = b;
m_Blocks[PSMCT24].SetFun(PSMCT24);
// 8H (same as 32 except write/readPixel are different)
m_Blocks[PSMT8H] = b;
m_Blocks[PSMT8H].SetFun(PSMT8H);
m_Blocks[PSMT4HL] = b;
m_Blocks[PSMT4HL].SetFun(PSMT4HL);
m_Blocks[PSMT4HH] = b;
m_Blocks[PSMT4HH].SetFun(PSMT4HH);
// 32z
b.SetDim(64, 32, 64, 0, 1);
b.SetTable(PSMT32Z);
fill_block(b, vBlockData, vBilinearData, floatfmt);
m_Blocks[PSMT32Z] = b;
m_Blocks[PSMT32Z].SetFun(PSMT32Z);
// 24Z (same as 32Z except write/readPixel are different)
m_Blocks[PSMT24Z] = b;
m_Blocks[PSMT24Z].SetFun(PSMT24Z);
// 16
b.SetDim(64, 64, 0, 32, 2);
b.SetTable(PSMCT16);
fill_block(b, vBlockData, vBilinearData, floatfmt);
m_Blocks[PSMCT16] = b;
m_Blocks[PSMCT16].SetFun(PSMCT16);
// 16s
b.SetDim(64, 64, 64, 32, 2);
b.SetTable(PSMCT16S);
fill_block(b, vBlockData, vBilinearData, floatfmt);
m_Blocks[PSMCT16S] = b;
m_Blocks[PSMCT16S].SetFun(PSMCT16S);
// 16z
b.SetDim(64, 64, 0, 96, 2);
b.SetTable(PSMT16Z);
fill_block(b, vBlockData, vBilinearData, floatfmt);
m_Blocks[PSMT16Z] = b;
m_Blocks[PSMT16Z].SetFun(PSMT16Z);
// 16sz
b.SetDim(64, 64, 64, 96, 2);
b.SetTable(PSMT16SZ);
fill_block(b, vBlockData, vBilinearData, floatfmt);
m_Blocks[PSMT16SZ] = b;
m_Blocks[PSMT16SZ].SetFun(PSMT16SZ);
// 8
b.SetDim(128, 64, 0, 160, 4);
b.SetTable(PSMT8);
fill_block(b, vBlockData, vBilinearData, floatfmt);
m_Blocks[PSMT8] = b;
m_Blocks[PSMT8].SetFun(PSMT8);
// 4
b.SetDim(128, 128, 0, 224, 8);
b.SetTable(PSMT4);
fill_block(b, vBlockData, vBilinearData, floatfmt);
m_Blocks[PSMT4] = b;
m_Blocks[PSMT4].SetFun(PSMT4);
}

View File

@ -92,6 +92,29 @@ struct TransferFuncts
extern TransferData tData[64];
// rest not visible externally
extern u32 g_blockTable32[4][8];
extern u32 g_blockTable32Z[4][8];
extern u32 g_blockTable16[8][4];
extern u32 g_blockTable16S[8][4];
extern u32 g_blockTable16Z[8][4];
extern u32 g_blockTable16SZ[8][4];
extern u32 g_blockTable8[4][8];
extern u32 g_blockTable4[8][4];
extern u32 g_columnTable32[8][8];
extern u32 g_columnTable16[8][16];
extern u32 g_columnTable8[16][16];
extern u32 g_columnTable4[16][32];
extern u32 g_pageTable32[32][64];
extern u32 g_pageTable32Z[32][64];
extern u32 g_pageTable16[64][64];
extern u32 g_pageTable16S[64][64];
extern u32 g_pageTable16Z[64][64];
extern u32 g_pageTable16SZ[64][64];
extern u32 g_pageTable8[64][128];
extern u32 g_pageTable4[128][128];
struct BLOCK
{
BLOCK() { memset(this, 0, sizeof(BLOCK)); }
@ -142,47 +165,69 @@ struct BLOCK
TransferHostLocal = TransferHostLocalFun[psm];
TransferLocalHost = TransferLocalHostFun[psm];
}
// Binds pageTable / blockTable / columnTable to the global lookup tables that belong to
// the storage format `psm`. Formats without a dedicated table set end up with all three
// pointers NULL. For every known format the page table size is asserted against
// width * height, so the block dimensions must already be set when this is called.
void SetTable(u32 psm)
{
	// Unknown-format state; a matching case below overrides all three pointers.
	pageTable = NULL;
	blockTable = NULL;
	columnTable = NULL;

	switch (psm)
	{
		case PSMCT32:
			pageTable = &g_pageTable32[0][0];
			blockTable = &g_blockTable32[0][0];
			columnTable = &g_columnTable32[0][0];
			assert( sizeof(g_pageTable32) == width * height * sizeof(g_pageTable32[0][0]) );
			break;

		case PSMT32Z:
			// 32Z has its own page/block tables but shares the 32-bit column table.
			pageTable = &g_pageTable32Z[0][0];
			blockTable = &g_blockTable32Z[0][0];
			columnTable = &g_columnTable32[0][0];
			assert( sizeof(g_pageTable32Z) == width * height * sizeof(g_pageTable32Z[0][0]) );
			break;

		case PSMCT16:
			pageTable = &g_pageTable16[0][0];
			blockTable = &g_blockTable16[0][0];
			columnTable = &g_columnTable16[0][0];
			assert( sizeof(g_pageTable16) == width * height * sizeof(g_pageTable16[0][0]) );
			break;

		case PSMCT16S:
			// All 16-bit variants below share the 16-bit column table.
			pageTable = &g_pageTable16S[0][0];
			blockTable = &g_blockTable16S[0][0];
			columnTable = &g_columnTable16[0][0];
			assert( sizeof(g_pageTable16S) == width * height * sizeof(g_pageTable16S[0][0]) );
			break;

		case PSMT16Z:
			pageTable = &g_pageTable16Z[0][0];
			blockTable = &g_blockTable16Z[0][0];
			columnTable = &g_columnTable16[0][0];
			assert( sizeof(g_pageTable16Z) == width * height * sizeof(g_pageTable16Z[0][0]) );
			break;

		case PSMT16SZ:
			pageTable = &g_pageTable16SZ[0][0];
			blockTable = &g_blockTable16SZ[0][0];
			columnTable = &g_columnTable16[0][0];
			assert( sizeof(g_pageTable16SZ) == width * height * sizeof(g_pageTable16SZ[0][0]) );
			break;

		case PSMT8:
			pageTable = &g_pageTable8[0][0];
			blockTable = &g_blockTable8[0][0];
			columnTable = &g_columnTable8[0][0];
			assert( sizeof(g_pageTable8) == width * height * sizeof(g_pageTable8[0][0]) );
			break;

		case PSMT4:
			pageTable = &g_pageTable4[0][0];
			blockTable = &g_blockTable4[0][0];
			columnTable = &g_columnTable4[0][0];
			assert( sizeof(g_pageTable4) == width * height * sizeof(g_pageTable4[0][0]) );
			break;
	}
}
};
extern BLOCK m_Blocks[];
extern u32 g_blockTable32[4][8];
extern u32 g_blockTable32Z[4][8];
extern u32 g_blockTable16[8][4];
extern u32 g_blockTable16S[8][4];
extern u32 g_blockTable16Z[8][4];
extern u32 g_blockTable16SZ[8][4];
extern u32 g_blockTable8[4][8];
extern u32 g_blockTable4[8][4];
extern u32 g_columnTable32[8][8];
extern u32 g_columnTable16[8][16];
extern u32 g_columnTable8[16][16];
extern u32 g_columnTable4[16][32];
extern u32 g_pageTable32[32][64];
extern u32 g_pageTable32Z[32][64];
extern u32 g_pageTable16[64][64];
extern u32 g_pageTable16S[64][64];
extern u32 g_pageTable16Z[64][64];
extern u32 g_pageTable16SZ[64][64];
extern u32 g_pageTable8[64][128];
extern u32 g_pageTable4[128][128];
// Index (in 32-bit pixels) of pixel (x, y) inside a buffer starting at base pointer bp.
// Pages cover 64x32 pixels (2048 entries each) and bw >> 6 pages make up one page row;
// bp is scaled by 64, and the position inside the page comes from g_pageTable32.
static __forceinline u32 getPixelAddress32(int x, int y, u32 bp, u32 bw)
{
	const u32 pagesPerRow = bw >> 6;
	const u32 pageIndex = (y >> 5) * pagesPerRow + (x >> 6);
	const u32 inPage = g_pageTable32[y & 31][x & 63];

	return bp * 64 + pageIndex * 2048 + inPage;
}
// Index (in 32-bit pixels) of pixel (x, y) with no base-pointer term.
// Pages cover 64x32 pixels (2048 entries each) and bw >> 6 pages make up one page row;
// the position inside the page comes from g_pageTable32.
static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw)
{
	const u32 pagesPerRow = bw >> 6;
	const u32 pageIndex = (y >> 5) * pagesPerRow + (x >> 6);

	return pageIndex * 2048 + g_pageTable32[y & 31][x & 63];
}
#define getPixelAddress24 getPixelAddress32
#define getPixelAddress24_0 getPixelAddress32_0
#define getPixelAddress8H getPixelAddress32
@ -191,6 +236,15 @@ static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw)
#define getPixelAddress4HL_0 getPixelAddress32_0
#define getPixelAddress4HH getPixelAddress32
#define getPixelAddress4HH_0 getPixelAddress32_0
#define getPixelAddress24Z getPixelAddress32Z
#define getPixelAddress24Z_0 getPixelAddress32Z_0
// Index (in 32-bit pixels) of pixel (x, y) inside a buffer starting at base pointer bp.
// Pages cover 64x32 pixels (2048 entries each) and bw >> 6 pages make up one page row;
// bp is scaled by 64, and the position inside the page comes from g_pageTable32.
static __forceinline u32 getPixelAddress32(int x, int y, u32 bp, u32 bw)
{
	const u32 pagesPerRow = bw >> 6;
	const u32 pageIndex = (y >> 5) * pagesPerRow + (x >> 6);
	const u32 inPage = g_pageTable32[y & 31][x & 63];

	return bp * 64 + pageIndex * 2048 + inPage;
}
static __forceinline u32 getPixelAddress16(int x, int y, u32 bp, u32 bw)
{
@ -199,13 +253,6 @@ static __forceinline u32 getPixelAddress16(int x, int y, u32 bp, u32 bw)
return word;
}
// Index (in 16-bit pixels) of pixel (x, y) with no base-pointer term.
// Pages cover 64x64 pixels (4096 entries each) and bw >> 6 pages make up one page row;
// the position inside the page comes from g_pageTable16.
static __forceinline u32 getPixelAddress16_0(int x, int y, u32 bw)
{
	const u32 pagesPerRow = bw >> 6;
	const u32 pageIndex = (y >> 6) * pagesPerRow + (x >> 6);

	return pageIndex * 4096 + g_pageTable16[y & 63][x & 63];
}
static __forceinline u32 getPixelAddress16S(int x, int y, u32 bp, u32 bw)
{
u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
@ -213,13 +260,6 @@ static __forceinline u32 getPixelAddress16S(int x, int y, u32 bp, u32 bw)
return word;
}
// Index (in 16-bit pixels) of pixel (x, y) for the 16S layout, with no base-pointer term.
// Pages cover 64x64 pixels (4096 entries each) and bw >> 6 pages make up one page row;
// the position inside the page comes from g_pageTable16S.
static __forceinline u32 getPixelAddress16S_0(int x, int y, u32 bw)
{
	const u32 pagesPerRow = bw >> 6;
	const u32 pageIndex = (y >> 6) * pagesPerRow + (x >> 6);

	return pageIndex * 4096 + g_pageTable16S[y & 63][x & 63];
}
static __forceinline u32 getPixelAddress8(int x, int y, u32 bp, u32 bw)
{
u32 basepage = ((y >> 6) * ((bw + 127) >> 7)) + (x >> 7);
@ -227,13 +267,6 @@ static __forceinline u32 getPixelAddress8(int x, int y, u32 bp, u32 bw)
return word;
}
// Index (in 8-bit pixels) of pixel (x, y) with no base-pointer term.
// Pages cover 128x64 pixels (8192 entries each); the number of pages per row is bw / 128
// rounded up. The position inside the page comes from g_pageTable8.
static __forceinline u32 getPixelAddress8_0(int x, int y, u32 bw)
{
	const u32 pagesPerRow = (bw + 127) >> 7;
	const u32 pageIndex = (y >> 6) * pagesPerRow + (x >> 7);

	return pageIndex * 8192 + g_pageTable8[y & 63][x & 127];
}
static __forceinline u32 getPixelAddress4(int x, int y, u32 bp, u32 bw)
{
u32 basepage = ((y >> 7) * ((bw + 127) >> 7)) + (x >> 7);
@ -241,13 +274,6 @@ static __forceinline u32 getPixelAddress4(int x, int y, u32 bp, u32 bw)
return word;
}
// Index (in 4-bit pixels, i.e. nibbles) of pixel (x, y) with no base-pointer term.
// Pages cover 128x128 pixels (16384 entries each); the number of pages per row is bw / 128
// rounded up. The position inside the page comes from g_pageTable4.
static __forceinline u32 getPixelAddress4_0(int x, int y, u32 bw)
{
	const u32 pagesPerRow = (bw + 127) >> 7;
	const u32 pageIndex = (y >> 7) * pagesPerRow + (x >> 7);

	return pageIndex * 16384 + g_pageTable4[y & 127][x & 127];
}
static __forceinline u32 getPixelAddress32Z(int x, int y, u32 bp, u32 bw)
{
u32 basepage = ((y >> 5) * (bw >> 6)) + (x >> 6);
@ -255,16 +281,6 @@ static __forceinline u32 getPixelAddress32Z(int x, int y, u32 bp, u32 bw)
return word;
}
// Index (in 32-bit pixels) of pixel (x, y) for the 32Z layout, with no base-pointer term.
// Pages cover 64x32 pixels (2048 entries each) and bw >> 6 pages make up one page row;
// the position inside the page comes from g_pageTable32Z.
static __forceinline u32 getPixelAddress32Z_0(int x, int y, u32 bw)
{
	const u32 pagesPerRow = bw >> 6;
	const u32 pageIndex = (y >> 5) * pagesPerRow + (x >> 6);

	return pageIndex * 2048 + g_pageTable32Z[y & 31][x & 63];
}
#define getPixelAddress24Z getPixelAddress32Z
#define getPixelAddress24Z_0 getPixelAddress32Z_0
static __forceinline u32 getPixelAddress16Z(int x, int y, u32 bp, u32 bw)
{
u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
@ -272,13 +288,6 @@ static __forceinline u32 getPixelAddress16Z(int x, int y, u32 bp, u32 bw)
return word;
}
// Index (in 16-bit pixels) of pixel (x, y) for the 16Z layout, with no base-pointer term.
// Pages cover 64x64 pixels (4096 entries each) and bw >> 6 pages make up one page row;
// the position inside the page comes from g_pageTable16Z.
static __forceinline u32 getPixelAddress16Z_0(int x, int y, u32 bw)
{
	const u32 pagesPerRow = bw >> 6;
	const u32 pageIndex = (y >> 6) * pagesPerRow + (x >> 6);

	return pageIndex * 4096 + g_pageTable16Z[y & 63][x & 63];
}
static __forceinline u32 getPixelAddress16SZ(int x, int y, u32 bp, u32 bw)
{
u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
@ -286,15 +295,7 @@ static __forceinline u32 getPixelAddress16SZ(int x, int y, u32 bp, u32 bw)
return word;
}
// Index (in 16-bit pixels) of pixel (x, y) for the 16SZ layout, with no base-pointer term.
// Pages cover 64x64 pixels (4096 entries each) and bw >> 6 pages make up one page row;
// the position inside the page comes from g_pageTable16SZ.
static __forceinline u32 getPixelAddress16SZ_0(int x, int y, u32 bw)
{
	const u32 pagesPerRow = bw >> 6;
	const u32 pageIndex = (y >> 6) * pagesPerRow + (x >> 6);

	return pageIndex * 4096 + g_pageTable16SZ[y & 63][x & 63];
}
//#define getPixelAddress_0(psm,x,y,bw) getPixelAddress##psm##_0(x,y,bw)
//#define getPixelAddress(psm,x,y,bp,bw) getPixelAddress##psm##(x,y,bp,bw)
///////////////
static __forceinline void writePixel32(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw)
{
@ -375,7 +376,6 @@ static __forceinline void writePixel16SZ(void* pmem, int x, int y, u32 pixel, u3
((u16*)pmem)[getPixelAddress16SZ(x, y, bp, bw)] = pixel;
}
///////////////
static __forceinline u32 readPixel32(const void* pmem, int x, int y, u32 bp, u32 bw)
@ -457,161 +457,48 @@ static __forceinline u32 readPixel16SZ(const void* pmem, int x, int y, u32 bp, u
// Functions that take 0 bps //
///////////////////////////////
// Stores a full 32-bit pixel at (x, y); pmem is indexed as an array of u32.
static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u32* const words = (u32*)pmem;
	const u32 index = getPixelAddress32_0(x, y, bw);
	words[index] = pixel;
}
// Overwrites the first three bytes (in memory order) of the 32-bit word at (x, y)
// with the first three bytes of pixel; the fourth byte of the word is left untouched.
static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u32* const word = &((u32*)pmem)[getPixelAddress32_0(x, y, bw)];
	u8* const dst = (u8*)word;
	const u8* const src = (const u8*)&pixel;

	for (int i = 0; i < 3; ++i)
		dst[i] = src[i];
}
// Stores pixel (truncated to 16 bits) at (x, y); pmem is indexed as an array of u16.
static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u16* const cells = (u16*)pmem;
	const u32 index = getPixelAddress16_0(x, y, bw);
	cells[index] = pixel;
}
// Stores pixel (truncated to 16 bits) at (x, y) using the 16S address function.
static __forceinline void writePixel16S_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u16* const cells = (u16*)pmem;
	const u32 index = getPixelAddress16S_0(x, y, bw);
	cells[index] = pixel;
}
// Stores pixel (truncated to 8 bits) at (x, y); pmem is indexed as an array of bytes.
static __forceinline void writePixel8_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u8* const bytes = (u8*)pmem;
	const u32 index = getPixelAddress8_0(x, y, bw);
	bytes[index] = pixel;
}
// Stores pixel (truncated to 8 bits) into byte 3 of the 32-bit word at (x, y).
static __forceinline void writePixel8H_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u8* const bytes = (u8*)pmem;
	// Four bytes per word, plus 3 to select the last byte of that word.
	const u32 byteIndex = 4*getPixelAddress32_0(x, y, bw)+3;
	bytes[byteIndex] = pixel;
}
// Stores a 4-bit pixel at (x, y). Two pixels share a byte: odd addresses use the
// high nibble, even addresses the low nibble. The value is not masked to 4 bits here.
static __forceinline void writePixel4_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	const u32 addr = getPixelAddress4_0(x, y, bw);
	u8* const cell = &((u8*)pmem)[addr/2];
	const u8 previous = *cell;

	*cell = (addr & 0x1) ? ((previous & 0x0f) | (pixel << 4))
	                     : ((previous & 0xf0) | (pixel));
}
// ORs pixel into the low nibble of byte 3 of the word at (x, y), keeping the high nibble.
static __forceinline void writePixel4HL_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u8* const cell = (u8*)pmem + 4 * getPixelAddress4HL_0(x, y, bw) + 3;
	const u8 keptHigh = *cell & 0xf0;
	*cell = keptHigh | pixel;
}
// Stores pixel into the high nibble of byte 3 of the word at (x, y), keeping the low nibble.
static __forceinline void writePixel4HH_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u8* const cell = (u8*)pmem + 4 * getPixelAddress4HH_0(x, y, bw) + 3;
	const u8 keptLow = *cell & 0x0f;
	*cell = keptLow | (pixel << 4);
}
// Stores a full 32-bit value at (x, y) using the 32Z address function.
static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u32* const words = (u32*)pmem;
	const u32 index = getPixelAddress32Z_0(x, y, bw);
	words[index] = pixel;
}
// Overwrites the first three bytes (in memory order) of the 32Z word at (x, y)
// with the first three bytes of pixel; the fourth byte is left untouched.
static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u8* const dst = (u8*)pmem + 4 * getPixelAddress32Z_0(x, y, bw);
	const u8* const src = (const u8*)&pixel;

	for (int i = 0; i < 3; ++i)
		dst[i] = src[i];
}
// Stores pixel (truncated to 16 bits) at (x, y) using the 16Z address function.
static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u16* const cells = (u16*)pmem;
	const u32 index = getPixelAddress16Z_0(x, y, bw);
	cells[index] = pixel;
}
// Stores pixel (truncated to 16 bits) at (x, y) using the 16SZ address function.
static __forceinline void writePixel16SZ_0(void* pmem, int x, int y, u32 pixel, u32 bw)
{
	u16* const cells = (u16*)pmem;
	const u32 index = getPixelAddress16SZ_0(x, y, bw);
	cells[index] = pixel;
}
// "_0" address helpers: each one forwards to the matching getPixelAddress<fmt>()
// with the base pointer argument (bp) fixed to 0, so callers only supply x, y and bw.
static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw) { return getPixelAddress32(x, y, 0, bw); }
static __forceinline u32 getPixelAddress16_0(int x, int y, u32 bw) { return getPixelAddress16(x, y, 0, bw); }
static __forceinline u32 getPixelAddress16S_0(int x, int y, u32 bw) { return getPixelAddress16S(x, y, 0, bw); }
static __forceinline u32 getPixelAddress8_0(int x, int y, u32 bw) { return getPixelAddress8(x, y, 0, bw); }
static __forceinline u32 getPixelAddress4_0(int x, int y, u32 bw) { return getPixelAddress4(x, y, 0, bw); }
static __forceinline u32 getPixelAddress32Z_0(int x, int y, u32 bw) { return getPixelAddress32Z(x, y, 0, bw); }
static __forceinline u32 getPixelAddress16Z_0(int x, int y, u32 bw) { return getPixelAddress16Z(x, y, 0, bw); }
static __forceinline u32 getPixelAddress16SZ_0(int x, int y, u32 bw) { return getPixelAddress16SZ(x, y, 0, bw); }
///////////////
// Loads the full 32-bit pixel at (x, y); pmem is indexed as an array of u32.
static __forceinline u32 readPixel32_0(const void* pmem, int x, int y, u32 bw)
{
	const u32* const words = (const u32*)pmem;
	const u32 index = getPixelAddress32_0(x, y, bw);
	return words[index];
}
// Loads the 32-bit word at (x, y) and returns its low 24 bits.
static __forceinline u32 readPixel24_0(const void* pmem, int x, int y, u32 bw)
{
	const u32* const words = (const u32*)pmem;
	const u32 index = getPixelAddress32_0(x, y, bw);
	return words[index] & 0xffffff;
}
// Loads the 16-bit pixel at (x, y), widened to u32.
static __forceinline u32 readPixel16_0(const void* pmem, int x, int y, u32 bw)
{
	const u16* const cells = (const u16*)pmem;
	const u32 index = getPixelAddress16_0(x, y, bw);
	return cells[index];
}
// Loads the 16-bit pixel at (x, y) using the 16S address function, widened to u32.
static __forceinline u32 readPixel16S_0(const void* pmem, int x, int y, u32 bw)
{
	const u16* const cells = (const u16*)pmem;
	const u32 index = getPixelAddress16S_0(x, y, bw);
	return cells[index];
}
// Loads the 8-bit pixel at (x, y), widened to u32.
static __forceinline u32 readPixel8_0(const void* pmem, int x, int y, u32 bw)
{
	const u8* const bytes = (const u8*)pmem;
	const u32 index = getPixelAddress8_0(x, y, bw);
	return bytes[index];
}
// Returns byte 3 of the 32-bit word at (x, y).
static __forceinline u32 readPixel8H_0(const void* pmem, int x, int y, u32 bw)
{
	const u8* const bytes = (const u8*)pmem;
	// Four bytes per word, plus 3 to select the last byte of that word.
	const u32 byteIndex = 4*getPixelAddress32_0(x, y, bw) + 3;
	return bytes[byteIndex];
}
// Returns the 4-bit pixel at (x, y). Two pixels share a byte: odd addresses are
// read from the high nibble, even addresses from the low nibble.
static __forceinline u32 readPixel4_0(const void* pmem, int x, int y, u32 bw)
{
	const u32 addr = getPixelAddress4_0(x, y, bw);
	const u8 packed = ((const u8*)pmem)[addr/2];

	return (addr & 0x1) ? (packed >> 4) : (packed & 0xf);
}
// Returns the low nibble of byte 3 of the word at (x, y).
static __forceinline u32 readPixel4HL_0(const void* pmem, int x, int y, u32 bw)
{
	const u8* const cell = (const u8*)pmem + 4 * getPixelAddress4HL_0(x, y, bw) + 3;
	const u8 packed = *cell;
	return packed & 0x0f;
}
// Returns the high nibble of byte 3 of the word at (x, y).
static __forceinline u32 readPixel4HH_0(const void* pmem, int x, int y, u32 bw)
{
	const u8* const cell = (const u8*)pmem + 4 * getPixelAddress4HH_0(x, y, bw) + 3;
	const u8 packed = *cell;
	return packed >> 4;
}
// "_0" write helpers: each one forwards to the matching writePixel<fmt>() with the
// base pointer argument (bp) fixed to 0; all other arguments are passed through unchanged.
static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel32(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel24(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel16(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel16S_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel16S(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel8_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel8(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel8H_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel8H(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel4_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel4(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel4HL_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel4HL(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel4HH_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel4HH(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel32Z(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel24Z(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel16Z(pmem, x, y, pixel, 0, bw); }
static __forceinline void writePixel16SZ_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel16SZ(pmem, x, y, pixel, 0, bw); }
///////////////
// Loads the full 32-bit value at (x, y) using the 32Z address function.
static __forceinline u32 readPixel32Z_0(const void* pmem, int x, int y, u32 bw)
{
	const u32* const words = (const u32*)pmem;
	const u32 index = getPixelAddress32Z_0(x, y, bw);
	return words[index];
}
// "_0" read helpers: each one forwards to the matching readPixel<fmt>() with the
// base pointer argument (bp) fixed to 0 and returns its result unchanged.
static __forceinline u32 readPixel32_0(const void* pmem, int x, int y, u32 bw) { return readPixel32(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel24_0(const void* pmem, int x, int y, u32 bw) { return readPixel24(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel16_0(const void* pmem, int x, int y, u32 bw) { return readPixel16(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel16S_0(const void* pmem, int x, int y, u32 bw) { return readPixel16S(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel8_0(const void* pmem, int x, int y, u32 bw) { return readPixel8(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel8H_0(const void* pmem, int x, int y, u32 bw) { return readPixel8H(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel4_0(const void* pmem, int x, int y, u32 bw) { return readPixel4(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel4HL_0(const void* pmem, int x, int y, u32 bw) { return readPixel4HL(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel4HH_0(const void* pmem, int x, int y, u32 bw) { return readPixel4HH(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel32Z_0(const void* pmem, int x, int y, u32 bw) { return readPixel32Z(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel24Z_0(const void* pmem, int x, int y, u32 bw) { return readPixel24Z(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel16Z_0(const void* pmem, int x, int y, u32 bw) { return readPixel16Z(pmem, x, y, 0, bw); }
static __forceinline u32 readPixel16SZ_0(const void* pmem, int x, int y, u32 bw) { return readPixel16SZ(pmem, x, y, 0, bw); }
// Loads the 32Z word at (x, y) and returns its low 24 bits.
static __forceinline u32 readPixel24Z_0(const void* pmem, int x, int y, u32 bw)
{
	const u32* const words = (const u32*)pmem;
	const u32 index = getPixelAddress32Z_0(x, y, bw);
	return words[index] & 0xffffff;
}
// Loads the 16-bit value at (x, y) using the 16Z address function, widened to u32.
static __forceinline u32 readPixel16Z_0(const void* pmem, int x, int y, u32 bw)
{
	const u16* const cells = (const u16*)pmem;
	const u32 index = getPixelAddress16Z_0(x, y, bw);
	return cells[index];
}
// Loads the 16-bit value at (x, y) using the 16SZ address function, widened to u32.
static __forceinline u32 readPixel16SZ_0(const void* pmem, int x, int y, u32 bw)
{
	const u16* const cells = (const u16*)pmem;
	const u32 index = getPixelAddress16SZ_0(x, y, bw);
	return cells[index];
}
///////////////
extern int TransferHostLocal32(const void* pbyMem, u32 nQWordSize);
extern int TransferHostLocal32Z(const void* pbyMem, u32 nQWordSize);

View File

@ -638,7 +638,7 @@ void __gifCall GIFRegHandlerSCISSOR(const u32* data)
Flush();
}
m_env.CTXT[i].SCISSOR = (Vector4i)r->SCISSOR;
m_env.CTXT[i].SCISSOR = (GSVector4i)r->SCISSOR;
m_env.CTXT[i].UpdateScissor();*/
ZZLog::Greg_Log("SCISSOR%d", i);

View File

@ -56,6 +56,7 @@ extern "C" char* CALLBACK PS2EgetLibName(void);
#include <vector>
#include <string>
#include <cstring>
extern std::string s_strIniPath; // Air's new (r2361) new constant for ini file path
@ -87,6 +88,9 @@ static __forceinline void pcsx2_aligned_free(void* pmem)
#define _aligned_malloc pcsx2_aligned_malloc
#define _aligned_free pcsx2_aligned_free
#endif
#ifdef __LINUX__
#include <sys/timeb.h> // ftime(), struct timeb
inline unsigned long timeGetTime()
@ -97,6 +101,15 @@ inline unsigned long timeGetTime()
return (unsigned long)(t.time*1000 + t.millitm);
}
#include <time.h>
// Returns the CPU time consumed by this process (CLOCK_PROCESS_CPUTIME_ID) in nanoseconds.
//
// The seconds field is folded into the result; returning tv_nsec alone made the
// value restart from 0 every second, so differences taken across a second
// boundary were meaningless. Where unsigned long is 32 bits the result wraps
// modulo 2^32 ns, which still gives correct unsigned differences for intervals
// shorter than about 4.29 seconds.
inline unsigned long timeGetPreciseTime()
{
	timespec t;
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t);
	return (unsigned long)t.tv_sec * 1000000000UL + (unsigned long)t.tv_nsec;
}
struct RECT
{
int left, top;
@ -138,6 +151,7 @@ enum GSWindowDim
GSDim_1024,
GSDim_1280,
};
typedef union
{
struct
@ -217,7 +231,7 @@ typedef struct
// Returns a gameHacks value whose bit mask is the OR of `hacks` and `def_hacks`.
// (A preceding assignment that also OR-ed in GAME_PATH3HACK was a dead store,
// immediately overwritten by this one, and has been dropped.)
gameHacks settings()
{
	gameHacks tempHack;
	tempHack._u32 = (hacks._u32 | def_hacks._u32);
	return tempHack;
}
bool fullscreen() { return !!(zz_options.fullscreen); }

View File

@ -25,6 +25,8 @@ void SaveConfig()
WritePrivateProfileString("Settings", "Width", szValue, iniFile.c_str());
sprintf(szValue, "%u", conf.height);
WritePrivateProfileString("Settings", "Height", szValue, iniFile.c_str());
sprintf(szValue, "%u", conf.SkipDraw);
WritePrivateProfileString("Settings", "SkipDraw", szValue, iniFile.c_str());
}
void LoadConfig()
@ -40,6 +42,7 @@ void LoadConfig()
conf.bilinear = 1;
conf.width = 640;
conf.height = 480;
conf.SkipDraw = 0;
FILE *fp = fopen(iniFile.c_str(), "rt");
@ -67,6 +70,8 @@ void LoadConfig()
conf.width = strtoul(szValue, NULL, 10);
GetPrivateProfileString("Settings", "Height", NULL, szValue, 20, iniFile.c_str());
conf.height = strtoul(szValue, NULL, 10);
GetPrivateProfileString("Settings", "SkipDraw", NULL, szValue, 20, iniFile.c_str());
conf.SkipDraw = strtoul(szValue, NULL, 10);
if (conf.aa < 0 || conf.aa > 4) conf.aa = 0;

View File

@ -116,7 +116,7 @@ typedef struct GameHackStruct
u32 HackMask;
} GameHack;
#define HACK_NUMBER 30
#define HACK_NUMBER 25
GameHack HackinshTable[HACK_NUMBER] =
{
@ -127,30 +127,31 @@ GameHack HackinshTable[HACK_NUMBER] =
{"*** 4 TexA hack", GAME_TEXAHACK},
{"*** 5 No Target Resolve", GAME_NOTARGETRESOLVE},
{"*** 6 Exact color", GAME_EXACTCOLOR},
{"*** 7 No color clamp", GAME_NOCOLORCLAMP},
{"*** 8 FFX hack", GAME_FFXHACK},
{"*** 9 No Alpha Fail", GAME_NOALPHAFAIL},
{"***10 No Depth Update", GAME_NODEPTHUPDATE},
{"***11 Quick Resolve 1", GAME_QUICKRESOLVE1},
{"***12 No quick resolve", GAME_NOQUICKRESOLVE},
{"***13 Notaget clut", GAME_NOTARGETCLUT},
{"***14 No Stencil", GAME_NOSTENCIL},
{"***15 No Depth resolve", GAME_NODEPTHRESOLVE},
{"***16 Full 16 bit", GAME_FULL16BITRES},
{"***17 Resolve promoted", GAME_RESOLVEPROMOTED},
{"***18 Fast Update", GAME_FASTUPDATE},
{"***19 No Alpha Test", GAME_NOALPHATEST},
{"***20 Disable MRT deprh", GAME_DISABLEMRTDEPTH},
{"***21 32 bit targes", GAME_32BITTARGS},
{"***22 path 3 hack", GAME_PATH3HACK},
{"***23 parallelise calls", GAME_DOPARALLELCTX},
{"***24 specular highligths", GAME_XENOSPECHACK},
{"***25 partial pointers", GAME_PARTIALPOINTERS},
{"***26 partial depth", GAME_PARTIALDEPTH},
{"***27 reget hack", GAME_REGETHACK},
//{"***xx No color clamp", GAME_NOCOLORCLAMP},
//{"***xx FFX hack", GAME_FFXHACK},
{"*** 7 No Alpha Fail", GAME_NOALPHAFAIL},
{"*** 8 No Depth Update", GAME_NODEPTHUPDATE},
{"*** 9 Quick Resolve 1", GAME_QUICKRESOLVE1},
{"***10 No quick resolve", GAME_NOQUICKRESOLVE},
{"***11 Notaget clut", GAME_NOTARGETCLUT},
{"***12 No Stencil", GAME_NOSTENCIL},
{"***13 No Depth resolve", GAME_NODEPTHRESOLVE},
{"***14 Full 16 bit", GAME_FULL16BITRES},
{"***15 Resolve promoted", GAME_RESOLVEPROMOTED},
{"***16 Fast Update", GAME_FASTUPDATE},
{"***17 No Alpha Test", GAME_NOALPHATEST},
{"***18 Disable MRT depth", GAME_DISABLEMRTDEPTH},
//{"***xx 32 bit targs", GAME_32BITTARGS},
//{"***xx Path 3 hack", GAME_PATH3HACK},
//{"***xx Parallel calls", GAME_DOPARALLELCTX},
{"***19 Specular highlights", GAME_XENOSPECHACK},
//{"***xx Partial pointers", GAME_PARTIALPOINTERS},
{"***20 Partial depth", GAME_PARTIALDEPTH},
{"***21 Reget hack", GAME_REGETHACK},
{"***28 gust hack", GAME_GUSTHACK},
{"***29 log-Z", GAME_NOLOGZ}
{"***22 Gust hack", GAME_GUSTHACK},
{"***23 Log-Z", GAME_NOLOGZ},
{"***24 Auto skipdraw", GAME_AUTOSKIPDRAW}
};
int CurrentHackSetting = 0;
@ -172,7 +173,7 @@ void ProcessHackSetting(bool reverse)
{
CurrentHackSetting++;
if (CurrentHackSetting == HACK_NUMBER) CurrentHackSetting = 0;
if (CurrentHackSetting >= HACK_NUMBER) CurrentHackSetting = 0;
}
conf.hacks._u32 |= HackinshTable[CurrentHackSetting].HackMask;

View File

@ -244,6 +244,27 @@ void Warn_Log(const char *fmt, ...)
#endif
}
// Development-build logger. When ZEROGS_DEVBUILD is defined, writes the
// printf-style message to gsLog if IsLogging() is true, and always echoes it to
// stderr prefixed with "ZZogl-PG: ". Without ZEROGS_DEVBUILD the body is empty.
//
// fmt  printf-style format string
// ...  arguments consumed by fmt
void Dev_Log(const char *fmt, ...)
{
#ifdef ZEROGS_DEVBUILD
	va_list list;

	if (IsLogging())
	{
		va_start(list, fmt);
		vfprintf(gsLog, fmt, list);
		va_end(list);
		fprintf(gsLog, "\n");
	}

	// A va_list is indeterminate after vfprintf has consumed it, so begin a
	// fresh traversal for stderr rather than reusing the one used for gsLog.
	va_start(list, fmt);
	fprintf(stderr, "ZZogl-PG: ");
	vfprintf(stderr, fmt, list);
	fprintf(stderr, "\n");
	va_end(list);
#endif
}
void Debug_Log(const char *fmt, ...)
{
#if _DEBUG

View File

@ -185,6 +185,7 @@ extern void Prim_Log(const char *fmt, ...);
extern void GS_Log(const char *fmt, ...);
extern void Debug_Log(const char *fmt, ...);
extern void Dev_Log(const char *fmt, ...);
extern void Warn_Log(const char *fmt, ...);
extern void Error_Log(const char *fmt, ...);
};

View File

@ -54,6 +54,7 @@ void ZeroGS::AdjustTransToAspect(float4& v)
{
double temp;
float f;
const float mult = 1 / 32767.0f;
if (conf.width * nBackbufferHeight > conf.height * nBackbufferWidth) // limited by width
{
@ -74,7 +75,7 @@ void ZeroGS::AdjustTransToAspect(float4& v)
v.z *= f;
}
v *= 1 / 32767.0f;
v *= mult;
}
inline bool FrameSkippingHelper()

View File

@ -89,7 +89,7 @@ inline u32 CreateInterlaceTex(int width)
glGenTextures(1, &s_ptexInterlace);
glBindTexture(GL_TEXTURE_RECTANGLE_NV, s_ptexInterlace);
TextureRect(4, width, 1, GL_RGBA, GL_UNSIGNED_BYTE, &data[0]);
TextureRect(GL_RGBA, width, 1, GL_RGBA, GL_UNSIGNED_BYTE, &data[0]);
setRectFilters(GL_NEAREST);
GL_REPORT_ERRORD();

View File

@ -75,8 +75,6 @@ map<string, GLbyte> mapGLExtensions;
namespace ZeroGS
{
RenderFormatType g_RenderFormatType = RFT_float16;
extern void KickPoint();
extern void KickLine();
extern void KickTriangle();
@ -84,8 +82,8 @@ extern void KickTriangleFan();
extern void KickSprite();
extern void KickDummy();
extern bool LoadEffects();
extern bool LoadExtraEffects();
extern FRAGMENTSHADER* LoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);
extern bool ZZshLoadExtraEffects();
extern FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);
GLuint vboRect = 0;
vector<GLuint> g_vboBuffers; // VBOs for all drawing commands
@ -270,19 +268,6 @@ inline void ZeroGS::CreateOtherCheck()
if (Max_Texture_Size_NV < 1024)
ZZLog::Error_Log("Could not properly make bitmasks, so some textures will be missed.");
/* Zeydlitz: we don't support 128-bit targets yet. they are slow and weirdo
if( conf.settings() & GAME_32BITTARGS ) {
g_RenderFormatType = RFT_byte8;
ZZLog::Error_Log("Setting 32 bit render target.");
}
else {
if( !IsGLExt("GL_NV_float_buffer") && !IsGLExt("GL_ARB_color_buffer_float") && !IsGLExt("ATI_pixel_format_float") ) {
ZZLog::Error_Log("******\nZZogl: GS WARNING: Floating point render targets not supported, switching to 32bit\nZZogl: *********");
g_RenderFormatType = RFT_byte8;
}
}*/
g_RenderFormatType = RFT_byte8;
#ifdef _WIN32
if (IsGLExt("WGL_EXT_swap_control") || IsGLExt("EXT_swap_control"))
wglSwapIntervalEXT(0);
@ -469,8 +454,6 @@ bool ZeroGS::Create(int _width, int _height)
Destroy(1);
GSStateReset();
g_RenderFormatType = RFT_float16;
if (!Create_Window(_width, _height)) return false;
if (!CreateFillExtensionsMap()) return false;
if (!CreateImportantCheck()) return false;
@ -574,7 +557,7 @@ bool ZeroGS::Create(int _width, int _height)
PBITMAPINFO pinfo = (PBITMAPINFO)LockResource(hBitmapGlob);
GLenum tempFmt = (pinfo->bmiHeader.biBitCount == 32) ? GL_RGBA : GL_RGB;
TextureRect(4, pinfo->bmiHeader.biWidth, pinfo->bmiHeader.biHeight, tempFmt, GL_UNSIGNED_BYTE, (u8*)pinfo + pinfo->bmiHeader.biSize);
TextureRect(GL_RGBA, pinfo->bmiHeader.biWidth, pinfo->bmiHeader.biHeight, tempFmt, GL_UNSIGNED_BYTE, (u8*)pinfo + pinfo->bmiHeader.biSize);
nLogoWidth = pinfo->bmiHeader.biWidth;
nLogoHeight = pinfo->bmiHeader.biHeight;

View File

@ -207,8 +207,6 @@ int icurctx = -1;
extern CRangeManager s_RangeMngr; // manages overwritten memory // zz
void FlushTransferRanges(const tex0Info* ptex); //zz
RenderFormatType GetRenderFormat() { return g_RenderFormatType; } //zz
// use to update the state
void SetTexVariables(int context, FRAGMENTSHADER* pfragment); // zz
void SetTexInt(int context, FRAGMENTSHADER* pfragment, int settexint); // zz
@ -859,7 +857,7 @@ inline float4 FlushSetPageOffset(FRAGMENTSHADER* pfragment, int shadertype, CRen
// zoe2
if (PSMT_ISZTEX(ptextarg->psm)) vpageoffset.w = -1.0f;
ZZshSetParameter4fv(pfragment->prog, pfragment->fPageOffset, vpageoffset, "g_fPageOffset");
ZZshSetParameter4fv(pfragment->fPageOffset, vpageoffset, "g_fPageOffset");
return vpageoffset;
}
@ -877,7 +875,7 @@ inline float4 FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& c
v.y = 16.0f / (float)curvb.tex0.th;
v.z = 0.5f * v.x;
v.w = 0.5f * v.y;
ZZshSetParameter4fv(pfragment->prog, pfragment->fTexOffset, v, "g_fTexOffset");
ZZshSetParameter4fv(pfragment->fTexOffset, v, "g_fTexOffset");
}
else if (shadertype == 4)
{
@ -886,7 +884,7 @@ inline float4 FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& c
v.y = 16.0f / (float)ptextarg->fbh;
v.z = -1;
v.w = 8.0f / (float)ptextarg->fbh;
ZZshSetParameter4fv(pfragment->prog, pfragment->fTexOffset, v, "g_fTexOffset");
ZZshSetParameter4fv(pfragment->fTexOffset, v, "g_fTexOffset");
}
return v;
@ -920,7 +918,7 @@ inline float4 FlushTextureDims(FRAGMENTSHADER* pfragment, int shadertype, VB& cu
if (shadertype == 4)
vTexDims.z += 8.0f;
ZZshSetParameter4fv(pfragment->prog, pfragment->fTexDims, vTexDims, "g_fTexDims");
ZZshSetParameter4fv(pfragment->fTexDims, vTexDims, "g_fTexDims");
return vTexDims;
}
@ -970,7 +968,7 @@ inline FRAGMENTSHADER* FlushUseExistRenderTarget(VB& curvb, CRenderTarget* ptext
float4 vTexDims = FlushTextureDims(pfragment, shadertype, curvb, ptextarg);
if (pfragment->sCLUT != NULL && ptexclut != 0)
ZZshGLSetTextureParameter(pfragment->prog, pfragment->sCLUT, ptexclut, "CLUT");
ZZshGLSetTextureParameter(pfragment->sCLUT, ptexclut, "CLUT");
FlushApplyResizeFilter(curvb, dwFilterOpts, ptextarg, context);
@ -1016,13 +1014,13 @@ inline void FlushSetTexture(VB& curvb, FRAGMENTSHADER* pfragment, CRenderTarget*
// have to enable the texture parameters(curtest.atst)
if( curvb.ptexClamp[0] != 0 )
ZZshGLSetTextureParameter(pfragment->prog, pfragment->sBitwiseANDX, curvb.ptexClamp[0], "Clamp 0");
ZZshGLSetTextureParameter(pfragment->sBitwiseANDX, curvb.ptexClamp[0], "Clamp 0");
if( curvb.ptexClamp[1] != 0 )
ZZshGLSetTextureParameter(pfragment->prog, pfragment->sBitwiseANDY, curvb.ptexClamp[1], "Clamp 1");
ZZshGLSetTextureParameter(pfragment->sBitwiseANDY, curvb.ptexClamp[1], "Clamp 1");
if( pfragment->sMemory != NULL && s_ptexCurSet[context] != 0)
ZZshGLSetTextureParameter(pfragment->prog, pfragment->sMemory, s_ptexCurSet[context], "Clamp memory");
ZZshGLSetTextureParameter(pfragment->sMemory, s_ptexCurSet[context], "Clamp memory");
}
@ -1170,13 +1168,13 @@ inline u32 AlphaRenderAlpha(VB& curvb, const pixTest curtest, FRAGMENTSHADER* pf
v.w *= 255;
}
ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
}
else
{
// not using blending so set to defaults
float4 v = exactcolor ? float4(1, 510 * 255.0f / 256.0f, 0, 0) : float4(1, 2 * 255.0f / 256.0f, 0, 0);
ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
}
@ -1267,7 +1265,7 @@ inline void AlphaPabe(VB& curvb, FRAGMENTSHADER* pfragment, int exactcolor)
if (exactcolor) v.y *= 255;
ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
Draw(curvb);
@ -1336,7 +1334,7 @@ inline void AlphaFailureTestJob(VB& curvb, const pixTest curtest, FRAGMENTSHADE
if (exactcolor) { v.y *= 255; v.w *= 255; }
ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
glEnable(GL_BLEND);
GL_STENCILFUNC(GL_EQUAL, s_stencilref | STENCIL_FBA, s_stencilmask | STENCIL_FBA);
@ -1360,7 +1358,7 @@ inline void AlphaFailureTestJob(VB& curvb, const pixTest curtest, FRAGMENTSHADE
if (exactcolor) v.y *= 255;
ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
Draw(curvb);
@ -1412,7 +1410,7 @@ inline void AlphaSpecialTesting(VB& curvb, FRAGMENTSHADER* pfragment, u32 dwUsin
glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);
float4 v = float4(0, exactcolor ? 510.0f : 2.0f, 0, 0);
ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
Draw(curvb);
// don't need to restore
@ -1468,66 +1466,6 @@ inline void AlphaSaveTarget(VB& curvb)
#endif
}
// Extra clamping pass drawn after the main draw when alpha blending is enabled.
// It redraws curvb with the ppsOne pixel shader using MAX blending against 0
// (bit 0 of bAlphaClamping) and/or MIN blending against 1 (bit 1), then restores
// the depth mask, depth render target, alpha test and Z test it changed.
// The pass is skipped entirely for RFT_byte8 render targets or when the
// no_color_clamp setting is on.
inline void AlphaColorClamping(VB& curvb, const pixTest curtest)
{
// clamp the final colors, when enabled ffx2 credits mess up
//if (gs.colclamp) ZZLog::Error_Log("ColClamp!");
if ((curvb.curprim.abe && bAlphaClamping) && (GetRenderFormat() != RFT_byte8) && !(conf.settings().no_color_clamp)) // if !colclamp, skip
{
//ZZLog::Error_Log("Clamped.");
ResetAlphaVariables();
// if processing the clamping case, make sure can write to the front buffer
glDisable(GL_STENCIL_TEST);
glEnable(GL_BLEND);
glDisable(GL_ALPHA_TEST);
glDisable(GL_DEPTH_TEST);
glDepthMask(0);
// write RGB only; alpha is left untouched by this pass
glColorMask(1, 1, 1, 0);
if (s_bWriteDepth) ResetRenderTarget(1);
SetShaderCaller("AlphaColorClamping");
ZZshSetPixelShader(ppsOne.prog);
GL_BLEND_RGB(GL_ONE, GL_ONE);
// NOTE(review): &f below is the address of a single float handed to a "4fv"
// setter; if the callee reads four floats, the last three come from adjacent
// stack memory — confirm against ZZshSetParameter4fv.
float f;
if (bAlphaClamping & 1) // min
{
f = 0;
ZZshSetParameter4fv(ppsOne.prog, ppsOne.sOneColor, &f, "g_fOneColor");
GL_BLENDEQ_RGB(GL_MAX_EXT);
Draw(curvb);
}
// bios shows white screen
if (bAlphaClamping & 2) // max
{
f = 1;
ZZshSetParameter4fv(ppsOne.prog, ppsOne.sOneColor, &f, "g_fOneColor");
GL_BLENDEQ_RGB(GL_MIN_EXT);
Draw(curvb);
}
// restore depth writes (and the depth render target) unless Z writes are masked
if (!curvb.zbuf.zmsk)
{
glDepthMask(1);
if (s_bWriteDepth)
{
assert(curvb.pdepth != NULL);
curvb.pdepth->SetRenderTarget(1);
}
}
if (curvb.test.ate && USEALPHATESTING) glEnable(GL_ALPHA_TEST);
GL_ZTEST(curtest.zte);
}
}
inline void FlushUndoFiter(u32 dwFilterOpts)
{
if (dwFilterOpts)
@ -1585,7 +1523,6 @@ void ZeroGS::Flush(int context)
GL_REPORT_ERRORD();
AlphaColorClamping(curvb, curtest);
FlushUndoFiter(dwFilterOpts);
ppf += curvb.nCount + 0x100000;
@ -1988,7 +1925,7 @@ void ZeroGS::SetTexInt(int context, FRAGMENTSHADER* pfragment, int settexint)
}
// clamp relies on texture width
inline void SetTexClamping(int context, FRAGMENTSHADER* pfragment )
void SetTexClamping(int context, FRAGMENTSHADER* pfragment)
{
FUNCLOG
SetShaderCaller("SetTexClamping");
@ -2004,14 +1941,19 @@ inline void SetTexClamping(int context, FRAGMENTSHADER* pfragment )
switch (pclamp->wms)
{
case 0:
v2.x = -1e10; v2.z = 1e10;
v2.x = -1e10;
v2.z = 1e10;
break;
case 1: // pclamp
// suikoden5 movie text
v2.x = 0; v2.z = 1-0.5f/fw;
v2.x = 0;
v2.z = 1 - 0.5f / fw;
break;
case 2: // reg pclamp
v2.x = (pclamp->minu+0.5f)/fw; v2.z = (pclamp->maxu-0.5f)/fw;
v2.x = (pclamp->minu + 0.5f) / fw;
v2.z = (pclamp->maxu - 0.5f) / fw;
break;
case 3: // region rep x
@ -2026,20 +1968,27 @@ inline void SetTexClamping(int context, FRAGMENTSHADER* pfragment )
g_PrevBitwiseTexX = correctMinu;
ptex[0] = ZeroGS::s_BitwiseTextures.GetTex(correctMinu, 0);
}
break;
}
switch (pclamp->wmt)
{
case 0:
v2.y = -1e10; v2.w = 1e10;
v2.y = -1e10;
v2.w = 1e10;
break;
case 1: // pclamp
// suikoden5 movie text
v2.y = 0; v2.w = 1-0.5f/fh;
v2.y = 0;
v2.w = 1 - 0.5f / fh;
break;
case 2: // reg pclamp
v2.y = (pclamp->minv+0.5f)/fh; v2.w = (pclamp->maxv-0.5f)/fh;
v2.y = (pclamp->minv + 0.5f) / fh;
v2.w = (pclamp->maxv - 0.5f) / fh;
break;
case 3: // region rep y
@ -2049,17 +1998,21 @@ inline void SetTexClamping(int context, FRAGMENTSHADER* pfragment )
v2.w = pclamp->maxv / fh;
int correctMinv = pclamp->minv & (~pclamp->maxv); // (A && B) || C == (A && (B && !C)) + C
if (correctMinv != g_PrevBitwiseTexY) {
if (correctMinv != g_PrevBitwiseTexY)
{
g_PrevBitwiseTexY = correctMinv;
ptex[1] = ZeroGS::s_BitwiseTextures.GetTex(correctMinv, ptex[0]);
}
break;
}
if (ZZshActiveParameter(pfragment->fTexWrapMode))
ZZshSetParameter4fv(pfragment->prog, pfragment->fTexWrapMode, v, "g_fTexWrapMode");
if (ZZshActiveParameter( pfragment->fClampExts))
ZZshSetParameter4fv(pfragment->prog, pfragment->fClampExts, v2, "g_fClampExts");
if (pfragment->fTexWrapMode != 0)
ZZshSetParameter4fv(pfragment->fTexWrapMode, v, "g_fTexWrapMode");
if (pfragment->fClampExts != 0)
ZZshSetParameter4fv(pfragment->fClampExts, v2, "g_fClampExts");
}
// Fixme should be in float4 lib
@ -2230,11 +2183,11 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)
// Test;*/
ZZshSetParameter4fv(pfragment->prog, pfragment->fTexAlpha, valpha, "g_fTexAlpha");
ZZshSetParameter4fv(pfragment->prog, pfragment->fTexAlpha2, valpha2, "g_fTexAlpha2");
ZZshSetParameter4fv(pfragment->fTexAlpha, valpha, "g_fTexAlpha");
ZZshSetParameter4fv(pfragment->fTexAlpha2, valpha2, "g_fTexAlpha2");
if (IsAlphaTestExpansion(tex0))
ZZshSetParameter4fv(pfragment->prog, pfragment->fTestBlack, vblack, "g_fTestBlack");
ZZshSetParameter4fv(pfragment->fTestBlack, vblack, "g_fTestBlack");
SetTexClamping(context, pfragment);
@ -2280,7 +2233,7 @@ void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0,
v.w = 1.0f / (float)fh;
if (pfragment->fRealTexDims)
ZZshSetParameter4fv(pfragment->prog, pfragment->fRealTexDims, v, "g_fRealTexDims");
ZZshSetParameter4fv(pfragment->fRealTexDims, v, "g_fRealTexDims");
else
ZZshSetParameter4fv(cgGetNamedParameter(pfragment->prog,"g_fRealTexDims"),v, "g_fRealTexDims");
}
@ -2336,15 +2289,15 @@ void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0,
v.z *= b.bpp * (1 / 32.0f);
}
ZZshSetParameter4fv(pfragment->prog, pfragment->fTexDims, vTexDims, "g_fTexDims");
ZZshSetParameter4fv(pfragment->fTexDims, vTexDims, "g_fTexDims");
// ZZshSetParameter4fv(pfragment->prog, pfragment->fTexBlock, b.vTexBlock, "g_fTexBlock"); // I change it, and it's working. Seems casting from float4 to float[4] is ok.
ZZshSetParameter4fv(pfragment->prog, pfragment->fTexBlock, &b.vTexBlock.x, "g_fTexBlock");
ZZshSetParameter4fv(pfragment->prog, pfragment->fTexOffset, v, "g_fTexOffset");
// ZZshSetParameter4fv(pfragment->fTexBlock, b.vTexBlock, "g_fTexBlock"); // I change it, and it's working. Seems casting from float4 to float[4] is ok.
ZZshSetParameter4fv(pfragment->fTexBlock, &b.vTexBlock.x, "g_fTexBlock");
ZZshSetParameter4fv(pfragment->fTexOffset, v, "g_fTexOffset");
// get hardware texture dims
//int texheight = (pmemtarg->realheight+pmemtarg->widthmult-1)/pmemtarg->widthmult;
int texwidth = GPU_TEXWIDTH * pmemtarg->widthmult * pmemtarg->channels;
//int texheight = pmemtarg->texH;
int texwidth = pmemtarg->texW;
v.y = 1.0f;
v.x = (fpageint - (float)pmemtarg->realy / (float)pmemtarg->widthmult + 0.5f);//*v.y;

View File

@ -1,83 +1,493 @@
/* ZeroGS KOSMOS
/* ZZ Open GL graphics plugin
* Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
* Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
*
* Zerofrog's ZeroGS KOSMOS (c)2005-2008
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Zerofrog forgot to write any copyright notice after releasing the plugin into GPLv2
* If someone can contact him successfully to clarify this matter that would be great.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
// Now that it's down to 82 lines, and most of it's fairly obvious, perhaps it'd be easier to
// just reimplement it... -arcum42
#ifndef ZZOGLMATH_H_INCLUDED
#define ZZOGLMATH_H_INCLUDED
//Remind me to check and see if this is necessary, and what uses it. --arcum42
#ifndef _WIN32
#include <alloca.h>
#endif
#include <string.h>
#include <math.h>
#include <assert.h>
typedef float dReal;
//#define ZZ_MMATH
// class used for 3 and 4 dim vectors and quaternions
// It is better to use this for a 3 dim vector because it is 16byte aligned and SIMD instructions can be used
#ifndef ZZ_MMATH
class float4
template <class T>
class Vector4
{
public:
dReal x, y, z, w;
T x, y, z, w;
float4() : x(0), y(0), z(0), w(0) {}
float4(dReal x, dReal y, dReal z) : x(x), y(y), z(z), w(0) {}
float4(dReal x, dReal y, dReal z, dReal w) : x(x), y(y), z(z), w(w) {}
float4(const float4 &vec) : x(vec.x), y(vec.y), z(vec.z), w(vec.w) {}
float4(const dReal* pf) { assert(pf != NULL); x = pf[0]; y = pf[1]; z = pf[2]; w = 0; }
dReal operator[](int i) const { return (&x)[i]; }
dReal& operator[](int i) { return (&x)[i]; }
// Builds a vector from up to four components; any component not supplied is 0.
Vector4(T x1 = 0, T y1 = 0, T z1 = 0, T w1 = 0)
	: x(x1), y(y1), z(z1), w(w1)
{
}
// casting operators
operator dReal*() { return &x; }
operator const dReal*() const { return (const dReal*)&x; }
Vector4(Vector4<T> &f)
{
x = f.x;
y = f.y;
z = f.z;
w = f.w;
}
// SCALAR FUNCTIONS
inline dReal dot(const float4 &v) const { return x*v.x + y*v.y + z*v.z + w*v.w; }
inline void Set3(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; }
inline void Set4(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; w = pvals[3]; }
inline void SetColor(u32 color)
// Builds the vector from the first four elements of the array f:
// f[0] -> x, f[1] -> y, f[2] -> z, f[3] -> w.
// f must therefore point to at least four readable values.
Vector4(T* f)
{
x = f[0];
y = f[1];
z = f[2];
w = f[3]; // For some reason, the old code set this to 0.
}
// Indexed access to the components: 0 -> x, 1 -> y, 2 -> z, 3 -> w.
// Any other index is a programming error and trips the assert.
T& operator[](int i)
{
	switch(i)
	{
		case 0: return x;
		case 1: return y;
		case 2: return z;
		case 3: return w;
		default:
			assert(0);
			// When asserts are compiled out (NDEBUG) the function must still
			// return a valid reference; flowing off the end would be undefined behavior.
			return w;
	}
}
operator T*()
{
return (T*) this;
}
operator const T*() const
{
return (const T*) this;
}
Vector4<T>& operator =(const Vector4<T>& v)
{
x = v.x;
y = v.y;
z = v.z;
w = v.w;
return *this;
}
bool operator ==(const Vector4<T>& v)
{
return !!( x == v.x &&
y == v.y &&
z == v.z &&
w == v.w );
}
Vector4<T> operator +(const Vector4<T>& v) const
{
return Vector4<T>(x + v.x, y + v.y, z + v.z, w + v.w);
}
Vector4<T> operator -(const Vector4<T>& v) const
{
return Vector4<T>(x - v.x, y - v.y, z - v.z, w - v.w);
}
Vector4<T> operator *(const Vector4<T>& v) const
{
return Vector4<T>(x * v.x, y * v.y, z * v.z, w * v.w);
}
Vector4<T> operator /(const Vector4<T>& v) const
{
return Vector4<T>(x / v.x, y / v.y, z / v.z, w / v.w);
}
Vector4<T> operator +(T val) const
{
return Vector4<T>(x + val, y + val, z + val, w + val);
}
Vector4<T> operator -(T val) const
{
return Vector4<T>(x - val, y - val, z - val, w - val);
}
Vector4<T> operator *(T val) const
{
return Vector4<T>(x * val, y * val, z * val, w * val);
}
Vector4<T> operator /(T val) const
{
return Vector4<T>(x / val, y / val, z / val, w / val);
}
Vector4<T>& operator +=(const Vector4<T>& v)
{
*this = *this + v;
return *this;
}
Vector4<T>& operator -=(const Vector4<T>& v)
{
*this = *this - v;
return *this;
}
Vector4<T>& operator *=(const Vector4<T>& v)
{
*this = *this * v;
return *this;
}
// Component-wise division by another vector, in place.
// (The previous body subtracted v instead of dividing by it.)
Vector4<T>& operator /=(const Vector4<T>& v)
{
	*this = *this / v;
	return *this;
}
Vector4<T>& operator +=(T val)
{
*this = *this + (T)val;
return *this;
}
Vector4<T>& operator -=(T val)
{
*this = *this - (T)val;
return *this;
}
Vector4<T>& operator *=(T val)
{
*this = *this * (T)val;
return *this;
}
Vector4<T>& operator /=(T val)
{
*this = *this / (T)val;
return *this;
}
// Probably doesn't belong here, but I'll leave it in for the moment.
// Unpacks the three low bytes of a packed 32-bit color into x, y and z,
// scaling each byte from [0, 255] to [0, 1]:
//   bits 0-7 -> x, bits 8-15 -> y, bits 16-23 -> z.
// The top byte (bits 24-31) is ignored and w is left unchanged.
void SetColor(u32 color)
{
x = (color & 0xff) / 255.0f;
y = ((color >> 8) & 0xff) / 255.0f;
z = ((color >> 16) & 0xff) / 255.0f;
}
// 3 dim cross product, w is not touched
/// this = this x v
/// this = u x v
inline float4 operator-() const { float4 v; v.x = -x; v.y = -y; v.z = -z; v.w = -w; return v; }
inline float4 operator+(const float4 &r) const { float4 v; v.x = x + r.x; v.y = y + r.y; v.z = z + r.z; v.w = w + r.w; return v; }
inline float4 operator-(const float4 &r) const { float4 v; v.x = x - r.x; v.y = y - r.y; v.z = z - r.z; v.w = w - r.w; return v; }
inline float4 operator*(const float4 &r) const { float4 v; v.x = r.x * x; v.y = r.y * y; v.z = r.z * z; v.w = r.w * w; return v; }
inline float4 operator*(dReal k) const { float4 v; v.x = k * x; v.y = k * y; v.z = k * z; v.w = k * w; return v; }
inline float4& operator += (const float4& r) { x += r.x; y += r.y; z += r.z; w += r.w; return *this; }
inline float4& operator -= (const float4& r) { x -= r.x; y -= r.y; z -= r.z; w -= r.w; return *this; }
inline float4& operator *= (const float4& r) { x *= r.x; y *= r.y; z *= r.z; w *= r.w; return *this; }
inline float4& operator *= (const dReal k) { x *= k; y *= k; z *= k; w *= k; return *this; }
inline float4& operator /= (const dReal _k) { dReal k = 1 / _k; x *= k; y *= k; z *= k; w *= k; return *this; }
friend float4 operator*(float f, const float4& v);
//friend ostream& operator<<(ostream& O, const float4& v);
//friend istream& operator>>(istream& I, float4& v);
};
inline float4 operator*(float f, const float4& left)
typedef Vector4<float> float4;
#else
// Reimplement, swiping a bunch of code from GSdx and adapting it. (specifically GSVector.h)
// This doesn't include more then half of the functions in there, as well as some of the structs...
#include <xmmintrin.h>
#include "Pcsx2Types.h"
class float4
{
float4 v;
v.x = f * left.x;
v.y = f * left.y;
v.z = f * left.z;
return v;
public:
union
{
struct {float x, y, z, w;};
struct {float r, g, b, a;};
struct {float left, top, right, bottom;};
float v[4];
float f32[4];
s8 _s8[16];
s16 _s16[8];
s32 _s32[4];
s64 _s64[2];
u8 _u8[16];
u16 _u16[8];
u32 _u32[4];
u64 _u64[2];
__m128 m;
};
float4()
{
m = _mm_setzero_ps();
}
#endif // ZZOGLMATH_H_INCLUDED
float4(float x, float y, float z, float w = 0)
{
m = _mm_set_ps(w, z, y, x);
}
float4(float4 &f)
{
m = f.m;
}
float4(float x, float y)
{
m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y));
}
float4(int x, int y)
{
m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y)));
}
explicit float4(float f)
{
m = _mm_set1_ps(f);
}
explicit float4(__m128 m)
{
this->m = m;
}
float4(float* f)
{
x = f[0];
y = f[1];
z = f[2];
w = f[3]; // For some reason, the old code set this to 0.
}
// Indexed access to the components: 0 -> x, 1 -> y, 2 -> z, 3 -> w.
// Any other index is a programming error and trips the assert.
float& operator[](int i)
{
	switch(i)
	{
		case 0: return x;
		case 1: return y;
		case 2: return z;
		case 3: return w;
		default:
			assert(0);
			// When asserts are compiled out (NDEBUG) the function must still
			// return a valid reference; flowing off the end would be undefined behavior.
			return w;
	}
}
operator float*()
{
return (float*) this;
}
operator const float*() const
{
return (const float*) this;
}
void operator = (float f)
{
m = _mm_set1_ps(f);
}
void operator = (__m128 m)
{
this->m = m;
}
void operator += (const float4& v)
{
m = _mm_add_ps(m, v.m);
}
void operator -= (const float4& v)
{
m = _mm_sub_ps(m, v.m);
}
void operator *= (const float4& v)
{
m = _mm_mul_ps(m, v.m);
}
void operator /= (const float4& v)
{
m = _mm_div_ps(m, v.m);
}
void operator += (float f)
{
*this += float4(f);
}
void operator -= (float f)
{
*this -= float4(f);
}
void operator *= (float f)
{
*this *= float4(f);
}
void operator /= (float f)
{
*this /= float4(f);
}
void operator &= (const float4& v)
{
m = _mm_and_ps(m, v.m);
}
void operator |= (const float4& v)
{
m = _mm_or_ps(m, v.m);
}
void operator ^= (const float4& v)
{
m = _mm_xor_ps(m, v.m);
}
friend float4 operator + (const float4& v1, const float4& v2)
{
return float4(_mm_add_ps(v1.m, v2.m));
}
friend float4 operator - (const float4& v1, const float4& v2)
{
return float4(_mm_sub_ps(v1.m, v2.m));
}
friend float4 operator * (const float4& v1, const float4& v2)
{
return float4(_mm_mul_ps(v1.m, v2.m));
}
friend float4 operator / (const float4& v1, const float4& v2)
{
return float4(_mm_div_ps(v1.m, v2.m));
}
friend float4 operator + (const float4& v, float f)
{
return v + float4(f);
}
friend float4 operator - (const float4& v, float f)
{
return v - float4(f);
}
friend float4 operator * (const float4& v, float f)
{
return v * float4(f);
}
friend float4 operator / (const float4& v, float f)
{
return v / float4(f);
}
friend float4 operator & (const float4& v1, const float4& v2)
{
return float4(_mm_and_ps(v1.m, v2.m));
}
friend float4 operator | (const float4& v1, const float4& v2)
{
return float4(_mm_or_ps(v1.m, v2.m));
}
friend float4 operator ^ (const float4& v1, const float4& v2)
{
return float4(_mm_xor_ps(v1.m, v2.m));
}
friend float4 operator == (const float4& v1, const float4& v2)
{
return float4(_mm_cmpeq_ps(v1.m, v2.m));
}
friend float4 operator != (const float4& v1, const float4& v2)
{
return float4(_mm_cmpneq_ps(v1.m, v2.m));
}
friend float4 operator > (const float4& v1, const float4& v2)
{
return float4(_mm_cmpgt_ps(v1.m, v2.m));
}
friend float4 operator < (const float4& v1, const float4& v2)
{
return float4(_mm_cmplt_ps(v1.m, v2.m));
}
friend float4 operator >= (const float4& v1, const float4& v2)
{
return float4(_mm_cmpge_ps(v1.m, v2.m));
}
friend float4 operator <= (const float4& v1, const float4& v2)
{
return float4(_mm_cmple_ps(v1.m, v2.m));
}
// This looked interesting, so I thought I'd include it...
template<int i> float4 shuffle() const
{
return float4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(i, i, i, i)));
}
// Generators for the swizzle accessors (xxxx(), xyzw(), wzyx(), ...).
// Each VECTOR4_SHUFFLE_n fixes one more component and expands the next level for the
// four possible choices of the following component; VECTOR4_SHUFFLE_4 finally emits
// the two member functions (shuffle of this, and shuffle of this with v).
// Note: the last line of each macro deliberately has no trailing backslash, so the
// definition ends there regardless of what follows it.
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
	float4 xs##ys##zs##ws() const {return float4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
	float4 xs##ys##zs##ws(const float4& v) const {return float4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));}

#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
	VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
	VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
	VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
	VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3)

#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
	VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
	VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
	VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
	VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3)

// Also emits the broadcast accessors x4(), y4(), z4(), w4().
#define VECTOR4_SHUFFLE_1(xs, xn) \
	float4 xs##4() const {return float4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
	float4 xs##4(const float4& v) const {return float4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
	VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
	VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
	VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
	VECTOR4_SHUFFLE_2(xs, xn, w, 3)

VECTOR4_SHUFFLE_1(x, 0)
VECTOR4_SHUFFLE_1(y, 1)
VECTOR4_SHUFFLE_1(z, 2)
VECTOR4_SHUFFLE_1(w, 3)
// Probably doesn't belong here, but I'll leave it in for the moment.
void SetColor(u32 color)
{
x = (color & 0xff) / 255.0f;
y = ((color >> 8) & 0xff) / 255.0f;
z = ((color >> 16) & 0xff) / 255.0f;
}
};
#endif
#endif

View File

@ -392,16 +392,16 @@ ZeroGS::SaveTex(tex0Info* ptex, int usevid)
assert(pmemtarg != NULL);
glBindTexture(GL_TEXTURE_RECTANGLE_NV, pmemtarg->ptex->tex);
srcdata.resize(pmemtarg->realheight * GPU_TEXWIDTH * pmemtarg->widthmult * 4 * 8); // max of 8 cannels
srcdata.resize(4 * pmemtarg->texW * pmemtarg->texH);
glGetTexImage(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA, pmemtarg->fmt, &srcdata[0]);
u32 offset = pmemtarg->realy * 4 * GPU_TEXWIDTH;
u32 offset = MemorySize(pmemtarg->realy);
if (ptex->psm == PSMT8)
offset *= PSMT_IS32BIT(ptex->cpsm) ? 4 : 2;
offset *= CLUT_PIXEL_SIZE(ptex->cpsm);
else if (ptex->psm == PSMT4)
offset *= PSMT_IS32BIT(ptex->cpsm) ? 8 : 4;
offset *= CLUT_PIXEL_SIZE(ptex->cpsm) * 2;
psrc = &srcdata[0] - offset;
}

File diff suppressed because it is too large Load Diff

View File

@ -51,6 +51,8 @@ class CRenderTargetMngr
void Destroy();
static MAPTARGETS::iterator GetOldestTarg(MAPTARGETS& m);
bool isFound(const frameInfo& frame, MAPTARGETS::iterator& it, u32 opts, u32 key, int maxposheight);
CRenderTarget* GetTarg(const frameInfo& frame, u32 Options, int maxposheight);
inline CRenderTarget* GetTarg(int fbp, int fbw, VB& curvb)
{
@ -119,13 +121,13 @@ class CRenderTargetMngr
class CMemoryTargetMngr
{
public:
CMemoryTargetMngr() : curstamp(0) {}
CMemoryTarget* GetMemoryTarget(const tex0Info& tex0, int forcevalidate); // pcbp is pointer to start of clut
CMemoryTarget* MemoryTarget_SearchExistTarget(int start, int end, int nClutOffset, int clutsize, const tex0Info& tex0, int forcevalidate);
CMemoryTarget* MemoryTarget_ClearedTargetsSearch(int fmt, int widthmult, int channels, int height);
CMemoryTarget* SearchExistTarget(int start, int end, int nClutOffset, int clutsize, const tex0Info& tex0, int forcevalidate);
CMemoryTarget* ClearedTargetsSearch(int fmt, int widthmult, int channels, int height);
int CompareTarget(list<CMemoryTarget>::iterator& it, const tex0Info& tex0, int clutsize, int nClutOffset);
void Destroy(); // destroy all targs
@ -138,6 +140,8 @@ class CMemoryTargetMngr
private:
list<CMemoryTarget>::iterator DestroyTargetIter(list<CMemoryTarget>::iterator& it);
void GetClutVariables(int& nClutOffset, int& clutsize, const tex0Info& tex0);
void GetMemAddress(int& start, int& end, const tex0Info& tex0);
};
class CBitwiseTextureMngr

View File

@ -20,6 +20,11 @@
#ifdef ZEROGS_SSE2
// SSE2 extensions
// Note: pshufd 0xea <=> movdqa !!!
// What the function does is
// Interleave s1 and sd0 -> d1 (high) & sd0 (low)
// Interleave s3 and sd2 -> d3 (high) & sd2 (low)
#define punpck(op, sd0, sd2, s1, s3, d1, d3) \
movdqa %xmm##d1, %xmm##sd0; \
pshufd %xmm##d3, %xmm##sd2, 0xe4; \
@ -29,6 +34,15 @@
punpckh##op %xmm##d3, %xmm##s3; \
// Input xmm7 == 0x0F0F0F0F 0x0F0F0F0F 0x0F0F0F0F 0x0F0F0F0F
// DATA xmm[0-3]
// This function does a 4-bits interleaving of 4 xmm registers
//
// ARG Can not put comment in the middle of the define...
// After the first por
// low 32bits (4bits packed) == 1.6 0.6 1.4 0.4 1.2 0.2 1.0 0.0
// After the second one
// low 32bits (4bits packed) == 1.7 0.7 1.5 0.5 1.3 0.3 1.1 0.1
#define punpcknb \
movdqa %xmm4, %xmm0; \
pshufd %xmm5, %xmm1, 0xe4; \
@ -48,6 +62,7 @@
\
movdqa %xmm1, %xmm4; \
\
\
movdqa %xmm4, %xmm2; \
pshufd %xmm5, %xmm3, 0xe4; \
\
@ -68,6 +83,12 @@
\
punpck(bw, 0, 2, 1, 3, 4, 6);\
// output
// low 32 bits 0 (4 bits packed) == 1.3 0.3 1.2 0.2 1.1 0.1 1.0 0.0
// low 32 bits 4 (4 bits packed) == 1.19 0.19 1.18 0.18 1.17 0.17 1.16 0.16
// low 32 bits 2 (4 bits packed) == 3.3 2.3 3.2 2.2 3.1 2.1 3.0 2.0
// low 32 bits 6 (4 bits packed) == 3.19 2.19 3.18 2.18 3.17 2.17 3.16 2.16
//
// swizzling
@ -84,11 +105,15 @@ SwizzleBlock32_sse2:
push %esi
push %edi
// save dst
mov %edi, %ecx
// save src
mov %esi, %edx
// get pitch
mov %edx, [%esp+4+8]
mov %ecx, 4
// get WriteMask
mov %eax, [%esp+8+8]
cmp %eax, 0xffffffff
jne SwizzleBlock32_sse2_2
@ -100,6 +125,8 @@ SwizzleBlock32_sse2_1:
movdqa %xmm1, [%esi+%edx]
movdqa %xmm5, [%esi+%edx+16]
// 64bits interleave 1&0 -> 2&0
// 64bits interleave 5&4 -> 6&4
punpck(qdq, 0, 4, 1, 5, 2, 6)
movntps [%edi+16*0], %xmm0
@ -107,6 +134,7 @@ SwizzleBlock32_sse2_1:
movntps [%edi+16*2], %xmm4
movntps [%edi+16*3], %xmm6
// update ptr
lea %esi, [%esi+%edx*2]
add %edi, 64
@ -120,6 +148,7 @@ SwizzleBlock32_sse2_1:
SwizzleBlock32_sse2_2:
// WriteMask: 32bits to 4*32bits
movd %xmm7, %eax
pshufd %xmm7, %xmm7, 0
@ -130,13 +159,19 @@ SwizzleBlock32_sse2_3:
movdqa %xmm1, [%esi+%edx]
movdqa %xmm5, [%esi+%edx+16]
// 64bits interleave 1&0 -> 2&0
// 64bits interleave 5&4 -> 6&4
punpck(qdq, 0, 4, 1, 5, 2, 6)
// save a mask copy
movdqa %xmm3, %xmm7
pshufd %xmm5, %xmm7, 0xe4
// *dst & ~WriteMask
pandn %xmm3, [%edi+16*0]
// *src & WriteMask
pand %xmm0, %xmm7
// Final value to save
por %xmm0, %xmm3
movntps [%edi+16*0], %xmm0
@ -158,6 +193,7 @@ SwizzleBlock32_sse2_3:
por %xmm6, %xmm5
movntps [%edi+16*3], %xmm6
// update ptr
lea %esi, [%esi+%edx*2]
add %edi, 64
@ -179,6 +215,7 @@ SwizzleBlock16_sse2:
push %ebx
// srcpitch
mov %ebx, [%esp+4+4]
mov %eax, 4
@ -189,7 +226,11 @@ SwizzleBlock16_sse2_1:
movdqa %xmm2, [%edx+%ebx]
movdqa %xmm3, [%edx+%ebx+16]
// 16bits interleave 1&0 -> 4&0
// 16bits interleave 3&2 -> 6&2
punpck(wd, 0, 2, 1, 3, 4, 6)
// 64bits interleave 2&0 -> 1&0
// 64bits interleave 6&4 -> 5&4
punpck(qdq, 0, 4, 2, 6, 1, 5)
movntps [%ecx+16*0], %xmm0
@ -197,6 +238,7 @@ SwizzleBlock16_sse2_1:
movntps [%ecx+16*2], %xmm4
movntps [%ecx+16*3], %xmm5
// update ptr
lea %edx, [%edx+%ebx*2]
add %ecx, 64
@ -217,7 +259,9 @@ SwizzleBlock8_sse2:
push %ebx
// load srcpitch
mov %ebx, [%esp+4+4]
// basic counter
mov %eax, 2
.align 16
@ -226,14 +270,23 @@ SwizzleBlock8_sse2_1:
movdqa %xmm0, [%edx]
movdqa %xmm2, [%edx+%ebx]
// update src pointer
lea %edx, [%edx+%ebx*2]
// 2 3 0 1
pshufd %xmm1, [%edx], 0xb1
pshufd %xmm3, [%edx+%ebx], 0xb1
// update src pointer
lea %edx, [%edx+%ebx*2]
// 8bits interleave 1&0 -> 4&0
// 8bits interleave 3&2 -> 6&2
punpck(bw, 0, 2, 1, 3, 4, 6)
// 16bits interleave 4&0 -> 1&0
// 16bits interleave 6&2 -> 3&2
punpck(wd, 0, 2, 4, 6, 1, 3)
// 64bits interleave 2&0 -> 4&0
// 64bits interleave 3&1 -> 5&1
punpck(qdq, 0, 1, 2, 3, 4, 5)
movntps [%ecx+16*0], %xmm0
@ -241,18 +294,27 @@ SwizzleBlock8_sse2_1:
movntps [%ecx+16*2], %xmm1
movntps [%ecx+16*3], %xmm5
// col 1, 3
// col 1, 3 (same as previous column)
// 2 3 0 1
pshufd %xmm0, [%edx], 0xb1
pshufd %xmm2, [%edx+%ebx], 0xb1
// update src pointer
lea %edx, [%edx+%ebx*2]
movdqa %xmm1, [%edx]
movdqa %xmm3, [%edx+%ebx]
// update src pointer
lea %edx, [%edx+%ebx*2]
// 8bits interleave 1&0 -> 4&0
// 8bits interleave 3&2 -> 6&2
punpck(bw, 0, 2, 1, 3, 4, 6)
// 16bits interleave 4&0 -> 1&0
// 16bits interleave 6&2 -> 3&2
punpck(wd, 0, 2, 4, 6, 1, 3)
// 64bits interleave 2&0 -> 4&0
// 64bits interleave 3&1 -> 5&1
punpck(qdq, 0, 1, 2, 3, 4, 5)
movntps [%ecx+16*4], %xmm0
@ -260,6 +322,7 @@ SwizzleBlock8_sse2_1:
movntps [%ecx+16*6], %xmm1
movntps [%ecx+16*7], %xmm5
// update dst pointer
add %ecx, 128
dec %eax
@ -279,10 +342,12 @@ SwizzleBlock4_sse2:
push %ebx
// load 4 0x0F0F0F0F
mov %eax, 0xf0f0f0f
movd %xmm7, %eax
pshufd %xmm7, %xmm7, 0
// load srcpitch
mov %ebx, [%esp+4+4]
mov %eax, 2
@ -292,20 +357,32 @@ SwizzleBlock4_sse2_1:
movdqa %xmm0, [%edx]
movdqa %xmm2, [%edx+%ebx]
//update src pointer
lea %edx, [%edx+%ebx*2]
movdqa %xmm1, [%edx]
movdqa %xmm3, [%edx+%ebx]
// update src pointer
lea %edx, [%edx+%ebx*2]
// - - - - 2 3 0 1
pshuflw %xmm1, %xmm1, 0xb1
pshuflw %xmm3, %xmm3, 0xb1
// 6 7 4 5 - - - -
pshufhw %xmm1, %xmm1, 0xb1
pshufhw %xmm3, %xmm3, 0xb1
// 4bits interleave 1&0 -> 4&0
// 4bits interleave 3&2 -> 6&2
punpcknb
// 8bits interleave 4&0 -> 1&0
// 8bits interleave 6&2 -> 3&2
punpck(bw, 0, 2, 4, 6, 1, 3)
// 8bits interleave 1&0 -> 4&0
// 8bits interleave 3&2 -> 6&2
punpck(bw, 0, 2, 1, 3, 4, 6)
// 64bits interleave 2&0 -> 1&0
// 64bits interleave 6&4 -> 3&4
punpck(qdq, 0, 4, 2, 6, 1, 3)
movntps [%ecx+16*0], %xmm0
@ -313,7 +390,7 @@ SwizzleBlock4_sse2_1:
movntps [%ecx+16*2], %xmm4
movntps [%ecx+16*3], %xmm3
// col 1, 3
// col 1, 3 (same as previous column)
movdqa %xmm0, [%edx]
movdqa %xmm2, [%edx+%ebx]
@ -349,6 +426,9 @@ SwizzleBlock4_sse2_1:
//
// swizzling with unaligned reads
// Same functions as above, with movdqu instead of movdqa for the reads.
// Movdqu is as fast as movdqa when the address is aligned, so do not bother
// and use movdqu directly.
//
//

View File

@ -22,7 +22,6 @@
#include "x86.h"
#if defined(ZEROGS_SSE2)
#include <xmmintrin.h>
#include <emmintrin.h>
#endif
@ -64,23 +63,17 @@ void __fastcall FrameSwizzleBlock32A2_c(u32* dst, u32* src, int srcpitch, u32 Wr
{
u32* d = &g_columnTable32[0][0];
if( WriteMask == 0xffffffff )
{
for(int i = 0; i < 8; ++i, d += 8)
{
for(int j = 0; j < 8; ++j)
{
if( WriteMask == 0xffffffff ) {
for(int i = 0; i < 8; ++i, d += 8) {
for(int j = 0; j < 8; ++j) {
dst[d[j]] = ((src[2*j] + src[2*j+1]) >> 1);
}
src += srcpitch;
}
}
else
{
for(int i = 0; i < 8; ++i, d += 8)
{
for(int j = 0; j < 8; ++j)
{
else {
for(int i = 0; i < 8; ++i, d += 8) {
for(int j = 0; j < 8; ++j) {
dst[d[j]] = (((src[2*j] + src[2*j+1]) >> 1)&WriteMask)|(dst[d[j]]&~WriteMask);
}
src += srcpitch;
@ -92,23 +85,17 @@ void __fastcall FrameSwizzleBlock32A4_c(u32* dst, u32* src, int srcpitch, u32 Wr
{
u32* d = &g_columnTable32[0][0];
if( WriteMask == 0xffffffff )
{
for(int i = 0; i < 8; ++i, d += 8)
{
for(int j = 0; j < 8; ++j)
{
if( WriteMask == 0xffffffff ) {
for(int i = 0; i < 8; ++i, d += 8) {
for(int j = 0; j < 8; ++j) {
dst[d[j]] = ((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2);
}
src += srcpitch << 1;
}
}
else
{
for(int i = 0; i < 8; ++i, d += 8)
{
for(int j = 0; j < 8; ++j)
{
else {
for(int i = 0; i < 8; ++i, d += 8) {
for(int j = 0; j < 8; ++j) {
dst[d[j]] = (((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2)&WriteMask)|(dst[d[j]]&~WriteMask);
}
src += srcpitch << 1;
@ -663,6 +650,120 @@ static const __aligned16 int s_clut16mask[8] = { 0xffff0000, 0xffff0000, 0xffff0
extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
{
#define YET_ANOTHER_INTRINSIC
#ifdef YET_ANOTHER_INTRINSIC
__m128i vm0 = _mm_load_si128((__m128i*)vm);
__m128i vm1 = _mm_load_si128((__m128i*)vm+1);
__m128i vm2 = _mm_load_si128((__m128i*)vm+2);
__m128i vm3 = _mm_load_si128((__m128i*)vm+3);
// rearrange 16bits words
vm0 = _mm_shufflehi_epi16(vm0, 0x88);
vm0 = _mm_shufflelo_epi16(vm0, 0x88); // 6 4 6 4 2 0 2 0
vm1 = _mm_shufflehi_epi16(vm1, 0x88);
vm1 = _mm_shufflelo_epi16(vm1, 0x88); // 14 12 14 12 10 8 10 8
// Note: MSVC complains about direct c-cast...
// vm0 = (__m128i)_mm_shuffle_ps((__m128)vm0, (__m128)vm1, 0x88); // 14 12 10 8 6 4 2 0
__m128 vm0_f = (_mm_shuffle_ps((__m128&)vm0, (__m128&)vm1, 0x88)); // 14 12 10 8 6 4 2 0
vm0 = (__m128i&)vm0_f;
vm0 = _mm_shuffle_epi32(vm0, 0xD8); // 14 12 6 4 10 8 2 0
// *** Same jobs for vm2 and vm3
vm2 = _mm_shufflehi_epi16(vm2, 0x88);
vm2 = _mm_shufflelo_epi16(vm2, 0x88);
vm3 = _mm_shufflehi_epi16(vm3, 0x88);
vm3 = _mm_shufflelo_epi16(vm3, 0x88);
// Note: MSVC complains about direct c-cast...
// vm2 = (__m128i)_mm_shuffle_ps((__m128)vm2, (__m128)vm3, 0x88);
__m128 vm2_f = (_mm_shuffle_ps((__m128&)vm2, (__m128&)vm3, 0x88)); // 14 12 10 8 6 4 2 0
vm2 = (__m128i&)vm2_f;
vm2 = _mm_shuffle_epi32(vm2, 0xD8);
// Create a zero register.
__m128i zero_128 = _mm_setzero_si128();
if ((u32)clut & 0x0F) {
// Unaligned write.
u16* clut_word_ptr = (u16*)clut;
__m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask2);
// Load previous data and clear high 16 bits of double words
__m128i clut_0 = _mm_load_si128((__m128i*)(clut_word_ptr-1)); // 6 5 4 3 2 1 0 x
__m128i clut_2 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+2); // 22 21 20 19 18 17 16 15
clut_0 = _mm_and_si128(clut_0, clut_mask); // - 5 - 3 - 1 - x
clut_2 = _mm_and_si128(clut_2, clut_mask); // - 21 - 19 - 17 - 15
// Convert 16 bits to 32 bits vm0 (zero extended)
__m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128); // - 10 - 8 - 2 - 0
__m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12 - 6 - 4
// shift the value to align it with clut
vm0_low = _mm_slli_epi32(vm0_low, 16); // 10 - 8 - 2 - 0 -
vm0_high = _mm_slli_epi32(vm0_high, 16); // 14 - 12 - 6 - 4 -
// Interlace old and new data
clut_0 = _mm_or_si128(clut_0, vm0_low); // 10 5 8 3 2 1 0 x
clut_2 = _mm_or_si128(clut_2, vm0_high); // 14 21 12 19 6 17 4 15
// Save the result
_mm_store_si128((__m128i*)(clut_word_ptr-1), clut_0);
_mm_store_si128((__m128i*)(clut_word_ptr-1)+2, clut_2);
// *** Same jobs for clut_1 and clut_3
__m128i clut_1 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+1);
__m128i clut_3 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+3);
clut_1 = _mm_and_si128(clut_1, clut_mask);
clut_3 = _mm_and_si128(clut_3, clut_mask);
__m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
__m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);
vm2_low = _mm_slli_epi32(vm2_low, 16);
vm2_high = _mm_slli_epi32(vm2_high, 16);
clut_1 = _mm_or_si128(clut_1, vm2_low);
clut_3 = _mm_or_si128(clut_3, vm2_high);
_mm_store_si128((__m128i*)(clut_word_ptr-1)+1, clut_1);
_mm_store_si128((__m128i*)(clut_word_ptr-1)+3, clut_3);
} else {
// Standard write
__m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask);
// Load previous data and clear low 16 bits of double words
__m128i clut_0 = _mm_and_si128(_mm_load_si128((__m128i*)clut), clut_mask); // 7 - 5 - 3 - 1 -
__m128i clut_2 = _mm_and_si128(_mm_load_si128((__m128i*)clut+2), clut_mask); // 23 - 21 - 19 - 17 -
// Convert 16 bits to 32 bits vm0 (zero extended)
__m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128); // - 10 - 8 - 2 - 0
__m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12 - 6 - 4
// Interlace old and new data
clut_0 = _mm_or_si128(clut_0, vm0_low); // 7 10 5 8 3 2 1 0
clut_2 = _mm_or_si128(clut_2, vm0_high); // 23 14 21 12 19 6 17 4
// Save the result
_mm_store_si128((__m128i*)clut, clut_0);
_mm_store_si128((__m128i*)clut+2, clut_2);
// *** Same jobs for clut_1 and clut_3
__m128i clut_1 = _mm_and_si128(_mm_load_si128((__m128i*)clut+1), clut_mask);
__m128i clut_3 = _mm_and_si128(_mm_load_si128((__m128i*)clut+3), clut_mask);
__m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
__m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);
clut_1 = _mm_or_si128(clut_1, vm2_low);
clut_3 = _mm_or_si128(clut_3, vm2_high);
_mm_store_si128((__m128i*)clut+1, clut_1);
_mm_store_si128((__m128i*)clut+3, clut_3);
}
#else
#if defined(_MSC_VER)
__asm
{
@ -893,6 +994,7 @@ End:
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
);
#endif // _MSC_VER
#endif
}
#endif // ZEROGS_SSE2
@ -1115,3 +1217,4 @@ Z16Loop:
);
#endif // _MSC_VER
}

View File

@ -32,6 +32,9 @@
#include "targets.h"
#include "GLWin.h"
#include "ZZoglShaders.h"
#ifdef ZEROGS_SSE2
#include <emmintrin.h>
#endif
//----------------------- Defines
@ -95,7 +98,6 @@ namespace ZeroGS
// float4 g_vdepth = float4( 65536.0f*65536.0f, 256.0f*65536.0f, 65536.0f, 256.0f);
extern CRangeManager s_RangeMngr; // manages overwritten memory
GLenum GetRenderTargetFormat() { return GetRenderFormat() == RFT_byte8 ? 4 : g_internalRGBAFloat16Fmt; }
// returns the first and last addresses aligned to a page that cover
void GetRectMemAddress(int& start, int& end, int psm, int x, int y, int w, int h, int bp, int bw);
@ -541,7 +543,7 @@ __forceinline void MOVFOG(VertexGPU *p, Vertex gsf)
int Values[100] = {0, };
void SET_VERTEX(VertexGPU *p, int Index, const VB& curvb)
inline void SET_VERTEX(VertexGPU *p, int Index, const VB& curvb)
{
int index = Index;
p->x = ((((int)gs.gsvertex[index].x - curvb.offset.x) >> 1) & 0xffff);
@ -852,6 +854,55 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp)
bool bRet = false;
// FIXME: the code generated by the intrinsics is the same as the Linux asm.
// However, there is no equivalent of "cmp %%esi, 0x90" in the Windows asm!
// So the control flow must be checked.
#define TEST_THIS
#ifdef TEST_THIS
// Walk the data 16 entries at a time, comparing 16 dwords (64 bytes) of src
// against dst per iteration. bRet is set as soon as one difference is found.
while(entries != 0) {
#ifdef ZEROGS_SSE2
	// Each 32-bit lane of a compare result is all-ones when the lanes are equal.
	// AND-ing the four results leaves all-ones only if the whole 64 bytes match.
	__m128i result = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src), _mm_load_si128((__m128i*)dst));
	__m128i result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+1), _mm_load_si128((__m128i*)dst+1));
	result = _mm_and_si128(result, result_tmp);

	result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+2), _mm_load_si128((__m128i*)dst+2));
	result = _mm_and_si128(result, result_tmp);

	result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+3), _mm_load_si128((__m128i*)dst+3));
	result = _mm_and_si128(result, result_tmp);

	// _mm_movemask_epi8 collects one bit per byte, i.e. 16 bits for a 128-bit
	// register, so "everything equal" is 0xFFFF. Comparing against 0xFF would
	// report every block as dirty.
	u32 result_int = _mm_movemask_epi8(result);
	if ((result_int & 0xFFFF) != 0xFFFF) {
		bRet = true;
		break;
	}
#else
	// I see no point to keep an mmx version. SSE2 versions is probably faster.
	// Keep a slow portable C version for reference/debug
	for (int i=0; i < 16 ; i++) {
		if (*((u32*)src+i) != *((u32*)dst+i)) {
			bRet = true;
			break;
		}
	}
	// The break above only leaves the for loop; stop scanning once a
	// difference has been found, exactly like the SSE2 path does.
	if (bRet) {
		break;
	}
#endif

	if (entries & 0x10) {
		src -= 56; // go back and down one column
	}
	src += 32; // go to the right block

	if (entries == 0x90) {
		src += 32; // skip whole block
	}

	dst += 8;
	entries -= 16;
}
#else
// do a fast test with MMX
#ifdef _MSC_VER
int storeebx;
@ -978,6 +1029,7 @@ Return:
".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "S"(entries) : "eax", "memory");
#endif // _WIN32
#endif
return bRet;
}

View File

@ -29,6 +29,7 @@
#include <vector>
#include <map>
#include <string>
#include <math.h>
#include "ZZGl.h"
#include "GS.h"
@ -100,12 +101,6 @@ namespace ZeroGS
typedef void (*DrawFn)();
enum RenderFormatType
{
RFT_byte8 = 0, // A8R8G8B8
RFT_float16 = 1, // A32R32B32G32
};
// managers render-to-texture targets
class CRenderTarget
@ -237,6 +232,8 @@ class CMemoryTarget
clearminy = r.clearminy;
clearmaxy = r.clearmaxy;
widthmult = r.widthmult;
texH = r.texH;
texW = r.texW;
channels = r.channels;
validatecount = r.validatecount;
fmt = r.fmt;
@ -267,13 +264,19 @@ class CMemoryTarget
int starty, height; // assert(starty >= realy)
int realy, realheight; // this is never touched once allocated
// realy is start pointer of data in 4M data block (start) and size (end-start).
u32 usedstamp;
u8 psm, cpsm; // texture and clut format. For psm, only 16bit/32bit differentiation matters
u32 fmt;
int widthmult;
int channels;
int widthmult; // Either 1 or 2.
int channels; // The number of pixels per PSM format word. channels == PIXELS_PER_WORD(psm)
// This is the real drawing size in pixels of the texture in renderbuffer.
int texW; // (realheight + widthmult - 1)/widthmult == realheight or [(realheight+1)/2]
int texH; // GPU_TEXWIDTH *widthmult * channels;
int clearminy, clearmaxy; // when maxy > 0, need to check for clearing
int validatecount; // count how many times has been validated, if too many, destroy
@ -415,7 +418,6 @@ extern float fiTexWidth[2], fiTexHeight[2]; // current tex width and height
extern vector<GLuint> g_vboBuffers; // VBOs for all drawing commands
extern GLuint vboRect;
extern int g_nCurVBOIndex;
extern RenderFormatType g_RenderFormatType;
void AddMessage(const char* pstr, u32 ms = 5000);
void DrawText(const char* pstr, int left, int top, u32 color);
@ -479,8 +481,6 @@ bool CheckChangeInClut(u32 highdword, u32 psm); // returns true if clut will cha
// call to load CLUT data (depending on CLD)
void texClutWrite(int ctx);
RenderFormatType GetRenderFormat();
GLenum GetRenderTargetFormat();
int Save(s8* pbydata);
bool Load(s8* pbydata);
@ -523,7 +523,25 @@ inline void CluttingForFlushedTex(tex0Info* tex0, u32 Data, int ictx)
tex0->cld = ZZOglGet_cld_TexBits(Data);
ZeroGS::texClutWrite(ictx);
}
};
// Size in bytes of x "strings" of texture data, computed as 4 * GPU_TEXWIDTH * x.
// NOTE(review): "string" presumably means one row of GPU_TEXWIDTH 32-bit words - confirm.
inline int MemorySize(int x)
{
	const int sizeInBytes = 4 * GPU_TEXWIDTH * x;
	return sizeInBytes;
}
// Address inside g_pbyGSMemory at which the data for string x starts,
// i.e. g_pbyGSMemory advanced by MemorySize(x) bytes.
inline u8* MemoryAddress(int x)
{
	const int byteOffset = MemorySize(x);
	return g_pbyGSMemory + byteOffset;
}
// Same idea as MemoryAddress, but the byte stride per unit of x is the
// compile-time constant mult instead of MemorySize(1).
template <u32 mult>
inline u8* _MemoryAddress(int x)
{
	return &g_pbyGSMemory[mult * x];
}
};
#endif