diff --git a/plugins/GSdx/GPURenderer.h b/plugins/GSdx/GPURenderer.h index 1a7b505ced..16568a0da2 100644 --- a/plugins/GSdx/GPURenderer.h +++ b/plugins/GSdx/GPURenderer.h @@ -123,13 +123,13 @@ protected: int maxcount = std::max(m_maxcount * 3 / 2, 10000); Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 32); - if (!vertices) + if(vertices == NULL) { printf("GSdx: failed to allocate %d bytes for verticles.\n", sizeof(Vertex) * maxcount); throw GSDXError(); } - if (m_vertices != NULL) + if(m_vertices != NULL) { memcpy(vertices, m_vertices, sizeof(Vertex) * m_maxcount); _aligned_free(m_vertices); diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp index 2f23f0903f..5d91813659 100644 --- a/plugins/GSdx/GS.cpp +++ b/plugins/GSdx/GS.cpp @@ -37,6 +37,7 @@ #include "GSWndDX.h" #include "GSWndWGL.h" #include "GSRendererCS.h" +#include "GSRendererCL.h" #include "GSSettingsDlg.h" static HRESULT s_hr = E_FAIL; @@ -203,6 +204,7 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1) } GSWnd* wnd[2]; + try { if(s_renderer != renderer) @@ -216,78 +218,72 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1) s_gs = NULL; } - if(renderer == 15) + switch(renderer) { - #ifdef _WINDOWS - - dev = new GSDevice11(); - - if(dev == NULL) - { - return -1; - } - - delete s_gs; - - s_gs = new GSRendererCS(); - - s_renderer = renderer; - - #endif + default: +#ifdef _WINDOWS + case 0: case 1: case 2: case 14: + dev = new GSDevice9(); + break; + case 3: case 4: case 5: case 15: + dev = new GSDevice11(); + break; +#endif + case 9: case 10: case 11: case 16: + dev = new GSDeviceNull(); + break; + case 12: case 13: case 17: + dev = new GSDeviceOGL(); + break; } - else + + if(dev == NULL) { - switch(renderer / 3) + return -1; + } + + if(s_gs == NULL) + { + switch(renderer) { default: - #ifdef _WINDOWS - case 0: dev = new GSDevice9(); break; - case 1: dev = new GSDevice11(); break; - #endif - case 3: dev = new GSDeviceNull(); break; - case 4: dev = new GSDeviceOGL(); break; - } - - if(dev == NULL) - { - return -1; - } - - if(s_gs == NULL) - { - switch(renderer % 3) - { - default: - case 0: - switch(renderer) - { - default: #ifdef _WINDOWS - case 0: s_gs = (GSRenderer*)new GSRendererDX9(); break; - case 3: s_gs = (GSRenderer*)new GSRendererDX11(); break; + case 0: + s_gs = (GSRenderer*)new GSRendererDX9(); + break; + case 3: + s_gs = (GSRenderer*)new GSRendererDX11(); + break; #endif - case 12: s_gs = (GSRenderer*)new GSRendererOGL(); break; - } - break; - case 1: - s_gs = new GSRendererSW(threads); - break; - case 2: - s_gs = new GSRendererNull(); - break; - } - - s_renderer = renderer; + case 12: + s_gs = (GSRenderer*)new GSRendererOGL(); + break; + case 1: case 4: case 10: case 13: + s_gs = new GSRendererSW(threads); + break; + case 2: case 5: case 11: + s_gs = new GSRendererNull(); + break; + case 14: case 15: case 16: case 17: + s_gs = new GSRendererCL(); + break; } + + s_renderer = renderer; } if (s_gs->m_wnd == NULL) { #ifdef _WINDOWS - if (renderer / 3 == 4) + switch(renderer) + { + case 12: case 13: case 17: s_gs->m_wnd = new GSWndWGL(); - else + break; + default: s_gs->m_wnd = new GSWndDX(); + break; + } #else #ifdef ENABLE_GLES wnd[0] = NULL; @@ -681,8 +677,10 @@ EXPORT_C GSkeyEvent(GSKeyEventData* e) { try { - if (gsopen_done) + if(gsopen_done) + { s_gs->KeyEvent(e); + } } catch (GSDXRecoverableError) { @@ -1218,15 +1216,11 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow { ::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS); - FILE* file = fopen("c:\\temp1\\log.txt", "a"); - - fprintf(file, "-------------------------\n\n"); + Console console("GSdx", true); if(1) { - GSLocalMemory * pMem = new GSLocalMemory(); - GSLocalMemory& mem(*pMem); - + GSLocalMemory* mem = new GSLocalMemory(); static struct {int psm; const char* name;} s_format[] = { @@ -1258,7 +1252,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow int w = 1 << tbw; int h = 1 << tbw; - fprintf(file, "%d x %d\n\n", w, h); + printf("%d x %d\n\n", w, h); for(size_t i = 0; i < countof(s_format); i++) { @@ -1308,7 +1302,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow clock_t start, end; - _ftprintf(file, _T("[%4s] "), s_format[i].name); + printf("[%4s] ", s_format[i].name); start = clock(); @@ -1317,12 +1311,12 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow int x = 0; int y = 0; - (mem.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); + (mem->*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); } end = clock(); - fprintf(file, "%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); + printf("%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); start = clock(); @@ -1331,25 +1325,25 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow int x = 0; int y = 0; - (mem.*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); + (mem->*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); } end = clock(); - fprintf(file, "%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); + printf("%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); - const GSOffset* o = mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM); + const GSOffset* o = mem->GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM); start = clock(); for(int j = 0; j < n; j++) { - (mem.*rtx)(o, r, ptr, w * 4, TEXA); + (mem->*rtx)(o, r, ptr, w * 4, TEXA); } end = clock(); - fprintf(file, "%6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); + printf("%6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); if(psm.pal > 0) { @@ -1357,32 +1351,30 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow for(int j = 0; j < n; j++) { - (mem.*rtxP)(o, r, ptr, w, TEXA); + (mem->*rtxP)(o, r, ptr, w, TEXA); } end = clock(); - fprintf(file, "| %6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); + printf("| %6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); } - fprintf(file, "\n"); - - fflush(file); + printf("\n"); } - fprintf(file, "\n"); + printf("\n"); } _aligned_free(ptr); - delete pMem; + + delete mem; } // if(0) { - GSLocalMemory * pMem2 = new GSLocalMemory(); - GSLocalMemory& mem2(*pMem2); + GSLocalMemory* mem = new GSLocalMemory(); uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32); @@ -1413,13 +1405,13 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow int x = 0; int y = 0; - (mem2.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); - delete pMem2; + (mem->*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); + + delete mem; } // - fclose(file); PostQuitMessage(0); } diff --git a/plugins/GSdx/GSDevice9.h b/plugins/GSdx/GSDevice9.h index fd32304c70..e7cb160135 100644 --- a/plugins/GSdx/GSDevice9.h +++ b/plugins/GSdx/GSDevice9.h @@ -173,7 +173,6 @@ public: // TODO // Shaders... hash_map m_vs; - D3DXHANDLE m_vs_params; hash_map > m_ps; hash_map m_ps_ss; hash_map m_om_dss; diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index 10cb2433f5..757463d134 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -692,14 +692,14 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr { switch(psm) { - case PSM_PSMCT32: WriteColumn32(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break; - case PSM_PSMCT16: WriteColumn16(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMCT16S: WriteColumn16(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMT8: WriteColumn8(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break; - case PSM_PSMT4: WriteColumn4(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break; - case PSM_PSMZ32: WriteColumn32(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break; - case PSM_PSMZ16: WriteColumn16(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMZ16S: WriteColumn16(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMCT32: GSBlock::WriteColumn32(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break; + case PSM_PSMCT16: GSBlock::WriteColumn16(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMCT16S: GSBlock::WriteColumn16(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMT8: GSBlock::WriteColumn8(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break; + case PSM_PSMT4: GSBlock::WriteColumn4(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break; + case PSM_PSMZ32: GSBlock::WriteColumn32(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break; + case PSM_PSMZ16: GSBlock::WriteColumn16(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMZ16S: GSBlock::WriteColumn16(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break; // TODO default: __assume(0); } @@ -719,14 +719,14 @@ void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src { switch(psm) { - case PSM_PSMCT32: WriteBlock32(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break; - case PSM_PSMCT16: WriteBlock16(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMCT16S: WriteBlock16(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMT8: WriteBlock8(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break; - case PSM_PSMT4: WriteBlock4(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break; - case PSM_PSMZ32: WriteBlock32(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break; - case PSM_PSMZ16: WriteBlock16(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMZ16S: WriteBlock16(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMCT32: GSBlock::WriteBlock32(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break; + case PSM_PSMCT16: GSBlock::WriteBlock16(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMCT16S: GSBlock::WriteBlock16(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMT8: GSBlock::WriteBlock8(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break; + case PSM_PSMT4: GSBlock::WriteBlock4(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break; + case PSM_PSMZ32: GSBlock::WriteBlock32(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break; + case PSM_PSMZ16: GSBlock::WriteBlock16(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMZ16S: GSBlock::WriteBlock16(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break; // TODO default: __assume(0); } @@ -801,27 +801,27 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* { case PSM_PSMCT32: case PSM_PSMZ32: - ReadColumn32(y, dst, buff, 32); + GSBlock::ReadColumn32(y, dst, buff, 32); memcpy(&buff[32], &src[x * 4], 32); - WriteColumn32<32, 0xffffffff>(y, dst, buff, 32); + GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32); break; case PSM_PSMCT16: case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: - ReadColumn16(y, dst, buff, 32); + GSBlock::ReadColumn16(y, dst, buff, 32); memcpy(&buff[32], &src[x * 2], 32); - WriteColumn16<32>(y, dst, buff, 32); + GSBlock::WriteColumn16<32>(y, dst, buff, 32); break; case PSM_PSMT8: - ReadColumn8(y, dst, buff, 16); + GSBlock::ReadColumn8(y, dst, buff, 16); for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + x], 16); - WriteColumn8<32>(y, dst, buff, 16); + GSBlock::WriteColumn8<32>(y, dst, buff, 16); break; case PSM_PSMT4: - ReadColumn4(y, dst, buff, 16); + GSBlock::ReadColumn4(y, dst, buff, 16); for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + (x >> 1)], 16); - WriteColumn4<32>(y, dst, buff, 16); + GSBlock::WriteColumn4<32>(y, dst, buff, 16); break; // TODO default: @@ -888,27 +888,27 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* { case PSM_PSMCT32: case PSM_PSMZ32: - ReadColumn32(y, dst, buff, 32); + GSBlock::ReadColumn32(y, dst, buff, 32); memcpy(&buff[0], &src[x * 4], 32); - WriteColumn32<32, 0xffffffff>(y, dst, buff, 32); + GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32); break; case PSM_PSMCT16: case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: - ReadColumn16(y, dst, buff, 32); + GSBlock::ReadColumn16(y, dst, buff, 32); memcpy(&buff[0], &src[x * 2], 32); - WriteColumn16<32>(y, dst, buff, 32); + GSBlock::WriteColumn16<32>(y, dst, buff, 32); break; case PSM_PSMT8: - ReadColumn8(y, dst, buff, 16); + GSBlock::ReadColumn8(y, dst, buff, 16); for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + x], 16); - WriteColumn8<32>(y, dst, buff, 16); + GSBlock::WriteColumn8<32>(y, dst, buff, 16); break; case PSM_PSMT4: - ReadColumn4(y, dst, buff, 16); + GSBlock::ReadColumn4(y, dst, buff, 16); for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + (x >> 1)], 16); - WriteColumn4<32>(y, dst, buff, 16); + GSBlock::WriteColumn4<32>(y, dst, buff, 16); break; // TODO default: @@ -1060,7 +1060,7 @@ void GSLocalMemory::WriteImage24(int& tx, int& ty, const uint8* src, int len, GI { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32(x, y, bp, bw)); } } @@ -1094,7 +1094,7 @@ void GSLocalMemory::WriteImage8H(int& tx, int& ty, const uint8* src, int len, GI { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock8H(src + (x - tx), srcpitch, BlockPtr32(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock8H(src + (x - tx), srcpitch, BlockPtr32(x, y, bp, bw)); } } @@ -1128,7 +1128,7 @@ void GSLocalMemory::WriteImage4HL(int& tx, int& ty, const uint8* src, int len, G { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw)); } } @@ -1162,7 +1162,7 @@ void GSLocalMemory::WriteImage4HH(int& tx, int& ty, const uint8* src, int len, G { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw)); } } @@ -1196,7 +1196,7 @@ void GSLocalMemory::WriteImage24Z(int& tx, int& ty, const uint8* src, int len, G { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32Z(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32Z(x, y, bp, bw)); } } @@ -1612,7 +1612,7 @@ void GSLocalMemory::ReadTexture32(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadBlock32(src, dst, dstpitch); + GSBlock::ReadBlock32(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1623,7 +1623,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock24(src, dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock24(src, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -1631,7 +1631,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock24(src, dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock24(src, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -1643,7 +1643,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 16, 8, 32) { - ReadAndExpandBlock16(src, dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock16(src, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -1651,7 +1651,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 16, 8, 32) { - ReadAndExpandBlock16(src, dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock16(src, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -1663,7 +1663,7 @@ void GSLocalMemory::ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r FOREACH_BLOCK_START(r, 16, 16, 32) { - ReadAndExpandBlock8_32(src, dst, dstpitch, pal); + GSBlock::ReadAndExpandBlock8_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1674,7 +1674,7 @@ void GSLocalMemory::ReadTexture4(const GSOffset* RESTRICT o, const GSVector4i& r FOREACH_BLOCK_START(r, 32, 16, 32) { - ReadAndExpandBlock4_32(src, dst, dstpitch, pal); + GSBlock::ReadAndExpandBlock4_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1685,7 +1685,7 @@ void GSLocalMemory::ReadTexture8H(const GSOffset* RESTRICT o, const GSVector4i& FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock8H_32(src, dst, dstpitch, pal); + GSBlock::ReadAndExpandBlock8H_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1696,7 +1696,7 @@ void GSLocalMemory::ReadTexture4HL(const GSOffset* RESTRICT o, const GSVector4i& FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock4HL_32(src, dst, dstpitch, pal); + GSBlock::ReadAndExpandBlock4HL_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1707,7 +1707,7 @@ void GSLocalMemory::ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i& FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock4HH_32(src, dst, dstpitch, pal); + GSBlock::ReadAndExpandBlock4HH_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1718,7 +1718,7 @@ void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, cons { ALIGN_STACK(32); - ReadBlock32(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock32(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const @@ -1727,11 +1727,11 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons if(TEXA.AEM) { - ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); } else { - ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); } } @@ -1741,11 +1741,11 @@ void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, cons if(TEXA.AEM) { - ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); } else { - ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); } } @@ -1753,35 +1753,35 @@ void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const { ALIGN_STACK(32); - ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut); } void GSLocalMemory::ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut); } void GSLocalMemory::ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut); } void GSLocalMemory::ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut); } void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut); } /////////////////// @@ -1870,7 +1870,7 @@ void GSLocalMemory::ReadTexture8P(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 16, 16, 8) { - ReadBlock8(src, dst, dstpitch); + GSBlock::ReadBlock8(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1879,7 +1879,7 @@ void GSLocalMemory::ReadTexture4P(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 32, 16, 8) { - ReadBlock4P(src, dst, dstpitch); + GSBlock::ReadBlock4P(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1888,7 +1888,7 @@ void GSLocalMemory::ReadTexture8HP(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 8, 8, 8) { - ReadBlock8HP(src, dst, dstpitch); + GSBlock::ReadBlock8HP(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1897,7 +1897,7 @@ void GSLocalMemory::ReadTexture4HLP(const GSOffset* RESTRICT o, const GSVector4i { FOREACH_BLOCK_START(r, 8, 8, 8) { - ReadBlock4HLP(src, dst, dstpitch); + GSBlock::ReadBlock4HLP(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1906,7 +1906,7 @@ void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT o, const GSVector4i { FOREACH_BLOCK_START(r, 8, 8, 8) { - ReadBlock4HHP(src, dst, dstpitch); + GSBlock::ReadBlock4HHP(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1915,35 +1915,35 @@ void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT o, const GSVector4i void GSLocalMemory::ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { - ReadBlock8(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock8(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadBlock4P(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock4P(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadBlock8HP(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock8HP(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadBlock4HLP(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock4HLP(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadBlock4HHP(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock4HHP(BlockPtr(bp), dst, dstpitch); } // diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h index 67f2cf0c27..ea83bfd53b 100644 --- a/plugins/GSdx/GSLocalMemory.h +++ b/plugins/GSdx/GSLocalMemory.h @@ -76,7 +76,7 @@ struct GSPixelOffset4 uint32 fbp, zbp, fpsm, zpsm, bw; }; -class GSLocalMemory : public GSBlock +class GSLocalMemory : public GSAlignedClass<32> { public: typedef uint32 (*pixelAddress)(int x, int y, uint32 bp, uint32 bw); diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp new file mode 100644 index 0000000000..357a2a3e23 --- /dev/null +++ b/plugins/GSdx/GSRendererCL.cpp @@ -0,0 +1,1780 @@ +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSRendererCL.h" + +#define LOG 0 + +static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; + +#define MAX_FRAME_SIZE 2048 +#define MAX_PRIM_COUNT 4096u +#define MAX_PRIM_PER_BATCH_BITS 5 +#define MAX_PRIM_PER_BATCH (1u << MAX_PRIM_PER_BATCH_BITS) +#define BATCH_COUNT(prim_count) (((prim_count) + (MAX_PRIM_PER_BATCH - 1)) / MAX_PRIM_PER_BATCH) +#define MAX_BATCH_COUNT BATCH_COUNT(MAX_PRIM_COUNT) +#define BIN_SIZE_BITS 4 +#define BIN_SIZE (1u << BIN_SIZE_BITS) +#define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE)) +#define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT) + +#if MAX_PRIM_PER_BATCH == 64u +#define BIN_TYPE cl_ulong +#elif MAX_PRIM_PER_BATCH == 32u +#define BIN_TYPE cl_uint +#else +#error "MAX_PRIM_PER_BATCH != 32u OR 64u" +#endif + +#pragma pack(push, 1) + +typedef struct +{ + GSVertexCL v[4]; +} gs_prim; + +typedef struct +{ + cl_float4 dx, dy; + cl_float4 zero; + cl_float4 reject_corner; +} gs_barycentric; + +typedef struct +{ + cl_uint batch_counter; + cl_uint _pad[7]; + struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH]; + BIN_TYPE bin[MAX_BIN_COUNT]; + cl_uchar4 bbox[MAX_PRIM_COUNT]; + gs_prim prim[MAX_PRIM_COUNT]; + gs_barycentric barycentric[MAX_PRIM_COUNT]; +} gs_env; + +#pragma pack(pop) + +GSRendererCL::GSRendererCL() + : m_vb_count(0) +{ + m_nativeres = true; // ignore ini, sw is always native + + //s_dump = 1; + //s_save = 1; + //s_savez = 1; + + // TODO: m_tc = new GSTextureCacheCL(this); + + memset(m_texture, 0, sizeof(m_texture)); + + m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); + + memset(m_rw_pages, 0, sizeof(m_rw_pages)); + memset(m_tex_pages, 0, sizeof(m_tex_pages)); + + #define InitCVB(P) \ + m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer; \ + m_cvb[P][0][1] = &GSRendererCL::ConvertVertexBuffer; \ + m_cvb[P][1][0] = &GSRendererCL::ConvertVertexBuffer; \ + m_cvb[P][1][1] = &GSRendererCL::ConvertVertexBuffer; \ + + InitCVB(GS_POINT_CLASS); + InitCVB(GS_LINE_CLASS); + InitCVB(GS_TRIANGLE_CLASS); + InitCVB(GS_SPRITE_CLASS); + + m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL); + m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize); +} + +GSRendererCL::~GSRendererCL() +{ + // TODO: delete m_tc; + + for(size_t i = 0; i < countof(m_texture); i++) + { + delete m_texture[i]; + } + + _aligned_free(m_output); +} + +void GSRendererCL::Reset() +{ + Sync(-1); + + // TODO: m_tc->RemoveAll(); + + GSRenderer::Reset(); +} + +void GSRendererCL::VSync(int field) +{ + Sync(0); // IncAge might delete a cached texture in use + + GSRenderer::VSync(field); + + // TODO: m_tc->IncAge(); + + //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); +} + +void GSRendererCL::ResetDevice() +{ + for(size_t i = 0; i < countof(m_texture); i++) + { + delete m_texture[i]; + + m_texture[i] = NULL; + } +} + +GSTexture* GSRendererCL::GetOutput(int i) +{ + Sync(1); + + const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB; + + int w = DISPFB.FBW * 64; + int h = GetFrameRect(i).bottom; + + // TODO: round up bottom + + if(m_dev->ResizeTexture(&m_texture[i], w, h)) + { + static int pitch = 1024 * 4; + + GSVector4i r(0, 0, w, h); + + const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM]; + + (m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign(psm.bs), m_output, pitch, m_env.TEXA); + + m_texture[i]->Update(r, m_output, pitch); + + if(s_dump) + { + if(s_save && s_n >= s_saven) + { + m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM)); + } + + s_n++; + } + } + + return m_texture[i]; +} + +const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 1.0f); + +template +void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count) +{ + GSVector4i o = (GSVector4i)m_context->XYOFFSET; + GSVector4 st_scale = GSVector4(16 << m_context->TEX0.TW, 16 << m_context->TEX0.TH, 1, 0); + + for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++) + { + GSVector4 stcq = GSVector4::load(&src->m[0]); // s t rgba q + + #if _M_SSE >= 0x401 + + GSVector4i xyzuvf(src->m[1]); + + GSVector4i xy = xyzuvf.upl16() - o; + GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00()); + + #else + + uint32 z = src->XYZ.Z; + + GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - o; + GSVector4i zf = GSVector4i((int)std::min(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later + + #endif + + dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale; + + GSVector4 t = GSVector4::zero(); + + if(tme) + { + if(fst) + { + #if _M_SSE >= 0x401 + + t = GSVector4(xyzuvf.uph16()); + + #else + + t = GSVector4(GSVector4i::load(src->UV).upl16()); + + #endif + } + else + { + t = stcq.xyww() * st_scale; + } + } + + dst->t = t.insert32<2, 3>(stcq); + } +} + +void GSRendererCL::Draw() +{ + const GSDrawingContext* context = m_context; + + GSVector4i scissor = GSVector4i(context->scissor.in); + GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); + + // points and lines may have zero area bbox (example: single line 0,0->256,0) + + if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS) + { + if(bbox.x == bbox.z) bbox.z++; + if(bbox.y == bbox.w) bbox.w++; + } + + scissor.z = std::min(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour + + GSVector4i rect = bbox.rintersect(scissor); + + if(rect.rempty()) + { + return; + } + + if(s_dump) + { + Sync(2); + + uint64 frame = m_perfmon.GetFrame(); + + std::string s; + + if(s_save && s_n >= s_saven && PRIM->TME) + { + s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM); + + m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH); + } + + s_n++; + + if(s_save && s_n >= s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM); + + m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + } + + if(s_savez && s_n >= s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM); + + m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + } + + s_n++; + } + + try + { + size_t vb_size = m_vertex.next * sizeof(GSVertexCL); + size_t ib_size = m_index.tail * sizeof(uint32); + size_t pb_size = sizeof(TFXParameter); + + if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size) + { + if(vb_size > m_cl.vb.size || ib_size > m_cl.ib.size) + { + // buffer too small for even one batch, allow twice the size (at least 1 MB) + + Sync(2); // must sync, reallocating the input buffers + + m_cl.Unmap(); + + m_cl.vb.size = 0; + m_cl.ib.size = 0; + + size_t size = std::max(vb_size * 2, 2u << 20); + + printf("growing vertex/index buffer %d\n", size); + + m_cl.vb.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); + m_cl.vb.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); + m_cl.vb.size = size; + + size = std::max(size / sizeof(GSVertex) * 3 * sizeof(uint32), 1u << 20); // worst case, three times the vertex count + + ASSERT(size >= ib_size); + + if(size < ib_size) size = ib_size; // should not happen + + m_cl.ib.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); + m_cl.ib.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); + m_cl.ib.size = size; + } + else + { + Enqueue(); + + m_cl.Unmap(); + + // make the write queue wait until the rendering queue is ready, it may still use the device buffers + + std::vector el(1); + + m_cl.queue[2].enqueueMarkerWithWaitList(NULL, &el[0]); + m_cl.wq->enqueueBarrierWithWaitList(&el, NULL); + + // switch to the other queue/buffer (double buffering) + + m_cl.wqidx = (m_cl.wqidx + 1) & 1; + m_cl.wq = &m_cl.queue[m_cl.wqidx]; + } + + m_cl.vb.head = m_cl.vb.tail = 0; + m_cl.ib.head = m_cl.ib.tail = 0; + m_cl.pb.head = m_cl.pb.tail = 0; + + m_cl.Map(); + } + else + { + // only allow batches of the same primclass in Enqueue + + if(!m_jobs.empty() && m_jobs.front().sel.prim != (uint32)m_vt.m_primclass) + { + Enqueue(); + } + } + + // + + GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail); + uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail); + TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail); + + pb->scissor = scissor; + pb->bbox = bbox; + pb->rect = rect; + + (this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(vb, m_vertex.buff, m_vertex.next); // TODO: upload in GSVertex format and extract the fields in the kernel? + + if(m_jobs.empty()) + { + memcpy(ib, m_index.buff, m_index.tail * sizeof(uint32)); + + m_vb_start = m_cl.vb.tail; + } + else + { + // TODO: SIMD + + uint32 vb_count = m_vb_count; + + for(size_t i = 0; i < m_index.tail; i++) + { + ib[i] = m_index.buff[i] + vb_count; + } + } + + m_vb_count += m_vertex.next; + + if(!SetupParameter(pb, vb, m_vertex.next, m_index.buff, m_index.tail)) + { + return; + } + + TFXJob job; + + job.rect.x = rect.x; + job.rect.y = rect.y; + job.rect.z = rect.z; + job.rect.w = rect.w; + job.sel = pb->sel; + job.ib_start = m_cl.ib.tail; + job.ib_count = m_index.tail; + job.pb_start = m_cl.pb.tail; + + m_jobs.push_back(job); + + m_cl.vb.tail += vb_size; + m_cl.ib.tail += ib_size; + m_cl.pb.tail += pb_size; + + // mark pages for writing + + if(pb->sel.fb) + { + uint8 flag = pb->sel.fb; + + const uint32* pages = m_context->offset.fb->GetPages(rect, m_tmp_pages); + + for(const uint32* p = pages; *p != GSOffset::EOP; p++) + { + m_rw_pages[*p] |= flag; + } + } + + if(pb->sel.zb) + { + uint8 flag = pb->sel.zb; + + const uint32* pages = m_context->offset.zb->GetPages(rect, m_tmp_pages); + + for(const uint32* p = pages; *p != GSOffset::EOP; p++) + { + m_rw_pages[*p] |= flag; + } + } + + // don't buffer too much data, feed them to the device if there is enough + + if(m_cl.vb.tail - m_cl.vb.head >= 256 * 4096 || m_jobs.size() >= 64) + { + Enqueue(); + } + + /* + // check if the texture is not part of a target currently in use + + if(CheckSourcePages(data)) + { + Sync(4); + } + + // addref source and target pages + + data->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm); + */ + + // update previously invalidated parts + + //data->UpdateSource(); + /* + if(LOG) + { + fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n", + sd->counter, + m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite, + m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite, + PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA, + PRIM->PRIM, sd->vertex_count, sd->index_count); + + fflush(s_fp); + } + */ + + //printf("q %p %d (%d %d %d %d)\n", pb, pb->ib_count, r.x, r.y, r.z, r.w); + + /* + // invalidate new parts rendered onto + + if(sd->global.sel.fwrite) + { + m_tc->InvalidatePages(sd->m_fb_pages, sd->m_fpsm); + } + + if(sd->global.sel.zwrite) + { + m_tc->InvalidatePages(sd->m_zb_pages, sd->m_zpsm); + } + */ + } + catch(cl::Error err) + { + printf("%s (%d)\n", err.what(), err.err()); + + return; + } + catch(std::exception err) + { + printf("%s\n", err.what()); + + return; + } + + if(s_dump) + { + Sync(2); + + uint64 frame = m_perfmon.GetFrame(); + + std::string s; + + if(s_save && s_n >= s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM); + + m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + } + + if(s_savez && s_n >= s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM); + + m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + } + + s_n++; + } +} + +void GSRendererCL::Sync(int reason) +{ + //printf("sync %d\n", reason); + + GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync); + + Enqueue(); + + m_cl.queue[2].finish(); + + memset(m_rw_pages, 0, sizeof(m_rw_pages)); + memset(m_tex_pages, 0, sizeof(m_tex_pages)); + + // TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, + // though it does not seem to be necessary even with GPU devices where it might be cached, + // needs more testing... + + //void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize); + //m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr); +} + +void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) +{ + if(LOG) {fprintf(s_fp, "w %05x %d %d, %d %d %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} + + GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); + + o->GetPages(r, m_tmp_pages); + + //if(!synced) + { + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) + { + if(m_rw_pages[*p] & 3) // rw + { + Sync(3); + + break; + } + } + } + + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) + { + m_tex_pages[*p] = 1; + } +} + +void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) +{ + if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} + + //if(!synced) + { + GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); + + o->GetPages(r, m_tmp_pages); + + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) + { + if(m_rw_pages[*p] & 1) // w + { + Sync(4); + + break; + } + } + } +} +/* +bool GSRendererCL::CheckSourcePages(RasterizerData* data) +{ + // TODO: if(!m_rl->IsSynced()) // TODO: all callbacks from the issued drawings reported in => in-sync + { + for(size_t i = 0; data->m_tex[i].t != NULL; i++) + { + data->m_tex[i].t->m_offset->GetPages(data->m_tex[i].r, m_tmp_pages); + + uint32* pages = m_tmp_pages; // data->m_tex[i].t->m_pages.n; + + for(const uint32* p = pages; *p != GSOffset::EOP; p++) + { + // TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2) + + if(m_fzb_pages[*p]) // currently being drawn to? => sync + { + return true; + } + } + } + } + + return false; +} +*/ + +//#include "GSTextureCL.h" + +void GSRendererCL::Enqueue() +{ + if(m_jobs.empty()) return; + + try + { + ASSERT(m_cl.vb.tail > m_cl.vb.head); + ASSERT(m_cl.ib.tail > m_cl.ib.head); + ASSERT(m_cl.pb.tail > m_cl.pb.head); + + int primclass = m_jobs.front().sel.prim; + + uint32 n; + + switch(primclass) + { + case GS_POINT_CLASS: n = 1; break; + case GS_LINE_CLASS: n = 2; break; + case GS_TRIANGLE_CLASS: n = 3; break; + case GS_SPRITE_CLASS: n = 2; break; + default: __assume(0); + } + + PrimSelector psel; + + psel.key = 0; + psel.prim = primclass; + + cl::Kernel& pk = m_cl.GetPrimKernel(psel); + + pk.setArg(1, m_cl.vb.buff[m_cl.wqidx]); + pk.setArg(2, m_cl.ib.buff[m_cl.wqidx]); + + TileSelector tsel; + + tsel.key = 0; + tsel.prim = primclass; + + tsel.mode = 0; + + cl::Kernel& tk_32 = m_cl.GetTileKernel(tsel); + + tsel.mode = 1; + + cl::Kernel& tk_16 = m_cl.GetTileKernel(tsel); + + tsel.mode = 2; + + cl::Kernel& tk_8 = m_cl.GetTileKernel(tsel); + + tsel.mode = 3; + + cl::Kernel& tk = m_cl.GetTileKernel(tsel); + + tsel.key = 0; + tsel.clear = 1; + + cl::Kernel& tk_clear = m_cl.GetTileKernel(tsel); + + // + + m_cl.Unmap(); + + std::vector el2(1); + + m_cl.wq->enqueueMarkerWithWaitList(NULL, &el2[0]); + m_cl.queue[2].enqueueBarrierWithWaitList(&el2, NULL); + + // + + cl_kernel tfx_prev = NULL; + + auto head = m_jobs.begin(); + + while(head != m_jobs.end()) + { + uint32 total_prim_count = 0; + + auto next = head; + + while(next != m_jobs.end()) + { + auto job = next++; + + uint32 cur_prim_count = job->ib_count / n; + uint32 next_prim_count = next != m_jobs.end() ? next->ib_count / n : 0; + + total_prim_count += cur_prim_count; + + if(total_prim_count >= MAX_PRIM_COUNT || next == m_jobs.end())// || next_prim_count >= MAX_PRIM_COUNT || next_prim_count < 16 && total_prim_count >= MAX_PRIM_COUNT / 2) + { + uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT); + + pk.setArg(3, (cl_uint)m_vb_start); + pk.setArg(4, (cl_uint)head->ib_start); + + m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange); + + if(0) + { + gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env)); + m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr); + } + + GSVector4i rect = GSVector4i::zero(); + + for(auto i = head; i != next; i++) + { + rect = rect.runion(GSVector4i::load(&i->rect)); + } + + rect = rect.ralign(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS; + + int bin_w = rect.width(); + int bin_h = rect.height(); + + uint32 batch_count = BATCH_COUNT(prim_count); + uint32 bin_count = bin_w * bin_h; + + cl_uchar4 bin_dim; + + bin_dim.s[0] = (cl_uchar)rect.x; + bin_dim.s[1] = (cl_uchar)rect.y; + bin_dim.s[2] = (cl_uchar)bin_w; + bin_dim.s[3] = (cl_uchar)bin_h; + + if(1)//bin_w > 1 || bin_h > 1) // && not just one sprite covering the whole area + { + m_cl.queue[2].enqueueNDRangeKernel(tk_clear, cl::NullRange, cl::NDRange(bin_count), cl::NullRange); + + if(bin_count <= 32 && m_cl.WIs >= 256) + { + uint32 item_count; + uint32 group_count; + cl::Kernel* k; + + if(bin_count <= 8) + { + item_count = std::min(prim_count, 32u); + group_count = ((prim_count + 31) >> 5) * item_count; + k = &tk_32; + } + else if(bin_count <= 16) + { + item_count = std::min(prim_count, 16u); + group_count = ((prim_count + 15) >> 4) * item_count; + k = &tk_16; + } + else + { + item_count = std::min(prim_count, 8u); + group_count = ((prim_count + 7) >> 3) * item_count; + k = &tk_8; + } + + k->setArg(1, (cl_uint)prim_count); + k->setArg(2, (cl_uint)bin_count); + k->setArg(3, bin_dim); + + m_cl.queue[2].enqueueNDRangeKernel(*k, cl::NullRange, cl::NDRange(bin_w, bin_h, group_count), cl::NDRange(bin_w, bin_h, item_count)); + } + else + { + uint32 item_count = std::min(bin_count, m_cl.WIs); + uint32 group_count = batch_count * item_count; + + tk.setArg(1, (cl_uint)prim_count); + tk.setArg(2, (cl_uint)batch_count); + tk.setArg(3, (cl_uint)bin_count); + tk.setArg(4, bin_dim); + + m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count)); + } + + if(0) + { + gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env)); + m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr); + } + } + + // + + uint32 prim_start = 0; + + for(auto i = head; i != next; i++) + { + ASSERT(prim_start < MAX_PRIM_COUNT); + + uint32 prim_count_inner = std::min(i->ib_count / n, MAX_PRIM_COUNT - prim_start); + + // TODO: update the needed pages of the texture cache buffer with enqueueCopyBuffer (src=this->vm, dst=this->vm_text), + // changed by tfx in the previous loop or marked by InvalidateVideoMem + + // TODO: tile level z test + + cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel); + + if(tfx_prev != tfx()) + { + tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]); + + tfx_prev = tfx(); + } + + tfx.setArg(4, (cl_uint)i->pb_start); + tfx.setArg(5, (cl_uint)prim_start); + tfx.setArg(6, (cl_uint)prim_count_inner); + tfx.setArg(7, (cl_uint)batch_count); + tfx.setArg(8, (cl_uint)bin_count); + tfx.setArg(9, bin_dim); + + //m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NullRange, cl::NDRange(std::min(bin_count * 4, CUs) * 256), cl::NDRange(256)); + + //printf("%d %d %d %d\n", rect.width() << BIN_SIZE_BITS, rect.height() << BIN_SIZE_BITS, i->rect.z - i->rect.x, i->rect.w - i->rect.y); + + GSVector4i r = GSVector4i::load(&i->rect); + + r = r.ralign(GSVector2i(BIN_SIZE, BIN_SIZE)); + /* + if(i->sel.IsSolidRect()) // TODO: simple mem fill + ;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height()); + else + */ + m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16)); + + // TODO: invalidate texture cache pages + + prim_start += prim_count_inner; + } + + // + + if(total_prim_count > MAX_PRIM_COUNT) + { + prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT); + + job->ib_start += prim_count * n * sizeof(uint32); + job->ib_count -= prim_count * n; + + next = job; // try again for the reminder + } + + break; + } + } + + head = next; + } + } + catch(cl::Error err) + { + printf("%s (%d)\n", err.what(), err.err()); + } + + m_jobs.clear(); + + m_vb_count = 0; + + m_cl.vb.head = m_cl.vb.tail; + m_cl.ib.head = m_cl.ib.tail; + m_cl.pb.head = m_cl.pb.tail; + + m_cl.Map(); +} + +static int RemapPSM(int psm) +{ + switch(psm) + { + default: + case PSM_PSMCT32: psm = 0; break; + case PSM_PSMCT24: psm = 1; break; + case PSM_PSMCT16: psm = 2; break; + case PSM_PSMCT16S: psm = 3; break; + case PSM_PSMZ32: psm = 4; break; + case PSM_PSMZ24: psm = 5; break; + case PSM_PSMZ16: psm = 6; break; + case PSM_PSMZ16S: psm = 7; break; + case PSM_PSMT8: psm = 8; break; + case PSM_PSMT4: psm = 9; break; + case PSM_PSMT8H: psm = 10; break; + case PSM_PSMT4HL: psm = 11; break; + case PSM_PSMT4HH: psm = 12; break; + } + + return psm; +} + +bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count) +{ + const GSDrawingEnvironment& env = m_env; + const GSDrawingContext* context = m_context; + const GS_PRIM_CLASS primclass = m_vt.m_primclass; + + pb->sel.key = 0; + + pb->sel.atst = ATST_ALWAYS; + pb->sel.tfx = TFX_NONE; + pb->sel.ababcd = 0xff; + pb->sel.prim = primclass; + + uint32 fm = context->FRAME.FBMSK; + uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; + + if(context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER) + { + fm = 0xffffffff; + zm = 0xffffffff; + } + + if(PRIM->TME) + { + if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) + { + m_mem.m_clut.Read32(context->TEX0, env.TEXA); + } + } + + if(context->TEST.ATE) + { + if(!TryAlphaTest(fm, zm)) + { + pb->sel.atst = context->TEST.ATST; + pb->sel.afail = context->TEST.AFAIL; + pb->aref = context->TEST.AREF; + } + } + + bool fwrite; + bool zwrite; + + switch(context->FRAME.PSM) + { + default: + case PSM_PSMCT32: + case PSM_PSMZ32: + fwrite = fm != 0xffffffff; + break; + case PSM_PSMCT24: + case PSM_PSMZ24: + fwrite = (fm & 0x00ffffff) != 0x00ffffff; + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + fwrite = (fm & 0x80f8f8f8) != 0x80f8f8f8; + break; + } + + switch(context->ZBUF.PSM) + { + default: + case PSM_PSMCT32: + case PSM_PSMZ32: + zwrite = zm != 0xffffffff; + break; + case PSM_PSMCT24: + case PSM_PSMZ24: + zwrite = (zm & 0x00ffffff) != 0x00ffffff; + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + zm &= 0x80f8f8f8; + zwrite = (zm & 0x80f8f8f8) != 0x80f8f8f8; + break; + } + + if(!fwrite && !zwrite) return false; + + bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; + bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS; + + pb->sel.fwrite = fwrite; + pb->sel.ftest = ftest; + pb->sel.zwrite = zwrite; + pb->sel.ztest = ztest; + + if(fwrite || ftest) + { + pb->sel.fpsm = RemapPSM(context->FRAME.PSM); + + if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff) + { + pb->sel.iip = PRIM->IIP; + } + + if(PRIM->TME) + { + pb->sel.tfx = context->TEX0.TFX; + pb->sel.tcc = context->TEX0.TCC; + pb->sel.fst = PRIM->FST; + pb->sel.ltf = m_vt.IsLinear(); + pb->sel.tpsm = RemapPSM(context->TEX0.PSM); + pb->sel.aem = m_env.TEXA.AEM; + + pb->tbp[0] = context->TEX0.TBP0; + pb->tbw[0] = context->TEX0.TBW; + pb->ta0 = m_env.TEXA.TA0; + pb->ta1 = m_env.TEXA.TA1; + + if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) + { + pb->sel.tlu = 1; + + memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal); + } + + pb->sel.wms = context->CLAMP.WMS; + pb->sel.wmt = context->CLAMP.WMT; + + if(pb->sel.tfx == TFX_MODULATE && pb->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128))) + { + // modulate does not do anything when vertex color is 0x80 + + pb->sel.tfx = TFX_DECAL; + } + + // TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA); + + // TODO: if(t == NULL) {ASSERT(0); return false;} + + GSVector4i r; + + GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf); + + // TODO: data->SetSource(t, r, 0); + + // TODO: pb->sel.tw = t->m_tw - 3; + + // TODO: store r to current job + + if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0) + { + // TEX1.MMIN + // 000 p + // 001 l + // 010 p round + // 011 p tri + // 100 l round + // 101 l tri + + if(m_vt.m_lod.x > 0) + { + pb->sel.ltf = context->TEX1.MMIN >> 2; + } + else + { + // TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0 + } + + pb->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri + pb->sel.lcm = context->TEX1.LCM; + + int mxl = std::min((int)context->TEX1.MXL, 6) << 16; + int k = context->TEX1.K << 12; + + if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL) + { + k = (int)m_vt.m_lod.x << 16; // set lod to max level + + pb->sel.lcm = 1; // lod is constant + pb->sel.mmin = 1; // tri-linear is meaningless + } + + if(pb->sel.mmin == 2) + { + mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?) + } + + if(pb->sel.fst) + { + ASSERT(pb->sel.lcm == 1); + ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu) + + pb->sel.lcm = 1; + } + + if(pb->sel.lcm) + { + int lod = std::max(std::min(k, mxl), 0); + + if(pb->sel.mmin == 1) + { + lod = (lod + 0x8000) & 0xffff0000; // rounding + } + + pb->lod = lod; + + // TODO: lot to optimize when lod is constant + } + else + { + pb->mxl = mxl; + pb->l = (float)(-0x10000 << context->TEX1.L); + pb->k = (float)k; + } + + GIFRegTEX0 MIP_TEX0 = context->TEX0; + GIFRegCLAMP MIP_CLAMP = context->CLAMP; + + GSVector4 tmin = m_vt.m_min.t; + GSVector4 tmax = m_vt.m_max.t; + + static int s_counter = 0; + + for(int i = 1, j = std::min((int)context->TEX1.MXL, 6); i <= j; i++) + { + switch(i) + { + case 1: + MIP_TEX0.TBP0 = context->MIPTBP1.TBP1; + MIP_TEX0.TBW = context->MIPTBP1.TBW1; + break; + case 2: + MIP_TEX0.TBP0 = context->MIPTBP1.TBP2; + MIP_TEX0.TBW = context->MIPTBP1.TBW2; + break; + case 3: + MIP_TEX0.TBP0 = context->MIPTBP1.TBP3; + MIP_TEX0.TBW = context->MIPTBP1.TBW3; + break; + case 4: + MIP_TEX0.TBP0 = context->MIPTBP2.TBP4; + MIP_TEX0.TBW = context->MIPTBP2.TBW4; + break; + case 5: + MIP_TEX0.TBP0 = context->MIPTBP2.TBP5; + MIP_TEX0.TBW = context->MIPTBP2.TBW5; + break; + case 6: + MIP_TEX0.TBP0 = context->MIPTBP2.TBP6; + MIP_TEX0.TBW = context->MIPTBP2.TBW6; + break; + default: + __assume(0); + } + + pb->tbp[i] = MIP_TEX0.TBP0; + pb->tbw[i] = MIP_TEX0.TBW; + + if(MIP_TEX0.TW > 0) MIP_TEX0.TW--; + if(MIP_TEX0.TH > 0) MIP_TEX0.TH--; + + MIP_CLAMP.MINU >>= 1; + MIP_CLAMP.MINV >>= 1; + MIP_CLAMP.MAXU >>= 1; + MIP_CLAMP.MAXV >>= 1; + + m_vt.m_min.t *= 0.5f; + m_vt.m_max.t *= 0.5f; + + // TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, pb->sel.tw + 3); + + // TODO: if(t == NULL) {ASSERT(0); return false;} + + GSVector4i r; + + GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf); + + // TODO: data->SetSource(t, r, i); + + // TODO: store r to current job + } + + s_counter++; + + m_vt.m_min.t = tmin; + m_vt.m_max.t = tmax; + } + else + { + if(pb->sel.fst == 0) + { + // skip per pixel division if q is constant + + GSVertexCL* RESTRICT v = vertex; + + if(m_vt.m_eq.q) + { + pb->sel.fst = 1; + + const GSVector4& t = v[index[0]].t; + + if(t.z != 1.0f) + { + GSVector4 w = t.zzzz().rcpnr(); + + for(int i = 0, j = vertex_count; i < j; i++) + { + GSVector4 t = v[i].t; + + v[i].t = (t * w).xyzw(t); + } + } + } + else if(primclass == GS_SPRITE_CLASS) + { + pb->sel.fst = 1; + + for(int i = 0, j = vertex_count; i < j; i += 2) + { + GSVector4 t0 = v[i + 0].t; + GSVector4 t1 = v[i + 1].t; + + GSVector4 w = t1.zzzz().rcpnr(); + + v[i + 0].t = (t0 * w).xyzw(t0); + v[i + 1].t = (t1 * w).xyzw(t1); + } + } + } + + if(pb->sel.ltf && pb->sel.fst) // TODO: quite slow, do this in the prim kernel? + { + // if q is constant we can do the half pel shift for bilinear sampling on the vertices + + // TODO: but not when mipmapping is used!!! + + GSVector4 half(8.0f, 8.0f); + + GSVertexCL* RESTRICT v = vertex; + + for(int i = 0, j = vertex_count; i < j; i++) + { + GSVector4 t = v[i].t; + + v[i].t = (t - half).xyzw(t); + } + } + } + + int tw = 1 << context->TEX0.TW; + int th = 1 << context->TEX0.TH; + + switch(context->CLAMP.WMS) + { + case CLAMP_REPEAT: + pb->minu = tw - 1; + pb->maxu = 0; + //gd.t.mask.u32[0] = 0xffffffff; + break; + case CLAMP_CLAMP: + pb->minu = 0; + pb->maxu = tw - 1; + //gd.t.mask.u32[0] = 0; + break; + case CLAMP_REGION_CLAMP: + pb->minu = std::min((int)context->CLAMP.MINU, tw - 1); + pb->maxu = std::min((int)context->CLAMP.MAXU, tw - 1); + //gd.t.mask.u32[0] = 0; + break; + case CLAMP_REGION_REPEAT: + pb->minu = (int)context->CLAMP.MINU & (tw - 1); + pb->maxu = (int)context->CLAMP.MAXU & (tw - 1); + //gd.t.mask.u32[0] = 0xffffffff; + break; + default: + __assume(0); + } + + switch(context->CLAMP.WMT) + { + case CLAMP_REPEAT: + pb->minv = th - 1; + pb->maxv = 0; + //gd.t.mask.u32[2] = 0xffffffff; + break; + case CLAMP_CLAMP: + pb->minv = 0; + pb->maxv = th - 1; + //gd.t.mask.u32[2] = 0; + break; + case CLAMP_REGION_CLAMP: + pb->minv = std::min((int)context->CLAMP.MINV, th - 1); + pb->maxv = std::min((int)context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256) + //gd.t.mask.u32[2] = 0; + break; + case CLAMP_REGION_REPEAT: + pb->minv = (int)context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127 + pb->maxv = (int)context->CLAMP.MAXV & (th - 1); + //gd.t.mask.u32[2] = 0xffffffff; + break; + default: + __assume(0); + } + } + + if(PRIM->FGE) + { + pb->sel.fge = 1; + pb->fog = env.FOGCOL.u32[0]; + } + + if(context->FRAME.PSM != PSM_PSMCT24) + { + pb->sel.date = context->TEST.DATE; + pb->sel.datm = context->TEST.DATM; + } + + if(!IsOpaque()) + { + pb->sel.abe = PRIM->ABE; + pb->sel.ababcd = context->ALPHA.u32[0]; + + if(env.PABE.PABE) + { + pb->sel.pabe = 1; + } + + if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS)) + { + pb->sel.aa1 = 1; + } + + pb->afix = context->ALPHA.FIX; + } + + if(pb->sel.date + || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1 + || pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY + || (pb->sel.fpsm & 3) == 0 && fwrite && fm != 0 + || (pb->sel.fpsm & 3) == 1 && fwrite // always read-merge-write 24bpp, regardless the mask + || (pb->sel.fpsm & 3) >= 2 && fwrite && (fm & 0x80f8f8f8) != 0) + { + pb->sel.rfb = 1; + } + + pb->sel.colclamp = env.COLCLAMP.CLAMP; + pb->sel.fba = context->FBA.FBA; + + if(env.DTHE.DTHE) + { + pb->sel.dthe = 1; + + GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16); + GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16); + GSVector4i dimx2 = env.dimx[5].sll32(16).sra32(16); + GSVector4i dimx3 = env.dimx[7].sll32(16).sra32(16); + + pb->dimx = dimx0.ps32(dimx1).ps16(dimx2.ps32(dimx3)); + } + } + + if(zwrite || ztest) + { + pb->sel.zpsm = RemapPSM(context->ZBUF.PSM); + pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS; + pb->sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000; + } + + pb->fm = fm; + pb->zm = zm; + + if((pb->sel.fpsm & 3) == 1) + { + pb->fm |= 0xff000000; + } + else if((pb->sel.fpsm & 3) >= 2) + { + uint32 rb = pb->fm & 0x00f800f8; + uint32 ga = pb->fm & 0x8000f800; + + pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000; + } + + if((pb->sel.zpsm & 3) == 1) + { + pb->zm |= 0xff000000; + } + else if((pb->sel.zpsm & 3) >= 2) + { + pb->zm |= 0xffff0000; + } + + if(pb->bbox.eq(pb->bbox.rintersect(pb->scissor))) + { + pb->sel.noscissor = 1; + } + + pb->fbp = context->FRAME.Block(); + pb->zbp = context->ZBUF.Block(); + pb->bw = context->FRAME.FBW; + + return true; +} + +////////// + +//#define IOCL_DEBUG + +GSRendererCL::CL::CL() +{ + WIs = INT_MAX; + + std::vector platforms; + + cl::Platform::get(&platforms); + + for(auto& p : platforms) + { + std::string platform_vendor = p.getInfo(); + + std::vector ds; + + p.getDevices(CL_DEVICE_TYPE_ALL, &ds); + + for(auto& device : ds) + { + std::string vendor = device.getInfo(); + std::string name = device.getInfo(); + std::string version = device.getInfo(); + + printf("%s %s %s", vendor.c_str(), name.c_str(), version.c_str()); + + cl_device_type type = device.getInfo(); + + switch(type) + { + case CL_DEVICE_TYPE_GPU: printf(" GPU"); break; + case CL_DEVICE_TYPE_CPU: printf(" CPU"); break; + } + + if(strstr(version.c_str(), "OpenCL C 1.2") != NULL) + { +#ifdef IOCL_DEBUG + if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL) +#else + //if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Intel") != NULL) + if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Advanced Micro Devices") != NULL) +#endif + { + devices.push_back(device); + + WIs = std::min(WIs, device.getInfo()); + + printf(" *"); + } + } + + printf("\n"); + } + + if(!devices.empty()) break; + } + + if(devices.empty()) + { + throw new std::exception("OpenCL device not found"); + } + + context = cl::Context(devices); + + queue[0] = cl::CommandQueue(context); + queue[1] = cl::CommandQueue(context); + queue[2] = cl::CommandQueue(context); + + vector buff; + + if(theApp.LoadResource(IDR_TFX_CL, buff)) + { + kernel_str = std::string((const char*)buff.data(), buff.size()); + } + + vb.head = vb.tail = vb.size = 0; + ib.head = ib.tail = ib.size = 0; + pb.head = pb.tail = pb.size = 0; + + vb.mapped_ptr = vb.ptr = NULL; + ib.mapped_ptr = ib.ptr = NULL; + pb.mapped_ptr = pb.ptr = NULL; + + pb.size = sizeof(TFXParameter) * 256; + pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); + pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); + + env = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(gs_env)); + + wqidx = 0; + wq = &queue[0]; +} + +GSRendererCL::CL::~CL() +{ + Unmap(); +} + +void GSRendererCL::CL::Map() +{ + Unmap(); + + if(vb.head < vb.size) + { + vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, vb.head, vb.size - vb.head); + vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; + ASSERT(((size_t)vb.ptr & 15) == 0); + ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0); + } + + if(ib.head < ib.size) + { + ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, ib.head, ib.size - ib.head); + ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head; + ASSERT(((size_t)ib.ptr & 15) == 0); + } + + if(pb.head < pb.size) + { + pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pb.head, pb.size - pb.head); + pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; + ASSERT(((size_t)pb.ptr & 15) == 0); + ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0); + } +} + +void GSRendererCL::CL::Unmap() +{ + if(vb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(vb.buff[wqidx], vb.mapped_ptr); + if(ib.mapped_ptr != NULL) wq->enqueueUnmapMemObject(ib.buff[wqidx], ib.mapped_ptr); + if(pb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(pb.buff[wqidx], pb.mapped_ptr); + + vb.mapped_ptr = vb.ptr = NULL; + ib.mapped_ptr = ib.ptr = NULL; + pb.mapped_ptr = pb.ptr = NULL; +} + +static void AddDefs(ostringstream& opt) +{ + opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "; + opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "; + opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "; + opt << "-D MAX_PRIM_PER_BATCH=" << MAX_PRIM_PER_BATCH << "u "; + opt << "-D MAX_BATCH_COUNT=" << MAX_BATCH_COUNT << "u "; + opt << "-D BIN_SIZE_BITS=" << BIN_SIZE_BITS << " "; + opt << "-D BIN_SIZE=" << BIN_SIZE << "u "; + opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u "; + opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u "; +#ifdef IOCL_DEBUG + opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" "; +#endif +} + +cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel) +{ + auto i = prim_map.find(sel); + + if(i != prim_map.end()) + { + return i->second; + } + + char entry[256]; + + sprintf(entry, "prim_%02x", sel); + + cl::Program program = cl::Program(context, kernel_str); + + try + { + ostringstream opt; + + opt << "-D KERNEL_PRIM=" << entry << " "; + opt << "-D PRIM=" << sel.prim << " "; + + AddDefs(opt); + + program.build(opt.str().c_str()); + } + catch(cl::Error err) + { + if(err.err() == CL_BUILD_PROGRAM_FAILURE) + { + for(auto device : devices) + { + auto s = program.getBuildInfo(device); + + printf("kernel (%s) build error: %s\n", entry, s.c_str()); + } + } + + throw err; + } + + cl::Kernel k(program, entry); + + prim_map[sel] = k; + + k.setArg(0, env); + + return prim_map[sel]; +} + +cl::Kernel& GSRendererCL::CL::GetTileKernel(const TileSelector& sel) +{ + auto i = tile_map.find(sel); + + if(i != tile_map.end()) + { + return i->second; + } + + char entry[256]; + + sprintf(entry, "tile_%02x", sel); + + cl::Program program = cl::Program(context, kernel_str); + + try + { + ostringstream opt; + + opt << "-D KERNEL_TILE=" << entry << " "; + opt << "-D PRIM=" << sel.prim << " "; + opt << "-D MODE=" << sel.mode << " "; + opt << "-D CLEAR=" << sel.clear << " "; + + AddDefs(opt); + + program.build(opt.str().c_str()); + } + catch(cl::Error err) + { + if(err.err() == CL_BUILD_PROGRAM_FAILURE) + { + for(auto device : devices) + { + auto s = program.getBuildInfo(device); + + printf("kernel (%s) build error: %s\n", entry, s.c_str()); + } + } + + throw err; + } + + cl::Kernel k(program, entry); + + tile_map[sel] = k; + + k.setArg(0, env); + + return tile_map[sel]; +} + +cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel) +{ + auto i = tfx_map.find(sel); + + if(i != tfx_map.end()) + { + return i->second; + } + + char entry[256]; + + sprintf(entry, "tfx_%016x", sel); + + cl::Program program = cl::Program(context, kernel_str); + + try + { + ostringstream opt; + + opt << "-D KERNEL_TFX=" << entry << " "; + opt << "-D FPSM=" << sel.fpsm << " "; + opt << "-D ZPSM=" << sel.zpsm << " "; + opt << "-D ZTST=" << sel.ztst << " "; + opt << "-D ATST=" << sel.atst << " "; + opt << "-D AFAIL=" << sel.afail << " "; + opt << "-D IIP=" << sel.iip << " "; + opt << "-D TFX=" << sel.tfx << " "; + opt << "-D TCC=" << sel.tcc << " "; + opt << "-D FST=" << sel.fst << " "; + opt << "-D LTF=" << sel.ltf << " "; + opt << "-D TLU=" << sel.tlu << " "; + opt << "-D FGE=" << sel.fge << " "; + opt << "-D DATE=" << sel.date << " "; + opt << "-D ABE=" << sel.abe << " "; + opt << "-D ABA=" << sel.aba << " "; + opt << "-D ABB=" << sel.abb << " "; + opt << "-D ABC=" << sel.abc << " "; + opt << "-D ABD=" << sel.abd << " "; + opt << "-D PABE=" << sel.pabe << " "; + opt << "-D AA1=" << sel.aa1 << " "; + opt << "-D FWRITE=" << sel.fwrite << " "; + opt << "-D FTEST=" << sel.ftest << " "; + opt << "-D RFB=" << sel.rfb << " "; + opt << "-D ZWRITE=" << sel.zwrite << " "; + opt << "-D ZTEST=" << sel.ztest << " "; + opt << "-D ZOVERFLOW=" << sel.zoverflow << " "; + opt << "-D WMS=" << sel.wms << " "; + opt << "-D WMT=" << sel.wmt << " "; + opt << "-D DATM=" << sel.datm << " "; + opt << "-D COLCLAMP=" << sel.colclamp << " "; + opt << "-D FBA=" << sel.fba << " "; + opt << "-D DTHE=" << sel.dthe << " "; + opt << "-D PRIM=" << sel.prim << " "; + opt << "-D TW=" << sel.tw << " "; + opt << "-D LCM=" << sel.lcm << " "; + opt << "-D MMIN=" << sel.mmin << " "; + opt << "-D NOSCISSOR=" << sel.noscissor << " "; + opt << "-D TPSM=" << sel.tpsm << " "; + opt << "-D AEM=" << sel.aem << " "; + opt << "-D FB=" << sel.fb << " "; + opt << "-D ZB=" << sel.zb << " "; + + AddDefs(opt); + + program.build(opt.str().c_str()); + } + catch(cl::Error err) + { + if(err.err() == CL_BUILD_PROGRAM_FAILURE) + { + for(auto device : devices) + { + auto s = program.getBuildInfo(device); + + printf("kernel (%s) build error: %s\n", entry, s.c_str()); + } + } + + throw err; + } + + cl::Kernel k(program, entry); + + tfx_map[sel] = k; + + k.setArg(0, env); + k.setArg(1, vm); + k.setArg(2, tex); + + return tfx_map[sel]; +} diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h new file mode 100644 index 0000000000..3ac008ceaa --- /dev/null +++ b/plugins/GSdx/GSRendererCL.h @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +#include "GSRenderer.h" +//#include "GSTextureCacheCL.h" + +__aligned(struct, 32) GSVertexCL +{ + GSVector4 p, t; +}; + +class GSRendererCL : public GSRenderer +{ + typedef void (GSRendererCL::*ConvertVertexBufferPtr)(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); + + ConvertVertexBufferPtr m_cvb[4][2][2]; + + template + void ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); + + union PrimSelector + { + struct + { + uint32 prim:2; // 0 + }; + + uint32 key; + + operator uint32() const { return key; } + }; + + union TileSelector + { + struct + { + uint32 prim:2; // 0 + uint32 mode:2; // 2 + uint32 clear:1; // 4 + }; + + uint32 key; + + operator uint32() const { return key; } + }; + + union JobSelector + { + struct + { + uint32 dummy:1; // 0 + }; + + uint32 key; + + operator uint32() const { return key; } + }; + + union TFXSelector + { + struct + { + uint32 fpsm:3; // 0 + uint32 zpsm:3; // 3 + uint32 ztst:2; // 6 (0: off, 1: write, 2: test (ge), 3: test (g)) + uint32 atst:3; // 8 + uint32 afail:2; // 11 + uint32 iip:1; // 13 + uint32 tfx:3; // 14 + uint32 tcc:1; // 17 + uint32 fst:1; // 18 + uint32 ltf:1; // 19 + uint32 tlu:1; // 20 + uint32 fge:1; // 21 + uint32 date:1; // 22 + uint32 abe:1; // 23 + uint32 aba:2; // 24 + uint32 abb:2; // 26 + uint32 abc:2; // 28 + uint32 abd:2; // 30 + + uint32 pabe:1; // 32 + uint32 aa1:1; // 33 + uint32 fwrite:1; // 34 + uint32 ftest:1; // 35 + uint32 rfb:1; // 36 + uint32 zwrite:1; // 37 + uint32 ztest:1; // 38 + uint32 zoverflow:1; // 39 (z max >= 0x80000000) + uint32 wms:2; // 40 + uint32 wmt:2; // 42 + uint32 datm:1; // 44 + uint32 colclamp:1; // 45 + uint32 fba:1; // 46 + uint32 dthe:1; // 47 + uint32 prim:2; // 48 + uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3) + uint32 lcm:1; // 53 + uint32 mmin:2; // 54 + uint32 noscissor:1; // 55 + uint32 tpsm:4; // 56 + uint32 aem:1; // 60 + // TODO + }; + + struct + { + uint32 _pad1:24; + uint32 ababcd:8; + uint32 _pad2:2; + uint32 fb:2; + uint32 _pad3:1; + uint32 zb:2; + }; + + struct + { + uint32 lo; + uint32 hi; + }; + + uint64 key; + + operator uint64() const { return key; } + + bool IsSolidRect() const + { + return prim == GS_SPRITE_CLASS + && iip == 0 + && tfx == TFX_NONE + && abe == 0 + && ztst <= 1 + && atst <= 1 + && date == 0 + && fge == 0; + } + }; + + __aligned(struct, 32) TFXParameter + { + GSVector4i scissor; + GSVector4i bbox; + GSVector4i rect; + GSVector4i dimx; // 4x4 signed char + TFXSelector sel; + uint32 fbp, zbp, bw; + uint32 fm, zm; + uint32 fog; // rgb + uint8 aref, afix; + uint8 ta0, ta1; + uint32 tbp[7], tbw[7]; + int minu, maxu, minv, maxv; // umsk, ufix, vmsk, vfix + int lod; // lcm == 1 + int mxl; + float l; // TEX1.L * -0x10000 + float k; // TEX1.K * 0x10000 + uint32 clut[256]; + }; + + struct TFXJob + { + struct {int x, y, z, w;} rect; + TFXSelector sel; + uint32 ib_start, ib_count; + uint32 pb_start; + }; + + class CL + { + std::string kernel_str; + std::map prim_map; + std::map tile_map; + std::map tfx_map; + + public: + std::vector devices; + cl::Context context; + cl::CommandQueue queue[3]; + cl::Buffer vm; + cl::Buffer tex; + struct { cl::Buffer buff[2]; size_t head, tail, size; unsigned char* ptr; void* mapped_ptr; } vb, ib, pb; + cl::Buffer env; + cl::CommandQueue* wq; + int wqidx; + size_t WIs; + + public: + CL(); + virtual ~CL(); + + cl::Kernel& GetPrimKernel(const PrimSelector& sel); + cl::Kernel& GetTileKernel(const TileSelector& sel); + cl::Kernel& GetTFXKernel(const TFXSelector& sel); + + void Map(); + void Unmap(); + }; + + CL m_cl; + std::list m_jobs; + uint32 m_vb_start; + uint32 m_vb_count; + + void Enqueue(); + + /* + class RasterizerData : public GSAlignedClass<32> + { + __aligned(struct, 16) TextureLevel + { + GSVector4i r; + // TODO: GSTextureCacheCL::Texture* t; + }; + + public: + GSRendererCL* m_parent; + const uint32* m_fb_pages; + const uint32* m_zb_pages; + + //cl::Buffer m_vbuff; + //cl::Buffer m_ibuff; + + // TODO: buffers + TextureLevel m_tex[7 + 1]; // NULL terminated + //cl::Buffer m_clut; + //cl::Buffer m_dimx; + + // TODO: struct in a cl::Buffer + TFXSelector m_sel; + GSVector4i m_scissor; + GSVector4i m_bbox; + uint32 m_fm, m_zm; + int m_aref, m_afix; + uint32 m_fog; // rgb + int m_lod; // lcm == 1 + int m_mxl; + float m_l; // TEX1.L * -0x10000 + float m_k; // TEX1.K * 0x10000 + // TODO: struct { GSVector4i min, max, minmax, mask, invmask; } t; // [u] x 4 [v] x 4 + + RasterizerData(GSRendererCL* parent) + : m_parent(parent) + , m_fb_pages(NULL) + , m_zb_pages(NULL) + { + m_sel.key = 0; + } + + virtual ~RasterizerData() + { + // TODO: ReleasePages(); + } + + // TODO: void UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm); + // TODO: void ReleasePages(); + + // TODO: void SetSource(GSTextureCacheCL::Texture* t, const GSVector4i& r, int level); + // TODO: void UpdateSource(); + }; + */ +protected: +// GSTextureCacheCL* m_tc; + GSTexture* m_texture[2]; + uint8* m_output; + + uint8 m_rw_pages[512]; // TODO: bit array for faster clearing (bit 0: write, bit 1: read) + uint8 m_tex_pages[512]; + uint32 m_tmp_pages[512 + 1]; + + void Reset(); + void VSync(int field); + void ResetDevice(); + GSTexture* GetOutput(int i); + + void Draw(); + void Sync(int reason); + void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); + void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false); + + void UsePages(const uint32* pages, int type); + void ReleasePages(const uint32* pages, int type); + + //bool CheckSourcePages(RasterizerData* data); + + bool SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count); + +public: + GSRendererCL(); + virtual ~GSRendererCL(); +}; diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 30d848e4aa..71f42f961d 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -429,6 +429,15 @@ void GSRendererSW::Draw() GSVector4i scissor = GSVector4i(context->scissor.in); GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); + + // points and lines may have zero area bbox (single line: 0, 0 - 256, 0) + + if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS) + { + if(bbox.x == bbox.z) bbox.z++; + if(bbox.y == bbox.w) bbox.w++; + } + GSVector4i r = bbox.rintersect(scissor); scissor.z = std::min(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour @@ -973,7 +982,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.sel.zpsm = 3; gd.sel.atst = ATST_ALWAYS; gd.sel.tfx = TFX_NONE; - gd.sel.ababcd = 255; + gd.sel.ababcd = 0xff; gd.sel.prim = primclass; uint32 fm = context->FRAME.FBMSK; @@ -1101,7 +1110,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri gd.sel.lcm = context->TEX1.LCM; - int mxl = (std::min((int)context->TEX1.MXL, 6) << 16); + int mxl = std::min((int)context->TEX1.MXL, 6) << 16; int k = context->TEX1.K << 12; if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL) diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp index 89656965dd..41d66f24e9 100644 --- a/plugins/GSdx/GSSettingsDlg.cpp +++ b/plugins/GSdx/GSSettingsDlg.cpp @@ -329,16 +329,19 @@ void GSSettingsDlg::UpdateRenderers() { GSSetting r = theApp.m_gs_renderers[i]; - if(i >= 3 && i <= 5) + if(r.id >= 3 && r.id <= 5 || r.id == 15) { if(level < D3D_FEATURE_LEVEL_10_0) continue; - r.name = std::string("Direct3D") + (level >= D3D_FEATURE_LEVEL_11_0 ? "11" : "10"); + r.name += (level >= D3D_FEATURE_LEVEL_11_0 ? "11" : "10"); } renderers.push_back(r); - if (r.id == renderer_setting) + + if(r.id == renderer_setting) + { renderer_sel = renderer_setting; + } } ComboBoxInit(IDC_RENDERER, renderers, renderer_sel); @@ -607,13 +610,13 @@ bool GSHacksDlg::OnMessage(UINT message, WPARAM wParam, LPARAM lParam) break; case IDC_SPRITEHACK: helpstr = "Sprite Hack\n\nHelps getting rid of black inner lines in some filtered sprites." - " Half option is the preferred one. Use it for Mana Khemia or Ar tonelico for example." + " Half option is the preferred one. Use it for Mana Khemia or Ar Tonelico for example." " Full can be used for Tales of Destiny."; break; case IDC_WILDHACK: helpstr = "WildArms\n\nLowers the GS precision to avoid gaps between pixels when" " upscaling. Full option fixes the text on WildArms games, while Half option might improve portraits" - " in Ar tonelico."; + " in Ar Tonelico."; break; case IDC_MSAACB: case IDC_STATIC_MSAA: diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index 604836af03..731b2aa480 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -1551,7 +1551,8 @@ void GSState::Read(uint8* mem, int len) return; } - if (!m_init_read_fifo_supported) { + if(!m_init_read_fifo_supported) + { if(m_tr.x == sx && m_tr.y == sy) { InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h)); @@ -2316,20 +2317,20 @@ void GSState::GrowVertexBuffer() GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 32); uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 32); // worst case is slightly less than vertex number * 3 - if (!vertex || !index) + if(vertex == NULL || index == NULL) { printf("GSdx: failed to allocate %d bytes for verticles and %d for indices.\n", sizeof(GSVertex) * maxcount, sizeof(uint32) * maxcount * 3); throw GSDXError(); } - if (m_vertex.buff != NULL) + if(m_vertex.buff != NULL) { memcpy(vertex, m_vertex.buff, sizeof(GSVertex) * m_vertex.tail); _aligned_free(m_vertex.buff); } - if (m_index.buff != NULL) + if(m_index.buff != NULL) { memcpy(index, m_index.buff, sizeof(uint32) * m_index.tail); diff --git a/plugins/GSdx/GSdx.cpp b/plugins/GSdx/GSdx.cpp index 00fb984260..93d9642fc2 100644 --- a/plugins/GSdx/GSdx.cpp +++ b/plugins/GSdx/GSdx.cpp @@ -41,8 +41,29 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserv return TRUE; } +bool GSdxApp::LoadResource(int id, vector& buff, const char* type) +{ + buff.clear(); + HRSRC hRsrc = FindResource((HMODULE)s_hModule, MAKEINTRESOURCE(id), type != NULL ? type : RT_RCDATA); + if(!hRsrc) return false; + HGLOBAL hGlobal = ::LoadResource((HMODULE)s_hModule, hRsrc); + if(!hGlobal) return false; + DWORD size = SizeofResource((HMODULE)s_hModule, hRsrc); + if(!size) return false; + buff.resize(size); + memcpy(buff.data(), LockResource(hGlobal), size); + return true; +} + #else +bool GSdxApp::LoadResource(int id, vector& buff, const char* type) +{ + buff.clear(); + printf("LoadResource not implemented\n"); + return false; +} + size_t GSdxApp::GetPrivateProfileString(const char* lpAppName, const char* lpKeyName, const char* lpDefault, char* lpReturnedString, size_t nSize, const char* lpFileName) { BuildConfigurationMap(lpFileName); @@ -108,10 +129,12 @@ GSdxApp::GSdxApp() m_gs_renderers.push_back(GSSetting(0, "Direct3D9", "Hardware")); m_gs_renderers.push_back(GSSetting(1, "Direct3D9", "Software")); + m_gs_renderers.push_back(GSSetting(14, "Direct3D9", "OpenCL")); m_gs_renderers.push_back(GSSetting(2, "Direct3D9", "Null")); - m_gs_renderers.push_back(GSSetting(3, "Direct3D%d ", "Hardware")); - m_gs_renderers.push_back(GSSetting(4, "Direct3D%d ", "Software")); - m_gs_renderers.push_back(GSSetting(5, "Direct3D%d ", "Null")); + m_gs_renderers.push_back(GSSetting(3, "Direct3D", "Hardware")); + m_gs_renderers.push_back(GSSetting(4, "Direct3D", "Software")); + m_gs_renderers.push_back(GSSetting(15, "Direct3D", "OpenCL")); + m_gs_renderers.push_back(GSSetting(5, "Direct3D", "Null")); #ifdef _LINUX // note: SDL was removed. We keep those bits for compatibility of the renderer // position in the linux dialog. @@ -119,9 +142,11 @@ GSdxApp::GSdxApp() m_gs_renderers.push_back(GSSetting(8, "SDL 1.3", "Null")); #endif m_gs_renderers.push_back(GSSetting(10, "Null", "Software")); + m_gs_renderers.push_back(GSSetting(16, "Null", "OpenCL")); m_gs_renderers.push_back(GSSetting(11, "Null", "Null")); m_gs_renderers.push_back(GSSetting(12, "OpenGL", "Hardware")); m_gs_renderers.push_back(GSSetting(13, "OpenGL", "Software")); + m_gs_renderers.push_back(GSSetting(17, "OpenGL", "OpenCL")); m_gs_interlace.push_back(GSSetting(0, "None", "")); m_gs_interlace.push_back(GSSetting(1, "Weave tff", "saw-tooth")); diff --git a/plugins/GSdx/GSdx.h b/plugins/GSdx/GSdx.h index f89a3fa761..5ccb1e2c69 100644 --- a/plugins/GSdx/GSdx.h +++ b/plugins/GSdx/GSdx.h @@ -39,6 +39,7 @@ public: #ifdef _WINDOWS HMODULE GetModuleHandle() {return (HMODULE)GetModuleHandlePtr();} #endif + #ifdef _LINUX void BuildConfigurationMap(const char* lpFileName); void ReloadConfig(); @@ -48,6 +49,8 @@ public: int GetPrivateProfileInt(const char* lpAppName, const char* lpKeyName, int nDefault, const char* lpFileName); #endif + bool LoadResource(int id, vector& buff, const char* type = NULL); + string GetConfig(const char* entry, const char* value); void SetConfig(const char* entry, const char* value); int GetConfig(const char* entry, int value); diff --git a/plugins/GSdx/GSdx.rc b/plugins/GSdx/GSdx.rc index 47ccc62aa0..87c0b6164d 100644 --- a/plugins/GSdx/GSdx.rc +++ b/plugins/GSdx/GSdx.rc @@ -51,9 +51,11 @@ BEGIN "#include ""res/tfx.fx""\r\n" "#include ""res/convert.fx""\r\n" "#include ""res/interlace.fx""\r\n" - "#include ""res/merge.fx""\r\0" - "#include ""res/fxaa.fx""\r\0" - "#include ""res/shadeboost.fx""\r\0" + "#include ""res/merge.fx""\r\n" + "#include ""res/fxaa.fx""\r\n" + "#include ""res/cs.fx""\r\n" + "#include ""res/shadeboost.fx""\r\n" + "#include ""res/tfx.cl""\r\n" END #endif // APSTUDIO_INVOKED @@ -64,13 +66,14 @@ END // RCDATA // -IDR_CONVERT_FX RCDATA "res\\convert.fx" IDR_TFX_FX RCDATA "res\\tfx.fx" -IDR_MERGE_FX RCDATA "res\\merge.fx" +IDR_CONVERT_FX RCDATA "res\\convert.fx" IDR_INTERLACE_FX RCDATA "res\\interlace.fx" +IDR_MERGE_FX RCDATA "res\\merge.fx" IDR_FXAA_FX RCDATA "res\\fxaa.fx" IDR_CS_FX RCDATA "res\\cs.fx" IDR_SHADEBOOST_FX RCDATA "res\\shadeboost.fx" +IDR_TFX_CL RCDATA "res\\tfx.cl" ///////////////////////////////////////////////////////////////////////////// // @@ -394,6 +397,10 @@ END #include "res/convert.fx" #include "res/interlace.fx" #include "res/merge.fx" +#include "res/fxaa.fx" +#include "res/cs.fx" +#include "res/shadeboost.fx" +#include "res/tfx.cl" ///////////////////////////////////////////////////////////////////////////// #endif // not APSTUDIO_INVOKED diff --git a/plugins/GSdx/GSdx_vs2013.vcxproj b/plugins/GSdx/GSdx_vs2013.vcxproj index f9ee445906..ef165a5839 100644 --- a/plugins/GSdx/GSdx_vs2013.vcxproj +++ b/plugins/GSdx/GSdx_vs2013.vcxproj @@ -687,6 +687,7 @@ AssemblyAndSourceCode + @@ -1970,6 +1971,7 @@ + @@ -2057,6 +2059,7 @@ + diff --git a/plugins/GSdx/GSdx_vs2013.vcxproj.filters b/plugins/GSdx/GSdx_vs2013.vcxproj.filters index 1067b76f32..b0053934e9 100644 --- a/plugins/GSdx/GSdx_vs2013.vcxproj.filters +++ b/plugins/GSdx/GSdx_vs2013.vcxproj.filters @@ -348,6 +348,9 @@ Source Files + + Source Files + @@ -707,6 +710,9 @@ Header Files + + Header Files + @@ -737,10 +743,13 @@ Shaders + + Shaders + Shaders - + Shaders diff --git a/plugins/GSdx/res/cs.fx b/plugins/GSdx/res/cs.fx index fb63c0b012..c84211ba95 100644 --- a/plugins/GSdx/res/cs.fx +++ b/plugins/GSdx/res/cs.fx @@ -1,3 +1,5 @@ +#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency + #ifndef VS_TME #define VS_TME 1 #define VS_FST 1 @@ -381,3 +383,5 @@ void ps_main1(GS_OUTPUT input) WritePixel(addr.x, c, PS_FPSM); WritePixel(addr.y, z, PS_ZPSM); } + +#endif diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl new file mode 100644 index 0000000000..d28622380e --- /dev/null +++ b/plugins/GSdx/res/tfx.cl @@ -0,0 +1,1619 @@ +#ifdef __OPENCL_C_VERSION__ // make safe to include in resource file to enforce dependency + +#ifndef CL_FLT_EPSILON +#define CL_FLT_EPSILON 1.1920928955078125e-7 +#endif + +#if MAX_PRIM_PER_BATCH == 64u + #define BIN_TYPE ulong +#elif MAX_PRIM_PER_BATCH == 32u + #define BIN_TYPE uint +#else + #error "MAX_PRIM_PER_BATCH != 32u OR 64u" +#endif + +typedef struct +{ + union {float4 p; struct {float x, y, z, f;};}; + union {float4 tc; struct {float s, t, q; uchar4 c;};}; +} gs_vertex; + +typedef struct +{ + gs_vertex v[4]; +} gs_prim; + +typedef struct +{ + float4 dx, dy; + float4 zero; + float4 reject_corner; +} gs_barycentric; + +typedef struct +{ + uint batch_counter; + uint _pad[7]; + struct {uint first, last;} bounds[MAX_BIN_PER_BATCH]; + BIN_TYPE bin[MAX_BIN_COUNT]; + uchar4 bbox[MAX_PRIM_COUNT]; + gs_prim prim[MAX_PRIM_COUNT]; + gs_barycentric barycentric[MAX_PRIM_COUNT]; +} gs_env; + +typedef struct +{ + int4 scissor; + int4 bbox; + int4 rect; + char dimx[4][4]; + ulong sel; + uint fbp, zbp, bw; + uint fm, zm; + uchar4 fog; // rgb + uchar aref, afix; + uchar ta0, ta1; + uint tbp[7], tbw[7]; + int minu, maxu, minv, maxv; + int lod; // lcm == 1 + int mxl; + float l; // TEX1.L * -0x10000 + float k; // TEX1.K * 0x10000 + uchar4 clut[256]; // TODO: this could be an index to a separate buffer, it may be the same across several gs_params following eachother +} gs_param; + +enum GS_PRIM_CLASS +{ + GS_POINT_CLASS, + GS_LINE_CLASS, + GS_TRIANGLE_CLASS, + GS_SPRITE_CLASS +}; + +enum GS_PSM_TARGET +{ + PSM_PSMCT32, + PSM_PSMCT24, + PSM_PSMCT16, + PSM_PSMCT16S, + PSM_PSMZ32, + PSM_PSMZ24, + PSM_PSMZ16, + PSM_PSMZ16S, + PSM_PSMT8, + PSM_PSMT4, + PSM_PSMT8H, + PSM_PSMT4HL, + PSM_PSMT4HH, +}; + +enum GS_TFX +{ + TFX_MODULATE = 0, + TFX_DECAL = 1, + TFX_HIGHLIGHT = 2, + TFX_HIGHLIGHT2 = 3, + TFX_NONE = 4, +}; + +enum GS_CLAMP +{ + CLAMP_REPEAT = 0, + CLAMP_CLAMP = 1, + CLAMP_REGION_CLAMP = 2, + CLAMP_REGION_REPEAT = 3, +}; + +enum GS_ZTST +{ + ZTST_NEVER = 0, + ZTST_ALWAYS = 1, + ZTST_GEQUAL = 2, + ZTST_GREATER = 3, +}; + +enum GS_ATST +{ + ATST_NEVER = 0, + ATST_ALWAYS = 1, + ATST_LESS = 2, + ATST_LEQUAL = 3, + ATST_EQUAL = 4, + ATST_GEQUAL = 5, + ATST_GREATER = 6, + ATST_NOTEQUAL = 7, +}; + +enum GS_AFAIL +{ + AFAIL_KEEP = 0, + AFAIL_FB_ONLY = 1, + AFAIL_ZB_ONLY = 2, + AFAIL_RGB_ONLY = 3, +}; + +__constant uchar blockTable32[4][8] = +{ + { 0, 1, 4, 5, 16, 17, 20, 21}, + { 2, 3, 6, 7, 18, 19, 22, 23}, + { 8, 9, 12, 13, 24, 25, 28, 29}, + { 10, 11, 14, 15, 26, 27, 30, 31} +}; + +__constant uchar blockTable32Z[4][8] = +{ + { 24, 25, 28, 29, 8, 9, 12, 13}, + { 26, 27, 30, 31, 10, 11, 14, 15}, + { 16, 17, 20, 21, 0, 1, 4, 5}, + { 18, 19, 22, 23, 2, 3, 6, 7} +}; + +__constant uchar blockTable16[8][4] = +{ + { 0, 2, 8, 10 }, + { 1, 3, 9, 11 }, + { 4, 6, 12, 14 }, + { 5, 7, 13, 15 }, + { 16, 18, 24, 26 }, + { 17, 19, 25, 27 }, + { 20, 22, 28, 30 }, + { 21, 23, 29, 31 } +}; + +__constant uchar blockTable16S[8][4] = +{ + { 0, 2, 16, 18 }, + { 1, 3, 17, 19 }, + { 8, 10, 24, 26 }, + { 9, 11, 25, 27 }, + { 4, 6, 20, 22 }, + { 5, 7, 21, 23 }, + { 12, 14, 28, 30 }, + { 13, 15, 29, 31 } +}; + +__constant uchar blockTable16Z[8][4] = +{ + { 24, 26, 16, 18 }, + { 25, 27, 17, 19 }, + { 28, 30, 20, 22 }, + { 29, 31, 21, 23 }, + { 8, 10, 0, 2 }, + { 9, 11, 1, 3 }, + { 12, 14, 4, 6 }, + { 13, 15, 5, 7 } +}; + +__constant uchar blockTable16SZ[8][4] = +{ + { 24, 26, 8, 10 }, + { 25, 27, 9, 11 }, + { 16, 18, 0, 2 }, + { 17, 19, 1, 3 }, + { 28, 30, 12, 14 }, + { 29, 31, 13, 15 }, + { 20, 22, 4, 6 }, + { 21, 23, 5, 7 } +}; + +__constant uchar blockTable8[4][8] = +{ + { 0, 1, 4, 5, 16, 17, 20, 21}, + { 2, 3, 6, 7, 18, 19, 22, 23}, + { 8, 9, 12, 13, 24, 25, 28, 29}, + { 10, 11, 14, 15, 26, 27, 30, 31} +}; + +__constant uchar blockTable4[8][4] = +{ + { 0, 2, 8, 10 }, + { 1, 3, 9, 11 }, + { 4, 6, 12, 14 }, + { 5, 7, 13, 15 }, + { 16, 18, 24, 26 }, + { 17, 19, 25, 27 }, + { 20, 22, 28, 30 }, + { 21, 23, 29, 31 } +}; + +__constant uchar columnTable32[8][8] = +{ + { 0, 1, 4, 5, 8, 9, 12, 13 }, + { 2, 3, 6, 7, 10, 11, 14, 15 }, + { 16, 17, 20, 21, 24, 25, 28, 29 }, + { 18, 19, 22, 23, 26, 27, 30, 31 }, + { 32, 33, 36, 37, 40, 41, 44, 45 }, + { 34, 35, 38, 39, 42, 43, 46, 47 }, + { 48, 49, 52, 53, 56, 57, 60, 61 }, + { 50, 51, 54, 55, 58, 59, 62, 63 }, +}; + +__constant uchar columnTable16[8][16] = +{ + { 0, 2, 8, 10, 16, 18, 24, 26, + 1, 3, 9, 11, 17, 19, 25, 27 }, + { 4, 6, 12, 14, 20, 22, 28, 30, + 5, 7, 13, 15, 21, 23, 29, 31 }, + { 32, 34, 40, 42, 48, 50, 56, 58, + 33, 35, 41, 43, 49, 51, 57, 59 }, + { 36, 38, 44, 46, 52, 54, 60, 62, + 37, 39, 45, 47, 53, 55, 61, 63 }, + { 64, 66, 72, 74, 80, 82, 88, 90, + 65, 67, 73, 75, 81, 83, 89, 91 }, + { 68, 70, 76, 78, 84, 86, 92, 94, + 69, 71, 77, 79, 85, 87, 93, 95 }, + { 96, 98, 104, 106, 112, 114, 120, 122, + 97, 99, 105, 107, 113, 115, 121, 123 }, + { 100, 102, 108, 110, 116, 118, 124, 126, + 101, 103, 109, 111, 117, 119, 125, 127 }, +}; + +__constant uchar columnTable8[16][16] = +{ + { 0, 4, 16, 20, 32, 36, 48, 52, // column 0 + 2, 6, 18, 22, 34, 38, 50, 54 }, + { 8, 12, 24, 28, 40, 44, 56, 60, + 10, 14, 26, 30, 42, 46, 58, 62 }, + { 33, 37, 49, 53, 1, 5, 17, 21, + 35, 39, 51, 55, 3, 7, 19, 23 }, + { 41, 45, 57, 61, 9, 13, 25, 29, + 43, 47, 59, 63, 11, 15, 27, 31 }, + { 96, 100, 112, 116, 64, 68, 80, 84, // column 1 + 98, 102, 114, 118, 66, 70, 82, 86 }, + { 104, 108, 120, 124, 72, 76, 88, 92, + 106, 110, 122, 126, 74, 78, 90, 94 }, + { 65, 69, 81, 85, 97, 101, 113, 117, + 67, 71, 83, 87, 99, 103, 115, 119 }, + { 73, 77, 89, 93, 105, 109, 121, 125, + 75, 79, 91, 95, 107, 111, 123, 127 }, + { 128, 132, 144, 148, 160, 164, 176, 180, // column 2 + 130, 134, 146, 150, 162, 166, 178, 182 }, + { 136, 140, 152, 156, 168, 172, 184, 188, + 138, 142, 154, 158, 170, 174, 186, 190 }, + { 161, 165, 177, 181, 129, 133, 145, 149, + 163, 167, 179, 183, 131, 135, 147, 151 }, + { 169, 173, 185, 189, 137, 141, 153, 157, + 171, 175, 187, 191, 139, 143, 155, 159 }, + { 224, 228, 240, 244, 192, 196, 208, 212, // column 3 + 226, 230, 242, 246, 194, 198, 210, 214 }, + { 232, 236, 248, 252, 200, 204, 216, 220, + 234, 238, 250, 254, 202, 206, 218, 222 }, + { 193, 197, 209, 213, 225, 229, 241, 245, + 195, 199, 211, 215, 227, 231, 243, 247 }, + { 201, 205, 217, 221, 233, 237, 249, 253, + 203, 207, 219, 223, 235, 239, 251, 255 }, +}; + +__constant ushort columnTable4[16][32] = +{ + { 0, 8, 32, 40, 64, 72, 96, 104, // column 0 + 2, 10, 34, 42, 66, 74, 98, 106, + 4, 12, 36, 44, 68, 76, 100, 108, + 6, 14, 38, 46, 70, 78, 102, 110 }, + { 16, 24, 48, 56, 80, 88, 112, 120, + 18, 26, 50, 58, 82, 90, 114, 122, + 20, 28, 52, 60, 84, 92, 116, 124, + 22, 30, 54, 62, 86, 94, 118, 126 }, + { 65, 73, 97, 105, 1, 9, 33, 41, + 67, 75, 99, 107, 3, 11, 35, 43, + 69, 77, 101, 109, 5, 13, 37, 45, + 71, 79, 103, 111, 7, 15, 39, 47 }, + { 81, 89, 113, 121, 17, 25, 49, 57, + 83, 91, 115, 123, 19, 27, 51, 59, + 85, 93, 117, 125, 21, 29, 53, 61, + 87, 95, 119, 127, 23, 31, 55, 63 }, + { 192, 200, 224, 232, 128, 136, 160, 168, // column 1 + 194, 202, 226, 234, 130, 138, 162, 170, + 196, 204, 228, 236, 132, 140, 164, 172, + 198, 206, 230, 238, 134, 142, 166, 174 }, + { 208, 216, 240, 248, 144, 152, 176, 184, + 210, 218, 242, 250, 146, 154, 178, 186, + 212, 220, 244, 252, 148, 156, 180, 188, + 214, 222, 246, 254, 150, 158, 182, 190 }, + { 129, 137, 161, 169, 193, 201, 225, 233, + 131, 139, 163, 171, 195, 203, 227, 235, + 133, 141, 165, 173, 197, 205, 229, 237, + 135, 143, 167, 175, 199, 207, 231, 239 }, + { 145, 153, 177, 185, 209, 217, 241, 249, + 147, 155, 179, 187, 211, 219, 243, 251, + 149, 157, 181, 189, 213, 221, 245, 253, + 151, 159, 183, 191, 215, 223, 247, 255 }, + { 256, 264, 288, 296, 320, 328, 352, 360, // column 2 + 258, 266, 290, 298, 322, 330, 354, 362, + 260, 268, 292, 300, 324, 332, 356, 364, + 262, 270, 294, 302, 326, 334, 358, 366 }, + { 272, 280, 304, 312, 336, 344, 368, 376, + 274, 282, 306, 314, 338, 346, 370, 378, + 276, 284, 308, 316, 340, 348, 372, 380, + 278, 286, 310, 318, 342, 350, 374, 382 }, + { 321, 329, 353, 361, 257, 265, 289, 297, + 323, 331, 355, 363, 259, 267, 291, 299, + 325, 333, 357, 365, 261, 269, 293, 301, + 327, 335, 359, 367, 263, 271, 295, 303 }, + { 337, 345, 369, 377, 273, 281, 305, 313, + 339, 347, 371, 379, 275, 283, 307, 315, + 341, 349, 373, 381, 277, 285, 309, 317, + 343, 351, 375, 383, 279, 287, 311, 319 }, + { 448, 456, 480, 488, 384, 392, 416, 424, // column 3 + 450, 458, 482, 490, 386, 394, 418, 426, + 452, 460, 484, 492, 388, 396, 420, 428, + 454, 462, 486, 494, 390, 398, 422, 430 }, + { 464, 472, 496, 504, 400, 408, 432, 440, + 466, 474, 498, 506, 402, 410, 434, 442, + 468, 476, 500, 508, 404, 412, 436, 444, + 470, 478, 502, 510, 406, 414, 438, 446 }, + { 385, 393, 417, 425, 449, 457, 481, 489, + 387, 395, 419, 427, 451, 459, 483, 491, + 389, 397, 421, 429, 453, 461, 485, 493, + 391, 399, 423, 431, 455, 463, 487, 495 }, + { 401, 409, 433, 441, 465, 473, 497, 505, + 403, 411, 435, 443, 467, 475, 499, 507, + 405, 413, 437, 445, 469, 477, 501, 509, + 407, 415, 439, 447, 471, 479, 503, 511 }, +}; + +uint BlockNumber32(int x, int y, uint bp, uint bw) +{ + return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7]; +} + +uint BlockNumber16(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3]; +} + +uint BlockNumber16S(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3]; +} + +uint BlockNumber32Z(int x, int y, uint bp, uint bw) +{ + return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7]; +} + +uint BlockNumber16Z(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3]; +} + +uint BlockNumber16SZ(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3]; +} + +uint BlockNumber8(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7]; +} + +uint BlockNumber4(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3]; +} + +uint PixelAddress32(int x, int y, uint bp, uint bw) +{ + return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; +} + +uint PixelAddress16(int x, int y, uint bp, uint bw) +{ + return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; +} + +uint PixelAddress16S(int x, int y, uint bp, uint bw) +{ + return (BlockNumber16S(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; +} + +uint PixelAddress32Z(int x, int y, uint bp, uint bw) +{ + return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; +} + +uint PixelAddress16Z(int x, int y, uint bp, uint bw) +{ + return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; +} + +uint PixelAddress16SZ(int x, int y, uint bp, uint bw) +{ + return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; +} + +uint PixelAddress8(int x, int y, uint bp, uint bw) +{ + return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15]; +} + +uint PixelAddress4(int x, int y, uint bp, uint bw) +{ + return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31]; +} + +uint PixelAddress(int x, int y, uint bp, uint bw, uint psm) +{ + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMCT24: + case PSM_PSMT8H: + case PSM_PSMT4HL: + case PSM_PSMT4HH: + return PixelAddress32(x, y, bp, bw); + case PSM_PSMCT16: + return PixelAddress16(x, y, bp, bw); + case PSM_PSMCT16S: + return PixelAddress16S(x, y, bp, bw); + case PSM_PSMZ32: + case PSM_PSMZ24: + return PixelAddress32Z(x, y, bp, bw); + case PSM_PSMZ16: + return PixelAddress16Z(x, y, bp, bw); + case PSM_PSMZ16S: + return PixelAddress16SZ(x, y, bp, bw); + case PSM_PSMT8: + return PixelAddress8(x, y, bp, bw); + case PSM_PSMT4: + return PixelAddress4(x, y, bp, bw); + } +} + +uint TileBlockNumber(int x, int y, uint bp, uint bw, uint psm) +{ + // TODO: replace blockTable with a subset tileTable + + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMCT24: + return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 2][(x >> 3) & 6]; + case PSM_PSMCT16: + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 2][(x >> 4) & 3]; + case PSM_PSMCT16S: + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 2][(x >> 4) & 3]; + case PSM_PSMZ32: + case PSM_PSMZ24: + return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 2][(x >> 3) & 6]; + case PSM_PSMZ16: + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 2][(x >> 4) & 3]; + case PSM_PSMZ16S: + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 2][(x >> 4) & 3]; + } +} + +uint TilePixelAddress(int x, int y, uint ba, uint psm) +{ + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMCT24: + case PSM_PSMZ32: + case PSM_PSMZ24: + return ((ba + ((y >> 2) & 2) + ((x >> 3) & 1)) << 6) + columnTable32[y & 7][x & 7]; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + return ((ba + ((y >> 3) & 1)) << 7) + columnTable16[y & 7][x & 15]; + } +} + +uint ReadFrame(__global uchar* vm, uint addr, uint psm) +{ + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMCT24: + case PSM_PSMZ32: + case PSM_PSMZ24: + return ((__global uint*)vm)[addr]; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + return ((__global ushort*)vm)[addr]; + } +} + +void WriteFrame(__global uchar* vm, uint addr, uint psm, uint value) +{ + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMZ32: + case PSM_PSMCT24: + case PSM_PSMZ24: + ((__global uint*)vm)[addr] = value; + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + ((__global ushort*)vm)[addr] = (ushort)value; + break; + } +} + +bool is16bit(int psm) +{ + return psm < 8 && (psm & 3) >= 2; +} + +bool is24bit(int psm) +{ + return psm < 8 && (psm & 3) == 1; +} + +bool is32bit(int psm) +{ + return psm < 8 && (psm & 3) == 0; +} + +#ifdef PRIM + +int GetVertexPerPrim(int prim_class) +{ + switch(prim_class) + { + default: + case GS_POINT_CLASS: return 1; + case GS_LINE_CLASS: return 2; + case GS_TRIANGLE_CLASS: return 3; + case GS_SPRITE_CLASS: return 2; + } +} + +#define VERTEX_PER_PRIM GetVertexPerPrim(PRIM) + +#endif + +#ifdef KERNEL_PRIM + +__kernel void KERNEL_PRIM( + __global gs_env* env, + __global uchar* vb_base, + __global uchar* ib_base, + uint vb_start, + uint ib_start) +{ + size_t prim_index = get_global_id(0); + + __global gs_vertex* vb = (__global gs_vertex*)(vb_base + vb_start); + __global uint* ib = (__global uint*)(ib_base + ib_start); + __global gs_prim* prim = &env->prim[prim_index]; + + ib += prim_index * VERTEX_PER_PRIM; + + int2 pmin, pmax; + + if(PRIM == GS_POINT_CLASS) + { + pmin = pmax = convert_int2_rte(vb[ib[0]].p.xy); + } + else if(PRIM == GS_LINE_CLASS) + { + int2 p0 = convert_int2_rte(vb[ib[0]].p.xy); + int2 p1 = convert_int2_rte(vb[ib[1]].p.xy); + + pmin = min(p0, p1); + pmax = max(p0, p1); + } + else if(PRIM == GS_TRIANGLE_CLASS) + { + __global gs_vertex* v0 = &vb[ib[0]]; + __global gs_vertex* v1 = &vb[ib[1]]; + __global gs_vertex* v2 = &vb[ib[2]]; + + int2 p0 = convert_int2_rtp(v0->p.xy); + int2 p1 = convert_int2_rtp(v1->p.xy); + int2 p2 = convert_int2_rtp(v2->p.xy); + + pmin = min(min(p0, p1), p2); + pmax = max(max(p0, p1), p2); + + prim->v[0].p = v0->p; + prim->v[0].tc = v0->tc; + prim->v[1].p = v1->p; + prim->v[1].tc = v1->tc; + prim->v[2].p = v2->p; + prim->v[2].tc = v2->tc; + + float4 dp0 = v1->p - v0->p; + float4 dp1 = v0->p - v2->p; + float4 dp2 = v2->p - v1->p; + + float cp = dp0.x * dp1.y - dp0.y * dp1.x; + + if(cp != 0.0f) + { + float cp_rcp = 1.0f / cp;// native_recip(cp); + + float2 u = dp0.xy * cp_rcp; + float2 v = -dp1.xy * cp_rcp; + + // v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0) + + gs_barycentric b; + + b.dx = (float4)(-v.y, u.y, v.y - u.y, v0->p.x); + b.dy = (float4)(v.x, -u.x, u.x - v.x, v0->p.y); + + dp0.xy = dp0.xy * sign(cp); + dp1.xy = dp1.xy * sign(cp); + dp2.xy = dp2.xy * sign(cp); + + b.zero.x = (dp1.y < 0 || dp1.y == 0 && dp1.x > 0) ? CL_FLT_EPSILON : 0; + b.zero.y = (dp0.y < 0 || dp0.y == 0 && dp0.x > 0) ? CL_FLT_EPSILON : 0; + b.zero.z = (dp2.y < 0 || dp2.y == 0 && dp2.x > 0) ? CL_FLT_EPSILON : 0; + + // any barycentric(reject_corner) < 0, tile outside the triangle + + b.reject_corner.x = 0.0f + max(max(max(0.0f, b.dx.x), b.dy.x), b.dx.x + b.dy.x) * BIN_SIZE; + b.reject_corner.y = 0.0f + max(max(max(0.0f, b.dx.y), b.dy.y), b.dx.y + b.dy.y) * BIN_SIZE; + b.reject_corner.z = 1.0f + max(max(max(0.0f, b.dx.z), b.dy.z), b.dx.z + b.dy.z) * BIN_SIZE; + + // TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed + + env->barycentric[prim_index] = b; + } + else + { + // TODO: set b.zero to something that always fails the tests + } + } + else if(PRIM == GS_SPRITE_CLASS) + { + __global gs_vertex* v0 = &vb[ib[0]]; + __global gs_vertex* v1 = &vb[ib[1]]; + + int2 p0 = convert_int2_rtp(v0->p.xy); + int2 p1 = convert_int2_rtp(v1->p.xy); + + pmin = min(p0, p1); + pmax = max(p0, p1); + + int4 mask = (int4)(v0->p.xy > v1->p.xy, 0, 0); + + prim->v[0].p = select(v0->p, v1->p, mask); // pmin + prim->v[0].tc = select(v0->tc, v1->tc, mask); + prim->v[1].p = select(v1->p, v0->p, mask); // pmax + prim->v[1].tc = select(v1->tc, v0->tc, mask); + prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy); + } + + int4 pminmax = (int4)(pmin, pmax); + + env->bbox[prim_index] = convert_uchar4_sat(pminmax >> BIN_SIZE_BITS); +} + +#endif + +#ifdef KERNEL_TILE + +int tile_in_triangle(float2 p, gs_barycentric b) +{ + float3 f = b.dx.xyz * (p.x - b.dx.w) + b.dy.xyz * (p.y - b.dy.w) + b.reject_corner.xyz; + + f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)); + + return all(f >= b.zero.xyz); +} + +#if CLEAR == 1 + +__kernel void KERNEL_TILE(__global gs_env* env) +{ + env->batch_counter = 0; + env->bounds[get_global_id(0)].first = -1; + env->bounds[get_global_id(0)].last = 0; +} + +#elif MODE < 3 + +#if MAX_PRIM_PER_BATCH != 32 + #error "MAX_PRIM_PER_BATCH != 32" +#endif + +#define MAX_PRIM_PER_GROUP (32u >> MODE) + +__kernel void KERNEL_TILE( + __global gs_env* env, + uint prim_count, + uint bin_count, // == bin_dim.z * bin_dim.w + uchar4 bin_dim) +{ + uint batch_index = get_group_id(2) >> MODE; + uint prim_start = get_group_id(2) << (5 - MODE); + uint group_prim_index = get_local_id(2); + uint bin_index = get_local_id(1) * get_local_size(0) + get_local_id(0); + + __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; + __global uchar4* bbox = &env->bbox[prim_start]; + __global gs_barycentric* barycentric = &env->barycentric[prim_start]; + + __local uchar4 bbox_cache[MAX_PRIM_PER_GROUP]; + __local gs_barycentric barycentric_cache[MAX_PRIM_PER_GROUP]; + __local uint visible[8 << MODE]; + + if(get_local_id(2) == 0) + { + visible[bin_index] = 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint group_prim_count = min(prim_count - prim_start, MAX_PRIM_PER_GROUP); + + event_t e = async_work_group_copy(bbox_cache, bbox, group_prim_count, 0); + + wait_group_events(1, &e); + + if(PRIM == GS_TRIANGLE_CLASS) + { + e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, group_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); + + wait_group_events(1, &e); + } + + if(group_prim_index < group_prim_count) + { + int x = bin_dim.x + get_local_id(0); + int y = bin_dim.y + get_local_id(1); + + uchar4 r = bbox_cache[group_prim_index]; + + uint test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y); + + if(PRIM == GS_TRIANGLE_CLASS && test != 0) + { + test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]); + } + + atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2))); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(get_local_id(2) == 0) + { + #if MODE == 0 + ((__global uint*)&bin[bin_index])[0] = visible[bin_index]; + #elif MODE == 1 + ((__global ushort*)&bin[bin_index])[1 - (get_group_id(2) & 1)] = visible[bin_index]; + #elif MODE == 2 + ((__global uchar*)&bin[bin_index])[3 - (get_group_id(2) & 3)] = visible[bin_index]; + #endif + + if(visible[bin_index] != 0) + { + atomic_min(&env->bounds[bin_index].first, batch_index); + atomic_max(&env->bounds[bin_index].last, batch_index); + } + } +} + +#elif MODE == 3 + +__kernel void KERNEL_TILE( + __global gs_env* env, + uint prim_count, + uint batch_count, + uint bin_count, // == bin_dim.z * bin_dim.w + uchar4 bin_dim) +{ + __local uchar4 bbox_cache[MAX_PRIM_PER_BATCH]; + __local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH]; + __local uint batch_index; + + size_t local_id = get_local_id(0); + size_t local_size = get_local_size(0); + + while(1) + { + barrier(CLK_LOCAL_MEM_FENCE); + + if(local_id == 0) + { + batch_index = atomic_inc(&env->batch_counter); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(batch_index >= batch_count) + { + break; + } + + uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH); + + __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; + __global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS]; + __global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS]; + + event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0); + + wait_group_events(1, &e); + + if(PRIM == GS_TRIANGLE_CLASS) + { + e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); + + wait_group_events(1, &e); + } + + for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size) + { + int y = bin_index / bin_dim.z; + int x = bin_index - y * bin_dim.z; + + x += bin_dim.x; + y += bin_dim.y; + + BIN_TYPE visible = 0; + + for(uint i = 0; i < batch_prim_count; i++) + { + uchar4 r = bbox_cache[i]; + + BIN_TYPE test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y); + + if(PRIM == GS_TRIANGLE_CLASS && test != 0) + { + test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]); + } + + visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i); + } + + bin[bin_index] = visible; + + if(visible != 0) + { + atomic_min(&env->bounds[bin_index].first, batch_index); + atomic_max(&env->bounds[bin_index].last, batch_index); + } + } + } +} + +#endif + +#endif + +#ifdef KERNEL_TFX + +bool ZTest(uint zs, uint zd) +{ + if(ZTEST) + { + if(is24bit(ZPSM)) zd &= 0x00ffffff; + + switch(ZTST) + { + case ZTST_NEVER: + return false; + case ZTST_ALWAYS: + return true; + case ZTST_GEQUAL: + return zs >= zd; + case ZTST_GREATER: + return zs > zd; + } + } + + return true; +} + +bool AlphaTest(int alpha, int aref, uint* fm, uint* zm) +{ + switch(AFAIL) + { + case AFAIL_KEEP: + break; + case AFAIL_FB_ONLY: + if(!ZWRITE) return true; + break; + case AFAIL_ZB_ONLY: + if(!FWRITE) return true; + break; + case AFAIL_RGB_ONLY: + if(!ZWRITE && is24bit(FPSM)) return true; + break; + } + + uint pass; + + switch(ATST) + { + case ATST_NEVER: + pass = false; + break; + case ATST_ALWAYS: + return true; + case ATST_LESS: + pass = alpha < aref; + break; + case ATST_LEQUAL: + pass = alpha <= aref; + break; + case ATST_EQUAL: + pass = alpha == aref; + break; + case ATST_GEQUAL: + pass = alpha >= aref; + break; + case ATST_GREATER: + pass = alpha > aref; + break; + case ATST_NOTEQUAL: + pass = alpha != aref; + break; + } + + switch(AFAIL) + { + case AFAIL_KEEP: + return pass; + case AFAIL_FB_ONLY: + *zm |= pass ? 0 : 0xffffffff; + break; + case AFAIL_ZB_ONLY: + *fm |= pass ? 0 : 0xffffffff; + break; + case AFAIL_RGB_ONLY: + *fm |= pass ? 0 : 0xff000000; + *zm |= pass ? 0 : 0xffffffff; + break; + } + + return true; +} + +bool DestAlphaTest(uint fd) +{ + if(DATE) + { + if(DATM) + { + if(is32bit(FPSM)) return (fd & 0x80000000) != 0; + if(is16bit(FPSM)) return (fd & 0x00008000) != 0; + } + else + { + if(is32bit(FPSM)) return (fd & 0x80000000) == 0; + if(is16bit(FPSM)) return (fd & 0x00008000) == 0; + } + } + + return true; +} + +int Wrap(int a, int b, int c, int mode) +{ + switch(mode) + { + case CLAMP_REPEAT: + return a & b; + case CLAMP_CLAMP: + return clamp(a, 0, c); + case CLAMP_REGION_CLAMP: + return clamp(a, b, c); + case CLAMP_REGION_REPEAT: + return (a & b) | c; + } +} + +int4 AlphaBlend(int4 c, int afix, uint fd) +{ + if(FWRITE && (ABE || AA1)) + { + int4 cs = c; + int4 cd; + + if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1) + { + if(is32bit(FPSM) || is24bit(FPSM)) + { + cd.x = fd & 0xff; + cd.y = (fd >> 8) & 0xff; + cd.z = (fd >> 16) & 0xff; + cd.w = fd >> 24; + } + else if(is16bit(FPSM)) + { + cd.x = (fd & 0x001f) << 3; + cd.y = (fd & 0x03e0) >> 2; + cd.z = (fd & 0x7c00) >> 7; + cd.w = (fd & 0x8000) >> 8; + } + } + + if(ABA != ABB) + { + switch(ABA) + { + case 0: break; // c.xyz = cs.xyz; + case 1: c.xyz = cd.xyz; break; + case 2: c.xyz = 0; break; + } + + switch(ABB) + { + case 0: c.xyz -= cs.xyz; break; + case 1: c.xyz -= cd.xyz; break; + case 2: break; + } + + if(!(is24bit(FPSM) && ABC == 1)) + { + int a = 0; + + switch(ABC) + { + case 0: a = cs.w; break; + case 1: a = cd.w; break; + case 2: a = afix; break; + } + + c.xyz = c.xyz * a >> 7; + } + + switch(ABD) + { + case 0: c.xyz += cs.xyz; break; + case 1: c.xyz += cd.xyz; break; + case 2: break; + } + } + else + { + switch(ABD) + { + case 0: break; + case 1: c.xyz = cd.xyz; break; + case 2: c.xyz = 0; break; + } + } + + if(PABE) + { + c.xyz = select(cs.xyz, c.xyz, (int3)(cs.w << 24)); + } + } + + return c; +} + +uchar4 Expand24To32(uint rgba, uchar ta0) +{ + uchar4 c; + + c.x = rgba & 0xff; + c.y = (rgba >> 8) & 0xff; + c.z = (rgba >> 16) & 0xff; + c.w = !AEM || (rgba & 0xffffff) != 0 ? ta0 : 0; + + return c; +} + +uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1) +{ + uchar4 c; + + c.x = (rgba & 0x001f) << 3; + c.y = (rgba & 0x03e0) >> 2; + c.z = (rgba & 0x7c00) >> 7; + c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0; + + return c; +} + +int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* pb) +{ + uchar4 c; + + uint addr = PixelAddress(x, y, pb->tbp[level], pb->tbw[level], TPSM); + + __global ushort* vm16 = (__global ushort*)vm; + __global uint* vm32 = (__global uint*)vm; + + switch(TPSM) + { + default: + case PSM_PSMCT32: + case PSM_PSMZ32: + c = ((__global uchar4*)vm)[addr]; + break; + case PSM_PSMCT24: + case PSM_PSMZ24: + c = Expand24To32(vm32[addr], pb->ta0); + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + c = Expand16To32(vm16[addr], pb->ta0, pb->ta1); + break; + case PSM_PSMT8: + c = pb->clut[vm[addr]]; + break; + case PSM_PSMT4: + c = pb->clut[(vm[addr] >> ((addr & 1) << 2)) & 0x0f]; + break; + case PSM_PSMT8H: + c = pb->clut[vm32[addr] >> 24]; + break; + case PSM_PSMT4HL: + c = pb->clut[(vm32[addr] >> 24) & 0x0f]; + break; + case PSM_PSMT4HH: + c = pb->clut[(vm32[addr] >> 28) & 0x0f]; + break; + } + + //printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm32[addr]); + + return convert_int4(c); +} + +__kernel void KERNEL_TFX( + __global gs_env* env, + __global uchar* vm, + __global uchar* tex, + __global uchar* pb_base, + uint pb_start, + uint prim_start, + uint prim_count, + uint batch_count, + uint bin_count, // == bin_dim.z * bin_dim.w + uchar4 bin_dim) +{ + // TODO: try it the bin_index = atomic_inc(&env->bin_counter) way + + uint bin_x = (get_global_id(0) >> BIN_SIZE_BITS) - bin_dim.x; + uint bin_y = (get_global_id(1) >> BIN_SIZE_BITS) - bin_dim.y; + uint bin_index = bin_y * bin_dim.z + bin_x; + + uint batch_first = env->bounds[bin_index].first; + uint batch_last = env->bounds[bin_index].last; + uint batch_start = prim_start >> MAX_PRIM_PER_BATCH_BITS; + + if(batch_last < batch_first) + { + return; + } + + uint skip; + + if(batch_start < batch_first) + { + uint n = (batch_first - batch_start) * MAX_PRIM_PER_BATCH - (prim_start & (MAX_PRIM_PER_BATCH - 1)); + + if(n > prim_count) + { + return; + } + + skip = 0; + prim_count -= n; + batch_start = batch_first; + } + else + { + skip = prim_start & (MAX_PRIM_PER_BATCH - 1); + prim_count += skip; + } + + if(batch_start > batch_last) + { + return; + } + + prim_count = min(prim_count, (batch_last - batch_start + 1) << MAX_PRIM_PER_BATCH_BITS); + + // + + __global gs_param* pb = (__global gs_param*)(pb_base + pb_start); + + uint x = get_global_id(0); + uint y = get_global_id(1); + + int2 pi = (int2)(x, y); + float2 pf = convert_float2(pi); + + if(!NOSCISSOR) + { + int4 scissor = pb->scissor; + + if(!all((pi >= scissor.xy) & (pi < scissor.zw))) + { + return; + } + } + + uint faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM); + uint zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM); + + uint fd, zd; + + if(RFB) + { + fd = ReadFrame(vm, faddr, FPSM); + } + + if(ZTEST) + { + zd = ReadFrame(vm, zaddr, ZPSM); + } +/* + // TODO: lookup top left address of this tile + local offset + // + // 32bpp: 8x8 block size, 4 blocks, 1024 bytes + // 0 1 + // 2 3 + // 16bpp: 16x8 block size, 2 blocks, 512 bytes + // 0 + // 1 + // linear access in memory, this layout is the same for all formats + + __local uint fbn, zbn; + __local uchar fb[1024], zb[1024]; + + if(get_local_id(0) == 0 && get_local_id(1) == 0) + { + fbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM); + zbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint faddr = TilePixelAddress(x, y, fbn, FPSM); + uint zaddr = TilePixelAddress(x, y, zbn, ZPSM); + + if(RFB) + { + event_t e = async_work_group_copy((__local uint4*)fb, (__global uint4*)&vm[fbn << 8], 1024 / sizeof(uint4), 0); + + wait_group_events(1, &e); + } + + if(ZTEST) + { + event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0); + + wait_group_events(1, &e); + } + + // not sure if faster +*/ + + // early destination alpha test + + if(!DestAlphaTest(fd)) + { + return; + } + + // + + uint fragments = 0; + + //__local gs_prim p; + + __global BIN_TYPE* bin = &env->bin[bin_index + batch_start * bin_count]; // TODO: not needed for "one tile case" + __global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS]; + __global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS]; + + BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); + + __local gs_prim prim_cache; + + for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) + { + while(bin_value != 0) + { + uint i = clz(bin_value); + + if(prim_index + i >= prim_count) + { + break; + } + + __global gs_prim* prim = &prim_base[prim_index + i]; + + bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); + + uint2 zf; + float4 t; + int4 c; + + // TODO: do not hittest if we know the tile is fully inside the prim + + if(PRIM == GS_POINT_CLASS) + { + // TODO: distance.x < 0.5f || distance.y < 0.5f + + continue; + } + else if(PRIM == GS_LINE_CLASS) + { + // TODO: find point on line prependicular to (x,y), distance.x < 0.5f || distance.y < 0.5f + + continue; + } + else if(PRIM == GS_TRIANGLE_CLASS) + { + __global gs_barycentric* b = &barycentric[prim_index + i]; + + float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1); + + f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)); + + if(!all(f >= b->zero.xyz)) + { + continue; + } + + zf = convert_uint2(prim->v[0].p.zw * f.z + prim->v[1].p.zw * f.x + prim->v[2].p.zw * f.y); + + t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; + + if(IIP) + { + float4 c0 = convert_float4(prim->v[0].c); + float4 c1 = convert_float4(prim->v[1].c); + float4 c2 = convert_float4(prim->v[2].c); + + c = convert_int4(c0 * f.z + c1 * f.x + c2 * f.y); + } + else + { + c = convert_int4(prim->v[2].c); + } + } + else if(PRIM == GS_SPRITE_CLASS) + { + int2 tl = convert_int2_rtp(prim->v[0].p.xy); + int2 br = convert_int2_rtp(prim->v[1].p.xy); + + if(!all((pi >= tl) & (pi < br))) + { + continue; + } + + zf = convert_uint2(prim->v[1].p.zw); // TODO: send as uint + + t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy); + t.z = prim->v[0].tc.z; + + c = convert_int4(prim->v[1].c); + } + + // z test + + uint zs = zf.x; + + if(!ZTest(zs, zd)) + { + continue; + } + + // sample texture + + int4 ct; + + if(FB && TFX != TFX_NONE) + { + // TODO + + if(0)//if(MMIN) + { + // TODO + } + else + { + int2 uv; + + if(!FST) + { + uv = convert_int2(t.xy * (1.0f / t.z)); + + if(LTF) uv -= 0x0008; + } + else + { + uv = convert_int2(t.xy); + } + + int2 uvf = uv & 0x000f; + + int2 uv0 = uv >> 4; + int2 uv1 = uv0 + 1; + + uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS); + uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT); + uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS); + uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT); + + tex = vm; // TODO: use the texture cache + + int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb); + int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb); + int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb); + int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb); + + if(LTF) + { + c00 = ((c01 - c00) * uvf.x >> 4) + c00; + c10 = ((c11 - c10) * uvf.x >> 4) + c10; + c00 = ((c10 - c00) * uvf.y >> 4) + c00; + } + + ct = c00; + } + } + + // alpha tfx + + if(FB) + { + if(TCC) + { + switch(TFX) + { + case TFX_MODULATE: + c.w = clamp(ct.w * c.w >> 7, 0, 0xff); + break; + case TFX_DECAL: + c.w = ct.w; + break; + case TFX_HIGHLIGHT: + c.w = clamp(ct.w + c.w, 0, 0xff); + break; + case TFX_HIGHLIGHT2: + c.w = ct.w; + break; + } + } + + if(AA1) + { + if(!ABE || c.w == 0x80) + { + // TODO: c.w = coverage; // coverage 0x80 at 100% + } + } + } + + // read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway) + + uint fm = pb->fm; + uint zm = pb->zm; + + // alpha test + + if(!AlphaTest(c.w, pb->aref, &fm, &zm)) + { + continue; + } + + // all tests done, we have a new output + + fragments++; + + // write z + + if(ZWRITE) + { + zd = bitselect(zs, zd, zm); + } + + // rgb tfx + + if(FWRITE) + { + switch(TFX) + { + case TFX_MODULATE: + c.xyz = clamp(ct.xyz * c.xyz >> 7, 0, 0xff); + break; + case TFX_DECAL: + c.xyz = ct.xyz; + break; + case TFX_HIGHLIGHT: + case TFX_HIGHLIGHT2: + c.xyz = clamp((ct.xyz * c.xyz >> 7) + c.w, 0, 0xff); + break; + } + } + + // fog + + if(FWRITE && FGE) + { + int fog = (int)zf.y; + + c.xyz = (c.xyz * fog >> 8) + (convert_int4(pb->fog).xyz * (int3)(0xff - fog) >> 8); + } + + // alpha blend + + c = AlphaBlend(c, pb->afix, fd); + + // write frame + + if(FWRITE) + { + if(DTHE && is16bit(FPSM)) + { + // TODO: c += pb->dimx[y & 3] + } + + c = COLCLAMP ? clamp(c, 0, 0xff) : (c & 0xff); + + if(FBA && !is24bit(FPSM)) + { + c.w |= 0x80; + } + + uint fs; + + if(is32bit(FPSM)) + { + fs = (c.w << 24) | (c.z << 16) | (c.y << 8) | c.x; + } + else if(is24bit(FPSM)) + { + fs = (c.z << 16) | (c.y << 8) | c.x; + } + else if(is16bit(FPSM)) + { + fs = ((c.w & 0x80) << 8) | ((c.z & 0xf8) << 7) | ((c.y & 0xf8) << 2) | (c.x >> 3); + } + + fd = RFB ? bitselect(fs, fd, fm) : fs; + + // dest alpha test for the next loop + + if(!DestAlphaTest(fd)) + { + prim_index = prim_count; // game over + + break; + } + } + } + + bin += bin_count; + bin_value = *bin; + } + + if(fragments > 0) + { + // TODO: write color/z to faddr/zaddr (if 16x16 was cached, barrier local mem, swizzle back to its place) + + // TODO if(fm/zm != 0xffffffff) or whatever masks the output completely for the pixel format) + + if(ZWRITE) + { + WriteFrame(vm, zaddr, ZPSM, zd); + } + + if(FWRITE) + { + WriteFrame(vm, faddr, FPSM, fd); + //WriteFrame(vm, faddr, FPSM, 0xff202020 * fragments); + } + } +} + +#endif + +#endif diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index 02626f8dad..b9bfa19737 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -43,6 +43,7 @@ #include #include #include "../../common/include/comptr.h" +#include #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA) #define D3D11_SHADER_MACRO D3D10_SHADER_MACRO diff --git a/plugins/GSdx/vsprops/common.props b/plugins/GSdx/vsprops/common.props index 71bf73f281..91c58981d0 100644 --- a/plugins/GSdx/vsprops/common.props +++ b/plugins/GSdx/vsprops/common.props @@ -8,22 +8,22 @@ true - _WINDOWS;_WIN32_WINNT=0x500;%(PreprocessorDefinitions) + _WINDOWS;_WIN32_WINNT=0x500;__CL_ENABLE_EXCEPTIONS;%(PreprocessorDefinitions) Fast false Level4 ProgramDatabase 4996;4995;4324;4100;4101;4201;4556;4127;4512;%(DisableSpecificWarnings) - $(DXSDK_DIR)include;$(VTUNE_AMPLIFIER_XE_2013_DIR)include;$(SolutionDir)3rdparty;%(AdditionalIncludeDirectories) + $(DXSDK_DIR)include;$(INTELOCLSDKROOT)include;$(VTUNE_AMPLIFIER_XE_2015_DIR)include;$(SolutionDir)3rdparty;%(AdditionalIncludeDirectories) true - d3d11.lib;d3dx11.lib;d3d10_1.lib;d3dx10.lib;d3d9.lib;d3dx9.lib;dxgi.lib;dxguid.lib;winmm.lib;strmiids.lib;xinput.lib;opengl32.lib;comsuppw.lib;%(AdditionalDependencies) + d3d11.lib;d3dx11.lib;d3d10_1.lib;d3dx10.lib;d3d9.lib;d3dx9.lib;dxgi.lib;dxguid.lib;winmm.lib;strmiids.lib;xinput.lib;opengl32.lib;opencl.lib;comsuppw.lib;%(AdditionalDependencies) d3d9.dll;d3dx9_43.dll;d3d11.dll;d3dx11_43.dll;dxgi.dll;opengl32.dll;%(DelayLoadDLLs) true Windows false - $(VTUNE_AMPLIFIER_XE_2013_DIR)lib32;%(AdditionalLibraryDirectories) + $(VTUNE_AMPLIFIER_XE_2015_DIR)lib32;%(AdditionalLibraryDirectories) .\postBuild.cmd "$(TargetPath)" "$(TargetName)" $(TargetExt) $(PcsxSubsection) diff --git a/plugins/GSdx/vsprops/x64.props b/plugins/GSdx/vsprops/x64.props index 947675c717..ab8253d819 100644 --- a/plugins/GSdx/vsprops/x64.props +++ b/plugins/GSdx/vsprops/x64.props @@ -5,7 +5,7 @@ - $(DXSDK_DIR)Lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories) + $(DXSDK_DIR)Lib\x64;$(INTELOCLSDKROOT)lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories) _WIN64;%(PreprocessorDefinitions) diff --git a/plugins/GSdx/vsprops/x86.props b/plugins/GSdx/vsprops/x86.props index ff76b9535d..56171d784c 100644 --- a/plugins/GSdx/vsprops/x86.props +++ b/plugins/GSdx/vsprops/x86.props @@ -5,7 +5,7 @@ - $(DXSDK_DIR)Lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories) + $(DXSDK_DIR)Lib\x86;$(INTELOCLSDKROOT)lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories)