From db7c26cde7134af84f4cd72d919c1397d91800bd Mon Sep 17 00:00:00 2001 From: gabest11 Date: Mon, 15 Sep 2014 15:49:16 +0200 Subject: [PATCH 01/15] - Experimental OpenCL renderer (missing features: point, line, texture cache, mipmap, aa1, device selection). Needs any OpenCL SDK for the common headers and stub lib to compile, tested with AMD and Intel. Too bad it is not part of the Windows SDK yet. - Renumbered renderer ids, compatible with old numbering, but it does not follow the mod3 logic anymore. --- plugins/GSdx/GPURenderer.h | 4 +- plugins/GSdx/GS.cpp | 162 +- plugins/GSdx/GSDevice9.h | 1 - plugins/GSdx/GSLocalMemory.cpp | 134 +- plugins/GSdx/GSLocalMemory.h | 2 +- plugins/GSdx/GSRendererCL.cpp | 1780 ++++++++++++++++++++++ plugins/GSdx/GSRendererCL.h | 310 ++++ plugins/GSdx/GSRendererSW.cpp | 13 +- plugins/GSdx/GSSettingsDlg.cpp | 13 +- plugins/GSdx/GSState.cpp | 9 +- plugins/GSdx/GSdx.cpp | 31 +- plugins/GSdx/GSdx.h | 3 + plugins/GSdx/GSdx.rc | 17 +- plugins/GSdx/GSdx_vs2013.vcxproj | 3 + plugins/GSdx/GSdx_vs2013.vcxproj.filters | 11 +- plugins/GSdx/res/cs.fx | 4 + plugins/GSdx/res/tfx.cl | 1619 ++++++++++++++++++++ plugins/GSdx/stdafx.h | 1 + plugins/GSdx/vsprops/common.props | 8 +- plugins/GSdx/vsprops/x64.props | 2 +- plugins/GSdx/vsprops/x86.props | 2 +- 21 files changed, 3947 insertions(+), 182 deletions(-) create mode 100644 plugins/GSdx/GSRendererCL.cpp create mode 100644 plugins/GSdx/GSRendererCL.h create mode 100644 plugins/GSdx/res/tfx.cl diff --git a/plugins/GSdx/GPURenderer.h b/plugins/GSdx/GPURenderer.h index 1a7b505ced..16568a0da2 100644 --- a/plugins/GSdx/GPURenderer.h +++ b/plugins/GSdx/GPURenderer.h @@ -123,13 +123,13 @@ protected: int maxcount = std::max(m_maxcount * 3 / 2, 10000); Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 32); - if (!vertices) + if(vertices == NULL) { printf("GSdx: failed to allocate %d bytes for verticles.\n", sizeof(Vertex) * maxcount); throw GSDXError(); } - if (m_vertices != NULL) + 
if(m_vertices != NULL) { memcpy(vertices, m_vertices, sizeof(Vertex) * m_maxcount); _aligned_free(m_vertices); diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp index 2f23f0903f..5d91813659 100644 --- a/plugins/GSdx/GS.cpp +++ b/plugins/GSdx/GS.cpp @@ -37,6 +37,7 @@ #include "GSWndDX.h" #include "GSWndWGL.h" #include "GSRendererCS.h" +#include "GSRendererCL.h" #include "GSSettingsDlg.h" static HRESULT s_hr = E_FAIL; @@ -203,6 +204,7 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1) } GSWnd* wnd[2]; + try { if(s_renderer != renderer) @@ -216,78 +218,72 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1) s_gs = NULL; } - if(renderer == 15) + switch(renderer) { - #ifdef _WINDOWS - - dev = new GSDevice11(); - - if(dev == NULL) - { - return -1; - } - - delete s_gs; - - s_gs = new GSRendererCS(); - - s_renderer = renderer; - - #endif + default: +#ifdef _WINDOWS + case 0: case 1: case 2: case 14: + dev = new GSDevice9(); + break; + case 3: case 4: case 5: case 15: + dev = new GSDevice11(); + break; +#endif + case 9: case 10: case 11: case 16: + dev = new GSDeviceNull(); + break; + case 12: case 13: case 17: + dev = new GSDeviceOGL(); + break; } - else + + if(dev == NULL) { - switch(renderer / 3) + return -1; + } + + if(s_gs == NULL) + { + switch(renderer) { default: - #ifdef _WINDOWS - case 0: dev = new GSDevice9(); break; - case 1: dev = new GSDevice11(); break; - #endif - case 3: dev = new GSDeviceNull(); break; - case 4: dev = new GSDeviceOGL(); break; - } - - if(dev == NULL) - { - return -1; - } - - if(s_gs == NULL) - { - switch(renderer % 3) - { - default: - case 0: - switch(renderer) - { - default: #ifdef _WINDOWS - case 0: s_gs = (GSRenderer*)new GSRendererDX9(); break; - case 3: s_gs = (GSRenderer*)new GSRendererDX11(); break; + case 0: + s_gs = (GSRenderer*)new GSRendererDX9(); + break; + case 3: + s_gs = (GSRenderer*)new GSRendererDX11(); + break; #endif - case 12: s_gs = (GSRenderer*)new 
GSRendererOGL(); break; - } - break; - case 1: - s_gs = new GSRendererSW(threads); - break; - case 2: - s_gs = new GSRendererNull(); - break; - } - - s_renderer = renderer; + case 12: + s_gs = (GSRenderer*)new GSRendererOGL(); + break; + case 1: case 4: case 10: case 13: + s_gs = new GSRendererSW(threads); + break; + case 2: case 5: case 11: + s_gs = new GSRendererNull(); + break; + case 14: case 15: case 16: case 17: + s_gs = new GSRendererCL(); + break; } + + s_renderer = renderer; } if (s_gs->m_wnd == NULL) { #ifdef _WINDOWS - if (renderer / 3 == 4) + switch(renderer) + { + case 12: case 13: case 17: s_gs->m_wnd = new GSWndWGL(); - else + break; + default: s_gs->m_wnd = new GSWndDX(); + break; + } #else #ifdef ENABLE_GLES wnd[0] = NULL; @@ -681,8 +677,10 @@ EXPORT_C GSkeyEvent(GSKeyEventData* e) { try { - if (gsopen_done) + if(gsopen_done) + { s_gs->KeyEvent(e); + } } catch (GSDXRecoverableError) { @@ -1218,15 +1216,11 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow { ::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS); - FILE* file = fopen("c:\\temp1\\log.txt", "a"); - - fprintf(file, "-------------------------\n\n"); + Console console("GSdx", true); if(1) { - GSLocalMemory * pMem = new GSLocalMemory(); - GSLocalMemory& mem(*pMem); - + GSLocalMemory* mem = new GSLocalMemory(); static struct {int psm; const char* name;} s_format[] = { @@ -1258,7 +1252,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow int w = 1 << tbw; int h = 1 << tbw; - fprintf(file, "%d x %d\n\n", w, h); + printf("%d x %d\n\n", w, h); for(size_t i = 0; i < countof(s_format); i++) { @@ -1308,7 +1302,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow clock_t start, end; - _ftprintf(file, _T("[%4s] "), s_format[i].name); + printf("[%4s] ", s_format[i].name); start = clock(); @@ -1317,12 +1311,12 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow 
int x = 0; int y = 0; - (mem.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); + (mem->*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); } end = clock(); - fprintf(file, "%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); + printf("%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); start = clock(); @@ -1331,25 +1325,25 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow int x = 0; int y = 0; - (mem.*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); + (mem->*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); } end = clock(); - fprintf(file, "%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); + printf("%6d %6d | ", (int)((float)trlen * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); - const GSOffset* o = mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM); + const GSOffset* o = mem->GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM); start = clock(); for(int j = 0; j < n; j++) { - (mem.*rtx)(o, r, ptr, w * 4, TEXA); + (mem->*rtx)(o, r, ptr, w * 4, TEXA); } end = clock(); - fprintf(file, "%6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); + printf("%6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); if(psm.pal > 0) { @@ -1357,32 +1351,30 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow for(int j = 0; j < n; j++) { - (mem.*rtxP)(o, r, ptr, w, TEXA); + (mem->*rtxP)(o, r, ptr, w, TEXA); } end = clock(); - fprintf(file, "| %6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); + printf("| %6d %6d ", (int)((float)len * n / (end - start) / 1000), (int)((float)(w * h) * n / (end - start) / 1000)); } - fprintf(file, "\n"); - - fflush(file); + 
printf("\n"); } - fprintf(file, "\n"); + printf("\n"); } _aligned_free(ptr); - delete pMem; + + delete mem; } // if(0) { - GSLocalMemory * pMem2 = new GSLocalMemory(); - GSLocalMemory& mem2(*pMem2); + GSLocalMemory* mem = new GSLocalMemory(); uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32); @@ -1413,13 +1405,13 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow int x = 0; int y = 0; - (mem2.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); - delete pMem2; + (mem->*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG); + + delete mem; } // - fclose(file); PostQuitMessage(0); } diff --git a/plugins/GSdx/GSDevice9.h b/plugins/GSdx/GSDevice9.h index fd32304c70..e7cb160135 100644 --- a/plugins/GSdx/GSDevice9.h +++ b/plugins/GSdx/GSDevice9.h @@ -173,7 +173,6 @@ public: // TODO // Shaders... hash_map m_vs; - D3DXHANDLE m_vs_params; hash_map > m_ps; hash_map m_ps_ss; hash_map m_om_dss; diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index 10cb2433f5..757463d134 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -692,14 +692,14 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr { switch(psm) { - case PSM_PSMCT32: WriteColumn32(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break; - case PSM_PSMCT16: WriteColumn16(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMCT16S: WriteColumn16(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMT8: WriteColumn8(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break; - case PSM_PSMT4: WriteColumn4(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break; - case PSM_PSMZ32: WriteColumn32(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break; - case PSM_PSMZ16: WriteColumn16(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMZ16S: WriteColumn16(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break; + case 
PSM_PSMCT32: GSBlock::WriteColumn32(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break; + case PSM_PSMCT16: GSBlock::WriteColumn16(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMCT16S: GSBlock::WriteColumn16(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMT8: GSBlock::WriteColumn8(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break; + case PSM_PSMT4: GSBlock::WriteColumn4(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break; + case PSM_PSMZ32: GSBlock::WriteColumn32(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break; + case PSM_PSMZ16: GSBlock::WriteColumn16(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMZ16S: GSBlock::WriteColumn16(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break; // TODO default: __assume(0); } @@ -719,14 +719,14 @@ void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src { switch(psm) { - case PSM_PSMCT32: WriteBlock32(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break; - case PSM_PSMCT16: WriteBlock16(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMCT16S: WriteBlock16(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMT8: WriteBlock8(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break; - case PSM_PSMT4: WriteBlock4(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break; - case PSM_PSMZ32: WriteBlock32(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break; - case PSM_PSMZ16: WriteBlock16(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break; - case PSM_PSMZ16S: WriteBlock16(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMCT32: GSBlock::WriteBlock32(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break; + case PSM_PSMCT16: GSBlock::WriteBlock16(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMCT16S: GSBlock::WriteBlock16(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMT8: 
GSBlock::WriteBlock8(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break; + case PSM_PSMT4: GSBlock::WriteBlock4(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break; + case PSM_PSMZ32: GSBlock::WriteBlock32(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break; + case PSM_PSMZ16: GSBlock::WriteBlock16(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break; + case PSM_PSMZ16S: GSBlock::WriteBlock16(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break; // TODO default: __assume(0); } @@ -801,27 +801,27 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* { case PSM_PSMCT32: case PSM_PSMZ32: - ReadColumn32(y, dst, buff, 32); + GSBlock::ReadColumn32(y, dst, buff, 32); memcpy(&buff[32], &src[x * 4], 32); - WriteColumn32<32, 0xffffffff>(y, dst, buff, 32); + GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32); break; case PSM_PSMCT16: case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: - ReadColumn16(y, dst, buff, 32); + GSBlock::ReadColumn16(y, dst, buff, 32); memcpy(&buff[32], &src[x * 2], 32); - WriteColumn16<32>(y, dst, buff, 32); + GSBlock::WriteColumn16<32>(y, dst, buff, 32); break; case PSM_PSMT8: - ReadColumn8(y, dst, buff, 16); + GSBlock::ReadColumn8(y, dst, buff, 16); for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + x], 16); - WriteColumn8<32>(y, dst, buff, 16); + GSBlock::WriteColumn8<32>(y, dst, buff, 16); break; case PSM_PSMT4: - ReadColumn4(y, dst, buff, 16); + GSBlock::ReadColumn4(y, dst, buff, 16); for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + (x >> 1)], 16); - WriteColumn4<32>(y, dst, buff, 16); + GSBlock::WriteColumn4<32>(y, dst, buff, 16); break; // TODO default: @@ -888,27 +888,27 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* { case PSM_PSMCT32: case PSM_PSMZ32: - ReadColumn32(y, dst, buff, 32); + GSBlock::ReadColumn32(y, dst, buff, 32); memcpy(&buff[0], &src[x * 4], 32); - WriteColumn32<32, 
0xffffffff>(y, dst, buff, 32); + GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32); break; case PSM_PSMCT16: case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: - ReadColumn16(y, dst, buff, 32); + GSBlock::ReadColumn16(y, dst, buff, 32); memcpy(&buff[0], &src[x * 2], 32); - WriteColumn16<32>(y, dst, buff, 32); + GSBlock::WriteColumn16<32>(y, dst, buff, 32); break; case PSM_PSMT8: - ReadColumn8(y, dst, buff, 16); + GSBlock::ReadColumn8(y, dst, buff, 16); for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + x], 16); - WriteColumn8<32>(y, dst, buff, 16); + GSBlock::WriteColumn8<32>(y, dst, buff, 16); break; case PSM_PSMT4: - ReadColumn4(y, dst, buff, 16); + GSBlock::ReadColumn4(y, dst, buff, 16); for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + (x >> 1)], 16); - WriteColumn4<32>(y, dst, buff, 16); + GSBlock::WriteColumn4<32>(y, dst, buff, 16); break; // TODO default: @@ -1060,7 +1060,7 @@ void GSLocalMemory::WriteImage24(int& tx, int& ty, const uint8* src, int len, GI { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32(x, y, bp, bw)); } } @@ -1094,7 +1094,7 @@ void GSLocalMemory::WriteImage8H(int& tx, int& ty, const uint8* src, int len, GI { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock8H(src + (x - tx), srcpitch, BlockPtr32(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock8H(src + (x - tx), srcpitch, BlockPtr32(x, y, bp, bw)); } } @@ -1128,7 +1128,7 @@ void GSLocalMemory::WriteImage4HL(int& tx, int& ty, const uint8* src, int len, G { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw)); } } @@ -1162,7 +1162,7 @@ void GSLocalMemory::WriteImage4HH(int& tx, int& ty, const uint8* src, int len, G { for(int x = tx; x < tw; x += 8) { - 
UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw)); } } @@ -1196,7 +1196,7 @@ void GSLocalMemory::WriteImage24Z(int& tx, int& ty, const uint8* src, int len, G { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32Z(x, y, bp, bw)); + GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, BlockPtr32Z(x, y, bp, bw)); } } @@ -1612,7 +1612,7 @@ void GSLocalMemory::ReadTexture32(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadBlock32(src, dst, dstpitch); + GSBlock::ReadBlock32(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1623,7 +1623,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock24(src, dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock24(src, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -1631,7 +1631,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock24(src, dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock24(src, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -1643,7 +1643,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 16, 8, 32) { - ReadAndExpandBlock16(src, dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock16(src, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -1651,7 +1651,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 16, 8, 32) { - ReadAndExpandBlock16(src, dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock16(src, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -1663,7 +1663,7 @@ void GSLocalMemory::ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r FOREACH_BLOCK_START(r, 16, 16, 32) { - ReadAndExpandBlock8_32(src, dst, dstpitch, 
pal); + GSBlock::ReadAndExpandBlock8_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1674,7 +1674,7 @@ void GSLocalMemory::ReadTexture4(const GSOffset* RESTRICT o, const GSVector4i& r FOREACH_BLOCK_START(r, 32, 16, 32) { - ReadAndExpandBlock4_32(src, dst, dstpitch, pal); + GSBlock::ReadAndExpandBlock4_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1685,7 +1685,7 @@ void GSLocalMemory::ReadTexture8H(const GSOffset* RESTRICT o, const GSVector4i& FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock8H_32(src, dst, dstpitch, pal); + GSBlock::ReadAndExpandBlock8H_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1696,7 +1696,7 @@ void GSLocalMemory::ReadTexture4HL(const GSOffset* RESTRICT o, const GSVector4i& FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock4HL_32(src, dst, dstpitch, pal); + GSBlock::ReadAndExpandBlock4HL_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1707,7 +1707,7 @@ void GSLocalMemory::ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i& FOREACH_BLOCK_START(r, 8, 8, 32) { - ReadAndExpandBlock4HH_32(src, dst, dstpitch, pal); + GSBlock::ReadAndExpandBlock4HH_32(src, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -1718,7 +1718,7 @@ void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, cons { ALIGN_STACK(32); - ReadBlock32(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock32(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const @@ -1727,11 +1727,11 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons if(TEXA.AEM) { - ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); } else { - ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); } } @@ -1741,11 +1741,11 @@ void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, 
cons if(TEXA.AEM) { - ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); } else { - ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); + GSBlock::ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); } } @@ -1753,35 +1753,35 @@ void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const { ALIGN_STACK(32); - ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut); } void GSLocalMemory::ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut); } void GSLocalMemory::ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut); } void GSLocalMemory::ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut); } void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut); + GSBlock::ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut); } /////////////////// @@ -1870,7 +1870,7 @@ void GSLocalMemory::ReadTexture8P(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 16, 16, 8) { - ReadBlock8(src, dst, dstpitch); + GSBlock::ReadBlock8(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1879,7 +1879,7 @@ void GSLocalMemory::ReadTexture4P(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 32, 
16, 8) { - ReadBlock4P(src, dst, dstpitch); + GSBlock::ReadBlock4P(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1888,7 +1888,7 @@ void GSLocalMemory::ReadTexture8HP(const GSOffset* RESTRICT o, const GSVector4i& { FOREACH_BLOCK_START(r, 8, 8, 8) { - ReadBlock8HP(src, dst, dstpitch); + GSBlock::ReadBlock8HP(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1897,7 +1897,7 @@ void GSLocalMemory::ReadTexture4HLP(const GSOffset* RESTRICT o, const GSVector4i { FOREACH_BLOCK_START(r, 8, 8, 8) { - ReadBlock4HLP(src, dst, dstpitch); + GSBlock::ReadBlock4HLP(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1906,7 +1906,7 @@ void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT o, const GSVector4i { FOREACH_BLOCK_START(r, 8, 8, 8) { - ReadBlock4HHP(src, dst, dstpitch); + GSBlock::ReadBlock4HHP(src, dst, dstpitch); } FOREACH_BLOCK_END } @@ -1915,35 +1915,35 @@ void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT o, const GSVector4i void GSLocalMemory::ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { - ReadBlock8(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock8(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadBlock4P(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock4P(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadBlock8HP(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock8HP(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadBlock4HLP(BlockPtr(bp), dst, dstpitch); + GSBlock::ReadBlock4HLP(BlockPtr(bp), dst, dstpitch); } void GSLocalMemory::ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); - ReadBlock4HHP(BlockPtr(bp), dst, 
dstpitch); + GSBlock::ReadBlock4HHP(BlockPtr(bp), dst, dstpitch); } // diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h index 67f2cf0c27..ea83bfd53b 100644 --- a/plugins/GSdx/GSLocalMemory.h +++ b/plugins/GSdx/GSLocalMemory.h @@ -76,7 +76,7 @@ struct GSPixelOffset4 uint32 fbp, zbp, fpsm, zpsm, bw; }; -class GSLocalMemory : public GSBlock +class GSLocalMemory : public GSAlignedClass<32> { public: typedef uint32 (*pixelAddress)(int x, int y, uint32 bp, uint32 bw); diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp new file mode 100644 index 0000000000..357a2a3e23 --- /dev/null +++ b/plugins/GSdx/GSRendererCL.cpp @@ -0,0 +1,1780 @@ +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSRendererCL.h" + +#define LOG 0 + +static FILE* s_fp = LOG ? 
fopen("c:\\temp1\\_.txt", "w") : NULL; + +#define MAX_FRAME_SIZE 2048 +#define MAX_PRIM_COUNT 4096u +#define MAX_PRIM_PER_BATCH_BITS 5 +#define MAX_PRIM_PER_BATCH (1u << MAX_PRIM_PER_BATCH_BITS) +#define BATCH_COUNT(prim_count) (((prim_count) + (MAX_PRIM_PER_BATCH - 1)) / MAX_PRIM_PER_BATCH) +#define MAX_BATCH_COUNT BATCH_COUNT(MAX_PRIM_COUNT) +#define BIN_SIZE_BITS 4 +#define BIN_SIZE (1u << BIN_SIZE_BITS) +#define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE)) +#define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT) + +#if MAX_PRIM_PER_BATCH == 64u +#define BIN_TYPE cl_ulong +#elif MAX_PRIM_PER_BATCH == 32u +#define BIN_TYPE cl_uint +#else +#error "MAX_PRIM_PER_BATCH != 32u OR 64u" +#endif + +#pragma pack(push, 1) + +typedef struct +{ + GSVertexCL v[4]; +} gs_prim; + +typedef struct +{ + cl_float4 dx, dy; + cl_float4 zero; + cl_float4 reject_corner; +} gs_barycentric; + +typedef struct +{ + cl_uint batch_counter; + cl_uint _pad[7]; + struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH]; + BIN_TYPE bin[MAX_BIN_COUNT]; + cl_uchar4 bbox[MAX_PRIM_COUNT]; + gs_prim prim[MAX_PRIM_COUNT]; + gs_barycentric barycentric[MAX_PRIM_COUNT]; +} gs_env; + +#pragma pack(pop) + +GSRendererCL::GSRendererCL() + : m_vb_count(0) +{ + m_nativeres = true; // ignore ini, sw is always native + + //s_dump = 1; + //s_save = 1; + //s_savez = 1; + + // TODO: m_tc = new GSTextureCacheCL(this); + + memset(m_texture, 0, sizeof(m_texture)); + + m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); + + memset(m_rw_pages, 0, sizeof(m_rw_pages)); + memset(m_tex_pages, 0, sizeof(m_tex_pages)); + + #define InitCVB(P) \ + m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer; \ + m_cvb[P][0][1] = &GSRendererCL::ConvertVertexBuffer; \ + m_cvb[P][1][0] = &GSRendererCL::ConvertVertexBuffer; \ + m_cvb[P][1][1] = &GSRendererCL::ConvertVertexBuffer; \ + + InitCVB(GS_POINT_CLASS); + InitCVB(GS_LINE_CLASS); + InitCVB(GS_TRIANGLE_CLASS); + 
InitCVB(GS_SPRITE_CLASS); + + m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL); + m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize); +} + +GSRendererCL::~GSRendererCL() +{ + // TODO: delete m_tc; + + for(size_t i = 0; i < countof(m_texture); i++) + { + delete m_texture[i]; + } + + _aligned_free(m_output); +} + +void GSRendererCL::Reset() +{ + Sync(-1); + + // TODO: m_tc->RemoveAll(); + + GSRenderer::Reset(); +} + +void GSRendererCL::VSync(int field) +{ + Sync(0); // IncAge might delete a cached texture in use + + GSRenderer::VSync(field); + + // TODO: m_tc->IncAge(); + + //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); +} + +void GSRendererCL::ResetDevice() +{ + for(size_t i = 0; i < countof(m_texture); i++) + { + delete m_texture[i]; + + m_texture[i] = NULL; + } +} + +GSTexture* GSRendererCL::GetOutput(int i) +{ + Sync(1); + + const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB; + + int w = DISPFB.FBW * 64; + int h = GetFrameRect(i).bottom; + + // TODO: round up bottom + + if(m_dev->ResizeTexture(&m_texture[i], w, h)) + { + static int pitch = 1024 * 4; + + GSVector4i r(0, 0, w, h); + + const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM]; + + (m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign(psm.bs), m_output, pitch, m_env.TEXA); + + m_texture[i]->Update(r, m_output, pitch); + + if(s_dump) + { + if(s_save && s_n >= s_saven) + { + m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM)); + } + + s_n++; + } + } + + return m_texture[i]; +} + +const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 1.0f); + +template +void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count) +{ + GSVector4i o = (GSVector4i)m_context->XYOFFSET; + GSVector4 st_scale = GSVector4(16 << m_context->TEX0.TW, 
16 << m_context->TEX0.TH, 1, 0); + + for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++) + { + GSVector4 stcq = GSVector4::load(&src->m[0]); // s t rgba q + + #if _M_SSE >= 0x401 + + GSVector4i xyzuvf(src->m[1]); + + GSVector4i xy = xyzuvf.upl16() - o; + GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00()); + + #else + + uint32 z = src->XYZ.Z; + + GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - o; + GSVector4i zf = GSVector4i((int)std::min(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later + + #endif + + dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale; + + GSVector4 t = GSVector4::zero(); + + if(tme) + { + if(fst) + { + #if _M_SSE >= 0x401 + + t = GSVector4(xyzuvf.uph16()); + + #else + + t = GSVector4(GSVector4i::load(src->UV).upl16()); + + #endif + } + else + { + t = stcq.xyww() * st_scale; + } + } + + dst->t = t.insert32<2, 3>(stcq); + } +} + +void GSRendererCL::Draw() +{ + const GSDrawingContext* context = m_context; + + GSVector4i scissor = GSVector4i(context->scissor.in); + GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); + + // points and lines may have zero area bbox (example: single line 0,0->256,0) + + if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS) + { + if(bbox.x == bbox.z) bbox.z++; + if(bbox.y == bbox.w) bbox.w++; + } + + scissor.z = std::min(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour + + GSVector4i rect = bbox.rintersect(scissor); + + if(rect.rempty()) + { + return; + } + + if(s_dump) + { + Sync(2); + + uint64 frame = m_perfmon.GetFrame(); + + std::string s; + + if(s_save && s_n >= s_saven && PRIM->TME) + { + s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM); + + m_mem.SaveBMP(s, 
m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH); + } + + s_n++; + + if(s_save && s_n >= s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM); + + m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + } + + if(s_savez && s_n >= s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM); + + m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + } + + s_n++; + } + + try + { + size_t vb_size = m_vertex.next * sizeof(GSVertexCL); + size_t ib_size = m_index.tail * sizeof(uint32); + size_t pb_size = sizeof(TFXParameter); + + if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size) + { + if(vb_size > m_cl.vb.size || ib_size > m_cl.ib.size) + { + // buffer too small for even one batch, allow twice the size (at least 1 MB) + + Sync(2); // must sync, reallocating the input buffers + + m_cl.Unmap(); + + m_cl.vb.size = 0; + m_cl.ib.size = 0; + + size_t size = std::max(vb_size * 2, 2u << 20); + + printf("growing vertex/index buffer %d\n", size); + + m_cl.vb.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); + m_cl.vb.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); + m_cl.vb.size = size; + + size = std::max(size / sizeof(GSVertex) * 3 * sizeof(uint32), 1u << 20); // worst case, three times the vertex count + + ASSERT(size >= ib_size); + + if(size < ib_size) size = ib_size; // should not happen + + m_cl.ib.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); + m_cl.ib.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); + m_cl.ib.size = size; + } + else + { 
+ Enqueue(); + + m_cl.Unmap(); + + // make the write queue wait until the rendering queue is ready, it may still use the device buffers + + std::vector el(1); + + m_cl.queue[2].enqueueMarkerWithWaitList(NULL, &el[0]); + m_cl.wq->enqueueBarrierWithWaitList(&el, NULL); + + // switch to the other queue/buffer (double buffering) + + m_cl.wqidx = (m_cl.wqidx + 1) & 1; + m_cl.wq = &m_cl.queue[m_cl.wqidx]; + } + + m_cl.vb.head = m_cl.vb.tail = 0; + m_cl.ib.head = m_cl.ib.tail = 0; + m_cl.pb.head = m_cl.pb.tail = 0; + + m_cl.Map(); + } + else + { + // only allow batches of the same primclass in Enqueue + + if(!m_jobs.empty() && m_jobs.front().sel.prim != (uint32)m_vt.m_primclass) + { + Enqueue(); + } + } + + // + + GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail); + uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail); + TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail); + + pb->scissor = scissor; + pb->bbox = bbox; + pb->rect = rect; + + (this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(vb, m_vertex.buff, m_vertex.next); // TODO: upload in GSVertex format and extract the fields in the kernel? 
+ + if(m_jobs.empty()) + { + memcpy(ib, m_index.buff, m_index.tail * sizeof(uint32)); + + m_vb_start = m_cl.vb.tail; + } + else + { + // TODO: SIMD + + uint32 vb_count = m_vb_count; + + for(size_t i = 0; i < m_index.tail; i++) + { + ib[i] = m_index.buff[i] + vb_count; + } + } + + m_vb_count += m_vertex.next; + + if(!SetupParameter(pb, vb, m_vertex.next, m_index.buff, m_index.tail)) + { + return; + } + + TFXJob job; + + job.rect.x = rect.x; + job.rect.y = rect.y; + job.rect.z = rect.z; + job.rect.w = rect.w; + job.sel = pb->sel; + job.ib_start = m_cl.ib.tail; + job.ib_count = m_index.tail; + job.pb_start = m_cl.pb.tail; + + m_jobs.push_back(job); + + m_cl.vb.tail += vb_size; + m_cl.ib.tail += ib_size; + m_cl.pb.tail += pb_size; + + // mark pages for writing + + if(pb->sel.fb) + { + uint8 flag = pb->sel.fb; + + const uint32* pages = m_context->offset.fb->GetPages(rect, m_tmp_pages); + + for(const uint32* p = pages; *p != GSOffset::EOP; p++) + { + m_rw_pages[*p] |= flag; + } + } + + if(pb->sel.zb) + { + uint8 flag = pb->sel.zb; + + const uint32* pages = m_context->offset.zb->GetPages(rect, m_tmp_pages); + + for(const uint32* p = pages; *p != GSOffset::EOP; p++) + { + m_rw_pages[*p] |= flag; + } + } + + // don't buffer too much data, feed them to the device if there is enough + + if(m_cl.vb.tail - m_cl.vb.head >= 256 * 4096 || m_jobs.size() >= 64) + { + Enqueue(); + } + + /* + // check if the texture is not part of a target currently in use + + if(CheckSourcePages(data)) + { + Sync(4); + } + + // addref source and target pages + + data->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm); + */ + + // update previously invalidated parts + + //data->UpdateSource(); + /* + if(LOG) + { + fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n", + sd->counter, + m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite, + m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite, + PRIM->TME ? 
m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA, + PRIM->PRIM, sd->vertex_count, sd->index_count); + + fflush(s_fp); + } + */ + + //printf("q %p %d (%d %d %d %d)\n", pb, pb->ib_count, r.x, r.y, r.z, r.w); + + /* + // invalidate new parts rendered onto + + if(sd->global.sel.fwrite) + { + m_tc->InvalidatePages(sd->m_fb_pages, sd->m_fpsm); + } + + if(sd->global.sel.zwrite) + { + m_tc->InvalidatePages(sd->m_zb_pages, sd->m_zpsm); + } + */ + } + catch(cl::Error err) + { + printf("%s (%d)\n", err.what(), err.err()); + + return; + } + catch(std::exception err) + { + printf("%s\n", err.what()); + + return; + } + + if(s_dump) + { + Sync(2); + + uint64 frame = m_perfmon.GetFrame(); + + std::string s; + + if(s_save && s_n >= s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM); + + m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + } + + if(s_savez && s_n >= s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM); + + m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + } + + s_n++; + } +} + +void GSRendererCL::Sync(int reason) +{ + //printf("sync %d\n", reason); + + GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync); + + Enqueue(); + + m_cl.queue[2].finish(); + + memset(m_rw_pages, 0, sizeof(m_rw_pages)); + memset(m_tex_pages, 0, sizeof(m_tex_pages)); + + // TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, + // though it does not seem to be necessary even with GPU devices where it might be cached, + // needs more testing... 
+ + //void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize); + //m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr); +} + +void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) +{ + if(LOG) {fprintf(s_fp, "w %05x %d %d, %d %d %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} + + GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); + + o->GetPages(r, m_tmp_pages); + + //if(!synced) + { + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) + { + if(m_rw_pages[*p] & 3) // rw + { + Sync(3); + + break; + } + } + } + + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) + { + m_tex_pages[*p] = 1; + } +} + +void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) +{ + if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} + + //if(!synced) + { + GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); + + o->GetPages(r, m_tmp_pages); + + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) + { + if(m_rw_pages[*p] & 1) // w + { + Sync(4); + + break; + } + } + } +} +/* +bool GSRendererCL::CheckSourcePages(RasterizerData* data) +{ + // TODO: if(!m_rl->IsSynced()) // TODO: all callbacks from the issued drawings reported in => in-sync + { + for(size_t i = 0; data->m_tex[i].t != NULL; i++) + { + data->m_tex[i].t->m_offset->GetPages(data->m_tex[i].r, m_tmp_pages); + + uint32* pages = m_tmp_pages; // data->m_tex[i].t->m_pages.n; + + for(const uint32* p = pages; *p != GSOffset::EOP; p++) + { + // TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2) + + if(m_fzb_pages[*p]) // currently being drawn to? 
=> sync + { + return true; + } + } + } + } + + return false; +} +*/ + +//#include "GSTextureCL.h" + +void GSRendererCL::Enqueue() +{ + if(m_jobs.empty()) return; + + try + { + ASSERT(m_cl.vb.tail > m_cl.vb.head); + ASSERT(m_cl.ib.tail > m_cl.ib.head); + ASSERT(m_cl.pb.tail > m_cl.pb.head); + + int primclass = m_jobs.front().sel.prim; + + uint32 n; + + switch(primclass) + { + case GS_POINT_CLASS: n = 1; break; + case GS_LINE_CLASS: n = 2; break; + case GS_TRIANGLE_CLASS: n = 3; break; + case GS_SPRITE_CLASS: n = 2; break; + default: __assume(0); + } + + PrimSelector psel; + + psel.key = 0; + psel.prim = primclass; + + cl::Kernel& pk = m_cl.GetPrimKernel(psel); + + pk.setArg(1, m_cl.vb.buff[m_cl.wqidx]); + pk.setArg(2, m_cl.ib.buff[m_cl.wqidx]); + + TileSelector tsel; + + tsel.key = 0; + tsel.prim = primclass; + + tsel.mode = 0; + + cl::Kernel& tk_32 = m_cl.GetTileKernel(tsel); + + tsel.mode = 1; + + cl::Kernel& tk_16 = m_cl.GetTileKernel(tsel); + + tsel.mode = 2; + + cl::Kernel& tk_8 = m_cl.GetTileKernel(tsel); + + tsel.mode = 3; + + cl::Kernel& tk = m_cl.GetTileKernel(tsel); + + tsel.key = 0; + tsel.clear = 1; + + cl::Kernel& tk_clear = m_cl.GetTileKernel(tsel); + + // + + m_cl.Unmap(); + + std::vector el2(1); + + m_cl.wq->enqueueMarkerWithWaitList(NULL, &el2[0]); + m_cl.queue[2].enqueueBarrierWithWaitList(&el2, NULL); + + // + + cl_kernel tfx_prev = NULL; + + auto head = m_jobs.begin(); + + while(head != m_jobs.end()) + { + uint32 total_prim_count = 0; + + auto next = head; + + while(next != m_jobs.end()) + { + auto job = next++; + + uint32 cur_prim_count = job->ib_count / n; + uint32 next_prim_count = next != m_jobs.end() ? 
next->ib_count / n : 0; + + total_prim_count += cur_prim_count; + + if(total_prim_count >= MAX_PRIM_COUNT || next == m_jobs.end())// || next_prim_count >= MAX_PRIM_COUNT || next_prim_count < 16 && total_prim_count >= MAX_PRIM_COUNT / 2) + { + uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT); + + pk.setArg(3, (cl_uint)m_vb_start); + pk.setArg(4, (cl_uint)head->ib_start); + + m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange); + + if(0) + { + gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env)); + m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr); + } + + GSVector4i rect = GSVector4i::zero(); + + for(auto i = head; i != next; i++) + { + rect = rect.runion(GSVector4i::load(&i->rect)); + } + + rect = rect.ralign(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS; + + int bin_w = rect.width(); + int bin_h = rect.height(); + + uint32 batch_count = BATCH_COUNT(prim_count); + uint32 bin_count = bin_w * bin_h; + + cl_uchar4 bin_dim; + + bin_dim.s[0] = (cl_uchar)rect.x; + bin_dim.s[1] = (cl_uchar)rect.y; + bin_dim.s[2] = (cl_uchar)bin_w; + bin_dim.s[3] = (cl_uchar)bin_h; + + if(1)//bin_w > 1 || bin_h > 1) // && not just one sprite covering the whole area + { + m_cl.queue[2].enqueueNDRangeKernel(tk_clear, cl::NullRange, cl::NDRange(bin_count), cl::NullRange); + + if(bin_count <= 32 && m_cl.WIs >= 256) + { + uint32 item_count; + uint32 group_count; + cl::Kernel* k; + + if(bin_count <= 8) + { + item_count = std::min(prim_count, 32u); + group_count = ((prim_count + 31) >> 5) * item_count; + k = &tk_32; + } + else if(bin_count <= 16) + { + item_count = std::min(prim_count, 16u); + group_count = ((prim_count + 15) >> 4) * item_count; + k = &tk_16; + } + else + { + item_count = std::min(prim_count, 8u); + group_count = ((prim_count + 7) >> 3) * item_count; + k = &tk_8; + } + + k->setArg(1, (cl_uint)prim_count); + k->setArg(2, (cl_uint)bin_count); + k->setArg(3, bin_dim); + + 
m_cl.queue[2].enqueueNDRangeKernel(*k, cl::NullRange, cl::NDRange(bin_w, bin_h, group_count), cl::NDRange(bin_w, bin_h, item_count)); + } + else + { + uint32 item_count = std::min(bin_count, m_cl.WIs); + uint32 group_count = batch_count * item_count; + + tk.setArg(1, (cl_uint)prim_count); + tk.setArg(2, (cl_uint)batch_count); + tk.setArg(3, (cl_uint)bin_count); + tk.setArg(4, bin_dim); + + m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count)); + } + + if(0) + { + gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env)); + m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr); + } + } + + // + + uint32 prim_start = 0; + + for(auto i = head; i != next; i++) + { + ASSERT(prim_start < MAX_PRIM_COUNT); + + uint32 prim_count_inner = std::min(i->ib_count / n, MAX_PRIM_COUNT - prim_start); + + // TODO: update the needed pages of the texture cache buffer with enqueueCopyBuffer (src=this->vm, dst=this->vm_text), + // changed by tfx in the previous loop or marked by InvalidateVideoMem + + // TODO: tile level z test + + cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel); + + if(tfx_prev != tfx()) + { + tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]); + + tfx_prev = tfx(); + } + + tfx.setArg(4, (cl_uint)i->pb_start); + tfx.setArg(5, (cl_uint)prim_start); + tfx.setArg(6, (cl_uint)prim_count_inner); + tfx.setArg(7, (cl_uint)batch_count); + tfx.setArg(8, (cl_uint)bin_count); + tfx.setArg(9, bin_dim); + + //m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NullRange, cl::NDRange(std::min(bin_count * 4, CUs) * 256), cl::NDRange(256)); + + //printf("%d %d %d %d\n", rect.width() << BIN_SIZE_BITS, rect.height() << BIN_SIZE_BITS, i->rect.z - i->rect.x, i->rect.w - i->rect.y); + + GSVector4i r = GSVector4i::load(&i->rect); + + r = r.ralign(GSVector2i(BIN_SIZE, BIN_SIZE)); + /* + if(i->sel.IsSolidRect()) // TODO: simple mem fill + ;//printf("%d %d %d %d\n", r.left, r.top, 
r.width(), r.height()); + else + */ + m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16)); + + // TODO: invalidate texture cache pages + + prim_start += prim_count_inner; + } + + // + + if(total_prim_count > MAX_PRIM_COUNT) + { + prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT); + + job->ib_start += prim_count * n * sizeof(uint32); + job->ib_count -= prim_count * n; + + next = job; // try again for the reminder + } + + break; + } + } + + head = next; + } + } + catch(cl::Error err) + { + printf("%s (%d)\n", err.what(), err.err()); + } + + m_jobs.clear(); + + m_vb_count = 0; + + m_cl.vb.head = m_cl.vb.tail; + m_cl.ib.head = m_cl.ib.tail; + m_cl.pb.head = m_cl.pb.tail; + + m_cl.Map(); +} + +static int RemapPSM(int psm) +{ + switch(psm) + { + default: + case PSM_PSMCT32: psm = 0; break; + case PSM_PSMCT24: psm = 1; break; + case PSM_PSMCT16: psm = 2; break; + case PSM_PSMCT16S: psm = 3; break; + case PSM_PSMZ32: psm = 4; break; + case PSM_PSMZ24: psm = 5; break; + case PSM_PSMZ16: psm = 6; break; + case PSM_PSMZ16S: psm = 7; break; + case PSM_PSMT8: psm = 8; break; + case PSM_PSMT4: psm = 9; break; + case PSM_PSMT8H: psm = 10; break; + case PSM_PSMT4HL: psm = 11; break; + case PSM_PSMT4HH: psm = 12; break; + } + + return psm; +} + +bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count) +{ + const GSDrawingEnvironment& env = m_env; + const GSDrawingContext* context = m_context; + const GS_PRIM_CLASS primclass = m_vt.m_primclass; + + pb->sel.key = 0; + + pb->sel.atst = ATST_ALWAYS; + pb->sel.tfx = TFX_NONE; + pb->sel.ababcd = 0xff; + pb->sel.prim = primclass; + + uint32 fm = context->FRAME.FBMSK; + uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 
0xffffffff : 0; + + if(context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER) + { + fm = 0xffffffff; + zm = 0xffffffff; + } + + if(PRIM->TME) + { + if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) + { + m_mem.m_clut.Read32(context->TEX0, env.TEXA); + } + } + + if(context->TEST.ATE) + { + if(!TryAlphaTest(fm, zm)) + { + pb->sel.atst = context->TEST.ATST; + pb->sel.afail = context->TEST.AFAIL; + pb->aref = context->TEST.AREF; + } + } + + bool fwrite; + bool zwrite; + + switch(context->FRAME.PSM) + { + default: + case PSM_PSMCT32: + case PSM_PSMZ32: + fwrite = fm != 0xffffffff; + break; + case PSM_PSMCT24: + case PSM_PSMZ24: + fwrite = (fm & 0x00ffffff) != 0x00ffffff; + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + fwrite = (fm & 0x80f8f8f8) != 0x80f8f8f8; + break; + } + + switch(context->ZBUF.PSM) + { + default: + case PSM_PSMCT32: + case PSM_PSMZ32: + zwrite = zm != 0xffffffff; + break; + case PSM_PSMCT24: + case PSM_PSMZ24: + zwrite = (zm & 0x00ffffff) != 0x00ffffff; + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + zm &= 0x80f8f8f8; + zwrite = (zm & 0x80f8f8f8) != 0x80f8f8f8; + break; + } + + if(!fwrite && !zwrite) return false; + + bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; + bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS; + + pb->sel.fwrite = fwrite; + pb->sel.ftest = ftest; + pb->sel.zwrite = zwrite; + pb->sel.ztest = ztest; + + if(fwrite || ftest) + { + pb->sel.fpsm = RemapPSM(context->FRAME.PSM); + + if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff) + { + pb->sel.iip = PRIM->IIP; + } + + if(PRIM->TME) + { + pb->sel.tfx = context->TEX0.TFX; + pb->sel.tcc = context->TEX0.TCC; + pb->sel.fst = PRIM->FST; + pb->sel.ltf = m_vt.IsLinear(); + pb->sel.tpsm = RemapPSM(context->TEX0.PSM); + pb->sel.aem = m_env.TEXA.AEM; + + pb->tbp[0] = context->TEX0.TBP0; + 
pb->tbw[0] = context->TEX0.TBW; + pb->ta0 = m_env.TEXA.TA0; + pb->ta1 = m_env.TEXA.TA1; + + if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) + { + pb->sel.tlu = 1; + + memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal); + } + + pb->sel.wms = context->CLAMP.WMS; + pb->sel.wmt = context->CLAMP.WMT; + + if(pb->sel.tfx == TFX_MODULATE && pb->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128))) + { + // modulate does not do anything when vertex color is 0x80 + + pb->sel.tfx = TFX_DECAL; + } + + // TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA); + + // TODO: if(t == NULL) {ASSERT(0); return false;} + + GSVector4i r; + + GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf); + + // TODO: data->SetSource(t, r, 0); + + // TODO: pb->sel.tw = t->m_tw - 3; + + // TODO: store r to current job + + if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0) + { + // TEX1.MMIN + // 000 p + // 001 l + // 010 p round + // 011 p tri + // 100 l round + // 101 l tri + + if(m_vt.m_lod.x > 0) + { + pb->sel.ltf = context->TEX1.MMIN >> 2; + } + else + { + // TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0 + } + + pb->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri + pb->sel.lcm = context->TEX1.LCM; + + int mxl = std::min((int)context->TEX1.MXL, 6) << 16; + int k = context->TEX1.K << 12; + + if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL) + { + k = (int)m_vt.m_lod.x << 16; // set lod to max level + + pb->sel.lcm = 1; // lod is constant + pb->sel.mmin = 1; // tri-linear is meaningless + } + + if(pb->sel.mmin == 2) + { + mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?) 
+ } + + if(pb->sel.fst) + { + ASSERT(pb->sel.lcm == 1); + ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu) + + pb->sel.lcm = 1; + } + + if(pb->sel.lcm) + { + int lod = std::max(std::min(k, mxl), 0); + + if(pb->sel.mmin == 1) + { + lod = (lod + 0x8000) & 0xffff0000; // rounding + } + + pb->lod = lod; + + // TODO: lot to optimize when lod is constant + } + else + { + pb->mxl = mxl; + pb->l = (float)(-0x10000 << context->TEX1.L); + pb->k = (float)k; + } + + GIFRegTEX0 MIP_TEX0 = context->TEX0; + GIFRegCLAMP MIP_CLAMP = context->CLAMP; + + GSVector4 tmin = m_vt.m_min.t; + GSVector4 tmax = m_vt.m_max.t; + + static int s_counter = 0; + + for(int i = 1, j = std::min((int)context->TEX1.MXL, 6); i <= j; i++) + { + switch(i) + { + case 1: + MIP_TEX0.TBP0 = context->MIPTBP1.TBP1; + MIP_TEX0.TBW = context->MIPTBP1.TBW1; + break; + case 2: + MIP_TEX0.TBP0 = context->MIPTBP1.TBP2; + MIP_TEX0.TBW = context->MIPTBP1.TBW2; + break; + case 3: + MIP_TEX0.TBP0 = context->MIPTBP1.TBP3; + MIP_TEX0.TBW = context->MIPTBP1.TBW3; + break; + case 4: + MIP_TEX0.TBP0 = context->MIPTBP2.TBP4; + MIP_TEX0.TBW = context->MIPTBP2.TBW4; + break; + case 5: + MIP_TEX0.TBP0 = context->MIPTBP2.TBP5; + MIP_TEX0.TBW = context->MIPTBP2.TBW5; + break; + case 6: + MIP_TEX0.TBP0 = context->MIPTBP2.TBP6; + MIP_TEX0.TBW = context->MIPTBP2.TBW6; + break; + default: + __assume(0); + } + + pb->tbp[i] = MIP_TEX0.TBP0; + pb->tbw[i] = MIP_TEX0.TBW; + + if(MIP_TEX0.TW > 0) MIP_TEX0.TW--; + if(MIP_TEX0.TH > 0) MIP_TEX0.TH--; + + MIP_CLAMP.MINU >>= 1; + MIP_CLAMP.MINV >>= 1; + MIP_CLAMP.MAXU >>= 1; + MIP_CLAMP.MAXV >>= 1; + + m_vt.m_min.t *= 0.5f; + m_vt.m_max.t *= 0.5f; + + // TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, pb->sel.tw + 3); + + // TODO: if(t == NULL) {ASSERT(0); return false;} + + GSVector4i r; + + GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf); + + // TODO: data->SetSource(t, r, i); + + // TODO: store r to current 
job + } + + s_counter++; + + m_vt.m_min.t = tmin; + m_vt.m_max.t = tmax; + } + else + { + if(pb->sel.fst == 0) + { + // skip per pixel division if q is constant + + GSVertexCL* RESTRICT v = vertex; + + if(m_vt.m_eq.q) + { + pb->sel.fst = 1; + + const GSVector4& t = v[index[0]].t; + + if(t.z != 1.0f) + { + GSVector4 w = t.zzzz().rcpnr(); + + for(int i = 0, j = vertex_count; i < j; i++) + { + GSVector4 t = v[i].t; + + v[i].t = (t * w).xyzw(t); + } + } + } + else if(primclass == GS_SPRITE_CLASS) + { + pb->sel.fst = 1; + + for(int i = 0, j = vertex_count; i < j; i += 2) + { + GSVector4 t0 = v[i + 0].t; + GSVector4 t1 = v[i + 1].t; + + GSVector4 w = t1.zzzz().rcpnr(); + + v[i + 0].t = (t0 * w).xyzw(t0); + v[i + 1].t = (t1 * w).xyzw(t1); + } + } + } + + if(pb->sel.ltf && pb->sel.fst) // TODO: quite slow, do this in the prim kernel? + { + // if q is constant we can do the half pel shift for bilinear sampling on the vertices + + // TODO: but not when mipmapping is used!!! + + GSVector4 half(8.0f, 8.0f); + + GSVertexCL* RESTRICT v = vertex; + + for(int i = 0, j = vertex_count; i < j; i++) + { + GSVector4 t = v[i].t; + + v[i].t = (t - half).xyzw(t); + } + } + } + + int tw = 1 << context->TEX0.TW; + int th = 1 << context->TEX0.TH; + + switch(context->CLAMP.WMS) + { + case CLAMP_REPEAT: + pb->minu = tw - 1; + pb->maxu = 0; + //gd.t.mask.u32[0] = 0xffffffff; + break; + case CLAMP_CLAMP: + pb->minu = 0; + pb->maxu = tw - 1; + //gd.t.mask.u32[0] = 0; + break; + case CLAMP_REGION_CLAMP: + pb->minu = std::min((int)context->CLAMP.MINU, tw - 1); + pb->maxu = std::min((int)context->CLAMP.MAXU, tw - 1); + //gd.t.mask.u32[0] = 0; + break; + case CLAMP_REGION_REPEAT: + pb->minu = (int)context->CLAMP.MINU & (tw - 1); + pb->maxu = (int)context->CLAMP.MAXU & (tw - 1); + //gd.t.mask.u32[0] = 0xffffffff; + break; + default: + __assume(0); + } + + switch(context->CLAMP.WMT) + { + case CLAMP_REPEAT: + pb->minv = th - 1; + pb->maxv = 0; + //gd.t.mask.u32[2] = 0xffffffff; + break; + case 
CLAMP_CLAMP: + pb->minv = 0; + pb->maxv = th - 1; + //gd.t.mask.u32[2] = 0; + break; + case CLAMP_REGION_CLAMP: + pb->minv = std::min((int)context->CLAMP.MINV, th - 1); + pb->maxv = std::min((int)context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256) + //gd.t.mask.u32[2] = 0; + break; + case CLAMP_REGION_REPEAT: + pb->minv = (int)context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127 + pb->maxv = (int)context->CLAMP.MAXV & (th - 1); + //gd.t.mask.u32[2] = 0xffffffff; + break; + default: + __assume(0); + } + } + + if(PRIM->FGE) + { + pb->sel.fge = 1; + pb->fog = env.FOGCOL.u32[0]; + } + + if(context->FRAME.PSM != PSM_PSMCT24) + { + pb->sel.date = context->TEST.DATE; + pb->sel.datm = context->TEST.DATM; + } + + if(!IsOpaque()) + { + pb->sel.abe = PRIM->ABE; + pb->sel.ababcd = context->ALPHA.u32[0]; + + if(env.PABE.PABE) + { + pb->sel.pabe = 1; + } + + if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS)) + { + pb->sel.aa1 = 1; + } + + pb->afix = context->ALPHA.FIX; + } + + if(pb->sel.date + || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1 + || pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY + || (pb->sel.fpsm & 3) == 0 && fwrite && fm != 0 + || (pb->sel.fpsm & 3) == 1 && fwrite // always read-merge-write 24bpp, regardless the mask + || (pb->sel.fpsm & 3) >= 2 && fwrite && (fm & 0x80f8f8f8) != 0) + { + pb->sel.rfb = 1; + } + + pb->sel.colclamp = env.COLCLAMP.CLAMP; + pb->sel.fba = context->FBA.FBA; + + if(env.DTHE.DTHE) + { + pb->sel.dthe = 1; + + GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16); + GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16); + GSVector4i dimx2 = env.dimx[5].sll32(16).sra32(16); + GSVector4i dimx3 = env.dimx[7].sll32(16).sra32(16); + + pb->dimx = dimx0.ps32(dimx1).ps16(dimx2.ps32(dimx3)); + } + } + + if(zwrite || ztest) + { + pb->sel.zpsm = RemapPSM(context->ZBUF.PSM); + pb->sel.ztst 
= ztest ? context->TEST.ZTST : ZTST_ALWAYS; + pb->sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000; + } + + pb->fm = fm; + pb->zm = zm; + + if((pb->sel.fpsm & 3) == 1) + { + pb->fm |= 0xff000000; + } + else if((pb->sel.fpsm & 3) >= 2) + { + uint32 rb = pb->fm & 0x00f800f8; + uint32 ga = pb->fm & 0x8000f800; + + pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000; + } + + if((pb->sel.zpsm & 3) == 1) + { + pb->zm |= 0xff000000; + } + else if((pb->sel.zpsm & 3) >= 2) + { + pb->zm |= 0xffff0000; + } + + if(pb->bbox.eq(pb->bbox.rintersect(pb->scissor))) + { + pb->sel.noscissor = 1; + } + + pb->fbp = context->FRAME.Block(); + pb->zbp = context->ZBUF.Block(); + pb->bw = context->FRAME.FBW; + + return true; +} + +////////// + +//#define IOCL_DEBUG + +GSRendererCL::CL::CL() +{ + WIs = INT_MAX; + + std::vector platforms; + + cl::Platform::get(&platforms); + + for(auto& p : platforms) + { + std::string platform_vendor = p.getInfo(); + + std::vector ds; + + p.getDevices(CL_DEVICE_TYPE_ALL, &ds); + + for(auto& device : ds) + { + std::string vendor = device.getInfo(); + std::string name = device.getInfo(); + std::string version = device.getInfo(); + + printf("%s %s %s", vendor.c_str(), name.c_str(), version.c_str()); + + cl_device_type type = device.getInfo(); + + switch(type) + { + case CL_DEVICE_TYPE_GPU: printf(" GPU"); break; + case CL_DEVICE_TYPE_CPU: printf(" CPU"); break; + } + + if(strstr(version.c_str(), "OpenCL C 1.2") != NULL) + { +#ifdef IOCL_DEBUG + if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL) +#else + //if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Intel") != NULL) + if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Advanced Micro Devices") != NULL) +#endif + { + devices.push_back(device); + + WIs = std::min(WIs, device.getInfo()); + + printf(" *"); + } + } + + printf("\n"); + } + + if(!devices.empty()) break; + } + + if(devices.empty()) + { + throw new 
std::exception("OpenCL device not found"); + } + + context = cl::Context(devices); + + queue[0] = cl::CommandQueue(context); + queue[1] = cl::CommandQueue(context); + queue[2] = cl::CommandQueue(context); + + vector buff; + + if(theApp.LoadResource(IDR_TFX_CL, buff)) + { + kernel_str = std::string((const char*)buff.data(), buff.size()); + } + + vb.head = vb.tail = vb.size = 0; + ib.head = ib.tail = ib.size = 0; + pb.head = pb.tail = pb.size = 0; + + vb.mapped_ptr = vb.ptr = NULL; + ib.mapped_ptr = ib.ptr = NULL; + pb.mapped_ptr = pb.ptr = NULL; + + pb.size = sizeof(TFXParameter) * 256; + pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); + pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); + + env = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(gs_env)); + + wqidx = 0; + wq = &queue[0]; +} + +GSRendererCL::CL::~CL() +{ + Unmap(); +} + +void GSRendererCL::CL::Map() +{ + Unmap(); + + if(vb.head < vb.size) + { + vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, vb.head, vb.size - vb.head); + vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; + ASSERT(((size_t)vb.ptr & 15) == 0); + ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0); + } + + if(ib.head < ib.size) + { + ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, ib.head, ib.size - ib.head); + ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head; + ASSERT(((size_t)ib.ptr & 15) == 0); + } + + if(pb.head < pb.size) + { + pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pb.head, pb.size - pb.head); + pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; + ASSERT(((size_t)pb.ptr & 15) == 0); + ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0); + } +} + +void GSRendererCL::CL::Unmap() +{ + if(vb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(vb.buff[wqidx], vb.mapped_ptr); + if(ib.mapped_ptr != NULL) 
wq->enqueueUnmapMemObject(ib.buff[wqidx], ib.mapped_ptr); + if(pb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(pb.buff[wqidx], pb.mapped_ptr); + + vb.mapped_ptr = vb.ptr = NULL; + ib.mapped_ptr = ib.ptr = NULL; + pb.mapped_ptr = pb.ptr = NULL; +} + +static void AddDefs(ostringstream& opt) +{ + opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "; + opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "; + opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "; + opt << "-D MAX_PRIM_PER_BATCH=" << MAX_PRIM_PER_BATCH << "u "; + opt << "-D MAX_BATCH_COUNT=" << MAX_BATCH_COUNT << "u "; + opt << "-D BIN_SIZE_BITS=" << BIN_SIZE_BITS << " "; + opt << "-D BIN_SIZE=" << BIN_SIZE << "u "; + opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u "; + opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u "; +#ifdef IOCL_DEBUG + opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" "; +#endif +} + +cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel) +{ + auto i = prim_map.find(sel); + + if(i != prim_map.end()) + { + return i->second; + } + + char entry[256]; + + sprintf(entry, "prim_%02x", sel); + + cl::Program program = cl::Program(context, kernel_str); + + try + { + ostringstream opt; + + opt << "-D KERNEL_PRIM=" << entry << " "; + opt << "-D PRIM=" << sel.prim << " "; + + AddDefs(opt); + + program.build(opt.str().c_str()); + } + catch(cl::Error err) + { + if(err.err() == CL_BUILD_PROGRAM_FAILURE) + { + for(auto device : devices) + { + auto s = program.getBuildInfo(device); + + printf("kernel (%s) build error: %s\n", entry, s.c_str()); + } + } + + throw err; + } + + cl::Kernel k(program, entry); + + prim_map[sel] = k; + + k.setArg(0, env); + + return prim_map[sel]; +} + +cl::Kernel& GSRendererCL::CL::GetTileKernel(const TileSelector& sel) +{ + auto i = tile_map.find(sel); + + if(i != tile_map.end()) + { + return i->second; + } + + char entry[256]; + + sprintf(entry, "tile_%02x", sel); + + cl::Program program = cl::Program(context, 
kernel_str); + + try + { + ostringstream opt; + + opt << "-D KERNEL_TILE=" << entry << " "; + opt << "-D PRIM=" << sel.prim << " "; + opt << "-D MODE=" << sel.mode << " "; + opt << "-D CLEAR=" << sel.clear << " "; + + AddDefs(opt); + + program.build(opt.str().c_str()); + } + catch(cl::Error err) + { + if(err.err() == CL_BUILD_PROGRAM_FAILURE) + { + for(auto device : devices) + { + auto s = program.getBuildInfo(device); + + printf("kernel (%s) build error: %s\n", entry, s.c_str()); + } + } + + throw err; + } + + cl::Kernel k(program, entry); + + tile_map[sel] = k; + + k.setArg(0, env); + + return tile_map[sel]; +} + +cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel) +{ + auto i = tfx_map.find(sel); + + if(i != tfx_map.end()) + { + return i->second; + } + + char entry[256]; + + sprintf(entry, "tfx_%016x", sel); + + cl::Program program = cl::Program(context, kernel_str); + + try + { + ostringstream opt; + + opt << "-D KERNEL_TFX=" << entry << " "; + opt << "-D FPSM=" << sel.fpsm << " "; + opt << "-D ZPSM=" << sel.zpsm << " "; + opt << "-D ZTST=" << sel.ztst << " "; + opt << "-D ATST=" << sel.atst << " "; + opt << "-D AFAIL=" << sel.afail << " "; + opt << "-D IIP=" << sel.iip << " "; + opt << "-D TFX=" << sel.tfx << " "; + opt << "-D TCC=" << sel.tcc << " "; + opt << "-D FST=" << sel.fst << " "; + opt << "-D LTF=" << sel.ltf << " "; + opt << "-D TLU=" << sel.tlu << " "; + opt << "-D FGE=" << sel.fge << " "; + opt << "-D DATE=" << sel.date << " "; + opt << "-D ABE=" << sel.abe << " "; + opt << "-D ABA=" << sel.aba << " "; + opt << "-D ABB=" << sel.abb << " "; + opt << "-D ABC=" << sel.abc << " "; + opt << "-D ABD=" << sel.abd << " "; + opt << "-D PABE=" << sel.pabe << " "; + opt << "-D AA1=" << sel.aa1 << " "; + opt << "-D FWRITE=" << sel.fwrite << " "; + opt << "-D FTEST=" << sel.ftest << " "; + opt << "-D RFB=" << sel.rfb << " "; + opt << "-D ZWRITE=" << sel.zwrite << " "; + opt << "-D ZTEST=" << sel.ztest << " "; + opt << "-D ZOVERFLOW=" << 
sel.zoverflow << " "; + opt << "-D WMS=" << sel.wms << " "; + opt << "-D WMT=" << sel.wmt << " "; + opt << "-D DATM=" << sel.datm << " "; + opt << "-D COLCLAMP=" << sel.colclamp << " "; + opt << "-D FBA=" << sel.fba << " "; + opt << "-D DTHE=" << sel.dthe << " "; + opt << "-D PRIM=" << sel.prim << " "; + opt << "-D TW=" << sel.tw << " "; + opt << "-D LCM=" << sel.lcm << " "; + opt << "-D MMIN=" << sel.mmin << " "; + opt << "-D NOSCISSOR=" << sel.noscissor << " "; + opt << "-D TPSM=" << sel.tpsm << " "; + opt << "-D AEM=" << sel.aem << " "; + opt << "-D FB=" << sel.fb << " "; + opt << "-D ZB=" << sel.zb << " "; + + AddDefs(opt); + + program.build(opt.str().c_str()); + } + catch(cl::Error err) + { + if(err.err() == CL_BUILD_PROGRAM_FAILURE) + { + for(auto device : devices) + { + auto s = program.getBuildInfo(device); + + printf("kernel (%s) build error: %s\n", entry, s.c_str()); + } + } + + throw err; + } + + cl::Kernel k(program, entry); + + tfx_map[sel] = k; + + k.setArg(0, env); + k.setArg(1, vm); + k.setArg(2, tex); + + return tfx_map[sel]; +} diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h new file mode 100644 index 0000000000..3ac008ceaa --- /dev/null +++ b/plugins/GSdx/GSRendererCL.h @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. 
If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +#include "GSRenderer.h" +//#include "GSTextureCacheCL.h" + +__aligned(struct, 32) GSVertexCL +{ + GSVector4 p, t; +}; + +class GSRendererCL : public GSRenderer +{ + typedef void (GSRendererCL::*ConvertVertexBufferPtr)(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); + + ConvertVertexBufferPtr m_cvb[4][2][2]; + + template + void ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); + + union PrimSelector + { + struct + { + uint32 prim:2; // 0 + }; + + uint32 key; + + operator uint32() const { return key; } + }; + + union TileSelector + { + struct + { + uint32 prim:2; // 0 + uint32 mode:2; // 2 + uint32 clear:1; // 4 + }; + + uint32 key; + + operator uint32() const { return key; } + }; + + union JobSelector + { + struct + { + uint32 dummy:1; // 0 + }; + + uint32 key; + + operator uint32() const { return key; } + }; + + union TFXSelector + { + struct + { + uint32 fpsm:3; // 0 + uint32 zpsm:3; // 3 + uint32 ztst:2; // 6 (0: off, 1: write, 2: test (ge), 3: test (g)) + uint32 atst:3; // 8 + uint32 afail:2; // 11 + uint32 iip:1; // 13 + uint32 tfx:3; // 14 + uint32 tcc:1; // 17 + uint32 fst:1; // 18 + uint32 ltf:1; // 19 + uint32 tlu:1; // 20 + uint32 fge:1; // 21 + uint32 date:1; // 22 + uint32 abe:1; // 23 + uint32 aba:2; // 24 + uint32 abb:2; // 26 + uint32 abc:2; // 28 + uint32 abd:2; // 30 + + uint32 pabe:1; // 32 + uint32 aa1:1; // 33 + uint32 fwrite:1; // 34 + uint32 ftest:1; // 35 + uint32 rfb:1; // 36 + uint32 zwrite:1; // 37 + uint32 ztest:1; // 38 + uint32 zoverflow:1; // 39 (z max >= 0x80000000) + uint32 wms:2; // 40 + uint32 wmt:2; // 42 + uint32 datm:1; // 44 + uint32 colclamp:1; // 45 + uint32 fba:1; // 46 + uint32 dthe:1; // 47 + uint32 prim:2; // 48 + uint32 tw:3; // 50 (encodes values between 3 -> 10, texture 
cache makes sure it is at least 3) + uint32 lcm:1; // 53 + uint32 mmin:2; // 54 + uint32 noscissor:1; // 55 + uint32 tpsm:4; // 56 + uint32 aem:1; // 60 + // TODO + }; + + struct + { + uint32 _pad1:24; + uint32 ababcd:8; + uint32 _pad2:2; + uint32 fb:2; + uint32 _pad3:1; + uint32 zb:2; + }; + + struct + { + uint32 lo; + uint32 hi; + }; + + uint64 key; + + operator uint64() const { return key; } + + bool IsSolidRect() const + { + return prim == GS_SPRITE_CLASS + && iip == 0 + && tfx == TFX_NONE + && abe == 0 + && ztst <= 1 + && atst <= 1 + && date == 0 + && fge == 0; + } + }; + + __aligned(struct, 32) TFXParameter + { + GSVector4i scissor; + GSVector4i bbox; + GSVector4i rect; + GSVector4i dimx; // 4x4 signed char + TFXSelector sel; + uint32 fbp, zbp, bw; + uint32 fm, zm; + uint32 fog; // rgb + uint8 aref, afix; + uint8 ta0, ta1; + uint32 tbp[7], tbw[7]; + int minu, maxu, minv, maxv; // umsk, ufix, vmsk, vfix + int lod; // lcm == 1 + int mxl; + float l; // TEX1.L * -0x10000 + float k; // TEX1.K * 0x10000 + uint32 clut[256]; + }; + + struct TFXJob + { + struct {int x, y, z, w;} rect; + TFXSelector sel; + uint32 ib_start, ib_count; + uint32 pb_start; + }; + + class CL + { + std::string kernel_str; + std::map prim_map; + std::map tile_map; + std::map tfx_map; + + public: + std::vector devices; + cl::Context context; + cl::CommandQueue queue[3]; + cl::Buffer vm; + cl::Buffer tex; + struct { cl::Buffer buff[2]; size_t head, tail, size; unsigned char* ptr; void* mapped_ptr; } vb, ib, pb; + cl::Buffer env; + cl::CommandQueue* wq; + int wqidx; + size_t WIs; + + public: + CL(); + virtual ~CL(); + + cl::Kernel& GetPrimKernel(const PrimSelector& sel); + cl::Kernel& GetTileKernel(const TileSelector& sel); + cl::Kernel& GetTFXKernel(const TFXSelector& sel); + + void Map(); + void Unmap(); + }; + + CL m_cl; + std::list m_jobs; + uint32 m_vb_start; + uint32 m_vb_count; + + void Enqueue(); + + /* + class RasterizerData : public GSAlignedClass<32> + { + __aligned(struct, 16) 
TextureLevel + { + GSVector4i r; + // TODO: GSTextureCacheCL::Texture* t; + }; + + public: + GSRendererCL* m_parent; + const uint32* m_fb_pages; + const uint32* m_zb_pages; + + //cl::Buffer m_vbuff; + //cl::Buffer m_ibuff; + + // TODO: buffers + TextureLevel m_tex[7 + 1]; // NULL terminated + //cl::Buffer m_clut; + //cl::Buffer m_dimx; + + // TODO: struct in a cl::Buffer + TFXSelector m_sel; + GSVector4i m_scissor; + GSVector4i m_bbox; + uint32 m_fm, m_zm; + int m_aref, m_afix; + uint32 m_fog; // rgb + int m_lod; // lcm == 1 + int m_mxl; + float m_l; // TEX1.L * -0x10000 + float m_k; // TEX1.K * 0x10000 + // TODO: struct { GSVector4i min, max, minmax, mask, invmask; } t; // [u] x 4 [v] x 4 + + RasterizerData(GSRendererCL* parent) + : m_parent(parent) + , m_fb_pages(NULL) + , m_zb_pages(NULL) + { + m_sel.key = 0; + } + + virtual ~RasterizerData() + { + // TODO: ReleasePages(); + } + + // TODO: void UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm); + // TODO: void ReleasePages(); + + // TODO: void SetSource(GSTextureCacheCL::Texture* t, const GSVector4i& r, int level); + // TODO: void UpdateSource(); + }; + */ +protected: +// GSTextureCacheCL* m_tc; + GSTexture* m_texture[2]; + uint8* m_output; + + uint8 m_rw_pages[512]; // TODO: bit array for faster clearing (bit 0: write, bit 1: read) + uint8 m_tex_pages[512]; + uint32 m_tmp_pages[512 + 1]; + + void Reset(); + void VSync(int field); + void ResetDevice(); + GSTexture* GetOutput(int i); + + void Draw(); + void Sync(int reason); + void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); + void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false); + + void UsePages(const uint32* pages, int type); + void ReleasePages(const uint32* pages, int type); + + //bool CheckSourcePages(RasterizerData* data); + + bool SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count); + 
+public: + GSRendererCL(); + virtual ~GSRendererCL(); +}; diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 30d848e4aa..71f42f961d 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -429,6 +429,15 @@ void GSRendererSW::Draw() GSVector4i scissor = GSVector4i(context->scissor.in); GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); + + // points and lines may have zero area bbox (single line: 0, 0 - 256, 0) + + if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS) + { + if(bbox.x == bbox.z) bbox.z++; + if(bbox.y == bbox.w) bbox.w++; + } + GSVector4i r = bbox.rintersect(scissor); scissor.z = std::min(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour @@ -973,7 +982,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.sel.zpsm = 3; gd.sel.atst = ATST_ALWAYS; gd.sel.tfx = TFX_NONE; - gd.sel.ababcd = 255; + gd.sel.ababcd = 0xff; gd.sel.prim = primclass; uint32 fm = context->FRAME.FBMSK; @@ -1101,7 +1110,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri gd.sel.lcm = context->TEX1.LCM; - int mxl = (std::min((int)context->TEX1.MXL, 6) << 16); + int mxl = std::min((int)context->TEX1.MXL, 6) << 16; int k = context->TEX1.K << 12; if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL) diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp index 89656965dd..41d66f24e9 100644 --- a/plugins/GSdx/GSSettingsDlg.cpp +++ b/plugins/GSdx/GSSettingsDlg.cpp @@ -329,16 +329,19 @@ void GSSettingsDlg::UpdateRenderers() { GSSetting r = theApp.m_gs_renderers[i]; - if(i >= 3 && i <= 5) + if(r.id >= 3 && r.id <= 5 || r.id == 15) { if(level < D3D_FEATURE_LEVEL_10_0) continue; - r.name = std::string("Direct3D") + (level >= D3D_FEATURE_LEVEL_11_0 ? "11" : "10"); + r.name += (level >= D3D_FEATURE_LEVEL_11_0 ? 
"11" : "10"); } renderers.push_back(r); - if (r.id == renderer_setting) + + if(r.id == renderer_setting) + { renderer_sel = renderer_setting; + } } ComboBoxInit(IDC_RENDERER, renderers, renderer_sel); @@ -607,13 +610,13 @@ bool GSHacksDlg::OnMessage(UINT message, WPARAM wParam, LPARAM lParam) break; case IDC_SPRITEHACK: helpstr = "Sprite Hack\n\nHelps getting rid of black inner lines in some filtered sprites." - " Half option is the preferred one. Use it for Mana Khemia or Ar tonelico for example." + " Half option is the preferred one. Use it for Mana Khemia or Ar Tonelico for example." " Full can be used for Tales of Destiny."; break; case IDC_WILDHACK: helpstr = "WildArms\n\nLowers the GS precision to avoid gaps between pixels when" " upscaling. Full option fixes the text on WildArms games, while Half option might improve portraits" - " in Ar tonelico."; + " in Ar Tonelico."; break; case IDC_MSAACB: case IDC_STATIC_MSAA: diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index 604836af03..731b2aa480 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -1551,7 +1551,8 @@ void GSState::Read(uint8* mem, int len) return; } - if (!m_init_read_fifo_supported) { + if(!m_init_read_fifo_supported) + { if(m_tr.x == sx && m_tr.y == sy) { InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h)); @@ -2316,20 +2317,20 @@ void GSState::GrowVertexBuffer() GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 32); uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 32); // worst case is slightly less than vertex number * 3 - if (!vertex || !index) + if(vertex == NULL || index == NULL) { printf("GSdx: failed to allocate %d bytes for verticles and %d for indices.\n", sizeof(GSVertex) * maxcount, sizeof(uint32) * maxcount * 3); throw GSDXError(); } - if (m_vertex.buff != NULL) + if(m_vertex.buff != NULL) { memcpy(vertex, m_vertex.buff, sizeof(GSVertex) * m_vertex.tail); 
_aligned_free(m_vertex.buff); } - if (m_index.buff != NULL) + if(m_index.buff != NULL) { memcpy(index, m_index.buff, sizeof(uint32) * m_index.tail); diff --git a/plugins/GSdx/GSdx.cpp b/plugins/GSdx/GSdx.cpp index 00fb984260..93d9642fc2 100644 --- a/plugins/GSdx/GSdx.cpp +++ b/plugins/GSdx/GSdx.cpp @@ -41,8 +41,29 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserv return TRUE; } +bool GSdxApp::LoadResource(int id, vector& buff, const char* type) +{ + buff.clear(); + HRSRC hRsrc = FindResource((HMODULE)s_hModule, MAKEINTRESOURCE(id), type != NULL ? type : RT_RCDATA); + if(!hRsrc) return false; + HGLOBAL hGlobal = ::LoadResource((HMODULE)s_hModule, hRsrc); + if(!hGlobal) return false; + DWORD size = SizeofResource((HMODULE)s_hModule, hRsrc); + if(!size) return false; + buff.resize(size); + memcpy(buff.data(), LockResource(hGlobal), size); + return true; +} + #else +bool GSdxApp::LoadResource(int id, vector& buff, const char* type) +{ + buff.clear(); + printf("LoadResource not implemented\n"); + return false; +} + size_t GSdxApp::GetPrivateProfileString(const char* lpAppName, const char* lpKeyName, const char* lpDefault, char* lpReturnedString, size_t nSize, const char* lpFileName) { BuildConfigurationMap(lpFileName); @@ -108,10 +129,12 @@ GSdxApp::GSdxApp() m_gs_renderers.push_back(GSSetting(0, "Direct3D9", "Hardware")); m_gs_renderers.push_back(GSSetting(1, "Direct3D9", "Software")); + m_gs_renderers.push_back(GSSetting(14, "Direct3D9", "OpenCL")); m_gs_renderers.push_back(GSSetting(2, "Direct3D9", "Null")); - m_gs_renderers.push_back(GSSetting(3, "Direct3D%d ", "Hardware")); - m_gs_renderers.push_back(GSSetting(4, "Direct3D%d ", "Software")); - m_gs_renderers.push_back(GSSetting(5, "Direct3D%d ", "Null")); + m_gs_renderers.push_back(GSSetting(3, "Direct3D", "Hardware")); + m_gs_renderers.push_back(GSSetting(4, "Direct3D", "Software")); + m_gs_renderers.push_back(GSSetting(15, "Direct3D", "OpenCL")); + 
m_gs_renderers.push_back(GSSetting(5, "Direct3D", "Null")); #ifdef _LINUX // note: SDL was removed. We keep those bits for compatibility of the renderer // position in the linux dialog. @@ -119,9 +142,11 @@ GSdxApp::GSdxApp() m_gs_renderers.push_back(GSSetting(8, "SDL 1.3", "Null")); #endif m_gs_renderers.push_back(GSSetting(10, "Null", "Software")); + m_gs_renderers.push_back(GSSetting(16, "Null", "OpenCL")); m_gs_renderers.push_back(GSSetting(11, "Null", "Null")); m_gs_renderers.push_back(GSSetting(12, "OpenGL", "Hardware")); m_gs_renderers.push_back(GSSetting(13, "OpenGL", "Software")); + m_gs_renderers.push_back(GSSetting(17, "OpenGL", "OpenCL")); m_gs_interlace.push_back(GSSetting(0, "None", "")); m_gs_interlace.push_back(GSSetting(1, "Weave tff", "saw-tooth")); diff --git a/plugins/GSdx/GSdx.h b/plugins/GSdx/GSdx.h index f89a3fa761..5ccb1e2c69 100644 --- a/plugins/GSdx/GSdx.h +++ b/plugins/GSdx/GSdx.h @@ -39,6 +39,7 @@ public: #ifdef _WINDOWS HMODULE GetModuleHandle() {return (HMODULE)GetModuleHandlePtr();} #endif + #ifdef _LINUX void BuildConfigurationMap(const char* lpFileName); void ReloadConfig(); @@ -48,6 +49,8 @@ public: int GetPrivateProfileInt(const char* lpAppName, const char* lpKeyName, int nDefault, const char* lpFileName); #endif + bool LoadResource(int id, vector& buff, const char* type = NULL); + string GetConfig(const char* entry, const char* value); void SetConfig(const char* entry, const char* value); int GetConfig(const char* entry, int value); diff --git a/plugins/GSdx/GSdx.rc b/plugins/GSdx/GSdx.rc index 47ccc62aa0..87c0b6164d 100644 --- a/plugins/GSdx/GSdx.rc +++ b/plugins/GSdx/GSdx.rc @@ -51,9 +51,11 @@ BEGIN "#include ""res/tfx.fx""\r\n" "#include ""res/convert.fx""\r\n" "#include ""res/interlace.fx""\r\n" - "#include ""res/merge.fx""\r\0" - "#include ""res/fxaa.fx""\r\0" - "#include ""res/shadeboost.fx""\r\0" + "#include ""res/merge.fx""\r\n" + "#include ""res/fxaa.fx""\r\n" + "#include ""res/cs.fx""\r\n" + "#include 
""res/shadeboost.fx""\r\n" + "#include ""res/tfx.cl""\r\n" END #endif // APSTUDIO_INVOKED @@ -64,13 +66,14 @@ END // RCDATA // -IDR_CONVERT_FX RCDATA "res\\convert.fx" IDR_TFX_FX RCDATA "res\\tfx.fx" -IDR_MERGE_FX RCDATA "res\\merge.fx" +IDR_CONVERT_FX RCDATA "res\\convert.fx" IDR_INTERLACE_FX RCDATA "res\\interlace.fx" +IDR_MERGE_FX RCDATA "res\\merge.fx" IDR_FXAA_FX RCDATA "res\\fxaa.fx" IDR_CS_FX RCDATA "res\\cs.fx" IDR_SHADEBOOST_FX RCDATA "res\\shadeboost.fx" +IDR_TFX_CL RCDATA "res\\tfx.cl" ///////////////////////////////////////////////////////////////////////////// // @@ -394,6 +397,10 @@ END #include "res/convert.fx" #include "res/interlace.fx" #include "res/merge.fx" +#include "res/fxaa.fx" +#include "res/cs.fx" +#include "res/shadeboost.fx" +#include "res/tfx.cl" ///////////////////////////////////////////////////////////////////////////// #endif // not APSTUDIO_INVOKED diff --git a/plugins/GSdx/GSdx_vs2013.vcxproj b/plugins/GSdx/GSdx_vs2013.vcxproj index f9ee445906..ef165a5839 100644 --- a/plugins/GSdx/GSdx_vs2013.vcxproj +++ b/plugins/GSdx/GSdx_vs2013.vcxproj @@ -687,6 +687,7 @@ AssemblyAndSourceCode + @@ -1970,6 +1971,7 @@ + @@ -2057,6 +2059,7 @@ + diff --git a/plugins/GSdx/GSdx_vs2013.vcxproj.filters b/plugins/GSdx/GSdx_vs2013.vcxproj.filters index 1067b76f32..b0053934e9 100644 --- a/plugins/GSdx/GSdx_vs2013.vcxproj.filters +++ b/plugins/GSdx/GSdx_vs2013.vcxproj.filters @@ -348,6 +348,9 @@ Source Files + + Source Files + @@ -707,6 +710,9 @@ Header Files + + Header Files + @@ -737,10 +743,13 @@ Shaders + + Shaders + Shaders - + Shaders diff --git a/plugins/GSdx/res/cs.fx b/plugins/GSdx/res/cs.fx index fb63c0b012..c84211ba95 100644 --- a/plugins/GSdx/res/cs.fx +++ b/plugins/GSdx/res/cs.fx @@ -1,3 +1,5 @@ +#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency + #ifndef VS_TME #define VS_TME 1 #define VS_FST 1 @@ -381,3 +383,5 @@ void ps_main1(GS_OUTPUT input) WritePixel(addr.x, c, PS_FPSM); WritePixel(addr.y, z, PS_ZPSM); } 
+ +#endif diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl new file mode 100644 index 0000000000..d28622380e --- /dev/null +++ b/plugins/GSdx/res/tfx.cl @@ -0,0 +1,1619 @@ +#ifdef __OPENCL_C_VERSION__ // make safe to include in resource file to enforce dependency + +#ifndef CL_FLT_EPSILON +#define CL_FLT_EPSILON 1.1920928955078125e-7 +#endif + +#if MAX_PRIM_PER_BATCH == 64u + #define BIN_TYPE ulong +#elif MAX_PRIM_PER_BATCH == 32u + #define BIN_TYPE uint +#else + #error "MAX_PRIM_PER_BATCH != 32u OR 64u" +#endif + +typedef struct +{ + union {float4 p; struct {float x, y, z, f;};}; + union {float4 tc; struct {float s, t, q; uchar4 c;};}; +} gs_vertex; + +typedef struct +{ + gs_vertex v[4]; +} gs_prim; + +typedef struct +{ + float4 dx, dy; + float4 zero; + float4 reject_corner; +} gs_barycentric; + +typedef struct +{ + uint batch_counter; + uint _pad[7]; + struct {uint first, last;} bounds[MAX_BIN_PER_BATCH]; + BIN_TYPE bin[MAX_BIN_COUNT]; + uchar4 bbox[MAX_PRIM_COUNT]; + gs_prim prim[MAX_PRIM_COUNT]; + gs_barycentric barycentric[MAX_PRIM_COUNT]; +} gs_env; + +typedef struct +{ + int4 scissor; + int4 bbox; + int4 rect; + char dimx[4][4]; + ulong sel; + uint fbp, zbp, bw; + uint fm, zm; + uchar4 fog; // rgb + uchar aref, afix; + uchar ta0, ta1; + uint tbp[7], tbw[7]; + int minu, maxu, minv, maxv; + int lod; // lcm == 1 + int mxl; + float l; // TEX1.L * -0x10000 + float k; // TEX1.K * 0x10000 + uchar4 clut[256]; // TODO: this could be an index to a separate buffer, it may be the same across several gs_params following eachother +} gs_param; + +enum GS_PRIM_CLASS +{ + GS_POINT_CLASS, + GS_LINE_CLASS, + GS_TRIANGLE_CLASS, + GS_SPRITE_CLASS +}; + +enum GS_PSM_TARGET +{ + PSM_PSMCT32, + PSM_PSMCT24, + PSM_PSMCT16, + PSM_PSMCT16S, + PSM_PSMZ32, + PSM_PSMZ24, + PSM_PSMZ16, + PSM_PSMZ16S, + PSM_PSMT8, + PSM_PSMT4, + PSM_PSMT8H, + PSM_PSMT4HL, + PSM_PSMT4HH, +}; + +enum GS_TFX +{ + TFX_MODULATE = 0, + TFX_DECAL = 1, + TFX_HIGHLIGHT = 2, + TFX_HIGHLIGHT2 = 3, + 
TFX_NONE = 4, +}; + +enum GS_CLAMP +{ + CLAMP_REPEAT = 0, + CLAMP_CLAMP = 1, + CLAMP_REGION_CLAMP = 2, + CLAMP_REGION_REPEAT = 3, +}; + +enum GS_ZTST +{ + ZTST_NEVER = 0, + ZTST_ALWAYS = 1, + ZTST_GEQUAL = 2, + ZTST_GREATER = 3, +}; + +enum GS_ATST +{ + ATST_NEVER = 0, + ATST_ALWAYS = 1, + ATST_LESS = 2, + ATST_LEQUAL = 3, + ATST_EQUAL = 4, + ATST_GEQUAL = 5, + ATST_GREATER = 6, + ATST_NOTEQUAL = 7, +}; + +enum GS_AFAIL +{ + AFAIL_KEEP = 0, + AFAIL_FB_ONLY = 1, + AFAIL_ZB_ONLY = 2, + AFAIL_RGB_ONLY = 3, +}; + +__constant uchar blockTable32[4][8] = +{ + { 0, 1, 4, 5, 16, 17, 20, 21}, + { 2, 3, 6, 7, 18, 19, 22, 23}, + { 8, 9, 12, 13, 24, 25, 28, 29}, + { 10, 11, 14, 15, 26, 27, 30, 31} +}; + +__constant uchar blockTable32Z[4][8] = +{ + { 24, 25, 28, 29, 8, 9, 12, 13}, + { 26, 27, 30, 31, 10, 11, 14, 15}, + { 16, 17, 20, 21, 0, 1, 4, 5}, + { 18, 19, 22, 23, 2, 3, 6, 7} +}; + +__constant uchar blockTable16[8][4] = +{ + { 0, 2, 8, 10 }, + { 1, 3, 9, 11 }, + { 4, 6, 12, 14 }, + { 5, 7, 13, 15 }, + { 16, 18, 24, 26 }, + { 17, 19, 25, 27 }, + { 20, 22, 28, 30 }, + { 21, 23, 29, 31 } +}; + +__constant uchar blockTable16S[8][4] = +{ + { 0, 2, 16, 18 }, + { 1, 3, 17, 19 }, + { 8, 10, 24, 26 }, + { 9, 11, 25, 27 }, + { 4, 6, 20, 22 }, + { 5, 7, 21, 23 }, + { 12, 14, 28, 30 }, + { 13, 15, 29, 31 } +}; + +__constant uchar blockTable16Z[8][4] = +{ + { 24, 26, 16, 18 }, + { 25, 27, 17, 19 }, + { 28, 30, 20, 22 }, + { 29, 31, 21, 23 }, + { 8, 10, 0, 2 }, + { 9, 11, 1, 3 }, + { 12, 14, 4, 6 }, + { 13, 15, 5, 7 } +}; + +__constant uchar blockTable16SZ[8][4] = +{ + { 24, 26, 8, 10 }, + { 25, 27, 9, 11 }, + { 16, 18, 0, 2 }, + { 17, 19, 1, 3 }, + { 28, 30, 12, 14 }, + { 29, 31, 13, 15 }, + { 20, 22, 4, 6 }, + { 21, 23, 5, 7 } +}; + +__constant uchar blockTable8[4][8] = +{ + { 0, 1, 4, 5, 16, 17, 20, 21}, + { 2, 3, 6, 7, 18, 19, 22, 23}, + { 8, 9, 12, 13, 24, 25, 28, 29}, + { 10, 11, 14, 15, 26, 27, 30, 31} +}; + +__constant uchar blockTable4[8][4] = +{ + { 0, 2, 8, 10 }, + { 1, 3, 9, 
11 }, + { 4, 6, 12, 14 }, + { 5, 7, 13, 15 }, + { 16, 18, 24, 26 }, + { 17, 19, 25, 27 }, + { 20, 22, 28, 30 }, + { 21, 23, 29, 31 } +}; + +__constant uchar columnTable32[8][8] = +{ + { 0, 1, 4, 5, 8, 9, 12, 13 }, + { 2, 3, 6, 7, 10, 11, 14, 15 }, + { 16, 17, 20, 21, 24, 25, 28, 29 }, + { 18, 19, 22, 23, 26, 27, 30, 31 }, + { 32, 33, 36, 37, 40, 41, 44, 45 }, + { 34, 35, 38, 39, 42, 43, 46, 47 }, + { 48, 49, 52, 53, 56, 57, 60, 61 }, + { 50, 51, 54, 55, 58, 59, 62, 63 }, +}; + +__constant uchar columnTable16[8][16] = +{ + { 0, 2, 8, 10, 16, 18, 24, 26, + 1, 3, 9, 11, 17, 19, 25, 27 }, + { 4, 6, 12, 14, 20, 22, 28, 30, + 5, 7, 13, 15, 21, 23, 29, 31 }, + { 32, 34, 40, 42, 48, 50, 56, 58, + 33, 35, 41, 43, 49, 51, 57, 59 }, + { 36, 38, 44, 46, 52, 54, 60, 62, + 37, 39, 45, 47, 53, 55, 61, 63 }, + { 64, 66, 72, 74, 80, 82, 88, 90, + 65, 67, 73, 75, 81, 83, 89, 91 }, + { 68, 70, 76, 78, 84, 86, 92, 94, + 69, 71, 77, 79, 85, 87, 93, 95 }, + { 96, 98, 104, 106, 112, 114, 120, 122, + 97, 99, 105, 107, 113, 115, 121, 123 }, + { 100, 102, 108, 110, 116, 118, 124, 126, + 101, 103, 109, 111, 117, 119, 125, 127 }, +}; + +__constant uchar columnTable8[16][16] = +{ + { 0, 4, 16, 20, 32, 36, 48, 52, // column 0 + 2, 6, 18, 22, 34, 38, 50, 54 }, + { 8, 12, 24, 28, 40, 44, 56, 60, + 10, 14, 26, 30, 42, 46, 58, 62 }, + { 33, 37, 49, 53, 1, 5, 17, 21, + 35, 39, 51, 55, 3, 7, 19, 23 }, + { 41, 45, 57, 61, 9, 13, 25, 29, + 43, 47, 59, 63, 11, 15, 27, 31 }, + { 96, 100, 112, 116, 64, 68, 80, 84, // column 1 + 98, 102, 114, 118, 66, 70, 82, 86 }, + { 104, 108, 120, 124, 72, 76, 88, 92, + 106, 110, 122, 126, 74, 78, 90, 94 }, + { 65, 69, 81, 85, 97, 101, 113, 117, + 67, 71, 83, 87, 99, 103, 115, 119 }, + { 73, 77, 89, 93, 105, 109, 121, 125, + 75, 79, 91, 95, 107, 111, 123, 127 }, + { 128, 132, 144, 148, 160, 164, 176, 180, // column 2 + 130, 134, 146, 150, 162, 166, 178, 182 }, + { 136, 140, 152, 156, 168, 172, 184, 188, + 138, 142, 154, 158, 170, 174, 186, 190 }, + { 161, 165, 177, 181, 
129, 133, 145, 149, + 163, 167, 179, 183, 131, 135, 147, 151 }, + { 169, 173, 185, 189, 137, 141, 153, 157, + 171, 175, 187, 191, 139, 143, 155, 159 }, + { 224, 228, 240, 244, 192, 196, 208, 212, // column 3 + 226, 230, 242, 246, 194, 198, 210, 214 }, + { 232, 236, 248, 252, 200, 204, 216, 220, + 234, 238, 250, 254, 202, 206, 218, 222 }, + { 193, 197, 209, 213, 225, 229, 241, 245, + 195, 199, 211, 215, 227, 231, 243, 247 }, + { 201, 205, 217, 221, 233, 237, 249, 253, + 203, 207, 219, 223, 235, 239, 251, 255 }, +}; + +__constant ushort columnTable4[16][32] = +{ + { 0, 8, 32, 40, 64, 72, 96, 104, // column 0 + 2, 10, 34, 42, 66, 74, 98, 106, + 4, 12, 36, 44, 68, 76, 100, 108, + 6, 14, 38, 46, 70, 78, 102, 110 }, + { 16, 24, 48, 56, 80, 88, 112, 120, + 18, 26, 50, 58, 82, 90, 114, 122, + 20, 28, 52, 60, 84, 92, 116, 124, + 22, 30, 54, 62, 86, 94, 118, 126 }, + { 65, 73, 97, 105, 1, 9, 33, 41, + 67, 75, 99, 107, 3, 11, 35, 43, + 69, 77, 101, 109, 5, 13, 37, 45, + 71, 79, 103, 111, 7, 15, 39, 47 }, + { 81, 89, 113, 121, 17, 25, 49, 57, + 83, 91, 115, 123, 19, 27, 51, 59, + 85, 93, 117, 125, 21, 29, 53, 61, + 87, 95, 119, 127, 23, 31, 55, 63 }, + { 192, 200, 224, 232, 128, 136, 160, 168, // column 1 + 194, 202, 226, 234, 130, 138, 162, 170, + 196, 204, 228, 236, 132, 140, 164, 172, + 198, 206, 230, 238, 134, 142, 166, 174 }, + { 208, 216, 240, 248, 144, 152, 176, 184, + 210, 218, 242, 250, 146, 154, 178, 186, + 212, 220, 244, 252, 148, 156, 180, 188, + 214, 222, 246, 254, 150, 158, 182, 190 }, + { 129, 137, 161, 169, 193, 201, 225, 233, + 131, 139, 163, 171, 195, 203, 227, 235, + 133, 141, 165, 173, 197, 205, 229, 237, + 135, 143, 167, 175, 199, 207, 231, 239 }, + { 145, 153, 177, 185, 209, 217, 241, 249, + 147, 155, 179, 187, 211, 219, 243, 251, + 149, 157, 181, 189, 213, 221, 245, 253, + 151, 159, 183, 191, 215, 223, 247, 255 }, + { 256, 264, 288, 296, 320, 328, 352, 360, // column 2 + 258, 266, 290, 298, 322, 330, 354, 362, + 260, 268, 292, 300, 324, 332, 356, 364, + 
262, 270, 294, 302, 326, 334, 358, 366 }, + { 272, 280, 304, 312, 336, 344, 368, 376, + 274, 282, 306, 314, 338, 346, 370, 378, + 276, 284, 308, 316, 340, 348, 372, 380, + 278, 286, 310, 318, 342, 350, 374, 382 }, + { 321, 329, 353, 361, 257, 265, 289, 297, + 323, 331, 355, 363, 259, 267, 291, 299, + 325, 333, 357, 365, 261, 269, 293, 301, + 327, 335, 359, 367, 263, 271, 295, 303 }, + { 337, 345, 369, 377, 273, 281, 305, 313, + 339, 347, 371, 379, 275, 283, 307, 315, + 341, 349, 373, 381, 277, 285, 309, 317, + 343, 351, 375, 383, 279, 287, 311, 319 }, + { 448, 456, 480, 488, 384, 392, 416, 424, // column 3 + 450, 458, 482, 490, 386, 394, 418, 426, + 452, 460, 484, 492, 388, 396, 420, 428, + 454, 462, 486, 494, 390, 398, 422, 430 }, + { 464, 472, 496, 504, 400, 408, 432, 440, + 466, 474, 498, 506, 402, 410, 434, 442, + 468, 476, 500, 508, 404, 412, 436, 444, + 470, 478, 502, 510, 406, 414, 438, 446 }, + { 385, 393, 417, 425, 449, 457, 481, 489, + 387, 395, 419, 427, 451, 459, 483, 491, + 389, 397, 421, 429, 453, 461, 485, 493, + 391, 399, 423, 431, 455, 463, 487, 495 }, + { 401, 409, 433, 441, 465, 473, 497, 505, + 403, 411, 435, 443, 467, 475, 499, 507, + 405, 413, 437, 445, 469, 477, 501, 509, + 407, 415, 439, 447, 471, 479, 503, 511 }, +}; + +uint BlockNumber32(int x, int y, uint bp, uint bw) +{ + return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7]; +} + +uint BlockNumber16(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3]; +} + +uint BlockNumber16S(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3]; +} + +uint BlockNumber32Z(int x, int y, uint bp, uint bw) +{ + return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7]; +} + +uint BlockNumber16Z(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * bw 
+ ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3]; +} + +uint BlockNumber16SZ(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3]; +} + +uint BlockNumber8(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7]; +} + +uint BlockNumber4(int x, int y, uint bp, uint bw) +{ + return bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3]; +} + +uint PixelAddress32(int x, int y, uint bp, uint bw) +{ + return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; +} + +uint PixelAddress16(int x, int y, uint bp, uint bw) +{ + return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; +} + +uint PixelAddress16S(int x, int y, uint bp, uint bw) +{ + return (BlockNumber16S(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; +} + +uint PixelAddress32Z(int x, int y, uint bp, uint bw) +{ + return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; +} + +uint PixelAddress16Z(int x, int y, uint bp, uint bw) +{ + return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; +} + +uint PixelAddress16SZ(int x, int y, uint bp, uint bw) +{ + return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; +} + +uint PixelAddress8(int x, int y, uint bp, uint bw) +{ + return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15]; +} + +uint PixelAddress4(int x, int y, uint bp, uint bw) +{ + return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31]; +} + +uint PixelAddress(int x, int y, uint bp, uint bw, uint psm) +{ + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMCT24: + case PSM_PSMT8H: + case PSM_PSMT4HL: + case PSM_PSMT4HH: + return PixelAddress32(x, y, bp, bw); + case PSM_PSMCT16: + return PixelAddress16(x, y, bp, bw); + case PSM_PSMCT16S: + return 
PixelAddress16S(x, y, bp, bw); + case PSM_PSMZ32: + case PSM_PSMZ24: + return PixelAddress32Z(x, y, bp, bw); + case PSM_PSMZ16: + return PixelAddress16Z(x, y, bp, bw); + case PSM_PSMZ16S: + return PixelAddress16SZ(x, y, bp, bw); + case PSM_PSMT8: + return PixelAddress8(x, y, bp, bw); + case PSM_PSMT4: + return PixelAddress4(x, y, bp, bw); + } +} + +uint TileBlockNumber(int x, int y, uint bp, uint bw, uint psm) +{ + // TODO: replace blockTable with a subset tileTable + + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMCT24: + return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 2][(x >> 3) & 6]; + case PSM_PSMCT16: + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 2][(x >> 4) & 3]; + case PSM_PSMCT16S: + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 2][(x >> 4) & 3]; + case PSM_PSMZ32: + case PSM_PSMZ24: + return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 2][(x >> 3) & 6]; + case PSM_PSMZ16: + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 2][(x >> 4) & 3]; + case PSM_PSMZ16S: + return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 2][(x >> 4) & 3]; + } +} + +uint TilePixelAddress(int x, int y, uint ba, uint psm) +{ + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMCT24: + case PSM_PSMZ32: + case PSM_PSMZ24: + return ((ba + ((y >> 2) & 2) + ((x >> 3) & 1)) << 6) + columnTable32[y & 7][x & 7]; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + return ((ba + ((y >> 3) & 1)) << 7) + columnTable16[y & 7][x & 15]; + } +} + +uint ReadFrame(__global uchar* vm, uint addr, uint psm) +{ + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMCT24: + case PSM_PSMZ32: + case PSM_PSMZ24: + return ((__global uint*)vm)[addr]; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + return ((__global 
ushort*)vm)[addr]; + } +} + +void WriteFrame(__global uchar* vm, uint addr, uint psm, uint value) +{ + switch(psm) + { + default: + case PSM_PSMCT32: + case PSM_PSMZ32: + case PSM_PSMCT24: + case PSM_PSMZ24: + ((__global uint*)vm)[addr] = value; + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + ((__global ushort*)vm)[addr] = (ushort)value; + break; + } +} + +bool is16bit(int psm) +{ + return psm < 8 && (psm & 3) >= 2; +} + +bool is24bit(int psm) +{ + return psm < 8 && (psm & 3) == 1; +} + +bool is32bit(int psm) +{ + return psm < 8 && (psm & 3) == 0; +} + +#ifdef PRIM + +int GetVertexPerPrim(int prim_class) +{ + switch(prim_class) + { + default: + case GS_POINT_CLASS: return 1; + case GS_LINE_CLASS: return 2; + case GS_TRIANGLE_CLASS: return 3; + case GS_SPRITE_CLASS: return 2; + } +} + +#define VERTEX_PER_PRIM GetVertexPerPrim(PRIM) + +#endif + +#ifdef KERNEL_PRIM + +__kernel void KERNEL_PRIM( + __global gs_env* env, + __global uchar* vb_base, + __global uchar* ib_base, + uint vb_start, + uint ib_start) +{ + size_t prim_index = get_global_id(0); + + __global gs_vertex* vb = (__global gs_vertex*)(vb_base + vb_start); + __global uint* ib = (__global uint*)(ib_base + ib_start); + __global gs_prim* prim = &env->prim[prim_index]; + + ib += prim_index * VERTEX_PER_PRIM; + + int2 pmin, pmax; + + if(PRIM == GS_POINT_CLASS) + { + pmin = pmax = convert_int2_rte(vb[ib[0]].p.xy); + } + else if(PRIM == GS_LINE_CLASS) + { + int2 p0 = convert_int2_rte(vb[ib[0]].p.xy); + int2 p1 = convert_int2_rte(vb[ib[1]].p.xy); + + pmin = min(p0, p1); + pmax = max(p0, p1); + } + else if(PRIM == GS_TRIANGLE_CLASS) + { + __global gs_vertex* v0 = &vb[ib[0]]; + __global gs_vertex* v1 = &vb[ib[1]]; + __global gs_vertex* v2 = &vb[ib[2]]; + + int2 p0 = convert_int2_rtp(v0->p.xy); + int2 p1 = convert_int2_rtp(v1->p.xy); + int2 p2 = convert_int2_rtp(v2->p.xy); + + pmin = min(min(p0, p1), p2); + pmax = max(max(p0, p1), p2); + + prim->v[0].p = v0->p; + 
prim->v[0].tc = v0->tc; + prim->v[1].p = v1->p; + prim->v[1].tc = v1->tc; + prim->v[2].p = v2->p; + prim->v[2].tc = v2->tc; + + float4 dp0 = v1->p - v0->p; + float4 dp1 = v0->p - v2->p; + float4 dp2 = v2->p - v1->p; + + float cp = dp0.x * dp1.y - dp0.y * dp1.x; + + if(cp != 0.0f) + { + float cp_rcp = 1.0f / cp;// native_recip(cp); + + float2 u = dp0.xy * cp_rcp; + float2 v = -dp1.xy * cp_rcp; + + // v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0) + + gs_barycentric b; + + b.dx = (float4)(-v.y, u.y, v.y - u.y, v0->p.x); + b.dy = (float4)(v.x, -u.x, u.x - v.x, v0->p.y); + + dp0.xy = dp0.xy * sign(cp); + dp1.xy = dp1.xy * sign(cp); + dp2.xy = dp2.xy * sign(cp); + + b.zero.x = (dp1.y < 0 || dp1.y == 0 && dp1.x > 0) ? CL_FLT_EPSILON : 0; + b.zero.y = (dp0.y < 0 || dp0.y == 0 && dp0.x > 0) ? CL_FLT_EPSILON : 0; + b.zero.z = (dp2.y < 0 || dp2.y == 0 && dp2.x > 0) ? CL_FLT_EPSILON : 0; + + // any barycentric(reject_corner) < 0, tile outside the triangle + + b.reject_corner.x = 0.0f + max(max(max(0.0f, b.dx.x), b.dy.x), b.dx.x + b.dy.x) * BIN_SIZE; + b.reject_corner.y = 0.0f + max(max(max(0.0f, b.dx.y), b.dy.y), b.dx.y + b.dy.y) * BIN_SIZE; + b.reject_corner.z = 1.0f + max(max(max(0.0f, b.dx.z), b.dy.z), b.dx.z + b.dy.z) * BIN_SIZE; + + // TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed + + env->barycentric[prim_index] = b; + } + else + { + // TODO: set b.zero to something that always fails the tests + } + } + else if(PRIM == GS_SPRITE_CLASS) + { + __global gs_vertex* v0 = &vb[ib[0]]; + __global gs_vertex* v1 = &vb[ib[1]]; + + int2 p0 = convert_int2_rtp(v0->p.xy); + int2 p1 = convert_int2_rtp(v1->p.xy); + + pmin = min(p0, p1); + pmax = max(p0, p1); + + int4 mask = (int4)(v0->p.xy > v1->p.xy, 0, 0); + + prim->v[0].p = select(v0->p, v1->p, mask); // pmin + prim->v[0].tc = select(v0->tc, v1->tc, mask); + prim->v[1].p = select(v1->p, v0->p, mask); // pmax + prim->v[1].tc = select(v1->tc, 
v0->tc, mask); + prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy); + } + + int4 pminmax = (int4)(pmin, pmax); + + env->bbox[prim_index] = convert_uchar4_sat(pminmax >> BIN_SIZE_BITS); +} + +#endif + +#ifdef KERNEL_TILE + +int tile_in_triangle(float2 p, gs_barycentric b) +{ + float3 f = b.dx.xyz * (p.x - b.dx.w) + b.dy.xyz * (p.y - b.dy.w) + b.reject_corner.xyz; + + f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)); + + return all(f >= b.zero.xyz); +} + +#if CLEAR == 1 + +__kernel void KERNEL_TILE(__global gs_env* env) +{ + env->batch_counter = 0; + env->bounds[get_global_id(0)].first = -1; + env->bounds[get_global_id(0)].last = 0; +} + +#elif MODE < 3 + +#if MAX_PRIM_PER_BATCH != 32 + #error "MAX_PRIM_PER_BATCH != 32" +#endif + +#define MAX_PRIM_PER_GROUP (32u >> MODE) + +__kernel void KERNEL_TILE( + __global gs_env* env, + uint prim_count, + uint bin_count, // == bin_dim.z * bin_dim.w + uchar4 bin_dim) +{ + uint batch_index = get_group_id(2) >> MODE; + uint prim_start = get_group_id(2) << (5 - MODE); + uint group_prim_index = get_local_id(2); + uint bin_index = get_local_id(1) * get_local_size(0) + get_local_id(0); + + __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; + __global uchar4* bbox = &env->bbox[prim_start]; + __global gs_barycentric* barycentric = &env->barycentric[prim_start]; + + __local uchar4 bbox_cache[MAX_PRIM_PER_GROUP]; + __local gs_barycentric barycentric_cache[MAX_PRIM_PER_GROUP]; + __local uint visible[8 << MODE]; + + if(get_local_id(2) == 0) + { + visible[bin_index] = 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint group_prim_count = min(prim_count - prim_start, MAX_PRIM_PER_GROUP); + + event_t e = async_work_group_copy(bbox_cache, bbox, group_prim_count, 0); + + wait_group_events(1, &e); + + if(PRIM == GS_TRIANGLE_CLASS) + { + e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, group_prim_count * (sizeof(gs_barycentric) / 
sizeof(float4)), 0); + + wait_group_events(1, &e); + } + + if(group_prim_index < group_prim_count) + { + int x = bin_dim.x + get_local_id(0); + int y = bin_dim.y + get_local_id(1); + + uchar4 r = bbox_cache[group_prim_index]; + + uint test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y); + + if(PRIM == GS_TRIANGLE_CLASS && test != 0) + { + test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]); + } + + atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2))); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(get_local_id(2) == 0) + { + #if MODE == 0 + ((__global uint*)&bin[bin_index])[0] = visible[bin_index]; + #elif MODE == 1 + ((__global ushort*)&bin[bin_index])[1 - (get_group_id(2) & 1)] = visible[bin_index]; + #elif MODE == 2 + ((__global uchar*)&bin[bin_index])[3 - (get_group_id(2) & 3)] = visible[bin_index]; + #endif + + if(visible[bin_index] != 0) + { + atomic_min(&env->bounds[bin_index].first, batch_index); + atomic_max(&env->bounds[bin_index].last, batch_index); + } + } +} + +#elif MODE == 3 + +__kernel void KERNEL_TILE( + __global gs_env* env, + uint prim_count, + uint batch_count, + uint bin_count, // == bin_dim.z * bin_dim.w + uchar4 bin_dim) +{ + __local uchar4 bbox_cache[MAX_PRIM_PER_BATCH]; + __local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH]; + __local uint batch_index; + + size_t local_id = get_local_id(0); + size_t local_size = get_local_size(0); + + while(1) + { + barrier(CLK_LOCAL_MEM_FENCE); + + if(local_id == 0) + { + batch_index = atomic_inc(&env->batch_counter); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(batch_index >= batch_count) + { + break; + } + + uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH); + + __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; + __global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS]; + __global gs_barycentric* barycentric = 
&env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS]; + + event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0); + + wait_group_events(1, &e); + + if(PRIM == GS_TRIANGLE_CLASS) + { + e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); + + wait_group_events(1, &e); + } + + for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size) + { + int y = bin_index / bin_dim.z; + int x = bin_index - y * bin_dim.z; + + x += bin_dim.x; + y += bin_dim.y; + + BIN_TYPE visible = 0; + + for(uint i = 0; i < batch_prim_count; i++) + { + uchar4 r = bbox_cache[i]; + + BIN_TYPE test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y); + + if(PRIM == GS_TRIANGLE_CLASS && test != 0) + { + test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]); + } + + visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i); + } + + bin[bin_index] = visible; + + if(visible != 0) + { + atomic_min(&env->bounds[bin_index].first, batch_index); + atomic_max(&env->bounds[bin_index].last, batch_index); + } + } + } +} + +#endif + +#endif + +#ifdef KERNEL_TFX + +bool ZTest(uint zs, uint zd) +{ + if(ZTEST) + { + if(is24bit(ZPSM)) zd &= 0x00ffffff; + + switch(ZTST) + { + case ZTST_NEVER: + return false; + case ZTST_ALWAYS: + return true; + case ZTST_GEQUAL: + return zs >= zd; + case ZTST_GREATER: + return zs > zd; + } + } + + return true; +} + +bool AlphaTest(int alpha, int aref, uint* fm, uint* zm) +{ + switch(AFAIL) + { + case AFAIL_KEEP: + break; + case AFAIL_FB_ONLY: + if(!ZWRITE) return true; + break; + case AFAIL_ZB_ONLY: + if(!FWRITE) return true; + break; + case AFAIL_RGB_ONLY: + if(!ZWRITE && is24bit(FPSM)) return true; + break; + } + + uint pass; + + switch(ATST) + { + case ATST_NEVER: + pass = false; + break; + case ATST_ALWAYS: + return true; + case ATST_LESS: + pass = alpha < aref; + break; + case ATST_LEQUAL: + pass 
= alpha <= aref; + break; + case ATST_EQUAL: + pass = alpha == aref; + break; + case ATST_GEQUAL: + pass = alpha >= aref; + break; + case ATST_GREATER: + pass = alpha > aref; + break; + case ATST_NOTEQUAL: + pass = alpha != aref; + break; + } + + switch(AFAIL) + { + case AFAIL_KEEP: + return pass; + case AFAIL_FB_ONLY: + *zm |= pass ? 0 : 0xffffffff; + break; + case AFAIL_ZB_ONLY: + *fm |= pass ? 0 : 0xffffffff; + break; + case AFAIL_RGB_ONLY: + *fm |= pass ? 0 : 0xff000000; + *zm |= pass ? 0 : 0xffffffff; + break; + } + + return true; +} + +bool DestAlphaTest(uint fd) +{ + if(DATE) + { + if(DATM) + { + if(is32bit(FPSM)) return (fd & 0x80000000) != 0; + if(is16bit(FPSM)) return (fd & 0x00008000) != 0; + } + else + { + if(is32bit(FPSM)) return (fd & 0x80000000) == 0; + if(is16bit(FPSM)) return (fd & 0x00008000) == 0; + } + } + + return true; +} + +int Wrap(int a, int b, int c, int mode) +{ + switch(mode) + { + case CLAMP_REPEAT: + return a & b; + case CLAMP_CLAMP: + return clamp(a, 0, c); + case CLAMP_REGION_CLAMP: + return clamp(a, b, c); + case CLAMP_REGION_REPEAT: + return (a & b) | c; + } +} + +int4 AlphaBlend(int4 c, int afix, uint fd) +{ + if(FWRITE && (ABE || AA1)) + { + int4 cs = c; + int4 cd; + + if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1) + { + if(is32bit(FPSM) || is24bit(FPSM)) + { + cd.x = fd & 0xff; + cd.y = (fd >> 8) & 0xff; + cd.z = (fd >> 16) & 0xff; + cd.w = fd >> 24; + } + else if(is16bit(FPSM)) + { + cd.x = (fd & 0x001f) << 3; + cd.y = (fd & 0x03e0) >> 2; + cd.z = (fd & 0x7c00) >> 7; + cd.w = (fd & 0x8000) >> 8; + } + } + + if(ABA != ABB) + { + switch(ABA) + { + case 0: break; // c.xyz = cs.xyz; + case 1: c.xyz = cd.xyz; break; + case 2: c.xyz = 0; break; + } + + switch(ABB) + { + case 0: c.xyz -= cs.xyz; break; + case 1: c.xyz -= cd.xyz; break; + case 2: break; + } + + if(!(is24bit(FPSM) && ABC == 1)) + { + int a = 0; + + switch(ABC) + { + case 0: a = cs.w; break; + case 1: a = cd.w; break; + case 2: a = afix; break; + } + + 
c.xyz = c.xyz * a >> 7; + } + + switch(ABD) + { + case 0: c.xyz += cs.xyz; break; + case 1: c.xyz += cd.xyz; break; + case 2: break; + } + } + else + { + switch(ABD) + { + case 0: break; + case 1: c.xyz = cd.xyz; break; + case 2: c.xyz = 0; break; + } + } + + if(PABE) + { + c.xyz = select(cs.xyz, c.xyz, (int3)(cs.w << 24)); + } + } + + return c; +} + +uchar4 Expand24To32(uint rgba, uchar ta0) +{ + uchar4 c; + + c.x = rgba & 0xff; + c.y = (rgba >> 8) & 0xff; + c.z = (rgba >> 16) & 0xff; + c.w = !AEM || (rgba & 0xffffff) != 0 ? ta0 : 0; + + return c; +} + +uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1) +{ + uchar4 c; + + c.x = (rgba & 0x001f) << 3; + c.y = (rgba & 0x03e0) >> 2; + c.z = (rgba & 0x7c00) >> 7; + c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0; + + return c; +} + +int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* pb) +{ + uchar4 c; + + uint addr = PixelAddress(x, y, pb->tbp[level], pb->tbw[level], TPSM); + + __global ushort* vm16 = (__global ushort*)vm; + __global uint* vm32 = (__global uint*)vm; + + switch(TPSM) + { + default: + case PSM_PSMCT32: + case PSM_PSMZ32: + c = ((__global uchar4*)vm)[addr]; + break; + case PSM_PSMCT24: + case PSM_PSMZ24: + c = Expand24To32(vm32[addr], pb->ta0); + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + c = Expand16To32(vm16[addr], pb->ta0, pb->ta1); + break; + case PSM_PSMT8: + c = pb->clut[vm[addr]]; + break; + case PSM_PSMT4: + c = pb->clut[(vm[addr] >> ((addr & 1) << 2)) & 0x0f]; + break; + case PSM_PSMT8H: + c = pb->clut[vm32[addr] >> 24]; + break; + case PSM_PSMT4HL: + c = pb->clut[(vm32[addr] >> 24) & 0x0f]; + break; + case PSM_PSMT4HH: + c = pb->clut[(vm32[addr] >> 28) & 0x0f]; + break; + } + + //printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm32[addr]); + + return convert_int4(c); +} + +__kernel void KERNEL_TFX( + __global gs_env* env, + __global uchar* 
vm, + __global uchar* tex, + __global uchar* pb_base, + uint pb_start, + uint prim_start, + uint prim_count, + uint batch_count, + uint bin_count, // == bin_dim.z * bin_dim.w + uchar4 bin_dim) +{ + // TODO: try it the bin_index = atomic_inc(&env->bin_counter) way + + uint bin_x = (get_global_id(0) >> BIN_SIZE_BITS) - bin_dim.x; + uint bin_y = (get_global_id(1) >> BIN_SIZE_BITS) - bin_dim.y; + uint bin_index = bin_y * bin_dim.z + bin_x; + + uint batch_first = env->bounds[bin_index].first; + uint batch_last = env->bounds[bin_index].last; + uint batch_start = prim_start >> MAX_PRIM_PER_BATCH_BITS; + + if(batch_last < batch_first) + { + return; + } + + uint skip; + + if(batch_start < batch_first) + { + uint n = (batch_first - batch_start) * MAX_PRIM_PER_BATCH - (prim_start & (MAX_PRIM_PER_BATCH - 1)); + + if(n > prim_count) + { + return; + } + + skip = 0; + prim_count -= n; + batch_start = batch_first; + } + else + { + skip = prim_start & (MAX_PRIM_PER_BATCH - 1); + prim_count += skip; + } + + if(batch_start > batch_last) + { + return; + } + + prim_count = min(prim_count, (batch_last - batch_start + 1) << MAX_PRIM_PER_BATCH_BITS); + + // + + __global gs_param* pb = (__global gs_param*)(pb_base + pb_start); + + uint x = get_global_id(0); + uint y = get_global_id(1); + + int2 pi = (int2)(x, y); + float2 pf = convert_float2(pi); + + if(!NOSCISSOR) + { + int4 scissor = pb->scissor; + + if(!all((pi >= scissor.xy) & (pi < scissor.zw))) + { + return; + } + } + + uint faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM); + uint zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM); + + uint fd, zd; + + if(RFB) + { + fd = ReadFrame(vm, faddr, FPSM); + } + + if(ZTEST) + { + zd = ReadFrame(vm, zaddr, ZPSM); + } +/* + // TODO: lookup top left address of this tile + local offset + // + // 32bpp: 8x8 block size, 4 blocks, 1024 bytes + // 0 1 + // 2 3 + // 16bpp: 16x8 block size, 2 blocks, 512 bytes + // 0 + // 1 + // linear access in memory, this layout is the same for all formats + + 
__local uint fbn, zbn; + __local uchar fb[1024], zb[1024]; + + if(get_local_id(0) == 0 && get_local_id(1) == 0) + { + fbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM); + zbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint faddr = TilePixelAddress(x, y, fbn, FPSM); + uint zaddr = TilePixelAddress(x, y, zbn, ZPSM); + + if(RFB) + { + event_t e = async_work_group_copy((__local uint4*)fb, (__global uint4*)&vm[fbn << 8], 1024 / sizeof(uint4), 0); + + wait_group_events(1, &e); + } + + if(ZTEST) + { + event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0); + + wait_group_events(1, &e); + } + + // not sure if faster +*/ + + // early destination alpha test + + if(!DestAlphaTest(fd)) + { + return; + } + + // + + uint fragments = 0; + + //__local gs_prim p; + + __global BIN_TYPE* bin = &env->bin[bin_index + batch_start * bin_count]; // TODO: not needed for "one tile case" + __global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS]; + __global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS]; + + BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); + + __local gs_prim prim_cache; + + for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) + { + while(bin_value != 0) + { + uint i = clz(bin_value); + + if(prim_index + i >= prim_count) + { + break; + } + + __global gs_prim* prim = &prim_base[prim_index + i]; + + bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); + + uint2 zf; + float4 t; + int4 c; + + // TODO: do not hittest if we know the tile is fully inside the prim + + if(PRIM == GS_POINT_CLASS) + { + // TODO: distance.x < 0.5f || distance.y < 0.5f + + continue; + } + else if(PRIM == GS_LINE_CLASS) + { + // TODO: find point on line prependicular to (x,y), distance.x < 0.5f || distance.y < 0.5f + + continue; + } + else if(PRIM == 
GS_TRIANGLE_CLASS) + { + __global gs_barycentric* b = &barycentric[prim_index + i]; + + float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1); + + f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)); + + if(!all(f >= b->zero.xyz)) + { + continue; + } + + zf = convert_uint2(prim->v[0].p.zw * f.z + prim->v[1].p.zw * f.x + prim->v[2].p.zw * f.y); + + t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; + + if(IIP) + { + float4 c0 = convert_float4(prim->v[0].c); + float4 c1 = convert_float4(prim->v[1].c); + float4 c2 = convert_float4(prim->v[2].c); + + c = convert_int4(c0 * f.z + c1 * f.x + c2 * f.y); + } + else + { + c = convert_int4(prim->v[2].c); + } + } + else if(PRIM == GS_SPRITE_CLASS) + { + int2 tl = convert_int2_rtp(prim->v[0].p.xy); + int2 br = convert_int2_rtp(prim->v[1].p.xy); + + if(!all((pi >= tl) & (pi < br))) + { + continue; + } + + zf = convert_uint2(prim->v[1].p.zw); // TODO: send as uint + + t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy); + t.z = prim->v[0].tc.z; + + c = convert_int4(prim->v[1].c); + } + + // z test + + uint zs = zf.x; + + if(!ZTest(zs, zd)) + { + continue; + } + + // sample texture + + int4 ct; + + if(FB && TFX != TFX_NONE) + { + // TODO + + if(0)//if(MMIN) + { + // TODO + } + else + { + int2 uv; + + if(!FST) + { + uv = convert_int2(t.xy * (1.0f / t.z)); + + if(LTF) uv -= 0x0008; + } + else + { + uv = convert_int2(t.xy); + } + + int2 uvf = uv & 0x000f; + + int2 uv0 = uv >> 4; + int2 uv1 = uv0 + 1; + + uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS); + uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT); + uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS); + uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT); + + tex = vm; // TODO: use the texture cache + + int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb); + int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb); + int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb); + int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, 
pb); + + if(LTF) + { + c00 = ((c01 - c00) * uvf.x >> 4) + c00; + c10 = ((c11 - c10) * uvf.x >> 4) + c10; + c00 = ((c10 - c00) * uvf.y >> 4) + c00; + } + + ct = c00; + } + } + + // alpha tfx + + if(FB) + { + if(TCC) + { + switch(TFX) + { + case TFX_MODULATE: + c.w = clamp(ct.w * c.w >> 7, 0, 0xff); + break; + case TFX_DECAL: + c.w = ct.w; + break; + case TFX_HIGHLIGHT: + c.w = clamp(ct.w + c.w, 0, 0xff); + break; + case TFX_HIGHLIGHT2: + c.w = ct.w; + break; + } + } + + if(AA1) + { + if(!ABE || c.w == 0x80) + { + // TODO: c.w = coverage; // coverage 0x80 at 100% + } + } + } + + // read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway) + + uint fm = pb->fm; + uint zm = pb->zm; + + // alpha test + + if(!AlphaTest(c.w, pb->aref, &fm, &zm)) + { + continue; + } + + // all tests done, we have a new output + + fragments++; + + // write z + + if(ZWRITE) + { + zd = bitselect(zs, zd, zm); + } + + // rgb tfx + + if(FWRITE) + { + switch(TFX) + { + case TFX_MODULATE: + c.xyz = clamp(ct.xyz * c.xyz >> 7, 0, 0xff); + break; + case TFX_DECAL: + c.xyz = ct.xyz; + break; + case TFX_HIGHLIGHT: + case TFX_HIGHLIGHT2: + c.xyz = clamp((ct.xyz * c.xyz >> 7) + c.w, 0, 0xff); + break; + } + } + + // fog + + if(FWRITE && FGE) + { + int fog = (int)zf.y; + + c.xyz = (c.xyz * fog >> 8) + (convert_int4(pb->fog).xyz * (int3)(0xff - fog) >> 8); + } + + // alpha blend + + c = AlphaBlend(c, pb->afix, fd); + + // write frame + + if(FWRITE) + { + if(DTHE && is16bit(FPSM)) + { + // TODO: c += pb->dimx[y & 3] + } + + c = COLCLAMP ? clamp(c, 0, 0xff) : (c & 0xff); + + if(FBA && !is24bit(FPSM)) + { + c.w |= 0x80; + } + + uint fs; + + if(is32bit(FPSM)) + { + fs = (c.w << 24) | (c.z << 16) | (c.y << 8) | c.x; + } + else if(is24bit(FPSM)) + { + fs = (c.z << 16) | (c.y << 8) | c.x; + } + else if(is16bit(FPSM)) + { + fs = ((c.w & 0x80) << 8) | ((c.z & 0xf8) << 7) | ((c.y & 0xf8) << 2) | (c.x >> 3); + } + + fd = RFB ? 
bitselect(fs, fd, fm) : fs; + + // dest alpha test for the next loop + + if(!DestAlphaTest(fd)) + { + prim_index = prim_count; // game over + + break; + } + } + } + + bin += bin_count; + bin_value = *bin; + } + + if(fragments > 0) + { + // TODO: write color/z to faddr/zaddr (if 16x16 was cached, barrier local mem, swizzle back to its place) + + // TODO if(fm/zm != 0xffffffff) or whatever masks the output completely for the pixel format) + + if(ZWRITE) + { + WriteFrame(vm, zaddr, ZPSM, zd); + } + + if(FWRITE) + { + WriteFrame(vm, faddr, FPSM, fd); + //WriteFrame(vm, faddr, FPSM, 0xff202020 * fragments); + } + } +} + +#endif + +#endif diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index 02626f8dad..b9bfa19737 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -43,6 +43,7 @@ #include #include #include "../../common/include/comptr.h" +#include #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA) #define D3D11_SHADER_MACRO D3D10_SHADER_MACRO diff --git a/plugins/GSdx/vsprops/common.props b/plugins/GSdx/vsprops/common.props index 71bf73f281..91c58981d0 100644 --- a/plugins/GSdx/vsprops/common.props +++ b/plugins/GSdx/vsprops/common.props @@ -8,22 +8,22 @@ true - _WINDOWS;_WIN32_WINNT=0x500;%(PreprocessorDefinitions) + _WINDOWS;_WIN32_WINNT=0x500;__CL_ENABLE_EXCEPTIONS;%(PreprocessorDefinitions) Fast false Level4 ProgramDatabase 4996;4995;4324;4100;4101;4201;4556;4127;4512;%(DisableSpecificWarnings) - $(DXSDK_DIR)include;$(VTUNE_AMPLIFIER_XE_2013_DIR)include;$(SolutionDir)3rdparty;%(AdditionalIncludeDirectories) + $(DXSDK_DIR)include;$(INTELOCLSDKROOT)include;$(VTUNE_AMPLIFIER_XE_2015_DIR)include;$(SolutionDir)3rdparty;%(AdditionalIncludeDirectories) true - d3d11.lib;d3dx11.lib;d3d10_1.lib;d3dx10.lib;d3d9.lib;d3dx9.lib;dxgi.lib;dxguid.lib;winmm.lib;strmiids.lib;xinput.lib;opengl32.lib;comsuppw.lib;%(AdditionalDependencies) + 
d3d11.lib;d3dx11.lib;d3d10_1.lib;d3dx10.lib;d3d9.lib;d3dx9.lib;dxgi.lib;dxguid.lib;winmm.lib;strmiids.lib;xinput.lib;opengl32.lib;opencl.lib;comsuppw.lib;%(AdditionalDependencies) d3d9.dll;d3dx9_43.dll;d3d11.dll;d3dx11_43.dll;dxgi.dll;opengl32.dll;%(DelayLoadDLLs) true Windows false - $(VTUNE_AMPLIFIER_XE_2013_DIR)lib32;%(AdditionalLibraryDirectories) + $(VTUNE_AMPLIFIER_XE_2015_DIR)lib32;%(AdditionalLibraryDirectories) .\postBuild.cmd "$(TargetPath)" "$(TargetName)" $(TargetExt) $(PcsxSubsection) diff --git a/plugins/GSdx/vsprops/x64.props b/plugins/GSdx/vsprops/x64.props index 947675c717..ab8253d819 100644 --- a/plugins/GSdx/vsprops/x64.props +++ b/plugins/GSdx/vsprops/x64.props @@ -5,7 +5,7 @@ - $(DXSDK_DIR)Lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories) + $(DXSDK_DIR)Lib\x64;$(INTELOCLSDKROOT)lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories) _WIN64;%(PreprocessorDefinitions) diff --git a/plugins/GSdx/vsprops/x86.props b/plugins/GSdx/vsprops/x86.props index ff76b9535d..56171d784c 100644 --- a/plugins/GSdx/vsprops/x86.props +++ b/plugins/GSdx/vsprops/x86.props @@ -5,7 +5,7 @@ - $(DXSDK_DIR)Lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories) + $(DXSDK_DIR)Lib\x86;$(INTELOCLSDKROOT)lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories) From 881735b56268e76ec8a4b817ebd3fa7601160c12 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Mon, 15 Sep 2014 15:59:50 +0200 Subject: [PATCH 02/15] minor merging errors --- plugins/GSdx/res/shadeboost.fx | 2 +- plugins/GSdx/resource.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/GSdx/res/shadeboost.fx b/plugins/GSdx/res/shadeboost.fx index 5f3c2474a0..890a68a948 100644 --- a/plugins/GSdx/res/shadeboost.fx +++ b/plugins/GSdx/res/shadeboost.fx @@ -73,4 +73,4 @@ float4 ps_main(PS_INPUT input) : COLOR } #endif -#endif \ No newline at end of file +#endif diff --git a/plugins/GSdx/resource.h b/plugins/GSdx/resource.h index 24ed550f41..b6502c2aef 100644 --- 
a/plugins/GSdx/resource.h +++ b/plugins/GSdx/resource.h @@ -81,6 +81,7 @@ #define IDR_CS_FX 10006 #define IDD_SHADEBOOST 10007 #define IDR_SHADEBOOST_FX 10009 +#define IDR_TFX_CL 10010 #define IDD_HACKS 10011 #define IDC_STATIC -1 From ba1e522bbb63fb7196fa287018ddfe4f40108575 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Tue, 16 Sep 2014 05:37:06 +0200 Subject: [PATCH 03/15] Less opencl bugs, some games are actually playable now, there are still texture errors. --- plugins/GSdx/GSDrawScanline.cpp | 2 +- plugins/GSdx/GSLocalMemory.cpp | 42 ++++ plugins/GSdx/GSLocalMemory.h | 1 + plugins/GSdx/GSRendererCL.cpp | 383 ++++++++++++++++---------------- plugins/GSdx/GSRendererCL.h | 93 +++++--- plugins/GSdx/GSState.cpp | 4 + plugins/GSdx/res/tfx.cl | 30 ++- 7 files changed, 325 insertions(+), 230 deletions(-) diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index 568a9c2bc3..5e3c4fa705 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -2711,7 +2711,7 @@ bool GSDrawScanline::TestAlpha(T& test, T& fm, T& zm, const T& ga) case AFAIL_RGB_ONLY: zm |= t; - fm |= t & T::xff000000(); + fm |= t & T::xff000000(); // fpsm 16 bit => & 0xffff8000? break; default: diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index 757463d134..5519dfebb6 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -2082,3 +2082,45 @@ uint32* GSOffset::GetPages(const GSVector4i& rect, uint32* pages, GSVector4i* bb return pages; } + +GSVector4i* GSOffset::GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages, GSVector4i* bbox) +{ + if(pages == NULL) + { + pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); + } + + pages[0] = GSVector4i::zero(); + pages[1] = GSVector4i::zero(); + pages[2] = GSVector4i::zero(); + pages[3] = GSVector4i::zero(); + + GSVector2i bs = (bp & 31) == 0 ? 
GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs; + + GSVector4i r = rect.ralign(bs); + + if(bbox != NULL) *bbox = r; + + r = r.sra32(3); + + bs.x >>= 3; + bs.y >>= 3; + + for(int y = r.top; y < r.bottom; y += bs.y) + { + uint32 base = block.row[y]; + + for(int x = r.left; x < r.right; x += bs.x) + { + uint32 n = (base + block.col[x]) >> 5; + + if(n < MAX_PAGES) + { + ((uint32*)pages)[n >> 5] |= 1 << (n & 31); + } + } + } + + return pages; + +} \ No newline at end of file diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h index ea83bfd53b..70e171090f 100644 --- a/plugins/GSdx/GSLocalMemory.h +++ b/plugins/GSdx/GSLocalMemory.h @@ -54,6 +54,7 @@ public: enum {EOP = 0xffffffff}; uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL); + GSVector4i* GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages = NULL, GSVector4i* bbox = NULL); // free returned value with _aligned_free }; struct GSPixelOffset diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 357a2a3e23..a08ee02e1f 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -77,18 +77,16 @@ GSRendererCL::GSRendererCL() { m_nativeres = true; // ignore ini, sw is always native - //s_dump = 1; - //s_save = 1; - //s_savez = 1; - - // TODO: m_tc = new GSTextureCacheCL(this); - memset(m_texture, 0, sizeof(m_texture)); m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); - memset(m_rw_pages, 0, sizeof(m_rw_pages)); - memset(m_tex_pages, 0, sizeof(m_tex_pages)); + for(int i = 0; i < 4; i++) + { + m_rw_pages[0][i] = GSVector4i::zero(); + m_rw_pages[1][i] = GSVector4i::zero(); + m_tc_pages[i] = GSVector4i::xffffffff(); + } #define InitCVB(P) \ m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer; \ @@ -107,8 +105,6 @@ GSRendererCL::GSRendererCL() GSRendererCL::~GSRendererCL() { - // TODO: delete m_tc; - for(size_t i = 0; i < countof(m_texture); i++) { delete m_texture[i]; @@ -121,18 
+117,21 @@ void GSRendererCL::Reset() { Sync(-1); - // TODO: m_tc->RemoveAll(); - GSRenderer::Reset(); } +static int pageuploads = 0; +static int pageuploadcount = 0; +static int tfxcount = 0; + void GSRendererCL::VSync(int field) { - Sync(0); // IncAge might delete a cached texture in use + Sync(0); GSRenderer::VSync(field); - // TODO: m_tc->IncAge(); + printf("vsync %d/%d/%d\n", pageuploads, pageuploadcount, tfxcount); + pageuploads = pageuploadcount = tfxcount = 0; //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); } @@ -364,7 +363,7 @@ void GSRendererCL::Draw() { // only allow batches of the same primclass in Enqueue - if(!m_jobs.empty() && m_jobs.front().sel.prim != (uint32)m_vt.m_primclass) + if(!m_jobs.empty() && m_jobs.front()->sel.prim != (uint32)m_vt.m_primclass) { Enqueue(); } @@ -372,6 +371,16 @@ void GSRendererCL::Draw() // + shared_ptr job(new TFXJob()); + + job->rect.x = rect.x; + job->rect.y = rect.y; + job->rect.z = rect.z; + job->rect.w = rect.w; + job->ib_start = m_cl.ib.tail; + job->ib_count = m_index.tail; + job->pb_start = m_cl.pb.tail; + GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail); uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail); TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail); @@ -402,21 +411,12 @@ void GSRendererCL::Draw() m_vb_count += m_vertex.next; - if(!SetupParameter(pb, vb, m_vertex.next, m_index.buff, m_index.tail)) + if(!SetupParameter(job.get(), pb, vb, m_vertex.next, m_index.buff, m_index.tail)) { return; } - TFXJob job; - - job.rect.x = rect.x; - job.rect.y = rect.y; - job.rect.z = rect.z; - job.rect.w = rect.w; - job.sel = pb->sel; - job.ib_start = m_cl.ib.tail; - job.ib_count = m_index.tail; - job.pb_start = m_cl.pb.tail; + job->sel = pb->sel; m_jobs.push_back(job); @@ -424,29 +424,67 @@ void GSRendererCL::Draw() m_cl.ib.tail += ib_size; m_cl.pb.tail += pb_size; - // mark pages for writing + // mark pages used in rendering as source or target - if(pb->sel.fb) + if(pb->sel.fwrite || 
pb->sel.rfb) { - uint8 flag = pb->sel.fb; + m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages); - const uint32* pages = m_context->offset.fb->GetPages(rect, m_tmp_pages); - - for(const uint32* p = pages; *p != GSOffset::EOP; p++) + if(pb->sel.rfb) { - m_rw_pages[*p] |= flag; + for(int i = 0; i < 4; i++) + { + m_rw_pages[0][i] |= m_tmp_pages[i]; + } + } + + if(pb->sel.fwrite) + { + for(int i = 0; i < 4; i++) + { + m_rw_pages[1][i] |= m_tmp_pages[i]; + } + } + + GSVector4i* dst_pages = job->GetDstPages(); + + if(pb->sel.fwrite) + { + for(int i = 0; i < 4; i++) + { + dst_pages[i] |= m_tmp_pages[i]; + } } } - if(pb->sel.zb) + if(pb->sel.zwrite || pb->sel.rzb) { - uint8 flag = pb->sel.zb; + m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages); - const uint32* pages = m_context->offset.zb->GetPages(rect, m_tmp_pages); - - for(const uint32* p = pages; *p != GSOffset::EOP; p++) + if(pb->sel.rzb) { - m_rw_pages[*p] |= flag; + for(int i = 0; i < 4; i++) + { + m_rw_pages[0][i] |= m_tmp_pages[i]; + } + } + + if(pb->sel.zwrite) + { + for(int i = 0; i < 4; i++) + { + m_rw_pages[1][i] |= m_tmp_pages[i]; + } + } + + GSVector4i* dst_pages = job->GetDstPages(); + + if(pb->sel.zwrite) + { + for(int i = 0; i < 4; i++) + { + dst_pages[i] |= m_tmp_pages[i]; + } } } @@ -456,52 +494,6 @@ void GSRendererCL::Draw() { Enqueue(); } - - /* - // check if the texture is not part of a target currently in use - - if(CheckSourcePages(data)) - { - Sync(4); - } - - // addref source and target pages - - data->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm); - */ - - // update previously invalidated parts - - //data->UpdateSource(); - /* - if(LOG) - { - fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n", - sd->counter, - m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite, - m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite, - PRIM->TME ? 
m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA, - PRIM->PRIM, sd->vertex_count, sd->index_count); - - fflush(s_fp); - } - */ - - //printf("q %p %d (%d %d %d %d)\n", pb, pb->ib_count, r.x, r.y, r.z, r.w); - - /* - // invalidate new parts rendered onto - - if(sd->global.sel.fwrite) - { - m_tc->InvalidatePages(sd->m_fb_pages, sd->m_fpsm); - } - - if(sd->global.sel.zwrite) - { - m_tc->InvalidatePages(sd->m_zb_pages, sd->m_zpsm); - } - */ } catch(cl::Error err) { @@ -552,8 +544,11 @@ void GSRendererCL::Sync(int reason) m_cl.queue[2].finish(); - memset(m_rw_pages, 0, sizeof(m_rw_pages)); - memset(m_tex_pages, 0, sizeof(m_tex_pages)); + for(int i = 0; i < 4; i++) + { + m_rw_pages[0][i] = GSVector4i::zero(); + m_rw_pages[1][i] = GSVector4i::zero(); + } // TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, // though it does not seem to be necessary even with GPU devices where it might be cached, @@ -569,13 +564,15 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); - o->GetPages(r, m_tmp_pages); + o->GetPagesAsBits(r, m_tmp_pages); //if(!synced) { - for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) + for(int i = 0; i < 4; i++) { - if(m_rw_pages[*p] & 3) // rw + GSVector4i pages = m_rw_pages[0][i] | m_rw_pages[1][i]; + + if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero())) { Sync(3); @@ -584,9 +581,9 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS } } - for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) + for(int i = 0; i < 4; i++) { - m_tex_pages[*p] = 1; + m_tc_pages[i] |= m_tmp_pages[i]; } } @@ -598,11 +595,13 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS { GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, 
BITBLTBUF.SBW, BITBLTBUF.SPSM); - o->GetPages(r, m_tmp_pages); + o->GetPagesAsBits(r, m_tmp_pages); - for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) + for(int i = 0; i < 4; i++) { - if(m_rw_pages[*p] & 1) // w + GSVector4i pages = m_rw_pages[1][i]; + + if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero())) { Sync(4); @@ -611,34 +610,6 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS } } } -/* -bool GSRendererCL::CheckSourcePages(RasterizerData* data) -{ - // TODO: if(!m_rl->IsSynced()) // TODO: all callbacks from the issued drawings reported in => in-sync - { - for(size_t i = 0; data->m_tex[i].t != NULL; i++) - { - data->m_tex[i].t->m_offset->GetPages(data->m_tex[i].r, m_tmp_pages); - - uint32* pages = m_tmp_pages; // data->m_tex[i].t->m_pages.n; - - for(const uint32* p = pages; *p != GSOffset::EOP; p++) - { - // TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2) - - if(m_fzb_pages[*p]) // currently being drawn to? => sync - { - return true; - } - } - } - } - - return false; -} -*/ - -//#include "GSTextureCL.h" void GSRendererCL::Enqueue() { @@ -650,7 +621,7 @@ void GSRendererCL::Enqueue() ASSERT(m_cl.ib.tail > m_cl.ib.head); ASSERT(m_cl.pb.tail > m_cl.pb.head); - int primclass = m_jobs.front().sel.prim; + int primclass = m_jobs.front()->sel.prim; uint32 n; @@ -724,8 +695,8 @@ void GSRendererCL::Enqueue() { auto job = next++; - uint32 cur_prim_count = job->ib_count / n; - uint32 next_prim_count = next != m_jobs.end() ? next->ib_count / n : 0; + uint32 cur_prim_count = (*job)->ib_count / n; + uint32 next_prim_count = next != m_jobs.end() ? 
(*next)->ib_count / n : 0; total_prim_count += cur_prim_count; @@ -734,7 +705,7 @@ void GSRendererCL::Enqueue() uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT); pk.setArg(3, (cl_uint)m_vb_start); - pk.setArg(4, (cl_uint)head->ib_start); + pk.setArg(4, (cl_uint)(*head)->ib_start); m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange); @@ -748,7 +719,7 @@ void GSRendererCL::Enqueue() for(auto i = head; i != next; i++) { - rect = rect.runion(GSVector4i::load(&i->rect)); + rect = rect.runion(GSVector4i::load(&(*i)->rect)); } rect = rect.ralign(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS; @@ -829,14 +800,40 @@ void GSRendererCL::Enqueue() { ASSERT(prim_start < MAX_PRIM_COUNT); - uint32 prim_count_inner = std::min(i->ib_count / n, MAX_PRIM_COUNT - prim_start); + uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start); - // TODO: update the needed pages of the texture cache buffer with enqueueCopyBuffer (src=this->vm, dst=this->vm_text), - // changed by tfx in the previous loop or marked by InvalidateVideoMem + tfxcount++; + if((*i)->src_pages != NULL) + { + int count = 0; + + for(int j = 0; j < 4; j++) + { + GSVector4i pages = m_tc_pages[j] & (*i)->src_pages[j]; + + if(!pages.eq(GSVector4i::zero())) + { + // TODO: update texture cache with pages where the bits are set, enqueueCopyBuffer or "memcpy" kernel (src=this->vm, dst=this->tex) + // TODO: only use the texture cache if there is an overlap between src_pages and dst_pages? 
(or if already uploaded) + + for(int ii = 0; ii < 4; ii++) + for(int jj = 0; jj < 32; jj++) + if(pages.u32[ii] & (1 << jj)) count++; + + m_tc_pages[j] &= ~(*i)->src_pages[j]; + } + } + + if(count > 0) + { + pageuploads += count; + pageuploadcount++; + } + } // TODO: tile level z test - cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel); + cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel); if(tfx_prev != tfx()) { @@ -845,28 +842,32 @@ void GSRendererCL::Enqueue() tfx_prev = tfx(); } - tfx.setArg(4, (cl_uint)i->pb_start); + tfx.setArg(4, (cl_uint)(*i)->pb_start); tfx.setArg(5, (cl_uint)prim_start); tfx.setArg(6, (cl_uint)prim_count_inner); tfx.setArg(7, (cl_uint)batch_count); tfx.setArg(8, (cl_uint)bin_count); tfx.setArg(9, bin_dim); - //m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NullRange, cl::NDRange(std::min(bin_count * 4, CUs) * 256), cl::NDRange(256)); - - //printf("%d %d %d %d\n", rect.width() << BIN_SIZE_BITS, rect.height() << BIN_SIZE_BITS, i->rect.z - i->rect.x, i->rect.w - i->rect.y); - - GSVector4i r = GSVector4i::load(&i->rect); + GSVector4i r = GSVector4i::load(&(*i)->rect); r = r.ralign(GSVector2i(BIN_SIZE, BIN_SIZE)); /* - if(i->sel.IsSolidRect()) // TODO: simple mem fill + if(i->sel.IsSolidRect()) // TODO: simple mem fill with optional mask ;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height()); else */ m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16)); - // TODO: invalidate texture cache pages + if((*i)->dst_pages != NULL) + { + for(int j = 0; j < 4; j++) + { + m_tc_pages[j] |= (*i)->dst_pages[j]; + } + } + + // TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily prim_start += prim_count_inner; } @@ -877,10 +878,12 @@ void GSRendererCL::Enqueue() { prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT); - job->ib_start += prim_count * n * sizeof(uint32); - job->ib_count -= prim_count * n; + (*job)->ib_start += prim_count * n * 
sizeof(uint32); + (*job)->ib_count -= prim_count * n; next = job; // try again for the reminder + + printf("split %d\n", (*job)->ib_count / n); } break; @@ -929,7 +932,7 @@ static int RemapPSM(int psm) return psm; } -bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count) +bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count) { const GSDrawingEnvironment& env = m_env; const GSDrawingContext* context = m_context; @@ -970,7 +973,7 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v } bool fwrite; - bool zwrite; + bool zwrite = zm != 0xffffffff; switch(context->FRAME.PSM) { @@ -991,26 +994,6 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v break; } - switch(context->ZBUF.PSM) - { - default: - case PSM_PSMCT32: - case PSM_PSMZ32: - zwrite = zm != 0xffffffff; - break; - case PSM_PSMCT24: - case PSM_PSMZ24: - zwrite = (zm & 0x00ffffff) != 0x00ffffff; - break; - case PSM_PSMCT16: - case PSM_PSMCT16S: - case PSM_PSMZ16: - case PSM_PSMZ16S: - zm &= 0x80f8f8f8; - zwrite = (zm & 0x80f8f8f8) != 0x80f8f8f8; - break; - } - if(!fwrite && !zwrite) return false; bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; @@ -1061,19 +1044,21 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v pb->sel.tfx = TFX_DECAL; } - // TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(context->TEX0, env.TEXA); - - // TODO: if(t == NULL) {ASSERT(0); return false;} - GSVector4i r; GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf); - // TODO: data->SetSource(t, r, 0); + GSVector4i* src_pages = job->GetSrcPages(); - // TODO: pb->sel.tw = t->m_tw - 3; + GSOffset* o = m_mem.GetOffset(context->TEX0.TBP0, context->TEX0.TBW, context->TEX0.PSM); + + o->GetPagesAsBits(r, m_tmp_pages); 
- // TODO: store r to current job + for(int i = 0; i < 4; i++) + { + src_pages[i] |= m_tmp_pages[i]; + m_rw_pages[0][i] |= m_tmp_pages[i]; + } if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0) { @@ -1195,17 +1180,19 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v m_vt.m_min.t *= 0.5f; m_vt.m_max.t *= 0.5f; - // TODO: GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, pb->sel.tw + 3); - - // TODO: if(t == NULL) {ASSERT(0); return false;} - GSVector4i r; GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf); - // TODO: data->SetSource(t, r, i); + GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM); + + o->GetPagesAsBits(r, m_tmp_pages); - // TODO: store r to current job + for(int i = 0; i < 4; i++) + { + src_pages[i] |= m_tmp_pages[i]; + m_rw_pages[0][i] |= m_tmp_pages[i]; + } } s_counter++; @@ -1361,15 +1348,23 @@ bool GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v pb->afix = context->ALPHA.FIX; } - if(pb->sel.date - || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1 - || pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY - || (pb->sel.fpsm & 3) == 0 && fwrite && fm != 0 - || (pb->sel.fpsm & 3) == 1 && fwrite // always read-merge-write 24bpp, regardless the mask - || (pb->sel.fpsm & 3) >= 2 && fwrite && (fm & 0x80f8f8f8) != 0) + if(pb->sel.date || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1) { pb->sel.rfb = 1; } + else + { + if(fwrite) + { + if(pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY + || (pb->sel.fpsm & 3) == 0 && fm != 0 + || (pb->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask + || (pb->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0) + { + pb->sel.rfb = 1; + } + } + } pb->sel.colclamp = env.COLCLAMP.CLAMP; pb->sel.fba = context->FBA.FBA; @@ -1391,7 +1386,22 @@ bool 
GSRendererCL::SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t v { pb->sel.zpsm = RemapPSM(context->ZBUF.PSM); pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS; - pb->sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000; + + if(ztest) + { + pb->sel.rzb = 1; + } + else + { + if(zwrite) + { + if(pb->sel.atst != ATST_ALWAYS && (pb->sel.afail == AFAIL_FB_ONLY || pb->sel.afail == AFAIL_RGB_ONLY) + || (pb->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask + { + pb->sel.rzb = 1; + } + } + } } pb->fm = fm; @@ -1732,7 +1742,7 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel) opt << "-D RFB=" << sel.rfb << " "; opt << "-D ZWRITE=" << sel.zwrite << " "; opt << "-D ZTEST=" << sel.ztest << " "; - opt << "-D ZOVERFLOW=" << sel.zoverflow << " "; + opt << "-D RZB=" << sel.rzb << " "; opt << "-D WMS=" << sel.wms << " "; opt << "-D WMT=" << sel.wmt << " "; opt << "-D DATM=" << sel.datm << " "; @@ -1740,7 +1750,6 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel) opt << "-D FBA=" << sel.fba << " "; opt << "-D DTHE=" << sel.dthe << " "; opt << "-D PRIM=" << sel.prim << " "; - opt << "-D TW=" << sel.tw << " "; opt << "-D LCM=" << sel.lcm << " "; opt << "-D MMIN=" << sel.mmin << " "; opt << "-D NOSCISSOR=" << sel.noscissor << " "; @@ -1751,6 +1760,8 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel) AddDefs(opt); + printf("building kernel (%s)\n", entry); + program.build(opt.str().c_str()); } catch(cl::Error err) diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h index 3ac008ceaa..ab9961dd95 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -64,18 +64,6 @@ class GSRendererCL : public GSRenderer operator uint32() const { return key; } }; - union JobSelector - { - struct - { - uint32 dummy:1; // 0 - }; - - uint32 key; - - operator uint32() const { return key; } - }; - union TFXSelector { struct @@ -106,7 +94,7 @@ class GSRendererCL : 
public GSRenderer uint32 rfb:1; // 36 uint32 zwrite:1; // 37 uint32 ztest:1; // 38 - uint32 zoverflow:1; // 39 (z max >= 0x80000000) + uint32 rzb:1; // 39 uint32 wms:2; // 40 uint32 wmt:2; // 42 uint32 datm:1; // 44 @@ -114,12 +102,11 @@ class GSRendererCL : public GSRenderer uint32 fba:1; // 46 uint32 dthe:1; // 47 uint32 prim:2; // 48 - uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3) - uint32 lcm:1; // 53 - uint32 mmin:2; // 54 - uint32 noscissor:1; // 55 - uint32 tpsm:4; // 56 - uint32 aem:1; // 60 + uint32 lcm:1; // 50 + uint32 mmin:2; // 51 + uint32 noscissor:1; // 53 + uint32 tpsm:4; // 54 + uint32 aem:1; // 58 // TODO }; @@ -177,12 +164,57 @@ class GSRendererCL : public GSRenderer uint32 clut[256]; }; - struct TFXJob + class TFXJob { - struct {int x, y, z, w;} rect; - TFXSelector sel; + public: + struct { int x, y, z, w; } rect; + TFXSelector sel; // uses primclass, solidrect only uint32 ib_start, ib_count; uint32 pb_start; + GSVector4i* src_pages; // read by any texture level + GSVector4i* dst_pages; // f/z writes to it + + TFXJob() + : src_pages(NULL) + , dst_pages(NULL) + { + } + + virtual ~TFXJob() + { + if(src_pages != NULL) _aligned_free(src_pages); + if(dst_pages != NULL) _aligned_free(dst_pages); + } + + GSVector4i* GetSrcPages() + { + if(src_pages == NULL) + { + src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); + + src_pages[0] = GSVector4i::zero(); + src_pages[1] = GSVector4i::zero(); + src_pages[2] = GSVector4i::zero(); + src_pages[3] = GSVector4i::zero(); + } + + return src_pages; + } + + GSVector4i* GetDstPages() + { + if(dst_pages == NULL) + { + dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); + + dst_pages[0] = GSVector4i::zero(); + dst_pages[1] = GSVector4i::zero(); + dst_pages[2] = GSVector4i::zero(); + dst_pages[3] = GSVector4i::zero(); + } + + return dst_pages; + } }; class CL @@ -217,7 +249,7 @@ class GSRendererCL : public GSRenderer }; CL m_cl; - 
std::list m_jobs; + std::list> m_jobs; uint32 m_vb_start; uint32 m_vb_count; @@ -282,10 +314,10 @@ protected: // GSTextureCacheCL* m_tc; GSTexture* m_texture[2]; uint8* m_output; - - uint8 m_rw_pages[512]; // TODO: bit array for faster clearing (bit 0: write, bit 1: read) - uint8 m_tex_pages[512]; - uint32 m_tmp_pages[512 + 1]; + + GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r + GSVector4i m_tc_pages[4]; // invalidated texture cache pages + GSVector4i m_tmp_pages[4]; void Reset(); void VSync(int field); @@ -297,12 +329,7 @@ protected: void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false); - void UsePages(const uint32* pages, int type); - void ReleasePages(const uint32* pages, int type); - - //bool CheckSourcePages(RasterizerData* data); - - bool SetupParameter(TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count); + bool SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count); public: GSRendererCL(); diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index 731b2aa480..10917b4fb7 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -47,6 +47,10 @@ GSState::GSState() s_savez = !!theApp.GetConfig("savez", 0); s_saven = theApp.GetConfig("saven", 0); + //s_dump = 1; + //s_save = 1; + //s_savez = 1; + UserHacks_AggressiveCRC = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_AggressiveCRC", 0) : 0; UserHacks_DisableCrcHacks = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig( "UserHacks_DisableCrcHacks", 0 ) : 0; UserHacks_WildHack = !!theApp.GetConfig("UserHacks", 0) ? 
theApp.GetConfig("UserHacks_WildHack", 0) : 0; diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index d28622380e..007f805d9a 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -968,7 +968,8 @@ bool AlphaTest(int alpha, int aref, uint* fm, uint* zm) *fm |= pass ? 0 : 0xffffffff; break; case AFAIL_RGB_ONLY: - *fm |= pass ? 0 : 0xff000000; + if(is32bit(FPSM)) *fm |= pass ? 0 : 0xff000000; + if(is16bit(FPSM)) *fm |= pass ? 0 : 0xffff8000; *zm |= pass ? 0 : 0xffffffff; break; } @@ -1248,7 +1249,7 @@ __kernel void KERNEL_TFX( fd = ReadFrame(vm, faddr, FPSM); } - if(ZTEST) + if(RZB) { zd = ReadFrame(vm, zaddr, ZPSM); } @@ -1284,7 +1285,7 @@ __kernel void KERNEL_TFX( wait_group_events(1, &e); } - if(ZTEST) + if(RZB) { event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0); @@ -1409,7 +1410,7 @@ __kernel void KERNEL_TFX( int4 ct; - if(FB && TFX != TFX_NONE) + if(TFX != TFX_NONE) { // TODO @@ -1423,13 +1424,20 @@ __kernel void KERNEL_TFX( if(!FST) { - uv = convert_int2(t.xy * (1.0f / t.z)); + uv = convert_int2_rte(t.xy * (1.0f / t.z));// * native_recip(t.z)); if(LTF) uv -= 0x0008; } else { - uv = convert_int2(t.xy); + // sfex capcom logo third drawing call at (0,223) calculated as: + // t0 + (p - p0) * (t - t0) / (p1 - p0) + // 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112 + // due to rounding errors (multiply-add instruction maybe): + // t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation + // TODO: sw renderer samples at 112 exactly, check which one is correct + + uv = convert_int2(t.xy); } int2 uvf = uv & 0x000f; @@ -1462,6 +1470,8 @@ __kernel void KERNEL_TFX( // alpha tfx + int alpha = c.w; + if(FB) { if(TCC) @@ -1512,7 +1522,7 @@ __kernel void KERNEL_TFX( if(ZWRITE) { - zd = bitselect(zs, zd, zm); + zd = RZB ? 
bitselect(zs, zd, zm) : zs; } // rgb tfx @@ -1529,7 +1539,7 @@ __kernel void KERNEL_TFX( break; case TFX_HIGHLIGHT: case TFX_HIGHLIGHT2: - c.xyz = clamp((ct.xyz * c.xyz >> 7) + c.w, 0, 0xff); + c.xyz = clamp((ct.xyz * c.xyz >> 7) + alpha, 0, 0xff); break; } } @@ -1553,10 +1563,10 @@ __kernel void KERNEL_TFX( { if(DTHE && is16bit(FPSM)) { - // TODO: c += pb->dimx[y & 3] + c.xyz += pb->dimx[y & 3][x & 3]; } - c = COLCLAMP ? clamp(c, 0, 0xff) : (c & 0xff); + c = COLCLAMP ? clamp(c, 0, 0xff) : c & 0xff; if(FBA && !is24bit(FPSM)) { From c64f9ad9b1b53db58876405ec9a42724673fd1ca Mon Sep 17 00:00:00 2001 From: gabest11 Date: Wed, 17 Sep 2014 08:52:25 +0200 Subject: [PATCH 04/15] squishing opencl bugs, there aren't many left hopefully --- plugins/GSdx/GSRendererCL.cpp | 430 ++++++++++++++++++++-------------- plugins/GSdx/GSRendererCL.h | 15 +- plugins/GSdx/res/tfx.cl | 28 ++- 3 files changed, 278 insertions(+), 195 deletions(-) diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index a08ee02e1f..05cc2213d4 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -126,11 +126,9 @@ static int tfxcount = 0; void GSRendererCL::VSync(int field) { - Sync(0); - GSRenderer::VSync(field); - printf("vsync %d/%d/%d\n", pageuploads, pageuploadcount, tfxcount); + //printf("vsync %d/%d/%d\n", pageuploads, pageuploadcount, tfxcount); pageuploads = pageuploadcount = tfxcount = 0; //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); @@ -371,24 +369,10 @@ void GSRendererCL::Draw() // - shared_ptr job(new TFXJob()); - - job->rect.x = rect.x; - job->rect.y = rect.y; - job->rect.z = rect.z; - job->rect.w = rect.w; - job->ib_start = m_cl.ib.tail; - job->ib_count = m_index.tail; - job->pb_start = m_cl.pb.tail; - GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail); uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail); TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail); - pb->scissor = scissor; - pb->bbox = bbox; - pb->rect = 
rect; - (this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(vb, m_vertex.buff, m_vertex.next); // TODO: upload in GSVertex format and extract the fields in the kernel? if(m_jobs.empty()) @@ -396,6 +380,7 @@ void GSRendererCL::Draw() memcpy(ib, m_index.buff, m_index.tail * sizeof(uint32)); m_vb_start = m_cl.vb.tail; + m_vb_count = 0; } else { @@ -409,17 +394,45 @@ void GSRendererCL::Draw() } } - m_vb_count += m_vertex.next; + shared_ptr job(new TFXJob()); if(!SetupParameter(job.get(), pb, vb, m_vertex.next, m_index.buff, m_index.tail)) { return; } + pb->scissor = scissor; + + if(bbox.eq(bbox.rintersect(scissor))) + { + pb->sel.noscissor = 1; + } + job->sel = pb->sel; + job->rect.x = rect.x; + job->rect.y = rect.y; + job->rect.z = rect.z; + job->rect.w = rect.w; + job->ib_start = m_cl.ib.tail; + job->ib_count = m_index.tail; + job->pb_start = m_cl.pb.tail; + +#ifdef DEBUG + job->fbp = context->FRAME.Block(); + job->fbw = context->FRAME.FBW; + job->fpsm = context->FRAME.PSM; + job->zbp = context->ZBUF.Block(); + job->tbp = PRIM->TME ? context->TEX0.TBP0 : 0xfffff; + job->tbw = PRIM->TME ? context->TEX0.TBW : 1; + job->tpsm = PRIM->TME ? context->TEX0.PSM : 0; + job->tw = PRIM->TME ? context->TEX0.TW : 0; + job->th = PRIM->TME ? 
context->TEX0.TH : 0; +#endif m_jobs.push_back(job); + m_vb_count += m_vertex.next; + m_cl.vb.tail += vb_size; m_cl.ib.tail += ib_size; m_cl.pb.tail += pb_size; @@ -444,12 +457,9 @@ void GSRendererCL::Draw() { m_rw_pages[1][i] |= m_tmp_pages[i]; } - } - GSVector4i* dst_pages = job->GetDstPages(); + GSVector4i* dst_pages = job->GetDstPages(); - if(pb->sel.fwrite) - { for(int i = 0; i < 4; i++) { dst_pages[i] |= m_tmp_pages[i]; @@ -475,12 +485,9 @@ void GSRendererCL::Draw() { m_rw_pages[1][i] |= m_tmp_pages[i]; } - } - GSVector4i* dst_pages = job->GetDstPages(); + GSVector4i* dst_pages = job->GetDstPages(); - if(pb->sel.zwrite) - { for(int i = 0; i < 4; i++) { dst_pages[i] |= m_tmp_pages[i]; @@ -488,6 +495,19 @@ void GSRendererCL::Draw() } } + if(job->src_pages != NULL) + { + for(int i = 0; i < 4; i++) + { + m_rw_pages[0][i] |= job->src_pages[i]; + + if(job->dst_pages != NULL && !(job->dst_pages[i] & job->src_pages[i]).eq(GSVector4i::zero())) + { + //printf("src and dst overlap!\n"); + } + } + } + // don't buffer too much data, feed them to the device if there is enough if(m_cl.vb.tail - m_cl.vb.head >= 256 * 4096 || m_jobs.size() >= 64) @@ -536,6 +556,8 @@ void GSRendererCL::Draw() void GSRendererCL::Sync(int reason) { + if(LOG) { fprintf(s_fp, "Sync (%d)\n", reason); fflush(s_fp); } + //printf("sync %d\n", reason); GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync); @@ -574,7 +596,12 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero())) { - Sync(3); + // TODO: an awesome idea to avoid this Sync + // - call Enqueue() to flush m_jobs + // - append rendering queue with a kernel that writes the incoming data to m_mem.vm and tell the parent class to not do it + // - the only problem, clut has to be read directly by the texture sampler, can't attach it to gs_param before being written + + Sync(3); break; } @@ -800,36 +827,28 @@ void GSRendererCL::Enqueue() { ASSERT(prim_start < 
MAX_PRIM_COUNT); + tfxcount++; + + //if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); } + + UpdateTextureCache((*i).get()); + uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start); - tfxcount++; - if((*i)->src_pages != NULL) + /* + if(m_perfmon.GetFrame() >= 5036) if((*i)->src_pages != NULL) { - int count = 0; + m_cl.queue[2].finish(); - for(int j = 0; j < 4; j++) - { - GSVector4i pages = m_tc_pages[j] & (*i)->src_pages[j]; + uint64 frame = m_perfmon.GetFrame(); - if(!pages.eq(GSVector4i::zero())) - { - // TODO: update texture cache with pages where the bits are set, enqueueCopyBuffer or "memcpy" kernel (src=this->vm, dst=this->tex) - // TODO: only use the texture cache if there is an overlap between src_pages and dst_pages? (or if already uploaded) + std::string s; - for(int ii = 0; ii < 4; ii++) - for(int jj = 0; jj < 32; jj++) - if(pages.u32[ii] & (1 << jj)) count++; + s = format("c:\\temp1\\_%05d_f%lld_tex2_%05x_%d.bmp", s_n++, frame, (*i)->tbp, (*i)->tpsm); - m_tc_pages[j] &= ~(*i)->src_pages[j]; - } - } - - if(count > 0) - { - pageuploads += count; - pageuploadcount++; - } + m_mem.SaveBMP(s, (*i)->tbp, (*i)->tbw, (*i)->tpsm, 1 << (*i)->tw, 1 << (*i)->th); } + */ // TODO: tile level z test @@ -859,13 +878,22 @@ void GSRendererCL::Enqueue() */ m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16)); - if((*i)->dst_pages != NULL) + /* + if(m_perfmon.GetFrame() >= 5036) { - for(int j = 0; j < 4; j++) - { - m_tc_pages[j] |= (*i)->dst_pages[j]; - } + m_cl.queue[2].finish(); + + uint64 frame = m_perfmon.GetFrame(); + + std::string s; + + s = format("c:\\temp1\\_%05d_f%lld_rt2_%05x_%d.bmp", s_n++, frame, (*i)->fbp, (*i)->fpsm); + + m_mem.SaveBMP(s, (*i)->fbp, (*i)->fbw, (*i)->fpsm, GetFrameRect().width(), 512); } + */ + + InvalidateTextureCache((*i).get()); // TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate 
pages unnecessarily @@ -881,9 +909,9 @@ void GSRendererCL::Enqueue() (*job)->ib_start += prim_count * n * sizeof(uint32); (*job)->ib_count -= prim_count * n; - next = job; // try again for the reminder + next = job; // try again for the remainder - printf("split %d\n", (*job)->ib_count / n); + //printf("split %d\n", (*job)->ib_count / n); } break; @@ -909,6 +937,96 @@ void GSRendererCL::Enqueue() m_cl.Map(); } +void GSRendererCL::UpdateTextureCache(TFXJob* job) +{ + if(job->src_pages == NULL) return; + + int count = 0; + + for(int i = 0; i < 4; i++) + { + GSVector4i pages = m_tc_pages[i] & job->src_pages[i]; + + if(pages.eq(GSVector4i::zero())) continue; + + size_t page_size = 8192; + + // TODO: only use the texture cache if there is an overlap between src_pages and dst_pages? (or if already uploaded) + + if(0) for(int j = 0; j < 4; j++) + { + if(pages.u32[j] == 0) continue; + + if(pages.u32[j] == 0xffffffff) + { + size_t offset = (i * sizeof(GSVector4i) + j * sizeof(uint32)) * 8 * page_size; + + m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, page_size * 32); + + if(LOG) { fprintf(s_fp, "tc (%d x32)\n", offset >> 13); fflush(s_fp); } + + pageuploadcount++; + count += 32; + + continue; + } + + for(int k = 0; k < 4; k++) + { + uint8 b = pages.u8[j * 4 + k]; + + if(b == 0) continue; + + if(b == 0xff) + { + size_t offset = (i * sizeof(GSVector4i) + (j * 4 + k)) * 8 * page_size; + + m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, page_size * 8); + + if(LOG) { fprintf(s_fp, "tc (%d x8)\n", offset >> 13); fflush(s_fp); } + + pageuploadcount++; + count += 8; + + continue; + } + + for(int l = 0; l < 8; l++) + { + if(b & (1 << l)) + { + size_t offset = ((i * sizeof(GSVector4i) + (j * 4 + k)) * 8 + l) * page_size; + + m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, page_size); + + if(LOG) { fprintf(s_fp, "tc (%d x1)\n", offset >> 13); fflush(s_fp); } + + pageuploadcount++; + count++; + } + } + } + } + + m_tc_pages[i] 
&= ~job->src_pages[i]; + } + + if(count > 0) + { + pageuploads += count; + } +} + +void GSRendererCL::InvalidateTextureCache(TFXJob* job) +{ + if(job->dst_pages == NULL) return; + + for(int j = 0; j < 4; j++) + { + m_tc_pages[j] |= job->dst_pages[j]; + } +} + static int RemapPSM(int psm) { switch(psm) @@ -1057,7 +1175,6 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver for(int i = 0; i < 4; i++) { src_pages[i] |= m_tmp_pages[i]; - m_rw_pages[0][i] |= m_tmp_pages[i]; } if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0) @@ -1191,7 +1308,6 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver for(int i = 0; i < 4; i++) { src_pages[i] |= m_tmp_pages[i]; - m_rw_pages[0][i] |= m_tmp_pages[i]; } } @@ -1428,11 +1544,6 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver pb->zm |= 0xffff0000; } - if(pb->bbox.eq(pb->bbox.rintersect(pb->scissor))) - { - pb->sel.noscissor = 1; - } - pb->fbp = context->FRAME.Block(); pb->zbp = context->ZBUF.Block(); pb->bw = context->FRAME.FBW; @@ -1481,8 +1592,10 @@ GSRendererCL::CL::CL() #ifdef IOCL_DEBUG if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL) #else + //if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL) //if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Intel") != NULL) - if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Advanced Micro Devices") != NULL) + //if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Advanced Micro Devices") != NULL) + if(type == CL_DEVICE_TYPE_GPU) #endif { devices.push_back(device); @@ -1595,28 +1708,16 @@ static void AddDefs(ostringstream& opt) #endif } -cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel) +cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt) { - auto i = prim_map.find(sel); + // TODO: 
cache binary on disk - if(i != prim_map.end()) - { - return i->second; - } - - char entry[256]; - - sprintf(entry, "prim_%02x", sel); + printf("building kernel (%s)\n", entry); cl::Program program = cl::Program(context, kernel_str); try { - ostringstream opt; - - opt << "-D KERNEL_PRIM=" << entry << " "; - opt << "-D PRIM=" << sel.prim << " "; - AddDefs(opt); program.build(opt.str().c_str()); @@ -1636,7 +1737,28 @@ cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel) throw err; } - cl::Kernel k(program, entry); + return cl::Kernel(program, entry); +} + +cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel) +{ + auto i = prim_map.find(sel); + + if(i != prim_map.end()) + { + return i->second; + } + + char entry[256]; + + sprintf(entry, "prim_%02x", sel); + + ostringstream opt; + + opt << "-D KERNEL_PRIM=" << entry << " "; + opt << "-D PRIM=" << sel.prim << " "; + + cl::Kernel k = Build(entry, opt); prim_map[sel] = k; @@ -1658,37 +1780,14 @@ cl::Kernel& GSRendererCL::CL::GetTileKernel(const TileSelector& sel) sprintf(entry, "tile_%02x", sel); - cl::Program program = cl::Program(context, kernel_str); + ostringstream opt; - try - { - ostringstream opt; + opt << "-D KERNEL_TILE=" << entry << " "; + opt << "-D PRIM=" << sel.prim << " "; + opt << "-D MODE=" << sel.mode << " "; + opt << "-D CLEAR=" << sel.clear << " "; - opt << "-D KERNEL_TILE=" << entry << " "; - opt << "-D PRIM=" << sel.prim << " "; - opt << "-D MODE=" << sel.mode << " "; - opt << "-D CLEAR=" << sel.clear << " "; - - AddDefs(opt); - - program.build(opt.str().c_str()); - } - catch(cl::Error err) - { - if(err.err() == CL_BUILD_PROGRAM_FAILURE) - { - for(auto device : devices) - { - auto s = program.getBuildInfo(device); - - printf("kernel (%s) build error: %s\n", entry, s.c_str()); - } - } - - throw err; - } - - cl::Kernel k(program, entry); + cl::Kernel k = Build(entry, opt); tile_map[sel] = k; @@ -1708,78 +1807,53 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const 
TFXSelector& sel) char entry[256]; - sprintf(entry, "tfx_%016x", sel); + sprintf(entry, "tfx_%016llx", sel); - cl::Program program = cl::Program(context, kernel_str); + ostringstream opt; - try - { - ostringstream opt; + opt << "-D KERNEL_TFX=" << entry << " "; + opt << "-D FPSM=" << sel.fpsm << " "; + opt << "-D ZPSM=" << sel.zpsm << " "; + opt << "-D ZTST=" << sel.ztst << " "; + opt << "-D ATST=" << sel.atst << " "; + opt << "-D AFAIL=" << sel.afail << " "; + opt << "-D IIP=" << sel.iip << " "; + opt << "-D TFX=" << sel.tfx << " "; + opt << "-D TCC=" << sel.tcc << " "; + opt << "-D FST=" << sel.fst << " "; + opt << "-D LTF=" << sel.ltf << " "; + opt << "-D TLU=" << sel.tlu << " "; + opt << "-D FGE=" << sel.fge << " "; + opt << "-D DATE=" << sel.date << " "; + opt << "-D ABE=" << sel.abe << " "; + opt << "-D ABA=" << sel.aba << " "; + opt << "-D ABB=" << sel.abb << " "; + opt << "-D ABC=" << sel.abc << " "; + opt << "-D ABD=" << sel.abd << " "; + opt << "-D PABE=" << sel.pabe << " "; + opt << "-D AA1=" << sel.aa1 << " "; + opt << "-D FWRITE=" << sel.fwrite << " "; + opt << "-D FTEST=" << sel.ftest << " "; + opt << "-D RFB=" << sel.rfb << " "; + opt << "-D ZWRITE=" << sel.zwrite << " "; + opt << "-D ZTEST=" << sel.ztest << " "; + opt << "-D RZB=" << sel.rzb << " "; + opt << "-D WMS=" << sel.wms << " "; + opt << "-D WMT=" << sel.wmt << " "; + opt << "-D DATM=" << sel.datm << " "; + opt << "-D COLCLAMP=" << sel.colclamp << " "; + opt << "-D FBA=" << sel.fba << " "; + opt << "-D DTHE=" << sel.dthe << " "; + opt << "-D PRIM=" << sel.prim << " "; + opt << "-D LCM=" << sel.lcm << " "; + opt << "-D MMIN=" << sel.mmin << " "; + opt << "-D NOSCISSOR=" << sel.noscissor << " "; + opt << "-D TPSM=" << sel.tpsm << " "; + opt << "-D AEM=" << sel.aem << " "; + opt << "-D FB=" << sel.fb << " "; + opt << "-D ZB=" << sel.zb << " "; - opt << "-D KERNEL_TFX=" << entry << " "; - opt << "-D FPSM=" << sel.fpsm << " "; - opt << "-D ZPSM=" << sel.zpsm << " "; - opt << "-D ZTST=" << 
sel.ztst << " "; - opt << "-D ATST=" << sel.atst << " "; - opt << "-D AFAIL=" << sel.afail << " "; - opt << "-D IIP=" << sel.iip << " "; - opt << "-D TFX=" << sel.tfx << " "; - opt << "-D TCC=" << sel.tcc << " "; - opt << "-D FST=" << sel.fst << " "; - opt << "-D LTF=" << sel.ltf << " "; - opt << "-D TLU=" << sel.tlu << " "; - opt << "-D FGE=" << sel.fge << " "; - opt << "-D DATE=" << sel.date << " "; - opt << "-D ABE=" << sel.abe << " "; - opt << "-D ABA=" << sel.aba << " "; - opt << "-D ABB=" << sel.abb << " "; - opt << "-D ABC=" << sel.abc << " "; - opt << "-D ABD=" << sel.abd << " "; - opt << "-D PABE=" << sel.pabe << " "; - opt << "-D AA1=" << sel.aa1 << " "; - opt << "-D FWRITE=" << sel.fwrite << " "; - opt << "-D FTEST=" << sel.ftest << " "; - opt << "-D RFB=" << sel.rfb << " "; - opt << "-D ZWRITE=" << sel.zwrite << " "; - opt << "-D ZTEST=" << sel.ztest << " "; - opt << "-D RZB=" << sel.rzb << " "; - opt << "-D WMS=" << sel.wms << " "; - opt << "-D WMT=" << sel.wmt << " "; - opt << "-D DATM=" << sel.datm << " "; - opt << "-D COLCLAMP=" << sel.colclamp << " "; - opt << "-D FBA=" << sel.fba << " "; - opt << "-D DTHE=" << sel.dthe << " "; - opt << "-D PRIM=" << sel.prim << " "; - opt << "-D LCM=" << sel.lcm << " "; - opt << "-D MMIN=" << sel.mmin << " "; - opt << "-D NOSCISSOR=" << sel.noscissor << " "; - opt << "-D TPSM=" << sel.tpsm << " "; - opt << "-D AEM=" << sel.aem << " "; - opt << "-D FB=" << sel.fb << " "; - opt << "-D ZB=" << sel.zb << " "; - - AddDefs(opt); - - printf("building kernel (%s)\n", entry); - - program.build(opt.str().c_str()); - } - catch(cl::Error err) - { - if(err.err() == CL_BUILD_PROGRAM_FAILURE) - { - for(auto device : devices) - { - auto s = program.getBuildInfo(device); - - printf("kernel (%s) build error: %s\n", entry, s.c_str()); - } - } - - throw err; - } - - cl::Kernel k(program, entry); + cl::Kernel k = Build(entry, opt); tfx_map[sel] = k; diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h index 
ab9961dd95..d60b5f2939 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -146,8 +146,6 @@ class GSRendererCL : public GSRenderer __aligned(struct, 32) TFXParameter { GSVector4i scissor; - GSVector4i bbox; - GSVector4i rect; GSVector4i dimx; // 4x4 signed char TFXSelector sel; uint32 fbp, zbp, bw; @@ -173,7 +171,9 @@ class GSRendererCL : public GSRenderer uint32 pb_start; GSVector4i* src_pages; // read by any texture level GSVector4i* dst_pages; // f/z writes to it - +#ifdef DEBUG + uint32 fbp, fbw, fpsm, zbp, tbp, tbw, tpsm, tw, th; +#endif TFXJob() : src_pages(NULL) , dst_pages(NULL) @@ -224,6 +224,8 @@ class GSRendererCL : public GSRenderer std::map tile_map; std::map tfx_map; + cl::Kernel Build(const char* entry, ostringstream& opt); + public: std::vector devices; cl::Context context; @@ -254,6 +256,8 @@ class GSRendererCL : public GSRenderer uint32 m_vb_count; void Enqueue(); + void UpdateTextureCache(TFXJob* job); + void InvalidateTextureCache(TFXJob* job); /* class RasterizerData : public GSAlignedClass<32> @@ -311,13 +315,12 @@ class GSRendererCL : public GSRenderer }; */ protected: -// GSTextureCacheCL* m_tc; GSTexture* m_texture[2]; uint8* m_output; GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r - GSVector4i m_tc_pages[4]; // invalidated texture cache pages - GSVector4i m_tmp_pages[4]; + GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?) 
+ GSVector4i m_tmp_pages[4]; // TODO: this should be block level, too many overlaps inside pages with render targets void Reset(); void VSync(int field); diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index 007f805d9a..17b400b764 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -44,8 +44,6 @@ typedef struct typedef struct { int4 scissor; - int4 bbox; - int4 rect; char dimx[4][4]; ulong sel; uint fbp, zbp, bw; @@ -663,9 +661,9 @@ __kernel void KERNEL_PRIM( env->barycentric[prim_index] = b; } - else + else // triangle has zero area { - // TODO: set b.zero to something that always fails the tests + pmax = -1; // won't get included in any tile } } else if(PRIM == GS_SPRITE_CLASS) @@ -769,7 +767,7 @@ __kernel void KERNEL_TILE( uchar4 r = bbox_cache[group_prim_index]; - uint test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y); + uint test = (r.x <= x) & (r.z >= x) & (r.y <= y) & (r.w >= y); if(PRIM == GS_TRIANGLE_CLASS && test != 0) { @@ -862,7 +860,7 @@ __kernel void KERNEL_TILE( { uchar4 r = bbox_cache[i]; - BIN_TYPE test = (r.x <= x + 1) & (r.z >= x) & (r.y <= y + 1) & (r.w >= y); + BIN_TYPE test = (r.x <= x) & (r.z >= x) & (r.y <= y) & (r.w >= y); if(PRIM == GS_TRIANGLE_CLASS && test != 0) { @@ -1146,7 +1144,7 @@ int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* p c = pb->clut[vm[addr]]; break; case PSM_PSMT4: - c = pb->clut[(vm[addr] >> ((addr & 1) << 2)) & 0x0f]; + c = pb->clut[(vm[addr >> 1] >> ((addr & 1) << 2)) & 0x0f]; break; case PSM_PSMT8H: c = pb->clut[vm32[addr] >> 24]; @@ -1159,11 +1157,20 @@ int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* p break; } - //printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm32[addr]); + //printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm[addr]); return convert_int4(c); } +// TODO: 2x2 MSAA idea +// 
downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local +// hittest and ztest 2x2 (create write mask, only skip if all -1) +// calculate color 1x1, alpha tests 1x1 +// use mask to filter failed sub-pixels when writing to __local +// needs the tile data to be fetched at the beginning, even if rfb/zfb is not set, unless we know the tile is fully covered +// multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end +// pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average + __kernel void KERNEL_TFX( __global gs_env* env, __global uchar* vm, @@ -1437,7 +1444,7 @@ __kernel void KERNEL_TFX( // t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation // TODO: sw renderer samples at 112 exactly, check which one is correct - uv = convert_int2(t.xy); + uv = convert_int2_rte(t.xy); } int2 uvf = uv & 0x000f; @@ -1497,7 +1504,7 @@ __kernel void KERNEL_TFX( { if(!ABE || c.w == 0x80) { - // TODO: c.w = coverage; // coverage 0x80 at 100% + c.w = /*edge ? 
coverage :*/ 0x80; // TODO } } } @@ -1619,7 +1626,6 @@ __kernel void KERNEL_TFX( if(FWRITE) { WriteFrame(vm, faddr, FPSM, fd); - //WriteFrame(vm, faddr, FPSM, 0xff202020 * fragments); } } } From 263c097d13ed80147b636cf94d32de9c66a12315 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Thu, 18 Sep 2014 09:32:37 +0200 Subject: [PATCH 05/15] solution for 32-bit z values in opencl and other minor optimizations --- plugins/GSdx/GSRendererCL.cpp | 19 +-- plugins/GSdx/res/tfx.cl | 271 ++++++++++++++-------------------- 2 files changed, 110 insertions(+), 180 deletions(-) diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 05cc2213d4..990cc3e6f0 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -193,23 +193,9 @@ void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* { GSVector4 stcq = GSVector4::load(&src->m[0]); // s t rgba q - #if _M_SSE >= 0x401 - GSVector4i xyzuvf(src->m[1]); - GSVector4i xy = xyzuvf.upl16() - o; - GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00()); - - #else - - uint32 z = src->XYZ.Z; - - GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - o; - GSVector4i zf = GSVector4i((int)std::min(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later - - #endif - - dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale; + dst->p = (GSVector4(xyzuvf.upl16() - o) * g_pos_scale).xyxy(GSVector4::cast(xyzuvf.ywyw())); // pass zf as uints GSVector4 t = GSVector4::zero(); @@ -233,7 +219,7 @@ void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* } } - dst->t = t.insert32<2, 3>(stcq); + dst->t = t.insert32<2, 3>(stcq); // color as uchar4 in t.w } } @@ -871,6 +857,7 @@ void GSRendererCL::Enqueue() GSVector4i r = GSVector4i::load(&(*i)->rect); r = r.ralign(GSVector2i(BIN_SIZE, BIN_SIZE)); + /* if(i->sel.IsSolidRect()) // TODO: 
simple mem fill with optional mask ;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height()); diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index 17b400b764..8342f338a5 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -14,7 +14,7 @@ typedef struct { - union {float4 p; struct {float x, y, z, f;};}; + union {float4 p; struct {float x, y; uint z, f;};}; union {float4 tc; struct {float s, t, q; uchar4 c;};}; } gs_vertex; @@ -46,12 +46,12 @@ typedef struct int4 scissor; char dimx[4][4]; ulong sel; - uint fbp, zbp, bw; + int fbp, zbp, bw; uint fm, zm; uchar4 fog; // rgb uchar aref, afix; uchar ta0, ta1; - uint tbp[7], tbw[7]; + int tbp[7], tbw[7]; int minu, maxu, minv, maxv; int lod; // lcm == 1 int mxl; @@ -68,7 +68,7 @@ enum GS_PRIM_CLASS GS_SPRITE_CLASS }; -enum GS_PSM_TARGET +enum GS_PSM { PSM_PSMCT32, PSM_PSMCT24, @@ -350,87 +350,87 @@ __constant ushort columnTable4[16][32] = 407, 415, 439, 447, 471, 479, 503, 511 }, }; -uint BlockNumber32(int x, int y, uint bp, uint bw) +int BlockNumber32(int x, int y, int bp, int bw) { - return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7]; + return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7]; } -uint BlockNumber16(int x, int y, uint bp, uint bw) +int BlockNumber16(int x, int y, int bp, int bw) { - return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3]; + return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3]; } -uint BlockNumber16S(int x, int y, uint bp, uint bw) +int BlockNumber16S(int x, int y, int bp, int bw) { - return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3]; + return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3]; } -uint BlockNumber32Z(int x, int y, uint bp, uint bw) +int BlockNumber32Z(int x, int y, int bp, 
int bw) { - return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7]; + return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7]; } -uint BlockNumber16Z(int x, int y, uint bp, uint bw) +int BlockNumber16Z(int x, int y, int bp, int bw) { - return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3]; + return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3]; } -uint BlockNumber16SZ(int x, int y, uint bp, uint bw) +int BlockNumber16SZ(int x, int y, int bp, int bw) { - return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3]; + return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3]; } -uint BlockNumber8(int x, int y, uint bp, uint bw) +int BlockNumber8(int x, int y, int bp, int bw) { - return bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7]; + return bp + mad24((y >> 1) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7]; } -uint BlockNumber4(int x, int y, uint bp, uint bw) +int BlockNumber4(int x, int y, int bp, int bw) { - return bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3]; + return bp + mad24((y >> 2) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3]; } -uint PixelAddress32(int x, int y, uint bp, uint bw) +int PixelAddress32(int x, int y, int bp, int bw) { return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; } -uint PixelAddress16(int x, int y, uint bp, uint bw) +int PixelAddress16(int x, int y, int bp, int bw) { return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; } -uint PixelAddress16S(int x, int y, uint bp, uint bw) +int PixelAddress16S(int x, int y, int bp, int bw) { return (BlockNumber16S(x, y, bp, bw) << 7) + 
columnTable16[y & 7][x & 15]; } -uint PixelAddress32Z(int x, int y, uint bp, uint bw) +int PixelAddress32Z(int x, int y, int bp, int bw) { return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; } -uint PixelAddress16Z(int x, int y, uint bp, uint bw) +int PixelAddress16Z(int x, int y, int bp, int bw) { return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; } -uint PixelAddress16SZ(int x, int y, uint bp, uint bw) +int PixelAddress16SZ(int x, int y, int bp, int bw) { return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; } -uint PixelAddress8(int x, int y, uint bp, uint bw) +int PixelAddress8(int x, int y, int bp, int bw) { return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15]; } -uint PixelAddress4(int x, int y, uint bp, uint bw) +int PixelAddress4(int x, int y, int bp, int bw) { return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31]; } -uint PixelAddress(int x, int y, uint bp, uint bw, uint psm) +int PixelAddress(int x, int y, int bp, int bw, int psm) { switch(psm) { @@ -459,49 +459,7 @@ uint PixelAddress(int x, int y, uint bp, uint bw, uint psm) } } -uint TileBlockNumber(int x, int y, uint bp, uint bw, uint psm) -{ - // TODO: replace blockTable with a subset tileTable - - switch(psm) - { - default: - case PSM_PSMCT32: - case PSM_PSMCT24: - return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 2][(x >> 3) & 6]; - case PSM_PSMCT16: - return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 2][(x >> 4) & 3]; - case PSM_PSMCT16S: - return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 2][(x >> 4) & 3]; - case PSM_PSMZ32: - case PSM_PSMZ24: - return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 2][(x >> 3) & 6]; - case PSM_PSMZ16: - return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 2][(x >> 4) & 3]; - case PSM_PSMZ16S: - return bp + ((y >> 1) 
& ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 2][(x >> 4) & 3]; - } -} - -uint TilePixelAddress(int x, int y, uint ba, uint psm) -{ - switch(psm) - { - default: - case PSM_PSMCT32: - case PSM_PSMCT24: - case PSM_PSMZ32: - case PSM_PSMZ24: - return ((ba + ((y >> 2) & 2) + ((x >> 3) & 1)) << 6) + columnTable32[y & 7][x & 7]; - case PSM_PSMCT16: - case PSM_PSMCT16S: - case PSM_PSMZ16: - case PSM_PSMZ16S: - return ((ba + ((y >> 3) & 1)) << 7) + columnTable16[y & 7][x & 15]; - } -} - -uint ReadFrame(__global uchar* vm, uint addr, uint psm) +uint ReadFrame(__global uchar* vm, int addr, int psm) { switch(psm) { @@ -519,7 +477,7 @@ uint ReadFrame(__global uchar* vm, uint addr, uint psm) } } -void WriteFrame(__global uchar* vm, uint addr, uint psm, uint value) +void WriteFrame(__global uchar* vm, int addr, int psm, uint value) { switch(psm) { @@ -593,7 +551,12 @@ __kernel void KERNEL_PRIM( if(PRIM == GS_POINT_CLASS) { - pmin = pmax = convert_int2_rte(vb[ib[0]].p.xy); + __global gs_vertex* v0 = &vb[ib[0]]; + + pmin = pmax = convert_int2_rte(v0->p.xy); + + prim->v[0].p = v0->p; + prim->v[0].tc = v0->tc; } else if(PRIM == GS_LINE_CLASS) { @@ -616,13 +579,21 @@ __kernel void KERNEL_PRIM( pmin = min(min(p0, p1), p2); pmax = max(max(p0, p1), p2); - prim->v[0].p = v0->p; + // z needs special care, since it's a 32 bit unit, float cannot encode it exactly + // pass the minimum through the unused 4th padding vector + // only interpolate the relative and hopefully small values + + uint zmin = min(min(v0->z, v1->z), v2->z); + + prim->v[0].p = (float4)(v0->p.x, v0->p.y, as_float(v0->z - zmin), v0->p.w); prim->v[0].tc = v0->tc; - prim->v[1].p = v1->p; + prim->v[1].p = (float4)(v1->p.x, v1->p.y, as_float(v1->z - zmin), v1->p.w); prim->v[1].tc = v1->tc; - prim->v[2].p = v2->p; + prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w); prim->v[2].tc = v2->tc; + prim->v[3].z = zmin; + float4 dp0 = v1->p - v0->p; float4 dp1 = v0->p - v2->p; float4 dp2 = 
v2->p - v1->p; @@ -631,10 +602,10 @@ __kernel void KERNEL_PRIM( if(cp != 0.0f) { - float cp_rcp = 1.0f / cp;// native_recip(cp); + cp = native_recip(cp); - float2 u = dp0.xy * cp_rcp; - float2 v = -dp1.xy * cp_rcp; + float2 u = dp0.xy * cp; + float2 v = -dp1.xy * cp; // v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0) @@ -653,9 +624,9 @@ __kernel void KERNEL_PRIM( // any barycentric(reject_corner) < 0, tile outside the triangle - b.reject_corner.x = 0.0f + max(max(max(0.0f, b.dx.x), b.dy.x), b.dx.x + b.dy.x) * BIN_SIZE; - b.reject_corner.y = 0.0f + max(max(max(0.0f, b.dx.y), b.dy.y), b.dx.y + b.dy.y) * BIN_SIZE; - b.reject_corner.z = 1.0f + max(max(max(0.0f, b.dx.z), b.dy.z), b.dx.z + b.dy.z) * BIN_SIZE; + b.reject_corner.x = 0.0f + max(max(max(b.dx.x + b.dy.x, b.dx.x), b.dy.x), 0.0f) * BIN_SIZE; + b.reject_corner.y = 0.0f + max(max(max(b.dx.y + b.dy.y, b.dx.y), b.dy.y), 0.0f) * BIN_SIZE; + b.reject_corner.z = 1.0f + max(max(max(b.dx.z + b.dy.z, b.dx.z), b.dy.z), 0.0f) * BIN_SIZE; // TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed @@ -686,9 +657,9 @@ __kernel void KERNEL_PRIM( prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy); } - int4 pminmax = (int4)(pmin, pmax); + int4 r = (int4)(pmin, pmax + (int2)(BIN_SIZE - 1)) >> BIN_SIZE_BITS; - env->bbox[prim_index] = convert_uchar4_sat(pminmax >> BIN_SIZE_BITS); + env->bbox[prim_index] = convert_uchar4_sat(r); } #endif @@ -767,11 +738,11 @@ __kernel void KERNEL_TILE( uchar4 r = bbox_cache[group_prim_index]; - uint test = (r.x <= x) & (r.z >= x) & (r.y <= y) & (r.w >= y); + uint test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y); if(PRIM == GS_TRIANGLE_CLASS && test != 0) { - test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]); + test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), 
barycentric_cache[group_prim_index]); } atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2))); @@ -848,7 +819,7 @@ __kernel void KERNEL_TILE( for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size) { - int y = bin_index / bin_dim.z; + int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware int x = bin_index - y * bin_dim.z; x += bin_dim.x; @@ -860,11 +831,11 @@ __kernel void KERNEL_TILE( { uchar4 r = bbox_cache[i]; - BIN_TYPE test = (r.x <= x) & (r.z >= x) & (r.y <= y) & (r.w >= y); + BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y); if(PRIM == GS_TRIANGLE_CLASS && test != 0) { - test &= tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]); + test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]); } visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i); @@ -1185,9 +1156,12 @@ __kernel void KERNEL_TFX( { // TODO: try it the bin_index = atomic_inc(&env->bin_counter) way - uint bin_x = (get_global_id(0) >> BIN_SIZE_BITS) - bin_dim.x; - uint bin_y = (get_global_id(1) >> BIN_SIZE_BITS) - bin_dim.y; - uint bin_index = bin_y * bin_dim.z + bin_x; + uint x = get_global_id(0); + uint y = get_global_id(1); + + uint bin_x = (x >> BIN_SIZE_BITS) - bin_dim.x; + uint bin_y = (y >> BIN_SIZE_BITS) - bin_dim.y; + uint bin_index = mad24(bin_y, (uint)bin_dim.z, bin_x); uint batch_first = env->bounds[bin_index].first; uint batch_last = env->bounds[bin_index].last; @@ -1230,26 +1204,21 @@ __kernel void KERNEL_TFX( __global gs_param* pb = (__global gs_param*)(pb_base + pb_start); - uint x = get_global_id(0); - uint y = get_global_id(1); - int2 pi = (int2)(x, y); float2 pf = convert_float2(pi); if(!NOSCISSOR) { - int4 scissor = pb->scissor; - - if(!all((pi >= scissor.xy) & (pi < scissor.zw))) + if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw))) { return; } } - uint faddr = PixelAddress(x, y, pb->fbp, pb->bw, 
FPSM); - uint zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM); + int faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM); + int zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM); - uint fd, zd; + uint fd, zd; // TODO: fd as int4 and only pack before writing out? if(RFB) { @@ -1260,47 +1229,6 @@ __kernel void KERNEL_TFX( { zd = ReadFrame(vm, zaddr, ZPSM); } -/* - // TODO: lookup top left address of this tile + local offset - // - // 32bpp: 8x8 block size, 4 blocks, 1024 bytes - // 0 1 - // 2 3 - // 16bpp: 16x8 block size, 2 blocks, 512 bytes - // 0 - // 1 - // linear access in memory, this layout is the same for all formats - - __local uint fbn, zbn; - __local uchar fb[1024], zb[1024]; - - if(get_local_id(0) == 0 && get_local_id(1) == 0) - { - fbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM); - zbn = TileBlockNumber(x, y, pb->fbp, pb->bw, FPSM); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - uint faddr = TilePixelAddress(x, y, fbn, FPSM); - uint zaddr = TilePixelAddress(x, y, zbn, ZPSM); - - if(RFB) - { - event_t e = async_work_group_copy((__local uint4*)fb, (__global uint4*)&vm[fbn << 8], 1024 / sizeof(uint4), 0); - - wait_group_events(1, &e); - } - - if(RZB) - { - event_t e = async_work_group_copy((__local uint4*)zb, (__global uint4*)&vm[zbn << 8], 1024 / sizeof(uint4), 0); - - wait_group_events(1, &e); - } - - // not sure if faster -*/ // early destination alpha test @@ -1346,30 +1274,44 @@ __kernel void KERNEL_TFX( if(PRIM == GS_POINT_CLASS) { - // TODO: distance.x < 0.5f || distance.y < 0.5f + float2 dpf = pf - prim->v[0].p.xy; - continue; + if(!all((dpf <= 0.5f) & (dpf > -0.5f))) + { + continue; + } + + zf = as_uint2(prim->v[0].p.zw); + t.xyz = prim->v[0].tc.xyz; + c = convert_int4(prim->v[0].c); } else if(PRIM == GS_LINE_CLASS) { // TODO: find point on line prependicular to (x,y), distance.x < 0.5f || distance.y < 0.5f + // TODO: aa1: coverage ~ distance.x/y, slope selects x or y, zwrite disabled + // TODO: do not draw last pixel of the line continue; } else 
if(PRIM == GS_TRIANGLE_CLASS) { + // TODO: aa1: draw edge as a line + __global gs_barycentric* b = &barycentric[prim_index + i]; float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1); - f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)); - - if(!all(f >= b->zero.xyz)) + if(!all(select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)) >= b->zero.xyz)) { continue; } - zf = convert_uint2(prim->v[0].p.zw * f.z + prim->v[1].p.zw * f.x + prim->v[2].p.zw * f.y); + float2 zf0 = convert_float2(as_uint2(prim->v[0].p.zw)); + float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw)); + float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw)); + + zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z; + zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y); t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; @@ -1379,7 +1321,7 @@ __kernel void KERNEL_TFX( float4 c1 = convert_float4(prim->v[1].c); float4 c2 = convert_float4(prim->v[2].c); - c = convert_int4(c0 * f.z + c1 * f.x + c2 * f.y); + c = convert_int4_rte(c0 * f.z + c1 * f.x + c2 * f.y); } else { @@ -1396,7 +1338,7 @@ __kernel void KERNEL_TFX( continue; } - zf = convert_uint2(prim->v[1].p.zw); // TODO: send as uint + zf = as_uint2(prim->v[1].p.zw); t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy); t.z = prim->v[0].tc.z; @@ -1431,7 +1373,7 @@ __kernel void KERNEL_TFX( if(!FST) { - uv = convert_int2_rte(t.xy * (1.0f / t.z));// * native_recip(t.z)); + uv = convert_int2_rte(t.xy * native_recip(t.z)); if(LTF) uv -= 0x0008; } @@ -1444,7 +1386,9 @@ __kernel void KERNEL_TFX( // t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation // TODO: sw renderer samples at 112 exactly, check which one is correct - uv = convert_int2_rte(t.xy); + // last line error in persona 3 movie clips if rounding is enabled + + uv = convert_int2(t.xy); } int2 uvf = 
uv & 0x000f; @@ -1466,9 +1410,9 @@ __kernel void KERNEL_TFX( if(LTF) { - c00 = ((c01 - c00) * uvf.x >> 4) + c00; - c10 = ((c11 - c10) * uvf.x >> 4) + c10; - c00 = ((c10 - c00) * uvf.y >> 4) + c00; + c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00; + c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10; + c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00; } ct = c00; @@ -1486,7 +1430,7 @@ __kernel void KERNEL_TFX( switch(TFX) { case TFX_MODULATE: - c.w = clamp(ct.w * c.w >> 7, 0, 0xff); + c.w = clamp(mul24(ct.w, c.w) >> 7, 0, 0xff); break; case TFX_DECAL: c.w = ct.w; @@ -1539,14 +1483,14 @@ __kernel void KERNEL_TFX( switch(TFX) { case TFX_MODULATE: - c.xyz = clamp(ct.xyz * c.xyz >> 7, 0, 0xff); + c.xyz = clamp(mul24(ct.xyz, c.xyz) >> 7, 0, 0xff); break; case TFX_DECAL: c.xyz = ct.xyz; break; case TFX_HIGHLIGHT: case TFX_HIGHLIGHT2: - c.xyz = clamp((ct.xyz * c.xyz >> 7) + alpha, 0, 0xff); + c.xyz = clamp((mul24(ct.xyz, c.xyz) >> 7) + alpha, 0, 0xff); break; } } @@ -1557,7 +1501,10 @@ __kernel void KERNEL_TFX( { int fog = (int)zf.y; - c.xyz = (c.xyz * fog >> 8) + (convert_int4(pb->fog).xyz * (int3)(0xff - fog) >> 8); + int3 fv = mul24(c.xyz, fog) >> 8; + int3 fc = mul24(convert_int4(pb->fog).xyz, 0xff - fog) >> 8; + + c.xyz = fv + fc; } // alpha blend @@ -1614,10 +1561,6 @@ __kernel void KERNEL_TFX( if(fragments > 0) { - // TODO: write color/z to faddr/zaddr (if 16x16 was cached, barrier local mem, swizzle back to its place) - - // TODO if(fm/zm != 0xffffffff) or whatever masks the output completely for the pixel format) - if(ZWRITE) { WriteFrame(vm, zaddr, ZPSM, zd); From 9e2038759553e06715c5766166fc529d5bc33623 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Fri, 19 Sep 2014 06:53:05 +0200 Subject: [PATCH 06/15] Modified for opencl 1.1. While it runs on nvidia cards now, you can't use its sdk to compile gsdx, cl.hpp is missing there. Intel or amd is ok. 
--- plugins/GSdx/GSRendererCL.cpp | 34 ++++---- plugins/GSdx/GSRendererCL.h | 2 +- plugins/GSdx/res/tfx.cl | 127 ++++++++++++++++-------------- plugins/GSdx/stdafx.h | 5 ++ plugins/GSdx/vsprops/common.props | 2 +- 5 files changed, 88 insertions(+), 82 deletions(-) diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 990cc3e6f0..d380649222 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -328,8 +328,8 @@ void GSRendererCL::Draw() std::vector el(1); - m_cl.queue[2].enqueueMarkerWithWaitList(NULL, &el[0]); - m_cl.wq->enqueueBarrierWithWaitList(&el, NULL); + m_cl.queue[2].enqueueMarker(&el[0]); + m_cl.wq->enqueueWaitForEvents(el); // switch to the other queue/buffer (double buffering) @@ -404,15 +404,7 @@ void GSRendererCL::Draw() job->pb_start = m_cl.pb.tail; #ifdef DEBUG - job->fbp = context->FRAME.Block(); - job->fbw = context->FRAME.FBW; - job->fpsm = context->FRAME.PSM; - job->zbp = context->ZBUF.Block(); - job->tbp = PRIM->TME ? context->TEX0.TBP0 : 0xfffff; - job->tbw = PRIM->TME ? context->TEX0.TBW : 1; - job->tpsm = PRIM->TME ? context->TEX0.PSM : 0; - job->tw = PRIM->TME ? context->TEX0.TW : 0; - job->th = PRIM->TME ? 
context->TEX0.TH : 0; + job->param = pb; #endif m_jobs.push_back(job); @@ -687,10 +679,10 @@ void GSRendererCL::Enqueue() m_cl.Unmap(); - std::vector el2(1); + std::vector el(1); - m_cl.wq->enqueueMarkerWithWaitList(NULL, &el2[0]); - m_cl.queue[2].enqueueBarrierWithWaitList(&el2, NULL); + m_cl.wq->enqueueMarker(&el[0]); + m_cl.queue[2].enqueueWaitForEvents(el); // @@ -813,6 +805,9 @@ void GSRendererCL::Enqueue() { ASSERT(prim_start < MAX_PRIM_COUNT); + // TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same + // move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim + tfxcount++; //if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); } @@ -1574,7 +1569,8 @@ GSRendererCL::CL::CL() case CL_DEVICE_TYPE_CPU: printf(" CPU"); break; } - if(strstr(version.c_str(), "OpenCL C 1.2") != NULL) + if(strstr(version.c_str(), "OpenCL C 1.1") != NULL + || strstr(version.c_str(), "OpenCL C 1.2") != NULL) { #ifdef IOCL_DEBUG if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL) @@ -1646,7 +1642,7 @@ void GSRendererCL::CL::Map() if(vb.head < vb.size) { - vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, vb.head, vb.size - vb.head); + vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head); vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; ASSERT(((size_t)vb.ptr & 15) == 0); ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0); @@ -1654,14 +1650,13 @@ void GSRendererCL::CL::Map() if(ib.head < ib.size) { - ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, ib.head, ib.size - ib.head); + ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE, ib.head, ib.size - ib.head); ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head; - ASSERT(((size_t)ib.ptr & 15) == 0); } if(pb.head < 
pb.size) { - pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pb.head, pb.size - pb.head); + pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head); pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; ASSERT(((size_t)pb.ptr & 15) == 0); ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0); @@ -1681,6 +1676,7 @@ void GSRendererCL::CL::Unmap() static void AddDefs(ostringstream& opt) { + opt << "-cl-std=CL1.1 "; opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "; opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "; opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "; diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h index d60b5f2939..f6b3231a06 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -172,7 +172,7 @@ class GSRendererCL : public GSRenderer GSVector4i* src_pages; // read by any texture level GSVector4i* dst_pages; // f/z writes to it #ifdef DEBUG - uint32 fbp, fbw, fpsm, zbp, tbp, tbw, tpsm, tw, th; + TFXParameter* param; #endif TFXJob() : src_pages(NULL) diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index 8342f338a5..4eac8374de 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -1,4 +1,4 @@ -#ifdef __OPENCL_C_VERSION__ // make safe to include in resource file to enforce dependency +#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency #ifndef CL_FLT_EPSILON #define CL_FLT_EPSILON 1.1920928955078125e-7 @@ -1133,6 +1133,66 @@ int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* p return convert_int4(c); } +int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t) +{ + int4 c; + + if(0)//if(MMIN) + { + // TODO + } + else + { + int2 uv; + + if(!FST) + { + uv = convert_int2_rte(t.xy * native_recip(t.z)); + + if(LTF) uv -= 0x0008; + } + else + { + // sfex 
capcom logo third drawing call at (0,223) calculated as: + // t0 + (p - p0) * (t - t0) / (p1 - p0) + // 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112 + // due to rounding errors (multiply-add instruction maybe): + // t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation + // TODO: sw renderer samples at 112 exactly, check which one is correct + + // last line error in persona 3 movie clips if rounding is enabled + + uv = convert_int2(t.xy); + } + + int2 uvf = uv & 0x000f; + + int2 uv0 = uv >> 4; + int2 uv1 = uv0 + 1; + + uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS); + uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT); + uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS); + uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT); + + int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb); + int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb); + int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb); + int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb); + + if(LTF) + { + c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00; + c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10; + c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00; + } + + c = c00; + } + + return c; +} + // TODO: 2x2 MSAA idea // downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local // hittest and ztest 2x2 (create write mask, only skip if all -1) @@ -1249,8 +1309,6 @@ __kernel void KERNEL_TFX( BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); - __local gs_prim prim_cache; - for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) { while(bin_value != 0) @@ -1267,7 +1325,7 @@ __kernel void KERNEL_TFX( bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); uint2 zf; - float4 t; + float3 t; int4 c; // TODO: do not hittest if we know the tile is fully inside the prim @@ -1282,7 +1340,7 @@ __kernel void KERNEL_TFX( } zf = as_uint2(prim->v[0].p.zw); - t.xyz = prim->v[0].tc.xyz; + t = prim->v[0].tc.xyz; c = convert_int4(prim->v[0].c); } 
else if(PRIM == GS_LINE_CLASS) @@ -1313,7 +1371,7 @@ __kernel void KERNEL_TFX( zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z; zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y); - t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; + t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; if(IIP) { @@ -1361,62 +1419,9 @@ __kernel void KERNEL_TFX( if(TFX != TFX_NONE) { - // TODO + tex = vm; // TODO: use the texture cache - if(0)//if(MMIN) - { - // TODO - } - else - { - int2 uv; - - if(!FST) - { - uv = convert_int2_rte(t.xy * native_recip(t.z)); - - if(LTF) uv -= 0x0008; - } - else - { - // sfex capcom logo third drawing call at (0,223) calculated as: - // t0 + (p - p0) * (t - t0) / (p1 - p0) - // 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112 - // due to rounding errors (multiply-add instruction maybe): - // t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation - // TODO: sw renderer samples at 112 exactly, check which one is correct - - // last line error in persona 3 movie clips if rounding is enabled - - uv = convert_int2(t.xy); - } - - int2 uvf = uv & 0x000f; - - int2 uv0 = uv >> 4; - int2 uv1 = uv0 + 1; - - uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS); - uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT); - uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS); - uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT); - - tex = vm; // TODO: use the texture cache - - int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb); - int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb); - int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb); - int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb); - - if(LTF) - { - c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00; - c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10; - c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00; - } - - ct = c00; - } + ct = SampleTexture(tex, pb, t); } // alpha tfx diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h 
index b9bfa19737..1929381354 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -43,6 +43,11 @@ #include #include #include "../../common/include/comptr.h" + +#include +#undef CL_VERSION_1_2 +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#define __CL_ENABLE_EXCEPTIONS #include #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA) diff --git a/plugins/GSdx/vsprops/common.props b/plugins/GSdx/vsprops/common.props index 91c58981d0..b8972de77c 100644 --- a/plugins/GSdx/vsprops/common.props +++ b/plugins/GSdx/vsprops/common.props @@ -8,7 +8,7 @@ true - _WINDOWS;_WIN32_WINNT=0x500;__CL_ENABLE_EXCEPTIONS;%(PreprocessorDefinitions) + _WINDOWS;_WIN32_WINNT=0x500;%(PreprocessorDefinitions) Fast false Level4 From 72cfc6a6ef8abe9e825510e5118bceb487d92409 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Fri, 19 Sep 2014 22:48:11 +0200 Subject: [PATCH 07/15] 3rdparty/opencl --- 3rdparty/opencl/CL/cl.h | 1383 +++ 3rdparty/opencl/CL/cl.hpp | 12456 +++++++++++++++++++++++ 3rdparty/opencl/CL/cl_d3d10.h | 126 + 3rdparty/opencl/CL/cl_d3d11.h | 132 + 3rdparty/opencl/CL/cl_d3d9.h | 331 + 3rdparty/opencl/CL/cl_egl.h | 133 + 3rdparty/opencl/CL/cl_ext.h | 458 + 3rdparty/opencl/CL/cl_gl.h | 162 + 3rdparty/opencl/CL/cl_gl_ext.h | 69 + 3rdparty/opencl/CL/cl_platform.h | 1299 +++ 3rdparty/opencl/CL/opencl.h | 54 + 3rdparty/opencl/opencl.def | 113 + 3rdparty/opencl/opencl.vcxproj | 167 + 3rdparty/opencl/opencl.vcxproj.filters | 57 + pcsx2_suite_2013.sln | 60 +- plugins/GSdx/GSRendererCL.cpp | 66 +- plugins/GSdx/GSRendererCL.h | 58 +- plugins/GSdx/GSdx_vs2013.vcxproj | 10 + plugins/GSdx/res/tfx.cl | 4 +- plugins/GSdx/vsprops/common.props | 2 +- plugins/GSdx/vsprops/x64.props | 2 +- plugins/GSdx/vsprops/x86.props | 2 +- 22 files changed, 17027 insertions(+), 117 deletions(-) create mode 100644 3rdparty/opencl/CL/cl.h create mode 100644 3rdparty/opencl/CL/cl.hpp create mode 100644 
3rdparty/opencl/CL/cl_d3d10.h create mode 100644 3rdparty/opencl/CL/cl_d3d11.h create mode 100644 3rdparty/opencl/CL/cl_d3d9.h create mode 100644 3rdparty/opencl/CL/cl_egl.h create mode 100644 3rdparty/opencl/CL/cl_ext.h create mode 100644 3rdparty/opencl/CL/cl_gl.h create mode 100644 3rdparty/opencl/CL/cl_gl_ext.h create mode 100644 3rdparty/opencl/CL/cl_platform.h create mode 100644 3rdparty/opencl/CL/opencl.h create mode 100644 3rdparty/opencl/opencl.def create mode 100644 3rdparty/opencl/opencl.vcxproj create mode 100644 3rdparty/opencl/opencl.vcxproj.filters diff --git a/3rdparty/opencl/CL/cl.h b/3rdparty/opencl/CL/cl.h new file mode 100644 index 0000000000..363bd7fcd3 --- /dev/null +++ b/3rdparty/opencl/CL/cl.h @@ -0,0 +1,1383 @@ +/******************************************************************************* + * Copyright (c) 2008 - 2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_device_svm_capabilities; +typedef cl_bitfield cl_command_queue_properties; +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_bitfield cl_queue_properties; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_bitfield cl_svm_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_migration_flags; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef 
intptr_t cl_pipe_properties; +typedef cl_uint cl_pipe_info; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_program_binary_type; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; +typedef cl_bitfield cl_sampler_properties; +typedef cl_uint cl_kernel_exec_info; + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; +#ifdef __GNUC__ + __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ +#endif + union { + cl_mem buffer; + cl_mem mem_object; + }; +} cl_image_desc; + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST 
-14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 +#define CL_INVALID_PIPE_SIZE -69 +#define CL_INVALID_DEVICE_QUEUE -70 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 +#define CL_VERSION_1_2 1 +#define CL_VERSION_2_0 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 
0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define 
CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ +#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */ +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define 
CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B +#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C +#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D +#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E +#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F +#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 +#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 +#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 +#define CL_DEVICE_SVM_CAPABILITIES 0x1053 +#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 +#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 +#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 +#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 +#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 +#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 +#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A + +/* cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#define CL_FP_SOFT_FLOAT (1 << 6) +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) +#define CL_QUEUE_ON_DEVICE (1 << 2) +#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#define 
CL_CONTEXT_INTEROP_USER_SYNC 0x1085 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +/* cl_device_svm_capabilities */ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS (1 << 3) + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 +#define CL_QUEUE_SIZE 0x1094 + +/* cl_mem_flags and cl_svm_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +/* reserved (1 << 6) */ +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) +#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ +#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 
+#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE +#define CL_sRGB 0x10BF +#define CL_sRGBx 0x10C0 +#define CL_sRGBA 0x10C1 +#define CL_sBGRA 0x10C2 +#define CL_ABGR 0x10C3 + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#define CL_UNORM_INT24 0x10DF + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 +#define CL_MEM_OBJECT_PIPE 0x10F7 + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 +#define CL_MEM_USES_SVM_POINTER 0x1109 + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A + +/* cl_pipe_info */ +#define CL_PIPE_PACKET_SIZE 
0x1120 +#define CL_PIPE_MAX_PACKETS 0x1121 + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 +#define CL_SAMPLER_MIP_FILTER_MODE 0x1155 +#define CL_SAMPLER_LOD_MIN 0x1156 +#define CL_SAMPLER_LOD_MAX 0x1157 + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#define CL_PROGRAM_BINARY_TYPE 0x1184 +#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 + +/* cl_program_binary_type */ +#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define 
CL_KERNEL_PROGRAM 0x1194 +#define CL_KERNEL_ATTRIBUTES 0x1195 + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +/* cl_kernel_arg_type_qualifer */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) +#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3) + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 + +/* cl_kernel_exec_info */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7 + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#define CL_EVENT_CONTEXT 0x11D4 + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define 
CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 +#define CL_COMMAND_SVM_FREE 0x1209 +#define CL_COMMAND_SVM_MEMCPY 0x120A +#define CL_COMMAND_SVM_MEMFILL 0x120B +#define CL_COMMAND_SVM_MAP 0x120C +#define CL_COMMAND_SVM_UNMAP 0x120D + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 +#define CL_PROFILING_COMMAND_COMPLETE 0x1284 + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int 
CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* 
param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithProperties(cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreatePipe(cl_context /* context */, + cl_mem_flags /* flags */, + cl_uint /* pipe_packet_size */, + cl_uint /* pipe_max_packets */, + const cl_pipe_properties * /* properties */, + cl_int * /* errcode_ret */) 
CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPipeInfo(cl_mem /* pipe */, + cl_pipe_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; + + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback(cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +/* SVM Allocation APIs */ +extern CL_API_ENTRY void * CL_API_CALL +clSVMAlloc(cl_context /* context */, + cl_svm_mem_flags /* flags */, + size_t /* size */, + cl_uint /* alignment */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFree(cl_context /* context */, + void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; + +/* Sampler APIs */ +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSamplerWithProperties(cl_context /* context */, + const 
cl_sampler_properties * /* normalized_coords */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) 
CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCompileProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2; + + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) 
CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointer(cl_kernel /* kernel */, + cl_uint /* arg_index */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfo(cl_kernel /* kernel */, + cl_kernel_exec_info /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* 
param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset 
*/, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * 
/* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* 
num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFree(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void 
(CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpy(cl_command_queue /* command_queue */, + cl_bool /* blocking_copy */, + void * /* dst_ptr */, + const void * /* src_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFill(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMap(cl_command_queue /* command_queue */, + cl_bool /* blocking_map */, + cl_map_flags /* flags */, + void * /* svm_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmap(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + + +/* Extension function access + * + * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. 
+ */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */, + const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* Deprecated 
OpenCL 2.0 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_2_0_DEPRECATED cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_2_0_DEPRECATED cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_2_0_DEPRECATED cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */ + diff --git a/3rdparty/opencl/CL/cl.hpp b/3rdparty/opencl/CL/cl.hpp new file mode 100644 index 0000000000..6b361671e6 --- /dev/null +++ b/3rdparty/opencl/CL/cl.hpp @@ -0,0 +1,12456 @@ +/******************************************************************************* + * Copyright (c) 2008-2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. 
+ * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/*! \file + * + * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and + * OpenCL 1.2 (rev 15) + * \author Benedict R. Gaster, Laurent Morichetti and Lee Howes + * + * Additions and fixes from: + * Brian Cole, March 3rd 2010 and April 2012 + * Matt Gruenke, April 2012. + * Bruce Merry, February 2013. + * Tom Deakin and Simon McIntosh-Smith, July 2013 + * + * \version 1.2.6 + * \date August 2013 + * + * Optional extension support + * + * cl + * cl_ext_device_fission + * #define USE_CL_DEVICE_FISSION + */ + +/*! \mainpage + * \section intro Introduction + * For many large applications C++ is the language of choice and so it seems + * reasonable to define C++ bindings for OpenCL. + * + * + * The interface is contained with a single C++ header file \em cl.hpp and all + * definitions are contained within the namespace \em cl. There is no additional + * requirement to include \em cl.h and to use either the C++ or original C + * bindings it is enough to simply include \em cl.hpp. + * + * The bindings themselves are lightweight and correspond closely to the + * underlying C API. Using the C++ bindings introduces no additional execution + * overhead. 
+ * + * For detail documentation on the bindings see: + * + * The OpenCL C++ Wrapper API 1.2 (revision 09) + * http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf + * + * \section example Example + * + * The following example shows a general use case for the C++ + * bindings, including support for the optional exception feature and + * also the supplied vector and string classes, see following sections for + * decriptions of these features. + * + * \code + * #define __CL_ENABLE_EXCEPTIONS + * + * #if defined(__APPLE__) || defined(__MACOSX) + * #include + * #else + * #include + * #endif + * #include + * #include + * #include + * + * const char * helloStr = "__kernel void " + * "hello(void) " + * "{ " + * " " + * "} "; + * + * int + * main(void) + * { + * cl_int err = CL_SUCCESS; + * try { + * + * std::vector platforms; + * cl::Platform::get(&platforms); + * if (platforms.size() == 0) { + * std::cout << "Platform size 0\n"; + * return -1; + * } + * + * cl_context_properties properties[] = + * { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0}; + * cl::Context context(CL_DEVICE_TYPE_CPU, properties); + * + * std::vector devices = context.getInfo(); + * + * cl::Program::Sources source(1, + * std::make_pair(helloStr,strlen(helloStr))); + * cl::Program program_ = cl::Program(context, source); + * program_.build(devices); + * + * cl::Kernel kernel(program_, "hello", &err); + * + * cl::Event event; + * cl::CommandQueue queue(context, devices[0], 0, &err); + * queue.enqueueNDRangeKernel( + * kernel, + * cl::NullRange, + * cl::NDRange(4,4), + * cl::NullRange, + * NULL, + * &event); + * + * event.wait(); + * } + * catch (cl::Error err) { + * std::cerr + * << "ERROR: " + * << err.what() + * << "(" + * << err.err() + * << ")" + * << std::endl; + * } + * + * return EXIT_SUCCESS; + * } + * + * \endcode + * + */ +#ifndef CL_HPP_ +#define CL_HPP_ + +#ifdef _WIN32 + +#include +#include +#include +#include + +#if defined(__CL_ENABLE_EXCEPTIONS) +#include 
+#endif // #if defined(__CL_ENABLE_EXCEPTIONS) + +#pragma push_macro("max") +#undef max +#if defined(USE_DX_INTEROP) +#include +#include +#endif +#endif // _WIN32 + +// +#if defined(USE_CL_DEVICE_FISSION) +#include +#endif + +#if defined(__APPLE__) || defined(__MACOSX) +#include +#include +#include +#else +#if defined(__ANDROID__) +#include +#else +#include +#endif +#include +#endif // !__APPLE__ + +// To avoid accidentally taking ownership of core OpenCL types +// such as cl_kernel constructors are made explicit +// under OpenCL 1.2 +#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +#define __CL_EXPLICIT_CONSTRUCTORS explicit +#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +#define __CL_EXPLICIT_CONSTRUCTORS +#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + +// Define deprecated prefixes and suffixes to ensure compilation +// in case they are not pre-defined +#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) + +#if !defined(CL_CALLBACK) +#define CL_CALLBACK +#endif //CL_CALLBACK + +#include +#include + +#if !defined(__NO_STD_VECTOR) +#include +#endif + +#if !defined(__NO_STD_STRING) +#include +#endif + +#if defined(linux) || defined(__APPLE__) || defined(__MACOSX) +#include + +#include +#include +#endif // linux + +#include + + +/*! \namespace cl + * + * \brief The OpenCL C++ bindings are defined within this namespace. 
+ * + */ +namespace cl { + +class Memory; + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +#define __INIT_CL_EXT_FCN_PTR(name) \ + if(!pfn_##name) { \ + pfn_##name = (PFN_##name) \ + clGetExtensionFunctionAddress(#name); \ + if(!pfn_##name) { \ + } \ + } +#endif // #if defined(CL_VERSION_1_1) + +#if defined(CL_VERSION_1_2) +#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \ + if(!pfn_##name) { \ + pfn_##name = (PFN_##name) \ + clGetExtensionFunctionAddressForPlatform(platform, #name); \ + if(!pfn_##name) { \ + } \ + } +#endif // #if defined(CL_VERSION_1_1) + +class Program; +class Device; +class Context; +class CommandQueue; +class Memory; +class Buffer; + +#if defined(__CL_ENABLE_EXCEPTIONS) +/*! \brief Exception class + * + * This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined. + */ +class Error : public std::exception +{ +private: + cl_int err_; + const char * errStr_; +public: + /*! \brief Create a new CL error exception for a given error code + * and corresponding message. + * + * \param err error code value. + * + * \param errStr a descriptive string that must remain in scope until + * handling of the exception has concluded. If set, it + * will be returned by what(). + */ + Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr) + {} + + ~Error() throw() {} + + /*! \brief Get error string associated with exception + * + * \return A memory pointer to the error message string. + */ + virtual const char * what() const throw () + { + if (errStr_ == NULL) { + return "empty"; + } + else { + return errStr_; + } + } + + /*! \brief Get error code associated with exception + * + * \return The error code. 
+ */ + cl_int err(void) const { return err_; } +}; + +#define __ERR_STR(x) #x +#else +#define __ERR_STR(x) NULL +#endif // __CL_ENABLE_EXCEPTIONS + + +namespace detail +{ +#if defined(__CL_ENABLE_EXCEPTIONS) +static inline cl_int errHandler ( + cl_int err, + const char * errStr = NULL) +{ + if (err != CL_SUCCESS) { + throw Error(err, errStr); + } + return err; +} +#else +static inline cl_int errHandler (cl_int err, const char * errStr = NULL) +{ + (void) errStr; // suppress unused variable warning + return err; +} +#endif // __CL_ENABLE_EXCEPTIONS +} + + + +//! \cond DOXYGEN_DETAIL +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo) +#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo) +#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs) +#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs) +#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo) +#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo) +#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo) +#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo) +#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo) +#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo) +#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo) +#if defined(CL_VERSION_1_2) +#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo) +#endif // #if defined(CL_VERSION_1_2) +#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo) +#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo) +#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo) +#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo) + +#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext) +#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType) +#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats) + +#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer) +#define 
__COPY_ERR __ERR_STR(cl::copy) +#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer) +#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) +#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) +#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo) +#if defined(CL_VERSION_1_2) +#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage) +#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture) +#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions) +#endif // #if defined(CL_VERSION_1_2) +#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler) +#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback) + +#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent) +#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus) +#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback) +#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents) + +#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel) +#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg) +#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource) +#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary) +#if defined(CL_VERSION_1_2) +#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels) +#endif // #if defined(CL_VERSION_1_2) +#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram) +#if defined(CL_VERSION_1_2) +#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram) + +#endif // #if defined(CL_VERSION_1_2) +#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram) + +#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue) +#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty) +#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer) +#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect) +#define __ENQUEUE_WRITE_BUFFER_ERR 
__ERR_STR(clEnqueueWriteBuffer) +#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect) +#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer) +#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect) +#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer) +#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage) +#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage) +#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage) +#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage) +#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer) +#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage) +#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer) +#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage) +#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject) +#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel) +#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask) +#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel) +#if defined(CL_VERSION_1_2) +#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects) +#endif // #if defined(CL_VERSION_1_2) + +#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects) +#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects) + + +#define __RETAIN_ERR __ERR_STR(Retain Object) +#define __RELEASE_ERR __ERR_STR(Release Object) +#define __FLUSH_ERR __ERR_STR(clFlush) +#define __FINISH_ERR __ERR_STR(clFinish) +#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error) + +/** + * CL 1.2 version that uses device fission. 
+ */ +#if defined(CL_VERSION_1_2) +#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices) +#else +#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT) +#endif // #if defined(CL_VERSION_1_2) + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker) +#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents) +#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier) +#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler) +#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D) +#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D) +#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D) +#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D) +#endif // #if defined(CL_VERSION_1_1) + +#endif // __CL_USER_OVERRIDE_ERROR_STRINGS +//! \endcond + +/** + * CL 1.2 marker and barrier commands + */ +#if defined(CL_VERSION_1_2) +#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList) +#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList) +#endif // #if defined(CL_VERSION_1_2) + +#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING) +typedef std::string STRING_CLASS; +#elif !defined(__USE_DEV_STRING) + +/*! \class string + * \brief Simple string class, that provides a limited subset of std::string + * functionality but avoids many of the issues that come with that class. + + * \note Deprecated. Please use std::string as default or + * re-define the string class to match the std::string + * interface by defining STRING_CLASS + */ +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +{ +private: + ::size_t size_; + char * str_; +public: + //! \brief Constructs an empty string, allocating no memory. + string(void) : size_(0), str_(NULL) + { + } + + /*! 
\brief Constructs a string populated from an arbitrary value of + * specified size. + * + * An extra '\0' is added, in case none was contained in str. + * + * \param str the initial value of the string instance. Note that '\0' + * characters receive no special treatment. If NULL, + * the string is left empty, with a size of 0. + * + * \param size the number of characters to copy from str. + */ + string(const char * str, ::size_t size) : + size_(size), + str_(NULL) + { + if( size > 0 ) { + str_ = new char[size_+1]; + if (str_ != NULL) { + memcpy(str_, str, size_ * sizeof(char)); + str_[size_] = '\0'; + } + else { + size_ = 0; + } + } + } + + /*! \brief Constructs a string populated from a null-terminated value. + * + * \param str the null-terminated initial value of the string instance. + * If NULL, the string is left empty, with a size of 0. + */ + string(const char * str) : + size_(0), + str_(NULL) + { + if( str ) { + size_= ::strlen(str); + } + if( size_ > 0 ) { + str_ = new char[size_ + 1]; + if (str_ != NULL) { + memcpy(str_, str, (size_ + 1) * sizeof(char)); + } + } + } + + void resize( ::size_t n ) + { + if( size_ == n ) { + return; + } + if (n == 0) { + if( str_ ) { + delete [] str_; + } + str_ = NULL; + size_ = 0; + } + else { + char *newString = new char[n + 1]; + ::size_t copySize = n; + if( size_ < n ) { + copySize = size_; + } + size_ = n; + + if(str_) { + memcpy(newString, str_, (copySize + 1) * sizeof(char)); + } + if( copySize < size_ ) { + memset(newString + copySize, 0, size_ - copySize); + } + newString[size_] = '\0'; + + delete [] str_; + str_ = newString; + } + } + + const char& operator[] ( ::size_t pos ) const + { + return str_[pos]; + } + + char& operator[] ( ::size_t pos ) + { + return str_[pos]; + } + + /*! \brief Copies the value of another string to this one. + * + * \param rhs the string to copy. + * + * \returns a reference to the modified instance. 
+ */ + string& operator=(const string& rhs) + { + if (this == &rhs) { + return *this; + } + + if( str_ != NULL ) { + delete [] str_; + str_ = NULL; + size_ = 0; + } + + if (rhs.size_ == 0 || rhs.str_ == NULL) { + str_ = NULL; + size_ = 0; + } + else { + str_ = new char[rhs.size_ + 1]; + size_ = rhs.size_; + + if (str_ != NULL) { + memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char)); + } + else { + size_ = 0; + } + } + + return *this; + } + + /*! \brief Constructs a string by copying the value of another instance. + * + * \param rhs the string to copy. + */ + string(const string& rhs) : + size_(0), + str_(NULL) + { + *this = rhs; + } + + //! \brief Destructor - frees memory used to hold the current value. + ~string() + { + delete[] str_; + str_ = NULL; + } + + //! \brief Queries the length of the string, excluding any added '\0's. + ::size_t size(void) const { return size_; } + + //! \brief Queries the length of the string, excluding any added '\0's. + ::size_t length(void) const { return size(); } + + /*! \brief Returns a pointer to the private copy held by this instance, + * or "" if empty/unset. + */ + const char * c_str(void) const { return (str_) ? str_ : "";} +}; +typedef cl::string STRING_CLASS; +#endif // #elif !defined(__USE_DEV_STRING) + +#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) +#define VECTOR_CLASS std::vector +#elif !defined(__USE_DEV_VECTOR) +#define VECTOR_CLASS cl::vector + +#if !defined(__MAX_DEFAULT_VECTOR_SIZE) +#define __MAX_DEFAULT_VECTOR_SIZE 10 +#endif + +/*! \class vector + * \brief Fixed sized vector implementation that mirroring + * + * \note Deprecated. Please use std::vector as default or + * re-define the vector class to match the std::vector + * interface by defining VECTOR_CLASS + + * \note Not recommended for use with custom objects as + * current implementation will construct N elements + * + * std::vector functionality. + * \brief Fixed sized vector compatible with std::vector. 
+ * + * \note + * This differs from std::vector<> not just in memory allocation, + * but also in terms of when members are constructed, destroyed, + * and assigned instead of being copy constructed. + * + * \param T type of element contained in the vector. + * + * \param N maximum size of the vector. + */ +template +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +{ +private: + T data_[N]; + unsigned int size_; + +public: + //! \brief Constructs an empty vector with no memory allocated. + vector() : + size_(static_cast(0)) + {} + + //! \brief Deallocates the vector's memory and destroys all of its elements. + ~vector() + { + clear(); + } + + //! \brief Returns the number of elements currently contained. + unsigned int size(void) const + { + return size_; + } + + /*! \brief Empties the vector of all elements. + * \note + * This does not deallocate memory but will invoke destructors + * on contained elements. + */ + void clear() + { + while(!empty()) { + pop_back(); + } + } + + /*! \brief Appends an element after the last valid element. + * Calling this on a vector that has reached capacity will throw an + * exception if exceptions are enabled. + */ + void push_back (const T& x) + { + if (size() < N) { + new (&data_[size_]) T(x); + size_++; + } else { + detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); + } + } + + /*! \brief Removes the last valid element from the vector. + * Calling this on an empty vector will throw an exception + * if exceptions are enabled. + */ + void pop_back(void) + { + if (size_ != 0) { + --size_; + data_[size_].~T(); + } else { + detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); + } + } + + /*! \brief Constructs with a value copied from another. + * + * \param vec the vector to copy. + */ + vector(const vector& vec) : + size_(vec.size_) + { + if (size_ != 0) { + assign(vec.begin(), vec.end()); + } + } + + /*! 
\brief Constructs with a specified number of initial elements. + * + * \param size number of initial elements. + * + * \param val value of initial elements. + */ + vector(unsigned int size, const T& val = T()) : + size_(0) + { + for (unsigned int i = 0; i < size; i++) { + push_back(val); + } + } + + /*! \brief Overwrites the current content with that copied from another + * instance. + * + * \param rhs vector to copy. + * + * \returns a reference to this. + */ + vector& operator=(const vector& rhs) + { + if (this == &rhs) { + return *this; + } + + if (rhs.size_ != 0) { + assign(rhs.begin(), rhs.end()); + } else { + clear(); + } + + return *this; + } + + /*! \brief Tests equality against another instance. + * + * \param vec the vector against which to compare. + */ + bool operator==(vector &vec) + { + if (size() != vec.size()) { + return false; + } + + for( unsigned int i = 0; i < size(); ++i ) { + if( operator[](i) != vec[i] ) { + return false; + } + } + return true; + } + + //! \brief Conversion operator to T*. + operator T* () { return data_; } + + //! \brief Conversion operator to const T*. + operator const T* () const { return data_; } + + //! \brief Tests whether this instance has any elements. + bool empty (void) const + { + return size_==0; + } + + //! \brief Returns the maximum number of elements this instance can hold. + unsigned int max_size (void) const + { + return N; + } + + //! \brief Returns the maximum number of elements this instance can hold. + unsigned int capacity () const + { + return N; + } + + /*! \brief Returns a reference to a given element. + * + * \param index which element to access. * + * \note + * The caller is responsible for ensuring index is >= 0 and < size(). + */ + T& operator[](int index) + { + return data_[index]; + } + + /*! \brief Returns a const reference to a given element. + * + * \param index which element to access. + * + * \note + * The caller is responsible for ensuring index is >= 0 and < size(). 
+ */ + const T& operator[](int index) const + { + return data_[index]; + } + + /*! \brief Assigns elements of the vector based on a source iterator range. + * + * \param start Beginning iterator of source range + * \param end Enditerator of source range + * + * \note + * Will throw an exception if exceptions are enabled and size exceeded. + */ + template + void assign(I start, I end) + { + clear(); + while(start != end) { + push_back(*start); + start++; + } + } + + /*! \class iterator + * \brief Const iterator class for vectors + */ + class iterator + { + private: + const vector *vec_; + int index_; + + /** + * Internal iterator constructor to capture reference + * to the vector it iterates over rather than taking + * the vector by copy. + */ + iterator (const vector &vec, int index) : + vec_(&vec) + { + if( !vec.empty() ) { + index_ = index; + } else { + index_ = -1; + } + } + + public: + iterator(void) : + index_(-1), + vec_(NULL) + { + } + + iterator(const iterator& rhs) : + vec_(rhs.vec_), + index_(rhs.index_) + { + } + + ~iterator(void) {} + + static iterator begin(const cl::vector &vec) + { + iterator i(vec, 0); + + return i; + } + + static iterator end(const cl::vector &vec) + { + iterator i(vec, vec.size()); + + return i; + } + + bool operator==(iterator i) + { + return ((vec_ == i.vec_) && + (index_ == i.index_)); + } + + bool operator!=(iterator i) + { + return (!(*this==i)); + } + + iterator& operator++() + { + ++index_; + return *this; + } + + iterator operator++(int) + { + iterator retVal(*this); + ++index_; + return retVal; + } + + iterator& operator--() + { + --index_; + return *this; + } + + iterator operator--(int) + { + iterator retVal(*this); + --index_; + return retVal; + } + + const T& operator *() const + { + return (*vec_)[index_]; + } + }; + + iterator begin(void) + { + return iterator::begin(*this); + } + + iterator begin(void) const + { + return iterator::begin(*this); + } + + iterator end(void) + { + return iterator::end(*this); + } + + 
iterator end(void) const + { + return iterator::end(*this); + } + + T& front(void) + { + return data_[0]; + } + + T& back(void) + { + return data_[size_]; + } + + const T& front(void) const + { + return data_[0]; + } + + const T& back(void) const + { + return data_[size_-1]; + } +}; +#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) + + + + + +namespace detail { +#define __DEFAULT_NOT_INITIALIZED 1 +#define __DEFAULT_BEING_INITIALIZED 2 +#define __DEFAULT_INITIALIZED 4 + + /* + * Compare and exchange primitives are needed for handling of defaults + */ + inline int compare_exchange(volatile int * dest, int exchange, int comparand) + { +#ifdef _WIN32 + return (int)(InterlockedCompareExchange( + (volatile long*)dest, + (long)exchange, + (long)comparand)); +#elif defined(__APPLE__) || defined(__MACOSX) + return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest); +#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX) + return (__sync_val_compare_and_swap( + dest, + comparand, + exchange)); +#endif // !_WIN32 + } + + inline void fence() { _mm_mfence(); } +} // namespace detail + + +/*! \brief class used to interface between C++ and + * OpenCL C calls that require arrays of size_t values, whose + * size is known statically. + */ +template +class size_t +{ +private: + ::size_t data_[N]; + +public: + //! \brief Initialize size_t to all 0s + size_t() + { + for( int i = 0; i < N; ++i ) { + data_[i] = 0; + } + } + + ::size_t& operator[](int index) + { + return data_[index]; + } + + const ::size_t& operator[](int index) const + { + return data_[index]; + } + + //! \brief Conversion operator to T*. + operator ::size_t* () { return data_; } + + //! \brief Conversion operator to const T*. + operator const ::size_t* () const { return data_; } +}; + +namespace detail { + +// Generic getInfoHelper. 
The final parameter is used to guide overload +// resolution: the actual parameter passed is an int, which makes this +// a worse conversion sequence than a specialization that declares the +// parameter as an int. +template +inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long) +{ + return f(name, sizeof(T), param, NULL); +} + +// Specialized getInfoHelper for VECTOR_CLASS params +template +inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, long) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + T* value = (T*) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + param->assign(&value[0], &value[required/sizeof(T)]); + return CL_SUCCESS; +} + +/* Specialization for reference-counted types. This depends on the + * existence of Wrapper::cl_type, and none of the other types having the + * cl_type member. Note that simplify specifying the parameter as Wrapper + * does not work, because when using a derived type (e.g. Context) the generic + * template will provide a better match. 
+ */ +template +inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int, typename T::cl_type = 0) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + typename T::cl_type * value = (typename T::cl_type *) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + ::size_t elements = required / sizeof(typename T::cl_type); + param->assign(&value[0], &value[elements]); + for (::size_t i = 0; i < elements; i++) + { + if (value[i] != NULL) + { + err = (*param)[i].retain(); + if (err != CL_SUCCESS) { + return err; + } + } + } + return CL_SUCCESS; +} + +// Specialized for getInfo +template +inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int) +{ + cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL); + + if (err != CL_SUCCESS) { + return err; + } + + return CL_SUCCESS; +} + +// Specialized GetInfoHelper for STRING_CLASS params +template +inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + char* value = (char*) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + *param = value; + return CL_SUCCESS; +} + +// Specialized GetInfoHelper for cl::size_t params +template +inline cl_int getInfoHelper(Func f, cl_uint name, size_t* param, long) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + ::size_t* value = (::size_t*) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + for(int i = 0; i < N; ++i) { + (*param)[i] = value[i]; + } + + return CL_SUCCESS; +} + +template struct ReferenceHandler; + +/* Specialization for reference-counted types. 
This depends on the + * existence of Wrapper::cl_type, and none of the other types having the + * cl_type member. Note that simplify specifying the parameter as Wrapper + * does not work, because when using a derived type (e.g. Context) the generic + * template will provide a better match. + */ +template +inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0) +{ + typename T::cl_type value; + cl_int err = f(name, sizeof(value), &value, NULL); + if (err != CL_SUCCESS) { + return err; + } + *param = value; + if (value != NULL) + { + err = param->retain(); + if (err != CL_SUCCESS) { + return err; + } + } + return CL_SUCCESS; +} + +#define __PARAM_NAME_INFO_1_0(F) \ + F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \ + \ + F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \ + F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \ + F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \ + F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \ + F(cl_device_info, 
CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \ + F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \ + F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \ + F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \ + F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \ + F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \ + F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \ + F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \ + F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \ + F(cl_device_info, CL_DRIVER_VERSION, 
STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \ + \ + F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ + F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS) \ + F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS) \ + \ + F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ + F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ + F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ + F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \ + \ + F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ + \ + F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ + F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ + F(cl_mem_info, CL_MEM_SIZE, ::size_t) \ + F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ + F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ + \ + F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ + F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \ + F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \ + F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \ + F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \ + \ + F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ + F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ + F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \ + F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \ + F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \ + \ + F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ + F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ 
+ F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ + F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS) \ + F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \ + F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \ + F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \ + \ + F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \ + F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ + F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \ + F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ + \ + F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ + F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ + F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) + +#if defined(CL_VERSION_1_1) +#define __PARAM_NAME_INFO_1_1(F) \ + F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, 
cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \ + \ + F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ + F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ + \ + F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) +#endif // CL_VERSION_1_1 + + +#if defined(CL_VERSION_1_2) +#define __PARAM_NAME_INFO_1_2(F) \ + F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \ + \ + F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \ + F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ + \ + F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \ + \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \ + \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \ + F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS) +#endif // #if defined(CL_VERSION_1_2) + +#if defined(USE_CL_DEVICE_FISSION) +#define __PARAM_NAME_DEVICE_FISSION(F) \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS) \ + 
F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ + F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS) +#endif // USE_CL_DEVICE_FISSION + +template +struct param_traits {}; + +#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \ +struct token; \ +template<> \ +struct param_traits \ +{ \ + enum { value = param_name }; \ + typedef T param_type; \ +}; + +__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS) +#if defined(CL_VERSION_1_1) +__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 +#if defined(CL_VERSION_1_2) +__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 + +#if defined(USE_CL_DEVICE_FISSION) +__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS); +#endif // USE_CL_DEVICE_FISSION + +#ifdef CL_PLATFORM_ICD_SUFFIX_KHR +__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS) +#endif + +#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) +#endif + +#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>) +#endif +#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) +#endif + +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint) +#endif +#ifdef CL_DEVICE_WARP_SIZE_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint) +#endif +#ifdef CL_DEVICE_GPU_OVERLAP_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool) +#endif +#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool) +#endif +#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) +#endif + +// Convenience functions + +template +inline cl_int +getInfo(Func f, cl_uint name, T* param) +{ + return getInfoHelper(f, name, param, 0); +} + +template +struct GetInfoFunctor0 +{ + Func f_; const Arg0& arg0_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { return f_(arg0_, param, size, value, size_ret); } +}; + +template +struct GetInfoFunctor1 +{ + Func f_; const Arg0& arg0_; const Arg1& arg1_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* 
size_ret) + { return f_(arg0_, arg1_, param, size, value, size_ret); } +}; + +template +inline cl_int +getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) +{ + GetInfoFunctor0 f0 = { f, arg0 }; + return getInfoHelper(f0, name, param, 0); +} + +template +inline cl_int +getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) +{ + GetInfoFunctor1 f0 = { f, arg0, arg1 }; + return getInfoHelper(f0, name, param, 0); +} + +template +struct ReferenceHandler +{ }; + +#if defined(CL_VERSION_1_2) +/** + * OpenCL 1.2 devices do have retain/release. + */ +template <> +struct ReferenceHandler +{ + /** + * Retain the device. + * \param device A valid device created using createSubDevices + * \return + * CL_SUCCESS if the function executed successfully. + * CL_INVALID_DEVICE if device was not a valid subdevice + * CL_OUT_OF_RESOURCES + * CL_OUT_OF_HOST_MEMORY + */ + static cl_int retain(cl_device_id device) + { return ::clRetainDevice(device); } + /** + * Retain the device. + * \param device A valid device created using createSubDevices + * \return + * CL_SUCCESS if the function executed successfully. + * CL_INVALID_DEVICE if device was not a valid subdevice + * CL_OUT_OF_RESOURCES + * CL_OUT_OF_HOST_MEMORY + */ + static cl_int release(cl_device_id device) + { return ::clReleaseDevice(device); } +}; +#else // #if defined(CL_VERSION_1_2) +/** + * OpenCL 1.1 devices do not have retain/release. + */ +template <> +struct ReferenceHandler +{ + // cl_device_id does not have retain(). + static cl_int retain(cl_device_id) + { return CL_SUCCESS; } + // cl_device_id does not have release(). + static cl_int release(cl_device_id) + { return CL_SUCCESS; } +}; +#endif // #if defined(CL_VERSION_1_2) + +template <> +struct ReferenceHandler +{ + // cl_platform_id does not have retain(). + static cl_int retain(cl_platform_id) + { return CL_SUCCESS; } + // cl_platform_id does not have release(). 
+ static cl_int release(cl_platform_id) + { return CL_SUCCESS; } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_context context) + { return ::clRetainContext(context); } + static cl_int release(cl_context context) + { return ::clReleaseContext(context); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_command_queue queue) + { return ::clRetainCommandQueue(queue); } + static cl_int release(cl_command_queue queue) + { return ::clReleaseCommandQueue(queue); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_mem memory) + { return ::clRetainMemObject(memory); } + static cl_int release(cl_mem memory) + { return ::clReleaseMemObject(memory); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_sampler sampler) + { return ::clRetainSampler(sampler); } + static cl_int release(cl_sampler sampler) + { return ::clReleaseSampler(sampler); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_program program) + { return ::clRetainProgram(program); } + static cl_int release(cl_program program) + { return ::clReleaseProgram(program); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_kernel kernel) + { return ::clRetainKernel(kernel); } + static cl_int release(cl_kernel kernel) + { return ::clReleaseKernel(kernel); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_event event) + { return ::clRetainEvent(event); } + static cl_int release(cl_event event) + { return ::clReleaseEvent(event); } +}; + + +// Extracts version number with major in the upper 16 bits, minor in the lower 16 +static cl_uint getVersion(const char *versionInfo) +{ + int highVersion = 0; + int lowVersion = 0; + int index = 7; + while(versionInfo[index] != '.' 
) { + highVersion *= 10; + highVersion += versionInfo[index]-'0'; + ++index; + } + ++index; + while(versionInfo[index] != ' ' ) { + lowVersion *= 10; + lowVersion += versionInfo[index]-'0'; + ++index; + } + return (highVersion << 16) | lowVersion; +} + +static cl_uint getPlatformVersion(cl_platform_id platform) +{ + ::size_t size = 0; + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); + char *versionInfo = (char *) alloca(size); + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size); + return getVersion(versionInfo); +} + +static cl_uint getDevicePlatformVersion(cl_device_id device) +{ + cl_platform_id platform; + clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); + return getPlatformVersion(platform); +} + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +static cl_uint getContextPlatformVersion(cl_context context) +{ + // The platform cannot be queried directly, so we first have to grab a + // device and obtain its context + ::size_t size = 0; + clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); + if (size == 0) + return 0; + cl_device_id *devices = (cl_device_id *) alloca(size); + clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL); + return getDevicePlatformVersion(devices[0]); +} +#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + +template +class Wrapper +{ +public: + typedef T cl_type; + +protected: + cl_type object_; + +public: + Wrapper() : object_(NULL) { } + + Wrapper(const cl_type &obj) : object_(obj) { } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper& operator = (const Wrapper& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), 
__RETAIN_ERR); } + return *this; + } + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + +protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + cl_int retain() const + { + return ReferenceHandler::retain(object_); + } + + cl_int release() const + { + return ReferenceHandler::release(object_); + } +}; + +template <> +class Wrapper +{ +public: + typedef cl_device_id cl_type; + +protected: + cl_type object_; + bool referenceCountable_; + + static bool isReferenceCountable(cl_device_id device) + { + bool retVal = false; + if (device != NULL) { + int version = getDevicePlatformVersion(device); + if(version > ((1 << 16) + 1)) { + retVal = true; + } + } + return retVal; + } + +public: + Wrapper() : object_(NULL), referenceCountable_(false) + { + } + + Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) + { + referenceCountable_ = isReferenceCountable(obj); + } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + referenceCountable_ = isReferenceCountable(object_); + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper& operator = (const Wrapper& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + referenceCountable_ = rhs.referenceCountable_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + return *this; + } + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + referenceCountable_ = isReferenceCountable(object_); + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } 
+ +protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + template + friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS*, int, typename U::cl_type); + + cl_int retain() const + { + if( referenceCountable_ ) { + return ReferenceHandler::retain(object_); + } + else { + return CL_SUCCESS; + } + } + + cl_int release() const + { + if( referenceCountable_ ) { + return ReferenceHandler::release(object_); + } + else { + return CL_SUCCESS; + } + } +}; + +} // namespace detail +//! \endcond + +/*! \stuct ImageFormat + * \brief Adds constructors and member functions for cl_image_format. + * + * \see cl_image_format + */ +struct ImageFormat : public cl_image_format +{ + //! \brief Default constructor - performs no initialization. + ImageFormat(){} + + //! \brief Initializing constructor. + ImageFormat(cl_channel_order order, cl_channel_type type) + { + image_channel_order = order; + image_channel_data_type = type; + } + + //! \brief Assignment operator. + ImageFormat& operator = (const ImageFormat& rhs) + { + if (this != &rhs) { + this->image_channel_data_type = rhs.image_channel_data_type; + this->image_channel_order = rhs.image_channel_order; + } + return *this; + } +}; + +/*! \brief Class interface for cl_device_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_device_id + */ +class Device : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Device() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const Device& device) : detail::Wrapper(device) { } + + /*! \brief Constructor from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const cl_device_id &device) : detail::Wrapper(device) { } + + /*! 
\brief Returns the first device on the default context. + * + * \see Context::getDefault() + */ + static Device getDefault(cl_int * err = NULL); + + /*! \brief Assignment operator from Device. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const Device& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const cl_device_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetDeviceInfo(). + template + cl_int getInfo(cl_device_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetDeviceInfo, object_, name, param), + __GET_DEVICE_INFO_ERR); + } + + //! \brief Wrapper for clGetDeviceInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_device_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /** + * CL 1.2 version + */ +#if defined(CL_VERSION_1_2) + //! \brief Wrapper for clCreateSubDevicesEXT(). 
+ cl_int createSubDevices( + const cl_device_partition_property * properties, + VECTOR_CLASS* devices) + { + cl_uint n = 0; + cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = clCreateSubDevices(object_, properties, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif // #if defined(CL_VERSION_1_2) + +/** + * CL 1.1 version that uses device fission. + */ +#if defined(CL_VERSION_1_1) +#if defined(USE_CL_DEVICE_FISSION) + cl_int createSubDevices( + const cl_device_partition_property_ext * properties, + VECTOR_CLASS* devices) + { + typedef CL_API_ENTRY cl_int + ( CL_API_CALL * PFN_clCreateSubDevicesEXT)( + cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; + __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT); + + cl_uint n = 0; + cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif // #if defined(USE_CL_DEVICE_FISSION) +#endif // #if defined(CL_VERSION_1_1) +}; + +/*! \brief Class interface for cl_platform_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. 
+ * + * \see cl_platform_id + */ +class Platform : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Platform() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform(const Platform& platform) : detail::Wrapper(platform) { } + + /*! \brief Constructor from cl_platform_id. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform(const cl_platform_id &platform) : detail::Wrapper(platform) { } + + /*! \brief Assignment operator from Platform. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform& operator = (const Platform& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_platform_id. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform& operator = (const cl_platform_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetPlatformInfo(). + cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetPlatformInfo, object_, name, param), + __GET_PLATFORM_INFO_ERR); + } + + //! \brief Wrapper for clGetPlatformInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_platform_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of devices for this platform. + * + * Wraps clGetDeviceIDs(). 
+ */ + cl_int getDevices( + cl_device_type type, + VECTOR_CLASS* devices) const + { + cl_uint n = 0; + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = ::clGetDeviceIDs(object_, type, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + +#if defined(USE_DX_INTEROP) + /*! \brief Get the list of available D3D10 devices. + * + * \param d3d_device_source. + * + * \param d3d_object. + * + * \param d3d_device_set. + * + * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device + * values returned in devices can be used to identify a specific OpenCL + * device. If \a devices argument is NULL, this argument is ignored. + * + * \return One of the following values: + * - CL_SUCCESS if the function is executed successfully. + * + * The application can query specific capabilities of the OpenCL device(s) + * returned by cl::getDevices. This can be used by the application to + * determine which device(s) to use. + * + * \note In the case that exceptions are enabled and a return value + * other than CL_SUCCESS is generated, then cl::Error exception is + * generated. 
+ */ + cl_int getDevices( + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + VECTOR_CLASS* devices) const + { + typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint* num_devices); + + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + + static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; + __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR); + + cl_uint n = 0; + cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + 0, + NULL, + &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + n, + ids, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif + + /*! \brief Gets a list of available platforms. + * + * Wraps clGetPlatformIDs(). 
+ */ + static cl_int get( + VECTOR_CLASS* platforms) + { + cl_uint n = 0; + + if( platforms == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + platforms->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static cl_int get( + Platform * platform) + { + cl_uint n = 0; + + if( platform == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + *platform = ids[0]; + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform, returning it by value. + * + * Wraps clGetPlatformIDs(), returning the first result. 
+ */ + static Platform get( + cl_int * errResult = NULL) + { + Platform platform; + cl_uint n = 0; + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + if (errResult != NULL) { + *errResult = err; + } + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + if (errResult != NULL) { + *errResult = err; + } + + return ids[0]; + } + + static Platform getDefault( + cl_int *errResult = NULL ) + { + return get(errResult); + } + + +#if defined(CL_VERSION_1_2) + //! \brief Wrapper for clUnloadCompiler(). + cl_int + unloadCompiler() + { + return ::clUnloadPlatformCompiler(object_); + } +#endif // #if defined(CL_VERSION_1_2) +}; // class Platform + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +/** + * Unload the OpenCL compiler. + * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. + */ +inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int +UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline cl_int +UnloadCompiler() +{ + return ::clUnloadCompiler(); +} +#endif // #if defined(CL_VERSION_1_1) + +/*! \brief Class interface for cl_context. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_context as the original. For details, see + * clRetainContext() and clReleaseContext(). + * + * \see cl_context + */ +class Context + : public detail::Wrapper +{ +private: + static volatile int default_initialized_; + static Context default_; + static volatile cl_int default_error_; +public: + /*! \brief Destructor. + * + * This calls clReleaseContext() on the value held by this instance. + */ + ~Context() { } + + /*! 
\brief Constructs a context including a list of specified devices. + * + * Wraps clCreateContext(). + */ + Context( + const VECTOR_CLASS& devices, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateContext( + properties, (cl_uint) numDevices, + deviceIDs, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + Context( + const Device& device, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + cl_device_id deviceID = device(); + + object_ = ::clCreateContext( + properties, 1, + &deviceID, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a context including all or a subset of devices of a specified type. + * + * Wraps clCreateContextFromType(). 
+ */ + Context( + cl_device_type type, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + +#if !defined(__APPLE__) || !defined(__MACOS) + cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 }; + + if (properties == NULL) { + // Get a valid platform ID as we cannot send in a blank one + VECTOR_CLASS platforms; + error = Platform::get(&platforms); + if (error != CL_SUCCESS) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + return; + } + + // Check the platforms we found for a device of our specified type + cl_context_properties platform_id = 0; + for (unsigned int i = 0; i < platforms.size(); i++) { + + VECTOR_CLASS devices; + +#if defined(__CL_ENABLE_EXCEPTIONS) + try { +#endif + + error = platforms[i].getDevices(type, &devices); + +#if defined(__CL_ENABLE_EXCEPTIONS) + } catch (Error) {} + // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type + // We do error checking next anyway, and can throw there if needed +#endif + + // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND + if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + if (devices.size() > 0) { + platform_id = (cl_context_properties)platforms[i](); + break; + } + } + + if (platform_id == 0) { + detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = CL_DEVICE_NOT_FOUND; + } + return; + } + + prop[1] = platform_id; + properties = &prop[0]; + } +#endif + object_ = ::clCreateContextFromType( + properties, type, notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! 
\brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. + * + * \note All calls to this function return the same cl_context as the first. + */ + static Context getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... + while(default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + default_ = Context( + CL_DEVICE_TYPE_DEFAULT, + NULL, + NULL, + NULL, + &error); + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... + default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + //! \brief Default constructor - initializes to NULL. + Context() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This calls clRetainContext() on the parameter's cl_context. + */ + Context(const Context& context) : detail::Wrapper(context) { } + + /*! \brief Constructor from cl_context - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_context + * into the new Context object. + */ + __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper(context) { } + + /*! \brief Assignment operator from Context. + * + * This calls clRetainContext() on the parameter and clReleaseContext() on + * the previous value held by this instance. + */ + Context& operator = (const Context& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_context - takes ownership. 
+ * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseContext() on the value previously held by this instance. + */ + Context& operator = (const cl_context& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetContextInfo(). + template + cl_int getInfo(cl_context_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetContextInfo, object_, name, param), + __GET_CONTEXT_INFO_ERR); + } + + //! \brief Wrapper for clGetContextInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_context_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of supported image formats. + * + * Wraps clGetSupportedImageFormats(). + */ + cl_int getSupportedImageFormats( + cl_mem_flags flags, + cl_mem_object_type type, + VECTOR_CLASS* formats) const + { + cl_uint numEntries; + cl_int err = ::clGetSupportedImageFormats( + object_, + flags, + type, + 0, + NULL, + &numEntries); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + ImageFormat* value = (ImageFormat*) + alloca(numEntries * sizeof(ImageFormat)); + err = ::clGetSupportedImageFormats( + object_, + flags, + type, + numEntries, + (cl_image_format*) value, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + formats->assign(&value[0], &value[numEntries]); + return CL_SUCCESS; + } +}; + +inline Device Device::getDefault(cl_int * err) +{ + cl_int error; + Device device; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + device = context.getInfo()[0]; + if (err 
!= NULL) { + *err = CL_SUCCESS; + } + } + + return device; +} + + +#ifdef _WIN32 +__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__declspec(selectany) Context Context::default_; +__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS; +#else +__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__attribute__((weak)) Context Context::default_; +__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS; +#endif + +/*! \brief Class interface for cl_event. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_event as the original. For details, see + * clRetainEvent() and clReleaseEvent(). + * + * \see cl_event + */ +class Event : public detail::Wrapper +{ +public: + /*! \brief Destructor. + * + * This calls clReleaseEvent() on the value held by this instance. + */ + ~Event() { } + + //! \brief Default constructor - initializes to NULL. + Event() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This calls clRetainEvent() on the parameter's cl_event. + */ + Event(const Event& event) : detail::Wrapper(event) { } + + /*! \brief Constructor from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_event + * into the new Event object. + */ + Event(const cl_event& event) : detail::Wrapper(event) { } + + /*! \brief Assignment operator from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseEvent() on the value previously held by this instance. + */ + Event& operator = (const Event& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_event. + * + * This calls clRetainEvent() on the parameter and clReleaseEvent() on + * the previous value held by this instance. 
+ */ + Event& operator = (const cl_event& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetEventInfo(). + template + cl_int getInfo(cl_event_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetEventInfo, object_, name, param), + __GET_EVENT_INFO_ERR); + } + + //! \brief Wrapper for clGetEventInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_event_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + //! \brief Wrapper for clGetEventProfilingInfo(). + template + cl_int getProfilingInfo(cl_profiling_info name, T* param) const + { + return detail::errHandler(detail::getInfo( + &::clGetEventProfilingInfo, object_, name, param), + __GET_EVENT_PROFILE_INFO_ERR); + } + + //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. + template typename + detail::param_traits::param_type + getProfilingInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_profiling_info, name>::param_type param; + cl_int result = getProfilingInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Blocks the calling thread until this event completes. + * + * Wraps clWaitForEvents(). + */ + cl_int wait() const + { + return detail::errHandler( + ::clWaitForEvents(1, &object_), + __WAIT_FOR_EVENTS_ERR); + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a user callback function for a specific command execution status. + * + * Wraps clSetEventCallback(). + */ + cl_int setCallback( + cl_int type, + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetEventCallback( + object_, + type, + pfn_notify, + user_data), + __SET_EVENT_CALLBACK_ERR); + } +#endif + + /*! 
\brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ + static cl_int + waitForEvents(const VECTOR_CLASS& events) + { + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); + } +}; + +#if defined(CL_VERSION_1_1) +/*! \brief Class interface for user events (a subset of cl_event's). + * + * See Event for details about copy semantics, etc. + */ +class UserEvent : public Event +{ +public: + /*! \brief Constructs a user event on a given context. + * + * Wraps clCreateUserEvent(). + */ + UserEvent( + const Context& context, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateUserEvent( + context(), + &error); + + detail::errHandler(error, __CREATE_USER_EVENT_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + UserEvent() : Event() { } + + //! \brief Copy constructor - performs shallow copy. + UserEvent(const UserEvent& event) : Event(event) { } + + //! \brief Assignment Operator - performs shallow copy. + UserEvent& operator = (const UserEvent& rhs) + { + if (this != &rhs) { + Event::operator=(rhs); + } + return *this; + } + + /*! \brief Sets the execution status of a user event object. + * + * Wraps clSetUserEventStatus(). + */ + cl_int setStatus(cl_int status) + { + return detail::errHandler( + ::clSetUserEventStatus(object_,status), + __SET_USER_EVENT_STATUS_ERR); + } +}; +#endif + +/*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ +inline static cl_int +WaitForEvents(const VECTOR_CLASS& events) +{ + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); +} + +/*! \brief Class interface for cl_mem. 
+ * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_mem as the original. For details, see + * clRetainMemObject() and clReleaseMemObject(). + * + * \see cl_mem + */ +class Memory : public detail::Wrapper +{ +public: + + /*! \brief Destructor. + * + * This calls clReleaseMemObject() on the value held by this instance. + */ + ~Memory() {} + + //! \brief Default constructor - initializes to NULL. + Memory() : detail::Wrapper() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainMemObject() on the parameter's cl_mem. + */ + Memory(const Memory& memory) : detail::Wrapper(memory) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_mem + * into the new Memory object. + */ + __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper(memory) { } + + /*! \brief Assignment operator from Memory. + * + * This calls clRetainMemObject() on the parameter and clReleaseMemObject() + * on the previous value held by this instance. + */ + Memory& operator = (const Memory& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseMemObject() on the value previously held by this instance. + */ + Memory& operator = (const cl_mem& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetMemObjectInfo(). + template + cl_int getInfo(cl_mem_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetMemObjectInfo, object_, name, param), + __GET_MEM_OBJECT_INFO_ERR); + } + + //! \brief Wrapper for clGetMemObjectInfo() that returns by value. 
+ template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_mem_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a callback function to be called when the memory object + * is no longer needed. + * + * Wraps clSetMemObjectDestructorCallback(). + * + * Repeated calls to this function, for a given cl_mem value, will append + * to the list of functions called (in reverse order) when memory object's + * resources are freed and the memory object is deleted. + * + * \note + * The registered callbacks are associated with the underlying cl_mem + * value - not the Memory class instance. + */ + cl_int setDestructorCallback( + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetMemObjectDestructorCallback( + object_, + pfn_notify, + user_data), + __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); + } +#endif + +}; + +// Pre-declare copy functions +class Buffer; +template< typename IteratorType > +cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); +template< typename IteratorType > +cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); +template< typename IteratorType > +cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); +template< typename IteratorType > +cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); + + +/*! \brief Class interface for Buffer Memory Objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Buffer : public Memory +{ +public: + + /*! \brief Constructs a Buffer in a specified context. + * + * Wraps clCreateBuffer(). 
+ * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + */ + Buffer( + const Context& context, + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a Buffer in the default context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + * + * \see Context::getDefault() + */ + Buffer( + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(err); + + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators. + * IteratorType must be random access. + * If useHostPtr is specified iterators must represent contiguous data. 
+ */ + template< typename IteratorType > + Buffer( + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr = false, + cl_int* err = NULL) + { + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if( readOnly ) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if( useHostPtr ) { + flags |= CL_MEM_USE_HOST_PTR; + } + + ::size_t size = sizeof(DataType)*(endIterator - startIterator); + + Context context = Context::getDefault(err); + + if( useHostPtr ) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + if( !useHostPtr ) { + error = cl::copy(startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators using a specified context. + * IteratorType must be random access. + * If useHostPtr is specified iterators must represent contiguous data. + */ + template< typename IteratorType > + Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + + //! \brief Default constructor - initializes to NULL. + Buffer() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Buffer(const Buffer& buffer) : Memory(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { } + + /*! \brief Assignment from Buffer - performs shallow copy. + * + * See Memory for further details. 
+ */ + Buffer& operator = (const Buffer& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + +#if defined(CL_VERSION_1_1) + /*! \brief Creates a new buffer object from this. + * + * Wraps clCreateSubBuffer(). + */ + Buffer createSubBuffer( + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * err = NULL) + { + Buffer result; + cl_int error; + result.object_ = ::clCreateSubBuffer( + object_, + flags, + buffer_create_type, + buffer_create_info, + &error); + + detail::errHandler(error, __CREATE_SUBBUFFER_ERR); + if (err != NULL) { + *err = error; + } + + return result; + } +#endif +}; + +#if defined (USE_DX_INTEROP) +/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's. + * + * This is provided to facilitate interoperability with Direct3D. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferD3D10 : public Buffer +{ +public: + typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, + cl_int* errcode_ret); + + /*! \brief Constructs a BufferD3D10, in a specified context, from a + * given ID3D10Buffer. + * + * Wraps clCreateFromD3D10BufferKHR(). 
+ */ + BufferD3D10( + const Context& context, + cl_mem_flags flags, + ID3D10Buffer* bufobj, + cl_int * err = NULL) + { + static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL; + +#if defined(CL_VERSION_1_2) + vector props = context.getInfo(); + cl_platform platform = -1; + for( int i = 0; i < props.size(); ++i ) { + if( props[i] == CL_CONTEXT_PLATFORM ) { + platform = props[i+1]; + } + } + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR); +#endif + + cl_int error; + object_ = pfn_clCreateFromD3D10BufferKHR( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferD3D10() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferD3D10 - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const BufferD3D10& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } +}; +#endif + +/*! \brief Class interface for GL Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferGL : public Buffer +{ +public: + /*! 
\brief Constructs a BufferGL in a specified context, from a given + * GL buffer. + * + * Wraps clCreateFromGLBuffer(). + */ + BufferGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLBuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const BufferGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_,type,gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief Class interface for GL Render Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferRenderGL : public Buffer +{ +public: + /*! \brief Constructs a BufferRenderGL in a specified context, from a given + * GL Renderbuffer. + * + * Wraps clCreateFromGLRenderbuffer(). 
+ */ + BufferRenderGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLRenderbuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferRenderGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const BufferRenderGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_,type,gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief C++ base class for Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image : public Memory +{ +protected: + //! \brief Default constructor - initializes to NULL. + Image() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image(const Image& image) : Memory(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. 
+ */ + __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { } + + /*! \brief Assignment from Image - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const Image& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + +public: + //! \brief Wrapper for clGetImageInfo(). + template + cl_int getImageInfo(cl_image_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetImageInfo, object_, name, param), + __GET_IMAGE_INFO_ERR); + } + + //! \brief Wrapper for clGetImageInfo() that returns by value. + template typename + detail::param_traits::param_type + getImageInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_image_info, name>::param_type param; + cl_int result = getImageInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +#if defined(CL_VERSION_1_2) +/*! \brief Class interface for 1D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image1D : public Image +{ +public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image1D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE1D, + width, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image1D() { } + + /*! \brief Copy constructor - performs shallow copy. 
+ * + * See Memory for further details. + */ + Image1D(const Image1D& image1D) : Image(image1D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { } + + /*! \brief Assignment from Image1D - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const Image1D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + +/*! \class Image1DBuffer + * \brief Image interface for 1D buffer images. + */ +class Image1DBuffer : public Image +{ +public: + Image1DBuffer( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + const Buffer &buffer, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + width, + 0, 0, 0, 0, 0, 0, 0, + buffer() + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + NULL, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DBuffer() { } + + Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { } + + Image1DBuffer& operator = (const Image1DBuffer& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DBuffer& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + +/*! \class Image1DArray + * \brief Image interface for arrays of 1D images. 
+ */ +class Image1DArray : public Image +{ +public: + Image1DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t rowPitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE1D_ARRAY, + width, + 0, 0, // height, depth (unused) + arraySize, + rowPitch, + 0, 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DArray() { } + + Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image1DArray& operator = (const Image1DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; +#endif // #if defined(CL_VERSION_1_2) + + +/*! \brief Class interface for 2D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image2D : public Image +{ +public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). 
+ */ + Image2D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t row_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D, + width, + height, + 0, 0, // depth, array size (unused) + row_pitch, + 0, 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage2D( + context(), flags,&format, width, height, row_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE2D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2D(const Image2D& image2D) : Image(image2D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { } + + /*! \brief Assignment from Image2D - performs shallow copy. + * + * See Memory for further details. 
+ */ + Image2D& operator = (const Image2D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + + +#if !defined(CL_VERSION_1_2) +/*! \brief Class interface for GL 2D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. + */ +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D +{ +public: + /*! \brief Constructs an Image2DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture2D(). + */ + Image2DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture2D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); + if (err != NULL) { + *err = error; + } + + } + + //! \brief Default constructor - initializes to NULL. + Image2DGL() : Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL(const Image2DGL& image) : Image2D(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { } + + /*! \brief Assignment from Image2DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const Image2DGL& rhs) + { + if (this != &rhs) { + Image2D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. 
+ * + * See Memory for further details. + */ + Image2DGL& operator = (const cl_mem& rhs) + { + Image2D::operator=(rhs); + return *this; + } +}; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) +/*! \class Image2DArray + * \brief Image interface for arrays of 2D images. + */ +class Image2DArray : public Image +{ +public: + Image2DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t height, + ::size_t rowPitch, + ::size_t slicePitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D_ARRAY, + width, + height, + 0, // depth (unused) + arraySize, + rowPitch, + slicePitch, + 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image2DArray() { } + + Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image2DArray& operator = (const Image2DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image2DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; +#endif // #if defined(CL_VERSION_1_2) + +/*! \brief Class interface for 3D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3D : public Image +{ +public: + /*! \brief Constructs a 3D Image in a specified context. + * + * Wraps clCreateImage(). 
+ */ + Image3D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t depth, + ::size_t row_pitch = 0, + ::size_t slice_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE3D, + width, + height, + depth, + 0, // array size (unused) + row_pitch, + slice_pitch, + 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage3D( + context(), flags, &format, width, height, depth, row_pitch, + slice_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE3D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3D(const Image3D& image3D) : Image(image3D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { } + + /*! \brief Assignment from Image3D - performs shallow copy. 
+ * + * See Memory for further details. + */ + Image3D& operator = (const Image3D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + +#if !defined(CL_VERSION_1_2) +/*! \brief Class interface for GL 3D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3DGL : public Image3D +{ +public: + /*! \brief Constructs an Image3DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture3D(). + */ + Image3DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture3D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image3DGL() : Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL(const Image3DGL& image) : Image3D(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { } + + /*! \brief Assignment from Image3DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const Image3DGL& rhs) + { + if (this != &rhs) { + Image3D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. 
+ */ + Image3DGL& operator = (const cl_mem& rhs) + { + Image3D::operator=(rhs); + return *this; + } +}; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) +/*! \class ImageGL + * \brief general image interface for GL interop. + * We abstract the 2D and 3D GL images into a single instance here + * that wraps all GL sourced images on the grounds that setup information + * was performed by OpenCL anyway. + */ +class ImageGL : public Image +{ +public: + ImageGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); + if (err != NULL) { + *err = error; + } + } + + ImageGL() : Image() { } + + ImageGL(const ImageGL& image) : Image(image) { } + + __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { } + + ImageGL& operator = (const ImageGL& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + ImageGL& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; +#endif // #if defined(CL_VERSION_1_2) + +/*! \brief Class interface for cl_sampler. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_sampler as the original. For details, see + * clRetainSampler() and clReleaseSampler(). + * + * \see cl_sampler + */ +class Sampler : public detail::Wrapper +{ +public: + /*! \brief Destructor. + * + * This calls clReleaseSampler() on the value held by this instance. + */ + ~Sampler() { } + + //! \brief Default constructor - initializes to NULL. + Sampler() { } + + /*! \brief Constructs a Sampler in a specified context. + * + * Wraps clCreateSampler(). 
+ */ + Sampler( + const Context& context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateSampler( + context(), + normalized_coords, + addressing_mode, + filter_mode, + &error); + + detail::errHandler(error, __CREATE_SAMPLER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainSampler() on the parameter's cl_sampler. + */ + Sampler(const Sampler& sampler) : detail::Wrapper(sampler) { } + + /*! \brief Constructor from cl_sampler - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_sampler + * into the new Sampler object. + */ + Sampler(const cl_sampler& sampler) : detail::Wrapper(sampler) { } + + /*! \brief Assignment operator from Sampler. + * + * This calls clRetainSampler() on the parameter and clReleaseSampler() + * on the previous value held by this instance. + */ + Sampler& operator = (const Sampler& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_sampler - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseSampler() on the value previously held by this instance. + */ + Sampler& operator = (const cl_sampler& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetSamplerInfo(). + template + cl_int getInfo(cl_sampler_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetSamplerInfo, object_, name, param), + __GET_SAMPLER_INFO_ERR); + } + + //! \brief Wrapper for clGetSamplerInfo() that returns by value. 
+ template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_sampler_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +class Program; +class CommandQueue; +class Kernel; + +//! \brief Class interface for specifying NDRange values. +class NDRange +{ +private: + size_t<3> sizes_; + cl_uint dimensions_; + +public: + //! \brief Default constructor - resulting range has zero dimensions. + NDRange() + : dimensions_(0) + { } + + //! \brief Constructs one-dimensional range. + NDRange(::size_t size0) + : dimensions_(1) + { + sizes_[0] = size0; + } + + //! \brief Constructs two-dimensional range. + NDRange(::size_t size0, ::size_t size1) + : dimensions_(2) + { + sizes_[0] = size0; + sizes_[1] = size1; + } + + //! \brief Constructs three-dimensional range. + NDRange(::size_t size0, ::size_t size1, ::size_t size2) + : dimensions_(3) + { + sizes_[0] = size0; + sizes_[1] = size1; + sizes_[2] = size2; + } + + /*! \brief Conversion operator to const ::size_t *. + * + * \returns a pointer to the size of the first dimension. + */ + operator const ::size_t*() const { + return (const ::size_t*) sizes_; + } + + //! \brief Queries the number of dimensions in the range. + ::size_t dimensions() const { return dimensions_; } +}; + +//! \brief A zero-dimensional range. +static const NDRange NullRange; + +//! \brief Local address wrapper for use with Kernel::setArg +struct LocalSpaceArg +{ + ::size_t size_; +}; + +namespace detail { + +template +struct KernelArgumentHandler +{ + static ::size_t size(const T&) { return sizeof(T); } + static T* ptr(T& value) { return &value; } +}; + +template <> +struct KernelArgumentHandler +{ + static ::size_t size(const LocalSpaceArg& value) { return value.size_; } + static void* ptr(LocalSpaceArg&) { return NULL; } +}; + +} +//! \endcond + +/*! 
__local + * \brief Helper function for generating LocalSpaceArg objects. + * Deprecated. Replaced with Local. + */ +inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg +__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline LocalSpaceArg +__local(::size_t size) +{ + LocalSpaceArg ret = { size }; + return ret; +} + +/*! Local + * \brief Helper function for generating LocalSpaceArg objects. + */ +inline LocalSpaceArg +Local(::size_t size) +{ + LocalSpaceArg ret = { size }; + return ret; +} + +//class KernelFunctor; + +/*! \brief Class interface for cl_kernel. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_kernel as the original. For details, see + * clRetainKernel() and clReleaseKernel(). + * + * \see cl_kernel + */ +class Kernel : public detail::Wrapper +{ +public: + inline Kernel(const Program& program, const char* name, cl_int* err = NULL); + + /*! \brief Destructor. + * + * This calls clReleaseKernel() on the value held by this instance. + */ + ~Kernel() { } + + //! \brief Default constructor - initializes to NULL. + Kernel() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainKernel() on the parameter's cl_kernel. + */ + Kernel(const Kernel& kernel) : detail::Wrapper(kernel) { } + + /*! \brief Constructor from cl_kernel - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_kernel + * into the new Kernel object. + */ + __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper(kernel) { } + + /*! \brief Assignment operator from Kernel. + * + * This calls clRetainKernel() on the parameter and clReleaseKernel() + * on the previous value held by this instance. + */ + Kernel& operator = (const Kernel& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_kernel - takes ownership. 
+ * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseKernel() on the value previously held by this instance. + */ + Kernel& operator = (const cl_kernel& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + template + cl_int getInfo(cl_kernel_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelInfo, object_, name, param), + __GET_KERNEL_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_2) + template + cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param), + __GET_KERNEL_ARG_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getArgInfo(cl_uint argIndex, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_arg_info, name>::param_type param; + cl_int result = getArgInfo(argIndex, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +#endif // #if defined(CL_VERSION_1_2) + + template + cl_int getWorkGroupInfo( + const Device& device, cl_kernel_work_group_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetKernelWorkGroupInfo, object_, device(), name, param), + __GET_KERNEL_WORK_GROUP_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getWorkGroupInfo(const Device& device, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_work_group_info, name>::param_type param; + cl_int result = getWorkGroupInfo(device, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + template + cl_int setArg(cl_uint index, 
T value) + { + return detail::errHandler( + ::clSetKernelArg( + object_, + index, + detail::KernelArgumentHandler::size(value), + detail::KernelArgumentHandler::ptr(value)), + __SET_KERNEL_ARGS_ERR); + } + + cl_int setArg(cl_uint index, ::size_t size, void* argPtr) + { + return detail::errHandler( + ::clSetKernelArg(object_, index, size, argPtr), + __SET_KERNEL_ARGS_ERR); + } +}; + +/*! \class Program + * \brief Program interface that implements cl_program. + */ +class Program : public detail::Wrapper +{ +public: + typedef VECTOR_CLASS > Binaries; + typedef VECTOR_CLASS > Sources; + + Program( + const STRING_CLASS& source, + bool build = false, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const ::size_t length = source.size(); + + Context context = Context::getDefault(err); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS && build) { + + error = ::clBuildProgram( + object_, + 0, + NULL, + "", + NULL, + NULL); + + detail::errHandler(error, __BUILD_PROGRAM_ERR); + } + + if (err != NULL) { + *err = error; + } + } + + Program( + const Context& context, + const STRING_CLASS& source, + bool build = false, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const ::size_t length = source.size(); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS && build) { + + error = ::clBuildProgram( + object_, + 0, + NULL, + "", + NULL, + NULL); + + detail::errHandler(error, __BUILD_PROGRAM_ERR); + } + + if (err != NULL) { + *err = error; + } + } + + Program( + const Context& context, + const Sources& sources, + cl_int* err = NULL) + { + cl_int error; + + const ::size_t n = (::size_t)sources.size(); + ::size_t* lengths = (::size_t*) 
alloca(n * sizeof(::size_t)); + const char** strings = (const char**) alloca(n * sizeof(const char*)); + + for (::size_t i = 0; i < n; ++i) { + strings[i] = sources[(int)i].first; + lengths[i] = sources[(int)i].second; + } + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)n, strings, lengths, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + if (err != NULL) { + *err = error; + } + } + + /** + * Construct a program object from a list of devices and a per-device list of binaries. + * \param context A valid OpenCL context in which to construct the program. + * \param devices A vector of OpenCL device objects for which the program will be created. + * \param binaries A vector of pairs of a pointer to a binary object and its length. + * \param binaryStatus An optional vector that on completion will be resized to + * match the size of binaries and filled with values to specify if each binary + * was successfully loaded. + * Set to CL_SUCCESS if the binary was successfully loaded. + * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL. + * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device. + * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors: + * CL_INVALID_CONTEXT if context is not a valid context. + * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; + * or if any entry in binaries is NULL or has length 0. + * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context. + * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device. + * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host. 
+ */ + Program( + const Context& context, + const VECTOR_CLASS& devices, + const Binaries& binaries, + VECTOR_CLASS* binaryStatus = NULL, + cl_int* err = NULL) + { + cl_int error; + + const ::size_t numDevices = devices.size(); + + // Catch size mismatch early and return + if(binaries.size() != numDevices) { + error = CL_INVALID_VALUE; + detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); + if (err != NULL) { + *err = error; + } + return; + } + + ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t)); + const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char**)); + + for (::size_t i = 0; i < numDevices; ++i) { + images[i] = (const unsigned char*)binaries[i].first; + lengths[i] = binaries[(int)i].second; + } + + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + if(binaryStatus) { + binaryStatus->resize(numDevices); + } + + object_ = ::clCreateProgramWithBinary( + context(), (cl_uint) devices.size(), + deviceIDs, + lengths, images, binaryStatus != NULL + ? &binaryStatus->front() + : NULL, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); + if (err != NULL) { + *err = error; + } + } + + +#if defined(CL_VERSION_1_2) + /** + * Create program using builtin kernels. 
+ * \param kernelNames Semi-colon separated list of builtin kernel names + */ + Program( + const Context& context, + const VECTOR_CLASS& devices, + const STRING_CLASS& kernelNames, + cl_int* err = NULL) + { + cl_int error; + + + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateProgramWithBuiltInKernels( + context(), + (cl_uint) devices.size(), + deviceIDs, + kernelNames.c_str(), + &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) + + Program() { } + + Program(const Program& program) : detail::Wrapper(program) { } + + __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper(program) { } + + Program& operator = (const Program& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + Program& operator = (const cl_program& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + cl_int build( + const VECTOR_CLASS& devices, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + return detail::errHandler( + ::clBuildProgram( + object_, + (cl_uint) + devices.size(), + deviceIDs, + options, + notifyFptr, + data), + __BUILD_PROGRAM_ERR); + } + + cl_int build( + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + return detail::errHandler( + ::clBuildProgram( + 
object_, + 0, + NULL, + options, + notifyFptr, + data), + __BUILD_PROGRAM_ERR); + } + +#if defined(CL_VERSION_1_2) + cl_int compile( + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + return detail::errHandler( + ::clCompileProgram( + object_, + 0, + NULL, + options, + 0, + NULL, + NULL, + notifyFptr, + data), + __COMPILE_PROGRAM_ERR); + } +#endif + + template + cl_int getInfo(cl_program_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetProgramInfo, object_, name, param), + __GET_PROGRAM_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_program_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + template + cl_int getBuildInfo( + const Device& device, cl_program_build_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetProgramBuildInfo, object_, device(), name, param), + __GET_PROGRAM_BUILD_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getBuildInfo(const Device& device, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_program_build_info, name>::param_type param; + cl_int result = getBuildInfo(device, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + cl_int createKernels(VECTOR_CLASS* kernels) + { + cl_uint numKernels; + cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); + } + + Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel)); + err = ::clCreateKernelsInProgram( + object_, numKernels, (cl_kernel*) value, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); + } + + 
kernels->assign(&value[0], &value[numKernels]); + return CL_SUCCESS; + } +}; + +#if defined(CL_VERSION_1_2) +inline Program linkProgram( + Program input1, + Program input2, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL, + cl_int* err = NULL) +{ + cl_int err_local = CL_SUCCESS; + + cl_program programs[2] = { input1(), input2() }; + + Context ctx = input1.getInfo(); + + cl_program prog = ::clLinkProgram( + ctx(), + 0, + NULL, + options, + 2, + programs, + notifyFptr, + data, + &err_local); + + detail::errHandler(err_local,__COMPILE_PROGRAM_ERR); + if (err != NULL) { + *err = err_local; + } + + return Program(prog); +} + +inline Program linkProgram( + VECTOR_CLASS inputPrograms, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL, + cl_int* err = NULL) +{ + cl_int err_local = CL_SUCCESS; + + cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program)); + + if (programs != NULL) { + for (unsigned int i = 0; i < inputPrograms.size(); i++) { + programs[i] = inputPrograms[i](); + } + } + + cl_program prog = ::clLinkProgram( + Context::getDefault()(), + 0, + NULL, + options, + (cl_uint)inputPrograms.size(), + programs, + notifyFptr, + data, + &err_local); + + detail::errHandler(err_local,__COMPILE_PROGRAM_ERR); + if (err != NULL) { + *err = err_local; + } + + return Program(prog); +} +#endif + +template<> +inline VECTOR_CLASS cl::Program::getInfo(cl_int* err) const +{ + VECTOR_CLASS< ::size_t> sizes = getInfo(); + VECTOR_CLASS binaries; + for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s) + { + char *ptr = NULL; + if (*s != 0) + ptr = new char[*s]; + binaries.push_back(ptr); + } + + cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries); + if (err != NULL) { + *err = result; + } + return binaries; +} + +inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) +{ + cl_int 
error; + + object_ = ::clCreateKernel(program(), name, &error); + detail::errHandler(error, __CREATE_KERNEL_ERR); + + if (err != NULL) { + *err = error; + } + +} + +/*! \class CommandQueue + * \brief CommandQueue interface for cl_command_queue. + */ +class CommandQueue : public detail::Wrapper +{ +private: + static volatile int default_initialized_; + static CommandQueue default_; + static volatile cl_int default_error_; +public: + CommandQueue( + cl_command_queue_properties properties, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + } + /*! + * \brief Constructs a CommandQueue for an implementation defined device in the given context + */ + explicit CommandQueue( + const Context& context, + cl_command_queue_properties properties = 0, + cl_int* err = NULL) + { + cl_int error; + VECTOR_CLASS devices; + error = context.getInfo(CL_CONTEXT_DEVICES, &devices); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) + { + if (err != NULL) { + *err = error; + } + return; + } + + object_ = ::clCreateCommandQueue(context(), devices[0](), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (err != NULL) { + *err = error; + } + + } + + CommandQueue( + const Context& context, + const Device& device, + cl_command_queue_properties properties = 0, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + + static CommandQueue 
getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... + while(default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + + default_ = CommandQueue(context, device, 0, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... 
+ default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + CommandQueue() { } + + CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper(commandQueue) { } + + CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper(commandQueue) { } + + CommandQueue& operator = (const CommandQueue& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + CommandQueue& operator = (const cl_command_queue& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + template + cl_int getInfo(cl_command_queue_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetCommandQueueInfo, object_, name, param), + __GET_COMMAND_QUEUE_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_command_queue_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_READ_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + const void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + ::size_t src_offset, + ::size_t dst_offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBuffer( + object_, src(), dst(), src_offset, dst_offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQEUE_COPY_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_WRITE_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + ::size_t src_row_pitch, + ::size_t src_slice_pitch, + ::size_t dst_row_pitch, + ::size_t dst_slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferRect( + object_, + src(), + dst(), + (const ::size_t *)src_origin, + (const ::size_t *)dst_origin, + (const ::size_t *)region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQEUE_COPY_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill a buffer object with a pattern + * of a given size. The pattern is specified a as vector. + * \tparam PatternType The datatype of the pattern field. + * The pattern type must be an accepted OpenCL data type. + */ + template + cl_int enqueueFillBuffer( + const Buffer& buffer, + PatternType pattern, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillBuffer( + object_, + buffer(), + static_cast(&pattern), + sizeof(PatternType), + offset, + size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_FILL_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_WRITE_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImage( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *)dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA floating-point color value if + * the image channel data type is not an unnormalized signed or + * unsigned data type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_float4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. 
+ * This is a four component RGBA signed integer color value if + * the image channel data type is an unnormalized signed integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_int4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA unsigned integer color value if + * the image channel data type is an unnormalized unsigned integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_uint4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImageToBuffer( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *) region, dst_offset, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferToImage( + object_, src(), dst(), src_offset, + (const ::size_t *) dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapBuffer( + object_, buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? 
(cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + void* enqueueMapImage( + const Image& buffer, + cl_bool blocking, + cl_map_flags flags, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t * row_pitch, + ::size_t * slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapImage( + object_, buffer(), blocking, flags, + (const ::size_t *) origin, (const ::size_t *) region, + row_pitch, slice_pitch, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + object_, memory(), mapped_ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueues a marker command which waits for either a list of events to complete, + * or all previously enqueued commands to complete. + * + * Enqueues a marker command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command returns an event which can be waited on, + * i.e. 
this event can be waited on to insure that all events either in the event_wait_list + * or all previously enqueued commands, queued before this command to command_queue, + * have completed. + */ + cl_int enqueueMarkerWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueMarkerWithWaitList( + object_, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MARKER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * A synchronization point that enqueues a barrier operation. + * + * Enqueues a barrier command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command blocks command execution, that is, any + * following commands enqueued after it do not execute until it completes. This command + * returns an event which can be waited on, i.e. this event can be waited on to insure that + * all events either in the event_wait_list or all previously enqueued commands, queued + * before this command to command_queue, have completed. + */ + cl_int enqueueBarrierWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueBarrierWithWaitList( + object_, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_BARRIER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command to indicate with which device a set of memory objects + * should be associated. 
+ */ + cl_int enqueueMigrateMemObjects( + const VECTOR_CLASS &memObjects, + cl_mem_migration_flags flags, + const VECTOR_CLASS* events = NULL, + Event* event = NULL + ) + { + cl_event tmp; + + cl_mem* localMemObjects = static_cast(alloca(memObjects.size() * sizeof(cl_mem))); + for( int i = 0; i < (int)memObjects.size(); ++i ) { + localMemObjects[i] = memObjects[i](); + } + + + cl_int err = detail::errHandler( + ::clEnqueueMigrateMemObjects( + object_, + (cl_uint)memObjects.size(), + static_cast(localMemObjects), + flags, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueNDRangeKernel( + const Kernel& kernel, + const NDRange& offset, + const NDRange& global, + const NDRange& local = NullRange, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNDRangeKernel( + object_, kernel(), (cl_uint) global.dimensions(), + offset.dimensions() != 0 ? (const ::size_t*) offset : NULL, + (const ::size_t*) global, + local.dimensions() != 0 ? (const ::size_t*) local : NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NDRANGE_KERNEL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueTask( + const Kernel& kernel, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueTask( + object_, kernel(), + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_TASK_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueNativeKernel( + void (CL_CALLBACK *userFptr)(void *), + std::pair args, + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* mem_locs = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) + ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem)) + : NULL; + + if (mems != NULL) { + for (unsigned int i = 0; i < mem_objects->size(); i++) { + mems[i] = ((*mem_objects)[i])(); + } + } + + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNativeKernel( + object_, userFptr, args.first, args.second, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + mems, + (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_NATIVE_KERNEL); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueMarker(object_, (cl_event*) event), + __ENQUEUE_MARKER_ERR); + } + + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueWaitForEvents(const VECTOR_CLASS& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueWaitForEvents( + object_, + (cl_uint) events.size(), + (const cl_event*) &events.front()), + __ENQUEUE_WAIT_FOR_EVENTS_ERR); + } +#endif // #if defined(CL_VERSION_1_1) + + cl_int enqueueAcquireGLObjects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueAcquireGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseGLObjects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReleaseGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? 
(cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined (USE_DX_INTEROP) +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); + + cl_int enqueueAcquireD3D10Objects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL; +#if defined(CL_VERSION_1_2) + cl_context context = getInfo(); + cl::Device device(getInfo()); + cl_platform_id platform = device.getInfo(); + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR); +#endif + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueAcquireD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseD3D10Objects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL; +#if defined(CL_VERSION_1_2) + cl_context context = getInfo(); + cl::Device device(getInfo()); + cl_platform_id platform = device.getInfo(); + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR); +#endif // #if defined(CL_VERSION_1_2) +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR); +#endif // #if defined(CL_VERSION_1_1) + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueReleaseD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueBarrier(object_), + __ENQUEUE_BARRIER_ERR); + } +#endif // #if defined(CL_VERSION_1_1) + + cl_int flush() const + { + return detail::errHandler(::clFlush(object_), __FLUSH_ERR); + } + + cl_int finish() const + { + return detail::errHandler(::clFinish(object_), __FINISH_ERR); + } +}; + +#ifdef _WIN32 +__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__declspec(selectany) CommandQueue CommandQueue::default_; +__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; +#else +__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__attribute__((weak)) CommandQueue CommandQueue::default_; +__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; +#endif + +template< typename IteratorType > +Buffer::Buffer( + const Context &context, + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr, + cl_int* err) +{ + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if( readOnly ) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if( useHostPtr ) { + flags |= CL_MEM_USE_HOST_PTR; + } + + ::size_t size = sizeof(DataType)*(endIterator - startIterator); + + if( useHostPtr ) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + 
if (err != NULL) { + *err = error; + } + + if( !useHostPtr ) { + CommandQueue queue(context, 0, &error); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + error = cl::copy(queue, startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } +} + +inline cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event); +} + +inline cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + const void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event); +} + +inline void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + void * result = ::clEnqueueMapBuffer( + queue(), buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? 
(cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + return result; +} + +inline cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (error != CL_SUCCESS) { + return error; + } + + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + queue(), memory(), mapped_ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; +} + +inline cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + ::size_t src_offset, + ::size_t dst_offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event); +} + +/** + * Blocking copy operation between iterators and a buffer. + * Host to Device. + * Uses default command queue. + */ +template< typename IteratorType > +inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) + return error; + + return cl::copy(queue, startIterator, endIterator, buffer); +} + +/** + * Blocking copy operation between iterators and a buffer. + * Device to Host. + * Uses default command queue. 
+ */ +template< typename IteratorType > +inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) + return error; + + return cl::copy(queue, buffer, startIterator, endIterator); +} + +/** + * Blocking copy operation between iterators and a buffer. + * Host to Device. + * Uses specified queue. + */ +template< typename IteratorType > +inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) +{ + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + ::size_t length = endIterator-startIterator; + ::size_t byteLength = length*sizeof(DataType); + + DataType *pointer = + static_cast(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error)); + // if exceptions enabled, enqueueMapBuffer will throw + if( error != CL_SUCCESS ) { + return error; + } +#if defined(_MSC_VER) + std::copy( + startIterator, + endIterator, + stdext::checked_array_iterator( + pointer, length)); +#else + std::copy(startIterator, endIterator, pointer); +#endif + Event endEvent; + error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); + // if exceptions enabled, enqueueUnmapMemObject will throw + if( error != CL_SUCCESS ) { + return error; + } + endEvent.wait(); + return CL_SUCCESS; +} + +/** + * Blocking copy operation between iterators and a buffer. + * Device to Host. + * Uses specified queue. 
+ */ +template< typename IteratorType > +inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) +{ + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + ::size_t length = endIterator-startIterator; + ::size_t byteLength = length*sizeof(DataType); + + DataType *pointer = + static_cast(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error)); + // if exceptions enabled, enqueueMapBuffer will throw + if( error != CL_SUCCESS ) { + return error; + } + std::copy(pointer, pointer + length, startIterator); + Event endEvent; + error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); + // if exceptions enabled, enqueueUnmapMemObject will throw + if( error != CL_SUCCESS ) { + return error; + } + endEvent.wait(); + return CL_SUCCESS; +} + +#if defined(CL_VERSION_1_1) +inline cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadBufferRect( + buffer, + blocking, + buffer_offset, + host_offset, + region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + 
CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteBufferRect( + buffer, + blocking, + buffer_offset, + host_offset, + region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + ::size_t src_row_pitch, + ::size_t src_slice_pitch, + ::size_t dst_row_pitch, + ::size_t dst_slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBufferRect( + src, + dst, + src_origin, + dst_origin, + region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + events, + event); +} +#endif + +inline cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadImage( + image, + blocking, + origin, + region, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteImage( + image, + blocking, + origin, + region, + row_pitch, + slice_pitch, + ptr, + events, + 
event); +} + +inline cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImage( + src, + dst, + src_origin, + dst_origin, + region, + events, + event); +} + +inline cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImageToBuffer( + src, + dst, + src_origin, + region, + dst_offset, + events, + event); +} + +inline cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBufferToImage( + src, + dst, + src_offset, + dst_origin, + region, + events, + event); +} + + +inline cl_int flush(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.flush(); +} + +inline cl_int finish(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + + return queue.finish(); +} + +// Kernel Functor support +// New interface as of September 2011 +// Requires the C++11 std::tr1::function (note do not support TR1) +// Visual Studio 2010 and GCC 4.2 + +struct EnqueueArgs +{ + CommandQueue 
queue_; + const NDRange offset_; + const NDRange global_; + const NDRange local_; + VECTOR_CLASS events_; + + EnqueueArgs(NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + + } + + EnqueueArgs(NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(Event e, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(const VECTOR_CLASS &events, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(const VECTOR_CLASS &events, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + 
global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } +}; + +namespace detail { + +class NullType {}; + +template +struct SetArg +{ + static void set (Kernel kernel, T0 arg) + { + kernel.setArg(index, arg); + } +}; + +template +struct SetArg +{ + static void set (Kernel, NullType) + { + } +}; + +template < + typename T0, typename T1, typename T2, typename T3, + typename T4, typename T5, typename T6, typename T7, + typename T8, typename T9, typename T10, typename T11, + typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23, + typename T24, typename T25, 
typename T26, typename T27, + typename T28, typename T29, typename T30, typename T31 +> +class KernelFunctorGlobal +{ +private: + Kernel kernel_; + +public: + KernelFunctorGlobal( + Kernel kernel) : + kernel_(kernel) + {} + + KernelFunctorGlobal( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + kernel_(program, name.c_str(), err) + {} + + Event operator() ( + const EnqueueArgs& args, + T0 t0, + T1 t1 = NullType(), + T2 t2 = NullType(), + T3 t3 = NullType(), + T4 t4 = NullType(), + T5 t5 = NullType(), + T6 t6 = NullType(), + T7 t7 = NullType(), + T8 t8 = NullType(), + T9 t9 = NullType(), + T10 t10 = NullType(), + T11 t11 = NullType(), + T12 t12 = NullType(), + T13 t13 = NullType(), + T14 t14 = NullType(), + T15 t15 = NullType(), + T16 t16 = NullType(), + T17 t17 = NullType(), + T18 t18 = NullType(), + T19 t19 = NullType(), + T20 t20 = NullType(), + T21 t21 = NullType(), + T22 t22 = NullType(), + T23 t23 = NullType(), + T24 t24 = NullType(), + T25 t25 = NullType(), + T26 t26 = NullType(), + T27 t27 = NullType(), + T28 t28 = NullType(), + T29 t29 = NullType(), + T30 t30 = NullType(), + T31 t31 = NullType() + ) + { + Event event; + SetArg<0, T0>::set(kernel_, t0); + SetArg<1, T1>::set(kernel_, t1); + SetArg<2, T2>::set(kernel_, t2); + SetArg<3, T3>::set(kernel_, t3); + SetArg<4, T4>::set(kernel_, t4); + SetArg<5, T5>::set(kernel_, t5); + SetArg<6, T6>::set(kernel_, t6); + SetArg<7, T7>::set(kernel_, t7); + SetArg<8, T8>::set(kernel_, t8); + SetArg<9, T9>::set(kernel_, t9); + SetArg<10, T10>::set(kernel_, t10); + SetArg<11, T11>::set(kernel_, t11); + SetArg<12, T12>::set(kernel_, t12); + SetArg<13, T13>::set(kernel_, t13); + SetArg<14, T14>::set(kernel_, t14); + SetArg<15, T15>::set(kernel_, t15); + SetArg<16, T16>::set(kernel_, t16); + SetArg<17, T17>::set(kernel_, t17); + SetArg<18, T18>::set(kernel_, t18); + SetArg<19, T19>::set(kernel_, t19); + SetArg<20, T20>::set(kernel_, t20); + SetArg<21, T21>::set(kernel_, t21); + SetArg<22, 
T22>::set(kernel_, t22); + SetArg<23, T23>::set(kernel_, t23); + SetArg<24, T24>::set(kernel_, t24); + SetArg<25, T25>::set(kernel_, t25); + SetArg<26, T26>::set(kernel_, t26); + SetArg<27, T27>::set(kernel_, t27); + SetArg<28, T28>::set(kernel_, t28); + SetArg<29, T29>::set(kernel_, t29); + SetArg<30, T30>::set(kernel_, t30); + SetArg<31, T31>::set(kernel_, t31); + + args.queue_.enqueueNDRangeKernel( + kernel_, + args.offset_, + args.global_, + args.local_, + &args.events_, + &event); + + return event; + } + +}; + +//------------------------------------------------------------------------------------------------------ + + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30, + typename T31> +struct functionImplementation_ +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! 
\brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30, + T31 arg31) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30, + arg31); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, 
+ T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + 
typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> FunctorType; + + 
FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27> 
+struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType 
&functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + 
T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && 
defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef 
detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + 
typename T18, + typename T19, + typename T20, + typename T21, + typename T22> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22)) + // Fail variadic expansion for dev11 + static_assert(0, 
"Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, 
+ NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, 
+ NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18> +struct functionImplementation_ +< T0, + T1, + 
T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. 
If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17)) + 
// Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + 
functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + 
NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + 
NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + 
NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + 
NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, 
+ NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + 
NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + 
FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) 
&& defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count 
for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! 
\brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3); + } + + +}; + +template< + typename T0, + typename T1, + typename T2> +struct functionImplementation_ +< T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2); + } + + +}; + +template< + typename T0, + typename T1> +struct functionImplementation_ +< T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1) + { + return functor_( + enqueueArgs, + arg0, + arg1); + } + + +}; + +template< + typename T0> +struct functionImplementation_ +< T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0) + { + return functor_( + enqueueArgs, + arg0); + } + + +}; + + + + + +} // namespace detail + +//---------------------------------------------------------------------------------------------- + +template < + typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType, + typename T3 = detail::NullType, typename T4 = detail::NullType, + typename T5 = detail::NullType, typename T6 = detail::NullType, + typename T7 = detail::NullType, typename T8 = detail::NullType, + typename T9 = detail::NullType, typename T10 = detail::NullType, + typename T11 = detail::NullType, typename T12 = detail::NullType, + typename T13 = detail::NullType, typename T14 = detail::NullType, + typename T15 = detail::NullType, typename T16 = detail::NullType, + typename T17 = detail::NullType, typename T18 = detail::NullType, + typename T19 = detail::NullType, typename T20 = detail::NullType, + typename T21 = detail::NullType, typename T22 = detail::NullType, + typename T23 = detail::NullType, typename T24 = detail::NullType, + typename T25 = detail::NullType, typename T26 = detail::NullType, + typename T27 = detail::NullType, typename T28 = detail::NullType, + typename T29 = detail::NullType, typename T30 = detail::NullType, + typename T31 = detail::NullType +> +struct make_kernel : + public detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + > +{ +public: + typedef detail::KernelFunctorGlobal< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + > FunctorType; + + make_kernel( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + detail::functionImplementation_< + T0, T1, T2, T3, 
+ T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(program, name, err)) + {} + + make_kernel( + const Kernel kernel) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(kernel)) + {} +}; + + +//---------------------------------------------------------------------------------------------------------------------- + +#undef __ERR_STR +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#undef __GET_DEVICE_INFO_ERR +#undef __GET_PLATFORM_INFO_ERR +#undef __GET_DEVICE_IDS_ERR +#undef __GET_CONTEXT_INFO_ERR +#undef __GET_EVENT_INFO_ERR +#undef __GET_EVENT_PROFILE_INFO_ERR +#undef __GET_MEM_OBJECT_INFO_ERR +#undef __GET_IMAGE_INFO_ERR +#undef __GET_SAMPLER_INFO_ERR +#undef __GET_KERNEL_INFO_ERR +#undef __GET_KERNEL_ARG_INFO_ERR +#undef __GET_KERNEL_WORK_GROUP_INFO_ERR +#undef __GET_PROGRAM_INFO_ERR +#undef __GET_PROGRAM_BUILD_INFO_ERR +#undef __GET_COMMAND_QUEUE_INFO_ERR + +#undef __CREATE_CONTEXT_ERR +#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR + +#undef __CREATE_BUFFER_ERR +#undef __CREATE_SUBBUFFER_ERR +#undef __CREATE_IMAGE2D_ERR +#undef __CREATE_IMAGE3D_ERR +#undef __CREATE_SAMPLER_ERR +#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR + +#undef __CREATE_USER_EVENT_ERR +#undef __SET_USER_EVENT_STATUS_ERR +#undef __SET_EVENT_CALLBACK_ERR +#undef __SET_PRINTF_CALLBACK_ERR + +#undef __WAIT_FOR_EVENTS_ERR + +#undef __CREATE_KERNEL_ERR +#undef __SET_KERNEL_ARGS_ERR +#undef __CREATE_PROGRAM_WITH_SOURCE_ERR +#undef __CREATE_PROGRAM_WITH_BINARY_ERR +#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR +#undef __BUILD_PROGRAM_ERR +#undef __CREATE_KERNELS_IN_PROGRAM_ERR + +#undef __CREATE_COMMAND_QUEUE_ERR +#undef __SET_COMMAND_QUEUE_PROPERTY_ERR +#undef 
__ENQUEUE_READ_BUFFER_ERR +#undef __ENQUEUE_WRITE_BUFFER_ERR +#undef __ENQUEUE_READ_BUFFER_RECT_ERR +#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR +#undef __ENQEUE_COPY_BUFFER_ERR +#undef __ENQEUE_COPY_BUFFER_RECT_ERR +#undef __ENQUEUE_READ_IMAGE_ERR +#undef __ENQUEUE_WRITE_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR +#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR +#undef __ENQUEUE_MAP_BUFFER_ERR +#undef __ENQUEUE_MAP_IMAGE_ERR +#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR +#undef __ENQUEUE_NDRANGE_KERNEL_ERR +#undef __ENQUEUE_TASK_ERR +#undef __ENQUEUE_NATIVE_KERNEL + +#undef __CL_EXPLICIT_CONSTRUCTORS + +#undef __UNLOAD_COMPILER_ERR +#endif //__CL_USER_OVERRIDE_ERROR_STRINGS + +#undef __CL_FUNCTION_TYPE + +// Extensions +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_VERSION_1_1) +#undef __INIT_CL_EXT_FCN_PTR +#endif // #if defined(CL_VERSION_1_1) +#undef __CREATE_SUB_DEVICES + +#if defined(USE_CL_DEVICE_FISSION) +#undef __PARAM_NAME_DEVICE_FISSION +#endif // USE_CL_DEVICE_FISSION + +#undef __DEFAULT_NOT_INITIALIZED +#undef __DEFAULT_BEING_INITIALIZED +#undef __DEFAULT_INITIALIZED + +} // namespace cl + +#ifdef _WIN32 +#pragma pop_macro("max") +#endif // _WIN32 + +#endif // CL_HPP_ diff --git a/3rdparty/opencl/CL/cl_d3d10.h b/3rdparty/opencl/CL/cl_d3d10.h new file mode 100644 index 0000000000..91e4a68796 --- /dev/null +++ b/3rdparty/opencl/CL/cl_d3d10.h @@ -0,0 +1,126 @@ +/********************************************************************************** + * Copyright (c) 2008-2013 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D10_H +#define __OPENCL_CL_D3D10_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d10_sharing */ +#define cl_khr_d3d10_sharing 1 + +typedef cl_uint cl_d3d10_device_source_khr; +typedef cl_uint cl_d3d10_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D10_DEVICE_KHR -1002 +#define CL_INVALID_D3D10_RESOURCE_KHR -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 + +/* cl_d3d10_device_source_nv */ +#define CL_D3D10_DEVICE_KHR 0x4010 +#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 + +/* cl_d3d10_device_set_nv */ +#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 + +/* cl_context_info */ +#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 +#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C + +/* cl_mem_info */ +#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 + +/* cl_image_info */ +#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D10Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D10_H */ + diff --git a/3rdparty/opencl/CL/cl_d3d11.h b/3rdparty/opencl/CL/cl_d3d11.h new file mode 100644 index 0000000000..89f772b5fa --- /dev/null +++ b/3rdparty/opencl/CL/cl_d3d11.h @@ -0,0 +1,132 @@ +/********************************************************************************** + * Copyright (c) 2008-2013 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D11_H +#define __OPENCL_CL_D3D11_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d11_sharing */ +#define cl_khr_d3d11_sharing 1 + +typedef cl_uint cl_d3d11_device_source_khr; +typedef cl_uint cl_d3d11_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D11_DEVICE_KHR -1006 +#define CL_INVALID_D3D11_RESOURCE_KHR -1007 +#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 +#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 + +/* cl_d3d11_device_source */ +#define CL_D3D11_DEVICE_KHR 0x4019 +#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A + +/* cl_d3d11_device_set */ +#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B +#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C + +/* cl_context_info */ +#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D +#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D + +/* cl_mem_info */ +#define CL_MEM_D3D11_RESOURCE_KHR 0x401E + +/* cl_image_info */ +#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 +#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 + +// object types + +#define CL_D3D11_OBJECT_BUFFER 0x3000 +#define CL_D3D11_OBJECT_TEXTURE2D 0x3001 +#define CL_D3D11_OBJECT_TEXTURE3D 0x3003 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( + cl_platform_id platform, + cl_d3d11_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) 
CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D11_H */ + diff --git a/3rdparty/opencl/CL/cl_d3d9.h b/3rdparty/opencl/CL/cl_d3d9.h new file mode 100644 index 0000000000..5662af499a --- /dev/null +++ b/3rdparty/opencl/CL/cl_d3d9.h @@ -0,0 +1,331 @@ +/********************************************************************************** + * Copyright (c) 2008-2013 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D9_H +#define __OPENCL_CL_D3D9_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ +/* cl_khr_dx9_media_sharing */ +#define cl_khr_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_media_adapter_type_khr; +typedef cl_uint cl_dx9_media_adapter_set_khr; + +#if defined(_WIN32) +#include +typedef struct _cl_dx9_surface_info_khr +{ + IDirect3DSurface9 *resource; + HANDLE shared_handle; +} cl_dx9_surface_info_khr; +#endif + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 +#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 +#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 +#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 + +/* cl_media_adapter_type_khr */ +#define CL_ADAPTER_D3D9_KHR 0x2020 +#define CL_ADAPTER_D3D9EX_KHR 0x2021 +#define CL_ADAPTER_DXVA_KHR 0x2022 + +/* cl_media_adapter_set_khr */ +#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 +#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 + +/* cl_context_info */ +#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 +#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 +#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 + +/* cl_mem_info */ +#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 +#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 + +/* cl_image_info */ +#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B +#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( + 
cl_platform_id platform, + cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr * media_adapter_type, + void * media_adapters[], + cl_dx9_media_adapter_set_khr media_adapter_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( + cl_context context, + cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, + void * surface_info, + cl_uint plane, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#if defined CL_DX9_MEDIA_SHARING_INTEL_EXT + +#ifndef _WIN32 +#include +#endif +#include +#include +#include +#include + +/******************************************************************************/ +/* cl_intel_dx9_media_sharing extension */ +#define cl_intel_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_device_source_intel; +typedef cl_uint cl_dx9_device_set_intel; + +/******************************************************************************/ + +// Error Codes +#define CL_INVALID_DX9_DEVICE_INTEL -1010 +#define CL_INVALID_DX9_RESOURCE_INTEL -1011 +#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 +#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 + +// cl_dx9_device_source_intel +#define CL_D3D9_DEVICE_INTEL 0x4022 +#define CL_D3D9EX_DEVICE_INTEL 0x4070 +#define CL_DXVA_DEVICE_INTEL 0x4071 + +// cl_dx9_device_set_intel +#define 
CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 +#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 + +// cl_context_info +#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 +#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 +#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 + +// cl_mem_info +#define CL_MEM_DX9_RESOURCE_INTEL 0x4027 +#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 + +// cl_image_info +#define CL_IMAGE_DX9_PLANE_INTEL 0x4075 + +// cl_command_type +#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A +#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B + +//packed YUV channel order +#define CL_YUYV_INTEL 0x4076 +#define CL_UYVY_INTEL 0x4077 +#define CL_YVYU_INTEL 0x4078 +#define CL_VYUY_INTEL 0x4079 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)( + cl_platform_id /*platform*/, + cl_dx9_device_source_intel /*dx9_device_source*/, + void* /*dx9_object*/, + cl_dx9_device_set_intel /*dx9_device_set*/, + cl_uint /*num_entries*/, + cl_device_id* /*devices*/, + cl_uint* /*num_devices*/); + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)( + cl_context /*context*/, + cl_mem_flags /*flags*/, + IDirect3DSurface9 * /*resource*/, + HANDLE /*sharedHandle*/, + UINT /*plane*/, + cl_int * /*errcode_ret*/); + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)( + cl_command_queue /*command_queue*/, + cl_uint /*num_objects*/, + const cl_mem * /*mem_objects*/, + cl_uint /*num_events_in_wait_list*/, + const cl_event * /*event_wait_list*/, + cl_event * /*event*/); + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)( + cl_command_queue /*command_queue*/, + cl_uint /*num_objects*/, + cl_mem * /*mem_objects*/, + cl_uint /*num_events_in_wait_list*/, + const cl_event * /*event_wait_list*/, + cl_event * /*event*/); + +#endif // CL_DX9_MEDIA_SHARING_INTEL_EXT + +#if defined CL_DX9_MEDIA_SHARING_NV_EXT + +#ifndef 
_WIN32 +#include +#endif + +/****************************************************************************** + * cl_nv_d3d9_sharing */ + +typedef cl_uint cl_d3d9_device_source_nv; +typedef cl_uint cl_d3d9_device_set_nv; + +/******************************************************************************/ + +// Error Codes +#define CL_INVALID_D3D9_DEVICE_NV -1010 +#define CL_INVALID_D3D9_RESOURCE_NV -1011 +#define CL_D3D9_RESOURCE_ALREADY_ACQUIRED_NV -1012 +#define CL_D3D9_RESOURCE_NOT_ACQUIRED_NV -1013 + +// cl_d3d9_device_source_nv +#define CL_D3D9_DEVICE_NV 0x4022 +#define CL_D3D9_ADAPTER_NAME_NV 0x4023 + +// cl_d3d9_device_set_nv +#define CL_PREFERRED_DEVICES_FOR_D3D9_NV 0x4024 +#define CL_ALL_DEVICES_FOR_D3D9_NV 0x4025 + +// cl_context_info +#define CL_CONTEXT_D3D9_DEVICE_NV 0x4026 + +// cl_mem_info +#define CL_MEM_D3D9_RESOURCE_NV 0x4027 + +// cl_image_info +#define CL_IMAGE_D3D9_FACE_NV 0x4028 +#define CL_IMAGE_D3D9_LEVEL_NV 0x4029 + +// cl_command_type +#define CL_COMMAND_ACQUIRE_D3D9_OBJECTS_NV 0x402A +#define CL_COMMAND_RELEASE_D3D9_OBJECTS_NV 0x402B + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D9NV_fn)( + cl_platform_id platform, + cl_d3d9_device_source_nv d3d_device_source, + void * d3d_object, + cl_d3d9_device_set_nv d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VertexBufferNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DVertexBuffer9 * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9IndexBufferNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DIndexBuffer9 * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9SurfaceNV_fn)( + cl_context context, + 
cl_mem_flags flags, + IDirect3DSurface9 * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9TextureNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DTexture9 *resource, + UINT miplevel, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9CubeTextureNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DCubeTexture9 * resource, + D3DCUBEMAP_FACES facetype, + UINT miplevel, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VolumeTextureNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DVolumeTexture9 * resource, + UINT miplevel, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D9ObjectsNV_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D9ObjectsNV_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#endif // CL_DX9_MEDIA_SHARING_NV_EXT + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ + diff --git a/3rdparty/opencl/CL/cl_egl.h b/3rdparty/opencl/CL/cl_egl.h new file mode 100644 index 0000000000..93e6c9cfba --- /dev/null +++ b/3rdparty/opencl/CL/cl_egl.h @@ -0,0 +1,133 @@ +/******************************************************************************* + * Copyright (c) 2008-2010 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +#ifndef __OPENCL_CL_EGL_H +#define __OPENCL_CL_EGL_H + +#ifdef __APPLE__ + +#else +#include +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ +#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F +#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D +#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E + +/* Error type for clCreateFromEGLImageKHR */ +#define CL_INVALID_EGL_OBJECT_KHR -1093 +#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 + +/* CLeglImageKHR is an opaque handle to an EGLImage */ +typedef void* CLeglImageKHR; + +/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ +typedef void* CLeglDisplayKHR; + +/* CLeglSyncKHR is an opaque handle to an EGLSync object */ +typedef void* CLeglSyncKHR; + +/* properties passed to clCreateFromEGLImageKHR */ +typedef intptr_t cl_egl_image_properties_khr; + + +#define cl_khr_egl_image 1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromEGLImageKHR(cl_context /* context */, + CLeglDisplayKHR /* egldisplay */, + CLeglImageKHR /* eglimage */, + cl_mem_flags /* flags */, + const cl_egl_image_properties_khr * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)( + cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( + cl_command_queue 
command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +#define cl_khr_egl_event 1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromEGLSyncKHR(cl_context /* context */, + CLeglSyncKHR /* sync */, + CLeglDisplayKHR /* display */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( + cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_EGL_H */ diff --git a/3rdparty/opencl/CL/cl_ext.h b/3rdparty/opencl/CL/cl_ext.h new file mode 100644 index 0000000000..2c1aae35de --- /dev/null +++ b/3rdparty/opencl/CL/cl_ext.h @@ -0,0 +1,458 @@ +/******************************************************************************* + * Copyright (c) 2008-2013 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */ + +/* cl_ext.h contains OpenCL extensions which don't have external */ +/* (OpenGL, D3D) dependencies. */ + +#ifndef __CL_EXT_H +#define __CL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include + #include +#else + #include +#endif + +/* cl_khr_fp16 extension - no extension #define since it has no functions */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* Memory object destruction + * + * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR + * + * Registers a user callback function that will be called when the memory object is deleted and its resources + * freed. 
Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback + * stack associated with memobj. The registered user callback functions are called in the reverse order in + * which they were registered. The user callback functions are called and then the memory object is deleted + * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be + * notified when the memory referenced by host_ptr, specified when the memory object is created and used as + * the storage bits for the memory object, can be reused or freed. + * + * The application may not call CL api's with the cl_mem object passed to the pfn_notify. + * + * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + */ +#define cl_APPLE_SetMemObjectDestructor 1 +cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */, + void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/* Context Logging Functions + * + * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). + * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. 
+ * + * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger + */ +#define cl_APPLE_ContextLoggingFunctions 1 +extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ +extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ +extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/************************ +* cl_khr_icd extension * +************************/ +#define cl_khr_icd 1 + +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 + +/* Additional Error Codes */ +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)( + cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + + +/* Extension: cl_khr_image2D_buffer + * + * This extension allows a 2D image to be created from a cl_mem buffer without a copy. + * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t. + * Both the sampler and sampler-less read_image built-in functions are supported for 2D images + * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported + * for 2D images created from a buffer. 
+ * + * When the 2D image from buffer is created, the client must specify the width, + * height, image format (i.e. channel order and channel data type) and optionally the row pitch + * + * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels. + * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels. + */ + +/************************************* + * cl_khr_initalize_memory extension * + *************************************/ + +#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 + + +/************************************** + * cl_khr_terminate_context extension * + **************************************/ + +#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 +#define CL_CONTEXT_TERMINATE_KHR 0x2032 + +#define cl_khr_terminate_context 1 +extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + + +/* + * Extension: cl_khr_spir + * + * This extension adds support to create an OpenCL program object from a + * Standard Portable Intermediate Representation (SPIR) instance + */ + +#define CL_DEVICE_SPIR_VERSIONS 0x40E0 +#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 + + +/****************************************** +* cl_nv_device_attribute_query extension * +******************************************/ +/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 + +/********************************* +* cl_amd_device_attribute_query * 
+*********************************/ +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 + +/********************************* +* cl_arm_printf extension +*********************************/ +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + +/********************************* +* cl_intel_accelerator extension * +*********************************/ +#define cl_intel_accelerator 1 +#define cl_intel_motion_estimation 1 + +typedef struct _cl_accelerator_intel* cl_accelerator_intel; +typedef cl_uint cl_accelerator_type_intel; +typedef cl_uint cl_accelerator_info_intel; + +typedef struct _cl_motion_estimation_desc_intel { + cl_uint mb_block_type; + cl_uint subpixel_mode; + cl_uint sad_adjust_mode; + cl_uint search_path_type; +} cl_motion_estimation_desc_intel; + +/* Error Codes */ +#define CL_INVALID_ACCELERATOR_INTEL -1094 +#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095 +#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096 +#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097 + +/* Deprecated Error Codes */ +#define CL_INVALID_ACCELERATOR_INTEL_DEPRECATED -6000 +#define CL_INVALID_ACCELERATOR_TYPE_INTEL_DEPRECATED -6001 +#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL_DEPRECATED -6002 +#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL_DEPRECATED -6003 + +/* cl_accelerator_type_intel */ +#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0 + +/* cl_accelerator_info_intel */ +#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090 +#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091 +#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092 +#define CL_ACCELERATOR_TYPE_INTEL 0x4093 + +/*cl_motion_detect_desc_intel flags */ +#define CL_ME_MB_TYPE_16x16_INTEL 0x0 +#define CL_ME_MB_TYPE_8x8_INTEL 0x1 +#define CL_ME_MB_TYPE_4x4_INTEL 0x2 + +#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2 + +#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1 
+ +#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0 +#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1 +#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5 + +extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL +clCreateAcceleratorINTEL( + cl_context /* context */, + cl_accelerator_type_intel /* accelerator_type */, + size_t /* descriptor_size */, + const void* /* descriptor */, + cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_accelerator_intel + (CL_API_CALL *clCreateAcceleratorINTEL_fn)( + cl_context /* context */, + cl_accelerator_type_intel /* accelerator_type */, + size_t /* descriptor_size */, + const void* /* descriptor */, + cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetAcceleratorInfoINTEL +( + cl_accelerator_intel /* accelerator */, + cl_accelerator_info_intel /* param_name */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int + (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)( + cl_accelerator_intel /* accelerator */, + cl_accelerator_info_intel /* param_name */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainAcceleratorINTEL( + cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int + (CL_API_CALL *clRetainAcceleratorINTEL_fn)( + cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseAcceleratorINTEL( + cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int + (CL_API_CALL *clReleaseAcceleratorINTEL_fn)( + cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef CL_VERSION_1_1 + /*********************************** + * cl_ext_device_fission extension * + 
***********************************/ + #define cl_ext_device_fission 1 + + extern CL_API_ENTRY cl_int CL_API_CALL + clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + extern CL_API_ENTRY cl_int CL_API_CALL + clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef cl_ulong cl_device_partition_property_ext; + extern CL_API_ENTRY cl_int CL_API_CALL + clCreateSubDevicesEXT( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + typedef CL_API_ENTRY cl_int + ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + /* cl_device_partition_property_ext */ + #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 + #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 + #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 + #define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 + #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + + /* clDeviceGetInfo selectors */ + #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 + #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 + #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 + #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 + #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + + /* error codes */ + #define CL_DEVICE_PARTITION_FAILED_EXT -1057 + #define CL_INVALID_PARTITION_COUNT_EXT -1058 + #define CL_INVALID_PARTITION_NAME_EXT -1059 + + /* CL_AFFINITY_DOMAINs */ + #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 + #define 
CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 + #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 + #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 + #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 + #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + + /* cl_device_partition_property_ext list terminators */ + #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) + #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) + #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) + #define CL_PARTITION_BY_NAMES_LIST_END_INTEL ((cl_device_partition_property_ext) 0 - 1) + + #define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (1 << 31) + + +/********************************* +* cl_qcom_ext_host_ptr extension +*********************************/ + +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) + +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 + +typedef cl_uint cl_image_pitch_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM(cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format *image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +typedef struct _cl_mem_ext_host_ptr +{ + /* Type of external memory allocation. */ + /* Legal values will be defined in layered extensions. */ + cl_uint allocation_type; + + /* Host cache policy for this external memory allocation. 
*/ + cl_uint host_cache_policy; + +} cl_mem_ext_host_ptr; + +/********************************* +* cl_qcom_ion_host_ptr extension +*********************************/ + +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 + +typedef struct _cl_mem_ion_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* ION file descriptor */ + int ion_filedesc; + + /* Host pointer to the ION allocated memory */ + void* ion_hostptr; + +} cl_mem_ion_host_ptr; + +#endif /* CL_VERSION_1_1 */ + + +#ifdef CL_VERSION_2_0 +/********************************* +* cl_khr_sub_groups extension +*********************************/ +#define cl_khr_sub_groups 1 + +typedef cl_uint cl_kernel_sub_group_info; + +/* cl_khr_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */, + cl_device_id /*in_device*/, + const cl_kernel_sub_group_info /* param_name */, + size_t /*input_value_size*/, + const void * /*input_value*/, + size_t /*param_value_size*/, + void* /*param_value*/, + size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0; + +typedef CL_API_ENTRY cl_int + ( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */, + cl_device_id /*in_device*/, + const cl_kernel_sub_group_info /* param_name */, + size_t /*input_value_size*/, + const void * /*input_value*/, + size_t /*param_value_size*/, + void* /*param_value*/, + size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0; +#endif /* CL_VERSION_2_0 */ + +#ifdef __cplusplus +} +#endif + + +#endif /* __CL_EXT_H */ diff --git a/3rdparty/opencl/CL/cl_gl.h b/3rdparty/opencl/CL/cl_gl.h new file mode 100644 index 0000000000..04080937a2 --- /dev/null +++ b/3rdparty/opencl/CL/cl_gl.h @@ -0,0 +1,162 @@ 
+/********************************************************************************** + * Copyright (c) 2008 - 2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 +#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E +#define CL_GL_OBJECT_TEXTURE1D 0x200F +#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 +#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 +#define CL_GL_NUM_SAMPLES 0x2012 + + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* bufobj */, + int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* renderbuffer */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem /* memobj */, + cl_gl_object_type * /* gl_object_type */, + cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem /* memobj */, + cl_gl_texture_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* 
param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLContextInfoKHR(const cl_context_properties * /* properties */, + cl_gl_context_info /* 
param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ diff --git a/3rdparty/opencl/CL/cl_gl_ext.h b/3rdparty/opencl/CL/cl_gl_ext.h new file mode 100644 index 0000000000..a46e0a2e0b --- /dev/null +++ b/3rdparty/opencl/CL/cl_gl_ext.h @@ -0,0 +1,69 @@ +/********************************************************************************** + * Copyright (c) 2008-2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ +/* OpenGL dependencies. */ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include +#else + #include +#endif + +/* + * For each extension, follow this template + * cl_VEN_extname extension */ +/* #define cl_VEN_extname 1 + * ... define new types, if any + * ... define new tokens, if any + * ... define new APIs, if any + * + * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header + * This allows us to avoid having to decide whether to include GL headers or GLES here. + */ + +/* + * cl_khr_gl_event extension + * See section 9.9 in the OpenCL 1.1 spec for more information + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context /* context */, + cl_GLsync /* cl_GLsync */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/3rdparty/opencl/CL/cl_platform.h b/3rdparty/opencl/CL/cl_platform.h new file mode 100644 index 0000000000..e86e5786fc --- /dev/null +++ b/3rdparty/opencl/CL/cl_platform.h @@ -0,0 +1,1299 @@ +/********************************************************************************** + * Copyright (c) 2008-2013 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! 
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + #define CL_API_SUFFIX__VERSION_2_0 + #define CL_EXT_SUFFIX__VERSION_2_0 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #endif + #elif _WIN32 + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define 
CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 
+#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7 + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 1.7976931348623157e+308 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef 
int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float __attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 0x1.fffffep127f +#define CL_FLT_MIN 0x1.0p-126f +#define CL_FLT_EPSILON 0x1.0p-23f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 0x1.fffffffffffffp1023 +#define CL_DBL_MIN 0x1.0p-1022 +#define CL_DBL_EPSILON 0x1.0p-52 + +#define CL_M_E 2.718281828459045090796 +#define CL_M_LOG2E 1.442695040888963387005 +#define CL_M_LOG10E 0.434294481903251816668 +#define CL_M_LN2 0.693147180559945286227 +#define CL_M_LN10 2.302585092994045901094 +#define CL_M_PI 3.141592653589793115998 +#define CL_M_PI_2 1.570796326794896557999 +#define CL_M_PI_4 0.785398163397448278999 +#define CL_M_1_PI 0.318309886183790691216 +#define CL_M_2_PI 
0.636619772367581382433 +#define CL_M_2_SQRTPI 1.128379167095512558561 +#define CL_M_SQRT2 1.414213562373095145475 +#define CL_M_SQRT1_2 0.707106781186547572737 + +#define CL_M_E_F 2.71828174591064f +#define CL_M_LOG2E_F 1.44269502162933f +#define CL_M_LOG10E_F 0.43429449200630f +#define CL_M_LN2_F 0.69314718246460f +#define CL_M_LN10_F 2.30258512496948f +#define CL_M_PI_F 3.14159274101257f +#define CL_M_PI_2_F 1.57079637050629f +#define CL_M_PI_4_F 0.78539818525314f +#define CL_M_1_PI_F 0.31830987334251f +#define CL_M_2_PI_F 0.63661974668503f +#define CL_M_2_SQRTPI_F 1.12837922573090f +#define CL_M_SQRT2_F 1.41421353816986f +#define CL_M_SQRT1_2_F 0.70710676908493f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. 
+ */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #include /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ + typedef vector unsigned char __cl_uchar16; + typedef vector signed char __cl_char16; + typedef vector unsigned short __cl_ushort8; + typedef vector signed short __cl_short8; + typedef vector unsigned int __cl_uint4; + typedef vector signed int __cl_int4; + typedef vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define 
__CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif + #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define capabilities for anonymous struct members. */ +#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined( _WIN32) && (_MSC_VER >= 1500) + /* Microsoft Developer Studio 2008 supports anonymous structs, but + * complains by default. */ +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless + * struct/union */ +#pragma warning( push ) +#pragma warning( disable : 4201 ) +#else +#define __CL_HAS_ANON_STRUCT__ 0 +#define __CL_ANON_STRUCT__ +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if __CL_HAS_ANON_STRUCT__ + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif +}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + 
__cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. 
See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. 
*/ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. 
*/ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. 
*/ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. 
*/ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. 
*/ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. 
*/ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. 
+ */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#undef __CL_HAS_ANON_STRUCT__ +#undef __CL_ANON_STRUCT__ +#if defined( _WIN32) && (_MSC_VER >= 1500) +#pragma warning( pop ) +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/3rdparty/opencl/CL/opencl.h b/3rdparty/opencl/CL/opencl.h new file mode 100644 index 0000000000..0c2e639cd1 --- /dev/null +++ b/3rdparty/opencl/CL/opencl.h @@ -0,0 +1,54 @@ +/******************************************************************************* + * Copyright (c) 2008-2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + +#include +#include +#include +#include + +#else + +#include +#include +#include +#include + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ + diff --git a/3rdparty/opencl/opencl.def b/3rdparty/opencl/opencl.def new file mode 100644 index 0000000000..cde815ac89 --- /dev/null +++ b/3rdparty/opencl/opencl.def @@ -0,0 +1,113 @@ +LIBRARY OpenCL.dll +EXPORTS +clBuildProgram@24 @1 +clCompileProgram@36 @2 +clCreateBuffer@24 @3 +clCreateCommandQueue@20 @4 +clCreateCommandQueueWithProperties@16 @5 +clCreateContext@24 @6 +clCreateContextFromType@24 @7 +clCreateFromGLBuffer@20 @8 +clCreateFromGLRenderbuffer@20 @9 +clCreateFromGLTexture@28 @10 +clCreateFromGLTexture2D@28 @11 +clCreateFromGLTexture3D@28 @12 +clCreateImage@28 @13 +clCreateImage2D@36 @14 +clCreateImage3D@44 @15 +clCreateKernel@12 @16 +clCreateKernelsInProgram@16 @17 +clCreatePipe@28 @18 +clCreateProgramWithBinary@28 @19 +clCreateProgramWithBuiltInKernels@20 @20 +clCreateProgramWithSource@20 @21 +clCreateSampler@20 @22 +clCreateSamplerWithProperties@12 @23 +clCreateSubBuffer@24 @24 +clCreateSubDevices@20 @25 +clCreateUserEvent@8 @26 +clEnqueueAcquireGLObjects@24 @27 +clEnqueueBarrier@4 @28 +clEnqueueBarrierWithWaitList@16 @29 +clEnqueueCopyBuffer@36 @30 +clEnqueueCopyBufferRect@52 @31 +clEnqueueCopyBufferToImage@36 @32 +clEnqueueCopyImage@36 @33 +clEnqueueCopyImageToBuffer@36 @34 +clEnqueueFillBuffer@36 @35 +clEnqueueFillImage@32 @36 +clEnqueueMapBuffer@44 @37 +clEnqueueMapImage@52 @38 +clEnqueueMarker@8 @39 +clEnqueueMarkerWithWaitList@16 @40 +clEnqueueMigrateMemObjects@32 @41 +clEnqueueNDRangeKernel@36 @42 +clEnqueueNativeKernel@40 @43 +clEnqueueReadBuffer@36 @44 +clEnqueueReadBufferRect@56 @45 +clEnqueueReadImage@44 @46 
+clEnqueueReleaseGLObjects@24 @47 +clEnqueueSVMFree@32 @48 +clEnqueueSVMMap@36 @49 +clEnqueueSVMMemFill@32 @50 +clEnqueueSVMMemcpy@32 @51 +clEnqueueSVMUnmap@20 @52 +clEnqueueTask@20 @53 +clEnqueueUnmapMemObject@24 @54 +clEnqueueWaitForEvents@12 @55 +clEnqueueWriteBuffer@36 @56 +clEnqueueWriteBufferRect@56 @57 +clEnqueueWriteImage@44 @58 +clFinish@4 @59 +clFlush@4 @60 +clGetCommandQueueInfo@20 @61 +clGetContextInfo@20 @62 +clGetDeviceIDs@24 @63 +clGetDeviceInfo@20 @64 +clGetEventInfo@20 @65 +clGetEventProfilingInfo@20 @66 +clGetExtensionFunctionAddress@4 @67 +clGetExtensionFunctionAddressForPlatform@8 @68 +clGetGLObjectInfo@12 @69 +clGetGLTextureInfo@20 @70 +clGetImageInfo@20 @71 +clGetKernelArgInfo@24 @72 +clGetKernelInfo@20 @73 +clGetKernelWorkGroupInfo@24 @74 +clGetMemObjectInfo@20 @75 +clGetPipeInfo@20 @76 +clGetPlatformIDs@12 @77 +clGetPlatformInfo@20 @78 +clGetProgramBuildInfo@24 @79 +clGetProgramInfo@20 @80 +clGetSamplerInfo@20 @81 +clGetSupportedImageFormats@28 @82 +clLinkProgram@36 @83 +clReleaseCommandQueue@4 @84 +clReleaseContext@4 @85 +clReleaseDevice@4 @86 +clReleaseEvent@4 @87 +clReleaseKernel@4 @88 +clReleaseMemObject@4 @89 +clReleaseProgram@4 @90 +clReleaseSampler@4 @91 +clRetainCommandQueue@4 @92 +clRetainContext@4 @93 +clRetainDevice@4 @94 +clRetainEvent@4 @95 +clRetainKernel@4 @96 +clRetainMemObject@4 @97 +clRetainProgram@4 @98 +clRetainSampler@4 @99 +clSVMAlloc@20 @100 +clSVMFree@8 @101 +clSetCommandQueueProperty@20 @102 +clSetEventCallback@16 @103 +clSetKernelArg@16 @104 +clSetKernelArgSVMPointer@12 @105 +clSetKernelExecInfo@16 @106 +clSetMemObjectDestructorCallback@12 @107 +clSetUserEventStatus@8 @108 +clUnloadCompiler@0 @109 +clUnloadPlatformCompiler@4 @110 +clWaitForEvents@8 @111 diff --git a/3rdparty/opencl/opencl.vcxproj b/3rdparty/opencl/opencl.vcxproj new file mode 100644 index 0000000000..60a8285756 --- /dev/null +++ b/3rdparty/opencl/opencl.vcxproj @@ -0,0 +1,167 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + 
Release + x64 + + + + {D80D4A75-C385-41BD-AE62-83D2E2B595A7} + Win32Proj + opencl + + + + Utility + true + v120 + Unicode + + + Utility + true + v120 + Unicode + + + Utility + false + v120 + true + Unicode + + + Utility + false + v120 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + + + Windows + true + + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + + + Windows + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + + + Windows + true + true + true + + + opencl.def + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + + + Windows + true + true + true + + + + + lib /machine:$(PlatformTarget) "/def:%(FullPath)" "/out:$(SolutionDir)$(OutDir)\opencl.lib" + lib /machine:$(PlatformTarget) "/def:%(FullPath)" "/out:$(SolutionDir)$(OutDir)\opencl.lib" + lib /machine:$(PlatformTarget) "/def:%(FullPath)" "/out:$(SolutionDir)$(OutDir)\opencl.lib" + lib /machine:$(PlatformTarget) "/def:%(FullPath)" "/out:$(SolutionDir)$(OutDir)\opencl.lib" + $(SolutionDir)$(OutDir)\opencl.lib + $(SolutionDir)$(OutDir)\opencl.lib + $(SolutionDir)$(OutDir)\opencl.lib + $(SolutionDir)$(OutDir)\opencl.lib + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/3rdparty/opencl/opencl.vcxproj.filters b/3rdparty/opencl/opencl.vcxproj.filters new file mode 100644 index 0000000000..5c060a12cb --- /dev/null +++ b/3rdparty/opencl/opencl.vcxproj.filters @@ -0,0 +1,57 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + 
Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/pcsx2_suite_2013.sln b/pcsx2_suite_2013.sln index bdefccadac..56e81de4f5 100644 --- a/pcsx2_suite_2013.sln +++ b/pcsx2_suite_2013.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 2013 -VisualStudioVersion = 12.0.30723.0 +VisualStudioVersion = 12.0.30825.0 MinimumVisualStudioVersion = 10.0.40219.1 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Nulls", "Nulls", "{E1828E40-2FBB-48FE-AE7F-5587755DCE0E}" EndProject @@ -40,6 +40,9 @@ EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SPU2-X", "plugins\spu2-x\src\Windows\Spu2-X_vs2013.vcxproj", "{5307BBB7-EBB9-4AA4-8CB6-A94EC473C8C4}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GSdx", "plugins\GSdx\GSdx_vs2013.vcxproj", "{18E42F6F-3A62-41EE-B42F-79366C4F1E95}" + ProjectSection(ProjectDependencies) = postProject + {D80D4A75-C385-41BD-AE62-83D2E2B595A7} = {D80D4A75-C385-41BD-AE62-83D2E2B595A7} + EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SoundTouch", "3rdparty\soundtouch\SoundTouch_vs2013.vcxproj", "{E9B51944-7E6D-4BCD-83F2-7BBD5A46182D}" EndProject @@ -91,6 +94,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DEV9ghzdrk", "plugins\dev9g EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Experimental", "Experimental", "{7A407562-D70F-4F0A-9D3E-B32506416003}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "opencl", "3rdparty\opencl\opencl.vcxproj", "{D80D4A75-C385-41BD-AE62-83D2E2B595A7}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug AVX|Win32 = Debug AVX|Win32 @@ -1154,6 +1159,58 @@ Global {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release|Win32.ActiveCfg = Release|Win32 {BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release|Win32.Build.0 = Release|Win32 
{BBE4E5FB-530A-4D18-A633-35AF0577B7F3}.Release|x64.ActiveCfg = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug AVX|Win32.ActiveCfg = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug AVX|Win32.Build.0 = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug AVX|x64.ActiveCfg = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug AVX|x64.Build.0 = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug AVX2|Win32.ActiveCfg = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug AVX2|Win32.Build.0 = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug AVX2|x64.ActiveCfg = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug AVX2|x64.Build.0 = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSE2|Win32.ActiveCfg = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSE2|Win32.Build.0 = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSE2|x64.ActiveCfg = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSE2|x64.Build.0 = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSE4|Win32.ActiveCfg = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSE4|Win32.Build.0 = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSE4|x64.ActiveCfg = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSE4|x64.Build.0 = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSSE3|Win32.ActiveCfg = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSSE3|Win32.Build.0 = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSSE3|x64.ActiveCfg = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug SSSE3|x64.Build.0 = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug|Win32.ActiveCfg = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug|Win32.Build.0 = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug|x64.ActiveCfg = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Debug|x64.Build.0 = Debug|x64 + 
{D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Devel|Win32.ActiveCfg = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Devel|Win32.Build.0 = Debug|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Devel|x64.ActiveCfg = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Devel|x64.Build.0 = Debug|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release AVX|Win32.ActiveCfg = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release AVX|Win32.Build.0 = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release AVX|x64.ActiveCfg = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release AVX|x64.Build.0 = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release AVX2|Win32.ActiveCfg = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release AVX2|Win32.Build.0 = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release AVX2|x64.ActiveCfg = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release AVX2|x64.Build.0 = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSE2|Win32.ActiveCfg = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSE2|Win32.Build.0 = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSE2|x64.ActiveCfg = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSE2|x64.Build.0 = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSE4|Win32.ActiveCfg = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSE4|Win32.Build.0 = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSE4|x64.ActiveCfg = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSE4|x64.Build.0 = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSSE3|Win32.ActiveCfg = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSSE3|Win32.Build.0 = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSSE3|x64.ActiveCfg = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release SSSE3|x64.Build.0 = Release|x64 + 
{D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release|Win32.ActiveCfg = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release|Win32.Build.0 = Release|Win32 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release|x64.ActiveCfg = Release|x64 + {D80D4A75-C385-41BD-AE62-83D2E2B595A7}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -1189,5 +1246,6 @@ Global {E613DA9F-41B4-4613-9911-E418EF5533BC} = {7A407562-D70F-4F0A-9D3E-B32506416003} {BBE4E5FB-530A-4D18-A633-35AF0577B7F3} = {7A407562-D70F-4F0A-9D3E-B32506416003} {7A407562-D70F-4F0A-9D3E-B32506416003} = {703FD00B-D7A0-41E3-BD03-CEC86B385DAF} + {D80D4A75-C385-41BD-AE62-83D2E2B595A7} = {78EBE642-7A4D-4EA7-86BE-5639C6646C38} EndGlobalSection EndGlobal diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index d380649222..9954b51479 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -123,13 +123,14 @@ void GSRendererCL::Reset() static int pageuploads = 0; static int pageuploadcount = 0; static int tfxcount = 0; +static int64 tfxpixels = 0; void GSRendererCL::VSync(int field) { GSRenderer::VSync(field); - //printf("vsync %d/%d/%d\n", pageuploads, pageuploadcount, tfxcount); - pageuploads = pageuploadcount = tfxcount = 0; + //printf("vsync %d/%d/%d/%d\n", pageuploads, pageuploadcount, tfxcount, tfxpixels); + pageuploads = pageuploadcount = tfxcount = tfxpixels = 0; //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); } @@ -300,7 +301,7 @@ void GSRendererCL::Draw() m_cl.vb.size = 0; m_cl.ib.size = 0; - size_t size = std::max(vb_size * 2, 2u << 20); + size_t size = std::max(vb_size * 2, (size_t)2 << 20); printf("growing vertex/index buffer %d\n", size); @@ -308,7 +309,7 @@ void GSRendererCL::Draw() m_cl.vb.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); m_cl.vb.size = size; - size = std::max(size / sizeof(GSVertex) * 3 * sizeof(uint32), 1u << 20); // worst 
case, three times the vertex count + size = std::max(size / sizeof(GSVertex) * 3 * sizeof(uint32), (size_t)1 << 20); // worst case, three times the vertex count ASSERT(size >= ib_size); @@ -431,15 +432,12 @@ void GSRendererCL::Draw() if(pb->sel.fwrite) { - for(int i = 0; i < 4; i++) - { - m_rw_pages[1][i] |= m_tmp_pages[i]; - } - GSVector4i* dst_pages = job->GetDstPages(); for(int i = 0; i < 4; i++) { + m_rw_pages[1][i] |= m_tmp_pages[i]; + dst_pages[i] |= m_tmp_pages[i]; } } @@ -459,15 +457,12 @@ void GSRendererCL::Draw() if(pb->sel.zwrite) { - for(int i = 0; i < 4; i++) - { - m_rw_pages[1][i] |= m_tmp_pages[i]; - } - GSVector4i* dst_pages = job->GetDstPages(); for(int i = 0; i < 4; i++) { + m_rw_pages[1][i] |= m_tmp_pages[i]; + dst_pages[i] |= m_tmp_pages[i]; } } @@ -810,27 +805,10 @@ void GSRendererCL::Enqueue() tfxcount++; - //if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); } - UpdateTextureCache((*i).get()); uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start); - /* - if(m_perfmon.GetFrame() >= 5036) if((*i)->src_pages != NULL) - { - m_cl.queue[2].finish(); - - uint64 frame = m_perfmon.GetFrame(); - - std::string s; - - s = format("c:\\temp1\\_%05d_f%lld_tex2_%05x_%d.bmp", s_n++, frame, (*i)->tbp, (*i)->tpsm); - - m_mem.SaveBMP(s, (*i)->tbp, (*i)->tbw, (*i)->tpsm, 1 << (*i)->tw, 1 << (*i)->th); - } - */ - // TODO: tile level z test cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel); @@ -851,29 +829,11 @@ void GSRendererCL::Enqueue() GSVector4i r = GSVector4i::load(&(*i)->rect); - r = r.ralign(GSVector2i(BIN_SIZE, BIN_SIZE)); + r = r.ralign(GSVector2i(8, 8)); - /* - if(i->sel.IsSolidRect()) // TODO: simple mem fill with optional mask - ;//printf("%d %d %d %d\n", r.left, r.top, r.width(), r.height()); - else - */ - m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(16, 16)); + m_cl.queue[2].enqueueNDRangeKernel(tfx, 
cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8)); - /* - if(m_perfmon.GetFrame() >= 5036) - { - m_cl.queue[2].finish(); - - uint64 frame = m_perfmon.GetFrame(); - - std::string s; - - s = format("c:\\temp1\\_%05d_f%lld_rt2_%05x_%d.bmp", s_n++, frame, (*i)->fbp, (*i)->fpsm); - - m_mem.SaveBMP(s, (*i)->fbp, (*i)->fbw, (*i)->fpsm, GetFrameRect().width(), 512); - } - */ + tfxpixels += r.width() * r.height(); InvalidateTextureCache((*i).get()); @@ -1583,7 +1543,7 @@ GSRendererCL::CL::CL() { devices.push_back(device); - WIs = std::min(WIs, device.getInfo()); + WIs = std::min(WIs, (uint32)device.getInfo()); printf(" *"); } diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h index f6b3231a06..3efe33f29b 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -22,7 +22,6 @@ #pragma once #include "GSRenderer.h" -//#include "GSTextureCacheCL.h" __aligned(struct, 32) GSVertexCL { @@ -236,7 +235,7 @@ class GSRendererCL : public GSRenderer cl::Buffer env; cl::CommandQueue* wq; int wqidx; - size_t WIs; + uint32 WIs; public: CL(); @@ -259,61 +258,6 @@ class GSRendererCL : public GSRenderer void UpdateTextureCache(TFXJob* job); void InvalidateTextureCache(TFXJob* job); - /* - class RasterizerData : public GSAlignedClass<32> - { - __aligned(struct, 16) TextureLevel - { - GSVector4i r; - // TODO: GSTextureCacheCL::Texture* t; - }; - - public: - GSRendererCL* m_parent; - const uint32* m_fb_pages; - const uint32* m_zb_pages; - - //cl::Buffer m_vbuff; - //cl::Buffer m_ibuff; - - // TODO: buffers - TextureLevel m_tex[7 + 1]; // NULL terminated - //cl::Buffer m_clut; - //cl::Buffer m_dimx; - - // TODO: struct in a cl::Buffer - TFXSelector m_sel; - GSVector4i m_scissor; - GSVector4i m_bbox; - uint32 m_fm, m_zm; - int m_aref, m_afix; - uint32 m_fog; // rgb - int m_lod; // lcm == 1 - int m_mxl; - float m_l; // TEX1.L * -0x10000 - float m_k; // TEX1.K * 0x10000 - // TODO: struct { GSVector4i min, max, minmax, mask, 
invmask; } t; // [u] x 4 [v] x 4 - - RasterizerData(GSRendererCL* parent) - : m_parent(parent) - , m_fb_pages(NULL) - , m_zb_pages(NULL) - { - m_sel.key = 0; - } - - virtual ~RasterizerData() - { - // TODO: ReleasePages(); - } - - // TODO: void UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm); - // TODO: void ReleasePages(); - - // TODO: void SetSource(GSTextureCacheCL::Texture* t, const GSVector4i& r, int level); - // TODO: void UpdateSource(); - }; - */ protected: GSTexture* m_texture[2]; uint8* m_output; diff --git a/plugins/GSdx/GSdx_vs2013.vcxproj b/plugins/GSdx/GSdx_vs2013.vcxproj index ef165a5839..3b30a5bbb5 100644 --- a/plugins/GSdx/GSdx_vs2013.vcxproj +++ b/plugins/GSdx/GSdx_vs2013.vcxproj @@ -374,6 +374,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories) @@ -392,6 +393,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories) @@ -410,6 +412,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories) @@ -428,6 +431,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories) @@ -446,6 +450,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories) @@ -455,6 +460,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories) @@ -464,6 +470,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories) @@ -509,6 +516,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories) @@ -518,6 +526,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories) @@ -527,6 +536,7 @@ .\GSdx.def MachineX86 + $(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories) diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index 4eac8374de..bdf75b11a1 100644 --- 
a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -1,7 +1,7 @@ #if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency #ifndef CL_FLT_EPSILON -#define CL_FLT_EPSILON 1.1920928955078125e-7 +#define CL_FLT_EPSILON 1.1920928955078125e-7f #endif #if MAX_PRIM_PER_BATCH == 64u @@ -1301,8 +1301,6 @@ __kernel void KERNEL_TFX( uint fragments = 0; - //__local gs_prim p; - __global BIN_TYPE* bin = &env->bin[bin_index + batch_start * bin_count]; // TODO: not needed for "one tile case" __global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS]; __global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS]; diff --git a/plugins/GSdx/vsprops/common.props b/plugins/GSdx/vsprops/common.props index b8972de77c..30a1244560 100644 --- a/plugins/GSdx/vsprops/common.props +++ b/plugins/GSdx/vsprops/common.props @@ -14,7 +14,7 @@ Level4 ProgramDatabase 4996;4995;4324;4100;4101;4201;4556;4127;4512;%(DisableSpecificWarnings) - $(DXSDK_DIR)include;$(INTELOCLSDKROOT)include;$(VTUNE_AMPLIFIER_XE_2015_DIR)include;$(SolutionDir)3rdparty;%(AdditionalIncludeDirectories) + $(DXSDK_DIR)include;$(VTUNE_AMPLIFIER_XE_2015_DIR)include;$(SolutionDir)3rdparty;$(SolutionDir)3rdparty\opencl;%(AdditionalIncludeDirectories) true diff --git a/plugins/GSdx/vsprops/x64.props b/plugins/GSdx/vsprops/x64.props index ab8253d819..947675c717 100644 --- a/plugins/GSdx/vsprops/x64.props +++ b/plugins/GSdx/vsprops/x64.props @@ -5,7 +5,7 @@ - $(DXSDK_DIR)Lib\x64;$(INTELOCLSDKROOT)lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories) + $(DXSDK_DIR)Lib\x64;$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories) _WIN64;%(PreprocessorDefinitions) diff --git a/plugins/GSdx/vsprops/x86.props b/plugins/GSdx/vsprops/x86.props index 56171d784c..ff76b9535d 100644 --- a/plugins/GSdx/vsprops/x86.props +++ b/plugins/GSdx/vsprops/x86.props @@ -5,7 +5,7 @@ - 
$(DXSDK_DIR)Lib\x86;$(INTELOCLSDKROOT)lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories) + $(DXSDK_DIR)Lib\x86;$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories) From 3d2b0e3766b10b3d745775b99ef7599a46d7bb0d Mon Sep 17 00:00:00 2001 From: gabest11 Date: Sat, 20 Sep 2014 23:59:45 +0200 Subject: [PATCH 08/15] minor opencl kernel optimizations --- plugins/GSdx/GSRendererCL.cpp | 155 +++++++++++++++++----------------- plugins/GSdx/GSRendererCL.h | 1 - plugins/GSdx/res/tfx.cl | 143 +++++++++++++++---------------- 3 files changed, 144 insertions(+), 155 deletions(-) diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 9954b51479..77693579d6 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -61,8 +61,6 @@ typedef struct typedef struct { - cl_uint batch_counter; - cl_uint _pad[7]; struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH]; BIN_TYPE bin[MAX_BIN_COUNT]; cl_uchar4 bbox[MAX_PRIM_COUNT]; @@ -392,10 +390,9 @@ void GSRendererCL::Draw() if(bbox.eq(bbox.rintersect(scissor))) { - pb->sel.noscissor = 1; + job->sel.noscissor = 1; } - job->sel = pb->sel; job->rect.x = rect.x; job->rect.y = rect.y; job->rect.z = rect.z; @@ -418,11 +415,11 @@ void GSRendererCL::Draw() // mark pages used in rendering as source or target - if(pb->sel.fwrite || pb->sel.rfb) + if(job->sel.fwrite || job->sel.rfb) { m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages); - if(pb->sel.rfb) + if(job->sel.rfb) { for(int i = 0; i < 4; i++) { @@ -430,7 +427,7 @@ void GSRendererCL::Draw() } } - if(pb->sel.fwrite) + if(job->sel.fwrite) { GSVector4i* dst_pages = job->GetDstPages(); @@ -443,11 +440,11 @@ void GSRendererCL::Draw() } } - if(pb->sel.zwrite || pb->sel.rzb) + if(job->sel.zwrite || job->sel.rzb) { m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages); - if(pb->sel.rzb) + if(job->sel.rzb) { for(int i = 0; i < 4; i++) { @@ -455,7 +452,7 @@ void GSRendererCL::Draw() } } - if(pb->sel.zwrite) + if(job->sel.zwrite) { 
GSVector4i* dst_pages = job->GetDstPages(); @@ -998,12 +995,12 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver const GSDrawingContext* context = m_context; const GS_PRIM_CLASS primclass = m_vt.m_primclass; - pb->sel.key = 0; + job->sel.key = 0; - pb->sel.atst = ATST_ALWAYS; - pb->sel.tfx = TFX_NONE; - pb->sel.ababcd = 0xff; - pb->sel.prim = primclass; + job->sel.atst = ATST_ALWAYS; + job->sel.tfx = TFX_NONE; + job->sel.ababcd = 0xff; + job->sel.prim = primclass; uint32 fm = context->FRAME.FBMSK; uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; @@ -1026,8 +1023,8 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver { if(!TryAlphaTest(fm, zm)) { - pb->sel.atst = context->TEST.ATST; - pb->sel.afail = context->TEST.AFAIL; + job->sel.atst = context->TEST.ATST; + job->sel.afail = context->TEST.AFAIL; pb->aref = context->TEST.AREF; } } @@ -1056,31 +1053,31 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver if(!fwrite && !zwrite) return false; - bool ftest = pb->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; + bool ftest = job->sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS; - pb->sel.fwrite = fwrite; - pb->sel.ftest = ftest; - pb->sel.zwrite = zwrite; - pb->sel.ztest = ztest; + job->sel.fwrite = fwrite; + job->sel.ftest = ftest; + job->sel.zwrite = zwrite; + job->sel.ztest = ztest; if(fwrite || ftest) { - pb->sel.fpsm = RemapPSM(context->FRAME.PSM); + job->sel.fpsm = RemapPSM(context->FRAME.PSM); if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff) { - pb->sel.iip = PRIM->IIP; + job->sel.iip = PRIM->IIP; } if(PRIM->TME) { - pb->sel.tfx = context->TEX0.TFX; - pb->sel.tcc = context->TEX0.TCC; - pb->sel.fst = PRIM->FST; - pb->sel.ltf = m_vt.IsLinear(); - pb->sel.tpsm = 
RemapPSM(context->TEX0.PSM); - pb->sel.aem = m_env.TEXA.AEM; + job->sel.tfx = context->TEX0.TFX; + job->sel.tcc = context->TEX0.TCC; + job->sel.fst = PRIM->FST; + job->sel.ltf = m_vt.IsLinear(); + job->sel.tpsm = RemapPSM(context->TEX0.PSM); + job->sel.aem = m_env.TEXA.AEM; pb->tbp[0] = context->TEX0.TBP0; pb->tbw[0] = context->TEX0.TBW; @@ -1089,24 +1086,24 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) { - pb->sel.tlu = 1; + job->sel.tlu = 1; memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal); } - pb->sel.wms = context->CLAMP.WMS; - pb->sel.wmt = context->CLAMP.WMT; + job->sel.wms = context->CLAMP.WMS; + job->sel.wmt = context->CLAMP.WMT; - if(pb->sel.tfx == TFX_MODULATE && pb->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128))) + if(job->sel.tfx == TFX_MODULATE && job->sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128))) { // modulate does not do anything when vertex color is 0x80 - pb->sel.tfx = TFX_DECAL; + job->sel.tfx = TFX_DECAL; } GSVector4i r; - GetTextureMinMax(r, context->TEX0, context->CLAMP, pb->sel.ltf); + GetTextureMinMax(r, context->TEX0, context->CLAMP, job->sel.ltf); GSVector4i* src_pages = job->GetSrcPages(); @@ -1131,15 +1128,15 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver if(m_vt.m_lod.x > 0) { - pb->sel.ltf = context->TEX1.MMIN >> 2; + job->sel.ltf = context->TEX1.MMIN >> 2; } else { // TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0 } - pb->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri - pb->sel.lcm = context->TEX1.LCM; + job->sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri + job->sel.lcm = context->TEX1.LCM; int mxl = std::min((int)context->TEX1.MXL, 6) << 16; int k = context->TEX1.K << 12; @@ -1148,28 +1145,28 @@ bool GSRendererCL::SetupParameter(TFXJob* 
job, TFXParameter* pb, GSVertexCL* ver { k = (int)m_vt.m_lod.x << 16; // set lod to max level - pb->sel.lcm = 1; // lod is constant - pb->sel.mmin = 1; // tri-linear is meaningless + job->sel.lcm = 1; // lod is constant + job->sel.mmin = 1; // tri-linear is meaningless } - if(pb->sel.mmin == 2) + if(job->sel.mmin == 2) { mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?) } - if(pb->sel.fst) + if(job->sel.fst) { - ASSERT(pb->sel.lcm == 1); + ASSERT(job->sel.lcm == 1); ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu) - pb->sel.lcm = 1; + job->sel.lcm = 1; } - if(pb->sel.lcm) + if(job->sel.lcm) { int lod = std::max(std::min(k, mxl), 0); - if(pb->sel.mmin == 1) + if(job->sel.mmin == 1) { lod = (lod + 0x8000) & 0xffff0000; // rounding } @@ -1241,7 +1238,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver GSVector4i r; - GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, pb->sel.ltf); + GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, job->sel.ltf); GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM); @@ -1260,7 +1257,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver } else { - if(pb->sel.fst == 0) + if(job->sel.fst == 0) { // skip per pixel division if q is constant @@ -1268,7 +1265,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver if(m_vt.m_eq.q) { - pb->sel.fst = 1; + job->sel.fst = 1; const GSVector4& t = v[index[0]].t; @@ -1286,7 +1283,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver } else if(primclass == GS_SPRITE_CLASS) { - pb->sel.fst = 1; + job->sel.fst = 1; for(int i = 0, j = vertex_count; i < j; i += 2) { @@ -1301,7 +1298,7 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver } } - if(pb->sel.ltf && pb->sel.fst) // TODO: quite slow, do this in the prim kernel? 
+ if(job->sel.ltf && job->sel.fst) // TODO: quite slow, do this in the prim kernel? { // if q is constant we can do the half pel shift for bilinear sampling on the vertices @@ -1378,58 +1375,58 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver if(PRIM->FGE) { - pb->sel.fge = 1; + job->sel.fge = 1; pb->fog = env.FOGCOL.u32[0]; } if(context->FRAME.PSM != PSM_PSMCT24) { - pb->sel.date = context->TEST.DATE; - pb->sel.datm = context->TEST.DATM; + job->sel.date = context->TEST.DATE; + job->sel.datm = context->TEST.DATM; } if(!IsOpaque()) { - pb->sel.abe = PRIM->ABE; - pb->sel.ababcd = context->ALPHA.u32[0]; + job->sel.abe = PRIM->ABE; + job->sel.ababcd = context->ALPHA.u32[0]; if(env.PABE.PABE) { - pb->sel.pabe = 1; + job->sel.pabe = 1; } if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS)) { - pb->sel.aa1 = 1; + job->sel.aa1 = 1; } pb->afix = context->ALPHA.FIX; } - if(pb->sel.date || pb->sel.aba == 1 || pb->sel.abb == 1 || pb->sel.abc == 1 || pb->sel.abd == 1) + if(job->sel.date || job->sel.aba == 1 || job->sel.abb == 1 || job->sel.abc == 1 || job->sel.abd == 1) { - pb->sel.rfb = 1; + job->sel.rfb = 1; } else { if(fwrite) { - if(pb->sel.atst != ATST_ALWAYS && pb->sel.afail == AFAIL_RGB_ONLY - || (pb->sel.fpsm & 3) == 0 && fm != 0 - || (pb->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask - || (pb->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0) + if(job->sel.atst != ATST_ALWAYS && job->sel.afail == AFAIL_RGB_ONLY + || (job->sel.fpsm & 3) == 0 && fm != 0 + || (job->sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask + || (job->sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0) { - pb->sel.rfb = 1; + job->sel.rfb = 1; } } } - pb->sel.colclamp = env.COLCLAMP.CLAMP; - pb->sel.fba = context->FBA.FBA; + job->sel.colclamp = env.COLCLAMP.CLAMP; + job->sel.fba = context->FBA.FBA; if(env.DTHE.DTHE) { - pb->sel.dthe = 1; + job->sel.dthe = 1; GSVector4i dimx0 = 
env.dimx[1].sll32(16).sra32(16); GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16); @@ -1442,21 +1439,21 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver if(zwrite || ztest) { - pb->sel.zpsm = RemapPSM(context->ZBUF.PSM); - pb->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS; + job->sel.zpsm = RemapPSM(context->ZBUF.PSM); + job->sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS; if(ztest) { - pb->sel.rzb = 1; + job->sel.rzb = 1; } else { if(zwrite) { - if(pb->sel.atst != ATST_ALWAYS && (pb->sel.afail == AFAIL_FB_ONLY || pb->sel.afail == AFAIL_RGB_ONLY) - || (pb->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask + if(job->sel.atst != ATST_ALWAYS && (job->sel.afail == AFAIL_FB_ONLY || job->sel.afail == AFAIL_RGB_ONLY) + || (job->sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask { - pb->sel.rzb = 1; + job->sel.rzb = 1; } } } @@ -1465,11 +1462,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver pb->fm = fm; pb->zm = zm; - if((pb->sel.fpsm & 3) == 1) + if((job->sel.fpsm & 3) == 1) { pb->fm |= 0xff000000; } - else if((pb->sel.fpsm & 3) >= 2) + else if((job->sel.fpsm & 3) >= 2) { uint32 rb = pb->fm & 0x00f800f8; uint32 ga = pb->fm & 0x8000f800; @@ -1477,11 +1474,11 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000; } - if((pb->sel.zpsm & 3) == 1) + if((job->sel.zpsm & 3) == 1) { pb->zm |= 0xff000000; } - else if((pb->sel.zpsm & 3) >= 2) + else if((job->sel.zpsm & 3) >= 2) { pb->zm |= 0xffff0000; } diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h index 3efe33f29b..59fd524943 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -146,7 +146,6 @@ class GSRendererCL : public GSRenderer { GSVector4i scissor; GSVector4i dimx; // 4x4 signed char - TFXSelector sel; uint32 fbp, zbp, bw; uint32 fm, zm; uint32 
fog; // rgb diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index bdf75b11a1..f9bb6dac2b 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -1,5 +1,21 @@ #if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency +#ifdef cl_amd_printf +#pragma OPENCL EXTENSION cl_amd_printf : enable +#else +#define printf(x) +#endif + +#ifdef cl_amd_media_ops +#pragma OPENCL EXTENSION cl_amd_media_ops : enable +#else +#endif + +#ifdef cl_amd_media_ops2 +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable +#else +#endif + #ifndef CL_FLT_EPSILON #define CL_FLT_EPSILON 1.1920928955078125e-7f #endif @@ -32,8 +48,6 @@ typedef struct typedef struct { - uint batch_counter; - uint _pad[7]; struct {uint first, last;} bounds[MAX_BIN_PER_BATCH]; BIN_TYPE bin[MAX_BIN_COUNT]; uchar4 bbox[MAX_PRIM_COUNT]; @@ -45,7 +59,6 @@ typedef struct { int4 scissor; char dimx[4][4]; - ulong sel; int fbp, zbp, bw; uint fm, zm; uchar4 fog; // rgb @@ -679,7 +692,6 @@ int tile_in_triangle(float2 p, gs_barycentric b) __kernel void KERNEL_TILE(__global gs_env* env) { - env->batch_counter = 0; env->bounds[get_global_id(0)].first = -1; env->bounds[get_global_id(0)].last = 0; } @@ -777,77 +789,60 @@ __kernel void KERNEL_TILE( uint bin_count, // == bin_dim.z * bin_dim.w uchar4 bin_dim) { - __local uchar4 bbox_cache[MAX_PRIM_PER_BATCH]; - __local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH]; - __local uint batch_index; - + size_t batch_index = get_group_id(0); size_t local_id = get_local_id(0); size_t local_size = get_local_size(0); - while(1) + uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH); + + __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; + __global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS]; + __global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS]; + + __local uchar4 
bbox_cache[MAX_PRIM_PER_BATCH]; + __local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH]; + + event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0); + + wait_group_events(1, &e); + + if(PRIM == GS_TRIANGLE_CLASS) { - barrier(CLK_LOCAL_MEM_FENCE); - - if(local_id == 0) - { - batch_index = atomic_inc(&env->batch_counter); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(batch_index >= batch_count) - { - break; - } - - uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH); + e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); - __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; - __global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS]; - __global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS]; - - event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0); - wait_group_events(1, &e); + } - if(PRIM == GS_TRIANGLE_CLASS) + for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size) + { + int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware + int x = bin_index - y * bin_dim.z; + + x += bin_dim.x; + y += bin_dim.y; + + BIN_TYPE visible = 0; + + for(uint i = 0; i < batch_prim_count; i++) { - e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); - - wait_group_events(1, &e); + uchar4 r = bbox_cache[i]; + + BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y); + + if(PRIM == GS_TRIANGLE_CLASS && test != 0) + { + test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]); + } + + visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i); } - for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size) + 
bin[bin_index] = visible; + + if(visible != 0) { - int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware - int x = bin_index - y * bin_dim.z; - - x += bin_dim.x; - y += bin_dim.y; - - BIN_TYPE visible = 0; - - for(uint i = 0; i < batch_prim_count; i++) - { - uchar4 r = bbox_cache[i]; - - BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y); - - if(PRIM == GS_TRIANGLE_CLASS && test != 0) - { - test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]); - } - - visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i); - } - - bin[bin_index] = visible; - - if(visible != 0) - { - atomic_min(&env->bounds[bin_index].first, batch_index); - atomic_max(&env->bounds[bin_index].last, batch_index); - } + atomic_min(&env->bounds[bin_index].first, batch_index); + atomic_max(&env->bounds[bin_index].last, batch_index); } } } @@ -998,10 +993,10 @@ int4 AlphaBlend(int4 c, int afix, uint fd) } else if(is16bit(FPSM)) { - cd.x = (fd & 0x001f) << 3; - cd.y = (fd & 0x03e0) >> 2; - cd.z = (fd & 0x7c00) >> 7; - cd.w = (fd & 0x8000) >> 8; + cd.x = (fd << 3) & 0xf8; + cd.y = (fd >> 2) & 0xf8; + cd.z = (fd >> 7) & 0xf8; + cd.w = (fd >> 8) & 0x80; } } @@ -1077,9 +1072,9 @@ uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1) { uchar4 c; - c.x = (rgba & 0x001f) << 3; - c.y = (rgba & 0x03e0) >> 2; - c.z = (rgba & 0x7c00) >> 7; + c.x = (rgba << 3) & 0xf8; + c.y = (rgba >> 2) & 0xf8; + c.z = (rgba >> 7) & 0xf8; c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0; return c; @@ -1202,7 +1197,7 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t) // multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end // pb->fm? alpha channel and following alpha tests? 
some games may depend on exact results, not some average -__kernel void KERNEL_TFX( +__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( __global gs_env* env, __global uchar* vm, __global uchar* tex, @@ -1214,8 +1209,6 @@ __kernel void KERNEL_TFX( uint bin_count, // == bin_dim.z * bin_dim.w uchar4 bin_dim) { - // TODO: try it the bin_index = atomic_inc(&env->bin_counter) way - uint x = get_global_id(0); uint y = get_global_id(1); @@ -1451,7 +1444,7 @@ __kernel void KERNEL_TFX( { if(!ABE || c.w == 0x80) { - c.w = /*edge ? coverage :*/ 0x80; // TODO + c.w = 0x80; // TODO: edge ? coverage : 0x80 } } } From 6f5cd1cd4d820db7440235f357d13591b43bdcde Mon Sep 17 00:00:00 2001 From: gabest11 Date: Sun, 21 Sep 2014 18:13:55 +0200 Subject: [PATCH 09/15] joined some tfx kernel calls, general speed up in most games --- plugins/GSdx/GSRendererCL.cpp | 298 ++++++++++++++++++++++++---------- plugins/GSdx/GSRendererCL.h | 57 ++----- plugins/GSdx/GSUtil.cpp | 11 ++ plugins/GSdx/GSUtil.h | 1 + plugins/GSdx/res/tfx.cl | 69 ++++---- 5 files changed, 271 insertions(+), 165 deletions(-) diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 77693579d6..ff7aa4d43f 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -36,6 +36,7 @@ static FILE* s_fp = LOG ? 
fopen("c:\\temp1\\_.txt", "w") : NULL; #define BIN_SIZE (1u << BIN_SIZE_BITS) #define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE)) #define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT) +#define TFX_PARAM_SIZE 2048 #if MAX_PRIM_PER_BATCH == 64u #define BIN_TYPE cl_ulong @@ -72,6 +73,7 @@ typedef struct GSRendererCL::GSRendererCL() : m_vb_count(0) + , m_synced(true) { m_nativeres = true; // ignore ini, sw is always native @@ -97,6 +99,9 @@ GSRendererCL::GSRendererCL() InitCVB(GS_TRIANGLE_CLASS); InitCVB(GS_SPRITE_CLASS); + // NOTE: m_cl.vm may be cached on the device according to the specs, there are a couple of places where we access m_mem.m_vm8 without + // mapping the buffer (after the two invalidate* calls and in getoutput), it is currently not an issue, but on some devices it may be. + m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL); m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE, (size_t)m_mem.m_vmsize); } @@ -122,13 +127,17 @@ static int pageuploads = 0; static int pageuploadcount = 0; static int tfxcount = 0; static int64 tfxpixels = 0; +static int tfxselcount = 0; +static int tfxdiffselcount = 0; void GSRendererCL::VSync(int field) { GSRenderer::VSync(field); //printf("vsync %d/%d/%d/%d\n", pageuploads, pageuploadcount, tfxcount, tfxpixels); + //printf("vsync %d/%d\n", tfxselcount, tfxdiffselcount); pageuploads = pageuploadcount = tfxcount = tfxpixels = 0; + tfxselcount = tfxdiffselcount = 0; //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); } @@ -284,7 +293,9 @@ void GSRendererCL::Draw() { size_t vb_size = m_vertex.next * sizeof(GSVertexCL); size_t ib_size = m_index.tail * sizeof(uint32); - size_t pb_size = sizeof(TFXParameter); + size_t pb_size = TFX_PARAM_SIZE; + + ASSERT(sizeof(TFXParameter) <= TFX_PARAM_SIZE); if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > 
m_cl.pb.size) { @@ -366,12 +377,16 @@ void GSRendererCL::Draw() m_vb_start = m_cl.vb.tail; m_vb_count = 0; + m_pb_start = m_cl.pb.tail; + m_pb_count = 0; } else { // TODO: SIMD - uint32 vb_count = m_vb_count; + ASSERT(m_pb_count < 256); + + uint32 vb_count = m_vb_count | (m_pb_count << 24); for(size_t i = 0; i < m_index.tail; i++) { @@ -398,21 +413,25 @@ void GSRendererCL::Draw() job->rect.z = rect.z; job->rect.w = rect.w; job->ib_start = m_cl.ib.tail; - job->ib_count = m_index.tail; - job->pb_start = m_cl.pb.tail; + job->prim_count = m_index.tail / GSUtil::GetClassVertexCount(m_vt.m_primclass); + job->fbp = pb->fbp; + job->zbp = pb->zbp; + job->bw = pb->bw; #ifdef DEBUG - job->param = pb; + job->pb = pb; #endif - m_jobs.push_back(job); m_vb_count += m_vertex.next; + m_pb_count++; m_cl.vb.tail += vb_size; m_cl.ib.tail += ib_size; m_cl.pb.tail += pb_size; + m_synced = false; + // mark pages used in rendering as source or target if(job->sel.fwrite || job->sel.rfb) @@ -542,12 +561,7 @@ void GSRendererCL::Sync(int reason) m_rw_pages[1][i] = GSVector4i::zero(); } - // TODO: sync buffers created with CL_MEM_USE_HOST_PTR (on m_mem.m_vm8) by a simple map/unmap, - // though it does not seem to be necessary even with GPU devices where it might be cached, - // needs more testing... - - //void* ptr = m_cl.queue->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize); - //m_cl.queue->enqueueUnmapMemObject(m_cl.vm, ptr); + m_synced = true; } void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) @@ -558,7 +572,7 @@ void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS o->GetPagesAsBits(r, m_tmp_pages); - //if(!synced) + if(!m_synced) { for(int i = 0; i < 4; i++) { @@ -588,7 +602,7 @@ void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS { if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? 
"rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} - //if(!synced) + if(!m_synced) { GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); @@ -620,16 +634,7 @@ void GSRendererCL::Enqueue() int primclass = m_jobs.front()->sel.prim; - uint32 n; - - switch(primclass) - { - case GS_POINT_CLASS: n = 1; break; - case GS_LINE_CLASS: n = 2; break; - case GS_TRIANGLE_CLASS: n = 3; break; - case GS_SPRITE_CLASS: n = 2; break; - default: __assume(0); - } + uint32 n = GSUtil::GetClassVertexCount(primclass); PrimSelector psel; @@ -678,8 +683,6 @@ void GSRendererCL::Enqueue() // - cl_kernel tfx_prev = NULL; - auto head = m_jobs.begin(); while(head != m_jobs.end()) @@ -692,8 +695,8 @@ void GSRendererCL::Enqueue() { auto job = next++; - uint32 cur_prim_count = (*job)->ib_count / n; - uint32 next_prim_count = next != m_jobs.end() ? (*next)->ib_count / n : 0; + uint32 cur_prim_count = (*job)->prim_count; + uint32 next_prim_count = next != m_jobs.end() ? 
(*next)->prim_count : 0; total_prim_count += cur_prim_count; @@ -775,9 +778,8 @@ void GSRendererCL::Enqueue() uint32 group_count = batch_count * item_count; tk.setArg(1, (cl_uint)prim_count); - tk.setArg(2, (cl_uint)batch_count); - tk.setArg(3, (cl_uint)bin_count); - tk.setArg(4, bin_dim); + tk.setArg(2, (cl_uint)bin_count); + tk.setArg(3, bin_dim); m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count)); } @@ -789,68 +791,20 @@ void GSRendererCL::Enqueue() } } - // + std::list> jobs(head, next); - uint32 prim_start = 0; - - for(auto i = head; i != next; i++) - { - ASSERT(prim_start < MAX_PRIM_COUNT); - - // TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same - // move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim - - tfxcount++; - - UpdateTextureCache((*i).get()); - - uint32 prim_count_inner = std::min((*i)->ib_count / n, MAX_PRIM_COUNT - prim_start); - - // TODO: tile level z test - - cl::Kernel& tfx = m_cl.GetTFXKernel((*i)->sel); - - if(tfx_prev != tfx()) - { - tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]); - - tfx_prev = tfx(); - } - - tfx.setArg(4, (cl_uint)(*i)->pb_start); - tfx.setArg(5, (cl_uint)prim_start); - tfx.setArg(6, (cl_uint)prim_count_inner); - tfx.setArg(7, (cl_uint)batch_count); - tfx.setArg(8, (cl_uint)bin_count); - tfx.setArg(9, bin_dim); - - GSVector4i r = GSVector4i::load(&(*i)->rect); - - r = r.ralign(GSVector2i(8, 8)); - - m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8)); - - tfxpixels += r.width() * r.height(); - - InvalidateTextureCache((*i).get()); - - // TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily - - prim_start += prim_count_inner; - } - - // + EnqueueTFX(jobs, bin_count, bin_dim); if(total_prim_count > MAX_PRIM_COUNT) { prim_count = cur_prim_count - 
(total_prim_count - MAX_PRIM_COUNT); (*job)->ib_start += prim_count * n * sizeof(uint32); - (*job)->ib_count -= prim_count * n; + (*job)->prim_count -= prim_count; next = job; // try again for the remainder - //printf("split %d\n", (*job)->ib_count / n); + //printf("split %d\n", (*job)->prim_count); } break; @@ -876,6 +830,131 @@ void GSRendererCL::Enqueue() m_cl.Map(); } +void GSRendererCL::EnqueueTFX(std::list>& jobs, uint32 bin_count, const cl_uchar4& bin_dim) +{ + // join tfx kernel calls where the selector and fbp/zbp/bw are the same and src_pages != prev dst_pages + + //printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start); + + auto next = jobs.begin(); + + while(next != jobs.end()) + { + auto prev = next++; + + if(next == jobs.end()) + { + break; + } + + if((*prev)->sel == (*next)->sel && (*prev)->fbp == (*next)->fbp && (*prev)->zbp == (*next)->zbp && (*prev)->bw == (*next)->bw) + { + if((*prev)->dst_pages != NULL && (*next)->src_pages != NULL) + { + bool overlap = false; + + for(int i = 0; i < 4; i++) + { + if(!((*prev)->dst_pages[i] & (*next)->src_pages[i]).eq(GSVector4i::zero())) + { + overlap = true; + + break; + } + } + + if(overlap) + { + continue; + } + } + + if((*prev)->src_pages != NULL) + { + GSVector4i* src_pages = (*next)->GetSrcPages(); + + for(int i = 0; i < 4; i++) + { + src_pages[i] |= (*prev)->src_pages[i]; + } + } + + if((*prev)->dst_pages != NULL) + { + GSVector4i* dst_pages = (*next)->GetDstPages(); + + for(int i = 0; i < 4; i++) + { + dst_pages[i] |= (*prev)->dst_pages[i]; + } + } + + GSVector4i prev_rect = GSVector4i::load(&(*prev)->rect); + GSVector4i next_rect = GSVector4i::load(&(*next)->rect); + + GSVector4i::store(&(*next)->rect, prev_rect.runion(next_rect)); + + (*next)->prim_count += (*prev)->prim_count; + (*next)->ib_start = (*prev)->ib_start; + + jobs.erase(prev); + } + } + + //printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x 
%d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start); + + // + + cl_kernel tfx_prev = NULL; + + uint32 prim_start = 0; + + for(auto i : jobs) + { + ASSERT(prim_start < MAX_PRIM_COUNT); + + tfxcount++; + + UpdateTextureCache(i.get()); + + uint32 prim_count = std::min(i->prim_count, MAX_PRIM_COUNT - prim_start); + + // TODO: tile level z test + + cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel); + + if(tfx_prev != tfx()) + { + tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]); + tfx.setArg(4, (cl_uint)m_pb_start); + + tfx_prev = tfx(); + } + + tfx.setArg(5, (cl_uint)prim_start); + tfx.setArg(6, (cl_uint)prim_count); + tfx.setArg(7, (cl_uint)bin_count); + tfx.setArg(8, bin_dim); + tfx.setArg(9, i->fbp); + tfx.setArg(10, i->zbp); + tfx.setArg(11, i->bw); + + GSVector4i r = GSVector4i::load(&i->rect); + + r = r.ralign(GSVector2i(8, 8)); + + m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8)); + + tfxpixels += r.width() * r.height(); + + InvalidateTextureCache(i.get()); + + // TODO: partial job renderings (>MAX_PRIM_COUNT) may invalidate pages unnecessarily + + prim_start += prim_count; + } +} + void GSRendererCL::UpdateTextureCache(TFXJob* job) { if(job->src_pages == NULL) return; @@ -1490,7 +1569,51 @@ bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* ver return true; } -////////// +// + +GSRendererCL::TFXJob::TFXJob() + : src_pages(NULL) + , dst_pages(NULL) +{ +} + +GSRendererCL::TFXJob::~TFXJob() +{ + if(src_pages != NULL) _aligned_free(src_pages); + if(dst_pages != NULL) _aligned_free(dst_pages); +} + +GSVector4i* GSRendererCL::TFXJob::GetSrcPages() +{ + if(src_pages == NULL) + { + src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); + + src_pages[0] = GSVector4i::zero(); + src_pages[1] = GSVector4i::zero(); + src_pages[2] = GSVector4i::zero(); + src_pages[3] = GSVector4i::zero(); + } + + return src_pages; +} + 
+GSVector4i* GSRendererCL::TFXJob::GetDstPages() +{ + if(dst_pages == NULL) + { + dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); + + dst_pages[0] = GSVector4i::zero(); + dst_pages[1] = GSVector4i::zero(); + dst_pages[2] = GSVector4i::zero(); + dst_pages[3] = GSVector4i::zero(); + } + + return dst_pages; +} + +// //#define IOCL_DEBUG @@ -1578,7 +1701,7 @@ GSRendererCL::CL::CL() ib.mapped_ptr = ib.ptr = NULL; pb.mapped_ptr = pb.ptr = NULL; - pb.size = sizeof(TFXParameter) * 256; + pb.size = TFX_PARAM_SIZE * 256; pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); @@ -1597,12 +1720,13 @@ void GSRendererCL::CL::Map() { Unmap(); + // TODO: CL_MAP_WRITE_INVALIDATE_REGION if 1.2+ + if(vb.head < vb.size) { vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head); vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; ASSERT(((size_t)vb.ptr & 15) == 0); - ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0); } if(ib.head < ib.size) @@ -1616,7 +1740,6 @@ void GSRendererCL::CL::Map() pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head); pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; ASSERT(((size_t)pb.ptr & 15) == 0); - ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0); } } @@ -1643,6 +1766,7 @@ static void AddDefs(ostringstream& opt) opt << "-D BIN_SIZE=" << BIN_SIZE << "u "; opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u "; opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u "; + opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u "; #ifdef IOCL_DEBUG opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" "; #endif diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h index 59fd524943..fdfea3f6d3 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -164,55 +164,20 @@ 
class GSRendererCL : public GSRenderer { public: struct { int x, y, z, w; } rect; - TFXSelector sel; // uses primclass, solidrect only - uint32 ib_start, ib_count; - uint32 pb_start; + TFXSelector sel; + uint32 ib_start; + uint32 prim_count; GSVector4i* src_pages; // read by any texture level GSVector4i* dst_pages; // f/z writes to it + uint32 fbp, zbp, bw; #ifdef DEBUG - TFXParameter* param; + TFXParameter* pb; #endif - TFXJob() - : src_pages(NULL) - , dst_pages(NULL) - { - } + TFXJob(); + virtual ~TFXJob(); - virtual ~TFXJob() - { - if(src_pages != NULL) _aligned_free(src_pages); - if(dst_pages != NULL) _aligned_free(dst_pages); - } - - GSVector4i* GetSrcPages() - { - if(src_pages == NULL) - { - src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); - - src_pages[0] = GSVector4i::zero(); - src_pages[1] = GSVector4i::zero(); - src_pages[2] = GSVector4i::zero(); - src_pages[3] = GSVector4i::zero(); - } - - return src_pages; - } - - GSVector4i* GetDstPages() - { - if(dst_pages == NULL) - { - dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); - - dst_pages[0] = GSVector4i::zero(); - dst_pages[1] = GSVector4i::zero(); - dst_pages[2] = GSVector4i::zero(); - dst_pages[3] = GSVector4i::zero(); - } - - return dst_pages; - } + GSVector4i* GetSrcPages(); + GSVector4i* GetDstPages(); }; class CL @@ -252,8 +217,12 @@ class GSRendererCL : public GSRenderer std::list> m_jobs; uint32 m_vb_start; uint32 m_vb_count; + uint32 m_pb_start; + uint32 m_pb_count; + bool m_synced; void Enqueue(); + void EnqueueTFX(std::list>& jobs, uint32 bin_count, const cl_uchar4& bin_dim); void UpdateTextureCache(TFXJob* job); void InvalidateTextureCache(TFXJob* job); diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index 533e51642a..7c0e239edc 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -94,6 +94,7 @@ static class GSUtilMaps public: uint8 PrimClassField[8]; uint8 VertexCountField[8]; + uint8 ClassVertexCountField[4]; uint32 
CompatibleBitsField[64][2]; uint32 SharedBitsField[64][2]; @@ -117,6 +118,11 @@ public: VertexCountField[GS_SPRITE] = 2; VertexCountField[GS_INVALID] = 1; + ClassVertexCountField[GS_POINT_CLASS] = 1; + ClassVertexCountField[GS_LINE_CLASS] = 2; + ClassVertexCountField[GS_TRIANGLE_CLASS] = 3; + ClassVertexCountField[GS_SPRITE_CLASS] = 2; + memset(CompatibleBitsField, 0, sizeof(CompatibleBitsField)); for(int i = 0; i < 64; i++) @@ -163,6 +169,11 @@ int GSUtil::GetVertexCount(uint32 prim) return s_maps.VertexCountField[prim]; } +int GSUtil::GetClassVertexCount(uint32 primclass) +{ + return s_maps.ClassVertexCountField[primclass]; +} + const uint32* GSUtil::HasSharedBitsPtr(uint32 dpsm) { return s_maps.SharedBitsField[dpsm]; diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index b3697c0812..a4cad91014 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -30,6 +30,7 @@ public: static GS_PRIM_CLASS GetPrimClass(uint32 prim); static int GetVertexCount(uint32 prim); + static int GetClassVertexCount(uint32 primclass); static const uint32* HasSharedBitsPtr(uint32 dpsm); static bool HasSharedBits(uint32 spsm, const uint32* ptr); diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index f9bb6dac2b..65f7cfa05e 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -36,7 +36,10 @@ typedef struct typedef struct { - gs_vertex v[4]; + gs_vertex v[3]; + uint zmin; + uint pb_index; + uint _pad[2]; } gs_prim; typedef struct @@ -560,12 +563,16 @@ __kernel void KERNEL_PRIM( ib += prim_index * VERTEX_PER_PRIM; + prim->pb_index = ib[0] >> 24; + + __global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff]; + __global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff]; + __global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff]; + int2 pmin, pmax; if(PRIM == GS_POINT_CLASS) { - __global gs_vertex* v0 = &vb[ib[0]]; - pmin = pmax = convert_int2_rte(v0->p.xy); prim->v[0].p = v0->p; @@ -573,18 +580,14 @@ __kernel void KERNEL_PRIM( } else if(PRIM == GS_LINE_CLASS) { - int2 p0 = 
convert_int2_rte(vb[ib[0]].p.xy); - int2 p1 = convert_int2_rte(vb[ib[1]].p.xy); + int2 p0 = convert_int2_rte(v0->p.xy); + int2 p1 = convert_int2_rte(v1->p.xy); pmin = min(p0, p1); pmax = max(p0, p1); } else if(PRIM == GS_TRIANGLE_CLASS) { - __global gs_vertex* v0 = &vb[ib[0]]; - __global gs_vertex* v1 = &vb[ib[1]]; - __global gs_vertex* v2 = &vb[ib[2]]; - int2 p0 = convert_int2_rtp(v0->p.xy); int2 p1 = convert_int2_rtp(v1->p.xy); int2 p2 = convert_int2_rtp(v2->p.xy); @@ -593,8 +596,7 @@ __kernel void KERNEL_PRIM( pmax = max(max(p0, p1), p2); // z needs special care, since it's a 32 bit unit, float cannot encode it exactly - // pass the minimum through the unused 4th padding vector - // only interpolate the relative and hopefully small values + // only interpolate the relative to zmin and hopefully small values uint zmin = min(min(v0->z, v1->z), v2->z); @@ -605,7 +607,7 @@ __kernel void KERNEL_PRIM( prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w); prim->v[2].tc = v2->tc; - prim->v[3].z = zmin; + prim->zmin = zmin; float4 dp0 = v1->p - v0->p; float4 dp1 = v0->p - v2->p; @@ -652,9 +654,6 @@ __kernel void KERNEL_PRIM( } else if(PRIM == GS_SPRITE_CLASS) { - __global gs_vertex* v0 = &vb[ib[0]]; - __global gs_vertex* v1 = &vb[ib[1]]; - int2 p0 = convert_int2_rtp(v0->p.xy); int2 p1 = convert_int2_rtp(v1->p.xy); @@ -785,7 +784,6 @@ __kernel void KERNEL_TILE( __kernel void KERNEL_TILE( __global gs_env* env, uint prim_count, - uint batch_count, uint bin_count, // == bin_dim.z * bin_dim.w uchar4 bin_dim) { @@ -1205,9 +1203,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( uint pb_start, uint prim_start, uint prim_count, - uint batch_count, uint bin_count, // == bin_dim.z * bin_dim.w - uchar4 bin_dim) + uchar4 bin_dim, + uint fbp, + uint zbp, + uint bw) { uint x = get_global_id(0); uint y = get_global_id(1); @@ -1255,21 +1255,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( // - __global 
gs_param* pb = (__global gs_param*)(pb_base + pb_start); - int2 pi = (int2)(x, y); float2 pf = convert_float2(pi); - if(!NOSCISSOR) - { - if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw))) - { - return; - } - } - - int faddr = PixelAddress(x, y, pb->fbp, pb->bw, FPSM); - int zaddr = PixelAddress(x, y, pb->zbp, pb->bw, ZPSM); + int faddr = PixelAddress(x, y, fbp, bw, FPSM); + int zaddr = PixelAddress(x, y, zbp, bw, ZPSM); uint fd, zd; // TODO: fd as int4 and only pack before writing out? @@ -1298,6 +1288,8 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( __global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS]; __global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS]; + pb_base += pb_start; + BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) @@ -1311,10 +1303,19 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( break; } - __global gs_prim* prim = &prim_base[prim_index + i]; - bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); + __global gs_prim* prim = &prim_base[prim_index + i]; + __global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE); + + if(!NOSCISSOR) + { + if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw))) + { + continue; + } + } + uint2 zf; float3 t; int4 c; @@ -1359,7 +1360,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw)); float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw)); - zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z; + zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->zmin; zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y); t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; @@ 
-1449,7 +1450,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( } } - // read mask (read once outside the loop if alpha test does not modify, not sure if it does not get optimized there anyway) + // read mask uint fm = pb->fm; uint zm = pb->zm; From e3ba15de944e44c6b93335fc00aa4a5bc64f7db9 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Mon, 22 Sep 2014 02:50:51 +0200 Subject: [PATCH 10/15] opencl device selection in settings dialog --- plugins/GSdx/GSDialog.cpp | 48 ++++++++++++++++++ plugins/GSdx/GSDialog.h | 1 + plugins/GSdx/GSRendererCL.cpp | 71 +++++++++----------------- plugins/GSdx/GSSettingsDlg.cpp | 92 ++++++++++++++++++++++------------ plugins/GSdx/GSSettingsDlg.h | 9 ++-- plugins/GSdx/GSUtil.cpp | 54 ++++++++++++++++++++ plugins/GSdx/GSUtil.h | 7 +++ plugins/GSdx/GSdx.cpp | 14 ++++++ plugins/GSdx/GSdx.h | 2 + plugins/GSdx/GSdx.rc | 72 +++++++++++++------------- plugins/GSdx/res/tfx.cl | 27 +++++++--- plugins/GSdx/resource.h | 3 +- 12 files changed, 275 insertions(+), 125 deletions(-) diff --git a/plugins/GSdx/GSDialog.cpp b/plugins/GSdx/GSDialog.cpp index 47a0d8796a..ea4c75191c 100644 --- a/plugins/GSdx/GSDialog.cpp +++ b/plugins/GSdx/GSDialog.cpp @@ -145,6 +145,8 @@ void GSDialog::ComboBoxInit(UINT id, const vector& settings, uint32 s ComboBoxAppend(id, str.c_str(), (LPARAM)s.id, s.id == selid); } } + + ComboBoxFixDroppedWidth(id); } int GSDialog::ComboBoxAppend(UINT id, const char* str, LPARAM data, bool select) @@ -178,3 +180,49 @@ bool GSDialog::ComboBoxGetSelData(UINT id, INT_PTR& data) return false; } + +void GSDialog::ComboBoxFixDroppedWidth(UINT id) +{ + HWND hWnd = GetDlgItem(m_hWnd, id); + + int count = (int)SendMessage(hWnd, CB_GETCOUNT, 0, 0); + + if(count > 0) + { + HDC hDC = GetDC(hWnd); + + SelectObject(hDC, (HFONT)SendMessage(hWnd, WM_GETFONT, 0, 0)); + + int width = (int)SendMessage(hWnd, CB_GETDROPPEDWIDTH, 0, 0); + + for(int i = 0; i < count; i++) + { + int len = (int)SendMessage(hWnd, CB_GETLBTEXTLEN, i, 0); 
+ + if(len > 0) + { + char* buff = new char[len + 1]; + + SendMessage(hWnd, CB_GETLBTEXT, i, (LPARAM)buff); + + SIZE size; + + if(GetTextExtentPoint32(hDC, buff, strlen(buff), &size)) + { + size.cx += 10; + + if(size.cx > width) width = size.cx; + } + + delete [] buff; + } + } + + ReleaseDC(hWnd, hDC); + + if(width > 0) + { + SendMessage(hWnd, CB_SETDROPPEDWIDTH, width, 0); + } + } +} diff --git a/plugins/GSdx/GSDialog.h b/plugins/GSdx/GSDialog.h index 3f5534f01d..2e5ff499ab 100644 --- a/plugins/GSdx/GSDialog.h +++ b/plugins/GSdx/GSDialog.h @@ -53,4 +53,5 @@ public: void ComboBoxInit(UINT id, const vector& settings, uint32 selid, uint32 maxid = ~0); int ComboBoxAppend(UINT id, const char* str, LPARAM data = 0, bool select = false); bool ComboBoxGetSelData(UINT id, INT_PTR& data); + void ComboBoxFixDroppedWidth(UINT id); }; diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index ff7aa4d43f..21d1a89a59 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -645,6 +645,9 @@ void GSRendererCL::Enqueue() pk.setArg(1, m_cl.vb.buff[m_cl.wqidx]); pk.setArg(2, m_cl.ib.buff[m_cl.wqidx]); + pk.setArg(3, m_cl.pb.buff[m_cl.wqidx]); + pk.setArg(4, (cl_uint)m_vb_start); + pk.setArg(6, (cl_uint)m_pb_start); TileSelector tsel; @@ -704,8 +707,7 @@ void GSRendererCL::Enqueue() { uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT); - pk.setArg(3, (cl_uint)m_vb_start); - pk.setArg(4, (cl_uint)(*head)->ib_start); + pk.setArg(5, (cl_uint)(*head)->ib_start); m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange); @@ -1621,58 +1623,35 @@ GSRendererCL::CL::CL() { WIs = INT_MAX; - std::vector platforms; + std::string ocldev = theApp.GetConfig("ocldev", ""); - cl::Platform::get(&platforms); - - for(auto& p : platforms) - { - std::string platform_vendor = p.getInfo(); - - std::vector ds; - - p.getDevices(CL_DEVICE_TYPE_ALL, &ds); - - for(auto& device : ds) - { - std::string vendor = 
device.getInfo(); - std::string name = device.getInfo(); - std::string version = device.getInfo(); - - printf("%s %s %s", vendor.c_str(), name.c_str(), version.c_str()); - - cl_device_type type = device.getInfo(); - - switch(type) - { - case CL_DEVICE_TYPE_GPU: printf(" GPU"); break; - case CL_DEVICE_TYPE_CPU: printf(" CPU"); break; - } - - if(strstr(version.c_str(), "OpenCL C 1.1") != NULL - || strstr(version.c_str(), "OpenCL C 1.2") != NULL) - { #ifdef IOCL_DEBUG - if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL) -#else - //if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL) - //if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Intel") != NULL) - //if(type == CL_DEVICE_TYPE_GPU && strstr(platform_vendor.c_str(), "Advanced Micro Devices") != NULL) - if(type == CL_DEVICE_TYPE_GPU) + ocldev = "Intel(R) Corporation Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz OpenCL C 1.2 CPU"; #endif - { - devices.push_back(device); - WIs = std::min(WIs, (uint32)device.getInfo()); + list ocldevs; - printf(" *"); - } - } + GSUtil::GetOCLDevices(ocldevs); - printf("\n"); + for(auto dev : ocldevs) + { + if(dev.name == ocldev) + { + devices.push_back(dev.device); + + WIs = std::min(WIs, (uint32)dev.device.getInfo()); + + break; // TODO: multiple devices? 
} + } - if(!devices.empty()) break; + if(devices.empty() && !ocldevs.empty()) + { + auto dev = ocldevs.front(); + + devices.push_back(dev.device); + + WIs = std::min(WIs, (uint32)dev.device.getInfo()); } if(devices.empty()) diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp index 41d66f24e9..a983cd07f5 100644 --- a/plugins/GSdx/GSSettingsDlg.cpp +++ b/plugins/GSdx/GSSettingsDlg.cpp @@ -40,11 +40,15 @@ void GSSettingsDlg::OnInit() m_modes.clear(); CComPtr d3d9; + d3d9.Attach(Direct3DCreate9(D3D_SDK_VERSION)); CComPtr dxgi_factory; - if (GSUtil::CheckDXGI()) + + if(GSUtil::CheckDXGI()) + { CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&dxgi_factory); + } if(!m_IsOpen2) { @@ -81,51 +85,49 @@ void GSSettingsDlg::OnInit() adapters.push_back(Adapter("Default Hardware Device", "default", GSUtil::CheckDirect3D11Level(NULL, D3D_DRIVER_TYPE_HARDWARE))); adapters.push_back(Adapter("Reference Device", "ref", GSUtil::CheckDirect3D11Level(NULL, D3D_DRIVER_TYPE_REFERENCE))); - if (dxgi_factory) + if(dxgi_factory) { - for (int i = 0;; i++) + for(int i = 0;; i++) { CComPtr adapter; - if (S_OK != dxgi_factory->EnumAdapters1(i, &adapter)) + + if(S_OK != dxgi_factory->EnumAdapters1(i, &adapter)) break; + DXGI_ADAPTER_DESC1 desc; + HRESULT hr = adapter->GetDesc1(&desc); - if (S_OK == hr) + + if(S_OK == hr) { D3D_FEATURE_LEVEL level = GSUtil::CheckDirect3D11Level(adapter, D3D_DRIVER_TYPE_UNKNOWN); -// GSDX isn't unicode!? + // GSDX isn't unicode!? 
#if 1 - int size = WideCharToMultiByte(CP_ACP, 0, - desc.Description, sizeof(desc.Description), - NULL, 0, - NULL, NULL); + int size = WideCharToMultiByte(CP_ACP, 0, desc.Description, sizeof(desc.Description), NULL, 0, NULL, NULL); char *buf = new char[size]; - WideCharToMultiByte(CP_ACP, 0, - desc.Description, sizeof(desc.Description), - buf, size, - NULL, NULL); + WideCharToMultiByte(CP_ACP, 0, desc.Description, sizeof(desc.Description), buf, size, NULL, NULL); adapters.push_back(Adapter(buf, GSAdapter(desc), level)); - delete [] buf; + delete[] buf; #else adapters.push_back(Adapter(desc.Description, GSAdapter(desc), level)); #endif } } } - else if (d3d9) + else if(d3d9) { int n = d3d9->GetAdapterCount(); - for (int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { D3DADAPTER_IDENTIFIER9 desc; - if (D3D_OK != d3d9->GetAdapterIdentifier(i, 0, &desc)) + + if(D3D_OK != d3d9->GetAdapterIdentifier(i, 0, &desc)) break; -// GSDX isn't unicode!? + + // GSDX isn't unicode!? #if 0 wchar_t buf[sizeof desc.Description * sizeof(WCHAR)]; - MultiByteToWideChar(CP_ACP /* I have no idea if this is right */, 0, - desc.Description, sizeof(desc.Description), - buf, sizeof buf / sizeof *buf); + MultiByteToWideChar(CP_ACP /* I have no idea if this is right */, 0, desc.Description, sizeof(desc.Description), buf, sizeof buf / sizeof *buf); adapters.push_back(Adapter(buf, GSAdapter(desc), (D3D_FEATURE_LEVEL)0)); #else adapters.push_back(Adapter(desc.Description, GSAdapter(desc), (D3D_FEATURE_LEVEL)0)); @@ -135,17 +137,37 @@ void GSSettingsDlg::OnInit() std::string adapter_setting = theApp.GetConfig("Adapter", "default"); vector adapter_settings; - unsigned adapter_sel = 0; + unsigned int adapter_sel = 0; - for (unsigned i = 0; i < adapters.size(); i++) + for(unsigned int i = 0; i < adapters.size(); i++) { - if (adapters[i].id == adapter_setting) + if(adapters[i].id == adapter_setting) + { adapter_sel = i; + } + adapter_settings.push_back(GSSetting(i, adapters[i].name.c_str(), "")); } + 
std::string ocldev = theApp.GetConfig("ocldev", ""); + + unsigned int ocl_sel = 0; + + for(unsigned int i = 0; i < theApp.m_ocl_devs.size(); i++) + { + if(ocldev == theApp.m_ocl_devs[i].name) + { + ocl_sel = i; + + break; + } + } + ComboBoxInit(IDC_ADAPTER, adapter_settings, adapter_sel); + ComboBoxInit(IDC_OPENCL_DEVICE, theApp.m_ocl_devs, ocl_sel); + UpdateRenderers(); + ComboBoxInit(IDC_INTERLACE, theApp.m_gs_interlace, theApp.GetConfig("Interlace", 7)); // 7 = "auto", detects interlace based on SMODE2 register ComboBoxInit(IDC_ASPECTRATIO, theApp.m_gs_aspectratio, theApp.GetConfig("AspectRatio", 1)); ComboBoxInit(IDC_UPSCALE_MULTIPLIER, theApp.m_gs_upscale_multiplier, theApp.GetConfig("upscale_multiplier", 1)); @@ -233,6 +255,11 @@ bool GSSettingsDlg::OnCommand(HWND hWnd, UINT id, UINT code) theApp.SetConfig("Adapter", adapters[(int)data].id.c_str()); } + if(ComboBoxGetSelData(IDC_OPENCL_DEVICE, data)) + { + theApp.SetConfig("ocldev", theApp.m_ocl_devs[(int)data].name.c_str()); + } + if(!m_IsOpen2 && ComboBoxGetSelData(IDC_RESOLUTION, data)) { const D3DDISPLAYMODE* mode = (D3DDISPLAYMODE*)data; @@ -266,7 +293,7 @@ bool GSSettingsDlg::OnCommand(HWND hWnd, UINT id, UINT code) theApp.SetConfig("upscale_multiplier", 1); } - if (ComboBoxGetSelData(IDC_AFCOMBO, data)) + if(ComboBoxGetSelData(IDC_AFCOMBO, data)) { theApp.SetConfig("MaxAnisotropy", (int)data); } @@ -360,16 +387,19 @@ void GSSettingsDlg::UpdateControls() if(ComboBoxGetSelData(IDC_RENDERER, i)) { - bool dx9 = (i / 3) == 0; - bool dx11 = (i / 3) == 1; - bool ogl = (i / 3) == 4; - bool hw = (i % 3) == 0; - //bool sw = (i % 3) == 1; + bool dx9 = i >= 0 && i <= 2 || i == 14; + bool dx11 = i >= 3 && i <= 5 || i == 15; + bool ogl = i >= 12 && i <= 13 || i == 17; + bool hw = i == 0 || i == 3 || i == 12; + //bool sw = i == 1 || i == 4 || i == 10 || i == 13; + bool ocl = i >= 14 && i <= 17; + bool native = !!IsDlgButtonChecked(m_hWnd, IDC_NATIVERES); ShowWindow(GetDlgItem(m_hWnd, IDC_LOGO9), dx9 ? 
SW_SHOW : SW_HIDE); ShowWindow(GetDlgItem(m_hWnd, IDC_LOGO11), dx11 ? SW_SHOW : SW_HIDE); + EnableWindow(GetDlgItem(m_hWnd, IDC_OPENCL_DEVICE), ocl); EnableWindow(GetDlgItem(m_hWnd, IDC_WINDOWED), dx9); EnableWindow(GetDlgItem(m_hWnd, IDC_RESX), hw && !native && scaling == 1); EnableWindow(GetDlgItem(m_hWnd, IDC_RESX_EDIT), hw && !native && scaling == 1); diff --git a/plugins/GSdx/GSSettingsDlg.h b/plugins/GSdx/GSSettingsDlg.h index 86fcf08449..bc083debd1 100644 --- a/plugins/GSdx/GSSettingsDlg.h +++ b/plugins/GSdx/GSSettingsDlg.h @@ -69,16 +69,19 @@ public: class GSSettingsDlg : public GSDialog { list m_modes; + struct Adapter { std::string name; std::string id; D3D_FEATURE_LEVEL level; - Adapter(const std::string &n, const std::string &i, const D3D_FEATURE_LEVEL &l) - : name(n), id(i), level(l) {} + Adapter(const std::string &n, const std::string &i, const D3D_FEATURE_LEVEL &l) : name(n), id(i), level(l) {} }; + std::vector adapters; + bool m_IsOpen2; + uint32 m_lastValidMsaa; // used to revert to previous dialog value if the user changed to invalid one, or lesser one and canceled void UpdateRenderers(); void UpdateControls(); @@ -87,8 +90,6 @@ protected: void OnInit(); bool OnCommand(HWND hWnd, UINT id, UINT code); - uint32 m_lastValidMsaa; // used to revert to previous dialog value if the user changed to invalid one, or lesser one and canceled - // Shade Boost GSShadeBostDlg ShadeBoostDlg; GSHacksDlg HacksDlg; diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index 7c0e239edc..71d60eed9e 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -226,6 +226,60 @@ bool GSUtil::CheckSSE() return true; } +void GSUtil::GetOCLDevices(list& devs) +{ + devs.clear(); + + try + { + std::vector platforms; + + cl::Platform::get(&platforms); + + for(auto& p : platforms) + { + std::string platform_vendor = p.getInfo(); + + std::vector ds; + + p.getDevices(CL_DEVICE_TYPE_ALL, &ds); + + for(auto& device : ds) + { + std::string vendor = 
device.getInfo(); + std::string name = device.getInfo(); + std::string version = device.getInfo(); + + string type; + + switch(device.getInfo()) + { + case CL_DEVICE_TYPE_GPU: type = "GPU"; break; + case CL_DEVICE_TYPE_CPU: type = "CPU"; break; + } + + int major, minor; + + if(!type.empty() && sscanf(version.c_str(), "OpenCL C %d.%d", &major, &minor) == 2 && major == 1 && minor >= 1 || major > 1) + { + name = vendor + " " + name + " " + version + type; + + OCLDevice dev; + + dev.device = device; + dev.name = name; + + devs.push_back(dev); + } + } + } + } + catch(cl::Error err) + { + printf("%s (%d)\n", err.what(), err.err()); + } +} + #ifdef _WINDOWS bool GSUtil::CheckDirectX() diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index a4cad91014..2e044e0bdc 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -23,6 +23,12 @@ #include "GS.h" +struct OCLDevice +{ + cl::Device device; + string name; +}; + class GSUtil { public: @@ -39,6 +45,7 @@ public: static bool HasCompatibleBits(uint32 spsm, uint32 dpsm); static bool CheckSSE(); + static void GetOCLDevices(list& devs); #ifdef _WINDOWS diff --git a/plugins/GSdx/GSdx.cpp b/plugins/GSdx/GSdx.cpp index 93d9642fc2..e3a3fc0665 100644 --- a/plugins/GSdx/GSdx.cpp +++ b/plugins/GSdx/GSdx.cpp @@ -21,6 +21,7 @@ #include "stdafx.h" #include "GSdx.h" +#include "GSUtil.h" static void* s_hModule; @@ -198,6 +199,19 @@ GSdxApp::GSdxApp() m_gpu_scale.push_back(GSSetting(2 | (1 << 2), "H x 4 - V x 2", "")); m_gpu_scale.push_back(GSSetting(1 | (2 << 2), "H x 2 - V x 4", "")); m_gpu_scale.push_back(GSSetting(2 | (2 << 2), "H x 4 - V x 4", "")); + + // + + list ocldevs; + + GSUtil::GetOCLDevices(ocldevs); + + int index = 0; + + for(auto dev : ocldevs) + { + m_ocl_devs.push_back(GSSetting(index++, dev.name.c_str(), "")); + } } #ifdef _LINUX diff --git a/plugins/GSdx/GSdx.h b/plugins/GSdx/GSdx.h index 5ccb1e2c69..68a17b71d3 100644 --- a/plugins/GSdx/GSdx.h +++ b/plugins/GSdx/GSdx.h @@ -69,6 +69,8 @@ public: vector 
m_gpu_dithering; vector m_gpu_aspectratio; vector m_gpu_scale; + + vector m_ocl_devs; }; struct GSDXError {}; diff --git a/plugins/GSdx/GSdx.rc b/plugins/GSdx/GSdx.rc index 87c0b6164d..58288adb99 100644 --- a/plugins/GSdx/GSdx.rc +++ b/plugins/GSdx/GSdx.rc @@ -55,7 +55,7 @@ BEGIN "#include ""res/fxaa.fx""\r\n" "#include ""res/cs.fx""\r\n" "#include ""res/shadeboost.fx""\r\n" - "#include ""res/tfx.cl""\r\n" + "#include ""res/tfx.cl""\r\0" END #endif // APSTUDIO_INVOKED @@ -230,49 +230,51 @@ BEGIN CONTROL "Windowed",IDC_WINDOWED,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,129,157,49,10 END -IDD_CONFIG2 DIALOGEX 0, 0, 187, 360 +IDD_CONFIG2 DIALOGEX 0, 0, 187, 370 STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU CAPTION "Settings..." FONT 8, "MS Shell Dlg", 400, 0, 0x1 BEGIN CONTROL IDB_LOGO10,IDC_LOGO11,"Static",SS_BITMAP | SS_CENTERIMAGE,6,6,173,42 - DEFPUSHBUTTON "OK",IDOK,40,336,50,14 + DEFPUSHBUTTON "OK",IDOK,40,346,50,14 LTEXT "Renderer:",IDC_STATIC,6,72,34,8 COMBOBOX IDC_RENDERER,70,70,111,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP - LTEXT "Interlacing (F5):",IDC_STATIC,6,87,53,8 - COMBOBOX IDC_INTERLACE,70,85,111,98,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP - LTEXT "Custom Resolution:",IDC_STATIC,26,149,65,8 - EDITTEXT IDC_RESX_EDIT,92,147,35,13,ES_AUTOHSCROLL | ES_NUMBER - CONTROL "",IDC_RESX,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,120,147,11,14 - EDITTEXT IDC_RESY_EDIT,130,147,35,13,ES_AUTOHSCROLL | ES_NUMBER - CONTROL "",IDC_RESY,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,154,147,11,14 - CONTROL "Native",IDC_NATIVERES,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,92,120,33,10 - LTEXT "Extra rendering threads:",IDC_STATIC,11,289,80,8 - EDITTEXT IDC_SWTHREADS_EDIT,94,287,35,13,ES_AUTOHSCROLL | ES_NUMBER - CONTROL "",IDC_SWTHREADS,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | 
UDS_NOTHOUSANDS,129,278,11,14 - COMBOBOX IDC_UPSCALE_MULTIPLIER,92,132,74,98,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP - LTEXT "Or use Scaling:",IDC_STATIC,38,134,49,8 - LTEXT "Original PS2 Resolution:",IDC_STATIC,10,120,80,8 - CONTROL "Edge Anti-aliasing (AA1)",IDC_AA1,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,305,93,10 - PUSHBUTTON "Cancel",IDCANCEL,95,336,50,14 + LTEXT "Interlacing (F5):",IDC_STATIC,6,101,53,8 + COMBOBOX IDC_INTERLACE,70,99,111,98,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP + LTEXT "Custom Resolution:",IDC_STATIC,26,163,65,8 + EDITTEXT IDC_RESX_EDIT,92,161,35,13,ES_AUTOHSCROLL | ES_NUMBER + CONTROL "",IDC_RESX,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,120,161,11,14 + EDITTEXT IDC_RESY_EDIT,130,161,35,13,ES_AUTOHSCROLL | ES_NUMBER + CONTROL "",IDC_RESY,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,154,161,11,14 + CONTROL "Native",IDC_NATIVERES,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,92,134,33,10 + LTEXT "Extra rendering threads:",IDC_STATIC,11,303,80,8 + EDITTEXT IDC_SWTHREADS_EDIT,94,301,35,13,ES_AUTOHSCROLL | ES_NUMBER + CONTROL "",IDC_SWTHREADS,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,129,292,11,14 + COMBOBOX IDC_UPSCALE_MULTIPLIER,92,146,74,98,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP + LTEXT "Or use Scaling:",IDC_STATIC,38,148,49,8 + LTEXT "Original PS2 Resolution:",IDC_STATIC,10,134,80,8 + CONTROL "Edge Anti-aliasing (AA1)",IDC_AA1,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,319,93,10 + PUSHBUTTON "Cancel",IDCANCEL,95,346,50,14 CONTROL IDB_LOGO9,IDC_LOGO9,"Static",SS_BITMAP | SS_CENTERIMAGE,6,6,175,44 - GROUPBOX "D3D Internal Resolution (can cause glitches)",IDC_STATIC,6,102,175,64,BS_CENTER - GROUPBOX "Software Mode Settings",IDC_STATIC,6,276,175,50,BS_CENTER - GROUPBOX "Hardware Mode Settings",IDC_STATIC,6,200,175,74,BS_CENTER - CONTROL "Logarithmic 
Z",IDC_LOGZ,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,92,213,58,10 - CONTROL "Alpha Correction (FBA)",IDC_FBA,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,92,229,87,10 - CONTROL "Allow 8-Bit Textures",IDC_PALTEX,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,229,82,10 - CONTROL "Texture Filtering",IDC_FILTER,"Button",BS_AUTO3STATE | WS_TABSTOP,10,213,67,10 - CONTROL "Enable Shade Boost",IDC_SHADEBOOST,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,172,79,10 - PUSHBUTTON "Settings...",IDC_SHADEBUTTON,92,169,75,14 - CONTROL "Enable HW Hacks",IDC_HACKS_ENABLED,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,245,71,10 - PUSHBUTTON "Configure...",IDC_HACKSBUTTON,92,242,75,14 + GROUPBOX "D3D Internal Resolution (can cause glitches)",IDC_STATIC,6,116,175,64,BS_CENTER + GROUPBOX "Software Mode Settings",IDC_STATIC,6,290,175,50,BS_CENTER + GROUPBOX "Hardware Mode Settings",IDC_STATIC,6,214,175,74,BS_CENTER + CONTROL "Logarithmic Z",IDC_LOGZ,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,92,227,58,10 + CONTROL "Alpha Correction (FBA)",IDC_FBA,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,92,243,87,10 + CONTROL "Allow 8-Bit Textures",IDC_PALTEX,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,243,82,10 + CONTROL "Texture Filtering",IDC_FILTER,"Button",BS_AUTO3STATE | WS_TABSTOP,10,227,67,10 + CONTROL "Enable Shade Boost",IDC_SHADEBOOST,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,186,79,10 + PUSHBUTTON "Settings...",IDC_SHADEBUTTON,92,183,75,14 + CONTROL "Enable HW Hacks",IDC_HACKS_ENABLED,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,259,71,10 + PUSHBUTTON "Configure...",IDC_HACKSBUTTON,92,256,75,14 LTEXT "Adapter:",IDC_STATIC,6,57,30,8 COMBOBOX IDC_ADAPTER,70,55,111,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP - CONTROL "Enable FXAA",IDC_FXAA,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,187,80,10 - CONTROL "Enable FX Shader",IDC_SHADER_FX,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,92,187,80,10 - CONTROL "Anisotropic Filtering",IDC_ANISOTROPIC,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,260,77,8 - COMBOBOX IDC_AFCOMBO,93,258,35,30,CBS_DROPDOWNLIST 
| WS_VSCROLL | WS_TABSTOP + CONTROL "Enable FXAA",IDC_FXAA,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,201,80,10 + CONTROL "Enable FX Shader",IDC_SHADER_FX,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,92,201,80,10 + CONTROL "Anisotropic Filtering",IDC_ANISOTROPIC,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,10,274,77,8 + COMBOBOX IDC_AFCOMBO,93,272,35,30,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP + LTEXT "OpenCL Device:",IDC_STATIC,6,86,52,8 + COMBOBOX IDC_OPENCL_DEVICE,70,84,111,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP END @@ -338,7 +340,7 @@ BEGIN VERTGUIDE, 11 VERTGUIDE, 87 TOPMARGIN, 6 - BOTTOMMARGIN, 335 + BOTTOMMARGIN, 360 END END #endif // APSTUDIO_INVOKED diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index 65f7cfa05e..ba7ef5214e 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -551,9 +551,11 @@ int GetVertexPerPrim(int prim_class) __kernel void KERNEL_PRIM( __global gs_env* env, __global uchar* vb_base, - __global uchar* ib_base, + __global uchar* ib_base, + __global uchar* pb_base, uint vb_start, - uint ib_start) + uint ib_start, + uint pb_start) { size_t prim_index = get_global_id(0); @@ -563,7 +565,11 @@ __kernel void KERNEL_PRIM( ib += prim_index * VERTEX_PER_PRIM; - prim->pb_index = ib[0] >> 24; + uint pb_index = ib[0] >> 24; + + prim->pb_index = pb_index; + + __global gs_param* pb = (__global gs_param*)(pb_base + pb_start + pb_index * TFX_PARAM_SIZE); __global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff]; __global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff]; @@ -633,10 +639,10 @@ __kernel void KERNEL_PRIM( dp1.xy = dp1.xy * sign(cp); dp2.xy = dp2.xy * sign(cp); - b.zero.x = (dp1.y < 0 || dp1.y == 0 && dp1.x > 0) ? CL_FLT_EPSILON : 0; - b.zero.y = (dp0.y < 0 || dp0.y == 0 && dp0.x > 0) ? CL_FLT_EPSILON : 0; - b.zero.z = (dp2.y < 0 || dp2.y == 0 && dp2.x > 0) ? 
CL_FLT_EPSILON : 0; - + b.zero.x = select(0.0f, CL_FLT_EPSILON, (dp1.y < 0) | (dp1.y == 0) & (dp1.x > 0)); + b.zero.y = select(0.0f, CL_FLT_EPSILON, (dp0.y < 0) | (dp0.y == 0) & (dp0.x > 0)); + b.zero.z = select(0.0f, CL_FLT_EPSILON, (dp2.y < 0) | (dp2.y == 0) & (dp2.x > 0)); + // any barycentric(reject_corner) < 0, tile outside the triangle b.reject_corner.x = 0.0f + max(max(max(b.dx.x + b.dy.x, b.dx.x), b.dy.x), 0.0f) * BIN_SIZE; @@ -669,6 +675,11 @@ __kernel void KERNEL_PRIM( prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy); } + int4 scissor = pb->scissor; + + pmin = select(pmin, scissor.xy, pmin < scissor.xy); + pmax = select(pmax, scissor.zw, pmax > scissor.zw); + int4 r = (int4)(pmin, pmax + (int2)(BIN_SIZE - 1)) >> BIN_SIZE_BITS; env->bbox[prim_index] = convert_uchar4_sat(r); @@ -1167,7 +1178,7 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t) uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT); uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS); uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT); - + int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb); int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb); int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb); diff --git a/plugins/GSdx/resource.h b/plugins/GSdx/resource.h index b6502c2aef..cf0debbdca 100644 --- a/plugins/GSdx/resource.h +++ b/plugins/GSdx/resource.h @@ -71,6 +71,7 @@ #define IDC_SHADER_FX 2088 #define IDC_ANISOTROPIC 2089 #define IDC_AFCOMBO 2090 +#define IDC_OPENCL_DEVICE 2091 #define IDC_COLORSPACE 3000 #define IDR_CONVERT_FX 10000 #define IDR_TFX_FX 10001 @@ -91,7 +92,7 @@ #ifndef APSTUDIO_READONLY_SYMBOLS #define _APS_NEXT_RESOURCE_VALUE 10012 #define _APS_NEXT_COMMAND_VALUE 32771 -#define _APS_NEXT_CONTROL_VALUE 2091 +#define _APS_NEXT_CONTROL_VALUE 2092 #define _APS_NEXT_SYMED_VALUE 5000 #endif #endif From 7b466a98d0afbfefdd34db15e9b129a9cb256dc4 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Mon, 22 Sep 2014 05:29:40 +0200 Subject: [PATCH 11/15] 
replaced opencl.def with dynamic dll loading, god bless search and replace --- 3rdparty/opencl/opencl.cpp | 824 +++++++++++++++++++++++++ 3rdparty/opencl/opencl.def | 113 ---- 3rdparty/opencl/opencl.vcxproj | 35 +- 3rdparty/opencl/opencl.vcxproj.filters | 10 +- plugins/GSdx/GSSettingsDlg.cpp | 18 +- plugins/GSdx/GSSettingsDlg.h | 4 +- plugins/GSdx/GSUtil.cpp | 3 +- plugins/GSdx/GSdx.cpp | 14 - plugins/GSdx/GSdx.h | 2 - 9 files changed, 863 insertions(+), 160 deletions(-) create mode 100644 3rdparty/opencl/opencl.cpp delete mode 100644 3rdparty/opencl/opencl.def diff --git a/3rdparty/opencl/opencl.cpp b/3rdparty/opencl/opencl.cpp new file mode 100644 index 0000000000..3af520f8a1 --- /dev/null +++ b/3rdparty/opencl/opencl.cpp @@ -0,0 +1,824 @@ +#include "CL/cl.h" + +typedef cl_int (CL_API_CALL * clGetPlatformIDsPtr)(cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetPlatformInfoPtr)(cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetDeviceIDsPtr)(cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, cl_device_id* devices, cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetDeviceInfoPtr)(cl_device_id device, cl_device_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clCreateSubDevicesPtr)(cl_device_id in_device, const cl_device_partition_property* properties, cl_uint num_devices, cl_device_id* out_devices, cl_uint* num_devices_ret) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clRetainDevicePtr)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clReleaseDevicePtr)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; +typedef cl_context (CL_API_CALL * 
clCreateContextPtr)(const cl_context_properties* properties, cl_uint num_devices, const cl_device_id* devices, void (CL_CALLBACK* pfn_notify)(const char*, const void*, size_t, void*), void* user_data, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_context (CL_API_CALL * clCreateContextFromTypePtr)(const cl_context_properties* properties, cl_device_type device_type, void (CL_CALLBACK* pfn_notify)(const char*, const void*, size_t, void*), void* user_data, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clRetainContextPtr)(cl_context context) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clReleaseContextPtr)(cl_context context) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetContextInfoPtr)(cl_context context, cl_context_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_command_queue (CL_API_CALL * clCreateCommandQueueWithPropertiesPtr)(cl_context context, cl_device_id device, const cl_queue_properties* properties, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clRetainCommandQueuePtr)(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clReleaseCommandQueuePtr)(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetCommandQueueInfoPtr)(cl_command_queue command_queue, cl_command_queue_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_mem (CL_API_CALL * clCreateBufferPtr)(cl_context context, cl_mem_flags flags, size_t size, void* host_ptr, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_mem (CL_API_CALL * clCreateSubBufferPtr)(cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type buffer_create_type, const void* buffer_create_info, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; +typedef cl_mem (CL_API_CALL * 
clCreateImagePtr)(cl_context context, cl_mem_flags flags, const cl_image_format* image_format, const cl_image_desc* image_desc, void* host_ptr, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; +typedef cl_mem (CL_API_CALL * clCreatePipePtr)(cl_context context, cl_mem_flags flags, cl_uint pipe_packet_size, cl_uint pipe_max_packets, const cl_pipe_properties* properties, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clRetainMemObjectPtr)(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clReleaseMemObjectPtr)(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetSupportedImageFormatsPtr)(cl_context context, cl_mem_flags flags, cl_mem_object_type image_type, cl_uint num_entries, cl_image_format* image_formats, cl_uint* num_image_formats) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetMemObjectInfoPtr)(cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetImageInfoPtr)(cl_mem image, cl_image_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetPipeInfoPtr)(cl_mem pipe, cl_pipe_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clSetMemObjectDestructorCallbackPtr)(cl_mem memobj, void (CL_CALLBACK* pfn_notify)(cl_mem memobj, void* user_data), void* user_data) CL_API_SUFFIX__VERSION_1_1; +typedef void* (CL_API_CALL * clSVMAllocPtr)(cl_context context, cl_svm_mem_flags flags, size_t size, cl_uint alignment) CL_API_SUFFIX__VERSION_2_0; +typedef void (CL_API_CALL * clSVMFreePtr)(cl_context context, void* svm_pointer) CL_API_SUFFIX__VERSION_2_0; +typedef cl_sampler (CL_API_CALL * clCreateSamplerWithPropertiesPtr)(cl_context context, const cl_sampler_properties* 
normalized_coords, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clRetainSamplerPtr)(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clReleaseSamplerPtr)(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetSamplerInfoPtr)(cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_program (CL_API_CALL * clCreateProgramWithSourcePtr)(cl_context context, cl_uint count, const char** strings, const size_t* lengths, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_program (CL_API_CALL * clCreateProgramWithBinaryPtr)(cl_context context, cl_uint num_devices, const cl_device_id* device_list, const size_t* lengths, const unsigned char** binaries, cl_int* binary_status, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_program (CL_API_CALL * clCreateProgramWithBuiltInKernelsPtr)(cl_context context, cl_uint num_devices, const cl_device_id* device_list, const char* kernel_names, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clRetainProgramPtr)(cl_program program) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clReleaseProgramPtr)(cl_program program) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clBuildProgramPtr)(cl_program program, cl_uint num_devices, const cl_device_id* device_list, const char* options, void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), void* user_data) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clCompileProgramPtr)(cl_program program, cl_uint num_devices, const cl_device_id* device_list, const char* options, cl_uint num_input_headers, const cl_program* input_headers, const char** header_include_names, void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), void* user_data) CL_API_SUFFIX__VERSION_1_2; +typedef cl_program (CL_API_CALL * 
clLinkProgramPtr)(cl_context context, cl_uint num_devices, const cl_device_id* device_list, const char* options, cl_uint num_input_programs, const cl_program* input_programs, void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), void* user_data, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clUnloadPlatformCompilerPtr)(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clGetProgramInfoPtr)(cl_program program, cl_program_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetProgramBuildInfoPtr)(cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_kernel (CL_API_CALL * clCreateKernelPtr)(cl_program program, const char* kernel_name, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clCreateKernelsInProgramPtr)(cl_program program, cl_uint num_kernels, cl_kernel* kernels, cl_uint* num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clRetainKernelPtr)(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clReleaseKernelPtr)(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clSetKernelArgPtr)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void* arg_value) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clSetKernelArgSVMPointerPtr)(cl_kernel kernel, cl_uint arg_index, const void* arg_value) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clSetKernelExecInfoPtr)(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void* param_value) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clGetKernelInfoPtr)(cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, void* param_value, 
size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetKernelArgInfoPtr)(cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clGetKernelWorkGroupInfoPtr)(cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clWaitForEventsPtr)(cl_uint num_events, const cl_event* event_list) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clGetEventInfoPtr)(cl_event event, cl_event_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_event (CL_API_CALL * clCreateUserEventPtr)(cl_context context, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1; +typedef cl_int (CL_API_CALL * clRetainEventPtr)(cl_event event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clReleaseEventPtr)(cl_event event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clSetUserEventStatusPtr)(cl_event event, cl_int execution_status) CL_API_SUFFIX__VERSION_1_1; +typedef cl_int (CL_API_CALL * clSetEventCallbackPtr)(cl_event event, cl_int command_exec_callback_type, void (CL_CALLBACK* pfn_notify)(cl_event, cl_int, void*), void* user_data) CL_API_SUFFIX__VERSION_1_1; +typedef cl_int (CL_API_CALL * clGetEventProfilingInfoPtr)(cl_event event, cl_profiling_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clFlushPtr)(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clFinishPtr)(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueReadBufferPtr)(cl_command_queue command_queue, cl_mem buffer, 
cl_bool blocking_read, size_t offset, size_t size, void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueReadBufferRectPtr)(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, const size_t* buffer_offset, const size_t* host_offset, const size_t* region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_1; +typedef cl_int (CL_API_CALL * clEnqueueWriteBufferPtr)(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueWriteBufferRectPtr)(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, const size_t* buffer_offset, const size_t* host_offset, const size_t* region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_1; +typedef cl_int (CL_API_CALL * clEnqueueFillBufferPtr)(cl_command_queue command_queue, cl_mem buffer, const void* pattern, size_t pattern_size, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clEnqueueCopyBufferPtr)(cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueCopyBufferRectPtr)(cl_command_queue command_queue, cl_mem 
src_buffer, cl_mem dst_buffer, const size_t* src_origin, const size_t* dst_origin, const size_t* region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_1; +typedef cl_int (CL_API_CALL * clEnqueueReadImagePtr)(cl_command_queue command_queue, cl_mem image, cl_bool blocking_read, const size_t* origin, const size_t* region, size_t row_pitch, size_t slice_pitch, void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueWriteImagePtr)(cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, const size_t* origin, const size_t* region, size_t input_row_pitch, size_t input_slice_pitch, const void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueFillImagePtr)(cl_command_queue command_queue, cl_mem image, const void* fill_color, const size_t* origin, const size_t* region, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clEnqueueCopyImagePtr)(cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, const size_t* src_origin, const size_t* dst_origin, const size_t* region, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueCopyImageToBufferPtr)(cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, const size_t* src_origin, const size_t* region, size_t dst_offset, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueCopyBufferToImagePtr)(cl_command_queue command_queue, cl_mem src_buffer, cl_mem 
dst_image, size_t src_offset, const size_t* dst_origin, const size_t* region, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef void* (CL_API_CALL * clEnqueueMapBufferPtr)(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; +typedef void* (CL_API_CALL * clEnqueueMapImagePtr)(cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, cl_map_flags map_flags, const size_t* origin, const size_t* region, size_t* image_row_pitch, size_t* image_slice_pitch, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueUnmapMemObjectPtr)(cl_command_queue command_queue, cl_mem memobj, void* mapped_ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueMigrateMemObjectsPtr)(cl_command_queue command_queue, cl_uint num_mem_objects, const cl_mem* mem_objects, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clEnqueueNDRangeKernelPtr)(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t* global_work_offset, const size_t* global_work_size, const size_t* local_work_size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueNativeKernelPtr)(cl_command_queue command_queue, void (CL_CALLBACK* /*user_func*/)(void*), void* args, size_t cb_args, cl_uint num_mem_objects, const cl_mem* mem_list, const void** args_mem_loc, cl_uint num_events_in_wait_list, const 
cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0; +typedef cl_int (CL_API_CALL * clEnqueueMarkerWithWaitListPtr)(cl_command_queue command_queue, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clEnqueueBarrierWithWaitListPtr)(cl_command_queue command_queue, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_2; +typedef cl_int (CL_API_CALL * clEnqueueSVMFreePtr)(cl_command_queue command_queue, cl_uint num_svm_pointers, void* svm_pointers[], void (CL_CALLBACK* /*pfn_free_func*/)(cl_command_queue queue, cl_uint num_svm_pointers, void* svm_pointers[], void* user_data), void* user_data, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clEnqueueSVMMemcpyPtr)(cl_command_queue command_queue, cl_bool blocking_copy, void* dst_ptr, const void* src_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clEnqueueSVMMemFillPtr)(cl_command_queue command_queue, void* svm_ptr, const void* pattern, size_t pattern_size, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clEnqueueSVMMapPtr)(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void* svm_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0; +typedef cl_int (CL_API_CALL * clEnqueueSVMUnmapPtr)(cl_command_queue command_queue, void* svm_ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0; +typedef void* (CL_API_CALL * clGetExtensionFunctionAddressForPlatformPtr)(cl_platform_id platform, const char* 
func_name) CL_API_SUFFIX__VERSION_1_2; +typedef cl_mem (CL_API_CALL * clCreateImage2DPtr)(cl_context context, cl_mem_flags flags, const cl_image_format* image_format, size_t image_width, size_t image_height, size_t image_row_pitch, void* host_ptr, cl_int* errcode_ret); +typedef cl_mem (CL_API_CALL * clCreateImage3DPtr)(cl_context context, cl_mem_flags flags, const cl_image_format* image_format, size_t image_width, size_t image_height, size_t image_depth, size_t image_row_pitch, size_t image_slice_pitch, void* host_ptr, cl_int* errcode_ret); +typedef cl_int (CL_API_CALL * clEnqueueMarkerPtr)(cl_command_queue command_queue, cl_event* event); +typedef cl_int (CL_API_CALL * clEnqueueWaitForEventsPtr)(cl_command_queue command_queue, cl_uint num_events, const cl_event* event_list); +typedef cl_int (CL_API_CALL * clEnqueueBarrierPtr)(cl_command_queue command_queue); +typedef cl_int (CL_API_CALL * clUnloadCompilerPtr)(void); +typedef void* (CL_API_CALL * clGetExtensionFunctionAddressPtr)(const char* func_name); +typedef cl_command_queue (CL_API_CALL * clCreateCommandQueuePtr)(cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_int* errcode_ret); +typedef cl_sampler (CL_API_CALL * clCreateSamplerPtr)(cl_context context, cl_bool normalized_coords, cl_addressing_mode addressing_mode, cl_filter_mode filter_mode, cl_int* errcode_ret); +typedef cl_int (CL_API_CALL * clEnqueueTaskPtr)(cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); + +static clGetPlatformIDsPtr cl_GetPlatformIDs = NULL; +static clGetPlatformInfoPtr cl_GetPlatformInfo = NULL; +static clGetDeviceIDsPtr cl_GetDeviceIDs = NULL; +static clGetDeviceInfoPtr cl_GetDeviceInfo = NULL; +static clCreateSubDevicesPtr cl_CreateSubDevices = NULL; +static clRetainDevicePtr cl_RetainDevice = NULL; +static clReleaseDevicePtr cl_ReleaseDevice = NULL; +static clCreateContextPtr cl_CreateContext = NULL; +static 
clCreateContextFromTypePtr cl_CreateContextFromType = NULL; +static clRetainContextPtr cl_RetainContext = NULL; +static clReleaseContextPtr cl_ReleaseContext = NULL; +static clGetContextInfoPtr cl_GetContextInfo = NULL; +static clCreateCommandQueueWithPropertiesPtr cl_CreateCommandQueueWithProperties = NULL; +static clRetainCommandQueuePtr cl_RetainCommandQueue = NULL; +static clReleaseCommandQueuePtr cl_ReleaseCommandQueue = NULL; +static clGetCommandQueueInfoPtr cl_GetCommandQueueInfo = NULL; +static clCreateBufferPtr cl_CreateBuffer = NULL; +static clCreateSubBufferPtr cl_CreateSubBuffer = NULL; +static clCreateImagePtr cl_CreateImage = NULL; +static clCreatePipePtr cl_CreatePipe = NULL; +static clRetainMemObjectPtr cl_RetainMemObject = NULL; +static clReleaseMemObjectPtr cl_ReleaseMemObject = NULL; +static clGetSupportedImageFormatsPtr cl_GetSupportedImageFormats = NULL; +static clGetMemObjectInfoPtr cl_GetMemObjectInfo = NULL; +static clGetImageInfoPtr cl_GetImageInfo = NULL; +static clGetPipeInfoPtr cl_GetPipeInfo = NULL; +static clSetMemObjectDestructorCallbackPtr cl_SetMemObjectDestructorCallback = NULL; +static clSVMAllocPtr cl_SVMAlloc = NULL; +static clSVMFreePtr cl_SVMFree = NULL; +static clCreateSamplerWithPropertiesPtr cl_CreateSamplerWithProperties = NULL; +static clRetainSamplerPtr cl_RetainSampler = NULL; +static clReleaseSamplerPtr cl_ReleaseSampler = NULL; +static clGetSamplerInfoPtr cl_GetSamplerInfo = NULL; +static clCreateProgramWithSourcePtr cl_CreateProgramWithSource = NULL; +static clCreateProgramWithBinaryPtr cl_CreateProgramWithBinary = NULL; +static clCreateProgramWithBuiltInKernelsPtr cl_CreateProgramWithBuiltInKernels = NULL; +static clRetainProgramPtr cl_RetainProgram = NULL; +static clReleaseProgramPtr cl_ReleaseProgram = NULL; +static clBuildProgramPtr cl_BuildProgram = NULL; +static clCompileProgramPtr cl_CompileProgram = NULL; +static clLinkProgramPtr cl_LinkProgram = NULL; +static clUnloadPlatformCompilerPtr 
cl_UnloadPlatformCompiler = NULL; +static clGetProgramInfoPtr cl_GetProgramInfo = NULL; +static clGetProgramBuildInfoPtr cl_GetProgramBuildInfo = NULL; +static clCreateKernelPtr cl_CreateKernel = NULL; +static clCreateKernelsInProgramPtr cl_CreateKernelsInProgram = NULL; +static clRetainKernelPtr cl_RetainKernel = NULL; +static clReleaseKernelPtr cl_ReleaseKernel = NULL; +static clSetKernelArgPtr cl_SetKernelArg = NULL; +static clSetKernelArgSVMPointerPtr cl_SetKernelArgSVMPointer = NULL; +static clSetKernelExecInfoPtr cl_SetKernelExecInfo = NULL; +static clGetKernelInfoPtr cl_GetKernelInfo = NULL; +static clGetKernelArgInfoPtr cl_GetKernelArgInfo = NULL; +static clGetKernelWorkGroupInfoPtr cl_GetKernelWorkGroupInfo = NULL; +static clWaitForEventsPtr cl_WaitForEvents = NULL; +static clGetEventInfoPtr cl_GetEventInfo = NULL; +static clCreateUserEventPtr cl_CreateUserEvent = NULL; +static clRetainEventPtr cl_RetainEvent = NULL; +static clReleaseEventPtr cl_ReleaseEvent = NULL; +static clSetUserEventStatusPtr cl_SetUserEventStatus = NULL; +static clSetEventCallbackPtr cl_SetEventCallback = NULL; +static clGetEventProfilingInfoPtr cl_GetEventProfilingInfo = NULL; +static clFlushPtr cl_Flush = NULL; +static clFinishPtr cl_Finish = NULL; +static clEnqueueReadBufferPtr cl_EnqueueReadBuffer = NULL; +static clEnqueueReadBufferRectPtr cl_EnqueueReadBufferRect = NULL; +static clEnqueueWriteBufferPtr cl_EnqueueWriteBuffer = NULL; +static clEnqueueWriteBufferRectPtr cl_EnqueueWriteBufferRect = NULL; +static clEnqueueFillBufferPtr cl_EnqueueFillBuffer = NULL; +static clEnqueueCopyBufferPtr cl_EnqueueCopyBuffer = NULL; +static clEnqueueCopyBufferRectPtr cl_EnqueueCopyBufferRect = NULL; +static clEnqueueReadImagePtr cl_EnqueueReadImage = NULL; +static clEnqueueWriteImagePtr cl_EnqueueWriteImage = NULL; +static clEnqueueFillImagePtr cl_EnqueueFillImage = NULL; +static clEnqueueCopyImagePtr cl_EnqueueCopyImage = NULL; +static clEnqueueCopyImageToBufferPtr cl_EnqueueCopyImageToBuffer 
= NULL; +static clEnqueueCopyBufferToImagePtr cl_EnqueueCopyBufferToImage = NULL; +static clEnqueueMapBufferPtr cl_EnqueueMapBuffer = NULL; +static clEnqueueMapImagePtr cl_EnqueueMapImage = NULL; +static clEnqueueUnmapMemObjectPtr cl_EnqueueUnmapMemObject = NULL; +static clEnqueueMigrateMemObjectsPtr cl_EnqueueMigrateMemObjects = NULL; +static clEnqueueNDRangeKernelPtr cl_EnqueueNDRangeKernel = NULL; +static clEnqueueNativeKernelPtr cl_EnqueueNativeKernel = NULL; +static clEnqueueMarkerWithWaitListPtr cl_EnqueueMarkerWithWaitList = NULL; +static clEnqueueBarrierWithWaitListPtr cl_EnqueueBarrierWithWaitList = NULL; +static clEnqueueSVMFreePtr cl_EnqueueSVMFree = NULL; +static clEnqueueSVMMemcpyPtr cl_EnqueueSVMMemcpy = NULL; +static clEnqueueSVMMemFillPtr cl_EnqueueSVMMemFill = NULL; +static clEnqueueSVMMapPtr cl_EnqueueSVMMap = NULL; +static clEnqueueSVMUnmapPtr cl_EnqueueSVMUnmap = NULL; +static clGetExtensionFunctionAddressForPlatformPtr cl_GetExtensionFunctionAddressForPlatform = NULL; +static clCreateImage2DPtr cl_CreateImage2D = NULL; +static clCreateImage3DPtr cl_CreateImage3D = NULL; +static clEnqueueMarkerPtr cl_EnqueueMarker = NULL; +static clEnqueueWaitForEventsPtr cl_EnqueueWaitForEvents = NULL; +static clEnqueueBarrierPtr cl_EnqueueBarrier = NULL; +static clUnloadCompilerPtr cl_UnloadCompiler = NULL; +static clGetExtensionFunctionAddressPtr cl_GetExtensionFunctionAddress = NULL; +static clCreateCommandQueuePtr cl_CreateCommandQueue = NULL; +static clCreateSamplerPtr cl_CreateSampler = NULL; +static clEnqueueTaskPtr cl_EnqueueTask = NULL; + +#include <windows.h> + +static struct Loader +{ + Loader() + { + HMODULE hModule = LoadLibrary("OpenCL.dll"); + + if(hModule == NULL) return; + + *(void**)&cl_GetPlatformIDs = GetProcAddress(hModule, "clGetPlatformIDs"); + *(void**)&cl_GetPlatformInfo = GetProcAddress(hModule, "clGetPlatformInfo"); + *(void**)&cl_GetDeviceIDs = GetProcAddress(hModule, "clGetDeviceIDs"); + *(void**)&cl_GetDeviceInfo = 
GetProcAddress(hModule, "clGetDeviceInfo"); + *(void**)&cl_CreateSubDevices = GetProcAddress(hModule, "clCreateSubDevices"); + *(void**)&cl_RetainDevice = GetProcAddress(hModule, "clRetainDevice"); + *(void**)&cl_ReleaseDevice = GetProcAddress(hModule, "clReleaseDevice"); + *(void**)&cl_CreateContext = GetProcAddress(hModule, "clCreateContext"); + *(void**)&cl_CreateContextFromType = GetProcAddress(hModule, "clCreateContextFromType"); + *(void**)&cl_RetainContext = GetProcAddress(hModule, "clRetainContext"); + *(void**)&cl_ReleaseContext = GetProcAddress(hModule, "clReleaseContext"); + *(void**)&cl_GetContextInfo = GetProcAddress(hModule, "clGetContextInfo"); + *(void**)&cl_CreateCommandQueueWithProperties = GetProcAddress(hModule, "clCreateCommandQueueWithProperties"); + *(void**)&cl_RetainCommandQueue = GetProcAddress(hModule, "clRetainCommandQueue"); + *(void**)&cl_ReleaseCommandQueue = GetProcAddress(hModule, "clReleaseCommandQueue"); + *(void**)&cl_GetCommandQueueInfo = GetProcAddress(hModule, "clGetCommandQueueInfo"); + *(void**)&cl_CreateBuffer = GetProcAddress(hModule, "clCreateBuffer"); + *(void**)&cl_CreateSubBuffer = GetProcAddress(hModule, "clCreateSubBuffer"); + *(void**)&cl_CreateImage = GetProcAddress(hModule, "clCreateImage"); + *(void**)&cl_CreatePipe = GetProcAddress(hModule, "clCreatePipe"); + *(void**)&cl_RetainMemObject = GetProcAddress(hModule, "clRetainMemObject"); + *(void**)&cl_ReleaseMemObject = GetProcAddress(hModule, "clReleaseMemObject"); + *(void**)&cl_GetSupportedImageFormats = GetProcAddress(hModule, "clGetSupportedImageFormats"); + *(void**)&cl_GetMemObjectInfo = GetProcAddress(hModule, "clGetMemObjectInfo"); + *(void**)&cl_GetImageInfo = GetProcAddress(hModule, "clGetImageInfo"); + *(void**)&cl_GetPipeInfo = GetProcAddress(hModule, "clGetPipeInfo"); + *(void**)&cl_SetMemObjectDestructorCallback = GetProcAddress(hModule, "clSetMemObjectDestructorCallback"); + *(void**)&cl_SVMAlloc = GetProcAddress(hModule, "clSVMAlloc"); + 
*(void**)&cl_SVMFree = GetProcAddress(hModule, "clSVMFree"); + *(void**)&cl_CreateSamplerWithProperties = GetProcAddress(hModule, "clCreateSamplerWithProperties"); + *(void**)&cl_RetainSampler = GetProcAddress(hModule, "clRetainSampler"); + *(void**)&cl_ReleaseSampler = GetProcAddress(hModule, "clReleaseSampler"); + *(void**)&cl_GetSamplerInfo = GetProcAddress(hModule, "clGetSamplerInfo"); + *(void**)&cl_CreateProgramWithSource = GetProcAddress(hModule, "clCreateProgramWithSource"); + *(void**)&cl_CreateProgramWithBinary = GetProcAddress(hModule, "clCreateProgramWithBinary"); + *(void**)&cl_CreateProgramWithBuiltInKernels = GetProcAddress(hModule, "clCreateProgramWithBuiltInKernels"); + *(void**)&cl_RetainProgram = GetProcAddress(hModule, "clRetainProgram"); + *(void**)&cl_ReleaseProgram = GetProcAddress(hModule, "clReleaseProgram"); + *(void**)&cl_BuildProgram = GetProcAddress(hModule, "clBuildProgram"); + *(void**)&cl_CompileProgram = GetProcAddress(hModule, "clCompileProgram"); + *(void**)&cl_LinkProgram = GetProcAddress(hModule, "clLinkProgram"); + *(void**)&cl_UnloadPlatformCompiler = GetProcAddress(hModule, "clUnloadPlatformCompiler"); + *(void**)&cl_GetProgramInfo = GetProcAddress(hModule, "clGetProgramInfo"); + *(void**)&cl_GetProgramBuildInfo = GetProcAddress(hModule, "clGetProgramBuildInfo"); + *(void**)&cl_CreateKernel = GetProcAddress(hModule, "clCreateKernel"); + *(void**)&cl_CreateKernelsInProgram = GetProcAddress(hModule, "clCreateKernelsInProgram"); + *(void**)&cl_RetainKernel = GetProcAddress(hModule, "clRetainKernel"); + *(void**)&cl_ReleaseKernel = GetProcAddress(hModule, "clReleaseKernel"); + *(void**)&cl_SetKernelArg = GetProcAddress(hModule, "clSetKernelArg"); + *(void**)&cl_SetKernelArgSVMPointer = GetProcAddress(hModule, "clSetKernelArgSVMPointer"); + *(void**)&cl_SetKernelExecInfo = GetProcAddress(hModule, "clSetKernelExecInfo"); + *(void**)&cl_GetKernelInfo = GetProcAddress(hModule, "clGetKernelInfo"); + *(void**)&cl_GetKernelArgInfo = 
GetProcAddress(hModule, "clGetKernelArgInfo"); + *(void**)&cl_GetKernelWorkGroupInfo = GetProcAddress(hModule, "clGetKernelWorkGroupInfo"); + *(void**)&cl_WaitForEvents = GetProcAddress(hModule, "clWaitForEvents"); + *(void**)&cl_GetEventInfo = GetProcAddress(hModule, "clGetEventInfo"); + *(void**)&cl_CreateUserEvent = GetProcAddress(hModule, "clCreateUserEvent"); + *(void**)&cl_RetainEvent = GetProcAddress(hModule, "clRetainEvent"); + *(void**)&cl_ReleaseEvent = GetProcAddress(hModule, "clReleaseEvent"); + *(void**)&cl_SetUserEventStatus = GetProcAddress(hModule, "clSetUserEventStatus"); + *(void**)&cl_SetEventCallback = GetProcAddress(hModule, "clSetEventCallback"); + *(void**)&cl_GetEventProfilingInfo = GetProcAddress(hModule, "clGetEventProfilingInfo"); + *(void**)&cl_Flush = GetProcAddress(hModule, "clFlush"); + *(void**)&cl_Finish = GetProcAddress(hModule, "clFinish"); + *(void**)&cl_EnqueueReadBuffer = GetProcAddress(hModule, "clEnqueueReadBuffer"); + *(void**)&cl_EnqueueReadBufferRect = GetProcAddress(hModule, "clEnqueueReadBufferRect"); + *(void**)&cl_EnqueueWriteBuffer = GetProcAddress(hModule, "clEnqueueWriteBuffer"); + *(void**)&cl_EnqueueWriteBufferRect = GetProcAddress(hModule, "clEnqueueWriteBufferRect"); + *(void**)&cl_EnqueueFillBuffer = GetProcAddress(hModule, "clEnqueueFillBuffer"); + *(void**)&cl_EnqueueCopyBuffer = GetProcAddress(hModule, "clEnqueueCopyBuffer"); + *(void**)&cl_EnqueueCopyBufferRect = GetProcAddress(hModule, "clEnqueueCopyBufferRect"); + *(void**)&cl_EnqueueReadImage = GetProcAddress(hModule, "clEnqueueReadImage"); + *(void**)&cl_EnqueueWriteImage = GetProcAddress(hModule, "clEnqueueWriteImage"); + *(void**)&cl_EnqueueFillImage = GetProcAddress(hModule, "clEnqueueFillImage"); + *(void**)&cl_EnqueueCopyImage = GetProcAddress(hModule, "clEnqueueCopyImage"); + *(void**)&cl_EnqueueCopyImageToBuffer = GetProcAddress(hModule, "clEnqueueCopyImageToBuffer"); + *(void**)&cl_EnqueueCopyBufferToImage = GetProcAddress(hModule, 
"clEnqueueCopyBufferToImage"); + *(void**)&cl_EnqueueMapBuffer = GetProcAddress(hModule, "clEnqueueMapBuffer"); + *(void**)&cl_EnqueueMapImage = GetProcAddress(hModule, "clEnqueueMapImage"); + *(void**)&cl_EnqueueUnmapMemObject = GetProcAddress(hModule, "clEnqueueUnmapMemObject"); + *(void**)&cl_EnqueueMigrateMemObjects = GetProcAddress(hModule, "clEnqueueMigrateMemObjects"); + *(void**)&cl_EnqueueNDRangeKernel = GetProcAddress(hModule, "clEnqueueNDRangeKernel"); + *(void**)&cl_EnqueueNativeKernel = GetProcAddress(hModule, "clEnqueueNativeKernel"); + *(void**)&cl_EnqueueMarkerWithWaitList = GetProcAddress(hModule, "clEnqueueMarkerWithWaitList"); + *(void**)&cl_EnqueueBarrierWithWaitList = GetProcAddress(hModule, "clEnqueueBarrierWithWaitList"); + *(void**)&cl_EnqueueSVMFree = GetProcAddress(hModule, "clEnqueueSVMFree"); + *(void**)&cl_EnqueueSVMMemcpy = GetProcAddress(hModule, "clEnqueueSVMMemcpy"); + *(void**)&cl_EnqueueSVMMemFill = GetProcAddress(hModule, "clEnqueueSVMMemFill"); + *(void**)&cl_EnqueueSVMMap = GetProcAddress(hModule, "clEnqueueSVMMap"); + *(void**)&cl_EnqueueSVMUnmap = GetProcAddress(hModule, "clEnqueueSVMUnmap"); + *(void**)&cl_GetExtensionFunctionAddressForPlatform = GetProcAddress(hModule, "clGetExtensionFunctionAddressForPlatform"); + *(void**)&cl_CreateImage2D = GetProcAddress(hModule, "clCreateImage2D"); + *(void**)&cl_CreateImage3D = GetProcAddress(hModule, "clCreateImage3D"); + *(void**)&cl_EnqueueMarker = GetProcAddress(hModule, "clEnqueueMarker"); + *(void**)&cl_EnqueueWaitForEvents = GetProcAddress(hModule, "clEnqueueWaitForEvents"); + *(void**)&cl_EnqueueBarrier = GetProcAddress(hModule, "clEnqueueBarrier"); + *(void**)&cl_UnloadCompiler = GetProcAddress(hModule, "clUnloadCompiler"); + *(void**)&cl_GetExtensionFunctionAddress = GetProcAddress(hModule, "clGetExtensionFunctionAddress"); + *(void**)&cl_CreateCommandQueue = GetProcAddress(hModule, "clCreateCommandQueue"); + *(void**)&cl_CreateSampler = GetProcAddress(hModule, 
"clCreateSampler"); + *(void**)&cl_EnqueueTask = GetProcAddress(hModule, "clEnqueueTask"); + } +} s_loader; + +cl_int CL_API_CALL clGetPlatformIDs(cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetPlatformIDs(num_entries, platforms, num_platforms); +} + +cl_int CL_API_CALL clGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetPlatformInfo(platform, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, cl_device_id* devices, cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetDeviceIDs(platform, device_type, num_entries, devices, num_devices); +} + +cl_int CL_API_CALL clGetDeviceInfo(cl_device_id device, cl_device_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetDeviceInfo(device, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clCreateSubDevices(cl_device_id in_device, const cl_device_partition_property* properties, cl_uint num_devices, cl_device_id* out_devices, cl_uint* num_devices_ret) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_CreateSubDevices(in_device, properties, num_devices, out_devices, num_devices_ret); +} + +cl_int CL_API_CALL clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_RetainDevice(device); +} + +cl_int CL_API_CALL clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_ReleaseDevice(device); +} + +cl_context CL_API_CALL clCreateContext(const cl_context_properties* properties, cl_uint num_devices, const cl_device_id* devices, void (CL_CALLBACK* pfn_notify)(const char*, const void*, size_t, void*), void* user_data, cl_int* 
errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_CreateContext(properties, num_devices, devices, pfn_notify, user_data, errcode_ret); +} + +cl_context CL_API_CALL clCreateContextFromType(const cl_context_properties* properties, cl_device_type device_type, void (CL_CALLBACK* pfn_notify)(const char*, const void*, size_t, void*), void* user_data, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_CreateContextFromType(properties, device_type, pfn_notify, user_data, errcode_ret); +} + +cl_int CL_API_CALL clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_RetainContext(context); +} + +cl_int CL_API_CALL clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_ReleaseContext(context); +} + +cl_int CL_API_CALL clGetContextInfo(cl_context context, cl_context_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetContextInfo(context, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties(cl_context context, cl_device_id device, const cl_queue_properties* properties, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_CreateCommandQueueWithProperties(context, device, properties, errcode_ret); +} + +cl_int CL_API_CALL clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_RetainCommandQueue(command_queue); +} + +cl_int CL_API_CALL clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_ReleaseCommandQueue(command_queue); +} + +cl_int CL_API_CALL clGetCommandQueueInfo(cl_command_queue command_queue, cl_command_queue_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetCommandQueueInfo(command_queue, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_mem 
CL_API_CALL clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void* host_ptr, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_CreateBuffer(context, flags, size, host_ptr, errcode_ret); +} + +cl_mem CL_API_CALL clCreateSubBuffer(cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type buffer_create_type, const void* buffer_create_info, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1 +{ + return cl_CreateSubBuffer(buffer, flags, buffer_create_type, buffer_create_info, errcode_ret); +} + +cl_mem CL_API_CALL clCreateImage(cl_context context, cl_mem_flags flags, const cl_image_format* image_format, const cl_image_desc* image_desc, void* host_ptr, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_CreateImage(context, flags, image_format, image_desc, host_ptr, errcode_ret); +} + +cl_mem CL_API_CALL clCreatePipe(cl_context context, cl_mem_flags flags, cl_uint pipe_packet_size, cl_uint pipe_max_packets, const cl_pipe_properties* properties, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_CreatePipe(context, flags, pipe_packet_size, pipe_max_packets, properties, errcode_ret); +} + +cl_int CL_API_CALL clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_RetainMemObject(memobj); +} + +cl_int CL_API_CALL clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_ReleaseMemObject(memobj); +} + +cl_int CL_API_CALL clGetSupportedImageFormats(cl_context context, cl_mem_flags flags, cl_mem_object_type image_type, cl_uint num_entries, cl_image_format* image_formats, cl_uint* num_image_formats) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetSupportedImageFormats(context, flags, image_type, num_entries, image_formats, num_image_formats); +} + +cl_int CL_API_CALL clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetMemObjectInfo(memobj, param_name, param_value_size, 
param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clGetImageInfo(cl_mem image, cl_image_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetImageInfo(image, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clGetPipeInfo(cl_mem pipe, cl_pipe_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_GetPipeInfo(pipe, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clSetMemObjectDestructorCallback(cl_mem memobj, void (CL_CALLBACK* pfn_notify)(cl_mem memobj, void* user_data), void* user_data) CL_API_SUFFIX__VERSION_1_1 +{ + return cl_SetMemObjectDestructorCallback(memobj, pfn_notify, user_data); +} + +void* CL_API_CALL clSVMAlloc(cl_context context, cl_svm_mem_flags flags, size_t size, cl_uint alignment) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_SVMAlloc(context, flags, size, alignment); +} + +void CL_API_CALL clSVMFree(cl_context context, void* svm_pointer) CL_API_SUFFIX__VERSION_2_0 +{ + cl_SVMFree(context, svm_pointer); +} + +cl_sampler CL_API_CALL clCreateSamplerWithProperties(cl_context context, const cl_sampler_properties* normalized_coords, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_CreateSamplerWithProperties(context, normalized_coords, errcode_ret); +} + +cl_int CL_API_CALL clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_RetainSampler(sampler); +} + +cl_int CL_API_CALL clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_ReleaseSampler(sampler); +} + +cl_int CL_API_CALL clGetSamplerInfo(cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetSamplerInfo(sampler, param_name, param_value_size, param_value, param_value_size_ret); 
+} + +cl_program CL_API_CALL clCreateProgramWithSource(cl_context context, cl_uint count, const char** strings, const size_t* lengths, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_CreateProgramWithSource(context, count, strings, lengths, errcode_ret); +} + +cl_program CL_API_CALL clCreateProgramWithBinary(cl_context context, cl_uint num_devices, const cl_device_id* device_list, const size_t* lengths, const unsigned char** binaries, cl_int* binary_status, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_CreateProgramWithBinary(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret); +} + +cl_program CL_API_CALL clCreateProgramWithBuiltInKernels(cl_context context, cl_uint num_devices, const cl_device_id* device_list, const char* kernel_names, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_CreateProgramWithBuiltInKernels(context, num_devices, device_list, kernel_names, errcode_ret); +} + +cl_int CL_API_CALL clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_RetainProgram(program); +} + +cl_int CL_API_CALL clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_ReleaseProgram(program); +} + +cl_int CL_API_CALL clBuildProgram(cl_program program, cl_uint num_devices, const cl_device_id* device_list, const char* options, void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), void* user_data) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_BuildProgram(program, num_devices, device_list, options, pfn_notify, user_data); +} + +cl_int CL_API_CALL clCompileProgram(cl_program program, cl_uint num_devices, const cl_device_id* device_list, const char* options, cl_uint num_input_headers, const cl_program* input_headers, const char** header_include_names, void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), void* user_data) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_CompileProgram(program, num_devices, device_list, options, 
num_input_headers, input_headers, header_include_names, pfn_notify, user_data); +} + +cl_program CL_API_CALL clLinkProgram(cl_context context, cl_uint num_devices, const cl_device_id* device_list, const char* options, cl_uint num_input_programs, const cl_program* input_programs, void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data), void* user_data, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_LinkProgram(context, num_devices, device_list, options, num_input_programs, input_programs, pfn_notify, user_data, errcode_ret); +} + +cl_int CL_API_CALL clUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_UnloadPlatformCompiler(platform); +} + +cl_int CL_API_CALL clGetProgramInfo(cl_program program, cl_program_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetProgramInfo(program, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clGetProgramBuildInfo(cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetProgramBuildInfo(program, device, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_kernel CL_API_CALL clCreateKernel(cl_program program, const char* kernel_name, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_CreateKernel(program, kernel_name, errcode_ret); +} + +cl_int CL_API_CALL clCreateKernelsInProgram(cl_program program, cl_uint num_kernels, cl_kernel* kernels, cl_uint* num_kernels_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_CreateKernelsInProgram(program, num_kernels, kernels, num_kernels_ret); +} + +cl_int CL_API_CALL clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_RetainKernel(kernel); +} + +cl_int CL_API_CALL clReleaseKernel(cl_kernel kernel) 
CL_API_SUFFIX__VERSION_1_0 +{ + return cl_ReleaseKernel(kernel); +} + +cl_int CL_API_CALL clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void* arg_value) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_SetKernelArg(kernel, arg_index, arg_size, arg_value); +} + +cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index, const void* arg_value) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_SetKernelArgSVMPointer(kernel, arg_index, arg_value); +} + +cl_int CL_API_CALL clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void* param_value) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_SetKernelExecInfo(kernel, param_name, param_value_size, param_value); +} + +cl_int CL_API_CALL clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetKernelInfo(kernel, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_GetKernelArgInfo(kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clGetKernelWorkGroupInfo(cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetKernelWorkGroupInfo(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clWaitForEvents(cl_uint num_events, const cl_event* event_list) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_WaitForEvents(num_events, event_list); +} + +cl_int CL_API_CALL clGetEventInfo(cl_event event, cl_event_info param_name, size_t param_value_size, void* 
param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetEventInfo(event, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_event CL_API_CALL clCreateUserEvent(cl_context context, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1 +{ + return cl_CreateUserEvent(context, errcode_ret); +} + +cl_int CL_API_CALL clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_RetainEvent(event); +} + +cl_int CL_API_CALL clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_ReleaseEvent(event); +} + +cl_int CL_API_CALL clSetUserEventStatus(cl_event event, cl_int execution_status) CL_API_SUFFIX__VERSION_1_1 +{ + return cl_SetUserEventStatus(event, execution_status); +} + +cl_int CL_API_CALL clSetEventCallback(cl_event event, cl_int command_exec_callback_type, void (CL_CALLBACK* pfn_notify)(cl_event, cl_int, void*), void* user_data) CL_API_SUFFIX__VERSION_1_1 +{ + return cl_SetEventCallback(event, command_exec_callback_type, pfn_notify, user_data); +} + +cl_int CL_API_CALL clGetEventProfilingInfo(cl_event event, cl_profiling_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_GetEventProfilingInfo(event, param_name, param_value_size, param_value, param_value_size_ret); +} + +cl_int CL_API_CALL clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_Flush(command_queue); +} + +cl_int CL_API_CALL clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_Finish(command_queue); +} + +cl_int CL_API_CALL clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t size, void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueReadBuffer(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, 
event); +} + +cl_int CL_API_CALL clEnqueueReadBufferRect(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, const size_t* buffer_offset, const size_t* host_offset, const size_t* region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_1 +{ + return cl_EnqueueReadBufferRect(command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueWriteBuffer(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueWriteBufferRect(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, const size_t* buffer_offset, const size_t* host_offset, const size_t* region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_1 +{ + return cl_EnqueueWriteBufferRect(command_queue, buffer, blocking_write, buffer_offset, host_offset, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueFillBuffer(cl_command_queue command_queue, cl_mem buffer, const void* pattern, size_t pattern_size, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) 
CL_API_SUFFIX__VERSION_1_2 +{ + return cl_EnqueueFillBuffer(command_queue, buffer, pattern, pattern_size, offset, size, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueCopyBuffer(cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueCopyBuffer(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, size, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueCopyBufferRect(cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, const size_t* src_origin, const size_t* dst_origin, const size_t* region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_1 +{ + return cl_EnqueueCopyBufferRect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueReadImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_read, const size_t* origin, const size_t* region, size_t row_pitch, size_t slice_pitch, void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueReadImage(command_queue, image, blocking_read, origin, region, row_pitch, slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueWriteImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, const size_t* origin, const size_t* region, size_t input_row_pitch, size_t input_slice_pitch, const void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* 
event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueWriteImage(command_queue, image, blocking_write, origin, region, input_row_pitch, input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueFillImage(cl_command_queue command_queue, cl_mem image, const void* fill_color, const size_t* origin, const size_t* region, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_EnqueueFillImage(command_queue, image, fill_color, origin, region, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueCopyImage(cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, const size_t* src_origin, const size_t* dst_origin, const size_t* region, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueCopyImage(command_queue, src_image, dst_image, src_origin, dst_origin, region, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueCopyImageToBuffer(cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, const size_t* src_origin, const size_t* region, size_t dst_offset, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueCopyImageToBuffer(command_queue, src_image, dst_buffer, src_origin, region, dst_offset, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueCopyBufferToImage(cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, size_t src_offset, const size_t* dst_origin, const size_t* region, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueCopyBufferToImage(command_queue, src_buffer, dst_image, src_offset, dst_origin, region, num_events_in_wait_list, event_wait_list, event); +} + +void* CL_API_CALL 
clEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueMapBuffer(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret); +} + +void* CL_API_CALL clEnqueueMapImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, cl_map_flags map_flags, const size_t* origin, const size_t* region, size_t* image_row_pitch, size_t* image_slice_pitch, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueMapImage(command_queue, image, blocking_map, map_flags, origin, region, image_row_pitch, image_slice_pitch, num_events_in_wait_list, event_wait_list, event, errcode_ret); +} + +cl_int CL_API_CALL clEnqueueUnmapMemObject(cl_command_queue command_queue, cl_mem memobj, void* mapped_ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueUnmapMemObject(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueMigrateMemObjects(cl_command_queue command_queue, cl_uint num_mem_objects, const cl_mem* mem_objects, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_EnqueueMigrateMemObjects(command_queue, num_mem_objects, mem_objects, flags, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t* global_work_offset, const size_t* global_work_size, const size_t* local_work_size, cl_uint num_events_in_wait_list, const 
cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueNativeKernel(cl_command_queue command_queue, void (CL_CALLBACK* user_func)(void*), void* args, size_t cb_args, cl_uint num_mem_objects, const cl_mem* mem_list, const void** args_mem_loc, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_0 +{ + return cl_EnqueueNativeKernel(command_queue, user_func, args, cb_args, num_mem_objects, mem_list, args_mem_loc, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueMarkerWithWaitList(cl_command_queue command_queue, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_EnqueueMarkerWithWaitList(command_queue, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueBarrierWithWaitList(cl_command_queue command_queue, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_EnqueueBarrierWithWaitList(command_queue, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueSVMFree(cl_command_queue command_queue, cl_uint num_svm_pointers, void* svm_pointers[], void (CL_CALLBACK* pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void* svm_pointers[], void* user_data), void* user_data, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_EnqueueSVMFree(command_queue, num_svm_pointers, svm_pointers, pfn_free_func, user_data, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueSVMMemcpy(cl_command_queue command_queue, cl_bool blocking_copy, void* dst_ptr, const void* 
src_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_EnqueueSVMMemcpy(command_queue, blocking_copy, dst_ptr, src_ptr, size, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueSVMMemFill(cl_command_queue command_queue, void* svm_ptr, const void* pattern, size_t pattern_size, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_EnqueueSVMMemFill(command_queue, svm_ptr, pattern, pattern_size, size, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void* svm_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_EnqueueSVMMap(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event); +} + +cl_int CL_API_CALL clEnqueueSVMUnmap(cl_command_queue command_queue, void* svm_ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_API_SUFFIX__VERSION_2_0 +{ + return cl_EnqueueSVMUnmap(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event); +} + +void* CL_API_CALL clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, const char* func_name) CL_API_SUFFIX__VERSION_1_2 +{ + return cl_GetExtensionFunctionAddressForPlatform(platform, func_name); +} + +cl_mem CL_API_CALL clCreateImage2D(cl_context context, cl_mem_flags flags, const cl_image_format* image_format, size_t image_width, size_t image_height, size_t image_row_pitch, void* host_ptr, cl_int* errcode_ret) +{ + return cl_CreateImage2D(context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret); +} + +cl_mem CL_API_CALL clCreateImage3D(cl_context context, cl_mem_flags flags, const 
cl_image_format* image_format, size_t image_width, size_t image_height, size_t image_depth, size_t image_row_pitch, size_t image_slice_pitch, void* host_ptr, cl_int* errcode_ret) +{ + return cl_CreateImage3D(context, flags, image_format, image_width, image_height, image_depth, image_row_pitch, image_slice_pitch, host_ptr, errcode_ret); +} + +cl_int CL_API_CALL clEnqueueMarker(cl_command_queue command_queue, cl_event* event) +{ + return cl_EnqueueMarker(command_queue, event); +} + +cl_int CL_API_CALL clEnqueueWaitForEvents(cl_command_queue command_queue, cl_uint num_events, const cl_event* event_list) +{ + return cl_EnqueueWaitForEvents(command_queue, num_events, event_list); +} + +cl_int CL_API_CALL clEnqueueBarrier(cl_command_queue command_queue) +{ + return cl_EnqueueBarrier(command_queue); +} + +cl_int CL_API_CALL clUnloadCompiler(void) +{ + return cl_UnloadCompiler(); +} + +void* CL_API_CALL clGetExtensionFunctionAddress(const char* func_name) +{ + return cl_GetExtensionFunctionAddress(func_name); +} + +cl_command_queue CL_API_CALL clCreateCommandQueue(cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_int* errcode_ret) +{ + return cl_CreateCommandQueue(context, device, properties, errcode_ret); +} + +cl_sampler CL_API_CALL clCreateSampler(cl_context context, cl_bool normalized_coords, cl_addressing_mode addressing_mode, cl_filter_mode filter_mode, cl_int* errcode_ret) +{ + return cl_CreateSampler(context, normalized_coords, addressing_mode, filter_mode, errcode_ret); +} + +cl_int CL_API_CALL clEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) +{ + return cl_EnqueueTask(command_queue, kernel, num_events_in_wait_list, event_wait_list, event); +} diff --git a/3rdparty/opencl/opencl.def b/3rdparty/opencl/opencl.def deleted file mode 100644 index cde815ac89..0000000000 --- a/3rdparty/opencl/opencl.def +++ /dev/null @@ -1,113 +0,0 @@ 
-LIBRARY OpenCL.dll -EXPORTS -clBuildProgram@24 @1 -clCompileProgram@36 @2 -clCreateBuffer@24 @3 -clCreateCommandQueue@20 @4 -clCreateCommandQueueWithProperties@16 @5 -clCreateContext@24 @6 -clCreateContextFromType@24 @7 -clCreateFromGLBuffer@20 @8 -clCreateFromGLRenderbuffer@20 @9 -clCreateFromGLTexture@28 @10 -clCreateFromGLTexture2D@28 @11 -clCreateFromGLTexture3D@28 @12 -clCreateImage@28 @13 -clCreateImage2D@36 @14 -clCreateImage3D@44 @15 -clCreateKernel@12 @16 -clCreateKernelsInProgram@16 @17 -clCreatePipe@28 @18 -clCreateProgramWithBinary@28 @19 -clCreateProgramWithBuiltInKernels@20 @20 -clCreateProgramWithSource@20 @21 -clCreateSampler@20 @22 -clCreateSamplerWithProperties@12 @23 -clCreateSubBuffer@24 @24 -clCreateSubDevices@20 @25 -clCreateUserEvent@8 @26 -clEnqueueAcquireGLObjects@24 @27 -clEnqueueBarrier@4 @28 -clEnqueueBarrierWithWaitList@16 @29 -clEnqueueCopyBuffer@36 @30 -clEnqueueCopyBufferRect@52 @31 -clEnqueueCopyBufferToImage@36 @32 -clEnqueueCopyImage@36 @33 -clEnqueueCopyImageToBuffer@36 @34 -clEnqueueFillBuffer@36 @35 -clEnqueueFillImage@32 @36 -clEnqueueMapBuffer@44 @37 -clEnqueueMapImage@52 @38 -clEnqueueMarker@8 @39 -clEnqueueMarkerWithWaitList@16 @40 -clEnqueueMigrateMemObjects@32 @41 -clEnqueueNDRangeKernel@36 @42 -clEnqueueNativeKernel@40 @43 -clEnqueueReadBuffer@36 @44 -clEnqueueReadBufferRect@56 @45 -clEnqueueReadImage@44 @46 -clEnqueueReleaseGLObjects@24 @47 -clEnqueueSVMFree@32 @48 -clEnqueueSVMMap@36 @49 -clEnqueueSVMMemFill@32 @50 -clEnqueueSVMMemcpy@32 @51 -clEnqueueSVMUnmap@20 @52 -clEnqueueTask@20 @53 -clEnqueueUnmapMemObject@24 @54 -clEnqueueWaitForEvents@12 @55 -clEnqueueWriteBuffer@36 @56 -clEnqueueWriteBufferRect@56 @57 -clEnqueueWriteImage@44 @58 -clFinish@4 @59 -clFlush@4 @60 -clGetCommandQueueInfo@20 @61 -clGetContextInfo@20 @62 -clGetDeviceIDs@24 @63 -clGetDeviceInfo@20 @64 -clGetEventInfo@20 @65 -clGetEventProfilingInfo@20 @66 -clGetExtensionFunctionAddress@4 @67 -clGetExtensionFunctionAddressForPlatform@8 @68 
-clGetGLObjectInfo@12 @69 -clGetGLTextureInfo@20 @70 -clGetImageInfo@20 @71 -clGetKernelArgInfo@24 @72 -clGetKernelInfo@20 @73 -clGetKernelWorkGroupInfo@24 @74 -clGetMemObjectInfo@20 @75 -clGetPipeInfo@20 @76 -clGetPlatformIDs@12 @77 -clGetPlatformInfo@20 @78 -clGetProgramBuildInfo@24 @79 -clGetProgramInfo@20 @80 -clGetSamplerInfo@20 @81 -clGetSupportedImageFormats@28 @82 -clLinkProgram@36 @83 -clReleaseCommandQueue@4 @84 -clReleaseContext@4 @85 -clReleaseDevice@4 @86 -clReleaseEvent@4 @87 -clReleaseKernel@4 @88 -clReleaseMemObject@4 @89 -clReleaseProgram@4 @90 -clReleaseSampler@4 @91 -clRetainCommandQueue@4 @92 -clRetainContext@4 @93 -clRetainDevice@4 @94 -clRetainEvent@4 @95 -clRetainKernel@4 @96 -clRetainMemObject@4 @97 -clRetainProgram@4 @98 -clRetainSampler@4 @99 -clSVMAlloc@20 @100 -clSVMFree@8 @101 -clSetCommandQueueProperty@20 @102 -clSetEventCallback@16 @103 -clSetKernelArg@16 @104 -clSetKernelArgSVMPointer@12 @105 -clSetKernelExecInfo@16 @106 -clSetMemObjectDestructorCallback@12 @107 -clSetUserEventStatus@8 @108 -clUnloadCompiler@0 @109 -clUnloadPlatformCompiler@4 @110 -clWaitForEvents@8 @111 diff --git a/3rdparty/opencl/opencl.vcxproj b/3rdparty/opencl/opencl.vcxproj index 60a8285756..36b6b241f2 100644 --- a/3rdparty/opencl/opencl.vcxproj +++ b/3rdparty/opencl/opencl.vcxproj @@ -25,48 +25,52 @@ - Utility + StaticLibrary true v120 - Unicode + MultiByte - Utility + StaticLibrary true v120 - Unicode + MultiByte - Utility + StaticLibrary false v120 true - Unicode + MultiByte - Utility + StaticLibrary false v120 true - Unicode + MultiByte + + + + @@ -136,18 +140,6 @@ true - - - lib /machine:$(PlatformTarget) "/def:%(FullPath)" "/out:$(SolutionDir)$(OutDir)\opencl.lib" - lib /machine:$(PlatformTarget) "/def:%(FullPath)" "/out:$(SolutionDir)$(OutDir)\opencl.lib" - lib /machine:$(PlatformTarget) "/def:%(FullPath)" "/out:$(SolutionDir)$(OutDir)\opencl.lib" - lib /machine:$(PlatformTarget) "/def:%(FullPath)" "/out:$(SolutionDir)$(OutDir)\opencl.lib" - 
$(SolutionDir)$(OutDir)\opencl.lib - $(SolutionDir)$(OutDir)\opencl.lib - $(SolutionDir)$(OutDir)\opencl.lib - $(SolutionDir)$(OutDir)\opencl.lib - - @@ -161,6 +153,9 @@ + + + diff --git a/3rdparty/opencl/opencl.vcxproj.filters b/3rdparty/opencl/opencl.vcxproj.filters index 5c060a12cb..0ab5d03884 100644 --- a/3rdparty/opencl/opencl.vcxproj.filters +++ b/3rdparty/opencl/opencl.vcxproj.filters @@ -14,11 +14,6 @@ rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - Source Files - - Header Files @@ -54,4 +49,9 @@ Header Files + + + Source Files + + \ No newline at end of file diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp index a983cd07f5..1f5e58ef6d 100644 --- a/plugins/GSdx/GSSettingsDlg.cpp +++ b/plugins/GSdx/GSSettingsDlg.cpp @@ -31,6 +31,16 @@ GSSettingsDlg::GSSettingsDlg(bool isOpen2) : GSDialog(isOpen2 ? IDD_CONFIG2 : IDD_CONFIG) , m_IsOpen2(isOpen2) { + list ocldevs; + + GSUtil::GetOCLDevices(ocldevs); + + int index = 0; + + for(auto dev : ocldevs) + { + m_ocl_devs.push_back(GSSetting(index++, dev.name.c_str(), "")); + } } void GSSettingsDlg::OnInit() @@ -153,9 +163,9 @@ void GSSettingsDlg::OnInit() unsigned int ocl_sel = 0; - for(unsigned int i = 0; i < theApp.m_ocl_devs.size(); i++) + for(unsigned int i = 0; i < m_ocl_devs.size(); i++) { - if(ocldev == theApp.m_ocl_devs[i].name) + if(ocldev == m_ocl_devs[i].name) { ocl_sel = i; @@ -164,7 +174,7 @@ void GSSettingsDlg::OnInit() } ComboBoxInit(IDC_ADAPTER, adapter_settings, adapter_sel); - ComboBoxInit(IDC_OPENCL_DEVICE, theApp.m_ocl_devs, ocl_sel); + ComboBoxInit(IDC_OPENCL_DEVICE, m_ocl_devs, ocl_sel); UpdateRenderers(); @@ -257,7 +267,7 @@ bool GSSettingsDlg::OnCommand(HWND hWnd, UINT id, UINT code) if(ComboBoxGetSelData(IDC_OPENCL_DEVICE, data)) { - theApp.SetConfig("ocldev", theApp.m_ocl_devs[(int)data].name.c_str()); + theApp.SetConfig("ocldev", m_ocl_devs[(int)data].name.c_str()); } if(!m_IsOpen2 && ComboBoxGetSelData(IDC_RESOLUTION, 
data)) diff --git a/plugins/GSdx/GSSettingsDlg.h b/plugins/GSdx/GSSettingsDlg.h index bc083debd1..3b8c49ea44 100644 --- a/plugins/GSdx/GSSettingsDlg.h +++ b/plugins/GSdx/GSSettingsDlg.h @@ -79,7 +79,9 @@ class GSSettingsDlg : public GSDialog }; std::vector adapters; - + + vector m_ocl_devs; + bool m_IsOpen2; uint32 m_lastValidMsaa; // used to revert to previous dialog value if the user changed to invalid one, or lesser one and canceled diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index 71d60eed9e..3567aa5080 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -258,7 +258,8 @@ void GSUtil::GetOCLDevices(list& devs) case CL_DEVICE_TYPE_CPU: type = "CPU"; break; } - int major, minor; + int major = 0; + int minor = 0; if(!type.empty() && sscanf(version.c_str(), "OpenCL C %d.%d", &major, &minor) == 2 && major == 1 && minor >= 1 || major > 1) { diff --git a/plugins/GSdx/GSdx.cpp b/plugins/GSdx/GSdx.cpp index e3a3fc0665..93d9642fc2 100644 --- a/plugins/GSdx/GSdx.cpp +++ b/plugins/GSdx/GSdx.cpp @@ -21,7 +21,6 @@ #include "stdafx.h" #include "GSdx.h" -#include "GSUtil.h" static void* s_hModule; @@ -199,19 +198,6 @@ GSdxApp::GSdxApp() m_gpu_scale.push_back(GSSetting(2 | (1 << 2), "H x 4 - V x 2", "")); m_gpu_scale.push_back(GSSetting(1 | (2 << 2), "H x 2 - V x 4", "")); m_gpu_scale.push_back(GSSetting(2 | (2 << 2), "H x 4 - V x 4", "")); - - // - - list ocldevs; - - GSUtil::GetOCLDevices(ocldevs); - - int index = 0; - - for(auto dev : ocldevs) - { - m_ocl_devs.push_back(GSSetting(index++, dev.name.c_str(), "")); - } } #ifdef _LINUX diff --git a/plugins/GSdx/GSdx.h b/plugins/GSdx/GSdx.h index 68a17b71d3..5ccb1e2c69 100644 --- a/plugins/GSdx/GSdx.h +++ b/plugins/GSdx/GSdx.h @@ -69,8 +69,6 @@ public: vector m_gpu_dithering; vector m_gpu_aspectratio; vector m_gpu_scale; - - vector m_ocl_devs; }; struct GSDXError {}; From ba59036a979f72c37c5f19407fec611ceff73d17 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Mon, 22 Sep 2014 05:34:30 +0200 
Subject: [PATCH 12/15] fixed a small compiling error in release mode --- 3rdparty/opencl/opencl.vcxproj | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/3rdparty/opencl/opencl.vcxproj b/3rdparty/opencl/opencl.vcxproj index 36b6b241f2..eea5ae72c0 100644 --- a/3rdparty/opencl/opencl.vcxproj +++ b/3rdparty/opencl/opencl.vcxproj @@ -119,9 +119,7 @@ true true - - opencl.def - + From b9b02cf749aa356626e3319bb62e00340c06bc70 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Mon, 22 Sep 2014 09:15:25 +0200 Subject: [PATCH 13/15] implemented opencl program caching on disk under the system default temp folder, needs some additional work on linux --- plugins/GSdx/GSRendererCL.cpp | 172 +++++++++++++++++++++++++-------- plugins/GSdx/GSRendererCL.h | 4 +- plugins/GSdx/GSSettingsDlg.cpp | 4 +- plugins/GSdx/GSUtil.cpp | 61 +++++++++--- plugins/GSdx/GSUtil.h | 8 +- plugins/GSdx/res/tfx.cl | 8 +- plugins/GSdx/stdafx.h | 2 - 7 files changed, 198 insertions(+), 61 deletions(-) diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 21d1a89a59..2eb9519e4e 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -37,6 +37,7 @@ static FILE* s_fp = LOG ? 
fopen("c:\\temp1\\_.txt", "w") : NULL; #define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE)) #define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT) #define TFX_PARAM_SIZE 2048 +#define TFX_PROGRAM_VERSION 1 #if MAX_PRIM_PER_BATCH == 64u #define BIN_TYPE cl_ulong @@ -1622,6 +1623,7 @@ GSVector4i* GSRendererCL::TFXJob::GetDstPages() GSRendererCL::CL::CL() { WIs = INT_MAX; + version = INT_MAX; std::string ocldev = theApp.GetConfig("ocldev", ""); @@ -1629,37 +1631,43 @@ GSRendererCL::CL::CL() ocldev = "Intel(R) Corporation Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz OpenCL C 1.2 CPU"; #endif - list ocldevs; + list dl; - GSUtil::GetOCLDevices(ocldevs); + GSUtil::GetDeviceDescs(dl); - for(auto dev : ocldevs) + for(auto d : dl) { - if(dev.name == ocldev) + if(d.name == ocldev) { - devices.push_back(dev.device); + devs.push_back(d); - WIs = std::min(WIs, (uint32)dev.device.getInfo()); + WIs = std::min(WIs, (uint32)d.device.getInfo()); + version = std::min(version, d.version); break; // TODO: multiple devices? } } - if(devices.empty() && !ocldevs.empty()) + if(devs.empty() && !dl.empty()) { - auto dev = ocldevs.front(); + auto d = dl.front(); - devices.push_back(dev.device); + devs.push_back(d); - WIs = std::min(WIs, (uint32)dev.device.getInfo()); + WIs = std::min(WIs, (uint32)d.device.getInfo()); + version = std::min(version, d.version); } - if(devices.empty()) + if(devs.empty()) { throw new std::exception("OpenCL device not found"); } - context = cl::Context(devices); + vector tmp; + + for(auto d : devs) tmp.push_back(d.device); + + context = cl::Context(tmp); queue[0] = cl::CommandQueue(context); queue[1] = cl::CommandQueue(context); @@ -1699,24 +1707,24 @@ void GSRendererCL::CL::Map() { Unmap(); - // TODO: CL_MAP_WRITE_INVALIDATE_REGION if 1.2+ + cl_map_flags flags = version >= 120 ? 
CL_MAP_WRITE_INVALIDATE_REGION : CL_MAP_WRITE; if(vb.head < vb.size) { - vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head); + vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, flags, vb.head, vb.size - vb.head); vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; ASSERT(((size_t)vb.ptr & 15) == 0); } if(ib.head < ib.size) { - ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE, ib.head, ib.size - ib.head); + ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, flags, ib.head, ib.size - ib.head); ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head; } if(pb.head < pb.size) { - pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head); + pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, flags, pb.head, pb.size - pb.head); pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; ASSERT(((size_t)pb.ptr & 15) == 0); } @@ -1733,34 +1741,75 @@ void GSRendererCL::CL::Unmap() pb.mapped_ptr = pb.ptr = NULL; } -static void AddDefs(ostringstream& opt) -{ - opt << "-cl-std=CL1.1 "; - opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "; - opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "; - opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "; - opt << "-D MAX_PRIM_PER_BATCH=" << MAX_PRIM_PER_BATCH << "u "; - opt << "-D MAX_BATCH_COUNT=" << MAX_BATCH_COUNT << "u "; - opt << "-D BIN_SIZE_BITS=" << BIN_SIZE_BITS << " "; - opt << "-D BIN_SIZE=" << BIN_SIZE << "u "; - opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u "; - opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u "; - opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u "; -#ifdef IOCL_DEBUG - opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" "; -#endif -} - cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt) { // TODO: cache binary on disk - printf("building kernel (%s)\n", entry); + cl::Program program; - cl::Program 
program = cl::Program(context, kernel_str); + if(version >= 120) + { + cl::Program::Binaries binaries; + + try + { + for(auto d : devs) + { + string path = d.tmppath + "/" + entry; + + FILE* f = fopen(path.c_str(), "rb"); + + if(f != NULL) + { + fseek(f, 0, SEEK_END); + long size = ftell(f); + pair b(new char[size], size); + fseek(f, 0, SEEK_SET); + fread(b.first, b.second, 1, f); + fclose(f); + + binaries.push_back(b); + } + else + { + break; + } + } + + if(binaries.size() == devs.size()) + { + vector tmp; + + for(auto d : devs) tmp.push_back(d.device); + + program = cl::Program(context, tmp, binaries); + + AddDefs(opt); + + program.build(opt.str().c_str()); + + cl::Kernel kernel = cl::Kernel(program, entry); + + return kernel; + } + } + catch(cl::Error err) + { + printf("%s (%d)\n", err.what(), err.err()); + } + + for(auto b : binaries) + { + delete [] b.first; + } + } try { + printf("building kernel (%s)\n", entry); + + program = cl::Program(context, kernel_str); + AddDefs(opt); program.build(opt.str().c_str()); @@ -1769,9 +1818,9 @@ cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt) { if(err.err() == CL_BUILD_PROGRAM_FAILURE) { - for(auto device : devices) + for(auto d : devs) { - auto s = program.getBuildInfo(device); + auto s = program.getBuildInfo(d.device); printf("kernel (%s) build error: %s\n", entry, s.c_str()); } @@ -1780,9 +1829,56 @@ cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt) throw err; } + if(version >= 120) + { + try + { + vector sizes = program.getInfo(); + vector binaries = program.getInfo(); + + for(int i = 0; i < binaries.size(); i++) + { + string path = devs[i].tmppath + "/" + entry; + + FILE* f = fopen(path.c_str(), "wb"); + + if(f != NULL) + { + fwrite(binaries[i], sizes[i], 1, f); + fclose(f); + } + + delete[] binaries[i]; + } + } + catch(cl::Error err) + { + printf("%s (%d)\n", err.what(), err.err()); + } + } + return cl::Kernel(program, entry); } +void 
GSRendererCL::CL::AddDefs(ostringstream& opt) +{ + if(version == 110) opt << "-cl-std=CL1.1 "; + else opt << "-cl-std=CL1.2 "; + opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "; + opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "; + opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "; + opt << "-D MAX_PRIM_PER_BATCH=" << MAX_PRIM_PER_BATCH << "u "; + opt << "-D MAX_BATCH_COUNT=" << MAX_BATCH_COUNT << "u "; + opt << "-D BIN_SIZE_BITS=" << BIN_SIZE_BITS << " "; + opt << "-D BIN_SIZE=" << BIN_SIZE << "u "; + opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u "; + opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u "; + opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u "; +#ifdef IOCL_DEBUG + opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" "; +#endif +} + cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel) { auto i = prim_map.find(sel); diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h index fdfea3f6d3..683b161768 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -188,9 +188,10 @@ class GSRendererCL : public GSRenderer std::map tfx_map; cl::Kernel Build(const char* entry, ostringstream& opt); + void AddDefs(ostringstream& opt); public: - std::vector devices; + std::vector devs; cl::Context context; cl::CommandQueue queue[3]; cl::Buffer vm; @@ -200,6 +201,7 @@ class GSRendererCL : public GSRenderer cl::CommandQueue* wq; int wqidx; uint32 WIs; + int version; public: CL(); diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp index 1f5e58ef6d..ca4fb0a9df 100644 --- a/plugins/GSdx/GSSettingsDlg.cpp +++ b/plugins/GSdx/GSSettingsDlg.cpp @@ -31,9 +31,9 @@ GSSettingsDlg::GSSettingsDlg(bool isOpen2) : GSDialog(isOpen2 ? 
IDD_CONFIG2 : IDD_CONFIG) , m_IsOpen2(isOpen2) { - list ocldevs; + list ocldevs; - GSUtil::GetOCLDevices(ocldevs); + GSUtil::GetDeviceDescs(ocldevs); int index = 0; diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index 3567aa5080..40df3690e4 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -226,9 +226,11 @@ bool GSUtil::CheckSSE() return true; } -void GSUtil::GetOCLDevices(list& devs) +#define OCL_PROGRAM_VERSION 1 + +void GSUtil::GetDeviceDescs(list& dl) { - devs.clear(); + dl.clear(); try { @@ -246,10 +248,6 @@ void GSUtil::GetOCLDevices(list& devs) for(auto& device : ds) { - std::string vendor = device.getInfo(); - std::string name = device.getInfo(); - std::string version = device.getInfo(); - string type; switch(device.getInfo()) @@ -258,19 +256,41 @@ void GSUtil::GetOCLDevices(list& devs) case CL_DEVICE_TYPE_CPU: type = "CPU"; break; } + if(type.empty()) continue; + + std::string version = device.getInfo(); + int major = 0; int minor = 0; if(!type.empty() && sscanf(version.c_str(), "OpenCL C %d.%d", &major, &minor) == 2 && major == 1 && minor >= 1 || major > 1) { - name = vendor + " " + name + " " + version + type; + OCLDeviceDesc desc; - OCLDevice dev; + desc.device = device; + desc.name = GetDeviceUniqueName(device); + desc.version = major * 100 + minor * 10; - dev.device = device; - dev.name = name; + // TODO: linux - devs.push_back(dev); + char* buff = new char[MAX_PATH + 1]; + GetTempPath(MAX_PATH, buff); + desc.tmppath = string(buff) + "/" + desc.name; + + WIN32_FIND_DATA FindFileData; + HANDLE hFind = FindFirstFile(desc.tmppath.c_str(), &FindFileData); + if(hFind != INVALID_HANDLE_VALUE) FindClose(hFind); + else CreateDirectory(desc.tmppath.c_str(), NULL); + + sprintf(buff, "/%d", OCL_PROGRAM_VERSION); + desc.tmppath += buff; + delete[] buff; + + hFind = FindFirstFile(desc.tmppath.c_str(), &FindFileData); + if(hFind != INVALID_HANDLE_VALUE) FindClose(hFind); + else CreateDirectory(desc.tmppath.c_str(), NULL); + + 
dl.push_back(desc); } } } @@ -281,6 +301,25 @@ void GSUtil::GetOCLDevices(list& devs) } } +string GSUtil::GetDeviceUniqueName(cl::Device& device) +{ + std::string vendor = device.getInfo(); + std::string name = device.getInfo(); + std::string version = device.getInfo(); + + string type; + + switch(device.getInfo()) + { + case CL_DEVICE_TYPE_GPU: type = "GPU"; break; + case CL_DEVICE_TYPE_CPU: type = "CPU"; break; + } + + version.erase(version.find_last_not_of(' ') + 1); + + return vendor + " " + name + " " + version + " " + type; +} + #ifdef _WINDOWS bool GSUtil::CheckDirectX() diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index 2e044e0bdc..ca90869ed1 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -23,10 +23,12 @@ #include "GS.h" -struct OCLDevice +struct OCLDeviceDesc { cl::Device device; string name; + int version; + string tmppath; }; class GSUtil @@ -45,7 +47,9 @@ public: static bool HasCompatibleBits(uint32 spsm, uint32 dpsm); static bool CheckSSE(); - static void GetOCLDevices(list& devs); + + static void GetDeviceDescs(list& dl); + static string GetDeviceUniqueName(cl::Device& device); #ifdef _WINDOWS diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index ba7ef5214e..0ecd98f6db 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -2,8 +2,6 @@ #ifdef cl_amd_printf #pragma OPENCL EXTENSION cl_amd_printf : enable -#else -#define printf(x) #endif #ifdef cl_amd_media_ops @@ -639,9 +637,9 @@ __kernel void KERNEL_PRIM( dp1.xy = dp1.xy * sign(cp); dp2.xy = dp2.xy * sign(cp); - b.zero.x = select(0.0f, CL_FLT_EPSILON, (dp1.y < 0) | (dp1.y == 0) & (dp1.x > 0)); - b.zero.y = select(0.0f, CL_FLT_EPSILON, (dp0.y < 0) | (dp0.y == 0) & (dp0.x > 0)); - b.zero.z = select(0.0f, CL_FLT_EPSILON, (dp2.y < 0) | (dp2.y == 0) & (dp2.x > 0)); + b.zero.x = select(0.0f, CL_FLT_EPSILON, (dp1.y < 0) | ((dp1.y == 0) & (dp1.x > 0))); + b.zero.y = select(0.0f, CL_FLT_EPSILON, (dp0.y < 0) | ((dp0.y == 0) & (dp0.x > 0))); + 
b.zero.z = select(0.0f, CL_FLT_EPSILON, (dp2.y < 0) | ((dp2.y == 0) & (dp2.x > 0))); // any barycentric(reject_corner) < 0, tile outside the triangle diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index 1929381354..cef3dcbe72 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -44,8 +44,6 @@ #include #include "../../common/include/comptr.h" -#include -#undef CL_VERSION_1_2 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS #define __CL_ENABLE_EXCEPTIONS #include From 76f719e5d0da3fb6084515dc77b65e31fa08367b Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Mon, 1 Dec 2014 23:06:24 +0100 Subject: [PATCH 14/15] gsdx-ocl: Add a ENABLE_OPENCL option * Allow to compile GSdx on linux without opencl yet. --- plugins/GSdx/CMakeLists.txt | 1 + plugins/GSdx/GS.cpp | 4 +++- plugins/GSdx/GSRendererCL.cpp | 3 +++ plugins/GSdx/GSRendererCL.h | 4 ++++ plugins/GSdx/GSUtil.cpp | 2 ++ plugins/GSdx/GSUtil.h | 4 ++++ plugins/GSdx/config.h | 4 ++++ plugins/GSdx/stdafx.h | 12 +++++++++--- 8 files changed, 30 insertions(+), 4 deletions(-) diff --git a/plugins/GSdx/CMakeLists.txt b/plugins/GSdx/CMakeLists.txt index edbc574be1..f37ef8ebf8 100644 --- a/plugins/GSdx/CMakeLists.txt +++ b/plugins/GSdx/CMakeLists.txt @@ -104,6 +104,7 @@ set(GSdxSources GSPerfMon.cpp GSRasterizer.cpp GSRenderer.cpp + GSRendererCL.cpp GSRendererHW.cpp GSRendererNull.cpp GSRendererOGL.cpp diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp index 5d91813659..cb36a03226 100644 --- a/plugins/GSdx/GS.cpp +++ b/plugins/GSdx/GS.cpp @@ -27,6 +27,7 @@ #include "GSDeviceNull.h" #include "GSDeviceOGL.h" #include "GSRendererOGL.h" +#include "GSRendererCL.h" #ifdef _WINDOWS @@ -37,7 +38,6 @@ #include "GSWndDX.h" #include "GSWndWGL.h" #include "GSRendererCS.h" -#include "GSRendererCL.h" #include "GSSettingsDlg.h" static HRESULT s_hr = E_FAIL; @@ -265,7 +265,9 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1) s_gs = new GSRendererNull(); break; case 14: case 15: case 16: case 
17: +#ifdef ENABLE_OPENCL s_gs = new GSRendererCL(); +#endif break; } diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 2eb9519e4e..e4b22d63e2 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -22,6 +22,8 @@ #include "stdafx.h" #include "GSRendererCL.h" +#ifdef ENABLE_OPENCL + #define LOG 0 static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; @@ -2002,3 +2004,4 @@ cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel) return tfx_map[sel]; } +#endif diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h index 683b161768..e0afe67a22 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -23,6 +23,8 @@ #include "GSRenderer.h" +#ifdef ENABLE_OPENCL + __aligned(struct, 32) GSVertexCL { GSVector4 p, t; @@ -252,3 +254,5 @@ public: GSRendererCL(); virtual ~GSRendererCL(); }; + +#endif diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index 40df3690e4..791aa56f5f 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -228,6 +228,7 @@ bool GSUtil::CheckSSE() #define OCL_PROGRAM_VERSION 1 +#ifdef ENABLE_OPENCL void GSUtil::GetDeviceDescs(list& dl) { dl.clear(); @@ -319,6 +320,7 @@ string GSUtil::GetDeviceUniqueName(cl::Device& device) return vendor + " " + name + " " + version + " " + type; } +#endif #ifdef _WINDOWS diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index ca90869ed1..5a722cd251 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -25,7 +25,9 @@ struct OCLDeviceDesc { +#ifdef ENABLE_OPENCL cl::Device device; +#endif string name; int version; string tmppath; @@ -48,8 +50,10 @@ public: static bool CheckSSE(); +#ifdef ENABLE_OPENCL static void GetDeviceDescs(list& dl); static string GetDeviceUniqueName(cl::Device& device); +#endif #ifdef _WINDOWS diff --git a/plugins/GSdx/config.h b/plugins/GSdx/config.h index 3ad4756bc9..323bfaae0e 100644 --- a/plugins/GSdx/config.h +++ 
b/plugins/GSdx/config.h @@ -45,3 +45,7 @@ // Output stencil to a color buffer //#define ENABLE_OGL_STENCIL_DEBUG + +#ifdef _WINDOWS +#define ENABLE_OPENCL +#endif diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index cef3dcbe72..52e564eee1 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -44,9 +44,6 @@ #include #include "../../common/include/comptr.h" -#define CL_USE_DEPRECATED_OPENCL_1_1_APIS -#define __CL_ENABLE_EXCEPTIONS -#include #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA) #define D3D11_SHADER_MACRO D3D10_SHADER_MACRO @@ -54,6 +51,15 @@ #endif + +#ifdef ENABLE_OPENCL + +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#define __CL_ENABLE_EXCEPTIONS +#include + +#endif + // put these into vc9/common7/ide/usertype.dat to have them highlighted typedef unsigned char uint8; From 42382617845bfabe3a739910b27e9ba61e9bbc4c Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Mon, 1 Dec 2014 23:34:37 +0100 Subject: [PATCH 15/15] gsdx-cl: update linux menu config Try to use some id to be more robust The best will be to sort the array first --- plugins/GSdx/GSLinuxDialog.cpp | 47 ++++++++++++++++++---------------- plugins/GSdx/GSdx.cpp | 6 ----- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/plugins/GSdx/GSLinuxDialog.cpp b/plugins/GSdx/GSLinuxDialog.cpp index 8c45323732..1232b8573e 100644 --- a/plugins/GSdx/GSLinuxDialog.cpp +++ b/plugins/GSdx/GSLinuxDialog.cpp @@ -31,30 +31,29 @@ GtkWidget* CreateRenderComboBox() render_combo_box = gtk_combo_box_new_text (); - for(size_t i = 6; i < theApp.m_gs_renderers.size(); i++) + for(auto s = theApp.m_gs_renderers.begin(); s != theApp.m_gs_renderers.end(); s++) { - const GSSetting& s = theApp.m_gs_renderers[i]; + string label = s->name; - string label = s.name; - - if(!s.note.empty()) label += format(" (%s)", s.note.c_str()); + if(!s->note.empty()) label += format(" (%s)", s->note.c_str()); // Add 
some tags to ease users selection - switch (i) { - // better use opengl instead of SDL - case 6: - case 7: - label += " (removed)"; + switch (s->id) { + // Supported opengl + case 12: + case 13: + case 17: break; // (dev only) for any NULL stuff - case 8: - case 9: + case 10: + case 11: + case 16: label += " (debug only)"; break; default: - break; + continue; } gtk_combo_box_append_text(GTK_COMBO_BOX(render_combo_box), label.c_str()); @@ -62,13 +61,15 @@ GtkWidget* CreateRenderComboBox() switch (theApp.GetConfig("renderer", 0)) { // Note the value are based on m_gs_renderers vector on GSdx.cpp - case 10: renderer_box_position = 2; break; - case 11: renderer_box_position = 3; break; - case 12: renderer_box_position = 4; break; - case 13: renderer_box_position = 5; break; + case 10: renderer_box_position = 0; break; + case 16: renderer_box_position = 1; break; + case 11: renderer_box_position = 2; break; + case 12: renderer_box_position = 3; break; + case 13: renderer_box_position = 4; break; + case 17: renderer_box_position = 5; break; // Fallback to openGL SW - default: renderer_box_position = 5; break; + default: renderer_box_position = 4; break; } gtk_combo_box_set_active(GTK_COMBO_BOX(render_combo_box), renderer_box_position); return render_combo_box; @@ -483,10 +484,12 @@ override_GL_ARB_shading_language_420pack = -1 if (gtk_combo_box_get_active(GTK_COMBO_BOX(render_combo_box)) != -1) { // Note the value are based on m_gs_renderers vector on GSdx.cpp switch (gtk_combo_box_get_active(GTK_COMBO_BOX(render_combo_box))) { - case 2: theApp.SetConfig("renderer", 10); break; - case 3: theApp.SetConfig("renderer", 11); break; - case 4: theApp.SetConfig("renderer", 12); break; - case 5: theApp.SetConfig("renderer", 13); break; + case 0: theApp.SetConfig("renderer", 10); break; + case 1: theApp.SetConfig("renderer", 16); break; + case 2: theApp.SetConfig("renderer", 11); break; + case 3: theApp.SetConfig("renderer", 12); break; + case 4: theApp.SetConfig("renderer", 13); 
break; + case 5: theApp.SetConfig("renderer", 17); break; // Fallback to SW opengl default: theApp.SetConfig("renderer", 13); break; diff --git a/plugins/GSdx/GSdx.cpp b/plugins/GSdx/GSdx.cpp index 93d9642fc2..b2e9bc6fb8 100644 --- a/plugins/GSdx/GSdx.cpp +++ b/plugins/GSdx/GSdx.cpp @@ -135,12 +135,6 @@ GSdxApp::GSdxApp() m_gs_renderers.push_back(GSSetting(4, "Direct3D", "Software")); m_gs_renderers.push_back(GSSetting(15, "Direct3D", "OpenCL")); m_gs_renderers.push_back(GSSetting(5, "Direct3D", "Null")); -#ifdef _LINUX - // note: SDL was removed. We keep those bits for compatibility of the renderer - // position in the linux dialog. - m_gs_renderers.push_back(GSSetting(7, "SDL 1.3", "Software")); - m_gs_renderers.push_back(GSSetting(8, "SDL 1.3", "Null")); -#endif m_gs_renderers.push_back(GSSetting(10, "Null", "Software")); m_gs_renderers.push_back(GSSetting(16, "Null", "OpenCL")); m_gs_renderers.push_back(GSSetting(11, "Null", "Null"));