From c50c6034c056330ae9a04f160d4170487710f6e2 Mon Sep 17 00:00:00 2001 From: lightningterror <18107717+lightningterror@users.noreply.github.com> Date: Fri, 18 Sep 2020 00:17:51 +0200 Subject: [PATCH] gsdx-hw: Remove opencl code and files from gsdx. --- plugins/GSdx/GS.cpp | 26 - plugins/GSdx/GS.h | 2 - plugins/GSdx/GSUtil.cpp | 86 - plugins/GSdx/GSUtil.h | 15 - plugins/GSdx/GSdx.cpp | 13 - plugins/GSdx/GSdx.rc | 6 - .../GSdx/Renderers/OpenCL/GSRendererCL.cpp | 2259 ----------------- plugins/GSdx/Renderers/OpenCL/GSRendererCL.h | 272 -- plugins/GSdx/Window/GSSettingsDlg.cpp | 52 +- plugins/GSdx/Window/GSSettingsDlg.h | 2 - plugins/GSdx/config.h | 4 - plugins/GSdx/res/gsdx-res.xml | 3 - plugins/GSdx/res/tfx.cl | 1623 ------------ plugins/GSdx/resource.h | 8 +- plugins/GSdx/stdafx.h | 8 - 15 files changed, 8 insertions(+), 4371 deletions(-) delete mode 100644 plugins/GSdx/Renderers/OpenCL/GSRendererCL.cpp delete mode 100644 plugins/GSdx/Renderers/OpenCL/GSRendererCL.h delete mode 100644 plugins/GSdx/res/tfx.cl diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp index 5ba67d4e08..ff010fcc83 100644 --- a/plugins/GSdx/GS.cpp +++ b/plugins/GSdx/GS.cpp @@ -27,7 +27,6 @@ #include "Renderers/Null/GSDeviceNull.h" #include "Renderers/OpenGL/GSDeviceOGL.h" #include "Renderers/OpenGL/GSRendererOGL.h" -#include "Renderers/OpenCL/GSRendererCL.h" #include "GSLzma.h" #ifdef _WIN32 @@ -124,9 +123,6 @@ EXPORT_C_(int) GSinit() GSUtil::Init(); GSBlock::InitVectors(); GSClut::InitVectors(); -#ifdef ENABLE_OPENCL - GSRendererCL::InitVectors(); -#endif GSRendererSW::InitVectors(); GSVector4i::InitVectors(); GSVector4::InitVectors(); @@ -232,9 +228,6 @@ static int _GSopen(void** dsp, const char* title, GSRendererType renderer, int t { case GSRendererType::OGL_HW: case GSRendererType::OGL_SW: -#ifdef ENABLE_OPENCL - case GSRendererType::OGL_OpenCL: -#endif #if defined(__unix__) // Note: EGL code use GLX otherwise maybe it could be also compatible with Windows // Yes OpenGL code isn't complicated enough ! @@ -321,12 +314,6 @@ static int _GSopen(void** dsp, const char* title, GSRendererType renderer, int t case GSRendererType::Null: renderer_mode = "(Null renderer)"; break; -#ifdef ENABLE_OPENCL - case GSRendererType::DX1011_OpenCL: - case GSRendererType::OGL_OpenCL: - renderer_mode = "(OpenCL)"; - break; -#endif default: renderer_mode = "(Hardware renderer)"; break; @@ -338,9 +325,6 @@ static int _GSopen(void** dsp, const char* title, GSRendererType renderer, int t #ifdef _WIN32 case GSRendererType::DX1011_HW: case GSRendererType::DX1011_SW: -#ifdef ENABLE_OPENCL - case GSRendererType::DX1011_OpenCL: -#endif dev = new GSDevice11(); s_renderer_name = " D3D11"; renderer_fullname = "Direct3D 11"; @@ -353,9 +337,6 @@ static int _GSopen(void** dsp, const char* title, GSRendererType renderer, int t break; case GSRendererType::OGL_HW: case GSRendererType::OGL_SW: -#ifdef ENABLE_OPENCL - case GSRendererType::OGL_OpenCL: -#endif dev = new GSDeviceOGL(); s_renderer_name = " OGL"; renderer_fullname = "OpenGL"; @@ -393,13 +374,6 @@ static int _GSopen(void** dsp, const char* title, GSRendererType renderer, int t s_gs = new GSRendererNull(); s_renderer_type = ""; break; -#ifdef ENABLE_OPENCL - case GSRendererType::DX1011_OpenCL: - case GSRendererType::OGL_OpenCL: - s_gs = new GSRendererCL(); - s_renderer_type = " OCL"; - break; -#endif } if (s_gs == NULL) return -1; diff --git a/plugins/GSdx/GS.h b/plugins/GSdx/GS.h index ecb66fa89a..f1a055a321 100644 --- a/plugins/GSdx/GS.h +++ b/plugins/GSdx/GS.h @@ -236,8 +236,6 @@ enum class GSRendererType : int8_t Null = 11, OGL_HW, OGL_SW, - DX1011_OpenCL = 15, - OGL_OpenCL = 17, #ifdef _WIN32 Default = Undefined diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index a998b6f6f7..517aced913 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -252,93 +252,7 @@ CRCHackLevel GSUtil::GetRecommendedCRCHackLevel(GSRendererType type) return type == GSRendererType::OGL_HW ? CRCHackLevel::Partial : CRCHackLevel::Full; } -#define OCL_PROGRAM_VERSION 3 - -#ifdef ENABLE_OPENCL -void GSUtil::GetDeviceDescs(std::list& dl) -{ - dl.clear(); - - try - { - std::vector platforms; - - cl::Platform::get(&platforms); - - for(auto& p : platforms) - { - std::string platform_vendor = p.getInfo(); - - std::vector ds; - - p.getDevices(CL_DEVICE_TYPE_ALL, &ds); - - for(auto& device : ds) - { - std::string type; - - switch(device.getInfo()) - { - case CL_DEVICE_TYPE_GPU: type = "GPU"; break; - case CL_DEVICE_TYPE_CPU: type = "CPU"; break; - } - - if(type.empty()) continue; - - std::string version = device.getInfo(); - - int major = 0; - int minor = 0; - - if(!type.empty() && sscanf(version.c_str(), "OpenCL C %d.%d", &major, &minor) == 2 && major == 1 && minor >= 1 || major > 1) - { - OCLDeviceDesc desc; - - desc.device = device; - desc.name = GetDeviceUniqueName(device); - desc.version = major * 100 + minor * 10; - - desc.tmppath = GStempdir() + "/" + desc.name; - - GSmkdir(desc.tmppath.c_str()); - - desc.tmppath += "/" + std::to_string(OCL_PROGRAM_VERSION); - - GSmkdir(desc.tmppath.c_str()); - - dl.push_back(desc); - } - } - } - } - catch(cl::Error err) - { - printf("%s (%d)\n", err.what(), err.err()); - } -} - -std::string GSUtil::GetDeviceUniqueName(cl::Device& device) -{ - std::string vendor = device.getInfo(); - std::string name = device.getInfo(); - std::string version = device.getInfo(); - - std::string type; - - switch(device.getInfo()) - { - case CL_DEVICE_TYPE_GPU: type = "GPU"; break; - case CL_DEVICE_TYPE_CPU: type = "CPU"; break; - } - - version.erase(version.find_last_not_of(' ') + 1); - - return vendor + " " + name + " " + version + " " + type; -} -#endif - #ifdef _WIN32 - // --------------------------------------------------------------------------------- // DX11 Detection (includes DXGI detection and dynamic library method bindings) // --------------------------------------------------------------------------------- diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index 956f34ae58..ff2f95c041 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -24,16 +24,6 @@ #include "GS.h" #include "xbyak/xbyak_util.h" -struct OCLDeviceDesc -{ -#ifdef ENABLE_OPENCL - cl::Device device; -#endif - std::string name; - int version; - std::string tmppath; -}; - class GSUtil { public: @@ -54,11 +44,6 @@ public: static bool CheckSSE(); static CRCHackLevel GetRecommendedCRCHackLevel(GSRendererType type); -#ifdef ENABLE_OPENCL - static void GetDeviceDescs(std::list& dl); - static std::string GetDeviceUniqueName(cl::Device& device); -#endif - #ifdef _WIN32 static bool CheckDXGI(); static bool CheckD3D11(); diff --git a/plugins/GSdx/GSdx.cpp b/plugins/GSdx/GSdx.cpp index c5830c28df..6534b0aaa1 100644 --- a/plugins/GSdx/GSdx.cpp +++ b/plugins/GSdx/GSdx.cpp @@ -91,9 +91,6 @@ bool GSdxApp::LoadResource(int id, std::vector& buff, const char* type) case IDR_TFX_FS_GLSL: path = "/GSdx/res/glsl/tfx_fs.glsl"; break; - case IDR_TFX_CL: - path = "/GSdx/res/tfx.cl"; - break; case IDR_FONT_ROBOTO: path = "/GSdx/res/fonts-roboto/Roboto-Regular.ttf"; break; @@ -220,15 +217,6 @@ void GSdxApp::Init() // The null renderer goes third, it has use for benchmarking purposes in a release build m_gs_renderers.push_back(GSSetting(static_cast(GSRendererType::Null), "None", "Core Benchmark")); -#ifdef ENABLE_OPENCL - // OpenCL stuff goes last - // FIXME openCL isn't attached to a device (could be impacted by the window management stuff however) -#ifdef _WIN32 - m_gs_renderers.push_back(GSSetting(static_cast(GSRendererType::DX1011_OpenCL), "Direct3D 11", "OpenCL")); -#endif - m_gs_renderers.push_back(GSSetting(static_cast(GSRendererType::OGL_OpenCL), "OpenGL", "OpenCL")); -#endif - m_gs_interlace.push_back(GSSetting(0, "None", "")); m_gs_interlace.push_back(GSSetting(1, "Weave tff", "saw-tooth")); m_gs_interlace.push_back(GSSetting(2, "Weave bff", "saw-tooth")); @@ -360,7 +348,6 @@ void GSdxApp::Init() m_default_configuration["ModeHeight"] = "480"; m_default_configuration["ModeWidth"] = "640"; m_default_configuration["NTSC_Saturation"] = "1"; - m_default_configuration["ocldev"] = ""; #ifdef _WIN32 m_default_configuration["osd_fontname"] = "C:\\Windows\\Fonts\\my_favorite_font_e_g_tahoma.ttf"; #else diff --git a/plugins/GSdx/GSdx.rc b/plugins/GSdx/GSdx.rc index a99cf80e5f..75d11878e2 100644 --- a/plugins/GSdx/GSdx.rc +++ b/plugins/GSdx/GSdx.rc @@ -54,7 +54,6 @@ BEGIN "#include ""res/merge.fx""\r\n" "#include ""res/fxaa.fx""\r\n" "#include ""res/shadeboost.fx""\r\n" - "#include ""res/tfx.cl""\r\n" "\0" END @@ -78,8 +77,6 @@ IDR_FXAA_FX RCDATA "res\\fxaa.fx" IDR_SHADEBOOST_FX RCDATA "res\\shadeboost.fx" -IDR_TFX_CL RCDATA "res\\tfx.cl" - IDR_CONVERT_GLSL RCDATA "res\\glsl\\convert.glsl"; IDR_INTERLACE_GLSL RCDATA "res\\glsl\\interlace.glsl"; @@ -271,8 +268,6 @@ BEGIN COMBOBOX IDC_INTERLACE,70,85,166,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP LTEXT "Texture Filtering:",IDC_FILTER_TEXT,6,102,79,8 COMBOBOX IDC_FILTER,70,100,166,63,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP - LTEXT "OpenCL Device:",IDC_OPENCL_TEXT,6,117,53,8 - COMBOBOX IDC_OPENCL_DEVICE,70,115,166,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP PUSHBUTTON "OSD Configuration",IDC_OSDBUTTON,10,325,108,14 PUSHBUTTON "Shader Configuration",IDC_SHADEBUTTON,124,325,108,14 DEFPUSHBUTTON "OK",IDOK,68,351,50,14 @@ -425,7 +420,6 @@ END #include "res/merge.fx" #include "res/fxaa.fx" #include "res/shadeboost.fx" -#include "res/tfx.cl" ///////////////////////////////////////////////////////////////////////////// #endif // not APSTUDIO_INVOKED diff --git a/plugins/GSdx/Renderers/OpenCL/GSRendererCL.cpp b/plugins/GSdx/Renderers/OpenCL/GSRendererCL.cpp deleted file mode 100644 index 18af89fcb8..0000000000 --- a/plugins/GSdx/Renderers/OpenCL/GSRendererCL.cpp +++ /dev/null @@ -1,2259 +0,0 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSRendererCL.h" - -#ifdef ENABLE_OPENCL - -#define LOG 0 - -static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; - -#define MAX_FRAME_SIZE 2048 -#define MAX_PRIM_COUNT 4096u -#define MAX_PRIM_PER_BATCH_BITS 5 -#define MAX_PRIM_PER_BATCH (1u << MAX_PRIM_PER_BATCH_BITS) -#define BATCH_COUNT(prim_count) (((prim_count) + (MAX_PRIM_PER_BATCH - 1)) / MAX_PRIM_PER_BATCH) -#define MAX_BATCH_COUNT BATCH_COUNT(MAX_PRIM_COUNT) -#define BIN_SIZE_BITS 4 -#define BIN_SIZE (1u << BIN_SIZE_BITS) -#define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE)) -#define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT) -#define TFX_PARAM_SIZE 2048 -#define TFX_MAX_PARAM_COUNT 256 - -#if MAX_PRIM_PER_BATCH == 64u -#define BIN_TYPE cl_ulong -#elif MAX_PRIM_PER_BATCH == 32u -#define BIN_TYPE cl_uint -#else -#error "MAX_PRIM_PER_BATCH != 32u OR 64u" -#endif - -#pragma pack(push, 1) - -typedef struct -{ - GSVertexCL v[4]; -} gs_prim; - -typedef struct -{ - cl_float4 dx, dy; - cl_float4 zero; - cl_float4 reject_corner; -} gs_barycentric; - -typedef struct -{ - struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH]; - BIN_TYPE bin[MAX_BIN_COUNT]; - cl_uchar4 bbox[MAX_PRIM_COUNT]; - gs_prim prim[MAX_PRIM_COUNT]; - gs_barycentric barycentric[MAX_PRIM_COUNT]; -} gs_env; - -#pragma pack(pop) - -GSVector4 GSRendererCL::m_pos_scale; - -void GSRendererCL::InitVectors() -{ - m_pos_scale = GSVector4(1.0f / 16, 1.0f / 16, 1.0f, 1.0f); -} - -GSRendererCL::GSRendererCL() - : m_vb_count(0) - , m_synced(true) -{ - m_nativeres = true; // ignore ini, sw is always native - - memset(m_texture, 0, sizeof(m_texture)); - - m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); - - for(int i = 0; i < 4; i++) - { - m_rw_pages[0][i] = GSVector4i::zero(); - m_rw_pages[1][i] = GSVector4i::zero(); - m_tc_pages[i] = GSVector4i::xffffffff(); - } - - for (auto& page_ref : m_rw_pages_rendering) - page_ref = 0; - - #define InitCVB(P) \ - m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer; \ - m_cvb[P][0][1] = &GSRendererCL::ConvertVertexBuffer; \ - m_cvb[P][1][0] = &GSRendererCL::ConvertVertexBuffer; \ - m_cvb[P][1][1] = &GSRendererCL::ConvertVertexBuffer; \ - - InitCVB(GS_POINT_CLASS); - InitCVB(GS_LINE_CLASS); - InitCVB(GS_TRIANGLE_CLASS); - InitCVB(GS_SPRITE_CLASS); - - // NOTE: m_cl.vm may be cached on the device according to the specs, there are a couple of places where we access m_mem.m_vm8 without - // mapping the buffer (after the two invalidate* calls and in getoutput), it is currently not an issue, but on some devices it may be. - - m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL); - m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY, (size_t)m_mem.m_vmsize); -} - -GSRendererCL::~GSRendererCL() -{ - for(size_t i = 0; i < countof(m_texture); i++) - { - delete m_texture[i]; - } - - _aligned_free(m_output); -} - -void GSRendererCL::Reset() -{ - Sync(-1); - - GSRenderer::Reset(); -} - -static int pageuploads = 0; -static int pageuploadcount = 0; -static int tfxcount = 0; -static long tfxpixels = 0; -static int tfxselcount = 0; -static int tfxdiffselcount = 0; - -void GSRendererCL::VSync(int field) -{ - GSRenderer::VSync(field); - - //printf("vsync %d/%d/%d/%d\n", pageuploads, pageuploadcount, tfxcount, tfxpixels); - //printf("vsync %d/%d\n", tfxselcount, tfxdiffselcount); - pageuploads = pageuploadcount = tfxcount = tfxpixels = 0; - tfxselcount = tfxdiffselcount = 0; - - //if(!field) memset(m_mem.m_vm8, 0, (size_t)m_mem.m_vmsize); -} - -void GSRendererCL::ResetDevice() -{ - for(size_t i = 0; i < countof(m_texture); i++) - { - delete m_texture[i]; - - m_texture[i] = NULL; - } -} - -GSTexture* GSRendererCL::GetOutput(int i, int& y_offset) -{ - const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB; - - int w = DISPFB.FBW * 64; - int h = GetFramebufferHeight(); - - // TODO: round up bottom - - if(m_dev->ResizeTexture(&m_texture[i], w, h)) - { - static int pitch = 1024 * 4; - - GSVector4i r(0, 0, w, h); - - const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM]; - - GIFRegBITBLTBUF BITBLTBUF; - - BITBLTBUF.SBP = DISPFB.Block(); - BITBLTBUF.SBW = DISPFB.FBW; - BITBLTBUF.SPSM = DISPFB.PSM; - - InvalidateLocalMem(BITBLTBUF, r); - - (m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign(psm.bs), m_output, pitch, m_env.TEXA); - - m_texture[i]->Update(r, m_output, pitch); - - if(s_dump) - { - if(s_save && s_n >= s_saven) - { - m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM)); - } - } - } - - return m_texture[i]; -} - -template -void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count) -{ - GSVector4i o = (GSVector4i)m_context->XYOFFSET; - GSVector4 st_scale = GSVector4(16 << m_context->TEX0.TW, 16 << m_context->TEX0.TH, 1, 0); - - for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++) - { - GSVector4 stcq = GSVector4::load(&src->m[0]); // s t rgba q - - GSVector4i xyzuvf(src->m[1]); - - dst->p = (GSVector4(xyzuvf.upl16() - o) * m_pos_scale).xyxy(GSVector4::cast(xyzuvf.ywyw())); // pass zf as uints - - GSVector4 t = GSVector4::zero(); - - if(tme) - { - if(fst) - { - #if _M_SSE >= 0x401 - - t = GSVector4(xyzuvf.uph16()); - - #else - - t = GSVector4(GSVector4i::load(src->UV).upl16()); - - #endif - } - else - { - t = stcq.xyww() * st_scale; - } - } - - dst->t = t.insert32<2, 3>(stcq); // color as uchar4 in t.w - } -} - -void GSRendererCL::Draw() -{ - const GSDrawingContext* context = m_context; - - GSVector4i scissor = GSVector4i(context->scissor.in); - GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); - - // points and lines may have zero area bbox (example: single line 0,0->256,0) - - if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS) - { - if(bbox.x == bbox.z) bbox.z++; - if(bbox.y == bbox.w) bbox.w++; - } - - scissor.z = std::min(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour - - GSVector4i rect = bbox.rintersect(scissor); - - if(rect.rempty()) - { - return; - } - - if(s_dump) - { - Sync(2); - - uint64 frame = m_perfmon.GetFrame(); - - std::string s; - - if(s_save && s_n >= s_saven && PRIM->TME) - { - s = format("c:\\temp1\\_%05d_f%lld_itex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM); - - m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH); - } - - if(s_save && s_n >= s_saven) - { - s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM); - - m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); - } - - if(s_savez && s_n >= s_saven) - { - s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM); - - m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); - } - } - - try - { - size_t vb_size = m_vertex.next * sizeof(GSVertexCL); - size_t ib_size = m_index.tail * sizeof(uint32); - size_t pb_size = TFX_PARAM_SIZE; - - ASSERT(sizeof(TFXParameter) <= TFX_PARAM_SIZE); - - if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size) - { - if(vb_size > m_cl.vb.size || ib_size > m_cl.ib.size) - { - // buffer too small for even one batch, allow twice the size (at least 1 MB) - - Sync(2); // must sync, reallocating the input buffers - - m_cl.Unmap(); - - m_cl.vb.size = 0; - m_cl.ib.size = 0; - - size_t size = std::max(vb_size * 2, (size_t)2 << 20); - - printf("growing vertex/index buffer %d\n", size); - - m_cl.vb.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); - m_cl.vb.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); - m_cl.vb.size = size; - - size = std::max(size / sizeof(GSVertex) * 3 * sizeof(uint32), (size_t)1 << 20); // worst case, three times the vertex count - - ASSERT(size >= ib_size); - - if(size < ib_size) size = ib_size; // should not happen - - m_cl.ib.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); - m_cl.ib.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size); - m_cl.ib.size = size; - } - else - { - Enqueue(); - - m_cl.Unmap(); - - // make the write queue wait until the rendering queue is ready, it may still use the device buffers - - std::vector el(1); - - m_cl.queue[2].enqueueMarker(&el[0]); - m_cl.wq->enqueueWaitForEvents(el); - - // switch to the other queue/buffer (double buffering) - - m_cl.wqidx = (m_cl.wqidx + 1) & 1; - m_cl.wq = &m_cl.queue[m_cl.wqidx]; - } - - m_cl.vb.head = m_cl.vb.tail = 0; - m_cl.ib.head = m_cl.ib.tail = 0; - m_cl.pb.head = m_cl.pb.tail = 0; - - m_cl.Map(); - } - else - { - // only allow batches of the same primclass in Enqueue - - if(!m_jobs.empty() && m_jobs.front()->sel.prim != (uint32)m_vt.m_primclass) - { - Enqueue(); - } - } - - // - - GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail); - uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail); - TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail); - - (this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(vb, m_vertex.buff, m_vertex.next); // TODO: upload in GSVertex format and extract the fields in the kernel? - - if(m_jobs.empty()) - { - memcpy(ib, m_index.buff, m_index.tail * sizeof(uint32)); - - m_vb_start = m_cl.vb.tail; - m_vb_count = 0; - m_pb_start = m_cl.pb.tail; - m_pb_count = 0; - } - else - { - // TODO: SIMD - - ASSERT(m_pb_count < TFX_MAX_PARAM_COUNT); - - uint32 vb_count = m_vb_count | (m_pb_count << 24); - - for(size_t i = 0; i < m_index.tail; i++) - { - ib[i] = m_index.buff[i] + vb_count; - } - } - - std::shared_ptr job(new TFXJob()); - - if(!SetupParameter(job.get(), pb, vb, m_vertex.next, m_index.buff, m_index.tail)) - { - return; - } - - pb->scissor = scissor; - - if(bbox.eq(bbox.rintersect(scissor))) - { - pb->sel.noscissor = 1; - } - - job->rect.x = rect.x; - job->rect.y = rect.y; - job->rect.z = rect.z; - job->rect.w = rect.w; - job->sel = pb->sel; - job->ib_start = m_cl.ib.tail; - job->prim_count = m_index.tail / GSUtil::GetClassVertexCount(m_vt.m_primclass); - job->fbp = pb->fbp; - job->zbp = pb->zbp; - job->bw = pb->bw; - job->fpsm = context->FRAME.PSM; - job->zpsm = context->ZBUF.PSM; - job->tpsm = context->TEX0.PSM; - -#ifdef DEBUG - job->pb = pb; -#endif - m_jobs.push_back(job); - - m_vb_count += m_vertex.next; - m_pb_count++; - - m_cl.vb.tail += vb_size; - m_cl.ib.tail += ib_size; - m_cl.pb.tail += pb_size; - - m_synced = false; - - // mark pages used in rendering as source or target - - if(job->sel.fwrite || job->sel.rfb) - { - m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages); - - if(job->sel.rfb) - { - for(int i = 0; i < 4; i++) - { - m_rw_pages[0][i] |= m_tmp_pages[i]; - } - } - - if(job->sel.fwrite) - { - GSVector4i* dst_pages = job->GetDstPages(); - - for(int i = 0; i < 4; i++) - { - m_rw_pages[1][i] |= m_tmp_pages[i]; - - dst_pages[i] |= m_tmp_pages[i]; - } - } - } - - if(job->sel.zwrite || job->sel.rzb) - { - m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages); - - if(job->sel.rzb) - { - for(int i = 0; i < 4; i++) - { - m_rw_pages[0][i] |= m_tmp_pages[i]; - } - } - - if(job->sel.zwrite) - { - GSVector4i* dst_pages = job->GetDstPages(); - - for(int i = 0; i < 4; i++) - { - m_rw_pages[1][i] |= m_tmp_pages[i]; - - dst_pages[i] |= m_tmp_pages[i]; - } - } - } - - if(job->src_pages != NULL) - { - for(int i = 0; i < 4; i++) - { - m_rw_pages[0][i] |= job->src_pages[i]; - - if(job->dst_pages != NULL && !(job->dst_pages[i] & job->src_pages[i]).eq(GSVector4i::zero())) - { - //printf("src and dst overlap!\n"); - } - } - } - - // don't buffer too much data, feed them to the device if there is enough - - if(m_pb_count >= TFX_MAX_PARAM_COUNT || m_vb_count >= 4096) - { - Enqueue(); - } - } - catch(cl::Error err) - { - printf("%s (%d)\n", err.what(), err.err()); - - return; - } - catch(std::exception err) - { - printf("%s\n", err.what()); - - return; - } - - if(s_dump) - { - Sync(2); - - uint64 frame = m_perfmon.GetFrame(); - - std::string s; - - if(s_save && s_n >= s_saven) - { - s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM); - - m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); - } - - if(s_savez && s_n >= s_saven) - { - s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM); - - m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); - } - } -} - -void GSRendererCL::Sync(int reason) -{ - if(LOG) { fprintf(s_fp, "Sync (%d)\n", reason); fflush(s_fp); } - - //printf("sync %d\n", reason); - - GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync); - - Enqueue(); - - m_cl.queue[2].finish(); - - for(int i = 0; i < 4; i++) - { - m_rw_pages[0][i] = GSVector4i::zero(); - m_rw_pages[1][i] = GSVector4i::zero(); - } - -#ifndef NDEBUG - for(const auto& page_ref : m_rw_pages_rendering) - ASSERT(page_ref == 0); -#endif - - m_synced = true; -} - -void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) -{ - if(LOG) {fprintf(s_fp, "w %05x %d %d, %d %d %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} - - GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); - - o->GetPagesAsBits(r, m_tmp_pages); - - if(!m_synced) - { - int i = 0; - - bool wait; - - do - { - wait = false; - - for(; i < 4; i++) - { - GSVector4i pages = m_rw_pages[0][i] | m_rw_pages[1][i]; - - if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero())) - { - // TODO: an awesome idea to avoid this Sync - // - call Enqueue() to flush m_jobs - // - append rendering queue with a kernel that writes the incoming data to m_mem.vm and tell the parent class to not do it - // - the only problem, clut has to be read directly by the texture sampler, can't attach it to gs_param before being written - - //Sync(3); - - Enqueue(); - - wait = true; - - break; - } - } - - _mm_pause(); - } - while(wait); - - if(!m_synced) - { - o->GetPages(r, m_tmp_pages2); // TODO: don't ask twice - - const uint32* p = m_tmp_pages2; - - do - { - wait = false; - - for(; *p != GSOffset::EOP; p++) - { - if(m_rw_pages_rendering[*p]) - { - // Sync(5); - - wait = true; - - break; - } - } - /* - if(!m_synced) - { - void* ptr = m_cl.wq->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize); - m_cl.wq->enqueueUnmapMemObject(m_cl.vm, ptr); - } - */ - - _mm_pause(); - } - while(wait); - } - } - - for(int i = 0; i < 4; i++) - { - m_tc_pages[i] |= m_tmp_pages[i]; - } -} - -void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) -{ - if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} - - if(!m_synced) - { - GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); - - o->GetPagesAsBits(r, m_tmp_pages); - - for(int i = 0; i < 4; i++) - { - GSVector4i pages = m_rw_pages[1][i]; - - if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero())) - { - Sync(4); - - break; - } - } - - if(!m_synced) - { - o->GetPages(r, m_tmp_pages2); // TODO: don't ask twice - - for(const uint32* p = m_tmp_pages2; *p != GSOffset::EOP; p++) - { - if(m_rw_pages_rendering[*p] & 0xffff0000) - { - Sync(6); - - break; - } - } - /* - if(!m_synced) - { - void* ptr = m_cl.wq->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize); - m_cl.wq->enqueueUnmapMemObject(m_cl.vm, ptr); - } - */ - } - } -} - -typedef struct { GSRendererCL* r; uint32 pages[(MAX_PAGES + 1) * 2]; } cb_data; - -void GSRendererCL::Enqueue() -{ - if(m_jobs.empty()) return; - - cb_data* data = new cb_data(); - - data->r = this; - - UsePages(data->pages); - - try - { - ASSERT(m_cl.vb.tail > m_cl.vb.head); - ASSERT(m_cl.ib.tail > m_cl.ib.head); - ASSERT(m_cl.pb.tail > m_cl.pb.head); - - int primclass = m_jobs.front()->sel.prim; - - uint32 n = GSUtil::GetClassVertexCount(primclass); - - PrimSelector psel; - - psel.key = 0; - psel.prim = primclass; - - cl::Kernel& pk = m_cl.GetPrimKernel(psel); - - pk.setArg(1, m_cl.vb.buff[m_cl.wqidx]); - pk.setArg(2, m_cl.ib.buff[m_cl.wqidx]); - pk.setArg(3, m_cl.pb.buff[m_cl.wqidx]); - pk.setArg(4, (cl_uint)m_vb_start); - pk.setArg(6, (cl_uint)m_pb_start); - - TileSelector tsel; - - tsel.key = 0; - tsel.prim = primclass; - - tsel.mode = 0; - - cl::Kernel& tk_32 = m_cl.GetTileKernel(tsel); - - tsel.mode = 1; - - cl::Kernel& tk_16 = m_cl.GetTileKernel(tsel); - - tsel.mode = 2; - - cl::Kernel& tk_8 = m_cl.GetTileKernel(tsel); - - tsel.mode = 3; - - cl::Kernel& tk = m_cl.GetTileKernel(tsel); - - tsel.key = 0; - tsel.clear = 1; - - cl::Kernel& tk_clear = m_cl.GetTileKernel(tsel); - - // - - m_cl.Unmap(); - - std::vector el(1); - - m_cl.wq->enqueueMarker(&el[0]); - m_cl.queue[2].enqueueWaitForEvents(el); - - // - - auto head = m_jobs.begin(); - - while(head != m_jobs.end()) - { - uint32 total_prim_count = 0; - - auto next = head; - - while(next != m_jobs.end()) - { - auto job = next++; - - uint32 cur_prim_count = (*job)->prim_count; - //uint32 next_prim_count = next != m_jobs.end() ? (*next)->prim_count : 0; - - total_prim_count += cur_prim_count; - - if(total_prim_count >= MAX_PRIM_COUNT || next == m_jobs.end())// || next_prim_count >= MAX_PRIM_COUNT || next_prim_count < 16 && total_prim_count >= MAX_PRIM_COUNT / 2) - { - uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT); - - pk.setArg(5, (cl_uint)(*head)->ib_start); - - m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange); - - if(0) - { - gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env)); - m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr); - } - - GSVector4i rect = GSVector4i::zero(); - - for(auto i = head; i != next; i++) - { - rect = rect.runion(GSVector4i::load(&(*i)->rect)); - } - - rect = rect.ralign(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS; - - int bin_w = rect.width(); - int bin_h = rect.height(); - - uint32 batch_count = BATCH_COUNT(prim_count); - uint32 bin_count = bin_w * bin_h; - - cl_uchar4 bin_dim; - - bin_dim.s[0] = (cl_uchar)rect.x; - bin_dim.s[1] = (cl_uchar)rect.y; - bin_dim.s[2] = (cl_uchar)bin_w; - bin_dim.s[3] = (cl_uchar)bin_h; - - if(1)//bin_w > 1 || bin_h > 1) // && not just one sprite covering the whole area - { - m_cl.queue[2].enqueueNDRangeKernel(tk_clear, cl::NullRange, cl::NDRange(bin_count), cl::NullRange); - - if(bin_count <= 32 && m_cl.WIs >= 256) - { - uint32 item_count; - uint32 group_count; - cl::Kernel* k; - - if(bin_count <= 8) - { - item_count = std::min(prim_count, 32u); - group_count = ((prim_count + 31) >> 5) * item_count; - k = &tk_32; - } - else if(bin_count <= 16) - { - item_count = std::min(prim_count, 16u); - group_count = ((prim_count + 15) >> 4) * item_count; - k = &tk_16; - } - else - { - item_count = std::min(prim_count, 8u); - group_count = ((prim_count + 7) >> 3) * item_count; - k = &tk_8; - } - - k->setArg(1, (cl_uint)prim_count); - k->setArg(2, (cl_uint)bin_count); - k->setArg(3, bin_dim); - - m_cl.queue[2].enqueueNDRangeKernel(*k, cl::NullRange, cl::NDRange(bin_w, bin_h, group_count), cl::NDRange(bin_w, bin_h, item_count)); - } - else - { - uint32 item_count = std::min(bin_count, m_cl.WIs); - uint32 group_count = batch_count * item_count; - - tk.setArg(1, (cl_uint)prim_count); - tk.setArg(2, (cl_uint)bin_count); - tk.setArg(3, bin_dim); - - m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count)); - } - - if(0) - { - gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env)); - m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr); - } - } - - std::list> jobs(head, next); - - JoinTFX(jobs); - - EnqueueTFX(jobs, bin_count, bin_dim); - - if(total_prim_count > MAX_PRIM_COUNT) - { - prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT); - - (*job)->ib_start += prim_count * n * sizeof(uint32); - (*job)->prim_count -= prim_count; - - next = job; // try again for the remainder - - //printf("split %d\n", (*job)->prim_count); - } - - break; - } - } - - head = next; - } - } - catch(cl::Error err) - { - printf("%s (%d)\n", err.what(), err.err()); - } - - try - { - cl::Event e; - m_cl.queue[2].enqueueMarker(&e); - e.setCallback(CL_COMPLETE, ReleasePageEvent, data); - } - catch(cl::Error err) - { - printf("%s (%d)\n", err.what(), err.err()); - - delete data; - } - - m_jobs.clear(); - - m_vb_count = 0; - - m_cl.vb.head = m_cl.vb.tail; - m_cl.ib.head = m_cl.ib.tail; - m_cl.pb.head = m_cl.pb.tail; - - m_cl.Map(); -} - -void GSRendererCL::EnqueueTFX(std::list>& jobs, uint32 bin_count, const cl_uchar4& bin_dim) -{ - cl_kernel tfx_prev = NULL; - - uint32 prim_start = 0; - - for(auto i : jobs) - { - ASSERT(prim_start < MAX_PRIM_COUNT); - - tfxcount++; - - uint32 prim_count = std::min(i->prim_count, MAX_PRIM_COUNT - prim_start); - - cl::Kernel& tfx = m_cl.GetTFXKernel(i->sel); - - cl::Buffer* tex = UpdateTextureCache(i.get()) ? &m_cl.tex : &m_cl.vm; - - tfx.setArg(2, sizeof(*tex), tex); - - if(tfx_prev != tfx()) - { - tfx.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]); - tfx.setArg(4, (cl_uint)m_pb_start); - - tfx_prev = tfx(); - } - - tfx.setArg(5, (cl_uint)prim_start); - tfx.setArg(6, (cl_uint)prim_count); - tfx.setArg(7, (cl_uint)bin_count); - tfx.setArg(8, bin_dim); - tfx.setArg(9, i->fbp); - tfx.setArg(10, i->zbp); - tfx.setArg(11, i->bw); - - GSVector4i r = GSVector4i::load(&i->rect); - - r = r.ralign(GSVector2i(8, 8)); - - m_cl.queue[2].enqueueNDRangeKernel(tfx, cl::NDRange(r.left, r.top), cl::NDRange(r.width(), r.height()), cl::NDRange(8, 8)); - - tfxpixels += r.width() * r.height(); - - InvalidateTextureCache(i.get()); - - prim_start += prim_count; - } -} - -void GSRendererCL::JoinTFX(std::list>& jobs) -{ - // join tfx kernel calls where the selector and fbp/zbp/bw/fpsm/zpsm are the same and src_pages != prev dst_pages - - //printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start); - - tfxselcount += jobs.size(); - - auto next = jobs.begin(); - - while(next != jobs.end()) - { - auto prev = next++; - - if(next == jobs.end()) - { - break; - } - - TFXSelector prev_sel = (*prev)->sel; - TFXSelector next_sel = (*next)->sel; - - prev_sel.ababcd = next_sel.ababcd = 0; - prev_sel.wms = next_sel.wms = 0; - prev_sel.wmt = next_sel.wmt = 0; - prev_sel.noscissor = next_sel.noscissor = prev_sel.noscissor | next_sel.noscissor; - prev_sel.merged = next_sel.merged = 0; - - if(prev_sel != next_sel - || (*prev)->fbp != (*next)->fbp - || (*prev)->zbp != (*next)->zbp - || (*prev)->bw != (*next)->bw - || (*prev)->fpsm != (*next)->fpsm - || (*prev)->zpsm != (*next)->zpsm) - { - continue; - } - - if((*prev)->dst_pages != NULL && (*next)->src_pages != NULL) - { - bool overlap = false; - - for(int i = 0; i < 4; i++) - { - if(!((*prev)->dst_pages[i] & (*next)->src_pages[i]).eq(GSVector4i::zero())) - { - overlap = true; - - break; - } - } - - if(overlap) - { - continue; - } - } - - if((*prev)->src_pages != NULL) - { - GSVector4i* src_pages = (*next)->GetSrcPages(); - - for(int i = 0; i < 4; i++) - { - src_pages[i] |= (*prev)->src_pages[i]; - } - } - - if((*prev)->dst_pages != NULL) - { - GSVector4i* dst_pages = (*next)->GetDstPages(); - - for(int i = 0; i < 4; i++) - { - dst_pages[i] |= (*prev)->dst_pages[i]; - } - } - - GSVector4i prev_rect = GSVector4i::load(&(*prev)->rect); - GSVector4i next_rect = GSVector4i::load(&(*next)->rect); - - GSVector4i::store(&(*next)->rect, prev_rect.runion(next_rect)); - - (*next)->prim_count += (*prev)->prim_count; - (*next)->ib_start = (*prev)->ib_start; - - (*next)->sel = next_sel; - (*next)->sel.merged = 1; - - jobs.erase(prev); - - //if((*prev)->sel != (*next)->sel) printf("%d %016llx %016llx\n", jobs.size(), (*prev)->sel.key, (*next)->sel.key); - } - - tfxdiffselcount += jobs.size(); - - //printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start); -} - -bool GSRendererCL::UpdateTextureCache(TFXJob* job) -{ - if(job->src_pages == NULL) return false; - - bool overlap = false; - bool invalid = false; - - if(job->dst_pages != NULL) - { - bool can_overlap = job->sel.fwrite && GSUtil::HasSharedBits(job->tpsm, job->fpsm) || job->sel.zwrite && GSUtil::HasSharedBits(job->tpsm, job->zpsm); - - for(int i = 0; i < 4; i++) - { - if(!(job->src_pages[i] & job->dst_pages[i]).eq(GSVector4i::zero())) - { - overlap = can_overlap; // gow, re4 - } - - if(!(m_tc_pages[i] & job->src_pages[i]).eq(GSVector4i::zero())) - { - invalid = true; - } - } - } - - if(!invalid) - { - return true; // all needed pages are valid in texture cache, use it - } - - if(!overlap) - { - return false; // no overlap, but has invalid pages, don't use texture cache - } - - // overlap && invalid, update and use texture cache - - int count = 0; - - for(int i = 0; i < 4; i++) - { - GSVector4i pages = m_tc_pages[i] & job->src_pages[i]; - - if(pages.eq(GSVector4i::zero())) continue; - - m_tc_pages[i] &= ~job->src_pages[i]; - - for(int j = 0; j < 4; j++) - { - if(pages.u32[j] == 0) continue; - - if(pages.u32[j] == 0xffffffff) - { - size_t offset = (i * sizeof(GSVector4i) + j * sizeof(uint32)) * 8 * PAGE_SIZE; - - m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, PAGE_SIZE * 32); - - if(LOG) { fprintf(s_fp, "tc (%d x32)\n", offset >> 13); fflush(s_fp); } - - pageuploadcount++; - count += 32; - - continue; - } - - for(int k = 0; k < 4; k++) - { - uint8 b = pages.u8[j * 4 + k]; - - if(b == 0) continue; - - if(b == 0xff) - { - size_t offset = (i * sizeof(GSVector4i) + (j * 4 + k)) * 8 * PAGE_SIZE; - - m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, PAGE_SIZE * 8); - - if(LOG) { fprintf(s_fp, "tc (%d x8)\n", offset >> 13); fflush(s_fp); } - - pageuploadcount++; - count += 8; - - continue; - } - - for(int l = 0; l < 8; l++) - { - if(b & (1 << l)) - { - size_t offset = ((i * sizeof(GSVector4i) + (j * 4 + k)) * 8 + l) * PAGE_SIZE; - - m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, PAGE_SIZE); - - if(LOG) { fprintf(s_fp, "tc (%d x1)\n", offset >> 13); fflush(s_fp); } - - pageuploadcount++; - count++; - } - } - } - } - } - - if(count > 0) - { - pageuploads += count; - } - - return true; -} - -void GSRendererCL::InvalidateTextureCache(TFXJob* job) -{ - if(job->dst_pages == NULL) return; - - for(int i = 0; i < 4; i++) - { - m_tc_pages[i] |= job->dst_pages[i]; - } -} - -void GSRendererCL::UsePages(uint32* p) -{ - for(int l = 0; l < 2; l++) - { - for(int i = 0; i < 4; i++) - { - GSVector4i* v = &m_rw_pages[l][i]; - - if(v->eq(GSVector4i::zero())) continue; - - for(int j = 0; j < 4; j++) - { - unsigned long index; - unsigned long mask = v->u32[j]; - - if(mask == 0) continue; - - int o = (i << 7) | (j << 5); - - if(mask == 0xffffffff) - { - for(int index = 0; index < 32; index++) - { - //_InterlockedIncrement16((short*)&m_rw_pages_rendering[index | o] + l); - if (l == 0) - m_rw_pages_rendering[index | o] += 1; - else - m_rw_pages_rendering[index | o] += 0x10000; - - *p++ = index | o; - } - } - else - { - while(_BitScanForward(&index, mask)) - { - mask &= ~(1 << index); - - //_InterlockedIncrement16((short*)&m_rw_pages_rendering[index | o] + l); - if (l == 0) - m_rw_pages_rendering[index | o] += 1; - else - m_rw_pages_rendering[index | o] += 0x10000; - - *p++ = index | o; - } - } - } - - *v = GSVector4i::zero(); - } - - *p++ = (uint32)GSOffset::EOP; - } -} - -void GSRendererCL::ReleasePages(uint32* pages) -{ - const uint32* p = pages; - - for(; *p != GSOffset::EOP; p++) - { - m_rw_pages_rendering[*p] -= 1; - //_InterlockedDecrement16((short*)&m_rw_pages_rendering[*p] + 0); - } - - p++; - - for(; *p != GSOffset::EOP; p++) - { - m_rw_pages_rendering[*p] -= 0x10000; - //_InterlockedDecrement16((short*)&m_rw_pages_rendering[*p] + 1); - } -} - -void CL_CALLBACK GSRendererCL::ReleasePageEvent(cl_event event, cl_int event_command_exec_status, void* user_data) -{ - if(event_command_exec_status == CL_COMPLETE) - { - cb_data* data = (cb_data*)user_data; - - data->r->ReleasePages(data->pages); - - delete data; - } -} - -static int RemapPSM(int psm) -{ - switch(psm) - { - default: - case PSM_PSMCT32: psm = 0; break; - case PSM_PSMCT24: psm = 1; break; - case PSM_PSMCT16: psm = 2; break; - case PSM_PSMCT16S: psm = 3; break; - case PSM_PSMZ32: psm = 4; break; - case PSM_PSMZ24: psm = 5; break; - case PSM_PSMZ16: psm = 6; break; - case PSM_PSMZ16S: psm = 7; break; - case PSM_PSMT8: psm = 8; break; - case PSM_PSMT4: psm = 9; break; - case PSM_PSMT8H: psm = 10; break; - case PSM_PSMT4HL: psm = 11; break; - case PSM_PSMT4HH: psm = 12; break; - } - - return psm; -} - -bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count) -{ - const GSDrawingEnvironment& env = m_env; - const GSDrawingContext* context = m_context; - const GS_PRIM_CLASS primclass = m_vt.m_primclass; - - TFXSelector sel; - - sel.key = 0; - - sel.atst = ATST_ALWAYS; - sel.tfx = TFX_NONE; - sel.ababcd = 0xff; - sel.prim = primclass; - - uint32 fm = context->FRAME.FBMSK; - uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; - - if(context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER) - { - fm = 0xffffffff; - zm = 0xffffffff; - } - - if(PRIM->TME) - { - if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) - { - m_mem.m_clut.Read32(context->TEX0, env.TEXA); - } - } - - if(context->TEST.ATE) - { - if(!TryAlphaTest(fm, zm)) - { - sel.atst = context->TEST.ATST; - sel.afail = context->TEST.AFAIL; - pb->aref = context->TEST.AREF; - - switch(sel.atst) - { - case ATST_LESS: - sel.atst = ATST_LEQUAL; - pb->aref--; - break; - case ATST_GREATER: - sel.atst = ATST_GEQUAL; - pb->aref++; - break; - } - } - } - - bool fwrite; - bool zwrite = zm != 0xffffffff; - - switch(context->FRAME.PSM) - { - default: - case PSM_PSMCT32: - case PSM_PSMZ32: - fwrite = fm != 0xffffffff; - break; - case PSM_PSMCT24: - case PSM_PSMZ24: - fwrite = (fm & 0x00ffffff) != 0x00ffffff; - break; - case PSM_PSMCT16: - case PSM_PSMCT16S: - case PSM_PSMZ16: - case PSM_PSMZ16S: - fwrite = (fm & 0x80f8f8f8) != 0x80f8f8f8; - break; - } - - if(!fwrite && !zwrite) return false; - - bool ftest = sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; - bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS; - - sel.fwrite = fwrite; - sel.ftest = ftest; - sel.zwrite = zwrite; - sel.ztest = ztest; - - if(fwrite || ftest) - { - sel.fpsm = RemapPSM(context->FRAME.PSM); - - if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff) - { - sel.iip = PRIM->IIP; - } - - if(PRIM->TME) - { - sel.tfx = context->TEX0.TFX; - sel.tcc = context->TEX0.TCC; - sel.fst = PRIM->FST; - sel.ltf = m_vt.IsLinear(); - sel.tpsm = RemapPSM(context->TEX0.PSM); - sel.aem = m_env.TEXA.AEM; - - pb->tbp[0] = context->TEX0.TBP0; - pb->tbw[0] = context->TEX0.TBW; - pb->ta0 = m_env.TEXA.TA0; - pb->ta1 = m_env.TEXA.TA1; - - if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) - { - sel.tlu = 1; - - memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal); - } - - sel.wms = ((uint32)context->CLAMP.WMS + 1) & 3; - sel.wmt = ((uint32)context->CLAMP.WMT + 1) & 3; - - if(sel.tfx == TFX_MODULATE && sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128))) - { - // modulate does not do anything when vertex color is 0x80 - - sel.tfx = TFX_DECAL; - } - - bool mipmap = IsMipMapActive(); - - GIFRegTEX0 TEX0 = m_context->GetSizeFixedTEX0(m_vt.m_min.t.xyxy(m_vt.m_max.t), m_vt.IsLinear(), mipmap); - - GSVector4i r; - - GetTextureMinMax(r, TEX0, context->CLAMP, sel.ltf); - - GSVector4i* src_pages = job->GetSrcPages(); - - GSOffset* o = m_mem.GetOffset(context->TEX0.TBP0, context->TEX0.TBW, context->TEX0.PSM); - - o->GetPagesAsBits(r, m_tmp_pages); - - for(int i = 0; i < 4; i++) - { - src_pages[i] |= m_tmp_pages[i]; - } - - if(mipmap) - { - // TEX1.MMIN - // 000 p - // 001 l - // 010 p round - // 011 p tri - // 100 l round - // 101 l tri - - if(m_vt.m_lod.x > 0) - { - sel.ltf = context->TEX1.MMIN >> 2; - } - else - { - // TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0 - } - - sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri - sel.lcm = context->TEX1.LCM; - - int mxl = std::min((int)context->TEX1.MXL, 6) << 16; - int k = context->TEX1.K << 12; - - if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL) - { - k = (int)m_vt.m_lod.x << 16; // set lod to max level - - sel.lcm = 1; // lod is constant - sel.mmin = 1; // tri-linear is meaningless - } - - if(sel.mmin == 2) - { - mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?) - } - - if(sel.fst) - { - ASSERT(sel.lcm == 1); - ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu) - - sel.lcm = 1; - } - - if(sel.lcm) - { - int lod = std::max(std::min(k, mxl), 0); - - if(sel.mmin == 1) - { - lod = (lod + 0x8000) & 0xffff0000; // rounding - } - - pb->lod = lod; - - // TODO: lot to optimize when lod is constant - } - else - { - pb->mxl = mxl; - pb->l = (float)(-0x10000 << context->TEX1.L); - pb->k = (float)k; - } - - GIFRegTEX0 MIP_TEX0 = TEX0; - GIFRegCLAMP MIP_CLAMP = context->CLAMP; - - GSVector4 tmin = m_vt.m_min.t; - GSVector4 tmax = m_vt.m_max.t; - - static int s_counter = 0; - - for(int i = 1, j = std::min((int)context->TEX1.MXL, 6); i <= j; i++) - { - switch(i) - { - case 1: - MIP_TEX0.TBP0 = context->MIPTBP1.TBP1; - MIP_TEX0.TBW = context->MIPTBP1.TBW1; - break; - case 2: - MIP_TEX0.TBP0 = context->MIPTBP1.TBP2; - MIP_TEX0.TBW = context->MIPTBP1.TBW2; - break; - case 3: - MIP_TEX0.TBP0 = context->MIPTBP1.TBP3; - MIP_TEX0.TBW = context->MIPTBP1.TBW3; - break; - case 4: - MIP_TEX0.TBP0 = context->MIPTBP2.TBP4; - MIP_TEX0.TBW = context->MIPTBP2.TBW4; - break; - case 5: - MIP_TEX0.TBP0 = context->MIPTBP2.TBP5; - MIP_TEX0.TBW = context->MIPTBP2.TBW5; - break; - case 6: - MIP_TEX0.TBP0 = context->MIPTBP2.TBP6; - MIP_TEX0.TBW = context->MIPTBP2.TBW6; - break; - default: - __assume(0); - } - - pb->tbp[i] = MIP_TEX0.TBP0; - pb->tbw[i] = MIP_TEX0.TBW; - - if(MIP_TEX0.TW > 0) MIP_TEX0.TW--; - if(MIP_TEX0.TH > 0) MIP_TEX0.TH--; - - MIP_CLAMP.MINU >>= 1; - MIP_CLAMP.MINV >>= 1; - MIP_CLAMP.MAXU >>= 1; - MIP_CLAMP.MAXV >>= 1; - - m_vt.m_min.t *= 0.5f; - m_vt.m_max.t *= 0.5f; - - GSVector4i r; - - GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, sel.ltf); - - GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM); - - o->GetPagesAsBits(r, m_tmp_pages); - - for(int i = 0; i < 4; i++) - { - src_pages[i] |= m_tmp_pages[i]; - } - } - - s_counter++; - - m_vt.m_min.t = tmin; - m_vt.m_max.t = tmax; - } - else - { - if(sel.fst == 0) - { - // skip per pixel division if q is constant - - GSVertexCL* RESTRICT v = vertex; - - if(m_vt.m_eq.q) - { - sel.fst = 1; - - const GSVector4& t = v[index[0]].t; - - if(t.z != 1.0f) - { - GSVector4 w = t.zzzz().rcpnr(); - - for(int i = 0, j = vertex_count; i < j; i++) - { - GSVector4 t = v[i].t; - - v[i].t = (t * w).xyzw(t); - } - } - } - else if(primclass == GS_SPRITE_CLASS) - { - sel.fst = 1; - - for(int i = 0, j = vertex_count; i < j; i += 2) - { - GSVector4 t0 = v[i + 0].t; - GSVector4 t1 = v[i + 1].t; - - GSVector4 w = t1.zzzz().rcpnr(); - - v[i + 0].t = (t0 * w).xyzw(t0); - v[i + 1].t = (t1 * w).xyzw(t1); - } - } - } - } - - int tw = 1 << TEX0.TW; - int th = 1 << TEX0.TH; - - switch(context->CLAMP.WMS) - { - case CLAMP_REPEAT: - pb->minu = tw - 1; - pb->maxu = 0; - //gd.t.mask.u32[0] = 0xffffffff; - break; - case CLAMP_CLAMP: - pb->minu = 0; - pb->maxu = tw - 1; - //gd.t.mask.u32[0] = 0; - break; - case CLAMP_REGION_CLAMP: - pb->minu = std::min((int)context->CLAMP.MINU, tw - 1); - pb->maxu = std::min((int)context->CLAMP.MAXU, tw - 1); - //gd.t.mask.u32[0] = 0; - break; - case CLAMP_REGION_REPEAT: - pb->minu = (int)context->CLAMP.MINU & (tw - 1); - pb->maxu = (int)context->CLAMP.MAXU & (tw - 1); - //gd.t.mask.u32[0] = 0xffffffff; - break; - default: - __assume(0); - } - - switch(context->CLAMP.WMT) - { - case CLAMP_REPEAT: - pb->minv = th - 1; - pb->maxv = 0; - //gd.t.mask.u32[2] = 0xffffffff; - break; - case CLAMP_CLAMP: - pb->minv = 0; - pb->maxv = th - 1; - //gd.t.mask.u32[2] = 0; - break; - case CLAMP_REGION_CLAMP: - pb->minv = std::min((int)context->CLAMP.MINV, th - 1); - pb->maxv = std::min((int)context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256) - //gd.t.mask.u32[2] = 0; - break; - case CLAMP_REGION_REPEAT: - pb->minv = (int)context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127 - pb->maxv = (int)context->CLAMP.MAXV & (th - 1); - //gd.t.mask.u32[2] = 0xffffffff; - break; - default: - __assume(0); - } - } - - if(PRIM->FGE) - { - sel.fge = 1; - pb->fog = env.FOGCOL.u32[0]; - } - - if(context->FRAME.PSM != PSM_PSMCT24) - { - sel.date = context->TEST.DATE; - sel.datm = context->TEST.DATM; - } - - if(!IsOpaque()) - { - sel.abe = PRIM->ABE; - sel.ababcd = context->ALPHA.u32[0]; - - if(env.PABE.PABE) - { - sel.pabe = 1; - } - - if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS)) - { - sel.aa1 = 1; - } - - pb->afix = context->ALPHA.FIX; - } - - if(sel.date || sel.aba == 1 || sel.abb == 1 || sel.abc == 1 && (sel.fpsm & 3) != 1 || sel.abd == 1) - { - sel.rfb = 1; - } - else - { - if(fwrite) - { - if(sel.atst != ATST_ALWAYS && sel.afail == AFAIL_RGB_ONLY - || (sel.fpsm & 3) == 0 && fm != 0 - || (sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask - || (sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0) - { - sel.rfb = 1; - } - } - } - - sel.colclamp = env.COLCLAMP.CLAMP; - sel.fba = context->FBA.FBA; - - if(env.DTHE.DTHE) - { - sel.dthe = 1; - - GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16); - GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16); - GSVector4i dimx2 = env.dimx[5].sll32(16).sra32(16); - GSVector4i dimx3 = env.dimx[7].sll32(16).sra32(16); - - pb->dimx = dimx0.ps32(dimx1).ps16(dimx2.ps32(dimx3)); - } - } - - if(zwrite || ztest) - { - sel.zpsm = RemapPSM(context->ZBUF.PSM); - sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS; - - if(ztest) - { - sel.rzb = 1; - } - else - { - if(zwrite) - { - if(sel.atst != ATST_ALWAYS && (sel.afail == AFAIL_FB_ONLY || sel.afail == AFAIL_RGB_ONLY) - || (sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask - { - sel.rzb = 1; - } - } - } - } - - pb->fm = fm; - pb->zm = zm; - - if((sel.fpsm & 3) == 1) - { - pb->fm |= 0xff000000; - } - else if((sel.fpsm & 3) >= 2) - { - uint32 rb = pb->fm & 0x00f800f8; - uint32 ga = pb->fm & 0x8000f800; - - pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000; - } - - if((sel.zpsm & 3) == 1) - { - pb->zm |= 0xff000000; - } - else if((sel.zpsm & 3) >= 2) - { - pb->zm |= 0xffff0000; - } - - pb->fbp = context->FRAME.Block(); - pb->zbp = context->ZBUF.Block(); - pb->bw = context->FRAME.FBW; - - pb->sel = sel; - - return true; -} - -// - -GSRendererCL::TFXJob::TFXJob() - : src_pages(NULL) - , dst_pages(NULL) -{ -} - -GSRendererCL::TFXJob::~TFXJob() -{ - if(src_pages != NULL) _aligned_free(src_pages); - if(dst_pages != NULL) _aligned_free(dst_pages); -} - -GSVector4i* GSRendererCL::TFXJob::GetSrcPages() -{ - if(src_pages == NULL) - { - src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); - - src_pages[0] = GSVector4i::zero(); - src_pages[1] = GSVector4i::zero(); - src_pages[2] = GSVector4i::zero(); - src_pages[3] = GSVector4i::zero(); - } - - return src_pages; -} - -GSVector4i* GSRendererCL::TFXJob::GetDstPages() -{ - if(dst_pages == NULL) - { - dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16); - - dst_pages[0] = GSVector4i::zero(); - dst_pages[1] = GSVector4i::zero(); - dst_pages[2] = GSVector4i::zero(); - dst_pages[3] = GSVector4i::zero(); - } - - return dst_pages; -} - -// - -//#define IOCL_DEBUG - -GSRendererCL::CL::CL() -{ - WIs = INT_MAX; - version = INT_MAX; - - std::string ocldev = theApp.GetConfigS("ocldev"); - -#ifdef IOCL_DEBUG - ocldev = "Intel(R) Corporation Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz OpenCL C 1.2 CPU"; -#endif - - std::list dl; - - GSUtil::GetDeviceDescs(dl); - - for(auto d : dl) - { - if(d.name == ocldev) - { - devs.push_back(d); - - WIs = std::min(WIs, (uint32)d.device.getInfo()); - version = std::min(version, d.version); - - break; // TODO: multiple devices? - } - } - - if(devs.empty() && !dl.empty()) - { - auto d = dl.front(); - - devs.push_back(d); - - WIs = std::min(WIs, (uint32)d.device.getInfo()); - version = std::min(version, d.version); - } - - if(devs.empty()) - { - throw new std::runtime_error("OpenCL device not found"); - } - - std::vector tmp; - - for(auto d : devs) tmp.push_back(d.device); - - context = cl::Context(tmp); - - queue[0] = cl::CommandQueue(context); - queue[1] = cl::CommandQueue(context); - queue[2] = cl::CommandQueue(context); - - std::vector buff; - - if(theApp.LoadResource(IDR_TFX_CL, buff)) - { - kernel_str = std::string(buff.data(), buff.size()); - } - - vb.head = vb.tail = vb.size = 0; - ib.head = ib.tail = ib.size = 0; - pb.head = pb.tail = pb.size = 0; - - vb.mapped_ptr = vb.ptr = NULL; - ib.mapped_ptr = ib.ptr = NULL; - pb.mapped_ptr = pb.ptr = NULL; - - pb.size = TFX_PARAM_SIZE * 256; - pb.buff[0] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); - pb.buff[1] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size); - - env = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(gs_env)); - - wqidx = 0; - wq = &queue[0]; -} - -GSRendererCL::CL::~CL() -{ - Unmap(); -} - -void GSRendererCL::CL::Map() -{ - Unmap(); - - cl_map_flags flags = version >= 120 ? CL_MAP_WRITE_INVALIDATE_REGION : CL_MAP_WRITE; - - if(vb.head < vb.size) - { - vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, flags, vb.head, vb.size - vb.head); - vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; - ASSERT(((size_t)vb.ptr & 15) == 0); - } - - if(ib.head < ib.size) - { - ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, flags, ib.head, ib.size - ib.head); - ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head; - } - - if(pb.head < pb.size) - { - pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, flags, pb.head, pb.size - pb.head); - pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; - ASSERT(((size_t)pb.ptr & 15) == 0); - } -} - -void GSRendererCL::CL::Unmap() -{ - if(vb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(vb.buff[wqidx], vb.mapped_ptr); - if(ib.mapped_ptr != NULL) wq->enqueueUnmapMemObject(ib.buff[wqidx], ib.mapped_ptr); - if(pb.mapped_ptr != NULL) wq->enqueueUnmapMemObject(pb.buff[wqidx], pb.mapped_ptr); - - vb.mapped_ptr = vb.ptr = NULL; - ib.mapped_ptr = ib.ptr = NULL; - pb.mapped_ptr = pb.ptr = NULL; -} - -cl::Kernel GSRendererCL::CL::Build(const char* entry, std::ostringstream& opt) -{ - cl::Program program; - - if(version >= 120) - { - cl::Program::Binaries binaries; - - try - { - for(auto d : devs) - { - std::string path = d.tmppath + "/" + entry; - - FILE* f = fopen(path.c_str(), "rb"); - - if(f != NULL) - { - fseek(f, 0, SEEK_END); - long size = ftell(f); - std::pair b(new char[size], size); - fseek(f, 0, SEEK_SET); - fread(b.first, b.second, 1, f); - fclose(f); - - binaries.push_back(b); - } - else - { - break; - } - } - - if(binaries.size() == devs.size()) - { - std::vector tmp; - - for(auto d : devs) tmp.push_back(d.device); - - program = cl::Program(context, tmp, binaries); - - AddDefs(opt); - - program.build(opt.str().c_str()); - - cl::Kernel kernel = cl::Kernel(program, entry); - - return kernel; - } - } - catch(cl::Error err) - { - printf("%s (%d)\n", err.what(), err.err()); - } - - for(auto b : binaries) - { - delete [] (char*)b.first; - } - } - - try - { - printf("building kernel (%s)\n", entry); - - program = cl::Program(context, kernel_str); - - AddDefs(opt); - - program.build(opt.str().c_str()); - } - catch(cl::Error err) - { - if(err.err() == CL_BUILD_PROGRAM_FAILURE) - { - for(auto d : devs) - { - auto s = program.getBuildInfo(d.device); - - printf("kernel (%s) build error: %s\n", entry, s.c_str()); - } - } - - throw err; - } - - if(version >= 120) - { - try - { - std::vector sizes = program.getInfo(); - std::vector binaries = program.getInfo(); - - for(size_t i = 0; i < binaries.size(); i++) - { - std::string path = devs[i].tmppath + "/" + entry; - - FILE* f = fopen(path.c_str(), "wb"); - - if(f != NULL) - { - fwrite(binaries[i], sizes[i], 1, f); - fclose(f); - } - - delete [] binaries[i]; - } - } - catch(cl::Error err) - { - printf("%s (%d)\n", err.what(), err.err()); - } - } - - return cl::Kernel(program, entry); -} - -void GSRendererCL::CL::AddDefs(std::ostringstream& opt) -{ - if(version == 110) opt << "-cl-std=CL1.1 "; - else opt << "-cl-std=CL1.2 "; - opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "; - opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "; - opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "; - opt << "-D MAX_PRIM_PER_BATCH=" << MAX_PRIM_PER_BATCH << "u "; - opt << "-D MAX_BATCH_COUNT=" << MAX_BATCH_COUNT << "u "; - opt << "-D BIN_SIZE_BITS=" << BIN_SIZE_BITS << " "; - opt << "-D BIN_SIZE=" << BIN_SIZE << "u "; - opt << "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u "; - opt << "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u "; - opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u "; -#ifdef IOCL_DEBUG - opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" "; -#endif -} - -cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel) -{ - auto i = prim_map.find(sel); - - if(i != prim_map.end()) - { - return i->second; - } - - char entry[256]; - - sprintf(entry, "prim_%02x", sel.key); - - std::ostringstream opt; - - opt << "-D KERNEL_PRIM=" << entry << " "; - opt << "-D PRIM=" << sel.prim << " "; - - cl::Kernel k = Build(entry, opt); - - prim_map[sel] = k; - - k.setArg(0, env); - - return prim_map[sel]; -} - -cl::Kernel& GSRendererCL::CL::GetTileKernel(const TileSelector& sel) -{ - auto i = tile_map.find(sel); - - if(i != tile_map.end()) - { - return i->second; - } - - char entry[256]; - - sprintf(entry, "tile_%02x", sel.key); - - std::ostringstream opt; - - opt << "-D KERNEL_TILE=" << entry << " "; - opt << "-D PRIM=" << sel.prim << " "; - opt << "-D MODE=" << sel.mode << " "; - opt << "-D CLEAR=" << sel.clear << " "; - - cl::Kernel k = Build(entry, opt); - - tile_map[sel] = k; - - k.setArg(0, env); - - return tile_map[sel]; -} - -cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel) -{ - auto i = tfx_map.find(sel); - - if(i != tfx_map.end()) - { - return i->second; - } - - char entry[256]; - - sprintf(entry, "tfx_%016llx", sel.key); - - std::ostringstream opt; - - opt << "-D KERNEL_TFX=" << entry << " "; - opt << "-D FPSM=" << sel.fpsm << " "; - opt << "-D ZPSM=" << sel.zpsm << " "; - opt << "-D ZTST=" << sel.ztst << " "; - opt << "-D ATST=" << sel.atst << " "; - opt << "-D AFAIL=" << sel.afail << " "; - opt << "-D IIP=" << sel.iip << " "; - opt << "-D TFX=" << sel.tfx << " "; - opt << "-D TCC=" << sel.tcc << " "; - opt << "-D FST=" << sel.fst << " "; - opt << "-D LTF=" << sel.ltf << " "; - opt << "-D TLU=" << sel.tlu << " "; - opt << "-D FGE=" << sel.fge << " "; - opt << "-D DATE=" << sel.date << " "; - opt << "-D ABE=" << sel.abe << " "; - opt << "-D ABA=" << sel.aba << " "; - opt << "-D ABB=" << sel.abb << " "; - opt << "-D ABC=" << sel.abc << " "; - opt << "-D ABD=" << sel.abd << " "; - opt << "-D PABE=" << sel.pabe << " "; - opt << "-D AA1=" << sel.aa1 << " "; - opt << "-D FWRITE=" << sel.fwrite << " "; - opt << "-D FTEST=" << sel.ftest << " "; - opt << "-D RFB=" << sel.rfb << " "; - opt << "-D ZWRITE=" << sel.zwrite << " "; - opt << "-D ZTEST=" << sel.ztest << " "; - opt << "-D RZB=" << sel.rzb << " "; - opt << "-D WMS=" << sel.wms << " "; - opt << "-D WMT=" << sel.wmt << " "; - opt << "-D DATM=" << sel.datm << " "; - opt << "-D COLCLAMP=" << sel.colclamp << " "; - opt << "-D FBA=" << sel.fba << " "; - opt << "-D DTHE=" << sel.dthe << " "; - opt << "-D PRIM=" << sel.prim << " "; - opt << "-D LCM=" << sel.lcm << " "; - opt << "-D MMIN=" << sel.mmin << " "; - opt << "-D NOSCISSOR=" << sel.noscissor << " "; - opt << "-D TPSM=" << sel.tpsm << " "; - opt << "-D AEM=" << sel.aem << " "; - opt << "-D FB=" << sel.fb << " "; - opt << "-D ZB=" << sel.zb << " "; - opt << "-D MERGED=" << sel.merged << " "; - - cl::Kernel k = Build(entry, opt); - - tfx_map[sel] = k; - - k.setArg(0, env); - k.setArg(1, vm); - - return tfx_map[sel]; -} -#endif diff --git a/plugins/GSdx/Renderers/OpenCL/GSRendererCL.h b/plugins/GSdx/Renderers/OpenCL/GSRendererCL.h deleted file mode 100644 index 3de5dc79d1..0000000000 --- a/plugins/GSdx/Renderers/OpenCL/GSRendererCL.h +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#pragma once - -#include "Renderers/Common/GSRenderer.h" - -#ifdef ENABLE_OPENCL - -struct alignas(32) GSVertexCL -{ - GSVector4 p, t; -}; - -class GSRendererCL : public GSRenderer -{ - static GSVector4 m_pos_scale; - - typedef void (GSRendererCL::*ConvertVertexBufferPtr)(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); - - ConvertVertexBufferPtr m_cvb[4][2][2]; - - template - void ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); - - union PrimSelector - { - struct - { - uint32 prim:2; // 0 - }; - - uint32 key; - - operator uint32() const { return key; } - }; - - union TileSelector - { - struct - { - uint32 prim:2; // 0 - uint32 mode:2; // 2 - uint32 clear:1; // 4 - }; - - uint32 key; - - operator uint32() const { return key; } - }; - - union TFXSelector - { - struct - { - uint32 fpsm:3; // 0 - uint32 zpsm:3; // 3 - uint32 ztst:2; // 6 (0: off, 1: write, 2: test (ge), 3: test (g)) - uint32 atst:3; // 8 - uint32 afail:2; // 11 - uint32 iip:1; // 13 - uint32 tfx:3; // 14 - uint32 tcc:1; // 17 - uint32 fst:1; // 18 - uint32 ltf:1; // 19 - uint32 tlu:1; // 20 - uint32 fge:1; // 21 - uint32 date:1; // 22 - uint32 abe:1; // 23 - uint32 aba:2; // 24 - uint32 abb:2; // 26 - uint32 abc:2; // 28 - uint32 abd:2; // 30 - - uint32 pabe:1; // 32 - uint32 aa1:1; // 33 - uint32 fwrite:1; // 34 - uint32 ftest:1; // 35 - uint32 rfb:1; // 36 - uint32 zwrite:1; // 37 - uint32 ztest:1; // 38 - uint32 rzb:1; // 39 - uint32 wms:2; // 40 - uint32 wmt:2; // 42 - uint32 datm:1; // 44 - uint32 colclamp:1; // 45 - uint32 fba:1; // 46 - uint32 dthe:1; // 47 - uint32 prim:2; // 48 - uint32 lcm:1; // 50 - uint32 mmin:2; // 51 - uint32 noscissor:1; // 53 - uint32 tpsm:4; // 54 - uint32 aem:1; // 58 - uint32 merged:1; // 59 - // TODO - }; - - struct - { - uint32 _pad1:24; - uint32 ababcd:8; - uint32 _pad2:2; - uint32 fb:2; - uint32 _pad3:1; - uint32 zb:2; - }; - - struct - { - uint32 lo; - uint32 hi; - }; - - uint64 key; - - operator uint64() const { return key; } - - bool IsSolidRect() const - { - return prim == GS_SPRITE_CLASS - && iip == 0 - && tfx == TFX_NONE - && abe == 0 - && ztst <= 1 - && atst <= 1 - && date == 0 - && fge == 0; - } - }; - - struct alignas(32) TFXParameter - { - GSVector4i scissor; - GSVector4i dimx; // 4x4 signed char - TFXSelector sel; - uint32 fbp, zbp, bw; - uint32 fm, zm; - uint32 fog; // rgb - uint8 aref, afix; - uint8 ta0, ta1; - uint32 tbp[7], tbw[7]; - int minu, maxu, minv, maxv; // umsk, ufix, vmsk, vfix - int lod; // lcm == 1 - int mxl; - float l; // TEX1.L * -0x10000 - float k; // TEX1.K * 0x10000 - uint32 clut[256]; - }; - - class TFXJob - { - public: - struct { int x, y, z, w; } rect; - TFXSelector sel; - uint32 ib_start; - uint32 prim_count; - GSVector4i* src_pages; // read by any texture level - GSVector4i* dst_pages; // f/z writes to it - uint32 fbp, zbp, bw; - uint32 fpsm, zpsm, tpsm; -#ifdef DEBUG - TFXParameter* pb; -#endif - TFXJob(); - virtual ~TFXJob(); - - GSVector4i* GetSrcPages(); - GSVector4i* GetDstPages(); - }; - - class CL - { - std::string kernel_str; - std::map prim_map; - std::map tile_map; - std::map tfx_map; - - cl::Kernel Build(const char* entry, std::ostringstream& opt); - void AddDefs(std::ostringstream& opt); - - public: - std::vector devs; - cl::Context context; - cl::CommandQueue queue[3]; - cl::Buffer vm; - cl::Buffer tex; - struct { cl::Buffer buff[2]; size_t head, tail, size; unsigned char* ptr; void* mapped_ptr; } vb, ib, pb; - cl::Buffer env; - cl::CommandQueue* wq; - int wqidx; - uint32 WIs; - int version; - - public: - CL(); - virtual ~CL(); - - cl::Kernel& GetPrimKernel(const PrimSelector& sel); - cl::Kernel& GetTileKernel(const TileSelector& sel); - cl::Kernel& GetTFXKernel(const TFXSelector& sel); - - void Map(); - void Unmap(); - }; - - CL m_cl; - std::list> m_jobs; - uint32 m_vb_start; - uint32 m_vb_count; - uint32 m_pb_start; - uint32 m_pb_count; - bool m_synced; - - void Enqueue(); - void EnqueueTFX(std::list>& jobs, uint32 bin_count, const cl_uchar4& bin_dim); - void JoinTFX(std::list>& jobs); - bool UpdateTextureCache(TFXJob* job); - void InvalidateTextureCache(TFXJob* job); - void UsePages(uint32* pages); - void ReleasePages(uint32* pages); - - static void CL_CALLBACK ReleasePageEvent(cl_event event, cl_int event_command_exec_status, void* user_data); - -protected: - GSTexture* m_texture[2]; - uint8* m_output; - - GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r - GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?) // TODO: this should be block level, too many overlaps inside pages with render targets - GSVector4i m_tmp_pages[4]; - uint32 m_tmp_pages2[MAX_PAGES + 1]; - std::array, 512> m_rw_pages_rendering; // pages that are currently in-use - - void Reset(); - void VSync(int field); - void ResetDevice(); - GSTexture* GetOutput(int i, int& y_offset); - - void Draw(); - void Sync(int reason); - void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); - void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false); - - bool SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count); - -public: - static void InitVectors(); - - GSRendererCL(); - virtual ~GSRendererCL(); -}; - -#endif diff --git a/plugins/GSdx/Window/GSSettingsDlg.cpp b/plugins/GSdx/Window/GSSettingsDlg.cpp index 4db657ab07..760b994107 100644 --- a/plugins/GSdx/Window/GSSettingsDlg.cpp +++ b/plugins/GSdx/Window/GSSettingsDlg.cpp @@ -39,23 +39,10 @@ GSSettingsDlg::GSSettingsDlg() { auto is_d3d11_renderer = [](const auto &renderer) { const GSRendererType type = static_cast(renderer.value); - return type == GSRendererType::DX1011_HW || type == GSRendererType::DX1011_SW || type == GSRendererType::DX1011_OpenCL; + return type == GSRendererType::DX1011_HW || type == GSRendererType::DX1011_SW; }; m_renderers.erase(std::remove_if(m_renderers.begin(), m_renderers.end(), is_d3d11_renderer), m_renderers.end()); } - -#ifdef ENABLE_OPENCL - std::list ocldevs; - - GSUtil::GetDeviceDescs(ocldevs); - - int index = 0; - - for(auto dev : ocldevs) - { - m_ocl_devs.push_back(GSSetting(index++, dev.name.c_str(), "")); - } -#endif } std::vector GSSettingsDlg::EnumerateD3D11Adapters() @@ -98,27 +85,12 @@ void GSSettingsDlg::OnInit() __super::OnInit(); GSRendererType renderer = GSRendererType(theApp.GetConfigI("Renderer")); - const bool dx11 = renderer == GSRendererType::DX1011_HW || renderer == GSRendererType::DX1011_SW || renderer == GSRendererType::DX1011_OpenCL; + const bool dx11 = renderer == GSRendererType::DX1011_HW || renderer == GSRendererType::DX1011_SW; if (renderer == GSRendererType::Undefined || m_d3d11_adapters.empty() && dx11) renderer = GSUtil::GetBestRenderer(); ComboBoxInit(IDC_RENDERER, m_renderers, static_cast(renderer)); UpdateAdapters(); - std::string ocldev = theApp.GetConfigS("ocldev"); - - unsigned int ocl_sel = 0; - - for(unsigned int i = 0; i < m_ocl_devs.size(); i++) - { - if(ocldev == m_ocl_devs[i].name) - { - ocl_sel = i; - - break; - } - } - - ComboBoxInit(IDC_OPENCL_DEVICE, m_ocl_devs, ocl_sel); ComboBoxInit(IDC_MIPMAP_HW, theApp.m_gs_hw_mipmapping, theApp.GetConfigI("mipmap_hw")); ComboBoxInit(IDC_INTERLACE, theApp.m_gs_interlace, theApp.GetConfigI("interlace")); @@ -220,13 +192,6 @@ bool GSSettingsDlg::OnCommand(HWND hWnd, UINT id, UINT code) theApp.SetConfig("Adapter", (*m_current_adapters)[data].id.c_str()); } - if(ComboBoxGetSelData(IDC_OPENCL_DEVICE, data)) - { - if ((UINT)data < m_ocl_devs.size()) { - theApp.SetConfig("ocldev", m_ocl_devs[(int)data].name.c_str()); - } - } - if(ComboBoxGetSelData(IDC_RENDERER, data)) { theApp.SetConfig("Renderer", (int)data); @@ -304,7 +269,7 @@ void GSSettingsDlg::UpdateAdapters() return; const GSRendererType renderer = static_cast(data); - const bool dx11 = renderer == GSRendererType::DX1011_HW || renderer == GSRendererType::DX1011_SW || renderer == GSRendererType::DX1011_OpenCL; + const bool dx11 = renderer == GSRendererType::DX1011_HW || renderer == GSRendererType::DX1011_SW; EnableWindow(GetDlgItem(m_hWnd, IDC_ADAPTER), dx11); EnableWindow(GetDlgItem(m_hWnd, IDC_ADAPTER_TEXT), dx11); @@ -345,12 +310,11 @@ void GSSettingsDlg::UpdateControls() { const GSRendererType renderer = static_cast(i); - const bool dx11 = renderer == GSRendererType::DX1011_HW || renderer == GSRendererType::DX1011_SW || renderer == GSRendererType::DX1011_OpenCL; - const bool ogl = renderer == GSRendererType::OGL_HW || renderer == GSRendererType::OGL_SW || renderer == GSRendererType::OGL_OpenCL; + const bool dx11 = renderer == GSRendererType::DX1011_HW || renderer == GSRendererType::DX1011_SW; + const bool ogl = renderer == GSRendererType::OGL_HW || renderer == GSRendererType::OGL_SW; const bool hw = renderer == GSRendererType::DX1011_HW || renderer == GSRendererType::OGL_HW; const bool sw = renderer == GSRendererType::DX1011_SW || renderer == GSRendererType::OGL_SW; - const bool ocl = renderer == GSRendererType::DX1011_OpenCL || renderer == GSRendererType::OGL_OpenCL; const bool null = renderer == GSRendererType::Null; const int sw_threads = SendMessage(GetDlgItem(m_hWnd, IDC_SWTHREADS), UDM_GETPOS, 0, 0); @@ -360,12 +324,6 @@ void GSSettingsDlg::UpdateControls() ShowWindow(GetDlgItem(m_hWnd, IDC_NULL), null ? SW_SHOW : SW_HIDE); ShowWindow(GetDlgItem(m_hWnd, IDC_LOGOGL), ogl ? SW_SHOW : SW_HIDE); -#ifndef ENABLE_OPENCL - ShowWindow(GetDlgItem(m_hWnd, IDC_OPENCL_TEXT), SW_HIDE); - ShowWindow(GetDlgItem(m_hWnd, IDC_OPENCL_DEVICE), SW_HIDE); -#endif - EnableWindow(GetDlgItem(m_hWnd, IDC_OPENCL_TEXT), ocl); - EnableWindow(GetDlgItem(m_hWnd, IDC_OPENCL_DEVICE), ocl); EnableWindow(GetDlgItem(m_hWnd, IDC_INTERLACE), !null); EnableWindow(GetDlgItem(m_hWnd, IDC_INTERLACE_TEXT), !null); EnableWindow(GetDlgItem(m_hWnd, IDC_FILTER), !null); diff --git a/plugins/GSdx/Window/GSSettingsDlg.h b/plugins/GSdx/Window/GSSettingsDlg.h index e560cb3d1a..2adc296d2d 100644 --- a/plugins/GSdx/Window/GSSettingsDlg.h +++ b/plugins/GSdx/Window/GSSettingsDlg.h @@ -91,8 +91,6 @@ class GSSettingsDlg : public GSDialog std::vector *m_current_adapters; std::string m_last_selected_adapter_id; - std::vector m_ocl_devs; - std::vector EnumerateD3D11Adapters(); void UpdateAdapters(); diff --git a/plugins/GSdx/config.h b/plugins/GSdx/config.h index fb5fc1621f..ea2050892a 100644 --- a/plugins/GSdx/config.h +++ b/plugins/GSdx/config.h @@ -46,7 +46,3 @@ #if defined(__unix__) && !(defined(_DEBUG) || defined(_DEVEL)) #define DISABLE_PERF_MON // Burn cycle for nothing in release mode #endif - -#ifdef _WIN32 -//#define ENABLE_OPENCL -#endif diff --git a/plugins/GSdx/res/gsdx-res.xml b/plugins/GSdx/res/gsdx-res.xml index 6100e35ee8..740405a458 100644 --- a/plugins/GSdx/res/gsdx-res.xml +++ b/plugins/GSdx/res/gsdx-res.xml @@ -27,9 +27,6 @@ glsl/tfx_fs.glsl - - tfx.cl - fonts-roboto/Roboto-Regular.ttf diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl deleted file mode 100644 index 0d461bba75..0000000000 --- a/plugins/GSdx/res/tfx.cl +++ /dev/null @@ -1,1623 +0,0 @@ -#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency - -#ifdef cl_amd_printf -#pragma OPENCL EXTENSION cl_amd_printf : enable -#endif - -#ifdef cl_amd_media_ops -#pragma OPENCL EXTENSION cl_amd_media_ops : enable -#else -#endif - -#ifdef cl_amd_media_ops2 -#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable -#else -#endif - -#ifndef CL_FLT_EPSILON -#define CL_FLT_EPSILON 1.1920928955078125e-7f -#endif - -#if MAX_PRIM_PER_BATCH == 64u - #define BIN_TYPE ulong -#elif MAX_PRIM_PER_BATCH == 32u - #define BIN_TYPE uint -#else - #error "MAX_PRIM_PER_BATCH != 32u OR 64u" -#endif - -#define TFX_ABA(sel) ((sel.x >> 24) & 3) -#define TFX_ABB(sel) ((sel.x >> 26) & 3) -#define TFX_ABC(sel) ((sel.x >> 28) & 3) -#define TFX_ABD(sel) ((sel.x >> 30) & 3) -#define TFX_WMS(sel) ((sel.y >> 8) & 3) -#define TFX_WMT(sel) ((sel.y >> 10) & 3) - -typedef struct -{ - union {float4 p; struct {float x, y; uint z, f;};}; - union {float4 tc; struct {float s, t, q; uchar4 c;};}; -} gs_vertex; - -typedef struct -{ - gs_vertex v[3]; - uint zmin, zmax; - uint pb_index; - uint _pad; -} gs_prim; - -typedef struct -{ - float4 dx, dy; - float4 zero; - float4 reject_corner; -} gs_barycentric; - -typedef struct -{ - struct {uint first, last;} bounds[MAX_BIN_PER_BATCH]; - BIN_TYPE bin[MAX_BIN_COUNT]; - uchar4 bbox[MAX_PRIM_COUNT]; - gs_prim prim[MAX_PRIM_COUNT]; - gs_barycentric barycentric[MAX_PRIM_COUNT]; -} gs_env; - -typedef struct -{ - int4 scissor; - char dimx[4][4]; - uint2 sel; - int fbp, zbp, bw; - uint fm, zm; - uchar4 fog; // rgb - uchar aref, afix; - uchar ta0, ta1; - int tbp[7], tbw[7]; - int minu, maxu, minv, maxv; - int lod; // lcm == 1 - int mxl; - float l; // TEX1.L * -0x10000 - float k; // TEX1.K * 0x10000 - uchar4 clut[256]; // TODO: this could be an index to a separate buffer, it may be the same across several gs_params following eachother -} gs_param; - -enum GS_PRIM_CLASS -{ - GS_POINT_CLASS, - GS_LINE_CLASS, - GS_TRIANGLE_CLASS, - GS_SPRITE_CLASS -}; - -enum GS_PSM -{ - PSM_PSMCT32, - PSM_PSMCT24, - PSM_PSMCT16, - PSM_PSMCT16S, - PSM_PSMZ32, - PSM_PSMZ24, - PSM_PSMZ16, - PSM_PSMZ16S, - PSM_PSMT8, - PSM_PSMT4, - PSM_PSMT8H, - PSM_PSMT4HL, - PSM_PSMT4HH, -}; - -enum GS_TFX -{ - TFX_MODULATE = 0, - TFX_DECAL = 1, - TFX_HIGHLIGHT = 2, - TFX_HIGHLIGHT2 = 3, - TFX_NONE = 4, -}; - -enum GS_CLAMP -{ - CLAMP_REGION_REPEAT = 0, - CLAMP_REPEAT = 1, - CLAMP_CLAMP = 2, - CLAMP_REGION_CLAMP = 3, -}; - -enum GS_ZTST -{ - ZTST_NEVER = 0, - ZTST_ALWAYS = 1, - ZTST_GEQUAL = 2, - ZTST_GREATER = 3, -}; - -enum GS_ATST -{ - ATST_NEVER = 0, - ATST_ALWAYS = 1, - ATST_LESS = 2, - ATST_LEQUAL = 3, - ATST_EQUAL = 4, - ATST_GEQUAL = 5, - ATST_GREATER = 6, - ATST_NOTEQUAL = 7, -}; - -enum GS_AFAIL -{ - AFAIL_KEEP = 0, - AFAIL_FB_ONLY = 1, - AFAIL_ZB_ONLY = 2, - AFAIL_RGB_ONLY = 3, -}; - -__constant uchar blockTable32[4][8] = -{ - { 0, 1, 4, 5, 16, 17, 20, 21}, - { 2, 3, 6, 7, 18, 19, 22, 23}, - { 8, 9, 12, 13, 24, 25, 28, 29}, - { 10, 11, 14, 15, 26, 27, 30, 31} -}; - -__constant uchar blockTable32Z[4][8] = -{ - { 24, 25, 28, 29, 8, 9, 12, 13}, - { 26, 27, 30, 31, 10, 11, 14, 15}, - { 16, 17, 20, 21, 0, 1, 4, 5}, - { 18, 19, 22, 23, 2, 3, 6, 7} -}; - -__constant uchar blockTable16[8][4] = -{ - { 0, 2, 8, 10 }, - { 1, 3, 9, 11 }, - { 4, 6, 12, 14 }, - { 5, 7, 13, 15 }, - { 16, 18, 24, 26 }, - { 17, 19, 25, 27 }, - { 20, 22, 28, 30 }, - { 21, 23, 29, 31 } -}; - -__constant uchar blockTable16S[8][4] = -{ - { 0, 2, 16, 18 }, - { 1, 3, 17, 19 }, - { 8, 10, 24, 26 }, - { 9, 11, 25, 27 }, - { 4, 6, 20, 22 }, - { 5, 7, 21, 23 }, - { 12, 14, 28, 30 }, - { 13, 15, 29, 31 } -}; - -__constant uchar blockTable16Z[8][4] = -{ - { 24, 26, 16, 18 }, - { 25, 27, 17, 19 }, - { 28, 30, 20, 22 }, - { 29, 31, 21, 23 }, - { 8, 10, 0, 2 }, - { 9, 11, 1, 3 }, - { 12, 14, 4, 6 }, - { 13, 15, 5, 7 } -}; - -__constant uchar blockTable16SZ[8][4] = -{ - { 24, 26, 8, 10 }, - { 25, 27, 9, 11 }, - { 16, 18, 0, 2 }, - { 17, 19, 1, 3 }, - { 28, 30, 12, 14 }, - { 29, 31, 13, 15 }, - { 20, 22, 4, 6 }, - { 21, 23, 5, 7 } -}; - -__constant uchar blockTable8[4][8] = -{ - { 0, 1, 4, 5, 16, 17, 20, 21}, - { 2, 3, 6, 7, 18, 19, 22, 23}, - { 8, 9, 12, 13, 24, 25, 28, 29}, - { 10, 11, 14, 15, 26, 27, 30, 31} -}; - -__constant uchar blockTable4[8][4] = -{ - { 0, 2, 8, 10 }, - { 1, 3, 9, 11 }, - { 4, 6, 12, 14 }, - { 5, 7, 13, 15 }, - { 16, 18, 24, 26 }, - { 17, 19, 25, 27 }, - { 20, 22, 28, 30 }, - { 21, 23, 29, 31 } -}; - -__constant uchar columnTable32[8][8] = -{ - { 0, 1, 4, 5, 8, 9, 12, 13 }, - { 2, 3, 6, 7, 10, 11, 14, 15 }, - { 16, 17, 20, 21, 24, 25, 28, 29 }, - { 18, 19, 22, 23, 26, 27, 30, 31 }, - { 32, 33, 36, 37, 40, 41, 44, 45 }, - { 34, 35, 38, 39, 42, 43, 46, 47 }, - { 48, 49, 52, 53, 56, 57, 60, 61 }, - { 50, 51, 54, 55, 58, 59, 62, 63 }, -}; - -__constant uchar columnTable16[8][16] = -{ - { 0, 2, 8, 10, 16, 18, 24, 26, - 1, 3, 9, 11, 17, 19, 25, 27 }, - { 4, 6, 12, 14, 20, 22, 28, 30, - 5, 7, 13, 15, 21, 23, 29, 31 }, - { 32, 34, 40, 42, 48, 50, 56, 58, - 33, 35, 41, 43, 49, 51, 57, 59 }, - { 36, 38, 44, 46, 52, 54, 60, 62, - 37, 39, 45, 47, 53, 55, 61, 63 }, - { 64, 66, 72, 74, 80, 82, 88, 90, - 65, 67, 73, 75, 81, 83, 89, 91 }, - { 68, 70, 76, 78, 84, 86, 92, 94, - 69, 71, 77, 79, 85, 87, 93, 95 }, - { 96, 98, 104, 106, 112, 114, 120, 122, - 97, 99, 105, 107, 113, 115, 121, 123 }, - { 100, 102, 108, 110, 116, 118, 124, 126, - 101, 103, 109, 111, 117, 119, 125, 127 }, -}; - -__constant uchar columnTable8[16][16] = -{ - { 0, 4, 16, 20, 32, 36, 48, 52, // column 0 - 2, 6, 18, 22, 34, 38, 50, 54 }, - { 8, 12, 24, 28, 40, 44, 56, 60, - 10, 14, 26, 30, 42, 46, 58, 62 }, - { 33, 37, 49, 53, 1, 5, 17, 21, - 35, 39, 51, 55, 3, 7, 19, 23 }, - { 41, 45, 57, 61, 9, 13, 25, 29, - 43, 47, 59, 63, 11, 15, 27, 31 }, - { 96, 100, 112, 116, 64, 68, 80, 84, // column 1 - 98, 102, 114, 118, 66, 70, 82, 86 }, - { 104, 108, 120, 124, 72, 76, 88, 92, - 106, 110, 122, 126, 74, 78, 90, 94 }, - { 65, 69, 81, 85, 97, 101, 113, 117, - 67, 71, 83, 87, 99, 103, 115, 119 }, - { 73, 77, 89, 93, 105, 109, 121, 125, - 75, 79, 91, 95, 107, 111, 123, 127 }, - { 128, 132, 144, 148, 160, 164, 176, 180, // column 2 - 130, 134, 146, 150, 162, 166, 178, 182 }, - { 136, 140, 152, 156, 168, 172, 184, 188, - 138, 142, 154, 158, 170, 174, 186, 190 }, - { 161, 165, 177, 181, 129, 133, 145, 149, - 163, 167, 179, 183, 131, 135, 147, 151 }, - { 169, 173, 185, 189, 137, 141, 153, 157, - 171, 175, 187, 191, 139, 143, 155, 159 }, - { 224, 228, 240, 244, 192, 196, 208, 212, // column 3 - 226, 230, 242, 246, 194, 198, 210, 214 }, - { 232, 236, 248, 252, 200, 204, 216, 220, - 234, 238, 250, 254, 202, 206, 218, 222 }, - { 193, 197, 209, 213, 225, 229, 241, 245, - 195, 199, 211, 215, 227, 231, 243, 247 }, - { 201, 205, 217, 221, 233, 237, 249, 253, - 203, 207, 219, 223, 235, 239, 251, 255 }, -}; - -__constant ushort columnTable4[16][32] = -{ - { 0, 8, 32, 40, 64, 72, 96, 104, // column 0 - 2, 10, 34, 42, 66, 74, 98, 106, - 4, 12, 36, 44, 68, 76, 100, 108, - 6, 14, 38, 46, 70, 78, 102, 110 }, - { 16, 24, 48, 56, 80, 88, 112, 120, - 18, 26, 50, 58, 82, 90, 114, 122, - 20, 28, 52, 60, 84, 92, 116, 124, - 22, 30, 54, 62, 86, 94, 118, 126 }, - { 65, 73, 97, 105, 1, 9, 33, 41, - 67, 75, 99, 107, 3, 11, 35, 43, - 69, 77, 101, 109, 5, 13, 37, 45, - 71, 79, 103, 111, 7, 15, 39, 47 }, - { 81, 89, 113, 121, 17, 25, 49, 57, - 83, 91, 115, 123, 19, 27, 51, 59, - 85, 93, 117, 125, 21, 29, 53, 61, - 87, 95, 119, 127, 23, 31, 55, 63 }, - { 192, 200, 224, 232, 128, 136, 160, 168, // column 1 - 194, 202, 226, 234, 130, 138, 162, 170, - 196, 204, 228, 236, 132, 140, 164, 172, - 198, 206, 230, 238, 134, 142, 166, 174 }, - { 208, 216, 240, 248, 144, 152, 176, 184, - 210, 218, 242, 250, 146, 154, 178, 186, - 212, 220, 244, 252, 148, 156, 180, 188, - 214, 222, 246, 254, 150, 158, 182, 190 }, - { 129, 137, 161, 169, 193, 201, 225, 233, - 131, 139, 163, 171, 195, 203, 227, 235, - 133, 141, 165, 173, 197, 205, 229, 237, - 135, 143, 167, 175, 199, 207, 231, 239 }, - { 145, 153, 177, 185, 209, 217, 241, 249, - 147, 155, 179, 187, 211, 219, 243, 251, - 149, 157, 181, 189, 213, 221, 245, 253, - 151, 159, 183, 191, 215, 223, 247, 255 }, - { 256, 264, 288, 296, 320, 328, 352, 360, // column 2 - 258, 266, 290, 298, 322, 330, 354, 362, - 260, 268, 292, 300, 324, 332, 356, 364, - 262, 270, 294, 302, 326, 334, 358, 366 }, - { 272, 280, 304, 312, 336, 344, 368, 376, - 274, 282, 306, 314, 338, 346, 370, 378, - 276, 284, 308, 316, 340, 348, 372, 380, - 278, 286, 310, 318, 342, 350, 374, 382 }, - { 321, 329, 353, 361, 257, 265, 289, 297, - 323, 331, 355, 363, 259, 267, 291, 299, - 325, 333, 357, 365, 261, 269, 293, 301, - 327, 335, 359, 367, 263, 271, 295, 303 }, - { 337, 345, 369, 377, 273, 281, 305, 313, - 339, 347, 371, 379, 275, 283, 307, 315, - 341, 349, 373, 381, 277, 285, 309, 317, - 343, 351, 375, 383, 279, 287, 311, 319 }, - { 448, 456, 480, 488, 384, 392, 416, 424, // column 3 - 450, 458, 482, 490, 386, 394, 418, 426, - 452, 460, 484, 492, 388, 396, 420, 428, - 454, 462, 486, 494, 390, 398, 422, 430 }, - { 464, 472, 496, 504, 400, 408, 432, 440, - 466, 474, 498, 506, 402, 410, 434, 442, - 468, 476, 500, 508, 404, 412, 436, 444, - 470, 478, 502, 510, 406, 414, 438, 446 }, - { 385, 393, 417, 425, 449, 457, 481, 489, - 387, 395, 419, 427, 451, 459, 483, 491, - 389, 397, 421, 429, 453, 461, 485, 493, - 391, 399, 423, 431, 455, 463, 487, 495 }, - { 401, 409, 433, 441, 465, 473, 497, 505, - 403, 411, 435, 443, 467, 475, 499, 507, - 405, 413, 437, 445, 469, 477, 501, 509, - 407, 415, 439, 447, 471, 479, 503, 511 }, -}; - -int BlockNumber32(int x, int y, int bp, int bw) -{ - return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7]; -} - -int BlockNumber16(int x, int y, int bp, int bw) -{ - return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3]; -} - -int BlockNumber16S(int x, int y, int bp, int bw) -{ - return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3]; -} - -int BlockNumber32Z(int x, int y, int bp, int bw) -{ - return bp + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7]; -} - -int BlockNumber16Z(int x, int y, int bp, int bw) -{ - return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3]; -} - -int BlockNumber16SZ(int x, int y, int bp, int bw) -{ - return bp + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3]; -} - -int BlockNumber8(int x, int y, int bp, int bw) -{ - return bp + mad24((y >> 1) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7]; -} - -int BlockNumber4(int x, int y, int bp, int bw) -{ - return bp + mad24((y >> 2) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3]; -} - -int PixelAddress32(int x, int y, int bp, int bw) -{ - return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; -} - -int PixelAddress16(int x, int y, int bp, int bw) -{ - return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; -} - -int PixelAddress16S(int x, int y, int bp, int bw) -{ - return (BlockNumber16S(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; -} - -int PixelAddress32Z(int x, int y, int bp, int bw) -{ - return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7]; -} - -int PixelAddress16Z(int x, int y, int bp, int bw) -{ - return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; -} - -int PixelAddress16SZ(int x, int y, int bp, int bw) -{ - return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15]; -} - -int PixelAddress8(int x, int y, int bp, int bw) -{ - return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15]; -} - -int PixelAddress4(int x, int y, int bp, int bw) -{ - return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31]; -} - -int PixelAddress(int x, int y, int bp, int bw, int psm) -{ - switch(psm) - { - default: - case PSM_PSMCT32: - case PSM_PSMCT24: - case PSM_PSMT8H: - case PSM_PSMT4HL: - case PSM_PSMT4HH: - return PixelAddress32(x, y, bp, bw); - case PSM_PSMCT16: - return PixelAddress16(x, y, bp, bw); - case PSM_PSMCT16S: - return PixelAddress16S(x, y, bp, bw); - case PSM_PSMZ32: - case PSM_PSMZ24: - return PixelAddress32Z(x, y, bp, bw); - case PSM_PSMZ16: - return PixelAddress16Z(x, y, bp, bw); - case PSM_PSMZ16S: - return PixelAddress16SZ(x, y, bp, bw); - case PSM_PSMT8: - return PixelAddress8(x, y, bp, bw); - case PSM_PSMT4: - return PixelAddress4(x, y, bp, bw); - } -} - -uint ReadFrame(__global uchar* vm, int addr, int psm) -{ - switch(psm) - { - default: - case PSM_PSMCT32: - case PSM_PSMCT24: - case PSM_PSMZ32: - case PSM_PSMZ24: - return ((__global uint*)vm)[addr]; - case PSM_PSMCT16: - case PSM_PSMCT16S: - case PSM_PSMZ16: - case PSM_PSMZ16S: - return ((__global ushort*)vm)[addr]; - } -} - -void WriteFrame(__global uchar* vm, int addr, int psm, uint value) -{ - switch(psm) - { - default: - case PSM_PSMCT32: - case PSM_PSMZ32: - case PSM_PSMCT24: - case PSM_PSMZ24: - ((__global uint*)vm)[addr] = value; - break; - case PSM_PSMCT16: - case PSM_PSMCT16S: - case PSM_PSMZ16: - case PSM_PSMZ16S: - ((__global ushort*)vm)[addr] = (ushort)value; - break; - } -} - -bool is16bit(int psm) -{ - return psm < 8 && (psm & 3) >= 2; -} - -bool is24bit(int psm) -{ - return psm < 8 && (psm & 3) == 1; -} - -bool is32bit(int psm) -{ - return psm < 8 && (psm & 3) == 0; -} - -#ifdef PRIM - -int GetVertexPerPrim(int prim_class) -{ - switch(prim_class) - { - default: - case GS_POINT_CLASS: return 1; - case GS_LINE_CLASS: return 2; - case GS_TRIANGLE_CLASS: return 3; - case GS_SPRITE_CLASS: return 2; - } -} - -#define VERTEX_PER_PRIM GetVertexPerPrim(PRIM) - -#endif - -#ifdef KERNEL_PRIM - -__kernel void KERNEL_PRIM( - __global gs_env* env, - __global uchar* vb_base, - __global uchar* ib_base, - __global uchar* pb_base, - uint vb_start, - uint ib_start, - uint pb_start) -{ - size_t prim_index = get_global_id(0); - - __global gs_vertex* vb = (__global gs_vertex*)(vb_base + vb_start); - __global uint* ib = (__global uint*)(ib_base + ib_start); - __global gs_prim* prim = &env->prim[prim_index]; - - ib += prim_index * VERTEX_PER_PRIM; - - uint pb_index = ib[0] >> 24; - - prim->pb_index = pb_index; - - __global gs_param* pb = (__global gs_param*)(pb_base + pb_start + pb_index * TFX_PARAM_SIZE); - - __global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff]; - __global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff]; - __global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff]; - - int2 pmin, pmax; - - if(PRIM == GS_POINT_CLASS) - { - pmin = pmax = convert_int2_rte(v0->p.xy); - - prim->v[0].p = v0->p; - prim->v[0].tc = v0->tc; - } - else if(PRIM == GS_LINE_CLASS) - { - int2 p0 = convert_int2_rte(v0->p.xy); - int2 p1 = convert_int2_rte(v1->p.xy); - - pmin = min(p0, p1); - pmax = max(p0, p1); - } - else if(PRIM == GS_TRIANGLE_CLASS) - { - int2 p0 = convert_int2_rtp(v0->p.xy); - int2 p1 = convert_int2_rtp(v1->p.xy); - int2 p2 = convert_int2_rtp(v2->p.xy); - - pmin = min(min(p0, p1), p2); - pmax = max(max(p0, p1), p2); - - // z needs special care, since it's a 32 bit unit, float cannot encode it exactly - // only interpolate the relative to zmin and hopefully small values - - uint zmin = min(min(v0->z, v1->z), v2->z); - uint zmax = max(max(v0->z, v1->z), v2->z); - - prim->v[0].p = (float4)(v0->p.x, v0->p.y, as_float(v0->z - zmin), v0->p.w); - prim->v[0].tc = v0->tc; - prim->v[1].p = (float4)(v1->p.x, v1->p.y, as_float(v1->z - zmin), v1->p.w); - prim->v[1].tc = v1->tc; - prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w); - prim->v[2].tc = v2->tc; - - prim->zmin = zmin; - prim->zmax = zmax; - - float4 dp0 = v1->p - v0->p; - float4 dp1 = v0->p - v2->p; - float4 dp2 = v2->p - v1->p; - - float cp = dp0.x * dp1.y - dp0.y * dp1.x; - - if(cp != 0.0f) - { - cp = native_recip(cp); - - float2 u = dp0.xy * cp; - float2 v = -dp1.xy * cp; - - // v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0) - - gs_barycentric b; - - b.dx = (float4)(-v.y, u.y, v.y - u.y, v0->p.x); - b.dy = (float4)(v.x, -u.x, u.x - v.x, v0->p.y); - - dp0.xy = dp0.xy * sign(cp); - dp1.xy = dp1.xy * sign(cp); - dp2.xy = dp2.xy * sign(cp); - - b.zero.x = select(0.0f, CL_FLT_EPSILON, (dp1.y < 0) | ((dp1.y == 0) & (dp1.x > 0))); - b.zero.y = select(0.0f, CL_FLT_EPSILON, (dp0.y < 0) | ((dp0.y == 0) & (dp0.x > 0))); - b.zero.z = select(0.0f, CL_FLT_EPSILON, (dp2.y < 0) | ((dp2.y == 0) & (dp2.x > 0))); - - // any barycentric(reject_corner) < 0, tile outside the triangle - - b.reject_corner.x = 0.0f + max(max(max(b.dx.x + b.dy.x, b.dx.x), b.dy.x), 0.0f) * BIN_SIZE; - b.reject_corner.y = 0.0f + max(max(max(b.dx.y + b.dy.y, b.dx.y), b.dy.y), 0.0f) * BIN_SIZE; - b.reject_corner.z = 1.0f + max(max(max(b.dx.z + b.dy.z, b.dx.z), b.dy.z), 0.0f) * BIN_SIZE; - - // TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed - - env->barycentric[prim_index] = b; - } - else // triangle has zero area - { - pmax = -1; // won't get included in any tile - } - } - else if(PRIM == GS_SPRITE_CLASS) - { - int2 p0 = convert_int2_rtp(v0->p.xy); - int2 p1 = convert_int2_rtp(v1->p.xy); - - pmin = min(p0, p1); - pmax = max(p0, p1); - - int4 mask = (int4)(v0->p.xy > v1->p.xy, 0, 0); - - prim->v[0].p = select(v0->p, v1->p, mask); // pmin - prim->v[0].tc = select(v0->tc, v1->tc, mask); - prim->v[1].p = select(v1->p, v0->p, mask); // pmax - prim->v[1].tc = select(v1->tc, v0->tc, mask); - prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy); - } - - int4 scissor = pb->scissor; - - pmin = select(pmin, scissor.xy, pmin < scissor.xy); - pmax = select(pmax, scissor.zw, pmax > scissor.zw); - - int4 r = (int4)(pmin, pmax + (int2)(BIN_SIZE - 1)) >> BIN_SIZE_BITS; - - env->bbox[prim_index] = convert_uchar4_sat(r); -} - -#endif - -#ifdef KERNEL_TILE - -int tile_in_triangle(float2 p, gs_barycentric b) -{ - float3 f = b.dx.xyz * (p.x - b.dx.w) + b.dy.xyz * (p.y - b.dy.w) + b.reject_corner.xyz; - - f = select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)); - - return all(f >= b.zero.xyz); -} - -#if CLEAR == 1 - -__kernel void KERNEL_TILE(__global gs_env* env) -{ - env->bounds[get_global_id(0)].first = -1; - env->bounds[get_global_id(0)].last = 0; -} - -#elif MODE < 3 - -#if MAX_PRIM_PER_BATCH != 32 - #error "MAX_PRIM_PER_BATCH != 32" -#endif - -#define MAX_PRIM_PER_GROUP (32u >> MODE) - -__kernel void KERNEL_TILE( - __global gs_env* env, - uint prim_count, - uint bin_count, // == bin_dim.z * bin_dim.w - uchar4 bin_dim) -{ - uint batch_index = get_group_id(2) >> MODE; - uint prim_start = get_group_id(2) << (5 - MODE); - uint group_prim_index = get_local_id(2); - uint bin_index = get_local_id(1) * get_local_size(0) + get_local_id(0); - - __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; - __global uchar4* bbox = &env->bbox[prim_start]; - __global gs_barycentric* barycentric = &env->barycentric[prim_start]; - - __local uchar4 bbox_cache[MAX_PRIM_PER_GROUP]; - __local gs_barycentric barycentric_cache[MAX_PRIM_PER_GROUP]; - __local uint visible[8 << MODE]; - - if(get_local_id(2) == 0) - { - visible[bin_index] = 0; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - uint group_prim_count = min(prim_count - prim_start, MAX_PRIM_PER_GROUP); - - event_t e = async_work_group_copy(bbox_cache, bbox, group_prim_count, 0); - - wait_group_events(1, &e); - - if(PRIM == GS_TRIANGLE_CLASS) - { - e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, group_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); - - wait_group_events(1, &e); - } - - if(group_prim_index < group_prim_count) - { - int x = bin_dim.x + get_local_id(0); - int y = bin_dim.y + get_local_id(1); - - uchar4 r = bbox_cache[group_prim_index]; - - uint test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y); - - if(PRIM == GS_TRIANGLE_CLASS && test != 0) - { - test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]); - } - - atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2))); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(get_local_id(2) == 0) - { - #if MODE == 0 - ((__global uint*)&bin[bin_index])[0] = visible[bin_index]; - #elif MODE == 1 - ((__global ushort*)&bin[bin_index])[1 - (get_group_id(2) & 1)] = visible[bin_index]; - #elif MODE == 2 - ((__global uchar*)&bin[bin_index])[3 - (get_group_id(2) & 3)] = visible[bin_index]; - #endif - - if(visible[bin_index] != 0) - { - atomic_min(&env->bounds[bin_index].first, batch_index); - atomic_max(&env->bounds[bin_index].last, batch_index); - } - } -} - -#elif MODE == 3 - -__kernel void KERNEL_TILE( - __global gs_env* env, - uint prim_count, - uint bin_count, // == bin_dim.z * bin_dim.w - uchar4 bin_dim) -{ - size_t batch_index = get_group_id(0); - size_t local_id = get_local_id(0); - size_t local_size = get_local_size(0); - - uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH); - - __global BIN_TYPE* bin = &env->bin[batch_index * bin_count]; - __global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS]; - __global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS]; - - __local uchar4 bbox_cache[MAX_PRIM_PER_BATCH]; - __local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH]; - - event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0); - - wait_group_events(1, &e); - - if(PRIM == GS_TRIANGLE_CLASS) - { - e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0); - - wait_group_events(1, &e); - } - - for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size) - { - int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware - int x = bin_index - y * bin_dim.z; - - x += bin_dim.x; - y += bin_dim.y; - - BIN_TYPE visible = 0; - - for(uint i = 0; i < batch_prim_count; i++) - { - uchar4 r = bbox_cache[i]; - - BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y); - - if(PRIM == GS_TRIANGLE_CLASS && test != 0) - { - test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]); - } - - visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i); - } - - bin[bin_index] = visible; - - if(visible != 0) - { - atomic_min(&env->bounds[bin_index].first, batch_index); - atomic_max(&env->bounds[bin_index].last, batch_index); - } - } -} - -#endif - -#endif - -#ifdef KERNEL_TFX - -bool ZTest(uint zs, uint zd) -{ - if(ZTEST) - { - if(is24bit(ZPSM)) zd &= 0x00ffffff; - - switch(ZTST) - { - case ZTST_NEVER: - return false; - case ZTST_ALWAYS: - return true; - case ZTST_GEQUAL: - return zs >= zd; - case ZTST_GREATER: - return zs > zd; - } - } - - return true; -} - -bool AlphaTest(int alpha, int aref, uint* fm, uint* zm) -{ - switch(AFAIL) - { - case AFAIL_KEEP: - break; - case AFAIL_FB_ONLY: - if(!ZWRITE) return true; - break; - case AFAIL_ZB_ONLY: - if(!FWRITE) return true; - break; - case AFAIL_RGB_ONLY: - if(!ZWRITE && is24bit(FPSM)) return true; - break; - } - - uint pass; - - switch(ATST) - { - case ATST_NEVER: - pass = false; - break; - case ATST_ALWAYS: - return true; - case ATST_LESS: - pass = alpha < aref; - break; - case ATST_LEQUAL: - pass = alpha <= aref; - break; - case ATST_EQUAL: - pass = alpha == aref; - break; - case ATST_GEQUAL: - pass = alpha >= aref; - break; - case ATST_GREATER: - pass = alpha > aref; - break; - case ATST_NOTEQUAL: - pass = alpha != aref; - break; - } - - switch(AFAIL) - { - case AFAIL_KEEP: - return pass; - case AFAIL_FB_ONLY: - *zm |= pass ? 0 : 0xffffffff; - break; - case AFAIL_ZB_ONLY: - *fm |= pass ? 0 : 0xffffffff; - break; - case AFAIL_RGB_ONLY: - if(is32bit(FPSM)) *fm |= pass ? 0 : 0xff000000; - if(is16bit(FPSM)) *fm |= pass ? 0 : 0xffff8000; - *zm |= pass ? 0 : 0xffffffff; - break; - } - - return true; -} - -bool DestAlphaTest(uint fd) -{ - if(DATE) - { - if(DATM) - { - if(is32bit(FPSM)) return (fd & 0x80000000) != 0; - if(is16bit(FPSM)) return (fd & 0x00008000) != 0; - } - else - { - if(is32bit(FPSM)) return (fd & 0x80000000) == 0; - if(is16bit(FPSM)) return (fd & 0x00008000) == 0; - } - } - - return true; -} - -int Wrap(int a, int b, int c, int mode) -{ - if(MERGED) - { - return select((a & b) | c, clamp(a, b, c), (mode & 2) != 0); - } - else - { - switch(mode) - { - case CLAMP_REGION_REPEAT: - return (a & b) | c; - case CLAMP_REPEAT: - return a & b; - case CLAMP_CLAMP: - return clamp(a, 0, c); - case CLAMP_REGION_CLAMP: - return clamp(a, b, c); - } - } -} - -int4 AlphaBlend(int4 c, uint fd, int afix, uint2 sel) -{ - if(FWRITE && (ABE || AA1)) - { - int4 cs = c; - int4 cd; - - if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1 || MERGED) - { - if(is32bit(FPSM) || is24bit(FPSM)) - { - cd.x = fd & 0xff; - cd.y = (fd >> 8) & 0xff; - cd.z = (fd >> 16) & 0xff; - cd.w = fd >> 24; - } - else if(is16bit(FPSM)) - { - cd.x = (fd << 3) & 0xf8; - cd.y = (fd >> 2) & 0xf8; - cd.z = (fd >> 7) & 0xf8; - cd.w = (fd >> 8) & 0x80; - } - } - - if(MERGED) - { - int aba = TFX_ABA(sel); - int abb = TFX_ABB(sel); - int abc = TFX_ABC(sel); - int abd = TFX_ABD(sel); - - int ad = !is24bit(FPSM) ? cd.w : 0x80; - - int3 A = aba == 0 ? cs.xyz : aba == 1 ? cd.xyz : 0; - int3 B = abb == 0 ? cs.xyz : abb == 1 ? cd.xyz : 0; - int C = abc == 0 ? cs.w : abc == 1 ? ad : afix; - int3 D = abd == 0 ? cs.xyz : abd == 1 ? cd.xyz : 0; - - c.xyz = (mul24(A - B, C) >> 7) + D; - } - else - { - if(ABA != ABB) - { - switch(ABA) - { - case 0: break; // c.xyz = cs.xyz; - case 1: c.xyz = cd.xyz; break; - case 2: c.xyz = 0; break; - } - - switch(ABB) - { - case 0: c.xyz -= cs.xyz; break; - case 1: c.xyz -= cd.xyz; break; - case 2: break; - } - - if(!(is24bit(FPSM) && ABC == 1)) - { - int a = 0; - - switch(ABC) - { - case 0: a = cs.w; break; - case 1: a = cd.w; break; - case 2: a = afix; break; - } - - c.xyz = c.xyz * a >> 7; - } - - switch(ABD) - { - case 0: c.xyz += cs.xyz; break; - case 1: c.xyz += cd.xyz; break; - case 2: break; - } - } - else - { - switch(ABD) - { - case 0: break; - case 1: c.xyz = cd.xyz; break; - case 2: c.xyz = 0; break; - } - } - } - - if(PABE) - { - c.xyz = select(cs.xyz, c.xyz, (int3)(cs.w << 24)); - } - } - - return c; -} - -uchar4 Expand24To32(uint rgba, uchar ta0) -{ - uchar4 c; - - c.x = rgba & 0xff; - c.y = (rgba >> 8) & 0xff; - c.z = (rgba >> 16) & 0xff; - c.w = !AEM || (rgba & 0xffffff) != 0 ? ta0 : 0; - - return c; -} - -uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1) -{ - uchar4 c; - - c.x = (rgba << 3) & 0xf8; - c.y = (rgba >> 2) & 0xf8; - c.z = (rgba >> 7) & 0xf8; - c.w = !AEM || (rgba & 0x7fff) != 0 ? ((rgba & 0x8000) ? ta1 : ta0) : 0; - - return c; -} - -int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* pb) -{ - uchar4 c; - - uint addr = PixelAddress(x, y, pb->tbp[level], pb->tbw[level], TPSM); - - __global ushort* vm16 = (__global ushort*)vm; - __global uint* vm32 = (__global uint*)vm; - - switch(TPSM) - { - default: - case PSM_PSMCT32: - case PSM_PSMZ32: - c = ((__global uchar4*)vm)[addr]; - break; - case PSM_PSMCT24: - case PSM_PSMZ24: - c = Expand24To32(vm32[addr], pb->ta0); - break; - case PSM_PSMCT16: - case PSM_PSMCT16S: - case PSM_PSMZ16: - case PSM_PSMZ16S: - c = Expand16To32(vm16[addr], pb->ta0, pb->ta1); - break; - case PSM_PSMT8: - c = pb->clut[vm[addr]]; - break; - case PSM_PSMT4: - c = pb->clut[(vm[addr >> 1] >> ((addr & 1) << 2)) & 0x0f]; - break; - case PSM_PSMT8H: - c = pb->clut[vm32[addr] >> 24]; - break; - case PSM_PSMT4HL: - c = pb->clut[(vm32[addr] >> 24) & 0x0f]; - break; - case PSM_PSMT4HH: - c = pb->clut[(vm32[addr] >> 28) & 0x0f]; - break; - } - - //printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm[addr]); - - return convert_int4(c); -} - -int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t) -{ - int4 c; - - if(0)//if(MMIN) - { - // TODO - } - else - { - int2 uv; - - if(!FST) - { - uv = convert_int2_rte(t.xy * native_recip(t.z)); - } - else - { - // sfex capcom logo third drawing call at (0,223) calculated as: - // t0 + (p - p0) * (t - t0) / (p1 - p0) - // 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112 - // due to rounding errors (multiply-add instruction maybe): - // t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation - // TODO: sw renderer samples at 112 exactly, check which one is correct - - // last line error in persona 3 movie clips if rounding is enabled - - uv = convert_int2(t.xy); - } - - if(LTF) uv -= 0x0008; - - int2 uvf = uv & 0x000f; - - int2 uv0 = uv >> 4; - int2 uv1 = uv0 + 1; - - uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS); - uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT); - uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS); - uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT); - - int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb); - int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb); - int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb); - int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb); - - if(LTF) - { - c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00; - c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10; - c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00; - } - - c = c00; - } - - return c; -} - -// TODO: 2x2 MSAA idea -// downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local -// hittest and ztest 2x2 (create write mask, only skip if all -1) -// calculate color 1x1, alpha tests 1x1 -// use mask to filter failed sub-pixels when writing to __local -// needs the tile data to be fetched at the beginning, even if rfb/zfb is not set, unless we know the tile is fully covered -// multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end -// pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average - -__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX( - __global gs_env* env, - __global uchar* vm, - __global uchar* tex, - __global uchar* pb_base, - uint pb_start, - uint prim_start, - uint prim_count, - uint bin_count, // == bin_dim.z * bin_dim.w - uchar4 bin_dim, - uint fbp, - uint zbp, - uint bw) -{ - uint x = get_global_id(0); - uint y = get_global_id(1); - - uint bin_x = (x >> BIN_SIZE_BITS) - bin_dim.x; - uint bin_y = (y >> BIN_SIZE_BITS) - bin_dim.y; - uint bin_index = mad24(bin_y, (uint)bin_dim.z, bin_x); - - uint batch_first = env->bounds[bin_index].first; - uint batch_last = env->bounds[bin_index].last; - uint batch_start = prim_start >> MAX_PRIM_PER_BATCH_BITS; - - if(batch_last < batch_first) - { - return; - } - - uint skip; - - if(batch_start < batch_first) - { - uint n = (batch_first - batch_start) * MAX_PRIM_PER_BATCH - (prim_start & (MAX_PRIM_PER_BATCH - 1)); - - if(n > prim_count) - { - return; - } - - skip = 0; - prim_count -= n; - batch_start = batch_first; - } - else - { - skip = prim_start & (MAX_PRIM_PER_BATCH - 1); - prim_count += skip; - } - - if(batch_start > batch_last) - { - return; - } - - prim_count = min(prim_count, (batch_last - batch_start + 1) << MAX_PRIM_PER_BATCH_BITS); - - // - - int2 pi = (int2)(x, y); - float2 pf = convert_float2(pi); - - int faddr = PixelAddress(x, y, fbp, bw, FPSM); - int zaddr = PixelAddress(x, y, zbp, bw, ZPSM); - - uint fd, zd; // TODO: fd as int4 and only pack before writing out? - - if(RFB) - { - fd = ReadFrame(vm, faddr, FPSM); - } - - if(RZB) - { - zd = ReadFrame(vm, zaddr, ZPSM); - } - - // early destination alpha test - - if(!DestAlphaTest(fd)) - { - return; - } - - // - - uint fragments = 0; - - __global BIN_TYPE* bin = &env->bin[bin_index + batch_start * bin_count]; // TODO: not needed for "one tile case" - __global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS]; - __global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS]; - - pb_base += pb_start; - - BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); - - for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) - { - while(bin_value != 0) - { - uint i = clz(bin_value); - - if(prim_index + i >= prim_count) - { - break; - } - - bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); - - __global gs_prim* prim = &prim_base[prim_index + i]; - __global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE); - - if(!NOSCISSOR) - { - if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw))) - { - continue; - } - } - - uint2 zf; - float3 t; - int4 c; - - // TODO: do not hittest if we know the tile is fully inside the prim - - if(PRIM == GS_POINT_CLASS) - { - float2 dpf = pf - prim->v[0].p.xy; - - if(!all((dpf <= 0.5f) & (dpf > -0.5f))) - { - continue; - } - - zf = as_uint2(prim->v[0].p.zw); - t = prim->v[0].tc.xyz; - c = convert_int4(prim->v[0].c); - } - else if(PRIM == GS_LINE_CLASS) - { - // TODO: find point on line prependicular to (x,y), distance.x < 0.5f || distance.y < 0.5f - // TODO: aa1: coverage ~ distance.x/y, slope selects x or y, zwrite disabled - // TODO: do not draw last pixel of the line - - continue; - } - else if(PRIM == GS_TRIANGLE_CLASS) - { - // TODO: aa1: draw edge as a line - - if(!ZTest(prim->zmax, zd)) - { - continue; - } - - __global gs_barycentric* b = &barycentric[prim_index + i]; - - float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1); - - if(!all(select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)) >= b->zero.xyz)) - { - continue; - } - - float2 zf0 = convert_float2(as_uint2(prim->v[0].p.zw)); - float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw)); - float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw)); - - zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->zmin; - zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y); - - t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; - - if(IIP) - { - float4 c0 = convert_float4(prim->v[0].c); - float4 c1 = convert_float4(prim->v[1].c); - float4 c2 = convert_float4(prim->v[2].c); - - c = convert_int4_rte(c0 * f.z + c1 * f.x + c2 * f.y); - } - else - { - c = convert_int4(prim->v[2].c); - } - } - else if(PRIM == GS_SPRITE_CLASS) - { - int2 tl = convert_int2_rtp(prim->v[0].p.xy); - int2 br = convert_int2_rtp(prim->v[1].p.xy); - - if(!all((pi >= tl) & (pi < br))) - { - continue; - } - - zf = as_uint2(prim->v[1].p.zw); - - t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy); - t.z = prim->v[0].tc.z; - - c = convert_int4(prim->v[1].c); - } - - // z test - - uint zs = zf.x; - - if(!ZTest(zs, zd)) - { - continue; - } - - // sample texture - - int4 ct; - - if(TFX != TFX_NONE) - { - ct = SampleTexture(tex, pb, t); - } - - // alpha tfx - - int alpha = c.w; - - if(FB) - { - if(TCC) - { - switch(TFX) - { - case TFX_MODULATE: - c.w = clamp(mul24(ct.w, c.w) >> 7, 0, 0xff); - break; - case TFX_DECAL: - c.w = ct.w; - break; - case TFX_HIGHLIGHT: - c.w = clamp(ct.w + c.w, 0, 0xff); - break; - case TFX_HIGHLIGHT2: - c.w = ct.w; - break; - } - } - - if(AA1) - { - if(!ABE || c.w == 0x80) - { - c.w = 0x80; // TODO: edge ? coverage : 0x80 - } - } - } - - // read mask - - uint fm = pb->fm; - uint zm = pb->zm; - - // alpha test - - if(!AlphaTest(c.w, pb->aref, &fm, &zm)) - { - continue; - } - - // all tests done, we have a new output - - fragments++; - - // write z - - if(ZWRITE) - { - zd = RZB ? bitselect(zs, zd, zm) : zs; - } - - // rgb tfx - - if(FWRITE) - { - switch(TFX) - { - case TFX_MODULATE: - c.xyz = clamp(mul24(ct.xyz, c.xyz) >> 7, 0, 0xff); - break; - case TFX_DECAL: - c.xyz = ct.xyz; - break; - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - c.xyz = clamp((mul24(ct.xyz, c.xyz) >> 7) + alpha, 0, 0xff); - break; - } - } - - // fog - - if(FWRITE && FGE) - { - int fog = (int)zf.y; - - int3 fv = mul24(c.xyz, fog) >> 8; - int3 fc = mul24(convert_int4(pb->fog).xyz, 0xff - fog) >> 8; - - c.xyz = fv + fc; - } - - // alpha blend - - c = AlphaBlend(c, fd, pb->afix, pb->sel); - - // write frame - - if(FWRITE) - { - if(DTHE && is16bit(FPSM)) - { - c.xyz += pb->dimx[y & 3][x & 3]; - } - - c = COLCLAMP ? clamp(c, 0, 0xff) : c & 0xff; - - if(FBA && !is24bit(FPSM)) - { - c.w |= 0x80; - } - - uint fs; - - if(is32bit(FPSM)) - { - fs = (c.w << 24) | (c.z << 16) | (c.y << 8) | c.x; - } - else if(is24bit(FPSM)) - { - fs = (c.z << 16) | (c.y << 8) | c.x; - } - else if(is16bit(FPSM)) - { - fs = ((c.w & 0x80) << 8) | ((c.z & 0xf8) << 7) | ((c.y & 0xf8) << 2) | (c.x >> 3); - } - - fd = RFB ? bitselect(fs, fd, fm) : fs; - - // dest alpha test for the next loop - - if(!DestAlphaTest(fd)) - { - prim_index = prim_count; // game over - - break; - } - } - } - - bin += bin_count; - bin_value = *bin; - } - - if(fragments > 0) - { - if(ZWRITE) - { - WriteFrame(vm, zaddr, ZPSM, zd); - } - - if(FWRITE) - { - WriteFrame(vm, faddr, FPSM, fd); - } - } -} - -#endif - -#endif diff --git a/plugins/GSdx/resource.h b/plugins/GSdx/resource.h index 679adc1920..6193ce9ff3 100644 --- a/plugins/GSdx/resource.h +++ b/plugins/GSdx/resource.h @@ -17,11 +17,9 @@ #define IDC_FILTER 2012 #define IDC_INTERLACE_TEXT 2013 #define IDC_INTERLACE 2014 -#define IDC_OPENCL_DEVICE 2015 -#define IDC_OPENCL_TEXT 2016 -#define IDC_HACKSBUTTON 2017 -#define IDC_OSDBUTTON 2018 -#define IDC_SHADEBUTTON 2019 +#define IDC_HACKSBUTTON 2015 +#define IDC_OSDBUTTON 2016 +#define IDC_SHADEBUTTON 2017 // Hardware Renderer Settings: #define IDC_PALTEX 2030 #define IDC_LARGE_FB 2031 diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index 80a17a3cd4..78a5cf8079 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -51,14 +51,6 @@ #include -#ifdef ENABLE_OPENCL - -#define CL_USE_DEPRECATED_OPENCL_1_1_APIS -#define __CL_ENABLE_EXCEPTIONS -#include - -#endif - #ifdef __x86_64__ #define _M_AMD64 #endif