diff --git a/plugins/GSdx/GSRendererCL.cpp b/plugins/GSdx/GSRendererCL.cpp index 990cc3e6f0..d380649222 100644 --- a/plugins/GSdx/GSRendererCL.cpp +++ b/plugins/GSdx/GSRendererCL.cpp @@ -328,8 +328,8 @@ void GSRendererCL::Draw() std::vector el(1); - m_cl.queue[2].enqueueMarkerWithWaitList(NULL, &el[0]); - m_cl.wq->enqueueBarrierWithWaitList(&el, NULL); + m_cl.queue[2].enqueueMarker(&el[0]); + m_cl.wq->enqueueWaitForEvents(el); // switch to the other queue/buffer (double buffering) @@ -404,15 +404,7 @@ void GSRendererCL::Draw() job->pb_start = m_cl.pb.tail; #ifdef DEBUG - job->fbp = context->FRAME.Block(); - job->fbw = context->FRAME.FBW; - job->fpsm = context->FRAME.PSM; - job->zbp = context->ZBUF.Block(); - job->tbp = PRIM->TME ? context->TEX0.TBP0 : 0xfffff; - job->tbw = PRIM->TME ? context->TEX0.TBW : 1; - job->tpsm = PRIM->TME ? context->TEX0.PSM : 0; - job->tw = PRIM->TME ? context->TEX0.TW : 0; - job->th = PRIM->TME ? context->TEX0.TH : 0; + job->param = pb; #endif m_jobs.push_back(job); @@ -687,10 +679,10 @@ void GSRendererCL::Enqueue() m_cl.Unmap(); - std::vector el2(1); + std::vector el(1); - m_cl.wq->enqueueMarkerWithWaitList(NULL, &el2[0]); - m_cl.queue[2].enqueueBarrierWithWaitList(&el2, NULL); + m_cl.wq->enqueueMarker(&el[0]); + m_cl.queue[2].enqueueWaitForEvents(el); // @@ -813,6 +805,9 @@ void GSRendererCL::Enqueue() { ASSERT(prim_start < MAX_PRIM_COUNT); + // TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same + // move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim + tfxcount++; //if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); } @@ -1574,7 +1569,8 @@ GSRendererCL::CL::CL() case CL_DEVICE_TYPE_CPU: printf(" CPU"); break; } - if(strstr(version.c_str(), "OpenCL C 1.2") != NULL) + if(strstr(version.c_str(), "OpenCL C 1.1") != NULL + || strstr(version.c_str(), "OpenCL C 1.2") != NULL) { #ifdef IOCL_DEBUG if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL) @@ -1646,7 +1642,7 @@ void GSRendererCL::CL::Map() if(vb.head < vb.size) { - vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, vb.head, vb.size - vb.head); + vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head); vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; ASSERT(((size_t)vb.ptr & 15) == 0); ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0); @@ -1654,14 +1650,13 @@ void GSRendererCL::CL::Map() if(ib.head < ib.size) { - ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, ib.head, ib.size - ib.head); + ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE, ib.head, ib.size - ib.head); ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head; - ASSERT(((size_t)ib.ptr & 15) == 0); } if(pb.head < pb.size) { - pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pb.head, pb.size - pb.head); + pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head); pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; ASSERT(((size_t)pb.ptr & 15) == 0); ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0); @@ -1681,6 +1676,7 @@ void GSRendererCL::CL::Unmap() static void AddDefs(ostringstream& opt) { + opt << "-cl-std=CL1.1 "; opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "; opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "; opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "; diff --git a/plugins/GSdx/GSRendererCL.h b/plugins/GSdx/GSRendererCL.h index d60b5f2939..f6b3231a06 100644 --- a/plugins/GSdx/GSRendererCL.h +++ b/plugins/GSdx/GSRendererCL.h @@ -172,7 +172,7 @@ class GSRendererCL : public GSRenderer GSVector4i* src_pages; // read by any texture level GSVector4i* dst_pages; // f/z writes to it #ifdef DEBUG - uint32 fbp, fbw, fpsm, zbp, tbp, tbw, tpsm, tw, th; + TFXParameter* param; #endif TFXJob() : src_pages(NULL) diff --git a/plugins/GSdx/res/tfx.cl b/plugins/GSdx/res/tfx.cl index 8342f338a5..4eac8374de 100644 --- a/plugins/GSdx/res/tfx.cl +++ b/plugins/GSdx/res/tfx.cl @@ -1,4 +1,4 @@ -#ifdef __OPENCL_C_VERSION__ // make safe to include in resource file to enforce dependency +#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency #ifndef CL_FLT_EPSILON #define CL_FLT_EPSILON 1.1920928955078125e-7 @@ -1133,6 +1133,66 @@ int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* p return convert_int4(c); } +int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t) +{ + int4 c; + + if(0)//if(MMIN) + { + // TODO + } + else + { + int2 uv; + + if(!FST) + { + uv = convert_int2_rte(t.xy * native_recip(t.z)); + + if(LTF) uv -= 0x0008; + } + else + { + // sfex capcom logo third drawing call at (0,223) calculated as: + // t0 + (p - p0) * (t - t0) / (p1 - p0) + // 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112 + // due to rounding errors (multiply-add instruction maybe): + // t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation + // TODO: sw renderer samples at 112 exactly, check which one is correct + + // last line error in persona 3 movie clips if rounding is enabled + + uv = convert_int2(t.xy); + } + + int2 uvf = uv & 0x000f; + + int2 uv0 = uv >> 4; + int2 uv1 = uv0 + 1; + + uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS); + uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT); + uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS); + uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT); + + int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb); + int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb); + int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb); + int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb); + + if(LTF) + { + c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00; + c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10; + c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00; + } + + c = c00; + } + + return c; +} + // TODO: 2x2 MSAA idea // downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local // hittest and ztest 2x2 (create write mask, only skip if all -1) @@ -1249,8 +1309,6 @@ __kernel void KERNEL_TFX( BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); - __local gs_prim prim_cache; - for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) { while(bin_value != 0) @@ -1267,7 +1325,7 @@ __kernel void KERNEL_TFX( bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); uint2 zf; - float4 t; + float3 t; int4 c; // TODO: do not hittest if we know the tile is fully inside the prim @@ -1282,7 +1340,7 @@ __kernel void KERNEL_TFX( } zf = as_uint2(prim->v[0].p.zw); - t.xyz = prim->v[0].tc.xyz; + t = prim->v[0].tc.xyz; c = convert_int4(prim->v[0].c); } else if(PRIM == GS_LINE_CLASS) @@ -1313,7 +1371,7 @@ __kernel void KERNEL_TFX( zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z; zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y); - t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; + t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; if(IIP) { @@ -1361,62 +1419,9 @@ __kernel void KERNEL_TFX( if(TFX != TFX_NONE) { - // TODO + tex = vm; // TODO: use the texture cache - if(0)//if(MMIN) - { - // TODO - } - else - { - int2 uv; - - if(!FST) - { - uv = convert_int2_rte(t.xy * native_recip(t.z)); - - if(LTF) uv -= 0x0008; - } - else - { - // sfex capcom logo third drawing call at (0,223) calculated as: - // t0 + (p - p0) * (t - t0) / (p1 - p0) - // 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112 - // due to rounding errors (multiply-add instruction maybe): - // t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation - // TODO: sw renderer samples at 112 exactly, check which one is correct - - // last line error in persona 3 movie clips if rounding is enabled - - uv = convert_int2(t.xy); - } - - int2 uvf = uv & 0x000f; - - int2 uv0 = uv >> 4; - int2 uv1 = uv0 + 1; - - uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS); - uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT); - uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS); - uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT); - - tex = vm; // TODO: use the texture cache - - int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb); - int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb); - int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb); - int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb); - - if(LTF) - { - c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00; - c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10; - c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00; - } - - ct = c00; - } + ct = SampleTexture(tex, pb, t); } // alpha tfx diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index b9bfa19737..1929381354 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -43,6 +43,11 @@ #include #include #include "../../common/include/comptr.h" + +#include +#undef CL_VERSION_1_2 +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#define __CL_ENABLE_EXCEPTIONS #include #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA) diff --git a/plugins/GSdx/vsprops/common.props b/plugins/GSdx/vsprops/common.props index 91c58981d0..b8972de77c 100644 --- a/plugins/GSdx/vsprops/common.props +++ b/plugins/GSdx/vsprops/common.props @@ -8,7 +8,7 @@ true - _WINDOWS;_WIN32_WINNT=0x500;__CL_ENABLE_EXCEPTIONS;%(PreprocessorDefinitions) + _WINDOWS;_WIN32_WINNT=0x500;%(PreprocessorDefinitions) Fast false Level4