Modified for opencl 1.1. While it runs on nvidia cards now, you can't use its sdk to compile gsdx, cl.hpp is missing there. Intel or amd is ok.

This commit is contained in:
gabest11 2014-09-19 06:53:05 +02:00 committed by Gregory Hainaut
parent 263c097d13
commit 9e20387595
5 changed files with 88 additions and 82 deletions

View File

@ -328,8 +328,8 @@ void GSRendererCL::Draw()
std::vector<cl::Event> el(1); std::vector<cl::Event> el(1);
m_cl.queue[2].enqueueMarkerWithWaitList(NULL, &el[0]); m_cl.queue[2].enqueueMarker(&el[0]);
m_cl.wq->enqueueBarrierWithWaitList(&el, NULL); m_cl.wq->enqueueWaitForEvents(el);
// switch to the other queue/buffer (double buffering) // switch to the other queue/buffer (double buffering)
@ -404,15 +404,7 @@ void GSRendererCL::Draw()
job->pb_start = m_cl.pb.tail; job->pb_start = m_cl.pb.tail;
#ifdef DEBUG #ifdef DEBUG
job->fbp = context->FRAME.Block(); job->param = pb;
job->fbw = context->FRAME.FBW;
job->fpsm = context->FRAME.PSM;
job->zbp = context->ZBUF.Block();
job->tbp = PRIM->TME ? context->TEX0.TBP0 : 0xfffff;
job->tbw = PRIM->TME ? context->TEX0.TBW : 1;
job->tpsm = PRIM->TME ? context->TEX0.PSM : 0;
job->tw = PRIM->TME ? context->TEX0.TW : 0;
job->th = PRIM->TME ? context->TEX0.TH : 0;
#endif #endif
m_jobs.push_back(job); m_jobs.push_back(job);
@ -687,10 +679,10 @@ void GSRendererCL::Enqueue()
m_cl.Unmap(); m_cl.Unmap();
std::vector<cl::Event> el2(1); std::vector<cl::Event> el(1);
m_cl.wq->enqueueMarkerWithWaitList(NULL, &el2[0]); m_cl.wq->enqueueMarker(&el[0]);
m_cl.queue[2].enqueueBarrierWithWaitList(&el2, NULL); m_cl.queue[2].enqueueWaitForEvents(el);
// //
@ -813,6 +805,9 @@ void GSRendererCL::Enqueue()
{ {
ASSERT(prim_start < MAX_PRIM_COUNT); ASSERT(prim_start < MAX_PRIM_COUNT);
// TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same
// move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim
tfxcount++; tfxcount++;
//if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); } //if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); }
@ -1574,7 +1569,8 @@ GSRendererCL::CL::CL()
case CL_DEVICE_TYPE_CPU: printf(" CPU"); break; case CL_DEVICE_TYPE_CPU: printf(" CPU"); break;
} }
if(strstr(version.c_str(), "OpenCL C 1.2") != NULL) if(strstr(version.c_str(), "OpenCL C 1.1") != NULL
|| strstr(version.c_str(), "OpenCL C 1.2") != NULL)
{ {
#ifdef IOCL_DEBUG #ifdef IOCL_DEBUG
if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL) if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
@ -1646,7 +1642,7 @@ void GSRendererCL::CL::Map()
if(vb.head < vb.size) if(vb.head < vb.size)
{ {
vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, vb.head, vb.size - vb.head); vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head);
vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
ASSERT(((size_t)vb.ptr & 15) == 0); ASSERT(((size_t)vb.ptr & 15) == 0);
ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0); ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0);
@ -1654,14 +1650,13 @@ void GSRendererCL::CL::Map()
if(ib.head < ib.size) if(ib.head < ib.size)
{ {
ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, ib.head, ib.size - ib.head); ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE, ib.head, ib.size - ib.head);
ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head; ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head;
ASSERT(((size_t)ib.ptr & 15) == 0);
} }
if(pb.head < pb.size) if(pb.head < pb.size)
{ {
pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pb.head, pb.size - pb.head); pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head);
pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
ASSERT(((size_t)pb.ptr & 15) == 0); ASSERT(((size_t)pb.ptr & 15) == 0);
ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0); ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0);
@ -1681,6 +1676,7 @@ void GSRendererCL::CL::Unmap()
static void AddDefs(ostringstream& opt) static void AddDefs(ostringstream& opt)
{ {
opt << "-cl-std=CL1.1 ";
opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "; opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u ";
opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "; opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u ";
opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "; opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u ";

View File

@ -172,7 +172,7 @@ class GSRendererCL : public GSRenderer
GSVector4i* src_pages; // read by any texture level GSVector4i* src_pages; // read by any texture level
GSVector4i* dst_pages; // f/z writes to it GSVector4i* dst_pages; // f/z writes to it
#ifdef DEBUG #ifdef DEBUG
uint32 fbp, fbw, fpsm, zbp, tbp, tbw, tpsm, tw, th; TFXParameter* param;
#endif #endif
TFXJob() TFXJob()
: src_pages(NULL) : src_pages(NULL)

View File

@ -1,4 +1,4 @@
#ifdef __OPENCL_C_VERSION__ // make safe to include in resource file to enforce dependency #if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency
#ifndef CL_FLT_EPSILON #ifndef CL_FLT_EPSILON
#define CL_FLT_EPSILON 1.1920928955078125e-7 #define CL_FLT_EPSILON 1.1920928955078125e-7
@ -1133,6 +1133,66 @@ int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* p
return convert_int4(c); return convert_int4(c);
} }
int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
{
int4 c;
if(0)//if(MMIN)
{
// TODO
}
else
{
int2 uv;
if(!FST)
{
uv = convert_int2_rte(t.xy * native_recip(t.z));
if(LTF) uv -= 0x0008;
}
else
{
// sfex capcom logo third drawing call at (0,223) calculated as:
// t0 + (p - p0) * (t - t0) / (p1 - p0)
// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
// due to rounding errors (multiply-add instruction maybe):
// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
// TODO: sw renderer samples at 112 exactly, check which one is correct
// last line error in persona 3 movie clips if rounding is enabled
uv = convert_int2(t.xy);
}
int2 uvf = uv & 0x000f;
int2 uv0 = uv >> 4;
int2 uv1 = uv0 + 1;
uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);
if(LTF)
{
c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00;
c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10;
c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00;
}
c = c00;
}
return c;
}
// TODO: 2x2 MSAA idea // TODO: 2x2 MSAA idea
// downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local // downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local
// hittest and ztest 2x2 (create write mask, only skip if all -1) // hittest and ztest 2x2 (create write mask, only skip if all -1)
@ -1249,8 +1309,6 @@ __kernel void KERNEL_TFX(
BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip); BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);
__local gs_prim prim_cache;
for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH) for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
{ {
while(bin_value != 0) while(bin_value != 0)
@ -1267,7 +1325,7 @@ __kernel void KERNEL_TFX(
bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1); bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);
uint2 zf; uint2 zf;
float4 t; float3 t;
int4 c; int4 c;
// TODO: do not hittest if we know the tile is fully inside the prim // TODO: do not hittest if we know the tile is fully inside the prim
@ -1282,7 +1340,7 @@ __kernel void KERNEL_TFX(
} }
zf = as_uint2(prim->v[0].p.zw); zf = as_uint2(prim->v[0].p.zw);
t.xyz = prim->v[0].tc.xyz; t = prim->v[0].tc.xyz;
c = convert_int4(prim->v[0].c); c = convert_int4(prim->v[0].c);
} }
else if(PRIM == GS_LINE_CLASS) else if(PRIM == GS_LINE_CLASS)
@ -1313,7 +1371,7 @@ __kernel void KERNEL_TFX(
zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z; zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z;
zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y); zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);
t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y; t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
if(IIP) if(IIP)
{ {
@ -1361,62 +1419,9 @@ __kernel void KERNEL_TFX(
if(TFX != TFX_NONE) if(TFX != TFX_NONE)
{ {
// TODO tex = vm; // TODO: use the texture cache
if(0)//if(MMIN) ct = SampleTexture(tex, pb, t);
{
// TODO
}
else
{
int2 uv;
if(!FST)
{
uv = convert_int2_rte(t.xy * native_recip(t.z));
if(LTF) uv -= 0x0008;
}
else
{
// sfex capcom logo third drawing call at (0,223) calculated as:
// t0 + (p - p0) * (t - t0) / (p1 - p0)
// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
// due to rounding errors (multiply-add instruction maybe):
// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
// TODO: sw renderer samples at 112 exactly, check which one is correct
// last line error in persona 3 movie clips if rounding is enabled
uv = convert_int2(t.xy);
}
int2 uvf = uv & 0x000f;
int2 uv0 = uv >> 4;
int2 uv1 = uv0 + 1;
uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
tex = vm; // TODO: use the texture cache
int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);
if(LTF)
{
c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00;
c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10;
c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00;
}
ct = c00;
}
} }
// alpha tfx // alpha tfx

View File

@ -43,6 +43,11 @@
#include <d3dx9.h> #include <d3dx9.h>
#include <comutil.h> #include <comutil.h>
#include "../../common/include/comptr.h" #include "../../common/include/comptr.h"
#include <CL/cl.h>
#undef CL_VERSION_1_2
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp> #include <CL/cl.hpp>
#define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA) #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)

View File

@ -8,7 +8,7 @@
<ItemDefinitionGroup> <ItemDefinitionGroup>
<ClCompile> <ClCompile>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;__CL_ENABLE_EXCEPTIONS;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
<RuntimeTypeInfo>false</RuntimeTypeInfo> <RuntimeTypeInfo>false</RuntimeTypeInfo>
<WarningLevel>Level4</WarningLevel> <WarningLevel>Level4</WarningLevel>