mirror of https://github.com/PCSX2/pcsx2.git
Modified for opencl 1.1. While it runs on nvidia cards now, you can't use its sdk to compile gsdx, cl.hpp is missing there. Intel or amd is ok.
This commit is contained in:
parent
263c097d13
commit
9e20387595
|
@ -328,8 +328,8 @@ void GSRendererCL::Draw()
|
||||||
|
|
||||||
std::vector<cl::Event> el(1);
|
std::vector<cl::Event> el(1);
|
||||||
|
|
||||||
m_cl.queue[2].enqueueMarkerWithWaitList(NULL, &el[0]);
|
m_cl.queue[2].enqueueMarker(&el[0]);
|
||||||
m_cl.wq->enqueueBarrierWithWaitList(&el, NULL);
|
m_cl.wq->enqueueWaitForEvents(el);
|
||||||
|
|
||||||
// switch to the other queue/buffer (double buffering)
|
// switch to the other queue/buffer (double buffering)
|
||||||
|
|
||||||
|
@ -404,15 +404,7 @@ void GSRendererCL::Draw()
|
||||||
job->pb_start = m_cl.pb.tail;
|
job->pb_start = m_cl.pb.tail;
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
job->fbp = context->FRAME.Block();
|
job->param = pb;
|
||||||
job->fbw = context->FRAME.FBW;
|
|
||||||
job->fpsm = context->FRAME.PSM;
|
|
||||||
job->zbp = context->ZBUF.Block();
|
|
||||||
job->tbp = PRIM->TME ? context->TEX0.TBP0 : 0xfffff;
|
|
||||||
job->tbw = PRIM->TME ? context->TEX0.TBW : 1;
|
|
||||||
job->tpsm = PRIM->TME ? context->TEX0.PSM : 0;
|
|
||||||
job->tw = PRIM->TME ? context->TEX0.TW : 0;
|
|
||||||
job->th = PRIM->TME ? context->TEX0.TH : 0;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
m_jobs.push_back(job);
|
m_jobs.push_back(job);
|
||||||
|
@ -687,10 +679,10 @@ void GSRendererCL::Enqueue()
|
||||||
|
|
||||||
m_cl.Unmap();
|
m_cl.Unmap();
|
||||||
|
|
||||||
std::vector<cl::Event> el2(1);
|
std::vector<cl::Event> el(1);
|
||||||
|
|
||||||
m_cl.wq->enqueueMarkerWithWaitList(NULL, &el2[0]);
|
m_cl.wq->enqueueMarker(&el[0]);
|
||||||
m_cl.queue[2].enqueueBarrierWithWaitList(&el2, NULL);
|
m_cl.queue[2].enqueueWaitForEvents(el);
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
||||||
|
@ -813,6 +805,9 @@ void GSRendererCL::Enqueue()
|
||||||
{
|
{
|
||||||
ASSERT(prim_start < MAX_PRIM_COUNT);
|
ASSERT(prim_start < MAX_PRIM_COUNT);
|
||||||
|
|
||||||
|
// TODO: join tfx kernel calls where the selector and fbp/zbp/bw/scissor are the same
|
||||||
|
// move dimx/fm/zm/fog/aref/afix/ta0/ta1/tbp/tbw/minu/minv/maxu/maxv/lod/mxl/l/k/clut to an indexed array per prim
|
||||||
|
|
||||||
tfxcount++;
|
tfxcount++;
|
||||||
|
|
||||||
//if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); }
|
//if(LOG) { fprintf(s_fp, "q %05x %05x %05x\n", (*i)->fbp, (*i)->zbp, (*i)->tbp); fflush(s_fp); }
|
||||||
|
@ -1574,7 +1569,8 @@ GSRendererCL::CL::CL()
|
||||||
case CL_DEVICE_TYPE_CPU: printf(" CPU"); break;
|
case CL_DEVICE_TYPE_CPU: printf(" CPU"); break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(strstr(version.c_str(), "OpenCL C 1.2") != NULL)
|
if(strstr(version.c_str(), "OpenCL C 1.1") != NULL
|
||||||
|
|| strstr(version.c_str(), "OpenCL C 1.2") != NULL)
|
||||||
{
|
{
|
||||||
#ifdef IOCL_DEBUG
|
#ifdef IOCL_DEBUG
|
||||||
if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
|
if(type == CL_DEVICE_TYPE_CPU && strstr(platform_vendor.c_str(), "Intel") != NULL)
|
||||||
|
@ -1646,7 +1642,7 @@ void GSRendererCL::CL::Map()
|
||||||
|
|
||||||
if(vb.head < vb.size)
|
if(vb.head < vb.size)
|
||||||
{
|
{
|
||||||
vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, vb.head, vb.size - vb.head);
|
vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head);
|
||||||
vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
|
vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
|
||||||
ASSERT(((size_t)vb.ptr & 15) == 0);
|
ASSERT(((size_t)vb.ptr & 15) == 0);
|
||||||
ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0);
|
ASSERT((((size_t)vb.ptr + sizeof(GSVertexCL)) & 15) == 0);
|
||||||
|
@ -1654,14 +1650,13 @@ void GSRendererCL::CL::Map()
|
||||||
|
|
||||||
if(ib.head < ib.size)
|
if(ib.head < ib.size)
|
||||||
{
|
{
|
||||||
ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, ib.head, ib.size - ib.head);
|
ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE, ib.head, ib.size - ib.head);
|
||||||
ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head;
|
ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head;
|
||||||
ASSERT(((size_t)ib.ptr & 15) == 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(pb.head < pb.size)
|
if(pb.head < pb.size)
|
||||||
{
|
{
|
||||||
pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pb.head, pb.size - pb.head);
|
pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head);
|
||||||
pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
|
pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
|
||||||
ASSERT(((size_t)pb.ptr & 15) == 0);
|
ASSERT(((size_t)pb.ptr & 15) == 0);
|
||||||
ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0);
|
ASSERT((((size_t)pb.ptr + sizeof(TFXParameter)) & 15) == 0);
|
||||||
|
@ -1681,6 +1676,7 @@ void GSRendererCL::CL::Unmap()
|
||||||
|
|
||||||
static void AddDefs(ostringstream& opt)
|
static void AddDefs(ostringstream& opt)
|
||||||
{
|
{
|
||||||
|
opt << "-cl-std=CL1.1 ";
|
||||||
opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u ";
|
opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u ";
|
||||||
opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u ";
|
opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u ";
|
||||||
opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u ";
|
opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u ";
|
||||||
|
|
|
@ -172,7 +172,7 @@ class GSRendererCL : public GSRenderer
|
||||||
GSVector4i* src_pages; // read by any texture level
|
GSVector4i* src_pages; // read by any texture level
|
||||||
GSVector4i* dst_pages; // f/z writes to it
|
GSVector4i* dst_pages; // f/z writes to it
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
uint32 fbp, fbw, fpsm, zbp, tbp, tbw, tpsm, tw, th;
|
TFXParameter* param;
|
||||||
#endif
|
#endif
|
||||||
TFXJob()
|
TFXJob()
|
||||||
: src_pages(NULL)
|
: src_pages(NULL)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#ifdef __OPENCL_C_VERSION__ // make safe to include in resource file to enforce dependency
|
#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency
|
||||||
|
|
||||||
#ifndef CL_FLT_EPSILON
|
#ifndef CL_FLT_EPSILON
|
||||||
#define CL_FLT_EPSILON 1.1920928955078125e-7
|
#define CL_FLT_EPSILON 1.1920928955078125e-7
|
||||||
|
@ -1133,6 +1133,66 @@ int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* p
|
||||||
return convert_int4(c);
|
return convert_int4(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
|
||||||
|
{
|
||||||
|
int4 c;
|
||||||
|
|
||||||
|
if(0)//if(MMIN)
|
||||||
|
{
|
||||||
|
// TODO
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int2 uv;
|
||||||
|
|
||||||
|
if(!FST)
|
||||||
|
{
|
||||||
|
uv = convert_int2_rte(t.xy * native_recip(t.z));
|
||||||
|
|
||||||
|
if(LTF) uv -= 0x0008;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// sfex capcom logo third drawing call at (0,223) calculated as:
|
||||||
|
// t0 + (p - p0) * (t - t0) / (p1 - p0)
|
||||||
|
// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
|
||||||
|
// due to rounding errors (multiply-add instruction maybe):
|
||||||
|
// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
|
||||||
|
// TODO: sw renderer samples at 112 exactly, check which one is correct
|
||||||
|
|
||||||
|
// last line error in persona 3 movie clips if rounding is enabled
|
||||||
|
|
||||||
|
uv = convert_int2(t.xy);
|
||||||
|
}
|
||||||
|
|
||||||
|
int2 uvf = uv & 0x000f;
|
||||||
|
|
||||||
|
int2 uv0 = uv >> 4;
|
||||||
|
int2 uv1 = uv0 + 1;
|
||||||
|
|
||||||
|
uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
|
||||||
|
uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
|
||||||
|
uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
|
||||||
|
uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
|
||||||
|
|
||||||
|
int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
|
||||||
|
int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
|
||||||
|
int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
|
||||||
|
int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);
|
||||||
|
|
||||||
|
if(LTF)
|
||||||
|
{
|
||||||
|
c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00;
|
||||||
|
c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10;
|
||||||
|
c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00;
|
||||||
|
}
|
||||||
|
|
||||||
|
c = c00;
|
||||||
|
}
|
||||||
|
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: 2x2 MSAA idea
|
// TODO: 2x2 MSAA idea
|
||||||
// downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local
|
// downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local
|
||||||
// hittest and ztest 2x2 (create write mask, only skip if all -1)
|
// hittest and ztest 2x2 (create write mask, only skip if all -1)
|
||||||
|
@ -1249,8 +1309,6 @@ __kernel void KERNEL_TFX(
|
||||||
|
|
||||||
BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);
|
BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);
|
||||||
|
|
||||||
__local gs_prim prim_cache;
|
|
||||||
|
|
||||||
for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
|
for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
|
||||||
{
|
{
|
||||||
while(bin_value != 0)
|
while(bin_value != 0)
|
||||||
|
@ -1267,7 +1325,7 @@ __kernel void KERNEL_TFX(
|
||||||
bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);
|
bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);
|
||||||
|
|
||||||
uint2 zf;
|
uint2 zf;
|
||||||
float4 t;
|
float3 t;
|
||||||
int4 c;
|
int4 c;
|
||||||
|
|
||||||
// TODO: do not hittest if we know the tile is fully inside the prim
|
// TODO: do not hittest if we know the tile is fully inside the prim
|
||||||
|
@ -1282,7 +1340,7 @@ __kernel void KERNEL_TFX(
|
||||||
}
|
}
|
||||||
|
|
||||||
zf = as_uint2(prim->v[0].p.zw);
|
zf = as_uint2(prim->v[0].p.zw);
|
||||||
t.xyz = prim->v[0].tc.xyz;
|
t = prim->v[0].tc.xyz;
|
||||||
c = convert_int4(prim->v[0].c);
|
c = convert_int4(prim->v[0].c);
|
||||||
}
|
}
|
||||||
else if(PRIM == GS_LINE_CLASS)
|
else if(PRIM == GS_LINE_CLASS)
|
||||||
|
@ -1313,7 +1371,7 @@ __kernel void KERNEL_TFX(
|
||||||
zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z;
|
zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->v[3].z;
|
||||||
zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);
|
zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);
|
||||||
|
|
||||||
t.xyz = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
|
t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
|
||||||
|
|
||||||
if(IIP)
|
if(IIP)
|
||||||
{
|
{
|
||||||
|
@ -1361,62 +1419,9 @@ __kernel void KERNEL_TFX(
|
||||||
|
|
||||||
if(TFX != TFX_NONE)
|
if(TFX != TFX_NONE)
|
||||||
{
|
{
|
||||||
// TODO
|
tex = vm; // TODO: use the texture cache
|
||||||
|
|
||||||
if(0)//if(MMIN)
|
ct = SampleTexture(tex, pb, t);
|
||||||
{
|
|
||||||
// TODO
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
int2 uv;
|
|
||||||
|
|
||||||
if(!FST)
|
|
||||||
{
|
|
||||||
uv = convert_int2_rte(t.xy * native_recip(t.z));
|
|
||||||
|
|
||||||
if(LTF) uv -= 0x0008;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// sfex capcom logo third drawing call at (0,223) calculated as:
|
|
||||||
// t0 + (p - p0) * (t - t0) / (p1 - p0)
|
|
||||||
// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
|
|
||||||
// due to rounding errors (multiply-add instruction maybe):
|
|
||||||
// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
|
|
||||||
// TODO: sw renderer samples at 112 exactly, check which one is correct
|
|
||||||
|
|
||||||
// last line error in persona 3 movie clips if rounding is enabled
|
|
||||||
|
|
||||||
uv = convert_int2(t.xy);
|
|
||||||
}
|
|
||||||
|
|
||||||
int2 uvf = uv & 0x000f;
|
|
||||||
|
|
||||||
int2 uv0 = uv >> 4;
|
|
||||||
int2 uv1 = uv0 + 1;
|
|
||||||
|
|
||||||
uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
|
|
||||||
uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
|
|
||||||
uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
|
|
||||||
uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
|
|
||||||
|
|
||||||
tex = vm; // TODO: use the texture cache
|
|
||||||
|
|
||||||
int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
|
|
||||||
int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
|
|
||||||
int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
|
|
||||||
int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);
|
|
||||||
|
|
||||||
if(LTF)
|
|
||||||
{
|
|
||||||
c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00;
|
|
||||||
c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10;
|
|
||||||
c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00;
|
|
||||||
}
|
|
||||||
|
|
||||||
ct = c00;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// alpha tfx
|
// alpha tfx
|
||||||
|
|
|
@ -43,6 +43,11 @@
|
||||||
#include <d3dx9.h>
|
#include <d3dx9.h>
|
||||||
#include <comutil.h>
|
#include <comutil.h>
|
||||||
#include "../../common/include/comptr.h"
|
#include "../../common/include/comptr.h"
|
||||||
|
|
||||||
|
#include <CL/cl.h>
|
||||||
|
#undef CL_VERSION_1_2
|
||||||
|
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
|
||||||
|
#define __CL_ENABLE_EXCEPTIONS
|
||||||
#include <CL/cl.hpp>
|
#include <CL/cl.hpp>
|
||||||
|
|
||||||
#define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)
|
#define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
<ItemDefinitionGroup>
|
<ItemDefinitionGroup>
|
||||||
<ClCompile>
|
<ClCompile>
|
||||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||||
<PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;__CL_ENABLE_EXCEPTIONS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>_WINDOWS;_WIN32_WINNT=0x500;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<FloatingPointModel>Fast</FloatingPointModel>
|
<FloatingPointModel>Fast</FloatingPointModel>
|
||||||
<RuntimeTypeInfo>false</RuntimeTypeInfo>
|
<RuntimeTypeInfo>false</RuntimeTypeInfo>
|
||||||
<WarningLevel>Level4</WarningLevel>
|
<WarningLevel>Level4</WarningLevel>
|
||||||
|
|
Loading…
Reference in New Issue