implemented opencl program caching on disk under the system default temp folder, needs some additional work on linux

This commit is contained in:
gabest11 2014-09-22 09:15:25 +02:00 committed by Gregory Hainaut
parent ba59036a97
commit b9b02cf749
7 changed files with 198 additions and 61 deletions

View File

@ -37,6 +37,7 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
#define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE)) #define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE))
#define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT) #define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT)
#define TFX_PARAM_SIZE 2048 #define TFX_PARAM_SIZE 2048
#define TFX_PROGRAM_VERSION 1
#if MAX_PRIM_PER_BATCH == 64u #if MAX_PRIM_PER_BATCH == 64u
#define BIN_TYPE cl_ulong #define BIN_TYPE cl_ulong
@ -1622,6 +1623,7 @@ GSVector4i* GSRendererCL::TFXJob::GetDstPages()
GSRendererCL::CL::CL() GSRendererCL::CL::CL()
{ {
WIs = INT_MAX; WIs = INT_MAX;
version = INT_MAX;
std::string ocldev = theApp.GetConfig("ocldev", ""); std::string ocldev = theApp.GetConfig("ocldev", "");
@ -1629,37 +1631,43 @@ GSRendererCL::CL::CL()
ocldev = "Intel(R) Corporation Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz OpenCL C 1.2 CPU"; ocldev = "Intel(R) Corporation Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz OpenCL C 1.2 CPU";
#endif #endif
list<OCLDevice> ocldevs; list<OCLDeviceDesc> dl;
GSUtil::GetOCLDevices(ocldevs); GSUtil::GetDeviceDescs(dl);
for(auto dev : ocldevs) for(auto d : dl)
{ {
if(dev.name == ocldev) if(d.name == ocldev)
{ {
devices.push_back(dev.device); devs.push_back(d);
WIs = std::min(WIs, (uint32)dev.device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); WIs = std::min(WIs, (uint32)d.device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
version = std::min(version, d.version);
break; // TODO: multiple devices? break; // TODO: multiple devices?
} }
} }
if(devices.empty() && !ocldevs.empty()) if(devs.empty() && !dl.empty())
{ {
auto dev = ocldevs.front(); auto d = dl.front();
devices.push_back(dev.device); devs.push_back(d);
WIs = std::min(WIs, (uint32)dev.device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); WIs = std::min(WIs, (uint32)d.device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
version = std::min(version, d.version);
} }
if(devices.empty()) if(devs.empty())
{ {
throw new std::exception("OpenCL device not found"); throw new std::exception("OpenCL device not found");
} }
context = cl::Context(devices); vector<cl::Device> tmp;
for(auto d : devs) tmp.push_back(d.device);
context = cl::Context(tmp);
queue[0] = cl::CommandQueue(context); queue[0] = cl::CommandQueue(context);
queue[1] = cl::CommandQueue(context); queue[1] = cl::CommandQueue(context);
@ -1699,24 +1707,24 @@ void GSRendererCL::CL::Map()
{ {
Unmap(); Unmap();
// TODO: CL_MAP_WRITE_INVALIDATE_REGION if 1.2+ cl_map_flags flags = version >= 120 ? CL_MAP_WRITE_INVALIDATE_REGION : CL_MAP_WRITE;
if(vb.head < vb.size) if(vb.head < vb.size)
{ {
vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, vb.head, vb.size - vb.head); vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, flags, vb.head, vb.size - vb.head);
vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head; vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
ASSERT(((size_t)vb.ptr & 15) == 0); ASSERT(((size_t)vb.ptr & 15) == 0);
} }
if(ib.head < ib.size) if(ib.head < ib.size)
{ {
ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, CL_MAP_WRITE, ib.head, ib.size - ib.head); ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, flags, ib.head, ib.size - ib.head);
ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head; ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head;
} }
if(pb.head < pb.size) if(pb.head < pb.size)
{ {
pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, CL_MAP_WRITE, pb.head, pb.size - pb.head); pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, flags, pb.head, pb.size - pb.head);
pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head; pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
ASSERT(((size_t)pb.ptr & 15) == 0); ASSERT(((size_t)pb.ptr & 15) == 0);
} }
@ -1733,9 +1741,129 @@ void GSRendererCL::CL::Unmap()
pb.mapped_ptr = pb.ptr = NULL; pb.mapped_ptr = pb.ptr = NULL;
} }
static void AddDefs(ostringstream& opt) cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt)
{ {
opt << "-cl-std=CL1.1 "; // TODO: cache binary on disk
cl::Program program;
if(version >= 120)
{
cl::Program::Binaries binaries;
try
{
for(auto d : devs)
{
string path = d.tmppath + "/" + entry;
FILE* f = fopen(path.c_str(), "rb");
if(f != NULL)
{
fseek(f, 0, SEEK_END);
long size = ftell(f);
pair<void*, size_t> b(new char[size], size);
fseek(f, 0, SEEK_SET);
fread(b.first, b.second, 1, f);
fclose(f);
binaries.push_back(b);
}
else
{
break;
}
}
if(binaries.size() == devs.size())
{
vector<cl::Device> tmp;
for(auto d : devs) tmp.push_back(d.device);
program = cl::Program(context, tmp, binaries);
AddDefs(opt);
program.build(opt.str().c_str());
cl::Kernel kernel = cl::Kernel(program, entry);
return kernel;
}
}
catch(cl::Error err)
{
printf("%s (%d)\n", err.what(), err.err());
}
for(auto b : binaries)
{
delete [] b.first;
}
}
try
{
printf("building kernel (%s)\n", entry);
program = cl::Program(context, kernel_str);
AddDefs(opt);
program.build(opt.str().c_str());
}
catch(cl::Error err)
{
if(err.err() == CL_BUILD_PROGRAM_FAILURE)
{
for(auto d : devs)
{
auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(d.device);
printf("kernel (%s) build error: %s\n", entry, s.c_str());
}
}
throw err;
}
if(version >= 120)
{
try
{
vector<size_t> sizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
vector<char*> binaries = program.getInfo<CL_PROGRAM_BINARIES>();
for(int i = 0; i < binaries.size(); i++)
{
string path = devs[i].tmppath + "/" + entry;
FILE* f = fopen(path.c_str(), "wb");
if(f != NULL)
{
fwrite(binaries[i], sizes[i], 1, f);
fclose(f);
}
delete[] binaries[i];
}
}
catch(cl::Error err)
{
printf("%s (%d)\n", err.what(), err.err());
}
}
return cl::Kernel(program, entry);
}
void GSRendererCL::CL::AddDefs(ostringstream& opt)
{
if(version == 110) opt << "-cl-std=CL1.1 ";
else opt << "-cl-std=CL1.2 ";
opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "; opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u ";
opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "; opt << "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u ";
opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "; opt << "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u ";
@ -1751,38 +1879,6 @@ static void AddDefs(ostringstream& opt)
#endif #endif
} }
cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt)
{
// TODO: cache binary on disk
printf("building kernel (%s)\n", entry);
cl::Program program = cl::Program(context, kernel_str);
try
{
AddDefs(opt);
program.build(opt.str().c_str());
}
catch(cl::Error err)
{
if(err.err() == CL_BUILD_PROGRAM_FAILURE)
{
for(auto device : devices)
{
auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
printf("kernel (%s) build error: %s\n", entry, s.c_str());
}
}
throw err;
}
return cl::Kernel(program, entry);
}
cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel) cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel)
{ {
auto i = prim_map.find(sel); auto i = prim_map.find(sel);

View File

@ -188,9 +188,10 @@ class GSRendererCL : public GSRenderer
std::map<uint64, cl::Kernel> tfx_map; std::map<uint64, cl::Kernel> tfx_map;
cl::Kernel Build(const char* entry, ostringstream& opt); cl::Kernel Build(const char* entry, ostringstream& opt);
void AddDefs(ostringstream& opt);
public: public:
std::vector<cl::Device> devices; std::vector<OCLDeviceDesc> devs;
cl::Context context; cl::Context context;
cl::CommandQueue queue[3]; cl::CommandQueue queue[3];
cl::Buffer vm; cl::Buffer vm;
@ -200,6 +201,7 @@ class GSRendererCL : public GSRenderer
cl::CommandQueue* wq; cl::CommandQueue* wq;
int wqidx; int wqidx;
uint32 WIs; uint32 WIs;
int version;
public: public:
CL(); CL();

View File

@ -31,9 +31,9 @@ GSSettingsDlg::GSSettingsDlg(bool isOpen2)
: GSDialog(isOpen2 ? IDD_CONFIG2 : IDD_CONFIG) : GSDialog(isOpen2 ? IDD_CONFIG2 : IDD_CONFIG)
, m_IsOpen2(isOpen2) , m_IsOpen2(isOpen2)
{ {
list<OCLDevice> ocldevs; list<OCLDeviceDesc> ocldevs;
GSUtil::GetOCLDevices(ocldevs); GSUtil::GetDeviceDescs(ocldevs);
int index = 0; int index = 0;

View File

@ -226,9 +226,11 @@ bool GSUtil::CheckSSE()
return true; return true;
} }
void GSUtil::GetOCLDevices(list<OCLDevice>& devs) #define OCL_PROGRAM_VERSION 1
void GSUtil::GetDeviceDescs(list<OCLDeviceDesc>& dl)
{ {
devs.clear(); dl.clear();
try try
{ {
@ -246,6 +248,61 @@ void GSUtil::GetOCLDevices(list<OCLDevice>& devs)
for(auto& device : ds) for(auto& device : ds)
{ {
string type;
switch(device.getInfo<CL_DEVICE_TYPE>())
{
case CL_DEVICE_TYPE_GPU: type = "GPU"; break;
case CL_DEVICE_TYPE_CPU: type = "CPU"; break;
}
if(type.empty()) continue;
std::string version = device.getInfo<CL_DEVICE_OPENCL_C_VERSION>();
int major = 0;
int minor = 0;
if(!type.empty() && sscanf(version.c_str(), "OpenCL C %d.%d", &major, &minor) == 2 && major == 1 && minor >= 1 || major > 1)
{
OCLDeviceDesc desc;
desc.device = device;
desc.name = GetDeviceUniqueName(device);
desc.version = major * 100 + minor * 10;
// TODO: linux
char* buff = new char[MAX_PATH + 1];
GetTempPath(MAX_PATH, buff);
desc.tmppath = string(buff) + "/" + desc.name;
WIN32_FIND_DATA FindFileData;
HANDLE hFind = FindFirstFile(desc.tmppath.c_str(), &FindFileData);
if(hFind != INVALID_HANDLE_VALUE) FindClose(hFind);
else CreateDirectory(desc.tmppath.c_str(), NULL);
sprintf(buff, "/%d", OCL_PROGRAM_VERSION);
desc.tmppath += buff;
delete[] buff;
hFind = FindFirstFile(desc.tmppath.c_str(), &FindFileData);
if(hFind != INVALID_HANDLE_VALUE) FindClose(hFind);
else CreateDirectory(desc.tmppath.c_str(), NULL);
dl.push_back(desc);
}
}
}
}
catch(cl::Error err)
{
printf("%s (%d)\n", err.what(), err.err());
}
}
string GSUtil::GetDeviceUniqueName(cl::Device& device)
{
std::string vendor = device.getInfo<CL_DEVICE_VENDOR>(); std::string vendor = device.getInfo<CL_DEVICE_VENDOR>();
std::string name = device.getInfo<CL_DEVICE_NAME>(); std::string name = device.getInfo<CL_DEVICE_NAME>();
std::string version = device.getInfo<CL_DEVICE_OPENCL_C_VERSION>(); std::string version = device.getInfo<CL_DEVICE_OPENCL_C_VERSION>();
@ -258,27 +315,9 @@ void GSUtil::GetOCLDevices(list<OCLDevice>& devs)
case CL_DEVICE_TYPE_CPU: type = "CPU"; break; case CL_DEVICE_TYPE_CPU: type = "CPU"; break;
} }
int major = 0; version.erase(version.find_last_not_of(' ') + 1);
int minor = 0;
if(!type.empty() && sscanf(version.c_str(), "OpenCL C %d.%d", &major, &minor) == 2 && major == 1 && minor >= 1 || major > 1) return vendor + " " + name + " " + version + " " + type;
{
name = vendor + " " + name + " " + version + type;
OCLDevice dev;
dev.device = device;
dev.name = name;
devs.push_back(dev);
}
}
}
}
catch(cl::Error err)
{
printf("%s (%d)\n", err.what(), err.err());
}
} }
#ifdef _WINDOWS #ifdef _WINDOWS

View File

@ -23,10 +23,12 @@
#include "GS.h" #include "GS.h"
struct OCLDevice struct OCLDeviceDesc
{ {
cl::Device device; cl::Device device;
string name; string name;
int version;
string tmppath;
}; };
class GSUtil class GSUtil
@ -45,7 +47,9 @@ public:
static bool HasCompatibleBits(uint32 spsm, uint32 dpsm); static bool HasCompatibleBits(uint32 spsm, uint32 dpsm);
static bool CheckSSE(); static bool CheckSSE();
static void GetOCLDevices(list<OCLDevice>& devs);
static void GetDeviceDescs(list<OCLDeviceDesc>& dl);
static string GetDeviceUniqueName(cl::Device& device);
#ifdef _WINDOWS #ifdef _WINDOWS

View File

@ -2,8 +2,6 @@
#ifdef cl_amd_printf #ifdef cl_amd_printf
#pragma OPENCL EXTENSION cl_amd_printf : enable #pragma OPENCL EXTENSION cl_amd_printf : enable
#else
#define printf(x)
#endif #endif
#ifdef cl_amd_media_ops #ifdef cl_amd_media_ops
@ -639,9 +637,9 @@ __kernel void KERNEL_PRIM(
dp1.xy = dp1.xy * sign(cp); dp1.xy = dp1.xy * sign(cp);
dp2.xy = dp2.xy * sign(cp); dp2.xy = dp2.xy * sign(cp);
b.zero.x = select(0.0f, CL_FLT_EPSILON, (dp1.y < 0) | (dp1.y == 0) & (dp1.x > 0)); b.zero.x = select(0.0f, CL_FLT_EPSILON, (dp1.y < 0) | ((dp1.y == 0) & (dp1.x > 0)));
b.zero.y = select(0.0f, CL_FLT_EPSILON, (dp0.y < 0) | (dp0.y == 0) & (dp0.x > 0)); b.zero.y = select(0.0f, CL_FLT_EPSILON, (dp0.y < 0) | ((dp0.y == 0) & (dp0.x > 0)));
b.zero.z = select(0.0f, CL_FLT_EPSILON, (dp2.y < 0) | (dp2.y == 0) & (dp2.x > 0)); b.zero.z = select(0.0f, CL_FLT_EPSILON, (dp2.y < 0) | ((dp2.y == 0) & (dp2.x > 0)));
// any barycentric(reject_corner) < 0, tile outside the triangle // any barycentric(reject_corner) < 0, tile outside the triangle

View File

@ -44,8 +44,6 @@
#include <comutil.h> #include <comutil.h>
#include "../../common/include/comptr.h" #include "../../common/include/comptr.h"
#include <CL/cl.h>
#undef CL_VERSION_1_2
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS #define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#define __CL_ENABLE_EXCEPTIONS #define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp> #include <CL/cl.hpp>