diff --git a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp index acfa9175dd..a590f48073 100644 --- a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp @@ -31,7 +31,6 @@ struct sDecoders { cl_program program; // compute program cl_kernel kernel; // compute kernel - cl_mem src, dst; // texture buffer memory objects }; const char *Kernel = " \n\ @@ -48,6 +47,18 @@ kernel void DecodeI8(global uchar *dst, \n\ } \n\ } \n\ \n\ +kernel void DecodeIA8(global ushort *dst, \n\ + const global ushort *src, int width) \n\ +{ \n\ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; \n\ + int srcOffset = ((x * 4) + (y * width)) / 4; \n\ + for (int iy = 0; iy < 4; iy++) \n\ + { \n\ + vstore4(vload4(srcOffset, src), \n\ + 0, dst + ((y + iy)*width + x)); \n\ + srcOffset++; \n\ + } \n\ +} \n\ \n\ ushort swapbytes(ushort x) { \n\ return (x & 0xf00f) | ((x >> 4) & 0x00f0) | \n\ @@ -77,8 +88,9 @@ kernel void DecodeIA4(global ushort *dst, \n\ } \n\ "; -sDecoders Decoders[] = { {NULL, NULL, NULL, NULL}, - {NULL, NULL, NULL, NULL}, +sDecoders Decoders[] = { {NULL, NULL}, + {NULL, NULL}, + {NULL, NULL}, }; bool g_Inited = false; @@ -88,7 +100,9 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei #if defined(HAVE_OPENCL) && HAVE_OPENCL cl_int err; cl_kernel kernelToRun = Decoders[0].kernel; - int sizeOfDst = sizeof(u8); + int sizeOfDst = sizeof(u8), sizeOfSrc = sizeof(u8), xSkip, ySkip; + PC_TexFormat formatResult; + cl_mem clsrc, cldst; // texture buffer memory objects if(!g_Inited) { if(!OpenCL::Initialize()) @@ -96,8 +110,10 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei Decoders[0].program = OpenCL::CompileProgram(Kernel); + Decoders[0].kernel = OpenCL::CompileKernel(Decoders[0].program, "DecodeI8"); Decoders[1].kernel = OpenCL::CompileKernel(Decoders[0].program, "DecodeIA4"); + Decoders[2].kernel = OpenCL::CompileKernel(Decoders[0].program, "DecodeIA8"); g_Inited = true; } @@ -106,146 +122,63 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei case GX_TF_IA4: // Maybe a cleaner way is needed kernelToRun = Decoders[1].kernel; + sizeOfSrc = sizeof(u8); sizeOfDst = sizeof(u16); + xSkip = 8; + ySkip = 4; + formatResult = PC_TEX_FMT_IA4_AS_IA8; + break; case GX_TF_I8: - { - // TODO: Optimize - //PanicAlert("Really calling the OCL version"); - Decoders[0].src = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, width * height * sizeof(u8), (void *)src, NULL); - Decoders[0].dst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, width * height * sizeOfDst, NULL, NULL); - - clSetKernelArg(kernelToRun, 0, sizeof(cl_mem), &Decoders[0].dst); - clSetKernelArg(kernelToRun, 1, sizeof(cl_mem), &Decoders[0].src); - clSetKernelArg(kernelToRun, 2, sizeof(cl_int), &width); - - size_t global[] = { width / 8, height / 4 }; - - // No work-groups for now - /* - size_t local; - err = clGetKernelWorkGroupInfo(Decoders[0].kernel, OpenCL::device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); - if(err) - PanicAlert("Error obtaining work-group information"); - */ - - err = clEnqueueNDRangeKernel(OpenCL::GetCommandQueue(), kernelToRun, 2 , NULL, global, NULL, 0, NULL, NULL); - if(err) - PanicAlert("Error queueing kernel"); - - clFinish(OpenCL::GetCommandQueue()); - - clEnqueueReadBuffer(OpenCL::GetCommandQueue(), Decoders[0].dst, CL_TRUE, 0, width * height * sizeOfDst, dst, 0, NULL, NULL); - - clReleaseMemObject(Decoders[0].src); - clReleaseMemObject(Decoders[0].dst); - - /* - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, src += 8) - memcpy(dst + (y + iy)*width+x, src, 8); - */ - if(texformat == GX_TF_I8) - return PC_TEX_FMT_I8; - else - return PC_TEX_FMT_IA4_AS_IA8; - } - /* IA4: - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, src += 8) - for (int x = 0; x < 8; x++) - { - const u8 val = src[x]; - u8 a = Convert4To8(val >> 4); - u8 l = Convert4To8(val & 0xF); - dst[x] = (a << 8) | l; - } - */ - + kernelToRun = Decoders[0].kernel; + sizeOfSrc = sizeOfDst = sizeof(u8); + xSkip = 8; + ySkip = 4; + formatResult = PC_TEX_FMT_I8; + break; + case GX_TF_IA8: + kernelToRun = Decoders[2].kernel; + sizeOfSrc = sizeOfDst = sizeof(u16); + xSkip = 4; + ySkip = 4; + formatResult = PC_TEX_FMT_IA8; + break; default: return PC_TEX_FMT_NONE; } + + // TODO: Optimize + clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, width * height * sizeOfSrc, (void *)src, NULL); + cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, width * height * sizeOfDst, NULL, NULL); + + clSetKernelArg(kernelToRun, 0, sizeof(cl_mem), &cldst); + clSetKernelArg(kernelToRun, 1, sizeof(cl_mem), &clsrc); + clSetKernelArg(kernelToRun, 2, sizeof(cl_int), &width); + + size_t global[] = { width / xSkip, height / ySkip }; + + // No work-groups for now + /* + size_t local; + err = clGetKernelWorkGroupInfo(kernelToRun, OpenCL::device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); + if(err) + PanicAlert("Error obtaining work-group information"); + */ + + err = clEnqueueNDRangeKernel(OpenCL::GetCommandQueue(), kernelToRun, 2, NULL, global, NULL, 0, NULL, NULL); + if(err) + PanicAlert("Error queueing kernel"); + + clFinish(OpenCL::GetCommandQueue()); + + clEnqueueReadBuffer(OpenCL::GetCommandQueue(), cldst, CL_TRUE, 0, width * height * sizeOfDst, dst, 0, NULL, NULL); + + clReleaseMemObject(clsrc); + clReleaseMemObject(cldst); + + return formatResult; #else return PC_TEX_FMT_NONE; #endif - /* OLD CODE - switch(texformat) - { - case GX_TF_I8: - { - size_t global = 0; // global domain size for our calculation - size_t local = 0; // local domain size for our calculation - printf("width %d, height %d\n", width, height); - // Create the input and output arrays in device memory for our calculation - // - cl_mem _dst = clCreateBuffer(OpenCL::g_context, CL_MEM_WRITE_ONLY, sizeof(unsigned char) * width * height, NULL, NULL); - if (!dst) - { - printf("Error: Failed to allocate device memory!\n"); - exit(1); - } - cl_mem _src = clCreateBuffer(OpenCL::g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(unsigned char) * width * height, (void*)src, NULL); - if (!src) - { - printf("Error: Failed to allocate device memory!\n"); - exit(1); - } - // Set the arguments to our compute kernel - // - err = 0; - err = clSetKernelArg(Decoders[0].kernel, 0, sizeof(cl_mem), &_dst); - err |= clSetKernelArg(Decoders[0].kernel, 1, sizeof(cl_mem), &_src); - err |= clSetKernelArg(Decoders[0].kernel, 2, sizeof(cl_int), &width); - err |= clSetKernelArg(Decoders[0].kernel, 3, sizeof(cl_int), &height); - if (err != CL_SUCCESS) - { - printf("Error: Failed to set kernel arguments! %d\n", err); - exit(1); - } - - // Get the maximum work group size for executing the kernel on the device - // - err = clGetKernelWorkGroupInfo(Decoders[0].kernel, OpenCL::device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &local, NULL); - if (err != CL_SUCCESS) - { - printf("Error: Failed to retrieve kernel work group info! %d\n", err); - local = 64; - } - - // Execute the kernel over the entire range of our 1d input data set - // using the maximum number of work group items for this device - // - global = width * height; - err = clEnqueueNDRangeKernel(OpenCL::g_cmdq, Decoders[0].kernel, 1, NULL, &global, &local, 0, NULL, NULL); - if (err != CL_SUCCESS) - { - printf("Error: Failed to execute kernel! %d\n", err); - return PC_TEX_FMT_NONE; - } - - // Wait for the command commands to get serviced before reading back results - // - clFinish(OpenCL::g_cmdq); - - // Read back the results from the device to verify the output - // - err = clEnqueueReadBuffer( OpenCL::g_cmdq, _dst, CL_TRUE, 0, sizeof(unsigned char) * width * height, dst, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - printf("Error: Failed to read output array! %d\n", err); - exit(1); - } - clReleaseMemObject(_dst); - clReleaseMemObject(_src); - } - return PC_TEX_FMT_I8; - break; - default: - return PC_TEX_FMT_NONE; - } - // TODO: clEnqueueNDRangeKernel -*/ /* switch (texformat) { @@ -279,14 +212,7 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei } } return PC_TEX_FMT_I4_AS_I8; - case GX_TF_I8: // speed critical - { - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, src += 8) - memcpy(dst + (y + iy)*width+x, src, 8); - } - return PC_TEX_FMT_I8; + case GX_TF_C8: if (tlutfmt == 2) { @@ -304,14 +230,6 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr); } return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_IA4: - { - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, src += 8) - decodebytesIA4((u16*)dst + (y + iy) * width + x, src); - } - return PC_TEX_FMT_IA4_AS_IA8; case GX_TF_IA8: { for (int y = 0; y < height; y += 4)