diff --git a/Data/User/OpenCL/TextureDecoder.cl b/Data/User/OpenCL/TextureDecoder.cl index f7d3ddf39a..89aea3818e 100644 --- a/Data/User/OpenCL/TextureDecoder.cl +++ b/Data/User/OpenCL/TextureDecoder.cl @@ -99,6 +99,25 @@ kernel void DecodeRGBA8(global uchar *dst, } } +kernel void DecodeRGBA8_RGBA(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = (x * 2) + (y * width) / 2; + for (int iy = 0; iy < 4; iy++) + { + uchar8 ar = vload8(srcOffset, src); + uchar8 gb = vload8(srcOffset + 4, src); + uchar16 res; + res.even.even = ar.odd; + res.even.odd = gb.odd; + res.odd.even = gb.even; + res.odd.odd = ar.even; + vstore16(res, 0, dst + ((y + iy)*width + x) * 4); + srcOffset++; + } +} + kernel void DecodeRGB565(global ushort *dst, const global uchar *src, int width) { @@ -111,6 +130,25 @@ kernel void DecodeRGB565(global ushort *dst, } } +kernel void DecodeRGB565_RGBA(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = x + (y * width) / 4; + for (int iy = 0; iy < 4; iy++) + { + uchar8 val = vload8(srcOffset++, src); + + uchar16 res; + res.even.even = bitselect(val.even, val.even >> (uchar4)5, (uchar4)7); + res.odd.even = bitselect((val.odd >> (uchar4)3) | (val.even << (uchar4)5), val.even >> (uchar4)1, (uchar4)3); + res.even.odd = bitselect(val.odd << (uchar4)3, val.odd >> (uchar4)2, (uchar4)7); + res.odd.odd = (uchar4)0xFF; + + vstore16(res, 0, dst + ((y + iy)*width + x) * 4); + } +} + kernel void DecodeRGB5A3(global uchar *dst, const global uchar *src, int width) { @@ -141,6 +179,36 @@ kernel void DecodeRGB5A3(global uchar *dst, iterateRGB5A3(); } +kernel void DecodeRGB5A3_RGBA(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = x + (y * width) / 4; + uchar8 val; + uchar16 resNoAlpha, resAlpha, res, choice; + #define iterateRGB5A3() \ + val = vload8(srcOffset++, src); \ + resNoAlpha.s048C = val.even << (uchar4)1; \ + resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \ + resNoAlpha.s26AE = val.odd << (uchar4)3; \ + resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> (uchar16)5, (uchar16)0x3); \ + resNoAlpha.s37BF = (uchar4)(0xFF); \ + resAlpha.s048C = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \ + resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4, (uchar4)0xF); \ + resAlpha.s26AE = bitselect(val.odd << (uchar4)4, val.odd, (uchar4)0xF); \ + resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \ + resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \ + choice = (uchar16)((uchar4)(val.even.s0), \ + (uchar4)(val.even.s1), \ + (uchar4)(val.even.s2), \ + (uchar4)(val.even.s3)); \ + vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4); + iterateRGB5A3(); dst += width*4; + iterateRGB5A3(); dst += width*4; + iterateRGB5A3(); dst += width*4; + iterateRGB5A3(); +} + uint16 unpack(uchar b) { return (uint16)((uint4)(b >> 3 & 0x18), @@ -192,4 +260,49 @@ kernel void DecodeCMPR(global uchar *dst, decodeCMPRBlock(dst + 16, src, width); src += 8; decodeCMPRBlock(dst + 16 * width, src, width); src += 8; decodeCMPRBlock(dst + 16 * (width + 1), src, width); +} + +kernel void decodeCMPRBlock_RGBA(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + uchar8 val = vload8(0, src); + + uchar2 colora565 = (uchar2)(val.s1, val.s3); + uchar2 colorb565 = (uchar2)(val.s0, val.s2); + uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7), + bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3), + bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7), + (uchar2)0xFF); + + ushort4 frac2 = convert_ushort4(color32.even & (uchar4)0xFF) - convert_ushort4(color32.odd & (uchar4)0xFF); + uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8); + + ushort4 colorAlpha = upsample((uchar4)0, rhadd(color32.odd, color32.even)); + colorAlpha.s3 = 0xFF; + ushort4 colorNoAlpha = upsample(color32.odd + frac, color32.even - frac); + + uint4 colors = upsample((upsample(val.s0,val.s1) > upsample(val.s2,val.s3))?colorNoAlpha:colorAlpha, + upsample(color32.odd, color32.even)); + + uint16 colorsFull = (uint16)(colors, colors, colors, colors); + + vstore16(convert_uchar16(colorsFull >> unpack(val.s4)), 0, dst); + vstore16(convert_uchar16(colorsFull >> unpack(val.s5)), 0, dst+=width*4); + vstore16(convert_uchar16(colorsFull >> unpack(val.s6)), 0, dst+=width*4); + vstore16(convert_uchar16(colorsFull >> unpack(val.s7)), 0, dst+=width*4); +} + +kernel void DecodeCMPR_RGBA(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 8; + + src += x * 4 + (y * width) / 2; + dst += (y * width + x) * 4; + + decodeCMPRBlock_RGBA(dst, src, width); src += 8; + decodeCMPRBlock_RGBA(dst + 16, src, width); src += 8; + decodeCMPRBlock_RGBA(dst + 16 * width, src, width); src += 8; + decodeCMPRBlock_RGBA(dst + 16 * (width + 1), src, width); } \ No newline at end of file diff --git a/Source/Core/Common/Src/OpenCL.cpp b/Source/Core/Common/Src/OpenCL.cpp index f2713257fb..55a2453512 100644 --- a/Source/Core/Common/Src/OpenCL.cpp +++ b/Source/Core/Common/Src/OpenCL.cpp @@ -163,7 +163,9 @@ cl_kernel CompileKernel(cl_program program, const char *Function) cl_kernel kernel = clCreateKernel(program, Function, &err); if (!kernel || err != CL_SUCCESS) { - HandleCLError(err, "Failed to create compute kernel!"); + char buffer[1024]; + sprintf(buffer, "Failed to create compute kernel '%s' !", Function); + HandleCLError(err, buffer); return NULL; } NOTICE_LOG(COMMON, "OpenCL CompileKernel took %.3f seconds", (float)(Common::Timer::GetTimeMs() - compileStart) / 1000.0); diff --git a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp index 73fe94879e..4a97dca188 100644 --- a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp @@ -102,11 +102,11 @@ void TexDecoder_OpenCL_Initialize() { g_program = OpenCL::CompileProgram(code.c_str()); int i = 0; - for(int i = 0; i < GX_TF_CMPR; ++i) { + for(int i = 0; i <= GX_TF_CMPR; ++i) { if(g_DecodeParametersNative[i].name) g_DecodeParametersNative[i].kernel = OpenCL::CompileKernel(g_program, g_DecodeParametersNative[i].name); - if(false && g_DecodeParametersRGBA[i].name) + if(g_DecodeParametersRGBA[i].name) g_DecodeParametersRGBA[i].kernel = OpenCL::CompileKernel(g_program, g_DecodeParametersRGBA[i].name); }