diff --git a/Data/User/OpenCL/TextureDecoder.cl b/Data/User/OpenCL/TextureDecoder.cl index 1ca5cc6d6f..69bc433212 100644 --- a/Data/User/OpenCL/TextureDecoder.cl +++ b/Data/User/OpenCL/TextureDecoder.cl @@ -52,12 +52,11 @@ kernel void DecodeIA8(global uchar *dst, int srcOffset = ((x * 4) + (y * width)) / 4; for (int iy = 0; iy < 4; iy++) { - uchar8 val = vload8(srcOffset, src); + uchar8 val = vload8(srcOffset++, src); uchar8 res; res.odd = val.even; res.even = val.odd; vstore8(res, 0, dst + ((y + iy)*width + x) * 2); - srcOffset++; } } @@ -68,13 +67,12 @@ kernel void DecodeIA4(global uchar *dst, int srcOffset = ((x * 4) + (y * width)) / 8; for (int iy = 0; iy < 4; iy++) { - uchar8 val = vload8(srcOffset, src); + uchar8 val = vload8(srcOffset++, src); uchar16 res; - res.odd = (val >> (uchar8)4) & (uchar8)0x0F; + res.odd = (val >> (uchar8)4); res.even = val & (uchar8)0x0F; res |= res << (uchar16)4; vstore16(res, 0, dst + ((y + iy)*width + x) * 2); - srcOffset++; } } @@ -98,16 +96,14 @@ kernel void DecodeRGBA8(global uchar *dst, } kernel void DecodeRGB565(global ushort *dst, - const global ushort *src, int width) + const global uchar *src, int width) { int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int srcOffset = x + (y * width) / 4; for (int iy = 0; iy < 4; iy++) { - ushort4 val = vload4(srcOffset, src); - val = (val >> (ushort4)8) | (val << (ushort4)8); - vstore4(val, 0, dst + ((y + iy)*width + x)); - srcOffset++; + uchar8 val = vload8(srcOffset++, src); + vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x)); } } @@ -118,59 +114,60 @@ kernel void DecodeRGB5A3(global uchar *dst, int srcOffset = x + (y * width) / 4; for (int iy = 0; iy < 4; iy++) { - ushort8 val = convert_ushort8(vload8(srcOffset, src)); - ushort4 vs = val.odd | (ushort4)(val.even << (ushort4)8); + uchar8 val = vload8(srcOffset++, src); + ushort4 vs = upsample(val.even, val.odd); uchar16 resNoAlpha; - resNoAlpha.s26AE = convert_uchar4(vs >> (ushort4)7); // R - resNoAlpha.s159D = convert_uchar4(vs >> (ushort4)2); // G - resNoAlpha.s048C = convert_uchar4(vs << (ushort4)3); // B + resNoAlpha.s26AE = (uchar4)(vs >> (ushort4)7); // R + resNoAlpha.s159D = (uchar4)(vs >> (ushort4)2); // G + resNoAlpha.s048C = (uchar4)(vs << (ushort4)3); // B resNoAlpha &= (uchar16)0xF8; - resNoAlpha |= (uchar16)(resNoAlpha >> (uchar16)5) & (uchar16)3; // 5 -> 8 + resNoAlpha |= (uchar16)(resNoAlpha >> (uchar16)5); // 5 -> 8 resNoAlpha.s37BF = (uchar4)(0xFF); uchar16 resAlpha; - resAlpha.s26AE = convert_uchar4(vs >> (ushort4)8); // R - resAlpha.s159D = convert_uchar4(vs >> (ushort4)4); // G - resAlpha.s048C = convert_uchar4(vs); // B + resAlpha.s26AE = val.even; // R + resAlpha.s159D = val.odd >> (uchar4)4; // G + resAlpha.s048C = val.odd; // B resAlpha &= (uchar16)0x0F; resAlpha |= (resAlpha << (uchar16)4); resAlpha.s37BF = convert_uchar4(vs >> (ushort4)7) & (uchar4)0xE0; resAlpha.s37BF |= ((resAlpha.s37BF >> (uchar4)3) & (uchar4)0x1C) | ((resAlpha.s37BF >> (uchar4)6) & (uchar4)0x3); - uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8), - (uchar4)(vs.s1 >> 8), - (uchar4)(vs.s2 >> 8), - (uchar4)(vs.s3 >> 8)); + uchar16 choice = (uchar16)((uchar4)(val.even.s0), + (uchar4)(val.even.s1), + (uchar4)(val.even.s2), + (uchar4)(val.even.s3)); uchar16 res; res = select(resAlpha, resNoAlpha, choice); vstore16(res, 0, dst + ((y + iy) * width + x) * 4); - srcOffset++; } } -uint4 unpack2bits(uchar b) +uint16 unpack(uchar b) { - return (uint4)(b >> 6, - (b >> 4) & 3, - (b >> 2) & 3, - b & 3); -} + return (uint16)((uint4)(b >> 6), + (uint4)(b >> 4 & 3), + (uint4)(b >> 2 & 3), + (uint4)(b & 3)); +} kernel void decodeCMPRBlock(global uchar *dst, const global uchar *src, int width) { int x = get_global_id(0) * 4, y = get_global_id(1) * 4; uchar8 val = vload8(0, src); - ushort2 color565 = (ushort2)((val.s1 & 0xFF) | (val.s0 << 8), (val.s3 & 0xFF) | (val.s2 << 8)); - uchar8 color32 = convert_uchar8((ushort8) - (((color565 << (ushort2)3) & (ushort2)0xF8) | ((color565 >> (ushort2)2) & (ushort2)0x7), - ((color565 >> (ushort2)3) & (ushort2)0xFC) | ((color565 >> (ushort2)9) & (ushort2)0x3), - ((color565 >> (ushort2)8) & (ushort2)0xF8) | ((color565 >> (ushort2)13) & (ushort2)0x7), - 0xFF, 0xFF)); + + uchar2 colora565 = (uchar2)(val.s1, val.s3); + uchar2 colorb565 = (uchar2)(val.s0, val.s2); + uchar8 color32 = (uchar8)((colora565 << (uchar2)3) | (colora565 >> (uchar2)2 & (uchar2)7), + (colora565 >> (uchar2)3) | (colorb565 << (uchar2)5) | (colorb565 >> (uchar2)1 & (uchar2)3), + (colorb565 & (uchar2)0xF8) | (colorb565 >> (uchar2)5 & (uchar2)7), + (uchar2)0xFF); uint4 colors; uint4 colorNoAlpha; - uchar4 frac = convert_uchar4((((convert_ushort4(color32.even) & (ushort4)0xFF) - (convert_ushort4(color32.odd) & (ushort4)0xFF)) * (ushort4)3) / (ushort4)8); + ushort4 frac2 = (ushort4)(color32.even & (uchar4)0xFF) - (ushort4)(color32.odd & (uchar4)0xFF); + uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8); colorNoAlpha = convert_uint4(color32.odd + frac); colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even - frac); colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.odd); @@ -179,31 +176,27 @@ kernel void decodeCMPRBlock(global uchar *dst, uint4 colorAlpha; uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + (ushort4)1) / (ushort4)2); midpoint.s3 = 0xFF; - colorAlpha = convert_uint4((uchar4)(0, 0, 0, 0)); - colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(midpoint); + colorAlpha = convert_uint4(midpoint); colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.odd); colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.even); - colors = color565.s0 > color565.s1 ? colorNoAlpha : colorAlpha; + uint4 choice = isgreater(upsample(val.s0,val.s1),upsample(val.s2, val.s3)); + colors = select(colorNoAlpha, colorAlpha, choice); uint16 colorsFull = (uint16)(colors, colors, colors, colors); - uint4 shift0 = unpack2bits(val.s4); - uint4 shift1 = unpack2bits(val.s5); - uint4 shift2 = unpack2bits(val.s6); - uint4 shift3 = unpack2bits(val.s7); - uint16 shifts = (uint16)((uint4)(shift3.s0), (uint4)(shift3.s1), (uint4)(shift3.s2), (uint4)(shift3.s3)); - shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift2.s0), (uint4)(shift2.s1), (uint4)(shift2.s2), (uint4)(shift2.s3)); - shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift1.s0), (uint4)(shift1.s1), (uint4)(shift1.s2), (uint4)(shift1.s3)); - shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift0.s0), (uint4)(shift0.s1), (uint4)(shift0.s2), (uint4)(shift0.s3)) << (uint16)3; - - for (int iy = 0; iy < 4; iy++) - { - uchar16 res; - res = convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)) >> (uchar16)8; - vstore16(res, 0, dst); - dst += width * 4; - } + uint16 shifts = (((unpack(val.s7) << (uint16)8 + | unpack(val.s6)) << (uint16)8 + | unpack(val.s5)) << (uint16)8 + | unpack(val.s4)) << (uint16)3; + + vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4); + shifts = shifts >> (uint16)8; + vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4); + shifts = shifts >> (uint16)8; + vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4); + shifts = shifts >> (uint16)8; + vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4); } kernel void DecodeCMPR(global uchar *dst, @@ -215,10 +208,10 @@ kernel void DecodeCMPR(global uchar *dst, decodeCMPRBlock(dst + (y * width + x) * 4, src, width); src += 8; - decodeCMPRBlock(dst + (y * width + x + 4) * 4, src, width); + decodeCMPRBlock(dst + (y * width + x + 4) * 4, src, width); // + 16 src += 8; - decodeCMPRBlock(dst + ((y + 4) * width + x) * 4, src, width); + decodeCMPRBlock(dst + ((y + 4) * width + x) * 4, src, width); // + 16*width src += 8; - decodeCMPRBlock(dst + ((y + 4) * width + x + 4) * 4, src, width); + decodeCMPRBlock(dst + ((y + 4) * width + x + 4) * 4, src, width); // + 16*(width+1) } \ No newline at end of file diff --git a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp index a81a58bd51..644279211d 100644 --- a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp @@ -33,8 +33,8 @@ struct sDecoders { - const char name[256]; // kernel name - cl_kernel kernel; // compute kernel + const char name[256]; // kernel name + cl_kernel kernel; // compute kernel }; cl_program g_program; @@ -57,17 +57,17 @@ cl_mem g_clsrc, g_cldst; // texture buffer memory objects void TexDecoder_OpenCL_Initialize() { #if defined(HAVE_OPENCL) && HAVE_OPENCL if(!g_Inited) - { + { if(!OpenCL::Initialize()) return; - std::string code; - char filename[1024]; - sprintf(filename, "%sOpenCL/TextureDecoder.cl", File::GetUserPath(D_USER_IDX)); + std::string code; + char filename[1024]; + sprintf(filename, "%sOpenCL/TextureDecoder.cl", File::GetUserPath(D_USER_IDX)); if (!File::ReadFileToString(true, filename, code)) { ERROR_LOG(VIDEO, "Failed to load OpenCL code %s - file is missing?", filename); - return; + return; } g_program = OpenCL::CompileProgram(code.c_str()); @@ -94,7 +94,7 @@ void TexDecoder_OpenCL_Shutdown() { clReleaseProgram(g_program); int i = 0; - while(strlen(Decoders[i].name) > 0) + while(strlen(Decoders[i].name) > 0) { clReleaseKernel(Decoders[i].kernel); i++; @@ -113,21 +113,21 @@ void TexDecoder_OpenCL_Shutdown() { PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { #if defined(HAVE_OPENCL) && HAVE_OPENCL - cl_int err; + cl_int err; cl_kernel kernelToRun = Decoders[0].kernel; float sizeOfDst = sizeof(u8), sizeOfSrc = sizeof(u8), xSkip, ySkip; PC_TexFormat formatResult; - - switch(texformat) + + switch(texformat) { case GX_TF_I4: - kernelToRun = Decoders[0].kernel; + kernelToRun = Decoders[0].kernel; sizeOfSrc = sizeof(u8) / 2.0f; sizeOfDst = sizeof(u8); xSkip = 8; ySkip = 8; - formatResult = PC_TEX_FMT_I4_AS_I8; - break; + formatResult = PC_TEX_FMT_I4_AS_I8; + break; case GX_TF_I8: kernelToRun = Decoders[1].kernel; sizeOfSrc = sizeOfDst = sizeof(u8); @@ -162,25 +162,21 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei sizeOfSrc = sizeOfDst = sizeof(u16); xSkip = 4; ySkip = 4; - formatResult = PC_TEX_FMT_RGB565; + formatResult = PC_TEX_FMT_RGB565; break; - case GX_TF_RGB5A3: - // Doesn't decode correctly - // See Sonic Adventure 2: Battle opening sequence - return PC_TEX_FMT_NONE; + case GX_TF_RGB5A3: + // Reported issues with Sonic Adventure 2: Battle opening sequence? kernelToRun = Decoders[6].kernel; sizeOfSrc = sizeof(u16); - sizeOfDst = sizeof(u32); + sizeOfDst = sizeof(u32); xSkip = 4; ySkip = 4; formatResult = PC_TEX_FMT_BGRA32; break; case GX_TF_CMPR: - // Doesn't decode correctly - return PC_TEX_FMT_NONE; kernelToRun = Decoders[7].kernel; sizeOfSrc = sizeof(u8) / 2.0f; - sizeOfDst = sizeof(u32); + sizeOfDst = sizeof(u32); xSkip = 8; ySkip = 8; formatResult = PC_TEX_FMT_BGRA32; @@ -212,7 +208,7 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei err = clEnqueueNDRangeKernel(OpenCL::GetCommandQueue(), kernelToRun, 2, NULL, global, NULL, 0, NULL, NULL); if(err) - OpenCL::HandleCLError(err, "Failed to enqueue kernel"); + OpenCL::HandleCLError(err, "Failed to enqueue kernel"); clFinish(OpenCL::GetCommandQueue()); @@ -228,6 +224,6 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei return PC_TEX_FMT_NONE; #endif - return PC_TEX_FMT_NONE; + return PC_TEX_FMT_NONE; }