diff --git a/Data/User/OpenCL/TextureDecoder.cl b/Data/User/OpenCL/TextureDecoder.cl index d705465a80..2b46761167 100644 --- a/Data/User/OpenCL/TextureDecoder.cl +++ b/Data/User/OpenCL/TextureDecoder.cl @@ -65,15 +65,19 @@ kernel void DecodeIA4(global uchar *dst, { int x = get_global_id(0) * 8, y = get_global_id(1) * 4; int srcOffset = ((x * 4) + (y * width)) / 8; - for (int iy = 0; iy < 4; iy++) - { - uchar8 val = vload8(srcOffset++, src); - uchar16 res; - res.odd = (val >> (uchar8)4); - res.even = val & (uchar8)0x0F; - res |= res << (uchar16)4; - vstore16(res, 0, dst + ((y + iy)*width + x) * 2); - } + uchar8 val; + uchar16 res; + dst += 2*(y*width + x); + #define iterateIA4() \ + val = vload8(srcOffset++, src); \ + res.odd = (val >> (uchar8)4); \ + res.even = val & (uchar8)0x0F; \ + res |= res << (uchar16)4; \ + vstore16(res, 0, dst); + iterateIA4(); dst += 2*width; + iterateIA4(); dst += 2*width; + iterateIA4(); dst += 2*width; + iterateIA4(); } kernel void DecodeRGBA8(global uchar *dst, @@ -114,40 +118,34 @@ kernel void DecodeRGB5A3(global uchar *dst, int srcOffset = x + (y * width) / 4; uchar8 val; uchar16 resNoAlpha, resAlpha, res, choice; - uchar4 gNoAlpha, aAlpha; #define iterateRGB5A3() \ val = vload8(srcOffset++, src); \ - gNoAlpha = (val.even << (uchar4)6) | (val.odd >> (uchar4)2); \ - resNoAlpha.s26AE = bitselect(val.even >> (uchar4)4, val.even << (uchar4)1, (uchar4)0xFFF); \ - resNoAlpha.s159D = bitselect(gNoAlpha >> (uchar4)5, gNoAlpha, (uchar4)0xFFF); \ - resNoAlpha.s048C = bitselect(val.odd >> (uchar4)2, val.odd << (uchar4)3, (uchar4)0xFFF); \ + resNoAlpha.s26AE = val.even << (uchar4)1; \ + resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \ + resNoAlpha.s048C = val.odd << (uchar4)3; \ resNoAlpha.s37BF = (uchar4)(0xFF); \ - resAlpha.s26AE = val.even; \ - resAlpha.s159D = val.odd >> (uchar4)4; \ - resAlpha.s048C = val.odd; \ - resAlpha &= (uchar16)0x0F; \ - resAlpha |= (resAlpha << (uchar16)4); \ - resAlpha.s37BF = val.even << (uchar4)1 & (uchar4)0xE0; \ - resAlpha.s37BF |= ((resAlpha.s37BF >> (uchar4)3) & (uchar4)0x1C) \ - | ((resAlpha.s37BF >> (uchar4)6) & (uchar4)0x3); \ + resAlpha.s26AE = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \ + resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4, (uchar4)0xF); \ + resAlpha.s048C = bitselect(val.odd << (uchar4)4, val.odd, (uchar4)0xF); \ + resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \ + resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \ choice = (uchar16)((uchar4)(val.even.s0), \ (uchar4)(val.even.s1), \ (uchar4)(val.even.s2), \ (uchar4)(val.even.s3)); \ - vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4); \ - dst += width*4; // This may look ugly but unrolling loops is required for pre-DX11 hardware. - iterateRGB5A3(); - iterateRGB5A3(); - iterateRGB5A3(); + vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4); + iterateRGB5A3(); dst += width*4; + iterateRGB5A3(); dst += width*4; + iterateRGB5A3(); dst += width*4; iterateRGB5A3(); } uint16 unpack(uchar b) { - return (uint16)((uint4)(b >> 6), - (uint4)(b >> 4 & 3), - (uint4)(b >> 2 & 3), - (uint4)(b & 3)); + return (uint16)((uint4)(b >> 3 & 0x18), + (uint4)(b >> 1 & 0x18), + (uint4)(b << 1 & 0x18), + (uint4)(b << 3 & 0x18)); } kernel void decodeCMPRBlock(global uchar *dst, @@ -158,43 +156,28 @@ kernel void decodeCMPRBlock(global uchar *dst, uchar2 colora565 = (uchar2)(val.s1, val.s3); uchar2 colorb565 = (uchar2)(val.s0, val.s2); - uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)0xFFFFF000), - colora565 >> (uchar2)3 | bitselect(colorb565 << (uchar2)5, colorb565 >> (uchar2)1, (uchar2)0xFFFFFF00), - bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)0xFFFFF000), + uchar8 color32 = (uchar8)(colora565 << (uchar2)3, + colora565 >> (uchar2)3 | colorb565 << (uchar2)5, + colorb565, (uchar2)0xFF); - uint4 colors; - uint4 colorNoAlpha; - ushort4 frac2 = (ushort4)(color32.even & (uchar4)0xFF) - (ushort4)(color32.odd & (uchar4)0xFF); - uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8); - colorNoAlpha = convert_uint4(color32.odd + frac); - colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even - frac); - colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.odd); - colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even); - uint4 colorAlpha; - uchar4 midpoint = rhadd(color32.odd, color32.even); - midpoint.s3 = 0xFF; - colorAlpha = convert_uint4(midpoint); - colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.odd); - colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.even); + ushort4 frac2 = (ushort4)(color32.even & (uchar4)0xFF) - (ushort4)(color32.odd & (uchar4)0xFF); + uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8); + + ushort4 colorAlpha = upsample((uchar4)0, rhadd(color32.odd, color32.even)); + colorAlpha.s3 = 0xFF; + ushort4 colorNoAlpha = upsample(color32.odd + frac, color32.even - frac); - uint4 choice = isgreater(upsample(val.s0,val.s1),upsample(val.s2, val.s3)); - colors = bitselect(colorNoAlpha, colorAlpha, choice); + ushort4 choice = isgreater(val.s0,val.s2); + uint4 colors = upsample(bitselect(colorNoAlpha, colorAlpha, choice), + upsample(color32.odd, color32.even)); uint16 colorsFull = (uint16)(colors, colors, colors, colors); - uint16 shifts = (((unpack(val.s7) << (uint16)8 - | unpack(val.s6)) << (uint16)8 - | unpack(val.s5)) << (uint16)8 - | unpack(val.s4)) << (uint16)3; - - vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst); - shifts = shifts >> (uint16)8; - vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width*4); - shifts = shifts >> (uint16)8; - vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width*4); - shifts = shifts >> (uint16)8; - vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width*4); + vstore16((uchar16)(colorsFull >> unpack(val.s4)), 0, dst); + vstore16((uchar16)(colorsFull >> unpack(val.s5)), 0, dst+=width*4); + vstore16((uchar16)(colorsFull >> unpack(val.s6)), 0, dst+=width*4); + vstore16((uchar16)(colorsFull >> unpack(val.s7)), 0, dst+=width*4); } kernel void DecodeCMPR(global uchar *dst, @@ -203,13 +186,10 @@ kernel void DecodeCMPR(global uchar *dst, int x = get_global_id(0) * 8, y = get_global_id(1) * 8; src += x * 4 + (y * width) / 2; + dst += (y * width + x) * 4; - decodeCMPRBlock(dst + (y * width + x) * 4, src, width); - src += 8; - decodeCMPRBlock(dst + (y * width + x + 4) * 4, src, width); // + 16 - src += 8; - decodeCMPRBlock(dst + ((y + 4) * width + x) * 4, src, width); // + 16*width - src += 8; - decodeCMPRBlock(dst + ((y + 4) * width + x + 4) * 4, src, width); // + 16*(width+1) - + decodeCMPRBlock(dst, src, width); src += 8; + decodeCMPRBlock(dst + 16, src, width); src += 8; + decodeCMPRBlock(dst + 16 * width, src, width); src += 8; + decodeCMPRBlock(dst + 16 * (width + 1), src, width); } \ No newline at end of file