diff --git a/Data/User/OpenCL/TextureDecoder.cl b/Data/User/OpenCL/TextureDecoder.cl index 06bf7d5df3..93f3f2aa23 100644 --- a/Data/User/OpenCL/TextureDecoder.cl +++ b/Data/User/OpenCL/TextureDecoder.cl @@ -49,112 +49,107 @@ kernel void DecodeI4_RGBA(global uint *dst, } } -kernel void DecodeI8(global uchar *dst, - const global uchar *src, int width) -{ - int x = get_global_id(0) * 8, y = get_global_id(1) * 4; - int srcOffset = ((x * 4) + (y * width)) / 8; - for (int iy = 0; iy < 4; iy++) - { - vstore8(vload8(srcOffset++, src), - 0, dst + ((y + iy)*width + x)); - } +kernel void DecodeI8(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + for (int iy = 0; iy < 4; iy++) + { + vstore8(vload8(srcOffset++, src), + 0, dst + ((y + iy)*width + x)); + } } -kernel void DecodeI8_RGBA(global uint *dst, - const global uchar *src, int width) -{ - int x = get_global_id(0) * 8, y = get_global_id(1) * 4; - int srcOffset = ((x * 4) + (y * width)) / 8; - for (int iy = 0; iy < 4; iy++) - { - uchar8 val = vload8(srcOffset++, src); - vstore8(upsample(upsample(val,val),upsample(val,val)), - 0, dst + ((y + iy)*width + x)); - } +kernel void DecodeI8_RGBA(global uint *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + for (int iy = 0; iy < 4; iy++) + { + uchar8 val = vload8(srcOffset++, src); + vstore8(upsample(upsample(val,val),upsample(val,val)), + 0, dst + ((y + iy)*width + x)); + } } -kernel void DecodeIA8(global ushort *dst, - const global uchar *src, int width) -{ - int x = get_global_id(0) * 4, y = get_global_id(1) * 4; - int srcOffset = ((x * 4) + (y * width)) / 4; - for (int iy = 0; iy < 4; iy++) - { - uchar8 val = vload8(srcOffset++, src); - vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x)); - } -} - -kernel void DecodeIA8_RGBA(global uint *dst, - const global uchar *src, int width) -{ - int x = get_global_id(0) * 4, y = get_global_id(1) * 4; - int srcOffset = ((x * 4) + (y * width)) / 4; - for (int iy = 0; iy < 4; iy++) - { - uchar8 val = vload8(srcOffset++, src); - vstore4(upsample(upsample(val.even,val.odd),upsample(val.odd, val.odd)), 0, dst + ((y + iy)*width + x)); - } -} - -kernel void DecodeIA4(global ushort *dst, - const global uchar *src, int width) -{ - int x = get_global_id(0) * 8, y = get_global_id(1) * 4; - int srcOffset = ((x * 4) + (y * width)) / 8; - uchar8 val; - ushort8 res; - for (int iy = 0; iy < 4; iy++) - { - val = vload8(srcOffset++, src); - res = upsample(val >> (uchar8)4, val & (uchar8)0xF); - res |= res << (ushort8)4; - vstore8(res, 0, dst + y*width + x); - dst+=width; - } -} - -kernel void DecodeIA4_RGBA(global uint *dst, - const global uchar *src, int width) -{ - int x = get_global_id(0) * 8, y = get_global_id(1) * 4; - int srcOffset = ((x * 4) + (y * width)) / 8; - uchar8 val; - uint8 res; - for (int iy = 0; iy < 4; iy++) - { - val = vload8(srcOffset++, src); - uchar8 a = val >> (uchar8)4; - uchar8 l = val & (uchar8)0xF; - res = upsample(upsample(a, l), upsample(l,l)); - res |= res << (uint8)4; - vstore8(res, 0, dst + y*width + x); - dst+=width; - } -} - -kernel void DecodeRGBA8(global uchar *dst, +kernel void DecodeIA8(global ushort *dst, const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 4; + for (int iy = 0; iy < 4; iy++) + { + uchar8 val = vload8(srcOffset++, src); + vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x)); + } +} + +kernel void DecodeIA8_RGBA(global uint *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 4; + for (int iy = 0; iy < 4; iy++) + { + uchar8 val = vload8(srcOffset++, src); + vstore4(upsample(upsample(val.even,val.odd),upsample(val.odd, val.odd)), 0, dst + ((y + iy)*width + x)); + } +} + +kernel void DecodeIA4(global ushort *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + uchar8 val; + ushort8 res; + for (int iy = 0; iy < 4; iy++) + { + val = vload8(srcOffset++, src); + res = upsample(val >> (uchar8)4, val & (uchar8)0xF); + res |= res << (ushort8)4; + vstore8(res, 0, dst + y*width + x); + dst+=width; + } +} + +kernel void DecodeIA4_RGBA(global uint *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + uchar8 val; + uint8 res; + for (int iy = 0; iy < 4; iy++) + { + val = vload8(srcOffset++, src); + uchar8 a = val >> (uchar8)4; + uchar8 l = val & (uchar8)0xF; + res = upsample(upsample(a, l), upsample(l,l)); + res |= res << (uint8)4; + vstore8(res, 0, dst + y*width + x); + dst+=width; + } +} + +kernel void DecodeRGBA8(global ushort *dst, + const global ushort *src, int width) { int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int srcOffset = (x * 2) + (y * width) / 2; for (int iy = 0; iy < 4; iy++) { - uchar8 ar = vload8(srcOffset, src); - uchar8 gb = vload8(srcOffset + 4, src); - uchar16 res; - res.even.even = gb.odd; - res.even.odd = ar.odd; - res.odd.even = gb.even; - res.odd.odd = ar.even; - vstore16(res, 0, dst + ((y + iy)*width + x) * 4); + ushort8 val = (ushort8)(vload4(srcOffset, src), vload4(srcOffset + 4, src)); + ushort8 bgra = rotate(val,(ushort8)8).s40516273; + vstore8(bgra, 0, dst + ((y + iy)*width + x) * 2); srcOffset++; } } kernel void DecodeRGBA8_RGBA(global uchar *dst, - const global uchar *src, int width) + const global uchar *src, int width) { int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int srcOffset = (x * 2) + (y * width) / 2; @@ -173,19 +168,19 @@ kernel void DecodeRGBA8_RGBA(global uchar *dst, } kernel void DecodeRGB565(global ushort *dst, - const global uchar *src, int width) + const global ushort *src, int width) { int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int srcOffset = x + (y * width) / 4; + dst += width*y + x; for (int iy = 0; iy < 4; iy++) { - uchar8 val = vload8(srcOffset++, src); - vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x)); + vstore4(rotate(vload4(srcOffset++, src),(ushort4)8), 0, dst + iy*width); } } kernel void DecodeRGB565_RGBA(global uchar *dst, - const global uchar *src, int width) + const global uchar *src, int width) { int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int srcOffset = x + (y * width) / 4; @@ -194,9 +189,9 @@ kernel void DecodeRGB565_RGBA(global uchar *dst, uchar8 val = vload8(srcOffset++, src); uchar16 res; - res.even.even = bitselect(val.even, val.even >> (uchar4)5, (uchar4)7); - res.odd.even = bitselect((val.odd >> (uchar4)3) | (val.even << (uchar4)5), val.even >> (uchar4)1, (uchar4)3); - res.even.odd = bitselect(val.odd << (uchar4)3, val.odd >> (uchar4)2, (uchar4)7); + res.even.even = bitselect(val.even, val.even >> (uchar4)5, (uchar4)7); + res.odd.even = bitselect((val.odd >> (uchar4)3) | (val.even << (uchar4)5), val.even >> (uchar4)1, (uchar4)3); + res.even.odd = bitselect(val.odd << (uchar4)3, val.odd >> (uchar4)2, (uchar4)7); res.odd.odd = (uchar4)0xFF; vstore16(res, 0, dst + ((y + iy)*width + x) * 4); @@ -209,7 +204,7 @@ kernel void DecodeRGB5A3(global uchar *dst, int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int srcOffset = x + (y * width) / 4; uchar8 val; - uchar16 resNoAlpha, resAlpha, res, choice; + uchar16 resNoAlpha, resAlpha, choice; #define iterateRGB5A3() \ val = vload8(srcOffset++, src); \ resNoAlpha.s26AE = val.even << (uchar4)1; \ @@ -234,13 +229,13 @@ kernel void DecodeRGB5A3(global uchar *dst, } kernel void DecodeRGB5A3_RGBA(global uchar *dst, - const global uchar *src, int width) + const global uchar *src, int width) { int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int srcOffset = x + (y * width) / 4; uchar8 val; - uchar16 resNoAlpha, resAlpha, res, choice; - #define iterateRGB5A3() \ + uchar16 resNoAlpha, resAlpha, choice; + #define iterateRGB5A3_RGBA() \ val = vload8(srcOffset++, src); \ resNoAlpha.s048C = val.even << (uchar4)1; \ resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \ @@ -257,10 +252,10 @@ kernel void DecodeRGB5A3_RGBA(global uchar *dst, (uchar4)(val.even.s2), \ (uchar4)(val.even.s3)); \ vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4); - iterateRGB5A3(); dst += width*4; - iterateRGB5A3(); dst += width*4; - iterateRGB5A3(); dst += width*4; - iterateRGB5A3(); + iterateRGB5A3_RGBA(); dst += width*4; + iterateRGB5A3_RGBA(); dst += width*4; + iterateRGB5A3_RGBA(); dst += width*4; + iterateRGB5A3_RGBA(); } uint16 unpack(uchar b) @@ -279,9 +274,9 @@ kernel void decodeCMPRBlock(global uchar *dst, uchar2 colora565 = (uchar2)(val.s1, val.s3); uchar2 colorb565 = (uchar2)(val.s0, val.s2); - uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7), - bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3), - bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7), + uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7), + bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3), + bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7), (uchar2)0xFF); ushort4 frac2 = convert_ushort4(color32.even & (uchar4)0xFF) - convert_ushort4(color32.odd & (uchar4)0xFF); @@ -317,16 +312,16 @@ kernel void DecodeCMPR(global uchar *dst, } kernel void decodeCMPRBlock_RGBA(global uchar *dst, - const global uchar *src, int width) + const global uchar *src, int width) { int x = get_global_id(0) * 4, y = get_global_id(1) * 4; uchar8 val = vload8(0, src); uchar2 colora565 = (uchar2)(val.s1, val.s3); uchar2 colorb565 = (uchar2)(val.s0, val.s2); - uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7), - bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3), - bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7), + uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7), + bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3), + bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7), (uchar2)0xFF); ushort4 frac2 = convert_ushort4(color32.even & (uchar4)0xFF) - convert_ushort4(color32.odd & (uchar4)0xFF); @@ -348,7 +343,7 @@ kernel void decodeCMPRBlock_RGBA(global uchar *dst, } kernel void DecodeCMPR_RGBA(global uchar *dst, - const global uchar *src, int width) + const global uchar *src, int width) { int x = get_global_id(0) * 8, y = get_global_id(1) * 8;