Hi! This is my first commit so be nice :)

New OpenCL updates: - OpenCL bug with ATI SDK (GPU or CPU) fixed. - IA4 texture loop unrolled. 12x speed up on 4xxx series. - Completed rewriting RGB5A3 texture decode. 20% faster. - Redundant code removed from CMPR and RGB5A3 (Alpha, shift). - Made use of optimised OpenCL functions (upsample, bitselect). - Cleaner code. Tested and working with DX9 plugin. DX11 plugin will NOT work due to a recent commit affecting VideoCommon. You can use this file with an older DX11 plugin (~r5730), however. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5753 8ced0084-cf51-0410-be5f-012b33b47a6e
2010-06-20 16:43:21 +00:00 · 2010-06-20 16:43:21 +00:00 · 783390539d
parent c51bb0b010
commit 783390539d
1 changed files with 50 additions and 70 deletions
--- a/Data/User/OpenCL/TextureDecoder.cl
+++ b/Data/User/OpenCL/TextureDecoder.cl
@ -65,15 +65,19 @@ kernel void DecodeIA4(global uchar *dst,
 {
    int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
    int srcOffset = ((x * 4) + (y * width)) / 8;
-    for (int iy = 0; iy < 4; iy++)
-    {
-        uchar8 val = vload8(srcOffset++, src);
-        uchar16 res;
-        res.odd = (val >> (uchar8)4);
-        res.even  = val & (uchar8)0x0F;
-        res |= res << (uchar16)4;
-        vstore16(res, 0, dst + ((y + iy)*width + x) * 2);
-    }
+    uchar8 val;
+    uchar16 res;
+    dst += 2*(y*width + x);
+    #define iterateIA4() \
+    val = vload8(srcOffset++, src); \
+    res.odd = (val >> (uchar8)4); \
+    res.even  = val & (uchar8)0x0F; \
+    res |= res << (uchar16)4; \
+    vstore16(res, 0, dst);
+    iterateIA4(); dst += 2*width;
+    iterateIA4(); dst += 2*width;
+    iterateIA4(); dst += 2*width;
+    iterateIA4();
 }

 kernel void DecodeRGBA8(global uchar *dst,
@ -114,40 +118,34 @@ kernel void DecodeRGB5A3(global uchar *dst,
    int srcOffset = x + (y * width) / 4;
    uchar8 val;
    uchar16 resNoAlpha, resAlpha, res, choice;
-    uchar4 gNoAlpha, aAlpha;
    #define iterateRGB5A3() \
    val = vload8(srcOffset++, src); \
-    gNoAlpha = (val.even << (uchar4)6) | (val.odd >> (uchar4)2); \
-    resNoAlpha.s26AE = bitselect(val.even >> (uchar4)4,     val.even << (uchar4)1, (uchar4)0xFFF); \
-    resNoAlpha.s159D = bitselect(gNoAlpha >> (uchar4)5,     gNoAlpha,              (uchar4)0xFFF); \
-    resNoAlpha.s048C = bitselect(val.odd >> (uchar4)2,      val.odd << (uchar4)3,  (uchar4)0xFFF); \
+    resNoAlpha.s26AE = val.even << (uchar4)1; \
+    resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \
+    resNoAlpha.s048C = val.odd << (uchar4)3; \
    resNoAlpha.s37BF = (uchar4)(0xFF); \
-    resAlpha.s26AE = val.even; \
-    resAlpha.s159D = val.odd >> (uchar4)4; \
-    resAlpha.s048C = val.odd; \
-    resAlpha &= (uchar16)0x0F; \
-    resAlpha |= (resAlpha << (uchar16)4); \
-    resAlpha.s37BF = val.even << (uchar4)1 & (uchar4)0xE0; \
-    resAlpha.s37BF |= ((resAlpha.s37BF >> (uchar4)3) & (uchar4)0x1C) \
-                   | ((resAlpha.s37BF >> (uchar4)6) & (uchar4)0x3); \
+    resAlpha.s26AE = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \
+    resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4,   (uchar4)0xF); \
+    resAlpha.s048C = bitselect(val.odd << (uchar4)4, val.odd,   (uchar4)0xF); \
+    resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \
+    resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \
    choice = (uchar16)((uchar4)(val.even.s0), \
                       (uchar4)(val.even.s1), \
                       (uchar4)(val.even.s2), \
                       (uchar4)(val.even.s3)); \
-    vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4); \
-    dst += width*4; // This may look ugly but unrolling loops is required for pre-DX11 hardware.
-    iterateRGB5A3();
-    iterateRGB5A3();
-    iterateRGB5A3();
+    vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4);
+    iterateRGB5A3(); dst += width*4;
+    iterateRGB5A3(); dst += width*4;
+    iterateRGB5A3(); dst += width*4;
    iterateRGB5A3();
 }

 uint16 unpack(uchar b)
 {
-    return (uint16)((uint4)(b >> 6),
-                    (uint4)(b >> 4 & 3),
-                    (uint4)(b >> 2 & 3),
-                    (uint4)(b      & 3));
+    return (uint16)((uint4)(b >> 3 & 0x18),
+                    (uint4)(b >> 1 & 0x18),
+                    (uint4)(b << 1 & 0x18),
+                    (uint4)(b << 3 & 0x18));
 }

 kernel void decodeCMPRBlock(global uchar *dst,
@ -158,43 +156,28 @@ kernel void decodeCMPRBlock(global uchar *dst,

    uchar2 colora565 = (uchar2)(val.s1, val.s3);
    uchar2 colorb565 = (uchar2)(val.s0, val.s2);
-    uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)0xFFFFF000),
-                              colora565 >> (uchar2)3 | bitselect(colorb565 << (uchar2)5, colorb565 >> (uchar2)1, (uchar2)0xFFFFFF00),
-                              bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)0xFFFFF000),
+    uchar8 color32 = (uchar8)(colora565 << (uchar2)3,
+                              colora565 >> (uchar2)3 | colorb565 << (uchar2)5,
+                              colorb565,
                              (uchar2)0xFF);
-    uint4 colors;
-    uint4 colorNoAlpha;
-	ushort4 frac2 = (ushort4)(color32.even & (uchar4)0xFF) - (ushort4)(color32.odd & (uchar4)0xFF);
-	uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8);
-    colorNoAlpha = convert_uint4(color32.odd + frac);
-    colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even - frac);
-    colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.odd);
-    colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even);

-    uint4 colorAlpha;
-    uchar4 midpoint = rhadd(color32.odd, color32.even);
-    midpoint.s3 = 0xFF;
-    colorAlpha = convert_uint4(midpoint);    
-    colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.odd);
-    colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.even);
+    ushort4 frac2 = (ushort4)(color32.even & (uchar4)0xFF) - (ushort4)(color32.odd & (uchar4)0xFF);
+    uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8);
+    
+    ushort4 colorAlpha =   upsample((uchar4)0, rhadd(color32.odd, color32.even));
+    colorAlpha.s3 = 0xFF;
+    ushort4 colorNoAlpha = upsample(color32.odd + frac, color32.even - frac);

-    uint4 choice = isgreater(upsample(val.s0,val.s1),upsample(val.s2, val.s3));
-    colors = bitselect(colorNoAlpha, colorAlpha, choice);
+    ushort4 choice = isgreater(val.s0,val.s2);
+    uint4 colors = upsample(bitselect(colorNoAlpha, colorAlpha, choice),
+                            upsample(color32.odd, color32.even));

    uint16 colorsFull = (uint16)(colors, colors, colors, colors);

-    uint16 shifts = (((unpack(val.s7) << (uint16)8
-                  |   unpack(val.s6)) << (uint16)8
-                  |   unpack(val.s5)) << (uint16)8
-                  |   unpack(val.s4)) << (uint16)3;
-
-    vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst);
-    shifts = shifts >> (uint16)8;
-    vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width*4);
-    shifts = shifts >> (uint16)8;
-    vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width*4);
-    shifts = shifts >> (uint16)8;
-    vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width*4);
+    vstore16((uchar16)(colorsFull >> unpack(val.s4)), 0, dst);
+    vstore16((uchar16)(colorsFull >> unpack(val.s5)), 0, dst+=width*4);
+    vstore16((uchar16)(colorsFull >> unpack(val.s6)), 0, dst+=width*4);
+    vstore16((uchar16)(colorsFull >> unpack(val.s7)), 0, dst+=width*4);
 }

 kernel void DecodeCMPR(global uchar *dst,
@ -203,13 +186,10 @@ kernel void DecodeCMPR(global uchar *dst,
    int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
    
    src += x * 4 + (y * width) / 2;
+    dst += (y * width + x) * 4;
    
-    decodeCMPRBlock(dst + (y * width + x) * 4, src, width);
-    src += 8;
-    decodeCMPRBlock(dst + (y * width + x + 4) * 4, src, width); // + 16
-    src += 8;
-    decodeCMPRBlock(dst + ((y + 4) * width + x) * 4, src, width); // + 16*width
-    src += 8;
-    decodeCMPRBlock(dst + ((y + 4) * width + x + 4) * 4, src, width); // + 16*(width+1)
-
+    decodeCMPRBlock(dst, src, width);              src += 8;
+    decodeCMPRBlock(dst + 16, src, width);         src += 8;
+    decodeCMPRBlock(dst + 16 * width, src, width); src += 8;
+    decodeCMPRBlock(dst + 16 * (width + 1), src, width);
 }