From 1b670a982594672a6590b45728f2a1d59e6d7959 Mon Sep 17 00:00:00 2001 From: xsacha Date: Tue, 22 Jun 2010 06:09:21 +0000 Subject: [PATCH] New OpenCL update for DX11 Changes: - IA4: 2x Speed up for all hardware and ATI glitch fixed (blocky text) - IA8: 2x Speed up for all hardware - New DX11 OCL Textures: I4, I8, IA4, IA8 git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5766 8ced0084-cf51-0410-be5f-012b33b47a6e --- Data/User/OpenCL/TextureDecoder.cl | 130 ++++++++++++++++++++--------- 1 file changed, 92 insertions(+), 38 deletions(-) diff --git a/Data/User/OpenCL/TextureDecoder.cl b/Data/User/OpenCL/TextureDecoder.cl index 89aea3818e..06bf7d5df3 100644 --- a/Data/User/OpenCL/TextureDecoder.cl +++ b/Data/User/OpenCL/TextureDecoder.cl @@ -32,52 +32,106 @@ kernel void DecodeI4(global uchar *dst, } } -kernel void DecodeI8(global uchar *dst, - const global uchar *src, int width) +kernel void DecodeI4_RGBA(global uint *dst, + const global uchar *src, int width) { - int x = get_global_id(0) * 8, y = get_global_id(1) * 4; - int srcOffset = ((x * 4) + (y * width)) / 8; - for (int iy = 0; iy < 4; iy++) + int x = get_global_id(0) * 8, y = get_global_id(1) * 8; + int srcOffset = x + y * width / 8; + for (int iy = 0; iy < 8; iy++) { - vstore8(vload8(srcOffset, src), - 0, dst + ((y + iy)*width + x)); + uchar4 val = vload4(srcOffset, src); + uchar8 res; + res.even = (val >> (uchar4)4) & (uchar4)0x0F; + res.odd = val & (uchar4)0x0F; + res |= res << (uchar8)4; + vstore8(upsample(upsample(res,res),upsample(res,res)), 0, dst + ((y + iy)*width + x)); srcOffset++; } } -kernel void DecodeIA8(global uchar *dst, - const global uchar *src, int width) -{ - int x = get_global_id(0) * 4, y = get_global_id(1) * 4; - int srcOffset = ((x * 4) + (y * width)) / 4; - for (int iy = 0; iy < 4; iy++) - { - uchar8 val = vload8(srcOffset++, src); - uchar8 res; - res.odd = val.even; - res.even = val.odd; - vstore8(res, 0, dst + ((y + iy)*width + x) * 2); - } +kernel void DecodeI8(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + for (int iy = 0; iy < 4; iy++) + { + vstore8(vload8(srcOffset++, src), + 0, dst + ((y + iy)*width + x)); + } } -kernel void DecodeIA4(global uchar *dst, - const global uchar *src, int width) -{ - int x = get_global_id(0) * 8, y = get_global_id(1) * 4; - int srcOffset = ((x * 4) + (y * width)) / 8; - uchar8 val; - uchar16 res; - dst += 2*(y*width + x); - #define iterateIA4() \ - val = vload8(srcOffset++, src); \ - res.odd = (val >> (uchar8)4); \ - res.even = val & (uchar8)0x0F; \ - res |= res << (uchar16)4; \ - vstore16(res, 0, dst); - iterateIA4(); dst += 2*width; - iterateIA4(); dst += 2*width; - iterateIA4(); dst += 2*width; - iterateIA4(); +kernel void DecodeI8_RGBA(global uint *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + for (int iy = 0; iy < 4; iy++) + { + uchar8 val = vload8(srcOffset++, src); + vstore8(upsample(upsample(val,val),upsample(val,val)), + 0, dst + ((y + iy)*width + x)); + } +} + +kernel void DecodeIA8(global ushort *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 4; + for (int iy = 0; iy < 4; iy++) + { + uchar8 val = vload8(srcOffset++, src); + vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x)); + } +} + +kernel void DecodeIA8_RGBA(global uint *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 4; + for (int iy = 0; iy < 4; iy++) + { + uchar8 val = vload8(srcOffset++, src); + vstore4(upsample(upsample(val.even,val.odd),upsample(val.odd, val.odd)), 0, dst + ((y + iy)*width + x)); + } +} + +kernel void DecodeIA4(global ushort *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + uchar8 val; + ushort8 res; + for (int iy = 0; iy < 4; iy++) + { + val = vload8(srcOffset++, src); + res = upsample(val >> (uchar8)4, val & (uchar8)0xF); + res |= res << (ushort8)4; + vstore8(res, 0, dst + y*width + x); + dst+=width; + } +} + +kernel void DecodeIA4_RGBA(global uint *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + uchar8 val; + uint8 res; + for (int iy = 0; iy < 4; iy++) + { + val = vload8(srcOffset++, src); + uchar8 a = val >> (uchar8)4; + uchar8 l = val & (uchar8)0xF; + res = upsample(upsample(a, l), upsample(l,l)); + res |= res << (uint8)4; + vstore8(res, 0, dst + y*width + x); + dst+=width; + } } kernel void DecodeRGBA8(global uchar *dst,