- RGBA8 (DX9/OGL): 10x speed up on Radeon 5450, 2x speed up on other cards due to swizzle registers.
 - RGB565: 2-3x speed up on all hardware
 - Removed OpenCL compiler warnings (eg. redefine).
OpenCL is now optimally complete for DX9/OGL. The code is very fast on all supported hardware. No more updates are needed unless the spec changes or drivers improve. When I started, the OpenCL code was as slow or slower than CPU. Now, using the lowest end radeon that supports the code: a Radeon 45xx mobility, I experience a substantial 2-10x speedup over CPU. The benefits are more pronounced with modern hardware. A Radeon 5870 runs this code 20x faster than a 4550. Even ignoring speedups, the code benefits users by not using CPU for intermittent texture loads (unless GPU is your bottleneck). Instead, the CPU is able to do more important tasks.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5775 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
xsacha 2010-06-24 04:06:03 +00:00
parent e706ccc5d0
commit 2faae384b3
1 changed files with 109 additions and 114 deletions

View File

@ -49,112 +49,107 @@ kernel void DecodeI4_RGBA(global uint *dst,
}
}
kernel void DecodeI8(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
for (int iy = 0; iy < 4; iy++)
{
vstore8(vload8(srcOffset++, src),
0, dst + ((y + iy)*width + x));
}
kernel void DecodeI8(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
for (int iy = 0; iy < 4; iy++)
{
vstore8(vload8(srcOffset++, src),
0, dst + ((y + iy)*width + x));
}
}
kernel void DecodeI8_RGBA(global uint *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
for (int iy = 0; iy < 4; iy++)
{
uchar8 val = vload8(srcOffset++, src);
vstore8(upsample(upsample(val,val),upsample(val,val)),
0, dst + ((y + iy)*width + x));
}
kernel void DecodeI8_RGBA(global uint *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
for (int iy = 0; iy < 4; iy++)
{
uchar8 val = vload8(srcOffset++, src);
vstore8(upsample(upsample(val,val),upsample(val,val)),
0, dst + ((y + iy)*width + x));
}
}
kernel void DecodeIA8(global ushort *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 4;
for (int iy = 0; iy < 4; iy++)
{
uchar8 val = vload8(srcOffset++, src);
vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x));
}
}
kernel void DecodeIA8_RGBA(global uint *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 4;
for (int iy = 0; iy < 4; iy++)
{
uchar8 val = vload8(srcOffset++, src);
vstore4(upsample(upsample(val.even,val.odd),upsample(val.odd, val.odd)), 0, dst + ((y + iy)*width + x));
}
}
kernel void DecodeIA4(global ushort *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
uchar8 val;
ushort8 res;
for (int iy = 0; iy < 4; iy++)
{
val = vload8(srcOffset++, src);
res = upsample(val >> (uchar8)4, val & (uchar8)0xF);
res |= res << (ushort8)4;
vstore8(res, 0, dst + y*width + x);
dst+=width;
}
}
kernel void DecodeIA4_RGBA(global uint *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
uchar8 val;
uint8 res;
for (int iy = 0; iy < 4; iy++)
{
val = vload8(srcOffset++, src);
uchar8 a = val >> (uchar8)4;
uchar8 l = val & (uchar8)0xF;
res = upsample(upsample(a, l), upsample(l,l));
res |= res << (uint8)4;
vstore8(res, 0, dst + y*width + x);
dst+=width;
}
}
kernel void DecodeRGBA8(global uchar *dst,
kernel void DecodeIA8(global ushort *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 4;
for (int iy = 0; iy < 4; iy++)
{
uchar8 val = vload8(srcOffset++, src);
vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x));
}
}
kernel void DecodeIA8_RGBA(global uint *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 4;
for (int iy = 0; iy < 4; iy++)
{
uchar8 val = vload8(srcOffset++, src);
vstore4(upsample(upsample(val.even,val.odd),upsample(val.odd, val.odd)), 0, dst + ((y + iy)*width + x));
}
}
kernel void DecodeIA4(global ushort *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
uchar8 val;
ushort8 res;
for (int iy = 0; iy < 4; iy++)
{
val = vload8(srcOffset++, src);
res = upsample(val >> (uchar8)4, val & (uchar8)0xF);
res |= res << (ushort8)4;
vstore8(res, 0, dst + y*width + x);
dst+=width;
}
}
kernel void DecodeIA4_RGBA(global uint *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
uchar8 val;
uint8 res;
for (int iy = 0; iy < 4; iy++)
{
val = vload8(srcOffset++, src);
uchar8 a = val >> (uchar8)4;
uchar8 l = val & (uchar8)0xF;
res = upsample(upsample(a, l), upsample(l,l));
res |= res << (uint8)4;
vstore8(res, 0, dst + y*width + x);
dst+=width;
}
}
kernel void DecodeRGBA8(global ushort *dst,
const global ushort *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = (x * 2) + (y * width) / 2;
for (int iy = 0; iy < 4; iy++)
{
uchar8 ar = vload8(srcOffset, src);
uchar8 gb = vload8(srcOffset + 4, src);
uchar16 res;
res.even.even = gb.odd;
res.even.odd = ar.odd;
res.odd.even = gb.even;
res.odd.odd = ar.even;
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
ushort8 val = (ushort8)(vload4(srcOffset, src), vload4(srcOffset + 4, src));
ushort8 bgra = rotate(val,(ushort8)8).s40516273;
vstore8(bgra, 0, dst + ((y + iy)*width + x) * 2);
srcOffset++;
}
}
kernel void DecodeRGBA8_RGBA(global uchar *dst,
const global uchar *src, int width)
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = (x * 2) + (y * width) / 2;
@ -173,19 +168,19 @@ kernel void DecodeRGBA8_RGBA(global uchar *dst,
}
kernel void DecodeRGB565(global ushort *dst,
const global uchar *src, int width)
const global ushort *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = x + (y * width) / 4;
dst += width*y + x;
for (int iy = 0; iy < 4; iy++)
{
uchar8 val = vload8(srcOffset++, src);
vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x));
vstore4(rotate(vload4(srcOffset++, src),(ushort4)8), 0, dst + iy*width);
}
}
kernel void DecodeRGB565_RGBA(global uchar *dst,
const global uchar *src, int width)
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = x + (y * width) / 4;
@ -194,9 +189,9 @@ kernel void DecodeRGB565_RGBA(global uchar *dst,
uchar8 val = vload8(srcOffset++, src);
uchar16 res;
res.even.even = bitselect(val.even, val.even >> (uchar4)5, (uchar4)7);
res.odd.even = bitselect((val.odd >> (uchar4)3) | (val.even << (uchar4)5), val.even >> (uchar4)1, (uchar4)3);
res.even.odd = bitselect(val.odd << (uchar4)3, val.odd >> (uchar4)2, (uchar4)7);
res.even.even = bitselect(val.even, val.even >> (uchar4)5, (uchar4)7);
res.odd.even = bitselect((val.odd >> (uchar4)3) | (val.even << (uchar4)5), val.even >> (uchar4)1, (uchar4)3);
res.even.odd = bitselect(val.odd << (uchar4)3, val.odd >> (uchar4)2, (uchar4)7);
res.odd.odd = (uchar4)0xFF;
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
@ -209,7 +204,7 @@ kernel void DecodeRGB5A3(global uchar *dst,
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = x + (y * width) / 4;
uchar8 val;
uchar16 resNoAlpha, resAlpha, res, choice;
uchar16 resNoAlpha, resAlpha, choice;
#define iterateRGB5A3() \
val = vload8(srcOffset++, src); \
resNoAlpha.s26AE = val.even << (uchar4)1; \
@ -234,13 +229,13 @@ kernel void DecodeRGB5A3(global uchar *dst,
}
kernel void DecodeRGB5A3_RGBA(global uchar *dst,
const global uchar *src, int width)
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = x + (y * width) / 4;
uchar8 val;
uchar16 resNoAlpha, resAlpha, res, choice;
#define iterateRGB5A3() \
uchar16 resNoAlpha, resAlpha, choice;
#define iterateRGB5A3_RGBA() \
val = vload8(srcOffset++, src); \
resNoAlpha.s048C = val.even << (uchar4)1; \
resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \
@ -257,10 +252,10 @@ kernel void DecodeRGB5A3_RGBA(global uchar *dst,
(uchar4)(val.even.s2), \
(uchar4)(val.even.s3)); \
vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4);
iterateRGB5A3(); dst += width*4;
iterateRGB5A3(); dst += width*4;
iterateRGB5A3(); dst += width*4;
iterateRGB5A3();
iterateRGB5A3_RGBA(); dst += width*4;
iterateRGB5A3_RGBA(); dst += width*4;
iterateRGB5A3_RGBA(); dst += width*4;
iterateRGB5A3_RGBA();
}
uint16 unpack(uchar b)
@ -279,9 +274,9 @@ kernel void decodeCMPRBlock(global uchar *dst,
uchar2 colora565 = (uchar2)(val.s1, val.s3);
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7),
bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3),
bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7),
uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7),
bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3),
bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7),
(uchar2)0xFF);
ushort4 frac2 = convert_ushort4(color32.even & (uchar4)0xFF) - convert_ushort4(color32.odd & (uchar4)0xFF);
@ -317,16 +312,16 @@ kernel void DecodeCMPR(global uchar *dst,
}
kernel void decodeCMPRBlock_RGBA(global uchar *dst,
const global uchar *src, int width)
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
uchar8 val = vload8(0, src);
uchar2 colora565 = (uchar2)(val.s1, val.s3);
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7),
bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3),
bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7),
uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7),
bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3),
bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7),
(uchar2)0xFF);
ushort4 frac2 = convert_ushort4(color32.even & (uchar4)0xFF) - convert_ushort4(color32.odd & (uchar4)0xFF);
@ -348,7 +343,7 @@ kernel void decodeCMPRBlock_RGBA(global uchar *dst,
}
kernel void DecodeCMPR_RGBA(global uchar *dst,
const global uchar *src, int width)
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 8;