Added OpenCL texture decoding to RGBA usable by DX11 for formats RGB565, RGBA8, RGB5A3 and CMPR
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5765 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
c2e32371f6
commit
2cc5b98f07
|
@ -99,6 +99,25 @@ kernel void DecodeRGBA8(global uchar *dst,
|
|||
}
|
||||
}
|
||||
|
||||
kernel void DecodeRGBA8_RGBA(global uchar *dst,
|
||||
const global uchar *src, int width)
|
||||
{
|
||||
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
||||
int srcOffset = (x * 2) + (y * width) / 2;
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
uchar8 ar = vload8(srcOffset, src);
|
||||
uchar8 gb = vload8(srcOffset + 4, src);
|
||||
uchar16 res;
|
||||
res.even.even = ar.odd;
|
||||
res.even.odd = gb.odd;
|
||||
res.odd.even = gb.even;
|
||||
res.odd.odd = ar.even;
|
||||
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
|
||||
srcOffset++;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void DecodeRGB565(global ushort *dst,
|
||||
const global uchar *src, int width)
|
||||
{
|
||||
|
@ -111,6 +130,25 @@ kernel void DecodeRGB565(global ushort *dst,
|
|||
}
|
||||
}
|
||||
|
||||
kernel void DecodeRGB565_RGBA(global uchar *dst,
|
||||
const global uchar *src, int width)
|
||||
{
|
||||
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
||||
int srcOffset = x + (y * width) / 4;
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
uchar8 val = vload8(srcOffset++, src);
|
||||
|
||||
uchar16 res;
|
||||
res.even.even = bitselect(val.even, val.even >> (uchar4)5, (uchar4)7);
|
||||
res.odd.even = bitselect((val.odd >> (uchar4)3) | (val.even << (uchar4)5), val.even >> (uchar4)1, (uchar4)3);
|
||||
res.even.odd = bitselect(val.odd << (uchar4)3, val.odd >> (uchar4)2, (uchar4)7);
|
||||
res.odd.odd = (uchar4)0xFF;
|
||||
|
||||
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void DecodeRGB5A3(global uchar *dst,
|
||||
const global uchar *src, int width)
|
||||
{
|
||||
|
@ -141,6 +179,36 @@ kernel void DecodeRGB5A3(global uchar *dst,
|
|||
iterateRGB5A3();
|
||||
}
|
||||
|
||||
kernel void DecodeRGB5A3_RGBA(global uchar *dst,
|
||||
const global uchar *src, int width)
|
||||
{
|
||||
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
||||
int srcOffset = x + (y * width) / 4;
|
||||
uchar8 val;
|
||||
uchar16 resNoAlpha, resAlpha, res, choice;
|
||||
#define iterateRGB5A3() \
|
||||
val = vload8(srcOffset++, src); \
|
||||
resNoAlpha.s048C = val.even << (uchar4)1; \
|
||||
resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \
|
||||
resNoAlpha.s26AE = val.odd << (uchar4)3; \
|
||||
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> (uchar16)5, (uchar16)0x3); \
|
||||
resNoAlpha.s37BF = (uchar4)(0xFF); \
|
||||
resAlpha.s048C = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \
|
||||
resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4, (uchar4)0xF); \
|
||||
resAlpha.s26AE = bitselect(val.odd << (uchar4)4, val.odd, (uchar4)0xF); \
|
||||
resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \
|
||||
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \
|
||||
choice = (uchar16)((uchar4)(val.even.s0), \
|
||||
(uchar4)(val.even.s1), \
|
||||
(uchar4)(val.even.s2), \
|
||||
(uchar4)(val.even.s3)); \
|
||||
vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4);
|
||||
iterateRGB5A3(); dst += width*4;
|
||||
iterateRGB5A3(); dst += width*4;
|
||||
iterateRGB5A3(); dst += width*4;
|
||||
iterateRGB5A3();
|
||||
}
|
||||
|
||||
uint16 unpack(uchar b)
|
||||
{
|
||||
return (uint16)((uint4)(b >> 3 & 0x18),
|
||||
|
@ -192,4 +260,49 @@ kernel void DecodeCMPR(global uchar *dst,
|
|||
decodeCMPRBlock(dst + 16, src, width); src += 8;
|
||||
decodeCMPRBlock(dst + 16 * width, src, width); src += 8;
|
||||
decodeCMPRBlock(dst + 16 * (width + 1), src, width);
|
||||
}
|
||||
|
||||
kernel void decodeCMPRBlock_RGBA(global uchar *dst,
|
||||
const global uchar *src, int width)
|
||||
{
|
||||
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
||||
uchar8 val = vload8(0, src);
|
||||
|
||||
uchar2 colora565 = (uchar2)(val.s1, val.s3);
|
||||
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
|
||||
uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7),
|
||||
bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3),
|
||||
bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7),
|
||||
(uchar2)0xFF);
|
||||
|
||||
ushort4 frac2 = convert_ushort4(color32.even & (uchar4)0xFF) - convert_ushort4(color32.odd & (uchar4)0xFF);
|
||||
uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8);
|
||||
|
||||
ushort4 colorAlpha = upsample((uchar4)0, rhadd(color32.odd, color32.even));
|
||||
colorAlpha.s3 = 0xFF;
|
||||
ushort4 colorNoAlpha = upsample(color32.odd + frac, color32.even - frac);
|
||||
|
||||
uint4 colors = upsample((upsample(val.s0,val.s1) > upsample(val.s2,val.s3))?colorNoAlpha:colorAlpha,
|
||||
upsample(color32.odd, color32.even));
|
||||
|
||||
uint16 colorsFull = (uint16)(colors, colors, colors, colors);
|
||||
|
||||
vstore16(convert_uchar16(colorsFull >> unpack(val.s4)), 0, dst);
|
||||
vstore16(convert_uchar16(colorsFull >> unpack(val.s5)), 0, dst+=width*4);
|
||||
vstore16(convert_uchar16(colorsFull >> unpack(val.s6)), 0, dst+=width*4);
|
||||
vstore16(convert_uchar16(colorsFull >> unpack(val.s7)), 0, dst+=width*4);
|
||||
}
|
||||
|
||||
kernel void DecodeCMPR_RGBA(global uchar *dst,
|
||||
const global uchar *src, int width)
|
||||
{
|
||||
int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
|
||||
|
||||
src += x * 4 + (y * width) / 2;
|
||||
dst += (y * width + x) * 4;
|
||||
|
||||
decodeCMPRBlock_RGBA(dst, src, width); src += 8;
|
||||
decodeCMPRBlock_RGBA(dst + 16, src, width); src += 8;
|
||||
decodeCMPRBlock_RGBA(dst + 16 * width, src, width); src += 8;
|
||||
decodeCMPRBlock_RGBA(dst + 16 * (width + 1), src, width);
|
||||
}
|
|
@ -163,7 +163,9 @@ cl_kernel CompileKernel(cl_program program, const char *Function)
|
|||
cl_kernel kernel = clCreateKernel(program, Function, &err);
|
||||
if (!kernel || err != CL_SUCCESS)
|
||||
{
|
||||
HandleCLError(err, "Failed to create compute kernel!");
|
||||
char buffer[1024];
|
||||
sprintf(buffer, "Failed to create compute kernel '%s' !", Function);
|
||||
HandleCLError(err, buffer);
|
||||
return NULL;
|
||||
}
|
||||
NOTICE_LOG(COMMON, "OpenCL CompileKernel took %.3f seconds", (float)(Common::Timer::GetTimeMs() - compileStart) / 1000.0);
|
||||
|
|
|
@ -102,11 +102,11 @@ void TexDecoder_OpenCL_Initialize() {
|
|||
g_program = OpenCL::CompileProgram(code.c_str());
|
||||
|
||||
int i = 0;
|
||||
for(int i = 0; i < GX_TF_CMPR; ++i) {
|
||||
for(int i = 0; i <= GX_TF_CMPR; ++i) {
|
||||
if(g_DecodeParametersNative[i].name)
|
||||
g_DecodeParametersNative[i].kernel = OpenCL::CompileKernel(g_program, g_DecodeParametersNative[i].name);
|
||||
|
||||
if(false && g_DecodeParametersRGBA[i].name)
|
||||
if(g_DecodeParametersRGBA[i].name)
|
||||
g_DecodeParametersRGBA[i].kernel = OpenCL::CompileKernel(g_program, g_DecodeParametersRGBA[i].name);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue