apply a patch done by xsacha :) + some minor stuff :p

what changes do: cmpr decoding is fixed and rgb5a3 is re-enabled

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5744 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
luisr142004 2010-06-19 07:59:53 +00:00
parent d7bbddcbd3
commit ab1e5f8537
2 changed files with 73 additions and 84 deletions

View File

@ -52,12 +52,11 @@ kernel void DecodeIA8(global uchar *dst,
int srcOffset = ((x * 4) + (y * width)) / 4; int srcOffset = ((x * 4) + (y * width)) / 4;
for (int iy = 0; iy < 4; iy++) for (int iy = 0; iy < 4; iy++)
{ {
uchar8 val = vload8(srcOffset, src); uchar8 val = vload8(srcOffset++, src);
uchar8 res; uchar8 res;
res.odd = val.even; res.odd = val.even;
res.even = val.odd; res.even = val.odd;
vstore8(res, 0, dst + ((y + iy)*width + x) * 2); vstore8(res, 0, dst + ((y + iy)*width + x) * 2);
srcOffset++;
} }
} }
@ -68,13 +67,12 @@ kernel void DecodeIA4(global uchar *dst,
int srcOffset = ((x * 4) + (y * width)) / 8; int srcOffset = ((x * 4) + (y * width)) / 8;
for (int iy = 0; iy < 4; iy++) for (int iy = 0; iy < 4; iy++)
{ {
uchar8 val = vload8(srcOffset, src); uchar8 val = vload8(srcOffset++, src);
uchar16 res; uchar16 res;
res.odd = (val >> (uchar8)4) & (uchar8)0x0F; res.odd = (val >> (uchar8)4);
res.even = val & (uchar8)0x0F; res.even = val & (uchar8)0x0F;
res |= res << (uchar16)4; res |= res << (uchar16)4;
vstore16(res, 0, dst + ((y + iy)*width + x) * 2); vstore16(res, 0, dst + ((y + iy)*width + x) * 2);
srcOffset++;
} }
} }
@ -98,16 +96,14 @@ kernel void DecodeRGBA8(global uchar *dst,
} }
kernel void DecodeRGB565(global ushort *dst, kernel void DecodeRGB565(global ushort *dst,
const global ushort *src, int width) const global uchar *src, int width)
{ {
int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = x + (y * width) / 4; int srcOffset = x + (y * width) / 4;
for (int iy = 0; iy < 4; iy++) for (int iy = 0; iy < 4; iy++)
{ {
ushort4 val = vload4(srcOffset, src); uchar8 val = vload8(srcOffset++, src);
val = (val >> (ushort4)8) | (val << (ushort4)8); vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x));
vstore4(val, 0, dst + ((y + iy)*width + x));
srcOffset++;
} }
} }
@ -118,59 +114,60 @@ kernel void DecodeRGB5A3(global uchar *dst,
int srcOffset = x + (y * width) / 4; int srcOffset = x + (y * width) / 4;
for (int iy = 0; iy < 4; iy++) for (int iy = 0; iy < 4; iy++)
{ {
ushort8 val = convert_ushort8(vload8(srcOffset, src)); uchar8 val = vload8(srcOffset++, src);
ushort4 vs = val.odd | (ushort4)(val.even << (ushort4)8); ushort4 vs = upsample(val.even, val.odd);
uchar16 resNoAlpha; uchar16 resNoAlpha;
resNoAlpha.s26AE = convert_uchar4(vs >> (ushort4)7); // R resNoAlpha.s26AE = (uchar4)(vs >> (ushort4)7); // R
resNoAlpha.s159D = convert_uchar4(vs >> (ushort4)2); // G resNoAlpha.s159D = (uchar4)(vs >> (ushort4)2); // G
resNoAlpha.s048C = convert_uchar4(vs << (ushort4)3); // B resNoAlpha.s048C = (uchar4)(vs << (ushort4)3); // B
resNoAlpha &= (uchar16)0xF8; resNoAlpha &= (uchar16)0xF8;
resNoAlpha |= (uchar16)(resNoAlpha >> (uchar16)5) & (uchar16)3; // 5 -> 8 resNoAlpha |= (uchar16)(resNoAlpha >> (uchar16)5); // 5 -> 8
resNoAlpha.s37BF = (uchar4)(0xFF); resNoAlpha.s37BF = (uchar4)(0xFF);
uchar16 resAlpha; uchar16 resAlpha;
resAlpha.s26AE = convert_uchar4(vs >> (ushort4)8); // R resAlpha.s26AE = val.even; // R
resAlpha.s159D = convert_uchar4(vs >> (ushort4)4); // G resAlpha.s159D = val.odd >> (uchar4)4; // G
resAlpha.s048C = convert_uchar4(vs); // B resAlpha.s048C = val.odd; // B
resAlpha &= (uchar16)0x0F; resAlpha &= (uchar16)0x0F;
resAlpha |= (resAlpha << (uchar16)4); resAlpha |= (resAlpha << (uchar16)4);
resAlpha.s37BF = convert_uchar4(vs >> (ushort4)7) & (uchar4)0xE0; resAlpha.s37BF = convert_uchar4(vs >> (ushort4)7) & (uchar4)0xE0;
resAlpha.s37BF |= ((resAlpha.s37BF >> (uchar4)3) & (uchar4)0x1C) resAlpha.s37BF |= ((resAlpha.s37BF >> (uchar4)3) & (uchar4)0x1C)
| ((resAlpha.s37BF >> (uchar4)6) & (uchar4)0x3); | ((resAlpha.s37BF >> (uchar4)6) & (uchar4)0x3);
uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8), uchar16 choice = (uchar16)((uchar4)(val.even.s0),
(uchar4)(vs.s1 >> 8), (uchar4)(val.even.s1),
(uchar4)(vs.s2 >> 8), (uchar4)(val.even.s2),
(uchar4)(vs.s3 >> 8)); (uchar4)(val.even.s3));
uchar16 res; uchar16 res;
res = select(resAlpha, resNoAlpha, choice); res = select(resAlpha, resNoAlpha, choice);
vstore16(res, 0, dst + ((y + iy) * width + x) * 4); vstore16(res, 0, dst + ((y + iy) * width + x) * 4);
srcOffset++;
} }
} }
uint4 unpack2bits(uchar b) uint16 unpack(uchar b)
{ {
return (uint4)(b >> 6, return (uint16)((uint4)(b >> 6),
(b >> 4) & 3, (uint4)(b >> 4 & 3),
(b >> 2) & 3, (uint4)(b >> 2 & 3),
b & 3); (uint4)(b & 3));
} }
kernel void decodeCMPRBlock(global uchar *dst, kernel void decodeCMPRBlock(global uchar *dst,
const global uchar *src, int width) const global uchar *src, int width)
{ {
int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
uchar8 val = vload8(0, src); uchar8 val = vload8(0, src);
ushort2 color565 = (ushort2)((val.s1 & 0xFF) | (val.s0 << 8), (val.s3 & 0xFF) | (val.s2 << 8));
uchar8 color32 = convert_uchar8((ushort8) uchar2 colora565 = (uchar2)(val.s1, val.s3);
(((color565 << (ushort2)3) & (ushort2)0xF8) | ((color565 >> (ushort2)2) & (ushort2)0x7), uchar2 colorb565 = (uchar2)(val.s0, val.s2);
((color565 >> (ushort2)3) & (ushort2)0xFC) | ((color565 >> (ushort2)9) & (ushort2)0x3), uchar8 color32 = (uchar8)((colora565 << (uchar2)3) | (colora565 >> (uchar2)2 & (uchar2)7),
((color565 >> (ushort2)8) & (ushort2)0xF8) | ((color565 >> (ushort2)13) & (ushort2)0x7), (colora565 >> (uchar2)3) | (colorb565 << (uchar2)5) | (colorb565 >> (uchar2)1 & (uchar2)3),
0xFF, 0xFF)); (colorb565 & (uchar2)0xF8) | (colorb565 >> (uchar2)5 & (uchar2)7),
(uchar2)0xFF);
uint4 colors; uint4 colors;
uint4 colorNoAlpha; uint4 colorNoAlpha;
uchar4 frac = convert_uchar4((((convert_ushort4(color32.even) & (ushort4)0xFF) - (convert_ushort4(color32.odd) & (ushort4)0xFF)) * (ushort4)3) / (ushort4)8); ushort4 frac2 = (ushort4)(color32.even & (uchar4)0xFF) - (ushort4)(color32.odd & (uchar4)0xFF);
uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8);
colorNoAlpha = convert_uint4(color32.odd + frac); colorNoAlpha = convert_uint4(color32.odd + frac);
colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even - frac); colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even - frac);
colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.odd); colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.odd);
@ -179,31 +176,27 @@ kernel void decodeCMPRBlock(global uchar *dst,
uint4 colorAlpha; uint4 colorAlpha;
uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + (ushort4)1) / (ushort4)2); uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + (ushort4)1) / (ushort4)2);
midpoint.s3 = 0xFF; midpoint.s3 = 0xFF;
colorAlpha = convert_uint4((uchar4)(0, 0, 0, 0)); colorAlpha = convert_uint4(midpoint);
colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(midpoint);
colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.odd); colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.odd);
colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.even); colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.even);
colors = color565.s0 > color565.s1 ? colorNoAlpha : colorAlpha; uint4 choice = isgreater(upsample(val.s0,val.s1),upsample(val.s2, val.s3));
colors = select(colorNoAlpha, colorAlpha, choice);
uint16 colorsFull = (uint16)(colors, colors, colors, colors); uint16 colorsFull = (uint16)(colors, colors, colors, colors);
uint4 shift0 = unpack2bits(val.s4); uint16 shifts = (((unpack(val.s7) << (uint16)8
uint4 shift1 = unpack2bits(val.s5); | unpack(val.s6)) << (uint16)8
uint4 shift2 = unpack2bits(val.s6); | unpack(val.s5)) << (uint16)8
uint4 shift3 = unpack2bits(val.s7); | unpack(val.s4)) << (uint16)3;
uint16 shifts = (uint16)((uint4)(shift3.s0), (uint4)(shift3.s1), (uint4)(shift3.s2), (uint4)(shift3.s3));
shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift2.s0), (uint4)(shift2.s1), (uint4)(shift2.s2), (uint4)(shift2.s3)); vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4);
shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift1.s0), (uint4)(shift1.s1), (uint4)(shift1.s2), (uint4)(shift1.s3)); shifts = shifts >> (uint16)8;
shifts = (shifts << (uint16)8) | (uint16)((uint4)(shift0.s0), (uint4)(shift0.s1), (uint4)(shift0.s2), (uint4)(shift0.s3)) << (uint16)3; vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4);
shifts = shifts >> (uint16)8;
for (int iy = 0; iy < 4; iy++) vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4);
{ shifts = shifts >> (uint16)8;
uchar16 res; vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4);
res = convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)) >> (uchar16)8;
vstore16(res, 0, dst);
dst += width * 4;
}
} }
kernel void DecodeCMPR(global uchar *dst, kernel void DecodeCMPR(global uchar *dst,
@ -215,10 +208,10 @@ kernel void DecodeCMPR(global uchar *dst,
decodeCMPRBlock(dst + (y * width + x) * 4, src, width); decodeCMPRBlock(dst + (y * width + x) * 4, src, width);
src += 8; src += 8;
decodeCMPRBlock(dst + (y * width + x + 4) * 4, src, width); decodeCMPRBlock(dst + (y * width + x + 4) * 4, src, width); // + 16
src += 8; src += 8;
decodeCMPRBlock(dst + ((y + 4) * width + x) * 4, src, width); decodeCMPRBlock(dst + ((y + 4) * width + x) * 4, src, width); // + 16*width
src += 8; src += 8;
decodeCMPRBlock(dst + ((y + 4) * width + x + 4) * 4, src, width); decodeCMPRBlock(dst + ((y + 4) * width + x + 4) * 4, src, width); // + 16*(width+1)
} }

View File

@ -33,8 +33,8 @@
struct sDecoders struct sDecoders
{ {
const char name[256]; // kernel name const char name[256]; // kernel name
cl_kernel kernel; // compute kernel cl_kernel kernel; // compute kernel
}; };
cl_program g_program; cl_program g_program;
@ -57,17 +57,17 @@ cl_mem g_clsrc, g_cldst; // texture buffer memory objects
void TexDecoder_OpenCL_Initialize() { void TexDecoder_OpenCL_Initialize() {
#if defined(HAVE_OPENCL) && HAVE_OPENCL #if defined(HAVE_OPENCL) && HAVE_OPENCL
if(!g_Inited) if(!g_Inited)
{ {
if(!OpenCL::Initialize()) if(!OpenCL::Initialize())
return; return;
std::string code; std::string code;
char filename[1024]; char filename[1024];
sprintf(filename, "%sOpenCL/TextureDecoder.cl", File::GetUserPath(D_USER_IDX)); sprintf(filename, "%sOpenCL/TextureDecoder.cl", File::GetUserPath(D_USER_IDX));
if (!File::ReadFileToString(true, filename, code)) if (!File::ReadFileToString(true, filename, code))
{ {
ERROR_LOG(VIDEO, "Failed to load OpenCL code %s - file is missing?", filename); ERROR_LOG(VIDEO, "Failed to load OpenCL code %s - file is missing?", filename);
return; return;
} }
g_program = OpenCL::CompileProgram(code.c_str()); g_program = OpenCL::CompileProgram(code.c_str());
@ -94,7 +94,7 @@ void TexDecoder_OpenCL_Shutdown() {
clReleaseProgram(g_program); clReleaseProgram(g_program);
int i = 0; int i = 0;
while(strlen(Decoders[i].name) > 0) while(strlen(Decoders[i].name) > 0)
{ {
clReleaseKernel(Decoders[i].kernel); clReleaseKernel(Decoders[i].kernel);
i++; i++;
@ -113,21 +113,21 @@ void TexDecoder_OpenCL_Shutdown() {
PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
{ {
#if defined(HAVE_OPENCL) && HAVE_OPENCL #if defined(HAVE_OPENCL) && HAVE_OPENCL
cl_int err; cl_int err;
cl_kernel kernelToRun = Decoders[0].kernel; cl_kernel kernelToRun = Decoders[0].kernel;
float sizeOfDst = sizeof(u8), sizeOfSrc = sizeof(u8), xSkip, ySkip; float sizeOfDst = sizeof(u8), sizeOfSrc = sizeof(u8), xSkip, ySkip;
PC_TexFormat formatResult; PC_TexFormat formatResult;
switch(texformat) switch(texformat)
{ {
case GX_TF_I4: case GX_TF_I4:
kernelToRun = Decoders[0].kernel; kernelToRun = Decoders[0].kernel;
sizeOfSrc = sizeof(u8) / 2.0f; sizeOfSrc = sizeof(u8) / 2.0f;
sizeOfDst = sizeof(u8); sizeOfDst = sizeof(u8);
xSkip = 8; xSkip = 8;
ySkip = 8; ySkip = 8;
formatResult = PC_TEX_FMT_I4_AS_I8; formatResult = PC_TEX_FMT_I4_AS_I8;
break; break;
case GX_TF_I8: case GX_TF_I8:
kernelToRun = Decoders[1].kernel; kernelToRun = Decoders[1].kernel;
sizeOfSrc = sizeOfDst = sizeof(u8); sizeOfSrc = sizeOfDst = sizeof(u8);
@ -162,25 +162,21 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
sizeOfSrc = sizeOfDst = sizeof(u16); sizeOfSrc = sizeOfDst = sizeof(u16);
xSkip = 4; xSkip = 4;
ySkip = 4; ySkip = 4;
formatResult = PC_TEX_FMT_RGB565; formatResult = PC_TEX_FMT_RGB565;
break; break;
case GX_TF_RGB5A3: case GX_TF_RGB5A3:
// Doesn't decode correctly // Reported issues with Sonic Adventure 2: Battle opening sequence?
// See Sonic Adventure 2: Battle opening sequence
return PC_TEX_FMT_NONE;
kernelToRun = Decoders[6].kernel; kernelToRun = Decoders[6].kernel;
sizeOfSrc = sizeof(u16); sizeOfSrc = sizeof(u16);
sizeOfDst = sizeof(u32); sizeOfDst = sizeof(u32);
xSkip = 4; xSkip = 4;
ySkip = 4; ySkip = 4;
formatResult = PC_TEX_FMT_BGRA32; formatResult = PC_TEX_FMT_BGRA32;
break; break;
case GX_TF_CMPR: case GX_TF_CMPR:
// Doesn't decode correctly
return PC_TEX_FMT_NONE;
kernelToRun = Decoders[7].kernel; kernelToRun = Decoders[7].kernel;
sizeOfSrc = sizeof(u8) / 2.0f; sizeOfSrc = sizeof(u8) / 2.0f;
sizeOfDst = sizeof(u32); sizeOfDst = sizeof(u32);
xSkip = 8; xSkip = 8;
ySkip = 8; ySkip = 8;
formatResult = PC_TEX_FMT_BGRA32; formatResult = PC_TEX_FMT_BGRA32;
@ -212,7 +208,7 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
err = clEnqueueNDRangeKernel(OpenCL::GetCommandQueue(), kernelToRun, 2, NULL, global, NULL, 0, NULL, NULL); err = clEnqueueNDRangeKernel(OpenCL::GetCommandQueue(), kernelToRun, 2, NULL, global, NULL, 0, NULL, NULL);
if(err) if(err)
OpenCL::HandleCLError(err, "Failed to enqueue kernel"); OpenCL::HandleCLError(err, "Failed to enqueue kernel");
clFinish(OpenCL::GetCommandQueue()); clFinish(OpenCL::GetCommandQueue());
@ -228,6 +224,6 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
return PC_TEX_FMT_NONE; return PC_TEX_FMT_NONE;
#endif #endif
return PC_TEX_FMT_NONE; return PC_TEX_FMT_NONE;
} }