made texture conversion shaders use less instructions

this is mainly so they fit within ps_2_0 limits, but I made this separate from my last checkin to make it easier to test separately in case it somehow causes problems

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6619 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
nitsuja- 2010-12-19 21:05:41 +00:00
parent 98fe8437ae
commit 60d2d42525
1 changed files with 56 additions and 32 deletions

View File

@ -30,6 +30,7 @@
static char text[16384];
static bool IntensityConstantAdded = false;
static int s_incrementSampleXCount = 0;
namespace TextureConversionShader
{
@ -204,14 +205,25 @@ void Write32BitSwizzler(char*& p, u32 format, API_TYPE ApiType)
}
}
void WriteSampleColor(char*& p, const char* colorComp, const char* dest,API_TYPE ApiType)
void WriteSampleColor(char*& p, const char* colorComp, const char* dest, API_TYPE ApiType)
{
const char* texSampleOpName;
if (ApiType == API_D3D9)
WRITE(p, " %s = tex2D(samp0, sampleUv).%s;\n", dest, colorComp);
texSampleOpName = "tex2D";
else if (ApiType == API_D3D11)
WRITE(p, " %s = tex0.Sample(samp0, sampleUv).%s;\n", dest, colorComp);
texSampleOpName = "tex0.Sample";
else
WRITE(p, " %s = texRECT(samp0, sampleUv).%s;\n", dest, colorComp);
texSampleOpName = "texRECT";
// the increment of sampleUv.x is delayed, so we perform it here. see WriteIncrementSampleX.
const char* texSampleIncrementUnit;
if(ApiType != API_OPENGL)
texSampleIncrementUnit = "blkDims.x / blkDims.z";
else
texSampleIncrementUnit = "blkDims.x";
WRITE(p, " %s = %s(samp0, sampleUv + float2(%d * (%s), 0)).%s;\n",
dest, texSampleOpName, s_incrementSampleXCount, texSampleIncrementUnit, colorComp);
}
void WriteColorToIntensity(char*& p, const char* src, const char* dest)
@ -221,15 +233,27 @@ void WriteColorToIntensity(char*& p, const char* src, const char* dest)
WRITE(p, " float4 IntensityConst = float4(0.257f,0.504f,0.098f,0.0625f);\n");
IntensityConstantAdded = true;
}
WRITE(p, " %s = dot(IntensityConst.rgb, %s.rgb) + IntensityConst.a;\n", dest, src);
WRITE(p, " %s = dot(IntensityConst.rgb, %s.rgb);\n", dest, src);
// don't add IntensityConst.a yet, because doing it later is faster and uses less instructions, due to vectorization
}
void WriteIncrementSampleX(char*& p,API_TYPE ApiType)
{
if(ApiType != API_OPENGL)
WRITE(p, " sampleUv.x = sampleUv.x + blkDims.x / blkDims.z;\n");
else
WRITE(p, " sampleUv.x = sampleUv.x + blkDims.x;\n");
// the shader compiler apparently isn't smart or aggressive enough to recognize that:
// foo1 = lookup(x)
// x = x + increment;
// foo2 = lookup(x)
// x = x + increment;
// foo3 = lookup(x)
// can be replaced with this:
// foo1 = lookup(x + 0.0 * increment)
// foo2 = lookup(x + 1.0 * increment)
// foo3 = lookup(x + 2.0 * increment)
// which looks like the same operations but uses considerably fewer ALU instruction slots.
// thus, instead of using the former method, we only increment a counter internally here,
// and we wait until WriteSampleColor to write out the constant multiplier
// to achieve the increment as in the latter case.
s_incrementSampleXCount++;
}
void WriteToBitDepth(char*& p, u8 depth, const char* src, const char* dest)
@ -242,6 +266,7 @@ void WriteEncoderEnd(char* p)
{
WRITE(p, "}\n");
IntensityConstantAdded = false;
s_incrementSampleXCount = 0;
}
void WriteI8Encoder(char* p, API_TYPE ApiType)
@ -264,6 +289,8 @@ void WriteI8Encoder(char* p, API_TYPE ApiType)
WriteSampleColor(p, "rgb", "texSample",ApiType);
WriteColorToIntensity(p, "texSample", "ocol0.a");
WRITE(p, " ocol0.rgba += IntensityConst.aaaa;\n"); // see WriteColorToIntensity
WriteEncoderEnd(p);
}
@ -305,6 +332,9 @@ void WriteI4Encoder(char* p, API_TYPE ApiType)
WriteSampleColor(p, "rgb", "texSample",ApiType);
WriteColorToIntensity(p, "texSample", "color1.a");
WRITE(p, " color0.rgba += IntensityConst.aaaa;\n");
WRITE(p, " color1.rgba += IntensityConst.aaaa;\n");
WriteToBitDepth(p, 4, "color0", "color0");
WriteToBitDepth(p, 4, "color1", "color1");
@ -326,6 +356,8 @@ void WriteIA8Encoder(char* p,API_TYPE ApiType)
WRITE(p, " ocol0.r = texSample.a;\n");
WriteColorToIntensity(p, "texSample", "ocol0.a");
WRITE(p, " ocol0.ga += IntensityConst.aa;\n");
WriteEncoderEnd(p);
}
@ -355,6 +387,8 @@ void WriteIA4Encoder(char* p,API_TYPE ApiType)
WRITE(p, " color0.a = texSample.a;\n");
WriteColorToIntensity(p, "texSample", "color1.a");
WRITE(p, " color1.rgba += IntensityConst.aaaa;\n");
WriteToBitDepth(p, 4, "color0", "color0");
WriteToBitDepth(p, 4, "color1", "color1");
@ -366,32 +400,22 @@ void WriteRGB565Encoder(char* p,API_TYPE ApiType)
{
WriteSwizzler(p, GX_TF_RGB565,ApiType);
WRITE(p, " float3 texSample;\n");
WRITE(p, " float gInt;\n");
WRITE(p, " float gUpper;\n");
WRITE(p, " float gLower;\n");
WriteSampleColor(p, "rgb", "texSample",ApiType);
WriteToBitDepth(p, 6, "texSample.g", "gInt");
WRITE(p, " gUpper = floor(gInt / 8.0f);\n");
WRITE(p, " gLower = gInt - gUpper * 8.0f;\n");
WriteToBitDepth(p, 5, "texSample.r", "ocol0.b");
WRITE(p, " ocol0.b = ocol0.b * 8.0f + gUpper;\n");
WriteToBitDepth(p, 5, "texSample.b", "ocol0.g");
WRITE(p, " ocol0.g = ocol0.g + gLower * 32.0f;\n");
WriteSampleColor(p, "rgb", "float3 texSample0",ApiType);
WriteIncrementSampleX(p,ApiType);
WriteSampleColor(p, "rgb", "float3 texSample1",ApiType);
WriteSampleColor(p, "rgb", "texSample",ApiType);
WriteToBitDepth(p, 6, "texSample.g", "gInt");
WRITE(p, " gUpper = floor(gInt / 8.0f);\n");
WRITE(p, " gLower = gInt - gUpper * 8.0f;\n");
WRITE(p, " float2 texRs = {texSample0.r, texSample1.r};\n");
WRITE(p, " float2 texGs = {texSample0.g, texSample1.g};\n");
WRITE(p, " float2 texBs = {texSample0.b, texSample1.b};\n");
WriteToBitDepth(p, 6, "texGs", "float2 gInt");
WRITE(p, " float2 gUpper = floor(gInt / 8.0f);\n");
WRITE(p, " float2 gLower = gInt - gUpper * 8.0f;\n");
WriteToBitDepth(p, 5, "texSample.r", "ocol0.r");
WRITE(p, " ocol0.r = ocol0.r * 8.0f + gUpper;\n");
WriteToBitDepth(p, 5, "texSample.b", "ocol0.a");
WRITE(p, " ocol0.a = ocol0.a + gLower * 32.0f;\n");
WriteToBitDepth(p, 5, "texRs", "ocol0.br");
WRITE(p, " ocol0.br = ocol0.br * 8.0f + gUpper;\n");
WriteToBitDepth(p, 5, "texBs", "ocol0.ga");
WRITE(p, " ocol0.ga = ocol0.ga + gLower * 32.0f;\n");
WRITE(p, " ocol0 = ocol0 / 255.0f;\n");
WriteEncoderEnd(p);