DX11: Improve efb-encode shader some more.

According to AMD's GPU ShaderAnalyzer, most combinations of shaders have about 1.5x-2x higher peak per-clock throughput after this commit. For those concerned about performance, I do intend to make this at least as fast as the other backends. This is one more step toward that goal.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@7262 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
Nolan Check 2011-02-27 20:07:59 +00:00
parent ccc12c6950
commit e926b28480
1 changed file with 171 additions and 165 deletions

View File

@ -65,7 +65,9 @@ union EFBEncodeParams
static const char EFB_ENCODE_VS[] =
"// dolphin-emu EFB encoder vertex shader\n"
"uniform struct\n" // Should match EFBEncodeParams above
"cbuffer cbParams : register(b0)\n"
"{\n"
"struct\n" // Should match EFBEncodeParams above
"{\n"
"uint NumHalfCacheLinesX;\n"
"uint NumBlocksY;\n"
@ -75,7 +77,8 @@ static const char EFB_ENCODE_VS[] =
"float TexTop;\n"
"float TexRight;\n"
"float TexBottom;\n"
"} Params : register(c0);\n"
"} Params;\n"
"}\n"
"struct Output\n"
"{\n"
@ -97,7 +100,9 @@ static const char EFB_ENCODE_PS[] =
// Input
"uniform struct\n" // Should match EFBEncodeParams above
"cbuffer cbParams : register(b0)\n"
"{\n"
"struct\n" // Should match EFBEncodeParams above
"{\n"
"uint NumHalfCacheLinesX;\n"
"uint NumBlocksY;\n"
@ -107,7 +112,8 @@ static const char EFB_ENCODE_PS[] =
"float TexTop;\n"
"float TexRight;\n"
"float TexBottom;\n"
"} Params : register(c0);\n"
"} Params;\n"
"}\n"
"Texture2D EFBTexture : register(t0);\n"
"sampler EFBSampler : register(s0);\n"
@ -122,26 +128,12 @@ static const char EFB_ENCODE_PS[] =
// Utility functions
"uint ExtractA(uint pixel) { return pixel >> 24; }\n"
"uint ExtractA1(uint pixel) { return ExtractA(pixel) >> 7; }\n"
"uint ExtractA3(uint pixel) { return ExtractA(pixel) >> 5; }\n"
"uint ExtractR(uint pixel) { return (pixel >> 16) & 0xFF; }\n"
"uint ExtractR4(uint pixel) { return ExtractR(pixel) >> 4; }\n"
"uint ExtractR5(uint pixel) { return ExtractR(pixel) >> 3; }\n"
"uint ExtractG(uint pixel) { return (pixel >> 8) & 0xFF; }\n"
"uint ExtractG4(uint pixel) { return ExtractG(pixel) >> 4; }\n"
"uint ExtractG5(uint pixel) { return ExtractG(pixel) >> 3; }\n"
"uint ExtractG6(uint pixel) { return ExtractG(pixel) >> 2; }\n"
"uint ExtractB(uint pixel) { return pixel & 0xFF; }\n"
"uint ExtractB4(uint pixel) { return ExtractB(pixel) >> 4; }\n"
"uint ExtractB5(uint pixel) { return ExtractB(pixel) >> 3; }\n"
"uint4 Swap4_32(uint4 v) {\n"
"return (((v >> 24) & 0xFF) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | ((v << 24) & 0xFF000000));\n"
"}\n"
"uint UINT_8888(uint a, uint b, uint c, uint d) {\n"
"return (a << 24) | (b << 16) | (c << 8) | d;\n"
"uint4 UINT4_8888_BE(uint4 a, uint4 b, uint4 c, uint4 d) {\n"
"return (d << 24) | (c << 16) | (b << 8) | a;\n"
"}\n"
"uint UINT_44444444(uint a, uint b, uint c, uint d, uint e, uint f, uint g, uint h) {\n"
@ -164,18 +156,18 @@ static const char EFB_ENCODE_PS[] =
"return (a << 16) | b;\n"
"}\n"
"uint EncodeRGB5A3(uint pixel) {\n"
"if (ExtractA(pixel) >= 224) {\n"
"uint EncodeRGB5A3(uint4 pixel) {\n"
"if (pixel.a >= 224) {\n"
// Encode to ARGB1555
"return UINT_1555(ExtractA1(pixel), ExtractR5(pixel), ExtractG5(pixel), ExtractB5(pixel));\n"
"return UINT_1555(1, pixel.r >> 3, pixel.g >> 3, pixel.b >> 3);\n"
"} else {\n"
// Encode to ARGB3444
"return UINT_3444(ExtractA3(pixel), ExtractR4(pixel), ExtractG4(pixel), ExtractB4(pixel));\n"
"return UINT_3444(pixel.a >> 5, pixel.r >> 4, pixel.g >> 4, pixel.b >> 4);\n"
"}\n"
"}\n"
"uint EncodeRGB565(uint pixel) {\n"
"return UINT_565(ExtractR5(pixel), ExtractG6(pixel), ExtractB5(pixel));\n"
"uint EncodeRGB565(uint4 pixel) {\n"
"return UINT_565(pixel.r >> 3, pixel.g >> 2, pixel.b >> 3);\n"
"}\n"
"float2 CalcTexCoord(uint2 coord)\n"
@ -379,14 +371,13 @@ static const char EFB_ENCODE_PS[] =
// Main EFB-sampling function: performs all steps of fetching pixels, scaling,
// and applying the intensity function.
"uint SampleEFB(uint2 coord)\n"
"uint4 SampleEFB(uint2 coord)\n"
"{\n"
// FIXME: Does intensity happen before or after scaling? Or does
// it matter?
"float4 sample = IMP_SCALEDFETCH(coord);\n"
"sample = IMP_INTENSITY(sample);\n"
"float4 byteSample = 255.0 * sample;\n"
"return UINT_8888(byteSample.a, byteSample.r, byteSample.g, byteSample.b);\n"
"return uint4(255.0 * sample);\n"
"}\n"
// Interfaces and classes for different destination formats
@ -398,7 +389,7 @@ static const char EFB_ENCODE_PS[] =
"uint2 blockUL = blockCoord * uint2(8,8);\n"
"uint2 subBlockUL = blockUL + uint2(0, 4*(cacheCoord.x%2));\n"
"uint sample[32];\n"
"uint4 sample[32];\n"
"for (uint y = 0; y < 4; ++y) {\n"
"for (uint x = 0; x < 8; ++x) {\n"
"sample[y*8+x] = SampleEFB(subBlockUL+uint2(x,y));\n"
@ -408,8 +399,14 @@ static const char EFB_ENCODE_PS[] =
"uint dw[4];\n"
"for (uint i = 0; i < 4; ++i) {\n"
"dw[i] = UINT_44444444(\n"
"ExtractR4(sample[8*i+0]), ExtractR4(sample[8*i+1]), ExtractR4(sample[8*i+2]), ExtractR4(sample[8*i+3]),\n"
"ExtractR4(sample[8*i+4]), ExtractR4(sample[8*i+5]), ExtractR4(sample[8*i+6]), ExtractR4(sample[8*i+7])\n"
"sample[8*i+0].r >> 4,\n"
"sample[8*i+1].r >> 4,\n"
"sample[8*i+2].r >> 4,\n"
"sample[8*i+3].r >> 4,\n"
"sample[8*i+4].r >> 4,\n"
"sample[8*i+5].r >> 4,\n"
"sample[8*i+6].r >> 4,\n"
"sample[8*i+7].r >> 4\n"
");\n"
"}\n"
@ -423,14 +420,14 @@ static const char EFB_ENCODE_PS[] =
"uint2 blockUL = blockCoord * uint2(4,4);\n"
"uint2 subBlockUL = blockUL + uint2(0, 2*(cacheCoord.x%2));\n"
"uint sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint sample4 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint sample5 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint sample6 = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint sample7 = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint4 sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint4 sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint4 sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint4 sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint4 sample4 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint4 sample5 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint4 sample6 = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint4 sample7 = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint dw0 = UINT_1616(EncodeRGB565(sample0), EncodeRGB565(sample1));\n"
"uint dw1 = UINT_1616(EncodeRGB565(sample2), EncodeRGB565(sample3));\n"
@ -447,14 +444,14 @@ static const char EFB_ENCODE_PS[] =
"uint2 blockUL = blockCoord * uint2(4,4);\n"
"uint2 subBlockUL = blockUL + uint2(0, 2*(cacheCoord.x%2));\n"
"uint sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint sample4 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint sample5 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint sample6 = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint sample7 = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint4 sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint4 sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint4 sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint4 sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint4 sample4 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint4 sample5 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint4 sample6 = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint4 sample7 = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint dw0 = UINT_1616(EncodeRGB5A3(sample0), EncodeRGB5A3(sample1));\n"
"uint dw1 = UINT_1616(EncodeRGB5A3(sample2), EncodeRGB5A3(sample3));\n"
@ -471,37 +468,38 @@ static const char EFB_ENCODE_PS[] =
"uint2 blockUL = blockCoord * uint2(4,4);\n"
"uint2 subBlockUL = blockUL + uint2(0, 2*(cacheCoord.x%2));\n"
"uint sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint sample4 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint sample5 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint sample6 = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint sample7 = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint4 sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint4 sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint4 sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint4 sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint4 sample4 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint4 sample5 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint4 sample6 = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint4 sample7 = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint dw0;\n"
"uint dw1;\n"
"uint dw2;\n"
"uint dw3;\n"
"uint4 dw4;\n"
"if (cacheCoord.x % 4 < 2)\n"
"{\n"
// First cache line gets AR
"dw0 = UINT_8888(ExtractA(sample0), ExtractR(sample0), ExtractA(sample1), ExtractR(sample1));\n"
"dw1 = UINT_8888(ExtractA(sample2), ExtractR(sample2), ExtractA(sample3), ExtractR(sample3));\n"
"dw2 = UINT_8888(ExtractA(sample4), ExtractR(sample4), ExtractA(sample5), ExtractR(sample5));\n"
"dw3 = UINT_8888(ExtractA(sample6), ExtractR(sample6), ExtractA(sample7), ExtractR(sample7));\n"
"dw4 = UINT4_8888_BE(\n"
"uint4(sample0.a, sample2.a, sample4.a, sample6.a),\n"
"uint4(sample0.r, sample2.r, sample4.r, sample6.r),\n"
"uint4(sample1.a, sample3.a, sample5.a, sample7.a),\n"
"uint4(sample1.r, sample3.r, sample5.r, sample7.r)\n"
");\n"
"}\n"
"else\n"
"{\n"
// Second cache line gets GB
"dw0 = UINT_8888(ExtractG(sample0), ExtractB(sample0), ExtractG(sample1), ExtractB(sample1));\n"
"dw1 = UINT_8888(ExtractG(sample2), ExtractB(sample2), ExtractG(sample3), ExtractB(sample3));\n"
"dw2 = UINT_8888(ExtractG(sample4), ExtractB(sample4), ExtractG(sample5), ExtractB(sample5));\n"
"dw3 = UINT_8888(ExtractG(sample6), ExtractB(sample6), ExtractG(sample7), ExtractB(sample7));\n"
"dw4 = UINT4_8888_BE(\n"
"uint4(sample0.g, sample2.g, sample4.g, sample6.g),\n"
"uint4(sample0.b, sample2.b, sample4.b, sample6.b),\n"
"uint4(sample1.g, sample3.g, sample5.g, sample7.g),\n"
"uint4(sample1.b, sample3.b, sample5.b, sample7.b)\n"
");\n"
"}\n"
"return Swap4_32(uint4(dw0, dw1, dw2, dw3));\n"
"return dw4;\n"
"}\n"
"uint4 Generate_7(uint2 cacheCoord)\n"
@ -511,29 +509,31 @@ static const char EFB_ENCODE_PS[] =
"uint2 blockUL = blockCoord * uint2(8,4);\n"
"uint2 subBlockUL = blockUL + uint2(0, 2*(cacheCoord.x%2));\n"
"uint sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint sample4 = SampleEFB(subBlockUL+uint2(4,0));\n"
"uint sample5 = SampleEFB(subBlockUL+uint2(5,0));\n"
"uint sample6 = SampleEFB(subBlockUL+uint2(6,0));\n"
"uint sample7 = SampleEFB(subBlockUL+uint2(7,0));\n"
"uint sample8 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint sample9 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint sampleA = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint sampleB = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint sampleC = SampleEFB(subBlockUL+uint2(4,1));\n"
"uint sampleD = SampleEFB(subBlockUL+uint2(5,1));\n"
"uint sampleE = SampleEFB(subBlockUL+uint2(6,1));\n"
"uint sampleF = SampleEFB(subBlockUL+uint2(7,1));\n"
"uint4 sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint4 sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint4 sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint4 sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint4 sample4 = SampleEFB(subBlockUL+uint2(4,0));\n"
"uint4 sample5 = SampleEFB(subBlockUL+uint2(5,0));\n"
"uint4 sample6 = SampleEFB(subBlockUL+uint2(6,0));\n"
"uint4 sample7 = SampleEFB(subBlockUL+uint2(7,0));\n"
"uint4 sample8 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint4 sample9 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint4 sampleA = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint4 sampleB = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint4 sampleC = SampleEFB(subBlockUL+uint2(4,1));\n"
"uint4 sampleD = SampleEFB(subBlockUL+uint2(5,1));\n"
"uint4 sampleE = SampleEFB(subBlockUL+uint2(6,1));\n"
"uint4 sampleF = SampleEFB(subBlockUL+uint2(7,1));\n"
"uint dw0 = UINT_8888(ExtractA(sample0), ExtractA(sample1), ExtractA(sample2), ExtractA(sample3));\n"
"uint dw1 = UINT_8888(ExtractA(sample4), ExtractA(sample5), ExtractA(sample6), ExtractA(sample7));\n"
"uint dw2 = UINT_8888(ExtractA(sample8), ExtractA(sample9), ExtractA(sampleA), ExtractA(sampleB));\n"
"uint dw3 = UINT_8888(ExtractA(sampleC), ExtractA(sampleD), ExtractA(sampleE), ExtractA(sampleF));\n"
"uint4 dw4 = UINT4_8888_BE(\n"
"uint4(sample0.a, sample4.a, sample8.a, sampleC.a),\n"
"uint4(sample1.a, sample5.a, sample9.a, sampleD.a),\n"
"uint4(sample2.a, sample6.a, sampleA.a, sampleE.a),\n"
"uint4(sample3.a, sample7.a, sampleB.a, sampleF.a)\n"
");\n"
"return Swap4_32(uint4(dw0, dw1, dw2, dw3));\n"
"return dw4;\n"
"}\n"
"uint4 Generate_8(uint2 cacheCoord)\n"
@ -543,29 +543,31 @@ static const char EFB_ENCODE_PS[] =
"uint2 blockUL = blockCoord * uint2(8,4);\n"
"uint2 subBlockUL = blockUL + uint2(0, 2*(cacheCoord.x%2));\n"
"uint sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint sample4 = SampleEFB(subBlockUL+uint2(4,0));\n"
"uint sample5 = SampleEFB(subBlockUL+uint2(5,0));\n"
"uint sample6 = SampleEFB(subBlockUL+uint2(6,0));\n"
"uint sample7 = SampleEFB(subBlockUL+uint2(7,0));\n"
"uint sample8 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint sample9 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint sampleA = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint sampleB = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint sampleC = SampleEFB(subBlockUL+uint2(4,1));\n"
"uint sampleD = SampleEFB(subBlockUL+uint2(5,1));\n"
"uint sampleE = SampleEFB(subBlockUL+uint2(6,1));\n"
"uint sampleF = SampleEFB(subBlockUL+uint2(7,1));\n"
"uint4 sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint4 sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint4 sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint4 sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint4 sample4 = SampleEFB(subBlockUL+uint2(4,0));\n"
"uint4 sample5 = SampleEFB(subBlockUL+uint2(5,0));\n"
"uint4 sample6 = SampleEFB(subBlockUL+uint2(6,0));\n"
"uint4 sample7 = SampleEFB(subBlockUL+uint2(7,0));\n"
"uint4 sample8 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint4 sample9 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint4 sampleA = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint4 sampleB = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint4 sampleC = SampleEFB(subBlockUL+uint2(4,1));\n"
"uint4 sampleD = SampleEFB(subBlockUL+uint2(5,1));\n"
"uint4 sampleE = SampleEFB(subBlockUL+uint2(6,1));\n"
"uint4 sampleF = SampleEFB(subBlockUL+uint2(7,1));\n"
"uint dw0 = UINT_8888(ExtractR(sample0), ExtractR(sample1), ExtractR(sample2), ExtractR(sample3));\n"
"uint dw1 = UINT_8888(ExtractR(sample4), ExtractR(sample5), ExtractR(sample6), ExtractR(sample7));\n"
"uint dw2 = UINT_8888(ExtractR(sample8), ExtractR(sample9), ExtractR(sampleA), ExtractR(sampleB));\n"
"uint dw3 = UINT_8888(ExtractR(sampleC), ExtractR(sampleD), ExtractR(sampleE), ExtractR(sampleF));\n"
"uint4 dw4 = UINT4_8888_BE(\n"
"uint4(sample0.r, sample4.r, sample8.r, sampleC.r),\n"
"uint4(sample1.r, sample5.r, sample9.r, sampleD.r),\n"
"uint4(sample2.r, sample6.r, sampleA.r, sampleE.r),\n"
"uint4(sample3.r, sample7.r, sampleB.r, sampleF.r)\n"
");\n"
"return Swap4_32(uint4(dw0, dw1, dw2, dw3));\n"
"return dw4;\n"
"}\n"
"uint4 Generate_A(uint2 cacheCoord)\n"
@ -575,29 +577,31 @@ static const char EFB_ENCODE_PS[] =
"uint2 blockUL = blockCoord * uint2(8,4);\n"
"uint2 subBlockUL = blockUL + uint2(0, 2*(cacheCoord.x%2));\n"
"uint sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint sample4 = SampleEFB(subBlockUL+uint2(4,0));\n"
"uint sample5 = SampleEFB(subBlockUL+uint2(5,0));\n"
"uint sample6 = SampleEFB(subBlockUL+uint2(6,0));\n"
"uint sample7 = SampleEFB(subBlockUL+uint2(7,0));\n"
"uint sample8 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint sample9 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint sampleA = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint sampleB = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint sampleC = SampleEFB(subBlockUL+uint2(4,1));\n"
"uint sampleD = SampleEFB(subBlockUL+uint2(5,1));\n"
"uint sampleE = SampleEFB(subBlockUL+uint2(6,1));\n"
"uint sampleF = SampleEFB(subBlockUL+uint2(7,1));\n"
"uint4 sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint4 sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint4 sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint4 sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint4 sample4 = SampleEFB(subBlockUL+uint2(4,0));\n"
"uint4 sample5 = SampleEFB(subBlockUL+uint2(5,0));\n"
"uint4 sample6 = SampleEFB(subBlockUL+uint2(6,0));\n"
"uint4 sample7 = SampleEFB(subBlockUL+uint2(7,0));\n"
"uint4 sample8 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint4 sample9 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint4 sampleA = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint4 sampleB = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint4 sampleC = SampleEFB(subBlockUL+uint2(4,1));\n"
"uint4 sampleD = SampleEFB(subBlockUL+uint2(5,1));\n"
"uint4 sampleE = SampleEFB(subBlockUL+uint2(6,1));\n"
"uint4 sampleF = SampleEFB(subBlockUL+uint2(7,1));\n"
"uint dw0 = UINT_8888(ExtractB(sample0), ExtractB(sample1), ExtractB(sample2), ExtractB(sample3));\n"
"uint dw1 = UINT_8888(ExtractB(sample4), ExtractB(sample5), ExtractB(sample6), ExtractB(sample7));\n"
"uint dw2 = UINT_8888(ExtractB(sample8), ExtractB(sample9), ExtractB(sampleA), ExtractB(sampleB));\n"
"uint dw3 = UINT_8888(ExtractB(sampleC), ExtractB(sampleD), ExtractB(sampleE), ExtractB(sampleF));\n"
"uint4 dw4 = UINT4_8888_BE(\n"
"uint4(sample0.b, sample4.b, sample8.b, sampleC.b),\n"
"uint4(sample1.b, sample5.b, sample9.b, sampleD.b),\n"
"uint4(sample2.b, sample6.b, sampleA.b, sampleE.b),\n"
"uint4(sample3.b, sample7.b, sampleB.b, sampleF.b)\n"
");\n"
"return Swap4_32(uint4(dw0, dw1, dw2, dw3));\n"
"return dw4;\n"
"}\n"
"uint4 Generate_B(uint2 cacheCoord)\n"
@ -607,21 +611,23 @@ static const char EFB_ENCODE_PS[] =
"uint2 blockUL = blockCoord * uint2(4,4);\n"
"uint2 subBlockUL = blockUL + uint2(0, 2*(cacheCoord.x%2));\n"
"uint sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint sample4 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint sample5 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint sample6 = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint sample7 = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint4 sample0 = SampleEFB(subBlockUL+uint2(0,0));\n"
"uint4 sample1 = SampleEFB(subBlockUL+uint2(1,0));\n"
"uint4 sample2 = SampleEFB(subBlockUL+uint2(2,0));\n"
"uint4 sample3 = SampleEFB(subBlockUL+uint2(3,0));\n"
"uint4 sample4 = SampleEFB(subBlockUL+uint2(0,1));\n"
"uint4 sample5 = SampleEFB(subBlockUL+uint2(1,1));\n"
"uint4 sample6 = SampleEFB(subBlockUL+uint2(2,1));\n"
"uint4 sample7 = SampleEFB(subBlockUL+uint2(3,1));\n"
"uint dw0 = UINT_8888(ExtractG(sample0), ExtractR(sample0), ExtractG(sample1), ExtractR(sample1));\n"
"uint dw1 = UINT_8888(ExtractG(sample2), ExtractR(sample2), ExtractG(sample3), ExtractR(sample3));\n"
"uint dw2 = UINT_8888(ExtractG(sample4), ExtractR(sample4), ExtractG(sample5), ExtractR(sample5));\n"
"uint dw3 = UINT_8888(ExtractG(sample6), ExtractR(sample6), ExtractG(sample7), ExtractR(sample7));\n"
"uint4 dw4 = UINT4_8888_BE(\n"
"uint4(sample0.g, sample2.g, sample4.g, sample6.g),\n"
"uint4(sample0.r, sample2.r, sample4.r, sample6.r),\n"
"uint4(sample1.g, sample3.g, sample5.g, sample7.g),\n"
"uint4(sample1.r, sample3.r, sample5.r, sample7.r)\n"
");\n"
"return Swap4_32(uint4(dw0, dw1, dw2, dw3));\n"
"return dw4;\n"
"}\n"
"#ifdef DYNAMIC_MODE\n"