/* Copyright 2016-2024 melonDS team This file is part of melonDS. melonDS is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. melonDS is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with melonDS. If not, see http://www.gnu.org/licenses/. */ #ifndef GPU3D_COMPUTE_SHADERS #define GPU3D_COMPUTE_SHADERS #include namespace melonDS { namespace ComputeRendererShaders { // defines: // InterpSpans // BinCombined // Rasterise // DepthBlend // ClearCoarseBinMask // ClearIndirectWorkCount // CalculateWorkOffsets // SortWork // FinalPass // AntiAliasing // EdgeMarking // Fog // ZBuffer // WBuffer // for Rasterise // NoTexture // UseTexture // Decal // Modulate // Toon // Highlight // ShadowMask /* Some notes on signed division: we want to avoid it, so we can avoid higher precision numbers in a few places. Fortunately all divisions *should* assuming I'm not mistaken have the same sign on the divisor and the dividend. 
Thus we apply: assuming n < 0 <=> d < 0 n/d = abs(n)/abs(d) */ const std::string XSpanSetupBuffer{R"( const uint XSpanSetup_Linear = 1U << 0; const uint XSpanSetup_FillInside = 1U << 1; const uint XSpanSetup_FillLeft = 1U << 2; const uint XSpanSetup_FillRight = 1U << 3; struct XSpanSetup { int X0, X1; int InsideStart, InsideEnd, EdgeCovL, EdgeCovR; int XRecip; uint Flags; int Z0, Z1, W0, W1; int ColorR0, ColorG0, ColorB0; int ColorR1, ColorG1, ColorB1; int TexcoordU0, TexcoordV0; int TexcoordU1, TexcoordV1; int CovLInitial, CovRInitial; }; #if defined(Rasterise) int CalcYFactorX(XSpanSetup span, int x) { x -= span.X0; if (span.X0 != span.X1) { uint numLo = uint(x) * uint(span.W0); uint numHi = 0U; numHi |= numLo >> (32U-YFactorShift); numLo <<= YFactorShift; uint den = uint(x) * uint(span.W0) + uint(span.X1 - span.X0 - x) * uint(span.W1); if (den == 0) return 0; else return int(Div64_32_32(numHi, numLo, den)); } else { return 0; } } #endif layout (std430, binding = 1) buffer XSpanSetupsBuffer { XSpanSetup XSpanSetups[]; }; )"}; const std::string YSpanSetupBuffer{R"( struct YSpanSetup { // Attributes int Z0, Z1, W0, W1; int ColorR0, ColorG0, ColorB0; int ColorR1, ColorG1, ColorB1; int TexcoordU0, TexcoordV0; int TexcoordU1, TexcoordV1; // Interpolator int I0, I1; bool Linear; int IRecip; int W0n, W0d, W1d; // Slope int Increment; int X0, X1, Y0, Y1; int XMin, XMax; int DxInitial; int XCovIncr; bool IsDummy; }; #if defined(InterpSpans) int CalcYFactorY(YSpanSetup span, int i) { /* maybe it would be better to do use a 32x32=64 multiplication? 
*/ uint numLo = uint(abs(i)) * uint(span.W0n); uint numHi = 0U; numHi |= numLo >> (32U-YFactorShift); numLo <<= YFactorShift; uint den = uint(abs(i)) * uint(span.W0d) + uint(abs(span.I1 - span.I0 - i)) * span.W1d; if (den == 0) { return 0; } else { return int(Div64_32_32(numHi, numLo, den)); } } int CalculateDx(int y, YSpanSetup span) { return span.DxInitial + (y - span.Y0) * span.Increment; } int CalculateX(int dx, YSpanSetup span) { int x = span.X0; if (span.X1 < span.X0) x -= dx >> 18; else x += dx >> 18; return clamp(x, span.XMin, span.XMax); } void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov) { bool negative = span.X1 < span.X0; int len; if (side != negative) len = (dx >> 18) - ((dx-span.Increment) >> 18); else len = ((dx+span.Increment) >> 18) - (dx >> 18); edgelen = len; int xlen = span.XMax + 1 - span.XMin; int startx = dx >> 18; if (negative) startx = xlen - startx; if (side) startx = startx - len + 1; uint r; int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r)); edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF); } void EdgeParams_YMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov) { bool negative = span.X1 < span.X0; edgelen = 1; if (span.Increment == 0) { edgecov = 31; } else { int cov = ((dx >> 9) + (span.Increment >> 10)) >> 4; if ((cov >> 5) != (dx >> 18)) cov = 31; cov &= 0x1F; if (side == negative) cov = 0x1F - cov; edgecov = cov; } } #endif layout (std430, binding = 2) buffer YSpanSetupsBuffer { YSpanSetup YSpanSetups[]; }; )"}; const std::string PolygonBuffer{R"( struct Polygon { int FirstXSpan; int YTop, YBot; int XMin, XMax; int XMinY, XMaxY; int Variant; uint Attr; float TextureLayer; }; layout (std430, binding = 0) readonly buffer PolygonBuffer { Polygon Polygons[]; }; )"}; const std::string BinningBuffer{R"( layout (std430, binding = 6) buffer BinResultBuffer { uvec4 VariantWorkCount[MaxVariants]; uint 
SortedWorkOffset[MaxVariants]; uvec4 SortWorkWorkCount; uint BinningMaskAndOffset[]; //uint BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride]; //uint BinnedMask[TilesPerLine*TileLines*BinStride]; //uint WorkOffsets[TilesPerLine*TileLines*BinStride]; }; const int BinningCoarseMaskStart = 0; const int BinningMaskStart = BinningCoarseMaskStart+TilesPerLine*TileLines*CoarseBinStride; const int BinningWorkOffsetsStart = BinningMaskStart+TilesPerLine*TileLines*BinStride; )"}; /* structure of each WorkDesc item: x: bits 0-10: polygon idx bits 11-31: tile idx (before sorting within variant after sorting within all tiles) y: bits 0-15: X position on screen bits 15-31: Y position on screen */ const std::string WorkDescBuffer{R"( layout (std430, binding = 7) buffer WorkDescBuffer { //uvec2 UnsortedWorkDescs[MaxWorkTiles]; //uvec2 SortedWorkDescs[MaxWorkTiles]; uvec2 WorkDescs[]; }; const uint WorkDescsUnsortedStart = 0; const uint WorkDescsSortedStart = WorkDescsUnsortedStart+MaxWorkTiles; )"}; const std::string Tilebuffers{R"( layout (std430, binding = 2) buffer ColorTileBuffer { uint ColorTiles[]; }; layout (std430, binding = 3) buffer DepthTileBuffer { uint DepthTiles[]; }; layout (std430, binding = 4) buffer AttrTileBuffer { uint AttrTiles[]; }; )"}; const std::string ResultBuffer{R"( layout (std430, binding = 5) buffer ResultBuffer { uint ResultValue[]; }; const uint ResultColorStart = 0; const uint ResultDepthStart = ResultColorStart+ScreenWidth*ScreenHeight*2; const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2; )"}; const char* Common = R"( #define TileSize 8 const int CoarseTileCountX = 8; const int CoarseTileCountY = 4; const int CoarseTileW = (CoarseTileCountX * TileSize); const int CoarseTileH = (CoarseTileCountY * TileSize); const int FramebufferStride = ScreenWidth*ScreenHeight; const int TilesPerLine = ScreenWidth/TileSize; const int TileLines = ScreenHeight/TileSize; const int BinStride = 2048/32; const int CoarseBinStride = 
BinStride/32; const int MaxVariants = 256; layout (std140, binding = 0) uniform MetaUniform { uint NumPolygons; uint NumVariants; int AlphaRef; uint DispCnt; // r = Toon // g = Fog Density // b = Edge Color uvec4 ToonTable[34]; uint ClearColor, ClearDepth, ClearAttr; uint FogOffset, FogShift, FogColor; }; #ifdef InterpSpans const int YFactorShift = 9; #else const int YFactorShift = 8; #endif #if defined(InterpSpans) || defined(Rasterise) uint Umulh(uint a, uint b) { uint lo, hi; umulExtended(a, b, hi, lo); return hi; } const uint startTable[256] = uint[256]( 254, 252, 250, 248, 246, 244, 242, 240, 238, 236, 234, 233, 231, 229, 227, 225, 224, 222, 220, 218, 217, 215, 213, 212, 210, 208, 207, 205, 203, 202, 200, 199, 197, 195, 194, 192, 191, 189, 188, 186, 185, 183, 182, 180, 179, 178, 176, 175, 173, 172, 170, 169, 168, 166, 165, 164, 162, 161, 160, 158, 157, 156, 154, 153, 152, 151, 149, 148, 147, 146, 144, 143, 142, 141, 139, 138, 137, 136, 135, 134, 132, 131, 130, 129, 128, 127, 126, 125, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 88, 87, 86, 85, 84, 83, 82, 81, 80, 80, 79, 78, 77, 76, 75, 74, 74, 73, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 57, 56, 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0 ); uint Div(uint x, uint y, out uint r) { // https://www.microsoft.com/en-us/research/publication/software-integer-division/ uint k = 31 - findMSB(y); uint ty = (y << k) >> (32 - 9); uint t = startTable[ty - 256] + 256; uint z = (t << (32 - 9)) >> (32 - k - 1); uint my = 0 - y; z += Umulh(z, my * z); z += Umulh(z, my 
* z); uint q = Umulh(x, z); r = x - y * q; if(r >= y) { r = r - y; q = q + 1; if(r >= y) { r = r - y; q = q + 1; } } return q; } uint Div64_32_32(uint numHi, uint numLo, uint den) { // based on https://github.com/ridiculousfish/libdivide/blob/3bd34388573681ce563348cdf04fe15d24770d04/libdivide.h#L469 // modified to work with half the size 64/32=32 instead of 128/64=64 // for further details see https://ridiculousfish.com/blog/posts/labor-of-division-episode-iv.html // We work in base 2**16. // A uint32 holds a single digit (in the lower 16 bit). A uint32 holds two digits. // Our numerator is conceptually [num3, num2, num1, num0]. // Our denominator is [den1, den0]. const uint b = (1U << 16); // Determine the normalization factor. We multiply den by this, so that its leading digit is at // least half b. In binary this means just shifting left by the number of leading zeros, so that // there's a 1 in the MSB. // We also shift numer by the same amount. This cannot overflow because numHi < den. // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting // by 64. (it's also UB in GLSL!!!!) uint shift = 31 - findMSB(den); den <<= shift; numHi <<= shift; numHi |= (numLo >> (-shift & 31U)) & uint(-int(shift) >> 31); numLo <<= shift; // Extract the low digits of the numerator and both digits of the denominator. uint num1 = (numLo >> 16); uint num0 = (numLo & 0xFFFFU); uint den1 = (den >> 16); uint den0 = (den & 0xFFFFU); // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. // Estimate q1 as [n3 n2] / [d1], and then correct it. // Note while qhat may be 2 digits, q1 is always 1 digit. uint rhat; uint qhat = Div(numHi, den1, rhat); uint c1 = qhat * den0; uint c2 = rhat * b + num1; if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; uint q1 = qhat & 0xFFFFU; // Compute the true (partial) remainder. uint rem = numHi * b + num1 - q1 * den; // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. // Estimate q0 as [rem1 rem0] / [d1] and correct it. 
qhat = Div(rem, den1, rhat); c1 = qhat * den0; c2 = rhat * b + num0; if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; return bitfieldInsert(qhat, q1, 16, 16); } int InterpolateAttrPersp(int y0, int y1, int ifactor) { if (y0 == y1) return y0; if (y0 < y1) return y0 + (((y1-y0) * ifactor) >> YFactorShift); else return y1 + (((y0-y1) * ((1<> YFactorShift); } int InterpolateAttrLinear(int y0, int y1, int i, int irecip, int idiff) { if (y0 == y1) return y0; #ifndef Rasterise irecip = abs(irecip); #endif uint mulLo, mulHi, carry; if (y0 < y1) { #ifndef Rasterise uint offset = uint(abs(i)); #else uint offset = uint(i); #endif umulExtended(uint(y1-y0)*offset, uint(irecip), mulHi, mulLo); mulLo = uaddCarry(mulLo, 3U<<24, carry); mulHi += carry; return y0 + int((mulLo >> 30) | (mulHi << (32 - 30))); //return y0 + int(((int64_t(y1-y0) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30); } else { #ifndef Rasterise uint offset = uint(abs(idiff-i)); #else uint offset = uint(idiff-i); #endif umulExtended(uint(y0-y1)*offset, uint(irecip), mulHi, mulLo); mulLo = uaddCarry(mulLo, 3<<24, carry); mulHi += carry; return y1 + int((mulLo >> 30) | (mulHi << (32 - 30))); //return y1 + int(((int64_t(y0-y1) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30); } } uint InterpolateZZBuffer(int z0, int z1, int i, int irecip, int idiff) { if (z0 == z1) return z0; uint base, disp, factor; if (z0 < z1) { base = uint(z0); disp = uint(z1 - z0); factor = uint(abs(i)); } else { base = uint(z1); disp = uint(z0 - z1), factor = uint(abs(idiff - i)); } #ifdef InterpSpans int shiftl = 0; const int shiftr = 22; if (disp > 0x3FF) { shiftl = findMSB(disp) - 9; disp >>= shiftl; } #else disp >>= 9; const int shiftl = 0; const int shiftr = 13; #endif uint mulLo, mulHi; umulExtended(disp * factor, abs(irecip) >> 8, mulHi, mulLo); return base + (((mulLo >> shiftr) | (mulHi << (32 - shiftr))) << shiftl); /* int base, disp, factor; if (z0 < z1) { base = z0; disp = z1 - z0; factor = i; } else { base 
= z1; disp = z0 - z1, factor = idiff - i; } #ifdef InterpSpans { int shift = 0; while (disp > 0x3FF) { disp >>= 1; shift++; } return base + int(((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 22) << shift); } #else { disp >>= 9; return base + int((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 13); } #endif*/ } uint InterpolateZWBuffer(int z0, int z1, int ifactor) { if (z0 == z1) return z0; #ifdef Rasterise // since the precision along x spans is only 8 bit the result will always fit in 32-bit if (z0 < z1) { return uint(z0) + (((z1-z0) * ifactor) >> YFactorShift); } else { return uint(z1) + (((z0-z1) * ((1<> YFactorShift); } #else uint mulLo, mulHi; if (z0 < z1) { umulExtended(z1-z0, ifactor, mulHi, mulLo); // 64-bit shift return uint(z0) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift))); } else { umulExtended(z0-z1, (1<> YFactorShift) | (mulHi << (32-YFactorShift))); } #endif /*if (z0 < z1) { return uint(z0) + uint((int64_t(z1-z0) * int64_t(ifactor)) >> YFactorShift); } else { return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<> YFactorShift); }*/ } #endif )"; const std::string InterpSpans = PolygonBuffer + XSpanSetupBuffer + YSpanSetupBuffer + R"( layout (local_size_x = 32) in; layout (binding = 0, rgba16ui) uniform readonly uimageBuffer SetupIndices; void main() { uvec4 setup = imageLoad(SetupIndices, int(gl_GlobalInvocationID.x)); YSpanSetup spanL = YSpanSetups[setup.y]; YSpanSetup spanR = YSpanSetups[setup.z]; XSpanSetup xspan; xspan.Flags = 0U; int y = int(setup.w); int dxl = CalculateDx(y, spanL); int dxr = CalculateDx(y, spanR); int xl = CalculateX(dxl, spanL); int xr = CalculateX(dxr, spanR); Polygon polygon = Polygons[setup.x]; int edgeLenL, edgeLenR; if (xl > xr) { YSpanSetup tmpSpan = spanL; spanL = spanR; spanR = tmpSpan; int tmp = xl; xl = xr; xr = tmp; EdgeParams_YMajor(false, dxr, spanL, edgeLenL, xspan.EdgeCovL); EdgeParams_YMajor(true, dxl, spanR, edgeLenR, xspan.EdgeCovR); } else { // edges are the right way 
if (spanL.Increment > 0x40000) EdgeParams_XMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL); else EdgeParams_YMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL); if (spanR.Increment > 0x40000) EdgeParams_XMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR); else EdgeParams_YMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR); } xspan.CovLInitial = (xspan.EdgeCovL >> 12) & 0x3FF; if (xspan.CovLInitial == 0x3FF) xspan.CovLInitial = 0; xspan.CovRInitial = (xspan.EdgeCovR >> 12) & 0x3FF; if (xspan.CovRInitial == 0x3FF) xspan.CovRInitial = 0; xspan.X0 = xl; xspan.X1 = xr + 1; uint polyalpha = ((polygon.Attr >> 16) & 0x1FU); bool isWireframe = polyalpha == 0U; if (!isWireframe || (y == polygon.YTop || y == polygon.YBot - 1)) xspan.Flags |= XSpanSetup_FillInside; xspan.InsideStart = xspan.X0 + edgeLenL; if (xspan.InsideStart > xspan.X1) xspan.InsideStart = xspan.X1; xspan.InsideEnd = xspan.X1 - edgeLenR; if (xspan.InsideEnd > xspan.X1) xspan.InsideEnd = xspan.X1; bool isShadowMask = ((polygon.Attr & 0x3F000030U) == 0x00000030U); bool fillAllEdges = polyalpha < 31 || (DispCnt & (3U<<4)) != 0U; if (fillAllEdges || spanL.X1 < spanL.X0 || spanL.Increment <= 0x40000) xspan.Flags |= XSpanSetup_FillLeft; if (fillAllEdges || (spanR.X1 >= spanR.X0 && spanR.Increment > 0x40000) || spanR.Increment == 0) xspan.Flags |= XSpanSetup_FillRight; if (spanL.I0 == spanL.I1) { xspan.TexcoordU0 = spanL.TexcoordU0; xspan.TexcoordV0 = spanL.TexcoordV0; xspan.ColorR0 = spanL.ColorR0; xspan.ColorG0 = spanL.ColorG0; xspan.ColorB0 = spanL.ColorB0; xspan.Z0 = spanL.Z0; xspan.W0 = spanL.W0; } else { int i = (spanL.Increment > 0x40000 ? 
xl : y) - spanL.I0; int ifactor = CalcYFactorY(spanL, i); int idiff = spanL.I1 - spanL.I0; #ifdef ZBuffer xspan.Z0 = int(InterpolateZZBuffer(spanL.Z0, spanL.Z1, i, spanL.IRecip, idiff)); #endif #ifdef WBuffer xspan.Z0 = int(InterpolateZWBuffer(spanL.Z0, spanL.Z1, ifactor)); #endif if (!spanL.Linear) { xspan.TexcoordU0 = InterpolateAttrPersp(spanL.TexcoordU0, spanL.TexcoordU1, ifactor); xspan.TexcoordV0 = InterpolateAttrPersp(spanL.TexcoordV0, spanL.TexcoordV1, ifactor); xspan.ColorR0 = InterpolateAttrPersp(spanL.ColorR0, spanL.ColorR1, ifactor); xspan.ColorG0 = InterpolateAttrPersp(spanL.ColorG0, spanL.ColorG1, ifactor); xspan.ColorB0 = InterpolateAttrPersp(spanL.ColorB0, spanL.ColorB1, ifactor); xspan.W0 = InterpolateAttrPersp(spanL.W0, spanL.W1, ifactor); } else { xspan.TexcoordU0 = InterpolateAttrLinear(spanL.TexcoordU0, spanL.TexcoordU1, i, spanL.IRecip, idiff); xspan.TexcoordV0 = InterpolateAttrLinear(spanL.TexcoordV0, spanL.TexcoordV1, i, spanL.IRecip, idiff); xspan.ColorR0 = InterpolateAttrLinear(spanL.ColorR0, spanL.ColorR1, i, spanL.IRecip, idiff); xspan.ColorG0 = InterpolateAttrLinear(spanL.ColorG0, spanL.ColorG1, i, spanL.IRecip, idiff); xspan.ColorB0 = InterpolateAttrLinear(spanL.ColorB0, spanL.ColorB1, i, spanL.IRecip, idiff); xspan.W0 = spanL.W0; // linear mode is only taken if W0 == W1 } } if (spanR.I0 == spanR.I1) { xspan.TexcoordU1 = spanR.TexcoordU0; xspan.TexcoordV1 = spanR.TexcoordV0; xspan.ColorR1 = spanR.ColorR0; xspan.ColorG1 = spanR.ColorG0; xspan.ColorB1 = spanR.ColorB0; xspan.Z1 = spanR.Z0; xspan.W1 = spanR.W0; } else { int i = (spanR.Increment > 0x40000 ? 
xr : y) - spanR.I0; int ifactor = CalcYFactorY(spanR, i); int idiff = spanR.I1 - spanR.I0; #ifdef ZBuffer xspan.Z1 = int(InterpolateZZBuffer(spanR.Z0, spanR.Z1, i, spanR.IRecip, idiff)); #endif #ifdef WBuffer xspan.Z1 = int(InterpolateZWBuffer(spanR.Z0, spanR.Z1, ifactor)); #endif if (!spanR.Linear) { xspan.TexcoordU1 = InterpolateAttrPersp(spanR.TexcoordU0, spanR.TexcoordU1, ifactor); xspan.TexcoordV1 = InterpolateAttrPersp(spanR.TexcoordV0, spanR.TexcoordV1, ifactor); xspan.ColorR1 = InterpolateAttrPersp(spanR.ColorR0, spanR.ColorR1, ifactor); xspan.ColorG1 = InterpolateAttrPersp(spanR.ColorG0, spanR.ColorG1, ifactor); xspan.ColorB1 = InterpolateAttrPersp(spanR.ColorB0, spanR.ColorB1, ifactor); xspan.W1 = int(InterpolateAttrPersp(spanR.W0, spanR.W1, ifactor)); } else { xspan.TexcoordU1 = InterpolateAttrLinear(spanR.TexcoordU0, spanR.TexcoordU1, i, spanR.IRecip, idiff); xspan.TexcoordV1 = InterpolateAttrLinear(spanR.TexcoordV0, spanR.TexcoordV1, i, spanR.IRecip, idiff); xspan.ColorR1 = InterpolateAttrLinear(spanR.ColorR0, spanR.ColorR1, i, spanR.IRecip, idiff); xspan.ColorG1 = InterpolateAttrLinear(spanR.ColorG0, spanR.ColorG1, i, spanR.IRecip, idiff); xspan.ColorB1 = InterpolateAttrLinear(spanR.ColorB0, spanR.ColorB1, i, spanR.IRecip, idiff); xspan.W1 = spanR.W0; } } if (xspan.W0 == xspan.W1 && ((xspan.W0 | xspan.W1) & 0x7F) == 0) { xspan.Flags |= XSpanSetup_Linear; // a bit hacky, but when wbuffering we only need to calculate xrecip for linear spans #ifdef ZBuffer } { #endif uint r; xspan.XRecip = int(Div(1U<<30, uint(xspan.X1 - xspan.X0), r)); } XSpanSetups[gl_GlobalInvocationID.x] = xspan; } )"; const std::string ClearIndirectWorkCount = BinningBuffer + R"( layout (local_size_x = 32) in; void main() { VariantWorkCount[gl_GlobalInvocationID.x] = uvec4(1, 1, 0, 0); } )"; const std::string ClearCoarseBinMask = BinningBuffer + R"( layout (local_size_x = 32) in; void main() { BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+0] = 
0; BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+1] = 0; } )"; const std::string BinCombined = PolygonBuffer + BinningBuffer + XSpanSetupBuffer + WorkDescBuffer + R"( layout (local_size_x = 32) in; bool BinPolygon(Polygon polygon, ivec2 topLeft, ivec2 botRight) { if (polygon.YTop > botRight.y || polygon.YBot <= topLeft.y) return false; int polygonHeight = polygon.YBot - polygon.YTop; /* All (good) polygons are convex. So the following holds true: Starting from the top most point where both edges originate the X coordinate of the left edge will stay the same or falls until the minimum X-axis coordinate is reached. Then it stays the same or rises until the point it meets with the right edge. The same applies to the right edge, except that it first may rise or stay equal and after the maximum point may only fall or stay the same. This means that for every tile which doesn't contain the point where the direction changes we can just get the maximum point by sampling the top most and bottom most coordinate within the tile. For a tile which is that the height of the direction change As a sidenote another consequence of this design decision is that malformed polygons aren't binned properly. As a note bottom Y is exclusive! 
*/ int polyInnerTopY = clamp(topLeft.y - polygon.YTop, 0, max(polygonHeight-1, 0)); int polyInnerBotY = clamp(botRight.y - polygon.YTop, 0, max(polygonHeight-1, 0)); XSpanSetup xspanTop = XSpanSetups[polygon.FirstXSpan + polyInnerTopY]; XSpanSetup xspanBot = XSpanSetups[polygon.FirstXSpan + polyInnerBotY]; int minXL; if (polygon.XMinY >= topLeft.y && polygon.XMinY <= botRight.y) minXL = polygon.XMin; else minXL = min(xspanTop.X0, xspanBot.X0); if (minXL > botRight.x) return false; int maxXR; if (polygon.XMaxY >= topLeft.y && polygon.XMaxY <= botRight.y) maxXR = polygon.XMax; else maxXR = max(xspanTop.X1, xspanBot.X1) - 1; if (maxXR < topLeft.x) return false; return true; } shared uint mergedMaskShared; void main() { int groupIdx = int(gl_WorkGroupID.x); ivec2 coarseTile = ivec2(gl_WorkGroupID.yz); #if 0 int localIdx = int(gl_SubGroupInvocationARB); #else int localIdx = int(gl_LocalInvocationIndex); if (localIdx == 0) mergedMaskShared = 0U; barrier(); #endif int polygonIdx = groupIdx * 32 + localIdx; ivec2 coarseTopLeft = coarseTile * ivec2(CoarseTileW, CoarseTileH); ivec2 coarseBotRight = coarseTopLeft + ivec2(CoarseTileW-1, CoarseTileH-1); bool binned = false; if (polygonIdx < NumPolygons) { binned = BinPolygon(Polygons[polygonIdx], coarseTopLeft, coarseBotRight); } #if 0 uint mergedMask = unpackUint2x32(ballotARB(binned)).x; #else if (binned) atomicOr(mergedMaskShared, 1U << localIdx); barrier(); uint mergedMask = mergedMaskShared; #endif ivec2 fineTile = ivec2(localIdx & 0x7, localIdx >> 3); ivec2 fineTileTopLeft = coarseTopLeft + fineTile * ivec2(TileSize, TileSize); ivec2 fineTileBotRight = fineTileTopLeft + ivec2(TileSize-1, TileSize-1); uint binnedMask = 0U; while (mergedMask != 0U) { int bit = findLSB(mergedMask); mergedMask &= ~(1U << bit); int polygonIdx = groupIdx * 32 + bit; if (BinPolygon(Polygons[polygonIdx], fineTileTopLeft, fineTileBotRight)) binnedMask |= 1U << bit; } int linearTile = fineTile.x + fineTile.y * TilesPerLine + coarseTile.x * 
CoarseTileCountX + coarseTile.y * TilesPerLine * CoarseTileCountY; BinningMaskAndOffset[BinningMaskStart + linearTile * BinStride + groupIdx] = binnedMask; int coarseMaskIdx = linearTile * CoarseBinStride + (groupIdx >> 5); if (binnedMask != 0U) atomicOr(BinningMaskAndOffset[BinningCoarseMaskStart + coarseMaskIdx], 1U << (groupIdx & 0x1F)); if (binnedMask != 0U) { uint workOffset = atomicAdd(VariantWorkCount[0].w, uint(bitCount(binnedMask))); BinningMaskAndOffset[BinningWorkOffsetsStart + linearTile * BinStride + groupIdx] = workOffset; uint tilePositionCombined = bitfieldInsert(fineTileTopLeft.x, fineTileTopLeft.y, 16, 16); int idx = 0; while (binnedMask != 0U) { int bit = findLSB(binnedMask); binnedMask &= ~(1U << bit); int polygonIdx = groupIdx * 32 + bit; int variantIdx = Polygons[polygonIdx].Variant; int inVariantOffset = int(atomicAdd(VariantWorkCount[variantIdx].z, 1)); WorkDescs[WorkDescsUnsortedStart + workOffset + idx] = uvec2(tilePositionCombined, bitfieldInsert(polygonIdx, inVariantOffset, 11, 21)); idx++; } } } )"; const std::string CalcOffsets = BinningBuffer + R"( layout (local_size_x = 32) in; void main() { if (gl_GlobalInvocationID.x < NumVariants) { if (gl_GlobalInvocationID.x == 0) { // a bit of a cheat putting this here, but this shader won't run that often SortWorkWorkCount = uvec4((VariantWorkCount[0].w + 31) / 32, 1, 1, 0); } SortedWorkOffset[gl_GlobalInvocationID.x] = atomicAdd(VariantWorkCount[1].w, VariantWorkCount[gl_GlobalInvocationID.x].z); } } )"; const std::string SortWork = PolygonBuffer + BinningBuffer + WorkDescBuffer + R"( layout (local_size_x = 32) in; void main() { if (gl_GlobalInvocationID.x < VariantWorkCount[0].w) { uvec2 workDesc = WorkDescs[WorkDescsUnsortedStart + gl_GlobalInvocationID.x]; int inVariantOffset = int(bitfieldExtract(workDesc.y, 11, 21)); int polygonIdx = int(bitfieldExtract(workDesc.y, 0, 11)); int variantIdx = Polygons[polygonIdx].Variant; int sortedIndex = int(SortedWorkOffset[variantIdx]) + 
inVariantOffset; WorkDescs[WorkDescsSortedStart + sortedIndex] = uvec2(workDesc.x, bitfieldInsert(workDesc.y, gl_GlobalInvocationID.x, 11, 21)); } } )"; const std::string Rasterise = PolygonBuffer + WorkDescBuffer + XSpanSetupBuffer + BinningBuffer + Tilebuffers + R"( layout (local_size_x = TileSize, local_size_y = TileSize) in; layout (binding = 0) uniform usampler2DArray CurrentTexture; layout (location = 0) uniform uint CurVariant; layout (location = 1) uniform vec2 InvTextureSize; void main() { uvec2 workDesc = WorkDescs[WorkDescsSortedStart + SortedWorkOffset[CurVariant] + gl_WorkGroupID.z]; Polygon polygon = Polygons[bitfieldExtract(workDesc.y, 0, 11)]; ivec2 position = ivec2(bitfieldExtract(workDesc.x, 0, 16), bitfieldExtract(workDesc.x, 16, 16)) + ivec2(gl_LocalInvocationID.xy); int tileOffset = int(bitfieldExtract(workDesc.y, 11, 21)) * TileSize * TileSize + TileSize * int(gl_LocalInvocationID.y) + int(gl_LocalInvocationID.x); uint color = 0U; if (position.y >= polygon.YTop && position.y < polygon.YBot) { XSpanSetup xspan = XSpanSetups[polygon.FirstXSpan + (position.y - polygon.YTop)]; bool insideLeftEdge = position.x < xspan.InsideStart; bool insideRightEdge = position.x >= xspan.InsideEnd; bool insidePolygonInside = !insideLeftEdge && !insideRightEdge; if (position.x >= xspan.X0 && position.x < xspan.X1 && ((insideLeftEdge && (xspan.Flags & XSpanSetup_FillLeft) != 0U) || (insideRightEdge && (xspan.Flags & XSpanSetup_FillRight) != 0U) || (insidePolygonInside && (xspan.Flags & XSpanSetup_FillInside) != 0U))) { uint attr = 0; if (position.y == polygon.YTop) attr |= 0x4U; else if (position.y == polygon.YBot - 1) attr |= 0x8U; if (insideLeftEdge) { attr |= 0x1U; int cov = xspan.EdgeCovL; if (cov < 0) { int xcov = xspan.CovLInitial + (xspan.EdgeCovL & 0x3FF) * (position.x - xspan.X0); cov = min(xcov >> 5, 31); } attr |= uint(cov) << 8; } else if (insideRightEdge) { attr |= 0x2U; int cov = xspan.EdgeCovR; if (cov < 0) { int xcov = xspan.CovRInitial + 
(xspan.EdgeCovR & 0x3FF) * (position.x - xspan.InsideEnd); cov = max(0x1F - (xcov >> 5), 0); } attr |= uint(cov) << 8; } uint z; int u, v, vr, vg, vb; if (xspan.X0 == xspan.X1) { z = xspan.Z0; u = xspan.TexcoordU0; v = xspan.TexcoordV0; vr = xspan.ColorR0; vg = xspan.ColorG0; vb = xspan.ColorB0; } else { int ifactor = CalcYFactorX(xspan, position.x); int idiff = xspan.X1 - xspan.X0; int i = position.x - xspan.X0; #ifdef ZBuffer z = InterpolateZZBuffer(xspan.Z0, xspan.Z1, i, xspan.XRecip, idiff); #endif #ifdef WBuffer z = InterpolateZWBuffer(xspan.Z0, xspan.Z1, ifactor); #endif if ((xspan.Flags & XSpanSetup_Linear) == 0U) { u = InterpolateAttrPersp(xspan.TexcoordU0, xspan.TexcoordU1, ifactor); v = InterpolateAttrPersp(xspan.TexcoordV0, xspan.TexcoordV1, ifactor); vr = InterpolateAttrPersp(xspan.ColorR0, xspan.ColorR1, ifactor); vg = InterpolateAttrPersp(xspan.ColorG0, xspan.ColorG1, ifactor); vb = InterpolateAttrPersp(xspan.ColorB0, xspan.ColorB1, ifactor); } else { u = InterpolateAttrLinear(xspan.TexcoordU0, xspan.TexcoordU1, i, xspan.XRecip, idiff); v = InterpolateAttrLinear(xspan.TexcoordV0, xspan.TexcoordV1, i, xspan.XRecip, idiff); vr = InterpolateAttrLinear(xspan.ColorR0, xspan.ColorR1, i, xspan.XRecip, idiff); vg = InterpolateAttrLinear(xspan.ColorG0, xspan.ColorG1, i, xspan.XRecip, idiff); vb = InterpolateAttrLinear(xspan.ColorB0, xspan.ColorB1, i, xspan.XRecip, idiff); } } #ifndef ShadowMask vr >>= 3; vg >>= 3; vb >>= 3; uint r, g, b, a; uint polyalpha = bitfieldExtract(polygon.Attr, 16, 5); #ifdef Toon uint tooncolor = ToonTable[vr >> 1].r; vr = int(bitfieldExtract(tooncolor, 0, 8)); vg = int(bitfieldExtract(tooncolor, 8, 8)); vb = int(bitfieldExtract(tooncolor, 16, 8)); #endif #ifdef Highlight vg = vr; vb = vr; #endif #ifdef NoTexture a = int(polyalpha); #endif r = vr; g = vg; b = vb; #ifdef UseTexture vec2 uvf = vec2(ivec2(u, v)) * vec2(1.0 / 16.0) * InvTextureSize; uvec4 texcolor = texture(CurrentTexture, vec3(uvf, polygon.TextureLayer)); #ifdef Decal 
if (texcolor.a == 31) { r = int(texcolor.r); g = int(texcolor.g); b = int(texcolor.b); } else if (texcolor.a > 0) { r = int((texcolor.r * texcolor.a) + (vr * (31-texcolor.a))) >> 5; g = int((texcolor.g * texcolor.a) + (vg * (31-texcolor.a))) >> 5; b = int((texcolor.b * texcolor.a) + (vb * (31-texcolor.a))) >> 5; } a = int(polyalpha); #endif #if defined(Modulate) || defined(Toon) || defined(Highlight) r = int((texcolor.r+1) * (vr+1) - 1) >> 6; g = int((texcolor.g+1) * (vg+1) - 1) >> 6; b = int((texcolor.b+1) * (vb+1) - 1) >> 6; a = int((texcolor.a+1) * (polyalpha+1) - 1) >> 5; #endif #endif #ifdef Highlight uint tooncolor = ToonTable[vr >> 1].r; r = min(r + int(bitfieldExtract(tooncolor, 0, 8)), 63); g = min(g + int(bitfieldExtract(tooncolor, 8, 8)), 63); b = min(b + int(bitfieldExtract(tooncolor, 16, 8)), 63); #endif if (polyalpha == 0) a = 31; if (a > AlphaRef) { color = r | (g << 8) | (b << 16) | (a << 24); DepthTiles[tileOffset] = z; AttrTiles[tileOffset] = attr; } #else color = 0xFFFFFFFF; // doesn't really matter as long as it's not 0 DepthTiles[tileOffset] = z; #endif } } ColorTiles[tileOffset] = color; } )";
/*
    DepthBlend: GLSL compute shader source that merges the per-polygon tile
    results (ColorTiles / DepthTiles / AttrTiles) into two result layers per pixel.

    The final source is the concatenation of PolygonBuffer, Tilebuffers,
    ResultBuffer and BinningBuffer (all defined outside this excerpt) with the
    GLSL text below. The work group size is TileSize x TileSize.

    main()
      - Derives linearTile from gl_WorkGroupID and TilesPerLine and reads two
        coarse mask words for that tile from BinningMaskAndOffset.
      - Keeps two layers in the .x / .y components of color, depth and attr.
        .x starts from ClearColor / ClearDepth / ClearAttr, .y starts at 0.
      - Calls ProcessCoarseMask once per coarse mask word (offset 0, then
        BinStride/2) and finally stores both layers into ResultValue; the .y
        layer is written FramebufferStride entries after the .x layer.

    ProcessCoarseMask()
      - Walks the set bits of coarseMask lowest bit first (findLSB); for each
        one it walks the set bits of the matching fine mask. workIdx is
        incremented once per fine bit and selects the TileSize*TileSize slab
        that is read from ColorTiles / DepthTiles / AttrTiles.
      - Entries whose ColorTiles value is 0 are ignored.
      - Polygons with (Attr & 0x3F000030) == 0x00000030 take the shadow mask
        path: they never write color, they only update stencil. Bit 0 is set
        when the depth test against depth.x fails, bit 1 when the test against
        depth.y fails (the latter only if (attr.x & 3) != 0). stencil is reset
        to 0 first when the previously processed entry was not a shadow mask.
      - All other polygons are depth tested against the first layer and, if
        that fails and (dstattr & 3) != 0, against the second layer. With Attr
        bit 14 set the test is a tolerance compare (0xFF / 0x1FE under WBuffer,
        0x200 / 0x400 under ZBuffer), otherwise it is tileDepth < dstDepth.
      - Polygons with (Attr & 0x30) == 0x30 that are not shadow masks are
        skipped while stencil == 0; stencil bit 0 clear redirects them to the
        second layer, stencil bit 1 clear masks the low two bits of dstattr.
      - A source alpha field equal to 0x1F000000 overwrites the chosen layer
        (pushing layer x down to layer y first when (srcAttr & 3) != 0); any
        other alpha goes through PlotTranslucent, with Attr bit 11 deciding
        whether depth is written.
      NOTE(review): the hardware meaning of the individual attribute bits is
      not visible in this excerpt; confirm against the polygon attribute layout.

    PlotTranslucent()
      - Builds blendAttr from srcAttr and attr and only blends when the
        compared ID fields differ: bits 16..22 of attr vs. blendAttr, or, for a
        shadow polygon drawn onto a pixel whose attr bit 22 is clear,
        bits 24..29 of attr vs. srcAttr.
      - Blend weight is (srcA >> 24) + 1 out of 32. The color channels are only
        mixed when the destination alpha field is non-zero; the resulting alpha
        field is max(dstA, srcA).
      - blendAttr bit 15 is cleared when the destination attr had it clear.
*/
const std::string DepthBlend = PolygonBuffer + Tilebuffers + ResultBuffer + BinningBuffer + R"( layout (local_size_x = TileSize, local_size_y = TileSize) in; void PlotTranslucent(inout uint color, inout uint depth, inout uint attr, bool isShadow, uint tileColor, uint srcA, uint tileDepth, uint srcAttr, bool writeDepth) { uint blendAttr = (srcAttr & 0xE0F0U) | ((srcAttr >> 8) & 0xFF0000U) | (1U<<22) | (attr & 0xFF001F0FU); if ((!isShadow || (attr & (1U<<22)) != 0U) ? 
(attr & 0x007F0000U) != (blendAttr & 0x007F0000U) : (attr & 0x3F000000U) != (srcAttr & 0x3F000000U)) { // le blend if (writeDepth) depth = tileDepth; if ((attr & (1U<<15)) == 0) blendAttr &= ~(1U<<15); attr = blendAttr; uint srcRB = tileColor & 0x3F003FU; uint srcG = tileColor & 0x003F00U; uint dstRB = color & 0x3F003FU; uint dstG = color & 0x003F00U; uint dstA = color & 0x1F000000U; uint alpha = (srcA >> 24) + 1; if (dstA != 0) { srcRB = ((srcRB * alpha) + (dstRB * (32-alpha))) >> 5; srcG = ((srcG * alpha) + (dstG * (32-alpha))) >> 5; } color = (srcRB & 0x3F003FU) | (srcG & 0x003F00U) | max(dstA, srcA); } } void ProcessCoarseMask(int linearTile, uint coarseMask, uint coarseOffset, inout uvec2 color, inout uvec2 depth, inout uvec2 attr, inout uint stencil, inout bool prevIsShadowMask) { int tileInnerOffset = int(gl_LocalInvocationID.x) + int(gl_LocalInvocationID.y) * TileSize; while (coarseMask != 0U) { uint coarseBit = findLSB(coarseMask); coarseMask &= ~(1U << coarseBit); uint tileOffset = linearTile * BinStride + coarseBit + coarseOffset; uint fineMask = BinningMaskAndOffset[BinningMaskStart + tileOffset]; uint workIdx = BinningMaskAndOffset[BinningWorkOffsetsStart + tileOffset]; while (fineMask != 0U) { uint fineIdx = findLSB(fineMask); fineMask &= ~(1U << fineIdx); uint pixelindex = tileInnerOffset + workIdx * TileSize * TileSize; uint tileColor = ColorTiles[pixelindex]; workIdx++; uint polygonIdx = fineIdx + (coarseBit + coarseOffset) * 32; if (tileColor != 0U) { uint polygonAttr = Polygons[polygonIdx].Attr; bool isShadowMask = ((polygonAttr & 0x3F000030U) == 0x00000030U); bool prevIsShadowMaskOld = prevIsShadowMask; prevIsShadowMask = isShadowMask; bool equalDepthTest = (polygonAttr & (1U << 14)) != 0U; uint tileDepth = DepthTiles[pixelindex]; uint tileAttr = AttrTiles[pixelindex]; uint dstattr = attr.x; if (!isShadowMask) { bool isShadow = (polygonAttr & 0x30U) == 0x30U; bool writeSecondLayer = false; if (isShadow) { if (stencil == 0U) continue; if 
((stencil & 1U) == 0U) writeSecondLayer = true; if ((stencil & 2U) == 0U) dstattr &= ~0x3U; } uint dstDepth = writeSecondLayer ? depth.y : depth.x; if (!(equalDepthTest #ifdef WBuffer ? dstDepth - tileDepth + 0xFFU <= 0x1FE #endif #ifdef ZBuffer ? dstDepth - tileDepth + 0x200 <= 0x400 #endif : tileDepth < dstDepth)) { if ((dstattr & 0x3U) == 0U || writeSecondLayer) continue; writeSecondLayer = true; dstattr = attr.y; if (!(equalDepthTest #ifdef WBuffer ? depth.y - tileDepth + 0xFFU <= 0x1FE #endif #ifdef ZBuffer ? depth.y - tileDepth + 0x200 <= 0x400 #endif : tileDepth < depth.y)) continue; } uint srcAttr = (polygonAttr & 0x3F008000U); uint srcA = tileColor & 0x1F000000U; if (srcA == 0x1F000000U) { srcAttr |= tileAttr; if (!writeSecondLayer) { if ((srcAttr & 0x3U) != 0U) { color.y = color.x; depth.y = depth.x; attr.y = attr.x; } color.x = tileColor; depth.x = tileDepth; attr.x = srcAttr; } else { color.y = tileColor; depth.y = tileDepth; attr.y = srcAttr; } } else { bool writeDepth = (polygonAttr & (1U<<11)) != 0; if (!writeSecondLayer) { // blend into both layers PlotTranslucent(color.x, depth.x, attr.x, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth); } if (writeSecondLayer || (dstattr & 0x3U) != 0U) { PlotTranslucent(color.y, depth.y, attr.y, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth); } } } else { if (!prevIsShadowMaskOld) stencil = 0; if (!(equalDepthTest #ifdef WBuffer ? depth.x - tileDepth + 0xFFU <= 0x1FE #endif #ifdef ZBuffer ? depth.x - tileDepth + 0x200 <= 0x400 #endif : tileDepth < depth.x)) stencil = 0x1U; if ((dstattr & 0x3U) != 0U) { if (!(equalDepthTest #ifdef WBuffer ? depth.y - tileDepth + 0xFFU <= 0x1FE #endif #ifdef ZBuffer ? 
depth.y - tileDepth + 0x200 <= 0x400 #endif : tileDepth < depth.y)) stencil |= 0x2U; } } } } } } void main() { int linearTile = int(gl_WorkGroupID.x + (gl_WorkGroupID.y * TilesPerLine)); uint coarseMaskLo = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 0]; uint coarseMaskHi = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 1]; uvec2 color = uvec2(ClearColor, 0U); uvec2 depth = uvec2(ClearDepth, 0U); uvec2 attr = uvec2(ClearAttr, 0U); uint stencil = 0U; bool prevIsShadowMask = false; ProcessCoarseMask(linearTile, coarseMaskLo, 0, color, depth, attr, stencil, prevIsShadowMask); ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask); int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * ScreenWidth; ResultValue[ResultColorStart+resultOffset] = color.x; ResultValue[ResultColorStart+resultOffset+FramebufferStride] = color.y; ResultValue[ResultDepthStart+resultOffset] = depth.x; ResultValue[ResultDepthStart+resultOffset+FramebufferStride] = depth.y; ResultValue[ResultAttrStart+resultOffset] = attr.x; ResultValue[ResultAttrStart+resultOffset+FramebufferStride] = attr.y; } )";
/*
    FinalPass: GLSL compute shader source that post-processes the two result
    layers written by the depth/blend stage and stores the final image.

    The final source is ResultBuffer (defined outside this excerpt) followed
    by the GLSL text below. The work group is 32 invocations wide. Two
    write-only images are bound: FinalFB (binding 0, rgba8) and LowResFB
    (binding 1, rgba8ui).

    BlendFog(color, depth)
      - For depth >= FogOffset: depth = ((depth - FogOffset) >> 2) << FogShift;
        the density table index is depth >> 17 (clamped to 32 with a zero
        fraction) and the fraction is the low 17 bits. Otherwise index and
        fraction are both 0.
      - density is the linear interpolation between the .g components of
        ToonTable[index] and ToonTable[index + 1], clamped to 128.
      - The pixel is mixed with FogColor using weight density / 128. When
        DispCnt bit 6 is set only bits 24..31 of the input color are replaced
        with the fogged alpha; otherwise the full fogged color is returned.

    main()
      - Loads color / depth / attr for both layers from ResultValue.
      - EdgeMarking: for pixels with (attr.x & 0xF) != 0 the attr and depth of
        the left / right / upper / lower neighbours are fetched (ClearAttr /
        ClearDepth are used past the screen border). If any neighbour has a
        different 6-bit ID in attr bits 24..29 and a larger depth than this
        pixel, the low 24 bits of color.x are replaced by
        ToonTable[polyId >> 3].b and attr bits 8..12 are set to 0x10.
      - Fog: BlendFog is applied to layer x when attr.x bit 15 is set, and to
        layer y when (attr.x & 0xF) != 0 and attr.y bit 15 is set.
      - AntiAliasing: when (attr.x & 3) != 0 the coverage in attr.x bits 8..12
        is read. Non-zero coverage blends layer x over layer y with weight
        (coverage + 1) / 32 (color channels only if the bottom alpha is > 0,
        alpha always); zero coverage replaces color.x with color.y.
      - Output: FinalFB receives color.x as normalised floats in the component
        order (bits 16..23, bits 8..15, bits 0..5, bits 24..31), scaled by
        1/63 for the first three and 1/31 for the last. LowResFB receives the
        four raw bytes of color.x (lowest byte first), but only for pixels
        whose x and y are both multiples of ScreenWidth / 256.
*/
const std::string FinalPass = ResultBuffer + R"( layout (local_size_x = 32) in; layout (binding = 0, rgba8) writeonly uniform image2D FinalFB; layout (binding = 1, rgba8ui) writeonly uniform uimage2D LowResFB; uint BlendFog(uint color, uint depth) { uint densityid = 0, densityfrac = 0; if (depth >= FogOffset) { depth -= FogOffset; depth = (depth >> 2) << FogShift; densityid = depth >> 17; if (densityid >= 32) { densityid = 32; densityfrac = 0; } else { densityfrac = depth & 0x1FFFFU; } } uint density = ((ToonTable[densityid].g * (0x20000U-densityfrac)) + (ToonTable[densityid+1].g * densityfrac)) >> 17; density = min(density, 128U); uint colorRB = color & 0x3F003FU; uint colorGA = (color >> 8) & 0x3F003FU; uint fogRB = FogColor & 0x3F003FU; uint fogGA = (FogColor >> 8) & 
0x1F003FU; uint finalColorRB = ((fogRB * density) + (colorRB * (128-density))) >> 7; uint finalColorGA = ((fogGA * density) + (colorGA * (128-density))) >> 7; finalColorRB &= 0x3F003FU; finalColorGA &= 0x1F003FU; return (DispCnt & (1U<<6)) != 0 ? (bitfieldInsert(color, finalColorGA >> 16, 24, 8)) : (finalColorRB | (finalColorGA << 8)); } void main() { int srcX = int(gl_GlobalInvocationID.x); int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * ScreenWidth; uvec2 color = uvec2(ResultValue[resultOffset+ResultColorStart], ResultValue[resultOffset+FramebufferStride+ResultColorStart]); uvec2 depth = uvec2(ResultValue[resultOffset+ResultDepthStart], ResultValue[resultOffset+FramebufferStride+ResultDepthStart]); uvec2 attr = uvec2(ResultValue[resultOffset+ResultAttrStart], ResultValue[resultOffset+FramebufferStride+ResultAttrStart]); #ifdef EdgeMarking if ((attr.x & 0xFU) != 0U) { uvec4 otherAttr = uvec4(ClearAttr); uvec4 otherDepth = uvec4(ClearDepth); if (srcX > 0U) { otherAttr.x = ResultValue[resultOffset-1+ResultAttrStart]; otherDepth.x = ResultValue[resultOffset-1+ResultDepthStart]; } if (srcX < ScreenWidth-1) { otherAttr.y = ResultValue[resultOffset+1+ResultAttrStart]; otherDepth.y = ResultValue[resultOffset+1+ResultDepthStart]; } if (gl_GlobalInvocationID.y > 0U) { otherAttr.z = ResultValue[resultOffset-ScreenWidth+ResultAttrStart]; otherDepth.z = ResultValue[resultOffset-ScreenWidth+ResultDepthStart]; } if (gl_GlobalInvocationID.y < ScreenHeight-1) { otherAttr.w = ResultValue[resultOffset+ScreenWidth+ResultAttrStart]; otherDepth.w = ResultValue[resultOffset+ScreenWidth+ResultDepthStart]; } uint polyId = bitfieldExtract(attr.x, 24, 6); uvec4 otherPolyId = bitfieldExtract(otherAttr, 24, 6); bvec4 polyIdMismatch = notEqual(uvec4(polyId), otherPolyId); bvec4 nearer = lessThan(uvec4(depth.x), otherDepth); if ((polyIdMismatch.x && nearer.x) || (polyIdMismatch.y && nearer.y) || (polyIdMismatch.z && nearer.z) || (polyIdMismatch.w && nearer.w)) { color.x = 
ToonTable[polyId >> 3].b | (color.x & 0xFF000000U); attr.x = (attr.x & 0xFFFFE0FFU) | 0x00001000U; } } #endif #ifdef Fog if ((attr.x & (1U<<15)) != 0U) { color.x = BlendFog(color.x, depth.x); } if ((attr.x & 0xFU) != 0 && (attr.y & (1U<<15)) != 0U) { color.y = BlendFog(color.y, depth.y); } #endif #ifdef AntiAliasing // resolve anti-aliasing if ((attr.x & 0x3U) != 0) { uint coverage = (attr.x >> 8) & 0x1FU; if (coverage != 0) { uint topRB = color.x & 0x3F003FU; uint topG = color.x & 0x003F00U; uint topA = bitfieldExtract(color.x, 24, 5); uint botRB = color.y & 0x3F003FU; uint botG = color.y & 0x003F00U; uint botA = bitfieldExtract(color.y, 24, 5); coverage++; if (botA > 0) { topRB = ((topRB * coverage) + (botRB * (32-coverage))) >> 5; topG = ((topG * coverage) + (botG * (32-coverage))) >> 5; topRB &= 0x3F003FU; topG &= 0x003F00U; } topA = ((topA * coverage) + (botA * (32-coverage))) >> 5; color.x = topRB | topG | (topA << 24); } else { color.x = color.y; } } #endif // if (bitfieldExtract(color.x, 24, 8) != 0U) // color.x |= 0x40000000U; // else // color.x = 0U; //if ((gl_GlobalInvocationID.y % 8) == 7 || (gl_GlobalInvocationID.y % 8) == 7) // color.x = 0x1F00001FU | 0x40000000U; vec4 result = vec4(bitfieldExtract(color.x, 16, 8), bitfieldExtract(color.x, 8, 8), color.x & 0x3FU, bitfieldExtract(color.x, 24, 8)); result /= vec4(63.0, 63.0, 63.0, 31.0); imageStore(FinalFB, ivec2(gl_GlobalInvocationID.xy), result); // It's a division by constant, so using the builtin division is fine const int scale = ScreenWidth/256; ivec2 lowresCoordinate = ivec2(gl_GlobalInvocationID.xy) / scale; ivec2 lowresCoordinateRest = ivec2(gl_GlobalInvocationID.xy) % scale; if (lowresCoordinateRest == ivec2(0, 0)) { uvec4 color8; color8.x = bitfieldExtract(color.x, 0, 8); color8.y = bitfieldExtract(color.x, 8, 8); color8.z = bitfieldExtract(color.x, 16, 8); color8.w = bitfieldExtract(color.x, 24, 8); imageStore(LowResFB, lowresCoordinate, color8); } } )"; } } #endif