1665 lines
49 KiB
C++
1665 lines
49 KiB
C++
/*
|
|
Copyright 2016-2024 melonDS team
|
|
|
|
This file is part of melonDS.
|
|
|
|
melonDS is free software: you can redistribute it and/or modify it under
|
|
the terms of the GNU General Public License as published by the Free
|
|
Software Foundation, either version 3 of the License, or (at your option)
|
|
any later version.
|
|
|
|
melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along
|
|
with melonDS. If not, see http://www.gnu.org/licenses/.
|
|
*/
|
|
|
|
#ifndef GPU3D_COMPUTE_SHADERS
|
|
#define GPU3D_COMPUTE_SHADERS
|
|
|
|
#include <string>
|
|
|
|
namespace melonDS
|
|
{
|
|
|
|
namespace ComputeRendererShaders
|
|
{
|
|
|
|
// defines:
|
|
// InterpSpans
|
|
// BinCombined
|
|
// Rasterise
|
|
// DepthBlend
|
|
// ClearCoarseBinMask
|
|
// ClearIndirectWorkCount
|
|
// CalculateWorkOffsets
|
|
// SortWork
|
|
// FinalPass
|
|
|
|
// AntiAliasing
|
|
// EdgeMarking
|
|
// Fog
|
|
|
|
// ZBuffer
|
|
// WBuffer
|
|
|
|
// for Rasterise
|
|
// NoTexture
|
|
// UseTexture
|
|
// Decal
|
|
// Modulate
|
|
// Toon
|
|
// Highlight
|
|
// ShadowMask
|
|
|
|
|
|
/*
|
|
Some notes on signed division:
|
|
|
|
we want to avoid it, so we can avoid higher precision numbers
|
|
in a few places.
|
|
|
|
Fortunately all divisions *should* assuming I'm not mistaken
|
|
have the same sign on the divisor and the dividend.
|
|
|
|
Thus we apply:
|
|
|
|
assuming n < 0 <=> d < 0
|
|
n/d = abs(n)/abs(d)
|
|
|
|
*/
|
|
|
|
const std::string XSpanSetupBuffer{R"(
|
|
|
|
const uint XSpanSetup_Linear = 1U << 0;
|
|
const uint XSpanSetup_FillInside = 1U << 1;
|
|
const uint XSpanSetup_FillLeft = 1U << 2;
|
|
const uint XSpanSetup_FillRight = 1U << 3;
|
|
|
|
struct XSpanSetup
|
|
{
|
|
int X0, X1;
|
|
|
|
int InsideStart, InsideEnd, EdgeCovL, EdgeCovR;
|
|
|
|
int XRecip;
|
|
|
|
uint Flags;
|
|
|
|
int Z0, Z1, W0, W1;
|
|
int ColorR0, ColorG0, ColorB0;
|
|
int ColorR1, ColorG1, ColorB1;
|
|
int TexcoordU0, TexcoordV0;
|
|
int TexcoordU1, TexcoordV1;
|
|
|
|
int CovLInitial, CovRInitial;
|
|
};
|
|
|
|
#if defined(Rasterise)
|
|
int CalcYFactorX(XSpanSetup span, int x)
|
|
{
|
|
x -= span.X0;
|
|
|
|
if (span.X0 != span.X1)
|
|
{
|
|
uint numLo = uint(x) * uint(span.W0);
|
|
uint numHi = 0U;
|
|
numHi |= numLo >> (32U-YFactorShift);
|
|
numLo <<= YFactorShift;
|
|
|
|
uint den = uint(x) * uint(span.W0) + uint(span.X1 - span.X0 - x) * uint(span.W1);
|
|
|
|
if (den == 0)
|
|
return 0;
|
|
else
|
|
return int(Div64_32_32(numHi, numLo, den));
|
|
}
|
|
else
|
|
{
|
|
return 0;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
layout (std430, binding = 1) buffer XSpanSetupsBuffer
|
|
{
|
|
XSpanSetup XSpanSetups[];
|
|
};
|
|
|
|
)"};
|
|
|
|
const std::string YSpanSetupBuffer{R"(
|
|
|
|
struct YSpanSetup
|
|
{
|
|
// Attributes
|
|
int Z0, Z1, W0, W1;
|
|
int ColorR0, ColorG0, ColorB0;
|
|
int ColorR1, ColorG1, ColorB1;
|
|
int TexcoordU0, TexcoordV0;
|
|
int TexcoordU1, TexcoordV1;
|
|
|
|
// Interpolator
|
|
int I0, I1;
|
|
bool Linear;
|
|
int IRecip;
|
|
int W0n, W0d, W1d;
|
|
|
|
// Slope
|
|
int Increment;
|
|
|
|
int X0, X1, Y0, Y1;
|
|
int XMin, XMax;
|
|
int DxInitial;
|
|
|
|
int XCovIncr;
|
|
|
|
bool IsDummy;
|
|
};
|
|
|
|
#if defined(InterpSpans)
|
|
int CalcYFactorY(YSpanSetup span, int i)
|
|
{
|
|
/*
|
|
maybe it would be better to do use a 32x32=64 multiplication?
|
|
*/
|
|
uint numLo = uint(abs(i)) * uint(span.W0n);
|
|
uint numHi = 0U;
|
|
numHi |= numLo >> (32U-YFactorShift);
|
|
numLo <<= YFactorShift;
|
|
|
|
uint den = uint(abs(i)) * uint(span.W0d) + uint(abs(span.I1 - span.I0 - i)) * span.W1d;
|
|
|
|
if (den == 0)
|
|
{
|
|
return 0;
|
|
}
|
|
else
|
|
{
|
|
return int(Div64_32_32(numHi, numLo, den));
|
|
}
|
|
}
|
|
|
|
int CalculateDx(int y, YSpanSetup span)
|
|
{
|
|
return span.DxInitial + (y - span.Y0) * span.Increment;
|
|
}
|
|
|
|
int CalculateX(int dx, YSpanSetup span)
|
|
{
|
|
int x = span.X0;
|
|
if (span.X1 < span.X0)
|
|
x -= dx >> 18;
|
|
else
|
|
x += dx >> 18;
|
|
return clamp(x, span.XMin, span.XMax);
|
|
}
|
|
|
|
void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
|
|
{
|
|
bool negative = span.X1 < span.X0;
|
|
int len;
|
|
if (side != negative)
|
|
len = (dx >> 18) - ((dx-span.Increment) >> 18);
|
|
else
|
|
len = ((dx+span.Increment) >> 18) - (dx >> 18);
|
|
edgelen = len;
|
|
|
|
int xlen = span.XMax + 1 - span.XMin;
|
|
int startx = dx >> 18;
|
|
if (negative) startx = xlen - startx;
|
|
if (side) startx = startx - len + 1;
|
|
|
|
uint r;
|
|
int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r));
|
|
edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF);
|
|
}
|
|
|
|
void EdgeParams_YMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
|
|
{
|
|
bool negative = span.X1 < span.X0;
|
|
edgelen = 1;
|
|
|
|
if (span.Increment == 0)
|
|
{
|
|
edgecov = 31;
|
|
}
|
|
else
|
|
{
|
|
int cov = ((dx >> 9) + (span.Increment >> 10)) >> 4;
|
|
if ((cov >> 5) != (dx >> 18)) cov = 31;
|
|
cov &= 0x1F;
|
|
if (side == negative) cov = 0x1F - cov;
|
|
|
|
edgecov = cov;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
layout (std430, binding = 2) buffer YSpanSetupsBuffer
|
|
{
|
|
YSpanSetup YSpanSetups[];
|
|
};
|
|
|
|
)"};
|
|
|
|
const std::string PolygonBuffer{R"(
|
|
struct Polygon
|
|
{
|
|
int FirstXSpan;
|
|
int YTop, YBot;
|
|
|
|
int XMin, XMax;
|
|
int XMinY, XMaxY;
|
|
|
|
int Variant;
|
|
|
|
uint Attr;
|
|
|
|
float TextureLayer;
|
|
};
|
|
|
|
layout (std430, binding = 0) readonly buffer PolygonBuffer
|
|
{
|
|
Polygon Polygons[];
|
|
};
|
|
)"};
|
|
|
|
const std::string BinningBuffer{R"(
|
|
|
|
layout (std430, binding = 6) buffer BinResultBuffer
|
|
{
|
|
uvec4 VariantWorkCount[MaxVariants];
|
|
uint SortedWorkOffset[MaxVariants];
|
|
|
|
uvec4 SortWorkWorkCount;
|
|
|
|
uint BinningMaskAndOffset[];
|
|
//uint BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride];
|
|
//uint BinnedMask[TilesPerLine*TileLines*BinStride];
|
|
//uint WorkOffsets[TilesPerLine*TileLines*BinStride];
|
|
};
|
|
|
|
const int BinningCoarseMaskStart = 0;
|
|
const int BinningMaskStart = BinningCoarseMaskStart+TilesPerLine*TileLines*CoarseBinStride;
|
|
const int BinningWorkOffsetsStart = BinningMaskStart+TilesPerLine*TileLines*BinStride;
|
|
|
|
)"};
|
|
|
|
/*
|
|
structure of each WorkDesc item:
|
|
x:
|
|
bits 0-10: polygon idx
|
|
bits 11-31: tile idx (before sorting within variant after sorting within all tiles)
|
|
y:
|
|
bits 0-15: X position on screen
|
|
bits 15-31: Y position on screen
|
|
*/
|
|
const std::string WorkDescBuffer{R"(
|
|
layout (std430, binding = 7) buffer WorkDescBuffer
|
|
{
|
|
//uvec2 UnsortedWorkDescs[MaxWorkTiles];
|
|
//uvec2 SortedWorkDescs[MaxWorkTiles];
|
|
uvec2 WorkDescs[];
|
|
};
|
|
|
|
const uint WorkDescsUnsortedStart = 0;
|
|
const uint WorkDescsSortedStart = WorkDescsUnsortedStart+MaxWorkTiles;
|
|
|
|
)"};
|
|
|
|
const std::string Tilebuffers{R"(
|
|
layout (std430, binding = 2) buffer ColorTileBuffer
|
|
{
|
|
uint ColorTiles[];
|
|
};
|
|
layout (std430, binding = 3) buffer DepthTileBuffer
|
|
{
|
|
uint DepthTiles[];
|
|
};
|
|
layout (std430, binding = 4) buffer AttrTileBuffer
|
|
{
|
|
uint AttrTiles[];
|
|
};
|
|
|
|
)"};
|
|
|
|
const std::string ResultBuffer{R"(
|
|
layout (std430, binding = 5) buffer ResultBuffer
|
|
{
|
|
uint ResultValue[];
|
|
};
|
|
|
|
const uint ResultColorStart = 0;
|
|
const uint ResultDepthStart = ResultColorStart+ScreenWidth*ScreenHeight*2;
|
|
const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2;
|
|
)"};
|
|
|
|
const char* Common = R"(
|
|
|
|
#define TileSize 8
|
|
const int CoarseTileCountX = 8;
|
|
const int CoarseTileCountY = 4;
|
|
const int CoarseTileW = (CoarseTileCountX * TileSize);
|
|
const int CoarseTileH = (CoarseTileCountY * TileSize);
|
|
|
|
const int FramebufferStride = ScreenWidth*ScreenHeight;
|
|
const int TilesPerLine = ScreenWidth/TileSize;
|
|
const int TileLines = ScreenHeight/TileSize;
|
|
|
|
const int BinStride = 2048/32;
|
|
const int CoarseBinStride = BinStride/32;
|
|
|
|
const int MaxVariants = 256;
|
|
|
|
layout (std140, binding = 0) uniform MetaUniform
|
|
{
|
|
uint NumPolygons;
|
|
uint NumVariants;
|
|
|
|
int AlphaRef;
|
|
|
|
uint DispCnt;
|
|
|
|
// r = Toon
|
|
// g = Fog Density
|
|
// b = Edge Color
|
|
uvec4 ToonTable[34];
|
|
|
|
uint ClearColor, ClearDepth, ClearAttr;
|
|
|
|
uint FogOffset, FogShift, FogColor;
|
|
};
|
|
|
|
#ifdef InterpSpans
|
|
const int YFactorShift = 9;
|
|
#else
|
|
const int YFactorShift = 8;
|
|
#endif
|
|
|
|
#if defined(InterpSpans) || defined(Rasterise)
|
|
uint Umulh(uint a, uint b)
|
|
{
|
|
uint lo, hi;
|
|
umulExtended(a, b, hi, lo);
|
|
return hi;
|
|
}
|
|
|
|
const uint startTable[256] = uint[256](
|
|
254, 252, 250, 248, 246, 244, 242, 240, 238, 236, 234, 233, 231, 229, 227, 225, 224, 222, 220, 218, 217, 215, 213, 212, 210, 208, 207, 205, 203, 202, 200, 199, 197, 195, 194, 192, 191, 189, 188, 186, 185, 183, 182, 180, 179, 178, 176, 175, 173, 172, 170, 169, 168, 166, 165, 164, 162, 161, 160, 158,
|
|
157, 156, 154, 153, 152, 151, 149, 148, 147, 146, 144, 143, 142, 141, 139, 138, 137, 136, 135, 134, 132, 131, 130, 129, 128, 127, 126, 125, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 88, 87, 86, 85, 84, 83, 82, 81, 80, 80, 79, 78, 77, 76, 75, 74, 74, 73, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 57, 56, 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0
|
|
);
|
|
|
|
uint Div(uint x, uint y, out uint r)
|
|
{
|
|
// https://www.microsoft.com/en-us/research/publication/software-integer-division/
|
|
uint k = 31 - findMSB(y);
|
|
uint ty = (y << k) >> (32 - 9);
|
|
uint t = startTable[ty - 256] + 256;
|
|
uint z = (t << (32 - 9)) >> (32 - k - 1);
|
|
uint my = 0 - y;
|
|
|
|
z += Umulh(z, my * z);
|
|
z += Umulh(z, my * z);
|
|
|
|
uint q = Umulh(x, z);
|
|
r = x - y * q;
|
|
if(r >= y)
|
|
{
|
|
r = r - y;
|
|
q = q + 1;
|
|
if(r >= y)
|
|
{
|
|
r = r - y;
|
|
q = q + 1;
|
|
}
|
|
}
|
|
|
|
return q;
|
|
}
|
|
|
|
uint Div64_32_32(uint numHi, uint numLo, uint den)
|
|
{
|
|
// based on https://github.com/ridiculousfish/libdivide/blob/3bd34388573681ce563348cdf04fe15d24770d04/libdivide.h#L469
|
|
// modified to work with half the size 64/32=32 instead of 128/64=64
|
|
// for further details see https://ridiculousfish.com/blog/posts/labor-of-division-episode-iv.html
|
|
|
|
// We work in base 2**16.
|
|
// A uint32 holds a single digit (in the lower 16 bit). A uint32 holds two digits.
|
|
// Our numerator is conceptually [num3, num2, num1, num0].
|
|
// Our denominator is [den1, den0].
|
|
const uint b = (1U << 16);
|
|
|
|
// Determine the normalization factor. We multiply den by this, so that its leading digit is at
|
|
// least half b. In binary this means just shifting left by the number of leading zeros, so that
|
|
// there's a 1 in the MSB.
|
|
// We also shift numer by the same amount. This cannot overflow because numHi < den.
|
|
// The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
|
|
// by 64. (it's also UB in GLSL!!!!)
|
|
uint shift = 31 - findMSB(den);
|
|
den <<= shift;
|
|
numHi <<= shift;
|
|
numHi |= (numLo >> (-shift & 31U)) & uint(-int(shift) >> 31);
|
|
numLo <<= shift;
|
|
|
|
// Extract the low digits of the numerator and both digits of the denominator.
|
|
uint num1 = (numLo >> 16);
|
|
uint num0 = (numLo & 0xFFFFU);
|
|
uint den1 = (den >> 16);
|
|
uint den0 = (den & 0xFFFFU);
|
|
|
|
// We wish to compute q1 = [n3 n2 n1] / [d1 d0].
|
|
// Estimate q1 as [n3 n2] / [d1], and then correct it.
|
|
// Note while qhat may be 2 digits, q1 is always 1 digit.
|
|
|
|
uint rhat;
|
|
uint qhat = Div(numHi, den1, rhat);
|
|
uint c1 = qhat * den0;
|
|
uint c2 = rhat * b + num1;
|
|
if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
|
|
uint q1 = qhat & 0xFFFFU;
|
|
|
|
// Compute the true (partial) remainder.
|
|
uint rem = numHi * b + num1 - q1 * den;
|
|
|
|
// We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
|
|
// Estimate q0 as [rem1 rem0] / [d1] and correct it.
|
|
qhat = Div(rem, den1, rhat);
|
|
c1 = qhat * den0;
|
|
c2 = rhat * b + num0;
|
|
if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
|
|
|
|
return bitfieldInsert(qhat, q1, 16, 16);
|
|
}
|
|
|
|
int InterpolateAttrPersp(int y0, int y1, int ifactor)
|
|
{
|
|
if (y0 == y1)
|
|
return y0;
|
|
|
|
if (y0 < y1)
|
|
return y0 + (((y1-y0) * ifactor) >> YFactorShift);
|
|
else
|
|
return y1 + (((y0-y1) * ((1<<YFactorShift)-ifactor)) >> YFactorShift);
|
|
}
|
|
|
|
int InterpolateAttrLinear(int y0, int y1, int i, int irecip, int idiff)
|
|
{
|
|
if (y0 == y1)
|
|
return y0;
|
|
|
|
#ifndef Rasterise
|
|
irecip = abs(irecip);
|
|
#endif
|
|
|
|
uint mulLo, mulHi, carry;
|
|
if (y0 < y1)
|
|
{
|
|
#ifndef Rasterise
|
|
uint offset = uint(abs(i));
|
|
#else
|
|
uint offset = uint(i);
|
|
#endif
|
|
umulExtended(uint(y1-y0)*offset, uint(irecip), mulHi, mulLo);
|
|
mulLo = uaddCarry(mulLo, 3U<<24, carry);
|
|
mulHi += carry;
|
|
return y0 + int((mulLo >> 30) | (mulHi << (32 - 30)));
|
|
//return y0 + int(((int64_t(y1-y0) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30);
|
|
}
|
|
else
|
|
{
|
|
#ifndef Rasterise
|
|
uint offset = uint(abs(idiff-i));
|
|
#else
|
|
uint offset = uint(idiff-i);
|
|
#endif
|
|
umulExtended(uint(y0-y1)*offset, uint(irecip), mulHi, mulLo);
|
|
mulLo = uaddCarry(mulLo, 3<<24, carry);
|
|
mulHi += carry;
|
|
return y1 + int((mulLo >> 30) | (mulHi << (32 - 30)));
|
|
//return y1 + int(((int64_t(y0-y1) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30);
|
|
}
|
|
}
|
|
|
|
uint InterpolateZZBuffer(int z0, int z1, int i, int irecip, int idiff)
|
|
{
|
|
if (z0 == z1)
|
|
return z0;
|
|
|
|
uint base, disp, factor;
|
|
if (z0 < z1)
|
|
{
|
|
base = uint(z0);
|
|
disp = uint(z1 - z0);
|
|
factor = uint(abs(i));
|
|
}
|
|
else
|
|
{
|
|
base = uint(z1);
|
|
disp = uint(z0 - z1),
|
|
factor = uint(abs(idiff - i));
|
|
}
|
|
|
|
#ifdef InterpSpans
|
|
int shiftl = 0;
|
|
const int shiftr = 22;
|
|
if (disp > 0x3FF)
|
|
{
|
|
shiftl = findMSB(disp) - 9;
|
|
disp >>= shiftl;
|
|
}
|
|
#else
|
|
disp >>= 9;
|
|
const int shiftl = 0;
|
|
const int shiftr = 13;
|
|
#endif
|
|
uint mulLo, mulHi;
|
|
|
|
umulExtended(disp * factor, abs(irecip) >> 8, mulHi, mulLo);
|
|
|
|
return base + (((mulLo >> shiftr) | (mulHi << (32 - shiftr))) << shiftl);
|
|
/*
|
|
int base, disp, factor;
|
|
if (z0 < z1)
|
|
{
|
|
base = z0;
|
|
disp = z1 - z0;
|
|
factor = i;
|
|
}
|
|
else
|
|
{
|
|
base = z1;
|
|
disp = z0 - z1,
|
|
factor = idiff - i;
|
|
}
|
|
|
|
#ifdef InterpSpans
|
|
{
|
|
int shift = 0;
|
|
while (disp > 0x3FF)
|
|
{
|
|
disp >>= 1;
|
|
shift++;
|
|
}
|
|
|
|
return base + int(((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 22) << shift);
|
|
}
|
|
#else
|
|
{
|
|
disp >>= 9;
|
|
return base + int((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 13);
|
|
}
|
|
#endif*/
|
|
}
|
|
|
|
uint InterpolateZWBuffer(int z0, int z1, int ifactor)
|
|
{
|
|
if (z0 == z1)
|
|
return z0;
|
|
|
|
#ifdef Rasterise
|
|
// since the precision along x spans is only 8 bit the result will always fit in 32-bit
|
|
if (z0 < z1)
|
|
{
|
|
return uint(z0) + (((z1-z0) * ifactor) >> YFactorShift);
|
|
}
|
|
else
|
|
{
|
|
return uint(z1) + (((z0-z1) * ((1<<YFactorShift)-ifactor)) >> YFactorShift);
|
|
}
|
|
#else
|
|
uint mulLo, mulHi;
|
|
if (z0 < z1)
|
|
{
|
|
umulExtended(z1-z0, ifactor, mulHi, mulLo);
|
|
// 64-bit shift
|
|
return uint(z0) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift)));
|
|
}
|
|
else
|
|
{
|
|
umulExtended(z0-z1, (1<<YFactorShift)-ifactor, mulHi, mulLo);
|
|
return uint(z1) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift)));
|
|
}
|
|
#endif
|
|
/*if (z0 < z1)
|
|
{
|
|
return uint(z0) + uint((int64_t(z1-z0) * int64_t(ifactor)) >> YFactorShift);
|
|
}
|
|
else
|
|
{
|
|
return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<<YFactorShift)-ifactor)) >> YFactorShift);
|
|
}*/
|
|
}
|
|
#endif
|
|
|
|
)";
|
|
|
|
const std::string InterpSpans =
|
|
PolygonBuffer +
|
|
XSpanSetupBuffer +
|
|
YSpanSetupBuffer + R"(
|
|
layout (local_size_x = 32) in;
|
|
|
|
layout (binding = 0, rgba16ui) uniform readonly uimageBuffer SetupIndices;
|
|
|
|
void main()
|
|
{
|
|
uvec4 setup = imageLoad(SetupIndices, int(gl_GlobalInvocationID.x));
|
|
|
|
YSpanSetup spanL = YSpanSetups[setup.y];
|
|
YSpanSetup spanR = YSpanSetups[setup.z];
|
|
|
|
XSpanSetup xspan;
|
|
xspan.Flags = 0U;
|
|
|
|
int y = int(setup.w);
|
|
|
|
int dxl = CalculateDx(y, spanL);
|
|
int dxr = CalculateDx(y, spanR);
|
|
|
|
int xl = CalculateX(dxl, spanL);
|
|
int xr = CalculateX(dxr, spanR);
|
|
|
|
Polygon polygon = Polygons[setup.x];
|
|
|
|
int edgeLenL, edgeLenR;
|
|
|
|
if (xl > xr)
|
|
{
|
|
YSpanSetup tmpSpan = spanL;
|
|
spanL = spanR;
|
|
spanR = tmpSpan;
|
|
|
|
int tmp = xl;
|
|
xl = xr;
|
|
xr = tmp;
|
|
|
|
EdgeParams_YMajor(false, dxr, spanL, edgeLenL, xspan.EdgeCovL);
|
|
EdgeParams_YMajor(true, dxl, spanR, edgeLenR, xspan.EdgeCovR);
|
|
}
|
|
else
|
|
{
|
|
// edges are the right way
|
|
if (spanL.Increment > 0x40000)
|
|
EdgeParams_XMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL);
|
|
else
|
|
EdgeParams_YMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL);
|
|
if (spanR.Increment > 0x40000)
|
|
EdgeParams_XMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR);
|
|
else
|
|
EdgeParams_YMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR);
|
|
}
|
|
|
|
xspan.CovLInitial = (xspan.EdgeCovL >> 12) & 0x3FF;
|
|
if (xspan.CovLInitial == 0x3FF)
|
|
xspan.CovLInitial = 0;
|
|
xspan.CovRInitial = (xspan.EdgeCovR >> 12) & 0x3FF;
|
|
if (xspan.CovRInitial == 0x3FF)
|
|
xspan.CovRInitial = 0;
|
|
|
|
xspan.X0 = xl;
|
|
xspan.X1 = xr + 1;
|
|
|
|
uint polyalpha = ((polygon.Attr >> 16) & 0x1FU);
|
|
bool isWireframe = polyalpha == 0U;
|
|
|
|
if (!isWireframe || (y == polygon.YTop || y == polygon.YBot - 1))
|
|
xspan.Flags |= XSpanSetup_FillInside;
|
|
|
|
xspan.InsideStart = xspan.X0 + edgeLenL;
|
|
if (xspan.InsideStart > xspan.X1)
|
|
xspan.InsideStart = xspan.X1;
|
|
xspan.InsideEnd = xspan.X1 - edgeLenR;
|
|
if (xspan.InsideEnd > xspan.X1)
|
|
xspan.InsideEnd = xspan.X1;
|
|
|
|
bool isShadowMask = ((polygon.Attr & 0x3F000030U) == 0x00000030U);
|
|
bool fillAllEdges = polyalpha < 31 || (DispCnt & (3U<<4)) != 0U;
|
|
|
|
if (fillAllEdges || spanL.X1 < spanL.X0 || spanL.Increment <= 0x40000)
|
|
xspan.Flags |= XSpanSetup_FillLeft;
|
|
if (fillAllEdges || (spanR.X1 >= spanR.X0 && spanR.Increment > 0x40000) || spanR.Increment == 0)
|
|
xspan.Flags |= XSpanSetup_FillRight;
|
|
|
|
if (spanL.I0 == spanL.I1)
|
|
{
|
|
xspan.TexcoordU0 = spanL.TexcoordU0;
|
|
xspan.TexcoordV0 = spanL.TexcoordV0;
|
|
xspan.ColorR0 = spanL.ColorR0;
|
|
xspan.ColorG0 = spanL.ColorG0;
|
|
xspan.ColorB0 = spanL.ColorB0;
|
|
xspan.Z0 = spanL.Z0;
|
|
xspan.W0 = spanL.W0;
|
|
}
|
|
else
|
|
{
|
|
int i = (spanL.Increment > 0x40000 ? xl : y) - spanL.I0;
|
|
int ifactor = CalcYFactorY(spanL, i);
|
|
int idiff = spanL.I1 - spanL.I0;
|
|
|
|
#ifdef ZBuffer
|
|
xspan.Z0 = int(InterpolateZZBuffer(spanL.Z0, spanL.Z1, i, spanL.IRecip, idiff));
|
|
#endif
|
|
#ifdef WBuffer
|
|
xspan.Z0 = int(InterpolateZWBuffer(spanL.Z0, spanL.Z1, ifactor));
|
|
#endif
|
|
|
|
if (!spanL.Linear)
|
|
{
|
|
xspan.TexcoordU0 = InterpolateAttrPersp(spanL.TexcoordU0, spanL.TexcoordU1, ifactor);
|
|
xspan.TexcoordV0 = InterpolateAttrPersp(spanL.TexcoordV0, spanL.TexcoordV1, ifactor);
|
|
|
|
xspan.ColorR0 = InterpolateAttrPersp(spanL.ColorR0, spanL.ColorR1, ifactor);
|
|
xspan.ColorG0 = InterpolateAttrPersp(spanL.ColorG0, spanL.ColorG1, ifactor);
|
|
xspan.ColorB0 = InterpolateAttrPersp(spanL.ColorB0, spanL.ColorB1, ifactor);
|
|
|
|
xspan.W0 = InterpolateAttrPersp(spanL.W0, spanL.W1, ifactor);
|
|
}
|
|
else
|
|
{
|
|
xspan.TexcoordU0 = InterpolateAttrLinear(spanL.TexcoordU0, spanL.TexcoordU1, i, spanL.IRecip, idiff);
|
|
xspan.TexcoordV0 = InterpolateAttrLinear(spanL.TexcoordV0, spanL.TexcoordV1, i, spanL.IRecip, idiff);
|
|
|
|
xspan.ColorR0 = InterpolateAttrLinear(spanL.ColorR0, spanL.ColorR1, i, spanL.IRecip, idiff);
|
|
xspan.ColorG0 = InterpolateAttrLinear(spanL.ColorG0, spanL.ColorG1, i, spanL.IRecip, idiff);
|
|
xspan.ColorB0 = InterpolateAttrLinear(spanL.ColorB0, spanL.ColorB1, i, spanL.IRecip, idiff);
|
|
|
|
xspan.W0 = spanL.W0; // linear mode is only taken if W0 == W1
|
|
}
|
|
}
|
|
|
|
if (spanR.I0 == spanR.I1)
|
|
{
|
|
xspan.TexcoordU1 = spanR.TexcoordU0;
|
|
xspan.TexcoordV1 = spanR.TexcoordV0;
|
|
xspan.ColorR1 = spanR.ColorR0;
|
|
xspan.ColorG1 = spanR.ColorG0;
|
|
xspan.ColorB1 = spanR.ColorB0;
|
|
xspan.Z1 = spanR.Z0;
|
|
xspan.W1 = spanR.W0;
|
|
}
|
|
else
|
|
{
|
|
int i = (spanR.Increment > 0x40000 ? xr : y) - spanR.I0;
|
|
int ifactor = CalcYFactorY(spanR, i);
|
|
int idiff = spanR.I1 - spanR.I0;
|
|
|
|
#ifdef ZBuffer
|
|
xspan.Z1 = int(InterpolateZZBuffer(spanR.Z0, spanR.Z1, i, spanR.IRecip, idiff));
|
|
#endif
|
|
#ifdef WBuffer
|
|
xspan.Z1 = int(InterpolateZWBuffer(spanR.Z0, spanR.Z1, ifactor));
|
|
#endif
|
|
|
|
if (!spanR.Linear)
|
|
{
|
|
xspan.TexcoordU1 = InterpolateAttrPersp(spanR.TexcoordU0, spanR.TexcoordU1, ifactor);
|
|
xspan.TexcoordV1 = InterpolateAttrPersp(spanR.TexcoordV0, spanR.TexcoordV1, ifactor);
|
|
|
|
xspan.ColorR1 = InterpolateAttrPersp(spanR.ColorR0, spanR.ColorR1, ifactor);
|
|
xspan.ColorG1 = InterpolateAttrPersp(spanR.ColorG0, spanR.ColorG1, ifactor);
|
|
xspan.ColorB1 = InterpolateAttrPersp(spanR.ColorB0, spanR.ColorB1, ifactor);
|
|
|
|
xspan.W1 = int(InterpolateAttrPersp(spanR.W0, spanR.W1, ifactor));
|
|
}
|
|
else
|
|
{
|
|
xspan.TexcoordU1 = InterpolateAttrLinear(spanR.TexcoordU0, spanR.TexcoordU1, i, spanR.IRecip, idiff);
|
|
xspan.TexcoordV1 = InterpolateAttrLinear(spanR.TexcoordV0, spanR.TexcoordV1, i, spanR.IRecip, idiff);
|
|
|
|
xspan.ColorR1 = InterpolateAttrLinear(spanR.ColorR0, spanR.ColorR1, i, spanR.IRecip, idiff);
|
|
xspan.ColorG1 = InterpolateAttrLinear(spanR.ColorG0, spanR.ColorG1, i, spanR.IRecip, idiff);
|
|
xspan.ColorB1 = InterpolateAttrLinear(spanR.ColorB0, spanR.ColorB1, i, spanR.IRecip, idiff);
|
|
|
|
xspan.W1 = spanR.W0;
|
|
}
|
|
}
|
|
|
|
if (xspan.W0 == xspan.W1 && ((xspan.W0 | xspan.W1) & 0x7F) == 0)
|
|
{
|
|
xspan.Flags |= XSpanSetup_Linear;
|
|
// a bit hacky, but when wbuffering we only need to calculate xrecip for linear spans
|
|
#ifdef ZBuffer
|
|
}
|
|
{
|
|
#endif
|
|
uint r;
|
|
xspan.XRecip = int(Div(1U<<30, uint(xspan.X1 - xspan.X0), r));
|
|
}
|
|
|
|
XSpanSetups[gl_GlobalInvocationID.x] = xspan;
|
|
}
|
|
|
|
)";
|
|
|
|
const std::string ClearIndirectWorkCount =
|
|
BinningBuffer + R"(
|
|
|
|
layout (local_size_x = 32) in;
|
|
|
|
void main()
|
|
{
|
|
VariantWorkCount[gl_GlobalInvocationID.x] = uvec4(1, 1, 0, 0);
|
|
}
|
|
|
|
)";
|
|
|
|
const std::string ClearCoarseBinMask =
|
|
BinningBuffer + R"(
|
|
layout (local_size_x = 32) in;
|
|
|
|
void main()
|
|
{
|
|
BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+0] = 0;
|
|
BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+1] = 0;
|
|
}
|
|
|
|
)";
|
|
|
|
const std::string BinCombined =
|
|
PolygonBuffer +
|
|
BinningBuffer +
|
|
XSpanSetupBuffer +
|
|
WorkDescBuffer + R"(
|
|
|
|
layout (local_size_x = 32) in;
|
|
|
|
bool BinPolygon(Polygon polygon, ivec2 topLeft, ivec2 botRight)
|
|
{
|
|
if (polygon.YTop > botRight.y || polygon.YBot <= topLeft.y)
|
|
return false;
|
|
|
|
int polygonHeight = polygon.YBot - polygon.YTop;
|
|
|
|
/*
|
|
All (good) polygons are convex. So the following holds true:
|
|
|
|
Starting from the top most point where both edges originate
|
|
the X coordinate of the left edge will stay the same or falls until
|
|
the minimum X-axis coordinate is reached. Then it stays the same or
|
|
rises until the point it meets with the right edge.
|
|
|
|
The same applies to the right edge, except that it first may rise or stay equal and
|
|
after the maximum point may only fall or stay the same.
|
|
|
|
This means that for every tile which doesn't contain the point where the direction changes
|
|
we can just get the maximum point by sampling the top most and bottom most coordinate
|
|
within the tile.
|
|
|
|
For a tile which is that the height of the direction change
|
|
|
|
As a sidenote another consequence of this design decision is
|
|
that malformed polygons aren't binned properly.
|
|
|
|
As a note bottom Y is exclusive!
|
|
*/
|
|
int polyInnerTopY = clamp(topLeft.y - polygon.YTop, 0, max(polygonHeight-1, 0));
|
|
int polyInnerBotY = clamp(botRight.y - polygon.YTop, 0, max(polygonHeight-1, 0));
|
|
|
|
XSpanSetup xspanTop = XSpanSetups[polygon.FirstXSpan + polyInnerTopY];
|
|
XSpanSetup xspanBot = XSpanSetups[polygon.FirstXSpan + polyInnerBotY];
|
|
|
|
int minXL;
|
|
if (polygon.XMinY >= topLeft.y && polygon.XMinY <= botRight.y)
|
|
minXL = polygon.XMin;
|
|
else
|
|
minXL = min(xspanTop.X0, xspanBot.X0);
|
|
|
|
if (minXL > botRight.x)
|
|
return false;
|
|
|
|
int maxXR;
|
|
if (polygon.XMaxY >= topLeft.y && polygon.XMaxY <= botRight.y)
|
|
maxXR = polygon.XMax;
|
|
else
|
|
maxXR = max(xspanTop.X1, xspanBot.X1) - 1;
|
|
|
|
if (maxXR < topLeft.x)
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
shared uint mergedMaskShared;
|
|
|
|
void main()
|
|
{
|
|
int groupIdx = int(gl_WorkGroupID.x);
|
|
ivec2 coarseTile = ivec2(gl_WorkGroupID.yz);
|
|
|
|
#if 0
|
|
int localIdx = int(gl_SubGroupInvocationARB);
|
|
#else
|
|
int localIdx = int(gl_LocalInvocationIndex);
|
|
|
|
if (localIdx == 0)
|
|
mergedMaskShared = 0U;
|
|
barrier();
|
|
#endif
|
|
|
|
int polygonIdx = groupIdx * 32 + localIdx;
|
|
|
|
ivec2 coarseTopLeft = coarseTile * ivec2(CoarseTileW, CoarseTileH);
|
|
ivec2 coarseBotRight = coarseTopLeft + ivec2(CoarseTileW-1, CoarseTileH-1);
|
|
|
|
bool binned = false;
|
|
if (polygonIdx < NumPolygons)
|
|
{
|
|
binned = BinPolygon(Polygons[polygonIdx], coarseTopLeft, coarseBotRight);
|
|
}
|
|
|
|
#if 0
|
|
uint mergedMask = unpackUint2x32(ballotARB(binned)).x;
|
|
#else
|
|
if (binned)
|
|
atomicOr(mergedMaskShared, 1U << localIdx);
|
|
barrier();
|
|
uint mergedMask = mergedMaskShared;
|
|
#endif
|
|
|
|
ivec2 fineTile = ivec2(localIdx & 0x7, localIdx >> 3);
|
|
|
|
ivec2 fineTileTopLeft = coarseTopLeft + fineTile * ivec2(TileSize, TileSize);
|
|
ivec2 fineTileBotRight = fineTileTopLeft + ivec2(TileSize-1, TileSize-1);
|
|
|
|
uint binnedMask = 0U;
|
|
while (mergedMask != 0U)
|
|
{
|
|
int bit = findLSB(mergedMask);
|
|
mergedMask &= ~(1U << bit);
|
|
|
|
int polygonIdx = groupIdx * 32 + bit;
|
|
|
|
if (BinPolygon(Polygons[polygonIdx], fineTileTopLeft, fineTileBotRight))
|
|
binnedMask |= 1U << bit;
|
|
}
|
|
|
|
int linearTile = fineTile.x + fineTile.y * TilesPerLine + coarseTile.x * CoarseTileCountX + coarseTile.y * TilesPerLine * CoarseTileCountY;
|
|
|
|
BinningMaskAndOffset[BinningMaskStart + linearTile * BinStride + groupIdx] = binnedMask;
|
|
int coarseMaskIdx = linearTile * CoarseBinStride + (groupIdx >> 5);
|
|
if (binnedMask != 0U)
|
|
atomicOr(BinningMaskAndOffset[BinningCoarseMaskStart + coarseMaskIdx], 1U << (groupIdx & 0x1F));
|
|
|
|
if (binnedMask != 0U)
|
|
{
|
|
uint workOffset = atomicAdd(VariantWorkCount[0].w, uint(bitCount(binnedMask)));
|
|
BinningMaskAndOffset[BinningWorkOffsetsStart + linearTile * BinStride + groupIdx] = workOffset;
|
|
|
|
uint tilePositionCombined = bitfieldInsert(fineTileTopLeft.x, fineTileTopLeft.y, 16, 16);
|
|
|
|
int idx = 0;
|
|
while (binnedMask != 0U)
|
|
{
|
|
int bit = findLSB(binnedMask);
|
|
binnedMask &= ~(1U << bit);
|
|
|
|
int polygonIdx = groupIdx * 32 + bit;
|
|
int variantIdx = Polygons[polygonIdx].Variant;
|
|
|
|
int inVariantOffset = int(atomicAdd(VariantWorkCount[variantIdx].z, 1));
|
|
WorkDescs[WorkDescsUnsortedStart + workOffset + idx] = uvec2(tilePositionCombined, bitfieldInsert(polygonIdx, inVariantOffset, 11, 21));
|
|
|
|
idx++;
|
|
}
|
|
}
|
|
}
|
|
|
|
)";
|
|
|
|
const std::string CalcOffsets =
|
|
BinningBuffer + R"(
|
|
|
|
layout (local_size_x = 32) in;
|
|
|
|
void main()
|
|
{
|
|
if (gl_GlobalInvocationID.x < NumVariants)
|
|
{
|
|
if (gl_GlobalInvocationID.x == 0)
|
|
{
|
|
// a bit of a cheat putting this here, but this shader won't run that often
|
|
SortWorkWorkCount = uvec4((VariantWorkCount[0].w + 31) / 32, 1, 1, 0);
|
|
}
|
|
SortedWorkOffset[gl_GlobalInvocationID.x] = atomicAdd(VariantWorkCount[1].w, VariantWorkCount[gl_GlobalInvocationID.x].z);
|
|
}
|
|
}
|
|
|
|
|
|
)";
|
|
|
|
const std::string SortWork =
|
|
PolygonBuffer +
|
|
BinningBuffer +
|
|
WorkDescBuffer + R"(
|
|
|
|
layout (local_size_x = 32) in;
|
|
|
|
void main()
|
|
{
|
|
if (gl_GlobalInvocationID.x < VariantWorkCount[0].w)
|
|
{
|
|
uvec2 workDesc = WorkDescs[WorkDescsUnsortedStart + gl_GlobalInvocationID.x];
|
|
int inVariantOffset = int(bitfieldExtract(workDesc.y, 11, 21));
|
|
int polygonIdx = int(bitfieldExtract(workDesc.y, 0, 11));
|
|
int variantIdx = Polygons[polygonIdx].Variant;
|
|
|
|
int sortedIndex = int(SortedWorkOffset[variantIdx]) + inVariantOffset;
|
|
WorkDescs[WorkDescsSortedStart + sortedIndex] = uvec2(workDesc.x, bitfieldInsert(workDesc.y, gl_GlobalInvocationID.x, 11, 21));
|
|
}
|
|
}
|
|
|
|
)";
|
|
|
|
const std::string Rasterise =
|
|
PolygonBuffer +
|
|
WorkDescBuffer +
|
|
XSpanSetupBuffer +
|
|
BinningBuffer +
|
|
Tilebuffers + R"(
|
|
|
|
layout (local_size_x = TileSize, local_size_y = TileSize) in;
|
|
|
|
layout (binding = 0) uniform usampler2DArray CurrentTexture;
|
|
|
|
layout (location = 0) uniform uint CurVariant;
|
|
layout (location = 1) uniform vec2 InvTextureSize;
|
|
|
|
void main()
|
|
{
|
|
uvec2 workDesc = WorkDescs[WorkDescsSortedStart + SortedWorkOffset[CurVariant] + gl_WorkGroupID.z];
|
|
Polygon polygon = Polygons[bitfieldExtract(workDesc.y, 0, 11)];
|
|
ivec2 position = ivec2(bitfieldExtract(workDesc.x, 0, 16), bitfieldExtract(workDesc.x, 16, 16)) + ivec2(gl_LocalInvocationID.xy);
|
|
int tileOffset = int(bitfieldExtract(workDesc.y, 11, 21)) * TileSize * TileSize + TileSize * int(gl_LocalInvocationID.y) + int(gl_LocalInvocationID.x);
|
|
|
|
uint color = 0U;
|
|
if (position.y >= polygon.YTop && position.y < polygon.YBot)
|
|
{
|
|
XSpanSetup xspan = XSpanSetups[polygon.FirstXSpan + (position.y - polygon.YTop)];
|
|
|
|
bool insideLeftEdge = position.x < xspan.InsideStart;
|
|
bool insideRightEdge = position.x >= xspan.InsideEnd;
|
|
bool insidePolygonInside = !insideLeftEdge && !insideRightEdge;
|
|
|
|
if (position.x >= xspan.X0 && position.x < xspan.X1
|
|
&& ((insideLeftEdge && (xspan.Flags & XSpanSetup_FillLeft) != 0U)
|
|
|| (insideRightEdge && (xspan.Flags & XSpanSetup_FillRight) != 0U)
|
|
|| (insidePolygonInside && (xspan.Flags & XSpanSetup_FillInside) != 0U)))
|
|
{
|
|
uint attr = 0;
|
|
if (position.y == polygon.YTop)
|
|
attr |= 0x4U;
|
|
else if (position.y == polygon.YBot - 1)
|
|
attr |= 0x8U;
|
|
|
|
if (insideLeftEdge)
|
|
{
|
|
attr |= 0x1U;
|
|
|
|
int cov = xspan.EdgeCovL;
|
|
if (cov < 0)
|
|
{
|
|
int xcov = xspan.CovLInitial + (xspan.EdgeCovL & 0x3FF) * (position.x - xspan.X0);
|
|
cov = min(xcov >> 5, 31);
|
|
}
|
|
|
|
attr |= uint(cov) << 8;
|
|
}
|
|
else if (insideRightEdge)
|
|
{
|
|
attr |= 0x2U;
|
|
|
|
int cov = xspan.EdgeCovR;
|
|
if (cov < 0)
|
|
{
|
|
int xcov = xspan.CovRInitial + (xspan.EdgeCovR & 0x3FF) * (position.x - xspan.InsideEnd);
|
|
cov = max(0x1F - (xcov >> 5), 0);
|
|
}
|
|
|
|
attr |= uint(cov) << 8;
|
|
}
|
|
|
|
uint z;
|
|
int u, v, vr, vg, vb;
|
|
|
|
if (xspan.X0 == xspan.X1)
|
|
{
|
|
z = xspan.Z0;
|
|
u = xspan.TexcoordU0;
|
|
v = xspan.TexcoordV0;
|
|
vr = xspan.ColorR0;
|
|
vg = xspan.ColorG0;
|
|
vb = xspan.ColorB0;
|
|
}
|
|
else
|
|
{
|
|
int ifactor = CalcYFactorX(xspan, position.x);
|
|
int idiff = xspan.X1 - xspan.X0;
|
|
int i = position.x - xspan.X0;
|
|
|
|
#ifdef ZBuffer
|
|
z = InterpolateZZBuffer(xspan.Z0, xspan.Z1, i, xspan.XRecip, idiff);
|
|
#endif
|
|
#ifdef WBuffer
|
|
z = InterpolateZWBuffer(xspan.Z0, xspan.Z1, ifactor);
|
|
#endif
|
|
if ((xspan.Flags & XSpanSetup_Linear) == 0U)
|
|
{
|
|
u = InterpolateAttrPersp(xspan.TexcoordU0, xspan.TexcoordU1, ifactor);
|
|
v = InterpolateAttrPersp(xspan.TexcoordV0, xspan.TexcoordV1, ifactor);
|
|
|
|
vr = InterpolateAttrPersp(xspan.ColorR0, xspan.ColorR1, ifactor);
|
|
vg = InterpolateAttrPersp(xspan.ColorG0, xspan.ColorG1, ifactor);
|
|
vb = InterpolateAttrPersp(xspan.ColorB0, xspan.ColorB1, ifactor);
|
|
}
|
|
else
|
|
{
|
|
u = InterpolateAttrLinear(xspan.TexcoordU0, xspan.TexcoordU1, i, xspan.XRecip, idiff);
|
|
v = InterpolateAttrLinear(xspan.TexcoordV0, xspan.TexcoordV1, i, xspan.XRecip, idiff);
|
|
|
|
vr = InterpolateAttrLinear(xspan.ColorR0, xspan.ColorR1, i, xspan.XRecip, idiff);
|
|
vg = InterpolateAttrLinear(xspan.ColorG0, xspan.ColorG1, i, xspan.XRecip, idiff);
|
|
vb = InterpolateAttrLinear(xspan.ColorB0, xspan.ColorB1, i, xspan.XRecip, idiff);
|
|
}
|
|
}
|
|
|
|
#ifndef ShadowMask
|
|
vr >>= 3;
|
|
vg >>= 3;
|
|
vb >>= 3;
|
|
|
|
uint r, g, b, a;
|
|
uint polyalpha = bitfieldExtract(polygon.Attr, 16, 5);
|
|
|
|
#ifdef Toon
|
|
uint tooncolor = ToonTable[vr >> 1].r;
|
|
vr = int(bitfieldExtract(tooncolor, 0, 8));
|
|
vg = int(bitfieldExtract(tooncolor, 8, 8));
|
|
vb = int(bitfieldExtract(tooncolor, 16, 8));
|
|
#endif
|
|
#ifdef Highlight
|
|
vg = vr;
|
|
vb = vr;
|
|
#endif
|
|
|
|
#ifdef NoTexture
|
|
a = int(polyalpha);
|
|
#endif
|
|
r = vr;
|
|
g = vg;
|
|
b = vb;
|
|
|
|
#ifdef UseTexture
|
|
vec2 uvf = vec2(ivec2(u, v)) * vec2(1.0 / 16.0) * InvTextureSize;
|
|
|
|
uvec4 texcolor = texture(CurrentTexture, vec3(uvf, polygon.TextureLayer));
|
|
#ifdef Decal
|
|
if (texcolor.a == 31)
|
|
{
|
|
r = int(texcolor.r);
|
|
g = int(texcolor.g);
|
|
b = int(texcolor.b);
|
|
}
|
|
else if (texcolor.a > 0)
|
|
{
|
|
r = int((texcolor.r * texcolor.a) + (vr * (31-texcolor.a))) >> 5;
|
|
g = int((texcolor.g * texcolor.a) + (vg * (31-texcolor.a))) >> 5;
|
|
b = int((texcolor.b * texcolor.a) + (vb * (31-texcolor.a))) >> 5;
|
|
}
|
|
a = int(polyalpha);
|
|
#endif
|
|
#if defined(Modulate) || defined(Toon) || defined(Highlight)
|
|
r = int((texcolor.r+1) * (vr+1) - 1) >> 6;
|
|
g = int((texcolor.g+1) * (vg+1) - 1) >> 6;
|
|
b = int((texcolor.b+1) * (vb+1) - 1) >> 6;
|
|
a = int((texcolor.a+1) * (polyalpha+1) - 1) >> 5;
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef Highlight
|
|
uint tooncolor = ToonTable[vr >> 1].r;
|
|
|
|
r = min(r + int(bitfieldExtract(tooncolor, 0, 8)), 63);
|
|
g = min(g + int(bitfieldExtract(tooncolor, 8, 8)), 63);
|
|
b = min(b + int(bitfieldExtract(tooncolor, 16, 8)), 63);
|
|
#endif
|
|
|
|
if (polyalpha == 0)
|
|
a = 31;
|
|
|
|
if (a > AlphaRef)
|
|
{
|
|
color = r | (g << 8) | (b << 16) | (a << 24);
|
|
|
|
DepthTiles[tileOffset] = z;
|
|
AttrTiles[tileOffset] = attr;
|
|
}
|
|
#else
|
|
color = 0xFFFFFFFF; // doesn't really matter as long as it's not 0
|
|
DepthTiles[tileOffset] = z;
|
|
#endif
|
|
}
|
|
}
|
|
|
|
ColorTiles[tileOffset] = color;
|
|
}
|
|
|
|
)";
|
|
|
|
const std::string DepthBlend =
|
|
PolygonBuffer +
|
|
Tilebuffers +
|
|
ResultBuffer +
|
|
BinningBuffer + R"(
|
|
|
|
layout (local_size_x = TileSize, local_size_y = TileSize) in;
|
|
|
|
void PlotTranslucent(inout uint color, inout uint depth, inout uint attr, bool isShadow, uint tileColor, uint srcA, uint tileDepth, uint srcAttr, bool writeDepth)
|
|
{
|
|
uint blendAttr = (srcAttr & 0xE0F0U) | ((srcAttr >> 8) & 0xFF0000U) | (1U<<22) | (attr & 0xFF001F0FU);
|
|
|
|
if ((!isShadow || (attr & (1U<<22)) != 0U)
|
|
? (attr & 0x007F0000U) != (blendAttr & 0x007F0000U)
|
|
: (attr & 0x3F000000U) != (srcAttr & 0x3F000000U))
|
|
{
|
|
// le blend
|
|
if (writeDepth)
|
|
depth = tileDepth;
|
|
|
|
if ((attr & (1U<<15)) == 0)
|
|
blendAttr &= ~(1U<<15);
|
|
attr = blendAttr;
|
|
|
|
uint srcRB = tileColor & 0x3F003FU;
|
|
uint srcG = tileColor & 0x003F00U;
|
|
uint dstRB = color & 0x3F003FU;
|
|
uint dstG = color & 0x003F00U;
|
|
uint dstA = color & 0x1F000000U;
|
|
|
|
uint alpha = (srcA >> 24) + 1;
|
|
if (dstA != 0)
|
|
{
|
|
srcRB = ((srcRB * alpha) + (dstRB * (32-alpha))) >> 5;
|
|
srcG = ((srcG * alpha) + (dstG * (32-alpha))) >> 5;
|
|
}
|
|
|
|
color = (srcRB & 0x3F003FU) | (srcG & 0x003F00U) | max(dstA, srcA);
|
|
}
|
|
}
|
|
|
|
void ProcessCoarseMask(int linearTile, uint coarseMask, uint coarseOffset,
|
|
inout uvec2 color, inout uvec2 depth, inout uvec2 attr, inout uint stencil,
|
|
inout bool prevIsShadowMask)
|
|
{
|
|
int tileInnerOffset = int(gl_LocalInvocationID.x) + int(gl_LocalInvocationID.y) * TileSize;
|
|
|
|
while (coarseMask != 0U)
|
|
{
|
|
uint coarseBit = findLSB(coarseMask);
|
|
coarseMask &= ~(1U << coarseBit);
|
|
|
|
uint tileOffset = linearTile * BinStride + coarseBit + coarseOffset;
|
|
|
|
uint fineMask = BinningMaskAndOffset[BinningMaskStart + tileOffset];
|
|
uint workIdx = BinningMaskAndOffset[BinningWorkOffsetsStart + tileOffset];
|
|
|
|
while (fineMask != 0U)
|
|
{
|
|
uint fineIdx = findLSB(fineMask);
|
|
fineMask &= ~(1U << fineIdx);
|
|
|
|
uint pixelindex = tileInnerOffset + workIdx * TileSize * TileSize;
|
|
uint tileColor = ColorTiles[pixelindex];
|
|
workIdx++;
|
|
|
|
uint polygonIdx = fineIdx + (coarseBit + coarseOffset) * 32;
|
|
|
|
if (tileColor != 0U)
|
|
{
|
|
uint polygonAttr = Polygons[polygonIdx].Attr;
|
|
|
|
bool isShadowMask = ((polygonAttr & 0x3F000030U) == 0x00000030U);
|
|
bool prevIsShadowMaskOld = prevIsShadowMask;
|
|
prevIsShadowMask = isShadowMask;
|
|
|
|
bool equalDepthTest = (polygonAttr & (1U << 14)) != 0U;
|
|
|
|
uint tileDepth = DepthTiles[pixelindex];
|
|
uint tileAttr = AttrTiles[pixelindex];
|
|
|
|
uint dstattr = attr.x;
|
|
|
|
if (!isShadowMask)
|
|
{
|
|
bool isShadow = (polygonAttr & 0x30U) == 0x30U;
|
|
|
|
bool writeSecondLayer = false;
|
|
|
|
if (isShadow)
|
|
{
|
|
if (stencil == 0U)
|
|
continue;
|
|
if ((stencil & 1U) == 0U)
|
|
writeSecondLayer = true;
|
|
if ((stencil & 2U) == 0U)
|
|
dstattr &= ~0x3U;
|
|
}
|
|
|
|
uint dstDepth = writeSecondLayer ? depth.y : depth.x;
|
|
if (!(equalDepthTest
|
|
#ifdef WBuffer
|
|
? dstDepth - tileDepth + 0xFFU <= 0x1FE
|
|
#endif
|
|
#ifdef ZBuffer
|
|
? dstDepth - tileDepth + 0x200 <= 0x400
|
|
#endif
|
|
: tileDepth < dstDepth))
|
|
{
|
|
if ((dstattr & 0x3U) == 0U || writeSecondLayer)
|
|
continue;
|
|
|
|
writeSecondLayer = true;
|
|
dstattr = attr.y;
|
|
if (!(equalDepthTest
|
|
#ifdef WBuffer
|
|
? depth.y - tileDepth + 0xFFU <= 0x1FE
|
|
#endif
|
|
#ifdef ZBuffer
|
|
? depth.y - tileDepth + 0x200 <= 0x400
|
|
#endif
|
|
: tileDepth < depth.y))
|
|
continue;
|
|
}
|
|
|
|
uint srcAttr = (polygonAttr & 0x3F008000U);
|
|
|
|
uint srcA = tileColor & 0x1F000000U;
|
|
if (srcA == 0x1F000000U)
|
|
{
|
|
srcAttr |= tileAttr;
|
|
|
|
if (!writeSecondLayer)
|
|
{
|
|
if ((srcAttr & 0x3U) != 0U)
|
|
{
|
|
color.y = color.x;
|
|
depth.y = depth.x;
|
|
attr.y = attr.x;
|
|
}
|
|
|
|
color.x = tileColor;
|
|
depth.x = tileDepth;
|
|
attr.x = srcAttr;
|
|
}
|
|
else
|
|
{
|
|
color.y = tileColor;
|
|
depth.y = tileDepth;
|
|
attr.y = srcAttr;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
bool writeDepth = (polygonAttr & (1U<<11)) != 0;
|
|
|
|
if (!writeSecondLayer)
|
|
{
|
|
// blend into both layers
|
|
PlotTranslucent(color.x, depth.x, attr.x, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth);
|
|
}
|
|
if (writeSecondLayer || (dstattr & 0x3U) != 0U)
|
|
{
|
|
PlotTranslucent(color.y, depth.y, attr.y, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (!prevIsShadowMaskOld)
|
|
stencil = 0;
|
|
|
|
if (!(equalDepthTest
|
|
#ifdef WBuffer
|
|
? depth.x - tileDepth + 0xFFU <= 0x1FE
|
|
#endif
|
|
#ifdef ZBuffer
|
|
? depth.x - tileDepth + 0x200 <= 0x400
|
|
#endif
|
|
: tileDepth < depth.x))
|
|
stencil = 0x1U;
|
|
|
|
if ((dstattr & 0x3U) != 0U)
|
|
{
|
|
if (!(equalDepthTest
|
|
#ifdef WBuffer
|
|
? depth.y - tileDepth + 0xFFU <= 0x1FE
|
|
#endif
|
|
#ifdef ZBuffer
|
|
? depth.y - tileDepth + 0x200 <= 0x400
|
|
#endif
|
|
: tileDepth < depth.y))
|
|
stencil |= 0x2U;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void main()
|
|
{
|
|
int linearTile = int(gl_WorkGroupID.x + (gl_WorkGroupID.y * TilesPerLine));
|
|
|
|
uint coarseMaskLo = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 0];
|
|
uint coarseMaskHi = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 1];
|
|
|
|
uvec2 color = uvec2(ClearColor, 0U);
|
|
uvec2 depth = uvec2(ClearDepth, 0U);
|
|
uvec2 attr = uvec2(ClearAttr, 0U);
|
|
uint stencil = 0U;
|
|
bool prevIsShadowMask = false;
|
|
|
|
ProcessCoarseMask(linearTile, coarseMaskLo, 0, color, depth, attr, stencil, prevIsShadowMask);
|
|
ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask);
|
|
|
|
int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * ScreenWidth;
|
|
ResultValue[ResultColorStart+resultOffset] = color.x;
|
|
ResultValue[ResultColorStart+resultOffset+FramebufferStride] = color.y;
|
|
ResultValue[ResultDepthStart+resultOffset] = depth.x;
|
|
ResultValue[ResultDepthStart+resultOffset+FramebufferStride] = depth.y;
|
|
ResultValue[ResultAttrStart+resultOffset] = attr.x;
|
|
ResultValue[ResultAttrStart+resultOffset+FramebufferStride] = attr.y;
|
|
}
|
|
|
|
)";
|
|
|
|
const std::string FinalPass =
|
|
ResultBuffer + R"(
|
|
|
|
layout (local_size_x = 32) in;
|
|
|
|
layout (binding = 0, rgba8) writeonly uniform image2D FinalFB;
|
|
layout (binding = 1, rgba8ui) writeonly uniform uimage2D LowResFB;
|
|
|
|
uint BlendFog(uint color, uint depth)
|
|
{
|
|
uint densityid = 0, densityfrac = 0;
|
|
|
|
if (depth >= FogOffset)
|
|
{
|
|
depth -= FogOffset;
|
|
depth = (depth >> 2) << FogShift;
|
|
|
|
densityid = depth >> 17;
|
|
if (densityid >= 32)
|
|
{
|
|
densityid = 32;
|
|
densityfrac = 0;
|
|
}
|
|
else
|
|
{
|
|
densityfrac = depth & 0x1FFFFU;
|
|
}
|
|
}
|
|
|
|
uint density =
|
|
((ToonTable[densityid].g * (0x20000U-densityfrac)) +
|
|
(ToonTable[densityid+1].g * densityfrac)) >> 17;
|
|
density = min(density, 128U);
|
|
|
|
uint colorRB = color & 0x3F003FU;
|
|
uint colorGA = (color >> 8) & 0x3F003FU;
|
|
|
|
uint fogRB = FogColor & 0x3F003FU;
|
|
uint fogGA = (FogColor >> 8) & 0x1F003FU;
|
|
|
|
uint finalColorRB = ((fogRB * density) + (colorRB * (128-density))) >> 7;
|
|
uint finalColorGA = ((fogGA * density) + (colorGA * (128-density))) >> 7;
|
|
|
|
finalColorRB &= 0x3F003FU;
|
|
finalColorGA &= 0x1F003FU;
|
|
|
|
return (DispCnt & (1U<<6)) != 0
|
|
? (bitfieldInsert(color, finalColorGA >> 16, 24, 8))
|
|
: (finalColorRB | (finalColorGA << 8));
|
|
}
|
|
|
|
void main()
|
|
{
|
|
int srcX = int(gl_GlobalInvocationID.x);
|
|
int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * ScreenWidth;
|
|
|
|
uvec2 color = uvec2(ResultValue[resultOffset+ResultColorStart], ResultValue[resultOffset+FramebufferStride+ResultColorStart]);
|
|
uvec2 depth = uvec2(ResultValue[resultOffset+ResultDepthStart], ResultValue[resultOffset+FramebufferStride+ResultDepthStart]);
|
|
uvec2 attr = uvec2(ResultValue[resultOffset+ResultAttrStart], ResultValue[resultOffset+FramebufferStride+ResultAttrStart]);
|
|
|
|
#ifdef EdgeMarking
|
|
if ((attr.x & 0xFU) != 0U)
|
|
{
|
|
uvec4 otherAttr = uvec4(ClearAttr);
|
|
uvec4 otherDepth = uvec4(ClearDepth);
|
|
|
|
if (srcX > 0U)
|
|
{
|
|
otherAttr.x = ResultValue[resultOffset-1+ResultAttrStart];
|
|
otherDepth.x = ResultValue[resultOffset-1+ResultDepthStart];
|
|
}
|
|
if (srcX < ScreenWidth-1)
|
|
{
|
|
otherAttr.y = ResultValue[resultOffset+1+ResultAttrStart];
|
|
otherDepth.y = ResultValue[resultOffset+1+ResultDepthStart];
|
|
}
|
|
if (gl_GlobalInvocationID.y > 0U)
|
|
{
|
|
otherAttr.z = ResultValue[resultOffset-ScreenWidth+ResultAttrStart];
|
|
otherDepth.z = ResultValue[resultOffset-ScreenWidth+ResultDepthStart];
|
|
}
|
|
if (gl_GlobalInvocationID.y < ScreenHeight-1)
|
|
{
|
|
otherAttr.w = ResultValue[resultOffset+ScreenWidth+ResultAttrStart];
|
|
otherDepth.w = ResultValue[resultOffset+ScreenWidth+ResultDepthStart];
|
|
}
|
|
|
|
uint polyId = bitfieldExtract(attr.x, 24, 6);
|
|
uvec4 otherPolyId = bitfieldExtract(otherAttr, 24, 6);
|
|
|
|
bvec4 polyIdMismatch = notEqual(uvec4(polyId), otherPolyId);
|
|
bvec4 nearer = lessThan(uvec4(depth.x), otherDepth);
|
|
|
|
if ((polyIdMismatch.x && nearer.x)
|
|
|| (polyIdMismatch.y && nearer.y)
|
|
|| (polyIdMismatch.z && nearer.z)
|
|
|| (polyIdMismatch.w && nearer.w))
|
|
{
|
|
color.x = ToonTable[polyId >> 3].b | (color.x & 0xFF000000U);
|
|
attr.x = (attr.x & 0xFFFFE0FFU) | 0x00001000U;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#ifdef Fog
|
|
if ((attr.x & (1U<<15)) != 0U)
|
|
{
|
|
color.x = BlendFog(color.x, depth.x);
|
|
}
|
|
|
|
if ((attr.x & 0xFU) != 0 && (attr.y & (1U<<15)) != 0U)
|
|
{
|
|
color.y = BlendFog(color.y, depth.y);
|
|
}
|
|
#endif
|
|
|
|
#ifdef AntiAliasing
|
|
// resolve anti-aliasing
|
|
if ((attr.x & 0x3U) != 0)
|
|
{
|
|
uint coverage = (attr.x >> 8) & 0x1FU;
|
|
|
|
if (coverage != 0)
|
|
{
|
|
uint topRB = color.x & 0x3F003FU;
|
|
uint topG = color.x & 0x003F00U;
|
|
uint topA = bitfieldExtract(color.x, 24, 5);
|
|
|
|
uint botRB = color.y & 0x3F003FU;
|
|
uint botG = color.y & 0x003F00U;
|
|
uint botA = bitfieldExtract(color.y, 24, 5);
|
|
|
|
coverage++;
|
|
|
|
if (botA > 0)
|
|
{
|
|
topRB = ((topRB * coverage) + (botRB * (32-coverage))) >> 5;
|
|
topG = ((topG * coverage) + (botG * (32-coverage))) >> 5;
|
|
|
|
topRB &= 0x3F003FU;
|
|
topG &= 0x003F00U;
|
|
}
|
|
|
|
topA = ((topA * coverage) + (botA * (32-coverage))) >> 5;
|
|
|
|
color.x = topRB | topG | (topA << 24);
|
|
}
|
|
else
|
|
{
|
|
color.x = color.y;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// if (bitfieldExtract(color.x, 24, 8) != 0U)
|
|
// color.x |= 0x40000000U;
|
|
// else
|
|
// color.x = 0U;
|
|
|
|
//if ((gl_GlobalInvocationID.y % 8) == 7 || (gl_GlobalInvocationID.y % 8) == 7)
|
|
// color.x = 0x1F00001FU | 0x40000000U;
|
|
|
|
vec4 result = vec4(bitfieldExtract(color.x, 16, 8), bitfieldExtract(color.x, 8, 8), color.x & 0x3FU, bitfieldExtract(color.x, 24, 8));
|
|
result /= vec4(63.0, 63.0, 63.0, 31.0);
|
|
imageStore(FinalFB, ivec2(gl_GlobalInvocationID.xy), result);
|
|
|
|
// It's a division by constant, so using the builtin division is fine
|
|
const int scale = ScreenWidth/256;
|
|
ivec2 lowresCoordinate = ivec2(gl_GlobalInvocationID.xy) / scale;
|
|
ivec2 lowresCoordinateRest = ivec2(gl_GlobalInvocationID.xy) % scale;
|
|
if (lowresCoordinateRest == ivec2(0, 0))
|
|
{
|
|
uvec4 color8;
|
|
color8.x = bitfieldExtract(color.x, 0, 8);
|
|
color8.y = bitfieldExtract(color.x, 8, 8);
|
|
color8.z = bitfieldExtract(color.x, 16, 8);
|
|
color8.w = bitfieldExtract(color.x, 24, 8);
|
|
imageStore(LowResFB, lowresCoordinate, color8);
|
|
}
|
|
}
|
|
|
|
)";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif |