Stop using fixed-size buffers based on the scale factor in shaders.
This makes shader compile times tolerable on Wintel. Also: beginning of the shader cache; increase the size of the tile idx in WorkDesc to 20 bits.
This commit is contained in:
@ -39,7 +39,7 @@ ComputeRenderer::~ComputeRenderer()
bool ComputeRenderer::CompileShader(GLuint& shader, const char* source, const std::initializer_list<const char*>& defines)
bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, const std::initializer_list<const char*>& defines)
std::string shaderName;
std::string shaderSource;
@ -72,8 +72,8 @@ void blah(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,con
bool ComputeRenderer::Init()
//glDebugMessageCallback(blah, NULL);
glDebugMessageCallback(blah, NULL);
glGenBuffers(1, &YSpanSetupMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW);
@ -86,7 +86,8 @@ bool ComputeRenderer::Init()
glGenBuffers(1, &BinResultMemory);
glGenBuffers(1, &FinalTileMemory);
glGenBuffers(1, &YSpanIndicesTextureMemory);
glGenBuffers(1, &TileMemory);
glGenBuffers(tilememoryLayer_Num, TileMemory);
glGenBuffers(1, &WorkDescMemory);
glGenTextures(1, &YSpanIndicesTexture);
glGenTextures(1, &LowResFramebuffer);
@ -123,9 +124,10 @@ void ComputeRenderer::DeInit()
glDeleteBuffers(1, &YSpanSetupMemory);
glDeleteBuffers(1, &RenderPolygonMemory);
glDeleteBuffers(1, &TileMemory);
glDeleteBuffers(1, &XSpanSetupMemory);
glDeleteBuffers(1, &BinResultMemory);
glDeleteBuffers(tilememoryLayer_Num, TileMemory);
glDeleteBuffers(1, &WorkDescMemory);
glDeleteBuffers(1, &FinalTileMemory);
glDeleteBuffers(1, &YSpanIndicesTextureMemory);
glDeleteTextures(1, &YSpanIndicesTexture);
@ -214,21 +216,25 @@ void ComputeRenderer::SetRenderSettings(GPU::RenderSettings& settings)
MaxWorkTiles = TilesPerLine*TileLines*8;
glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW);
for (int i = 0; i < tilememoryLayer_Num; i++)
glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory[i]);
glBufferData(GL_SHADER_STORAGE_BUFFER, 4*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW);
int binResultSize = sizeof(BinResultHeader)
+ MaxWorkTiles*2*4 // UnsortedWorkDescs
+ MaxWorkTiles*2*4 // SortedWork
+ TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse
+ TilesPerLine*TileLines*BinStride*4 // BinnedMask
+ TilesPerLine*TileLines*BinStride*4; // WorkOffsets
glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, WorkDescMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, MaxWorkTiles*2*4*2, nullptr, GL_DYNAMIC_DRAW);
if (Framebuffer != 0)
glDeleteTextures(1, &Framebuffer);
glGenTextures(1, &Framebuffer);
@ -1237,12 +1243,14 @@ void ComputeRenderer::RenderFrame()
//printf("found via %d %d %d of %d\n", foundviatexcache, foundviaprev, numslow, RenderNumPolygons);
// bind everything
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, YSpanSetupMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, RenderPolygonMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, XSpanSetupMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, RenderPolygonMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, BinResultMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, TileMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, YSpanSetupMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, FinalTileMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, BinResultMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 7, WorkDescMemory);
MetaUniform meta;
meta.DispCnt = RenderDispCnt;
@ -1327,7 +1335,6 @@ void ComputeRenderer::RenderFrame()
glDispatchCompute((numVariants + 31) / 32, 1, 1);
// sort shader work
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
@ -1336,6 +1343,9 @@ void ComputeRenderer::RenderFrame()
for (int i = 0; i < tilememoryLayer_Num; i++)
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2+i, TileMemory[i]);
// rasterise
bool highLightMode = RenderDispCnt & (1<<1);
@ -74,7 +74,17 @@ private:
GLuint XSpanSetupMemory;
GLuint BinResultMemory;
GLuint RenderPolygonMemory;
GLuint TileMemory;
GLuint WorkDescMemory;
GLuint TileMemory[tilememoryLayer_Num];
GLuint FinalTileMemory;
u32 DummyLine[256] = {};
@ -102,7 +112,7 @@ private:
s32 DxInitial;
s32 XCovIncr;
u32 IsDummy, __pad1;
u32 IsDummy;
struct SpanSetupX
@ -138,7 +148,6 @@ private:
u32 Attr;
float TextureLayer;
u32 __pad0, __pad1;
static constexpr int TileSize = 8;
@ -233,7 +242,7 @@ private:
void SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]);
void SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]);
bool CompileShader(GLuint& shader, const char* source, const std::initializer_list<const char*>& defines);
bool CompileShader(GLuint& shader, const std::string& source, const std::initializer_list<const char*>& defines);
@ -19,6 +19,8 @@
#include <string>
namespace GPU3D
@ -69,23 +71,67 @@ namespace ComputeRendererShaders
const std::string XSpanSetupBuffer{R"(
const char* Common = R"(
struct Polygon
const uint XSpanSetup_Linear = 1U << 0;
const uint XSpanSetup_FillInside = 1U << 1;
const uint XSpanSetup_FillLeft = 1U << 2;
const uint XSpanSetup_FillRight = 1U << 3;
struct XSpanSetup
int FirstXSpan;
int YTop, YBot;
int X0, X1;
int XMin, XMax;
int XMinY, XMaxY;
int InsideStart, InsideEnd, EdgeCovL, EdgeCovR;
int Variant;
int XRecip;
uint Attr;
uint Flags;
float TextureLayer;
int Z0, Z1, W0, W1;
int ColorR0, ColorG0, ColorB0;
int ColorR1, ColorG1, ColorB1;
int TexcoordU0, TexcoordV0;
int TexcoordU1, TexcoordV1;
int CovLInitial, CovRInitial;
#if defined(Rasterise)
int CalcYFactorX(XSpanSetup span, int x)
x -= span.X0;
if (span.X0 != span.X1)
uint numLo = uint(x) * uint(span.W0);
uint numHi = 0U;
numHi |= numLo >> (32U-YFactorShift);
numLo <<= YFactorShift;
uint den = uint(x) * uint(span.W0) + uint(span.X1 - span.X0 - x) * uint(span.W1);
if (den == 0)
return 0;
return int(Div64_32_32(numHi, numLo, den));
return 0;
layout (std430, binding = 1) buffer XSpanSetupsBuffer
XSpanSetup XSpanSetups[];
const std::string YSpanSetupBuffer{R"(
struct YSpanSetup
// Attributes
@ -113,53 +159,185 @@ struct YSpanSetup
bool IsDummy;
const uint XSpanSetup_Linear = 1U << 0;
const uint XSpanSetup_FillInside = 1U << 1;
const uint XSpanSetup_FillLeft = 1U << 2;
const uint XSpanSetup_FillRight = 1U << 3;
struct XSpanSetup
#if defined(InterpSpans)
int CalcYFactorY(YSpanSetup span, int i)
int X0, X1;
maybe it would be better to do use a 32x32=64 multiplication?
uint numLo = uint(abs(i)) * uint(span.W0n);
uint numHi = 0U;
numHi |= numLo >> (32U-YFactorShift);
numLo <<= YFactorShift;
int InsideStart, InsideEnd, EdgeCovL, EdgeCovR;
uint den = uint(abs(i)) * uint(span.W0d) + uint(abs(span.I1 - span.I0 - i)) * span.W1d;
int XRecip;
if (den == 0)
return 0;
return int(Div64_32_32(numHi, numLo, den));
uint Flags;
int CalculateDx(int y, YSpanSetup span)
return span.DxInitial + (y - span.Y0) * span.Increment;
int Z0, Z1, W0, W1;
int ColorR0, ColorG0, ColorB0;
int ColorR1, ColorG1, ColorB1;
int TexcoordU0, TexcoordV0;
int TexcoordU1, TexcoordV1;
int CalculateX(int dx, YSpanSetup span)
int x = span.X0;
if (span.X1 < span.X0)
x -= dx >> 18;
x += dx >> 18;
return clamp(x, span.XMin, span.XMax);
int CovLInitial, CovRInitial;
void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
bool negative = span.X1 < span.X0;
int len;
if (side != negative)
len = (dx >> 18) - ((dx-span.Increment) >> 18);
len = ((dx+span.Increment) >> 18) - (dx >> 18);
edgelen = len;
layout (std140, binding = 0) readonly buffer YSpanSetupsBuffer
int xlen = span.XMax + 1 - span.XMin;
int startx = dx >> 18;
if (negative) startx = xlen - startx;
if (side) startx = startx - len + 1;
uint r;
int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r));
edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF);
void EdgeParams_YMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
bool negative = span.X1 < span.X0;
edgelen = 1;
if (span.Increment == 0)
edgecov = 31;
int cov = ((dx >> 9) + (span.Increment >> 10)) >> 4;
if ((cov >> 5) != (dx >> 18)) cov = 31;
cov &= 0x1F;
if (side == negative) cov = 0x1F - cov;
edgecov = cov;
layout (std430, binding = 2) buffer YSpanSetupsBuffer
YSpanSetup YSpanSetups[];
#if defined(InterpSpans) || defined(BinCombined) || defined(Rasterise)
layout (std140, binding = 1)
#ifdef InterpSpans
#if defined(BinCombined) || defined(Rasterise)
buffer XSpanSetupsBuffer
XSpanSetup XSpanSetups[];
layout (std140, binding = 2) readonly buffer PolygonBuffer
const std::string PolygonBuffer{R"(
struct Polygon
int FirstXSpan;
int YTop, YBot;
int XMin, XMax;
int XMinY, XMaxY;
int Variant;
uint Attr;
float TextureLayer;
layout (std430, binding = 0) readonly buffer PolygonBuffer
Polygon Polygons[];
const std::string BinningBuffer{R"(
layout (std430, binding = 6) buffer BinResultBuffer
uvec4 VariantWorkCount[MaxVariants];
uint SortedWorkOffset[MaxVariants];
uvec4 SortWorkWorkCount;
uint BinningMaskAndOffset[];
//uint BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride];
//uint BinnedMask[TilesPerLine*TileLines*BinStride];
//uint WorkOffsets[TilesPerLine*TileLines*BinStride];
const int BinningCoarseMaskStart = 0;
const int BinningMaskStart = BinningCoarseMaskStart+TilesPerLine*TileLines*CoarseBinStride;
const int BinningWorkOffsetsStart = BinningMaskStart+TilesPerLine*TileLines*BinStride;
structure of each WorkDesc item:
bits 0-11: polygon idx
bits 12-31: tile idx (before sorting: within variant; after sorting: within all tiles)
bits 0-15: X position on screen
bits 16-31: Y position on screen
const std::string WorkDescBuffer{R"(
layout (std430, binding = 7) buffer WorkDescBuffer
//uvec2 UnsortedWorkDescs[MaxWorkTiles];
//uvec2 SortedWorkDescs[MaxWorkTiles];
uvec2 WorkDescs[];
const uint WorkDescsUnsortedStart = 0;
const uint WorkDescsSortedStart = WorkDescsUnsortedStart+MaxWorkTiles;
const std::string Tilebuffers{R"(
layout (std430, binding = 2) buffer ColorTileBuffer
uint ColorTiles[];
layout (std430, binding = 3) buffer DepthTileBuffer
uint DepthTiles[];
layout (std430, binding = 4) buffer AttrTileBuffer
uint AttrTiles[];
const std::string ResultBuffer{R"(
layout (std430, binding = 5) buffer ResultBuffer
uint ResultValue[];
const uint ResultColorStart = 0;
const uint ResultDepthStart = ResultColorStart+ScreenWidth*ScreenHeight*2;
const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2;
const char* Common = R"(
#define TileSize 8
const int CoarseTileCountX = 8;
@ -174,56 +352,8 @@ const int TileLines = ScreenHeight/TileSize;
const int BinStride = 2048/32;
const int CoarseBinStride = BinStride/32;
const int MaxVariants = 256;
layout (std430, binding = 3)
buffer BinResultBuffer
uvec4 VariantWorkCount[MaxVariants];
uint SortedWorkOffset[MaxVariants];
uvec4 SortWorkWorkCount;
uvec2 UnsortedWorkDescs[MaxWorkTiles];
uvec2 SortedWork[MaxWorkTiles];
uint BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride];
uint BinnedMask[TilesPerLine*TileLines*BinStride];
uint WorkOffsets[TilesPerLine*TileLines*BinStride];
#if defined(Rasterise) || defined(DepthBlend)
layout (std430, binding = 4)
#ifdef Rasterise
#ifdef DepthBlend
buffer TilesBuffer
uint ColorTiles[MaxWorkTiles*TileSize*TileSize];
uint DepthTiles[MaxWorkTiles*TileSize*TileSize];
uint AttrTiles[MaxWorkTiles*TileSize*TileSize];
#if defined(DepthBlend) || defined(FinalPass)
layout (std430, binding = 5)
#ifdef DepthBlend
#ifdef FinalPass
buffer RasterResult
uint ColorResult[ScreenWidth*ScreenHeight*2];
uint DepthResult[ScreenWidth*ScreenHeight*2];
uint AttrResult[ScreenWidth*ScreenHeight*2];
layout (std140, binding = 0) uniform MetaUniform
uint NumPolygons;
@ -243,6 +373,12 @@ layout (std140, binding = 0) uniform MetaUniform
uint FogOffset, FogShift, FogColor;
#ifdef InterpSpans
const int YFactorShift = 9;
const int YFactorShift = 8;
#if defined(InterpSpans) || defined(Rasterise)
uint Umulh(uint a, uint b)
@ -338,58 +474,6 @@ uint Div64_32_32(uint numHi, uint numLo, uint den)
return bitfieldInsert(qhat, q1, 16, 16);
#ifdef InterpSpans
const int YFactorShift = 9;
const int YFactorShift = 8;
int CalcYFactorY(YSpanSetup span, int i)
maybe it would be better to do use a 32x32=64 multiplication?
uint numLo = uint(abs(i)) * uint(span.W0n);
uint numHi = 0U;
numHi |= numLo >> (32U-YFactorShift);
numLo <<= YFactorShift;
uint den = uint(abs(i)) * uint(span.W0d) + uint(abs(span.I1 - span.I0 - i)) * span.W1d;
if (den == 0)
return 0;
return int(Div64_32_32(numHi, numLo, den));
int CalcYFactorX(XSpanSetup span, int x)
x -= span.X0;
if (span.X0 != span.X1)
uint numLo = uint(x) * uint(span.W0);
uint numHi = 0U;
numHi |= numLo >> (32U-YFactorShift);
numLo <<= YFactorShift;
uint den = uint(x) * uint(span.W0) + uint(span.X1 - span.X0 - x) * uint(span.W1);
if (den == 0)
return 0;
return int(Div64_32_32(numHi, numLo, den));
return 0;
int InterpolateAttrPersp(int y0, int y1, int ifactor)
if (y0 == y1)
@ -548,67 +632,14 @@ uint InterpolateZWBuffer(int z0, int z1, int ifactor)
return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<<YFactorShift)-ifactor)) >> YFactorShift);
int CalculateDx(int y, YSpanSetup span)
return span.DxInitial + (y - span.Y0) * span.Increment;
int CalculateX(int dx, YSpanSetup span)
int x = span.X0;
if (span.X1 < span.X0)
x -= dx >> 18;
x += dx >> 18;
return clamp(x, span.XMin, span.XMax);
void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
bool negative = span.X1 < span.X0;
int len;
if (side != negative)
len = (dx >> 18) - ((dx-span.Increment) >> 18);
len = ((dx+span.Increment) >> 18) - (dx >> 18);
edgelen = len;
int xlen = span.XMax + 1 - span.XMin;
int startx = dx >> 18;
if (negative) startx = xlen - startx;
if (side) startx = startx - len + 1;
uint r;
int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r));
edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF);
void EdgeParams_YMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
bool negative = span.X1 < span.X0;
edgelen = 1;
if (span.Increment == 0)
edgecov = 31;
int cov = ((dx >> 9) + (span.Increment >> 10)) >> 4;
if ((cov >> 5) != (dx >> 18)) cov = 31;
cov &= 0x1F;
if (side == negative) cov = 0x1F - cov;
edgecov = cov;
const char* InterpSpans = R"(
const std::string InterpSpans =
PolygonBuffer +
XSpanSetupBuffer +
YSpanSetupBuffer + R"(
layout (local_size_x = 32) in;
layout (binding = 0, rgba16ui) uniform readonly uimageBuffer SetupIndices;
@ -803,7 +834,8 @@ void main()
const char* ClearIndirectWorkCount = R"(
const std::string ClearIndirectWorkCount =
BinningBuffer + R"(
layout (local_size_x = 32) in;
@ -814,19 +846,23 @@ void main()
const char* ClearCoarseBinMask = R"(
const std::string ClearCoarseBinMask =
BinningBuffer + R"(
layout (local_size_x = 32) in;
void main()
BinnedMaskCoarse[gl_GlobalInvocationID.x*CoarseBinStride+0] = 0;
BinnedMaskCoarse[gl_GlobalInvocationID.x*CoarseBinStride+1] = 0;
BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+0] = 0;
BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+1] = 0;
const char* BinCombined = R"(
const std::string BinCombined =
PolygonBuffer +
BinningBuffer +
XSpanSetupBuffer +
WorkDescBuffer + R"(
layout (local_size_x = 32) in;
@ -942,15 +978,15 @@ void main()
int linearTile = fineTile.x + fineTile.y * TilesPerLine + coarseTile.x * CoarseTileCountX + coarseTile.y * TilesPerLine * CoarseTileCountY;
BinnedMask[linearTile * BinStride + groupIdx] = binnedMask;
BinningMaskAndOffset[BinningMaskStart + linearTile * BinStride + groupIdx] = binnedMask;
int coarseMaskIdx = linearTile * CoarseBinStride + (groupIdx >> 5);
if (binnedMask != 0U)
atomicOr(BinnedMaskCoarse[coarseMaskIdx], 1U << (groupIdx & 0x1F));
atomicOr(BinningMaskAndOffset[BinningCoarseMaskStart + coarseMaskIdx], 1U << (groupIdx & 0x1F));
if (binnedMask != 0U)
uint workOffset = atomicAdd(VariantWorkCount[0].w, uint(bitCount(binnedMask)));
WorkOffsets[linearTile * BinStride + groupIdx] = workOffset;
BinningMaskAndOffset[BinningWorkOffsetsStart + linearTile * BinStride + groupIdx] = workOffset;
uint tilePositionCombined = bitfieldInsert(fineTileTopLeft.x, fineTileTopLeft.y, 16, 16);
@ -964,7 +1000,7 @@ void main()
int variantIdx = Polygons[polygonIdx].Variant;
int inVariantOffset = int(atomicAdd(VariantWorkCount[variantIdx].z, 1));
UnsortedWorkDescs[workOffset + idx] = uvec2(tilePositionCombined, bitfieldInsert(inVariantOffset, polygonIdx, 16, 16));
WorkDescs[WorkDescsUnsortedStart + workOffset + idx] = uvec2(tilePositionCombined, bitfieldInsert(polygonIdx, inVariantOffset, 12, 20));
@ -973,7 +1009,8 @@ void main()
const char* CalcOffsets = R"(
const std::string CalcOffsets =
BinningBuffer + R"(
layout (local_size_x = 32) in;
@ -993,7 +1030,10 @@ void main()
const char* SortWork = R"(
const std::string SortWork =
PolygonBuffer +
BinningBuffer +
WorkDescBuffer + R"(
layout (local_size_x = 32) in;
@ -1001,19 +1041,24 @@ void main()
if (gl_GlobalInvocationID.x < VariantWorkCount[0].w)
uvec2 workDesc = UnsortedWorkDescs[gl_GlobalInvocationID.x];
int inVariantOffset = int(bitfieldExtract(workDesc.y, 0, 16));
int polygonIdx = int(bitfieldExtract(workDesc.y, 16, 16));
uvec2 workDesc = WorkDescs[WorkDescsUnsortedStart + gl_GlobalInvocationID.x];
int inVariantOffset = int(bitfieldExtract(workDesc.y, 12, 20));
int polygonIdx = int(bitfieldExtract(workDesc.y, 0, 12));
int variantIdx = Polygons[polygonIdx].Variant;
int sortedIndex = int(SortedWorkOffset[variantIdx]) + inVariantOffset;
SortedWork[sortedIndex] = uvec2(workDesc.x, bitfieldInsert(workDesc.y, gl_GlobalInvocationID.x, 0, 16));
WorkDescs[WorkDescsSortedStart + sortedIndex] = uvec2(workDesc.x, bitfieldInsert(workDesc.y, gl_GlobalInvocationID.x, 12, 20));
const char* Rasterise = R"(
const std::string Rasterise =
PolygonBuffer +
WorkDescBuffer +
XSpanSetupBuffer +
BinningBuffer +
Tilebuffers + R"(
layout (local_size_x = TileSize, local_size_y = TileSize) in;
@ -1024,10 +1069,10 @@ layout (location = 1) uniform vec2 InvTextureSize;
void main()
uvec2 workDesc = SortedWork[SortedWorkOffset[CurVariant] + gl_WorkGroupID.z];
Polygon polygon = Polygons[bitfieldExtract(workDesc.y, 16, 16)];
uvec2 workDesc = WorkDescs[WorkDescsSortedStart + SortedWorkOffset[CurVariant] + gl_WorkGroupID.z];
Polygon polygon = Polygons[bitfieldExtract(workDesc.y, 0, 12)];
ivec2 position = ivec2(bitfieldExtract(workDesc.x, 0, 16), bitfieldExtract(workDesc.x, 16, 16)) + ivec2(gl_LocalInvocationID.xy);
int tileOffset = int(bitfieldExtract(workDesc.y, 0, 16)) * TileSize * TileSize + TileSize * int(gl_LocalInvocationID.y) + int(gl_LocalInvocationID.x);
int tileOffset = int(bitfieldExtract(workDesc.y, 12, 20)) * TileSize * TileSize + TileSize * int(gl_LocalInvocationID.y) + int(gl_LocalInvocationID.x);
uint color = 0U;
if (position.y >= polygon.YTop && position.y < polygon.YBot)
@ -1203,7 +1248,11 @@ void main()
const char* DepthBlend = R"(
const std::string DepthBlend =
PolygonBuffer +
Tilebuffers +
ResultBuffer +
BinningBuffer + R"(
layout (local_size_x = TileSize, local_size_y = TileSize) in;
@ -1253,8 +1302,8 @@ void ProcessCoarseMask(int linearTile, uint coarseMask, uint coarseOffset,
uint tileOffset = linearTile * BinStride + coarseBit + coarseOffset;
uint fineMask = BinnedMask[tileOffset];
uint workIdx = WorkOffsets[tileOffset];
uint fineMask = BinningMaskAndOffset[BinningMaskStart + tileOffset];
uint workIdx = BinningMaskAndOffset[BinningWorkOffsetsStart + tileOffset];
while (fineMask != 0U)
@ -1403,8 +1452,8 @@ void main()
int linearTile = int(gl_WorkGroupID.x + (gl_WorkGroupID.y * TilesPerLine));
uint coarseMaskLo = BinnedMaskCoarse[linearTile*CoarseBinStride + 0];
uint coarseMaskHi = BinnedMaskCoarse[linearTile*CoarseBinStride + 1];
uint coarseMaskLo = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 0];
uint coarseMaskHi = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 1];
uvec2 color = uvec2(ClearColor, 0U);
uvec2 depth = uvec2(ClearDepth, 0U);
@ -1416,17 +1465,18 @@ void main()
ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask);
int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * ScreenWidth;
ColorResult[resultOffset] = color.x;
ColorResult[resultOffset+FramebufferStride] = color.y;
DepthResult[resultOffset] = depth.x;
DepthResult[resultOffset+FramebufferStride] = depth.y;
AttrResult[resultOffset] = attr.x;
AttrResult[resultOffset+FramebufferStride] = attr.y;
ResultValue[ResultColorStart+resultOffset] = color.x;
ResultValue[ResultColorStart+resultOffset+FramebufferStride] = color.y;
ResultValue[ResultDepthStart+resultOffset] = depth.x;
ResultValue[ResultDepthStart+resultOffset+FramebufferStride] = depth.y;
ResultValue[ResultAttrStart+resultOffset] = attr.x;
ResultValue[ResultAttrStart+resultOffset+FramebufferStride] = attr.y;
const char* FinalPass = R"(
const std::string FinalPass =
ResultBuffer + R"(
layout (local_size_x = 32) in;
@ -1481,9 +1531,9 @@ void main()
int srcX = int(gl_GlobalInvocationID.x);
int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * ScreenWidth;
uvec2 color = uvec2(ColorResult[resultOffset], ColorResult[resultOffset+FramebufferStride]);
uvec2 depth = uvec2(DepthResult[resultOffset], DepthResult[resultOffset+FramebufferStride]);
uvec2 attr = uvec2(AttrResult[resultOffset], AttrResult[resultOffset+FramebufferStride]);
uvec2 color = uvec2(ResultValue[resultOffset+ResultColorStart], ResultValue[resultOffset+FramebufferStride+ResultColorStart]);
uvec2 depth = uvec2(ResultValue[resultOffset+ResultDepthStart], ResultValue[resultOffset+FramebufferStride+ResultDepthStart]);
uvec2 attr = uvec2(ResultValue[resultOffset+ResultAttrStart], ResultValue[resultOffset+FramebufferStride+ResultAttrStart]);
#ifdef EdgeMarking
if ((attr.x & 0xFU) != 0U)
@ -1493,23 +1543,23 @@ void main()
if (srcX > 0U)
otherAttr.x = AttrResult[resultOffset-1];
otherDepth.x = DepthResult[resultOffset-1];
otherAttr.x = ResultValue[resultOffset-1+ResultAttrStart];
otherDepth.x = ResultValue[resultOffset-1+ResultDepthStart];
if (srcX < ScreenWidth-1)
otherAttr.y = AttrResult[resultOffset+1];
otherDepth.y = DepthResult[resultOffset+1];
otherAttr.y = ResultValue[resultOffset+1+ResultAttrStart];
otherDepth.y = ResultValue[resultOffset+1+ResultDepthStart];
if (gl_GlobalInvocationID.y > 0U)
otherAttr.z = AttrResult[resultOffset-ScreenWidth];
otherDepth.z = DepthResult[resultOffset-ScreenWidth];
otherAttr.z = ResultValue[resultOffset-ScreenWidth+ResultAttrStart];
otherDepth.z = ResultValue[resultOffset-ScreenWidth+ResultDepthStart];
if (gl_GlobalInvocationID.y < ScreenHeight-1)
otherAttr.w = AttrResult[resultOffset+ScreenWidth];
otherDepth.w = DepthResult[resultOffset+ScreenWidth];
otherAttr.w = ResultValue[resultOffset+ScreenWidth+ResultAttrStart];
otherDepth.w = ResultValue[resultOffset+ScreenWidth+ResultDepthStart];
uint polyId = bitfieldExtract(attr.x, 24, 6);
@ -28,26 +28,22 @@
namespace GPU3D
bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs)
bool GLRenderer::BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs)
char shadername[32];
sprintf(shadername, "RenderShader%02X", flags);
int headerlen = strlen(kShaderHeader);
int vslen = strlen(vs);
int vsclen = strlen(kRenderVSCommon);
char* vsbuf = new char[headerlen + vsclen + vslen + 1];
strcpy(&vsbuf[0], kShaderHeader);
strcpy(&vsbuf[headerlen], kRenderVSCommon);
strcpy(&vsbuf[headerlen + vsclen], vs);
std::string vsbuf;
vsbuf += kShaderHeader;
vsbuf += kRenderVSCommon;
vsbuf += vs;
int fslen = strlen(fs);
int fsclen = strlen(kRenderFSCommon);
char* fsbuf = new char[headerlen + fsclen + fslen + 1];
strcpy(&fsbuf[0], kShaderHeader);
strcpy(&fsbuf[headerlen], kRenderFSCommon);
strcpy(&fsbuf[headerlen + fsclen], fs);
std::string fsbuf;
fsbuf += kShaderHeader;
fsbuf += kRenderFSCommon;
fsbuf += fs;
GLuint prog;
bool ret = OpenGL::CompileVertexFragmentProgram(prog,
@ -56,9 +52,6 @@ bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs)
{{"vPosition", 0}, {"vColor", 1}, {"vTexcoord", 2}, {"vPolygonAttr", 3}},
{{"oColor", 0}, {"oAttr", 1}});
delete[] vsbuf;
delete[] fsbuf;
if (!ret) return false;
GLint uni_id = glGetUniformBlockIndex(prog, "uConfig");
@ -67,7 +67,7 @@ private:
RendererPolygon PolygonList[2048] {};
bool BuildRenderShader(u32 flags, const char* vs, const char* fs);
bool BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs);
void UseRenderShader(u32 flags);
void SetupPolygon(RendererPolygon* rp, Polygon* polygon);
u32* SetupVertex(Polygon* poly, int vid, Vertex* vtx, u32 vtxattr, u32* vptr);
@ -18,15 +18,174 @@
#include "OpenGLSupport.h"
#include <unordered_map>
#include <vector>
#include <assert.h>
#include "xxhash/xxhash.h"
using Platform::Log;
using Platform::LogLevel;
namespace OpenGL
bool CompilerShader(GLuint& id, const char* source, const char* name, const char* type)
struct ShaderCacheEntry
u32 Length;
u8* Data;
u32 BinaryFormat;
ShaderCacheEntry(u8* data, u32 length, u32 binaryFmt)
: Length(length), Data(data), BinaryFormat(binaryFmt)
assert(data != nullptr);
ShaderCacheEntry(const ShaderCacheEntry&) = delete;
ShaderCacheEntry(ShaderCacheEntry&& other)
Data = other.Data;
Length = other.Length;
BinaryFormat = other.BinaryFormat;
other.Data = nullptr;
other.Length = 0;
other.BinaryFormat = 0;
if (Data) // check whether it was moved
delete[] Data;
std::unordered_map<u64, ShaderCacheEntry> ShaderCache;
std::vector<u64> NewShaders;
constexpr u32 ShaderCacheMagic = 0x11CAC4E1;
constexpr u32 ShaderCacheVersion = 1;
void LoadShaderCache()
// for now the shader cache contains only compute shaders
// because they take the longest to compile
FILE* file = Platform::OpenLocalFile("shadercache", "rb");
if (file == nullptr)
Log(LogLevel::Error, "Could not find shader cache\n");
u32 magic, version, numPrograms;
if (fread(&magic, 4, 1, file) != 1 || magic != ShaderCacheMagic)
Log(LogLevel::Error, "Shader cache file has invalid magic\n");
goto fileInvalid;
if (fread(&version, 4, 1, file) != 1 || version != ShaderCacheVersion)
Log(LogLevel::Error, "Shader cache file has bad version\n");
goto fileInvalid;
if (fread(&numPrograms, 4, 1, file) != 1)
Log(LogLevel::Error, "Shader cache file invalid program count\n");
goto fileInvalid;
// not the best approach, because once changes pile up
// we read and overwrite the old files
for (u32 i = 0; i < numPrograms; i++)
int error = 3;
u32 length, binaryFormat;
u64 sourceHash;
error -= fread(&sourceHash, 8, 1, file);
error -= fread(&length, 4, 1, file);
error -= fread(&binaryFormat, 4, 1, file);
if (error != 0)
Log(LogLevel::Error, "Invalid shader cache entry\n");
goto fileInvalid;
u8* data = new u8[length];
if (fread(data, length, 1, file) != 1)
Log(LogLevel::Error, "Could not read shader cache entry data\n");
delete[] data;
goto fileInvalid;
ShaderCache.emplace(sourceHash, ShaderCacheEntry(data, length, binaryFormat));
void SaveShaderCache()
FILE* file = Platform::OpenLocalFile("shadercache", "rb+");
if (file == nullptr)
file = Platform::OpenLocalFile("shadercache", "wb");
if (file == nullptr)
Log(LogLevel::Error, "Could not open or create shader cache file\n");
int written = 3;
u32 magic = ShaderCacheMagic, version = ShaderCacheVersion, numPrograms = ShaderCache.size();
written -= fwrite(&magic, 4, 1, file);
written -= fwrite(&version, 4, 1, file);
written -= fwrite(&numPrograms, 4, 1, file);
if (written != 0)
Log(LogLevel::Error, "Could not write shader cache header\n");
goto writeError;
fseek(file, 0, SEEK_END);
printf("new shaders %d\n", NewShaders.size());
for (u64 newShader : NewShaders)
int error = 4;
auto it = ShaderCache.find(newShader);
error -= fwrite(&it->first, 8, 1, file);
error -= fwrite(&it->second.Length, 4, 1, file);
error -= fwrite(&it->second.BinaryFormat, 4, 1, file);
error -= fwrite(it->second.Data, it->second.Length, 1, file);
if (error != 0)
Log(LogLevel::Error, "Could not insert new shader cache entry\n");
goto writeError;
bool CompilerShader(GLuint& id, const std::string& source, const std::string& name, const std::string& type)
int len;
int res;
if (!glCreateShader)
@ -35,8 +194,10 @@ bool CompilerShader(GLuint& id, const char* source, const char* name, const char
return false;
len = strlen(source);
glShaderSource(id, 1, &source, &len);
const char* sourceC = source.c_str();
int len = source.length();
glShaderSource(id, 1, &sourceC, &len);
glGetShaderiv(id, GL_COMPILE_STATUS, &res);
@ -46,8 +207,8 @@ bool CompilerShader(GLuint& id, const char* source, const char* name, const char
if (res < 1) res = 1024;
char* log = new char[res+1];
glGetShaderInfoLog(id, res+1, NULL, log);
Log(LogLevel::Error, "OpenGL: failed to compile %s shader %s: %s\n", type, name, log);
Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", source);
Log(LogLevel::Error, "OpenGL: failed to compile %s shader %s: %s\n", type.c_str(), name.c_str(), log);
Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", source.c_str());
delete[] log;
return false;
@ -92,8 +253,29 @@ bool LinkProgram(GLuint& result, GLuint* ids, int numIds)
return true;
bool CompileComputeProgram(GLuint& result, const char* source, const char* name)
bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name)
result = glCreateProgram();
/*u64 sourceHash = XXH64(, source.size(), 0);
auto it = ShaderCache.find(sourceHash);
if (it != ShaderCache.end())
glProgramBinary(result, it->second.BinaryFormat, it->second.Data, it->second.Length);
GLint linkStatus;
glGetProgramiv(result, GL_LINK_STATUS, &linkStatus);
if (linkStatus == GL_TRUE)
Log(LogLevel::Info, "Restored shader %s from cache\n", name.c_str());
return true;
Log(LogLevel::Error, "Shader %s from cache was rejected\n", name.c_str());
GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
bool linkingSucess = false;
if (glDeleteProgram)
@ -101,9 +283,6 @@ bool CompileComputeProgram(GLuint& result, const char* source, const char* name)
goto error;
result = glCreateProgram();
printf("compiling %s", name);
if (!CompilerShader(shader, source, name, "compute"))
goto error;
@ -113,14 +292,28 @@ error:
if (!linkingSucess)
GLint length;
GLenum format;
glGetProgramiv(result, GL_PROGRAM_BINARY_LENGTH, &length);
u8* buffer = new u8[length];
glGetProgramBinary(result, length, nullptr, &format, buffer);
ShaderCache.emplace(sourceHash, ShaderCacheEntry(buffer, length, format));
return linkingSucess;
bool CompileVertexFragmentProgram(GLuint& result,
const char* vs, const char* fs,
const char* name,
const std::string& vs, const std::string& fs,
const std::string& name,
const std::initializer_list<AttributeTarget>& vertexInAttrs,
const std::initializer_list<AttributeTarget>& fragmentOutAttrs)
@ -29,19 +29,23 @@
namespace OpenGL
void LoadShaderCache();
void SaveShaderCache();
struct AttributeTarget
const char* Name;
u32 Location;
bool CompileVertexFragmentProgram(GLuint& result,
const char* vs, const char* fs,
const char* name,
const std::string& vs, const std::string& fs,
const std::string& name,
const std::initializer_list<AttributeTarget>& vertexInAttrs,
const std::initializer_list<AttributeTarget>& fragmentOutAttrs);
bool CompileComputeProgram(GLuint& result, const char* source, const char* name);
bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name);
Reference in New Issue