add hires rendering to the compute shader renderer

This commit is contained in:
RSDuck 2023-04-17 00:07:47 +02:00
parent ea719ff4a4
commit 60a3fe24ed
3 changed files with 331 additions and 230 deletions

View File

@ -52,6 +52,13 @@ bool ComputeRenderer::CompileShader(GLuint& shader, const char* source, const st
shaderName += define; shaderName += define;
shaderName += ','; shaderName += ',';
} }
shaderSource += "#define ScreenWidth ";
shaderSource += std::to_string(ScreenWidth);
shaderSource += "\n#define ScreenHeight ";
shaderSource += std::to_string(ScreenHeight);
shaderSource += "\n#define MaxWorkTiles ";
shaderSource += std::to_string(MaxWorkTiles);
shaderSource += ComputeRendererShaders::Common; shaderSource += ComputeRendererShaders::Common;
shaderSource += source; shaderSource += source;
@ -65,8 +72,8 @@ void blah(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,con
bool ComputeRenderer::Init() bool ComputeRenderer::Init()
{ {
glDebugMessageCallback(blah, NULL); //glDebugMessageCallback(blah, NULL);
glEnable(GL_DEBUG_OUTPUT); //glEnable(GL_DEBUG_OUTPUT);
glGenBuffers(1, &YSpanSetupMemory); glGenBuffers(1, &YSpanSetupMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory); glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW); glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW);
@ -75,72 +82,22 @@ bool ComputeRenderer::Init()
glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory); glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW); glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW);
glGenBuffers(1, &TileMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(Tiles), nullptr, GL_DYNAMIC_DRAW);
glGenBuffers(1, &XSpanSetupMemory); glGenBuffers(1, &XSpanSetupMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*MaxYSpanIndices, nullptr, GL_DYNAMIC_DRAW);
glGenBuffers(1, &BinResultMemory); glGenBuffers(1, &BinResultMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(BinResult), nullptr, GL_DYNAMIC_DRAW);
glGenBuffers(1, &FinalTileMemory); glGenBuffers(1, &FinalTileMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(FinalTiles), nullptr, GL_DYNAMIC_DRAW);
glGenBuffers(1, &YSpanIndicesTextureMemory); glGenBuffers(1, &YSpanIndicesTextureMemory);
glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory); glGenBuffers(1, &TileMemory);
glBufferData(GL_TEXTURE_BUFFER, MaxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW);
glGenTextures(1, &YSpanIndicesTexture); glGenTextures(1, &YSpanIndicesTexture);
glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture);
glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory);
glGenTextures(1, &Framebuffer); glGenTextures(1, &Framebuffer);
glBindTexture(GL_TEXTURE_2D, Framebuffer); glGenTextures(1, &LowResFramebuffer);
glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, 256, 192); glBindTexture(GL_TEXTURE_2D, LowResFramebuffer);
glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8UI, 256, 192);
glGenBuffers(1, &MetaUniformMemory); glGenBuffers(1, &MetaUniformMemory);
glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory); glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory);
glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW); glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW);
CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans, {"InterpSpans", "ZBuffer"});
CompileShader(ShaderInterpXSpans[1], ComputeRendererShaders::InterpSpans, {"InterpSpans", "WBuffer"});
CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"});
CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend, {"DepthBlend", "ZBuffer"});
CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend, {"DepthBlend", "WBuffer"});
CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture"});
CompileShader(ShaderRasteriseNoTexture[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture"});
CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Toon"});
CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Toon"});
CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Highlight"});
CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Highlight"});
CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Decal"});
CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Decal"});
CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Modulate"});
CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Modulate"});
CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Toon"});
CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Toon"});
CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Highlight"});
CompileShader(ShaderRasteriseUseTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Highlight"});
CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "ShadowMask"});
CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "ShadowMask"});
CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"});
CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"});
CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"});
CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"});
CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"});
CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"});
CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"});
CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"});
CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"});
CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"});
CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"});
CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"});
glGenSamplers(9, Samplers); glGenSamplers(9, Samplers);
for (u32 j = 0; j < 3; j++) for (u32 j = 0; j < 3; j++)
{ {
@ -176,6 +133,12 @@ void ComputeRenderer::DeInit()
glDeleteTextures(1, &Framebuffer); glDeleteTextures(1, &Framebuffer);
glDeleteBuffers(1, &MetaUniformMemory); glDeleteBuffers(1, &MetaUniformMemory);
glDeleteSamplers(9, Samplers);
glDeleteBuffers(1, &PixelBuffer);
}
void ComputeRenderer::DeleteShaders()
{
std::initializer_list<GLuint> allPrograms = std::initializer_list<GLuint> allPrograms =
{ {
ShaderInterpXSpans[0], ShaderInterpXSpans[0],
@ -214,9 +177,6 @@ void ComputeRenderer::DeInit()
}; };
for (GLuint program : allPrograms) for (GLuint program : allPrograms)
glDeleteProgram(program); glDeleteProgram(program);
glDeleteSamplers(9, Samplers);
glDeleteBuffers(1, &PixelBuffer);
} }
void ComputeRenderer::ResetTexcache() void ComputeRenderer::ResetTexcache()
@ -241,7 +201,85 @@ void ComputeRenderer::Reset()
// Applies the render settings by (re)creating every resource whose size depends
// on the internal resolution: the work-tile, final-tile and bin-result storage
// buffers, the output framebuffer texture, the Y-span index buffers, and all
// compute shader programs.
//
// settings: only settings.GL_ScaleFactor is read; the 256x192 base resolution
//           is multiplied by it.
//
// The shaders are rebuilt on every call because CompileShader injects
// ScreenWidth, ScreenHeight and MaxWorkTiles into the shader source as #defines.
// NOTE(review): the results of CompileShader are not checked here, so a shader
// that fails to build is not reported by this function.
void ComputeRenderer::SetRenderSettings(GPU::RenderSettings& settings)
{
    // ScaleFactor starts out as -1; any other value means a previous call
    // already compiled a set of programs that has to be released first.
    if (ScaleFactor != -1)
    {
        DeleteShaders();
    }

    ScaleFactor = settings.GL_ScaleFactor;
    ScreenWidth = 256 * ScaleFactor;
    ScreenHeight = 192 * ScaleFactor;

    TilesPerLine = ScreenWidth/TileSize;
    TileLines = ScreenHeight/TileSize;

    MaxWorkTiles = TilesPerLine*TileLines*8;

    // Three u32 planes with TileSize*TileSize entries for every work tile.
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW);

    // Three u32 planes with two entries per screen pixel.
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW);

    // Fixed-size header followed by the resolution-dependent arrays (all u32).
    int binResultSize = sizeof(BinResultHeader)
        + MaxWorkTiles*2*4 // UnsortedWorkDescs
        + MaxWorkTiles*2*4 // SortedWork
        + TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse
        + TilesPerLine*TileLines*BinStride*4 // BinnedMask
        + TilesPerLine*TileLines*BinStride*4; // WorkOffsets
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW);

    // glTexStorage2D gives a texture immutable storage; calling it a second
    // time on the same texture object fails with GL_INVALID_OPERATION and
    // leaves the old size in place. Replace the texture object so that a
    // changed scale factor really resizes the framebuffer.
    // NOTE(review): assumes nothing caches the old texture name across calls.
    glDeleteTextures(1, &Framebuffer);
    glGenTextures(1, &Framebuffer);
    glBindTexture(GL_TEXTURE_2D, Framebuffer);
    glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, ScreenWidth, ScreenHeight);

    // eh those are pretty bad guesses
    // though real hw shouldn't be able to render all 2048 polygons on every line either
    int maxYSpanIndices = 64*2048 * ScaleFactor;
    YSpanIndices.resize(maxYSpanIndices);

    glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
    glBufferData(GL_TEXTURE_BUFFER, maxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW);

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*maxYSpanIndices, nullptr, GL_DYNAMIC_DRAW);

    // Attach the freshly sized index buffer to its buffer texture.
    glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture);
    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory);

    // Index [0] of each program pair is the ZBuffer variant, [1] the WBuffer one.
    CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans, {"InterpSpans", "ZBuffer"});
    CompileShader(ShaderInterpXSpans[1], ComputeRendererShaders::InterpSpans, {"InterpSpans", "WBuffer"});
    CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"});
    CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend, {"DepthBlend", "ZBuffer"});
    CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend, {"DepthBlend", "WBuffer"});
    CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture"});
    CompileShader(ShaderRasteriseNoTexture[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture"});
    CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Toon"});
    CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Toon"});
    CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Highlight"});
    CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Highlight"});
    CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Decal"});
    CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Decal"});
    CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Modulate"});
    CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Modulate"});
    CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Toon"});
    CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Toon"});
    CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Highlight"});
    CompileShader(ShaderRasteriseUseTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Highlight"});
    CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "ShadowMask"});
    CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "ShadowMask"});
    CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"});
    CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"});
    CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"});
    CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"});

    // Final pass variants, indexed by bit flags:
    // 0x1 = EdgeMarking, 0x2 = Fog, 0x4 = AntiAliasing.
    CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"});
    CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"});
    CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"});
    CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"});
    CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"});
    CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"});
    CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"});
    CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"});
}
void ComputeRenderer::VCount144() void ComputeRenderer::VCount144()
@ -267,9 +305,9 @@ void ComputeRenderer::SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int
span->TexcoordV1 = poly->Vertices[to]->TexCoords[1]; span->TexcoordV1 = poly->Vertices[to]->TexCoords[1];
} }
void ComputeRenderer::SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int vertex, int side) void ComputeRenderer::SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2])
{ {
s32 x0 = poly->Vertices[vertex]->FinalPosition[0]; s32 x0 = positions[vertex][0];
if (side) if (side)
{ {
span->DxInitial = -0x40000; span->DxInitial = -0x40000;
@ -283,7 +321,7 @@ void ComputeRenderer::SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int verte
span->X0 = span->X1 = x0; span->X0 = span->X1 = x0;
span->XMin = x0; span->XMin = x0;
span->XMax = x0; span->XMax = x0;
span->Y0 = span->Y1 = poly->Vertices[vertex]->FinalPosition[1]; span->Y0 = span->Y1 = positions[vertex][1];
span->Increment = 0; span->Increment = 0;
@ -297,12 +335,12 @@ void ComputeRenderer::SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int verte
SetupAttrs(span, poly, vertex, vertex); SetupAttrs(span, poly, vertex, vertex);
} }
void ComputeRenderer::SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, int from, int to, u32 y, int side) void ComputeRenderer::SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, int from, int to, u32 y, int side, s32 positions[10][2])
{ {
span->X0 = poly->Vertices[from]->FinalPosition[0]; span->X0 = positions[from][0];
span->X1 = poly->Vertices[to]->FinalPosition[0]; span->X1 = positions[to][0];
span->Y0 = poly->Vertices[from]->FinalPosition[1]; span->Y0 = positions[from][1];
span->Y1 = poly->Vertices[to]->FinalPosition[1]; span->Y1 = positions[to][1];
SetupAttrs(span, poly, from, to); SetupAttrs(span, poly, from, to);
@ -396,9 +434,6 @@ void ComputeRenderer::SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, i
span->I1 = span->Y1; span->I1 = span->Y1;
} }
//if (span->I1 < span->I0)
// std::swap(span->I0, span->I1);
if (span->I0 != span->I1) if (span->I0 != span->I1)
span->IRecip = (1<<30) / (span->I1 - span->I0); span->IRecip = (1<<30) / (span->I1 - span->I0);
else else
@ -940,14 +975,11 @@ void ComputeRenderer::RenderFrame()
u32 nverts = polygon->NumVertices; u32 nverts = polygon->NumVertices;
u32 vtop = polygon->VTop, vbot = polygon->VBottom; u32 vtop = polygon->VTop, vbot = polygon->VBottom;
s32 ytop = polygon->YTop, ybot = polygon->YBottom;
u32 curVL = vtop, curVR = vtop; u32 curVL = vtop, curVR = vtop;
u32 nextVL, nextVR; u32 nextVL, nextVR;
RenderPolygons[i].FirstXSpan = numSetupIndices; RenderPolygons[i].FirstXSpan = numSetupIndices;
RenderPolygons[i].YTop = ytop;
RenderPolygons[i].YBot = ybot;
RenderPolygons[i].Attr = polygon->Attr; RenderPolygons[i].Attr = polygon->Attr;
bool foundVariant = false; bool foundVariant = false;
@ -1034,10 +1066,22 @@ void ComputeRenderer::RenderFrame()
if (nextVR >= nverts) nextVR = 0; if (nextVR >= nverts) nextVR = 0;
} }
s32 minX = polygon->Vertices[vtop]->FinalPosition[0]; s32 scaledPositions[10][2];
s32 minXY = polygon->Vertices[vtop]->FinalPosition[1]; s32 ytop = ScreenHeight, ybot = 0;
s32 maxX = polygon->Vertices[vtop]->FinalPosition[0]; for (int i = 0; i < polygon->NumVertices; i++)
s32 maxXY = polygon->Vertices[vtop]->FinalPosition[1]; {
scaledPositions[i][0] = (polygon->Vertices[i]->HiresPosition[0] * ScaleFactor) >> 4;
scaledPositions[i][1] = (polygon->Vertices[i]->HiresPosition[1] * ScaleFactor) >> 4;
ytop = std::min(scaledPositions[i][1], ytop);
ybot = std::max(scaledPositions[i][1], ybot);
}
RenderPolygons[i].YTop = ytop;
RenderPolygons[i].YBot = ybot;
s32 minX = scaledPositions[vtop][0];
s32 minXY = scaledPositions[vtop][1];
s32 maxX = scaledPositions[vtop][0];
s32 maxXY = scaledPositions[vtop][1];
if (ybot == ytop) if (ybot == ytop)
{ {
@ -1046,19 +1090,19 @@ void ComputeRenderer::RenderFrame()
RenderPolygons[i].YBot++; RenderPolygons[i].YBot++;
int j = 1; int j = 1;
if (polygon->Vertices[j]->FinalPosition[0] < polygon->Vertices[vtop]->FinalPosition[0]) vtop = j; if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
if (polygon->Vertices[j]->FinalPosition[0] > polygon->Vertices[vbot]->FinalPosition[0]) vbot = j; if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
j = nverts - 1; j = nverts - 1;
if (polygon->Vertices[j]->FinalPosition[0] < polygon->Vertices[vtop]->FinalPosition[0]) vtop = j; if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
if (polygon->Vertices[j]->FinalPosition[0] > polygon->Vertices[vbot]->FinalPosition[0]) vbot = j; if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
assert(numYSpans < MaxYSpanSetups); assert(numYSpans < MaxYSpanSetups);
u32 curSpanL = numYSpans; u32 curSpanL = numYSpans;
SetupYSpanDummy(&YSpanSetups[numYSpans++], polygon, vtop, 0); SetupYSpanDummy(&YSpanSetups[numYSpans++], polygon, vtop, 0, scaledPositions);
assert(numYSpans < MaxYSpanSetups); assert(numYSpans < MaxYSpanSetups);
u32 curSpanR = numYSpans; u32 curSpanR = numYSpans;
SetupYSpanDummy(&YSpanSetups[numYSpans++], polygon, vbot, 1); SetupYSpanDummy(&YSpanSetups[numYSpans++], polygon, vbot, 1, scaledPositions);
minX = YSpanSetups[curSpanL].X0; minX = YSpanSetups[curSpanL].X0;
minXY = YSpanSetups[curSpanL].Y0; minXY = YSpanSetups[curSpanL].Y0;
@ -1070,7 +1114,6 @@ void ComputeRenderer::RenderFrame()
std::swap(minXY, maxXY); std::swap(minXY, maxXY);
} }
assert(numSetupIndices < MaxYSpanIndices);
YSpanIndices[numSetupIndices].PolyIdx = i; YSpanIndices[numSetupIndices].PolyIdx = i;
YSpanIndices[numSetupIndices].SpanIdxL = curSpanL; YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
YSpanIndices[numSetupIndices].SpanIdxR = curSpanR; YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
@ -1081,16 +1124,16 @@ void ComputeRenderer::RenderFrame()
{ {
u32 curSpanL = numYSpans; u32 curSpanL = numYSpans;
assert(numYSpans < MaxYSpanSetups); assert(numYSpans < MaxYSpanSetups);
SetupYSpan(i, &YSpanSetups[numYSpans++], polygon, curVL, nextVL, ytop, 0); SetupYSpan(i, &YSpanSetups[numYSpans++], polygon, curVL, nextVL, ytop, 0, scaledPositions);
u32 curSpanR = numYSpans; u32 curSpanR = numYSpans;
assert(numYSpans < MaxYSpanSetups); assert(numYSpans < MaxYSpanSetups);
SetupYSpan(i, &YSpanSetups[numYSpans++], polygon, curVR, nextVR, ytop, 1); SetupYSpan(i, &YSpanSetups[numYSpans++], polygon, curVR, nextVR, ytop, 1, scaledPositions);
for (u32 y = ytop; y < ybot; y++) for (u32 y = ytop; y < ybot; y++)
{ {
if (y >= polygon->Vertices[nextVL]->FinalPosition[1] && curVL != polygon->VBottom) if (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
{ {
while (y >= polygon->Vertices[nextVL]->FinalPosition[1] && curVL != polygon->VBottom) while (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
{ {
curVL = nextVL; curVL = nextVL;
if (polygon->FacingView) if (polygon->FacingView)
@ -1107,24 +1150,24 @@ void ComputeRenderer::RenderFrame()
} }
} }
if (polygon->Vertices[curVL]->FinalPosition[0] < minX) if (scaledPositions[curVL][0] < minX)
{ {
minX = polygon->Vertices[curVL]->FinalPosition[0]; minX = scaledPositions[curVL][0];
minXY = polygon->Vertices[curVL]->FinalPosition[1]; minXY = scaledPositions[curVL][1];
} }
if (polygon->Vertices[curVL]->FinalPosition[0] > maxX) if (scaledPositions[curVL][0] > maxX)
{ {
maxX = polygon->Vertices[curVL]->FinalPosition[0]; maxX = scaledPositions[curVL][0];
maxXY = polygon->Vertices[curVL]->FinalPosition[1]; maxXY = scaledPositions[curVL][1];
} }
assert(numYSpans < MaxYSpanSetups); assert(numYSpans < MaxYSpanSetups);
curSpanL = numYSpans; curSpanL = numYSpans;
SetupYSpan(i,&YSpanSetups[numYSpans++], polygon, curVL, nextVL, y, 0); SetupYSpan(i,&YSpanSetups[numYSpans++], polygon, curVL, nextVL, y, 0, scaledPositions);
} }
if (y >= polygon->Vertices[nextVR]->FinalPosition[1] && curVR != polygon->VBottom) if (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
{ {
while (y >= polygon->Vertices[nextVR]->FinalPosition[1] && curVR != polygon->VBottom) while (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
{ {
curVR = nextVR; curVR = nextVR;
if (polygon->FacingView) if (polygon->FacingView)
@ -1141,23 +1184,22 @@ void ComputeRenderer::RenderFrame()
} }
} }
if (polygon->Vertices[curVR]->FinalPosition[0] < minX) if (scaledPositions[curVR][0] < minX)
{ {
minX = polygon->Vertices[curVR]->FinalPosition[0]; minX = scaledPositions[curVR][0];
minXY = polygon->Vertices[curVR]->FinalPosition[1]; minXY = scaledPositions[curVR][1];
} }
if (polygon->Vertices[curVR]->FinalPosition[0] > maxX) if (scaledPositions[curVR][0] > maxX)
{ {
maxX = polygon->Vertices[curVR]->FinalPosition[0]; maxX = scaledPositions[curVR][0];
maxXY = polygon->Vertices[curVR]->FinalPosition[1]; maxXY = scaledPositions[curVR][1];
} }
assert(numYSpans < MaxYSpanSetups); assert(numYSpans < MaxYSpanSetups);
curSpanR = numYSpans; curSpanR = numYSpans;
SetupYSpan(i,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, y, 1); SetupYSpan(i,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, y, 1, scaledPositions);
} }
assert(numSetupIndices < MaxYSpanIndices);
YSpanIndices[numSetupIndices].PolyIdx = i; YSpanIndices[numSetupIndices].PolyIdx = i;
YSpanIndices[numSetupIndices].SpanIdxL = curSpanL; YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
YSpanIndices[numSetupIndices].SpanIdxR = curSpanR; YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
@ -1166,25 +1208,25 @@ void ComputeRenderer::RenderFrame()
} }
} }
if (polygon->Vertices[nextVL]->FinalPosition[0] < minX) if (scaledPositions[nextVL][0] < minX)
{ {
minX = polygon->Vertices[nextVL]->FinalPosition[0]; minX = scaledPositions[nextVL][0];
minXY = polygon->Vertices[nextVL]->FinalPosition[1]; minXY = scaledPositions[nextVL][1];
} }
if (polygon->Vertices[nextVL]->FinalPosition[0] > maxX) if (scaledPositions[nextVL][0] > maxX)
{ {
maxX = polygon->Vertices[nextVL]->FinalPosition[0]; maxX = scaledPositions[nextVL][0];
maxXY = polygon->Vertices[nextVL]->FinalPosition[1]; maxXY = scaledPositions[nextVL][1];
} }
if (polygon->Vertices[nextVR]->FinalPosition[0] < minX) if (scaledPositions[nextVR][0] < minX)
{ {
minX = polygon->Vertices[nextVR]->FinalPosition[0]; minX = scaledPositions[nextVR][0];
minXY = polygon->Vertices[nextVR]->FinalPosition[1]; minXY = scaledPositions[nextVR][1];
} }
if (polygon->Vertices[nextVR]->FinalPosition[0] > maxX) if (scaledPositions[nextVR][0] > maxX)
{ {
maxX = polygon->Vertices[nextVR]->FinalPosition[0]; maxX = scaledPositions[nextVR][0];
maxXY = polygon->Vertices[nextVR]->FinalPosition[1]; maxXY = scaledPositions[nextVR][1];
} }
RenderPolygons[i].XMin = minX; RenderPolygons[i].XMin = minX;
@ -1210,7 +1252,7 @@ void ComputeRenderer::RenderFrame()
glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups); glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups);
glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory); glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices); glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices.data());
glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory); glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory);
glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons); glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons);
@ -1278,8 +1320,6 @@ void ComputeRenderer::RenderFrame()
u32 fogA = (RenderFogColor >> 16) & 0x1F; u32 fogA = (RenderFogColor >> 16) & 0x1F;
meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24); meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24);
} }
meta.XScroll = 0;
//meta.XScroll = RenderXPos;
glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory); glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory);
glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta); glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta);
@ -1304,7 +1344,7 @@ void ComputeRenderer::RenderFrame()
// bin polygons // bin polygons
glUseProgram(ShaderBinCombined); glUseProgram(ShaderBinCombined);
glDispatchCompute(((RenderNumPolygons + 31) / 32), 256/CoarseTileW, 192/CoarseTileH); glDispatchCompute(((RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER); glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
// calculate list offsets // calculate list offsets
@ -1316,7 +1356,7 @@ void ComputeRenderer::RenderFrame()
// sort shader work // sort shader work
glUseProgram(ShaderSortWork); glUseProgram(ShaderSortWork);
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory); glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
glDispatchComputeIndirect(offsetof(BinResult, SortWorkWorkCount)); glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount));
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER); glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
glActiveTexture(GL_TEXTURE0); glActiveTexture(GL_TEXTURE0);
@ -1379,7 +1419,7 @@ void ComputeRenderer::RenderFrame()
glUniform1ui(UniformIdxCurVariant, i); glUniform1ui(UniformIdxCurVariant, i);
glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height); glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height);
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory); glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
glDispatchComputeIndirect(offsetof(BinResult, VariantWorkCount) + i*4*4); glDispatchComputeIndirect(offsetof(BinResultHeader, VariantWorkCount) + i*4*4);
} }
} }
} }
@ -1387,10 +1427,11 @@ void ComputeRenderer::RenderFrame()
// compose final image // compose final image
glUseProgram(ShaderDepthBlend[wbuffer]); glUseProgram(ShaderDepthBlend[wbuffer]);
glDispatchCompute(256/8, 192/8, 1); glDispatchCompute(ScreenWidth/TileSize, ScreenHeight/TileSize, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8); glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8);
glBindImageTexture(1, LowResFramebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI);
u32 finalPassShader = 0; u32 finalPassShader = 0;
if (RenderDispCnt & (1<<4)) if (RenderDispCnt & (1<<4))
finalPassShader |= 0x4; finalPassShader |= 0x4;
@ -1400,7 +1441,7 @@ void ComputeRenderer::RenderFrame()
finalPassShader |= 0x1; finalPassShader |= 0x1;
glUseProgram(ShaderFinalPass[finalPassShader]); glUseProgram(ShaderFinalPass[finalPassShader]);
glDispatchCompute(256/32, 192, 1); glDispatchCompute(ScreenWidth/32, ScreenHeight, 1);
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
/*u64 starttime = armGetSystemTick(); /*u64 starttime = armGetSystemTick();
@ -1467,15 +1508,6 @@ u32* ComputeRenderer::GetLine(int line)
glUnmapBuffer(GL_PIXEL_PACK_BUFFER); glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
} }
u64* ptr = (u64*)&FramebufferCPU[stride * line];
for (int i = 0; i < stride; i+=2)
{
u64 rgb = *ptr & 0x00FCFCFC00FCFCFC;
u64 a = *ptr & 0xF8000000F8000000;
*ptr++ = (rgb >> 2) | (a >> 3);
}
return &FramebufferCPU[stride * line]; return &FramebufferCPU[stride * line];
} }
@ -1487,8 +1519,8 @@ void ComputeRenderer::SetupAccelFrame()
void ComputeRenderer::PrepareCaptureFrame() void ComputeRenderer::PrepareCaptureFrame()
{ {
glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer); glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
glBindTexture(GL_TEXTURE_2D, Framebuffer); glBindTexture(GL_TEXTURE_2D, LowResFramebuffer);
glGetTexImage(GL_TEXTURE_2D, 0, GL_BGRA, GL_UNSIGNED_BYTE, nullptr); glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, nullptr);
} }
} }

View File

@ -141,57 +141,32 @@ private:
u32 __pad0, __pad1; u32 __pad0, __pad1;
}; };
static const int TileSize = 8; static constexpr int TileSize = 8;
static const int CoarseTileCountX = 8; static constexpr int CoarseTileCountX = 8;
static const int CoarseTileCountY = 4; static constexpr int CoarseTileCountY = 4;
static const int CoarseTileW = CoarseTileCountX * TileSize; static constexpr int CoarseTileW = CoarseTileCountX * TileSize;
static const int CoarseTileH = CoarseTileCountY * TileSize; static constexpr int CoarseTileH = CoarseTileCountY * TileSize;
static const int TilesPerLine = 256/TileSize; static constexpr int BinStride = 2048/32;
static const int TileLines = 192/TileSize; static constexpr int CoarseBinStride = BinStride/32;
static const int BinStride = 2048/32; static constexpr int MaxVariants = 256;
static const int CoarseBinStride = BinStride/32;
static const int MaxWorkTiles = TilesPerLine*TileLines*48; static constexpr int UniformIdxCurVariant = 0;
static const int MaxVariants = 256; static constexpr int UniformIdxTextureSize = 1;
static const int UniformIdxCurVariant = 0; static constexpr int MaxFullscreenLayers = 16;
static const int UniformIdxTextureSize = 1;
struct BinResult struct BinResultHeader
{ {
u32 VariantWorkCount[MaxVariants*4]; u32 VariantWorkCount[MaxVariants*4];
u32 SortedWorkOffset[MaxVariants]; u32 SortedWorkOffset[MaxVariants];
u32 SortWorkWorkCount[4]; u32 SortWorkWorkCount[4];
u32 UnsortedWorkDescs[MaxWorkTiles*2];
u32 SortedWork[MaxWorkTiles*2];
u32 BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride];
u32 BinnedMask[TilesPerLine*TileLines*BinStride];
u32 WorkOffsets[TilesPerLine*TileLines*BinStride];
}; };
struct Tiles
{
u32 ColorTiles[MaxWorkTiles*TileSize*TileSize];
u32 DepthTiles[MaxWorkTiles*TileSize*TileSize];
u32 AttrStencilTiles[MaxWorkTiles*TileSize*TileSize];
};
struct FinalTiles
{
u32 ColorResult[256*192*2];
u32 DepthResult[256*192*2];
u32 AttrResult[256*192*2];
};
// eh those are pretty bad guesses
// though real hw shouldn't be eable to render all 2048 polygons on every line either
static const int MaxYSpanIndices = 64*2048;
static const int MaxYSpanSetups = 6144*2; static const int MaxYSpanSetups = 6144*2;
SetupIndices YSpanIndices[MaxYSpanIndices]; std::vector<SetupIndices> YSpanIndices;
SpanSetupY YSpanSetups[MaxYSpanSetups]; SpanSetupY YSpanSetups[MaxYSpanSetups];
RenderPolygon RenderPolygons[2048]; RenderPolygon RenderPolygons[2048];
@ -228,8 +203,6 @@ private:
u32 ClearColor, ClearDepth, ClearAttr; u32 ClearColor, ClearDepth, ClearAttr;
u32 FogOffset, FogShift, FogColor; u32 FogOffset, FogShift, FogColor;
u32 XScroll;
}; };
GLuint MetaUniformMemory; GLuint MetaUniformMemory;
@ -241,17 +214,24 @@ private:
u32 TextureDecodingBuffer[1024*1024]; u32 TextureDecodingBuffer[1024*1024];
GLuint Framebuffer; GLuint Framebuffer;
GLuint LowResFramebuffer;
GLuint PixelBuffer; GLuint PixelBuffer;
u32 FramebufferCPU[256*192]; u32 FramebufferCPU[256*192];
TexCacheEntry& GetTexture(u32 textureParam, u32 paletteParam); TexCacheEntry& GetTexture(u32 textureParam, u32 paletteParam);
int ScreenWidth, ScreenHeight;
int TilesPerLine, TileLines;
int ScaleFactor = -1;
int MaxWorkTiles;
void ResetTexcache(); void ResetTexcache();
void DeleteShaders();
void SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to); void SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to);
void SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, int from, int to, u32 y, int side); void SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, int from, int to, u32 y, int side, s32 positions[10][2]);
void SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int vertex, int side); void SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]);
bool CompileShader(GLuint& shader, const char* source, const std::initializer_list<const char*>& defines); bool CompileShader(GLuint& shader, const char* source, const std::initializer_list<const char*>& defines);
}; };

View File

@ -52,6 +52,24 @@ namespace ComputeRendererShaders
// Highlight // Highlight
// ShadowMask // ShadowMask
/*
Some notes on signed division:
we want to avoid it, so that we can avoid higher precision numbers
in a few places.
Fortunately all divisions *should*, assuming I'm not mistaken,
have the same sign on the divisor and the dividend.
Thus we apply:
assuming (n < 0) <=> (d < 0), i.e. n and d are either both negative or both non-negative:
n/d = abs(n)/abs(d)
*/
const char* Common = R"( const char* Common = R"(
struct Polygon struct Polygon
{ {
@ -149,14 +167,14 @@ const int CoarseTileCountY = 4;
const int CoarseTileW = (CoarseTileCountX * TileSize); const int CoarseTileW = (CoarseTileCountX * TileSize);
const int CoarseTileH = (CoarseTileCountY * TileSize); const int CoarseTileH = (CoarseTileCountY * TileSize);
const int FramebufferStride = 256*192; const int FramebufferStride = ScreenWidth*ScreenHeight;
const int TilesPerLine = 256/TileSize; const int TilesPerLine = ScreenWidth/TileSize;
const int TileLines = 192/TileSize; const int TileLines = ScreenHeight/TileSize;
const int BinStride = 2048/32; const int BinStride = 2048/32;
const int CoarseBinStride = BinStride/32; const int CoarseBinStride = BinStride/32;
const int MaxWorkTiles = TilesPerLine*TileLines*48;
const int MaxVariants = 256; const int MaxVariants = 256;
layout (std430, binding = 3) layout (std430, binding = 3)
@ -199,9 +217,9 @@ readonly
#endif #endif
buffer RasterResult buffer RasterResult
{ {
uint ColorResult[256*192*2]; uint ColorResult[ScreenWidth*ScreenHeight*2];
uint DepthResult[256*192*2]; uint DepthResult[ScreenWidth*ScreenHeight*2];
uint AttrResult[256*192*2]; uint AttrResult[ScreenWidth*ScreenHeight*2];
}; };
layout (std140, binding = 0) uniform MetaUniform layout (std140, binding = 0) uniform MetaUniform
@ -221,8 +239,6 @@ layout (std140, binding = 0) uniform MetaUniform
uint ClearColor, ClearDepth, ClearAttr; uint ClearColor, ClearDepth, ClearAttr;
uint FogOffset, FogShift, FogColor; uint FogOffset, FogShift, FogColor;
int XScroll;
}; };
#if defined(InterpSpans) || defined(Rasterise) #if defined(InterpSpans) || defined(Rasterise)
@ -238,7 +254,7 @@ const uint startTable[256] = uint[256](
157, 156, 154, 153, 152, 151, 149, 148, 147, 146, 144, 143, 142, 141, 139, 138, 137, 136, 135, 134, 132, 131, 130, 129, 128, 127, 126, 125, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 88, 87, 86, 85, 84, 83, 82, 81, 80, 80, 79, 78, 77, 76, 75, 74, 74, 73, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 57, 56, 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0 157, 156, 154, 153, 152, 151, 149, 148, 147, 146, 144, 143, 142, 141, 139, 138, 137, 136, 135, 134, 132, 131, 130, 129, 128, 127, 126, 125, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 88, 87, 86, 85, 84, 83, 82, 81, 80, 80, 79, 78, 77, 76, 75, 74, 74, 73, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 57, 56, 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0
); );
uint Div(uint x, uint y) uint Div(uint x, uint y, out uint r)
{ {
// https://www.microsoft.com/en-us/research/publication/software-integer-division/ // https://www.microsoft.com/en-us/research/publication/software-integer-division/
uint k = 31 - findMSB(y); uint k = 31 - findMSB(y);
@ -251,7 +267,7 @@ uint Div(uint x, uint y)
z += Umulh(z, my * z); z += Umulh(z, my * z);
uint q = Umulh(x, z); uint q = Umulh(x, z);
uint r = x - y * q; r = x - y * q;
if(r >= y) if(r >= y)
{ {
r = r - y; r = r - y;
@ -266,16 +282,77 @@ uint Div(uint x, uint y)
return q; return q;
} }
// Divides the 64-bit unsigned value (numHi:numLo) by the 32-bit unsigned value den
// and returns the 32-bit quotient.
//
// Preconditions (not checked here; callers must ensure them):
//   - den != 0. findMSB(0) is -1, which would make the normalisation shift 32.
//   - numHi < den, so that the quotient fits into 32 bits.
uint Div64_32_32(uint numHi, uint numLo, uint den)
{
    // based on https://github.com/ridiculousfish/libdivide/blob/3bd34388573681ce563348cdf04fe15d24770d04/libdivide.h#L469
    // modified to work with half the size 64/32=32 instead of 128/64=64
    // for further details see https://ridiculousfish.com/blog/posts/labor-of-division-episode-iv.html

    // We work in base 2**16.
    // A digit is 16 bits wide, so a uint holds two digits (or a single digit in its lower 16 bits).
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint b = (1U << 16);

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift the numerator by the same amount. This cannot overflow because numHi < den.
    uint shift = 31 - findMSB(den);
    den <<= shift;
    numHi <<= shift;
    // Move the bits shifted out of the top of numLo into the bottom of numHi.
    // (-shift & 31U) equals (32 - shift) for shift in 1..31 and 0 for shift == 0, so the shift count
    // always stays below 32 (GLSL leaves shifts by >= the operand width undefined).
    // The second factor is all ones for shift != 0 and zero for shift == 0, which discards the
    // unshifted numLo in the shift == 0 case.
    numHi |= (numLo >> (-shift & 31U)) & uint(-int(shift) >> 31);
    numLo <<= shift;

    // Extract the low digits of the numerator and both digits of the denominator.
    uint num1 = (numLo >> 16);
    uint num0 = (numLo & 0xFFFFU);
    uint den1 = (den >> 16);
    uint den0 = (den & 0xFFFFU);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    uint rhat;
    uint qhat = Div(numHi, den1, rhat);
    uint c1 = qhat * den0;
    uint c2 = rhat * b + num1;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    uint q1 = qhat & 0xFFFFU;

    // Compute the true (partial) remainder.
    uint rem = numHi * b + num1 - q1 * den;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = Div(rem, den1, rhat);
    c1 = qhat * den0;
    c2 = rhat * b + num0;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;

    // Combine the two quotient digits: q1 goes into the upper 16 bits, q0 (qhat) stays in the lower.
    return bitfieldInsert(qhat, q1, 16, 16);
}
#ifdef InterpSpans #ifdef InterpSpans
const int Shift = 9; const int YFactorShift = 9;
#else #else
const int Shift = 8; const int YFactorShift = 8;
#endif #endif
int CalcYFactorY(YSpanSetup span, int i) int CalcYFactorY(YSpanSetup span, int i)
{ {
int num = abs((i) * span.W0n) << Shift; /*
int den = abs(((i) * span.W0d) + (((span.I1 - span.I0 - i) * span.W1d))); maybe it would be better to use a 32x32=64 multiplication?
*/
uint numLo = abs(i * span.W0n);
uint numHi = 0U;
numHi |= numLo >> (32-YFactorShift);
numLo <<= YFactorShift;
uint den = abs(i * span.W0d + (span.I1 - span.I0 - i) * span.W1d);
if (den == 0) if (den == 0)
{ {
@ -283,10 +360,7 @@ int CalcYFactorY(YSpanSetup span, int i)
} }
else else
{ {
int q = int(Div(num, den)); return int(Div64_32_32(numHi, numLo, den));
//if ((num < 0) != (den < 0))
// return -q;
return q;
} }
} }
@ -296,13 +370,17 @@ int CalcYFactorX(XSpanSetup span, int x)
if (span.X0 != span.X1) if (span.X0 != span.X1)
{ {
uint num = (uint(x) * span.W0) << Shift; uint numLo = uint(x) * span.W0;
uint numHi = 0U;
numHi |= numLo >> (32-YFactorShift);
numLo <<= YFactorShift;
uint den = (uint(x) * span.W0) + (uint(span.X1 - span.X0 - x) * span.W1); uint den = (uint(x) * span.W0) + (uint(span.X1 - span.X0 - x) * span.W1);
if (den == 0) if (den == 0)
return 0; return 0;
else else
return int(Div(num, den)); return int(Div64_32_32(numHi, numLo, den));
} }
else else
{ {
@ -316,9 +394,9 @@ int InterpolateAttrPersp(int y0, int y1, int ifactor)
return y0; return y0;
if (y0 < y1) if (y0 < y1)
return y0 + (((y1-y0) * ifactor) >> Shift); return y0 + (((y1-y0) * ifactor) >> YFactorShift);
else else
return y1 + (((y0-y1) * ((1<<Shift)-ifactor)) >> Shift); return y1 + (((y0-y1) * ((1<<YFactorShift)-ifactor)) >> YFactorShift);
} }
int InterpolateAttrLinear(int y0, int y1, int i, int irecip, int idiff) int InterpolateAttrLinear(int y0, int y1, int i, int irecip, int idiff)
@ -439,11 +517,11 @@ uint InterpolateZWBuffer(int z0, int z1, int ifactor)
// since the precision along x spans is only 8 bit the result will always fit in 32-bit // since the precision along x spans is only 8 bit the result will always fit in 32-bit
if (z0 < z1) if (z0 < z1)
{ {
return uint(z0) + (((z1-z0) * ifactor) >> Shift); return uint(z0) + (((z1-z0) * ifactor) >> YFactorShift);
} }
else else
{ {
return uint(z1) + (((z0-z1) * ((1<<Shift)-ifactor)) >> Shift); return uint(z1) + (((z0-z1) * ((1<<YFactorShift)-ifactor)) >> YFactorShift);
} }
#else #else
uint mulLo, mulHi; uint mulLo, mulHi;
@ -451,21 +529,21 @@ uint InterpolateZWBuffer(int z0, int z1, int ifactor)
{ {
umulExtended(z1-z0, ifactor, mulHi, mulLo); umulExtended(z1-z0, ifactor, mulHi, mulLo);
// 64-bit shift // 64-bit shift
return uint(z0) + ((mulLo >> Shift) | (mulHi << (32-Shift))); return uint(z0) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift)));
} }
else else
{ {
umulExtended(z0-z1, (1<<Shift)-ifactor, mulHi, mulLo); umulExtended(z0-z1, (1<<YFactorShift)-ifactor, mulHi, mulLo);
return uint(z1) + ((mulLo >> Shift) | (mulHi << (32-Shift))); return uint(z1) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift)));
} }
#endif #endif
/*if (z0 < z1) /*if (z0 < z1)
{ {
return uint(z0) + uint((int64_t(z1-z0) * int64_t(ifactor)) >> Shift); return uint(z0) + uint((int64_t(z1-z0) * int64_t(ifactor)) >> YFactorShift);
} }
else else
{ {
return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<<Shift)-ifactor)) >> Shift); return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<<YFactorShift)-ifactor)) >> YFactorShift);
}*/ }*/
} }
@ -499,7 +577,8 @@ void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out
if (negative) startx = xlen - startx; if (negative) startx = xlen - startx;
if (side) startx = startx - len + 1; if (side) startx = startx - len + 1;
int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen))); uint r;
int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r));
edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF); edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF);
} }
@ -713,7 +792,8 @@ void main()
} }
{ {
#endif #endif
xspan.XRecip = int(Div(1U<<30, uint(xspan.X1 - xspan.X0))); uint r;
xspan.XRecip = int(Div(1U<<30, uint(xspan.X1 - xspan.X0), r));
} }
XSpanSetups[gl_GlobalInvocationID.x] = xspan; XSpanSetups[gl_GlobalInvocationID.x] = xspan;
@ -950,7 +1030,7 @@ void main()
attr |= 0x1U; attr |= 0x1U;
int cov = xspan.EdgeCovL; int cov = xspan.EdgeCovL;
if ((cov & (1U<<31)) != 0U) if (cov < 0)
{ {
int xcov = xspan.CovLInitial + (xspan.EdgeCovL & 0x3FF) * (position.x - xspan.X0); int xcov = xspan.CovLInitial + (xspan.EdgeCovL & 0x3FF) * (position.x - xspan.X0);
cov = min(xcov >> 5, 31); cov = min(xcov >> 5, 31);
@ -963,7 +1043,7 @@ void main()
attr |= 0x2U; attr |= 0x2U;
int cov = xspan.EdgeCovR; int cov = xspan.EdgeCovR;
if ((cov & (1U<<31)) != 0U) if (cov < 0)
{ {
int xcov = xspan.CovRInitial + (xspan.EdgeCovR & 0x3FF) * (position.x - xspan.InsideEnd); int xcov = xspan.CovRInitial + (xspan.EdgeCovR & 0x3FF) * (position.x - xspan.InsideEnd);
cov = max(0x1F - (xcov >> 5), 0); cov = max(0x1F - (xcov >> 5), 0);
@ -1311,7 +1391,7 @@ void main()
ProcessCoarseMask(linearTile, coarseMaskLo, 0, color, depth, attr, stencil, prevIsShadowMask); ProcessCoarseMask(linearTile, coarseMaskLo, 0, color, depth, attr, stencil, prevIsShadowMask);
ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask); ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask);
int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * 256; int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * ScreenWidth;
ColorResult[resultOffset] = color.x; ColorResult[resultOffset] = color.x;
ColorResult[resultOffset+FramebufferStride] = color.y; ColorResult[resultOffset+FramebufferStride] = color.y;
DepthResult[resultOffset] = depth.x; DepthResult[resultOffset] = depth.x;
@ -1327,6 +1407,7 @@ const char* FinalPass = R"(
layout (local_size_x = 32) in; layout (local_size_x = 32) in;
layout (binding = 0, rgba8) writeonly uniform image2D FinalFB; layout (binding = 0, rgba8) writeonly uniform image2D FinalFB;
layout (binding = 1, rgba8ui) writeonly uniform uimage2D LowResFB;
uint BlendFog(uint color, uint depth) uint BlendFog(uint color, uint depth)
{ {
@ -1373,18 +1454,12 @@ uint BlendFog(uint color, uint depth)
void main() void main()
{ {
int srcX = (int(gl_GlobalInvocationID.x) + XScroll) & 0x1FF; int srcX = int(gl_GlobalInvocationID.x);
int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * 256; int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * ScreenWidth;
uvec2 color = uvec2(0); uvec2 color = uvec2(ColorResult[resultOffset], ColorResult[resultOffset+FramebufferStride]);
uvec2 depth = uvec2(0); uvec2 depth = uvec2(DepthResult[resultOffset], DepthResult[resultOffset+FramebufferStride]);
uvec2 attr = uvec2(0); uvec2 attr = uvec2(AttrResult[resultOffset], AttrResult[resultOffset+FramebufferStride]);
if (srcX < 256)
{
color = uvec2(ColorResult[resultOffset], ColorResult[resultOffset+FramebufferStride]);
depth = uvec2(DepthResult[resultOffset], DepthResult[resultOffset+FramebufferStride]);
attr = uvec2(AttrResult[resultOffset], AttrResult[resultOffset+FramebufferStride]);
}
#ifdef EdgeMarking #ifdef EdgeMarking
if ((attr.x & 0xFU) != 0U) if ((attr.x & 0xFU) != 0U)
@ -1397,20 +1472,20 @@ void main()
otherAttr.x = AttrResult[resultOffset-1]; otherAttr.x = AttrResult[resultOffset-1];
otherDepth.x = DepthResult[resultOffset-1]; otherDepth.x = DepthResult[resultOffset-1];
} }
if (srcX < 255U) if (srcX < ScreenWidth-1)
{ {
otherAttr.y = AttrResult[resultOffset+1]; otherAttr.y = AttrResult[resultOffset+1];
otherDepth.y = DepthResult[resultOffset+1]; otherDepth.y = DepthResult[resultOffset+1];
} }
if (gl_GlobalInvocationID.y > 0U) if (gl_GlobalInvocationID.y > 0U)
{ {
otherAttr.z = AttrResult[resultOffset-256]; otherAttr.z = AttrResult[resultOffset-ScreenWidth];
otherDepth.z = DepthResult[resultOffset-256]; otherDepth.z = DepthResult[resultOffset-ScreenWidth];
} }
if (gl_GlobalInvocationID.y < 191U) if (gl_GlobalInvocationID.y < ScreenHeight-1)
{ {
otherAttr.w = AttrResult[resultOffset+256]; otherAttr.w = AttrResult[resultOffset+ScreenWidth];
otherDepth.w = DepthResult[resultOffset+256]; otherDepth.w = DepthResult[resultOffset+ScreenWidth];
} }
uint polyId = bitfieldExtract(attr.x, 24, 5); uint polyId = bitfieldExtract(attr.x, 24, 5);
@ -1491,6 +1566,20 @@ void main()
vec4 result = vec4(bitfieldExtract(color.x, 16, 8), bitfieldExtract(color.x, 8, 8), color.x & 0x3FU, bitfieldExtract(color.x, 24, 8)); vec4 result = vec4(bitfieldExtract(color.x, 16, 8), bitfieldExtract(color.x, 8, 8), color.x & 0x3FU, bitfieldExtract(color.x, 24, 8));
result /= vec4(63.0, 63.0, 63.0, 31.0); result /= vec4(63.0, 63.0, 63.0, 31.0);
imageStore(FinalFB, ivec2(gl_GlobalInvocationID.xy), result); imageStore(FinalFB, ivec2(gl_GlobalInvocationID.xy), result);
// It's a division by constant, so using the builtin division is fine
const int scale = ScreenWidth/256;
ivec2 lowresCoordinate = ivec2(gl_GlobalInvocationID.xy) / scale;
ivec2 lowresCoordinateRest = ivec2(gl_GlobalInvocationID.xy) % scale;
if (lowresCoordinateRest == ivec2(0, 0))
{
uvec4 color8;
color8.x = bitfieldExtract(color.x, 0, 8);
color8.y = bitfieldExtract(color.x, 8, 8);
color8.z = bitfieldExtract(color.x, 16, 8);
color8.w = bitfieldExtract(color.x, 24, 8);
imageStore(LowResFB, lowresCoordinate, color8);
}
} }
)"; )";