From 60a3fe24ed3bb4bc15ad5d5a2b5b5a12a2fda2e6 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 17 Apr 2023 00:07:47 +0200 Subject: [PATCH] add hires rendering to the compute shader renderer --- src/GPU3D_Compute.cpp | 304 ++++++++++++++++++++---------------- src/GPU3D_Compute.h | 64 +++----- src/GPU3D_Compute_shaders.h | 193 +++++++++++++++++------ 3 files changed, 331 insertions(+), 230 deletions(-) diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp index 55ad21b0..5421708e 100644 --- a/src/GPU3D_Compute.cpp +++ b/src/GPU3D_Compute.cpp @@ -52,6 +52,13 @@ bool ComputeRenderer::CompileShader(GLuint& shader, const char* source, const st shaderName += define; shaderName += ','; } + shaderSource += "#define ScreenWidth "; + shaderSource += std::to_string(ScreenWidth); + shaderSource += "\n#define ScreenHeight "; + shaderSource += std::to_string(ScreenHeight); + shaderSource += "\n#define MaxWorkTiles "; + shaderSource += std::to_string(MaxWorkTiles); + shaderSource += ComputeRendererShaders::Common; shaderSource += source; @@ -65,8 +72,8 @@ void blah(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,con bool ComputeRenderer::Init() { - glDebugMessageCallback(blah, NULL); - glEnable(GL_DEBUG_OUTPUT); + //glDebugMessageCallback(blah, NULL); + //glEnable(GL_DEBUG_OUTPUT); glGenBuffers(1, &YSpanSetupMemory); glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory); glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW); @@ -75,72 +82,22 @@ bool ComputeRenderer::Init() glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory); glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW); - glGenBuffers(1, &TileMemory); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory); - glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(Tiles), nullptr, GL_DYNAMIC_DRAW); - glGenBuffers(1, &XSpanSetupMemory); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory); - 
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*MaxYSpanIndices, nullptr, GL_DYNAMIC_DRAW); - glGenBuffers(1, &BinResultMemory); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory); - glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(BinResult), nullptr, GL_DYNAMIC_DRAW); - glGenBuffers(1, &FinalTileMemory); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory); - glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(FinalTiles), nullptr, GL_DYNAMIC_DRAW); - glGenBuffers(1, &YSpanIndicesTextureMemory); - glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory); - glBufferData(GL_TEXTURE_BUFFER, MaxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW); + glGenBuffers(1, &TileMemory); glGenTextures(1, &YSpanIndicesTexture); - glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture); - glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory); - glGenTextures(1, &Framebuffer); - glBindTexture(GL_TEXTURE_2D, Framebuffer); - glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, 256, 192); + glGenTextures(1, &LowResFramebuffer); + glBindTexture(GL_TEXTURE_2D, LowResFramebuffer); + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8UI, 256, 192); glGenBuffers(1, &MetaUniformMemory); glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory); glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW); - CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans, {"InterpSpans", "ZBuffer"}); - CompileShader(ShaderInterpXSpans[1], ComputeRendererShaders::InterpSpans, {"InterpSpans", "WBuffer"}); - CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"}); - CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend, {"DepthBlend", "ZBuffer"}); - CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend, {"DepthBlend", "WBuffer"}); - CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture"}); - CompileShader(ShaderRasteriseNoTexture[1], 
ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture"}); - CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Toon"}); - CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Toon"}); - CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Highlight"}); - CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Highlight"}); - CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Decal"}); - CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Decal"}); - CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Modulate"}); - CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Modulate"}); - CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Toon"}); - CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Toon"}); - CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Highlight"}); - CompileShader(ShaderRasteriseUseTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Highlight"}); - CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "ShadowMask"}); - CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "ShadowMask"}); - 
CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"}); - CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"}); - CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"}); - CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"}); - CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"}); - CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"}); - CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"}); - CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"}); - CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"}); - CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"}); - CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"}); - CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"}); - glGenSamplers(9, Samplers); for (u32 j = 0; j < 3; j++) { @@ -176,6 +133,12 @@ void ComputeRenderer::DeInit() glDeleteTextures(1, &Framebuffer); glDeleteBuffers(1, &MetaUniformMemory); + glDeleteSamplers(9, Samplers); + glDeleteBuffers(1, &PixelBuffer); +} + +void ComputeRenderer::DeleteShaders() +{ std::initializer_list allPrograms = { ShaderInterpXSpans[0], @@ -214,9 +177,6 @@ void ComputeRenderer::DeInit() }; for (GLuint program : allPrograms) glDeleteProgram(program); - - glDeleteSamplers(9, Samplers); - glDeleteBuffers(1, &PixelBuffer); } void ComputeRenderer::ResetTexcache() @@ -241,7 +201,85 @@ void ComputeRenderer::Reset() void ComputeRenderer::SetRenderSettings(GPU::RenderSettings& settings) { + if 
(ScaleFactor != -1) + { + DeleteShaders(); + } + ScaleFactor = settings.GL_ScaleFactor; + ScreenWidth = 256 * ScaleFactor; + ScreenHeight = 192 * ScaleFactor; + + TilesPerLine = ScreenWidth/TileSize; + TileLines = ScreenHeight/TileSize; + + MaxWorkTiles = TilesPerLine*TileLines*8; + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW); + + int binResultSize = sizeof(BinResultHeader) + + MaxWorkTiles*2*4 // UnsortedWorkDescs + + MaxWorkTiles*2*4 // SortedWork + + TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse + + TilesPerLine*TileLines*BinStride*4 // BinnedMask + + TilesPerLine*TileLines*BinStride*4; // WorkOffsets + glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW); + + glBindTexture(GL_TEXTURE_2D, Framebuffer); + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, ScreenWidth, ScreenHeight); + + // eh those are pretty bad guesses + // though real hw shouldn't be eable to render all 2048 polygons on every line either + int maxYSpanIndices = 64*2048 * ScaleFactor; + YSpanIndices.resize(maxYSpanIndices); + + glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory); + glBufferData(GL_TEXTURE_BUFFER, maxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*maxYSpanIndices, nullptr, GL_DYNAMIC_DRAW); + + glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture); + glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory); + + CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans, {"InterpSpans", "ZBuffer"}); + CompileShader(ShaderInterpXSpans[1], 
ComputeRendererShaders::InterpSpans, {"InterpSpans", "WBuffer"}); + CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"}); + CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend, {"DepthBlend", "ZBuffer"}); + CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend, {"DepthBlend", "WBuffer"}); + CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture"}); + CompileShader(ShaderRasteriseNoTexture[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture"}); + CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Toon"}); + CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Toon"}); + CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Highlight"}); + CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Highlight"}); + CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Decal"}); + CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Decal"}); + CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Modulate"}); + CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Modulate"}); + CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Toon"}); + CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Toon"}); + 
CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Highlight"}); + CompileShader(ShaderRasteriseUseTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Highlight"}); + CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "ShadowMask"}); + CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "ShadowMask"}); + CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"}); + CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"}); + CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"}); + CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"}); + CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"}); + CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"}); + CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"}); + CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"}); + CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"}); + CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"}); + CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"}); + CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"}); } void ComputeRenderer::VCount144() @@ -267,9 +305,9 @@ void ComputeRenderer::SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int span->TexcoordV1 = poly->Vertices[to]->TexCoords[1]; } 
-void ComputeRenderer::SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int vertex, int side) +void ComputeRenderer::SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]) { - s32 x0 = poly->Vertices[vertex]->FinalPosition[0]; + s32 x0 = positions[vertex][0]; if (side) { span->DxInitial = -0x40000; @@ -283,7 +321,7 @@ void ComputeRenderer::SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int verte span->X0 = span->X1 = x0; span->XMin = x0; span->XMax = x0; - span->Y0 = span->Y1 = poly->Vertices[vertex]->FinalPosition[1]; + span->Y0 = span->Y1 = positions[vertex][1]; span->Increment = 0; @@ -297,12 +335,12 @@ void ComputeRenderer::SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int verte SetupAttrs(span, poly, vertex, vertex); } -void ComputeRenderer::SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, int from, int to, u32 y, int side) +void ComputeRenderer::SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, int from, int to, u32 y, int side, s32 positions[10][2]) { - span->X0 = poly->Vertices[from]->FinalPosition[0]; - span->X1 = poly->Vertices[to]->FinalPosition[0]; - span->Y0 = poly->Vertices[from]->FinalPosition[1]; - span->Y1 = poly->Vertices[to]->FinalPosition[1]; + span->X0 = positions[from][0]; + span->X1 = positions[to][0]; + span->Y0 = positions[from][1]; + span->Y1 = positions[to][1]; SetupAttrs(span, poly, from, to); @@ -396,9 +434,6 @@ void ComputeRenderer::SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, i span->I1 = span->Y1; } - //if (span->I1 < span->I0) - // std::swap(span->I0, span->I1); - if (span->I0 != span->I1) span->IRecip = (1<<30) / (span->I1 - span->I0); else @@ -940,14 +975,11 @@ void ComputeRenderer::RenderFrame() u32 nverts = polygon->NumVertices; u32 vtop = polygon->VTop, vbot = polygon->VBottom; - s32 ytop = polygon->YTop, ybot = polygon->YBottom; u32 curVL = vtop, curVR = vtop; u32 nextVL, nextVR; RenderPolygons[i].FirstXSpan = numSetupIndices; - RenderPolygons[i].YTop = 
ytop; - RenderPolygons[i].YBot = ybot; RenderPolygons[i].Attr = polygon->Attr; bool foundVariant = false; @@ -1034,10 +1066,22 @@ void ComputeRenderer::RenderFrame() if (nextVR >= nverts) nextVR = 0; } - s32 minX = polygon->Vertices[vtop]->FinalPosition[0]; - s32 minXY = polygon->Vertices[vtop]->FinalPosition[1]; - s32 maxX = polygon->Vertices[vtop]->FinalPosition[0]; - s32 maxXY = polygon->Vertices[vtop]->FinalPosition[1]; + s32 scaledPositions[10][2]; + s32 ytop = ScreenHeight, ybot = 0; + for (int i = 0; i < polygon->NumVertices; i++) + { + scaledPositions[i][0] = (polygon->Vertices[i]->HiresPosition[0] * ScaleFactor) >> 4; + scaledPositions[i][1] = (polygon->Vertices[i]->HiresPosition[1] * ScaleFactor) >> 4; + ytop = std::min(scaledPositions[i][1], ytop); + ybot = std::max(scaledPositions[i][1], ybot); + } + RenderPolygons[i].YTop = ytop; + RenderPolygons[i].YBot = ybot; + + s32 minX = scaledPositions[vtop][0]; + s32 minXY = scaledPositions[vtop][1]; + s32 maxX = scaledPositions[vtop][0]; + s32 maxXY = scaledPositions[vtop][1]; if (ybot == ytop) { @@ -1046,19 +1090,19 @@ void ComputeRenderer::RenderFrame() RenderPolygons[i].YBot++; int j = 1; - if (polygon->Vertices[j]->FinalPosition[0] < polygon->Vertices[vtop]->FinalPosition[0]) vtop = j; - if (polygon->Vertices[j]->FinalPosition[0] > polygon->Vertices[vbot]->FinalPosition[0]) vbot = j; + if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j; + if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j; j = nverts - 1; - if (polygon->Vertices[j]->FinalPosition[0] < polygon->Vertices[vtop]->FinalPosition[0]) vtop = j; - if (polygon->Vertices[j]->FinalPosition[0] > polygon->Vertices[vbot]->FinalPosition[0]) vbot = j; + if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j; + if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j; assert(numYSpans < MaxYSpanSetups); u32 curSpanL = numYSpans; - SetupYSpanDummy(&YSpanSetups[numYSpans++], polygon, vtop, 0); + 
SetupYSpanDummy(&YSpanSetups[numYSpans++], polygon, vtop, 0, scaledPositions); assert(numYSpans < MaxYSpanSetups); u32 curSpanR = numYSpans; - SetupYSpanDummy(&YSpanSetups[numYSpans++], polygon, vbot, 1); + SetupYSpanDummy(&YSpanSetups[numYSpans++], polygon, vbot, 1, scaledPositions); minX = YSpanSetups[curSpanL].X0; minXY = YSpanSetups[curSpanL].Y0; @@ -1070,7 +1114,6 @@ void ComputeRenderer::RenderFrame() std::swap(minXY, maxXY); } - assert(numSetupIndices < MaxYSpanIndices); YSpanIndices[numSetupIndices].PolyIdx = i; YSpanIndices[numSetupIndices].SpanIdxL = curSpanL; YSpanIndices[numSetupIndices].SpanIdxR = curSpanR; @@ -1081,16 +1124,16 @@ void ComputeRenderer::RenderFrame() { u32 curSpanL = numYSpans; assert(numYSpans < MaxYSpanSetups); - SetupYSpan(i, &YSpanSetups[numYSpans++], polygon, curVL, nextVL, ytop, 0); + SetupYSpan(i, &YSpanSetups[numYSpans++], polygon, curVL, nextVL, ytop, 0, scaledPositions); u32 curSpanR = numYSpans; assert(numYSpans < MaxYSpanSetups); - SetupYSpan(i, &YSpanSetups[numYSpans++], polygon, curVR, nextVR, ytop, 1); + SetupYSpan(i, &YSpanSetups[numYSpans++], polygon, curVR, nextVR, ytop, 1, scaledPositions); for (u32 y = ytop; y < ybot; y++) { - if (y >= polygon->Vertices[nextVL]->FinalPosition[1] && curVL != polygon->VBottom) + if (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom) { - while (y >= polygon->Vertices[nextVL]->FinalPosition[1] && curVL != polygon->VBottom) + while (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom) { curVL = nextVL; if (polygon->FacingView) @@ -1107,24 +1150,24 @@ void ComputeRenderer::RenderFrame() } } - if (polygon->Vertices[curVL]->FinalPosition[0] < minX) + if (scaledPositions[curVL][0] < minX) { - minX = polygon->Vertices[curVL]->FinalPosition[0]; - minXY = polygon->Vertices[curVL]->FinalPosition[1]; + minX = scaledPositions[curVL][0]; + minXY = scaledPositions[curVL][1]; } - if (polygon->Vertices[curVL]->FinalPosition[0] > maxX) + if (scaledPositions[curVL][0] > maxX) { - maxX 
= polygon->Vertices[curVL]->FinalPosition[0]; - maxXY = polygon->Vertices[curVL]->FinalPosition[1]; + maxX = scaledPositions[curVL][0]; + maxXY = scaledPositions[curVL][1]; } assert(numYSpans < MaxYSpanSetups); curSpanL = numYSpans; - SetupYSpan(i,&YSpanSetups[numYSpans++], polygon, curVL, nextVL, y, 0); + SetupYSpan(i,&YSpanSetups[numYSpans++], polygon, curVL, nextVL, y, 0, scaledPositions); } - if (y >= polygon->Vertices[nextVR]->FinalPosition[1] && curVR != polygon->VBottom) + if (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom) { - while (y >= polygon->Vertices[nextVR]->FinalPosition[1] && curVR != polygon->VBottom) + while (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom) { curVR = nextVR; if (polygon->FacingView) @@ -1141,23 +1184,22 @@ void ComputeRenderer::RenderFrame() } } - if (polygon->Vertices[curVR]->FinalPosition[0] < minX) + if (scaledPositions[curVR][0] < minX) { - minX = polygon->Vertices[curVR]->FinalPosition[0]; - minXY = polygon->Vertices[curVR]->FinalPosition[1]; + minX = scaledPositions[curVR][0]; + minXY = scaledPositions[curVR][1]; } - if (polygon->Vertices[curVR]->FinalPosition[0] > maxX) + if (scaledPositions[curVR][0] > maxX) { - maxX = polygon->Vertices[curVR]->FinalPosition[0]; - maxXY = polygon->Vertices[curVR]->FinalPosition[1]; + maxX = scaledPositions[curVR][0]; + maxXY = scaledPositions[curVR][1]; } assert(numYSpans < MaxYSpanSetups); curSpanR = numYSpans; - SetupYSpan(i,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, y, 1); + SetupYSpan(i,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, y, 1, scaledPositions); } - assert(numSetupIndices < MaxYSpanIndices); YSpanIndices[numSetupIndices].PolyIdx = i; YSpanIndices[numSetupIndices].SpanIdxL = curSpanL; YSpanIndices[numSetupIndices].SpanIdxR = curSpanR; @@ -1166,25 +1208,25 @@ void ComputeRenderer::RenderFrame() } } - if (polygon->Vertices[nextVL]->FinalPosition[0] < minX) + if (scaledPositions[nextVL][0] < minX) { - minX = 
polygon->Vertices[nextVL]->FinalPosition[0]; - minXY = polygon->Vertices[nextVL]->FinalPosition[1]; + minX = scaledPositions[nextVL][0]; + minXY = scaledPositions[nextVL][1]; } - if (polygon->Vertices[nextVL]->FinalPosition[0] > maxX) + if (scaledPositions[nextVL][0] > maxX) { - maxX = polygon->Vertices[nextVL]->FinalPosition[0]; - maxXY = polygon->Vertices[nextVL]->FinalPosition[1]; + maxX = scaledPositions[nextVL][0]; + maxXY = scaledPositions[nextVL][1]; } - if (polygon->Vertices[nextVR]->FinalPosition[0] < minX) + if (scaledPositions[nextVR][0] < minX) { - minX = polygon->Vertices[nextVR]->FinalPosition[0]; - minXY = polygon->Vertices[nextVR]->FinalPosition[1]; + minX = scaledPositions[nextVR][0]; + minXY = scaledPositions[nextVR][1]; } - if (polygon->Vertices[nextVR]->FinalPosition[0] > maxX) + if (scaledPositions[nextVR][0] > maxX) { - maxX = polygon->Vertices[nextVR]->FinalPosition[0]; - maxXY = polygon->Vertices[nextVR]->FinalPosition[1]; + maxX = scaledPositions[nextVR][0]; + maxXY = scaledPositions[nextVR][1]; } RenderPolygons[i].XMin = minX; @@ -1210,7 +1252,7 @@ void ComputeRenderer::RenderFrame() glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups); glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory); - glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices); + glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices.data()); glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory); glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons); @@ -1278,8 +1320,6 @@ void ComputeRenderer::RenderFrame() u32 fogA = (RenderFogColor >> 16) & 0x1F; meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24); } - meta.XScroll = 0; - //meta.XScroll = RenderXPos; glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory); glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta); @@ -1304,7 +1344,7 @@ void ComputeRenderer::RenderFrame() // 
bin polygons glUseProgram(ShaderBinCombined); - glDispatchCompute(((RenderNumPolygons + 31) / 32), 256/CoarseTileW, 192/CoarseTileH); + glDispatchCompute(((RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH); glMemoryBarrier(GL_SHADER_STORAGE_BUFFER); // calculate list offsets @@ -1316,7 +1356,7 @@ void ComputeRenderer::RenderFrame() // sort shader work glUseProgram(ShaderSortWork); glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory); - glDispatchComputeIndirect(offsetof(BinResult, SortWorkWorkCount)); + glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount)); glMemoryBarrier(GL_SHADER_STORAGE_BUFFER); glActiveTexture(GL_TEXTURE0); @@ -1379,7 +1419,7 @@ void ComputeRenderer::RenderFrame() glUniform1ui(UniformIdxCurVariant, i); glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height); glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory); - glDispatchComputeIndirect(offsetof(BinResult, VariantWorkCount) + i*4*4); + glDispatchComputeIndirect(offsetof(BinResultHeader, VariantWorkCount) + i*4*4); } } } @@ -1387,10 +1427,11 @@ void ComputeRenderer::RenderFrame() // compose final image glUseProgram(ShaderDepthBlend[wbuffer]); - glDispatchCompute(256/8, 192/8, 1); + glDispatchCompute(ScreenWidth/TileSize, ScreenHeight/TileSize, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8); + glBindImageTexture(1, LowResFramebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI); u32 finalPassShader = 0; if (RenderDispCnt & (1<<4)) finalPassShader |= 0x4; @@ -1400,7 +1441,7 @@ void ComputeRenderer::RenderFrame() finalPassShader |= 0x1; glUseProgram(ShaderFinalPass[finalPassShader]); - glDispatchCompute(256/32, 192, 1); + glDispatchCompute(ScreenWidth/32, ScreenHeight, 1); glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); /*u64 starttime = armGetSystemTick(); @@ -1467,15 +1508,6 @@ u32* ComputeRenderer::GetLine(int line) 
glUnmapBuffer(GL_PIXEL_PACK_BUFFER); } - u64* ptr = (u64*)&FramebufferCPU[stride * line]; - for (int i = 0; i < stride; i+=2) - { - u64 rgb = *ptr & 0x00FCFCFC00FCFCFC; - u64 a = *ptr & 0xF8000000F8000000; - - *ptr++ = (rgb >> 2) | (a >> 3); - } - return &FramebufferCPU[stride * line]; } @@ -1487,8 +1519,8 @@ void ComputeRenderer::SetupAccelFrame() void ComputeRenderer::PrepareCaptureFrame() { glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer); - glBindTexture(GL_TEXTURE_2D, Framebuffer); - glGetTexImage(GL_TEXTURE_2D, 0, GL_BGRA, GL_UNSIGNED_BYTE, nullptr); + glBindTexture(GL_TEXTURE_2D, LowResFramebuffer); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, nullptr); } } \ No newline at end of file diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h index 9b83b38c..7a3af101 100644 --- a/src/GPU3D_Compute.h +++ b/src/GPU3D_Compute.h @@ -141,57 +141,32 @@ private: u32 __pad0, __pad1; }; - static const int TileSize = 8; - static const int CoarseTileCountX = 8; - static const int CoarseTileCountY = 4; - static const int CoarseTileW = CoarseTileCountX * TileSize; - static const int CoarseTileH = CoarseTileCountY * TileSize; + static constexpr int TileSize = 8; + static constexpr int CoarseTileCountX = 8; + static constexpr int CoarseTileCountY = 4; + static constexpr int CoarseTileW = CoarseTileCountX * TileSize; + static constexpr int CoarseTileH = CoarseTileCountY * TileSize; - static const int TilesPerLine = 256/TileSize; - static const int TileLines = 192/TileSize; + static constexpr int BinStride = 2048/32; + static constexpr int CoarseBinStride = BinStride/32; - static const int BinStride = 2048/32; - static const int CoarseBinStride = BinStride/32; + static constexpr int MaxVariants = 256; - static const int MaxWorkTiles = TilesPerLine*TileLines*48; - static const int MaxVariants = 256; + static constexpr int UniformIdxCurVariant = 0; + static constexpr int UniformIdxTextureSize = 1; - static const int UniformIdxCurVariant = 0; - static const 
int UniformIdxTextureSize = 1; + static constexpr int MaxFullscreenLayers = 16; - struct BinResult + struct BinResultHeader { u32 VariantWorkCount[MaxVariants*4]; u32 SortedWorkOffset[MaxVariants]; u32 SortWorkWorkCount[4]; - u32 UnsortedWorkDescs[MaxWorkTiles*2]; - u32 SortedWork[MaxWorkTiles*2]; - - u32 BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride]; - u32 BinnedMask[TilesPerLine*TileLines*BinStride]; - u32 WorkOffsets[TilesPerLine*TileLines*BinStride]; }; - struct Tiles - { - u32 ColorTiles[MaxWorkTiles*TileSize*TileSize]; - u32 DepthTiles[MaxWorkTiles*TileSize*TileSize]; - u32 AttrStencilTiles[MaxWorkTiles*TileSize*TileSize]; - }; - - struct FinalTiles - { - u32 ColorResult[256*192*2]; - u32 DepthResult[256*192*2]; - u32 AttrResult[256*192*2]; - }; - - // eh those are pretty bad guesses - // though real hw shouldn't be eable to render all 2048 polygons on every line either - static const int MaxYSpanIndices = 64*2048; static const int MaxYSpanSetups = 6144*2; - SetupIndices YSpanIndices[MaxYSpanIndices]; + std::vector<SetupIndices> YSpanIndices; SpanSetupY YSpanSetups[MaxYSpanSetups]; RenderPolygon RenderPolygons[2048]; @@ -228,8 +203,6 @@ private: u32 ClearColor, ClearDepth, ClearAttr; u32 FogOffset, FogShift, FogColor; - - u32 XScroll; }; GLuint MetaUniformMemory; @@ -241,17 +214,24 @@ private: u32 TextureDecodingBuffer[1024*1024]; GLuint Framebuffer; + GLuint LowResFramebuffer; GLuint PixelBuffer; u32 FramebufferCPU[256*192]; TexCacheEntry& GetTexture(u32 textureParam, u32 paletteParam); + int ScreenWidth, ScreenHeight; + int TilesPerLine, TileLines; + int ScaleFactor = -1; + int MaxWorkTiles; + void ResetTexcache(); + void DeleteShaders(); void SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to); - void SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, int from, int to, u32 y, int side); - void SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int vertex, int side); + void SetupYSpan(int polynum, SpanSetupY* span, Polygon* poly, int from, int
to, u32 y, int side, s32 positions[10][2]); + void SetupYSpanDummy(SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]); bool CompileShader(GLuint& shader, const char* source, const std::initializer_list<const char*>& defines); }; diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h index 786f66c9..26e7adeb 100644 --- a/src/GPU3D_Compute_shaders.h +++ b/src/GPU3D_Compute_shaders.h @@ -52,6 +52,24 @@ namespace ComputeRendererShaders // Highlight // ShadowMask + +/* + Some notes on signed division: + + we want to avoid it, so we can avoid higher precision numbers + in a few places. + + Fortunately all divisions *should* assuming I'm not mistaken + have the same sign on the divisor and the dividend. + + Thus we apply: + + assuming n < 0 <=> d < 0 + n/d = abs(n)/abs(d) + +*/ + + const char* Common = R"( struct Polygon { @@ -149,14 +167,14 @@ const int CoarseTileCountY = 4; const int CoarseTileW = (CoarseTileCountX * TileSize); const int CoarseTileH = (CoarseTileCountY * TileSize); -const int FramebufferStride = 256*192; -const int TilesPerLine = 256/TileSize; -const int TileLines = 192/TileSize; +const int FramebufferStride = ScreenWidth*ScreenHeight; +const int TilesPerLine = ScreenWidth/TileSize; +const int TileLines = ScreenHeight/TileSize; const int BinStride = 2048/32; const int CoarseBinStride = BinStride/32; -const int MaxWorkTiles = TilesPerLine*TileLines*48; + const int MaxVariants = 256; layout (std430, binding = 3) @@ -199,9 +217,9 @@ readonly #endif buffer RasterResult { - uint ColorResult[256*192*2]; - uint DepthResult[256*192*2]; - uint AttrResult[256*192*2]; + uint ColorResult[ScreenWidth*ScreenHeight*2]; + uint DepthResult[ScreenWidth*ScreenHeight*2]; + uint AttrResult[ScreenWidth*ScreenHeight*2]; }; layout (std140, binding = 0) uniform MetaUniform @@ -221,8 +239,6 @@ layout (std140, binding = 0) uniform MetaUniform uint ClearColor, ClearDepth, ClearAttr; uint FogOffset, FogShift, FogColor; - - int XScroll; }; #if
defined(InterpSpans) || defined(Rasterise) @@ -238,7 +254,7 @@ const uint startTable[256] = uint[256]( 157, 156, 154, 153, 152, 151, 149, 148, 147, 146, 144, 143, 142, 141, 139, 138, 137, 136, 135, 134, 132, 131, 130, 129, 128, 127, 126, 125, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 88, 87, 86, 85, 84, 83, 82, 81, 80, 80, 79, 78, 77, 76, 75, 74, 74, 73, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 57, 56, 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0 ); -uint Div(uint x, uint y) +uint Div(uint x, uint y, out uint r) { // https://www.microsoft.com/en-us/research/publication/software-integer-division/ uint k = 31 - findMSB(y); @@ -251,7 +267,7 @@ uint Div(uint x, uint y) z += Umulh(z, my * z); uint q = Umulh(x, z); - uint r = x - y * q; + r = x - y * q; if(r >= y) { r = r - y; @@ -266,16 +282,77 @@ uint Div(uint x, uint y) return q; } +uint Div64_32_32(uint numHi, uint numLo, uint den) +{ + // based on https://github.com/ridiculousfish/libdivide/blob/3bd34388573681ce563348cdf04fe15d24770d04/libdivide.h#L469 + // modified to work with half the size 64/32=32 instead of 128/64=64 + // for further details see https://ridiculousfish.com/blog/posts/labor-of-division-episode-iv.html + + // We work in base 2**16. + // A uint32 holds a single digit (in the lower 16 bit). A uint32 holds two digits. + // Our numerator is conceptually [num3, num2, num1, num0]. + // Our denominator is [den1, den0]. + const uint b = (1U << 16); + + // Determine the normalization factor. 
We multiply den by this, so that its leading digit is at + // least half b. In binary this means just shifting left by the number of leading zeros, so that + // there's a 1 in the MSB. + // We also shift numer by the same amount. This cannot overflow because numHi < den. + // The expression (-shift & 31) is the same as (32 - shift), except it never shifts by 32 when shift is 0; + // GLSL leaves shifts by >= the operand's bit width undefined, and the sign mask zeroes the term in that case. + uint shift = 31 - findMSB(den); + den <<= shift; + numHi <<= shift; + numHi |= (numLo >> (-shift & 31U)) & uint(-int(shift) >> 31); + numLo <<= shift; + + // Extract the low digits of the numerator and both digits of the denominator. + uint num1 = (numLo >> 16); + uint num0 = (numLo & 0xFFFFU); + uint den1 = (den >> 16); + uint den0 = (den & 0xFFFFU); + + // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. + // Estimate q1 as [n3 n2] / [d1], and then correct it. + // Note while qhat may be 2 digits, q1 is always 1 digit. + + uint rhat; + uint qhat = Div(numHi, den1, rhat); + uint c1 = qhat * den0; + uint c2 = rhat * b + num1; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + uint q1 = qhat & 0xFFFFU; + + // Compute the true (partial) remainder. + uint rem = numHi * b + num1 - q1 * den; + + // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. + // Estimate q0 as [rem1 rem0] / [d1] and correct it. + qhat = Div(rem, den1, rhat); + c1 = qhat * den0; + c2 = rhat * b + num0; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + + return bitfieldInsert(qhat, q1, 16, 16); +} + #ifdef InterpSpans -const int Shift = 9; +const int YFactorShift = 9; #else -const int Shift = 8; +const int YFactorShift = 8; #endif int CalcYFactorY(YSpanSetup span, int i) { - int num = abs((i) * span.W0n) << Shift; - int den = abs(((i) * span.W0d) + (((span.I1 - span.I0 - i) * span.W1d))); + /* + maybe it would be better to use a 32x32=64 multiplication?
+ */ + uint numLo = abs(i * span.W0n); + uint numHi = 0U; + numHi |= numLo >> (32-YFactorShift); + numLo <<= YFactorShift; + + uint den = abs(i * span.W0d + (span.I1 - span.I0 - i) * span.W1d); if (den == 0) { @@ -283,10 +360,7 @@ int CalcYFactorY(YSpanSetup span, int i) } else { - int q = int(Div(num, den)); - //if ((num < 0) != (den < 0)) - // return -q; - return q; + return int(Div64_32_32(numHi, numLo, den)); } } @@ -296,13 +370,17 @@ int CalcYFactorX(XSpanSetup span, int x) if (span.X0 != span.X1) { - uint num = (uint(x) * span.W0) << Shift; + uint numLo = uint(x) * span.W0; + uint numHi = 0U; + numHi |= numLo >> (32-YFactorShift); + numLo <<= YFactorShift; + uint den = (uint(x) * span.W0) + (uint(span.X1 - span.X0 - x) * span.W1); if (den == 0) return 0; else - return int(Div(num, den)); + return int(Div64_32_32(numHi, numLo, den)); } else { @@ -316,9 +394,9 @@ int InterpolateAttrPersp(int y0, int y1, int ifactor) return y0; if (y0 < y1) - return y0 + (((y1-y0) * ifactor) >> Shift); + return y0 + (((y1-y0) * ifactor) >> YFactorShift); else - return y1 + (((y0-y1) * ((1<<Shift)-ifactor)) >> Shift); + return y1 + (((y0-y1) * ((1<<YFactorShift)-ifactor)) >> YFactorShift); } int InterpolateAttrLinear(int y0, int y1, int i, int irecip, int idiff) @@ -439,11 +517,11 @@ uint InterpolateZWBuffer(int z0, int z1, int ifactor) // since the precision along x spans is only 8 bit the result will always fit in 32-bit if (z0 < z1) { - return uint(z0) + (((z1-z0) * ifactor) >> Shift); + return uint(z0) + (((z1-z0) * ifactor) >> YFactorShift); } else { - return uint(z1) + (((z0-z1) * ((1<<Shift)-ifactor)) >> Shift); + return uint(z1) + (((z0-z1) * ((1<<YFactorShift)-ifactor)) >> YFactorShift); } #else uint mulLo, mulHi; @@ -451,21 +529,21 @@ uint InterpolateZWBuffer(int z0, int z1, int ifactor) { umulExtended(z1-z0, ifactor, mulHi, mulLo); // 64-bit shift - return uint(z0) + ((mulLo >> Shift) | (mulHi << (32-Shift))); + return uint(z0) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift))); } else { - umulExtended(z0-z1, (1<<Shift)-ifactor, mulHi, mulLo); - return uint(z1) + ((mulLo >> Shift) | (mulHi << (32-Shift))); +
umulExtended(z0-z1, (1<<YFactorShift)-ifactor, mulHi, mulLo); + return uint(z1) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift))); } #endif /*if (z0 < z1) { - return uint(z0) + uint((int64_t(z1-z0) * int64_t(ifactor)) >> Shift); + return uint(z0) + uint((int64_t(z1-z0) * int64_t(ifactor)) >> YFactorShift); } else { - return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<<Shift)-ifactor)) >> Shift); + return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<<YFactorShift)-ifactor)) >> YFactorShift); }*/ } @@ -499,7 +577,8 @@ void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out if (negative) startx = xlen - startx; if (side) startx = startx - len + 1; - int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen))); + uint r; + int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r)); edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF); } @@ -713,7 +792,8 @@ void main() } { #endif - xspan.XRecip = int(Div(1U<<30, uint(xspan.X1 - xspan.X0))); + uint r; + xspan.XRecip = int(Div(1U<<30, uint(xspan.X1 - xspan.X0), r)); } XSpanSetups[gl_GlobalInvocationID.x] = xspan; @@ -950,7 +1030,7 @@ void main() attr |= 0x1U; int cov = xspan.EdgeCovL; - if ((cov & (1U<<31)) != 0U) + if (cov < 0) { int xcov = xspan.CovLInitial + (xspan.EdgeCovL & 0x3FF) * (position.x - xspan.X0); cov = min(xcov >> 5, 31); @@ -963,7 +1043,7 @@ void main() attr |= 0x2U; int cov = xspan.EdgeCovR; - if ((cov & (1U<<31)) != 0U) + if (cov < 0) { int xcov = xspan.CovRInitial + (xspan.EdgeCovR & 0x3FF) * (position.x - xspan.InsideEnd); cov = max(0x1F - (xcov >> 5), 0); @@ -1311,7 +1391,7 @@ void main() ProcessCoarseMask(linearTile, coarseMaskLo, 0, color, depth, attr, stencil, prevIsShadowMask); ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask); - int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * 256; + int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * ScreenWidth; ColorResult[resultOffset] =
color.x; ColorResult[resultOffset+FramebufferStride] = color.y; DepthResult[resultOffset] = depth.x; @@ -1327,6 +1407,7 @@ const char* FinalPass = R"( layout (local_size_x = 32) in; layout (binding = 0, rgba8) writeonly uniform image2D FinalFB; +layout (binding = 1, rgba8ui) writeonly uniform uimage2D LowResFB; uint BlendFog(uint color, uint depth) { @@ -1373,18 +1454,12 @@ uint BlendFog(uint color, uint depth) void main() { - int srcX = (int(gl_GlobalInvocationID.x) + XScroll) & 0x1FF; - int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * 256; + int srcX = int(gl_GlobalInvocationID.x); + int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * ScreenWidth; - uvec2 color = uvec2(0); - uvec2 depth = uvec2(0); - uvec2 attr = uvec2(0); - if (srcX < 256) - { - color = uvec2(ColorResult[resultOffset], ColorResult[resultOffset+FramebufferStride]); - depth = uvec2(DepthResult[resultOffset], DepthResult[resultOffset+FramebufferStride]); - attr = uvec2(AttrResult[resultOffset], AttrResult[resultOffset+FramebufferStride]); - } + uvec2 color = uvec2(ColorResult[resultOffset], ColorResult[resultOffset+FramebufferStride]); + uvec2 depth = uvec2(DepthResult[resultOffset], DepthResult[resultOffset+FramebufferStride]); + uvec2 attr = uvec2(AttrResult[resultOffset], AttrResult[resultOffset+FramebufferStride]); #ifdef EdgeMarking if ((attr.x & 0xFU) != 0U) @@ -1397,20 +1472,20 @@ void main() otherAttr.x = AttrResult[resultOffset-1]; otherDepth.x = DepthResult[resultOffset-1]; } - if (srcX < 255U) + if (srcX < ScreenWidth-1) { otherAttr.y = AttrResult[resultOffset+1]; otherDepth.y = DepthResult[resultOffset+1]; } if (gl_GlobalInvocationID.y > 0U) { - otherAttr.z = AttrResult[resultOffset-256]; - otherDepth.z = DepthResult[resultOffset-256]; + otherAttr.z = AttrResult[resultOffset-ScreenWidth]; + otherDepth.z = DepthResult[resultOffset-ScreenWidth]; } - if (gl_GlobalInvocationID.y < 191U) + if (gl_GlobalInvocationID.y < ScreenHeight-1) { - otherAttr.w = 
AttrResult[resultOffset+256]; - otherDepth.w = DepthResult[resultOffset+256]; + otherAttr.w = AttrResult[resultOffset+ScreenWidth]; + otherDepth.w = DepthResult[resultOffset+ScreenWidth]; } uint polyId = bitfieldExtract(attr.x, 24, 5); @@ -1491,6 +1566,20 @@ void main() vec4 result = vec4(bitfieldExtract(color.x, 16, 8), bitfieldExtract(color.x, 8, 8), color.x & 0x3FU, bitfieldExtract(color.x, 24, 8)); result /= vec4(63.0, 63.0, 63.0, 31.0); imageStore(FinalFB, ivec2(gl_GlobalInvocationID.xy), result); + + // It's a division by constant, so using the builtin division is fine + const int scale = ScreenWidth/256; + ivec2 lowresCoordinate = ivec2(gl_GlobalInvocationID.xy) / scale; + ivec2 lowresCoordinateRest = ivec2(gl_GlobalInvocationID.xy) % scale; + if (lowresCoordinateRest == ivec2(0, 0)) + { + uvec4 color8; + color8.x = bitfieldExtract(color.x, 0, 8); + color8.y = bitfieldExtract(color.x, 8, 8); + color8.z = bitfieldExtract(color.x, 16, 8); + color8.w = bitfieldExtract(color.x, 24, 8); + imageStore(LowResFB, lowresCoordinate, color8); + } } )";