/*
    Copyright 2016-2024 melonDS team

    This file is part of melonDS.

    melonDS is free software: you can redistribute it and/or modify it under
    the terms of the GNU General Public License as published by the Free
    Software Foundation, either version 3 of the License, or (at your option)
    any later version.

    melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with melonDS. If not, see http://www.gnu.org/licenses/.
*/

#include "GPU3D_Compute.h"

#include <assert.h>

#include "OpenGLSupport.h"

#include "GPU3D_Compute_shaders.h"

namespace melonDS
{

// Takes ownership of the compositor; all GL objects are created later in New().
ComputeRenderer::ComputeRenderer(GLCompositor&& compositor)
    : Renderer3D(true),
    Texcache(TexcacheOpenGLLoader()),
    CurGLCompositor(std::move(compositor))
{}

// Assembles the GLSL source for one compute program and compiles it.
//
// The final source is: "#version 430 core", one "#define" per entry of
// `defines`, the ScreenWidth/ScreenHeight/MaxWorkTiles defines taken from the
// current render settings, the shared Common source and finally `source`.
// The comma-joined list of defines is passed on as the program's name.
//
// Returns whatever OpenGL::CompileComputeProgram returns.
bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, const std::initializer_list<const char*>& defines)
{
    std::string shaderName;
    std::string shaderSource;
    shaderSource += "#version 430 core\n";
    for (const char* define : defines)
    {
        shaderSource += "#define ";
        shaderSource += define;
        shaderSource += '\n';
        shaderName += define;
        shaderName += ',';
    }
    shaderSource += "#define ScreenWidth ";
    shaderSource += std::to_string(ScreenWidth);
    shaderSource += "\n#define ScreenHeight ";
    shaderSource += std::to_string(ScreenHeight);
    shaderSource += "\n#define MaxWorkTiles ";
    shaderSource += std::to_string(MaxWorkTiles);
    // terminate the last define ourselves instead of relying on the
    // common source to begin with a newline
    shaderSource += '\n';

    shaderSource += ComputeRendererShaders::Common;
    shaderSource += source;

    return OpenGL::CompileComputeProgram(shader, shaderSource.c_str(), shaderName.c_str());
}

// Compiles exactly one shader variant per call so the caller can spread
// compilation over several calls.
//
// current: receives the index of the step performed by this call
// count:   receives the total number of steps (33)
//
// Must not be called again once all steps have been performed (the default
// case is marked unreachable).
void ComputeRenderer::ShaderCompileStep(int& current, int& count)
{
    current = ShaderStepIdx;
    ShaderStepIdx++;
    count = 33;
    switch (current)
    {
    case 0:
        CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans,
            {"InterpSpans", "ZBuffer"});
        return;
    case 1:
        CompileShader(ShaderInterpXSpans[1], ComputeRendererShaders::InterpSpans,
            {"InterpSpans", "WBuffer"});
        return;
    case 2:
        CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined,
            {"BinCombined"});
        return;
    case 3:
        CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend,
            {"DepthBlend", "ZBuffer"});
        return;
    case 4:
        CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend,
            {"DepthBlend", "WBuffer"});
        return;
    case 5:
        CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise,
            {"Rasterise", "ZBuffer", "NoTexture"});
        return;
    case 6:
        CompileShader(ShaderRasteriseNoTexture[1], ComputeRendererShaders::Rasterise,
            {"Rasterise", "WBuffer", "NoTexture"});
        return;
    case 7:
        CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise,
            {"Rasterise", "ZBuffer", "NoTexture", "Toon"});
        return;
    case 8:
        CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise,
            {"Rasterise", "WBuffer", "NoTexture", "Toon"});
        return;
    case 9:
        CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise,
            {"Rasterise", "ZBuffer", "NoTexture", "Highlight"});
        return;
    case 10:
        CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise,
            {"Rasterise", "WBuffer", "NoTexture", "Highlight"});
        return;
    case 11:
        CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise,
            {"Rasterise", "ZBuffer", "UseTexture", "Decal"});
        return;
    case 12:
        CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise,
            {"Rasterise", "WBuffer", "UseTexture", "Decal"});
        return;
    case 13:
        CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise,
            {"Rasterise", "ZBuffer", "UseTexture", "Modulate"});
        return;
    case 14:
        CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise,
            {"Rasterise", "WBuffer", "UseTexture", "Modulate"});
        return;
    case 15:
        CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise,
            {"Rasterise", "ZBuffer", "UseTexture", "Toon"});
        return;
    case 16:
        CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise,
            {"Rasterise", "WBuffer", "UseTexture", "Toon"});
        return;
    case 17:
        CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise,
            {"Rasterise", "ZBuffer", "UseTexture", "Highlight"});
        return;
    case 18:
        CompileShader(ShaderRasteriseUseTextureHighlight[1], ComputeRendererShaders::Rasterise,
            {"Rasterise", "WBuffer", "UseTexture", "Highlight"});
        return;
    case 19:
        CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise,
            {"Rasterise", "ZBuffer", "ShadowMask"});
        return;
    case 20:
        CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise,
            {"Rasterise", "WBuffer", "ShadowMask"});
        return;
    case 21:
        CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask,
            {"ClearCoarseBinMask"});
        return;
    case 22:
        CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount,
            {"ClearIndirectWorkCount"});
        return;
    case 23:
        CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets,
            {"CalculateWorkOffsets"});
        return;
    case 24:
        CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork,
            {"SortWork"});
        return;
    case 25:
        CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass,
            {"FinalPass"});
        return;
    case 26:
        CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass,
            {"FinalPass", "EdgeMarking"});
        return;
    case 27:
        CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass,
            {"FinalPass", "Fog"});
        return;
    case 28:
        CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass,
            {"FinalPass", "EdgeMarking", "Fog"});
        return;
    case 29:
        CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass,
            {"FinalPass", "AntiAliasing"});
        return;
    case 30:
        CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass,
            {"FinalPass", "AntiAliasing", "EdgeMarking"});
        return;
    case 31:
        CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass,
            {"FinalPass", "AntiAliasing", "Fog"});
        return;
    case 32:
        CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass,
            {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"});
        return;
    default:
        __builtin_unreachable();
        return;
    }
}

// GL debug message callback; only referenced from the commented-out
// glDebugMessageCallback call in New().
void blah(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *message, const void *userParam)
{
    printf("%s\n", message);
}

// Factory: creates the compositor and the renderer plus every GL object whose
// size does not depend on the render scale. Scale dependent storage is
// allocated in SetRenderSettings().
//
// Returns nullptr if the compositor could not be created.
std::unique_ptr<ComputeRenderer> ComputeRenderer::New()
{
    std::optional<GLCompositor> compositor = GLCompositor::New();
    if (!compositor)
        return nullptr;

    std::unique_ptr<ComputeRenderer> result = std::unique_ptr<ComputeRenderer>(new ComputeRenderer(std::move(*compositor)));

    //glDebugMessageCallback(blah, NULL);
    //glEnable(GL_DEBUG_OUTPUT);
    glGenBuffers(1, &result->YSpanSetupMemory);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->YSpanSetupMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW);

    glGenBuffers(1, &result->RenderPolygonMemory);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->RenderPolygonMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW);

    // these only get their storage in SetRenderSettings()
    glGenBuffers(1, &result->XSpanSetupMemory);
    glGenBuffers(1, &result->BinResultMemory);
    glGenBuffers(1, &result->FinalTileMemory);
    glGenBuffers(1, &result->YSpanIndicesTextureMemory);
    glGenBuffers(tilememoryLayer_Num, result->TileMemory);
    glGenBuffers(1, &result->WorkDescMemory);

    glGenTextures(1, &result->YSpanIndicesTexture);
    glGenTextures(1, &result->LowResFramebuffer);
    glBindTexture(GL_TEXTURE_2D, result->LowResFramebuffer);
    glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8UI, 256, 192);

    glGenBuffers(1, &result->MetaUniformMemory);
    glBindBuffer(GL_UNIFORM_BUFFER, result->MetaUniformMemory);
    glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW);

    // one sampler per (wrap S, wrap T) combination, index = wrapS + wrapT*3
    glGenSamplers(9, result->Samplers);
    for (u32 j = 0; j < 3; j++)
    {
        for (u32 i = 0; i < 3; i++)
        {
            const GLenum translateWrapMode[3] = {GL_CLAMP_TO_EDGE, GL_REPEAT, GL_MIRRORED_REPEAT};
            const GLuint sampler = result->Samplers[i+j*3];
            glSamplerParameteri(sampler, GL_TEXTURE_WRAP_S, translateWrapMode[i]);
            glSamplerParameteri(sampler, GL_TEXTURE_WRAP_T, translateWrapMode[j]);
            glSamplerParameteri(sampler, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
            // enum valued parameter, so use the integer setter like the others
            glSamplerParameteri(sampler, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        }
    }

    glGenBuffers(1, &result->PixelBuffer);
    glBindBuffer(GL_PIXEL_PACK_BUFFER, result->PixelBuffer);
    glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, nullptr, GL_DYNAMIC_READ);

    return result;
}

// Releases the texture cache and the GL objects created in New() and
// SetRenderSettings().
// NOTE(review): the shader programs are not deleted here, only in
// SetRenderSettings() via DeleteShaders() - confirm whether that is intended.
ComputeRenderer::~ComputeRenderer()
{
    Texcache.Reset();

    glDeleteBuffers(1, &YSpanSetupMemory);
    glDeleteBuffers(1, &RenderPolygonMemory);
    glDeleteBuffers(1, &XSpanSetupMemory);
    glDeleteBuffers(1, &BinResultMemory);
    glDeleteBuffers(tilememoryLayer_Num, TileMemory);
    glDeleteBuffers(1, &WorkDescMemory);
    glDeleteBuffers(1, &FinalTileMemory);
    glDeleteBuffers(1, &YSpanIndicesTextureMemory);
    glDeleteTextures(1, &YSpanIndicesTexture);
    glDeleteTextures(1, &Framebuffer);
    // generated in New(), previously never released
    glDeleteTextures(1, &LowResFramebuffer);
    glDeleteBuffers(1, &MetaUniformMemory);

    glDeleteSamplers(9, Samplers);
    glDeleteBuffers(1, &PixelBuffer);
}

// Deletes every program compiled by ShaderCompileStep().
void ComputeRenderer::DeleteShaders()
{
    std::initializer_list<GLuint> allPrograms =
    {
        ShaderInterpXSpans[0],
        ShaderInterpXSpans[1],
        ShaderBinCombined,
        ShaderDepthBlend[0],
        ShaderDepthBlend[1],
        ShaderRasteriseNoTexture[0],
        ShaderRasteriseNoTexture[1],
        ShaderRasteriseNoTextureToon[0],
        ShaderRasteriseNoTextureToon[1],
        ShaderRasteriseNoTextureHighlight[0],
        ShaderRasteriseNoTextureHighlight[1],
        ShaderRasteriseUseTextureDecal[0],
        ShaderRasteriseUseTextureDecal[1],
        ShaderRasteriseUseTextureModulate[0],
        ShaderRasteriseUseTextureModulate[1],
        ShaderRasteriseUseTextureToon[0],
        ShaderRasteriseUseTextureToon[1],
        ShaderRasteriseUseTextureHighlight[0],
        ShaderRasteriseUseTextureHighlight[1],
        ShaderRasteriseShadowMask[0],
        ShaderRasteriseShadowMask[1],
        ShaderClearCoarseBinMask,
        ShaderClearIndirectWorkCount,
        ShaderCalculateWorkListOffset,
        ShaderSortWork,
        ShaderFinalPass[0],
        ShaderFinalPass[1],
        ShaderFinalPass[2],
        ShaderFinalPass[3],
        ShaderFinalPass[4],
        ShaderFinalPass[5],
        ShaderFinalPass[6],
        ShaderFinalPass[7],
    };
    for (GLuint program : allPrograms)
        glDeleteProgram(program);
}

void ComputeRenderer::Reset(GPU& gpu)
{
    Texcache.Reset();
}

// Applies a new internal resolution.
//
// scale:                     multiplier applied to the native 256x192 resolution
// highResolutionCoordinates: selects HiresPosition over FinalPosition for
//                            vertex positions in RenderFrame()
//
// Existing shaders are deleted (they bake in the screen size as defines) and
// the compile step counter is reset, then all scale dependent buffers and the
// output framebuffer are reallocated.
void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinates)
{
    CurGLCompositor.SetScaleFactor(scale);

    // -1 means no settings have been applied yet, so nothing was compiled
    if (ScaleFactor != -1)
    {
        DeleteShaders();
    }

    ShaderStepIdx = 0;

    ScaleFactor = scale;
    ScreenWidth = 256 * ScaleFactor;
    ScreenHeight = 192 * ScaleFactor;

    TilesPerLine = ScreenWidth/TileSize;
    TileLines = ScreenHeight/TileSize;

    HiresCoordinates = highResolutionCoordinates;

    MaxWorkTiles = TilesPerLine*TileLines*16;

    for (int i = 0; i < tilememoryLayer_Num; i++)
    {
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory[i]);
        glBufferData(GL_SHADER_STORAGE_BUFFER, 4*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW);
    }

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW);

    int binResultSize = sizeof(BinResultHeader)
        + TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse
        + TilesPerLine*TileLines*BinStride*4 // BinnedMask
        + TilesPerLine*TileLines*BinStride*4; // WorkOffsets
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW);

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, WorkDescMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, MaxWorkTiles*2*4*2, nullptr, GL_DYNAMIC_DRAW);

    // texture storage is immutable, so the texture has to be recreated
    if (Framebuffer != 0)
        glDeleteTextures(1, &Framebuffer);
    glGenTextures(1, &Framebuffer);
    glBindTexture(GL_TEXTURE_2D, Framebuffer);
    glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, ScreenWidth, ScreenHeight);

    // eh those are pretty bad guesses
    // though real hw shouldn't be able to render all 2048 polygons on every line either
    int maxYSpanIndices = 64*2048 * ScaleFactor;
    YSpanIndices.resize(maxYSpanIndices);

    glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
    glBufferData(GL_TEXTURE_BUFFER, maxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW);

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*maxYSpanIndices, nullptr, GL_DYNAMIC_DRAW);

    glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture);
    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory);
}

void ComputeRenderer::VCount144(GPU& gpu)
{
}

// Copies depth, W, colour and texture coordinates of the edge's two end
// vertices (`from` -> suffix 0, `to` -> suffix 1) into the span.
void ComputeRenderer::SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to)
{
    span->Z0 = poly->FinalZ[from];
    span->W0 = poly->FinalW[from];
    span->Z1 = poly->FinalZ[to];
    span->W1 = poly->FinalW[to];
    span->ColorR0 = poly->Vertices[from]->FinalColor[0];
    span->ColorG0 = poly->Vertices[from]->FinalColor[1];
    span->ColorB0 = poly->Vertices[from]->FinalColor[2];
    span->ColorR1 = poly->Vertices[to]->FinalColor[0];
    span->ColorG1 = poly->Vertices[to]->FinalColor[1];
    span->ColorB1 = poly->Vertices[to]->FinalColor[2];
    span->TexcoordU0 = poly->Vertices[from]->TexCoords[0];
    span->TexcoordV0 = poly->Vertices[from]->TexCoords[1];
    span->TexcoordU1 = poly->Vertices[to]->TexCoords[0];
    span->TexcoordV1 = poly->Vertices[to]->TexCoords[1];
}

// Sets up a zero-length edge located at a single vertex. RenderFrame() uses it
// for polygons whose vertices all share the same Y coordinate.
//
// side: 0 = left edge, 1 = right edge (the right edge is moved one pixel left)
// Also grows the polygon's horizontal bounds (rp->XMin/XMax) to contain it.
void ComputeRenderer::SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2])
{
    s32 x0 = positions[vertex][0];
    if (side)
    {
        span->DxInitial = -0x40000;
        x0--;
    }
    else
    {
        span->DxInitial = 0;
    }

    span->X0 = span->X1 = x0;
    span->XMin = x0;
    span->XMax = x0;
    span->Y0 = span->Y1 = positions[vertex][1];

    if (span->XMin < rp->XMin)
    {
        rp->XMin = span->XMin;
        rp->XMinY = span->Y0;
    }
    if (span->XMax > rp->XMax)
    {
        rp->XMax = span->XMax;
        rp->XMaxY = span->Y0;
    }

    span->Increment = 0;

    span->I0 = span->I1 = span->IRecip = 0;
    span->Linear = true;

    span->XCovIncr = 0;

    span->IsDummy = true;

    SetupAttrs(span, poly, vertex, vertex);
}

// Sets up the edge running from vertex `from` to vertex `to`.
//
// side:      0 = left edge, 1 = right edge
// positions: scaled screen positions of the polygon's vertices
//
// Fills in the slope, the initial X offset, the interpolation endpoints and
// the W values used by the shaders, and grows the polygon's horizontal bounds
// (rp->XMin/XMax) to contain the edge.
void ComputeRenderer::SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2])
{
    span->X0 = positions[from][0];
    span->X1 = positions[to][0];
    span->Y0 = positions[from][1];
    span->Y1 = positions[to][1];

    SetupAttrs(span, poly, from, to);

    // Y coordinates at which the edge reaches its leftmost/rightmost X
    s32 minXY, maxXY;
    bool negative = false;
    if (span->X1 > span->X0)
    {
        span->XMin = span->X0;
        span->XMax = span->X1-1;

        minXY = span->Y0;
        maxXY = span->Y1;
    }
    else if (span->X1 < span->X0)
    {
        span->XMin = span->X1;
        span->XMax = span->X0-1;
        negative = true;

        minXY = span->Y1;
        maxXY = span->Y0;
    }
    else
    {
        span->XMin = span->X0;
        if (side) span->XMin--;
        span->XMax = span->XMin;

        // doesn't matter for completely vertical slope
        minXY = span->Y0;
        maxXY = span->Y0;
    }

    if (span->XMin < rp->XMin)
    {
        rp->XMin = span->XMin;
        rp->XMinY = minXY;
    }
    if (span->XMax > rp->XMax)
    {
        rp->XMax = span->XMax;
        rp->XMaxY = maxXY;
    }

    span->IsDummy = false;

    // xlen is always >= 1 because XMax >= XMin in all three cases above
    s32 xlen = span->XMax+1 - span->XMin;
    s32 ylen = span->Y1 - span->Y0;

    // slope increment has a 18-bit fractional part
    // note: for some reason, x/y isn't calculated directly,
    // instead, 1/y is calculated and then multiplied by x
    // TODO: this is still not perfect (see for example x=169 y=33)
    if (ylen == 0)
    {
        span->Increment = 0;
    }
    else if (ylen == xlen)
    {
        span->Increment = 0x40000;
    }
    else
    {
        s32 yrecip = (1<<18) / ylen;
        span->Increment = (span->X1-span->X0) * yrecip;
        if (span->Increment < 0) span->Increment = -span->Increment;
    }

    bool xMajor = (span->Increment > 0x40000);

    if (side)
    {
        // right

        if (xMajor)
            span->DxInitial = negative ? (0x20000 + 0x40000) : (span->Increment - 0x20000);
        else if (span->Increment != 0)
            span->DxInitial = negative ? 0x40000 : 0;
        else
            span->DxInitial = -0x40000;
    }
    else
    {
        // left

        if (xMajor)
            span->DxInitial = negative ? ((span->Increment - 0x20000) + 0x40000) : 0x20000;
        else if (span->Increment != 0)
            span->DxInitial = negative ? 0x40000 : 0;
        else
            span->DxInitial = 0;
    }

    // x-major edges interpolate along X, everything else along Y
    if (xMajor)
    {
        if (side)
        {
            span->I0 = span->X0 - 1;
            span->I1 = span->X1 - 1;
        }
        else
        {
            span->I0 = span->X0;
            span->I1 = span->X1;
        }

        // used for calculating AA coverage
        span->XCovIncr = (ylen << 10) / xlen;
    }
    else
    {
        span->I0 = span->Y0;
        span->I1 = span->Y1;
    }

    if (span->I0 != span->I1)
        span->IRecip = (1<<30) / (span->I1 - span->I0);
    else
        span->IRecip = 0;

    span->Linear = (span->W0 == span->W1) && !(span->W0 & 0x7E) && !(span->W1 & 0x7E);

    if ((span->W0 & 0x1) && !(span->W1 & 0x1))
    {
        span->W0n = (span->W0 - 1) >> 1;
        span->W0d = (span->W0 + 1) >> 1;
        span->W1d = span->W1 >> 1;
    }
    else
    {
        span->W0n = span->W0 >> 1;
        span->W0d = span->W0 >> 1;
        span->W1d = span->W1 >> 1;
    }
}

// One batch of polygons which can be rasterised with a single dispatch:
// they share the texture, the sampler and the blend mode.
struct Variant
{
    GLuint Texture, Sampler;
    u16 Width, Height;
    u8 BlendMode;

    // Width/Height are intentionally not part of the identity
    bool operator==(const Variant& other) const
    {
        return Texture == other.Texture && Sampler == other.Sampler && BlendMode == other.BlendMode;
    }
};

/*
    Antialiasing
    W-Buffer
    With Texture
    0
    1, 3
    2
    without Texture
    2
    0, 1, 3

    => 20 Shader + 1x Shadow Mask
*/

// Renders one frame.
//
// On the CPU every polygon is assigned to a variant and split into edge
// (Y span) setups plus one index entry per scanline. This data is uploaded
// and the compute passes run in order: clear, x-span interpolation, binning,
// work list offsets, work sorting, one rasterise dispatch per variant,
// depth blend and the final pass.
//
// All shaders must have been compiled beforehand. Nothing is done when the
// texture cache reports no update and the frame is flagged as identical.
void ComputeRenderer::RenderFrame(GPU& gpu)
{
    assert(!NeedsShaderCompile());
    if (!Texcache.Update(gpu) && gpu.GPU3D.RenderFrameIdentical)
    {
        return;
    }

    int numYSpans = 0;
    int numSetupIndices = 0;

    /*
        Some games really like to spam small textures, often
        to store the data like PPU tiles. E.g. Shantae
        or some Mega Man game. Fortunately they are usually kind
        enough to not vary the texture size all too often (usually
        they just use 8x8 or 16x for everything).

        This is the reason we have this whole mess where textures of
        the same size are put into array textures. This allows
        to increase the batch size.
        Less variance between each Variant hah!
    */
    // prevVariant/prevTexLayer start at 0 so that a leading untextured polygon
    // does not store indeterminate values
    u32 numVariants = 0, prevVariant = 0, prevTexLayer = 0;
    Variant variants[MaxVariants];

    bool enableTextureMaps = gpu.GPU3D.RenderDispCnt & (1<<0);

    for (int i = 0; i < gpu.GPU3D.RenderNumPolygons; i++)
    {
        Polygon* polygon = gpu.GPU3D.RenderPolygonRAM[i];

        u32 nverts = polygon->NumVertices;
        u32 vtop = polygon->VTop, vbot = polygon->VBottom;

        u32 curVL = vtop, curVR = vtop;
        u32 nextVL, nextVR;

        RenderPolygons[i].FirstXSpan = numSetupIndices;
        RenderPolygons[i].Attr = polygon->Attr;

        bool foundVariant = false;
        if (i > 0)
        {
            // if the whole texture attribute matches
            // the texture layer will also match
            Polygon* prevPolygon = gpu.GPU3D.RenderPolygonRAM[i - 1];
            foundVariant = prevPolygon->TexParam == polygon->TexParam
                && prevPolygon->TexPalette == polygon->TexPalette
                && (prevPolygon->Attr & 0x30) == (polygon->Attr & 0x30)
                && prevPolygon->IsShadowMask == polygon->IsShadowMask;
        }

        if (!foundVariant)
        {
            // value-initialised so that copying it never reads unset fields
            Variant variant{};
            variant.BlendMode = polygon->IsShadowMask ? 4 : ((polygon->Attr >> 4) & 0x3);
            variant.Texture = 0;
            variant.Sampler = 0;
            u32* textureLastVariant = nullptr;
            // we always need to look up the texture to get the layer of the array texture
            if (enableTextureMaps && ((polygon->TexParam >> 26) & 0x7))
            {
                Texcache.GetTexture(gpu, polygon->TexParam, polygon->TexPalette, variant.Texture, prevTexLayer, textureLastVariant);
                bool wrapS = (polygon->TexParam >> 16) & 1;
                bool wrapT = (polygon->TexParam >> 17) & 1;
                bool mirrorS = (polygon->TexParam >> 18) & 1;
                bool mirrorT = (polygon->TexParam >> 19) & 1;
                variant.Sampler = Samplers[(wrapS ? (mirrorS ? 2 : 1) : 0) + (wrapT ? (mirrorT ? 2 : 1) : 0) * 3];

                // fast path: the variant this texture was used with last
                if (*textureLastVariant < numVariants && variants[*textureLastVariant] == variant)
                {
                    foundVariant = true;
                    prevVariant = *textureLastVariant;
                }
            }

            if (!foundVariant)
            {
                // slow path: search all variants, most recent first
                for (int j = (int)numVariants - 1; j >= 0; j--)
                {
                    if (variants[j] == variant)
                    {
                        foundVariant = true;
                        prevVariant = j;
                        break;
                    }
                }

                if (!foundVariant)
                {
                    // check before writing, not after
                    assert(numVariants < MaxVariants);
                    prevVariant = numVariants;
                    variants[numVariants] = variant;
                    variants[numVariants].Width = TextureWidth(polygon->TexParam);
                    variants[numVariants].Height = TextureHeight(polygon->TexParam);
                    numVariants++;
                }

                if (textureLastVariant)
                    *textureLastVariant = prevVariant;
            }
        }
        RenderPolygons[i].Variant = prevVariant;
        RenderPolygons[i].TextureLayer = (float)prevTexLayer;

        // the winding decides in which direction the vertex list is walked
        // for the left and the right edge
        if (polygon->FacingView)
        {
            nextVL = curVL + 1;
            if (nextVL >= nverts) nextVL = 0;
            nextVR = curVR - 1;
            if ((s32)nextVR < 0) nextVR = nverts - 1;
        }
        else
        {
            nextVL = curVL - 1;
            if ((s32)nextVL < 0) nextVL = nverts - 1;
            nextVR = curVR + 1;
            if (nextVR >= nverts) nextVR = 0;
        }

        s32 scaledPositions[10][2];
        s32 ytop = ScreenHeight, ybot = 0;
        for (int v = 0; v < polygon->NumVertices; v++)
        {
            if (HiresCoordinates)
            {
                scaledPositions[v][0] = (polygon->Vertices[v]->HiresPosition[0] * ScaleFactor) >> 4;
                scaledPositions[v][1] = (polygon->Vertices[v]->HiresPosition[1] * ScaleFactor) >> 4;
            }
            else
            {
                scaledPositions[v][0] = polygon->Vertices[v]->FinalPosition[0] * ScaleFactor;
                scaledPositions[v][1] = polygon->Vertices[v]->FinalPosition[1] * ScaleFactor;
            }
            ytop = std::min(scaledPositions[v][1], ytop);
            ybot = std::max(scaledPositions[v][1], ybot);
        }
        RenderPolygons[i].YTop = ytop;
        RenderPolygons[i].YBot = ybot;
        RenderPolygons[i].XMin = ScreenWidth;
        RenderPolygons[i].XMax = 0;

        if (ybot == ytop)
        {
            // all vertices on one line: emit a single scanline spanning from
            // the leftmost to the rightmost of vertices 0, 1 and nverts-1
            vtop = 0; vbot = 0;

            RenderPolygons[i].YBot++;

            int j = 1;
            if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
            if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;

            j = nverts - 1;
            if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
            if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;

            assert(numYSpans < MaxYSpanSetups);
            u32 curSpanL = numYSpans;
            SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vtop, 0, scaledPositions);
            assert(numYSpans < MaxYSpanSetups);
            u32 curSpanR = numYSpans;
            SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vbot, 1, scaledPositions);

            assert((size_t)numSetupIndices < YSpanIndices.size());
            YSpanIndices[numSetupIndices].PolyIdx = i;
            YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
            YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
            YSpanIndices[numSetupIndices].Y = ytop;
            numSetupIndices++;
        }
        else
        {
            u32 curSpanL = numYSpans;
            assert(numYSpans < MaxYSpanSetups);
            SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
            u32 curSpanR = numYSpans;
            assert(numYSpans < MaxYSpanSetups);
            SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);

            for (u32 y = ytop; y < ybot; y++)
            {
                // advance the left edge once the scanline reaches its end vertex
                if (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
                {
                    while (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
                    {
                        curVL = nextVL;
                        if (polygon->FacingView)
                        {
                            nextVL = curVL + 1;
                            if (nextVL >= nverts)
                                nextVL = 0;
                        }
                        else
                        {
                            nextVL = curVL - 1;
                            if ((s32)nextVL < 0)
                                nextVL = nverts - 1;
                        }
                    }

                    assert(numYSpans < MaxYSpanSetups);
                    curSpanL = numYSpans;
                    SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
                }
                // same for the right edge
                if (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
                {
                    while (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
                    {
                        curVR = nextVR;
                        if (polygon->FacingView)
                        {
                            nextVR = curVR - 1;
                            if ((s32)nextVR < 0)
                                nextVR = nverts - 1;
                        }
                        else
                        {
                            nextVR = curVR + 1;
                            if (nextVR >= nverts)
                                nextVR = 0;
                        }
                    }

                    assert(numYSpans < MaxYSpanSetups);
                    curSpanR = numYSpans;
                    SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);
                }

                assert((size_t)numSetupIndices < YSpanIndices.size());
                YSpanIndices[numSetupIndices].PolyIdx = i;
                YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
                YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
                YSpanIndices[numSetupIndices].Y = y;
                numSetupIndices++;
            }
        }

        //printf("polygon min max %d %d | %d %d\n", RenderPolygons[i].XMin, RenderPolygons[i].XMinY, RenderPolygons[i].XMax, RenderPolygons[i].XMaxY);
    }

    /*for (u32 i = 0; i < RenderNumPolygons; i++)
    {
        if (RenderPolygons[i].Variant >= numVariants)
        {
            printf("blarb2 %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
        }
        //assert(RenderPolygons[i].Variant < numVariants);
    }*/

    if (numYSpans > 0)
    {
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
        glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups);

        glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
        glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices.data());

        glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory);
        glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, gpu.GPU3D.RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons);
        // we haven't accessed image data yet, so we don't need to invalidate anything
    }

    //printf("found via %d %d %d of %d\n", foundviatexcache, foundviaprev, numslow, RenderNumPolygons);

    // bind everything
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, RenderPolygonMemory);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, XSpanSetupMemory);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, YSpanSetupMemory);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, FinalTileMemory);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, BinResultMemory);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 7, WorkDescMemory);

    MetaUniform meta;
    meta.DispCnt = gpu.GPU3D.RenderDispCnt;
    meta.NumPolygons = gpu.GPU3D.RenderNumPolygons;
    meta.NumVariants = numVariants;
    meta.AlphaRef = gpu.GPU3D.RenderAlphaRef;
    {
        // each 5-bit channel is expanded to 6 bits: doubled, plus one unless zero
        u32 r = (gpu.GPU3D.RenderClearAttr1 << 1) & 0x3E; if (r) r++;
        u32 g = (gpu.GPU3D.RenderClearAttr1 >> 4) & 0x3E; if (g) g++;
        u32 b = (gpu.GPU3D.RenderClearAttr1 >> 9) & 0x3E; if (b) b++;
        u32 a = (gpu.GPU3D.RenderClearAttr1 >> 16) & 0x1F;
        meta.ClearColor = r | (g << 8) | (b << 16) | (a << 24);
        meta.ClearDepth = ((gpu.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF;
        meta.ClearAttr = gpu.GPU3D.RenderClearAttr1 & 0x3F008000;
    }
    // ToonTable interleaves three tables with a stride of four entries:
    // +0 toon colours, +1 fog densities, +2 edge colours
    // NOTE(review): assumes ToonTable holds at least 34*4 entries - confirm
    for (u32 i = 0; i < 32; i++)
    {
        u32 color = gpu.GPU3D.RenderToonTable[i];
        u32 r = (color << 1) & 0x3E;
        u32 g = (color >> 4) & 0x3E;
        u32 b = (color >> 9) & 0x3E;
        if (r) r++;
        if (g) g++;
        if (b) b++;

        meta.ToonTable[i*4+0] = r | (g << 8) | (b << 16);
    }
    for (u32 i = 0; i < 34; i++)
    {
        meta.ToonTable[i*4+1] = gpu.GPU3D.RenderFogDensityTable[i];
    }
    for (u32 i = 0; i < 8; i++)
    {
        u32 color = gpu.GPU3D.RenderEdgeTable[i];
        u32 r = (color << 1) & 0x3E;
        u32 g = (color >> 4) & 0x3E;
        u32 b = (color >> 9) & 0x3E;
        if (r) r++;
        if (g) g++;
        if (b) b++;

        meta.ToonTable[i*4+2] = r | (g << 8) | (b << 16);
    }
    meta.FogOffset = gpu.GPU3D.RenderFogOffset;
    meta.FogShift = gpu.GPU3D.RenderFogShift;
    {
        u32 fogR = (gpu.GPU3D.RenderFogColor << 1) & 0x3E; if (fogR) fogR++;
        u32 fogG = (gpu.GPU3D.RenderFogColor >> 4) & 0x3E; if (fogG) fogG++;
        u32 fogB = (gpu.GPU3D.RenderFogColor >> 9) & 0x3E; if (fogB) fogB++;
        u32 fogA = (gpu.GPU3D.RenderFogColor >> 16) & 0x1F;
        meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24);
    }

    glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory);
    glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta);

    glBindBufferBase(GL_UNIFORM_BUFFER, 0, MetaUniformMemory);

    // glMemoryBarrier takes a mask of *_BARRIER_BIT values.
    // GL_SHADER_STORAGE_BUFFER, which used to be passed here, is a buffer
    // binding target and not a barrier bit. The command bit is included
    // because shader written work counts feed glDispatchComputeIndirect.
    const GLbitfield computeBarrier = GL_SHADER_STORAGE_BARRIER_BIT | GL_COMMAND_BARRIER_BIT;

    glUseProgram(ShaderClearCoarseBinMask);
    glDispatchCompute(TilesPerLine*TileLines/32, 1, 1);

    bool wbuffer = false;
    if (numYSpans > 0)
    {
        // the depth buffering mode of the first polygon is used for the whole frame
        wbuffer = gpu.GPU3D.RenderPolygonRAM[0]->WBuffer;

        glUseProgram(ShaderClearIndirectWorkCount);
        glDispatchCompute((numVariants+31)/32, 1, 1);

        // calculate x-spans
        glBindImageTexture(0, YSpanIndicesTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16UI);
        glUseProgram(ShaderInterpXSpans[wbuffer]);
        glDispatchCompute((numSetupIndices + 31) / 32, 1, 1);
        glMemoryBarrier(computeBarrier);

        // bin polygons
        glUseProgram(ShaderBinCombined);
        glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
        glMemoryBarrier(computeBarrier);

        // calculate list offsets
        glUseProgram(ShaderCalculateWorkListOffset);
        glDispatchCompute((numVariants + 31) / 32, 1, 1);
        glMemoryBarrier(computeBarrier);

        // sort shader work
        glUseProgram(ShaderSortWork);
        glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
        glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount));
        glMemoryBarrier(computeBarrier);

        glActiveTexture(GL_TEXTURE0);

        for (int i = 0; i < tilememoryLayer_Num; i++)
            glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2+i, TileMemory[i]);

        // rasterise
        {
            bool highLightMode = gpu.GPU3D.RenderDispCnt & (1<<1);

            // both tables are indexed by Variant::BlendMode (4 = shadow mask)
            GLuint shadersNoTexture[] =
            {
                ShaderRasteriseNoTexture[wbuffer],
                ShaderRasteriseNoTexture[wbuffer],
                highLightMode
                    ? ShaderRasteriseNoTextureHighlight[wbuffer]
                    : ShaderRasteriseNoTextureToon[wbuffer],
                ShaderRasteriseNoTexture[wbuffer],
                ShaderRasteriseShadowMask[wbuffer]
            };
            GLuint shadersUseTexture[] =
            {
                ShaderRasteriseUseTextureModulate[wbuffer],
                ShaderRasteriseUseTextureDecal[wbuffer],
                highLightMode
                    ? ShaderRasteriseUseTextureHighlight[wbuffer]
                    : ShaderRasteriseUseTextureToon[wbuffer],
                ShaderRasteriseUseTextureDecal[wbuffer],
                ShaderRasteriseShadowMask[wbuffer]
            };

            // only rebind what actually changes between variants
            GLuint prevShader = 0;
            GLuint prevTexture = 0, prevSampler = 0;
            for (u32 i = 0; i < numVariants; i++)
            {
                GLuint shader = 0;
                if (variants[i].Texture == 0)
                {
                    shader = shadersNoTexture[variants[i].BlendMode];
                }
                else
                {
                    shader = shadersUseTexture[variants[i].BlendMode];
                    if (variants[i].Texture != prevTexture)
                    {
                        glBindTexture(GL_TEXTURE_2D_ARRAY, variants[i].Texture);
                        prevTexture = variants[i].Texture;
                    }
                    if (variants[i].Sampler != prevSampler)
                    {
                        glBindSampler(0, variants[i].Sampler);
                        prevSampler = variants[i].Sampler;
                    }
                }
                assert(shader != 0);
                if (shader != prevShader)
                {
                    glUseProgram(shader);
                    prevShader = shader;
                }

                glUniform1ui(UniformIdxCurVariant, i);
                glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height);
                glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
                glDispatchComputeIndirect(offsetof(BinResultHeader, VariantWorkCount) + i*4*4);
            }
        }
    }
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

    // compose final image
    glUseProgram(ShaderDepthBlend[wbuffer]);
    glDispatchCompute(ScreenWidth/TileSize, ScreenHeight/TileSize, 1);
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

    glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8);
    glBindImageTexture(1, LowResFramebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI);
    // index into ShaderFinalPass: 0x4 = anti-aliasing, 0x2 = fog, 0x1 = edge marking
    u32 finalPassShader = 0;
    if (gpu.GPU3D.RenderDispCnt & (1<<4))
        finalPassShader |= 0x4;
    if (gpu.GPU3D.RenderDispCnt & (1<<7))
        finalPassShader |= 0x2;
    if (gpu.GPU3D.RenderDispCnt & (1<<5))
        finalPassShader |= 0x1;

    glUseProgram(ShaderFinalPass[finalPassShader]);
    glDispatchCompute(ScreenWidth/32, ScreenHeight, 1);
    glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);

    glBindSampler(0, 0);

    /*u64 starttime = armGetSystemTick();
    EmuQueue.waitIdle();
    printf("total time %f\n", armTicksToNs(armGetSystemTick()-starttime)*0.000001f);*/

    /*for (u32 i = 0; i < RenderNumPolygons; i++)
    {
        if (RenderPolygons[i].Variant >= numVariants)
        {
            printf("blarb %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
        }
        //assert(RenderPolygons[i].Variant < numVariants);
    }*/

    /*for (int i = 0; i < binresult->SortWorkWorkCount[0]*32; i++)
    {
        printf("sorted %x %x\n", binresult->SortedWork[i*2+0], binresult->SortedWork[i*2+1]);
    }*/
/*    if (polygonvisible != -1)
    {
        SpanSetupX* xspans = Gfx::DataHeap->CpuAddr(XSpanSetupMemory);
        printf("span result\n");
        Polygon* poly = RenderPolygonRAM[polygonvisible];
        u32 xspanoffset = RenderPolygons[polygonvisible].FirstXSpan;
        for (u32 i = 0; i < (poly->YBottom - poly->YTop); i++)
        {
            printf("%d: %d - %d | %d %d | %d %d\n", i + poly->YTop, xspans[xspanoffset + i].X0, xspans[xspanoffset + i].X1, xspans[xspanoffset + i].__pad0, xspans[xspanoffset + i].__pad1, RenderPolygons[polygonvisible].YTop, RenderPolygons[polygonvisible].YBot);
        }
    }*/
/*
    printf("xspans: %d\n", numSetupIndices);
    SpanSetupX* xspans = Gfx::DataHeap->CpuAddr(XSpanSetupMemory[curSlice]);
    for (int i = 0; i < numSetupIndices; i++)
    {
        printf("poly %d %d %d | line %d | %d to %d\n", YSpanIndices[i].PolyIdx, YSpanIndices[i].SpanIdxL, YSpanIndices[i].SpanIdxR, YSpanIndices[i].Y, xspans[i].X0, xspans[i].X1);
    }
    printf("bin result\n");
    BinResult* binresult = Gfx::DataHeap->CpuAddr(BinResultMemory);
    for (u32 y = 0; y < 192/8; y++)
    {
        for (u32 x = 0; x < 256/8; x++)
        {
            printf("%08x ", binresult->BinnedMaskCoarse[(x + y * (256/8)) * 2]);
        }
        printf("\n");
    }*/
}

void ComputeRenderer::RestartFrame(GPU& gpu)
{
}

// Returns a pointer to one 256 pixel wide line of the CPU side framebuffer.
// When line 0 is requested the contents of the pixel pack buffer (filled by
// PrepareCaptureFrame()) are copied into the CPU side framebuffer first.
u32* ComputeRenderer::GetLine(int line)
{
    int stride = 256;

    if (line == 0)
    {
        glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
        u8* data = static_cast<u8*>(glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY));
        if (data)
        {
            memcpy(&FramebufferCPU[0], data, 4*stride*192);
            // only unmap what was actually mapped
            glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
        }
    }

    return &FramebufferCPU[stride * line];
}

void ComputeRenderer::SetupAccelFrame()
{
    glBindTexture(GL_TEXTURE_2D, Framebuffer);
}

// Starts the read back of the native resolution framebuffer. With a pixel
// pack buffer bound the data goes into that buffer (offset 0), from where
// GetLine() picks it up.
void ComputeRenderer::PrepareCaptureFrame()
{
    glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
    glBindTexture(GL_TEXTURE_2D, LowResFramebuffer);
    glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, nullptr);
}

void ComputeRenderer::BindOutputTexture(int buffer)
{
    CurGLCompositor.BindOutputTexture(buffer);
}

void ComputeRenderer::Blit(const GPU &gpu)
{
    CurGLCompositor.RenderFrame(gpu, *this);
}

void ComputeRenderer::Stop(const GPU &gpu)
{
    CurGLCompositor.Stop(gpu);
}

}