melonDS/src/GPU3D_Compute.cpp

1137 lines
39 KiB
C++

/*
Copyright 2016-2024 melonDS team
This file is part of melonDS.
melonDS is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.
melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with melonDS. If not, see http://www.gnu.org/licenses/.
*/
#include "GPU3D_Compute.h"
#include <assert.h>
#include "OpenGLSupport.h"
#include "GPU3D_Compute_shaders.h"
namespace melonDS
{
ComputeRenderer::ComputeRenderer(GLCompositor&& compositor)
: Renderer3D(true), Texcache(TexcacheOpenGLLoader()), CurGLCompositor(std::move(compositor))
{}
bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, const std::initializer_list<const char*>& defines)
{
std::string shaderName;
std::string shaderSource;
shaderSource += "#version 430 core\n";
for (const char* define : defines)
{
shaderSource += "#define ";
shaderSource += define;
shaderSource += '\n';
shaderName += define;
shaderName += ',';
}
shaderSource += "#define ScreenWidth ";
shaderSource += std::to_string(ScreenWidth);
shaderSource += "\n#define ScreenHeight ";
shaderSource += std::to_string(ScreenHeight);
shaderSource += "\n#define MaxWorkTiles ";
shaderSource += std::to_string(MaxWorkTiles);
shaderSource += ComputeRendererShaders::Common;
shaderSource += source;
return OpenGL::CompileComputeProgram(shader, shaderSource.c_str(), shaderName.c_str());
}
void ComputeRenderer::ShaderCompileStep(int& current, int& count)
{
current = ShaderStepIdx;
ShaderStepIdx++;
count = 33;
switch (current)
{
case 0:
CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans, {"InterpSpans", "ZBuffer"});
return;
case 1:
CompileShader(ShaderInterpXSpans[1], ComputeRendererShaders::InterpSpans, {"InterpSpans", "WBuffer"});
return;
case 2:
CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"});
return;
case 3:
CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend, {"DepthBlend", "ZBuffer"});
return;
case 4:
CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend, {"DepthBlend", "WBuffer"});
return;
case 5:
CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture"});
return;
case 6:
CompileShader(ShaderRasteriseNoTexture[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture"});
return;
case 7:
CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Toon"});
return;
case 8:
CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Toon"});
return;
case 9:
CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Highlight"});
return;
case 10:
CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Highlight"});
return;
case 11:
CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Decal"});
return;
case 12:
CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Decal"});
return;
case 13:
CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Modulate"});
return;
case 14:
CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Modulate"});
return;
case 15:
CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Toon"});
return;
case 16:
CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Toon"});
return;
case 17:
CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Highlight"});
return;
case 18:
CompileShader(ShaderRasteriseUseTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Highlight"});
return;
case 19:
CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "ShadowMask"});
return;
case 20:
CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "ShadowMask"});
return;
case 21:
CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"});
return;
case 22:
CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"});
return;
case 23:
CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"});
return;
case 24:
CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"});
return;
case 25:
CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"});
return;
case 26:
CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"});
return;
case 27:
CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"});
return;
case 28:
CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"});
return;
case 29:
CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"});
return;
case 30:
CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"});
return;
case 31:
CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"});
return;
case 32:
CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"});
return;
default:
__builtin_unreachable();
return;
}
}
void blah(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *message, const void *userParam)
{
printf("%s\n", message);
}
std::unique_ptr<ComputeRenderer> ComputeRenderer::New()
{
std::optional<GLCompositor> compositor = GLCompositor::New();
if (!compositor)
return nullptr;
std::unique_ptr<ComputeRenderer> result = std::unique_ptr<ComputeRenderer>(new ComputeRenderer(std::move(*compositor)));
//glDebugMessageCallback(blah, NULL);
//glEnable(GL_DEBUG_OUTPUT);
glGenBuffers(1, &result->YSpanSetupMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->YSpanSetupMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW);
glGenBuffers(1, &result->RenderPolygonMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->RenderPolygonMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW);
glGenBuffers(1, &result->XSpanSetupMemory);
glGenBuffers(1, &result->BinResultMemory);
glGenBuffers(1, &result->FinalTileMemory);
glGenBuffers(1, &result->YSpanIndicesTextureMemory);
glGenBuffers(tilememoryLayer_Num, result->TileMemory);
glGenBuffers(1, &result->WorkDescMemory);
glGenTextures(1, &result->YSpanIndicesTexture);
glGenTextures(1, &result->LowResFramebuffer);
glBindTexture(GL_TEXTURE_2D, result->LowResFramebuffer);
glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8UI, 256, 192);
glGenBuffers(1, &result->MetaUniformMemory);
glBindBuffer(GL_UNIFORM_BUFFER, result->MetaUniformMemory);
glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW);
glGenSamplers(9, result->Samplers);
for (u32 j = 0; j < 3; j++)
{
for (u32 i = 0; i < 3; i++)
{
const GLenum translateWrapMode[3] = {GL_CLAMP_TO_EDGE, GL_REPEAT, GL_MIRRORED_REPEAT};
glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_WRAP_S, translateWrapMode[i]);
glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_WRAP_T, translateWrapMode[j]);
glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glSamplerParameterf(result->Samplers[i+j*3], GL_TEXTURE_MAG_FILTER, GL_NEAREST);
}
}
glGenBuffers(1, &result->PixelBuffer);
glBindBuffer(GL_PIXEL_PACK_BUFFER, result->PixelBuffer);
glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, NULL, GL_DYNAMIC_READ);
return result;
}
ComputeRenderer::~ComputeRenderer()
{
Texcache.Reset();
glDeleteBuffers(1, &YSpanSetupMemory);
glDeleteBuffers(1, &RenderPolygonMemory);
glDeleteBuffers(1, &XSpanSetupMemory);
glDeleteBuffers(1, &BinResultMemory);
glDeleteBuffers(tilememoryLayer_Num, TileMemory);
glDeleteBuffers(1, &WorkDescMemory);
glDeleteBuffers(1, &FinalTileMemory);
glDeleteBuffers(1, &YSpanIndicesTextureMemory);
glDeleteTextures(1, &YSpanIndicesTexture);
glDeleteTextures(1, &Framebuffer);
glDeleteBuffers(1, &MetaUniformMemory);
glDeleteSamplers(9, Samplers);
glDeleteBuffers(1, &PixelBuffer);
}
void ComputeRenderer::DeleteShaders()
{
std::initializer_list<GLuint> allPrograms =
{
ShaderInterpXSpans[0],
ShaderInterpXSpans[1],
ShaderBinCombined,
ShaderDepthBlend[0],
ShaderDepthBlend[1],
ShaderRasteriseNoTexture[0],
ShaderRasteriseNoTexture[1],
ShaderRasteriseNoTextureToon[0],
ShaderRasteriseNoTextureToon[1],
ShaderRasteriseNoTextureHighlight[0],
ShaderRasteriseNoTextureHighlight[1],
ShaderRasteriseUseTextureDecal[0],
ShaderRasteriseUseTextureDecal[1],
ShaderRasteriseUseTextureModulate[0],
ShaderRasteriseUseTextureModulate[1],
ShaderRasteriseUseTextureToon[0],
ShaderRasteriseUseTextureToon[1],
ShaderRasteriseUseTextureHighlight[0],
ShaderRasteriseUseTextureHighlight[1],
ShaderRasteriseShadowMask[0],
ShaderRasteriseShadowMask[1],
ShaderClearCoarseBinMask,
ShaderClearIndirectWorkCount,
ShaderCalculateWorkListOffset,
ShaderSortWork,
ShaderFinalPass[0],
ShaderFinalPass[1],
ShaderFinalPass[2],
ShaderFinalPass[3],
ShaderFinalPass[4],
ShaderFinalPass[5],
ShaderFinalPass[6],
ShaderFinalPass[7],
};
for (GLuint program : allPrograms)
glDeleteProgram(program);
}
void ComputeRenderer::Reset(GPU& gpu)
{
Texcache.Reset();
}
void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinates)
{
CurGLCompositor.SetScaleFactor(scale);
if (ScaleFactor != -1)
{
DeleteShaders();
}
ShaderStepIdx = 0;
ScaleFactor = scale;
ScreenWidth = 256 * ScaleFactor;
ScreenHeight = 192 * ScaleFactor;
TilesPerLine = ScreenWidth/TileSize;
TileLines = ScreenHeight/TileSize;
HiresCoordinates = highResolutionCoordinates;
MaxWorkTiles = TilesPerLine*TileLines*16;
for (int i = 0; i < tilememoryLayer_Num; i++)
{
glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory[i]);
glBufferData(GL_SHADER_STORAGE_BUFFER, 4*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW);
}
glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW);
int binResultSize = sizeof(BinResultHeader)
+ TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse
+ TilesPerLine*TileLines*BinStride*4 // BinnedMask
+ TilesPerLine*TileLines*BinStride*4; // WorkOffsets
glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, WorkDescMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, MaxWorkTiles*2*4*2, nullptr, GL_DYNAMIC_DRAW);
if (Framebuffer != 0)
glDeleteTextures(1, &Framebuffer);
glGenTextures(1, &Framebuffer);
glBindTexture(GL_TEXTURE_2D, Framebuffer);
glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, ScreenWidth, ScreenHeight);
// eh those are pretty bad guesses
// though real hw shouldn't be eable to render all 2048 polygons on every line either
int maxYSpanIndices = 64*2048 * ScaleFactor;
YSpanIndices.resize(maxYSpanIndices);
glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
glBufferData(GL_TEXTURE_BUFFER, maxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*maxYSpanIndices, nullptr, GL_DYNAMIC_DRAW);
glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture);
glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory);
}
void ComputeRenderer::VCount144(GPU& gpu)
{
}
void ComputeRenderer::SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to)
{
span->Z0 = poly->FinalZ[from];
span->W0 = poly->FinalW[from];
span->Z1 = poly->FinalZ[to];
span->W1 = poly->FinalW[to];
span->ColorR0 = poly->Vertices[from]->FinalColor[0];
span->ColorG0 = poly->Vertices[from]->FinalColor[1];
span->ColorB0 = poly->Vertices[from]->FinalColor[2];
span->ColorR1 = poly->Vertices[to]->FinalColor[0];
span->ColorG1 = poly->Vertices[to]->FinalColor[1];
span->ColorB1 = poly->Vertices[to]->FinalColor[2];
span->TexcoordU0 = poly->Vertices[from]->TexCoords[0];
span->TexcoordV0 = poly->Vertices[from]->TexCoords[1];
span->TexcoordU1 = poly->Vertices[to]->TexCoords[0];
span->TexcoordV1 = poly->Vertices[to]->TexCoords[1];
}
void ComputeRenderer::SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2])
{
s32 x0 = positions[vertex][0];
if (side)
{
span->DxInitial = -0x40000;
x0--;
}
else
{
span->DxInitial = 0;
}
span->X0 = span->X1 = x0;
span->XMin = x0;
span->XMax = x0;
span->Y0 = span->Y1 = positions[vertex][1];
if (span->XMin < rp->XMin)
{
rp->XMin = span->XMin;
rp->XMinY = span->Y0;
}
if (span->XMax > rp->XMax)
{
rp->XMax = span->XMax;
rp->XMaxY = span->Y0;
}
span->Increment = 0;
span->I0 = span->I1 = span->IRecip = 0;
span->Linear = true;
span->XCovIncr = 0;
span->IsDummy = true;
SetupAttrs(span, poly, vertex, vertex);
}
void ComputeRenderer::SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2])
{
span->X0 = positions[from][0];
span->X1 = positions[to][0];
span->Y0 = positions[from][1];
span->Y1 = positions[to][1];
SetupAttrs(span, poly, from, to);
s32 minXY, maxXY;
bool negative = false;
if (span->X1 > span->X0)
{
span->XMin = span->X0;
span->XMax = span->X1-1;
minXY = span->Y0;
maxXY = span->Y1;
}
else if (span->X1 < span->X0)
{
span->XMin = span->X1;
span->XMax = span->X0-1;
negative = true;
minXY = span->Y1;
maxXY = span->Y0;
}
else
{
span->XMin = span->X0;
if (side) span->XMin--;
span->XMax = span->XMin;
// doesn't matter for completely vertical slope
minXY = span->Y0;
maxXY = span->Y0;
}
if (span->XMin < rp->XMin)
{
rp->XMin = span->XMin;
rp->XMinY = minXY;
}
if (span->XMax > rp->XMax)
{
rp->XMax = span->XMax;
rp->XMaxY = maxXY;
}
span->IsDummy = false;
s32 xlen = span->XMax+1 - span->XMin;
s32 ylen = span->Y1 - span->Y0;
// slope increment has a 18-bit fractional part
// note: for some reason, x/y isn't calculated directly,
// instead, 1/y is calculated and then multiplied by x
// TODO: this is still not perfect (see for example x=169 y=33)
if (ylen == 0)
{
span->Increment = 0;
}
else if (ylen == xlen)
{
span->Increment = 0x40000;
}
else
{
s32 yrecip = (1<<18) / ylen;
span->Increment = (span->X1-span->X0) * yrecip;
if (span->Increment < 0) span->Increment = -span->Increment;
}
bool xMajor = (span->Increment > 0x40000);
if (side)
{
// right
if (xMajor)
span->DxInitial = negative ? (0x20000 + 0x40000) : (span->Increment - 0x20000);
else if (span->Increment != 0)
span->DxInitial = negative ? 0x40000 : 0;
else
span->DxInitial = -0x40000;
}
else
{
// left
if (xMajor)
span->DxInitial = negative ? ((span->Increment - 0x20000) + 0x40000) : 0x20000;
else if (span->Increment != 0)
span->DxInitial = negative ? 0x40000 : 0;
else
span->DxInitial = 0;
}
if (xMajor)
{
if (side)
{
span->I0 = span->X0 - 1;
span->I1 = span->X1 - 1;
}
else
{
span->I0 = span->X0;
span->I1 = span->X1;
}
// used for calculating AA coverage
span->XCovIncr = (ylen << 10) / xlen;
}
else
{
span->I0 = span->Y0;
span->I1 = span->Y1;
}
if (span->I0 != span->I1)
span->IRecip = (1<<30) / (span->I1 - span->I0);
else
span->IRecip = 0;
span->Linear = (span->W0 == span->W1) && !(span->W0 & 0x7E) && !(span->W1 & 0x7E);
if ((span->W0 & 0x1) && !(span->W1 & 0x1))
{
span->W0n = (span->W0 - 1) >> 1;
span->W0d = (span->W0 + 1) >> 1;
span->W1d = span->W1 >> 1;
}
else
{
span->W0n = span->W0 >> 1;
span->W0d = span->W0 >> 1;
span->W1d = span->W1 >> 1;
}
}
struct Variant
{
GLuint Texture, Sampler;
u16 Width, Height;
u8 BlendMode;
bool operator==(const Variant& other)
{
return Texture == other.Texture && Sampler == other.Sampler && BlendMode == other.BlendMode;
}
};
/*
Antialiasing
W-Buffer
With Texture
0
1, 3
2
without Texture
2
0, 1, 3
=> 20 Shader + 1x Shadow Mask
*/
void ComputeRenderer::RenderFrame(GPU& gpu)
{
assert(!NeedsShaderCompile());
if (!Texcache.Update(gpu) && gpu.GPU3D.RenderFrameIdentical)
{
return;
}
int numYSpans = 0;
int numSetupIndices = 0;
/*
Some games really like to spam small textures, often
to store the data like PPU tiles. E.g. Shantae
or some Mega Man game. Fortunately they are usually kind
enough to not vary the texture size all too often (usually
they just use 8x8 or 16x for everything).
This is the reason we have this whole mess where textures of
the same size are put into array textures. This allows
to increase the batch size.
Less variance between each Variant hah!
*/
u32 numVariants = 0, prevVariant, prevTexLayer;
Variant variants[MaxVariants];
bool enableTextureMaps = gpu.GPU3D.RenderDispCnt & (1<<0);
for (int i = 0; i < gpu.GPU3D.RenderNumPolygons; i++)
{
Polygon* polygon = gpu.GPU3D.RenderPolygonRAM[i];
u32 nverts = polygon->NumVertices;
u32 vtop = polygon->VTop, vbot = polygon->VBottom;
u32 curVL = vtop, curVR = vtop;
u32 nextVL, nextVR;
RenderPolygons[i].FirstXSpan = numSetupIndices;
RenderPolygons[i].Attr = polygon->Attr;
bool foundVariant = false;
if (i > 0)
{
// if the whole texture attribute matches
// the texture layer will also match
Polygon* prevPolygon = gpu.GPU3D.RenderPolygonRAM[i - 1];
foundVariant = prevPolygon->TexParam == polygon->TexParam
&& prevPolygon->TexPalette == polygon->TexPalette
&& (prevPolygon->Attr & 0x30) == (polygon->Attr & 0x30)
&& prevPolygon->IsShadowMask == polygon->IsShadowMask;
}
if (!foundVariant)
{
Variant variant;
variant.BlendMode = polygon->IsShadowMask ? 4 : ((polygon->Attr >> 4) & 0x3);
variant.Texture = 0;
variant.Sampler = 0;
u32* textureLastVariant = nullptr;
// we always need to look up the texture to get the layer of the array texture
if (enableTextureMaps && (polygon->TexParam >> 26) & 0x7)
{
Texcache.GetTexture(gpu, polygon->TexParam, polygon->TexPalette, variant.Texture, prevTexLayer, textureLastVariant);
bool wrapS = (polygon->TexParam >> 16) & 1;
bool wrapT = (polygon->TexParam >> 17) & 1;
bool mirrorS = (polygon->TexParam >> 18) & 1;
bool mirrorT = (polygon->TexParam >> 19) & 1;
variant.Sampler = Samplers[(wrapS ? (mirrorS ? 2 : 1) : 0) + (wrapT ? (mirrorT ? 2 : 1) : 0) * 3];
if (*textureLastVariant < numVariants && variants[*textureLastVariant] == variant)
{
foundVariant = true;
prevVariant = *textureLastVariant;
}
}
if (!foundVariant)
{
for (int j = numVariants - 1; j >= 0; j--)
{
if (variants[j] == variant)
{
foundVariant = true;
prevVariant = j;
goto foundVariant;
}
}
prevVariant = numVariants;
variants[numVariants] = variant;
variants[numVariants].Width = TextureWidth(polygon->TexParam);
variants[numVariants].Height = TextureHeight(polygon->TexParam);
numVariants++;
assert(numVariants <= MaxVariants);
foundVariant:;
if (textureLastVariant)
*textureLastVariant = prevVariant;
}
}
RenderPolygons[i].Variant = prevVariant;
RenderPolygons[i].TextureLayer = (float)prevTexLayer;
if (polygon->FacingView)
{
nextVL = curVL + 1;
if (nextVL >= nverts) nextVL = 0;
nextVR = curVR - 1;
if ((s32)nextVR < 0) nextVR = nverts - 1;
}
else
{
nextVL = curVL - 1;
if ((s32)nextVL < 0) nextVL = nverts - 1;
nextVR = curVR + 1;
if (nextVR >= nverts) nextVR = 0;
}
s32 scaledPositions[10][2];
s32 ytop = ScreenHeight, ybot = 0;
for (int i = 0; i < polygon->NumVertices; i++)
{
if (HiresCoordinates)
{
scaledPositions[i][0] = (polygon->Vertices[i]->HiresPosition[0] * ScaleFactor) >> 4;
scaledPositions[i][1] = (polygon->Vertices[i]->HiresPosition[1] * ScaleFactor) >> 4;
}
else
{
scaledPositions[i][0] = polygon->Vertices[i]->FinalPosition[0] * ScaleFactor;
scaledPositions[i][1] = polygon->Vertices[i]->FinalPosition[1] * ScaleFactor;
}
ytop = std::min(scaledPositions[i][1], ytop);
ybot = std::max(scaledPositions[i][1], ybot);
}
RenderPolygons[i].YTop = ytop;
RenderPolygons[i].YBot = ybot;
RenderPolygons[i].XMin = ScreenWidth;
RenderPolygons[i].XMax = 0;
if (ybot == ytop)
{
vtop = 0; vbot = 0;
RenderPolygons[i].YBot++;
int j = 1;
if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
j = nverts - 1;
if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
assert(numYSpans < MaxYSpanSetups);
u32 curSpanL = numYSpans;
SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vtop, 0, scaledPositions);
assert(numYSpans < MaxYSpanSetups);
u32 curSpanR = numYSpans;
SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vbot, 1, scaledPositions);
YSpanIndices[numSetupIndices].PolyIdx = i;
YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
YSpanIndices[numSetupIndices].Y = ytop;
numSetupIndices++;
}
else
{
u32 curSpanL = numYSpans;
assert(numYSpans < MaxYSpanSetups);
SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
u32 curSpanR = numYSpans;
assert(numYSpans < MaxYSpanSetups);
SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);
for (u32 y = ytop; y < ybot; y++)
{
if (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
{
while (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
{
curVL = nextVL;
if (polygon->FacingView)
{
nextVL = curVL + 1;
if (nextVL >= nverts)
nextVL = 0;
}
else
{
nextVL = curVL - 1;
if ((s32)nextVL < 0)
nextVL = nverts - 1;
}
}
assert(numYSpans < MaxYSpanSetups);
curSpanL = numYSpans;
SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
}
if (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
{
while (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
{
curVR = nextVR;
if (polygon->FacingView)
{
nextVR = curVR - 1;
if ((s32)nextVR < 0)
nextVR = nverts - 1;
}
else
{
nextVR = curVR + 1;
if (nextVR >= nverts)
nextVR = 0;
}
}
assert(numYSpans < MaxYSpanSetups);
curSpanR = numYSpans;
SetupYSpan(&RenderPolygons[i] ,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);
}
YSpanIndices[numSetupIndices].PolyIdx = i;
YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
YSpanIndices[numSetupIndices].Y = y;
numSetupIndices++;
}
}
//printf("polygon min max %d %d | %d %d\n", RenderPolygons[i].XMin, RenderPolygons[i].XMinY, RenderPolygons[i].XMax, RenderPolygons[i].XMaxY);
}
/*for (u32 i = 0; i < RenderNumPolygons; i++)
{
if (RenderPolygons[i].Variant >= numVariants)
{
printf("blarb2 %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
}
//assert(RenderPolygons[i].Variant < numVariants);
}*/
if (numYSpans > 0)
{
glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups);
glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices.data());
glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory);
glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, gpu.GPU3D.RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons);
// we haven't accessed image data yet, so we don't need to invalidate anything
}
//printf("found via %d %d %d of %d\n", foundviatexcache, foundviaprev, numslow, RenderNumPolygons);
// bind everything
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, RenderPolygonMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, XSpanSetupMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, YSpanSetupMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, FinalTileMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, BinResultMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 7, WorkDescMemory);
MetaUniform meta;
meta.DispCnt = gpu.GPU3D.RenderDispCnt;
meta.NumPolygons = gpu.GPU3D.RenderNumPolygons;
meta.NumVariants = numVariants;
meta.AlphaRef = gpu.GPU3D.RenderAlphaRef;
{
u32 r = (gpu.GPU3D.RenderClearAttr1 << 1) & 0x3E; if (r) r++;
u32 g = (gpu.GPU3D.RenderClearAttr1 >> 4) & 0x3E; if (g) g++;
u32 b = (gpu.GPU3D.RenderClearAttr1 >> 9) & 0x3E; if (b) b++;
u32 a = (gpu.GPU3D.RenderClearAttr1 >> 16) & 0x1F;
meta.ClearColor = r | (g << 8) | (b << 16) | (a << 24);
meta.ClearDepth = ((gpu.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF;
meta.ClearAttr = gpu.GPU3D.RenderClearAttr1 & 0x3F008000;
}
for (u32 i = 0; i < 32; i++)
{
u32 color = gpu.GPU3D.RenderToonTable[i];
u32 r = (color << 1) & 0x3E;
u32 g = (color >> 4) & 0x3E;
u32 b = (color >> 9) & 0x3E;
if (r) r++;
if (g) g++;
if (b) b++;
meta.ToonTable[i*4+0] = r | (g << 8) | (b << 16);
}
for (u32 i = 0; i < 34; i++)
{
meta.ToonTable[i*4+1] = gpu.GPU3D.RenderFogDensityTable[i];
}
for (u32 i = 0; i < 8; i++)
{
u32 color = gpu.GPU3D.RenderEdgeTable[i];
u32 r = (color << 1) & 0x3E;
u32 g = (color >> 4) & 0x3E;
u32 b = (color >> 9) & 0x3E;
if (r) r++;
if (g) g++;
if (b) b++;
meta.ToonTable[i*4+2] = r | (g << 8) | (b << 16);
}
meta.FogOffset = gpu.GPU3D.RenderFogOffset;
meta.FogShift = gpu.GPU3D.RenderFogShift;
{
u32 fogR = (gpu.GPU3D.RenderFogColor << 1) & 0x3E; if (fogR) fogR++;
u32 fogG = (gpu.GPU3D.RenderFogColor >> 4) & 0x3E; if (fogG) fogG++;
u32 fogB = (gpu.GPU3D.RenderFogColor >> 9) & 0x3E; if (fogB) fogB++;
u32 fogA = (gpu.GPU3D.RenderFogColor >> 16) & 0x1F;
meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24);
}
glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory);
glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta);
glBindBufferBase(GL_UNIFORM_BUFFER, 0, MetaUniformMemory);
glUseProgram(ShaderClearCoarseBinMask);
glDispatchCompute(TilesPerLine*TileLines/32, 1, 1);
bool wbuffer = false;
if (numYSpans > 0)
{
wbuffer = gpu.GPU3D.RenderPolygonRAM[0]->WBuffer;
glUseProgram(ShaderClearIndirectWorkCount);
glDispatchCompute((numVariants+31)/32, 1, 1);
// calculate x-spans
glBindImageTexture(0, YSpanIndicesTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16UI);
glUseProgram(ShaderInterpXSpans[wbuffer]);
glDispatchCompute((numSetupIndices + 31) / 32, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
// bin polygons
glUseProgram(ShaderBinCombined);
glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
// calculate list offsets
glUseProgram(ShaderCalculateWorkListOffset);
glDispatchCompute((numVariants + 31) / 32, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
// sort shader work
glUseProgram(ShaderSortWork);
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount));
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
glActiveTexture(GL_TEXTURE0);
for (int i = 0; i < tilememoryLayer_Num; i++)
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2+i, TileMemory[i]);
// rasterise
{
bool highLightMode = gpu.GPU3D.RenderDispCnt & (1<<1);
GLuint shadersNoTexture[] =
{
ShaderRasteriseNoTexture[wbuffer],
ShaderRasteriseNoTexture[wbuffer],
highLightMode
? ShaderRasteriseNoTextureHighlight[wbuffer]
: ShaderRasteriseNoTextureToon[wbuffer],
ShaderRasteriseNoTexture[wbuffer],
ShaderRasteriseShadowMask[wbuffer]
};
GLuint shadersUseTexture[] =
{
ShaderRasteriseUseTextureModulate[wbuffer],
ShaderRasteriseUseTextureDecal[wbuffer],
highLightMode
? ShaderRasteriseUseTextureHighlight[wbuffer]
: ShaderRasteriseUseTextureToon[wbuffer],
ShaderRasteriseUseTextureDecal[wbuffer],
ShaderRasteriseShadowMask[wbuffer]
};
GLuint prevShader = 0;
s32 prevTexture = 0, prevSampler = 0;
for (int i = 0; i < numVariants; i++)
{
GLuint shader = 0;
if (variants[i].Texture == 0)
{
shader = shadersNoTexture[variants[i].BlendMode];
}
else
{
shader = shadersUseTexture[variants[i].BlendMode];
if (variants[i].Texture != prevTexture)
{
glBindTexture(GL_TEXTURE_2D_ARRAY, variants[i].Texture);
prevTexture = variants[i].Texture;
}
if (variants[i].Sampler != prevSampler)
{
glBindSampler(0, variants[i].Sampler);
prevSampler = variants[i].Sampler;
}
}
assert(shader != 0);
if (shader != prevShader)
{
glUseProgram(shader);
prevShader = shader;
}
glUniform1ui(UniformIdxCurVariant, i);
glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height);
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
glDispatchComputeIndirect(offsetof(BinResultHeader, VariantWorkCount) + i*4*4);
}
}
}
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// compose final image
glUseProgram(ShaderDepthBlend[wbuffer]);
glDispatchCompute(ScreenWidth/TileSize, ScreenHeight/TileSize, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8);
glBindImageTexture(1, LowResFramebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI);
u32 finalPassShader = 0;
if (gpu.GPU3D.RenderDispCnt & (1<<4))
finalPassShader |= 0x4;
if (gpu.GPU3D.RenderDispCnt & (1<<7))
finalPassShader |= 0x2;
if (gpu.GPU3D.RenderDispCnt & (1<<5))
finalPassShader |= 0x1;
glUseProgram(ShaderFinalPass[finalPassShader]);
glDispatchCompute(ScreenWidth/32, ScreenHeight, 1);
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
glBindSampler(0, 0);
/*u64 starttime = armGetSystemTick();
EmuQueue.waitIdle();
printf("total time %f\n", armTicksToNs(armGetSystemTick()-starttime)*0.000001f);*/
/*for (u32 i = 0; i < RenderNumPolygons; i++)
{
if (RenderPolygons[i].Variant >= numVariants)
{
printf("blarb %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
}
//assert(RenderPolygons[i].Variant < numVariants);
}*/
/*for (int i = 0; i < binresult->SortWorkWorkCount[0]*32; i++)
{
printf("sorted %x %x\n", binresult->SortedWork[i*2+0], binresult->SortedWork[i*2+1]);
}*/
/* if (polygonvisible != -1)
{
SpanSetupX* xspans = Gfx::DataHeap->CpuAddr<SpanSetupX>(XSpanSetupMemory);
printf("span result\n");
Polygon* poly = RenderPolygonRAM[polygonvisible];
u32 xspanoffset = RenderPolygons[polygonvisible].FirstXSpan;
for (u32 i = 0; i < (poly->YBottom - poly->YTop); i++)
{
printf("%d: %d - %d | %d %d | %d %d\n", i + poly->YTop, xspans[xspanoffset + i].X0, xspans[xspanoffset + i].X1, xspans[xspanoffset + i].__pad0, xspans[xspanoffset + i].__pad1, RenderPolygons[polygonvisible].YTop, RenderPolygons[polygonvisible].YBot);
}
}*/
/*
printf("xspans: %d\n", numSetupIndices);
SpanSetupX* xspans = Gfx::DataHeap->CpuAddr<SpanSetupX>(XSpanSetupMemory[curSlice]);
for (int i = 0; i < numSetupIndices; i++)
{
printf("poly %d %d %d | line %d | %d to %d\n", YSpanIndices[i].PolyIdx, YSpanIndices[i].SpanIdxL, YSpanIndices[i].SpanIdxR, YSpanIndices[i].Y, xspans[i].X0, xspans[i].X1);
}
printf("bin result\n");
BinResult* binresult = Gfx::DataHeap->CpuAddr<BinResult>(BinResultMemory);
for (u32 y = 0; y < 192/8; y++)
{
for (u32 x = 0; x < 256/8; x++)
{
printf("%08x ", binresult->BinnedMaskCoarse[(x + y * (256/8)) * 2]);
}
printf("\n");
}*/
}
void ComputeRenderer::RestartFrame(GPU& gpu)
{
}
u32* ComputeRenderer::GetLine(int line)
{
int stride = 256;
if (line == 0)
{
glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
u8* data = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
if (data) memcpy(&FramebufferCPU[0], data, 4*stride*192);
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
}
return &FramebufferCPU[stride * line];
}
void ComputeRenderer::SetupAccelFrame()
{
glBindTexture(GL_TEXTURE_2D, Framebuffer);
}
void ComputeRenderer::PrepareCaptureFrame()
{
glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
glBindTexture(GL_TEXTURE_2D, LowResFramebuffer);
glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, nullptr);
}
void ComputeRenderer::BindOutputTexture(int buffer)
{
CurGLCompositor.BindOutputTexture(buffer);
}
void ComputeRenderer::Blit(const GPU &gpu)
{
CurGLCompositor.RenderFrame(gpu, *this);
}
void ComputeRenderer::Stop(const GPU &gpu)
{
CurGLCompositor.Stop(gpu);
}
}