From aafd20fb3cfa52b350676caee14c0afde115a6e6 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 13 Jun 2021 20:37:13 +1000 Subject: [PATCH] GPU/ShaderGen: Move some calculations from fragment to vertex shader --- src/core/gpu_hw_d3d11.cpp | 12 +++-- src/core/gpu_hw_d3d11.h | 2 +- src/core/gpu_hw_opengl.cpp | 3 +- src/core/gpu_hw_shadergen.cpp | 96 ++++++++++++++++++++--------------- src/core/gpu_hw_shadergen.h | 2 +- src/core/gpu_hw_vulkan.cpp | 10 ++-- 6 files changed, 71 insertions(+), 54 deletions(-) diff --git a/src/core/gpu_hw_d3d11.cpp b/src/core/gpu_hw_d3d11.cpp index 09ba7e78b..06f7f3e6c 100644 --- a/src/core/gpu_hw_d3d11.cpp +++ b/src/core/gpu_hw_d3d11.cpp @@ -537,7 +537,7 @@ bool GPU_HW_D3D11::CompileShaders() // we need a vertex shader... ComPtr vs_bytecode = - shader_cache.GetShaderBlob(D3D11::ShaderCompiler::Type::Vertex, shadergen.GenerateBatchVertexShader(true)); + shader_cache.GetShaderBlob(D3D11::ShaderCompiler::Type::Vertex, shadergen.GenerateBatchVertexShader(true, true)); if (!vs_bytecode) return false; @@ -562,9 +562,9 @@ bool GPU_HW_D3D11::CompileShaders() UPDATE_PROGRESS(); - for (u8 textured = 0; textured < 2; textured++) + for (u8 textured = 0; textured < 3; textured++) { - const std::string vs = shadergen.GenerateBatchVertexShader(ConvertToBoolUnchecked(textured)); + const std::string vs = shadergen.GenerateBatchVertexShader(textured != 0, textured > 1); m_batch_vertex_shaders[textured] = shader_cache.GetVertexShader(m_device.Get(), vs); if (!m_batch_vertex_shaders[textured]) return false; @@ -807,9 +807,11 @@ bool GPU_HW_D3D11::BlitVRAMReplacementTexture(const TextureReplacementTexture* t void GPU_HW_D3D11::DrawBatchVertices(BatchRenderMode render_mode, u32 base_vertex, u32 num_vertices) { - const bool textured = (m_batch.texture_mode != GPUTextureMode::Disabled); + const u8 textured = + BoolToUInt8(m_batch.texture_mode != GPUTextureMode::Disabled) + + BoolToUInt8((m_batch.texture_mode & ~GPUTextureMode::RawTextureBit) <= GPUTextureMode::Palette8Bit); - m_context->VSSetShader(m_batch_vertex_shaders[BoolToUInt8(textured)].Get(), nullptr, 0); + m_context->VSSetShader(m_batch_vertex_shaders[textured].Get(), nullptr, 0); m_context->PSSetShader(m_batch_pixel_shaders[static_cast(render_mode)][static_cast(m_batch.texture_mode)] [BoolToUInt8(m_batch.dithering)][BoolToUInt8(m_batch.interlacing)] diff --git a/src/core/gpu_hw_d3d11.h b/src/core/gpu_hw_d3d11.h index eec2867b2..8595eff5d 100644 --- a/src/core/gpu_hw_d3d11.h +++ b/src/core/gpu_hw_d3d11.h @@ -116,7 +116,7 @@ private: std::array, 5> m_batch_blend_states; // [transparency_mode] ComPtr m_batch_input_layout; - std::array, 2> m_batch_vertex_shaders; // [textured] + std::array, 3> m_batch_vertex_shaders; // [textured/palette] std::array, 2>, 2>, 9>, 4> m_batch_pixel_shaders; // [render_mode][texture_mode][dithering][interlacing] diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp index c4350a875..16810f494 100644 --- a/src/core/gpu_hw_opengl.cpp +++ b/src/core/gpu_hw_opengl.cpp @@ -544,7 +544,8 @@ bool GPU_HW_OpenGL::CompilePrograms() for (u8 interlacing = 0; interlacing < 2; interlacing++) { const bool textured = (static_cast(texture_mode) != GPUTextureMode::Disabled); - const std::string batch_vs = shadergen.GenerateBatchVertexShader(textured); + const bool paletted = textured && (static_cast(texture_mode & 3u) <= GPUTextureMode::Palette8Bit); + const std::string batch_vs = shadergen.GenerateBatchVertexShader(textured, paletted); const std::string fs = shadergen.GenerateBatchFragmentShader( static_cast(render_mode), static_cast(texture_mode), ConvertToBoolUnchecked(dithering), ConvertToBoolUnchecked(interlacing)); diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp index 83e329ec5..afe1d5648 100644 --- a/src/core/gpu_hw_shadergen.cpp +++ b/src/core/gpu_hw_shadergen.cpp @@ -23,6 +23,8 @@ void GPU_HW_ShaderGen::WriteCommonFunctions(std::stringstream& ss) ss << "CONSTANT uint RESOLUTION_SCALE = " << m_resolution_scale << "u;\n"; ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n"; ss << "CONSTANT float2 RCP_VRAM_SIZE = float2(1.0, 1.0) / float2(VRAM_SIZE);\n"; + ss << "CONSTANT uint2 NATIVE_VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ");\n"; + ss << "CONSTANT float2 RCP_NATIVE_VRAM_SIZE = float2(1.0, 1.0) / float2(NATIVE_VRAM_SIZE);\n"; ss << "CONSTANT uint MULTISAMPLES = " << m_multisamples << "u;\n"; ss << "CONSTANT bool PER_SAMPLE_SHADING = " << (m_per_sample_shading ? "true" : "false") << ";\n"; ss << R"( @@ -45,6 +47,15 @@ uint fixYCoord(uint y) #endif } +uint fixNativeYCoord(uint y) +{ +#if API_OPENGL || API_OPENGL_ES + return NATIVE_VRAM_SIZE.y - y - 1u; +#else + return y; +#endif +} + uint RGBA8ToRGBA5551(float4 v) { uint r = uint(roundEven(v.r * 31.0)); @@ -75,11 +86,12 @@ void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss) false); } -std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured) +std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool palette) { std::stringstream ss; WriteHeader(ss); DefineMacro(ss, "TEXTURED", textured); + DefineMacro(ss, "PALETTE", palette); DefineMacro(ss, "UV_LIMITS", m_uv_limits); DefineMacro(ss, "PGXP_DEPTH", m_pgxp_depth); @@ -159,21 +171,41 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured) v_pos = float4(pos_x * pos_w, pos_y * pos_w, pos_z * pos_w, pos_w); - v_col0 = a_col0; + v_col0 = a_col0 * float4(255.0, 255.0, 255.0, 255.0); #if TEXTURED - v_tex0 = float2(float((a_texcoord & 0xFFFFu) * RESOLUTION_SCALE), - float((a_texcoord >> 16) * RESOLUTION_SCALE)); + #if PALETTE + // We can't currently use upscaled coordinate for palettes because of how they're packed. + // Not that it would be any benefit anyway, render-to-texture effects don't use palettes. + v_tex0 = float2(float(a_texcoord & 0xFFFFu), float((a_texcoord >> 16))); - // base_x,base_y,palette_x,palette_y - v_texpage.x = (a_texpage & 15u) * 64u * RESOLUTION_SCALE; - v_texpage.y = ((a_texpage >> 4) & 1u) * 256u * RESOLUTION_SCALE; - v_texpage.z = ((a_texpage >> 16) & 63u) * 16u * RESOLUTION_SCALE; - v_texpage.w = ((a_texpage >> 22) & 511u) * RESOLUTION_SCALE; + // base_x,base_y,palette_x,palette_y + v_texpage.x = (a_texpage & 15u) * 64u; + v_texpage.y = ((a_texpage >> 4) & 1u) * 256u; + v_texpage.z = ((a_texpage >> 16) & 63u) * 16u; + v_texpage.w = ((a_texpage >> 22) & 511u); - #if UV_LIMITS - v_uv_limits = a_uv_limits * float4(255.0, 255.0, 255.0, 255.0); - #endif - #endif + #if UV_LIMITS + v_uv_limits = a_uv_limits * float4(255.0, 255.0, 255.0, 255.0); + #endif + #else + v_tex0 = float2(float((a_texcoord & 0xFFFFu) * RESOLUTION_SCALE), + float((a_texcoord >> 16) * RESOLUTION_SCALE)); + + // base_x,base_y,palette_x,palette_y + v_texpage.x = (a_texpage & 15u) * 64u * RESOLUTION_SCALE; + v_texpage.y = ((a_texpage >> 4) & 1u) * 256u * RESOLUTION_SCALE; + v_texpage.z = ((a_texpage >> 16) & 63u) * 16u * RESOLUTION_SCALE; + v_texpage.w = ((a_texpage >> 22) & 511u) * RESOLUTION_SCALE; + + #if UV_LIMITS + // Extend the UV range to all "upscaled" pixels. This means 1-pixel-high polygon-based + // framebuffer effects won't be downsampled. (e.g. Mega Man Legends 2 haze effect) + v_uv_limits = a_uv_limits * float4(255.0, 255.0, 255.0, 255.0); + v_uv_limits.xy *= float(RESOLUTION_SCALE); + v_uv_limits.zw = (v_uv_limits.zw * float(RESOLUTION_SCALE + 1u)) - float(RESOLUTION_SCALE - 1u); + #endif + #endif // PALETTE + #endif // TEXTURED } )"; @@ -767,11 +799,9 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords) index_coord.x /= 2u; #endif - // fixup coords - uint2 vicoord = uint2(texpage.x + index_coord.x * RESOLUTION_SCALE, fixYCoord(texpage.y + index_coord.y * RESOLUTION_SCALE)); - - // load colour/palette - float4 texel = SAMPLE_TEXTURE(samp0, float2(vicoord) * RCP_VRAM_SIZE); + // load palette index + uint2 vicoord = uint2(texpage.x + index_coord.x, fixNativeYCoord(texpage.y + index_coord.y)); + float4 texel = SAMPLE_TEXTURE(samp0, float2(vicoord) * RCP_NATIVE_VRAM_SIZE); uint vram_value = RGBA8ToRGBA5551(texel); // apply palette @@ -784,8 +814,8 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords) #endif // sample palette - uint2 palette_icoord = uint2(texpage.z + (palette_index * RESOLUTION_SCALE), fixYCoord(texpage.w)); - return SAMPLE_TEXTURE(samp0, float2(palette_icoord) * RCP_VRAM_SIZE); + uint2 palette_icoord = uint2(texpage.z + palette_index, fixNativeYCoord(texpage.w)); + return SAMPLE_TEXTURE(samp0, float2(palette_icoord) * RCP_NATIVE_VRAM_SIZE); #else // Direct texturing. Render-to-texture effects. Use upscaled coordinates. uint2 icoord = ApplyUpscaledTextureWindow(FloatToIntegerCoords(coords)); @@ -822,7 +852,7 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords) ss << R"( { - uint3 vertcol = uint3(v_col0.rgb * float3(255.0, 255.0, 255.0)); + uint3 vertcol = uint3(v_col0.rgb); bool semitransparent; uint3 icolor; @@ -835,34 +865,16 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords) #endif #if TEXTURED - - // We can't currently use upscaled coordinate for palettes because of how they're packed. - // Not that it would be any benefit anyway, render-to-texture effects don't use palettes. - float2 coords = v_tex0; - #if PALETTE - coords /= float2(RESOLUTION_SCALE, RESOLUTION_SCALE); - #endif - - #if UV_LIMITS - float4 uv_limits = v_uv_limits; - #if !PALETTE - // Extend the UV range to all "upscaled" pixels. This means 1-pixel-high polygon-based - // framebuffer effects won't be downsampled. (e.g. Mega Man Legends 2 haze effect) - uv_limits.xy *= float(RESOLUTION_SCALE); - uv_limits.zw = (uv_limits.zw * float(RESOLUTION_SCALE + 1u)) - float(RESOLUTION_SCALE - 1u); - #endif - #endif - float4 texcol; #if TEXTURE_FILTERING - FilteredSampleFromVRAM(v_texpage, coords, uv_limits, texcol, ialpha); + FilteredSampleFromVRAM(v_texpage, v_tex0, v_uv_limits, texcol, ialpha); if (ialpha < 0.5) discard; #else #if UV_LIMITS - texcol = SampleFromVRAM(v_texpage, clamp(coords, uv_limits.xy, uv_limits.zw)); + texcol = SampleFromVRAM(v_texpage, clamp(v_tex0, v_uv_limits.xy, v_uv_limits.zw)); #else - texcol = SampleFromVRAM(v_texpage, coords); + texcol = SampleFromVRAM(v_texpage, v_tex0); #endif if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR)) discard; diff --git a/src/core/gpu_hw_shadergen.h b/src/core/gpu_hw_shadergen.h index dff617e97..3f0291927 100644 --- a/src/core/gpu_hw_shadergen.h +++ b/src/core/gpu_hw_shadergen.h @@ -10,7 +10,7 @@ public: bool pgxp_depth, bool supports_dual_source_blend); ~GPU_HW_ShaderGen(); - std::string GenerateBatchVertexShader(bool textured); + std::string GenerateBatchVertexShader(bool textured, bool palette); std::string GenerateBatchFragmentShader(GPU_HW::BatchRenderMode transparency, GPUTextureMode texture_mode, bool dithering, bool interlacing); std::string GenerateInterlacedFillFragmentShader(); diff --git a/src/core/gpu_hw_vulkan.cpp b/src/core/gpu_hw_vulkan.cpp index 7896a5cce..4b4f4eede 100644 --- a/src/core/gpu_hw_vulkan.cpp +++ b/src/core/gpu_hw_vulkan.cpp @@ -859,16 +859,16 @@ bool GPU_HW_Vulkan::CompilePipelines() // vertex shaders - [textured] // fragment shaders - [render_mode][texture_mode][dithering][interlacing] - DimensionalArray batch_vertex_shaders{}; + DimensionalArray batch_vertex_shaders{}; DimensionalArray batch_fragment_shaders{}; Common::ScopeGuard batch_shader_guard([&batch_vertex_shaders, &batch_fragment_shaders]() { batch_vertex_shaders.enumerate(Vulkan::Util::SafeDestroyShaderModule); batch_fragment_shaders.enumerate(Vulkan::Util::SafeDestroyShaderModule); }); - for (u8 textured = 0; textured < 2; textured++) + for (u8 textured = 0; textured < 3; textured++) { - const std::string vs = shadergen.GenerateBatchVertexShader(ConvertToBoolUnchecked(textured)); + const std::string vs = shadergen.GenerateBatchVertexShader(textured != 0, textured > 1); VkShaderModule shader = g_vulkan_shader_cache->GetVertexShader(vs); if (shader == VK_NULL_HANDLE) return false; @@ -918,6 +918,8 @@ bool GPU_HW_Vulkan::CompilePipelines() static constexpr std::array depth_test_values = { VK_COMPARE_OP_ALWAYS, VK_COMPARE_OP_GREATER_OR_EQUAL, VK_COMPARE_OP_LESS_OR_EQUAL}; const bool textured = (static_cast(texture_mode) != GPUTextureMode::Disabled); + const bool paletted = + textured && (static_cast(texture_mode & 3u) <= GPUTextureMode::Palette8Bit); gpbuilder.SetPipelineLayout(m_batch_pipeline_layout); gpbuilder.SetRenderPass(m_vram_render_pass, 0); @@ -934,7 +936,7 @@ bool GPU_HW_Vulkan::CompilePipelines() } gpbuilder.SetPrimitiveTopology(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST); - gpbuilder.SetVertexShader(batch_vertex_shaders[BoolToUInt8(textured)]); + gpbuilder.SetVertexShader(batch_vertex_shaders[BoolToUInt8(textured) + BoolToUInt8(paletted)]); gpbuilder.SetFragmentShader(batch_fragment_shaders[render_mode][texture_mode][dithering][interlacing]); gpbuilder.SetRasterizationState(VK_POLYGON_MODE_FILL, VK_CULL_MODE_NONE, VK_FRONT_FACE_CLOCKWISE);