From 5997ec6668258fd933113b4fa4893e2474ade90d Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 16 Sep 2018 17:14:21 +0300 Subject: [PATCH] [D3D12] 64bpp resolve and clear - GTA IV ingame --- src/xenia/gpu/d3d12/render_target_cache.cc | 18 ++++++++- src/xenia/gpu/d3d12/render_target_cache.h | 1 + .../d3d12/shaders/edram_clear_64bpp.cs.hlsl | 25 +++++++++++++ .../d3d12/shaders/texture_tile_32bpp.cs.hlsl | 2 +- .../d3d12/shaders/texture_tile_64bpp.cs.hlsl | 37 +++++++++++++++++++ src/xenia/gpu/d3d12/texture_cache.cc | 12 +++--- src/xenia/gpu/d3d12/texture_cache.h | 1 + 7 files changed, 88 insertions(+), 8 deletions(-) create mode 100644 src/xenia/gpu/d3d12/shaders/edram_clear_64bpp.cs.hlsl create mode 100644 src/xenia/gpu/d3d12/shaders/texture_tile_64bpp.cs.hlsl diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 8d36e1854..b77cd9e68 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -29,6 +29,7 @@ namespace d3d12 { // Generated with `xb buildhlsl`. #include "xenia/gpu/d3d12/shaders/bin/edram_clear_32bpp_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_clear_64bpp_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_clear_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_load_color_32bpp_cs.h" #include "xenia/gpu/d3d12/shaders/bin/edram_load_color_64bpp_cs.h" @@ -194,6 +195,16 @@ bool RenderTargetCache::Initialize() { return false; } edram_clear_32bpp_pipeline_->SetName(L"EDRAM Clear 32bpp"); + // Clear 64-bit color. + edram_clear_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( + device, edram_clear_64bpp_cs, sizeof(edram_clear_64bpp_cs), + edram_clear_root_signature_); + if (edram_clear_64bpp_pipeline_ == nullptr) { + XELOGE("Failed to create the EDRAM 64bpp clear pipeline"); + Shutdown(); + return false; + } + edram_clear_64bpp_pipeline_->SetName(L"EDRAM Clear 64bpp"); // Clear float depth. edram_clear_depth_float_pipeline_ = ui::d3d12::util::CreateComputePipeline( device, edram_clear_depth_float_cs, sizeof(edram_clear_depth_float_cs), @@ -272,6 +283,7 @@ void RenderTargetCache::Shutdown() { ui::d3d12::util::ReleaseAndNull(edram_tile_sample_64bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_tile_sample_32bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_clear_depth_float_pipeline_); + ui::d3d12::util::ReleaseAndNull(edram_clear_64bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_clear_32bpp_pipeline_); for (uint32_t i = 0; i < uint32_t(EDRAMLoadStoreMode::kCount); ++i) { ui::d3d12::util::ReleaseAndNull(edram_store_pipelines_[i]); @@ -1473,8 +1485,10 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base, } command_processor_->SetComputePipeline(edram_clear_depth_float_pipeline_); } else if (is_64bpp) { - // TODO(Triang3l): 64bpp color clear. - return false; + // TODO(Triang3l): Check which 32-bit portion is in which register. + root_constants.clear_color_high = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; + root_constants.clear_color_low = regs[XE_GPU_REG_RB_COLOR_CLEAR_LOW].u32; + command_processor_->SetComputePipeline(edram_clear_64bpp_pipeline_); } else { Register reg = is_depth ? XE_GPU_REG_RB_DEPTH_CLEAR : XE_GPU_REG_RB_COLOR_CLEAR; diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index 07570fd28..bda75f991 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -483,6 +483,7 @@ class RenderTargetCache { ID3D12PipelineState* edram_tile_sample_32bpp_pipeline_ = nullptr; ID3D12PipelineState* edram_tile_sample_64bpp_pipeline_ = nullptr; ID3D12PipelineState* edram_clear_32bpp_pipeline_ = nullptr; + ID3D12PipelineState* edram_clear_64bpp_pipeline_ = nullptr; ID3D12PipelineState* edram_clear_depth_float_pipeline_ = nullptr; // 48 MB heaps backing used render targets resources, created when needed. diff --git a/src/xenia/gpu/d3d12/shaders/edram_clear_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_clear_64bpp.cs.hlsl new file mode 100644 index 000000000..b0447388d --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_clear_64bpp.cs.hlsl @@ -0,0 +1,25 @@ +#define XE_EDRAM_WRITE_ONLY +#include "edram_load_store.hlsli" + +// Load4/Store4 aren't needed here, but 80x16 threads is over the limit. +[numthreads(40, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + uint4 clear_rect; + clear_rect.xz = xe_edram_clear_rect & 0xFFFFu; + clear_rect.yw = xe_edram_clear_rect >> 16u; + uint2 sample_index = xe_thread_id.xy; + sample_index.x *= 2u; + [branch] if (any(sample_index < clear_rect.xy) || + any(sample_index >= clear_rect.zw)) { + return; + } + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 2u; + uint edram_offset = XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index); + xe_edram_load_store_dest.Store2(edram_offset, xe_edram_clear_color64); + if (sample_index.x + 1u < clear_rect.z) { + xe_edram_load_store_dest.Store2(edram_offset + 8u, xe_edram_clear_color64); + } +} diff --git a/src/xenia/gpu/d3d12/shaders/texture_tile_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_tile_32bpp.cs.hlsl index 890746dbc..1c7b2cd9b 100644 --- a/src/xenia/gpu/d3d12/shaders/texture_tile_32bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/texture_tile_32bpp.cs.hlsl @@ -14,7 +14,7 @@ void main(uint3 xe_thread_id : SV_DispatchThreadID) { uint4 texels = xe_texture_tile_source.Load4( xe_texture_tile_host_base + texel_index.y * xe_texture_tile_host_pitch + texel_index.x * 4u); - texels = XeByteSwap(texels, xe_texture_tile_endian_guest_pitch & 7u); + texels = XeByteSwap(texels, xe_texture_tile_endian_guest_pitch); uint4 texel_addresses = xe_texture_tile_guest_base + XeTextureTiledOffset2D( texel_index, xe_texture_tile_endian_guest_pitch >> 3u, 2u); xe_texture_tile_dest.Store(texel_addresses.x, texels.x); diff --git a/src/xenia/gpu/d3d12/shaders/texture_tile_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_tile_64bpp.cs.hlsl new file mode 100644 index 000000000..e0bd56c4c --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/texture_tile_64bpp.cs.hlsl @@ -0,0 +1,37 @@ +#include "texture_tile.hlsli" + +RWByteAddressBuffer xe_texture_tile_dest : register(u0); + +[numthreads(8, 32, 1)] +void main(uint3 xe_thread_id : SV_DispatchThreadID) { + // 1 thread = 4 texels. + uint2 texture_size = (xe_texture_tile_size >> uint2(0u, 16u)) & 0xFFFFu; + uint2 texel_index = xe_thread_id.xy; + texel_index.x <<= 2u; + [branch] if (any(texel_index >= texture_size)) { + return; + } + + uint4 texel_addresses = xe_texture_tile_guest_base + XeTextureTiledOffset2D( + texel_index, xe_texture_tile_endian_guest_pitch >> 3u, 3u); + bool3 texels_inside = uint3(1u, 2u, 3u) + texel_index.x < texture_size.x; + + uint texels_source_offset = xe_texture_tile_host_base + texel_index.y * + xe_texture_tile_host_pitch + texel_index.x * 8u; + uint4 texels = XeByteSwap64( + xe_texture_tile_source.Load4(texels_source_offset), + xe_texture_tile_endian_guest_pitch); + xe_texture_tile_dest.Store2(texel_addresses.x, texels.xy); + [branch] if (texels_inside.x) { + xe_texture_tile_dest.Store2(texel_addresses.y, texels.zw); + [branch] if (texels_inside.y) { + texels = XeByteSwap64( + xe_texture_tile_source.Load4(texels_source_offset + 16u), + xe_texture_tile_endian_guest_pitch); + xe_texture_tile_dest.Store2(texel_addresses.z, texels.xy); + [branch] if (texels_inside.z) { + xe_texture_tile_dest.Store2(texel_addresses.w, texels.zw); + } + } + } +} diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 07a00156e..f83425f61 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -36,6 +36,7 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/bin/texture_load_depth_unorm_cs.h" #include "xenia/gpu/d3d12/shaders/bin/texture_load_dxt3a_cs.h" #include "xenia/gpu/d3d12/shaders/bin/texture_tile_32bpp_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/texture_tile_64bpp_cs.h" const TextureCache::HostFormat TextureCache::host_formats_[64] = { // k_1_REVERSE @@ -81,7 +82,7 @@ const TextureCache::HostFormat TextureCache::host_formats_[64] = { // k_DXT4_5 {DXGI_FORMAT_BC3_UNORM, LoadMode::k128bpb, TileMode::kUnknown}, // k_16_16_16_16_EDRAM - {DXGI_FORMAT_R16G16B16A16_UNORM, LoadMode::k64bpb, TileMode::kUnknown}, + {DXGI_FORMAT_R16G16B16A16_UNORM, LoadMode::k64bpb, TileMode::k64bpp}, // R32_FLOAT for depth because shaders would require an additional SRV to // sample stencil, which we don't provide. // k_24_8 @@ -93,19 +94,19 @@ const TextureCache::HostFormat TextureCache::host_formats_[64] = { // k_16_16 {DXGI_FORMAT_R16G16_UNORM, LoadMode::k32bpb, TileMode::k32bpp}, // k_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_UNORM, LoadMode::k64bpb, TileMode::kUnknown}, + {DXGI_FORMAT_R16G16B16A16_UNORM, LoadMode::k64bpb, TileMode::k64bpp}, // k_16_EXPAND {DXGI_FORMAT_R16_FLOAT, LoadMode::k16bpb, TileMode::kUnknown}, // k_16_16_EXPAND {DXGI_FORMAT_R16G16_FLOAT, LoadMode::k32bpb, TileMode::k32bpp}, // k_16_16_16_16_EXPAND - {DXGI_FORMAT_R16G16B16A16_FLOAT, LoadMode::k64bpb, TileMode::kUnknown}, + {DXGI_FORMAT_R16G16B16A16_FLOAT, LoadMode::k64bpb, TileMode::k64bpp}, // k_16_FLOAT {DXGI_FORMAT_R16_FLOAT, LoadMode::k16bpb, TileMode::kUnknown}, // k_16_16_FLOAT {DXGI_FORMAT_R16G16_FLOAT, LoadMode::k32bpb, TileMode::k32bpp}, // k_16_16_16_16_FLOAT - {DXGI_FORMAT_R16G16B16A16_FLOAT, LoadMode::k64bpb, TileMode::kUnknown}, + {DXGI_FORMAT_R16G16B16A16_FLOAT, LoadMode::k64bpb, TileMode::k64bpp}, // k_32 {DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, TileMode::kUnknown}, // k_32_32 @@ -115,7 +116,7 @@ const TextureCache::HostFormat TextureCache::host_formats_[64] = { // k_32_FLOAT {DXGI_FORMAT_R32_FLOAT, LoadMode::k32bpb, TileMode::k32bpp}, // k_32_32_FLOAT - {DXGI_FORMAT_R32G32_FLOAT, LoadMode::k64bpb, TileMode::kUnknown}, + {DXGI_FORMAT_R32G32_FLOAT, LoadMode::k64bpb, TileMode::k64bpp}, // k_32_32_32_32_FLOAT {DXGI_FORMAT_R32G32B32A32_FLOAT, LoadMode::k128bpb, TileMode::kUnknown}, // k_32_AS_8 @@ -187,6 +188,7 @@ const TextureCache::LoadModeInfo TextureCache::load_mode_info_[] = { const TextureCache::TileModeInfo TextureCache::tile_mode_info_[] = { {texture_tile_32bpp_cs, sizeof(texture_tile_32bpp_cs)}, + {texture_tile_64bpp_cs, sizeof(texture_tile_64bpp_cs)}, }; TextureCache::TextureCache(D3D12CommandProcessor* command_processor, diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index fd9497476..9f7eaa16d 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -118,6 +118,7 @@ class TextureCache { // formats that can be resolved to. enum class TileMode { k32bpp, + k64bpp, kCount,