diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index e7c7b0a1c..9b99677e0 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -19,6 +19,7 @@ #include "xenia/base/memory.h" #include "xenia/base/profiling.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" +#include "xenia/gpu/texture_info.h" namespace xe { namespace gpu { @@ -876,7 +877,7 @@ bool RenderTargetCache::Resolve(SharedMemory* shared_memory, Memory* memory) { bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, uint32_t edram_base, uint32_t surface_pitch, MsaaSamples msaa_samples, bool is_depth, - uint32_t format, + uint32_t src_format, const D3D12_RECT& src_rect) { auto& regs = *register_file_; @@ -909,9 +910,77 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, uint32_t src_height = std::min(uint32_t(src_rect.bottom - src_rect.top), dest_height); - XELOGGPU("Copying samples %u to 0x%.8X (%ux%u), info 0x%.8X", - (rb_copy_control >> 4) & 0x7, regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32, - dest_pitch, dest_height, regs[XE_GPU_REG_RB_COPY_DEST_INFO].u32); + // Get format info. + uint32_t dest_info = regs[XE_GPU_REG_RB_COPY_DEST_INFO].u32; + TextureFormat src_texture_format; + if (is_depth) { + src_texture_format = + DepthRenderTargetToTextureFormat(DepthRenderTargetFormat(src_format)); + } else { + src_texture_format = + ColorRenderTargetToTextureFormat(ColorRenderTargetFormat(src_format)); + } + assert_true(src_texture_format != TextureFormat::kUnknown); + src_texture_format = GetBaseFormat(src_texture_format); + TextureFormat dest_format = + GetBaseFormat(TextureFormat((dest_info >> 7) & 0x3F)); + + // Get the destination location. + uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF; + if (dest_address & 0x3) { + assert_always(); + // Not 4-aligning may break UAV access significantly, let's hope games don't + // resolve to 8bpp or 16bpp textures at very odd locations. + return false; + } + int32_t dest_exp_bias = int32_t((dest_info >> 16) << 26) >> 26; + uint32_t dest_swap = (dest_info >> 24) & 0x1; + // TODO(Triang3l): Copy to array slices. + // TODO(Triang3l): Investigate what copy_dest_number is. + XELOGGPU( + "Copying samples %u to 0x%.8X (%ux%u), destination format %s, " + "exponent bias %d, red and blue %sswapped", + (rb_copy_control >> 4) & 0x7, dest_address, dest_pitch, dest_height, + FormatInfo::Get(dest_format)->name, dest_exp_bias, + dest_swap ? "" : "not "); + + // There are 3 paths for resolving in this function - they don't necessarily + // have to map directly to kRaw and kConvert CopyCommands. + // - Raw color - when the source is single-sampled and has the same format as + // the destination, and there's no need to apply exponent bias. A regular + // EDRAM load is done to a buffer, and the buffer is then tiled to the + // shared memory. Because swapping red and blue is very common, this path + // supports swapping. + // - Depth to depth - when the source and the destination formats are + // renderable depth-stencil ones (D24S8 or D24FS8). A single sample is + // taken from the EDRAM buffer, converted between D24 and D24F if needed, + // and tiled directly to the shared memory buffer. + // - Conversion - when a simple copy is not enough. The EDRAM region is loaded + // to a render target resource, which is then used as a texture in a shader + // performing the resolve (by sampling the texture on or between pixels with + // bilinear filtering), applying exponent bias and swapping red and blue in + // a format-agnostic way, then the resulting color is written to a temporary + // RTV of the destination format. This also works for converting depth to + // 16-bit or 32-bit. + if (dest_format == TextureFormat::k_24_8 || + dest_format == TextureFormat::k_24_8_FLOAT) { + // Depth to depth. + XELOGGPU("Resolving to a depth texture"); + if (!is_depth) { + return false; + } + // TODO(Triang3l): Depth to depth. + return false; + } else if (src_texture_format == dest_format && + msaa_samples == MsaaSamples::k1X && dest_exp_bias == 0) { + XELOGGPU("Resolving a single-sampled surface without conversion"); + // TODO(Triang3l): Raw resolve. + return false; + } else { + XELOGGPU("Resolving with a pixel shader"); + // TODO(Triang3l): Conversion. + return false; + } return true; } @@ -1290,10 +1359,12 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { location_dest.PlacedFootprint = render_target->footprints[1]; command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source, nullptr); - root_constants.rt_stencil_offset = + root_constants.rt_stencil_offset_or_swap_red_blue = uint32_t(location_dest.PlacedFootprint.Offset); root_constants.rt_stencil_pitch = location_dest.PlacedFootprint.Footprint.RowPitch; + } else { + root_constants.rt_stencil_offset_or_swap_red_blue = 0; } // Transition the copy buffer to SRV. @@ -1458,10 +1529,12 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM( root_constants.rt_color_depth_pitch = render_target->footprints[0].Footprint.RowPitch; if (render_target->key.is_depth) { - root_constants.rt_stencil_offset = + root_constants.rt_stencil_offset_or_swap_red_blue = uint32_t(render_target->footprints[1].Offset); root_constants.rt_stencil_pitch = render_target->footprints[1].Footprint.RowPitch; + } else { + root_constants.rt_stencil_offset_or_swap_red_blue = 0; } // Validate the height in case the resolve is somehow too large (shouldn't diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index d8d79de51..027da1643 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -338,7 +338,8 @@ class RenderTargetCache { // Performs the copying part of a resolve. bool ResolveCopy(SharedMemory* shared_memory, uint32_t edram_base, uint32_t surface_pitch, MsaaSamples msaa_samples, - bool is_depth, uint32_t format, const D3D12_RECT& src_rect); + bool is_depth, uint32_t src_format, + const D3D12_RECT& src_rect); D3D12CommandProcessor* command_processor_; RegisterFile* register_file_; @@ -354,7 +355,7 @@ class RenderTargetCache { uint32_t base_tiles; uint32_t pitch_tiles; uint32_t rt_color_depth_pitch; - uint32_t rt_stencil_offset; + uint32_t rt_stencil_offset_or_swap_red_blue; uint32_t rt_stencil_pitch; }; // EDRAM buffer load/store pipelines. diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl index cd4079c67..f0ca434a0 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl @@ -8,6 +8,15 @@ void main(uint3 xe_group_id : SV_GroupID, tile_dword_index.x *= 4u; uint4 pixels = xe_edram_load_store_source.Load4( XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); + if (xe_edram_swap_red_blue != 0u) { + // Not a very long shift, just 16 or 20. + uint blue_shift = xe_edram_swap_red_blue >> 16u; + uint red_mask = xe_edram_swap_red_blue & 0xFFFFu; + uint blue_mask = red_mask << blue_shift; + pixels = (pixels & ~(red_mask | blue_mask)) | + ((pixels & red_mask) << blue_shift) | + ((pixels >> blue_shift) & red_mask); + } uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u; xe_edram_load_store_dest.Store4(rt_offset, pixels); diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl index 273ee41cf..7ee08448a 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl @@ -13,6 +13,11 @@ void main(uint3 xe_group_id : SV_GroupID, } uint4 pixels = xe_edram_load_store_source.Load4( XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); + if (xe_edram_swap_red_blue != 0u) { + // The only 64-bit formats with a blue component are 16_16_16_16 and + // 16_16_16_16_FLOAT. + pixels = (pixels.yxwz & 0xFFFFu) | (pixels & 0xFFFF0000u); + } uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u; xe_edram_load_store_dest.Store4(rt_offset, pixels); diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl index dd8611ae6..844b78d22 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl @@ -14,6 +14,10 @@ void main(uint3 xe_group_id : SV_GroupID, uint4 pixels_f16u32_packed = uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) | (uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u); + if (xe_edram_swap_red_blue != 0u) { + pixels_f16u32_packed = (pixels_f16u32_packed.yxwz & 0xFFFFu) | + (pixels_f16u32_packed & 0xFFFF0000u); + } uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u; xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed); diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli index f7636266a..22b4a9107 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli +++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli @@ -5,9 +5,13 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) { uint xe_edram_base_tiles; uint xe_edram_pitch_tiles; uint xe_edram_rt_color_depth_pitch; - uint xe_edram_rt_stencil_offset; + uint xe_edram_rt_stencil_offset_or_swap_red_blue; uint xe_edram_rt_stencil_pitch; }; +#define xe_edram_rt_stencil_offset xe_edram_rt_stencil_offset_or_swap_red_blue +// For loads only. How exactly it's handled depends on the specific load shader, +// but 0 always means red and blue shouldn't be swapped. +#define xe_edram_swap_red_blue xe_edram_rt_stencil_offset_or_swap_red_blue ByteAddressBuffer xe_edram_load_store_source : register(t0); RWByteAddressBuffer xe_edram_load_store_dest : register(u0);