From ea1abdaa6eb53737c842b87d33494a94bad0f6f9 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 23 Aug 2018 13:25:36 +0300 Subject: [PATCH] [D3D12] Raw 32bpp resolve --- src/xenia/gpu/d3d12/render_target_cache.cc | 104 +++++++++++++++++- src/xenia/gpu/d3d12/render_target_cache.h | 8 +- src/xenia/gpu/d3d12/shaders/byte_swap.hlsli | 24 +++- .../gpu/d3d12/shaders/edram_load_store.hlsli | 8 +- .../shaders/edram_tile_sample_32bpp.cs.hlsl | 23 ++-- .../gpu/d3d12/shaders/pixel_formats.hlsli | 2 +- src/xenia/gpu/d3d12/shared_memory.cc | 2 +- src/xenia/gpu/d3d12/shared_memory.h | 3 +- 8 files changed, 146 insertions(+), 28 deletions(-) diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 4e7add734..28adddf6f 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -893,9 +893,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, assert_always(); return false; } + Endian128 dest_endian = Endian128(dest_info & 0x7); int32_t dest_exp_bias = !is_depth ? (int32_t((dest_info >> 16) << 26) >> 26) : 0; - uint32_t dest_swap = (dest_info >> 24) & 0x1; + bool dest_swap = !is_depth && ((dest_info >> 24) & 0x1); // Get the destination location. uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF; @@ -950,14 +951,105 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, // RTV of the destination format. auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); auto device = provider->GetDevice(); + auto descriptor_size_view = provider->GetDescriptorSizeView(); if (sample_select <= xenos::CopySampleSelect::k3 && src_texture_format == dest_format && dest_exp_bias == 0) { XELOGGPU("Resolving a single sample without conversion"); + if (src_64bpp) { + // TODO(Triang3l): 64bpp sample copy shader. + return false; + } + // Make sure we have the memory to write to. if (!shared_memory->MakeTilesResident(dest_address, dest_size)) { return false; } - // TODO(Triang3l): Raw resolve. + + // Write the source and destination descriptors. + D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start; + D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start; + if (command_processor_->RequestViewDescriptors( + 0, 2, 2, descriptor_cpu_start, descriptor_gpu_start) == 0) { + return false; + } + D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc; + srv_desc.Format = DXGI_FORMAT_R32_TYPELESS; + srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER; + srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; + srv_desc.Buffer.FirstElement = 0; + srv_desc.Buffer.NumElements = 2 * 2048 * 1280; + srv_desc.Buffer.StructureByteStride = 0; + srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW; + device->CreateShaderResourceView(edram_buffer_, &srv_desc, + descriptor_cpu_start); + D3D12_CPU_DESCRIPTOR_HANDLE uav_cpu_handle; + uav_cpu_handle.ptr = descriptor_cpu_start.ptr + descriptor_size_view; + shared_memory->CreateRawUAV(uav_cpu_handle); + + // Transition the buffers. + command_processor_->PushTransitionBarrier( + edram_buffer_, edram_buffer_state_, + D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE); + edram_buffer_state_ = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE; + shared_memory->UseForWriting(); + command_processor_->SubmitBarriers(); + + // Dispatch the computation. + command_list->SetComputeRootSignature(edram_load_store_root_signature_); + EDRAMLoadStoreRootConstants root_constants; + root_constants.tile_sample_rect_tl = copy_rect.left | (copy_rect.top << 16); + root_constants.tile_sample_rect_br = + copy_rect.right | (copy_rect.bottom << 16); + root_constants.tile_sample_dest_base = dest_address; + assert_true(dest_pitch <= 8192); + root_constants.tile_sample_dest_info = dest_pitch | + (uint32_t(sample_select) << 16) | + (uint32_t(dest_endian) << 18); + if (msaa_samples >= MsaaSamples::k2X) { + root_constants.tile_sample_dest_info |= 1 << 14; + if (msaa_samples >= MsaaSamples::k4X) { + root_constants.tile_sample_dest_info |= 1 << 15; + } + } + if (dest_swap) { + switch (ColorRenderTargetFormat(src_format)) { + case ColorRenderTargetFormat::k_8_8_8_8: + case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: + root_constants.tile_sample_dest_info |= (8 << 21) | (16 << 26); + break; + case ColorRenderTargetFormat::k_2_10_10_10: + case ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16: + case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: + root_constants.tile_sample_dest_info |= (10 << 21) | (20 << 26); + break; + case ColorRenderTargetFormat::k_16_16_16_16: + case ColorRenderTargetFormat::k_16_16_16_16_FLOAT: + root_constants.tile_sample_dest_info |= 1 << 21; + break; + default: + break; + } + } + root_constants.base_pitch_tiles = edram_base | (surface_pitch_tiles << 11); + command_list->SetComputeRoot32BitConstants( + 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); + command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start); + // TODO(Triang3l): 64bpp pipeline. + command_processor_->SetPipeline(edram_tile_sample_32bpp_pipeline_); + // 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region. + uint32_t group_count_x = row_tiles, group_count_y = rows; + if (msaa_samples >= MsaaSamples::k2X) { + group_count_y = (group_count_y + 1) >> 1; + if (msaa_samples >= MsaaSamples::k4X) { + group_count_x = (group_count_x + 1) >> 1; + } + } + command_list->Dispatch(group_count_x, group_count_y, 1); + + // Commit the write. + command_processor_->PushUAVBarrier(shared_memory->GetBuffer()); + // Make the texture cache refresh the data. shared_memory->RangeWrittenByGPU(dest_address, dest_size); } else { @@ -1386,8 +1478,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source, nullptr); EDRAMLoadStoreRootConstants root_constants; - root_constants.base_pitch_tiles = - binding.edram_base | (rt_pitch_tiles << 11); root_constants.rt_color_depth_offset = uint32_t(location_dest.PlacedFootprint.Offset); root_constants.rt_color_depth_pitch = @@ -1402,6 +1492,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() { root_constants.rt_stencil_pitch = location_dest.PlacedFootprint.Footprint.RowPitch; } + root_constants.base_pitch_tiles = + binding.edram_base | (rt_pitch_tiles << 11); // Transition the copy buffer to SRV. command_processor_->PushTransitionBarrier( @@ -1534,8 +1626,6 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM( // Load the data. command_processor_->SubmitBarriers(); EDRAMLoadStoreRootConstants root_constants; - root_constants.base_pitch_tiles = - edram_bases[i] | (edram_pitch_tiles << 11); root_constants.rt_color_depth_offset = uint32_t(render_target->footprints[0].Offset); root_constants.rt_color_depth_pitch = @@ -1546,6 +1636,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM( root_constants.rt_stencil_pitch = render_target->footprints[1].Footprint.RowPitch; } + root_constants.base_pitch_tiles = + edram_bases[i] | (edram_pitch_tiles << 11); command_list->SetComputeRoot32BitConstants( 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index 3c26b5f47..253624854 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -385,11 +385,11 @@ class RenderTargetCache { // 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA. // 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA. // 16:17 - sample to load (16 - vertical index, 17 - horizontal index). - // 18:19 - destination endianness. - // 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping. + // 18:20 - destination endianness. + // 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping. // For 32 bits per pixel: - // 20:24 - red/blue bit depth. - // 25:29 - blue offset. + // 21:25 - red/blue bit depth. + // 26:30 - blue offset. // For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47. uint32_t tile_sample_dest_info; }; diff --git a/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli b/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli index b345a5ad3..1cdd55ae1 100644 --- a/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli +++ b/src/xenia/gpu/d3d12/shaders/byte_swap.hlsli @@ -1,12 +1,15 @@ #ifndef XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_ #define XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_ +// These functions may accept endianness without it being masked with & 3 - +// don't use ==, <=, >= here! + #define XE_BYTE_SWAP_OVERLOAD(XeByteSwapType) \ XeByteSwapType XeByteSwap(XeByteSwapType v, uint endian) { \ - [flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \ + if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \ v = ((v & 0x00FF00FFu) << 8u) | ((v & 0xFF00FF00u) >> 8u); \ } \ - [flatten] if ((endian & 2u) != 0u) { \ + if ((endian & 2u) != 0u) { \ v = (v << 16u) | (v >> 16u); \ } \ return v; \ @@ -18,7 +21,7 @@ XE_BYTE_SWAP_OVERLOAD(uint4) #define XE_BYTE_SWAP_16_OVERLOAD(XeByteSwapType) \ XeByteSwapType XeByteSwap16(XeByteSwapType v, uint endian) { \ - [flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \ + if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \ v = (v << 8u) | (v >> 8u); \ } \ return v; \ @@ -28,4 +31,19 @@ XE_BYTE_SWAP_16_OVERLOAD(uint2) XE_BYTE_SWAP_16_OVERLOAD(uint3) XE_BYTE_SWAP_16_OVERLOAD(uint4) +uint2 XeByteSwap64(uint2 v, uint endian) { + if (endian & 4u) { + v = v.yx; + endian = 2u; + } + return XeByteSwap(v, endian); +} +uint4 XeByteSwap64(uint4 v, uint endian) { + if (endian & 4u) { + v = v.yxwz; + endian = 2u; + } + return XeByteSwap(v, endian); +} + #endif // XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_ diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli index 0314abd6d..e55b783a4 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli +++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli @@ -23,11 +23,11 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) { // 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA. // 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA. // 16:17 - sample to load (16 - vertical index, 17 - horizontal index). -// 18:19 - destination endianness. -// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping. +// 18:20 - destination endianness. +// 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping. // For 32 bits per pixel: -// 20:24 - red/blue bit depth. -// 25:29 - blue offset. +// 21:25 - red/blue bit depth. +// 26:30 - blue offset. // For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47. #define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w) diff --git a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl index 29633a1aa..0df7dbdbb 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_tile_sample_32bpp.cs.hlsl @@ -1,3 +1,4 @@ +#include "byte_swap.hlsli" #include "edram_load_store.hlsli" #include "texture_address.hlsli" @@ -6,8 +7,9 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { // Check if not outside of the destination texture completely. - uint4 copy_rect = - (xe_edram_tile_sample_rect.xyxy >> uint4(0u, 0u, 16u, 16u)) & 0xFFFFu; + uint4 copy_rect; + copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu; + copy_rect.yw = xe_edram_tile_sample_rect >> 16u; uint2 texel_index = xe_thread_id.xy; texel_index.x *= 4u; [branch] if (any(texel_index < copy_rect.xy) || @@ -19,9 +21,12 @@ void main(uint3 xe_group_id : SV_GroupID, // XY - log2(pixel size), ZW - selected sample offset. uint4 sample_info = (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u; + uint2 edram_tile_quarter = + uint2(uint2(10u, 8u) <= xe_group_thread_id) * sample_info.xy; uint edram_offset = XeEDRAMOffset( - xe_group_id.xy << sample_info.xy, - xe_thread_id.xy << (sample_info.xy + uint2(2u, 0u)) + sample_info.zw); + (xe_group_id.xy << sample_info.xy) + edram_tile_quarter, + (xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) << + (sample_info.xy + uint2(2u, 0u)) + sample_info.zw); // At 1x and 2x, this contains samples of 4 pixels. At 4x, this contains // samples of 2, need to load 2 more. uint4 pixels = xe_edram_load_store_source.Load4(edram_offset); @@ -30,7 +35,7 @@ void main(uint3 xe_group_id : SV_GroupID, pixels.zw = xe_edram_load_store_source.Load3(edram_offset + 16u).xz; } - uint red_blue_swap = xe_edram_tile_sample_dest_info >> 20u; + uint red_blue_swap = xe_edram_tile_sample_dest_info >> 21u; if (red_blue_swap != 0u) { uint red_mask = (1u << (red_blue_swap & 31u)) - 1u; // No need to be ready for a long shift Barney, it's just 16 or 20. @@ -42,16 +47,18 @@ void main(uint3 xe_group_id : SV_GroupID, } // Tile the pixels to the shared memory. + pixels = XeByteSwap(pixels, xe_edram_tile_sample_dest_info >> 18u); uint4 texel_addresses = xe_edram_tile_sample_dest_base + XeTextureTiledOffset2D(texel_index - copy_rect.xy, xe_edram_tile_sample_dest_info & 16383u, 2u); xe_edram_load_store_dest.Store(texel_addresses.x, pixels.x); - [branch] if (texel_index.x + 1u < copy_rect.z) { + bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z; + [branch] if (texels_in_rect.x) { xe_edram_load_store_dest.Store(texel_addresses.y, pixels.y); - [branch] if (texel_index.x + 2u < copy_rect.z) { + [branch] if (texels_in_rect.y) { xe_edram_load_store_dest.Store(texel_addresses.z, pixels.z); - [branch] if (texel_index.x + 3u < copy_rect.z) { + [branch] if (texels_in_rect.z) { xe_edram_load_store_dest.Store(texel_addresses.w, pixels.w); } } diff --git a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli index fbdbb0221..eff4e0a7d 100644 --- a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli +++ b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli @@ -57,7 +57,7 @@ uint4 XeFloat32To20e4(uint4 f32u32) { } uint4 XeFloat20e4To32(uint4 f24u32) { - uint4 mantissa = f24u32 & 0xF00000u; + uint4 mantissa = f24u32 & 0xFFFFFu; uint4 exponent = f24u32 >> 20u; // Normalize the values for the denormalized components. // Exponent = 1; diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc index 118ac9e62..158d1decd 100644 --- a/src/xenia/gpu/d3d12/shared_memory.cc +++ b/src/xenia/gpu/d3d12/shared_memory.cc @@ -541,7 +541,7 @@ void SharedMemory::CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle) { device->CreateShaderResourceView(buffer_, &desc, handle); } -void SharedMemory::CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) { +void SharedMemory::CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) { auto device = command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); D3D12_UNORDERED_ACCESS_VIEW_DESC desc; diff --git a/src/xenia/gpu/d3d12/shared_memory.h b/src/xenia/gpu/d3d12/shared_memory.h index 23f85dfa9..39b21eb8a 100644 --- a/src/xenia/gpu/d3d12/shared_memory.h +++ b/src/xenia/gpu/d3d12/shared_memory.h @@ -36,6 +36,7 @@ class SharedMemory { bool Initialize(); void Shutdown(); + ID3D12Resource* GetBuffer() const { return buffer_; } D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const { return buffer_gpu_address_; } @@ -90,7 +91,7 @@ class SharedMemory { void UseForWriting(); void CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle); - void CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle); + void CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle); private: D3D12CommandProcessor* command_processor_;