[D3D12] Raw 32bpp resolve

This commit is contained in:
Triang3l 2018-08-23 13:25:36 +03:00
parent bc4125584c
commit ea1abdaa6e
8 changed files with 146 additions and 28 deletions

View File

@ -893,9 +893,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
assert_always(); assert_always();
return false; return false;
} }
Endian128 dest_endian = Endian128(dest_info & 0x7);
int32_t dest_exp_bias = int32_t dest_exp_bias =
!is_depth ? (int32_t((dest_info >> 16) << 26) >> 26) : 0; !is_depth ? (int32_t((dest_info >> 16) << 26) >> 26) : 0;
uint32_t dest_swap = (dest_info >> 24) & 0x1; bool dest_swap = !is_depth && ((dest_info >> 24) & 0x1);
// Get the destination location. // Get the destination location.
uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF; uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF;
@ -950,14 +951,105 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
// RTV of the destination format. // RTV of the destination format.
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
auto device = provider->GetDevice(); auto device = provider->GetDevice();
auto descriptor_size_view = provider->GetDescriptorSizeView();
if (sample_select <= xenos::CopySampleSelect::k3 && if (sample_select <= xenos::CopySampleSelect::k3 &&
src_texture_format == dest_format && dest_exp_bias == 0) { src_texture_format == dest_format && dest_exp_bias == 0) {
XELOGGPU("Resolving a single sample without conversion"); XELOGGPU("Resolving a single sample without conversion");
if (src_64bpp) {
// TODO(Triang3l): 64bpp sample copy shader.
return false;
}
// Make sure we have the memory to write to. // Make sure we have the memory to write to.
if (!shared_memory->MakeTilesResident(dest_address, dest_size)) { if (!shared_memory->MakeTilesResident(dest_address, dest_size)) {
return false; return false;
} }
// TODO(Triang3l): Raw resolve.
// Write the source and destination descriptors.
D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
if (command_processor_->RequestViewDescriptors(
0, 2, 2, descriptor_cpu_start, descriptor_gpu_start) == 0) {
return false;
}
D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc;
srv_desc.Format = DXGI_FORMAT_R32_TYPELESS;
srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
srv_desc.Buffer.FirstElement = 0;
srv_desc.Buffer.NumElements = 2 * 2048 * 1280;
srv_desc.Buffer.StructureByteStride = 0;
srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW;
device->CreateShaderResourceView(edram_buffer_, &srv_desc,
descriptor_cpu_start);
D3D12_CPU_DESCRIPTOR_HANDLE uav_cpu_handle;
uav_cpu_handle.ptr = descriptor_cpu_start.ptr + descriptor_size_view;
shared_memory->CreateRawUAV(uav_cpu_handle);
// Transition the buffers.
command_processor_->PushTransitionBarrier(
edram_buffer_, edram_buffer_state_,
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
edram_buffer_state_ = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
shared_memory->UseForWriting();
command_processor_->SubmitBarriers();
// Dispatch the computation.
command_list->SetComputeRootSignature(edram_load_store_root_signature_);
EDRAMLoadStoreRootConstants root_constants;
root_constants.tile_sample_rect_tl = copy_rect.left | (copy_rect.top << 16);
root_constants.tile_sample_rect_br =
copy_rect.right | (copy_rect.bottom << 16);
root_constants.tile_sample_dest_base = dest_address;
assert_true(dest_pitch <= 8192);
root_constants.tile_sample_dest_info = dest_pitch |
(uint32_t(sample_select) << 16) |
(uint32_t(dest_endian) << 18);
if (msaa_samples >= MsaaSamples::k2X) {
root_constants.tile_sample_dest_info |= 1 << 14;
if (msaa_samples >= MsaaSamples::k4X) {
root_constants.tile_sample_dest_info |= 1 << 15;
}
}
if (dest_swap) {
switch (ColorRenderTargetFormat(src_format)) {
case ColorRenderTargetFormat::k_8_8_8_8:
case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
root_constants.tile_sample_dest_info |= (8 << 21) | (16 << 26);
break;
case ColorRenderTargetFormat::k_2_10_10_10:
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16:
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
root_constants.tile_sample_dest_info |= (10 << 21) | (20 << 26);
break;
case ColorRenderTargetFormat::k_16_16_16_16:
case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
root_constants.tile_sample_dest_info |= 1 << 21;
break;
default:
break;
}
}
root_constants.base_pitch_tiles = edram_base | (surface_pitch_tiles << 11);
command_list->SetComputeRoot32BitConstants(
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
// TODO(Triang3l): 64bpp pipeline.
command_processor_->SetPipeline(edram_tile_sample_32bpp_pipeline_);
// 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region.
uint32_t group_count_x = row_tiles, group_count_y = rows;
if (msaa_samples >= MsaaSamples::k2X) {
group_count_y = (group_count_y + 1) >> 1;
if (msaa_samples >= MsaaSamples::k4X) {
group_count_x = (group_count_x + 1) >> 1;
}
}
command_list->Dispatch(group_count_x, group_count_y, 1);
// Commit the write.
command_processor_->PushUAVBarrier(shared_memory->GetBuffer());
// Make the texture cache refresh the data. // Make the texture cache refresh the data.
shared_memory->RangeWrittenByGPU(dest_address, dest_size); shared_memory->RangeWrittenByGPU(dest_address, dest_size);
} else { } else {
@ -1386,8 +1478,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source, command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
nullptr); nullptr);
EDRAMLoadStoreRootConstants root_constants; EDRAMLoadStoreRootConstants root_constants;
root_constants.base_pitch_tiles =
binding.edram_base | (rt_pitch_tiles << 11);
root_constants.rt_color_depth_offset = root_constants.rt_color_depth_offset =
uint32_t(location_dest.PlacedFootprint.Offset); uint32_t(location_dest.PlacedFootprint.Offset);
root_constants.rt_color_depth_pitch = root_constants.rt_color_depth_pitch =
@ -1402,6 +1492,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
root_constants.rt_stencil_pitch = root_constants.rt_stencil_pitch =
location_dest.PlacedFootprint.Footprint.RowPitch; location_dest.PlacedFootprint.Footprint.RowPitch;
} }
root_constants.base_pitch_tiles =
binding.edram_base | (rt_pitch_tiles << 11);
// Transition the copy buffer to SRV. // Transition the copy buffer to SRV.
command_processor_->PushTransitionBarrier( command_processor_->PushTransitionBarrier(
@ -1534,8 +1626,6 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
// Load the data. // Load the data.
command_processor_->SubmitBarriers(); command_processor_->SubmitBarriers();
EDRAMLoadStoreRootConstants root_constants; EDRAMLoadStoreRootConstants root_constants;
root_constants.base_pitch_tiles =
edram_bases[i] | (edram_pitch_tiles << 11);
root_constants.rt_color_depth_offset = root_constants.rt_color_depth_offset =
uint32_t(render_target->footprints[0].Offset); uint32_t(render_target->footprints[0].Offset);
root_constants.rt_color_depth_pitch = root_constants.rt_color_depth_pitch =
@ -1546,6 +1636,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
root_constants.rt_stencil_pitch = root_constants.rt_stencil_pitch =
render_target->footprints[1].Footprint.RowPitch; render_target->footprints[1].Footprint.RowPitch;
} }
root_constants.base_pitch_tiles =
edram_bases[i] | (edram_pitch_tiles << 11);
command_list->SetComputeRoot32BitConstants( command_list->SetComputeRoot32BitConstants(
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,

View File

@ -385,11 +385,11 @@ class RenderTargetCache {
// 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA. // 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
// 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA. // 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index). // 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
// 18:19 - destination endianness. // 18:20 - destination endianness.
// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping. // 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
// For 32 bits per pixel: // For 32 bits per pixel:
// 20:24 - red/blue bit depth. // 21:25 - red/blue bit depth.
// 25:29 - blue offset. // 26:30 - blue offset.
// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47. // For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
uint32_t tile_sample_dest_info; uint32_t tile_sample_dest_info;
}; };

View File

@ -1,12 +1,15 @@
#ifndef XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_ #ifndef XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
#define XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_ #define XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
// These functions may accept endianness without it being masked with & 3 -
// don't use ==, <=, >= here!
#define XE_BYTE_SWAP_OVERLOAD(XeByteSwapType) \ #define XE_BYTE_SWAP_OVERLOAD(XeByteSwapType) \
XeByteSwapType XeByteSwap(XeByteSwapType v, uint endian) { \ XeByteSwapType XeByteSwap(XeByteSwapType v, uint endian) { \
[flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \ if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
v = ((v & 0x00FF00FFu) << 8u) | ((v & 0xFF00FF00u) >> 8u); \ v = ((v & 0x00FF00FFu) << 8u) | ((v & 0xFF00FF00u) >> 8u); \
} \ } \
[flatten] if ((endian & 2u) != 0u) { \ if ((endian & 2u) != 0u) { \
v = (v << 16u) | (v >> 16u); \ v = (v << 16u) | (v >> 16u); \
} \ } \
return v; \ return v; \
@ -18,7 +21,7 @@ XE_BYTE_SWAP_OVERLOAD(uint4)
#define XE_BYTE_SWAP_16_OVERLOAD(XeByteSwapType) \ #define XE_BYTE_SWAP_16_OVERLOAD(XeByteSwapType) \
XeByteSwapType XeByteSwap16(XeByteSwapType v, uint endian) { \ XeByteSwapType XeByteSwap16(XeByteSwapType v, uint endian) { \
[flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \ if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
v = (v << 8u) | (v >> 8u); \ v = (v << 8u) | (v >> 8u); \
} \ } \
return v; \ return v; \
@ -28,4 +31,19 @@ XE_BYTE_SWAP_16_OVERLOAD(uint2)
XE_BYTE_SWAP_16_OVERLOAD(uint3) XE_BYTE_SWAP_16_OVERLOAD(uint3)
XE_BYTE_SWAP_16_OVERLOAD(uint4) XE_BYTE_SWAP_16_OVERLOAD(uint4)
// Byte-swaps a pair of dwords treated as one 64-bit value.
// If bit 2 of endian is set, the two dwords are exchanged and each of them is
// then swapped by XeByteSwap with endian 2 (bytes swapped within every 16-bit
// half, then the halves swapped), which leaves the 8 bytes of the pair fully
// reversed. Otherwise endian is forwarded to XeByteSwap unchanged.
uint2 XeByteSwap64(uint2 v, uint endian) {
  bool swap_dwords = (endian & 4u) != 0u;
  uint2 ordered = swap_dwords ? v.yx : v;
  uint dword_endian = swap_dwords ? 2u : endian;
  return XeByteSwap(ordered, dword_endian);
}
// Byte-swaps two pairs of dwords (xy and zw), each treated as a 64-bit value.
// If bit 2 of endian is set, the dwords within each pair are exchanged and
// every dword is then swapped by XeByteSwap with endian 2 (bytes swapped
// within every 16-bit half, then the halves swapped), which leaves the 8 bytes
// of each pair fully reversed. Otherwise endian is forwarded to XeByteSwap
// unchanged.
uint4 XeByteSwap64(uint4 v, uint endian) {
  bool swap_dwords = (endian & 4u) != 0u;
  uint4 ordered = swap_dwords ? v.yxwz : v;
  uint dword_endian = swap_dwords ? 2u : endian;
  return XeByteSwap(ordered, dword_endian);
}
#endif // XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_ #endif // XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_

View File

@ -23,11 +23,11 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
// 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA. // 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
// 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA. // 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index). // 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
// 18:19 - destination endianness. // 18:20 - destination endianness.
// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping. // 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
// For 32 bits per pixel: // For 32 bits per pixel:
// 20:24 - red/blue bit depth. // 21:25 - red/blue bit depth.
// 25:29 - blue offset. // 26:30 - blue offset.
// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47. // For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
#define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w) #define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)

View File

@ -1,3 +1,4 @@
#include "byte_swap.hlsli"
#include "edram_load_store.hlsli" #include "edram_load_store.hlsli"
#include "texture_address.hlsli" #include "texture_address.hlsli"
@ -6,8 +7,9 @@ void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) { uint3 xe_thread_id : SV_DispatchThreadID) {
// Check if not outside of the destination texture completely. // Check if not outside of the destination texture completely.
uint4 copy_rect = uint4 copy_rect;
(xe_edram_tile_sample_rect.xyxy >> uint4(0u, 0u, 16u, 16u)) & 0xFFFFu; copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu;
copy_rect.yw = xe_edram_tile_sample_rect >> 16u;
uint2 texel_index = xe_thread_id.xy; uint2 texel_index = xe_thread_id.xy;
texel_index.x *= 4u; texel_index.x *= 4u;
[branch] if (any(texel_index < copy_rect.xy) || [branch] if (any(texel_index < copy_rect.xy) ||
@ -19,9 +21,12 @@ void main(uint3 xe_group_id : SV_GroupID,
// XY - log2(pixel size), ZW - selected sample offset. // XY - log2(pixel size), ZW - selected sample offset.
uint4 sample_info = uint4 sample_info =
(xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u; (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
uint2 edram_tile_quarter =
uint2(uint2(10u, 8u) <= xe_group_thread_id) * sample_info.xy;
uint edram_offset = XeEDRAMOffset( uint edram_offset = XeEDRAMOffset(
xe_group_id.xy << sample_info.xy, (xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
xe_thread_id.xy << (sample_info.xy + uint2(2u, 0u)) + sample_info.zw); (xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
(sample_info.xy + uint2(2u, 0u)) + sample_info.zw);
// At 1x and 2x, this contains samples of 4 pixels. At 4x, this contains // At 1x and 2x, this contains samples of 4 pixels. At 4x, this contains
// samples of 2, need to load 2 more. // samples of 2, need to load 2 more.
uint4 pixels = xe_edram_load_store_source.Load4(edram_offset); uint4 pixels = xe_edram_load_store_source.Load4(edram_offset);
@ -30,7 +35,7 @@ void main(uint3 xe_group_id : SV_GroupID,
pixels.zw = xe_edram_load_store_source.Load3(edram_offset + 16u).xz; pixels.zw = xe_edram_load_store_source.Load3(edram_offset + 16u).xz;
} }
uint red_blue_swap = xe_edram_tile_sample_dest_info >> 20u; uint red_blue_swap = xe_edram_tile_sample_dest_info >> 21u;
if (red_blue_swap != 0u) { if (red_blue_swap != 0u) {
uint red_mask = (1u << (red_blue_swap & 31u)) - 1u; uint red_mask = (1u << (red_blue_swap & 31u)) - 1u;
// No need to be ready for a long shift Barney, it's just 16 or 20. // No need to be ready for a long shift Barney, it's just 16 or 20.
@ -42,16 +47,18 @@ void main(uint3 xe_group_id : SV_GroupID,
} }
// Tile the pixels to the shared memory. // Tile the pixels to the shared memory.
pixels = XeByteSwap(pixels, xe_edram_tile_sample_dest_info >> 18u);
uint4 texel_addresses = uint4 texel_addresses =
xe_edram_tile_sample_dest_base + xe_edram_tile_sample_dest_base +
XeTextureTiledOffset2D(texel_index - copy_rect.xy, XeTextureTiledOffset2D(texel_index - copy_rect.xy,
xe_edram_tile_sample_dest_info & 16383u, 2u); xe_edram_tile_sample_dest_info & 16383u, 2u);
xe_edram_load_store_dest.Store(texel_addresses.x, pixels.x); xe_edram_load_store_dest.Store(texel_addresses.x, pixels.x);
[branch] if (texel_index.x + 1u < copy_rect.z) { bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z;
[branch] if (texels_in_rect.x) {
xe_edram_load_store_dest.Store(texel_addresses.y, pixels.y); xe_edram_load_store_dest.Store(texel_addresses.y, pixels.y);
[branch] if (texel_index.x + 2u < copy_rect.z) { [branch] if (texels_in_rect.y) {
xe_edram_load_store_dest.Store(texel_addresses.z, pixels.z); xe_edram_load_store_dest.Store(texel_addresses.z, pixels.z);
[branch] if (texel_index.x + 3u < copy_rect.z) { [branch] if (texels_in_rect.z) {
xe_edram_load_store_dest.Store(texel_addresses.w, pixels.w); xe_edram_load_store_dest.Store(texel_addresses.w, pixels.w);
} }
} }

View File

@ -57,7 +57,7 @@ uint4 XeFloat32To20e4(uint4 f32u32) {
} }
uint4 XeFloat20e4To32(uint4 f24u32) { uint4 XeFloat20e4To32(uint4 f24u32) {
uint4 mantissa = f24u32 & 0xF00000u; uint4 mantissa = f24u32 & 0xFFFFFu;
uint4 exponent = f24u32 >> 20u; uint4 exponent = f24u32 >> 20u;
// Normalize the values for the denormalized components. // Normalize the values for the denormalized components.
// Exponent = 1; // Exponent = 1;

View File

@ -541,7 +541,7 @@ void SharedMemory::CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
device->CreateShaderResourceView(buffer_, &desc, handle); device->CreateShaderResourceView(buffer_, &desc, handle);
} }
void SharedMemory::CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) { void SharedMemory::CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
auto device = auto device =
command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
D3D12_UNORDERED_ACCESS_VIEW_DESC desc; D3D12_UNORDERED_ACCESS_VIEW_DESC desc;

View File

@ -36,6 +36,7 @@ class SharedMemory {
bool Initialize(); bool Initialize();
void Shutdown(); void Shutdown();
// Returns the buffer_ resource pointer without changing any state of the
// shared memory object.
ID3D12Resource* GetBuffer() const {
  return buffer_;
}
D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const { D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const {
return buffer_gpu_address_; return buffer_gpu_address_;
} }
@ -90,7 +91,7 @@ class SharedMemory {
void UseForWriting(); void UseForWriting();
void CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle); void CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
void CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle); void CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
private: private:
D3D12CommandProcessor* command_processor_; D3D12CommandProcessor* command_processor_;