[D3D12] Common root constants for EDRAM load/store and single sample load

This commit is contained in:
Triang3l 2018-08-22 19:54:51 +03:00
parent c4f80aac0d
commit 2d8527c9df
13 changed files with 109 additions and 83 deletions

View File

@ -959,6 +959,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
// bilinear filtering), applying exponent bias and swapping red and blue in
// a format-agnostic way, then the resulting color is written to a temporary
// RTV of the destination format.
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
auto device = provider->GetDevice();
if (sample_select <= xenos::CopySampleSelect::k3 &&
src_texture_format == dest_format && dest_exp_bias == 0) {
XELOGGPU("Resolving a single sample without conversion");
@ -1385,6 +1387,14 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
const RenderTarget* render_target = binding.render_target;
bool is_64bpp = false;
// Get the number of X thread groups.
uint32_t rt_pitch_tiles = surface_pitch_tiles;
if (!render_target->key.is_depth &&
IsColorFormat64bpp(
ColorRenderTargetFormat(render_target->key.format))) {
rt_pitch_tiles *= 2;
}
// Copy from the render target planes and set up the layout.
D3D12_TEXTURE_COPY_LOCATION location_source, location_dest;
location_source.pResource = render_target->resource;
@ -1397,13 +1407,10 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
nullptr);
EDRAMLoadStoreRootConstants root_constants;
root_constants.base_tiles = binding.edram_base;
root_constants.pitch_tiles = surface_pitch_tiles;
if (!render_target->key.is_depth &&
IsColorFormat64bpp(
ColorRenderTargetFormat(render_target->key.format))) {
root_constants.pitch_tiles *= 2;
}
root_constants.base_pitch_tiles =
binding.edram_base | (rt_pitch_tiles << 11);
root_constants.rt_color_depth_offset =
uint32_t(location_dest.PlacedFootprint.Offset);
root_constants.rt_color_depth_pitch =
location_dest.PlacedFootprint.Footprint.RowPitch;
if (render_target->key.is_depth) {
@ -1411,12 +1418,10 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
location_dest.PlacedFootprint = render_target->footprints[1];
command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
nullptr);
root_constants.rt_stencil_offset_or_swap_red_blue =
root_constants.rt_stencil_offset =
uint32_t(location_dest.PlacedFootprint.Offset);
root_constants.rt_stencil_pitch =
location_dest.PlacedFootprint.Footprint.RowPitch;
} else {
root_constants.rt_stencil_offset_or_swap_red_blue = 0;
}
// Transition the copy buffer to SRV.
@ -1437,8 +1442,7 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
render_target->key.format);
command_processor_->SetPipeline(edram_store_pipelines_[size_t(mode)]);
command_list->Dispatch(root_constants.pitch_tiles, binding.edram_dirty_rows,
1);
command_list->Dispatch(rt_pitch_tiles, binding.edram_dirty_rows, 1);
// Commit the UAV write and prepare for copying again.
barrier_count = 1;
@ -1569,31 +1573,18 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
}
const RenderTarget* render_target = render_targets[i];
// Set up the layout.
EDRAMLoadStoreRootConstants root_constants;
root_constants.base_tiles = edram_bases[i];
root_constants.pitch_tiles = render_target->key.width_ss_div_80;
// Get the number of X thread groups.
uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80;
if (!render_target->key.is_depth &&
IsColorFormat64bpp(
ColorRenderTargetFormat(render_target->key.format))) {
root_constants.pitch_tiles *= 2;
edram_pitch_tiles *= 2;
}
root_constants.rt_color_depth_pitch =
render_target->footprints[0].Footprint.RowPitch;
if (render_target->key.is_depth) {
root_constants.rt_stencil_offset_or_swap_red_blue =
uint32_t(render_target->footprints[1].Offset);
root_constants.rt_stencil_pitch =
render_target->footprints[1].Footprint.RowPitch;
} else {
root_constants.rt_stencil_offset_or_swap_red_blue = 0;
}
// Validate the height in case the resolve is somehow too large (this shouldn't
// happen, but who knows what games do).
uint32_t edram_rows =
std::min(render_target->key.height_ss_div_16,
(2048u - edram_bases[i]) / root_constants.pitch_tiles);
(2048u - edram_bases[i]) / edram_pitch_tiles);
if (edram_rows == 0) {
continue;
}
@ -1612,12 +1603,25 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
}
// Load the data.
EDRAMLoadStoreRootConstants root_constants;
root_constants.base_pitch_tiles =
edram_bases[i] | (edram_pitch_tiles << 11);
root_constants.rt_color_depth_offset =
uint32_t(render_target->footprints[0].Offset);
root_constants.rt_color_depth_pitch =
render_target->footprints[0].Footprint.RowPitch;
if (render_target->key.is_depth) {
root_constants.rt_stencil_offset =
uint32_t(render_target->footprints[1].Offset);
root_constants.rt_stencil_pitch =
render_target->footprints[1].Footprint.RowPitch;
}
command_list->SetComputeRoot32BitConstants(
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
render_target->key.format);
command_processor_->SetPipeline(edram_load_pipelines_[size_t(mode)]);
command_list->Dispatch(root_constants.pitch_tiles, edram_rows, 1);
command_list->Dispatch(edram_pitch_tiles, edram_rows, 1);
// Commit the UAV write and transition the copy buffer to copy source.
barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;

View File

@ -369,11 +369,33 @@ class RenderTargetCache {
// EDRAM buffer load/store root signature.
ID3D12RootSignature* edram_load_store_root_signature_ = nullptr;
struct EDRAMLoadStoreRootConstants {
uint32_t base_tiles;
uint32_t pitch_tiles;
uint32_t rt_color_depth_pitch;
uint32_t rt_stencil_offset_or_swap_red_blue;
uint32_t rt_stencil_pitch;
union {
struct {
uint32_t rt_color_depth_offset;
uint32_t rt_color_depth_pitch;
uint32_t rt_stencil_offset;
uint32_t rt_stencil_pitch;
};
struct {
// 16 bits for X, 16 bits for Y.
uint32_t tile_sample_rect_tl;
uint32_t tile_sample_rect_br;
uint32_t tile_sample_dest_base;
// 0:13 - destination pitch.
// 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
// 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
// 18:19 - destination endianness.
// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
// For 32 bits per pixel:
// 20:24 - red/blue bit depth.
// 25:29 - blue offset.
// For 64 bits per pixel, it's 1 if bits 0:15 and 32:47 need to be swapped.
uint32_t tile_sample_dest_info;
};
};
// Base in the lower 11 bits, pitch above.
uint32_t base_pitch_tiles;
};
// EDRAM buffer load/store pipelines.
static const EDRAMLoadStoreModeInfo

View File

@ -8,16 +8,7 @@ void main(uint3 xe_group_id : SV_GroupID,
tile_dword_index.x *= 4u;
uint4 pixels = xe_edram_load_store_source.Load4(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
if (xe_edram_swap_red_blue != 0u) {
// Not a very long shift, just 16 or 20.
uint blue_shift = xe_edram_swap_red_blue >> 16u;
uint red_mask = xe_edram_swap_red_blue & 0xFFFFu;
uint blue_mask = red_mask << blue_shift;
pixels = (pixels & ~(red_mask | blue_mask)) |
((pixels & red_mask) << blue_shift) |
((pixels >> blue_shift) & red_mask);
}
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, pixels);
}

View File

@ -13,12 +13,7 @@ void main(uint3 xe_group_id : SV_GroupID,
}
uint4 pixels = xe_edram_load_store_source.Load4(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
if (xe_edram_swap_red_blue != 0u) {
// The only 64-bit formats with a blue component are 16_16_16_16 and
// 16_16_16_16_FLOAT.
pixels = (pixels.yxwz & 0xFFFFu) | (pixels & 0xFFFF0000u);
}
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, pixels);
}

View File

@ -14,11 +14,7 @@ void main(uint3 xe_group_id : SV_GroupID,
uint4 pixels_f16u32_packed =
uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) |
(uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u);
if (xe_edram_swap_red_blue != 0u) {
pixels_f16u32_packed = (pixels_f16u32_packed.yxwz & 0xFFFFu) |
(pixels_f16u32_packed & 0xFFFF0000u);
}
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed);
}

View File

@ -19,13 +19,13 @@ void main(uint3 xe_group_id : SV_GroupID,
uint4 depth = depth24to32 + (depth32 - depth24to32) *
uint4(XeFloat32To20e4(depth32) == depth24);
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, depth);
// Stencil.
uint4 stencil = (depth24_stencil & 0xFFu) << uint4(0u, 8u, 16u, 24u);
stencil.xy |= stencil.zw;
stencil.x |= stencil.y;
rt_offset = xe_edram_rt_stencil_offset +
xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
xe_edram_rt_stencil_offset;
xe_edram_load_store_dest.Store(rt_offset, stencil.x);
}

View File

@ -10,13 +10,13 @@ void main(uint3 xe_group_id : SV_GroupID,
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
// Depth.
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, pixels >> 8u);
// Stencil.
uint4 stencil = (pixels & 0xFFu) << uint4(0u, 8u, 16u, 24u);
stencil.xy |= stencil.zw;
stencil.x |= stencil.y;
rt_offset = xe_edram_rt_stencil_offset +
xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
xe_edram_rt_stencil_offset;
xe_edram_load_store_dest.Store(rt_offset, stencil.x);
}

View File

@ -1,25 +1,43 @@
#ifndef XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
#define XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
// Root constants.
cbuffer XeEDRAMLoadStoreConstants : register(b0) {
uint xe_edram_base_tiles;
uint xe_edram_pitch_tiles;
uint xe_edram_rt_color_depth_pitch;
uint xe_edram_rt_stencil_offset_or_swap_red_blue;
uint xe_edram_rt_stencil_pitch;
uint4 xe_edram_load_store_constants;
// Base in the lower 11 bits, pitch in the upper part, in tiles.
uint xe_edram_base_pitch_tiles;
};
#define xe_edram_rt_stencil_offset xe_edram_rt_stencil_offset_or_swap_red_blue
// For loads only. How exactly it's handled depends on the specific load shader,
// but 0 always means red and blue shouldn't be swapped.
#define xe_edram_swap_red_blue xe_edram_rt_stencil_offset_or_swap_red_blue
// For loading and storing render targets.
#define xe_edram_rt_color_depth_offset (xe_edram_load_store_constants.x)
#define xe_edram_rt_color_depth_pitch (xe_edram_load_store_constants.y)
#define xe_edram_rt_stencil_offset (xe_edram_load_store_constants.z)
#define xe_edram_rt_stencil_pitch (xe_edram_load_store_constants.w)
// For single sample resolving.
// Left/top of the copied region (relative to EDRAM base) in the lower 16 bits,
// right/bottom in the upper.
#define xe_edram_tile_sample_rect (xe_edram_load_store_constants.xy)
#define xe_edram_tile_sample_dest_base (xe_edram_load_store_constants.z)
// 0:13 - destination pitch.
// 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
// 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
// 18:19 - destination endianness.
// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
// For 32 bits per pixel:
// 20:24 - red/blue bit depth.
// 25:29 - blue offset.
// For 64 bits per pixel, it's 1 if bits 0:15 and 32:47 need to be swapped.
#define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)
ByteAddressBuffer xe_edram_load_store_source : register(t0);
RWByteAddressBuffer xe_edram_load_store_dest : register(u0);
uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {
return (xe_edram_base_tiles + (tile_index.y * xe_edram_pitch_tiles) +
tile_index.x) * 5120u + tile_dword_index.y * 320u +
tile_dword_index.x * 4u;
return ((xe_edram_base_pitch_tiles & 2047u) +
tile_index.y * (xe_edram_base_pitch_tiles >> 11u) + tile_index.x) *
5120u + tile_dword_index.y * 320u + tile_dword_index.x * 4u;
}
#endif // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_

View File

@ -5,7 +5,7 @@ void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 4u;

View File

@ -5,7 +5,7 @@ void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
// One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
// from 1 render target row rather than 2. Threads with X 0-19 are for the

View File

@ -6,7 +6,7 @@ void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 pixels_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
uint4 pixel_0_f16u32 = pixels_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
uint4 pixel_1_f16u32 = pixels_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);

View File

@ -7,12 +7,12 @@ void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_thread_id : SV_DispatchThreadID) {
// Depth.
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset);
uint4 depth24_stencil = XeFloat32To20e4(depth32) << 8u;
// Stencil.
rt_offset = xe_edram_rt_stencil_offset +
xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
xe_edram_rt_stencil_offset;
depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
uint2 tile_dword_index = xe_group_thread_id.xy;

View File

@ -6,12 +6,12 @@ void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_thread_id : SV_DispatchThreadID) {
// Depth.
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u;
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 pixels =
(xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u;
// Stencil.
rt_offset = xe_edram_rt_stencil_offset +
xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
xe_edram_rt_stencil_offset;
pixels |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
uint2 tile_dword_index = xe_group_thread_id.xy;