[D3D12] Common root constants for EDRAM load/store and single sample load
This commit is contained in:
parent
c4f80aac0d
commit
2d8527c9df
|
@ -959,6 +959,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
|
|||
// bilinear filtering), applying exponent bias and swapping red and blue in
|
||||
// a format-agnostic way, then the resulting color is written to a temporary
|
||||
// RTV of the destination format.
|
||||
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
|
||||
auto device = provider->GetDevice();
|
||||
if (sample_select <= xenos::CopySampleSelect::k3 &&
|
||||
src_texture_format == dest_format && dest_exp_bias == 0) {
|
||||
XELOGGPU("Resolving a single sample without conversion");
|
||||
|
@ -1385,6 +1387,14 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
|||
const RenderTarget* render_target = binding.render_target;
|
||||
bool is_64bpp = false;
|
||||
|
||||
// Get the number of X thread groups.
|
||||
uint32_t rt_pitch_tiles = surface_pitch_tiles;
|
||||
if (!render_target->key.is_depth &&
|
||||
IsColorFormat64bpp(
|
||||
ColorRenderTargetFormat(render_target->key.format))) {
|
||||
rt_pitch_tiles *= 2;
|
||||
}
|
||||
|
||||
// Copy from the render target planes and set up the layout.
|
||||
D3D12_TEXTURE_COPY_LOCATION location_source, location_dest;
|
||||
location_source.pResource = render_target->resource;
|
||||
|
@ -1397,13 +1407,10 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
|||
command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
|
||||
nullptr);
|
||||
EDRAMLoadStoreRootConstants root_constants;
|
||||
root_constants.base_tiles = binding.edram_base;
|
||||
root_constants.pitch_tiles = surface_pitch_tiles;
|
||||
if (!render_target->key.is_depth &&
|
||||
IsColorFormat64bpp(
|
||||
ColorRenderTargetFormat(render_target->key.format))) {
|
||||
root_constants.pitch_tiles *= 2;
|
||||
}
|
||||
root_constants.base_pitch_tiles =
|
||||
binding.edram_base | (rt_pitch_tiles << 11);
|
||||
root_constants.rt_color_depth_offset =
|
||||
uint32_t(location_dest.PlacedFootprint.Offset);
|
||||
root_constants.rt_color_depth_pitch =
|
||||
location_dest.PlacedFootprint.Footprint.RowPitch;
|
||||
if (render_target->key.is_depth) {
|
||||
|
@ -1411,12 +1418,10 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
|||
location_dest.PlacedFootprint = render_target->footprints[1];
|
||||
command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
|
||||
nullptr);
|
||||
root_constants.rt_stencil_offset_or_swap_red_blue =
|
||||
root_constants.rt_stencil_offset =
|
||||
uint32_t(location_dest.PlacedFootprint.Offset);
|
||||
root_constants.rt_stencil_pitch =
|
||||
location_dest.PlacedFootprint.Footprint.RowPitch;
|
||||
} else {
|
||||
root_constants.rt_stencil_offset_or_swap_red_blue = 0;
|
||||
}
|
||||
|
||||
// Transition the copy buffer to SRV.
|
||||
|
@ -1437,8 +1442,7 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
|||
EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
|
||||
render_target->key.format);
|
||||
command_processor_->SetPipeline(edram_store_pipelines_[size_t(mode)]);
|
||||
command_list->Dispatch(root_constants.pitch_tiles, binding.edram_dirty_rows,
|
||||
1);
|
||||
command_list->Dispatch(rt_pitch_tiles, binding.edram_dirty_rows, 1);
|
||||
|
||||
// Commit the UAV write and prepare for copying again.
|
||||
barrier_count = 1;
|
||||
|
@ -1569,31 +1573,18 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
|
|||
}
|
||||
const RenderTarget* render_target = render_targets[i];
|
||||
|
||||
// Set up the layout.
|
||||
EDRAMLoadStoreRootConstants root_constants;
|
||||
root_constants.base_tiles = edram_bases[i];
|
||||
root_constants.pitch_tiles = render_target->key.width_ss_div_80;
|
||||
// Get the number of X thread groups.
|
||||
uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80;
|
||||
if (!render_target->key.is_depth &&
|
||||
IsColorFormat64bpp(
|
||||
ColorRenderTargetFormat(render_target->key.format))) {
|
||||
root_constants.pitch_tiles *= 2;
|
||||
edram_pitch_tiles *= 2;
|
||||
}
|
||||
root_constants.rt_color_depth_pitch =
|
||||
render_target->footprints[0].Footprint.RowPitch;
|
||||
if (render_target->key.is_depth) {
|
||||
root_constants.rt_stencil_offset_or_swap_red_blue =
|
||||
uint32_t(render_target->footprints[1].Offset);
|
||||
root_constants.rt_stencil_pitch =
|
||||
render_target->footprints[1].Footprint.RowPitch;
|
||||
} else {
|
||||
root_constants.rt_stencil_offset_or_swap_red_blue = 0;
|
||||
}
|
||||
|
||||
// Validate the height in case the resolve is somehow too large (shouldn't
|
||||
// happen though, but who knows what games do).
|
||||
uint32_t edram_rows =
|
||||
std::min(render_target->key.height_ss_div_16,
|
||||
(2048u - edram_bases[i]) / root_constants.pitch_tiles);
|
||||
(2048u - edram_bases[i]) / edram_pitch_tiles);
|
||||
if (edram_rows == 0) {
|
||||
continue;
|
||||
}
|
||||
|
@ -1612,12 +1603,25 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
|
|||
}
|
||||
|
||||
// Load the data.
|
||||
EDRAMLoadStoreRootConstants root_constants;
|
||||
root_constants.base_pitch_tiles =
|
||||
edram_bases[i] | (edram_pitch_tiles << 11);
|
||||
root_constants.rt_color_depth_offset =
|
||||
uint32_t(render_target->footprints[0].Offset);
|
||||
root_constants.rt_color_depth_pitch =
|
||||
render_target->footprints[0].Footprint.RowPitch;
|
||||
if (render_target->key.is_depth) {
|
||||
root_constants.rt_stencil_offset =
|
||||
uint32_t(render_target->footprints[1].Offset);
|
||||
root_constants.rt_stencil_pitch =
|
||||
render_target->footprints[1].Footprint.RowPitch;
|
||||
}
|
||||
command_list->SetComputeRoot32BitConstants(
|
||||
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
|
||||
EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
|
||||
render_target->key.format);
|
||||
command_processor_->SetPipeline(edram_load_pipelines_[size_t(mode)]);
|
||||
command_list->Dispatch(root_constants.pitch_tiles, edram_rows, 1);
|
||||
command_list->Dispatch(edram_pitch_tiles, edram_rows, 1);
|
||||
|
||||
// Commit the UAV write and transition the copy buffer to copy source.
|
||||
barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
|
||||
|
|
|
@ -369,11 +369,33 @@ class RenderTargetCache {
|
|||
// EDRAM buffer load/store root signature.
|
||||
ID3D12RootSignature* edram_load_store_root_signature_ = nullptr;
|
||||
struct EDRAMLoadStoreRootConstants {
|
||||
uint32_t base_tiles;
|
||||
uint32_t pitch_tiles;
|
||||
uint32_t rt_color_depth_pitch;
|
||||
uint32_t rt_stencil_offset_or_swap_red_blue;
|
||||
uint32_t rt_stencil_pitch;
|
||||
union {
|
||||
struct {
|
||||
uint32_t rt_color_depth_offset;
|
||||
uint32_t rt_color_depth_pitch;
|
||||
uint32_t rt_stencil_offset;
|
||||
uint32_t rt_stencil_pitch;
|
||||
};
|
||||
struct {
|
||||
// 16 bits for X, 16 bits for Y.
|
||||
uint32_t tile_sample_rect_tl;
|
||||
uint32_t tile_sample_rect_br;
|
||||
uint32_t tile_sample_dest_base;
|
||||
// 0:13 - destination pitch.
|
||||
// 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
|
||||
// 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
|
||||
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
|
||||
// 18:19 - destination endianness.
|
||||
// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
|
||||
// For 32 bits per pixel:
|
||||
// 20:24 - red/blue bit depth.
|
||||
// 25:29 - blue offset.
|
||||
// For 64 bits per pixel, it's 1 if bits 0:15 and 32:47 need to be swapped.
|
||||
uint32_t tile_sample_dest_info;
|
||||
};
|
||||
};
|
||||
// Base in the lower 11 bits, pitch above.
|
||||
uint32_t base_pitch_tiles;
|
||||
};
|
||||
// EDRAM buffer load/store pipelines.
|
||||
static const EDRAMLoadStoreModeInfo
|
||||
|
|
|
@ -8,16 +8,7 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
tile_dword_index.x *= 4u;
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||
if (xe_edram_swap_red_blue != 0u) {
|
||||
// Not a very long shift, just 16 or 20.
|
||||
uint blue_shift = xe_edram_swap_red_blue >> 16u;
|
||||
uint red_mask = xe_edram_swap_red_blue & 0xFFFFu;
|
||||
uint blue_mask = red_mask << blue_shift;
|
||||
pixels = (pixels & ~(red_mask | blue_mask)) |
|
||||
((pixels & red_mask) << blue_shift) |
|
||||
((pixels >> blue_shift) & red_mask);
|
||||
}
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
xe_edram_load_store_dest.Store4(rt_offset, pixels);
|
||||
}
|
||||
|
|
|
@ -13,12 +13,7 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
}
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||
if (xe_edram_swap_red_blue != 0u) {
|
||||
// The only 64-bit formats with a blue component are 16_16_16_16 and
|
||||
// 16_16_16_16_FLOAT.
|
||||
pixels = (pixels.yxwz & 0xFFFFu) | (pixels & 0xFFFF0000u);
|
||||
}
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
xe_edram_load_store_dest.Store4(rt_offset, pixels);
|
||||
}
|
||||
|
|
|
@ -14,11 +14,7 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint4 pixels_f16u32_packed =
|
||||
uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) |
|
||||
(uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u);
|
||||
if (xe_edram_swap_red_blue != 0u) {
|
||||
pixels_f16u32_packed = (pixels_f16u32_packed.yxwz & 0xFFFFu) |
|
||||
(pixels_f16u32_packed & 0xFFFF0000u);
|
||||
}
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed);
|
||||
}
|
||||
|
|
|
@ -19,13 +19,13 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint4 depth = depth24to32 + (depth32 - depth24to32) *
|
||||
uint4(XeFloat32To20e4(depth32) == depth24);
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
xe_edram_load_store_dest.Store4(rt_offset, depth);
|
||||
// Stencil.
|
||||
uint4 stencil = (depth24_stencil & 0xFFu) << uint4(0u, 8u, 16u, 24u);
|
||||
stencil.xy |= stencil.zw;
|
||||
stencil.x |= stencil.y;
|
||||
rt_offset = xe_edram_rt_stencil_offset +
|
||||
xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
|
||||
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
|
||||
xe_edram_rt_stencil_offset;
|
||||
xe_edram_load_store_dest.Store(rt_offset, stencil.x);
|
||||
}
|
||||
|
|
|
@ -10,13 +10,13 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||
// Depth.
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
xe_edram_load_store_dest.Store4(rt_offset, pixels >> 8u);
|
||||
// Stencil.
|
||||
uint4 stencil = (pixels & 0xFFu) << uint4(0u, 8u, 16u, 24u);
|
||||
stencil.xy |= stencil.zw;
|
||||
stencil.x |= stencil.y;
|
||||
rt_offset = xe_edram_rt_stencil_offset +
|
||||
xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
|
||||
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
|
||||
xe_edram_rt_stencil_offset;
|
||||
xe_edram_load_store_dest.Store(rt_offset, stencil.x);
|
||||
}
|
||||
|
|
|
@ -1,25 +1,43 @@
|
|||
#ifndef XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
|
||||
#define XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
|
||||
|
||||
// Root constants.
|
||||
cbuffer XeEDRAMLoadStoreConstants : register(b0) {
|
||||
uint xe_edram_base_tiles;
|
||||
uint xe_edram_pitch_tiles;
|
||||
uint xe_edram_rt_color_depth_pitch;
|
||||
uint xe_edram_rt_stencil_offset_or_swap_red_blue;
|
||||
uint xe_edram_rt_stencil_pitch;
|
||||
uint4 xe_edram_load_store_constants;
|
||||
// Base in the lower 11 bits, pitch in the upper part, in tiles.
|
||||
uint xe_edram_base_pitch_tiles;
|
||||
};
|
||||
#define xe_edram_rt_stencil_offset xe_edram_rt_stencil_offset_or_swap_red_blue
|
||||
// For loads only. How exactly it's handled depends on the specific load shader,
|
||||
// but 0 always means red and blue shouldn't be swapped.
|
||||
#define xe_edram_swap_red_blue xe_edram_rt_stencil_offset_or_swap_red_blue
|
||||
|
||||
// For loading and storing render targets.
|
||||
#define xe_edram_rt_color_depth_offset (xe_edram_load_store_constants.x)
|
||||
#define xe_edram_rt_color_depth_pitch (xe_edram_load_store_constants.y)
|
||||
#define xe_edram_rt_stencil_offset (xe_edram_load_store_constants.z)
|
||||
#define xe_edram_rt_stencil_pitch (xe_edram_load_store_constants.w)
|
||||
|
||||
// For single sample resolving.
|
||||
// Left/top of the copied region (relative to EDRAM base) in the lower 16 bits,
|
||||
// right/bottom in the upper.
|
||||
#define xe_edram_tile_sample_rect (xe_edram_load_store_constants.xy)
|
||||
#define xe_edram_tile_sample_dest_base (xe_edram_load_store_constants.z)
|
||||
// 0:13 - destination pitch.
|
||||
// 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
|
||||
// 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
|
||||
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
|
||||
// 18:19 - destination endianness.
|
||||
// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
|
||||
// For 32 bits per pixel:
|
||||
// 20:24 - red/blue bit depth.
|
||||
// 25:29 - blue offset.
|
||||
// For 64 bits per pixel, it's 1 if bits 0:15 and 32:47 need to be swapped.
|
||||
#define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)
|
||||
|
||||
ByteAddressBuffer xe_edram_load_store_source : register(t0);
|
||||
RWByteAddressBuffer xe_edram_load_store_dest : register(u0);
|
||||
|
||||
uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {
|
||||
return (xe_edram_base_tiles + (tile_index.y * xe_edram_pitch_tiles) +
|
||||
tile_index.x) * 5120u + tile_dword_index.y * 320u +
|
||||
tile_dword_index.x * 4u;
|
||||
return ((xe_edram_base_pitch_tiles & 2047u) +
|
||||
tile_index.y * (xe_edram_base_pitch_tiles >> 11u) + tile_index.x) *
|
||||
5120u + tile_dword_index.y * 320u + tile_dword_index.x * 4u;
|
||||
}
|
||||
|
||||
#endif // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
|
||||
|
|
|
@ -5,7 +5,7 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 4u;
|
||||
|
|
|
@ -5,7 +5,7 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
|
||||
// One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
|
||||
// from 1 render target row rather than 2. Threads with X 0-19 are for the
|
||||
|
|
|
@ -6,7 +6,7 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
uint4 pixels_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
|
||||
uint4 pixel_0_f16u32 = pixels_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
|
||||
uint4 pixel_1_f16u32 = pixels_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);
|
||||
|
|
|
@ -7,12 +7,12 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
// Depth.
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset);
|
||||
uint4 depth24_stencil = XeFloat32To20e4(depth32) << 8u;
|
||||
// Stencil.
|
||||
rt_offset = xe_edram_rt_stencil_offset +
|
||||
xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
|
||||
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
|
||||
xe_edram_rt_stencil_offset;
|
||||
depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
|
||||
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
|
|
|
@ -6,12 +6,12 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
// Depth.
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u;
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
uint4 pixels =
|
||||
(xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u;
|
||||
// Stencil.
|
||||
rt_offset = xe_edram_rt_stencil_offset +
|
||||
xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
|
||||
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
|
||||
xe_edram_rt_stencil_offset;
|
||||
pixels |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
|
||||
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
|
|
Loading…
Reference in New Issue