[D3D12] 64bpp raw resolve and EDRAM refactoring

This commit is contained in:
Triang3l 2018-09-16 15:11:11 +03:00
parent 5be78ab369
commit c9ffe98d21
17 changed files with 188 additions and 106 deletions

View File

@ -41,6 +41,7 @@ namespace d3d12 {
#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/resolve_ps.h"
#include "xenia/gpu/d3d12/shaders/bin/resolve_vs.h"
@ -173,6 +174,16 @@ bool RenderTargetCache::Initialize() {
return false;
}
edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp");
// Tile single sample into a texture - 64 bits per pixel.
edram_tile_sample_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
device, edram_tile_sample_64bpp_cs, sizeof(edram_tile_sample_64bpp_cs),
edram_load_store_root_signature_);
if (edram_tile_sample_64bpp_pipeline_ == nullptr) {
XELOGE("Failed to create the 64bpp EDRAM raw resolve pipeline");
Shutdown();
return false;
}
edram_tile_sample_64bpp_pipeline_->SetName(L"EDRAM Raw Resolve 64bpp");
// Clear 32-bit color or unorm depth.
edram_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
device, edram_clear_32bpp_cs, sizeof(edram_clear_32bpp_cs),
@ -258,6 +269,7 @@ void RenderTargetCache::Shutdown() {
}
resolve_pipelines_.clear();
ui::d3d12::util::ReleaseAndNull(resolve_root_signature_);
ui::d3d12::util::ReleaseAndNull(edram_tile_sample_64bpp_pipeline_);
ui::d3d12::util::ReleaseAndNull(edram_tile_sample_32bpp_pipeline_);
ui::d3d12::util::ReleaseAndNull(edram_clear_depth_float_pipeline_);
ui::d3d12::util::ReleaseAndNull(edram_clear_32bpp_pipeline_);
@ -977,9 +989,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
// Validate and clamp the source region, skip parts that don't need to be
// copied and calculate the number of threads needed for copying/loading.
uint32_t surface_pitch_tiles, row_tiles, rows;
uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
if (!GetEDRAMLayout(surface_pitch, msaa_samples, src_64bpp, edram_base,
copy_rect, surface_pitch_tiles, row_tiles, rows)) {
copy_rect, surface_pitch_tiles, row_width_ss_div_80,
rows)) {
// Nothing to copy.
return true;
}
@ -1008,10 +1021,6 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
// Raw copy
// *************************************************************************
XELOGGPU("Resolve: Copying using a compute shader");
if (src_64bpp) {
// TODO(Triang3l): 64bpp sample copy shader.
return false;
}
// Make sure we have the memory to write to.
if (!shared_memory->MakeTilesResident(dest_address, dest_size)) {
@ -1079,10 +1088,11 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
command_list->SetComputeRoot32BitConstants(
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
// TODO(Triang3l): 64bpp pipeline.
command_processor_->SetComputePipeline(edram_tile_sample_32bpp_pipeline_);
// 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region.
uint32_t group_count_x = row_tiles, group_count_y = rows;
command_processor_->SetComputePipeline(
src_64bpp ? edram_tile_sample_64bpp_pipeline_
: edram_tile_sample_32bpp_pipeline_);
// 1 group per destination 80x16 region.
uint32_t group_count_x = row_width_ss_div_80, group_count_y = rows;
if (msaa_samples >= MsaaSamples::k2X) {
group_count_y = (group_count_y + 1) >> 1;
if (msaa_samples >= MsaaSamples::k4X) {
@ -1121,7 +1131,7 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
return false;
}
RenderTargetKey render_target_key;
render_target_key.width_ss_div_80 = row_tiles >> (src_64bpp ? 1 : 0);
render_target_key.width_ss_div_80 = row_width_ss_div_80;
render_target_key.height_ss_div_16 = rows;
render_target_key.is_depth = false;
render_target_key.format = src_format;
@ -1190,7 +1200,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
command_processor_->SetComputePipeline(
edram_load_pipelines_[size_t(GetLoadStoreMode(false, src_format))]);
command_list->Dispatch(row_tiles, rows, 1);
// 1 group per 80x16 samples.
command_list->Dispatch(row_width_ss_div_80, rows, 1);
command_processor_->PushUAVBarrier(copy_buffer);
// Go to the next descriptor set.
@ -1405,9 +1416,10 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base,
bool is_64bpp =
!is_depth && IsColorFormat64bpp(ColorRenderTargetFormat(format));
D3D12_RECT clear_rect = rect;
uint32_t surface_pitch_tiles, row_tiles, rows;
uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
if (!GetEDRAMLayout(surface_pitch, msaa_samples, is_64bpp, edram_base,
clear_rect, surface_pitch_tiles, row_tiles, rows)) {
clear_rect, surface_pitch_tiles, row_width_ss_div_80,
rows)) {
// Nothing to clear.
return true;
}
@ -1475,7 +1487,8 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base,
ui::d3d12::util::CreateRawBufferUAV(device, descriptor_cpu_start,
edram_buffer_, kEDRAMBufferSize);
command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
command_list->Dispatch(row_tiles, rows, 1);
// 1 group per 80x16 samples.
command_list->Dispatch(row_width_ss_div_80, rows, 1);
command_processor_->PushUAVBarrier(edram_buffer_);
return true;
@ -1871,7 +1884,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
bool RenderTargetCache::GetEDRAMLayout(
uint32_t pitch_pixels, MsaaSamples msaa_samples, bool is_64bpp,
uint32_t& base_in_out, D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
uint32_t& row_tiles_out, uint32_t& rows_out) {
uint32_t& row_width_ss_div_80_out, uint32_t& rows_out) {
if (pitch_pixels == 0 || rect_in_out.right <= 0 || rect_in_out.bottom <= 0 ||
rect_in_out.top >= rect_in_out.bottom) {
return false;
@ -1921,8 +1934,7 @@ bool RenderTargetCache::GetEDRAMLayout(
base_in_out = base;
rect_in_out = rect;
pitch_tiles_out = pitch_tiles;
row_tiles_out = (((rect.right << samples_x_log2) + 79) / 80)
<< sample_size_log2;
row_width_ss_div_80_out = ((rect.right << samples_x_log2) + 79) / 80;
rows_out = rows;
return true;
}
@ -2044,14 +2056,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
const RenderTarget* render_target = binding.render_target;
bool is_64bpp = false;
// Get the number of X thread groups.
uint32_t rt_pitch_tiles = surface_pitch_tiles;
if (!render_target->key.is_depth &&
IsColorFormat64bpp(
ColorRenderTargetFormat(render_target->key.format))) {
rt_pitch_tiles *= 2;
}
// Transition the copy buffer to copy destination.
command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
D3D12_RESOURCE_STATE_COPY_DEST);
@ -2084,6 +2088,12 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
root_constants.rt_stencil_pitch =
location_dest.PlacedFootprint.Footprint.RowPitch;
}
uint32_t rt_pitch_tiles = surface_pitch_tiles;
if (!render_target->key.is_depth &&
IsColorFormat64bpp(
ColorRenderTargetFormat(render_target->key.format))) {
rt_pitch_tiles *= 2;
}
root_constants.base_pitch_tiles =
binding.edram_base | (rt_pitch_tiles << 11);
@ -2101,7 +2111,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
render_target->key.format);
command_processor_->SetComputePipeline(
edram_store_pipelines_[size_t(mode)]);
command_list->Dispatch(rt_pitch_tiles, binding.edram_dirty_rows, 1);
// 1 group per 80x16 samples.
command_list->Dispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1);
// Commit the UAV write.
command_processor_->PushUAVBarrier(edram_buffer_);
@ -2178,7 +2189,7 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
}
const RenderTarget* render_target = render_targets[i];
// Get the number of X thread groups.
// Get the number of EDRAM tiles per row.
uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80;
if (!render_target->key.is_depth &&
IsColorFormat64bpp(
@ -2218,7 +2229,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
render_target->key.format);
command_processor_->SetComputePipeline(edram_load_pipelines_[size_t(mode)]);
command_list->Dispatch(edram_pitch_tiles, edram_rows, 1);
// 1 group per 80x16 samples.
command_list->Dispatch(render_target->key.width_ss_div_80, edram_rows, 1);
// Commit the UAV write and transition the copy buffer to copy source now.
command_processor_->PushUAVBarrier(copy_buffer);

View File

@ -381,7 +381,8 @@ class RenderTargetCache {
static bool GetEDRAMLayout(uint32_t pitch_pixels, MsaaSamples msaa_samples,
bool is_64bpp, uint32_t& base_in_out,
D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
uint32_t& row_tiles_out, uint32_t& rows_out);
uint32_t& row_width_ss_div_80_out,
uint32_t& rows_out);
static EDRAMLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format);
@ -480,6 +481,7 @@ class RenderTargetCache {
ID3D12PipelineState*
edram_store_pipelines_[size_t(EDRAMLoadStoreMode::kCount)] = {};
ID3D12PipelineState* edram_tile_sample_32bpp_pipeline_ = nullptr;
ID3D12PipelineState* edram_tile_sample_64bpp_pipeline_ = nullptr;
ID3D12PipelineState* edram_clear_32bpp_pipeline_ = nullptr;
ID3D12PipelineState* edram_clear_depth_float_pipeline_ = nullptr;

View File

@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID,
any(sample_index >= clear_rect.zw)) {
return;
}
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 2u;
uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 2u;
uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_color32);
if (sample_index.x + 1u < clear_rect.z) {
xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_color32);

View File

@ -15,11 +15,11 @@ void main(uint3 xe_group_id : SV_GroupID,
any(sample_index >= clear_rect.zw)) {
return;
}
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 2u;
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 2u;
bool second_sample_inside = sample_index.x + 1u < clear_rect.z;
// 24-bit depth.
uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_depth24);
[branch] if (second_sample_inside) {
xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_depth24);

View File

@ -4,11 +4,11 @@
void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 4u;
uint4 pixels = xe_edram_load_store_source.Load4(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
uint4 samples = xe_edram_load_store_source.Load4(
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, pixels);
xe_edram_load_store_dest.Store4(rt_offset, samples);
}

View File

@ -1,19 +1,14 @@
#include "edram_load_store.hlsli"
[numthreads(40, 8, 1)]
[numthreads(40, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
// One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
// from 1 render target row rather than 1. Threads with X 0-19 are for the
// first row, with 20-39 are for the second.
uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
[flatten] if (xe_group_thread_id.x >= 20u) {
tile_dword_index += uint2(uint(-80), 1u);
}
uint4 pixels = xe_edram_load_store_source.Load4(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 2u;
uint4 samples = xe_edram_load_store_source.Load4(
XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index));
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, pixels);
xe_edram_load_store_dest.Store4(rt_offset, samples);
}

View File

@ -5,16 +5,16 @@
void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 2u;
uint2 pixels_7e3_packed = xe_edram_load_store_source.Load2(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
uint4 pixel_0_f16u32 = XeFloat7e3To16(pixels_7e3_packed.x);
uint4 pixel_1_f16u32 = XeFloat7e3To16(pixels_7e3_packed.y);
uint4 pixels_f16u32_packed =
uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) |
(uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u);
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 2u;
uint2 samples_7e3_packed = xe_edram_load_store_source.Load2(
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
uint4 sample_0_f16u32 = XeFloat7e3To16(samples_7e3_packed.x);
uint4 sample_1_f16u32 = XeFloat7e3To16(samples_7e3_packed.y);
uint4 samples_f16u32_packed =
uint4(sample_0_f16u32.xz, sample_1_f16u32.xz) |
(uint4(sample_0_f16u32.yw, sample_1_f16u32.yw) << 16u);
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed);
xe_edram_load_store_dest.Store4(rt_offset, samples_f16u32_packed);
}

View File

@ -5,9 +5,9 @@
void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 4u;
uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset);
uint4 depth24 = depth24_stencil >> 8u;
uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset);

View File

@ -4,16 +4,16 @@
void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 4u;
uint4 pixels = xe_edram_load_store_source.Load4(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
uint4 samples = xe_edram_load_store_source.Load4(
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
// Depth.
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, pixels >> 8u);
xe_edram_load_store_dest.Store4(rt_offset, samples >> 8u);
// Stencil.
uint4 stencil = (pixels & 0xFFu) << uint4(0u, 8u, 16u, 24u);
uint4 stencil = (samples & 0xFFu) << uint4(0u, 8u, 16u, 24u);
stencil.xy |= stencil.zw;
stencil.x |= stencil.y;
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +

View File

@ -25,10 +25,10 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
// 18:20 - destination endianness.
// 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
// For 32 bits per pixel:
// For 32 bits per sample:
// 21:25 - red/blue bit depth.
// 26:30 - blue offset.
// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
// For 64 bits per sample, it's 1 if need to swap 0:15 and 32:47.
#define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)
// For clearing.
@ -45,10 +45,20 @@ ByteAddressBuffer xe_edram_load_store_source : register(t0);
#endif
RWByteAddressBuffer xe_edram_load_store_dest : register(u0);
uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {
uint XeEDRAMOffset32bpp(uint2 tile_index, uint2 tile_sample_index) {
return ((xe_edram_base_pitch_tiles & 2047u) +
tile_index.y * (xe_edram_base_pitch_tiles >> 11u) + tile_index.x) *
5120u + tile_dword_index.y * 320u + tile_dword_index.x * 4u;
5120u + tile_sample_index.y * 320u + tile_sample_index.x * 4u;
}
// Byte offset of a 64bpp sample in the EDRAM buffer.
// Unlike the 32bpp variant, the addressing unit here is a pair of consecutive
// tiles: the first tile of the pair holds the top 80x8 samples and the second
// one holds the bottom 80x8 samples, so together they form an 80x16 region.
// tile_pair_index - XY index of the tile pair within the surface.
// tile_pair_sample_index - XY index of the sample within the 80x16 pair.
uint XeEDRAMOffset64bpp(uint2 tile_pair_index, uint2 tile_pair_sample_index) {
  // Lower 11 bits are the base tile, the rest is the row pitch in tiles.
  uint base_tile = xe_edram_base_pitch_tiles & 2047u;
  uint pitch_tiles = xe_edram_base_pitch_tiles >> 11u;
  // Each pair occupies two tiles horizontally, hence the doubled X index.
  uint first_tile_of_pair =
      base_tile + tile_pair_index.y * pitch_tiles + tile_pair_index.x * 2u;
  // 8 bytes per sample, 80 samples (640 bytes) per row; rows 8 and above land
  // in the second tile of the pair because one tile is 5120 bytes.
  uint offset_in_pair =
      tile_pair_sample_index.y * 640u + tile_pair_sample_index.x * 8u;
  return first_tile_of_pair * 5120u + offset_in_pair;
}
#endif // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_

View File

@ -6,9 +6,9 @@ void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 4u;
uint4 samples = xe_edram_load_store_source.Load4(rt_offset);
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
xe_edram_load_store_dest.Store4(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples);
}

View File

@ -1,19 +1,14 @@
#include "edram_load_store.hlsli"
[numthreads(40, 8, 1)]
[numthreads(40, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
// One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
// from 1 render target row rather than 1. Threads with X 0-19 are for the
// first row, with 20-39 are for the second.
uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
[flatten] if (xe_group_thread_id.x >= 20u) {
tile_dword_index += uint2(uint(-80), 1u);
}
uint4 samples = xe_edram_load_store_source.Load4(rt_offset);
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 2u;
xe_edram_load_store_dest.Store4(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index), samples);
}

View File

@ -7,13 +7,14 @@ void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 pixels_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
uint4 pixel_0_f16u32 = pixels_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
uint4 pixel_1_f16u32 = pixels_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);
uint2 pixels_7e3_packed =
uint2(XeFloat16To7e3(pixel_0_f16u32), XeFloat16To7e3(pixel_1_f16u32));
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 2u;
uint4 samples_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
uint4 sample_0_f16u32 = samples_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
uint4 sample_1_f16u32 = samples_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);
uint2 samples_7e3_packed =
uint2(XeFloat16To7e3(sample_0_f16u32), XeFloat16To7e3(sample_1_f16u32));
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 2u;
xe_edram_load_store_dest.Store2(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels_7e3_packed);
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index),
samples_7e3_packed);
}

View File

@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID,
xe_edram_rt_stencil_offset;
depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 4u;
uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
// Store 24-bit depth for aliasing and checking if 32-bit depth is up to date.
xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil);
// Store 32-bit depth so precision isn't lost when doing multipass rendering.

View File

@ -7,15 +7,15 @@ void main(uint3 xe_group_id : SV_GroupID,
// Depth.
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 pixels =
uint4 samples =
(xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u;
// Stencil.
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
xe_edram_rt_stencil_offset;
pixels |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
samples |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
uint2 tile_dword_index = xe_group_thread_id.xy;
tile_dword_index.x *= 4u;
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
xe_edram_load_store_dest.Store4(
XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples);
}

View File

@ -23,7 +23,7 @@ void main(uint3 xe_group_id : SV_GroupID,
(xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
uint2 edram_tile_quarter =
uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy;
uint edram_offset = XeEDRAMOffset(
uint edram_offset = XeEDRAMOffset32bpp(
(xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
(xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
(sample_info.xy + uint2(2u, 0u)) + sample_info.zw);

View File

@ -0,0 +1,67 @@
#include "byte_swap.hlsli"
#include "edram_load_store.hlsli"
#include "texture_address.hlsli"
// Raw resolve of a 64bpp EDRAM surface: picks one sample per pixel from the
// EDRAM buffer and writes the pixels, tiled, to the destination buffer.
// Each thread handles 4 horizontally adjacent destination texels, so one
// 20x16 group covers an 80x16 texel region of the destination.
[numthreads(20, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Check if not outside of the destination texture completely.
  // The rectangle is packed as 16-bit halves: the lower bounds end up in
  // copy_rect.xy and the upper (exclusive) bounds in copy_rect.zw.
  uint4 copy_rect;
  copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu;
  copy_rect.yw = xe_edram_tile_sample_rect >> 16u;
  uint2 texel_index = xe_thread_id.xy;
  texel_index.x *= 4u;
  [branch] if (any(texel_index < copy_rect.xy) ||
               any(texel_index >= copy_rect.zw)) {
    return;
  }

  // Get the samples from the EDRAM buffer.
  // XY - log2(pixel size), ZW - selected sample offset.
  uint4 sample_info =
      (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
  // With multisampling, a group spans more than one tile pair along the
  // multisampled axes - select the pair this thread reads from.
  uint2 edram_tile_quarter =
      uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy;
  // The shift must be parenthesized: + binds tighter than <<, so without the
  // parentheses the selected sample offset would be added to the shift amount
  // rather than to the sample index within the tile pair.
  uint edram_offset = XeEDRAMOffset64bpp(
      (xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
      ((xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
       (sample_info.xy + uint2(2u, 0u))) + sample_info.zw);
  // Loaded with the first 2 pixels at 1x and 2x, or the first 1 pixel at 4x.
  uint4 pixels_01 = xe_edram_load_store_source.Load4(edram_offset);
  // Loaded with the second 2 pixels at 1x and 2x, or the second 1 pixel at 4x.
  uint4 pixels_23 = xe_edram_load_store_source.Load4(edram_offset + 16u);
  [branch] if (sample_info.x != 0u) {
    // Rather than 4 pixels, at 4x, we only have 2 - in xy of each variable
    // rather than in xyzw of pixels_01. Combine and load 2 more.
    pixels_01.zw = pixels_23.xy;
    pixels_23.xy = xe_edram_load_store_source.Load2(edram_offset + 32u);
    pixels_23.zw = xe_edram_load_store_source.Load2(edram_offset + 48u);
  }

  if ((xe_edram_tile_sample_dest_info >> 21u) != 0u) {
    // Swap red and blue - all 64bpp formats where this is possible are
    // 16:16:16:16. This exchanges the low 16 bits of the two dwords of each
    // pixel and keeps the high 16 bits of both in place.
    pixels_01 = (pixels_01 & 0xFFFF0000u) | (pixels_01.yxwz & 0xFFFFu);
    pixels_23 = (pixels_23 & 0xFFFF0000u) | (pixels_23.yxwz & 0xFFFFu);
  }

  // Tile the pixels to the shared memory.
  // NOTE(review): XeByteSwap is presumably masking the endianness bits from
  // the shifted value itself - confirm in byte_swap.hlsli.
  pixels_01 = XeByteSwap(pixels_01, xe_edram_tile_sample_dest_info >> 18u);
  pixels_23 = XeByteSwap(pixels_23, xe_edram_tile_sample_dest_info >> 18u);
  // NOTE(review): the 3u argument looks like log2 of the 8-byte texel size,
  // and the 4 results like the offsets of 4 consecutive texels - confirm in
  // texture_address.hlsli.
  uint4 texel_addresses =
      xe_edram_tile_sample_dest_base +
      XeTextureTiledOffset2D(texel_index - copy_rect.xy,
                             xe_edram_tile_sample_dest_info & 16383u, 3u);
  // The first texel is known to be inside the rectangle (checked above); the
  // remaining three are written only while still left of the right edge.
  xe_edram_load_store_dest.Store2(texel_addresses.x, pixels_01.xy);
  bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z;
  [branch] if (texels_in_rect.x) {
    xe_edram_load_store_dest.Store2(texel_addresses.y, pixels_01.zw);
    [branch] if (texels_in_rect.y) {
      xe_edram_load_store_dest.Store2(texel_addresses.z, pixels_23.xy);
      [branch] if (texels_in_rect.z) {
        xe_edram_load_store_dest.Store2(texel_addresses.w, pixels_23.zw);
      }
    }
  }
}