[D3D12] 64bpp raw resolve and EDRAM refactoring
This commit is contained in:
parent
5be78ab369
commit
c9ffe98d21
|
@ -41,6 +41,7 @@ namespace d3d12 {
|
|||
#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h"
|
||||
#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h"
|
||||
#include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_32bpp_cs.h"
|
||||
#include "xenia/gpu/d3d12/shaders/bin/edram_tile_sample_64bpp_cs.h"
|
||||
#include "xenia/gpu/d3d12/shaders/bin/resolve_ps.h"
|
||||
#include "xenia/gpu/d3d12/shaders/bin/resolve_vs.h"
|
||||
|
||||
|
@ -173,6 +174,16 @@ bool RenderTargetCache::Initialize() {
|
|||
return false;
|
||||
}
|
||||
edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp");
|
||||
// Tile single sample into a texture - 64 bits per pixel.
|
||||
edram_tile_sample_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
|
||||
device, edram_tile_sample_64bpp_cs, sizeof(edram_tile_sample_64bpp_cs),
|
||||
edram_load_store_root_signature_);
|
||||
if (edram_tile_sample_64bpp_pipeline_ == nullptr) {
|
||||
XELOGE("Failed to create the 64bpp EDRAM raw resolve pipeline");
|
||||
Shutdown();
|
||||
return false;
|
||||
}
|
||||
edram_tile_sample_64bpp_pipeline_->SetName(L"EDRAM Raw Resolve 64bpp");
|
||||
// Clear 32-bit color or unorm depth.
|
||||
edram_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
|
||||
device, edram_clear_32bpp_cs, sizeof(edram_clear_32bpp_cs),
|
||||
|
@ -258,6 +269,7 @@ void RenderTargetCache::Shutdown() {
|
|||
}
|
||||
resolve_pipelines_.clear();
|
||||
ui::d3d12::util::ReleaseAndNull(resolve_root_signature_);
|
||||
ui::d3d12::util::ReleaseAndNull(edram_tile_sample_64bpp_pipeline_);
|
||||
ui::d3d12::util::ReleaseAndNull(edram_tile_sample_32bpp_pipeline_);
|
||||
ui::d3d12::util::ReleaseAndNull(edram_clear_depth_float_pipeline_);
|
||||
ui::d3d12::util::ReleaseAndNull(edram_clear_32bpp_pipeline_);
|
||||
|
@ -977,9 +989,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
|
|||
|
||||
// Validate and clamp the source region, skip parts that don't need to be
|
||||
// copied and calculate the number of threads needed for copying/loading.
|
||||
uint32_t surface_pitch_tiles, row_tiles, rows;
|
||||
uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
|
||||
if (!GetEDRAMLayout(surface_pitch, msaa_samples, src_64bpp, edram_base,
|
||||
copy_rect, surface_pitch_tiles, row_tiles, rows)) {
|
||||
copy_rect, surface_pitch_tiles, row_width_ss_div_80,
|
||||
rows)) {
|
||||
// Nothing to copy.
|
||||
return true;
|
||||
}
|
||||
|
@ -1008,10 +1021,6 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
|
|||
// Raw copy
|
||||
// *************************************************************************
|
||||
XELOGGPU("Resolve: Copying using a compute shader");
|
||||
if (src_64bpp) {
|
||||
// TODO(Triang3l): 64bpp sample copy shader.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Make sure we have the memory to write to.
|
||||
if (!shared_memory->MakeTilesResident(dest_address, dest_size)) {
|
||||
|
@ -1079,10 +1088,11 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
|
|||
command_list->SetComputeRoot32BitConstants(
|
||||
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
|
||||
command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
|
||||
// TODO(Triang3l): 64bpp pipeline.
|
||||
command_processor_->SetComputePipeline(edram_tile_sample_32bpp_pipeline_);
|
||||
// 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region.
|
||||
uint32_t group_count_x = row_tiles, group_count_y = rows;
|
||||
command_processor_->SetComputePipeline(
|
||||
src_64bpp ? edram_tile_sample_64bpp_pipeline_
|
||||
: edram_tile_sample_32bpp_pipeline_);
|
||||
// 1 group per destination 80x16 region.
|
||||
uint32_t group_count_x = row_width_ss_div_80, group_count_y = rows;
|
||||
if (msaa_samples >= MsaaSamples::k2X) {
|
||||
group_count_y = (group_count_y + 1) >> 1;
|
||||
if (msaa_samples >= MsaaSamples::k4X) {
|
||||
|
@ -1121,7 +1131,7 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
|
|||
return false;
|
||||
}
|
||||
RenderTargetKey render_target_key;
|
||||
render_target_key.width_ss_div_80 = row_tiles >> (src_64bpp ? 1 : 0);
|
||||
render_target_key.width_ss_div_80 = row_width_ss_div_80;
|
||||
render_target_key.height_ss_div_16 = rows;
|
||||
render_target_key.is_depth = false;
|
||||
render_target_key.format = src_format;
|
||||
|
@ -1190,7 +1200,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
|
|||
|
||||
command_processor_->SetComputePipeline(
|
||||
edram_load_pipelines_[size_t(GetLoadStoreMode(false, src_format))]);
|
||||
command_list->Dispatch(row_tiles, rows, 1);
|
||||
// 1 group per 80x16 samples.
|
||||
command_list->Dispatch(row_width_ss_div_80, rows, 1);
|
||||
command_processor_->PushUAVBarrier(copy_buffer);
|
||||
|
||||
// Go to the next descriptor set.
|
||||
|
@ -1405,9 +1416,10 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base,
|
|||
bool is_64bpp =
|
||||
!is_depth && IsColorFormat64bpp(ColorRenderTargetFormat(format));
|
||||
D3D12_RECT clear_rect = rect;
|
||||
uint32_t surface_pitch_tiles, row_tiles, rows;
|
||||
uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
|
||||
if (!GetEDRAMLayout(surface_pitch, msaa_samples, is_64bpp, edram_base,
|
||||
clear_rect, surface_pitch_tiles, row_tiles, rows)) {
|
||||
clear_rect, surface_pitch_tiles, row_width_ss_div_80,
|
||||
rows)) {
|
||||
// Nothing to clear.
|
||||
return true;
|
||||
}
|
||||
|
@ -1475,7 +1487,8 @@ bool RenderTargetCache::ResolveClear(uint32_t edram_base,
|
|||
ui::d3d12::util::CreateRawBufferUAV(device, descriptor_cpu_start,
|
||||
edram_buffer_, kEDRAMBufferSize);
|
||||
command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
|
||||
command_list->Dispatch(row_tiles, rows, 1);
|
||||
// 1 group per 80x16 samples.
|
||||
command_list->Dispatch(row_width_ss_div_80, rows, 1);
|
||||
command_processor_->PushUAVBarrier(edram_buffer_);
|
||||
|
||||
return true;
|
||||
|
@ -1871,7 +1884,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
|
|||
bool RenderTargetCache::GetEDRAMLayout(
|
||||
uint32_t pitch_pixels, MsaaSamples msaa_samples, bool is_64bpp,
|
||||
uint32_t& base_in_out, D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
|
||||
uint32_t& row_tiles_out, uint32_t& rows_out) {
|
||||
uint32_t& row_width_ss_div_80_out, uint32_t& rows_out) {
|
||||
if (pitch_pixels == 0 || rect_in_out.right <= 0 || rect_in_out.bottom <= 0 ||
|
||||
rect_in_out.top >= rect_in_out.bottom) {
|
||||
return false;
|
||||
|
@ -1921,8 +1934,7 @@ bool RenderTargetCache::GetEDRAMLayout(
|
|||
base_in_out = base;
|
||||
rect_in_out = rect;
|
||||
pitch_tiles_out = pitch_tiles;
|
||||
row_tiles_out = (((rect.right << samples_x_log2) + 79) / 80)
|
||||
<< sample_size_log2;
|
||||
row_width_ss_div_80_out = ((rect.right << samples_x_log2) + 79) / 80;
|
||||
rows_out = rows;
|
||||
return true;
|
||||
}
|
||||
|
@ -2044,14 +2056,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
|||
const RenderTarget* render_target = binding.render_target;
|
||||
bool is_64bpp = false;
|
||||
|
||||
// Get the number of X thread groups.
|
||||
uint32_t rt_pitch_tiles = surface_pitch_tiles;
|
||||
if (!render_target->key.is_depth &&
|
||||
IsColorFormat64bpp(
|
||||
ColorRenderTargetFormat(render_target->key.format))) {
|
||||
rt_pitch_tiles *= 2;
|
||||
}
|
||||
|
||||
// Transition the copy buffer to copy destination.
|
||||
command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
|
||||
D3D12_RESOURCE_STATE_COPY_DEST);
|
||||
|
@ -2084,6 +2088,12 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
|||
root_constants.rt_stencil_pitch =
|
||||
location_dest.PlacedFootprint.Footprint.RowPitch;
|
||||
}
|
||||
uint32_t rt_pitch_tiles = surface_pitch_tiles;
|
||||
if (!render_target->key.is_depth &&
|
||||
IsColorFormat64bpp(
|
||||
ColorRenderTargetFormat(render_target->key.format))) {
|
||||
rt_pitch_tiles *= 2;
|
||||
}
|
||||
root_constants.base_pitch_tiles =
|
||||
binding.edram_base | (rt_pitch_tiles << 11);
|
||||
|
||||
|
@ -2101,7 +2111,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
|||
render_target->key.format);
|
||||
command_processor_->SetComputePipeline(
|
||||
edram_store_pipelines_[size_t(mode)]);
|
||||
command_list->Dispatch(rt_pitch_tiles, binding.edram_dirty_rows, 1);
|
||||
// 1 group per 80x16 samples.
|
||||
command_list->Dispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1);
|
||||
|
||||
// Commit the UAV write.
|
||||
command_processor_->PushUAVBarrier(edram_buffer_);
|
||||
|
@ -2178,7 +2189,7 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
|
|||
}
|
||||
const RenderTarget* render_target = render_targets[i];
|
||||
|
||||
// Get the number of X thread groups.
|
||||
// Get the number of EDRAM tiles per row.
|
||||
uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80;
|
||||
if (!render_target->key.is_depth &&
|
||||
IsColorFormat64bpp(
|
||||
|
@ -2218,7 +2229,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
|
|||
EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
|
||||
render_target->key.format);
|
||||
command_processor_->SetComputePipeline(edram_load_pipelines_[size_t(mode)]);
|
||||
command_list->Dispatch(edram_pitch_tiles, edram_rows, 1);
|
||||
// 1 group per 80x16 samples.
|
||||
command_list->Dispatch(render_target->key.width_ss_div_80, edram_rows, 1);
|
||||
|
||||
// Commit the UAV write and transition the copy buffer to copy source now.
|
||||
command_processor_->PushUAVBarrier(copy_buffer);
|
||||
|
|
|
@ -381,7 +381,8 @@ class RenderTargetCache {
|
|||
static bool GetEDRAMLayout(uint32_t pitch_pixels, MsaaSamples msaa_samples,
|
||||
bool is_64bpp, uint32_t& base_in_out,
|
||||
D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
|
||||
uint32_t& row_tiles_out, uint32_t& rows_out);
|
||||
uint32_t& row_width_ss_div_80_out,
|
||||
uint32_t& rows_out);
|
||||
|
||||
static EDRAMLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format);
|
||||
|
||||
|
@ -480,6 +481,7 @@ class RenderTargetCache {
|
|||
ID3D12PipelineState*
|
||||
edram_store_pipelines_[size_t(EDRAMLoadStoreMode::kCount)] = {};
|
||||
ID3D12PipelineState* edram_tile_sample_32bpp_pipeline_ = nullptr;
|
||||
ID3D12PipelineState* edram_tile_sample_64bpp_pipeline_ = nullptr;
|
||||
ID3D12PipelineState* edram_clear_32bpp_pipeline_ = nullptr;
|
||||
ID3D12PipelineState* edram_clear_depth_float_pipeline_ = nullptr;
|
||||
|
||||
|
|
|
@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
any(sample_index >= clear_rect.zw)) {
|
||||
return;
|
||||
}
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 2u;
|
||||
uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 2u;
|
||||
uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
|
||||
xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_color32);
|
||||
if (sample_index.x + 1u < clear_rect.z) {
|
||||
xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_color32);
|
||||
|
|
|
@ -15,11 +15,11 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
any(sample_index >= clear_rect.zw)) {
|
||||
return;
|
||||
}
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 2u;
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 2u;
|
||||
bool second_sample_inside = sample_index.x + 1u < clear_rect.z;
|
||||
// 24-bit depth.
|
||||
uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
|
||||
uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
|
||||
xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_depth24);
|
||||
[branch] if (second_sample_inside) {
|
||||
xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_depth24);
|
||||
|
|
|
@ -4,11 +4,11 @@
|
|||
void main(uint3 xe_group_id : SV_GroupID,
|
||||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 4u;
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 4u;
|
||||
uint4 samples = xe_edram_load_store_source.Load4(
|
||||
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
xe_edram_load_store_dest.Store4(rt_offset, pixels);
|
||||
xe_edram_load_store_dest.Store4(rt_offset, samples);
|
||||
}
|
||||
|
|
|
@ -1,19 +1,14 @@
|
|||
#include "edram_load_store.hlsli"
|
||||
|
||||
[numthreads(40, 8, 1)]
|
||||
[numthreads(40, 16, 1)]
|
||||
void main(uint3 xe_group_id : SV_GroupID,
|
||||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
// One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
|
||||
// from 1 render target row rather than 2. Threads with X 0-19 are for the
|
||||
// first row, with 20-39 are for the second.
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
|
||||
[flatten] if (xe_group_thread_id.x >= 20u) {
|
||||
tile_dword_index += uint2(uint(-80), 1u);
|
||||
}
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 2u;
|
||||
uint4 samples = xe_edram_load_store_source.Load4(
|
||||
XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index));
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
xe_edram_load_store_dest.Store4(rt_offset, pixels);
|
||||
xe_edram_load_store_dest.Store4(rt_offset, samples);
|
||||
}
|
||||
|
|
|
@ -5,16 +5,16 @@
|
|||
void main(uint3 xe_group_id : SV_GroupID,
|
||||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 2u;
|
||||
uint2 pixels_7e3_packed = xe_edram_load_store_source.Load2(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||
uint4 pixel_0_f16u32 = XeFloat7e3To16(pixels_7e3_packed.x);
|
||||
uint4 pixel_1_f16u32 = XeFloat7e3To16(pixels_7e3_packed.y);
|
||||
uint4 pixels_f16u32_packed =
|
||||
uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) |
|
||||
(uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u);
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 2u;
|
||||
uint2 samples_7e3_packed = xe_edram_load_store_source.Load2(
|
||||
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
|
||||
uint4 sample_0_f16u32 = XeFloat7e3To16(samples_7e3_packed.x);
|
||||
uint4 sample_1_f16u32 = XeFloat7e3To16(samples_7e3_packed.y);
|
||||
uint4 samples_f16u32_packed =
|
||||
uint4(sample_0_f16u32.xz, sample_1_f16u32.xz) |
|
||||
(uint4(sample_0_f16u32.yw, sample_1_f16u32.yw) << 16u);
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed);
|
||||
xe_edram_load_store_dest.Store4(rt_offset, samples_f16u32_packed);
|
||||
}
|
||||
|
|
|
@ -5,9 +5,9 @@
|
|||
void main(uint3 xe_group_id : SV_GroupID,
|
||||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 4u;
|
||||
uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 4u;
|
||||
uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
|
||||
uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset);
|
||||
uint4 depth24 = depth24_stencil >> 8u;
|
||||
uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset);
|
||||
|
|
|
@ -4,16 +4,16 @@
|
|||
void main(uint3 xe_group_id : SV_GroupID,
|
||||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 4u;
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 4u;
|
||||
uint4 samples = xe_edram_load_store_source.Load4(
|
||||
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index));
|
||||
// Depth.
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
xe_edram_load_store_dest.Store4(rt_offset, pixels >> 8u);
|
||||
xe_edram_load_store_dest.Store4(rt_offset, samples >> 8u);
|
||||
// Stencil.
|
||||
uint4 stencil = (pixels & 0xFFu) << uint4(0u, 8u, 16u, 24u);
|
||||
uint4 stencil = (samples & 0xFFu) << uint4(0u, 8u, 16u, 24u);
|
||||
stencil.xy |= stencil.zw;
|
||||
stencil.x |= stencil.y;
|
||||
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
|
||||
|
|
|
@ -25,10 +25,10 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
|
|||
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
|
||||
// 18:20 - destination endianness.
|
||||
// 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
|
||||
// For 32 bits per pixel:
|
||||
// For 32 bits per sample:
|
||||
// 21:25 - red/blue bit depth.
|
||||
// 26:30 - blue offset.
|
||||
// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
|
||||
// For 64 bits per sample, it's 1 if need to swap 0:15 and 32:47.
|
||||
#define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)
|
||||
|
||||
// For clearing.
|
||||
|
@ -45,10 +45,20 @@ ByteAddressBuffer xe_edram_load_store_source : register(t0);
|
|||
#endif
|
||||
RWByteAddressBuffer xe_edram_load_store_dest : register(u0);
|
||||
|
||||
uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {
|
||||
uint XeEDRAMOffset32bpp(uint2 tile_index, uint2 tile_sample_index) {
|
||||
return ((xe_edram_base_pitch_tiles & 2047u) +
|
||||
tile_index.y * (xe_edram_base_pitch_tiles >> 11u) + tile_index.x) *
|
||||
5120u + tile_dword_index.y * 320u + tile_dword_index.x * 4u;
|
||||
5120u + tile_sample_index.y * 320u + tile_sample_index.x * 4u;
|
||||
}
|
||||
|
||||
// Byte offset of a 64-bit sample in the EDRAM buffer.
//
// Unlike the 32bpp variant, this addresses pairs of consecutive tiles rather
// than individual tiles: the first tile of a pair holds the top 80x8 samples
// and the second one holds the bottom 80x8 samples.
//
// tile_pair_index        - XY index of the tile pair within the surface.
// tile_pair_sample_index - XY index of the sample within the tile pair.
uint XeEDRAMOffset64bpp(uint2 tile_pair_index, uint2 tile_pair_sample_index) {
  // xe_edram_base_pitch_tiles packs the base tile index in the low 11 bits and
  // the pitch in tiles in the bits above them.
  uint base_tile = xe_edram_base_pitch_tiles & 2047u;
  uint pitch_tiles = xe_edram_base_pitch_tiles >> 11u;
  // A pair spans two tiles horizontally, hence the doubled X index.
  uint first_tile_of_pair =
      base_tile + tile_pair_index.y * pitch_tiles + (tile_pair_index.x << 1u);
  // 5120 bytes per tile, 640 bytes per row of a pair, 8 bytes per sample.
  uint offset_in_pair =
      tile_pair_sample_index.y * 640u + tile_pair_sample_index.x * 8u;
  return first_tile_of_pair * 5120u + offset_in_pair;
}
|
||||
|
||||
#endif // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
|
||||
|
|
|
@ -6,9 +6,9 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 4u;
|
||||
uint4 samples = xe_edram_load_store_source.Load4(rt_offset);
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 4u;
|
||||
xe_edram_load_store_dest.Store4(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
|
||||
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples);
|
||||
}
|
||||
|
|
|
@ -1,19 +1,14 @@
|
|||
#include "edram_load_store.hlsli"
|
||||
|
||||
[numthreads(40, 8, 1)]
|
||||
[numthreads(40, 16, 1)]
|
||||
void main(uint3 xe_group_id : SV_GroupID,
|
||||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
|
||||
// One tile contains 80x8 texels, and 2 rows within a 80x16 tile contain data
|
||||
// from 1 render target row rather than 2. Threads with X 0-19 are for the
|
||||
// first row, with 20-39 are for the second.
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
|
||||
[flatten] if (xe_group_thread_id.x >= 20u) {
|
||||
tile_dword_index += uint2(uint(-80), 1u);
|
||||
}
|
||||
uint4 samples = xe_edram_load_store_source.Load4(rt_offset);
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 2u;
|
||||
xe_edram_load_store_dest.Store4(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
|
||||
XeEDRAMOffset64bpp(xe_group_id.xy, tile_sample_index), samples);
|
||||
}
|
||||
|
|
|
@ -7,13 +7,14 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
uint4 pixels_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
|
||||
uint4 pixel_0_f16u32 = pixels_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
|
||||
uint4 pixel_1_f16u32 = pixels_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);
|
||||
uint2 pixels_7e3_packed =
|
||||
uint2(XeFloat16To7e3(pixel_0_f16u32), XeFloat16To7e3(pixel_1_f16u32));
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 2u;
|
||||
uint4 samples_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset);
|
||||
uint4 sample_0_f16u32 = samples_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u);
|
||||
uint4 sample_1_f16u32 = samples_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u);
|
||||
uint2 samples_7e3_packed =
|
||||
uint2(XeFloat16To7e3(sample_0_f16u32), XeFloat16To7e3(sample_1_f16u32));
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 2u;
|
||||
xe_edram_load_store_dest.Store2(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels_7e3_packed);
|
||||
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index),
|
||||
samples_7e3_packed);
|
||||
}
|
||||
|
|
|
@ -15,9 +15,9 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
xe_edram_rt_stencil_offset;
|
||||
depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
|
||||
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 4u;
|
||||
uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 4u;
|
||||
uint edram_offset = XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index);
|
||||
// Store 24-bit depth for aliasing and checking if 32-bit depth is up to date.
|
||||
xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil);
|
||||
// Store 32-bit depth so precision isn't lost when doing multipass rendering.
|
||||
|
|
|
@ -7,15 +7,15 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
// Depth.
|
||||
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
|
||||
uint4 pixels =
|
||||
uint4 samples =
|
||||
(xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u;
|
||||
// Stencil.
|
||||
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
|
||||
xe_edram_rt_stencil_offset;
|
||||
pixels |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
|
||||
samples |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
|
||||
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
|
||||
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||
tile_dword_index.x *= 4u;
|
||||
uint2 tile_sample_index = xe_group_thread_id.xy;
|
||||
tile_sample_index.x *= 4u;
|
||||
xe_edram_load_store_dest.Store4(
|
||||
XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels);
|
||||
XeEDRAMOffset32bpp(xe_group_id.xy, tile_sample_index), samples);
|
||||
}
|
||||
|
|
|
@ -23,7 +23,7 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
(xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
|
||||
uint2 edram_tile_quarter =
|
||||
uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy;
|
||||
uint edram_offset = XeEDRAMOffset(
|
||||
uint edram_offset = XeEDRAMOffset32bpp(
|
||||
(xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
|
||||
(xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
|
||||
(sample_info.xy + uint2(2u, 0u)) + sample_info.zw);
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
#include "byte_swap.hlsli"
|
||||
#include "edram_load_store.hlsli"
|
||||
#include "texture_address.hlsli"
|
||||
|
||||
// Raw resolve of 64bpp EDRAM samples into a tiled destination texture.
//
// Each thread handles up to 4 horizontally adjacent destination texels
// (texel X = thread X * 4): it loads the selected sample of each of them from
// the EDRAM buffer, optionally swaps red and blue, applies the destination
// endianness and writes the texels at their tiled addresses.
[numthreads(20, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Check if not outside of the destination texture completely.
  // copy_rect.xy - left/top, copy_rect.zw - right/bottom (exclusive), unpacked
  // from the low and high 16 bits of the constants.
  uint4 copy_rect;
  copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu;
  copy_rect.yw = xe_edram_tile_sample_rect >> 16u;
  uint2 texel_index = xe_thread_id.xy;
  texel_index.x *= 4u;
  [branch] if (any(texel_index < copy_rect.xy) ||
               any(texel_index >= copy_rect.zw)) {
    return;
  }

  // Get the samples from the EDRAM buffer.
  // XY - log2(pixel size), ZW - selected sample offset.
  uint4 sample_info =
      (xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
  // With multisampling, one thread group covers more than one tile pair along
  // the multisampled axes - select the one this thread reads from.
  uint2 edram_tile_quarter =
      uint2(uint2(10u, 8u) <= xe_group_thread_id.xy) * sample_info.xy;
  // The shift must be parenthesized: `+` binds tighter than `<<`, so without
  // the parentheses the selected sample offset would be added to the shift
  // amount rather than to the sample index within the tile pair.
  uint edram_offset = XeEDRAMOffset64bpp(
      (xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
      ((xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
       (sample_info.xy + uint2(2u, 0u))) + sample_info.zw);
  // Loaded with the first 2 pixels at 1x and 2x, or the first 1 pixel at 4x.
  uint4 pixels_01 = xe_edram_load_store_source.Load4(edram_offset);
  // Loaded with the second 2 pixels at 1x and 2x, or the second 1 pixel at 4x.
  uint4 pixels_23 = xe_edram_load_store_source.Load4(edram_offset + 16u);
  [branch] if (sample_info.x != 0u) {
    // Rather than 4 pixels, at 4x, we only have 2 - in xy of each variable
    // rather than in xyzw of pixels_01. Combine and load 2 more.
    pixels_01.zw = pixels_23.xy;
    pixels_23.xy = xe_edram_load_store_source.Load2(edram_offset + 32u);
    pixels_23.zw = xe_edram_load_store_source.Load2(edram_offset + 48u);
  }

  if ((xe_edram_tile_sample_dest_info >> 21u) != 0u) {
    // Swap red and blue - all 64bpp formats where this is possible are
    // 16:16:16:16.
    // Each pixel is two dwords; exchange the low 16 bits of the two dwords
    // (bits 0:15 and 32:47 of the pixel), keeping the high halves in place.
    pixels_01 = (pixels_01 & 0xFFFF0000u) | (pixels_01.yxwz & 0xFFFFu);
    pixels_23 = (pixels_23 & 0xFFFF0000u) | (pixels_23.yxwz & 0xFFFFu);
  }

  // Tile the pixels to the shared memory.
  pixels_01 = XeByteSwap(pixels_01, xe_edram_tile_sample_dest_info >> 18u);
  pixels_23 = XeByteSwap(pixels_23, xe_edram_tile_sample_dest_info >> 18u);
  uint4 texel_addresses =
      xe_edram_tile_sample_dest_base +
      XeTextureTiledOffset2D(texel_index - copy_rect.xy,
                             xe_edram_tile_sample_dest_info & 16383u, 3u);
  // The first texel is known to be inside the rectangle (checked above); the
  // remaining three are written only while still left of the right edge.
  xe_edram_load_store_dest.Store2(texel_addresses.x, pixels_01.xy);
  bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z;
  [branch] if (texels_in_rect.x) {
    xe_edram_load_store_dest.Store2(texel_addresses.y, pixels_01.zw);
    [branch] if (texels_in_rect.y) {
      xe_edram_load_store_dest.Store2(texel_addresses.z, pixels_23.xy);
      [branch] if (texels_in_rect.z) {
        xe_edram_load_store_dest.Store2(texel_addresses.w, pixels_23.zw);
      }
    }
  }
}
|
Loading…
Reference in New Issue