[D3D12] 32bpp and 20e4 clearing in resolves

This commit is contained in:
Triang3l 2018-08-25 17:03:06 +03:00
parent 66510b2e6f
commit 50470d67a8
5 changed files with 265 additions and 4 deletions

View File

@ -27,6 +27,8 @@ namespace gpu {
namespace d3d12 {
// Generated with `xb buildhlsl`.
#include "xenia/gpu/d3d12/shaders/bin/edram_clear_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/edram_clear_depth_float_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_7e3_cs.h"
@ -150,6 +152,7 @@ bool RenderTargetCache::Initialize() {
}
if (load_store_root_error_blob != nullptr) {
load_store_root_error_blob->Release();
load_store_root_error_blob = nullptr;
}
if (FAILED(device->CreateRootSignature(
0, load_store_root_blob->GetBufferPointer(),
@ -162,6 +165,36 @@ bool RenderTargetCache::Initialize() {
}
load_store_root_blob->Release();
// Create the clear root signature (the same, but with the UAV only).
load_store_root_parameters[1].DescriptorTable.NumDescriptorRanges = 1;
++load_store_root_parameters[1].DescriptorTable.pDescriptorRanges;
if (FAILED(D3D12SerializeRootSignature(
&load_store_root_desc, D3D_ROOT_SIGNATURE_VERSION_1,
&load_store_root_blob, &load_store_root_error_blob))) {
XELOGE("Failed to serialize the EDRAM buffer clear root signature");
if (load_store_root_error_blob != nullptr) {
XELOGE("%s", reinterpret_cast<const char*>(
load_store_root_error_blob->GetBufferPointer()));
load_store_root_error_blob->Release();
}
Shutdown();
return false;
}
if (load_store_root_error_blob != nullptr) {
load_store_root_error_blob->Release();
load_store_root_error_blob = nullptr;
}
if (FAILED(device->CreateRootSignature(
0, load_store_root_blob->GetBufferPointer(),
load_store_root_blob->GetBufferSize(),
IID_PPV_ARGS(&edram_clear_root_signature_)))) {
XELOGE("Failed to create the EDRAM buffer clear root signature");
load_store_root_blob->Release();
Shutdown();
return false;
}
load_store_root_blob->Release();
// Create the load/store pipelines.
D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_desc;
pipeline_desc.pRootSignature = edram_load_store_root_signature_;
@ -203,6 +236,29 @@ bool RenderTargetCache::Initialize() {
}
edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp");
// Create the clear pipelines.
pipeline_desc.pRootSignature = edram_clear_root_signature_;
// 32-bit color or unorm depth.
pipeline_desc.CS.pShaderBytecode = edram_clear_32bpp_cs;
pipeline_desc.CS.BytecodeLength = sizeof(edram_clear_32bpp_cs);
if (FAILED(device->CreateComputePipelineState(
&pipeline_desc, IID_PPV_ARGS(&edram_clear_32bpp_pipeline_)))) {
XELOGE("Failed to create the EDRAM 32bpp clear pipeline");
Shutdown();
return false;
}
edram_clear_32bpp_pipeline_->SetName(L"EDRAM Clear 32bpp");
// Float depth.
pipeline_desc.CS.pShaderBytecode = edram_clear_depth_float_cs;
pipeline_desc.CS.BytecodeLength = sizeof(edram_clear_depth_float_cs);
if (FAILED(device->CreateComputePipelineState(
&pipeline_desc, IID_PPV_ARGS(&edram_clear_depth_float_pipeline_)))) {
XELOGE("Failed to create the EDRAM float depth clear pipeline");
Shutdown();
return false;
}
edram_clear_depth_float_pipeline_->SetName(L"EDRAM Clear Float Depth");
// Create the converting resolve root signature.
D3D12_ROOT_PARAMETER resolve_root_parameters[2];
// Parameter 0 is constants.
@ -295,6 +351,14 @@ void RenderTargetCache::Shutdown() {
edram_tile_sample_32bpp_pipeline_->Release();
edram_tile_sample_32bpp_pipeline_ = nullptr;
}
if (edram_clear_depth_float_pipeline_ != nullptr) {
edram_clear_depth_float_pipeline_->Release();
edram_clear_depth_float_pipeline_ = nullptr;
}
if (edram_clear_32bpp_pipeline_ != nullptr) {
edram_clear_32bpp_pipeline_->Release();
edram_clear_32bpp_pipeline_ = nullptr;
}
for (uint32_t i = 0; i < uint32_t(EDRAMLoadStoreMode::kCount); ++i) {
if (edram_load_pipelines_[i] != nullptr) {
edram_load_pipelines_[i]->Release();
@ -305,6 +369,10 @@ void RenderTargetCache::Shutdown() {
edram_store_pipelines_[i] = nullptr;
}
}
if (edram_clear_root_signature_ != nullptr) {
edram_clear_root_signature_->Release();
edram_clear_root_signature_ = nullptr;
}
if (edram_load_store_root_signature_ != nullptr) {
edram_load_store_root_signature_->Release();
edram_load_store_root_signature_ = nullptr;
@ -924,8 +992,9 @@ bool RenderTargetCache::Resolve(SharedMemory* shared_memory,
bool copied = ResolveCopy(shared_memory, texture_cache, surface_edram_base,
surface_pitch, msaa_samples, surface_is_depth,
surface_format, src_rect);
// TODO(Triang3l): Clear.
return copied;
bool cleared = ResolveClear(surface_edram_base, surface_pitch, msaa_samples,
surface_is_depth, surface_format, src_rect);
return copied || cleared;
}
bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
@ -1459,6 +1528,106 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
return true;
}
bool RenderTargetCache::ResolveClear(uint32_t edram_base,
uint32_t surface_pitch,
MsaaSamples msaa_samples, bool is_depth,
uint32_t format, const D3D12_RECT& rect) {
auto& regs = *register_file_;
// Check if clearing is enabled.
uint32_t rb_copy_control = regs[XE_GPU_REG_RB_COPY_CONTROL].u32;
if (!(rb_copy_control & (is_depth ? (1 << 9) : (1 << 8)))) {
return true;
}
// Calculate the layout.
bool is_64bpp =
!is_depth && IsColorFormat64bpp(ColorRenderTargetFormat(format));
D3D12_RECT clear_rect = rect;
uint32_t surface_pitch_tiles, row_tiles, rows;
if (!GetEDRAMLayout(surface_pitch, msaa_samples, is_64bpp, edram_base,
clear_rect, surface_pitch_tiles, row_tiles, rows)) {
// Nothing to clear.
return true;
}
uint32_t samples_x_log2 = msaa_samples >= MsaaSamples::k4X ? 1 : 0;
uint32_t samples_y_log2 = msaa_samples >= MsaaSamples::k2X ? 1 : 0;
// Get everything needed for clearing.
auto command_list = command_processor_->GetCurrentCommandList();
if (command_list == nullptr) {
return false;
}
auto device =
command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
if (command_processor_->RequestViewDescriptors(0, 1, 1, descriptor_cpu_start,
descriptor_gpu_start) == 0) {
return false;
}
// Submit the clear.
command_processor_->PushTransitionBarrier(
edram_buffer_, edram_buffer_state_,
D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
command_processor_->SubmitBarriers();
EDRAMLoadStoreRootConstants root_constants;
root_constants.clear_rect_lt = (clear_rect.left << samples_x_log2) |
(clear_rect.top << (16 + samples_y_log2));
root_constants.clear_rect_rb = (clear_rect.right << samples_x_log2) |
(clear_rect.bottom << (16 + samples_y_log2));
root_constants.base_pitch_tiles = edram_base | (surface_pitch_tiles << 11);
if (is_depth &&
DepthRenderTargetFormat(format) == DepthRenderTargetFormat::kD24FS8) {
root_constants.clear_depth24 = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
// 20e4 [0,2), based on CFloat24 from d3dref9.dll and on 6e4 in DirectXTex.
uint32_t depth24 = root_constants.clear_depth24 >> 8;
if (depth24 == 0) {
root_constants.clear_depth32 = 0;
} else {
uint32_t mantissa = depth24 & 0xFFFFFu, exponent = depth24 >> 20;
if (exponent == 0) {
// Normalize the value in the resulting float.
// do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0)
uint32_t mantissa_lzcnt = xe::lzcnt(mantissa) - (32u - 21u);
exponent = 1u - mantissa_lzcnt;
mantissa = (mantissa << mantissa_lzcnt) & 0xFFFFFu;
}
root_constants.clear_depth32 =
((exponent + 112u) << 23) | (mantissa << 3);
}
command_processor_->SetComputePipeline(edram_clear_depth_float_pipeline_);
} else if (is_64bpp) {
// TODO(Triang3l): 64bpp color clear.
return false;
} else {
Register reg =
is_depth ? XE_GPU_REG_RB_DEPTH_CLEAR : XE_GPU_REG_RB_COLOR_CLEAR;
root_constants.clear_color_high = regs[reg].u32;
command_processor_->SetComputePipeline(edram_clear_32bpp_pipeline_);
}
command_list->SetComputeRootSignature(edram_clear_root_signature_);
command_list->SetComputeRoot32BitConstants(
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc;
uav_desc.Format = DXGI_FORMAT_R32_TYPELESS;
uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
uav_desc.Buffer.FirstElement = 0;
uav_desc.Buffer.NumElements = 2 * 2048 * 1280;
uav_desc.Buffer.StructureByteStride = 0;
uav_desc.Buffer.CounterOffsetInBytes = 0;
uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
device->CreateUnorderedAccessView(edram_buffer_, nullptr, &uav_desc,
descriptor_cpu_start);
command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
command_list->Dispatch(row_tiles, rows, 1);
command_processor_->PushUAVBarrier(edram_buffer_);
return true;
}
ID3D12PipelineState* RenderTargetCache::GetResolvePipeline(
DXGI_FORMAT dest_format) {
// Try to find an existing pipeline.

View File

@ -399,6 +399,10 @@ class RenderTargetCache {
uint32_t edram_base, uint32_t surface_pitch,
MsaaSamples msaa_samples, bool is_depth, uint32_t src_format,
const D3D12_RECT& src_rect);
// Performs the clearing part of a resolve.
bool ResolveClear(uint32_t edram_base, uint32_t surface_pitch,
MsaaSamples msaa_samples, bool is_depth, uint32_t format,
const D3D12_RECT& rect);
ID3D12PipelineState* GetResolvePipeline(DXGI_FORMAT dest_format);
// Returns any available resolve target placed at least at
@ -416,8 +420,9 @@ class RenderTargetCache {
D3D12_RESOURCE_STATES edram_buffer_state_;
bool edram_buffer_cleared_;
// EDRAM buffer load/store root signature.
// EDRAM root signatures.
ID3D12RootSignature* edram_load_store_root_signature_ = nullptr;
ID3D12RootSignature* edram_clear_root_signature_ = nullptr;
struct EDRAMLoadStoreRootConstants {
union {
struct {
@ -443,11 +448,26 @@ class RenderTargetCache {
// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
uint32_t tile_sample_dest_info;
};
struct {
// 16 bits for X, 16 bits for Y.
uint32_t clear_rect_lt;
uint32_t clear_rect_rb;
union {
struct {
uint32_t clear_color_high;
uint32_t clear_color_low;
};
struct {
uint32_t clear_depth24;
uint32_t clear_depth32;
};
};
};
};
// Base in the lower 11 bits, pitch above.
uint32_t base_pitch_tiles;
};
// EDRAM buffer load/store pipelines.
// EDRAM pipelines.
static const EDRAMLoadStoreModeInfo
edram_load_store_mode_info_[size_t(EDRAMLoadStoreMode::kCount)];
ID3D12PipelineState*
@ -455,6 +475,8 @@ class RenderTargetCache {
ID3D12PipelineState*
edram_store_pipelines_[size_t(EDRAMLoadStoreMode::kCount)] = {};
ID3D12PipelineState* edram_tile_sample_32bpp_pipeline_ = nullptr;
ID3D12PipelineState* edram_clear_32bpp_pipeline_ = nullptr;
ID3D12PipelineState* edram_clear_depth_float_pipeline_ = nullptr;
// 48 MB heaps backing used render targets resources, created when needed.
// 24 MB proved to be not enough to store a single render target occupying the

View File

@ -0,0 +1,25 @@
#define XE_EDRAM_WRITE_ONLY
#include "edram_load_store.hlsli"
// Clears a rectangle of 32-bit samples in the EDRAM buffer to
// xe_edram_clear_color32. One thread group covers one tile, and each thread
// handles two horizontally adjacent samples.
// Load4/Store4 aren't needed here, but 80x16 threads is over the limit.
[numthreads(40, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Unpack the rectangle: .xy is left/top, .zw is right/bottom (X is in the
  // lower 16 bits of each constant, Y is in the upper 16 bits).
  uint4 clear_rect;
  clear_rect.xz = xe_edram_clear_rect & 0xFFFFu;
  clear_rect.yw = xe_edram_clear_rect >> 16u;
  uint2 sample_index = xe_thread_id.xy;
  sample_index.x *= 2u;
  // Test each of the two samples separately so that a rectangle starting or
  // ending in the middle of the pair is still cleared up to its exact edges.
  bool row_inside =
      sample_index.y >= clear_rect.y && sample_index.y < clear_rect.w;
  bool first_sample_inside =
      sample_index.x >= clear_rect.x && sample_index.x < clear_rect.z;
  bool second_sample_inside = sample_index.x + 1u >= clear_rect.x &&
                              sample_index.x + 1u < clear_rect.z;
  [branch] if (!row_inside ||
               !(first_sample_inside || second_sample_inside)) {
    return;
  }
  uint2 tile_dword_index = xe_group_thread_id.xy;
  tile_dword_index.x *= 2u;
  // Byte offset of the first sample of the pair.
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  [branch] if (first_sample_inside) {
    xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_color32);
  }
  [branch] if (second_sample_inside) {
    xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_color32);
  }
}

View File

@ -0,0 +1,34 @@
#define XE_EDRAM_WRITE_ONLY
#include "edram_load_store.hlsli"
// Clears a rectangle of floating-point depth samples in the EDRAM buffer. Each
// sample is written twice: as the 24-bit value (xe_edram_clear_depth24) and,
// 10485760 bytes further into the buffer, as the 32-bit value
// (xe_edram_clear_depth32). One thread group covers one tile, and each thread
// handles two horizontally adjacent samples.
// Load4/Store4 aren't needed here, but 80x16 threads is over the limit.
[numthreads(40, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Unpack the rectangle: .xy is left/top, .zw is right/bottom (X is in the
  // lower 16 bits of each constant, Y is in the upper 16 bits).
  uint4 clear_rect;
  clear_rect.xz = xe_edram_clear_rect & 0xFFFFu;
  clear_rect.yw = xe_edram_clear_rect >> 16u;
  uint2 sample_index = xe_thread_id.xy;
  sample_index.x *= 2u;
  // Test each of the two samples separately so that a rectangle starting or
  // ending in the middle of the pair is still cleared up to its exact edges.
  bool row_inside =
      sample_index.y >= clear_rect.y && sample_index.y < clear_rect.w;
  bool first_sample_inside =
      sample_index.x >= clear_rect.x && sample_index.x < clear_rect.z;
  bool second_sample_inside = sample_index.x + 1u >= clear_rect.x &&
                              sample_index.x + 1u < clear_rect.z;
  [branch] if (!row_inside ||
               !(first_sample_inside || second_sample_inside)) {
    return;
  }
  uint2 tile_dword_index = xe_group_thread_id.xy;
  tile_dword_index.x *= 2u;
  // Byte offset of the first sample of the pair.
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  [branch] if (first_sample_inside) {
    // 24-bit depth.
    xe_edram_load_store_dest.Store(edram_offset, xe_edram_clear_depth24);
    // 32-bit depth (pre-converted on the CPU).
    xe_edram_load_store_dest.Store(edram_offset + 10485760u,
                                   xe_edram_clear_depth32);
  }
  [branch] if (second_sample_inside) {
    // 24-bit depth.
    xe_edram_load_store_dest.Store(edram_offset + 4u, xe_edram_clear_depth24);
    // 32-bit depth (pre-converted on the CPU).
    xe_edram_load_store_dest.Store(edram_offset + 10485764u,
                                   xe_edram_clear_depth32);
  }
}

View File

@ -31,7 +31,18 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
#define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)
// For clearing.
// Left/top of the cleared region (relative to EDRAM base) in .x, right/bottom
// in .y, in samples - X in the lower 16 bits of each, Y in the upper 16 bits.
#define xe_edram_clear_rect (xe_edram_load_store_constants.xy)
#define xe_edram_clear_color32 (xe_edram_load_store_constants.z)
#define xe_edram_clear_color64 (xe_edram_load_store_constants.zw)
#define xe_edram_clear_depth24 (xe_edram_load_store_constants.z)
#define xe_edram_clear_depth32 (xe_edram_load_store_constants.w)
#ifndef XE_EDRAM_WRITE_ONLY
ByteAddressBuffer xe_edram_load_store_source : register(t0);
#endif
RWByteAddressBuffer xe_edram_load_store_dest : register(u0);
uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {