[D3D12] EDRAM storing and random cleanup
This commit is contained in:
parent
a4b98cda31
commit
9b303c64ba
|
@ -377,7 +377,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestScratchGPUBuffer(
|
||||||
barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||||
barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||||
barrier.Transition.pResource = scratch_buffer_;
|
barrier.Transition.pResource = scratch_buffer_;
|
||||||
barrier.Transition.Subresource = 0;
|
barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||||
barrier.Transition.StateBefore = scratch_buffer_state_;
|
barrier.Transition.StateBefore = scratch_buffer_state_;
|
||||||
barrier.Transition.StateAfter = state;
|
barrier.Transition.StateAfter = state;
|
||||||
GetCurrentCommandList()->ResourceBarrier(1, &barrier);
|
GetCurrentCommandList()->ResourceBarrier(1, &barrier);
|
||||||
|
@ -489,6 +489,10 @@ bool D3D12CommandProcessor::SetupContext() {
|
||||||
|
|
||||||
render_target_cache_ =
|
render_target_cache_ =
|
||||||
std::make_unique<RenderTargetCache>(this, register_file_);
|
std::make_unique<RenderTargetCache>(this, register_file_);
|
||||||
|
if (!render_target_cache_->Initialize()) {
|
||||||
|
XELOGE("Failed to initialize the render target cache");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,13 +21,176 @@ namespace xe {
|
||||||
namespace gpu {
|
namespace gpu {
|
||||||
namespace d3d12 {
|
namespace d3d12 {
|
||||||
|
|
||||||
|
// Generated with `xb buildhlsl`.
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_32bpp_cs.h"
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_64bpp_cs.h"
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_7e3_cs.h"
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_load_depth_float_cs.h"
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_load_depth_unorm_cs.h"
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_store_color_32bpp_cs.h"
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_store_color_64bpp_cs.h"
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_store_color_7e3_cs.h"
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h"
|
||||||
|
#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h"
|
||||||
|
|
||||||
|
const RenderTargetCache::EDRAMLoadStorePipelineInfo
|
||||||
|
RenderTargetCache::edram_load_store_pipeline_info_[size_t(
|
||||||
|
RenderTargetCache::EDRAMLoadStorePipelineIndex::kCount)] = {
|
||||||
|
{edram_load_color_32bpp_cs, sizeof(edram_load_color_32bpp_cs),
|
||||||
|
L"EDRAM Load 32bpp Color"},
|
||||||
|
{edram_store_color_32bpp_cs, sizeof(edram_store_color_32bpp_cs),
|
||||||
|
L"EDRAM Store 32bpp Color"},
|
||||||
|
{edram_load_color_64bpp_cs, sizeof(edram_load_color_64bpp_cs),
|
||||||
|
L"EDRAM Load 64bpp Color"},
|
||||||
|
{edram_store_color_64bpp_cs, sizeof(edram_store_color_64bpp_cs),
|
||||||
|
L"EDRAM Store 64bpp Color"},
|
||||||
|
{edram_load_color_7e3_cs, sizeof(edram_load_color_7e3_cs),
|
||||||
|
L"EDRAM Load 7e3 Color"},
|
||||||
|
{edram_store_color_7e3_cs, sizeof(edram_store_color_7e3_cs),
|
||||||
|
L"EDRAM Store 7e3 Color"},
|
||||||
|
{edram_load_depth_unorm_cs, sizeof(edram_load_depth_unorm_cs),
|
||||||
|
L"EDRAM Load UNorm Depth"},
|
||||||
|
{edram_store_depth_unorm_cs, sizeof(edram_store_depth_unorm_cs),
|
||||||
|
L"EDRAM Store UNorm Depth"},
|
||||||
|
{edram_load_depth_float_cs, sizeof(edram_load_depth_float_cs),
|
||||||
|
L"EDRAM Load Float Depth"},
|
||||||
|
{edram_store_depth_float_cs, sizeof(edram_store_depth_float_cs),
|
||||||
|
L"EDRAM Store Float Depth"},
|
||||||
|
};
|
||||||
|
|
||||||
RenderTargetCache::RenderTargetCache(D3D12CommandProcessor* command_processor,
|
RenderTargetCache::RenderTargetCache(D3D12CommandProcessor* command_processor,
|
||||||
RegisterFile* register_file)
|
RegisterFile* register_file)
|
||||||
: command_processor_(command_processor), register_file_(register_file) {}
|
: command_processor_(command_processor), register_file_(register_file) {}
|
||||||
|
|
||||||
RenderTargetCache::~RenderTargetCache() { Shutdown(); }
|
RenderTargetCache::~RenderTargetCache() { Shutdown(); }
|
||||||
|
|
||||||
void RenderTargetCache::Shutdown() { ClearCache(); }
|
bool RenderTargetCache::Initialize() {
|
||||||
|
auto device =
|
||||||
|
command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
|
||||||
|
|
||||||
|
// Create the buffer for reinterpreting EDRAM contents.
|
||||||
|
D3D12_RESOURCE_DESC edram_buffer_desc;
|
||||||
|
edram_buffer_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
|
||||||
|
edram_buffer_desc.Alignment = 0;
|
||||||
|
// First 10 MB is guest pixel data, second 10 MB is 32-bit depth when using
|
||||||
|
// D24FS8 so loads/stores don't corrupt multipass rendering.
|
||||||
|
edram_buffer_desc.Width = 2 * 2048 * 5120;
|
||||||
|
edram_buffer_desc.Height = 1;
|
||||||
|
edram_buffer_desc.DepthOrArraySize = 1;
|
||||||
|
edram_buffer_desc.MipLevels = 1;
|
||||||
|
edram_buffer_desc.Format = DXGI_FORMAT_UNKNOWN;
|
||||||
|
edram_buffer_desc.SampleDesc.Count = 1;
|
||||||
|
edram_buffer_desc.SampleDesc.Quality = 0;
|
||||||
|
edram_buffer_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
|
||||||
|
edram_buffer_desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
|
||||||
|
D3D12_HEAP_PROPERTIES edram_buffer_heap_properties = {};
|
||||||
|
edram_buffer_heap_properties.Type = D3D12_HEAP_TYPE_DEFAULT;
|
||||||
|
// The first operation will be a clear.
|
||||||
|
edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
|
||||||
|
if (FAILED(device->CreateCommittedResource(
|
||||||
|
&edram_buffer_heap_properties, D3D12_HEAP_FLAG_NONE,
|
||||||
|
&edram_buffer_desc, edram_buffer_state_, nullptr,
|
||||||
|
IID_PPV_ARGS(&edram_buffer_)))) {
|
||||||
|
XELOGE("Failed to create the EDRAM buffer");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
edram_buffer_cleared_ = false;
|
||||||
|
|
||||||
|
// Create the root signature for EDRAM buffer load/store.
|
||||||
|
D3D12_ROOT_PARAMETER root_parameters[2];
|
||||||
|
// Parameter 0 is constants (changed for each render target binding).
|
||||||
|
root_parameters[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
|
||||||
|
root_parameters[0].Constants.ShaderRegister = 0;
|
||||||
|
root_parameters[0].Constants.RegisterSpace = 0;
|
||||||
|
root_parameters[0].Constants.Num32BitValues =
|
||||||
|
sizeof(EDRAMLoadStoreRootConstants) / sizeof(uint32_t);
|
||||||
|
root_parameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
|
||||||
|
// Parameter 1 is source and target.
|
||||||
|
D3D12_DESCRIPTOR_RANGE root_load_store_ranges[2];
|
||||||
|
root_load_store_ranges[0].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
|
||||||
|
root_load_store_ranges[0].NumDescriptors = 1;
|
||||||
|
root_load_store_ranges[0].BaseShaderRegister = 0;
|
||||||
|
root_load_store_ranges[0].RegisterSpace = 0;
|
||||||
|
root_load_store_ranges[0].OffsetInDescriptorsFromTableStart = 0;
|
||||||
|
root_load_store_ranges[1].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV;
|
||||||
|
root_load_store_ranges[1].NumDescriptors = 1;
|
||||||
|
root_load_store_ranges[1].BaseShaderRegister = 0;
|
||||||
|
root_load_store_ranges[1].RegisterSpace = 0;
|
||||||
|
root_load_store_ranges[1].OffsetInDescriptorsFromTableStart = 1;
|
||||||
|
root_parameters[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
|
||||||
|
root_parameters[1].DescriptorTable.NumDescriptorRanges = 2;
|
||||||
|
root_parameters[1].DescriptorTable.pDescriptorRanges = root_load_store_ranges;
|
||||||
|
root_parameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
|
||||||
|
D3D12_ROOT_SIGNATURE_DESC root_signature_desc;
|
||||||
|
root_signature_desc.NumParameters = UINT(xe::countof(root_parameters));
|
||||||
|
root_signature_desc.pParameters = root_parameters;
|
||||||
|
root_signature_desc.NumStaticSamplers = 0;
|
||||||
|
root_signature_desc.pStaticSamplers = nullptr;
|
||||||
|
root_signature_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
|
||||||
|
ID3DBlob* root_signature_blob;
|
||||||
|
ID3DBlob* root_signature_error_blob = nullptr;
|
||||||
|
if (FAILED(D3D12SerializeRootSignature(
|
||||||
|
&root_signature_desc, D3D_ROOT_SIGNATURE_VERSION_1,
|
||||||
|
&root_signature_blob, &root_signature_error_blob))) {
|
||||||
|
XELOGE("Failed to serialize the EDRAM buffer load/store root signature");
|
||||||
|
if (root_signature_error_blob != nullptr) {
|
||||||
|
XELOGE("%s", reinterpret_cast<const char*>(
|
||||||
|
root_signature_error_blob->GetBufferPointer()));
|
||||||
|
root_signature_error_blob->Release();
|
||||||
|
}
|
||||||
|
Shutdown();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (root_signature_error_blob != nullptr) {
|
||||||
|
root_signature_error_blob->Release();
|
||||||
|
}
|
||||||
|
if (FAILED(device->CreateRootSignature(
|
||||||
|
0, root_signature_blob->GetBufferPointer(),
|
||||||
|
root_signature_blob->GetBufferSize(),
|
||||||
|
IID_PPV_ARGS(&edram_load_store_root_signature_)))) {
|
||||||
|
XELOGE("Failed to create the EDRAM buffer load/store root signature");
|
||||||
|
root_signature_blob->Release();
|
||||||
|
Shutdown();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
root_signature_blob->Release();
|
||||||
|
|
||||||
|
// Create the load/store pipelines.
|
||||||
|
D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_desc;
|
||||||
|
pipeline_desc.pRootSignature = edram_load_store_root_signature_;
|
||||||
|
pipeline_desc.NodeMask = 0;
|
||||||
|
pipeline_desc.CachedPSO.pCachedBlob = nullptr;
|
||||||
|
pipeline_desc.CachedPSO.CachedBlobSizeInBytes = 0;
|
||||||
|
pipeline_desc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE;
|
||||||
|
for (uint32_t i = 0; i < uint32_t(EDRAMLoadStorePipelineIndex::kCount); ++i) {
|
||||||
|
const EDRAMLoadStorePipelineInfo& pipeline_info =
|
||||||
|
edram_load_store_pipeline_info_[i];
|
||||||
|
pipeline_desc.CS.pShaderBytecode = pipeline_info.shader;
|
||||||
|
pipeline_desc.CS.BytecodeLength = pipeline_info.shader_size;
|
||||||
|
if (FAILED(device->CreateComputePipelineState(
|
||||||
|
&pipeline_desc, IID_PPV_ARGS(&edram_load_store_pipelines_[i])))) {
|
||||||
|
XELOGE("Failed to create EDRAM load/store pipeline for mode %u", i);
|
||||||
|
Shutdown();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void RenderTargetCache::Shutdown() {
|
||||||
|
ClearCache();
|
||||||
|
|
||||||
|
if (edram_load_store_root_signature_ != nullptr) {
|
||||||
|
edram_load_store_root_signature_->Release();
|
||||||
|
edram_load_store_root_signature_ = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (edram_buffer_ != nullptr) {
|
||||||
|
edram_buffer_->Release();
|
||||||
|
edram_buffer_ = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void RenderTargetCache::ClearCache() {
|
void RenderTargetCache::ClearCache() {
|
||||||
for (auto render_target_pair : render_targets_) {
|
for (auto render_target_pair : render_targets_) {
|
||||||
|
@ -334,7 +497,7 @@ bool RenderTargetCache::UpdateRenderTargets() {
|
||||||
uint32_t heap_usage[5] = {};
|
uint32_t heap_usage[5] = {};
|
||||||
if (full_update) {
|
if (full_update) {
|
||||||
// Export the currently bound render targets before we ruin the bindings.
|
// Export the currently bound render targets before we ruin the bindings.
|
||||||
WriteRenderTargetsToEDRAM();
|
StoreRenderTargetsToEDRAM();
|
||||||
|
|
||||||
ClearBindings();
|
ClearBindings();
|
||||||
current_surface_pitch_ = surface_pitch;
|
current_surface_pitch_ = surface_pitch;
|
||||||
|
@ -527,7 +690,7 @@ bool RenderTargetCache::UpdateRenderTargets() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void RenderTargetCache::EndFrame() {
|
void RenderTargetCache::EndFrame() {
|
||||||
WriteRenderTargetsToEDRAM();
|
StoreRenderTargetsToEDRAM();
|
||||||
ClearBindings();
|
ClearBindings();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -709,6 +872,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
|
||||||
}
|
}
|
||||||
++descriptor_heap->descriptors_used;
|
++descriptor_heap->descriptors_used;
|
||||||
|
|
||||||
|
// Get the layout for copying to the EDRAM buffer.
|
||||||
RenderTarget* render_target = new RenderTarget;
|
RenderTarget* render_target = new RenderTarget;
|
||||||
render_target->resource = resource;
|
render_target->resource = resource;
|
||||||
render_target->state = state;
|
render_target->state = state;
|
||||||
|
@ -716,11 +880,245 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
|
||||||
render_target->key = key;
|
render_target->key = key;
|
||||||
render_target->heap_page_first = heap_page_first;
|
render_target->heap_page_first = heap_page_first;
|
||||||
render_target->heap_page_count = heap_page_count;
|
render_target->heap_page_count = heap_page_count;
|
||||||
|
UINT64 copy_buffer_size;
|
||||||
|
device->GetCopyableFootprints(&resource_desc, 0, key.is_depth ? 2 : 1, 0,
|
||||||
|
render_target->footprints, nullptr, nullptr,
|
||||||
|
&copy_buffer_size);
|
||||||
|
render_target->copy_buffer_size = uint32_t(copy_buffer_size);
|
||||||
render_targets_.insert(std::make_pair(key.value, render_target));
|
render_targets_.insert(std::make_pair(key.value, render_target));
|
||||||
return render_target;
|
return render_target;
|
||||||
}
|
}
|
||||||
|
|
||||||
void RenderTargetCache::WriteRenderTargetsToEDRAM() {}
|
void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
||||||
|
auto command_list = command_processor_->GetCurrentCommandList();
|
||||||
|
if (command_list == nullptr) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t surface_pitch_ss =
|
||||||
|
current_surface_pitch_ *
|
||||||
|
(current_msaa_samples_ >= MsaaSamples::k4X ? 2 : 1);
|
||||||
|
uint32_t surface_pitch_tiles = (surface_pitch_ss + 79) / 80;
|
||||||
|
assert_true(surface_pitch_tiles != 0);
|
||||||
|
|
||||||
|
// TODO(Triang3l): Clear the buffer if calling for the first time.
|
||||||
|
|
||||||
|
uint32_t store_bindings[5];
|
||||||
|
uint32_t store_binding_count = 0;
|
||||||
|
|
||||||
|
D3D12_RESOURCE_BARRIER barriers[6];
|
||||||
|
uint32_t barrier_count;
|
||||||
|
|
||||||
|
// Extract only the render targets that need to be stored, transition them to
|
||||||
|
// copy sources and calculate intermediate buffer size.
|
||||||
|
uint32_t copy_buffer_size = 0;
|
||||||
|
barrier_count = 0;
|
||||||
|
for (uint32_t i = 0; i < 5; ++i) {
|
||||||
|
const RenderTargetBinding& binding = current_bindings_[i];
|
||||||
|
RenderTarget* render_target = binding.render_target;
|
||||||
|
// TODO(Triang3l): Change edram_dirty_length to dirty row count.
|
||||||
|
if (!binding.is_bound || render_target == nullptr ||
|
||||||
|
binding.edram_dirty_length < surface_pitch_tiles) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
store_bindings[store_binding_count] = i;
|
||||||
|
copy_buffer_size =
|
||||||
|
std::max(copy_buffer_size, render_target->copy_buffer_size);
|
||||||
|
++store_binding_count;
|
||||||
|
if (render_target->state != D3D12_RESOURCE_STATE_COPY_SOURCE) {
|
||||||
|
D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++];
|
||||||
|
barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||||
|
barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||||
|
barrier.Transition.pResource = render_target->resource;
|
||||||
|
barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||||
|
barrier.Transition.StateBefore = render_target->state;
|
||||||
|
barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
|
||||||
|
render_target->state = D3D12_RESOURCE_STATE_COPY_SOURCE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (store_binding_count == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (edram_buffer_state_ != D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
|
||||||
|
// Also transition the EDRAM buffer to UAV.
|
||||||
|
D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++];
|
||||||
|
barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||||
|
barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||||
|
barrier.Transition.pResource = edram_buffer_;
|
||||||
|
barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||||
|
barrier.Transition.StateBefore = edram_buffer_state_;
|
||||||
|
barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
|
||||||
|
edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
|
||||||
|
}
|
||||||
|
if (barrier_count != 0) {
|
||||||
|
command_list->ResourceBarrier(barrier_count, barriers);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate descriptors for the buffers.
|
||||||
|
D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
|
||||||
|
D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
|
||||||
|
if (command_processor_->RequestViewDescriptors(0, 2, 2, descriptor_cpu_start,
|
||||||
|
descriptor_gpu_start) == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the buffer for copying.
|
||||||
|
D3D12_RESOURCE_STATES copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
|
||||||
|
ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer(
|
||||||
|
copy_buffer_size, copy_buffer_state);
|
||||||
|
if (copy_buffer == nullptr) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare for writing.
|
||||||
|
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
|
||||||
|
auto device = provider->GetDevice();
|
||||||
|
auto descriptor_size_view = provider->GetDescriptorSizeView();
|
||||||
|
D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc;
|
||||||
|
srv_desc.Format = DXGI_FORMAT_R32_TYPELESS;
|
||||||
|
srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
|
||||||
|
srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
|
||||||
|
srv_desc.Buffer.FirstElement = 0;
|
||||||
|
srv_desc.Buffer.NumElements = copy_buffer_size >> 2;
|
||||||
|
srv_desc.Buffer.StructureByteStride = 0;
|
||||||
|
srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW;
|
||||||
|
device->CreateShaderResourceView(copy_buffer, &srv_desc,
|
||||||
|
descriptor_cpu_start);
|
||||||
|
D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc;
|
||||||
|
uav_desc.Format = DXGI_FORMAT_R32_TYPELESS;
|
||||||
|
uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
|
||||||
|
uav_desc.Buffer.FirstElement = 0;
|
||||||
|
uav_desc.Buffer.NumElements = 2 * 2048 * 1280;
|
||||||
|
uav_desc.Buffer.StructureByteStride = 0;
|
||||||
|
uav_desc.Buffer.CounterOffsetInBytes = 0;
|
||||||
|
uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
|
||||||
|
D3D12_CPU_DESCRIPTOR_HANDLE uav_cpu_handle;
|
||||||
|
uav_cpu_handle.ptr = descriptor_cpu_start.ptr + descriptor_size_view;
|
||||||
|
device->CreateUnorderedAccessView(edram_buffer_, nullptr, &uav_desc,
|
||||||
|
uav_cpu_handle);
|
||||||
|
command_list->SetComputeRootSignature(edram_load_store_root_signature_);
|
||||||
|
command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
|
||||||
|
|
||||||
|
// Sort the bindings in ascending order of EDRAM base so data in the render
|
||||||
|
// targets placed farther in EDRAM isn't lost in case of overlap.
|
||||||
|
std::sort(
|
||||||
|
store_bindings, store_bindings + store_binding_count,
|
||||||
|
[this](uint32_t a, uint32_t b) {
|
||||||
|
if (current_bindings_[a].edram_base < current_bindings_[b].edram_base) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return a < b;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Store each render target.
|
||||||
|
for (uint32_t i = 0; i < store_binding_count; ++i) {
|
||||||
|
const RenderTargetBinding& binding = current_bindings_[store_bindings[i]];
|
||||||
|
const RenderTarget* render_target = binding.render_target;
|
||||||
|
EDRAMLoadStorePipelineIndex pipeline_index;
|
||||||
|
bool is_64bpp = false;
|
||||||
|
if (render_target->key.is_depth) {
|
||||||
|
if (DepthRenderTargetFormat(render_target->key.format) ==
|
||||||
|
DepthRenderTargetFormat::kD24FS8) {
|
||||||
|
pipeline_index = EDRAMLoadStorePipelineIndex::kDepthFloatStore;
|
||||||
|
} else {
|
||||||
|
pipeline_index = EDRAMLoadStorePipelineIndex::kDepthUnormStore;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
switch (ColorRenderTargetFormat(render_target->key.format)) {
|
||||||
|
case ColorRenderTargetFormat::k_8_8_8_8:
|
||||||
|
case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
|
||||||
|
case ColorRenderTargetFormat::k_2_10_10_10:
|
||||||
|
case ColorRenderTargetFormat::k_16_16:
|
||||||
|
case ColorRenderTargetFormat::k_16_16_FLOAT:
|
||||||
|
case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16:
|
||||||
|
case ColorRenderTargetFormat::k_32_FLOAT:
|
||||||
|
pipeline_index = EDRAMLoadStorePipelineIndex::kColor32bppStore;
|
||||||
|
break;
|
||||||
|
case ColorRenderTargetFormat::k_16_16_16_16:
|
||||||
|
case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
|
||||||
|
case ColorRenderTargetFormat::k_32_32_FLOAT:
|
||||||
|
pipeline_index = EDRAMLoadStorePipelineIndex::kColor64bppStore;
|
||||||
|
is_64bpp = true;
|
||||||
|
break;
|
||||||
|
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
|
||||||
|
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
|
||||||
|
pipeline_index = EDRAMLoadStorePipelineIndex::kColor7e3Store;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
assert_unhandled_case(render_target->key.format);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
D3D12_TEXTURE_COPY_LOCATION location_source, location_dest;
|
||||||
|
location_source.pResource = render_target->resource;
|
||||||
|
location_source.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
|
||||||
|
location_source.SubresourceIndex = 0;
|
||||||
|
location_dest.pResource = copy_buffer;
|
||||||
|
location_dest.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
|
||||||
|
location_dest.PlacedFootprint = render_target->footprints[0];
|
||||||
|
// TODO(Triang3l): Box for color render targets.
|
||||||
|
command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
|
||||||
|
nullptr);
|
||||||
|
EDRAMLoadStoreRootConstants root_constants;
|
||||||
|
root_constants.base_tiles = binding.edram_base;
|
||||||
|
root_constants.pitch_tiles = surface_pitch_tiles * (is_64bpp ? 2 : 1);
|
||||||
|
root_constants.rt_color_depth_pitch =
|
||||||
|
location_dest.PlacedFootprint.Footprint.RowPitch;
|
||||||
|
if (render_target->key.is_depth) {
|
||||||
|
location_source.SubresourceIndex = 1;
|
||||||
|
location_dest.PlacedFootprint = render_target->footprints[1];
|
||||||
|
command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
|
||||||
|
nullptr);
|
||||||
|
root_constants.rt_stencil_offset =
|
||||||
|
uint32_t(location_dest.PlacedFootprint.Offset);
|
||||||
|
root_constants.rt_stencil_pitch =
|
||||||
|
location_dest.PlacedFootprint.Footprint.RowPitch;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transition the copy buffer to SRV.
|
||||||
|
barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||||
|
barriers[0].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||||
|
barriers[0].Transition.pResource = copy_buffer;
|
||||||
|
barriers[0].Transition.Subresource =
|
||||||
|
D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||||
|
barriers[0].Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST;
|
||||||
|
barriers[0].Transition.StateAfter =
|
||||||
|
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
|
||||||
|
copy_buffer_state = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
|
||||||
|
command_list->ResourceBarrier(1, barriers);
|
||||||
|
|
||||||
|
// Store the data.
|
||||||
|
command_list->SetComputeRoot32BitConstants(
|
||||||
|
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
|
||||||
|
command_processor_->SetPipeline(
|
||||||
|
edram_load_store_pipelines_[size_t(pipeline_index)]);
|
||||||
|
command_list->Dispatch(
|
||||||
|
root_constants.pitch_tiles,
|
||||||
|
binding.edram_dirty_length / root_constants.pitch_tiles, 1);
|
||||||
|
|
||||||
|
// Commit the UAV write and prepare for copying again.
|
||||||
|
barrier_count = 1;
|
||||||
|
barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
|
||||||
|
barriers[0].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||||
|
barriers[0].UAV.pResource = edram_buffer_;
|
||||||
|
if (i + 1 < store_binding_count) {
|
||||||
|
barrier_count = 2;
|
||||||
|
barriers[1].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||||
|
barriers[1].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||||
|
barriers[1].Transition.pResource = copy_buffer;
|
||||||
|
barriers[1].Transition.Subresource =
|
||||||
|
D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||||
|
barriers[1].Transition.StateBefore =
|
||||||
|
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
|
||||||
|
barriers[1].Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST;
|
||||||
|
copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
|
||||||
|
}
|
||||||
|
command_list->ResourceBarrier(barrier_count, barriers);
|
||||||
|
}
|
||||||
|
|
||||||
|
command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace d3d12
|
} // namespace d3d12
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
|
|
|
@ -201,6 +201,7 @@ class RenderTargetCache {
|
||||||
RegisterFile* register_file);
|
RegisterFile* register_file);
|
||||||
~RenderTargetCache();
|
~RenderTargetCache();
|
||||||
|
|
||||||
|
bool Initialize();
|
||||||
void Shutdown();
|
void Shutdown();
|
||||||
void ClearCache();
|
void ClearCache();
|
||||||
|
|
||||||
|
@ -233,6 +234,27 @@ class RenderTargetCache {
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
enum class EDRAMLoadStorePipelineIndex {
|
||||||
|
kColor32bppLoad,
|
||||||
|
kColor32bppStore,
|
||||||
|
kColor64bppLoad,
|
||||||
|
kColor64bppStore,
|
||||||
|
kColor7e3Load,
|
||||||
|
kColor7e3Store,
|
||||||
|
kDepthUnormLoad,
|
||||||
|
kDepthUnormStore,
|
||||||
|
kDepthFloatLoad,
|
||||||
|
kDepthFloatStore,
|
||||||
|
|
||||||
|
kCount
|
||||||
|
};
|
||||||
|
|
||||||
|
struct EDRAMLoadStorePipelineInfo {
|
||||||
|
const void* shader;
|
||||||
|
size_t shader_size;
|
||||||
|
const WCHAR* name;
|
||||||
|
};
|
||||||
|
|
||||||
union RenderTargetKey {
|
union RenderTargetKey {
|
||||||
struct {
|
struct {
|
||||||
// Supersampled (_ss - scaled 2x if needed) dimensions, divided by 80x16.
|
// Supersampled (_ss - scaled 2x if needed) dimensions, divided by 80x16.
|
||||||
|
@ -267,8 +289,12 @@ class RenderTargetCache {
|
||||||
RenderTargetKey key;
|
RenderTargetKey key;
|
||||||
// The first 4 MB page in the heaps.
|
// The first 4 MB page in the heaps.
|
||||||
uint32_t heap_page_first;
|
uint32_t heap_page_first;
|
||||||
// Number of 4 MB pages this render target uses.
|
// The number of 4 MB pages this render target uses.
|
||||||
uint32_t heap_page_count;
|
uint32_t heap_page_count;
|
||||||
|
// Color/depth and stencil layouts.
|
||||||
|
D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprints[2];
|
||||||
|
// Buffer size needed to copy the render target to the EDRAM buffer.
|
||||||
|
uint32_t copy_buffer_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct RenderTargetBinding {
|
struct RenderTargetBinding {
|
||||||
|
@ -294,13 +320,34 @@ class RenderTargetCache {
|
||||||
RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key,
|
RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key,
|
||||||
uint32_t heap_page_first);
|
uint32_t heap_page_first);
|
||||||
|
|
||||||
// Must be in a frame to call. Writes the dirty areas of the currently bound
|
// Must be in a frame to call. Stores the dirty areas of the currently bound
|
||||||
// render targets and marks them as clean.
|
// render targets and marks them as clean.
|
||||||
void WriteRenderTargetsToEDRAM();
|
void StoreRenderTargetsToEDRAM();
|
||||||
|
|
||||||
D3D12CommandProcessor* command_processor_;
|
D3D12CommandProcessor* command_processor_;
|
||||||
RegisterFile* register_file_;
|
RegisterFile* register_file_;
|
||||||
|
|
||||||
|
// The EDRAM buffer allowing color and depth data to be reinterpreted.
|
||||||
|
ID3D12Resource* edram_buffer_ = nullptr;
|
||||||
|
D3D12_RESOURCE_STATES edram_buffer_state_;
|
||||||
|
bool edram_buffer_cleared_;
|
||||||
|
|
||||||
|
// EDRAM buffer load/store root signature.
|
||||||
|
ID3D12RootSignature* edram_load_store_root_signature_ = nullptr;
|
||||||
|
struct EDRAMLoadStoreRootConstants {
|
||||||
|
uint32_t base_tiles;
|
||||||
|
uint32_t pitch_tiles;
|
||||||
|
uint32_t rt_color_depth_pitch;
|
||||||
|
uint32_t rt_stencil_offset;
|
||||||
|
uint32_t rt_stencil_pitch;
|
||||||
|
};
|
||||||
|
// EDRAM buffer load/store pipelines.
|
||||||
|
static const EDRAMLoadStorePipelineInfo
|
||||||
|
edram_load_store_pipeline_info_[size_t(
|
||||||
|
EDRAMLoadStorePipelineIndex::kCount)];
|
||||||
|
ID3D12PipelineState* edram_load_store_pipelines_[size_t(
|
||||||
|
EDRAMLoadStorePipelineIndex::kCount)] = {};
|
||||||
|
|
||||||
// 32 MB heaps backing used render targets resources, created when needed.
|
// 32 MB heaps backing used render targets resources, created when needed.
|
||||||
// 24 MB proved to be not enough to store a single render target occupying the
|
// 24 MB proved to be not enough to store a single render target occupying the
|
||||||
// entire EDRAM - a 32-bit depth/stencil one - at some resolution.
|
// entire EDRAM - a 32-bit depth/stencil one - at some resolution.
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
|
||||||
|
[numthreads(20, 16, 1)]
|
||||||
|
void main(uint3 xe_group_id : SV_GroupID,
|
||||||
|
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||||
|
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||||
|
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||||
|
tile_dword_index.x *= 4u;
|
||||||
|
uint4 pixels = xe_edram_load_store_source.Load4(
|
||||||
|
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||||
|
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||||
|
xe_thread_id.x * 16u;
|
||||||
|
xe_edram_load_store_dest.Store4(rt_offset, pixels);
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
|
||||||
|
[numthreads(40, 8, 1)]
|
||||||
|
void main(uint3 xe_group_id : SV_GroupID,
|
||||||
|
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||||
|
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||||
|
// One tile contains 80x8 texels, and 2 rows within an 80x16 tile contain data
|
||||||
|
// from 1 render target row rather than 2. Threads with X 0-19 are for the
|
||||||
|
// first row, with 20-39 are for the second.
|
||||||
|
uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
|
||||||
|
[flatten] if (xe_group_thread_id.x >= 20u) {
|
||||||
|
tile_dword_index += uint2(uint(-80), 1u);
|
||||||
|
}
|
||||||
|
uint4 pixels = xe_edram_load_store_source.Load4(
|
||||||
|
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||||
|
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||||
|
xe_thread_id.x * 16u;
|
||||||
|
xe_edram_load_store_dest.Store4(rt_offset, pixels);
|
||||||
|
}
|
|
@ -0,0 +1,20 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
#include "pixel_formats.hlsli"
|
||||||
|
|
||||||
|
[numthreads(40, 16, 1)]
|
||||||
|
void main(uint3 xe_group_id : SV_GroupID,
|
||||||
|
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||||
|
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||||
|
uint2 tile_dword_index = xe_group_thread_id.xy;
|
||||||
|
tile_dword_index.x *= 2u;
|
||||||
|
uint2 pixels_7e3_packed = xe_edram_load_store_source.Load2(
|
||||||
|
XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
|
||||||
|
uint4 pixel_0_f16u32 = XeFloat7e3To16(pixels_7e3_packed.x);
|
||||||
|
uint4 pixel_1_f16u32 = XeFloat7e3To16(pixels_7e3_packed.y);
|
||||||
|
uint4 pixels_f16u32_packed =
|
||||||
|
uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) |
|
||||||
|
(uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u);
|
||||||
|
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
|
||||||
|
xe_thread_id.x * 16u;
|
||||||
|
xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed);
|
||||||
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
#include "pixel_formats.hlsli"
|
||||||
|
|
||||||
|
[numthreads(20, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Each thread handles 4 consecutive dwords of a tile row.
  uint2 tile_dword_index =
      uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  // Low 24 bits are the 20e4 depth, the top 8 bits are the stencil.
  uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset);
  // The full 32-bit depth copy is read from 10485760 bytes past the 24-bit one.
  uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset);
  uint4 depth24 = depth24_stencil & 0xFFFFFFu;

  // Depth. Prefer the stored 32-bit depth while it still converts to the stored
  // 24-bit depth, since it has more precision (and multipass rendering is
  // possible). Otherwise the 24-bit depth was overwritten by aliasing, so
  // expand it instead.
  uint4 depth_from_24 = XeFloat20e4To32(depth24);
  uint4 depth32_is_current = uint4(XeFloat32To20e4(depth32) == depth24);
  uint4 depth =
      depth_from_24 + depth32_is_current * (depth32 - depth_from_24);
  uint rt_depth_offset = xe_thread_id.x * 16u +
                         xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  xe_edram_load_store_dest.Store4(rt_depth_offset, depth);

  // Stencil. Pack the 4 stencil bytes into a single dword.
  uint4 stencil_bytes = depth24_stencil >> 24u;
  uint stencil_packed = stencil_bytes.x | (stencil_bytes.y << 8u) |
                        (stencil_bytes.z << 16u) | (stencil_bytes.w << 24u);
  uint rt_stencil_offset = xe_edram_rt_stencil_offset +
                           xe_thread_id.y * xe_edram_rt_stencil_pitch +
                           xe_thread_id.x * 4u;
  xe_edram_load_store_dest.Store(rt_stencil_offset, stencil_packed);
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
|
||||||
|
[numthreads(20, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Each thread handles 4 consecutive dwords of a tile row.
  uint2 tile_dword_index =
      uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  uint4 depth_stencil = xe_edram_load_store_source.Load4(edram_offset);

  // Depth: the low 24 bits of each dword, written as full dwords.
  uint rt_depth_offset = xe_thread_id.x * 16u +
                         xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  xe_edram_load_store_dest.Store4(rt_depth_offset,
                                  depth_stencil & 0xFFFFFFu);

  // Stencil: the top 8 bits of each dword, 4 values packed into one dword.
  uint4 stencil_bytes = depth_stencil >> 24u;
  uint stencil_packed = stencil_bytes.x | (stencil_bytes.y << 8u) |
                        (stencil_bytes.z << 16u) | (stencil_bytes.w << 24u);
  uint rt_stencil_offset = xe_edram_rt_stencil_offset +
                           xe_thread_id.y * xe_edram_rt_stencil_pitch +
                           xe_thread_id.x * 4u;
  xe_edram_load_store_dest.Store(rt_stencil_offset, stencil_packed);
}
|
|
@ -0,0 +1,21 @@
|
||||||
|
#ifndef XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
|
||||||
|
#define XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
|
||||||
|
|
||||||
|
// Constants shared by all EDRAM load and store compute shaders.
cbuffer XeEDRAMLoadStoreConstants : register(b0) {
|
||||||
|
// Tile index added to every tile computed by XeEDRAMOffset.
uint xe_edram_base_tiles;
|
||||||
|
// Number of tiles XeEDRAMOffset advances per tile row (tile_index.y).
uint xe_edram_pitch_tiles;
|
||||||
|
// Byte distance between rows of color/depth data in the linear (non-EDRAM)
// buffer; the shaders multiply it by the dispatch thread Y.
uint xe_edram_rt_color_depth_pitch;
|
||||||
|
// Byte offset where packed stencil data starts in the linear buffer (used by
// the depth shaders only).
uint xe_edram_rt_stencil_offset;
|
||||||
|
// Byte distance between rows of packed stencil data in the linear buffer.
uint xe_edram_rt_stencil_pitch;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Buffer the shaders read from.
ByteAddressBuffer xe_edram_load_store_source : register(t0);
|
||||||
|
// Buffer the shaders write to.
RWByteAddressBuffer xe_edram_load_store_dest : register(u0);
|
||||||
|
|
||||||
|
// Returns the byte offset in the tiled EDRAM buffer of the dword at
// tile_dword_index (X = dword column, Y = row) within the tile at tile_index
// (relative to xe_edram_base_tiles).
uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {
  uint tile_number = xe_edram_base_tiles +
                     tile_index.y * xe_edram_pitch_tiles + tile_index.x;
  // 5120 bytes per tile (80x16 dwords), 320 bytes per tile row, 4 bytes per
  // dword.
  uint offset = tile_number * 5120u;
  offset += tile_dword_index.y * 320u;
  offset += tile_dword_index.x * 4u;
  return offset;
}
|
||||||
|
|
||||||
|
#endif // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
|
|
@ -0,0 +1,14 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
|
||||||
|
[numthreads(20, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Copies 4 dwords per thread from the linear render target buffer (source)
  // to the tiled EDRAM buffer (destination).
  uint rt_offset = xe_thread_id.x * 16u +
                   xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  uint2 tile_dword_index =
      uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  xe_edram_load_store_dest.Store4(
      edram_offset, xe_edram_load_store_source.Load4(rt_offset));
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
|
||||||
|
[numthreads(40, 8, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Copies 4 dwords per thread from the linear render target buffer (source)
  // to the tiled EDRAM buffer (destination).
  uint rt_offset = xe_thread_id.x * 16u +
                   xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  uint4 pixels = xe_edram_load_store_source.Load4(rt_offset);
  // Two consecutive 80-dword rows of the 80x16 tile hold the data of one render
  // target row: threads with X 0-19 write tile row 2 * Y, threads with X 20-39
  // write tile row 2 * Y + 1.
  uint is_second_row = uint(xe_group_thread_id.x >= 20u);
  uint2 tile_dword_index =
      uint2((xe_group_thread_id.x - is_second_row * 20u) * 4u,
            xe_group_thread_id.y * 2u + is_second_row);
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  xe_edram_load_store_dest.Store4(edram_offset, pixels);
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
#include "pixel_formats.hlsli"
|
||||||
|
|
||||||
|
[numthreads(40, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Each thread reads 4 dwords (2 pixels of 4 16-bit components each) from the
  // linear buffer and writes them as 2 packed 7e3 dwords to the tiled EDRAM
  // buffer.
  uint rt_offset = xe_thread_id.x * 16u +
                   xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  uint4 packed_f16 = xe_edram_load_store_source.Load4(rt_offset);
  // Split each dword into its low and high 16-bit halves. The low-half
  // components still carry the high half in their upper bits; this is passed
  // to XeFloat16To7e3 unchanged.
  uint4 first_f16u32 = uint4(packed_f16.x, packed_f16.x >> 16u,
                             packed_f16.y, packed_f16.y >> 16u);
  uint4 second_f16u32 = uint4(packed_f16.z, packed_f16.z >> 16u,
                              packed_f16.w, packed_f16.w >> 16u);
  uint2 packed_7e3;
  packed_7e3.x = XeFloat16To7e3(first_f16u32);
  packed_7e3.y = XeFloat16To7e3(second_f16u32);
  uint2 tile_dword_index =
      uint2(xe_group_thread_id.x * 2u, xe_group_thread_id.y);
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  xe_edram_load_store_dest.Store2(edram_offset, packed_7e3);
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
#include "pixel_formats.hlsli"
|
||||||
|
|
||||||
|
[numthreads(20, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Depth: 4 32-bit values per thread, converted to 20e4.
  uint rt_depth_offset = xe_thread_id.x * 16u +
                         xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  uint4 depth32 = xe_edram_load_store_source.Load4(rt_depth_offset);
  uint4 depth24 = XeFloat32To20e4(depth32);

  // Stencil: one dword holds the 4 stencil bytes; move each byte into the top
  // 8 bits of the corresponding depth dword.
  uint rt_stencil_offset = xe_edram_rt_stencil_offset +
                           xe_thread_id.y * xe_edram_rt_stencil_pitch +
                           xe_thread_id.x * 4u;
  uint stencil_packed = xe_edram_load_store_source.Load(rt_stencil_offset);
  uint4 stencil = uint4(stencil_packed, stencil_packed >> 8u,
                        stencil_packed >> 16u, stencil_packed >> 24u) << 24u;
  uint4 depth24_stencil = depth24 | stencil;

  uint2 tile_dword_index =
      uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  // The 24-bit depth is kept for aliasing and for checking whether the 32-bit
  // depth is up to date.
  xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil);
  // The 32-bit depth is kept so precision isn't lost when doing multipass
  // rendering.
  xe_edram_load_store_dest.Store4(10485760u + edram_offset, depth32);
}
|
|
@ -0,0 +1,20 @@
|
||||||
|
#include "edram_load_store.hlsli"
|
||||||
|
|
||||||
|
[numthreads(20, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
          uint3 xe_group_thread_id : SV_GroupThreadID,
          uint3 xe_thread_id : SV_DispatchThreadID) {
  // Depth: the low 24 bits of 4 dwords per thread.
  uint rt_depth_offset = xe_thread_id.x * 16u +
                         xe_thread_id.y * xe_edram_rt_color_depth_pitch;
  uint4 depth24 =
      xe_edram_load_store_source.Load4(rt_depth_offset) & 0xFFFFFFu;

  // Stencil: one dword holds the 4 stencil bytes; move each byte into the top
  // 8 bits of the corresponding depth dword.
  uint rt_stencil_offset = xe_edram_rt_stencil_offset +
                           xe_thread_id.y * xe_edram_rt_stencil_pitch +
                           xe_thread_id.x * 4u;
  uint stencil_packed = xe_edram_load_store_source.Load(rt_stencil_offset);
  uint4 stencil = uint4(stencil_packed, stencil_packed >> 8u,
                        stencil_packed >> 16u, stencil_packed >> 24u) << 24u;

  uint2 tile_dword_index =
      uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
  xe_edram_load_store_dest.Store4(edram_offset, depth24 | stencil);
}
|
|
@ -0,0 +1,74 @@
|
||||||
|
#ifndef XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_
|
||||||
|
#define XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_
|
||||||
|
|
||||||
|
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
|
||||||
|
|
||||||
|
// Packs 4 float16 components (the low 16 bits of each element of rgba_f16u32)
// into one dword: three 10-bit 7e3 floats (7-bit mantissa, 3-bit exponent) for
// red, green and blue in bits 0-29, and a 2-bit unsigned normalized alpha in
// bits 30-31.
uint XeFloat16To7e3(uint4 rgba_f16u32) {
  float4 rgba_f32 = f16tof32(rgba_f16u32);
  uint3 rgb_f32u32 = asuint(rgba_f32.xyz);
  // Keep only positive (high bit set means negative for both float and int) and
  // saturate to 31.875 (also dropping NaNs).
  rgb_f32u32 = uint3(clamp(int3(rgb_f32u32), 0, 0x41FF0000));
  // Rebias the exponent from 127 to 3 (subtract 124 << 23).
  uint3 normalized = rgb_f32u32 + 0xC2000000u;
  // Values below 2^-2 become 7e3 denormals. The shift count is clamped because
  // shifts only use the low 5 bits of the count; 24 is enough to shift out the
  // whole 24-bit significand.
  uint3 denormalized =
      ((rgb_f32u32 & 0x7FFFFFu) | 0x800000u) >>
      min((125u).xxx - (rgb_f32u32 >> 23u), (24u).xxx);
  uint3 rgb_f10u32 = normalized + (denormalized - normalized) *
                     uint3(rgb_f32u32 < 0x3E800000u);
  // Round to nearest even while dropping the low 16 mantissa bits.
  rgb_f10u32 =
      ((rgb_f10u32 + 0x7FFFu + ((rgb_f10u32 >> 16u) & 1u)) >> 16u) & 0x3FFu;
  // Round alpha to the nearest of the 4 levels rather than truncating: 1/3 and
  // 2/3 are not exactly representable as float16 and come back slightly below
  // the exact value, so truncation would turn stored levels 1 and 2 into 0 and
  // 1 on a load->store round trip.
  uint alpha_u2 = uint(saturate(rgba_f32.a) * 3.0 + 0.5);
  return rgb_f10u32.r | (rgb_f10u32.g << 10u) | (rgb_f10u32.b << 20u) |
         (alpha_u2 << 30u);
}
|
||||||
|
|
||||||
|
// Unpacks a dword holding three 10-bit 7e3 floats (bits 0-29) and a 2-bit alpha
// (bits 30-31) into 4 float16 values, one per element of the result.
uint4 XeFloat7e3To16(uint rgba_packed) {
  uint3 rgb_f10u32 =
      uint3(rgba_packed, rgba_packed >> 10u, rgba_packed >> 20u) & 0x3FFu;
  uint3 exponent = rgb_f10u32 >> 7u;
  uint3 mantissa = rgb_f10u32 & 0x7Fu;
  // Normalize the values for the denormalized components.
  // Exponent = 1;
  // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x80) == 0);
  uint3 is_denormalized = uint3(exponent == 0u);
  uint3 mantissa_lzcnt = (7u).xxx - firstbithigh(mantissa);
  uint3 normalized_exponent = (1u).xxx - mantissa_lzcnt;
  uint3 normalized_mantissa = (mantissa << mantissa_lzcnt) & 0x7Fu;
  // Blend arithmetically: the corrections apply only where is_denormalized is
  // 1.
  exponent += is_denormalized * (normalized_exponent - exponent);
  mantissa += is_denormalized * (normalized_mantissa - mantissa);
  // Combine into 32-bit float bits and clear zeros.
  uint3 rgb_f32u32 = ((exponent + 124u) << 23u) | (mantissa << 16u);
  rgb_f32u32 *= uint3(rgb_f10u32 != 0u);
  float alpha = float(rgba_packed >> 30u) * (1.0 / 3.0);
  return f32tof16(float4(asfloat(rgb_f32u32), alpha));
}
|
||||||
|
|
||||||
|
// Based on CFloat24 from d3dref9.dll and the 6e4 code from:
|
||||||
|
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
|
||||||
|
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
|
||||||
|
// We also can't clamp the stored value to 1 as load->store->load must be exact.
|
||||||
|
|
||||||
|
// Converts 4 float32 bit patterns to 24-bit 20e4 floats (20-bit mantissa,
// 4-bit exponent with bias 15), returned in the low 24 bits of each element.
uint4 XeFloat32To20e4(uint4 f32u32) {
  // Keep only positive (high bit set means negative for both float and int) and
  // saturate to the maximum representable value near 2 (also dropping NaNs).
  f32u32 = uint4(clamp(int4(f32u32), 0, 0x3FFFFFF8));
  // Rebias the exponent from 127 to 15 (subtract 112 << 23).
  uint4 normalized = f32u32 + 0xC8000000u;
  // Values below 2^-14 become 20e4 denormals. The shift count must be clamped:
  // shifts only use the low 5 bits of the count, so without the clamp 0.0
  // (count 113, taken as 17) and other tiny inputs produce nonzero garbage. 24
  // is enough to shift out the whole 24-bit significand.
  uint4 denormalized =
      ((f32u32 & 0x7FFFFFu) | 0x800000u) >>
      min((113u).xxxx - (f32u32 >> 23u), (24u).xxxx);
  uint4 f24u32 =
      normalized + (denormalized - normalized) * uint4(f32u32 < 0x38800000u);
  // Round to nearest even while dropping the low 3 mantissa bits.
  return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu;
}
|
||||||
|
|
||||||
|
// Converts 4 24-bit 20e4 floats (20-bit mantissa in bits 0-19, 4-bit exponent
// with bias 15 in bits 20-23) to float32 bit patterns. Bits above 23 must be
// zero in the input.
uint4 XeFloat20e4To32(uint4 f24u32) {
  // The mantissa is the low 20 bits (masking with 0xF00000 would select the
  // exponent field instead and drop the actual mantissa).
  uint4 mantissa = f24u32 & 0xFFFFFu;
  uint4 exponent = f24u32 >> 20u;
  // Normalize the values for the denormalized components.
  // Exponent = 1;
  // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0);
  uint4 is_denormalized = uint4(exponent == 0u);
  uint4 mantissa_lzcnt = (20u).xxxx - firstbithigh(mantissa);
  // Arithmetic blend: the corrections apply only where is_denormalized is 1.
  exponent += ((1u).xxxx - mantissa_lzcnt - exponent) * is_denormalized;
  mantissa +=
      (((mantissa << mantissa_lzcnt) & 0xFFFFFu) - mantissa) * is_denormalized;
  // Combine into 32-bit float bits and clear zeros.
  return (((exponent + 112u) << 23u) | (mantissa << 3u)) * uint4(f24u32 != 0u);
}
|
||||||
|
|
||||||
|
#endif // XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_
|
|
@ -394,7 +394,7 @@ void SharedMemory::TransitionBuffer(D3D12_RESOURCE_STATES new_state,
|
||||||
barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||||
barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||||
barrier.Transition.pResource = buffer_;
|
barrier.Transition.pResource = buffer_;
|
||||||
barrier.Transition.Subresource = 0;
|
barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||||
barrier.Transition.StateBefore = buffer_state_;
|
barrier.Transition.StateBefore = buffer_state_;
|
||||||
barrier.Transition.StateAfter = new_state;
|
barrier.Transition.StateAfter = new_state;
|
||||||
command_list->ResourceBarrier(1, &barrier);
|
command_list->ResourceBarrier(1, &barrier);
|
||||||
|
|
|
@ -741,7 +741,8 @@ bool TextureCache::LoadTextureData(Texture* texture) {
|
||||||
if (copy_buffer_state != D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
|
if (copy_buffer_state != D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
|
||||||
barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||||
barriers[0].Transition.pResource = copy_buffer;
|
barriers[0].Transition.pResource = copy_buffer;
|
||||||
barriers[0].Transition.Subresource = 0;
|
barriers[0].Transition.Subresource =
|
||||||
|
D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||||
barriers[0].Transition.StateBefore = copy_buffer_state;
|
barriers[0].Transition.StateBefore = copy_buffer_state;
|
||||||
barriers[0].Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
|
barriers[0].Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
|
||||||
command_list->ResourceBarrier(1, barriers);
|
command_list->ResourceBarrier(1, barriers);
|
||||||
|
@ -792,7 +793,8 @@ bool TextureCache::LoadTextureData(Texture* texture) {
|
||||||
barriers[0].UAV.pResource = copy_buffer;
|
barriers[0].UAV.pResource = copy_buffer;
|
||||||
barriers[1].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
barriers[1].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||||
barriers[1].Transition.pResource = copy_buffer;
|
barriers[1].Transition.pResource = copy_buffer;
|
||||||
barriers[1].Transition.Subresource = 0;
|
barriers[1].Transition.Subresource =
|
||||||
|
D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||||
barriers[1].Transition.StateBefore = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
|
barriers[1].Transition.StateBefore = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
|
||||||
barriers[1].Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
|
barriers[1].Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
|
||||||
command_list->ResourceBarrier(2, barriers);
|
command_list->ResourceBarrier(2, barriers);
|
||||||
|
|
Loading…
Reference in New Issue