[D3D12] Raw 32bpp resolve
This commit is contained in:
parent
bc4125584c
commit
ea1abdaa6e
|
@ -893,9 +893,10 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
|
|||
assert_always();
|
||||
return false;
|
||||
}
|
||||
Endian128 dest_endian = Endian128(dest_info & 0x7);
|
||||
int32_t dest_exp_bias =
|
||||
!is_depth ? (int32_t((dest_info >> 16) << 26) >> 26) : 0;
|
||||
uint32_t dest_swap = (dest_info >> 24) & 0x1;
|
||||
bool dest_swap = !is_depth && ((dest_info >> 24) & 0x1);
|
||||
|
||||
// Get the destination location.
|
||||
uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF;
|
||||
|
@ -950,14 +951,105 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
|
|||
// RTV of the destination format.
|
||||
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
|
||||
auto device = provider->GetDevice();
|
||||
auto descriptor_size_view = provider->GetDescriptorSizeView();
|
||||
if (sample_select <= xenos::CopySampleSelect::k3 &&
|
||||
src_texture_format == dest_format && dest_exp_bias == 0) {
|
||||
XELOGGPU("Resolving a single sample without conversion");
|
||||
if (src_64bpp) {
|
||||
// TODO(Triang3l): 64bpp sample copy shader.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Make sure we have the memory to write to.
|
||||
if (!shared_memory->MakeTilesResident(dest_address, dest_size)) {
|
||||
return false;
|
||||
}
|
||||
// TODO(Triang3l): Raw resolve.
|
||||
|
||||
// Write the source and destination descriptors.
|
||||
D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
|
||||
D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
|
||||
if (command_processor_->RequestViewDescriptors(
|
||||
0, 2, 2, descriptor_cpu_start, descriptor_gpu_start) == 0) {
|
||||
return false;
|
||||
}
|
||||
D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc;
|
||||
srv_desc.Format = DXGI_FORMAT_R32_TYPELESS;
|
||||
srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
|
||||
srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
|
||||
srv_desc.Buffer.FirstElement = 0;
|
||||
srv_desc.Buffer.NumElements = 2 * 2048 * 1280;
|
||||
srv_desc.Buffer.StructureByteStride = 0;
|
||||
srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW;
|
||||
device->CreateShaderResourceView(edram_buffer_, &srv_desc,
|
||||
descriptor_cpu_start);
|
||||
D3D12_CPU_DESCRIPTOR_HANDLE uav_cpu_handle;
|
||||
uav_cpu_handle.ptr = descriptor_cpu_start.ptr + descriptor_size_view;
|
||||
shared_memory->CreateRawUAV(uav_cpu_handle);
|
||||
|
||||
// Transition the buffers.
|
||||
command_processor_->PushTransitionBarrier(
|
||||
edram_buffer_, edram_buffer_state_,
|
||||
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
|
||||
edram_buffer_state_ = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
|
||||
shared_memory->UseForWriting();
|
||||
command_processor_->SubmitBarriers();
|
||||
|
||||
// Dispatch the computation.
|
||||
command_list->SetComputeRootSignature(edram_load_store_root_signature_);
|
||||
EDRAMLoadStoreRootConstants root_constants;
|
||||
root_constants.tile_sample_rect_tl = copy_rect.left | (copy_rect.top << 16);
|
||||
root_constants.tile_sample_rect_br =
|
||||
copy_rect.right | (copy_rect.bottom << 16);
|
||||
root_constants.tile_sample_dest_base = dest_address;
|
||||
assert_true(dest_pitch <= 8192);
|
||||
root_constants.tile_sample_dest_info = dest_pitch |
|
||||
(uint32_t(sample_select) << 16) |
|
||||
(uint32_t(dest_endian) << 18);
|
||||
if (msaa_samples >= MsaaSamples::k2X) {
|
||||
root_constants.tile_sample_dest_info |= 1 << 14;
|
||||
if (msaa_samples >= MsaaSamples::k4X) {
|
||||
root_constants.tile_sample_dest_info |= 1 << 15;
|
||||
}
|
||||
}
|
||||
if (dest_swap) {
|
||||
switch (ColorRenderTargetFormat(src_format)) {
|
||||
case ColorRenderTargetFormat::k_8_8_8_8:
|
||||
case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
|
||||
root_constants.tile_sample_dest_info |= (8 << 21) | (16 << 26);
|
||||
break;
|
||||
case ColorRenderTargetFormat::k_2_10_10_10:
|
||||
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
|
||||
case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16:
|
||||
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
|
||||
root_constants.tile_sample_dest_info |= (10 << 21) | (20 << 26);
|
||||
break;
|
||||
case ColorRenderTargetFormat::k_16_16_16_16:
|
||||
case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
|
||||
root_constants.tile_sample_dest_info |= 1 << 21;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
root_constants.base_pitch_tiles = edram_base | (surface_pitch_tiles << 11);
|
||||
command_list->SetComputeRoot32BitConstants(
|
||||
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
|
||||
command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
|
||||
// TODO(Triang3l): 64bpp pipeline.
|
||||
command_processor_->SetPipeline(edram_tile_sample_32bpp_pipeline_);
|
||||
// 1 group per destination 80x16 (32bpp) / 80x8 (64bpp) region.
|
||||
uint32_t group_count_x = row_tiles, group_count_y = rows;
|
||||
if (msaa_samples >= MsaaSamples::k2X) {
|
||||
group_count_y = (group_count_y + 1) >> 1;
|
||||
if (msaa_samples >= MsaaSamples::k4X) {
|
||||
group_count_x = (group_count_x + 1) >> 1;
|
||||
}
|
||||
}
|
||||
command_list->Dispatch(group_count_x, group_count_y, 1);
|
||||
|
||||
// Commit the write.
|
||||
command_processor_->PushUAVBarrier(shared_memory->GetBuffer());
|
||||
|
||||
// Make the texture cache refresh the data.
|
||||
shared_memory->RangeWrittenByGPU(dest_address, dest_size);
|
||||
} else {
|
||||
|
@ -1386,8 +1478,6 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
|||
command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
|
||||
nullptr);
|
||||
EDRAMLoadStoreRootConstants root_constants;
|
||||
root_constants.base_pitch_tiles =
|
||||
binding.edram_base | (rt_pitch_tiles << 11);
|
||||
root_constants.rt_color_depth_offset =
|
||||
uint32_t(location_dest.PlacedFootprint.Offset);
|
||||
root_constants.rt_color_depth_pitch =
|
||||
|
@ -1402,6 +1492,8 @@ void RenderTargetCache::StoreRenderTargetsToEDRAM() {
|
|||
root_constants.rt_stencil_pitch =
|
||||
location_dest.PlacedFootprint.Footprint.RowPitch;
|
||||
}
|
||||
root_constants.base_pitch_tiles =
|
||||
binding.edram_base | (rt_pitch_tiles << 11);
|
||||
|
||||
// Transition the copy buffer to SRV.
|
||||
command_processor_->PushTransitionBarrier(
|
||||
|
@ -1534,8 +1626,6 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
|
|||
// Load the data.
|
||||
command_processor_->SubmitBarriers();
|
||||
EDRAMLoadStoreRootConstants root_constants;
|
||||
root_constants.base_pitch_tiles =
|
||||
edram_bases[i] | (edram_pitch_tiles << 11);
|
||||
root_constants.rt_color_depth_offset =
|
||||
uint32_t(render_target->footprints[0].Offset);
|
||||
root_constants.rt_color_depth_pitch =
|
||||
|
@ -1546,6 +1636,8 @@ void RenderTargetCache::LoadRenderTargetsFromEDRAM(
|
|||
root_constants.rt_stencil_pitch =
|
||||
render_target->footprints[1].Footprint.RowPitch;
|
||||
}
|
||||
root_constants.base_pitch_tiles =
|
||||
edram_bases[i] | (edram_pitch_tiles << 11);
|
||||
command_list->SetComputeRoot32BitConstants(
|
||||
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
|
||||
EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
|
||||
|
|
|
@ -385,11 +385,11 @@ class RenderTargetCache {
|
|||
// 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
|
||||
// 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
|
||||
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
|
||||
// 18:19 - destination endianness.
|
||||
// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
|
||||
// 18:20 - destination endianness.
|
||||
// 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
|
||||
// For 32 bits per pixel:
|
||||
// 20:24 - red/blue bit depth.
|
||||
// 25:29 - blue offset.
|
||||
// 21:25 - red/blue bit depth.
|
||||
// 26:30 - blue offset.
|
||||
// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
|
||||
uint32_t tile_sample_dest_info;
|
||||
};
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
#ifndef XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
|
||||
#define XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
|
||||
|
||||
// These functions may accept endianness without it being masked with & 3 -
|
||||
// don't use ==, <=, >= here!
|
||||
|
||||
#define XE_BYTE_SWAP_OVERLOAD(XeByteSwapType) \
|
||||
XeByteSwapType XeByteSwap(XeByteSwapType v, uint endian) { \
|
||||
[flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
|
||||
if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
|
||||
v = ((v & 0x00FF00FFu) << 8u) | ((v & 0xFF00FF00u) >> 8u); \
|
||||
} \
|
||||
[flatten] if ((endian & 2u) != 0u) { \
|
||||
if ((endian & 2u) != 0u) { \
|
||||
v = (v << 16u) | (v >> 16u); \
|
||||
} \
|
||||
return v; \
|
||||
|
@ -18,7 +21,7 @@ XE_BYTE_SWAP_OVERLOAD(uint4)
|
|||
|
||||
#define XE_BYTE_SWAP_16_OVERLOAD(XeByteSwapType) \
|
||||
XeByteSwapType XeByteSwap16(XeByteSwapType v, uint endian) { \
|
||||
[flatten] if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
|
||||
if (((endian ^ (endian >> 1u)) & 1u) != 0u) { \
|
||||
v = (v << 8u) | (v >> 8u); \
|
||||
} \
|
||||
return v; \
|
||||
|
@ -28,4 +31,19 @@ XE_BYTE_SWAP_16_OVERLOAD(uint2)
|
|||
XE_BYTE_SWAP_16_OVERLOAD(uint3)
|
||||
XE_BYTE_SWAP_16_OVERLOAD(uint4)
|
||||
|
||||
// Endian swap for data handled as pairs of 32-bit components (v.x, v.y).
// When bit 2 of endian (value 4) is set, the two components are exchanged and
// each one is then swapped by XeByteSwap in mode 2. Otherwise the request is
// forwarded to XeByteSwap unchanged, which only looks at the low two bits.
// The endianness may arrive unmasked, so only a masked bit test is used here.
uint2 XeByteSwap64(uint2 v, uint endian) {
  if ((endian & 4u) != 0u) {
    return XeByteSwap(v.yx, 2u);
  }
  return XeByteSwap(v, endian);
}
|
||||
// Endian swap for two pairs of 32-bit components: (v.x, v.y) and (v.z, v.w).
// When bit 2 of endian (value 4) is set, the components of each pair are
// exchanged (yxwz) and every component is then swapped by XeByteSwap in
// mode 2. Otherwise the request is forwarded to XeByteSwap unchanged.
// The endianness may arrive unmasked, so only a masked bit test is used here.
uint4 XeByteSwap64(uint4 v, uint endian) {
  if ((endian & 4u) != 0u) {
    return XeByteSwap(v.yxwz, 2u);
  }
  return XeByteSwap(v, endian);
}
|
||||
|
||||
#endif // XENIA_GPU_D3D12_SHADERS_BYTE_SWAP_HLSLI_
|
||||
|
|
|
@ -23,11 +23,11 @@ cbuffer XeEDRAMLoadStoreConstants : register(b0) {
|
|||
// 14 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
|
||||
// 15 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
|
||||
// 16:17 - sample to load (16 - vertical index, 17 - horizontal index).
|
||||
// 18:19 - destination endianness.
|
||||
// 20:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
|
||||
// 18:20 - destination endianness.
|
||||
// 21:31 - BPP-specific info for swapping red/blue, 0 if not swapping.
|
||||
// For 32 bits per pixel:
|
||||
// 20:24 - red/blue bit depth.
|
||||
// 25:29 - blue offset.
|
||||
// 21:25 - red/blue bit depth.
|
||||
// 26:30 - blue offset.
|
||||
// For 64 bits per pixel, it's 1 if need to swap 0:15 and 32:47.
|
||||
#define xe_edram_tile_sample_dest_info (xe_edram_load_store_constants.w)
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#include "byte_swap.hlsli"
|
||||
#include "edram_load_store.hlsli"
|
||||
#include "texture_address.hlsli"
|
||||
|
||||
|
@ -6,8 +7,9 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
uint3 xe_group_thread_id : SV_GroupThreadID,
|
||||
uint3 xe_thread_id : SV_DispatchThreadID) {
|
||||
// Check if not outside of the destination texture completely.
|
||||
uint4 copy_rect =
|
||||
(xe_edram_tile_sample_rect.xyxy >> uint4(0u, 0u, 16u, 16u)) & 0xFFFFu;
|
||||
uint4 copy_rect;
|
||||
copy_rect.xz = xe_edram_tile_sample_rect & 0xFFFFu;
|
||||
copy_rect.yw = xe_edram_tile_sample_rect >> 16u;
|
||||
uint2 texel_index = xe_thread_id.xy;
|
||||
texel_index.x *= 4u;
|
||||
[branch] if (any(texel_index < copy_rect.xy) ||
|
||||
|
@ -19,9 +21,12 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
// XY - log2(pixel size), ZW - selected sample offset.
|
||||
uint4 sample_info =
|
||||
(xe_edram_tile_sample_dest_info.xxxx >> uint4(15u, 14u, 17u, 16u)) & 1u;
|
||||
uint2 edram_tile_quarter =
|
||||
uint2(uint2(10u, 8u) <= xe_group_thread_id) * sample_info.xy;
|
||||
uint edram_offset = XeEDRAMOffset(
|
||||
xe_group_id.xy << sample_info.xy,
|
||||
xe_thread_id.xy << (sample_info.xy + uint2(2u, 0u)) + sample_info.zw);
|
||||
(xe_group_id.xy << sample_info.xy) + edram_tile_quarter,
|
||||
(xe_group_thread_id.xy - edram_tile_quarter * uint2(10u, 8u)) <<
|
||||
(sample_info.xy + uint2(2u, 0u)) + sample_info.zw);
|
||||
// At 1x and 2x, this contains samples of 4 pixels. At 4x, this contains
|
||||
// samples of 2, need to load 2 more.
|
||||
uint4 pixels = xe_edram_load_store_source.Load4(edram_offset);
|
||||
|
@ -30,7 +35,7 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
pixels.zw = xe_edram_load_store_source.Load3(edram_offset + 16u).xz;
|
||||
}
|
||||
|
||||
uint red_blue_swap = xe_edram_tile_sample_dest_info >> 20u;
|
||||
uint red_blue_swap = xe_edram_tile_sample_dest_info >> 21u;
|
||||
if (red_blue_swap != 0u) {
|
||||
uint red_mask = (1u << (red_blue_swap & 31u)) - 1u;
|
||||
// No need to be ready for a long shift Barney, it's just 16 or 20.
|
||||
|
@ -42,16 +47,18 @@ void main(uint3 xe_group_id : SV_GroupID,
|
|||
}
|
||||
|
||||
// Tile the pixels to the shared memory.
|
||||
pixels = XeByteSwap(pixels, xe_edram_tile_sample_dest_info >> 18u);
|
||||
uint4 texel_addresses =
|
||||
xe_edram_tile_sample_dest_base +
|
||||
XeTextureTiledOffset2D(texel_index - copy_rect.xy,
|
||||
xe_edram_tile_sample_dest_info & 16383u, 2u);
|
||||
xe_edram_load_store_dest.Store(texel_addresses.x, pixels.x);
|
||||
[branch] if (texel_index.x + 1u < copy_rect.z) {
|
||||
bool3 texels_in_rect = uint3(1u, 2u, 3u) + texel_index.x < copy_rect.z;
|
||||
[branch] if (texels_in_rect.x) {
|
||||
xe_edram_load_store_dest.Store(texel_addresses.y, pixels.y);
|
||||
[branch] if (texel_index.x + 2u < copy_rect.z) {
|
||||
[branch] if (texels_in_rect.y) {
|
||||
xe_edram_load_store_dest.Store(texel_addresses.z, pixels.z);
|
||||
[branch] if (texel_index.x + 3u < copy_rect.z) {
|
||||
[branch] if (texels_in_rect.z) {
|
||||
xe_edram_load_store_dest.Store(texel_addresses.w, pixels.w);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -57,7 +57,7 @@ uint4 XeFloat32To20e4(uint4 f32u32) {
|
|||
}
|
||||
|
||||
uint4 XeFloat20e4To32(uint4 f24u32) {
|
||||
uint4 mantissa = f24u32 & 0xF00000u;
|
||||
uint4 mantissa = f24u32 & 0xFFFFFu;
|
||||
uint4 exponent = f24u32 >> 20u;
|
||||
// Normalize the values for the denormalized components.
|
||||
// Exponent = 1;
|
||||
|
|
|
@ -541,7 +541,7 @@ void SharedMemory::CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
|
|||
device->CreateShaderResourceView(buffer_, &desc, handle);
|
||||
}
|
||||
|
||||
void SharedMemory::CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
|
||||
void SharedMemory::CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
|
||||
auto device =
|
||||
command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
|
||||
D3D12_UNORDERED_ACCESS_VIEW_DESC desc;
|
||||
|
|
|
@ -36,6 +36,7 @@ class SharedMemory {
|
|||
bool Initialize();
|
||||
void Shutdown();
|
||||
|
||||
// Returns the D3D12 resource held in buffer_ (non-owning pointer). Callers in
// this change use it to place a UAV barrier on the buffer after a compute
// write.
ID3D12Resource* GetBuffer() const { return buffer_; }
|
||||
// Returns the stored GPU virtual address (buffer_gpu_address_).
// NOTE(review): presumably this is the address of buffer_ - confirm where
// buffer_gpu_address_ is assigned.
D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const {
|
||||
return buffer_gpu_address_;
|
||||
}
|
||||
|
@ -90,7 +91,7 @@ class SharedMemory {
|
|||
void UseForWriting();
|
||||
|
||||
void CreateSRV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
|
||||
void CreateUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
|
||||
void CreateRawUAV(D3D12_CPU_DESCRIPTOR_HANDLE handle);
|
||||
|
||||
private:
|
||||
D3D12CommandProcessor* command_processor_;
|
||||
|
|
Loading…
Reference in New Issue