diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index ebfbbe986..b869513bc 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -777,12 +777,12 @@ std::string D3D12CommandProcessor::GetWindowTitleText() const { default: break; } - uint32_t resolution_scale_x = - texture_cache_ ? texture_cache_->GetDrawResolutionScaleX() : 1; - uint32_t resolution_scale_y = - texture_cache_ ? texture_cache_->GetDrawResolutionScaleY() : 1; - if (resolution_scale_x > 1 || resolution_scale_y > 1) { - title << ' ' << resolution_scale_x << 'x' << resolution_scale_y; + uint32_t draw_resolution_scale_x = + texture_cache_ ? texture_cache_->draw_resolution_scale_x() : 1; + uint32_t draw_resolution_scale_y = + texture_cache_ ? texture_cache_->draw_resolution_scale_y() : 1; + if (draw_resolution_scale_x > 1 || draw_resolution_scale_y > 1) { + title << ' ' << draw_resolution_scale_x << 'x' << draw_resolution_scale_y; } } return title.str(); @@ -845,11 +845,28 @@ bool D3D12CommandProcessor::SetupContext() { cvars::d3d12_bindless && provider.GetResourceBindingTier() >= D3D12_RESOURCE_BINDING_TIER_2; + // Get the draw resolution scale for the render target cache and the texture + // cache. 
+ uint32_t draw_resolution_scale_x, draw_resolution_scale_y; + bool draw_resolution_scale_not_clamped = + TextureCache::GetConfigDrawResolutionScale(draw_resolution_scale_x, + draw_resolution_scale_y); + if (!D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported( + draw_resolution_scale_x, draw_resolution_scale_y, provider)) { + draw_resolution_scale_not_clamped = false; + } + if (!draw_resolution_scale_not_clamped) { + XELOGW( + "The requested draw resolution scale is not supported by the device or " + "the emulator, reducing to {}x{}", + draw_resolution_scale_x, draw_resolution_scale_y); + } + // Initialize the render target cache before configuring binding - need to // know if using rasterizer-ordered views for the bindless root signature. render_target_cache_ = std::make_unique( - *register_file_, *memory_, trace_writer_, *this, - bindless_resources_used_); + *register_file_, *memory_, trace_writer_, draw_resolution_scale_x, + draw_resolution_scale_y, *this, bindless_resources_used_); if (!render_target_cache_->Initialize()) { XELOGE("Failed to initialize the render target cache"); return false; @@ -1141,11 +1158,10 @@ bool D3D12CommandProcessor::SetupContext() { return false; } - texture_cache_ = std::make_unique( - *this, *register_file_, *shared_memory_, bindless_resources_used_, - render_target_cache_->GetResolutionScaleX(), - render_target_cache_->GetResolutionScaleY()); - if (!texture_cache_->Initialize()) { + texture_cache_ = D3D12TextureCache::Create( + *register_file_, *shared_memory_, draw_resolution_scale_x, + draw_resolution_scale_y, *this, bindless_resources_used_); + if (!texture_cache_) { XELOGE("Failed to initialize the texture cache"); return false; } @@ -1741,12 +1757,12 @@ void D3D12CommandProcessor::IssueSwap(uint32_t frontbuffer_ptr, } D3D12_RESOURCE_DESC swap_texture_desc = swap_texture_resource->GetDesc(); - uint32_t resolution_scale_max = - std::max(texture_cache_->GetDrawResolutionScaleX(), - 
texture_cache_->GetDrawResolutionScaleY()); + uint32_t draw_resolution_scale_max = + std::max(texture_cache_->draw_resolution_scale_x(), + texture_cache_->draw_resolution_scale_y()); presenter->RefreshGuestOutput( uint32_t(swap_texture_desc.Width), uint32_t(swap_texture_desc.Height), - 1280 * resolution_scale_max, 720 * resolution_scale_max, + 1280 * draw_resolution_scale_max, 720 * draw_resolution_scale_max, [this, &swap_texture_srv_desc, frontbuffer_format, swap_texture_resource, &swap_texture_desc]( ui::Presenter::GuestOutputRefreshContext& context) -> bool { @@ -2233,13 +2249,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } // Get dynamic rasterizer state. - uint32_t resolution_scale_x = texture_cache_->GetDrawResolutionScaleX(); - uint32_t resolution_scale_y = texture_cache_->GetDrawResolutionScaleY(); + uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); + uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); RenderTargetCache::DepthFloat24Conversion depth_float24_conversion = render_target_cache_->depth_float24_conversion(); draw_util::ViewportInfo viewport_info; draw_util::GetHostViewportInfo( - regs, resolution_scale_x, resolution_scale_y, true, + regs, draw_resolution_scale_x, draw_resolution_scale_y, true, D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false, normalized_depth_control, host_render_targets_used && @@ -2251,10 +2267,10 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, viewport_info); draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); - scissor.offset[0] *= resolution_scale_x; - scissor.offset[1] *= resolution_scale_y; - scissor.extent[0] *= resolution_scale_x; - scissor.extent[1] *= resolution_scale_y; + scissor.offset[0] *= draw_resolution_scale_x; + scissor.offset[1] *= draw_resolution_scale_y; + scissor.extent[0] *= draw_resolution_scale_x; + scissor.extent[1] *= draw_resolution_scale_y; // Update 
viewport, scissor, blend factor and stencil reference. UpdateFixedFunctionState(viewport_info, scissor, primitive_polygonal, @@ -2774,6 +2790,8 @@ void D3D12CommandProcessor::CheckSubmissionFence(uint64_t await_submission) { primitive_processor_->CompletedSubmissionUpdated(); render_target_cache_->CompletedSubmissionUpdated(); + + texture_cache_->CompletedSubmissionUpdated(submission_completed_); } bool D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { @@ -2856,7 +2874,7 @@ bool D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { render_target_cache_->BeginSubmission(); - texture_cache_->BeginSubmission(); + texture_cache_->BeginSubmission(submission_current_); } if (is_opening_frame) { @@ -3166,8 +3184,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( bool edram_rov_used = render_target_cache_->GetPath() == RenderTargetCache::Path::kPixelShaderInterlock; - uint32_t resolution_scale_x = texture_cache_->GetDrawResolutionScaleX(); - uint32_t resolution_scale_y = texture_cache_->GetDrawResolutionScaleY(); + uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); + uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); // Get the color info register values for each render target. Also, for ROV, // exclude components that don't exist in the format from the write mask. @@ -3381,10 +3399,10 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( // radius conversion to avoid multiplying the per-vertex diameter by an // additional constant in the shader. 
float point_screen_diameter_to_ndc_radius_x = - (/* 0.5f * 2.0f * */ float(resolution_scale_x)) / + (/* 0.5f * 2.0f * */ float(draw_resolution_scale_x)) / std::max(viewport_info.xy_extent[0], uint32_t(1)); float point_screen_diameter_to_ndc_radius_y = - (/* 0.5f * 2.0f * */ float(resolution_scale_y)) / + (/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) / std::max(viewport_info.xy_extent[1], uint32_t(1)); dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] != point_screen_diameter_to_ndc_radius_x; @@ -3457,9 +3475,9 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.alpha_to_mask != alpha_to_mask; system_constants_.alpha_to_mask = alpha_to_mask; - uint32_t edram_tile_dwords_scaled = xenos::kEdramTileWidthSamples * - xenos::kEdramTileHeightSamples * - (resolution_scale_x * resolution_scale_y); + uint32_t edram_tile_dwords_scaled = + xenos::kEdramTileWidthSamples * xenos::kEdramTileHeightSamples * + (draw_resolution_scale_x * draw_resolution_scale_y); // EDRAM pitch for ROV writing. if (edram_rov_used) { @@ -3571,7 +3589,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( // background is more likely. 
float poly_offset_scale_factor = xenos::kPolygonOffsetScaleSubpixelUnit * - std::max(resolution_scale_x, resolution_scale_y); + std::max(draw_resolution_scale_x, draw_resolution_scale_y); poly_offset_front_scale *= poly_offset_scale_factor; poly_offset_back_scale *= poly_offset_scale_factor; dirty |= system_constants_.edram_poly_offset_front_scale != @@ -3879,7 +3897,7 @@ bool D3D12CommandProcessor::UpdateBindings( current_samplers_vertex_.resize( std::max(current_samplers_vertex_.size(), sampler_count_vertex)); for (size_t i = 0; i < sampler_count_vertex; ++i) { - TextureCache::SamplerParameters parameters = + D3D12TextureCache::SamplerParameters parameters = texture_cache_->GetSamplerParameters(samplers_vertex[i]); if (current_samplers_vertex_[i] != parameters) { cbuffer_binding_descriptor_indices_vertex_.up_to_date = false; @@ -3911,7 +3929,7 @@ bool D3D12CommandProcessor::UpdateBindings( current_samplers_pixel_.resize(std::max(current_samplers_pixel_.size(), size_t(sampler_count_pixel))); for (uint32_t i = 0; i < sampler_count_pixel; ++i) { - TextureCache::SamplerParameters parameters = + D3D12TextureCache::SamplerParameters parameters = texture_cache_->GetSamplerParameters((*samplers_pixel)[i]); if (current_samplers_pixel_[i] != parameters) { current_samplers_pixel_[i] = parameters; @@ -4018,7 +4036,7 @@ bool D3D12CommandProcessor::UpdateBindings( std::max(current_sampler_bindless_indices_vertex_.size(), size_t(sampler_count_vertex))); for (uint32_t j = 0; j < sampler_count_vertex; ++j) { - TextureCache::SamplerParameters sampler_parameters = + D3D12TextureCache::SamplerParameters sampler_parameters = current_samplers_vertex_[j]; uint32_t sampler_index; auto it = texture_cache_bindless_sampler_map_.find( @@ -4050,7 +4068,7 @@ bool D3D12CommandProcessor::UpdateBindings( std::max(current_sampler_bindless_indices_pixel_.size(), size_t(sampler_count_pixel))); for (uint32_t j = 0; j < sampler_count_pixel; ++j) { - TextureCache::SamplerParameters sampler_parameters 
= + D3D12TextureCache::SamplerParameters sampler_parameters = current_samplers_pixel_[j]; uint32_t sampler_index; auto it = texture_cache_bindless_sampler_map_.find( diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 2bb7a1c84..e9cefb337 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -24,9 +24,9 @@ #include "xenia/gpu/d3d12/d3d12_primitive_processor.h" #include "xenia/gpu/d3d12/d3d12_render_target_cache.h" #include "xenia/gpu/d3d12/d3d12_shared_memory.h" +#include "xenia/gpu/d3d12/d3d12_texture_cache.h" #include "xenia/gpu/d3d12/deferred_command_list.h" #include "xenia/gpu/d3d12/pipeline_cache.h" -#include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/draw_util.h" #include "xenia/gpu/dxbc_shader.h" #include "xenia/gpu/dxbc_shader_translator.h" @@ -482,7 +482,7 @@ class D3D12CommandProcessor : public CommandProcessor { // number (so checking if the first can be reused is enough). std::deque> sampler_bindless_heaps_overflowed_; - // TextureCache::SamplerParameters::value -> indices within the current + // D3D12TextureCache::SamplerParameters::value -> indices within the current // bindless sampler heap. std::unordered_map texture_cache_bindless_sampler_map_; @@ -497,7 +497,7 @@ class D3D12CommandProcessor : public CommandProcessor { std::unique_ptr pipeline_cache_; - std::unique_ptr texture_cache_; + std::unique_ptr texture_cache_; // Bytes 0x0...0x3FF - 256-entry gamma ramp table with B10G10R10X2 data (read // as R10G10B10X2 with swizzle). @@ -648,10 +648,11 @@ class D3D12CommandProcessor : public CommandProcessor { // Size of these should be ignored when checking whether these are up to date, // layout UID should be checked first (they will be different for different // binding counts). 
- std::vector current_texture_srv_keys_vertex_; - std::vector current_texture_srv_keys_pixel_; - std::vector current_samplers_vertex_; - std::vector current_samplers_pixel_; + std::vector + current_texture_srv_keys_vertex_; + std::vector current_texture_srv_keys_pixel_; + std::vector current_samplers_vertex_; + std::vector current_samplers_pixel_; std::vector current_sampler_bindless_indices_vertex_; std::vector current_sampler_bindless_indices_pixel_; diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc index 510669dbc..8c541e531 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc @@ -26,8 +26,8 @@ #include "xenia/base/math.h" #include "xenia/base/string.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" +#include "xenia/gpu/d3d12/d3d12_texture_cache.h" #include "xenia/gpu/d3d12/deferred_command_list.h" -#include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/draw_util.h" #include "xenia/gpu/dxbc.h" #include "xenia/gpu/dxbc_shader_translator.h" @@ -250,35 +250,10 @@ bool D3D12RenderTargetCache::Initialize() { path_ = Path::kHostRenderTargets; } - uint32_t config_resolution_scale_x = - uint32_t(std::max(cvars::draw_resolution_scale_x, int32_t(1))); - uint32_t config_resolution_scale_y = - uint32_t(std::max(cvars::draw_resolution_scale_y, int32_t(1))); - // Hard limit, originating from the half-pixel offset (two-pixel offset is too - // much, the resolve shaders, being generic for different scales, only - // duplicate the second pixel into the first, not the third), and also due to - // the bit counts used for passing the scale to shaders, and hardcoded scales - // and shifts for fast division by integer constants. 
- const uint32_t kMaxResolutionScale = 3; - resolution_scale_x_ = - std::min(config_resolution_scale_x, kMaxResolutionScale); - resolution_scale_y_ = - std::min(config_resolution_scale_y, kMaxResolutionScale); - TextureCache::ClampDrawResolutionScaleToSupportedRange( - resolution_scale_x_, resolution_scale_y_, provider); - if (resolution_scale_x_ != config_resolution_scale_x || - resolution_scale_y_ != config_resolution_scale_y) { - XELOGW( - "D3D12RenderTargetCache: {}x{} resolution scale not supported by the " - "device or the emulator, reducing to {}x{}", - config_resolution_scale_x, config_resolution_scale_y, - resolution_scale_x_, resolution_scale_y_); - } - bool resolution_scaled = resolution_scale_x_ > 1 || resolution_scale_y_ > 1; - // Create the buffer for reinterpreting EDRAM contents. uint32_t edram_buffer_size = - xenos::kEdramSizeBytes * resolution_scale_x_ * resolution_scale_y_; + xenos::kEdramSizeBytes * + (draw_resolution_scale_x() * draw_resolution_scale_y()); D3D12_RESOURCE_DESC edram_buffer_desc; ui::d3d12::util::FillBufferResourceDesc( edram_buffer_desc, edram_buffer_size, @@ -369,6 +344,8 @@ bool D3D12RenderTargetCache::Initialize() { uint32_t(EdramBufferDescriptorIndex::kR32G32B32A32UintUAV)), edram_buffer_, DXGI_FORMAT_R32G32B32A32_UINT, edram_buffer_size >> 4); + bool draw_resolution_scaled = IsDrawResolutionScaled(); + // Create the resolve copying root signature. D3D12_ROOT_PARAMETER resolve_copy_root_parameters[4]; // Parameter 0 is constants. @@ -379,7 +356,7 @@ bool D3D12RenderTargetCache::Initialize() { // Binding all of the shared memory at 1x resolution, portions with scaled // resolution. resolve_copy_root_parameters[0].Constants.Num32BitValues = - (IsResolutionScaled() + (draw_resolution_scaled ? 
sizeof(draw_util::ResolveCopyShaderConstants::DestRelative) : sizeof(draw_util::ResolveCopyShaderConstants)) / sizeof(uint32_t); @@ -414,7 +391,7 @@ bool D3D12RenderTargetCache::Initialize() { resolve_copy_root_parameters[2].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; // Parameter 3 is the resolution scale. - if (resolution_scaled) { + if (draw_resolution_scaled) { resolve_copy_root_parameters[3].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; resolve_copy_root_parameters[3].Constants.ShaderRegister = 1; @@ -427,7 +404,8 @@ bool D3D12RenderTargetCache::Initialize() { D3D12_SHADER_VISIBILITY_ALL; } D3D12_ROOT_SIGNATURE_DESC resolve_copy_root_signature_desc; - resolve_copy_root_signature_desc.NumParameters = resolution_scaled ? 4 : 3; + resolve_copy_root_signature_desc.NumParameters = + draw_resolution_scaled ? 4 : 3; resolve_copy_root_signature_desc.pParameters = resolve_copy_root_parameters; resolve_copy_root_signature_desc.NumStaticSamplers = 0; resolve_copy_root_signature_desc.pStaticSamplers = nullptr; @@ -457,10 +435,10 @@ bool D3D12RenderTargetCache::Initialize() { ID3D12PipelineState* resolve_copy_pipeline = ui::d3d12::util::CreateComputePipeline( device, - resolution_scaled ? resolve_copy_shader_code.scaled - : resolve_copy_shader_code.unscaled, - resolution_scaled ? resolve_copy_shader_code.scaled_size - : resolve_copy_shader_code.unscaled_size, + draw_resolution_scaled ? resolve_copy_shader_code.scaled + : resolve_copy_shader_code.unscaled, + draw_resolution_scaled ? resolve_copy_shader_code.scaled_size + : resolve_copy_shader_code.unscaled_size, resolve_copy_root_signature_); if (resolve_copy_pipeline == nullptr) { XELOGE( @@ -1081,7 +1059,7 @@ bool D3D12RenderTargetCache::Initialize() { resolve_rov_clear_root_parameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; // Parameter 2 is the resolution scale. 
- if (resolution_scaled) { + if (draw_resolution_scaled) { resolve_rov_clear_root_parameters[2].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; resolve_rov_clear_root_parameters[2].Constants.ShaderRegister = 1; @@ -1095,7 +1073,7 @@ bool D3D12RenderTargetCache::Initialize() { } D3D12_ROOT_SIGNATURE_DESC resolve_rov_clear_root_signature_desc; resolve_rov_clear_root_signature_desc.NumParameters = - resolution_scaled ? 3 : 2; + draw_resolution_scaled ? 3 : 2; resolve_rov_clear_root_signature_desc.pParameters = resolve_rov_clear_root_parameters; resolve_rov_clear_root_signature_desc.NumStaticSamplers = 0; @@ -1115,10 +1093,10 @@ bool D3D12RenderTargetCache::Initialize() { // Create the resolve EDRAM buffer clearing pipelines. resolve_rov_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( device, - resolution_scaled ? shaders::resolve_clear_32bpp_scaled_cs - : shaders::resolve_clear_32bpp_cs, - resolution_scaled ? sizeof(shaders::resolve_clear_32bpp_scaled_cs) - : sizeof(shaders::resolve_clear_32bpp_cs), + draw_resolution_scaled ? shaders::resolve_clear_32bpp_scaled_cs + : shaders::resolve_clear_32bpp_cs, + draw_resolution_scaled ? sizeof(shaders::resolve_clear_32bpp_scaled_cs) + : sizeof(shaders::resolve_clear_32bpp_cs), resolve_rov_clear_root_signature_); if (resolve_rov_clear_32bpp_pipeline_ == nullptr) { XELOGE( @@ -1130,10 +1108,10 @@ bool D3D12RenderTargetCache::Initialize() { resolve_rov_clear_32bpp_pipeline_->SetName(L"Resolve Clear 32bpp"); resolve_rov_clear_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( device, - resolution_scaled ? shaders::resolve_clear_64bpp_scaled_cs - : shaders::resolve_clear_64bpp_cs, - resolution_scaled ? sizeof(shaders::resolve_clear_64bpp_scaled_cs) - : sizeof(shaders::resolve_clear_64bpp_cs), + draw_resolution_scaled ? shaders::resolve_clear_64bpp_scaled_cs + : shaders::resolve_clear_64bpp_cs, + draw_resolution_scaled ? 
sizeof(shaders::resolve_clear_64bpp_scaled_cs) + : sizeof(shaders::resolve_clear_64bpp_cs), resolve_rov_clear_root_signature_); if (resolve_rov_clear_64bpp_pipeline_ == nullptr) { XELOGE( @@ -1366,17 +1344,17 @@ void D3D12RenderTargetCache::WriteEdramUintPow2UAVDescriptor( bool D3D12RenderTargetCache::Resolve(const Memory& memory, D3D12SharedMemory& shared_memory, - TextureCache& texture_cache, + D3D12TextureCache& texture_cache, uint32_t& written_address_out, uint32_t& written_length_out) { written_address_out = 0; written_length_out = 0; - bool resolution_scaled = IsResolutionScaled(); + bool draw_resolution_scaled = IsDrawResolutionScaled(); draw_util::ResolveInfo resolve_info; if (!draw_util::GetResolveInfo( - register_file(), memory, trace_writer_, resolution_scaled, + register_file(), memory, trace_writer_, draw_resolution_scaled, IsFixed16TruncatedToMinus1To1(), resolve_info)) { return false; } @@ -1387,8 +1365,8 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, } draw_util::ResolveResolutionScaleConstant resolution_scale_constant; - resolution_scale_constant.resolution_scale_x = resolution_scale_x_; - resolution_scale_constant.resolution_scale_y = resolution_scale_y_; + resolution_scale_constant.resolution_scale_x = draw_resolution_scale_x(); + resolution_scale_constant.resolution_scale_y = draw_resolution_scale_y(); DeferredCommandList& command_list = command_processor_.GetDeferredCommandList(); @@ -1413,8 +1391,8 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, draw_util::ResolveCopyShaderConstants copy_shader_constants; uint32_t copy_group_count_x, copy_group_count_y; draw_util::ResolveCopyShaderIndex copy_shader = resolve_info.GetCopyShader( - resolution_scale_x_, resolution_scale_y_, copy_shader_constants, - copy_group_count_x, copy_group_count_y); + draw_resolution_scale_x(), draw_resolution_scale_y(), + copy_shader_constants, copy_group_count_x, copy_group_count_y); assert_true(copy_group_count_x && copy_group_count_y); if 
(copy_shader != draw_util::ResolveCopyShaderIndex::kUnknown) { const draw_util::ResolveCopyShaderInfo& copy_shader_info = @@ -1422,7 +1400,7 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, // Make sure there is memory to write to. bool copy_dest_committed; - if (resolution_scaled) { + if (draw_resolution_scaled) { copy_dest_committed = texture_cache.EnsureScaledResolveMemoryCommitted( resolve_info.copy_dest_base, resolve_info.copy_dest_length) && @@ -1441,10 +1419,10 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, ui::d3d12::util::DescriptorCpuGpuHandlePair descriptor_source; ui::d3d12::util::DescriptorCpuGpuHandlePair descriptors[2]; if (command_processor_.RequestOneUseSingleViewDescriptors( - bindless_resources_used_ ? uint32_t(resolution_scaled) : 2, + bindless_resources_used_ ? uint32_t(draw_resolution_scaled) : 2, descriptors)) { if (bindless_resources_used_) { - if (resolution_scaled) { + if (draw_resolution_scaled) { descriptor_dest = descriptors[0]; } else { descriptor_dest = @@ -1463,7 +1441,7 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, } } else { descriptor_dest = descriptors[0]; - if (!resolution_scaled) { + if (!draw_resolution_scaled) { shared_memory.WriteUintPow2UAVDescriptor( descriptor_dest.first, copy_shader_info.dest_bpe_log2); } @@ -1475,7 +1453,7 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, copy_shader_info.source_bpe_log2); } } - if (resolution_scaled) { + if (draw_resolution_scaled) { texture_cache.CreateCurrentScaledResolveRangeUintPow2UAV( descriptor_dest.first, copy_shader_info.dest_bpe_log2); texture_cache.TransitionCurrentScaledResolveRange( @@ -1487,7 +1465,7 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, // Submit the resolve. 
command_list.D3DSetComputeRootSignature(resolve_copy_root_signature_); - if (resolution_scaled) { + if (draw_resolution_scaled) { command_list.D3DSetComputeRoot32BitConstants( 3, sizeof(resolution_scale_constant) / sizeof(uint32_t), &resolution_scale_constant, 0); @@ -1496,7 +1474,7 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, 2, descriptor_source.second); command_list.D3DSetComputeRootDescriptorTable(1, descriptor_dest.second); - if (resolution_scaled) { + if (draw_resolution_scaled) { command_list.D3DSetComputeRoot32BitConstants( 0, sizeof(copy_shader_constants.dest_relative) / sizeof(uint32_t), @@ -1512,7 +1490,7 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, command_list.D3DDispatch(copy_group_count_x, copy_group_count_y, 1); // Order the resolve with other work using the destination as a UAV. - if (resolution_scaled) { + if (draw_resolution_scaled) { texture_cache.MarkCurrentScaledResolveRangeUAVWritesCommitNeeded(); } else { shared_memory.MarkUAVWritesCommitNeeded(); @@ -1585,7 +1563,7 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, CommitEdramBufferUAVWrites(); command_list.D3DSetComputeRootSignature( resolve_rov_clear_root_signature_); - if (resolution_scaled) { + if (draw_resolution_scaled) { command_list.D3DSetComputeRoot32BitConstants( 2, sizeof(resolution_scale_constant) / sizeof(uint32_t), &resolution_scale_constant, 0); @@ -1593,8 +1571,8 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, command_list.D3DSetComputeRootDescriptorTable( 1, descriptor_edram.second); std::pair clear_group_count = - resolve_info.GetClearShaderGroupCount(resolution_scale_x_, - resolution_scale_y_); + resolve_info.GetClearShaderGroupCount(draw_resolution_scale_x(), + draw_resolution_scale_y()); assert_true(clear_group_count.first && clear_group_count.second); if (clear_depth) { draw_util::ResolveClearShaderConstants depth_clear_constants; @@ -1648,7 +1626,7 @@ bool D3D12RenderTargetCache::Resolve(const Memory& memory, 
} bool D3D12RenderTargetCache::InitializeTraceSubmitDownloads() { - if (IsResolutionScaled()) { + if (IsDrawResolutionScaled()) { // No 1:1 mapping. return false; } @@ -1704,7 +1682,7 @@ void D3D12RenderTargetCache::InitializeTraceCompleteDownloads() { } void D3D12RenderTargetCache::RestoreEdramSnapshot(const void* snapshot) { - if (IsResolutionScaled()) { + if (IsDrawResolutionScaled()) { // No 1:1 mapping. return; } @@ -1962,10 +1940,10 @@ RenderTargetCache::RenderTarget* D3D12RenderTargetCache::CreateRenderTarget( D3D12_RESOURCE_DESC resource_desc; resource_desc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D; resource_desc.Alignment = 0; - resource_desc.Width = key.GetWidth() * resolution_scale_x_; + resource_desc.Width = key.GetWidth() * draw_resolution_scale_x(); resource_desc.Height = GetRenderTargetHeight(key.pitch_tiles_at_32bpp, key.msaa_samples) * - resolution_scale_y_; + draw_resolution_scale_y(); resource_desc.DepthOrArraySize = 1; resource_desc.MipLevels = 1; if (key.is_depth) { @@ -2963,10 +2941,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { // for the coordinates for that load. Currently 3 temps are enough. a.OpDclTemps(3); + uint32_t draw_resolution_scale_x = this->draw_resolution_scale_x(); + uint32_t draw_resolution_scale_y = this->draw_resolution_scale_y(); + uint32_t tile_width_samples_scaled = - xenos::kEdramTileWidthSamples * resolution_scale_x_; + xenos::kEdramTileWidthSamples * draw_resolution_scale_x; uint32_t tile_height_samples_scaled = - xenos::kEdramTileHeightSamples * resolution_scale_y_; + xenos::kEdramTileHeightSamples * draw_resolution_scale_y; // Split the destination pixel index into 32bpp tile in r0.z and // 32bpp-tile-relative pixel index in r0.xy. 
@@ -2979,12 +2960,16 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X); uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_upper_shift; draw_util::GetEdramTileWidthDivideScaleAndUpperShift( - resolution_scale_x_, dest_tile_width_divide_scale, + draw_resolution_scale_x, dest_tile_width_divide_scale, dest_tile_width_divide_upper_shift); assert_true(dest_tile_width_divide_upper_shift >= dest_sample_width_log2); // Need the host tile size in pixels, not samples. dest_tile_width_divide_upper_shift -= dest_sample_width_log2; - if (resolution_scale_y_ == 3) { + static_assert( + TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, + "D3D12RenderTargetCache EDRAM range ownership transfer shader generation " + "supports Y draw resolution scaling factors of only up to 3"); + if (draw_resolution_scale_y == 3) { // r0.zw = upper 32 bits in the division process of pixel XY by pixel count // in a 32bpp tile a.OpUMul(dxbc::Dest::R(0, 0b1100), dxbc::Dest::Null(), @@ -3000,14 +2985,14 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { a.OpIMAd( dxbc::Dest::R(0, 0b0011), dxbc::Src::R(0, 0b1110), dxbc::Src::LI( - -int32_t((80 * resolution_scale_x_) >> dest_sample_width_log2), - -int32_t((16 * resolution_scale_y_) >> dest_sample_height_log2), 0, - 0), + -int32_t((80 * draw_resolution_scale_x) >> dest_sample_width_log2), + -int32_t((16 * draw_resolution_scale_y) >> dest_sample_height_log2), + 0, 0), dxbc::Src::R(0, 0b0100)); } else { - assert_true(resolution_scale_y_ <= 2); + assert_true(draw_resolution_scale_y <= 2); uint32_t dest_tile_height_pixels_log2 = - (resolution_scale_y_ == 2 ? 5 : 4) - dest_sample_height_log2; + (draw_resolution_scale_y == 2 ? 
5 : 4) - dest_sample_height_log2; // r0.z = upper 32 bits in the division process of pixel X by pixel count in // a 32bpp tile a.OpUMul(dxbc::Dest::R(0, 0b0100), dxbc::Dest::Null(), @@ -3019,7 +3004,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { dest_tile_height_pixels_log2)); // r0.x = destination pixel X index within the 32bpp tile a.OpIMAd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ), - dxbc::Src::LI(-int32_t((80 * resolution_scale_x_) >> + dxbc::Src::LI(-int32_t((80 * draw_resolution_scale_x) >> dest_sample_width_log2)), dxbc::Src::R(0, dxbc::Src::kXXXX)); // r0.y = destination pixel Y index within the 32bpp tile @@ -4518,15 +4503,15 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears( // Assuming the rectangle is already clamped by the setup function from the // common render target cache. clear_rect.left = - LONG(resolve_clear_rectangle->x_pixels * resolution_scale_x_); + LONG(resolve_clear_rectangle->x_pixels * draw_resolution_scale_x()); clear_rect.top = - LONG(resolve_clear_rectangle->y_pixels * resolution_scale_y_); + LONG(resolve_clear_rectangle->y_pixels * draw_resolution_scale_y()); clear_rect.right = LONG((resolve_clear_rectangle->x_pixels + resolve_clear_rectangle->width_pixels) * - resolution_scale_x_); + draw_resolution_scale_x()); clear_rect.bottom = LONG((resolve_clear_rectangle->y_pixels + resolve_clear_rectangle->height_pixels) * - resolution_scale_y_); + draw_resolution_scale_y()); } // Do host depth storing for the depth destination (assuming there can be only @@ -4811,8 +4796,8 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears( bool transfer_viewport_set = false; float pixels_to_ndc_unscaled = 2.0f / float(D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION); - float pixels_to_ndc_x = pixels_to_ndc_unscaled * resolution_scale_x_; - float pixels_to_ndc_y = pixels_to_ndc_unscaled * resolution_scale_y_; + float pixels_to_ndc_x = pixels_to_ndc_unscaled * draw_resolution_scale_x(); + 
float pixels_to_ndc_y = pixels_to_ndc_unscaled * draw_resolution_scale_y(); TransferRootSignatureIndex last_transfer_root_signature_index = TransferRootSignatureIndex::kCount; @@ -4988,18 +4973,18 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears( ++j) { const Transfer::Rectangle& stencil_clear_rectangle = transfer_stencil_clear_rectangles[j]; - stencil_clear_rect_write_ptr->left = - LONG(stencil_clear_rectangle.x_pixels * resolution_scale_x_); - stencil_clear_rect_write_ptr->top = - LONG(stencil_clear_rectangle.y_pixels * resolution_scale_y_); + stencil_clear_rect_write_ptr->left = LONG( + stencil_clear_rectangle.x_pixels * draw_resolution_scale_x()); + stencil_clear_rect_write_ptr->top = LONG( + stencil_clear_rectangle.y_pixels * draw_resolution_scale_y()); stencil_clear_rect_write_ptr->right = LONG((stencil_clear_rectangle.x_pixels + stencil_clear_rectangle.width_pixels) * - resolution_scale_x_); + draw_resolution_scale_x()); stencil_clear_rect_write_ptr->bottom = LONG((stencil_clear_rectangle.y_pixels + stencil_clear_rectangle.height_pixels) * - resolution_scale_y_); + draw_resolution_scale_y()); ++stencil_clear_rect_write_ptr; } } @@ -5967,13 +5952,20 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline( // fits in it, while 80x16 doesn't. a.OpDclThreadGroup(40, 16, 1); + uint32_t draw_resolution_scale_x = this->draw_resolution_scale_x(); + uint32_t draw_resolution_scale_y = this->draw_resolution_scale_y(); + // For now, as the exact addressing in 64bpp render targets relatively to // 32bpp is unknown, treating 64bpp tiles as storing 40x16 samples rather than // 80x16 for simplicity of addressing into the texture. // Get the parts of the address along Y - tile row index within the dispatch // to r0.w, sample Y within the tile to r0.y. 
- if (resolution_scale_y_ == 3) { + static_assert( + TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, + "D3D12RenderTargetCache render target dump shader generation supports Y " + "draw resolution scaling factors of only up to 3"); + if (draw_resolution_scale_y == 3) { // Multiplication part of the division by the (16 * scale) tile height, // specifically 48 here, or 16 * 3. // r0.w = (Y * kDivideScale3) >> 32 @@ -5988,28 +5980,28 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline( // r0.y = Y sample position within the tile // r0.w = Y tile position a.OpIMAd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kWWWW), - dxbc::Src::LI(-16 * resolution_scale_y_), + dxbc::Src::LI(-16 * draw_resolution_scale_y), dxbc::Src::VThreadID(dxbc::Src::kYYYY)); } else { - assert_true(resolution_scale_y_ <= 2); + assert_true(draw_resolution_scale_y <= 2); // Tile height is a power of two, can use bit operations. // Get the tile row index into r0.w. // r0.w = Y tile position. a.OpUShR(dxbc::Dest::R(0, 0b1000), dxbc::Src::VThreadID(dxbc::Src::kYYYY), - dxbc::Src::LU(resolution_scale_y_ == 2 ? 5 : 4)); + dxbc::Src::LU(draw_resolution_scale_y == 2 ? 5 : 4)); // Get the Y sample position within the tile into r0.y. // r0.y = Y sample position within the tile // r0.w = Y tile position a.OpAnd(dxbc::Dest::R(0, 0b0010), dxbc::Src::VThreadID(dxbc::Src::kYYYY), - dxbc::Src::LU((16 * resolution_scale_y_) - 1)); + dxbc::Src::LU((16 * draw_resolution_scale_y) - 1)); } // Get the X tile offset within the dispatch to r0.z. 
- uint32_t tile_width = xenos::kEdramTileWidthSamples * resolution_scale_x_; + uint32_t tile_width = xenos::kEdramTileWidthSamples * draw_resolution_scale_x; uint32_t tile_width_divide_scale; uint32_t tile_width_divide_upper_shift; draw_util::GetEdramTileWidthDivideScaleAndUpperShift( - resolution_scale_x_, tile_width_divide_scale, + draw_resolution_scale_x, tile_width_divide_scale, tile_width_divide_upper_shift); if (format_is_64bpp) { tile_width >>= 1; @@ -6082,7 +6074,7 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline( // r0.w = tile index in the EDRAM a.OpUMAd(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kWWWW), dxbc::Src::LU( - resolution_scale_x_ * resolution_scale_y_ * + draw_resolution_scale_x * draw_resolution_scale_y * (xenos::kEdramTileWidthSamples >> uint32_t(format_is_64bpp)) * xenos::kEdramTileHeightSamples), dxbc::Src::R(0, dxbc::Src::kXXXX)); @@ -6177,9 +6169,10 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline( // r0.y = Y sample position within the source texture // r0.z = sample offset in the EDRAM // r1.x = free - a.OpUMAd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(1, dxbc::Src::kXXXX), - dxbc::Src::LU(xenos::kEdramTileHeightSamples * resolution_scale_y_), - dxbc::Src::R(0, dxbc::Src::kYYYY)); + a.OpUMAd( + dxbc::Dest::R(0, 0b0010), dxbc::Src::R(1, dxbc::Src::kXXXX), + dxbc::Src::LU(xenos::kEdramTileHeightSamples * draw_resolution_scale_y), + dxbc::Src::R(0, dxbc::Src::kYYYY)); // Will be using the source texture coordinates from r0.xy, and for // single-sampled source, LOD from r0.w. dxbc::Src source_address_src(dxbc::Src::R(0, 0b11000100)); @@ -6708,9 +6701,10 @@ void D3D12RenderTargetCache::DumpRenderTargets(uint32_t dump_base, command_processor_.SubmitBarriers(); // Processing 40 x 16 x scale samples per dispatch (a 32bpp tile in two // dispatches at 1x1 scale, 64bpp in one dispatch). 
- command_list.D3DDispatch((dispatch.width_tiles * resolution_scale_x_) - << uint32_t(!format_is_64bpp), - dispatch.height_tiles * resolution_scale_y_, 1); + command_list.D3DDispatch( + (dispatch.width_tiles * draw_resolution_scale_x()) + << uint32_t(!format_is_64bpp), + dispatch.height_tiles * draw_resolution_scale_y(), 1); } MarkEdramBufferModified(); } diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h index 817275843..4ad7d4b15 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h @@ -23,7 +23,7 @@ #include "xenia/base/assert.h" #include "xenia/gpu/d3d12/d3d12_shared_memory.h" -#include "xenia/gpu/d3d12/texture_cache.h" +#include "xenia/gpu/d3d12/d3d12_texture_cache.h" #include "xenia/gpu/draw_util.h" #include "xenia/gpu/render_target_cache.h" #include "xenia/gpu/trace_writer.h" @@ -44,9 +44,12 @@ class D3D12RenderTargetCache final : public RenderTargetCache { public: D3D12RenderTargetCache(const RegisterFile& register_file, const Memory& memory, TraceWriter& trace_writer, + uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y, D3D12CommandProcessor& command_processor, bool bindless_resources_used) - : RenderTargetCache(register_file, memory, &trace_writer), + : RenderTargetCache(register_file, memory, &trace_writer, + draw_resolution_scale_x, draw_resolution_scale_y), command_processor_(command_processor), trace_writer_(trace_writer), bindless_resources_used_(bindless_resources_used) {} @@ -60,9 +63,6 @@ class D3D12RenderTargetCache final : public RenderTargetCache { Path GetPath() const override { return path_; } - uint32_t GetResolutionScaleX() const override { return resolution_scale_x_; } - uint32_t GetResolutionScaleY() const override { return resolution_scale_y_; } - bool Update(bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control, uint32_t normalized_color_mask, @@ -85,7 +85,7 @@ class 
D3D12RenderTargetCache final : public RenderTargetCache { // register values, and also clears the render targets if needed. Must be in a // frame for calling. bool Resolve(const Memory& memory, D3D12SharedMemory& shared_memory, - TextureCache& texture_cache, uint32_t& written_address_out, + D3D12TextureCache& texture_cache, uint32_t& written_address_out, uint32_t& written_length_out); // Returns true if any downloads were submitted to the command processor. @@ -164,8 +164,6 @@ class D3D12RenderTargetCache final : public RenderTargetCache { bool bindless_resources_used_; Path path_ = Path::kHostRenderTargets; - uint32_t resolution_scale_x_ = 1; - uint32_t resolution_scale_y_ = 1; // For host render targets, an EDRAM-sized scratch buffer for: // - Guest render target data copied from host render targets during copying diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc similarity index 52% rename from src/xenia/gpu/d3d12/texture_cache.cc rename to src/xenia/gpu/d3d12/d3d12_texture_cache.cc index 05cce6662..ce38c1cba 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc @@ -2,59 +2,32 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. 
* ****************************************************************************** */ -#include "xenia/gpu/d3d12/texture_cache.h" +#include "xenia/gpu/d3d12/d3d12_texture_cache.h" #include #include #include #include +#include +#include #include "xenia/base/assert.h" -#include "xenia/base/clock.h" -#include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/profiling.h" -#include "xenia/base/xxhash.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/d3d12/d3d12_shared_memory.h" -#include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/texture_info.h" #include "xenia/gpu/texture_util.h" +#include "xenia/gpu/xenos.h" #include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h" #include "xenia/ui/d3d12/d3d12_util.h" -DEFINE_uint32( - texture_cache_memory_limit_soft, 384, - "Maximum host texture memory usage (in megabytes) above which old textures " - "will be destroyed.", - "GPU"); -DEFINE_uint32( - texture_cache_memory_limit_soft_lifetime, 30, - "Seconds a texture should be unused to be considered old enough to be " - "deleted if texture memory usage exceeds texture_cache_memory_limit_soft.", - "GPU"); -DEFINE_uint32( - texture_cache_memory_limit_hard, 768, - "Maximum host texture memory usage (in megabytes) above which textures " - "will be destroyed as soon as possible.", - "GPU"); -DEFINE_uint32( - texture_cache_memory_limit_render_to_texture, 24, - "Part of the host texture memory budget (in megabytes) that will be scaled " - "by the current drawing resolution scale.\n" - "If texture_cache_memory_limit_soft, for instance, is 384, and this is 24, " - "it will be assumed that the game will be using roughly 24 MB of " - "render-to-texture (resolve) targets and 384 - 24 = 360 MB of regular " - "textures - so with 2x2 resolution scaling, the soft limit will be 360 + " - "96 MB, and with 3x3, it will be 360 + 216 MB.", - "GPU"); - namespace xe { namespace gpu { namespace d3d12 { @@ -101,679 +74,301 @@ 
namespace shaders { #include "xenia/gpu/shaders/bytecode/d3d12_5_1/texture_load_r5g6b5_b5g6r5_scaled_cs.h" } // namespace shaders -// For formats with less than 4 components, assuming the last component is -// replicated into the non-existent ones, similar to what is done for unused -// components of operands in shaders. -// For DXT3A and DXT5A, RRRR swizzle is specified in: -// http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf -// 4D5307E6 also expects replicated components in k_8 sprites. -// DXN is read as RG in 4D5307E6, but as RA in 415607E6. -// TODO(Triang3l): Find out the correct contents of unused texture components. -const TextureCache::HostFormat TextureCache::host_formats_[64] = { +const D3D12TextureCache::HostFormat D3D12TextureCache::host_formats_[64] = { // k_1_REVERSE - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_1 - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_8 - {DXGI_FORMAT_R8_TYPELESS, - DXGI_FORMAT_R8_UNORM, - LoadMode::k8bpb, - DXGI_FORMAT_R8_SNORM, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, LoadMode::k8bpb, + DXGI_FORMAT_R8_SNORM, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // 
k_1_5_5_5 // Red and blue swapped in the load shader for simplicity. - {DXGI_FORMAT_B5G5R5A1_UNORM, - DXGI_FORMAT_B5G5R5A1_UNORM, - LoadMode::kR5G5B5A1ToB5G5R5A1, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_B5G5R5A1_UNORM, DXGI_FORMAT_B5G5R5A1_UNORM, + LoadMode::kR5G5B5A1ToB5G5R5A1, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + false, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_5_6_5 // Red and blue swapped in the load shader for simplicity. - {DXGI_FORMAT_B5G6R5_UNORM, - DXGI_FORMAT_B5G6R5_UNORM, - LoadMode::kR5G6B5ToB5G6R5, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 2}}, + {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, + LoadMode::kR5G6B5ToB5G6R5, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, // k_6_5_5 // On the host, green bits in blue, blue bits in green. 
- {DXGI_FORMAT_B5G6R5_UNORM, - DXGI_FORMAT_B5G6R5_UNORM, - LoadMode::kR5G5B6ToB5G6R5WithRBGASwizzle, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 2, 1, 1}}, + {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, + LoadMode::kR5G5B6ToB5G6R5WithRBGASwizzle, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + XE_GPU_MAKE_TEXTURE_SWIZZLE(R, B, G, G)}, // k_8_8_8_8 - {DXGI_FORMAT_R8G8B8A8_TYPELESS, - DXGI_FORMAT_R8G8B8A8_UNORM, - LoadMode::k32bpb, - DXGI_FORMAT_R8G8B8A8_SNORM, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, + LoadMode::k32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, LoadMode::kUnknown, false, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_2_10_10_10 - {DXGI_FORMAT_R10G10B10A2_TYPELESS, - DXGI_FORMAT_R10G10B10A2_UNORM, - LoadMode::k32bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_R10G10B10A2_TYPELESS, DXGI_FORMAT_R10G10B10A2_UNORM, + LoadMode::k32bpb, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_8_A - {DXGI_FORMAT_R8_TYPELESS, - DXGI_FORMAT_R8_UNORM, - LoadMode::k8bpb, - DXGI_FORMAT_R8_SNORM, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, LoadMode::k8bpb, + DXGI_FORMAT_R8_SNORM, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_8_B - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + 
DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_8_8 - {DXGI_FORMAT_R8G8_TYPELESS, - DXGI_FORMAT_R8G8_UNORM, - LoadMode::k16bpb, - DXGI_FORMAT_R8G8_SNORM, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UNORM, LoadMode::k16bpb, + DXGI_FORMAT_R8G8_SNORM, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_Cr_Y1_Cb_Y0_REP // Red and blue probably must be swapped, similar to k_Y1_Cr_Y0_Cb_REP. - {DXGI_FORMAT_G8R8_G8B8_UNORM, - DXGI_FORMAT_G8R8_G8B8_UNORM, - LoadMode::k32bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {2, 1, 0, 3}}, + {DXGI_FORMAT_G8R8_G8B8_UNORM, DXGI_FORMAT_G8R8_G8B8_UNORM, LoadMode::k32bpb, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, true, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_BGRA}, // k_Y1_Cr_Y0_Cb_REP // Used for videos in 54540829. Red and blue must be swapped. // TODO(Triang3l): D3DFMT_G8R8_G8B8 is DXGI_FORMAT_R8G8_B8G8_UNORM * 255.0f, // watch out for num_format int, division in shaders, etc., in 54540829 it // works as is. Also need to decompress if the size is uneven, but should be // a very rare case. - {DXGI_FORMAT_R8G8_B8G8_UNORM, - DXGI_FORMAT_R8G8_B8G8_UNORM, - LoadMode::k32bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {2, 1, 0, 3}}, + {DXGI_FORMAT_R8G8_B8G8_UNORM, DXGI_FORMAT_R8G8_B8G8_UNORM, LoadMode::k32bpb, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, true, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_BGRA}, // k_16_16_EDRAM // Not usable as a texture, also has -32...32 range. 
- {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_8_8_8_8_A - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_4_4_4_4 // Red and blue swapped in the load shader for simplicity. - {DXGI_FORMAT_B4G4R4A4_UNORM, - DXGI_FORMAT_B4G4R4A4_UNORM, - LoadMode::kR4G4B4A4ToB4G4R4A4, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, + LoadMode::kR4G4B4A4ToB4G4R4A4, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + false, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_10_11_11 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, - DXGI_FORMAT_R16G16B16A16_UNORM, - LoadMode::kR11G11B10ToRGBA16, - DXGI_FORMAT_R16G16B16A16_SNORM, - LoadMode::kR11G11B10ToRGBA16SNorm, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 2}}, + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + LoadMode::kR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + LoadMode::kR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, // k_11_11_10 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, - DXGI_FORMAT_R16G16B16A16_UNORM, - LoadMode::kR10G11B11ToRGBA16, - DXGI_FORMAT_R16G16B16A16_SNORM, - LoadMode::kR10G11B11ToRGBA16SNorm, - false, - 
DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 2}}, + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + LoadMode::kR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + LoadMode::kR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, // k_DXT1 - {DXGI_FORMAT_BC1_UNORM, - DXGI_FORMAT_BC1_UNORM, - LoadMode::k64bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_R8G8B8A8_UNORM, - LoadMode::kDXT1ToRGBA8, - {0, 1, 2, 3}}, + {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, LoadMode::k64bpb, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + LoadMode::kDXT1ToRGBA8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_DXT2_3 - {DXGI_FORMAT_BC2_UNORM, - DXGI_FORMAT_BC2_UNORM, - LoadMode::k128bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_R8G8B8A8_UNORM, - LoadMode::kDXT3ToRGBA8, - {0, 1, 2, 3}}, + {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, LoadMode::k128bpb, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + LoadMode::kDXT3ToRGBA8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_DXT4_5 - {DXGI_FORMAT_BC3_UNORM, - DXGI_FORMAT_BC3_UNORM, - LoadMode::k128bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_R8G8B8A8_UNORM, - LoadMode::kDXT5ToRGBA8, - {0, 1, 2, 3}}, + {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, LoadMode::k128bpb, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + LoadMode::kDXT5ToRGBA8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_16_16_16_16_EDRAM // Not usable as a texture, also has -32...32 range. 
- {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // R32_FLOAT for depth because shaders would require an additional SRV to // sample stencil, which we don't provide. // k_24_8 - {DXGI_FORMAT_R32_FLOAT, - DXGI_FORMAT_R32_FLOAT, - LoadMode::kDepthUnorm, - DXGI_FORMAT_R32_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, LoadMode::kDepthUnorm, + DXGI_FORMAT_R32_FLOAT, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_24_8_FLOAT - {DXGI_FORMAT_R32_FLOAT, - DXGI_FORMAT_R32_FLOAT, - LoadMode::kDepthFloat, - DXGI_FORMAT_R32_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, LoadMode::kDepthFloat, + DXGI_FORMAT_R32_FLOAT, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_16 - {DXGI_FORMAT_R16_TYPELESS, - DXGI_FORMAT_R16_UNORM, - LoadMode::k16bpb, - DXGI_FORMAT_R16_SNORM, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UNORM, LoadMode::k16bpb, + DXGI_FORMAT_R16_SNORM, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_16_16 - {DXGI_FORMAT_R16G16_TYPELESS, - DXGI_FORMAT_R16G16_UNORM, - LoadMode::k32bpb, - DXGI_FORMAT_R16G16_SNORM, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UNORM, 
LoadMode::k32bpb, + DXGI_FORMAT_R16G16_SNORM, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, - DXGI_FORMAT_R16G16B16A16_UNORM, - LoadMode::k64bpb, - DXGI_FORMAT_R16G16B16A16_SNORM, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + LoadMode::k64bpb, DXGI_FORMAT_R16G16B16A16_SNORM, LoadMode::kUnknown, + false, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_16_EXPAND - {DXGI_FORMAT_R16_FLOAT, - DXGI_FORMAT_R16_FLOAT, - LoadMode::k16bpb, - DXGI_FORMAT_R16_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, LoadMode::k16bpb, + DXGI_FORMAT_R16_FLOAT, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_16_16_EXPAND - {DXGI_FORMAT_R16G16_FLOAT, - DXGI_FORMAT_R16G16_FLOAT, - LoadMode::k32bpb, - DXGI_FORMAT_R16G16_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, LoadMode::k32bpb, + DXGI_FORMAT_R16G16_FLOAT, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_16_16_16_16_EXPAND - {DXGI_FORMAT_R16G16B16A16_FLOAT, - DXGI_FORMAT_R16G16B16A16_FLOAT, - LoadMode::k64bpb, - DXGI_FORMAT_R16G16B16A16_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, + LoadMode::k64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, LoadMode::kUnknown, + false, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_16_FLOAT - {DXGI_FORMAT_R16_FLOAT, - DXGI_FORMAT_R16_FLOAT, - LoadMode::k16bpb, 
- DXGI_FORMAT_R16_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, LoadMode::k16bpb, + DXGI_FORMAT_R16_FLOAT, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_16_16_FLOAT - {DXGI_FORMAT_R16G16_FLOAT, - DXGI_FORMAT_R16G16_FLOAT, - LoadMode::k32bpb, - DXGI_FORMAT_R16G16_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, LoadMode::k32bpb, + DXGI_FORMAT_R16G16_FLOAT, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_16_16_16_16_FLOAT - {DXGI_FORMAT_R16G16B16A16_FLOAT, - DXGI_FORMAT_R16G16B16A16_FLOAT, - LoadMode::k64bpb, - DXGI_FORMAT_R16G16B16A16_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, + LoadMode::k64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, LoadMode::kUnknown, + false, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_32 - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_32_32 - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // 
k_32_32_32_32 - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_32_FLOAT - {DXGI_FORMAT_R32_FLOAT, - DXGI_FORMAT_R32_FLOAT, - LoadMode::k32bpb, - DXGI_FORMAT_R32_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, LoadMode::k32bpb, + DXGI_FORMAT_R32_FLOAT, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_32_32_FLOAT - {DXGI_FORMAT_R32G32_FLOAT, - DXGI_FORMAT_R32G32_FLOAT, - LoadMode::k64bpb, - DXGI_FORMAT_R32G32_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_R32G32_FLOAT, DXGI_FORMAT_R32G32_FLOAT, LoadMode::k64bpb, + DXGI_FORMAT_R32G32_FLOAT, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_32_32_32_32_FLOAT - {DXGI_FORMAT_R32G32B32A32_FLOAT, - DXGI_FORMAT_R32G32B32A32_FLOAT, - LoadMode::k128bpb, - DXGI_FORMAT_R32G32B32A32_FLOAT, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_R32G32B32A32_FLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT, + LoadMode::k128bpb, DXGI_FORMAT_R32G32B32A32_FLOAT, LoadMode::kUnknown, + false, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_32_AS_8 - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, 
false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_32_AS_8_8 - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_16_MPEG - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_16_16_MPEG - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_8_INTERLACED - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_32_AS_8_INTERLACED - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, 
DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_32_AS_8_8_INTERLACED - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_16_INTERLACED - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_16_MPEG_INTERLACED - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_16_16_MPEG_INTERLACED - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_DXN - {DXGI_FORMAT_BC5_UNORM, - DXGI_FORMAT_BC5_UNORM, - LoadMode::k128bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_R8G8_UNORM, - LoadMode::kDXNToRG8, - {0, 1, 1, 1}}, + {DXGI_FORMAT_BC5_UNORM, DXGI_FORMAT_BC5_UNORM, LoadMode::k128bpb, + DXGI_FORMAT_UNKNOWN, 
LoadMode::kUnknown, true, DXGI_FORMAT_R8G8_UNORM, + LoadMode::kDXNToRG8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_8_8_8_8_AS_16_16_16_16 - {DXGI_FORMAT_R8G8B8A8_TYPELESS, - DXGI_FORMAT_R8G8B8A8_UNORM, - LoadMode::k32bpb, - DXGI_FORMAT_R8G8B8A8_SNORM, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, + LoadMode::k32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, LoadMode::kUnknown, false, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_DXT1_AS_16_16_16_16 - {DXGI_FORMAT_BC1_UNORM, - DXGI_FORMAT_BC1_UNORM, - LoadMode::k64bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_R8G8B8A8_UNORM, - LoadMode::kDXT1ToRGBA8, - {0, 1, 2, 3}}, + {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, LoadMode::k64bpb, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + LoadMode::kDXT1ToRGBA8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_DXT2_3_AS_16_16_16_16 - {DXGI_FORMAT_BC2_UNORM, - DXGI_FORMAT_BC2_UNORM, - LoadMode::k128bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_R8G8B8A8_UNORM, - LoadMode::kDXT3ToRGBA8, - {0, 1, 2, 3}}, + {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, LoadMode::k128bpb, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + LoadMode::kDXT3ToRGBA8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_DXT4_5_AS_16_16_16_16 - {DXGI_FORMAT_BC3_UNORM, - DXGI_FORMAT_BC3_UNORM, - LoadMode::k128bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_R8G8B8A8_UNORM, - LoadMode::kDXT5ToRGBA8, - {0, 1, 2, 3}}, + {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, LoadMode::k128bpb, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + LoadMode::kDXT5ToRGBA8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_2_10_10_10_AS_16_16_16_16 - {DXGI_FORMAT_R10G10B10A2_UNORM, - DXGI_FORMAT_R10G10B10A2_UNORM, - LoadMode::k32bpb, - DXGI_FORMAT_UNKNOWN, 
- LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM, + LoadMode::k32bpb, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_10_11_11_AS_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, - DXGI_FORMAT_R16G16B16A16_UNORM, - LoadMode::kR11G11B10ToRGBA16, - DXGI_FORMAT_R16G16B16A16_SNORM, - LoadMode::kR11G11B10ToRGBA16SNorm, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 2}}, + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + LoadMode::kR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + LoadMode::kR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, // k_11_11_10_AS_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, - DXGI_FORMAT_R16G16B16A16_UNORM, - LoadMode::kR10G11B11ToRGBA16, - DXGI_FORMAT_R16G16B16A16_SNORM, - LoadMode::kR10G11B11ToRGBA16SNorm, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 2}}, + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + LoadMode::kR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + LoadMode::kR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, // k_32_32_32_FLOAT - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 2}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, // k_DXT3A // R8_UNORM has the same size as BC2, but doesn't have the 4x4 size // alignment requirement. 
- {DXGI_FORMAT_R8_UNORM, - DXGI_FORMAT_R8_UNORM, - LoadMode::kDXT3A, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 0, 0, 0}}, + {DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, LoadMode::kDXT3A, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_DXT5A - {DXGI_FORMAT_BC4_UNORM, - DXGI_FORMAT_BC4_UNORM, - LoadMode::k64bpb, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - true, - DXGI_FORMAT_R8_UNORM, - LoadMode::kDXT5AToR8, - {0, 0, 0, 0}}, + {DXGI_FORMAT_BC4_UNORM, DXGI_FORMAT_BC4_UNORM, LoadMode::k64bpb, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, true, DXGI_FORMAT_R8_UNORM, + LoadMode::kDXT5AToR8, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, // k_CTX1 - {DXGI_FORMAT_R8G8_UNORM, - DXGI_FORMAT_R8G8_UNORM, - LoadMode::kCTX1, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 1, 1}}, + {DXGI_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, LoadMode::kCTX1, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, // k_DXT3A_AS_1_1_1_1 - {DXGI_FORMAT_B4G4R4A4_UNORM, - DXGI_FORMAT_B4G4R4A4_UNORM, - LoadMode::kDXT3AAs1111ToBGRA4, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, + LoadMode::kDXT3AAs1111ToBGRA4, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + false, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_8_8_8_8_GAMMA_EDRAM // Not usable as a texture. 
- {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, // k_2_10_10_10_FLOAT_EDRAM // Not usable as a texture. - {DXGI_FORMAT_UNKNOWN, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - false, - DXGI_FORMAT_UNKNOWN, - LoadMode::kUnknown, - {0, 1, 2, 3}}, + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, + DXGI_FORMAT_UNKNOWN, LoadMode::kUnknown, false, DXGI_FORMAT_UNKNOWN, + LoadMode::kUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, }; -const char* const TextureCache::dimension_names_[4] = {"1D", "2D", "3D", - "cube"}; - -const TextureCache::LoadModeInfo TextureCache::load_mode_info_[] = { +const D3D12TextureCache::LoadModeInfo D3D12TextureCache::load_mode_info_[] = { {shaders::texture_load_8bpb_cs, sizeof(shaders::texture_load_8bpb_cs), shaders::texture_load_8bpb_scaled_cs, sizeof(shaders::texture_load_8bpb_scaled_cs), 3, 4, 16}, @@ -848,27 +443,32 @@ const TextureCache::LoadModeInfo TextureCache::load_mode_info_[] = { sizeof(shaders::texture_load_depth_float_scaled_cs), 4, 4, 8}, }; -TextureCache::TextureCache(D3D12CommandProcessor& command_processor, - const RegisterFile& register_file, - D3D12SharedMemory& shared_memory, - bool bindless_resources_used, - uint32_t draw_resolution_scale_x, - uint32_t draw_resolution_scale_y) - : command_processor_(command_processor), - register_file_(register_file), - shared_memory_(shared_memory), - bindless_resources_used_(bindless_resources_used), - draw_resolution_scale_x_(draw_resolution_scale_x), - draw_resolution_scale_y_(draw_resolution_scale_y) { - assert_true(draw_resolution_scale_x >= 1); - assert_true(draw_resolution_scale_x <= 
kMaxDrawResolutionScaleAlongAxis); - assert_true(draw_resolution_scale_y >= 1); - assert_true(draw_resolution_scale_y <= kMaxDrawResolutionScaleAlongAxis); +D3D12TextureCache::D3D12TextureCache(const RegisterFile& register_file, + D3D12SharedMemory& shared_memory, + uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y, + D3D12CommandProcessor& command_processor, + bool bindless_resources_used) + : TextureCache(register_file, shared_memory, draw_resolution_scale_x, + draw_resolution_scale_y), + command_processor_(command_processor), + bindless_resources_used_(bindless_resources_used) {} + +D3D12TextureCache::~D3D12TextureCache() { + // While the texture descriptor cache still exists (referenced by + // ~D3D12Texture), destroy all textures. + DestroyAllTextures(true); + + // First release the buffers to detach them from the heaps. + for (std::unique_ptr& scaled_resolve_buffer_ptr : + scaled_resolve_2gb_buffers_) { + scaled_resolve_buffer_ptr.reset(); + } + scaled_resolve_heaps_.clear(); + COUNT_profile_set("gpu/texture_cache/scaled_resolve_buffer_used_mb", 0); } -TextureCache::~TextureCache() { Shutdown(); } - -bool TextureCache::Initialize() { +bool D3D12TextureCache::Initialize() { const ui::d3d12::D3D12Provider& provider = command_processor_.GetD3D12Provider(); ID3D12Device* device = provider.GetDevice(); @@ -881,15 +481,9 @@ bool TextureCache::Initialize() { assert_true(scaled_resolve_heaps_.empty()); uint64_t scaled_resolve_address_space_size = uint64_t(SharedMemory::kBufferSize) * - (draw_resolution_scale_x_ * draw_resolution_scale_y_); + (draw_resolution_scale_x() * draw_resolution_scale_y()); scaled_resolve_heaps_.resize(size_t(scaled_resolve_address_space_size >> kScaledResolveHeapSizeLog2)); - constexpr uint32_t kScaledResolvePageDwordCount = - SharedMemory::kBufferSize / 4096 / 32; - scaled_resolve_pages_ = new uint32_t[kScaledResolvePageDwordCount]; - std::memset(scaled_resolve_pages_, 0, - kScaledResolvePageDwordCount * sizeof(uint32_t)); 
- std::memset(scaled_resolve_pages_l2_, 0, sizeof(scaled_resolve_pages_l2_)); } scaled_resolve_heap_count_ = 0; @@ -929,40 +523,39 @@ bool TextureCache::Initialize() { root_signature_desc.NumStaticSamplers = 0; root_signature_desc.pStaticSamplers = nullptr; root_signature_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE; - load_root_signature_ = + *(load_root_signature_.ReleaseAndGetAddressOf()) = ui::d3d12::util::CreateRootSignature(provider, root_signature_desc); - if (load_root_signature_ == nullptr) { + if (!load_root_signature_) { XELOGE( "D3D12TextureCache: Failed to create the texture loading root " "signature"); - Shutdown(); return false; } // Create the loading pipelines. for (uint32_t i = 0; i < uint32_t(LoadMode::kCount); ++i) { const LoadModeInfo& load_mode_info = load_mode_info_[i]; - load_pipelines_[i] = ui::d3d12::util::CreateComputePipeline( - device, load_mode_info.shader, load_mode_info.shader_size, - load_root_signature_); - if (load_pipelines_[i] == nullptr) { + *(load_pipelines_[i].ReleaseAndGetAddressOf()) = + ui::d3d12::util::CreateComputePipeline(device, load_mode_info.shader, + load_mode_info.shader_size, + load_root_signature_.Get()); + if (!load_pipelines_[i]) { XELOGE( "D3D12TextureCache: Failed to create the texture loading pipeline " "for mode {}", i); - Shutdown(); return false; } if (IsDrawResolutionScaled() && load_mode_info.shader_scaled) { - load_pipelines_scaled_[i] = ui::d3d12::util::CreateComputePipeline( - device, load_mode_info.shader_scaled, - load_mode_info.shader_scaled_size, load_root_signature_); - if (load_pipelines_scaled_[i] == nullptr) { + *(load_pipelines_scaled_[i].ReleaseAndGetAddressOf()) = + ui::d3d12::util::CreateComputePipeline( + device, load_mode_info.shader_scaled, + load_mode_info.shader_scaled_size, load_root_signature_.Get()); + if (!load_pipelines_scaled_[i]) { XELOGE( "D3D12TextureCache: Failed to create the resolution-scaled texture " "loading pipeline for mode {}", i); - Shutdown(); return false; } } @@ 
-985,7 +578,6 @@ bool TextureCache::Initialize() { XELOGE( "D3D12TextureCache: Failed to create the descriptor heap for null " "SRVs"); - Shutdown(); return false; } null_srv_descriptor_heap_start_ = @@ -1027,98 +619,27 @@ bool TextureCache::Initialize() { provider.OffsetViewDescriptor(null_srv_descriptor_heap_start_, uint32_t(NullSRVDescriptorIndex::kCube))); - if (IsDrawResolutionScaled()) { - scaled_resolve_global_watch_handle_ = shared_memory_.RegisterGlobalWatch( - ScaledResolveGlobalWatchCallbackThunk, this); - } - - texture_current_usage_time_ = xe::Clock::QueryHostUptimeMillis(); - return true; } -void TextureCache::Shutdown() { - ClearCache(); - - if (scaled_resolve_global_watch_handle_ != nullptr) { - shared_memory_.UnregisterGlobalWatch(scaled_resolve_global_watch_handle_); - scaled_resolve_global_watch_handle_ = nullptr; - } - - ui::d3d12::util::ReleaseAndNull(null_srv_descriptor_heap_); - - for (uint32_t i = 0; i < uint32_t(LoadMode::kCount); ++i) { - ui::d3d12::util::ReleaseAndNull(load_pipelines_scaled_[i]); - ui::d3d12::util::ReleaseAndNull(load_pipelines_[i]); - } - ui::d3d12::util::ReleaseAndNull(load_root_signature_); - - if (scaled_resolve_pages_ != nullptr) { - delete[] scaled_resolve_pages_; - scaled_resolve_pages_ = nullptr; - } - // First free the buffers to detach them from the heaps. - for (size_t i = 0; i < xe::countof(scaled_resolve_2gb_buffers_); ++i) { - ScaledResolveVirtualBuffer*& scaled_resolve_buffer = - scaled_resolve_2gb_buffers_[i]; - if (scaled_resolve_buffer) { - delete scaled_resolve_buffer; - scaled_resolve_buffer = nullptr; - } - } - for (ID3D12Heap* scaled_resolve_heap : scaled_resolve_heaps_) { - if (scaled_resolve_heap) { - scaled_resolve_heap->Release(); - } - } - scaled_resolve_heaps_.clear(); - scaled_resolve_heap_count_ = 0; - COUNT_profile_set("gpu/texture_cache/scaled_resolve_buffer_used_mb", 0); -} - -void TextureCache::ClearCache() { - // Destroy all the textures. 
- for (auto texture_pair : textures_) { - Texture* texture = texture_pair.second; - shared_memory_.UnwatchMemoryRange(texture->base_watch_handle); - shared_memory_.UnwatchMemoryRange(texture->mip_watch_handle); - // Bindful descriptor cache will be cleared entirely now, so only release - // bindless descriptors. - if (bindless_resources_used_) { - for (auto descriptor_pair : texture->srv_descriptors) { - command_processor_.ReleaseViewBindlessDescriptorImmediately( - descriptor_pair.second); - } - } - texture->resource->Release(); - delete texture; - } - textures_.clear(); - COUNT_profile_set("gpu/texture_cache/textures", 0); - textures_total_size_ = 0; - COUNT_profile_set("gpu/texture_cache/total_size_mb", 0); - texture_used_first_ = texture_used_last_ = nullptr; +void D3D12TextureCache::ClearCache() { + TextureCache::ClearCache(); // Clear texture descriptor cache. srv_descriptor_cache_free_.clear(); srv_descriptor_cache_allocated_ = 0; - for (auto& page : srv_descriptor_cache_) { - page.heap->Release(); - } srv_descriptor_cache_.clear(); } -void TextureCache::TextureFetchConstantWritten(uint32_t index) { - texture_bindings_in_sync_ &= ~(1u << index); -} +void D3D12TextureCache::BeginSubmission(uint64_t new_submission_index) { + TextureCache::BeginSubmission(new_submission_index); -void TextureCache::BeginSubmission() { // ExecuteCommandLists is a full UAV and aliasing barrier. 
if (IsDrawResolutionScaled()) { size_t scaled_resolve_buffer_count = GetScaledResolveBufferCount(); for (size_t i = 0; i < scaled_resolve_buffer_count; ++i) { ScaledResolveVirtualBuffer* scaled_resolve_buffer = - scaled_resolve_2gb_buffers_[i]; + scaled_resolve_2gb_buffers_[i].get(); if (scaled_resolve_buffer) { scaled_resolve_buffer->ClearUAVBarrierPending(); } @@ -1128,88 +649,14 @@ void TextureCache::BeginSubmission() { } } -void TextureCache::BeginFrame() { - // In case there was a failure creating something in the previous frame, make - // sure bindings are reset so a new attempt will surely be made if the texture - // is requested again. - ClearBindings(); +void D3D12TextureCache::BeginFrame() { + TextureCache::BeginFrame(); std::memset(unsupported_format_features_used_, 0, sizeof(unsupported_format_features_used_)); - - texture_current_usage_time_ = xe::Clock::QueryHostUptimeMillis(); - - // If memory usage is too high, destroy unused textures. - uint64_t completed_frame = command_processor_.GetCompletedFrame(); - // texture_cache_memory_limit_render_to_texture is assumed to be included in - // texture_cache_memory_limit_soft and texture_cache_memory_limit_hard, at 1x, - // so subtracting 1 from the scale. 
- uint32_t limit_scaled_resolve_add_mb = - cvars::texture_cache_memory_limit_render_to_texture * - (draw_resolution_scale_x_ * draw_resolution_scale_y_ - 1); - uint32_t limit_soft_mb = - cvars::texture_cache_memory_limit_soft + limit_scaled_resolve_add_mb; - uint32_t limit_hard_mb = - cvars::texture_cache_memory_limit_hard + limit_scaled_resolve_add_mb; - uint32_t limit_soft_lifetime = - cvars::texture_cache_memory_limit_soft_lifetime * 1000; - bool destroyed_any = false; - while (texture_used_first_ != nullptr) { - uint64_t total_size_mb = textures_total_size_ >> 20; - bool limit_hard_exceeded = total_size_mb >= limit_hard_mb; - if (total_size_mb < limit_soft_mb && !limit_hard_exceeded) { - break; - } - Texture* texture = texture_used_first_; - if (texture->last_usage_frame > completed_frame) { - break; - } - if (!limit_hard_exceeded && - (texture->last_usage_time + limit_soft_lifetime) > - texture_current_usage_time_) { - break; - } - destroyed_any = true; - // Remove the texture from the map. - auto found_texture_it = textures_.find(texture->key); - assert_true(found_texture_it != textures_.end()); - if (found_texture_it != textures_.end()) { - assert_true(found_texture_it->second == texture); - textures_.erase(found_texture_it); - } - // Unlink the texture. - texture_used_first_ = texture->used_next; - if (texture_used_first_ != nullptr) { - texture_used_first_->used_previous = nullptr; - } else { - texture_used_last_ = nullptr; - } - // Exclude the texture from the memory usage counter. - textures_total_size_ -= texture->resource_size; - // Destroy the texture. 
- shared_memory_.UnwatchMemoryRange(texture->base_watch_handle); - shared_memory_.UnwatchMemoryRange(texture->mip_watch_handle); - if (bindless_resources_used_) { - for (auto descriptor_pair : texture->srv_descriptors) { - command_processor_.ReleaseViewBindlessDescriptorImmediately( - descriptor_pair.second); - } - } else { - for (auto descriptor_pair : texture->srv_descriptors) { - srv_descriptor_cache_free_.push_back(descriptor_pair.second); - } - } - texture->resource->Release(); - delete texture; - } - if (destroyed_any) { - COUNT_profile_set("gpu/texture_cache/textures", textures_.size()); - COUNT_profile_set("gpu/texture_cache/total_size_mb", - uint32_t(textures_total_size_ >> 20)); - } } -void TextureCache::EndFrame() { +void D3D12TextureCache::EndFrame() { // Report used unsupported texture formats. bool unsupported_header_written = false; for (uint32_t i = 0; i < 64; ++i) { @@ -1229,131 +676,12 @@ void TextureCache::EndFrame() { } } -void TextureCache::RequestTextures(uint32_t used_texture_mask) { - const auto& regs = register_file_; - +void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES - if (texture_invalidated_.exchange(false, std::memory_order_acquire)) { - // Clear the bindings not only for this draw call, but entirely, because - // loading may be needed in some draw call later, which may have the same - // key for some binding as before the invalidation, but texture_invalidated_ - // being false (menu background in 4D5307E6). - for (size_t i = 0; i < xe::countof(texture_bindings_); ++i) { - texture_bindings_[i].Clear(); - } - texture_bindings_in_sync_ = 0; - } - - // Update the texture keys and the textures. 
- uint32_t textures_remaining = used_texture_mask; - uint32_t index = 0; - while (xe::bit_scan_forward(textures_remaining, &index)) { - uint32_t index_bit = uint32_t(1) << index; - textures_remaining &= ~index_bit; - if (texture_bindings_in_sync_ & index_bit) { - continue; - } - TextureBinding& binding = texture_bindings_[index]; - const auto& fetch = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + index * 6); - TextureKey old_key = binding.key; - uint8_t old_swizzled_signs = binding.swizzled_signs; - BindingInfoFromFetchConstant(fetch, binding.key, &binding.host_swizzle, - &binding.swizzled_signs); - texture_bindings_in_sync_ |= index_bit; - if (!binding.key.is_valid) { - binding.texture = nullptr; - binding.texture_signed = nullptr; - binding.descriptor_index = UINT32_MAX; - binding.descriptor_index_signed = UINT32_MAX; - continue; - } - - // Check if need to load the unsigned and the signed versions of the texture - // (if the format is emulated with different host bit representations for - // signed and unsigned - otherwise only the unsigned one is loaded). - bool key_changed = binding.key != old_key; - bool load_unsigned_data = false, load_signed_data = false; - if (IsSignedVersionSeparate(binding.key.format)) { - // Can reuse previously loaded unsigned/signed versions if the key is the - // same and the texture was previously bound as unsigned/signed - // respectively (checking the previous values of signedness rather than - // binding.texture != nullptr and binding.texture_signed != nullptr also - // prevents repeated attempts to load the texture if it has failed to - // load). - if (texture_util::IsAnySignNotSigned(binding.swizzled_signs)) { - if (key_changed || - !texture_util::IsAnySignNotSigned(old_swizzled_signs)) { - binding.texture = FindOrCreateTexture(binding.key); - binding.descriptor_index = - binding.texture - ? 
FindOrCreateTextureDescriptor(*binding.texture, false, - binding.host_swizzle) - : UINT32_MAX; - load_unsigned_data = true; - } - } else { - binding.texture = nullptr; - binding.descriptor_index = UINT32_MAX; - } - if (texture_util::IsAnySignSigned(binding.swizzled_signs)) { - if (key_changed || !texture_util::IsAnySignSigned(old_swizzled_signs)) { - TextureKey signed_key = binding.key; - signed_key.signed_separate = 1; - binding.texture_signed = FindOrCreateTexture(signed_key); - binding.descriptor_index_signed = - binding.texture_signed - ? FindOrCreateTextureDescriptor(*binding.texture_signed, true, - binding.host_swizzle) - : UINT32_MAX; - load_signed_data = true; - } - } else { - binding.texture_signed = nullptr; - binding.descriptor_index_signed = UINT32_MAX; - } - } else { - // Same resource for both unsigned and signed, but descriptor formats may - // be different. - if (key_changed) { - binding.texture = FindOrCreateTexture(binding.key); - load_unsigned_data = true; - } - binding.texture_signed = nullptr; - if (texture_util::IsAnySignNotSigned(binding.swizzled_signs)) { - if (key_changed || - !texture_util::IsAnySignNotSigned(old_swizzled_signs)) { - binding.descriptor_index = - binding.texture - ? FindOrCreateTextureDescriptor(*binding.texture, false, - binding.host_swizzle) - : UINT32_MAX; - } - } else { - binding.descriptor_index = UINT32_MAX; - } - if (texture_util::IsAnySignSigned(binding.swizzled_signs)) { - if (key_changed || !texture_util::IsAnySignSigned(old_swizzled_signs)) { - binding.descriptor_index_signed = - binding.texture - ? 
FindOrCreateTextureDescriptor(*binding.texture, true, - binding.host_swizzle) - : UINT32_MAX; - } - } else { - binding.descriptor_index_signed = UINT32_MAX; - } - } - if (load_unsigned_data && binding.texture != nullptr) { - LoadTextureData(binding.texture); - } - if (load_signed_data && binding.texture_signed != nullptr) { - LoadTextureData(binding.texture_signed); - } - } + TextureCache::RequestTextures(used_texture_mask); // Transition the textures to the needed usage - always in // NON_PIXEL_SHADER_RESOURCE | PIXEL_SHADER_RESOURCE states because barriers @@ -1361,87 +689,109 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) { // tracked separately, checks would be needed to make sure, if the same // texture is bound through different fetch constants to both VS and PS, it // would be in both states). - textures_remaining = used_texture_mask; + uint32_t textures_remaining = used_texture_mask; + uint32_t index; while (xe::bit_scan_forward(textures_remaining, &index)) { textures_remaining &= ~(uint32_t(1) << index); - TextureBinding& binding = texture_bindings_[index]; - if (binding.texture != nullptr) { - // Will be referenced by the command list, so mark as used. - MarkTextureUsed(binding.texture); - command_processor_.PushTransitionBarrier( - binding.texture->resource, binding.texture->state, - D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | - D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); - binding.texture->state = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | - D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE; + const TextureBinding* binding = GetValidTextureBinding(index); + if (!binding) { + continue; } - if (binding.texture_signed != nullptr) { - MarkTextureUsed(binding.texture_signed); + D3D12Texture* binding_texture = + static_cast(binding->texture); + if (binding_texture != nullptr) { + // Will be referenced by the command list, so mark as used. 
+ binding_texture->MarkAsUsed(); command_processor_.PushTransitionBarrier( - binding.texture_signed->resource, binding.texture_signed->state, + binding_texture->resource(), + binding_texture->SetResourceState( + D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE), D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); - binding.texture_signed->state = + } + D3D12Texture* binding_texture_signed = + static_cast(binding->texture_signed); + if (binding_texture_signed != nullptr) { + binding_texture_signed->MarkAsUsed(); + command_processor_.PushTransitionBarrier( + binding_texture_signed->resource(), + binding_texture_signed->SetResourceState( + D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE), D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | - D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE; + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); } } } -bool TextureCache::AreActiveTextureSRVKeysUpToDate( +bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate( const TextureSRVKey* keys, const D3D12Shader::TextureBinding* host_shader_bindings, size_t host_shader_binding_count) const { for (size_t i = 0; i < host_shader_binding_count; ++i) { const TextureSRVKey& key = keys[i]; - const TextureBinding& binding = - texture_bindings_[host_shader_bindings[i].fetch_constant]; - if (key.key != binding.key || key.host_swizzle != binding.host_swizzle || - key.swizzled_signs != binding.swizzled_signs) { + const TextureBinding* binding = + GetValidTextureBinding(host_shader_bindings[i].fetch_constant); + if (!binding) { + if (key.key.is_valid) { + return false; + } + continue; + } + if (key.key != binding->key || key.host_swizzle != binding->host_swizzle || + key.swizzled_signs != binding->swizzled_signs) { return false; } } return true; } -void TextureCache::WriteActiveTextureSRVKeys( +void D3D12TextureCache::WriteActiveTextureSRVKeys( TextureSRVKey* keys, const 
D3D12Shader::TextureBinding* host_shader_bindings, size_t host_shader_binding_count) const { for (size_t i = 0; i < host_shader_binding_count; ++i) { TextureSRVKey& key = keys[i]; - const TextureBinding& binding = - texture_bindings_[host_shader_bindings[i].fetch_constant]; - key.key = binding.key; - key.host_swizzle = binding.host_swizzle; - key.swizzled_signs = binding.swizzled_signs; + const TextureBinding* binding = + GetValidTextureBinding(host_shader_bindings[i].fetch_constant); + if (!binding) { + key.key.MakeInvalid(); + key.host_swizzle = xenos::XE_GPU_TEXTURE_SWIZZLE_0000; + key.swizzled_signs = kSwizzledSignsUnsigned; + continue; + } + key.key = binding->key; + key.host_swizzle = binding->host_swizzle; + key.swizzled_signs = binding->swizzled_signs; } } -void TextureCache::WriteActiveTextureBindfulSRV( +void D3D12TextureCache::WriteActiveTextureBindfulSRV( const D3D12Shader::TextureBinding& host_shader_binding, D3D12_CPU_DESCRIPTOR_HANDLE handle) { assert_false(bindless_resources_used_); - const TextureBinding& binding = - texture_bindings_[host_shader_binding.fetch_constant]; uint32_t descriptor_index = UINT32_MAX; Texture* texture = nullptr; - if (binding.key.is_valid && - AreDimensionsCompatible(host_shader_binding.dimension, - binding.key.dimension)) { + uint32_t fetch_constant_index = host_shader_binding.fetch_constant; + const TextureBinding* binding = GetValidTextureBinding(fetch_constant_index); + if (binding && AreDimensionsCompatible(host_shader_binding.dimension, + binding->key.dimension)) { + const D3D12TextureBinding& d3d12_binding = + d3d12_texture_bindings_[fetch_constant_index]; if (host_shader_binding.is_signed) { // Not supporting signed compressed textures - hopefully DXN and DXT5A are // not used as signed. - if (texture_util::IsAnySignSigned(binding.swizzled_signs)) { - descriptor_index = binding.descriptor_index_signed; - texture = IsSignedVersionSeparate(binding.key.format) - ? 
binding.texture_signed - : binding.texture; + if (texture_util::IsAnySignSigned(binding->swizzled_signs)) { + descriptor_index = d3d12_binding.descriptor_index_signed; + texture = IsSignedVersionSeparateForFormat(binding->key) + ? binding->texture_signed + : binding->texture; } } else { - if (texture_util::IsAnySignNotSigned(binding.swizzled_signs)) { - descriptor_index = binding.descriptor_index; - texture = binding.texture; + if (texture_util::IsAnySignNotSigned(binding->swizzled_signs)) { + descriptor_index = d3d12_binding.descriptor_index; + texture = binding->texture; } } } @@ -1450,7 +800,7 @@ void TextureCache::WriteActiveTextureBindfulSRV( D3D12_CPU_DESCRIPTOR_HANDLE source_handle; if (descriptor_index != UINT32_MAX) { assert_not_null(texture); - MarkTextureUsed(texture); + texture->MarkAsUsed(); source_handle = GetTextureDescriptorCPUHandle(descriptor_index); } else { NullSRVDescriptorIndex null_descriptor_index; @@ -1475,7 +825,7 @@ void TextureCache::WriteActiveTextureBindfulSRV( #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_i( "gpu", - "xe::gpu::d3d12::TextureCache::WriteActiveTextureBindfulSRV->" + "xe::gpu::d3d12::D3D12TextureCache::WriteActiveTextureBindfulSRV->" "CopyDescriptorsSimple"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES device->CopyDescriptorsSimple(1, handle, source_handle, @@ -1483,18 +833,19 @@ void TextureCache::WriteActiveTextureBindfulSRV( } } -uint32_t TextureCache::GetActiveTextureBindlessSRVIndex( +uint32_t D3D12TextureCache::GetActiveTextureBindlessSRVIndex( const D3D12Shader::TextureBinding& host_shader_binding) { assert_true(bindless_resources_used_); uint32_t descriptor_index = UINT32_MAX; - const TextureBinding& binding = - texture_bindings_[host_shader_binding.fetch_constant]; - if (binding.key.is_valid && - AreDimensionsCompatible(host_shader_binding.dimension, - binding.key.dimension)) { + uint32_t fetch_constant_index = host_shader_binding.fetch_constant; + const TextureBinding* binding = 
GetValidTextureBinding(fetch_constant_index); + if (binding && AreDimensionsCompatible(host_shader_binding.dimension, + binding->key.dimension)) { + const D3D12TextureBinding& d3d12_binding = + d3d12_texture_bindings_[fetch_constant_index]; descriptor_index = host_shader_binding.is_signed - ? binding.descriptor_index_signed - : binding.descriptor_index; + ? d3d12_binding.descriptor_index_signed + : d3d12_binding.descriptor_index; } if (descriptor_index == UINT32_MAX) { switch (host_shader_binding.dimension) { @@ -1517,9 +868,9 @@ uint32_t TextureCache::GetActiveTextureBindlessSRVIndex( return descriptor_index; } -TextureCache::SamplerParameters TextureCache::GetSamplerParameters( +D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters( const D3D12Shader::SamplerBinding& binding) const { - const auto& regs = register_file_; + const auto& regs = register_file(); const auto& fetch = regs.Get( XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + binding.fetch_constant * 6); @@ -1567,8 +918,8 @@ TextureCache::SamplerParameters TextureCache::GetSamplerParameters( return parameters; } -void TextureCache::WriteSampler(SamplerParameters parameters, - D3D12_CPU_DESCRIPTOR_HANDLE handle) const { +void D3D12TextureCache::WriteSampler(SamplerParameters parameters, + D3D12_CPU_DESCRIPTOR_HANDLE handle) const { D3D12_SAMPLER_DESC desc; if (parameters.aniso_filter != xenos::AnisoFilter::kDisabled) { desc.Filter = D3D12_FILTER_ANISOTROPIC; @@ -1628,52 +979,18 @@ void TextureCache::WriteSampler(SamplerParameters parameters, device->CreateSampler(&desc, handle); } -void TextureCache::MarkRangeAsResolved(uint32_t start_unscaled, - uint32_t length_unscaled) { - if (length_unscaled == 0) { - return; - } - start_unscaled &= 0x1FFFFFFF; - length_unscaled = std::min(length_unscaled, 0x20000000 - start_unscaled); - - if (IsDrawResolutionScaled()) { - uint32_t page_first = start_unscaled >> 12; - uint32_t page_last = (start_unscaled + length_unscaled - 1) >> 12; - uint32_t 
block_first = page_first >> 5; - uint32_t block_last = page_last >> 5; - auto global_lock = global_critical_region_.Acquire(); - for (uint32_t i = block_first; i <= block_last; ++i) { - uint32_t add_bits = UINT32_MAX; - if (i == block_first) { - add_bits &= ~((1u << (page_first & 31)) - 1); - } - if (i == block_last && (page_last & 31) != 31) { - add_bits &= (1u << ((page_last & 31) + 1)) - 1; - } - scaled_resolve_pages_[i] |= add_bits; - scaled_resolve_pages_l2_[i >> 6] |= 1ull << (i & 63); - } - } - - // Invalidate textures. Toggling individual textures between scaled and - // unscaled also relies on invalidation through shared memory. - shared_memory_.RangeWrittenByGpu(start_unscaled, length_unscaled, true); -} - -void TextureCache::ClampDrawResolutionScaleToSupportedRange( +bool D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported( uint32_t& scale_x, uint32_t& scale_y, const ui::d3d12::D3D12Provider& provider) { + bool was_clamped; if (provider.GetTiledResourcesTier() < D3D12_TILED_RESOURCES_TIER_1) { + was_clamped = scale_x > 1 || scale_y > 1; scale_x = 1; scale_y = 1; - return; + return !was_clamped; } - // Ensure it's not zero. - scale_x = std::max(scale_x, uint32_t(1)); - scale_y = std::max(scale_y, uint32_t(1)); - scale_x = std::min(scale_x, kMaxDrawResolutionScaleAlongAxis); - scale_y = std::min(scale_y, kMaxDrawResolutionScaleAlongAxis); // Limit to the virtual address space available for a resource. + was_clamped = false; uint32_t virtual_address_bits_per_resource = provider.GetVirtualAddressBitsPerResource(); while (scale_x > 1 || scale_y > 1) { @@ -1686,15 +1003,17 @@ void TextureCache::ClampDrawResolutionScaleToSupportedRange( // When reducing from a square size, prefer decreasing the horizontal // resolution as vertical resolution difference is visible more clearly in // perspective. 
+ was_clamped = true; if (scale_x >= scale_y) { --scale_x; } else { --scale_y; } } + return !was_clamped; } -bool TextureCache::EnsureScaledResolveMemoryCommitted( +bool D3D12TextureCache::EnsureScaledResolveMemoryCommitted( uint32_t start_unscaled, uint32_t length_unscaled) { assert_true(IsDrawResolutionScaled()); @@ -1708,7 +1027,7 @@ bool TextureCache::EnsureScaledResolveMemoryCommitted( } uint32_t draw_resolution_scale_area = - draw_resolution_scale_x_ * draw_resolution_scale_y_; + draw_resolution_scale_x() * draw_resolution_scale_y(); uint64_t first_scaled = uint64_t(start_unscaled) * draw_resolution_scale_area; uint64_t last_scaled = uint64_t(start_unscaled + (length_unscaled - 1)) * draw_resolution_scale_area; @@ -1755,15 +1074,17 @@ bool TextureCache::EnsureScaledResolveMemoryCommitted( return false; } scaled_resolve_2gb_buffers_[i] = - new ScaledResolveVirtualBuffer(scaled_resolve_buffer_resource, - kScaledResolveVirtualBufferInitialState); + std::unique_ptr( + new ScaledResolveVirtualBuffer( + scaled_resolve_buffer_resource, + kScaledResolveVirtualBufferInitialState)); scaled_resolve_buffer_resource->Release(); } uint32_t heap_first = uint32_t(first_scaled >> kScaledResolveHeapSizeLog2); uint32_t heap_last = uint32_t(last_scaled >> kScaledResolveHeapSizeLog2); for (uint32_t i = heap_first; i <= heap_last; ++i) { - if (scaled_resolve_heaps_[i] != nullptr) { + if (scaled_resolve_heaps_[i]) { continue; } auto direct_queue = provider.GetDirectQueue(); @@ -1772,10 +1093,10 @@ bool TextureCache::EnsureScaledResolveMemoryCommitted( heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT; heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS | provider.GetHeapFlagCreateNotZeroed(); - ID3D12Heap* scaled_resolve_heap; + Microsoft::WRL::ComPtr scaled_resolve_heap; if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&scaled_resolve_heap)))) { - XELOGE("Texture cache: Failed to create a scaled resolve tile heap"); + XELOGE("D3D12TextureCache: Failed to create a scaled 
resolve tile heap"); return false; } scaled_resolve_heaps_[i] = scaled_resolve_heap; @@ -1809,7 +1130,7 @@ bool TextureCache::EnsureScaledResolveMemoryCommitted( D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES); direct_queue->UpdateTileMappings( scaled_resolve_2gb_buffers_[buffer_index]->resource(), 1, - ®ion_start_coordinates, ®ion_size, scaled_resolve_heap, 1, + ®ion_start_coordinates, ®ion_size, scaled_resolve_heap.Get(), 1, &range_flags, &heap_range_start_offset, &range_tile_count, D3D12_TILE_MAPPING_FLAG_NONE); } @@ -1818,8 +1139,8 @@ bool TextureCache::EnsureScaledResolveMemoryCommitted( return true; } -bool TextureCache::MakeScaledResolveRangeCurrent(uint32_t start_unscaled, - uint32_t length_unscaled) { +bool D3D12TextureCache::MakeScaledResolveRangeCurrent( + uint32_t start_unscaled, uint32_t length_unscaled) { assert_true(IsDrawResolutionScaled()); if (!length_unscaled || start_unscaled >= SharedMemory::kBufferSize || @@ -1830,7 +1151,7 @@ bool TextureCache::MakeScaledResolveRangeCurrent(uint32_t start_unscaled, } uint32_t draw_resolution_scale_area = - draw_resolution_scale_x_ * draw_resolution_scale_y_; + draw_resolution_scale_x() * draw_resolution_scale_y(); uint64_t start_scaled = uint64_t(start_unscaled) * draw_resolution_scale_area; uint64_t length_scaled = uint64_t(length_unscaled) * draw_resolution_scale_area; @@ -1903,7 +1224,7 @@ bool TextureCache::MakeScaledResolveRangeCurrent(uint32_t start_unscaled, // Switch the current buffer for the range. 
const ScaledResolveVirtualBuffer* new_buffer = - scaled_resolve_2gb_buffers_[new_buffer_index]; + scaled_resolve_2gb_buffers_[new_buffer_index].get(); assert_not_null(new_buffer); ID3D12Resource* new_buffer_resource = new_buffer->resource(); for (size_t i = gigabyte_first; i <= gigabyte_last; ++i) { @@ -1914,7 +1235,7 @@ bool TextureCache::MakeScaledResolveRangeCurrent(uint32_t start_unscaled, } if (gigabyte_current_buffer_index != SIZE_MAX) { ScaledResolveVirtualBuffer* gigabyte_current_buffer = - scaled_resolve_2gb_buffers_[gigabyte_current_buffer_index]; + scaled_resolve_2gb_buffers_[gigabyte_current_buffer_index].get(); assert_not_null(gigabyte_current_buffer); command_processor_.PushAliasingBarrier( gigabyte_current_buffer->resource(), new_buffer_resource); @@ -1929,7 +1250,7 @@ bool TextureCache::MakeScaledResolveRangeCurrent(uint32_t start_unscaled, return true; } -void TextureCache::TransitionCurrentScaledResolveRange( +void D3D12TextureCache::TransitionCurrentScaledResolveRange( D3D12_RESOURCE_STATES new_state) { assert_true(IsDrawResolutionScaled()); ScaledResolveVirtualBuffer& buffer = GetCurrentScaledResolveBuffer(); @@ -1937,12 +1258,12 @@ void TextureCache::TransitionCurrentScaledResolveRange( buffer.resource(), buffer.SetResourceState(new_state), new_state); } -void TextureCache::CreateCurrentScaledResolveRangeUintPow2SRV( +void D3D12TextureCache::CreateCurrentScaledResolveRangeUintPow2SRV( D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { assert_true(IsDrawResolutionScaled()); size_t buffer_index = GetCurrentScaledResolveBufferIndex(); const ScaledResolveVirtualBuffer* buffer = - scaled_resolve_2gb_buffers_[buffer_index]; + scaled_resolve_2gb_buffers_[buffer_index].get(); assert_not_null(buffer); ui::d3d12::util::CreateBufferTypedSRV( command_processor_.GetD3D12Provider().GetDevice(), handle, @@ -1955,12 +1276,12 @@ void TextureCache::CreateCurrentScaledResolveRangeUintPow2SRV( element_size_bytes_pow2); } -void 
TextureCache::CreateCurrentScaledResolveRangeUintPow2UAV( +void D3D12TextureCache::CreateCurrentScaledResolveRangeUintPow2UAV( D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { assert_true(IsDrawResolutionScaled()); size_t buffer_index = GetCurrentScaledResolveBufferIndex(); const ScaledResolveVirtualBuffer* buffer = - scaled_resolve_2gb_buffers_[buffer_index]; + scaled_resolve_2gb_buffers_[buffer_index].get(); assert_not_null(buffer); ui::d3d12::util::CreateBufferTypedUAV( command_processor_.GetD3D12Provider().GetDevice(), handle, @@ -1973,46 +1294,66 @@ void TextureCache::CreateCurrentScaledResolveRangeUintPow2UAV( element_size_bytes_pow2); } -ID3D12Resource* TextureCache::RequestSwapTexture( +ID3D12Resource* D3D12TextureCache::RequestSwapTexture( D3D12_SHADER_RESOURCE_VIEW_DESC& srv_desc_out, xenos::TextureFormat& format_out) { - const auto& regs = register_file_; + const auto& regs = register_file(); const auto& fetch = regs.Get( XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0); TextureKey key; - uint32_t swizzle; - BindingInfoFromFetchConstant(fetch, key, &swizzle, nullptr); - if (key.base_page == 0 || + BindingInfoFromFetchConstant(fetch, key, nullptr); + if (!key.is_valid || key.base_page == 0 || key.dimension != xenos::DataDimension::k2DOrStacked) { return nullptr; } - Texture* texture = FindOrCreateTexture(key); - if (texture == nullptr || !LoadTextureData(texture)) { + D3D12Texture* texture = static_cast(FindOrCreateTexture(key)); + if (texture == nullptr || !LoadTextureData(*texture)) { return nullptr; } - MarkTextureUsed(texture); + texture->MarkAsUsed(); // The swap texture is likely to be used only for the presentation pixel // shader, and not during emulation, where it'd be NON_PIXEL_SHADER_RESOURCE | // PIXEL_SHADER_RESOURCE. 
+ ID3D12Resource* texture_resource = texture->resource(); command_processor_.PushTransitionBarrier( - texture->resource, texture->state, + texture_resource, + texture->SetResourceState(D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE), D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); - texture->state = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE; srv_desc_out.Format = GetDXGIUnormFormat(key); srv_desc_out.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D; srv_desc_out.Shader4ComponentMapping = - swizzle | + GuestToHostSwizzle(fetch.swizzle, GetHostFormatSwizzle(key)) | D3D12_SHADER_COMPONENT_MAPPING_ALWAYS_SET_BIT_AVOIDING_ZEROMEM_MISTAKES; srv_desc_out.Texture2D.MostDetailedMip = 0; srv_desc_out.Texture2D.MipLevels = 1; srv_desc_out.Texture2D.PlaneSlice = 0; srv_desc_out.Texture2D.ResourceMinLODClamp = 0.0f; format_out = key.format; - return texture->resource; + return texture_resource; } -bool TextureCache::IsDecompressionNeeded(xenos::TextureFormat format, - uint32_t width, uint32_t height) { +D3D12TextureCache::D3D12Texture::D3D12Texture( + D3D12TextureCache& texture_cache, const TextureKey& key, + ID3D12Resource* resource, D3D12_RESOURCE_STATES resource_state) + : Texture(texture_cache, key), + resource_(resource), + resource_state_(resource_state) { + ID3D12Device* device = + texture_cache.command_processor_.GetD3D12Provider().GetDevice(); + D3D12_RESOURCE_DESC resource_desc = resource_->GetDesc(); + SetHostMemoryUsage( + device->GetResourceAllocationInfo(0, 1, &resource_desc).SizeInBytes); +} + +D3D12TextureCache::D3D12Texture::~D3D12Texture() { + auto& d3d12_texture_cache = static_cast(texture_cache()); + for (const auto& descriptor_pair : srv_descriptors_) { + d3d12_texture_cache.ReleaseTextureDescriptor(descriptor_pair.second); + } +} + +bool D3D12TextureCache::IsDecompressionNeeded(xenos::TextureFormat format, + uint32_t width, uint32_t height) { DXGI_FORMAT dxgi_format_uncompressed = host_formats_[uint32_t(format)].dxgi_format_uncompressed; if 
(dxgi_format_uncompressed == DXGI_FORMAT_UNKNOWN) { @@ -2023,7 +1364,7 @@ bool TextureCache::IsDecompressionNeeded(xenos::TextureFormat format, (height & (format_info->block_height - 1)) != 0; } -TextureCache::LoadMode TextureCache::GetLoadMode(TextureKey key) { +D3D12TextureCache::LoadMode D3D12TextureCache::GetLoadMode(TextureKey key) { const HostFormat& host_format = host_formats_[uint32_t(key.format)]; if (key.signed_separate) { return host_format.load_mode_snorm; @@ -2034,204 +1375,59 @@ TextureCache::LoadMode TextureCache::GetLoadMode(TextureKey key) { return host_format.load_mode; } -void TextureCache::BindingInfoFromFetchConstant( - const xenos::xe_gpu_texture_fetch_t& fetch, TextureKey& key_out, - uint32_t* host_swizzle_out, uint8_t* swizzled_signs_out) { - // Reset the key and the swizzle. - key_out.MakeInvalid(); - if (host_swizzle_out != nullptr) { - *host_swizzle_out = - xenos::XE_GPU_SWIZZLE_0 | (xenos::XE_GPU_SWIZZLE_0 << 3) | - (xenos::XE_GPU_SWIZZLE_0 << 6) | (xenos::XE_GPU_SWIZZLE_0 << 9); - } - if (swizzled_signs_out != nullptr) { - *swizzled_signs_out = - uint8_t(xenos::TextureSign::kUnsigned) * uint8_t(0b01010101); - } +bool D3D12TextureCache::IsSignedVersionSeparateForFormat(TextureKey key) const { + const HostFormat& host_format = host_formats_[uint32_t(key.format)]; + return host_format.load_mode_snorm != LoadMode::kUnknown && + host_format.load_mode_snorm != host_format.load_mode; +} - switch (fetch.type) { - case xenos::FetchConstantType::kTexture: - break; - case xenos::FetchConstantType::kInvalidTexture: - if (cvars::gpu_allow_invalid_fetch_constants) { - break; - } - XELOGW( - "Texture fetch constant ({:08X} {:08X} {:08X} {:08X} {:08X} {:08X}) " - "has \"invalid\" type! 
This is incorrect behavior, but you can try " - "bypassing this by launching Xenia with " - "--gpu_allow_invalid_fetch_constants=true.", - fetch.dword_0, fetch.dword_1, fetch.dword_2, fetch.dword_3, - fetch.dword_4, fetch.dword_5); - return; +bool D3D12TextureCache::IsScaledResolveSupportedForFormat( + TextureKey key) const { + LoadMode load_mode = GetLoadMode(key); + return load_mode != LoadMode::kUnknown && + load_pipelines_scaled_[uint32_t(load_mode)] != nullptr; +} + +uint32_t D3D12TextureCache::GetHostFormatSwizzle(TextureKey key) const { + return host_formats_[uint32_t(key.format)].swizzle; +} + +uint32_t D3D12TextureCache::GetMaxHostTextureWidthHeight( + xenos::DataDimension dimension) const { + switch (dimension) { + case xenos::DataDimension::k1D: + case xenos::DataDimension::k2DOrStacked: + // 1D and 2D are emulated as 2D arrays. + return D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION; + case xenos::DataDimension::k3D: + return D3D12_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + case xenos::DataDimension::kCube: + return D3D12_REQ_TEXTURECUBE_DIMENSION; default: - XELOGW( - "Texture fetch constant ({:08X} {:08X} {:08X} {:08X} {:08X} {:08X}) " - "is completely invalid!", - fetch.dword_0, fetch.dword_1, fetch.dword_2, fetch.dword_3, - fetch.dword_4, fetch.dword_5); - return; - } - - uint32_t width_minus_1, height_minus_1, depth_or_array_size_minus_1; - uint32_t base_page, mip_page, mip_max_level; - texture_util::GetSubresourcesFromFetchConstant( - fetch, &width_minus_1, &height_minus_1, &depth_or_array_size_minus_1, - &base_page, &mip_page, nullptr, &mip_max_level); - if (base_page == 0 && mip_page == 0) { - // No texture data at all. - return; - } - if (fetch.dimension == xenos::DataDimension::k1D) { - bool is_invalid_1d = false; - // TODO(Triang3l): Support long 1D textures. - if (width_minus_1 >= xenos::kTexture2DCubeMaxWidthHeight) { - XELOGE( - "1D texture is too wide ({}) - ignoring! 
Report the game to Xenia " - "developers", - width_minus_1 + 1); - is_invalid_1d = true; - } - assert_false(fetch.tiled); - if (fetch.tiled) { - XELOGE( - "1D texture has tiling enabled in the fetch constant, but this " - "appears to be completely wrong - ignoring! Report the game to Xenia " - "developers"); - is_invalid_1d = true; - } - assert_false(fetch.packed_mips); - if (fetch.packed_mips) { - XELOGE( - "1D texture has packed mips enabled in the fetch constant, but this " - "appears to be completely wrong - ignoring! Report the game to Xenia " - "developers"); - is_invalid_1d = true; - } - if (is_invalid_1d) { - return; - } - } - - xenos::TextureFormat format = GetBaseFormat(fetch.format); - - key_out.base_page = base_page; - key_out.mip_page = mip_page; - key_out.dimension = fetch.dimension; - key_out.width_minus_1 = width_minus_1; - key_out.height_minus_1 = height_minus_1; - key_out.depth_or_array_size_minus_1 = depth_or_array_size_minus_1; - key_out.pitch = fetch.pitch; - key_out.mip_max_level = mip_max_level; - key_out.tiled = fetch.tiled; - key_out.packed_mips = fetch.packed_mips; - key_out.format = format; - key_out.endianness = fetch.endianness; - - key_out.is_valid = 1; - - if (host_swizzle_out != nullptr) { - uint32_t host_swizzle = 0; - for (uint32_t i = 0; i < 4; ++i) { - uint32_t host_swizzle_component = (fetch.swizzle >> (i * 3)) & 0b111; - if (host_swizzle_component >= 4) { - // Get rid of 6 and 7 values (to prevent device losses if the game has - // something broken) the quick and dirty way - by changing them to 4 (0) - // and 5 (1). 
- host_swizzle_component &= 0b101; - } else { - host_swizzle_component = - host_formats_[uint32_t(format)].swizzle[host_swizzle_component]; - } - host_swizzle |= host_swizzle_component << (i * 3); - } - *host_swizzle_out = host_swizzle; - } - - if (swizzled_signs_out != nullptr) { - *swizzled_signs_out = texture_util::SwizzleSigns(fetch); + assert_unhandled_case(dimension); + return 0; } } -void TextureCache::LogTextureKeyAction(TextureKey key, const char* action) { - XELOGGPU( - "{} {} {}{}x{}x{} {} {} texture with {} {}packed mip level{}, " - "base at 0x{:08X} (pitch {}), mips at 0x{:08X}", - action, key.tiled ? "tiled" : "linear", - key.scaled_resolve ? "scaled " : "", key.GetWidth(), key.GetHeight(), - key.GetDepthOrArraySize(), dimension_names_[uint32_t(key.dimension)], - FormatInfo::Get(key.format)->name, key.mip_max_level + 1, - key.packed_mips ? "" : "un", key.mip_max_level != 0 ? "s" : "", - key.base_page << 12, key.pitch << 5, key.mip_page << 12); +uint32_t D3D12TextureCache::GetMaxHostTextureDepthOrArraySize( + xenos::DataDimension dimension) const { + switch (dimension) { + case xenos::DataDimension::k1D: + case xenos::DataDimension::k2DOrStacked: + // 1D and 2D are emulated as 2D arrays. + return D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION; + case xenos::DataDimension::k3D: + return D3D12_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + case xenos::DataDimension::kCube: + return D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION / 6 * 6; + default: + assert_unhandled_case(dimension); + return 0; + } } -void TextureCache::LogTextureAction(const Texture* texture, - const char* action) { - XELOGGPU( - "{} {} {}{}x{}x{} {} {} texture with {} {}packed mip level{}, " - "base at 0x{:08X} (pitch {}, size 0x{:08X}), mips at 0x{:08X} (size " - "0x{:08X})", - action, texture->key.tiled ? "tiled" : "linear", - texture->key.scaled_resolve ? 
"scaled " : "", texture->key.GetWidth(), - texture->key.GetHeight(), texture->key.GetDepthOrArraySize(), - dimension_names_[uint32_t(texture->key.dimension)], - FormatInfo::Get(texture->key.format)->name, - texture->key.mip_max_level + 1, texture->key.packed_mips ? "" : "un", - texture->key.mip_max_level != 0 ? "s" : "", texture->key.base_page << 12, - texture->key.pitch << 5, texture->GetGuestBaseSize(), - texture->key.mip_page << 12, texture->GetGuestMipsSize()); -} - -TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) { - // Check if the texture is a scaled resolve texture. - if (IsDrawResolutionScaled() && key.tiled) { - LoadMode load_mode = GetLoadMode(key); - if (load_mode != LoadMode::kUnknown && - load_pipelines_scaled_[uint32_t(load_mode)] != nullptr) { - texture_util::TextureGuestLayout scaled_resolve_guest_layout = - texture_util::GetGuestTextureLayout( - key.dimension, key.pitch, key.GetWidth(), key.GetHeight(), - key.GetDepthOrArraySize(), key.tiled, key.format, key.packed_mips, - key.base_page != 0, key.mip_max_level); - if ((scaled_resolve_guest_layout.base.level_data_extent_bytes && - IsRangeScaledResolved( - key.base_page << 12, - scaled_resolve_guest_layout.base.level_data_extent_bytes)) || - (scaled_resolve_guest_layout.mips_total_extent_bytes && - IsRangeScaledResolved( - key.mip_page << 12, - scaled_resolve_guest_layout.mips_total_extent_bytes))) { - key.scaled_resolve = 1; - } - } - } - uint32_t host_width = key.GetWidth(); - uint32_t host_height = key.GetHeight(); - if (key.scaled_resolve) { - host_width *= draw_resolution_scale_x_; - host_height *= draw_resolution_scale_y_; - } - // With 3x resolution scaling, a 2D texture may become bigger than the - // Direct3D 11 limit, and with 2x, a 3D one as well. 
- uint32_t max_host_width_height = GetMaxHostTextureWidthHeight(key.dimension); - uint32_t max_host_depth_or_array_size = - GetMaxHostTextureDepthOrArraySize(key.dimension); - if (host_width > max_host_width_height || - host_height > max_host_width_height || - key.GetDepthOrArraySize() > max_host_depth_or_array_size) { - return nullptr; - } - - // Try to find an existing texture. - // TODO(Triang3l): Reuse a texture with mip_page unchanged, but base_page - // previously 0, now not 0, to save memory - common case in streaming. - auto found_texture_it = textures_.find(key); - if (found_texture_it != textures_.end()) { - return found_texture_it->second; - } - - // Create the resource. If failed to create one, don't create a texture object - // at all so it won't be in indeterminate state. +std::unique_ptr D3D12TextureCache::CreateTexture( + TextureKey key) { D3D12_RESOURCE_DESC desc; desc.Format = GetDXGIResourceFormat(key); if (desc.Format == DXGI_FORMAT_UNKNOWN) { @@ -2246,8 +1442,12 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) { desc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D; } desc.Alignment = 0; - desc.Width = host_width; - desc.Height = host_height; + desc.Width = key.GetWidth(); + desc.Height = key.GetHeight(); + if (key.scaled_resolve) { + desc.Width *= draw_resolution_scale_x(); + desc.Height *= draw_resolution_scale_y(); + } desc.DepthOrArraySize = key.GetDepthOrArraySize(); desc.MipLevels = key.mip_max_level + 1; desc.SampleDesc.Count = 1; @@ -2260,142 +1460,68 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) { command_processor_.GetD3D12Provider(); ID3D12Device* device = provider.GetDevice(); // Assuming untiling will be the next operation. 
- D3D12_RESOURCE_STATES state = D3D12_RESOURCE_STATE_COPY_DEST; - ID3D12Resource* resource; + D3D12_RESOURCE_STATES resource_state = D3D12_RESOURCE_STATE_COPY_DEST; + Microsoft::WRL::ComPtr resource; if (FAILED(device->CreateCommittedResource( &ui::d3d12::util::kHeapPropertiesDefault, - provider.GetHeapFlagCreateNotZeroed(), &desc, state, nullptr, + provider.GetHeapFlagCreateNotZeroed(), &desc, resource_state, nullptr, IID_PPV_ARGS(&resource)))) { - LogTextureKeyAction(key, "Failed to create"); return nullptr; } - - // Create the texture object and add it to the map. - Texture* texture = new Texture; - texture->key = key; - texture->resource = resource; - texture->resource_size = - device->GetResourceAllocationInfo(0, 1, &desc).SizeInBytes; - texture->state = state; - texture->last_usage_frame = command_processor_.GetCurrentFrame(); - texture->last_usage_time = texture_current_usage_time_; - texture->used_previous = texture_used_last_; - texture->used_next = nullptr; - if (texture_used_last_ != nullptr) { - texture_used_last_->used_next = texture; - } else { - texture_used_first_ = texture; - } - texture_used_last_ = texture; - texture->base_resolved = key.scaled_resolve; - texture->mips_resolved = key.scaled_resolve; - texture->guest_layout = texture_util::GetGuestTextureLayout( - key.dimension, key.pitch, key.GetWidth(), key.GetHeight(), - key.GetDepthOrArraySize(), key.tiled, key.format, key.packed_mips, - key.base_page != 0, key.mip_max_level); - // Never try to upload data that doesn't exist. 
- texture->base_in_sync = !texture->guest_layout.base.level_data_extent_bytes; - texture->mips_in_sync = !texture->guest_layout.mips_total_extent_bytes; - texture->base_watch_handle = nullptr; - texture->mip_watch_handle = nullptr; - textures_.emplace(key, texture); - COUNT_profile_set("gpu/texture_cache/textures", textures_.size()); - textures_total_size_ += texture->resource_size; - COUNT_profile_set("gpu/texture_cache/total_size_mb", - uint32_t(textures_total_size_ >> 20)); - LogTextureAction(texture, "Created"); - - return texture; + return std::unique_ptr( + new D3D12Texture(*this, key, resource.Get(), resource_state)); } -bool TextureCache::LoadTextureData(Texture* texture) { - // See what we need to upload. - bool base_in_sync, mips_in_sync; - { - auto global_lock = global_critical_region_.Acquire(); - base_in_sync = texture->base_in_sync; - mips_in_sync = texture->mips_in_sync; - } - if (base_in_sync && mips_in_sync) { - return true; - } +bool D3D12TextureCache::LoadTextureDataFromResidentMemoryImpl(Texture& texture, + bool load_base, + bool load_mips) { + D3D12Texture& d3d12_texture = static_cast(texture); + TextureKey texture_key = d3d12_texture.key(); DeferredCommandList& command_list = command_processor_.GetDeferredCommandList(); ID3D12Device* device = command_processor_.GetD3D12Provider().GetDevice(); // Get the pipeline. - LoadMode load_mode = GetLoadMode(texture->key); + LoadMode load_mode = GetLoadMode(texture_key); if (load_mode == LoadMode::kUnknown) { return false; } - bool texture_resolution_scaled = texture->key.scaled_resolve; + bool texture_resolution_scaled = texture_key.scaled_resolve; ID3D12PipelineState* pipeline = - texture_resolution_scaled ? load_pipelines_scaled_[uint32_t(load_mode)] - : load_pipelines_[uint32_t(load_mode)]; + texture_resolution_scaled + ? 
load_pipelines_scaled_[uint32_t(load_mode)].Get() + : load_pipelines_[uint32_t(load_mode)].Get(); if (pipeline == nullptr) { return false; } const LoadModeInfo& load_mode_info = load_mode_info_[uint32_t(load_mode)]; - // Request uploading of the texture data to the shared memory. - // This is also necessary when resolution scale is used - the texture cache - // relies on shared memory for invalidation of both unscaled and scaled - // textures! Plus a texture may be unscaled partially, when only a portion of - // its pages is invalidated, in this case we'll need the texture from the - // shared memory to load the unscaled parts. - // TODO(Triang3l): Load unscaled parts. - bool base_resolved = texture->base_resolved; - if (!base_in_sync) { - if (!shared_memory_.RequestRange( - texture->key.base_page << 12, texture->GetGuestBaseSize(), - texture_resolution_scaled ? nullptr : &base_resolved)) { - return false; - } - } - bool mips_resolved = texture->mips_resolved; - if (!mips_in_sync) { - if (!shared_memory_.RequestRange( - texture->key.mip_page << 12, texture->GetGuestMipsSize(), - texture_resolution_scaled ? nullptr : &mips_resolved)) { - return false; - } - } - if (texture_resolution_scaled) { - // Make sure all heaps are created. - if (!EnsureScaledResolveMemoryCommitted(texture->key.base_page << 12, - texture->GetGuestBaseSize())) { - return false; - } - if (!EnsureScaledResolveMemoryCommitted(texture->key.mip_page << 12, - texture->GetGuestMipsSize())) { - return false; - } - } - // Get the guest layout. 
- xenos::DataDimension dimension = texture->key.dimension; + const texture_util::TextureGuestLayout& guest_layout = + d3d12_texture.guest_layout(); + xenos::DataDimension dimension = texture_key.dimension; bool is_3d = dimension == xenos::DataDimension::k3D; - uint32_t width = texture->key.GetWidth(); - uint32_t height = texture->key.GetHeight(); - uint32_t depth_or_array_size = texture->key.GetDepthOrArraySize(); + uint32_t width = texture_key.GetWidth(); + uint32_t height = texture_key.GetHeight(); + uint32_t depth_or_array_size = texture_key.GetDepthOrArraySize(); uint32_t depth = is_3d ? depth_or_array_size : 1; uint32_t array_size = is_3d ? 1 : depth_or_array_size; - xenos::TextureFormat guest_format = texture->key.format; + xenos::TextureFormat guest_format = texture_key.format; const FormatInfo* guest_format_info = FormatInfo::Get(guest_format); uint32_t block_width = guest_format_info->block_width; uint32_t block_height = guest_format_info->block_height; uint32_t bytes_per_block = guest_format_info->bytes_per_block(); - uint32_t level_first = base_in_sync ? 1 : 0; - uint32_t level_last = mips_in_sync ? 0 : texture->key.mip_max_level; + uint32_t level_first = load_base ? 0 : 1; + uint32_t level_last = load_mips ? texture_key.mip_max_level : 0; assert_true(level_first <= level_last); - uint32_t level_packed = texture->guest_layout.packed_level; + uint32_t level_packed = guest_layout.packed_level; uint32_t level_stored_first = std::min(level_first, level_packed); uint32_t level_stored_last = std::min(level_last, level_packed); uint32_t texture_resolution_scale_x = - texture_resolution_scaled ? draw_resolution_scale_x_ : 1; + texture_resolution_scaled ? draw_resolution_scale_x() : 1; uint32_t texture_resolution_scale_y = - texture_resolution_scaled ? draw_resolution_scale_y_ : 1; + texture_resolution_scaled ? draw_resolution_scale_y() : 1; // Get the host layout and the buffer. 
UINT64 copy_buffer_size = 0; @@ -2428,13 +1554,11 @@ bool TextureCache::LoadTextureData(Texture* texture) { if (!level_packed) { // Loading the packed tail for the base - load the whole tail to copy // regions out of it. - const texture_util::TextureGuestLayout::Level& guest_layout_base = - texture->guest_layout.base; host_slice_layout_base.Footprint.Width = - guest_layout_base.x_extent_blocks * block_width; + guest_layout.base.x_extent_blocks * block_width; host_slice_layout_base.Footprint.Height = - guest_layout_base.y_extent_blocks * block_height; - host_slice_layout_base.Footprint.Depth = guest_layout_base.z_extent; + guest_layout.base.y_extent_blocks * block_height; + host_slice_layout_base.Footprint.Depth = guest_layout.base.z_extent; } else { host_slice_layout_base.Footprint.Width = width; host_slice_layout_base.Footprint.Height = height; @@ -2471,7 +1595,7 @@ bool TextureCache::LoadTextureData(Texture* texture) { // Loading the packed tail for the mips - load the whole tail to copy // regions out of it. const texture_util::TextureGuestLayout::Level& - guest_layout_packed_mips = texture->guest_layout.mips[level]; + guest_layout_packed_mips = guest_layout.mips[level]; host_slice_layout_mip.Footprint.Width = guest_layout_packed_mips.x_extent_blocks * block_width; host_slice_layout_mip.Footprint.Height = @@ -2547,7 +1671,7 @@ bool TextureCache::LoadTextureData(Texture* texture) { } uint32_t descriptor_write_index = 0; command_processor_.SetExternalPipeline(pipeline); - command_list.D3DSetComputeRootSignature(load_root_signature_); + command_list.D3DSetComputeRootSignature(load_root_signature_.Get()); // Set up the destination descriptor. assert_true(descriptor_write_index < descriptor_count); ui::d3d12::util::DescriptorCpuGpuHandlePair descriptor_dest = @@ -2561,7 +1685,9 @@ bool TextureCache::LoadTextureData(Texture* texture) { // depend on the buffer being current, so they will be set later - for mips, // after loading the base is done). 
if (!texture_resolution_scaled) { - shared_memory_.UseForReading(); + D3D12SharedMemory& d3d12_shared_memory = + reinterpret_cast(shared_memory()); + d3d12_shared_memory.UseForReading(); ui::d3d12::util::DescriptorCpuGpuHandlePair descriptor_unscaled_source; if (bindless_resources_used_) { descriptor_unscaled_source = @@ -2571,7 +1697,7 @@ bool TextureCache::LoadTextureData(Texture* texture) { assert_true(descriptor_write_index < descriptor_count); descriptor_unscaled_source = descriptors_allocated[descriptor_write_index++]; - shared_memory_.WriteUintPow2SRVDescriptor( + d3d12_shared_memory.WriteUintPow2SRVDescriptor( descriptor_unscaled_source.first, load_mode_info.srv_bpe_log2); } command_list.D3DSetComputeRootDescriptorTable( @@ -2583,8 +1709,8 @@ bool TextureCache::LoadTextureData(Texture* texture) { auto& cbuffer_pool = command_processor_.GetConstantBufferPool(); LoadConstants load_constants; load_constants.is_tiled_3d_endian_scale = - uint32_t(texture->key.tiled) | (uint32_t(is_3d) << 1) | - (uint32_t(texture->key.endianness) << 2) | + uint32_t(texture_key.tiled) | (uint32_t(is_3d) << 1) | + (uint32_t(texture_key.endianness) << 2) | (texture_resolution_scale_x << 4) | (texture_resolution_scale_y << 6); // The loop counter can mean two things depending on whether the packed mip @@ -2616,13 +1742,13 @@ bool TextureCache::LoadTextureData(Texture* texture) { uint32_t level = (level_packed == 0) ? 0 : loop_level; uint32_t guest_address = - (is_base ? texture->key.base_page : texture->key.mip_page) << 12; + (is_base ? texture_key.base_page : texture_key.mip_page) << 12; // Set up the base or mips source, also making it accessible if loading from // scaled resolve memory. if (texture_resolution_scaled && (is_base || !scaled_mips_source_set_up)) { - uint32_t guest_size_unscaled = - is_base ? texture->GetGuestBaseSize() : texture->GetGuestMipsSize(); + uint32_t guest_size_unscaled = is_base ? 
d3d12_texture.GetGuestBaseSize() + : d3d12_texture.GetGuestMipsSize(); if (!MakeScaledResolveRangeCurrent(guest_address, guest_size_unscaled)) { command_processor_.ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state); @@ -2651,14 +1777,13 @@ bool TextureCache::LoadTextureData(Texture* texture) { } if (!is_base) { load_constants.guest_offset += - texture->guest_layout.mip_offsets_bytes[level] * + guest_layout.mip_offsets_bytes[level] * (texture_resolution_scale_x * texture_resolution_scale_y); } const texture_util::TextureGuestLayout::Level& level_guest_layout = - is_base ? texture->guest_layout.base - : texture->guest_layout.mips[level]; + is_base ? guest_layout.base : guest_layout.mips[level]; uint32_t level_guest_pitch = level_guest_layout.row_pitch_bytes; - if (texture->key.tiled) { + if (texture_key.tiled) { // Shaders expect pitch in blocks for tiled textures. level_guest_pitch /= bytes_per_block; assert_zero(level_guest_pitch & (xenos::kTextureTileWidthHeight - 1)); @@ -2727,21 +1852,23 @@ bool TextureCache::LoadTextureData(Texture* texture) { } // Update LRU caching because the texture will be used by the command list. - MarkTextureUsed(texture); + d3d12_texture.MarkAsUsed(); // Submit copying from the copy buffer to the host texture. 
- command_processor_.PushTransitionBarrier(texture->resource, texture->state, - D3D12_RESOURCE_STATE_COPY_DEST); - texture->state = D3D12_RESOURCE_STATE_COPY_DEST; + ID3D12Resource* texture_resource = d3d12_texture.resource(); + command_processor_.PushTransitionBarrier( + texture_resource, + d3d12_texture.SetResourceState(D3D12_RESOURCE_STATE_COPY_DEST), + D3D12_RESOURCE_STATE_COPY_DEST); command_processor_.PushTransitionBarrier(copy_buffer, copy_buffer_state, D3D12_RESOURCE_STATE_COPY_SOURCE); copy_buffer_state = D3D12_RESOURCE_STATE_COPY_SOURCE; command_processor_.SubmitBarriers(); - uint32_t texture_level_count = texture->key.mip_max_level + 1; + uint32_t texture_level_count = texture_key.mip_max_level + 1; D3D12_TEXTURE_COPY_LOCATION location_source, location_dest; location_source.pResource = copy_buffer; location_source.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; - location_dest.pResource = texture->resource; + location_dest.pResource = texture_resource; location_dest.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; for (uint32_t level = level_first; level <= level_last; ++level) { uint32_t guest_level = std::min(level, level_packed); @@ -2782,62 +1909,78 @@ bool TextureCache::LoadTextureData(Texture* texture) { command_processor_.ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state); - // Update the source of the texture (resolve vs. CPU or memexport) for - // purposes of handling piecewise gamma emulation via sRGB and for resolution - // scale in sampling offsets. - texture->base_resolved = base_resolved; - texture->mips_resolved = mips_resolved; - - // Mark the ranges as uploaded and watch them. This is needed for scaled - // resolves as well to detect when the CPU wants to reuse the memory for a - // regular texture or a vertex buffer, and thus the scaled resolve version is - // not up to date anymore. 
- { - auto global_lock = global_critical_region_.Acquire(); - texture->base_in_sync = true; - texture->mips_in_sync = true; - if (!base_in_sync) { - texture->base_watch_handle = shared_memory_.WatchMemoryRange( - texture->key.base_page << 12, texture->GetGuestBaseSize(), - WatchCallbackThunk, this, texture, 0); - } - if (!mips_in_sync) { - texture->mip_watch_handle = shared_memory_.WatchMemoryRange( - texture->key.mip_page << 12, texture->GetGuestMipsSize(), - WatchCallbackThunk, this, texture, 1); - } - } - - LogTextureAction(texture, "Loaded"); return true; } -uint32_t TextureCache::FindOrCreateTextureDescriptor(Texture& texture, - bool is_signed, - uint32_t host_swizzle) { +void D3D12TextureCache::UpdateTextureBindingsImpl( + uint32_t fetch_constant_mask) { + uint32_t bindings_remaining = fetch_constant_mask; + uint32_t binding_index; + while (xe::bit_scan_forward(bindings_remaining, &binding_index)) { + bindings_remaining &= ~(UINT32_C(1) << binding_index); + D3D12TextureBinding& d3d12_binding = d3d12_texture_bindings_[binding_index]; + d3d12_binding.Reset(); + const TextureBinding* binding = GetValidTextureBinding(binding_index); + if (!binding) { + continue; + } + if (IsSignedVersionSeparateForFormat(binding->key)) { + if (binding->texture && + texture_util::IsAnySignNotSigned(binding->swizzled_signs)) { + d3d12_binding.descriptor_index = FindOrCreateTextureDescriptor( + *static_cast(binding->texture), false, + binding->host_swizzle); + } + if (binding->texture_signed && + texture_util::IsAnySignSigned(binding->swizzled_signs)) { + d3d12_binding.descriptor_index_signed = FindOrCreateTextureDescriptor( + *static_cast(binding->texture_signed), true, + binding->host_swizzle); + } + } else { + D3D12Texture* texture = static_cast(binding->texture); + if (texture) { + if (texture_util::IsAnySignNotSigned(binding->swizzled_signs)) { + d3d12_binding.descriptor_index = FindOrCreateTextureDescriptor( + *texture, false, binding->host_swizzle); + } + if 
(texture_util::IsAnySignSigned(binding->swizzled_signs)) { + d3d12_binding.descriptor_index_signed = FindOrCreateTextureDescriptor( + *texture, true, binding->host_swizzle); + } + } + } + } +} + +uint32_t D3D12TextureCache::FindOrCreateTextureDescriptor( + D3D12Texture& texture, bool is_signed, uint32_t host_swizzle) { uint32_t descriptor_key = uint32_t(is_signed) | (host_swizzle << 1); // Try to find an existing descriptor. - auto it = texture.srv_descriptors.find(descriptor_key); - if (it != texture.srv_descriptors.end()) { - return it->second; + uint32_t existing_descriptor_index = + texture.GetSRVDescriptorIndex(descriptor_key); + if (existing_descriptor_index != UINT32_MAX) { + return existing_descriptor_index; } + TextureKey texture_key = texture.key(); + // Create a new bindless or cached descriptor if supported. D3D12_SHADER_RESOURCE_VIEW_DESC desc; - xenos::TextureFormat format = texture.key.format; - if (IsSignedVersionSeparate(format) && - texture.key.signed_separate != uint32_t(is_signed)) { + if (IsSignedVersionSeparateForFormat(texture_key) && + texture_key.signed_separate != uint32_t(is_signed)) { // Not the version with the needed signedness. return UINT32_MAX; } + xenos::TextureFormat format = texture_key.format; if (is_signed) { // Not supporting signed compressed textures - hopefully DXN and DXT5A are // not used as signed. 
desc.Format = host_formats_[uint32_t(format)].dxgi_format_snorm; } else { - desc.Format = GetDXGIUnormFormat(texture.key); + desc.Format = GetDXGIUnormFormat(texture_key); } if (desc.Format == DXGI_FORMAT_UNKNOWN) { unsupported_format_features_used_[uint32_t(format)] |= @@ -2845,15 +1988,15 @@ uint32_t TextureCache::FindOrCreateTextureDescriptor(Texture& texture, return UINT32_MAX; } - uint32_t mip_levels = texture.key.mip_max_level + 1; - switch (texture.key.dimension) { + uint32_t mip_levels = texture_key.mip_max_level + 1; + switch (texture_key.dimension) { case xenos::DataDimension::k1D: case xenos::DataDimension::k2DOrStacked: desc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2DARRAY; desc.Texture2DArray.MostDetailedMip = 0; desc.Texture2DArray.MipLevels = mip_levels; desc.Texture2DArray.FirstArraySlice = 0; - desc.Texture2DArray.ArraySize = texture.key.GetDepthOrArraySize(); + desc.Texture2DArray.ArraySize = texture_key.GetDepthOrArraySize(); desc.Texture2DArray.PlaneSlice = 0; desc.Texture2DArray.ResourceMinLODClamp = 0.0f; break; @@ -2870,7 +2013,7 @@ uint32_t TextureCache::FindOrCreateTextureDescriptor(Texture& texture, desc.TextureCube.ResourceMinLODClamp = 0.0f; break; default: - assert_unhandled_case(texture.key.dimension); + assert_unhandled_case(texture_key.dimension); return UINT32_MAX; } @@ -2895,40 +2038,48 @@ uint32_t TextureCache::FindOrCreateTextureDescriptor(Texture& texture, srv_descriptor_cache_free_.pop_back(); } else { // Allocated + 1 (including the descriptor that is being added), rounded - // up to SRVDescriptorCachePage::kHeapSize, (allocated + 1 + size - 1). - uint32_t cache_pages_needed = (srv_descriptor_cache_allocated_ + - SRVDescriptorCachePage::kHeapSize) / - SRVDescriptorCachePage::kHeapSize; + // up to kSRVDescriptorCachePageSize, (allocated + 1 + size - 1). 
+ uint32_t cache_pages_needed = + (srv_descriptor_cache_allocated_ + kSRVDescriptorCachePageSize) / + kSRVDescriptorCachePageSize; if (srv_descriptor_cache_.size() < cache_pages_needed) { D3D12_DESCRIPTOR_HEAP_DESC cache_heap_desc; cache_heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; - cache_heap_desc.NumDescriptors = SRVDescriptorCachePage::kHeapSize; + cache_heap_desc.NumDescriptors = kSRVDescriptorCachePageSize; cache_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; cache_heap_desc.NodeMask = 0; while (srv_descriptor_cache_.size() < cache_pages_needed) { - SRVDescriptorCachePage cache_page; - if (FAILED(device->CreateDescriptorHeap( - &cache_heap_desc, IID_PPV_ARGS(&cache_page.heap)))) { + Microsoft::WRL::ComPtr cache_heap; + if (FAILED(device->CreateDescriptorHeap(&cache_heap_desc, + IID_PPV_ARGS(&cache_heap)))) { XELOGE( - "Failed to create a texture descriptor - couldn't create a " - "descriptor cache heap"); + "D3D12TextureCache: Failed to create a texture descriptor - " + "couldn't create a descriptor cache heap"); return UINT32_MAX; } - cache_page.heap_start = - cache_page.heap->GetCPUDescriptorHandleForHeapStart(); - srv_descriptor_cache_.push_back(cache_page); + srv_descriptor_cache_.emplace_back(cache_heap.Get()); } } descriptor_index = srv_descriptor_cache_allocated_++; } } device->CreateShaderResourceView( - texture.resource, &desc, GetTextureDescriptorCPUHandle(descriptor_index)); - texture.srv_descriptors.emplace(descriptor_key, descriptor_index); + texture.resource(), &desc, + GetTextureDescriptorCPUHandle(descriptor_index)); + texture.AddSRVDescriptorIndex(descriptor_key, descriptor_index); return descriptor_index; } -D3D12_CPU_DESCRIPTOR_HANDLE TextureCache::GetTextureDescriptorCPUHandle( +void D3D12TextureCache::ReleaseTextureDescriptor(uint32_t descriptor_index) { + if (bindless_resources_used_) { + command_processor_.ReleaseViewBindlessDescriptorImmediately( + descriptor_index); + } else { + 
srv_descriptor_cache_free_.push_back(descriptor_index); + } +} + +D3D12_CPU_DESCRIPTOR_HANDLE D3D12TextureCache::GetTextureDescriptorCPUHandle( uint32_t descriptor_index) const { const ui::d3d12::D3D12Provider& provider = command_processor_.GetD3D12Provider(); @@ -2937,164 +2088,12 @@ D3D12_CPU_DESCRIPTOR_HANDLE TextureCache::GetTextureDescriptorCPUHandle( command_processor_.GetViewBindlessHeapCPUStart(), descriptor_index); } D3D12_CPU_DESCRIPTOR_HANDLE heap_start = - srv_descriptor_cache_[descriptor_index / - SRVDescriptorCachePage::kHeapSize] - .heap_start; - uint32_t heap_offset = descriptor_index % SRVDescriptorCachePage::kHeapSize; + srv_descriptor_cache_[descriptor_index / kSRVDescriptorCachePageSize] + .heap_start(); + uint32_t heap_offset = descriptor_index % kSRVDescriptorCachePageSize; return provider.OffsetViewDescriptor(heap_start, heap_offset); } -void TextureCache::MarkTextureUsed(Texture* texture) { - uint64_t current_frame = command_processor_.GetCurrentFrame(); - // This is called very frequently, don't relink unless needed for caching. - if (texture->last_usage_frame != current_frame) { - texture->last_usage_frame = current_frame; - texture->last_usage_time = texture_current_usage_time_; - if (texture->used_next == nullptr) { - // Simplify the code a bit - already in the end of the list. 
- return; - } - if (texture->used_previous != nullptr) { - texture->used_previous->used_next = texture->used_next; - } else { - texture_used_first_ = texture->used_next; - } - texture->used_next->used_previous = texture->used_previous; - texture->used_previous = texture_used_last_; - texture->used_next = nullptr; - if (texture_used_last_ != nullptr) { - texture_used_last_->used_next = texture; - } - texture_used_last_ = texture; - } -} - -void TextureCache::WatchCallbackThunk(void* context, void* data, - uint64_t argument, - bool invalidated_by_gpu) { - TextureCache* texture_cache = reinterpret_cast(context); - texture_cache->WatchCallback(reinterpret_cast(data), argument != 0); -} - -void TextureCache::WatchCallback(Texture* texture, bool is_mip) { - // Mutex already locked here. - if (is_mip) { - texture->mips_in_sync = false; - texture->mip_watch_handle = nullptr; - } else { - texture->base_in_sync = false; - texture->base_watch_handle = nullptr; - } - texture_invalidated_.store(true, std::memory_order_release); -} - -void TextureCache::ClearBindings() { - for (size_t i = 0; i < xe::countof(texture_bindings_); ++i) { - texture_bindings_[i].Clear(); - } - texture_bindings_in_sync_ = 0; - // Already reset everything. - texture_invalidated_.store(false, std::memory_order_relaxed); -} - -bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled, - uint32_t length_unscaled) { - if (!IsDrawResolutionScaled()) { - return false; - } - - start_unscaled = std::min(start_unscaled, SharedMemory::kBufferSize); - length_unscaled = - std::min(length_unscaled, SharedMemory::kBufferSize - start_unscaled); - if (!length_unscaled) { - return false; - } - - // Two-level check for faster rejection since resolve targets are usually - // placed in relatively small and localized memory portions (confirmed by - // testing - pretty much all times the deeper level was entered, the texture - // was a resolve target). 
- uint32_t page_first = start_unscaled >> 12; - uint32_t page_last = (start_unscaled + length_unscaled - 1) >> 12; - uint32_t block_first = page_first >> 5; - uint32_t block_last = page_last >> 5; - uint32_t l2_block_first = block_first >> 6; - uint32_t l2_block_last = block_last >> 6; - auto global_lock = global_critical_region_.Acquire(); - for (uint32_t i = l2_block_first; i <= l2_block_last; ++i) { - uint64_t l2_block = scaled_resolve_pages_l2_[i]; - if (i == l2_block_first) { - l2_block &= ~((1ull << (block_first & 63)) - 1); - } - if (i == l2_block_last && (block_last & 63) != 63) { - l2_block &= (1ull << ((block_last & 63) + 1)) - 1; - } - uint32_t block_relative_index; - while (xe::bit_scan_forward(l2_block, &block_relative_index)) { - l2_block &= ~(1ull << block_relative_index); - uint32_t block_index = (i << 6) + block_relative_index; - uint32_t check_bits = UINT32_MAX; - if (block_index == block_first) { - check_bits &= ~((1u << (page_first & 31)) - 1); - } - if (block_index == block_last && (page_last & 31) != 31) { - check_bits &= (1u << ((page_last & 31) + 1)) - 1; - } - if (scaled_resolve_pages_[block_index] & check_bits) { - return true; - } - } - } - return false; -} - -void TextureCache::ScaledResolveGlobalWatchCallbackThunk( - void* context, uint32_t address_first, uint32_t address_last, - bool invalidated_by_gpu) { - TextureCache* texture_cache = reinterpret_cast(context); - texture_cache->ScaledResolveGlobalWatchCallback(address_first, address_last, - invalidated_by_gpu); -} - -void TextureCache::ScaledResolveGlobalWatchCallback(uint32_t address_first, - uint32_t address_last, - bool invalidated_by_gpu) { - assert_true(IsDrawResolutionScaled()); - if (invalidated_by_gpu) { - // Resolves themselves do exactly the opposite of what this should do. - return; - } - // Mark scaled resolve ranges as non-scaled. Textures themselves will be - // invalidated by their own per-range watches. 
- uint32_t resolve_page_first = address_first >> 12; - uint32_t resolve_page_last = address_last >> 12; - uint32_t resolve_block_first = resolve_page_first >> 5; - uint32_t resolve_block_last = resolve_page_last >> 5; - uint32_t resolve_l2_block_first = resolve_block_first >> 6; - uint32_t resolve_l2_block_last = resolve_block_last >> 6; - for (uint32_t i = resolve_l2_block_first; i <= resolve_l2_block_last; ++i) { - uint64_t resolve_l2_block = scaled_resolve_pages_l2_[i]; - uint32_t resolve_block_relative_index; - while ( - xe::bit_scan_forward(resolve_l2_block, &resolve_block_relative_index)) { - resolve_l2_block &= ~(1ull << resolve_block_relative_index); - uint32_t resolve_block_index = (i << 6) + resolve_block_relative_index; - uint32_t resolve_keep_bits = 0; - if (resolve_block_index == resolve_block_first) { - resolve_keep_bits |= (1u << (resolve_page_first & 31)) - 1; - } - if (resolve_block_index == resolve_block_last && - (resolve_page_last & 31) != 31) { - resolve_keep_bits |= ~((1u << ((resolve_page_last & 31) + 1)) - 1); - } - scaled_resolve_pages_[resolve_block_index] &= resolve_keep_bits; - if (scaled_resolve_pages_[resolve_block_index] == 0) { - scaled_resolve_pages_l2_[i] &= ~(1ull << resolve_block_relative_index); - } - } - } -} - } // namespace d3d12 } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.h b/src/xenia/gpu/d3d12/d3d12_texture_cache.h new file mode 100644 index 000000000..0f1164922 --- /dev/null +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.h @@ -0,0 +1,592 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D12_D3D12_TEXTURE_CACHE_H_ +#define XENIA_GPU_D3D12_D3D12_TEXTURE_CACHE_H_ + +#include +#include +#include +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/gpu/d3d12/d3d12_shader.h" +#include "xenia/gpu/d3d12/d3d12_shared_memory.h" +#include "xenia/gpu/register_file.h" +#include "xenia/gpu/texture_cache.h" +#include "xenia/gpu/texture_util.h" +#include "xenia/gpu/xenos.h" +#include "xenia/ui/d3d12/d3d12_api.h" +#include "xenia/ui/d3d12/d3d12_provider.h" + +namespace xe { +namespace gpu { +namespace d3d12 { + +class D3D12CommandProcessor; + +class D3D12TextureCache final : public TextureCache { + public: + // Keys that can be stored for checking validity whether descriptors for host + // shader bindings are up to date. + struct TextureSRVKey { + TextureKey key; + uint32_t host_swizzle; + uint8_t swizzled_signs; + }; + + // Sampler parameters that can be directly converted to a host sampler or used + // for binding checking validity whether samplers are up to date. + union SamplerParameters { + uint32_t value; + struct { + xenos::ClampMode clamp_x : 3; // 3 + xenos::ClampMode clamp_y : 3; // 6 + xenos::ClampMode clamp_z : 3; // 9 + xenos::BorderColor border_color : 2; // 11 + // For anisotropic, these are true. + uint32_t mag_linear : 1; // 12 + uint32_t min_linear : 1; // 13 + uint32_t mip_linear : 1; // 14 + xenos::AnisoFilter aniso_filter : 3; // 17 + uint32_t mip_min_level : 4; // 21 + // Maximum mip level is in the texture resource itself. 
+    };
+
+    SamplerParameters() : value(0) { static_assert_size(*this, sizeof(value)); }
+    bool operator==(const SamplerParameters& parameters) const {
+      return value == parameters.value;
+    }
+    bool operator!=(const SamplerParameters& parameters) const {
+      return value != parameters.value;
+    }
+  };
+
+  static std::unique_ptr<D3D12TextureCache> Create(
+      const RegisterFile& register_file, D3D12SharedMemory& shared_memory,
+      uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y,
+      D3D12CommandProcessor& command_processor, bool bindless_resources_used) {
+    std::unique_ptr<D3D12TextureCache> texture_cache(new D3D12TextureCache(
+        register_file, shared_memory, draw_resolution_scale_x,
+        draw_resolution_scale_y, command_processor, bindless_resources_used));
+    if (!texture_cache->Initialize()) {
+      return nullptr;
+    }
+    return std::move(texture_cache);
+  }
+
+  ~D3D12TextureCache();
+
+  void ClearCache();
+
+  void BeginSubmission(uint64_t new_submission_index) override;
+  void BeginFrame() override;
+  void EndFrame();
+
+  // Must be called within a submission - creates and untiles textures needed by
+  // shaders and puts them in the SRV state. This may bind compute pipelines
+  // (notifying the command processor about that), so this must be called before
+  // binding the actual drawing pipeline.
+  void RequestTextures(uint32_t used_texture_mask) override;
+
+  // Returns whether texture SRV keys stored externally are still valid for the
+  // current bindings and host shader binding layout. Both keys and
+  // host_shader_bindings must have host_shader_binding_count elements
+  // (otherwise they are incompatible - like if this function returned false).
+  bool AreActiveTextureSRVKeysUpToDate(
+      const TextureSRVKey* keys,
+      const D3D12Shader::TextureBinding* host_shader_bindings,
+      size_t host_shader_binding_count) const;
+  // Exports the current binding data to texture SRV keys so they can be stored
+  // for checking whether subsequent draw calls can keep using the same
+  // bindings.
Write host_shader_binding_count keys. + void WriteActiveTextureSRVKeys( + TextureSRVKey* keys, + const D3D12Shader::TextureBinding* host_shader_bindings, + size_t host_shader_binding_count) const; + void WriteActiveTextureBindfulSRV( + const D3D12Shader::TextureBinding& host_shader_binding, + D3D12_CPU_DESCRIPTOR_HANDLE handle); + uint32_t GetActiveTextureBindlessSRVIndex( + const D3D12Shader::TextureBinding& host_shader_binding); + + SamplerParameters GetSamplerParameters( + const D3D12Shader::SamplerBinding& binding) const; + void WriteSampler(SamplerParameters parameters, + D3D12_CPU_DESCRIPTOR_HANDLE handle) const; + + // Returns whether the actual scale is not smaller than the requested one. + static bool ClampDrawResolutionScaleToMaxSupported( + uint32_t& scale_x, uint32_t& scale_y, + const ui::d3d12::D3D12Provider& provider); + // Ensures the tiles backing the range in the buffers are allocated. + bool EnsureScaledResolveMemoryCommitted(uint32_t start_unscaled, + uint32_t length_unscaled) override; + // Makes the specified range of up to 1-2 GB currently accessible on the GPU. + // One draw call can access only at most one range - the same memory is + // accessible through different buffers based on the range needed, so aliasing + // barriers are required. + bool MakeScaledResolveRangeCurrent(uint32_t start_unscaled, + uint32_t length_unscaled); + // These functions create a view of the range specified in the last successful + // MakeScaledResolveRangeCurrent call because that function must be called + // before this. 
+  void CreateCurrentScaledResolveRangeUintPow2SRV(
+      D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2);
+  void CreateCurrentScaledResolveRangeUintPow2UAV(
+      D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2);
+  void TransitionCurrentScaledResolveRange(D3D12_RESOURCE_STATES new_state);
+  void MarkCurrentScaledResolveRangeUAVWritesCommitNeeded() {
+    assert_true(IsDrawResolutionScaled());
+    GetCurrentScaledResolveBuffer().SetUAVBarrierPending();
+  }
+
+  // Returns the ID3D12Resource of the front buffer texture (in
+  // PIXEL_SHADER_RESOURCE state), or nullptr in case of failure, and writes the
+  // description of its SRV. May call LoadTextureData, so the same restrictions
+  // (such as about descriptor heap change possibility) apply.
+  ID3D12Resource* RequestSwapTexture(
+      D3D12_SHADER_RESOURCE_VIEW_DESC& srv_desc_out,
+      xenos::TextureFormat& format_out);
+
+ protected:
+  bool IsSignedVersionSeparateForFormat(TextureKey key) const override;
+  bool IsScaledResolveSupportedForFormat(TextureKey key) const override;
+  uint32_t GetHostFormatSwizzle(TextureKey key) const override;
+
+  uint32_t GetMaxHostTextureWidthHeight(
+      xenos::DataDimension dimension) const override;
+  uint32_t GetMaxHostTextureDepthOrArraySize(
+      xenos::DataDimension dimension) const override;
+
+  std::unique_ptr<Texture> CreateTexture(TextureKey key) override;
+
+  // This binds pipelines, allocates descriptors, and copies!
+ bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, bool load_base, + bool load_mips) override; + + void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) override; + + private: + enum class LoadMode { + k8bpb, + k16bpb, + k32bpb, + k64bpb, + k128bpb, + kR5G5B5A1ToB5G5R5A1, + kR5G6B5ToB5G6R5, + kR5G5B6ToB5G6R5WithRBGASwizzle, + kR4G4B4A4ToB4G4R4A4, + kR10G11B11ToRGBA16, + kR10G11B11ToRGBA16SNorm, + kR11G11B10ToRGBA16, + kR11G11B10ToRGBA16SNorm, + kDXT1ToRGBA8, + kDXT3ToRGBA8, + kDXT5ToRGBA8, + kDXNToRG8, + kDXT3A, + kDXT3AAs1111ToBGRA4, + kDXT5AToR8, + kCTX1, + kDepthUnorm, + kDepthFloat, + + kCount, + + kUnknown = kCount + }; + + struct LoadModeInfo { + // Shader without resolution scaling. + const void* shader; + size_t shader_size; + // Shader with resolution scaling, if available. These shaders are separate + // so the majority of the textures are not affected by the code needed for + // resolution scale support, and also to check if the format allows + // resolution scaling. + const void* shader_scaled; + size_t shader_scaled_size; + // Log2 of the sizes, in bytes, of the source (guest) SRV and the + // destination (host) UAV accessed by the copying shader, since the shader + // may copy multiple blocks per one invocation. + uint32_t srv_bpe_log2; + uint32_t uav_bpe_log2; + // Number of host blocks (or texels for uncompressed) along X axis written + // by every compute shader thread - rows in the upload buffer are padded to + // at least this amount. + uint32_t host_x_blocks_per_thread; + }; + + struct HostFormat { + // Format info for the regular case. + // DXGI format (typeless when different signedness or number representation + // is used) for the texture resource. + DXGI_FORMAT dxgi_format_resource; + // DXGI format for unsigned normalized or unsigned/signed float SRV. + DXGI_FORMAT dxgi_format_unorm; + // The regular load mode, used when special modes (like signed-specific or + // decompressing) aren't needed. 
+ LoadMode load_mode; + // DXGI format for signed normalized or unsigned/signed float SRV. + DXGI_FORMAT dxgi_format_snorm; + // If the signed version needs a different bit representation on the host, + // this is the load mode for the signed version. Otherwise the regular + // load_mode will be used for the signed version, and a single copy will be + // created if both unsigned and signed are used. + LoadMode load_mode_snorm; + + // Do NOT add integer DXGI formats to this - they are not filterable, can + // only be read with Load, not Sample! If any game is seen using num_format + // 1 for fixed-point formats (for floating-point, it's normally set to 1 + // though), add a constant buffer containing multipliers for the + // textures and multiplication to the tfetch implementation. + + // Whether the DXGI format, if not uncompressing the texture, consists of + // blocks, thus copy regions must be aligned to block size. + bool dxgi_format_block_aligned; + // Uncompression info for when the regular host format for this texture is + // block-compressed, but the size is not block-aligned, and thus such + // texture cannot be created in Direct3D on PC and needs decompression, + // however, such textures are common, for instance, in 4D5307E6. This only + // supports unsigned normalized formats - let's hope GPUSIGN_SIGNED was not + // used for DXN and DXT5A. + DXGI_FORMAT dxgi_format_uncompressed; + LoadMode decompress_mode; + + // Mapping of Xenos swizzle components to DXGI format components. 
+    uint32_t swizzle;
+  };
+
+  class D3D12Texture final : public Texture {
+   public:
+    D3D12Texture(D3D12TextureCache& texture_cache, const TextureKey& key,
+                 ID3D12Resource* resource,
+                 D3D12_RESOURCE_STATES resource_state);
+    ~D3D12Texture();
+
+    ID3D12Resource* resource() const { return resource_.Get(); }
+
+    D3D12_RESOURCE_STATES SetResourceState(D3D12_RESOURCE_STATES new_state) {
+      D3D12_RESOURCE_STATES old_state = resource_state_;
+      resource_state_ = new_state;
+      return old_state;
+    }
+
+    uint32_t GetSRVDescriptorIndex(uint32_t descriptor_key) const {
+      auto it = srv_descriptors_.find(descriptor_key);
+      return it != srv_descriptors_.cend() ? it->second : UINT32_MAX;
+    }
+
+    void AddSRVDescriptorIndex(uint32_t descriptor_key,
+                               uint32_t descriptor_index) {
+      srv_descriptors_.emplace(descriptor_key, descriptor_index);
+    }
+
+   private:
+    Microsoft::WRL::ComPtr<ID3D12Resource> resource_;
+    D3D12_RESOURCE_STATES resource_state_;
+
+    // For bindful - indices in the non-shader-visible descriptor cache for
+    // copying to the shader-visible heap (much faster than recreating, which,
+    // according to profiling, was often a bottleneck in many games).
+    // For bindless - indices in the global shader-visible descriptor heap.
+    std::unordered_map<uint32_t, uint32_t> srv_descriptors_;
+  };
+
+  static constexpr uint32_t kSRVDescriptorCachePageSize = 65536;
+
+  struct SRVDescriptorCachePage {
+   public:
+    explicit SRVDescriptorCachePage(ID3D12DescriptorHeap* heap)
+        : heap_(heap),
+          heap_start_(heap->GetCPUDescriptorHandleForHeapStart()) {}
+    SRVDescriptorCachePage(const SRVDescriptorCachePage& page) = delete;
+    SRVDescriptorCachePage& operator=(const SRVDescriptorCachePage& page) =
+        delete;
+    SRVDescriptorCachePage(SRVDescriptorCachePage&& page) {
+      std::swap(heap_, page.heap_);
+      std::swap(heap_start_, page.heap_start_);
+    }
+    SRVDescriptorCachePage& operator=(SRVDescriptorCachePage&& page) {
+      std::swap(heap_, page.heap_);
+      std::swap(heap_start_, page.heap_start_);
+      return *this;
+    }
+
+    ID3D12DescriptorHeap* heap() const { return heap_.Get(); }
+    D3D12_CPU_DESCRIPTOR_HANDLE heap_start() const { return heap_start_; }
+
+   private:
+    Microsoft::WRL::ComPtr<ID3D12DescriptorHeap> heap_;
+    D3D12_CPU_DESCRIPTOR_HANDLE heap_start_;
+  };
+
+  struct D3D12TextureBinding {
+    // Descriptor indices of texture and texture_signed of the respective
+    // TextureBinding returned from FindOrCreateTextureDescriptor.
+    uint32_t descriptor_index;
+    uint32_t descriptor_index_signed;
+
+    D3D12TextureBinding() { Reset(); }
+
+    void Reset() {
+      descriptor_index = UINT32_MAX;
+      descriptor_index_signed = UINT32_MAX;
+    }
+  };
+
+  class ScaledResolveVirtualBuffer {
+   public:
+    ScaledResolveVirtualBuffer(ID3D12Resource* resource,
+                               D3D12_RESOURCE_STATES resource_state)
+        : resource_(resource), resource_state_(resource_state) {}
+    ID3D12Resource* resource() const { return resource_.Get(); }
+    D3D12_RESOURCE_STATES SetResourceState(D3D12_RESOURCE_STATES new_state) {
+      D3D12_RESOURCE_STATES old_state = resource_state_;
+      if (old_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
+        uav_barrier_pending_ = false;
+      }
+      resource_state_ = new_state;
+      return old_state;
+    }
+    // After writing through a UAV.
+    void SetUAVBarrierPending() {
+      if (resource_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
+        uav_barrier_pending_ = true;
+      }
+    }
+    // After an aliasing barrier (which is even stronger than an UAV barrier).
+    void ClearUAVBarrierPending() { uav_barrier_pending_ = false; }
+
+   private:
+    Microsoft::WRL::ComPtr<ID3D12Resource> resource_;
+    D3D12_RESOURCE_STATES resource_state_;
+    bool uav_barrier_pending_ = false;
+  };
+
+  D3D12TextureCache(const RegisterFile& register_file,
+                    D3D12SharedMemory& shared_memory,
+                    uint32_t draw_resolution_scale_x,
+                    uint32_t draw_resolution_scale_y,
+                    D3D12CommandProcessor& command_processor,
+                    bool bindless_resources_used);
+
+  bool Initialize();
+
+  // Whether decompression is needed on the host (Direct3D only allows creation
+  // of block-compressed textures with 4x4-aligned dimensions on PC).
+  static bool IsDecompressionNeeded(xenos::TextureFormat format, uint32_t width,
+                                    uint32_t height);
+  static DXGI_FORMAT GetDXGIResourceFormat(xenos::TextureFormat format,
+                                           uint32_t width, uint32_t height) {
+    const HostFormat& host_format = host_formats_[uint32_t(format)];
+    return IsDecompressionNeeded(format, width, height)
+               ? host_format.dxgi_format_uncompressed
+               : host_format.dxgi_format_resource;
+  }
+  static DXGI_FORMAT GetDXGIResourceFormat(TextureKey key) {
+    return GetDXGIResourceFormat(key.format, key.GetWidth(), key.GetHeight());
+  }
+  static DXGI_FORMAT GetDXGIUnormFormat(xenos::TextureFormat format,
+                                        uint32_t width, uint32_t height) {
+    const HostFormat& host_format = host_formats_[uint32_t(format)];
+    return IsDecompressionNeeded(format, width, height)
+               ?
host_format.dxgi_format_uncompressed + : host_format.dxgi_format_unorm; + } + static DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) { + return GetDXGIUnormFormat(key.format, key.GetWidth(), key.GetHeight()); + } + + static LoadMode GetLoadMode(TextureKey key); + + static constexpr bool AreDimensionsCompatible( + xenos::FetchOpDimension binding_dimension, + xenos::DataDimension resource_dimension) { + switch (binding_dimension) { + case xenos::FetchOpDimension::k1D: + case xenos::FetchOpDimension::k2D: + return resource_dimension == xenos::DataDimension::k1D || + resource_dimension == xenos::DataDimension::k2DOrStacked; + case xenos::FetchOpDimension::k3DOrStacked: + return resource_dimension == xenos::DataDimension::k3D; + case xenos::FetchOpDimension::kCube: + return resource_dimension == xenos::DataDimension::kCube; + default: + return false; + } + } + + // Returns the index of an existing of a newly created non-shader-visible + // cached (for bindful) or a shader-visible global (for bindless) descriptor, + // or UINT32_MAX if failed to create. + uint32_t FindOrCreateTextureDescriptor(D3D12Texture& texture, bool is_signed, + uint32_t host_swizzle); + void ReleaseTextureDescriptor(uint32_t descriptor_index); + D3D12_CPU_DESCRIPTOR_HANDLE GetTextureDescriptorCPUHandle( + uint32_t descriptor_index) const; + + size_t GetScaledResolveBufferCount() const { + assert_true(IsDrawResolutionScaled()); + // Make sure any range up to 1 GB is accessible through 1 or 2 buffers. + // 2x2 scale buffers - just one 2 GB buffer for all 2 GB. + // 3x3 scale buffers - 4 buffers: + // +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5 + // |___________________|___________________| + // |___________________|______________| + // Buffer N has an offset of N * 1 GB in the scaled resolve address space. + // The logic is: + // - 2 GB can be accessed through a [0 GB ... 2 GB) buffer - only need one. + // - 2.1 GB needs [0 GB ... 2 GB) and [1 GB ... 2.1 GB) - two buffers. 
+ // - 3 GB needs [0 GB ... 2 GB) and [1 GB ... 3 GB) - two buffers. + // - 3.1 GB needs [0 GB ... 2 GB), [1 GB ... 3 GB) and [2 GB ... 3.1 GB) - + // three buffers. + uint64_t address_space_size = + uint64_t(SharedMemory::kBufferSize) * + (draw_resolution_scale_x() * draw_resolution_scale_y()); + return size_t((address_space_size - 1) >> 30); + } + // Returns indices of two scaled resolve virtual buffers that the location in + // memory may be accessible through. May be the same if it's a location near + // the beginning or the end of the address represented only by one buffer. + std::array GetPossibleScaledResolveBufferIndices( + uint64_t address_scaled) const { + assert_true(IsDrawResolutionScaled()); + size_t address_gb = size_t(address_scaled >> 30); + size_t max_index = GetScaledResolveBufferCount() - 1; + // In different cases for 3x3: + // +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5 + // |12________2________|1_________2________| + // |1_________2________|1_________12__| + return std::array{ + std::min(address_gb, max_index), + std::min(std::max(address_gb, size_t(1)) - size_t(1), max_index)}; + } + // The index is also the gigabyte offset of the buffer from the start of the + // scaled physical memory address space. 
+ size_t GetCurrentScaledResolveBufferIndex() const { + return scaled_resolve_1gb_buffer_indices_ + [scaled_resolve_current_range_start_scaled_ >> 30]; + } + ScaledResolveVirtualBuffer& GetCurrentScaledResolveBuffer() { + ScaledResolveVirtualBuffer* scaled_resolve_buffer = + scaled_resolve_2gb_buffers_[GetCurrentScaledResolveBufferIndex()].get(); + assert_not_null(scaled_resolve_buffer); + return *scaled_resolve_buffer; + } + + static const HostFormat host_formats_[64]; + + D3D12CommandProcessor& command_processor_; + bool bindless_resources_used_; + + static const LoadModeInfo load_mode_info_[]; + Microsoft::WRL::ComPtr load_root_signature_; + std::array, + size_t(LoadMode::kCount)> + load_pipelines_; + // Load pipelines for resolution-scaled resolve targets. + std::array, + size_t(LoadMode::kCount)> + load_pipelines_scaled_; + + std::vector srv_descriptor_cache_; + uint32_t srv_descriptor_cache_allocated_; + // Indices of cached descriptors used by deleted textures, for reuse. + std::vector srv_descriptor_cache_free_; + + enum class NullSRVDescriptorIndex { + k2DArray, + k3D, + kCube, + + kCount, + }; + // Contains null SRV descriptors of dimensions from NullSRVDescriptorIndex. + // For copying, not shader-visible. + Microsoft::WRL::ComPtr null_srv_descriptor_heap_; + D3D12_CPU_DESCRIPTOR_HANDLE null_srv_descriptor_heap_start_; + + std::array + d3d12_texture_bindings_; + + // Unsupported texture formats used during this frame (for research and + // testing). + enum : uint8_t { + kUnsupportedResourceBit = 1, + kUnsupportedUnormBit = kUnsupportedResourceBit << 1, + kUnsupportedSnormBit = kUnsupportedUnormBit << 1, + }; + uint8_t unsupported_format_features_used_[64]; + + // The tiled buffer for resolved data with resolution scaling. + // Because on Direct3D 12 (at least on Windows 10 2004) typed SRV or UAV + // creation fails for offsets above 4 GB, a single tiled 4.5 GB buffer can't + // be used for 3x3 resolution scaling. 
+ // Instead, "sliding window" buffers allowing to access a single range of up + // to 1 GB (or up to 2 GB, depending on the low bits) at any moment are used. + // Parts of 4.5 GB address space can be accessed through 2 GB buffers as: + // +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5 + // |___________________|___________________| or + // |___________________|______________| + // (2 GB is also the amount of scaled physical memory with 2x resolution + // scale, and older Intel GPUs, while support tiled resources, only support 31 + // virtual address bits per resource). + // Index is first gigabyte. Only including buffers containing over 1 GB + // (because otherwise the data will be fully contained in another). + // Size is calculated the same as in GetScaledResolveBufferCount. + std::array, + (uint64_t(SharedMemory::kBufferSize) * + (kMaxDrawResolutionScaleAlongAxis * + kMaxDrawResolutionScaleAlongAxis) - + 1) / + (UINT32_C(1) << 30)> + scaled_resolve_2gb_buffers_; + // Not very big heaps (16 MB) because they are needed pretty sparsely. One + // 2x-scaled 1280x720x32bpp texture is slighly bigger than 14 MB. + static constexpr uint32_t kScaledResolveHeapSizeLog2 = 24; + static constexpr uint32_t kScaledResolveHeapSize = + uint32_t(1) << kScaledResolveHeapSizeLog2; + static_assert( + (kScaledResolveHeapSize % D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES) == 0, + "Scaled resolve heap size must be a multiple of Direct3D tile size"); + static_assert( + kScaledResolveHeapSizeLog2 <= SharedMemory::kBufferSizeLog2, + "Scaled resolve heaps are assumed to be wholly mappable irrespective of " + "resolution scale, never truncated, for example, if the scaled resolve " + "address space is 4.5 GB, but the heap size is 1 GB"); + static_assert( + kScaledResolveHeapSizeLog2 <= 30, + "Scaled resolve heaps are assumed to only be wholly mappable to up to " + "two 2 GB buffers"); + // Resident portions of the tiled buffer. 
+ std::vector> scaled_resolve_heaps_; + // Number of currently resident portions of the tiled buffer, for profiling. + uint32_t scaled_resolve_heap_count_ = 0; + // Current scaled resolve state. + // For aliasing barrier placement, last owning buffer index for each of 1 GB. + size_t + scaled_resolve_1gb_buffer_indices_[(uint64_t(SharedMemory::kBufferSize) * + kMaxDrawResolutionScaleAlongAxis * + kMaxDrawResolutionScaleAlongAxis + + ((uint32_t(1) << 30) - 1)) >> + 30]; + // Range used in the last successful MakeScaledResolveRangeCurrent call. + uint64_t scaled_resolve_current_range_start_scaled_; + uint64_t scaled_resolve_current_range_length_scaled_; +}; + +} // namespace d3d12 +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_D3D12_D3D12_TEXTURE_CACHE_H_ diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 1ae7bef55..b06e92e42 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -98,8 +98,8 @@ PipelineCache::PipelineCache(D3D12CommandProcessor& command_processor, provider.GetAdapterVendorID(), bindless_resources_used_, edram_rov_used, render_target_cache_.gamma_render_target_as_srgb(), render_target_cache_.msaa_2x_supported(), - render_target_cache_.GetResolutionScaleX(), - render_target_cache_.GetResolutionScaleY(), + render_target_cache_.draw_resolution_scale_x(), + render_target_cache_.draw_resolution_scale_y(), provider.GetGraphicsAnalysis() != nullptr); if (edram_rov_used) { @@ -426,8 +426,8 @@ void PipelineCache::InitializeShaderStorage( provider.GetAdapterVendorID(), bindless_resources_used_, edram_rov_used, render_target_cache_.gamma_render_target_as_srgb(), render_target_cache_.msaa_2x_supported(), - render_target_cache_.GetResolutionScaleX(), - render_target_cache_.GetResolutionScaleY(), + render_target_cache_.draw_resolution_scale_x(), + render_target_cache_.draw_resolution_scale_y(), provider.GetGraphicsAnalysis() != nullptr); // If needed and possible, 
create objects needed for DXIL conversion and // disassembly on this thread. @@ -3001,8 +3001,8 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( // more likely. state_desc.RasterizerState.SlopeScaledDepthBias = description.depth_bias_slope_scaled * - float(std::max(render_target_cache_.GetResolutionScaleX(), - render_target_cache_.GetResolutionScaleY())); + float(std::max(render_target_cache_.draw_resolution_scale_x(), + render_target_cache_.draw_resolution_scale_y())); state_desc.RasterizerState.DepthClipEnable = description.depth_clip ? TRUE : FALSE; uint32_t msaa_sample_count = uint32_t(1) diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h deleted file mode 100644 index 75dd96643..000000000 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ /dev/null @@ -1,887 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_GPU_D3D12_TEXTURE_CACHE_H_ -#define XENIA_GPU_D3D12_TEXTURE_CACHE_H_ - -#include -#include -#include -#include -#include -#include - -#include "xenia/base/assert.h" -#include "xenia/base/hash.h" -#include "xenia/base/mutex.h" -#include "xenia/gpu/d3d12/d3d12_shader.h" -#include "xenia/gpu/d3d12/d3d12_shared_memory.h" -#include "xenia/gpu/register_file.h" -#include "xenia/gpu/texture_info.h" -#include "xenia/gpu/texture_util.h" -#include "xenia/gpu/xenos.h" -#include "xenia/ui/d3d12/d3d12_api.h" -#include "xenia/ui/d3d12/d3d12_provider.h" - -namespace xe { -namespace gpu { -namespace d3d12 { - -class D3D12CommandProcessor; - -// Manages host copies of guest textures, performing untiling, format and endian -// conversion of textures stored in the shared memory, and also handling -// invalidation. -// -// Mipmaps are treated the following way, according to the GPU hang message -// found in game executables explaining the valid usage of BaseAddress when -// streaming the largest LOD (it says games should not use 0 as the base address -// when the largest LOD isn't loaded, but rather, either allocate a valid -// address for it or make it the same as mip_address): -// - If the texture has a base address, but no mip address, it's not mipmapped - -// the host texture has only the largest level too. -// - If the texture has different non-zero base address and mip address, a host -// texture with mip_max_level+1 mipmaps is created - mip_min_level is ignored -// and treated purely as sampler state because there are tfetch instructions -// working directly with LOD values - including fetching with an explicit LOD. -// However, the max level is not ignored because any mip count can be -// specified when creating a texture, and another texture may be placed after -// the last one. 
-// - If the texture has a mip address, but the base address is 0 or the same as -// the mip address, a mipmapped texture is created, but min/max LOD is clamped -// to the lower bound of 1 - the game is expected to do that anyway until the -// largest LOD is loaded. -// TODO(Triang3l): Attach the largest LOD to existing textures with a valid -// mip_address but no base ever used yet (no base_address) to save memory -// because textures are streamed this way anyway. -class TextureCache { - struct TextureKey { - // Dimensions minus 1 are stored similarly to how they're stored in fetch - // constants so fewer bits can be used, while the maximum size (8192 for 2D) - // can still be encoded (a 8192x sky texture is used in 4D530910). - - // Physical 4 KB page with the base mip level, disregarding A/C/E address - // range prefix. - uint32_t base_page : 17; // 17 total - xenos::DataDimension dimension : 2; // 19 - uint32_t width_minus_1 : 13; // 32 - - uint32_t height_minus_1 : 13; // 45 - uint32_t tiled : 1; // 46 - uint32_t packed_mips : 1; // 47 - // Physical 4 KB page with mip 1 and smaller. - uint32_t mip_page : 17; // 64 - - // (Layers for stacked and 3D, 6 for cube, 1 for other dimensions) - 1. - uint32_t depth_or_array_size_minus_1 : 10; // 74 - uint32_t pitch : 9; // 83 - uint32_t mip_max_level : 4; // 87 - xenos::TextureFormat format : 6; // 93 - xenos::Endian endianness : 2; // 95 - // Whether this texture is signed and has a different host representation - // than an unsigned view of the same guest texture. - uint32_t signed_separate : 1; // 96 - - // Whether this texture is a resolution-scaled resolve target. - uint32_t scaled_resolve : 1; // 97 - // Least important in ==, so placed last. 
- uint32_t is_valid : 1; // 98 - - TextureKey() { MakeInvalid(); } - TextureKey(const TextureKey& key) { - std::memcpy(this, &key, sizeof(*this)); - } - TextureKey& operator=(const TextureKey& key) { - std::memcpy(this, &key, sizeof(*this)); - return *this; - } - void MakeInvalid() { - // Zero everything, including the padding, for a stable hash. - std::memset(this, 0, sizeof(*this)); - } - - uint32_t GetWidth() const { return width_minus_1 + 1; } - uint32_t GetHeight() const { return height_minus_1 + 1; } - uint32_t GetDepthOrArraySize() const { - return depth_or_array_size_minus_1 + 1; - } - - using Hasher = xe::hash::XXHasher; - bool operator==(const TextureKey& key) const { - return !std::memcmp(this, &key, sizeof(*this)); - } - bool operator!=(const TextureKey& key) const { return !(*this == key); } - }; - - public: - // Keys that can be stored for checking validity whether descriptors for host - // shader bindings are up to date. - struct TextureSRVKey { - TextureKey key; - uint32_t host_swizzle; - uint8_t swizzled_signs; - }; - - // Sampler parameters that can be directly converted to a host sampler or used - // for binding checking validity whether samplers are up to date. - union SamplerParameters { - uint32_t value; - struct { - xenos::ClampMode clamp_x : 3; // 3 - xenos::ClampMode clamp_y : 3; // 6 - xenos::ClampMode clamp_z : 3; // 9 - xenos::BorderColor border_color : 2; // 11 - // For anisotropic, these are true. - uint32_t mag_linear : 1; // 12 - uint32_t min_linear : 1; // 13 - uint32_t mip_linear : 1; // 14 - xenos::AnisoFilter aniso_filter : 3; // 17 - uint32_t mip_min_level : 4; // 21 - // Maximum mip level is in the texture resource itself. 
- }; - - SamplerParameters() : value(0) { static_assert_size(*this, sizeof(value)); } - bool operator==(const SamplerParameters& parameters) const { - return value == parameters.value; - } - bool operator!=(const SamplerParameters& parameters) const { - return value != parameters.value; - } - }; - - TextureCache(D3D12CommandProcessor& command_processor, - const RegisterFile& register_file, - D3D12SharedMemory& shared_memory, bool bindless_resources_used, - uint32_t draw_resolution_scale_x, - uint32_t draw_resolution_scale_y); - ~TextureCache(); - - bool Initialize(); - void Shutdown(); - void ClearCache(); - - void TextureFetchConstantWritten(uint32_t index); - - void BeginSubmission(); - void BeginFrame(); - void EndFrame(); - - // Must be called within a frame - creates and untiles textures needed by - // shaders and puts them in the SRV state. This may bind compute pipelines - // (notifying the command processor about that), so this must be called before - // binding the actual drawing pipeline. - void RequestTextures(uint32_t used_texture_mask); - - // "ActiveTexture" means as of the latest RequestTextures call. - - // Returns whether texture SRV keys stored externally are still valid for the - // current bindings and host shader binding layout. Both keys and - // host_shader_bindings must have host_shader_binding_count elements - // (otherwise they are incompatible - like if this function returned false). - bool AreActiveTextureSRVKeysUpToDate( - const TextureSRVKey* keys, - const D3D12Shader::TextureBinding* host_shader_bindings, - size_t host_shader_binding_count) const; - // Exports the current binding data to texture SRV keys so they can be stored - // for checking whether subsequent draw calls can keep using the same - // bindings. Write host_shader_binding_count keys. 
- void WriteActiveTextureSRVKeys( - TextureSRVKey* keys, - const D3D12Shader::TextureBinding* host_shader_bindings, - size_t host_shader_binding_count) const; - // Returns the post-swizzle signedness of a currently bound texture (must be - // called after RequestTextures). - uint8_t GetActiveTextureSwizzledSigns(uint32_t index) const { - return texture_bindings_[index].swizzled_signs; - } - bool IsActiveTextureResolved(uint32_t index) const { - const TextureBinding& binding = texture_bindings_[index]; - if (binding.texture && binding.texture->IsResolved()) { - return true; - } - if (binding.texture_signed && binding.texture_signed->IsResolved()) { - return true; - } - return false; - } - void WriteActiveTextureBindfulSRV( - const D3D12Shader::TextureBinding& host_shader_binding, - D3D12_CPU_DESCRIPTOR_HANDLE handle); - uint32_t GetActiveTextureBindlessSRVIndex( - const D3D12Shader::TextureBinding& host_shader_binding); - - SamplerParameters GetSamplerParameters( - const D3D12Shader::SamplerBinding& binding) const; - void WriteSampler(SamplerParameters parameters, - D3D12_CPU_DESCRIPTOR_HANDLE handle) const; - - void MarkRangeAsResolved(uint32_t start_unscaled, uint32_t length_unscaled); - // In textures, resolution scaling is done for 8-byte portions of memory for - // 8bpp textures, and for 16-byte portions for textures of higher bit depths - // (these are the sizes of regions where contiguous texels in memory are also - // contiguous in the texture along the horizontal axis, so 64-bit and 128-bit - // loads / stores, for 8bpp and 16bpp+ respectively, can be used for untiling - // regardless of the resolution scale). 
- static void ClampDrawResolutionScaleToSupportedRange( - uint32_t& scale_x, uint32_t& scale_y, - const ui::d3d12::D3D12Provider& provider); - uint32_t GetDrawResolutionScaleX() const { return draw_resolution_scale_x_; } - uint32_t GetDrawResolutionScaleY() const { return draw_resolution_scale_y_; } - bool IsDrawResolutionScaled() const { - return draw_resolution_scale_x_ > 1 || draw_resolution_scale_y_ > 1; - } - // Ensures the tiles backing the range in the buffers are allocated. - bool EnsureScaledResolveMemoryCommitted(uint32_t start_unscaled, - uint32_t length_unscaled); - // Makes the specified range of up to 1-2 GB currently accessible on the GPU. - // One draw call can access only at most one range - the same memory is - // accessible through different buffers based on the range needed, so aliasing - // barriers are required. - bool MakeScaledResolveRangeCurrent(uint32_t start_unscaled, - uint32_t length_unscaled); - // These functions create a view of the range specified in the last successful - // MakeScaledResolveRangeCurrent call because that function must be called - // before this. - void CreateCurrentScaledResolveRangeUintPow2SRV( - D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2); - void CreateCurrentScaledResolveRangeUintPow2UAV( - D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2); - void TransitionCurrentScaledResolveRange(D3D12_RESOURCE_STATES new_state); - void MarkCurrentScaledResolveRangeUAVWritesCommitNeeded() { - assert_true(IsDrawResolutionScaled()); - GetCurrentScaledResolveBuffer().SetUAVBarrierPending(); - } - - // Returns the ID3D12Resource of the front buffer texture (in - // PIXEL_SHADER_RESOURCE state), or nullptr in case of failure, and writes the - // description of its SRV. May call LoadTextureData, so the same restrictions - // (such as about descriptor heap change possibility) apply. 
- ID3D12Resource* RequestSwapTexture( - D3D12_SHADER_RESOURCE_VIEW_DESC& srv_desc_out, - xenos::TextureFormat& format_out); - - private: - // Hard limit, originating from the half-pixel offset (two-pixel offset is too - // much, the resolve shaders, being generic for different scales, only - // duplicate the second pixel into the first, not the third), and also due to - // the bit counts used for passing the scale to shaders. - static constexpr uint32_t kMaxDrawResolutionScaleAlongAxis = 3; - - enum class LoadMode { - k8bpb, - k16bpb, - k32bpb, - k64bpb, - k128bpb, - kR5G5B5A1ToB5G5R5A1, - kR5G6B5ToB5G6R5, - kR5G5B6ToB5G6R5WithRBGASwizzle, - kR4G4B4A4ToB4G4R4A4, - kR10G11B11ToRGBA16, - kR10G11B11ToRGBA16SNorm, - kR11G11B10ToRGBA16, - kR11G11B10ToRGBA16SNorm, - kDXT1ToRGBA8, - kDXT3ToRGBA8, - kDXT5ToRGBA8, - kDXNToRG8, - kDXT3A, - kDXT3AAs1111ToBGRA4, - kDXT5AToR8, - kCTX1, - kDepthUnorm, - kDepthFloat, - - kCount, - - kUnknown = kCount - }; - - struct LoadModeInfo { - // Rules of data access in load shaders: - // - Source reading (from the shared memory or the scaled resolve buffer): - // - Guest data may be stored in a sparsely-allocated buffer, or, in - // Direct3D 12 terms, a tiled buffer. This means that some regions of - // the buffer may not be mapped. On tiled resources tier 1 hardware, - // accesing unmapped tiles results in undefined behavior, including a - // GPU page fault and device removal. So, shaders must not try to access - // potentially unmapped regions (that are outside the texture memory - // extents calculated on the CPU, taking into account that Xenia can't - // overestimate texture sizes freely since it must not try to upload - // unallocated pages on the CPU). - // - Buffer tiles have 64 KB size on Direct3D 12. Vulkan has its own - // alignment requirements for sparse binding. But overall, we're - // allocating pretty large regions. 
- // - Resolution scaling disabled: - // - Shared memory allocates regions of power of two sizes that map - // directly to the same portions of the 512 MB of the console's - // physical memory. So, a 64 KB-aligned host buffer region is also 64 - // KB-aligned in the guest address space. - // - Tiled textures: 32x32x4-block tiles are always resident each as a - // whole. If the width is bigger than the pitch, the overflowing - // 32x32x4 tiles are also loaded as entire tiles. We do not have - // separate shaders for 2D and 3D. So, for tiled textures, it's safe - // to consider that if any location within a 32x32-aligned portion is - // within the texture bounds, the entire 32x32 portion also can be - // read. - // - Linear textures: Pitch is aligned to 256 bytes. Row count, however, - // is not aligned to anything (unless the mip tail is being loaded). - // The overflowing last row in case `width > pitch`, however, is made - // resident up to the last texel in it. But row start alignment is - // 256, which is a power of two, and is smaller than the Direct3D 12 - // tile size of 64 KB. So, if any block within a 256-aligned region is - // within the texture bounds, without resolution scaling, reading from - // any location in that 256-aligned region is safe. - // - Since we use the same shaders for tiled and linear textures (as - // well as 1D textures), this means that without resolution scaling, - // it's safe to access a min(256 bytes, 32 blocks)-aligned portion - // along X, but only within the same row of blocks, with bounds - // checking only for such portion as a whole, but without additional - // bounds checking inside of it. 
- // - Therefore, it's recommended that shaders read power-of-two amounts - // of blocks (so there will naturally be some alignment to some power - // of two), and this way, each thread may read at most 16 16bpb blocks - // or at most 32 8bpb or smaller blocks with in a single - // `if (x < width)` for the whole aligned range of the same length. - // - Resolution scaling enabled: - // - For simplicity, unlike in the shared memory, buffer tile boundaries - // are not aligned to powers of 2 the same way as guest addresses are. - // While for 2x2 resolution scaling it still happens to be the case - // because `host scaling unit address = guest scaling unit - // address << 2` (similarly for 2x1 and 1x2), for 3x or x3, it's not - - // a 64 KB host tile would represent 7281.777 guest bytes with 3x3 - // (disregarding that sequences of texels that are adjacent in memory - // alongside the horizontal axis, not individual bytes, are scaled, - // but even in that case it's not scaling by 2^n still). - // - The above would affect the `width > pitch` case for linear - // textures, requiring overestimating the width in calculation of the - // range of the tiles to map, while not doing this overestimation on - // the guest memory extent calculation side (otherwise it may result - // in attempting to upload unallocated memory on the CPU). For - // example, let's take look at an extreme case of a 369x28 k_8 texture - // with pitch of 256 bytes. The last row, in guest memory, would be - // loaded from the [7168, 7281) range, or, with 3x3 resolution - // scaling, from bytes [64512, 65529). However, if we try to - // unconditionally load 2 pixels, like the texture is 370x28, we will - // be accessing the bytes [64512, 65538). But bytes 65536 and 65537 - // will be in another 64 KB tile, which may be not mapped yet. - // However, none of this is an issue for one simple reason - resolving - // is only possible to tiled textures, so linear textures will never - // be resolution-scaled. 
- // - Tiled textures have potentially referenced guest 32x32-block tiles - // loaded in their entirety. So, just like for unscaled textures, if - // any block within a tile is available, the entire tile is as well. - // - Destination writing (to the linear buffer): - // - host_x_blocks_per_thread specifies how many pixels can be written - // without bounds checking within increments of that amount - the pitch - // of the destination buffer is manually overaligned if needed. - // Shader without resolution scaling. - const void* shader; - size_t shader_size; - // Shader with resolution scaling, if available. These shaders are separate - // so the majority of the textures are not affected by the code needed for - // resolution scale support, and also to check if the format allows - // resolution scaling. - const void* shader_scaled; - size_t shader_scaled_size; - // Log2 of the sizes, in bytes, of the source (guest) SRV and the - // destination (host) UAV accessed by the copying shader, since the shader - // may copy multiple blocks per one invocation. - uint32_t srv_bpe_log2; - uint32_t uav_bpe_log2; - // Number of host blocks (or texels for uncompressed) along X axis written - // by every compute shader thread - rows in the upload buffer are padded to - // at least this amount. - uint32_t host_x_blocks_per_thread; - }; - - struct HostFormat { - // Format info for the regular case. - // DXGI format (typeless when different signedness or number representation - // is used) for the texture resource. - DXGI_FORMAT dxgi_format_resource; - // DXGI format for unsigned normalized or unsigned/signed float SRV. - DXGI_FORMAT dxgi_format_unorm; - // The regular load mode, used when special modes (like signed-specific or - // decompressing) aren't needed. - LoadMode load_mode; - // DXGI format for signed normalized or unsigned/signed float SRV. 
- DXGI_FORMAT dxgi_format_snorm; - // If the signed version needs a different bit representation on the host, - // this is the load mode for the signed version. Otherwise the regular - // load_mode will be used for the signed version, and a single copy will be - // created if both unsigned and signed are used. - LoadMode load_mode_snorm; - - // Do NOT add integer DXGI formats to this - they are not filterable, can - // only be read with Load, not Sample! If any game is seen using num_format - // 1 for fixed-point formats (for floating-point, it's normally set to 1 - // though), add a constant buffer containing multipliers for the - // textures and multiplication to the tfetch implementation. - - // Whether the DXGI format, if not uncompressing the texture, consists of - // blocks, thus copy regions must be aligned to block size. - bool dxgi_format_block_aligned; - // Uncompression info for when the regular host format for this texture is - // block-compressed, but the size is not block-aligned, and thus such - // texture cannot be created in Direct3D on PC and needs decompression, - // however, such textures are common, for instance, in 4D5307E6. This only - // supports unsigned normalized formats - let's hope GPUSIGN_SIGNED was not - // used for DXN and DXT5A. - DXGI_FORMAT dxgi_format_uncompressed; - LoadMode decompress_mode; - - // Mapping of Xenos swizzle components to DXGI format components. - uint8_t swizzle[4]; - }; - - struct Texture { - TextureKey key; - ID3D12Resource* resource; - uint64_t resource_size; - D3D12_RESOURCE_STATES state; - // Whether the most up-to-date base / mips contain pages with data from a - // resolve operation (rather than from the CPU or memexport), primarily for - // choosing between piecewise linear gamma and sRGB when the former is - // emulated with the latter. 
- bool base_resolved; - bool mips_resolved; - - uint64_t last_usage_frame; - uint64_t last_usage_time; - Texture* used_previous; - Texture* used_next; - - texture_util::TextureGuestLayout guest_layout; - - // For bindful - indices in the non-shader-visible descriptor cache for - // copying to the shader-visible heap (much faster than recreating, which, - // according to profiling, was often a bottleneck in many games). - // For bindless - indices in the global shader-visible descriptor heap. - std::unordered_map srv_descriptors; - - // These are to be accessed within the global critical region to synchronize - // with shared memory. - // Watch handles for the memory ranges. - SharedMemory::WatchHandle base_watch_handle; - SharedMemory::WatchHandle mip_watch_handle; - // Whether the recent base level data has been loaded from the memory. - bool base_in_sync; - // Whether the recent mip data has been loaded from the memory. - bool mips_in_sync; - - bool IsResolved() const { return base_resolved || mips_resolved; } - uint32_t GetGuestBaseSize() const { - return guest_layout.base.level_data_extent_bytes; - } - uint32_t GetGuestMipsSize() const { - return guest_layout.mips_total_extent_bytes; - } - }; - - struct SRVDescriptorCachePage { - static constexpr uint32_t kHeapSize = 65536; - ID3D12DescriptorHeap* heap; - D3D12_CPU_DESCRIPTOR_HANDLE heap_start; - }; - - struct LoadConstants { - // vec4 0. - uint32_t is_tiled_3d_endian_scale; - // Base offset in bytes, resolution-scaled. - uint32_t guest_offset; - // For tiled textures - row pitch in blocks, aligned to 32, unscaled. - // For linear textures - row pitch in bytes. - uint32_t guest_pitch_aligned; - // For 3D textures only (ignored otherwise) - aligned to 32, unscaled. - uint32_t guest_z_stride_block_rows_aligned; - - // vec4 1. - // If this is a packed mip tail, this is aligned to tile dimensions. - // Resolution-scaled. - uint32_t size_blocks[3]; - // Base offset in bytes. - uint32_t host_offset; - - // vec4 2. 
- uint32_t host_pitch; - uint32_t height_texels; - }; - - struct TextureBinding { - TextureKey key; - // Destination swizzle merged with guest->host format swizzle. - uint32_t host_swizzle; - // Packed TextureSign values, 2 bit per each component, with guest-side - // destination swizzle from the fetch constant applied to them. - uint8_t swizzled_signs; - // Unsigned version of the texture (or signed if they have the same data). - Texture* texture; - // Signed version of the texture if the data in the signed version is - // different on the host. - Texture* texture_signed; - // Descriptor indices of texture and texture_signed returned from - // FindOrCreateTextureDescriptor. - uint32_t descriptor_index; - uint32_t descriptor_index_signed; - void Clear() { - std::memset(this, 0, sizeof(*this)); - descriptor_index = descriptor_index_signed = UINT32_MAX; - } - }; - - static uint32_t GetMaxHostTextureWidthHeight(xenos::DataDimension dimension) { - switch (dimension) { - case xenos::DataDimension::k1D: - case xenos::DataDimension::k2DOrStacked: - // 1D and 2D are emulated as 2D arrays. - return D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION; - case xenos::DataDimension::k3D: - return D3D12_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; - case xenos::DataDimension::kCube: - return D3D12_REQ_TEXTURECUBE_DIMENSION; - default: - assert_unhandled_case(dimension); - return 0; - } - } - static uint32_t GetMaxHostTextureDepthOrArraySize( - xenos::DataDimension dimension) { - switch (dimension) { - case xenos::DataDimension::k1D: - case xenos::DataDimension::k2DOrStacked: - // 1D and 2D are emulated as 2D arrays. 
- return D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION; - case xenos::DataDimension::k3D: - return D3D12_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; - case xenos::DataDimension::kCube: - return D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION / 6 * 6; - default: - assert_unhandled_case(dimension); - return 0; - } - } - - class ScaledResolveVirtualBuffer { - public: - ScaledResolveVirtualBuffer(ID3D12Resource* resource, - D3D12_RESOURCE_STATES resource_state) - : resource_(resource), resource_state_(resource_state) {} - ID3D12Resource* resource() const { return resource_.Get(); } - D3D12_RESOURCE_STATES SetResourceState(D3D12_RESOURCE_STATES new_state) { - D3D12_RESOURCE_STATES old_state = resource_state_; - if (old_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { - uav_barrier_pending_ = false; - } - resource_state_ = new_state; - return old_state; - } - // After writing through a UAV. - void SetUAVBarrierPending() { - if (resource_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { - uav_barrier_pending_ = true; - } - } - // After an aliasing barrier (which is even stronger than an UAV barrier). - void ClearUAVBarrierPending() { uav_barrier_pending_ = false; } - - private: - Microsoft::WRL::ComPtr resource_; - D3D12_RESOURCE_STATES resource_state_; - bool uav_barrier_pending_ = false; - }; - - // Whether the signed version of the texture has a different representation on - // the host than its unsigned version (for example, if it's a fixed-point - // texture emulated with a larger host pixel format). - static bool IsSignedVersionSeparate(xenos::TextureFormat format) { - const HostFormat& host_format = host_formats_[uint32_t(format)]; - return host_format.load_mode_snorm != LoadMode::kUnknown && - host_format.load_mode_snorm != host_format.load_mode; - } - // Whether decompression is needed on the host (Direct3D only allows creation - // of block-compressed textures with 4x4-aligned dimensions on PC). 
- static bool IsDecompressionNeeded(xenos::TextureFormat format, uint32_t width, - uint32_t height); - static DXGI_FORMAT GetDXGIResourceFormat(xenos::TextureFormat format, - uint32_t width, uint32_t height) { - const HostFormat& host_format = host_formats_[uint32_t(format)]; - return IsDecompressionNeeded(format, width, height) - ? host_format.dxgi_format_uncompressed - : host_format.dxgi_format_resource; - } - static DXGI_FORMAT GetDXGIResourceFormat(TextureKey key) { - return GetDXGIResourceFormat(key.format, key.GetWidth(), key.GetHeight()); - } - static DXGI_FORMAT GetDXGIUnormFormat(xenos::TextureFormat format, - uint32_t width, uint32_t height) { - const HostFormat& host_format = host_formats_[uint32_t(format)]; - return IsDecompressionNeeded(format, width, height) - ? host_format.dxgi_format_uncompressed - : host_format.dxgi_format_unorm; - } - static DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) { - return GetDXGIUnormFormat(key.format, key.GetWidth(), key.GetHeight()); - } - - static LoadMode GetLoadMode(TextureKey key); - - // Converts a texture fetch constant to a texture key, normalizing and - // validating the values, or creating an invalid key, and also gets the - // host swizzle and post-guest-swizzle signedness. 
- static void BindingInfoFromFetchConstant( - const xenos::xe_gpu_texture_fetch_t& fetch, TextureKey& key_out, - uint32_t* host_swizzle_out, uint8_t* swizzled_signs_out); - - static constexpr bool AreDimensionsCompatible( - xenos::FetchOpDimension binding_dimension, - xenos::DataDimension resource_dimension) { - switch (binding_dimension) { - case xenos::FetchOpDimension::k1D: - case xenos::FetchOpDimension::k2D: - return resource_dimension == xenos::DataDimension::k1D || - resource_dimension == xenos::DataDimension::k2DOrStacked; - case xenos::FetchOpDimension::k3DOrStacked: - return resource_dimension == xenos::DataDimension::k3D; - case xenos::FetchOpDimension::kCube: - return resource_dimension == xenos::DataDimension::kCube; - default: - return false; - } - } - - static void LogTextureKeyAction(TextureKey key, const char* action); - static void LogTextureAction(const Texture* texture, const char* action); - - // Returns nullptr if the key is not supported, but also if couldn't create - // the texture - if it's nullptr, occasionally a recreation attempt should be - // made. - Texture* FindOrCreateTexture(TextureKey key); - - // Writes data from the shared memory to the texture. This binds pipelines, - // allocates descriptors and copies! - bool LoadTextureData(Texture* texture); - - // Returns the index of an existing of a newly created non-shader-visible - // cached (for bindful) or a shader-visible global (for bindless) descriptor, - // or UINT32_MAX if failed to create. - uint32_t FindOrCreateTextureDescriptor(Texture& texture, bool is_signed, - uint32_t host_swizzle); - D3D12_CPU_DESCRIPTOR_HANDLE GetTextureDescriptorCPUHandle( - uint32_t descriptor_index) const; - - // For LRU caching - updates the last usage frame and moves the texture to - // the end of the usage queue. Must be called any time the texture is - // referenced by any command list to make sure it's not destroyed while still - // in use. 
- void MarkTextureUsed(Texture* texture); - - // Shared memory callback for texture data invalidation. - static void WatchCallbackThunk(void* context, void* data, uint64_t argument, - bool invalidated_by_gpu); - void WatchCallback(Texture* texture, bool is_mip); - - // Makes all bindings invalid. Also requesting textures after calling this - // will cause another attempt to create a texture or to untile it if there was - // an error. - void ClearBindings(); - - size_t GetScaledResolveBufferCount() const { - assert_true(IsDrawResolutionScaled()); - // Make sure any range up to 1 GB is accessible through 1 or 2 buffers. - // 2x2 scale buffers - just one 2 GB buffer for all 2 GB. - // 3x3 scale buffers - 4 buffers: - // +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5 - // |___________________|___________________| - // |___________________|______________| - // Buffer N has an offset of N * 1 GB in the scaled resolve address space. - // The logic is: - // - 2 GB can be accessed through a [0 GB ... 2 GB) buffer - only need one. - // - 2.1 GB needs [0 GB ... 2 GB) and [1 GB ... 2.1 GB) - two buffers. - // - 3 GB needs [0 GB ... 2 GB) and [1 GB ... 3 GB) - two buffers. - // - 3.1 GB needs [0 GB ... 2 GB), [1 GB ... 3 GB) and [2 GB ... 3.1 GB) - - // three buffers. - uint64_t address_space_size = - uint64_t(SharedMemory::kBufferSize) * - (draw_resolution_scale_x_ * draw_resolution_scale_y_); - return size_t((address_space_size - 1) >> 30); - } - // Returns indices of two scaled resolve virtual buffers that the location in - // memory may be accessible through. May be the same if it's a location near - // the beginning or the end of the address represented only by one buffer. 
- std::array GetPossibleScaledResolveBufferIndices( - uint64_t address_scaled) const { - assert_true(IsDrawResolutionScaled()); - size_t address_gb = size_t(address_scaled >> 30); - size_t max_index = GetScaledResolveBufferCount() - 1; - // In different cases for 3x3: - // +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5 - // |12________2________|1_________2________| - // |1_________2________|1_________12__| - return std::array{ - std::min(address_gb, max_index), - std::min(std::max(address_gb, size_t(1)) - size_t(1), max_index)}; - } - // Checks if there are any pages that contain scaled resolve data within the - // range. - bool IsRangeScaledResolved(uint32_t start_unscaled, uint32_t length_unscaled); - // Global shared memory invalidation callback for invalidating scaled resolved - // texture data. - static void ScaledResolveGlobalWatchCallbackThunk(void* context, - uint32_t address_first, - uint32_t address_last, - bool invalidated_by_gpu); - void ScaledResolveGlobalWatchCallback(uint32_t address_first, - uint32_t address_last, - bool invalidated_by_gpu); - // The index is also the gigabyte offset of the buffer from the start of the - // scaled physical memory address space. 
- size_t GetCurrentScaledResolveBufferIndex() const { - return scaled_resolve_1gb_buffer_indices_ - [scaled_resolve_current_range_start_scaled_ >> 30]; - } - ScaledResolveVirtualBuffer& GetCurrentScaledResolveBuffer() { - ScaledResolveVirtualBuffer* scaled_resolve_buffer = - scaled_resolve_2gb_buffers_[GetCurrentScaledResolveBufferIndex()]; - assert_not_null(scaled_resolve_buffer); - return *scaled_resolve_buffer; - } - - static const HostFormat host_formats_[64]; - - static const char* const dimension_names_[4]; - - D3D12CommandProcessor& command_processor_; - const RegisterFile& register_file_; - D3D12SharedMemory& shared_memory_; - bool bindless_resources_used_; - - static const LoadModeInfo load_mode_info_[]; - ID3D12RootSignature* load_root_signature_ = nullptr; - ID3D12PipelineState* load_pipelines_[size_t(LoadMode::kCount)] = {}; - // Load pipelines for resolution-scaled resolve targets. - ID3D12PipelineState* load_pipelines_scaled_[size_t(LoadMode::kCount)] = {}; - - std::unordered_map textures_; - uint64_t textures_total_size_ = 0; - Texture* texture_used_first_ = nullptr; - Texture* texture_used_last_ = nullptr; - uint64_t texture_current_usage_time_; - - std::vector srv_descriptor_cache_; - uint32_t srv_descriptor_cache_allocated_; - // Indices of cached descriptors used by deleted textures, for reuse. - std::vector srv_descriptor_cache_free_; - - enum class NullSRVDescriptorIndex { - k2DArray, - k3D, - kCube, - - kCount, - }; - // Contains null SRV descriptors of dimensions from NullSRVDescriptorIndex. - // For copying, not shader-visible. - ID3D12DescriptorHeap* null_srv_descriptor_heap_ = nullptr; - D3D12_CPU_DESCRIPTOR_HANDLE null_srv_descriptor_heap_start_; - - TextureBinding texture_bindings_[32] = {}; - // Bit vector with bits reset on fetch constant writes to avoid parsing fetch - // constants again and again. 
- uint32_t texture_bindings_in_sync_ = 0; - - // Whether a texture has been invalidated (a watch has been triggered), so - // need to try to reload textures, disregarding whether fetch constants have - // been changed. - std::atomic texture_invalidated_ = false; - - // Unsupported texture formats used during this frame (for research and - // testing). - enum : uint8_t { - kUnsupportedResourceBit = 1, - kUnsupportedUnormBit = kUnsupportedResourceBit << 1, - kUnsupportedSnormBit = kUnsupportedUnormBit << 1, - }; - uint8_t unsupported_format_features_used_[64]; - - uint32_t draw_resolution_scale_x_ = 1; - uint32_t draw_resolution_scale_y_ = 1; - // The tiled buffer for resolved data with resolution scaling. - // Because on Direct3D 12 (at least on Windows 10 2004) typed SRV or UAV - // creation fails for offsets above 4 GB, a single tiled 4.5 GB buffer can't - // be used for 3x3 resolution scaling. - // Instead, "sliding window" buffers allowing to access a single range of up - // to 1 GB (or up to 2 GB, depending on the low bits) at any moment are used. - // Parts of 4.5 GB address space can be accessed through 2 GB buffers as: - // +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5 - // |___________________|___________________| or - // |___________________|______________| - // (2 GB is also the amount of scaled physical memory with 2x resolution - // scale, and older Intel GPUs, while support tiled resources, only support 31 - // virtual address bits per resource). - // Index is first gigabyte. Only including buffers containing over 1 GB - // (because otherwise the data will be fully contained in another). - // Size is calculated the same as in GetScaledResolveBufferCount. - ScaledResolveVirtualBuffer* - scaled_resolve_2gb_buffers_[(uint64_t(SharedMemory::kBufferSize) * - (kMaxDrawResolutionScaleAlongAxis * - kMaxDrawResolutionScaleAlongAxis) - - 1) >> - 30] = {}; - // Not very big heaps (16 MB) because they are needed pretty sparsely. 
One - // 2x-scaled 1280x720x32bpp texture is slighly bigger than 14 MB. - static constexpr uint32_t kScaledResolveHeapSizeLog2 = 24; - static constexpr uint32_t kScaledResolveHeapSize = - uint32_t(1) << kScaledResolveHeapSizeLog2; - static_assert( - (kScaledResolveHeapSize % D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES) == 0, - "Scaled resolve heap size must be a multiple of Direct3D tile size"); - static_assert( - kScaledResolveHeapSizeLog2 <= SharedMemory::kBufferSizeLog2, - "Scaled resolve heaps are assumed to be wholly mappable irrespective of " - "resolution scale, never truncated, for example, if the scaled resolve " - "address space is 4.5 GB, but the heap size is 1 GB"); - static_assert( - kScaledResolveHeapSizeLog2 <= 30, - "Scaled resolve heaps are assumed to only be wholly mappable to up to " - "two 2 GB buffers"); - // Resident portions of the tiled buffer. - std::vector scaled_resolve_heaps_; - // Number of currently resident portions of the tiled buffer, for profiling. - uint32_t scaled_resolve_heap_count_ = 0; - // Global watch for scaled resolve data invalidation. - SharedMemory::GlobalWatchHandle scaled_resolve_global_watch_handle_ = nullptr; - // Current scaled resolve state. - // For aliasing barrier placement, last owning buffer index for each of 1 GB. - size_t - scaled_resolve_1gb_buffer_indices_[(uint64_t(SharedMemory::kBufferSize) * - kMaxDrawResolutionScaleAlongAxis * - kMaxDrawResolutionScaleAlongAxis + - ((uint32_t(1) << 30) - 1)) >> - 30]; - // Range used in the last successful MakeScaledResolveRangeCurrent call. - uint64_t scaled_resolve_current_range_start_scaled_; - uint64_t scaled_resolve_current_range_length_scaled_; - - xe::global_critical_region global_critical_region_; - // Bit vector storing whether each 4 KB physical memory page contains scaled - // resolve data. uint32_t rather than uint64_t because parts of it can be sent - // to shaders. 
- uint32_t* scaled_resolve_pages_ = nullptr; - // Second level of the bit vector for faster rejection of non-scaled textures. - // >> 12 for 4 KB pages, >> 5 for uint32_t level 1 bits, >> 6 for uint64_t - // level 2 bits. - uint64_t scaled_resolve_pages_l2_[SharedMemory::kBufferSize >> (12 + 5 + 6)]; -}; - -} // namespace d3d12 -} // namespace gpu -} // namespace xe - -#endif // XENIA_GPU_D3D12_TEXTURE_CACHE_H_ diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index b79d90c48..ea9deb591 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -20,6 +20,7 @@ #include "xenia/base/memory.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/registers.h" +#include "xenia/gpu/texture_cache.h" #include "xenia/gpu/texture_info.h" #include "xenia/gpu/texture_util.h" #include "xenia/gpu/xenos.h" @@ -166,15 +167,17 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, return false; } -void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale_x, - uint32_t resolution_scale_y, bool origin_bottom_left, - uint32_t x_max, uint32_t y_max, bool allow_reverse_z, +void GetHostViewportInfo(const RegisterFile& regs, + uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y, + bool origin_bottom_left, uint32_t x_max, + uint32_t y_max, bool allow_reverse_z, reg::RB_DEPTHCONTROL normalized_depth_control, bool convert_z_to_float24, bool full_float24_in_0_to_1, bool pixel_shader_writes_depth, ViewportInfo& viewport_info_out) { - assert_not_zero(resolution_scale_x); - assert_not_zero(resolution_scale_y); + assert_not_zero(draw_resolution_scale_x); + assert_not_zero(draw_resolution_scale_y); // A vertex position goes the following path: // @@ -343,8 +346,8 @@ void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale_x, // The maximum value is at least the maximum host render target size anyway - // and a guest pixel is always treated as a whole with resolution scaling. 
- uint32_t xy_max_unscaled[] = {x_max / resolution_scale_x, - y_max / resolution_scale_y}; + uint32_t xy_max_unscaled[] = {x_max / draw_resolution_scale_x, + y_max / draw_resolution_scale_y}; assert_not_zero(xy_max_unscaled[0]); assert_not_zero(xy_max_unscaled[1]); @@ -363,7 +366,8 @@ void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale_x, uint32_t extent_axis_unscaled = std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]); viewport_info_out.xy_extent[i] = - extent_axis_unscaled * (i ? resolution_scale_y : resolution_scale_x); + extent_axis_unscaled * + (i ? draw_resolution_scale_y : draw_resolution_scale_x); float extent_axis_unscaled_float = float(extent_axis_unscaled); float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float; ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis; @@ -390,7 +394,7 @@ void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale_x, // doing truncation for simplicity - since maxing with 0 is done anyway // (we only return viewports in the positive quarter-plane). uint32_t axis_resolution_scale = - i ? resolution_scale_y : resolution_scale_x; + i ? 
draw_resolution_scale_y : draw_resolution_scale_x; float offset_axis = offset_base_xy[i] + offset_add_xy[i]; float scale_axis = scale_xy[i]; float scale_axis_abs = std::abs(scale_xy[i]); @@ -645,6 +649,31 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs, return normalized_color_mask; } +void GetEdramTileWidthDivideScaleAndUpperShift( + uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out, + uint32_t& divide_upper_shift_out) { + static_assert( + TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, + "GetEdramTileWidthDivideScaleAndUpperShift provides values only for draw " + "resolution scaling factors of up to 3"); + switch (draw_resolution_scale_x) { + case 1: + divide_scale_out = kDivideScale5; + divide_upper_shift_out = kDivideUpperShift5 + 4; + break; + case 2: + divide_scale_out = kDivideScale5; + divide_upper_shift_out = kDivideUpperShift5 + 5; + break; + case 3: + divide_scale_out = kDivideScale15; + divide_upper_shift_out = kDivideUpperShift15 + 4; + break; + default: + assert_unhandled_case(draw_resolution_scale_x); + } +} + xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { @@ -1098,7 +1127,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, } ResolveCopyShaderIndex ResolveInfo::GetCopyShader( - uint32_t resolution_scale_x, uint32_t resolution_scale_y, + uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y, ResolveCopyShaderConstants& constants_out, uint32_t& group_count_x_out, uint32_t& group_count_y_out) const { ResolveCopyShaderIndex shader = ResolveCopyShaderIndex::kUnknown; @@ -1152,10 +1181,10 @@ ResolveCopyShaderIndex ResolveInfo::GetCopyShader( if (shader != ResolveCopyShaderIndex::kUnknown) { uint32_t width = (address.width_div_8 << xenos::kResolveAlignmentPixelsLog2) * - resolution_scale_x; + draw_resolution_scale_x; uint32_t height = (address.height_div_8 << xenos::kResolveAlignmentPixelsLog2) * - 
resolution_scale_y; + draw_resolution_scale_y; const ResolveCopyShaderInfo& shader_info = resolve_copy_shader_info[size_t(shader)]; group_count_x_out = (width + ((1 << shader_info.group_size_x_log2) - 1)) >> diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 2cfed6134..7009e9d3e 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -196,9 +196,11 @@ struct ViewportInfo { // a viewport, plus values to multiply-add the returned position by, usable on // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the // Direct3D clip space with 0...W Z rather than -W...W. -void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale_x, - uint32_t resolution_scale_y, bool origin_bottom_left, - uint32_t x_max, uint32_t y_max, bool allow_reverse_z, +void GetHostViewportInfo(const RegisterFile& regs, + uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y, + bool origin_bottom_left, uint32_t x_max, + uint32_t y_max, bool allow_reverse_z, reg::RB_DEPTHCONTROL normalized_depth_control, bool convert_z_to_float24, bool full_float24_in_0_to_1, bool pixel_shader_writes_depth, @@ -234,26 +236,9 @@ constexpr uint32_t kDivideUpperShift5 = 2; constexpr uint32_t kDivideScale15 = 0x88888889u; constexpr uint32_t kDivideUpperShift15 = 3; -inline void GetEdramTileWidthDivideScaleAndUpperShift( - uint32_t resolution_scale_x, uint32_t& divide_scale, - uint32_t& divide_upper_shift) { - switch (resolution_scale_x) { - case 1: - divide_scale = kDivideScale5; - divide_upper_shift = kDivideUpperShift5 + 4; - break; - case 2: - divide_scale = kDivideScale5; - divide_upper_shift = kDivideUpperShift5 + 5; - break; - case 3: - divide_scale = kDivideScale15; - divide_upper_shift = kDivideUpperShift15 + 4; - break; - default: - assert_unhandled_case(resolution_scale_x); - } -} +void GetEdramTileWidthDivideScaleAndUpperShift( + uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out, + uint32_t& 
divide_upper_shift_out); // Never an identity conversion - can always write conditional move instructions // to shaders that will be no-ops for conversion from guest to host samples. @@ -474,7 +459,7 @@ struct ResolveInfo { } ResolveCopyShaderIndex GetCopyShader( - uint32_t resolution_scale_x, uint32_t resolution_scale_y, + uint32_t draw_resolution_scale_x, uint32_t draw_resolution_scale_y, ResolveCopyShaderConstants& constants_out, uint32_t& group_count_x_out, uint32_t& group_count_y_out) const; @@ -509,7 +494,8 @@ struct ResolveInfo { } std::pair GetClearShaderGroupCount( - uint32_t resolution_scale_x, uint32_t resolution_scale_y) const { + uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y) const { // 8 guest MSAA samples per invocation. uint32_t width_samples_div_8 = address.width_div_8; uint32_t height_samples_div_8 = address.height_div_8; @@ -522,8 +508,8 @@ struct ResolveInfo { width_samples_div_8 <<= 1; } } - width_samples_div_8 *= resolution_scale_x; - height_samples_div_8 *= resolution_scale_y; + width_samples_div_8 *= draw_resolution_scale_x; + height_samples_div_8 *= draw_resolution_scale_y; return std::make_pair((width_samples_div_8 + uint32_t(7)) >> 3, height_samples_div_8); } diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index b99e9cbe8..4bb8c918e 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -81,10 +81,8 @@ DxbcShaderTranslator::DxbcShaderTranslator( draw_resolution_scale_x_(draw_resolution_scale_x), draw_resolution_scale_y_(draw_resolution_scale_y), emit_source_map_(force_emit_source_map || cvars::dxbc_source_map) { - assert_true(draw_resolution_scale_x >= 1); - assert_true(draw_resolution_scale_x <= 3); - assert_true(draw_resolution_scale_y >= 1); - assert_true(draw_resolution_scale_y <= 3); + assert_not_zero(draw_resolution_scale_x); + assert_not_zero(draw_resolution_scale_y); // Don't allocate again and again for the first 
shader. shader_code_.reserve(8192); shader_object_.reserve(16384); diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index e2f65e66f..b345f12f4 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -11,6 +11,7 @@ #include "xenia/base/math.h" #include "xenia/gpu/draw_util.h" #include "xenia/gpu/dxbc_shader_translator.h" +#include "xenia/gpu/texture_cache.h" namespace xe { namespace gpu { @@ -159,6 +160,11 @@ void DxbcShaderTranslator::ExportToMemory() { dxbc::Src::R(control_temp).Select(1 + i)); uint32_t axis_resolution_scale = i ? draw_resolution_scale_y_ : draw_resolution_scale_x_; + static_assert( + TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, + "DxbcShaderTranslator memexport draw resolution scaling " + "conditional generation supports draw resolution scaling factors " + "of only up to 3"); switch (axis_resolution_scale) { case 2: // xy & 1 == 1. diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index 164ac07fb..685911285 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -14,6 +14,7 @@ #include "xenia/base/assert.h" #include "xenia/base/math.h" #include "xenia/gpu/draw_util.h" +#include "xenia/gpu/texture_cache.h" namespace xe { namespace gpu { @@ -200,6 +201,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { assert_not_zero(tile_or_half_tile_width_divide_upper_shift); --tile_or_half_tile_width_divide_upper_shift; } + static_assert( + TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, + "DxbcShaderTranslator ROV sample address calculation supports Y draw " + "resolution scaling factors of only up to 3"); if (draw_resolution_scale_y_ == 3) { // Multiplication part of the division by 40|80 x 16 x scale (specifically // 40|80 * scale width here, and 48 height, or 16 * 3 height). 
diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 5b5e9f613..2ca0c0a3d 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -76,25 +76,6 @@ DEFINE_string( " Any other value:\n" " Choose what is considered the most optimal (currently \"on_copy\").", "GPU"); -DEFINE_int32( - draw_resolution_scale_x, 1, - "Integer pixel width scale used for scaling the rendering resolution " - "opaquely to the game.\n" - "1, 2 and 3 may be supported, but support of anything above 1 depends on " - "the device properties, such as whether it supports sparse binding / tiled " - "resources, the number of virtual address bits per resource, and other " - "factors.\n" - "Various effects and parts of game rendering pipelines may work " - "incorrectly as pixels become ambiguous from the game's perspective and " - "because half-pixel offset (which normally doesn't affect coverage when " - "MSAA isn't used) becomes full-pixel.", - "GPU"); -DEFINE_int32( - draw_resolution_scale_y, 1, - "Integer pixel width scale used for scaling the rendering resolution " - "opaquely to the game.\n" - "See draw_resolution_scale_x for more information.", - "GPU"); DEFINE_bool( draw_resolution_scaled_texture_offsets, true, "Apply offsets from texture fetch instructions taking resolution scale " @@ -416,7 +397,7 @@ bool RenderTargetCache::Update(bool is_rasterization_done, uint32_t pitch_pixels_tile_aligned_scaled = pitch_tiles_at_32bpp * (xenos::kEdramTileWidthSamples >> msaa_samples_x_log2) * - GetResolutionScaleX(); + draw_resolution_scale_x(); uint32_t max_render_target_width = GetMaxRenderTargetWidth(); if (pitch_pixels_tile_aligned_scaled > max_render_target_width) { // TODO(Triang3l): If really needed for some game on some device, clamp @@ -834,14 +815,13 @@ uint32_t RenderTargetCache::GetRenderTargetHeight( !(xenos::kTexture2DCubeMaxWidthHeight % xenos::kEdramTileHeightSamples), "Maximum guest render target height is 
assumed to always be a multiple " "of an EDRAM tile height"); - uint32_t resolution_scale_y = GetResolutionScaleY(); uint32_t max_height_scaled = - std::min(xenos::kTexture2DCubeMaxWidthHeight * resolution_scale_y, + std::min(xenos::kTexture2DCubeMaxWidthHeight * draw_resolution_scale_y(), GetMaxRenderTargetHeight()); uint32_t msaa_samples_y_log2 = uint32_t(msaa_samples >= xenos::MsaaSamples::k2X); uint32_t tile_height_samples_scaled = - xenos::kEdramTileHeightSamples * resolution_scale_y; + xenos::kEdramTileHeightSamples * draw_resolution_scale_y(); tile_rows = std::min(tile_rows, (max_height_scaled << msaa_samples_y_log2) / tile_height_samples_scaled); assert_not_zero(tile_rows); @@ -868,9 +848,9 @@ void RenderTargetCache::GetHostDepthStoreRectangleInfo( (transfer_rectangle.width_pixels >> 3) - 1; rectangle_constant_out = rectangle_constant; // 1 thread group = 64x8 host samples. - uint32_t pixel_size_x = GetResolutionScaleX() + uint32_t pixel_size_x = draw_resolution_scale_x() << uint32_t(msaa_samples >= xenos::MsaaSamples::k4X); - uint32_t pixel_size_y = GetResolutionScaleY() + uint32_t pixel_size_y = draw_resolution_scale_y() << uint32_t(msaa_samples >= xenos::MsaaSamples::k2X); group_count_x_out = (transfer_rectangle.width_pixels * pixel_size_x + 63) >> 6; @@ -1001,7 +981,7 @@ bool RenderTargetCache::PrepareHostRenderTargetsResolveClear( uint32_t pitch_pixels = pitch_tiles_at_32bpp * (xenos::kEdramTileWidthSamples >> msaa_samples_x_log2); - uint32_t pitch_pixels_scaled = pitch_pixels * GetResolutionScaleX(); + uint32_t pitch_pixels_scaled = pitch_pixels * draw_resolution_scale_x(); uint32_t max_render_target_width = GetMaxRenderTargetWidth(); if (pitch_pixels_scaled > max_render_target_width) { // TODO(Triang3l): If really needed for some game on some device, clamp the @@ -1147,12 +1127,10 @@ RenderTargetCache::RenderTarget* RenderTargetCache::PrepareFullEdram1280xRenderTargetForSnapshotRestoration( xenos::ColorRenderTargetFormat color_format) { 
assert_true(GetPath() == Path::kHostRenderTargets); - uint32_t resolution_scale_x = GetResolutionScaleX(); - uint32_t resolution_scale_y = GetResolutionScaleY(); constexpr uint32_t kPitchTilesAt32bpp = 16; constexpr uint32_t kWidth = kPitchTilesAt32bpp * xenos::kEdramTileWidthSamples; - if (kWidth * resolution_scale_x > GetMaxRenderTargetWidth()) { + if (kWidth * draw_resolution_scale_x() > GetMaxRenderTargetWidth()) { return nullptr; } // Same render target height is used for 32bpp and 64bpp to allow mixing them. @@ -1168,7 +1146,7 @@ RenderTargetCache::PrepareFullEdram1280xRenderTargetForSnapshotRestoration( "Using width of the render target for EDRAM snapshot restoration that is " "expect to fully cover the EDRAM without exceeding the maximum guest " "render target height."); - if (kHeight * resolution_scale_y > GetMaxRenderTargetHeight()) { + if (kHeight * draw_resolution_scale_y() > GetMaxRenderTargetHeight()) { return nullptr; } RenderTargetKey render_target_key; diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index 010675d6b..a8cab45d6 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -29,8 +29,6 @@ #include "xenia/gpu/xenos.h" DECLARE_bool(depth_transfer_not_equal_test); -DECLARE_int32(draw_resolution_scale_x); -DECLARE_int32(draw_resolution_scale_y); DECLARE_bool(draw_resolution_scaled_texture_offsets); DECLARE_bool(gamma_render_target_as_srgb); DECLARE_bool(native_2x_msaa); @@ -204,10 +202,10 @@ class RenderTargetCache { // would participate in filtering. However, 1x1 scissor rounded to 1x1, with // the half-pixel offset of vertices, would cause the entire 0.75...2.25 quad // to be discarded. 
- virtual uint32_t GetResolutionScaleX() const = 0; - virtual uint32_t GetResolutionScaleY() const = 0; - bool IsResolutionScaled() const { - return GetResolutionScaleX() > 1 || GetResolutionScaleY() > 1; + uint32_t draw_resolution_scale_x() const { return draw_resolution_scale_x_; } + uint32_t draw_resolution_scale_y() const { return draw_resolution_scale_y_; } + bool IsDrawResolutionScaled() const { + return draw_resolution_scale_x() > 1 || draw_resolution_scale_y() > 1; } // Virtual (both the common code and the implementation may do something @@ -232,9 +230,15 @@ class RenderTargetCache { protected: RenderTargetCache(const RegisterFile& register_file, const Memory& memory, - TraceWriter* trace_writer) + TraceWriter* trace_writer, uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y) : register_file_(register_file), - draw_extent_estimator_(register_file, memory, trace_writer) {} + draw_extent_estimator_(register_file, memory, trace_writer), + draw_resolution_scale_x_(draw_resolution_scale_x), + draw_resolution_scale_y_(draw_resolution_scale_y) { + assert_not_zero(draw_resolution_scale_x); + assert_not_zero(draw_resolution_scale_y); + } const RegisterFile& register_file() const { return register_file_; } @@ -559,8 +563,8 @@ class RenderTargetCache { uint32_t pitch_tiles, bool msaa_2x_supported) const { HostDepthStoreRenderTargetConstant constant; constant.pitch_tiles = pitch_tiles; - constant.resolution_scale_x = GetResolutionScaleX(); - constant.resolution_scale_y = GetResolutionScaleY(); + constant.resolution_scale_x = draw_resolution_scale_x(); + constant.resolution_scale_y = draw_resolution_scale_y(); constant.msaa_2x_supported = uint32_t(msaa_2x_supported); return constant; } @@ -612,6 +616,8 @@ class RenderTargetCache { private: const RegisterFile& register_file_; + uint32_t draw_resolution_scale_x_; + uint32_t draw_resolution_scale_y_; DrawExtentEstimator draw_extent_estimator_; diff --git a/src/xenia/gpu/shared_memory.cc 
b/src/xenia/gpu/shared_memory.cc index 2b05821dc..c7c5c9c19 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -208,10 +208,6 @@ SharedMemory::WatchHandle SharedMemory::WatchMemoryRange( } void SharedMemory::UnwatchMemoryRange(WatchHandle handle) { - if (handle == nullptr) { - // Could be a zero length range. - return; - } auto global_lock = global_critical_region_.Acquire(); UnlinkWatchRange(reinterpret_cast(handle)); } @@ -228,8 +224,8 @@ void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last, // Fire global watches. for (const auto global_watch : global_watches_) { - global_watch->callback(global_watch->callback_context, address_first, - address_last, invalidated_by_gpu); + global_watch->callback(global_lock, global_watch->callback_context, + address_first, address_last, invalidated_by_gpu); } // Fire per-range watches. @@ -241,8 +237,9 @@ void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last, // will be broken. node = node->bucket_node_next; if (page_first <= range->page_last && page_last >= range->page_first) { - range->callback(range->callback_context, range->callback_data, - range->callback_argument, invalidated_by_gpu); + range->callback(global_lock, range->callback_context, + range->callback_data, range->callback_argument, + invalidated_by_gpu); UnlinkWatchRange(range); } } diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h index 386997bab..75d98f143 100644 --- a/src/xenia/gpu/shared_memory.h +++ b/src/xenia/gpu/shared_memory.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2020 Ben Vanik. All rights reserved. * + * Copyright 2022 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. 
* ****************************************************************************** */ @@ -11,6 +11,7 @@ #define XENIA_GPU_SHARED_MEMORY_H_ #include +#include #include #include @@ -32,9 +33,9 @@ class SharedMemory { // Call in the implementation-specific ClearCache. virtual void ClearCache(); - typedef void (*GlobalWatchCallback)(void* context, uint32_t address_first, - uint32_t address_last, - bool invalidated_by_gpu); + typedef void (*GlobalWatchCallback)( + const std::unique_lock& global_lock, void* context, + uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu); typedef void* GlobalWatchHandle; // Registers a callback invoked when something is invalidated in the GPU // memory copy by the CPU or (if triggered explicitly - such as by a resolve) @@ -47,8 +48,9 @@ class SharedMemory { GlobalWatchHandle RegisterGlobalWatch(GlobalWatchCallback callback, void* callback_context); void UnregisterGlobalWatch(GlobalWatchHandle handle); - typedef void (*WatchCallback)(void* context, void* data, uint64_t argument, - bool invalidated_by_gpu); + typedef void (*WatchCallback)( + const std::unique_lock& global_lock, void* context, + void* data, uint64_t argument, bool invalidated_by_gpu); typedef void* WatchHandle; // Registers a callback invoked when the specified memory range is invalidated // in the GPU memory copy by the CPU or (if triggered explicitly - such as by diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc new file mode 100644 index 000000000..ebe503ce3 --- /dev/null +++ b/src/xenia/gpu/texture_cache.cc @@ -0,0 +1,871 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/gpu/texture_cache.h" + +#include <algorithm> +#include <cstdint> +#include <cstring> + +#include "xenia/base/assert.h" +#include "xenia/base/clock.h" +#include "xenia/base/cvar.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" +#include "xenia/base/profiling.h" +#include "xenia/gpu/gpu_flags.h" +#include "xenia/gpu/register_file.h" +#include "xenia/gpu/texture_info.h" +#include "xenia/gpu/texture_util.h" +#include "xenia/gpu/xenos.h" + +DEFINE_int32( + draw_resolution_scale_x, 1, + "Integer pixel width scale used for scaling the rendering resolution " + "opaquely to the game.\n" + "1, 2 and 3 may be supported, but support of anything above 1 depends on " + "the device properties, such as whether it supports sparse binding / tiled " + "resources, the number of virtual address bits per resource, and other " + "factors.\n" + "Various effects and parts of game rendering pipelines may work " + "incorrectly as pixels become ambiguous from the game's perspective and " + "because half-pixel offset (which normally doesn't affect coverage when " + "MSAA isn't used) becomes full-pixel.", + "GPU"); +DEFINE_int32( + draw_resolution_scale_y, 1, + "Integer pixel height scale used for scaling the rendering resolution " + "opaquely to the game.\n" + "See draw_resolution_scale_x for more information.", + "GPU"); +DEFINE_uint32( + texture_cache_memory_limit_soft, 384, + "Maximum host texture memory usage (in megabytes) above which old textures " + "will be destroyed.", + "GPU"); +DEFINE_uint32( + texture_cache_memory_limit_soft_lifetime, 30, + "Seconds a texture should be unused to be considered old enough to be " + "deleted if texture memory usage exceeds texture_cache_memory_limit_soft.", + "GPU"); +DEFINE_uint32( + texture_cache_memory_limit_hard, 768, + "Maximum host texture memory usage (in megabytes) above which textures " + "will be destroyed as soon as possible.", + "GPU");
+DEFINE_uint32( + texture_cache_memory_limit_render_to_texture, 24, + "Part of the host texture memory budget (in megabytes) that will be scaled " + "by the current drawing resolution scale.\n" + "If texture_cache_memory_limit_soft, for instance, is 384, and this is 24, " + "it will be assumed that the game will be using roughly 24 MB of " + "render-to-texture (resolve) targets and 384 - 24 = 360 MB of regular " + "textures - so with 2x2 resolution scaling, the soft limit will be 360 + " + "96 MB, and with 3x3, it will be 360 + 216 MB.", + "GPU"); + +namespace xe { +namespace gpu { + +TextureCache::TextureCache(const RegisterFile& register_file, + SharedMemory& shared_memory, + uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y) + : register_file_(register_file), + shared_memory_(shared_memory), + draw_resolution_scale_x_(draw_resolution_scale_x), + draw_resolution_scale_y_(draw_resolution_scale_y) { + assert_true(draw_resolution_scale_x >= 1); + assert_true(draw_resolution_scale_x <= kMaxDrawResolutionScaleAlongAxis); + assert_true(draw_resolution_scale_y >= 1); + assert_true(draw_resolution_scale_y <= kMaxDrawResolutionScaleAlongAxis); + + if (draw_resolution_scale_x > 1 || draw_resolution_scale_y > 1) { + constexpr uint32_t kScaledResolvePageDwordCount = + SharedMemory::kBufferSize / 4096 / 32; + scaled_resolve_pages_ = + std::unique_ptr(new uint32_t[kScaledResolvePageDwordCount]); + std::memset(scaled_resolve_pages_.get(), 0, + kScaledResolvePageDwordCount * sizeof(uint32_t)); + std::memset(scaled_resolve_pages_l2_, 0, sizeof(scaled_resolve_pages_l2_)); + scaled_resolve_global_watch_handle_ = shared_memory.RegisterGlobalWatch( + ScaledResolveGlobalWatchCallbackThunk, this); + } +} + +TextureCache::~TextureCache() { + DestroyAllTextures(true); + + if (scaled_resolve_global_watch_handle_) { + shared_memory().UnregisterGlobalWatch(scaled_resolve_global_watch_handle_); + } +} + +bool TextureCache::GetConfigDrawResolutionScale(uint32_t& x_out, + 
uint32_t& y_out) { + uint32_t config_x = + uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_x)); + uint32_t config_y = + uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_y)); + uint32_t clamped_x = std::min(kMaxDrawResolutionScaleAlongAxis, config_x); + uint32_t clamped_y = std::min(kMaxDrawResolutionScaleAlongAxis, config_y); + x_out = clamped_x; + y_out = clamped_y; + return clamped_x == config_x && clamped_y == config_y; +} + +void TextureCache::ClearCache() { DestroyAllTextures(); } + +void TextureCache::CompletedSubmissionUpdated( + uint64_t completed_submission_index) { + // If memory usage is too high, destroy unused textures. + uint64_t current_time = xe::Clock::QueryHostUptimeMillis(); + // texture_cache_memory_limit_render_to_texture is assumed to be included in + // texture_cache_memory_limit_soft and texture_cache_memory_limit_hard, at 1x, + // so subtracting 1 from the scale. + uint32_t limit_scaled_resolve_add_mb = + cvars::texture_cache_memory_limit_render_to_texture * + (draw_resolution_scale_x() * draw_resolution_scale_y() - 1); + uint32_t limit_soft_mb = + cvars::texture_cache_memory_limit_soft + limit_scaled_resolve_add_mb; + uint32_t limit_hard_mb = + cvars::texture_cache_memory_limit_hard + limit_scaled_resolve_add_mb; + uint32_t limit_soft_lifetime = + cvars::texture_cache_memory_limit_soft_lifetime * 1000; + bool destroyed_any = false; + while (texture_used_first_ != nullptr) { + uint64_t total_host_memory_usage_mb = + (textures_total_host_memory_usage_ + ((UINT32_C(1) << 20) - 1)) >> 20; + bool limit_hard_exceeded = total_host_memory_usage_mb > limit_hard_mb; + if (total_host_memory_usage_mb <= limit_soft_mb && !limit_hard_exceeded) { + break; + } + Texture* texture = texture_used_first_; + if (texture->last_usage_submission_index() > completed_submission_index) { + break; + } + if (!limit_hard_exceeded && + (texture->last_usage_time() + limit_soft_lifetime) > current_time) { + break; + } + if (!destroyed_any) { + 
destroyed_any = true; + // The texture being destroyed might have been bound in the previous + // submissions, and nothing has overwritten the binding yet, so completion + // of the submission where the texture was last actually used on the GPU + // doesn't imply that it's not bound currently. Reset bindings if + // any texture has been destroyed. + ResetTextureBindings(); + } + // Remove the texture from the map and destroy it via its unique_ptr. + auto found_texture_it = textures_.find(texture->key()); + assert_true(found_texture_it != textures_.end()); + if (found_texture_it != textures_.end()) { + assert_true(found_texture_it->second.get() == texture); + textures_.erase(found_texture_it); + // `texture` is invalid now. + } + } + if (destroyed_any) { + COUNT_profile_set("gpu/texture_cache/textures", textures_.size()); + } +} + +void TextureCache::BeginSubmission(uint64_t new_submission_index) { + assert_true(new_submission_index > current_submission_index_); + current_submission_index_ = new_submission_index; + current_submission_time_ = xe::Clock::QueryHostUptimeMillis(); +} + +void TextureCache::BeginFrame() { + // In case there was a failure to create something in the previous frame, make + // sure bindings are reset so a new attempt will surely be made if the texture + // is requested again. 
+ ResetTextureBindings(); +} + +void TextureCache::MarkRangeAsResolved(uint32_t start_unscaled, + uint32_t length_unscaled) { + if (length_unscaled == 0) { + return; + } + start_unscaled &= 0x1FFFFFFF; + length_unscaled = std::min(length_unscaled, 0x20000000 - start_unscaled); + + if (IsDrawResolutionScaled()) { + uint32_t page_first = start_unscaled >> 12; + uint32_t page_last = (start_unscaled + length_unscaled - 1) >> 12; + uint32_t block_first = page_first >> 5; + uint32_t block_last = page_last >> 5; + auto global_lock = global_critical_region_.Acquire(); + for (uint32_t i = block_first; i <= block_last; ++i) { + uint32_t add_bits = UINT32_MAX; + if (i == block_first) { + add_bits &= ~((UINT32_C(1) << (page_first & 31)) - 1); + } + if (i == block_last && (page_last & 31) != 31) { + add_bits &= (UINT32_C(1) << ((page_last & 31) + 1)) - 1; + } + scaled_resolve_pages_[i] |= add_bits; + scaled_resolve_pages_l2_[i >> 6] |= UINT64_C(1) << (i & 63); + } + } + + // Invalidate textures. Toggling individual textures between scaled and + // unscaled also relies on invalidation through shared memory. + shared_memory().RangeWrittenByGpu(start_unscaled, length_unscaled, true); +} + +uint32_t TextureCache::GuestToHostSwizzle(uint32_t guest_swizzle, + uint32_t host_format_swizzle) { + uint32_t host_swizzle = 0; + for (uint32_t i = 0; i < 4; ++i) { + uint32_t guest_swizzle_component = (guest_swizzle >> (3 * i)) & 0b111; + uint32_t host_swizzle_component; + if (guest_swizzle_component >= xenos::XE_GPU_TEXTURE_SWIZZLE_0) { + // Get rid of 6 and 7 values (to prevent host GPU errors if the game has + // something broken) the simple way - by changing them to 4 (0) and 5 (1). 
+ host_swizzle_component = guest_swizzle_component & 0b101; + } else { + host_swizzle_component = + (host_format_swizzle >> (3 * guest_swizzle_component)) & 0b111; + } + host_swizzle |= host_swizzle_component << (3 * i); + } + return host_swizzle; +} + +void TextureCache::RequestTextures(uint32_t used_texture_mask) { + const auto& regs = register_file(); + + if (texture_became_outdated_.exchange(false, std::memory_order_acquire)) { + // A texture has become outdated - make sure whether textures are outdated + // is rechecked in this draw and in subsequent ones to reload the new data + // if needed. + ResetTextureBindings(); + } + + // Update the texture keys and the textures. + uint32_t bindings_changed = 0; + uint32_t textures_remaining = used_texture_mask & ~texture_bindings_in_sync_; + uint32_t index = 0; + while (xe::bit_scan_forward(textures_remaining, &index)) { + uint32_t index_bit = UINT32_C(1) << index; + textures_remaining &= ~index_bit; + TextureBinding& binding = texture_bindings_[index]; + const auto& fetch = regs.Get( + XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + index * 6); + TextureKey old_key = binding.key; + uint8_t old_swizzled_signs = binding.swizzled_signs; + BindingInfoFromFetchConstant(fetch, binding.key, &binding.swizzled_signs); + texture_bindings_in_sync_ |= index_bit; + if (!binding.key.is_valid) { + if (old_key.is_valid) { + bindings_changed |= index_bit; + } + binding.Reset(); + continue; + } + uint32_t old_host_swizzle = binding.host_swizzle; + binding.host_swizzle = + GuestToHostSwizzle(fetch.swizzle, GetHostFormatSwizzle(binding.key)); + + // Check if need to load the unsigned and the signed versions of the texture + // (if the format is emulated with different host bit representations for + // signed and unsigned - otherwise only the unsigned one is loaded). 
+ bool key_changed = binding.key != old_key; + bool any_sign_was_not_signed = + texture_util::IsAnySignNotSigned(old_swizzled_signs); + bool any_sign_was_signed = + texture_util::IsAnySignSigned(old_swizzled_signs); + bool any_sign_is_not_signed = + texture_util::IsAnySignNotSigned(binding.swizzled_signs); + bool any_sign_is_signed = + texture_util::IsAnySignSigned(binding.swizzled_signs); + if (key_changed || binding.host_swizzle != old_host_swizzle || + any_sign_is_not_signed != any_sign_was_not_signed || + any_sign_is_signed != any_sign_was_signed) { + bindings_changed |= index_bit; + } + bool load_unsigned_data = false, load_signed_data = false; + if (IsSignedVersionSeparateForFormat(binding.key)) { + // Can reuse previously loaded unsigned/signed versions if the key is the + // same and the texture was previously bound as unsigned/signed + // respectively (checking the previous values of signedness rather than + // binding.texture != nullptr and binding.texture_signed != nullptr also + // prevents repeated attempts to load the texture if it has failed to + // load). + if (any_sign_is_not_signed) { + if (key_changed || !any_sign_was_not_signed) { + binding.texture = FindOrCreateTexture(binding.key); + load_unsigned_data = true; + } + } else { + binding.texture = nullptr; + } + if (any_sign_is_signed) { + if (key_changed || !any_sign_was_signed) { + TextureKey signed_key = binding.key; + signed_key.signed_separate = 1; + binding.texture_signed = FindOrCreateTexture(signed_key); + load_signed_data = true; + } + } else { + binding.texture_signed = nullptr; + } + } else { + // Same resource for both unsigned and signed, but descriptor formats may + // be different. 
+ if (key_changed) { + binding.texture = FindOrCreateTexture(binding.key); + load_unsigned_data = true; + } + binding.texture_signed = nullptr; + } + if (load_unsigned_data && binding.texture != nullptr) { + LoadTextureData(*binding.texture); + } + if (load_signed_data && binding.texture_signed != nullptr) { + LoadTextureData(*binding.texture_signed); + } + } + if (bindings_changed) { + UpdateTextureBindingsImpl(bindings_changed); + } +} + +const char* TextureCache::TextureKey::GetLogDimensionName( + xenos::DataDimension dimension) { + switch (dimension) { + case xenos::DataDimension::k1D: + return "1D"; + case xenos::DataDimension::k2DOrStacked: + return "2D"; + case xenos::DataDimension::k3D: + return "3D"; + case xenos::DataDimension::kCube: + return "cube"; + default: + assert_unhandled_case(dimension); + return "unknown"; + } +} + +void TextureCache::TextureKey::LogAction(const char* action) const { + XELOGGPU( + "{} {} {}{}x{}x{} {} {} texture with {} {}packed mip level{}, " + "base at 0x{:08X} (pitch {}), mips at 0x{:08X}", + action, tiled ? "tiled" : "linear", scaled_resolve ? "scaled " : "", + GetWidth(), GetHeight(), GetDepthOrArraySize(), GetLogDimensionName(), + FormatInfo::Get(format)->name, mip_max_level + 1, packed_mips ? "" : "un", + mip_max_level != 0 ? "s" : "", base_page << 12, pitch << 5, + mip_page << 12); +} + +void TextureCache::Texture::LogAction(const char* action) const { + XELOGGPU( + "{} {} {}{}x{}x{} {} {} texture with {} {}packed mip level{}, " + "base at 0x{:08X} (pitch {}, size 0x{:08X}), mips at 0x{:08X} (size " + "0x{:08X})", + action, key_.tiled ? "tiled" : "linear", + key_.scaled_resolve ? "scaled " : "", key_.GetWidth(), key_.GetHeight(), + key_.GetDepthOrArraySize(), key_.GetLogDimensionName(), + FormatInfo::Get(key_.format)->name, key_.mip_max_level + 1, + key_.packed_mips ? "" : "un", key_.mip_max_level != 0 ? 
"s" : "", + key_.base_page << 12, key_.pitch << 5, GetGuestBaseSize(), + key_.mip_page << 12, GetGuestMipsSize()); +} + +// The texture must be in the recent usage list. Place it in front now because +// after creation, the texture will likely be used immediately, and it should +// not be destroyed immediately after creation if dropping of old textures is +// performed somehow. The list is maintained by the Texture, not the +// TextureCache itself (unlike the `textures_` container). +TextureCache::Texture::Texture(TextureCache& texture_cache, + const TextureKey& key) + : texture_cache_(texture_cache), + key_(key), + guest_layout_(key.GetGuestLayout()), + base_resolved_(key.scaled_resolve), + mips_resolved_(key.scaled_resolve), + last_usage_submission_index_(texture_cache.current_submission_index_), + last_usage_time_(texture_cache.current_submission_time_), + used_previous_(texture_cache.texture_used_last_), + used_next_(nullptr) { + if (texture_cache.texture_used_last_) { + texture_cache.texture_used_last_->used_next_ = this; + } else { + texture_cache.texture_used_first_ = this; + } + texture_cache.texture_used_last_ = this; + + // Never try to upload data that doesn't exist. 
+ base_outdated_ = guest_layout().base.level_data_extent_bytes != 0; + mips_outdated_ = guest_layout().mips_total_extent_bytes != 0; +} + +TextureCache::Texture::~Texture() { + if (mips_watch_handle_) { + texture_cache().shared_memory().UnwatchMemoryRange(mips_watch_handle_); + } + if (base_watch_handle_) { + texture_cache().shared_memory().UnwatchMemoryRange(base_watch_handle_); + } + + if (used_previous_) { + used_previous_->used_next_ = used_next_; + } else { + texture_cache_.texture_used_first_ = used_next_; + } + if (used_next_) { + used_next_->used_previous_ = used_previous_; + } else { + texture_cache_.texture_used_last_ = used_previous_; + } + + texture_cache_.UpdateTexturesTotalHostMemoryUsage(0, host_memory_usage_); +} + +void TextureCache::Texture::MakeUpToDateAndWatch( + const std::unique_lock& global_lock) { + SharedMemory& shared_memory = texture_cache().shared_memory(); + if (base_outdated_) { + assert_not_zero(GetGuestBaseSize()); + base_outdated_ = false; + base_watch_handle_ = shared_memory.WatchMemoryRange( + key().base_page << 12, GetGuestBaseSize(), TextureCache::WatchCallback, + this, nullptr, 0); + } + if (mips_outdated_) { + assert_not_zero(GetGuestMipsSize()); + mips_outdated_ = false; + mips_watch_handle_ = shared_memory.WatchMemoryRange( + key().mip_page << 12, GetGuestMipsSize(), TextureCache::WatchCallback, + this, nullptr, 1); + } +} + +void TextureCache::Texture::MarkAsUsed() { + // This is called very frequently, don't relink unless needed for caching. + if (last_usage_submission_index_ == + texture_cache_.current_submission_index_) { + return; + } + last_usage_submission_index_ = texture_cache_.current_submission_index_; + last_usage_time_ = texture_cache_.current_submission_time_; + if (used_next_ == nullptr) { + // Already the most recently used. 
+ return; + } + if (used_previous_ != nullptr) { + used_previous_->used_next_ = used_next_; + } else { + texture_cache_.texture_used_first_ = used_next_; + } + used_next_->used_previous_ = used_previous_; + used_previous_ = texture_cache_.texture_used_last_; + used_next_ = nullptr; + if (texture_cache_.texture_used_last_ != nullptr) { + texture_cache_.texture_used_last_->used_next_ = this; + } + texture_cache_.texture_used_last_ = this; +} + +void TextureCache::Texture::WatchCallback( + [[maybe_unused]] const std::unique_lock& global_lock, + bool is_mip) { + if (is_mip) { + assert_not_zero(GetGuestMipsSize()); + mips_outdated_ = true; + mips_watch_handle_ = nullptr; + } else { + assert_not_zero(GetGuestBaseSize()); + base_outdated_ = true; + base_watch_handle_ = nullptr; + } +} + +void TextureCache::WatchCallback( + const std::unique_lock& global_lock, void* context, + void* data, uint64_t argument, bool invalidated_by_gpu) { + Texture& texture = *static_cast(context); + texture.WatchCallback(global_lock, argument != 0); + texture.texture_cache().texture_became_outdated_.store( + true, std::memory_order_release); +} + +void TextureCache::DestroyAllTextures(bool from_destructor) { + ResetTextureBindings(from_destructor); + textures_.clear(); + COUNT_profile_set("gpu/texture_cache/textures", 0); +} + +TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) { + // Check if the texture is a scaled resolve texture. 
+ if (IsDrawResolutionScaled() && key.tiled && + IsScaledResolveSupportedForFormat(key)) { + texture_util::TextureGuestLayout scaled_resolve_guest_layout = + key.GetGuestLayout(); + if ((scaled_resolve_guest_layout.base.level_data_extent_bytes && + IsRangeScaledResolved( + key.base_page << 12, + scaled_resolve_guest_layout.base.level_data_extent_bytes)) || + (scaled_resolve_guest_layout.mips_total_extent_bytes && + IsRangeScaledResolved( + key.mip_page << 12, + scaled_resolve_guest_layout.mips_total_extent_bytes))) { + key.scaled_resolve = 1; + } + } + + uint32_t host_width = key.GetWidth(); + uint32_t host_height = key.GetHeight(); + if (key.scaled_resolve) { + host_width *= draw_resolution_scale_x(); + host_height *= draw_resolution_scale_y(); + } + // With 3x resolution scaling, a 2D texture may become bigger than the + // Direct3D 11 limit, and with 2x, a 3D one as well. + // TODO(Triang3l): Skip mips on Vulkan in this case - the minimum requirement + // there is 4096, which is below the Xenos maximum texture size of 8192. + uint32_t max_host_width_height = GetMaxHostTextureWidthHeight(key.dimension); + uint32_t max_host_depth_or_array_size = + GetMaxHostTextureDepthOrArraySize(key.dimension); + if (host_width > max_host_width_height || + host_height > max_host_width_height || + key.GetDepthOrArraySize() > max_host_depth_or_array_size) { + return nullptr; + } + + // Try to find an existing texture. + // TODO(Triang3l): Reuse a texture with mip_page unchanged, but base_page + // previously 0, now not 0, to save memory - common case in streaming. + auto found_texture_it = textures_.find(key); + if (found_texture_it != textures_.end()) { + return found_texture_it->second.get(); + } + + // Create the texture and add it to the map. 
+ Texture* texture; + { + std::unique_ptr new_texture = CreateTexture(key); + if (!new_texture) { + key.LogAction("Failed to create"); + return nullptr; + } + assert_true(new_texture->key() == key); + texture = + textures_.emplace(key, std::move(new_texture)).first->second.get(); + } + COUNT_profile_set("gpu/texture_cache/textures", textures_.size()); + texture->LogAction("Created"); + return texture; +} + +bool TextureCache::LoadTextureData(Texture& texture) { + // Check what needs to be uploaded. + bool base_outdated, mips_outdated; + { + auto global_lock = global_critical_region_.Acquire(); + base_outdated = texture.base_outdated(global_lock); + mips_outdated = texture.mips_outdated(global_lock); + } + if (!base_outdated && !mips_outdated) { + return true; + } + + TextureKey texture_key = texture.key(); + + // Request uploading of the texture data to the shared memory. + // This is also necessary when resolution scaling is used - the texture cache + // relies on shared memory for invalidation of both unscaled and scaled + // textures. Plus a texture may be unscaled partially, when only a portion of + // its pages is invalidated, in this case we'll need the texture from the + // shared memory to load the unscaled parts. + // TODO(Triang3l): Load unscaled parts. + bool base_resolved = texture.GetBaseResolved(); + if (base_outdated) { + if (!shared_memory().RequestRange( + texture_key.base_page << 12, texture.GetGuestBaseSize(), + texture_key.scaled_resolve ? nullptr : &base_resolved)) { + return false; + } + } + bool mips_resolved = texture.GetMipsResolved(); + if (mips_outdated) { + if (!shared_memory().RequestRange( + texture_key.mip_page << 12, texture.GetGuestMipsSize(), + texture_key.scaled_resolve ? 
nullptr : &mips_resolved)) { + return false; + } + } + if (texture_key.scaled_resolve) { + // Make sure all the scaled resolve memory is resident and accessible from + // the shader, including any possible padding that hasn't yet been touched + // by an actual resolve, but is still included in the texture size, so the + // GPU won't be trying to access unmapped memory. + if (!EnsureScaledResolveMemoryCommitted(texture_key.base_page << 12, + texture.GetGuestBaseSize())) { + return false; + } + if (!EnsureScaledResolveMemoryCommitted(texture_key.mip_page << 12, + texture.GetGuestMipsSize())) { + return false; + } + } + + // Actually load the texture data. + if (!LoadTextureDataFromResidentMemoryImpl(texture, base_outdated, + mips_outdated)) { + return false; + } + + // Update the source of the texture (resolve vs. CPU or memexport) for + // purposes of handling piecewise gamma emulation via sRGB and for resolution + // scale in sampling offsets. + if (!texture_key.scaled_resolve) { + texture.SetBaseResolved(base_resolved); + texture.SetMipsResolved(mips_resolved); + } + + // Mark the ranges as uploaded and watch them. This is needed for scaled + // resolves as well to detect when the CPU wants to reuse the memory for a + // regular texture or a vertex buffer, and thus the scaled resolve version is + // not up to date anymore. + texture.MakeUpToDateAndWatch(global_critical_region_.Acquire()); + + texture.LogAction("Loaded"); + + return true; +} + +void TextureCache::BindingInfoFromFetchConstant( + const xenos::xe_gpu_texture_fetch_t& fetch, TextureKey& key_out, + uint8_t* swizzled_signs_out) { + // Reset the key and the signedness. 
+ key_out.MakeInvalid(); + if (swizzled_signs_out != nullptr) { + *swizzled_signs_out = + uint8_t(xenos::TextureSign::kUnsigned) * uint8_t(0b01010101); + } + + switch (fetch.type) { + case xenos::FetchConstantType::kTexture: + break; + case xenos::FetchConstantType::kInvalidTexture: + if (cvars::gpu_allow_invalid_fetch_constants) { + break; + } + XELOGW( + "Texture fetch constant ({:08X} {:08X} {:08X} {:08X} {:08X} {:08X}) " + "has \"invalid\" type! This is incorrect behavior, but you can try " + "bypassing this by launching Xenia with " + "--gpu_allow_invalid_fetch_constants=true.", + fetch.dword_0, fetch.dword_1, fetch.dword_2, fetch.dword_3, + fetch.dword_4, fetch.dword_5); + return; + default: + XELOGW( + "Texture fetch constant ({:08X} {:08X} {:08X} {:08X} {:08X} {:08X}) " + "is completely invalid!", + fetch.dword_0, fetch.dword_1, fetch.dword_2, fetch.dword_3, + fetch.dword_4, fetch.dword_5); + return; + } + + uint32_t width_minus_1, height_minus_1, depth_or_array_size_minus_1; + uint32_t base_page, mip_page, mip_max_level; + texture_util::GetSubresourcesFromFetchConstant( + fetch, &width_minus_1, &height_minus_1, &depth_or_array_size_minus_1, + &base_page, &mip_page, nullptr, &mip_max_level); + if (base_page == 0 && mip_page == 0) { + // No texture data at all. + return; + } + if (fetch.dimension == xenos::DataDimension::k1D) { + bool is_invalid_1d = false; + // TODO(Triang3l): Support long 1D textures. + if (width_minus_1 >= xenos::kTexture2DCubeMaxWidthHeight) { + XELOGE( + "1D texture is too wide ({}) - ignoring! Report the game to Xenia " + "developers", + width_minus_1 + 1); + is_invalid_1d = true; + } + assert_false(fetch.tiled); + if (fetch.tiled) { + XELOGE( + "1D texture has tiling enabled in the fetch constant, but this " + "appears to be completely wrong - ignoring! 
Report the game to Xenia " + "developers"); + is_invalid_1d = true; + } + assert_false(fetch.packed_mips); + if (fetch.packed_mips) { + XELOGE( + "1D texture has packed mips enabled in the fetch constant, but this " + "appears to be completely wrong - ignoring! Report the game to Xenia " + "developers"); + is_invalid_1d = true; + } + if (is_invalid_1d) { + return; + } + } + + xenos::TextureFormat format = GetBaseFormat(fetch.format); + + key_out.base_page = base_page; + key_out.mip_page = mip_page; + key_out.dimension = fetch.dimension; + key_out.width_minus_1 = width_minus_1; + key_out.height_minus_1 = height_minus_1; + key_out.depth_or_array_size_minus_1 = depth_or_array_size_minus_1; + key_out.pitch = fetch.pitch; + key_out.mip_max_level = mip_max_level; + key_out.tiled = fetch.tiled; + key_out.packed_mips = fetch.packed_mips; + key_out.format = format; + key_out.endianness = fetch.endianness; + + key_out.is_valid = 1; + + if (swizzled_signs_out != nullptr) { + *swizzled_signs_out = texture_util::SwizzleSigns(fetch); + } +} + +void TextureCache::ResetTextureBindings(bool from_destructor) { + uint32_t bindings_reset = 0; + for (size_t i = 0; i < texture_bindings_.size(); ++i) { + TextureBinding& binding = texture_bindings_[i]; + if (!binding.key.is_valid) { + continue; + } + binding.Reset(); + bindings_reset |= UINT32_C(1) << i; + } + texture_bindings_in_sync_ &= ~bindings_reset; + if (!from_destructor && bindings_reset) { + UpdateTextureBindingsImpl(bindings_reset); + } +} + +void TextureCache::UpdateTexturesTotalHostMemoryUsage(uint64_t add, + uint64_t subtract) { + textures_total_host_memory_usage_ = + textures_total_host_memory_usage_ - subtract + add; + COUNT_profile_set("gpu/texture_cache/total_host_memory_usage_mb", + uint32_t((textures_total_host_memory_usage_ + + ((UINT32_C(1) << 20) - 1)) >> + 20)); +} + +bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled, + uint32_t length_unscaled) { + if (!IsDrawResolutionScaled()) { + return false; + } 
+ + start_unscaled = std::min(start_unscaled, SharedMemory::kBufferSize); + length_unscaled = + std::min(length_unscaled, SharedMemory::kBufferSize - start_unscaled); + if (!length_unscaled) { + return false; + } + + // Two-level check for faster rejection since resolve targets are usually + // placed in relatively small and localized memory portions (confirmed by + // testing - pretty much all times the deeper level was entered, the texture + // was a resolve target). + uint32_t page_first = start_unscaled >> 12; + uint32_t page_last = (start_unscaled + length_unscaled - 1) >> 12; + uint32_t block_first = page_first >> 5; + uint32_t block_last = page_last >> 5; + uint32_t l2_block_first = block_first >> 6; + uint32_t l2_block_last = block_last >> 6; + auto global_lock = global_critical_region_.Acquire(); + for (uint32_t i = l2_block_first; i <= l2_block_last; ++i) { + uint64_t l2_block = scaled_resolve_pages_l2_[i]; + if (i == l2_block_first) { + l2_block &= ~((UINT64_C(1) << (block_first & 63)) - 1); + } + if (i == l2_block_last && (block_last & 63) != 63) { + l2_block &= (UINT64_C(1) << ((block_last & 63) + 1)) - 1; + } + uint32_t block_relative_index; + while (xe::bit_scan_forward(l2_block, &block_relative_index)) { + l2_block &= ~(UINT64_C(1) << block_relative_index); + uint32_t block_index = (i << 6) + block_relative_index; + uint32_t check_bits = UINT32_MAX; + if (block_index == block_first) { + check_bits &= ~((UINT32_C(1) << (page_first & 31)) - 1); + } + if (block_index == block_last && (page_last & 31) != 31) { + check_bits &= (UINT32_C(1) << ((page_last & 31) + 1)) - 1; + } + if (scaled_resolve_pages_[block_index] & check_bits) { + return true; + } + } + } + return false; +} + +void TextureCache::ScaledResolveGlobalWatchCallbackThunk( + const std::unique_lock<std::recursive_mutex>& global_lock, void* context, + uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) { + TextureCache* texture_cache = reinterpret_cast<TextureCache*>(context); + 
texture_cache->ScaledResolveGlobalWatchCallback( + global_lock, address_first, address_last, invalidated_by_gpu); +} + +void TextureCache::ScaledResolveGlobalWatchCallback( + const std::unique_lock<std::recursive_mutex>& global_lock, + uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) { + assert_true(IsDrawResolutionScaled()); + if (invalidated_by_gpu) { + // Resolves themselves do exactly the opposite of what this should do. + return; + } + // Mark scaled resolve ranges as non-scaled. Textures themselves will be + // invalidated by their shared memory watches. + uint32_t resolve_page_first = address_first >> 12; + uint32_t resolve_page_last = address_last >> 12; + uint32_t resolve_block_first = resolve_page_first >> 5; + uint32_t resolve_block_last = resolve_page_last >> 5; + uint32_t resolve_l2_block_first = resolve_block_first >> 6; + uint32_t resolve_l2_block_last = resolve_block_last >> 6; + for (uint32_t i = resolve_l2_block_first; i <= resolve_l2_block_last; ++i) { + uint64_t resolve_l2_block = scaled_resolve_pages_l2_[i]; + uint32_t resolve_block_relative_index; + while ( + xe::bit_scan_forward(resolve_l2_block, &resolve_block_relative_index)) { + resolve_l2_block &= ~(UINT64_C(1) << resolve_block_relative_index); + uint32_t resolve_block_index = (i << 6) + resolve_block_relative_index; + uint32_t resolve_keep_bits = 0; + if (resolve_block_index == resolve_block_first) { + resolve_keep_bits |= (UINT32_C(1) << (resolve_page_first & 31)) - 1; + } + if (resolve_block_index == resolve_block_last && + (resolve_page_last & 31) != 31) { + resolve_keep_bits |= + ~((UINT32_C(1) << ((resolve_page_last & 31) + 1)) - 1); + } + scaled_resolve_pages_[resolve_block_index] &= resolve_keep_bits; + if (scaled_resolve_pages_[resolve_block_index] == 0) { + scaled_resolve_pages_l2_[i] &= + ~(UINT64_C(1) << resolve_block_relative_index); + } + } + } +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h new file 
mode 100644 index 000000000..475cdfdfc --- /dev/null +++ b/src/xenia/gpu/texture_cache.h @@ -0,0 +1,568 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_TEXTURE_CACHE_H_ +#define XENIA_GPU_TEXTURE_CACHE_H_ + +#include +#include +#include +#include +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/hash.h" +#include "xenia/base/mutex.h" +#include "xenia/gpu/register_file.h" +#include "xenia/gpu/shared_memory.h" +#include "xenia/gpu/texture_util.h" +#include "xenia/gpu/xenos.h" + +namespace xe { +namespace gpu { + +// Manages host copies of guest textures, performing untiling, format and endian +// conversion of textures stored in the shared memory, and also handling +// invalidation. +// +// Mipmaps are treated the following way, according to the GPU hang message +// found in game executables explaining the valid usage of BaseAddress when +// streaming the largest LOD (it says games should not use 0 as the base address +// when the largest LOD isn't loaded, but rather, either allocate a valid +// address for it or make it the same as mip_address): +// - If the texture has a base address, but no mip address, it's not mipmapped - +// the host texture has only the largest level too. +// - If the texture has different non-zero base address and mip address, a host +// texture with mip_max_level+1 mipmaps is created - mip_min_level is ignored +// and treated purely as sampler state because there are tfetch instructions +// working directly with LOD values - including fetching with an explicit LOD. 
+// However, the max level is not ignored because any mip count can be +// specified when creating a texture, and another texture may be placed after +// the last one. +// - If the texture has a mip address, but the base address is 0 or the same as +// the mip address, a mipmapped texture is created, but min/max LOD is clamped +// to the lower bound of 1 - the game is expected to do that anyway until the +// largest LOD is loaded. +// TODO(Triang3l): Attach the largest LOD to existing textures with a valid +// mip_address but no base ever used yet (no base_address) to save memory +// because textures are streamed this way anyway. +class TextureCache { + public: + // Hard limit, originating from the half-pixel offset (two-pixel offset is too + // much, the resolve shaders, being generic for different scales, only + // duplicate the second pixel into the first, not the third), and also due to + // the bit counts used for passing the scale to shaders. + static constexpr uint32_t kMaxDrawResolutionScaleAlongAxis = 3; + + TextureCache(const TextureCache& texture_cache) = delete; + TextureCache& operator=(const TextureCache& texture_cache) = delete; + virtual ~TextureCache(); + + // Returns whether the actual scale is not smaller than the requested one. 
+ static bool GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t& y_out); + uint32_t draw_resolution_scale_x() const { return draw_resolution_scale_x_; } + uint32_t draw_resolution_scale_y() const { return draw_resolution_scale_y_; } + bool IsDrawResolutionScaled() const { + return draw_resolution_scale_x_ > 1 || draw_resolution_scale_y_ > 1; + } + + virtual void ClearCache(); + + virtual void CompletedSubmissionUpdated(uint64_t completed_submission_index); + virtual void BeginSubmission(uint64_t new_submission_index); + virtual void BeginFrame(); + + void MarkRangeAsResolved(uint32_t start_unscaled, uint32_t length_unscaled); + // Ensures the memory backing the range in the scaled resolve address space is + // allocated and returns whether it is. + virtual bool EnsureScaledResolveMemoryCommitted(uint32_t start_unscaled, + uint32_t length_unscaled) { + return false; + } + + static uint32_t GuestToHostSwizzle(uint32_t guest_swizzle, + uint32_t host_format_swizzle); + + void TextureFetchConstantWritten(uint32_t index) { + texture_bindings_in_sync_ &= ~(UINT32_C(1) << index); + } + + virtual void RequestTextures(uint32_t used_texture_mask); + + // "ActiveTexture" means as of the latest RequestTextures call. + + // Returns the post-swizzle signedness of a currently bound texture (must be + // called after RequestTextures). + uint8_t GetActiveTextureSwizzledSigns(uint32_t fetch_constant_index) const { + const TextureBinding* binding = + GetValidTextureBinding(fetch_constant_index); + return binding ? 
binding->swizzled_signs : kSwizzledSignsUnsigned; + } + bool IsActiveTextureResolved(uint32_t fetch_constant_index) const { + const TextureBinding* binding = + GetValidTextureBinding(fetch_constant_index); + if (!binding) { + return false; + } + return (binding->texture && binding->texture->IsResolved()) || + (binding->texture_signed && binding->texture_signed->IsResolved()); + } + + protected: + struct TextureKey { + // Dimensions minus 1 are stored similarly to how they're stored in fetch + // constants so fewer bits can be used, while the maximum size (8192 for 2D) + // can still be encoded (a 8192x sky texture is used in 4D530910). + + // Physical 4 KB page with the base mip level, disregarding A/C/E address + // range prefix. + uint32_t base_page : 17; // 17 total + xenos::DataDimension dimension : 2; // 19 + uint32_t width_minus_1 : 13; // 32 + + uint32_t height_minus_1 : 13; // 45 + uint32_t tiled : 1; // 46 + uint32_t packed_mips : 1; // 47 + // Physical 4 KB page with mip 1 and smaller. + uint32_t mip_page : 17; // 64 + + // (Layers for stacked and 3D, 6 for cube, 1 for other dimensions) - 1. + uint32_t depth_or_array_size_minus_1 : 10; // 74 + uint32_t pitch : 9; // 83 + uint32_t mip_max_level : 4; // 87 + xenos::TextureFormat format : 6; // 93 + xenos::Endian endianness : 2; // 95 + // Whether this texture is signed and has a different host representation + // than an unsigned view of the same guest texture. + uint32_t signed_separate : 1; // 96 + + // Whether this texture is a resolution-scaled resolve target. + uint32_t scaled_resolve : 1; // 97 + // Least important in ==, so placed last. + uint32_t is_valid : 1; // 98 + + TextureKey() { MakeInvalid(); } + TextureKey(const TextureKey& key) { + std::memcpy(this, &key, sizeof(*this)); + } + TextureKey& operator=(const TextureKey& key) { + std::memcpy(this, &key, sizeof(*this)); + return *this; + } + void MakeInvalid() { + // Zero everything, including the padding, for a stable hash. 
+ std::memset(this, 0, sizeof(*this)); + } + + using Hasher = xe::hash::XXHasher; + bool operator==(const TextureKey& key) const { + return !std::memcmp(this, &key, sizeof(*this)); + } + bool operator!=(const TextureKey& key) const { return !(*this == key); } + + uint32_t GetWidth() const { return width_minus_1 + 1; } + uint32_t GetHeight() const { return height_minus_1 + 1; } + uint32_t GetDepthOrArraySize() const { + return depth_or_array_size_minus_1 + 1; + } + + texture_util::TextureGuestLayout GetGuestLayout() const { + return texture_util::GetGuestTextureLayout( + dimension, pitch, GetWidth(), GetHeight(), GetDepthOrArraySize(), + tiled, format, packed_mips, base_page != 0, mip_max_level); + } + + static const char* GetLogDimensionName(xenos::DataDimension dimension); + const char* GetLogDimensionName() const { + return GetLogDimensionName(dimension); + } + void LogAction(const char* action) const; + }; + + class Texture { + public: + Texture(const Texture& texture) = delete; + Texture& operator=(const Texture& texture) = delete; + virtual ~Texture(); + + TextureCache& texture_cache() const { return texture_cache_; } + + const TextureKey& key() const { return key_; } + + const texture_util::TextureGuestLayout& guest_layout() const { + return guest_layout_; + } + uint32_t GetGuestBaseSize() const { + return guest_layout().base.level_data_extent_bytes; + } + uint32_t GetGuestMipsSize() const { + return guest_layout().mips_total_extent_bytes; + } + + uint64_t GetHostMemoryUsage() const { return host_memory_usage_; } + + uint64_t last_usage_submission_index() const { + return last_usage_submission_index_; + } + uint64_t last_usage_time() const { return last_usage_time_; } + + bool GetBaseResolved() const { return base_resolved_; } + void SetBaseResolved(bool base_resolved) { + assert_false(!base_resolved && key().scaled_resolve); + base_resolved_ = base_resolved; + } + bool GetMipsResolved() const { return mips_resolved_; } + void SetMipsResolved(bool 
mips_resolved) { + assert_false(!mips_resolved && key().scaled_resolve); + mips_resolved_ = mips_resolved; + } + bool IsResolved() const { return base_resolved_ || mips_resolved_; } + + bool base_outdated( + const std::unique_lock& global_lock) const { + return base_outdated_; + } + bool mips_outdated( + const std::unique_lock& global_lock) const { + return mips_outdated_; + } + void MakeUpToDateAndWatch( + const std::unique_lock& global_lock); + + void WatchCallback( + const std::unique_lock& global_lock, bool is_mip); + + // For LRU caching - updates the last usage frame and moves the texture to + // the end of the usage queue. Must be called any time the texture is + // referenced by any GPU work in the implementation to make sure it's not + // destroyed while still in use. + void MarkAsUsed(); + + void LogAction(const char* action) const; + + protected: + Texture(TextureCache& texture_cache, const TextureKey& key); + + void SetHostMemoryUsage(uint64_t new_host_memory_usage) { + texture_cache_.UpdateTexturesTotalHostMemoryUsage(new_host_memory_usage, + host_memory_usage_); + host_memory_usage_ = new_host_memory_usage; + } + + private: + TextureCache& texture_cache_; + + TextureKey key_; + + texture_util::TextureGuestLayout guest_layout_; + + uint64_t host_memory_usage_ = 0; + + uint64_t last_usage_submission_index_; + uint64_t last_usage_time_; + Texture* used_previous_; + Texture* used_next_; + + // Whether the most up-to-date base / mips contain pages with data from a + // resolve operation (rather than from the CPU or memexport), primarily for + // choosing between piecewise linear gamma and sRGB when the former is + // emulated with the latter. + bool base_resolved_; + bool mips_resolved_; + + // These are to be accessed within the global critical region to synchronize + // with shared memory. + // Whether the recent base level data needs reloading from the memory. 
+ bool base_outdated_ = false; + // Whether the recent mip data needs reloading from the memory. + bool mips_outdated_ = false; + // Watch handles for the memory ranges. + SharedMemory::WatchHandle base_watch_handle_ = nullptr; + SharedMemory::WatchHandle mips_watch_handle_ = nullptr; + }; + + // Rules of data access in load shaders: + // - Source reading (from the shared memory or the scaled resolve buffer): + // - Guest data may be stored in a sparsely-allocated buffer, or, in + // Direct3D 12 terms, a tiled buffer. This means that some regions of the + // buffer may not be mapped. On tiled resources tier 1 hardware, accessing + // unmapped tiles results in undefined behavior, including a GPU page + // fault and device removal. So, shaders must not try to access + // potentially unmapped regions (that are outside the texture memory + // extents calculated on the CPU, taking into account that Xenia can't + // overestimate texture sizes freely since it must not try to upload + // unallocated pages on the CPU). + // - Buffer tiles have 64 KB size on Direct3D 12. Vulkan has its own + // alignment requirements for sparse binding. But overall, we're + // allocating pretty large regions. + // - Resolution scaling disabled: + // - Shared memory allocates regions of power of two sizes that map + // directly to the same portions of the 512 MB of the console's + // physical memory. So, a 64 KB-aligned host buffer region is also 64 + // KB-aligned in the guest address space. + // - Tiled textures: 32x32x4-block tiles are always resident each as a + // whole. If the width is bigger than the pitch, the overflowing 32x32x4 + // tiles are also loaded as entire tiles. We do not have separate + // shaders for 2D and 3D. So, for tiled textures, it's safe to consider + // that if any location within a 32x32-aligned portion is within the + // texture bounds, the entire 32x32 portion also can be read. + // - Linear textures: Pitch is aligned to 256 bytes. 
Row count, however, + // is not aligned to anything (unless the mip tail is being loaded). The + // overflowing last row in case `width > pitch`, however, is made + // resident up to the last texel in it. But row start alignment is 256, + // which is a power of two, and is smaller than the Direct3D 12 tile + // size of 64 KB. So, if any block within a 256-aligned region is within + // the texture bounds, without resolution scaling, reading from any + // location in that 256-aligned region is safe. + // - Since we use the same shaders for tiled and linear textures (as well + // as 1D textures), this means that without resolution scaling, it's + // safe to access a min(256 bytes, 32 blocks)-aligned portion along X, + // but only within the same row of blocks, with bounds checking only for + // such portion as a whole, but without additional bounds checking + // inside of it. + // - Therefore, it's recommended that shaders read power-of-two amounts of + // blocks (so there will naturally be some alignment to some power of + // two), and this way, each thread may read at most 16 16bpb blocks or + // at most 32 8bpb or smaller blocks with in a single `if (x < width)` + // for the whole aligned range of the same length. + // - Resolution scaling enabled: + // - For simplicity, unlike in the shared memory, buffer tile boundaries + // are not aligned to powers of 2 the same way as guest addresses are. + // While for 2x2 resolution scaling it still happens to be the case + // because `host scaling unit address = guest scaling unit address << 2` + // (similarly for 2x1 and 1x2), for 3x or x3, it's not - a 64 KB host + // tile would represent 7281.777 guest bytes with 3x3 (disregarding that + // sequences of texels that are adjacent in memory alongside the + // horizontal axis, not individual bytes, are scaled, but even in that + // case it's not scaling by 2^n still). 
+ // - The above would affect the `width > pitch` case for linear textures, + // requiring overestimating the width in calculation of the range of the + // tiles to map, while not doing this overestimation on the guest memory + // extent calculation side (otherwise it may result in attempting to + // upload unallocated memory on the CPU). For example, let's take a look + // at an extreme case of a 369x28 k_8 texture with a pitch of 256 bytes. + // The last row, in guest memory, would be loaded from the [7168, 7281) + // range, or, with 3x3 resolution scaling, from bytes [64512, 65529). + // However, if we try to unconditionally load 2 pixels, like the texture + // is 370x28, we will be accessing the bytes [64512, 65538). But bytes + // 65536 and 65537 will be in another 64 KB tile, which may be not + // mapped yet. However, none of this is an issue for one simple reason - + // resolving is only possible to tiled textures, so linear textures will + // never be resolution-scaled. + // - Tiled textures have potentially referenced guest 32x32-block tiles + // loaded in their entirety. So, just like for unscaled textures, if any + // block within a tile is available, the entire tile is as well. + // - Destination writing (to the linear buffer): + // - host_x_blocks_per_thread specifies how many pixels can be written + // without bounds checking within increments of that amount - the pitch of + // the destination buffer is manually overaligned if needed. + + // In textures, resolution scaling is done for 8-byte portions of memory for + // 8bpp textures, and for 16-byte portions for textures of higher bit depths + // (these are the sizes of regions where contiguous texels in memory are also + // contiguous in the texture along the horizontal axis, so 64-bit and 128-bit + // loads / stores, for 8bpp and 16bpp+ respectively, can be used for untiling + // regardless of the resolution scale). 
+ + struct LoadConstants { + uint32_t is_tiled_3d_endian_scale; + // Base offset in bytes, resolution-scaled. + uint32_t guest_offset; + // For tiled textures - row pitch in blocks, aligned to 32, unscaled. + // For linear textures - row pitch in bytes. + uint32_t guest_pitch_aligned; + // For 3D textures only (ignored otherwise) - aligned to 32, unscaled. + uint32_t guest_z_stride_block_rows_aligned; + + // - std140 vector boundary - + + // If this is a packed mip tail, this is aligned to tile dimensions. + // Resolution-scaled. + uint32_t size_blocks[3]; + // Base offset in bytes. + uint32_t host_offset; + + // - std140 vector boundary - + + uint32_t host_pitch; + uint32_t height_texels; + }; + + static constexpr uint8_t kSwizzledSignsUnsigned = + uint8_t(xenos::TextureSign::kUnsigned) * uint8_t(0b01010101); + + struct TextureBinding { + TextureKey key; + // Destination swizzle merged with guest to host format swizzle. + uint32_t host_swizzle; + // Packed TextureSign values, 2 bit per each component, with guest-side + // destination swizzle from the fetch constant applied to them. + uint8_t swizzled_signs; + // Unsigned version of the texture (or signed if they have the same data). + Texture* texture; + // Signed version of the texture if the data in the signed version is + // different on the host. 
+ Texture* texture_signed; + + TextureBinding() { Reset(); } + + void Reset() { + std::memset(this, 0, sizeof(*this)); + host_swizzle = xenos::XE_GPU_TEXTURE_SWIZZLE_0000; + swizzled_signs = kSwizzledSignsUnsigned; + } + }; + + TextureCache(const RegisterFile& register_file, SharedMemory& shared_memory, + uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y); + + const RegisterFile& register_file() const { return register_file_; } + SharedMemory& shared_memory() const { return shared_memory_; } + + // May be called for purposes like clearing the cache, as well as in the + // destructor of the implementation if textures, for instance, have references + // to the implementation that are used in their destructor, and will become + // invalid if the implementation is destroyed before the texture. + void DestroyAllTextures(bool from_destructor = false); + + // Whether the signed version of the texture has a different representation on + // the host than its unsigned version (for example, if it's a fixed-point + // texture emulated with a larger host pixel format). + virtual bool IsSignedVersionSeparateForFormat(TextureKey key) const { + return false; + } + // Parameters like whether the texture is tiled and its dimensions are checked + // externally, the implementation should take only format-related parameters + // such as the format itself and the signedness into account. + virtual bool IsScaledResolveSupportedForFormat(TextureKey key) const { + return false; + } + // For formats with less than 4 components, implementations normally should + // replicate the last component into the non-existent ones, similar to what is + // done for unused components of operands in shaders by Microsoft's Xbox 360 + // shader compiler (.xxxx, .xyyy, .xyzz, .xyzw). + // For DXT3A and DXT5A, RRRR swizzle is specified in: + // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf + // 4D5307E6 also expects replicated components in k_8 sprites. 
+ // DXN is read as RG in 4D5307E6, but as RA in 415607E6. + // TODO(Triang3l): Find out the correct contents of unused texture components. + virtual uint32_t GetHostFormatSwizzle(TextureKey key) const = 0; + + virtual uint32_t GetMaxHostTextureWidthHeight( + xenos::DataDimension dimension) const = 0; + virtual uint32_t GetMaxHostTextureDepthOrArraySize( + xenos::DataDimension dimension) const = 0; + + // The texture must be created exactly with this key (if the implementation + // supports the texture with this key, otherwise, or in case of a runtime + // failure, it should return nullptr), modifying it is not allowed. + virtual std::unique_ptr CreateTexture(TextureKey key) = 0; + + // Returns nullptr not only if the key is not supported, but also if couldn't + // create the texture - if it's nullptr, occasionally a recreation attempt + // should be made. + Texture* FindOrCreateTexture(TextureKey key); + + bool LoadTextureData(Texture& texture); + // Writes the texture data (for base, mips or both - but not neither) from the + // shared memory or the scaled resolve memory. The shared memory management is + // done outside this function, the implementation just needs to load the data + // into the texture object. + virtual bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, + bool load_base, + bool load_mips) = 0; + + // Converts a texture fetch constant to a texture key, normalizing and + // validating the values, or creating an invalid key, and also gets the + // post-guest-swizzle signedness. + static void BindingInfoFromFetchConstant( + const xenos::xe_gpu_texture_fetch_t& fetch, TextureKey& key_out, + uint8_t* swizzled_signs_out); + + // Makes all texture bindings invalid. Also requesting textures after calling + // this will cause another attempt to create a texture or to untile it if + // there was an error. 
+ void ResetTextureBindings(bool from_destructor = false); + + const TextureBinding* GetValidTextureBinding( + uint32_t fetch_constant_index) const { + const TextureBinding& binding = texture_bindings_[fetch_constant_index]; + return binding.key.is_valid ? &binding : nullptr; + } + // Called when something in a texture binding is changed for the + // implementation to update the internal dependencies of the binding. + virtual void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) {} + + private: + void UpdateTexturesTotalHostMemoryUsage(uint64_t add, uint64_t subtract); + + // Shared memory callback for texture data invalidation. + static void WatchCallback( + const std::unique_lock& global_lock, void* context, + void* data, uint64_t argument, bool invalidated_by_gpu); + + // Checks if there are any pages that contain scaled resolve data within the + // range. + bool IsRangeScaledResolved(uint32_t start_unscaled, uint32_t length_unscaled); + // Global shared memory invalidation callback for invalidating scaled resolved + // texture data. + static void ScaledResolveGlobalWatchCallbackThunk( + const std::unique_lock& global_lock, void* context, + uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu); + void ScaledResolveGlobalWatchCallback( + const std::unique_lock& global_lock, + uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu); + + const RegisterFile& register_file_; + SharedMemory& shared_memory_; + uint32_t draw_resolution_scale_x_; + uint32_t draw_resolution_scale_y_; + + xe::global_critical_region global_critical_region_; + // Bit vector storing whether each 4 KB physical memory page contains scaled + // resolve data. uint32_t rather than uint64_t because parts of it can be sent + // to shaders. + std::unique_ptr scaled_resolve_pages_; + // Second level of the bit vector for faster rejection of non-scaled textures. + // >> 12 for 4 KB pages, >> 5 for uint32_t level 1 bits, >> 6 for uint64_t + // level 2 bits. 
+ uint64_t scaled_resolve_pages_l2_[SharedMemory::kBufferSize >> (12 + 5 + 6)]; + + // Global watch for scaled resolve data invalidation. + SharedMemory::GlobalWatchHandle scaled_resolve_global_watch_handle_ = nullptr; + + uint64_t current_submission_index_ = 0; + uint64_t current_submission_time_ = 0; + + std::unordered_map<TextureKey, std::unique_ptr<Texture>, TextureKey::Hasher> + textures_; + + uint64_t textures_total_host_memory_usage_ = 0; + + Texture* texture_used_first_ = nullptr; + Texture* texture_used_last_ = nullptr; + + // Whether a texture has become outdated (a memory watch has been triggered), + // so need to recheck if textures aren't outdated, disregarding whether fetch + // constants have been changed. + std::atomic<bool> texture_became_outdated_{false}; + + std::array<TextureBinding, xenos::kTextureFetchConstantCount> + texture_bindings_; + // Bit vector with bits reset on fetch constant writes to avoid parsing fetch + // constants again and again. + uint32_t texture_bindings_in_sync_ = 0; +}; + +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_TEXTURE_CACHE_H_ diff --git a/src/xenia/gpu/texture_util.cc b/src/xenia/gpu/texture_util.cc index 03a912b81..849aac8e1 100644 --- a/src/xenia/gpu/texture_util.cc +++ b/src/xenia/gpu/texture_util.cc @@ -533,6 +533,9 @@ uint8_t SwizzleSigns(const xenos::xe_gpu_texture_fetch_t& fetch) { // If only constant components, choose according to the original format // (what would more likely be loaded if there were non-constant components). // If all components would be signed, use signed. + // Textures with only constant components must still be bound to shaders for + // various queries (such as filtering weights) not involving the color data + // itself. 
if (((fetch.dword_0 >> 2) & 0b11111111) == uint32_t(xenos::TextureSign::kSigned) * 0b01010101) { constants_sign = xenos::TextureSign::kSigned; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 58e27d997..74c7f3fe6 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -199,8 +199,10 @@ bool VulkanCommandProcessor::SetupContext() { return false; } + // TODO(Triang3l): Get the actual draw resolution scale when the texture cache + // supports resolution scaling. render_target_cache_ = std::make_unique<VulkanRenderTargetCache>( - *register_file_, *memory_, &trace_writer_, *this); + *register_file_, *memory_, &trace_writer_, 1, 1, *this); if (!render_target_cache_->Initialize()) { XELOGE("Failed to initialize the render target cache"); return false; @@ -2199,8 +2201,8 @@ void VulkanCommandProcessor::UpdateDynamicState( // more likely. depth_bias_slope_factor *= xenos::kPolygonOffsetScaleSubpixelUnit * - float(std::max(render_target_cache_->GetResolutionScaleX(), - render_target_cache_->GetResolutionScaleY())); + float(std::max(render_target_cache_->draw_resolution_scale_x(), + render_target_cache_->draw_resolution_scale_y())); // std::memcmp instead of != so in case of NaN, every draw won't be // invalidating it. 
dynamic_depth_bias_update_needed_ |= diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc index db3c83aee..d979c5748 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc @@ -27,6 +27,7 @@ #include "xenia/gpu/draw_util.h" #include "xenia/gpu/registers.h" #include "xenia/gpu/spirv_shader_translator.h" +#include "xenia/gpu/texture_cache.h" #include "xenia/gpu/vulkan/deferred_command_buffer.h" #include "xenia/gpu/vulkan/vulkan_command_processor.h" #include "xenia/gpu/xenos.h" @@ -115,8 +116,10 @@ const VulkanRenderTargetCache::TransferModeInfo VulkanRenderTargetCache::VulkanRenderTargetCache( const RegisterFile& register_file, const Memory& memory, - TraceWriter* trace_writer, VulkanCommandProcessor& command_processor) - : RenderTargetCache(register_file, memory, trace_writer), + TraceWriter* trace_writer, uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y, VulkanCommandProcessor& command_processor) + : RenderTargetCache(register_file, memory, trace_writer, + draw_resolution_scale_x, draw_resolution_scale_y), command_processor_(command_processor) {} VulkanRenderTargetCache::~VulkanRenderTargetCache() { Shutdown(true); } @@ -201,8 +204,8 @@ bool VulkanRenderTargetCache::Initialize() { // maxStorageBufferRange. 
if (!ui::vulkan::util::CreateDedicatedAllocationBuffer( provider, - VkDeviceSize(xenos::kEdramSizeBytes * resolution_scale_x_ * - resolution_scale_y_), + VkDeviceSize(xenos::kEdramSizeBytes * + (draw_resolution_scale_x() * draw_resolution_scale_y())), VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, ui::vulkan::util::MemoryPurpose::kDeviceLocal, edram_buffer_, @@ -972,10 +975,10 @@ RenderTargetCache::RenderTarget* VulkanRenderTargetCache::CreateRenderTarget( image_create_info.pNext = nullptr; image_create_info.flags = 0; image_create_info.imageType = VK_IMAGE_TYPE_2D; - image_create_info.extent.width = key.GetWidth() * resolution_scale_x_; + image_create_info.extent.width = key.GetWidth() * draw_resolution_scale_x(); image_create_info.extent.height = GetRenderTargetHeight(key.pitch_tiles_at_32bpp, key.msaa_samples) * - resolution_scale_y_; + draw_resolution_scale_y(); image_create_info.extent.depth = 1; image_create_info.mipLevels = 1; image_create_info.arrayLayers = 1; @@ -1752,9 +1755,9 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( // be done at texture fetch. uint32_t tile_width_samples_scaled = - xenos::kEdramTileWidthSamples * resolution_scale_x_; + xenos::kEdramTileWidthSamples * draw_resolution_scale_x(); uint32_t tile_height_samples_scaled = - xenos::kEdramTileHeightSamples * resolution_scale_y_; + xenos::kEdramTileHeightSamples * draw_resolution_scale_y(); // Convert the fragment coordinates to uint2. uint_vector_temp.clear(); @@ -1788,7 +1791,7 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X); uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_shift; draw_util::GetEdramTileWidthDivideScaleAndUpperShift( - resolution_scale_x_, dest_tile_width_divide_scale, + draw_resolution_scale_x(), dest_tile_width_divide_scale, dest_tile_width_divide_shift); // Doing 16*16=32 multiplication, not 32*32=64. 
// TODO(Triang3l): Abstract this away, don't do 32*32 on Direct3D 12 too. @@ -1808,7 +1811,11 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( builder.makeUintConstant(tile_width_samples_scaled >> dest_sample_width_log2))); spv::Id dest_tile_index_y, dest_tile_pixel_y; - if (resolution_scale_y_ == 3) { + static_assert( + TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, + "VulkanRenderTargetCache EDRAM range ownership transfer shader " + "generation supports Y draw resolution scaling factors of only up to 3"); + if (draw_resolution_scale_y() == 3) { dest_tile_index_y = builder.createBinOp( spv::OpShiftRightLogical, type_uint, builder.createBinOp( @@ -1823,9 +1830,9 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( builder.makeUintConstant(tile_height_samples_scaled >> dest_sample_height_log2))); } else { - assert_true(resolution_scale_y_ <= 2); + assert_true(draw_resolution_scale_y() <= 2); uint32_t dest_tile_height_pixels_log2 = - (resolution_scale_y_ == 2 ? 5 : 4) - dest_sample_height_log2; + (draw_resolution_scale_y() == 2 ? 5 : 4) - dest_sample_height_log2; dest_tile_index_y = builder.createBinOp( spv::OpShiftRightLogical, type_uint, dest_pixel_y, builder.makeUintConstant(dest_tile_height_pixels_log2)); @@ -3967,13 +3974,13 @@ void VulkanRenderTargetCache::PerformTransfersAndResolveClears( // Assuming the rectangle is already clamped by the setup function from the // common render target cache. 
resolve_clear_rect.rect.offset.x = - int32_t(resolve_clear_rectangle->x_pixels * resolution_scale_x_); + int32_t(resolve_clear_rectangle->x_pixels * draw_resolution_scale_x()); resolve_clear_rect.rect.offset.y = - int32_t(resolve_clear_rectangle->y_pixels * resolution_scale_y_); + int32_t(resolve_clear_rectangle->y_pixels * draw_resolution_scale_y()); resolve_clear_rect.rect.extent.width = - resolve_clear_rectangle->width_pixels * resolution_scale_x_; + resolve_clear_rectangle->width_pixels * draw_resolution_scale_x(); resolve_clear_rect.rect.extent.height = - resolve_clear_rectangle->height_pixels * resolution_scale_y_; + resolve_clear_rectangle->height_pixels * draw_resolution_scale_y(); resolve_clear_rect.baseArrayLayer = 0; resolve_clear_rect.layerCount = 1; } @@ -4437,14 +4444,16 @@ void VulkanRenderTargetCache::PerformTransfersAndResolveClears( ++j) { const Transfer::Rectangle& stencil_clear_rectangle = transfer_stencil_clear_rectangles[j]; - stencil_clear_rect_write_ptr->rect.offset.x = - int32_t(stencil_clear_rectangle.x_pixels * resolution_scale_x_); - stencil_clear_rect_write_ptr->rect.offset.y = - int32_t(stencil_clear_rectangle.y_pixels * resolution_scale_y_); + stencil_clear_rect_write_ptr->rect.offset.x = int32_t( + stencil_clear_rectangle.x_pixels * draw_resolution_scale_x()); + stencil_clear_rect_write_ptr->rect.offset.y = int32_t( + stencil_clear_rectangle.y_pixels * draw_resolution_scale_y()); stencil_clear_rect_write_ptr->rect.extent.width = - stencil_clear_rectangle.width_pixels * resolution_scale_x_; + stencil_clear_rectangle.width_pixels * + draw_resolution_scale_x(); stencil_clear_rect_write_ptr->rect.extent.height = - stencil_clear_rectangle.height_pixels * resolution_scale_y_; + stencil_clear_rectangle.height_pixels * + draw_resolution_scale_y(); stencil_clear_rect_write_ptr->baseArrayLayer = 0; stencil_clear_rect_write_ptr->layerCount = 1; ++stencil_clear_rect_write_ptr; diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h 
b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h index 67eb85300..acd8f500d 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.h +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.h @@ -87,6 +87,8 @@ class VulkanRenderTargetCache final : public RenderTargetCache { VulkanRenderTargetCache(const RegisterFile& register_file, const Memory& memory, TraceWriter* trace_writer, + uint32_t draw_resolution_scale_x, + uint32_t draw_resolution_scale_y, VulkanCommandProcessor& command_processor); ~VulkanRenderTargetCache(); @@ -100,9 +102,6 @@ class VulkanRenderTargetCache final : public RenderTargetCache { // TODO(Triang3l): Fragment shader interlock. Path GetPath() const override { return Path::kHostRenderTargets; } - uint32_t GetResolutionScaleX() const override { return resolution_scale_x_; } - uint32_t GetResolutionScaleY() const override { return resolution_scale_y_; } - bool Update(bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control, uint32_t normalized_color_mask, @@ -206,9 +205,6 @@ class VulkanRenderTargetCache final : public RenderTargetCache { VulkanCommandProcessor& command_processor_; - uint32_t resolution_scale_x_ = 1; - uint32_t resolution_scale_y_ = 1; - // Accessible in fragment and compute shaders. 
VkDescriptorSetLayout descriptor_set_layout_storage_buffer_ = VK_NULL_HANDLE; VkDescriptorSetLayout descriptor_set_layout_sampled_image_ = VK_NULL_HANDLE; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 81f889ea2..35e2a89cf 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -905,29 +905,29 @@ constexpr bool IsSingleCopySampleSelected(CopySampleSelect copy_sample_select) { copy_sample_select <= CopySampleSelect::k3; } -#define XE_GPU_MAKE_SWIZZLE(x, y, z, w) \ - (((XE_GPU_SWIZZLE_##x) << 0) | ((XE_GPU_SWIZZLE_##y) << 3) | \ - ((XE_GPU_SWIZZLE_##z) << 6) | ((XE_GPU_SWIZZLE_##w) << 9)) +#define XE_GPU_MAKE_TEXTURE_SWIZZLE(x, y, z, w) \ + (((xe::gpu::xenos::XE_GPU_TEXTURE_SWIZZLE_##x) << 0) | \ + ((xe::gpu::xenos::XE_GPU_TEXTURE_SWIZZLE_##y) << 3) | \ + ((xe::gpu::xenos::XE_GPU_TEXTURE_SWIZZLE_##z) << 6) | \ + ((xe::gpu::xenos::XE_GPU_TEXTURE_SWIZZLE_##w) << 9)) typedef enum { - XE_GPU_SWIZZLE_X = 0, - XE_GPU_SWIZZLE_R = 0, - XE_GPU_SWIZZLE_Y = 1, - XE_GPU_SWIZZLE_G = 1, - XE_GPU_SWIZZLE_Z = 2, - XE_GPU_SWIZZLE_B = 2, - XE_GPU_SWIZZLE_W = 3, - XE_GPU_SWIZZLE_A = 3, - XE_GPU_SWIZZLE_0 = 4, - XE_GPU_SWIZZLE_1 = 5, - XE_GPU_SWIZZLE_RGBA = XE_GPU_MAKE_SWIZZLE(R, G, B, A), - XE_GPU_SWIZZLE_BGRA = XE_GPU_MAKE_SWIZZLE(B, G, R, A), - XE_GPU_SWIZZLE_RGB1 = XE_GPU_MAKE_SWIZZLE(R, G, B, 1), - XE_GPU_SWIZZLE_BGR1 = XE_GPU_MAKE_SWIZZLE(B, G, R, 1), - XE_GPU_SWIZZLE_000R = XE_GPU_MAKE_SWIZZLE(0, 0, 0, R), - XE_GPU_SWIZZLE_RRR1 = XE_GPU_MAKE_SWIZZLE(R, R, R, 1), - XE_GPU_SWIZZLE_R111 = XE_GPU_MAKE_SWIZZLE(R, 1, 1, 1), - XE_GPU_SWIZZLE_R000 = XE_GPU_MAKE_SWIZZLE(R, 0, 0, 0), -} XE_GPU_SWIZZLE; + XE_GPU_TEXTURE_SWIZZLE_X = 0, + XE_GPU_TEXTURE_SWIZZLE_R = 0, + XE_GPU_TEXTURE_SWIZZLE_Y = 1, + XE_GPU_TEXTURE_SWIZZLE_G = 1, + XE_GPU_TEXTURE_SWIZZLE_Z = 2, + XE_GPU_TEXTURE_SWIZZLE_B = 2, + XE_GPU_TEXTURE_SWIZZLE_W = 3, + XE_GPU_TEXTURE_SWIZZLE_A = 3, + XE_GPU_TEXTURE_SWIZZLE_0 = 4, + XE_GPU_TEXTURE_SWIZZLE_1 = 5, + XE_GPU_TEXTURE_SWIZZLE_RRRR = 
XE_GPU_MAKE_TEXTURE_SWIZZLE(R, R, R, R), + XE_GPU_TEXTURE_SWIZZLE_RGGG = XE_GPU_MAKE_TEXTURE_SWIZZLE(R, G, G, G), + XE_GPU_TEXTURE_SWIZZLE_RGBB = XE_GPU_MAKE_TEXTURE_SWIZZLE(R, G, B, B), + XE_GPU_TEXTURE_SWIZZLE_RGBA = XE_GPU_MAKE_TEXTURE_SWIZZLE(R, G, B, A), + XE_GPU_TEXTURE_SWIZZLE_BGRA = XE_GPU_MAKE_TEXTURE_SWIZZLE(B, G, R, A), + XE_GPU_TEXTURE_SWIZZLE_0000 = XE_GPU_MAKE_TEXTURE_SWIZZLE(0, 0, 0, 0), +} XE_GPU_TEXTURE_SWIZZLE; inline uint16_t GpuSwap(uint16_t value, Endian endianness) { switch (endianness) { @@ -999,6 +999,9 @@ enum class FetchConstantType : uint32_t { kVertex, }; +constexpr uint32_t kTextureFetchConstantCount = 32; +constexpr uint32_t kVertexFetchConstantCount = 3 * kTextureFetchConstantCount; + // XE_GPU_REG_SHADER_CONSTANT_FETCH_* union alignas(uint32_t) xe_gpu_vertex_fetch_t { struct { @@ -1128,7 +1131,7 @@ union alignas(uint32_t) xe_gpu_texture_fetch_t { }; uint32_t num_format : 1; // +0 dword_3 frac/int - // xyzw, 3b each (XE_GPU_SWIZZLE) + // xyzw, 3b each (XE_GPU_TEXTURE_SWIZZLE) uint32_t swizzle : 12; // +1 int32_t exp_adjust : 6; // +13 TextureFilter mag_filter : 2; // +19