diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 9c0e4f911..30b7f2cd7 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -32,12 +32,17 @@ DEFINE_bool(d3d12_edram_rov, true, // disable half-pixel offset by setting this to false. DEFINE_bool(d3d12_half_pixel_offset, true, "Enable half-pixel vertex and VPOS offset."); -DEFINE_bool(d3d12_memexport_readback, false, +DEFINE_bool(d3d12_readback_memexport, false, "Read data written by memory export in shaders on the CPU. This " "may be needed in some games (but many only access exported data " "on the GPU, and this flag isn't needed to handle such behavior), " "but causes mid-frame synchronization, so it has a huge " "performance impact."); +DEFINE_bool(d3d12_readback_resolve, false, + "Read render-to-texture results on the CPU. This may be needed in " + "some games, for instance, for screenshots in saved games, but " + "causes mid-frame synchronization, so it has a huge performance " + "impact."); DEFINE_bool(d3d12_ssaa_custom_sample_positions, false, "Enable custom SSAA sample positions for the RTV/DSV rendering " "path where available instead of centers (experimental, not very " @@ -1606,7 +1611,7 @@ bool D3D12CommandProcessor::IssueDraw(PrimitiveType primitive_type, memexport_range.base_address_dwords << 2, memexport_range.size_dwords << 2); } - if (FLAGS_d3d12_memexport_readback) { + if (FLAGS_d3d12_readback_memexport) { // Read the exported data on the CPU. uint32_t memexport_total_size = 0; for (uint32_t i = 0; i < memexport_range_count; ++i) { @@ -1661,8 +1666,39 @@ bool D3D12CommandProcessor::IssueCopy() { SCOPE_profile_cpu_f("gpu"); #endif // FINE_GRAINED_DRAW_SCOPES BeginFrame(); - return render_target_cache_->Resolve(shared_memory_.get(), - texture_cache_.get(), memory_); + uint32_t written_address, written_length; + if (!render_target_cache_->Resolve(shared_memory_.get(), texture_cache_.get(), + memory_, written_address, + written_length)) { + return false; + } + if (FLAGS_d3d12_readback_resolve && !texture_cache_->IsResolutionScale2X() && + written_length) { + // Read the resolved data on the CPU. + ID3D12Resource* readback_buffer = RequestReadbackBuffer(written_length); + if (readback_buffer != nullptr) { + shared_memory_->UseAsCopySource(); + SubmitBarriers(); + ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); + deferred_command_list_->D3DCopyBufferRegion( + readback_buffer, 0, shared_memory_buffer, written_address, + written_length); + EndFrame(); + GetD3D12Context()->AwaitAllFramesCompletion(); + D3D12_RANGE readback_range; + readback_range.Begin = 0; + readback_range.End = written_length; + void* readback_mapping; + if (SUCCEEDED( + readback_buffer->Map(0, &readback_range, &readback_mapping))) { + std::memcpy(memory_->TranslatePhysical(written_address), + readback_mapping, written_length); + D3D12_RANGE readback_write_range = {}; + readback_buffer->Unmap(0, &readback_write_range); + } + } + } + return true; } bool D3D12CommandProcessor::BeginFrame() { diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index a3c0bb1e6..12081e84a 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -988,7 +988,11 @@ bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) { } bool RenderTargetCache::Resolve(SharedMemory* shared_memory, - TextureCache* texture_cache, Memory* memory) { + TextureCache* texture_cache, Memory* memory, + uint32_t& written_address_out, + uint32_t& written_length_out) { + written_address_out = written_length_out = 0; + if (!command_processor_->IsROVUsedForEDRAM()) { // Save the currently bound render targets to the EDRAM buffer that will be // used as the resolve source and clear bindings to allow render target @@ -1152,9 +1156,10 @@ bool RenderTargetCache::Resolve(SharedMemory* shared_memory, // GetEDRAMLayout in ResolveCopy and ResolveClear will perform the needed // clamping to the source render target size. - bool result = ResolveCopy(shared_memory, texture_cache, surface_edram_base, - surface_pitch, msaa_samples, surface_is_depth, - surface_format, rect); + bool result = + ResolveCopy(shared_memory, texture_cache, surface_edram_base, + surface_pitch, msaa_samples, surface_is_depth, surface_format, + rect, written_address_out, written_length_out); // Clear the color RT if needed. if (!surface_is_depth) { result &= ResolveClear(surface_edram_base, surface_pitch, msaa_samples, @@ -1170,8 +1175,11 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, TextureCache* texture_cache, uint32_t edram_base, uint32_t surface_pitch, MsaaSamples msaa_samples, bool is_depth, - uint32_t src_format, - const D3D12_RECT& rect) { + uint32_t src_format, const D3D12_RECT& rect, + uint32_t& written_address_out, + uint32_t& written_length_out) { + written_address_out = written_length_out = 0; + auto& regs = *register_file_; uint32_t rb_copy_control = regs[XE_GPU_REG_RB_COPY_CONTROL].u32; @@ -1475,6 +1483,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, // Invalidate textures and mark the range as scaled if needed. texture_cache->MarkRangeAsResolved(dest_modified_start, dest_modified_length); + written_address_out = dest_modified_start; + written_length_out = dest_modified_length; } else { // ************************************************************************* // Conversion and AA resolving @@ -1788,7 +1798,7 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, dest_format, dest_address, dest_pitch, dest_height, dest_3d, uint32_t(rect.left) & 31, uint32_t(rect.top) & 31, dest_z, copy_width, copy_height, dest_endian, copy_buffer, resolve_target->copy_buffer_size, - resolve_target->footprint); + resolve_target->footprint, &written_address_out, &written_length_out); // Done with the copy buffer. diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index c01e65a25..dd1d4999f 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -269,7 +269,8 @@ class RenderTargetCache { // register values, and also clears the EDRAM buffer if needed. Must be in a // frame for calling. bool Resolve(SharedMemory* shared_memory, TextureCache* texture_cache, - Memory* memory); + Memory* memory, uint32_t& written_address_out, + uint32_t& written_length_out); // Flushes the render targets to EDRAM and unbinds them, for instance, when // the command processor takes over framebuffer bindings to draw something // special. @@ -481,7 +482,8 @@ class RenderTargetCache { bool ResolveCopy(SharedMemory* shared_memory, TextureCache* texture_cache, uint32_t edram_base, uint32_t surface_pitch, MsaaSamples msaa_samples, bool is_depth, uint32_t src_format, - const D3D12_RECT& rect); + const D3D12_RECT& rect, uint32_t& written_address_out, + uint32_t& written_length_out); // Performs the clearing part of a resolve. bool ResolveClear(uint32_t edram_base, uint32_t surface_pitch, MsaaSamples msaa_samples, bool is_depth, uint32_t format, diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 564456765..9690abc05 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -1291,7 +1291,15 @@ bool TextureCache::TileResolvedTexture( uint32_t texture_height, bool is_3d, uint32_t offset_x, uint32_t offset_y, uint32_t offset_z, uint32_t resolve_width, uint32_t resolve_height, Endian128 endian, ID3D12Resource* buffer, uint32_t buffer_size, - const D3D12_PLACED_SUBRESOURCE_FOOTPRINT& footprint) { + const D3D12_PLACED_SUBRESOURCE_FOOTPRINT& footprint, + uint32_t* written_address_out, uint32_t* written_length_out) { + if (written_address_out) { + *written_address_out = 0; + } + if (written_length_out) { + *written_length_out = 0; + } + ResolveTileMode resolve_tile_mode = host_formats_[uint32_t(format)].resolve_tile_mode; if (resolve_tile_mode == ResolveTileMode::kUnknown) { @@ -1456,6 +1464,12 @@ bool TextureCache::TileResolvedTexture( // Invalidate textures and mark the range as scaled if needed. MarkRangeAsResolved(texture_modified_start, texture_modified_length); + if (written_address_out) { + *written_address_out = texture_modified_start; + } + if (written_length_out) { + *written_length_out = texture_modified_length; + } return true; } diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 65024d80c..244fceff1 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -137,7 +137,9 @@ class TextureCache { uint32_t offset_z, uint32_t resolve_width, uint32_t resolve_height, Endian128 endian, ID3D12Resource* buffer, uint32_t buffer_size, - const D3D12_PLACED_SUBRESOURCE_FOOTPRINT& footprint); + const D3D12_PLACED_SUBRESOURCE_FOOTPRINT& footprint, + uint32_t* written_address_out, + uint32_t* written_length_out); inline bool IsResolutionScale2X() const { return scaled_resolve_buffer_ != nullptr;