diff --git a/src/xenia/base/filesystem_posix.cc b/src/xenia/base/filesystem_posix.cc index 2e9ddb2c5..193e637ea 100644 --- a/src/xenia/base/filesystem_posix.cc +++ b/src/xenia/base/filesystem_posix.cc @@ -217,6 +217,10 @@ std::vector ListFiles(const std::filesystem::path& path) { } while (auto ent = readdir(dir)) { + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + continue; + } + FileInfo info; info.name = ent->d_name; @@ -225,6 +229,7 @@ std::vector ListFiles(const std::filesystem::path& path) { info.create_timestamp = convertUnixtimeToWinFiletime(st.st_ctime); info.access_timestamp = convertUnixtimeToWinFiletime(st.st_atime); info.write_timestamp = convertUnixtimeToWinFiletime(st.st_mtime); + info.path = path; if (ent->d_type == DT_DIR) { info.type = FileInfo::Type::kDirectory; info.total_size = 0; @@ -234,7 +239,7 @@ std::vector ListFiles(const std::filesystem::path& path) { } result.push_back(info); } - + closedir(dir); return result; } diff --git a/src/xenia/base/utf8.cc b/src/xenia/base/utf8.cc index 65f798f54..a96d6b194 100644 --- a/src/xenia/base/utf8.cc +++ b/src/xenia/base/utf8.cc @@ -10,6 +10,7 @@ #include "xenia/base/utf8.h" #include +#include #include #include #include diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index b9cb88869..791b9a87d 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -481,6 +481,43 @@ struct VECTOR_COMPARE_UGT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW | + kX64EmitAVX512DQ) && + (i.instr->flags != FLOAT32_TYPE)) { + Xmm src1 = e.xmm0; + if (i.src1.is_constant) { + e.LoadConstantXmm(src1, i.src1.constant()); + } else { + src1 = i.src1; + } + + Xmm src2 = e.xmm1; + if (i.src2.is_constant) { + e.LoadConstantXmm(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + + switch (i.instr->flags) { + 
case INT8_TYPE: + e.vpcmpub(e.k1, src1, src2, 0x6); + e.vpmovm2b(i.dest, e.k1); + break; + case INT16_TYPE: + e.vpcmpuw(e.k1, src1, src2, 0x6); + e.vpmovm2w(i.dest, e.k1); + break; + case INT32_TYPE: + e.vpcmpud(e.k1, src1, src2, 0x6); + e.vpmovm2d(i.dest, e.k1); + break; + default: + assert_always(); + break; + } + return; + } + Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy switch (i.instr->flags) { case INT8_TYPE: diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 4c1640302..bc4e91287 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -646,8 +646,9 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { break; case OPCODE_AND_NOT: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { - v->set_from(i->src1.value); - v->AndNot(i->src2.value); + v->set_from(i->src2.value); + v->Not(); + v->And(i->src1.value); i->UnlinkAndNOP(); result = true; } diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 0ea2fb4ad..15db03282 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -324,8 +324,13 @@ int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + Value* sum = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE, + ARITHMETIC_UNSIGNED); + Value* overflow = f.VectorCompareUGT(f.LoadVR(i.VX.VA), sum, INT32_TYPE); + Value* carry = + f.VectorShr(overflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE); + f.StoreVR(i.VX.VD, carry); + return 0; } int InstrEmit_vaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) { @@ -1665,7 +1670,11 @@ int InstrEmit_vsrw128(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_vsubcuw(PPCHIRBuilder& f, const InstrData& 
i) { - XEINSTRNOTIMPLEMENTED(); + Value* underflow = + f.VectorCompareUGE(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE); + Value* borrow = + f.VectorShr(underflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE); + f.StoreVR(i.VX.VD, borrow); return 1; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 28c1b214d..3da3bfbde 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2574,7 +2574,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); + + const bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0; // Pixel shader analysis. bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); @@ -2604,7 +2605,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } const bool memexport_used_pixel = - pixel_shader && pixel_shader->is_valid_memexport_used(); + pixel_shader && (pixel_shader->memexport_eM_written() != 0); const bool memexport_used = memexport_used_vertex || memexport_used_pixel; if (!BeginSubmission(true)) { @@ -2831,12 +2832,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Gather memexport ranges and ensure the heaps for them are resident, and // also load the data surrounding the export and to fill the regions that // won't be modified by the shaders. 
- - memexport_range_count_ = 0; - if (memexport_used_vertex || memexport_used_pixel) { - bool retflag; - bool retval = GatherMemexportRangesAndMakeResident(retflag); - if (retflag) return retval; + memexport_ranges_.clear(); + if (memexport_used_vertex) { + draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_); + } + if (memexport_used_pixel) { + draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_); + } + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { + if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, + memexport_range.size_bytes)) { + XELOGE( + "Failed to request memexport stream at 0x{:08X} (size {}) in the " + "shared memory", + memexport_range.base_address_dwords << 2, memexport_range.size_bytes); + return false; + } } // Primitive topology. D3D_PRIMITIVE_TOPOLOGY primitive_topology; @@ -2935,11 +2946,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // If the shared memory is a UAV, it can't be used as an index buffer // (UAV is a read/write state, index buffer is a read-only state). // Need to copy the indices to a buffer in the index buffer state. 
- bool retflag; - bool retval = HandleMemexportGuestDMA( - scratch_index_buffer, index_buffer_view, - primitive_processing_result.guest_index_base, retflag); - if (retflag) return retval; + scratch_index_buffer = RequestScratchGPUBuffer( + index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST); + if (scratch_index_buffer == nullptr) { + return false; + } + shared_memory_->UseAsCopySource(); + SubmitBarriers(); + deferred_command_list_.D3DCopyBufferRegion( + scratch_index_buffer, 0, shared_memory_->GetBuffer(), + primitive_processing_result.guest_index_base, + index_buffer_view.SizeInBytes); + PushTransitionBarrier(scratch_index_buffer, + D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_INDEX_BUFFER); + index_buffer_view.BufferLocation = + scratch_index_buffer->GetGPUVirtualAddress(); } else { index_buffer_view.BufferLocation = shared_memory_->GetGPUAddress() + @@ -2977,199 +2999,66 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } if (memexport_used) { - HandleMemexportDrawOrdering_AndReadback(); - } - - return true; -} -XE_COLD -XE_NOINLINE -bool D3D12CommandProcessor::HandleMemexportGuestDMA( - ID3D12Resource*& scratch_index_buffer, - D3D12_INDEX_BUFFER_VIEW& index_buffer_view, uint32_t guest_index_base, - // xe::gpu::PrimitiveProcessor::ProcessingResult& - // primitive_processing_result, - bool& retflag) { - retflag = true; - scratch_index_buffer = RequestScratchGPUBuffer( - index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST); - if (scratch_index_buffer == nullptr) { - return false; - } - shared_memory_->UseAsCopySource(); - SubmitBarriers(); - deferred_command_list_.D3DCopyBufferRegion( - scratch_index_buffer, 0, shared_memory_->GetBuffer(), guest_index_base, - index_buffer_view.SizeInBytes); - PushTransitionBarrier(scratch_index_buffer, D3D12_RESOURCE_STATE_COPY_DEST, - D3D12_RESOURCE_STATE_INDEX_BUFFER); - index_buffer_view.BufferLocation = - scratch_index_buffer->GetGPUVirtualAddress(); - retflag = 
false; - return {}; -} -XE_NOINLINE -XE_COLD -bool D3D12CommandProcessor::GatherMemexportRangesAndMakeResident( - bool& retflag) { - auto vertex_shader = static_cast(active_vertex_shader()); - auto pixel_shader = static_cast(active_pixel_shader()); - const xe::gpu::RegisterFile& regs = *register_file_; - const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); - const bool memexport_used_pixel = - pixel_shader && pixel_shader->is_valid_memexport_used(); - retflag = true; - if (memexport_used_vertex) { - for (uint32_t constant_index : - vertex_shader->memexport_stream_constants()) { - const auto& memexport_stream = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4); - if (memexport_stream.index_count == 0) { - continue; - } - uint32_t memexport_format_size = - GetSupportedMemExportFormatSize(memexport_stream.format); - if (memexport_format_size == 0) { - XELOGE("Unsupported memexport format {}", - FormatInfo::GetName( - xenos::TextureFormat(uint32_t(memexport_stream.format)))); - return false; - } - uint32_t memexport_size_dwords = - memexport_stream.index_count * memexport_format_size; - // Try to reduce the number of shared memory operations when writing - // different elements into the same buffer through different exports - // (happens in 4D5307E6). - bool memexport_range_reused = false; - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - MemExportRange& memexport_range = memexport_ranges_[i]; - if (memexport_range.base_address_dwords == - memexport_stream.base_address) { - memexport_range.size_dwords = - std::max(memexport_range.size_dwords, memexport_size_dwords); - memexport_range_reused = true; - break; - } - } - // Add a new range if haven't expanded an existing one. 
- if (!memexport_range_reused) { - MemExportRange& memexport_range = - memexport_ranges_[memexport_range_count_++]; - memexport_range.base_address_dwords = memexport_stream.base_address; - memexport_range.size_dwords = memexport_size_dwords; - } + // Make sure this memexporting draw is ordered with other work using shared + // memory as a UAV. + // TODO(Triang3l): Find some PM4 command that can be used for indication of + // when memexports should be awaited? + shared_memory_->MarkUAVWritesCommitNeeded(); + // Invalidate textures in memexported memory and watch for changes. + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { + shared_memory_->RangeWrittenByGpu( + memexport_range.base_address_dwords << 2, memexport_range.size_bytes, + false); } - } - if (memexport_used_pixel) { - for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) { - const auto& memexport_stream = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4); - if (memexport_stream.index_count == 0) { - continue; + if (cvars::d3d12_readback_memexport) { + // Read the exported data on the CPU. 
+ uint32_t memexport_total_size = 0; + for (const draw_util::MemExportRange& memexport_range : + memexport_ranges_) { + memexport_total_size += memexport_range.size_bytes; } - uint32_t memexport_format_size = - GetSupportedMemExportFormatSize(memexport_stream.format); - if (memexport_format_size == 0) { - XELOGE("Unsupported memexport format {}", - FormatInfo::GetName( - xenos::TextureFormat(uint32_t(memexport_stream.format)))); - return false; - } - uint32_t memexport_size_dwords = - memexport_stream.index_count * memexport_format_size; - bool memexport_range_reused = false; - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - MemExportRange& memexport_range = memexport_ranges_[i]; - if (memexport_range.base_address_dwords == - memexport_stream.base_address) { - memexport_range.size_dwords = - std::max(memexport_range.size_dwords, memexport_size_dwords); - memexport_range_reused = true; - break; - } - } - if (!memexport_range_reused) { - MemExportRange& memexport_range = - memexport_ranges_[memexport_range_count_++]; - memexport_range.base_address_dwords = memexport_stream.base_address; - memexport_range.size_dwords = memexport_size_dwords; - } - } - } - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - const MemExportRange& memexport_range = memexport_ranges_[i]; - if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2)) { - XELOGE( - "Failed to request memexport stream at 0x{:08X} (size {}) in the " - "shared memory", - memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2); - return false; - } - } - retflag = false; - return {}; -} -XE_NOINLINE -XE_COLD -void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() { - // Make sure this memexporting draw is ordered with other work using shared - // memory as a UAV. - // TODO(Triang3l): Find some PM4 command that can be used for indication of - // when memexports should be awaited? 
- shared_memory_->MarkUAVWritesCommitNeeded(); - // Invalidate textures in memexported memory and watch for changes. - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - const MemExportRange& memexport_range = memexport_ranges_[i]; - shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2, - memexport_range.size_dwords << 2, false); - } - if (cvars::d3d12_readback_memexport) { - // Read the exported data on the CPU. - uint32_t memexport_total_size = 0; - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - memexport_total_size += memexport_ranges_[i].size_dwords << 2; - } - if (memexport_total_size != 0) { - ID3D12Resource* readback_buffer = - RequestReadbackBuffer(memexport_total_size); - if (readback_buffer != nullptr) { - shared_memory_->UseAsCopySource(); - SubmitBarriers(); - ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); - uint32_t readback_buffer_offset = 0; - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - const MemExportRange& memexport_range = memexport_ranges_[i]; - uint32_t memexport_range_size = memexport_range.size_dwords << 2; - deferred_command_list_.D3DCopyBufferRegion( - readback_buffer, readback_buffer_offset, shared_memory_buffer, - memexport_range.base_address_dwords << 2, memexport_range_size); - readback_buffer_offset += memexport_range_size; - } - if (AwaitAllQueueOperationsCompletion()) { - D3D12_RANGE readback_range; - readback_range.Begin = 0; - readback_range.End = memexport_total_size; - void* readback_mapping; - if (SUCCEEDED(readback_buffer->Map(0, &readback_range, - &readback_mapping))) { - const uint32_t* readback_dwords = - reinterpret_cast(readback_mapping); - for (uint32_t i = 0; i < memexport_range_count_; ++i) { - const MemExportRange& memexport_range = memexport_ranges_[i]; - std::memcpy(memory_->TranslatePhysical( - memexport_range.base_address_dwords << 2), - readback_dwords, memexport_range.size_dwords << 2); - readback_dwords += memexport_range.size_dwords; + if 
(memexport_total_size != 0) { + ID3D12Resource* readback_buffer = + RequestReadbackBuffer(memexport_total_size); + if (readback_buffer != nullptr) { + shared_memory_->UseAsCopySource(); + SubmitBarriers(); + ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer(); + uint32_t readback_buffer_offset = 0; + for (const draw_util::MemExportRange& memexport_range : + memexport_ranges_) { + uint32_t memexport_range_size = memexport_range.size_bytes; + deferred_command_list_.D3DCopyBufferRegion( + readback_buffer, readback_buffer_offset, shared_memory_buffer, + memexport_range.base_address_dwords << 2, memexport_range_size); + readback_buffer_offset += memexport_range_size; + } + if (AwaitAllQueueOperationsCompletion()) { + D3D12_RANGE readback_range; + readback_range.Begin = 0; + readback_range.End = memexport_total_size; + void* readback_mapping; + if (SUCCEEDED(readback_buffer->Map(0, &readback_range, + &readback_mapping))) { + const uint8_t* readback_bytes = + reinterpret_cast(readback_mapping); + for (const draw_util::MemExportRange& memexport_range : + memexport_ranges_) { + std::memcpy(memory_->TranslatePhysical( + memexport_range.base_address_dwords << 2), + readback_bytes, memexport_range.size_bytes); + readback_bytes += memexport_range.size_bytes; + } + D3D12_RANGE readback_write_range = {}; + readback_buffer->Unmap(0, &readback_write_range); } - D3D12_RANGE readback_write_range = {}; - readback_buffer->Unmap(0, &readback_write_range); } } } } } + + return true; } void D3D12CommandProcessor::InitializeTrace() { @@ -5208,36 +5097,6 @@ bool D3D12CommandProcessor::UpdateBindings_BindfulPath( return {}; } -uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize( - xenos::ColorFormat format) { - switch (format) { - case xenos::ColorFormat::k_8_8_8_8: - case xenos::ColorFormat::k_2_10_10_10: - // TODO(Triang3l): Investigate how k_8_8_8_8_A works - not supported in the - // texture cache currently. 
- // case xenos::ColorFormat::k_8_8_8_8_A: - case xenos::ColorFormat::k_10_11_11: - case xenos::ColorFormat::k_11_11_10: - case xenos::ColorFormat::k_16_16: - case xenos::ColorFormat::k_16_16_FLOAT: - case xenos::ColorFormat::k_32_FLOAT: - case xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16: - case xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16: - case xenos::ColorFormat::k_10_11_11_AS_16_16_16_16: - case xenos::ColorFormat::k_11_11_10_AS_16_16_16_16: - return 1; - case xenos::ColorFormat::k_16_16_16_16: - case xenos::ColorFormat::k_16_16_16_16_FLOAT: - case xenos::ColorFormat::k_32_32_FLOAT: - return 2; - case xenos::ColorFormat::k_32_32_32_32_FLOAT: - return 4; - default: - break; - } - return 0; -} - ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index c4dd454b3..46af23b99 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -18,6 +18,7 @@ #include #include #include +#include #include "xenia/base/assert.h" #include "xenia/gpu/command_processor.h" @@ -319,18 +320,7 @@ class D3D12CommandProcessor final : public CommandProcessor { bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count, IndexBufferInfo* index_buffer_info, bool major_mode_explicit) override; - XE_COLD - XE_NOINLINE - bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer, - D3D12_INDEX_BUFFER_VIEW& index_buffer_view, - uint32_t guest_index_base, - bool& retflag); - XE_NOINLINE - XE_COLD - bool GatherMemexportRangesAndMakeResident(bool& retflag); - XE_NOINLINE - XE_COLD - void HandleMemexportDrawOrdering_AndReadback(); + bool IssueCopy() override; XE_NOINLINE bool IssueCopy_ReadbackResolvePath(); @@ -502,13 +492,6 @@ class D3D12CommandProcessor final : public CommandProcessor { const size_t sampler_count_vertex, const size_t sampler_count_pixel, 
bool& retflag); - // Returns dword count for one element for a memexport format, or 0 if it's - // not supported by the D3D12 command processor (if it's smaller that 1 dword, - // for instance). - // TODO(Triang3l): Check if any game uses memexport with formats smaller than - // 32 bits per element. - static uint32_t GetSupportedMemExportFormatSize(xenos::ColorFormat format); - // Returns a buffer for reading GPU data back to the CPU. Assuming // synchronizing immediately after use. Always in COPY_DEST state. ID3D12Resource* RequestReadbackBuffer(uint32_t size); @@ -811,12 +794,13 @@ class D3D12CommandProcessor final : public CommandProcessor { draw_util::GetViewportInfoArgs previous_viewport_info_args_; draw_util::ViewportInfo previous_viewport_info_; - // scratch memexport data - MemExportRange memexport_ranges_[512]; - uint32_t memexport_range_count_ = 0; + std::atomic pix_capture_requested_ = false; bool pix_capturing_; + + // Temporary storage for memexport stream constants used in the draw. + std::vector memexport_ranges_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index e6461e8bd..802997580 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2022 Ben Vanik. All rights reserved. * + * Copyright 2023 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -134,7 +134,7 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, // // Memory export is an obvious intentional side effect. 
if (shader.kills_pixels() || shader.writes_depth() || - shader.is_valid_memexport_used() || + shader.memexport_eM_written() || (shader.writes_color_target(0) && DoesCoverageDependOnAlpha(regs.Get()))) { return true; @@ -765,8 +765,70 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs, } return normalized_color_mask; } + +void AddMemExportRanges(const RegisterFile& regs, const Shader& shader, + std::vector& ranges_out) { + if (!shader.memexport_eM_written()) { + // The shader has eA writes, but no real exports. + return; + } + uint32_t float_constants_base = shader.type() == xenos::ShaderType::kVertex + ? regs.Get().base + : regs.Get().base; + for (uint32_t constant_index : shader.memexport_stream_constants()) { + const auto& stream = regs.Get( + XE_GPU_REG_SHADER_CONSTANT_000_X + + (float_constants_base + constant_index) * 4); + if (!stream.index_count) { + continue; + } + const FormatInfo& format_info = + *FormatInfo::Get(xenos::TextureFormat(stream.format)); + if (format_info.type != FormatType::kResolvable) { + XELOGE("Unsupported memexport format {}", + FormatInfo::GetName(format_info.format)); + // Translated shaders shouldn't be performing exports with an unknown + // format, the draw can still be performed. + continue; + } + // TODO(Triang3l): Remove the unresearched format logging when it's known + // how exactly these formats need to be handled (most importantly what + // components need to be stored and in which order). 
+ switch (stream.format) { + case xenos::ColorFormat::k_8_A: + case xenos::ColorFormat::k_8_B: + case xenos::ColorFormat::k_8_8_8_8_A: + XELOGW( + "Memexport done to an unresearched format {}, report the game to " + "Xenia developers!", + FormatInfo::GetName(format_info.format)); + break; + default: + break; + } + uint32_t stream_size_bytes = + stream.index_count * (format_info.bits_per_pixel >> 3); + // Try to reduce the number of shared memory operations when writing + // different elements into the same buffer through different exports + // (happens in 4D5307E6). + bool range_reused = false; + for (MemExportRange& range : ranges_out) { + if (range.base_address_dwords == stream.base_address) { + range.size_bytes = std::max(range.size_bytes, stream_size_bytes); + range_reused = true; + break; + } + } + // Add a new range if haven't expanded an existing one. + if (!range_reused) { + ranges_out.emplace_back(stream.base_address, stream_size_bytes); + } + } +} + XE_NOINLINE XE_NOALIAS + xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 8196830b8..08c710e6c 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "xenia/base/assert.h" #include "xenia/gpu/register_file.h" @@ -474,6 +475,19 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA( return guest_sample_index ? 3 : 0; } +struct MemExportRange { + uint32_t base_address_dwords; + uint32_t size_bytes; + + explicit MemExportRange(uint32_t base_address_dwords, uint32_t size_bytes) + : base_address_dwords(base_address_dwords), size_bytes(size_bytes) {} +}; + +// Gathers memory ranges involved in memexports in the shader with the float +// constants from the registers, adding them to ranges_out. 
+void AddMemExportRanges(const RegisterFile& regs, const Shader& shader, + std::vector& ranges_out); + // To avoid passing values that the shader won't understand (even though // Direct3D 9 shouldn't pass them anyway). XE_NOINLINE diff --git a/src/xenia/gpu/dxbc.h b/src/xenia/gpu/dxbc.h index ea44abe46..e1587a7a5 100644 --- a/src/xenia/gpu/dxbc.h +++ b/src/xenia/gpu/dxbc.h @@ -913,6 +913,8 @@ enum class OperandModifier : uint32_t { struct Dest : OperandAddress { // Ignored for 0-component and 1-component operand types. + // For 4-component operand types, if the write mask is 0, it's treated as + // 0-component. uint32_t write_mask_; // Input destinations (v*) are for use only in declarations. Vector input @@ -1028,12 +1030,16 @@ struct Dest : OperandAddress { void Write(std::vector& code, bool in_dcl = false) const { uint32_t operand_token = GetOperandTokenTypeAndIndex(); OperandDimension dimension = GetDimension(in_dcl); - operand_token |= uint32_t(dimension); if (dimension == OperandDimension::kVector) { - assert_true(write_mask_ > 0b0000 && write_mask_ <= 0b1111); - operand_token |= - (uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4); + if (write_mask_) { + assert_true(write_mask_ <= 0b1111); + operand_token |= + (uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4); + } else { + dimension = OperandDimension::kNoData; + } } + operand_token |= uint32_t(dimension); code.push_back(operand_token); OperandAddress::Write(code); } @@ -1508,6 +1514,8 @@ enum class Opcode : uint32_t { kStoreUAVTyped = 164, kLdRaw = 165, kStoreRaw = 166, + kAtomicAnd = 169, + kAtomicOr = 170, kEvalSampleIndex = 204, kEvalCentroid = 205, }; @@ -2396,6 +2404,14 @@ class Assembler { ++stat_.instruction_count; ++stat_.c_texture_store_instructions; } + void OpAtomicAnd(const Dest& dest, const Src& address, + uint32_t address_components, const Src& value) { + EmitAtomicOp(Opcode::kAtomicAnd, dest, address, address_components, value); + } + void OpAtomicOr(const Dest& 
dest, const Src& address, + uint32_t address_components, const Src& value) { + EmitAtomicOp(Opcode::kAtomicOr, dest, address, address_components, value); + } void OpEvalSampleIndex(const Dest& dest, const Src& value, const Src& sample_index) { uint32_t dest_write_mask = dest.GetMask(); @@ -2522,6 +2538,22 @@ class Assembler { src1.Write(code_, true, 0b0000); ++stat_.instruction_count; } + void EmitAtomicOp(Opcode opcode, const Dest& dest, const Src& address, + uint32_t address_components, const Src& value) { + // Atomic operations require a 0-component memory destination. + assert_zero(dest.GetMask()); + uint32_t address_mask = (1 << address_components) - 1; + uint32_t operands_length = dest.GetLength() + + address.GetLength(address_mask) + + value.GetLength(0b0001); + code_.reserve(code_.size() + 1 + operands_length); + code_.push_back(OpcodeToken(opcode, operands_length)); + dest.Write(code_); + address.Write(code_, true, address_mask); + value.Write(code_, true, 0b0001); + ++stat_.instruction_count; + ++stat_.c_interlocked_instructions; + } std::vector& code_; Statistics& stat_; diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 921d7e346..5edf920b8 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -179,8 +179,6 @@ void DxbcShaderTranslator::Reset() { sampler_bindings_.clear(); - memexport_alloc_current_count_ = 0; - std::memset(&shader_feature_info_, 0, sizeof(shader_feature_info_)); std::memset(&statistics_, 0, sizeof(statistics_)); } @@ -789,6 +787,63 @@ void DxbcShaderTranslator::StartPixelShader() { PopSystemTemp(); } } + + if (current_shader().memexport_eM_written()) { + // Make sure memexport is done only once for a guest pixel. 
+ dxbc::Dest memexport_enabled_dest( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001)); + dxbc::Src memexport_enabled_src(dxbc::Src::R( + system_temp_memexport_enabled_and_eM_written_, dxbc::Src::kXXXX)); + uint32_t resolution_scaled_axes = + uint32_t(draw_resolution_scale_x_ > 1) | + (uint32_t(draw_resolution_scale_y_ > 1) << 1); + if (resolution_scaled_axes) { + uint32_t memexport_condition_temp = PushSystemTemp(); + // Only do memexport for one host pixel in a guest pixel - prefer the + // host pixel closer to the center of the guest pixel, but one that's + // covered with the half-pixel offset according to the top-left rule (1 + // for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x + // because it's the center and is covered with the half-pixel offset too). + in_position_used_ |= resolution_scaled_axes; + a_.OpFToU(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes), + dxbc::Src::V1D(in_reg_ps_position_)); + a_.OpUDiv(dxbc::Dest::Null(), + dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes), + dxbc::Src::R(memexport_condition_temp), + dxbc::Src::LU(draw_resolution_scale_x_, + draw_resolution_scale_y_, 0, 0)); + a_.OpIEq(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes), + dxbc::Src::R(memexport_condition_temp), + dxbc::Src::LU(draw_resolution_scale_x_ >> 1, + draw_resolution_scale_y_ >> 1, 0, 0)); + for (uint32_t i = 0; i < 2; ++i) { + if (!(resolution_scaled_axes & (1 << i))) { + continue; + } + a_.OpAnd(memexport_enabled_dest, memexport_enabled_src, + dxbc::Src::R(memexport_condition_temp).Select(i)); + } + // Release memexport_condition_temp. + PopSystemTemp(); + } + // With sample-rate shading (with float24 conversion), only do memexport + // from one sample (as the shader is invoked multiple times for a pixel), + // if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage, + // firstbit_lo returns 0xFFFFFFFF. 
+ if (IsSampleRate()) { + uint32_t memexport_condition_temp = PushSystemTemp(); + a_.OpFirstBitLo(dxbc::Dest::R(memexport_condition_temp, 0b0001), + dxbc::Src::VCoverage()); + a_.OpIEq( + dxbc::Dest::R(memexport_condition_temp, 0b0001), + dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY), + dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX)); + a_.OpAnd(memexport_enabled_dest, memexport_enabled_src, + dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX)); + // Release memexport_condition_temp. + PopSystemTemp(); + } + } } void DxbcShaderTranslator::StartTranslation() { @@ -885,34 +940,27 @@ void DxbcShaderTranslator::StartTranslation() { } } - if (!is_depth_only_pixel_shader_) { - // Allocate temporary registers for memexport addresses and data. - std::memset(system_temps_memexport_address_, 0xFF, - sizeof(system_temps_memexport_address_)); - std::memset(system_temps_memexport_data_, 0xFF, - sizeof(system_temps_memexport_data_)); - system_temp_memexport_written_ = UINT32_MAX; - const uint8_t* memexports_written = current_shader().memexport_eM_written(); - for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) { - uint32_t memexport_alloc_written = memexports_written[i]; - if (memexport_alloc_written == 0) { - continue; - } - // If memexport is used at all, allocate a register containing whether eM# - // have actually been written to. - if (system_temp_memexport_written_ == UINT32_MAX) { - system_temp_memexport_written_ = PushSystemTemp(0b1111); - } - system_temps_memexport_address_[i] = PushSystemTemp(0b1111); - uint32_t memexport_data_index; - while (xe::bit_scan_forward(memexport_alloc_written, - &memexport_data_index)) { - memexport_alloc_written &= ~(1u << memexport_data_index); - system_temps_memexport_data_[i][memexport_data_index] = - PushSystemTemp(); - } + // Allocate temporary registers for memexport. 
+ uint8_t memexport_eM_written = current_shader().memexport_eM_written(); + if (memexport_eM_written) { + system_temp_memexport_enabled_and_eM_written_ = PushSystemTemp(0b0010); + // Initialize the memexport conditional to whether the shared memory is + // currently bound as UAV (to 0 or UINT32_MAX). It can be made narrower + // later. + a_.OpIBFE( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001), + dxbc::Src::LU(1), dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift), + LoadFlagsSystemConstant()); + system_temp_memexport_address_ = PushSystemTemp(0b1111); + uint8_t memexport_eM_remaining = memexport_eM_written; + uint32_t memexport_eM_index; + while (xe::bit_scan_forward(memexport_eM_remaining, &memexport_eM_index)) { + memexport_eM_remaining &= ~(uint8_t(1) << memexport_eM_index); + system_temps_memexport_data_[memexport_eM_index] = PushSystemTemp(0b1111); } + } + if (!is_depth_only_pixel_shader_) { // Allocate system temporary variables for the translated code. Since access // depends on the guest code (thus no guarantees), initialize everything // now (except for pv, it's an internal temporary variable, not accessible @@ -1091,27 +1139,19 @@ void DxbcShaderTranslator::CompleteShaderCode() { // - system_temp_grad_h_lod_. // - system_temp_grad_v_vfetch_address_. PopSystemTemp(6); + } - // Write memexported data to the shared memory UAV. - ExportToMemory(); + uint8_t memexport_eM_written = current_shader().memexport_eM_written(); + if (memexport_eM_written) { + // Write data for the last memexport. + ExportToMemory( + current_shader().memexport_eM_potentially_written_before_end()); - // Release memexport temporary registers. - for (int i = Shader::kMaxMemExports - 1; i >= 0; --i) { - if (system_temps_memexport_address_[i] == UINT32_MAX) { - continue; - } - // Release exported data registers. - for (int j = 4; j >= 0; --j) { - if (system_temps_memexport_data_[i][j] != UINT32_MAX) { - PopSystemTemp(); - } - } - // Release the address register. 
- PopSystemTemp(); - } - if (system_temp_memexport_written_ != UINT32_MAX) { - PopSystemTemp(); - } + // Release memexport temporary registers: + // - system_temp_memexport_enabled_and_eM_written_. + // - system_temp_memexport_address_. + // - system_temps_memexport_data_. + PopSystemTemp(xe::bit_count(uint32_t(memexport_eM_written)) + 2); } // Write stage-specific epilogue. @@ -1514,36 +1554,22 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, dest = dxbc::Dest::R(system_temp_point_size_edge_flag_kill_vertex_); break; case InstructionStorageTarget::kExportAddress: - // Validate memexport writes (4D5307E6 has some completely invalid ones). - if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > Shader::kMaxMemExports || - system_temps_memexport_address_[memexport_alloc_current_count_ - 1] == - UINT32_MAX) { + if (!current_shader().memexport_eM_written()) { return; } - dest = dxbc::Dest::R( - system_temps_memexport_address_[memexport_alloc_current_count_ - 1]); + dest = dxbc::Dest::R(system_temp_memexport_address_); break; case InstructionStorageTarget::kExportData: { - // Validate memexport writes (4D5307E6 has some completely invalid ones). - if (memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > Shader::kMaxMemExports || - system_temps_memexport_data_[memexport_alloc_current_count_ - 1] - [result.storage_index] == UINT32_MAX) { - return; - } - dest = dxbc::Dest::R( - system_temps_memexport_data_[memexport_alloc_current_count_ - 1] - [result.storage_index]); + assert_not_zero(current_shader().memexport_eM_written() & + (uint8_t(1) << result.storage_index)); + dest = dxbc::Dest::R(system_temps_memexport_data_[result.storage_index]); // Mark that the eM# has been written to and needs to be exported. 
assert_not_zero(used_write_mask); - uint32_t memexport_index = memexport_alloc_current_count_ - 1; - a_.OpOr(dxbc::Dest::R(system_temp_memexport_written_, - 1 << (memexport_index >> 2)), - dxbc::Src::R(system_temp_memexport_written_) - .Select(memexport_index >> 2), - dxbc::Src::LU(uint32_t(1) << (result.storage_index + - ((memexport_index & 3) << 3)))); + a_.OpOr( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010), + dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kYYYY), + dxbc::Src::LU(uint8_t(1) << result.storage_index)); } break; case InstructionStorageTarget::kColor: assert_not_zero(used_write_mask); @@ -1990,15 +2016,38 @@ void DxbcShaderTranslator::ProcessJumpInstruction( } void DxbcShaderTranslator::ProcessAllocInstruction( - const ParsedAllocInstruction& instr) { + const ParsedAllocInstruction& instr, uint8_t export_eM) { + bool start_memexport = instr.type == AllocType::kMemory && + current_shader().memexport_eM_written(); + if (export_eM || start_memexport) { + CloseExecConditionals(); + } + if (emit_source_map_) { instruction_disassembly_buffer_.Reset(); instr.Disassemble(&instruction_disassembly_buffer_); EmitInstructionDisassembly(); } - if (instr.type == AllocType::kMemory) { - ++memexport_alloc_current_count_; + if (export_eM) { + ExportToMemory(export_eM); + // Reset which eM# elements have been written. + a_.OpMov( + dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010), + dxbc::Src::LU(0)); + // Break dependencies from the previous memexport. + uint8_t export_eM_remaining = export_eM; + uint32_t eM_index; + while (xe::bit_scan_forward(export_eM_remaining, &eM_index)) { + export_eM_remaining &= ~(uint8_t(1) << eM_index); + a_.OpMov(dxbc::Dest::R(system_temps_memexport_data_[eM_index]), + dxbc::Src::LF(0.0f)); + } + } + + if (start_memexport) { + // Initialize eA to an invalid address. 
+ a_.OpMov(dxbc::Dest::R(system_temp_memexport_address_), dxbc::Src::LU(0)); } } @@ -2851,7 +2900,7 @@ void DxbcShaderTranslator::WriteInputSignature() { // Sample index (SV_SampleIndex) for safe memexport with sample-rate // shading. size_t sample_index_position = SIZE_MAX; - if (current_shader().is_valid_memexport_used() && IsSampleRate()) { + if (current_shader().memexport_eM_written() && IsSampleRate()) { size_t sample_index_position = shader_object_.size(); shader_object_.resize(shader_object_.size() + kParameterDwords); ++parameter_count; @@ -3625,7 +3674,7 @@ void DxbcShaderTranslator::WriteShaderCode() { dxbc::Name::kPosition); } bool sample_rate_memexport = - current_shader().is_valid_memexport_used() && IsSampleRate(); + current_shader().memexport_eM_written() && IsSampleRate(); // Sample-rate shading can't be done with UAV-only rendering (sample-rate // shading is only needed for float24 depth conversion when using a float32 // host depth buffer). diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index bcb38a21f..20fbdd328 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -20,6 +20,7 @@ #include "xenia/base/string_buffer.h" #include "xenia/gpu/dxbc.h" #include "xenia/gpu/shader_translator.h" +#include "xenia/gpu/ucode.h" #include "xenia/ui/graphics_provider.h" namespace xe { @@ -589,13 +590,16 @@ class DxbcShaderTranslator : public ShaderTranslator { void ProcessLoopEndInstruction( const ParsedLoopEndInstruction& instr) override; void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override; - void ProcessAllocInstruction(const ParsedAllocInstruction& instr) override; + void ProcessAllocInstruction(const ParsedAllocInstruction& instr, + uint8_t export_eM) override; void ProcessVertexFetchInstruction( const ParsedVertexFetchInstruction& instr) override; void ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) override; - void 
ProcessAluInstruction(const ParsedAluInstruction& instr) override; + void ProcessAluInstruction( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) override; private: // IF ANY OF THESE ARE CHANGED, WriteInputSignature and WriteOutputSignature @@ -674,6 +678,11 @@ class DxbcShaderTranslator : public ShaderTranslator { // Frees the last allocated internal r# registers for later reuse. void PopSystemTemp(uint32_t count = 1); + // ExportToMemory modifies the values of eA/eM# for simplicity, call only + // before starting a new export or ending the invocation or making it + // inactive. + void ExportToMemory(uint8_t export_eM); + // Converts one scalar from piecewise linear gamma to linear. The target may // be the same as the source, the temporary variables must be different. If // the source is not pre-saturated, saturation will be done internally. @@ -728,7 +737,7 @@ class DxbcShaderTranslator : public ShaderTranslator { bool ROV_IsDepthStencilEarly() const { assert_true(edram_rov_used_); return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() && - !current_shader().is_valid_memexport_used(); + !current_shader().memexport_eM_written(); } // Converts the pre-clamped depth value to 24-bit (storing the result in bits // 0:23 and zeros in 24:31, not creating room for stencil - since this may be @@ -787,14 +796,6 @@ class DxbcShaderTranslator : public ShaderTranslator { void StartPixelShader_LoadROVParameters(); void StartPixelShader(); - // Writing the epilogue. - // ExportToMemory modifies the values of eA/eM# for simplicity, don't call - // multiple times. 
- void ExportToMemory_PackFixed32(const uint32_t* eM_temps, uint32_t eM_count, - const uint32_t bits[4], - const dxbc::Src& is_integer, - const dxbc::Src& is_signed); - void ExportToMemory(); void CompleteVertexOrDomainShader(); // For RTV, adds the sample to coverage_temp.coverage_temp_component if it // passes alpha to mask (or, if initialize == true (for the first sample @@ -917,13 +918,16 @@ class DxbcShaderTranslator : public ShaderTranslator { .SelectFromSwizzled(word_index & 1); } - void KillPixel(bool condition, const dxbc::Src& condition_src); + void KillPixel(bool condition, const dxbc::Src& condition_src, + uint8_t memexport_eM_potentially_written_before); - void ProcessVectorAluOperation(const ParsedAluInstruction& instr, - uint32_t& result_swizzle, - bool& predicate_written); - void ProcessScalarAluOperation(const ParsedAluInstruction& instr, - bool& predicate_written); + void ProcessVectorAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle, + bool& predicate_written); + void ProcessScalarAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written); void WriteResourceDefinition(); void WriteInputSignature(); @@ -1124,14 +1128,16 @@ class DxbcShaderTranslator : public ShaderTranslator { // writing). uint32_t system_temps_color_[4]; - // Bits containing whether each eM# has been written, for up to 16 streams, or - // UINT32_MAX if memexport is not used. 8 bits (5 used) for each stream, with - // 4 `alloc export`s per component. - uint32_t system_temp_memexport_written_; - // eA in each `alloc export`, or UINT32_MAX if not used. - uint32_t system_temps_memexport_address_[Shader::kMaxMemExports]; - // eM# in each `alloc export`, or UINT32_MAX if not used. 
- uint32_t system_temps_memexport_data_[Shader::kMaxMemExports][5]; + // Memory export temporary registers are allocated if the shader writes any + // eM# (current_shader().memexport_eM_written() != 0). + // X - whether memexport is enabled for this invocation. + // Y - which eM# elements have been written so far by the invocation since the + // last memory write. + uint32_t system_temp_memexport_enabled_and_eM_written_; + // eA. + uint32_t system_temp_memexport_address_; + // eM#. + uint32_t system_temps_memexport_data_[ucode::kMaxMemExportElementCount]; // Vector ALU or fetch result / scratch (since Xenos write masks can contain // swizzles). @@ -1195,10 +1201,6 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t uav_index_edram_; std::vector sampler_bindings_; - - // Number of `alloc export`s encountered so far in the translation. The index - // of the current eA/eM# temp register set is this minus 1, if it's not 0. - uint32_t memexport_alloc_current_count_; }; } // namespace gpu diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 948406b90..a1d2970f0 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -19,22 +19,29 @@ namespace xe { namespace gpu { using namespace ucode; -void DxbcShaderTranslator::KillPixel(bool condition, - const dxbc::Src& condition_src) { +void DxbcShaderTranslator::KillPixel( + bool condition, const dxbc::Src& condition_src, + uint8_t memexport_eM_potentially_written_before) { + a_.OpIf(condition, condition_src); + // Perform outstanding memory exports before the invocation becomes inactive + // and UAV writes are disabled. + ExportToMemory(memexport_eM_potentially_written_before); // Discard the pixel, but continue execution if other lanes in the quad need // this lane for derivatives. The driver may also perform early exiting // internally if all lanes are discarded if deemed beneficial. 
- a_.OpDiscard(condition, condition_src); + a_.OpDiscard(true, dxbc::Src::LU(UINT32_MAX)); if (edram_rov_used_) { // Even though discarding disables all subsequent UAV/ROV writes, also skip // as much of the Render Backend emulation logic as possible by setting the // coverage and the mask of the written render targets to zero. a_.OpMov(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::LU(0)); } + a_.OpEndIf(); } void DxbcShaderTranslator::ProcessVectorAluOperation( - const ParsedAluInstruction& instr, uint32_t& result_swizzle, + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle, bool& predicate_written) { result_swizzle = dxbc::Src::kXYZW; predicate_written = false; @@ -506,7 +513,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -522,7 +530,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -538,7 +547,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), 
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -554,7 +564,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY)); - KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX)); + KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), + memexport_eM_potentially_written_before); if (used_result_components) { a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001), dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX), @@ -640,7 +651,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( } void DxbcShaderTranslator::ProcessScalarAluOperation( - const ParsedAluInstruction& instr, bool& predicate_written) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written) { predicate_written = false; if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev) { @@ -950,27 +962,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( case AluScalarOpcode::kKillsEq: a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(0.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsGt: a_.OpLT(ps_dest, dxbc::Src::LF(0.0f), operand_0_a); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsGe: a_.OpGE(ps_dest, operand_0_a, 
dxbc::Src::LF(0.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsNe: a_.OpNE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; case AluScalarOpcode::kKillsOne: a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(1.0f)); - KillPixel(true, ps_src); + KillPixel(true, ps_src, memexport_eM_potentially_written_before); a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f)); break; @@ -1024,7 +1036,8 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( } void DxbcShaderTranslator::ProcessAluInstruction( - const ParsedAluInstruction& instr) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) { if (instr.IsNop()) { // Don't even disassemble or update predication. return; @@ -1041,10 +1054,11 @@ void DxbcShaderTranslator::ProcessAluInstruction( // checked again later. 
bool predicate_written_vector = false; uint32_t vector_result_swizzle = dxbc::Src::kXYZW; - ProcessVectorAluOperation(instr, vector_result_swizzle, - predicate_written_vector); + ProcessVectorAluOperation(instr, memexport_eM_potentially_written_before, + vector_result_swizzle, predicate_written_vector); bool predicate_written_scalar = false; - ProcessScalarAluOperation(instr, predicate_written_scalar); + ProcessScalarAluOperation(instr, memexport_eM_potentially_written_before, + predicate_written_scalar); StoreResult(instr.vector_and_constant_result, dxbc::Src::R(system_temp_result_, vector_result_swizzle), diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index c48facc08..1049fa739 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -2,533 +2,830 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2023 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ +#include +#include +#include + #include "xenia/base/assert.h" #include "xenia/base/math.h" -#include "xenia/gpu/draw_util.h" #include "xenia/gpu/dxbc_shader_translator.h" -#include "xenia/gpu/texture_cache.h" namespace xe { namespace gpu { using namespace ucode; -// TODO(Triang3l): Support sub-dword memexports (like k_8 in 58410B86). This -// would require four 128 MB R8_UINT UAVs due to -// D3D12_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP. Need to be careful with -// resource binding tiers, however. Resource binding tier 1 on feature level -// 11_0 allows only 8 UAVs _across all stages_. 
RWByteAddressBuffer + 4 typed -// buffers is 5 per stage already, would need 10 for both VS and PS, or even 11 -// with the eDRAM ROV. Need to drop draw commands doing memexport in both VS and -// PS on FL 11_0 resource binding tier 1. - -void DxbcShaderTranslator::ExportToMemory_PackFixed32( - const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4], - const dxbc::Src& is_integer, const dxbc::Src& is_signed) { - // Will insert with BFI - sign extension of red will be overwritten, not - // truncated. - assert_not_zero(bits[0]); - assert_true(bits[0] + bits[1] + bits[2] + bits[3] == 32); - uint32_t mask = 0; - for (uint32_t i = 0; i < 4; ++i) { - if (bits[i]) { - mask |= 1 << i; - } - } - a_.OpIf(true, is_signed); - { - float range[4]; - for (uint32_t i = 0; i < 4; ++i) { - range[i] = bits[i] ? float((uint32_t(1) << (bits[i] - 1)) - 1) : 0.0f; - } - dxbc::Src range_src(dxbc::Src::LP(range)); - a_.OpIf(false, is_integer); - for (uint32_t i = 0; i < eM_count; ++i) { - uint32_t eM_temp = eM_temps[i]; - a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src); - } - a_.OpEndIf(); - for (uint32_t i = 0; i < eM_count; ++i) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask)); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[i])); - // TODO(Triang3l): NaN should become zero, not -range. 
- a_.OpMax(eM_dest, eM_src, -range_src); - a_.OpMin(eM_dest, eM_src, range_src); - } - } - a_.OpElse(); - { - float range[4]; - for (uint32_t i = 0; i < 4; ++i) { - range[i] = float((uint32_t(1) << bits[i]) - 1); - } - dxbc::Src range_src(dxbc::Src::LP(range)); - a_.OpIf(false, is_integer); - for (uint32_t i = 0; i < eM_count; ++i) { - uint32_t eM_temp = eM_temps[i]; - a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src); - } - a_.OpEndIf(); - for (uint32_t i = 0; i < eM_count; ++i) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask)); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[i])); - a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f)); - a_.OpMin(eM_dest, eM_src, range_src); - } - } - a_.OpEndIf(); - for (uint32_t i = 0; i < eM_count; ++i) { - uint32_t eM_temp = eM_temps[i]; - // Round to the nearest integer, according to the rules of handling integer - // formats in Direct3D. - // TODO(Triang3l): Round by adding +-0.5, not with round_ne. - a_.OpRoundNE(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp)); - a_.OpFToI(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp)); - dxbc::Dest eM_packed_dest(dxbc::Dest::R(eM_temp, 0b0001)); - dxbc::Src eM_packed_src(dxbc::Src::R(eM_temp, dxbc::Src::kXXXX)); - uint32_t offset = bits[0]; - for (uint32_t j = 1; j < 4; ++j) { - if (!bits[j]) { - continue; - } - a_.OpBFI(eM_packed_dest, dxbc::Src::LU(bits[j]), dxbc::Src::LU(offset), - dxbc::Src::R(eM_temp).Select(j), eM_packed_src); - offset += bits[j]; - } - } -} - -void DxbcShaderTranslator::ExportToMemory() { - if (system_temp_memexport_written_ == UINT32_MAX) { - // No exports in the shader. +void DxbcShaderTranslator::ExportToMemory(uint8_t export_eM) { + if (!export_eM) { return; } - // Allocate a register for temporary values at various stages. - uint32_t control_temp = PushSystemTemp(); + assert_zero(export_eM & ~current_shader().memexport_eM_written()); - // Safety check if the shared memory is bound as UAV. 
- a_.OpUBFE(dxbc::Dest::R(control_temp, 0b0001), dxbc::Src::LU(1), - dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift), - LoadFlagsSystemConstant()); - // Open the `if` with the uniform condition for the shared memory buffer being - // bound as a UAV (more fine-grained checks are vector and likely divergent). - a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); + // Check if memory export is allowed in this invocation. + a_.OpIf(true, dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kXXXX)); - // Check more fine-grained limitations. - bool inner_condition_provided = false; - if (is_pixel_shader()) { - uint32_t resolution_scaled_axes = - uint32_t(draw_resolution_scale_x_ > 1) | - (uint32_t(draw_resolution_scale_y_ > 1) << 1); - if (resolution_scaled_axes) { - // Only do memexport for one host pixel in a guest pixel - prefer the - // host pixel closer to the center of the guest pixel, but one that's - // covered with the half-pixel offset according to the top-left rule (1 - // for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x - // because it's the center and is covered with the half-pixel offset too). - // Using control_temp.yz as per-axis temporary variables. - in_position_used_ |= resolution_scaled_axes; - a_.OpFToU(dxbc::Dest::R(control_temp, resolution_scaled_axes << 1), - dxbc::Src::V1D(in_reg_ps_position_, 0b0100 << 2)); - a_.OpUDiv(dxbc::Dest::Null(), - dxbc::Dest::R(control_temp, resolution_scaled_axes << 1), - dxbc::Src::R(control_temp, 0b1001 << 2), - dxbc::Src::LU(0, draw_resolution_scale_x_, - draw_resolution_scale_y_, 0)); - for (uint32_t i = 0; i < 2; ++i) { - if (!(resolution_scaled_axes & (1 << i))) { - continue; - } - // If there's no inner condition in control_temp.x yet, the condition - // for the current axis can go directly to it. Otherwise, need to merge - // with the previous condition, using control_temp.y or .z as an - // intermediate variable. 
- dxbc::Src resolution_scaled_axis_src( - dxbc::Src::R(control_temp).Select(1 + i)); - a_.OpIEq( - dxbc::Dest::R(control_temp, - inner_condition_provided ? 1 << (1 + i) : 0b0001), - resolution_scaled_axis_src, - dxbc::Src::LU( - (i ? draw_resolution_scale_y_ : draw_resolution_scale_x_) >> - 1)); - if (inner_condition_provided) { - // Merge with the previous condition in control_temp.x. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), - dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - resolution_scaled_axis_src); - } - inner_condition_provided = true; - } - } - // With sample-rate shading (with float24 conversion), only do memexport - // from one sample (as the shader is invoked multiple times for a pixel), - // if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage, - // firstbit_lo returns 0xFFFFFFFF. - if (IsSampleRate()) { - a_.OpFirstBitLo(dxbc::Dest::R(control_temp, 0b0010), - dxbc::Src::VCoverage()); - a_.OpIEq( - dxbc::Dest::R(control_temp, - inner_condition_provided ? 0b0010 : 0b0001), - dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY), - dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - if (inner_condition_provided) { - // Merge with the previous condition in control_temp.x. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), - dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - } - inner_condition_provided = true; - } - } - // Open the inner (vector) conditional if needed. - if (inner_condition_provided) { - a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); - } - // control_temp.x is now free. - - for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) { - uint32_t eA_temp = system_temps_memexport_address_[i]; - if (eA_temp == UINT32_MAX) { - // Export not used. - continue; - } - // For simplicity of access, gather actually used eM# registers for this - // export. Zero-initialize eM_offsets because excess elements of it may be - // accessed, for stable caching. 
- uint32_t eM_temps[5], eM_offsets[5] = {}, eM_count = 0; - for (uint32_t j = 0; j < 5; ++j) { - uint32_t eM_temp = system_temps_memexport_data_[i][j]; - if (eM_temp == UINT32_MAX) { - continue; - } - eM_temps[eM_count] = eM_temp; - eM_offsets[eM_count] = j; - ++eM_count; - } - if (eM_count == 0) { - continue; - } - - // Swap red and blue if needed. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), - dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ), - dxbc::Src::LU(uint32_t(1) << 19)); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpMovC(dxbc::Dest::R(eM_temp, 0b0101), - dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - dxbc::Src::R(eM_temp, 0b000010), dxbc::Src::R(eM_temp)); - } - - // Initialize element size in control_temp.x to 4 bytes as this is the most - // common size. - dxbc::Dest element_size_dest(dxbc::Dest::R(control_temp, 0b0001)); - dxbc::Src element_size_src(dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); - a_.OpMov(element_size_dest, dxbc::Src::LU(4)); - - // Each eM should get a packed value in the destination format now. - - // Extract format properties to control_temp. - // Y - signedness if fixed-point. - // Z - fractional/integer if fixed-point. - // W - color format. - a_.OpUBFE(dxbc::Dest::R(control_temp, 0b1110), dxbc::Src::LU(0, 1, 1, 6), - dxbc::Src::LU(0, 16, 17, 8), - dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ)); - dxbc::Src is_signed(dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - dxbc::Src is_integer(dxbc::Src::R(control_temp, dxbc::Src::kZZZZ)); - // Convert and pack the format. - a_.OpSwitch(dxbc::Src::R(control_temp, dxbc::Src::kWWWW)); - // control_temp.w is now free. 
- { - // k_8_8_8_8 - // k_8_8_8_8_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16))); - { - uint32_t bits[4] = {8, 8, 8, 8}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_2_10_10_10 - // k_2_10_10_10_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16))); - { - uint32_t bits[4] = {10, 10, 10, 2}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_10_11_11 - // k_10_11_11_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16))); - { - uint32_t bits[4] = {11, 11, 10}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_11_11_10 - // k_11_11_10_AS_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10))); - a_.OpCase(dxbc::Src::LU( - uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16))); - { - uint32_t bits[4] = {10, 11, 11}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16))); - { - uint32_t bits[4] = {16, 16}; - ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer, - is_signed); - } - a_.OpBreak(); - - // k_16_16_16_16 - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16))); - a_.OpMov(element_size_dest, dxbc::Src::LU(8)); - a_.OpIf(true, is_signed); - { - a_.OpIf(false, is_integer); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp), - dxbc::Src::LF(32767.0f)); - } - 
a_.OpEndIf(); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - // TODO(Triang3l): NaN should become zero, not -range. - a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(-32767.0f)); - a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(32767.0f)); - } - } - a_.OpElse(); - { - a_.OpIf(false, is_integer); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp), - dxbc::Src::LF(65535.0f)); - } - a_.OpEndIf(); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f)); - a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(65535.0f)); - } - } - a_.OpEndIf(); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - // Round to the nearest integer, according to the rules of handling - // integer formats in Direct3D. - // TODO(Triang3l): Round by adding +-0.5, not with round_ne. 
- a_.OpRoundNE(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp)); - a_.OpFToI(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp)); - a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16), - dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101), - dxbc::Src::R(eM_temp, 0b1000)); - } - a_.OpBreak(); - - // k_16_16_FLOAT - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT))); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpF32ToF16(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::R(eM_temp)); - a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0001), dxbc::Src::LU(16), - dxbc::Src::LU(16), dxbc::Src::R(eM_temp, dxbc::Src::kYYYY), - dxbc::Src::R(eM_temp, dxbc::Src::kXXXX)); - } - a_.OpBreak(); - - // k_16_16_16_16_FLOAT - a_.OpCase( - dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT))); - a_.OpMov(element_size_dest, dxbc::Src::LU(8)); - for (uint32_t j = 0; j < eM_count; ++j) { - uint32_t eM_temp = eM_temps[j]; - a_.OpF32ToF16(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp)); - a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16), - dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101), - dxbc::Src::R(eM_temp, 0b1000)); - } - a_.OpBreak(); - - // k_32_FLOAT - // Already in the destination format, 4 bytes per element already - // selected. - - // k_32_32_FLOAT - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT))); - a_.OpMov(element_size_dest, dxbc::Src::LU(8)); - // Already in the destination format. - a_.OpBreak(); - - // k_32_32_32_32_FLOAT - a_.OpCase( - dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT))); - a_.OpMov(element_size_dest, dxbc::Src::LU(16)); - // Already in the destination format. - a_.OpBreak(); - } - a_.OpEndSwitch(); - // control_temp.yz are now free. - - // Do endian swap. - { - dxbc::Dest endian_dest(dxbc::Dest::R(control_temp, 0b0010)); - dxbc::Src endian_src(dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - // Extract endianness into control_temp.y. 
- a_.OpAnd(endian_dest, dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ), - dxbc::Src::LU(0b111)); - - // Change 8-in-64 and 8-in-128 to 8-in-32. - for (uint32_t j = 0; j < 2; ++j) { - a_.OpIEq(dxbc::Dest::R(control_temp, 0b0100), endian_src, - dxbc::Src::LU(uint32_t(j ? xenos::Endian128::k8in128 - : xenos::Endian128::k8in64))); - for (uint32_t k = 0; k < eM_count; ++k) { - uint32_t eM_temp = eM_temps[k]; - a_.OpMovC(dxbc::Dest::R(eM_temp), - dxbc::Src::R(control_temp, dxbc::Src::kZZZZ), - dxbc::Src::R(eM_temp, j ? 0b00011011 : 0b10110001), - dxbc::Src::R(eM_temp)); - } - a_.OpMovC(endian_dest, dxbc::Src::R(control_temp, dxbc::Src::kZZZZ), - dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)), - endian_src); - } - - uint32_t swap_temp = PushSystemTemp(); - dxbc::Dest swap_temp_dest(dxbc::Dest::R(swap_temp)); - dxbc::Src swap_temp_src(dxbc::Src::R(swap_temp)); - - // 8-in-16 or one half of 8-in-32. - a_.OpSwitch(endian_src); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in16))); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - // Temp = X0Z0. - a_.OpAnd(swap_temp_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); - // eM = YZW0. - a_.OpUShR(eM_dest, eM_src, dxbc::Src::LU(8)); - // eM = Y0W0. - a_.OpAnd(eM_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); - // eM = YXWZ. - a_.OpUMAd(eM_dest, swap_temp_src, dxbc::Src::LU(256), eM_src); - } - a_.OpBreak(); - a_.OpEndSwitch(); - - // 16-in-32 or another half of 8-in-32. - a_.OpSwitch(endian_src); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); - a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k16in32))); - for (uint32_t j = 0; j < eM_count; ++j) { - dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j])); - dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - // Temp = ZW00. - a_.OpUShR(swap_temp_dest, eM_src, dxbc::Src::LU(16)); - // eM = ZWXY. 
- a_.OpBFI(eM_dest, dxbc::Src::LU(16), dxbc::Src::LU(16), eM_src, - swap_temp_src); - } - a_.OpBreak(); - a_.OpEndSwitch(); - - // Release swap_temp. - PopSystemTemp(); - } - // control_temp.yz are now free. - - dxbc::Dest address_dest(dxbc::Dest::R(eA_temp, 0b0001)); - dxbc::Src address_src(dxbc::Src::R(eA_temp, dxbc::Src::kXXXX)); - // Multiply the base address by dword size, also dropping the 0x40000000 - // bit. - a_.OpIShL(address_dest, address_src, dxbc::Src::LU(2)); - // Drop the exponent in the element index. - a_.OpAnd(dxbc::Dest::R(eA_temp, 0b0010), - dxbc::Src::R(eA_temp, dxbc::Src::kYYYY), - dxbc::Src::LU((1 << 23) - 1)); - // Add the offset of the first written element to the base address. - a_.OpUMAd(address_dest, dxbc::Src::R(eA_temp, dxbc::Src::kYYYY), - element_size_src, address_src); - // Do the writes. - dxbc::Src eM_written_src( - dxbc::Src::R(system_temp_memexport_written_).Select(i >> 2)); - uint32_t eM_written_base = 1u << ((i & 3) << 3); - for (uint32_t j = 0; j < eM_count; ++j) { - // Go to the next eM#. - uint32_t eM_relative_offset = eM_offsets[j] - (j ? eM_offsets[j - 1] : 0); - if (eM_relative_offset) { - if (eM_relative_offset == 1) { - a_.OpIAdd(address_dest, element_size_src, address_src); - } else { - a_.OpUMAd(address_dest, dxbc::Src::LU(eM_relative_offset), - element_size_src, address_src); - } - } - // Check if the eM# was actually written to on the execution path. - a_.OpAnd(dxbc::Dest::R(control_temp, 0b0010), eM_written_src, - dxbc::Src::LU(eM_written_base << eM_offsets[j])); - a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kYYYY)); - // Write the element of the needed size. 
- dxbc::Src eM_src(dxbc::Src::R(eM_temps[j])); - a_.OpSwitch(element_size_src); - for (uint32_t k = 1; k <= 4; k <<= 1) { - a_.OpCase(dxbc::Src::LU(k * 4)); - if (uav_index_shared_memory_ == kBindingIndexUnallocated) { - uav_index_shared_memory_ = uav_count_++; - } - a_.OpStoreRaw( - dxbc::Dest::U(uav_index_shared_memory_, - uint32_t(UAVRegister::kSharedMemory), (1 << k) - 1), - address_src, eM_src); - a_.OpBreak(); - } - a_.OpEndSwitch(); - a_.OpEndIf(); - } - // control_temp.y is now free. + // Check if the address with the correct sign and exponent was written, and + // that the index doesn't overflow the mantissa bits. + { + uint32_t address_check_temp = PushSystemTemp(); + a_.OpUShR(dxbc::Dest::R(address_check_temp), + dxbc::Src::R(system_temp_memexport_address_), + dxbc::Src::LU(30, 23, 23, 23)); + a_.OpIEq(dxbc::Dest::R(address_check_temp), + dxbc::Src::R(address_check_temp), + dxbc::Src::LU(0x1, 0x96, 0x96, 0x96)); + a_.OpAnd(dxbc::Dest::R(address_check_temp, 0b0011), + dxbc::Src::R(address_check_temp), + dxbc::Src::R(address_check_temp, 0b1110)); + a_.OpAnd(dxbc::Dest::R(address_check_temp, 0b0001), + dxbc::Src::R(address_check_temp, dxbc::Src::kXXXX), + dxbc::Src::R(address_check_temp, dxbc::Src::kYYYY)); + a_.OpIf(true, dxbc::Src::R(address_check_temp, dxbc::Src::kXXXX)); + // Release address_check_temp. + PopSystemTemp(); } - // Close the inner memexport possibility conditional. - if (inner_condition_provided) { + uint8_t eM_remaining; + uint32_t eM_index; + + // Swap red and blue components if needed. + { + uint32_t red_blue_swap_temp = PushSystemTemp(); + a_.OpIBFE(dxbc::Dest::R(red_blue_swap_temp, 0b0001), dxbc::Src::LU(1), + dxbc::Src::LU(19), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kZZZZ)); + a_.OpIf(true, dxbc::Src::R(red_blue_swap_temp, dxbc::Src::kXXXX)); + // Release red_blue_swap_temp. 
+ PopSystemTemp(); + + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + a_.OpMov( + dxbc::Dest::R(system_temps_memexport_data_[eM_index], 0b0101), + dxbc::Src::R(system_temps_memexport_data_[eM_index], 0b11000110)); + } + + // Close the red/blue swap conditional. a_.OpEndIf(); } - // Close the outer memexport possibility conditional. + uint32_t temp = PushSystemTemp(); + + // Extract the color format and the numeric format. + // temp.x = color format. + // temp.y = numeric format is signed. + // temp.z = numeric format is integer. + a_.OpUBFE(dxbc::Dest::R(temp, 0b0111), dxbc::Src::LU(6, 1, 1, 0), + dxbc::Src::LU(8, 16, 17, 0), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kZZZZ)); + + // Perform format packing. + // After the switch, temp.x must contain log2 of the number of bytes in an + // element, of UINT32_MAX if the format is unknown. + a_.OpSwitch(dxbc::Src::R(temp, dxbc::Src::kXXXX)); + { + dxbc::Dest element_size_dest(dxbc::Dest::R(temp, 0b0001)); + dxbc::Src num_format_signed(dxbc::Src::R(temp, dxbc::Src::kYYYY)); + dxbc::Src num_format_integer(dxbc::Src::R(temp, dxbc::Src::kZZZZ)); + + auto flush_nan = [this, export_eM](uint32_t components) { + uint8_t eM_remaining = export_eM; + uint32_t eM_index; + uint32_t is_nan_temp = PushSystemTemp(); + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpNE(dxbc::Dest::R(is_nan_temp, components), dxbc::Src::R(eM), + dxbc::Src::R(eM)); + a_.OpMovC(dxbc::Dest::R(eM, components), dxbc::Src::R(is_nan_temp), + dxbc::Src::LF(0.0f), dxbc::Src::R(eM)); + } + // Release is_nan_temp. + PopSystemTemp(); + }; + + // The result will be in eM#.x. 
The widths must be without holes (R, RG, + // RGB, RGBA), and expecting the widths to add up to the size of the stored + // texel (8, 16 or 32 bits), as the unused upper bits will contain junk from + // the sign extension of X if the number is signed. + auto pack_8_16_32 = [&](std::array widths) { + uint8_t eM_remaining; + uint32_t eM_index; + + uint32_t components = 0; + std::array offsets = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i]) { + // Only formats for which max + 0.5 can be represented exactly. + assert(widths[i] <= 23); + components |= uint32_t(1) << i; + } + if (i) { + offsets[i] = offsets[i - 1] + widths[i - 1]; + } + } + // Will be packing components into eM#.x starting from green, assume red + // will already be there after the conversion. + assert_not_zero(components & 0b1); + + flush_nan(components); + + a_.OpIf(true, num_format_signed); + { + // Signed. + a_.OpIf(true, num_format_integer); + { + // Signed integer. + float min_value[4] = {}, max_value[4] = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i]) { + max_value[i] = float((uint32_t(1) << (widths[i] - 1)) - 1); + min_value[i] = -1.0f - max_value[i]; + } + } + dxbc::Src min_value_src(dxbc::Src::LP(min_value)); + dxbc::Src max_value_src(dxbc::Src::LP(max_value)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM, components), min_value_src, + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM, components), max_value_src, + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Signed normalized. 
+ uint32_t scale_components = 0; + float scale[4] = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i] > 2) { + scale_components |= uint32_t(1) << i; + scale[i] = float((uint32_t(1) << (widths[i] - 1)) - 1); + } + } + dxbc::Src scale_src(dxbc::Src::LP(scale)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM, components), dxbc::Src::LF(-1.0f), + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM, components), dxbc::Src::LF(1.0f), + dxbc::Src::R(eM)); + if (scale_components) { + a_.OpMul(dxbc::Dest::R(eM, scale_components), dxbc::Src::R(eM), + scale_src); + } + } + } + a_.OpEndIf(); + + // Add plus/minus 0.5 before truncating according to the Direct3D format + // conversion rules, and convert to signed integers. + uint32_t round_bias_temp = PushSystemTemp(); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM, components), dxbc::Src::LU(31), + dxbc::Src::LU(0), dxbc::Src::LF(0.5f), dxbc::Src::R(eM)); + a_.OpAdd(dxbc::Dest::R(eM, components), dxbc::Src::R(eM), + dxbc::Src::R(round_bias_temp)); + a_.OpFToI(dxbc::Dest::R(eM, components), dxbc::Src::R(eM)); + } + // Release round_bias_temp. + PopSystemTemp(); + } + a_.OpElse(); + { + // Unsigned. + a_.OpIf(true, num_format_integer); + { + // Unsigned integer. 
+ float max_value[4]; + for (uint32_t i = 0; i < 4; ++i) { + max_value[i] = float((uint32_t(1) << widths[i]) - 1); + } + dxbc::Src max_value_src(dxbc::Src::LP(max_value)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM, components), dxbc::Src::LF(0.0f), + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM, components), max_value_src, + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Unsigned normalized. + uint32_t scale_components = 0; + float scale[4] = {}; + for (uint32_t i = 0; i < 4; ++i) { + if (widths[i] > 1) { + scale_components |= uint32_t(1) << i; + scale[i] = float((uint32_t(1) << widths[i]) - 1); + } + } + dxbc::Src scale_src(dxbc::Src::LP(scale)); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + // Saturate. + a_.OpMov(dxbc::Dest::R(eM, components), dxbc::Src::R(eM), true); + if (scale_components) { + a_.OpMul(dxbc::Dest::R(eM, scale_components), dxbc::Src::R(eM), + scale_src); + } + } + } + a_.OpEndIf(); + + // Add 0.5 before truncating according to the Direct3D format conversion + // rules, and convert to unsigned integers. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpAdd(dxbc::Dest::R(eM, components), dxbc::Src::R(eM), + dxbc::Src::LF(0.5f)); + a_.OpFToU(dxbc::Dest::R(eM, components), dxbc::Src::R(eM)); + } + } + a_.OpEndIf(); + + // Pack into 32 bits. 
+ for (uint32_t i = 0; i < 4; ++i) { + if (!widths[i]) { + continue; + } + dxbc::Src width_src(dxbc::Src::LU(widths[i])); + dxbc::Src offset_src(dxbc::Src::LU(offsets[i])); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM, 0b0001), width_src, offset_src, + dxbc::Src::R(eM).Select(i), + dxbc::Src::R(eM, dxbc::Src::kXXXX)); + } + } + }; + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8))); + // TODO(Triang3l): Investigate how input should be treated for k_8_A, k_8_B, + // k_8_8_8_8_A. + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_A))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_B))); + { + pack_8_16_32({8}); + a_.OpMov(element_size_dest, dxbc::Src::LU(0)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_1_5_5_5))); + { + pack_8_16_32({5, 5, 5, 1}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_5_6_5))); + { + pack_8_16_32({5, 6, 5}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_6_5_5))); + { + pack_8_16_32({5, 5, 6}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8_A))); + a_.OpCase( + dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16))); + { + pack_8_16_32({8, 8, 8, 8}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10))); + a_.OpCase(dxbc::Src::LU( + uint32_t(xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16))); + { + pack_8_16_32({10, 10, 10, 2}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + 
a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8))); + { + pack_8_16_32({8, 8}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_4_4_4_4))); + { + pack_8_16_32({4, 4, 4, 4}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11))); + a_.OpCase( + dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16))); + { + pack_8_16_32({11, 11, 10}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10))); + a_.OpCase( + dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16))); + { + pack_8_16_32({10, 11, 11}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16))); + { + pack_8_16_32({16}); + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16))); + { + pack_8_16_32({16, 16}); + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16))); + { + flush_nan(0b1111); + + a_.OpIf(true, num_format_signed); + { + // Signed. + a_.OpIf(true, num_format_integer); + { + // Signed integer. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM), dxbc::Src::LF(float(INT16_MIN)), + dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM), dxbc::Src::LF(float(INT16_MAX)), + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Signed normalized. 
+ eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM), dxbc::Src::LF(-1.0f), dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM), dxbc::Src::LF(1.0f), dxbc::Src::R(eM)); + a_.OpMul(dxbc::Dest::R(eM), dxbc::Src::R(eM), + dxbc::Src::LF(float(INT16_MAX))); + } + } + a_.OpEndIf(); + + // Add plus/minus 0.5 before truncating according to the Direct3D format + // conversion rules, and convert to signed integers. + uint32_t round_bias_temp = PushSystemTemp(); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM), dxbc::Src::LU(31), dxbc::Src::LU(0), + dxbc::Src::LF(0.5f), dxbc::Src::R(eM)); + a_.OpAdd(dxbc::Dest::R(eM), dxbc::Src::R(eM), + dxbc::Src::R(round_bias_temp)); + a_.OpFToI(dxbc::Dest::R(eM), dxbc::Src::R(eM)); + } + // Release round_bias_temp. + PopSystemTemp(); + } + a_.OpElse(); + { + // Unsigned. + a_.OpIf(true, num_format_integer); + { + // Unsigned integer. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMax(dxbc::Dest::R(eM), dxbc::Src::LF(0.0f), dxbc::Src::R(eM)); + a_.OpMin(dxbc::Dest::R(eM), dxbc::Src::LF(float(UINT16_MAX)), + dxbc::Src::R(eM)); + } + } + a_.OpElse(); + { + // Unsigned normalized. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + // Saturate. 
+ a_.OpMov(dxbc::Dest::R(eM), dxbc::Src::R(eM), true); + a_.OpMul(dxbc::Dest::R(eM), dxbc::Src::R(eM), + dxbc::Src::LF(float(UINT16_MAX))); + } + } + a_.OpEndIf(); + + // Add 0.5 before truncating according to the Direct3D format conversion + // rules, and convert to unsigned integers. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpAdd(dxbc::Dest::R(eM), dxbc::Src::R(eM), dxbc::Src::LF(0.5f)); + a_.OpFToU(dxbc::Dest::R(eM), dxbc::Src::R(eM)); + } + } + a_.OpEndIf(); + + // Pack. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpBFI(dxbc::Dest::R(eM, 0b0011), dxbc::Src::LU(16), + dxbc::Src::LU(16), dxbc::Src::R(eM, 0b1101), + dxbc::Src::R(eM, 0b1000)); + } + + a_.OpMov(element_size_dest, dxbc::Src::LU(3)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_FLOAT))); + { + // TODO(Triang3l): Use extended range conversion. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpF32ToF16(dxbc::Dest::R(eM, 0b0001), + dxbc::Src::R(eM, dxbc::Src::kXXXX)); + } + a_.OpMov(element_size_dest, dxbc::Src::LU(1)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT))); + { + // TODO(Triang3l): Use extended range conversion. 
+ eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpF32ToF16(dxbc::Dest::R(eM, 0b0011), dxbc::Src::R(eM)); + a_.OpBFI(dxbc::Dest::R(eM, 0b0001), dxbc::Src::LU(16), + dxbc::Src::LU(16), dxbc::Src::R(eM, dxbc::Src::kYYYY), + dxbc::Src::R(eM, dxbc::Src::kXXXX)); + } + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT))); + { + // TODO(Triang3l): Use extended range conversion. + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpF32ToF16(dxbc::Dest::R(eM), dxbc::Src::R(eM)); + a_.OpBFI(dxbc::Dest::R(eM, 0b0011), dxbc::Src::LU(16), + dxbc::Src::LU(16), dxbc::Src::R(eM, 0b1101), + dxbc::Src::R(eM, 0b1000)); + } + a_.OpMov(element_size_dest, dxbc::Src::LU(3)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_FLOAT))); + { + // Already in eM#. + a_.OpMov(element_size_dest, dxbc::Src::LU(2)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT))); + { + // Already in eM#. + a_.OpMov(element_size_dest, dxbc::Src::LU(3)); + } + a_.OpBreak(); + + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT))); + { + // Already in eM#. + a_.OpMov(element_size_dest, dxbc::Src::LU(4)); + } + a_.OpBreak(); + + a_.OpDefault(); + a_.OpMov(element_size_dest, dxbc::Src::LU(UINT32_MAX)); + a_.OpBreak(); + } + // Close the color format switch. + a_.OpEndSwitch(); + + dxbc::Src element_size_src(dxbc::Src::R(temp, dxbc::Src::kXXXX)); + + // Only temp.x is used currently (for the element size log2). + + // Do endian swap, using temp.y for the endianness value, and temp.z as a + // temporary value. 
+ { + dxbc::Dest endian_dest(dxbc::Dest::R(temp, 0b0010)); + dxbc::Src endian_src(dxbc::Src::R(temp, dxbc::Src::kYYYY)); + // Extract endianness into temp.y. + a_.OpUBFE(endian_dest, dxbc::Src::LU(3), dxbc::Src::LU(0), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kZZZZ)); + + // Change 8-in-64 and 8-in-128 to 8-in-32. + for (uint32_t i = 0; i < 2; ++i) { + a_.OpIEq(dxbc::Dest::R(temp, 0b0100), endian_src, + dxbc::Src::LU(uint32_t(i ? xenos::Endian128::k8in128 + : xenos::Endian128::k8in64))); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + a_.OpMovC(dxbc::Dest::R(eM), dxbc::Src::R(temp, dxbc::Src::kZZZZ), + dxbc::Src::R(eM, i ? 0b00011011 : 0b10110001), + dxbc::Src::R(eM)); + } + a_.OpMovC(endian_dest, dxbc::Src::R(temp, dxbc::Src::kZZZZ), + dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)), endian_src); + } + + uint32_t swap_temp = PushSystemTemp(); + dxbc::Dest swap_temp_dest(dxbc::Dest::R(swap_temp)); + dxbc::Src swap_temp_src(dxbc::Src::R(swap_temp)); + + // 8-in-16 or one half of 8-in-32. + a_.OpSwitch(endian_src); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in16))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + dxbc::Dest eM_dest(dxbc::Dest::R(eM)); + dxbc::Src eM_src(dxbc::Src::R(eM)); + // Temp = X0Z0. + a_.OpAnd(swap_temp_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); + // eM = YZW0. + a_.OpUShR(eM_dest, eM_src, dxbc::Src::LU(8)); + // eM = Y0W0. + a_.OpAnd(eM_dest, eM_src, dxbc::Src::LU(0x00FF00FF)); + // eM = YXWZ. + a_.OpUMAd(eM_dest, swap_temp_src, dxbc::Src::LU(256), eM_src); + } + a_.OpBreak(); + a_.OpEndSwitch(); + + // 16-in-32 or another half of 8-in-32. 
+ a_.OpSwitch(endian_src); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32))); + a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k16in32))); + eM_remaining = export_eM; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + uint32_t eM = system_temps_memexport_data_[eM_index]; + dxbc::Dest eM_dest(dxbc::Dest::R(eM)); + dxbc::Src eM_src(dxbc::Src::R(eM)); + // Temp = ZW00. + a_.OpUShR(swap_temp_dest, eM_src, dxbc::Src::LU(16)); + // eM = ZWXY. + a_.OpBFI(eM_dest, dxbc::Src::LU(16), dxbc::Src::LU(16), eM_src, + swap_temp_src); + } + a_.OpBreak(); + a_.OpEndSwitch(); + + // Release swap_temp. + PopSystemTemp(); + } + + // Extract the base index to temp.y and the index upper bound to temp.z. + a_.OpUBFE(dxbc::Dest::R(temp, 0b0110), dxbc::Src::LU(23), dxbc::Src::LU(0), + dxbc::Src::R(system_temp_memexport_address_, 0b1101 << 2)); + dxbc::Dest eM0_address_dest(dxbc::Dest::R(temp, 0b0010)); + dxbc::Src eM0_address_src(dxbc::Src::R(temp, dxbc::Src::kYYYY)); + dxbc::Src index_count_src(dxbc::Src::R(temp, dxbc::Src::kZZZZ)); + + // Check if eM0 isn't out of bounds via temp.w - if it is, eM1...4 also are + // (the base index can't be negative). + a_.OpILT(dxbc::Dest::R(temp, 0b1000), eM0_address_src, index_count_src); + a_.OpIf(true, dxbc::Src::R(temp, dxbc::Src::kWWWW)); + + // Extract the base address to temp.w as bytes (30 lower bits to 30 upper bits + // with 0 below). + a_.OpIShL(dxbc::Dest::R(temp, 0b1000), + dxbc::Src::R(system_temp_memexport_address_, dxbc::Src::kXXXX), + dxbc::Src::LU(2)); + dxbc::Src base_address_src(dxbc::Src::R(temp, dxbc::Src::kWWWW)); + + uint8_t export_eM14 = export_eM >> 1; + assert_zero(export_eM14 >> 4); + uint32_t eM14_address_temp = UINT32_MAX, store_eM14_temp = UINT32_MAX; + if (export_eM14) { + // Get eM1...4 indices and check if they're in bounds. 
+ eM14_address_temp = PushSystemTemp(); + dxbc::Dest eM14_address_dest(dxbc::Dest::R(eM14_address_temp, export_eM14)); + dxbc::Src eM14_address_src(dxbc::Src::R(eM14_address_temp)); + store_eM14_temp = PushSystemTemp(); + dxbc::Dest store_eM14_dest(dxbc::Dest::R(store_eM14_temp, export_eM14)); + dxbc::Src store_eM14_src(dxbc::Src::R(store_eM14_temp)); + a_.OpIAdd(eM14_address_dest, eM0_address_src, dxbc::Src::LU(1, 2, 3, 4)); + a_.OpILT(store_eM14_dest, eM14_address_src, index_count_src); + // Check if eM1...4 were actually written by the invocation and merge the + // result with store_eM14_temp. + uint32_t eM14_written_temp = PushSystemTemp(); + a_.OpIBFE(dxbc::Dest::R(eM14_written_temp, export_eM14), dxbc::Src::LU(1), + dxbc::Src::LU(1, 2, 3, 4), + dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kYYYY)); + a_.OpAnd(store_eM14_dest, store_eM14_src, dxbc::Src::R(eM14_written_temp)); + // Release eM14_written_temp. + PopSystemTemp(); + // Convert eM1...4 indices to global byte addresses. + a_.OpIShL(eM14_address_dest, eM14_address_src, element_size_src); + a_.OpIAdd(eM14_address_dest, base_address_src, eM14_address_src); + } + if (export_eM & 0b1) { + // Convert eM0 index to a global byte address if it's needed. + a_.OpIShL(eM0_address_dest, eM0_address_src, element_size_src); + a_.OpIAdd(eM0_address_dest, base_address_src, eM0_address_src); + // base_address_src and index_count_src are deallocated at this point (even + // if eM0 isn't potentially written), temp.zw are now free. + // Extract if eM0 was actually written by the invocation to temp.z. + a_.OpIBFE(dxbc::Dest::R(temp, 0b0100), dxbc::Src::LU(1), dxbc::Src::LU(0), + dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_, + dxbc::Src::kYYYY)); + } + dxbc::Src eM0_written_src(dxbc::Src::R(temp, dxbc::Src::kZZZZ)); + + // Write depending on the element size. 
+ // No switch case will be entered for an unknown format (UINT32_MAX size + // written), so writing won't be attempted for it. + if (uav_index_shared_memory_ == kBindingIndexUnallocated) { + uav_index_shared_memory_ = uav_count_++; + } + uint8_t eM14_remaining; + uint32_t eM14_index; + a_.OpSwitch(element_size_src); + + // 8bpp, 16bpp. + dxbc::Dest atomic_dest(dxbc::Dest::U( + uav_index_shared_memory_, uint32_t(UAVRegister::kSharedMemory), 0)); + for (uint32_t i = 0; i <= 1; ++i) { + a_.OpCase(dxbc::Src::LU(i)); + dxbc::Src width_src(dxbc::Src::LU(8 << i)); + uint32_t sub_dword_temp = PushSystemTemp(); + if (export_eM & 0b1) { + a_.OpIf(true, eM0_written_src); + // sub_dword_temp.x = eM0 offset in the dword (8 << (byte_address & 3)) + // (assuming a little-endian host). + a_.OpBFI(dxbc::Dest::R(sub_dword_temp, 0b0001), dxbc::Src::LU(2), + dxbc::Src::LU(3), eM0_address_src, dxbc::Src::LU(0)); + // Keep only the dword part of the address. + a_.OpAnd(eM0_address_dest, eM0_address_src, dxbc::Src::LU(~uint32_t(3))); + // Erase the bits that will be replaced with eM0 via sub_dword_temp.y. + a_.OpBFI(dxbc::Dest::R(sub_dword_temp, 0b0010), width_src, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kXXXX), dxbc::Src::LU(0), + dxbc::Src::LU(UINT32_MAX)); + a_.OpAtomicAnd(atomic_dest, eM0_address_src, 0b0001, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kYYYY)); + // Add the eM0 bits via sub_dword_temp.y. + a_.OpBFI(dxbc::Dest::R(sub_dword_temp, 0b0010), width_src, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kXXXX), + dxbc::Src::R(system_temps_memexport_data_[0], dxbc::Src::kXXXX), + dxbc::Src::LU(0)); + a_.OpAtomicOr(atomic_dest, eM0_address_src, 0b0001, + dxbc::Src::R(sub_dword_temp, dxbc::Src::kYYYY)); + a_.OpEndIf(); + } + if (export_eM14) { + // sub_dword_temp = eM# offset in the dword (8 << (byte_address & 3)) + // (assuming a little-endian host). 
+      a_.OpBFI(dxbc::Dest::R(sub_dword_temp, export_eM14), dxbc::Src::LU(2),
+               dxbc::Src::LU(3), dxbc::Src::R(eM14_address_temp),
+               dxbc::Src::LU(0));
+      // Keep only the dword part of the address.
+      a_.OpAnd(dxbc::Dest::R(eM14_address_temp, export_eM14),
+               dxbc::Src::R(eM14_address_temp), dxbc::Src::LU(~uint32_t(3)));
+      uint32_t sub_dword_data_temp = PushSystemTemp();
+      eM14_remaining = export_eM14;
+      while (xe::bit_scan_forward(eM14_remaining, &eM14_index)) {
+        eM14_remaining &= ~(uint8_t(1) << eM14_index);
+        a_.OpIf(true, dxbc::Src::R(store_eM14_temp).Select(eM14_index));
+        // Erase the bits that will be replaced with eM# via
+        // sub_dword_data_temp.x.
+        a_.OpBFI(dxbc::Dest::R(sub_dword_data_temp, 0b0001), width_src,
+                 dxbc::Src::R(sub_dword_temp).Select(eM14_index),
+                 dxbc::Src::LU(0), dxbc::Src::LU(UINT32_MAX));
+        a_.OpAtomicAnd(
+            atomic_dest, dxbc::Src::R(eM14_address_temp).Select(eM14_index),
+            0b0001, dxbc::Src::R(sub_dword_data_temp, dxbc::Src::kXXXX));
+        // Add the eM# bits via sub_dword_data_temp.x.
+        a_.OpBFI(dxbc::Dest::R(sub_dword_data_temp, 0b0001), width_src,
+                 dxbc::Src::R(sub_dword_temp).Select(eM14_index),
+                 dxbc::Src::R(system_temps_memexport_data_[1 + eM14_index],
+                              dxbc::Src::kXXXX),
+                 dxbc::Src::LU(0));
+        a_.OpAtomicOr(
+            atomic_dest, dxbc::Src::R(eM14_address_temp).Select(eM14_index),
+            0b0001, dxbc::Src::R(sub_dword_data_temp, dxbc::Src::kXXXX));
+        a_.OpEndIf();
+      }
+      // Release sub_dword_data_temp.
+      PopSystemTemp();
+    }
+    // Release sub_dword_temp.
+    PopSystemTemp();
+    a_.OpBreak();
+  }
+
+  // 32bpp, 64bpp, 128bpp.
+  for (uint32_t i = 2; i <= 4; ++i) {
+    a_.OpCase(dxbc::Src::LU(i));
+    // Store (0b0001), Store2 (0b0011), Store4 (0b1111).
+ uint32_t store_mask = (uint32_t(1) << (uint32_t(1) << (i - 2))) - 1; + dxbc::Dest store_dest(dxbc::Dest::U(uav_index_shared_memory_, + uint32_t(UAVRegister::kSharedMemory), + store_mask)); + if (export_eM & 0b1) { + a_.OpIf(true, eM0_written_src); + a_.OpStoreRaw(store_dest, eM0_address_src, + dxbc::Src::R(system_temps_memexport_data_[0])); + a_.OpEndIf(); + } + eM14_remaining = export_eM14; + while (xe::bit_scan_forward(eM14_remaining, &eM14_index)) { + eM14_remaining &= ~(uint8_t(1) << eM14_index); + a_.OpIf(true, dxbc::Src::R(store_eM14_temp).Select(eM14_index)); + a_.OpStoreRaw(store_dest, + dxbc::Src::R(eM14_address_temp).Select(eM14_index), + dxbc::Src::R(system_temps_memexport_data_[1 + eM14_index])); + a_.OpEndIf(); + } + a_.OpBreak(); + } + + // Close the element size switch. + a_.OpEndSwitch(); + + if (export_eM14) { + // Release eM14_address_temp and store_eM14_temp. + PopSystemTemp(2); + } + + // Close the eM0 bounds check. a_.OpEndIf(); - // Release control_temp. + // Release temp. PopSystemTemp(); + + // Close the address correctness conditional. + a_.OpEndIf(); + + // Close the memory export allowed conditional. + a_.OpEndIf(); } } // namespace gpu diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index e9c21d801..c3d57438d 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -672,7 +672,7 @@ class Shader { // For implementation without unconditional support for memory writes from // vertex shaders, vertex shader converted to a compute shader doing only // memory export. - kMemexportCompute, + kMemExportCompute, // 4 host vertices for 1 guest vertex, for implementations without // unconditional geometry shader support. @@ -769,9 +769,16 @@ class Shader { } }; - // Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game - // .pdb. 
- static constexpr uint32_t kMaxMemExports = 16; + struct ControlFlowMemExportInfo { + // Which eM elements have potentially (regardless of conditionals, loop + // iteration counts, predication) been written earlier in the predecessor + // graph of the instruction since an `alloc export`. + uint8_t eM_potentially_written_before = 0; + // For exec sequences, which eM elements are potentially (regardless of + // predication) written by the instructions in the sequence. For other + // control flow instructions, it's 0. + uint8_t eM_potentially_written_by_exec = 0; + }; class Translation { public: @@ -879,19 +886,21 @@ class Shader { return constant_register_map_; } - // uint5[Shader::kMaxMemExports] - bits indicating which eM# registers have - // been written to after each `alloc export`, for up to Shader::kMaxMemExports - // exports. This will contain zero for certain corrupt exports - for those to - // which a valid eA was not written via a MAD with a stream constant. - const uint8_t* memexport_eM_written() const { return memexport_eM_written_; } + // Information about memory export state at each control flow instruction. May + // be empty if there are no eM# writes. + const std::vector& cf_memexport_info() const { + return cf_memexport_info_; + } - // All c# registers used as the addend in MAD operations to eA. + uint8_t memexport_eM_written() const { return memexport_eM_written_; } + uint8_t memexport_eM_potentially_written_before_end() const { + return memexport_eM_potentially_written_before_end_; + } + + // c# registers used as the addend in MAD operations to eA. const std::set& memexport_stream_constants() const { return memexport_stream_constants_; } - bool is_valid_memexport_used() const { - return !memexport_stream_constants_.empty(); - } // Labels that jumps (explicit or from loops) can be done to. 
const std::set& label_addresses() const { return label_addresses_; } @@ -969,7 +978,7 @@ class Shader { // TODO(Triang3l): Investigate what happens to memexport when the pixel // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early // depth/stencil. - return !kills_pixels() && !writes_depth() && !is_valid_memexport_used(); + return !kills_pixels() && !writes_depth() && !memexport_eM_written(); } // Whether each color render target is written to on any execution path. @@ -1041,8 +1050,6 @@ class Shader { std::vector vertex_bindings_; std::vector texture_bindings_; ConstantRegisterMap constant_register_map_ = {0}; - uint8_t memexport_eM_written_[kMaxMemExports] = {}; - std::set memexport_stream_constants_; std::set label_addresses_; uint32_t cf_pair_index_bound_ = 0; uint32_t register_static_address_bound_ = 0; @@ -1054,6 +1061,17 @@ class Shader { bool uses_texture_fetch_instruction_results_ = false; bool writes_depth_ = false; + // Memory export eM write info for each control flow instruction, if there are + // any eM writes in the shader. + std::vector cf_memexport_info_; + // Which memexport elements (eM#) are written for any memexport in the shader. + uint8_t memexport_eM_written_ = 0; + // ControlFlowMemExportInfo::eM_potentially_written_before equivalent for the + // end of the shader, for the last memory export (or exports if the end has + // multiple predecessor chains exporting to memory). + uint8_t memexport_eM_potentially_written_before_end_ = 0; + std::set memexport_stream_constants_; + // Modification bits -> translation. 
std::unordered_map translations_; @@ -1063,8 +1081,7 @@ class Shader { void GatherExecInformation( const ParsedExecInstruction& instr, ucode::VertexFetchInstruction& previous_vfetch_full, - uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer); + uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer); void GatherVertexFetchInformation( const ucode::VertexFetchInstruction& op, ucode::VertexFetchInstruction& previous_vfetch_full, @@ -1073,13 +1090,12 @@ class Shader { uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer); void GatherAluInstructionInformation(const ucode::AluInstruction& op, - uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, + uint32_t exec_cf_index, StringBuffer& ucode_disasm_buffer); void GatherOperandInformation(const InstructionOperand& operand); void GatherFetchResultInformation(const InstructionResult& result); void GatherAluResultInformation(const InstructionResult& result, - uint32_t memexport_alloc_current_count); + uint32_t exec_cf_index); }; } // namespace gpu diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 88c7f95f4..d381edc94 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -87,8 +87,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { VertexFetchInstruction previous_vfetch_full; std::memset(&previous_vfetch_full, 0, sizeof(previous_vfetch_full)); uint32_t unique_texture_bindings = 0; - uint32_t memexport_alloc_count = 0; - uint32_t memexport_eA_written = 0; for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) { ControlFlowInstruction cf_ab[2]; UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab); @@ -111,8 +109,7 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { ParsedExecInstruction instr; ParseControlFlowExec(cf.exec, cf_index, instr); GatherExecInformation(instr, 
previous_vfetch_full, - unique_texture_bindings, memexport_alloc_count, - memexport_eA_written, ucode_disasm_buffer); + unique_texture_bindings, ucode_disasm_buffer); } break; case ControlFlowOpcode::kCondExec: case ControlFlowOpcode::kCondExecEnd: @@ -122,16 +119,14 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { ParsedExecInstruction instr; ParseControlFlowCondExec(cf.cond_exec, cf_index, instr); GatherExecInformation(instr, previous_vfetch_full, - unique_texture_bindings, memexport_alloc_count, - memexport_eA_written, ucode_disasm_buffer); + unique_texture_bindings, ucode_disasm_buffer); } break; case ControlFlowOpcode::kCondExecPred: case ControlFlowOpcode::kCondExecPredEnd: { ParsedExecInstruction instr; ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index, instr); GatherExecInformation(instr, previous_vfetch_full, - unique_texture_bindings, memexport_alloc_count, - memexport_eA_written, ucode_disasm_buffer); + unique_texture_bindings, ucode_disasm_buffer); } break; case ControlFlowOpcode::kLoopStart: { ParsedLoopStartInstruction instr; @@ -173,9 +168,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { ParseControlFlowAlloc(cf.alloc, cf_index, type() == xenos::ShaderType::kVertex, instr); instr.Disassemble(&ucode_disasm_buffer); - if (instr.type == AllocType::kMemory) { - ++memexport_alloc_count; - } } break; case ControlFlowOpcode::kMarkVsFetchDone: break; @@ -187,7 +179,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { constant_register_map_.bool_bitmap[bool_constant_index / 32] |= uint32_t(1) << (bool_constant_index % 32); } - // TODO(benvanik): break if (DoesControlFlowOpcodeEndShader(cf.opcode()))? } } ucode_disassembly_ = ucode_disasm_buffer.to_string(); @@ -206,17 +197,125 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) { } } - // Cleanup invalid/unneeded memexport allocs. 
- for (uint32_t i = 0; i < kMaxMemExports; ++i) { - if (!(memexport_eA_written & (uint32_t(1) << i))) { - memexport_eM_written_[i] = 0; - } else if (!memexport_eM_written_[i]) { - memexport_eA_written &= ~(uint32_t(1) << i); + if (!cf_memexport_info_.empty()) { + // Gather potentially "dirty" memexport elements before each control flow + // instruction. `alloc` (any, not only `export`) flushes the previous memory + // export. On the guest GPU, yielding / serializing also terminates memory + // exports, but for simplicity disregarding that, as that functionally does + // nothing compared to flushing the previous memory export only at `alloc` + // or even only specifically at `alloc export`, Microsoft's validator checks + // if eM# aren't written after a `serialize`. + std::vector successor_stack; + for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) { + ControlFlowInstruction eM_writing_cf_ab[2]; + UnpackControlFlowInstructions(ucode_data_.data() + i * 3, + eM_writing_cf_ab); + for (uint32_t j = 0; j < 2; ++j) { + uint32_t eM_writing_cf_index = i * 2 + j; + uint32_t eM_written_by_cf_instr = + cf_memexport_info_[eM_writing_cf_index] + .eM_potentially_written_by_exec; + if (eM_writing_cf_ab[j].opcode() == ControlFlowOpcode::kCondCall) { + // Until subroutine calls are handled accurately, assume that all eM# + // have potentially been written by the subroutine for simplicity. + eM_written_by_cf_instr = memexport_eM_written_; + } + if (!eM_written_by_cf_instr) { + continue; + } + + // If the control flow instruction potentially results in any eM# being + // written, mark those eM# as potentially written before each successor. 
+ bool is_successor_graph_head = true; + successor_stack.push_back(eM_writing_cf_index); + while (!successor_stack.empty()) { + uint32_t successor_cf_index = successor_stack.back(); + successor_stack.pop_back(); + + ControlFlowMemExportInfo& successor_memexport_info = + cf_memexport_info_[successor_cf_index]; + if ((successor_memexport_info.eM_potentially_written_before & + eM_written_by_cf_instr) == eM_written_by_cf_instr) { + // Already marked as written before this instruction (and thus + // before all its successors too). Possibly this instruction is in a + // loop, in this case an instruction may succeed itself. + break; + } + // The first instruction in the traversal is the writing instruction + // itself, not its successor. However, if it has been visited by the + // traversal twice, it's in a loop, so it succeeds itself, and thus + // writes from it are potentially done before it too. + if (!is_successor_graph_head) { + successor_memexport_info.eM_potentially_written_before |= + eM_written_by_cf_instr; + } + is_successor_graph_head = false; + + ControlFlowInstruction successor_cf_ab[2]; + UnpackControlFlowInstructions( + ucode_data_.data() + (successor_cf_index >> 1) * 3, + successor_cf_ab); + const ControlFlowInstruction& successor_cf = + successor_cf_ab[successor_cf_index & 1]; + + bool next_instr_is_new_successor = true; + switch (successor_cf.opcode()) { + case ControlFlowOpcode::kExecEnd: + // One successor: end. + memexport_eM_potentially_written_before_end_ |= + eM_written_by_cf_instr; + next_instr_is_new_successor = false; + break; + case ControlFlowOpcode::kCondExecEnd: + case ControlFlowOpcode::kCondExecPredEnd: + case ControlFlowOpcode::kCondExecPredCleanEnd: + // Two successors: next, end. + memexport_eM_potentially_written_before_end_ |= + eM_written_by_cf_instr; + break; + case ControlFlowOpcode::kLoopStart: + // Two successors: next, skip. 
+ successor_stack.push_back(successor_cf.loop_start.address()); + break; + case ControlFlowOpcode::kLoopEnd: + // Two successors: next, repeat. + successor_stack.push_back(successor_cf.loop_end.address()); + break; + case ControlFlowOpcode::kCondCall: + // Two successors: next, target. + successor_stack.push_back(successor_cf.cond_call.address()); + break; + case ControlFlowOpcode::kReturn: + // Currently treating all subroutine calls as potentially writing + // all eM# for simplicity, so just exit the subroutine. + next_instr_is_new_successor = false; + break; + case ControlFlowOpcode::kCondJmp: + // One or two successors: next if conditional, target. + successor_stack.push_back(successor_cf.cond_jmp.address()); + if (successor_cf.cond_jmp.is_unconditional()) { + next_instr_is_new_successor = false; + } + break; + case ControlFlowOpcode::kAlloc: + // Any `alloc` ends the previous export. + next_instr_is_new_successor = false; + break; + default: + break; + } + if (next_instr_is_new_successor) { + if (successor_cf_index < (cf_pair_index_bound_ << 1)) { + successor_stack.push_back(successor_cf_index + 1); + } else { + memexport_eM_potentially_written_before_end_ |= + eM_written_by_cf_instr; + } + } + } + } } } - if (memexport_eA_written == 0) { - memexport_stream_constants_.clear(); - } is_ucode_analyzed_ = true; @@ -250,8 +349,7 @@ uint32_t Shader::GetInterpolatorInputMask(reg::SQ_PROGRAM_CNTL sq_program_cntl, void Shader::GatherExecInformation( const ParsedExecInstruction& instr, ucode::VertexFetchInstruction& previous_vfetch_full, - uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) { + uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer) { instr.Disassemble(&ucode_disasm_buffer); uint32_t sequence = instr.sequence; for (uint32_t instr_offset = instr.instruction_address; @@ -273,8 +371,7 @@ void Shader::GatherExecInformation( } } else { auto& op = 
*reinterpret_cast(op_ptr); - GatherAluInstructionInformation(op, memexport_alloc_current_count, - memexport_eA_written, + GatherAluInstructionInformation(op, instr.dword_index, ucode_disasm_buffer); } } @@ -381,8 +478,8 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op, } void Shader::GatherAluInstructionInformation( - const AluInstruction& op, uint32_t memexport_alloc_current_count, - uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) { + const AluInstruction& op, uint32_t exec_cf_index, + StringBuffer& ucode_disasm_buffer) { ParsedAluInstruction instr; ParseAluInstruction(op, type(), instr); instr.Disassemble(&ucode_disasm_buffer); @@ -394,10 +491,8 @@ void Shader::GatherAluInstructionInformation( (ucode::GetAluScalarOpcodeInfo(op.scalar_opcode()).changed_state & ucode::kAluOpChangedStatePixelKill); - GatherAluResultInformation(instr.vector_and_constant_result, - memexport_alloc_current_count); - GatherAluResultInformation(instr.scalar_result, - memexport_alloc_current_count); + GatherAluResultInformation(instr.vector_and_constant_result, exec_cf_index); + GatherAluResultInformation(instr.scalar_result, exec_cf_index); for (size_t i = 0; i < instr.vector_operand_count; ++i) { GatherOperandInformation(instr.vector_operands[i]); } @@ -405,9 +500,7 @@ void Shader::GatherAluInstructionInformation( GatherOperandInformation(instr.scalar_operands[i]); } - // Store used memexport constants because CPU code needs addresses and sizes, - // and also whether there have been writes to eA and eM# for register - // allocation in shader translator implementations. + // Store used memexport constants because CPU code needs addresses and sizes. 
// eA is (hopefully) always written to using: // mad eA, r#, const0100, c# // (though there are some exceptions, shaders in 4D5307E6 for some reason set @@ -416,13 +509,9 @@ void Shader::GatherAluInstructionInformation( // Export is done to vector_dest of the ucode instruction for both vector and // scalar operations - no need to check separately. if (instr.vector_and_constant_result.storage_target == - InstructionStorageTarget::kExportAddress && - memexport_alloc_current_count > 0 && - memexport_alloc_current_count <= Shader::kMaxMemExports) { + InstructionStorageTarget::kExportAddress) { uint32_t memexport_stream_constant = instr.GetMemExportStreamConstant(); if (memexport_stream_constant != UINT32_MAX) { - memexport_eA_written |= uint32_t(1) - << (memexport_alloc_current_count - 1); memexport_stream_constants_.insert(memexport_stream_constant); } else { XELOGE( @@ -481,8 +570,8 @@ void Shader::GatherFetchResultInformation(const InstructionResult& result) { } } -void Shader::GatherAluResultInformation( - const InstructionResult& result, uint32_t memexport_alloc_current_count) { +void Shader::GatherAluResultInformation(const InstructionResult& result, + uint32_t exec_cf_index) { uint32_t used_write_mask = result.GetUsedWriteMask(); if (!used_write_mask) { return; @@ -504,11 +593,12 @@ void Shader::GatherAluResultInformation( writes_point_size_edge_flag_kill_vertex_ |= used_write_mask; break; case InstructionStorageTarget::kExportData: - if (memexport_alloc_current_count > 0 && - memexport_alloc_current_count <= Shader::kMaxMemExports) { - memexport_eM_written_[memexport_alloc_current_count - 1] |= - uint32_t(1) << result.storage_index; + memexport_eM_written_ |= uint8_t(1) << result.storage_index; + if (cf_memexport_info_.empty()) { + cf_memexport_info_.resize(2 * cf_pair_index_bound_); } + cf_memexport_info_[exec_cf_index].eM_potentially_written_by_exec |= + uint32_t(1) << result.storage_index; break; case InstructionStorageTarget::kColor: writes_color_targets_ 
|= uint32_t(1) << result.storage_index; @@ -665,7 +755,13 @@ void ShaderTranslator::TranslateControlFlowInstruction( case ControlFlowOpcode::kAlloc: { ParsedAllocInstruction instr; ParseControlFlowAlloc(cf.alloc, cf_index_, is_vertex_shader(), instr); - ProcessAllocInstruction(instr); + const std::vector& cf_memexport_info = + current_shader().cf_memexport_info(); + ProcessAllocInstruction(instr, + instr.dword_index < cf_memexport_info.size() + ? cf_memexport_info[instr.dword_index] + .eM_potentially_written_before + : 0); } break; case ControlFlowOpcode::kMarkVsFetchDone: break; @@ -807,6 +903,14 @@ void ParseControlFlowAlloc(const ControlFlowAllocInstruction& cf, void ShaderTranslator::TranslateExecInstructions( const ParsedExecInstruction& instr) { ProcessExecInstructionBegin(instr); + + const std::vector& cf_memexport_info = + current_shader().cf_memexport_info(); + uint8_t eM_potentially_written_before = + instr.dword_index < cf_memexport_info.size() + ? cf_memexport_info[instr.dword_index].eM_potentially_written_before + : 0; + const uint32_t* ucode_dwords = current_shader().ucode_data().data(); uint32_t sequence = instr.sequence; for (uint32_t instr_offset = instr.instruction_address; @@ -832,9 +936,22 @@ void ShaderTranslator::TranslateExecInstructions( auto& op = *reinterpret_cast(op_ptr); ParsedAluInstruction alu_instr; ParseAluInstruction(op, current_shader().type(), alu_instr); - ProcessAluInstruction(alu_instr); + ProcessAluInstruction(alu_instr, eM_potentially_written_before); + if (alu_instr.vector_and_constant_result.storage_target == + InstructionStorageTarget::kExportData && + alu_instr.vector_and_constant_result.GetUsedWriteMask()) { + eM_potentially_written_before |= + uint8_t(1) << alu_instr.vector_and_constant_result.storage_index; + } + if (alu_instr.scalar_result.storage_target == + InstructionStorageTarget::kExportData && + alu_instr.scalar_result.GetUsedWriteMask()) { + eM_potentially_written_before |= + uint8_t(1) << 
alu_instr.scalar_result.storage_index; + } } } + ProcessExecInstructionEnd(instr); } diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index 0e764fe30..bcce051bd 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -118,8 +118,10 @@ class ShaderTranslator { virtual void ProcessReturnInstruction(const ParsedReturnInstruction& instr) {} // Handles translation for jump instructions. virtual void ProcessJumpInstruction(const ParsedJumpInstruction& instr) {} - // Handles translation for alloc instructions. - virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr) {} + // Handles translation for alloc instructions. Memory exports for eM# + // indicated by export_eM must be performed, regardless of the alloc type. + virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr, + uint8_t export_eM) {} // Handles translation for vertex fetch instructions. virtual void ProcessVertexFetchInstruction( @@ -128,7 +130,13 @@ class ShaderTranslator { virtual void ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) {} // Handles translation for ALU instructions. - virtual void ProcessAluInstruction(const ParsedAluInstruction& instr) {} + // memexport_eM_potentially_written_before needs to be handled by `kill` + // instruction to make sure memory exports for the eM# writes earlier in + // previous execs and the current exec are done before the invocation becomes + // inactive. 
+ virtual void ProcessAluInstruction( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) {} private: void TranslateControlFlowInstruction(const ucode::ControlFlowInstruction& cf); diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index b7da0678d..9889fb630 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -134,7 +134,7 @@ class SpirvShaderTranslator : public ShaderTranslator { // (32-bit only - 16-bit indices are always fetched via the Vulkan index // buffer). kSysFlag_VertexIndexLoad = 1u << kSysFlag_VertexIndexLoad_Shift, - // For HostVertexShaderTypes kMemexportCompute, kPointListAsTriangleStrip, + // For HostVertexShaderTypes kMemExportCompute, kPointListAsTriangleStrip, // kRectangleListAsTriangleStrip, whether the vertex index needs to be // loaded from the index buffer (rather than using autogenerated indices), // and whether it's 32-bit. This is separate from kSysFlag_VertexIndexLoad @@ -427,7 +427,9 @@ class SpirvShaderTranslator : public ShaderTranslator { const ParsedVertexFetchInstruction& instr) override; void ProcessTextureFetchInstruction( const ParsedTextureFetchInstruction& instr) override; - void ProcessAluInstruction(const ParsedAluInstruction& instr) override; + void ProcessAluInstruction( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) override; private: struct TextureBinding { @@ -620,7 +622,7 @@ class SpirvShaderTranslator : public ShaderTranslator { assert_true(edram_fragment_shader_interlock_); return !is_depth_only_fragment_shader_ && !current_shader().writes_depth() && - !current_shader().is_valid_memexport_used(); + !current_shader().memexport_eM_written(); } void FSI_LoadSampleMask(spv::Id msaa_samples); void FSI_LoadEdramOffsets(spv::Id msaa_samples); diff --git a/src/xenia/gpu/spirv_shader_translator_alu.cc b/src/xenia/gpu/spirv_shader_translator_alu.cc index 
47978dd00..05e41d5ab 100644 --- a/src/xenia/gpu/spirv_shader_translator_alu.cc +++ b/src/xenia/gpu/spirv_shader_translator_alu.cc @@ -67,7 +67,8 @@ void SpirvShaderTranslator::KillPixel(spv::Id condition) { } void SpirvShaderTranslator::ProcessAluInstruction( - const ParsedAluInstruction& instr) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before) { if (instr.IsNop()) { // Don't even disassemble or update predication. return; diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index edecafe7f..5ae943f62 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -210,7 +210,7 @@ enum class AllocType : uint32_t { kVsInterpolators = 2, // Pixel shader exports colors. kPsColors = 2, - // MEMEXPORT? + // Memory export. kMemory = 3, }; @@ -1782,6 +1782,9 @@ inline uint32_t GetAluVectorOpNeededSourceComponents( .operand_components_used[src_index - 1]; } +// eM# (kExportData) register count. +constexpr uint32_t kMaxMemExportElementCount = 5; + enum class ExportRegister : uint32_t { kVSInterpolator0 = 0, kVSInterpolator1, diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index ece50a2f2..b1697dd06 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2187,7 +2187,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - bool memexport_used_vertex = vertex_shader->is_valid_memexport_used(); + bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0; // Pixel shader analysis. 
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 8f16690b3..4c1b30534 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -497,6 +497,18 @@ enum class TextureFormat : uint32_t { k_6_5_5 = 5, k_8_8_8_8 = 6, k_2_10_10_10 = 7, + // Possibly similar to k_8, but may be storing alpha instead of red when + // resolving/memexporting, though not exactly known. From the point of view of + // sampling, it should be treated the same as k_8 (given that textures have + // the last - and single-component textures have the only - component + // replicated into all the remaining ones before the swizzle). + // Used as: + // - Texture in 4B4E083C - text, starting from the "Loading..." and the "This + // game saves data automatically" messages. The swizzle in the fetch + // constant is 111W (suggesting that internally the only component may be + // the alpha one, not red). + // TODO(Triang3l): Investigate how k_8_A and k_8_B work in resolves and + // memexports, whether they store alpha/blue of the input or red. k_8_A = 8, k_8_B = 9, k_8_8 = 10, @@ -510,6 +522,12 @@ enum class TextureFormat : uint32_t { // Used for videos in 54540829. k_Y1_Cr_Y0_Cb_REP = 12, k_16_16_EDRAM = 13, + // Likely same as k_8_8_8_8. + // Used as: + // - Memexport destination in 4D5308BC - multiple small draws when looking + // back at the door behind the player in the first room of gameplay. + // - Memexport destination in 4D53085B and 4D530919 - in 4D53085B, in a frame + // between the intro video and the main menu, in a 8192-point draw. 
k_8_8_8_8_A = 14, k_4_4_4_4 = 15, k_10_11_11 = 16, @@ -1373,8 +1391,7 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); // memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also // interesting to see how alphatest interacts with it, whether it's still true // fixed-function alphatest, as it's claimed to be supported as usual by the -// extension specification - it's likely, however, that memory exports are -// discarded alongside other exports such as oC# and oDepth this way. +// extension specification. // // Y of eA contains the offset in elements - this is what shaders are supposed // to calculate from something like the vertex index. Again, it's specified as @@ -1397,6 +1414,69 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); // elements using packing via addition to 2^23, so this field also doesn't need // more bits than that. // +// According to the sequencer specification from IPR2015-00325 (where memexport +// is called "pass thru export"): +// - Pass thru exports can occur anywhere in the shader program. +// - There can be any number of pass thru exports. +// - The address register is not kept across clause boundaries, so it must be +// refreshed after any Serialize (or yield), allocate instruction or resource +// change. +// - The write to eM# may be predicated if the export is not needed. +// - Exports are dropped if: +// - The index is above the maximum. +// - The index sign bit is 1. +// - The exponent of the index is not 23. +// The requirement that eM4 must be written if any eM# other than eM0 is also +// written doesn't apply to the final Xenos, it's likely an outdated note in the +// specification considering that it's very preliminary. +// +// According to Microsoft's shader validator: +// - eA can be written only by `mad`. +// - A single eM# can be written by any number of instruction, including with +// write masking. +// - eA must be written before eM#. 
+// - Any alloc instruction or a `serialize` terminates the current memory +// export. This doesn't apply to `exec Yield=true`, however, and it's not +// clear if that's an oversight or if that's not considered a yield that +// terminates the export. +// +// From the emulation perspective, this means that: +// - Alloc instructions (`alloc export` mandatorily, other allocs optionally), +// and optionally `serialize` instructions within `exec`, should be treated as +// the locations where the currently open export should be flushed to the +// memory. It should be taken into account that an export may be in looping +// control flow, and in this case it must be performed at every iteration. +// - Whether each eM# was written to must be tracked at shader execution time, +// as predication can disable the export of an element. +// +// TODO(Triang3l): Investigate how memory export interacts with pixel killing. +// Given that eM# writes disabled by predication don't cause an export, it's +// possible that killed invocations are treated as inactive (invalid in Xenos +// terms) overall, and thus new memory exports from them shouldn't be done, but +// that's not verified. However, given that on Direct3D 11+, OpenGL and Vulkan +// hosts, discarding disables subsequent storage resource writes, on the host, +// it would be natural to perform all outstanding memory exports before +// discarding if the kill condition passes. +// +// Memory exports can be performed to any ColorFormat, including 8bpp and 16bpp +// ones. Hosts, however, may have the memory bound as a 32bpp buffer (for +// instance, due to the minimum resource view size limitation on Direct3D 11). +// In this case, bytes and shorts aren't addressable directly. 
However, taking +// into account that memory accesses are coherent within one shader invocation +// on Direct3D 11+, OpenGL and Vulkan and thus are done in order relatively to +// each other, it should be possible to implement them by clearing the bits via +// an atomic AND, and writing the new value using an atomic OR. This will, of +// course, make the entire write operation non-atomic, and in case of a race +// between writes to the same location, the final result may not even be just a +// value from one of the invocations, but rather, it can be OR of the values +// from any invocations involved. However, on the Xenos, there doesn't seem to +// be any possibility of meaningfully accessing the same location from multiple +// invocations if any of them is writing, memory exports are out-of-order, so +// such an implementation shouldn't be causing issues in reality. Atomic +// compare-exchange, however, should not be used for this purpose, as it may +// result in an infinite loop if different invocations want to write different +// values to the same memory location. 
+// // Examples of setup in titles (Z from MSB to LSB): // // 4D5307E6 particles (different VS invocation counts, like 1, 2, 4): @@ -1432,6 +1512,11 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6); // c0: Z = 010010110000|0|010|11|011010|00011|001 // 8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch // (16_16_16_16 is the largest color format without special values) +// +// 58410B86 hierarchical depth buffer occlusion culling with the result read on +// the CPU (15000 VS invocations in the main menu): +// c8: Z = 010010110000|0|010|00|000010|00000|000, count = invocation count +// No endian swap, 8, uint, RGBA union alignas(uint32_t) xe_gpu_memexport_stream_t { struct { uint32_t dword_0; diff --git a/src/xenia/kernel/xam/xam_content.cc b/src/xenia/kernel/xam/xam_content.cc index 1b0b9c844..a08da87bd 100644 --- a/src/xenia/kernel/xam/xam_content.cc +++ b/src/xenia/kernel/xam/xam_content.cc @@ -119,6 +119,8 @@ dword_result_t XamContentCreateEnumerator_entry( } DECLARE_XAM_EXPORT1(XamContentCreateEnumerator, kContent, kImplemented); +enum class kDispositionState : uint32_t { Unknown = 0, Create = 1, Open = 2 }; + dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, lpvoid_t content_data_ptr, dword_t content_data_size, dword_t flags, @@ -146,40 +148,37 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, content_data, disposition_ptr, license_mask_ptr, overlapped_ptr]( uint32_t& extended_error, uint32_t& length) -> X_RESULT { X_RESULT result = X_ERROR_INVALID_PARAMETER; - bool create = false; - bool open = false; + kDispositionState disposition = kDispositionState::Unknown; switch (flags & 0xF) { case 1: // CREATE_NEW // Fail if exists. if (content_manager->ContentExists(content_data)) { result = X_ERROR_ALREADY_EXISTS; } else { - create = true; + disposition = kDispositionState::Create; } break; case 2: // CREATE_ALWAYS // Overwrite existing, if any. 
if (content_manager->ContentExists(content_data)) { content_manager->DeleteContent(content_data); - create = true; - } else { - create = true; } + disposition = kDispositionState::Create; break; case 3: // OPEN_EXISTING // Open only if exists. if (!content_manager->ContentExists(content_data)) { result = X_ERROR_PATH_NOT_FOUND; } else { - open = true; + disposition = kDispositionState::Open; } break; case 4: // OPEN_ALWAYS // Create if needed. if (!content_manager->ContentExists(content_data)) { - create = true; + disposition = kDispositionState::Create; } else { - open = true; + disposition = kDispositionState::Open; } break; case 5: // TRUNCATE_EXISTING @@ -188,7 +187,7 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, result = X_ERROR_PATH_NOT_FOUND; } else { content_manager->DeleteContent(content_data); - create = true; + disposition = kDispositionState::Create; } break; default: @@ -196,21 +195,12 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, break; } - // creation result - // 0 = ? - // 1 = created - // 2 = opened - uint32_t disposition = create ? 
1 : 2; - if (disposition_ptr) { - *disposition_ptr = disposition; - } - - if (create) { + if (disposition == kDispositionState::Create) { result = content_manager->CreateContent(root_name, content_data); if (XSUCCEEDED(result)) { content_manager->WriteContentHeaderFile(&content_data); } - } else if (open) { + } else if (disposition == kDispositionState::Open) { result = content_manager->OpenContent(root_name, content_data); } @@ -224,12 +214,11 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name, } extended_error = X_HRESULT_FROM_WIN32(result); - length = disposition; + length = static_cast(disposition); if (result && overlapped_ptr) { result = X_ERROR_FUNCTION_FAILED; } - return result; }; @@ -451,7 +440,6 @@ static_assert_size(X_SWAPDISC_ERROR_MESSAGE, 12); dword_result_t XamSwapDisc_entry( dword_t disc_number, pointer_t completion_handle, pointer_t error_message) { - xex2_opt_execution_info* info = nullptr; kernel_state()->GetExecutableModule()->GetOptHeader(XEX_HEADER_EXECUTION_INFO, &info); diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index e3230aea7..d1a240312 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -254,202 +254,15 @@ dword_result_t XGetLanguage_entry() { } DECLARE_XAM_EXPORT1(XGetLanguage, kNone, kImplemented); -// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/ -// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/ -dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) { - LARGE_INTEGER delay{}; - - // Convert the delay time to 100-nanosecond intervals - delay.QuadPart = dwMilliseconds == -1 - ? 
LLONG_MAX - : static_cast(-10000) * dwMilliseconds; - - X_STATUS result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable, - (uint64_t*)&delay); - - // If the delay was interrupted by an APC, keep delaying the thread - while (bAlertable && result == X_STATUS_ALERTED) { - result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable, - (uint64_t*)&delay); - } - - return result == X_STATUS_SUCCESS ? X_STATUS_SUCCESS : X_STATUS_USER_APC; -} -DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented); - -dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) { - return RtlSleep_entry(dwMilliseconds, bAlertable); -} -DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented); - -// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep -void Sleep_entry(dword_t dwMilliseconds) { - RtlSleep_entry(dwMilliseconds, FALSE); -} -DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented); - -// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount -dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); } -DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented); - dword_result_t XamGetCurrentTitleId_entry() { return kernel_state()->emulator()->title_id(); } DECLARE_XAM_EXPORT1(XamGetCurrentTitleId, kNone, kImplemented); -dword_result_t RtlSetLastNTError_entry(dword_t error_code) { - const uint32_t result = - xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code); - XThread::SetLastError(result); - - return result; +dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) { + return ctx->kernel_state->title_id() == 0xFFFE07D1; } -DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented); - -dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); } -DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented); - -dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); } -DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented); - -dword_result_t 
GetModuleHandleA_entry(lpstring_t module_name) { - xe::be module_ptr = 0; - const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle( - module_name.value(), &module_ptr); - - if (XFAILED(error_code)) { - RtlSetLastNTError_entry(error_code); - - return NULL; - } - - return (uint32_t)module_ptr; -} -DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented); - -dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes, - dword_t dwStackSize, - lpvoid_t lpStartAddress, - lpvoid_t lpParameter, - dword_t dwCreationFlags, dword_t unkn, - lpdword_t lpThreadId) { - uint32_t flags = (dwCreationFlags >> 2) & 1; - - if (unkn != -1) { - flags |= 1 << unkn << 24; - } - - xe::be result = 0; - - const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread( - &result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags); - - if (XFAILED(error_code)) { - RtlSetLastNTError_entry(error_code); - - return NULL; - } - - return (uint32_t)result; -} -DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented); - -dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes, - dword_t dwStackSize, lpvoid_t lpStartAddress, - lpvoid_t lpParameter, dword_t dwCreationFlags, - lpdword_t lpThreadId) { - return XapipCreateThread_entry(lpThreadAttributes, dwStackSize, - lpStartAddress, lpParameter, dwCreationFlags, - -1, lpThreadId); -} -DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented); - -dword_result_t CloseHandle_entry(dword_t hObject) { - const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject); - - if (XFAILED(error_code)) { - RtlSetLastNTError_entry(error_code); - - return false; - } - - return true; -} -DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented); - -dword_result_t ResumeThread_entry(dword_t hThread) { - uint32_t suspend_count; - const X_STATUS error_code = - xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count); - - if (XFAILED(error_code)) { - RtlSetLastNTError_entry(error_code); - - return -1; - } - - return 
suspend_count; -} -DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented); - -void ExitThread_entry(dword_t exit_code) { - xe::kernel::xboxkrnl::ExTerminateThread(exit_code); -} -DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented); - -dword_result_t GetCurrentThreadId_entry() { - return XThread::GetCurrentThread()->GetCurrentThreadId(); -} -DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented); - -qword_result_t XapiFormatTimeOut_entry(lpqword_t result, - dword_t dwMilliseconds) { - LARGE_INTEGER delay{}; - - // Convert the delay time to 100-nanosecond intervals - delay.QuadPart = - dwMilliseconds == -1 ? 0 : static_cast(-10000) * dwMilliseconds; - - return (uint64_t)&delay; -} -DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented); - -dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle, - dword_t dwMilliseconds, - dword_t bAlertable) { - uint64_t* timeout = nullptr; - uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds); - - X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx( - hHandle, 1, bAlertable, &timeout_ptr); - - while (bAlertable && result == X_STATUS_ALERTED) { - result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx( - hHandle, 1, bAlertable, &timeout_ptr); - } - - RtlSetLastNTError_entry(result); - result = -1; - - return result; -} -DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented); - -dword_result_t WaitForSingleObject_entry(dword_t hHandle, - dword_t dwMilliseconds) { - return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0); -} -DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented); - -dword_result_t lstrlenW_entry(lpu16string_t string) { - // wcslen? 
- if (string) { - return (uint32_t)string.value().length(); - } - - return NULL; -} -DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented); +DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented); dword_result_t XamGetExecutionId_entry(lpdword_t info_ptr) { auto module = kernel_state()->GetExecutableModule(); @@ -611,16 +424,204 @@ dword_result_t XamQueryLiveHiveW_entry(lpu16string_t name, lpvoid_t out_buf, } DECLARE_XAM_EXPORT1(XamQueryLiveHiveW, kNone, kStub); -dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) { - return ctx->kernel_state->title_id() == 0xFFFE07D1; +// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/ +// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/ +dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) { + LARGE_INTEGER delay{}; + + // Convert the delay time to 100-nanosecond intervals + delay.QuadPart = dwMilliseconds == -1 + ? LLONG_MAX + : static_cast(-10000) * dwMilliseconds; + + X_STATUS result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable, + (uint64_t*)&delay); + + // If the delay was interrupted by an APC, keep delaying the thread + while (bAlertable && result == X_STATUS_ALERTED) { + result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable, + (uint64_t*)&delay); + } + + return result == X_STATUS_SUCCESS ? 
X_STATUS_SUCCESS : X_STATUS_USER_APC; } -DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented); +DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented); + +dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) { + return RtlSleep_entry(dwMilliseconds, bAlertable); +} +DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented); + +// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep +void Sleep_entry(dword_t dwMilliseconds) { + RtlSleep_entry(dwMilliseconds, FALSE); +} +DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented); + +// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount +dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); } +DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented); + +dword_result_t RtlSetLastNTError_entry(dword_t error_code) { + const uint32_t result = + xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code); + XThread::SetLastError(result); + + return result; +} +DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented); + +dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); } +DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented); + +dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); } +DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented); + +dword_result_t GetModuleHandleA_entry(lpstring_t module_name) { + xe::be module_ptr = 0; + const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle( + module_name.value(), &module_ptr); + + if (XFAILED(error_code)) { + RtlSetLastNTError_entry(error_code); + + return NULL; + } + + return (uint32_t)module_ptr; +} +DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented); + +dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes, + dword_t dwStackSize, + lpvoid_t lpStartAddress, + lpvoid_t lpParameter, + dword_t dwCreationFlags, dword_t unkn, + lpdword_t lpThreadId) { + uint32_t flags = (dwCreationFlags >> 2) & 1; + + if 
(unkn != -1) { + flags |= 1 << unkn << 24; + } + + xe::be result = 0; + + const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread( + &result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags); + + if (XFAILED(error_code)) { + RtlSetLastNTError_entry(error_code); + + return NULL; + } + + return (uint32_t)result; +} +DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented); + +dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes, + dword_t dwStackSize, lpvoid_t lpStartAddress, + lpvoid_t lpParameter, dword_t dwCreationFlags, + lpdword_t lpThreadId) { + return XapipCreateThread_entry(lpThreadAttributes, dwStackSize, + lpStartAddress, lpParameter, dwCreationFlags, + -1, lpThreadId); +} +DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented); + +dword_result_t CloseHandle_entry(dword_t hObject) { + const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject); + + if (XFAILED(error_code)) { + RtlSetLastNTError_entry(error_code); + + return false; + } + + return true; +} +DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented); + +dword_result_t ResumeThread_entry(dword_t hThread) { + uint32_t suspend_count; + const X_STATUS error_code = + xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count); + + if (XFAILED(error_code)) { + RtlSetLastNTError_entry(error_code); + + return -1; + } + + return suspend_count; +} +DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented); + +void ExitThread_entry(dword_t exit_code) { + xe::kernel::xboxkrnl::ExTerminateThread(exit_code); +} +DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented); + +dword_result_t GetCurrentThreadId_entry() { + return XThread::GetCurrentThread()->GetCurrentThreadId(); +} +DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented); + +qword_result_t XapiFormatTimeOut_entry(lpqword_t result, + dword_t dwMilliseconds) { + LARGE_INTEGER delay{}; + + // Convert the delay time to 100-nanosecond intervals + delay.QuadPart = + dwMilliseconds == -1 ? 
0 : static_cast(-10000) * dwMilliseconds; + + return (uint64_t)&delay; +} +DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented); + +dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle, + dword_t dwMilliseconds, + dword_t bAlertable) { + uint64_t* timeout = nullptr; + uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds); + + X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx( + hHandle, 1, bAlertable, &timeout_ptr); + + while (bAlertable && result == X_STATUS_ALERTED) { + result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx( + hHandle, 1, bAlertable, &timeout_ptr); + } + + RtlSetLastNTError_entry(result); + result = -1; + + return result; +} +DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented); + +dword_result_t WaitForSingleObject_entry(dword_t hHandle, + dword_t dwMilliseconds) { + return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0); +} +DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented); + +dword_result_t lstrlenW_entry(lpu16string_t string) { + // wcslen? + if (string) { + return (uint32_t)string.value().length(); + } + + return NULL; +} +DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented); dword_result_t XGetAudioFlags_entry() { return 65537; } DECLARE_XAM_EXPORT1(XGetAudioFlags, kNone, kStub); /* - todo: this table should instead be pointed to by a member of kernel state and initialized along with the process + todo: this table should instead be pointed to by a member of kernel + state and initialized along with the process */ static int32_t XamRtlRandomTable[128] = { 1284227242, 1275210071, 573735546, 790525478, 2139871995, 1547161642,