Merge branch 'master' of https://github.com/xenia-project/xenia into canary_experimental

Gliniak committed 2023-09-01 18:20:29 +02:00
commit ce9a82ccf8
24 changed files with 1771 additions and 1184 deletions

View File

@ -217,6 +217,10 @@ std::vector<FileInfo> ListFiles(const std::filesystem::path& path) {
}
while (auto ent = readdir(dir)) {
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
continue;
}
FileInfo info;
info.name = ent->d_name;
@ -225,6 +229,7 @@ std::vector<FileInfo> ListFiles(const std::filesystem::path& path) {
info.create_timestamp = convertUnixtimeToWinFiletime(st.st_ctime);
info.access_timestamp = convertUnixtimeToWinFiletime(st.st_atime);
info.write_timestamp = convertUnixtimeToWinFiletime(st.st_mtime);
info.path = path;
if (ent->d_type == DT_DIR) {
info.type = FileInfo::Type::kDirectory;
info.total_size = 0;
@ -234,7 +239,7 @@ std::vector<FileInfo> ListFiles(const std::filesystem::path& path) {
}
result.push_back(info);
}
closedir(dir);
return result;
}
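For context, the pattern this hunk touches is the standard POSIX directory walk: skip the `.` and `..` pseudo-entries that readdir() reports, and always release the handle with closedir(). A minimal self-contained sketch of the same pattern (not xenia code, no FileInfo involved):

```cpp
#include <dirent.h>
#include <cstring>
#include <string>
#include <vector>

// Enumerate a directory, skipping the "." and ".." pseudo-entries,
// and close the handle on every path out of the loop.
std::vector<std::string> ListNames(const char* path) {
  std::vector<std::string> names;
  DIR* dir = opendir(path);
  if (!dir) {
    return names;
  }
  while (dirent* ent = readdir(dir)) {
    if (!std::strcmp(ent->d_name, ".") || !std::strcmp(ent->d_name, "..")) {
      continue;  // pseudo-entries, not real children
    }
    names.emplace_back(ent->d_name);
  }
  closedir(dir);
  return names;
}
```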

View File

@ -10,6 +10,7 @@
#include "xenia/base/utf8.h"
#include <algorithm>
#include <cstdint>
#include <locale>
#include <numeric>
#include <tuple>

View File

@ -481,6 +481,43 @@ struct VECTOR_COMPARE_UGT_V128
: Sequence<VECTOR_COMPARE_UGT_V128,
I<OPCODE_VECTOR_COMPARE_UGT, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
kX64EmitAVX512DQ) &&
(i.instr->flags != FLOAT32_TYPE)) {
Xmm src1 = e.xmm0;
if (i.src1.is_constant) {
e.LoadConstantXmm(src1, i.src1.constant());
} else {
src1 = i.src1;
}
Xmm src2 = e.xmm1;
if (i.src2.is_constant) {
e.LoadConstantXmm(src2, i.src2.constant());
} else {
src2 = i.src2;
}
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpub(e.k1, src1, src2, 0x6);
e.vpmovm2b(i.dest, e.k1);
break;
case INT16_TYPE:
e.vpcmpuw(e.k1, src1, src2, 0x6);
e.vpmovm2w(i.dest, e.k1);
break;
case INT32_TYPE:
e.vpcmpud(e.k1, src1, src2, 0x6);
e.vpmovm2d(i.dest, e.k1);
break;
default:
assert_always();
break;
}
return;
}
Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
switch (i.instr->flags) {
case INT8_TYPE:

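For reference, the new AVX-512 path maps onto well-known intrinsics. The 0x6 immediate passed to vpcmpub/vpcmpuw/vpcmpud is the unsigned not-less-or-equal predicate, i.e. greater-than, and vpmovm2b/w/d expands the resulting k-mask back into an all-ones/all-zeros lane vector. A sketch of the INT8 case (the intrinsics themselves require AVX512BW plus AVX512VL, in line with the feature check above):

```cpp
#include <immintrin.h>

// Equivalent of the emitted sequence for INT8_TYPE:
// vpcmpub k, a, b, 0x6  ->  per-byte unsigned a > b into a mask register
// vpmovm2b              ->  expand the mask to 0x00/0xFF byte lanes
__m128i CompareUgtU8(__m128i a, __m128i b) {
  __mmask16 k = _mm_cmpgt_epu8_mask(a, b);  // predicate 6: not less-or-equal
  return _mm_movm_epi8(k);
}
```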
View File

@ -646,8 +646,9 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
break;
case OPCODE_AND_NOT:
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
v->set_from(i->src1.value);
v->AndNot(i->src2.value);
v->set_from(i->src2.value);
v->Not();
v->And(i->src1.value);
i->UnlinkAndNOP();
result = true;
}
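The corrected fold evaluates OPCODE_AND_NOT as src1 & ~src2, negating the second operand first and then masking with the first, instead of the previous set_from(src1)/AndNot(src2) order. A scalar model with one worked value:

```cpp
#include <cstdint>

// Scalar model of the constant fold: dest = src1 & ~src2.
constexpr uint32_t AndNot(uint32_t src1, uint32_t src2) {
  return src1 & ~src2;
}
static_assert(AndNot(0b1100, 0b1010) == 0b0100,
              "keeps only src1 bits that are clear in src2");
```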

View File

@ -324,8 +324,13 @@ int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
Value* sum = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE,
ARITHMETIC_UNSIGNED);
Value* overflow = f.VectorCompareUGT(f.LoadVR(i.VX.VA), sum, INT32_TYPE);
Value* carry =
f.VectorShr(overflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE);
f.StoreVR(i.VX.VD, carry);
return 0;
}
int InstrEmit_vaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) {
@ -1665,7 +1670,11 @@ int InstrEmit_vsrw128(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_vsubcuw(PPCHIRBuilder& f, const InstrData& i) {
XEINSTRNOTIMPLEMENTED();
Value* underflow =
f.VectorCompareUGE(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE);
Value* borrow =
f.VectorShr(underflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE);
f.StoreVR(i.VX.VD, borrow);
return 1;
}
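Both emitters use the standard modular-arithmetic tests: an unsigned 32-bit add carries out exactly when the wrapped sum is less than an operand (hence VectorCompareUGT(VA, sum)), and vsubcuw stores 1 per lane when no borrow occurs, i.e. VA >= VB (hence VectorCompareUGE). The logical shift right by 31 turns each all-ones comparison lane into the 0/1 flag the result needs. A scalar model of one lane:

```cpp
#include <cstdint>

// vaddcuw per lane: carry-out of an unsigned add is 1 iff the sum wrapped,
// i.e. iff a > (a + b) mod 2^32. The vector code derives the same 0/1 by
// shifting the UGT all-ones mask right by 31.
uint32_t AddCarryOut(uint32_t a, uint32_t b) {
  uint32_t sum = a + b;  // wraps modulo 2^32
  return a > sum ? 1u : 0u;
}

// vsubcuw per lane: 1 when no borrow occurs (a >= b), 0 otherwise.
uint32_t SubNoBorrow(uint32_t a, uint32_t b) { return a >= b ? 1u : 0u; }
```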

View File

@ -2574,7 +2574,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
return false;
}
pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
const bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0;
// Pixel shader analysis.
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
@ -2604,7 +2605,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
}
const bool memexport_used_pixel =
pixel_shader && pixel_shader->is_valid_memexport_used();
pixel_shader && (pixel_shader->memexport_eM_written() != 0);
const bool memexport_used = memexport_used_vertex || memexport_used_pixel;
if (!BeginSubmission(true)) {
@ -2831,12 +2832,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// Gather memexport ranges and ensure the heaps for them are resident, and
// also load the data surrounding the export and to fill the regions that
// won't be modified by the shaders.
memexport_range_count_ = 0;
if (memexport_used_vertex || memexport_used_pixel) {
bool retflag;
bool retval = GatherMemexportRangesAndMakeResident(retflag);
if (retflag) return retval;
memexport_ranges_.clear();
if (memexport_used_vertex) {
draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_);
}
if (memexport_used_pixel) {
draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_);
}
for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
memexport_range.size_bytes)) {
XELOGE(
"Failed to request memexport stream at 0x{:08X} (size {}) in the "
"shared memory",
memexport_range.base_address_dwords << 2, memexport_range.size_bytes);
return false;
}
}
// Primitive topology.
D3D_PRIMITIVE_TOPOLOGY primitive_topology;
@ -2935,11 +2946,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// If the shared memory is a UAV, it can't be used as an index buffer
// (UAV is a read/write state, index buffer is a read-only state).
// Need to copy the indices to a buffer in the index buffer state.
bool retflag;
bool retval = HandleMemexportGuestDMA(
scratch_index_buffer, index_buffer_view,
primitive_processing_result.guest_index_base, retflag);
if (retflag) return retval;
scratch_index_buffer = RequestScratchGPUBuffer(
index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST);
if (scratch_index_buffer == nullptr) {
return false;
}
shared_memory_->UseAsCopySource();
SubmitBarriers();
deferred_command_list_.D3DCopyBufferRegion(
scratch_index_buffer, 0, shared_memory_->GetBuffer(),
primitive_processing_result.guest_index_base,
index_buffer_view.SizeInBytes);
PushTransitionBarrier(scratch_index_buffer,
D3D12_RESOURCE_STATE_COPY_DEST,
D3D12_RESOURCE_STATE_INDEX_BUFFER);
index_buffer_view.BufferLocation =
scratch_index_buffer->GetGPUVirtualAddress();
} else {
index_buffer_view.BufferLocation =
shared_memory_->GetGPUAddress() +
@ -2977,159 +2999,23 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
}
if (memexport_used) {
HandleMemexportDrawOrdering_AndReadback();
}
return true;
}
XE_COLD
XE_NOINLINE
bool D3D12CommandProcessor::HandleMemexportGuestDMA(
ID3D12Resource*& scratch_index_buffer,
D3D12_INDEX_BUFFER_VIEW& index_buffer_view, uint32_t guest_index_base,
// xe::gpu::PrimitiveProcessor::ProcessingResult&
// primitive_processing_result,
bool& retflag) {
retflag = true;
scratch_index_buffer = RequestScratchGPUBuffer(
index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST);
if (scratch_index_buffer == nullptr) {
return false;
}
shared_memory_->UseAsCopySource();
SubmitBarriers();
deferred_command_list_.D3DCopyBufferRegion(
scratch_index_buffer, 0, shared_memory_->GetBuffer(), guest_index_base,
index_buffer_view.SizeInBytes);
PushTransitionBarrier(scratch_index_buffer, D3D12_RESOURCE_STATE_COPY_DEST,
D3D12_RESOURCE_STATE_INDEX_BUFFER);
index_buffer_view.BufferLocation =
scratch_index_buffer->GetGPUVirtualAddress();
retflag = false;
return {};
}
XE_NOINLINE
XE_COLD
bool D3D12CommandProcessor::GatherMemexportRangesAndMakeResident(
bool& retflag) {
auto vertex_shader = static_cast<D3D12Shader*>(active_vertex_shader());
auto pixel_shader = static_cast<D3D12Shader*>(active_pixel_shader());
const xe::gpu::RegisterFile& regs = *register_file_;
const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
const bool memexport_used_pixel =
pixel_shader && pixel_shader->is_valid_memexport_used();
retflag = true;
if (memexport_used_vertex) {
for (uint32_t constant_index :
vertex_shader->memexport_stream_constants()) {
const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
if (memexport_stream.index_count == 0) {
continue;
}
uint32_t memexport_format_size =
GetSupportedMemExportFormatSize(memexport_stream.format);
if (memexport_format_size == 0) {
XELOGE("Unsupported memexport format {}",
FormatInfo::GetName(
xenos::TextureFormat(uint32_t(memexport_stream.format))));
return false;
}
uint32_t memexport_size_dwords =
memexport_stream.index_count * memexport_format_size;
// Try to reduce the number of shared memory operations when writing
// different elements into the same buffer through different exports
// (happens in 4D5307E6).
bool memexport_range_reused = false;
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
MemExportRange& memexport_range = memexport_ranges_[i];
if (memexport_range.base_address_dwords ==
memexport_stream.base_address) {
memexport_range.size_dwords =
std::max(memexport_range.size_dwords, memexport_size_dwords);
memexport_range_reused = true;
break;
}
}
// Add a new range if we haven't expanded an existing one.
if (!memexport_range_reused) {
MemExportRange& memexport_range =
memexport_ranges_[memexport_range_count_++];
memexport_range.base_address_dwords = memexport_stream.base_address;
memexport_range.size_dwords = memexport_size_dwords;
}
}
}
if (memexport_used_pixel) {
for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) {
const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4);
if (memexport_stream.index_count == 0) {
continue;
}
uint32_t memexport_format_size =
GetSupportedMemExportFormatSize(memexport_stream.format);
if (memexport_format_size == 0) {
XELOGE("Unsupported memexport format {}",
FormatInfo::GetName(
xenos::TextureFormat(uint32_t(memexport_stream.format))));
return false;
}
uint32_t memexport_size_dwords =
memexport_stream.index_count * memexport_format_size;
bool memexport_range_reused = false;
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
MemExportRange& memexport_range = memexport_ranges_[i];
if (memexport_range.base_address_dwords ==
memexport_stream.base_address) {
memexport_range.size_dwords =
std::max(memexport_range.size_dwords, memexport_size_dwords);
memexport_range_reused = true;
break;
}
}
if (!memexport_range_reused) {
MemExportRange& memexport_range =
memexport_ranges_[memexport_range_count_++];
memexport_range.base_address_dwords = memexport_stream.base_address;
memexport_range.size_dwords = memexport_size_dwords;
}
}
}
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
const MemExportRange& memexport_range = memexport_ranges_[i];
if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
memexport_range.size_dwords << 2)) {
XELOGE(
"Failed to request memexport stream at 0x{:08X} (size {}) in the "
"shared memory",
memexport_range.base_address_dwords << 2,
memexport_range.size_dwords << 2);
return false;
}
}
retflag = false;
return {};
}
XE_NOINLINE
XE_COLD
void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
// Make sure this memexporting draw is ordered with other work using shared
// memory as a UAV.
// TODO(Triang3l): Find some PM4 command that can be used for indication of
// when memexports should be awaited?
shared_memory_->MarkUAVWritesCommitNeeded();
// Invalidate textures in memexported memory and watch for changes.
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
const MemExportRange& memexport_range = memexport_ranges_[i];
shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2,
memexport_range.size_dwords << 2, false);
for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
shared_memory_->RangeWrittenByGpu(
memexport_range.base_address_dwords << 2, memexport_range.size_bytes,
false);
}
if (cvars::d3d12_readback_memexport) {
// Read the exported data on the CPU.
uint32_t memexport_total_size = 0;
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
memexport_total_size += memexport_ranges_[i].size_dwords << 2;
for (const draw_util::MemExportRange& memexport_range :
memexport_ranges_) {
memexport_total_size += memexport_range.size_bytes;
}
if (memexport_total_size != 0) {
ID3D12Resource* readback_buffer =
@ -3139,9 +3025,9 @@ void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
SubmitBarriers();
ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
uint32_t readback_buffer_offset = 0;
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
const MemExportRange& memexport_range = memexport_ranges_[i];
uint32_t memexport_range_size = memexport_range.size_dwords << 2;
for (const draw_util::MemExportRange& memexport_range :
memexport_ranges_) {
uint32_t memexport_range_size = memexport_range.size_bytes;
deferred_command_list_.D3DCopyBufferRegion(
readback_buffer, readback_buffer_offset, shared_memory_buffer,
memexport_range.base_address_dwords << 2, memexport_range_size);
@ -3154,14 +3040,14 @@ void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
void* readback_mapping;
if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
&readback_mapping))) {
const uint32_t* readback_dwords =
reinterpret_cast<const uint32_t*>(readback_mapping);
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
const MemExportRange& memexport_range = memexport_ranges_[i];
const uint8_t* readback_bytes =
reinterpret_cast<const uint8_t*>(readback_mapping);
for (const draw_util::MemExportRange& memexport_range :
memexport_ranges_) {
std::memcpy(memory_->TranslatePhysical(
memexport_range.base_address_dwords << 2),
readback_dwords, memexport_range.size_dwords << 2);
readback_dwords += memexport_range.size_dwords;
readback_bytes, memexport_range.size_bytes);
readback_bytes += memexport_range.size_bytes;
}
D3D12_RANGE readback_write_range = {};
readback_buffer->Unmap(0, &readback_write_range);
@ -3172,6 +3058,9 @@ void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
}
}
return true;
}
void D3D12CommandProcessor::InitializeTrace() {
CommandProcessor::InitializeTrace();
@ -5208,36 +5097,6 @@ bool D3D12CommandProcessor::UpdateBindings_BindfulPath(
return {};
}
uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize(
xenos::ColorFormat format) {
switch (format) {
case xenos::ColorFormat::k_8_8_8_8:
case xenos::ColorFormat::k_2_10_10_10:
// TODO(Triang3l): Investigate how k_8_8_8_8_A works - not supported in the
// texture cache currently.
// case xenos::ColorFormat::k_8_8_8_8_A:
case xenos::ColorFormat::k_10_11_11:
case xenos::ColorFormat::k_11_11_10:
case xenos::ColorFormat::k_16_16:
case xenos::ColorFormat::k_16_16_FLOAT:
case xenos::ColorFormat::k_32_FLOAT:
case xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16:
case xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16:
case xenos::ColorFormat::k_10_11_11_AS_16_16_16_16:
case xenos::ColorFormat::k_11_11_10_AS_16_16_16_16:
return 1;
case xenos::ColorFormat::k_16_16_16_16:
case xenos::ColorFormat::k_16_16_16_16_FLOAT:
case xenos::ColorFormat::k_32_32_FLOAT:
return 2;
case xenos::ColorFormat::k_32_32_32_32_FLOAT:
return 4;
default:
break;
}
return 0;
}
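This removed helper hard-coded a dword count per element for each memexport format the D3D12 backend accepted; the replacement path in draw_util::AddMemExportRanges (later in this commit) derives the element size from the format descriptor instead. A sketch of the equivalence:

```cpp
#include <cstdint>

// The new code sizes a stream as
//   index_count * (format_info.bits_per_pixel >> 3)
// which reproduces the removed table, e.g.:
//   k_32_FLOAT          ->  32 bpp ->  4 bytes (1 dword)
//   k_16_16_16_16       ->  64 bpp ->  8 bytes (2 dwords)
//   k_32_32_32_32_FLOAT -> 128 bpp -> 16 bytes (4 dwords)
uint32_t ElementSizeBytes(uint32_t bits_per_pixel) {
  return bits_per_pixel >> 3;
}
```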
ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
if (size == 0) {
return nullptr;

View File

@ -18,6 +18,7 @@
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "xenia/base/assert.h"
#include "xenia/gpu/command_processor.h"
@ -319,18 +320,7 @@ class D3D12CommandProcessor final : public CommandProcessor {
bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count,
IndexBufferInfo* index_buffer_info,
bool major_mode_explicit) override;
XE_COLD
XE_NOINLINE
bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer,
D3D12_INDEX_BUFFER_VIEW& index_buffer_view,
uint32_t guest_index_base,
bool& retflag);
XE_NOINLINE
XE_COLD
bool GatherMemexportRangesAndMakeResident(bool& retflag);
XE_NOINLINE
XE_COLD
void HandleMemexportDrawOrdering_AndReadback();
bool IssueCopy() override;
XE_NOINLINE
bool IssueCopy_ReadbackResolvePath();
@ -502,13 +492,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
const size_t sampler_count_vertex, const size_t sampler_count_pixel,
bool& retflag);
// Returns dword count for one element for a memexport format, or 0 if it's
// not supported by the D3D12 command processor (if it's smaller than 1 dword,
// for instance).
// TODO(Triang3l): Check if any game uses memexport with formats smaller than
// 32 bits per element.
static uint32_t GetSupportedMemExportFormatSize(xenos::ColorFormat format);
// Returns a buffer for reading GPU data back to the CPU. Assuming
// synchronizing immediately after use. Always in COPY_DEST state.
ID3D12Resource* RequestReadbackBuffer(uint32_t size);
@ -811,12 +794,13 @@ class D3D12CommandProcessor final : public CommandProcessor {
draw_util::GetViewportInfoArgs previous_viewport_info_args_;
draw_util::ViewportInfo previous_viewport_info_;
// scratch memexport data
MemExportRange memexport_ranges_[512];
uint32_t memexport_range_count_ = 0;
std::atomic<bool> pix_capture_requested_ = false;
bool pix_capturing_;
// Temporary storage for memexport stream constants used in the draw.
std::vector<draw_util::MemExportRange> memexport_ranges_;
};
} // namespace d3d12

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Copyright 2023 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -134,7 +134,7 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
//
// Memory export is an obvious intentional side effect.
if (shader.kills_pixels() || shader.writes_depth() ||
shader.is_valid_memexport_used() ||
shader.memexport_eM_written() ||
(shader.writes_color_target(0) &&
DoesCoverageDependOnAlpha(regs.Get<reg::RB_COLORCONTROL>()))) {
return true;
@ -765,8 +765,70 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
}
return normalized_color_mask;
}
void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
std::vector<MemExportRange>& ranges_out) {
if (!shader.memexport_eM_written()) {
// The shader has eA writes, but no real exports.
return;
}
uint32_t float_constants_base = shader.type() == xenos::ShaderType::kVertex
? regs.Get<reg::SQ_VS_CONST>().base
: regs.Get<reg::SQ_PS_CONST>().base;
for (uint32_t constant_index : shader.memexport_stream_constants()) {
const auto& stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
XE_GPU_REG_SHADER_CONSTANT_000_X +
(float_constants_base + constant_index) * 4);
if (!stream.index_count) {
continue;
}
const FormatInfo& format_info =
*FormatInfo::Get(xenos::TextureFormat(stream.format));
if (format_info.type != FormatType::kResolvable) {
XELOGE("Unsupported memexport format {}",
FormatInfo::GetName(format_info.format));
// Translated shaders shouldn't be performing exports with an unknown
// format; the draw can still be performed.
continue;
}
// TODO(Triang3l): Remove the unresearched format logging when it's known
// how exactly these formats need to be handled (most importantly what
// components need to be stored and in which order).
switch (stream.format) {
case xenos::ColorFormat::k_8_A:
case xenos::ColorFormat::k_8_B:
case xenos::ColorFormat::k_8_8_8_8_A:
XELOGW(
"Memexport done to an unresearched format {}, report the game to "
"Xenia developers!",
FormatInfo::GetName(format_info.format));
break;
default:
break;
}
uint32_t stream_size_bytes =
stream.index_count * (format_info.bits_per_pixel >> 3);
// Try to reduce the number of shared memory operations when writing
// different elements into the same buffer through different exports
// (happens in 4D5307E6).
bool range_reused = false;
for (MemExportRange& range : ranges_out) {
if (range.base_address_dwords == stream.base_address) {
range.size_bytes = std::max(range.size_bytes, stream_size_bytes);
range_reused = true;
break;
}
}
// Add a new range if we haven't expanded an existing one.
if (!range_reused) {
ranges_out.emplace_back(stream.base_address, stream_size_bytes);
}
}
}
XE_NOINLINE
XE_NOALIAS
xenos::CopySampleSelect SanitizeCopySampleSelect(
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
bool is_depth) {

View File

@ -13,6 +13,7 @@
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>
#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
@ -474,6 +475,19 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA(
return guest_sample_index ? 3 : 0;
}
struct MemExportRange {
uint32_t base_address_dwords;
uint32_t size_bytes;
explicit MemExportRange(uint32_t base_address_dwords, uint32_t size_bytes)
: base_address_dwords(base_address_dwords), size_bytes(size_bytes) {}
};
// Gathers memory ranges involved in memexports in the shader with the float
// constants from the registers, adding them to ranges_out.
void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
std::vector<MemExportRange>& ranges_out);
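Usage follows the pattern the D3D12 command processor adopts earlier in this commit: gather ranges from each shader that exports, then request residency per merged range. A condensed sketch, with local names matching the IssueDraw changes above:

```cpp
// Condensed from the IssueDraw changes in this commit: gather the ranges,
// then make each merged range resident in shared memory.
std::vector<draw_util::MemExportRange> ranges;
draw_util::AddMemExportRanges(regs, *vertex_shader, ranges);
if (pixel_shader) {
  draw_util::AddMemExportRanges(regs, *pixel_shader, ranges);
}
for (const draw_util::MemExportRange& range : ranges) {
  // base_address_dwords is in dwords; RequestRange takes bytes.
  shared_memory->RequestRange(range.base_address_dwords << 2,
                              range.size_bytes);
}
```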
// To avoid passing values that the shader won't understand (even though
// Direct3D 9 shouldn't pass them anyway).
XE_NOINLINE

View File

@ -913,6 +913,8 @@ enum class OperandModifier : uint32_t {
struct Dest : OperandAddress {
// Ignored for 0-component and 1-component operand types.
// For 4-component operand types, if the write mask is 0, it's treated as
// 0-component.
uint32_t write_mask_;
// Input destinations (v*) are for use only in declarations. Vector input
@ -1028,12 +1030,16 @@ struct Dest : OperandAddress {
void Write(std::vector<uint32_t>& code, bool in_dcl = false) const {
uint32_t operand_token = GetOperandTokenTypeAndIndex();
OperandDimension dimension = GetDimension(in_dcl);
operand_token |= uint32_t(dimension);
if (dimension == OperandDimension::kVector) {
assert_true(write_mask_ > 0b0000 && write_mask_ <= 0b1111);
if (write_mask_) {
assert_true(write_mask_ <= 0b1111);
operand_token |=
(uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4);
} else {
dimension = OperandDimension::kNoData;
}
}
operand_token |= uint32_t(dimension);
code.push_back(operand_token);
OperandAddress::Write(code);
}
@ -1508,6 +1514,8 @@ enum class Opcode : uint32_t {
kStoreUAVTyped = 164,
kLdRaw = 165,
kStoreRaw = 166,
kAtomicAnd = 169,
kAtomicOr = 170,
kEvalSampleIndex = 204,
kEvalCentroid = 205,
};
@ -2396,6 +2404,14 @@ class Assembler {
++stat_.instruction_count;
++stat_.c_texture_store_instructions;
}
void OpAtomicAnd(const Dest& dest, const Src& address,
uint32_t address_components, const Src& value) {
EmitAtomicOp(Opcode::kAtomicAnd, dest, address, address_components, value);
}
void OpAtomicOr(const Dest& dest, const Src& address,
uint32_t address_components, const Src& value) {
EmitAtomicOp(Opcode::kAtomicOr, dest, address, address_components, value);
}
void OpEvalSampleIndex(const Dest& dest, const Src& value,
const Src& sample_index) {
uint32_t dest_write_mask = dest.GetMask();
@ -2522,6 +2538,22 @@ class Assembler {
src1.Write(code_, true, 0b0000);
++stat_.instruction_count;
}
void EmitAtomicOp(Opcode opcode, const Dest& dest, const Src& address,
uint32_t address_components, const Src& value) {
// Atomic operations require a 0-component memory destination.
assert_zero(dest.GetMask());
uint32_t address_mask = (1 << address_components) - 1;
uint32_t operands_length = dest.GetLength() +
address.GetLength(address_mask) +
value.GetLength(0b0001);
code_.reserve(code_.size() + 1 + operands_length);
code_.push_back(OpcodeToken(opcode, operands_length));
dest.Write(code_);
address.Write(code_, true, address_mask);
value.Write(code_, true, 0b0001);
++stat_.instruction_count;
++stat_.c_interlocked_instructions;
}
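A hypothetical call site for the new helpers, with the operand values assumed purely for illustration: atomically clearing bits in one dword of a raw UAV. The memory destination must carry an empty write mask, which is exactly what the Dest::Write change earlier in this file encodes as a 0-component operand.

```cpp
// Hypothetical usage sketch (uav_dest is constructed by the caller with an
// empty write mask): InterlockedAnd of one raw-buffer dword with ~bits.
void EmitClearBits(dxbc::Assembler& a, const dxbc::Dest& uav_dest,
                   const dxbc::Src& byte_address, uint32_t bits) {
  // One address component: the byte offset within the raw buffer.
  a.OpAtomicAnd(uav_dest, byte_address, /*address_components=*/1,
                dxbc::Src::LU(~bits));
}
```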
std::vector<uint32_t>& code_;
Statistics& stat_;

View File

@ -179,8 +179,6 @@ void DxbcShaderTranslator::Reset() {
sampler_bindings_.clear();
memexport_alloc_current_count_ = 0;
std::memset(&shader_feature_info_, 0, sizeof(shader_feature_info_));
std::memset(&statistics_, 0, sizeof(statistics_));
}
@ -789,6 +787,63 @@ void DxbcShaderTranslator::StartPixelShader() {
PopSystemTemp();
}
}
if (current_shader().memexport_eM_written()) {
// Make sure memexport is done only once for a guest pixel.
dxbc::Dest memexport_enabled_dest(
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001));
dxbc::Src memexport_enabled_src(dxbc::Src::R(
system_temp_memexport_enabled_and_eM_written_, dxbc::Src::kXXXX));
uint32_t resolution_scaled_axes =
uint32_t(draw_resolution_scale_x_ > 1) |
(uint32_t(draw_resolution_scale_y_ > 1) << 1);
if (resolution_scaled_axes) {
uint32_t memexport_condition_temp = PushSystemTemp();
// Only do memexport for one host pixel in a guest pixel - prefer the
// host pixel closer to the center of the guest pixel, but one that's
// covered with the half-pixel offset according to the top-left rule (1
// for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x
// because it's the center and is covered with the half-pixel offset too).
in_position_used_ |= resolution_scaled_axes;
a_.OpFToU(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
dxbc::Src::V1D(in_reg_ps_position_));
a_.OpUDiv(dxbc::Dest::Null(),
dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
dxbc::Src::R(memexport_condition_temp),
dxbc::Src::LU(draw_resolution_scale_x_,
draw_resolution_scale_y_, 0, 0));
a_.OpIEq(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
dxbc::Src::R(memexport_condition_temp),
dxbc::Src::LU(draw_resolution_scale_x_ >> 1,
draw_resolution_scale_y_ >> 1, 0, 0));
for (uint32_t i = 0; i < 2; ++i) {
if (!(resolution_scaled_axes & (1 << i))) {
continue;
}
a_.OpAnd(memexport_enabled_dest, memexport_enabled_src,
dxbc::Src::R(memexport_condition_temp).Select(i));
}
// Release memexport_condition_temp.
PopSystemTemp();
}
// With sample-rate shading (with float24 conversion), only do memexport
// from one sample (as the shader is invoked multiple times for a pixel),
// if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage,
// firstbit_lo returns 0xFFFFFFFF.
if (IsSampleRate()) {
uint32_t memexport_condition_temp = PushSystemTemp();
a_.OpFirstBitLo(dxbc::Dest::R(memexport_condition_temp, 0b0001),
dxbc::Src::VCoverage());
a_.OpIEq(
dxbc::Dest::R(memexport_condition_temp, 0b0001),
dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY),
dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX));
a_.OpAnd(memexport_enabled_dest, memexport_enabled_src,
dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX));
// Release memexport_condition_temp.
PopSystemTemp();
}
}
}
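In scalar terms, the resolution-scaling gate keeps memexport enabled for exactly one host pixel per guest pixel: the one whose offset inside the guest pixel equals scale >> 1 on each scaled axis. OpUDiv with a null quotient destination yields the remainder the comparison needs. A model of the test:

```cpp
#include <cstdint>

// Scalar model of the gate: truncate the pixel position to an integer
// (OpFToU), take the remainder modulo the scale (OpUDiv with a null
// quotient destination), and compare with scale >> 1 (OpIEq).
bool IsMemexportHostPixel(uint32_t host_x, uint32_t host_y,
                          uint32_t scale_x, uint32_t scale_y) {
  return host_x % scale_x == (scale_x >> 1) &&
         host_y % scale_y == (scale_y >> 1);
}
```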
void DxbcShaderTranslator::StartTranslation() {
@ -885,34 +940,27 @@ void DxbcShaderTranslator::StartTranslation() {
}
}
if (!is_depth_only_pixel_shader_) {
// Allocate temporary registers for memexport addresses and data.
std::memset(system_temps_memexport_address_, 0xFF,
sizeof(system_temps_memexport_address_));
std::memset(system_temps_memexport_data_, 0xFF,
sizeof(system_temps_memexport_data_));
system_temp_memexport_written_ = UINT32_MAX;
const uint8_t* memexports_written = current_shader().memexport_eM_written();
for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) {
uint32_t memexport_alloc_written = memexports_written[i];
if (memexport_alloc_written == 0) {
continue;
}
// If memexport is used at all, allocate a register containing whether eM#
// have actually been written to.
if (system_temp_memexport_written_ == UINT32_MAX) {
system_temp_memexport_written_ = PushSystemTemp(0b1111);
}
system_temps_memexport_address_[i] = PushSystemTemp(0b1111);
uint32_t memexport_data_index;
while (xe::bit_scan_forward(memexport_alloc_written,
&memexport_data_index)) {
memexport_alloc_written &= ~(1u << memexport_data_index);
system_temps_memexport_data_[i][memexport_data_index] =
PushSystemTemp();
// Allocate temporary registers for memexport.
uint8_t memexport_eM_written = current_shader().memexport_eM_written();
if (memexport_eM_written) {
system_temp_memexport_enabled_and_eM_written_ = PushSystemTemp(0b0010);
// Initialize the memexport conditional to whether the shared memory is
// currently bound as UAV (to 0 or UINT32_MAX). It can be made narrower
// later.
a_.OpIBFE(
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001),
dxbc::Src::LU(1), dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift),
LoadFlagsSystemConstant());
system_temp_memexport_address_ = PushSystemTemp(0b1111);
uint8_t memexport_eM_remaining = memexport_eM_written;
uint32_t memexport_eM_index;
while (xe::bit_scan_forward(memexport_eM_remaining, &memexport_eM_index)) {
memexport_eM_remaining &= ~(uint8_t(1) << memexport_eM_index);
system_temps_memexport_data_[memexport_eM_index] = PushSystemTemp(0b1111);
}
}
if (!is_depth_only_pixel_shader_) {
// Allocate system temporary variables for the translated code. Since access
// depends on the guest code (thus no guarantees), initialize everything
// now (except for pv, it's an internal temporary variable, not accessible
@ -1091,27 +1139,19 @@ void DxbcShaderTranslator::CompleteShaderCode() {
// - system_temp_grad_h_lod_.
// - system_temp_grad_v_vfetch_address_.
PopSystemTemp(6);
}
// Write memexported data to the shared memory UAV.
ExportToMemory();
uint8_t memexport_eM_written = current_shader().memexport_eM_written();
if (memexport_eM_written) {
// Write data for the last memexport.
ExportToMemory(
current_shader().memexport_eM_potentially_written_before_end());
// Release memexport temporary registers.
for (int i = Shader::kMaxMemExports - 1; i >= 0; --i) {
if (system_temps_memexport_address_[i] == UINT32_MAX) {
continue;
}
// Release exported data registers.
for (int j = 4; j >= 0; --j) {
if (system_temps_memexport_data_[i][j] != UINT32_MAX) {
PopSystemTemp();
}
}
// Release the address register.
PopSystemTemp();
}
if (system_temp_memexport_written_ != UINT32_MAX) {
PopSystemTemp();
}
// Release memexport temporary registers:
// - system_temp_memexport_enabled_and_eM_written_.
// - system_temp_memexport_address_.
// - system_temps_memexport_data_.
PopSystemTemp(xe::bit_count(uint32_t(memexport_eM_written)) + 2);
}
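A worked count for the release above, assuming for illustration that eM0, eM1 and eM3 were the written elements:

```cpp
#include <bit>
#include <cstdint>

// PopSystemTemp(xe::bit_count(memexport_eM_written) + 2) releases the eM#
// data registers plus eA plus the enabled/eM_written register.
constexpr uint8_t memexport_eM_written = 0b0000'1011;  // eM0, eM1, eM3
static_assert(std::popcount(memexport_eM_written) + 2 == 5,
              "three data temps + address temp + flags temp");
```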
// Write stage-specific epilogue.
@ -1514,36 +1554,22 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
dest = dxbc::Dest::R(system_temp_point_size_edge_flag_kill_vertex_);
break;
case InstructionStorageTarget::kExportAddress:
// Validate memexport writes (4D5307E6 has some completely invalid ones).
if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > Shader::kMaxMemExports ||
system_temps_memexport_address_[memexport_alloc_current_count_ - 1] ==
UINT32_MAX) {
if (!current_shader().memexport_eM_written()) {
return;
}
dest = dxbc::Dest::R(
system_temps_memexport_address_[memexport_alloc_current_count_ - 1]);
dest = dxbc::Dest::R(system_temp_memexport_address_);
break;
case InstructionStorageTarget::kExportData: {
// Validate memexport writes (4D5307E6 has some completely invalid ones).
if (memexport_alloc_current_count_ == 0 ||
memexport_alloc_current_count_ > Shader::kMaxMemExports ||
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[result.storage_index] == UINT32_MAX) {
return;
}
dest = dxbc::Dest::R(
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
[result.storage_index]);
assert_not_zero(current_shader().memexport_eM_written() &
(uint8_t(1) << result.storage_index));
dest = dxbc::Dest::R(system_temps_memexport_data_[result.storage_index]);
// Mark that the eM# has been written to and needs to be exported.
assert_not_zero(used_write_mask);
uint32_t memexport_index = memexport_alloc_current_count_ - 1;
a_.OpOr(dxbc::Dest::R(system_temp_memexport_written_,
1 << (memexport_index >> 2)),
dxbc::Src::R(system_temp_memexport_written_)
.Select(memexport_index >> 2),
dxbc::Src::LU(uint32_t(1) << (result.storage_index +
((memexport_index & 3) << 3))));
a_.OpOr(
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010),
dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_,
dxbc::Src::kYYYY),
dxbc::Src::LU(uint8_t(1) << result.storage_index));
} break;
case InstructionStorageTarget::kColor:
assert_not_zero(used_write_mask);
@ -1990,15 +2016,38 @@ void DxbcShaderTranslator::ProcessJumpInstruction(
}
void DxbcShaderTranslator::ProcessAllocInstruction(
const ParsedAllocInstruction& instr) {
const ParsedAllocInstruction& instr, uint8_t export_eM) {
bool start_memexport = instr.type == AllocType::kMemory &&
current_shader().memexport_eM_written();
if (export_eM || start_memexport) {
CloseExecConditionals();
}
if (emit_source_map_) {
instruction_disassembly_buffer_.Reset();
instr.Disassemble(&instruction_disassembly_buffer_);
EmitInstructionDisassembly();
}
if (instr.type == AllocType::kMemory) {
++memexport_alloc_current_count_;
if (export_eM) {
ExportToMemory(export_eM);
// Reset which eM# elements have been written.
a_.OpMov(
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010),
dxbc::Src::LU(0));
// Break dependencies from the previous memexport.
uint8_t export_eM_remaining = export_eM;
uint32_t eM_index;
while (xe::bit_scan_forward(export_eM_remaining, &eM_index)) {
export_eM_remaining &= ~(uint8_t(1) << eM_index);
a_.OpMov(dxbc::Dest::R(system_temps_memexport_data_[eM_index]),
dxbc::Src::LF(0.0f));
}
}
if (start_memexport) {
// Initialize eA to an invalid address.
a_.OpMov(dxbc::Dest::R(system_temp_memexport_address_), dxbc::Src::LU(0));
}
}
@ -2851,7 +2900,7 @@ void DxbcShaderTranslator::WriteInputSignature() {
// Sample index (SV_SampleIndex) for safe memexport with sample-rate
// shading.
size_t sample_index_position = SIZE_MAX;
if (current_shader().is_valid_memexport_used() && IsSampleRate()) {
if (current_shader().memexport_eM_written() && IsSampleRate()) {
sample_index_position = shader_object_.size();
shader_object_.resize(shader_object_.size() + kParameterDwords);
++parameter_count;
@ -3625,7 +3674,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
dxbc::Name::kPosition);
}
bool sample_rate_memexport =
current_shader().is_valid_memexport_used() && IsSampleRate();
current_shader().memexport_eM_written() && IsSampleRate();
// Sample-rate shading can't be done with UAV-only rendering (sample-rate
// shading is only needed for float24 depth conversion when using a float32
// host depth buffer).

View File

@ -20,6 +20,7 @@
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/dxbc.h"
#include "xenia/gpu/shader_translator.h"
#include "xenia/gpu/ucode.h"
#include "xenia/ui/graphics_provider.h"
namespace xe {
@ -589,13 +590,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
void ProcessLoopEndInstruction(
const ParsedLoopEndInstruction& instr) override;
void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override;
void ProcessAllocInstruction(const ParsedAllocInstruction& instr) override;
void ProcessAllocInstruction(const ParsedAllocInstruction& instr,
uint8_t export_eM) override;
void ProcessVertexFetchInstruction(
const ParsedVertexFetchInstruction& instr) override;
void ProcessTextureFetchInstruction(
const ParsedTextureFetchInstruction& instr) override;
void ProcessAluInstruction(const ParsedAluInstruction& instr) override;
void ProcessAluInstruction(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) override;
private:
// IF ANY OF THESE ARE CHANGED, WriteInputSignature and WriteOutputSignature
@ -674,6 +678,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
// Frees the last allocated internal r# registers for later reuse.
void PopSystemTemp(uint32_t count = 1);
// ExportToMemory modifies the values of eA/eM# for simplicity, call only
// before starting a new export or ending the invocation or making it
// inactive.
void ExportToMemory(uint8_t export_eM);
// Converts one scalar from piecewise linear gamma to linear. The target may
// be the same as the source, the temporary variables must be different. If
// the source is not pre-saturated, saturation will be done internally.
@ -728,7 +737,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
bool ROV_IsDepthStencilEarly() const {
assert_true(edram_rov_used_);
return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() &&
!current_shader().is_valid_memexport_used();
!current_shader().memexport_eM_written();
}
// Converts the pre-clamped depth value to 24-bit (storing the result in bits
// 0:23 and zeros in 24:31, not creating room for stencil - since this may be
@ -787,14 +796,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
void StartPixelShader_LoadROVParameters();
void StartPixelShader();
// Writing the epilogue.
// ExportToMemory modifies the values of eA/eM# for simplicity, don't call
// multiple times.
void ExportToMemory_PackFixed32(const uint32_t* eM_temps, uint32_t eM_count,
const uint32_t bits[4],
const dxbc::Src& is_integer,
const dxbc::Src& is_signed);
void ExportToMemory();
void CompleteVertexOrDomainShader();
// For RTV, adds the sample to coverage_temp.coverage_temp_component if it
// passes alpha to mask (or, if initialize == true (for the first sample
@ -917,13 +918,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
.SelectFromSwizzled(word_index & 1);
}
void KillPixel(bool condition, const dxbc::Src& condition_src);
void KillPixel(bool condition, const dxbc::Src& condition_src,
uint8_t memexport_eM_potentially_written_before);
void ProcessVectorAluOperation(const ParsedAluInstruction& instr,
uint32_t& result_swizzle,
bool& predicate_written);
void ProcessScalarAluOperation(const ParsedAluInstruction& instr,
void ProcessVectorAluOperation(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle,
bool& predicate_written);
void ProcessScalarAluOperation(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before, bool& predicate_written);
void WriteResourceDefinition();
void WriteInputSignature();
@ -1124,14 +1128,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
// writing).
uint32_t system_temps_color_[4];
// Bits containing whether each eM# has been written, for up to 16 streams, or
// UINT32_MAX if memexport is not used. 8 bits (5 used) for each stream, with
// 4 `alloc export`s per component.
uint32_t system_temp_memexport_written_;
// eA in each `alloc export`, or UINT32_MAX if not used.
uint32_t system_temps_memexport_address_[Shader::kMaxMemExports];
// eM# in each `alloc export`, or UINT32_MAX if not used.
uint32_t system_temps_memexport_data_[Shader::kMaxMemExports][5];
// Memory export temporary registers are allocated if the shader writes any
// eM# (current_shader().memexport_eM_written() != 0).
// X - whether memexport is enabled for this invocation.
// Y - which eM# elements have been written so far by the invocation since the
// last memory write.
uint32_t system_temp_memexport_enabled_and_eM_written_;
// eA.
uint32_t system_temp_memexport_address_;
// eM#.
uint32_t system_temps_memexport_data_[ucode::kMaxMemExportElementCount];
// Vector ALU or fetch result / scratch (since Xenos write masks can contain
// swizzles).
@ -1195,10 +1201,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
uint32_t uav_index_edram_;
std::vector<SamplerBinding> sampler_bindings_;
// Number of `alloc export`s encountered so far in the translation. The index
// of the current eA/eM# temp register set is this minus 1, if it's not 0.
uint32_t memexport_alloc_current_count_;
};
} // namespace gpu

View File

@ -19,22 +19,29 @@ namespace xe {
namespace gpu {
using namespace ucode;
void DxbcShaderTranslator::KillPixel(bool condition,
const dxbc::Src& condition_src) {
void DxbcShaderTranslator::KillPixel(
bool condition, const dxbc::Src& condition_src,
uint8_t memexport_eM_potentially_written_before) {
a_.OpIf(condition, condition_src);
// Perform outstanding memory exports before the invocation becomes inactive
// and UAV writes are disabled.
ExportToMemory(memexport_eM_potentially_written_before);
// Discard the pixel, but continue execution if other lanes in the quad need
// this lane for derivatives. The driver may also perform early exiting
// internally if all lanes are discarded if deemed beneficial.
a_.OpDiscard(condition, condition_src);
a_.OpDiscard(true, dxbc::Src::LU(UINT32_MAX));
if (edram_rov_used_) {
// Even though discarding disables all subsequent UAV/ROV writes, also skip
// as much of the Render Backend emulation logic as possible by setting the
// coverage and the mask of the written render targets to zero.
a_.OpMov(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::LU(0));
}
a_.OpEndIf();
}
void DxbcShaderTranslator::ProcessVectorAluOperation(
const ParsedAluInstruction& instr, uint32_t& result_swizzle,
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle,
bool& predicate_written) {
result_swizzle = dxbc::Src::kXYZW;
predicate_written = false;
@ -506,7 +513,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
memexport_eM_potentially_written_before);
if (used_result_components) {
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -522,7 +530,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
memexport_eM_potentially_written_before);
if (used_result_components) {
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -538,7 +547,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
memexport_eM_potentially_written_before);
if (used_result_components) {
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -554,7 +564,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
memexport_eM_potentially_written_before);
if (used_result_components) {
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
@ -640,7 +651,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
}
void DxbcShaderTranslator::ProcessScalarAluOperation(
const ParsedAluInstruction& instr, bool& predicate_written) {
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before, bool& predicate_written) {
predicate_written = false;
if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev) {
@ -950,27 +962,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
case AluScalarOpcode::kKillsEq:
a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
case AluScalarOpcode::kKillsGt:
a_.OpLT(ps_dest, dxbc::Src::LF(0.0f), operand_0_a);
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
case AluScalarOpcode::kKillsGe:
a_.OpGE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
case AluScalarOpcode::kKillsNe:
a_.OpNE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
case AluScalarOpcode::kKillsOne:
a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(1.0f));
KillPixel(true, ps_src);
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
break;
@ -1024,7 +1036,8 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
}
void DxbcShaderTranslator::ProcessAluInstruction(
const ParsedAluInstruction& instr) {
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) {
if (instr.IsNop()) {
// Don't even disassemble or update predication.
return;
@ -1041,10 +1054,11 @@ void DxbcShaderTranslator::ProcessAluInstruction(
// checked again later.
bool predicate_written_vector = false;
uint32_t vector_result_swizzle = dxbc::Src::kXYZW;
ProcessVectorAluOperation(instr, vector_result_swizzle,
predicate_written_vector);
ProcessVectorAluOperation(instr, memexport_eM_potentially_written_before,
vector_result_swizzle, predicate_written_vector);
bool predicate_written_scalar = false;
ProcessScalarAluOperation(instr, predicate_written_scalar);
ProcessScalarAluOperation(instr, memexport_eM_potentially_written_before,
predicate_written_scalar);
StoreResult(instr.vector_and_constant_result,
dxbc::Src::R(system_temp_result_, vector_result_swizzle),

File diff suppressed because it is too large

View File

@ -672,7 +672,7 @@ class Shader {
// For implementation without unconditional support for memory writes from
// vertex shaders, vertex shader converted to a compute shader doing only
// memory export.
kMemexportCompute,
kMemExportCompute,
// 4 host vertices for 1 guest vertex, for implementations without
// unconditional geometry shader support.
@ -769,9 +769,16 @@ class Shader {
}
};
// Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game
// .pdb.
static constexpr uint32_t kMaxMemExports = 16;
struct ControlFlowMemExportInfo {
// Which eM elements have potentially (regardless of conditionals, loop
// iteration counts, predication) been written earlier in the predecessor
// graph of the instruction since an `alloc export`.
uint8_t eM_potentially_written_before = 0;
// For exec sequences, which eM elements are potentially (regardless of
// predication) written by the instructions in the sequence. For other
// control flow instructions, it's 0.
uint8_t eM_potentially_written_by_exec = 0;
};
class Translation {
public:
@ -879,19 +886,21 @@ class Shader {
return constant_register_map_;
}
// uint5[Shader::kMaxMemExports] - bits indicating which eM# registers have
// been written to after each `alloc export`, for up to Shader::kMaxMemExports
// exports. This will contain zero for certain corrupt exports - for those to
// which a valid eA was not written via a MAD with a stream constant.
const uint8_t* memexport_eM_written() const { return memexport_eM_written_; }
// Information about memory export state at each control flow instruction. May
// be empty if there are no eM# writes.
const std::vector<ControlFlowMemExportInfo>& cf_memexport_info() const {
return cf_memexport_info_;
}
// All c# registers used as the addend in MAD operations to eA.
uint8_t memexport_eM_written() const { return memexport_eM_written_; }
uint8_t memexport_eM_potentially_written_before_end() const {
return memexport_eM_potentially_written_before_end_;
}
// c# registers used as the addend in MAD operations to eA.
const std::set<uint32_t>& memexport_stream_constants() const {
return memexport_stream_constants_;
}
bool is_valid_memexport_used() const {
return !memexport_stream_constants_.empty();
}
// Labels that jumps (explicit or from loops) can be done to.
const std::set<uint32_t>& label_addresses() const { return label_addresses_; }
@ -969,7 +978,7 @@ class Shader {
// TODO(Triang3l): Investigate what happens to memexport when the pixel
// fails the depth/stencil test, but in Direct3D 11 UAV writes disable early
// depth/stencil.
return !kills_pixels() && !writes_depth() && !is_valid_memexport_used();
return !kills_pixels() && !writes_depth() && !memexport_eM_written();
}
// Whether each color render target is written to on any execution path.
@ -1041,8 +1050,6 @@ class Shader {
std::vector<VertexBinding> vertex_bindings_;
std::vector<TextureBinding> texture_bindings_;
ConstantRegisterMap constant_register_map_ = {0};
uint8_t memexport_eM_written_[kMaxMemExports] = {};
std::set<uint32_t> memexport_stream_constants_;
std::set<uint32_t> label_addresses_;
uint32_t cf_pair_index_bound_ = 0;
uint32_t register_static_address_bound_ = 0;
@ -1054,6 +1061,17 @@ class Shader {
bool uses_texture_fetch_instruction_results_ = false;
bool writes_depth_ = false;
// Memory export eM write info for each control flow instruction, if there are
// any eM writes in the shader.
std::vector<ControlFlowMemExportInfo> cf_memexport_info_;
// Which memexport elements (eM#) are written for any memexport in the shader.
uint8_t memexport_eM_written_ = 0;
// ControlFlowMemExportInfo::eM_potentially_written_before equivalent for the
// end of the shader, for the last memory export (or exports if the end has
// multiple predecessor chains exporting to memory).
uint8_t memexport_eM_potentially_written_before_end_ = 0;
std::set<uint32_t> memexport_stream_constants_;
// Modification bits -> translation.
std::unordered_map<uint64_t, Translation*> translations_;
@ -1063,8 +1081,7 @@ class Shader {
void GatherExecInformation(
const ParsedExecInstruction& instr,
ucode::VertexFetchInstruction& previous_vfetch_full,
uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer);
uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer);
void GatherVertexFetchInformation(
const ucode::VertexFetchInstruction& op,
ucode::VertexFetchInstruction& previous_vfetch_full,
@ -1073,13 +1090,12 @@ class Shader {
uint32_t& unique_texture_bindings,
StringBuffer& ucode_disasm_buffer);
void GatherAluInstructionInformation(const ucode::AluInstruction& op,
uint32_t memexport_alloc_current_count,
uint32_t& memexport_eA_written,
uint32_t exec_cf_index,
StringBuffer& ucode_disasm_buffer);
void GatherOperandInformation(const InstructionOperand& operand);
void GatherFetchResultInformation(const InstructionResult& result);
void GatherAluResultInformation(const InstructionResult& result,
uint32_t memexport_alloc_current_count);
uint32_t exec_cf_index);
};
} // namespace gpu

View File

@ -87,8 +87,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
VertexFetchInstruction previous_vfetch_full;
std::memset(&previous_vfetch_full, 0, sizeof(previous_vfetch_full));
uint32_t unique_texture_bindings = 0;
uint32_t memexport_alloc_count = 0;
uint32_t memexport_eA_written = 0;
for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
ControlFlowInstruction cf_ab[2];
UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab);
@ -111,8 +109,7 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
ParsedExecInstruction instr;
ParseControlFlowExec(cf.exec, cf_index, instr);
GatherExecInformation(instr, previous_vfetch_full,
unique_texture_bindings, memexport_alloc_count,
memexport_eA_written, ucode_disasm_buffer);
unique_texture_bindings, ucode_disasm_buffer);
} break;
case ControlFlowOpcode::kCondExec:
case ControlFlowOpcode::kCondExecEnd:
@ -122,16 +119,14 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
ParsedExecInstruction instr;
ParseControlFlowCondExec(cf.cond_exec, cf_index, instr);
GatherExecInformation(instr, previous_vfetch_full,
unique_texture_bindings, memexport_alloc_count,
memexport_eA_written, ucode_disasm_buffer);
unique_texture_bindings, ucode_disasm_buffer);
} break;
case ControlFlowOpcode::kCondExecPred:
case ControlFlowOpcode::kCondExecPredEnd: {
ParsedExecInstruction instr;
ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index, instr);
GatherExecInformation(instr, previous_vfetch_full,
unique_texture_bindings, memexport_alloc_count,
memexport_eA_written, ucode_disasm_buffer);
unique_texture_bindings, ucode_disasm_buffer);
} break;
case ControlFlowOpcode::kLoopStart: {
ParsedLoopStartInstruction instr;
@ -173,9 +168,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
ParseControlFlowAlloc(cf.alloc, cf_index,
type() == xenos::ShaderType::kVertex, instr);
instr.Disassemble(&ucode_disasm_buffer);
if (instr.type == AllocType::kMemory) {
++memexport_alloc_count;
}
} break;
case ControlFlowOpcode::kMarkVsFetchDone:
break;
@ -187,7 +179,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
constant_register_map_.bool_bitmap[bool_constant_index / 32] |=
uint32_t(1) << (bool_constant_index % 32);
}
// TODO(benvanik): break if (DoesControlFlowOpcodeEndShader(cf.opcode()))?
}
}
ucode_disassembly_ = ucode_disasm_buffer.to_string();
@ -206,16 +197,124 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
}
}
// Cleanup invalid/unneeded memexport allocs.
for (uint32_t i = 0; i < kMaxMemExports; ++i) {
if (!(memexport_eA_written & (uint32_t(1) << i))) {
memexport_eM_written_[i] = 0;
} else if (!memexport_eM_written_[i]) {
memexport_eA_written &= ~(uint32_t(1) << i);
if (!cf_memexport_info_.empty()) {
// Gather potentially "dirty" memexport elements before each control flow
// instruction. `alloc` (any, not only `export`) flushes the previous memory
// export. On the guest GPU, yielding / serializing also terminates memory
// exports, but for simplicity disregarding that, as that functionally does
// nothing compared to flushing the previous memory export only at `alloc`
// or even only specifically at `alloc export`; Microsoft's validator checks
// if eM# aren't written after a `serialize`.
std::vector<uint32_t> successor_stack;
for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
ControlFlowInstruction eM_writing_cf_ab[2];
UnpackControlFlowInstructions(ucode_data_.data() + i * 3,
eM_writing_cf_ab);
for (uint32_t j = 0; j < 2; ++j) {
uint32_t eM_writing_cf_index = i * 2 + j;
uint32_t eM_written_by_cf_instr =
cf_memexport_info_[eM_writing_cf_index]
.eM_potentially_written_by_exec;
if (eM_writing_cf_ab[j].opcode() == ControlFlowOpcode::kCondCall) {
// Until subroutine calls are handled accurately, assume that all eM#
// have potentially been written by the subroutine for simplicity.
eM_written_by_cf_instr = memexport_eM_written_;
}
if (!eM_written_by_cf_instr) {
continue;
}
// If the control flow instruction potentially results in any eM# being
// written, mark those eM# as potentially written before each successor.
bool is_successor_graph_head = true;
successor_stack.push_back(eM_writing_cf_index);
while (!successor_stack.empty()) {
uint32_t successor_cf_index = successor_stack.back();
successor_stack.pop_back();
ControlFlowMemExportInfo& successor_memexport_info =
cf_memexport_info_[successor_cf_index];
if ((successor_memexport_info.eM_potentially_written_before &
eM_written_by_cf_instr) == eM_written_by_cf_instr) {
// Already marked as written before this instruction (and thus
// before all its successors too). Possibly this instruction is in a
// loop, in which case an instruction may succeed itself.
continue;
}
// The first instruction in the traversal is the writing instruction
// itself, not its successor. However, if it has been visited by the
// traversal twice, it's in a loop, so it succeeds itself, and thus
// writes from it are potentially done before it too.
if (!is_successor_graph_head) {
successor_memexport_info.eM_potentially_written_before |=
eM_written_by_cf_instr;
}
is_successor_graph_head = false;
ControlFlowInstruction successor_cf_ab[2];
UnpackControlFlowInstructions(
ucode_data_.data() + (successor_cf_index >> 1) * 3,
successor_cf_ab);
const ControlFlowInstruction& successor_cf =
successor_cf_ab[successor_cf_index & 1];
bool next_instr_is_new_successor = true;
switch (successor_cf.opcode()) {
case ControlFlowOpcode::kExecEnd:
// One successor: end.
memexport_eM_potentially_written_before_end_ |=
eM_written_by_cf_instr;
next_instr_is_new_successor = false;
break;
case ControlFlowOpcode::kCondExecEnd:
case ControlFlowOpcode::kCondExecPredEnd:
case ControlFlowOpcode::kCondExecPredCleanEnd:
// Two successors: next, end.
memexport_eM_potentially_written_before_end_ |=
eM_written_by_cf_instr;
break;
case ControlFlowOpcode::kLoopStart:
// Two successors: next, skip.
successor_stack.push_back(successor_cf.loop_start.address());
break;
case ControlFlowOpcode::kLoopEnd:
// Two successors: next, repeat.
successor_stack.push_back(successor_cf.loop_end.address());
break;
case ControlFlowOpcode::kCondCall:
// Two successors: next, target.
successor_stack.push_back(successor_cf.cond_call.address());
break;
case ControlFlowOpcode::kReturn:
// Currently treating all subroutine calls as potentially writing
// all eM# for simplicity, so just exit the subroutine.
next_instr_is_new_successor = false;
break;
case ControlFlowOpcode::kCondJmp:
// One or two successors: next if conditional, target.
successor_stack.push_back(successor_cf.cond_jmp.address());
if (successor_cf.cond_jmp.is_unconditional()) {
next_instr_is_new_successor = false;
}
break;
case ControlFlowOpcode::kAlloc:
// Any `alloc` ends the previous export.
next_instr_is_new_successor = false;
break;
default:
break;
}
if (next_instr_is_new_successor) {
if (successor_cf_index + 1 < (cf_pair_index_bound_ << 1)) {
successor_stack.push_back(successor_cf_index + 1);
} else {
memexport_eM_potentially_written_before_end_ |=
eM_written_by_cf_instr;
}
}
}
}
}
if (memexport_eA_written == 0) {
memexport_stream_constants_.clear();
}
is_ucode_analyzed_ = true;
@ -250,8 +349,7 @@ uint32_t Shader::GetInterpolatorInputMask(reg::SQ_PROGRAM_CNTL sq_program_cntl,
void Shader::GatherExecInformation(
const ParsedExecInstruction& instr,
ucode::VertexFetchInstruction& previous_vfetch_full,
uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer) {
instr.Disassemble(&ucode_disasm_buffer);
uint32_t sequence = instr.sequence;
for (uint32_t instr_offset = instr.instruction_address;
@ -273,8 +371,7 @@ void Shader::GatherExecInformation(
}
} else {
auto& op = *reinterpret_cast<const AluInstruction*>(op_ptr);
GatherAluInstructionInformation(op, memexport_alloc_current_count,
memexport_eA_written,
GatherAluInstructionInformation(op, instr.dword_index,
ucode_disasm_buffer);
}
}
@ -381,8 +478,8 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
}
void Shader::GatherAluInstructionInformation(
const AluInstruction& op, uint32_t memexport_alloc_current_count,
uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
const AluInstruction& op, uint32_t exec_cf_index,
StringBuffer& ucode_disasm_buffer) {
ParsedAluInstruction instr;
ParseAluInstruction(op, type(), instr);
instr.Disassemble(&ucode_disasm_buffer);
@ -394,10 +491,8 @@ void Shader::GatherAluInstructionInformation(
(ucode::GetAluScalarOpcodeInfo(op.scalar_opcode()).changed_state &
ucode::kAluOpChangedStatePixelKill);
GatherAluResultInformation(instr.vector_and_constant_result,
memexport_alloc_current_count);
GatherAluResultInformation(instr.scalar_result,
memexport_alloc_current_count);
GatherAluResultInformation(instr.vector_and_constant_result, exec_cf_index);
GatherAluResultInformation(instr.scalar_result, exec_cf_index);
for (size_t i = 0; i < instr.vector_operand_count; ++i) {
GatherOperandInformation(instr.vector_operands[i]);
}
@ -405,9 +500,7 @@ void Shader::GatherAluInstructionInformation(
GatherOperandInformation(instr.scalar_operands[i]);
}
// Store used memexport constants because CPU code needs addresses and sizes,
// and also whether there have been writes to eA and eM# for register
// allocation in shader translator implementations.
// Store used memexport constants because CPU code needs addresses and sizes.
// eA is (hopefully) always written to using:
// mad eA, r#, const0100, c#
// (though there are some exceptions, shaders in 4D5307E6 for some reason set
@ -416,13 +509,9 @@ void Shader::GatherAluInstructionInformation(
// Export is done to vector_dest of the ucode instruction for both vector and
// scalar operations - no need to check separately.
if (instr.vector_and_constant_result.storage_target ==
InstructionStorageTarget::kExportAddress &&
memexport_alloc_current_count > 0 &&
memexport_alloc_current_count <= Shader::kMaxMemExports) {
InstructionStorageTarget::kExportAddress) {
uint32_t memexport_stream_constant = instr.GetMemExportStreamConstant();
if (memexport_stream_constant != UINT32_MAX) {
memexport_eA_written |= uint32_t(1)
<< (memexport_alloc_current_count - 1);
memexport_stream_constants_.insert(memexport_stream_constant);
} else {
XELOGE(
@ -481,8 +570,8 @@ void Shader::GatherFetchResultInformation(const InstructionResult& result) {
}
}
void Shader::GatherAluResultInformation(
const InstructionResult& result, uint32_t memexport_alloc_current_count) {
void Shader::GatherAluResultInformation(const InstructionResult& result,
uint32_t exec_cf_index) {
uint32_t used_write_mask = result.GetUsedWriteMask();
if (!used_write_mask) {
return;
@ -504,11 +593,12 @@ void Shader::GatherAluResultInformation(
writes_point_size_edge_flag_kill_vertex_ |= used_write_mask;
break;
case InstructionStorageTarget::kExportData:
if (memexport_alloc_current_count > 0 &&
memexport_alloc_current_count <= Shader::kMaxMemExports) {
memexport_eM_written_[memexport_alloc_current_count - 1] |=
uint32_t(1) << result.storage_index;
memexport_eM_written_ |= uint8_t(1) << result.storage_index;
if (cf_memexport_info_.empty()) {
cf_memexport_info_.resize(2 * cf_pair_index_bound_);
}
cf_memexport_info_[exec_cf_index].eM_potentially_written_by_exec |=
uint32_t(1) << result.storage_index;
break;
case InstructionStorageTarget::kColor:
writes_color_targets_ |= uint32_t(1) << result.storage_index;
@ -665,7 +755,13 @@ void ShaderTranslator::TranslateControlFlowInstruction(
case ControlFlowOpcode::kAlloc: {
ParsedAllocInstruction instr;
ParseControlFlowAlloc(cf.alloc, cf_index_, is_vertex_shader(), instr);
ProcessAllocInstruction(instr);
const std::vector<Shader::ControlFlowMemExportInfo>& cf_memexport_info =
current_shader().cf_memexport_info();
ProcessAllocInstruction(instr,
instr.dword_index < cf_memexport_info.size()
? cf_memexport_info[instr.dword_index]
.eM_potentially_written_before
: 0);
} break;
case ControlFlowOpcode::kMarkVsFetchDone:
break;
@ -807,6 +903,14 @@ void ParseControlFlowAlloc(const ControlFlowAllocInstruction& cf,
void ShaderTranslator::TranslateExecInstructions(
const ParsedExecInstruction& instr) {
ProcessExecInstructionBegin(instr);
const std::vector<Shader::ControlFlowMemExportInfo>& cf_memexport_info =
current_shader().cf_memexport_info();
uint8_t eM_potentially_written_before =
instr.dword_index < cf_memexport_info.size()
? cf_memexport_info[instr.dword_index].eM_potentially_written_before
: 0;
const uint32_t* ucode_dwords = current_shader().ucode_data().data();
uint32_t sequence = instr.sequence;
for (uint32_t instr_offset = instr.instruction_address;
@ -832,9 +936,22 @@ void ShaderTranslator::TranslateExecInstructions(
auto& op = *reinterpret_cast<const AluInstruction*>(op_ptr);
ParsedAluInstruction alu_instr;
ParseAluInstruction(op, current_shader().type(), alu_instr);
ProcessAluInstruction(alu_instr);
ProcessAluInstruction(alu_instr, eM_potentially_written_before);
if (alu_instr.vector_and_constant_result.storage_target ==
InstructionStorageTarget::kExportData &&
alu_instr.vector_and_constant_result.GetUsedWriteMask()) {
eM_potentially_written_before |=
uint8_t(1) << alu_instr.vector_and_constant_result.storage_index;
}
if (alu_instr.scalar_result.storage_target ==
InstructionStorageTarget::kExportData &&
alu_instr.scalar_result.GetUsedWriteMask()) {
eM_potentially_written_before |=
uint8_t(1) << alu_instr.scalar_result.storage_index;
}
}
}
ProcessExecInstructionEnd(instr);
}

View File

@ -118,8 +118,10 @@ class ShaderTranslator {
virtual void ProcessReturnInstruction(const ParsedReturnInstruction& instr) {}
// Handles translation for jump instructions.
virtual void ProcessJumpInstruction(const ParsedJumpInstruction& instr) {}
// Handles translation for alloc instructions.
virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr) {}
// Handles translation for alloc instructions. Memory exports for eM#
// indicated by export_eM must be performed, regardless of the alloc type.
virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr,
uint8_t export_eM) {}
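// For illustration only - a hedged sketch (hypothetical subclass and helper
// names, not part of this interface) of how an override could honor that
// contract:
//   void ProcessAllocInstruction(const ParsedAllocInstruction& instr,
//                                uint8_t export_eM) override {
//     if (export_eM) {
//       // Flush the eM# writes pending from the previous memory export.
//       FlushPendingMemExport(export_eM);
//     }
//     // ...then handle the alloc itself.
//   }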
// Handles translation for vertex fetch instructions.
virtual void ProcessVertexFetchInstruction(
@ -128,7 +130,13 @@ class ShaderTranslator {
virtual void ProcessTextureFetchInstruction(
const ParsedTextureFetchInstruction& instr) {}
// Handles translation for ALU instructions.
virtual void ProcessAluInstruction(const ParsedAluInstruction& instr) {}
// memexport_eM_potentially_written_before needs to be handled by the `kill`
// instruction implementation to make sure memory exports for the eM# written
// earlier in previous execs and in the current exec are performed before the
// invocation becomes inactive.
virtual void ProcessAluInstruction(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) {}
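// As an illustrative sketch only (hypothetical helper name): a translator's
// `kill` handling could flush the pending exports before discarding:
//   if (is_kill_instruction && memexport_eM_potentially_written_before) {
//     FlushPendingMemExport(memexport_eM_potentially_written_before);
//   }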
private:
void TranslateControlFlowInstruction(const ucode::ControlFlowInstruction& cf);

View File

@ -134,7 +134,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
// (32-bit only - 16-bit indices are always fetched via the Vulkan index
// buffer).
kSysFlag_VertexIndexLoad = 1u << kSysFlag_VertexIndexLoad_Shift,
// For HostVertexShaderTypes kMemexportCompute, kPointListAsTriangleStrip,
// For HostVertexShaderTypes kMemExportCompute, kPointListAsTriangleStrip,
// kRectangleListAsTriangleStrip, whether the vertex index needs to be
// loaded from the index buffer (rather than using autogenerated indices),
// and whether it's 32-bit. This is separate from kSysFlag_VertexIndexLoad
@ -427,7 +427,9 @@ class SpirvShaderTranslator : public ShaderTranslator {
const ParsedVertexFetchInstruction& instr) override;
void ProcessTextureFetchInstruction(
const ParsedTextureFetchInstruction& instr) override;
void ProcessAluInstruction(const ParsedAluInstruction& instr) override;
void ProcessAluInstruction(
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) override;
private:
struct TextureBinding {
@ -620,7 +622,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
assert_true(edram_fragment_shader_interlock_);
return !is_depth_only_fragment_shader_ &&
!current_shader().writes_depth() &&
!current_shader().is_valid_memexport_used();
!current_shader().memexport_eM_written();
}
void FSI_LoadSampleMask(spv::Id msaa_samples);
void FSI_LoadEdramOffsets(spv::Id msaa_samples);

View File

@ -67,7 +67,8 @@ void SpirvShaderTranslator::KillPixel(spv::Id condition) {
}
void SpirvShaderTranslator::ProcessAluInstruction(
const ParsedAluInstruction& instr) {
const ParsedAluInstruction& instr,
uint8_t memexport_eM_potentially_written_before) {
if (instr.IsNop()) {
// Don't even disassemble or update predication.
return;

View File

@ -210,7 +210,7 @@ enum class AllocType : uint32_t {
kVsInterpolators = 2,
// Pixel shader exports colors.
kPsColors = 2,
// MEMEXPORT?
// Memory export.
kMemory = 3,
};
@ -1782,6 +1782,9 @@ inline uint32_t GetAluVectorOpNeededSourceComponents(
.operand_components_used[src_index - 1];
}
// eM# (kExportData) register count.
constexpr uint32_t kMaxMemExportElementCount = 5;
enum class ExportRegister : uint32_t {
kVSInterpolator0 = 0,
kVSInterpolator1,

View File

@ -2187,7 +2187,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
return false;
}
pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0;
// Pixel shader analysis.
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);

View File

@ -497,6 +497,18 @@ enum class TextureFormat : uint32_t {
k_6_5_5 = 5,
k_8_8_8_8 = 6,
k_2_10_10_10 = 7,
// Possibly similar to k_8, but may be storing alpha instead of red when
// resolving/memexporting, though this is not known exactly. From the point of
// view of sampling, it should be treated the same as k_8 (given that the last
// component of a texture - for single-component textures, the only one - is
// replicated into all the remaining ones before the swizzle).
// Used as:
// - Texture in 4B4E083C - text, starting from the "Loading..." and the "This
// game saves data automatically" messages. The swizzle in the fetch
// constant is 111W (suggesting that internally the only component may be
// the alpha one, not red).
// TODO(Triang3l): Investigate how k_8_A and k_8_B work in resolves and
// memexports, whether they store alpha/blue of the input or red.
k_8_A = 8,
k_8_B = 9,
k_8_8 = 10,
@ -510,6 +522,12 @@ enum class TextureFormat : uint32_t {
// Used for videos in 54540829.
k_Y1_Cr_Y0_Cb_REP = 12,
k_16_16_EDRAM = 13,
// Likely same as k_8_8_8_8.
// Used as:
// - Memexport destination in 4D5308BC - multiple small draws when looking
// back at the door behind the player in the first room of gameplay.
// - Memexport destination in 4D53085B and 4D530919 - in 4D53085B, in a frame
// between the intro video and the main menu, in an 8192-point draw.
k_8_8_8_8_A = 14,
k_4_4_4_4 = 15,
k_10_11_11 = 16,
@ -1373,8 +1391,7 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
// memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also
// interesting to see how alphatest interacts with it, whether it's still true
// fixed-function alphatest, as it's claimed to be supported as usual by the
// extension specification - it's likely, however, that memory exports are
// discarded alongside other exports such as oC# and oDepth this way.
// extension specification.
//
// Y of eA contains the offset in elements - this is what shaders are supposed
// to calculate from something like the vertex index. Again, it's specified as
@ -1397,6 +1414,69 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
// elements using packing via addition to 2^23, so this field also doesn't need
// more bits than that.
//
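// As an illustrative C++ sketch of that packing trick (hypothetical names,
// not code from this file): adding 2^23 to a float holding a non-negative
// integer below 2^23 leaves that integer directly in the low 23 mantissa
// bits:
//   float packed = float(element_index) + 8388608.0f;  // 2^23.
//   uint32_t bits;
//   std::memcpy(&bits, &packed, sizeof(bits));
//   uint32_t recovered_index = bits & ((1u << 23) - 1u);
//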
// According to the sequencer specification from IPR2015-00325 (where memexport
// is called "pass thru export"):
// - Pass thru exports can occur anywhere in the shader program.
// - There can be any number of pass thru exports.
// - The address register is not kept across clause boundaries, so it must be
// refreshed after any Serialize (or yield), allocate instruction or resource
// change.
// - The write to eM# may be predicated if the export is not needed.
// - Exports are dropped if:
// - The index is above the maximum.
// - The index sign bit is 1.
// - The exponent of the index is not 23.
// The requirement that eM4 must be written if any eM# other than eM0 is also
// written doesn't apply to the final Xenos; it's likely an outdated note in
// the specification, considering that the specification is very preliminary.
//
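// Expressed as an illustrative C++ check over the raw bits of the index in Y
// of eA (a sketch under the assumptions above, with a hypothetical max_index;
// 150 is the biased encoding of exponent 23):
//   bool dropped = (bits >> 31) != 0 ||             // The sign bit is 1.
//                  ((bits >> 23) & 0xFF) != 150 ||  // The exponent isn't 23.
//                  (bits & ((1u << 23) - 1u)) > max_index;
//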
// According to Microsoft's shader validator:
// - eA can be written only by `mad`.
// - A single eM# can be written by any number of instructions, including with
// write masking.
// - eA must be written before eM#.
// - Any alloc instruction or a `serialize` terminates the current memory
// export. This doesn't apply to `exec Yield=true`, however, and it's not
// clear if that's an oversight or if that's not considered a yield that
// terminates the export.
//
// From the emulation perspective, this means that:
// - Alloc instructions (`alloc export` mandatorily, other allocs optionally),
// and optionally `serialize` instructions within `exec`, should be treated as
// the locations where the currently open export should be flushed to the
// memory. It should be taken into account that an export may be in looping
// control flow, and in this case it must be performed at every iteration.
// - Whether each eM# was written to must be tracked at shader execution time,
// as predication can disable the export of an element.
//
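// A minimal sketch of such run-time tracking (illustrative names only):
//   uint32_t eM_written_mask = 0;
//   // On each eM# write that actually executes (not disabled by
//   // predication):
//   //   eM_written_mask |= uint32_t(1) << element_index;
//   // At each flush point (`alloc`, optionally `serialize`, end of shader),
//   // store only the elements set in eM_written_mask, then reset it to 0 so
//   // a loop iteration exports again on the next pass.
//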
// TODO(Triang3l): Investigate how memory export interacts with pixel killing.
// Given that eM# writes disabled by predication don't cause an export, it's
// possible that killed invocations are treated as inactive (invalid in Xenos
// terms) overall, and thus new memory exports from them shouldn't be done, but
// that's not verified. However, given that on Direct3D 11+, OpenGL and Vulkan
// hosts discarding disables subsequent storage resource writes, it would be
// natural on the host to perform all outstanding memory exports before
// discarding if the kill condition passes.
//
// Memory exports can be performed to any ColorFormat, including 8bpp and 16bpp
// ones. Hosts, however, may have the memory bound as a 32bpp buffer (for
// instance, due to the minimum resource view size limitation on Direct3D 11).
// In this case, bytes and shorts aren't addressable directly. However, taking
// into account that memory accesses are coherent within one shader invocation
// on Direct3D 11+, OpenGL and Vulkan and thus are done in order relatively to
// each other, it should be possible to implement them by clearing the bits via
// an atomic AND, and writing the new value using an atomic OR. This will, of
// course, make the entire write operation non-atomic, and in case of a race
// between writes to the same location, the final result may not even be a
// value from one of the invocations, but rather the OR of the values from any
// of the invocations involved. However, on the Xenos, there doesn't seem to
// be any way of meaningfully accessing the same location from multiple
// invocations if any of them is writing, as memory exports are out-of-order,
// so such an implementation shouldn't cause issues in reality. Atomic
// compare-exchange, however, should not be used for this purpose, as it may
// result in an infinite loop if different invocations want to write different
// values to the same memory location.
//
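// A hedged C++ sketch of that AND-then-OR approach for an 8bpp or 16bpp
// element within a 32bpp buffer (hypothetical names, not Xenia code):
//   uint32_t shift = (element_offset_bytes & 3u) << 3u;
//   uint32_t mask = ((1u << (element_size_bytes << 3u)) - 1u) << shift;
//   std::atomic_ref<uint32_t> dword(buffer[element_offset_bytes >> 2]);
//   dword.fetch_and(~mask);                   // Clear the destination bits.
//   dword.fetch_or((value << shift) & mask);  // Merge in the new value.
//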
// Examples of setup in titles (Z from MSB to LSB):
//
// 4D5307E6 particles (different VS invocation counts, like 1, 2, 4):
@ -1432,6 +1512,11 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
// c0: Z = 010010110000|0|010|11|011010|00011|001
// 8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch
// (16_16_16_16 is the largest color format without special values)
//
// 58410B86 hierarchical depth buffer occlusion culling with the result read on
// the CPU (15000 VS invocations in the main menu):
// c8: Z = 010010110000|0|010|00|000010|00000|000, count = invocation count
// No endian swap, 8, uint, RGBA
union alignas(uint32_t) xe_gpu_memexport_stream_t {
struct {
uint32_t dword_0;

View File

@ -119,6 +119,8 @@ dword_result_t XamContentCreateEnumerator_entry(
}
DECLARE_XAM_EXPORT1(XamContentCreateEnumerator, kContent, kImplemented);
enum class kDispositionState : uint32_t { Unknown = 0, Create = 1, Open = 2 };
dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
lpvoid_t content_data_ptr,
dword_t content_data_size, dword_t flags,
@ -146,40 +148,37 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
content_data, disposition_ptr, license_mask_ptr, overlapped_ptr](
uint32_t& extended_error, uint32_t& length) -> X_RESULT {
X_RESULT result = X_ERROR_INVALID_PARAMETER;
bool create = false;
bool open = false;
kDispositionState disposition = kDispositionState::Unknown;
switch (flags & 0xF) {
case 1: // CREATE_NEW
// Fail if exists.
if (content_manager->ContentExists(content_data)) {
result = X_ERROR_ALREADY_EXISTS;
} else {
create = true;
disposition = kDispositionState::Create;
}
break;
case 2: // CREATE_ALWAYS
// Overwrite existing, if any.
if (content_manager->ContentExists(content_data)) {
content_manager->DeleteContent(content_data);
create = true;
} else {
create = true;
}
disposition = kDispositionState::Create;
break;
case 3: // OPEN_EXISTING
// Open only if exists.
if (!content_manager->ContentExists(content_data)) {
result = X_ERROR_PATH_NOT_FOUND;
} else {
open = true;
disposition = kDispositionState::Open;
}
break;
case 4: // OPEN_ALWAYS
// Create if needed.
if (!content_manager->ContentExists(content_data)) {
create = true;
disposition = kDispositionState::Create;
} else {
open = true;
disposition = kDispositionState::Open;
}
break;
case 5: // TRUNCATE_EXISTING
@ -188,7 +187,7 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
result = X_ERROR_PATH_NOT_FOUND;
} else {
content_manager->DeleteContent(content_data);
create = true;
disposition = kDispositionState::Create;
}
break;
default:
@ -196,21 +195,12 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
break;
}
// creation result
// 0 = ?
// 1 = created
// 2 = opened
uint32_t disposition = create ? 1 : 2;
if (disposition_ptr) {
*disposition_ptr = static_cast<uint32_t>(disposition);
}
if (create) {
if (disposition == kDispositionState::Create) {
result = content_manager->CreateContent(root_name, content_data);
if (XSUCCEEDED(result)) {
content_manager->WriteContentHeaderFile(&content_data);
}
} else if (open) {
} else if (disposition == kDispositionState::Open) {
result = content_manager->OpenContent(root_name, content_data);
}
@ -224,12 +214,11 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
}
extended_error = X_HRESULT_FROM_WIN32(result);
length = disposition;
length = static_cast<uint32_t>(disposition);
if (result && overlapped_ptr) {
result = X_ERROR_FUNCTION_FAILED;
}
return result;
};
@ -451,7 +440,6 @@ static_assert_size(X_SWAPDISC_ERROR_MESSAGE, 12);
dword_result_t XamSwapDisc_entry(
dword_t disc_number, pointer_t<X_KEVENT> completion_handle,
pointer_t<X_SWAPDISC_ERROR_MESSAGE> error_message) {
xex2_opt_execution_info* info = nullptr;
kernel_state()->GetExecutableModule()->GetOptHeader(XEX_HEADER_EXECUTION_INFO,
&info);

View File

@ -254,202 +254,15 @@ dword_result_t XGetLanguage_entry() {
}
DECLARE_XAM_EXPORT1(XGetLanguage, kNone, kImplemented);
// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/
// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/
dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) {
LARGE_INTEGER delay{};
// Convert the delay time to 100-nanosecond intervals
delay.QuadPart = dwMilliseconds == -1
? LLONG_MAX
: static_cast<LONGLONG>(-10000) * dwMilliseconds;
X_STATUS result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
(uint64_t*)&delay);
// If the delay was interrupted by an APC, keep delaying the thread
while (bAlertable && result == X_STATUS_ALERTED) {
result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
(uint64_t*)&delay);
}
return result == X_STATUS_SUCCESS ? X_STATUS_SUCCESS : X_STATUS_USER_APC;
}
DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented);
dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) {
return RtlSleep_entry(dwMilliseconds, bAlertable);
}
DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented);
// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep
void Sleep_entry(dword_t dwMilliseconds) {
RtlSleep_entry(dwMilliseconds, FALSE);
}
DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented);
// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount
dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); }
DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented);
dword_result_t XamGetCurrentTitleId_entry() {
return kernel_state()->emulator()->title_id();
}
DECLARE_XAM_EXPORT1(XamGetCurrentTitleId, kNone, kImplemented);
dword_result_t RtlSetLastNTError_entry(dword_t error_code) {
const uint32_t result =
xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code);
XThread::SetLastError(result);
  return result;
}
DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented);
dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); }
DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented);
dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); }
DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented);
dword_result_t GetModuleHandleA_entry(lpstring_t module_name) {
xe::be<uint32_t> module_ptr = 0;
const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle(
module_name.value(), &module_ptr);
if (XFAILED(error_code)) {
RtlSetLastNTError_entry(error_code);
return NULL;
}
return (uint32_t)module_ptr;
}
DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented);
dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes,
dword_t dwStackSize,
lpvoid_t lpStartAddress,
lpvoid_t lpParameter,
dword_t dwCreationFlags, dword_t unkn,
lpdword_t lpThreadId) {
uint32_t flags = (dwCreationFlags >> 2) & 1;
if (unkn != -1) {
flags |= 1 << unkn << 24;
}
xe::be<uint32_t> result = 0;
const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread(
&result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags);
if (XFAILED(error_code)) {
RtlSetLastNTError_entry(error_code);
return NULL;
}
return (uint32_t)result;
}
DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented);
dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes,
dword_t dwStackSize, lpvoid_t lpStartAddress,
lpvoid_t lpParameter, dword_t dwCreationFlags,
lpdword_t lpThreadId) {
return XapipCreateThread_entry(lpThreadAttributes, dwStackSize,
lpStartAddress, lpParameter, dwCreationFlags,
-1, lpThreadId);
}
DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented);
dword_result_t CloseHandle_entry(dword_t hObject) {
const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject);
if (XFAILED(error_code)) {
RtlSetLastNTError_entry(error_code);
return false;
}
return true;
}
DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented);
dword_result_t ResumeThread_entry(dword_t hThread) {
uint32_t suspend_count;
const X_STATUS error_code =
xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count);
if (XFAILED(error_code)) {
RtlSetLastNTError_entry(error_code);
return -1;
}
return suspend_count;
}
DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented);
void ExitThread_entry(dword_t exit_code) {
xe::kernel::xboxkrnl::ExTerminateThread(exit_code);
}
DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented);
dword_result_t GetCurrentThreadId_entry() {
return XThread::GetCurrentThread()->GetCurrentThreadId();
}
DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented);
qword_result_t XapiFormatTimeOut_entry(lpqword_t result,
dword_t dwMilliseconds) {
LARGE_INTEGER delay{};
// Convert the delay time to 100-nanosecond intervals
delay.QuadPart =
dwMilliseconds == -1 ? 0 : static_cast<LONGLONG>(-10000) * dwMilliseconds;
return (uint64_t)&delay;
}
DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented);
dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle,
dword_t dwMilliseconds,
dword_t bAlertable) {
uint64_t* timeout = nullptr;
uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds);
X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
hHandle, 1, bAlertable, &timeout_ptr);
while (bAlertable && result == X_STATUS_ALERTED) {
result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
hHandle, 1, bAlertable, &timeout_ptr);
}
  if (XFAILED(result)) {
    RtlSetLastNTError_entry(result);
    result = -1;
  }
  return result;
}
DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented);
dword_result_t WaitForSingleObject_entry(dword_t hHandle,
dword_t dwMilliseconds) {
return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0);
}
DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented);
dword_result_t lstrlenW_entry(lpu16string_t string) {
// wcslen?
if (string) {
return (uint32_t)string.value().length();
}
return NULL;
}
DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented);
dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) {
  return ctx->kernel_state->title_id() == 0xFFFE07D1;
}
DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented);
dword_result_t XamGetExecutionId_entry(lpdword_t info_ptr) {
auto module = kernel_state()->GetExecutableModule();
@ -611,16 +424,204 @@ dword_result_t XamQueryLiveHiveW_entry(lpu16string_t name, lpvoid_t out_buf,
}
DECLARE_XAM_EXPORT1(XamQueryLiveHiveW, kNone, kStub);
dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) {
  return ctx->kernel_state->title_id() == 0xFFFE07D1;
}
DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented);
// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/
// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/
dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) {
LARGE_INTEGER delay{};
// Convert the delay time to 100-nanosecond intervals
delay.QuadPart = dwMilliseconds == -1
? LLONG_MAX
: static_cast<LONGLONG>(-10000) * dwMilliseconds;
X_STATUS result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
(uint64_t*)&delay);
// If the delay was interrupted by an APC, keep delaying the thread
while (bAlertable && result == X_STATUS_ALERTED) {
result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
(uint64_t*)&delay);
  }
  return result == X_STATUS_SUCCESS ? X_STATUS_SUCCESS : X_STATUS_USER_APC;
}
DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented);
dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) {
return RtlSleep_entry(dwMilliseconds, bAlertable);
}
DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented);
// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep
void Sleep_entry(dword_t dwMilliseconds) {
RtlSleep_entry(dwMilliseconds, FALSE);
}
DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented);
// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount
dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); }
DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented);
dword_result_t RtlSetLastNTError_entry(dword_t error_code) {
const uint32_t result =
xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code);
XThread::SetLastError(result);
return result;
}
DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented);
dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); }
DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented);
dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); }
DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented);
dword_result_t GetModuleHandleA_entry(lpstring_t module_name) {
xe::be<uint32_t> module_ptr = 0;
const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle(
module_name.value(), &module_ptr);
if (XFAILED(error_code)) {
RtlSetLastNTError_entry(error_code);
return NULL;
}
return (uint32_t)module_ptr;
}
DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented);
dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes,
dword_t dwStackSize,
lpvoid_t lpStartAddress,
lpvoid_t lpParameter,
dword_t dwCreationFlags, dword_t unkn,
lpdword_t lpThreadId) {
uint32_t flags = (dwCreationFlags >> 2) & 1;
if (unkn != -1) {
flags |= 1 << unkn << 24;
}
xe::be<uint32_t> result = 0;
const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread(
&result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags);
if (XFAILED(error_code)) {
RtlSetLastNTError_entry(error_code);
return NULL;
}
return (uint32_t)result;
}
DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented);
dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes,
dword_t dwStackSize, lpvoid_t lpStartAddress,
lpvoid_t lpParameter, dword_t dwCreationFlags,
lpdword_t lpThreadId) {
return XapipCreateThread_entry(lpThreadAttributes, dwStackSize,
lpStartAddress, lpParameter, dwCreationFlags,
-1, lpThreadId);
}
DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented);
dword_result_t CloseHandle_entry(dword_t hObject) {
const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject);
if (XFAILED(error_code)) {
RtlSetLastNTError_entry(error_code);
return false;
}
return true;
}
DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented);
dword_result_t ResumeThread_entry(dword_t hThread) {
uint32_t suspend_count;
const X_STATUS error_code =
xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count);
if (XFAILED(error_code)) {
RtlSetLastNTError_entry(error_code);
return -1;
}
return suspend_count;
}
DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented);
void ExitThread_entry(dword_t exit_code) {
xe::kernel::xboxkrnl::ExTerminateThread(exit_code);
}
DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented);
dword_result_t GetCurrentThreadId_entry() {
return XThread::GetCurrentThread()->GetCurrentThreadId();
}
DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented);
qword_result_t XapiFormatTimeOut_entry(lpqword_t result,
dword_t dwMilliseconds) {
LARGE_INTEGER delay{};
// Convert the delay time to 100-nanosecond intervals
delay.QuadPart =
dwMilliseconds == -1 ? 0 : static_cast<LONGLONG>(-10000) * dwMilliseconds;
return (uint64_t)&delay;
}
DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented);
dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle,
dword_t dwMilliseconds,
dword_t bAlertable) {
uint64_t* timeout = nullptr;
uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds);
X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
hHandle, 1, bAlertable, &timeout_ptr);
while (bAlertable && result == X_STATUS_ALERTED) {
result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
hHandle, 1, bAlertable, &timeout_ptr);
}
  if (XFAILED(result)) {
    RtlSetLastNTError_entry(result);
    result = -1;
  }
  return result;
}
DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented);
dword_result_t WaitForSingleObject_entry(dword_t hHandle,
dword_t dwMilliseconds) {
return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0);
}
DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented);
dword_result_t lstrlenW_entry(lpu16string_t string) {
// wcslen?
if (string) {
return (uint32_t)string.value().length();
}
return NULL;
}
DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented);
dword_result_t XGetAudioFlags_entry() { return 65537; }
DECLARE_XAM_EXPORT1(XGetAudioFlags, kNone, kStub);
/*
  todo: this table should instead be pointed to by a member of kernel
  state and initialized along with the process
*/
static int32_t XamRtlRandomTable[128] = {
1284227242, 1275210071, 573735546, 790525478, 2139871995, 1547161642,