Merge branch 'master' of https://github.com/xenia-project/xenia into canary_experimental
This commit is contained in:
commit
ce9a82ccf8
|
@ -217,6 +217,10 @@ std::vector<FileInfo> ListFiles(const std::filesystem::path& path) {
|
|||
}
|
||||
|
||||
while (auto ent = readdir(dir)) {
|
||||
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
FileInfo info;
|
||||
|
||||
info.name = ent->d_name;
|
||||
|
@ -225,6 +229,7 @@ std::vector<FileInfo> ListFiles(const std::filesystem::path& path) {
|
|||
info.create_timestamp = convertUnixtimeToWinFiletime(st.st_ctime);
|
||||
info.access_timestamp = convertUnixtimeToWinFiletime(st.st_atime);
|
||||
info.write_timestamp = convertUnixtimeToWinFiletime(st.st_mtime);
|
||||
info.path = path;
|
||||
if (ent->d_type == DT_DIR) {
|
||||
info.type = FileInfo::Type::kDirectory;
|
||||
info.total_size = 0;
|
||||
|
@ -234,7 +239,7 @@ std::vector<FileInfo> ListFiles(const std::filesystem::path& path) {
|
|||
}
|
||||
result.push_back(info);
|
||||
}
|
||||
|
||||
closedir(dir);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#include "xenia/base/utf8.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <locale>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
|
|
|
@ -481,6 +481,43 @@ struct VECTOR_COMPARE_UGT_V128
|
|||
: Sequence<VECTOR_COMPARE_UGT_V128,
|
||||
I<OPCODE_VECTOR_COMPARE_UGT, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
|
||||
kX64EmitAVX512DQ) &&
|
||||
(i.instr->flags != FLOAT32_TYPE)) {
|
||||
Xmm src1 = e.xmm0;
|
||||
if (i.src1.is_constant) {
|
||||
e.LoadConstantXmm(src1, i.src1.constant());
|
||||
} else {
|
||||
src1 = i.src1;
|
||||
}
|
||||
|
||||
Xmm src2 = e.xmm1;
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(src2, i.src2.constant());
|
||||
} else {
|
||||
src2 = i.src2;
|
||||
}
|
||||
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
e.vpcmpub(e.k1, src1, src2, 0x6);
|
||||
e.vpmovm2b(i.dest, e.k1);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
e.vpcmpuw(e.k1, src1, src2, 0x6);
|
||||
e.vpmovm2w(i.dest, e.k1);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
e.vpcmpud(e.k1, src1, src2, 0x6);
|
||||
e.vpmovm2d(i.dest, e.k1);
|
||||
break;
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
|
|
|
@ -646,8 +646,9 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
break;
|
||||
case OPCODE_AND_NOT:
|
||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->AndNot(i->src2.value);
|
||||
v->set_from(i->src2.value);
|
||||
v->Not();
|
||||
v->And(i->src1.value);
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
|
|
|
@ -324,8 +324,13 @@ int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) {
|
|||
}
|
||||
|
||||
// Emits HIR for vaddcuw: for every 32-bit lane, VD receives the carry-out of
// the unsigned addition VA + VB.
// Returns 0 once the instruction has been emitted.
int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) {
  // Unsigned 32-bit lane-wise sum of VA and VB.
  // NOTE(review): relies on this add wrapping rather than saturating -
  // confirm against VectorAdd's handling of ARITHMETIC_UNSIGNED.
  Value* sum = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE,
                           ARITHMETIC_UNSIGNED);
  // A lane carried out exactly when VA is (unsigned) greater than the wrapped
  // sum.
  Value* overflow = f.VectorCompareUGT(f.LoadVR(i.VX.VA), sum, INT32_TYPE);
  // Shift each lane of the comparison result right by 31 bits so only its top
  // bit remains (presumably turning an all-ones lane mask into 1).
  Value* carry =
      f.VectorShr(overflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE);
  f.StoreVR(i.VX.VD, carry);
  return 0;
}
|
||||
|
||||
int InstrEmit_vaddfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) {
|
||||
|
@ -1665,7 +1670,11 @@ int InstrEmit_vsrw128(PPCHIRBuilder& f, const InstrData& i) {
|
|||
}
|
||||
|
||||
// Emits HIR for vsubcuw: for every 32-bit lane, VD receives the result of the
// unsigned comparison VA >= VB reduced to its top bit.
// Returns 0 once the instruction has been emitted, matching
// InstrEmit_vaddcuw; a non-zero result is only used by not-implemented stubs.
int InstrEmit_vsubcuw(PPCHIRBuilder& f, const InstrData& i) {
  // Unsigned lane-wise VA >= VB.
  // NOTE(review): this is the carry-out of VA + ~VB + 1 (no borrow needed) -
  // confirm against the instruction definition.
  Value* underflow =
      f.VectorCompareUGE(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE);
  // Shift each lane of the comparison result right by 31 bits so only its top
  // bit remains (presumably turning an all-ones lane mask into 1).
  Value* borrow =
      f.VectorShr(underflow, f.LoadConstantVec128(vec128i(31)), INT32_TYPE);
  f.StoreVR(i.VX.VD, borrow);
  return 0;
}
|
||||
|
||||
|
|
|
@ -2574,7 +2574,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
|
|||
return false;
|
||||
}
|
||||
pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
|
||||
const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
|
||||
|
||||
const bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0;
|
||||
|
||||
// Pixel shader analysis.
|
||||
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
|
||||
|
@ -2604,7 +2605,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
|
|||
}
|
||||
|
||||
const bool memexport_used_pixel =
|
||||
pixel_shader && pixel_shader->is_valid_memexport_used();
|
||||
pixel_shader && (pixel_shader->memexport_eM_written() != 0);
|
||||
const bool memexport_used = memexport_used_vertex || memexport_used_pixel;
|
||||
|
||||
if (!BeginSubmission(true)) {
|
||||
|
@ -2831,12 +2832,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
|
|||
// Gather memexport ranges and ensure the heaps for them are resident, and
|
||||
// also load the data surrounding the export and to fill the regions that
|
||||
// won't be modified by the shaders.
|
||||
|
||||
memexport_range_count_ = 0;
|
||||
if (memexport_used_vertex || memexport_used_pixel) {
|
||||
bool retflag;
|
||||
bool retval = GatherMemexportRangesAndMakeResident(retflag);
|
||||
if (retflag) return retval;
|
||||
memexport_ranges_.clear();
|
||||
if (memexport_used_vertex) {
|
||||
draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_);
|
||||
}
|
||||
if (memexport_used_pixel) {
|
||||
draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_);
|
||||
}
|
||||
for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
|
||||
if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
|
||||
memexport_range.size_bytes)) {
|
||||
XELOGE(
|
||||
"Failed to request memexport stream at 0x{:08X} (size {}) in the "
|
||||
"shared memory",
|
||||
memexport_range.base_address_dwords << 2, memexport_range.size_bytes);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Primitive topology.
|
||||
D3D_PRIMITIVE_TOPOLOGY primitive_topology;
|
||||
|
@ -2935,11 +2946,22 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
|
|||
// If the shared memory is a UAV, it can't be used as an index buffer
|
||||
// (UAV is a read/write state, index buffer is a read-only state).
|
||||
// Need to copy the indices to a buffer in the index buffer state.
|
||||
bool retflag;
|
||||
bool retval = HandleMemexportGuestDMA(
|
||||
scratch_index_buffer, index_buffer_view,
|
||||
primitive_processing_result.guest_index_base, retflag);
|
||||
if (retflag) return retval;
|
||||
scratch_index_buffer = RequestScratchGPUBuffer(
|
||||
index_buffer_view.SizeInBytes, D3D12_RESOURCE_STATE_COPY_DEST);
|
||||
if (scratch_index_buffer == nullptr) {
|
||||
return false;
|
||||
}
|
||||
shared_memory_->UseAsCopySource();
|
||||
SubmitBarriers();
|
||||
deferred_command_list_.D3DCopyBufferRegion(
|
||||
scratch_index_buffer, 0, shared_memory_->GetBuffer(),
|
||||
primitive_processing_result.guest_index_base,
|
||||
index_buffer_view.SizeInBytes);
|
||||
PushTransitionBarrier(scratch_index_buffer,
|
||||
D3D12_RESOURCE_STATE_COPY_DEST,
|
||||
D3D12_RESOURCE_STATE_INDEX_BUFFER);
|
||||
index_buffer_view.BufferLocation =
|
||||
scratch_index_buffer->GetGPUVirtualAddress();
|
||||
} else {
|
||||
index_buffer_view.BufferLocation =
|
||||
shared_memory_->GetGPUAddress() +
|
||||
|
@ -2977,199 +2999,66 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
|
|||
}
|
||||
|
||||
if (memexport_used) {
|
||||
HandleMemexportDrawOrdering_AndReadback();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
// Copies the guest index data out of the shared memory buffer into a scratch
// GPU buffer and points index_buffer_view at that copy.
//
// scratch_index_buffer receives the requested scratch buffer (null if the
// request failed). guest_index_base is the source offset of the indices within
// the shared memory buffer.
//
// retflag tells the caller whether it has to return immediately: it is true
// (and the function returns false) when the scratch buffer couldn't be
// obtained, and false when the copy has been recorded successfully.
XE_COLD
XE_NOINLINE
bool D3D12CommandProcessor::HandleMemexportGuestDMA(
    ID3D12Resource*& scratch_index_buffer,
    D3D12_INDEX_BUFFER_VIEW& index_buffer_view, uint32_t guest_index_base,
    bool& retflag) {
  // Assume failure until everything below has been recorded.
  retflag = true;

  const auto index_data_size = index_buffer_view.SizeInBytes;
  scratch_index_buffer =
      RequestScratchGPUBuffer(index_data_size, D3D12_RESOURCE_STATE_COPY_DEST);
  if (!scratch_index_buffer) {
    return false;
  }

  // Shared memory -> scratch buffer copy.
  shared_memory_->UseAsCopySource();
  SubmitBarriers();
  ID3D12Resource* const shared_memory_buffer = shared_memory_->GetBuffer();
  deferred_command_list_.D3DCopyBufferRegion(scratch_index_buffer, 0,
                                             shared_memory_buffer,
                                             guest_index_base, index_data_size);

  // The copy destination becomes the index buffer for the draw.
  PushTransitionBarrier(scratch_index_buffer, D3D12_RESOURCE_STATE_COPY_DEST,
                        D3D12_RESOURCE_STATE_INDEX_BUFFER);
  index_buffer_view.BufferLocation =
      scratch_index_buffer->GetGPUVirtualAddress();

  // Nothing failed - the caller may continue; the return value is unused then.
  retflag = false;
  return false;
}
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
bool D3D12CommandProcessor::GatherMemexportRangesAndMakeResident(
|
||||
bool& retflag) {
|
||||
auto vertex_shader = static_cast<D3D12Shader*>(active_vertex_shader());
|
||||
auto pixel_shader = static_cast<D3D12Shader*>(active_pixel_shader());
|
||||
const xe::gpu::RegisterFile& regs = *register_file_;
|
||||
const bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
|
||||
const bool memexport_used_pixel =
|
||||
pixel_shader && pixel_shader->is_valid_memexport_used();
|
||||
retflag = true;
|
||||
if (memexport_used_vertex) {
|
||||
for (uint32_t constant_index :
|
||||
vertex_shader->memexport_stream_constants()) {
|
||||
const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
|
||||
XE_GPU_REG_SHADER_CONSTANT_000_X + constant_index * 4);
|
||||
if (memexport_stream.index_count == 0) {
|
||||
continue;
|
||||
}
|
||||
uint32_t memexport_format_size =
|
||||
GetSupportedMemExportFormatSize(memexport_stream.format);
|
||||
if (memexport_format_size == 0) {
|
||||
XELOGE("Unsupported memexport format {}",
|
||||
FormatInfo::GetName(
|
||||
xenos::TextureFormat(uint32_t(memexport_stream.format))));
|
||||
return false;
|
||||
}
|
||||
uint32_t memexport_size_dwords =
|
||||
memexport_stream.index_count * memexport_format_size;
|
||||
// Try to reduce the number of shared memory operations when writing
|
||||
// different elements into the same buffer through different exports
|
||||
// (happens in 4D5307E6).
|
||||
bool memexport_range_reused = false;
|
||||
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
|
||||
MemExportRange& memexport_range = memexport_ranges_[i];
|
||||
if (memexport_range.base_address_dwords ==
|
||||
memexport_stream.base_address) {
|
||||
memexport_range.size_dwords =
|
||||
std::max(memexport_range.size_dwords, memexport_size_dwords);
|
||||
memexport_range_reused = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Add a new range if haven't expanded an existing one.
|
||||
if (!memexport_range_reused) {
|
||||
MemExportRange& memexport_range =
|
||||
memexport_ranges_[memexport_range_count_++];
|
||||
memexport_range.base_address_dwords = memexport_stream.base_address;
|
||||
memexport_range.size_dwords = memexport_size_dwords;
|
||||
}
|
||||
// Make sure this memexporting draw is ordered with other work using shared
|
||||
// memory as a UAV.
|
||||
// TODO(Triang3l): Find some PM4 command that can be used for indication of
|
||||
// when memexports should be awaited?
|
||||
shared_memory_->MarkUAVWritesCommitNeeded();
|
||||
// Invalidate textures in memexported memory and watch for changes.
|
||||
for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) {
|
||||
shared_memory_->RangeWrittenByGpu(
|
||||
memexport_range.base_address_dwords << 2, memexport_range.size_bytes,
|
||||
false);
|
||||
}
|
||||
}
|
||||
if (memexport_used_pixel) {
|
||||
for (uint32_t constant_index : pixel_shader->memexport_stream_constants()) {
|
||||
const auto& memexport_stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
|
||||
XE_GPU_REG_SHADER_CONSTANT_256_X + constant_index * 4);
|
||||
if (memexport_stream.index_count == 0) {
|
||||
continue;
|
||||
if (cvars::d3d12_readback_memexport) {
|
||||
// Read the exported data on the CPU.
|
||||
uint32_t memexport_total_size = 0;
|
||||
for (const draw_util::MemExportRange& memexport_range :
|
||||
memexport_ranges_) {
|
||||
memexport_total_size += memexport_range.size_bytes;
|
||||
}
|
||||
uint32_t memexport_format_size =
|
||||
GetSupportedMemExportFormatSize(memexport_stream.format);
|
||||
if (memexport_format_size == 0) {
|
||||
XELOGE("Unsupported memexport format {}",
|
||||
FormatInfo::GetName(
|
||||
xenos::TextureFormat(uint32_t(memexport_stream.format))));
|
||||
return false;
|
||||
}
|
||||
uint32_t memexport_size_dwords =
|
||||
memexport_stream.index_count * memexport_format_size;
|
||||
bool memexport_range_reused = false;
|
||||
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
|
||||
MemExportRange& memexport_range = memexport_ranges_[i];
|
||||
if (memexport_range.base_address_dwords ==
|
||||
memexport_stream.base_address) {
|
||||
memexport_range.size_dwords =
|
||||
std::max(memexport_range.size_dwords, memexport_size_dwords);
|
||||
memexport_range_reused = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!memexport_range_reused) {
|
||||
MemExportRange& memexport_range =
|
||||
memexport_ranges_[memexport_range_count_++];
|
||||
memexport_range.base_address_dwords = memexport_stream.base_address;
|
||||
memexport_range.size_dwords = memexport_size_dwords;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
|
||||
const MemExportRange& memexport_range = memexport_ranges_[i];
|
||||
if (!shared_memory_->RequestRange(memexport_range.base_address_dwords << 2,
|
||||
memexport_range.size_dwords << 2)) {
|
||||
XELOGE(
|
||||
"Failed to request memexport stream at 0x{:08X} (size {}) in the "
|
||||
"shared memory",
|
||||
memexport_range.base_address_dwords << 2,
|
||||
memexport_range.size_dwords << 2);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
retflag = false;
|
||||
return {};
|
||||
}
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
void D3D12CommandProcessor::HandleMemexportDrawOrdering_AndReadback() {
|
||||
// Make sure this memexporting draw is ordered with other work using shared
|
||||
// memory as a UAV.
|
||||
// TODO(Triang3l): Find some PM4 command that can be used for indication of
|
||||
// when memexports should be awaited?
|
||||
shared_memory_->MarkUAVWritesCommitNeeded();
|
||||
// Invalidate textures in memexported memory and watch for changes.
|
||||
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
|
||||
const MemExportRange& memexport_range = memexport_ranges_[i];
|
||||
shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2,
|
||||
memexport_range.size_dwords << 2, false);
|
||||
}
|
||||
if (cvars::d3d12_readback_memexport) {
|
||||
// Read the exported data on the CPU.
|
||||
uint32_t memexport_total_size = 0;
|
||||
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
|
||||
memexport_total_size += memexport_ranges_[i].size_dwords << 2;
|
||||
}
|
||||
if (memexport_total_size != 0) {
|
||||
ID3D12Resource* readback_buffer =
|
||||
RequestReadbackBuffer(memexport_total_size);
|
||||
if (readback_buffer != nullptr) {
|
||||
shared_memory_->UseAsCopySource();
|
||||
SubmitBarriers();
|
||||
ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
|
||||
uint32_t readback_buffer_offset = 0;
|
||||
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
|
||||
const MemExportRange& memexport_range = memexport_ranges_[i];
|
||||
uint32_t memexport_range_size = memexport_range.size_dwords << 2;
|
||||
deferred_command_list_.D3DCopyBufferRegion(
|
||||
readback_buffer, readback_buffer_offset, shared_memory_buffer,
|
||||
memexport_range.base_address_dwords << 2, memexport_range_size);
|
||||
readback_buffer_offset += memexport_range_size;
|
||||
}
|
||||
if (AwaitAllQueueOperationsCompletion()) {
|
||||
D3D12_RANGE readback_range;
|
||||
readback_range.Begin = 0;
|
||||
readback_range.End = memexport_total_size;
|
||||
void* readback_mapping;
|
||||
if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
|
||||
&readback_mapping))) {
|
||||
const uint32_t* readback_dwords =
|
||||
reinterpret_cast<const uint32_t*>(readback_mapping);
|
||||
for (uint32_t i = 0; i < memexport_range_count_; ++i) {
|
||||
const MemExportRange& memexport_range = memexport_ranges_[i];
|
||||
std::memcpy(memory_->TranslatePhysical(
|
||||
memexport_range.base_address_dwords << 2),
|
||||
readback_dwords, memexport_range.size_dwords << 2);
|
||||
readback_dwords += memexport_range.size_dwords;
|
||||
if (memexport_total_size != 0) {
|
||||
ID3D12Resource* readback_buffer =
|
||||
RequestReadbackBuffer(memexport_total_size);
|
||||
if (readback_buffer != nullptr) {
|
||||
shared_memory_->UseAsCopySource();
|
||||
SubmitBarriers();
|
||||
ID3D12Resource* shared_memory_buffer = shared_memory_->GetBuffer();
|
||||
uint32_t readback_buffer_offset = 0;
|
||||
for (const draw_util::MemExportRange& memexport_range :
|
||||
memexport_ranges_) {
|
||||
uint32_t memexport_range_size = memexport_range.size_bytes;
|
||||
deferred_command_list_.D3DCopyBufferRegion(
|
||||
readback_buffer, readback_buffer_offset, shared_memory_buffer,
|
||||
memexport_range.base_address_dwords << 2, memexport_range_size);
|
||||
readback_buffer_offset += memexport_range_size;
|
||||
}
|
||||
if (AwaitAllQueueOperationsCompletion()) {
|
||||
D3D12_RANGE readback_range;
|
||||
readback_range.Begin = 0;
|
||||
readback_range.End = memexport_total_size;
|
||||
void* readback_mapping;
|
||||
if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
|
||||
&readback_mapping))) {
|
||||
const uint8_t* readback_bytes =
|
||||
reinterpret_cast<const uint8_t*>(readback_mapping);
|
||||
for (const draw_util::MemExportRange& memexport_range :
|
||||
memexport_ranges_) {
|
||||
std::memcpy(memory_->TranslatePhysical(
|
||||
memexport_range.base_address_dwords << 2),
|
||||
readback_bytes, memexport_range.size_bytes);
|
||||
readback_bytes += memexport_range.size_bytes;
|
||||
}
|
||||
D3D12_RANGE readback_write_range = {};
|
||||
readback_buffer->Unmap(0, &readback_write_range);
|
||||
}
|
||||
D3D12_RANGE readback_write_range = {};
|
||||
readback_buffer->Unmap(0, &readback_write_range);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void D3D12CommandProcessor::InitializeTrace() {
|
||||
|
@ -5208,36 +5097,6 @@ bool D3D12CommandProcessor::UpdateBindings_BindfulPath(
|
|||
return {};
|
||||
}
|
||||
|
||||
// Returns the number of dwords one element of the given memexport format
// occupies, or 0 when the format isn't handled by this switch.
uint32_t D3D12CommandProcessor::GetSupportedMemExportFormatSize(
    xenos::ColorFormat format) {
  uint32_t dwords_per_element = 0;
  switch (format) {
    // One dword per element.
    case xenos::ColorFormat::k_8_8_8_8:
    case xenos::ColorFormat::k_2_10_10_10:
    // TODO(Triang3l): Investigate how k_8_8_8_8_A works - not supported in
    // the texture cache currently.
    // case xenos::ColorFormat::k_8_8_8_8_A:
    case xenos::ColorFormat::k_10_11_11:
    case xenos::ColorFormat::k_11_11_10:
    case xenos::ColorFormat::k_16_16:
    case xenos::ColorFormat::k_16_16_FLOAT:
    case xenos::ColorFormat::k_32_FLOAT:
    case xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16:
    case xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16:
    case xenos::ColorFormat::k_10_11_11_AS_16_16_16_16:
    case xenos::ColorFormat::k_11_11_10_AS_16_16_16_16:
      dwords_per_element = 1;
      break;
    // Two dwords per element.
    case xenos::ColorFormat::k_16_16_16_16:
    case xenos::ColorFormat::k_16_16_16_16_FLOAT:
    case xenos::ColorFormat::k_32_32_FLOAT:
      dwords_per_element = 2;
      break;
    // Four dwords per element.
    case xenos::ColorFormat::k_32_32_32_32_FLOAT:
      dwords_per_element = 4;
      break;
    // Anything else is reported as unsupported.
    default:
      break;
  }
  return dwords_per_element;
}
|
||||
|
||||
ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
|
||||
if (size == 0) {
|
||||
return nullptr;
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "xenia/base/assert.h"
|
||||
#include "xenia/gpu/command_processor.h"
|
||||
|
@ -319,18 +320,7 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
bool IssueDraw(xenos::PrimitiveType primitive_type, uint32_t index_count,
|
||||
IndexBufferInfo* index_buffer_info,
|
||||
bool major_mode_explicit) override;
|
||||
XE_COLD
|
||||
XE_NOINLINE
|
||||
bool HandleMemexportGuestDMA(ID3D12Resource*& scratch_index_buffer,
|
||||
D3D12_INDEX_BUFFER_VIEW& index_buffer_view,
|
||||
uint32_t guest_index_base,
|
||||
bool& retflag);
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
bool GatherMemexportRangesAndMakeResident(bool& retflag);
|
||||
XE_NOINLINE
|
||||
XE_COLD
|
||||
void HandleMemexportDrawOrdering_AndReadback();
|
||||
|
||||
bool IssueCopy() override;
|
||||
XE_NOINLINE
|
||||
bool IssueCopy_ReadbackResolvePath();
|
||||
|
@ -502,13 +492,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
const size_t sampler_count_vertex, const size_t sampler_count_pixel,
|
||||
bool& retflag);
|
||||
|
||||
// Returns dword count for one element for a memexport format, or 0 if it's
|
||||
// not supported by the D3D12 command processor (if it's smaller than 1 dword,
|
||||
// for instance).
|
||||
// TODO(Triang3l): Check if any game uses memexport with formats smaller than
|
||||
// 32 bits per element.
|
||||
static uint32_t GetSupportedMemExportFormatSize(xenos::ColorFormat format);
|
||||
|
||||
// Returns a buffer for reading GPU data back to the CPU. Assuming
|
||||
// synchronizing immediately after use. Always in COPY_DEST state.
|
||||
ID3D12Resource* RequestReadbackBuffer(uint32_t size);
|
||||
|
@ -811,12 +794,13 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
|
||||
draw_util::GetViewportInfoArgs previous_viewport_info_args_;
|
||||
draw_util::ViewportInfo previous_viewport_info_;
|
||||
// scratch memexport data
|
||||
MemExportRange memexport_ranges_[512];
|
||||
uint32_t memexport_range_count_ = 0;
|
||||
|
||||
|
||||
std::atomic<bool> pix_capture_requested_ = false;
|
||||
bool pix_capturing_;
|
||||
|
||||
// Temporary storage for memexport stream constants used in the draw.
|
||||
std::vector<draw_util::MemExportRange> memexport_ranges_;
|
||||
};
|
||||
|
||||
} // namespace d3d12
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
******************************************************************************
|
||||
* Xenia : Xbox 360 Emulator Research Project *
|
||||
******************************************************************************
|
||||
* Copyright 2022 Ben Vanik. All rights reserved. *
|
||||
* Copyright 2023 Ben Vanik. All rights reserved. *
|
||||
* Released under the BSD license - see LICENSE in the root for more details. *
|
||||
******************************************************************************
|
||||
*/
|
||||
|
@ -134,7 +134,7 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
|
|||
//
|
||||
// Memory export is an obvious intentional side effect.
|
||||
if (shader.kills_pixels() || shader.writes_depth() ||
|
||||
shader.is_valid_memexport_used() ||
|
||||
shader.memexport_eM_written() ||
|
||||
(shader.writes_color_target(0) &&
|
||||
DoesCoverageDependOnAlpha(regs.Get<reg::RB_COLORCONTROL>()))) {
|
||||
return true;
|
||||
|
@ -765,8 +765,70 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
|
|||
}
|
||||
return normalized_color_mask;
|
||||
}
|
||||
|
||||
void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
|
||||
std::vector<MemExportRange>& ranges_out) {
|
||||
if (!shader.memexport_eM_written()) {
|
||||
// The shader has eA writes, but no real exports.
|
||||
return;
|
||||
}
|
||||
uint32_t float_constants_base = shader.type() == xenos::ShaderType::kVertex
|
||||
? regs.Get<reg::SQ_VS_CONST>().base
|
||||
: regs.Get<reg::SQ_PS_CONST>().base;
|
||||
for (uint32_t constant_index : shader.memexport_stream_constants()) {
|
||||
const auto& stream = regs.Get<xenos::xe_gpu_memexport_stream_t>(
|
||||
XE_GPU_REG_SHADER_CONSTANT_000_X +
|
||||
(float_constants_base + constant_index) * 4);
|
||||
if (!stream.index_count) {
|
||||
continue;
|
||||
}
|
||||
const FormatInfo& format_info =
|
||||
*FormatInfo::Get(xenos::TextureFormat(stream.format));
|
||||
if (format_info.type != FormatType::kResolvable) {
|
||||
XELOGE("Unsupported memexport format {}",
|
||||
FormatInfo::GetName(format_info.format));
|
||||
// Translated shaders shouldn't be performing exports with an unknown
|
||||
// format, the draw can still be performed.
|
||||
continue;
|
||||
}
|
||||
// TODO(Triang3l): Remove the unresearched format logging when it's known
|
||||
// how exactly these formats need to be handled (most importantly what
|
||||
// components need to be stored and in which order).
|
||||
switch (stream.format) {
|
||||
case xenos::ColorFormat::k_8_A:
|
||||
case xenos::ColorFormat::k_8_B:
|
||||
case xenos::ColorFormat::k_8_8_8_8_A:
|
||||
XELOGW(
|
||||
"Memexport done to an unresearched format {}, report the game to "
|
||||
"Xenia developers!",
|
||||
FormatInfo::GetName(format_info.format));
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
uint32_t stream_size_bytes =
|
||||
stream.index_count * (format_info.bits_per_pixel >> 3);
|
||||
// Try to reduce the number of shared memory operations when writing
|
||||
// different elements into the same buffer through different exports
|
||||
// (happens in 4D5307E6).
|
||||
bool range_reused = false;
|
||||
for (MemExportRange& range : ranges_out) {
|
||||
if (range.base_address_dwords == stream.base_address) {
|
||||
range.size_bytes = std::max(range.size_bytes, stream_size_bytes);
|
||||
range_reused = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Add a new range if haven't expanded an existing one.
|
||||
if (!range_reused) {
|
||||
ranges_out.emplace_back(stream.base_address, stream_size_bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
XE_NOINLINE
|
||||
XE_NOALIAS
|
||||
|
||||
xenos::CopySampleSelect SanitizeCopySampleSelect(
|
||||
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
|
||||
bool is_depth) {
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "xenia/base/assert.h"
|
||||
#include "xenia/gpu/register_file.h"
|
||||
|
@ -474,6 +475,19 @@ inline uint32_t GetD3D10SampleIndexForGuest2xMSAA(
|
|||
return guest_sample_index ? 3 : 0;
|
||||
}
|
||||
|
||||
// A memory range that may be written by a memexport.
struct MemExportRange {
  // Start of the range, expressed in dwords (users shift it left by 2 to get
  // the byte address).
  uint32_t base_address_dwords;
  // Length of the range in bytes.
  uint32_t size_bytes;

  explicit MemExportRange(uint32_t range_base_address_dwords,
                          uint32_t range_size_bytes)
      : base_address_dwords(range_base_address_dwords),
        size_bytes(range_size_bytes) {}
};
|
||||
|
||||
// Gathers memory ranges involved in memexports in the shader with the float
|
||||
// constants from the registers, adding them to ranges_out.
|
||||
void AddMemExportRanges(const RegisterFile& regs, const Shader& shader,
|
||||
std::vector<MemExportRange>& ranges_out);
|
||||
|
||||
// To avoid passing values that the shader won't understand (even though
|
||||
// Direct3D 9 shouldn't pass them anyway).
|
||||
XE_NOINLINE
|
||||
|
|
|
@ -913,6 +913,8 @@ enum class OperandModifier : uint32_t {
|
|||
|
||||
struct Dest : OperandAddress {
|
||||
// Ignored for 0-component and 1-component operand types.
|
||||
// For 4-component operand types, if the write mask is 0, it's treated as
|
||||
// 0-component.
|
||||
uint32_t write_mask_;
|
||||
|
||||
// Input destinations (v*) are for use only in declarations. Vector input
|
||||
|
@ -1028,12 +1030,16 @@ struct Dest : OperandAddress {
|
|||
void Write(std::vector<uint32_t>& code, bool in_dcl = false) const {
|
||||
uint32_t operand_token = GetOperandTokenTypeAndIndex();
|
||||
OperandDimension dimension = GetDimension(in_dcl);
|
||||
operand_token |= uint32_t(dimension);
|
||||
if (dimension == OperandDimension::kVector) {
|
||||
assert_true(write_mask_ > 0b0000 && write_mask_ <= 0b1111);
|
||||
operand_token |=
|
||||
(uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4);
|
||||
if (write_mask_) {
|
||||
assert_true(write_mask_ <= 0b1111);
|
||||
operand_token |=
|
||||
(uint32_t(ComponentSelection::kMask) << 2) | (write_mask_ << 4);
|
||||
} else {
|
||||
dimension = OperandDimension::kNoData;
|
||||
}
|
||||
}
|
||||
operand_token |= uint32_t(dimension);
|
||||
code.push_back(operand_token);
|
||||
OperandAddress::Write(code);
|
||||
}
|
||||
|
@ -1508,6 +1514,8 @@ enum class Opcode : uint32_t {
|
|||
kStoreUAVTyped = 164,
|
||||
kLdRaw = 165,
|
||||
kStoreRaw = 166,
|
||||
kAtomicAnd = 169,
|
||||
kAtomicOr = 170,
|
||||
kEvalSampleIndex = 204,
|
||||
kEvalCentroid = 205,
|
||||
};
|
||||
|
@ -2396,6 +2404,14 @@ class Assembler {
|
|||
++stat_.instruction_count;
|
||||
++stat_.c_texture_store_instructions;
|
||||
}
|
||||
void OpAtomicAnd(const Dest& dest, const Src& address,
|
||||
uint32_t address_components, const Src& value) {
|
||||
EmitAtomicOp(Opcode::kAtomicAnd, dest, address, address_components, value);
|
||||
}
|
||||
void OpAtomicOr(const Dest& dest, const Src& address,
|
||||
uint32_t address_components, const Src& value) {
|
||||
EmitAtomicOp(Opcode::kAtomicOr, dest, address, address_components, value);
|
||||
}
|
||||
void OpEvalSampleIndex(const Dest& dest, const Src& value,
|
||||
const Src& sample_index) {
|
||||
uint32_t dest_write_mask = dest.GetMask();
|
||||
|
@ -2522,6 +2538,22 @@ class Assembler {
|
|||
src1.Write(code_, true, 0b0000);
|
||||
++stat_.instruction_count;
|
||||
}
|
||||
// Encodes an atomic instruction with the given opcode: the opcode token,
// followed by the destination, the address (its first address_components
// components) and the value (its first component only).
// Also bumps the instruction and interlocked instruction counters.
void EmitAtomicOp(Opcode opcode, const Dest& dest, const Src& address,
                  uint32_t address_components, const Src& value) {
  // Atomic operations require a 0-component memory destination.
  assert_zero(dest.GetMask());

  // Low address_components bits set.
  uint32_t address_component_mask = (1 << address_components) - 1;

  // Total operand size, needed for the opcode token and for reserving space.
  uint32_t total_operands_length = dest.GetLength();
  total_operands_length += address.GetLength(address_component_mask);
  total_operands_length += value.GetLength(0b0001);

  // One extra token for the opcode itself.
  code_.reserve(code_.size() + 1 + total_operands_length);
  code_.push_back(OpcodeToken(opcode, total_operands_length));
  dest.Write(code_);
  address.Write(code_, true, address_component_mask);
  value.Write(code_, true, 0b0001);

  ++stat_.instruction_count;
  ++stat_.c_interlocked_instructions;
}
|
||||
|
||||
std::vector<uint32_t>& code_;
|
||||
Statistics& stat_;
|
||||
|
|
|
@ -179,8 +179,6 @@ void DxbcShaderTranslator::Reset() {
|
|||
|
||||
sampler_bindings_.clear();
|
||||
|
||||
memexport_alloc_current_count_ = 0;
|
||||
|
||||
std::memset(&shader_feature_info_, 0, sizeof(shader_feature_info_));
|
||||
std::memset(&statistics_, 0, sizeof(statistics_));
|
||||
}
|
||||
|
@ -789,6 +787,63 @@ void DxbcShaderTranslator::StartPixelShader() {
|
|||
PopSystemTemp();
|
||||
}
|
||||
}
|
||||
|
||||
if (current_shader().memexport_eM_written()) {
|
||||
// Make sure memexport is done only once for a guest pixel.
|
||||
dxbc::Dest memexport_enabled_dest(
|
||||
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001));
|
||||
dxbc::Src memexport_enabled_src(dxbc::Src::R(
|
||||
system_temp_memexport_enabled_and_eM_written_, dxbc::Src::kXXXX));
|
||||
uint32_t resolution_scaled_axes =
|
||||
uint32_t(draw_resolution_scale_x_ > 1) |
|
||||
(uint32_t(draw_resolution_scale_y_ > 1) << 1);
|
||||
if (resolution_scaled_axes) {
|
||||
uint32_t memexport_condition_temp = PushSystemTemp();
|
||||
// Only do memexport for one host pixel in a guest pixel - prefer the
|
||||
// host pixel closer to the center of the guest pixel, but one that's
|
||||
// covered with the half-pixel offset according to the top-left rule (1
|
||||
// for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x
|
||||
// because it's the center and is covered with the half-pixel offset too).
|
||||
in_position_used_ |= resolution_scaled_axes;
|
||||
a_.OpFToU(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
|
||||
dxbc::Src::V1D(in_reg_ps_position_));
|
||||
a_.OpUDiv(dxbc::Dest::Null(),
|
||||
dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
|
||||
dxbc::Src::R(memexport_condition_temp),
|
||||
dxbc::Src::LU(draw_resolution_scale_x_,
|
||||
draw_resolution_scale_y_, 0, 0));
|
||||
a_.OpIEq(dxbc::Dest::R(memexport_condition_temp, resolution_scaled_axes),
|
||||
dxbc::Src::R(memexport_condition_temp),
|
||||
dxbc::Src::LU(draw_resolution_scale_x_ >> 1,
|
||||
draw_resolution_scale_y_ >> 1, 0, 0));
|
||||
for (uint32_t i = 0; i < 2; ++i) {
|
||||
if (!(resolution_scaled_axes & (1 << i))) {
|
||||
continue;
|
||||
}
|
||||
a_.OpAnd(memexport_enabled_dest, memexport_enabled_src,
|
||||
dxbc::Src::R(memexport_condition_temp).Select(i));
|
||||
}
|
||||
// Release memexport_condition_temp.
|
||||
PopSystemTemp();
|
||||
}
|
||||
// With sample-rate shading (with float24 conversion), only do memexport
|
||||
// from one sample (as the shader is invoked multiple times for a pixel),
|
||||
// if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage,
|
||||
// firstbit_lo returns 0xFFFFFFFF.
|
||||
if (IsSampleRate()) {
|
||||
uint32_t memexport_condition_temp = PushSystemTemp();
|
||||
a_.OpFirstBitLo(dxbc::Dest::R(memexport_condition_temp, 0b0001),
|
||||
dxbc::Src::VCoverage());
|
||||
a_.OpIEq(
|
||||
dxbc::Dest::R(memexport_condition_temp, 0b0001),
|
||||
dxbc::Src::V1D(in_reg_ps_front_face_sample_index_, dxbc::Src::kYYYY),
|
||||
dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX));
|
||||
a_.OpAnd(memexport_enabled_dest, memexport_enabled_src,
|
||||
dxbc::Src::R(memexport_condition_temp, dxbc::Src::kXXXX));
|
||||
// Release memexport_condition_temp.
|
||||
PopSystemTemp();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DxbcShaderTranslator::StartTranslation() {
|
||||
|
@ -885,34 +940,27 @@ void DxbcShaderTranslator::StartTranslation() {
|
|||
}
|
||||
}
|
||||
|
||||
if (!is_depth_only_pixel_shader_) {
|
||||
// Allocate temporary registers for memexport addresses and data.
|
||||
std::memset(system_temps_memexport_address_, 0xFF,
|
||||
sizeof(system_temps_memexport_address_));
|
||||
std::memset(system_temps_memexport_data_, 0xFF,
|
||||
sizeof(system_temps_memexport_data_));
|
||||
system_temp_memexport_written_ = UINT32_MAX;
|
||||
const uint8_t* memexports_written = current_shader().memexport_eM_written();
|
||||
for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) {
|
||||
uint32_t memexport_alloc_written = memexports_written[i];
|
||||
if (memexport_alloc_written == 0) {
|
||||
continue;
|
||||
}
|
||||
// If memexport is used at all, allocate a register containing whether eM#
|
||||
// have actually been written to.
|
||||
if (system_temp_memexport_written_ == UINT32_MAX) {
|
||||
system_temp_memexport_written_ = PushSystemTemp(0b1111);
|
||||
}
|
||||
system_temps_memexport_address_[i] = PushSystemTemp(0b1111);
|
||||
uint32_t memexport_data_index;
|
||||
while (xe::bit_scan_forward(memexport_alloc_written,
|
||||
&memexport_data_index)) {
|
||||
memexport_alloc_written &= ~(1u << memexport_data_index);
|
||||
system_temps_memexport_data_[i][memexport_data_index] =
|
||||
PushSystemTemp();
|
||||
}
|
||||
// Allocate temporary registers for memexport.
|
||||
uint8_t memexport_eM_written = current_shader().memexport_eM_written();
|
||||
if (memexport_eM_written) {
|
||||
system_temp_memexport_enabled_and_eM_written_ = PushSystemTemp(0b0010);
|
||||
// Initialize the memexport conditional to whether the shared memory is
|
||||
// currently bound as UAV (to 0 or UINT32_MAX). It can be made narrower
|
||||
// later.
|
||||
a_.OpIBFE(
|
||||
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0001),
|
||||
dxbc::Src::LU(1), dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift),
|
||||
LoadFlagsSystemConstant());
|
||||
system_temp_memexport_address_ = PushSystemTemp(0b1111);
|
||||
uint8_t memexport_eM_remaining = memexport_eM_written;
|
||||
uint32_t memexport_eM_index;
|
||||
while (xe::bit_scan_forward(memexport_eM_remaining, &memexport_eM_index)) {
|
||||
memexport_eM_remaining &= ~(uint8_t(1) << memexport_eM_index);
|
||||
system_temps_memexport_data_[memexport_eM_index] = PushSystemTemp(0b1111);
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_depth_only_pixel_shader_) {
|
||||
// Allocate system temporary variables for the translated code. Since access
|
||||
// depends on the guest code (thus no guarantees), initialize everything
|
||||
// now (except for pv, it's an internal temporary variable, not accessible
|
||||
|
@ -1091,27 +1139,19 @@ void DxbcShaderTranslator::CompleteShaderCode() {
|
|||
// - system_temp_grad_h_lod_.
|
||||
// - system_temp_grad_v_vfetch_address_.
|
||||
PopSystemTemp(6);
|
||||
}
|
||||
|
||||
// Write memexported data to the shared memory UAV.
|
||||
ExportToMemory();
|
||||
uint8_t memexport_eM_written = current_shader().memexport_eM_written();
|
||||
if (memexport_eM_written) {
|
||||
// Write data for the last memexport.
|
||||
ExportToMemory(
|
||||
current_shader().memexport_eM_potentially_written_before_end());
|
||||
|
||||
// Release memexport temporary registers.
|
||||
for (int i = Shader::kMaxMemExports - 1; i >= 0; --i) {
|
||||
if (system_temps_memexport_address_[i] == UINT32_MAX) {
|
||||
continue;
|
||||
}
|
||||
// Release exported data registers.
|
||||
for (int j = 4; j >= 0; --j) {
|
||||
if (system_temps_memexport_data_[i][j] != UINT32_MAX) {
|
||||
PopSystemTemp();
|
||||
}
|
||||
}
|
||||
// Release the address register.
|
||||
PopSystemTemp();
|
||||
}
|
||||
if (system_temp_memexport_written_ != UINT32_MAX) {
|
||||
PopSystemTemp();
|
||||
}
|
||||
// Release memexport temporary registers:
|
||||
// - system_temp_memexport_enabled_and_eM_written_.
|
||||
// - system_temp_memexport_address_.
|
||||
// - system_temps_memexport_data_.
|
||||
PopSystemTemp(xe::bit_count(uint32_t(memexport_eM_written)) + 2);
|
||||
}
|
||||
|
||||
// Write stage-specific epilogue.
|
||||
|
@ -1514,36 +1554,22 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
|
|||
dest = dxbc::Dest::R(system_temp_point_size_edge_flag_kill_vertex_);
|
||||
break;
|
||||
case InstructionStorageTarget::kExportAddress:
|
||||
// Validate memexport writes (4D5307E6 has some completely invalid ones).
|
||||
if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 ||
|
||||
memexport_alloc_current_count_ > Shader::kMaxMemExports ||
|
||||
system_temps_memexport_address_[memexport_alloc_current_count_ - 1] ==
|
||||
UINT32_MAX) {
|
||||
if (!current_shader().memexport_eM_written()) {
|
||||
return;
|
||||
}
|
||||
dest = dxbc::Dest::R(
|
||||
system_temps_memexport_address_[memexport_alloc_current_count_ - 1]);
|
||||
dest = dxbc::Dest::R(system_temp_memexport_address_);
|
||||
break;
|
||||
case InstructionStorageTarget::kExportData: {
|
||||
// Validate memexport writes (4D5307E6 has some completely invalid ones).
|
||||
if (memexport_alloc_current_count_ == 0 ||
|
||||
memexport_alloc_current_count_ > Shader::kMaxMemExports ||
|
||||
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
|
||||
[result.storage_index] == UINT32_MAX) {
|
||||
return;
|
||||
}
|
||||
dest = dxbc::Dest::R(
|
||||
system_temps_memexport_data_[memexport_alloc_current_count_ - 1]
|
||||
[result.storage_index]);
|
||||
assert_not_zero(current_shader().memexport_eM_written() &
|
||||
(uint8_t(1) << result.storage_index));
|
||||
dest = dxbc::Dest::R(system_temps_memexport_data_[result.storage_index]);
|
||||
// Mark that the eM# has been written to and needs to be exported.
|
||||
assert_not_zero(used_write_mask);
|
||||
uint32_t memexport_index = memexport_alloc_current_count_ - 1;
|
||||
a_.OpOr(dxbc::Dest::R(system_temp_memexport_written_,
|
||||
1 << (memexport_index >> 2)),
|
||||
dxbc::Src::R(system_temp_memexport_written_)
|
||||
.Select(memexport_index >> 2),
|
||||
dxbc::Src::LU(uint32_t(1) << (result.storage_index +
|
||||
((memexport_index & 3) << 3))));
|
||||
a_.OpOr(
|
||||
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010),
|
||||
dxbc::Src::R(system_temp_memexport_enabled_and_eM_written_,
|
||||
dxbc::Src::kYYYY),
|
||||
dxbc::Src::LU(uint8_t(1) << result.storage_index));
|
||||
} break;
|
||||
case InstructionStorageTarget::kColor:
|
||||
assert_not_zero(used_write_mask);
|
||||
|
@ -1990,15 +2016,38 @@ void DxbcShaderTranslator::ProcessJumpInstruction(
|
|||
}
|
||||
|
||||
// Handles an `alloc` control flow instruction.
// instr - the parsed alloc; its type is inspected here, and it is disassembled
//   into the source map when emit_source_map_ is set.
// export_eM - eM# bit mask handed to ExportToMemory; when non-zero, the pending
//   export is written out and the eM# temporaries are reset before continuing.
// NOTE(review): this span is a rendered diff - it carries both the previous
// one-parameter signature / memexport_alloc_current_count_ increment and the
// export_eM logic that replaces them.
void DxbcShaderTranslator::ProcessAllocInstruction(
|
||||
const ParsedAllocInstruction& instr) {
|
||||
const ParsedAllocInstruction& instr, uint8_t export_eM) {
|
||||
// A memory alloc only starts a new memexport if the shader writes any eM#.
bool start_memexport = instr.type == AllocType::kMemory &&
|
||||
current_shader().memexport_eM_written();
|
||||
// NOTE(review): presumably the code emitted below must not end up inside an
// open exec conditional - confirm against CloseExecConditionals.
if (export_eM || start_memexport) {
|
||||
CloseExecConditionals();
|
||||
}
|
||||
|
||||
if (emit_source_map_) {
|
||||
instruction_disassembly_buffer_.Reset();
|
||||
instr.Disassemble(&instruction_disassembly_buffer_);
|
||||
EmitInstructionDisassembly();
|
||||
}
|
||||
|
||||
if (instr.type == AllocType::kMemory) {
|
||||
++memexport_alloc_current_count_;
|
||||
if (export_eM) {
|
||||
ExportToMemory(export_eM);
|
||||
// Reset which eM# elements have been written.
|
||||
a_.OpMov(
|
||||
dxbc::Dest::R(system_temp_memexport_enabled_and_eM_written_, 0b0010),
|
||||
dxbc::Src::LU(0));
|
||||
// Break dependencies from the previous memexport.
|
||||
uint8_t export_eM_remaining = export_eM;
|
||||
uint32_t eM_index;
|
||||
// Visit every bit set in export_eM, clearing each one after its data temp
// has been zeroed (assumes bit_scan_forward reports the index of a set bit
// and returns false for 0 - TODO confirm).
while (xe::bit_scan_forward(export_eM_remaining, &eM_index)) {
|
||||
export_eM_remaining &= ~(uint8_t(1) << eM_index);
|
||||
a_.OpMov(dxbc::Dest::R(system_temps_memexport_data_[eM_index]),
|
||||
dxbc::Src::LF(0.0f));
|
||||
}
|
||||
}
|
||||
|
||||
if (start_memexport) {
|
||||
// Initialize eA to an invalid address.
|
||||
a_.OpMov(dxbc::Dest::R(system_temp_memexport_address_), dxbc::Src::LU(0));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2851,7 +2900,7 @@ void DxbcShaderTranslator::WriteInputSignature() {
|
|||
// Sample index (SV_SampleIndex) for safe memexport with sample-rate
|
||||
// shading.
|
||||
size_t sample_index_position = SIZE_MAX;
|
||||
if (current_shader().is_valid_memexport_used() && IsSampleRate()) {
|
||||
if (current_shader().memexport_eM_written() && IsSampleRate()) {
|
||||
size_t sample_index_position = shader_object_.size();
|
||||
shader_object_.resize(shader_object_.size() + kParameterDwords);
|
||||
++parameter_count;
|
||||
|
@ -3625,7 +3674,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
|
|||
dxbc::Name::kPosition);
|
||||
}
|
||||
bool sample_rate_memexport =
|
||||
current_shader().is_valid_memexport_used() && IsSampleRate();
|
||||
current_shader().memexport_eM_written() && IsSampleRate();
|
||||
// Sample-rate shading can't be done with UAV-only rendering (sample-rate
|
||||
// shading is only needed for float24 depth conversion when using a float32
|
||||
// host depth buffer).
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "xenia/base/string_buffer.h"
|
||||
#include "xenia/gpu/dxbc.h"
|
||||
#include "xenia/gpu/shader_translator.h"
|
||||
#include "xenia/gpu/ucode.h"
|
||||
#include "xenia/ui/graphics_provider.h"
|
||||
|
||||
namespace xe {
|
||||
|
@ -589,13 +590,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
|
|||
void ProcessLoopEndInstruction(
|
||||
const ParsedLoopEndInstruction& instr) override;
|
||||
void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override;
|
||||
void ProcessAllocInstruction(const ParsedAllocInstruction& instr) override;
|
||||
void ProcessAllocInstruction(const ParsedAllocInstruction& instr,
|
||||
uint8_t export_eM) override;
|
||||
|
||||
void ProcessVertexFetchInstruction(
|
||||
const ParsedVertexFetchInstruction& instr) override;
|
||||
void ProcessTextureFetchInstruction(
|
||||
const ParsedTextureFetchInstruction& instr) override;
|
||||
void ProcessAluInstruction(const ParsedAluInstruction& instr) override;
|
||||
void ProcessAluInstruction(
|
||||
const ParsedAluInstruction& instr,
|
||||
uint8_t memexport_eM_potentially_written_before) override;
|
||||
|
||||
private:
|
||||
// IF ANY OF THESE ARE CHANGED, WriteInputSignature and WriteOutputSignature
|
||||
|
@ -674,6 +678,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
|
|||
// Frees the last allocated internal r# registers for later reuse.
|
||||
void PopSystemTemp(uint32_t count = 1);
|
||||
|
||||
// ExportToMemory modifies the values of eA/eM# for simplicity, call only
|
||||
// before starting a new export or ending the invocation or making it
|
||||
// inactive.
|
||||
void ExportToMemory(uint8_t export_eM);
|
||||
|
||||
// Converts one scalar from piecewise linear gamma to linear. The target may
|
||||
// be the same as the source, the temporary variables must be different. If
|
||||
// the source is not pre-saturated, saturation will be done internally.
|
||||
|
@ -728,7 +737,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
|
|||
// Whether depth/stencil may be handled early on the ROV path (per the name):
// true only when this is not the depth-only pixel shader and the current shader
// neither writes depth nor uses memory export.
// Asserts that edram_rov_used_ is set.
// NOTE(review): this span is a rendered diff - the is_valid_memexport_used()
// line is the previous form of the memexport_eM_written() line after it.
bool ROV_IsDepthStencilEarly() const {
|
||||
assert_true(edram_rov_used_);
|
||||
return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() &&
|
||||
!current_shader().is_valid_memexport_used();
|
||||
!current_shader().memexport_eM_written();
|
||||
}
|
||||
// Converts the pre-clamped depth value to 24-bit (storing the result in bits
|
||||
// 0:23 and zeros in 24:31, not creating room for stencil - since this may be
|
||||
|
@ -787,14 +796,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
|
|||
void StartPixelShader_LoadROVParameters();
|
||||
void StartPixelShader();
|
||||
|
||||
// Writing the epilogue.
|
||||
// ExportToMemory modifies the values of eA/eM# for simplicity, don't call
|
||||
// multiple times.
|
||||
void ExportToMemory_PackFixed32(const uint32_t* eM_temps, uint32_t eM_count,
|
||||
const uint32_t bits[4],
|
||||
const dxbc::Src& is_integer,
|
||||
const dxbc::Src& is_signed);
|
||||
void ExportToMemory();
|
||||
void CompleteVertexOrDomainShader();
|
||||
// For RTV, adds the sample to coverage_temp.coverage_temp_component if it
|
||||
// passes alpha to mask (or, if initialize == true (for the first sample
|
||||
|
@ -917,13 +918,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
|
|||
.SelectFromSwizzled(word_index & 1);
|
||||
}
|
||||
|
||||
void KillPixel(bool condition, const dxbc::Src& condition_src);
|
||||
void KillPixel(bool condition, const dxbc::Src& condition_src,
|
||||
uint8_t memexport_eM_potentially_written_before);
|
||||
|
||||
void ProcessVectorAluOperation(const ParsedAluInstruction& instr,
|
||||
uint32_t& result_swizzle,
|
||||
bool& predicate_written);
|
||||
void ProcessScalarAluOperation(const ParsedAluInstruction& instr,
|
||||
bool& predicate_written);
|
||||
void ProcessVectorAluOperation(
|
||||
const ParsedAluInstruction& instr,
|
||||
uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle,
|
||||
bool& predicate_written);
|
||||
void ProcessScalarAluOperation(
|
||||
const ParsedAluInstruction& instr,
|
||||
uint8_t memexport_eM_potentially_written_before, bool& predicate_written);
|
||||
|
||||
void WriteResourceDefinition();
|
||||
void WriteInputSignature();
|
||||
|
@ -1124,14 +1128,16 @@ class DxbcShaderTranslator : public ShaderTranslator {
|
|||
// writing).
|
||||
uint32_t system_temps_color_[4];
|
||||
|
||||
// Bits containing whether each eM# has been written, for up to 16 streams, or
|
||||
// UINT32_MAX if memexport is not used. 8 bits (5 used) for each stream, with
|
||||
// 4 `alloc export`s per component.
|
||||
uint32_t system_temp_memexport_written_;
|
||||
// eA in each `alloc export`, or UINT32_MAX if not used.
|
||||
uint32_t system_temps_memexport_address_[Shader::kMaxMemExports];
|
||||
// eM# in each `alloc export`, or UINT32_MAX if not used.
|
||||
uint32_t system_temps_memexport_data_[Shader::kMaxMemExports][5];
|
||||
// Memory export temporary registers are allocated if the shader writes any
|
||||
// eM# (current_shader().memexport_eM_written() != 0).
|
||||
// X - whether memexport is enabled for this invocation.
|
||||
// Y - which eM# elements have been written so far by the invocation since the
|
||||
// last memory write.
|
||||
uint32_t system_temp_memexport_enabled_and_eM_written_;
|
||||
// eA.
|
||||
uint32_t system_temp_memexport_address_;
|
||||
// eM#.
|
||||
uint32_t system_temps_memexport_data_[ucode::kMaxMemExportElementCount];
|
||||
|
||||
// Vector ALU or fetch result / scratch (since Xenos write masks can contain
|
||||
// swizzles).
|
||||
|
@ -1195,10 +1201,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
|
|||
uint32_t uav_index_edram_;
|
||||
|
||||
std::vector<SamplerBinding> sampler_bindings_;
|
||||
|
||||
// Number of `alloc export`s encountered so far in the translation. The index
|
||||
// of the current eA/eM# temp register set is this minus 1, if it's not 0.
|
||||
uint32_t memexport_alloc_current_count_;
|
||||
};
|
||||
|
||||
} // namespace gpu
|
||||
|
|
|
@ -19,22 +19,29 @@ namespace xe {
|
|||
namespace gpu {
|
||||
using namespace ucode;
|
||||
|
||||
// Emits a conditional pixel kill: opens an a_.OpIf(condition, condition_src)
// block, flushes pending memory exports, discards the pixel and closes the
// block.
// condition / condition_src - forwarded unchanged to a_.OpIf.
// memexport_eM_potentially_written_before - forwarded to ExportToMemory so the
//   outstanding eM# data is written before the discard.
// NOTE(review): this span is a rendered diff - it carries both the previous
// two-parameter signature / conditional OpDiscard and their replacements.
void DxbcShaderTranslator::KillPixel(bool condition,
|
||||
const dxbc::Src& condition_src) {
|
||||
void DxbcShaderTranslator::KillPixel(
|
||||
bool condition, const dxbc::Src& condition_src,
|
||||
uint8_t memexport_eM_potentially_written_before) {
|
||||
a_.OpIf(condition, condition_src);
|
||||
// Perform outstanding memory exports before the invocation becomes inactive
|
||||
// and UAV writes are disabled.
|
||||
ExportToMemory(memexport_eM_potentially_written_before);
|
||||
// Discard the pixel, but continue execution if other lanes in the quad need
|
||||
// this lane for derivatives. The driver may also perform early exiting
|
||||
// internally if all lanes are discarded if deemed beneficial.
|
||||
a_.OpDiscard(condition, condition_src);
|
||||
// The enclosing OpIf has already applied the condition, so the discard is
// given a constant UINT32_MAX source.
a_.OpDiscard(true, dxbc::Src::LU(UINT32_MAX));
|
||||
if (edram_rov_used_) {
|
||||
// Even though discarding disables all subsequent UAV/ROV writes, also skip
|
||||
// as much of the Render Backend emulation logic as possible by setting the
|
||||
// coverage and the mask of the written render targets to zero.
|
||||
a_.OpMov(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::LU(0));
|
||||
}
|
||||
a_.OpEndIf();
|
||||
}
|
||||
|
||||
void DxbcShaderTranslator::ProcessVectorAluOperation(
|
||||
const ParsedAluInstruction& instr, uint32_t& result_swizzle,
|
||||
const ParsedAluInstruction& instr,
|
||||
uint8_t memexport_eM_potentially_written_before, uint32_t& result_swizzle,
|
||||
bool& predicate_written) {
|
||||
result_swizzle = dxbc::Src::kXYZW;
|
||||
predicate_written = false;
|
||||
|
@ -506,7 +513,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
|
|||
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
|
||||
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
|
||||
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
memexport_eM_potentially_written_before);
|
||||
if (used_result_components) {
|
||||
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
|
@ -522,7 +530,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
|
|||
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
|
||||
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
|
||||
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
memexport_eM_potentially_written_before);
|
||||
if (used_result_components) {
|
||||
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
|
@ -538,7 +547,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
|
|||
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
|
||||
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
|
||||
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
memexport_eM_potentially_written_before);
|
||||
if (used_result_components) {
|
||||
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
|
@ -554,7 +564,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
|
|||
a_.OpOr(dxbc::Dest::R(system_temp_result_, 0b0001),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kYYYY));
|
||||
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX));
|
||||
KillPixel(true, dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
memexport_eM_potentially_written_before);
|
||||
if (used_result_components) {
|
||||
a_.OpAnd(dxbc::Dest::R(system_temp_result_, 0b0001),
|
||||
dxbc::Src::R(system_temp_result_, dxbc::Src::kXXXX),
|
||||
|
@ -640,7 +651,8 @@ void DxbcShaderTranslator::ProcessVectorAluOperation(
|
|||
}
|
||||
|
||||
void DxbcShaderTranslator::ProcessScalarAluOperation(
|
||||
const ParsedAluInstruction& instr, bool& predicate_written) {
|
||||
const ParsedAluInstruction& instr,
|
||||
uint8_t memexport_eM_potentially_written_before, bool& predicate_written) {
|
||||
predicate_written = false;
|
||||
|
||||
if (instr.scalar_opcode == ucode::AluScalarOpcode::kRetainPrev) {
|
||||
|
@ -950,27 +962,27 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
|
|||
|
||||
case AluScalarOpcode::kKillsEq:
|
||||
a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
|
||||
KillPixel(true, ps_src);
|
||||
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
|
||||
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
|
||||
break;
|
||||
case AluScalarOpcode::kKillsGt:
|
||||
a_.OpLT(ps_dest, dxbc::Src::LF(0.0f), operand_0_a);
|
||||
KillPixel(true, ps_src);
|
||||
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
|
||||
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
|
||||
break;
|
||||
case AluScalarOpcode::kKillsGe:
|
||||
a_.OpGE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
|
||||
KillPixel(true, ps_src);
|
||||
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
|
||||
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
|
||||
break;
|
||||
case AluScalarOpcode::kKillsNe:
|
||||
a_.OpNE(ps_dest, operand_0_a, dxbc::Src::LF(0.0f));
|
||||
KillPixel(true, ps_src);
|
||||
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
|
||||
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
|
||||
break;
|
||||
case AluScalarOpcode::kKillsOne:
|
||||
a_.OpEq(ps_dest, operand_0_a, dxbc::Src::LF(1.0f));
|
||||
KillPixel(true, ps_src);
|
||||
KillPixel(true, ps_src, memexport_eM_potentially_written_before);
|
||||
a_.OpAnd(ps_dest, ps_src, dxbc::Src::LF(1.0f));
|
||||
break;
|
||||
|
||||
|
@ -1024,7 +1036,8 @@ void DxbcShaderTranslator::ProcessScalarAluOperation(
|
|||
}
|
||||
|
||||
void DxbcShaderTranslator::ProcessAluInstruction(
|
||||
const ParsedAluInstruction& instr) {
|
||||
const ParsedAluInstruction& instr,
|
||||
uint8_t memexport_eM_potentially_written_before) {
|
||||
if (instr.IsNop()) {
|
||||
// Don't even disassemble or update predication.
|
||||
return;
|
||||
|
@ -1041,10 +1054,11 @@ void DxbcShaderTranslator::ProcessAluInstruction(
|
|||
// checked again later.
|
||||
bool predicate_written_vector = false;
|
||||
uint32_t vector_result_swizzle = dxbc::Src::kXYZW;
|
||||
ProcessVectorAluOperation(instr, vector_result_swizzle,
|
||||
predicate_written_vector);
|
||||
ProcessVectorAluOperation(instr, memexport_eM_potentially_written_before,
|
||||
vector_result_swizzle, predicate_written_vector);
|
||||
bool predicate_written_scalar = false;
|
||||
ProcessScalarAluOperation(instr, predicate_written_scalar);
|
||||
ProcessScalarAluOperation(instr, memexport_eM_potentially_written_before,
|
||||
predicate_written_scalar);
|
||||
|
||||
StoreResult(instr.vector_and_constant_result,
|
||||
dxbc::Src::R(system_temp_result_, vector_result_swizzle),
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -672,7 +672,7 @@ class Shader {
|
|||
// For implementation without unconditional support for memory writes from
|
||||
// vertex shaders, vertex shader converted to a compute shader doing only
|
||||
// memory export.
|
||||
kMemexportCompute,
|
||||
kMemExportCompute,
|
||||
|
||||
// 4 host vertices for 1 guest vertex, for implementations without
|
||||
// unconditional geometry shader support.
|
||||
|
@ -769,9 +769,16 @@ class Shader {
|
|||
}
|
||||
};
|
||||
|
||||
// Based on the number of AS_VS/PS_EXPORT_STREAM_* enum sets found in a game
|
||||
// .pdb.
|
||||
static constexpr uint32_t kMaxMemExports = 16;
|
||||
// Memory export (eM#) write state recorded for one control flow instruction.
// Both members default to 0.
// NOTE(review): presumably bit i of each member stands for eM<i>, matching how
// memexport_eM_written() is tested with (1 << storage_index) elsewhere -
// confirm.
struct ControlFlowMemExportInfo {
|
||||
// Which eM elements have potentially (regardless of conditionals, loop
|
||||
// iteration counts, predication) been written earlier in the predecessor
|
||||
// graph of the instruction since an `alloc export`.
|
||||
uint8_t eM_potentially_written_before = 0;
|
||||
// For exec sequences, which eM elements are potentially (regardless of
|
||||
// predication) written by the instructions in the sequence. For other
|
||||
// control flow instructions, it's 0.
|
||||
uint8_t eM_potentially_written_by_exec = 0;
|
||||
};
|
||||
|
||||
class Translation {
|
||||
public:
|
||||
|
@ -879,19 +886,21 @@ class Shader {
|
|||
return constant_register_map_;
|
||||
}
|
||||
|
||||
// uint5[Shader::kMaxMemExports] - bits indicating which eM# registers have
|
||||
// been written to after each `alloc export`, for up to Shader::kMaxMemExports
|
||||
// exports. This will contain zero for certain corrupt exports - for those to
|
||||
// which a valid eA was not written via a MAD with a stream constant.
|
||||
const uint8_t* memexport_eM_written() const { return memexport_eM_written_; }
|
||||
// Information about memory export state at each control flow instruction. May
|
||||
// be empty if there are no eM# writes.
|
||||
// Read-only accessor: returns a const reference to the cf_memexport_info_
// member, so no copy of the vector is made.
const std::vector<ControlFlowMemExportInfo>& cf_memexport_info() const {
|
||||
return cf_memexport_info_;
|
||||
}
|
||||
|
||||
// All c# registers used as the addend in MAD operations to eA.
|
||||
// Which memexport elements (eM#) are written for any memexport in the shader.
// Callers test the value for non-zero to detect that memexport is used.
uint8_t memexport_eM_written() const { return memexport_eM_written_; }
|
||||
// ControlFlowMemExportInfo::eM_potentially_written_before equivalent for the
// end of the shader, i.e. for the last memory export (or exports, if the end
// has multiple predecessor chains exporting to memory).
uint8_t memexport_eM_potentially_written_before_end() const {
|
||||
return memexport_eM_potentially_written_before_end_;
|
||||
}
|
||||
|
||||
// c# registers used as the addend in MAD operations to eA.
|
||||
const std::set<uint32_t>& memexport_stream_constants() const {
|
||||
return memexport_stream_constants_;
|
||||
}
|
||||
bool is_valid_memexport_used() const {
|
||||
return !memexport_stream_constants_.empty();
|
||||
}
|
||||
|
||||
// Labels that jumps (explicit or from loops) can be done to.
|
||||
const std::set<uint32_t>& label_addresses() const { return label_addresses_; }
|
||||
|
@ -969,7 +978,7 @@ class Shader {
|
|||
// TODO(Triang3l): Investigate what happens to memexport when the pixel
|
||||
// fails the depth/stencil test, but in Direct3D 11 UAV writes disable early
|
||||
// depth/stencil.
|
||||
return !kills_pixels() && !writes_depth() && !is_valid_memexport_used();
|
||||
return !kills_pixels() && !writes_depth() && !memexport_eM_written();
|
||||
}
|
||||
|
||||
// Whether each color render target is written to on any execution path.
|
||||
|
@ -1041,8 +1050,6 @@ class Shader {
|
|||
std::vector<VertexBinding> vertex_bindings_;
|
||||
std::vector<TextureBinding> texture_bindings_;
|
||||
ConstantRegisterMap constant_register_map_ = {0};
|
||||
uint8_t memexport_eM_written_[kMaxMemExports] = {};
|
||||
std::set<uint32_t> memexport_stream_constants_;
|
||||
std::set<uint32_t> label_addresses_;
|
||||
uint32_t cf_pair_index_bound_ = 0;
|
||||
uint32_t register_static_address_bound_ = 0;
|
||||
|
@ -1054,6 +1061,17 @@ class Shader {
|
|||
bool uses_texture_fetch_instruction_results_ = false;
|
||||
bool writes_depth_ = false;
|
||||
|
||||
// Memory export eM write info for each control flow instruction, if there are
|
||||
// any eM writes in the shader.
|
||||
std::vector<ControlFlowMemExportInfo> cf_memexport_info_;
|
||||
// Which memexport elements (eM#) are written for any memexport in the shader.
|
||||
uint8_t memexport_eM_written_ = 0;
|
||||
// ControlFlowMemExportInfo::eM_potentially_written_before equivalent for the
|
||||
// end of the shader, for the last memory export (or exports if the end has
|
||||
// multiple predecessor chains exporting to memory).
|
||||
uint8_t memexport_eM_potentially_written_before_end_ = 0;
|
||||
std::set<uint32_t> memexport_stream_constants_;
|
||||
|
||||
// Modification bits -> translation.
|
||||
std::unordered_map<uint64_t, Translation*> translations_;
|
||||
|
||||
|
@ -1063,8 +1081,7 @@ class Shader {
|
|||
void GatherExecInformation(
|
||||
const ParsedExecInstruction& instr,
|
||||
ucode::VertexFetchInstruction& previous_vfetch_full,
|
||||
uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
|
||||
uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer);
|
||||
uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer);
|
||||
void GatherVertexFetchInformation(
|
||||
const ucode::VertexFetchInstruction& op,
|
||||
ucode::VertexFetchInstruction& previous_vfetch_full,
|
||||
|
@ -1073,13 +1090,12 @@ class Shader {
|
|||
uint32_t& unique_texture_bindings,
|
||||
StringBuffer& ucode_disasm_buffer);
|
||||
void GatherAluInstructionInformation(const ucode::AluInstruction& op,
|
||||
uint32_t memexport_alloc_current_count,
|
||||
uint32_t& memexport_eA_written,
|
||||
uint32_t exec_cf_index,
|
||||
StringBuffer& ucode_disasm_buffer);
|
||||
void GatherOperandInformation(const InstructionOperand& operand);
|
||||
void GatherFetchResultInformation(const InstructionResult& result);
|
||||
void GatherAluResultInformation(const InstructionResult& result,
|
||||
uint32_t memexport_alloc_current_count);
|
||||
uint32_t exec_cf_index);
|
||||
};
|
||||
|
||||
} // namespace gpu
|
||||
|
|
|
@ -87,8 +87,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
|
|||
VertexFetchInstruction previous_vfetch_full;
|
||||
std::memset(&previous_vfetch_full, 0, sizeof(previous_vfetch_full));
|
||||
uint32_t unique_texture_bindings = 0;
|
||||
uint32_t memexport_alloc_count = 0;
|
||||
uint32_t memexport_eA_written = 0;
|
||||
for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
|
||||
ControlFlowInstruction cf_ab[2];
|
||||
UnpackControlFlowInstructions(ucode_data_.data() + i * 3, cf_ab);
|
||||
|
@ -111,8 +109,7 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
|
|||
ParsedExecInstruction instr;
|
||||
ParseControlFlowExec(cf.exec, cf_index, instr);
|
||||
GatherExecInformation(instr, previous_vfetch_full,
|
||||
unique_texture_bindings, memexport_alloc_count,
|
||||
memexport_eA_written, ucode_disasm_buffer);
|
||||
unique_texture_bindings, ucode_disasm_buffer);
|
||||
} break;
|
||||
case ControlFlowOpcode::kCondExec:
|
||||
case ControlFlowOpcode::kCondExecEnd:
|
||||
|
@ -122,16 +119,14 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
|
|||
ParsedExecInstruction instr;
|
||||
ParseControlFlowCondExec(cf.cond_exec, cf_index, instr);
|
||||
GatherExecInformation(instr, previous_vfetch_full,
|
||||
unique_texture_bindings, memexport_alloc_count,
|
||||
memexport_eA_written, ucode_disasm_buffer);
|
||||
unique_texture_bindings, ucode_disasm_buffer);
|
||||
} break;
|
||||
case ControlFlowOpcode::kCondExecPred:
|
||||
case ControlFlowOpcode::kCondExecPredEnd: {
|
||||
ParsedExecInstruction instr;
|
||||
ParseControlFlowCondExecPred(cf.cond_exec_pred, cf_index, instr);
|
||||
GatherExecInformation(instr, previous_vfetch_full,
|
||||
unique_texture_bindings, memexport_alloc_count,
|
||||
memexport_eA_written, ucode_disasm_buffer);
|
||||
unique_texture_bindings, ucode_disasm_buffer);
|
||||
} break;
|
||||
case ControlFlowOpcode::kLoopStart: {
|
||||
ParsedLoopStartInstruction instr;
|
||||
|
@ -173,9 +168,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
|
|||
ParseControlFlowAlloc(cf.alloc, cf_index,
|
||||
type() == xenos::ShaderType::kVertex, instr);
|
||||
instr.Disassemble(&ucode_disasm_buffer);
|
||||
if (instr.type == AllocType::kMemory) {
|
||||
++memexport_alloc_count;
|
||||
}
|
||||
} break;
|
||||
case ControlFlowOpcode::kMarkVsFetchDone:
|
||||
break;
|
||||
|
@ -187,7 +179,6 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
|
|||
constant_register_map_.bool_bitmap[bool_constant_index / 32] |=
|
||||
uint32_t(1) << (bool_constant_index % 32);
|
||||
}
|
||||
// TODO(benvanik): break if (DoesControlFlowOpcodeEndShader(cf.opcode()))?
|
||||
}
|
||||
}
|
||||
ucode_disassembly_ = ucode_disasm_buffer.to_string();
|
||||
|
@ -206,17 +197,125 @@ void Shader::AnalyzeUcode(StringBuffer& ucode_disasm_buffer) {
|
|||
}
|
||||
}
|
||||
|
||||
// Cleanup invalid/unneeded memexport allocs.
|
||||
for (uint32_t i = 0; i < kMaxMemExports; ++i) {
|
||||
if (!(memexport_eA_written & (uint32_t(1) << i))) {
|
||||
memexport_eM_written_[i] = 0;
|
||||
} else if (!memexport_eM_written_[i]) {
|
||||
memexport_eA_written &= ~(uint32_t(1) << i);
|
||||
if (!cf_memexport_info_.empty()) {
|
||||
// Gather potentially "dirty" memexport elements before each control flow
|
||||
// instruction. `alloc` (any, not only `export`) flushes the previous memory
|
||||
// export. On the guest GPU, yielding / serializing also terminates memory
|
||||
// exports, but for simplicity disregarding that, as that functionally does
|
||||
// nothing compared to flushing the previous memory export only at `alloc`
|
||||
// or even only specifically at `alloc export`; Microsoft's validator checks
|
||||
// if eM# aren't written after a `serialize`.
|
||||
std::vector<uint32_t> successor_stack;
|
||||
for (uint32_t i = 0; i < cf_pair_index_bound_; ++i) {
|
||||
ControlFlowInstruction eM_writing_cf_ab[2];
|
||||
UnpackControlFlowInstructions(ucode_data_.data() + i * 3,
|
||||
eM_writing_cf_ab);
|
||||
for (uint32_t j = 0; j < 2; ++j) {
|
||||
uint32_t eM_writing_cf_index = i * 2 + j;
|
||||
uint32_t eM_written_by_cf_instr =
|
||||
cf_memexport_info_[eM_writing_cf_index]
|
||||
.eM_potentially_written_by_exec;
|
||||
if (eM_writing_cf_ab[j].opcode() == ControlFlowOpcode::kCondCall) {
|
||||
// Until subroutine calls are handled accurately, assume that all eM#
|
||||
// have potentially been written by the subroutine for simplicity.
|
||||
eM_written_by_cf_instr = memexport_eM_written_;
|
||||
}
|
||||
if (!eM_written_by_cf_instr) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If the control flow instruction potentially results in any eM# being
|
||||
// written, mark those eM# as potentially written before each successor.
|
||||
bool is_successor_graph_head = true;
|
||||
successor_stack.push_back(eM_writing_cf_index);
|
||||
while (!successor_stack.empty()) {
|
||||
uint32_t successor_cf_index = successor_stack.back();
|
||||
successor_stack.pop_back();
|
||||
|
||||
ControlFlowMemExportInfo& successor_memexport_info =
|
||||
cf_memexport_info_[successor_cf_index];
|
||||
if ((successor_memexport_info.eM_potentially_written_before &
|
||||
eM_written_by_cf_instr) == eM_written_by_cf_instr) {
|
||||
// Already marked as written before this instruction (and thus
|
||||
// before all its successors too). Possibly this instruction is in a
|
||||
// loop, in this case an instruction may succeed itself.
|
||||
break;
|
||||
}
|
||||
// The first instruction in the traversal is the writing instruction
|
||||
// itself, not its successor. However, if it has been visited by the
|
||||
// traversal twice, it's in a loop, so it succeeds itself, and thus
|
||||
// writes from it are potentially done before it too.
|
||||
if (!is_successor_graph_head) {
|
||||
successor_memexport_info.eM_potentially_written_before |=
|
||||
eM_written_by_cf_instr;
|
||||
}
|
||||
is_successor_graph_head = false;
|
||||
|
||||
ControlFlowInstruction successor_cf_ab[2];
|
||||
UnpackControlFlowInstructions(
|
||||
ucode_data_.data() + (successor_cf_index >> 1) * 3,
|
||||
successor_cf_ab);
|
||||
const ControlFlowInstruction& successor_cf =
|
||||
successor_cf_ab[successor_cf_index & 1];
|
||||
|
||||
bool next_instr_is_new_successor = true;
|
||||
switch (successor_cf.opcode()) {
|
||||
case ControlFlowOpcode::kExecEnd:
|
||||
// One successor: end.
|
||||
memexport_eM_potentially_written_before_end_ |=
|
||||
eM_written_by_cf_instr;
|
||||
next_instr_is_new_successor = false;
|
||||
break;
|
||||
case ControlFlowOpcode::kCondExecEnd:
|
||||
case ControlFlowOpcode::kCondExecPredEnd:
|
||||
case ControlFlowOpcode::kCondExecPredCleanEnd:
|
||||
// Two successors: next, end.
|
||||
memexport_eM_potentially_written_before_end_ |=
|
||||
eM_written_by_cf_instr;
|
||||
break;
|
||||
case ControlFlowOpcode::kLoopStart:
|
||||
// Two successors: next, skip.
|
||||
successor_stack.push_back(successor_cf.loop_start.address());
|
||||
break;
|
||||
case ControlFlowOpcode::kLoopEnd:
|
||||
// Two successors: next, repeat.
|
||||
successor_stack.push_back(successor_cf.loop_end.address());
|
||||
break;
|
||||
case ControlFlowOpcode::kCondCall:
|
||||
// Two successors: next, target.
|
||||
successor_stack.push_back(successor_cf.cond_call.address());
|
||||
break;
|
||||
case ControlFlowOpcode::kReturn:
|
||||
// Currently treating all subroutine calls as potentially writing
|
||||
// all eM# for simplicity, so just exit the subroutine.
|
||||
next_instr_is_new_successor = false;
|
||||
break;
|
||||
case ControlFlowOpcode::kCondJmp:
|
||||
// One or two successors: next if conditional, target.
|
||||
successor_stack.push_back(successor_cf.cond_jmp.address());
|
||||
if (successor_cf.cond_jmp.is_unconditional()) {
|
||||
next_instr_is_new_successor = false;
|
||||
}
|
||||
break;
|
||||
case ControlFlowOpcode::kAlloc:
|
||||
// Any `alloc` ends the previous export.
|
||||
next_instr_is_new_successor = false;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (next_instr_is_new_successor) {
|
||||
if (successor_cf_index < (cf_pair_index_bound_ << 1)) {
|
||||
successor_stack.push_back(successor_cf_index + 1);
|
||||
} else {
|
||||
memexport_eM_potentially_written_before_end_ |=
|
||||
eM_written_by_cf_instr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (memexport_eA_written == 0) {
|
||||
memexport_stream_constants_.clear();
|
||||
}
|
||||
|
||||
is_ucode_analyzed_ = true;
|
||||
|
||||
|
@ -250,8 +349,7 @@ uint32_t Shader::GetInterpolatorInputMask(reg::SQ_PROGRAM_CNTL sq_program_cntl,
|
|||
void Shader::GatherExecInformation(
|
||||
const ParsedExecInstruction& instr,
|
||||
ucode::VertexFetchInstruction& previous_vfetch_full,
|
||||
uint32_t& unique_texture_bindings, uint32_t memexport_alloc_current_count,
|
||||
uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
|
||||
uint32_t& unique_texture_bindings, StringBuffer& ucode_disasm_buffer) {
|
||||
instr.Disassemble(&ucode_disasm_buffer);
|
||||
uint32_t sequence = instr.sequence;
|
||||
for (uint32_t instr_offset = instr.instruction_address;
|
||||
|
@ -273,8 +371,7 @@ void Shader::GatherExecInformation(
|
|||
}
|
||||
} else {
|
||||
auto& op = *reinterpret_cast<const AluInstruction*>(op_ptr);
|
||||
GatherAluInstructionInformation(op, memexport_alloc_current_count,
|
||||
memexport_eA_written,
|
||||
GatherAluInstructionInformation(op, instr.dword_index,
|
||||
ucode_disasm_buffer);
|
||||
}
|
||||
}
|
||||
|
@ -381,8 +478,8 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
|
|||
}
|
||||
|
||||
void Shader::GatherAluInstructionInformation(
|
||||
const AluInstruction& op, uint32_t memexport_alloc_current_count,
|
||||
uint32_t& memexport_eA_written, StringBuffer& ucode_disasm_buffer) {
|
||||
const AluInstruction& op, uint32_t exec_cf_index,
|
||||
StringBuffer& ucode_disasm_buffer) {
|
||||
ParsedAluInstruction instr;
|
||||
ParseAluInstruction(op, type(), instr);
|
||||
instr.Disassemble(&ucode_disasm_buffer);
|
||||
|
@ -394,10 +491,8 @@ void Shader::GatherAluInstructionInformation(
|
|||
(ucode::GetAluScalarOpcodeInfo(op.scalar_opcode()).changed_state &
|
||||
ucode::kAluOpChangedStatePixelKill);
|
||||
|
||||
GatherAluResultInformation(instr.vector_and_constant_result,
|
||||
memexport_alloc_current_count);
|
||||
GatherAluResultInformation(instr.scalar_result,
|
||||
memexport_alloc_current_count);
|
||||
GatherAluResultInformation(instr.vector_and_constant_result, exec_cf_index);
|
||||
GatherAluResultInformation(instr.scalar_result, exec_cf_index);
|
||||
for (size_t i = 0; i < instr.vector_operand_count; ++i) {
|
||||
GatherOperandInformation(instr.vector_operands[i]);
|
||||
}
|
||||
|
@ -405,9 +500,7 @@ void Shader::GatherAluInstructionInformation(
|
|||
GatherOperandInformation(instr.scalar_operands[i]);
|
||||
}
|
||||
|
||||
// Store used memexport constants because CPU code needs addresses and sizes,
|
||||
// and also whether there have been writes to eA and eM# for register
|
||||
// allocation in shader translator implementations.
|
||||
// Store used memexport constants because CPU code needs addresses and sizes.
|
||||
// eA is (hopefully) always written to using:
|
||||
// mad eA, r#, const0100, c#
|
||||
// (though there are some exceptions, shaders in 4D5307E6 for some reason set
|
||||
|
@ -416,13 +509,9 @@ void Shader::GatherAluInstructionInformation(
|
|||
// Export is done to vector_dest of the ucode instruction for both vector and
|
||||
// scalar operations - no need to check separately.
|
||||
if (instr.vector_and_constant_result.storage_target ==
|
||||
InstructionStorageTarget::kExportAddress &&
|
||||
memexport_alloc_current_count > 0 &&
|
||||
memexport_alloc_current_count <= Shader::kMaxMemExports) {
|
||||
InstructionStorageTarget::kExportAddress) {
|
||||
uint32_t memexport_stream_constant = instr.GetMemExportStreamConstant();
|
||||
if (memexport_stream_constant != UINT32_MAX) {
|
||||
memexport_eA_written |= uint32_t(1)
|
||||
<< (memexport_alloc_current_count - 1);
|
||||
memexport_stream_constants_.insert(memexport_stream_constant);
|
||||
} else {
|
||||
XELOGE(
|
||||
|
@ -481,8 +570,8 @@ void Shader::GatherFetchResultInformation(const InstructionResult& result) {
|
|||
}
|
||||
}
|
||||
|
||||
void Shader::GatherAluResultInformation(
|
||||
const InstructionResult& result, uint32_t memexport_alloc_current_count) {
|
||||
void Shader::GatherAluResultInformation(const InstructionResult& result,
|
||||
uint32_t exec_cf_index) {
|
||||
uint32_t used_write_mask = result.GetUsedWriteMask();
|
||||
if (!used_write_mask) {
|
||||
return;
|
||||
|
@ -504,11 +593,12 @@ void Shader::GatherAluResultInformation(
|
|||
writes_point_size_edge_flag_kill_vertex_ |= used_write_mask;
|
||||
break;
|
||||
case InstructionStorageTarget::kExportData:
|
||||
if (memexport_alloc_current_count > 0 &&
|
||||
memexport_alloc_current_count <= Shader::kMaxMemExports) {
|
||||
memexport_eM_written_[memexport_alloc_current_count - 1] |=
|
||||
uint32_t(1) << result.storage_index;
|
||||
memexport_eM_written_ |= uint8_t(1) << result.storage_index;
|
||||
if (cf_memexport_info_.empty()) {
|
||||
cf_memexport_info_.resize(2 * cf_pair_index_bound_);
|
||||
}
|
||||
cf_memexport_info_[exec_cf_index].eM_potentially_written_by_exec |=
|
||||
uint32_t(1) << result.storage_index;
|
||||
break;
|
||||
case InstructionStorageTarget::kColor:
|
||||
writes_color_targets_ |= uint32_t(1) << result.storage_index;
|
||||
|
@ -665,7 +755,13 @@ void ShaderTranslator::TranslateControlFlowInstruction(
|
|||
case ControlFlowOpcode::kAlloc: {
|
||||
ParsedAllocInstruction instr;
|
||||
ParseControlFlowAlloc(cf.alloc, cf_index_, is_vertex_shader(), instr);
|
||||
ProcessAllocInstruction(instr);
|
||||
const std::vector<Shader::ControlFlowMemExportInfo>& cf_memexport_info =
|
||||
current_shader().cf_memexport_info();
|
||||
ProcessAllocInstruction(instr,
|
||||
instr.dword_index < cf_memexport_info.size()
|
||||
? cf_memexport_info[instr.dword_index]
|
||||
.eM_potentially_written_before
|
||||
: 0);
|
||||
} break;
|
||||
case ControlFlowOpcode::kMarkVsFetchDone:
|
||||
break;
|
||||
|
@ -807,6 +903,14 @@ void ParseControlFlowAlloc(const ControlFlowAllocInstruction& cf,
|
|||
void ShaderTranslator::TranslateExecInstructions(
|
||||
const ParsedExecInstruction& instr) {
|
||||
ProcessExecInstructionBegin(instr);
|
||||
|
||||
const std::vector<Shader::ControlFlowMemExportInfo>& cf_memexport_info =
|
||||
current_shader().cf_memexport_info();
|
||||
uint8_t eM_potentially_written_before =
|
||||
instr.dword_index < cf_memexport_info.size()
|
||||
? cf_memexport_info[instr.dword_index].eM_potentially_written_before
|
||||
: 0;
|
||||
|
||||
const uint32_t* ucode_dwords = current_shader().ucode_data().data();
|
||||
uint32_t sequence = instr.sequence;
|
||||
for (uint32_t instr_offset = instr.instruction_address;
|
||||
|
@ -832,9 +936,22 @@ void ShaderTranslator::TranslateExecInstructions(
|
|||
auto& op = *reinterpret_cast<const AluInstruction*>(op_ptr);
|
||||
ParsedAluInstruction alu_instr;
|
||||
ParseAluInstruction(op, current_shader().type(), alu_instr);
|
||||
ProcessAluInstruction(alu_instr);
|
||||
ProcessAluInstruction(alu_instr, eM_potentially_written_before);
|
||||
if (alu_instr.vector_and_constant_result.storage_target ==
|
||||
InstructionStorageTarget::kExportData &&
|
||||
alu_instr.vector_and_constant_result.GetUsedWriteMask()) {
|
||||
eM_potentially_written_before |=
|
||||
uint8_t(1) << alu_instr.vector_and_constant_result.storage_index;
|
||||
}
|
||||
if (alu_instr.scalar_result.storage_target ==
|
||||
InstructionStorageTarget::kExportData &&
|
||||
alu_instr.scalar_result.GetUsedWriteMask()) {
|
||||
eM_potentially_written_before |=
|
||||
uint8_t(1) << alu_instr.scalar_result.storage_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ProcessExecInstructionEnd(instr);
|
||||
}
|
||||
|
||||
|
|
|
@ -118,8 +118,10 @@ class ShaderTranslator {
|
|||
virtual void ProcessReturnInstruction(const ParsedReturnInstruction& instr) {}
|
||||
// Handles translation for jump instructions.
|
||||
virtual void ProcessJumpInstruction(const ParsedJumpInstruction& instr) {}
|
||||
// Handles translation for alloc instructions.
|
||||
virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr) {}
|
||||
// Handles translation for alloc instructions. Memory exports for eM#
|
||||
// indicated by export_eM must be performed, regardless of the alloc type.
|
||||
virtual void ProcessAllocInstruction(const ParsedAllocInstruction& instr,
|
||||
uint8_t export_eM) {}
|
||||
|
||||
// Handles translation for vertex fetch instructions.
|
||||
virtual void ProcessVertexFetchInstruction(
|
||||
|
@ -128,7 +130,13 @@ class ShaderTranslator {
|
|||
virtual void ProcessTextureFetchInstruction(
|
||||
const ParsedTextureFetchInstruction& instr) {}
|
||||
// Handles translation for ALU instructions.
|
||||
virtual void ProcessAluInstruction(const ParsedAluInstruction& instr) {}
|
||||
// memexport_eM_potentially_written_before needs to be handled by `kill`
|
||||
// instruction to make sure memory exports for the eM# writes earlier in
|
||||
// previous execs and the current exec are done before the invocation becomes
|
||||
// inactive.
|
||||
virtual void ProcessAluInstruction(
|
||||
const ParsedAluInstruction& instr,
|
||||
uint8_t memexport_eM_potentially_written_before) {}
|
||||
|
||||
private:
|
||||
void TranslateControlFlowInstruction(const ucode::ControlFlowInstruction& cf);
|
||||
|
|
|
@ -134,7 +134,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
|
|||
// (32-bit only - 16-bit indices are always fetched via the Vulkan index
|
||||
// buffer).
|
||||
kSysFlag_VertexIndexLoad = 1u << kSysFlag_VertexIndexLoad_Shift,
|
||||
// For HostVertexShaderTypes kMemexportCompute, kPointListAsTriangleStrip,
|
||||
// For HostVertexShaderTypes kMemExportCompute, kPointListAsTriangleStrip,
|
||||
// kRectangleListAsTriangleStrip, whether the vertex index needs to be
|
||||
// loaded from the index buffer (rather than using autogenerated indices),
|
||||
// and whether it's 32-bit. This is separate from kSysFlag_VertexIndexLoad
|
||||
|
@ -427,7 +427,9 @@ class SpirvShaderTranslator : public ShaderTranslator {
|
|||
const ParsedVertexFetchInstruction& instr) override;
|
||||
void ProcessTextureFetchInstruction(
|
||||
const ParsedTextureFetchInstruction& instr) override;
|
||||
void ProcessAluInstruction(const ParsedAluInstruction& instr) override;
|
||||
void ProcessAluInstruction(
|
||||
const ParsedAluInstruction& instr,
|
||||
uint8_t memexport_eM_potentially_written_before) override;
|
||||
|
||||
private:
|
||||
struct TextureBinding {
|
||||
|
@ -620,7 +622,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
|
|||
assert_true(edram_fragment_shader_interlock_);
|
||||
return !is_depth_only_fragment_shader_ &&
|
||||
!current_shader().writes_depth() &&
|
||||
!current_shader().is_valid_memexport_used();
|
||||
!current_shader().memexport_eM_written();
|
||||
}
|
||||
void FSI_LoadSampleMask(spv::Id msaa_samples);
|
||||
void FSI_LoadEdramOffsets(spv::Id msaa_samples);
|
||||
|
|
|
@ -67,7 +67,8 @@ void SpirvShaderTranslator::KillPixel(spv::Id condition) {
|
|||
}
|
||||
|
||||
void SpirvShaderTranslator::ProcessAluInstruction(
|
||||
const ParsedAluInstruction& instr) {
|
||||
const ParsedAluInstruction& instr,
|
||||
uint8_t memexport_eM_potentially_written_before) {
|
||||
if (instr.IsNop()) {
|
||||
// Don't even disassemble or update predication.
|
||||
return;
|
||||
|
|
|
@ -210,7 +210,7 @@ enum class AllocType : uint32_t {
|
|||
kVsInterpolators = 2,
|
||||
// Pixel shader exports colors.
|
||||
kPsColors = 2,
|
||||
// MEMEXPORT?
|
||||
// Memory export.
|
||||
kMemory = 3,
|
||||
};
|
||||
|
||||
|
@ -1782,6 +1782,9 @@ inline uint32_t GetAluVectorOpNeededSourceComponents(
|
|||
.operand_components_used[src_index - 1];
|
||||
}
|
||||
|
||||
// eM# (kExportData) register count.
|
||||
constexpr uint32_t kMaxMemExportElementCount = 5;
|
||||
|
||||
enum class ExportRegister : uint32_t {
|
||||
kVSInterpolator0 = 0,
|
||||
kVSInterpolator1,
|
||||
|
|
|
@ -2187,7 +2187,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
|
|||
return false;
|
||||
}
|
||||
pipeline_cache_->AnalyzeShaderUcode(*vertex_shader);
|
||||
bool memexport_used_vertex = vertex_shader->is_valid_memexport_used();
|
||||
bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0;
|
||||
|
||||
// Pixel shader analysis.
|
||||
bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs);
|
||||
|
|
|
@ -497,6 +497,18 @@ enum class TextureFormat : uint32_t {
|
|||
k_6_5_5 = 5,
|
||||
k_8_8_8_8 = 6,
|
||||
k_2_10_10_10 = 7,
|
||||
// Possibly similar to k_8, but may be storing alpha instead of red when
|
||||
// resolving/memexporting, though not exactly known. From the point of view of
|
||||
// sampling, it should be treated the same as k_8 (given that textures have
|
||||
// the last - and single-component textures have the only - component
|
||||
// replicated into all the remaining ones before the swizzle).
|
||||
// Used as:
|
||||
// - Texture in 4B4E083C - text, starting from the "Loading..." and the "This
|
||||
// game saves data automatically" messages. The swizzle in the fetch
|
||||
// constant is 111W (suggesting that internally the only component may be
|
||||
// the alpha one, not red).
|
||||
// TODO(Triang3l): Investigate how k_8_A and k_8_B work in resolves and
|
||||
// memexports, whether they store alpha/blue of the input or red.
|
||||
k_8_A = 8,
|
||||
k_8_B = 9,
|
||||
k_8_8 = 10,
|
||||
|
@ -510,6 +522,12 @@ enum class TextureFormat : uint32_t {
|
|||
// Used for videos in 54540829.
|
||||
k_Y1_Cr_Y0_Cb_REP = 12,
|
||||
k_16_16_EDRAM = 13,
|
||||
// Likely same as k_8_8_8_8.
|
||||
// Used as:
|
||||
// - Memexport destination in 4D5308BC - multiple small draws when looking
|
||||
// back at the door behind the player in the first room of gameplay.
|
||||
// - Memexport destination in 4D53085B and 4D530919 - in 4D53085B, in a frame
|
||||
// between the intro video and the main menu, in an 8192-point draw.
|
||||
k_8_8_8_8_A = 14,
|
||||
k_4_4_4_4 = 15,
|
||||
k_10_11_11 = 16,
|
||||
|
@ -1373,8 +1391,7 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
|
|||
// memexport on the Adreno 2xx using GL_OES_get_program_binary - it's also
|
||||
// interesting to see how alphatest interacts with it, whether it's still true
|
||||
// fixed-function alphatest, as it's claimed to be supported as usual by the
|
||||
// extension specification - it's likely, however, that memory exports are
|
||||
// discarded alongside other exports such as oC# and oDepth this way.
|
||||
// extension specification.
|
||||
//
|
||||
// Y of eA contains the offset in elements - this is what shaders are supposed
|
||||
// to calculate from something like the vertex index. Again, it's specified as
|
||||
|
@ -1397,6 +1414,69 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
|
|||
// elements using packing via addition to 2^23, so this field also doesn't need
|
||||
// more bits than that.
|
||||
//
|
||||
// According to the sequencer specification from IPR2015-00325 (where memexport
|
||||
// is called "pass thru export"):
|
||||
// - Pass thru exports can occur anywhere in the shader program.
|
||||
// - There can be any number of pass thru exports.
|
||||
// - The address register is not kept across clause boundaries, so it must be
|
||||
// refreshed after any Serialize (or yield), allocate instruction or resource
|
||||
// change.
|
||||
// - The write to eM# may be predicated if the export is not needed.
|
||||
// - Exports are dropped if:
|
||||
// - The index is above the maximum.
|
||||
// - The index sign bit is 1.
|
||||
// - The exponent of the index is not 23.
|
||||
// The requirement that eM4 must be written if any eM# other than eM0 is also
|
||||
// written doesn't apply to the final Xenos, it's likely an outdated note in the
|
||||
// specification considering that it's very preliminary.
|
||||
//
|
||||
// According to Microsoft's shader validator:
|
||||
// - eA can be written only by `mad`.
|
||||
// - A single eM# can be written by any number of instructions, including with
|
||||
// write masking.
|
||||
// - eA must be written before eM#.
|
||||
// - Any alloc instruction or a `serialize` terminates the current memory
|
||||
// export. This doesn't apply to `exec Yield=true`, however, and it's not
|
||||
// clear if that's an oversight or if that's not considered a yield that
|
||||
// terminates the export.
|
||||
//
|
||||
// From the emulation perspective, this means that:
|
||||
// - Alloc instructions (`alloc export` mandatorily, other allocs optionally),
|
||||
// and optionally `serialize` instructions within `exec`, should be treated as
|
||||
// the locations where the currently open export should be flushed to the
|
||||
// memory. It should be taken into account that an export may be in looping
|
||||
// control flow, and in this case it must be performed at every iteration.
|
||||
// - Whether each eM# was written to must be tracked at shader execution time,
|
||||
// as predication can disable the export of an element.
|
||||
//
|
||||
// TODO(Triang3l): Investigate how memory export interacts with pixel killing.
|
||||
// Given that eM# writes disabled by predication don't cause an export, it's
|
||||
// possible that killed invocations are treated as inactive (invalid in Xenos
|
||||
// terms) overall, and thus new memory exports from them shouldn't be done, but
|
||||
// that's not verified. However, given that on Direct3D 11+, OpenGL and Vulkan
|
||||
// hosts, discarding disables subsequent storage resource writes, on the host,
|
||||
// it would be natural to perform all outstanding memory exports before
|
||||
// discarding if the kill condition passes.
|
||||
//
|
||||
// Memory exports can be performed to any ColorFormat, including 8bpp and 16bpp
|
||||
// ones. Hosts, however, may have the memory bound as a 32bpp buffer (for
|
||||
// instance, due to the minimum resource view size limitation on Direct3D 11).
|
||||
// In this case, bytes and shorts aren't addressable directly. However, taking
|
||||
// into account that memory accesses are coherent within one shader invocation
|
||||
// on Direct3D 11+, OpenGL and Vulkan and thus are done in order relative to
|
||||
// each other, it should be possible to implement them by clearing the bits via
|
||||
// an atomic AND, and writing the new value using an atomic OR. This will, of
|
||||
// course, make the entire write operation non-atomic, and in case of a race
|
||||
// between writes to the same location, the final result may not even be just a
|
||||
// value from one of the invocations, but rather, it can be OR of the values
|
||||
// from any invocations involved. However, on the Xenos, there doesn't seem to
|
||||
// be any possibility of meaningfully accessing the same location from multiple
|
||||
// invocations if any of them is writing, memory exports are out-of-order, so
|
||||
// such an implementation shouldn't be causing issues in reality. Atomic
|
||||
// compare-exchange, however, should not be used for this purpose, as it may
|
||||
// result in an infinite loop if different invocations want to write different
|
||||
// values to the same memory location.
|
||||
//
|
||||
// Examples of setup in titles (Z from MSB to LSB):
|
||||
//
|
||||
// 4D5307E6 particles (different VS invocation counts, like 1, 2, 4):
|
||||
|
@ -1432,6 +1512,11 @@ static_assert_size(xe_gpu_fetch_group_t, sizeof(uint32_t) * 6);
|
|||
// c0: Z = 010010110000|0|010|11|011010|00011|001
|
||||
// 8in16, 16_16_16_16, uint, RGBA - from 16_16_16_16 uint vfetch
|
||||
// (16_16_16_16 is the largest color format without special values)
|
||||
//
|
||||
// 58410B86 hierarchical depth buffer occlusion culling with the result read on
|
||||
// the CPU (15000 VS invocations in the main menu):
|
||||
// c8: Z = 010010110000|0|010|00|000010|00000|000, count = invocation count
|
||||
// No endian swap, 8, uint, RGBA
|
||||
union alignas(uint32_t) xe_gpu_memexport_stream_t {
|
||||
struct {
|
||||
uint32_t dword_0;
|
||||
|
|
|
@ -119,6 +119,8 @@ dword_result_t XamContentCreateEnumerator_entry(
|
|||
}
|
||||
DECLARE_XAM_EXPORT1(XamContentCreateEnumerator, kContent, kImplemented);
|
||||
|
||||
enum class kDispositionState : uint32_t { Unknown = 0, Create = 1, Open = 2 };
|
||||
|
||||
dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
|
||||
lpvoid_t content_data_ptr,
|
||||
dword_t content_data_size, dword_t flags,
|
||||
|
@ -146,40 +148,37 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
|
|||
content_data, disposition_ptr, license_mask_ptr, overlapped_ptr](
|
||||
uint32_t& extended_error, uint32_t& length) -> X_RESULT {
|
||||
X_RESULT result = X_ERROR_INVALID_PARAMETER;
|
||||
bool create = false;
|
||||
bool open = false;
|
||||
kDispositionState disposition = kDispositionState::Unknown;
|
||||
switch (flags & 0xF) {
|
||||
case 1: // CREATE_NEW
|
||||
// Fail if exists.
|
||||
if (content_manager->ContentExists(content_data)) {
|
||||
result = X_ERROR_ALREADY_EXISTS;
|
||||
} else {
|
||||
create = true;
|
||||
disposition = kDispositionState::Create;
|
||||
}
|
||||
break;
|
||||
case 2: // CREATE_ALWAYS
|
||||
// Overwrite existing, if any.
|
||||
if (content_manager->ContentExists(content_data)) {
|
||||
content_manager->DeleteContent(content_data);
|
||||
create = true;
|
||||
} else {
|
||||
create = true;
|
||||
}
|
||||
disposition = kDispositionState::Create;
|
||||
break;
|
||||
case 3: // OPEN_EXISTING
|
||||
// Open only if exists.
|
||||
if (!content_manager->ContentExists(content_data)) {
|
||||
result = X_ERROR_PATH_NOT_FOUND;
|
||||
} else {
|
||||
open = true;
|
||||
disposition = kDispositionState::Open;
|
||||
}
|
||||
break;
|
||||
case 4: // OPEN_ALWAYS
|
||||
// Create if needed.
|
||||
if (!content_manager->ContentExists(content_data)) {
|
||||
create = true;
|
||||
disposition = kDispositionState::Create;
|
||||
} else {
|
||||
open = true;
|
||||
disposition = kDispositionState::Open;
|
||||
}
|
||||
break;
|
||||
case 5: // TRUNCATE_EXISTING
|
||||
|
@ -188,7 +187,7 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
|
|||
result = X_ERROR_PATH_NOT_FOUND;
|
||||
} else {
|
||||
content_manager->DeleteContent(content_data);
|
||||
create = true;
|
||||
disposition = kDispositionState::Create;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
|
@ -196,21 +195,12 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
|
|||
break;
|
||||
}
|
||||
|
||||
// creation result
|
||||
// 0 = ?
|
||||
// 1 = created
|
||||
// 2 = opened
|
||||
uint32_t disposition = create ? 1 : 2;
|
||||
if (disposition_ptr) {
|
||||
*disposition_ptr = disposition;
|
||||
}
|
||||
|
||||
if (create) {
|
||||
if (disposition == kDispositionState::Create) {
|
||||
result = content_manager->CreateContent(root_name, content_data);
|
||||
if (XSUCCEEDED(result)) {
|
||||
content_manager->WriteContentHeaderFile(&content_data);
|
||||
}
|
||||
} else if (open) {
|
||||
} else if (disposition == kDispositionState::Open) {
|
||||
result = content_manager->OpenContent(root_name, content_data);
|
||||
}
|
||||
|
||||
|
@ -224,12 +214,11 @@ dword_result_t xeXamContentCreate(dword_t user_index, lpstring_t root_name,
|
|||
}
|
||||
|
||||
extended_error = X_HRESULT_FROM_WIN32(result);
|
||||
length = disposition;
|
||||
length = static_cast<uint32_t>(disposition);
|
||||
|
||||
if (result && overlapped_ptr) {
|
||||
result = X_ERROR_FUNCTION_FAILED;
|
||||
}
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
|
@ -451,7 +440,6 @@ static_assert_size(X_SWAPDISC_ERROR_MESSAGE, 12);
|
|||
dword_result_t XamSwapDisc_entry(
|
||||
dword_t disc_number, pointer_t<X_KEVENT> completion_handle,
|
||||
pointer_t<X_SWAPDISC_ERROR_MESSAGE> error_message) {
|
||||
|
||||
xex2_opt_execution_info* info = nullptr;
|
||||
kernel_state()->GetExecutableModule()->GetOptHeader(XEX_HEADER_EXECUTION_INFO,
|
||||
&info);
|
||||
|
|
|
@ -254,202 +254,15 @@ dword_result_t XGetLanguage_entry() {
|
|||
}
|
||||
DECLARE_XAM_EXPORT1(XGetLanguage, kNone, kImplemented);
|
||||
|
||||
// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/
|
||||
// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/
|
||||
dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) {
|
||||
LARGE_INTEGER delay{};
|
||||
|
||||
// Convert the delay time to 100-nanosecond intervals
|
||||
delay.QuadPart = dwMilliseconds == -1
|
||||
? LLONG_MAX
|
||||
: static_cast<LONGLONG>(-10000) * dwMilliseconds;
|
||||
|
||||
X_STATUS result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
|
||||
(uint64_t*)&delay);
|
||||
|
||||
// If the delay was interrupted by an APC, keep delaying the thread
|
||||
while (bAlertable && result == X_STATUS_ALERTED) {
|
||||
result = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
|
||||
(uint64_t*)&delay);
|
||||
}
|
||||
|
||||
return result == X_STATUS_SUCCESS ? X_STATUS_SUCCESS : X_STATUS_USER_APC;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented);
|
||||
|
||||
dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) {
|
||||
return RtlSleep_entry(dwMilliseconds, bAlertable);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented);
|
||||
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep
|
||||
void Sleep_entry(dword_t dwMilliseconds) {
|
||||
RtlSleep_entry(dwMilliseconds, FALSE);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented);
|
||||
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount
|
||||
dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); }
|
||||
DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented);
|
||||
|
||||
dword_result_t XamGetCurrentTitleId_entry() {
|
||||
return kernel_state()->emulator()->title_id();
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XamGetCurrentTitleId, kNone, kImplemented);
|
||||
|
||||
dword_result_t RtlSetLastNTError_entry(dword_t error_code) {
|
||||
const uint32_t result =
|
||||
xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code);
|
||||
XThread::SetLastError(result);
|
||||
|
||||
return result;
|
||||
dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) {
|
||||
return ctx->kernel_state->title_id() == 0xFFFE07D1;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented);
|
||||
|
||||
dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); }
|
||||
DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented);
|
||||
|
||||
dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); }
|
||||
DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented);
|
||||
|
||||
dword_result_t GetModuleHandleA_entry(lpstring_t module_name) {
|
||||
xe::be<uint32_t> module_ptr = 0;
|
||||
const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle(
|
||||
module_name.value(), &module_ptr);
|
||||
|
||||
if (XFAILED(error_code)) {
|
||||
RtlSetLastNTError_entry(error_code);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return (uint32_t)module_ptr;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented);
|
||||
|
||||
dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes,
|
||||
dword_t dwStackSize,
|
||||
lpvoid_t lpStartAddress,
|
||||
lpvoid_t lpParameter,
|
||||
dword_t dwCreationFlags, dword_t unkn,
|
||||
lpdword_t lpThreadId) {
|
||||
uint32_t flags = (dwCreationFlags >> 2) & 1;
|
||||
|
||||
if (unkn != -1) {
|
||||
flags |= 1 << unkn << 24;
|
||||
}
|
||||
|
||||
xe::be<uint32_t> result = 0;
|
||||
|
||||
const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread(
|
||||
&result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags);
|
||||
|
||||
if (XFAILED(error_code)) {
|
||||
RtlSetLastNTError_entry(error_code);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return (uint32_t)result;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented);
|
||||
|
||||
dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes,
|
||||
dword_t dwStackSize, lpvoid_t lpStartAddress,
|
||||
lpvoid_t lpParameter, dword_t dwCreationFlags,
|
||||
lpdword_t lpThreadId) {
|
||||
return XapipCreateThread_entry(lpThreadAttributes, dwStackSize,
|
||||
lpStartAddress, lpParameter, dwCreationFlags,
|
||||
-1, lpThreadId);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented);
|
||||
|
||||
dword_result_t CloseHandle_entry(dword_t hObject) {
|
||||
const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject);
|
||||
|
||||
if (XFAILED(error_code)) {
|
||||
RtlSetLastNTError_entry(error_code);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented);
|
||||
|
||||
dword_result_t ResumeThread_entry(dword_t hThread) {
|
||||
uint32_t suspend_count;
|
||||
const X_STATUS error_code =
|
||||
xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count);
|
||||
|
||||
if (XFAILED(error_code)) {
|
||||
RtlSetLastNTError_entry(error_code);
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
return suspend_count;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented);
|
||||
|
||||
void ExitThread_entry(dword_t exit_code) {
|
||||
xe::kernel::xboxkrnl::ExTerminateThread(exit_code);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented);
|
||||
|
||||
dword_result_t GetCurrentThreadId_entry() {
|
||||
return XThread::GetCurrentThread()->GetCurrentThreadId();
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented);
|
||||
|
||||
qword_result_t XapiFormatTimeOut_entry(lpqword_t result,
|
||||
dword_t dwMilliseconds) {
|
||||
LARGE_INTEGER delay{};
|
||||
|
||||
// Convert the delay time to 100-nanosecond intervals
|
||||
delay.QuadPart =
|
||||
dwMilliseconds == -1 ? 0 : static_cast<LONGLONG>(-10000) * dwMilliseconds;
|
||||
|
||||
return (uint64_t)&delay;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented);
|
||||
|
||||
dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle,
|
||||
dword_t dwMilliseconds,
|
||||
dword_t bAlertable) {
|
||||
uint64_t* timeout = nullptr;
|
||||
uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds);
|
||||
|
||||
X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
|
||||
hHandle, 1, bAlertable, &timeout_ptr);
|
||||
|
||||
while (bAlertable && result == X_STATUS_ALERTED) {
|
||||
result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
|
||||
hHandle, 1, bAlertable, &timeout_ptr);
|
||||
}
|
||||
|
||||
RtlSetLastNTError_entry(result);
|
||||
result = -1;
|
||||
|
||||
return result;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented);
|
||||
|
||||
dword_result_t WaitForSingleObject_entry(dword_t hHandle,
|
||||
dword_t dwMilliseconds) {
|
||||
return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented);
|
||||
|
||||
dword_result_t lstrlenW_entry(lpu16string_t string) {
|
||||
// wcslen?
|
||||
if (string) {
|
||||
return (uint32_t)string.value().length();
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented);
|
||||
DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented);
|
||||
|
||||
dword_result_t XamGetExecutionId_entry(lpdword_t info_ptr) {
|
||||
auto module = kernel_state()->GetExecutableModule();
|
||||
|
@ -611,16 +424,204 @@ dword_result_t XamQueryLiveHiveW_entry(lpu16string_t name, lpvoid_t out_buf,
|
|||
}
|
||||
DECLARE_XAM_EXPORT1(XamQueryLiveHiveW, kNone, kStub);
|
||||
|
||||
dword_result_t XamIsCurrentTitleDash_entry(const ppc_context_t& ctx) {
|
||||
return ctx->kernel_state->title_id() == 0xFFFE07D1;
|
||||
// http://www.noxa.org/blog/2011/02/28/building-an-xbox-360-emulator-part-3-feasibilityos/
|
||||
// http://www.noxa.org/blog/2011/08/13/building-an-xbox-360-emulator-part-5-xex-files/
|
||||
// Delays the calling thread for dwMilliseconds. A value of -1 selects the
// largest representable interval instead of a converted one. Returns
// X_STATUS_SUCCESS when the delay finished with that status and
// X_STATUS_USER_APC for every other outcome.
dword_result_t RtlSleep_entry(dword_t dwMilliseconds, dword_t bAlertable) {
  // Convert the delay time to 100-nanosecond intervals.
  LARGE_INTEGER interval{};
  if (dwMilliseconds == -1) {
    interval.QuadPart = LLONG_MAX;
  } else {
    interval.QuadPart = static_cast<LONGLONG>(-10000) * dwMilliseconds;
  }

  // Issue the delay once, then keep re-issuing it for as long as an alertable
  // delay reports that it was alerted.
  X_STATUS status;
  do {
    status = xboxkrnl::KeDelayExecutionThread(MODE::UserMode, bAlertable,
                                              (uint64_t*)&interval);
  } while (bAlertable && status == X_STATUS_ALERTED);

  if (status == X_STATUS_SUCCESS) {
    return X_STATUS_SUCCESS;
  }
  return X_STATUS_USER_APC;
}
|
||||
DECLARE_XAM_EXPORT1(XamIsCurrentTitleDash, kNone, kImplemented);
|
||||
DECLARE_XAM_EXPORT1(RtlSleep, kNone, kImplemented);
|
||||
|
||||
dword_result_t SleepEx_entry(dword_t dwMilliseconds, dword_t bAlertable) {
|
||||
return RtlSleep_entry(dwMilliseconds, bAlertable);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(SleepEx, kNone, kImplemented);
|
||||
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-sleep
|
||||
void Sleep_entry(dword_t dwMilliseconds) {
|
||||
RtlSleep_entry(dwMilliseconds, FALSE);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(Sleep, kNone, kImplemented);
|
||||
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-gettickcount
|
||||
dword_result_t GetTickCount_entry() { return Clock::QueryGuestUptimeMillis(); }
|
||||
DECLARE_XAM_EXPORT1(GetTickCount, kNone, kImplemented);
|
||||
|
||||
dword_result_t RtlSetLastNTError_entry(dword_t error_code) {
|
||||
const uint32_t result =
|
||||
xe::kernel::xboxkrnl::xeRtlNtStatusToDosError(error_code);
|
||||
XThread::SetLastError(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(RtlSetLastNTError, kNone, kImplemented);
|
||||
|
||||
dword_result_t RtlGetLastError_entry() { return XThread::GetLastError(); }
|
||||
DECLARE_XAM_EXPORT1(RtlGetLastError, kNone, kImplemented);
|
||||
|
||||
dword_result_t GetLastError_entry() { return RtlGetLastError_entry(); }
|
||||
DECLARE_XAM_EXPORT1(GetLastError, kNone, kImplemented);
|
||||
|
||||
dword_result_t GetModuleHandleA_entry(lpstring_t module_name) {
|
||||
xe::be<uint32_t> module_ptr = 0;
|
||||
const X_STATUS error_code = xe::kernel::xboxkrnl::XexGetModuleHandle(
|
||||
module_name.value(), &module_ptr);
|
||||
|
||||
if (XFAILED(error_code)) {
|
||||
RtlSetLastNTError_entry(error_code);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return (uint32_t)module_ptr;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(GetModuleHandleA, kNone, kImplemented);
|
||||
|
||||
dword_result_t XapipCreateThread_entry(lpdword_t lpThreadAttributes,
|
||||
dword_t dwStackSize,
|
||||
lpvoid_t lpStartAddress,
|
||||
lpvoid_t lpParameter,
|
||||
dword_t dwCreationFlags, dword_t unkn,
|
||||
lpdword_t lpThreadId) {
|
||||
uint32_t flags = (dwCreationFlags >> 2) & 1;
|
||||
|
||||
if (unkn != -1) {
|
||||
flags |= 1 << unkn << 24;
|
||||
}
|
||||
|
||||
xe::be<uint32_t> result = 0;
|
||||
|
||||
const X_STATUS error_code = xe::kernel::xboxkrnl::ExCreateThread(
|
||||
&result, dwStackSize, lpThreadId, lpStartAddress, lpParameter, 0, flags);
|
||||
|
||||
if (XFAILED(error_code)) {
|
||||
RtlSetLastNTError_entry(error_code);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return (uint32_t)result;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XapipCreateThread, kNone, kImplemented);
|
||||
|
||||
dword_result_t CreateThread_entry(lpdword_t lpThreadAttributes,
|
||||
dword_t dwStackSize, lpvoid_t lpStartAddress,
|
||||
lpvoid_t lpParameter, dword_t dwCreationFlags,
|
||||
lpdword_t lpThreadId) {
|
||||
return XapipCreateThread_entry(lpThreadAttributes, dwStackSize,
|
||||
lpStartAddress, lpParameter, dwCreationFlags,
|
||||
-1, lpThreadId);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(CreateThread, kNone, kImplemented);
|
||||
|
||||
dword_result_t CloseHandle_entry(dword_t hObject) {
|
||||
const X_STATUS error_code = xe::kernel::xboxkrnl::NtClose(hObject);
|
||||
|
||||
if (XFAILED(error_code)) {
|
||||
RtlSetLastNTError_entry(error_code);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(CloseHandle, kNone, kImplemented);
|
||||
|
||||
dword_result_t ResumeThread_entry(dword_t hThread) {
|
||||
uint32_t suspend_count;
|
||||
const X_STATUS error_code =
|
||||
xe::kernel::xboxkrnl::NtResumeThread(hThread, &suspend_count);
|
||||
|
||||
if (XFAILED(error_code)) {
|
||||
RtlSetLastNTError_entry(error_code);
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
return suspend_count;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(ResumeThread, kNone, kImplemented);
|
||||
|
||||
void ExitThread_entry(dword_t exit_code) {
|
||||
xe::kernel::xboxkrnl::ExTerminateThread(exit_code);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(ExitThread, kNone, kImplemented);
|
||||
|
||||
dword_result_t GetCurrentThreadId_entry() {
|
||||
return XThread::GetCurrentThread()->GetCurrentThreadId();
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(GetCurrentThreadId, kNone, kImplemented);
|
||||
|
||||
qword_result_t XapiFormatTimeOut_entry(lpqword_t result,
|
||||
dword_t dwMilliseconds) {
|
||||
LARGE_INTEGER delay{};
|
||||
|
||||
// Convert the delay time to 100-nanosecond intervals
|
||||
delay.QuadPart =
|
||||
dwMilliseconds == -1 ? 0 : static_cast<LONGLONG>(-10000) * dwMilliseconds;
|
||||
|
||||
return (uint64_t)&delay;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(XapiFormatTimeOut, kNone, kImplemented);
|
||||
|
||||
dword_result_t WaitForSingleObjectEx_entry(dword_t hHandle,
|
||||
dword_t dwMilliseconds,
|
||||
dword_t bAlertable) {
|
||||
uint64_t* timeout = nullptr;
|
||||
uint64_t timeout_ptr = XapiFormatTimeOut_entry(timeout, dwMilliseconds);
|
||||
|
||||
X_STATUS result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
|
||||
hHandle, 1, bAlertable, &timeout_ptr);
|
||||
|
||||
while (bAlertable && result == X_STATUS_ALERTED) {
|
||||
result = xe::kernel::xboxkrnl::NtWaitForSingleObjectEx(
|
||||
hHandle, 1, bAlertable, &timeout_ptr);
|
||||
}
|
||||
|
||||
RtlSetLastNTError_entry(result);
|
||||
result = -1;
|
||||
|
||||
return result;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(WaitForSingleObjectEx, kNone, kImplemented);
|
||||
|
||||
dword_result_t WaitForSingleObject_entry(dword_t hHandle,
|
||||
dword_t dwMilliseconds) {
|
||||
return WaitForSingleObjectEx_entry(hHandle, dwMilliseconds, 0);
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(WaitForSingleObject, kNone, kImplemented);
|
||||
|
||||
dword_result_t lstrlenW_entry(lpu16string_t string) {
|
||||
// wcslen?
|
||||
if (string) {
|
||||
return (uint32_t)string.value().length();
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
DECLARE_XAM_EXPORT1(lstrlenW, kNone, kImplemented);
|
||||
|
||||
dword_result_t XGetAudioFlags_entry() { return 65537; }
|
||||
DECLARE_XAM_EXPORT1(XGetAudioFlags, kNone, kStub);
|
||||
|
||||
/*
|
||||
todo: this table should instead be pointed to by a member of kernel state and initialized along with the process
|
||||
todo: this table should instead be pointed to by a member of kernel
|
||||
state and initialized along with the process
|
||||
*/
|
||||
static int32_t XamRtlRandomTable[128] = {
|
||||
1284227242, 1275210071, 573735546, 790525478, 2139871995, 1547161642,
|
||||
|
|
Loading…
Reference in New Issue