Merge branch 'master' into vulkan

This commit is contained in:
Triang3l 2020-12-13 18:41:07 +03:00
parent c14e3770a2
commit 4617dc5569
115 changed files with 4290 additions and 3872 deletions

View File

@ -29,6 +29,7 @@ init:
- git config --global core.autocrlf input
install:
- cmd: vcpkg integrate remove
- cmd: xb setup
platform: Windows

3
.gitmodules vendored
View File

@ -64,6 +64,9 @@
[submodule "third_party/date"]
path = third_party/date
url = https://github.com/HowardHinnant/date.git
[submodule "third_party/xxhash"]
path = third_party/xxhash
url = https://github.com/Cyan4973/xxHash.git
[submodule "third_party/glslang"]
path = third_party/glslang
url = https://github.com/KhronosGroup/glslang.git

View File

@ -65,6 +65,14 @@ DEFINE_path(
"Root path for guest content storage (saves, etc.), or empty to use the "
"content folder under the storage root.",
"Storage");
DEFINE_path(
cache_root, "",
"Root path for files used to speed up certain parts of the emulator or the "
"game. These files may be persistent, but they can be deleted without "
"major side effects such as progress loss. If empty, the cache folder "
"under the storage root, or, if available, the cache directory preferred "
"for the OS, will be used.",
"Storage");
DEFINE_bool(mount_scratch, false, "Enable scratch mount", "Storage");
DEFINE_bool(mount_cache, false, "Enable cache mount", "Storage");
@ -189,10 +197,12 @@ std::vector<std::unique_ptr<hid::InputDriver>> CreateInputDrivers(
Factory<hid::InputDriver, ui::Window*> factory;
#if XE_PLATFORM_WIN32
factory.Add("xinput", xe::hid::xinput::Create);
#endif // XE_PLATFORM_WIN32
factory.Add("sdl", xe::hid::sdl::Create);
#if XE_PLATFORM_WIN32
// WinKey input driver should always be the last input driver added!
factory.Add("winkey", xe::hid::winkey::Create);
#endif // XE_PLATFORM_WIN32
factory.Add("sdl", xe::hid::sdl::Create);
for (auto& driver : factory.CreateAll(cvars::hid, window)) {
if (XSUCCEEDED(driver->Setup())) {
drivers.emplace_back(std::move(driver));
@ -220,6 +230,8 @@ int xenia_main(const std::vector<std::string>& args) {
#if defined(XE_PLATFORM_WIN32) || defined(XE_PLATFORM_GNU_LINUX)
storage_root = storage_root / "Xenia";
#else
// TODO(Triang3l): Point to the app's external storage "files" directory
// on Android.
#warning Unhandled platform for the data root.
storage_root = storage_root / "Xenia";
#endif
@ -243,13 +255,29 @@ int xenia_main(const std::vector<std::string>& args) {
content_root = std::filesystem::absolute(content_root);
XELOGI("Content root: {}", xe::path_to_utf8(content_root));
std::filesystem::path cache_root = cvars::cache_root;
if (cache_root.empty()) {
cache_root = storage_root / "cache";
// TODO(Triang3l): Point to the app's external storage "cache" directory on
// Android.
} else {
// If cache root isn't an absolute path, then it should be relative to the
// storage root.
if (!cache_root.is_absolute()) {
cache_root = storage_root / cache_root;
}
}
cache_root = std::filesystem::absolute(cache_root);
XELOGI("Cache root: {}", xe::path_to_utf8(cache_root));
if (cvars::discord) {
discord::DiscordPresence::Initialize();
discord::DiscordPresence::NotPlaying();
}
// Create the emulator but don't initialize so we can setup the window.
auto emulator = std::make_unique<Emulator>("", storage_root, content_root);
auto emulator =
std::make_unique<Emulator>("", storage_root, content_root, cache_root);
// Main emulator display window.
auto emulator_window = EmulatorWindow::Create(emulator.get());

View File

@ -17,7 +17,7 @@ namespace hash {
// For use in unordered_sets and unordered_maps (primarily multisets and
// multimaps, with manual collision resolution), where the hash is calculated
// externally (for instance, as XXH64), possibly requiring context data rather
// externally (for instance, as XXH3), possibly requiring context data rather
// than a pure function to calculate the hash
template <typename Key>
struct IdentityHasher {

21
src/xenia/base/xxhash.h Normal file
View File

@ -0,0 +1,21 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_BASE_XXHASH_H_
#define XENIA_BASE_XXHASH_H_
// Wrapper header for the third-party xxHash library: it sets the xxHash
// configuration macros and then includes the library header.
// XXH_INLINE_ALL is an xxHash build option and is defined before the include
// below so that it is visible to xxhash.h.
// NOTE(review): presumably this makes the xxHash implementation inline in each
// including translation unit - confirm against the xxHash documentation.
#define XXH_INLINE_ALL
// Can't use XXH_X86DISPATCH because XXH is calculated on multiple threads,
// while the dispatch writes the result (multiple pointers without any
// synchronization) to XXH_g_dispatch at the first call.
#include "third_party/xxhash/xxhash.h"
#endif // XENIA_BASE_XXHASH_H_

View File

@ -746,6 +746,8 @@ static const vec128_t xmm_consts[] = {
/* XMMIntMaxPD */ vec128d(INT_MAX),
/* XMMPosIntMinPS */ vec128f((float)0x80000000u),
/* XMMQNaN */ vec128i(0x7FC00000u),
/* XMMInt127 */ vec128i(0x7Fu),
/* XMM2To32 */ vec128f(0x1.0p32f),
};
// First location to try and place constants.

View File

@ -114,6 +114,8 @@ enum XmmConst {
XMMIntMaxPD,
XMMPosIntMinPS,
XMMQNaN,
XMMInt127,
XMM2To32,
};
// Unfortunately due to the design of xbyak we have to pass this to the ctor.

View File

@ -33,19 +33,41 @@ struct VECTOR_CONVERT_I2F
static void Emit(X64Emitter& e, const EmitArgType& i) {
// flags = ARITHMETIC_UNSIGNED
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// xmm0 = mask of positive values
e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF));
// Round manually to (1.stored mantissa bits * 2^31) or to 2^32 to the
// nearest even (the only rounding mode used on AltiVec) if the number is
// 0x80000000 or greater, instead of converting src & 0x7FFFFFFF and then
// adding 2147483648.0f, which results in double rounding that can give a
// result larger than needed - see OPCODE_VECTOR_CONVERT_I2F notes.
// scale any values >= (unsigned)INT_MIN back to [0, INT_MAX]
e.vpsubd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskI32));
e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0);
// [0x80000000, 0xFFFFFFFF] case:
// xmm1 = [0, INT_MAX]
e.vcvtdq2ps(i.dest, e.xmm1);
// Round to the nearest even, from (0x80000000 | 31 stored mantissa bits)
// to ((-1 << 23) | 23 stored mantissa bits), or to 0 if the result should
// be 4294967296.0f.
// xmm0 = src + 0b01111111 + ((src >> 8) & 1)
// (xmm1 also used to launch reg + mem early and to require it late)
e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMInt127));
e.vpslld(e.xmm0, i.src1, 31 - 8);
e.vpsrld(e.xmm0, e.xmm0, 31);
e.vpaddd(e.xmm0, e.xmm0, e.xmm1);
// xmm0 = (0xFF800000 | 23 explicit mantissa bits), or 0 if overflowed
e.vpsrad(e.xmm0, e.xmm0, 8);
// Calculate the result for the [0x80000000, 0xFFFFFFFF] case - take the
// rounded mantissa, and add -1 or 0 to the exponent of 32, depending on
// whether the number should be (1.stored mantissa bits * 2^31) or 2^32.
// xmm0 = [0x80000000, 0xFFFFFFFF] case result
e.vpaddd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMM2To32));
// scale values back above [INT_MIN, UINT_MAX]
e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS));
e.vaddps(i.dest, i.dest, e.xmm0);
// [0x00000000, 0x7FFFFFFF] case
// (during vblendvps reg -> vpaddd reg -> vpaddd mem dependency):
// Convert from signed integer to float.
// xmm1 = [0x00000000, 0x7FFFFFFF] case result
e.vcvtdq2ps(e.xmm1, i.src1);
// Merge the two ways depending on whether the number is >= 0x80000000
// (has high bit set).
e.vblendvps(i.dest, e.xmm1, e.xmm0, i.src1);
} else {
e.vcvtdq2ps(i.dest, i.src1);
}

View File

@ -143,6 +143,55 @@ enum Opcode {
OPCODE_TRUNCATE,
OPCODE_CONVERT,
OPCODE_ROUND,
// Note that 2147483648.0 + (src & 0x7FFFFFFF) is not a correct way of
// performing the uint -> float conversion for large numbers on backends where
// only sint -> float is available.
//
// Take 0b11000000000000000000000101000001 as an example,
// or 1.1000000000000000000000101000001 * 2^31.
// This one has 31 mantissa bits (excluding the implicit 1.), and needs to be
// rounded to 23 bits - 8 mantissa bits need to be dropped:
// 10000000000000000000001_01000001
//
// Rounding to the nearest even (the only rounding mode that exists on
// AltiVec, and the likely rounding mode in the implementations) should be
// done downwards - 01000001 of 1_01000001 is in [00000000, 01111111].
// The correct mantissa in this case is:
// 1.10000000000000000000001 * 2^31.
//
// With a two-step conversion, rounding is done twice instead, which gives an
// incorrect result.
//
// First, converting the low 31 bits to float:
// The number is 0.1000000000000000000000101000001 * 2^31.
// Normalizing it, we get 1.000000000000000000000101000001 (30 significand
// bits).
// We need to round 30 bits to 23 - 7 bits need to be dropped:
// 00000000000000000000010_1000001
//
// Rounding to the nearest even is done upwards in this case - 1000001 of
// 0_1000001 is in [1000001, 1111111].
// The result of the sint -> float conversion is:
// 1.00000000000000000000011 * 2^30.
//
// Now 2147483648.0 (1 * 2^31) needs to be added. Aligning the exponents, we
// get:
// 0.|10000000000000000000001|1 * 2^31
// + 1.|00000000000000000000000| * 2^31
// = 1.|10000000000000000000001|1 * 2^31
//
// At "infinite precision", the result has 24 significand bits, but only 23
// can be stored, thus rounding to the nearest even needs to be done. 1_1 is
// (odd + 0.5). 0.5 is ambiguous, thus tie-breaking to the nearest even -
// which is above in this case - is done. The result is:
// 1.10000000000000000000010 * 2^31.
//
// This is incorrect - larger than the correctly rounded result, which is:
// 1.10000000000000000000001 * 2^31.
//
// Test cases checked on real hardware via vcfux: 0xFFFDFF7E, 0xFFFCFF7D -
// should be 0x4F7FFDFF and 0x4F7FFCFF respectively, not 0x4F7FFE00 and
// 0x4F7FFD00.
OPCODE_VECTOR_CONVERT_I2F,
OPCODE_VECTOR_CONVERT_F2I,
OPCODE_LOAD_VECTOR_SHL,

View File

@ -519,9 +519,11 @@ int InstrEmit_vavguw(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_vcfsx_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb,
uint32_t uimm) {
// (VD) <- float(VB as signed) / 2^uimm
float fuimm = static_cast<float>(std::exp2(uimm));
Value* v = f.Div(f.VectorConvertI2F(f.LoadVR(vb)),
f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE));
Value* v = f.VectorConvertI2F(f.LoadVR(vb));
if (uimm) {
float fuimm = std::ldexp(1.0f, -int(uimm));
v = f.Mul(v, f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE));
}
f.StoreVR(vd, v);
return 0;
}
@ -535,9 +537,11 @@ int InstrEmit_vcsxwfp128(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_vcfux_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb,
uint32_t uimm) {
// (VD) <- float(VB as unsigned) / 2^uimm
float fuimm = static_cast<float>(std::exp2(uimm));
Value* v = f.Div(f.VectorConvertI2F(f.LoadVR(vb), ARITHMETIC_UNSIGNED),
f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE));
Value* v = f.VectorConvertI2F(f.LoadVR(vb), ARITHMETIC_UNSIGNED);
if (uimm) {
float fuimm = std::ldexp(1.0f, -int(uimm));
v = f.Mul(v, f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE));
}
f.StoreVR(vd, v);
return 0;
}

View File

@ -1,21 +1,21 @@
# frsqrte tests disabled because accuracy is CPU dependent.
#test_frsqrte_1:
#_ REGISTER_IN f1 1.0
test_frsqrte_1:
# _ REGISTER_IN f1 1.0
# frsqrte f1, f1
# blr
#_ REGISTER_OUT f1 0.99975585937500000
blr
# _ REGISTER_OUT f1 0.99975585937500000
# want: 0.97
#test_frsqrte_2:
#_ REGISTER_IN f1 64.0
test_frsqrte_2:
# _ REGISTER_IN f1 64.0
# frsqrte f1, f1
# blr
#_ REGISTER_OUT f1 0.12496948242187500
blr
# _ REGISTER_OUT f1 0.12496948242187500
#test_frsqrte_3:
#_ REGISTER_IN f1 0.5
test_frsqrte_3:
# _ REGISTER_IN f1 0.5
# frsqrte f1, f1
# blr
#_ REGISTER_OUT f1 1.41381835937500000
blr
# _ REGISTER_OUT f1 1.41381835937500000
# want: 1.375

View File

@ -7,6 +7,7 @@
******************************************************************************
*/
#include "xenia/base/cvar.h"
#include "xenia/base/filesystem.h"
#include "xenia/base/logging.h"
#include "xenia/base/main.h"
@ -28,7 +29,7 @@ DEFINE_path(test_path, "src/xenia/cpu/ppc/testing/",
"Directory scanned for test files.", "Other");
DEFINE_path(test_bin_path, "src/xenia/cpu/ppc/testing/bin/",
"Directory with binary outputs of the test files.", "Other");
DEFINE_transient_string(test_name, "", "Specifies test name.", "General");
DEFINE_transient_string(test_name, "", "Test suite name.", "General");
namespace xe {
namespace cpu {
@ -475,13 +476,7 @@ bool RunTests(const std::string_view test_name) {
}
int main(const std::vector<std::string>& args) {
// Grab test name, if present.
std::string test_name;
if (args.size() >= 2) {
test_name = args[1];
}
return RunTests(test_name) ? 0 : 1;
return RunTests(cvars::test_name) ? 0 : 1;
}
} // namespace test

View File

@ -358,7 +358,6 @@ bool Processor::ExecuteRaw(ThreadState* thread_state, uint32_t address) {
return false;
}
auto context = thread_state->context();
return function->Call(thread_state, 0xBCBCBCBC);
}

View File

@ -59,13 +59,15 @@ namespace xe {
Emulator::Emulator(const std::filesystem::path& command_line,
const std::filesystem::path& storage_root,
const std::filesystem::path& content_root)
const std::filesystem::path& content_root,
const std::filesystem::path& cache_root)
: on_launch(),
on_terminate(),
on_exit(),
command_line_(command_line),
storage_root_(storage_root),
content_root_(content_root),
cache_root_(cache_root),
game_title_(),
display_window_(nullptr),
memory_(),
@ -689,7 +691,7 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
// playing before the video can be seen if doing this in parallel with the
// main thread.
on_shader_storage_initialization(true);
graphics_system_->InitializeShaderStorage(storage_root_, title_id_, true);
graphics_system_->InitializeShaderStorage(cache_root_, title_id_, true);
on_shader_storage_initialization(false);
auto main_thread = kernel_state_->LaunchModule(module);

View File

@ -49,7 +49,8 @@ class Emulator {
public:
explicit Emulator(const std::filesystem::path& command_line,
const std::filesystem::path& storage_root,
const std::filesystem::path& content_root);
const std::filesystem::path& content_root,
const std::filesystem::path& cache_root);
~Emulator();
// Full command line used when launching the process.
@ -61,6 +62,9 @@ class Emulator {
// Folder guest content is stored in.
const std::filesystem::path& content_root() const { return content_root_; }
// Folder in which files that are safe to remove without significant side
// effects are stored.
const std::filesystem::path& cache_root() const { return cache_root_; }
// Title of the game in the default language.
const std::string& game_title() const { return game_title_; }
@ -166,6 +170,7 @@ class Emulator {
std::filesystem::path command_line_;
std::filesystem::path storage_root_;
std::filesystem::path content_root_;
std::filesystem::path cache_root_;
std::string game_title_;

View File

@ -89,8 +89,8 @@ void CommandProcessor::Shutdown() {
}
void CommandProcessor::InitializeShaderStorage(
const std::filesystem::path& storage_root, uint32_t title_id,
bool blocking) {}
const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) {
}
void CommandProcessor::RequestFrameTrace(
const std::filesystem::path& root_path) {

View File

@ -133,9 +133,8 @@ class CommandProcessor {
// May be called not only from the command processor thread when the command
// processor is paused, and the termination of this function may be explicitly
// awaited.
virtual void InitializeShaderStorage(
const std::filesystem::path& storage_root, uint32_t title_id,
bool blocking);
virtual void InitializeShaderStorage(const std::filesystem::path& cache_root,
uint32_t title_id, bool blocking);
virtual void RequestFrameTrace(const std::filesystem::path& root_path);
virtual void BeginTracing(const std::filesystem::path& root_path);

View File

@ -7,8 +7,6 @@
******************************************************************************
*/
#include "third_party/xxhash/xxhash.h"
#include <algorithm>
#include <cstring>
#include <utility>
@ -73,10 +71,9 @@ void D3D12CommandProcessor::ClearCaches() {
}
void D3D12CommandProcessor::InitializeShaderStorage(
const std::filesystem::path& storage_root, uint32_t title_id,
bool blocking) {
CommandProcessor::InitializeShaderStorage(storage_root, title_id, blocking);
pipeline_cache_->InitializeShaderStorage(storage_root, title_id, blocking);
const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) {
CommandProcessor::InitializeShaderStorage(cache_root, title_id, blocking);
pipeline_cache_->InitializeShaderStorage(cache_root, title_id, blocking);
}
void D3D12CommandProcessor::RequestFrameTrace(
@ -102,7 +99,7 @@ void D3D12CommandProcessor::RestoreEdramSnapshot(const void* snapshot) {
}
uint32_t D3D12CommandProcessor::GetCurrentColorMask(
const D3D12Shader* pixel_shader) const {
const Shader* pixel_shader) const {
if (pixel_shader == nullptr) {
return 0;
}
@ -159,25 +156,16 @@ void D3D12CommandProcessor::SubmitBarriers() {
}
ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature(
const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader) {
assert_true(vertex_shader->is_translated());
const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
bool tessellated) {
if (bindless_resources_used_) {
return vertex_shader->host_vertex_shader_type() !=
Shader::HostVertexShaderType::kVertex
? root_signature_bindless_ds_
: root_signature_bindless_vs_;
return tessellated ? root_signature_bindless_ds_
: root_signature_bindless_vs_;
}
assert_true(pixel_shader == nullptr || pixel_shader->is_translated());
D3D12_SHADER_VISIBILITY vertex_visibility;
if (vertex_shader->host_vertex_shader_type() !=
Shader::HostVertexShaderType::kVertex) {
vertex_visibility = D3D12_SHADER_VISIBILITY_DOMAIN;
} else {
vertex_visibility = D3D12_SHADER_VISIBILITY_VERTEX;
}
D3D12_SHADER_VISIBILITY vertex_visibility =
tessellated ? D3D12_SHADER_VISIBILITY_DOMAIN
: D3D12_SHADER_VISIBILITY_VERTEX;
uint32_t texture_count_vertex, sampler_count_vertex;
vertex_shader->GetTextureBindings(texture_count_vertex);
@ -393,7 +381,7 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature(
}
uint32_t D3D12CommandProcessor::GetRootBindfulExtraParameterIndices(
const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader,
const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
RootBindfulExtraParameterIndices& indices_out) {
uint32_t texture_count_pixel = 0, sampler_count_pixel = 0;
if (pixel_shader != nullptr) {
@ -1202,6 +1190,7 @@ bool D3D12CommandProcessor::SetupContext() {
pipeline_cache_ = std::make_unique<PipelineCache>(
*this, *register_file_, bindless_resources_used_, edram_rov_used_,
render_target_cache_->depth_float24_conversion(),
texture_cache_->IsResolutionScale2X() ? 2 : 1);
if (!pipeline_cache_->Initialize()) {
XELOGE("Failed to initialize the graphics pipeline cache");
@ -1804,8 +1793,7 @@ Shader* D3D12CommandProcessor::LoadShader(xenos::ShaderType shader_type,
uint32_t guest_address,
const uint32_t* host_address,
uint32_t dword_count) {
return pipeline_cache_->LoadShader(shader_type, guest_address, host_address,
dword_count);
return pipeline_cache_->LoadShader(shader_type, host_address, dword_count);
}
bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
@ -1851,21 +1839,30 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// Need a pixel shader in normal color mode.
return false;
}
// Get tessellation info for the current draw for vertex shader translation.
Shader::HostVertexShaderType host_vertex_shader_type =
pipeline_cache_->GetHostVertexShaderTypeIfValid();
if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) {
DxbcShaderTranslator::Modification vertex_shader_modification;
DxbcShaderTranslator::Modification pixel_shader_modification;
if (!pipeline_cache_->GetCurrentShaderModifications(
vertex_shader_modification, pixel_shader_modification)) {
return false;
}
D3D12Shader::D3D12Translation* vertex_shader_translation =
static_cast<D3D12Shader::D3D12Translation*>(
vertex_shader->GetOrCreateTranslation(
vertex_shader_modification.value));
D3D12Shader::D3D12Translation* pixel_shader_translation =
pixel_shader ? static_cast<D3D12Shader::D3D12Translation*>(
pixel_shader->GetOrCreateTranslation(
pixel_shader_modification.value))
: nullptr;
// Translate the shaders now to get memexport configuration and color mask,
// which is needed by the render target cache, to check the possibility of
// doing early depth/stencil, and also to get used textures and samplers.
if (!pipeline_cache_->EnsureShadersTranslated(vertex_shader, pixel_shader,
host_vertex_shader_type)) {
// which is needed by the render target cache, and also to get used textures
// and samplers.
if (!pipeline_cache_->EnsureShadersTranslated(vertex_shader_translation,
pixel_shader_translation)) {
return false;
}
bool tessellated =
host_vertex_shader_type != Shader::HostVertexShaderType::kVertex;
bool tessellated = vertex_shader_modification.host_vertex_shader_type !=
Shader::HostVertexShaderType::kVertex;
// Check if memexport is used. If it is, we can't skip draw calls that have no
// visual effect.
@ -1967,26 +1964,14 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
(pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0);
texture_cache_->RequestTextures(used_texture_mask);
// Check if early depth/stencil can be enabled.
bool early_z;
if (pixel_shader) {
auto rb_colorcontrol = regs.Get<reg::RB_COLORCONTROL>();
early_z = pixel_shader->implicit_early_z_allowed() &&
(!rb_colorcontrol.alpha_test_enable ||
rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) &&
!rb_colorcontrol.alpha_to_mask_enable;
} else {
early_z = true;
}
// Create the pipeline if needed and bind it.
void* pipeline_handle;
ID3D12RootSignature* root_signature;
if (!pipeline_cache_->ConfigurePipeline(
vertex_shader, pixel_shader, primitive_type_converted,
vertex_shader_translation, pixel_shader_translation,
primitive_type_converted,
indexed ? index_buffer_info->format : xenos::IndexFormat::kInt16,
early_z, pipeline_render_targets, &pipeline_handle,
&root_signature)) {
pipeline_render_targets, &pipeline_handle, &root_signature)) {
return false;
}
if (current_cached_pipeline_ != pipeline_handle) {
@ -2014,11 +1999,18 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
pixel_size_x *= 2;
pixel_size_y *= 2;
}
flags::DepthFloat24Conversion depth_float24_conversion =
render_target_cache_->depth_float24_conversion();
draw_util::ViewportInfo viewport_info;
draw_util::GetHostViewportInfo(regs, float(pixel_size_x), float(pixel_size_y),
true, float(D3D12_VIEWPORT_BOUNDS_MAX),
float(D3D12_VIEWPORT_BOUNDS_MAX), false,
viewport_info);
draw_util::GetHostViewportInfo(
regs, float(pixel_size_x), float(pixel_size_y), true,
float(D3D12_VIEWPORT_BOUNDS_MAX), float(D3D12_VIEWPORT_BOUNDS_MAX), false,
!edram_rov_used_ &&
(depth_float24_conversion ==
flags::DepthFloat24Conversion::kOnOutputTruncating ||
depth_float24_conversion ==
flags::DepthFloat24Conversion::kOnOutputRounding),
viewport_info);
draw_util::Scissor scissor;
draw_util::GetScissor(regs, scissor);
scissor.left *= pixel_size_x;
@ -2033,7 +2025,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
UpdateSystemConstantValues(
memexport_used, primitive_polygonal, line_loop_closing_index,
indexed ? index_buffer_info->endianness : xenos::Endian::kNone,
viewport_info, pixel_size_x, pixel_size_y, used_texture_mask, early_z,
viewport_info, pixel_size_x, pixel_size_y, used_texture_mask,
GetCurrentColorMask(pixel_shader), pipeline_render_targets);
// Update constant buffers, descriptors and root parameters.
@ -2659,6 +2651,8 @@ bool D3D12CommandProcessor::EndSubmission(bool is_swap) {
bool is_closing_frame = is_swap && frame_open_;
if (is_closing_frame) {
render_target_cache_->EndFrame();
texture_cache_->EndFrame();
}
@ -2873,8 +2867,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
bool shared_memory_is_uav, bool primitive_polygonal,
uint32_t line_loop_closing_index, xenos::Endian index_endian,
const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x,
uint32_t pixel_size_y, uint32_t used_texture_mask, bool early_z,
uint32_t color_mask,
uint32_t pixel_size_y, uint32_t used_texture_mask, uint32_t color_mask,
const RenderTargetCache::PipelineRenderTarget render_targets[4]) {
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
@ -2992,14 +2985,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
flags |= DxbcShaderTranslator::kSysFlag_KillIfAnyVertexKilled;
}
// Alpha test.
if (rb_colorcontrol.alpha_test_enable) {
flags |= uint32_t(rb_colorcontrol.alpha_func)
<< DxbcShaderTranslator::kSysFlag_AlphaPassIfLess_Shift;
} else {
flags |= DxbcShaderTranslator::kSysFlag_AlphaPassIfLess |
DxbcShaderTranslator::kSysFlag_AlphaPassIfEqual |
DxbcShaderTranslator::kSysFlag_AlphaPassIfGreater;
}
xenos::CompareFunction alpha_test_function =
rb_colorcontrol.alpha_test_enable ? rb_colorcontrol.alpha_func
: xenos::CompareFunction::kAlways;
flags |= uint32_t(alpha_test_function)
<< DxbcShaderTranslator::kSysFlag_AlphaPassIfLess_Shift;
// Gamma writing.
for (uint32_t i = 0; i < 4; ++i) {
if (color_infos[i].color_format ==
@ -3028,7 +3018,9 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
if (rb_depthcontrol.stencil_enable) {
flags |= DxbcShaderTranslator::kSysFlag_ROVStencilTest;
}
if (early_z) {
// Hint - if not applicable to the shader, will not have effect.
if (alpha_test_function == xenos::CompareFunction::kAlways &&
!rb_colorcontrol.alpha_to_mask_enable) {
flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencilEarlyWrite;
}
}

View File

@ -27,6 +27,7 @@
#include "xenia/gpu/d3d12/render_target_cache.h"
#include "xenia/gpu/d3d12/texture_cache.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/dxbc_shader.h"
#include "xenia/gpu/dxbc_shader_translator.h"
#include "xenia/gpu/xenos.h"
#include "xenia/kernel/kernel_state.h"
@ -47,7 +48,7 @@ class D3D12CommandProcessor : public CommandProcessor {
void ClearCaches() override;
void InitializeShaderStorage(const std::filesystem::path& storage_root,
void InitializeShaderStorage(const std::filesystem::path& cache_root,
uint32_t title_id, bool blocking) override;
void RequestFrameTrace(const std::filesystem::path& root_path) override;
@ -88,7 +89,7 @@ class D3D12CommandProcessor : public CommandProcessor {
// there are 4 render targets bound with the same EDRAM base (clearly not
// correct usage), but the shader only clears 1, and then EDRAM buffer stores
// conflict with each other.
uint32_t GetCurrentColorMask(const D3D12Shader* pixel_shader) const;
uint32_t GetCurrentColorMask(const Shader* pixel_shader) const;
void PushTransitionBarrier(
ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state,
@ -100,8 +101,9 @@ class D3D12CommandProcessor : public CommandProcessor {
void SubmitBarriers();
// Finds or creates root signature for a pipeline.
ID3D12RootSignature* GetRootSignature(const D3D12Shader* vertex_shader,
const D3D12Shader* pixel_shader);
ID3D12RootSignature* GetRootSignature(const DxbcShader* vertex_shader,
const DxbcShader* pixel_shader,
bool tessellated);
ui::d3d12::D3D12UploadBufferPool& GetConstantBufferPool() const {
return *constant_buffer_pool_;
@ -300,7 +302,7 @@ class D3D12CommandProcessor : public CommandProcessor {
// Gets the indices of optional root parameters. Returns the total parameter
// count.
static uint32_t GetRootBindfulExtraParameterIndices(
const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader,
const DxbcShader* vertex_shader, const DxbcShader* pixel_shader,
RootBindfulExtraParameterIndices& indices_out);
// BeginSubmission and EndSubmission may be called at any time. If there's an
@ -353,8 +355,7 @@ class D3D12CommandProcessor : public CommandProcessor {
bool shared_memory_is_uav, bool primitive_polygonal,
uint32_t line_loop_closing_index, xenos::Endian index_endian,
const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x,
uint32_t pixel_size_y, uint32_t used_texture_mask, bool early_z,
uint32_t color_mask,
uint32_t pixel_size_y, uint32_t used_texture_mask, uint32_t color_mask,
const RenderTargetCache::PipelineRenderTarget render_targets[4]);
bool UpdateBindings(const D3D12Shader* vertex_shader,
const D3D12Shader* pixel_shader,

View File

@ -10,9 +10,11 @@
#include "xenia/gpu/d3d12/d3d12_shader.h"
#include <cstring>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
#include "xenia/gpu/dxbc_shader.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/ui/d3d12/d3d12_api.h"
@ -22,51 +24,13 @@ namespace d3d12 {
D3D12Shader::D3D12Shader(xenos::ShaderType shader_type, uint64_t data_hash,
const uint32_t* dword_ptr, uint32_t dword_count)
: Shader(shader_type, data_hash, dword_ptr, dword_count) {}
: DxbcShader(shader_type, data_hash, dword_ptr, dword_count) {}
void D3D12Shader::SetTexturesAndSamplers(
const DxbcShaderTranslator::TextureBinding* texture_bindings,
uint32_t texture_binding_count,
const DxbcShaderTranslator::SamplerBinding* sampler_bindings,
uint32_t sampler_binding_count) {
texture_bindings_.clear();
texture_bindings_.reserve(texture_binding_count);
used_texture_mask_ = 0;
for (uint32_t i = 0; i < texture_binding_count; ++i) {
TextureBinding& binding = texture_bindings_.emplace_back();
// For a stable hash.
std::memset(&binding, 0, sizeof(binding));
const DxbcShaderTranslator::TextureBinding& translator_binding =
texture_bindings[i];
binding.bindless_descriptor_index =
translator_binding.bindless_descriptor_index;
binding.fetch_constant = translator_binding.fetch_constant;
binding.dimension = translator_binding.dimension;
binding.is_signed = translator_binding.is_signed;
used_texture_mask_ |= 1u << translator_binding.fetch_constant;
}
sampler_bindings_.clear();
sampler_bindings_.reserve(sampler_binding_count);
for (uint32_t i = 0; i < sampler_binding_count; ++i) {
SamplerBinding binding;
const DxbcShaderTranslator::SamplerBinding& translator_binding =
sampler_bindings[i];
binding.bindless_descriptor_index =
translator_binding.bindless_descriptor_index;
binding.fetch_constant = translator_binding.fetch_constant;
binding.mag_filter = translator_binding.mag_filter;
binding.min_filter = translator_binding.min_filter;
binding.mip_filter = translator_binding.mip_filter;
binding.aniso_filter = translator_binding.aniso_filter;
sampler_bindings_.push_back(binding);
}
}
void D3D12Shader::DisassembleDxbc(const ui::d3d12::D3D12Provider& provider,
bool disassemble_dxbc,
IDxbcConverter* dxbc_converter,
IDxcUtils* dxc_utils,
IDxcCompiler* dxc_compiler) {
void D3D12Shader::D3D12Translation::DisassembleDxbcAndDxil(
const ui::d3d12::D3D12Provider& provider, bool disassemble_dxbc,
IDxbcConverter* dxbc_converter, IDxcUtils* dxc_utils,
IDxcCompiler* dxc_compiler) {
std::string disassembly;
bool is_first_disassembly = true;
if (disassemble_dxbc) {
ID3DBlob* dxbc_disassembly;
@ -77,11 +41,12 @@ void D3D12Shader::DisassembleDxbc(const ui::d3d12::D3D12Provider& provider,
nullptr, &dxbc_disassembly))) {
assert_true(is_first_disassembly);
is_first_disassembly = false;
host_disassembly_.append(
disassembly.append(
reinterpret_cast<const char*>(dxbc_disassembly->GetBufferPointer()));
dxbc_disassembly->Release();
} else {
XELOGE("Failed to disassemble DXBC shader {:016X}", ucode_data_hash());
XELOGE("Failed to disassemble DXBC shader {:016X}",
shader().ucode_data_hash());
}
}
if (dxbc_converter && dxc_utils && dxc_compiler) {
@ -106,29 +71,36 @@ void D3D12Shader::DisassembleDxbc(const ui::d3d12::D3D12Provider& provider,
dxil_disassembly->Release();
if (dxil_disassembly_got_utf8) {
if (!is_first_disassembly) {
host_disassembly_.append("\n\n");
disassembly.append("\n\n");
}
is_first_disassembly = false;
host_disassembly_.append(reinterpret_cast<const char*>(
disassembly.append(reinterpret_cast<const char*>(
dxil_disassembly_utf8->GetStringPointer()));
dxil_disassembly_utf8->Release();
} else {
XELOGE("Failed to get DXIL shader {:016X} disassembly as UTF-8",
ucode_data_hash());
shader().ucode_data_hash());
}
} else {
XELOGE("Failed to disassemble DXIL shader {:016X}",
ucode_data_hash());
shader().ucode_data_hash());
}
} else {
XELOGE("Failed to create a blob with DXIL shader {:016X}",
ucode_data_hash());
shader().ucode_data_hash());
CoTaskMemFree(dxil);
}
} else {
XELOGE("Failed to convert shader {:016X} to DXIL", ucode_data_hash());
XELOGE("Failed to convert shader {:016X} to DXIL",
shader().ucode_data_hash());
}
}
set_host_disassembly(std::move(disassembly));
}
// Override of the translation factory hook: builds a D3D12Translation bound to
// this shader for the requested modification value.
// The object is allocated with `new`; ownership presumably passes to the
// caller - the owner is not visible here, confirm against the base class.
Shader::Translation* D3D12Shader::CreateTranslationInstance(
    uint32_t modification) {
  D3D12Translation* const translation =
      new D3D12Translation(*this, modification);
  return translation;
}
} // namespace d3d12

View File

@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2018 Ben Vanik. All rights reserved. *
* Copyright 2020 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@ -10,106 +10,62 @@
#ifndef XENIA_GPU_D3D12_D3D12_SHADER_H_
#define XENIA_GPU_D3D12_D3D12_SHADER_H_
#include <vector>
#include <atomic>
#include "xenia/gpu/dxbc_shader_translator.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/xenos.h"
#include "xenia/gpu/dxbc_shader.h"
#include "xenia/ui/d3d12/d3d12_provider.h"
namespace xe {
namespace gpu {
namespace d3d12 {
class D3D12Shader : public Shader {
class D3D12Shader : public DxbcShader {
public:
class D3D12Translation : public DxbcTranslation {
public:
D3D12Translation(D3D12Shader& shader, uint32_t modification)
: DxbcTranslation(shader, modification) {}
void DisassembleDxbcAndDxil(const ui::d3d12::D3D12Provider& provider,
bool disassemble_dxbc,
IDxbcConverter* dxbc_converter = nullptr,
IDxcUtils* dxc_utils = nullptr,
IDxcCompiler* dxc_compiler = nullptr);
};
D3D12Shader(xenos::ShaderType shader_type, uint64_t data_hash,
const uint32_t* dword_ptr, uint32_t dword_count);
void SetTexturesAndSamplers(
const DxbcShaderTranslator::TextureBinding* texture_bindings,
uint32_t texture_binding_count,
const DxbcShaderTranslator::SamplerBinding* sampler_bindings,
uint32_t sampler_binding_count);
void SetForcedEarlyZShaderObject(const std::vector<uint8_t>& shader_object) {
forced_early_z_shader_ = shader_object;
}
// Returns the shader with forced early depth/stencil set with
// SetForcedEarlyZShaderObject after translation. If there's none (for
// example, if the shader discards pixels or writes to the depth buffer), an
// empty vector is returned.
const std::vector<uint8_t>& GetForcedEarlyZShaderObject() const {
return forced_early_z_shader_;
}
void DisassembleDxbc(const ui::d3d12::D3D12Provider& provider,
bool disassemble_dxbc,
IDxbcConverter* dxbc_converter = nullptr,
IDxcUtils* dxc_utils = nullptr,
IDxcCompiler* dxc_compiler = nullptr);
static constexpr uint32_t kMaxTextureBindingIndexBits =
DxbcShaderTranslator::kMaxTextureBindingIndexBits;
static constexpr uint32_t kMaxTextureBindings =
DxbcShaderTranslator::kMaxTextureBindings;
struct TextureBinding {
uint32_t bindless_descriptor_index;
uint32_t fetch_constant;
// Stacked and 3D are separate TextureBindings, even for bindless for null
// descriptor handling simplicity.
xenos::FetchOpDimension dimension;
bool is_signed;
};
// Safe to hash and compare with memcmp for layout hashing.
const TextureBinding* GetTextureBindings(uint32_t& count_out) const {
count_out = uint32_t(texture_bindings_.size());
return texture_bindings_.data();
}
const uint32_t GetUsedTextureMask() const { return used_texture_mask_; }
static constexpr uint32_t kMaxSamplerBindingIndexBits =
DxbcShaderTranslator::kMaxSamplerBindingIndexBits;
static constexpr uint32_t kMaxSamplerBindings =
DxbcShaderTranslator::kMaxSamplerBindings;
struct SamplerBinding {
uint32_t bindless_descriptor_index;
uint32_t fetch_constant;
xenos::TextureFilter mag_filter;
xenos::TextureFilter min_filter;
xenos::TextureFilter mip_filter;
xenos::AnisoFilter aniso_filter;
};
const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const {
count_out = uint32_t(sampler_bindings_.size());
return sampler_bindings_.data();
}
// For owning subsystems like the pipeline cache, accessors for unique
// For owning subsystem like the pipeline cache, accessors for unique
// identifiers (used instead of hashes to make sure collisions can't happen)
// of binding layouts used by the shader, for invalidation if a shader with an
// incompatible layout was bound.
size_t GetTextureBindingLayoutUserUID() const {
return texture_binding_layout_user_uid_;
}
void SetTextureBindingLayoutUserUID(size_t uid) {
texture_binding_layout_user_uid_ = uid;
}
size_t GetSamplerBindingLayoutUserUID() const {
return sampler_binding_layout_user_uid_;
}
// Modifications of the same shader can be translated on different threads.
// The "set" function must only be called if "enter" returned true - these are
// set up only once.
bool EnterBindingLayoutUserUIDSetup() {
return !binding_layout_user_uids_set_up_.test_and_set();
}
void SetTextureBindingLayoutUserUID(size_t uid) {
texture_binding_layout_user_uid_ = uid;
}
void SetSamplerBindingLayoutUserUID(size_t uid) {
sampler_binding_layout_user_uid_ = uid;
}
protected:
Translation* CreateTranslationInstance(uint32_t modification) override;
private:
std::vector<TextureBinding> texture_bindings_;
std::vector<SamplerBinding> sampler_bindings_;
std::atomic_flag binding_layout_user_uids_set_up_ = ATOMIC_FLAG_INIT;
size_t texture_binding_layout_user_uid_ = 0;
size_t sampler_binding_layout_user_uid_ = 0;
uint32_t used_texture_mask_ = 0;
std::vector<uint8_t> forced_early_z_shader_;
};
} // namespace d3d12

View File

@ -221,7 +221,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
*reinterpret_cast<const D3DSetSamplePositionsArguments*>(stream);
command_list_1->SetSamplePositions(
args.num_samples_per_pixel, args.num_pixels,
const_cast<D3D12_SAMPLE_POSITION*>(args.sample_positions));
(args.num_samples_per_pixel && args.num_pixels)
? const_cast<D3D12_SAMPLE_POSITION*>(args.sample_positions)
: nullptr);
}
} break;
default:

File diff suppressed because it is too large Load Diff

View File

@ -27,6 +27,7 @@
#include "xenia/gpu/d3d12/d3d12_shader.h"
#include "xenia/gpu/d3d12/render_target_cache.h"
#include "xenia/gpu/dxbc_shader_translator.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/d3d12/d3d12_api.h"
@ -43,36 +44,39 @@ class PipelineCache {
PipelineCache(D3D12CommandProcessor& command_processor,
const RegisterFile& register_file, bool bindless_resources_used,
bool edram_rov_used, uint32_t resolution_scale);
bool edram_rov_used,
flags::DepthFloat24Conversion depth_float24_conversion,
uint32_t resolution_scale);
~PipelineCache();
bool Initialize();
void Shutdown();
void ClearCache(bool shutting_down = false);
void InitializeShaderStorage(const std::filesystem::path& storage_root,
void InitializeShaderStorage(const std::filesystem::path& cache_root,
uint32_t title_id, bool blocking);
void ShutdownShaderStorage();
void EndSubmission();
bool IsCreatingPipelines();
D3D12Shader* LoadShader(xenos::ShaderType shader_type, uint32_t guest_address,
D3D12Shader* LoadShader(xenos::ShaderType shader_type,
const uint32_t* host_address, uint32_t dword_count);
// Returns the host vertex shader type for the current draw if it's valid and
// supported, or Shader::HostVertexShaderType(-1) if not.
Shader::HostVertexShaderType GetHostVertexShaderTypeIfValid() const;
// Retrieves the shader modifications for the current state, and returns
// whether they are valid.
bool GetCurrentShaderModifications(
DxbcShaderTranslator::Modification& vertex_shader_modification_out,
DxbcShaderTranslator::Modification& pixel_shader_modification_out) const;
// Translates shaders if needed, also making shader info up to date.
bool EnsureShadersTranslated(
D3D12Shader* vertex_shader, D3D12Shader* pixel_shader,
Shader::HostVertexShaderType host_vertex_shader_type);
bool EnsureShadersTranslated(D3D12Shader::D3D12Translation* vertex_shader,
D3D12Shader::D3D12Translation* pixel_shader);
bool ConfigurePipeline(
D3D12Shader* vertex_shader, D3D12Shader* pixel_shader,
D3D12Shader::D3D12Translation* vertex_shader,
D3D12Shader::D3D12Translation* pixel_shader,
xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format,
bool early_z,
const RenderTargetCache::PipelineRenderTarget render_targets[5],
void** pipeline_handle_out, ID3D12RootSignature** root_signature_out);
@ -86,13 +90,12 @@ class PipelineCache {
XEPACKEDSTRUCT(ShaderStoredHeader, {
uint64_t ucode_data_hash;
uint32_t ucode_dword_count : 16;
uint32_t ucode_dword_count : 31;
xenos::ShaderType type : 1;
Shader::HostVertexShaderType host_vertex_shader_type : 3;
reg::SQ_PROGRAM_CNTL sq_program_cntl;
static constexpr uint32_t kVersion = 0x20200405;
static constexpr uint32_t kVersion = 0x20201207;
});
// Update PipelineDescription::kVersion if any of the Pipeline* enums are
@ -170,28 +173,28 @@ class PipelineCache {
uint64_t vertex_shader_hash;
// 0 if drawing without a pixel shader.
uint64_t pixel_shader_hash;
uint32_t vertex_shader_modification;
uint32_t pixel_shader_modification;
int32_t depth_bias;
float depth_bias_slope_scaled;
PipelineStripCutIndex strip_cut_index : 2; // 2
Shader::HostVertexShaderType host_vertex_shader_type : 3; // 5
PipelineStripCutIndex strip_cut_index : 2; // 2
// PipelinePrimitiveTopologyType for a vertex shader.
// xenos::TessellationMode for a domain shader.
uint32_t primitive_topology_type_or_tessellation_mode : 2; // 7
uint32_t primitive_topology_type_or_tessellation_mode : 2; // 4
// Zero for non-kVertex host_vertex_shader_type.
PipelineGeometryShader geometry_shader : 2; // 9
uint32_t fill_mode_wireframe : 1; // 10
PipelineCullMode cull_mode : 2; // 12
uint32_t front_counter_clockwise : 1; // 13
uint32_t depth_clip : 1; // 14
uint32_t rov_msaa : 1; // 15
xenos::DepthRenderTargetFormat depth_format : 1; // 16
xenos::CompareFunction depth_func : 3; // 19
uint32_t depth_write : 1; // 20
uint32_t stencil_enable : 1; // 21
uint32_t stencil_read_mask : 8; // 29
uint32_t force_early_z : 1; // 30
PipelineGeometryShader geometry_shader : 2; // 6
uint32_t fill_mode_wireframe : 1; // 7
PipelineCullMode cull_mode : 2; // 9
uint32_t front_counter_clockwise : 1; // 10
uint32_t depth_clip : 1; // 11
uint32_t rov_msaa : 1; // 12
xenos::DepthRenderTargetFormat depth_format : 1; // 13
xenos::CompareFunction depth_func : 3; // 16
uint32_t depth_write : 1; // 17
uint32_t stencil_enable : 1; // 18
uint32_t stencil_read_mask : 8; // 26
uint32_t stencil_write_mask : 8; // 8
xenos::StencilOp stencil_front_fail_op : 3; // 11
@ -205,7 +208,7 @@ class PipelineCache {
PipelineRenderTarget render_targets[4];
static constexpr uint32_t kVersion = 0x20200405;
static constexpr uint32_t kVersion = 0x20201207;
});
XEPACKEDSTRUCT(PipelineStoredDescription, {
@ -215,24 +218,31 @@ class PipelineCache {
struct PipelineRuntimeDescription {
ID3D12RootSignature* root_signature;
D3D12Shader* vertex_shader;
D3D12Shader* pixel_shader;
D3D12Shader::D3D12Translation* vertex_shader;
D3D12Shader::D3D12Translation* pixel_shader;
PipelineDescription description;
};
// Returns the host vertex shader type for the current draw if it's valid and
// supported, or Shader::HostVertexShaderType(-1) if not.
Shader::HostVertexShaderType GetCurrentHostVertexShaderTypeIfValid() const;
D3D12Shader* LoadShader(xenos::ShaderType shader_type,
const uint32_t* host_address, uint32_t dword_count,
uint64_t data_hash);
// Can be called from multiple threads.
bool TranslateShader(DxbcShaderTranslator& translator, D3D12Shader& shader,
bool TranslateShader(DxbcShaderTranslator& translator,
D3D12Shader::D3D12Translation& translation,
reg::SQ_PROGRAM_CNTL cntl,
IDxbcConverter* dxbc_converter = nullptr,
IDxcUtils* dxc_utils = nullptr,
IDxcCompiler* dxc_compiler = nullptr,
Shader::HostVertexShaderType host_vertex_shader_type =
Shader::HostVertexShaderType::kVertex);
IDxcCompiler* dxc_compiler = nullptr);
bool GetCurrentStateDescription(
D3D12Shader* vertex_shader, D3D12Shader* pixel_shader,
D3D12Shader::D3D12Translation* vertex_shader,
D3D12Shader::D3D12Translation* pixel_shader,
xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format,
bool early_z,
const RenderTargetCache::PipelineRenderTarget render_targets[5],
PipelineRuntimeDescription& runtime_description_out);
@ -243,6 +253,8 @@ class PipelineCache {
const RegisterFile& register_file_;
bool bindless_resources_used_;
bool edram_rov_used_;
// 20e4 depth conversion mode to use for non-ROV output.
flags::DepthFloat24Conversion depth_float24_conversion_;
uint32_t resolution_scale_;
// Reusable shader translator.
@ -267,7 +279,7 @@ class PipelineCache {
// Texture binding layouts of different shaders, for obtaining layout UIDs.
std::vector<D3D12Shader::TextureBinding> texture_binding_layouts_;
// Map of texture binding layouts used by shaders, for obtaining UIDs. Keys
// are XXH64 hashes of layouts, values need manual collision resolution using
// are XXH3 hashes of layouts, values need manual collision resolution using
// layout_vector_offset:layout_length of texture_binding_layouts_.
std::unordered_multimap<uint64_t, LayoutUID,
xe::hash::IdentityHasher<uint64_t>>
@ -275,7 +287,7 @@ class PipelineCache {
// Bindless sampler indices of different shaders, for obtaining layout UIDs.
// For bindful, sampler count is used as the UID instead.
std::vector<uint32_t> bindless_sampler_layouts_;
// Keys are XXH64 hashes of used bindless sampler indices.
// Keys are XXH3 hashes of used bindless sampler indices.
std::unordered_multimap<uint64_t, LayoutUID,
xe::hash::IdentityHasher<uint64_t>>
bindless_sampler_layout_map_;
@ -300,11 +312,14 @@ class PipelineCache {
Pipeline* current_pipeline_ = nullptr;
// Currently open shader storage path.
std::filesystem::path shader_storage_root_;
std::filesystem::path shader_storage_cache_root_;
uint32_t shader_storage_title_id_ = 0;
// Shader storage output stream, for preload in the next emulator runs.
FILE* shader_storage_file_ = nullptr;
// For only writing shaders to the currently open storage once, incremented
// when switching the storage.
uint32_t shader_storage_index_ = 0;
bool shader_storage_file_flush_needed_ = false;
// Pipeline storage output stream, for preload in the next emulator runs.

View File

@ -40,11 +40,13 @@ namespace d3d12 {
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_load_color_32bpp_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_load_color_64bpp_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_load_color_7e3_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_load_depth_float24and32_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_load_depth_float_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_load_depth_unorm_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_store_color_32bpp_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_store_color_64bpp_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_store_color_7e3_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_store_depth_float24and32_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_store_depth_float_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/edram_store_depth_unorm_cs.h"
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/resolve_clear_32bpp_2xres_cs.h"
@ -87,6 +89,12 @@ const RenderTargetCache::EdramLoadStoreModeInfo
{edram_load_depth_float_cs, sizeof(edram_load_depth_float_cs),
L"EDRAM Load Float Depth", edram_store_depth_float_cs,
sizeof(edram_store_depth_float_cs), L"EDRAM Store Float Depth"},
{edram_load_depth_float24and32_cs,
sizeof(edram_load_depth_float24and32_cs),
L"EDRAM Load 24-bit & 32-bit Float Depth",
edram_store_depth_float24and32_cs,
sizeof(edram_store_depth_float24and32_cs),
L"EDRAM Store 24-bit & 32-bit Float Depth"},
};
const std::pair<const uint8_t*, size_t>
@ -126,6 +134,8 @@ RenderTargetCache::RenderTargetCache(D3D12CommandProcessor& command_processor,
RenderTargetCache::~RenderTargetCache() { Shutdown(); }
bool RenderTargetCache::Initialize(const TextureCache& texture_cache) {
depth_float24_conversion_ = flags::GetDepthFloat24Conversion();
// EDRAM buffer size depends on this.
resolution_scale_2x_ = texture_cache.IsResolutionScale2X();
assert_false(resolution_scale_2x_ && !edram_rov_used_);
@ -420,7 +430,8 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) {
return false;
}
resolve_clear_64bpp_pipeline_->SetName(L"Resolve Clear 64bpp");
if (!edram_rov_used_) {
if (!edram_rov_used_ &&
depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnCopy) {
assert_false(resolution_scale_2x_);
resolve_clear_depth_24_32_pipeline_ =
ui::d3d12::util::CreateComputePipeline(
@ -434,7 +445,7 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) {
Shutdown();
return false;
}
resolve_clear_64bpp_pipeline_->SetName(
resolve_clear_depth_24_32_pipeline_->SetName(
L"Resolve Clear 24-bit & 32-bit Depth");
}
@ -1266,10 +1277,12 @@ bool RenderTargetCache::Resolve(const Memory& memory,
if (clear_depth) {
// Also clear the host 32-bit floating-point depth used for loading and
// storing 24-bit floating-point depth at full precision.
bool clear_float32_depth =
!edram_rov_used_ && xenos::DepthRenderTargetFormat(
resolve_info.depth_edram_info.format) ==
xenos::DepthRenderTargetFormat::kD24FS8;
bool clear_float32_depth = !edram_rov_used_ &&
depth_float24_conversion_ ==
flags::DepthFloat24Conversion::kOnCopy &&
xenos::DepthRenderTargetFormat(
resolve_info.depth_edram_info.format) ==
xenos::DepthRenderTargetFormat::kD24FS8;
draw_util::ResolveClearShaderConstants depth_clear_constants;
resolve_info.GetDepthClearShaderConstants(clear_float32_depth,
depth_clear_constants);
@ -1558,7 +1571,8 @@ void RenderTargetCache::RestoreEdramSnapshot(const void* snapshot) {
uint32_t RenderTargetCache::GetEdramBufferSize() const {
uint32_t size = xenos::kEdramSizeBytes;
if (!edram_rov_used_) {
if (!edram_rov_used_ &&
depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnCopy) {
// Two 10 MB pages, one containing color and integer depth data, another
// with 32-bit float depth when 20e4 depth is used to allow for multipass
// drawing without precision loss in case of EDRAM store/load.
@ -1831,12 +1845,15 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
}
RenderTargetCache::EdramLoadStoreMode RenderTargetCache::GetLoadStoreMode(
bool is_depth, uint32_t format) {
bool is_depth, uint32_t format) const {
if (is_depth) {
return xenos::DepthRenderTargetFormat(format) ==
xenos::DepthRenderTargetFormat::kD24FS8
? EdramLoadStoreMode::kDepthFloat
: EdramLoadStoreMode::kDepthUnorm;
if (xenos::DepthRenderTargetFormat(format) ==
xenos::DepthRenderTargetFormat::kD24FS8) {
return depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnCopy
? EdramLoadStoreMode::kDepthFloat24And32
: EdramLoadStoreMode::kDepthFloat;
}
return EdramLoadStoreMode::kDepthUnorm;
}
xenos::ColorRenderTargetFormat color_format =
xenos::ColorRenderTargetFormat(format);

View File

@ -18,6 +18,7 @@
#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
#include "xenia/gpu/d3d12/texture_cache.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
@ -259,6 +260,10 @@ class RenderTargetCache {
void Shutdown();
void ClearCache();
// Returns the 20e4 (24-bit float) depth conversion mode held in
// depth_float24_conversion_.
flags::DepthFloat24Conversion depth_float24_conversion() const {
return depth_float24_conversion_;
}
void CompletedSubmissionUpdated();
void BeginSubmission();
void EndFrame();
@ -318,6 +323,7 @@ class RenderTargetCache {
kColor7e3,
kDepthUnorm,
kDepthFloat,
kDepthFloat24And32,
kCount
};
@ -424,7 +430,7 @@ class RenderTargetCache {
uint32_t instance);
#endif
static EdramLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format);
EdramLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format) const;
// Must be in a frame to call. Stores the dirty areas of the currently bound
// render targets and marks them as clean.
@ -442,6 +448,9 @@ class RenderTargetCache {
bool bindless_resources_used_;
bool edram_rov_used_;
// 20e4 depth conversion mode to use for non-ROV output.
flags::DepthFloat24Conversion depth_float24_conversion_;
// Whether 1 guest pixel is rendered as 2x2 host pixels (currently only
// supported with ROV).
bool resolution_scale_2x_ = false;

View File

@ -9,8 +9,6 @@
#include "xenia/gpu/d3d12/texture_cache.h"
#include "third_party/xxhash/xxhash.h"
#include <algorithm>
#include <cfloat>
#include <cstring>
@ -21,6 +19,7 @@
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/base/xxhash.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/texture_info.h"

View File

@ -114,6 +114,7 @@ int32_t FloatToD3D11Fixed16p8(float f32) {
void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x,
float pixel_size_y, bool origin_bottom_left,
float x_max, float y_max, bool allow_reverse_z,
bool convert_z_to_float24,
ViewportInfo& viewport_info_out) {
assert_true(pixel_size_x >= 1.0f);
assert_true(pixel_size_y >= 1.0f);
@ -227,6 +228,7 @@ void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x,
ndc_offset_y = 0.0f;
}
} else {
viewport_top = 0.0f;
viewport_height = std::min(
float(xenos::kTexture2DCubeMaxWidthHeight) * pixel_size_y, y_max);
ndc_scale_y = (2.0f * pixel_size_y) / viewport_height;
@ -269,6 +271,17 @@ void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x,
ndc_scale_z = -ndc_scale_z;
ndc_offset_z = 1.0f - ndc_offset_z;
}
if (convert_z_to_float24 && regs.Get<reg::RB_DEPTHCONTROL>().z_enable &&
regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
xenos::DepthRenderTargetFormat::kD24FS8) {
// Need to adjust the bounds that the resulting depth values will be clamped
// to after the pixel shader. Preferring adding some error to interpolated Z
// instead if conversion can't be done exactly, without modifying clipping
// bounds by adjusting Z in vertex shaders, as that may cause polygons
// placed explicitly at Z = 0 or Z = W to be clipped.
viewport_z_min = xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_min));
viewport_z_max = xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_max));
}
viewport_info_out.left = viewport_left;
viewport_info_out.top = viewport_top;

View File

@ -53,6 +53,7 @@ struct ViewportInfo {
// Computes the host viewport from the guest register state in `regs` and
// writes the result to viewport_info_out.
// pixel_size_x / pixel_size_y are asserted to be >= 1.0f by the
// implementation.
// convert_z_to_float24: when true, and depth testing is enabled with the
// kD24FS8 depth format, the viewport Z bounds are round-tripped through the
// 20e4 float representation, so the bounds the depth is clamped to are exactly
// representable in that format.
void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x,
float pixel_size_y, bool origin_bottom_left,
float x_max, float y_max, bool allow_reverse_z,
bool convert_z_to_float24,
ViewportInfo& viewport_info_out);
struct Scissor {

View File

@ -0,0 +1,27 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/dxbc_shader.h"
#include <cstring>
namespace xe {
namespace gpu {
// Constructs a DXBC-backed shader by forwarding every argument unchanged to
// the base Shader constructor; the constructor body itself is empty.
DxbcShader::DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash,
const uint32_t* dword_ptr, uint32_t dword_count)
: Shader(shader_type, data_hash, dword_ptr, dword_count) {}
// Translation factory hook: builds a DxbcTranslation bound to this shader for
// the requested modification value.
// The object is allocated with `new`; ownership presumably passes to the
// caller - the owner is not visible here, confirm against the base class.
Shader::Translation* DxbcShader::CreateTranslationInstance(
    uint32_t modification) {
  DxbcTranslation* const translation = new DxbcTranslation(*this, modification);
  return translation;
}
} // namespace gpu
} // namespace xe

View File

@ -0,0 +1,83 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_DXBC_SHADER_H_
#define XENIA_GPU_DXBC_SHADER_H_
#include <vector>
#include "xenia/gpu/dxbc_shader_translator.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/xenos.h"
namespace xe {
namespace gpu {
// Shader whose host translation is DXBC. On top of the base Shader it stores
// the texture and sampler binding lists, and exposes read-only accessors for
// them.
class DxbcShader : public Shader {
 public:
  // Translation type created for DXBC shaders. Currently adds nothing to the
  // base Translation beyond its own type.
  class DxbcTranslation : public Translation {
   public:
    DxbcTranslation(DxbcShader& shader, uint32_t modification)
        : Translation(shader, modification) {}
  };

  DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash,
             const uint32_t* dword_ptr, uint32_t dword_count);

  // Texture binding limits, mirrored from the translator.
  static constexpr uint32_t kMaxTextureBindingIndexBits =
      DxbcShaderTranslator::kMaxTextureBindingIndexBits;
  static constexpr uint32_t kMaxTextureBindings =
      DxbcShaderTranslator::kMaxTextureBindings;
  struct TextureBinding {
    uint32_t bindless_descriptor_index;
    uint32_t fetch_constant;
    // Stacked and 3D are separate TextureBindings, even for bindless for null
    // descriptor handling simplicity.
    xenos::FetchOpDimension dimension;
    bool is_signed;
  };
  // Safe to hash and compare with memcmp for layout hashing.
  // Writes the number of bindings to count_out and returns a pointer to the
  // internal binding storage.
  const TextureBinding* GetTextureBindings(uint32_t& count_out) const {
    count_out = uint32_t(texture_bindings_.size());
    return texture_bindings_.data();
  }
  // Returns the used texture mask by value (no top-level const on the return
  // type - it has no effect for a scalar returned by value).
  // NOTE(review): presumably one bit per referenced texture fetch constant -
  // confirm against the code that fills the mask.
  uint32_t GetUsedTextureMask() const { return used_texture_mask_; }

  // Sampler binding limits, mirrored from the translator.
  static constexpr uint32_t kMaxSamplerBindingIndexBits =
      DxbcShaderTranslator::kMaxSamplerBindingIndexBits;
  static constexpr uint32_t kMaxSamplerBindings =
      DxbcShaderTranslator::kMaxSamplerBindings;
  struct SamplerBinding {
    uint32_t bindless_descriptor_index;
    uint32_t fetch_constant;
    xenos::TextureFilter mag_filter;
    xenos::TextureFilter min_filter;
    xenos::TextureFilter mip_filter;
    xenos::AnisoFilter aniso_filter;
  };
  // Writes the number of bindings to count_out and returns a pointer to the
  // internal binding storage.
  const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const {
    count_out = uint32_t(sampler_bindings_.size());
    return sampler_bindings_.data();
  }

 protected:
  // Creates a DxbcTranslation for the given modification.
  Translation* CreateTranslationInstance(uint32_t modification) override;

 private:
  // Grants the translator access to the private binding data below
  // (presumably so it can fill it during translation - confirm in the
  // translator).
  friend class DxbcShaderTranslator;
  std::vector<TextureBinding> texture_bindings_;
  std::vector<SamplerBinding> sampler_bindings_;
  uint32_t used_texture_mask_ = 0;
};
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_DXBC_SHADER_H_

View File

@ -19,6 +19,7 @@
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/math.h"
#include "xenia/gpu/dxbc_shader.h"
DEFINE_bool(dxbc_switch, true,
"Use switch rather than if for flow control. Turning this off or "
@ -76,64 +77,31 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id,
}
DxbcShaderTranslator::~DxbcShaderTranslator() = default;
// Returns a copy of the DXBC container at `shader` in which the first
// dcl_globalFlags token of the shader code chunk has
// D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL OR'd in.
//
// Container layout as read by this function (32-bit words): word 6 is the
// total size in bytes, word 7 is the chunk count, words 8+ are chunk byte
// offsets. Within the code chunk, word 0 is the chunk identifier, word 3 is
// the code length in dwords, and tokens start at word 4.
//
// If no matching chunk or no dcl_globalFlags token is found, the unmodified
// copy is returned. No bounds validation is done - `shader` is assumed to be a
// well-formed container.
std::vector<uint8_t> DxbcShaderTranslator::ForceEarlyDepthStencil(
    const uint8_t* shader) {
  const uint32_t* old_shader = reinterpret_cast<const uint32_t*>(shader);

  // To return something anyway even if patching fails.
  std::vector<uint8_t> new_shader;
  uint32_t shader_size_bytes = old_shader[6];
  new_shader.resize(shader_size_bytes);
  std::memcpy(new_shader.data(), shader, shader_size_bytes);

  // Find the SHEX chunk.
  uint32_t chunk_count = old_shader[7];
  for (uint32_t i = 0; i < chunk_count; ++i) {
    uint32_t chunk_offset_bytes = old_shader[8 + i];
    const uint32_t* chunk = old_shader + chunk_offset_bytes / sizeof(uint32_t);
    if (chunk[0] != 'XEHS') {
      continue;
    }
    // Find dcl_globalFlags and patch it.
    uint32_t code_size_dwords = chunk[3];
    chunk += 4;
    for (uint32_t j = 0; j < code_size_dwords;) {
      uint32_t opcode_token = chunk[j];
      uint32_t opcode = DECODE_D3D10_SB_OPCODE_TYPE(opcode_token);
      if (opcode == D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) {
        opcode_token |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL;
        // Patch the token in the copy; the input buffer is left untouched.
        std::memcpy(new_shader.data() +
                        (chunk_offset_bytes + (4 + j) * sizeof(uint32_t)),
                    &opcode_token, sizeof(uint32_t));
        // Recalculate the checksum since the shader was modified.
        CalculateDXBCChecksum(
            reinterpret_cast<unsigned char*>(new_shader.data()),
            shader_size_bytes,
            reinterpret_cast<unsigned int*>(new_shader.data() +
                                            sizeof(uint32_t)));
        break;
      }
      // Custom data blocks carry their length in the following dword; all
      // other instructions encode their length in the opcode token.
      if (opcode == D3D10_SB_OPCODE_CUSTOMDATA) {
        j += chunk[j + 1];
      } else {
        j += DECODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(opcode_token);
      }
    }
    // Only the first matching chunk is processed.
    break;
  }

  // Plain return of the local so copy elision / implicit move applies;
  // wrapping it in std::move would inhibit NRVO.
  return new_shader;
}
std::vector<uint8_t> DxbcShaderTranslator::CreateDepthOnlyPixelShader() {
Reset();
Reset(xenos::ShaderType::kPixel);
is_depth_only_pixel_shader_ = true;
StartTranslation();
return std::move(CompleteTranslation());
}
void DxbcShaderTranslator::Reset() {
ShaderTranslator::Reset();
// Builds the default modification value for a shader of the given type.
// Vertex shaders record the requested host vertex shader type; pixel shaders
// get the kNoModifiers depth/stencil mode. Any other shader type leaves the
// default-constructed Modification untouched. The result is returned through
// the Modification's `value` member.
uint32_t DxbcShaderTranslator::GetDefaultModification(
    xenos::ShaderType shader_type,
    Shader::HostVertexShaderType host_vertex_shader_type) const {
  Modification default_modification;
  if (shader_type == xenos::ShaderType::kVertex) {
    default_modification.host_vertex_shader_type = host_vertex_shader_type;
  } else if (shader_type == xenos::ShaderType::kPixel) {
    default_modification.depth_stencil_mode =
        Modification::DepthStencilMode::kNoModifiers;
  }
  return default_modification.value;
}
void DxbcShaderTranslator::Reset(xenos::ShaderType shader_type) {
ShaderTranslator::Reset(shader_type);
shader_code_.clear();
@ -152,7 +120,7 @@ void DxbcShaderTranslator::Reset() {
in_domain_location_used_ = 0;
in_primitive_id_used_ = false;
in_control_point_index_used_ = false;
in_position_xy_used_ = false;
in_position_used_ = 0;
in_front_face_used_ = false;
system_temp_count_current_ = 0;
@ -457,7 +425,9 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() {
// Remember that x# are only accessible via mov load or store - use a
// temporary variable if need to do any computations!
switch (host_vertex_shader_type()) {
Shader::HostVertexShaderType host_vertex_shader_type =
GetDxbcShaderModification().host_vertex_shader_type;
switch (host_vertex_shader_type) {
case Shader::HostVertexShaderType::kVertex:
StartVertexShader_LoadVertexIndex();
break;
@ -618,7 +588,7 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() {
default:
// TODO(Triang3l): Support line and non-adaptive quad patches.
assert_unhandled_case(host_vertex_shader_type());
assert_unhandled_case(host_vertex_shader_type);
EmitTranslationError(
"Unsupported host vertex shader type in StartVertexOrDomainShader");
break;
@ -720,7 +690,7 @@ void DxbcShaderTranslator::StartPixelShader() {
// faceness as X sign bit. Using Z as scratch register now.
if (edram_rov_used_) {
// Get XY address of the current host pixel as float.
in_position_xy_used_ = true;
in_position_used_ |= 0b0011;
DxbcOpRoundZ(DxbcDest::R(param_gen_temp, 0b0011),
DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition)));
// Revert resolution scale - after truncating, so if the pixel position
@ -744,7 +714,7 @@ void DxbcShaderTranslator::StartPixelShader() {
} else {
// Get XY address of the current SSAA sample by converting
// SV_Position.xy to an integer.
in_position_xy_used_ = true;
in_position_used_ |= 0b0011;
DxbcOpFToU(DxbcDest::R(param_gen_temp, 0b0011),
DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition)));
// Undo SSAA that is used instead of MSAA - since it's used as a
@ -870,7 +840,7 @@ void DxbcShaderTranslator::StartPixelShader() {
void DxbcShaderTranslator::StartTranslation() {
// Allocate global system temporary registers that may also be used in the
// epilogue.
if (IsDxbcVertexOrDomainShader()) {
if (is_vertex_shader()) {
system_temp_position_ = PushSystemTemp(0b1111);
system_temp_point_size_edge_flag_kill_vertex_ = PushSystemTemp(0b0100);
// Set the point size to a negative value to tell the geometry shader that
@ -879,20 +849,21 @@ void DxbcShaderTranslator::StartTranslation() {
DxbcOpMov(
DxbcDest::R(system_temp_point_size_edge_flag_kill_vertex_, 0b0001),
DxbcSrc::LF(-1.0f));
} else if (IsDxbcPixelShader()) {
} else if (is_pixel_shader()) {
if (edram_rov_used_) {
// Will be initialized unconditionally.
system_temp_rov_params_ = PushSystemTemp();
if (ROV_IsDepthStencilEarly() || writes_depth()) {
// If the shader doesn't write to oDepth, each component will be written
// to if depth/stencil is enabled and the respective sample is covered -
// so need to initialize now because the first writes will be
// conditional. If the shader writes to oDepth, this is oDepth of the
// shader, written by the guest code, so initialize because assumptions
// can't be made about the integrity of the guest code.
system_temp_rov_depth_stencil_ =
PushSystemTemp(writes_depth() ? 0b0001 : 0b1111);
}
}
if (IsDepthStencilSystemTempUsed()) {
// If the shader doesn't write to oDepth, and ROV is used, each
// component will be written to if depth/stencil is enabled and the
// respective sample is covered - so need to initialize now because the
// first writes will be conditional.
// If the shader writes to oDepth, this is oDepth of the shader, written
// by the guest code, so initialize because assumptions can't be made
// about the integrity of the guest code.
system_temp_depth_stencil_ =
PushSystemTemp(writes_depth() ? 0b0001 : 0b1111);
}
for (uint32_t i = 0; i < 4; ++i) {
if (writes_color_target(i)) {
@ -942,7 +913,7 @@ void DxbcShaderTranslator::StartTranslation() {
// Zero general-purpose registers to prevent crashes when the game
// references them after only initializing them conditionally.
for (uint32_t i = IsDxbcPixelShader() ? xenos::kMaxInterpolators : 0;
for (uint32_t i = is_pixel_shader() ? xenos::kMaxInterpolators : 0;
i < register_count(); ++i) {
DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, i)
: DxbcDest::R(i),
@ -951,9 +922,9 @@ void DxbcShaderTranslator::StartTranslation() {
}
// Write stage-specific prologue.
if (IsDxbcVertexOrDomainShader()) {
if (is_vertex_shader()) {
StartVertexOrDomainShader();
} else if (IsDxbcPixelShader()) {
} else if (is_pixel_shader()) {
StartPixelShader();
}
@ -1168,31 +1139,31 @@ void DxbcShaderTranslator::CompleteShaderCode() {
}
// Write stage-specific epilogue.
if (IsDxbcVertexOrDomainShader()) {
if (is_vertex_shader()) {
CompleteVertexOrDomainShader();
} else if (IsDxbcPixelShader()) {
} else if (is_pixel_shader()) {
CompletePixelShader();
}
// Return from `main`.
DxbcOpRet();
if (IsDxbcVertexOrDomainShader()) {
if (is_vertex_shader()) {
// Release system_temp_position_ and
// system_temp_point_size_edge_flag_kill_vertex_.
PopSystemTemp(2);
} else if (IsDxbcPixelShader()) {
} else if (is_pixel_shader()) {
// Release system_temps_color_.
for (int32_t i = 3; i >= 0; --i) {
if (writes_color_target(i)) {
PopSystemTemp();
}
}
if (IsDepthStencilSystemTempUsed()) {
// Release system_temp_depth_stencil_.
PopSystemTemp();
}
if (edram_rov_used_) {
if (ROV_IsDepthStencilEarly() || writes_depth()) {
// Release system_temp_rov_depth_stencil_.
PopSystemTemp();
}
// Release system_temp_rov_params_.
PopSystemTemp();
}
@ -1303,6 +1274,44 @@ std::vector<uint8_t> DxbcShaderTranslator::CompleteTranslation() {
return shader_object_bytes;
}
// Copies the texture and sampler bindings collected by the translator into the
// DxbcShader the translation belongs to. Does nothing unless
// setup_shader_post_translation_info is set and the shader is a DxbcShader.
void DxbcShaderTranslator::PostTranslation(
    Shader::Translation& translation, bool setup_shader_post_translation_info) {
  if (!setup_shader_post_translation_info) {
    return;
  }
  DxbcShader* dxbc_shader = dynamic_cast<DxbcShader*>(&translation.shader());
  if (!dxbc_shader) {
    return;
  }

  // Textures - rebuild the binding list and the mask of the fetch constants
  // referenced by the bindings.
  dxbc_shader->texture_bindings_.clear();
  dxbc_shader->texture_bindings_.reserve(texture_bindings_.size());
  dxbc_shader->used_texture_mask_ = 0;
  for (const TextureBinding& source_texture : texture_bindings_) {
    DxbcShader::TextureBinding& target_texture =
        dxbc_shader->texture_bindings_.emplace_back();
    // Zero the entire binding before filling the fields, for a stable hash.
    std::memset(&target_texture, 0, sizeof(target_texture));
    target_texture.bindless_descriptor_index =
        source_texture.bindless_descriptor_index;
    target_texture.fetch_constant = source_texture.fetch_constant;
    target_texture.dimension = source_texture.dimension;
    target_texture.is_signed = source_texture.is_signed;
    dxbc_shader->used_texture_mask_ |= 1u << source_texture.fetch_constant;
  }

  // Samplers - rebuild the binding list.
  dxbc_shader->sampler_bindings_.clear();
  dxbc_shader->sampler_bindings_.reserve(sampler_bindings_.size());
  for (const SamplerBinding& source_sampler : sampler_bindings_) {
    DxbcShader::SamplerBinding& target_sampler =
        dxbc_shader->sampler_bindings_.emplace_back();
    target_sampler.bindless_descriptor_index =
        source_sampler.bindless_descriptor_index;
    target_sampler.fetch_constant = source_sampler.fetch_constant;
    target_sampler.mag_filter = source_sampler.mag_filter;
    target_sampler.min_filter = source_sampler.min_filter;
    target_sampler.mip_filter = source_sampler.mip_filter;
    target_sampler.aniso_filter = source_sampler.aniso_filter;
  }
}
void DxbcShaderTranslator::EmitInstructionDisassembly() {
if (!emit_source_map_) {
return;
@ -1527,19 +1536,20 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
}
break;
case InstructionStorageTarget::kDepth:
// Writes X to scalar oDepth or to X of system_temp_rov_depth_stencil_, no
// Writes X to scalar oDepth or to X of system_temp_depth_stencil_, no
// additional swizzling needed.
assert_true(used_write_mask == 0b0001);
assert_true(writes_depth());
if (edram_rov_used_) {
dest = DxbcDest::R(system_temp_rov_depth_stencil_);
if (IsDepthStencilSystemTempUsed()) {
dest = DxbcDest::R(system_temp_depth_stencil_);
} else {
dest = DxbcDest::ODepth();
}
// Depth outside [0, 1] is not safe for use with the ROV code. Though 20e4
// float depth can store values below 2, it's a very unusual case.
// Direct3D 10+ SV_Depth, however, can accept any values, including
// specials, when the depth buffer is floating-point.
// Depth outside [0, 1] is not safe for use with the ROV code and with
// 20e4-as-32 conversion. Though 20e4 float depth can store values between
// 1 and 2, it's a very unusual case. Direct3D 10+ SV_Depth, however, can
// accept any values, including specials, when the depth buffer is
// floating-point; but depth is clamped to the viewport bounds anyway.
is_clamped = true;
break;
}
@ -2094,7 +2104,7 @@ void DxbcShaderTranslator::WriteResourceDefinitions() {
// ds_5_1
shader_object_.push_back(0x44530501u);
} else {
assert_true(IsDxbcPixelShader());
assert_true(is_pixel_shader());
// ps_5_1
shader_object_.push_back(0xFFFF0501u);
}
@ -2765,7 +2775,7 @@ void DxbcShaderTranslator::WriteInputSignature() {
control_point_index.semantic_name = semantic_offset;
}
semantic_offset += AppendString(shader_object_, "XEVERTEXID");
} else if (IsDxbcPixelShader()) {
} else if (is_pixel_shader()) {
// Written dynamically, so assume it's always used if it can be written to
// any interpolator register.
bool param_gen_used = !is_depth_only_pixel_shader_ && register_count() != 0;
@ -2843,7 +2853,7 @@ void DxbcShaderTranslator::WriteInputSignature() {
position.component_type = DxbcSignatureRegisterComponentType::kFloat32;
position.register_index = uint32_t(InOutRegister::kPSInPosition);
position.mask = 0b1111;
position.always_reads_mask = in_position_xy_used_ ? 0b0011 : 0b0000;
position.always_reads_mask = in_position_used_;
}
// Is front face (SV_IsFrontFace).
@ -2927,7 +2937,9 @@ void DxbcShaderTranslator::WritePatchConstantSignature() {
DxbcName tess_factor_edge_system_value = DxbcName::kUndefined;
uint32_t tess_factor_inside_count = 0;
DxbcName tess_factor_inside_system_value = DxbcName::kUndefined;
switch (host_vertex_shader_type()) {
Shader::HostVertexShaderType host_vertex_shader_type =
GetDxbcShaderModification().host_vertex_shader_type;
switch (host_vertex_shader_type) {
case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
tess_factor_edge_count = 3;
@ -2944,7 +2956,7 @@ void DxbcShaderTranslator::WritePatchConstantSignature() {
break;
default:
// TODO(Triang3l): Support line patches.
assert_unhandled_case(host_vertex_shader_type());
assert_unhandled_case(host_vertex_shader_type);
EmitTranslationError(
"Unsupported host vertex shader type in WritePatchConstantSignature");
}
@ -3033,7 +3045,7 @@ void DxbcShaderTranslator::WriteOutputSignature() {
constexpr size_t kParameterDwords =
sizeof(DxbcSignatureParameter) / sizeof(uint32_t);
if (IsDxbcVertexOrDomainShader()) {
if (is_vertex_shader()) {
// Interpolators (TEXCOORD#).
size_t interpolator_position = shader_object_.size();
shader_object_.resize(shader_object_.size() +
@ -3195,7 +3207,7 @@ void DxbcShaderTranslator::WriteOutputSignature() {
cull_distance.semantic_name = semantic_offset;
}
semantic_offset += AppendString(shader_object_, "SV_CullDistance");
} else if (IsDxbcPixelShader()) {
} else if (is_pixel_shader()) {
if (!edram_rov_used_) {
// Color render targets (SV_Target#).
size_t target_position = SIZE_MAX;
@ -3217,9 +3229,11 @@ void DxbcShaderTranslator::WriteOutputSignature() {
}
}
// Depth (SV_Depth).
// Depth (SV_Depth or SV_DepthLessEqual).
Modification::DepthStencilMode depth_stencil_mode =
GetDxbcShaderModification().depth_stencil_mode;
size_t depth_position = SIZE_MAX;
if (writes_depth()) {
if (writes_depth() || DSV_IsWritingFloat24Depth()) {
depth_position = shader_object_.size();
shader_object_.resize(shader_object_.size() + kParameterDwords);
++parameter_count;
@ -3253,7 +3267,15 @@ void DxbcShaderTranslator::WriteOutputSignature() {
depth_position);
depth.semantic_name = semantic_offset;
}
semantic_offset += AppendString(shader_object_, "SV_Depth");
const char* depth_semantic_name;
if (!writes_depth() &&
GetDxbcShaderModification().depth_stencil_mode ==
Modification::DepthStencilMode::kFloat24Truncating) {
depth_semantic_name = "SV_DepthLessEqual";
} else {
depth_semantic_name = "SV_Depth";
}
semantic_offset += AppendString(shader_object_, depth_semantic_name);
}
}
}
@ -3276,7 +3298,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
} else if (IsDxbcDomainShader()) {
shader_type = D3D11_SB_DOMAIN_SHADER;
} else {
assert_true(IsDxbcPixelShader());
assert_true(is_pixel_shader());
shader_type = D3D10_SB_PIXEL_SHADER;
}
shader_object_.push_back(
@ -3296,12 +3318,14 @@ void DxbcShaderTranslator::WriteShaderCode() {
// Inputs/outputs have 1D-indexed operands with a component mask and a
// register index.
Modification shader_modification = GetDxbcShaderModification();
if (IsDxbcDomainShader()) {
// Not using control point data since Xenos only has a vertex shader acting
// as both vertex shader and domain shader.
stat_.c_control_points = 3;
stat_.tessellator_domain = DxbcTessellatorDomain::kTriangle;
switch (host_vertex_shader_type()) {
switch (shader_modification.host_vertex_shader_type) {
case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed:
stat_.c_control_points = 3;
@ -3314,7 +3338,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
break;
default:
// TODO(Triang3l): Support line patches.
assert_unhandled_case(host_vertex_shader_type());
assert_unhandled_case(shader_modification.host_vertex_shader_type);
EmitTranslationError(
"Unsupported host vertex shader type in WriteShaderCode");
}
@ -3330,11 +3354,17 @@ void DxbcShaderTranslator::WriteShaderCode() {
}
// Don't allow refactoring when converting to native code to maintain position
// invariance (needed even in pixel shaders for oDepth invariance). Also this
// dcl will be modified by ForceEarlyDepthStencil.
shader_object_.push_back(
// invariance (needed even in pixel shaders for oDepth invariance).
uint32_t global_flags_opcode =
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1));
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1);
if (is_pixel_shader() &&
GetDxbcShaderModification().depth_stencil_mode ==
Modification::DepthStencilMode::kEarlyHint &&
!edram_rov_used_ && CanWriteZEarly()) {
global_flags_opcode |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL;
}
shader_object_.push_back(global_flags_opcode);
// Constant buffers, from most frequently accessed to least frequently accessed
// (the order is a hint to the driver according to the DXBC header).
@ -3560,7 +3590,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
}
// Inputs and outputs.
if (IsDxbcVertexOrDomainShader()) {
if (is_vertex_shader()) {
if (IsDxbcDomainShader()) {
if (in_domain_location_used_) {
// Domain location input.
@ -3584,7 +3614,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
if (in_control_point_index_used_) {
// Control point indices as float input.
uint32_t control_point_array_size;
switch (host_vertex_shader_type()) {
switch (shader_modification.host_vertex_shader_type) {
case Shader::HostVertexShaderType::kTriangleDomainCPIndexed:
control_point_array_size = 3;
break;
@ -3593,7 +3623,7 @@ void DxbcShaderTranslator::WriteShaderCode() {
break;
default:
// TODO(Triang3l): Support line patches.
assert_unhandled_case(host_vertex_shader_type());
assert_unhandled_case(shader_modification.host_vertex_shader_type);
EmitTranslationError(
"Unsupported host vertex shader type in "
"StartVertexOrDomainShader");
@ -3683,7 +3713,8 @@ void DxbcShaderTranslator::WriteShaderCode() {
uint32_t(InOutRegister::kVSDSOutClipDistance45AndCullDistance));
shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_CULL_DISTANCE));
++stat_.dcl_count;
} else if (IsDxbcPixelShader()) {
} else if (is_pixel_shader()) {
bool is_writing_float24_depth = DSV_IsWritingFloat24Depth();
// Interpolator input.
if (!is_depth_only_pixel_shader_) {
uint32_t interpolator_count =
@ -3725,16 +3756,26 @@ void DxbcShaderTranslator::WriteShaderCode() {
shader_object_.push_back(uint32_t(InOutRegister::kPSInClipSpaceZW));
++stat_.dcl_count;
}
if (in_position_xy_used_) {
// Position input (only XY needed for ps_param_gen, and the ROV depth code
// calculates the depth from clip space Z and W).
if (in_position_used_) {
// Position input (XY needed for ps_param_gen, Z needed for non-ROV
// float24 conversion; the ROV depth code calculates the depth from
// clip space Z and W with pull-mode per-sample interpolation instead).
// At the cost of possibility of MSAA with pixel-rate shading, need
// per-sample depth - otherwise intersections cannot be antialiased, and
// with SV_DepthLessEqual, per-sample (or centroid, but this isn't
// applicable here) position is mandatory. However, with depth output, on
// the guest, there's only one depth value for the whole pixel.
D3D10_SB_INTERPOLATION_MODE position_interpolation_mode =
is_writing_float24_depth && !writes_depth()
? D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE_SAMPLE
: D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE;
shader_object_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_INPUT_PS_SIV) |
ENCODE_D3D10_SB_INPUT_INTERPOLATION_MODE(
D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE) |
position_interpolation_mode) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(4));
shader_object_.push_back(
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_INPUT, 0b0011, 1));
shader_object_.push_back(EncodeVectorMaskedOperand(
D3D10_SB_OPERAND_TYPE_INPUT, in_position_used_, 1));
shader_object_.push_back(uint32_t(InOutRegister::kPSInPosition));
shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_POSITION));
++stat_.dcl_count;
@ -3778,12 +3819,19 @@ void DxbcShaderTranslator::WriteShaderCode() {
}
}
// Depth output.
if (writes_depth()) {
if (is_writing_float24_depth || writes_depth()) {
D3D10_SB_OPERAND_TYPE depth_operand_type;
if (!writes_depth() &&
GetDxbcShaderModification().depth_stencil_mode ==
Modification::DepthStencilMode::kFloat24Truncating) {
depth_operand_type = D3D11_SB_OPERAND_TYPE_OUTPUT_DEPTH_LESS_EQUAL;
} else {
depth_operand_type = D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH;
}
shader_object_.push_back(
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) |
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(2));
shader_object_.push_back(
EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0));
shader_object_.push_back(EncodeScalarOperand(depth_operand_type, 0));
++stat_.dcl_count;
}
}

View File

@ -102,6 +102,51 @@ class DxbcShaderTranslator : public ShaderTranslator {
bool edram_rov_used, bool force_emit_source_map = false);
~DxbcShaderTranslator() override;
// Translation modification bits packed into a single 32-bit value: the
// anonymous bit-field struct below and `value` share the same storage, so the
// whole set of modifiers can be passed around and constructed from a uint32_t.
union Modification {
// If anything in this structure is changed in a way not compatible with
// the previous layout, invalidate the pipeline storages by increasing this
// version number (0xYYYYMMDD)!
static constexpr uint32_t kVersion = 0x20201203;
// Depth / stencil output behavior of non-ROV pixel shaders. Stored in a 2-bit
// field below, so at most 4 values can be represented.
enum class DepthStencilMode : uint32_t {
kNoModifiers,
// [earlydepthstencil] - enable if alpha test and alpha to coverage are
// disabled; ignored if anything in the shader blocks early Z writing
// (which is not known before translation, so this will be set anyway).
kEarlyHint,
// Converting the depth to the closest 32-bit float representable exactly
// as a 20e4 float, to support invariance in cases when the guest
// reuploads a previously resolved depth buffer to the EDRAM, rounding
// towards zero (which contradicts the rounding used by the Direct3D 9
// reference rasterizer, but allows SV_DepthLessEqual to be used to allow
// slightly coarse early Z culling; also truncating regardless of whether
// the shader writes depth and thus always uses SV_Depth, for
// consistency). MSAA is limited - depth must be per-sample
// (SV_DepthLessEqual also explicitly requires sample or centroid position
// interpolation), thus the sampler has to run at sample frequency even if
// the device supports stencil loading and thus true non-ROV MSAA via
// SV_StencilRef.
// Fixed-function viewport depth bounds must be snapped to float24 for
// clamping purposes.
kFloat24Truncating,
// Similar to kFloat24Truncating, but rounding to the nearest even,
// however, always using SV_Depth rather than SV_DepthLessEqual because
// rounding up results in a bigger value. Same viewport usage rules apply.
kFloat24Rounding,
};
// Individual modifiers, aliasing `value`.
struct {
// VS - pipeline stage and input configuration.
Shader::HostVertexShaderType host_vertex_shader_type
: Shader::kHostVertexShaderTypeBitCount;
// PS, non-ROV - depth / stencil output mode.
DepthStencilMode depth_stencil_mode : 2;
};
// All the modifiers as one integer; zero by default.
uint32_t value = 0;
// Non-explicit, so a raw uint32_t modification value converts implicitly.
Modification(uint32_t modification_value = 0) : value(modification_value) {}
};
// Constant buffer bindings in space 0.
enum class CbufferRegister {
kSystemConstants,
@ -144,12 +189,14 @@ class DxbcShaderTranslator : public ShaderTranslator {
kSysFlag_ROVStencilTest_Shift,
// If the depth/stencil test has failed, but resulted in a stencil value
// that is different than the one currently in the depth buffer, write it
// anyway and don't run the shader (to check if the sample may be discarded
// some way). This, however, also results in depth/stencil testing done
// entirely early even when it passes to prevent writing in divergent places
// in the shader. When the shader can kill, this must be set only for
// RB_DEPTHCONTROL EARLY_Z_ENABLE, not for alpha test/alpha to coverage
// disabled.
// anyway and don't run the rest of the shader (to check if the sample may
// be discarded some way) - use when alpha test and alpha to coverage are
// disabled. Ignored by the shader if not applicable to it (like if it has
// kill instructions or writes the depth output).
// TODO(Triang3l): Investigate replacement with an alpha-to-mask flag,
// checking `(flags & (alpha test | alpha to mask)) == (always | disabled)`,
// taking into account the potential relation with occlusion queries (but
// should be safe at least temporarily).
kSysFlag_ROVDepthStencilEarlyWrite_Shift,
kSysFlag_Count,
@ -238,15 +285,15 @@ class DxbcShaderTranslator : public ShaderTranslator {
// EDRAM address calculation.
uint32_t sample_count_log2[2];
float alpha_test_reference;
// If alpha to mask is disabled, the entire alpha_to_mask value must be 0.
// If alpha to mask is enabled, bits 0:7 are sample offsets, and bit 8 must
// be 1.
uint32_t alpha_to_mask;
float color_exp_bias[4];
uint32_t color_output_map[4];
// If alpha to mask is disabled, the entire alpha_to_mask value must be 0.
// If alpha to mask is enabled, bits 0:7 are sample offsets, and bit 8 must
// be 1.
uint32_t edram_resolution_square_scale;
uint32_t edram_pitch_tiles;
union {
@ -358,12 +405,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
bool is_signed;
std::string name;
};
// The first binding returned is at t[SRVMainRegister::kBindfulTexturesStart]
// of space SRVSpace::kMain.
const TextureBinding* GetTextureBindings(uint32_t& count_out) const {
count_out = uint32_t(texture_bindings_.size());
return texture_bindings_.data();
}
// Arbitrary limit - there can't be more than 2048 in a shader-visible
// descriptor heap, though some older hardware (tier 1 resource binding -
@ -385,16 +426,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
xenos::AnisoFilter aniso_filter;
std::string name;
};
const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const {
count_out = uint32_t(sampler_bindings_.size());
return sampler_bindings_.data();
}
// Returns the number of texture SRV and sampler offsets that need to be
// passed via a constant buffer to the shader.
uint32_t GetBindlessResourceCount() const {
return uint32_t(texture_bindings_.size() + sampler_bindings_.size());
}
// Unordered access view bindings in space 0.
enum class UAVRegister {
@ -402,10 +433,6 @@ class DxbcShaderTranslator : public ShaderTranslator {
kEdram,
};
// Creates a copy of the shader with early depth/stencil testing forced,
// overriding that alpha testing is used in the shader.
static std::vector<uint8_t> ForceEarlyDepthStencil(const uint8_t* shader);
// Returns the format with internal flags for passing via the
// edram_rt_format_flags system constant.
static constexpr uint32_t ROV_AddColorFormatFlags(
@ -440,16 +467,22 @@ class DxbcShaderTranslator : public ShaderTranslator {
float& clamp_alpha_high, uint32_t& keep_mask_low,
uint32_t& keep_mask_high);
uint32_t GetDefaultModification(
xenos::ShaderType shader_type,
Shader::HostVertexShaderType host_vertex_shader_type =
Shader::HostVertexShaderType::kVertex) const override;
// Creates a special pixel shader without color outputs - this resets the
// state of the translator.
std::vector<uint8_t> CreateDepthOnlyPixelShader();
protected:
void Reset() override;
void Reset(xenos::ShaderType shader_type) override;
void StartTranslation() override;
std::vector<uint8_t> CompleteTranslation() override;
void PostTranslation(Shader::Translation& translation,
bool setup_shader_post_translation_info) override;
void ProcessLabel(uint32_t cf_index) override;
@ -650,6 +683,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
kInputDomainPoint = 28,
kUnorderedAccessView = 30,
kInputCoverageMask = 35,
kOutputDepthLessEqual = 39,
};
// D3D10_SB_OPERAND_INDEX_DIMENSION
@ -689,6 +723,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
return DxbcOperandDimension::kNoData;
case DxbcOperandType::kInputPrimitiveID:
case DxbcOperandType::kOutputDepth:
case DxbcOperandType::kOutputDepthLessEqual:
return DxbcOperandDimension::kScalar;
case DxbcOperandType::kInputCoverageMask:
return dest_in_dcl ? DxbcOperandDimension::kScalar
@ -860,6 +895,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
return DxbcDest(DxbcOperandType::kUnorderedAccessView, write_mask,
index_1d, index_2d);
}
// Destination operand of the kOutputDepthLessEqual type, with only the first
// component in the write mask.
static DxbcDest ODepthLE() {
  constexpr uint32_t kFirstComponentMask = 0b0001;
  return DxbcDest(DxbcOperandType::kOutputDepthLessEqual, kFirstComponentMask);
}
uint32_t GetMask() const {
switch (GetDimension()) {
@ -2145,21 +2183,19 @@ class DxbcShaderTranslator : public ShaderTranslator {
(index_representation_1 << 25) | (index_representation_2 << 28);
}
// Use these instead of is_vertex_shader/is_pixel_shader because they don't
// take is_depth_only_pixel_shader_ into account.
inline bool IsDxbcVertexOrDomainShader() const {
return !is_depth_only_pixel_shader_ && is_vertex_shader();
Modification GetDxbcShaderModification() const {
return Modification(modification());
}
inline bool IsDxbcVertexShader() const {
return IsDxbcVertexOrDomainShader() &&
host_vertex_shader_type() == Shader::HostVertexShaderType::kVertex;
bool IsDxbcVertexShader() const {
return is_vertex_shader() &&
GetDxbcShaderModification().host_vertex_shader_type ==
Shader::HostVertexShaderType::kVertex;
}
inline bool IsDxbcDomainShader() const {
return IsDxbcVertexOrDomainShader() &&
host_vertex_shader_type() != Shader::HostVertexShaderType::kVertex;
}
inline bool IsDxbcPixelShader() const {
return is_depth_only_pixel_shader_ || is_pixel_shader();
bool IsDxbcDomainShader() const {
return is_vertex_shader() &&
GetDxbcShaderModification().host_vertex_shader_type !=
Shader::HostVertexShaderType::kVertex;
}
// Whether to use switch-case rather than if (pc >= label) for control flow.
@ -2181,10 +2217,37 @@ class DxbcShaderTranslator : public ShaderTranslator {
uint32_t piece_temp_component, uint32_t accumulator_temp,
uint32_t accumulator_temp_component);
// Converts the depth value externally clamped to the representable [0, 2)
// range to 20e4 floating point, with zeros in bits 24:31, rounding to the
// nearest even. Source and destination may be the same, temporary must be
// different than both.
void PreClampedDepthTo20e4(uint32_t d24_temp, uint32_t d24_temp_component,
uint32_t d32_temp, uint32_t d32_temp_component,
uint32_t temp_temp, uint32_t temp_temp_component);
// Whether system_temp_depth_stencil_ is needed by the current shader.
// See system_temp_depth_stencil_ documentation for explanation of cases.
bool IsDepthStencilSystemTempUsed() const {
  // With ROV: the shader writes depth or depth/stencil is early.
  // Without ROV: the shader writes depth and it is converted to float24.
  return edram_rov_used_ ? (writes_depth() || ROV_IsDepthStencilEarly())
                         : (writes_depth() && DSV_IsWritingFloat24Depth());
}
// Whether the current non-ROV pixel shader should convert the depth to 20e4.
bool DSV_IsWritingFloat24Depth() const {
if (edram_rov_used_) {
return false;
}
Modification::DepthStencilMode depth_stencil_mode =
GetDxbcShaderModification().depth_stencil_mode;
return depth_stencil_mode ==
Modification::DepthStencilMode::kFloat24Truncating ||
depth_stencil_mode ==
Modification::DepthStencilMode::kFloat24Rounding;
}
// Whether it's possible and worth skipping running the translated shader for
// 2x2 quads.
bool ROV_IsDepthStencilEarly() const {
return !is_depth_only_pixel_shader_ && !writes_depth();
return !is_depth_only_pixel_shader_ && !writes_depth() &&
memexport_stream_constants().empty();
}
// Converts the depth value to 24-bit (storing the result in bits 0:23 and
// zeros in 24:31, not creating room for stencil - since this may be involved
@ -2197,8 +2260,8 @@ class DxbcShaderTranslator : public ShaderTranslator {
// Does all the depth/stencil-related things, including or not including
// writing based on whether it's late, or on whether it's safe to do it early.
// Updates system_temp_rov_params_ result and coverage if allowed and safe,
// updates system_temp_rov_depth_stencil_, and if early and the coverage is
// empty for all pixels in the 2x2 quad and safe to return early (stencil is
// updates system_temp_depth_stencil_, and if early and the coverage is empty
// for all pixels in the 2x2 quad and safe to return early (stencil is
// unchanged or known that it's safe not to await kills/alphatest/AtoC),
// returns from the shader.
void ROV_DepthStencilTest();
@ -2248,6 +2311,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
// Discards the SSAA sample if it's masked out by alpha to coverage.
void CompletePixelShader_WriteToRTVs_AlphaToMask();
void CompletePixelShader_WriteToRTVs();
void CompletePixelShader_DSV_DepthTo24Bit();
// Masks the sample away from system_temp_rov_params_.x if it's not covered.
// threshold_offset and temp.temp_component can be the same if needed.
void CompletePixelShader_ROV_AlphaToMaskSample(
@ -2333,6 +2397,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
xenos::TextureFilter min_filter,
xenos::TextureFilter mip_filter,
xenos::AnisoFilter aniso_filter);
// Total count of texture SRV and sampler offsets that have to be passed to
// the shader via a constant buffer - one per texture binding plus one per
// sampler binding.
uint32_t GetBindlessResourceCount() const {
  const size_t binding_count =
      texture_bindings_.size() + sampler_bindings_.size();
  return uint32_t(binding_count);
}
// Marks fetch constants as used by the DXBC shader and returns DxbcSrc
// for the words 01 (pair 0), 23 (pair 1) or 45 (pair 2) of the texture fetch
// constant.
@ -2364,7 +2433,7 @@ class DxbcShaderTranslator : public ShaderTranslator {
static uint32_t AppendString(std::vector<uint32_t>& dest, const char* source);
// Returns the length of a string as if it was appended to a DWORD stream, in
// bytes.
static inline uint32_t GetStringLength(const char* source) {
static uint32_t GetStringLength(const char* source) {
return uint32_t(xe::align(std::strlen(source) + 1, sizeof(uint32_t)));
}
@ -2479,8 +2548,8 @@ class DxbcShaderTranslator : public ShaderTranslator {
bool in_primitive_id_used_;
// Whether InOutRegister::kDSInControlPointIndex has been used in the shader.
bool in_control_point_index_used_;
// Whether the XY of the pixel position has been used in the pixel shader.
bool in_position_xy_used_;
// Mask of the pixel/sample position actually used in the pixel shader.
uint32_t in_position_used_;
// Whether the faceness has been used in the pixel shader.
bool in_front_face_used_;
@ -2518,15 +2587,14 @@ class DxbcShaderTranslator : public ShaderTranslator {
// W - Base-relative resolution-scaled EDRAM offset for 64bpp color data, in
// dwords.
uint32_t system_temp_rov_params_;
// ROV only - new depth/stencil data. 4 VGPRs when not writing to oDepth, 1
// VGPR when writing to oDepth. Not used in the depth-only pixel shader (or,
// more formally, if neither early depth-stencil nor oDepth are used) because
// it always calculates and writes in the same place.
// When not writing to oDepth: New per-sample depth/stencil values, generated
// during early depth/stencil test (actual writing checks coverage bits).
// When writing to oDepth: X also used to hold the depth written by the
// shader, later used as a temporary during depth/stencil testing.
uint32_t system_temp_rov_depth_stencil_;
// Two purposes:
// - When writing to oDepth, and either using ROV or converting the depth to
// float24: X also used to hold the depth written by the shader,
// later used as a temporary during depth/stencil testing.
// - Otherwise, when using ROV output with ROV_IsDepthStencilEarly being true:
// New per-sample depth/stencil values, generated during early depth/stencil
// test (actual writing checks coverage bits).
uint32_t system_temp_depth_stencil_;
// Up to 4 color outputs in pixel shaders (because of exponent bias, alpha
// test and remapping, and also for ROV writing).
uint32_t system_temps_color_[4];
@ -2587,6 +2655,8 @@ class DxbcShaderTranslator : public ShaderTranslator {
uint32_t srv_index_bindless_textures_3d_;
uint32_t srv_index_bindless_textures_cube_;
// The first binding is at t[SRVMainRegister::kBindfulTexturesStart] of space
// SRVSpace::kMain.
std::vector<TextureBinding> texture_bindings_;
std::unordered_map<uint32_t, uint32_t>
texture_bindings_for_bindful_srv_indices_;

View File

@ -677,7 +677,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
// Whether to use gradients (implicit or explicit) for LOD calculation.
bool use_computed_lod =
instr.attributes.use_computed_lod &&
(IsDxbcPixelShader() || instr.attributes.use_register_gradients);
(is_pixel_shader() || instr.attributes.use_register_gradients);
if (instr.opcode == FetchOpcode::kGetTextureComputedLod &&
(!use_computed_lod || instr.attributes.use_register_gradients)) {
assert_always();

View File

@ -106,7 +106,7 @@ void DxbcShaderTranslator::ExportToMemory() {
kSysConst_Flags_Vec)
.Select(kSysConst_Flags_Comp),
DxbcSrc::LU(kSysFlag_SharedMemoryIsUAV));
if (IsDxbcPixelShader()) {
if (is_pixel_shader()) {
// Disable memexport in pixel shaders with supersampling since VPOS is
// ambiguous.
if (edram_rov_used_) {

View File

@ -167,7 +167,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// bigger) to integer to system_temp_rov_params_.zw.
// system_temp_rov_params_.z = X host pixel position as uint
// system_temp_rov_params_.w = Y host pixel position as uint
in_position_xy_used_ = true;
in_position_used_ |= 0b0011;
DxbcOpFToU(DxbcDest::R(system_temp_rov_params_, 0b1100),
DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition), 0b01000000));
// Revert the resolution scale to convert the position to guest pixels.
@ -315,7 +315,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// Add host pixel offsets.
// system_temp_rov_params_.y = scaled 32bpp depth/stencil address
// system_temp_rov_params_.z = scaled 32bpp color offset if needed
in_position_xy_used_ = true;
in_position_used_ |= 0b0011;
for (uint32_t i = 0; i < 2; ++i) {
// Convert a position component to integer.
DxbcOpFToU(DxbcDest::R(system_temp_rov_params_, 0b0001),
@ -417,23 +417,50 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
// With early depth/stencil, depth/stencil writing may be deferred to the
// end of the shader to prevent writing in case something (like alpha test,
// which is dynamic GPU state) discards the pixel. So, write directly to the
// persistent register, system_temp_rov_depth_stencil_, instead of a local
// persistent register, system_temp_depth_stencil_, instead of a local
// temporary register.
DxbcDest sample_depth_stencil_dest(
depth_stencil_early
? DxbcDest::R(system_temp_rov_depth_stencil_, 1 << i)
: temp_x_dest);
depth_stencil_early ? DxbcDest::R(system_temp_depth_stencil_, 1 << i)
: temp_x_dest);
DxbcSrc sample_depth_stencil_src(
depth_stencil_early
? DxbcSrc::R(system_temp_rov_depth_stencil_).Select(i)
: temp_x_src);
depth_stencil_early ? DxbcSrc::R(system_temp_depth_stencil_).Select(i)
: temp_x_src);
if (!i) {
if (writes_depth()) {
// Clamp oDepth to the lower viewport depth bound (depth clamp happens
// after the pixel shader in the pipeline, at least on Direct3D 11 and
// Vulkan, thus applies to the shader's depth output too).
system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index;
DxbcOpMax(DxbcDest::R(system_temp_depth_stencil_, 0b0001),
DxbcSrc::R(system_temp_depth_stencil_, DxbcSrc::kXXXX),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_EdramDepthRange_Vec)
.Select(kSysConst_EdramDepthRangeOffset_Comp));
// Calculate the upper Z range bound to temp.x for clamping after
// biasing.
// temp.x = viewport maximum depth
system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index;
DxbcOpAdd(temp_x_dest,
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_EdramDepthRange_Vec)
.Select(kSysConst_EdramDepthRangeOffset_Comp),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_EdramDepthRange_Vec)
.Select(kSysConst_EdramDepthRangeScale_Comp));
// Clamp oDepth to the upper viewport depth bound (already not above 1,
// but saturate for total safety).
// temp.x = free
DxbcOpMin(DxbcDest::R(system_temp_depth_stencil_, 0b0001),
DxbcSrc::R(system_temp_depth_stencil_, DxbcSrc::kXXXX),
temp_x_src, true);
// Convert the shader-generated depth to 24-bit, using temp.x as
// temporary.
ROV_DepthTo24Bit(system_temp_rov_depth_stencil_, 0,
system_temp_rov_depth_stencil_, 0, temp, 0);
ROV_DepthTo24Bit(system_temp_depth_stencil_, 0,
system_temp_depth_stencil_, 0, temp, 0);
} else {
// Load the first sample's Z*W and W to temp.xy - need this regardless
// of coverage for polygon offset.
@ -529,14 +556,14 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
}
// Get if the current sample is covered to temp.w.
// temp.x = first sample's viewport space Z or 24-bit oDepth
// temp.x = first sample's viewport space Z if not writing to oDepth
// temp.y = polygon offset if not writing to oDepth
// temp.z = viewport maximum depth if not writing to oDepth
// temp.w = coverage of the current sample
DxbcOpAnd(temp_w_dest, DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX),
DxbcSrc::LU(1 << i));
// Check if the current sample is covered. Release 1 VGPR.
// temp.x = first sample's viewport space Z or 24-bit oDepth
// temp.x = first sample's viewport space Z if not writing to oDepth
// temp.y = polygon offset if not writing to oDepth
// temp.z = viewport maximum depth if not writing to oDepth
// temp.w = free
@ -546,7 +573,7 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
// Copy the 24-bit depth common to all samples to sample_depth_stencil.
// temp.x = shader-generated 24-bit depth
DxbcOpMov(sample_depth_stencil_dest,
DxbcSrc::R(system_temp_rov_depth_stencil_, DxbcSrc::kXXXX));
DxbcSrc::R(system_temp_depth_stencil_, DxbcSrc::kXXXX));
} else {
if (i) {
// Sample's depth precalculated for sample 0 (for slope-scaled depth
@ -997,51 +1024,60 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
// temp.z = viewport maximum depth if not writing to oDepth
// temp.w = whether depth/stencil has been modified
DxbcOpINE(temp_w_dest, sample_depth_stencil_src, temp_w_src);
// Check if need to write.
// temp.x? = resulting sample depth/stencil
// temp.y = polygon offset if not writing to oDepth
// temp.z = viewport maximum depth if not writing to oDepth
// temp.w = free
DxbcOpIf(true, temp_w_src);
{
if (depth_stencil_early) {
// Get if early depth/stencil write is enabled to temp.w.
// temp.w = whether early depth/stencil write is enabled
system_constants_used_ |= 1ull << kSysConst_Flags_Index;
DxbcOpAnd(temp_w_dest,
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_Flags_Vec)
.Select(kSysConst_Flags_Comp),
DxbcSrc::LU(kSysFlag_ROVDepthStencilEarlyWrite));
// Check if need to write early.
// temp.w = free
DxbcOpIf(true, temp_w_src);
}
// Write the new depth/stencil.
if (uav_index_edram_ == kBindingIndexUnallocated) {
uav_index_edram_ = uav_count_++;
}
DxbcOpStoreUAVTyped(
DxbcDest::U(uav_index_edram_, uint32_t(UAVRegister::kEdram)),
DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kYYYY), 1,
sample_depth_stencil_src);
if (depth_stencil_early) {
// Need to still run the shader to know whether to write the
// depth/stencil value.
DxbcOpElse();
// Set sample bit out of bits 4:7 of system_temp_rov_params_.x if need
// to write later (after checking if the sample is not discarded by a
// kill instruction, alphatest or alpha-to-coverage).
DxbcOpOr(DxbcDest::R(system_temp_rov_params_, 0b0001),
DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX),
DxbcSrc::LU(1 << (4 + i)));
// Close the early depth/stencil check.
DxbcOpEndIf();
if (depth_stencil_early && !CanWriteZEarly()) {
// Set the sample bit in bits 4:7 of system_temp_rov_params_.x - always
// need to write late in this shader, as it may do something like
// explicitly killing pixels.
DxbcOpBFI(DxbcDest::R(system_temp_rov_params_, 0b0001), DxbcSrc::LU(1),
DxbcSrc::LU(4 + i), temp_w_src,
DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX));
} else {
// Check if need to write.
// temp.x? = resulting sample depth/stencil
// temp.y = polygon offset if not writing to oDepth
// temp.z = viewport maximum depth if not writing to oDepth
// temp.w = free
DxbcOpIf(true, temp_w_src);
{
if (depth_stencil_early) {
// Get if early depth/stencil write is enabled to temp.w.
// temp.w = whether early depth/stencil write is enabled
system_constants_used_ |= 1ull << kSysConst_Flags_Index;
DxbcOpAnd(temp_w_dest,
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_Flags_Vec)
.Select(kSysConst_Flags_Comp),
DxbcSrc::LU(kSysFlag_ROVDepthStencilEarlyWrite));
// Check if need to write early.
// temp.w = free
DxbcOpIf(true, temp_w_src);
}
// Write the new depth/stencil.
if (uav_index_edram_ == kBindingIndexUnallocated) {
uav_index_edram_ = uav_count_++;
}
DxbcOpStoreUAVTyped(
DxbcDest::U(uav_index_edram_, uint32_t(UAVRegister::kEdram)),
DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kYYYY), 1,
sample_depth_stencil_src);
if (depth_stencil_early) {
// Need to still run the shader to know whether to write the
// depth/stencil value.
DxbcOpElse();
// Set the sample bit in bits 4:7 of system_temp_rov_params_.x if need
// to write later (after checking if the sample is not discarded by a
// kill instruction, alphatest or alpha-to-coverage).
DxbcOpOr(DxbcDest::R(system_temp_rov_params_, 0b0001),
DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX),
DxbcSrc::LU(1 << (4 + i)));
// Close the early depth/stencil check.
DxbcOpEndIf();
}
}
// Close the write check.
DxbcOpEndIf();
}
// Close the write check.
DxbcOpEndIf();
// Release sample_temp.
PopSystemTemp();
@ -1720,7 +1756,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs_AlphaToMask() {
// Convert SSAA sample position to integer to temp.xy (not caring about the
// resolution scale because it's not supported anywhere on the RTV output
// path).
in_position_xy_used_ = true;
in_position_used_ |= 0b0011;
DxbcOpFToU(DxbcDest::R(temp, 0b0011),
DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition)));
@ -1913,6 +1949,139 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() {
PopSystemTemp(2);
}
// Emits the end-of-shader code that makes the pixel shader's depth output
// exactly representable as a 20e4 (float24) value.
//
// Does nothing unless DSV_IsWritingFloat24Depth() is true. The depth source is
// system_temp_depth_stencil_.x when the shader writes depth itself, or the
// saturated Z of the input position otherwise.
//
// Two conversion modes are handled, selected by the shader modification's
// depth_stencil_mode:
// - kFloat24Truncating: low mantissa bits are zeroed (the result is never
//   greater than the input); written to oDepth if the shader writes depth,
//   or to the less-than-or-equal depth output otherwise.
// - Otherwise: the value is converted to 20e4 with round-to-nearest-even via
//   PreClampedDepthTo20e4, then expanded back to float32 and written to
//   oDepth.
void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() {
if (!DSV_IsWritingFloat24Depth()) {
return;
}
uint32_t temp;
if (writes_depth()) {
// The depth is already written to system_temp_depth_stencil_.x and clamped
// to 0...1 with NaNs dropped (saturating in StoreResult); yzw are free.
temp = system_temp_depth_stencil_;
} else {
// Need a temporary variable; copy the sample's depth input to it and
// saturate it. In Direct3D 11, depth is clamped to the viewport bounds
// after the pixel shader, and SV_Position.z contains the unclamped depth,
// which may be outside the viewport's depth range if it's biased. The value
// will be clamped to the viewport bounds anyway, but saturating here makes
// it possible to assume it's within 0...1 while working with the bit
// representation below.
temp = PushSystemTemp();
in_position_used_ |= 0b0100;
DxbcOpMov(
DxbcDest::R(temp, 0b0001),
DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition), DxbcSrc::kZZZZ),
true);
}
DxbcDest temp_x_dest(DxbcDest::R(temp, 0b0001));
DxbcSrc temp_x_src(DxbcSrc::R(temp, DxbcSrc::kXXXX));
DxbcDest temp_y_dest(DxbcDest::R(temp, 0b0010));
DxbcSrc temp_y_src(DxbcSrc::R(temp, DxbcSrc::kYYYY));
if (GetDxbcShaderModification().depth_stencil_mode ==
Modification::DepthStencilMode::kFloat24Truncating) {
// Simplified conversion, always less than or equal to the original value -
// just drop the lower bits.
// The float32 exponent bias is 127.
// After saturating, the exponent range is -127...0.
// The smallest normalized 20e4 exponent is -14 - should drop 3 mantissa
// bits at -14 or above.
// The exponent of the smallest denormalized 20e4 number is -34 - should
// drop 23 mantissa bits at -34.
// Anything smaller than 2^-34 becomes 0.
DxbcDest truncate_dest(writes_depth() ? DxbcDest::ODepth()
: DxbcDest::ODepthLE());
// Check if the number is representable as a float24 after truncation - the
// exponent is at least -34 (0x2E800000 is 2^-34 as float32 bits).
DxbcOpUGE(temp_y_dest, temp_x_src, DxbcSrc::LU(0x2E800000));
DxbcOpIf(true, temp_y_src);
{
// Extract the biased float32 exponent to temp.y.
// temp.y = 113+ at exponent -14+.
// temp.y = 93 at exponent -34.
DxbcOpUBFE(temp_y_dest, DxbcSrc::LU(8), DxbcSrc::LU(23), temp_x_src);
// Convert exponent to the unclamped number of bits to truncate.
// 116 - 113 = 3.
// 116 - 93 = 23.
// temp.y <= 3 at exponent -14 or above (clamped to 3 by the next step).
// temp.y = 23 at exponent -34.
DxbcOpIAdd(temp_y_dest, DxbcSrc::LI(116), -temp_y_src);
// Clamp the truncated bit count to drop 3 bits of any normal number.
// Exponents below -34 are handled separately.
// temp.y = 3 at exponent -14 or above.
// temp.y = 23 at exponent -34.
DxbcOpIMax(temp_y_dest, temp_y_src, DxbcSrc::LI(3));
// Truncate the mantissa - fill the low bits with zeros.
DxbcOpBFI(truncate_dest, temp_y_src, DxbcSrc::LU(0), DxbcSrc::LU(0),
temp_x_src);
}
// The number is not representable as float24 after truncation - zero.
DxbcOpElse();
DxbcOpMov(truncate_dest, DxbcSrc::LF(0.0f));
// Close the non-zero result check.
DxbcOpEndIf();
} else {
// Properly convert to 20e4, with rounding to the nearest even.
PreClampedDepthTo20e4(temp, 0, temp, 0, temp, 1);
// Convert back to float32.
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// Unpack the exponent to temp.y.
DxbcOpUShR(temp_y_dest, temp_x_src, DxbcSrc::LU(20));
// Unpack the mantissa to temp.x.
DxbcOpAnd(temp_x_dest, temp_x_src, DxbcSrc::LU(0xFFFFF));
// Check if the number is denormalized (the 20e4 exponent field is zero).
DxbcOpIf(false, temp_y_src);
{
// Check if the number is non-zero (if the mantissa isn't zero - the
// exponent is known to be zero at this point).
DxbcOpIf(true, temp_x_src);
{
// Normalize the mantissa.
// Note that HLSL firstbithigh(x) is compiled to DXBC like:
// `x ? 31 - firstbit_hi(x) : -1`
// (returns the index from the LSB, not the MSB, but -1 for zero too).
// temp.y = firstbit_hi(mantissa)
DxbcOpFirstBitHi(temp_y_dest, temp_x_src);
// temp.y = 20 - firstbithigh(mantissa)
// Or:
// temp.y = 20 - (31 - firstbit_hi(mantissa))
DxbcOpIAdd(temp_y_dest, temp_y_src, DxbcSrc::LI(20 - 31));
// mantissa = mantissa << (20 - firstbithigh(mantissa))
// AND 0xFFFFF not needed after this - BFI will do it.
DxbcOpIShL(temp_x_dest, temp_x_src, temp_y_src);
// Get the normalized exponent.
// exponent = 1 - (20 - firstbithigh(mantissa))
DxbcOpIAdd(temp_y_dest, DxbcSrc::LI(1), -temp_y_src);
}
// The number is zero.
DxbcOpElse();
{
// Set the unbiased exponent to -112 for zero - 112 will be added later,
// resulting in zero float32.
DxbcOpMov(temp_y_dest, DxbcSrc::LI(-112));
}
// Close the non-zero check.
DxbcOpEndIf();
}
// Close the denormal check.
DxbcOpEndIf();
// Bias the exponent and move it to the correct location in float32 to
// temp.y.
DxbcOpIMAd(temp_y_dest, temp_y_src, DxbcSrc::LI(1 << 23),
DxbcSrc::LI(112 << 23));
// Combine the mantissa and the exponent into the result (the 20 mantissa
// bits are inserted at bit 3 of the float32 bit pattern).
DxbcOpBFI(DxbcDest::ODepth(), DxbcSrc::LU(20), DxbcSrc::LU(3), temp_x_src,
temp_y_src);
}
if (!writes_depth()) {
// Release temp.
PopSystemTemp();
}
}
void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMaskSample(
uint32_t sample_index, float threshold_base, DxbcSrc threshold_offset,
float threshold_offset_scale, uint32_t temp, uint32_t temp_component) {
@ -1957,7 +2126,7 @@ void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMask() {
// floating-point. With resolution scaling, still using host pixels, to
// preserve the idea of dithering.
// temp.x = alpha to coverage offset as float 0.0...3.0.
in_position_xy_used_ = true;
in_position_used_ |= 0b0011;
DxbcOpFToU(DxbcDest::R(temp, 0b0011),
DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition)));
DxbcOpAnd(DxbcDest::R(temp, 0b0010), DxbcSrc::R(temp, DxbcSrc::kYYYY),
@ -2067,7 +2236,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
DxbcOpStoreUAVTyped(
DxbcDest::U(uav_index_edram_, uint32_t(UAVRegister::kEdram)),
DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kYYYY), 1,
DxbcSrc::R(system_temp_rov_depth_stencil_).Select(i));
DxbcSrc::R(system_temp_depth_stencil_).Select(i));
}
// Close the write check.
DxbcOpEndIf();
@ -3059,15 +3228,16 @@ void DxbcShaderTranslator::CompletePixelShader() {
CompletePixelShader_WriteToROV();
} else {
CompletePixelShader_WriteToRTVs();
CompletePixelShader_DSV_DepthTo24Bit();
}
}
void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp,
uint32_t d24_temp_component,
uint32_t d32_temp,
uint32_t d32_temp_component,
uint32_t temp_temp,
uint32_t temp_temp_component) {
void DxbcShaderTranslator::PreClampedDepthTo20e4(uint32_t d24_temp,
uint32_t d24_temp_component,
uint32_t d32_temp,
uint32_t d32_temp_component,
uint32_t temp_temp,
uint32_t temp_temp_component) {
assert_true(temp_temp != d24_temp ||
temp_temp_component != d24_temp_component);
assert_true(temp_temp != d32_temp ||
@ -3079,68 +3249,83 @@ void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp,
DxbcDest temp_dest(DxbcDest::R(temp_temp, 1 << temp_temp_component));
DxbcSrc temp_src(DxbcSrc::R(temp_temp).Select(temp_temp_component));
// CFloat24 from d3dref9.dll.
// Assuming the depth is already clamped to [0, 2) (in all places, the depth
// is written with the saturate flag set).
// Check if the number is too small to be represented as normalized 20e4.
// temp = f32 < 2^-14
DxbcOpULT(temp_dest, d32_src, DxbcSrc::LU(0x38800000));
// Handle denormalized numbers separately.
DxbcOpIf(true, temp_src);
{
// temp = f32 >> 23
DxbcOpUShR(temp_dest, d32_src, DxbcSrc::LU(23));
// temp = 113 - (f32 >> 23)
DxbcOpIAdd(temp_dest, DxbcSrc::LI(113), -temp_src);
// Don't allow the shift to overflow, since in DXBC the lower 5 bits of the
// shift amount are used (otherwise 0 becomes 8).
// temp = min(113 - (f32 >> 23), 24)
DxbcOpUMin(temp_dest, temp_src, DxbcSrc::LU(24));
// biased_f32 = (f32 & 0x7FFFFF) | 0x800000
DxbcOpBFI(d24_dest, DxbcSrc::LU(9), DxbcSrc::LU(23), DxbcSrc::LU(1),
d32_src);
// biased_f32 = ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24)
DxbcOpUShR(d24_dest, d24_src, temp_src);
}
// Not denormalized?
DxbcOpElse();
{
// Bias the exponent.
// biased_f32 = f32 + (-112 << 23)
// (left shift of a negative value is undefined behavior)
DxbcOpIAdd(d24_dest, d32_src, DxbcSrc::LU(0xC8000000u));
}
// Close the denormal check.
DxbcOpEndIf();
// Build the 20e4 number.
// temp = (biased_f32 >> 3) & 1
DxbcOpUBFE(temp_dest, DxbcSrc::LU(1), DxbcSrc::LU(3), d24_src);
// f24 = biased_f32 + 3
DxbcOpIAdd(d24_dest, d24_src, DxbcSrc::LU(3));
// f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
DxbcOpIAdd(d24_dest, d24_src, temp_src);
// f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
DxbcOpUBFE(d24_dest, DxbcSrc::LU(24), DxbcSrc::LU(3), d24_src);
}
void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp,
uint32_t d24_temp_component,
uint32_t d32_temp,
uint32_t d32_temp_component,
uint32_t temp_temp,
uint32_t temp_temp_component) {
assert_true(temp_temp != d32_temp ||
temp_temp_component != d32_temp_component);
// Source and destination may be the same.
system_constants_used_ |= 1ull << kSysConst_Flags_Index;
DxbcOpAnd(temp_dest,
DxbcOpAnd(DxbcDest::R(temp_temp, 1 << temp_temp_component),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_Flags_Vec)
.Select(kSysConst_Flags_Comp),
DxbcSrc::LU(kSysFlag_ROVDepthFloat24));
// Convert according to the format.
DxbcOpIf(true, temp_src);
DxbcOpIf(true, DxbcSrc::R(temp_temp).Select(temp_temp_component));
{
// 20e4 conversion, using 1 VGPR.
// CFloat24 from d3dref9.dll.
// Assuming the depth is already clamped to [0, 2) (in all places, the depth
// is written with the saturate flag set).
// Check if the number is too small to be represented as normalized 20e4.
// temp = f32 < 2^-14
DxbcOpULT(temp_dest, d32_src, DxbcSrc::LU(0x38800000));
// Handle denormalized numbers separately.
DxbcOpIf(true, temp_src);
{
// temp = f32 >> 23
DxbcOpUShR(temp_dest, d32_src, DxbcSrc::LU(23));
// temp = 113 - (f32 >> 23)
DxbcOpIAdd(temp_dest, DxbcSrc::LI(113), -temp_src);
// Don't allow the shift to overflow, since in DXBC the lower 5 bits of
// the shift amount are used (otherwise 0 becomes 8).
// temp = min(113 - (f32 >> 23), 24)
DxbcOpUMin(temp_dest, temp_src, DxbcSrc::LU(24));
// biased_f32 = (f32 & 0x7FFFFF) | 0x800000
DxbcOpBFI(d24_dest, DxbcSrc::LU(9), DxbcSrc::LU(23), DxbcSrc::LU(1),
d32_src);
// biased_f32 =
// ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24)
DxbcOpUShR(d24_dest, d24_src, temp_src);
}
// Not denormalized?
DxbcOpElse();
{
// Bias the exponent.
// biased_f32 = f32 + (-112 << 23)
// (left shift of a negative value is undefined behavior)
DxbcOpIAdd(d24_dest, d32_src, DxbcSrc::LU(0xC8000000u));
}
// Close the denormal check.
DxbcOpEndIf();
// Build the 20e4 number.
// temp = (biased_f32 >> 3) & 1
DxbcOpUBFE(temp_dest, DxbcSrc::LU(1), DxbcSrc::LU(3), d24_src);
// f24 = biased_f32 + 3
DxbcOpIAdd(d24_dest, d24_src, DxbcSrc::LU(3));
// f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
DxbcOpIAdd(d24_dest, d24_src, temp_src);
// f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
DxbcOpUBFE(d24_dest, DxbcSrc::LU(24), DxbcSrc::LU(3), d24_src);
// 20e4 conversion.
PreClampedDepthTo20e4(d24_temp, d24_temp_component, d32_temp,
d32_temp_component, temp_temp, temp_temp_component);
}
DxbcOpElse();
{
// Unorm24 conversion.
DxbcDest d24_dest(DxbcDest::R(d24_temp, 1 << d24_temp_component));
DxbcSrc d24_src(DxbcSrc::R(d24_temp).Select(d24_temp_component));
// Multiply by float(0xFFFFFF).
DxbcOpMul(d24_dest, d32_src, DxbcSrc::LF(16777215.0f));
DxbcOpMul(d24_dest, DxbcSrc::R(d32_temp).Select(d32_temp_component),
DxbcSrc::LF(16777215.0f));
// Round to the nearest even integer. This seems to be the correct way:
// rounding towards zero gives 0xFF instead of 0x100 in clear shaders in,
// for instance, Halo 3, but other clear shaders in it are also broken if

View File

@ -40,9 +40,63 @@ DEFINE_bool(
"be fully covered when MSAA is used with fullscreen passes.",
"GPU");
DEFINE_string(
depth_float24_conversion, "",
"Method for converting 32-bit Z values to 20e4 floating point when using "
"host depth buffers without native 20e4 support (when not using rasterizer-"
"ordered views / fragment shader interlocks to perform depth testing "
"manually).\n"
"Use: [any, on_copy, truncate, round]\n"
" on_copy:\n"
" Do depth testing at host precision, converting when copying between "
"host depth buffers and the EDRAM buffer to support reinterpretation, "
"maintaining two copies, in both host and 20e4 formats, for reloading data "
"to host depth buffers when it wasn't overwritten.\n"
" + Highest performance, allows early depth test and writing.\n"
" + Host MSAA is possible with pixel-rate shading where supported.\n"
" - EDRAM > RAM > EDRAM depth buffer round trip done in certain games "
"(such as GTA IV) destroys precision irreparably, causing artifacts if "
"another rendering pass is done after the EDRAM reupload.\n"
" truncate:\n"
" Convert to 20e4 directly in pixel shaders, always rounding down.\n"
" + Good performance, conservative early depth test is possible.\n"
" + No precision loss when anything changes in the storage of the depth "
"buffer, EDRAM > RAM > EDRAM copying preserves precision.\n"
" - Rounding mode is incorrect, sometimes giving results smaller than "
"they should be - may cause inaccuracy especially in edge cases when the "
"game wants to write an exact value.\n"
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
" round:\n"
" Convert to 20e4 directly in pixel shaders, correctly rounding to the "
"nearest even.\n"
" + Highest accuracy.\n"
" - Significantly limited performance, early depth test is not possible.\n"
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
" Any other value:\n"
" Choose what is considered the most optimal (currently \"on_copy\").",
"GPU");
DEFINE_int32(query_occlusion_fake_sample_count, 1000,
"If set to -1 no sample counts are written, games may hang. Else, "
"the sample count of every tile will be incremented on every "
"EVENT_WRITE_ZPD by this number. Setting this to 0 means "
"everything is reported as occluded.",
"GPU");
namespace xe {
namespace gpu {
namespace flags {
// Maps the depth_float24_conversion cvar string to the corresponding
// DepthFloat24Conversion value. Only "truncate" and "round" are recognized;
// every other value (including the empty default) selects kOnCopy.
DepthFloat24Conversion GetDepthFloat24Conversion() {
  const auto& requested_mode = cvars::depth_float24_conversion;
  if (requested_mode == "round") {
    return DepthFloat24Conversion::kOnOutputRounding;
  }
  return requested_mode == "truncate"
             ? DepthFloat24Conversion::kOnOutputTruncating
             : DepthFloat24Conversion::kOnCopy;
}
} // namespace flags
} // namespace gpu
} // namespace xe

View File

@ -22,6 +22,69 @@ DECLARE_bool(gpu_allow_invalid_fetch_constants);
DECLARE_bool(half_pixel_offset);
DECLARE_string(depth_float24_conversion);
DECLARE_int32(query_occlusion_fake_sample_count);
namespace xe {
namespace gpu {
namespace flags {
// Strategy for producing 20e4 (float24) depth values when the depth is stored
// in host depth buffers. Returned by GetDepthFloat24Conversion().
enum class DepthFloat24Conversion {
// Doing depth test at the host precision, converting to 20e4 to support
// reinterpretation, but keeping a separate EDRAM view containing depth values
// in the host format. When copying from the EDRAM buffer to host depth
// buffers, writing the stored host pixel if stored_f24 == to_f24(stored_host)
// (otherwise it was overwritten by something else, like clearing, or a color
// buffer; this is inexact though, and will incorrectly load pixels that were
// overwritten by something else in the EDRAM, but turned out to have the same
// value on the guest as before - an outdated host-precision value will be
// loaded in these cases instead).
//
// EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM round
// trip destroys precision beyond repair.
//
// Full host early Z and MSAA with pixel-rate shading are supported.
kOnCopy,
// Converting the depth to the closest host value representable exactly as a
// 20e4 float in pixel shaders, to support invariance in cases when the guest
// reuploads a previously resolved depth buffer to the EDRAM, rounding towards
// zero (which contradicts the rounding used by the Direct3D 9 reference
// rasterizer, but allows less-than-or-equal pixel shader depth output to be
// used to preserve most of early Z culling when the game is using reversed
// depth, which is the usual way of doing depth testing on the Xbox 360 and of
// utilizing the advantages of a floating-point encoding).
//
// With MSAA, pixel shaders must run at sample frequency - otherwise, if the
// depth is the same for the entire pixel, intersections of polygons cannot be
// antialiased.
//
// Important usage note: When using this mode, bounds of the fixed-function
// viewport must be converted to and back from float24 too (preferably using
// correct rounding to the nearest even, to reduce the error already caused by
// truncation rather than to amplify it). This ensures that clamping to the
// viewport bounds, which happens after the pixel shader even if it overwrites
// the resulting depth, is never done to a value not representable as float24
// (for example, if the minimum Z is a number too small to be represented as
// float24, but not zero, it won't be possible to write what should become
// 0x000000 to the depth buffer). Note that this may add some error to the
// depth values from the rasterizer; however, modifying Z in the vertex shader
// to make interpolated depth values would cause clipping to be done to
// different bounds, which may be more undesirable, especially in cases when Z
// is explicitly set to a value like 0 or W (in such cases, the adjusted
// polygon may go outside 0...W in clip space and disappear).
kOnOutputTruncating,
// Similar to kOnOutputTruncating, but rounding to the nearest even, more
// correctly, however, because the resulting depth can be bigger than the
// original host value, early depth testing can't be used at all. Same
// viewport usage rules apply.
kOnOutputRounding,
};
DepthFloat24Conversion GetDepthFloat24Conversion();
} // namespace flags
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_GPU_FLAGS_H_

View File

@ -277,8 +277,7 @@ void GraphicsSystem::ClearCaches() {
}
void GraphicsSystem::InitializeShaderStorage(
const std::filesystem::path& storage_root, uint32_t title_id,
bool blocking) {
const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) {
if (!cvars::store_shaders) {
return;
}
@ -286,21 +285,18 @@ void GraphicsSystem::InitializeShaderStorage(
if (command_processor_->is_paused()) {
// Safe to run on any thread while the command processor is paused, no
// race condition.
command_processor_->InitializeShaderStorage(storage_root, title_id, true);
command_processor_->InitializeShaderStorage(cache_root, title_id, true);
} else {
xe::threading::Fence fence;
command_processor_->CallInThread(
[this, storage_root, title_id, &fence]() {
command_processor_->InitializeShaderStorage(storage_root, title_id,
true);
fence.Signal();
});
command_processor_->CallInThread([this, cache_root, title_id, &fence]() {
command_processor_->InitializeShaderStorage(cache_root, title_id, true);
fence.Signal();
});
fence.Wait();
}
} else {
command_processor_->CallInThread([this, storage_root, title_id]() {
command_processor_->InitializeShaderStorage(storage_root, title_id,
false);
command_processor_->CallInThread([this, cache_root, title_id]() {
command_processor_->InitializeShaderStorage(cache_root, title_id, false);
});
}
}

View File

@ -63,7 +63,7 @@ class GraphicsSystem {
virtual void ClearCaches();
void InitializeShaderStorage(const std::filesystem::path& storage_root,
void InitializeShaderStorage(const std::filesystem::path& cache_root,
uint32_t title_id, bool blocking);
void RequestFrameTrace();

View File

@ -254,15 +254,15 @@ union PA_SU_SC_MODE_CNTL {
uint32_t msaa_enable : 1; // +15
uint32_t vtx_window_offset_enable : 1; // +16
// LINE_STIPPLE_ENABLE was added on Adreno.
uint32_t : 2; // +17
uint32_t provoking_vtx_last : 1; // +19
uint32_t persp_corr_dis : 1; // +20
uint32_t multi_prim_ib_ena : 1; // +21
uint32_t : 1; // +22
uint32_t quad_order_enable : 1; // +23
uint32_t : 2; // +17
uint32_t provoking_vtx_last : 1; // +19
uint32_t persp_corr_dis : 1; // +20
uint32_t multi_prim_ib_ena : 1; // +21
uint32_t : 1; // +22
uint32_t quad_order_enable : 1; // +23
uint32_t sc_one_quad_per_clock : 1; // +24
// WAIT_RB_IDLE_ALL_TRI and WAIT_RB_IDLE_FIRST_TRI_NEW_STATE were added on
// Adreno.
// TODO(Triang3l): Find SC_ONE_QUAD_PER_CLOCK offset.
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SU_SC_MODE_CNTL;
@ -298,7 +298,7 @@ union PA_SC_VIZ_QUERY {
// discard geometry after test (but use for testing)
uint32_t kill_pix_post_hi_z : 1; // +7
// not used with d3d
uint32_t kill_pix_detail_mask : 1; // +8
uint32_t kill_pix_post_detail_mask : 1; // +8
};
uint32_t value;
static constexpr Register register_index = XE_GPU_REG_PA_SC_VIZ_QUERY;

View File

@ -12,7 +12,7 @@
#include <cstring>
#include <memory>
#include "third_party/xxhash/xxhash.h"
#include "xenia/base/xxhash.h"
namespace xe {
namespace gpu {
@ -51,7 +51,7 @@ bool SamplerInfo::Prepare(const xenos::xe_gpu_texture_fetch_t& fetch,
}
uint64_t SamplerInfo::hash() const {
return XXH64(this, sizeof(SamplerInfo), 0);
return XXH3_64bits(this, sizeof(SamplerInfo));
}
} // namespace gpu

View File

@ -31,9 +31,13 @@ Shader::Shader(xenos::ShaderType shader_type, uint64_t ucode_data_hash,
xe::copy_and_swap(ucode_data_.data(), ucode_dwords, ucode_dword_count);
}
Shader::~Shader() = default;
// Frees every translation object still registered in translations_.
Shader::~Shader() {
  for (auto entry = translations_.begin(); entry != translations_.end();
       ++entry) {
    delete entry->second;
  }
}
std::string Shader::GetTranslatedBinaryString() const {
std::string Shader::Translation::GetTranslatedBinaryString() const {
std::string result;
result.resize(translated_binary_.size());
std::memcpy(const_cast<char*>(result.data()), translated_binary_.data(),
@ -41,36 +45,24 @@ std::string Shader::GetTranslatedBinaryString() const {
return result;
}
std::pair<std::filesystem::path, std::filesystem::path> Shader::Dump(
std::filesystem::path Shader::Translation::Dump(
const std::filesystem::path& base_path, const char* path_prefix) {
std::filesystem::path path = base_path;
// Ensure target path exists.
auto target_path = base_path;
if (!target_path.empty()) {
target_path = std::filesystem::absolute(target_path);
std::filesystem::create_directories(target_path);
if (!path.empty()) {
path = std::filesystem::absolute(path);
std::filesystem::create_directories(path);
}
auto base_name =
fmt::format("shader_{}_{:016X}", path_prefix, ucode_data_hash_);
std::string txt_name, bin_name;
if (shader_type_ == xenos::ShaderType::kVertex) {
txt_name = base_name + ".vert";
bin_name = base_name + ".bin.vert";
} else {
txt_name = base_name + ".frag";
bin_name = base_name + ".bin.frag";
}
std::filesystem::path txt_path, bin_path;
txt_path = base_path / txt_name;
bin_path = base_path / bin_name;
FILE* f = filesystem::OpenFile(txt_path, "wb");
path = path /
fmt::format(
"shader_{:016X}_{:08X}.{}.{}", shader().ucode_data_hash(),
modification(), path_prefix,
shader().type() == xenos::ShaderType::kVertex ? "vert" : "frag");
FILE* f = filesystem::OpenFile(path, "wb");
if (f) {
fwrite(translated_binary_.data(), 1, translated_binary_.size(), f);
fprintf(f, "\n\n");
auto ucode_disasm_ptr = ucode_disassembly().c_str();
auto ucode_disasm_ptr = shader().ucode_disassembly().c_str();
while (*ucode_disasm_ptr) {
auto line_end = std::strchr(ucode_disasm_ptr, '\n');
fprintf(f, "// ");
@ -83,14 +75,58 @@ std::pair<std::filesystem::path, std::filesystem::path> Shader::Dump(
}
fclose(f);
}
return std::move(path);
}
f = filesystem::OpenFile(bin_path, "wb");
// Returns the translation registered for the given modification bits,
// creating it through CreateTranslationInstance and registering it in
// translations_ when there is none yet. If is_new is not null, it receives
// true when the translation was created by this call and false when an
// existing one was returned.
Shader::Translation* Shader::GetOrCreateTranslation(uint32_t modification,
                                                    bool* is_new) {
  Translation* translation = nullptr;
  auto found = translations_.find(modification);
  const bool created = found == translations_.end();
  if (created) {
    translation = CreateTranslationInstance(modification);
    translations_.emplace(modification, translation);
  } else {
    translation = found->second;
  }
  if (is_new != nullptr) {
    *is_new = created;
  }
  return translation;
}
// Deletes the translation registered for the given modification bits and
// removes its entry from translations_. Does nothing if no translation exists
// for that modification.
void Shader::DestroyTranslation(uint32_t modification) {
  auto found = translations_.find(modification);
  if (found != translations_.end()) {
    delete found->second;
    translations_.erase(found);
  }
}
// Writes the shader's microcode dwords (ucode_data(), 4 bytes per element) to
// the file shader_<ucode hash>.ucode.bin.<vert|frag> under base_path.
//
// base_path: directory to write into. When not empty, it is made absolute and
//   created (with parents) before the file is opened.
// Returns the path of the target file. The path is returned even if the file
// could not be opened, in which case nothing is written.
std::filesystem::path Shader::DumpUcodeBinary(
    const std::filesystem::path& base_path) {
  // Ensure target path exists.
  std::filesystem::path path = base_path;
  if (!path.empty()) {
    path = std::filesystem::absolute(path);
    std::filesystem::create_directories(path);
  }
  path = path /
         fmt::format("shader_{:016X}.ucode.bin.{}", ucode_data_hash(),
                     type() == xenos::ShaderType::kVertex ? "vert" : "frag");
  FILE* f = filesystem::OpenFile(path, "wb");
  if (f) {
    // The microcode must be written exactly once - writing it both through
    // the member and through the accessor would double the file contents.
    fwrite(ucode_data().data(), 4, ucode_data().size(), f);
    fclose(f);
  }
  // Returning the local by name lets the compiler elide the copy; wrapping it
  // in std::move would prevent that.
  return path;
}
return {std::move(txt_path), std::move(bin_path)};
// Allocates the Translation object for the given modification bits. This base
// version creates a plain Translation, which is enough for simple cases like
// ucode disassembly. The caller receives a raw owning pointer.
Shader::Translation* Shader::CreateTranslationInstance(uint32_t modification) {
  Translation* instance = new Translation(*this, modification);
  return instance;
}
} // namespace gpu

View File

@ -11,8 +11,12 @@
#define XENIA_GPU_SHADER_H_
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <filesystem>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "xenia/base/math.h"
@ -591,6 +595,8 @@ struct ParsedAluInstruction {
class Shader {
public:
// Type of the vertex shader in a D3D11-like rendering pipeline - shader
// interface depends on it, so it must be known at translation time.
// If values are changed, INVALIDATE SHADER STORAGES (increase their version
// constexpr) where those are stored! And check bit count where this is
// packed. This is : uint32_t for simplicity of packing in bit fields.
@ -603,6 +609,8 @@ class Shader {
kQuadDomainCPIndexed,
kQuadDomainPatchIndexed,
};
// For packing HostVertexShaderType in bit fields.
static constexpr uint32_t kHostVertexShaderTypeBitCount = 3;
struct Error {
bool is_fatal = false;
@ -683,6 +691,70 @@ class Shader {
}
};
// Result of translating a Shader with one specific set of translator-specific
// modification bits. Holds the translated binary, the errors produced while
// translating, and optional host disassembly. Instances are constructed by
// Shader (or subclasses) and filled in by ShaderTranslator, both of which are
// friends of this class.
class Translation {
 public:
  virtual ~Translation() = default;

  // The shader this translation belongs to.
  Shader& shader() const { return shader_; }

  // Translator-specific modification bits this translation was created with.
  uint32_t modification() const { return modification_; }

  // True if the shader was translated and prepared without error.
  bool is_valid() const { return is_valid_; }
  // True if the shader has already been translated.
  bool is_translated() const { return is_translated_; }
  // Errors that occurred during translation.
  const std::vector<Error>& errors() const { return errors_; }

  // Translated shader binary (or text).
  const std::vector<uint8_t>& translated_binary() const {
    return translated_binary_;
  }
  // Returns a copy of the translated shader binary as a string.
  // Only meaningful if the translated binary is actually text.
  std::string GetTranslatedBinaryString() const;

  // Disassembly of the translated shader from the host graphics layer.
  // May be empty if the host does not support disassembly.
  const std::string& host_disassembly() const { return host_disassembly_; }
  // Allows the disassembly to be set externally, for cases where it depends
  // on the GPU backend.
  void set_host_disassembly(std::string disassembly) {
    host_disassembly_ = std::move(disassembly);
  }

  // For dumping after translation. Dumps the shader's disassembled microcode,
  // translated code, and, if available, translated disassembly, to a file in
  // the given path based on ucode hash. Returns the path of the written file.
  std::filesystem::path Dump(const std::filesystem::path& base_path,
                             const char* path_prefix);

 protected:
  Translation(Shader& shader, uint32_t modification)
      : shader_(shader), modification_(modification) {}

  // For implementations to call if preparation failed on their side.
  void MakeInvalid() { is_valid_ = false; }

 private:
  friend class Shader;
  friend class ShaderTranslator;

  Shader& shader_;
  uint32_t modification_;

  bool is_valid_ = false;
  bool is_translated_ = false;
  std::vector<Error> errors_;
  std::vector<uint8_t> translated_binary_;
  std::string host_disassembly_;
};
Shader(xenos::ShaderType shader_type, uint64_t ucode_data_hash,
const uint32_t* ucode_dwords, size_t ucode_dword_count);
virtual ~Shader();
@ -690,19 +762,30 @@ class Shader {
// Whether the shader is identified as a vertex or pixel shader.
xenos::ShaderType type() const { return shader_type_; }
// If this is a vertex shader, and it has been translated, type of the shader
// in a D3D11-like rendering pipeline - shader interface depends on in, so it
// must be known at translation time.
HostVertexShaderType host_vertex_shader_type() const {
return host_vertex_shader_type_;
}
// Microcode dwords in host endianness.
const std::vector<uint32_t>& ucode_data() const { return ucode_data_; }
uint64_t ucode_data_hash() const { return ucode_data_hash_; }
const uint32_t* ucode_dwords() const { return ucode_data_.data(); }
size_t ucode_dword_count() const { return ucode_data_.size(); }
// Host translations with the specified modification bits. Not thread-safe
// with respect to translation creation/destruction.
const std::unordered_map<uint32_t, Translation*>& translations() const {
return translations_;
}
// Looks up the translation for the given modification bits without creating
// one; returns nullptr if none has been created.
Translation* GetTranslation(uint32_t modification) const {
  auto found = translations_.find(modification);
  return found != translations_.cend() ? found->second : nullptr;
}
Translation* GetOrCreateTranslation(uint32_t modification,
bool* is_new = nullptr);
// For shader storage loading, to remove a modification in case of translation
// failure. Not thread-safe.
void DestroyTranslation(uint32_t modification);
// All vertex bindings used in the shader.
// Valid for vertex shaders only.
const std::vector<VertexBinding>& vertex_bindings() const {
@ -733,73 +816,55 @@ class Shader {
// True if the shader overrides the pixel depth.
bool writes_depth() const { return writes_depth_; }
// True if Xenia can automatically enable early depth/stencil for the pixel
// shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha
// testing and alpha to coverage are disabled.
bool implicit_early_z_allowed() const { return implicit_early_z_allowed_; }
// True if the shader was translated and prepared without error.
bool is_valid() const { return is_valid_; }
// True if the shader has already been translated.
bool is_translated() const { return is_translated_; }
// Errors that occurred during translation.
const std::vector<Error>& errors() const { return errors_; }
// True if the current shader has any `kill` instructions.
bool kills_pixels() const { return kills_pixels_; }
// Microcode disassembly in D3D format.
const std::string& ucode_disassembly() const { return ucode_disassembly_; }
// Translated shader binary (or text).
const std::vector<uint8_t>& translated_binary() const {
return translated_binary_;
// An externally managed identifier of the shader storage the microcode of the
// shader was last written to, or was loaded from, to only write the shader
// microcode to the storage once. UINT32_MAX by default.
uint32_t ucode_storage_index() const { return ucode_storage_index_; }
void set_ucode_storage_index(uint32_t storage_index) {
ucode_storage_index_ = storage_index;
}
// Gets the translated shader binary as a string.
// This is only valid if it is actually text.
std::string GetTranslatedBinaryString() const;
// Disassembly of the translated from the host graphics layer.
// May be empty if the host does not support disassembly.
const std::string& host_disassembly() const { return host_disassembly_; }
// A lot of errors that occurred during preparation of the host shader.
const std::string& host_error_log() const { return host_error_log_; }
// Host binary that can be saved and reused across runs.
// May be empty if the host does not support saving binaries.
const std::vector<uint8_t>& host_binary() const { return host_binary_; }
// Dumps the shader to a file in the given path based on ucode hash.
// Both the ucode binary and disassembled and translated shader will be
// written.
// Returns the filename of the shader and the binary.
std::pair<std::filesystem::path, std::filesystem::path> Dump(
const std::filesystem::path& base_path, const char* path_prefix);
// Dumps the shader's microcode binary to a file in the given path based on
// ucode hash. Returns the name of the written file. Can be called at any
// time, doesn't require the shader to be translated.
std::filesystem::path DumpUcodeBinary(const std::filesystem::path& base_path);
protected:
friend class ShaderTranslator;
virtual Translation* CreateTranslationInstance(uint32_t modification);
xenos::ShaderType shader_type_;
HostVertexShaderType host_vertex_shader_type_ = HostVertexShaderType::kVertex;
std::vector<uint32_t> ucode_data_;
uint64_t ucode_data_hash_;
// Modification bits -> translation.
std::unordered_map<uint32_t, Translation*> translations_;
// Whether setup of the post-translation parameters (listed below, plus those
// specific to the implementation) has been initiated, by any thread. If
// translation is performed on multiple threads, only one thread must be
// setting this up (other threads would write the same data anyway).
std::atomic_flag post_translation_info_set_up_ = ATOMIC_FLAG_INIT;
// Initialized after the first successful translation (these don't depend on
// the host-side modification bits).
std::string ucode_disassembly_;
std::vector<VertexBinding> vertex_bindings_;
std::vector<TextureBinding> texture_bindings_;
ConstantRegisterMap constant_register_map_ = {0};
bool writes_color_targets_[4] = {false, false, false, false};
bool writes_depth_ = false;
bool implicit_early_z_allowed_ = true;
bool kills_pixels_ = false;
std::vector<uint32_t> memexport_stream_constants_;
bool is_valid_ = false;
bool is_translated_ = false;
std::vector<Error> errors_;
std::string ucode_disassembly_;
std::vector<uint8_t> translated_binary_;
std::string host_disassembly_;
std::string host_error_log_;
std::vector<uint8_t> host_binary_;
uint32_t ucode_storage_index_ = UINT32_MAX;
};
} // namespace gpu

View File

@ -144,11 +144,15 @@ int shader_compiler_main(const std::vector<std::string>& args) {
Shader::HostVertexShaderType::kQuadDomainPatchIndexed;
}
}
uint32_t modification =
translator->GetDefaultModification(shader_type, host_vertex_shader_type);
translator->Translate(shader.get(), host_vertex_shader_type);
Shader::Translation* translation =
shader->GetOrCreateTranslation(modification);
translator->Translate(*translation);
const void* source_data = shader->translated_binary().data();
size_t source_data_size = shader->translated_binary().size();
const void* source_data = translation->translated_binary().data();
size_t source_data_size = translation->translated_binary().size();
std::string spirv_disasm;
if (cvars::shader_output_type == "spirvtext") {

View File

@ -1,4 +1,3 @@
#include "shader_translator.h"
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
@ -14,6 +13,7 @@
#include <set>
#include <string>
#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
@ -46,7 +46,9 @@ ShaderTranslator::ShaderTranslator() = default;
ShaderTranslator::~ShaderTranslator() = default;
void ShaderTranslator::Reset() {
void ShaderTranslator::Reset(xenos::ShaderType shader_type) {
shader_type_ = shader_type;
modification_ = GetDefaultModification(shader_type);
errors_.clear();
ucode_disasm_buffer_.Reset();
ucode_disasm_line_number_ = 0;
@ -64,37 +66,37 @@ void ShaderTranslator::Reset() {
writes_color_targets_[i] = false;
}
writes_depth_ = false;
implicit_early_z_allowed_ = true;
kills_pixels_ = false;
memexport_alloc_count_ = 0;
memexport_eA_written_ = 0;
std::memset(&memexport_eM_written_, 0, sizeof(memexport_eM_written_));
memexport_stream_constants_.clear();
}
bool ShaderTranslator::Translate(
Shader* shader, reg::SQ_PROGRAM_CNTL cntl,
Shader::HostVertexShaderType host_vertex_shader_type) {
Reset();
uint32_t cntl_num_reg = shader->type() == xenos::ShaderType::kVertex
bool ShaderTranslator::Translate(Shader::Translation& translation,
reg::SQ_PROGRAM_CNTL cntl) {
xenos::ShaderType shader_type = translation.shader().type();
Reset(shader_type);
uint32_t cntl_num_reg = shader_type == xenos::ShaderType::kVertex
? cntl.vs_num_reg
: cntl.ps_num_reg;
register_count_ = (cntl_num_reg & 0x80) ? 0 : (cntl_num_reg + 1);
return TranslateInternal(shader, host_vertex_shader_type);
return TranslateInternal(translation);
}
bool ShaderTranslator::Translate(
Shader* shader, Shader::HostVertexShaderType host_vertex_shader_type) {
Reset();
return TranslateInternal(shader, host_vertex_shader_type);
bool ShaderTranslator::Translate(Shader::Translation& translation) {
Reset(translation.shader().type());
return TranslateInternal(translation);
}
bool ShaderTranslator::TranslateInternal(
Shader* shader, Shader::HostVertexShaderType host_vertex_shader_type) {
shader_type_ = shader->type();
host_vertex_shader_type_ = host_vertex_shader_type;
ucode_dwords_ = shader->ucode_dwords();
ucode_dword_count_ = shader->ucode_dword_count();
bool ShaderTranslator::TranslateInternal(Shader::Translation& translation) {
Shader& shader = translation.shader();
assert_true(shader_type_ == shader.type());
shader_type_ = shader.type();
ucode_dwords_ = shader.ucode_dwords();
ucode_dword_count_ = shader.ucode_dword_count();
modification_ = translation.modification();
// Control flow instructions come paired in blocks of 3 dwords and all are
// listed at the top of the ucode.
@ -147,12 +149,6 @@ bool ShaderTranslator::TranslateInternal(
if (memexport_eA_written_ == 0) {
memexport_stream_constants_.clear();
}
if (!memexport_stream_constants_.empty()) {
// TODO(Triang3l): Investigate what happens to memexport when the pixel
// fails the depth/stencil test, but in Direct3D 11 UAV writes disable early
// depth/stencil.
implicit_early_z_allowed_ = false;
}
StartTranslation();
@ -187,35 +183,44 @@ bool ShaderTranslator::TranslateInternal(
++cf_index;
}
shader->errors_ = std::move(errors_);
shader->translated_binary_ = CompleteTranslation();
shader->ucode_disassembly_ = ucode_disasm_buffer_.to_string();
shader->host_vertex_shader_type_ = host_vertex_shader_type_;
shader->vertex_bindings_ = std::move(vertex_bindings_);
shader->texture_bindings_ = std::move(texture_bindings_);
shader->constant_register_map_ = std::move(constant_register_map_);
for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
shader->writes_color_targets_[i] = writes_color_targets_[i];
}
shader->writes_depth_ = writes_depth_;
shader->implicit_early_z_allowed_ = implicit_early_z_allowed_;
shader->memexport_stream_constants_.clear();
for (uint32_t memexport_stream_constant : memexport_stream_constants_) {
shader->memexport_stream_constants_.push_back(memexport_stream_constant);
}
translation.errors_ = std::move(errors_);
translation.translated_binary_ = CompleteTranslation();
translation.is_translated_ = true;
shader->is_valid_ = true;
shader->is_translated_ = true;
for (const auto& error : shader->errors_) {
bool is_valid = true;
for (const auto& error : translation.errors_) {
if (error.is_fatal) {
shader->is_valid_ = false;
is_valid = false;
break;
}
}
translation.is_valid_ = is_valid;
PostTranslation(shader);
// Setup info that doesn't depend on the modification only once.
bool setup_shader_post_translation_info =
is_valid && !shader.post_translation_info_set_up_.test_and_set();
if (setup_shader_post_translation_info) {
shader.ucode_disassembly_ = ucode_disasm_buffer_.to_string();
shader.vertex_bindings_ = std::move(vertex_bindings_);
shader.texture_bindings_ = std::move(texture_bindings_);
shader.constant_register_map_ = std::move(constant_register_map_);
for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) {
shader.writes_color_targets_[i] = writes_color_targets_[i];
}
shader.writes_depth_ = writes_depth_;
shader.kills_pixels_ = kills_pixels_;
shader.memexport_stream_constants_.clear();
shader.memexport_stream_constants_.reserve(
memexport_stream_constants_.size());
shader.memexport_stream_constants_.insert(
shader.memexport_stream_constants_.cend(),
memexport_stream_constants_.cbegin(),
memexport_stream_constants_.cend());
}
PostTranslation(translation, setup_shader_post_translation_info);
return shader->is_valid_;
// In case is_valid_ is modified by PostTranslation, reload.
return translation.is_valid_;
}
void ShaderTranslator::MarkUcodeInstruction(uint32_t dword_offset) {
@ -338,14 +343,9 @@ void ShaderTranslator::GatherInstructionInformation(
ParsedAluInstruction instr;
ParseAluInstruction(op, instr);
const auto& vector_opcode_info =
alu_vector_opcode_infos_[uint32_t(op.vector_opcode())];
implicit_early_z_allowed_ &=
!vector_opcode_info.disable_implicit_early_z;
const auto& scalar_opcode_info =
alu_scalar_opcode_infos_[uint32_t(op.scalar_opcode())];
implicit_early_z_allowed_ &=
!scalar_opcode_info.disable_implicit_early_z;
kills_pixels_ = kills_pixels_ ||
ucode::AluVectorOpcodeIsKill(op.vector_opcode()) ||
ucode::AluScalarOpcodeIsKill(op.scalar_opcode());
if (instr.vector_and_constant_result.storage_target !=
InstructionStorageTarget::kRegister ||
@ -403,7 +403,6 @@ void ShaderTranslator::GatherInstructionInformation(
break;
case InstructionStorageTarget::kDepth:
writes_depth_ = true;
implicit_early_z_allowed_ = false;
break;
default:
break;
@ -1077,91 +1076,91 @@ uint32_t ParsedTextureFetchInstruction::GetNonZeroResultComponents() const {
const ShaderTranslator::AluOpcodeInfo
ShaderTranslator::alu_vector_opcode_infos_[0x20] = {
{"add", 2, 4, false}, // 0
{"mul", 2, 4, false}, // 1
{"max", 2, 4, false}, // 2
{"min", 2, 4, false}, // 3
{"seq", 2, 4, false}, // 4
{"sgt", 2, 4, false}, // 5
{"sge", 2, 4, false}, // 6
{"sne", 2, 4, false}, // 7
{"frc", 1, 4, false}, // 8
{"trunc", 1, 4, false}, // 9
{"floor", 1, 4, false}, // 10
{"mad", 3, 4, false}, // 11
{"cndeq", 3, 4, false}, // 12
{"cndge", 3, 4, false}, // 13
{"cndgt", 3, 4, false}, // 14
{"dp4", 2, 4, false}, // 15
{"dp3", 2, 4, false}, // 16
{"dp2add", 3, 4, false}, // 17
{"cube", 2, 4, false}, // 18
{"max4", 1, 4, false}, // 19
{"setp_eq_push", 2, 4, false}, // 20
{"setp_ne_push", 2, 4, false}, // 21
{"setp_gt_push", 2, 4, false}, // 22
{"setp_ge_push", 2, 4, false}, // 23
{"kill_eq", 2, 4, true}, // 24
{"kill_gt", 2, 4, true}, // 25
{"kill_ge", 2, 4, true}, // 26
{"kill_ne", 2, 4, true}, // 27
{"dst", 2, 4, false}, // 28
{"maxa", 2, 4, false}, // 29
{"add", 2, 4}, // 0
{"mul", 2, 4}, // 1
{"max", 2, 4}, // 2
{"min", 2, 4}, // 3
{"seq", 2, 4}, // 4
{"sgt", 2, 4}, // 5
{"sge", 2, 4}, // 6
{"sne", 2, 4}, // 7
{"frc", 1, 4}, // 8
{"trunc", 1, 4}, // 9
{"floor", 1, 4}, // 10
{"mad", 3, 4}, // 11
{"cndeq", 3, 4}, // 12
{"cndge", 3, 4}, // 13
{"cndgt", 3, 4}, // 14
{"dp4", 2, 4}, // 15
{"dp3", 2, 4}, // 16
{"dp2add", 3, 4}, // 17
{"cube", 2, 4}, // 18
{"max4", 1, 4}, // 19
{"setp_eq_push", 2, 4}, // 20
{"setp_ne_push", 2, 4}, // 21
{"setp_gt_push", 2, 4}, // 22
{"setp_ge_push", 2, 4}, // 23
{"kill_eq", 2, 4}, // 24
{"kill_gt", 2, 4}, // 25
{"kill_ge", 2, 4}, // 26
{"kill_ne", 2, 4}, // 27
{"dst", 2, 4}, // 28
{"maxa", 2, 4}, // 29
};
const ShaderTranslator::AluOpcodeInfo
ShaderTranslator::alu_scalar_opcode_infos_[0x40] = {
{"adds", 1, 2, false}, // 0
{"adds_prev", 1, 1, false}, // 1
{"muls", 1, 2, false}, // 2
{"muls_prev", 1, 1, false}, // 3
{"muls_prev2", 1, 2, false}, // 4
{"maxs", 1, 2, false}, // 5
{"mins", 1, 2, false}, // 6
{"seqs", 1, 1, false}, // 7
{"sgts", 1, 1, false}, // 8
{"sges", 1, 1, false}, // 9
{"snes", 1, 1, false}, // 10
{"frcs", 1, 1, false}, // 11
{"truncs", 1, 1, false}, // 12
{"floors", 1, 1, false}, // 13
{"exp", 1, 1, false}, // 14
{"logc", 1, 1, false}, // 15
{"log", 1, 1, false}, // 16
{"rcpc", 1, 1, false}, // 17
{"rcpf", 1, 1, false}, // 18
{"rcp", 1, 1, false}, // 19
{"rsqc", 1, 1, false}, // 20
{"rsqf", 1, 1, false}, // 21
{"rsq", 1, 1, false}, // 22
{"maxas", 1, 2, false}, // 23
{"maxasf", 1, 2, false}, // 24
{"subs", 1, 2, false}, // 25
{"subs_prev", 1, 1, false}, // 26
{"setp_eq", 1, 1, false}, // 27
{"setp_ne", 1, 1, false}, // 28
{"setp_gt", 1, 1, false}, // 29
{"setp_ge", 1, 1, false}, // 30
{"setp_inv", 1, 1, false}, // 31
{"setp_pop", 1, 1, false}, // 32
{"setp_clr", 0, 0, false}, // 33
{"setp_rstr", 1, 1, false}, // 34
{"kills_eq", 1, 1, true}, // 35
{"kills_gt", 1, 1, true}, // 36
{"kills_ge", 1, 1, true}, // 37
{"kills_ne", 1, 1, true}, // 38
{"kills_one", 1, 1, true}, // 39
{"sqrt", 1, 1, false}, // 40
{"UNKNOWN", 0, 0, false}, // 41
{"mulsc", 2, 1, false}, // 42
{"mulsc", 2, 1, false}, // 43
{"addsc", 2, 1, false}, // 44
{"addsc", 2, 1, false}, // 45
{"subsc", 2, 1, false}, // 46
{"subsc", 2, 1, false}, // 47
{"sin", 1, 1, false}, // 48
{"cos", 1, 1, false}, // 49
{"retain_prev", 0, 0, false}, // 50
{"adds", 1, 2}, // 0
{"adds_prev", 1, 1}, // 1
{"muls", 1, 2}, // 2
{"muls_prev", 1, 1}, // 3
{"muls_prev2", 1, 2}, // 4
{"maxs", 1, 2}, // 5
{"mins", 1, 2}, // 6
{"seqs", 1, 1}, // 7
{"sgts", 1, 1}, // 8
{"sges", 1, 1}, // 9
{"snes", 1, 1}, // 10
{"frcs", 1, 1}, // 11
{"truncs", 1, 1}, // 12
{"floors", 1, 1}, // 13
{"exp", 1, 1}, // 14
{"logc", 1, 1}, // 15
{"log", 1, 1}, // 16
{"rcpc", 1, 1}, // 17
{"rcpf", 1, 1}, // 18
{"rcp", 1, 1}, // 19
{"rsqc", 1, 1}, // 20
{"rsqf", 1, 1}, // 21
{"rsq", 1, 1}, // 22
{"maxas", 1, 2}, // 23
{"maxasf", 1, 2}, // 24
{"subs", 1, 2}, // 25
{"subs_prev", 1, 1}, // 26
{"setp_eq", 1, 1}, // 27
{"setp_ne", 1, 1}, // 28
{"setp_gt", 1, 1}, // 29
{"setp_ge", 1, 1}, // 30
{"setp_inv", 1, 1}, // 31
{"setp_pop", 1, 1}, // 32
{"setp_clr", 0, 0}, // 33
{"setp_rstr", 1, 1}, // 34
{"kills_eq", 1, 1}, // 35
{"kills_gt", 1, 1}, // 36
{"kills_ge", 1, 1}, // 37
{"kills_ne", 1, 1}, // 38
{"kills_one", 1, 1}, // 39
{"sqrt", 1, 1}, // 40
{"UNKNOWN", 0, 0}, // 41
{"mulsc", 2, 1}, // 42
{"mulsc", 2, 1}, // 43
{"addsc", 2, 1}, // 44
{"addsc", 2, 1}, // 45
{"subsc", 2, 1}, // 46
{"subsc", 2, 1}, // 47
{"sin", 1, 1}, // 48
{"cos", 1, 1}, // 49
{"retain_prev", 0, 0}, // 50
};
void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) {

View File

@ -29,18 +29,27 @@ class ShaderTranslator {
public:
virtual ~ShaderTranslator();
bool Translate(Shader* shader, reg::SQ_PROGRAM_CNTL cntl,
Shader::HostVertexShaderType host_vertex_shader_type =
Shader::HostVertexShaderType::kVertex);
bool Translate(Shader* shader,
Shader::HostVertexShaderType host_vertex_shader_type =
Shader::HostVertexShaderType::kVertex);
// Returns the modification bits to use for a shader of the given type when
// the caller has no more specific modification to request. This base version
// ignores both arguments and always returns 0; being virtual, it can be
// overridden by translator implementations.
virtual uint32_t GetDefaultModification(
xenos::ShaderType shader_type,
Shader::HostVertexShaderType host_vertex_shader_type =
Shader::HostVertexShaderType::kVertex) const {
return 0;
}
bool Translate(Shader::Translation& translation, reg::SQ_PROGRAM_CNTL cntl);
bool Translate(Shader::Translation& translation);
protected:
ShaderTranslator();
// Resets translator state before beginning translation.
virtual void Reset();
// shader_type is passed here so translator implementations can generate
// special fixed shaders for internal use, and set up the type for this
// purpose.
virtual void Reset(xenos::ShaderType shader_type);
// Current host-side modification being generated.
uint32_t modification() const { return modification_; }
// Register count.
uint32_t register_count() const { return register_count_; }
@ -48,11 +57,6 @@ class ShaderTranslator {
bool is_vertex_shader() const {
return shader_type_ == xenos::ShaderType::kVertex;
}
// If translating a vertex shader, type of the shader in a D3D11-like
// rendering pipeline.
Shader::HostVertexShaderType host_vertex_shader_type() const {
return host_vertex_shader_type_;
}
// True if the current shader is a pixel shader.
bool is_pixel_shader() const {
return shader_type_ == xenos::ShaderType::kPixel;
@ -85,10 +89,8 @@ class ShaderTranslator {
// True if the current shader overrides the pixel depth, set before
// translation. Doesn't include writes with an empty used write mask.
bool writes_depth() const { return writes_depth_; }
// True if Xenia can automatically enable early depth/stencil for the pixel
// shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha
// testing and alpha to coverage are disabled.
bool implicit_early_z_allowed() const { return implicit_early_z_allowed_; }
// True if the current shader has any `kill` instructions.
bool kills_pixels() const { return kills_pixels_; }
// A list of all vertex bindings, populated before translation occurs.
const std::vector<Shader::VertexBinding>& vertex_bindings() const {
return vertex_bindings_;
@ -112,6 +114,17 @@ class ShaderTranslator {
return memexport_stream_constants_;
}
// Whether the shader can have early depth and stencil writing enabled, unless
// alpha test or alpha to coverage is enabled. Data gathered before
// translation.
bool CanWriteZEarly() const {
  // Writing depth from the shader or killing pixels rules out early
  // depth/stencil writing.
  if (writes_depth_ || kills_pixels_) {
    return false;
  }
  // TODO(Triang3l): Investigate what happens to memexport when the pixel
  // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early
  // depth/stencil.
  return memexport_stream_constants_.empty();
}
// Current line number in the ucode disassembly.
size_t ucode_disasm_line_number() const { return ucode_disasm_line_number_; }
// Ucode disassembly buffer accumulated during translation.
@ -130,10 +143,14 @@ class ShaderTranslator {
}
// Handles post-translation tasks when the shader has been fully translated.
virtual void PostTranslation(Shader* shader) {}
// setup_shader_post_translation_info if non-modification-specific parameters
// of the Shader object behind the Translation can be set by this invocation.
virtual void PostTranslation(Shader::Translation& translation,
bool setup_shader_post_translation_info) {}
// Sets the host disassembly on a shader.
void set_host_disassembly(Shader* shader, std::string value) {
shader->host_disassembly_ = std::move(value);
// Stores the given host disassembly text on the translation, going through
// the translation's own public setter.
void set_host_disassembly(Shader::Translation& translation,
                          std::string value) {
  translation.set_host_disassembly(std::move(value));
}
// Handles translation for control flow label addresses.
@ -184,11 +201,9 @@ class ShaderTranslator {
const char* name;
uint32_t argument_count;
uint32_t src_swizzle_component_count;
bool disable_implicit_early_z;
};
bool TranslateInternal(Shader* shader,
Shader::HostVertexShaderType host_vertex_shader_type);
bool TranslateInternal(Shader::Translation& translation);
void MarkUcodeInstruction(uint32_t dword_offset);
void AppendUcodeDisasm(char c);
@ -242,12 +257,13 @@ class ShaderTranslator {
// Input shader metadata and microcode.
xenos::ShaderType shader_type_;
Shader::HostVertexShaderType host_vertex_shader_type_;
const uint32_t* ucode_dwords_;
size_t ucode_dword_count_;
reg::SQ_PROGRAM_CNTL program_cntl_;
uint32_t register_count_;
// Current host-side modification being generated.
uint32_t modification_ = 0;
// Accumulated translation errors.
std::vector<Shader::Error> errors_;
@ -268,7 +284,8 @@ class ShaderTranslator {
// translation.
std::set<uint32_t> label_addresses_;
// Detected binding information gathered before translation.
// Detected binding information gathered before translation. Must not be
// affected by the modification index.
int total_attrib_count_ = 0;
std::vector<Shader::VertexBinding> vertex_bindings_;
std::vector<Shader::TextureBinding> texture_bindings_;
@ -278,13 +295,15 @@ class ShaderTranslator {
// These all are gathered before translation.
// uses_register_dynamic_addressing_ for writes, writes_color_targets_,
// writes_depth_ don't include empty used write masks.
// Must not be affected by the modification index.
Shader::ConstantRegisterMap constant_register_map_ = {0};
bool uses_register_dynamic_addressing_ = false;
bool writes_color_targets_[4] = {false, false, false, false};
bool writes_depth_ = false;
bool implicit_early_z_allowed_ = true;
bool kills_pixels_ = false;
// Memexport info is gathered before translation.
// Must not be affected by the modification index.
uint32_t memexport_alloc_count_ = 0;
// For register allocation in implementations - what was used after each
// `alloc export`.

View File

@ -0,0 +1,296 @@
// generated from `xb buildhlsl`
// source: edram_load_depth_float24and32.cs.hlsl
const uint8_t edram_load_depth_float24and32_cs[] = {
0x44, 0x58, 0x42, 0x43, 0xF3, 0xA3, 0xA4, 0x14, 0x0A, 0x50, 0x56, 0x49,
0x5D, 0x09, 0x6C, 0xBF, 0x33, 0xC9, 0xC1, 0x9A, 0x01, 0x00, 0x00, 0x00,
0xAC, 0x0D, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00,
0x10, 0x0D, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00,
0xA8, 0x02, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xB4, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xCF, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x6C, 0x6F, 0x61,
0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x73, 0x6F, 0x75, 0x72,
0x63, 0x65, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F,
0x6C, 0x6F, 0x61, 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x64,
0x65, 0x73, 0x74, 0x00, 0x58, 0x65, 0x45, 0x64, 0x72, 0x61, 0x6D, 0x4C,
0x6F, 0x61, 0x64, 0x53, 0x74, 0x6F, 0x72, 0x65, 0x43, 0x6F, 0x6E, 0x73,
0x74, 0x61, 0x6E, 0x74, 0x73, 0x00, 0xAB, 0xAB, 0xE8, 0x00, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x1C, 0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x01, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
0x30, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x4E, 0x02, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x69, 0x02, 0x00, 0x00,
0x0C, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
0x83, 0x02, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D,
0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, 0x72, 0x5F, 0x64, 0x65,
0x70, 0x74, 0x68, 0x5F, 0x6F, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x64,
0x77, 0x6F, 0x72, 0x64, 0x00, 0xAB, 0xAB, 0xAB, 0x00, 0x00, 0x13, 0x00,
0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65,
0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F,
0x72, 0x5F, 0x64, 0x65, 0x70, 0x74, 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63,
0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72,
0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, 0x5F, 0x6F, 0x66,
0x66, 0x73, 0x65, 0x74, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61,
0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C,
0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64,
0x72, 0x61, 0x6D, 0x5F, 0x62, 0x61, 0x73, 0x65, 0x5F, 0x73, 0x61, 0x6D,
0x70, 0x6C, 0x65, 0x73, 0x5F, 0x32, 0x78, 0x5F, 0x64, 0x65, 0x70, 0x74,
0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x4D, 0x69, 0x63, 0x72,
0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, 0x29, 0x20, 0x48, 0x4C,
0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65, 0x72, 0x20, 0x43, 0x6F,
0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31, 0x30, 0x2E, 0x31, 0x00,
0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58,
0xDC, 0x09, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x77, 0x02, 0x00, 0x00,
0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06,
0x46, 0x7E, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x06,
0x46, 0xEE, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x02,
0x32, 0x10, 0x02, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x20, 0x02, 0x00,
0x5F, 0x00, 0x00, 0x02, 0x32, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02,
0x07, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x06,
0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00,
0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0C,
0x62, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
0xFF, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x04, 0x03,
0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06,
0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00,
0x01, 0x40, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x06,
0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00,
0x01, 0x40, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A,
0xA2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xFF, 0xFF, 0xFF,
0x1E, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01,
0x55, 0x00, 0x00, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
0x23, 0x00, 0x00, 0x08, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x10, 0x02, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06,
0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x10, 0x02, 0x00, 0x26, 0x00, 0x00, 0x07,
0x00, 0xD0, 0x00, 0x00, 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00,
0x23, 0x00, 0x00, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x00, 0x14, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x29, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B,
0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00,
0x0A, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07,
0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x7E, 0x20, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x00, 0x00, 0xA0, 0x00, 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x7E, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
0x14, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x05, 0xF2, 0x00, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF,
0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00,
0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, 0xF2, 0x00, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x37, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x29, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00,
0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0x37, 0x00, 0x00, 0x09,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38,
0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80,
0x41, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F,
0xFF, 0xFF, 0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F,
0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B,
0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80,
0x41, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00,
0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38,
0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8,
0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09,
0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x23, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x29, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x06, 0x00, 0x02, 0x00, 0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x23, 0x00, 0x00, 0x0A, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x56, 0x05, 0x02, 0x00, 0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x86, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, 0xF2, 0xE0, 0x21, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0xFF, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x14, 0xE2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x56, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0xE6, 0x0A, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08,
0x12, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x94, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1F, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
};

View File

@ -0,0 +1,117 @@
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
// Buffer Definitions:
//
// cbuffer XeEdramLoadStoreConstants
// {
//
// uint xe_edram_rt_color_depth_offset;// Offset: 0 Size: 4
// uint xe_edram_rt_color_depth_pitch;// Offset: 4 Size: 4
// uint xe_edram_rt_stencil_offset; // Offset: 8 Size: 4
// uint xe_edram_rt_stencil_pitch; // Offset: 12 Size: 4
// uint xe_edram_base_samples_2x_depth_pitch;// Offset: 16 Size: 4
//
// }
//
//
// Resource Bindings:
//
// Name Type Format Dim ID HLSL Bind Count
// ------------------------------ ---------- ------- ----------- ------- -------------- ------
// xe_edram_load_store_source texture byte r/o T0 t0 1
// xe_edram_load_store_dest UAV byte r/w U0 u0 1
// XeEdramLoadStoreConstants cbuffer NA NA CB0 cb0 1
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Input
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Output
cs_5_1
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[0:0][2], immediateIndexed, space=0
dcl_resource_raw T0[0:0], space=0
dcl_uav_raw U0[0:0], space=0
dcl_input vThreadGroupID.xy
dcl_input vThreadIDInGroup.xy
dcl_input vThreadID.xy
dcl_temps 7
dcl_thread_group 20, 16, 1
ishl r0.x, vThreadIDInGroup.x, l(2)
and r0.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0)
if_nz r0.y
ult r0.y, vThreadIDInGroup.x, l(10)
uge r0.w, vThreadIDInGroup.x, l(10)
and r0.yw, r0.yyyw, l(0, 40, 0, -40)
iadd r0.y, r0.w, r0.y
iadd r0.x, r0.y, r0.x
endif
ushr r0.y, CB0[0][1].x, l(16)
imad r0.y, vThreadGroupID.y, r0.y, r0.z
iadd r0.y, r0.y, vThreadGroupID.x
imul null, r0.z, vThreadIDInGroup.y, l(320)
imad r0.y, r0.y, l(5120), r0.z
ishl r0.x, r0.x, l(2)
iadd r0.x, r0.x, r0.y
ubfe r0.y, l(1), l(13), CB0[0][1].x
ishl r0.y, r0.y, l(1)
ishl r0.x, r0.x, r0.y
ld_raw r1.xyzw, r0.x, T0[0].xyzw
ushr r2.xyzw, r1.xyzw, l(8, 8, 8, 8)
iadd r0.x, r0.x, l(0x00a00000)
ld_raw r0.xyzw, r0.x, T0[0].xyzw
ubfe r3.xyzw, l(20, 20, 20, 20), l(8, 8, 8, 8), r1.xyzw
ushr r4.xyzw, r2.xyzw, l(20, 20, 20, 20)
firstbit_hi r5.xyzw, r3.xyzw
iadd r5.xyzw, r5.xyzw, l(-11, -11, -11, -11)
movc r5.xyzw, r3.xyzw, r5.xyzw, l(21,21,21,21)
iadd r6.xyzw, -r5.xyzw, l(1, 1, 1, 1)
movc r6.xyzw, r4.xyzw, r4.xyzw, r6.xyzw
ishl r5.xyzw, r3.xyzw, r5.xyzw
and r5.xyzw, r5.xyzw, l(0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff)
movc r3.xyzw, r4.xyzw, r3.xyzw, r5.xyzw
ishl r4.xyzw, r6.xyzw, l(23, 23, 23, 23)
iadd r4.xyzw, r4.xyzw, l(0x38000000, 0x38000000, 0x38000000, 0x38000000)
ishl r3.xyzw, r3.xyzw, l(3, 3, 3, 3)
iadd r3.xyzw, r4.xyzw, r3.xyzw
movc r3.xyzw, r2.xyzw, r3.xyzw, l(0,0,0,0)
iadd r4.xyzw, r0.xyzw, -r3.xyzw
uge r5.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r0.xyzw
and r0.xyzw, r0.xyzw, r5.xyzw
umin r0.xyzw, r0.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8)
bfi r5.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r0.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000)
ushr r6.xyzw, r0.xyzw, l(23, 23, 23, 23)
iadd r6.xyzw, -r6.xyzw, l(113, 113, 113, 113)
umin r6.xyzw, r6.xyzw, l(24, 24, 24, 24)
ushr r5.xyzw, r5.xyzw, r6.xyzw
ult r6.xyzw, r0.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000)
iadd r0.xyzw, r0.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000)
movc r0.xyzw, r6.xyzw, r5.xyzw, r0.xyzw
iadd r5.xyzw, r0.xyzw, l(3, 3, 3, 3)
ubfe r0.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r0.xyzw
iadd r0.xyzw, r0.xyzw, r5.xyzw
ubfe r0.xyzw, l(24, 24, 24, 24), l(3, 3, 3, 3), r0.xyzw
ieq r0.xyzw, r2.xyzw, r0.xyzw
and r0.xyzw, r0.xyzw, l(1, 1, 1, 1)
imad r0.xyzw, r4.xyzw, r0.xyzw, r3.xyzw
ishl r2.xy, vThreadID.xxxx, l(4, 2, 0, 0)
imad r2.xy, vThreadID.yyyy, CB0[0][0].ywyy, r2.xyxx
iadd r2.xy, r2.xyxx, CB0[0][0].xzxx
store_raw U0[0].xyzw, r2.x, r0.xyzw
and r0.x, r1.x, l(255)
bfi r0.yzw, l(0, 8, 8, 8), l(0, 8, 16, 24), r1.yyzw, l(0, 0, 0, 0)
iadd r0.xy, r0.zwzz, r0.xyxx
iadd r0.x, r0.y, r0.x
store_raw U0[0].x, r2.y, r0.x
ret
// Approximately 67 instruction slots used

View File

@ -1,11 +1,11 @@
// generated from `xb buildhlsl`
// source: edram_load_depth_float.cs.hlsl
const uint8_t edram_load_depth_float_cs[] = {
0x44, 0x58, 0x42, 0x43, 0xF3, 0xA3, 0xA4, 0x14, 0x0A, 0x50, 0x56, 0x49,
0x5D, 0x09, 0x6C, 0xBF, 0x33, 0xC9, 0xC1, 0x9A, 0x01, 0x00, 0x00, 0x00,
0xAC, 0x0D, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
0x44, 0x58, 0x42, 0x43, 0x17, 0xEE, 0x03, 0x06, 0xD3, 0x6E, 0x58, 0x75,
0x66, 0x3B, 0x5B, 0x87, 0x2F, 0xF9, 0x44, 0x9E, 0x01, 0x00, 0x00, 0x00,
0x64, 0x0A, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00,
0x10, 0x0D, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00,
0xC8, 0x09, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00,
0xA8, 0x02, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00,
@ -69,7 +69,7 @@ const uint8_t edram_load_depth_float_cs[] = {
0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58,
0xDC, 0x09, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x77, 0x02, 0x00, 0x00,
0x94, 0x06, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0xA5, 0x01, 0x00, 0x00,
0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06,
@ -126,168 +126,98 @@ const uint8_t edram_load_depth_float_cs[] = {
0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07,
0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x7E, 0x20, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x09,
0x32, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x02, 0x00,
0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x0A,
0x32, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x56, 0x05, 0x02, 0x00,
0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x46, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x86, 0x80, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x00, 0x00, 0xA0, 0x00, 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x7E, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
0x14, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x05, 0xF2, 0x00, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF,
0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00,
0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, 0xF2, 0x00, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x37, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x29, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x05,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00,
0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0x37, 0x00, 0x00, 0x09,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38,
0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80,
0x41, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F,
0xFF, 0xFF, 0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F,
0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B,
0x02, 0x40, 0x00, 0x00, 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF,
0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x00, 0x0C,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00,
0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B,
0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80,
0x41, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00,
0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38,
0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8,
0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09,
0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x41, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x23, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00,
0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x29, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x06, 0x00, 0x02, 0x00, 0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x23, 0x00, 0x00, 0x0A, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x56, 0x05, 0x02, 0x00, 0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x86, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00,
0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00,
0x37, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00,
0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38,
0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, 0xF2, 0xE0, 0x21, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0xFF, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x14, 0xE2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x56, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0xE6, 0x0A, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00,
0xE6, 0x0A, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08,
0x12, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x94, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
0x94, 0x00, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1F, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,

View File

@ -66,11 +66,12 @@ iadd r0.x, r0.x, r0.y
ubfe r0.y, l(1), l(13), CB0[0][1].x
ishl r0.y, r0.y, l(1)
ishl r0.x, r0.x, r0.y
ld_raw r1.xyzw, r0.x, T0[0].xyzw
ushr r2.xyzw, r1.xyzw, l(8, 8, 8, 8)
iadd r0.x, r0.x, l(0x00a00000)
ld_raw r0.xyzw, r0.x, T0[0].xyzw
ubfe r3.xyzw, l(20, 20, 20, 20), l(8, 8, 8, 8), r1.xyzw
ishl r1.xy, vThreadID.xxxx, l(4, 2, 0, 0)
imad r1.xy, vThreadID.yyyy, CB0[0][0].ywyy, r1.xyxx
iadd r1.xy, r1.xyxx, CB0[0][0].xzxx
ushr r2.xyzw, r0.xyzw, l(8, 8, 8, 8)
ubfe r3.xyzw, l(20, 20, 20, 20), l(8, 8, 8, 8), r0.xyzw
ushr r4.xyzw, r2.xyzw, l(20, 20, 20, 20)
firstbit_hi r5.xyzw, r3.xyzw
iadd r5.xyzw, r5.xyzw, l(-11, -11, -11, -11)
@ -84,34 +85,12 @@ ishl r4.xyzw, r6.xyzw, l(23, 23, 23, 23)
iadd r4.xyzw, r4.xyzw, l(0x38000000, 0x38000000, 0x38000000, 0x38000000)
ishl r3.xyzw, r3.xyzw, l(3, 3, 3, 3)
iadd r3.xyzw, r4.xyzw, r3.xyzw
movc r3.xyzw, r2.xyzw, r3.xyzw, l(0,0,0,0)
iadd r4.xyzw, r0.xyzw, -r3.xyzw
uge r5.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r0.xyzw
and r0.xyzw, r0.xyzw, r5.xyzw
umin r0.xyzw, r0.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8)
bfi r5.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r0.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000)
ushr r6.xyzw, r0.xyzw, l(23, 23, 23, 23)
iadd r6.xyzw, -r6.xyzw, l(113, 113, 113, 113)
umin r6.xyzw, r6.xyzw, l(24, 24, 24, 24)
ushr r5.xyzw, r5.xyzw, r6.xyzw
ult r6.xyzw, r0.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000)
iadd r0.xyzw, r0.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000)
movc r0.xyzw, r6.xyzw, r5.xyzw, r0.xyzw
iadd r5.xyzw, r0.xyzw, l(3, 3, 3, 3)
ubfe r0.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r0.xyzw
iadd r0.xyzw, r0.xyzw, r5.xyzw
ubfe r0.xyzw, l(24, 24, 24, 24), l(3, 3, 3, 3), r0.xyzw
ieq r0.xyzw, r2.xyzw, r0.xyzw
and r0.xyzw, r0.xyzw, l(1, 1, 1, 1)
imad r0.xyzw, r4.xyzw, r0.xyzw, r3.xyzw
ishl r2.xy, vThreadID.xxxx, l(4, 2, 0, 0)
imad r2.xy, vThreadID.yyyy, CB0[0][0].ywyy, r2.xyxx
iadd r2.xy, r2.xyxx, CB0[0][0].xzxx
store_raw U0[0].xyzw, r2.x, r0.xyzw
and r0.x, r1.x, l(255)
bfi r0.yzw, l(0, 8, 8, 8), l(0, 8, 16, 24), r1.yyzw, l(0, 0, 0, 0)
iadd r0.xy, r0.zwzz, r0.xyxx
movc r2.xyzw, r2.xyzw, r3.xyzw, l(0,0,0,0)
store_raw U0[0].xyzw, r1.x, r2.xyzw
and r2.x, r0.x, l(255)
bfi r2.yzw, l(0, 8, 8, 8), l(0, 8, 16, 24), r0.yyzw, l(0, 0, 0, 0)
iadd r0.xy, r2.zwzz, r2.xyxx
iadd r0.x, r0.y, r0.x
store_raw U0[0].x, r2.y, r0.x
store_raw U0[0].x, r1.y, r0.x
ret
// Approximately 67 instruction slots used
// Approximately 46 instruction slots used

View File

@ -0,0 +1,226 @@
// generated from `xb buildhlsl`
// source: edram_store_depth_float24and32.cs.hlsl
const uint8_t edram_store_depth_float24and32_cs[] = {
0x44, 0x58, 0x42, 0x43, 0xC6, 0x10, 0x80, 0x14, 0x97, 0x01, 0xE4, 0x46,
0x76, 0xF1, 0x67, 0xD3, 0xDF, 0x50, 0x25, 0xF7, 0x01, 0x00, 0x00, 0x00,
0x64, 0x0A, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00,
0xC8, 0x09, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00,
0xA8, 0x02, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xB4, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xCF, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x6C, 0x6F, 0x61,
0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x73, 0x6F, 0x75, 0x72,
0x63, 0x65, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F,
0x6C, 0x6F, 0x61, 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x64,
0x65, 0x73, 0x74, 0x00, 0x58, 0x65, 0x45, 0x64, 0x72, 0x61, 0x6D, 0x4C,
0x6F, 0x61, 0x64, 0x53, 0x74, 0x6F, 0x72, 0x65, 0x43, 0x6F, 0x6E, 0x73,
0x74, 0x61, 0x6E, 0x74, 0x73, 0x00, 0xAB, 0xAB, 0xE8, 0x00, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x1C, 0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x01, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
0x30, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x4E, 0x02, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x69, 0x02, 0x00, 0x00,
0x0C, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
0x83, 0x02, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0x00, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D,
0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, 0x72, 0x5F, 0x64, 0x65,
0x70, 0x74, 0x68, 0x5F, 0x6F, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x64,
0x77, 0x6F, 0x72, 0x64, 0x00, 0xAB, 0xAB, 0xAB, 0x00, 0x00, 0x13, 0x00,
0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65,
0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F,
0x72, 0x5F, 0x64, 0x65, 0x70, 0x74, 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63,
0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72,
0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, 0x5F, 0x6F, 0x66,
0x66, 0x73, 0x65, 0x74, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61,
0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C,
0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64,
0x72, 0x61, 0x6D, 0x5F, 0x62, 0x61, 0x73, 0x65, 0x5F, 0x73, 0x61, 0x6D,
0x70, 0x6C, 0x65, 0x73, 0x5F, 0x32, 0x78, 0x5F, 0x64, 0x65, 0x70, 0x74,
0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x4D, 0x69, 0x63, 0x72,
0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, 0x29, 0x20, 0x48, 0x4C,
0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65, 0x72, 0x20, 0x43, 0x6F,
0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31, 0x30, 0x2E, 0x31, 0x00,
0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58,
0x94, 0x06, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0xA5, 0x01, 0x00, 0x00,
0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06,
0x46, 0x7E, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x06,
0x46, 0xEE, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x02,
0x32, 0x10, 0x02, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x20, 0x02, 0x00,
0x5F, 0x00, 0x00, 0x02, 0x32, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02,
0x05, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x09,
0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x02, 0x00,
0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x0A,
0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x56, 0x05, 0x02, 0x00,
0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x80, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x7E, 0x20, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F,
0xFF, 0xFF, 0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F,
0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B,
0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80,
0x41, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00,
0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38,
0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8,
0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09,
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xA5, 0x00, 0x00, 0x08,
0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x06, 0x70, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xE2, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x11,
0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x06, 0x12, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0C, 0x62, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0xFF, 0x07, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x04, 0x03, 0x1A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00,
0x0A, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x06, 0x82, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00,
0x0A, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xA2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xD8, 0xFF, 0xFF, 0xFF, 0x1E, 0x00, 0x00, 0x07,
0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, 0x55, 0x00, 0x00, 0x09,
0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x08,
0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x10, 0x02, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x0A, 0x10, 0x02, 0x00, 0x26, 0x00, 0x00, 0x07, 0x00, 0xD0, 0x00, 0x00,
0x42, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x20, 0x02, 0x00,
0x01, 0x40, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x09,
0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00,
0x2A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07,
0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B, 0x22, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08,
0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0xA0, 0x00, 0xA6, 0x00, 0x00, 0x08,
0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x94, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x13, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
};

View File

@ -0,0 +1,95 @@
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
// Buffer Definitions:
//
// cbuffer XeEdramLoadStoreConstants
// {
//
// uint xe_edram_rt_color_depth_offset;// Offset: 0 Size: 4
// uint xe_edram_rt_color_depth_pitch;// Offset: 4 Size: 4
// uint xe_edram_rt_stencil_offset; // Offset: 8 Size: 4
// uint xe_edram_rt_stencil_pitch; // Offset: 12 Size: 4
// uint xe_edram_base_samples_2x_depth_pitch;// Offset: 16 Size: 4
//
// }
//
//
// Resource Bindings:
//
// Name Type Format Dim ID HLSL Bind Count
// ------------------------------ ---------- ------- ----------- ------- -------------- ------
// xe_edram_load_store_source texture byte r/o T0 t0 1
// xe_edram_load_store_dest UAV byte r/w U0 u0 1
// XeEdramLoadStoreConstants cbuffer NA NA CB0 cb0 1
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Input
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Output
cs_5_1
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[0:0][2], immediateIndexed, space=0
dcl_resource_raw T0[0:0], space=0
dcl_uav_raw U0[0:0], space=0
dcl_input vThreadGroupID.xy
dcl_input vThreadIDInGroup.xy
dcl_input vThreadID.xy
dcl_temps 5
dcl_thread_group 20, 16, 1
ishl r0.xy, vThreadID.xxxx, l(4, 2, 0, 0)
imad r0.xy, vThreadID.yyyy, CB0[0][0].ywyy, r0.xyxx
iadd r0.xy, r0.xyxx, CB0[0][0].xzxx
ld_raw r1.xyzw, r0.x, T0[0].xyzw
uge r2.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r1.xyzw
and r2.xyzw, r1.xyzw, r2.xyzw
umin r2.xyzw, r2.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8)
bfi r3.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r2.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000)
ushr r4.xyzw, r2.xyzw, l(23, 23, 23, 23)
iadd r4.xyzw, -r4.xyzw, l(113, 113, 113, 113)
umin r4.xyzw, r4.xyzw, l(24, 24, 24, 24)
ushr r3.xyzw, r3.xyzw, r4.xyzw
ult r4.xyzw, r2.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000)
iadd r2.xyzw, r2.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000)
movc r2.xyzw, r4.xyzw, r3.xyzw, r2.xyzw
iadd r3.xyzw, r2.xyzw, l(3, 3, 3, 3)
ubfe r2.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r2.xyzw
iadd r2.xyzw, r2.xyzw, r3.xyzw
ushr r2.xyzw, r2.xyzw, l(3, 3, 3, 3)
ld_raw r0.x, r0.y, T0[0].xxxx
ushr r0.yzw, r0.xxxx, l(0, 8, 16, 24)
bfi r0.xyzw, l(24, 24, 24, 24), l(8, 8, 8, 8), r2.xyzw, r0.xyzw
ishl r2.x, vThreadIDInGroup.x, l(2)
and r2.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0)
if_nz r2.y
ult r2.y, vThreadIDInGroup.x, l(10)
uge r2.w, vThreadIDInGroup.x, l(10)
and r2.yw, r2.yyyw, l(0, 40, 0, -40)
iadd r2.y, r2.w, r2.y
iadd r2.x, r2.y, r2.x
endif
ushr r2.y, CB0[0][1].x, l(16)
imad r2.y, vThreadGroupID.y, r2.y, r2.z
iadd r2.y, r2.y, vThreadGroupID.x
imul null, r2.z, vThreadIDInGroup.y, l(320)
imad r2.y, r2.y, l(5120), r2.z
ishl r2.x, r2.x, l(2)
iadd r2.x, r2.x, r2.y
ubfe r2.y, l(1), l(13), CB0[0][1].x
ishl r2.y, r2.y, l(1)
ishl r2.x, r2.x, r2.y
store_raw U0[0].xyzw, r2.x, r0.xyzw
iadd r0.x, r2.x, l(0x00a00000)
store_raw U0[0].xyzw, r0.x, r1.xyzw
ret
// Approximately 45 instruction slots used

View File

@ -1,11 +1,11 @@
// generated from `xb buildhlsl`
// source: edram_store_depth_float.cs.hlsl
const uint8_t edram_store_depth_float_cs[] = {
0x44, 0x58, 0x42, 0x43, 0xC6, 0x10, 0x80, 0x14, 0x97, 0x01, 0xE4, 0x46,
0x76, 0xF1, 0x67, 0xD3, 0xDF, 0x50, 0x25, 0xF7, 0x01, 0x00, 0x00, 0x00,
0x64, 0x0A, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
0x44, 0x58, 0x42, 0x43, 0xF1, 0x72, 0x64, 0x54, 0x9D, 0xF6, 0x79, 0x48,
0x2F, 0x8C, 0xD1, 0x59, 0x56, 0x1C, 0x90, 0x9A, 0x01, 0x00, 0x00, 0x00,
0x28, 0x0A, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00,
0xC8, 0x09, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00,
0x8C, 0x09, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00,
0xA8, 0x02, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00,
@ -69,7 +69,7 @@ const uint8_t edram_store_depth_float_cs[] = {
0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58,
0x94, 0x06, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0xA5, 0x01, 0x00, 0x00,
0x58, 0x06, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x96, 0x01, 0x00, 0x00,
0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06,
@ -79,7 +79,7 @@ const uint8_t edram_store_depth_float_cs[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x02,
0x32, 0x10, 0x02, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x20, 0x02, 0x00,
0x5F, 0x00, 0x00, 0x02, 0x32, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02,
0x05, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x09,
0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x02, 0x00,
0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
@ -96,53 +96,53 @@ const uint8_t edram_store_depth_float_cs[] = {
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F,
0xFF, 0xFF, 0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F,
0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B,
0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80,
0x41, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80,
0x41, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00,
0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38,
0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8,
0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09,
0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A,
0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x03, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xA5, 0x00, 0x00, 0x08,
0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
@ -155,64 +155,59 @@ const uint8_t edram_store_depth_float_cs[] = {
0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x06, 0x12, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0C, 0x62, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0xFF, 0x07, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x04, 0x03, 0x1A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00,
0x0A, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x06, 0x82, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00,
0x0A, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xA2, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xD8, 0xFF, 0xFF, 0xFF, 0x1E, 0x00, 0x00, 0x07,
0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, 0x55, 0x00, 0x00, 0x09,
0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00,
0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, 0x55, 0x00, 0x00, 0x09,
0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x08,
0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x10, 0x02, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1A, 0x10, 0x02, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x0A, 0x10, 0x02, 0x00, 0x26, 0x00, 0x00, 0x07, 0x00, 0xD0, 0x00, 0x00,
0x42, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x20, 0x02, 0x00,
0x42, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1A, 0x20, 0x02, 0x00,
0x01, 0x40, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x09,
0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00,
0x2A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07,
0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B, 0x22, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00,
0x2A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07,
0x12, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B, 0x22, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08,
0x01, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08,
0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0xA0, 0x00, 0xA6, 0x00, 0x00, 0x08,
0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x94, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x94, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x13, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x12, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@ -222,5 +217,5 @@ const uint8_t edram_store_depth_float_cs[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
};

View File

@ -45,51 +45,49 @@ dcl_uav_raw U0[0:0], space=0
dcl_input vThreadGroupID.xy
dcl_input vThreadIDInGroup.xy
dcl_input vThreadID.xy
dcl_temps 5
dcl_temps 4
dcl_thread_group 20, 16, 1
ishl r0.xy, vThreadID.xxxx, l(4, 2, 0, 0)
imad r0.xy, vThreadID.yyyy, CB0[0][0].ywyy, r0.xyxx
iadd r0.xy, r0.xyxx, CB0[0][0].xzxx
ld_raw r1.xyzw, r0.x, T0[0].xyzw
uge r2.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r1.xyzw
and r2.xyzw, r1.xyzw, r2.xyzw
umin r2.xyzw, r2.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8)
bfi r3.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r2.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000)
ushr r4.xyzw, r2.xyzw, l(23, 23, 23, 23)
iadd r4.xyzw, -r4.xyzw, l(113, 113, 113, 113)
umin r4.xyzw, r4.xyzw, l(24, 24, 24, 24)
ushr r3.xyzw, r3.xyzw, r4.xyzw
ult r4.xyzw, r2.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000)
iadd r2.xyzw, r2.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000)
movc r2.xyzw, r4.xyzw, r3.xyzw, r2.xyzw
iadd r3.xyzw, r2.xyzw, l(3, 3, 3, 3)
ubfe r2.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r2.xyzw
iadd r2.xyzw, r2.xyzw, r3.xyzw
ushr r2.xyzw, r2.xyzw, l(3, 3, 3, 3)
and r1.xyzw, r1.xyzw, r2.xyzw
umin r1.xyzw, r1.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8)
bfi r2.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r1.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000)
ushr r3.xyzw, r1.xyzw, l(23, 23, 23, 23)
iadd r3.xyzw, -r3.xyzw, l(113, 113, 113, 113)
umin r3.xyzw, r3.xyzw, l(24, 24, 24, 24)
ushr r2.xyzw, r2.xyzw, r3.xyzw
ult r3.xyzw, r1.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000)
iadd r1.xyzw, r1.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000)
movc r1.xyzw, r3.xyzw, r2.xyzw, r1.xyzw
iadd r2.xyzw, r1.xyzw, l(3, 3, 3, 3)
ubfe r1.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r1.xyzw
iadd r1.xyzw, r1.xyzw, r2.xyzw
ushr r1.xyzw, r1.xyzw, l(3, 3, 3, 3)
ld_raw r0.x, r0.y, T0[0].xxxx
ushr r0.yzw, r0.xxxx, l(0, 8, 16, 24)
bfi r0.xyzw, l(24, 24, 24, 24), l(8, 8, 8, 8), r2.xyzw, r0.xyzw
ishl r2.x, vThreadIDInGroup.x, l(2)
and r2.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0)
if_nz r2.y
ult r2.y, vThreadIDInGroup.x, l(10)
uge r2.w, vThreadIDInGroup.x, l(10)
and r2.yw, r2.yyyw, l(0, 40, 0, -40)
iadd r2.y, r2.w, r2.y
iadd r2.x, r2.y, r2.x
bfi r0.xyzw, l(24, 24, 24, 24), l(8, 8, 8, 8), r1.xyzw, r0.xyzw
ishl r1.x, vThreadIDInGroup.x, l(2)
and r1.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0)
if_nz r1.y
ult r1.y, vThreadIDInGroup.x, l(10)
uge r1.w, vThreadIDInGroup.x, l(10)
and r1.yw, r1.yyyw, l(0, 40, 0, -40)
iadd r1.y, r1.w, r1.y
iadd r1.x, r1.y, r1.x
endif
ushr r2.y, CB0[0][1].x, l(16)
imad r2.y, vThreadGroupID.y, r2.y, r2.z
iadd r2.y, r2.y, vThreadGroupID.x
imul null, r2.z, vThreadIDInGroup.y, l(320)
imad r2.y, r2.y, l(5120), r2.z
ishl r2.x, r2.x, l(2)
iadd r2.x, r2.x, r2.y
ubfe r2.y, l(1), l(13), CB0[0][1].x
ishl r2.y, r2.y, l(1)
ishl r2.x, r2.x, r2.y
store_raw U0[0].xyzw, r2.x, r0.xyzw
iadd r0.x, r2.x, l(0x00a00000)
store_raw U0[0].xyzw, r0.x, r1.xyzw
ushr r1.y, CB0[0][1].x, l(16)
imad r1.y, vThreadGroupID.y, r1.y, r1.z
iadd r1.y, r1.y, vThreadGroupID.x
imul null, r1.z, vThreadIDInGroup.y, l(320)
imad r1.y, r1.y, l(5120), r1.z
ishl r1.x, r1.x, l(2)
iadd r1.x, r1.x, r1.y
ubfe r1.y, l(1), l(13), CB0[0][1].x
ishl r1.y, r1.y, l(1)
ishl r1.x, r1.x, r1.y
store_raw U0[0].xyzw, r1.x, r0.xyzw
ret
// Approximately 45 instruction slots used
// Approximately 43 instruction slots used

View File

@ -0,0 +1,156 @@
// generated from `xb buildhlsl`
// source: float24_round.ps.hlsl
const uint8_t float24_round_ps[] = {
0x44, 0x58, 0x42, 0x43, 0xDF, 0x71, 0xF3, 0x0A, 0x4A, 0xDB, 0xC3, 0x80,
0x1E, 0xE4, 0x39, 0x21, 0x59, 0x07, 0x78, 0x97, 0x01, 0x00, 0x00, 0x00,
0x18, 0x07, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
0xA0, 0x00, 0x00, 0x00, 0x90, 0x02, 0x00, 0x00, 0xC4, 0x02, 0x00, 0x00,
0x7C, 0x06, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x64, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0xFF, 0xFF, 0x00, 0x05, 0x00, 0x00,
0x3C, 0x00, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x4D, 0x69, 0x63, 0x72, 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52,
0x29, 0x20, 0x48, 0x4C, 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65,
0x72, 0x20, 0x43, 0x6F, 0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31,
0x30, 0x2E, 0x31, 0x00, 0x49, 0x53, 0x47, 0x4E, 0xE8, 0x01, 0x00, 0x00,
0x13, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x07, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x09, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0B, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0C, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0D, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0F, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x11, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xD9, 0x01, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x12, 0x00, 0x00, 0x00, 0x0F, 0x04, 0x00, 0x00, 0x54, 0x45, 0x58, 0x43,
0x4F, 0x4F, 0x52, 0x44, 0x00, 0x53, 0x56, 0x5F, 0x50, 0x6F, 0x73, 0x69,
0x74, 0x69, 0x6F, 0x6E, 0x00, 0xAB, 0xAB, 0xAB, 0x4F, 0x53, 0x47, 0x4E,
0x2C, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x0E, 0x00, 0x00,
0x53, 0x56, 0x5F, 0x44, 0x65, 0x70, 0x74, 0x68, 0x00, 0xAB, 0xAB, 0xAB,
0x53, 0x48, 0x45, 0x58, 0xB0, 0x03, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00,
0xEC, 0x00, 0x00, 0x00, 0x6A, 0x08, 0x00, 0x01, 0x64, 0x38, 0x00, 0x04,
0x42, 0x10, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x65, 0x00, 0x00, 0x02, 0x01, 0xC0, 0x00, 0x00, 0x68, 0x00, 0x00, 0x02,
0x02, 0x00, 0x00, 0x00, 0x36, 0x20, 0x08, 0x05, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x2A, 0x10, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00,
0x50, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x7F, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x08, 0x07, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x54, 0x00, 0x08, 0x07,
0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F,
0x8C, 0x00, 0x10, 0x0B, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x20, 0x07,
0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x20, 0x08, 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x2A, 0x00, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x20, 0x07,
0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
0x55, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x20, 0x07, 0x42, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x08, 0x07,
0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8,
0x37, 0x00, 0x08, 0x09, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x08, 0x09, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x08, 0x07, 0x12, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x38, 0x0F,
0x72, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x06, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x40, 0x05,
0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x40, 0x07, 0x82, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x40, 0x09,
0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x08, 0x08,
0x12, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x80,
0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x37, 0x00, 0x08, 0x09, 0x12, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x40, 0x07, 0x82, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x40, 0x07,
0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00,
0x37, 0x00, 0x10, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x29, 0x00, 0x20, 0x07, 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x20, 0x07, 0x42, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x10, 0x07,
0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x37, 0x00, 0x08, 0x08, 0x01, 0xC0, 0x00, 0x00,
0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, 0x94, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
};

View File

@ -0,0 +1,74 @@
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// TEXCOORD 0 xyzw 0 NONE float
// TEXCOORD 1 xyzw 1 NONE float
// TEXCOORD 2 xyzw 2 NONE float
// TEXCOORD 3 xyzw 3 NONE float
// TEXCOORD 4 xyzw 4 NONE float
// TEXCOORD 5 xyzw 5 NONE float
// TEXCOORD 6 xyzw 6 NONE float
// TEXCOORD 7 xyzw 7 NONE float
// TEXCOORD 8 xyzw 8 NONE float
// TEXCOORD 9 xyzw 9 NONE float
// TEXCOORD 10 xyzw 10 NONE float
// TEXCOORD 11 xyzw 11 NONE float
// TEXCOORD 12 xyzw 12 NONE float
// TEXCOORD 13 xyzw 13 NONE float
// TEXCOORD 14 xyzw 14 NONE float
// TEXCOORD 15 xyzw 15 NONE float
// TEXCOORD 16 xyz 16 NONE float
// TEXCOORD 17 xy 17 NONE float
// SV_Position 0 xyzw 18 POS float z
//
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_Depth 0 N/A oDepth DEPTH float YES
//
// Pixel Shader runs at sample frequency
//
ps_5_1
dcl_globalFlags refactoringAllowed
dcl_input_ps_siv linear noperspective sample v18.z, position
dcl_output oDepth
dcl_temps 2
mov_sat [precise(x)] r0.x, v18.z
uge [precise(y)] r0.y, l(0x7fffffff), r0.x
and [precise(x)] r0.x, r0.x, r0.y
umin [precise(x)] r0.x, r0.x, l(0x3ffffff8)
bfi [precise(y)] r0.y, l(23), l(0), r0.x, l(0x00800000)
ushr [precise(z)] r0.z, r0.x, l(23)
iadd [precise(z)] r0.z, -r0.z, l(113)
umin [precise(z)] r0.z, r0.z, l(24)
ushr [precise(y)] r0.y, r0.y, r0.z
ult [precise(z)] r0.z, r0.x, l(0x38800000)
iadd [precise(x)] r0.x, r0.x, l(0xc8000000)
movc [precise(x)] r0.x, r0.z, r0.y, r0.x
iadd [precise(y)] r0.y, r0.x, l(3)
ubfe [precise(x)] r0.x, l(1), l(3), r0.x
iadd [precise(x)] r0.x, r0.x, r0.y
ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx
firstbit_hi [precise(w)] r0.w, r0.y
iadd [precise(w)] r0.w, r0.w, l(-11)
movc [precise(w)] r0.w, r0.y, r0.w, l(21)
iadd [precise(x)] r1.x, -r0.w, l(1)
movc [precise(x)] r1.x, r0.z, r0.z, r1.x
ishl [precise(w)] r0.w, r0.y, r0.w
and [precise(w)] r0.w, r0.w, l(0x000fffff)
movc [precise(y)] r0.y, r0.z, r0.y, r0.w
ishl [precise(z)] r0.z, r1.x, l(23)
iadd [precise(z)] r0.z, r0.z, l(0x38000000)
ishl [precise(y)] r0.y, r0.y, l(3)
iadd [precise(y)] r0.y, r0.z, r0.y
movc [precise(x)] oDepth, r0.x, r0.y, l(0)
ret
// Approximately 30 instruction slots used

View File

@ -0,0 +1,100 @@
// generated from `xb buildhlsl`
// source: float24_truncate.ps.hlsl
const uint8_t float24_truncate_ps[] = {
0x44, 0x58, 0x42, 0x43, 0xB8, 0x51, 0x55, 0x1D, 0xF4, 0xF1, 0xC9, 0xC0,
0x0C, 0x22, 0xD3, 0x43, 0x94, 0xDF, 0x83, 0x9D, 0x01, 0x00, 0x00, 0x00,
0x7C, 0x04, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
0xA0, 0x00, 0x00, 0x00, 0x90, 0x02, 0x00, 0x00, 0xCC, 0x02, 0x00, 0x00,
0xE0, 0x03, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x64, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0xFF, 0xFF, 0x00, 0x05, 0x00, 0x00,
0x3C, 0x00, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00,
0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x4D, 0x69, 0x63, 0x72, 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52,
0x29, 0x20, 0x48, 0x4C, 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65,
0x72, 0x20, 0x43, 0x6F, 0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31,
0x30, 0x2E, 0x31, 0x00, 0x49, 0x53, 0x47, 0x4E, 0xE8, 0x01, 0x00, 0x00,
0x13, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x05, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x06, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x07, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x09, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0A, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0B, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0C, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0D, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x0F, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00,
0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x11, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xD9, 0x01, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x12, 0x00, 0x00, 0x00, 0x0F, 0x04, 0x00, 0x00, 0x54, 0x45, 0x58, 0x43,
0x4F, 0x4F, 0x52, 0x44, 0x00, 0x53, 0x56, 0x5F, 0x50, 0x6F, 0x73, 0x69,
0x74, 0x69, 0x6F, 0x6E, 0x00, 0xAB, 0xAB, 0xAB, 0x4F, 0x53, 0x47, 0x4E,
0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x03, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x0E, 0x00, 0x00,
0x53, 0x56, 0x5F, 0x44, 0x65, 0x70, 0x74, 0x68, 0x4C, 0x65, 0x73, 0x73,
0x45, 0x71, 0x75, 0x61, 0x6C, 0x00, 0xAB, 0xAB, 0x53, 0x48, 0x45, 0x58,
0x0C, 0x01, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00,
0x6A, 0x08, 0x00, 0x01, 0x64, 0x38, 0x00, 0x04, 0x42, 0x10, 0x10, 0x00,
0x12, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x02,
0x01, 0x70, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, 0x00,
0x36, 0x20, 0x08, 0x05, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x2A, 0x10, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x50, 0x00, 0x10, 0x07,
0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x2E,
0x1F, 0x00, 0x04, 0x03, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x8A, 0x00, 0x10, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00,
0x17, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1E, 0x00, 0x10, 0x08, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
0x1A, 0x00, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x24, 0x00, 0x10, 0x07,
0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
0x8C, 0x00, 0x08, 0x0A, 0x01, 0x70, 0x02, 0x00, 0x1A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x01, 0x36, 0x00, 0x08, 0x04,
0x01, 0x70, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x15, 0x00, 0x00, 0x01, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x94, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};

View File

@ -0,0 +1,55 @@
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// TEXCOORD 0 xyzw 0 NONE float
// TEXCOORD 1 xyzw 1 NONE float
// TEXCOORD 2 xyzw 2 NONE float
// TEXCOORD 3 xyzw 3 NONE float
// TEXCOORD 4 xyzw 4 NONE float
// TEXCOORD 5 xyzw 5 NONE float
// TEXCOORD 6 xyzw 6 NONE float
// TEXCOORD 7 xyzw 7 NONE float
// TEXCOORD 8 xyzw 8 NONE float
// TEXCOORD 9 xyzw 9 NONE float
// TEXCOORD 10 xyzw 10 NONE float
// TEXCOORD 11 xyzw 11 NONE float
// TEXCOORD 12 xyzw 12 NONE float
// TEXCOORD 13 xyzw 13 NONE float
// TEXCOORD 14 xyzw 14 NONE float
// TEXCOORD 15 xyzw 15 NONE float
// TEXCOORD 16 xyz 16 NONE float
// TEXCOORD 17 xy 17 NONE float
// SV_Position 0 xyzw 18 POS float z
//
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_DepthLessEqual 0 N/A oDepthLE DEPTHLE float YES
//
// Pixel Shader runs at sample frequency
//
ps_5_1
dcl_globalFlags refactoringAllowed
dcl_input_ps_siv linear noperspective sample v18.z, position
dcl_output oDepthLE
dcl_temps 1
mov_sat [precise(x)] r0.x, v18.z
uge [precise(y)] r0.y, r0.x, l(0x2e800000)
if_nz r0.y
ubfe [precise(y)] r0.y, l(8), l(23), r0.x
iadd [precise(y)] r0.y, -r0.y, l(116)
imax [precise(y)] r0.y, r0.y, l(3)
bfi [precise(x)] oDepthLE, r0.y, l(0), l(0), r0.x
else
mov [precise(x)] oDepthLE, l(0)
endif
ret
// Approximately 11 instruction slots used

View File

@ -7,22 +7,14 @@ void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_thread_id : SV_DispatchThreadID) {
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index);
uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset);
uint4 depth24 = depth24_stencil >> 8u;
uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset);
// Depth. If the stored 32-bit depth converted to 24-bit is the same as the
// stored 24-bit depth, load the 32-bit value because it has more precision
// (and multipass rendering is possible), if it's not, convert the 24-bit
// depth because it was overwritten by aliasing.
uint4 depth24to32 = XeFloat20e4To32(depth24);
uint4 depth = depth24to32 + (depth32 - depth24to32) *
uint4(XeFloat32To20e4(depth32) == depth24);
uint4 samples = xe_edram_load_store_source.Load4(
XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index));
// Depth (exact conversion ensured during drawing).
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, depth);
xe_edram_load_store_dest.Store4(rt_offset, XeFloat20e4To32(samples >> 8u));
// Stencil.
uint4 stencil = (depth24_stencil & 0xFFu) << uint4(0u, 8u, 16u, 24u);
uint4 stencil = (samples & 0xFFu) << uint4(0u, 8u, 16u, 24u);
stencil.xy |= stencil.zw;
stencil.x |= stencil.y;
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +

View File

@ -0,0 +1,31 @@
#include "edram_load_store.hlsli"
#include "pixel_formats.hlsli"
// Compute shader entry point. For 4 consecutive samples per thread, reads the
// packed 24-bit depth + 8-bit stencil values and the separately stored 32-bit
// depth values from xe_edram_load_store_source, then writes 32-bit depth and
// packed stencil to two regions of xe_edram_load_store_dest.
[numthreads(20, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
// Each thread handles 4 samples along X (one Load4 / Store4).
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index);
// In each dword, the upper 24 bits are depth and the lower 8 bits are
// stencil.
uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset);
uint4 depth24 = depth24_stencil >> 8u;
// The 32-bit depth values are read from a fixed 10485760-byte
// (10 * 1024 * 1024) offset past the 24-bit data in the same buffer.
uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset);
// Depth. If the stored 32-bit depth converted to 24-bit is the same as the
// stored 24-bit depth, load the 32-bit value because it has more precision
// (and multipass rendering is possible), if it's not, convert the 24-bit
// depth because it was overwritten by aliasing.
uint4 depth24to32 = XeFloat20e4To32(depth24);
// The comparison result converted to uint4 is 0 or 1 per component, so the
// multiplication selects depth32 where it matches and depth24to32 otherwise.
uint4 depth = depth24to32 + (depth32 - depth24to32) *
uint4(XeFloat32To20e4(depth32) == depth24);
// 16 bytes (4 dwords of depth) are written per thread.
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
xe_edram_load_store_dest.Store4(rt_offset, depth);
// Stencil.
// Pack the four 8-bit stencil values into a single dword: sample 0 in bits
// 0-7, sample 1 in bits 8-15, sample 2 in bits 16-23, sample 3 in bits 24-31.
uint4 stencil = (depth24_stencil & 0xFFu) << uint4(0u, 8u, 16u, 24u);
stencil.xy |= stencil.zw;
stencil.x |= stencil.y;
// 4 bytes (one dword of packed stencil) are written per thread.
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
xe_edram_rt_stencil_offset;
xe_edram_load_store_dest.Store(rt_offset, stencil.x);
}

View File

@ -5,21 +5,18 @@
void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
// Depth.
// Depth (exact conversion ensured during drawing).
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset);
uint4 depth24_stencil = XeFloat32To20e4(depth32) << 8u;
uint4 samples =
XeFloat32To20e4(xe_edram_load_store_source.Load4(rt_offset)) << 8u;
// Stencil.
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
xe_edram_rt_stencil_offset;
depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
samples |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index);
// Store 24-bit depth for aliasing and checking if 32-bit depth is up to date.
xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil);
// Store 32-bit depth so precision isn't lost when doing multipass rendering.
xe_edram_load_store_dest.Store4(10485760u + edram_offset, depth32);
xe_edram_load_store_dest.Store4(
XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index), samples);
}

View File

@ -0,0 +1,25 @@
#include "edram_load_store.hlsli"
#include "pixel_formats.hlsli"
// Compute shader entry point. For 4 consecutive samples per thread, reads
// 32-bit depth and packed stencil from two regions of
// xe_edram_load_store_source, then writes both the packed 24-bit depth + 8-bit
// stencil values and the original 32-bit depth values to
// xe_edram_load_store_dest.
[numthreads(20, 16, 1)]
void main(uint3 xe_group_id : SV_GroupID,
uint3 xe_group_thread_id : SV_GroupThreadID,
uint3 xe_thread_id : SV_DispatchThreadID) {
// Depth.
// 16 bytes (4 dwords of 32-bit depth) are read per thread.
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset);
// Convert to 20e4 and place the result in the upper 24 bits of each dword.
uint4 depth24_stencil = XeFloat32To20e4(depth32) << 8u;
// Stencil.
// One dword holds four 8-bit stencil values (sample 0 in bits 0-7 through
// sample 3 in bits 24-31); each is placed in the lower 8 bits of its sample.
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
xe_edram_rt_stencil_offset;
depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >>
uint4(0u, 8u, 16u, 24u)) & 0xFFu;
// Each thread handles 4 samples along X (one Store4 per destination region).
uint2 tile_sample_index = xe_group_thread_id.xy;
tile_sample_index.x *= 4u;
uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index);
// Store 24-bit depth for aliasing and checking if 32-bit depth is up to date.
xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil);
// Store 32-bit depth so precision isn't lost when doing multipass rendering.
// The 32-bit values go to a fixed 10485760-byte (10 * 1024 * 1024) offset
// past the 24-bit data in the same buffer.
xe_edram_load_store_dest.Store4(10485760u + edram_offset, depth32);
}

View File

@ -7,8 +7,7 @@ void main(uint3 xe_group_id : SV_GroupID,
// Depth.
uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset;
uint4 samples =
(xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u;
uint4 samples = xe_edram_load_store_source.Load4(rt_offset) << 8u;
// Stencil.
rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u +
xe_edram_rt_stencil_offset;

View File

@ -0,0 +1,13 @@
#include "pixel_formats.hlsli"
#include "xenos_draw.hlsli"
// Pixel shader input: the pre-pixel-shader vertex data followed by the
// position, which is declared with `sample` interpolation.
struct XePSInput {
XeVertexPrePS pre_ps;
sample float4 position : SV_Position;
};
// Writes the depth that results from converting the saturated input Z to the
// 20e4 24-bit floating-point format and back to 32-bit floating-point.
precise float main(XePSInput xe_input) : SV_Depth {
  // Saturate first - the input Z may be outside the viewport range (the
  // clamping to the viewport happens after the shader).
  uint depth_f32_bits = asuint(saturate(xe_input.position.z));
  uint depth_f24_bits = XeFloat32To20e4(depth_f32_bits);
  return asfloat(XeFloat20e4To32(depth_f24_bits));
}

View File

@ -0,0 +1,38 @@
#include "pixel_formats.hlsli"
#include "xenos_draw.hlsli"
// Pixel shader input: the pre-pixel-shader vertex data followed by the
// position, which is declared with `sample` interpolation.
struct XePSInput {
XeVertexPrePS pre_ps;
sample float4 position : SV_Position;
};
// Writes the saturated input Z with the low mantissa bits cleared so that the
// result is never greater than the original value (SV_DepthLessEqual).
precise float main(XePSInput xe_input) : SV_DepthLessEqual {
  // This is a simplified float32 -> 20e4 conversion: instead of rounding, the
  // bits that don't fit are dropped, which can only lower the value.
  //
  // float32 uses an exponent bias of 127, and after saturation the exponent is
  // within -127...0.
  // - Normalized 20e4 values start at exponent -14: from there and above,
  //   3 mantissa bits have to be dropped.
  // - The smallest denormalized 20e4 value is 2^-34: at that exponent,
  //   23 mantissa bits have to be dropped.
  // - Everything below 2^-34 turns into 0.
  //
  // The input Z may be outside the viewport range (the clamping to the viewport
  // happens after the shader), thus the saturation.
  precise uint depth_bits = asuint(saturate(xe_input.position.z));
  // Zero is the result if the value is too small to be represented as a float24
  // after truncation (exponent below -34).
  precise uint truncated_bits = 0u;
  if (depth_bits >= 0x2E800000u) {
    // Biased float32 exponent:
    // - 113 or more for exponents of -14 and above.
    // - 93 for the exponent of -34.
    int biased_exponent = asint((depth_bits >> 23u) & 0xFFu);
    // Number of low bits to clear:
    // - 116 - 113 = 3.
    // - 116 - 93 = 23.
    uint dropped_bit_count = asuint(max(116 - biased_exponent, 3));
    truncated_bits = (depth_bits >> dropped_bit_count) << dropped_bit_count;
  }
  return asfloat(truncated_bits);
}

View File

@ -495,6 +495,16 @@ void XeR11G11B10SNormToRGBA16(uint4 packed_texels, out uint4 out_01,
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
// We also can't clamp the stored value to 1 as load->store->load must be exact.
// Converts the bits of a 32-bit float to the 24-bit 20e4 floating-point
// representation (returned in the lower 24 bits).
uint XeFloat32To20e4(uint f32u32) {
  // Negative inputs (sign bit set, the same for float and int) become zero.
  uint non_negative = (f32u32 > 0x7FFFFFFFu) ? 0u : f32u32;
  // Saturate to the largest representable value near 2, which also gets rid of
  // NaNs.
  uint clamped = min(non_negative, 0x3FFFFFF8u);
  // Denormalized 20e4 result: the mantissa with the implicit 1 made explicit,
  // shifted right according to how far the exponent is below the normalized
  // range.
  uint explicit_mantissa = (clamped & 0x7FFFFFu) | 0x800000u;
  uint denormal_shift = min(113u - (clamped >> 23u), 24u);
  uint denormalized = explicit_mantissa >> denormal_shift;
  // Normalized 20e4 result: only the exponent needs to be rebiased.
  uint rebiased = clamped + 0xC8000000u;
  uint f24u32 = (clamped >= 0x38800000u) ? rebiased : denormalized;
  // Drop the 3 extra mantissa bits, rounding to the nearest even.
  uint rounding_bias = 3u + ((f24u32 >> 3u) & 1u);
  return ((f24u32 + rounding_bias) >> 3u) & 0xFFFFFFu;
}
uint4 XeFloat32To20e4(uint4 f32u32) {
// Keep only positive (high bit set means negative for both float and int) and
// saturate to the maximum representable value near 2 (also dropping NaNs).
@ -505,6 +515,21 @@ uint4 XeFloat32To20e4(uint4 f32u32) {
return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu;
}
// Converts a 20e4 floating-point value (4-bit exponent above a 20-bit mantissa)
// to the bits of a 32-bit float.
uint XeFloat20e4To32(uint f24u32) {
  uint stored_exponent = f24u32 >> 20u;
  uint stored_mantissa = f24u32 & 0xFFFFFu;
  // Shift needed to move the highest set mantissa bit to the implicit 1
  // position (bit 20).
  uint normalize_shift = 20u - firstbithigh(stored_mantissa);
  uint exponent = stored_exponent;
  uint mantissa = stored_mantissa;
  if (stored_exponent == 0u) {
    // Denormalized value - normalize it, which is the same as:
    // Exponent = 1;
    // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0);
    exponent = 1u - normalize_shift;
    mantissa = (stored_mantissa << normalize_shift) & 0xFFFFFu;
  }
  // Rebias the exponent and widen the mantissa to build the 32-bit float bits.
  uint f32u32 = ((exponent + 112u) << 23u) | (mantissa << 3u);
  // Zero stays zero.
  return (f24u32 == 0u) ? 0u : f32u32;
}
uint4 XeFloat20e4To32(uint4 f24u32) {
uint4 mantissa = f24u32 & 0xFFFFFu;
uint4 exponent = f24u32 >> 20u;

View File

@ -10,9 +10,9 @@ void main(point XeVertexPreGS xe_in[1],
}
XeVertexPostGS xe_out;
xe_out.interpolators = xe_in[0].post_gs.interpolators;
xe_out.point_params.z = xe_in[0].post_gs.point_params.z;
xe_out.clip_space_zw = xe_in[0].post_gs.clip_space_zw;
xe_out.pre_ps.interpolators = xe_in[0].post_gs.pre_ps.interpolators;
xe_out.pre_ps.point_params.z = xe_in[0].post_gs.pre_ps.point_params.z;
xe_out.pre_ps.clip_space_zw = xe_in[0].post_gs.pre_ps.clip_space_zw;
xe_out.position.zw = xe_in[0].post_gs.position.zw;
xe_out.clip_distance_0123 = xe_in[0].post_gs.clip_distance_0123;
xe_out.clip_distance_45 = xe_in[0].post_gs.clip_distance_45;
@ -20,26 +20,27 @@ void main(point XeVertexPreGS xe_in[1],
// Shader header writes -1.0f to point_size by default, so any positive value
// means that it was overwritten by the translated vertex shader.
float2 point_size =
(xe_in[0].post_gs.point_params.z > 0.0f ? xe_in[0].post_gs.point_params.zz
: xe_point_size);
xe_in[0].post_gs.pre_ps.point_params.z > 0.0f
? xe_in[0].post_gs.pre_ps.point_params.zz
: xe_point_size;
point_size =
clamp(point_size, xe_point_size_min_max.xx, xe_point_size_min_max.yy) *
xe_point_screen_to_ndc * xe_in[0].post_gs.position.w;
xe_out.point_params.xy = float2(0.0, 0.0);
xe_out.pre_ps.point_params.xy = float2(0.0, 0.0);
// TODO(Triang3l): On Vulkan, the sign of Y needs to be inverted because of
// the upper-left origin.
// TODO(Triang3l): Investigate the true signs of point sprites.
xe_out.position.xy =
xe_in[0].post_gs.position.xy + float2(-point_size.x, point_size.y);
xe_stream.Append(xe_out);
xe_out.point_params.xy = float2(0.0, 1.0);
xe_out.pre_ps.point_params.xy = float2(0.0, 1.0);
xe_out.position.xy = xe_in[0].post_gs.position.xy - point_size;
xe_stream.Append(xe_out);
xe_out.point_params.xy = float2(1.0, 0.0);
xe_out.pre_ps.point_params.xy = float2(1.0, 0.0);
xe_out.position.xy = xe_in[0].post_gs.position.xy + point_size;
xe_stream.Append(xe_out);
xe_out.point_params.xy = float2(1.0, 1.0);
xe_out.pre_ps.point_params.xy = float2(1.0, 1.0);
xe_out.position.xy =
xe_in[0].post_gs.position.xy + float2(point_size.x, -point_size.y);
xe_stream.Append(xe_out);

View File

@ -80,16 +80,19 @@ void main(triangle XeVertexPreGS xe_in[3],
v3_signs = float3(1.0f, 1.0f, -1.0f);
}
[unroll] for (int i = 0; i < 16; ++i) {
xe_out.interpolators[i] = v3_signs.x * xe_in[0].post_gs.interpolators[i] +
v3_signs.y * xe_in[1].post_gs.interpolators[i] +
v3_signs.z * xe_in[2].post_gs.interpolators[i];
xe_out.pre_ps.interpolators[i] =
v3_signs.x * xe_in[0].post_gs.pre_ps.interpolators[i] +
v3_signs.y * xe_in[1].post_gs.pre_ps.interpolators[i] +
v3_signs.z * xe_in[2].post_gs.pre_ps.interpolators[i];
}
xe_out.point_params = v3_signs.x * xe_in[0].post_gs.point_params +
v3_signs.y * xe_in[1].post_gs.point_params +
v3_signs.z * xe_in[2].post_gs.point_params;
xe_out.clip_space_zw = v3_signs.x * xe_in[0].post_gs.clip_space_zw +
v3_signs.y * xe_in[1].post_gs.clip_space_zw +
v3_signs.z * xe_in[2].post_gs.clip_space_zw;
xe_out.pre_ps.point_params =
v3_signs.x * xe_in[0].post_gs.pre_ps.point_params +
v3_signs.y * xe_in[1].post_gs.pre_ps.point_params +
v3_signs.z * xe_in[2].post_gs.pre_ps.point_params;
xe_out.pre_ps.clip_space_zw =
v3_signs.x * xe_in[0].post_gs.pre_ps.clip_space_zw +
v3_signs.y * xe_in[1].post_gs.pre_ps.clip_space_zw +
v3_signs.z * xe_in[2].post_gs.pre_ps.clip_space_zw;
xe_out.position = v3_signs.x * xe_in[0].post_gs.position +
v3_signs.y * xe_in[1].post_gs.position +
v3_signs.z * xe_in[2].post_gs.position;

View File

@ -63,10 +63,14 @@ struct XeHSControlPointOutput {
float index : XEVERTEXID;
};
struct XeVertexPostGS {
struct XeVertexPrePS {
float4 interpolators[16] : TEXCOORD0;
float3 point_params : TEXCOORD16;
float2 clip_space_zw : TEXCOORD17;
};
struct XeVertexPostGS {
XeVertexPrePS pre_ps;
// Precise needed to preserve NaN - guest primitives may be converted to more
// than 1 triangle, so need to kill them entirely manually in GS if any vertex
// is NaN.

View File

@ -66,8 +66,22 @@ SpirvShaderTranslator::Features::Features(
SpirvShaderTranslator::SpirvShaderTranslator(const Features& features)
: features_(features) {}
void SpirvShaderTranslator::Reset() {
ShaderTranslator::Reset();
// Builds the default modification bits for a shader of the given type. Only
// vertex shaders carry a non-default field (the host vertex shader type); for
// any other shader type, the default-constructed value is returned.
uint32_t SpirvShaderTranslator::GetDefaultModification(
    xenos::ShaderType shader_type,
    Shader::HostVertexShaderType host_vertex_shader_type) const {
  Modification default_modification;
  if (shader_type == xenos::ShaderType::kVertex) {
    default_modification.host_vertex_shader_type = host_vertex_shader_type;
  }
  return default_modification.value;
}
void SpirvShaderTranslator::Reset(xenos::ShaderType shader_type) {
ShaderTranslator::Reset(shader_type);
builder_.reset();
@ -226,8 +240,8 @@ void SpirvShaderTranslator::StartTranslation() {
"xe_uniform_float_constants");
builder_->addDecoration(
uniform_float_constants_, spv::DecorationDescriptorSet,
int(IsSpirvFragmentShader() ? kDescriptorSetFloatConstantsPixel
: kDescriptorSetFloatConstantsVertex));
int(is_pixel_shader() ? kDescriptorSetFloatConstantsPixel
: kDescriptorSetFloatConstantsVertex));
builder_->addDecoration(uniform_float_constants_, spv::DecorationBinding,
0);
if (features_.spirv_version >= spv::Spv_1_4) {
@ -335,7 +349,7 @@ void SpirvShaderTranslator::StartTranslation() {
main_interface_.push_back(buffers_shared_memory_);
}
if (IsSpirvVertexOrTessEvalShader()) {
if (is_vertex_shader()) {
StartVertexOrTessEvalShaderBeforeMain();
}
@ -383,7 +397,7 @@ void SpirvShaderTranslator::StartTranslation() {
// Write the execution model-specific prologue with access to variables in the
// main function.
if (IsSpirvVertexOrTessEvalShader()) {
if (is_vertex_shader()) {
StartVertexOrTessEvalShaderInMain();
}
@ -507,7 +521,7 @@ std::vector<uint8_t> SpirvShaderTranslator::CompleteTranslation() {
function_main_->addBlock(main_loop_merge_);
builder_->setBuildPoint(main_loop_merge_);
if (IsSpirvVertexOrTessEvalShader()) {
if (is_vertex_shader()) {
CompleteVertexOrTessEvalShaderInMain();
}
@ -516,12 +530,12 @@ std::vector<uint8_t> SpirvShaderTranslator::CompleteTranslation() {
// Make the main function the entry point.
spv::ExecutionModel execution_model;
if (IsSpirvFragmentShader()) {
if (is_pixel_shader()) {
execution_model = spv::ExecutionModelFragment;
builder_->addExecutionMode(function_main_,
spv::ExecutionModeOriginUpperLeft);
} else {
assert_true(IsSpirvVertexOrTessEvalShader());
assert_true(is_vertex_shader());
execution_model = IsSpirvTessEvalShader()
? spv::ExecutionModelTessellationEvaluation
: spv::ExecutionModelVertex;
@ -1479,7 +1493,7 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result,
spv::StorageClassFunction, var_main_registers_, id_vector_temp_util_);
} break;
case InstructionStorageTarget::kPosition:
assert_true(IsSpirvVertexOrTessEvalShader());
assert_true(is_vertex_shader());
id_vector_temp_util_.clear();
id_vector_temp_util_.push_back(
builder_->makeIntConstant(kOutputPerVertexMemberPosition));

View File

@ -25,6 +25,25 @@ namespace gpu {
class SpirvShaderTranslator : public ShaderTranslator {
public:
// Shader modification bits specific to this translator, accessible either as
// individual bit fields or as a single 32-bit `value` (the two share storage).
union Modification {
// If anything in this structure is changed in a way not compatible with
// the previous layout, invalidate the pipeline storages by increasing this
// version number (0xYYYYMMDD)!
// TODO(Triang3l): Change to 0xYYYYMMDD once it's out of the rapid
// prototyping stage (easier to do small granular updates with an
// incremental counter).
static constexpr uint32_t kVersion = 1;
struct {
// VS - pipeline stage and input configuration.
Shader::HostVertexShaderType host_vertex_shader_type
: Shader::kHostVertexShaderTypeBitCount;
};
// All modification bits as one integer; 0 unless set via the constructor or
// the bit fields.
uint32_t value = 0;
// Implicitly constructible from a raw 32-bit modification value.
Modification(uint32_t modification_value = 0) : value(modification_value) {}
};
enum : uint32_t {
kSysFlag_XYDividedByW_Shift,
kSysFlag_ZDividedByW_Shift,
@ -118,6 +137,11 @@ class SpirvShaderTranslator : public ShaderTranslator {
};
SpirvShaderTranslator(const Features& features);
uint32_t GetDefaultModification(
xenos::ShaderType shader_type,
Shader::HostVertexShaderType host_vertex_shader_type =
Shader::HostVertexShaderType::kVertex) const override;
static constexpr uint32_t GetSharedMemoryStorageBufferCountLog2(
uint32_t max_storage_buffer_range) {
if (max_storage_buffer_range >= 512 * 1024 * 1024) {
@ -134,7 +158,7 @@ class SpirvShaderTranslator : public ShaderTranslator {
}
protected:
void Reset() override;
void Reset(xenos::ShaderType shader_type) override;
void StartTranslation() override;
@ -166,17 +190,21 @@ class SpirvShaderTranslator : public ShaderTranslator {
builder_->getBuildPoint()->addInstruction(std::move(selection_merge_op));
}
// Returns the value of modification() wrapped in the Modification union so its
// bit fields can be read.
// NOTE(review): modification() is declared outside this view - presumably the
// modification of the translation currently in progress; confirm.
Modification GetSpirvShaderModification() const {
return Modification(modification());
}
// TODO(Triang3l): Depth-only pixel shader.
bool IsSpirvVertexOrTessEvalShader() const { return is_vertex_shader(); }
bool IsSpirvVertexShader() const {
return IsSpirvVertexOrTessEvalShader() &&
host_vertex_shader_type() == Shader::HostVertexShaderType::kVertex;
return is_vertex_shader() &&
GetSpirvShaderModification().host_vertex_shader_type ==
Shader::HostVertexShaderType::kVertex;
}
bool IsSpirvTessEvalShader() const {
return IsSpirvVertexOrTessEvalShader() &&
host_vertex_shader_type() != Shader::HostVertexShaderType::kVertex;
return is_vertex_shader() &&
GetSpirvShaderModification().host_vertex_shader_type !=
Shader::HostVertexShaderType::kVertex;
}
bool IsSpirvFragmentShader() const { return is_pixel_shader(); }
// Must be called before emitting any SPIR-V operations that must be in a
// block in translator callbacks to ensure that if the last instruction added

View File

@ -18,8 +18,7 @@
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/profiling.h"
#include "third_party/xxhash/xxhash.h"
#include "xenia/base/xxhash.h"
namespace xe {
namespace gpu {

View File

@ -16,8 +16,7 @@
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "third_party/xxhash/xxhash.h"
#include "xenia/base/xxhash.h"
namespace xe {
namespace gpu {
@ -319,7 +318,7 @@ bool TextureInfo::GetPackedTileOffset(int packed_tile, uint32_t* offset_x,
}
uint64_t TextureInfo::hash() const {
return XXH64(this, sizeof(TextureInfo), 0);
return XXH3_64bits(this, sizeof(TextureInfo));
}
void TextureInfo::SetupMemoryInfo(uint32_t base_address, uint32_t mip_address) {

View File

@ -92,7 +92,7 @@ int TraceDump::Main(const std::vector<std::string>& args) {
bool TraceDump::Setup() {
// Create the emulator but don't initialize so we can setup the window.
emulator_ = std::make_unique<Emulator>("", "", "");
emulator_ = std::make_unique<Emulator>("", "", "", "");
X_STATUS result = emulator_->Setup(
nullptr, nullptr, [this]() { return CreateGraphicsSystem(); }, nullptr);
if (XFAILED(result)) {

View File

@ -121,7 +121,7 @@ bool TraceViewer::Setup() {
window_->Resize(1920, 1200);
// Create the emulator but don't initialize so we can setup the window.
emulator_ = std::make_unique<Emulator>("", "", "");
emulator_ = std::make_unique<Emulator>("", "", "", "");
X_STATUS result = emulator_->Setup(
window_.get(), nullptr, [this]() { return CreateGraphicsSystem(); },
nullptr);
@ -566,8 +566,21 @@ TraceViewer::ShaderDisplayType TraceViewer::DrawShaderTypeUI() {
void TraceViewer::DrawShaderUI(Shader* shader, ShaderDisplayType display_type) {
// Must be prepared for advanced display modes.
// FIXME(Triang3l): This should display the actual translation used in the
// draw, but it may depend on multiple backend-related factors, including
// drawing multiple times with multiple modifications, even depending on
// values obtained during translation of other modifications (for instance,
// a memexporting shader can be executed both as a vertex shader (to draw the
// points) and as a compute shader (to actually export) if the host doesn't
// support writes from vertex shaders).
const Shader::Translation* translation = nullptr;
if (display_type != ShaderDisplayType::kUcode) {
if (!shader->is_valid()) {
for (const auto& translation_pair : shader->translations()) {
if (translation_pair.second->is_valid()) {
translation = translation_pair.second;
}
}
if (!translation) {
ImGui::TextColored(kColorError,
"ERROR: shader error during parsing/translation");
return;
@ -580,7 +593,7 @@ void TraceViewer::DrawShaderUI(Shader* shader, ShaderDisplayType display_type) {
break;
}
case ShaderDisplayType::kTranslated: {
const auto& str = shader->GetTranslatedBinaryString();
const auto& str = translation->GetTranslatedBinaryString();
size_t i = 0;
bool done = false;
while (!done && i < str.size()) {
@ -600,7 +613,7 @@ void TraceViewer::DrawShaderUI(Shader* shader, ShaderDisplayType display_type) {
break;
}
case ShaderDisplayType::kHostDisasm: {
DrawMultilineString(shader->host_disassembly());
DrawMultilineString(translation->host_disassembly());
break;
}
}

View File

@ -816,10 +816,11 @@ static_assert_size(TextureFetchInstruction, 12);
// move of the third operand in case of zero multiplicands, because the term
// may be -0, while the result should be +0 in this case.
// http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf
// Multiply-add also appears to be not fused (the SM3 behavior instruction on
// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators
// should not use instructions that may be interpreted by the host GPU as
// fused multiply-add.
// Multiply-add also appears to be not fused; the SM3 behavior instruction on
// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32 (in 2012-2020, before
// RDNA 2, which removed v_mad_f32 as well) - shader translators should not
// use instructions that may be interpreted by the host GPU as fused
// multiply-add.
enum class AluScalarOpcode : uint32_t {
// Floating-Point Add
@ -1147,6 +1148,19 @@ enum class AluScalarOpcode : uint32_t {
kRetainPrev = 50,
};
// Returns true if the scalar ALU opcode is one of the kKills* opcodes
// (kKillsEq, kKillsGt, kKillsGe, kKillsNe, kKillsOne), false for any other
// opcode.
constexpr bool AluScalarOpcodeIsKill(AluScalarOpcode scalar_opcode) {
  return scalar_opcode == AluScalarOpcode::kKillsEq ||
         scalar_opcode == AluScalarOpcode::kKillsGt ||
         scalar_opcode == AluScalarOpcode::kKillsGe ||
         scalar_opcode == AluScalarOpcode::kKillsNe ||
         scalar_opcode == AluScalarOpcode::kKillsOne;
}
enum class AluVectorOpcode : uint32_t {
// Per-Component Floating-Point Add
// add/ADDv dest, src0, src1
@ -1471,27 +1485,37 @@ enum class AluVectorOpcode : uint32_t {
kMaxA = 29,
};
// Returns true if the vector ALU opcode is one of the kKill* opcodes
// (kKillEq, kKillGt, kKillGe, kKillNe), false for any other opcode.
constexpr bool AluVectorOpcodeIsKill(AluVectorOpcode vector_opcode) {
  return vector_opcode == AluVectorOpcode::kKillEq ||
         vector_opcode == AluVectorOpcode::kKillGt ||
         vector_opcode == AluVectorOpcode::kKillGe ||
         vector_opcode == AluVectorOpcode::kKillNe;
}
// Whether the vector instruction has side effects such as discarding a pixel or
// setting the predicate and can't be ignored even if it doesn't write
// anywhere. Note that all scalar operations except for retain_prev have a side
// effect of modifying the previous scalar result register, so they must always
// be executed even if not writing.
constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) {
if (AluVectorOpcodeIsKill(vector_opcode)) {
return true;
}
switch (vector_opcode) {
case AluVectorOpcode::kSetpEqPush:
case AluVectorOpcode::kSetpNePush:
case AluVectorOpcode::kSetpGtPush:
case AluVectorOpcode::kSetpGePush:
case AluVectorOpcode::kKillEq:
case AluVectorOpcode::kKillGt:
case AluVectorOpcode::kKillGe:
case AluVectorOpcode::kKillNe:
case AluVectorOpcode::kMaxA:
return true;
default:
break;
return false;
}
return false;
}
// Whether each component of a source operand is used at all in the instruction

View File

@ -627,6 +627,17 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
}
// TODO(Triang3l): Get a pixel shader.
VulkanShader* pixel_shader = nullptr;
SpirvShaderTranslator::Modification vertex_shader_modification;
SpirvShaderTranslator::Modification pixel_shader_modification;
if (!pipeline_cache_->GetCurrentShaderModifications(
vertex_shader_modification, pixel_shader_modification)) {
return false;
}
VulkanShader::VulkanTranslation* vertex_shader_translation =
static_cast<VulkanShader::VulkanTranslation*>(
vertex_shader->GetOrCreateTranslation(
vertex_shader_modification.value));
VulkanShader::VulkanTranslation* pixel_shader_translation = nullptr;
VulkanRenderTargetCache::FramebufferKey framebuffer_key;
if (!render_target_cache_->UpdateRenderTargets(framebuffer_key)) {
@ -648,7 +659,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
// current_graphics_pipeline_layout_.
VkPipeline pipeline;
const VulkanPipelineCache::PipelineLayoutProvider* pipeline_layout_provider;
if (!pipeline_cache_->ConfigurePipeline(vertex_shader, pixel_shader,
if (!pipeline_cache_->ConfigurePipeline(vertex_shader_translation,
pixel_shader_translation,
framebuffer_key.render_pass_key,
pipeline, pipeline_layout_provider)) {
return false;
@ -713,7 +725,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
draw_util::GetHostViewportInfo(
regs, 1.0f, 1.0f, false,
float(device_properties.limits.maxViewportDimensions[0]),
float(device_properties.limits.maxViewportDimensions[1]), true,
float(device_properties.limits.maxViewportDimensions[1]), true, false,
viewport_info);
// Update fixed-function dynamic state.

View File

@ -17,6 +17,8 @@
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/base/xxhash.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/spirv_shader_translator.h"
@ -84,7 +86,8 @@ VulkanShader* VulkanPipelineCache::LoadShader(xenos::ShaderType shader_type,
const uint32_t* host_address,
uint32_t dword_count) {
// Hash the input memory and lookup the shader.
uint64_t data_hash = XXH64(host_address, dword_count * sizeof(uint32_t), 0);
uint64_t data_hash =
XXH3_64bits(host_address, dword_count * sizeof(uint32_t));
auto it = shaders_.find(data_hash);
if (it != shaders_.end()) {
// Shader has been previously loaded.
@ -94,16 +97,31 @@ VulkanShader* VulkanPipelineCache::LoadShader(xenos::ShaderType shader_type,
// Always create the shader and stash it away.
// We need to track it even if it fails translation so we know not to try
// again.
VulkanShader* shader =
new VulkanShader(shader_type, data_hash, host_address, dword_count);
VulkanShader* shader = new VulkanShader(
shader_type, data_hash, host_address, dword_count,
command_processor_.GetVulkanContext().GetVulkanProvider());
shaders_.emplace(data_hash, shader);
if (!cvars::dump_shaders.empty()) {
shader->DumpUcodeBinary(cvars::dump_shaders);
}
return shader;
}
// Writes the shader modifications to use for the current state into the two
// output parameters and returns whether they are valid. Currently both outputs
// are always the translator's default modification for the respective shader
// type, and the function always returns true.
bool VulkanPipelineCache::GetCurrentShaderModifications(
    SpirvShaderTranslator::Modification& vertex_shader_modification_out,
    SpirvShaderTranslator::Modification& pixel_shader_modification_out) const {
  using Modification = SpirvShaderTranslator::Modification;

  // TODO(Triang3l): Tessellation, depth output.
  const Modification default_vertex_modification(
      shader_translator_->GetDefaultModification(xenos::ShaderType::kVertex));
  vertex_shader_modification_out = default_vertex_modification;

  const Modification default_pixel_modification(
      shader_translator_->GetDefaultModification(xenos::ShaderType::kPixel));
  pixel_shader_modification_out = default_pixel_modification;

  return true;
}
bool VulkanPipelineCache::EnsureShadersTranslated(
VulkanShader* vertex_shader, VulkanShader* pixel_shader,
Shader::HostVertexShaderType host_vertex_shader_type) {
VulkanShader::VulkanTranslation* vertex_shader,
VulkanShader::VulkanTranslation* pixel_shader) {
const RegisterFile& regs = register_file_;
auto sq_program_cntl = regs.Get<reg::SQ_PROGRAM_CNTL>();
@ -133,7 +151,8 @@ bool VulkanPipelineCache::EnsureShadersTranslated(
}
bool VulkanPipelineCache::ConfigurePipeline(
VulkanShader* vertex_shader, VulkanShader* pixel_shader,
VulkanShader::VulkanTranslation* vertex_shader,
VulkanShader::VulkanTranslation* pixel_shader,
VulkanRenderTargetCache::RenderPassKey render_pass_key,
VkPipeline& pipeline_out,
const PipelineLayoutProvider*& pipeline_layout_out) {
@ -160,8 +179,7 @@ bool VulkanPipelineCache::ConfigurePipeline(
}
// Create the pipeline if not the latest and not already existing.
if (!EnsureShadersTranslated(vertex_shader, pixel_shader,
Shader::HostVertexShaderType::kVertex)) {
if (!EnsureShadersTranslated(vertex_shader, pixel_shader)) {
return false;
}
const PipelineLayoutProvider* pipeline_layout =
@ -189,24 +207,22 @@ bool VulkanPipelineCache::ConfigurePipeline(
return true;
}
bool VulkanPipelineCache::TranslateShader(SpirvShaderTranslator& translator,
VulkanShader& shader,
reg::SQ_PROGRAM_CNTL cntl) {
bool VulkanPipelineCache::TranslateShader(
SpirvShaderTranslator& translator,
VulkanShader::VulkanTranslation& translation, reg::SQ_PROGRAM_CNTL cntl) {
// Perform translation.
// If this fails the shader will be marked as invalid and ignored later.
// TODO(Triang3l): Host vertex shader type.
if (!translator.Translate(&shader, cntl,
Shader::HostVertexShaderType::kVertex)) {
if (!translator.Translate(translation, cntl)) {
XELOGE("Shader {:016X} translation failed; marking as ignored",
shader.ucode_data_hash());
translation.shader().ucode_data_hash());
return false;
}
return shader.InitializeShaderModule(
command_processor_.GetVulkanContext().GetVulkanProvider());
return translation.GetOrCreateShaderModule() != VK_NULL_HANDLE;
}
bool VulkanPipelineCache::GetCurrentStateDescription(
const VulkanShader* vertex_shader, const VulkanShader* pixel_shader,
const VulkanShader::VulkanTranslation* vertex_shader,
const VulkanShader::VulkanTranslation* pixel_shader,
VulkanRenderTargetCache::RenderPassKey render_pass_key,
PipelineDescription& description_out) const {
description_out.Reset();
@ -215,9 +231,14 @@ bool VulkanPipelineCache::GetCurrentStateDescription(
auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
description_out.vertex_shader_hash = vertex_shader->ucode_data_hash();
description_out.pixel_shader_hash =
pixel_shader ? pixel_shader->ucode_data_hash() : 0;
description_out.vertex_shader_hash =
vertex_shader->shader().ucode_data_hash();
description_out.vertex_shader_modification = vertex_shader->modification();
if (pixel_shader) {
description_out.pixel_shader_hash =
pixel_shader->shader().ucode_data_hash();
description_out.pixel_shader_modification = pixel_shader->modification();
}
description_out.render_pass_key = render_pass_key;
xenos::PrimitiveType primitive_type = vgt_draw_initiator.prim_type;
@ -321,11 +342,11 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
if (creation_arguments.pixel_shader) {
XELOGGPU("Creating graphics pipeline state with VS {:016X}, PS {:016X}",
creation_arguments.vertex_shader->ucode_data_hash(),
creation_arguments.pixel_shader->ucode_data_hash());
creation_arguments.vertex_shader->shader().ucode_data_hash(),
creation_arguments.pixel_shader->shader().ucode_data_hash());
} else {
XELOGGPU("Creating graphics pipeline state with VS {:016X}",
creation_arguments.vertex_shader->ucode_data_hash());
creation_arguments.vertex_shader->shader().ucode_data_hash());
}
const PipelineDescription& description = creation_arguments.pipeline->first;
@ -514,11 +535,11 @@ bool VulkanPipelineCache::EnsurePipelineCreated(
/* if (creation_arguments.pixel_shader) {
XELOGE(
"Failed to create graphics pipeline with VS {:016X}, PS {:016X}",
creation_arguments.vertex_shader->ucode_data_hash(),
creation_arguments.pixel_shader->ucode_data_hash());
creation_arguments.vertex_shader->shader().ucode_data_hash(),
creation_arguments.pixel_shader->shader().ucode_data_hash());
} else {
XELOGE("Failed to create graphics pipeline with VS {:016X}",
creation_arguments.vertex_shader->ucode_data_hash());
creation_arguments.vertex_shader->shader().ucode_data_hash());
} */
return false;
}

View File

@ -16,9 +16,9 @@
#include <unordered_map>
#include <utility>
#include "third_party/xxhash/xxhash.h"
#include "xenia/base/hash.h"
#include "xenia/base/platform.h"
#include "xenia/base/xxhash.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/spirv_shader_translator.h"
#include "xenia/gpu/vulkan/vulkan_render_target_cache.h"
@ -55,14 +55,19 @@ class VulkanPipelineCache {
uint32_t guest_address, const uint32_t* host_address,
uint32_t dword_count);
// Retrieves the shader modifications for the current state, and returns
// whether they are valid.
bool GetCurrentShaderModifications(
SpirvShaderTranslator::Modification& vertex_shader_modification_out,
SpirvShaderTranslator::Modification& pixel_shader_modification_out) const;
// Translates shaders if needed, also making shader info up to date.
bool EnsureShadersTranslated(
VulkanShader* vertex_shader, VulkanShader* pixel_shader,
Shader::HostVertexShaderType host_vertex_shader_type);
bool EnsureShadersTranslated(VulkanShader::VulkanTranslation* vertex_shader,
VulkanShader::VulkanTranslation* pixel_shader);
// TODO(Triang3l): Return a deferred creation handle.
bool ConfigurePipeline(VulkanShader* vertex_shader,
VulkanShader* pixel_shader,
bool ConfigurePipeline(VulkanShader::VulkanTranslation* vertex_shader,
VulkanShader::VulkanTranslation* pixel_shader,
VulkanRenderTargetCache::RenderPassKey render_pass_key,
VkPipeline& pipeline_out,
const PipelineLayoutProvider*& pipeline_layout_out);
@ -102,6 +107,8 @@ class VulkanPipelineCache {
uint64_t vertex_shader_hash;
// 0 if no pixel shader.
uint64_t pixel_shader_hash;
uint32_t vertex_shader_modification;
uint32_t pixel_shader_modification;
VulkanRenderTargetCache::RenderPassKey render_pass_key;
// Input assembly.
@ -126,7 +133,7 @@ class VulkanPipelineCache {
return std::memcmp(this, &description, sizeof(*this)) == 0;
}
void Reset() { std::memset(this, 0, sizeof(*this)); }
uint64_t GetHash() const { return XXH64(this, sizeof(*this), 0); }
uint64_t GetHash() const { return XXH3_64bits(this, sizeof(*this)); }
struct Hasher {
size_t operator()(const PipelineDescription& description) const {
return size_t(description.GetHash());
@ -146,17 +153,19 @@ class VulkanPipelineCache {
// creation threads, with everything needed from caches pre-looked-up.
struct PipelineCreationArguments {
std::pair<const PipelineDescription, Pipeline>* pipeline;
const VulkanShader* vertex_shader;
const VulkanShader* pixel_shader;
const VulkanShader::VulkanTranslation* vertex_shader;
const VulkanShader::VulkanTranslation* pixel_shader;
VkRenderPass render_pass;
};
// Can be called from multiple threads.
bool TranslateShader(SpirvShaderTranslator& translator, VulkanShader& shader,
bool TranslateShader(SpirvShaderTranslator& translator,
VulkanShader::VulkanTranslation& translation,
reg::SQ_PROGRAM_CNTL cntl);
bool GetCurrentStateDescription(
const VulkanShader* vertex_shader, const VulkanShader* pixel_shader,
const VulkanShader::VulkanTranslation* vertex_shader,
const VulkanShader::VulkanTranslation* pixel_shader,
VulkanRenderTargetCache::RenderPassKey render_pass_key,
PipelineDescription& description_out) const;

View File

@ -14,7 +14,7 @@
#include <cstring>
#include <unordered_map>
#include "third_party/xxhash/xxhash.h"
#include "xenia/base/xxhash.h"
#include "xenia/gpu/register_file.h"
#include "xenia/ui/vulkan/vulkan_provider.h"
@ -49,7 +49,7 @@ class VulkanRenderTargetCache {
return std::memcmp(this, &key, sizeof(*this)) == 0;
}
void Reset() { std::memset(this, 0, sizeof(*this)); }
uint64_t GetHash() const { return XXH64(this, sizeof(*this), 0); }
uint64_t GetHash() const { return XXH3_64bits(this, sizeof(*this)); }
struct Hasher {
size_t operator()(const FramebufferKey& description) const {
return size_t(description.GetHash());

View File

@ -11,22 +11,30 @@
#include <cstdint>
#include "xenia/ui/vulkan/vulkan_provider.h"
namespace xe {
namespace gpu {
namespace vulkan {
VulkanShader::VulkanShader(xenos::ShaderType shader_type, uint64_t data_hash,
const uint32_t* dword_ptr, uint32_t dword_count)
: Shader(shader_type, data_hash, dword_ptr, dword_count) {}
// Destroys the Vulkan shader module owned by this translation, if one has been
// created, using the provider stored in the owning VulkanShader.
VulkanShader::VulkanTranslation::~VulkanTranslation() {
  if (shader_module_ == VK_NULL_HANDLE) {
    // No module was ever created for this translation - nothing to release.
    return;
  }
  const VulkanShader& vulkan_shader =
      static_cast<const VulkanShader&>(shader());
  const ui::vulkan::VulkanProvider& vulkan_provider = vulkan_shader.provider_;
  vulkan_provider.dfn().vkDestroyShaderModule(vulkan_provider.device(),
                                              shader_module_, nullptr);
}
bool VulkanShader::InitializeShaderModule(
const ui::vulkan::VulkanProvider& provider) {
VkShaderModule VulkanShader::VulkanTranslation::GetOrCreateShaderModule() {
if (!is_valid()) {
return false;
return VK_NULL_HANDLE;
}
if (shader_module_ != VK_NULL_HANDLE) {
return true;
return shader_module_;
}
const ui::vulkan::VulkanProvider& provider =
static_cast<const VulkanShader&>(shader()).provider_;
VkShaderModuleCreateInfo shader_module_create_info;
shader_module_create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
shader_module_create_info.pNext = nullptr;
@ -37,10 +45,21 @@ bool VulkanShader::InitializeShaderModule(
if (provider.dfn().vkCreateShaderModule(provider.device(),
&shader_module_create_info, nullptr,
&shader_module_) != VK_SUCCESS) {
is_valid_ = false;
return false;
MakeInvalid();
return VK_NULL_HANDLE;
}
return true;
return shader_module_;
}
// Constructs the shader by forwarding the shader type, the ucode data hash and
// the ucode dwords to the base Shader, and keeps a reference to the Vulkan
// provider. The provider is stored by reference, not copied, so it must
// outlive this shader.
VulkanShader::VulkanShader(xenos::ShaderType shader_type, uint64_t data_hash,
const uint32_t* dword_ptr, uint32_t dword_count,
const ui::vulkan::VulkanProvider& provider)
: Shader(shader_type, data_hash, dword_ptr, dword_count),
provider_(provider) {}
// Allocates a new VulkanTranslation of this shader for the given modification
// bits and returns it as a base Translation pointer.
// NOTE(review): the object is created with a raw `new`; presumably the base
// Shader takes ownership and deletes it - confirm against Shader.
Shader::Translation* VulkanShader::CreateTranslationInstance(
uint32_t modification) {
return new VulkanTranslation(*this, modification);
}
} // namespace vulkan

View File

@ -22,14 +22,28 @@ namespace vulkan {
class VulkanShader : public Shader {
public:
VulkanShader(xenos::ShaderType shader_type, uint64_t data_hash,
const uint32_t* dword_ptr, uint32_t dword_count);
class VulkanTranslation : public Translation {
public:
VulkanTranslation(VulkanShader& shader, uint32_t modification)
: Translation(shader, modification) {}
~VulkanTranslation() override;
bool InitializeShaderModule(const ui::vulkan::VulkanProvider& provider);
VkShaderModule shader_module() const { return shader_module_; }
VkShaderModule GetOrCreateShaderModule();
VkShaderModule shader_module() const { return shader_module_; }
private:
VkShaderModule shader_module_ = VK_NULL_HANDLE;
};
VulkanShader(xenos::ShaderType shader_type, uint64_t data_hash,
const uint32_t* dword_ptr, uint32_t dword_count,
const ui::vulkan::VulkanProvider& provider);
protected:
Translation* CreateTranslationInstance(uint32_t modification) override;
private:
VkShaderModule shader_module_ = VK_NULL_HANDLE;
const ui::vulkan::VulkanProvider& provider_;
};
} // namespace vulkan

View File

@ -9,17 +9,41 @@
#include "xenia/gpu/xenos.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include "xenia/base/math.h"
namespace xe {
namespace gpu {
namespace xenos {
// Based on CFloat24 from d3dref9.dll and the 6e4 code from:
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
// Converts an IEEE-754 32-bit float to the 24-bit 20e4 floating-point format
// (4 exponent bits, 20 mantissa bits), rounding to the nearest even.
// Non-positive values, -0 and NaN produce 0; values too large for the format
// (including +infinity) saturate to 0xFFFFFF. The result is always within the
// low 24 bits.
uint32_t Float32To20e4(float f32) {
  if (!(f32 > 0.0f)) {
    // Positive only, and not -0 or NaN.
    return 0;
  }
  // Obtain the bit pattern via memcpy - dereferencing a reinterpret_cast'ed
  // pointer to a different type violates strict aliasing (undefined behavior).
  static_assert(sizeof(uint32_t) == sizeof(float),
                "float must be 32-bit to extract its bit pattern");
  uint32_t f32u32;
  std::memcpy(&f32u32, &f32, sizeof(f32u32));
  if (f32u32 >= 0x3FFFFFF8) {
    // Saturate.
    return 0xFFFFFF;
  }
  if (f32u32 < 0x38800000) {
    // The number is too small to be represented as a normalized 20e4.
    // Convert it to a denormalized value.
    // The shift is clamped to 24 so that the 24-bit significand (with the
    // implicit leading 1) is shifted out completely rather than by 32 or more
    // bits, which would be undefined.
    uint32_t shift = std::min(uint32_t(113 - (f32u32 >> 23)), uint32_t(24));
    f32u32 = (0x800000 | (f32u32 & 0x7FFFFF)) >> shift;
  } else {
    // Rebias the exponent to represent the value as a normalized 20e4.
    f32u32 += 0xC8000000u;
  }
  // Drop the 3 extra mantissa bits, rounding to the nearest even: add 3, plus
  // 1 more if the lowest kept bit is odd, then truncate.
  return ((f32u32 + 3 + ((f32u32 >> 3) & 1)) >> 3) & 0xFFFFFF;
}
float Float20e4To32(uint32_t f24) {
// Based on CFloat24 from d3dref9.dll and the 6e4 code from:
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows
// [0,2).
f24 &= 0xFFFFFF;
if (!f24) {
return 0.0f;

View File

@ -305,6 +305,9 @@ enum class DepthRenderTargetFormat : uint32_t {
const char* GetDepthRenderTargetFormatName(DepthRenderTargetFormat format);
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
// depth, rounding to the nearest even.
uint32_t Float32To20e4(float f32);
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
// IEEE-754 32-bit floating-point number.
float Float20e4To32(uint32_t f24);
@ -1036,10 +1039,9 @@ XEPACKEDUNION(xe_gpu_texture_fetch_t, {
ClampMode clamp_y : 3; // +13
ClampMode clamp_z : 3; // +16
SignedRepeatingFractionMode signed_rf_mode_all : 1; // +19
// TODO(Triang3l): 1 or 2 dim_tbd bits?
uint32_t unk_0 : 2; // +20
uint32_t pitch : 9; // +22 byte_pitch >> 5
uint32_t tiled : 1; // +31
uint32_t dim_tbd : 2; // +20
uint32_t pitch : 9; // +22 byte_pitch >> 5
uint32_t tiled : 1; // +31
TextureFormat format : 6; // +0 dword_1
Endian endianness : 2; // +6

View File

@ -38,6 +38,7 @@ DEFINE_string(hid, "any", "Input system. Use: [any, nop, sdl, winkey, xinput]",
"General");
#define MAX_USERS 4
#define ROW_HEIGHT_GENERAL 60
#define COL_WIDTH_STATE 320
#define COL_WIDTH_STROKE 416
@ -45,6 +46,7 @@ namespace xe {
namespace hid {
std::unique_ptr<xe::hid::InputSystem> input_system_;
bool is_active = true;
std::vector<std::unique_ptr<hid::InputDriver>> CreateInputDrivers(
ui::Window* window) {
@ -118,7 +120,7 @@ int hid_demo_main(const std::vector<std::string>& args) {
loop->on_quit.AddListener([&window](xe::ui::UIEvent* e) { window.reset(); });
// Initial size setting, done here so that it knows the menu exists.
window->Resize(COL_WIDTH_STATE + COL_WIDTH_STROKE, 500);
window->Resize(COL_WIDTH_STATE + COL_WIDTH_STROKE, ROW_HEIGHT_GENERAL + 500);
// Create the graphics context used for drawing and setup the window.
std::unique_ptr<xe::ui::GraphicsProvider> graphics_provider;
@ -133,7 +135,9 @@ int hid_demo_main(const std::vector<std::string>& args) {
input_system_ = std::make_unique<xe::hid::InputSystem>(window.get());
auto drivers = CreateInputDrivers(window.get());
for (size_t i = 0; i < drivers.size(); ++i) {
input_system_->AddDriver(std::move(drivers[i]));
auto& driver = drivers[i];
driver->set_is_active_callback([]() -> bool { return is_active; });
input_system_->AddDriver(std::move(driver));
}
window->Invalidate();
@ -149,10 +153,22 @@ int hid_demo_main(const std::vector<std::string>& args) {
ImGuiWindowFlags_NoCollapse | ImGuiWindowFlags_NoSavedSettings |
ImGuiWindowFlags_NoScrollbar;
ImGui::Begin("GetState()", nullptr, wflags);
ImGui::Begin("General", nullptr, wflags);
{
ImGui::SetWindowPos(ImVec2(0, 0));
ImGui::SetWindowSize(ImVec2(COL_WIDTH_STATE, io.DisplaySize.y));
ImGui::SetWindowSize(
ImVec2(COL_WIDTH_STATE + COL_WIDTH_STROKE, ROW_HEIGHT_GENERAL));
ImGui::Text("Input System (hid) = \"%s\"", cvars::hid.c_str());
ImGui::Checkbox("is_active", &is_active);
}
ImGui::End();
ImGui::Begin("GetState()", nullptr, wflags);
{
ImGui::SetWindowPos(ImVec2(0, ROW_HEIGHT_GENERAL));
ImGui::SetWindowSize(
ImVec2(COL_WIDTH_STATE, io.DisplaySize.y - ROW_HEIGHT_GENERAL));
static bool enable_GetState = false;
ImGui::Checkbox("Active", &enable_GetState);
@ -167,8 +183,9 @@ int hid_demo_main(const std::vector<std::string>& args) {
ImGui::Begin("GetKeystroke()", nullptr, wflags);
{
ImGui::SetWindowPos(ImVec2(COL_WIDTH_STATE, 0));
ImGui::SetWindowSize(ImVec2(COL_WIDTH_STROKE, io.DisplaySize.y));
ImGui::SetWindowPos(ImVec2(COL_WIDTH_STATE, ROW_HEIGHT_GENERAL));
ImGui::SetWindowSize(
ImVec2(COL_WIDTH_STROKE, io.DisplaySize.y - ROW_HEIGHT_GENERAL));
static bool enable_GetKeystroke = false;
static bool hide_repeats = false;

View File

@ -77,7 +77,7 @@ X_STATUS SDLInputDriver::Setup() {
sdl_events_initialized_ = true;
SDL_EventFilter event_filter{[](void* userdata, SDL_Event* event) -> int {
if (!userdata) {
if (!userdata || !event) {
assert_always();
return 0;
}
@ -102,17 +102,17 @@ X_STATUS SDLInputDriver::Setup() {
}
switch (type) {
case SDL_CONTROLLERDEVICEADDED:
driver->OnControllerDeviceAdded(event);
driver->OnControllerDeviceAdded(*event);
break;
case SDL_CONTROLLERDEVICEREMOVED:
driver->OnControllerDeviceRemoved(event);
driver->OnControllerDeviceRemoved(*event);
break;
case SDL_CONTROLLERAXISMOTION:
driver->OnControllerDeviceAxisMotion(event);
driver->OnControllerDeviceAxisMotion(*event);
break;
case SDL_CONTROLLERBUTTONDOWN:
case SDL_CONTROLLERBUTTONUP:
driver->OnControllerDeviceButtonChanged(event);
driver->OnControllerDeviceButtonChanged(*event);
break;
default:
break;
@ -193,7 +193,11 @@ X_RESULT SDLInputDriver::GetState(uint32_t user_index,
return X_ERROR_BAD_ARGUMENTS;
}
QueueControllerUpdate();
auto is_active = this->is_active();
if (is_active) {
QueueControllerUpdate();
}
std::unique_lock<std::mutex> guard(controllers_mutex_);
@ -203,12 +207,20 @@ X_RESULT SDLInputDriver::GetState(uint32_t user_index,
}
// Make sure packet_number is only incremented by 1, even if there have been
// multiple updates between GetState calls.
if (controller->state_changed) {
// multiple updates between GetState calls. Also track `is_active` to
// increment the packet number if it changed.
if ((is_active != controller->is_active) ||
(is_active && controller->state_changed)) {
controller->state.packet_number++;
controller->is_active = is_active;
controller->state_changed = false;
}
*out_state = controller->state;
std::memcpy(out_state, &controller->state, sizeof(*out_state));
if (!is_active) {
// Simulate an "untouched" controller. When we become active again the
// pressed buttons aren't lost and will be visible again.
std::memset(&out_state->gamepad, 0, sizeof(out_state->gamepad));
}
return X_ERROR_SUCCESS;
}
@ -242,6 +254,8 @@ X_RESULT SDLInputDriver::SetState(uint32_t user_index,
X_RESULT SDLInputDriver::GetKeystroke(uint32_t users, uint32_t flags,
X_INPUT_KEYSTROKE* out_keystroke) {
// TODO(JoelLinn): Figure out the flags
// https://github.com/evilC/UCR/blob/0489929e2a8e39caa3484c67f3993d3fba39e46f/Libraries/XInput.ahk#L85-L98
assert(sdl_events_initialized_ && sdl_gamecontroller_initialized_);
bool user_any = users == 0xFF;
if (users >= HID_SDL_USER_COUNT && !user_any) {
@ -296,7 +310,11 @@ X_RESULT SDLInputDriver::GetKeystroke(uint32_t users, uint32_t flags,
X_INPUT_GAMEPAD_VK_RTHUMB_DOWNLEFT,
};
QueueControllerUpdate();
auto is_active = this->is_active();
if (is_active) {
QueueControllerUpdate();
}
std::unique_lock<std::mutex> guard(controllers_mutex_);
@ -311,8 +329,13 @@ X_RESULT SDLInputDriver::GetKeystroke(uint32_t users, uint32_t flags,
}
}
const uint64_t curr_butts = controller->state.gamepad.buttons |
AnalogToKeyfield(controller->state.gamepad);
// If input is not active (e.g. due to a dialog overlay), force buttons to
// "unpressed". The algorithm will automatically send UP events when
// `is_active()` goes low and DOWN events when it goes high again.
const uint64_t curr_butts =
is_active ? (controller->state.gamepad.buttons |
AnalogToKeyfield(controller->state.gamepad))
: uint64_t(0);
KeystrokeState& last = keystroke_states_.at(user_index);
// Handle repeating
@ -384,12 +407,12 @@ X_RESULT SDLInputDriver::GetKeystroke(uint32_t users, uint32_t flags,
return X_ERROR_EMPTY;
}
void SDLInputDriver::OnControllerDeviceAdded(SDL_Event* event) {
void SDLInputDriver::OnControllerDeviceAdded(const SDL_Event& event) {
assert(window()->loop()->is_on_loop_thread());
std::unique_lock<std::mutex> guard(controllers_mutex_);
// Open the controller.
const auto controller = SDL_GameControllerOpen(event->cdevice.which);
const auto controller = SDL_GameControllerOpen(event.cdevice.which);
if (!controller) {
assert_always();
return;
@ -423,52 +446,52 @@ void SDLInputDriver::OnControllerDeviceAdded(SDL_Event* event) {
}
}
void SDLInputDriver::OnControllerDeviceRemoved(SDL_Event* event) {
void SDLInputDriver::OnControllerDeviceRemoved(const SDL_Event& event) {
assert(window()->loop()->is_on_loop_thread());
std::unique_lock<std::mutex> guard(controllers_mutex_);
// Find the disconnected gamecontroller and close it.
auto [found, i] = GetControllerIndexFromInstanceID(event->cdevice.which);
assert(found);
SDL_GameControllerClose(controllers_.at(i).sdl);
controllers_.at(i) = {};
keystroke_states_.at(i) = {};
auto idx = GetControllerIndexFromInstanceID(event.cdevice.which);
assert(idx);
SDL_GameControllerClose(controllers_.at(*idx).sdl);
controllers_.at(*idx) = {};
keystroke_states_.at(*idx) = {};
}
void SDLInputDriver::OnControllerDeviceAxisMotion(SDL_Event* event) {
void SDLInputDriver::OnControllerDeviceAxisMotion(const SDL_Event& event) {
assert(window()->loop()->is_on_loop_thread());
std::unique_lock<std::mutex> guard(controllers_mutex_);
auto [found, i] = GetControllerIndexFromInstanceID(event->caxis.which);
assert(found);
auto& pad = controllers_.at(i).state.gamepad;
switch (event->caxis.axis) {
auto idx = GetControllerIndexFromInstanceID(event.caxis.which);
assert(idx);
auto& pad = controllers_.at(*idx).state.gamepad;
switch (event.caxis.axis) {
case SDL_CONTROLLER_AXIS_LEFTX:
pad.thumb_lx = event->caxis.value;
pad.thumb_lx = event.caxis.value;
break;
case SDL_CONTROLLER_AXIS_LEFTY:
pad.thumb_ly = ~event->caxis.value;
pad.thumb_ly = ~event.caxis.value;
break;
case SDL_CONTROLLER_AXIS_RIGHTX:
pad.thumb_rx = event->caxis.value;
pad.thumb_rx = event.caxis.value;
break;
case SDL_CONTROLLER_AXIS_RIGHTY:
pad.thumb_ry = ~event->caxis.value;
pad.thumb_ry = ~event.caxis.value;
break;
case SDL_CONTROLLER_AXIS_TRIGGERLEFT:
pad.left_trigger = static_cast<uint8_t>(event->caxis.value >> 7);
pad.left_trigger = static_cast<uint8_t>(event.caxis.value >> 7);
break;
case SDL_CONTROLLER_AXIS_TRIGGERRIGHT:
pad.right_trigger = static_cast<uint8_t>(event->caxis.value >> 7);
pad.right_trigger = static_cast<uint8_t>(event.caxis.value >> 7);
break;
default:
assert_always();
break;
}
controllers_.at(i).state_changed = true;
controllers_.at(*idx).state_changed = true;
}
void SDLInputDriver::OnControllerDeviceButtonChanged(SDL_Event* event) {
void SDLInputDriver::OnControllerDeviceButtonChanged(const SDL_Event& event) {
assert(window()->loop()->is_on_loop_thread());
std::unique_lock<std::mutex> guard(controllers_mutex_);
@ -492,15 +515,15 @@ void SDLInputDriver::OnControllerDeviceButtonChanged(SDL_Event* event) {
X_INPUT_GAMEPAD_DPAD_LEFT,
X_INPUT_GAMEPAD_DPAD_RIGHT};
auto [found, i] = GetControllerIndexFromInstanceID(event->cbutton.which);
assert(found);
auto& controller = controllers_.at(i);
auto idx = GetControllerIndexFromInstanceID(event.cbutton.which);
assert(idx);
auto& controller = controllers_.at(*idx);
uint16_t xbuttons = controller.state.gamepad.buttons;
// Lookup the XInput button code.
auto xbutton = xbutton_lookup.at(event->cbutton.button);
auto xbutton = xbutton_lookup.at(event.cbutton.button);
// Pressed or released?
if (event->cbutton.state == SDL_PRESSED) {
if (event.cbutton.state == SDL_PRESSED) {
if (xbutton == X_INPUT_GAMEPAD_GUIDE && !cvars::guide_button) {
return;
}
@ -512,7 +535,7 @@ void SDLInputDriver::OnControllerDeviceButtonChanged(SDL_Event* event) {
controller.state_changed = true;
}
std::pair<bool, size_t> SDLInputDriver::GetControllerIndexFromInstanceID(
std::optional<size_t> SDLInputDriver::GetControllerIndexFromInstanceID(
SDL_JoystickID instance_id) {
// Loop through our controllers and try to match the given ID.
for (size_t i = 0; i < controllers_.size(); i++) {
@ -525,10 +548,10 @@ std::pair<bool, size_t> SDLInputDriver::GetControllerIndexFromInstanceID(
auto joy_instance_id = SDL_JoystickInstanceID(joystick);
assert(joy_instance_id >= 0);
if (joy_instance_id == instance_id) {
return {true, i};
return i;
}
}
return {false, 0};
return std::nullopt;
}
SDLInputDriver::ControllerState* SDLInputDriver::GetControllerState(

View File

@ -13,6 +13,7 @@
#include <array>
#include <atomic>
#include <mutex>
#include <optional>
#include "SDL.h"
#include "xenia/hid/input_driver.h"
@ -44,8 +45,9 @@ class SDLInputDriver : public InputDriver {
protected:
// Per-controller bookkeeping entry kept by the driver.
//   sdl           - SDL game controller handle for this entry.
//   state         - X_INPUT_STATE snapshot; the button handler reads
//                   state.gamepad.buttons from it.
//   state_changed - set to true by the axis-motion and button handlers once
//                   they have processed an event for this entry.
//   is_active     - NOTE(review): meaning is not visible in this chunk;
//                   confirm against the code that sets it.
struct ControllerState {
SDL_GameController* sdl;
bool state_changed;
X_INPUT_STATE state;
bool state_changed;
bool is_active;
};
enum class RepeatState {
@ -63,11 +65,11 @@ class SDLInputDriver : public InputDriver {
};
protected:
void OnControllerDeviceAdded(SDL_Event* event);
void OnControllerDeviceRemoved(SDL_Event* event);
void OnControllerDeviceAxisMotion(SDL_Event* event);
void OnControllerDeviceButtonChanged(SDL_Event* event);
std::pair<bool, size_t> GetControllerIndexFromInstanceID(
void OnControllerDeviceAdded(const SDL_Event& event);
void OnControllerDeviceRemoved(const SDL_Event& event);
void OnControllerDeviceAxisMotion(const SDL_Event& event);
void OnControllerDeviceButtonChanged(const SDL_Event& event);
std::optional<size_t> GetControllerIndexFromInstanceID(
SDL_JoystickID instance_id);
ControllerState* GetControllerState(uint32_t user_index);
bool TestSDLVersion() const;

View File

@ -202,7 +202,7 @@ class UserProfile {
// Returns the stored xuid_ value.
uint64_t xuid() const { return xuid_; }
// Returns name_ by value (a copy of the stored string).
std::string name() const { return name_; }
// Always returns the constant 1.
// NOTE(review): the meaning of 1 is not shown here - confirm against the
// sign-in state values the callers expect.
uint32_t signin_state() const { return 1; }
uint32_t type() const { return 2; /* online profile? */ }
// Always returns the constant 1 | 2; the inline comment tentatively labels
// the two bits as local and online.
uint32_t type() const { return 1 | 2; /* local | online profile? */ }
void AddSetting(std::unique_ptr<Setting> setting);
Setting* GetSetting(uint32_t setting_id);

View File

@ -32,50 +32,44 @@ uint32_t xeXamEnumerate(uint32_t handle, uint32_t flags, void* buffer,
uint32_t overlapped_ptr) {
assert_true(flags == 0);
auto e = kernel_state()->object_table()->LookupObject<XEnumerator>(handle);
if (!e) {
if (overlapped_ptr) {
kernel_state()->CompleteOverlappedImmediateEx(
overlapped_ptr, X_ERROR_INVALID_HANDLE, X_ERROR_INVALID_HANDLE, 0);
return X_ERROR_IO_PENDING;
} else {
return X_ERROR_INVALID_HANDLE;
}
}
size_t actual_buffer_length = buffer_length;
if (buffer_length == e->items_per_enumerate()) {
actual_buffer_length = e->item_size() * e->items_per_enumerate();
// Known culprits:
// Final Fight: Double Impact (saves)
XELOGW(
"Broken usage of XamEnumerate! buffer length={:X} vs actual "
"length={:X} "
"(item size={:X}, items per enumerate={})",
(uint32_t)buffer_length, actual_buffer_length, e->item_size(),
e->items_per_enumerate());
}
std::memset(buffer, 0, actual_buffer_length);
X_RESULT result;
uint32_t item_count = 0;
if (actual_buffer_length < e->item_size()) {
result = X_ERROR_INSUFFICIENT_BUFFER;
} else if (e->current_item() >= e->item_count()) {
result = X_ERROR_NO_MORE_FILES;
auto e = kernel_state()->object_table()->LookupObject<XEnumerator>(handle);
if (!e) {
result = X_ERROR_INVALID_HANDLE;
} else {
auto item_buffer = static_cast<uint8_t*>(buffer);
auto max_items = actual_buffer_length / e->item_size();
while (max_items--) {
if (!e->WriteItem(item_buffer)) {
break;
}
item_buffer += e->item_size();
item_count++;
size_t actual_buffer_length = buffer_length;
if (buffer_length == e->items_per_enumerate()) {
actual_buffer_length = e->item_size() * e->items_per_enumerate();
// Known culprits:
// Final Fight: Double Impact (saves)
XELOGW(
"Broken usage of XamEnumerate! buffer length={:X} vs actual "
"length={:X} "
"(item size={:X}, items per enumerate={})",
(uint32_t)buffer_length, actual_buffer_length, e->item_size(),
e->items_per_enumerate());
}
std::memset(buffer, 0, actual_buffer_length);
if (actual_buffer_length < e->item_size()) {
result = X_ERROR_INSUFFICIENT_BUFFER;
} else if (e->current_item() >= e->item_count()) {
result = X_ERROR_NO_MORE_FILES;
} else {
auto item_buffer = static_cast<uint8_t*>(buffer);
auto max_items = actual_buffer_length / e->item_size();
while (max_items--) {
if (!e->WriteItem(item_buffer)) {
break;
}
item_buffer += e->item_size();
item_count++;
}
result = X_ERROR_SUCCESS;
}
result = X_ERROR_SUCCESS;
}
if (items_returned) {

View File

@ -958,6 +958,11 @@ dword_result_t NetDll___WSAFDIsSet(dword_t socket_handle,
}
DECLARE_XAM_EXPORT1(NetDll___WSAFDIsSet, kNetworking, kImplemented);
// WSASetLastError export: forwards error_code unchanged to
// XThread::SetLastError and returns nothing.
// NOTE(review): presumably this is the same last-error slot the other
// NetDll_* functions report through - confirm in XThread.
void NetDll_WSASetLastError(dword_t error_code) {
XThread::SetLastError(error_code);
}
// Declares the function as a XAM export tagged kNetworking / kImplemented.
DECLARE_XAM_EXPORT1(NetDll_WSASetLastError, kNetworking, kImplemented);
// Registration hook for this file's exports. The body is empty and neither
// parameter is used.
// NOTE(review): the exports above are declared through DECLARE_XAM_EXPORT*
// macros, which presumably handle registration - confirm.
void RegisterNetExports(xe::cpu::ExportResolver* export_resolver,
KernelState* kernel_state) {}

View File

@ -142,7 +142,8 @@ dword_result_t NtCreateFile(lpdword_t handle_out, dword_t desired_access,
X_STATUS result = kernel_state()->file_system()->OpenFile(
root_entry, target_path,
vfs::FileDisposition((uint32_t)creation_disposition), desired_access,
(create_options & CreateOptions::FILE_DIRECTORY_FILE) != 0, &vfs_file,
(create_options & CreateOptions::FILE_DIRECTORY_FILE) != 0,
(create_options & CreateOptions::FILE_NON_DIRECTORY_FILE) != 0, &vfs_file,
&file_action);
object_ref<XFile> file = nullptr;

View File

@ -135,8 +135,10 @@ dword_result_t NtAllocateVirtualMemory(lpdword_t base_addr_ptr,
}
uint32_t protect = FromXdkProtectFlags(protect_bits);
uint32_t address = 0;
BaseHeap* heap;
if (adjusted_base != 0) {
auto heap = kernel_memory()->LookupHeap(adjusted_base);
heap = kernel_memory()->LookupHeap(adjusted_base);
if (heap->page_size() != page_size) {
// Specified the wrong page size for the wrong heap.
return X_STATUS_ACCESS_DENIED;
@ -148,7 +150,7 @@ dword_result_t NtAllocateVirtualMemory(lpdword_t base_addr_ptr,
}
} else {
bool top_down = !!(alloc_type & X_MEM_TOP_DOWN);
auto heap = kernel_memory()->LookupHeapByType(false, page_size);
heap = kernel_memory()->LookupHeapByType(false, page_size);
heap->Alloc(adjusted_size, page_size, allocation_type, protect, top_down,
&address);
}
@ -160,7 +162,14 @@ dword_result_t NtAllocateVirtualMemory(lpdword_t base_addr_ptr,
// Zero memory, if needed.
if (address && !(alloc_type & X_MEM_NOZERO)) {
if (alloc_type & X_MEM_COMMIT) {
if (!(protect & kMemoryProtectWrite)) {
heap->Protect(address, adjusted_size,
kMemoryProtectRead | kMemoryProtectWrite);
}
kernel_memory()->Zero(address, adjusted_size);
if (!(protect & kMemoryProtectWrite)) {
heap->Protect(address, adjusted_size, protect);
}
}
}
@ -400,7 +409,7 @@ dword_result_t MmQueryAddressProtect(dword_t base_address) {
if (!heap->QueryProtect(base_address, &access)) {
access = 0;
}
access = ToXdkProtectFlags(access);
access = !access ? 0 : ToXdkProtectFlags(access);
return access;
}

View File

@ -205,22 +205,30 @@ dword_result_t NtSuspendThread(dword_t handle, lpdword_t suspend_count_ptr) {
}
DECLARE_XBOXKRNL_EXPORT1(NtSuspendThread, kThreading, kImplemented);
// Kernel export that updates stack bookkeeping for the current thread.
// As visible in this block it:
//   - writes stack_ptr's guest address into r[1] of the current thread's
//     context;
//   - stores stack_alloc_base / stack_base / stack_limit on the thread
//     structure;
//   - mirrors stack_base / stack_limit into the X_KPCR reached by
//     translating the guest address held in r[13];
//   - when the passed X_KTHREAD has a fiber_ptr and is the current thread's
//     guest object, calls Reenter() with the context's lr.
void KeSetCurrentStackPointers(lpvoid_t stack_ptr,
pointer_t<X_KTHREAD> cur_thread,
void KeSetCurrentStackPointers(lpvoid_t stack_ptr, pointer_t<X_KTHREAD> thread,
lpvoid_t stack_alloc_base, lpvoid_t stack_base,
lpvoid_t stack_limit) {
auto thread = XThread::GetCurrentThread();
auto context = thread->thread_state()->context();
context->r[1] = stack_ptr.guest_address();
auto current_thread = XThread::GetCurrentThread();
auto context = current_thread->thread_state()->context();
auto pcr = kernel_memory()->TranslateVirtual<X_KPCR*>(
static_cast<uint32_t>(context->r[13]));
auto pcr =
kernel_memory()->TranslateVirtual<X_KPCR*>((uint32_t)context->r[13]);
thread->stack_alloc_base = stack_alloc_base.value();
thread->stack_base = stack_base.value();
thread->stack_limit = stack_limit.value();
pcr->stack_base_ptr = stack_base.guest_address();
pcr->stack_end_ptr = stack_limit.guest_address();
context->r[1] = stack_ptr.guest_address();
// TODO: Do we need to set the stack info on cur_thread?
// If a fiber is set, and the thread matches, reenter to avoid issues with
// host stack overflowing.
if (thread->fiber_ptr &&
current_thread->guest_object() == thread.guest_address()) {
current_thread->Reenter(static_cast<uint32_t>(context->lr));
}
}
DECLARE_XBOXKRNL_EXPORT1(KeSetCurrentStackPointers, kThreading, kImplemented);
DECLARE_XBOXKRNL_EXPORT2(KeSetCurrentStackPointers, kThreading, kImplemented,
kHighFrequency);
dword_result_t KeSetAffinityThread(lpvoid_t thread_ptr, dword_t affinity,
lpdword_t previous_affinity_ptr) {

Some files were not shown because too many files have changed in this diff Show More