From 7931dbe1802d0ca4daa4261e6dd0f6d06327f32a Mon Sep 17 00:00:00 2001 From: Joel Linn Date: Sat, 28 Nov 2020 15:45:03 +0100 Subject: [PATCH 01/29] [HID] `is_active` debugging for hid-demo. --- src/xenia/hid/hid_demo.cc | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/xenia/hid/hid_demo.cc b/src/xenia/hid/hid_demo.cc index 1829d2b47..44d8c780c 100644 --- a/src/xenia/hid/hid_demo.cc +++ b/src/xenia/hid/hid_demo.cc @@ -38,6 +38,7 @@ DEFINE_string(hid, "any", "Input system. Use: [any, nop, sdl, winkey, xinput]", "General"); #define MAX_USERS 4 +#define ROW_HEIGHT_GENERAL 60 #define COL_WIDTH_STATE 320 #define COL_WIDTH_STROKE 416 @@ -45,6 +46,7 @@ namespace xe { namespace hid { std::unique_ptr input_system_; +bool is_active = true; std::vector> CreateInputDrivers( ui::Window* window) { @@ -118,7 +120,7 @@ int hid_demo_main(const std::vector& args) { loop->on_quit.AddListener([&window](xe::ui::UIEvent* e) { window.reset(); }); // Initial size setting, done here so that it knows the menu exists. - window->Resize(COL_WIDTH_STATE + COL_WIDTH_STROKE, 500); + window->Resize(COL_WIDTH_STATE + COL_WIDTH_STROKE, ROW_HEIGHT_GENERAL + 500); // Create the graphics context used for drawing and setup the window. 
std::unique_ptr graphics_provider; @@ -133,7 +135,9 @@ int hid_demo_main(const std::vector& args) { input_system_ = std::make_unique(window.get()); auto drivers = CreateInputDrivers(window.get()); for (size_t i = 0; i < drivers.size(); ++i) { - input_system_->AddDriver(std::move(drivers[i])); + auto& driver = drivers[i]; + driver->set_is_active_callback([]() -> bool { return is_active; }); + input_system_->AddDriver(std::move(driver)); } window->Invalidate(); @@ -149,10 +153,22 @@ int hid_demo_main(const std::vector& args) { ImGuiWindowFlags_NoCollapse | ImGuiWindowFlags_NoSavedSettings | ImGuiWindowFlags_NoScrollbar; - ImGui::Begin("GetState()", nullptr, wflags); + ImGui::Begin("General", nullptr, wflags); { ImGui::SetWindowPos(ImVec2(0, 0)); - ImGui::SetWindowSize(ImVec2(COL_WIDTH_STATE, io.DisplaySize.y)); + ImGui::SetWindowSize( + ImVec2(COL_WIDTH_STATE + COL_WIDTH_STROKE, ROW_HEIGHT_GENERAL)); + + ImGui::Text("Input System (hid) = \"%s\"", cvars::hid.c_str()); + ImGui::Checkbox("is_active", &is_active); + } + ImGui::End(); + + ImGui::Begin("GetState()", nullptr, wflags); + { + ImGui::SetWindowPos(ImVec2(0, ROW_HEIGHT_GENERAL)); + ImGui::SetWindowSize( + ImVec2(COL_WIDTH_STATE, io.DisplaySize.y - ROW_HEIGHT_GENERAL)); static bool enable_GetState = false; ImGui::Checkbox("Active", &enable_GetState); @@ -167,8 +183,9 @@ int hid_demo_main(const std::vector& args) { ImGui::Begin("GetKeystroke()", nullptr, wflags); { - ImGui::SetWindowPos(ImVec2(COL_WIDTH_STATE, 0)); - ImGui::SetWindowSize(ImVec2(COL_WIDTH_STROKE, io.DisplaySize.y)); + ImGui::SetWindowPos(ImVec2(COL_WIDTH_STATE, ROW_HEIGHT_GENERAL)); + ImGui::SetWindowSize( + ImVec2(COL_WIDTH_STROKE, io.DisplaySize.y - ROW_HEIGHT_GENERAL)); static bool enable_GetKeystroke = false; static bool hide_repeats = false; From ff56fbdf46e8a15f73f6cc60f1445e25084d46c6 Mon Sep 17 00:00:00 2001 From: Joel Linn Date: Sat, 28 Nov 2020 15:46:25 +0100 Subject: [PATCH 02/29] [HID] Honor `is_active()` in SDL input backend. 
--- src/xenia/hid/sdl/sdl_input_driver.cc | 37 ++++++++++++++++++++++----- src/xenia/hid/sdl/sdl_input_driver.h | 3 ++- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/xenia/hid/sdl/sdl_input_driver.cc b/src/xenia/hid/sdl/sdl_input_driver.cc index fb77397da..c1404c358 100644 --- a/src/xenia/hid/sdl/sdl_input_driver.cc +++ b/src/xenia/hid/sdl/sdl_input_driver.cc @@ -193,7 +193,11 @@ X_RESULT SDLInputDriver::GetState(uint32_t user_index, return X_ERROR_BAD_ARGUMENTS; } - QueueControllerUpdate(); + auto is_active = this->is_active(); + + if (is_active) { + QueueControllerUpdate(); + } std::unique_lock guard(controllers_mutex_); @@ -203,12 +207,20 @@ X_RESULT SDLInputDriver::GetState(uint32_t user_index, } // Make sure packet_number is only incremented by 1, even if there have been - // multiple updates between GetState calls. - if (controller->state_changed) { + // multiple updates between GetState calls. Also track `is_active` to + // increment the packet number if it changed. + if ((is_active != controller->is_active) || + (is_active && controller->state_changed)) { controller->state.packet_number++; + controller->is_active = is_active; controller->state_changed = false; } - *out_state = controller->state; + std::memcpy(out_state, &controller->state, sizeof(*out_state)); + if (!is_active) { + // Simulate an "untouched" controller. When we become active again the + // pressed buttons aren't lost and will be visible again. 
+ std::memset(&out_state->gamepad, 0, sizeof(out_state->gamepad)); + } return X_ERROR_SUCCESS; } @@ -242,6 +254,8 @@ X_RESULT SDLInputDriver::SetState(uint32_t user_index, X_RESULT SDLInputDriver::GetKeystroke(uint32_t users, uint32_t flags, X_INPUT_KEYSTROKE* out_keystroke) { + // TODO(JoelLinn): Figure out the flags + // https://github.com/evilC/UCR/blob/0489929e2a8e39caa3484c67f3993d3fba39e46f/Libraries/XInput.ahk#L85-L98 assert(sdl_events_initialized_ && sdl_gamecontroller_initialized_); bool user_any = users == 0xFF; if (users >= HID_SDL_USER_COUNT && !user_any) { @@ -296,7 +310,11 @@ X_RESULT SDLInputDriver::GetKeystroke(uint32_t users, uint32_t flags, X_INPUT_GAMEPAD_VK_RTHUMB_DOWNLEFT, }; - QueueControllerUpdate(); + auto is_active = this->is_active(); + + if (is_active) { + QueueControllerUpdate(); + } std::unique_lock guard(controllers_mutex_); @@ -311,8 +329,13 @@ X_RESULT SDLInputDriver::GetKeystroke(uint32_t users, uint32_t flags, } } - const uint64_t curr_butts = controller->state.gamepad.buttons | - AnalogToKeyfield(controller->state.gamepad); + // If input is not active (e.g. due to a dialog overlay), force buttons to + // "unpressed". The algorithm will automatically send UP events when + // `is_active()` goes low and DOWN events when it goes high again. + const uint64_t curr_butts = + is_active ? 
(controller->state.gamepad.buttons | + AnalogToKeyfield(controller->state.gamepad)) + : uint64_t(0); KeystrokeState& last = keystroke_states_.at(user_index); // Handle repeating diff --git a/src/xenia/hid/sdl/sdl_input_driver.h b/src/xenia/hid/sdl/sdl_input_driver.h index 84555f70d..033862cd7 100644 --- a/src/xenia/hid/sdl/sdl_input_driver.h +++ b/src/xenia/hid/sdl/sdl_input_driver.h @@ -44,8 +44,9 @@ class SDLInputDriver : public InputDriver { protected: struct ControllerState { SDL_GameController* sdl; - bool state_changed; X_INPUT_STATE state; + bool state_changed; + bool is_active; }; enum class RepeatState { From 842ac86b1fb6a77133c9d705e930600c4c143a83 Mon Sep 17 00:00:00 2001 From: Joel Linn Date: Sat, 28 Nov 2020 16:17:08 +0100 Subject: [PATCH 03/29] [HID] More modern c++ in SDL backend. --- src/xenia/hid/sdl/sdl_input_driver.cc | 68 +++++++++++++-------------- src/xenia/hid/sdl/sdl_input_driver.h | 11 +++-- 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/src/xenia/hid/sdl/sdl_input_driver.cc b/src/xenia/hid/sdl/sdl_input_driver.cc index c1404c358..97a21cc8d 100644 --- a/src/xenia/hid/sdl/sdl_input_driver.cc +++ b/src/xenia/hid/sdl/sdl_input_driver.cc @@ -77,7 +77,7 @@ X_STATUS SDLInputDriver::Setup() { sdl_events_initialized_ = true; SDL_EventFilter event_filter{[](void* userdata, SDL_Event* event) -> int { - if (!userdata) { + if (!userdata || !event) { assert_always(); return 0; } @@ -102,17 +102,17 @@ X_STATUS SDLInputDriver::Setup() { } switch (type) { case SDL_CONTROLLERDEVICEADDED: - driver->OnControllerDeviceAdded(event); + driver->OnControllerDeviceAdded(*event); break; case SDL_CONTROLLERDEVICEREMOVED: - driver->OnControllerDeviceRemoved(event); + driver->OnControllerDeviceRemoved(*event); break; case SDL_CONTROLLERAXISMOTION: - driver->OnControllerDeviceAxisMotion(event); + driver->OnControllerDeviceAxisMotion(*event); break; case SDL_CONTROLLERBUTTONDOWN: case SDL_CONTROLLERBUTTONUP: - 
driver->OnControllerDeviceButtonChanged(event); + driver->OnControllerDeviceButtonChanged(*event); break; default: break; @@ -407,12 +407,12 @@ X_RESULT SDLInputDriver::GetKeystroke(uint32_t users, uint32_t flags, return X_ERROR_EMPTY; } -void SDLInputDriver::OnControllerDeviceAdded(SDL_Event* event) { +void SDLInputDriver::OnControllerDeviceAdded(const SDL_Event& event) { assert(window()->loop()->is_on_loop_thread()); std::unique_lock guard(controllers_mutex_); // Open the controller. - const auto controller = SDL_GameControllerOpen(event->cdevice.which); + const auto controller = SDL_GameControllerOpen(event.cdevice.which); if (!controller) { assert_always(); return; @@ -446,52 +446,52 @@ void SDLInputDriver::OnControllerDeviceAdded(SDL_Event* event) { } } -void SDLInputDriver::OnControllerDeviceRemoved(SDL_Event* event) { +void SDLInputDriver::OnControllerDeviceRemoved(const SDL_Event& event) { assert(window()->loop()->is_on_loop_thread()); std::unique_lock guard(controllers_mutex_); // Find the disconnected gamecontroller and close it. 
- auto [found, i] = GetControllerIndexFromInstanceID(event->cdevice.which); - assert(found); - SDL_GameControllerClose(controllers_.at(i).sdl); - controllers_.at(i) = {}; - keystroke_states_.at(i) = {}; + auto idx = GetControllerIndexFromInstanceID(event.cdevice.which); + assert(idx); + SDL_GameControllerClose(controllers_.at(*idx).sdl); + controllers_.at(*idx) = {}; + keystroke_states_.at(*idx) = {}; } -void SDLInputDriver::OnControllerDeviceAxisMotion(SDL_Event* event) { +void SDLInputDriver::OnControllerDeviceAxisMotion(const SDL_Event& event) { assert(window()->loop()->is_on_loop_thread()); std::unique_lock guard(controllers_mutex_); - auto [found, i] = GetControllerIndexFromInstanceID(event->caxis.which); - assert(found); - auto& pad = controllers_.at(i).state.gamepad; - switch (event->caxis.axis) { + auto idx = GetControllerIndexFromInstanceID(event.caxis.which); + assert(idx); + auto& pad = controllers_.at(*idx).state.gamepad; + switch (event.caxis.axis) { case SDL_CONTROLLER_AXIS_LEFTX: - pad.thumb_lx = event->caxis.value; + pad.thumb_lx = event.caxis.value; break; case SDL_CONTROLLER_AXIS_LEFTY: - pad.thumb_ly = ~event->caxis.value; + pad.thumb_ly = ~event.caxis.value; break; case SDL_CONTROLLER_AXIS_RIGHTX: - pad.thumb_rx = event->caxis.value; + pad.thumb_rx = event.caxis.value; break; case SDL_CONTROLLER_AXIS_RIGHTY: - pad.thumb_ry = ~event->caxis.value; + pad.thumb_ry = ~event.caxis.value; break; case SDL_CONTROLLER_AXIS_TRIGGERLEFT: - pad.left_trigger = static_cast(event->caxis.value >> 7); + pad.left_trigger = static_cast(event.caxis.value >> 7); break; case SDL_CONTROLLER_AXIS_TRIGGERRIGHT: - pad.right_trigger = static_cast(event->caxis.value >> 7); + pad.right_trigger = static_cast(event.caxis.value >> 7); break; default: assert_always(); break; } - controllers_.at(i).state_changed = true; + controllers_.at(*idx).state_changed = true; } -void SDLInputDriver::OnControllerDeviceButtonChanged(SDL_Event* event) { +void 
SDLInputDriver::OnControllerDeviceButtonChanged(const SDL_Event& event) { assert(window()->loop()->is_on_loop_thread()); std::unique_lock guard(controllers_mutex_); @@ -515,15 +515,15 @@ void SDLInputDriver::OnControllerDeviceButtonChanged(SDL_Event* event) { X_INPUT_GAMEPAD_DPAD_LEFT, X_INPUT_GAMEPAD_DPAD_RIGHT}; - auto [found, i] = GetControllerIndexFromInstanceID(event->cbutton.which); - assert(found); - auto& controller = controllers_.at(i); + auto idx = GetControllerIndexFromInstanceID(event.cbutton.which); + assert(idx); + auto& controller = controllers_.at(*idx); uint16_t xbuttons = controller.state.gamepad.buttons; // Lookup the XInput button code. - auto xbutton = xbutton_lookup.at(event->cbutton.button); + auto xbutton = xbutton_lookup.at(event.cbutton.button); // Pressed or released? - if (event->cbutton.state == SDL_PRESSED) { + if (event.cbutton.state == SDL_PRESSED) { if (xbutton == X_INPUT_GAMEPAD_GUIDE && !cvars::guide_button) { return; } @@ -535,7 +535,7 @@ void SDLInputDriver::OnControllerDeviceButtonChanged(SDL_Event* event) { controller.state_changed = true; } -std::pair SDLInputDriver::GetControllerIndexFromInstanceID( +std::optional SDLInputDriver::GetControllerIndexFromInstanceID( SDL_JoystickID instance_id) { // Loop through our controllers and try to match the given ID. 
for (size_t i = 0; i < controllers_.size(); i++) { @@ -548,10 +548,10 @@ std::pair SDLInputDriver::GetControllerIndexFromInstanceID( auto joy_instance_id = SDL_JoystickInstanceID(joystick); assert(joy_instance_id >= 0); if (joy_instance_id == instance_id) { - return {true, i}; + return i; } } - return {false, 0}; + return std::nullopt; } SDLInputDriver::ControllerState* SDLInputDriver::GetControllerState( diff --git a/src/xenia/hid/sdl/sdl_input_driver.h b/src/xenia/hid/sdl/sdl_input_driver.h index 033862cd7..f98619a2f 100644 --- a/src/xenia/hid/sdl/sdl_input_driver.h +++ b/src/xenia/hid/sdl/sdl_input_driver.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "SDL.h" #include "xenia/hid/input_driver.h" @@ -64,11 +65,11 @@ class SDLInputDriver : public InputDriver { }; protected: - void OnControllerDeviceAdded(SDL_Event* event); - void OnControllerDeviceRemoved(SDL_Event* event); - void OnControllerDeviceAxisMotion(SDL_Event* event); - void OnControllerDeviceButtonChanged(SDL_Event* event); - std::pair GetControllerIndexFromInstanceID( + void OnControllerDeviceAdded(const SDL_Event& event); + void OnControllerDeviceRemoved(const SDL_Event& event); + void OnControllerDeviceAxisMotion(const SDL_Event& event); + void OnControllerDeviceButtonChanged(const SDL_Event& event); + std::optional GetControllerIndexFromInstanceID( SDL_JoystickID instance_id); ControllerState* GetControllerState(uint32_t user_index); bool TestSDLVersion() const; From b30fcbd29aa9159a78138964f91a2fa84840afcf Mon Sep 17 00:00:00 2001 From: Joel Linn Date: Sat, 28 Nov 2020 17:01:26 +0100 Subject: [PATCH 04/29] [HID] Change order to xinput, sdl, winkey --- src/xenia/app/xenia_main.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/xenia/app/xenia_main.cc b/src/xenia/app/xenia_main.cc index 97faba005..d6099a047 100644 --- a/src/xenia/app/xenia_main.cc +++ b/src/xenia/app/xenia_main.cc @@ -188,10 +188,12 @@ std::vector> CreateInputDrivers( Factory factory; #if 
XE_PLATFORM_WIN32 factory.Add("xinput", xe::hid::xinput::Create); +#endif // XE_PLATFORM_WIN32 + factory.Add("sdl", xe::hid::sdl::Create); +#if XE_PLATFORM_WIN32 // WinKey input driver should always be the last input driver added! factory.Add("winkey", xe::hid::winkey::Create); #endif // XE_PLATFORM_WIN32 - factory.Add("sdl", xe::hid::sdl::Create); for (auto& driver : factory.CreateAll(cvars::hid, window)) { if (XSUCCEEDED(driver->Setup())) { drivers.emplace_back(std::move(driver)); From c55918d944db8a8c2ae20619c86b94794e4a668b Mon Sep 17 00:00:00 2001 From: Joel Linn Date: Sat, 28 Nov 2020 17:53:25 +0100 Subject: [PATCH 05/29] [AppVeyor] Remove vcpkg. --- .appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.appveyor.yml b/.appveyor.yml index ccd75d2ab..4329b47e0 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -29,6 +29,7 @@ init: - git config --global core.autocrlf input install: + - cmd: vcpkg integrate remove - cmd: xb setup platform: Windows From 215f2a340b7c06a232f8480401b37c73bb79f238 Mon Sep 17 00:00:00 2001 From: gibbed Date: Sun, 29 Nov 2020 02:00:16 -0600 Subject: [PATCH 06/29] [XAM] Ensure items returned is set in enumerate. [XAM] Ensure items returned is set in xeXamEnumerate. 
--- src/xenia/kernel/xam/xam_enum.cc | 72 +++++++++++++++----------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/src/xenia/kernel/xam/xam_enum.cc b/src/xenia/kernel/xam/xam_enum.cc index 2cab56ab7..9aec9b056 100644 --- a/src/xenia/kernel/xam/xam_enum.cc +++ b/src/xenia/kernel/xam/xam_enum.cc @@ -32,50 +32,44 @@ uint32_t xeXamEnumerate(uint32_t handle, uint32_t flags, void* buffer, uint32_t overlapped_ptr) { assert_true(flags == 0); - auto e = kernel_state()->object_table()->LookupObject(handle); - if (!e) { - if (overlapped_ptr) { - kernel_state()->CompleteOverlappedImmediateEx( - overlapped_ptr, X_ERROR_INVALID_HANDLE, X_ERROR_INVALID_HANDLE, 0); - return X_ERROR_IO_PENDING; - } else { - return X_ERROR_INVALID_HANDLE; - } - } - - size_t actual_buffer_length = buffer_length; - if (buffer_length == e->items_per_enumerate()) { - actual_buffer_length = e->item_size() * e->items_per_enumerate(); - // Known culprits: - // Final Fight: Double Impact (saves) - XELOGW( - "Broken usage of XamEnumerate! 
buffer length={:X} vs actual " - "length={:X} " - "(item size={:X}, items per enumerate={})", - (uint32_t)buffer_length, actual_buffer_length, e->item_size(), - e->items_per_enumerate()); - } - - std::memset(buffer, 0, actual_buffer_length); - X_RESULT result; uint32_t item_count = 0; - if (actual_buffer_length < e->item_size()) { - result = X_ERROR_INSUFFICIENT_BUFFER; - } else if (e->current_item() >= e->item_count()) { - result = X_ERROR_NO_MORE_FILES; + auto e = kernel_state()->object_table()->LookupObject(handle); + if (!e) { + result = X_ERROR_INVALID_HANDLE; } else { - auto item_buffer = static_cast(buffer); - auto max_items = actual_buffer_length / e->item_size(); - while (max_items--) { - if (!e->WriteItem(item_buffer)) { - break; - } - item_buffer += e->item_size(); - item_count++; + size_t actual_buffer_length = buffer_length; + if (buffer_length == e->items_per_enumerate()) { + actual_buffer_length = e->item_size() * e->items_per_enumerate(); + // Known culprits: + // Final Fight: Double Impact (saves) + XELOGW( + "Broken usage of XamEnumerate! 
buffer length={:X} vs actual " + "length={:X} " + "(item size={:X}, items per enumerate={})", + (uint32_t)buffer_length, actual_buffer_length, e->item_size(), + e->items_per_enumerate()); + } + + std::memset(buffer, 0, actual_buffer_length); + + if (actual_buffer_length < e->item_size()) { + result = X_ERROR_INSUFFICIENT_BUFFER; + } else if (e->current_item() >= e->item_count()) { + result = X_ERROR_NO_MORE_FILES; + } else { + auto item_buffer = static_cast(buffer); + auto max_items = actual_buffer_length / e->item_size(); + while (max_items--) { + if (!e->WriteItem(item_buffer)) { + break; + } + item_buffer += e->item_size(); + item_count++; + } + result = X_ERROR_SUCCESS; } - result = X_ERROR_SUCCESS; } if (items_returned) { From c3d48eb2fa2e43a19716b52fa5e545f643486b6b Mon Sep 17 00:00:00 2001 From: Gliniak Date: Sun, 29 Nov 2020 17:07:59 +0100 Subject: [PATCH 07/29] [XAM/Net] Implemented WSASetLastError --- src/xenia/kernel/xam/xam_net.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/xenia/kernel/xam/xam_net.cc b/src/xenia/kernel/xam/xam_net.cc index a28b788e3..7b37e2b94 100644 --- a/src/xenia/kernel/xam/xam_net.cc +++ b/src/xenia/kernel/xam/xam_net.cc @@ -958,6 +958,11 @@ dword_result_t NetDll___WSAFDIsSet(dword_t socket_handle, } DECLARE_XAM_EXPORT1(NetDll___WSAFDIsSet, kNetworking, kImplemented); +void NetDll_WSASetLastError(dword_t error_code) { + XThread::SetLastError(error_code); +} +DECLARE_XAM_EXPORT1(NetDll_WSASetLastError, kNetworking, kImplemented); + void RegisterNetExports(xe::cpu::ExportResolver* export_resolver, KernelState* kernel_state) {} From d2cf0167fd5aceef4166d69c80c16bc3bd9fab12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Gli=C5=84ski?= Date: Thu, 3 Dec 2020 08:18:29 +0100 Subject: [PATCH 08/29] [XAM/User] Added flag for local profile It should fix games where right now there is no profile found. 
Example: EA Sports games --- src/xenia/kernel/xam/user_profile.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/kernel/xam/user_profile.h b/src/xenia/kernel/xam/user_profile.h index 309d0e0f8..92bf80bec 100644 --- a/src/xenia/kernel/xam/user_profile.h +++ b/src/xenia/kernel/xam/user_profile.h @@ -202,7 +202,7 @@ class UserProfile { uint64_t xuid() const { return xuid_; } std::string name() const { return name_; } uint32_t signin_state() const { return 1; } - uint32_t type() const { return 2; /* online profile? */ } + uint32_t type() const { return 1 | 2; /* local | online profile? */ } void AddSetting(std::unique_ptr setting); Setting* GetSetting(uint32_t setting_id); From d420215de1a738c876d49c9b951c7a45ed5884cf Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 4 Dec 2020 15:06:26 -0600 Subject: [PATCH 09/29] [Kernel] Allow 0 return for MmQueryAddressProtect. --- src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc index 8ecdabd5d..bcb88123c 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc @@ -400,7 +400,7 @@ dword_result_t MmQueryAddressProtect(dword_t base_address) { if (!heap->QueryProtect(base_address, &access)) { access = 0; } - access = ToXdkProtectFlags(access); + access = !access ? 0 : ToXdkProtectFlags(access); return access; } From 1513dd235bfe3a944c621e09e98a94bfbb0ab5a3 Mon Sep 17 00:00:00 2001 From: gibbed Date: Fri, 4 Dec 2020 14:57:48 -0600 Subject: [PATCH 10/29] [Kernel] Code reentrance for guest fibers. [Kernel] Code reentrance using exceptions for guest fibers. 
--- src/xenia/cpu/processor.cc | 1 - .../kernel/xboxkrnl/xboxkrnl_threading.cc | 26 +++--- src/xenia/kernel/xthread.cc | 70 ++++++++++++---- src/xenia/kernel/xthread.h | 81 ++++++++++++++----- 4 files changed, 132 insertions(+), 46 deletions(-) diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index 7a787873d..95b016d17 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -358,7 +358,6 @@ bool Processor::ExecuteRaw(ThreadState* thread_state, uint32_t address) { return false; } - auto context = thread_state->context(); return function->Call(thread_state, 0xBCBCBCBC); } diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index bf399937e..bbe78ec87 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -205,22 +205,30 @@ dword_result_t NtSuspendThread(dword_t handle, lpdword_t suspend_count_ptr) { } DECLARE_XBOXKRNL_EXPORT1(NtSuspendThread, kThreading, kImplemented); -void KeSetCurrentStackPointers(lpvoid_t stack_ptr, - pointer_t cur_thread, +void KeSetCurrentStackPointers(lpvoid_t stack_ptr, pointer_t thread, lpvoid_t stack_alloc_base, lpvoid_t stack_base, lpvoid_t stack_limit) { - auto thread = XThread::GetCurrentThread(); - auto context = thread->thread_state()->context(); - context->r[1] = stack_ptr.guest_address(); + auto current_thread = XThread::GetCurrentThread(); + auto context = current_thread->thread_state()->context(); + auto pcr = kernel_memory()->TranslateVirtual( + static_cast(context->r[13])); - auto pcr = - kernel_memory()->TranslateVirtual((uint32_t)context->r[13]); + thread->stack_alloc_base = stack_alloc_base.value(); + thread->stack_base = stack_base.value(); + thread->stack_limit = stack_limit.value(); pcr->stack_base_ptr = stack_base.guest_address(); pcr->stack_end_ptr = stack_limit.guest_address(); + context->r[1] = stack_ptr.guest_address(); - // TODO: Do we need to set the stack 
info on cur_thread? + // If a fiber is set, and the thread matches, reenter to avoid issues with + // host stack overflowing. + if (thread->fiber_ptr && + current_thread->guest_object() == thread.guest_address()) { + current_thread->Reenter(static_cast(context->lr)); + } } -DECLARE_XBOXKRNL_EXPORT1(KeSetCurrentStackPointers, kThreading, kImplemented); +DECLARE_XBOXKRNL_EXPORT2(KeSetCurrentStackPointers, kThreading, kImplemented, + kHighFrequency); dword_result_t KeSetAffinityThread(lpvoid_t thread_ptr, dword_t affinity, lpdword_t previous_affinity_ptr) { diff --git a/src/xenia/kernel/xthread.cc b/src/xenia/kernel/xthread.cc index baa014b01..46f1ef961 100644 --- a/src/xenia/kernel/xthread.cc +++ b/src/xenia/kernel/xthread.cc @@ -498,6 +498,16 @@ X_STATUS XThread::Terminate(int exit_code) { return X_STATUS_SUCCESS; } +class reenter_exception { + public: + reenter_exception(uint32_t address) : address_(address){}; + virtual ~reenter_exception(){}; + uint32_t address() const { return address_; } + + private: + uint32_t address_; +}; + void XThread::Execute() { XELOGKERNEL("XThread::Execute thid {} (handle={:08X}, '{}', native={:08X})", thread_id_, handle(), thread_name_, thread_->system_id()); @@ -510,31 +520,61 @@ void XThread::Execute() { // have time to initialize shared structures AFTER CreateThread (RR). xe::threading::Sleep(std::chrono::milliseconds(10)); - int exit_code = 0; - // Dispatch any APCs that were queued before the thread was created first. DeliverAPCs(); + uint32_t address; + std::vector args; + bool want_exit_code; + int exit_code = 0; + // If a XapiThreadStartup value is present, we use that as a trampoline. // Otherwise, we are a raw thread. 
if (creation_params_.xapi_thread_startup) { - uint64_t args[] = {creation_params_.start_address, - creation_params_.start_context}; - kernel_state()->processor()->Execute(thread_state_, - creation_params_.xapi_thread_startup, - args, xe::countof(args)); + address = creation_params_.xapi_thread_startup; + args.push_back(creation_params_.start_address); + args.push_back(creation_params_.start_context); + want_exit_code = false; } else { // Run user code. - uint64_t args[] = {creation_params_.start_context}; - exit_code = static_cast(kernel_state()->processor()->Execute( - thread_state_, creation_params_.start_address, args, - xe::countof(args))); - // If we got here it means the execute completed without an exit being - // called. - // Treat the return code as an implicit exit code. + address = creation_params_.start_address; + args.push_back(creation_params_.start_context); + want_exit_code = true; } - Exit(exit_code); + uint32_t next_address; + try { + exit_code = static_cast(kernel_state()->processor()->Execute( + thread_state_, address, args.data(), args.size())); + next_address = 0; + } catch (const reenter_exception& ree) { + next_address = ree.address(); + } + + // See XThread::Reenter comments. + while (next_address != 0) { + try { + kernel_state()->processor()->ExecuteRaw(thread_state_, next_address); + next_address = 0; + if (want_exit_code) { + exit_code = static_cast(thread_state_->context()->r[3]); + } + } catch (const reenter_exception& ree) { + next_address = ree.address(); + } + } + + // If we got here it means the execute completed without an exit being called. + // Treat the return code as an implicit exit code (if desired). + Exit(!want_exit_code ? 0 : exit_code); +} + +void XThread::Reenter(uint32_t address) { + // TODO(gibbed): Maybe use setjmp/longjmp on Windows? + // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/longjmp#remarks + // On Windows with /EH, setjmp/longjmp do stack unwinding. 
+ // Is there a better solution than exceptions for stack unwinding? + throw reenter_exception(address); } void XThread::EnterCriticalRegion() { diff --git a/src/xenia/kernel/xthread.h b/src/xenia/kernel/xthread.h index ec0fe41ef..78a6591a1 100644 --- a/src/xenia/kernel/xthread.h +++ b/src/xenia/kernel/xthread.h @@ -70,35 +70,72 @@ struct XAPC { // Processor Control Region struct X_KPCR { xe::be tls_ptr; // 0x0 - char unk_04[0x2C]; // 0x4 + uint8_t unk_04[0x2C]; // 0x4 xe::be pcr_ptr; // 0x30 - char unk_34[0x3C]; // 0x34 + uint8_t unk_34[0x3C]; // 0x34 xe::be stack_base_ptr; // 0x70 Stack base address (high addr) xe::be stack_end_ptr; // 0x74 Stack end (low addr) - char unk_78[0x88]; // 0x78 + uint8_t unk_78[0x88]; // 0x78 xe::be current_thread; // 0x100 - char unk_104[0x8]; // 0x104 - xe::be current_cpu; // 0x10C - char unk_10D[0x43]; // 0x10D + uint8_t unk_104[0x8]; // 0x104 + uint8_t current_cpu; // 0x10C + uint8_t unk_10D[0x43]; // 0x10D xe::be dpc_active; // 0x150 }; struct X_KTHREAD { - X_DISPATCH_HEADER header; // 0x0 - char unk_10[0xAC]; // 0x10 - uint8_t suspend_count; // 0xBC - uint8_t unk_BD; // 0xBD - uint8_t unk_BE; // 0xBE - uint8_t current_cpu; // 0xBF - char unk_C0[0x70]; // 0xC0 - xe::be create_time; // 0x130 - xe::be exit_time; // 0x138 - xe::be exit_status; // 0x140 - char unk_144[0x8]; // 0x144 - xe::be thread_id; // 0x14C - char unk_150[0x10]; // 0x150 - xe::be last_error; // 0x160 - char unk_164[0x94C]; // 0x164 + X_DISPATCH_HEADER header; // 0x0 + xe::be unk_10; // 0x10 + xe::be unk_14; // 0x14 + uint8_t unk_18[0x28]; // 0x18 + xe::be unk_40; // 0x40 + xe::be unk_44; // 0x44 + xe::be unk_48; // 0x48 + xe::be unk_4C; // 0x4C + uint8_t unk_50[0x4]; // 0x50 + xe::be unk_54; // 0x54 + xe::be unk_56; // 0x56 + uint8_t unk_58[0x4]; // 0x58 + xe::be stack_base; // 0x5C + xe::be stack_limit; // 0x60 + uint8_t unk_64[0x4]; // 0x64 + xe::be tls_address; // 0x68 + uint8_t unk_6C; // 0x6C + uint8_t unk_6D[0x7]; // 0x6D + xe::be unk_74; // 0x74 + xe::be 
unk_78; // 0x78 + xe::be unk_7C; // 0x7C + xe::be unk_80; // 0x80 + xe::be unk_84; // 0x84 + uint8_t unk_88[0x3]; // 0x88 + uint8_t unk_8B; // 0x8B + uint8_t unk_8C[0x10]; // 0x8C + xe::be unk_9C; // 0x9C + uint8_t unk_A0[0x1C]; // 0xA0 + uint8_t suspend_count; // 0xBC + uint8_t unk_BD; // 0xBD + uint8_t unk_BE; // 0xBE + uint8_t current_cpu; // 0xBF + uint8_t unk_C0[0x10]; // 0xC0 + xe::be stack_alloc_base; // 0xD0 + uint8_t unk_D4[0x5C]; // 0xD4 + xe::be create_time; // 0x130 + xe::be exit_time; // 0x138 + xe::be exit_status; // 0x140 + xe::be unk_144; // 0x144 + xe::be unk_148; // 0x148 + xe::be thread_id; // 0x14C + xe::be start_address; // 0x150 + xe::be unk_154; // 0x154 + xe::be unk_158; // 0x158 + uint8_t unk_15C[0x4]; // 0x15C + xe::be last_error; // 0x160 + xe::be fiber_ptr; // 0x164 + uint8_t unk_168[0x4]; // 0x168 + xe::be creation_flags; // 0x16C + uint8_t unk_170[0xC]; // 0x170 + xe::be unk_17C; // 0x17C + uint8_t unk_180[0x930]; // 0x180 // This struct is actually quite long... so uh, not filling this out! 
}; @@ -151,6 +188,8 @@ class XThread : public XObject, public cpu::Thread { virtual void Execute(); + virtual void Reenter(uint32_t address); + static void EnterCriticalRegion(); static void LeaveCriticalRegion(); uint32_t RaiseIrql(uint32_t new_irql); From ff5c5f01e0e31abee675be7e48c11c20642a40fb Mon Sep 17 00:00:00 2001 From: Gliniak Date: Sun, 29 Nov 2020 12:03:17 +0100 Subject: [PATCH 11/29] [Kernel] Zeroing out pages without write protect flag --- src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc index bcb88123c..de672b227 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc @@ -135,8 +135,10 @@ dword_result_t NtAllocateVirtualMemory(lpdword_t base_addr_ptr, } uint32_t protect = FromXdkProtectFlags(protect_bits); uint32_t address = 0; + BaseHeap* heap; + if (adjusted_base != 0) { - auto heap = kernel_memory()->LookupHeap(adjusted_base); + heap = kernel_memory()->LookupHeap(adjusted_base); if (heap->page_size() != page_size) { // Specified the wrong page size for the wrong heap. return X_STATUS_ACCESS_DENIED; @@ -148,7 +150,7 @@ dword_result_t NtAllocateVirtualMemory(lpdword_t base_addr_ptr, } } else { bool top_down = !!(alloc_type & X_MEM_TOP_DOWN); - auto heap = kernel_memory()->LookupHeapByType(false, page_size); + heap = kernel_memory()->LookupHeapByType(false, page_size); heap->Alloc(adjusted_size, page_size, allocation_type, protect, top_down, &address); } @@ -160,7 +162,14 @@ dword_result_t NtAllocateVirtualMemory(lpdword_t base_addr_ptr, // Zero memory, if needed. 
if (address && !(alloc_type & X_MEM_NOZERO)) { if (alloc_type & X_MEM_COMMIT) { + if (!(protect & kMemoryProtectWrite)) { + heap->Protect(address, adjusted_size, + kMemoryProtectRead | kMemoryProtectWrite); + } kernel_memory()->Zero(address, adjusted_size); + if (!(protect & kMemoryProtectWrite)) { + heap->Protect(address, adjusted_size, protect); + } } } From cadc31c93fe1e713b98d34c81cf34b3183dc200a Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 6 Dec 2020 15:55:28 +0300 Subject: [PATCH 12/29] [GPU] Fix uninitialized viewport_top when drawing without a viewport --- src/xenia/gpu/draw_util.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 29072c3e8..98cc90615 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -227,6 +227,7 @@ void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, ndc_offset_y = 0.0f; } } else { + viewport_top = 0.0f; viewport_height = std::min( float(xenos::kTexture2DCubeMaxWidthHeight) * pixel_size_y, y_max); ndc_scale_y = (2.0f * pixel_size_y) / viewport_height; From 06ab8589b4b6fd59367fd1b3e6fb758cd656f11a Mon Sep 17 00:00:00 2001 From: Gliniak Date: Sun, 2 Aug 2020 17:09:32 +0200 Subject: [PATCH 13/29] [Kernel/IO] Return error creating dir as non-dir. [Kernel/IO] Return error when creating directory with non-directory flag in NtCreateFile. 
--- src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc | 3 ++- src/xenia/kernel/xfile.cc | 2 +- src/xenia/vfs/virtual_file_system.cc | 9 ++++++++- src/xenia/vfs/virtual_file_system.h | 3 ++- src/xenia/xbox.h | 1 + 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc index 6b0b6783b..5f19d7ca2 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc @@ -142,7 +142,8 @@ dword_result_t NtCreateFile(lpdword_t handle_out, dword_t desired_access, X_STATUS result = kernel_state()->file_system()->OpenFile( root_entry, target_path, vfs::FileDisposition((uint32_t)creation_disposition), desired_access, - (create_options & CreateOptions::FILE_DIRECTORY_FILE) != 0, &vfs_file, + (create_options & CreateOptions::FILE_DIRECTORY_FILE) != 0, + (create_options & CreateOptions::FILE_NON_DIRECTORY_FILE) != 0, &vfs_file, &file_action); object_ref file = nullptr; diff --git a/src/xenia/kernel/xfile.cc b/src/xenia/kernel/xfile.cc index ee749b9e7..dc4553505 100644 --- a/src/xenia/kernel/xfile.cc +++ b/src/xenia/kernel/xfile.cc @@ -266,7 +266,7 @@ object_ref XFile::Restore(KernelState* kernel_state, vfs::FileAction action; auto res = kernel_state->file_system()->OpenFile( nullptr, abs_path, vfs::FileDisposition::kOpen, access, is_directory, - &vfs_file, &action); + false, &vfs_file, &action); if (XFAILED(res)) { XELOGE("Failed to open XFile: error {:08X}", res); return object_ref(file); diff --git a/src/xenia/vfs/virtual_file_system.cc b/src/xenia/vfs/virtual_file_system.cc index 31cb82f66..c0f888318 100644 --- a/src/xenia/vfs/virtual_file_system.cc +++ b/src/xenia/vfs/virtual_file_system.cc @@ -172,7 +172,8 @@ X_STATUS VirtualFileSystem::OpenFile(Entry* root_entry, const std::string_view path, FileDisposition creation_disposition, uint32_t desired_access, bool is_directory, - File** out_file, FileAction* out_action) { + bool is_non_directory, File** out_file, + 
FileAction* out_action) { // TODO(gibbed): should 'is_directory' remain as a bool or should it be // flipped to a generic FileAttributeFlags? @@ -207,6 +208,12 @@ X_STATUS VirtualFileSystem::OpenFile(Entry* root_entry, entry = !root_entry ? ResolvePath(path) : root_entry->GetChild(path); } + if (entry) { + if (entry->attributes() & kFileAttributeDirectory && is_non_directory) { + return X_STATUS_FILE_IS_A_DIRECTORY; + } + } + // Check if exists (if we need it to), or that it doesn't (if it shouldn't). switch (creation_disposition) { case FileDisposition::kOpen: diff --git a/src/xenia/vfs/virtual_file_system.h b/src/xenia/vfs/virtual_file_system.h index 8d5b84697..49e9083dc 100644 --- a/src/xenia/vfs/virtual_file_system.h +++ b/src/xenia/vfs/virtual_file_system.h @@ -43,7 +43,8 @@ class VirtualFileSystem { X_STATUS OpenFile(Entry* root_entry, const std::string_view path, FileDisposition creation_disposition, - uint32_t desired_access, bool is_directory, File** out_file, + uint32_t desired_access, bool is_directory, + bool is_non_directory, File** out_file, FileAction* out_action); private: diff --git a/src/xenia/xbox.h b/src/xenia/xbox.h index 62f4a1f65..2080b236c 100644 --- a/src/xenia/xbox.h +++ b/src/xenia/xbox.h @@ -64,6 +64,7 @@ typedef uint32_t X_STATUS; #define X_STATUS_PROCEDURE_NOT_FOUND ((X_STATUS)0xC000007AL) #define X_STATUS_INSUFFICIENT_RESOURCES ((X_STATUS)0xC000009AL) #define X_STATUS_MEMORY_NOT_ALLOCATED ((X_STATUS)0xC00000A0L) +#define X_STATUS_FILE_IS_A_DIRECTORY ((X_STATUS)0xC00000BAL) #define X_STATUS_NOT_SUPPORTED ((X_STATUS)0xC00000BBL) #define X_STATUS_INVALID_PARAMETER_1 ((X_STATUS)0xC00000EFL) #define X_STATUS_INVALID_PARAMETER_2 ((X_STATUS)0xC00000F0L) From a86609e93a5126bcb4f90d9f76654997c994787b Mon Sep 17 00:00:00 2001 From: Prism Tutaj Date: Sun, 8 Sep 2019 03:10:47 -0500 Subject: [PATCH 14/29] [GTK UI] Fix GTK submenus --- src/xenia/ui/window_gtk.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git 
a/src/xenia/ui/window_gtk.cc b/src/xenia/ui/window_gtk.cc index a6ca2087b..ed8fad055 100644 --- a/src/xenia/ui/window_gtk.cc +++ b/src/xenia/ui/window_gtk.cc @@ -415,14 +415,20 @@ GTKMenuItem::~GTKMenuItem() { void GTKMenuItem::OnChildAdded(MenuItem* generic_child_item) { auto child_item = static_cast(generic_child_item); + GtkWidget* submenu = nullptr; switch (child_item->type()) { case MenuItem::Type::kNormal: // Nothing special. break; case MenuItem::Type::kPopup: if (GTK_IS_MENU_ITEM(menu_)) { - assert(gtk_menu_item_get_submenu(GTK_MENU_ITEM(menu_)) == nullptr); - gtk_menu_item_set_submenu(GTK_MENU_ITEM(menu_), child_item->handle()); + submenu = gtk_menu_item_get_submenu(GTK_MENU_ITEM(menu_)); + // Get sub menu and if it doesn't exist create it + if (submenu == nullptr) { + submenu = gtk_menu_new(); + gtk_menu_item_set_submenu(GTK_MENU_ITEM(menu_), submenu); + } + gtk_menu_shell_append(GTK_MENU_SHELL(submenu), child_item->handle()); } else { gtk_menu_shell_append(GTK_MENU_SHELL(menu_), child_item->handle()); } @@ -431,7 +437,7 @@ void GTKMenuItem::OnChildAdded(MenuItem* generic_child_item) { case MenuItem::Type::kString: assert(GTK_IS_MENU_ITEM(menu_)); // Get sub menu and if it doesn't exist create it - GtkWidget* submenu = gtk_menu_item_get_submenu(GTK_MENU_ITEM(menu_)); + submenu = gtk_menu_item_get_submenu(GTK_MENU_ITEM(menu_)); if (submenu == nullptr) { submenu = gtk_menu_new(); gtk_menu_item_set_submenu(GTK_MENU_ITEM(menu_), submenu); From 9a4643d0f2435e5acb2bf4636d949a9995f9f73d Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 7 Dec 2020 22:23:54 +0300 Subject: [PATCH 15/29] [GPU] Non-ROV f24 trunc/round, host shader modifications, cache dir --- src/xenia/app/xenia_main.cc | 28 +- src/xenia/emulator.cc | 6 +- src/xenia/emulator.h | 7 +- src/xenia/gpu/command_processor.cc | 4 +- src/xenia/gpu/command_processor.h | 5 +- .../gpu/d3d12/d3d12_command_processor.cc | 122 +- src/xenia/gpu/d3d12/d3d12_command_processor.h | 15 +- 
src/xenia/gpu/d3d12/d3d12_shader.cc | 76 +- src/xenia/gpu/d3d12/d3d12_shader.h | 104 +- src/xenia/gpu/d3d12/pipeline_cache.cc | 1081 +++++++++-------- src/xenia/gpu/d3d12/pipeline_cache.h | 91 +- src/xenia/gpu/d3d12/render_target_cache.cc | 41 +- src/xenia/gpu/d3d12/render_target_cache.h | 11 +- .../dxbc/edram_load_depth_float24and32_cs.cso | Bin 0 -> 3500 bytes .../dxbc/edram_load_depth_float24and32_cs.h | 296 +++++ .../dxbc/edram_load_depth_float24and32_cs.txt | 117 ++ .../dxbc/edram_load_depth_float_cs.cso | Bin 3500 -> 2660 bytes .../shaders/dxbc/edram_load_depth_float_cs.h | 218 ++-- .../dxbc/edram_load_depth_float_cs.txt | 45 +- .../edram_store_depth_float24and32_cs.cso | Bin 0 -> 2660 bytes .../dxbc/edram_store_depth_float24and32_cs.h | 226 ++++ .../edram_store_depth_float24and32_cs.txt | 95 ++ .../dxbc/edram_store_depth_float_cs.cso | Bin 2660 -> 2600 bytes .../shaders/dxbc/edram_store_depth_float_cs.h | 135 +- .../dxbc/edram_store_depth_float_cs.txt | 74 +- .../d3d12/shaders/dxbc/float24_round_ps.cso | Bin 0 -> 1816 bytes .../gpu/d3d12/shaders/dxbc/float24_round_ps.h | 156 +++ .../d3d12/shaders/dxbc/float24_round_ps.txt | 74 ++ .../shaders/dxbc/float24_truncate_ps.cso | Bin 0 -> 1148 bytes .../d3d12/shaders/dxbc/float24_truncate_ps.h | 100 ++ .../shaders/dxbc/float24_truncate_ps.txt | 55 + .../shaders/edram_load_depth_float.cs.hlsl | 18 +- .../edram_load_depth_float24and32.cs.hlsl | 31 + .../shaders/edram_store_depth_float.cs.hlsl | 17 +- .../edram_store_depth_float24and32.cs.hlsl | 25 + .../shaders/edram_store_depth_unorm.cs.hlsl | 3 +- .../gpu/d3d12/shaders/float24_round.ps.hlsl | 13 + .../d3d12/shaders/float24_truncate.ps.hlsl | 38 + .../gpu/d3d12/shaders/pixel_formats.hlsli | 25 + .../shaders/primitive_point_list.gs.hlsl | 19 +- .../shaders/primitive_rectangle_list.gs.hlsl | 21 +- src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli | 6 +- src/xenia/gpu/draw_util.cc | 12 + src/xenia/gpu/draw_util.h | 1 + src/xenia/gpu/dxbc_shader.cc | 27 + 
src/xenia/gpu/dxbc_shader.h | 83 ++ src/xenia/gpu/dxbc_shader_translator.cc | 282 +++-- src/xenia/gpu/dxbc_shader_translator.h | 174 ++- src/xenia/gpu/dxbc_shader_translator_fetch.cc | 2 +- .../gpu/dxbc_shader_translator_memexport.cc | 2 +- src/xenia/gpu/dxbc_shader_translator_om.cc | 320 +++-- src/xenia/gpu/gpu_flags.cc | 54 + src/xenia/gpu/gpu_flags.h | 63 + src/xenia/gpu/graphics_system.cc | 20 +- src/xenia/gpu/graphics_system.h | 2 +- src/xenia/gpu/shader.cc | 94 +- src/xenia/gpu/shader.h | 168 ++- src/xenia/gpu/shader_compiler_main.cc | 10 +- src/xenia/gpu/shader_translator.cc | 273 +++-- src/xenia/gpu/shader_translator.h | 71 +- src/xenia/gpu/spirv_shader_translator.cc | 15 +- src/xenia/gpu/spirv_shader_translator.h | 3 +- src/xenia/gpu/trace_dump.cc | 2 +- src/xenia/gpu/trace_viewer.cc | 21 +- src/xenia/gpu/ucode.h | 35 +- src/xenia/gpu/vulkan/pipeline_cache.cc | 54 +- src/xenia/gpu/vulkan/pipeline_cache.h | 3 +- src/xenia/gpu/vulkan/vulkan_shader.cc | 42 +- src/xenia/gpu/vulkan/vulkan_shader.h | 23 +- src/xenia/gpu/xenos.cc | 32 +- src/xenia/gpu/xenos.h | 3 + 71 files changed, 3656 insertions(+), 1633 deletions(-) create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.cso create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.h create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.txt create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.cso create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.h create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.txt create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.cso create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.h create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.txt create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.cso create mode 100644 
src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.h create mode 100644 src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.txt create mode 100644 src/xenia/gpu/d3d12/shaders/edram_load_depth_float24and32.cs.hlsl create mode 100644 src/xenia/gpu/d3d12/shaders/edram_store_depth_float24and32.cs.hlsl create mode 100644 src/xenia/gpu/d3d12/shaders/float24_round.ps.hlsl create mode 100644 src/xenia/gpu/d3d12/shaders/float24_truncate.ps.hlsl create mode 100644 src/xenia/gpu/dxbc_shader.cc create mode 100644 src/xenia/gpu/dxbc_shader.h diff --git a/src/xenia/app/xenia_main.cc b/src/xenia/app/xenia_main.cc index d6099a047..342d5cc2b 100644 --- a/src/xenia/app/xenia_main.cc +++ b/src/xenia/app/xenia_main.cc @@ -65,6 +65,14 @@ DEFINE_path( "Root path for guest content storage (saves, etc.), or empty to use the " "content folder under the storage root.", "Storage"); +DEFINE_path( + cache_root, "", + "Root path for files used to speed up certain parts of the emulator or the " + "game. These files may be persistent, but they can be deleted without " + "major side effects such as progress loss. If empty, the cache folder " + "under the storage root, or, if available, the cache directory preferred " + "for the OS, will be used.", + "Storage"); DEFINE_bool(mount_scratch, false, "Enable scratch mount", "Storage"); DEFINE_bool(mount_cache, false, "Enable cache mount", "Storage"); @@ -221,6 +229,8 @@ int xenia_main(const std::vector& args) { #if defined(XE_PLATFORM_WIN32) || defined(XE_PLATFORM_GNU_LINUX) storage_root = storage_root / "Xenia"; #else + // TODO(Triang3l): Point to the app's external storage "files" directory + // on Android. #warning Unhandled platform for the data root. 
storage_root = storage_root / "Xenia"; #endif @@ -244,13 +254,29 @@ int xenia_main(const std::vector& args) { content_root = std::filesystem::absolute(content_root); XELOGI("Content root: {}", xe::path_to_utf8(content_root)); + std::filesystem::path cache_root = cvars::cache_root; + if (cache_root.empty()) { + cache_root = storage_root / "cache"; + // TODO(Triang3l): Point to the app's external storage "cache" directory on + // Android. + } else { + // If cache root isn't an absolute path, then it should be relative to the + // storage root. + if (!cache_root.is_absolute()) { + cache_root = storage_root / cache_root; + } + } + cache_root = std::filesystem::absolute(cache_root); + XELOGI("Cache root: {}", xe::path_to_utf8(cache_root)); + if (cvars::discord) { discord::DiscordPresence::Initialize(); discord::DiscordPresence::NotPlaying(); } // Create the emulator but don't initialize so we can setup the window. - auto emulator = std::make_unique("", storage_root, content_root); + auto emulator = + std::make_unique("", storage_root, content_root, cache_root); // Main emulator display window. auto emulator_window = EmulatorWindow::Create(emulator.get()); diff --git a/src/xenia/emulator.cc b/src/xenia/emulator.cc index 44f284d34..4e6b10783 100644 --- a/src/xenia/emulator.cc +++ b/src/xenia/emulator.cc @@ -59,13 +59,15 @@ namespace xe { Emulator::Emulator(const std::filesystem::path& command_line, const std::filesystem::path& storage_root, - const std::filesystem::path& content_root) + const std::filesystem::path& content_root, + const std::filesystem::path& cache_root) : on_launch(), on_terminate(), on_exit(), command_line_(command_line), storage_root_(storage_root), content_root_(content_root), + cache_root_(cache_root), game_title_(), display_window_(nullptr), memory_(), @@ -689,7 +691,7 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path, // playing before the video can be seen if doing this in parallel with the // main thread. 
on_shader_storage_initialization(true); - graphics_system_->InitializeShaderStorage(storage_root_, title_id_, true); + graphics_system_->InitializeShaderStorage(cache_root_, title_id_, true); on_shader_storage_initialization(false); auto main_thread = kernel_state_->LaunchModule(module); diff --git a/src/xenia/emulator.h b/src/xenia/emulator.h index df5426227..739c12b51 100644 --- a/src/xenia/emulator.h +++ b/src/xenia/emulator.h @@ -49,7 +49,8 @@ class Emulator { public: explicit Emulator(const std::filesystem::path& command_line, const std::filesystem::path& storage_root, - const std::filesystem::path& content_root); + const std::filesystem::path& content_root, + const std::filesystem::path& cache_root); ~Emulator(); // Full command line used when launching the process. @@ -61,6 +62,9 @@ class Emulator { // Folder guest content is stored in. const std::filesystem::path& content_root() const { return content_root_; } + // Folder files safe to remove without significant side effects are stored in. + const std::filesystem::path& cache_root() const { return cache_root_; } + // Title of the game in the default language. 
const std::string& game_title() const { return game_title_; } @@ -166,6 +170,7 @@ class Emulator { std::filesystem::path command_line_; std::filesystem::path storage_root_; std::filesystem::path content_root_; + std::filesystem::path cache_root_; std::string game_title_; diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 651952da6..044773161 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -89,8 +89,8 @@ void CommandProcessor::Shutdown() { } void CommandProcessor::InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking) {} + const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) { +} void CommandProcessor::RequestFrameTrace( const std::filesystem::path& root_path) { diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index b94562d79..5002f0137 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -133,9 +133,8 @@ class CommandProcessor { // May be called not only from the command processor thread when the command // processor is paused, and the termination of this function may be explicitly // awaited. 
- virtual void InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking); + virtual void InitializeShaderStorage(const std::filesystem::path& cache_root, + uint32_t title_id, bool blocking); virtual void RequestFrameTrace(const std::filesystem::path& root_path); virtual void BeginTracing(const std::filesystem::path& root_path); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 5b216b22b..ef38ff5b1 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -73,10 +73,9 @@ void D3D12CommandProcessor::ClearCaches() { } void D3D12CommandProcessor::InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking) { - CommandProcessor::InitializeShaderStorage(storage_root, title_id, blocking); - pipeline_cache_->InitializeShaderStorage(storage_root, title_id, blocking); + const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) { + CommandProcessor::InitializeShaderStorage(cache_root, title_id, blocking); + pipeline_cache_->InitializeShaderStorage(cache_root, title_id, blocking); } void D3D12CommandProcessor::RequestFrameTrace( @@ -102,7 +101,7 @@ void D3D12CommandProcessor::RestoreEdramSnapshot(const void* snapshot) { } uint32_t D3D12CommandProcessor::GetCurrentColorMask( - const D3D12Shader* pixel_shader) const { + const Shader* pixel_shader) const { if (pixel_shader == nullptr) { return 0; } @@ -159,25 +158,16 @@ void D3D12CommandProcessor::SubmitBarriers() { } ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature( - const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader) { - assert_true(vertex_shader->is_translated()); - + const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, + bool tessellated) { if (bindless_resources_used_) { - return vertex_shader->host_vertex_shader_type() != - 
Shader::HostVertexShaderType::kVertex - ? root_signature_bindless_ds_ - : root_signature_bindless_vs_; + return tessellated ? root_signature_bindless_ds_ + : root_signature_bindless_vs_; } - assert_true(pixel_shader == nullptr || pixel_shader->is_translated()); - - D3D12_SHADER_VISIBILITY vertex_visibility; - if (vertex_shader->host_vertex_shader_type() != - Shader::HostVertexShaderType::kVertex) { - vertex_visibility = D3D12_SHADER_VISIBILITY_DOMAIN; - } else { - vertex_visibility = D3D12_SHADER_VISIBILITY_VERTEX; - } + D3D12_SHADER_VISIBILITY vertex_visibility = + tessellated ? D3D12_SHADER_VISIBILITY_DOMAIN + : D3D12_SHADER_VISIBILITY_VERTEX; uint32_t texture_count_vertex, sampler_count_vertex; vertex_shader->GetTextureBindings(texture_count_vertex); @@ -393,7 +383,7 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature( } uint32_t D3D12CommandProcessor::GetRootBindfulExtraParameterIndices( - const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, + const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, RootBindfulExtraParameterIndices& indices_out) { uint32_t texture_count_pixel = 0, sampler_count_pixel = 0; if (pixel_shader != nullptr) { @@ -1202,6 +1192,7 @@ bool D3D12CommandProcessor::SetupContext() { pipeline_cache_ = std::make_unique( *this, *register_file_, bindless_resources_used_, edram_rov_used_, + render_target_cache_->depth_float24_conversion(), texture_cache_->IsResolutionScale2X() ? 
2 : 1); if (!pipeline_cache_->Initialize()) { XELOGE("Failed to initialize the graphics pipeline cache"); @@ -1804,8 +1795,7 @@ Shader* D3D12CommandProcessor::LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, uint32_t dword_count) { - return pipeline_cache_->LoadShader(shader_type, guest_address, host_address, - dword_count); + return pipeline_cache_->LoadShader(shader_type, host_address, dword_count); } bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, @@ -1851,21 +1841,30 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Need a pixel shader in normal color mode. return false; } - // Get tessellation info for the current draw for vertex shader translation. - Shader::HostVertexShaderType host_vertex_shader_type = - pipeline_cache_->GetHostVertexShaderTypeIfValid(); - if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) { + DxbcShaderTranslator::Modification vertex_shader_modification; + DxbcShaderTranslator::Modification pixel_shader_modification; + if (!pipeline_cache_->GetCurrentShaderModifications( + vertex_shader_modification, pixel_shader_modification)) { return false; } + D3D12Shader::D3D12Translation* vertex_shader_translation = + static_cast( + vertex_shader->GetOrCreateTranslation( + vertex_shader_modification.value)); + D3D12Shader::D3D12Translation* pixel_shader_translation = + pixel_shader ? static_cast( + pixel_shader->GetOrCreateTranslation( + pixel_shader_modification.value)) + : nullptr; // Translate the shaders now to get memexport configuration and color mask, - // which is needed by the render target cache, to check the possibility of - // doing early depth/stencil, and also to get used textures and samplers. - if (!pipeline_cache_->EnsureShadersTranslated(vertex_shader, pixel_shader, - host_vertex_shader_type)) { + // which is needed by the render target cache, and also to get used textures + // and samplers. 
+ if (!pipeline_cache_->EnsureShadersTranslated(vertex_shader_translation, + pixel_shader_translation)) { return false; } - bool tessellated = - host_vertex_shader_type != Shader::HostVertexShaderType::kVertex; + bool tessellated = vertex_shader_modification.host_vertex_shader_type != + Shader::HostVertexShaderType::kVertex; // Check if memexport is used. If it is, we can't skip draw calls that have no // visual effect. @@ -1967,26 +1966,14 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, (pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0); texture_cache_->RequestTextures(used_texture_mask); - // Check if early depth/stencil can be enabled. - bool early_z; - if (pixel_shader) { - auto rb_colorcontrol = regs.Get(); - early_z = pixel_shader->implicit_early_z_allowed() && - (!rb_colorcontrol.alpha_test_enable || - rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) && - !rb_colorcontrol.alpha_to_mask_enable; - } else { - early_z = true; - } - // Create the pipeline if needed and bind it. void* pipeline_handle; ID3D12RootSignature* root_signature; if (!pipeline_cache_->ConfigurePipeline( - vertex_shader, pixel_shader, primitive_type_converted, + vertex_shader_translation, pixel_shader_translation, + primitive_type_converted, indexed ? 
index_buffer_info->format : xenos::IndexFormat::kInt16, - early_z, pipeline_render_targets, &pipeline_handle, - &root_signature)) { + pipeline_render_targets, &pipeline_handle, &root_signature)) { return false; } if (current_cached_pipeline_ != pipeline_handle) { @@ -2014,11 +2001,18 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, pixel_size_x *= 2; pixel_size_y *= 2; } + flags::DepthFloat24Conversion depth_float24_conversion = + render_target_cache_->depth_float24_conversion(); draw_util::ViewportInfo viewport_info; - draw_util::GetHostViewportInfo(regs, float(pixel_size_x), float(pixel_size_y), - true, float(D3D12_VIEWPORT_BOUNDS_MAX), - float(D3D12_VIEWPORT_BOUNDS_MAX), false, - viewport_info); + draw_util::GetHostViewportInfo( + regs, float(pixel_size_x), float(pixel_size_y), true, + float(D3D12_VIEWPORT_BOUNDS_MAX), float(D3D12_VIEWPORT_BOUNDS_MAX), false, + !edram_rov_used_ && + (depth_float24_conversion == + flags::DepthFloat24Conversion::kOnOutputTruncating || + depth_float24_conversion == + flags::DepthFloat24Conversion::kOnOutputRounding), + viewport_info); draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); scissor.left *= pixel_size_x; @@ -2033,7 +2027,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, UpdateSystemConstantValues( memexport_used, primitive_polygonal, line_loop_closing_index, indexed ? index_buffer_info->endianness : xenos::Endian::kNone, - viewport_info, pixel_size_x, pixel_size_y, used_texture_mask, early_z, + viewport_info, pixel_size_x, pixel_size_y, used_texture_mask, GetCurrentColorMask(pixel_shader), pipeline_render_targets); // Update constant buffers, descriptors and root parameters. 
@@ -2873,8 +2867,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( bool shared_memory_is_uav, bool primitive_polygonal, uint32_t line_loop_closing_index, xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x, - uint32_t pixel_size_y, uint32_t used_texture_mask, bool early_z, - uint32_t color_mask, + uint32_t pixel_size_y, uint32_t used_texture_mask, uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); @@ -2992,14 +2985,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( flags |= DxbcShaderTranslator::kSysFlag_KillIfAnyVertexKilled; } // Alpha test. - if (rb_colorcontrol.alpha_test_enable) { - flags |= uint32_t(rb_colorcontrol.alpha_func) - << DxbcShaderTranslator::kSysFlag_AlphaPassIfLess_Shift; - } else { - flags |= DxbcShaderTranslator::kSysFlag_AlphaPassIfLess | - DxbcShaderTranslator::kSysFlag_AlphaPassIfEqual | - DxbcShaderTranslator::kSysFlag_AlphaPassIfGreater; - } + xenos::CompareFunction alpha_test_function = + rb_colorcontrol.alpha_test_enable ? rb_colorcontrol.alpha_func + : xenos::CompareFunction::kAlways; + flags |= uint32_t(alpha_test_function) + << DxbcShaderTranslator::kSysFlag_AlphaPassIfLess_Shift; // Gamma writing. for (uint32_t i = 0; i < 4; ++i) { if (color_infos[i].color_format == @@ -3028,7 +3018,9 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( if (rb_depthcontrol.stencil_enable) { flags |= DxbcShaderTranslator::kSysFlag_ROVStencilTest; } - if (early_z) { + // Hint - if not applicable to the shader, will not have effect. 
+ if (alpha_test_function == xenos::CompareFunction::kAlways && + !rb_colorcontrol.alpha_to_mask_enable) { flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencilEarlyWrite; } } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index c75b5c203..a9181f1c3 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -27,6 +27,7 @@ #include "xenia/gpu/d3d12/render_target_cache.h" #include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/draw_util.h" +#include "xenia/gpu/dxbc_shader.h" #include "xenia/gpu/dxbc_shader_translator.h" #include "xenia/gpu/xenos.h" #include "xenia/kernel/kernel_state.h" @@ -47,7 +48,7 @@ class D3D12CommandProcessor : public CommandProcessor { void ClearCaches() override; - void InitializeShaderStorage(const std::filesystem::path& storage_root, + void InitializeShaderStorage(const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) override; void RequestFrameTrace(const std::filesystem::path& root_path) override; @@ -88,7 +89,7 @@ class D3D12CommandProcessor : public CommandProcessor { // there are 4 render targets bound with the same EDRAM base (clearly not // correct usage), but the shader only clears 1, and then EDRAM buffer stores // conflict with each other. - uint32_t GetCurrentColorMask(const D3D12Shader* pixel_shader) const; + uint32_t GetCurrentColorMask(const Shader* pixel_shader) const; void PushTransitionBarrier( ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state, @@ -100,8 +101,9 @@ class D3D12CommandProcessor : public CommandProcessor { void SubmitBarriers(); // Finds or creates root signature for a pipeline. 
- ID3D12RootSignature* GetRootSignature(const D3D12Shader* vertex_shader, - const D3D12Shader* pixel_shader); + ID3D12RootSignature* GetRootSignature(const DxbcShader* vertex_shader, + const DxbcShader* pixel_shader, + bool tessellated); ui::d3d12::D3D12UploadBufferPool& GetConstantBufferPool() const { return *constant_buffer_pool_; @@ -300,7 +302,7 @@ class D3D12CommandProcessor : public CommandProcessor { // Gets the indices of optional root parameters. Returns the total parameter // count. static uint32_t GetRootBindfulExtraParameterIndices( - const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, + const DxbcShader* vertex_shader, const DxbcShader* pixel_shader, RootBindfulExtraParameterIndices& indices_out); // BeginSubmission and EndSubmission may be called at any time. If there's an @@ -353,8 +355,7 @@ class D3D12CommandProcessor : public CommandProcessor { bool shared_memory_is_uav, bool primitive_polygonal, uint32_t line_loop_closing_index, xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x, - uint32_t pixel_size_y, uint32_t used_texture_mask, bool early_z, - uint32_t color_mask, + uint32_t pixel_size_y, uint32_t used_texture_mask, uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]); bool UpdateBindings(const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, diff --git a/src/xenia/gpu/d3d12/d3d12_shader.cc b/src/xenia/gpu/d3d12/d3d12_shader.cc index 0b5296a4f..672f1e37d 100644 --- a/src/xenia/gpu/d3d12/d3d12_shader.cc +++ b/src/xenia/gpu/d3d12/d3d12_shader.cc @@ -10,9 +10,11 @@ #include "xenia/gpu/d3d12/d3d12_shader.h" #include +#include #include "xenia/base/assert.h" #include "xenia/base/logging.h" +#include "xenia/gpu/dxbc_shader.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/ui/d3d12/d3d12_api.h" @@ -22,51 +24,13 @@ namespace d3d12 { D3D12Shader::D3D12Shader(xenos::ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t 
dword_count) - : Shader(shader_type, data_hash, dword_ptr, dword_count) {} + : DxbcShader(shader_type, data_hash, dword_ptr, dword_count) {} -void D3D12Shader::SetTexturesAndSamplers( - const DxbcShaderTranslator::TextureBinding* texture_bindings, - uint32_t texture_binding_count, - const DxbcShaderTranslator::SamplerBinding* sampler_bindings, - uint32_t sampler_binding_count) { - texture_bindings_.clear(); - texture_bindings_.reserve(texture_binding_count); - used_texture_mask_ = 0; - for (uint32_t i = 0; i < texture_binding_count; ++i) { - TextureBinding& binding = texture_bindings_.emplace_back(); - // For a stable hash. - std::memset(&binding, 0, sizeof(binding)); - const DxbcShaderTranslator::TextureBinding& translator_binding = - texture_bindings[i]; - binding.bindless_descriptor_index = - translator_binding.bindless_descriptor_index; - binding.fetch_constant = translator_binding.fetch_constant; - binding.dimension = translator_binding.dimension; - binding.is_signed = translator_binding.is_signed; - used_texture_mask_ |= 1u << translator_binding.fetch_constant; - } - sampler_bindings_.clear(); - sampler_bindings_.reserve(sampler_binding_count); - for (uint32_t i = 0; i < sampler_binding_count; ++i) { - SamplerBinding binding; - const DxbcShaderTranslator::SamplerBinding& translator_binding = - sampler_bindings[i]; - binding.bindless_descriptor_index = - translator_binding.bindless_descriptor_index; - binding.fetch_constant = translator_binding.fetch_constant; - binding.mag_filter = translator_binding.mag_filter; - binding.min_filter = translator_binding.min_filter; - binding.mip_filter = translator_binding.mip_filter; - binding.aniso_filter = translator_binding.aniso_filter; - sampler_bindings_.push_back(binding); - } -} - -void D3D12Shader::DisassembleDxbc(const ui::d3d12::D3D12Provider& provider, - bool disassemble_dxbc, - IDxbcConverter* dxbc_converter, - IDxcUtils* dxc_utils, - IDxcCompiler* dxc_compiler) { +void 
D3D12Shader::D3D12Translation::DisassembleDxbcAndDxil( + const ui::d3d12::D3D12Provider& provider, bool disassemble_dxbc, + IDxbcConverter* dxbc_converter, IDxcUtils* dxc_utils, + IDxcCompiler* dxc_compiler) { + std::string disassembly; bool is_first_disassembly = true; if (disassemble_dxbc) { ID3DBlob* dxbc_disassembly; @@ -77,11 +41,12 @@ void D3D12Shader::DisassembleDxbc(const ui::d3d12::D3D12Provider& provider, nullptr, &dxbc_disassembly))) { assert_true(is_first_disassembly); is_first_disassembly = false; - host_disassembly_.append( + disassembly.append( reinterpret_cast(dxbc_disassembly->GetBufferPointer())); dxbc_disassembly->Release(); } else { - XELOGE("Failed to disassemble DXBC shader {:016X}", ucode_data_hash()); + XELOGE("Failed to disassemble DXBC shader {:016X}", + shader().ucode_data_hash()); } } if (dxbc_converter && dxc_utils && dxc_compiler) { @@ -106,29 +71,36 @@ void D3D12Shader::DisassembleDxbc(const ui::d3d12::D3D12Provider& provider, dxil_disassembly->Release(); if (dxil_disassembly_got_utf8) { if (!is_first_disassembly) { - host_disassembly_.append("\n\n"); + disassembly.append("\n\n"); } is_first_disassembly = false; - host_disassembly_.append(reinterpret_cast( + disassembly.append(reinterpret_cast( dxil_disassembly_utf8->GetStringPointer())); dxil_disassembly_utf8->Release(); } else { XELOGE("Failed to get DXIL shader {:016X} disassembly as UTF-8", - ucode_data_hash()); + shader().ucode_data_hash()); } } else { XELOGE("Failed to disassemble DXIL shader {:016X}", - ucode_data_hash()); + shader().ucode_data_hash()); } } else { XELOGE("Failed to create a blob with DXIL shader {:016X}", - ucode_data_hash()); + shader().ucode_data_hash()); CoTaskMemFree(dxil); } } else { - XELOGE("Failed to convert shader {:016X} to DXIL", ucode_data_hash()); + XELOGE("Failed to convert shader {:016X} to DXIL", + shader().ucode_data_hash()); } } + set_host_disassembly(std::move(disassembly)); +} + +Shader::Translation* D3D12Shader::CreateTranslationInstance( + 
uint32_t modification) { + return new D3D12Translation(*this, modification); } } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/d3d12_shader.h b/src/xenia/gpu/d3d12/d3d12_shader.h index c24d6a00a..384e48a8a 100644 --- a/src/xenia/gpu/d3d12/d3d12_shader.h +++ b/src/xenia/gpu/d3d12/d3d12_shader.h @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2020 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -10,106 +10,62 @@ #ifndef XENIA_GPU_D3D12_D3D12_SHADER_H_ #define XENIA_GPU_D3D12_D3D12_SHADER_H_ -#include +#include -#include "xenia/gpu/dxbc_shader_translator.h" -#include "xenia/gpu/shader.h" -#include "xenia/gpu/xenos.h" +#include "xenia/gpu/dxbc_shader.h" #include "xenia/ui/d3d12/d3d12_provider.h" namespace xe { namespace gpu { namespace d3d12 { -class D3D12Shader : public Shader { +class D3D12Shader : public DxbcShader { public: + class D3D12Translation : public DxbcTranslation { + public: + D3D12Translation(D3D12Shader& shader, uint32_t modification) + : DxbcTranslation(shader, modification) {} + + void DisassembleDxbcAndDxil(const ui::d3d12::D3D12Provider& provider, + bool disassemble_dxbc, + IDxbcConverter* dxbc_converter = nullptr, + IDxcUtils* dxc_utils = nullptr, + IDxcCompiler* dxc_compiler = nullptr); + }; + D3D12Shader(xenos::ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t dword_count); - void SetTexturesAndSamplers( - const DxbcShaderTranslator::TextureBinding* texture_bindings, - uint32_t texture_binding_count, - const DxbcShaderTranslator::SamplerBinding* sampler_bindings, - uint32_t sampler_binding_count); - - void 
SetForcedEarlyZShaderObject(const std::vector& shader_object) { - forced_early_z_shader_ = shader_object; - } - // Returns the shader with forced early depth/stencil set with - // SetForcedEarlyZShader after translation. If there's none (for example, - // if the shader discards pixels or writes to the depth buffer), an empty - // vector is returned. - const std::vector& GetForcedEarlyZShaderObject() const { - return forced_early_z_shader_; - } - - void DisassembleDxbc(const ui::d3d12::D3D12Provider& provider, - bool disassemble_dxbc, - IDxbcConverter* dxbc_converter = nullptr, - IDxcUtils* dxc_utils = nullptr, - IDxcCompiler* dxc_compiler = nullptr); - - static constexpr uint32_t kMaxTextureBindingIndexBits = - DxbcShaderTranslator::kMaxTextureBindingIndexBits; - static constexpr uint32_t kMaxTextureBindings = - DxbcShaderTranslator::kMaxTextureBindings; - struct TextureBinding { - uint32_t bindless_descriptor_index; - uint32_t fetch_constant; - // Stacked and 3D are separate TextureBindings, even for bindless for null - // descriptor handling simplicity. - xenos::FetchOpDimension dimension; - bool is_signed; - }; - // Safe to hash and compare with memcmp for layout hashing. 
- const TextureBinding* GetTextureBindings(uint32_t& count_out) const { - count_out = uint32_t(texture_bindings_.size()); - return texture_bindings_.data(); - } - const uint32_t GetUsedTextureMask() const { return used_texture_mask_; } - - static constexpr uint32_t kMaxSamplerBindingIndexBits = - DxbcShaderTranslator::kMaxSamplerBindingIndexBits; - static constexpr uint32_t kMaxSamplerBindings = - DxbcShaderTranslator::kMaxSamplerBindings; - struct SamplerBinding { - uint32_t bindless_descriptor_index; - uint32_t fetch_constant; - xenos::TextureFilter mag_filter; - xenos::TextureFilter min_filter; - xenos::TextureFilter mip_filter; - xenos::AnisoFilter aniso_filter; - }; - const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const { - count_out = uint32_t(sampler_bindings_.size()); - return sampler_bindings_.data(); - } - - // For owning subsystems like the pipeline cache, accessors for unique + // For owning subsystem like the pipeline cache, accessors for unique // identifiers (used instead of hashes to make sure collisions can't happen) // of binding layouts used by the shader, for invalidation if a shader with an // incompatible layout was bound. size_t GetTextureBindingLayoutUserUID() const { return texture_binding_layout_user_uid_; } - void SetTextureBindingLayoutUserUID(size_t uid) { - texture_binding_layout_user_uid_ = uid; - } size_t GetSamplerBindingLayoutUserUID() const { return sampler_binding_layout_user_uid_; } + // Modifications of the same shader can be translated on different threads. + // The "set" function must only be called if "enter" returned true - these are + // set up only once. 
+ bool EnterBindingLayoutUserUIDSetup() { + return !binding_layout_user_uids_set_up_.test_and_set(); + } + void SetTextureBindingLayoutUserUID(size_t uid) { + texture_binding_layout_user_uid_ = uid; + } void SetSamplerBindingLayoutUserUID(size_t uid) { sampler_binding_layout_user_uid_ = uid; } + protected: + Translation* CreateTranslationInstance(uint32_t modification) override; + private: - std::vector texture_bindings_; - std::vector sampler_bindings_; + std::atomic_flag binding_layout_user_uids_set_up_ = ATOMIC_FLAG_INIT; size_t texture_binding_layout_user_uid_ = 0; size_t sampler_binding_layout_user_uid_ = 0; - uint32_t used_texture_mask_ = 0; - - std::vector forced_early_z_shader_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index b36ecfea8..e1b1cbeaf 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "third_party/fmt/include/fmt/format.h" @@ -63,19 +64,23 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/dxbc/continuous_triangle_hs.h" #include "xenia/gpu/d3d12/shaders/dxbc/discrete_quad_hs.h" #include "xenia/gpu/d3d12/shaders/dxbc/discrete_triangle_hs.h" +#include "xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.h" +#include "xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.h" #include "xenia/gpu/d3d12/shaders/dxbc/primitive_point_list_gs.h" #include "xenia/gpu/d3d12/shaders/dxbc/primitive_quad_list_gs.h" #include "xenia/gpu/d3d12/shaders/dxbc/primitive_rectangle_list_gs.h" #include "xenia/gpu/d3d12/shaders/dxbc/tessellation_vs.h" -PipelineCache::PipelineCache(D3D12CommandProcessor& command_processor, - const RegisterFile& register_file, - bool bindless_resources_used, bool edram_rov_used, - uint32_t resolution_scale) +PipelineCache::PipelineCache( + D3D12CommandProcessor& command_processor, const RegisterFile& register_file, + bool bindless_resources_used, bool 
edram_rov_used, + flags::DepthFloat24Conversion depth_float24_conversion, + uint32_t resolution_scale) : command_processor_(command_processor), register_file_(register_file), bindless_resources_used_(bindless_resources_used), edram_rov_used_(edram_rov_used), + depth_float24_conversion_(depth_float24_conversion), resolution_scale_(resolution_scale) { auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); @@ -177,10 +182,10 @@ void PipelineCache::Shutdown() { void PipelineCache::ClearCache(bool shutting_down) { bool reinitialize_shader_storage = !shutting_down && storage_write_thread_ != nullptr; - std::filesystem::path shader_storage_root; + std::filesystem::path shader_storage_cache_root; uint32_t shader_storage_title_id = shader_storage_title_id_; if (reinitialize_shader_storage) { - shader_storage_root = shader_storage_root_; + shader_storage_cache_root = shader_storage_cache_root_; } ShutdownShaderStorage(); @@ -226,19 +231,19 @@ void PipelineCache::ClearCache(bool shutting_down) { delete it.second; } shaders_.clear(); + shader_storage_index_ = 0; if (reinitialize_shader_storage) { - InitializeShaderStorage(shader_storage_root, shader_storage_title_id, + InitializeShaderStorage(shader_storage_cache_root, shader_storage_title_id, false); } } void PipelineCache::InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking) { + const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) { ShutdownShaderStorage(); - auto shader_storage_root = storage_root / "shaders"; + auto shader_storage_root = cache_root / "shaders"; // For files that can be moved between different hosts. 
// Host PSO blobs - if ever added - should be stored in shaders/local/ (they // currently aren't used because because they may be not very practical - @@ -256,6 +261,90 @@ void PipelineCache::InitializeShaderStorage( } } + // Initialize the pipeline storage stream - read pipeline descriptions and + // collect used shader modifications to translate. + std::vector pipeline_stored_descriptions; + // . + std::set> shader_translations_needed; + auto pipeline_storage_file_path = + shader_storage_shareable_root / + fmt::format("{:08X}.{}.d3d12.xpso", title_id, + edram_rov_used_ ? "rov" : "rtv"); + pipeline_storage_file_ = + xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b"); + if (!pipeline_storage_file_) { + XELOGE( + "Failed to open the Direct3D 12 pipeline description storage file for " + "writing, persistent shader storage will be disabled: {}", + xe::path_to_utf8(pipeline_storage_file_path)); + return; + } + pipeline_storage_file_flush_needed_ = false; + // 'XEPS'. + const uint32_t pipeline_storage_magic = 0x53504558; + // 'DXRO' or 'DXRT'. + const uint32_t pipeline_storage_magic_api = + edram_rov_used_ ? 
0x4F525844 : 0x54525844; + const uint32_t pipeline_storage_version_swapped = + xe::byte_swap(std::max(PipelineDescription::kVersion, + DxbcShaderTranslator::Modification::kVersion)); + struct { + uint32_t magic; + uint32_t magic_api; + uint32_t version_swapped; + uint32_t device_features; + } pipeline_storage_file_header; + if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), + 1, pipeline_storage_file_) && + pipeline_storage_file_header.magic == pipeline_storage_magic && + pipeline_storage_file_header.magic_api == pipeline_storage_magic_api && + pipeline_storage_file_header.version_swapped == + pipeline_storage_version_swapped) { + xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END); + int64_t pipeline_storage_told_end = + xe::filesystem::Tell(pipeline_storage_file_); + size_t pipeline_storage_told_count = + size_t(pipeline_storage_told_end >= + int64_t(sizeof(pipeline_storage_file_header)) + ? (uint64_t(pipeline_storage_told_end) - + sizeof(pipeline_storage_file_header)) / + sizeof(PipelineStoredDescription) + : 0); + if (pipeline_storage_told_count && + xe::filesystem::Seek(pipeline_storage_file_, + int64_t(sizeof(pipeline_storage_file_header)), + SEEK_SET)) { + pipeline_stored_descriptions.resize(pipeline_storage_told_count); + pipeline_stored_descriptions.resize( + fread(pipeline_stored_descriptions.data(), + sizeof(PipelineStoredDescription), pipeline_storage_told_count, + pipeline_storage_file_)); + size_t pipeline_storage_read_count = pipeline_stored_descriptions.size(); + for (size_t i = 0; i < pipeline_storage_read_count; ++i) { + const PipelineStoredDescription& pipeline_stored_description = + pipeline_stored_descriptions[i]; + // Validate file integrity, stop and truncate the stream if data is + // corrupted. 
+ if (XXH64(&pipeline_stored_description.description, + sizeof(pipeline_stored_description.description), + 0) != pipeline_stored_description.description_hash) { + pipeline_stored_descriptions.resize(i); + break; + } + // Mark the shader modifications as needed for translation. + shader_translations_needed.emplace( + pipeline_stored_description.description.vertex_shader_hash, + pipeline_stored_description.description.vertex_shader_modification); + if (pipeline_stored_description.description.pixel_shader_hash) { + shader_translations_needed.emplace( + pipeline_stored_description.description.pixel_shader_hash, + pipeline_stored_description.description + .pixel_shader_modification); + } + } + } + } + size_t logical_processor_count = xe::threading::logical_processor_count(); if (!logical_processor_count) { // Pick some reasonable amount if couldn't determine the number of cores. @@ -274,8 +363,11 @@ void PipelineCache::InitializeShaderStorage( "Failed to open the guest shader storage file for writing, persistent " "shader storage will be disabled: {}", xe::path_to_utf8(shader_storage_file_path)); + fclose(pipeline_storage_file_); + pipeline_storage_file_ = nullptr; return; } + ++shader_storage_index_; shader_storage_file_flush_needed_ = false; struct { uint32_t magic; @@ -299,12 +391,12 @@ void PipelineCache::InitializeShaderStorage( // Threads overlapping file reading. 
std::mutex shaders_translation_thread_mutex; std::condition_variable shaders_translation_thread_cond; - std::deque> + std::deque> shaders_to_translate; size_t shader_translation_threads_busy = 0; bool shader_translation_threads_shutdown = false; std::mutex shaders_failed_to_translate_mutex; - std::vector shaders_failed_to_translate; + std::vector shaders_failed_to_translate; auto shader_translation_thread_function = [&]() { auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); DxbcShaderTranslator translator( @@ -324,7 +416,8 @@ void PipelineCache::InitializeShaderStorage( IID_PPV_ARGS(&dxc_compiler)); } for (;;) { - std::pair shader_to_translate; + std::pair + shader_to_translate; for (;;) { std::unique_lock lock(shaders_translation_thread_mutex); if (shaders_to_translate.empty()) { @@ -340,11 +433,9 @@ void PipelineCache::InitializeShaderStorage( break; } assert_not_null(shader_to_translate.second); - if (!TranslateShader( - translator, *shader_to_translate.second, - shader_to_translate.first.sq_program_cntl, dxbc_converter, - dxc_utils, dxc_compiler, - shader_to_translate.first.host_vertex_shader_type)) { + if (!TranslateShader(translator, *shader_to_translate.second, + shader_to_translate.first.sq_program_cntl, + dxbc_converter, dxc_utils, dxc_compiler)) { std::lock_guard lock(shaders_failed_to_translate_mutex); shaders_failed_to_translate.push_back(shader_to_translate.second); } @@ -373,18 +464,6 @@ void PipelineCache::InitializeShaderStorage( } size_t ucode_byte_count = shader_header.ucode_dword_count * sizeof(uint32_t); - if (shaders_.find(shader_header.ucode_data_hash) != shaders_.end()) { - // Already added - usually shaders aren't added without the intention of - // translating them imminently, so don't do additional checks to - // actually ensure that translation happens right now (they would cause - // a race condition with shaders currently queued for translation). 
- if (!xe::filesystem::Seek(shader_storage_file_, - int64_t(ucode_byte_count), SEEK_CUR)) { - break; - } - shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count; - continue; - } ucode_dwords.resize(shader_header.ucode_dword_count); if (shader_header.ucode_dword_count && !fread(ucode_dwords.data(), ucode_byte_count, 1, @@ -397,34 +476,60 @@ void PipelineCache::InitializeShaderStorage( // Validation failed. break; } - D3D12Shader* shader = - new D3D12Shader(shader_header.type, ucode_data_hash, - ucode_dwords.data(), shader_header.ucode_dword_count); - shaders_.emplace(ucode_data_hash, shader); - // Create new threads if the currently existing threads can't keep up with - // file reading, but not more than the number of logical processors minus - // one. - size_t shader_translation_threads_needed; - { - std::lock_guard lock(shaders_translation_thread_mutex); - shader_translation_threads_needed = - std::min(shader_translation_threads_busy + - shaders_to_translate.size() + size_t(1), - logical_processor_count - size_t(1)); - } - while (shader_translation_threads.size() < - shader_translation_threads_needed) { - shader_translation_threads.push_back(xe::threading::Thread::Create( - {}, shader_translation_thread_function)); - shader_translation_threads.back()->set_name("Shader Translation"); - } - { - std::lock_guard lock(shaders_translation_thread_mutex); - shaders_to_translate.emplace_back(shader_header, shader); - } - shaders_translation_thread_cond.notify_one(); shader_storage_valid_bytes += sizeof(shader_header) + ucode_byte_count; - ++shaders_translated; + // Only add the shader if needed. 
+ auto modification_it = shader_translations_needed.lower_bound( + std::make_pair(ucode_data_hash, uint32_t(0))); + if (modification_it == shader_translations_needed.end() || + modification_it->first != ucode_data_hash) { + continue; + } + D3D12Shader* shader = + LoadShader(shader_header.type, ucode_dwords.data(), + shader_header.ucode_dword_count, ucode_data_hash); + // Loaded from the current storage - don't write again. + shader->set_ucode_storage_index(shader_storage_index_); + // Translate all the needed modifications. + for (; modification_it != shader_translations_needed.end() && + modification_it->first == ucode_data_hash; + ++modification_it) { + bool translation_is_new; + D3D12Shader::D3D12Translation* translation = + static_cast( + shader->GetOrCreateTranslation(modification_it->second, + &translation_is_new)); + if (!translation_is_new) { + // Already added - usually shaders aren't added without the intention + // of translating them imminently, so don't do additional checks to + // actually ensure that translation happens right now (they would + // cause a race condition with shaders currently queued for + // translation). + continue; + } + // Create new threads if the currently existing threads can't keep up + // with file reading, but not more than the number of logical processors + // minus one. 
+ size_t shader_translation_threads_needed; + { + std::lock_guard lock(shaders_translation_thread_mutex); + shader_translation_threads_needed = + std::min(shader_translation_threads_busy + + shaders_to_translate.size() + size_t(1), + logical_processor_count - size_t(1)); + } + while (shader_translation_threads.size() < + shader_translation_threads_needed) { + shader_translation_threads.push_back(xe::threading::Thread::Create( + {}, shader_translation_thread_function)); + shader_translation_threads.back()->set_name("Shader Translation"); + } + { + std::lock_guard lock(shaders_translation_thread_mutex); + shaders_to_translate.emplace_back(shader_header, translation); + } + shaders_translation_thread_cond.notify_one(); + ++shaders_translated; + } } if (!shader_translation_threads.empty()) { { @@ -436,9 +541,14 @@ void PipelineCache::InitializeShaderStorage( xe::threading::Wait(shader_translation_thread.get(), false); } shader_translation_threads.clear(); - for (D3D12Shader* shader : shaders_failed_to_translate) { - shaders_.erase(shader->ucode_data_hash()); - delete shader; + for (D3D12Shader::D3D12Translation* translation : + shaders_failed_to_translate) { + D3D12Shader* shader = static_cast(&translation->shader()); + shader->DestroyTranslation(translation->modification()); + if (shader->translations().empty()) { + shaders_.erase(shader->ucode_data_hash()); + delete shader; + } } } XELOGGPU("Translated {} shaders from the storage in {} milliseconds", @@ -457,220 +567,177 @@ void PipelineCache::InitializeShaderStorage( shader_storage_file_); } - // 'DXRO' or 'DXRT'. - const uint32_t pipeline_storage_magic_api = - edram_rov_used_ ? 0x4F525844 : 0x54525844; + // Create the pipelines. + if (!pipeline_stored_descriptions.empty()) { + uint64_t pipeline_creation_start_ = xe::Clock::QueryHostTickCount(); - // Initialize the pipeline storage stream. 
- uint64_t pipeline_storage_initialization_start_ = - xe::Clock::QueryHostTickCount(); - auto pipeline_storage_file_path = - shader_storage_shareable_root / - fmt::format("{:08X}.{}.d3d12.xpso", title_id, - edram_rov_used_ ? "rov" : "rtv"); - pipeline_storage_file_ = - xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b"); - if (!pipeline_storage_file_) { - XELOGE( - "Failed to open the Direct3D 12 pipeline description storage file for " - "writing, persistent shader storage will be disabled: {}", - xe::path_to_utf8(pipeline_storage_file_path)); - fclose(shader_storage_file_); - shader_storage_file_ = nullptr; - return; - } - pipeline_storage_file_flush_needed_ = false; - // 'XEPS'. - const uint32_t pipeline_storage_magic = 0x53504558; - struct { - uint32_t magic; - uint32_t magic_api; - uint32_t version_swapped; - } pipeline_storage_file_header; - if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), - 1, pipeline_storage_file_) && - pipeline_storage_file_header.magic == pipeline_storage_magic && - pipeline_storage_file_header.magic_api == pipeline_storage_magic_api && - xe::byte_swap(pipeline_storage_file_header.version_swapped) == - PipelineDescription::kVersion) { - uint64_t pipeline_storage_valid_bytes = - sizeof(pipeline_storage_file_header); - // Enqueue pipeline descriptions written by previous Xenia executions until - // the end of the file or until a corrupted one is detected. - xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END); - int64_t pipeline_storage_told_end = - xe::filesystem::Tell(pipeline_storage_file_); - size_t pipeline_storage_told_count = size_t( - pipeline_storage_told_end >= int64_t(pipeline_storage_valid_bytes) - ? 
(uint64_t(pipeline_storage_told_end) - - pipeline_storage_valid_bytes) / - sizeof(PipelineStoredDescription) - : 0); - if (pipeline_storage_told_count && - xe::filesystem::Seek(pipeline_storage_file_, - int64_t(pipeline_storage_valid_bytes), SEEK_SET)) { - std::vector pipeline_stored_descriptions; - pipeline_stored_descriptions.resize(pipeline_storage_told_count); - pipeline_stored_descriptions.resize( - fread(pipeline_stored_descriptions.data(), - sizeof(PipelineStoredDescription), pipeline_storage_told_count, - pipeline_storage_file_)); - if (!pipeline_stored_descriptions.empty()) { - // Launch additional creation threads to use all cores to create - // pipelines faster. Will also be using the main thread, so minus 1. - size_t creation_thread_original_count = creation_threads_.size(); - size_t creation_thread_needed_count = - std::max(std::min(pipeline_stored_descriptions.size(), - logical_processor_count) - - size_t(1), - creation_thread_original_count); - while (creation_threads_.size() < creation_thread_original_count) { - size_t creation_thread_index = creation_threads_.size(); - std::unique_ptr creation_thread = - xe::threading::Thread::Create( - {}, [this, creation_thread_index]() { - CreationThread(creation_thread_index); - }); - creation_thread->set_name("D3D12 Pipelines"); - creation_threads_.push_back(std::move(creation_thread)); - } - size_t pipelines_created = 0; - for (const PipelineStoredDescription& pipeline_stored_description : - pipeline_stored_descriptions) { - const PipelineDescription& pipeline_description = - pipeline_stored_description.description; - // Validate file integrity, stop and truncate the stream if data is - // corrupted. - if (XXH64(&pipeline_stored_description.description, - sizeof(pipeline_stored_description.description), - 0) != pipeline_stored_description.description_hash) { - break; - } - pipeline_storage_valid_bytes += sizeof(PipelineStoredDescription); - // Skip already known pipelines - those have already been enqueued. 
- auto found_range = pipelines_.equal_range( - pipeline_stored_description.description_hash); - bool pipeline_found = false; - for (auto it = found_range.first; it != found_range.second; ++it) { - Pipeline* found_pipeline = it->second; - if (!std::memcmp(&found_pipeline->description.description, - &pipeline_description, - sizeof(pipeline_description))) { - pipeline_found = true; - break; - } - } - if (pipeline_found) { - continue; - } + // Launch additional creation threads to use all cores to create + // pipelines faster. Will also be using the main thread, so minus 1. + size_t creation_thread_original_count = creation_threads_.size(); + size_t creation_thread_needed_count = std::max( + std::min(pipeline_stored_descriptions.size(), logical_processor_count) - + size_t(1), + creation_thread_original_count); + while (creation_threads_.size() < creation_thread_original_count) { + size_t creation_thread_index = creation_threads_.size(); + std::unique_ptr creation_thread = + xe::threading::Thread::Create({}, [this, creation_thread_index]() { + CreationThread(creation_thread_index); + }); + creation_thread->set_name("D3D12 Pipelines"); + creation_threads_.push_back(std::move(creation_thread)); + } - PipelineRuntimeDescription pipeline_runtime_description; - auto vertex_shader_it = - shaders_.find(pipeline_description.vertex_shader_hash); - if (vertex_shader_it == shaders_.end()) { - continue; - } - pipeline_runtime_description.vertex_shader = vertex_shader_it->second; - if (!pipeline_runtime_description.vertex_shader->is_valid()) { - continue; - } - if (pipeline_description.pixel_shader_hash) { - auto pixel_shader_it = - shaders_.find(pipeline_description.pixel_shader_hash); - if (pixel_shader_it == shaders_.end()) { - continue; - } - pipeline_runtime_description.pixel_shader = pixel_shader_it->second; - if (!pipeline_runtime_description.pixel_shader->is_valid()) { - continue; - } - } else { - pipeline_runtime_description.pixel_shader = nullptr; - } - 
pipeline_runtime_description.root_signature = - command_processor_.GetRootSignature( - pipeline_runtime_description.vertex_shader, - pipeline_runtime_description.pixel_shader); - if (!pipeline_runtime_description.root_signature) { - continue; - } - std::memcpy(&pipeline_runtime_description.description, - &pipeline_description, sizeof(pipeline_description)); + size_t pipelines_created = 0; + for (const PipelineStoredDescription& pipeline_stored_description : + pipeline_stored_descriptions) { + const PipelineDescription& pipeline_description = + pipeline_stored_description.description; + // Skip already known pipelines - those have already been enqueued. + auto found_range = + pipelines_.equal_range(pipeline_stored_description.description_hash); + bool pipeline_found = false; + for (auto it = found_range.first; it != found_range.second; ++it) { + Pipeline* found_pipeline = it->second; + if (!std::memcmp(&found_pipeline->description.description, + &pipeline_description, sizeof(pipeline_description))) { + pipeline_found = true; + break; + } + } + if (pipeline_found) { + continue; + } - Pipeline* new_pipeline = new Pipeline; - new_pipeline->state = nullptr; - std::memcpy(&new_pipeline->description, &pipeline_runtime_description, - sizeof(pipeline_runtime_description)); - pipelines_.emplace(pipeline_stored_description.description_hash, - new_pipeline); - COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); - if (!creation_threads_.empty()) { - // Submit the pipeline for creation to any available thread. 
- { - std::lock_guard lock(creation_request_lock_); - creation_queue_.push_back(new_pipeline); - } - creation_request_cond_.notify_one(); - } else { - new_pipeline->state = - CreateD3D12Pipeline(pipeline_runtime_description); - } - ++pipelines_created; + PipelineRuntimeDescription pipeline_runtime_description; + auto vertex_shader_it = + shaders_.find(pipeline_description.vertex_shader_hash); + if (vertex_shader_it == shaders_.end()) { + continue; + } + D3D12Shader* vertex_shader = vertex_shader_it->second; + pipeline_runtime_description.vertex_shader = + static_cast( + vertex_shader->GetTranslation( + pipeline_description.vertex_shader_modification)); + if (!pipeline_runtime_description.vertex_shader || + !pipeline_runtime_description.vertex_shader->is_valid()) { + continue; + } + D3D12Shader* pixel_shader; + if (pipeline_description.pixel_shader_hash) { + auto pixel_shader_it = + shaders_.find(pipeline_description.pixel_shader_hash); + if (pixel_shader_it == shaders_.end()) { + continue; } - CreateQueuedPipelinesOnProcessorThread(); - if (creation_threads_.size() > creation_thread_original_count) { - { - std::lock_guard lock(creation_request_lock_); - creation_threads_shutdown_from_ = creation_thread_original_count; - // Assuming the queue is empty because of - // CreateQueuedPipelinesOnProcessorThread. - } - creation_request_cond_.notify_all(); - while (creation_threads_.size() > creation_thread_original_count) { - xe::threading::Wait(creation_threads_.back().get(), false); - creation_threads_.pop_back(); - } - bool await_creation_completion_event; - { - // Cleanup so additional threads can be created later again. - std::lock_guard lock(creation_request_lock_); - creation_threads_shutdown_from_ = SIZE_MAX; - // If the invocation is blocking, all the shader storage - // initialization is expected to be done before proceeding, to avoid - // latency in the command processor after the invocation. 
- await_creation_completion_event = - blocking && creation_threads_busy_ != 0; - if (await_creation_completion_event) { - creation_completion_event_->Reset(); - creation_completion_set_event_ = true; - } - } - if (await_creation_completion_event) { - creation_request_cond_.notify_one(); - xe::threading::Wait(creation_completion_event_.get(), false); - } + pixel_shader = pixel_shader_it->second; + pipeline_runtime_description.pixel_shader = + static_cast( + pixel_shader->GetTranslation( + pipeline_description.pixel_shader_modification)); + if (!pipeline_runtime_description.pixel_shader || + !pipeline_runtime_description.pixel_shader->is_valid()) { + continue; } - XELOGGPU( - "Created {} graphics pipelines from the storage in {} milliseconds", - pipelines_created, - (xe::Clock::QueryHostTickCount() - - pipeline_storage_initialization_start_) * - 1000 / xe::Clock::QueryHostTickFrequency()); + } else { + pixel_shader = nullptr; + pipeline_runtime_description.pixel_shader = nullptr; + } + pipeline_runtime_description.root_signature = + command_processor_.GetRootSignature( + vertex_shader, pixel_shader, + DxbcShaderTranslator::Modification( + pipeline_description.vertex_shader_modification) + .host_vertex_shader_type != + Shader::HostVertexShaderType::kVertex); + if (!pipeline_runtime_description.root_signature) { + continue; + } + std::memcpy(&pipeline_runtime_description.description, + &pipeline_description, sizeof(pipeline_description)); + + Pipeline* new_pipeline = new Pipeline; + new_pipeline->state = nullptr; + std::memcpy(&new_pipeline->description, &pipeline_runtime_description, + sizeof(pipeline_runtime_description)); + pipelines_.emplace(pipeline_stored_description.description_hash, + new_pipeline); + COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); + if (!creation_threads_.empty()) { + // Submit the pipeline for creation to any available thread. 
+ { + std::lock_guard lock(creation_request_lock_); + creation_queue_.push_back(new_pipeline); + } + creation_request_cond_.notify_one(); + } else { + new_pipeline->state = CreateD3D12Pipeline(pipeline_runtime_description); + } + ++pipelines_created; + } + + CreateQueuedPipelinesOnProcessorThread(); + if (creation_threads_.size() > creation_thread_original_count) { + { + std::lock_guard lock(creation_request_lock_); + creation_threads_shutdown_from_ = creation_thread_original_count; + // Assuming the queue is empty because of + // CreateQueuedPipelinesOnProcessorThread. + } + creation_request_cond_.notify_all(); + while (creation_threads_.size() > creation_thread_original_count) { + xe::threading::Wait(creation_threads_.back().get(), false); + creation_threads_.pop_back(); + } + bool await_creation_completion_event; + { + // Cleanup so additional threads can be created later again. + std::lock_guard lock(creation_request_lock_); + creation_threads_shutdown_from_ = SIZE_MAX; + // If the invocation is blocking, all the shader storage initialization + // is expected to be done before proceeding, to avoid latency in the + // command processor after the invocation. 
+ await_creation_completion_event = + blocking && creation_threads_busy_ != 0; + if (await_creation_completion_event) { + creation_completion_event_->Reset(); + creation_completion_set_event_ = true; + } + } + if (await_creation_completion_event) { + creation_request_cond_.notify_one(); + xe::threading::Wait(creation_completion_event_.get(), false); } } - xe::filesystem::TruncateStdioFile(pipeline_storage_file_, - pipeline_storage_valid_bytes); + + XELOGGPU( + "Created {} graphics pipelines (not including reading the " + "descriptions) from the storage in {} milliseconds", + pipelines_created, + (xe::Clock::QueryHostTickCount() - pipeline_creation_start_) * 1000 / + xe::Clock::QueryHostTickFrequency()); + // If any pipeline descriptions were corrupted (or the whole file has excess + // bytes in the end), truncate to the last valid pipeline description. + xe::filesystem::TruncateStdioFile( + pipeline_storage_file_, + uint64_t(sizeof(pipeline_storage_file_header) + + sizeof(PipelineStoredDescription) * + pipeline_stored_descriptions.size())); } else { xe::filesystem::TruncateStdioFile(pipeline_storage_file_, 0); pipeline_storage_file_header.magic = pipeline_storage_magic; pipeline_storage_file_header.magic_api = pipeline_storage_magic_api; pipeline_storage_file_header.version_swapped = - xe::byte_swap(PipelineDescription::kVersion); + pipeline_storage_version_swapped; + // Reserved for future (for Vulkan) - host device features affecting legal + // pipeline descriptions. + pipeline_storage_file_header.device_features = 0; fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), 1, pipeline_storage_file_); } - shader_storage_root_ = storage_root; + shader_storage_cache_root_ = cache_root; shader_storage_title_id_ = title_id; // Start the storage writing thread. 
@@ -706,7 +773,7 @@ void PipelineCache::ShutdownShaderStorage() { shader_storage_file_flush_needed_ = false; } - shader_storage_root_.clear(); + shader_storage_cache_root_.clear(); shader_storage_title_id_ = 0; } @@ -757,11 +824,17 @@ bool PipelineCache::IsCreatingPipelines() { } D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, - uint32_t guest_address, const uint32_t* host_address, uint32_t dword_count) { // Hash the input memory and lookup the shader. - uint64_t data_hash = XXH64(host_address, dword_count * sizeof(uint32_t), 0); + return LoadShader(shader_type, host_address, dword_count, + XXH64(host_address, dword_count * sizeof(uint32_t), 0)); +} + +D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, + const uint32_t* host_address, + uint32_t dword_count, + uint64_t data_hash) { auto it = shaders_.find(data_hash); if (it != shaders_.end()) { // Shader has been previously loaded. @@ -774,12 +847,64 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, D3D12Shader* shader = new D3D12Shader(shader_type, data_hash, host_address, dword_count); shaders_.emplace(data_hash, shader); + if (!cvars::dump_shaders.empty()) { + shader->DumpUcodeBinary(cvars::dump_shaders); + } return shader; } -Shader::HostVertexShaderType PipelineCache::GetHostVertexShaderTypeIfValid() - const { +bool PipelineCache::GetCurrentShaderModifications( + DxbcShaderTranslator::Modification& vertex_shader_modification_out, + DxbcShaderTranslator::Modification& pixel_shader_modification_out) const { + Shader::HostVertexShaderType host_vertex_shader_type = + GetCurrentHostVertexShaderTypeIfValid(); + if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) { + return false; + } + vertex_shader_modification_out = DxbcShaderTranslator::Modification( + shader_translator_->GetDefaultModification(xenos::ShaderType::kVertex, + host_vertex_shader_type)); + DxbcShaderTranslator::Modification pixel_shader_modification( + 
shader_translator_->GetDefaultModification(xenos::ShaderType::kPixel)); + if (!edram_rov_used_) { + const auto& regs = register_file_; + using DepthStencilMode = + DxbcShaderTranslator::Modification::DepthStencilMode; + if ((depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnOutputTruncating || + depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnOutputRounding) && + regs.Get().z_enable && + regs.Get().depth_format == + xenos::DepthRenderTargetFormat::kD24FS8) { + pixel_shader_modification.depth_stencil_mode = + depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnOutputTruncating + ? DepthStencilMode::kFloat24Truncating + : DepthStencilMode::kFloat24Rounding; + } else { + // Hint to enable early depth/stencil writing if possible - whether it + // will actually take effect depends on the shader itself, it's not known + // before translation. + auto rb_colorcontrol = regs.Get(); + if ((!rb_colorcontrol.alpha_test_enable || + rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) && + !rb_colorcontrol.alpha_to_mask_enable) { + pixel_shader_modification.depth_stencil_mode = + DepthStencilMode::kEarlyHint; + } else { + pixel_shader_modification.depth_stencil_mode = + DepthStencilMode::kNoModifiers; + } + } + } + pixel_shader_modification_out = pixel_shader_modification; + return true; +} + +Shader::HostVertexShaderType +PipelineCache::GetCurrentHostVertexShaderTypeIfValid() const { // If the values this functions returns are changed, INVALIDATE THE SHADER // STORAGE (increase kVersion for BOTH shaders and pipelines)! 
The exception // is when the function originally returned "unsupported", but started to @@ -855,8 +980,8 @@ Shader::HostVertexShaderType PipelineCache::GetHostVertexShaderTypeIfValid() } bool PipelineCache::EnsureShadersTranslated( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, - Shader::HostVertexShaderType host_vertex_shader_type) { + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader) { const auto& regs = register_file_; auto sq_program_cntl = regs.Get(); @@ -869,18 +994,19 @@ bool PipelineCache::EnsureShadersTranslated( if (!vertex_shader->is_translated()) { if (!TranslateShader(*shader_translator_, *vertex_shader, sq_program_cntl, - dxbc_converter_, dxc_utils_, dxc_compiler_, - host_vertex_shader_type)) { + dxbc_converter_, dxc_utils_, dxc_compiler_)) { XELOGE("Failed to translate the vertex shader!"); return false; } - if (shader_storage_file_) { + if (shader_storage_file_ && vertex_shader->shader().ucode_storage_index() != + shader_storage_index_) { + vertex_shader->shader().set_ucode_storage_index(shader_storage_index_); assert_not_null(storage_write_thread_); shader_storage_file_flush_needed_ = true; { std::lock_guard lock(storage_write_request_lock_); storage_write_shader_queue_.push_back( - std::make_pair(vertex_shader, sq_program_cntl)); + std::make_pair(&vertex_shader->shader(), sq_program_cntl)); } storage_write_request_cond_.notify_all(); } @@ -892,13 +1018,15 @@ bool PipelineCache::EnsureShadersTranslated( XELOGE("Failed to translate the pixel shader!"); return false; } - if (shader_storage_file_) { + if (shader_storage_file_ && + pixel_shader->shader().ucode_storage_index() != shader_storage_index_) { + pixel_shader->shader().set_ucode_storage_index(shader_storage_index_); assert_not_null(storage_write_thread_); shader_storage_file_flush_needed_ = true; { std::lock_guard lock(storage_write_request_lock_); storage_write_shader_queue_.push_back( - std::make_pair(pixel_shader, sq_program_cntl)); + 
std::make_pair(&pixel_shader->shader(), sq_program_cntl)); } storage_write_request_cond_.notify_all(); } @@ -908,9 +1036,9 @@ bool PipelineCache::EnsureShadersTranslated( } bool PipelineCache::ConfigurePipeline( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader, xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, - bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES @@ -922,7 +1050,7 @@ bool PipelineCache::ConfigurePipeline( PipelineRuntimeDescription runtime_description; if (!GetCurrentStateDescription(vertex_shader, pixel_shader, primitive_type, - index_format, early_z, render_targets, + index_format, render_targets, runtime_description)) { return false; } @@ -950,9 +1078,7 @@ bool PipelineCache::ConfigurePipeline( } } - if (!EnsureShadersTranslated( - vertex_shader, pixel_shader, - Shader::HostVertexShaderType(description.host_vertex_shader_type))) { + if (!EnsureShadersTranslated(vertex_shader, pixel_shader)) { return false; } @@ -995,14 +1121,17 @@ bool PipelineCache::ConfigurePipeline( return true; } -bool PipelineCache::TranslateShader( - DxbcShaderTranslator& translator, D3D12Shader& shader, - reg::SQ_PROGRAM_CNTL cntl, IDxbcConverter* dxbc_converter, - IDxcUtils* dxc_utils, IDxcCompiler* dxc_compiler, - Shader::HostVertexShaderType host_vertex_shader_type) { +bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator, + D3D12Shader::D3D12Translation& translation, + reg::SQ_PROGRAM_CNTL cntl, + IDxbcConverter* dxbc_converter, + IDxcUtils* dxc_utils, + IDxcCompiler* dxc_compiler) { + D3D12Shader& shader = static_cast(translation.shader()); + // Perform translation. // If this fails the shader will be marked as invalid and ignored later. 
- if (!translator.Translate(&shader, cntl, host_vertex_shader_type)) { + if (!translator.Translate(translation, cntl)) { XELOGE("Shader {:016X} translation failed; marking as ignored", shader.ucode_data_hash()); return false; @@ -1010,7 +1139,8 @@ bool PipelineCache::TranslateShader( const char* host_shader_type; if (shader.type() == xenos::ShaderType::kVertex) { - switch (shader.host_vertex_shader_type()) { + DxbcShaderTranslator::Modification modification(translation.modification()); + switch (modification.host_vertex_shader_type) { case Shader::HostVertexShaderType::kLineDomainCPIndexed: host_shader_type = "control-point-indexed line domain"; break; @@ -1039,169 +1169,156 @@ bool PipelineCache::TranslateShader( shader.ucode_dword_count() * 4, shader.ucode_data_hash(), shader.ucode_disassembly().c_str()); - // Set up texture and sampler bindings. - uint32_t texture_binding_count; - const DxbcShaderTranslator::TextureBinding* translator_texture_bindings = - translator.GetTextureBindings(texture_binding_count); - uint32_t sampler_binding_count; - const DxbcShaderTranslator::SamplerBinding* sampler_bindings = - translator.GetSamplerBindings(sampler_binding_count); - shader.SetTexturesAndSamplers(translator_texture_bindings, - texture_binding_count, sampler_bindings, - sampler_binding_count); - assert_false(bindless_resources_used_ && - texture_binding_count + sampler_binding_count > - D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4); - // Get hashable texture bindings, without translator-specific info. - const D3D12Shader::TextureBinding* texture_bindings = - shader.GetTextureBindings(texture_binding_count); - size_t texture_binding_layout_bytes = - texture_binding_count * sizeof(*texture_bindings); - uint64_t texture_binding_layout_hash = 0; - if (texture_binding_count) { - texture_binding_layout_hash = - XXH64(texture_bindings, texture_binding_layout_bytes, 0); - } - uint32_t bindless_sampler_count = - bindless_resources_used_ ? 
sampler_binding_count : 0; - uint64_t bindless_sampler_layout_hash = 0; - if (bindless_sampler_count) { - XXH64_state_t hash_state; - XXH64_reset(&hash_state, 0); - for (uint32_t i = 0; i < bindless_sampler_count; ++i) { - XXH64_update(&hash_state, &sampler_bindings[i].bindless_descriptor_index, - sizeof(sampler_bindings[i].bindless_descriptor_index)); - } - bindless_sampler_layout_hash = XXH64_digest(&hash_state); - } - // Obtain the unique IDs of binding layouts if there are any texture bindings - // or bindless samplers, for invalidation in the command processor. - size_t texture_binding_layout_uid = kLayoutUIDEmpty; - // Use sampler count for the bindful case because it's the only thing that - // must be the same for layouts to be compatible in this case - // (instruction-specified parameters are used as overrides for actual - // samplers). - static_assert( - kLayoutUIDEmpty == 0, - "Empty layout UID is assumed to be 0 because for bindful samplers, the " - "UID is their count"); - size_t sampler_binding_layout_uid = bindless_resources_used_ - ? kLayoutUIDEmpty - : size_t(sampler_binding_count); - if (texture_binding_count || bindless_sampler_count) { - std::lock_guard layouts_mutex_(layouts_mutex_); + // Set up texture and sampler binding layouts. 
+ if (shader.EnterBindingLayoutUserUIDSetup()) { + uint32_t texture_binding_count; + const D3D12Shader::TextureBinding* texture_bindings = + shader.GetTextureBindings(texture_binding_count); + uint32_t sampler_binding_count; + const D3D12Shader::SamplerBinding* sampler_bindings = + shader.GetSamplerBindings(sampler_binding_count); + assert_false(bindless_resources_used_ && + texture_binding_count + sampler_binding_count > + D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 4); + size_t texture_binding_layout_bytes = + texture_binding_count * sizeof(*texture_bindings); + uint64_t texture_binding_layout_hash = 0; if (texture_binding_count) { - auto found_range = - texture_binding_layout_map_.equal_range(texture_binding_layout_hash); - for (auto it = found_range.first; it != found_range.second; ++it) { - if (it->second.vector_span_length == texture_binding_count && - !std::memcmp( - texture_binding_layouts_.data() + it->second.vector_span_offset, - texture_bindings, texture_binding_layout_bytes)) { - texture_binding_layout_uid = it->second.uid; - break; - } - } - if (texture_binding_layout_uid == kLayoutUIDEmpty) { - static_assert( - kLayoutUIDEmpty == 0, - "Layout UID is size + 1 because it's assumed that 0 is the UID for " - "an empty layout"); - texture_binding_layout_uid = texture_binding_layout_map_.size() + 1; - LayoutUID new_uid; - new_uid.uid = texture_binding_layout_uid; - new_uid.vector_span_offset = texture_binding_layouts_.size(); - new_uid.vector_span_length = texture_binding_count; - texture_binding_layouts_.resize(new_uid.vector_span_offset + - texture_binding_count); - std::memcpy( - texture_binding_layouts_.data() + new_uid.vector_span_offset, - texture_bindings, texture_binding_layout_bytes); - texture_binding_layout_map_.emplace(texture_binding_layout_hash, - new_uid); - } + texture_binding_layout_hash = + XXH64(texture_bindings, texture_binding_layout_bytes, 0); } + uint32_t bindless_sampler_count = + bindless_resources_used_ ? 
sampler_binding_count : 0; + uint64_t bindless_sampler_layout_hash = 0; if (bindless_sampler_count) { - auto found_range = - bindless_sampler_layout_map_.equal_range(sampler_binding_layout_uid); - for (auto it = found_range.first; it != found_range.second; ++it) { - if (it->second.vector_span_length != bindless_sampler_count) { - continue; - } - sampler_binding_layout_uid = it->second.uid; - const uint32_t* vector_bindless_sampler_layout = - bindless_sampler_layouts_.data() + it->second.vector_span_offset; - for (uint32_t i = 0; i < bindless_sampler_count; ++i) { - if (vector_bindless_sampler_layout[i] != - sampler_bindings[i].bindless_descriptor_index) { - sampler_binding_layout_uid = kLayoutUIDEmpty; + XXH64_state_t hash_state; + XXH64_reset(&hash_state, 0); + for (uint32_t i = 0; i < bindless_sampler_count; ++i) { + XXH64_update(&hash_state, + &sampler_bindings[i].bindless_descriptor_index, + sizeof(sampler_bindings[i].bindless_descriptor_index)); + } + bindless_sampler_layout_hash = XXH64_digest(&hash_state); + } + // Obtain the unique IDs of binding layouts if there are any texture + // bindings or bindless samplers, for invalidation in the command processor. + size_t texture_binding_layout_uid = kLayoutUIDEmpty; + // Use sampler count for the bindful case because it's the only thing that + // must be the same for layouts to be compatible in this case + // (instruction-specified parameters are used as overrides for actual + // samplers). + static_assert( + kLayoutUIDEmpty == 0, + "Empty layout UID is assumed to be 0 because for bindful samplers, the " + "UID is their count"); + size_t sampler_binding_layout_uid = bindless_resources_used_ + ? 
kLayoutUIDEmpty + : size_t(sampler_binding_count); + if (texture_binding_count || bindless_sampler_count) { + std::lock_guard layouts_mutex_(layouts_mutex_); + if (texture_binding_count) { + auto found_range = texture_binding_layout_map_.equal_range( + texture_binding_layout_hash); + for (auto it = found_range.first; it != found_range.second; ++it) { + if (it->second.vector_span_length == texture_binding_count && + !std::memcmp(texture_binding_layouts_.data() + + it->second.vector_span_offset, + texture_bindings, texture_binding_layout_bytes)) { + texture_binding_layout_uid = it->second.uid; break; } } - if (sampler_binding_layout_uid != kLayoutUIDEmpty) { - break; + if (texture_binding_layout_uid == kLayoutUIDEmpty) { + static_assert( + kLayoutUIDEmpty == 0, + "Layout UID is size + 1 because it's assumed that 0 is the UID " + "for an empty layout"); + texture_binding_layout_uid = texture_binding_layout_map_.size() + 1; + LayoutUID new_uid; + new_uid.uid = texture_binding_layout_uid; + new_uid.vector_span_offset = texture_binding_layouts_.size(); + new_uid.vector_span_length = texture_binding_count; + texture_binding_layouts_.resize(new_uid.vector_span_offset + + texture_binding_count); + std::memcpy( + texture_binding_layouts_.data() + new_uid.vector_span_offset, + texture_bindings, texture_binding_layout_bytes); + texture_binding_layout_map_.emplace(texture_binding_layout_hash, + new_uid); } } - if (sampler_binding_layout_uid == kLayoutUIDEmpty) { - sampler_binding_layout_uid = bindless_sampler_layout_map_.size(); - LayoutUID new_uid; - static_assert( - kLayoutUIDEmpty == 0, - "Layout UID is size + 1 because it's assumed that 0 is the UID for " - "an empty layout"); - new_uid.uid = sampler_binding_layout_uid + 1; - new_uid.vector_span_offset = bindless_sampler_layouts_.size(); - new_uid.vector_span_length = sampler_binding_count; - bindless_sampler_layouts_.resize(new_uid.vector_span_offset + - sampler_binding_count); - uint32_t* vector_bindless_sampler_layout = 
- bindless_sampler_layouts_.data() + new_uid.vector_span_offset; - for (uint32_t i = 0; i < bindless_sampler_count; ++i) { - vector_bindless_sampler_layout[i] = - sampler_bindings[i].bindless_descriptor_index; + if (bindless_sampler_count) { + auto found_range = bindless_sampler_layout_map_.equal_range( + sampler_binding_layout_uid); + for (auto it = found_range.first; it != found_range.second; ++it) { + if (it->second.vector_span_length != bindless_sampler_count) { + continue; + } + sampler_binding_layout_uid = it->second.uid; + const uint32_t* vector_bindless_sampler_layout = + bindless_sampler_layouts_.data() + it->second.vector_span_offset; + for (uint32_t i = 0; i < bindless_sampler_count; ++i) { + if (vector_bindless_sampler_layout[i] != + sampler_bindings[i].bindless_descriptor_index) { + sampler_binding_layout_uid = kLayoutUIDEmpty; + break; + } + } + if (sampler_binding_layout_uid != kLayoutUIDEmpty) { + break; + } + } + if (sampler_binding_layout_uid == kLayoutUIDEmpty) { + sampler_binding_layout_uid = bindless_sampler_layout_map_.size(); + LayoutUID new_uid; + static_assert( + kLayoutUIDEmpty == 0, + "Layout UID is size + 1 because it's assumed that 0 is the UID " + "for an empty layout"); + new_uid.uid = sampler_binding_layout_uid + 1; + new_uid.vector_span_offset = bindless_sampler_layouts_.size(); + new_uid.vector_span_length = sampler_binding_count; + bindless_sampler_layouts_.resize(new_uid.vector_span_offset + + sampler_binding_count); + uint32_t* vector_bindless_sampler_layout = + bindless_sampler_layouts_.data() + new_uid.vector_span_offset; + for (uint32_t i = 0; i < bindless_sampler_count; ++i) { + vector_bindless_sampler_layout[i] = + sampler_bindings[i].bindless_descriptor_index; + } + bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash, + new_uid); } - bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash, - new_uid); } } - } - shader.SetTextureBindingLayoutUserUID(texture_binding_layout_uid); - 
shader.SetSamplerBindingLayoutUserUID(sampler_binding_layout_uid); - - // Create a version of the shader with early depth/stencil forced by Xenia - // itself when it's safe to do so or when EARLY_Z_ENABLE is set in - // RB_DEPTHCONTROL. - if (shader.type() == xenos::ShaderType::kPixel && !edram_rov_used_ && - !shader.writes_depth()) { - shader.SetForcedEarlyZShaderObject( - std::move(DxbcShaderTranslator::ForceEarlyDepthStencil( - shader.translated_binary().data()))); + shader.SetTextureBindingLayoutUserUID(texture_binding_layout_uid); + shader.SetSamplerBindingLayoutUserUID(sampler_binding_layout_uid); } // Disassemble the shader for dumping. auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); if (cvars::d3d12_dxbc_disasm_dxilconv) { - shader.DisassembleDxbc(provider, cvars::d3d12_dxbc_disasm, dxbc_converter, - dxc_utils, dxc_compiler); + translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm, + dxbc_converter, dxc_utils, dxc_compiler); } else { - shader.DisassembleDxbc(provider, cvars::d3d12_dxbc_disasm); + translation.DisassembleDxbcAndDxil(provider, cvars::d3d12_dxbc_disasm); } // Dump shader files if desired. if (!cvars::dump_shaders.empty()) { - shader.Dump(cvars::dump_shaders, - (shader.type() == xenos::ShaderType::kPixel) - ? (edram_rov_used_ ? "d3d12_rov" : "d3d12_rtv") - : "d3d12"); + translation.Dump(cvars::dump_shaders, + (shader.type() == xenos::ShaderType::kPixel) + ? (edram_rov_used_ ? 
"d3d12_rov" : "d3d12_rtv") + : "d3d12"); } - return shader.is_valid(); + return translation.is_valid(); } bool PipelineCache::GetCurrentStateDescription( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader, xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, - bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], PipelineRuntimeDescription& runtime_description_out) { PipelineDescription& description_out = runtime_description_out.description; @@ -1212,19 +1329,30 @@ bool PipelineCache::GetCurrentStateDescription( // Initialize all unused fields to zero for comparison/hashing. std::memset(&runtime_description_out, 0, sizeof(runtime_description_out)); + bool tessellated = + DxbcShaderTranslator::Modification(vertex_shader->modification()) + .host_vertex_shader_type != Shader::HostVertexShaderType::kVertex; + // Root signature. - runtime_description_out.root_signature = - command_processor_.GetRootSignature(vertex_shader, pixel_shader); + runtime_description_out.root_signature = command_processor_.GetRootSignature( + static_cast(&vertex_shader->shader()), + pixel_shader ? static_cast(&pixel_shader->shader()) + : nullptr, + tessellated); if (runtime_description_out.root_signature == nullptr) { return false; } // Shaders. 
runtime_description_out.vertex_shader = vertex_shader; - description_out.vertex_shader_hash = vertex_shader->ucode_data_hash(); + description_out.vertex_shader_hash = + vertex_shader->shader().ucode_data_hash(); + description_out.vertex_shader_modification = vertex_shader->modification(); if (pixel_shader) { runtime_description_out.pixel_shader = pixel_shader; - description_out.pixel_shader_hash = pixel_shader->ucode_data_hash(); + description_out.pixel_shader_hash = + pixel_shader->shader().ucode_data_hash(); + description_out.pixel_shader_modification = pixel_shader->modification(); } // Index buffer strip cut value. @@ -1239,13 +1367,10 @@ bool PipelineCache::GetCurrentStateDescription( } // Host vertex shader type and primitive topology. - Shader::HostVertexShaderType host_vertex_shader_type = - GetHostVertexShaderTypeIfValid(); - if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) { - return false; - } - description_out.host_vertex_shader_type = host_vertex_shader_type; - if (host_vertex_shader_type == Shader::HostVertexShaderType::kVertex) { + if (tessellated) { + description_out.primitive_topology_type_or_tessellation_mode = + uint32_t(regs.Get().tess_mode); + } else { switch (primitive_type) { case xenos::PrimitiveType::kPointList: description_out.primitive_topology_type_or_tessellation_mode = @@ -1280,14 +1405,10 @@ bool PipelineCache::GetCurrentStateDescription( description_out.geometry_shader = PipelineGeometryShader::kNone; break; } - } else { - description_out.primitive_topology_type_or_tessellation_mode = - uint32_t(regs.Get().tess_mode); } - bool primitive_polygonal = xenos::IsPrimitivePolygonal( - host_vertex_shader_type != Shader::HostVertexShaderType::kVertex, - primitive_type); + bool primitive_polygonal = + xenos::IsPrimitivePolygonal(tessellated, primitive_type); // Rasterizer state. 
// Because Direct3D 12 doesn't support per-side fill mode and depth bias, the @@ -1386,8 +1507,7 @@ bool PipelineCache::GetCurrentStateDescription( description_out.depth_bias_slope_scaled = poly_offset_scale * (1.0f / 16.0f); } - if (cvars::d3d12_tessellation_wireframe && - host_vertex_shader_type != Shader::HostVertexShaderType::kVertex) { + if (tessellated && cvars::d3d12_tessellation_wireframe) { description_out.fill_mode_wireframe = 1; } description_out.depth_clip = !regs.Get().clip_disable; @@ -1453,13 +1573,11 @@ bool PipelineCache::GetCurrentStateDescription( } else { description_out.depth_func = xenos::CompareFunction::kAlways; } - if (early_z) { - description_out.force_early_z = 1; - } // Render targets and blending state. 32 because of 0x1F mask, for safety // (all unknown to zero). - uint32_t color_mask = command_processor_.GetCurrentColorMask(pixel_shader); + uint32_t color_mask = command_processor_.GetCurrentColorMask( + pixel_shader ? &pixel_shader->shader() : nullptr); static const PipelineBlendFactor kBlendFactorMap[32] = { /* 0 */ PipelineBlendFactor::kZero, /* 1 */ PipelineBlendFactor::kOne, @@ -1550,11 +1668,11 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( if (runtime_description.pixel_shader != nullptr) { XELOGGPU("Creating graphics pipeline with VS {:016X}, PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash(), + runtime_description.pixel_shader->shader().ucode_data_hash()); } else { XELOGGPU("Creating graphics pipeline with VS {:016X}", - runtime_description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash()); } D3D12_GRAPHICS_PIPELINE_STATE_DESC state_desc; @@ -1580,21 +1698,14 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( // Primitive topology, vertex, hull, domain and geometry shaders. 
if (!runtime_description.vertex_shader->is_translated()) { XELOGE("Vertex shader {:016X} not translated", - runtime_description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash()); assert_always(); return nullptr; } Shader::HostVertexShaderType host_vertex_shader_type = - description.host_vertex_shader_type; - if (runtime_description.vertex_shader->host_vertex_shader_type() != - host_vertex_shader_type) { - XELOGE( - "Vertex shader {:016X} translated into the wrong host shader " - "type", - runtime_description.vertex_shader->ucode_data_hash()); - assert_always(); - return nullptr; - } + DxbcShaderTranslator::Modification( + runtime_description.vertex_shader->modification()) + .host_vertex_shader_type; if (host_vertex_shader_type == Shader::HostVertexShaderType::kVertex) { state_desc.VS.pShaderBytecode = runtime_description.vertex_shader->translated_binary().data(); @@ -1704,24 +1815,34 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( if (runtime_description.pixel_shader != nullptr) { if (!runtime_description.pixel_shader->is_translated()) { XELOGE("Pixel shader {:016X} not translated", - runtime_description.pixel_shader->ucode_data_hash()); + runtime_description.pixel_shader->shader().ucode_data_hash()); assert_always(); return nullptr; } - const auto& forced_early_z_shader = - runtime_description.pixel_shader->GetForcedEarlyZShaderObject(); - if (description.force_early_z && forced_early_z_shader.size() != 0) { - state_desc.PS.pShaderBytecode = forced_early_z_shader.data(); - state_desc.PS.BytecodeLength = forced_early_z_shader.size(); - } else { - state_desc.PS.pShaderBytecode = - runtime_description.pixel_shader->translated_binary().data(); - state_desc.PS.BytecodeLength = - runtime_description.pixel_shader->translated_binary().size(); - } + state_desc.PS.pShaderBytecode = + runtime_description.pixel_shader->translated_binary().data(); + state_desc.PS.BytecodeLength = + 
runtime_description.pixel_shader->translated_binary().size(); } else if (edram_rov_used_) { state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data(); state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size(); + } else { + if ((description.depth_func != xenos::CompareFunction::kAlways || + description.depth_write) && + description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { + switch (depth_float24_conversion_) { + case flags::DepthFloat24Conversion::kOnOutputTruncating: + state_desc.PS.pShaderBytecode = float24_truncate_ps; + state_desc.PS.BytecodeLength = sizeof(float24_truncate_ps); + break; + case flags::DepthFloat24Conversion::kOnOutputRounding: + state_desc.PS.pShaderBytecode = float24_round_ps; + state_desc.PS.BytecodeLength = sizeof(float24_round_ps); + break; + default: + break; + } + } } // Rasterizer state. @@ -1810,9 +1931,6 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( state_desc.DSVFormat = RenderTargetCache::GetDepthDXGIFormat(description.depth_format); } - // TODO(Triang3l): EARLY_Z_ENABLE (needs to be enabled in shaders, but alpha - // test is dynamic - should be enabled anyway if there's no alpha test, - // discarding and depth output). // Render targets and blending. 
state_desc.BlendState.IndependentBlendEnable = TRUE; @@ -1874,22 +1992,24 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( IID_PPV_ARGS(&state)))) { if (runtime_description.pixel_shader != nullptr) { XELOGE("Failed to create graphics pipeline with VS {:016X}, PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash(), + runtime_description.pixel_shader->shader().ucode_data_hash()); } else { XELOGE("Failed to create graphics pipeline with VS {:016X}", - runtime_description.vertex_shader->ucode_data_hash()); + runtime_description.vertex_shader->shader().ucode_data_hash()); } return nullptr; } std::wstring name; if (runtime_description.pixel_shader != nullptr) { - name = fmt::format(L"VS {:016X}, PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + name = fmt::format( + L"VS {:016X}, PS {:016X}", + runtime_description.vertex_shader->shader().ucode_data_hash(), + runtime_description.pixel_shader->shader().ucode_data_hash()); } else { - name = fmt::format(L"VS {:016X}", - runtime_description.vertex_shader->ucode_data_hash()); + name = fmt::format( + L"VS {:016X}", + runtime_description.vertex_shader->shader().ucode_data_hash()); } state->SetName(name.c_str()); return state; @@ -1954,7 +2074,6 @@ void PipelineCache::StorageWriteThread() { shader_header.ucode_data_hash = shader->ucode_data_hash(); shader_header.ucode_dword_count = shader->ucode_dword_count(); shader_header.type = shader->type(); - shader_header.host_vertex_shader_type = shader->host_vertex_shader_type(); shader_header.sq_program_cntl = shader_pair.second; assert_not_null(shader_storage_file_); fwrite(&shader_header, sizeof(shader_header), 1, shader_storage_file_); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 8159416d0..d09d373b8 100644 --- 
a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -27,6 +27,7 @@ #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/d3d12/render_target_cache.h" #include "xenia/gpu/dxbc_shader_translator.h" +#include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/xenos.h" #include "xenia/ui/d3d12/d3d12_api.h" @@ -43,36 +44,39 @@ class PipelineCache { PipelineCache(D3D12CommandProcessor& command_processor, const RegisterFile& register_file, bool bindless_resources_used, - bool edram_rov_used, uint32_t resolution_scale); + bool edram_rov_used, + flags::DepthFloat24Conversion depth_float24_conversion, + uint32_t resolution_scale); ~PipelineCache(); bool Initialize(); void Shutdown(); void ClearCache(bool shutting_down = false); - void InitializeShaderStorage(const std::filesystem::path& storage_root, + void InitializeShaderStorage(const std::filesystem::path& cache_root, uint32_t title_id, bool blocking); void ShutdownShaderStorage(); void EndSubmission(); bool IsCreatingPipelines(); - D3D12Shader* LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, + D3D12Shader* LoadShader(xenos::ShaderType shader_type, const uint32_t* host_address, uint32_t dword_count); - // Returns the host vertex shader type for the current draw if it's valid and - // supported, or Shader::HostVertexShaderType(-1) if not. - Shader::HostVertexShaderType GetHostVertexShaderTypeIfValid() const; + // Retrieves the shader modifications for the current state, and returns + // whether they are valid. + bool GetCurrentShaderModifications( + DxbcShaderTranslator::Modification& vertex_shader_modification_out, + DxbcShaderTranslator::Modification& pixel_shader_modification_out) const; // Translates shaders if needed, also making shader info up to date. 
- bool EnsureShadersTranslated( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, - Shader::HostVertexShaderType host_vertex_shader_type); + bool EnsureShadersTranslated(D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader); bool ConfigurePipeline( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader, xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, - bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], void** pipeline_handle_out, ID3D12RootSignature** root_signature_out); @@ -86,13 +90,12 @@ class PipelineCache { XEPACKEDSTRUCT(ShaderStoredHeader, { uint64_t ucode_data_hash; - uint32_t ucode_dword_count : 16; + uint32_t ucode_dword_count : 31; xenos::ShaderType type : 1; - Shader::HostVertexShaderType host_vertex_shader_type : 3; reg::SQ_PROGRAM_CNTL sq_program_cntl; - static constexpr uint32_t kVersion = 0x20200405; + static constexpr uint32_t kVersion = 0x20201129; }); // Update PipelineDescription::kVersion if any of the Pipeline* enums are @@ -170,28 +173,28 @@ class PipelineCache { uint64_t vertex_shader_hash; // 0 if drawing without a pixel shader. uint64_t pixel_shader_hash; + uint32_t vertex_shader_modification; + uint32_t pixel_shader_modification; int32_t depth_bias; float depth_bias_slope_scaled; PipelineStripCutIndex strip_cut_index : 2; // 2 - Shader::HostVertexShaderType host_vertex_shader_type : 3; // 5 // PipelinePrimitiveTopologyType for a vertex shader. // xenos::TessellationMode for a domain shader. - uint32_t primitive_topology_type_or_tessellation_mode : 2; // 7 + uint32_t primitive_topology_type_or_tessellation_mode : 2; // 4 // Zero for non-kVertex host_vertex_shader_type. 
- PipelineGeometryShader geometry_shader : 2; // 9 - uint32_t fill_mode_wireframe : 1; // 10 - PipelineCullMode cull_mode : 2; // 12 - uint32_t front_counter_clockwise : 1; // 13 - uint32_t depth_clip : 1; // 14 - uint32_t rov_msaa : 1; // 15 - xenos::DepthRenderTargetFormat depth_format : 1; // 16 - xenos::CompareFunction depth_func : 3; // 19 - uint32_t depth_write : 1; // 20 - uint32_t stencil_enable : 1; // 21 - uint32_t stencil_read_mask : 8; // 29 - uint32_t force_early_z : 1; // 30 + PipelineGeometryShader geometry_shader : 2; // 6 + uint32_t fill_mode_wireframe : 1; // 7 + PipelineCullMode cull_mode : 2; // 9 + uint32_t front_counter_clockwise : 1; // 10 + uint32_t depth_clip : 1; // 11 + uint32_t rov_msaa : 1; // 12 + xenos::DepthRenderTargetFormat depth_format : 1; // 13 + xenos::CompareFunction depth_func : 3; // 16 + uint32_t depth_write : 1; // 17 + uint32_t stencil_enable : 1; // 18 + uint32_t stencil_read_mask : 8; // 26 uint32_t stencil_write_mask : 8; // 8 xenos::StencilOp stencil_front_fail_op : 3; // 11 @@ -205,7 +208,7 @@ class PipelineCache { PipelineRenderTarget render_targets[4]; - static constexpr uint32_t kVersion = 0x20200405; + static constexpr uint32_t kVersion = 0x20201202; }); XEPACKEDSTRUCT(PipelineStoredDescription, { @@ -215,24 +218,31 @@ class PipelineCache { struct PipelineRuntimeDescription { ID3D12RootSignature* root_signature; - D3D12Shader* vertex_shader; - D3D12Shader* pixel_shader; + D3D12Shader::D3D12Translation* vertex_shader; + D3D12Shader::D3D12Translation* pixel_shader; PipelineDescription description; }; + // Returns the host vertex shader type for the current draw if it's valid and + // supported, or Shader::HostVertexShaderType(-1) if not. + Shader::HostVertexShaderType GetCurrentHostVertexShaderTypeIfValid() const; + + D3D12Shader* LoadShader(xenos::ShaderType shader_type, + const uint32_t* host_address, uint32_t dword_count, + uint64_t data_hash); + // Can be called from multiple threads. 
- bool TranslateShader(DxbcShaderTranslator& translator, D3D12Shader& shader, + bool TranslateShader(DxbcShaderTranslator& translator, + D3D12Shader::D3D12Translation& translation, reg::SQ_PROGRAM_CNTL cntl, IDxbcConverter* dxbc_converter = nullptr, IDxcUtils* dxc_utils = nullptr, - IDxcCompiler* dxc_compiler = nullptr, - Shader::HostVertexShaderType host_vertex_shader_type = - Shader::HostVertexShaderType::kVertex); + IDxcCompiler* dxc_compiler = nullptr); bool GetCurrentStateDescription( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, + D3D12Shader::D3D12Translation* vertex_shader, + D3D12Shader::D3D12Translation* pixel_shader, xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, - bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], PipelineRuntimeDescription& runtime_description_out); @@ -243,6 +253,8 @@ class PipelineCache { const RegisterFile& register_file_; bool bindless_resources_used_; bool edram_rov_used_; + // 20e4 depth conversion mode to use for non-ROV output. + flags::DepthFloat24Conversion depth_float24_conversion_; uint32_t resolution_scale_; // Reusable shader translator. @@ -300,11 +312,14 @@ class PipelineCache { Pipeline* current_pipeline_ = nullptr; // Currently open shader storage path. - std::filesystem::path shader_storage_root_; + std::filesystem::path shader_storage_cache_root_; uint32_t shader_storage_title_id_ = 0; // Shader storage output stream, for preload in the next emulator runs. FILE* shader_storage_file_ = nullptr; + // For only writing shaders to the currently open storage once, incremented + // when switching the storage. + uint32_t shader_storage_index_ = 0; bool shader_storage_file_flush_needed_ = false; // Pipeline storage output stream, for preload in the next emulator runs. 
diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 66ef2ba9f..8669d58a3 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -40,11 +40,13 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_32bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_64bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_7e3_cs.h" +#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_unorm_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_32bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_64bpp_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_7e3_cs.h" +#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_unorm_cs.h" #include "xenia/gpu/d3d12/shaders/dxbc/resolve_clear_32bpp_2xres_cs.h" @@ -87,6 +89,12 @@ const RenderTargetCache::EdramLoadStoreModeInfo {edram_load_depth_float_cs, sizeof(edram_load_depth_float_cs), L"EDRAM Load Float Depth", edram_store_depth_float_cs, sizeof(edram_store_depth_float_cs), L"EDRAM Store Float Depth"}, + {edram_load_depth_float24and32_cs, + sizeof(edram_load_depth_float24and32_cs), + L"EDRAM Load 24-bit & 32-bit Float Depth", + edram_store_depth_float24and32_cs, + sizeof(edram_store_depth_float24and32_cs), + L"EDRAM Store 24-bit & 32-bit Float Depth"}, }; const std::pair @@ -126,6 +134,8 @@ RenderTargetCache::RenderTargetCache(D3D12CommandProcessor& command_processor, RenderTargetCache::~RenderTargetCache() { Shutdown(); } bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { + depth_float24_conversion_ = flags::GetDepthFloat24Conversion(); + // 
EDRAM buffer size depends on this. resolution_scale_2x_ = texture_cache.IsResolutionScale2X(); assert_false(resolution_scale_2x_ && !edram_rov_used_); @@ -420,7 +430,8 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { return false; } resolve_clear_64bpp_pipeline_->SetName(L"Resolve Clear 64bpp"); - if (!edram_rov_used_) { + if (!edram_rov_used_ && + depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnCopy) { assert_false(resolution_scale_2x_); resolve_clear_depth_24_32_pipeline_ = ui::d3d12::util::CreateComputePipeline( @@ -434,7 +445,7 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { Shutdown(); return false; } - resolve_clear_64bpp_pipeline_->SetName( + resolve_clear_depth_24_32_pipeline_->SetName( L"Resolve Clear 24-bit & 32-bit Depth"); } @@ -1266,10 +1277,12 @@ bool RenderTargetCache::Resolve(const Memory& memory, if (clear_depth) { // Also clear the host 32-bit floating-point depth used for loaing and // storing 24-bit floating-point depth at full precision. 
- bool clear_float32_depth = - !edram_rov_used_ && xenos::DepthRenderTargetFormat( - resolve_info.depth_edram_info.format) == - xenos::DepthRenderTargetFormat::kD24FS8; + bool clear_float32_depth = !edram_rov_used_ && + depth_float24_conversion_ == + flags::DepthFloat24Conversion::kOnCopy && + xenos::DepthRenderTargetFormat( + resolve_info.depth_edram_info.format) == + xenos::DepthRenderTargetFormat::kD24FS8; draw_util::ResolveClearShaderConstants depth_clear_constants; resolve_info.GetDepthClearShaderConstants(clear_float32_depth, depth_clear_constants); @@ -1558,7 +1571,8 @@ void RenderTargetCache::RestoreEdramSnapshot(const void* snapshot) { uint32_t RenderTargetCache::GetEdramBufferSize() const { uint32_t size = xenos::kEdramSizeBytes; - if (!edram_rov_used_) { + if (!edram_rov_used_ && + depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnCopy) { // Two 10 MB pages, one containing color and integer depth data, another // with 32-bit float depth when 20e4 depth is used to allow for multipass // drawing without precision loss in case of EDRAM store/load. @@ -1831,12 +1845,15 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget( } RenderTargetCache::EdramLoadStoreMode RenderTargetCache::GetLoadStoreMode( - bool is_depth, uint32_t format) { + bool is_depth, uint32_t format) const { if (is_depth) { - return xenos::DepthRenderTargetFormat(format) == - xenos::DepthRenderTargetFormat::kD24FS8 - ? EdramLoadStoreMode::kDepthFloat - : EdramLoadStoreMode::kDepthUnorm; + if (xenos::DepthRenderTargetFormat(format) == + xenos::DepthRenderTargetFormat::kD24FS8) { + return depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnCopy + ? 
EdramLoadStoreMode::kDepthFloat24And32 + : EdramLoadStoreMode::kDepthFloat; + } + return EdramLoadStoreMode::kDepthUnorm; } xenos::ColorRenderTargetFormat color_format = xenos::ColorRenderTargetFormat(format); diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index 5069b3cb7..2f71c13c8 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -18,6 +18,7 @@ #include "xenia/gpu/d3d12/d3d12_shared_memory.h" #include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/draw_util.h" +#include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/trace_writer.h" #include "xenia/gpu/xenos.h" @@ -259,6 +260,10 @@ class RenderTargetCache { void Shutdown(); void ClearCache(); + flags::DepthFloat24Conversion depth_float24_conversion() const { + return depth_float24_conversion_; + } + void CompletedSubmissionUpdated(); void BeginSubmission(); void EndFrame(); @@ -318,6 +323,7 @@ class RenderTargetCache { kColor7e3, kDepthUnorm, kDepthFloat, + kDepthFloat24And32, kCount }; @@ -424,7 +430,7 @@ class RenderTargetCache { uint32_t instance); #endif - static EdramLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format); + EdramLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format) const; // Must be in a frame to call. Stores the dirty areas of the currently bound // render targets and marks them as clean. @@ -442,6 +448,9 @@ class RenderTargetCache { bool bindless_resources_used_; bool edram_rov_used_; + // 20e4 depth conversion mode to use for non-ROV output. + flags::DepthFloat24Conversion depth_float24_conversion_; + // Whether 1 guest pixel is rendered as 2x2 host pixels (currently only // supported with ROV). 
bool resolution_scale_2x_ = false; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.cso new file mode 100644 index 0000000000000000000000000000000000000000..c389242a1ee9667b3dfdf6346a236e612ce4770b GIT binary patch literal 3500 zcmbtWO=w(I6h686vGbbRm{ychbhK4c5lNz;i0Ex68C!^L?Zgll@|e!NkdaAVocE+% z)j?4~#FZ;W@E;*A-HSV2*_F6(AtKV13q^1@e&3yU-rRYe7{LSQ-FMIV?sx7v_q?B_ z^)r<}Kl$`%aB=P23#Ing4}bT~dt)N<1qAB4`perajBdfkW+H-QVx z@&t5jvR2`uFW@_JWa&QI?u5=|dJ^L_l=2ny7U&d|JSzS@ZQnx=K=;GlAb5n3&*!`v zg;BFp-wNAty%{EH+=;>@e!0_#WR$BJC8?}O%UrU86*VrX#M?<)-%b$7;$4c8__+Ru=81YAIEwWBhQVMJ_VjxHxmhfAG7^lgKzqBW}oT~;&Y7w#z_~w zNK(k%GBJKE!(RaBy#>wyn2WH#I>HXl;N#4`c8EJ0#F%q4kss0XccD(o`*?=enDl8L zY6~*V?MU@Gk&>QeQ_L~cPKeuq9D?%V4D!9}o{3=(_<5;MdEOE^sQV?JXTuS%S1IPm7>d>Px@J^-#e9aE{DHj4 zt?{eZ%9!;9bq(m}UaPnEamMz0pZ%{oDi6jN`A{?lXQN^u&%=x{4!^OkbSz)4v}qh2 zbuYR?Ahg}Hjc)M{Dofh^-(-4#kJ&6Jo-<31@A%P z`8@)zdm5u+-5Fx8Be+W**1eO-t@)6H=eJjU@Qlo%*E@{w$Urp?^Y;05&ix^dXzG~eR$I)0^L>}{1`7{PcWBdOCyjNxKFh9j9FXz~|tG)WP4>>m$mx6%H^|Nx& zt!KbBct_>+GS+=-U%_$b%Gyv+SGOnLtBoV;w3>1FK4V4x?)9GX^=-7Fo*Vss4S7#n zQ8bpZ^mK<*dp+wb0>s#xc}Mjh*AV*>Mqf1-J9zH&`J+v-I_lmK zW6vHa^7^&n{lD>+3%uIXSjN6{wdB3-!`PemeWFmKLchD5@cfk)@m!IA3Rm(F$GHl4 z0>!LN8u{V%d3Cw5nyp7IaxS&K;eBTb{+!R=&-di(`K~YIHuN2IXJGLRVg;X-Ux23= ziwi{G^&E}G1;TrxSmQPHA32Q0r64>z=0*7}=#Shtg}&MBUcZC-U1AQiUJmp#2lYRR oqo}Q(UVRs~3N-sYa`na(|4AaZL0xV%>>>UcD1^QIe}tU>0517<`~Uy| literal 0 HcmV?d00001 diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.h b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.h new file mode 100644 index 000000000..34f44b18c --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.h @@ -0,0 +1,296 @@ +// generated from `xb buildhlsl` +// source: edram_load_depth_float24and32.cs.hlsl +const uint8_t edram_load_depth_float24and32_cs[] = { + 0x44, 0x58, 0x42, 0x43, 0xF3, 0xA3, 0xA4, 0x14, 0x0A, 0x50, 0x56, 
0x49, + 0x5D, 0x09, 0x6C, 0xBF, 0x33, 0xC9, 0xC1, 0x9A, 0x01, 0x00, 0x00, 0x00, + 0xAC, 0x0D, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00, + 0x10, 0x0D, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00, + 0xA8, 0x02, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xB4, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xCF, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x6C, 0x6F, 0x61, + 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x73, 0x6F, 0x75, 0x72, + 0x63, 0x65, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, + 0x6C, 0x6F, 0x61, 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x64, + 0x65, 0x73, 0x74, 0x00, 0x58, 0x65, 0x45, 0x64, 0x72, 0x61, 0x6D, 0x4C, + 0x6F, 0x61, 0x64, 0x53, 0x74, 0x6F, 0x72, 0x65, 0x43, 0x6F, 0x6E, 0x73, + 0x74, 0x61, 0x6E, 0x74, 0x73, 0x00, 0xAB, 0xAB, 0xE8, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x1C, 0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 
0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0x30, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x4E, 0x02, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x69, 0x02, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0x83, 0x02, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, + 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, 0x72, 0x5F, 0x64, 0x65, + 0x70, 0x74, 0x68, 0x5F, 0x6F, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x64, + 0x77, 0x6F, 0x72, 0x64, 0x00, 0xAB, 0xAB, 0xAB, 0x00, 0x00, 0x13, 0x00, + 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, + 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, + 0x72, 0x5F, 0x64, 0x65, 0x70, 0x74, 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, + 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 
0x72, + 0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, 0x5F, 0x6F, 0x66, + 0x66, 0x73, 0x65, 0x74, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, + 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, + 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, + 0x72, 0x61, 0x6D, 0x5F, 0x62, 0x61, 0x73, 0x65, 0x5F, 0x73, 0x61, 0x6D, + 0x70, 0x6C, 0x65, 0x73, 0x5F, 0x32, 0x78, 0x5F, 0x64, 0x65, 0x70, 0x74, + 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x4D, 0x69, 0x63, 0x72, + 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, 0x29, 0x20, 0x48, 0x4C, + 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65, 0x72, 0x20, 0x43, 0x6F, + 0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31, 0x30, 0x2E, 0x31, 0x00, + 0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58, + 0xDC, 0x09, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x77, 0x02, 0x00, 0x00, + 0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06, + 0x46, 0x7E, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x06, + 0x46, 0xEE, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x02, + 0x32, 0x10, 0x02, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x20, 0x02, 0x00, + 0x5F, 0x00, 0x00, 0x02, 0x32, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02, + 0x07, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x06, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 
0x0C, + 0x62, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, + 0xFF, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x04, 0x03, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x06, + 0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, + 0xA2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD8, 0xFF, 0xFF, 0xFF, + 0x1E, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, + 0x55, 0x00, 0x00, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x08, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x10, 0x02, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x10, 0x02, 0x00, 0x26, 0x00, 0x00, 0x07, + 0x00, 0xD0, 0x00, 0x00, 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x40, 0x01, 0x00, 
0x00, + 0x23, 0x00, 0x00, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, + 0x0A, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x7E, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x00, 0x00, 0xA0, 0x00, 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x7E, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, + 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x05, 0xF2, 0x00, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, 0xF5, 0xFF, 0xFF, 0xFF, + 0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, 0xF2, 0x00, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x37, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 
0x00, + 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, + 0xFF, 0xFF, 0x0F, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0x37, 0x00, 0x00, 0x09, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x38, + 0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x37, 0x00, 0x00, 0x0C, 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, + 0x41, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 
0x7F, + 0xFF, 0xFF, 0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, + 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, + 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, + 0x41, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x06, 0x00, 0x00, 
0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, + 0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, + 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09, + 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x05, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 
0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x09, 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x02, 0x00, 0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x0A, 0x32, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x56, 0x05, 0x02, 0x00, 0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x86, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, 0xF2, 0xE0, 0x21, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x14, 0xE2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x56, 0x0E, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, + 0x1E, 0x00, 0x00, 0x07, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xE6, 0x0A, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, + 0x12, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, + 0x94, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1F, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, +}; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.txt b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.txt new file mode 100644 index 000000000..4ad3f4288 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float24and32_cs.txt @@ -0,0 +1,117 @@ +// +// Generated by Microsoft (R) HLSL Shader Compiler 10.1 +// +// +// Buffer Definitions: +// +// cbuffer XeEdramLoadStoreConstants +// { +// +// 
uint xe_edram_rt_color_depth_offset;// Offset: 0 Size: 4 +// uint xe_edram_rt_color_depth_pitch;// Offset: 4 Size: 4 +// uint xe_edram_rt_stencil_offset; // Offset: 8 Size: 4 +// uint xe_edram_rt_stencil_pitch; // Offset: 12 Size: 4 +// uint xe_edram_base_samples_2x_depth_pitch;// Offset: 16 Size: 4 +// +// } +// +// +// Resource Bindings: +// +// Name Type Format Dim ID HLSL Bind Count +// ------------------------------ ---------- ------- ----------- ------- -------------- ------ +// xe_edram_load_store_source texture byte r/o T0 t0 1 +// xe_edram_load_store_dest UAV byte r/w U0 u0 1 +// XeEdramLoadStoreConstants cbuffer NA NA CB0 cb0 1 +// +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Input +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Output +cs_5_1 +dcl_globalFlags refactoringAllowed +dcl_constantbuffer CB0[0:0][2], immediateIndexed, space=0 +dcl_resource_raw T0[0:0], space=0 +dcl_uav_raw U0[0:0], space=0 +dcl_input vThreadGroupID.xy +dcl_input vThreadIDInGroup.xy +dcl_input vThreadID.xy +dcl_temps 7 +dcl_thread_group 20, 16, 1 +ishl r0.x, vThreadIDInGroup.x, l(2) +and r0.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0) +if_nz r0.y + ult r0.y, vThreadIDInGroup.x, l(10) + uge r0.w, vThreadIDInGroup.x, l(10) + and r0.yw, r0.yyyw, l(0, 40, 0, -40) + iadd r0.y, r0.w, r0.y + iadd r0.x, r0.y, r0.x +endif +ushr r0.y, CB0[0][1].x, l(16) +imad r0.y, vThreadGroupID.y, r0.y, r0.z +iadd r0.y, r0.y, vThreadGroupID.x +imul null, r0.z, vThreadIDInGroup.y, l(320) +imad r0.y, r0.y, l(5120), r0.z +ishl r0.x, r0.x, l(2) +iadd r0.x, r0.x, r0.y +ubfe r0.y, l(1), l(13), CB0[0][1].x +ishl r0.y, r0.y, l(1) +ishl r0.x, r0.x, r0.y +ld_raw r1.xyzw, r0.x, T0[0].xyzw +ushr r2.xyzw, r1.xyzw, l(8, 8, 8, 8) +iadd r0.x, r0.x, l(0x00a00000) +ld_raw r0.xyzw, r0.x, T0[0].xyzw 
+ubfe r3.xyzw, l(20, 20, 20, 20), l(8, 8, 8, 8), r1.xyzw +ushr r4.xyzw, r2.xyzw, l(20, 20, 20, 20) +firstbit_hi r5.xyzw, r3.xyzw +iadd r5.xyzw, r5.xyzw, l(-11, -11, -11, -11) +movc r5.xyzw, r3.xyzw, r5.xyzw, l(21,21,21,21) +iadd r6.xyzw, -r5.xyzw, l(1, 1, 1, 1) +movc r6.xyzw, r4.xyzw, r4.xyzw, r6.xyzw +ishl r5.xyzw, r3.xyzw, r5.xyzw +and r5.xyzw, r5.xyzw, l(0x000fffff, 0x000fffff, 0x000fffff, 0x000fffff) +movc r3.xyzw, r4.xyzw, r3.xyzw, r5.xyzw +ishl r4.xyzw, r6.xyzw, l(23, 23, 23, 23) +iadd r4.xyzw, r4.xyzw, l(0x38000000, 0x38000000, 0x38000000, 0x38000000) +ishl r3.xyzw, r3.xyzw, l(3, 3, 3, 3) +iadd r3.xyzw, r4.xyzw, r3.xyzw +movc r3.xyzw, r2.xyzw, r3.xyzw, l(0,0,0,0) +iadd r4.xyzw, r0.xyzw, -r3.xyzw +uge r5.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r0.xyzw +and r0.xyzw, r0.xyzw, r5.xyzw +umin r0.xyzw, r0.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8) +bfi r5.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r0.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000) +ushr r6.xyzw, r0.xyzw, l(23, 23, 23, 23) +iadd r6.xyzw, -r6.xyzw, l(113, 113, 113, 113) +umin r6.xyzw, r6.xyzw, l(24, 24, 24, 24) +ushr r5.xyzw, r5.xyzw, r6.xyzw +ult r6.xyzw, r0.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000) +iadd r0.xyzw, r0.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000) +movc r0.xyzw, r6.xyzw, r5.xyzw, r0.xyzw +iadd r5.xyzw, r0.xyzw, l(3, 3, 3, 3) +ubfe r0.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r0.xyzw +iadd r0.xyzw, r0.xyzw, r5.xyzw +ubfe r0.xyzw, l(24, 24, 24, 24), l(3, 3, 3, 3), r0.xyzw +ieq r0.xyzw, r2.xyzw, r0.xyzw +and r0.xyzw, r0.xyzw, l(1, 1, 1, 1) +imad r0.xyzw, r4.xyzw, r0.xyzw, r3.xyzw +ishl r2.xy, vThreadID.xxxx, l(4, 2, 0, 0) +imad r2.xy, vThreadID.yyyy, CB0[0][0].ywyy, r2.xyxx +iadd r2.xy, r2.xyxx, CB0[0][0].xzxx +store_raw U0[0].xyzw, r2.x, r0.xyzw +and r0.x, r1.x, l(255) +bfi r0.yzw, l(0, 8, 8, 8), l(0, 8, 16, 24), r1.yyzw, l(0, 0, 0, 0) +iadd r0.xy, r0.zwzz, r0.xyxx +iadd r0.x, r0.y, r0.x +store_raw U0[0].x, r2.y, r0.x +ret +// 
Approximately 67 instruction slots used diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.cso index c389242a1ee9667b3dfdf6346a236e612ce4770b..01be358b0614568897a08046e96147be2d234110 100644 GIT binary patch delta 343 zcmZ1@{Y1pbCBn&B{2epf<-CZ}H0$Vg{hu!L7#SECQn(lxSb?+&5c4oIFvtM04iKN< z+-UTViE9cQ14AHC!BWP}R?MrICVQ|+aB4Cza2hcPFfdL&$gLa-G9-+ZiQ!tS0g#0N zZXgvvt{hA?kPQ=SYd}>S%D}+&38Ep&Mccn2flg3oA-V1z4`fGKWO$Itlsbce7jc< zm!CddGn?;ke*ExS0XUcdJhrzO=S#p<#sZ@{aoGEH_dT*uyWd*G~=2QEAfwPb54U@ z&=F4$GB)+6C ze?R~3xdB7jtA7T7o6zG4ASWZY|EAn zZK7@^xYzJT$SyGHC3qe<31%E6`<=Qs;Q8RWa9;^-L5yd|yc+mHzMP%(i(xkJM{!sV z{3yIs&IQuSlnw^5*JeGi%HrJ1;!y=e1K*^ zw(NtRn7MGrfa4tI!WDqyjOW7L1g`rZaP^^-V?P%b!?K?bO7Vmrj*mw{Ecq*8Igc?+ z0CC#L1rgQ#USzru9_-qpreIc zIgG;b*zG;F#XUTd8F4cc*?drT2gAuyp@^6Mz8(E?By;E(XH23z-t;?jczEnS7V#ub z{25xkFkcd9vBv3peN`WQpw%ip!P=&;HQH!+ME$q^8pwM3Se1cpN2<~_65BHYJ$LLO zj`#PMd&9p)-+5J@TKOot=yZ#CAFHvVg@SgN~SX~aImSbhM{ z+$rOizUI~v;dugT+S$M3Rd@Y5*1gW@MKxde&r_W0DCsw9#ZVn%{hFECZ}VM|9>(Ce zhxQeaHdEJ3c07>^%(HyjL?xW&sf~YrDhLT8JR4(qQp3#Rd1x@)vUbVyhl6-?9vkU zL2|stv!R_j;%(MGC`q`BN!G$zQ?4Tk4}N&ZJi5*qebTy5oRj0>L8{=c@XrvmR@&Rx z3w?3+$@NV!@wd!-mb}qIkC>^aMt^4E>^)@MUOqq{a3MK+jl0L-H79M_hw6COSp&u_RMK)z^D|n3g0u0v(>~wj(T5eO98hTx#g|ML9NNm(L-a1S_jc7}jq0r8bR&Jzk*o63?@tTnO3RBXcNL{ne*ce#Jb;On!V?hln-O8gzfO zK3dhrxFV0Gl`k#yWv%;AZf5kr=zY9z2lKyz)K(MtXBJrmuJ&bgtpw-PO!WC5V$9#t CxiDz} literal 0 HcmV?d00001 diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.h b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.h new file mode 100644 index 000000000..c5a2d2118 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.h @@ -0,0 +1,226 @@ +// generated from `xb buildhlsl` +// source: edram_store_depth_float24and32.cs.hlsl +const uint8_t edram_store_depth_float24and32_cs[] = { + 0x44, 0x58, 0x42, 0x43, 0xC6, 0x10, 0x80, 0x14, 0x97, 0x01, 0xE4, 0x46, + 0x76, 0xF1, 
0x67, 0xD3, 0xDF, 0x50, 0x25, 0xF7, 0x01, 0x00, 0x00, 0x00, + 0x64, 0x0A, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x0C, 0x03, 0x00, 0x00, 0x1C, 0x03, 0x00, 0x00, 0x2C, 0x03, 0x00, 0x00, + 0xC8, 0x09, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0xD0, 0x02, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0x53, 0x43, 0x00, 0x05, 0x00, 0x00, + 0xA8, 0x02, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xB4, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xCF, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x6C, 0x6F, 0x61, + 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x73, 0x6F, 0x75, 0x72, + 0x63, 0x65, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, + 0x6C, 0x6F, 0x61, 0x64, 0x5F, 0x73, 0x74, 0x6F, 0x72, 0x65, 0x5F, 0x64, + 0x65, 0x73, 0x74, 0x00, 0x58, 0x65, 0x45, 0x64, 0x72, 0x61, 0x6D, 0x4C, + 0x6F, 0x61, 0x64, 0x53, 0x74, 0x6F, 0x72, 0x65, 0x43, 0x6F, 0x6E, 0x73, + 0x74, 0x61, 0x6E, 0x74, 0x73, 0x00, 0xAB, 0xAB, 0xE8, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x1C, 0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE4, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0x30, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x4E, 0x02, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x69, 0x02, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0x83, 0x02, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0C, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0x00, 0x00, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, + 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, 0x72, 0x5F, 0x64, 0x65, + 0x70, 0x74, 0x68, 0x5F, 0x6F, 0x66, 0x66, 0x73, 0x65, 0x74, 0x00, 0x64, + 0x77, 0x6F, 0x72, 0x64, 0x00, 0xAB, 0xAB, 0xAB, 0x00, 0x00, 0x13, 0x00, + 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x78, 0x65, 0x5F, 0x65, + 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x63, 0x6F, 0x6C, 0x6F, + 0x72, 0x5F, 0x64, 0x65, 0x70, 0x74, 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, + 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, 0x6D, 0x5F, 0x72, + 0x74, 0x5F, 
0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, 0x5F, 0x6F, 0x66, + 0x66, 0x73, 0x65, 0x74, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, 0x72, 0x61, + 0x6D, 0x5F, 0x72, 0x74, 0x5F, 0x73, 0x74, 0x65, 0x6E, 0x63, 0x69, 0x6C, + 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x78, 0x65, 0x5F, 0x65, 0x64, + 0x72, 0x61, 0x6D, 0x5F, 0x62, 0x61, 0x73, 0x65, 0x5F, 0x73, 0x61, 0x6D, + 0x70, 0x6C, 0x65, 0x73, 0x5F, 0x32, 0x78, 0x5F, 0x64, 0x65, 0x70, 0x74, + 0x68, 0x5F, 0x70, 0x69, 0x74, 0x63, 0x68, 0x00, 0x4D, 0x69, 0x63, 0x72, + 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, 0x29, 0x20, 0x48, 0x4C, + 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65, 0x72, 0x20, 0x43, 0x6F, + 0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31, 0x30, 0x2E, 0x31, 0x00, + 0x49, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x4F, 0x53, 0x47, 0x4E, 0x08, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x53, 0x48, 0x45, 0x58, + 0x94, 0x06, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0xA5, 0x01, 0x00, 0x00, + 0x6A, 0x08, 0x00, 0x01, 0x59, 0x00, 0x00, 0x07, 0x46, 0x8E, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x06, + 0x46, 0x7E, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x06, + 0x46, 0xEE, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x02, + 0x32, 0x10, 0x02, 0x00, 0x5F, 0x00, 0x00, 0x02, 0x32, 0x20, 0x02, 0x00, + 0x5F, 0x00, 0x00, 0x02, 0x32, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02, + 0x05, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x09, + 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x02, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x0A, + 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x56, 0x05, 0x02, 0x00, + 0xD6, 0x85, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x46, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x09, 0x32, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x80, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xA5, 0x00, 0x00, 0x08, 0xF2, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x7E, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0x7F, + 0xFF, 0xFF, 0xFF, 0x7F, 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, + 0xF8, 0xFF, 0xFF, 0x3F, 0xF8, 0xFF, 0xFF, 0x3F, 0x8C, 0x00, 0x00, 0x14, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 
0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0B, + 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x80, + 0x41, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x07, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, 0x00, 0x00, 0x80, 0x38, + 0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, + 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC8, 0x37, 0x00, 0x00, 0x09, + 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x0A, + 0xF2, 0x00, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x0F, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x03, 0x00, 
0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0xF2, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xF2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xA5, 0x00, 0x00, 0x08, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x70, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x0A, 0xE2, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x11, + 0xF2, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x46, 0x0E, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x06, 0x12, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0C, 0x62, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x80, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0xFF, 0x07, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x04, 0x03, 0x1A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00, + 0x02, 0x00, 
0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x06, 0x82, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x0A, 0xA2, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x56, 0x0D, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xD8, 0xFF, 0xFF, 0xFF, 0x1E, 0x00, 0x00, 0x07, + 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x01, 0x55, 0x00, 0x00, 0x09, + 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x08, + 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x10, 0x02, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x06, 0x22, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0A, 0x10, 0x02, 0x00, 0x26, 0x00, 0x00, 0x07, 0x00, 0xD0, 0x00, 0x00, + 0x42, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x20, 0x02, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x09, + 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, + 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1E, 0x00, 
0x00, 0x07, 0x12, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x0B, 0x22, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, 0x0A, 0x80, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x07, 0x22, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x08, + 0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0xA0, 0x00, 0xA6, 0x00, 0x00, 0x08, + 0xF2, 0xE0, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 0x0E, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, + 0x94, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, +}; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.txt b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.txt new file mode 100644 index 000000000..1a0cc82cc --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float24and32_cs.txt @@ -0,0 +1,95 @@ +// +// Generated by Microsoft (R) HLSL Shader Compiler 10.1 +// +// +// Buffer Definitions: +// +// cbuffer XeEdramLoadStoreConstants +// { +// +// uint xe_edram_rt_color_depth_offset;// Offset: 0 Size: 4 +// uint xe_edram_rt_color_depth_pitch;// Offset: 4 Size: 4 +// uint xe_edram_rt_stencil_offset; // Offset: 8 Size: 4 +// uint xe_edram_rt_stencil_pitch; // Offset: 12 Size: 4 +// uint xe_edram_base_samples_2x_depth_pitch;// Offset: 16 Size: 4 +// +// } +// +// +// Resource Bindings: +// +// Name Type Format Dim ID HLSL Bind Count +// ------------------------------ ---------- ------- ----------- ------- -------------- ------ +// xe_edram_load_store_source texture byte r/o T0 t0 1 +// xe_edram_load_store_dest UAV byte r/w U0 u0 1 +// XeEdramLoadStoreConstants cbuffer NA NA CB0 cb0 1 +// +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Input +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// no Output +cs_5_1 +dcl_globalFlags refactoringAllowed +dcl_constantbuffer CB0[0:0][2], immediateIndexed, space=0 +dcl_resource_raw T0[0:0], space=0 +dcl_uav_raw U0[0:0], space=0 +dcl_input vThreadGroupID.xy +dcl_input 
vThreadIDInGroup.xy +dcl_input vThreadID.xy +dcl_temps 5 +dcl_thread_group 20, 16, 1 +ishl r0.xy, vThreadID.xxxx, l(4, 2, 0, 0) +imad r0.xy, vThreadID.yyyy, CB0[0][0].ywyy, r0.xyxx +iadd r0.xy, r0.xyxx, CB0[0][0].xzxx +ld_raw r1.xyzw, r0.x, T0[0].xyzw +uge r2.xyzw, l(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff), r1.xyzw +and r2.xyzw, r1.xyzw, r2.xyzw +umin r2.xyzw, r2.xyzw, l(0x3ffffff8, 0x3ffffff8, 0x3ffffff8, 0x3ffffff8) +bfi r3.xyzw, l(23, 23, 23, 23), l(0, 0, 0, 0), r2.xyzw, l(0x00800000, 0x00800000, 0x00800000, 0x00800000) +ushr r4.xyzw, r2.xyzw, l(23, 23, 23, 23) +iadd r4.xyzw, -r4.xyzw, l(113, 113, 113, 113) +umin r4.xyzw, r4.xyzw, l(24, 24, 24, 24) +ushr r3.xyzw, r3.xyzw, r4.xyzw +ult r4.xyzw, r2.xyzw, l(0x38800000, 0x38800000, 0x38800000, 0x38800000) +iadd r2.xyzw, r2.xyzw, l(0xc8000000, 0xc8000000, 0xc8000000, 0xc8000000) +movc r2.xyzw, r4.xyzw, r3.xyzw, r2.xyzw +iadd r3.xyzw, r2.xyzw, l(3, 3, 3, 3) +ubfe r2.xyzw, l(1, 1, 1, 1), l(3, 3, 3, 3), r2.xyzw +iadd r2.xyzw, r2.xyzw, r3.xyzw +ushr r2.xyzw, r2.xyzw, l(3, 3, 3, 3) +ld_raw r0.x, r0.y, T0[0].xxxx +ushr r0.yzw, r0.xxxx, l(0, 8, 16, 24) +bfi r0.xyzw, l(24, 24, 24, 24), l(8, 8, 8, 8), r2.xyzw, r0.xyzw +ishl r2.x, vThreadIDInGroup.x, l(2) +and r2.yz, CB0[0][1].xxxx, l(0, 0x00008000, 2047, 0) +if_nz r2.y + ult r2.y, vThreadIDInGroup.x, l(10) + uge r2.w, vThreadIDInGroup.x, l(10) + and r2.yw, r2.yyyw, l(0, 40, 0, -40) + iadd r2.y, r2.w, r2.y + iadd r2.x, r2.y, r2.x +endif +ushr r2.y, CB0[0][1].x, l(16) +imad r2.y, vThreadGroupID.y, r2.y, r2.z +iadd r2.y, r2.y, vThreadGroupID.x +imul null, r2.z, vThreadIDInGroup.y, l(320) +imad r2.y, r2.y, l(5120), r2.z +ishl r2.x, r2.x, l(2) +iadd r2.x, r2.x, r2.y +ubfe r2.y, l(1), l(13), CB0[0][1].x +ishl r2.y, r2.y, l(1) +ishl r2.x, r2.x, r2.y +store_raw U0[0].xyzw, r2.x, r0.xyzw +iadd r0.x, r2.x, l(0x00a00000) +store_raw U0[0].xyzw, r0.x, r1.xyzw +ret +// Approximately 45 instruction slots used diff --git 
a/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.cso index 1dd12cb19ed6d6ff47944307e6569ccf24516077..b636e8d752c13074a4df604f5c44f04b355f32b7 100644 GIT binary patch delta 740 zcmZ8eKTASU6hHU<^}RAopGYRXN{q_Dj244JEu*a@728`HB4~?;crRiN$^bE}Ijnjk{VN$&4WNklHP6TgOySS+J7fGr88t||J`@1P?-iR#ys4xR zX)CsYVu3W32%lGde$lDr5RrL7?QvwjuiCt(S;LzEW(NY$Q#X*;oAROB=+Cs!h%*(P z#)Lxh<|Q=Z6tq;DbcpNZ=y?bp@ykN%Vf;b63A?|L05bqukkq`WN6>XoPbCP)oZ6&t zHOD1jS+v6b7WU5^{)--Qa##GM{?zSd(k%&1RNR!#@kh+ delta 833 zcmZuwOG{fp6#nizH*Mr%5&}WIZ9{}ULQLt#LM74awn|m}0fM+F-F4GNZI@jYPQYzl zC}x=-pwLa}GLTgl(k}7`%*LgjGnsQ0abPlc&i6Rq43m@6sMLD9y|ny0*)JV9qt#Cr zyTgBL^-Bl9P96w&eZ}*k2h8w%!SheQb>+t05LjpYh!(t$PCz|x=(zMXEYc4@7rU%k zol4-+p`SV){qmcPUifL~8-&jMD$NJE!epE5cew1+N|463!n+SaTIYVoNLNAM0r@c! zDAT1r>uy^94bq10l%sm7&su#}qk5Qfyp^!ovUnu^>Y=OfH?3A{E**sjW!*6WPl0e+ z)U+N|*mW;z)&=L(;S=kcf8@exvlZ9Zxc<%XPos)C#a2Po=TZ7bbg!+ZYFADl8GpvK zmbX|La5!OX`p`nI_C*}3iJLK>-MD0FG8g8}u{#F9l6JL8+x#MG>U^uSoB}04efjiR zI!^V{T%wR9$nEGFS(|?c9!z2v;^*cX3__i_Xby3ge+=Zt{~xpKWGF7LPKNf9Z^cYl n{KS1);fYpW{vu!F`;E>!q*5mlQ)xs^e+>?T!9^$z z4uY$D7XJi62S-s=UYwlzy!yTG%BQ#O z{FgIFZe$+5OqtnG#w-ng3ib;2yvytz>`|{-wLE>X0nNubJfzZb%p~%`!SXTsA=rqs zqwsrR#MS2Qwbpv;&PFs+EkrZ3wb`h)e5crSAnNsFO?IM-iY-E}fpBPA)y92v8^I**O3G z@C0>o`j75X==eMU%VDVX>BXr^rCPSy^_z38_2x#i^}srvP6hXU9Ba6qvx#5=^J5@| zO(Lzlw6d|xoZ8Iv;%fxw5&s6e8(6AQw4TW@O!~3b60Sb=#%vj$@4)FONDm~tHyd^9 z<}Az%J1td8dOhybvjrKgmv-32>N~`k|1*v+JhQOhuXzZ)86#9@>$zA9DF5*d)b@}}*c<0idwWi|b`900r#r!VkLdo~z7GyOK)lzNM z$$NdJaZlYSu<7Mf;~gocwLA4!k4-D)*4#O}kk4}-fa%+%#y&L8kLY#lpIRyX#BtTj zTPY*$tT(?Oc;CjYlvPaY{WS+s8+#2n51s?DKkkQ`=0&b~$@9F_3*H|7>f^=y-f^5m z?Jzs+_NMo4#c?L6Ao f>J#-Bh~;7WN6Yhn#_yl|{2y?={?ohV{Y%YXB~X9q literal 0 HcmV?d00001 diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.h b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.h new file mode 100644 index 000000000..b155ee5d0 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.h @@ 
-0,0 +1,156 @@ +// generated from `xb buildhlsl` +// source: float24_round.ps.hlsl +const uint8_t float24_round_ps[] = { + 0x44, 0x58, 0x42, 0x43, 0xDF, 0x71, 0xF3, 0x0A, 0x4A, 0xDB, 0xC3, 0x80, + 0x1E, 0xE4, 0x39, 0x21, 0x59, 0x07, 0x78, 0x97, 0x01, 0x00, 0x00, 0x00, + 0x18, 0x07, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0xA0, 0x00, 0x00, 0x00, 0x90, 0x02, 0x00, 0x00, 0xC4, 0x02, 0x00, 0x00, + 0x7C, 0x06, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x64, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0xFF, 0xFF, 0x00, 0x05, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x4D, 0x69, 0x63, 0x72, 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, + 0x29, 0x20, 0x48, 0x4C, 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65, + 0x72, 0x20, 0x43, 0x6F, 0x6D, 0x70, 0x69, 0x6C, 0x65, 0x72, 0x20, 0x31, + 0x30, 0x2E, 0x31, 0x00, 0x49, 0x53, 0x47, 0x4E, 0xE8, 0x01, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0B, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xD9, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 
0x0F, 0x04, 0x00, 0x00, 0x54, 0x45, 0x58, 0x43, + 0x4F, 0x4F, 0x52, 0x44, 0x00, 0x53, 0x56, 0x5F, 0x50, 0x6F, 0x73, 0x69, + 0x74, 0x69, 0x6F, 0x6E, 0x00, 0xAB, 0xAB, 0xAB, 0x4F, 0x53, 0x47, 0x4E, + 0x2C, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x0E, 0x00, 0x00, + 0x53, 0x56, 0x5F, 0x44, 0x65, 0x70, 0x74, 0x68, 0x00, 0xAB, 0xAB, 0xAB, + 0x53, 0x48, 0x45, 0x58, 0xB0, 0x03, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, + 0xEC, 0x00, 0x00, 0x00, 0x6A, 0x08, 0x00, 0x01, 0x64, 0x38, 0x00, 0x04, + 0x42, 0x10, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x65, 0x00, 0x00, 0x02, 0x01, 0xC0, 0x00, 0x00, 0x68, 0x00, 0x00, 0x02, + 0x02, 0x00, 0x00, 0x00, 0x36, 0x20, 0x08, 0x05, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2A, 0x10, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x7F, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x08, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x54, 0x00, 0x08, 0x07, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0xF8, 0xFF, 0xFF, 0x3F, + 0x8C, 0x00, 0x10, 0x0B, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x55, 0x00, 0x20, 0x07, + 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x20, 0x08, 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x80, 
0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x54, 0x00, 0x20, 0x07, + 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x20, 0x07, 0x42, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x38, 0x1E, 0x00, 0x08, 0x07, + 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC8, + 0x37, 0x00, 0x08, 0x09, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x10, 0x07, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x08, 0x09, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x08, 0x07, 0x12, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x38, 0x0F, + 0x72, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x40, 0x05, + 0x82, 0x00, 0x10, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x40, 0x07, 0x82, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0xF5, 0xFF, 0xFF, 0xFF, 0x37, 0x00, 0x40, 0x09, + 0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x08, 0x08, + 0x12, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x80, + 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x37, 0x00, 0x08, 0x09, 0x12, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x29, 0x00, 0x40, 0x07, 0x82, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x40, 0x07, + 0x82, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00, + 0x37, 0x00, 0x10, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x20, 0x07, 0x42, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x20, 0x07, 0x42, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x29, 0x00, 0x10, 0x07, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x10, 0x07, 
0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x37, 0x00, 0x08, 0x08, 0x01, 0xC0, 0x00, 0x00, + 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, 0x94, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, +}; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.txt b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.txt new file mode 100644 index 000000000..c9661e6ac --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/float24_round_ps.txt @@ -0,0 +1,74 @@ +// +// Generated by Microsoft (R) HLSL Shader Compiler 10.1 +// +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// TEXCOORD 0 xyzw 0 NONE float +// TEXCOORD 1 xyzw 1 NONE float +// TEXCOORD 2 xyzw 2 NONE float +// TEXCOORD 3 xyzw 3 NONE float +// TEXCOORD 4 xyzw 4 NONE float +// TEXCOORD 5 
xyzw 5 NONE float +// TEXCOORD 6 xyzw 6 NONE float +// TEXCOORD 7 xyzw 7 NONE float +// TEXCOORD 8 xyzw 8 NONE float +// TEXCOORD 9 xyzw 9 NONE float +// TEXCOORD 10 xyzw 10 NONE float +// TEXCOORD 11 xyzw 11 NONE float +// TEXCOORD 12 xyzw 12 NONE float +// TEXCOORD 13 xyzw 13 NONE float +// TEXCOORD 14 xyzw 14 NONE float +// TEXCOORD 15 xyzw 15 NONE float +// TEXCOORD 16 xyz 16 NONE float +// TEXCOORD 17 xy 17 NONE float +// SV_Position 0 xyzw 18 POS float z +// +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// SV_Depth 0 N/A oDepth DEPTH float YES +// +// Pixel Shader runs at sample frequency +// +ps_5_1 +dcl_globalFlags refactoringAllowed +dcl_input_ps_siv linear noperspective sample v18.z, position +dcl_output oDepth +dcl_temps 2 +mov_sat [precise(x)] r0.x, v18.z +uge [precise(y)] r0.y, l(0x7fffffff), r0.x +and [precise(x)] r0.x, r0.x, r0.y +umin [precise(x)] r0.x, r0.x, l(0x3ffffff8) +bfi [precise(y)] r0.y, l(23), l(0), r0.x, l(0x00800000) +ushr [precise(z)] r0.z, r0.x, l(23) +iadd [precise(z)] r0.z, -r0.z, l(113) +umin [precise(z)] r0.z, r0.z, l(24) +ushr [precise(y)] r0.y, r0.y, r0.z +ult [precise(z)] r0.z, r0.x, l(0x38800000) +iadd [precise(x)] r0.x, r0.x, l(0xc8000000) +movc [precise(x)] r0.x, r0.z, r0.y, r0.x +iadd [precise(y)] r0.y, r0.x, l(3) +ubfe [precise(x)] r0.x, l(1), l(3), r0.x +iadd [precise(x)] r0.x, r0.x, r0.y +ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx +firstbit_hi [precise(w)] r0.w, r0.y +iadd [precise(w)] r0.w, r0.w, l(-11) +movc [precise(w)] r0.w, r0.y, r0.w, l(21) +iadd [precise(x)] r1.x, -r0.w, l(1) +movc [precise(x)] r1.x, r0.z, r0.z, r1.x +ishl [precise(w)] r0.w, r0.y, r0.w +and [precise(w)] r0.w, r0.w, l(0x000fffff) +movc [precise(y)] r0.y, r0.z, r0.y, r0.w +ishl [precise(z)] r0.z, r1.x, l(23) +iadd [precise(z)] r0.z, r0.z, l(0x38000000) +ishl [precise(y)] r0.y, r0.y, l(3) +iadd [precise(y)] r0.y, r0.z, 
r0.y +movc [precise(x)] oDepth, r0.x, r0.y, l(0) +ret +// Approximately 30 instruction slots used diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.cso b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.cso new file mode 100644 index 0000000000000000000000000000000000000000..a22366f58ae8811c10d9b0abd7460227ca0c1f55 GIT binary patch literal 1148 zcma)*ziU%r6o$`j5_=n?y*7v7iA~%b3U1=y z&{c5HKj7#pI0#NoI=Tw(@qI4$K&RphC+Gd1@0|PnxHr}IM&;A?ql9QW$o(Wv(F)$X2} zlg(PYq{eO4RTR%c3pzI~zY#_0x;(S1Paqi1SdXmnS{xP@(o>jS%j&i)nqACTG0VXn z72CneWy|L-WbCa)b6&?h`z4=K_P$FNgc+}TN7nM^GMCfbI}3~K^(Yrk vd&$0r?lf4lFi$PL=&aanMbcVt9cs+?!FiGYAILDz%>9qN|F~LT>$sYK-_TnG literal 0 HcmV?d00001 diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.h b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.h new file mode 100644 index 000000000..b8d1d7bb7 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.h @@ -0,0 +1,100 @@ +// generated from `xb buildhlsl` +// source: float24_truncate.ps.hlsl +const uint8_t float24_truncate_ps[] = { + 0x44, 0x58, 0x42, 0x43, 0xB8, 0x51, 0x55, 0x1D, 0xF4, 0xF1, 0xC9, 0xC0, + 0x0C, 0x22, 0xD3, 0x43, 0x94, 0xDF, 0x83, 0x9D, 0x01, 0x00, 0x00, 0x00, + 0x7C, 0x04, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0xA0, 0x00, 0x00, 0x00, 0x90, 0x02, 0x00, 0x00, 0xCC, 0x02, 0x00, 0x00, + 0xE0, 0x03, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x64, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x01, 0x05, 0xFF, 0xFF, 0x00, 0x05, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x13, 0x13, 0x44, 0x25, 0x3C, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x4D, 0x69, 0x63, 0x72, 0x6F, 0x73, 0x6F, 0x66, 0x74, 0x20, 0x28, 0x52, + 0x29, 0x20, 0x48, 0x4C, 0x53, 0x4C, 0x20, 0x53, 0x68, 0x61, 0x64, 0x65, + 0x72, 0x20, 0x43, 0x6F, 0x6D, 0x70, 0x69, 0x6C, 
0x65, 0x72, 0x20, 0x31, + 0x30, 0x2E, 0x31, 0x00, 0x49, 0x53, 0x47, 0x4E, 0xE8, 0x01, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0B, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x03, 0x00, 0x00, 0x00, + 0x0C, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xD0, 0x01, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xD9, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x0F, 0x04, 0x00, 0x00, 0x54, 0x45, 0x58, 0x43, + 0x4F, 0x4F, 0x52, 0x44, 0x00, 0x53, 0x56, 0x5F, 0x50, 0x6F, 0x73, 0x69, + 0x74, 0x69, 0x6F, 0x6E, 0x00, 0xAB, 0xAB, 0xAB, 0x4F, 0x53, 0x47, 0x4E, + 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x0E, 0x00, 0x00, + 0x53, 0x56, 0x5F, 0x44, 0x65, 0x70, 0x74, 0x68, 0x4C, 0x65, 0x73, 0x73, + 0x45, 0x71, 0x75, 0x61, 0x6C, 0x00, 0xAB, 0xAB, 0x53, 0x48, 0x45, 0x58, + 0x0C, 0x01, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x6A, 0x08, 0x00, 0x01, 0x64, 0x38, 0x00, 0x04, 0x42, 0x10, 0x10, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x02, + 0x01, 0x70, 0x02, 0x00, 0x68, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, 0x00, + 0x36, 0x20, 0x08, 0x05, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2A, 0x10, 0x10, 0x00, 0x12, 0x00, 0x00, 0x00, 0x50, 0x00, 0x10, 0x07, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x80, 0x2E, + 0x1F, 0x00, 0x04, 0x03, 0x1A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x10, 0x09, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x10, 0x08, 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x10, 0x80, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x24, 0x00, 0x10, 0x07, + 0x22, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x8C, 0x00, 0x08, 0x0A, 0x01, 0x70, 0x02, 0x00, 0x1A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x01, 0x36, 0x00, 0x08, 0x04, + 0x01, 0x70, 0x02, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x00, 0x01, 0x3E, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, + 0x94, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.txt b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.txt new file mode 100644 index 000000000..dd969f04d --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/dxbc/float24_truncate_ps.txt @@ -0,0 +1,55 @@ +// +// Generated by Microsoft (R) HLSL Shader Compiler 10.1 +// +// +// +// Input signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// TEXCOORD 0 xyzw 0 NONE float +// TEXCOORD 1 xyzw 1 NONE float +// TEXCOORD 2 xyzw 2 NONE float +// TEXCOORD 3 xyzw 3 NONE float +// TEXCOORD 4 xyzw 4 NONE float +// TEXCOORD 5 xyzw 5 NONE float +// TEXCOORD 6 xyzw 6 NONE float +// TEXCOORD 7 xyzw 7 NONE float +// TEXCOORD 8 xyzw 8 NONE float +// TEXCOORD 9 xyzw 9 NONE float +// TEXCOORD 10 xyzw 10 NONE float +// TEXCOORD 11 xyzw 11 NONE float +// TEXCOORD 12 xyzw 12 NONE float +// TEXCOORD 13 xyzw 13 NONE float +// TEXCOORD 14 xyzw 14 NONE float +// TEXCOORD 15 xyzw 15 NONE float +// TEXCOORD 16 xyz 16 NONE float +// TEXCOORD 17 xy 17 NONE float +// SV_Position 0 xyzw 18 POS float z +// +// +// Output signature: +// +// Name Index Mask Register SysValue Format Used +// -------------------- ----- ------ -------- -------- ------- ------ +// SV_DepthLessEqual 0 N/A oDepthLE DEPTHLE float YES +// +// Pixel Shader runs at sample frequency +// +ps_5_1 +dcl_globalFlags refactoringAllowed +dcl_input_ps_siv linear noperspective sample v18.z, position +dcl_output oDepthLE +dcl_temps 1 +mov_sat [precise(x)] r0.x, v18.z +uge [precise(y)] r0.y, r0.x, l(0x2e800000) +if_nz r0.y + ubfe [precise(y)] r0.y, l(8), l(23), r0.x + iadd [precise(y)] r0.y, -r0.y, l(116) + imax [precise(y)] r0.y, r0.y, l(3) + bfi [precise(x)] oDepthLE, r0.y, l(0), l(0), r0.x +else + mov [precise(x)] oDepthLE, l(0) +endif 
+ret +// Approximately 11 instruction slots used diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl index bc02b4623..ef72713a3 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl @@ -7,22 +7,14 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_thread_id : SV_DispatchThreadID) { uint2 tile_sample_index = xe_group_thread_id.xy; tile_sample_index.x *= 4u; - uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index); - uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset); - uint4 depth24 = depth24_stencil >> 8u; - uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset); - // Depth. If the stored 32-bit depth converted to 24-bit is the same as the - // stored 24-bit depth, load the 32-bit value because it has more precision - // (and multipass rendering is possible), if it's not, convert the 24-bit - // depth because it was overwritten by aliasing. - uint4 depth24to32 = XeFloat20e4To32(depth24); - uint4 depth = depth24to32 + (depth32 - depth24to32) * - uint4(XeFloat32To20e4(depth32) == depth24); + uint4 samples = xe_edram_load_store_source.Load4( + XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index)); + // Depth (exact conversion ensured during drawing). uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - xe_edram_load_store_dest.Store4(rt_offset, depth); + xe_edram_load_store_dest.Store4(rt_offset, XeFloat20e4To32(samples >> 8u)); // Stencil. 
- uint4 stencil = (depth24_stencil & 0xFFu) << uint4(0u, 8u, 16u, 24u); + uint4 stencil = (samples & 0xFFu) << uint4(0u, 8u, 16u, 24u); stencil.xy |= stencil.zw; stencil.x |= stencil.y; rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float24and32.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float24and32.cs.hlsl new file mode 100644 index 000000000..bc02b4623 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float24and32.cs.hlsl @@ -0,0 +1,31 @@ +#include "edram_load_store.hlsli" +#include "pixel_formats.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; + uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index); + uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset); + uint4 depth24 = depth24_stencil >> 8u; + uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset); + // Depth. If the stored 32-bit depth converted to 24-bit is the same as the + // stored 24-bit depth, load the 32-bit value because it has more precision + // (and multipass rendering is possible), if it's not, convert the 24-bit + // depth because it was overwritten by aliasing. + uint4 depth24to32 = XeFloat20e4To32(depth24); + uint4 depth = depth24to32 + (depth32 - depth24to32) * + uint4(XeFloat32To20e4(depth32) == depth24); + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; + xe_edram_load_store_dest.Store4(rt_offset, depth); + // Stencil. 
+ uint4 stencil = (depth24_stencil & 0xFFu) << uint4(0u, 8u, 16u, 24u); + stencil.xy |= stencil.zw; + stencil.x |= stencil.y; + rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + + xe_edram_rt_stencil_offset; + xe_edram_load_store_dest.Store(rt_offset, stencil.x); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl index ac7626721..d0123c69f 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl @@ -5,21 +5,18 @@ void main(uint3 xe_group_id : SV_GroupID, uint3 xe_group_thread_id : SV_GroupThreadID, uint3 xe_thread_id : SV_DispatchThreadID) { - // Depth. + // Depth (exact conversion ensured during drawing). uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset); - uint4 depth24_stencil = XeFloat32To20e4(depth32) << 8u; + uint4 samples = + XeFloat32To20e4(xe_edram_load_store_source.Load4(rt_offset)) << 8u; // Stencil. rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + xe_edram_rt_stencil_offset; - depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> - uint4(0u, 8u, 16u, 24u)) & 0xFFu; + samples |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> + uint4(0u, 8u, 16u, 24u)) & 0xFFu; uint2 tile_sample_index = xe_group_thread_id.xy; tile_sample_index.x *= 4u; - uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index); - // Store 24-bit depth for aliasing and checking if 32-bit depth is up to date. - xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil); - // Store 32-bit depth so precision isn't lost when doing multipass rendering. 
- xe_edram_load_store_dest.Store4(10485760u + edram_offset, depth32); + xe_edram_load_store_dest.Store4( + XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index), samples); } diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float24and32.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float24and32.cs.hlsl new file mode 100644 index 000000000..ac7626721 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float24and32.cs.hlsl @@ -0,0 +1,25 @@ +#include "edram_load_store.hlsli" +#include "pixel_formats.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + // Depth. + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; + uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset); + uint4 depth24_stencil = XeFloat32To20e4(depth32) << 8u; + // Stencil. + rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + + xe_edram_rt_stencil_offset; + depth24_stencil |= (xe_edram_load_store_source.Load(rt_offset).xxxx >> + uint4(0u, 8u, 16u, 24u)) & 0xFFu; + uint2 tile_sample_index = xe_group_thread_id.xy; + tile_sample_index.x *= 4u; + uint edram_offset = XeEdramOffset32bpp(xe_group_id.xy, tile_sample_index); + // Store 24-bit depth for aliasing and checking if 32-bit depth is up to date. + xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil); + // Store 32-bit depth so precision isn't lost when doing multipass rendering. 
+ xe_edram_load_store_dest.Store4(10485760u + edram_offset, depth32); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl index d5e782bbb..093f533af 100644 --- a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl @@ -7,8 +7,7 @@ void main(uint3 xe_group_id : SV_GroupID, // Depth. uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + xe_thread_id.x * 16u + xe_edram_rt_color_depth_offset; - uint4 samples = - (xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu) << 8u; + uint4 samples = xe_edram_load_store_source.Load4(rt_offset) << 8u; // Stencil. rt_offset = xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u + xe_edram_rt_stencil_offset; diff --git a/src/xenia/gpu/d3d12/shaders/float24_round.ps.hlsl b/src/xenia/gpu/d3d12/shaders/float24_round.ps.hlsl new file mode 100644 index 000000000..346b21b4f --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/float24_round.ps.hlsl @@ -0,0 +1,13 @@ +#include "pixel_formats.hlsli" +#include "xenos_draw.hlsli" + +struct XePSInput { + XeVertexPrePS pre_ps; + sample float4 position : SV_Position; +}; + +precise float main(XePSInput xe_input) : SV_Depth { + // Input Z may be outside the viewport range (it's clamped after the shader). 
+ return asfloat( + XeFloat20e4To32(XeFloat32To20e4(asuint(saturate(xe_input.position.z))))); +} diff --git a/src/xenia/gpu/d3d12/shaders/float24_truncate.ps.hlsl b/src/xenia/gpu/d3d12/shaders/float24_truncate.ps.hlsl new file mode 100644 index 000000000..83a5d08d9 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/float24_truncate.ps.hlsl @@ -0,0 +1,38 @@ +#include "pixel_formats.hlsli" +#include "xenos_draw.hlsli" + +struct XePSInput { + XeVertexPrePS pre_ps; + sample float4 position : SV_Position; +}; + +precise float main(XePSInput xe_input) : SV_DepthLessEqual { + // Simplified conversion, always less than or equal to the original value - + // just drop the lower bits. + // The float32 exponent bias is 127. + // After saturating, the exponent range is -127...0. + // The smallest normalized 20e4 exponent is -14 - should drop 3 mantissa bits + // at -14 or above. + // The smallest denormalized 20e4 number is -34 - should drop 23 mantissa bits + // at -34. + // Anything smaller than 2^-34 becomes 0. + // Input Z may be outside the viewport range (it's clamped after the shader). + precise uint depth = asuint(saturate(xe_input.position.z)); + // Check if the number is representable as a float24 after truncation - the + // exponent is at least -34. + if (depth >= 0x2E800000u) { + // Extract the biased float32 exponent: + // 113+ at exponent -14+. + // 93 at exponent -34. + uint exponent = (depth >> 23u) & 0xFFu; + // Convert exponent to the shift amount. + // 116 - 113 = 3. + // 116 - 93 = 23. + uint shift = asuint(max(116 - asint(exponent), 3)); + depth = depth >> shift << shift; + } else { + // The number is not representable as float24 after truncation - zero. 
+ depth = 0u; + } + return asfloat(depth); +} diff --git a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli index 1e7f5e319..e3654211d 100644 --- a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli +++ b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli @@ -495,6 +495,16 @@ void XeR11G11B10SNormToRGBA16(uint4 packed_texels, out uint4 out_01, // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). // We also can't clamp the stored value to 1 as load->store->load must be exact. +uint XeFloat32To20e4(uint f32u32) { + // Keep only positive (high bit set means negative for both float and int) and + // saturate to the maximum representable value near 2 (also dropping NaNs). + f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u); + uint denormalized = + ((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u); + uint f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u); + return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu; +} + uint4 XeFloat32To20e4(uint4 f32u32) { // Keep only positive (high bit set means negative for both float and int) and // saturate to the maximum representable value near 2 (also dropping NaNs). @@ -505,6 +515,21 @@ uint4 XeFloat32To20e4(uint4 f32u32) { return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu; } +uint XeFloat20e4To32(uint f24u32) { + uint mantissa = f24u32 & 0xFFFFFu; + uint exponent = f24u32 >> 20u; + // Normalize the values for the denormalized components. + // Exponent = 1; + // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0); + bool is_denormalized = exponent == 0u; + uint mantissa_lzcnt = 20u - firstbithigh(mantissa); + exponent = is_denormalized ? (1u - mantissa_lzcnt) : exponent; + mantissa = + is_denormalized ? ((mantissa << mantissa_lzcnt) & 0xFFFFFu) : mantissa; + // Combine into 32-bit float bits and clear zeros. + return (f24u32 != 0u) ? 
(((exponent + 112u) << 23u) | (mantissa << 3u)) : 0u; +} + uint4 XeFloat20e4To32(uint4 f24u32) { uint4 mantissa = f24u32 & 0xFFFFFu; uint4 exponent = f24u32 >> 20u; diff --git a/src/xenia/gpu/d3d12/shaders/primitive_point_list.gs.hlsl b/src/xenia/gpu/d3d12/shaders/primitive_point_list.gs.hlsl index 33d5a5c48..ab165504a 100644 --- a/src/xenia/gpu/d3d12/shaders/primitive_point_list.gs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/primitive_point_list.gs.hlsl @@ -10,9 +10,9 @@ void main(point XeVertexPreGS xe_in[1], } XeVertexPostGS xe_out; - xe_out.interpolators = xe_in[0].post_gs.interpolators; - xe_out.point_params.z = xe_in[0].post_gs.point_params.z; - xe_out.clip_space_zw = xe_in[0].post_gs.clip_space_zw; + xe_out.pre_ps.interpolators = xe_in[0].post_gs.pre_ps.interpolators; + xe_out.pre_ps.point_params.z = xe_in[0].post_gs.pre_ps.point_params.z; + xe_out.pre_ps.clip_space_zw = xe_in[0].post_gs.pre_ps.clip_space_zw; xe_out.position.zw = xe_in[0].post_gs.position.zw; xe_out.clip_distance_0123 = xe_in[0].post_gs.clip_distance_0123; xe_out.clip_distance_45 = xe_in[0].post_gs.clip_distance_45; @@ -20,26 +20,27 @@ void main(point XeVertexPreGS xe_in[1], // Shader header writes -1.0f to point_size by default, so any positive value // means that it was overwritten by the translated vertex shader. float2 point_size = - (xe_in[0].post_gs.point_params.z > 0.0f ? xe_in[0].post_gs.point_params.zz - : xe_point_size); + xe_in[0].post_gs.pre_ps.point_params.z > 0.0f + ? xe_in[0].post_gs.pre_ps.point_params.zz + : xe_point_size; point_size = clamp(point_size, xe_point_size_min_max.xx, xe_point_size_min_max.yy) * xe_point_screen_to_ndc * xe_in[0].post_gs.position.w; - xe_out.point_params.xy = float2(0.0, 0.0); + xe_out.pre_ps.point_params.xy = float2(0.0, 0.0); // TODO(Triang3l): On Vulkan, sign of Y needs to inverted because of // upper-left origin. // TODO(Triang3l): Investigate the true signs of point sprites. 
xe_out.position.xy = xe_in[0].post_gs.position.xy + float2(-point_size.x, point_size.y); xe_stream.Append(xe_out); - xe_out.point_params.xy = float2(0.0, 1.0); + xe_out.pre_ps.point_params.xy = float2(0.0, 1.0); xe_out.position.xy = xe_in[0].post_gs.position.xy - point_size; xe_stream.Append(xe_out); - xe_out.point_params.xy = float2(1.0, 0.0); + xe_out.pre_ps.point_params.xy = float2(1.0, 0.0); xe_out.position.xy = xe_in[0].post_gs.position.xy + point_size; xe_stream.Append(xe_out); - xe_out.point_params.xy = float2(1.0, 1.0); + xe_out.pre_ps.point_params.xy = float2(1.0, 1.0); xe_out.position.xy = xe_in[0].post_gs.position.xy + float2(point_size.x, -point_size.y); xe_stream.Append(xe_out); diff --git a/src/xenia/gpu/d3d12/shaders/primitive_rectangle_list.gs.hlsl b/src/xenia/gpu/d3d12/shaders/primitive_rectangle_list.gs.hlsl index 8411e54c2..45b7b05e5 100644 --- a/src/xenia/gpu/d3d12/shaders/primitive_rectangle_list.gs.hlsl +++ b/src/xenia/gpu/d3d12/shaders/primitive_rectangle_list.gs.hlsl @@ -80,16 +80,19 @@ void main(triangle XeVertexPreGS xe_in[3], v3_signs = float3(1.0f, 1.0f, -1.0f); } [unroll] for (int i = 0; i < 16; ++i) { - xe_out.interpolators[i] = v3_signs.x * xe_in[0].post_gs.interpolators[i] + - v3_signs.y * xe_in[1].post_gs.interpolators[i] + - v3_signs.z * xe_in[2].post_gs.interpolators[i]; + xe_out.pre_ps.interpolators[i] = + v3_signs.x * xe_in[0].post_gs.pre_ps.interpolators[i] + + v3_signs.y * xe_in[1].post_gs.pre_ps.interpolators[i] + + v3_signs.z * xe_in[2].post_gs.pre_ps.interpolators[i]; } - xe_out.point_params = v3_signs.x * xe_in[0].post_gs.point_params + - v3_signs.y * xe_in[1].post_gs.point_params + - v3_signs.z * xe_in[2].post_gs.point_params; - xe_out.clip_space_zw = v3_signs.x * xe_in[0].post_gs.clip_space_zw + - v3_signs.y * xe_in[1].post_gs.clip_space_zw + - v3_signs.z * xe_in[2].post_gs.clip_space_zw; + xe_out.pre_ps.point_params = + v3_signs.x * xe_in[0].post_gs.pre_ps.point_params + + v3_signs.y * 
xe_in[1].post_gs.pre_ps.point_params + + v3_signs.z * xe_in[2].post_gs.pre_ps.point_params; + xe_out.pre_ps.clip_space_zw = + v3_signs.x * xe_in[0].post_gs.pre_ps.clip_space_zw + + v3_signs.y * xe_in[1].post_gs.pre_ps.clip_space_zw + + v3_signs.z * xe_in[2].post_gs.pre_ps.clip_space_zw; xe_out.position = v3_signs.x * xe_in[0].post_gs.position + v3_signs.y * xe_in[1].post_gs.position + v3_signs.z * xe_in[2].post_gs.position; diff --git a/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli b/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli index a7e841eeb..98c5f26ed 100644 --- a/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli +++ b/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli @@ -63,10 +63,14 @@ struct XeHSControlPointOutput { float index : XEVERTEXID; }; -struct XeVertexPostGS { +struct XeVertexPrePS { float4 interpolators[16] : TEXCOORD0; float3 point_params : TEXCOORD16; float2 clip_space_zw : TEXCOORD17; +}; + +struct XeVertexPostGS { + XeVertexPrePS pre_ps; // Precise needed to preserve NaN - guest primitives may be converted to more // than 1 triangle, so need to kill them entirely manually in GS if any vertex // is NaN. 
diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 98cc90615..a9d9fff92 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -114,6 +114,7 @@ int32_t FloatToD3D11Fixed16p8(float f32) { void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, float pixel_size_y, bool origin_bottom_left, float x_max, float y_max, bool allow_reverse_z, + bool convert_z_to_float24, ViewportInfo& viewport_info_out) { assert_true(pixel_size_x >= 1.0f); assert_true(pixel_size_y >= 1.0f); @@ -270,6 +271,17 @@ void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, ndc_scale_z = -ndc_scale_z; ndc_offset_z = 1.0f - ndc_offset_z; } + if (convert_z_to_float24 && regs.Get().z_enable && + regs.Get().depth_format == + xenos::DepthRenderTargetFormat::kD24FS8) { + // Need to adjust the bounds that the resulting depth values will be clamped + // to after the pixel shader. Preferring adding some error to interpolated Z + // instead if conversion can't be done exactly, without modifying clipping + // bounds by adjusting Z in vertex shaders, as that may cause polygons + // placed explicitly at Z = 0 or Z = W to be clipped. 
+ viewport_z_min = xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_min)); + viewport_z_max = xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_max)); + } viewport_info_out.left = viewport_left; viewport_info_out.top = viewport_top; diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 1a9798aeb..c47640a20 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -53,6 +53,7 @@ struct ViewportInfo { void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, float pixel_size_y, bool origin_bottom_left, float x_max, float y_max, bool allow_reverse_z, + bool convert_z_to_float24, ViewportInfo& viewport_info_out); struct Scissor { diff --git a/src/xenia/gpu/dxbc_shader.cc b/src/xenia/gpu/dxbc_shader.cc new file mode 100644 index 000000000..144308d57 --- /dev/null +++ b/src/xenia/gpu/dxbc_shader.cc @@ -0,0 +1,27 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/gpu/dxbc_shader.h" + +#include + +namespace xe { +namespace gpu { + +DxbcShader::DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash, + const uint32_t* dword_ptr, uint32_t dword_count) + : Shader(shader_type, data_hash, dword_ptr, dword_count) {} + +Shader::Translation* DxbcShader::CreateTranslationInstance( + uint32_t modification) { + return new DxbcTranslation(*this, modification); +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/dxbc_shader.h b/src/xenia/gpu/dxbc_shader.h new file mode 100644 index 000000000..49439a2a6 --- /dev/null +++ b/src/xenia/gpu/dxbc_shader.h @@ -0,0 +1,83 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_DXBC_SHADER_H_ +#define XENIA_GPU_DXBC_SHADER_H_ + +#include + +#include "xenia/gpu/dxbc_shader_translator.h" +#include "xenia/gpu/shader.h" +#include "xenia/gpu/xenos.h" + +namespace xe { +namespace gpu { + +class DxbcShader : public Shader { + public: + class DxbcTranslation : public Translation { + public: + DxbcTranslation(DxbcShader& shader, uint32_t modification) + : Translation(shader, modification) {} + }; + + DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash, + const uint32_t* dword_ptr, uint32_t dword_count); + + static constexpr uint32_t kMaxTextureBindingIndexBits = + DxbcShaderTranslator::kMaxTextureBindingIndexBits; + static constexpr uint32_t kMaxTextureBindings = + DxbcShaderTranslator::kMaxTextureBindings; + struct TextureBinding { + uint32_t bindless_descriptor_index; + uint32_t fetch_constant; + // Stacked and 3D are separate TextureBindings, even for bindless for null + // descriptor handling simplicity. + xenos::FetchOpDimension dimension; + bool is_signed; + }; + // Safe to hash and compare with memcmp for layout hashing. 
+ const TextureBinding* GetTextureBindings(uint32_t& count_out) const { + count_out = uint32_t(texture_bindings_.size()); + return texture_bindings_.data(); + } + const uint32_t GetUsedTextureMask() const { return used_texture_mask_; } + + static constexpr uint32_t kMaxSamplerBindingIndexBits = + DxbcShaderTranslator::kMaxSamplerBindingIndexBits; + static constexpr uint32_t kMaxSamplerBindings = + DxbcShaderTranslator::kMaxSamplerBindings; + struct SamplerBinding { + uint32_t bindless_descriptor_index; + uint32_t fetch_constant; + xenos::TextureFilter mag_filter; + xenos::TextureFilter min_filter; + xenos::TextureFilter mip_filter; + xenos::AnisoFilter aniso_filter; + }; + const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const { + count_out = uint32_t(sampler_bindings_.size()); + return sampler_bindings_.data(); + } + + protected: + Translation* CreateTranslationInstance(uint32_t modification) override; + + private: + friend class DxbcShaderTranslator; + + std::vector texture_bindings_; + std::vector sampler_bindings_; + uint32_t used_texture_mask_ = 0; +}; + +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_DXBC_SHADER_H_ diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index a08cafd5e..865fbd77e 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -19,6 +19,7 @@ #include "xenia/base/assert.h" #include "xenia/base/cvar.h" #include "xenia/base/math.h" +#include "xenia/gpu/dxbc_shader.h" DEFINE_bool(dxbc_switch, true, "Use switch rather than if for flow control. Turning this off or " @@ -76,64 +77,31 @@ DxbcShaderTranslator::DxbcShaderTranslator(uint32_t vendor_id, } DxbcShaderTranslator::~DxbcShaderTranslator() = default; -std::vector DxbcShaderTranslator::ForceEarlyDepthStencil( - const uint8_t* shader) { - const uint32_t* old_shader = reinterpret_cast(shader); - - // To return something anyway even if patching fails. 
- std::vector new_shader; - uint32_t shader_size_bytes = old_shader[6]; - new_shader.resize(shader_size_bytes); - std::memcpy(new_shader.data(), shader, shader_size_bytes); - - // Find the SHEX chunk. - uint32_t chunk_count = old_shader[7]; - for (uint32_t i = 0; i < chunk_count; ++i) { - uint32_t chunk_offset_bytes = old_shader[8 + i]; - const uint32_t* chunk = old_shader + chunk_offset_bytes / sizeof(uint32_t); - if (chunk[0] != 'XEHS') { - continue; - } - // Find dcl_globalFlags and patch it. - uint32_t code_size_dwords = chunk[3]; - chunk += 4; - for (uint32_t j = 0; j < code_size_dwords;) { - uint32_t opcode_token = chunk[j]; - uint32_t opcode = DECODE_D3D10_SB_OPCODE_TYPE(opcode_token); - if (opcode == D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) { - opcode_token |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL; - std::memcpy(new_shader.data() + - (chunk_offset_bytes + (4 + j) * sizeof(uint32_t)), - &opcode_token, sizeof(uint32_t)); - // Recalculate the checksum since the shader was modified. 
- CalculateDXBCChecksum( - reinterpret_cast(new_shader.data()), - shader_size_bytes, - reinterpret_cast(new_shader.data() + - sizeof(uint32_t))); - break; - } - if (opcode == D3D10_SB_OPCODE_CUSTOMDATA) { - j += chunk[j + 1]; - } else { - j += DECODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(opcode_token); - } - } - break; - } - - return std::move(new_shader); -} - std::vector DxbcShaderTranslator::CreateDepthOnlyPixelShader() { - Reset(); + Reset(xenos::ShaderType::kPixel); is_depth_only_pixel_shader_ = true; StartTranslation(); return std::move(CompleteTranslation()); } -void DxbcShaderTranslator::Reset() { - ShaderTranslator::Reset(); +uint32_t DxbcShaderTranslator::GetDefaultModification( + xenos::ShaderType shader_type, + Shader::HostVertexShaderType host_vertex_shader_type) const { + Modification shader_modification; + switch (shader_type) { + case xenos::ShaderType::kVertex: + shader_modification.host_vertex_shader_type = host_vertex_shader_type; + break; + case xenos::ShaderType::kPixel: + shader_modification.depth_stencil_mode = + Modification::DepthStencilMode::kNoModifiers; + break; + } + return shader_modification.value; +} + +void DxbcShaderTranslator::Reset(xenos::ShaderType shader_type) { + ShaderTranslator::Reset(shader_type); shader_code_.clear(); @@ -152,7 +120,7 @@ void DxbcShaderTranslator::Reset() { in_domain_location_used_ = 0; in_primitive_id_used_ = false; in_control_point_index_used_ = false; - in_position_xy_used_ = false; + in_position_used_ = 0; in_front_face_used_ = false; system_temp_count_current_ = 0; @@ -457,7 +425,9 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { // Remember that x# are only accessible via mov load or store - use a // temporary variable if need to do any computations! 
- switch (host_vertex_shader_type()) { + Shader::HostVertexShaderType host_vertex_shader_type = + GetDxbcShaderModification().host_vertex_shader_type; + switch (host_vertex_shader_type) { case Shader::HostVertexShaderType::kVertex: StartVertexShader_LoadVertexIndex(); break; @@ -618,7 +588,7 @@ void DxbcShaderTranslator::StartVertexOrDomainShader() { default: // TODO(Triang3l): Support line and non-adaptive quad patches. - assert_unhandled_case(host_vertex_shader_type()); + assert_unhandled_case(host_vertex_shader_type); EmitTranslationError( "Unsupported host vertex shader type in StartVertexOrDomainShader"); break; @@ -720,7 +690,7 @@ void DxbcShaderTranslator::StartPixelShader() { // faceness as X sign bit. Using Z as scratch register now. if (edram_rov_used_) { // Get XY address of the current host pixel as float. - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpRoundZ(DxbcDest::R(param_gen_temp, 0b0011), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition))); // Revert resolution scale - after truncating, so if the pixel position @@ -744,7 +714,7 @@ void DxbcShaderTranslator::StartPixelShader() { } else { // Get XY address of the current SSAA sample by converting // SV_Position.xy to an integer. - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpFToU(DxbcDest::R(param_gen_temp, 0b0011), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition))); // Undo SSAA that is used instead of MSAA - since it's used as a @@ -870,7 +840,7 @@ void DxbcShaderTranslator::StartPixelShader() { void DxbcShaderTranslator::StartTranslation() { // Allocate global system temporary registers that may also be used in the // epilogue. 
- if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { system_temp_position_ = PushSystemTemp(0b1111); system_temp_point_size_edge_flag_kill_vertex_ = PushSystemTemp(0b0100); // Set the point size to a negative value to tell the geometry shader that @@ -879,20 +849,21 @@ void DxbcShaderTranslator::StartTranslation() { DxbcOpMov( DxbcDest::R(system_temp_point_size_edge_flag_kill_vertex_, 0b0001), DxbcSrc::LF(-1.0f)); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { if (edram_rov_used_) { // Will be initialized unconditionally. system_temp_rov_params_ = PushSystemTemp(); - if (ROV_IsDepthStencilEarly() || writes_depth()) { - // If the shader doesn't write to oDepth, each component will be written - // to if depth/stencil is enabled and the respective sample is covered - - // so need to initialize now because the first writes will be - // conditional. If the shader writes to oDepth, this is oDepth of the - // shader, written by the guest code, so initialize because assumptions - // can't be made about the integrity of the guest code. - system_temp_rov_depth_stencil_ = - PushSystemTemp(writes_depth() ? 0b0001 : 0b1111); - } + } + if (IsDepthStencilSystemTempUsed()) { + // If the shader doesn't write to oDepth, and ROV is used, each + // component will be written to if depth/stencil is enabled and the + // respective sample is covered - so need to initialize now because the + // first writes will be conditional. + // If the shader writes to oDepth, this is oDepth of the shader, written + // by the guest code, so initialize because assumptions can't be made + // about the integrity of the guest code. + system_temp_depth_stencil_ = + PushSystemTemp(writes_depth() ? 
0b0001 : 0b1111); } for (uint32_t i = 0; i < 4; ++i) { if (writes_color_target(i)) { @@ -942,7 +913,7 @@ void DxbcShaderTranslator::StartTranslation() { // Zero general-purpose registers to prevent crashes when the game // references them after only initializing them conditionally. - for (uint32_t i = IsDxbcPixelShader() ? xenos::kMaxInterpolators : 0; + for (uint32_t i = is_pixel_shader() ? xenos::kMaxInterpolators : 0; i < register_count(); ++i) { DxbcOpMov(uses_register_dynamic_addressing() ? DxbcDest::X(0, i) : DxbcDest::R(i), @@ -951,9 +922,9 @@ void DxbcShaderTranslator::StartTranslation() { } // Write stage-specific prologue. - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { StartVertexOrDomainShader(); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { StartPixelShader(); } @@ -1168,31 +1139,31 @@ void DxbcShaderTranslator::CompleteShaderCode() { } // Write stage-specific epilogue. - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { CompleteVertexOrDomainShader(); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { CompletePixelShader(); } // Return from `main`. DxbcOpRet(); - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { // Release system_temp_position_ and // system_temp_point_size_edge_flag_kill_vertex_. PopSystemTemp(2); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { // Release system_temps_color_. for (int32_t i = 3; i >= 0; --i) { if (writes_color_target(i)) { PopSystemTemp(); } } + if (IsDepthStencilSystemTempUsed()) { + // Release system_temp_depth_stencil_. + PopSystemTemp(); + } if (edram_rov_used_) { - if (ROV_IsDepthStencilEarly() || writes_depth()) { - // Release system_temp_rov_depth_stencil_. - PopSystemTemp(); - } // Release system_temp_rov_params_. 
PopSystemTemp(); } @@ -1303,6 +1274,44 @@ std::vector DxbcShaderTranslator::CompleteTranslation() { return shader_object_bytes; } +void DxbcShaderTranslator::PostTranslation( + Shader::Translation& translation, bool setup_shader_post_translation_info) { + if (setup_shader_post_translation_info) { + DxbcShader* dxbc_shader = dynamic_cast(&translation.shader()); + if (dxbc_shader) { + dxbc_shader->texture_bindings_.clear(); + dxbc_shader->texture_bindings_.reserve(texture_bindings_.size()); + dxbc_shader->used_texture_mask_ = 0; + for (const TextureBinding& translator_binding : texture_bindings_) { + DxbcShader::TextureBinding& shader_binding = + dxbc_shader->texture_bindings_.emplace_back(); + // For a stable hash. + std::memset(&shader_binding, 0, sizeof(shader_binding)); + shader_binding.bindless_descriptor_index = + translator_binding.bindless_descriptor_index; + shader_binding.fetch_constant = translator_binding.fetch_constant; + shader_binding.dimension = translator_binding.dimension; + shader_binding.is_signed = translator_binding.is_signed; + dxbc_shader->used_texture_mask_ |= 1u + << translator_binding.fetch_constant; + } + dxbc_shader->sampler_bindings_.clear(); + dxbc_shader->sampler_bindings_.reserve(sampler_bindings_.size()); + for (const SamplerBinding& translator_binding : sampler_bindings_) { + DxbcShader::SamplerBinding& shader_binding = + dxbc_shader->sampler_bindings_.emplace_back(); + shader_binding.bindless_descriptor_index = + translator_binding.bindless_descriptor_index; + shader_binding.fetch_constant = translator_binding.fetch_constant; + shader_binding.mag_filter = translator_binding.mag_filter; + shader_binding.min_filter = translator_binding.min_filter; + shader_binding.mip_filter = translator_binding.mip_filter; + shader_binding.aniso_filter = translator_binding.aniso_filter; + } + } + } +} + void DxbcShaderTranslator::EmitInstructionDisassembly() { if (!emit_source_map_) { return; @@ -1527,19 +1536,20 @@ void 
DxbcShaderTranslator::StoreResult(const InstructionResult& result, } break; case InstructionStorageTarget::kDepth: - // Writes X to scalar oDepth or to X of system_temp_rov_depth_stencil_, no + // Writes X to scalar oDepth or to X of system_temp_depth_stencil_, no // additional swizzling needed. assert_true(used_write_mask == 0b0001); assert_true(writes_depth()); - if (edram_rov_used_) { - dest = DxbcDest::R(system_temp_rov_depth_stencil_); + if (IsDepthStencilSystemTempUsed()) { + dest = DxbcDest::R(system_temp_depth_stencil_); } else { dest = DxbcDest::ODepth(); } - // Depth outside [0, 1] is not safe for use with the ROV code. Though 20e4 - // float depth can store values below 2, it's a very unusual case. - // Direct3D 10+ SV_Depth, however, can accept any values, including - // specials, when the depth buffer is floating-point. + // Depth outside [0, 1] is not safe for use with the ROV code and with + // 20e4-as-32 conversion. Though 20e4 float depth can store values between + // 1 and 2, it's a very unusual case. Direct3D 10+ SV_Depth, however, can + // accept any values, including specials, when the depth buffer is + // floating-point; but depth is clamped to the viewport bounds anyway. is_clamped = true; break; } @@ -2094,7 +2104,7 @@ void DxbcShaderTranslator::WriteResourceDefinitions() { // ds_5_1 shader_object_.push_back(0x44530501u); } else { - assert_true(IsDxbcPixelShader()); + assert_true(is_pixel_shader()); // ps_5_1 shader_object_.push_back(0xFFFF0501u); } @@ -2765,7 +2775,7 @@ void DxbcShaderTranslator::WriteInputSignature() { control_point_index.semantic_name = semantic_offset; } semantic_offset += AppendString(shader_object_, "XEVERTEXID"); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { // Written dynamically, so assume it's always used if it can be written to // any interpolator register. 
bool param_gen_used = !is_depth_only_pixel_shader_ && register_count() != 0; @@ -2843,7 +2853,7 @@ void DxbcShaderTranslator::WriteInputSignature() { position.component_type = DxbcSignatureRegisterComponentType::kFloat32; position.register_index = uint32_t(InOutRegister::kPSInPosition); position.mask = 0b1111; - position.always_reads_mask = in_position_xy_used_ ? 0b0011 : 0b0000; + position.always_reads_mask = in_position_used_; } // Is front face (SV_IsFrontFace). @@ -2927,7 +2937,9 @@ void DxbcShaderTranslator::WritePatchConstantSignature() { DxbcName tess_factor_edge_system_value = DxbcName::kUndefined; uint32_t tess_factor_inside_count = 0; DxbcName tess_factor_inside_system_value = DxbcName::kUndefined; - switch (host_vertex_shader_type()) { + Shader::HostVertexShaderType host_vertex_shader_type = + GetDxbcShaderModification().host_vertex_shader_type; + switch (host_vertex_shader_type) { case Shader::HostVertexShaderType::kTriangleDomainCPIndexed: case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed: tess_factor_edge_count = 3; @@ -2944,7 +2956,7 @@ void DxbcShaderTranslator::WritePatchConstantSignature() { break; default: // TODO(Triang3l): Support line patches. - assert_unhandled_case(host_vertex_shader_type()); + assert_unhandled_case(host_vertex_shader_type); EmitTranslationError( "Unsupported host vertex shader type in WritePatchConstantSignature"); } @@ -3033,7 +3045,7 @@ void DxbcShaderTranslator::WriteOutputSignature() { constexpr size_t kParameterDwords = sizeof(DxbcSignatureParameter) / sizeof(uint32_t); - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { // Intepolators (TEXCOORD#). 
size_t interpolator_position = shader_object_.size(); shader_object_.resize(shader_object_.size() + @@ -3195,7 +3207,7 @@ void DxbcShaderTranslator::WriteOutputSignature() { cull_distance.semantic_name = semantic_offset; } semantic_offset += AppendString(shader_object_, "SV_CullDistance"); - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { if (!edram_rov_used_) { // Color render targets (SV_Target#). size_t target_position = SIZE_MAX; @@ -3217,9 +3229,11 @@ void DxbcShaderTranslator::WriteOutputSignature() { } } - // Depth (SV_Depth). + // Depth (SV_Depth or SV_DepthLessEqual). + Modification::DepthStencilMode depth_stencil_mode = + GetDxbcShaderModification().depth_stencil_mode; size_t depth_position = SIZE_MAX; - if (writes_depth()) { + if (writes_depth() || DSV_IsWritingFloat24Depth()) { depth_position = shader_object_.size(); shader_object_.resize(shader_object_.size() + kParameterDwords); ++parameter_count; @@ -3253,7 +3267,15 @@ void DxbcShaderTranslator::WriteOutputSignature() { depth_position); depth.semantic_name = semantic_offset; } - semantic_offset += AppendString(shader_object_, "SV_Depth"); + const char* depth_semantic_name; + if (!writes_depth() && + GetDxbcShaderModification().depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Truncating) { + depth_semantic_name = "SV_DepthLessEqual"; + } else { + depth_semantic_name = "SV_Depth"; + } + semantic_offset += AppendString(shader_object_, depth_semantic_name); } } } @@ -3276,7 +3298,7 @@ void DxbcShaderTranslator::WriteShaderCode() { } else if (IsDxbcDomainShader()) { shader_type = D3D11_SB_DOMAIN_SHADER; } else { - assert_true(IsDxbcPixelShader()); + assert_true(is_pixel_shader()); shader_type = D3D10_SB_PIXEL_SHADER; } shader_object_.push_back( @@ -3296,12 +3318,14 @@ void DxbcShaderTranslator::WriteShaderCode() { // Inputs/outputs have 1D-indexed operands with a component mask and a // register index. 
+ Modification shader_modification = GetDxbcShaderModification(); + if (IsDxbcDomainShader()) { // Not using control point data since Xenos only has a vertex shader acting // as both vertex shader and domain shader. stat_.c_control_points = 3; stat_.tessellator_domain = DxbcTessellatorDomain::kTriangle; - switch (host_vertex_shader_type()) { + switch (shader_modification.host_vertex_shader_type) { case Shader::HostVertexShaderType::kTriangleDomainCPIndexed: case Shader::HostVertexShaderType::kTriangleDomainPatchIndexed: stat_.c_control_points = 3; @@ -3314,7 +3338,7 @@ void DxbcShaderTranslator::WriteShaderCode() { break; default: // TODO(Triang3l): Support line patches. - assert_unhandled_case(host_vertex_shader_type()); + assert_unhandled_case(shader_modification.host_vertex_shader_type); EmitTranslationError( "Unsupported host vertex shader type in WriteShaderCode"); } @@ -3330,11 +3354,17 @@ void DxbcShaderTranslator::WriteShaderCode() { } // Don't allow refactoring when converting to native code to maintain position - // invariance (needed even in pixel shaders for oDepth invariance). Also this - // dcl will be modified by ForceEarlyDepthStencil. - shader_object_.push_back( + // invariance (needed even in pixel shaders for oDepth invariance). + uint32_t global_flags_opcode = ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_GLOBAL_FLAGS) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1)); + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(1); + if (is_pixel_shader() && + GetDxbcShaderModification().depth_stencil_mode == + Modification::DepthStencilMode::kEarlyHint && + !edram_rov_used_ && CanWriteZEarly()) { + global_flags_opcode |= D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL; + } + shader_object_.push_back(global_flags_opcode); // Constant buffers, from most frequenly accessed to least frequently accessed // (the order is a hint to the driver according to the DXBC header). 
@@ -3560,7 +3590,7 @@ void DxbcShaderTranslator::WriteShaderCode() { } // Inputs and outputs. - if (IsDxbcVertexOrDomainShader()) { + if (is_vertex_shader()) { if (IsDxbcDomainShader()) { if (in_domain_location_used_) { // Domain location input. @@ -3584,7 +3614,7 @@ void DxbcShaderTranslator::WriteShaderCode() { if (in_control_point_index_used_) { // Control point indices as float input. uint32_t control_point_array_size; - switch (host_vertex_shader_type()) { + switch (shader_modification.host_vertex_shader_type) { case Shader::HostVertexShaderType::kTriangleDomainCPIndexed: control_point_array_size = 3; break; @@ -3593,7 +3623,7 @@ void DxbcShaderTranslator::WriteShaderCode() { break; default: // TODO(Triang3l): Support line patches. - assert_unhandled_case(host_vertex_shader_type()); + assert_unhandled_case(shader_modification.host_vertex_shader_type); EmitTranslationError( "Unsupported host vertex shader type in " "StartVertexOrDomainShader"); @@ -3683,7 +3713,8 @@ void DxbcShaderTranslator::WriteShaderCode() { uint32_t(InOutRegister::kVSDSOutClipDistance45AndCullDistance)); shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_CULL_DISTANCE)); ++stat_.dcl_count; - } else if (IsDxbcPixelShader()) { + } else if (is_pixel_shader()) { + bool is_writing_float24_depth = DSV_IsWritingFloat24Depth(); // Interpolator input. if (!is_depth_only_pixel_shader_) { uint32_t interpolator_count = @@ -3725,16 +3756,26 @@ void DxbcShaderTranslator::WriteShaderCode() { shader_object_.push_back(uint32_t(InOutRegister::kPSInClipSpaceZW)); ++stat_.dcl_count; } - if (in_position_xy_used_) { - // Position input (only XY needed for ps_param_gen, and the ROV depth code - // calculates the depth from clip space Z and W). + if (in_position_used_) { + // Position input (XY needed for ps_param_gen, Z needed for non-ROV + // float24 conversion; the ROV depth code calculates the depth from the + // clip space Z and W with pull-mode per-sample interpolation instead). 
+ // At the cost of possibility of MSAA with pixel-rate shading, need + // per-sample depth - otherwise intersections cannot be antialiased, and + // with SV_DepthLessEqual, per-sample (or centroid, but this isn't + // applicable here) position is mandatory. However, with depth output, on + // the guest, there's only one depth value for the whole pixel. + D3D10_SB_INTERPOLATION_MODE position_interpolation_mode = + is_writing_float24_depth && !writes_depth() + ? D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE_SAMPLE + : D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE; shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_INPUT_PS_SIV) | ENCODE_D3D10_SB_INPUT_INTERPOLATION_MODE( - D3D10_SB_INTERPOLATION_LINEAR_NOPERSPECTIVE) | + position_interpolation_mode) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(4)); - shader_object_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_INPUT, 0b0011, 1)); + shader_object_.push_back(EncodeVectorMaskedOperand( + D3D10_SB_OPERAND_TYPE_INPUT, in_position_used_, 1)); shader_object_.push_back(uint32_t(InOutRegister::kPSInPosition)); shader_object_.push_back(ENCODE_D3D10_SB_NAME(D3D10_SB_NAME_POSITION)); ++stat_.dcl_count; @@ -3778,12 +3819,19 @@ void DxbcShaderTranslator::WriteShaderCode() { } } // Depth output. 
- if (writes_depth()) { + if (is_writing_float24_depth || writes_depth()) { + D3D10_SB_OPERAND_TYPE depth_operand_type; + if (!writes_depth() && + GetDxbcShaderModification().depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Truncating) { + depth_operand_type = D3D11_SB_OPERAND_TYPE_OUTPUT_DEPTH_LESS_EQUAL; + } else { + depth_operand_type = D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH; + } shader_object_.push_back( ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DCL_OUTPUT) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(2)); - shader_object_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0)); + shader_object_.push_back(EncodeScalarOperand(depth_operand_type, 0)); ++stat_.dcl_count; } } diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 9edc40b56..2ca52e7f5 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -102,6 +102,51 @@ class DxbcShaderTranslator : public ShaderTranslator { bool edram_rov_used, bool force_emit_source_map = false); ~DxbcShaderTranslator() override; + union Modification { + // If anything in this structure is changed in a way not compatible with + // the previous layout, invalidate the pipeline storages by increasing this + // version number (0xYYYYMMDD)! + static constexpr uint32_t kVersion = 0x20201203; + + enum class DepthStencilMode : uint32_t { + kNoModifiers, + // [earlydepthstencil] - enable if alpha test and alpha to coverage are + // disabled; ignored if anything in the shader blocks early Z writing + // (which is not known before translation, so this will be set anyway). 
+ kEarlyHint, + // Converting the depth to the closest 32-bit float representable exactly + // as a 20e4 float, to support invariance in cases when the guest + // reuploads a previously resolved depth buffer to the EDRAM, rounding + // towards zero (which contradicts the rounding used by the Direct3D 9 + // reference rasterizer, but allows SV_DepthLessEqual to be used to allow + // slightly coarse early Z culling; also truncating regardless of whether + // the shader writes depth and thus always uses SV_Depth, for + // consistency). MSAA is limited - depth must be per-sample + // (SV_DepthLessEqual also explicitly requires sample or centroid position + // interpolation), thus the sampler has to run at sample frequency even if + // the device supports stencil loading and thus true non-ROV MSAA via + // SV_StencilRef. + // Fixed-function viewport depth bounds must be snapped to float24 for + // clamping purposes. + kFloat24Truncating, + // Similar to kFloat24Truncating, but rounding to the nearest even, + // however, always using SV_Depth rather than SV_DepthLessEqual because + // rounding up results in a bigger value. Same viewport usage rules apply. + kFloat24Rounding, + }; + + struct { + // VS - pipeline stage and input configuration. + Shader::HostVertexShaderType host_vertex_shader_type + : Shader::kHostVertexShaderTypeBitCount; + // PS, non-ROV - depth / stencil output mode. + DepthStencilMode depth_stencil_mode : 2; + }; + uint32_t value = 0; + + Modification(uint32_t modification_value = 0) : value(modification_value) {} + }; + // Constant buffer bindings in space 0. enum class CbufferRegister { kSystemConstants, @@ -238,15 +283,15 @@ class DxbcShaderTranslator : public ShaderTranslator { // EDRAM address calculation. uint32_t sample_count_log2[2]; float alpha_test_reference; + // If alpha to mask is disabled, the entire alpha_to_mask value must be 0. + // If alpha to mask is enabled, bits 0:7 are sample offsets, and bit 8 must + // be 1. 
uint32_t alpha_to_mask; float color_exp_bias[4]; uint32_t color_output_map[4]; - // If alpha to mask is disabled, the entire alpha_to_mask value must be 0. - // If alpha to mask is enabled, bits 0:7 are sample offsets, and bit 8 must - // be 1. uint32_t edram_resolution_square_scale; uint32_t edram_pitch_tiles; union { @@ -358,12 +403,6 @@ class DxbcShaderTranslator : public ShaderTranslator { bool is_signed; std::string name; }; - // The first binding returned is at t[SRVMainRegister::kBindfulTexturesStart] - // of space SRVSpace::kMain. - const TextureBinding* GetTextureBindings(uint32_t& count_out) const { - count_out = uint32_t(texture_bindings_.size()); - return texture_bindings_.data(); - } // Arbitrary limit - there can't be more than 2048 in a shader-visible // descriptor heap, though some older hardware (tier 1 resource binding - @@ -385,16 +424,6 @@ class DxbcShaderTranslator : public ShaderTranslator { xenos::AnisoFilter aniso_filter; std::string name; }; - const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const { - count_out = uint32_t(sampler_bindings_.size()); - return sampler_bindings_.data(); - } - - // Returns the number of texture SRV and sampler offsets that need to be - // passed via a constant buffer to the shader. - uint32_t GetBindlessResourceCount() const { - return uint32_t(texture_bindings_.size() + sampler_bindings_.size()); - } // Unordered access view bindings in space 0. enum class UAVRegister { @@ -402,10 +431,6 @@ class DxbcShaderTranslator : public ShaderTranslator { kEdram, }; - // Creates a copy of the shader with early depth/stencil testing forced, - // overriding that alpha testing is used in the shader. - static std::vector ForceEarlyDepthStencil(const uint8_t* shader); - // Returns the format with internal flags for passing via the // edram_rt_format_flags system constant. 
static constexpr uint32_t ROV_AddColorFormatFlags( @@ -440,16 +465,22 @@ class DxbcShaderTranslator : public ShaderTranslator { float& clamp_alpha_high, uint32_t& keep_mask_low, uint32_t& keep_mask_high); + uint32_t GetDefaultModification( + xenos::ShaderType shader_type, + Shader::HostVertexShaderType host_vertex_shader_type = + Shader::HostVertexShaderType::kVertex) const override; + // Creates a special pixel shader without color outputs - this resets the // state of the translator. std::vector CreateDepthOnlyPixelShader(); protected: - void Reset() override; + void Reset(xenos::ShaderType shader_type) override; void StartTranslation() override; - std::vector CompleteTranslation() override; + void PostTranslation(Shader::Translation& translation, + bool setup_shader_post_translation_info) override; void ProcessLabel(uint32_t cf_index) override; @@ -650,6 +681,7 @@ class DxbcShaderTranslator : public ShaderTranslator { kInputDomainPoint = 28, kUnorderedAccessView = 30, kInputCoverageMask = 35, + kOutputDepthLessEqual = 39, }; // D3D10_SB_OPERAND_INDEX_DIMENSION @@ -689,6 +721,7 @@ class DxbcShaderTranslator : public ShaderTranslator { return DxbcOperandDimension::kNoData; case DxbcOperandType::kInputPrimitiveID: case DxbcOperandType::kOutputDepth: + case DxbcOperandType::kOutputDepthLessEqual: return DxbcOperandDimension::kScalar; case DxbcOperandType::kInputCoverageMask: return dest_in_dcl ? 
DxbcOperandDimension::kScalar @@ -860,6 +893,9 @@ class DxbcShaderTranslator : public ShaderTranslator { return DxbcDest(DxbcOperandType::kUnorderedAccessView, write_mask, index_1d, index_2d); } + static DxbcDest ODepthLE() { + return DxbcDest(DxbcOperandType::kOutputDepthLessEqual, 0b0001); + } uint32_t GetMask() const { switch (GetDimension()) { @@ -2145,21 +2181,19 @@ class DxbcShaderTranslator : public ShaderTranslator { (index_representation_1 << 25) | (index_representation_2 << 28); } - // Use these instead of is_vertex_shader/is_pixel_shader because they don't - // take is_depth_only_pixel_shader_ into account. - inline bool IsDxbcVertexOrDomainShader() const { - return !is_depth_only_pixel_shader_ && is_vertex_shader(); + Modification GetDxbcShaderModification() const { + return Modification(modification()); } - inline bool IsDxbcVertexShader() const { - return IsDxbcVertexOrDomainShader() && - host_vertex_shader_type() == Shader::HostVertexShaderType::kVertex; + + bool IsDxbcVertexShader() const { + return is_vertex_shader() && + GetDxbcShaderModification().host_vertex_shader_type == + Shader::HostVertexShaderType::kVertex; } - inline bool IsDxbcDomainShader() const { - return IsDxbcVertexOrDomainShader() && - host_vertex_shader_type() != Shader::HostVertexShaderType::kVertex; - } - inline bool IsDxbcPixelShader() const { - return is_depth_only_pixel_shader_ || is_pixel_shader(); + bool IsDxbcDomainShader() const { + return is_vertex_shader() && + GetDxbcShaderModification().host_vertex_shader_type != + Shader::HostVertexShaderType::kVertex; } // Whether to use switch-case rather than if (pc >= label) for control flow. 
@@ -2181,10 +2215,37 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t piece_temp_component, uint32_t accumulator_temp, uint32_t accumulator_temp_component); + // Converts the depth value externally clamped to the representable [0, 2) + // range to 20e4 floating point, with zeros in bits 24:31, rounding to the + // nearest even. Source and destination may be the same, temporary must be + // different than both. + void PreClampedDepthTo20e4(uint32_t d24_temp, uint32_t d24_temp_component, + uint32_t d32_temp, uint32_t d32_temp_component, + uint32_t temp_temp, uint32_t temp_temp_component); + bool IsDepthStencilSystemTempUsed() const { + // See system_temp_depth_stencil_ documentation for explanation of cases. + if (edram_rov_used_) { + return writes_depth() || ROV_IsDepthStencilEarly(); + } + return writes_depth() && DSV_IsWritingFloat24Depth(); + } + // Whether the current non-ROV pixel shader should convert the depth to 20e4. + bool DSV_IsWritingFloat24Depth() const { + if (edram_rov_used_) { + return false; + } + Modification::DepthStencilMode depth_stencil_mode = + GetDxbcShaderModification().depth_stencil_mode; + return depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Truncating || + depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Rounding; + } // Whether it's possible and worth skipping running the translated shader for // 2x2 quads. bool ROV_IsDepthStencilEarly() const { - return !is_depth_only_pixel_shader_ && !writes_depth(); + return !is_depth_only_pixel_shader_ && !writes_depth() && + memexport_stream_constants().empty(); } // Converts the depth value to 24-bit (storing the result in bits 0:23 and // zeros in 24:31, not creating room for stencil - since this may be involved @@ -2197,8 +2258,8 @@ class DxbcShaderTranslator : public ShaderTranslator { // Does all the depth/stencil-related things, including or not including // writing based on whether it's late, or on whether it's safe to do it early. 
// Updates system_temp_rov_params_ result and coverage if allowed and safe, - // updates system_temp_rov_depth_stencil_, and if early and the coverage is - // empty for all pixels in the 2x2 quad and safe to return early (stencil is + // updates system_temp_depth_stencil_, and if early and the coverage is empty + // for all pixels in the 2x2 quad and safe to return early (stencil is // unchanged or known that it's safe not to await kills/alphatest/AtoC), // returns from the shader. void ROV_DepthStencilTest(); @@ -2248,6 +2309,7 @@ class DxbcShaderTranslator : public ShaderTranslator { // Discards the SSAA sample if it's masked out by alpha to coverage. void CompletePixelShader_WriteToRTVs_AlphaToMask(); void CompletePixelShader_WriteToRTVs(); + void CompletePixelShader_DSV_DepthTo24Bit(); // Masks the sample away from system_temp_rov_params_.x if it's not covered. // threshold_offset and temp.temp_component can be the same if needed. void CompletePixelShader_ROV_AlphaToMaskSample( @@ -2333,6 +2395,11 @@ class DxbcShaderTranslator : public ShaderTranslator { xenos::TextureFilter min_filter, xenos::TextureFilter mip_filter, xenos::AnisoFilter aniso_filter); + // Returns the number of texture SRV and sampler offsets that need to be + // passed via a constant buffer to the shader. + uint32_t GetBindlessResourceCount() const { + return uint32_t(texture_bindings_.size() + sampler_bindings_.size()); + } // Marks fetch constants as used by the DXBC shader and returns DxbcSrc // for the words 01 (pair 0), 23 (pair 1) or 45 (pair 2) of the texture fetch // constant. @@ -2364,7 +2431,7 @@ class DxbcShaderTranslator : public ShaderTranslator { static uint32_t AppendString(std::vector& dest, const char* source); // Returns the length of a string as if it was appended to a DWORD stream, in // bytes. 
- static inline uint32_t GetStringLength(const char* source) { + static uint32_t GetStringLength(const char* source) { return uint32_t(xe::align(std::strlen(source) + 1, sizeof(uint32_t))); } @@ -2479,8 +2546,8 @@ class DxbcShaderTranslator : public ShaderTranslator { bool in_primitive_id_used_; // Whether InOutRegister::kDSInControlPointIndex has been used in the shader. bool in_control_point_index_used_; - // Whether the XY of the pixel position has been used in the pixel shader. - bool in_position_xy_used_; + // Mask of the pixel/sample position actually used in the pixel shader. + uint32_t in_position_used_; // Whether the faceness has been used in the pixel shader. bool in_front_face_used_; @@ -2518,15 +2585,14 @@ class DxbcShaderTranslator : public ShaderTranslator { // W - Base-relative resolution-scaled EDRAM offset for 64bpp color data, in // dwords. uint32_t system_temp_rov_params_; - // ROV only - new depth/stencil data. 4 VGPRs when not writing to oDepth, 1 - // VGPR when writing to oDepth. Not used in the depth-only pixel shader (or, - // more formally, if neither early depth-stencil nor oDepth are used) because - // it always calculates and writes in the same place. - // When not writing to oDepth: New per-sample depth/stencil values, generated - // during early depth/stencil test (actual writing checks coverage bits). - // When writing to oDepth: X also used to hold the depth written by the - // shader, later used as a temporary during depth/stencil testing. - uint32_t system_temp_rov_depth_stencil_; + // Two purposes: + // - When writing to oDepth, and either using ROV or converting the depth to + // float24: X also used to hold the depth written by the shader, + // later used as a temporary during depth/stencil testing. + // - Otherwise, when using ROV output with ROV_IsDepthStencilEarly being true: + // New per-sample depth/stencil values, generated during early depth/stencil + // test (actual writing checks coverage bits). 
+ uint32_t system_temp_depth_stencil_; // Up to 4 color outputs in pixel shaders (because of exponent bias, alpha // test and remapping, and also for ROV writing). uint32_t system_temps_color_[4]; @@ -2587,6 +2653,8 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t srv_index_bindless_textures_3d_; uint32_t srv_index_bindless_textures_cube_; + // The first binding is at t[SRVMainRegister::kBindfulTexturesStart] of space + // SRVSpace::kMain. std::vector texture_bindings_; std::unordered_map texture_bindings_for_bindful_srv_indices_; diff --git a/src/xenia/gpu/dxbc_shader_translator_fetch.cc b/src/xenia/gpu/dxbc_shader_translator_fetch.cc index 76eed4d10..b4813b381 100644 --- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc +++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc @@ -677,7 +677,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( // Whether to use gradients (implicit or explicit) for LOD calculation. bool use_computed_lod = instr.attributes.use_computed_lod && - (IsDxbcPixelShader() || instr.attributes.use_register_gradients); + (is_pixel_shader() || instr.attributes.use_register_gradients); if (instr.opcode == FetchOpcode::kGetTextureComputedLod && (!use_computed_lod || instr.attributes.use_register_gradients)) { assert_always(); diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index d20cb11bf..5f3d47bc0 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -106,7 +106,7 @@ void DxbcShaderTranslator::ExportToMemory() { kSysConst_Flags_Vec) .Select(kSysConst_Flags_Comp), DxbcSrc::LU(kSysFlag_SharedMemoryIsUAV)); - if (IsDxbcPixelShader()) { + if (is_pixel_shader()) { // Disable memexport in pixel shaders with supersampling since VPOS is // ambiguous. 
if (edram_rov_used_) { diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index 24963008f..f3b964ae2 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -167,7 +167,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // bigger) to integer to system_temp_rov_params_.zw. // system_temp_rov_params_.z = X host pixel position as uint // system_temp_rov_params_.w = Y host pixel position as uint - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpFToU(DxbcDest::R(system_temp_rov_params_, 0b1100), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition), 0b01000000)); // Revert the resolution scale to convert the position to guest pixels. @@ -315,7 +315,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // Add host pixel offsets. // system_temp_rov_params_.y = scaled 32bpp depth/stencil address // system_temp_rov_params_.z = scaled 32bpp color offset if needed - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; for (uint32_t i = 0; i < 2; ++i) { // Convert a position component to integer. DxbcOpFToU(DxbcDest::R(system_temp_rov_params_, 0b0001), @@ -417,23 +417,50 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { // With early depth/stencil, depth/stencil writing may be deferred to the // end of the shader to prevent writing in case something (like alpha test, // which is dynamic GPU state) discards the pixel. So, write directly to the - // persistent register, system_temp_rov_depth_stencil_, instead of a local + // persistent register, system_temp_depth_stencil_, instead of a local // temporary register. DxbcDest sample_depth_stencil_dest( - depth_stencil_early - ? DxbcDest::R(system_temp_rov_depth_stencil_, 1 << i) - : temp_x_dest); + depth_stencil_early ? DxbcDest::R(system_temp_depth_stencil_, 1 << i) + : temp_x_dest); DxbcSrc sample_depth_stencil_src( - depth_stencil_early - ? 
DxbcSrc::R(system_temp_rov_depth_stencil_).Select(i) - : temp_x_src); + depth_stencil_early ? DxbcSrc::R(system_temp_depth_stencil_).Select(i) + : temp_x_src); if (!i) { if (writes_depth()) { + // Clamp oDepth to the lower viewport depth bound (depth clamp happens + // after the pixel shader in the pipeline, at least on Direct3D 11 and + // Vulkan, thus applies to the shader's depth output too). + system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index; + DxbcOpMax(DxbcDest::R(system_temp_depth_stencil_, 0b0001), + DxbcSrc::R(system_temp_depth_stencil_, DxbcSrc::kXXXX), + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_EdramDepthRange_Vec) + .Select(kSysConst_EdramDepthRangeOffset_Comp)); + // Calculate the upper Z range bound to temp.x for clamping after + // biasing. + // temp.x = viewport maximum depth + system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index; + DxbcOpAdd(temp_x_dest, + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_EdramDepthRange_Vec) + .Select(kSysConst_EdramDepthRangeOffset_Comp), + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_EdramDepthRange_Vec) + .Select(kSysConst_EdramDepthRangeScale_Comp)); + // Clamp oDepth to the upper viewport depth bound (already not above 1, + // but saturate for total safety). + // temp.x = free + DxbcOpMin(DxbcDest::R(system_temp_depth_stencil_, 0b0001), + DxbcSrc::R(system_temp_depth_stencil_, DxbcSrc::kXXXX), + temp_x_src, true); // Convert the shader-generated depth to 24-bit, using temp.x as // temporary. - ROV_DepthTo24Bit(system_temp_rov_depth_stencil_, 0, - system_temp_rov_depth_stencil_, 0, temp, 0); + ROV_DepthTo24Bit(system_temp_depth_stencil_, 0, + system_temp_depth_stencil_, 0, temp, 0); } else { // Load the first sample's Z*W and W to temp.xy - need this regardless // of coverage for polygon offset. 
@@ -529,14 +556,14 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { } // Get if the current sample is covered to temp.w. - // temp.x = first sample's viewport space Z or 24-bit oDepth + // temp.x = first sample's viewport space Z if not writing to oDepth // temp.y = polygon offset if not writing to oDepth // temp.z = viewport maximum depth if not writing to oDepth // temp.w = coverage of the current sample DxbcOpAnd(temp_w_dest, DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX), DxbcSrc::LU(1 << i)); // Check if the current sample is covered. Release 1 VGPR. - // temp.x = first sample's viewport space Z or 24-bit oDepth + // temp.x = first sample's viewport space Z if not writing to oDepth // temp.y = polygon offset if not writing to oDepth // temp.z = viewport maximum depth if not writing to oDepth // temp.w = free @@ -546,7 +573,7 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { // Copy the 24-bit depth common to all samples to sample_depth_stencil. // temp.x = shader-generated 24-bit depth DxbcOpMov(sample_depth_stencil_dest, - DxbcSrc::R(system_temp_rov_depth_stencil_, DxbcSrc::kXXXX)); + DxbcSrc::R(system_temp_depth_stencil_, DxbcSrc::kXXXX)); } else { if (i) { // Sample's depth precalculated for sample 0 (for slope-scaled depth @@ -1720,7 +1747,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs_AlphaToMask() { // Convert SSAA sample position to integer to temp.xy (not caring about the // resolution scale because it's not supported anywhere on the RTV output // path). 
- in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpFToU(DxbcDest::R(temp, 0b0011), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition))); @@ -1913,6 +1940,139 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToRTVs() { PopSystemTemp(2); } +void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() { + if (!DSV_IsWritingFloat24Depth()) { + return; + } + + uint32_t temp; + if (writes_depth()) { + // The depth is already written to system_temp_depth_stencil_.x and clamped + // to 0...1 with NaNs dropped (saturating in StoreResult); yzw are free. + temp = system_temp_depth_stencil_; + } else { + // Need a temporary variable; copy the sample's depth input to it and + // saturate it (in Direct3D 11, depth is clamped to the viewport bounds + // after the pixel shader, and SV_Position.z contains the unclamped depth, + // which may be outside the viewport's depth range if it's biased); though + // it will be clamped to the viewport bounds anyway, but to be able to make + // the assumption of it being clamped while working with the bit + // representation. + temp = PushSystemTemp(); + in_position_used_ |= 0b0100; + DxbcOpMov( + DxbcDest::R(temp, 0b0001), + DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition), DxbcSrc::kZZZZ), + true); + } + + DxbcDest temp_x_dest(DxbcDest::R(temp, 0b0001)); + DxbcSrc temp_x_src(DxbcSrc::R(temp, DxbcSrc::kXXXX)); + DxbcDest temp_y_dest(DxbcDest::R(temp, 0b0010)); + DxbcSrc temp_y_src(DxbcSrc::R(temp, DxbcSrc::kYYYY)); + + if (GetDxbcShaderModification().depth_stencil_mode == + Modification::DepthStencilMode::kFloat24Truncating) { + // Simplified conversion, always less than or equal to the original value - + // just drop the lower bits. + // The float32 exponent bias is 127. + // After saturating, the exponent range is -127...0. + // The smallest normalized 20e4 exponent is -14 - should drop 3 mantissa + // bits at -14 or above. 
+ // The smallest denormalized 20e4 number is -34 - should drop 23 mantissa + // bits at -34. + // Anything smaller than 2^-34 becomes 0. + DxbcDest truncate_dest(writes_depth() ? DxbcDest::ODepth() + : DxbcDest::ODepthLE()); + // Check if the number is representable as a float24 after truncation - the + // exponent is at least -34. + DxbcOpUGE(temp_y_dest, temp_x_src, DxbcSrc::LU(0x2E800000)); + DxbcOpIf(true, temp_y_src); + { + // Extract the biased float32 exponent to temp.y. + // temp.y = 113+ at exponent -14+. + // temp.y = 93 at exponent -34. + DxbcOpUBFE(temp_y_dest, DxbcSrc::LU(8), DxbcSrc::LU(23), temp_x_src); + // Convert exponent to the unclamped number of bits to truncate. + // 116 - 113 = 3. + // 116 - 93 = 23. + // temp.y = 3+ at exponent -14+. + // temp.y = 23 at exponent -34. + DxbcOpIAdd(temp_y_dest, DxbcSrc::LI(116), -temp_y_src); + // Clamp the truncated bit count to drop 3 bits of any normal number. + // Exponents below -34 are handled separately. + // temp.y = 3 at exponent -14. + // temp.y = 23 at exponent -34. + DxbcOpIMax(temp_y_dest, temp_y_src, DxbcSrc::LI(3)); + // Truncate the mantissa - fill the low bits with zeros. + DxbcOpBFI(truncate_dest, temp_y_src, DxbcSrc::LU(0), DxbcSrc::LU(0), + temp_x_src); + } + // The number is not representable as float24 after truncation - zero. + DxbcOpElse(); + DxbcOpMov(truncate_dest, DxbcSrc::LF(0.0f)); + // Close the non-zero result check. + DxbcOpEndIf(); + } else { + // Properly convert to 20e4, with rounding to the nearest even. + PreClampedDepthTo20e4(temp, 0, temp, 0, temp, 1); + // Convert back to float32. + // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + // Unpack the exponent to temp.y. + DxbcOpUShR(temp_y_dest, temp_x_src, DxbcSrc::LU(20)); + // Unpack the mantissa to temp.x. + DxbcOpAnd(temp_x_dest, temp_x_src, DxbcSrc::LU(0xFFFFF)); + // Check if the number is denormalized. 
+ DxbcOpIf(false, temp_y_src); + { + // Check if the number is non-zero (if the mantissa isn't zero - the + // exponent is known to be zero at this point). + DxbcOpIf(true, temp_x_src); + { + // Normalize the mantissa. + // Note that HLSL firstbithigh(x) is compiled to DXBC like: + // `x ? 31 - firstbit_hi(x) : -1` + // (returns the index from the LSB, not the MSB, but -1 for zero too). + // temp.y = firstbit_hi(mantissa) + DxbcOpFirstBitHi(temp_y_dest, temp_x_src); + // temp.y = 20 - firstbithigh(mantissa) + // Or: + // temp.y = 20 - (31 - firstbit_hi(mantissa)) + DxbcOpIAdd(temp_y_dest, temp_y_src, DxbcSrc::LI(20 - 31)); + // mantissa = mantissa << (20 - firstbithigh(mantissa)) + // AND 0xFFFFF not needed after this - BFI will do it. + DxbcOpIShL(temp_x_dest, temp_x_src, temp_y_src); + // Get the normalized exponent. + // exponent = 1 - (20 - firstbithigh(mantissa)) + DxbcOpIAdd(temp_y_dest, DxbcSrc::LI(1), -temp_y_src); + } + // The number is zero. + DxbcOpElse(); + { + // Set the unbiased exponent to -112 for zero - 112 will be added later, + // resulting in zero float32. + DxbcOpMov(temp_y_dest, DxbcSrc::LI(-112)); + } + // Close the non-zero check. + DxbcOpEndIf(); + } + // Close the denormal check. + DxbcOpEndIf(); + // Bias the exponent and move it to the correct location in float32 to + // temp.y. + DxbcOpIMAd(temp_y_dest, temp_y_src, DxbcSrc::LI(1 << 23), + DxbcSrc::LI(112 << 23)); + // Combine the mantissa and the exponent into the result. + DxbcOpBFI(DxbcDest::ODepth(), DxbcSrc::LU(20), DxbcSrc::LU(3), temp_x_src, + temp_y_src); + } + + if (!writes_depth()) { + // Release temp. + PopSystemTemp(); + } +} + void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMaskSample( uint32_t sample_index, float threshold_base, DxbcSrc threshold_offset, float threshold_offset_scale, uint32_t temp, uint32_t temp_component) { @@ -1957,7 +2117,7 @@ void DxbcShaderTranslator::CompletePixelShader_ROV_AlphaToMask() { // floating-point. 
With resolution scaling, still using host pixels, to // preserve the idea of dithering. // temp.x = alpha to coverage offset as float 0.0...3.0. - in_position_xy_used_ = true; + in_position_used_ |= 0b0011; DxbcOpFToU(DxbcDest::R(temp, 0b0011), DxbcSrc::V(uint32_t(InOutRegister::kPSInPosition))); DxbcOpAnd(DxbcDest::R(temp, 0b0010), DxbcSrc::R(temp, DxbcSrc::kYYYY), @@ -2067,7 +2227,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { DxbcOpStoreUAVTyped( DxbcDest::U(uav_index_edram_, uint32_t(UAVRegister::kEdram)), DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kYYYY), 1, - DxbcSrc::R(system_temp_rov_depth_stencil_).Select(i)); + DxbcSrc::R(system_temp_depth_stencil_).Select(i)); } // Close the write check. DxbcOpEndIf(); @@ -3059,15 +3219,16 @@ void DxbcShaderTranslator::CompletePixelShader() { CompletePixelShader_WriteToROV(); } else { CompletePixelShader_WriteToRTVs(); + CompletePixelShader_DSV_DepthTo24Bit(); } } -void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp, - uint32_t d24_temp_component, - uint32_t d32_temp, - uint32_t d32_temp_component, - uint32_t temp_temp, - uint32_t temp_temp_component) { +void DxbcShaderTranslator::PreClampedDepthTo20e4(uint32_t d24_temp, + uint32_t d24_temp_component, + uint32_t d32_temp, + uint32_t d32_temp_component, + uint32_t temp_temp, + uint32_t temp_temp_component) { assert_true(temp_temp != d24_temp || temp_temp_component != d24_temp_component); assert_true(temp_temp != d32_temp || @@ -3079,68 +3240,83 @@ void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp, DxbcDest temp_dest(DxbcDest::R(temp_temp, 1 << temp_temp_component)); DxbcSrc temp_src(DxbcSrc::R(temp_temp).Select(temp_temp_component)); + // CFloat24 from d3dref9.dll. + // Assuming the depth is already clamped to [0, 2) (in all places, the depth + // is written with the saturate flag set). + + // Check if the number is too small to be represented as normalized 20e4. 
+ // temp = f32 < 2^-14 + DxbcOpULT(temp_dest, d32_src, DxbcSrc::LU(0x38800000)); + // Handle denormalized numbers separately. + DxbcOpIf(true, temp_src); + { + // temp = f32 >> 23 + DxbcOpUShR(temp_dest, d32_src, DxbcSrc::LU(23)); + // temp = 113 - (f32 >> 23) + DxbcOpIAdd(temp_dest, DxbcSrc::LI(113), -temp_src); + // Don't allow the shift to overflow, since in DXBC the lower 5 bits of the + // shift amount are used (otherwise 0 becomes 8). + // temp = min(113 - (f32 >> 23), 24) + DxbcOpUMin(temp_dest, temp_src, DxbcSrc::LU(24)); + // biased_f32 = (f32 & 0x7FFFFF) | 0x800000 + DxbcOpBFI(d24_dest, DxbcSrc::LU(9), DxbcSrc::LU(23), DxbcSrc::LU(1), + d32_src); + // biased_f32 = ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24) + DxbcOpUShR(d24_dest, d24_src, temp_src); + } + // Not denormalized? + DxbcOpElse(); + { + // Bias the exponent. + // biased_f32 = f32 + (-112 << 23) + // (left shift of a negative value is undefined behavior) + DxbcOpIAdd(d24_dest, d32_src, DxbcSrc::LU(0xC8000000u)); + } + // Close the denormal check. + DxbcOpEndIf(); + // Build the 20e4 number. + // temp = (biased_f32 >> 3) & 1 + DxbcOpUBFE(temp_dest, DxbcSrc::LU(1), DxbcSrc::LU(3), d24_src); + // f24 = biased_f32 + 3 + DxbcOpIAdd(d24_dest, d24_src, DxbcSrc::LU(3)); + // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1) + DxbcOpIAdd(d24_dest, d24_src, temp_src); + // f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF + DxbcOpUBFE(d24_dest, DxbcSrc::LU(24), DxbcSrc::LU(3), d24_src); +} + +void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp, + uint32_t d24_temp_component, + uint32_t d32_temp, + uint32_t d32_temp_component, + uint32_t temp_temp, + uint32_t temp_temp_component) { + assert_true(temp_temp != d32_temp || + temp_temp_component != d32_temp_component); + // Source and destination may be the same. 
+ system_constants_used_ |= 1ull << kSysConst_Flags_Index; - DxbcOpAnd(temp_dest, + DxbcOpAnd(DxbcDest::R(temp_temp, 1 << temp_temp_component), DxbcSrc::CB(cbuffer_index_system_constants_, uint32_t(CbufferRegister::kSystemConstants), kSysConst_Flags_Vec) .Select(kSysConst_Flags_Comp), DxbcSrc::LU(kSysFlag_ROVDepthFloat24)); // Convert according to the format. - DxbcOpIf(true, temp_src); + DxbcOpIf(true, DxbcSrc::R(temp_temp).Select(temp_temp_component)); { - // 20e4 conversion, using 1 VGPR. - // CFloat24 from d3dref9.dll. - // Assuming the depth is already clamped to [0, 2) (in all places, the depth - // is written with the saturate flag set). - - // Check if the number is too small to be represented as normalized 20e4. - // temp = f32 < 2^-14 - DxbcOpULT(temp_dest, d32_src, DxbcSrc::LU(0x38800000)); - // Handle denormalized numbers separately. - DxbcOpIf(true, temp_src); - { - // temp = f32 >> 23 - DxbcOpUShR(temp_dest, d32_src, DxbcSrc::LU(23)); - // temp = 113 - (f32 >> 23) - DxbcOpIAdd(temp_dest, DxbcSrc::LI(113), -temp_src); - // Don't allow the shift to overflow, since in DXBC the lower 5 bits of - // the shift amount are used (otherwise 0 becomes 8). - // temp = min(113 - (f32 >> 23), 24) - DxbcOpUMin(temp_dest, temp_src, DxbcSrc::LU(24)); - // biased_f32 = (f32 & 0x7FFFFF) | 0x800000 - DxbcOpBFI(d24_dest, DxbcSrc::LU(9), DxbcSrc::LU(23), DxbcSrc::LU(1), - d32_src); - // biased_f32 = - // ((f32 & 0x7FFFFF) | 0x800000) >> min(113 - (f32 >> 23), 24) - DxbcOpUShR(d24_dest, d24_src, temp_src); - } - // Not denormalized? - DxbcOpElse(); - { - // Bias the exponent. - // biased_f32 = f32 + (-112 << 23) - // (left shift of a negative value is undefined behavior) - DxbcOpIAdd(d24_dest, d32_src, DxbcSrc::LU(0xC8000000u)); - } - // Close the denormal check. - DxbcOpEndIf(); - // Build the 20e4 number. 
- // temp = (biased_f32 >> 3) & 1 - DxbcOpUBFE(temp_dest, DxbcSrc::LU(1), DxbcSrc::LU(3), d24_src); - // f24 = biased_f32 + 3 - DxbcOpIAdd(d24_dest, d24_src, DxbcSrc::LU(3)); - // f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1) - DxbcOpIAdd(d24_dest, d24_src, temp_src); - // f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF - DxbcOpUBFE(d24_dest, DxbcSrc::LU(24), DxbcSrc::LU(3), d24_src); + // 20e4 conversion. + PreClampedDepthTo20e4(d24_temp, d24_temp_component, d32_temp, + d32_temp_component, temp_temp, temp_temp_component); } DxbcOpElse(); { // Unorm24 conversion. - + DxbcDest d24_dest(DxbcDest::R(d24_temp, 1 << d24_temp_component)); + DxbcSrc d24_src(DxbcSrc::R(d24_temp).Select(d24_temp_component)); // Multiply by float(0xFFFFFF). - DxbcOpMul(d24_dest, d32_src, DxbcSrc::LF(16777215.0f)); + DxbcOpMul(d24_dest, DxbcSrc::R(d32_temp).Select(d32_temp_component), + DxbcSrc::LF(16777215.0f)); // Round to the nearest even integer. This seems to be the correct way: // rounding towards zero gives 0xFF instead of 0x100 in clear shaders in, // for instance, Halo 3, but other clear shaders in it are also broken if diff --git a/src/xenia/gpu/gpu_flags.cc b/src/xenia/gpu/gpu_flags.cc index 5f73fd3c2..07eff0bc8 100644 --- a/src/xenia/gpu/gpu_flags.cc +++ b/src/xenia/gpu/gpu_flags.cc @@ -40,9 +40,63 @@ DEFINE_bool( "be fully covered when MSAA is used with fullscreen passes.", "GPU"); +DEFINE_string( + depth_float24_conversion, "", + "Method for converting 32-bit Z values to 20e4 floating point when using " + "host depth buffers without native 20e4 support (when not using rasterizer-" + "ordered views / fragment shader interlocks to perform depth testing " + "manually).\n" + "Use: [any, on_copy, truncate, round]\n" + " on_copy:\n" + " Do depth testing at host precision, converting when copying between " + "host depth buffers and the EDRAM buffer to support reinterpretation, " + "maintaining two copies, in both host and 20e4 formats, for reloading data " + "to 
host depth buffers when it wasn't overwritten.\n" + " + Highest performance, allows early depth test and writing.\n" + " + Host MSAA is possible with pixel-rate shading where supported.\n" + " - EDRAM > RAM > EDRAM depth buffer round trip done in certain games " + "(such as GTA IV) destroys precision irreparably, causing artifacts if " + "another rendering pass is done after the EDRAM reupload.\n" + " truncate:\n" + " Convert to 20e4 directly in pixel shaders, always rounding down.\n" + " + Good performance, conservative early depth test is possible.\n" + " + No precision loss when anything changes in the storage of the depth " + "buffer, EDRAM > RAM > EDRAM copying preserves precision.\n" + " - Rounding mode is incorrect, sometimes giving results smaller than " + "they should be - may cause inaccuracy especially in edge cases when the " + "game wants to write an exact value.\n" + " - Host MSAA is only possible at SSAA speed, with per-sample shading.\n" + " round:\n" + " Convert to 20e4 directly in pixel shaders, correctly rounding to the " + "nearest even.\n" + " + Highest accuracy.\n" + " - Significantly limited performance, early depth test is not possible.\n" + " - Host MSAA is only possible at SSAA speed, with per-sample shading.\n" + " Any other value:\n" + " Choose what is considered the most optimal (currently \"on_copy\").", + "GPU"); + DEFINE_int32(query_occlusion_fake_sample_count, 1000, "If set to -1 no sample counts are written, games may hang. Else, " "the sample count of every tile will be incremented on every " "EVENT_WRITE_ZPD by this number. 
Setting this to 0 means " "everything is reported as occluded.", "GPU"); + +namespace xe { +namespace gpu { +namespace flags { + +DepthFloat24Conversion GetDepthFloat24Conversion() { + if (cvars::depth_float24_conversion == "truncate") { + return DepthFloat24Conversion::kOnOutputTruncating; + } + if (cvars::depth_float24_conversion == "round") { + return DepthFloat24Conversion::kOnOutputRounding; + } + return DepthFloat24Conversion::kOnCopy; +} + +} // namespace flags +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/gpu_flags.h b/src/xenia/gpu/gpu_flags.h index 5ae64b76e..2405dc23c 100644 --- a/src/xenia/gpu/gpu_flags.h +++ b/src/xenia/gpu/gpu_flags.h @@ -22,6 +22,69 @@ DECLARE_bool(gpu_allow_invalid_fetch_constants); DECLARE_bool(half_pixel_offset); +DECLARE_string(depth_float24_conversion); + DECLARE_int32(query_occlusion_fake_sample_count); +namespace xe { +namespace gpu { +namespace flags { + +enum class DepthFloat24Conversion { + // Doing depth test at the host precision, converting to 20e4 to support + // reinterpretation, but keeping a separate EDRAM view containing depth values + // in the host format. When copying from the EDRAM buffer to host depth + // buffers, writing the stored host pixel if stored_f24 == to_f24(stored_host) + // (otherwise it was overwritten by something else, like clearing, or a color + // buffer; this is inexact though, and will incorrectly load pixels that were + // overwritten by something else in the EDRAM, but turned out to have the same + // value on the guest as before - an outdated host-precision value will be + // loaded in these cases instead). + // + // EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM round + // trip destroys precision beyond repair. + // + // Full host early Z and MSAA with pixel-rate shading are supported. 
+ kOnCopy, + // Converting the depth to the closest host value representable exactly as a + // 20e4 float in pixel shaders, to support invariance in cases when the guest + // reuploads a previously resolved depth buffer to the EDRAM, rounding towards + // zero (which contradicts the rounding used by the Direct3D 9 reference + // rasterizer, but allows less-than-or-equal pixel shader depth output to be + // used to preserve most of early Z culling when the game is using reversed + // depth, which is the usual way of doing depth testing on the Xbox 360 and of + // utilizing the advantages of a floating-point encoding). + // + // With MSAA, pixel shaders must run at sample frequency - otherwise, if the + // depth is the same for the entire pixel, intersections of polygons cannot be + // antialiased. + // + // Important usage note: When using this mode, bounds of the fixed-function + // viewport must be converted to and back from float24 too (preferably using + // correct rounding to the nearest even, to reduce the error already caused by + // truncation rather than to amplify it). This ensures that clamping to the + // viewport bounds, which happens after the pixel shader even if it overwrites + // the resulting depth, is never done to a value not representable as float24 + // (for example, if the minimum Z is a number too small to be represented as + // float24, but not zero, it won't be possible to write what should become + // 0x000000 to the depth buffer). Note that this may add some error to the + // depth values from the rasterizer; however, modifying Z in the vertex shader + // to make interpolated depth values would cause clipping to be done to + // different bounds, which may be more undesirable, especially in cases when Z + // is explicitly set to a value like 0 or W (in such cases, the adjusted + // polygon may go outside 0...W in clip space and disappear). 
+ kOnOutputTruncating, + // Similar to kOnOutputTruncating, but rounding to the nearest even, more + // correctly, however, because the resulting depth can be bigger than the + // original host value, early depth testing can't be used at all. Same + // viewport usage rules apply. + kOnOutputRounding, +}; + +DepthFloat24Conversion GetDepthFloat24Conversion(); + +} // namespace flags +} // namespace gpu +} // namespace xe + #endif // XENIA_GPU_GPU_FLAGS_H_ diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index 04bc8024b..de327869c 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -276,8 +276,7 @@ void GraphicsSystem::ClearCaches() { } void GraphicsSystem::InitializeShaderStorage( - const std::filesystem::path& storage_root, uint32_t title_id, - bool blocking) { + const std::filesystem::path& cache_root, uint32_t title_id, bool blocking) { if (!cvars::store_shaders) { return; } @@ -285,21 +284,18 @@ void GraphicsSystem::InitializeShaderStorage( if (command_processor_->is_paused()) { // Safe to run on any thread while the command processor is paused, no // race condition. 
- command_processor_->InitializeShaderStorage(storage_root, title_id, true); + command_processor_->InitializeShaderStorage(cache_root, title_id, true); } else { xe::threading::Fence fence; - command_processor_->CallInThread( - [this, storage_root, title_id, &fence]() { - command_processor_->InitializeShaderStorage(storage_root, title_id, - true); - fence.Signal(); - }); + command_processor_->CallInThread([this, cache_root, title_id, &fence]() { + command_processor_->InitializeShaderStorage(cache_root, title_id, true); + fence.Signal(); + }); fence.Wait(); } } else { - command_processor_->CallInThread([this, storage_root, title_id]() { - command_processor_->InitializeShaderStorage(storage_root, title_id, - false); + command_processor_->CallInThread([this, cache_root, title_id]() { + command_processor_->InitializeShaderStorage(cache_root, title_id, false); }); } } diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h index 47a4d3f7b..148206af2 100644 --- a/src/xenia/gpu/graphics_system.h +++ b/src/xenia/gpu/graphics_system.h @@ -63,7 +63,7 @@ class GraphicsSystem { virtual void ClearCaches(); - void InitializeShaderStorage(const std::filesystem::path& storage_root, + void InitializeShaderStorage(const std::filesystem::path& cache_root, uint32_t title_id, bool blocking); void RequestFrameTrace(); diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc index 931b728da..6df03fb81 100644 --- a/src/xenia/gpu/shader.cc +++ b/src/xenia/gpu/shader.cc @@ -31,9 +31,13 @@ Shader::Shader(xenos::ShaderType shader_type, uint64_t ucode_data_hash, xe::copy_and_swap(ucode_data_.data(), ucode_dwords, ucode_dword_count); } -Shader::~Shader() = default; +Shader::~Shader() { + for (auto it : translations_) { + delete it.second; + } +} -std::string Shader::GetTranslatedBinaryString() const { +std::string Shader::Translation::GetTranslatedBinaryString() const { std::string result; result.resize(translated_binary_.size()); 
std::memcpy(const_cast(result.data()), translated_binary_.data(), @@ -41,36 +45,24 @@ std::string Shader::GetTranslatedBinaryString() const { return result; } -std::pair Shader::Dump( +std::filesystem::path Shader::Translation::Dump( const std::filesystem::path& base_path, const char* path_prefix) { + std::filesystem::path path = base_path; // Ensure target path exists. - auto target_path = base_path; - if (!target_path.empty()) { - target_path = std::filesystem::absolute(target_path); - std::filesystem::create_directories(target_path); + if (!path.empty()) { + path = std::filesystem::absolute(path); + std::filesystem::create_directories(path); } - - auto base_name = - fmt::format("shader_{}_{:016X}", path_prefix, ucode_data_hash_); - - std::string txt_name, bin_name; - if (shader_type_ == xenos::ShaderType::kVertex) { - txt_name = base_name + ".vert"; - bin_name = base_name + ".bin.vert"; - } else { - txt_name = base_name + ".frag"; - bin_name = base_name + ".bin.frag"; - } - - std::filesystem::path txt_path, bin_path; - txt_path = base_path / txt_name; - bin_path = base_path / bin_name; - - FILE* f = filesystem::OpenFile(txt_path, "wb"); + path = path / + fmt::format( + "shader_{:016X}_{:08X}.{}.{}", shader().ucode_data_hash(), + modification(), path_prefix, + shader().type() == xenos::ShaderType::kVertex ? 
"vert" : "frag"); + FILE* f = filesystem::OpenFile(path, "wb"); if (f) { fwrite(translated_binary_.data(), 1, translated_binary_.size(), f); fprintf(f, "\n\n"); - auto ucode_disasm_ptr = ucode_disassembly().c_str(); + auto ucode_disasm_ptr = shader().ucode_disassembly().c_str(); while (*ucode_disasm_ptr) { auto line_end = std::strchr(ucode_disasm_ptr, '\n'); fprintf(f, "// "); @@ -83,14 +75,58 @@ std::pair Shader::Dump( } fclose(f); } + return std::move(path); +} - f = filesystem::OpenFile(bin_path, "wb"); +Shader::Translation* Shader::GetOrCreateTranslation(uint32_t modification, + bool* is_new) { + auto it = translations_.find(modification); + if (it != translations_.end()) { + if (is_new) { + *is_new = false; + } + return it->second; + } + Translation* translation = CreateTranslationInstance(modification); + translations_.emplace(modification, translation); + if (is_new) { + *is_new = true; + } + return translation; +} + +void Shader::DestroyTranslation(uint32_t modification) { + auto it = translations_.find(modification); + if (it == translations_.end()) { + return; + } + delete it->second; + translations_.erase(it); +} + +std::filesystem::path Shader::DumpUcodeBinary( + const std::filesystem::path& base_path) { + // Ensure target path exists. + std::filesystem::path path = base_path; + if (!path.empty()) { + path = std::filesystem::absolute(path); + std::filesystem::create_directories(path); + } + path = path / + fmt::format("shader_{:016X}.ucode.bin.{}", ucode_data_hash(), + type() == xenos::ShaderType::kVertex ? "vert" : "frag"); + + FILE* f = filesystem::OpenFile(path, "wb"); if (f) { - fwrite(ucode_data_.data(), 4, ucode_data_.size(), f); + fwrite(ucode_data().data(), 4, ucode_data().size(), f); fclose(f); } + return std::move(path); +} - return {std::move(txt_path), std::move(bin_path)}; +Shader::Translation* Shader::CreateTranslationInstance(uint32_t modification) { + // Default implementation for simple cases like ucode disassembly. 
+ return new Translation(*this, modification); } } // namespace gpu diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 23998c307..e533ba9b8 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -11,8 +11,12 @@ #define XENIA_GPU_SHADER_H_ #include +#include +#include #include #include +#include +#include #include #include "xenia/base/math.h" @@ -591,6 +595,8 @@ struct ParsedAluInstruction { class Shader { public: + // Type of the vertex shader in a D3D11-like rendering pipeline - shader + // interface depends on in, so it must be known at translation time. // If values are changed, INVALIDATE SHADER STORAGES (increase their version // constexpr) where those are stored! And check bit count where this is // packed. This is : uint32_t for simplicity of packing in bit fields. @@ -603,6 +609,8 @@ class Shader { kQuadDomainCPIndexed, kQuadDomainPatchIndexed, }; + // For packing HostVertexShaderType in bit fields. + static constexpr uint32_t kHostVertexShaderTypeBitCount = 3; struct Error { bool is_fatal = false; @@ -683,6 +691,67 @@ class Shader { } }; + class Translation { + public: + virtual ~Translation() {} + + Shader& shader() const { return shader_; } + + // Translator-specific modification bits. + uint32_t modification() const { return modification_; } + + // True if the shader was translated and prepared without error. + bool is_valid() const { return is_valid_; } + + // True if the shader has already been translated. + bool is_translated() const { return is_translated_; } + + // Errors that occurred during translation. + const std::vector& errors() const { return errors_; } + + // Translated shader binary (or text). + const std::vector& translated_binary() const { + return translated_binary_; + } + + // Gets the translated shader binary as a string. + // This is only valid if it is actually text. + std::string GetTranslatedBinaryString() const; + + // Disassembly of the translated from the host graphics layer. 
+ // May be empty if the host does not support disassembly. + const std::string& host_disassembly() const { return host_disassembly_; } + + // In case disassembly depends on the GPU backend, for setting it + // externally. + void set_host_disassembly(std::string disassembly) { + host_disassembly_ = std::move(disassembly); + } + + // For dumping after translation. Dumps the shader's disassembled microcode, + // translated code, and, if available, translated disassembly, to a file in + // the given path based on ucode hash. Returns the name of the written file. + std::filesystem::path Dump(const std::filesystem::path& base_path, + const char* path_prefix); + + protected: + Translation(Shader& shader, uint32_t modification) + : shader_(shader), modification_(modification) {} + + private: + friend class Shader; + friend class ShaderTranslator; + + Shader& shader_; + uint32_t modification_; + + bool is_valid_ = false; + bool is_translated_ = false; + std::vector errors_; + std::vector translated_binary_; + std::string host_disassembly_; + }; + Shader(xenos::ShaderType shader_type, uint64_t ucode_data_hash, const uint32_t* ucode_dwords, size_t ucode_dword_count); virtual ~Shader(); @@ -690,19 +759,30 @@ class Shader { // Whether the shader is identified as a vertex or pixel shader. xenos::ShaderType type() const { return shader_type_; } - // If this is a vertex shader, and it has been translated, type of the shader - // in a D3D11-like rendering pipeline - shader interface depends on in, so it - // must be known at translation time. - HostVertexShaderType host_vertex_shader_type() const { - return host_vertex_shader_type_; - } - // Microcode dwords in host endianness. 
const std::vector& ucode_data() const { return ucode_data_; } uint64_t ucode_data_hash() const { return ucode_data_hash_; } const uint32_t* ucode_dwords() const { return ucode_data_.data(); } size_t ucode_dword_count() const { return ucode_data_.size(); } + // Host translations with the specified modification bits. Not thread-safe + // with respect to translation creation/destruction. + const std::unordered_map& translations() const { + return translations_; + } + Translation* GetTranslation(uint32_t modification) const { + auto it = translations_.find(modification); + if (it != translations_.cend()) { + return it->second; + } + return nullptr; + } + Translation* GetOrCreateTranslation(uint32_t modification, + bool* is_new = nullptr); + // For shader storage loading, to remove a modification in case of translation + // failure. Not thread-safe. + void DestroyTranslation(uint32_t modification); + // All vertex bindings used in the shader. // Valid for vertex shaders only. const std::vector& vertex_bindings() const { @@ -733,73 +813,55 @@ class Shader { // True if the shader overrides the pixel depth. bool writes_depth() const { return writes_depth_; } - // True if Xenia can automatically enable early depth/stencil for the pixel - // shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha - // testing and alpha to coverage are disabled. - bool implicit_early_z_allowed() const { return implicit_early_z_allowed_; } - - // True if the shader was translated and prepared without error. - bool is_valid() const { return is_valid_; } - - // True if the shader has already been translated. - bool is_translated() const { return is_translated_; } - - // Errors that occurred during translation. - const std::vector& errors() const { return errors_; } + // True if the current shader has any `kill` instructions. + bool kills_pixels() const { return kills_pixels_; } // Microcode disassembly in D3D format. 
const std::string& ucode_disassembly() const { return ucode_disassembly_; } - // Translated shader binary (or text). - const std::vector& translated_binary() const { - return translated_binary_; + // An externally managed identifier of the shader storage the microcode of the + // shader was last written to, or was loaded from, to only write the shader + // microcode to the storage once. UINT32_MAX by default. + uint32_t ucode_storage_index() const { return ucode_storage_index_; } + void set_ucode_storage_index(uint32_t storage_index) { + ucode_storage_index_ = storage_index; } - // Gets the translated shader binary as a string. - // This is only valid if it is actually text. - std::string GetTranslatedBinaryString() const; - - // Disassembly of the translated from the host graphics layer. - // May be empty if the host does not support disassembly. - const std::string& host_disassembly() const { return host_disassembly_; } - // A lot of errors that occurred during preparation of the host shader. - const std::string& host_error_log() const { return host_error_log_; } - // Host binary that can be saved and reused across runs. - // May be empty if the host does not support saving binaries. - const std::vector& host_binary() const { return host_binary_; } - - // Dumps the shader to a file in the given path based on ucode hash. - // Both the ucode binary and disassembled and translated shader will be - // written. - // Returns the filename of the shader and the binary. - std::pair Dump( - const std::filesystem::path& base_path, const char* path_prefix); + // Dumps the shader's microcode binary to a file in the given path based on + // ucode hash. Returns the name of the written file. Can be called at any + // time, doesn't require the shader to be translated. 
+ std::filesystem::path DumpUcodeBinary(const std::filesystem::path& base_path); protected: friend class ShaderTranslator; + virtual Translation* CreateTranslationInstance(uint32_t modification); + xenos::ShaderType shader_type_; - HostVertexShaderType host_vertex_shader_type_ = HostVertexShaderType::kVertex; std::vector ucode_data_; uint64_t ucode_data_hash_; + // Modification bits -> translation. + std::unordered_map translations_; + + // Whether setup of the post-translation parameters (listed below, plus those + // specific to the implementation) has been initiated, by any thread. If + // translation is performed on multiple threads, only one thread must be + // setting this up (other threads would write the same data anyway). + std::atomic_flag post_translation_info_set_up_ = ATOMIC_FLAG_INIT; + + // Initialized after the first successful translation (these don't depend on + // the host-side modification bits). + std::string ucode_disassembly_; std::vector vertex_bindings_; std::vector texture_bindings_; ConstantRegisterMap constant_register_map_ = {0}; bool writes_color_targets_[4] = {false, false, false, false}; bool writes_depth_ = false; - bool implicit_early_z_allowed_ = true; + bool kills_pixels_ = false; std::vector memexport_stream_constants_; - bool is_valid_ = false; - bool is_translated_ = false; - std::vector errors_; - - std::string ucode_disassembly_; - std::vector translated_binary_; - std::string host_disassembly_; - std::string host_error_log_; - std::vector host_binary_; + uint32_t ucode_storage_index_ = UINT32_MAX; }; } // namespace gpu diff --git a/src/xenia/gpu/shader_compiler_main.cc b/src/xenia/gpu/shader_compiler_main.cc index f5392216b..a9a744955 100644 --- a/src/xenia/gpu/shader_compiler_main.cc +++ b/src/xenia/gpu/shader_compiler_main.cc @@ -140,11 +140,15 @@ int shader_compiler_main(const std::vector& args) { Shader::HostVertexShaderType::kQuadDomainPatchIndexed; } } + uint32_t modification = + 
translator->GetDefaultModification(shader_type, host_vertex_shader_type); - translator->Translate(shader.get(), host_vertex_shader_type); + Shader::Translation* translation = + shader->GetOrCreateTranslation(modification); + translator->Translate(*translation); - const void* source_data = shader->translated_binary().data(); - size_t source_data_size = shader->translated_binary().size(); + const void* source_data = translation->translated_binary().data(); + size_t source_data_size = translation->translated_binary().size(); std::unique_ptr spirv_disasm_result; if (cvars::shader_output_type == "spirvtext") { diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 304acf602..6d79e82c2 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -1,4 +1,3 @@ -#include "shader_translator.h" /** ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * @@ -14,6 +13,7 @@ #include #include +#include "xenia/base/assert.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" @@ -46,7 +46,9 @@ ShaderTranslator::ShaderTranslator() = default; ShaderTranslator::~ShaderTranslator() = default; -void ShaderTranslator::Reset() { +void ShaderTranslator::Reset(xenos::ShaderType shader_type) { + shader_type_ = shader_type; + modification_ = GetDefaultModification(shader_type); errors_.clear(); ucode_disasm_buffer_.Reset(); ucode_disasm_line_number_ = 0; @@ -64,37 +66,37 @@ void ShaderTranslator::Reset() { writes_color_targets_[i] = false; } writes_depth_ = false; - implicit_early_z_allowed_ = true; + kills_pixels_ = false; memexport_alloc_count_ = 0; memexport_eA_written_ = 0; std::memset(&memexport_eM_written_, 0, sizeof(memexport_eM_written_)); memexport_stream_constants_.clear(); } -bool ShaderTranslator::Translate( - Shader* shader, reg::SQ_PROGRAM_CNTL cntl, - Shader::HostVertexShaderType host_vertex_shader_type) { - Reset(); - uint32_t 
cntl_num_reg = shader->type() == xenos::ShaderType::kVertex +bool ShaderTranslator::Translate(Shader::Translation& translation, + reg::SQ_PROGRAM_CNTL cntl) { + xenos::ShaderType shader_type = translation.shader().type(); + Reset(shader_type); + uint32_t cntl_num_reg = shader_type == xenos::ShaderType::kVertex ? cntl.vs_num_reg : cntl.ps_num_reg; register_count_ = (cntl_num_reg & 0x80) ? 0 : (cntl_num_reg + 1); - return TranslateInternal(shader, host_vertex_shader_type); + return TranslateInternal(translation); } -bool ShaderTranslator::Translate( - Shader* shader, Shader::HostVertexShaderType host_vertex_shader_type) { - Reset(); - return TranslateInternal(shader, host_vertex_shader_type); +bool ShaderTranslator::Translate(Shader::Translation& translation) { + Reset(translation.shader().type()); + return TranslateInternal(translation); } -bool ShaderTranslator::TranslateInternal( - Shader* shader, Shader::HostVertexShaderType host_vertex_shader_type) { - shader_type_ = shader->type(); - host_vertex_shader_type_ = host_vertex_shader_type; - ucode_dwords_ = shader->ucode_dwords(); - ucode_dword_count_ = shader->ucode_dword_count(); +bool ShaderTranslator::TranslateInternal(Shader::Translation& translation) { + Shader& shader = translation.shader(); + assert_true(shader_type_ == shader.type()); + shader_type_ = shader.type(); + ucode_dwords_ = shader.ucode_dwords(); + ucode_dword_count_ = shader.ucode_dword_count(); + modification_ = translation.modification(); // Control flow instructions come paired in blocks of 3 dwords and all are // listed at the top of the ucode. @@ -150,12 +152,6 @@ bool ShaderTranslator::TranslateInternal( if (memexport_eA_written_ == 0) { memexport_stream_constants_.clear(); } - if (!memexport_stream_constants_.empty()) { - // TODO(Triang3l): Investigate what happens to memexport when the pixel - // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early - // depth/stencil. 
- implicit_early_z_allowed_ = false; - } StartTranslation(); @@ -192,35 +188,44 @@ bool ShaderTranslator::TranslateInternal( ++cf_index; } - shader->errors_ = std::move(errors_); - shader->translated_binary_ = CompleteTranslation(); - shader->ucode_disassembly_ = ucode_disasm_buffer_.to_string(); - shader->host_vertex_shader_type_ = host_vertex_shader_type_; - shader->vertex_bindings_ = std::move(vertex_bindings_); - shader->texture_bindings_ = std::move(texture_bindings_); - shader->constant_register_map_ = std::move(constant_register_map_); - for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) { - shader->writes_color_targets_[i] = writes_color_targets_[i]; - } - shader->writes_depth_ = writes_depth_; - shader->implicit_early_z_allowed_ = implicit_early_z_allowed_; - shader->memexport_stream_constants_.clear(); - for (uint32_t memexport_stream_constant : memexport_stream_constants_) { - shader->memexport_stream_constants_.push_back(memexport_stream_constant); - } + translation.errors_ = std::move(errors_); + translation.translated_binary_ = CompleteTranslation(); + translation.is_translated_ = true; - shader->is_valid_ = true; - shader->is_translated_ = true; - for (const auto& error : shader->errors_) { + bool is_valid = true; + for (const auto& error : translation.errors_) { if (error.is_fatal) { - shader->is_valid_ = false; + is_valid = false; break; } } + translation.is_valid_ = is_valid; - PostTranslation(shader); + // Setup info that doesn't depend on the modification only once. 
+ bool setup_shader_post_translation_info = + is_valid && !shader.post_translation_info_set_up_.test_and_set(); + if (setup_shader_post_translation_info) { + shader.ucode_disassembly_ = ucode_disasm_buffer_.to_string(); + shader.vertex_bindings_ = std::move(vertex_bindings_); + shader.texture_bindings_ = std::move(texture_bindings_); + shader.constant_register_map_ = std::move(constant_register_map_); + for (size_t i = 0; i < xe::countof(writes_color_targets_); ++i) { + shader.writes_color_targets_[i] = writes_color_targets_[i]; + } + shader.writes_depth_ = writes_depth_; + shader.kills_pixels_ = kills_pixels_; + shader.memexport_stream_constants_.clear(); + shader.memexport_stream_constants_.reserve( + memexport_stream_constants_.size()); + shader.memexport_stream_constants_.insert( + shader.memexport_stream_constants_.cend(), + memexport_stream_constants_.cbegin(), + memexport_stream_constants_.cend()); + } + PostTranslation(translation, setup_shader_post_translation_info); - return shader->is_valid_; + // In case is_valid_ is modified by PostTranslation, reload. 
+ return translation.is_valid_; } void ShaderTranslator::MarkUcodeInstruction(uint32_t dword_offset) { @@ -343,14 +348,9 @@ void ShaderTranslator::GatherInstructionInformation( ParsedAluInstruction instr; ParseAluInstruction(op, instr); - const auto& vector_opcode_info = - alu_vector_opcode_infos_[uint32_t(op.vector_opcode())]; - implicit_early_z_allowed_ &= - !vector_opcode_info.disable_implicit_early_z; - const auto& scalar_opcode_info = - alu_scalar_opcode_infos_[uint32_t(op.scalar_opcode())]; - implicit_early_z_allowed_ &= - !scalar_opcode_info.disable_implicit_early_z; + kills_pixels_ = kills_pixels_ || + ucode::AluVectorOpcodeIsKill(op.vector_opcode()) || + ucode::AluScalarOpcodeIsKill(op.scalar_opcode()); if (instr.vector_and_constant_result.storage_target != InstructionStorageTarget::kRegister || @@ -408,7 +408,6 @@ void ShaderTranslator::GatherInstructionInformation( break; case InstructionStorageTarget::kDepth: writes_depth_ = true; - implicit_early_z_allowed_ = false; break; default: break; @@ -1082,91 +1081,91 @@ uint32_t ParsedTextureFetchInstruction::GetNonZeroResultComponents() const { const ShaderTranslator::AluOpcodeInfo ShaderTranslator::alu_vector_opcode_infos_[0x20] = { - {"add", 2, 4, false}, // 0 - {"mul", 2, 4, false}, // 1 - {"max", 2, 4, false}, // 2 - {"min", 2, 4, false}, // 3 - {"seq", 2, 4, false}, // 4 - {"sgt", 2, 4, false}, // 5 - {"sge", 2, 4, false}, // 6 - {"sne", 2, 4, false}, // 7 - {"frc", 1, 4, false}, // 8 - {"trunc", 1, 4, false}, // 9 - {"floor", 1, 4, false}, // 10 - {"mad", 3, 4, false}, // 11 - {"cndeq", 3, 4, false}, // 12 - {"cndge", 3, 4, false}, // 13 - {"cndgt", 3, 4, false}, // 14 - {"dp4", 2, 4, false}, // 15 - {"dp3", 2, 4, false}, // 16 - {"dp2add", 3, 4, false}, // 17 - {"cube", 2, 4, false}, // 18 - {"max4", 1, 4, false}, // 19 - {"setp_eq_push", 2, 4, false}, // 20 - {"setp_ne_push", 2, 4, false}, // 21 - {"setp_gt_push", 2, 4, false}, // 22 - {"setp_ge_push", 2, 4, false}, // 23 - {"kill_eq", 2, 4, true}, // 
24 - {"kill_gt", 2, 4, true}, // 25 - {"kill_ge", 2, 4, true}, // 26 - {"kill_ne", 2, 4, true}, // 27 - {"dst", 2, 4, false}, // 28 - {"maxa", 2, 4, false}, // 29 + {"add", 2, 4}, // 0 + {"mul", 2, 4}, // 1 + {"max", 2, 4}, // 2 + {"min", 2, 4}, // 3 + {"seq", 2, 4}, // 4 + {"sgt", 2, 4}, // 5 + {"sge", 2, 4}, // 6 + {"sne", 2, 4}, // 7 + {"frc", 1, 4}, // 8 + {"trunc", 1, 4}, // 9 + {"floor", 1, 4}, // 10 + {"mad", 3, 4}, // 11 + {"cndeq", 3, 4}, // 12 + {"cndge", 3, 4}, // 13 + {"cndgt", 3, 4}, // 14 + {"dp4", 2, 4}, // 15 + {"dp3", 2, 4}, // 16 + {"dp2add", 3, 4}, // 17 + {"cube", 2, 4}, // 18 + {"max4", 1, 4}, // 19 + {"setp_eq_push", 2, 4}, // 20 + {"setp_ne_push", 2, 4}, // 21 + {"setp_gt_push", 2, 4}, // 22 + {"setp_ge_push", 2, 4}, // 23 + {"kill_eq", 2, 4}, // 24 + {"kill_gt", 2, 4}, // 25 + {"kill_ge", 2, 4}, // 26 + {"kill_ne", 2, 4}, // 27 + {"dst", 2, 4}, // 28 + {"maxa", 2, 4}, // 29 }; const ShaderTranslator::AluOpcodeInfo ShaderTranslator::alu_scalar_opcode_infos_[0x40] = { - {"adds", 1, 2, false}, // 0 - {"adds_prev", 1, 1, false}, // 1 - {"muls", 1, 2, false}, // 2 - {"muls_prev", 1, 1, false}, // 3 - {"muls_prev2", 1, 2, false}, // 4 - {"maxs", 1, 2, false}, // 5 - {"mins", 1, 2, false}, // 6 - {"seqs", 1, 1, false}, // 7 - {"sgts", 1, 1, false}, // 8 - {"sges", 1, 1, false}, // 9 - {"snes", 1, 1, false}, // 10 - {"frcs", 1, 1, false}, // 11 - {"truncs", 1, 1, false}, // 12 - {"floors", 1, 1, false}, // 13 - {"exp", 1, 1, false}, // 14 - {"logc", 1, 1, false}, // 15 - {"log", 1, 1, false}, // 16 - {"rcpc", 1, 1, false}, // 17 - {"rcpf", 1, 1, false}, // 18 - {"rcp", 1, 1, false}, // 19 - {"rsqc", 1, 1, false}, // 20 - {"rsqf", 1, 1, false}, // 21 - {"rsq", 1, 1, false}, // 22 - {"maxas", 1, 2, false}, // 23 - {"maxasf", 1, 2, false}, // 24 - {"subs", 1, 2, false}, // 25 - {"subs_prev", 1, 1, false}, // 26 - {"setp_eq", 1, 1, false}, // 27 - {"setp_ne", 1, 1, false}, // 28 - {"setp_gt", 1, 1, false}, // 29 - {"setp_ge", 1, 1, false}, // 30 - 
{"setp_inv", 1, 1, false}, // 31 - {"setp_pop", 1, 1, false}, // 32 - {"setp_clr", 0, 0, false}, // 33 - {"setp_rstr", 1, 1, false}, // 34 - {"kills_eq", 1, 1, true}, // 35 - {"kills_gt", 1, 1, true}, // 36 - {"kills_ge", 1, 1, true}, // 37 - {"kills_ne", 1, 1, true}, // 38 - {"kills_one", 1, 1, true}, // 39 - {"sqrt", 1, 1, false}, // 40 - {"UNKNOWN", 0, 0, false}, // 41 - {"mulsc", 2, 1, false}, // 42 - {"mulsc", 2, 1, false}, // 43 - {"addsc", 2, 1, false}, // 44 - {"addsc", 2, 1, false}, // 45 - {"subsc", 2, 1, false}, // 46 - {"subsc", 2, 1, false}, // 47 - {"sin", 1, 1, false}, // 48 - {"cos", 1, 1, false}, // 49 - {"retain_prev", 0, 0, false}, // 50 + {"adds", 1, 2}, // 0 + {"adds_prev", 1, 1}, // 1 + {"muls", 1, 2}, // 2 + {"muls_prev", 1, 1}, // 3 + {"muls_prev2", 1, 2}, // 4 + {"maxs", 1, 2}, // 5 + {"mins", 1, 2}, // 6 + {"seqs", 1, 1}, // 7 + {"sgts", 1, 1}, // 8 + {"sges", 1, 1}, // 9 + {"snes", 1, 1}, // 10 + {"frcs", 1, 1}, // 11 + {"truncs", 1, 1}, // 12 + {"floors", 1, 1}, // 13 + {"exp", 1, 1}, // 14 + {"logc", 1, 1}, // 15 + {"log", 1, 1}, // 16 + {"rcpc", 1, 1}, // 17 + {"rcpf", 1, 1}, // 18 + {"rcp", 1, 1}, // 19 + {"rsqc", 1, 1}, // 20 + {"rsqf", 1, 1}, // 21 + {"rsq", 1, 1}, // 22 + {"maxas", 1, 2}, // 23 + {"maxasf", 1, 2}, // 24 + {"subs", 1, 2}, // 25 + {"subs_prev", 1, 1}, // 26 + {"setp_eq", 1, 1}, // 27 + {"setp_ne", 1, 1}, // 28 + {"setp_gt", 1, 1}, // 29 + {"setp_ge", 1, 1}, // 30 + {"setp_inv", 1, 1}, // 31 + {"setp_pop", 1, 1}, // 32 + {"setp_clr", 0, 0}, // 33 + {"setp_rstr", 1, 1}, // 34 + {"kills_eq", 1, 1}, // 35 + {"kills_gt", 1, 1}, // 36 + {"kills_ge", 1, 1}, // 37 + {"kills_ne", 1, 1}, // 38 + {"kills_one", 1, 1}, // 39 + {"sqrt", 1, 1}, // 40 + {"UNKNOWN", 0, 0}, // 41 + {"mulsc", 2, 1}, // 42 + {"mulsc", 2, 1}, // 43 + {"addsc", 2, 1}, // 44 + {"addsc", 2, 1}, // 45 + {"subsc", 2, 1}, // 46 + {"subsc", 2, 1}, // 47 + {"sin", 1, 1}, // 48 + {"cos", 1, 1}, // 49 + {"retain_prev", 0, 0}, // 50 }; void 
ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) { diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index 3d4fa208d..e1c97808a 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -29,18 +29,27 @@ class ShaderTranslator { public: virtual ~ShaderTranslator(); - bool Translate(Shader* shader, reg::SQ_PROGRAM_CNTL cntl, - Shader::HostVertexShaderType host_vertex_shader_type = - Shader::HostVertexShaderType::kVertex); - bool Translate(Shader* shader, - Shader::HostVertexShaderType host_vertex_shader_type = - Shader::HostVertexShaderType::kVertex); + virtual uint32_t GetDefaultModification( + xenos::ShaderType shader_type, + Shader::HostVertexShaderType host_vertex_shader_type = + Shader::HostVertexShaderType::kVertex) const { + return 0; + } + + bool Translate(Shader::Translation& translation, reg::SQ_PROGRAM_CNTL cntl); + bool Translate(Shader::Translation& translation); protected: ShaderTranslator(); // Resets translator state before beginning translation. - virtual void Reset(); + // shader_type is passed here so translator implementations can generate + // special fixed shaders for internal use, and set up the type for this + // purpose. + virtual void Reset(xenos::ShaderType shader_type); + + // Current host-side modification being generated. + uint32_t modification() const { return modification_; } // Register count. uint32_t register_count() const { return register_count_; } @@ -48,11 +57,6 @@ class ShaderTranslator { bool is_vertex_shader() const { return shader_type_ == xenos::ShaderType::kVertex; } - // If translating a vertex shader, type of the shader in a D3D11-like - // rendering pipeline. - Shader::HostVertexShaderType host_vertex_shader_type() const { - return host_vertex_shader_type_; - } // True if the current shader is a pixel shader. 
bool is_pixel_shader() const { return shader_type_ == xenos::ShaderType::kPixel; @@ -85,10 +89,8 @@ class ShaderTranslator { // True if the current shader overrides the pixel depth, set before // translation. Doesn't include writes with an empty used write mask. bool writes_depth() const { return writes_depth_; } - // True if Xenia can automatically enable early depth/stencil for the pixel - // shader when RB_DEPTHCONTROL EARLY_Z_ENABLE is not set, provided alpha - // testing and alpha to coverage are disabled. - bool implicit_early_z_allowed() const { return implicit_early_z_allowed_; } + // True if the current shader has any `kill` instructions. + bool kills_pixels() const { return kills_pixels_; } // A list of all vertex bindings, populated before translation occurs. const std::vector& vertex_bindings() const { return vertex_bindings_; @@ -112,6 +114,17 @@ class ShaderTranslator { return memexport_stream_constants_; } + // Whether the shader can have early depth and stencil writing enabled, unless + // alpha test or alpha to coverage is enabled. Data gathered before + // translation. + bool CanWriteZEarly() const { + // TODO(Triang3l): Investigate what happens to memexport when the pixel + // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early + // depth/stencil. + return !writes_depth_ && !kills_pixels_ && + memexport_stream_constants_.empty(); + } + // Current line number in the ucode disassembly. size_t ucode_disasm_line_number() const { return ucode_disasm_line_number_; } // Ucode disassembly buffer accumulated during translation. @@ -130,10 +143,14 @@ class ShaderTranslator { } // Handles post-translation tasks when the shader has been fully translated. - virtual void PostTranslation(Shader* shader) {} + // setup_shader_post_translation_info if non-modification-specific parameters + // of the Shader object behind the Translation can be set by this invocation. 
+ virtual void PostTranslation(Shader::Translation& translation, + bool setup_shader_post_translation_info) {} // Sets the host disassembly on a shader. - void set_host_disassembly(Shader* shader, std::string value) { - shader->host_disassembly_ = std::move(value); + void set_host_disassembly(Shader::Translation& translation, + std::string value) { + translation.host_disassembly_ = std::move(value); } // Pre-process a control-flow instruction before anything else. @@ -188,11 +205,9 @@ class ShaderTranslator { const char* name; uint32_t argument_count; uint32_t src_swizzle_component_count; - bool disable_implicit_early_z; }; - bool TranslateInternal(Shader* shader, - Shader::HostVertexShaderType host_vertex_shader_type); + bool TranslateInternal(Shader::Translation& translation); void MarkUcodeInstruction(uint32_t dword_offset); void AppendUcodeDisasm(char c); @@ -246,12 +261,13 @@ class ShaderTranslator { // Input shader metadata and microcode. xenos::ShaderType shader_type_; - Shader::HostVertexShaderType host_vertex_shader_type_; const uint32_t* ucode_dwords_; size_t ucode_dword_count_; - reg::SQ_PROGRAM_CNTL program_cntl_; uint32_t register_count_; + // Current host-side modification being generated. + uint32_t modification_ = 0; + // Accumulated translation errors. std::vector errors_; @@ -272,7 +288,8 @@ class ShaderTranslator { // translation. std::set label_addresses_; - // Detected binding information gathered before translation. + // Detected binding information gathered before translation. Must not be + // affected by the modification index. int total_attrib_count_ = 0; std::vector vertex_bindings_; std::vector texture_bindings_; @@ -282,13 +299,15 @@ class ShaderTranslator { // These all are gathered before translation. // uses_register_dynamic_addressing_ for writes, writes_color_targets_, // writes_depth_ don't include empty used write masks. + // Must not be affected by the modification index. 
Shader::ConstantRegisterMap constant_register_map_ = {0}; bool uses_register_dynamic_addressing_ = false; bool writes_color_targets_[4] = {false, false, false, false}; bool writes_depth_ = false; - bool implicit_early_z_allowed_ = true; + bool kills_pixels_ = false; // Memexport info is gathered before translation. + // Must not be affected by the modification index. uint32_t memexport_alloc_count_ = 0; // For register allocation in implementations - what was used after each // `alloc export`. diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index bb1bb51f0..0ff228d53 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -667,12 +667,14 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { return spirv_bytes; } -void SpirvShaderTranslator::PostTranslation(Shader* shader) { +void SpirvShaderTranslator::PostTranslation( + Shader::Translation& translation, bool setup_shader_post_translation_info) { // Validation. if (cvars::spv_validate) { auto validation = validator_.Validate( - reinterpret_cast(shader->translated_binary().data()), - shader->translated_binary().size() / sizeof(uint32_t)); + reinterpret_cast( + translation.translated_binary().data()), + translation.translated_binary().size() / sizeof(uint32_t)); if (validation->has_error()) { XELOGE("SPIR-V Shader Validation failed! Error: {}", validation->error_string()); @@ -682,12 +684,13 @@ void SpirvShaderTranslator::PostTranslation(Shader* shader) { if (cvars::spv_disasm) { // TODO(benvanik): only if needed? could be slowish. 
auto disasm = disassembler_.Disassemble( - reinterpret_cast(shader->translated_binary().data()), - shader->translated_binary().size() / 4); + reinterpret_cast( + translation.translated_binary().data()), + translation.translated_binary().size() / sizeof(uint32_t)); if (disasm->has_error()) { XELOGE("Failed to disassemble SPIRV - invalid?"); } else { - set_host_disassembly(shader, disasm->to_string()); + set_host_disassembly(translation, disasm->to_string()); } } } diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 044dea019..478aa3428 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -61,7 +61,8 @@ class SpirvShaderTranslator : public ShaderTranslator { protected: void StartTranslation() override; std::vector CompleteTranslation() override; - void PostTranslation(Shader* shader) override; + void PostTranslation(Shader::Translation& translation, + bool setup_shader_post_translation_info) override; void PreProcessControlFlowInstructions( std::vector instrs) override; diff --git a/src/xenia/gpu/trace_dump.cc b/src/xenia/gpu/trace_dump.cc index 984984c4a..fdebcfba4 100644 --- a/src/xenia/gpu/trace_dump.cc +++ b/src/xenia/gpu/trace_dump.cc @@ -92,7 +92,7 @@ int TraceDump::Main(const std::vector& args) { bool TraceDump::Setup() { // Create the emulator but don't initialize so we can setup the window. - emulator_ = std::make_unique("", "", ""); + emulator_ = std::make_unique("", "", "", ""); X_STATUS result = emulator_->Setup( nullptr, nullptr, [this]() { return CreateGraphicsSystem(); }, nullptr); if (XFAILED(result)) { diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc index 5305c50ae..5297d6856 100644 --- a/src/xenia/gpu/trace_viewer.cc +++ b/src/xenia/gpu/trace_viewer.cc @@ -121,7 +121,7 @@ bool TraceViewer::Setup() { window_->Resize(1920, 1200); // Create the emulator but don't initialize so we can setup the window. 
- emulator_ = std::make_unique("", "", ""); + emulator_ = std::make_unique("", "", "", ""); X_STATUS result = emulator_->Setup( window_.get(), nullptr, [this]() { return CreateGraphicsSystem(); }, nullptr); @@ -566,8 +566,21 @@ TraceViewer::ShaderDisplayType TraceViewer::DrawShaderTypeUI() { void TraceViewer::DrawShaderUI(Shader* shader, ShaderDisplayType display_type) { // Must be prepared for advanced display modes. + // FIXME(Triang3l): This should display the actual translation used in the + // draw, but it may depend on multiple backend-related factors, including + // drawing multiple times with multiple modifications, even depending on + // values obtained during translation of other modifications (for instance, + // a memexporting shader can be executed both as a vertex shader (to draw the + // points) and as a compute shader (to actually export) if the host doesn't + // support writes from vertex shaders. + const Shader::Translation* translation = nullptr; if (display_type != ShaderDisplayType::kUcode) { - if (!shader->is_valid()) { + for (const auto& translation_pair : shader->translations()) { + if (translation_pair.second->is_valid()) { + translation = translation_pair.second; + } + } + if (!translation) { ImGui::TextColored(kColorError, "ERROR: shader error during parsing/translation"); return; @@ -580,7 +593,7 @@ void TraceViewer::DrawShaderUI(Shader* shader, ShaderDisplayType display_type) { break; } case ShaderDisplayType::kTranslated: { - const auto& str = shader->GetTranslatedBinaryString(); + const auto& str = translation->GetTranslatedBinaryString(); size_t i = 0; bool done = false; while (!done && i < str.size()) { @@ -600,7 +613,7 @@ void TraceViewer::DrawShaderUI(Shader* shader, ShaderDisplayType display_type) { break; } case ShaderDisplayType::kHostDisasm: { - DrawMultilineString(shader->host_disassembly()); + DrawMultilineString(translation->host_disassembly()); break; } } diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index 
21ccbaff9..85b52a377 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -1147,6 +1147,19 @@ enum class AluScalarOpcode : uint32_t { kRetainPrev = 50, }; +constexpr bool AluScalarOpcodeIsKill(AluScalarOpcode scalar_opcode) { + switch (scalar_opcode) { + case AluScalarOpcode::kKillsEq: + case AluScalarOpcode::kKillsGt: + case AluScalarOpcode::kKillsGe: + case AluScalarOpcode::kKillsNe: + case AluScalarOpcode::kKillsOne: + return true; + default: + return false; + } +} + enum class AluVectorOpcode : uint32_t { // Per-Component Floating-Point Add // add/ADDv dest, src0, src1 @@ -1471,27 +1484,37 @@ enum class AluVectorOpcode : uint32_t { kMaxA = 29, }; +constexpr bool AluVectorOpcodeIsKill(AluVectorOpcode vector_opcode) { + switch (vector_opcode) { + case AluVectorOpcode::kKillEq: + case AluVectorOpcode::kKillGt: + case AluVectorOpcode::kKillGe: + case AluVectorOpcode::kKillNe: + return true; + default: + return false; + } +} + // Whether the vector instruction has side effects such as discarding a pixel or // setting the predicate and can't be ignored even if it doesn't write to // anywhere. Note that all scalar operations except for retain_prev have a side // effect of modifying the previous scalar result register, so they must always // be executed even if not writing. 
constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) { + if (AluVectorOpcodeIsKill(vector_opcode)) { + return true; + } switch (vector_opcode) { case AluVectorOpcode::kSetpEqPush: case AluVectorOpcode::kSetpNePush: case AluVectorOpcode::kSetpGtPush: case AluVectorOpcode::kSetpGePush: - case AluVectorOpcode::kKillEq: - case AluVectorOpcode::kKillGt: - case AluVectorOpcode::kKillGe: - case AluVectorOpcode::kKillNe: case AluVectorOpcode::kMaxA: return true; default: - break; + return false; } - return false; } // Whether each component of a source operand is used at all in the instruction diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index 8db418de9..3ab45245c 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -362,35 +362,38 @@ VkPipeline PipelineCache::GetPipeline(const RenderState* render_state, return pipeline; } -bool PipelineCache::TranslateShader(VulkanShader* shader, - reg::SQ_PROGRAM_CNTL cntl) { +bool PipelineCache::TranslateShader( + VulkanShader::VulkanTranslation& translation, reg::SQ_PROGRAM_CNTL cntl) { // Perform translation. // If this fails the shader will be marked as invalid and ignored later. - if (!shader_translator_->Translate(shader, cntl)) { + if (!shader_translator_->Translate(translation, cntl)) { XELOGE("Shader translation failed; marking shader as ignored"); return false; } // Prepare the shader for use (creates our VkShaderModule). // It could still fail at this point. - if (!shader->Prepare()) { + if (!translation.Prepare()) { XELOGE("Shader preparation failed; marking shader as ignored"); return false; } - if (shader->is_valid()) { + if (translation.is_valid()) { XELOGGPU("Generated {} shader ({}b) - hash {:016X}:\n{}\n", - shader->type() == xenos::ShaderType::kVertex ? 
"vertex" : "pixel", - shader->ucode_dword_count() * 4, shader->ucode_data_hash(), - shader->ucode_disassembly()); + translation.shader().type() == xenos::ShaderType::kVertex + ? "vertex" + : "pixel", + translation.shader().ucode_dword_count() * 4, + translation.shader().ucode_data_hash(), + translation.shader().ucode_disassembly()); } // Dump shader files if desired. if (!cvars::dump_shaders.empty()) { - shader->Dump(cvars::dump_shaders, "vk"); + translation.Dump(cvars::dump_shaders, "vk"); } - return shader->is_valid(); + return translation.is_valid(); } static void DumpShaderStatisticsAMD(const VkShaderStatisticsInfoAMD& stats) { @@ -1063,16 +1066,28 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( return UpdateStatus::kCompatible; } - if (!vertex_shader->is_translated() && - !TranslateShader(vertex_shader, regs.sq_program_cntl)) { + VulkanShader::VulkanTranslation* vertex_shader_translation = + static_cast( + vertex_shader->GetOrCreateTranslation( + shader_translator_->GetDefaultModification( + xenos::ShaderType::kVertex))); + if (!vertex_shader_translation->is_translated() && + !TranslateShader(*vertex_shader_translation, regs.sq_program_cntl)) { XELOGE("Failed to translate the vertex shader!"); return UpdateStatus::kError; } - if (pixel_shader && !pixel_shader->is_translated() && - !TranslateShader(pixel_shader, regs.sq_program_cntl)) { - XELOGE("Failed to translate the pixel shader!"); - return UpdateStatus::kError; + VulkanShader::VulkanTranslation* pixel_shader_translation = nullptr; + if (pixel_shader) { + pixel_shader_translation = static_cast( + pixel_shader->GetOrCreateTranslation( + shader_translator_->GetDefaultModification( + xenos::ShaderType::kPixel))); + if (!pixel_shader_translation->is_translated() && + !TranslateShader(*pixel_shader_translation, regs.sq_program_cntl)) { + XELOGE("Failed to translate the pixel shader!"); + return UpdateStatus::kError; + } } update_shader_stages_stage_count_ = 0; @@ -1084,7 +1099,7 @@ 
PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( vertex_pipeline_stage.pNext = nullptr; vertex_pipeline_stage.flags = 0; vertex_pipeline_stage.stage = VK_SHADER_STAGE_VERTEX_BIT; - vertex_pipeline_stage.module = vertex_shader->shader_module(); + vertex_pipeline_stage.module = vertex_shader_translation->shader_module(); vertex_pipeline_stage.pName = "main"; vertex_pipeline_stage.pSpecializationInfo = nullptr; @@ -1116,8 +1131,9 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( pixel_pipeline_stage.pNext = nullptr; pixel_pipeline_stage.flags = 0; pixel_pipeline_stage.stage = VK_SHADER_STAGE_FRAGMENT_BIT; - pixel_pipeline_stage.module = - pixel_shader ? pixel_shader->shader_module() : dummy_pixel_shader_; + pixel_pipeline_stage.module = pixel_shader_translation + ? pixel_shader_translation->shader_module() + : dummy_pixel_shader_; pixel_pipeline_stage.pName = "main"; pixel_pipeline_stage.pSpecializationInfo = nullptr; diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index 3e03dce1e..693dd4594 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -79,7 +79,8 @@ class PipelineCache { // state. 
VkPipeline GetPipeline(const RenderState* render_state, uint64_t hash_key); - bool TranslateShader(VulkanShader* shader, reg::SQ_PROGRAM_CNTL cntl); + bool TranslateShader(VulkanShader::VulkanTranslation& translation, + reg::SQ_PROGRAM_CNTL cntl); void DumpShaderDisasmAMD(VkPipeline pipeline); void DumpShaderDisasmNV(const VkGraphicsPipelineCreateInfo& info); diff --git a/src/xenia/gpu/vulkan/vulkan_shader.cc b/src/xenia/gpu/vulkan/vulkan_shader.cc index 659ad9326..2eb41e9e5 100644 --- a/src/xenia/gpu/vulkan/vulkan_shader.cc +++ b/src/xenia/gpu/vulkan/vulkan_shader.cc @@ -27,38 +27,56 @@ VulkanShader::VulkanShader(ui::vulkan::VulkanDevice* device, const uint32_t* dword_ptr, uint32_t dword_count) : Shader(shader_type, data_hash, dword_ptr, dword_count), device_(device) {} -VulkanShader::~VulkanShader() { +VulkanShader::VulkanTranslation::~VulkanTranslation() { if (shader_module_) { - vkDestroyShaderModule(*device_, shader_module_, nullptr); + const VulkanShader& vulkan_shader = static_cast(shader()); + vkDestroyShaderModule(*vulkan_shader.device_, shader_module_, nullptr); shader_module_ = nullptr; } } -bool VulkanShader::Prepare() { +bool VulkanShader::VulkanTranslation::Prepare() { assert_null(shader_module_); assert_true(is_valid()); + const VulkanShader& vulkan_shader = static_cast(shader()); + ui::vulkan::VulkanDevice* device = vulkan_shader.device_; + // Create the shader module. 
VkShaderModuleCreateInfo shader_info; shader_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; shader_info.pNext = nullptr; shader_info.flags = 0; - shader_info.codeSize = translated_binary_.size(); + shader_info.codeSize = translated_binary().size(); shader_info.pCode = - reinterpret_cast(translated_binary_.data()); + reinterpret_cast(translated_binary().data()); auto status = - vkCreateShaderModule(*device_, &shader_info, nullptr, &shader_module_); + vkCreateShaderModule(*device, &shader_info, nullptr, &shader_module_); CheckResult(status, "vkCreateShaderModule"); - char typeChar = shader_type_ == xenos::ShaderType::kPixel - ? 'p' - : shader_type_ == xenos::ShaderType::kVertex ? 'v' : 'u'; - device_->DbgSetObjectName( - uint64_t(shader_module_), VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT, - fmt::format("S({}): {:016X}", typeChar, ucode_data_hash())); + char type_char; + switch (vulkan_shader.type()) { + case xenos::ShaderType::kVertex: + type_char = 'v'; + break; + case xenos::ShaderType::kPixel: + type_char = 'p'; + break; + default: + type_char = 'u'; + } + device->DbgSetObjectName(uint64_t(shader_module_), + VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT, + fmt::format("S({}): {:016X}", type_char, + vulkan_shader.ucode_data_hash())); return status == VK_SUCCESS; } +Shader::Translation* VulkanShader::CreateTranslationInstance( + uint32_t modification) { + return new VulkanTranslation(*this, modification); +} + } // namespace vulkan } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/vulkan/vulkan_shader.h b/src/xenia/gpu/vulkan/vulkan_shader.h index 9dd64a22c..7d948ac71 100644 --- a/src/xenia/gpu/vulkan/vulkan_shader.h +++ b/src/xenia/gpu/vulkan/vulkan_shader.h @@ -21,19 +21,30 @@ namespace vulkan { class VulkanShader : public Shader { public: + class VulkanTranslation : public Translation { + public: + VulkanTranslation(VulkanShader& shader, uint32_t modification) + : Translation(shader, modification) {} + ~VulkanTranslation() override; + 
+ bool Prepare(); + + // Available only if the translation is_valid and has been prepared. + VkShaderModule shader_module() const { return shader_module_; } + + private: + VkShaderModule shader_module_ = nullptr; + }; + VulkanShader(ui::vulkan::VulkanDevice* device, xenos::ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t dword_count); - ~VulkanShader() override; - // Available only if the shader is_valid and has been prepared. - VkShaderModule shader_module() const { return shader_module_; } - - bool Prepare(); + protected: + Translation* CreateTranslationInstance(uint32_t modification) override; private: ui::vulkan::VulkanDevice* device_ = nullptr; - VkShaderModule shader_module_ = nullptr; }; } // namespace vulkan diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc index 4f9e2875f..faaf4818d 100644 --- a/src/xenia/gpu/xenos.cc +++ b/src/xenia/gpu/xenos.cc @@ -9,17 +9,41 @@ #include "xenia/gpu/xenos.h" +#include + #include "xenia/base/math.h" namespace xe { namespace gpu { namespace xenos { +// Based on CFloat24 from d3dref9.dll and the 6e4 code from: +// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp +// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). + +uint32_t Float32To20e4(float f32) { + if (!(f32 > 0.0f)) { + // Positive only, and not -0 or NaN. + return 0; + } + uint32_t f32u32 = *reinterpret_cast(&f32); + if (f32u32 >= 0x3FFFFFF8) { + // Saturate. + return 0xFFFFFF; + } + if (f32u32 < 0x38800000) { + // The number is too small to be represented as a normalized 20e4. + // Convert it to a denormalized value. + uint32_t shift = std::min(uint32_t(113 - (f32u32 >> 23)), uint32_t(24)); + f32u32 = (0x800000 | (f32u32 & 0x7FFFFF)) >> shift; + } else { + // Rebias the exponent to represent the value as a normalized 20e4. 
+ f32u32 += 0xC8000000u; + } + return ((f32u32 + 3 + ((f32u32 >> 3) & 1)) >> 3) & 0xFFFFFF; +} + float Float20e4To32(uint32_t f24) { - // Based on CFloat24 from d3dref9.dll and the 6e4 code from: - // https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp - // 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows - // [0,2). f24 &= 0xFFFFFF; if (!f24) { return 0.0f; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 4117a8293..542372569 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -305,6 +305,9 @@ enum class DepthRenderTargetFormat : uint32_t { const char* GetDepthRenderTargetFormatName(DepthRenderTargetFormat format); +// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point +// depth, rounding to the nearest even. +uint32_t Float32To20e4(float f32); // Converts Xenos floating-point depth in bits 0:23 (not clamping) to an // IEEE-754 32-bit floating-point number. float Float20e4To32(uint32_t f24); From 36a0bcec8b85e225c49c8a4e26cdd3c79cf47710 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 8 Dec 2020 22:31:09 +0300 Subject: [PATCH 16/29] [GPU] XXH3 hash instead of XXH64 --- .gitmodules | 3 + src/xenia/base/hash.h | 2 +- src/xenia/base/xxhash.h | 21 + .../gpu/d3d12/d3d12_command_processor.cc | 2 - src/xenia/gpu/d3d12/pipeline_cache.cc | 28 +- src/xenia/gpu/d3d12/pipeline_cache.h | 8 +- src/xenia/gpu/d3d12/texture_cache.cc | 2 +- src/xenia/gpu/sampler_info.cc | 4 +- src/xenia/gpu/texture_conversion.cc | 3 +- src/xenia/gpu/texture_info.cc | 5 +- src/xenia/gpu/vulkan/buffer_cache.cc | 20 +- src/xenia/gpu/vulkan/buffer_cache.h | 4 +- src/xenia/gpu/vulkan/pipeline_cache.cc | 25 +- src/xenia/gpu/vulkan/pipeline_cache.h | 5 +- src/xenia/gpu/vulkan/texture_cache.cc | 10 +- src/xenia/gpu/vulkan/texture_cache.h | 2 +- third_party/xxhash | 1 + third_party/xxhash/LICENSE | 24 - third_party/xxhash/Makefile | 67 -- third_party/xxhash/README.md | 74 -- 
third_party/xxhash/README.xenia | 2 - third_party/xxhash/xxhash.c | 928 ------------------ third_party/xxhash/xxhash.h | 156 --- third_party/xxhash/xxhsum.c | 689 ------------- 24 files changed, 83 insertions(+), 2002 deletions(-) create mode 100644 src/xenia/base/xxhash.h create mode 160000 third_party/xxhash delete mode 100644 third_party/xxhash/LICENSE delete mode 100644 third_party/xxhash/Makefile delete mode 100644 third_party/xxhash/README.md delete mode 100644 third_party/xxhash/README.xenia delete mode 100644 third_party/xxhash/xxhash.c delete mode 100644 third_party/xxhash/xxhash.h delete mode 100644 third_party/xxhash/xxhsum.c diff --git a/.gitmodules b/.gitmodules index b8f139b8a..fef75312b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -73,3 +73,6 @@ [submodule "third_party/date"] path = third_party/date url = https://github.com/HowardHinnant/date.git +[submodule "third_party/xxhash"] + path = third_party/xxhash + url = https://github.com/Cyan4973/xxHash.git diff --git a/src/xenia/base/hash.h b/src/xenia/base/hash.h index b4f252eb4..88c98b64c 100644 --- a/src/xenia/base/hash.h +++ b/src/xenia/base/hash.h @@ -17,7 +17,7 @@ namespace hash { // For use in unordered_sets and unordered_maps (primarily multisets and // multimaps, with manual collision resolution), where the hash is calculated -// externally (for instance, as XXH64), possibly requiring context data rather +// externally (for instance, as XXH3), possibly requiring context data rather // than a pure function to calculate the hash template struct IdentityHasher { diff --git a/src/xenia/base/xxhash.h b/src/xenia/base/xxhash.h new file mode 100644 index 000000000..30960e8d5 --- /dev/null +++ b/src/xenia/base/xxhash.h @@ -0,0 +1,21 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. 
* + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_BASE_XXHASH_H_ +#define XENIA_BASE_XXHASH_H_ + +#define XXH_INLINE_ALL + +// Can't use XXH_X86DISPATCH because XXH is calculated on multiple threads, +// while the dispatch writes the result (multiple pointers without any +// synchronization) to XXH_g_dispatch at the first call. + +#include "third_party/xxhash/xxhash.h" + +#endif // XENIA_BASE_XXHASH_H_ diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index ef38ff5b1..d355d83c6 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -7,8 +7,6 @@ ****************************************************************************** */ -#include "third_party/xxhash/xxhash.h" - #include #include #include diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index e1b1cbeaf..cc9f5c9be 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -20,7 +20,6 @@ #include #include "third_party/fmt/include/fmt/format.h" -#include "third_party/xxhash/xxhash.h" #include "xenia/base/assert.h" #include "xenia/base/byte_order.h" #include "xenia/base/clock.h" @@ -30,6 +29,7 @@ #include "xenia/base/math.h" #include "xenia/base/profiling.h" #include "xenia/base/string.h" +#include "xenia/base/xxhash.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/ui/d3d12/d3d12_util.h" @@ -325,9 +325,9 @@ void PipelineCache::InitializeShaderStorage( pipeline_stored_descriptions[i]; // Validate file integrity, stop and truncate the stream if data is // corrupted. 
- if (XXH64(&pipeline_stored_description.description, - sizeof(pipeline_stored_description.description), - 0) != pipeline_stored_description.description_hash) { + if (XXH3_64bits(&pipeline_stored_description.description, + sizeof(pipeline_stored_description.description)) != + pipeline_stored_description.description_hash) { pipeline_stored_descriptions.resize(i); break; } @@ -471,7 +471,7 @@ void PipelineCache::InitializeShaderStorage( break; } uint64_t ucode_data_hash = - XXH64(ucode_dwords.data(), ucode_byte_count, 0); + XXH3_64bits(ucode_dwords.data(), ucode_byte_count); if (shader_header.ucode_data_hash != ucode_data_hash) { // Validation failed. break; @@ -828,7 +828,7 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, uint32_t dword_count) { // Hash the input memory and lookup the shader. return LoadShader(shader_type, host_address, dword_count, - XXH64(host_address, dword_count * sizeof(uint32_t), 0)); + XXH3_64bits(host_address, dword_count * sizeof(uint32_t))); } D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, @@ -1065,7 +1065,7 @@ bool PipelineCache::ConfigurePipeline( } // Find an existing pipeline in the cache. - uint64_t hash = XXH64(&description, sizeof(description), 0); + uint64_t hash = XXH3_64bits(&description, sizeof(description)); auto found_range = pipelines_.equal_range(hash); for (auto it = found_range.first; it != found_range.second; ++it) { Pipeline* found_pipeline = it->second; @@ -1185,20 +1185,20 @@ bool PipelineCache::TranslateShader(DxbcShaderTranslator& translator, uint64_t texture_binding_layout_hash = 0; if (texture_binding_count) { texture_binding_layout_hash = - XXH64(texture_bindings, texture_binding_layout_bytes, 0); + XXH3_64bits(texture_bindings, texture_binding_layout_bytes); } uint32_t bindless_sampler_count = bindless_resources_used_ ? 
sampler_binding_count : 0; uint64_t bindless_sampler_layout_hash = 0; if (bindless_sampler_count) { - XXH64_state_t hash_state; - XXH64_reset(&hash_state, 0); + XXH3_state_t hash_state; + XXH3_64bits_reset(&hash_state); for (uint32_t i = 0; i < bindless_sampler_count; ++i) { - XXH64_update(&hash_state, - &sampler_bindings[i].bindless_descriptor_index, - sizeof(sampler_bindings[i].bindless_descriptor_index)); + XXH3_64bits_update( + &hash_state, &sampler_bindings[i].bindless_descriptor_index, + sizeof(sampler_bindings[i].bindless_descriptor_index)); } - bindless_sampler_layout_hash = XXH64_digest(&hash_state); + bindless_sampler_layout_hash = XXH3_64bits_digest(&hash_state); } // Obtain the unique IDs of binding layouts if there are any texture // bindings or bindless samplers, for invalidation in the command processor. diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index d09d373b8..f4c79a213 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -95,7 +95,7 @@ class PipelineCache { reg::SQ_PROGRAM_CNTL sq_program_cntl; - static constexpr uint32_t kVersion = 0x20201129; + static constexpr uint32_t kVersion = 0x20201207; }); // Update PipelineDescription::kVersion if any of the Pipeline* enums are @@ -208,7 +208,7 @@ class PipelineCache { PipelineRenderTarget render_targets[4]; - static constexpr uint32_t kVersion = 0x20201202; + static constexpr uint32_t kVersion = 0x20201207; }); XEPACKEDSTRUCT(PipelineStoredDescription, { @@ -279,7 +279,7 @@ class PipelineCache { // Texture binding layouts of different shaders, for obtaining layout UIDs. std::vector texture_binding_layouts_; // Map of texture binding layouts used by shaders, for obtaining UIDs. Keys - // are XXH64 hashes of layouts, values need manual collision resolution using + // are XXH3 hashes of layouts, values need manual collision resolution using // layout_vector_offset:layout_length of texture_binding_layouts_. 
std::unordered_multimap> @@ -287,7 +287,7 @@ class PipelineCache { // Bindless sampler indices of different shaders, for obtaining layout UIDs. // For bindful, sampler count is used as the UID instead. std::vector bindless_sampler_layouts_; - // Keys are XXH64 hashes of used bindless sampler indices. + // Keys are XXH3 hashes of used bindless sampler indices. std::unordered_multimap> bindless_sampler_layout_map_; diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 44d76c9ed..23bc20c78 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -9,7 +9,6 @@ #include "xenia/gpu/d3d12/texture_cache.h" -#include "third_party/xxhash/xxhash.h" #include #include @@ -21,6 +20,7 @@ #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/profiling.h" +#include "xenia/base/xxhash.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/texture_info.h" diff --git a/src/xenia/gpu/sampler_info.cc b/src/xenia/gpu/sampler_info.cc index 916be887f..025dcd3fe 100644 --- a/src/xenia/gpu/sampler_info.cc +++ b/src/xenia/gpu/sampler_info.cc @@ -12,7 +12,7 @@ #include #include -#include "third_party/xxhash/xxhash.h" +#include "xenia/base/xxhash.h" namespace xe { namespace gpu { @@ -51,7 +51,7 @@ bool SamplerInfo::Prepare(const xenos::xe_gpu_texture_fetch_t& fetch, } uint64_t SamplerInfo::hash() const { - return XXH64(this, sizeof(SamplerInfo), 0); + return XXH3_64bits(this, sizeof(SamplerInfo)); } } // namespace gpu diff --git a/src/xenia/gpu/texture_conversion.cc b/src/xenia/gpu/texture_conversion.cc index 27c228780..bd028f47e 100644 --- a/src/xenia/gpu/texture_conversion.cc +++ b/src/xenia/gpu/texture_conversion.cc @@ -18,8 +18,7 @@ #include "xenia/base/math.h" #include "xenia/base/memory.h" #include "xenia/base/profiling.h" - -#include "third_party/xxhash/xxhash.h" +#include "xenia/base/xxhash.h" namespace xe { namespace gpu { diff 
--git a/src/xenia/gpu/texture_info.cc b/src/xenia/gpu/texture_info.cc index b20096d19..d190fb31e 100644 --- a/src/xenia/gpu/texture_info.cc +++ b/src/xenia/gpu/texture_info.cc @@ -16,8 +16,7 @@ #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/memory.h" - -#include "third_party/xxhash/xxhash.h" +#include "xenia/base/xxhash.h" namespace xe { namespace gpu { @@ -319,7 +318,7 @@ bool TextureInfo::GetPackedTileOffset(int packed_tile, uint32_t* offset_x, } uint64_t TextureInfo::hash() const { - return XXH64(this, sizeof(TextureInfo), 0); + return XXH3_64bits(this, sizeof(TextureInfo)); } void TextureInfo::SetupMemoryInfo(uint32_t base_address, uint32_t mip_address) { diff --git a/src/xenia/gpu/vulkan/buffer_cache.cc b/src/xenia/gpu/vulkan/buffer_cache.cc index e953e72a9..31aed6982 100644 --- a/src/xenia/gpu/vulkan/buffer_cache.cc +++ b/src/xenia/gpu/vulkan/buffer_cache.cc @@ -552,14 +552,14 @@ std::pair BufferCache::UploadVertexBuffer( } void BufferCache::HashVertexBindings( - XXH64_state_t* hash_state, + XXH3_state_t* hash_state, const std::vector& vertex_bindings) { auto& regs = *register_file_; for (const auto& vertex_binding : vertex_bindings) { #if 0 - XXH64_update(hash_state, &vertex_binding.binding_index, sizeof(vertex_binding.binding_index)); - XXH64_update(hash_state, &vertex_binding.fetch_constant, sizeof(vertex_binding.fetch_constant)); - XXH64_update(hash_state, &vertex_binding.stride_words, sizeof(vertex_binding.stride_words)); + XXH3_64bits_update(hash_state, &vertex_binding.binding_index, sizeof(vertex_binding.binding_index)); + XXH3_64bits_update(hash_state, &vertex_binding.fetch_constant, sizeof(vertex_binding.fetch_constant)); + XXH3_64bits_update(hash_state, &vertex_binding.stride_words, sizeof(vertex_binding.stride_words)); #endif int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (vertex_binding.fetch_constant / 3) * 6; @@ -567,15 +567,15 @@ void BufferCache::HashVertexBindings( switch (vertex_binding.fetch_constant % 3) 
{ case 0: { auto& fetch = group->vertex_fetch_0; - XXH64_update(hash_state, &fetch, sizeof(fetch)); + XXH3_64bits_update(hash_state, &fetch, sizeof(fetch)); } break; case 1: { auto& fetch = group->vertex_fetch_1; - XXH64_update(hash_state, &fetch, sizeof(fetch)); + XXH3_64bits_update(hash_state, &fetch, sizeof(fetch)); } break; case 2: { auto& fetch = group->vertex_fetch_2; - XXH64_update(hash_state, &fetch, sizeof(fetch)); + XXH3_64bits_update(hash_state, &fetch, sizeof(fetch)); } break; } } @@ -585,12 +585,12 @@ VkDescriptorSet BufferCache::PrepareVertexSet( VkCommandBuffer command_buffer, VkFence fence, const std::vector& vertex_bindings) { // (quickly) Generate a hash. - XXH64_state_t hash_state; - XXH64_reset(&hash_state, 0); + XXH3_state_t hash_state; + XXH3_64bits_reset(&hash_state); // (quickly) Generate a hash. HashVertexBindings(&hash_state, vertex_bindings); - uint64_t hash = XXH64_digest(&hash_state); + uint64_t hash = XXH3_64bits_digest(&hash_state); for (auto it = vertex_sets_.find(hash); it != vertex_sets_.end(); ++it) { // TODO(DrChat): We need to compare the bindings and ensure they're equal. return it->second; diff --git a/src/xenia/gpu/vulkan/buffer_cache.h b/src/xenia/gpu/vulkan/buffer_cache.h index f53359cd3..4080b1803 100644 --- a/src/xenia/gpu/vulkan/buffer_cache.h +++ b/src/xenia/gpu/vulkan/buffer_cache.h @@ -10,6 +10,7 @@ #ifndef XENIA_GPU_VULKAN_BUFFER_CACHE_H_ #define XENIA_GPU_VULKAN_BUFFER_CACHE_H_ +#include "xenia/base/xxhash.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/shader.h" #include "xenia/gpu/xenos.h" @@ -20,7 +21,6 @@ #include "xenia/ui/vulkan/vulkan_device.h" #include "third_party/vulkan/vk_mem_alloc.h" -#include "third_party/xxhash/xxhash.h" #include #include @@ -127,7 +127,7 @@ class BufferCache { void FreeConstantDescriptorSet(); void HashVertexBindings( - XXH64_state_t* hash_state, + XXH3_state_t* hash_state, const std::vector& vertex_bindings); // Allocates a block of memory in the transient buffer. 
diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index 3ab45245c..52bb607f4 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -9,11 +9,11 @@ #include "xenia/gpu/vulkan/pipeline_cache.h" -#include "third_party/xxhash/xxhash.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/memory.h" #include "xenia/base/profiling.h" +#include "xenia/base/xxhash.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/vulkan/vulkan_gpu_flags.h" @@ -208,7 +208,8 @@ VulkanShader* PipelineCache::LoadShader(xenos::ShaderType shader_type, const uint32_t* host_address, uint32_t dword_count) { // Hash the input memory and lookup the shader. - uint64_t data_hash = XXH64(host_address, dword_count * sizeof(uint32_t), 0); + uint64_t data_hash = + XXH3_64bits(host_address, dword_count * sizeof(uint32_t)); auto it = shader_map_.find(data_hash); if (it != shader_map_.end()) { // Shader has been previously loaded. @@ -259,7 +260,7 @@ PipelineCache::UpdateStatus PipelineCache::ConfigurePipeline( } if (!pipeline) { // Should have a hash key produced by the UpdateState pass. - uint64_t hash_key = XXH64_digest(&hash_state_); + uint64_t hash_key = XXH3_64bits_digest(&hash_state_); pipeline = GetPipeline(render_state, hash_key); current_pipeline_ = pipeline; if (!pipeline) { @@ -961,7 +962,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateState( bool mismatch = false; // Reset hash so we can build it up. 
- XXH64_reset(&hash_state_, 0); + XXH3_64bits_reset(&hash_state_); #define CHECK_UPDATE_STATUS(status, mismatch, error_message) \ { \ @@ -1028,7 +1029,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRenderTargetState() { regs.rb_color1_info.color_format = cur_regs->rb_color1_info.color_format; regs.rb_color2_info.color_format = cur_regs->rb_color2_info.color_format; regs.rb_color3_info.color_format = cur_regs->rb_color3_info.color_format; - XXH64_update(&hash_state_, ®s, sizeof(regs)); + XXH3_64bits_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } @@ -1061,7 +1062,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( regs.vertex_shader = vertex_shader; regs.pixel_shader = pixel_shader; regs.primitive_type = primitive_type; - XXH64_update(&hash_state_, ®s, sizeof(regs)); + XXH3_64bits_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } @@ -1148,7 +1149,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState( bool dirty = false; dirty |= vertex_shader != regs.vertex_shader; regs.vertex_shader = vertex_shader; - XXH64_update(&hash_state_, ®s, sizeof(regs)); + XXH3_64bits_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } @@ -1177,7 +1178,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateInputAssemblyState( dirty |= SetShadowRegister(®s.multi_prim_ib_reset_index, XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX); regs.primitive_type = primitive_type; - XXH64_update(&hash_state_, ®s, sizeof(regs)); + XXH3_64bits_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } @@ -1303,7 +1304,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( dirty = true; } - XXH64_update(&hash_state_, ®s, sizeof(regs)); + XXH3_64bits_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } @@ -1385,7 +1386,7 @@ PipelineCache::UpdateStatus 
PipelineCache::UpdateMultisampleState() { dirty |= SetShadowRegister(®s.pa_su_sc_mode_cntl, XE_GPU_REG_PA_SU_SC_MODE_CNTL); dirty |= SetShadowRegister(®s.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO); - XXH64_update(&hash_state_, ®s, sizeof(regs)); + XXH3_64bits_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } @@ -1437,7 +1438,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() { dirty |= SetShadowRegister(®s.rb_depthcontrol, XE_GPU_REG_RB_DEPTHCONTROL); dirty |= SetShadowRegister(®s.rb_stencilrefmask, XE_GPU_REG_RB_STENCILREFMASK); - XXH64_update(&hash_state_, ®s, sizeof(regs)); + XXH3_64bits_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } @@ -1526,7 +1527,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateColorBlendState() { dirty |= SetShadowRegister(®s.rb_blendcontrol[3], XE_GPU_REG_RB_BLENDCONTROL3); dirty |= SetShadowRegister(®s.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL); - XXH64_update(&hash_state_, ®s, sizeof(regs)); + XXH3_64bits_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index 693dd4594..d6a88fdcf 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -12,8 +12,7 @@ #include -#include "third_party/xxhash/xxhash.h" - +#include "xenia/base/xxhash.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/spirv_shader_translator.h" #include "xenia/gpu/vulkan/render_cache.h" @@ -121,7 +120,7 @@ class PipelineCache { // Hash state used to incrementally produce pipeline hashes during update. // By the time the full update pass has run the hash will represent the // current state in a way that can uniquely identify the produced VkPipeline. - XXH64_state_t hash_state_; + XXH3_state_t hash_state_; // All previously generated pipelines mapped by hash. 
std::unordered_map cached_pipelines_; diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index fa6cdcd69..0e9d3ee26 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -1377,7 +1377,7 @@ void TextureCache::WritebackTexture(Texture* texture) { } void TextureCache::HashTextureBindings( - XXH64_state_t* hash_state, uint32_t& fetch_mask, + XXH3_state_t* hash_state, uint32_t& fetch_mask, const std::vector& bindings) { for (auto& binding : bindings) { uint32_t fetch_bit = 1 << binding.fetch_constant; @@ -1393,7 +1393,7 @@ void TextureCache::HashTextureBindings( reinterpret_cast(®s.values[r]); auto& fetch = group->texture_fetch; - XXH64_update(hash_state, &fetch, sizeof(fetch)); + XXH3_64bits_update(hash_state, &fetch, sizeof(fetch)); } } @@ -1401,14 +1401,14 @@ VkDescriptorSet TextureCache::PrepareTextureSet( VkCommandBuffer command_buffer, VkFence completion_fence, const std::vector& vertex_bindings, const std::vector& pixel_bindings) { - XXH64_state_t hash_state; - XXH64_reset(&hash_state, 0); + XXH3_state_t hash_state; + XXH3_64bits_reset(&hash_state); // (quickly) Generate a hash. uint32_t fetch_mask = 0; HashTextureBindings(&hash_state, fetch_mask, vertex_bindings); HashTextureBindings(&hash_state, fetch_mask, pixel_bindings); - uint64_t hash = XXH64_digest(&hash_state); + uint64_t hash = XXH3_64bits_digest(&hash_state); for (auto it = texture_sets_.find(hash); it != texture_sets_.end(); ++it) { // TODO(DrChat): We need to compare the bindings and ensure they're equal. 
return it->second; diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index f4d2ad564..70db17472 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -186,7 +186,7 @@ class TextureCache { bool UploadTexture(VkCommandBuffer command_buffer, VkFence completion_fence, Texture* dest, const TextureInfo& src); - void HashTextureBindings(XXH64_state_t* hash_state, uint32_t& fetch_mask, + void HashTextureBindings(XXH3_state_t* hash_state, uint32_t& fetch_mask, const std::vector& bindings); bool SetupTextureBindings( VkCommandBuffer command_buffer, VkFence completion_fence, diff --git a/third_party/xxhash b/third_party/xxhash new file mode 160000 index 000000000..4c881f796 --- /dev/null +++ b/third_party/xxhash @@ -0,0 +1 @@ +Subproject commit 4c881f796d6af27ef7d9c48f87817da0d3d75dc1 diff --git a/third_party/xxhash/LICENSE b/third_party/xxhash/LICENSE deleted file mode 100644 index 7de801ed1..000000000 --- a/third_party/xxhash/LICENSE +++ /dev/null @@ -1,24 +0,0 @@ -xxHash Library -Copyright (c) 2012-2014, Yann Collet -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, this - list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/xxhash/Makefile b/third_party/xxhash/Makefile deleted file mode 100644 index 94cf4a939..000000000 --- a/third_party/xxhash/Makefile +++ /dev/null @@ -1,67 +0,0 @@ -# ################################################################ -# xxHash Makefile -# Copyright (C) Yann Collet 2012-2014 -# GPL v2 License -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -# -# You can contact the author at : -# - xxHash source repository : http://code.google.com/p/xxhash/ -# ################################################################ -# xxHash.exe : benchmark program, to demonstrate xxHash speed -# ################################################################ - -CC := $(CC) -CFLAGS ?= -O3 -CFLAGS += -I. 
-std=c99 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Wstrict-prototypes - - -# Define *.exe as extension for Windows systems -ifneq (,$(filter Windows%,$(OS))) -EXT =.exe -else -EXT = -endif - - -default: xxhsum - -all: xxhsum xxhsum32 - -xxhsum: xxhash.c xxhsum.c - $(CC) $(CFLAGS) $^ -o $@$(EXT) - ln -sf $@ xxh32sum - ln -sf $@ xxh64sum - -xxhsum32: xxhash.c xxhsum.c - $(CC) -m32 $(CFLAGS) $^ -o $@$(EXT) - -test: $(TEST_TARGETS) - -test: xxhsum - ./xxhsum < xxhash.c - ./xxhsum -b xxhash.c - valgrind --leak-check=yes ./xxhsum -bi1 xxhash.c - valgrind --leak-check=yes ./xxhsum -H0 xxhash.c - valgrind --leak-check=yes ./xxhsum -H1 xxhash.c - -test-all: test xxhsum32 - ./xxhsum32 -b xxhash.c - -clean: - @rm -f core *.o xxhsum$(EXT) xxhsum32$(EXT) xxh32sum xxh64sum - @echo cleaning completed - - diff --git a/third_party/xxhash/README.md b/third_party/xxhash/README.md deleted file mode 100644 index 06f63764c..000000000 --- a/third_party/xxhash/README.md +++ /dev/null @@ -1,74 +0,0 @@ -xxHash - Extremely fast hash algorithm -====================================== - -xxHash is an Extremely fast Hash algorithm, running at RAM speed limits. -It successfully passes the [SMHasher](http://code.google.com/p/smhasher/wiki/SMHasher) Test suite evaluating Hash quality. - -|Branch |Status | -|------------|---------| -|master | [![Build Status](https://travis-ci.org/Cyan4973/xxHash.svg?branch=master)](https://travis-ci.org/Cyan4973/xxHash?branch=master) | -|dev | [![Build Status](https://travis-ci.org/Cyan4973/xxHash.svg?branch=dev)](https://travis-ci.org/Cyan4973/xxHash?branch=dev) | - - -Benchmarks -------------------------- - -The benchmark uses SMHasher speed test, compiled with Visual on a Windows Seven 32 bits system. -The reference system uses a Core 2 Duo @3GHz - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameSpeedQ.ScoreAuthor
xxHash5.4 GB/s10Y.C.
MumurHash 3a2.7 GB/s10Austin Appleby
SBox1.4 GB/s9Bret Mulvey
Lookup31.2 GB/s9Bob Jenkins
CityHash641.05 GB/s10Pike & Alakuijala
FNV0.55 GB/s5Fowler, Noll, Vo
CRC320.43 GB/s9
SipHash0.34 GB/s10Jean-Philippe Aumasson
MD5-320.33 GB/s10Ronald L. Rivest
SHA1-320.28 GB/s10
- - -Q.Score is a measure of quality of the hash function. -It depends on successfully passing SMHasher test set. -10 is a perfect score. - -A new version, XXH64, has been created thanks to Mathias Westerdahl contribution, which offers superior speed and dispersion for 64-bits systems. Note however that 32-bits applications will still run faster using the 32-bits version. - -SMHasher speed test, compiled using GCC 4.8.2, a Linux Mint 64-bits. -The reference system uses a Core i5-3340M @2.7GHz - -| Version | Speed on 64-bits | Speed on 32-bits | -|------------|------------------|------------------| -| XXH64 | 13.8 GB/s | 1.9 GB/s | -| XXH32 | 6.8 GB/s | 6.0 GB/s | - - -This is an official mirror of xxHash project, [hosted on Google Code](http://code.google.com/p/xxhash/). -The intention is to offer github's capabilities to xxhash users, such as cloning, branch, pull requests or source download. - -The "master" branch will reflect, the status of xxhash at its official homepage. The "dev" branch is the one where all contributions will be merged. If you plan to propose a patch, please commit into the "dev" branch. Direct commit to "master" are not permitted. Feature branches will also exist, typically to introduce new requirements, and be temporarily available for testing before merge into "dev" branch. diff --git a/third_party/xxhash/README.xenia b/third_party/xxhash/README.xenia deleted file mode 100644 index b4b90c1f8..000000000 --- a/third_party/xxhash/README.xenia +++ /dev/null @@ -1,2 +0,0 @@ -https://code.google.com/p/xxhash/ -r39 on 12/23/2014 diff --git a/third_party/xxhash/xxhash.c b/third_party/xxhash/xxhash.c deleted file mode 100644 index 24a64b5f8..000000000 --- a/third_party/xxhash/xxhash.c +++ /dev/null @@ -1,928 +0,0 @@ -/* -xxHash - Fast Hash algorithm -Copyright (C) 2012-2014, Yann Collet. 
-BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -* Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -You can contact the author at : -- xxHash source repository : http://code.google.com/p/xxhash/ -- public discussion board : https://groups.google.com/forum/#!forum/lz4c -*/ - - -//************************************** -// Tuning parameters -//************************************** -// Unaligned memory access is automatically enabled for "common" CPU, such as x86. -// For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected. -// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance. 
-// You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32). -#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) -# define XXH_USE_UNALIGNED_ACCESS 1 -#endif - -// XXH_ACCEPT_NULL_INPUT_POINTER : -// If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. -// When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. -// This option has a very small performance cost (only measurable on small inputs). -// By default, this option is disabled. To enable it, uncomment below define : -// #define XXH_ACCEPT_NULL_INPUT_POINTER 1 - -// XXH_FORCE_NATIVE_FORMAT : -// By default, xxHash library provides endian-independant Hash values, based on little-endian convention. -// Results are therefore identical for little-endian and big-endian CPU. -// This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. -// Should endian-independance be of no importance for your application, you may set the #define below to 1. -// It will improve speed for Big-endian CPU. -// This option has no impact on Little_Endian CPU. 
-#define XXH_FORCE_NATIVE_FORMAT 0 - -//************************************** -// Compiler Specific Options -//************************************** -// Disable some Visual warning messages -#ifdef _MSC_VER // Visual Studio -# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant -#endif - -#ifdef _MSC_VER // Visual Studio -# define FORCE_INLINE static __forceinline -#else -# ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif -#endif - -//************************************** -// Includes & Memory related functions -//************************************** -#include "xxhash.h" -// Modify the local functions below should you wish to use some other memory routines -// for malloc(), free() -#include -static void* XXH_malloc(size_t s) { return malloc(s); } -static void XXH_free (void* p) { free(p); } -// for memcpy() -#include -static void* XXH_memcpy(void* dest, const void* src, size_t size) -{ - return memcpy(dest,src,size); -} - - -//************************************** -// Basic Types -//************************************** -#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 -# include -typedef uint8_t BYTE; -typedef uint16_t U16; -typedef uint32_t U32; -typedef int32_t S32; -typedef uint64_t U64; -#else -typedef unsigned char BYTE; -typedef unsigned short U16; -typedef unsigned int U32; -typedef signed int S32; -typedef unsigned long long U64; -#endif - -#if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS) -# define _PACKED __attribute__ ((packed)) -#else -# define _PACKED -#endif - -#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# ifdef __IBMC__ -# pragma pack(1) -# else -# pragma pack(push, 1) -# endif -#endif - -typedef struct _U32_S -{ - U32 v; -} _PACKED U32_S; -typedef struct _U64_S -{ - U64 v; -} _PACKED U64_S; - -#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) -# pragma pack(pop) 
-#endif - -#define A32(x) (((U32_S *)(x))->v) -#define A64(x) (((U64_S *)(x))->v) - - -//*************************************** -// Compiler-specific Functions and Macros -//*************************************** -#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) - -// Note : although _rotl exists for minGW (GCC under windows), performance seems poor -#if defined(_MSC_VER) -# define XXH_rotl32(x,r) _rotl(x,r) -# define XXH_rotl64(x,r) _rotl64(x,r) -#else -# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) -# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) -#endif - -#if defined(_MSC_VER) // Visual Studio -# define XXH_swap32 _byteswap_ulong -# define XXH_swap64 _byteswap_uint64 -#elif GCC_VERSION >= 403 -# define XXH_swap32 __builtin_bswap32 -# define XXH_swap64 __builtin_bswap64 -#else -static inline U32 XXH_swap32 (U32 x) -{ - return ((x << 24) & 0xff000000 ) | - ((x << 8) & 0x00ff0000 ) | - ((x >> 8) & 0x0000ff00 ) | - ((x >> 24) & 0x000000ff ); -} -static inline U64 XXH_swap64 (U64 x) -{ - return ((x << 56) & 0xff00000000000000ULL) | - ((x << 40) & 0x00ff000000000000ULL) | - ((x << 24) & 0x0000ff0000000000ULL) | - ((x << 8) & 0x000000ff00000000ULL) | - ((x >> 8) & 0x00000000ff000000ULL) | - ((x >> 24) & 0x0000000000ff0000ULL) | - ((x >> 40) & 0x000000000000ff00ULL) | - ((x >> 56) & 0x00000000000000ffULL); -} -#endif - - -//************************************** -// Constants -//************************************** -#define PRIME32_1 2654435761U -#define PRIME32_2 2246822519U -#define PRIME32_3 3266489917U -#define PRIME32_4 668265263U -#define PRIME32_5 374761393U - -#define PRIME64_1 11400714785074694791ULL -#define PRIME64_2 14029467366897019727ULL -#define PRIME64_3 1609587929392839161ULL -#define PRIME64_4 9650029242287828579ULL -#define PRIME64_5 2870177450012600261ULL - -//************************************** -// Architecture Macros -//************************************** -typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } 
XXH_endianess; -#ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch -static const int one = 1; -# define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) -#endif - - -//************************************** -// Macros -//************************************** -#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations - - -//**************************** -// Memory reads -//**************************** -typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; - -FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) -{ - if (align==XXH_unaligned) - return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr)); - else - return endian==XXH_littleEndian ? *(U32*)ptr : XXH_swap32(*(U32*)ptr); -} - -FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) -{ - return XXH_readLE32_align(ptr, endian, XXH_unaligned); -} - -FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) -{ - if (align==XXH_unaligned) - return endian==XXH_littleEndian ? A64(ptr) : XXH_swap64(A64(ptr)); - else - return endian==XXH_littleEndian ? 
*(U64*)ptr : XXH_swap64(*(U64*)ptr); -} - -FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) -{ - return XXH_readLE64_align(ptr, endian, XXH_unaligned); -} - - -//**************************** -// Simple Hash Functions -//**************************** -FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) -{ - const BYTE* p = (const BYTE*)input; - const BYTE* bEnd = p + len; - U32 h32; -#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) - -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (p==NULL) - { - len=0; - bEnd=p=(const BYTE*)(size_t)16; - } -#endif - - if (len>=16) - { - const BYTE* const limit = bEnd - 16; - U32 v1 = seed + PRIME32_1 + PRIME32_2; - U32 v2 = seed + PRIME32_2; - U32 v3 = seed + 0; - U32 v4 = seed - PRIME32_1; - - do - { - v1 += XXH_get32bits(p) * PRIME32_2; - v1 = XXH_rotl32(v1, 13); - v1 *= PRIME32_1; - p+=4; - v2 += XXH_get32bits(p) * PRIME32_2; - v2 = XXH_rotl32(v2, 13); - v2 *= PRIME32_1; - p+=4; - v3 += XXH_get32bits(p) * PRIME32_2; - v3 = XXH_rotl32(v3, 13); - v3 *= PRIME32_1; - p+=4; - v4 += XXH_get32bits(p) * PRIME32_2; - v4 = XXH_rotl32(v4, 13); - v4 *= PRIME32_1; - p+=4; - } - while (p<=limit); - - h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); - } - else - { - h32 = seed + PRIME32_5; - } - - h32 += (U32) len; - - while (p+4<=bEnd) - { - h32 += XXH_get32bits(p) * PRIME32_3; - h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; - p+=4; - } - - while (p> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - - return h32; -} - - -unsigned int XXH32 (const void* input, size_t len, unsigned seed) -{ -#if 0 - // Simple version, good for code maintenance, but unfortunately slow for small inputs - XXH32_state_t state; - XXH32_reset(&state, seed); - XXH32_update(&state, input, len); - return XXH32_digest(&state); -#else - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - -# 
if !defined(XXH_USE_UNALIGNED_ACCESS) - if ((((size_t)input) & 3) == 0) // Input is aligned, let's leverage the speed advantage - { - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); - else - return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); - } -# endif - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); - else - return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); -#endif -} - -FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) -{ - const BYTE* p = (const BYTE*)input; - const BYTE* bEnd = p + len; - U64 h64; -#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) - -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (p==NULL) - { - len=0; - bEnd=p=(const BYTE*)(size_t)32; - } -#endif - - if (len>=32) - { - const BYTE* const limit = bEnd - 32; - U64 v1 = seed + PRIME64_1 + PRIME64_2; - U64 v2 = seed + PRIME64_2; - U64 v3 = seed + 0; - U64 v4 = seed - PRIME64_1; - - do - { - v1 += XXH_get64bits(p) * PRIME64_2; - p+=8; - v1 = XXH_rotl64(v1, 31); - v1 *= PRIME64_1; - v2 += XXH_get64bits(p) * PRIME64_2; - p+=8; - v2 = XXH_rotl64(v2, 31); - v2 *= PRIME64_1; - v3 += XXH_get64bits(p) * PRIME64_2; - p+=8; - v3 = XXH_rotl64(v3, 31); - v3 *= PRIME64_1; - v4 += XXH_get64bits(p) * PRIME64_2; - p+=8; - v4 = XXH_rotl64(v4, 31); - v4 *= PRIME64_1; - } - while (p<=limit); - - h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); - - v1 *= PRIME64_2; - v1 = XXH_rotl64(v1, 31); - v1 *= PRIME64_1; - h64 ^= v1; - h64 = h64 * PRIME64_1 + PRIME64_4; - - v2 *= PRIME64_2; - v2 = XXH_rotl64(v2, 31); - v2 *= PRIME64_1; - h64 ^= v2; - h64 = h64 * PRIME64_1 + PRIME64_4; - - v3 *= PRIME64_2; - v3 = XXH_rotl64(v3, 31); - v3 *= PRIME64_1; - h64 ^= v3; - h64 = 
h64 * PRIME64_1 + PRIME64_4; - - v4 *= PRIME64_2; - v4 = XXH_rotl64(v4, 31); - v4 *= PRIME64_1; - h64 ^= v4; - h64 = h64 * PRIME64_1 + PRIME64_4; - } - else - { - h64 = seed + PRIME64_5; - } - - h64 += (U64) len; - - while (p+8<=bEnd) - { - U64 k1 = XXH_get64bits(p); - k1 *= PRIME64_2; - k1 = XXH_rotl64(k1,31); - k1 *= PRIME64_1; - h64 ^= k1; - h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; - p+=8; - } - - if (p+4<=bEnd) - { - h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; - h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; - p+=4; - } - - while (p> 33; - h64 *= PRIME64_2; - h64 ^= h64 >> 29; - h64 *= PRIME64_3; - h64 ^= h64 >> 32; - - return h64; -} - - -unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed) -{ -#if 0 - // Simple version, good for code maintenance, but unfortunately slow for small inputs - XXH64_state_t state; - XXH64_reset(&state, seed); - XXH64_update(&state, input, len); - return XXH64_digest(&state); -#else - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - -# if !defined(XXH_USE_UNALIGNED_ACCESS) - if ((((size_t)input) & 7)==0) // Input is aligned, let's leverage the speed advantage - { - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); - else - return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); - } -# endif - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); - else - return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); -#endif -} - -/**************************************************** - * Advanced Hash Functions -****************************************************/ - -/*** Allocation ***/ -typedef struct -{ - U64 total_len; - U32 seed; - U32 v1; - U32 v2; - U32 v3; - U32 v4; - U32 mem32[4]; /* defined as U32 for alignment */ - U32 memsize; -} 
XXH_istate32_t; - -typedef struct -{ - U64 total_len; - U64 seed; - U64 v1; - U64 v2; - U64 v3; - U64 v4; - U64 mem64[4]; /* defined as U64 for alignment */ - U32 memsize; -} XXH_istate64_t; - - -XXH32_state_t* XXH32_createState(void) -{ - XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t)); // A compilation error here means XXH32_state_t is not large enough - return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); -} -XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -}; - -XXH64_state_t* XXH64_createState(void) -{ - XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t)); // A compilation error here means XXH64_state_t is not large enough - return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); -} -XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -}; - - -/*** Hash feed ***/ - -XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed) -{ - XXH_istate32_t* state = (XXH_istate32_t*) state_in; - state->seed = seed; - state->v1 = seed + PRIME32_1 + PRIME32_2; - state->v2 = seed + PRIME32_2; - state->v3 = seed + 0; - state->v4 = seed - PRIME32_1; - state->total_len = 0; - state->memsize = 0; - return XXH_OK; -} - -XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed) -{ - XXH_istate64_t* state = (XXH_istate64_t*) state_in; - state->seed = seed; - state->v1 = seed + PRIME64_1 + PRIME64_2; - state->v2 = seed + PRIME64_2; - state->v3 = seed + 0; - state->v4 = seed - PRIME64_1; - state->total_len = 0; - state->memsize = 0; - return XXH_OK; -} - - -FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian) -{ - XXH_istate32_t* state = (XXH_istate32_t *) state_in; - const BYTE* p = (const BYTE*)input; - const BYTE* const bEnd = p + len; - -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (input==NULL) return XXH_ERROR; -#endif - - state->total_len += len; - - if 
(state->memsize + len < 16) // fill in tmp buffer - { - XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); - state->memsize += (U32)len; - return XXH_OK; - } - - if (state->memsize) // some data left from previous update - { - XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); - { - const U32* p32 = state->mem32; - state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; - state->v1 = XXH_rotl32(state->v1, 13); - state->v1 *= PRIME32_1; - p32++; - state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; - state->v2 = XXH_rotl32(state->v2, 13); - state->v2 *= PRIME32_1; - p32++; - state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; - state->v3 = XXH_rotl32(state->v3, 13); - state->v3 *= PRIME32_1; - p32++; - state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; - state->v4 = XXH_rotl32(state->v4, 13); - state->v4 *= PRIME32_1; - p32++; - } - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= bEnd-16) - { - const BYTE* const limit = bEnd - 16; - U32 v1 = state->v1; - U32 v2 = state->v2; - U32 v3 = state->v3; - U32 v4 = state->v4; - - do - { - v1 += XXH_readLE32(p, endian) * PRIME32_2; - v1 = XXH_rotl32(v1, 13); - v1 *= PRIME32_1; - p+=4; - v2 += XXH_readLE32(p, endian) * PRIME32_2; - v2 = XXH_rotl32(v2, 13); - v2 *= PRIME32_1; - p+=4; - v3 += XXH_readLE32(p, endian) * PRIME32_2; - v3 = XXH_rotl32(v3, 13); - v3 *= PRIME32_1; - p+=4; - v4 += XXH_readLE32(p, endian) * PRIME32_2; - v4 = XXH_rotl32(v4, 13); - v4 *= PRIME32_1; - p+=4; - } - while (p<=limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < bEnd) - { - XXH_memcpy(state->mem32, p, bEnd-p); - state->memsize = (int)(bEnd-p); - } - - return XXH_OK; -} - -XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_update_endian(state_in, input, len, 
XXH_littleEndian); - else - return XXH32_update_endian(state_in, input, len, XXH_bigEndian); -} - - - -FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian) -{ - XXH_istate32_t* state = (XXH_istate32_t*) state_in; - const BYTE * p = (const BYTE*)state->mem32; - BYTE* bEnd = (BYTE*)(state->mem32) + state->memsize; - U32 h32; - - if (state->total_len >= 16) - { - h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); - } - else - { - h32 = state->seed + PRIME32_5; - } - - h32 += (U32) state->total_len; - - while (p+4<=bEnd) - { - h32 += XXH_readLE32(p, endian) * PRIME32_3; - h32 = XXH_rotl32(h32, 17) * PRIME32_4; - p+=4; - } - - while (p> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - - return h32; -} - - -U32 XXH32_digest (const XXH32_state_t* state_in) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_digest_endian(state_in, XXH_littleEndian); - else - return XXH32_digest_endian(state_in, XXH_bigEndian); -} - - -FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian) -{ - XXH_istate64_t * state = (XXH_istate64_t *) state_in; - const BYTE* p = (const BYTE*)input; - const BYTE* const bEnd = p + len; - -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (input==NULL) return XXH_ERROR; -#endif - - state->total_len += len; - - if (state->memsize + len < 32) // fill in tmp buffer - { - XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); - state->memsize += (U32)len; - return XXH_OK; - } - - if (state->memsize) // some data left from previous update - { - XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); - { - const U64* p64 = state->mem64; - state->v1 += XXH_readLE64(p64, endian) * PRIME64_2; - state->v1 = XXH_rotl64(state->v1, 
31); - state->v1 *= PRIME64_1; - p64++; - state->v2 += XXH_readLE64(p64, endian) * PRIME64_2; - state->v2 = XXH_rotl64(state->v2, 31); - state->v2 *= PRIME64_1; - p64++; - state->v3 += XXH_readLE64(p64, endian) * PRIME64_2; - state->v3 = XXH_rotl64(state->v3, 31); - state->v3 *= PRIME64_1; - p64++; - state->v4 += XXH_readLE64(p64, endian) * PRIME64_2; - state->v4 = XXH_rotl64(state->v4, 31); - state->v4 *= PRIME64_1; - p64++; - } - p += 32-state->memsize; - state->memsize = 0; - } - - if (p+32 <= bEnd) - { - const BYTE* const limit = bEnd - 32; - U64 v1 = state->v1; - U64 v2 = state->v2; - U64 v3 = state->v3; - U64 v4 = state->v4; - - do - { - v1 += XXH_readLE64(p, endian) * PRIME64_2; - v1 = XXH_rotl64(v1, 31); - v1 *= PRIME64_1; - p+=8; - v2 += XXH_readLE64(p, endian) * PRIME64_2; - v2 = XXH_rotl64(v2, 31); - v2 *= PRIME64_1; - p+=8; - v3 += XXH_readLE64(p, endian) * PRIME64_2; - v3 = XXH_rotl64(v3, 31); - v3 *= PRIME64_1; - p+=8; - v4 += XXH_readLE64(p, endian) * PRIME64_2; - v4 = XXH_rotl64(v4, 31); - v4 *= PRIME64_1; - p+=8; - } - while (p<=limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < bEnd) - { - XXH_memcpy(state->mem64, p, bEnd-p); - state->memsize = (int)(bEnd-p); - } - - return XXH_OK; -} - -XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_update_endian(state_in, input, len, XXH_littleEndian); - else - return XXH64_update_endian(state_in, input, len, XXH_bigEndian); -} - - - -FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian) -{ - XXH_istate64_t * state = (XXH_istate64_t *) state_in; - const BYTE * p = (const BYTE*)state->mem64; - BYTE* bEnd = (BYTE*)state->mem64 + state->memsize; - U64 h64; - - if (state->total_len >= 32) - { - U64 v1 = state->v1; - U64 v2 = state->v2; - 
U64 v3 = state->v3; - U64 v4 = state->v4; - - h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); - - v1 *= PRIME64_2; - v1 = XXH_rotl64(v1, 31); - v1 *= PRIME64_1; - h64 ^= v1; - h64 = h64*PRIME64_1 + PRIME64_4; - - v2 *= PRIME64_2; - v2 = XXH_rotl64(v2, 31); - v2 *= PRIME64_1; - h64 ^= v2; - h64 = h64*PRIME64_1 + PRIME64_4; - - v3 *= PRIME64_2; - v3 = XXH_rotl64(v3, 31); - v3 *= PRIME64_1; - h64 ^= v3; - h64 = h64*PRIME64_1 + PRIME64_4; - - v4 *= PRIME64_2; - v4 = XXH_rotl64(v4, 31); - v4 *= PRIME64_1; - h64 ^= v4; - h64 = h64*PRIME64_1 + PRIME64_4; - } - else - { - h64 = state->seed + PRIME64_5; - } - - h64 += (U64) state->total_len; - - while (p+8<=bEnd) - { - U64 k1 = XXH_readLE64(p, endian); - k1 *= PRIME64_2; - k1 = XXH_rotl64(k1,31); - k1 *= PRIME64_1; - h64 ^= k1; - h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; - p+=8; - } - - if (p+4<=bEnd) - { - h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1; - h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; - p+=4; - } - - while (p> 33; - h64 *= PRIME64_2; - h64 ^= h64 >> 29; - h64 *= PRIME64_3; - h64 ^= h64 >> 32; - - return h64; -} - - -unsigned long long XXH64_digest (const XXH64_state_t* state_in) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_digest_endian(state_in, XXH_littleEndian); - else - return XXH64_digest_endian(state_in, XXH_bigEndian); -} - - diff --git a/third_party/xxhash/xxhash.h b/third_party/xxhash/xxhash.h deleted file mode 100644 index 55b45015a..000000000 --- a/third_party/xxhash/xxhash.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - xxHash - Extremely Fast Hash algorithm - Header File - Copyright (C) 2012-2014, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - xxHash source repository : http://code.google.com/p/xxhash/ -*/ - -/* Notice extracted from xxHash homepage : - -xxHash is an extremely fast Hash algorithm, running at RAM speed limits. -It also successfully passes all tests from the SMHasher suite. 
- -Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) - -Name Speed Q.Score Author -xxHash 5.4 GB/s 10 -CrapWow 3.2 GB/s 2 Andrew -MumurHash 3a 2.7 GB/s 10 Austin Appleby -SpookyHash 2.0 GB/s 10 Bob Jenkins -SBox 1.4 GB/s 9 Bret Mulvey -Lookup3 1.2 GB/s 9 Bob Jenkins -SuperFastHash 1.2 GB/s 1 Paul Hsieh -CityHash64 1.05 GB/s 10 Pike & Alakuijala -FNV 0.55 GB/s 5 Fowler, Noll, Vo -CRC32 0.43 GB/s 9 -MD5-32 0.33 GB/s 10 Ronald L. Rivest -SHA1-32 0.28 GB/s 10 - -Q.Score is a measure of quality of the hash function. -It depends on successfully passing SMHasher test set. -10 is a perfect score. -*/ - -#pragma once - -#if defined (__cplusplus) -extern "C" { -#endif - - -/***************************** - Includes -*****************************/ -#include /* size_t */ - - -/***************************** - Type -*****************************/ -typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; - - - -/***************************** - Simple Hash Functions -*****************************/ - -unsigned int XXH32 (const void* input, size_t length, unsigned seed); -unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed); - -/* -XXH32() : - Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input". - The memory between input & input+length must be valid (allocated and read-accessible). - "seed" can be used to alter the result predictably. - This function successfully passes all SMHasher tests. - Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s -XXH64() : - Calculate the 64-bits hash of sequence of length "len" stored at memory address "input". -*/ - - - -/***************************** - Advanced Hash Functions -*****************************/ -typedef struct { long long ll[ 6]; } XXH32_state_t; -typedef struct { long long ll[11]; } XXH64_state_t; - -/* -These structures allow static allocation of XXH states. 
-States must then be initialized using XXHnn_reset() before first use. - -If you prefer dynamic allocation, please refer to functions below. -*/ - -XXH32_state_t* XXH32_createState(void); -XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); - -XXH64_state_t* XXH64_createState(void); -XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); - -/* -These functions create and release memory for XXH state. -States must then be initialized using XXHnn_reset() before first use. -*/ - - -XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned seed); -XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); -unsigned int XXH32_digest (const XXH32_state_t* statePtr); - -XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); -XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); -unsigned long long XXH64_digest (const XXH64_state_t* statePtr); - -/* -These functions calculate the xxHash of an input provided in multiple smaller packets, -as opposed to an input provided as a single block. - -XXH state space must first be allocated, using either static or dynamic method provided above. - -Start a new hash by initializing state with a seed, using XXHnn_reset(). - -Then, feed the hash state by calling XXHnn_update() as many times as necessary. -Obviously, input must be valid, meaning allocated and read accessible. -The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. - -Finally, you can produce a hash anytime, by using XXHnn_digest(). -This function returns the final nn-bits hash. -You can nonetheless continue feeding the hash state with more input, -and therefore get some new hashes, by calling again XXHnn_digest(). - -When you are done, don't forget to free XXH state space, using typically XXHnn_freeState(). 
-*/ - - -#if defined (__cplusplus) -} -#endif diff --git a/third_party/xxhash/xxhsum.c b/third_party/xxhash/xxhsum.c deleted file mode 100644 index e090e5111..000000000 --- a/third_party/xxhash/xxhsum.c +++ /dev/null @@ -1,689 +0,0 @@ -/* -bench.c - Demo program to benchmark open-source algorithm -Copyright (C) Yann Collet 2012-2014 - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- -You can contact the author at : -- Blog homepage : http://fastcompression.blogspot.com/ -- Discussion group : https://groups.google.com/forum/?fromgroups#!forum/lz4c -*/ - -/************************************** - * Compiler Options - *************************************/ -/* MS Visual */ -#if defined(_MSC_VER) || defined(_WIN32) -# define _CRT_SECURE_NO_WARNINGS /* removes visual warnings */ -# define BMK_LEGACY_TIMER 1 /* gettimeofday() not supported by MSVC */ -#endif - -/* Under Linux at least, pull in the *64 commands */ -#define _LARGEFILE64_SOURCE - - -/************************************** - * Includes - *************************************/ -#include // malloc -#include // fprintf, fopen, ftello64, fread, stdin, stdout; when present : _fileno -#include // strcmp -#include // stat64 -#include // stat64 - -#include "xxhash.h" - - -/************************************** - * OS-Specific Includes - *************************************/ -// Use ftime() if gettimeofday() is not available on your target -#if defined(BMK_LEGACY_TIMER) -# include // timeb, ftime -#else -# include // gettimeofday -#endif - -#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__) -# include // _O_BINARY -# include // _setmode, _isatty -# ifdef __MINGW32__ - int _fileno(FILE *stream); // MINGW somehow forgets to include this windows declaration into -# endif -# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY) -# define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream)) -#else -# include // isatty, STDIN_FILENO -# define SET_BINARY_MODE(file) -# define IS_CONSOLE(stdStream) isatty(STDIN_FILENO) -#endif - -#if !defined(S_ISREG) -# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG) -#endif - - -/************************************** - * Basic Types - *************************************/ -#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 -# include - typedef uint8_t BYTE; - typedef uint16_t U16; - typedef 
uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; -#else - typedef unsigned char BYTE; - typedef unsigned short U16; - typedef unsigned int U32; - typedef signed int S32; - typedef unsigned long long U64; -#endif - - -/************************************** - * Constants - *************************************/ -#define PROGRAM_NAME exename -#define PROGRAM_VERSION "" -#define COMPILED __DATE__ -#define AUTHOR "Yann Collet" -#define WELCOME_MESSAGE "*** %s %i-bits %s, by %s (%s) ***\n", PROGRAM_NAME, (int)(sizeof(void*)*8), PROGRAM_VERSION, AUTHOR, COMPILED - -#define NBLOOPS 3 // Default number of benchmark iterations -#define TIMELOOP 2500 // Minimum timing per iteration -#define PRIME 2654435761U - -#define KB *(1<<10) -#define MB *(1<<20) -#define GB *(1U<<30) - -#define MAX_MEM (2 GB - 64 MB) - -static const char stdinName[] = "-"; - - -//************************************** -// Display macros -//************************************** -#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) -#define DISPLAYRESULT(...) fprintf(stdout, __VA_ARGS__) -#define DISPLAYLEVEL(l, ...) 
if (g_displayLevel>=l) DISPLAY(__VA_ARGS__); -static unsigned g_displayLevel = 1; - - -//************************************** -// Unit variables -//************************************** -static int g_nbIterations = NBLOOPS; -static int g_fn_selection = 1; // required within main() & usage() - - -//********************************************************* -// Benchmark Functions -//********************************************************* - -#if defined(BMK_LEGACY_TIMER) - -static int BMK_GetMilliStart(void) -{ - // Based on Legacy ftime() - // Rolls over every ~ 12.1 days (0x100000/24/60/60) - // Use GetMilliSpan to correct for rollover - struct timeb tb; - int nCount; - ftime( &tb ); - nCount = (int) (tb.millitm + (tb.time & 0xfffff) * 1000); - return nCount; -} - -#else - -static int BMK_GetMilliStart(void) -{ - // Based on newer gettimeofday() - // Use GetMilliSpan to correct for rollover - struct timeval tv; - int nCount; - gettimeofday(&tv, NULL); - nCount = (int) (tv.tv_usec/1000 + (tv.tv_sec & 0xfffff) * 1000); - return nCount; -} - -#endif - -static int BMK_GetMilliSpan( int nTimeStart ) -{ - int nSpan = BMK_GetMilliStart() - nTimeStart; - if ( nSpan < 0 ) - nSpan += 0x100000 * 1000; - return nSpan; -} - - -static size_t BMK_findMaxMem(U64 requestedMem) -{ - size_t step = (64 MB); - size_t allocatedMemory; - BYTE* testmem=NULL; - - requestedMem += 3*step; - requestedMem -= (size_t)requestedMem & (step-1); - if (requestedMem > MAX_MEM) requestedMem = MAX_MEM; - allocatedMemory = (size_t)requestedMem; - - while (!testmem) - { - allocatedMemory -= step; - testmem = (BYTE*) malloc((size_t)allocatedMemory); - } - free (testmem); - - return (size_t) (allocatedMemory - step); -} - - -static U64 BMK_GetFileSize(char* infilename) -{ - int r; -#if defined(_MSC_VER) - struct _stat64 statbuf; - r = _stat64(infilename, &statbuf); -#else - struct stat statbuf; - r = stat(infilename, &statbuf); -#endif - if (r || !S_ISREG(statbuf.st_mode)) return 0; // No good... 
- return (U64)statbuf.st_size; -} - - -static int BMK_benchFile(char** fileNamesTable, int nbFiles) -{ - int fileIdx=0; - U32 hashResult=0; - - U64 totals = 0; - double totalc = 0.; - - - // Loop for each file - while (fileIdx inFileSize) benchedSize = (size_t)inFileSize; - if (benchedSize < inFileSize) - { - DISPLAY("Not enough memory for '%s' full size; testing %i MB only...\n", inFileName, (int)(benchedSize>>20)); - } - - buffer = (char*)malloc((size_t )benchedSize+16); - if(!buffer) - { - DISPLAY("\nError: not enough memory!\n"); - fclose(inFile); - return 12; - } - alignedBuffer = (buffer+15) - (((size_t)(buffer+15)) & 0xF); // align on next 16 bytes boundaries - - // Fill input buffer - DISPLAY("\rLoading %s... \n", inFileName); - readSize = fread(alignedBuffer, 1, benchedSize, inFile); - fclose(inFile); - - if(readSize != benchedSize) - { - DISPLAY("\nError: problem reading file '%s' !! \n", inFileName); - free(buffer); - return 13; - } - - - // Bench XXH32 - { - int interationNb; - double fastestC = 100000000.; - - DISPLAY("\r%79s\r", ""); // Clean display line - for (interationNb = 1; interationNb <= g_nbIterations; interationNb++) - { - int nbHashes = 0; - int milliTime; - - DISPLAY("%1i-%-14.14s : %10i ->\r", interationNb, "XXH32", (int)benchedSize); - - // Hash loop - milliTime = BMK_GetMilliStart(); - while(BMK_GetMilliStart() == milliTime); - milliTime = BMK_GetMilliStart(); - while(BMK_GetMilliSpan(milliTime) < TIMELOOP) - { - int i; - for (i=0; i<100; i++) - { - hashResult = XXH32(alignedBuffer, benchedSize, 0); - nbHashes++; - } - } - milliTime = BMK_GetMilliSpan(milliTime); - if ((double)milliTime < fastestC*nbHashes) fastestC = (double)milliTime/nbHashes; - DISPLAY("%1i-%-14.14s : %10i -> %7.1f MB/s\r", interationNb, "XXH32", (int)benchedSize, (double)benchedSize / fastestC / 1000.); - } - DISPLAY("%-16.16s : %10i -> %7.1f MB/s 0x%08X\n", "XXH32", (int)benchedSize, (double)benchedSize / fastestC / 1000., hashResult); - - totals += benchedSize; - 
totalc += fastestC; - } - - // Bench Unaligned XXH32 - { - int interationNb; - double fastestC = 100000000.; - - DISPLAY("\r%79s\r", ""); // Clean display line - for (interationNb = 1; (interationNb <= g_nbIterations) && ((benchedSize>1)); interationNb++) - { - int nbHashes = 0; - int milliTime; - - DISPLAY("%1i-%-14.14s : %10i ->\r", interationNb, "(unaligned)", (int)benchedSize); - // Hash loop - milliTime = BMK_GetMilliStart(); - while(BMK_GetMilliStart() == milliTime); - milliTime = BMK_GetMilliStart(); - while(BMK_GetMilliSpan(milliTime) < TIMELOOP) - { - int i; - for (i=0; i<100; i++) - { - hashResult = XXH32(alignedBuffer+1, benchedSize-1, 0); - nbHashes++; - } - } - milliTime = BMK_GetMilliSpan(milliTime); - if ((double)milliTime < fastestC*nbHashes) fastestC = (double)milliTime/nbHashes; - DISPLAY("%1i-%-14.14s : %10i -> %7.1f MB/s\r", interationNb, "XXH32 (unaligned)", (int)(benchedSize-1), (double)(benchedSize-1) / fastestC / 1000.); - } - DISPLAY("%-16.16s : %10i -> %7.1f MB/s \n", "XXH32 (unaligned)", (int)benchedSize-1, (double)(benchedSize-1) / fastestC / 1000.); - } - - // Bench XXH64 - { - int interationNb; - double fastestC = 100000000.; - unsigned long long h64 = 0; - - DISPLAY("\r%79s\r", ""); // Clean display line - for (interationNb = 1; interationNb <= g_nbIterations; interationNb++) - { - int nbHashes = 0; - int milliTime; - - DISPLAY("%1i-%-14.14s : %10i ->\r", interationNb, "XXH64", (int)benchedSize); - - // Hash loop - milliTime = BMK_GetMilliStart(); - while(BMK_GetMilliStart() == milliTime); - milliTime = BMK_GetMilliStart(); - while(BMK_GetMilliSpan(milliTime) < TIMELOOP) - { - int i; - for (i=0; i<100; i++) - { - h64 = XXH64(alignedBuffer, benchedSize, 0); - nbHashes++; - } - } - milliTime = BMK_GetMilliSpan(milliTime); - if ((double)milliTime < fastestC*nbHashes) fastestC = (double)milliTime/nbHashes; - DISPLAY("%1i-%-14.14s : %10i -> %7.1f MB/s\r", interationNb, "XXH64", (int)benchedSize, (double)benchedSize / fastestC / 1000.); - } 
- DISPLAY("%-16.16s : %10i -> %7.1f MB/s 0x%08X%08X\n", "XXH64", (int)benchedSize, (double)benchedSize / fastestC / 1000., (U32)(h64>>32), (U32)(h64)); - - totals += benchedSize; - totalc += fastestC; - } - - free(buffer); - } - - if (nbFiles > 1) - printf("%-16.16s :%11llu -> %7.1f MB/s\n", " TOTAL", (long long unsigned int)totals, (double)totals/totalc/1000.); - - return 0; -} - - - -static void BMK_checkResult(U32 r1, U32 r2) -{ - static int nbTests = 1; - - if (r1==r2) DISPLAY("\rTest%3i : %08X == %08X ok ", nbTests, r1, r2); - else - { - DISPLAY("\rERROR : Test%3i : %08X <> %08X !!!!! \n", nbTests, r1, r2); - exit(1); - } - nbTests++; -} - - -static void BMK_checkResult64(U64 r1, U64 r2) -{ - static int nbTests = 1; - - if (r1!=r2) - { - DISPLAY("\rERROR : Test%3i : 64-bits values non equals !!!!! \n", nbTests); - DISPLAY("\r %08X%08X != %08X%08X \n", (U32)(r1>>32), (U32)r1, (U32)(r2<<32), (U32)r2); - exit(1); - } - nbTests++; -} - - -static void BMK_testSequence64(void* sentence, int len, U64 seed, U64 Nresult) -{ - U64 Dresult; - XXH64_state_t state; - int index; - - Dresult = XXH64(sentence, len, seed); - BMK_checkResult64(Dresult, Nresult); - - XXH64_reset(&state, seed); - XXH64_update(&state, sentence, len); - Dresult = XXH64_digest(&state); - BMK_checkResult64(Dresult, Nresult); - - XXH64_reset(&state, seed); - for (index=0; index>24); - prime *= prime; - } - - BMK_testSequence(NULL, 0, 0, 0x02CC5D05); - BMK_testSequence(NULL, 0, PRIME, 0x36B78AE7); - BMK_testSequence(sanityBuffer, 1, 0, 0xB85CBEE5); - BMK_testSequence(sanityBuffer, 1, PRIME, 0xD5845D64); - BMK_testSequence(sanityBuffer, 14, 0, 0xE5AA0AB4); - BMK_testSequence(sanityBuffer, 14, PRIME, 0x4481951D); - BMK_testSequence(sanityBuffer, SANITY_BUFFER_SIZE, 0, 0x1F1AA412); - BMK_testSequence(sanityBuffer, SANITY_BUFFER_SIZE, PRIME, 0x498EC8E2); - - BMK_testSequence64(NULL , 0, 0, 0xEF46DB3751D8E999ULL); - BMK_testSequence64(NULL , 0, PRIME, 0xAC75FDA2929B17EFULL); - 
BMK_testSequence64(sanityBuffer, 1, 0, 0x4FCE394CC88952D8ULL); - BMK_testSequence64(sanityBuffer, 1, PRIME, 0x739840CB819FA723ULL); - BMK_testSequence64(sanityBuffer, 14, 0, 0xCFFA8DB881BC3A3DULL); - BMK_testSequence64(sanityBuffer, 14, PRIME, 0x5B9611585EFCC9CBULL); - BMK_testSequence64(sanityBuffer, SANITY_BUFFER_SIZE, 0, 0x0EAB543384F878ADULL); - BMK_testSequence64(sanityBuffer, SANITY_BUFFER_SIZE, PRIME, 0xCAA65939306F1E21ULL); - - DISPLAY("\r%79s\r", ""); // Clean display line - DISPLAYLEVEL(2, "Sanity check -- all tests ok\n"); -} - - -static int BMK_hash(const char* fileName, U32 hashNb) -{ - FILE* inFile; - size_t const blockSize = 64 KB; - size_t readSize; - char* buffer; - XXH64_state_t state; - - // Check file existence - if (fileName == stdinName) - { - inFile = stdin; - SET_BINARY_MODE(stdin); - } - else - inFile = fopen( fileName, "rb" ); - if (inFile==NULL) - { - DISPLAY( "Pb opening %s\n", fileName); - return 11; - } - - // Memory allocation & restrictions - buffer = (char*)malloc(blockSize); - if(!buffer) - { - DISPLAY("\nError: not enough memory!\n"); - fclose(inFile); - return 12; - } - - // Init - switch(hashNb) - { - case 0: - XXH32_reset((XXH32_state_t*)&state, 0); - break; - case 1: - XXH64_reset(&state, 0); - break; - default: - DISPLAY("Error : bad hash algorithm ID\n"); - fclose(inFile); - free(buffer); - return -1; - } - - - // Load file & update hash - DISPLAY("\rLoading %s... 
\r", fileName); - readSize = 1; - while (readSize) - { - readSize = fread(buffer, 1, blockSize, inFile); - switch(hashNb) - { - case 0: - XXH32_update((XXH32_state_t*)&state, buffer, readSize); - break; - case 1: - XXH64_update(&state, buffer, readSize); - break; - default: - break; - } - } - fclose(inFile); - free(buffer); - - // display Hash - switch(hashNb) - { - case 0: - { - U32 h32 = XXH32_digest((XXH32_state_t*)&state); - DISPLAYRESULT("%08x %s \n", h32, fileName); - break; - } - case 1: - { - U64 h64 = XXH64_digest(&state); - DISPLAYRESULT("%08x%08x %s \n", (U32)(h64>>32), (U32)(h64), fileName); - break; - } - default: - break; - } - - return 0; -} - - -//********************************************************* -// Main -//********************************************************* - -static int usage(const char* exename) -{ - DISPLAY( WELCOME_MESSAGE ); - DISPLAY( "Usage :\n"); - DISPLAY( " %s [arg] [filename]\n", exename); - DISPLAY( "When no filename provided, or - provided : use stdin as input\n"); - DISPLAY( "Arguments :\n"); - DISPLAY( " -H# : hash selection : 0=32bits, 1=64bits (default %i)\n", g_fn_selection); - DISPLAY( " -b : benchmark mode \n"); - DISPLAY( " -i# : number of iterations (benchmark mode; default %i)\n", g_nbIterations); - DISPLAY( " -h : help (this text)\n"); - return 0; -} - - -static int badusage(const char* exename) -{ - DISPLAY("Wrong parameters\n"); - usage(exename); - return 1; -} - - -int main(int argc, char** argv) -{ - int i, filenamesStart=0; - const char* input_filename = (char*)stdinName; - const char* exename = argv[0]; - U32 benchmarkMode = 0; - - // xxh32sum default to 32 bits checksum - if (strstr(exename, "xxh32sum")!=NULL) g_fn_selection=0; - - for(i=1; i 1) return badusage(exename); - - return BMK_hash(input_filename, g_fn_selection); -} From b7216f91f7b7681aff04cfaecd0a91bdeebc835c Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 8 Dec 2020 22:36:47 +0300 Subject: [PATCH 17/29] [D3D12] Re-add forgotten 
RenderTargetCache::EndFrame call --- src/xenia/gpu/d3d12/d3d12_command_processor.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index d355d83c6..f6af89881 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2651,6 +2651,8 @@ bool D3D12CommandProcessor::EndSubmission(bool is_swap) { bool is_closing_frame = is_swap && frame_open_; if (is_closing_frame) { + render_target_cache_->EndFrame(); + texture_cache_->EndFrame(); } From 9349cf4ff4c37caa2f2b2a27bc51111a79c0b561 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 8 Dec 2020 22:43:15 +0300 Subject: [PATCH 18/29] [D3D12] Fix custom sample position reset --- src/xenia/gpu/d3d12/deferred_command_list.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index eb8d8922e..e618931d4 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -221,7 +221,9 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, *reinterpret_cast(stream); command_list_1->SetSamplePositions( args.num_samples_per_pixel, args.num_pixels, - const_cast(args.sample_positions)); + (args.num_samples_per_pixel && args.num_pixels) + ? 
const_cast(args.sample_positions) + : nullptr); } } break; default: From bc0c2040e275d5684ca66dbf17d63e4b172052b7 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 10 Dec 2020 12:36:37 +0300 Subject: [PATCH 19/29] [DXBC] ROV: Force late Z write with kill instructions --- src/xenia/gpu/dxbc_shader_translator.h | 12 +-- src/xenia/gpu/dxbc_shader_translator_om.cc | 95 ++++++++++++---------- 2 files changed, 58 insertions(+), 49 deletions(-) diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 2ca52e7f5..87820587b 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -189,12 +189,12 @@ class DxbcShaderTranslator : public ShaderTranslator { kSysFlag_ROVStencilTest_Shift, // If the depth/stencil test has failed, but resulted in a stencil value // that is different than the one currently in the depth buffer, write it - // anyway and don't run the shader (to check if the sample may be discarded - // some way). This, however, also results in depth/stencil testing done - // entirely early even when it passes to prevent writing in divergent places - // in the shader. When the shader can kill, this must be set only for - // RB_DEPTHCONTROL EARLY_Z_ENABLE, not for alpha test/alpha to coverage - // disabled. + // anyway and don't run the rest of the shader (to check if the sample may + // be discarded some way) - use when alpha test and alpha to coverage are + // disabled. Ignored by the shader if not applicable to it (like if it has + // kill instructions or writes the depth output). + // TODO(Triang3l): Replace with an alpha-to-mask flag, check if + // (flags & (alpha test | alpha to mask)) == (always | disabled). 
kSysFlag_ROVDepthStencilEarlyWrite_Shift, kSysFlag_Count, diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index f3b964ae2..ea79b737c 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -1024,51 +1024,60 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { // temp.z = viewport maximum depth if not writing to oDepth // temp.w = whether depth/stencil has been modified DxbcOpINE(temp_w_dest, sample_depth_stencil_src, temp_w_src); - // Check if need to write. - // temp.x? = resulting sample depth/stencil - // temp.y = polygon offset if not writing to oDepth - // temp.z = viewport maximum depth if not writing to oDepth - // temp.w = free - DxbcOpIf(true, temp_w_src); - { - if (depth_stencil_early) { - // Get if early depth/stencil write is enabled to temp.w. - // temp.w = whether early depth/stencil write is enabled - system_constants_used_ |= 1ull << kSysConst_Flags_Index; - DxbcOpAnd(temp_w_dest, - DxbcSrc::CB(cbuffer_index_system_constants_, - uint32_t(CbufferRegister::kSystemConstants), - kSysConst_Flags_Vec) - .Select(kSysConst_Flags_Comp), - DxbcSrc::LU(kSysFlag_ROVDepthStencilEarlyWrite)); - // Check if need to write early. - // temp.w = free - DxbcOpIf(true, temp_w_src); - } - // Write the new depth/stencil. - if (uav_index_edram_ == kBindingIndexUnallocated) { - uav_index_edram_ = uav_count_++; - } - DxbcOpStoreUAVTyped( - DxbcDest::U(uav_index_edram_, uint32_t(UAVRegister::kEdram)), - DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kYYYY), 1, - sample_depth_stencil_src); - if (depth_stencil_early) { - // Need to still run the shader to know whether to write the - // depth/stencil value. - DxbcOpElse(); - // Set sample bit out of bits 4:7 of system_temp_rov_params_.x if need - // to write later (after checking if the sample is not discarded by a - // kill instruction, alphatest or alpha-to-coverage). 
- DxbcOpOr(DxbcDest::R(system_temp_rov_params_, 0b0001), - DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX), - DxbcSrc::LU(1 << (4 + i))); - // Close the early depth/stencil check. - DxbcOpEndIf(); + if (depth_stencil_early && !CanWriteZEarly()) { + // Set the sample bit in bits 4:7 of system_temp_rov_params_.x - always + // need to write late in this shader, as it may do something like + // explicitly killing pixels. + DxbcOpBFI(DxbcDest::R(system_temp_rov_params_, 0b0001), DxbcSrc::LU(1), + DxbcSrc::LU(4 + i), temp_w_src, + DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX)); + } else { + // Check if need to write. + // temp.x? = resulting sample depth/stencil + // temp.y = polygon offset if not writing to oDepth + // temp.z = viewport maximum depth if not writing to oDepth + // temp.w = free + DxbcOpIf(true, temp_w_src); + { + if (depth_stencil_early) { + // Get if early depth/stencil write is enabled to temp.w. + // temp.w = whether early depth/stencil write is enabled + system_constants_used_ |= 1ull << kSysConst_Flags_Index; + DxbcOpAnd(temp_w_dest, + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_Flags_Vec) + .Select(kSysConst_Flags_Comp), + DxbcSrc::LU(kSysFlag_ROVDepthStencilEarlyWrite)); + // Check if need to write early. + // temp.w = free + DxbcOpIf(true, temp_w_src); + } + // Write the new depth/stencil. + if (uav_index_edram_ == kBindingIndexUnallocated) { + uav_index_edram_ = uav_count_++; + } + DxbcOpStoreUAVTyped( + DxbcDest::U(uav_index_edram_, uint32_t(UAVRegister::kEdram)), + DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kYYYY), 1, + sample_depth_stencil_src); + if (depth_stencil_early) { + // Need to still run the shader to know whether to write the + // depth/stencil value. 
+ DxbcOpElse(); + // Set the sample bit in bits 4:7 of system_temp_rov_params_.x if need + // to write later (after checking if the sample is not discarded by a + // kill instruction, alphatest or alpha-to-coverage). + DxbcOpOr(DxbcDest::R(system_temp_rov_params_, 0b0001), + DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX), + DxbcSrc::LU(1 << (4 + i))); + // Close the early depth/stencil check. + DxbcOpEndIf(); + } } + // Close the write check. + DxbcOpEndIf(); } - // Close the write check. - DxbcOpEndIf(); // Release sample_temp. PopSystemTemp(); From 8bcfcf64525a956e29ffcd00febc134e49e2678f Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 10 Dec 2020 12:39:02 +0300 Subject: [PATCH 20/29] [GPU] Update v_mad_legacy_f32 comment --- src/xenia/gpu/ucode.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index 85b52a377..ea11f10cd 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -816,10 +816,11 @@ static_assert_size(TextureFetchInstruction, 12); // move of the third operand in case of zero multiplicands, because the term // may be -0, while the result should be +0 in this case. // http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf -// Multiply-add also appears to be not fused (the SM3 behavior instruction on -// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators -// should not use instructions that may be interpreted by the host GPU as -// fused multiply-add. +// Multiply-add also appears to be not fused; the SM3 behavior instruction on +// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32 (in 2012-2020, before +// RDNA 2, which removed v_mad_f32 as well) - shader translators should not +// use instructions that may be interpreted by the host GPU as fused +// multiply-add. 
enum class AluScalarOpcode : uint32_t { // Floating-Point Add From 34c5fc9c2f548d55f834823c13ecf022ded2651d Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 10 Dec 2020 12:41:29 +0300 Subject: [PATCH 21/29] [DXBC] ROV: Update a todo comment about early Z --- src/xenia/gpu/dxbc_shader_translator.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 87820587b..1e9891771 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -193,8 +193,10 @@ class DxbcShaderTranslator : public ShaderTranslator { // be discarded some way) - use when alpha test and alpha to coverage are // disabled. Ignored by the shader if not applicable to it (like if it has // kill instructions or writes the depth output). - // TODO(Triang3l): Replace with an alpha-to-mask flag, check if - // (flags & (alpha test | alpha to mask)) == (always | disabled). + // TODO(Triang3l): Investigate replacement with an alpha-to-mask flag, + // checking `(flags & (alpha test | alpha to mask)) == (always | disabled)`, + // taking into account the potential relation with occlusion queries (but + // should be safe at least temporarily). 
kSysFlag_ROVDepthStencilEarlyWrite_Shift, kSysFlag_Count, From 040661f3669c003dd3de23112daa711ce9902b10 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 10 Dec 2020 21:23:13 +0300 Subject: [PATCH 22/29] [D3D12] Fix a lint error in pipeline desc --- src/xenia/gpu/d3d12/pipeline_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index f4c79a213..fe867c82a 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -179,7 +179,7 @@ class PipelineCache { int32_t depth_bias; float depth_bias_slope_scaled; - PipelineStripCutIndex strip_cut_index : 2; // 2 + PipelineStripCutIndex strip_cut_index : 2; // 2 // PipelinePrimitiveTopologyType for a vertex shader. // xenos::TessellationMode for a domain shader. uint32_t primitive_topology_type_or_tessellation_mode : 2; // 4 From db1d6b1fefea608fc3279bd60be64c329ff2b397 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 10 Dec 2020 21:27:26 +0300 Subject: [PATCH 23/29] [PPC] Fix test suite name being ignored --- src/xenia/cpu/ppc/testing/ppc_testing_main.cc | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/xenia/cpu/ppc/testing/ppc_testing_main.cc b/src/xenia/cpu/ppc/testing/ppc_testing_main.cc index bb18dfcb6..1d115af1e 100644 --- a/src/xenia/cpu/ppc/testing/ppc_testing_main.cc +++ b/src/xenia/cpu/ppc/testing/ppc_testing_main.cc @@ -7,6 +7,7 @@ ****************************************************************************** */ +#include "xenia/base/cvar.h" #include "xenia/base/filesystem.h" #include "xenia/base/logging.h" #include "xenia/base/main.h" @@ -28,7 +29,7 @@ DEFINE_path(test_path, "src/xenia/cpu/ppc/testing/", "Directory scanned for test files.", "Other"); DEFINE_path(test_bin_path, "src/xenia/cpu/ppc/testing/bin/", "Directory with binary outputs of the test files.", "Other"); -DEFINE_transient_string(test_name, "", "Specifies test name.", "General"); 
+DEFINE_transient_string(test_name, "", "Test suite name.", "General"); namespace xe { namespace cpu { @@ -475,13 +476,7 @@ bool RunTests(const std::string_view test_name) { } int main(const std::vector& args) { - // Grab test name, if present. - std::string test_name; - if (args.size() >= 2) { - test_name = args[1]; - } - - return RunTests(test_name) ? 0 : 1; + return RunTests(cvars::test_name) ? 0 : 1; } } // namespace test From cb93ddf8730872fadbd4fa4616f761b389964691 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 10 Dec 2020 21:29:33 +0300 Subject: [PATCH 24/29] [PPC] vcfsx/vcfux optimization/simplification --- src/xenia/cpu/ppc/ppc_emit_altivec.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 08ea1b2fa..9c53e61ce 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -519,8 +519,8 @@ int InstrEmit_vavguw(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_vcfsx_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb, uint32_t uimm) { // (VD) <- float(VB as signed) / 2^uimm - float fuimm = static_cast(std::exp2(uimm)); - Value* v = f.Div(f.VectorConvertI2F(f.LoadVR(vb)), + float fuimm = std::ldexp(1.0f, -int32_t(uimm)); + Value* v = f.Mul(f.VectorConvertI2F(f.LoadVR(vb)), f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE)); f.StoreVR(vd, v); return 0; @@ -535,8 +535,8 @@ int InstrEmit_vcsxwfp128(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_vcfux_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb, uint32_t uimm) { // (VD) <- float(VB as unsigned) / 2^uimm - float fuimm = static_cast(std::exp2(uimm)); - Value* v = f.Div(f.VectorConvertI2F(f.LoadVR(vb), ARITHMETIC_UNSIGNED), + float fuimm = std::ldexp(1.0f, -int32_t(uimm)); + Value* v = f.Mul(f.VectorConvertI2F(f.LoadVR(vb), ARITHMETIC_UNSIGNED), f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE)); f.StoreVR(vd, v); return 0; From 
d0b849aad74e44e66c1590535785e93a001eba50 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 10 Dec 2020 21:34:37 +0300 Subject: [PATCH 25/29] [PPC] vcfsx/vcfux: Only mul if needed --- src/xenia/cpu/ppc/ppc_emit_altivec.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 9c53e61ce..770def3c1 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -519,9 +519,11 @@ int InstrEmit_vavguw(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_vcfsx_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb, uint32_t uimm) { // (VD) <- float(VB as signed) / 2^uimm - float fuimm = std::ldexp(1.0f, -int32_t(uimm)); - Value* v = f.Mul(f.VectorConvertI2F(f.LoadVR(vb)), - f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE)); + Value* v = f.VectorConvertI2F(f.LoadVR(vb)); + if (uimm) { + float fuimm = std::ldexp(1.0f, -int(uimm)); + v = f.Mul(v, f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE)); + } f.StoreVR(vd, v); return 0; } @@ -535,9 +537,11 @@ int InstrEmit_vcsxwfp128(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_vcfux_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb, uint32_t uimm) { // (VD) <- float(VB as unsigned) / 2^uimm - float fuimm = std::ldexp(1.0f, -int32_t(uimm)); - Value* v = f.Mul(f.VectorConvertI2F(f.LoadVR(vb), ARITHMETIC_UNSIGNED), - f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE)); + Value* v = f.VectorConvertI2F(f.LoadVR(vb), ARITHMETIC_UNSIGNED); + if (uimm) { + float fuimm = std::ldexp(1.0f, -int(uimm)); + v = f.Mul(v, f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE)); + } f.StoreVR(vd, v); return 0; } From 5c47a3a5880896c9a2c4622381f77ac601cceef3 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 11 Dec 2020 21:20:13 +0300 Subject: [PATCH 26/29] [x64] vcfux single rounding for 0x80000000+ --- src/xenia/cpu/backend/x64/x64_emitter.cc | 2 + src/xenia/cpu/backend/x64/x64_emitter.h | 2 + 
src/xenia/cpu/backend/x64/x64_seq_vector.cc | 42 +++++++++++++----- src/xenia/cpu/hir/opcodes.h | 49 +++++++++++++++++++++ 4 files changed, 85 insertions(+), 10 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 37d1cdc77..92f45d493 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -746,6 +746,8 @@ static const vec128_t xmm_consts[] = { /* XMMIntMaxPD */ vec128d(INT_MAX), /* XMMPosIntMinPS */ vec128f((float)0x80000000u), /* XMMQNaN */ vec128i(0x7FC00000u), + /* XMMInt127 */ vec128i(0x7Fu), + /* XMM2To32 */ vec128f(0x1.0p32f), }; // First location to try and place constants. diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 4f661a331..4a31543b6 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -114,6 +114,8 @@ enum XmmConst { XMMIntMaxPD, XMMPosIntMinPS, XMMQNaN, + XMMInt127, + XMM2To32, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 5cfb4615c..4c7fb665a 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -33,19 +33,41 @@ struct VECTOR_CONVERT_I2F static void Emit(X64Emitter& e, const EmitArgType& i) { // flags = ARITHMETIC_UNSIGNED if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // xmm0 = mask of positive values - e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF)); + // Round manually to (1.stored mantissa bits * 2^31) or to 2^32 to the + // nearest even (the only rounding mode used on AltiVec) if the number is + // 0x80000000 or greater, instead of converting src & 0x7FFFFFFF and then + // adding 2147483648.0f, which results in double rounding that can give a + // result larger than needed - see OPCODE_VECTOR_CONVERT_I2F notes. 
- // scale any values >= (unsigned)INT_MIN back to [0, INT_MAX] - e.vpsubd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0); + // [0x80000000, 0xFFFFFFFF] case: - // xmm1 = [0, INT_MAX] - e.vcvtdq2ps(i.dest, e.xmm1); + // Round to the nearest even, from (0x80000000 | 31 stored mantissa bits) + // to ((-1 << 23) | 23 stored mantissa bits), or to 0 if the result should + // be 4294967296.0f. + // xmm0 = src + 0b01111111 + ((src >> 8) & 1) + // (xmm1 also used to launch reg + mem early and to require it late) + e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMInt127)); + e.vpslld(e.xmm0, i.src1, 31 - 8); + e.vpsrld(e.xmm0, e.xmm0, 31); + e.vpaddd(e.xmm0, e.xmm0, e.xmm1); + // xmm0 = (0xFF800000 | 23 explicit mantissa bits), or 0 if overflowed + e.vpsrad(e.xmm0, e.xmm0, 8); + // Calculate the result for the [0x80000000, 0xFFFFFFFF] case - take the + // rounded mantissa, and add -1 or 0 to the exponent of 32, depending on + // whether the number should be (1.stored mantissa bits * 2^31) or 2^32. + // xmm0 = [0x80000000, 0xFFFFFFFF] case result + e.vpaddd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMM2To32)); - // scale values back above [INT_MIN, UINT_MAX] - e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); - e.vaddps(i.dest, i.dest, e.xmm0); + // [0x00000000, 0x7FFFFFFF] case + // (during vblendvps reg -> vpaddd reg -> vpaddd mem dependency): + + // Convert from signed integer to float. + // xmm1 = [0x00000000, 0x7FFFFFFF] case result + e.vcvtdq2ps(e.xmm1, i.src1); + + // Merge the two ways depending on whether the number is >= 0x80000000 + // (has high bit set). 
+ e.vblendvps(i.dest, e.xmm1, e.xmm0, i.src1); } else { e.vcvtdq2ps(i.dest, i.src1); } diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 488e7e168..1649ec9dc 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -143,6 +143,55 @@ enum Opcode { OPCODE_TRUNCATE, OPCODE_CONVERT, OPCODE_ROUND, + // Note that 2147483648.0 + (src & 0x7FFFFFFF) is not a correct way of + // performing the uint -> float conversion for large numbers on backends where + // only sint -> float is available. + // + // Take 0b11000000000000000000000101000001 as an example, + // or 1.1000000000000000000000101000001 * 2^31. + // This one has 31 mantissa bits (excluding the implicit 1.), and needs to be + // rounded to 23 bits - 8 mantissa bits need to be dropped: + // 10000000000000000000001_01000001 + // + // Rounding to the nearest even (the only rounding mode that exists on + // AltiVec, and the likely rounding mode in the implementations) should be + // done downwards - 01000001 of 1_01000001 is in [00000000, 01111111]. + // The correct mantissa in this case is: + // 1.10000000000000000000001 * 2^31. + // + // With a two-step conversion, rounding is done twice instead, which gives an + // incorrect result. + // + // First, converting the low 31 bits to float: + // The number is 0.1000000000000000000000101000001 * 2^31. + // Normalizing it, we get 1.000000000000000000000101000001 (30 significand + // bits). + // We need to round 30 bits to 23 - 7 bits need to be dropped: + // 00000000000000000000010_1000001 + // + // Rounding to the nearest even is done upwards in this case - 1000001 of + // 0_1000001 is in [1000001, 1111111]. + // The result of the sint -> float conversion is: + // 1.00000000000000000000011 * 2^30. + // + // Now 2147483648.0 (1 * 2^31) needs to be added. 
Aligning the exponents, we + // get: + // 0.|10000000000000000000001|1 * 2^31 + // + 1.|00000000000000000000000| * 2^31 + // = 1.|10000000000000000000001|1 * 2^31 + // + // At "infinite precision", the result has 24 significand bits, but only 23 + // can be stored, thus rounding to the nearest even needs to be done. 1_1 is + // (odd + 0.5). 0.5 is ambiguous, thus tie-breaking to the nearest even - + // which is above in this case - is done. The result is: + // 1.10000000000000000000010 * 2^31. + // + // This is incorrect - larger than the correctly rounded result, which is: + // 1.10000000000000000000001 * 2^31. + // + // Test cases checked on real hardware via vcfux: 0xFFFDFF7E, 0xFFFCFF7D - + // should be 0x4F7FFDFF and 0x4F7FFCFF respectively, not 0x4F7FFE00 and + // 0x4F7FFD00. OPCODE_VECTOR_CONVERT_I2F, OPCODE_VECTOR_CONVERT_F2I, OPCODE_LOAD_VECTOR_SHL, From da60649202f7e40d407bae9984d57f8fd9944c98 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Fri, 11 Dec 2020 21:21:26 +0300 Subject: [PATCH 27/29] [D3D12] Fix texture_cache.cc formatting --- src/xenia/gpu/d3d12/texture_cache.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 23bc20c78..e1f9bdcc4 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -9,7 +9,6 @@ #include "xenia/gpu/d3d12/texture_cache.h" - #include #include #include From e348d6361ed89db884446a991391cfa6fdfbed4e Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 12 Dec 2020 14:00:29 +0300 Subject: [PATCH 28/29] [PPC] Disable frsqrte tests in a way not breaking the rest --- src/xenia/cpu/ppc/testing/instr_frsqrte.s | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/xenia/cpu/ppc/testing/instr_frsqrte.s b/src/xenia/cpu/ppc/testing/instr_frsqrte.s index f114cb597..df7f0e1d2 100644 --- a/src/xenia/cpu/ppc/testing/instr_frsqrte.s +++ b/src/xenia/cpu/ppc/testing/instr_frsqrte.s @@ 
-1,21 +1,21 @@ # frsqrte tests disabled because accuracy is CPU dependent. -#test_frsqrte_1: - #_ REGISTER_IN f1 1.0 +test_frsqrte_1: + # _ REGISTER_IN f1 1.0 # frsqrte f1, f1 -# blr - #_ REGISTER_OUT f1 0.99975585937500000 + blr + # _ REGISTER_OUT f1 0.99975585937500000 # want: 0.97 -#test_frsqrte_2: - #_ REGISTER_IN f1 64.0 +test_frsqrte_2: + # _ REGISTER_IN f1 64.0 # frsqrte f1, f1 -# blr - #_ REGISTER_OUT f1 0.12496948242187500 + blr + # _ REGISTER_OUT f1 0.12496948242187500 -#test_frsqrte_3: - #_ REGISTER_IN f1 0.5 +test_frsqrte_3: + # _ REGISTER_IN f1 0.5 # frsqrte f1, f1 -# blr - #_ REGISTER_OUT f1 1.41381835937500000 + blr + # _ REGISTER_OUT f1 1.41381835937500000 # want: 1.375 From b106aa88e689128a352ca624e5e6f39513a9f17a Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 12 Dec 2020 20:01:41 +0300 Subject: [PATCH 29/29] [GPU] Complete some register bitfield declarations --- src/xenia/gpu/registers.h | 16 ++++++++-------- src/xenia/gpu/xenos.h | 7 +++---- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/xenia/gpu/registers.h b/src/xenia/gpu/registers.h index dd1a7dfc2..07986b169 100644 --- a/src/xenia/gpu/registers.h +++ b/src/xenia/gpu/registers.h @@ -254,15 +254,15 @@ union PA_SU_SC_MODE_CNTL { uint32_t msaa_enable : 1; // +15 uint32_t vtx_window_offset_enable : 1; // +16 // LINE_STIPPLE_ENABLE was added on Adreno. - uint32_t : 2; // +17 - uint32_t provoking_vtx_last : 1; // +19 - uint32_t persp_corr_dis : 1; // +20 - uint32_t multi_prim_ib_ena : 1; // +21 - uint32_t : 1; // +22 - uint32_t quad_order_enable : 1; // +23 + uint32_t : 2; // +17 + uint32_t provoking_vtx_last : 1; // +19 + uint32_t persp_corr_dis : 1; // +20 + uint32_t multi_prim_ib_ena : 1; // +21 + uint32_t : 1; // +22 + uint32_t quad_order_enable : 1; // +23 + uint32_t sc_one_quad_per_clock : 1; // +24 // WAIT_RB_IDLE_ALL_TRI and WAIT_RB_IDLE_FIRST_TRI_NEW_STATE were added on // Adreno. - // TODO(Triang3l): Find SC_ONE_QUAD_PER_CLOCK offset. 
}; uint32_t value; static constexpr Register register_index = XE_GPU_REG_PA_SU_SC_MODE_CNTL; @@ -298,7 +298,7 @@ union PA_SC_VIZ_QUERY { // discard geometry after test (but use for testing) uint32_t kill_pix_post_hi_z : 1; // +7 // not used with d3d - uint32_t kill_pix_detail_mask : 1; // +8 + uint32_t kill_pix_post_detail_mask : 1; // +8 }; uint32_t value; static constexpr Register register_index = XE_GPU_REG_PA_SC_VIZ_QUERY; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 542372569..1c21ed8ff 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -1039,10 +1039,9 @@ XEPACKEDUNION(xe_gpu_texture_fetch_t, { ClampMode clamp_y : 3; // +13 ClampMode clamp_z : 3; // +16 SignedRepeatingFractionMode signed_rf_mode_all : 1; // +19 - // TODO(Triang3l): 1 or 2 dim_tbd bits? - uint32_t unk_0 : 2; // +20 - uint32_t pitch : 9; // +22 byte_pitch >> 5 - uint32_t tiled : 1; // +31 + uint32_t dim_tbd : 2; // +20 + uint32_t pitch : 9; // +22 byte_pitch >> 5 + uint32_t tiled : 1; // +31 TextureFormat format : 6; // +0 dword_1 Endian endianness : 2; // +6