diff --git a/.gdbinit b/.gdbinit
new file mode 100644
index 000000000..09b4af30f
--- /dev/null
+++ b/.gdbinit
@@ -0,0 +1,10 @@
+# Ignore HighResolutionTimer custom event
+handle SIG34 nostop noprint
+# Ignore PosixTimer custom event
+handle SIG35 nostop noprint
+# Ignore PosixThread exit event
+handle SIG32 nostop noprint
+# Ignore PosixThread suspend event
+handle SIG36 nostop noprint
+# Ignore PosixThread user callback event
+handle SIG37 nostop noprint
diff --git a/.gitmodules b/.gitmodules
index 6c3ca7278..c8b4ef272 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -64,3 +64,6 @@
 [submodule "third_party/DirectXShaderCompiler"]
 	path = third_party/DirectXShaderCompiler
 	url = https://github.com/microsoft/DirectXShaderCompiler.git
+[submodule "third_party/premake-cmake"]
+	path = third_party/premake-cmake
+	url = https://github.com/Enhex/premake-cmake.git
diff --git a/.travis.yml b/.travis.yml
index 7536f47a3..188278034 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,9 +28,9 @@ addons:
 jobs:
   include:
-    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 LINT=true
-    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 BUILD=true CONFIG=Debug
-    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 BUILD=true CONFIG=Release
+    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 AR_COMPILER=llvm-ar-9 LINT=true
+    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 AR_COMPILER=llvm-ar-9 BUILD=true CONFIG=Debug
+    - env: C_COMPILER=clang-9 CXX_COMPILER=clang++-9 AR_COMPILER=llvm-ar-9 BUILD=true CONFIG=Release
 
 git:
   # We handle submodules ourselves in xenia-build setup.
@@ -40,8 +40,10 @@ before_script:
   - export LIBVULKAN_VERSION=1.1.70
   - export CXX=$CXX_COMPILER
   - export CC=$C_COMPILER
+  - export AR=$AR_COMPILER
   # Dump useful info.
   - $CXX --version
+  - $AR_COMPILER --version
   - python3 --version
   - clang-format-9 --version
   - clang-format-9 -style=file -dump-config
diff --git a/docs/building.md b/docs/building.md
index 6aafc521e..0a70fb206 100644
--- a/docs/building.md
+++ b/docs/building.md
@@ -91,12 +91,14 @@
 Linux support is extremely experimental and presently incomplete.
 
 The build script uses LLVM/Clang 9. GCC, while it should work in theory, is not easily interchangeable right now.
-[CodeLite](https://codelite.org) is the supported IDE and `xb devenv` will generate a workspace and attempt to open it. Your distribution's version may be out of date so check their website.
-Normal building via `xb build` uses Make.
+* Normal building via `xb build` uses Make.
+* [CodeLite](https://codelite.org) is supported. `xb devenv` will generate a workspace and attempt to open it. Your distribution's version may be out of date, so check the CodeLite website.
+* Experimental CMake generation is available to facilitate use of other IDEs such as [CLion](https://www.jetbrains.com/clion/). If `clion` is available inside `$PATH`, `xb devenv` will start it. Otherwise `build/CMakeLists.txt` needs to be generated by invoking `xb premake --devenv=cmake` manually, as shown in the example below.
 
 Clang-9 or newer should be available from system repositories on all up-to-date distributions. You will also need some development libraries.
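+
+For example, assuming the `xb` helper script is invoked from the repository root, the experimental CMake flow looks roughly like this:
+
+```bash
+# Generate build/CMakeLists.txt without opening an IDE:
+xb premake --devenv=cmake
+# Or let xb pick the IDE; if clion is found on $PATH it is launched directly:
+xb devenv
+```
+
+The required development libraries are installed through your distribution's package manager.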
To get them on an Ubuntu system: -``` + +```bash sudo apt-get install libgtk-3-dev libpthread-stubs0-dev liblz4-dev libx11-dev libvulkan-dev libsdl2-dev libiberty-dev libunwind-dev libc++-dev libc++abi-dev ``` diff --git a/premake5.lua b/premake5.lua index bf7a1b286..ffa3b78db 100644 --- a/premake5.lua +++ b/premake5.lua @@ -1,5 +1,6 @@ include("tools/build") require("third_party/premake-export-compile-commands/export-compile-commands") +require("third_party/premake-cmake/cmake") location(build_root) targetdir(build_bin) @@ -24,6 +25,9 @@ defines({ "UNICODE", }) +cppdialect("C++17") +symbols("On") + -- TODO(DrChat): Find a way to disable this on other architectures. if ARCH ~= "ppc64" then filter("architecture:x86_64") @@ -44,30 +48,29 @@ filter("kind:StaticLib") filter("configurations:Checked") runtime("Debug") + optimize("Off") defines({ "DEBUG", }) - runtime("Debug") filter({"configurations:Checked", "platforms:Windows"}) buildoptions({ - "/RTCsu", -- Full Run-Time Checks. + "/RTCsu", -- Full Run-Time Checks. + }) +filter({"configurations:Checked", "platforms:Linux"}) + defines({ + "_GLIBCXX_DEBUG", -- libstdc++ debug mode }) filter("configurations:Debug") - runtime("Debug") + runtime("Release") + optimize("Off") defines({ "DEBUG", "_NO_DEBUG_HEAP=1", }) - runtime("Release") -filter({"configurations:Debug", "platforms:Windows"}) - linkoptions({ - "/NODEFAULTLIB:MSVCRTD", - }) - filter({"configurations:Debug", "platforms:Linux"}) - buildoptions({ - "-g", + defines({ + "_GLIBCXX_DEBUG", -- make dbg symbols work on some distros }) filter("configurations:Release") @@ -76,26 +79,18 @@ filter("configurations:Release") "NDEBUG", "_NO_DEBUG_HEAP=1", }) - optimize("speed") + optimize("Speed") inlining("Auto") floatingpoint("Fast") flags({ "LinkTimeOptimization", }) - runtime("Release") -filter({"configurations:Release", "platforms:Windows"}) - linkoptions({ - "/NODEFAULTLIB:MSVCRTD", - }) - filter("platforms:Linux") system("linux") toolset("clang") - cppdialect("C++17") buildoptions({ -- "-mlzcnt", -- (don't) Assume lzcnt is supported. - "`pkg-config --cflags gtk+-x11-3.0`", - "-fno-lto", -- Premake doesn't support LTO on clang + ({os.outputof("pkg-config --cflags gtk+-x11-3.0")})[1], }) links({ "stdc++fs", @@ -105,14 +100,13 @@ filter("platforms:Linux") "rt", }) linkoptions({ - "`pkg-config --libs gtk+-3.0`", + ({os.outputof("pkg-config --libs gtk+-3.0")})[1], }) filter({"platforms:Linux", "kind:*App"}) linkgroups("On") filter({"platforms:Linux", "language:C++", "toolset:gcc"}) - cppdialect("C++17") links({ }) disablewarnings({ @@ -147,13 +141,11 @@ filter({"platforms:Linux", "language:C++", "toolset:clang", "files:*.cc or *.cpp filter("platforms:Windows") system("windows") toolset("msc") - cppdialect("C++17") buildoptions({ - "/MP", -- Multiprocessor compilation. "/utf-8", -- 'build correctly on systems with non-Latin codepages'. -- Mark warnings as severe - "/w14839", -- non-standard use of class 'type' as an argument to a variadic function - "/w14840", -- non-portable use of class 'type' as an argument to a variadic function + "/w14839", -- non-standard use of class 'type' as an argument to a variadic function + "/w14840", -- non-portable use of class 'type' as an argument to a variadic function -- Disable warnings "/wd4100", -- Unreferenced parameters are ok. "/wd4201", -- Nameless struct/unions are ok. @@ -163,10 +155,10 @@ filter("platforms:Windows") "/wd4189", -- 'local variable is initialized but not referenced'. }) flags({ - "NoMinimalRebuild", -- Required for /MP above. 
+ "MultiProcessorCompile", -- Multiprocessor compilation. + "NoMinimalRebuild", -- Required for /MP above. }) - symbols("On") defines({ "_CRT_NONSTDC_NO_DEPRECATE", "_CRT_SECURE_NO_WARNINGS", diff --git a/src/xenia/app/emulator_window.cc b/src/xenia/app/emulator_window.cc index 8c66fa880..576b1e4b6 100644 --- a/src/xenia/app/emulator_window.cc +++ b/src/xenia/app/emulator_window.cc @@ -71,8 +71,8 @@ std::unique_ptr EmulatorWindow::Create(Emulator* emulator) { std::unique_ptr emulator_window(new EmulatorWindow(emulator)); emulator_window->loop()->PostSynchronous([&emulator_window]() { - xe::threading::set_name("Win32 Loop"); - xe::Profiler::ThreadEnter("Win32 Loop"); + xe::threading::set_name("Windowing Loop"); + xe::Profiler::ThreadEnter("Windowing Loop"); if (!emulator_window->Initialize()) { xe::FatalError("Failed to initialize main window"); diff --git a/src/xenia/app/premake5.lua b/src/xenia/app/premake5.lua index 2110fd0ab..639f79d94 100644 --- a/src/xenia/app/premake5.lua +++ b/src/xenia/app/premake5.lua @@ -8,19 +8,6 @@ project("xenia-app") targetname("xenia_canary") language("C++") links({ - "aes_128", - "capstone", - "fmt", - "dxbc", - "discord-rpc", - "glslang-spirv", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", - "spirv-tools", - "volk", "xenia-app-discord", "xenia-apu", "xenia-apu-nop", @@ -43,6 +30,21 @@ project("xenia-app") "xenia-ui-vulkan", "xenia-patcher", "xenia-vfs", + }) + links({ + "aes_128", + "capstone", + "fmt", + "dxbc", + "discord-rpc", + "glslang-spirv", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", + "spirv-tools", + "volk", "xxhash", }) defines({ diff --git a/src/xenia/apu/xma_context.cc b/src/xenia/apu/xma_context.cc index 16d6e66a8..e5cdb2561 100644 --- a/src/xenia/apu/xma_context.cc +++ b/src/xenia/apu/xma_context.cc @@ -302,6 +302,7 @@ void XmaContext::DecodePackets(XMA_CONTEXT_DATA* data) { // No available data. 
if (!data->input_buffer_0_valid && !data->input_buffer_1_valid) { + data->output_buffer_valid = 0; return; } diff --git a/src/xenia/apu/xma_decoder.cc b/src/xenia/apu/xma_decoder.cc index dd7d30817..ee1c9aa45 100644 --- a/src/xenia/apu/xma_decoder.cc +++ b/src/xenia/apu/xma_decoder.cc @@ -144,7 +144,7 @@ X_STATUS XmaDecoder::Setup(kernel::KernelState* kernel_state) { WorkerThreadMain(); return 0; })); - worker_thread_->set_name("XMA Decoder Worker"); + worker_thread_->set_name("XMA Decoder"); worker_thread_->set_can_debugger_suspend(true); worker_thread_->Create(); diff --git a/src/xenia/base/debugging_posix.cc b/src/xenia/base/debugging_posix.cc index a9c08ed60..3b73ab12a 100644 --- a/src/xenia/base/debugging_posix.cc +++ b/src/xenia/base/debugging_posix.cc @@ -9,21 +9,51 @@ #include "xenia/base/debugging.h" -#include +#include #include +#include +#include +#include +#include #include "xenia/base/string_buffer.h" namespace xe { namespace debugging { -bool IsDebuggerAttached() { return false; } -void Break() { raise(SIGTRAP); } +bool IsDebuggerAttached() { + std::ifstream proc_status_stream("/proc/self/status"); + if (!proc_status_stream.is_open()) { + return false; + } + std::string line; + while (std::getline(proc_status_stream, line)) { + std::istringstream line_stream(line); + std::string key; + line_stream >> key; + if (key == "TracerPid:") { + uint32_t tracer_pid; + line_stream >> tracer_pid; + return tracer_pid != 0; + } + } + return false; +} + +void Break() { + static std::once_flag flag; + std::call_once(flag, []() { + // Install handler for sigtrap only once + std::signal(SIGTRAP, [](int) { + // Forward signal to default handler after being caught + std::signal(SIGTRAP, SIG_DFL); + }); + }); + std::raise(SIGTRAP); +} namespace internal { -void DebugPrint(const char* s) { - // TODO: proper implementation. 
-} +void DebugPrint(const char* s) { std::clog << s << std::endl; } } // namespace internal } // namespace debugging diff --git a/src/xenia/base/logging.cc b/src/xenia/base/logging.cc index aa688c87e..8584892d4 100644 --- a/src/xenia/base/logging.cc +++ b/src/xenia/base/logging.cc @@ -93,7 +93,7 @@ class Logger { write_thread_ = xe::threading::Thread::Create({}, [this]() { WriteThread(); }); - write_thread_->set_name("xe::FileLogSink Writer"); + write_thread_->set_name("Logging Writer"); } ~Logger() { diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h index 33083a831..9b98175c5 100644 --- a/src/xenia/base/platform.h +++ b/src/xenia/base/platform.h @@ -76,14 +76,12 @@ #endif // XE_PLATFORM_MAC #if XE_COMPILER_MSVC -#define XEPACKEDSTRUCT(name, value) \ - __pragma(pack(push, 1)) struct name##_s value __pragma(pack(pop)); \ - typedef struct name##_s name; +#define XEPACKEDSTRUCT(name, value) \ + __pragma(pack(push, 1)) struct name value __pragma(pack(pop)); #define XEPACKEDSTRUCTANONYMOUS(value) \ __pragma(pack(push, 1)) struct value __pragma(pack(pop)); -#define XEPACKEDUNION(name, value) \ - __pragma(pack(push, 1)) union name##_s value __pragma(pack(pop)); \ - typedef union name##_s name; +#define XEPACKEDUNION(name, value) \ + __pragma(pack(push, 1)) union name value __pragma(pack(pop)); #else #define XEPACKEDSTRUCT(name, value) struct __attribute__((packed)) name value; #define XEPACKEDSTRUCTANONYMOUS(value) struct __attribute__((packed)) value; diff --git a/src/xenia/base/string_util.h b/src/xenia/base/string_util.h index f1499bb5f..adb2012af 100644 --- a/src/xenia/base/string_util.h +++ b/src/xenia/base/string_util.h @@ -10,11 +10,15 @@ #ifndef XENIA_BASE_STRING_UTIL_H_ #define XENIA_BASE_STRING_UTIL_H_ +#include #include +#include +#include #include #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/assert.h" +#include "xenia/base/memory.h" #include "xenia/base/platform.h" #include "xenia/base/string.h" #include "xenia/base/vec128.h" @@ -30,6 +34,40 @@ namespace xe { namespace string_util { +inline size_t copy_truncating(char* dest, const std::string_view source, + size_t dest_buffer_count) { + if (!dest_buffer_count) { + return 0; + } + size_t chars_copied = std::min(source.size(), dest_buffer_count - size_t(1)); + std::memcpy(dest, source.data(), chars_copied); + dest[chars_copied] = '\0'; + return chars_copied; +} + +inline size_t copy_truncating(char16_t* dest, const std::u16string_view source, + size_t dest_buffer_count) { + if (!dest_buffer_count) { + return 0; + } + size_t chars_copied = std::min(source.size(), dest_buffer_count - size_t(1)); + std::memcpy(dest, source.data(), chars_copied * sizeof(char16_t)); + dest[chars_copied] = u'\0'; + return chars_copied; +} + +inline size_t copy_and_swap_truncating(char16_t* dest, + const std::u16string_view source, + size_t dest_buffer_count) { + if (!dest_buffer_count) { + return 0; + } + size_t chars_copied = std::min(source.size(), dest_buffer_count - size_t(1)); + xe::copy_and_swap(dest, source.data(), chars_copied); + dest[chars_copied] = u'\0'; + return chars_copied; +} + inline std::string to_hex_string(uint32_t value) { return fmt::format("{:08X}", value); } diff --git a/src/xenia/base/testing/threading_test.cc b/src/xenia/base/testing/threading_test.cc new file mode 100644 index 000000000..8d5f74449 --- /dev/null +++ b/src/xenia/base/testing/threading_test.cc @@ -0,0 +1,967 @@ +/** +****************************************************************************** +* Xenia : Xbox 360 Emulator 
Research Project * +****************************************************************************** +* Copyright 2018 Ben Vanik. All rights reserved. * +* Released under the BSD license - see LICENSE in the root for more details. * +****************************************************************************** +*/ + +#include + +#include "xenia/base/threading.h" + +#include "third_party/catch/include/catch.hpp" + +namespace xe { +namespace base { +namespace test { +using namespace threading; +using namespace std::chrono_literals; + +TEST_CASE("Fence") { + std::unique_ptr pFence; + std::unique_ptr pTimer; + + // Signal without wait + pFence = std::make_unique(); + pFence->Signal(); + + // Signal once and wait + pFence = std::make_unique(); + pFence->Signal(); + pFence->Wait(); + + // Signal twice and wait + pFence = std::make_unique(); + pFence->Signal(); + pFence->Signal(); + pFence->Wait(); + + // Signal and wait two times + pFence = std::make_unique(); + pFence->Signal(); + pFence->Wait(); + pFence->Signal(); + pFence->Wait(); + + // Test to synchronize multiple threads + std::atomic started(0); + std::atomic finished(0); + pFence = std::make_unique(); + auto func = [&pFence, &started, &finished] { + started.fetch_add(1); + pFence->Wait(); + finished.fetch_add(1); + }; + + auto threads = std::array({ + std::thread(func), + std::thread(func), + std::thread(func), + std::thread(func), + std::thread(func), + }); + + Sleep(100ms); + REQUIRE(started.load() == threads.size()); + REQUIRE(finished.load() == 0); + + pFence->Signal(); + + for (auto& t : threads) t.join(); + REQUIRE(finished.load() == threads.size()); +} // namespace test + +TEST_CASE("Get number of logical processors") { + auto count = std::thread::hardware_concurrency(); + REQUIRE(logical_processor_count() == count); + REQUIRE(logical_processor_count() == count); + REQUIRE(logical_processor_count() == count); +} + +TEST_CASE("Enable process to set thread affinity") { + EnableAffinityConfiguration(); +} + +TEST_CASE("Yield Current Thread", "MaybeYield") { + // Run to see if there are any errors + MaybeYield(); +} + +TEST_CASE("Sync with Memory Barrier", "SyncMemory") { + // Run to see if there are any errors + SyncMemory(); +} + +TEST_CASE("Sleep Current Thread", "Sleep") { + auto wait_time = 50ms; + auto start = std::chrono::steady_clock::now(); + Sleep(wait_time); + auto duration = std::chrono::steady_clock::now() - start; + REQUIRE(duration >= wait_time); +} + +TEST_CASE("Sleep Current Thread in Alertable State", "Sleep") { + auto wait_time = 50ms; + auto start = std::chrono::steady_clock::now(); + auto result = threading::AlertableSleep(wait_time); + auto duration = std::chrono::steady_clock::now() - start; + REQUIRE(duration >= wait_time); + REQUIRE(result == threading::SleepResult::kSuccess); + + // TODO(bwrsandman): Test a Thread to return kAlerted. 
+ // Need callback to call extended I/O function (ReadFileEx or WriteFileEx) +} + +TEST_CASE("TlsHandle") { + // Test Allocate + auto handle = threading::AllocateTlsHandle(); + + // Test Free + REQUIRE(threading::FreeTlsHandle(handle)); + REQUIRE(!threading::FreeTlsHandle(handle)); + REQUIRE(!threading::FreeTlsHandle(threading::kInvalidTlsHandle)); + + // Test setting values + handle = threading::AllocateTlsHandle(); + REQUIRE(threading::GetTlsValue(handle) == 0); + uint32_t value = 0xDEADBEEF; + threading::SetTlsValue(handle, reinterpret_cast(&value)); + auto p_received_value = threading::GetTlsValue(handle); + REQUIRE(threading::GetTlsValue(handle) != 0); + auto received_value = *reinterpret_cast(p_received_value); + REQUIRE(received_value == value); + + uintptr_t non_thread_local_value = 0; + auto thread = Thread::Create({}, [&non_thread_local_value, &handle] { + non_thread_local_value = threading::GetTlsValue(handle); + }); + + auto result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(non_thread_local_value == 0); + + // Cleanup + REQUIRE(threading::FreeTlsHandle(handle)); +} + +TEST_CASE("HighResolutionTimer") { + // The wait time is 500ms with an interval of 50ms + // Smaller values are not as precise and fail the test + const auto wait_time = 500ms; + + // Time the actual sleep duration + { + const auto interval = 50ms; + std::atomic counter; + auto start = std::chrono::steady_clock::now(); + auto cb = [&counter] { ++counter; }; + auto pTimer = HighResolutionTimer::CreateRepeating(interval, cb); + Sleep(wait_time); + pTimer.reset(); + auto duration = std::chrono::steady_clock::now() - start; + + // Should have run as many times as wait_time / timer_interval plus or + // minus 1 due to imprecision of Sleep + REQUIRE(duration.count() >= wait_time.count()); + auto ratio = static_cast(duration / interval); + REQUIRE(counter >= ratio - 1); + REQUIRE(counter <= ratio + 1); + } + + // Test concurrent timers + { + const auto interval1 = 100ms; + const auto interval2 = 200ms; + std::atomic counter1; + std::atomic counter2; + auto start = std::chrono::steady_clock::now(); + auto cb1 = [&counter1] { ++counter1; }; + auto cb2 = [&counter2] { ++counter2; }; + auto pTimer1 = HighResolutionTimer::CreateRepeating(interval1, cb1); + auto pTimer2 = HighResolutionTimer::CreateRepeating(interval2, cb2); + Sleep(wait_time); + pTimer1.reset(); + pTimer2.reset(); + auto duration = std::chrono::steady_clock::now() - start; + + // Should have run as many times as wait_time / timer_interval plus or + // minus 1 due to imprecision of Sleep + REQUIRE(duration.count() >= wait_time.count()); + auto ratio1 = static_cast(duration / interval1); + auto ratio2 = static_cast(duration / interval2); + REQUIRE(counter1 >= ratio1 - 1); + REQUIRE(counter1 <= ratio1 + 1); + REQUIRE(counter2 >= ratio2 - 1); + REQUIRE(counter2 <= ratio2 + 1); + } + + // TODO(bwrsandman): Check on which thread callbacks are executed when + // spawned from differing threads +} + +TEST_CASE("Wait on Multiple Handles", "Wait") { + auto mutant = Mutant::Create(true); + auto semaphore = Semaphore::Create(10, 10); + auto event_ = Event::CreateManualResetEvent(false); + auto thread = Thread::Create({}, [&mutant, &semaphore, &event_] { + event_->Set(); + Wait(mutant.get(), false, 25ms); + semaphore->Release(1, nullptr); + Wait(mutant.get(), false, 25ms); + mutant->Release(); + }); + + std::vector handles = { + mutant.get(), + semaphore.get(), + event_.get(), + thread.get(), + }; + + auto any_result = 
WaitAny(handles, false, 100ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 0); + + auto all_result = WaitAll(handles, false, 100ms); + REQUIRE(all_result == WaitResult::kSuccess); +} + +TEST_CASE("Signal and Wait") { + WaitResult result; + auto mutant = Mutant::Create(true); + auto event_ = Event::CreateAutoResetEvent(false); + auto thread = Thread::Create({}, [&mutant, &event_] { + Wait(mutant.get(), false); + event_->Set(); + }); + result = Wait(event_.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + result = SignalAndWait(mutant.get(), event_.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); +} + +TEST_CASE("Wait on Event", "Event") { + auto evt = Event::CreateAutoResetEvent(false); + WaitResult result; + + // Call wait on unset Event + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + + // Call wait on set Event + evt->Set(); + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // Call wait on now consumed Event + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); +} + +TEST_CASE("Reset Event", "Event") { + auto evt = Event::CreateAutoResetEvent(false); + WaitResult result; + + // Call wait on reset Event + evt->Set(); + evt->Reset(); + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + + // Test resetting the unset event + evt->Reset(); + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + + // Test setting the reset event + evt->Set(); + result = Wait(evt.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); +} + +TEST_CASE("Wait on Multiple Events", "Event") { + auto events = std::array, 4>{ + Event::CreateAutoResetEvent(false), + Event::CreateAutoResetEvent(false), + Event::CreateAutoResetEvent(false), + Event::CreateManualResetEvent(false), + }; + + std::array order = {0}; + std::atomic_uint index(0); + auto sign_in = [&order, &index](uint32_t id) { + auto i = index.fetch_add(1, std::memory_order::memory_order_relaxed); + order[i] = static_cast('0' + id); + }; + + auto threads = std::array{ + std::thread([&events, &sign_in] { + auto res = WaitAll({events[1].get(), events[3].get()}, false, 100ms); + if (res == WaitResult::kSuccess) { + sign_in(1); + } + }), + std::thread([&events, &sign_in] { + auto res = WaitAny({events[0].get(), events[2].get()}, false, 100ms); + if (res.first == WaitResult::kSuccess) { + sign_in(2); + } + }), + std::thread([&events, &sign_in] { + auto res = WaitAll({events[0].get(), events[2].get(), events[3].get()}, + false, 100ms); + if (res == WaitResult::kSuccess) { + sign_in(3); + } + }), + std::thread([&events, &sign_in] { + auto res = WaitAny({events[1].get(), events[3].get()}, false, 100ms); + if (res.first == WaitResult::kSuccess) { + sign_in(4); + } + }), + }; + + Sleep(10ms); + events[3]->Set(); // Signals thread id=4 and stays on for 1 and 3 + Sleep(10ms); + events[1]->Set(); // Signals thread id=1 + Sleep(10ms); + events[0]->Set(); // Signals thread id=2 + Sleep(10ms); + events[2]->Set(); // Partial signals thread id=3 + events[0]->Set(); // Signals thread id=3 + + for (auto& t : threads) { + t.join(); + } + + INFO(order.data()); + REQUIRE(order[0] == '4'); + // TODO(bwrsandman): Order is not always maintained on linux + // REQUIRE(order[1] == '1'); + // REQUIRE(order[2] == '2'); + // REQUIRE(order[3] == '3'); +} + 
+TEST_CASE("Wait on Semaphore", "Semaphore") { + WaitResult result; + std::unique_ptr sem; + int previous_count = 0; + + // Wait on semaphore with no room + sem = Semaphore::Create(0, 5); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kTimeout); + + // Add room in semaphore + REQUIRE(sem->Release(2, &previous_count)); + REQUIRE(previous_count == 0); + REQUIRE(sem->Release(1, &previous_count)); + REQUIRE(previous_count == 2); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(sem->Release(1, &previous_count)); + REQUIRE(previous_count == 2); + + // Set semaphore over maximum_count + sem = Semaphore::Create(5, 5); + previous_count = -1; + REQUIRE_FALSE(sem->Release(1, &previous_count)); + REQUIRE(previous_count == -1); + REQUIRE_FALSE(sem->Release(10, &previous_count)); + REQUIRE(previous_count == -1); + sem = Semaphore::Create(0, 5); + REQUIRE_FALSE(sem->Release(10, &previous_count)); + REQUIRE(previous_count == -1); + REQUIRE_FALSE(sem->Release(10, &previous_count)); + REQUIRE(previous_count == -1); + + // Test invalid Release parameters + REQUIRE_FALSE(sem->Release(0, &previous_count)); + REQUIRE(previous_count == -1); + REQUIRE_FALSE(sem->Release(-1, &previous_count)); + REQUIRE(previous_count == -1); + + // Wait on fully available semaphore + sem = Semaphore::Create(5, 5); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kTimeout); + + // Semaphore between threads + sem = Semaphore::Create(5, 5); + Sleep(10ms); + // Occupy the semaphore with 5 threads + auto func = [&sem] { + auto res = Wait(sem.get(), false, 100ms); + Sleep(500ms); + if (res == WaitResult::kSuccess) { + sem->Release(1, nullptr); + } + }; + auto threads = std::array{ + std::thread(func), std::thread(func), std::thread(func), + std::thread(func), std::thread(func), + }; + // Give threads time to acquire semaphore + Sleep(10ms); + // Attempt to acquire full semaphore with current (6th) thread + result = Wait(sem.get(), false, 20ms); + REQUIRE(result == WaitResult::kTimeout); + // Give threads time to release semaphore + for (auto& t : threads) { + t.join(); + } + result = Wait(sem.get(), false, 10ms); + REQUIRE(result == WaitResult::kSuccess); + sem->Release(1, &previous_count); + REQUIRE(previous_count == 4); + + // Test invalid construction parameters + // These are invalid according to documentation + // TODO(bwrsandman): Many of these invalid invocations succeed + sem = Semaphore::Create(-1, 5); + // REQUIRE(sem.get() == nullptr); + sem = Semaphore::Create(10, 5); + // REQUIRE(sem.get() == nullptr); + sem = Semaphore::Create(0, 0); + // REQUIRE(sem.get() == nullptr); + sem = Semaphore::Create(0, -1); + // REQUIRE(sem.get() == nullptr); +} + +TEST_CASE("Wait on Multiple Semaphores", "Semaphore") { + WaitResult all_result; + std::pair any_result; + int previous_count; + std::unique_ptr sem0, sem1; + + // Test Wait all which should fail + sem0 = Semaphore::Create(0, 5); + sem1 = Semaphore::Create(5, 5); + all_result = WaitAll({sem0.get(), sem1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kTimeout); + 
previous_count = -1; + REQUIRE(sem0->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + previous_count = -1; + REQUIRE_FALSE(sem1->Release(1, &previous_count)); + REQUIRE(previous_count == -1); + + // Test Wait all again which should succeed + sem0 = Semaphore::Create(1, 5); + sem1 = Semaphore::Create(5, 5); + all_result = WaitAll({sem0.get(), sem1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kSuccess); + previous_count = -1; + REQUIRE(sem0->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + previous_count = -1; + REQUIRE(sem1->Release(1, &previous_count)); + REQUIRE(previous_count == 4); + + // Test Wait Any which should fail + sem0 = Semaphore::Create(0, 5); + sem1 = Semaphore::Create(0, 5); + any_result = WaitAny({sem0.get(), sem1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kTimeout); + REQUIRE(any_result.second == 0); + previous_count = -1; + REQUIRE(sem0->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + previous_count = -1; + REQUIRE(sem1->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + + // Test Wait Any which should succeed + sem0 = Semaphore::Create(0, 5); + sem1 = Semaphore::Create(5, 5); + any_result = WaitAny({sem0.get(), sem1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 1); + previous_count = -1; + REQUIRE(sem0->Release(1, &previous_count)); + REQUIRE(previous_count == 0); + previous_count = -1; + REQUIRE(sem1->Release(1, &previous_count)); + REQUIRE(previous_count == 4); +} + +TEST_CASE("Wait on Mutant", "Mutant") { + WaitResult result; + std::unique_ptr mut; + + // Release on initially owned mutant + mut = Mutant::Create(true); + REQUIRE(mut->Release()); + REQUIRE_FALSE(mut->Release()); + + // Release on initially not-owned mutant + mut = Mutant::Create(false); + REQUIRE_FALSE(mut->Release()); + + // Wait on initially owned mutant + mut = Mutant::Create(true); + result = Wait(mut.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(mut->Release()); + REQUIRE(mut->Release()); + REQUIRE_FALSE(mut->Release()); + + // Wait on initially not owned mutant + mut = Mutant::Create(false); + result = Wait(mut.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(mut->Release()); + REQUIRE_FALSE(mut->Release()); + + // Multiple waits (or locks) + mut = Mutant::Create(false); + for (int i = 0; i < 10; ++i) { + result = Wait(mut.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); + } + for (int i = 0; i < 10; ++i) { + REQUIRE(mut->Release()); + } + REQUIRE_FALSE(mut->Release()); + + // Test mutants on other threads + auto thread1 = std::thread([&mut] { + Sleep(5ms); + mut = Mutant::Create(true); + Sleep(100ms); + mut->Release(); + }); + Sleep(10ms); + REQUIRE_FALSE(mut->Release()); + Sleep(10ms); + result = Wait(mut.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + thread1.join(); + result = Wait(mut.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(mut->Release()); +} + +TEST_CASE("Wait on Multiple Mutants", "Mutant") { + WaitResult all_result; + std::pair any_result; + std::unique_ptr mut0, mut1; + + // Test which should fail for WaitAll and WaitAny + auto thread0 = std::thread([&mut0, &mut1] { + mut0 = Mutant::Create(true); + mut1 = Mutant::Create(true); + Sleep(50ms); + mut0->Release(); + mut1->Release(); + }); + Sleep(10ms); + all_result = WaitAll({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kTimeout); + 
REQUIRE_FALSE(mut0->Release()); + REQUIRE_FALSE(mut1->Release()); + any_result = WaitAny({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kTimeout); + REQUIRE(any_result.second == 0); + REQUIRE_FALSE(mut0->Release()); + REQUIRE_FALSE(mut1->Release()); + thread0.join(); + + // Test which should fail for WaitAll but not WaitAny + auto thread1 = std::thread([&mut0, &mut1] { + mut0 = Mutant::Create(true); + mut1 = Mutant::Create(false); + Sleep(50ms); + mut0->Release(); + }); + Sleep(10ms); + all_result = WaitAll({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kTimeout); + REQUIRE_FALSE(mut0->Release()); + REQUIRE_FALSE(mut1->Release()); + any_result = WaitAny({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 1); + REQUIRE_FALSE(mut0->Release()); + REQUIRE(mut1->Release()); + thread1.join(); + + // Test which should pass for WaitAll and WaitAny + auto thread2 = std::thread([&mut0, &mut1] { + mut0 = Mutant::Create(false); + mut1 = Mutant::Create(false); + Sleep(50ms); + }); + Sleep(10ms); + all_result = WaitAll({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(all_result == WaitResult::kSuccess); + REQUIRE(mut0->Release()); + REQUIRE(mut1->Release()); + any_result = WaitAny({mut0.get(), mut1.get()}, false, 10ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 0); + REQUIRE(mut0->Release()); + REQUIRE_FALSE(mut1->Release()); + thread2.join(); +} + +TEST_CASE("Wait on Timer", "Timer") { + WaitResult result; + std::unique_ptr timer; + + // Test Manual Reset + timer = Timer::CreateManualResetTimer(); + result = Wait(timer.get(), false, 1ms); + REQUIRE(result == WaitResult::kTimeout); + REQUIRE(timer->SetOnce(1ms)); // Signals it + result = Wait(timer.get(), false, 2ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(timer.get(), false, 1ms); + REQUIRE(result == WaitResult::kSuccess); // Did not reset + + // Test Synchronization + timer = Timer::CreateSynchronizationTimer(); + result = Wait(timer.get(), false, 1ms); + REQUIRE(result == WaitResult::kTimeout); + REQUIRE(timer->SetOnce(1ms)); // Signals it + result = Wait(timer.get(), false, 2ms); + REQUIRE(result == WaitResult::kSuccess); + result = Wait(timer.get(), false, 1ms); + REQUIRE(result == WaitResult::kTimeout); // Did reset + + // TODO(bwrsandman): This test unexpectedly fails under windows + // Test long due time + // timer = Timer::CreateSynchronizationTimer(); + // REQUIRE(timer->SetOnce(10s)); + // result = Wait(timer.get(), false, 10ms); // Still signals under windows + // REQUIRE(result == WaitResult::kTimeout); + + // Test Repeating + REQUIRE(timer->SetRepeating(1ms, 10ms)); + for (int i = 0; i < 10; ++i) { + result = Wait(timer.get(), false, 20ms); + INFO(i); + REQUIRE(result == WaitResult::kSuccess); + } + MaybeYield(); + Sleep(10ms); // Skip a few events + for (int i = 0; i < 10; ++i) { + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kSuccess); + } + // Cancel it + timer->Cancel(); + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kTimeout); + MaybeYield(); + Sleep(10ms); // Skip a few events + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kTimeout); + // Cancel with SetOnce + REQUIRE(timer->SetRepeating(1ms, 10ms)); + for (int i = 0; i < 10; ++i) { + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kSuccess); + } + REQUIRE(timer->SetOnce(1ms)); + 
result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kSuccess); // Signal from Set Once + result = Wait(timer.get(), false, 20ms); + REQUIRE(result == WaitResult::kTimeout); // No more signals from repeating +} + +TEST_CASE("Wait on Multiple Timers", "Timer") { + WaitResult all_result; + std::pair any_result; + + auto timer0 = Timer::CreateSynchronizationTimer(); + auto timer1 = Timer::CreateManualResetTimer(); + + // None signaled + all_result = WaitAll({timer0.get(), timer1.get()}, false, 1ms); + REQUIRE(all_result == WaitResult::kTimeout); + any_result = WaitAny({timer0.get(), timer1.get()}, false, 1ms); + REQUIRE(any_result.first == WaitResult::kTimeout); + REQUIRE(any_result.second == 0); + + // Some signaled + REQUIRE(timer1->SetOnce(1ms)); + all_result = WaitAll({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(all_result == WaitResult::kTimeout); + any_result = WaitAny({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 1); + + // All signaled + REQUIRE(timer0->SetOnce(1ms)); + all_result = WaitAll({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(all_result == WaitResult::kSuccess); + REQUIRE(timer0->SetOnce(1ms)); + Sleep(1ms); + any_result = WaitAny({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 0); + + // Check that timer0 reset + any_result = WaitAny({timer0.get(), timer1.get()}, false, 100ms); + REQUIRE(any_result.first == WaitResult::kSuccess); + REQUIRE(any_result.second == 1); +} + +TEST_CASE("Create and Trigger Timer Callbacks", "Timer") { + // TODO(bwrsandman): Check which thread performs callback and timing of + // callback + REQUIRE(true); +} + +TEST_CASE("Set and Test Current Thread ID", "Thread") { + // System ID + auto system_id = current_thread_system_id(); + REQUIRE(system_id > 0); + + // Thread ID + auto thread_id = current_thread_id(); + REQUIRE(thread_id == system_id); + + // Set a new thread id + const uint32_t new_thread_id = 0xDEADBEEF; + set_current_thread_id(new_thread_id); + REQUIRE(current_thread_id() == new_thread_id); + + // Set back original thread id of system + set_current_thread_id(std::numeric_limits::max()); + REQUIRE(current_thread_id() == system_id); + + // TODO(bwrsandman): Test on Thread object +} + +TEST_CASE("Set and Test Current Thread Name", "Thread") { + auto current_thread = Thread::GetCurrentThread(); + REQUIRE(current_thread); + auto old_thread_name = current_thread->name(); + + std::string new_thread_name = "Threading Test"; + REQUIRE_NOTHROW(set_name(new_thread_name)); + + // Restore the old catch.hpp thread name + REQUIRE_NOTHROW(set_name(old_thread_name)); +} + +TEST_CASE("Create and Run Thread", "Thread") { + std::unique_ptr thread; + WaitResult result; + Thread::CreationParameters params = {}; + auto func = [] { Sleep(20ms); }; + + // Create most basic case of thread + thread = Thread::Create(params, func); + REQUIRE(thread->native_handle() != nullptr); + REQUIRE_NOTHROW(thread->affinity_mask()); + REQUIRE(thread->name().empty()); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // Add thread name + std::string new_name = "Test thread name"; + thread = Thread::Create(params, func); + auto name = thread->name(); + INFO(name.c_str()); + REQUIRE(name.empty()); + thread->set_name(new_name); + REQUIRE(thread->name() == new_name); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == 
WaitResult::kSuccess); + + // Use Terminate to end an infinitely looping thread + thread = Thread::Create(params, [] { + while (true) { + Sleep(1ms); + } + }); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + thread->Terminate(-1); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // Call Exit from inside an infinitely looping thread + thread = Thread::Create(params, [] { + while (true) { + Thread::Exit(-1); + } + }); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // Call timeout wait on self + result = Wait(Thread::GetCurrentThread(), false, 50ms); + REQUIRE(result == WaitResult::kTimeout); + + params.stack_size = 16 * 1024; + thread = Thread::Create(params, [] { + while (true) { + Thread::Exit(-1); + } + }); + REQUIRE(thread != nullptr); + result = Wait(thread.get(), false, 50ms); + REQUIRE(result == WaitResult::kSuccess); + + // TODO(bwrsandman): Test with different priorities + // TODO(bwrsandman): Test setting and getting thread affinity +} + +TEST_CASE("Test Suspending Thread", "Thread") { + std::unique_ptr thread; + WaitResult result; + Thread::CreationParameters params = {}; + auto func = [] { Sleep(20ms); }; + + // Create initially suspended + params.create_suspended = true; + thread = threading::Thread::Create(params, func); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kTimeout); + thread->Resume(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kSuccess); + params.create_suspended = false; + + // Create and then suspend + thread = threading::Thread::Create(params, func); + thread->Suspend(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kTimeout); + thread->Resume(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kSuccess); + + // Test recursive suspend + thread = threading::Thread::Create(params, func); + thread->Suspend(); + thread->Suspend(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kTimeout); + thread->Resume(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kTimeout); + thread->Resume(); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kSuccess); + + // Test suspend count + uint32_t suspend_count = 0; + thread = threading::Thread::Create(params, func); + thread->Suspend(&suspend_count); + REQUIRE(suspend_count == 0); + thread->Suspend(&suspend_count); + REQUIRE(suspend_count == 1); + thread->Suspend(&suspend_count); + REQUIRE(suspend_count == 2); + thread->Resume(&suspend_count); + REQUIRE(suspend_count == 3); + thread->Resume(&suspend_count); + REQUIRE(suspend_count == 2); + thread->Resume(&suspend_count); + REQUIRE(suspend_count == 1); + thread->Suspend(&suspend_count); + REQUIRE(suspend_count == 0); + thread->Resume(&suspend_count); + REQUIRE(suspend_count == 1); + result = threading::Wait(thread.get(), false, 50ms); + REQUIRE(result == threading::WaitResult::kSuccess); +} + +TEST_CASE("Test Thread QueueUserCallback", "Thread") { + std::unique_ptr thread; + WaitResult result; + Thread::CreationParameters params = {}; + std::atomic_int order; + int is_modified; + int has_finished; + auto callback = [&is_modified, &order] { + is_modified = std::atomic_fetch_add_explicit( + &order, 1, 
std::memory_order::memory_order_relaxed); + }; + + // Without alertable + order = 0; + is_modified = -1; + has_finished = -1; + thread = Thread::Create(params, [&has_finished, &order] { + // Not using Alertable so callback is not registered + Sleep(90ms); + has_finished = std::atomic_fetch_add_explicit( + &order, 1, std::memory_order::memory_order_relaxed); + }); + result = Wait(thread.get(), true, 50ms); + REQUIRE(result == WaitResult::kTimeout); + REQUIRE(is_modified == -1); + thread->QueueUserCallback(callback); + result = Wait(thread.get(), true, 100ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(is_modified == -1); + REQUIRE(has_finished == 0); + + // With alertable + order = 0; + is_modified = -1; + has_finished = -1; + thread = Thread::Create(params, [&has_finished, &order] { + // Using Alertable so callback is registered + AlertableSleep(90ms); + has_finished = std::atomic_fetch_add_explicit( + &order, 1, std::memory_order::memory_order_relaxed); + }); + result = Wait(thread.get(), true, 50ms); + REQUIRE(result == WaitResult::kTimeout); + REQUIRE(is_modified == -1); + thread->QueueUserCallback(callback); + result = Wait(thread.get(), true, 100ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(is_modified == 0); + REQUIRE(has_finished == 1); + + // Test Exit command with QueueUserCallback + order = 0; + is_modified = -1; + has_finished = -1; + thread = Thread::Create(params, [&is_modified, &has_finished, &order] { + is_modified = std::atomic_fetch_add_explicit( + &order, 1, std::memory_order::memory_order_relaxed); + // Using Alertable so callback is registered + AlertableSleep(200ms); + has_finished = std::atomic_fetch_add_explicit( + &order, 1, std::memory_order::memory_order_relaxed); + }); + result = Wait(thread.get(), true, 100ms); + REQUIRE(result == WaitResult::kTimeout); + thread->QueueUserCallback([] { Thread::Exit(0); }); + result = Wait(thread.get(), true, 500ms); + REQUIRE(result == WaitResult::kSuccess); + REQUIRE(is_modified == 0); + REQUIRE(has_finished == -1); + + // TODO(bwrsandman): Test alertable wait returning kUserCallback by using IO + // callbacks. +} + +} // namespace test +} // namespace base +} // namespace xe diff --git a/src/xenia/base/threading.h b/src/xenia/base/threading.h index fef37dd06..776a158e0 100644 --- a/src/xenia/base/threading.h +++ b/src/xenia/base/threading.h @@ -24,29 +24,56 @@ #include #include +#include "xenia/base/assert.h" + namespace xe { namespace threading { +// This is more like an Event with self-reset when returning from Wait() class Fence { public: - Fence() : signaled_(false) {} + Fence() : signal_state_(0) {} + void Signal() { std::unique_lock lock(mutex_); - signaled_.store(true); + signal_state_ |= SIGMASK_; cond_.notify_all(); } + + // Wait for the Fence to be signaled. Clears the signal on return. void Wait() { std::unique_lock lock(mutex_); - while (!signaled_.load()) { + assert_true((signal_state_ & ~SIGMASK_) < (SIGMASK_ - 1) && + "Too many threads?"); + + // keep local copy to minimize loads + auto signal_state = ++signal_state_; + for (; !(signal_state & SIGMASK_); signal_state = signal_state_) { cond_.wait(lock); } - signaled_.store(false); + + // We can't just clear the signal as other threads may not have read it yet + assert_true((signal_state & ~SIGMASK_) > 0); // wait_count > 0 + if (signal_state == (1 | SIGMASK_)) { // wait_count == 1 + // Last one out turn off the lights + signal_state_ = 0; + } else { + // Oops, another thread is still waiting, set the new count and keep the + // signal. 
+ signal_state_ = --signal_state; + } } private: + using state_t_ = uint_fast32_t; + static constexpr state_t_ SIGMASK_ = state_t_(1) + << (sizeof(state_t_) * 8 - 1); + std::mutex mutex_; std::condition_variable cond_; - std::atomic signaled_; + // Use the highest bit (sign bit) as the signal flag and the rest to count + // waiting threads. + volatile state_t_ signal_state_; }; // Returns the total number of logical processors in the host system. @@ -308,12 +335,12 @@ class Timer : public WaitHandle { std::chrono::milliseconds period, std::function opt_callback = nullptr) = 0; template - void SetRepeating(std::chrono::nanoseconds due_time, + bool SetRepeating(std::chrono::nanoseconds due_time, std::chrono::duration period, std::function opt_callback = nullptr) { - SetRepeating(due_time, - std::chrono::duration_cast(period), - std::move(opt_callback)); + return SetRepeating( + due_time, std::chrono::duration_cast(period), + std::move(opt_callback)); } // Stops the timer before it can be set to the signaled state and cancels @@ -391,7 +418,7 @@ class Thread : public WaitHandle { // Decrements a thread's suspend count. When the suspend count is decremented // to zero, the execution of the thread is resumed. - virtual bool Resume(uint32_t* out_new_suspend_count = nullptr) = 0; + virtual bool Resume(uint32_t* out_previous_suspend_count = nullptr) = 0; // Suspends the specified thread. virtual bool Suspend(uint32_t* out_previous_suspend_count = nullptr) = 0; diff --git a/src/xenia/base/threading_posix.cc b/src/xenia/base/threading_posix.cc index 28597e608..9e39b17a5 100644 --- a/src/xenia/base/threading_posix.cc +++ b/src/xenia/base/threading_posix.cc @@ -13,16 +13,64 @@ #include "xenia/base/logging.h" #include +#include #include #include #include #include -#include #include +#include +#include namespace xe { namespace threading { +template +inline timespec DurationToTimeSpec( + std::chrono::duration<_Rep, _Period> duration) { + auto nanoseconds = + std::chrono::duration_cast(duration); + auto div = ldiv(nanoseconds.count(), 1000000000L); + return timespec{div.quot, div.rem}; +} + +// Thread interruption is done using user-defined signals +// This implementation uses the SIGRTMAX - SIGRTMIN to signal to a thread +// gdb tip, for SIG = SIGRTMIN + SignalType : handle SIG nostop +// lldb tip, for SIG = SIGRTMIN + SignalType : process handle SIG -s false +enum class SignalType { + kHighResolutionTimer, + kTimer, + kThreadSuspend, + kThreadUserCallback, + k_Count +}; + +int GetSystemSignal(SignalType num) { + auto result = SIGRTMIN + static_cast(num); + assert_true(result < SIGRTMAX); + return result; +} + +SignalType GetSystemSignalType(int num) { + return static_cast(num - SIGRTMIN); +} + +thread_local std::array(SignalType::k_Count)> + signal_handler_installed = {}; + +static void signal_handler(int signal, siginfo_t* info, void* context); + +void install_signal_handler(SignalType type) { + if (signal_handler_installed[static_cast(type)]) return; + struct sigaction action {}; + action.sa_flags = SA_SIGINFO; + action.sa_sigaction = signal_handler; + sigemptyset(&action.sa_mask); + if (sigaction(GetSystemSignal(type), &action, nullptr) == -1) + signal_handler_installed[static_cast(type)] = true; +} + // TODO(dougvj) void EnableAffinityConfiguration() {} @@ -47,55 +95,81 @@ void MaybeYield() { void SyncMemory() { __sync_synchronize(); } void Sleep(std::chrono::microseconds duration) { - timespec rqtp = {time_t(duration.count() / 1000000), - time_t(duration.count() % 1000)}; - nanosleep(&rqtp, 
nullptr); - // TODO(benvanik): spin while rmtp >0? + timespec rqtp = DurationToTimeSpec(duration); + timespec rmtp = {}; + auto p_rqtp = &rqtp; + auto p_rmtp = &rmtp; + int ret = 0; + do { + ret = nanosleep(p_rqtp, p_rmtp); + // Swap requested for remaining in case of signal interruption + // in which case, we start sleeping again for the remainder + std::swap(p_rqtp, p_rmtp); + } while (ret == -1 && errno == EINTR); } -// TODO(dougvj) Not sure how to implement the equivalent of this on POSIX. +// TODO(bwrsandman) Implement by allowing alert interrupts from IO operations +thread_local bool alertable_state_ = false; SleepResult AlertableSleep(std::chrono::microseconds duration) { - sleep(duration.count() / 1000); + alertable_state_ = true; + Sleep(duration); + alertable_state_ = false; return SleepResult::kSuccess; } -// TODO(dougvj) We can probably wrap this with pthread_key_t but the type of -// TlsHandle probably needs to be refactored TlsHandle AllocateTlsHandle() { - assert_always(); - return 0; + auto key = static_cast(-1); + auto res = pthread_key_create(&key, nullptr); + assert_zero(res); + assert_true(key != static_cast(-1)); + return static_cast(key); } -bool FreeTlsHandle(TlsHandle handle) { return true; } +bool FreeTlsHandle(TlsHandle handle) { + return pthread_key_delete(static_cast(handle)) == 0; +} uintptr_t GetTlsValue(TlsHandle handle) { - assert_always(); - return 0; + return reinterpret_cast( + pthread_getspecific(static_cast(handle))); } bool SetTlsValue(TlsHandle handle, uintptr_t value) { - assert_always(); - return false; + return pthread_setspecific(static_cast(handle), + reinterpret_cast(value)) == 0; } -// TODO(dougvj) class PosixHighResolutionTimer : public HighResolutionTimer { public: - PosixHighResolutionTimer(std::function callback) - : callback_(callback) {} - ~PosixHighResolutionTimer() override {} + explicit PosixHighResolutionTimer(std::function callback) + : callback_(std::move(callback)), timer_(nullptr) {} + ~PosixHighResolutionTimer() override { + if (timer_) timer_delete(timer_); + } bool Initialize(std::chrono::milliseconds period) { - assert_always(); - return false; + // Create timer + sigevent sev{}; + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = GetSystemSignal(SignalType::kHighResolutionTimer); + sev.sigev_value.sival_ptr = (void*)&callback_; + if (timer_create(CLOCK_REALTIME, &sev, &timer_) == -1) return false; + + // Start timer + itimerspec its{}; + its.it_value = DurationToTimeSpec(period); + its.it_interval = its.it_value; + return timer_settime(timer_, 0, &its, nullptr) != -1; } private: std::function callback_; + timer_t timer_; }; std::unique_ptr HighResolutionTimer::CreateRepeating( std::chrono::milliseconds period, std::function callback) { + install_signal_handler(SignalType::kHighResolutionTimer); auto timer = std::make_unique(std::move(callback)); if (!timer->Initialize(period)) { return nullptr; @@ -103,209 +177,669 @@ std::unique_ptr HighResolutionTimer::CreateRepeating( return std::unique_ptr(timer.release()); } -// TODO(dougvj) There really is no native POSIX handle for a single wait/signal -// construct pthreads is at a lower level with more handles for such a mechanism -// This simple wrapper class could function as our handle, but probably needs -// some more functionality -class PosixCondition { +class PosixConditionBase { public: - PosixCondition() : signal_(false) { - pthread_mutex_init(&mutex_, NULL); - pthread_cond_init(&cond_, NULL); + virtual bool Signal() = 0; + + WaitResult Wait(std::chrono::milliseconds 
timeout) { + bool executed; + auto predicate = [this] { return this->signaled(); }; + auto lock = std::unique_lock(mutex_); + if (predicate()) { + executed = true; + } else { + if (timeout == std::chrono::milliseconds::max()) { + cond_.wait(lock, predicate); + executed = true; // Did not time out; + } else { + executed = cond_.wait_for(lock, timeout, predicate); + } + } + if (executed) { + post_execution(); + return WaitResult::kSuccess; + } else { + return WaitResult::kTimeout; + } } - ~PosixCondition() { - pthread_mutex_destroy(&mutex_); - pthread_cond_destroy(&cond_); + static std::pair WaitMultiple( + std::vector&& handles, bool wait_all, + std::chrono::milliseconds timeout) { + using iter_t = std::vector::const_iterator; + bool executed; + auto predicate = [](auto h) { return h->signaled(); }; + + // Construct a condition for all or any depending on wait_all + auto operation = wait_all ? std::all_of + : std::any_of; + auto aggregate = [&handles, operation, predicate] { + return operation(handles.cbegin(), handles.cend(), predicate); + }; + + // TODO(bwrsandman, Triang3l) This is controversial, see issue #1677 + // This will probably cause a deadlock on the next thread doing any waiting + // if the thread is suspended between locking and waiting + std::unique_lock lock(PosixConditionBase::mutex_); + + // Check if the aggregate lambda (all or any) is already satisfied + if (aggregate()) { + executed = true; + } else { + // If the aggregate is not yet satisfied and the timeout is infinite, + // wait without timeout. + if (timeout == std::chrono::milliseconds::max()) { + PosixConditionBase::cond_.wait(lock, aggregate); + executed = true; + } else { + // Wait with timeout. + executed = PosixConditionBase::cond_.wait_for(lock, timeout, aggregate); + } + } + if (executed) { + auto first_signaled = std::numeric_limits::max(); + for (auto i = 0u; i < handles.size(); ++i) { + if (handles[i]->signaled()) { + if (first_signaled > i) { + first_signaled = i; + } + handles[i]->post_execution(); + if (!wait_all) break; + } + } + return std::make_pair(WaitResult::kSuccess, first_signaled); + } else { + return std::make_pair(WaitResult::kTimeout, 0); + } } - void Signal() { - pthread_mutex_lock(&mutex_); + virtual void* native_handle() const { return cond_.native_handle(); } + + protected: + inline virtual bool signaled() const = 0; + inline virtual void post_execution() = 0; + static std::condition_variable cond_; + static std::mutex mutex_; +}; + +std::condition_variable PosixConditionBase::cond_; +std::mutex PosixConditionBase::mutex_; + +// There really is no native POSIX handle for a single wait/signal construct +// pthreads is at a lower level with more handles for such a mechanism. +// This simple wrapper class functions as our handle and uses conditional +// variables for waits and signals. 
+template +class PosixCondition {}; + +template <> +class PosixCondition : public PosixConditionBase { + public: + PosixCondition(bool manual_reset, bool initial_state) + : signal_(initial_state), manual_reset_(manual_reset) {} + virtual ~PosixCondition() = default; + + bool Signal() override { + auto lock = std::unique_lock(mutex_); signal_ = true; - pthread_cond_broadcast(&cond_); - pthread_mutex_unlock(&mutex_); + if (manual_reset_) { + cond_.notify_all(); + } else { + // FIXME(bwrsandman): Potential cause for deadlock + // See issue #1678 for possible fix and discussion + cond_.notify_one(); + } + return true; } void Reset() { - pthread_mutex_lock(&mutex_); + auto lock = std::unique_lock(mutex_); signal_ = false; - pthread_mutex_unlock(&mutex_); - } - - bool Wait(unsigned int timeout_ms) { - // Assume 0 means no timeout, not instant timeout - if (timeout_ms == 0) { - Wait(); - } - struct timespec time_to_wait; - struct timeval now; - gettimeofday(&now, NULL); - - // Add the number of seconds we want to wait to the current time - time_to_wait.tv_sec = now.tv_sec + (timeout_ms / 1000); - // Add the number of nanoseconds we want to wait to the current nanosecond - // stride - long nsec = (now.tv_usec + (timeout_ms % 1000)) * 1000; - // If we overflowed the nanosecond count then we add a second - time_to_wait.tv_sec += nsec / 1000000000UL; - // We only add nanoseconds within the 1 second stride - time_to_wait.tv_nsec = nsec % 1000000000UL; - pthread_mutex_lock(&mutex_); - while (!signal_) { - int status = pthread_cond_timedwait(&cond_, &mutex_, &time_to_wait); - if (status == ETIMEDOUT) return false; // We timed out - } - pthread_mutex_unlock(&mutex_); - return true; // We didn't time out - } - - bool Wait() { - pthread_mutex_lock(&mutex_); - while (!signal_) { - pthread_cond_wait(&cond_, &mutex_); - } - pthread_mutex_unlock(&mutex_); - return true; // Did not time out; } private: + inline bool signaled() const override { return signal_; } + inline void post_execution() override { + if (!manual_reset_) { + signal_ = false; + } + } bool signal_; - pthread_cond_t cond_; - pthread_mutex_t mutex_; + const bool manual_reset_; }; -// Native posix thread handle -template -class PosixThreadHandle : public T { +template <> +class PosixCondition : public PosixConditionBase { public: - explicit PosixThreadHandle(pthread_t handle) : handle_(handle) {} - ~PosixThreadHandle() override {} + PosixCondition(uint32_t initial_count, uint32_t maximum_count) + : count_(initial_count), maximum_count_(maximum_count) {} - protected: - void* native_handle() const override { - return reinterpret_cast(handle_); + bool Signal() override { return Release(1, nullptr); } + + bool Release(uint32_t release_count, int* out_previous_count) { + if (maximum_count_ - count_ >= release_count) { + auto lock = std::unique_lock(mutex_); + if (out_previous_count) *out_previous_count = count_; + count_ += release_count; + cond_.notify_all(); + return true; + } + return false; } - pthread_t handle_; + private: + inline bool signaled() const override { return count_ > 0; } + inline void post_execution() override { + count_--; + cond_.notify_all(); + } + uint32_t count_; + const uint32_t maximum_count_; }; -// This is wraps a condition object as our handle because posix has no single -// native handle for higher level concurrency constructs such as semaphores -template -class PosixConditionHandle : public T { +template <> +class PosixCondition : public PosixConditionBase { public: - ~PosixConditionHandle() override {} - - protected: - 
void* native_handle() const override { - return reinterpret_cast(const_cast(&handle_)); + explicit PosixCondition(bool initial_owner) : count_(0) { + if (initial_owner) { + count_ = 1; + owner_ = std::this_thread::get_id(); + } } - PosixCondition handle_; + bool Signal() override { return Release(); } + + bool Release() { + if (owner_ == std::this_thread::get_id() && count_ > 0) { + auto lock = std::unique_lock(mutex_); + --count_; + // Free to be acquired by another thread + if (count_ == 0) { + cond_.notify_one(); + } + return true; + } + return false; + } + + void* native_handle() const override { return mutex_.native_handle(); } + + private: + inline bool signaled() const override { + return count_ == 0 || owner_ == std::this_thread::get_id(); + } + inline void post_execution() override { + count_++; + owner_ = std::this_thread::get_id(); + } + uint32_t count_; + std::thread::id owner_; }; -template -class PosixFdHandle : public T { +template <> +class PosixCondition : public PosixConditionBase { public: - explicit PosixFdHandle(intptr_t handle) : handle_(handle) {} - ~PosixFdHandle() override { - close(handle_); - handle_ = 0; + explicit PosixCondition(bool manual_reset) + : callback_(), + timer_(nullptr), + signal_(false), + manual_reset_(manual_reset) {} + + virtual ~PosixCondition() { Cancel(); } + + bool Signal() override { + CompletionRoutine(); + return true; } - protected: - void* native_handle() const override { - return reinterpret_cast(handle_); - } + // TODO(bwrsandman): due_times of under 1ms deadlock under travis + bool Set(std::chrono::nanoseconds due_time, std::chrono::milliseconds period, + std::function opt_callback = nullptr) { + std::lock_guard lock(mutex_); - intptr_t handle_; -}; + callback_ = std::move(opt_callback); + signal_ = false; -// TODO(dougvj) -WaitResult Wait(WaitHandle* wait_handle, bool is_alertable, - std::chrono::milliseconds timeout) { - intptr_t handle = reinterpret_cast(wait_handle->native_handle()); - - fd_set set; - struct timeval time_val; - int ret; - - FD_ZERO(&set); - FD_SET(handle, &set); - - time_val.tv_sec = timeout.count() / 1000; - time_val.tv_usec = timeout.count() * 1000; - ret = select(handle + 1, &set, NULL, NULL, &time_val); - if (ret == -1) { - return WaitResult::kFailed; - } else if (ret == 0) { - return WaitResult::kTimeout; - } else { - uint64_t buf = 0; - ret = read(handle, &buf, sizeof(buf)); - if (ret < 8) { - return WaitResult::kTimeout; + // Create timer + if (timer_ == nullptr) { + sigevent sev{}; + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = GetSystemSignal(SignalType::kTimer); + sev.sigev_value.sival_ptr = this; + if (timer_create(CLOCK_REALTIME, &sev, &timer_) == -1) return false; } - return WaitResult::kSuccess; + // Start timer + itimerspec its{}; + its.it_value = DurationToTimeSpec(due_time); + its.it_interval = DurationToTimeSpec(period); + return timer_settime(timer_, 0, &its, nullptr) == 0; } + + void CompletionRoutine() { + // As the callback may reset the timer, store local. 
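
The timer path above arms a per-process POSIX timer whose expiry is delivered as a realtime signal carrying the object pointer in `sival_ptr`. A minimal standalone sketch of that mechanism follows; the signal number and names are illustrative only, not Xenia's (link with `-lrt` on older glibc):

```cpp
#include <csignal>
#include <cstdio>
#include <ctime>
#include <unistd.h>

namespace {
const int kTimerSignal = SIGRTMIN + 2;  // arbitrary realtime signal for the sketch

void TimerHandler(int /*sig*/, siginfo_t* info, void* /*ctx*/) {
  // sival_ptr round-trips whatever was stored in sigev_value.sival_ptr.
  auto* fired = static_cast<volatile sig_atomic_t*>(info->si_value.sival_ptr);
  *fired = 1;
}
}  // namespace

int main() {
  static volatile sig_atomic_t fired = 0;

  struct sigaction sa {};
  sa.sa_flags = SA_SIGINFO;
  sa.sa_sigaction = TimerHandler;
  sigemptyset(&sa.sa_mask);
  sigaction(kTimerSignal, &sa, nullptr);

  // Deliver expiry as a signal, with a pointer back to the interested object.
  sigevent sev{};
  sev.sigev_notify = SIGEV_SIGNAL;
  sev.sigev_signo = kTimerSignal;
  sev.sigev_value.sival_ptr = const_cast<sig_atomic_t*>(&fired);

  timer_t timer;
  timer_create(CLOCK_REALTIME, &sev, &timer);

  itimerspec its{};
  its.it_value.tv_nsec = 50 * 1000 * 1000;  // one-shot, 50 ms from now
  timer_settime(timer, 0, &its, nullptr);

  while (!fired) pause();  // wait for the signal handler to run
  std::puts("timer fired");
  timer_delete(timer);
  return 0;
}
```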
+ std::function callback; + { + std::lock_guard lock(mutex_); + // Store callback + if (callback_) callback = callback_; + signal_ = true; + if (manual_reset_) { + cond_.notify_all(); + } else { + cond_.notify_one(); + } + } + // Call callback + if (callback) callback(); + } + + bool Cancel() { + std::lock_guard lock(mutex_); + bool result = true; + if (timer_) { + result = timer_delete(timer_) == 0; + timer_ = nullptr; + } + return result; + } + + void* native_handle() const override { + return reinterpret_cast(timer_); + } + + private: + inline bool signaled() const override { return signal_; } + inline void post_execution() override { + if (!manual_reset_) { + signal_ = false; + } + } + std::function callback_; + timer_t timer_; + volatile bool signal_; + const bool manual_reset_; +}; + +struct ThreadStartData { + std::function start_routine; + bool create_suspended; + Thread* thread_obj; +}; + +template <> +class PosixCondition : public PosixConditionBase { + enum class State { + kUninitialized, + kRunning, + kSuspended, + kFinished, + }; + + public: + PosixCondition() + : thread_(0), + signaled_(false), + exit_code_(0), + state_(State::kUninitialized), + suspend_count_(0) {} + bool Initialize(Thread::CreationParameters params, + ThreadStartData* start_data) { + start_data->create_suspended = params.create_suspended; + pthread_attr_t attr; + if (pthread_attr_init(&attr) != 0) return false; + if (pthread_attr_setstacksize(&attr, params.stack_size) != 0) { + pthread_attr_destroy(&attr); + return false; + } + if (params.initial_priority != 0) { + sched_param sched{}; + sched.sched_priority = params.initial_priority + 1; + if (pthread_attr_setschedpolicy(&attr, SCHED_FIFO) != 0) { + pthread_attr_destroy(&attr); + return false; + } + if (pthread_attr_setschedparam(&attr, &sched) != 0) { + pthread_attr_destroy(&attr); + return false; + } + } + if (pthread_create(&thread_, &attr, ThreadStartRoutine, start_data) != 0) { + return false; + } + pthread_attr_destroy(&attr); + return true; + } + + /// Constructor for existing thread. 
This should only happen once called by + /// Thread::GetCurrentThread() on the main thread + explicit PosixCondition(pthread_t thread) + : thread_(thread), + signaled_(false), + exit_code_(0), + state_(State::kRunning) {} + + virtual ~PosixCondition() { + if (thread_ && !signaled_) { + if (pthread_cancel(thread_) != 0) { + assert_always(); + } + if (pthread_join(thread_, nullptr) != 0) { + assert_always(); + } + } + } + + bool Signal() override { return true; } + + std::string name() const { + WaitStarted(); + auto result = std::array{'\0'}; + std::unique_lock lock(state_mutex_); + if (state_ != State::kUninitialized && state_ != State::kFinished) { + if (pthread_getname_np(thread_, result.data(), result.size() - 1) != 0) + assert_always(); + } + return std::string(result.data()); + } + + void set_name(const std::string& name) { + WaitStarted(); + std::unique_lock lock(state_mutex_); + if (state_ != State::kUninitialized && state_ != State::kFinished) { + threading::set_name(static_cast(thread_), + name); + } + } + + uint32_t system_id() const { return static_cast(thread_); } + + uint64_t affinity_mask() { + WaitStarted(); + cpu_set_t cpu_set; + if (pthread_getaffinity_np(thread_, sizeof(cpu_set_t), &cpu_set) != 0) + assert_always(); + uint64_t result = 0; + auto cpu_count = std::min(CPU_SETSIZE, 64); + for (auto i = 0u; i < cpu_count; i++) { + auto set = CPU_ISSET(i, &cpu_set); + result |= set << i; + } + return result; + } + + void set_affinity_mask(uint64_t mask) { + WaitStarted(); + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + for (auto i = 0u; i < 64; i++) { + if (mask & (1 << i)) { + CPU_SET(i, &cpu_set); + } + } + if (pthread_setaffinity_np(thread_, sizeof(cpu_set_t), &cpu_set) != 0) { + assert_always(); + } + } + + int priority() { + WaitStarted(); + int policy; + sched_param param{}; + int ret = pthread_getschedparam(thread_, &policy, ¶m); + if (ret != 0) { + return -1; + } + + return param.sched_priority; + } + + void set_priority(int new_priority) { + WaitStarted(); + sched_param param{}; + param.sched_priority = new_priority; + if (pthread_setschedparam(thread_, SCHED_FIFO, ¶m) != 0) + assert_always(); + } + + void QueueUserCallback(std::function callback) { + WaitStarted(); + std::unique_lock lock(callback_mutex_); + user_callback_ = std::move(callback); + sigval value{}; + value.sival_ptr = this; + pthread_sigqueue(thread_, GetSystemSignal(SignalType::kThreadUserCallback), + value); + } + + void CallUserCallback() { + std::unique_lock lock(callback_mutex_); + user_callback_(); + } + + bool Resume(uint32_t* out_previous_suspend_count = nullptr) { + if (out_previous_suspend_count) { + *out_previous_suspend_count = 0; + } + WaitStarted(); + std::unique_lock lock(state_mutex_); + if (state_ != State::kSuspended) return false; + if (out_previous_suspend_count) { + *out_previous_suspend_count = suspend_count_; + } + --suspend_count_; + state_signal_.notify_all(); + return true; + } + + bool Suspend(uint32_t* out_previous_suspend_count = nullptr) { + if (out_previous_suspend_count) { + *out_previous_suspend_count = 0; + } + WaitStarted(); + { + if (out_previous_suspend_count) { + *out_previous_suspend_count = suspend_count_; + } + state_ = State::kSuspended; + ++suspend_count_; + } + int result = + pthread_kill(thread_, GetSystemSignal(SignalType::kThreadSuspend)); + return result == 0; + } + + void Terminate(int exit_code) { + { + std::unique_lock lock(state_mutex_); + state_ = State::kFinished; + } + + std::lock_guard lock(mutex_); + + // Sometimes the thread can call terminate 
twice before stopping + if (thread_ == 0) return; + auto thread = thread_; + + exit_code_ = exit_code; + signaled_ = true; + cond_.notify_all(); + + if (pthread_cancel(thread) != 0) assert_always(); + } + + void WaitStarted() const { + std::unique_lock lock(state_mutex_); + state_signal_.wait(lock, + [this] { return state_ != State::kUninitialized; }); + } + + /// Set state to suspended and wait until it reset by another thread + void WaitSuspended() { + std::unique_lock lock(state_mutex_); + state_signal_.wait(lock, [this] { return suspend_count_ == 0; }); + state_ = State::kRunning; + } + + void* native_handle() const override { + return reinterpret_cast(thread_); + } + + private: + static void* ThreadStartRoutine(void* parameter); + inline bool signaled() const override { return signaled_; } + inline void post_execution() override { + if (thread_) { + pthread_join(thread_, nullptr); + thread_ = 0; + } + } + pthread_t thread_; + bool signaled_; + int exit_code_; + volatile State state_; + volatile uint32_t suspend_count_; + mutable std::mutex state_mutex_; + mutable std::mutex callback_mutex_; + mutable std::condition_variable state_signal_; + std::function user_callback_; +}; + +class PosixWaitHandle { + public: + virtual PosixConditionBase& condition() = 0; +}; + +// This wraps a condition object as our handle because posix has no single +// native handle for higher level concurrency constructs such as semaphores +template +class PosixConditionHandle : public T, public PosixWaitHandle { + public: + PosixConditionHandle() = default; + explicit PosixConditionHandle(bool); + explicit PosixConditionHandle(pthread_t thread); + PosixConditionHandle(bool manual_reset, bool initial_state); + PosixConditionHandle(uint32_t initial_count, uint32_t maximum_count); + ~PosixConditionHandle() override = default; + + PosixConditionBase& condition() override { return handle_; } + void* native_handle() const override { return handle_.native_handle(); } + + protected: + PosixCondition handle_; + friend PosixCondition; +}; + +template <> +PosixConditionHandle::PosixConditionHandle(uint32_t initial_count, + uint32_t maximum_count) + : handle_(initial_count, maximum_count) {} + +template <> +PosixConditionHandle::PosixConditionHandle(bool initial_owner) + : handle_(initial_owner) {} + +template <> +PosixConditionHandle::PosixConditionHandle(bool manual_reset) + : handle_(manual_reset) {} + +template <> +PosixConditionHandle::PosixConditionHandle(bool manual_reset, + bool initial_state) + : handle_(manual_reset, initial_state) {} + +template <> +PosixConditionHandle::PosixConditionHandle(pthread_t thread) + : handle_(thread) {} + +WaitResult Wait(WaitHandle* wait_handle, bool is_alertable, + std::chrono::milliseconds timeout) { + auto posix_wait_handle = dynamic_cast(wait_handle); + if (posix_wait_handle == nullptr) { + return WaitResult::kFailed; + } + if (is_alertable) alertable_state_ = true; + auto result = posix_wait_handle->condition().Wait(timeout); + if (is_alertable) alertable_state_ = false; + return result; } -// TODO(dougvj) WaitResult SignalAndWait(WaitHandle* wait_handle_to_signal, WaitHandle* wait_handle_to_wait_on, bool is_alertable, std::chrono::milliseconds timeout) { - assert_always(); - return WaitResult::kFailed; + auto result = WaitResult::kFailed; + auto posix_wait_handle_to_signal = + dynamic_cast(wait_handle_to_signal); + auto posix_wait_handle_to_wait_on = + dynamic_cast(wait_handle_to_wait_on); + if (posix_wait_handle_to_signal == nullptr || + posix_wait_handle_to_wait_on == 
nullptr) { + return WaitResult::kFailed; + } + if (is_alertable) alertable_state_ = true; + if (posix_wait_handle_to_signal->condition().Signal()) { + result = posix_wait_handle_to_wait_on->condition().Wait(timeout); + } + if (is_alertable) alertable_state_ = false; + return result; } -// TODO(dougvj) std::pair WaitMultiple(WaitHandle* wait_handles[], size_t wait_handle_count, bool wait_all, bool is_alertable, std::chrono::milliseconds timeout) { - assert_always(); - return std::pair(WaitResult::kFailed, 0); + std::vector conditions; + conditions.reserve(wait_handle_count); + for (size_t i = 0u; i < wait_handle_count; ++i) { + auto handle = dynamic_cast(wait_handles[i]); + if (handle == nullptr) { + return std::make_pair(WaitResult::kFailed, 0); + } + conditions.push_back(&handle->condition()); + } + if (is_alertable) alertable_state_ = true; + auto result = PosixConditionBase::WaitMultiple(std::move(conditions), + wait_all, timeout); + if (is_alertable) alertable_state_ = false; + return result; } -// TODO(dougvj) -class PosixEvent : public PosixFdHandle { +class PosixEvent : public PosixConditionHandle { public: - PosixEvent(intptr_t fd) : PosixFdHandle(fd) {} + PosixEvent(bool manual_reset, bool initial_state) + : PosixConditionHandle(manual_reset, initial_state) {} ~PosixEvent() override = default; - void Set() override { - uint64_t buf = 1; - write(handle_, &buf, sizeof(buf)); + void Set() override { handle_.Signal(); } + void Reset() override { handle_.Reset(); } + void Pulse() override { + using namespace std::chrono_literals; + handle_.Signal(); + MaybeYield(); + Sleep(10us); + handle_.Reset(); } - void Reset() override { assert_always(); } - void Pulse() override { assert_always(); } - - private: - PosixCondition condition_; }; std::unique_ptr Event::CreateManualResetEvent(bool initial_state) { - // Linux's eventfd doesn't appear to support manual reset natively. - return nullptr; + return std::make_unique(true, initial_state); } std::unique_ptr Event::CreateAutoResetEvent(bool initial_state) { - int fd = eventfd(initial_state ? 
1 : 0, EFD_CLOEXEC); - if (fd == -1) { - return nullptr; - } - - return std::make_unique(PosixEvent(fd)); + return std::make_unique(false, initial_state); } -// TODO(dougvj) class PosixSemaphore : public PosixConditionHandle { public: - PosixSemaphore(int initial_count, int maximum_count) { assert_always(); } + PosixSemaphore(int initial_count, int maximum_count) + : PosixConditionHandle(static_cast(initial_count), + static_cast(maximum_count)) {} ~PosixSemaphore() override = default; bool Release(int release_count, int* out_previous_count) override { - assert_always(); - return false; + if (release_count < 1) { + return false; + } + return handle_.Release(static_cast(release_count), + out_previous_count); } }; @@ -314,149 +848,210 @@ std::unique_ptr Semaphore::Create(int initial_count, return std::make_unique(initial_count, maximum_count); } -// TODO(dougvj) class PosixMutant : public PosixConditionHandle { public: - PosixMutant(bool initial_owner) { assert_always(); } - ~PosixMutant() = default; - bool Release() override { - assert_always(); - return false; - } + explicit PosixMutant(bool initial_owner) + : PosixConditionHandle(initial_owner) {} + ~PosixMutant() override = default; + bool Release() override { return handle_.Release(); } }; std::unique_ptr Mutant::Create(bool initial_owner) { return std::make_unique(initial_owner); } -// TODO(dougvj) class PosixTimer : public PosixConditionHandle { public: - PosixTimer(bool manual_reset) { assert_always(); } - ~PosixTimer() = default; + explicit PosixTimer(bool manual_reset) : PosixConditionHandle(manual_reset) {} + ~PosixTimer() override = default; bool SetOnce(std::chrono::nanoseconds due_time, std::function opt_callback) override { - assert_always(); - return false; + return handle_.Set(due_time, std::chrono::milliseconds::zero(), + std::move(opt_callback)); } bool SetRepeating(std::chrono::nanoseconds due_time, std::chrono::milliseconds period, std::function opt_callback) override { - assert_always(); - return false; - } - bool Cancel() override { - assert_always(); - return false; + return handle_.Set(due_time, period, std::move(opt_callback)); } + bool Cancel() override { return handle_.Cancel(); } }; std::unique_ptr Timer::CreateManualResetTimer() { + install_signal_handler(SignalType::kTimer); return std::make_unique(true); } std::unique_ptr Timer::CreateSynchronizationTimer() { + install_signal_handler(SignalType::kTimer); return std::make_unique(false); } -class PosixThread : public PosixThreadHandle { +class PosixThread : public PosixConditionHandle { public: - explicit PosixThread(pthread_t handle) : PosixThreadHandle(handle) {} - ~PosixThread() = default; + PosixThread() = default; + explicit PosixThread(pthread_t thread) : PosixConditionHandle(thread) {} + ~PosixThread() override = default; + + bool Initialize(CreationParameters params, + std::function start_routine) { + auto start_data = + new ThreadStartData({std::move(start_routine), false, this}); + return handle_.Initialize(params, start_data); + } void set_name(std::string name) override { - pthread_setname_np(handle_, name.c_str()); - } - - uint32_t system_id() const override { return 0; } - - // TODO(DrChat) - uint64_t affinity_mask() override { return 0; } - void set_affinity_mask(uint64_t mask) override { assert_always(); } - - int priority() override { - int policy; - struct sched_param param; - int ret = pthread_getschedparam(handle_, &policy, ¶m); - if (ret != 0) { - return -1; + handle_.WaitStarted(); + Thread::set_name(name); + if (name.length() > 15) { + name 
= name.substr(0, 15); } - - return param.sched_priority; + handle_.set_name(name); } + uint32_t system_id() const override { return handle_.system_id(); } + + uint64_t affinity_mask() override { return handle_.affinity_mask(); } + void set_affinity_mask(uint64_t mask) override { + handle_.set_affinity_mask(mask); + } + + int priority() override { return handle_.priority(); } void set_priority(int new_priority) override { - struct sched_param param; - param.sched_priority = new_priority; - int ret = pthread_setschedparam(handle_, SCHED_FIFO, ¶m); + handle_.set_priority(new_priority); } - // TODO(DrChat) void QueueUserCallback(std::function callback) override { - assert_always(); + handle_.QueueUserCallback(std::move(callback)); } - bool Resume(uint32_t* out_new_suspend_count = nullptr) override { - assert_always(); - return false; + bool Resume(uint32_t* out_previous_suspend_count) override { + return handle_.Resume(out_previous_suspend_count); } - bool Suspend(uint32_t* out_previous_suspend_count = nullptr) override { - assert_always(); - return false; + bool Suspend(uint32_t* out_previous_suspend_count) override { + return handle_.Suspend(out_previous_suspend_count); } - void Terminate(int exit_code) override {} + void Terminate(int exit_code) override { handle_.Terminate(exit_code); } + + void WaitSuspended() { handle_.WaitSuspended(); } }; -thread_local std::unique_ptr current_thread_ = nullptr; +thread_local PosixThread* current_thread_ = nullptr; -struct ThreadStartData { - std::function start_routine; -}; -void* ThreadStartRoutine(void* parameter) { - current_thread_ = - std::unique_ptr(new PosixThread(::pthread_self())); +void* PosixCondition::ThreadStartRoutine(void* parameter) { + if (pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr) != 0) { + assert_always(); + } + threading::set_name(""); - auto start_data = reinterpret_cast(parameter); - start_data->start_routine(); + auto start_data = static_cast(parameter); + assert_not_null(start_data); + assert_not_null(start_data->thread_obj); + + auto thread = dynamic_cast(start_data->thread_obj); + auto start_routine = std::move(start_data->start_routine); + auto create_suspended = start_data->create_suspended; delete start_data; - return 0; + + current_thread_ = thread; + { + std::unique_lock lock(thread->handle_.state_mutex_); + thread->handle_.state_ = + create_suspended ? State::kSuspended : State::kRunning; + thread->handle_.state_signal_.notify_all(); + } + + if (create_suspended) { + std::unique_lock lock(thread->handle_.state_mutex_); + thread->handle_.suspend_count_ = 1; + thread->handle_.state_signal_.wait( + lock, [thread] { return thread->handle_.suspend_count_ == 0; }); + } + + start_routine(); + + { + std::unique_lock lock(thread->handle_.state_mutex_); + thread->handle_.state_ = State::kFinished; + } + + std::unique_lock lock(mutex_); + thread->handle_.exit_code_ = 0; + thread->handle_.signaled_ = true; + cond_.notify_all(); + + current_thread_ = nullptr; + return nullptr; } std::unique_ptr Thread::Create(CreationParameters params, std::function start_routine) { - auto start_data = new ThreadStartData({std::move(start_routine)}); - - assert_false(params.create_suspended); - pthread_t handle; - pthread_attr_t attr; - pthread_attr_init(&attr); - int ret = pthread_create(&handle, &attr, ThreadStartRoutine, start_data); - if (ret != 0) { - // TODO(benvanik): pass back? 
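
The `ThreadStartRoutine` above implements "create suspended" as a handshake: the new thread parks on a condition variable until another thread drops its suspend count. A reduced sketch of that pattern using standard primitives (illustrative, not Xenia's API):

```cpp
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

class SuspendedStart {
 public:
  void Park() {  // called first thing on the new thread
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return suspend_count_ == 0; });
  }
  void Resume() {  // called by the creator once the thread may run
    {
      std::lock_guard<std::mutex> lock(mutex_);
      if (suspend_count_ > 0) --suspend_count_;
    }
    cv_.notify_all();
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  unsigned suspend_count_ = 1;  // starts suspended
};

int main() {
  SuspendedStart gate;
  std::thread worker([&gate] {
    gate.Park();  // blocks here until Resume()
    std::puts("worker running");
  });
  // ... creator finishes setting the thread object up here ...
  gate.Resume();
  worker.join();
  return 0;
}
```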
- auto last_error = errno; - XELOGE("Unable to pthread_create: {}", last_error); - delete start_data; - return nullptr; - } - - return std::unique_ptr(new PosixThread(handle)); + install_signal_handler(SignalType::kThreadSuspend); + install_signal_handler(SignalType::kThreadUserCallback); + auto thread = std::make_unique(); + if (!thread->Initialize(params, std::move(start_routine))) return nullptr; + assert_not_null(thread); + return thread; } Thread* Thread::GetCurrentThread() { if (current_thread_) { - return current_thread_.get(); + return current_thread_; } + // Should take this route only for threads not created by Thread::Create. + // The only thread not created by Thread::Create should be the main thread. pthread_t handle = pthread_self(); - current_thread_ = std::make_unique(handle); - return current_thread_.get(); + current_thread_ = new PosixThread(handle); + atexit([] { delete current_thread_; }); + + return current_thread_; } void Thread::Exit(int exit_code) { - pthread_exit(reinterpret_cast(exit_code)); + if (current_thread_) { + current_thread_->Terminate(exit_code); + // Sometimes the current thread keeps running after being cancelled. + // Prevent other calls from this thread from using current_thread_. + current_thread_ = nullptr; + } else { + // Should only happen with the main thread + pthread_exit(reinterpret_cast(exit_code)); + } +} + +static void signal_handler(int signal, siginfo_t* info, void* /*context*/) { + switch (GetSystemSignalType(signal)) { + case SignalType::kHighResolutionTimer: { + assert_not_null(info->si_value.sival_ptr); + auto callback = + *static_cast*>(info->si_value.sival_ptr); + callback(); + } break; + case SignalType::kTimer: { + assert_not_null(info->si_value.sival_ptr); + auto pTimer = + static_cast*>(info->si_value.sival_ptr); + pTimer->CompletionRoutine(); + } break; + case SignalType::kThreadSuspend: { + assert_not_null(current_thread_); + current_thread_->WaitSuspended(); + } break; + case SignalType::kThreadUserCallback: { + assert_not_null(info->si_value.sival_ptr); + auto p_thread = + static_cast*>(info->si_value.sival_ptr); + if (alertable_state_) { + p_thread->CallUserCallback(); + } + } break; + default: + assert_always(); + } } } // namespace threading diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc index 605c2ccbf..6b4e31a99 100644 --- a/src/xenia/base/threading_win.cc +++ b/src/xenia/base/threading_win.cc @@ -388,16 +388,16 @@ class Win32Thread : public Win32Handle { QueueUserAPC(DispatchApc, handle_, reinterpret_cast(apc_data)); } - bool Resume(uint32_t* out_new_suspend_count = nullptr) override { - if (out_new_suspend_count) { - *out_new_suspend_count = 0; + bool Resume(uint32_t* out_previous_suspend_count = nullptr) override { + if (out_previous_suspend_count) { + *out_previous_suspend_count = 0; } DWORD result = ResumeThread(handle_); if (result == UINT_MAX) { return false; } - if (out_new_suspend_count) { - *out_new_suspend_count = result; + if (out_previous_suspend_count) { + *out_previous_suspend_count = result; } return true; } diff --git a/src/xenia/cpu/export_resolver.cc b/src/xenia/cpu/export_resolver.cc index ecc5d8246..b05df5d83 100644 --- a/src/xenia/cpu/export_resolver.cc +++ b/src/xenia/cpu/export_resolver.cc @@ -30,7 +30,7 @@ ExportResolver::Table::Table(const std::string_view module_name, } std::sort( exports_by_name_.begin(), exports_by_name_.end(), - [](Export* a, Export* b) { return std::strcmp(a->name, b->name) <= 0; }); + [](Export* a, Export* b) { return std::strcmp(a->name, 
b->name) < 0; }); } ExportResolver::ExportResolver() = default; @@ -51,7 +51,7 @@ void ExportResolver::RegisterTable( } std::sort( all_exports_by_name_.begin(), all_exports_by_name_.end(), - [](Export* a, Export* b) { return std::strcmp(a->name, b->name) <= 0; }); + [](Export* a, Export* b) { return std::strcmp(a->name, b->name) < 0; }); } Export* ExportResolver::GetExportByOrdinal(const std::string_view module_name, diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 4f5e7f96b..69a94c7f4 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -73,7 +73,7 @@ bool CommandProcessor::Initialize( WorkerThreadMain(); return 0; })); - worker_thread_->set_name("GraphicsSystem Command Processor"); + worker_thread_->set_name("GPU Commands"); worker_thread_->Create(); return true; @@ -731,12 +731,20 @@ bool CommandProcessor::ExecutePacketType3(RingBuffer* reader, uint32_t packet) { } break; case PM4_CONTEXT_UPDATE: { assert_true(count == 1); - uint64_t value = reader->ReadAndSwap(); + uint32_t value = reader->ReadAndSwap(); XELOGGPU("GPU context update = {:08X}", value); assert_true(value == 0); result = true; break; } + case PM4_WAIT_FOR_IDLE: { + // This opcode is used by "Duke Nukem Forever" while going/being ingame + assert_true(count == 1); + uint32_t value = reader->ReadAndSwap(); + XELOGGPU("GPU wait for idle = {:08X}", value); + result = true; + break; + } default: XELOGGPU("Unimplemented GPU OPCODE: 0x{:02X}\t\tCOUNT: {}\n", opcode, diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index f0be8c50e..8db6f1626 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -21,6 +21,7 @@ #include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_shader.h" +#include "xenia/gpu/draw_util.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/xenos.h" #include "xenia/ui/d3d12/d3d12_util.h" @@ -387,7 +388,7 @@ ID3D12RootSignature* D3D12CommandProcessor::GetRootSignature( sampler_count_vertex); return nullptr; } - root_signatures_bindful_.insert({index, root_signature}); + root_signatures_bindful_.emplace(index, root_signature); return root_signature; } @@ -745,12 +746,11 @@ void D3D12CommandProcessor::SetSamplePositions( current_sample_positions_ = sample_positions; } -void D3D12CommandProcessor::SetComputePipelineState( - ID3D12PipelineState* pipeline_state) { - if (current_external_pipeline_state_ != pipeline_state) { - deferred_command_list_.D3DSetPipelineState(pipeline_state); - current_external_pipeline_state_ = pipeline_state; - current_cached_pipeline_state_ = nullptr; +void D3D12CommandProcessor::SetComputePipeline(ID3D12PipelineState* pipeline) { + if (current_external_pipeline_ != pipeline) { + deferred_command_list_.D3DSetPipelineState(pipeline); + current_external_pipeline_ = pipeline; + current_cached_pipeline_ = nullptr; } } @@ -773,8 +773,16 @@ std::string D3D12CommandProcessor::GetWindowTitleText() const { } // Currently scaling is only supported with ROV. if (texture_cache_ != nullptr && texture_cache_->IsResolutionScale2X()) { - return "Direct3D 12 - 2x"; + return "Direct3D 12 - ROV 2x"; } + // Rasterizer-ordered views are a feature very rarely used as of 2020 and + // that faces adoption complications (outside of Direct3D - on Vulkan - at + // least), but crucial to Xenia - raise awareness of its usage. 
+ // https://github.com/KhronosGroup/Vulkan-Ecosystem/issues/27#issuecomment-455712319 + // "In Xenia's title bar "D3D12 ROV" can be seen, which was a surprise, as I + // wasn't aware that Xenia D3D12 backend was using Raster Order Views + // feature" - oscarbg in that issue. + return "Direct3D 12 - ROV"; } return "Direct3D 12"; } @@ -1196,7 +1204,7 @@ bool D3D12CommandProcessor::SetupContext() { *this, *register_file_, bindless_resources_used_, edram_rov_used_, texture_cache_->IsResolutionScale2X() ? 2 : 1); if (!pipeline_cache_->Initialize()) { - XELOGE("Failed to initialize the graphics pipeline state cache"); + XELOGE("Failed to initialize the graphics pipeline cache"); return false; } @@ -1526,8 +1534,7 @@ void D3D12CommandProcessor::ShutdownContext() { // Shut down binding - bindless descriptors may be owned by subsystems like // the texture cache. - // Root signatured are used by pipeline states, thus freed after the pipeline - // states. + // Root signatures are used by pipelines, thus freed after the pipelines. ui::d3d12::util::ReleaseAndNull(root_signature_bindless_ds_); ui::d3d12::util::ReleaseAndNull(root_signature_bindless_vs_); for (auto it : root_signatures_bindful_) { @@ -1878,7 +1885,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, xenos::VertexShaderExportMode::kMultipass || (primitive_two_faced && pa_su_sc_mode_cntl.cull_front && pa_su_sc_mode_cntl.cull_back))) { - // All faces are culled - can't be expressed in the pipeline state. + // All faces are culled - can't be expressed in the pipeline. return true; } @@ -1954,7 +1961,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, line_loop_closing_index = 0; } - // Update the textures - this may bind pipeline state objects. + // Update the textures - this may bind pipelines. uint32_t used_texture_mask = vertex_shader->GetUsedTextureMask() | (pixel_shader != nullptr ? pixel_shader->GetUsedTextureMask() : 0); @@ -1972,21 +1979,21 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, early_z = true; } - // Create the pipeline state object if needed and bind it. - void* pipeline_state_handle; + // Create the pipeline if needed and bind it. + void* pipeline_handle; ID3D12RootSignature* root_signature; if (!pipeline_cache_->ConfigurePipeline( vertex_shader, pixel_shader, primitive_type_converted, indexed ? index_buffer_info->format : xenos::IndexFormat::kInt16, - early_z, pipeline_render_targets, &pipeline_state_handle, + early_z, pipeline_render_targets, &pipeline_handle, &root_signature)) { return false; } - if (current_cached_pipeline_state_ != pipeline_state_handle) { + if (current_cached_pipeline_ != pipeline_handle) { deferred_command_list_.SetPipelineStateHandle( - reinterpret_cast(pipeline_state_handle)); - current_cached_pipeline_state_ = pipeline_state_handle; - current_external_pipeline_state_ = nullptr; + reinterpret_cast(pipeline_handle)); + current_cached_pipeline_ = pipeline_handle; + current_external_pipeline_ = nullptr; } // Update viewport, scissor, blend factor and stencil reference. @@ -2005,14 +2012,15 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } // Must not call anything that can change the descriptor heap from now on! - // Ensure vertex and index buffers are resident and draw. + // Ensure vertex buffers are resident. // TODO(Triang3l): Cache residency for ranges in a way similar to how texture - // validity will be tracked. + // validity is tracked. 
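
The `export_resolver.cc` comparator change above (`strcmp(...) <= 0` to `strcmp(...) < 0`) is a correctness fix, not a style one: `std::sort` requires a strict weak ordering, so `comp(x, x)` must be false, which `<= 0` violates for duplicate names and is undefined behaviour. A minimal illustration (sample data only, not Xenia code):

```cpp
#include <algorithm>
#include <cstring>
#include <vector>

struct Export {
  const char* name;
};

int main() {
  std::vector<Export> exports = {{"B"}, {"A"}, {"A"}};  // duplicate key
  // Valid: strict ordering, comp(x, x) is false even for equal names.
  std::sort(exports.begin(), exports.end(),
            [](const Export& a, const Export& b) {
              return std::strcmp(a.name, b.name) < 0;
            });
  // With `<= 0`, comp(x, x) would be true for the duplicates, breaking the
  // strict-weak-ordering contract std::sort relies on.
  return 0;
}
```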
uint64_t vertex_buffers_resident[2] = {}; - for (const auto& vertex_binding : vertex_shader->vertex_bindings()) { + for (const Shader::VertexBinding& vertex_binding : + vertex_shader->vertex_bindings()) { uint32_t vfetch_index = vertex_binding.fetch_constant; if (vertex_buffers_resident[vfetch_index >> 6] & - (1ull << (vfetch_index & 63))) { + (uint64_t(1) << (vfetch_index & 63))) { continue; } const auto& vfetch_constant = regs.Get( @@ -2045,7 +2053,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, vfetch_constant.address << 2, vfetch_constant.size << 2); return false; } - vertex_buffers_resident[vfetch_index >> 6] |= 1ull << (vfetch_index & 63); + vertex_buffers_resident[vfetch_index >> 6] |= uint64_t(1) + << (vfetch_index & 63); } // Gather memexport ranges and ensure the heaps for them are resident, and @@ -2517,8 +2526,8 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { submission_open_ = true; // Start a new deferred command list - will submit it to the real one in the - // end of the submission (when async pipeline state object creation requests - // are fulfilled). + // end of the submission (when async pipeline creation requests are + // fulfilled). deferred_command_list_.Reset(); // Reset cached state of the command list. @@ -2527,8 +2536,8 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { ff_blend_factor_update_needed_ = true; ff_stencil_ref_update_needed_ = true; current_sample_positions_ = xenos::MsaaSamples::k1X; - current_cached_pipeline_state_ = nullptr; - current_external_pipeline_state_ = nullptr; + current_cached_pipeline_ = nullptr; + current_external_pipeline_ = nullptr; current_graphics_root_signature_ = nullptr; current_graphics_root_up_to_date_ = 0; if (bindless_resources_used_) { @@ -2724,7 +2733,7 @@ bool D3D12CommandProcessor::EndSubmission(bool is_swap) { } bool D3D12CommandProcessor::CanEndSubmissionImmediately() const { - return !submission_open_ || !pipeline_cache_->IsCreatingPipelineStates(); + return !submission_open_ || !pipeline_cache_->IsCreatingPipelines(); } void D3D12CommandProcessor::ClearCommandAllocatorCache() { @@ -2745,12 +2754,12 @@ void D3D12CommandProcessor::ClearCommandAllocatorCache() { } void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) { - auto& regs = *register_file_; - #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES + const RegisterFile& regs = *register_file_; + // Window parameters. // http://ftp.tku.edu.tw/NetBSD/NetBSD-current/xsrc/external/mit/xf86-video-ati/dist/src/r600_reg_auto_r6xx.h // See r200UpdateWindow: @@ -2838,34 +2847,20 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) { } // Scissor. 
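
The vertex-fetch residency tracking above packs one bit per fetch constant into two 64-bit words: `index >> 6` selects the word, `index & 63` the bit, and the shifted constant must be 64-bit (`uint64_t(1)` or `1ull`) or the upper half of each word would be unreachable. A compact sketch of that bookkeeping (not Xenia code):

```cpp
#include <cassert>
#include <cstdint>

// One bit per vertex fetch constant; two 64-bit words cover indices 0..127.
struct ResidencyBits {
  uint64_t words[2] = {};
  // Returns true if the index was already marked; marks it otherwise.
  bool TestAndSet(uint32_t index) {
    // The shifted 1 must be 64-bit; a plain `1 << n` is a 32-bit shift and
    // cannot reach bits 32..63 of the word.
    uint64_t bit = uint64_t(1) << (index & 63);
    uint64_t& word = words[index >> 6];
    if (word & bit) return true;
    word |= bit;
    return false;
  }
};

int main() {
  ResidencyBits resident;
  assert(!resident.TestAndSet(95));  // first touch: do the residency request
  assert(resident.TestAndSet(95));   // second touch: already handled
  return 0;
}
```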
- auto pa_sc_window_scissor_tl = regs.Get(); - auto pa_sc_window_scissor_br = regs.Get(); - D3D12_RECT scissor; - scissor.left = pa_sc_window_scissor_tl.tl_x; - scissor.top = pa_sc_window_scissor_tl.tl_y; - scissor.right = pa_sc_window_scissor_br.br_x; - scissor.bottom = pa_sc_window_scissor_br.br_y; - if (!pa_sc_window_scissor_tl.window_offset_disable) { - scissor.left = - std::max(scissor.left + pa_sc_window_offset.window_x_offset, LONG(0)); - scissor.top = - std::max(scissor.top + pa_sc_window_offset.window_y_offset, LONG(0)); - scissor.right = - std::max(scissor.right + pa_sc_window_offset.window_x_offset, LONG(0)); - scissor.bottom = - std::max(scissor.bottom + pa_sc_window_offset.window_y_offset, LONG(0)); - } - scissor.left *= pixel_size_x; - scissor.top *= pixel_size_y; - scissor.right *= pixel_size_x; - scissor.bottom *= pixel_size_y; - ff_scissor_update_needed_ |= ff_scissor_.left != scissor.left; - ff_scissor_update_needed_ |= ff_scissor_.top != scissor.top; - ff_scissor_update_needed_ |= ff_scissor_.right != scissor.right; - ff_scissor_update_needed_ |= ff_scissor_.bottom != scissor.bottom; + draw_util::Scissor scissor; + draw_util::GetScissor(regs, scissor); + D3D12_RECT scissor_rect; + scissor_rect.left = LONG(scissor.left * pixel_size_x); + scissor_rect.top = LONG(scissor.top * pixel_size_y); + scissor_rect.right = LONG((scissor.left + scissor.width) * pixel_size_x); + scissor_rect.bottom = LONG((scissor.top + scissor.height) * pixel_size_y); + ff_scissor_update_needed_ |= ff_scissor_.left != scissor_rect.left; + ff_scissor_update_needed_ |= ff_scissor_.top != scissor_rect.top; + ff_scissor_update_needed_ |= ff_scissor_.right != scissor_rect.right; + ff_scissor_update_needed_ |= ff_scissor_.bottom != scissor_rect.bottom; if (ff_scissor_update_needed_) { - ff_scissor_ = scissor; - deferred_command_list_.RSSetScissorRect(scissor); + ff_scissor_ = scissor_rect; + deferred_command_list_.RSSetScissorRect(scissor_rect); ff_scissor_update_needed_ = false; } @@ -2915,12 +2910,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( uint32_t line_loop_closing_index, xenos::Endian index_endian, uint32_t used_texture_mask, bool early_z, uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]) { - auto& regs = *register_file_; - #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES + const RegisterFile& regs = *register_file_; auto pa_cl_clip_cntl = regs.Get(); auto pa_cl_vte_cntl = regs.Get(); auto pa_su_point_minmax = regs.Get(); @@ -3103,14 +3097,14 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.line_loop_closing_index != line_loop_closing_index; system_constants_.line_loop_closing_index = line_loop_closing_index; - // Vertex index offset. - dirty |= system_constants_.vertex_base_index != vgt_indx_offset; - system_constants_.vertex_base_index = vgt_indx_offset; - // Index or tessellation edge factor buffer endianness. dirty |= system_constants_.vertex_index_endian != index_endian; system_constants_.vertex_index_endian = index_endian; + // Vertex index offset. + dirty |= system_constants_.vertex_base_index != vgt_indx_offset; + system_constants_.vertex_base_index = vgt_indx_offset; + // User clip planes (UCP_ENA_#), when not CLIP_DISABLE. 
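
The scissor hunk above moves the inline math into `draw_util::GetScissor`; the removed code shows the underlying rule: start from the window scissor registers, optionally add the window offset, clamp at zero, then scale by the resolution multiplier. A sketch of that arithmetic with plain integers in place of the register structs (an approximation of what the shared helper centralizes, not Xenia code):

```cpp
#include <algorithm>
#include <cstdint>

struct Rect {
  int32_t left, top, right, bottom;
};

Rect ComputeScissor(Rect window_scissor, bool window_offset_disable,
                    int32_t window_x_offset, int32_t window_y_offset,
                    int32_t pixel_size_x, int32_t pixel_size_y) {
  Rect r = window_scissor;
  if (!window_offset_disable) {
    // Apply the window offset, clamping so the rectangle never goes negative.
    r.left = std::max(r.left + window_x_offset, int32_t(0));
    r.top = std::max(r.top + window_y_offset, int32_t(0));
    r.right = std::max(r.right + window_x_offset, int32_t(0));
    r.bottom = std::max(r.bottom + window_y_offset, int32_t(0));
  }
  // Scale from guest pixels to host pixels (e.g. 2x with resolution scaling).
  r.left *= pixel_size_x;
  r.top *= pixel_size_y;
  r.right *= pixel_size_x;
  r.bottom *= pixel_size_y;
  return r;
}
```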
if (!pa_cl_clip_cntl.clip_disable) { for (uint32_t i = 0; i < 6; ++i) { @@ -3574,7 +3568,7 @@ bool D3D12CommandProcessor::UpdateBindings( float_constant_map_vertex.float_bitmap[i]; // If no float constants at all, we can reuse any buffer for them, so not // invalidating. - if (float_constant_map_vertex.float_count != 0) { + if (float_constant_count_vertex) { cbuffer_binding_float_vertex_.up_to_date = false; } } @@ -3589,7 +3583,7 @@ bool D3D12CommandProcessor::UpdateBindings( float_constant_map_pixel.float_bitmap[i]) { current_float_constant_map_pixel_[i] = float_constant_map_pixel.float_bitmap[i]; - if (float_constant_map_pixel.float_count != 0) { + if (float_constant_count_pixel) { cbuffer_binding_float_pixel_.up_to_date = false; } } @@ -3889,8 +3883,8 @@ bool D3D12CommandProcessor::UpdateBindings( sampler_parameters, provider.OffsetSamplerDescriptor( sampler_bindless_heap_cpu_start_, sampler_index)); - texture_cache_bindless_sampler_map_.insert( - {sampler_parameters.value, sampler_index}); + texture_cache_bindless_sampler_map_.emplace( + sampler_parameters.value, sampler_index); } current_sampler_bindless_indices_vertex_[j] = sampler_index; } @@ -3921,8 +3915,8 @@ bool D3D12CommandProcessor::UpdateBindings( sampler_parameters, provider.OffsetSamplerDescriptor( sampler_bindless_heap_cpu_start_, sampler_index)); - texture_cache_bindless_sampler_map_.insert( - {sampler_parameters.value, sampler_index}); + texture_cache_bindless_sampler_map_.emplace( + sampler_parameters.value, sampler_index); } current_sampler_bindless_indices_pixel_[j] = sampler_index; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 5caa6bb78..ef2aa2cc3 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -190,19 +190,17 @@ class D3D12CommandProcessor : public CommandProcessor { // render targets or copying to depth render targets. void SetSamplePositions(xenos::MsaaSamples sample_positions); - // Returns a pipeline state object with deferred creation by its handle. May - // return nullptr if failed to create the pipeline state object. - inline ID3D12PipelineState* GetD3D12PipelineStateByHandle( - void* handle) const { - return pipeline_cache_->GetD3D12PipelineStateByHandle(handle); + // Returns a pipeline with deferred creation by its handle. May return nullptr + // if failed to create the pipeline. + ID3D12PipelineState* GetD3D12PipelineByHandle(void* handle) const { + return pipeline_cache_->GetD3D12PipelineByHandle(handle); } - // Sets the current pipeline state to a compute one. This is for cache - // invalidation primarily. A submission must be open. - void SetComputePipelineState(ID3D12PipelineState* pipeline_state); + // Sets the current pipeline to a compute one. This is for cache invalidation + // primarily. A submission must be open. + void SetComputePipeline(ID3D12PipelineState* pipeline); - // For the pipeline state cache to call when binding layout UIDs may be - // reused. + // For the pipeline cache to call when binding layout UIDs may be reused. void NotifyShaderBindingsLayoutUIDsInvalidated(); // Returns the text to display in the GPU backend name in the window title. 
@@ -327,8 +325,8 @@ class D3D12CommandProcessor : public CommandProcessor { bool EndSubmission(bool is_swap); // Checks if ending a submission right now would not cause potentially more // delay than it would reduce by making the GPU start working earlier - such - // as when there are unfinished graphics pipeline state creation requests that - // would need to be fulfilled before actually submitting the command list. + // as when there are unfinished graphics pipeline creation requests that would + // need to be fulfilled before actually submitting the command list. bool CanEndSubmissionImmediately() const; bool AwaitAllQueueOperationsCompletion() { CheckSubmissionFence(submission_current_); @@ -512,7 +510,7 @@ class D3D12CommandProcessor : public CommandProcessor { return cvars::internal_tile_height; } - inline std::pair GetSwapTextureSize() const { + std::pair GetSwapTextureSize() const { if (texture_cache_->IsResolutionScale2X()) { return std::make_pair(kSwapTextureWidth() * 2, kSwapTextureHeight() * 2); } @@ -557,13 +555,12 @@ class D3D12CommandProcessor : public CommandProcessor { // Current SSAA sample positions (to be updated by the render target cache). xenos::MsaaSamples current_sample_positions_; - // Currently bound pipeline state, either a graphics pipeline state object - // from the pipeline state cache (with potentially deferred creation - - // current_external_pipeline_state_ is nullptr in this case) or a non-Xenos - // graphics or compute pipeline state object (current_cached_pipeline_state_ - // is nullptr in this case). - void* current_cached_pipeline_state_; - ID3D12PipelineState* current_external_pipeline_state_; + // Currently bound pipeline, either a graphics pipeline from the pipeline + // cache (with potentially deferred creation - current_external_pipeline_ is + // nullptr in this case) or a non-Xenos graphics or compute pipeline + // (current_cached_pipeline_ is nullptr in this case). + void* current_cached_pipeline_; + ID3D12PipelineState* current_external_pipeline_; // Currently bound graphics root signature. 
ID3D12RootSignature* current_graphics_root_signature_; diff --git a/src/xenia/gpu/d3d12/d3d12_graphics_system.cc b/src/xenia/gpu/d3d12/d3d12_graphics_system.cc index e50bbbaac..d32f223ce 100644 --- a/src/xenia/gpu/d3d12/d3d12_graphics_system.cc +++ b/src/xenia/gpu/d3d12/d3d12_graphics_system.cc @@ -157,7 +157,7 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor, stretch_pipeline_desc.SampleDesc.Count = 1; if (FAILED(device->CreateGraphicsPipelineState( &stretch_pipeline_desc, IID_PPV_ARGS(&stretch_pipeline_)))) { - XELOGE("Failed to create the front buffer stretch pipeline state"); + XELOGE("Failed to create the front buffer stretch pipeline"); stretch_gamma_root_signature_->Release(); stretch_gamma_root_signature_ = nullptr; stretch_root_signature_->Release(); @@ -170,8 +170,7 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor, if (FAILED(device->CreateGraphicsPipelineState( &stretch_pipeline_desc, IID_PPV_ARGS(&stretch_gamma_pipeline_)))) { XELOGE( - "Failed to create the gamma-correcting front buffer stretch " - "pipeline state"); + "Failed to create the gamma-correcting front buffer stretch pipeline"); stretch_pipeline_->Release(); stretch_pipeline_ = nullptr; stretch_gamma_root_signature_->Release(); diff --git a/src/xenia/gpu/d3d12/d3d12_shader.h b/src/xenia/gpu/d3d12/d3d12_shader.h index 7eb4ac6e0..c24d6a00a 100644 --- a/src/xenia/gpu/d3d12/d3d12_shader.h +++ b/src/xenia/gpu/d3d12/d3d12_shader.h @@ -85,7 +85,7 @@ class D3D12Shader : public Shader { return sampler_bindings_.data(); } - // For owning subsystems like the pipeline state cache, accessors for unique + // For owning subsystems like the pipeline cache, accessors for unique // identifiers (used instead of hashes to make sure collisions can't happen) // of binding layouts used by the shader, for invalidation if a shader with an // incompatible layout was bound. diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h index 6620cecaa..dc918bb11 100644 --- a/src/xenia/gpu/d3d12/d3d12_shared_memory.h +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h @@ -48,7 +48,7 @@ class D3D12SharedMemory : public SharedMemory { // UseForReading or UseForWriting. // Makes the buffer usable for vertices, indices and texture untiling. - inline void UseForReading() { + void UseForReading() { // Vertex fetch is also allowed in pixel shaders. CommitUAVWritesAndTransitionBuffer( D3D12_RESOURCE_STATE_INDEX_BUFFER | @@ -56,18 +56,18 @@ class D3D12SharedMemory : public SharedMemory { D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); } // Makes the buffer usable for texture tiling after a resolve. - inline void UseForWriting() { + void UseForWriting() { CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS); } // Makes the buffer usable as a source for copy commands. - inline void UseAsCopySource() { + void UseAsCopySource() { CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_SOURCE); } // Must be called when doing draws/dispatches modifying data within the shared // memory buffer as a UAV, to make sure that when UseForWriting is called the // next time, a UAV barrier will be done, and subsequent overlapping UAV // writes and reads are ordered. 
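
The `D3D12SharedMemory` accessors above all funnel into `CommitUAVWritesAndTransitionBuffer`: flush any pending UAV writes with a UAV barrier, then transition the buffer to the requested state if it is not already there. A plausible shape for that state tracking, written as an assumption rather than the actual implementation:

```cpp
#include <d3d12.h>

// Sketch only: pending-UAV-write flush followed by a state transition.
class BufferStateTracker {
 public:
  BufferStateTracker(ID3D12Resource* buffer, D3D12_RESOURCE_STATES state)
      : buffer_(buffer), state_(state) {}

  void MarkUAVWritesCommitNeeded() {
    if (state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
      uav_writes_commit_needed_ = true;
    }
  }

  void CommitUAVWritesAndTransition(ID3D12GraphicsCommandList* list,
                                    D3D12_RESOURCE_STATES new_state) {
    if (state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS &&
        uav_writes_commit_needed_) {
      // Order earlier UAV writes before any later access to the buffer.
      D3D12_RESOURCE_BARRIER barrier = {};
      barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
      barrier.UAV.pResource = buffer_;
      list->ResourceBarrier(1, &barrier);
      uav_writes_commit_needed_ = false;
    }
    if (state_ != new_state) {
      D3D12_RESOURCE_BARRIER barrier = {};
      barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
      barrier.Transition.pResource = buffer_;
      barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
      barrier.Transition.StateBefore = state_;
      barrier.Transition.StateAfter = new_state;
      list->ResourceBarrier(1, &barrier);
      state_ = new_state;
    }
  }

 private:
  ID3D12Resource* buffer_;
  D3D12_RESOURCE_STATES state_;
  bool uav_writes_commit_needed_ = false;
};
```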
- inline void MarkUAVWritesCommitNeeded() { + void MarkUAVWritesCommitNeeded() { if (buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { buffer_uav_writes_commit_needed_ = true; } diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index 2b013e8ad..eb8d8922e 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -209,9 +209,8 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, } } break; case Command::kSetPipelineStateHandle: { - current_pipeline_state = - command_processor_.GetD3D12PipelineStateByHandle( - *reinterpret_cast(stream)); + current_pipeline_state = command_processor_.GetD3D12PipelineByHandle( + *reinterpret_cast(stream)); if (current_pipeline_state) { command_list->SetPipelineState(current_pipeline_state); } diff --git a/src/xenia/gpu/d3d12/deferred_command_list.h b/src/xenia/gpu/d3d12/deferred_command_list.h index 9393798c3..e8060371c 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.h +++ b/src/xenia/gpu/d3d12/deferred_command_list.h @@ -33,7 +33,7 @@ class DeferredCommandList { void Execute(ID3D12GraphicsCommandList* command_list, ID3D12GraphicsCommandList1* command_list_1); - inline void D3DClearUnorderedAccessViewUint( + void D3DClearUnorderedAccessViewUint( D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle_in_current_heap, D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle, ID3D12Resource* resource, const UINT values[4], UINT num_rects, const D3D12_RECT* rects) { @@ -51,9 +51,9 @@ class DeferredCommandList { } } - inline void D3DCopyBufferRegion(ID3D12Resource* dst_buffer, UINT64 dst_offset, - ID3D12Resource* src_buffer, UINT64 src_offset, - UINT64 num_bytes) { + void D3DCopyBufferRegion(ID3D12Resource* dst_buffer, UINT64 dst_offset, + ID3D12Resource* src_buffer, UINT64 src_offset, + UINT64 num_bytes) { auto& args = *reinterpret_cast(WriteCommand( Command::kD3DCopyBufferRegion, sizeof(D3DCopyBufferRegionArguments))); args.dst_buffer = dst_buffer; @@ -63,26 +63,26 @@ class DeferredCommandList { args.num_bytes = num_bytes; } - inline void D3DCopyResource(ID3D12Resource* dst_resource, - ID3D12Resource* src_resource) { + void D3DCopyResource(ID3D12Resource* dst_resource, + ID3D12Resource* src_resource) { auto& args = *reinterpret_cast(WriteCommand( Command::kD3DCopyResource, sizeof(D3DCopyResourceArguments))); args.dst_resource = dst_resource; args.src_resource = src_resource; } - inline void CopyTexture(const D3D12_TEXTURE_COPY_LOCATION& dst, - const D3D12_TEXTURE_COPY_LOCATION& src) { + void CopyTexture(const D3D12_TEXTURE_COPY_LOCATION& dst, + const D3D12_TEXTURE_COPY_LOCATION& src) { auto& args = *reinterpret_cast( WriteCommand(Command::kCopyTexture, sizeof(CopyTextureArguments))); std::memcpy(&args.dst, &dst, sizeof(D3D12_TEXTURE_COPY_LOCATION)); std::memcpy(&args.src, &src, sizeof(D3D12_TEXTURE_COPY_LOCATION)); } - inline void CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION& dst, - UINT dst_x, UINT dst_y, UINT dst_z, - const D3D12_TEXTURE_COPY_LOCATION& src, - const D3D12_BOX& src_box) { + void CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION& dst, UINT dst_x, + UINT dst_y, UINT dst_z, + const D3D12_TEXTURE_COPY_LOCATION& src, + const D3D12_BOX& src_box) { auto& args = *reinterpret_cast(WriteCommand( Command::kCopyTextureRegion, sizeof(CopyTextureRegionArguments))); std::memcpy(&args.dst, &dst, sizeof(D3D12_TEXTURE_COPY_LOCATION)); @@ -93,8 +93,8 @@ class DeferredCommandList { args.src_box = src_box; } - inline void 
D3DDispatch(UINT thread_group_count_x, UINT thread_group_count_y, - UINT thread_group_count_z) { + void D3DDispatch(UINT thread_group_count_x, UINT thread_group_count_y, + UINT thread_group_count_z) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DDispatch, sizeof(D3DDispatchArguments))); args.thread_group_count_x = thread_group_count_x; @@ -102,11 +102,10 @@ class DeferredCommandList { args.thread_group_count_z = thread_group_count_z; } - inline void D3DDrawIndexedInstanced(UINT index_count_per_instance, - UINT instance_count, - UINT start_index_location, - INT base_vertex_location, - UINT start_instance_location) { + void D3DDrawIndexedInstanced(UINT index_count_per_instance, + UINT instance_count, UINT start_index_location, + INT base_vertex_location, + UINT start_instance_location) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DDrawIndexedInstanced, sizeof(D3DDrawIndexedInstancedArguments))); @@ -117,9 +116,9 @@ class DeferredCommandList { args.start_instance_location = start_instance_location; } - inline void D3DDrawInstanced(UINT vertex_count_per_instance, - UINT instance_count, UINT start_vertex_location, - UINT start_instance_location) { + void D3DDrawInstanced(UINT vertex_count_per_instance, UINT instance_count, + UINT start_vertex_location, + UINT start_instance_location) { auto& args = *reinterpret_cast(WriteCommand( Command::kD3DDrawInstanced, sizeof(D3DDrawInstancedArguments))); args.vertex_count_per_instance = vertex_count_per_instance; @@ -128,7 +127,7 @@ class DeferredCommandList { args.start_instance_location = start_instance_location; } - inline void D3DIASetIndexBuffer(const D3D12_INDEX_BUFFER_VIEW* view) { + void D3DIASetIndexBuffer(const D3D12_INDEX_BUFFER_VIEW* view) { auto& args = *reinterpret_cast(WriteCommand( Command::kD3DIASetIndexBuffer, sizeof(D3D12_INDEX_BUFFER_VIEW))); if (view != nullptr) { @@ -142,14 +141,13 @@ class DeferredCommandList { } } - inline void D3DIASetPrimitiveTopology( - D3D12_PRIMITIVE_TOPOLOGY primitive_topology) { + void D3DIASetPrimitiveTopology(D3D12_PRIMITIVE_TOPOLOGY primitive_topology) { auto& arg = *reinterpret_cast(WriteCommand( Command::kD3DIASetPrimitiveTopology, sizeof(D3D12_PRIMITIVE_TOPOLOGY))); arg = primitive_topology; } - inline void D3DOMSetBlendFactor(const FLOAT blend_factor[4]) { + void D3DOMSetBlendFactor(const FLOAT blend_factor[4]) { auto args = reinterpret_cast( WriteCommand(Command::kD3DOMSetBlendFactor, 4 * sizeof(FLOAT))); args[0] = blend_factor[0]; @@ -158,7 +156,7 @@ class DeferredCommandList { args[3] = blend_factor[3]; } - inline void D3DOMSetRenderTargets( + void D3DOMSetRenderTargets( UINT num_render_target_descriptors, const D3D12_CPU_DESCRIPTOR_HANDLE* render_target_descriptors, BOOL rts_single_handle_to_descriptor_range, @@ -185,14 +183,14 @@ class DeferredCommandList { } } - inline void D3DOMSetStencilRef(UINT stencil_ref) { + void D3DOMSetStencilRef(UINT stencil_ref) { auto& arg = *reinterpret_cast( WriteCommand(Command::kD3DOMSetStencilRef, sizeof(UINT))); arg = stencil_ref; } - inline void D3DResourceBarrier(UINT num_barriers, - const D3D12_RESOURCE_BARRIER* barriers) { + void D3DResourceBarrier(UINT num_barriers, + const D3D12_RESOURCE_BARRIER* barriers) { if (num_barriers == 0) { return; } @@ -207,21 +205,22 @@ class DeferredCommandList { num_barriers * sizeof(D3D12_RESOURCE_BARRIER)); } - inline void RSSetScissorRect(const D3D12_RECT& rect) { + void RSSetScissorRect(const D3D12_RECT& rect) { auto& arg = *reinterpret_cast( WriteCommand(Command::kRSSetScissorRect, 
sizeof(D3D12_RECT))); arg = rect; } - inline void RSSetViewport(const D3D12_VIEWPORT& viewport) { + void RSSetViewport(const D3D12_VIEWPORT& viewport) { auto& arg = *reinterpret_cast( WriteCommand(Command::kRSSetViewport, sizeof(D3D12_VIEWPORT))); arg = viewport; } - inline void D3DSetComputeRoot32BitConstants( - UINT root_parameter_index, UINT num_32bit_values_to_set, - const void* src_data, UINT dest_offset_in_32bit_values) { + void D3DSetComputeRoot32BitConstants(UINT root_parameter_index, + UINT num_32bit_values_to_set, + const void* src_data, + UINT dest_offset_in_32bit_values) { if (num_32bit_values_to_set == 0) { return; } @@ -235,9 +234,10 @@ class DeferredCommandList { std::memcpy(args + 1, src_data, num_32bit_values_to_set * sizeof(uint32_t)); } - inline void D3DSetGraphicsRoot32BitConstants( - UINT root_parameter_index, UINT num_32bit_values_to_set, - const void* src_data, UINT dest_offset_in_32bit_values) { + void D3DSetGraphicsRoot32BitConstants(UINT root_parameter_index, + UINT num_32bit_values_to_set, + const void* src_data, + UINT dest_offset_in_32bit_values) { if (num_32bit_values_to_set == 0) { return; } @@ -251,7 +251,7 @@ class DeferredCommandList { std::memcpy(args + 1, src_data, num_32bit_values_to_set * sizeof(uint32_t)); } - inline void D3DSetComputeRootConstantBufferView( + void D3DSetComputeRootConstantBufferView( UINT root_parameter_index, D3D12_GPU_VIRTUAL_ADDRESS buffer_location) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetComputeRootConstantBufferView, @@ -260,7 +260,7 @@ class DeferredCommandList { args.buffer_location = buffer_location; } - inline void D3DSetGraphicsRootConstantBufferView( + void D3DSetGraphicsRootConstantBufferView( UINT root_parameter_index, D3D12_GPU_VIRTUAL_ADDRESS buffer_location) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetGraphicsRootConstantBufferView, @@ -269,7 +269,7 @@ class DeferredCommandList { args.buffer_location = buffer_location; } - inline void D3DSetComputeRootDescriptorTable( + void D3DSetComputeRootDescriptorTable( UINT root_parameter_index, D3D12_GPU_DESCRIPTOR_HANDLE base_descriptor) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetComputeRootDescriptorTable, @@ -278,7 +278,7 @@ class DeferredCommandList { args.base_descriptor.ptr = base_descriptor.ptr; } - inline void D3DSetGraphicsRootDescriptorTable( + void D3DSetGraphicsRootDescriptorTable( UINT root_parameter_index, D3D12_GPU_DESCRIPTOR_HANDLE base_descriptor) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetGraphicsRootDescriptorTable, @@ -287,42 +287,40 @@ class DeferredCommandList { args.base_descriptor.ptr = base_descriptor.ptr; } - inline void D3DSetComputeRootSignature(ID3D12RootSignature* root_signature) { + void D3DSetComputeRootSignature(ID3D12RootSignature* root_signature) { auto& arg = *reinterpret_cast(WriteCommand( Command::kD3DSetComputeRootSignature, sizeof(ID3D12RootSignature*))); arg = root_signature; } - inline void D3DSetGraphicsRootSignature(ID3D12RootSignature* root_signature) { + void D3DSetGraphicsRootSignature(ID3D12RootSignature* root_signature) { auto& arg = *reinterpret_cast(WriteCommand( Command::kD3DSetGraphicsRootSignature, sizeof(ID3D12RootSignature*))); arg = root_signature; } - inline void SetDescriptorHeaps( - ID3D12DescriptorHeap* cbv_srv_uav_descriptor_heap, - ID3D12DescriptorHeap* sampler_descriptor_heap) { + void SetDescriptorHeaps(ID3D12DescriptorHeap* cbv_srv_uav_descriptor_heap, + ID3D12DescriptorHeap* sampler_descriptor_heap) { auto& args = 
*reinterpret_cast(WriteCommand( Command::kSetDescriptorHeaps, sizeof(SetDescriptorHeapsArguments))); args.cbv_srv_uav_descriptor_heap = cbv_srv_uav_descriptor_heap; args.sampler_descriptor_heap = sampler_descriptor_heap; } - inline void D3DSetPipelineState(ID3D12PipelineState* pipeline_state) { + void D3DSetPipelineState(ID3D12PipelineState* pipeline_state) { auto& arg = *reinterpret_cast(WriteCommand( Command::kD3DSetPipelineState, sizeof(ID3D12PipelineState*))); arg = pipeline_state; } - inline void SetPipelineStateHandle(void* pipeline_state_handle) { + void SetPipelineStateHandle(void* pipeline_state_handle) { auto& arg = *reinterpret_cast( WriteCommand(Command::kSetPipelineStateHandle, sizeof(void*))); arg = pipeline_state_handle; } - inline void D3DSetSamplePositions( - UINT num_samples_per_pixel, UINT num_pixels, - const D3D12_SAMPLE_POSITION* sample_positions) { + void D3DSetSamplePositions(UINT num_samples_per_pixel, UINT num_pixels, + const D3D12_SAMPLE_POSITION* sample_positions) { auto& args = *reinterpret_cast( WriteCommand(Command::kD3DSetSamplePositions, sizeof(D3DSetSamplePositionsArguments))); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 3a9f609d3..b2db2654e 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -43,10 +43,10 @@ DEFINE_bool( "D3D12"); DEFINE_int32( d3d12_pipeline_creation_threads, -1, - "Number of threads used for graphics pipeline state object creation. -1 to " - "calculate automatically (75% of logical CPU cores), a positive number to " - "specify the number of threads explicitly (up to the number of logical CPU " - "cores), 0 to disable multithreaded pipeline state object creation.", + "Number of threads used for graphics pipeline creation. -1 to calculate " + "automatically (75% of logical CPU cores), a positive number to specify " + "the number of threads explicitly (up to the number of logical CPU cores), " + "0 to disable multithreaded pipeline creation.", "D3D12"); DEFINE_bool(d3d12_tessellation_wireframe, false, "Display tessellated surfaces as wireframe for debugging.", @@ -125,8 +125,8 @@ bool PipelineCache::Initialize() { logical_processor_count = 6; } // Initialize creation thread synchronization data even if not using creation - // threads because they may be used anyway to create pipeline state objects - // from the storage. + // threads because they may be used anyway to create pipelines from the + // storage. creation_threads_busy_ = 0; creation_completion_event_ = xe::threading::Event::CreateManualResetEvent(true); @@ -145,7 +145,7 @@ bool PipelineCache::Initialize() { for (size_t i = 0; i < creation_thread_count; ++i) { std::unique_ptr creation_thread = xe::threading::Thread::Create({}, [this, i]() { CreationThread(i); }); - creation_thread->set_name("D3D12 Pipeline States"); + creation_thread->set_name("D3D12 Pipelines"); creation_threads_.push_back(std::move(creation_thread)); } } @@ -184,13 +184,12 @@ void PipelineCache::ClearCache(bool shutting_down) { } ShutdownShaderStorage(); - // Remove references to the current pipeline state object. - current_pipeline_state_ = nullptr; + // Remove references to the current pipeline. + current_pipeline_ = nullptr; if (!creation_threads_.empty()) { - // Empty the pipeline state object creation queue and make sure there are no - // threads currently creating pipeline state objects because pipeline states - // are going to be deleted. 
+ // Empty the pipeline creation queue and make sure there are no threads + // currently creating pipelines because pipelines are going to be deleted. bool await_creation_completion_event = false; { std::lock_guard lock(creation_request_lock_); @@ -207,13 +206,13 @@ void PipelineCache::ClearCache(bool shutting_down) { } } - // Destroy all pipeline state objects. - for (auto it : pipeline_states_) { + // Destroy all pipelines. + for (auto it : pipelines_) { it.second->state->Release(); delete it.second; } - pipeline_states_.clear(); - COUNT_profile_set("gpu/pipeline_cache/pipeline_states", 0); + pipelines_.clear(); + COUNT_profile_set("gpu/pipeline_cache/pipelines", 0); // Destroy all shaders. command_processor_.NotifyShaderBindingsLayoutUIDsInvalidated(); @@ -223,10 +222,10 @@ void PipelineCache::ClearCache(bool shutting_down) { } texture_binding_layout_map_.clear(); texture_binding_layouts_.clear(); - for (auto it : shader_map_) { + for (auto it : shaders_) { delete it.second; } - shader_map_.clear(); + shaders_.clear(); if (reinitialize_shader_storage) { InitializeShaderStorage(shader_storage_root, shader_storage_title_id, @@ -374,8 +373,7 @@ void PipelineCache::InitializeShaderStorage( } size_t ucode_byte_count = shader_header.ucode_dword_count * sizeof(uint32_t); - if (shader_map_.find(shader_header.ucode_data_hash) != - shader_map_.end()) { + if (shaders_.find(shader_header.ucode_data_hash) != shaders_.end()) { // Already added - usually shaders aren't added without the intention of // translating them imminently, so don't do additional checks to // actually ensure that translation happens right now (they would cause @@ -402,7 +400,7 @@ void PipelineCache::InitializeShaderStorage( D3D12Shader* shader = new D3D12Shader(shader_header.type, ucode_data_hash, ucode_dwords.data(), shader_header.ucode_dword_count); - shader_map_.insert({ucode_data_hash, shader}); + shaders_.emplace(ucode_data_hash, shader); // Create new threads if the currently existing threads can't keep up with // file reading, but not more than the number of logical processors minus // one. @@ -439,7 +437,7 @@ void PipelineCache::InitializeShaderStorage( } shader_translation_threads.clear(); for (D3D12Shader* shader : shaders_failed_to_translate) { - shader_map_.erase(shader->ucode_data_hash()); + shaders_.erase(shader->ucode_data_hash()); delete shader; } } @@ -460,72 +458,66 @@ void PipelineCache::InitializeShaderStorage( } // 'DXRO' or 'DXRT'. - const uint32_t pipeline_state_storage_magic_api = + const uint32_t pipeline_storage_magic_api = edram_rov_used_ ? 0x4F525844 : 0x54525844; - // Initialize the pipeline state storage stream. - uint64_t pipeline_state_storage_initialization_start_ = + // Initialize the pipeline storage stream. + uint64_t pipeline_storage_initialization_start_ = xe::Clock::QueryHostTickCount(); - auto pipeline_state_storage_file_path = + auto pipeline_storage_file_path = shader_storage_shareable_root / fmt::format("{:08X}.{}.d3d12.xpso", title_id, edram_rov_used_ ? 
"rov" : "rtv"); - pipeline_state_storage_file_ = - xe::filesystem::OpenFile(pipeline_state_storage_file_path, "a+b"); - if (!pipeline_state_storage_file_) { + pipeline_storage_file_ = + xe::filesystem::OpenFile(pipeline_storage_file_path, "a+b"); + if (!pipeline_storage_file_) { XELOGE( - "Failed to open the Direct3D 12 pipeline state description storage " - "file for writing, persistent shader storage will be disabled: {}", - xe::path_to_utf8(pipeline_state_storage_file_path)); + "Failed to open the Direct3D 12 pipeline description storage file for " + "writing, persistent shader storage will be disabled: {}", + xe::path_to_utf8(pipeline_storage_file_path)); fclose(shader_storage_file_); shader_storage_file_ = nullptr; return; } - pipeline_state_storage_file_flush_needed_ = false; + pipeline_storage_file_flush_needed_ = false; // 'XEPS'. - const uint32_t pipeline_state_storage_magic = 0x53504558; + const uint32_t pipeline_storage_magic = 0x53504558; struct { uint32_t magic; uint32_t magic_api; uint32_t version_swapped; - } pipeline_state_storage_file_header; - if (fread(&pipeline_state_storage_file_header, - sizeof(pipeline_state_storage_file_header), 1, - pipeline_state_storage_file_) && - pipeline_state_storage_file_header.magic == - pipeline_state_storage_magic && - pipeline_state_storage_file_header.magic_api == - pipeline_state_storage_magic_api && - xe::byte_swap(pipeline_state_storage_file_header.version_swapped) == + } pipeline_storage_file_header; + if (fread(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), + 1, pipeline_storage_file_) && + pipeline_storage_file_header.magic == pipeline_storage_magic && + pipeline_storage_file_header.magic_api == pipeline_storage_magic_api && + xe::byte_swap(pipeline_storage_file_header.version_swapped) == PipelineDescription::kVersion) { - uint64_t pipeline_state_storage_valid_bytes = - sizeof(pipeline_state_storage_file_header); - // Enqueue pipeline state descriptions written by previous Xenia executions - // until the end of the file or until a corrupted one is detected. - xe::filesystem::Seek(pipeline_state_storage_file_, 0, SEEK_END); - int64_t pipeline_state_storage_told_end = - xe::filesystem::Tell(pipeline_state_storage_file_); - size_t pipeline_state_storage_told_count = - size_t(pipeline_state_storage_told_end >= - int64_t(pipeline_state_storage_valid_bytes) - ? (uint64_t(pipeline_state_storage_told_end) - - pipeline_state_storage_valid_bytes) / - sizeof(PipelineStoredDescription) - : 0); - if (pipeline_state_storage_told_count && - xe::filesystem::Seek(pipeline_state_storage_file_, - int64_t(pipeline_state_storage_valid_bytes), - SEEK_SET)) { + uint64_t pipeline_storage_valid_bytes = + sizeof(pipeline_storage_file_header); + // Enqueue pipeline descriptions written by previous Xenia executions until + // the end of the file or until a corrupted one is detected. + xe::filesystem::Seek(pipeline_storage_file_, 0, SEEK_END); + int64_t pipeline_storage_told_end = + xe::filesystem::Tell(pipeline_storage_file_); + size_t pipeline_storage_told_count = size_t( + pipeline_storage_told_end >= int64_t(pipeline_storage_valid_bytes) + ? 
(uint64_t(pipeline_storage_told_end) - + pipeline_storage_valid_bytes) / + sizeof(PipelineStoredDescription) + : 0); + if (pipeline_storage_told_count && + xe::filesystem::Seek(pipeline_storage_file_, + int64_t(pipeline_storage_valid_bytes), SEEK_SET)) { std::vector pipeline_stored_descriptions; - pipeline_stored_descriptions.resize(pipeline_state_storage_told_count); - pipeline_stored_descriptions.resize(fread( - pipeline_stored_descriptions.data(), - sizeof(PipelineStoredDescription), pipeline_state_storage_told_count, - pipeline_state_storage_file_)); + pipeline_stored_descriptions.resize(pipeline_storage_told_count); + pipeline_stored_descriptions.resize( + fread(pipeline_stored_descriptions.data(), + sizeof(PipelineStoredDescription), pipeline_storage_told_count, + pipeline_storage_file_)); if (!pipeline_stored_descriptions.empty()) { // Launch additional creation threads to use all cores to create - // pipeline state objects faster. Will also be using the main thread, so - // minus 1. + // pipelines faster. Will also be using the main thread, so minus 1. size_t creation_thread_original_count = creation_threads_.size(); size_t creation_thread_needed_count = std::max(std::min(pipeline_stored_descriptions.size(), @@ -539,10 +531,10 @@ void PipelineCache::InitializeShaderStorage( {}, [this, creation_thread_index]() { CreationThread(creation_thread_index); }); - creation_thread->set_name("D3D12 Pipeline States Additional"); + creation_thread->set_name("D3D12 Pipelines"); creation_threads_.push_back(std::move(creation_thread)); } - size_t pipeline_states_created = 0; + size_t pipelines_created = 0; for (const PipelineStoredDescription& pipeline_stored_description : pipeline_stored_descriptions) { const PipelineDescription& pipeline_description = @@ -554,30 +546,28 @@ void PipelineCache::InitializeShaderStorage( 0) != pipeline_stored_description.description_hash) { break; } - pipeline_state_storage_valid_bytes += - sizeof(PipelineStoredDescription); - // Skip already known pipeline states - those have already been - // enqueued. - auto found_range = pipeline_states_.equal_range( + pipeline_storage_valid_bytes += sizeof(PipelineStoredDescription); + // Skip already known pipelines - those have already been enqueued. 
+ auto found_range = pipelines_.equal_range( pipeline_stored_description.description_hash); - bool pipeline_state_found = false; + bool pipeline_found = false; for (auto it = found_range.first; it != found_range.second; ++it) { - PipelineState* found_pipeline_state = it->second; - if (!std::memcmp(&found_pipeline_state->description.description, + Pipeline* found_pipeline = it->second; + if (!std::memcmp(&found_pipeline->description.description, &pipeline_description, sizeof(pipeline_description))) { - pipeline_state_found = true; + pipeline_found = true; break; } } - if (pipeline_state_found) { + if (pipeline_found) { continue; } PipelineRuntimeDescription pipeline_runtime_description; auto vertex_shader_it = - shader_map_.find(pipeline_description.vertex_shader_hash); - if (vertex_shader_it == shader_map_.end()) { + shaders_.find(pipeline_description.vertex_shader_hash); + if (vertex_shader_it == shaders_.end()) { continue; } pipeline_runtime_description.vertex_shader = vertex_shader_it->second; @@ -586,8 +576,8 @@ void PipelineCache::InitializeShaderStorage( } if (pipeline_description.pixel_shader_hash) { auto pixel_shader_it = - shader_map_.find(pipeline_description.pixel_shader_hash); - if (pixel_shader_it == shader_map_.end()) { + shaders_.find(pipeline_description.pixel_shader_hash); + if (pixel_shader_it == shaders_.end()) { continue; } pipeline_runtime_description.pixel_shader = pixel_shader_it->second; @@ -607,36 +597,33 @@ void PipelineCache::InitializeShaderStorage( std::memcpy(&pipeline_runtime_description.description, &pipeline_description, sizeof(pipeline_description)); - PipelineState* new_pipeline_state = new PipelineState; - new_pipeline_state->state = nullptr; - std::memcpy(&new_pipeline_state->description, - &pipeline_runtime_description, + Pipeline* new_pipeline = new Pipeline; + new_pipeline->state = nullptr; + std::memcpy(&new_pipeline->description, &pipeline_runtime_description, sizeof(pipeline_runtime_description)); - pipeline_states_.insert( - std::make_pair(pipeline_stored_description.description_hash, - new_pipeline_state)); - COUNT_profile_set("gpu/pipeline_cache/pipeline_states", - pipeline_states_.size()); + pipelines_.emplace(pipeline_stored_description.description_hash, + new_pipeline); + COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); if (!creation_threads_.empty()) { // Submit the pipeline for creation to any available thread. { std::lock_guard lock(creation_request_lock_); - creation_queue_.push_back(new_pipeline_state); + creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); } else { - new_pipeline_state->state = - CreateD3D12PipelineState(pipeline_runtime_description); + new_pipeline->state = + CreateD3D12Pipeline(pipeline_runtime_description); } - ++pipeline_states_created; + ++pipelines_created; } - CreateQueuedPipelineStatesOnProcessorThread(); + CreateQueuedPipelinesOnProcessorThread(); if (creation_threads_.size() > creation_thread_original_count) { { std::lock_guard lock(creation_request_lock_); creation_threads_shutdown_from_ = creation_thread_original_count; // Assuming the queue is empty because of - // CreateQueuedPipelineStatesOnProcessorThread. + // CreateQueuedPipelinesOnProcessorThread. 
} creation_request_cond_.notify_all(); while (creation_threads_.size() > creation_thread_original_count) { @@ -664,26 +651,23 @@ void PipelineCache::InitializeShaderStorage( } } XELOGGPU( - "Created {} graphics pipeline state objects from the storage in {} " - "milliseconds", - pipeline_states_created, + "Created {} graphics pipelines from the storage in {} milliseconds", + pipelines_created, (xe::Clock::QueryHostTickCount() - - pipeline_state_storage_initialization_start_) * + pipeline_storage_initialization_start_) * 1000 / xe::Clock::QueryHostTickFrequency()); } } - xe::filesystem::TruncateStdioFile(pipeline_state_storage_file_, - pipeline_state_storage_valid_bytes); + xe::filesystem::TruncateStdioFile(pipeline_storage_file_, + pipeline_storage_valid_bytes); } else { - xe::filesystem::TruncateStdioFile(pipeline_state_storage_file_, 0); - pipeline_state_storage_file_header.magic = pipeline_state_storage_magic; - pipeline_state_storage_file_header.magic_api = - pipeline_state_storage_magic_api; - pipeline_state_storage_file_header.version_swapped = + xe::filesystem::TruncateStdioFile(pipeline_storage_file_, 0); + pipeline_storage_file_header.magic = pipeline_storage_magic; + pipeline_storage_file_header.magic_api = pipeline_storage_magic_api; + pipeline_storage_file_header.version_swapped = xe::byte_swap(PipelineDescription::kVersion); - fwrite(&pipeline_state_storage_file_header, - sizeof(pipeline_state_storage_file_header), 1, - pipeline_state_storage_file_); + fwrite(&pipeline_storage_file_header, sizeof(pipeline_storage_file_header), + 1, pipeline_storage_file_); } shader_storage_root_ = storage_root; @@ -691,7 +675,7 @@ void PipelineCache::InitializeShaderStorage( // Start the storage writing thread. storage_write_flush_shaders_ = false; - storage_write_flush_pipeline_states_ = false; + storage_write_flush_pipelines_ = false; storage_write_thread_shutdown_ = false; storage_write_thread_ = xe::threading::Thread::Create({}, [this]() { StorageWriteThread(); }); @@ -708,12 +692,12 @@ void PipelineCache::ShutdownShaderStorage() { storage_write_thread_.reset(); } storage_write_shader_queue_.clear(); - storage_write_pipeline_state_queue_.clear(); + storage_write_pipeline_queue_.clear(); - if (pipeline_state_storage_file_) { - fclose(pipeline_state_storage_file_); - pipeline_state_storage_file_ = nullptr; - pipeline_state_storage_file_flush_needed_ = false; + if (pipeline_storage_file_) { + fclose(pipeline_storage_file_); + pipeline_storage_file_ = nullptr; + pipeline_storage_file_flush_needed_ = false; } if (shader_storage_file_) { @@ -728,30 +712,29 @@ void PipelineCache::ShutdownShaderStorage() { void PipelineCache::EndSubmission() { if (shader_storage_file_flush_needed_ || - pipeline_state_storage_file_flush_needed_) { + pipeline_storage_file_flush_needed_) { { std::lock_guard lock(storage_write_request_lock_); if (shader_storage_file_flush_needed_) { storage_write_flush_shaders_ = true; } - if (pipeline_state_storage_file_flush_needed_) { - storage_write_flush_pipeline_states_ = true; + if (pipeline_storage_file_flush_needed_) { + storage_write_flush_pipelines_ = true; } } storage_write_request_cond_.notify_one(); shader_storage_file_flush_needed_ = false; - pipeline_state_storage_file_flush_needed_ = false; + pipeline_storage_file_flush_needed_ = false; } if (!creation_threads_.empty()) { - CreateQueuedPipelineStatesOnProcessorThread(); - // Await creation of all queued pipeline state objects. + CreateQueuedPipelinesOnProcessorThread(); + // Await creation of all queued pipelines. 
bool await_creation_completion_event; { std::lock_guard lock(creation_request_lock_); // Assuming the creation queue is already empty (because the processor - // thread also worked on creating the leftover pipeline state objects), so - // only check if there are threads with pipeline state objects currently - // being created. + // thread also worked on creating the leftover pipelines), so only check + // if there are threads with pipelines currently being created. await_creation_completion_event = creation_threads_busy_ != 0; if (await_creation_completion_event) { creation_completion_event_->Reset(); @@ -765,7 +748,7 @@ void PipelineCache::EndSubmission() { } } -bool PipelineCache::IsCreatingPipelineStates() { +bool PipelineCache::IsCreatingPipelines() { if (creation_threads_.empty()) { return false; } @@ -779,8 +762,8 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, uint32_t dword_count) { // Hash the input memory and lookup the shader. uint64_t data_hash = XXH64(host_address, dword_count * sizeof(uint32_t), 0); - auto it = shader_map_.find(data_hash); - if (it != shader_map_.end()) { + auto it = shaders_.find(data_hash); + if (it != shaders_.end()) { // Shader has been previously loaded. return it->second; } @@ -790,7 +773,7 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, // again. D3D12Shader* shader = new D3D12Shader(shader_type, data_hash, host_address, dword_count); - shader_map_.insert({data_hash, shader}); + shaders_.emplace(data_hash, shader); return shader; } @@ -798,11 +781,11 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, Shader::HostVertexShaderType PipelineCache::GetHostVertexShaderTypeIfValid() const { // If the values this functions returns are changed, INVALIDATE THE SHADER - // STORAGE (increase kVersion for BOTH shaders and pipeline states)! The - // exception is when the function originally returned "unsupported", but - // started to return a valid value (in this case the shader wouldn't be cached - // in the first place). Otherwise games will not be able to locate shaders for - // draws for which the host vertex shader type has changed! + // STORAGE (increase kVersion for BOTH shaders and pipelines)! The exception + // is when the function originally returned "unsupported", but started to + // return a valid value (in this case the shader wouldn't be cached in the + // first place). Otherwise games will not be able to locate shaders for draws + // for which the host vertex shader type has changed! 
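The storage files touched above are guarded by a small fixed header (the 'XEPS' magic, an API-specific magic for the ROV/RTV paths, and a byte-swapped description version); when any field mismatches, the file is truncated and a fresh header is written, which is also why the comment above insists on bumping kVersion whenever serialized state changes meaning. Below is a minimal sketch of that validate-or-reset pattern; `StorageHeader` and `ValidateOrResetHeader` are hypothetical names, and plain C stdio stands in for the xe::filesystem helpers.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical header layout mirroring the magic/version check in the patch.
struct StorageHeader {
  uint32_t magic;      // e.g. 'XEPS'
  uint32_t magic_api;  // e.g. 'DXRO' (ROV path) or 'DXRT' (RTV path)
  uint32_t version;    // bumped whenever the serialized description changes
};

// Returns true if the stream already starts with a compatible header;
// otherwise rewinds and writes a fresh one so stale entries are discarded.
bool ValidateOrResetHeader(FILE* file, uint32_t expected_magic,
                           uint32_t expected_magic_api,
                           uint32_t expected_version) {
  StorageHeader header;
  if (std::fread(&header, sizeof(header), 1, file) == 1 &&
      header.magic == expected_magic &&
      header.magic_api == expected_magic_api &&
      header.version == expected_version) {
    return true;  // Entries after the header may be loaded.
  }
  // Incompatible or empty file: start over. (The real code truncates the
  // file; plain C I/O has no portable truncate, so this sketch just rewrites
  // from the beginning.)
  std::rewind(file);
  header = {expected_magic, expected_magic_api, expected_version};
  std::fwrite(&header, sizeof(header), 1, file);
  return false;
}
```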
const auto& regs = register_file_; auto vgt_draw_initiator = regs.Get(); if (!xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode, @@ -929,13 +912,12 @@ bool PipelineCache::ConfigurePipeline( xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], - void** pipeline_state_handle_out, - ID3D12RootSignature** root_signature_out) { + void** pipeline_handle_out, ID3D12RootSignature** root_signature_out) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES - assert_not_null(pipeline_state_handle_out); + assert_not_null(pipeline_handle_out); assert_not_null(root_signature_out); PipelineRuntimeDescription runtime_description; @@ -946,24 +928,24 @@ bool PipelineCache::ConfigurePipeline( } PipelineDescription& description = runtime_description.description; - if (current_pipeline_state_ != nullptr && - !std::memcmp(¤t_pipeline_state_->description.description, - &description, sizeof(description))) { - *pipeline_state_handle_out = current_pipeline_state_; + if (current_pipeline_ != nullptr && + !std::memcmp(¤t_pipeline_->description.description, &description, + sizeof(description))) { + *pipeline_handle_out = current_pipeline_; *root_signature_out = runtime_description.root_signature; return true; } - // Find an existing pipeline state object in the cache. + // Find an existing pipeline in the cache. uint64_t hash = XXH64(&description, sizeof(description), 0); - auto found_range = pipeline_states_.equal_range(hash); + auto found_range = pipelines_.equal_range(hash); for (auto it = found_range.first; it != found_range.second; ++it) { - PipelineState* found_pipeline_state = it->second; - if (!std::memcmp(&found_pipeline_state->description.description, - &description, sizeof(description))) { - current_pipeline_state_ = found_pipeline_state; - *pipeline_state_handle_out = found_pipeline_state; - *root_signature_out = found_pipeline_state->description.root_signature; + Pipeline* found_pipeline = it->second; + if (!std::memcmp(&found_pipeline->description.description, &description, + sizeof(description))) { + current_pipeline_ = found_pipeline; + *pipeline_handle_out = found_pipeline; + *root_signature_out = found_pipeline->description.root_signature; return true; } } @@ -974,33 +956,32 @@ bool PipelineCache::ConfigurePipeline( return false; } - PipelineState* new_pipeline_state = new PipelineState; - new_pipeline_state->state = nullptr; - std::memcpy(&new_pipeline_state->description, &runtime_description, + Pipeline* new_pipeline = new Pipeline; + new_pipeline->state = nullptr; + std::memcpy(&new_pipeline->description, &runtime_description, sizeof(runtime_description)); - pipeline_states_.insert(std::make_pair(hash, new_pipeline_state)); - COUNT_profile_set("gpu/pipeline_cache/pipeline_states", - pipeline_states_.size()); + pipelines_.emplace(hash, new_pipeline); + COUNT_profile_set("gpu/pipeline_cache/pipelines", pipelines_.size()); if (!creation_threads_.empty()) { - // Submit the pipeline state object for creation to any available thread. + // Submit the pipeline for creation to any available thread. 
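ConfigurePipeline above keys pipelines by an XXH64 hash of the description but still byte-compares the full description on every multimap hit, so a hash collision can never return the wrong pipeline. A minimal sketch of that lookup-then-verify pattern, with invented `Desc`/`Entry` types standing in for the real description and pipeline structs:

```cpp
#include <cstdint>
#include <cstring>
#include <unordered_map>

// Hypothetical stand-ins for the real description/pipeline types.
struct Desc { uint32_t words[8]; };            // trivially copyable state
struct Entry { Desc desc; void* pipeline; };   // plus the created pipeline

// Find an entry whose full description matches, not just its hash.
Entry* Find(std::unordered_multimap<uint64_t, Entry*>& cache, uint64_t hash,
            const Desc& desc) {
  auto range = cache.equal_range(hash);
  for (auto it = range.first; it != range.second; ++it) {
    // The hash is only a bucket key; confirm with a byte-wise comparison.
    if (!std::memcmp(&it->second->desc, &desc, sizeof(desc))) {
      return it->second;
    }
  }
  return nullptr;  // Caller creates a new entry and emplaces it under `hash`.
}
```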
{ std::lock_guard lock(creation_request_lock_); - creation_queue_.push_back(new_pipeline_state); + creation_queue_.push_back(new_pipeline); } creation_request_cond_.notify_one(); } else { - new_pipeline_state->state = CreateD3D12PipelineState(runtime_description); + new_pipeline->state = CreateD3D12Pipeline(runtime_description); } - if (pipeline_state_storage_file_) { + if (pipeline_storage_file_) { assert_not_null(storage_write_thread_); - pipeline_state_storage_file_flush_needed_ = true; + pipeline_storage_file_flush_needed_ = true; { std::lock_guard lock(storage_write_request_lock_); - storage_write_pipeline_state_queue_.emplace_back(); + storage_write_pipeline_queue_.emplace_back(); PipelineStoredDescription& stored_description = - storage_write_pipeline_state_queue_.back(); + storage_write_pipeline_queue_.back(); stored_description.description_hash = hash; std::memcpy(&stored_description.description, &description, sizeof(description)); @@ -1008,8 +989,8 @@ bool PipelineCache::ConfigurePipeline( storage_write_request_cond_.notify_all(); } - current_pipeline_state_ = new_pipeline_state; - *pipeline_state_handle_out = new_pipeline_state; + current_pipeline_ = new_pipeline; + *pipeline_handle_out = new_pipeline; *root_signature_out = runtime_description.root_signature; return true; } @@ -1136,8 +1117,8 @@ bool PipelineCache::TranslateShader( std::memcpy( texture_binding_layouts_.data() + new_uid.vector_span_offset, texture_bindings, texture_binding_layout_bytes); - texture_binding_layout_map_.insert( - {texture_binding_layout_hash, new_uid}); + texture_binding_layout_map_.emplace(texture_binding_layout_hash, + new_uid); } } if (bindless_sampler_count) { @@ -1179,8 +1160,8 @@ bool PipelineCache::TranslateShader( vector_bindless_sampler_layout[i] = sampler_bindings[i].bindless_descriptor_index; } - bindless_sampler_layout_map_.insert( - {bindless_sampler_layout_hash, new_uid}); + bindless_sampler_layout_map_.emplace(bindless_sampler_layout_hash, + new_uid); } } } @@ -1508,8 +1489,7 @@ bool PipelineCache::GetCurrentStateDescription( /* 16 */ PipelineBlendFactor::kSrcAlphaSat, }; // Like kBlendFactorMap, but with color modes changed to alpha. Some - // pipeline state objects aren't created in Prey because a color mode is - // used for alpha. + // pipelines aren't created in Prey because a color mode is used for alpha. 
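The kBlendFactorAlphaMap mentioned in the comment above exists because the guest can select a color blend factor (such as source color) in an alpha slot, where Direct3D 12 only accepts alpha factors; remapping instead of rejecting keeps such pipelines creatable (the Prey case). A hedged sketch of that kind of remap, using the D3D12 enum directly rather than the internal PipelineBlendFactor encoding used by the actual table:

```cpp
#include <d3d12.h>

// Map color-flavored blend factors to their alpha equivalents so they are
// legal in D3D12_RENDER_TARGET_BLEND_DESC::SrcBlendAlpha/DestBlendAlpha.
// Simplified illustration; the real table is indexed by the guest encoding.
D3D12_BLEND ToAlphaBlendFactor(D3D12_BLEND factor) {
  switch (factor) {
    case D3D12_BLEND_SRC_COLOR:      return D3D12_BLEND_SRC_ALPHA;
    case D3D12_BLEND_INV_SRC_COLOR:  return D3D12_BLEND_INV_SRC_ALPHA;
    case D3D12_BLEND_DEST_COLOR:     return D3D12_BLEND_DEST_ALPHA;
    case D3D12_BLEND_INV_DEST_COLOR: return D3D12_BLEND_INV_DEST_ALPHA;
    default:                         return factor;  // Already alpha-safe.
  }
}
```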
static const PipelineBlendFactor kBlendFactorAlphaMap[32] = { /* 0 */ PipelineBlendFactor::kZero, /* 1 */ PipelineBlendFactor::kOne, @@ -1569,18 +1549,16 @@ bool PipelineCache::GetCurrentStateDescription( return true; } -ID3D12PipelineState* PipelineCache::CreateD3D12PipelineState( +ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( const PipelineRuntimeDescription& runtime_description) { const PipelineDescription& description = runtime_description.description; if (runtime_description.pixel_shader != nullptr) { - XELOGGPU( - "Creating graphics pipeline state with VS {:016X}" - ", PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + XELOGGPU("Creating graphics pipeline with VS {:016X}, PS {:016X}", + runtime_description.vertex_shader->ucode_data_hash(), + runtime_description.pixel_shader->ucode_data_hash()); } else { - XELOGGPU("Creating graphics pipeline state with VS {:016X}", + XELOGGPU("Creating graphics pipeline with VS {:016X}", runtime_description.vertex_shader->ucode_data_hash()); } @@ -1893,20 +1871,18 @@ ID3D12PipelineState* PipelineCache::CreateD3D12PipelineState( } } - // Create the pipeline state object. + // Create the D3D12 pipeline state object. auto device = command_processor_.GetD3D12Context().GetD3D12Provider().GetDevice(); ID3D12PipelineState* state; if (FAILED(device->CreateGraphicsPipelineState(&state_desc, IID_PPV_ARGS(&state)))) { if (runtime_description.pixel_shader != nullptr) { - XELOGE( - "Failed to create graphics pipeline state with VS {:016X}" - ", PS {:016X}", - runtime_description.vertex_shader->ucode_data_hash(), - runtime_description.pixel_shader->ucode_data_hash()); + XELOGE("Failed to create graphics pipeline with VS {:016X}, PS {:016X}", + runtime_description.vertex_shader->ucode_data_hash(), + runtime_description.pixel_shader->ucode_data_hash()); } else { - XELOGE("Failed to create graphics pipeline state with VS {:016X}", + XELOGE("Failed to create graphics pipeline with VS {:016X}", runtime_description.vertex_shader->ucode_data_hash()); } return nullptr; @@ -1933,7 +1909,7 @@ void PipelineCache::StorageWriteThread() { ucode_guest_endian.reserve(0xFFFF); bool flush_shaders = false; - bool flush_pipeline_states = false; + bool flush_pipelines = false; while (true) { if (flush_shaders) { @@ -1941,15 +1917,15 @@ void PipelineCache::StorageWriteThread() { assert_not_null(shader_storage_file_); fflush(shader_storage_file_); } - if (flush_pipeline_states) { - flush_pipeline_states = false; - assert_not_null(pipeline_state_storage_file_); - fflush(pipeline_state_storage_file_); + if (flush_pipelines) { + flush_pipelines = false; + assert_not_null(pipeline_storage_file_); + fflush(pipeline_storage_file_); } std::pair shader_pair = {}; PipelineStoredDescription pipeline_description; - bool write_pipeline_state = false; + bool write_pipeline = false; { std::unique_lock lock(storage_write_request_lock_); if (storage_write_thread_shutdown_) { @@ -1962,17 +1938,17 @@ void PipelineCache::StorageWriteThread() { storage_write_flush_shaders_ = false; flush_shaders = true; } - if (!storage_write_pipeline_state_queue_.empty()) { + if (!storage_write_pipeline_queue_.empty()) { std::memcpy(&pipeline_description, - &storage_write_pipeline_state_queue_.front(), + &storage_write_pipeline_queue_.front(), sizeof(pipeline_description)); - storage_write_pipeline_state_queue_.pop_front(); - write_pipeline_state = true; - } else if (storage_write_flush_pipeline_states_) { - 
storage_write_flush_pipeline_states_ = false; - flush_pipeline_states = true; + storage_write_pipeline_queue_.pop_front(); + write_pipeline = true; + } else if (storage_write_flush_pipelines_) { + storage_write_flush_pipelines_ = false; + flush_pipelines = true; } - if (!shader_pair.first && !write_pipeline_state) { + if (!shader_pair.first && !write_pipeline) { storage_write_request_cond_.wait(lock); continue; } @@ -1999,27 +1975,26 @@ void PipelineCache::StorageWriteThread() { } } - if (write_pipeline_state) { - assert_not_null(pipeline_state_storage_file_); + if (write_pipeline) { + assert_not_null(pipeline_storage_file_); fwrite(&pipeline_description, sizeof(pipeline_description), 1, - pipeline_state_storage_file_); + pipeline_storage_file_); } } } void PipelineCache::CreationThread(size_t thread_index) { while (true) { - PipelineState* pipeline_state_to_create = nullptr; + Pipeline* pipeline_to_create = nullptr; // Check if need to shut down or set the completion event and dequeue the - // pipeline state if there is any. + // pipeline if there is any. { std::unique_lock lock(creation_request_lock_); if (thread_index >= creation_threads_shutdown_from_ || creation_queue_.empty()) { if (creation_completion_set_event_ && creation_threads_busy_ == 0) { - // Last pipeline state object in the queue created - signal the event - // if requested. + // Last pipeline in the queue created - signal the event if requested. creation_completion_set_event_ = false; creation_completion_event_->Set(); } @@ -2029,23 +2004,22 @@ void PipelineCache::CreationThread(size_t thread_index) { creation_request_cond_.wait(lock); continue; } - // Take the pipeline state from the queue and increment the busy thread - // count until the pipeline state object is created - other threads must - // be able to dequeue requests, but can't set the completion event until - // the pipeline state objects are fully created (rather than just started - // creating). - pipeline_state_to_create = creation_queue_.front(); + // Take the pipeline from the queue and increment the busy thread count + // until the pipeline is created - other threads must be able to dequeue + // requests, but can't set the completion event until the pipelines are + // fully created (rather than just started creating). + pipeline_to_create = creation_queue_.front(); creation_queue_.pop_front(); ++creation_threads_busy_; } // Create the D3D12 pipeline state object. - pipeline_state_to_create->state = - CreateD3D12PipelineState(pipeline_state_to_create->description); + pipeline_to_create->state = + CreateD3D12Pipeline(pipeline_to_create->description); - // Pipeline state object created - the thread is not busy anymore, safe to - // set the completion event if needed (at the next iteration, or in some - // other thread). + // Pipeline created - the thread is not busy anymore, safe to set the + // completion event if needed (at the next iteration, or in some other + // thread). 
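The creation threads above share one queue, one busy counter and a manual-reset completion event: a thread dequeues under the lock, bumps the busy count, creates the pipeline outside the lock, then decrements and only signals completion when both the queue and the busy count are empty. Below is a compact, self-contained version of that pattern using standard threading primitives instead of xe::threading; all names are illustrative.

```cpp
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>

struct Job { /* description to create a pipeline from */ };

class CreationQueue {
 public:
  void Push(Job* job) {
    { std::lock_guard<std::mutex> lock(mutex_); queue_.push_back(job); }
    cond_.notify_one();
  }

  void Shutdown() {
    { std::lock_guard<std::mutex> lock(mutex_); shutdown_ = true; }
    cond_.notify_all();
  }

  // Worker loop body: returns when shutdown is requested.
  void Worker() {
    while (true) {
      Job* job = nullptr;
      {
        std::unique_lock<std::mutex> lock(mutex_);
        cond_.wait(lock, [&] { return shutdown_ || !queue_.empty(); });
        if (shutdown_) return;
        job = queue_.front();
        queue_.pop_front();
        ++busy_;  // Completion can't be reported while this job is in flight.
      }
      Create(job);  // Potentially slow; runs outside the lock.
      {
        std::lock_guard<std::mutex> lock(mutex_);
        if (--busy_ == 0 && queue_.empty()) {
          done_.notify_all();  // Stands in for the manual-reset event.
        }
      }
    }
  }

  // Block until every queued job has fully finished creating.
  void AwaitAll() {
    std::unique_lock<std::mutex> lock(mutex_);
    done_.wait(lock, [&] { return busy_ == 0 && queue_.empty(); });
  }

 private:
  static void Create(Job*) { /* build the D3D12 pipeline state here */ }
  std::mutex mutex_;
  std::condition_variable cond_, done_;
  std::deque<Job*> queue_;
  std::size_t busy_ = 0;
  bool shutdown_ = false;
};
```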
{ std::lock_guard lock(creation_request_lock_); --creation_threads_busy_; @@ -2053,20 +2027,20 @@ void PipelineCache::CreationThread(size_t thread_index) { } } -void PipelineCache::CreateQueuedPipelineStatesOnProcessorThread() { +void PipelineCache::CreateQueuedPipelinesOnProcessorThread() { assert_false(creation_threads_.empty()); while (true) { - PipelineState* pipeline_state_to_create; + Pipeline* pipeline_to_create; { std::lock_guard lock(creation_request_lock_); if (creation_queue_.empty()) { break; } - pipeline_state_to_create = creation_queue_.front(); + pipeline_to_create = creation_queue_.front(); creation_queue_.pop_front(); } - pipeline_state_to_create->state = - CreateD3D12PipelineState(pipeline_state_to_create->description); + pipeline_to_create->state = + CreateD3D12Pipeline(pipeline_to_create->description); } } diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index cdc6ed5f3..8159416d0 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -29,6 +29,7 @@ #include "xenia/gpu/dxbc_shader_translator.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/xenos.h" +#include "xenia/ui/d3d12/d3d12_api.h" namespace xe { namespace gpu { @@ -54,7 +55,7 @@ class PipelineCache { void ShutdownShaderStorage(); void EndSubmission(); - bool IsCreatingPipelineStates(); + bool IsCreatingPipelines(); D3D12Shader* LoadShader(xenos::ShaderType shader_type, uint32_t guest_address, const uint32_t* host_address, uint32_t dword_count); @@ -73,14 +74,12 @@ class PipelineCache { xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format, bool early_z, const RenderTargetCache::PipelineRenderTarget render_targets[5], - void** pipeline_state_handle_out, - ID3D12RootSignature** root_signature_out); + void** pipeline_handle_out, ID3D12RootSignature** root_signature_out); - // Returns a pipeline state object with deferred creation by its handle. May - // return nullptr if failed to create the pipeline state object. - inline ID3D12PipelineState* GetD3D12PipelineStateByHandle( - void* handle) const { - return reinterpret_cast(handle)->state; + // Returns a pipeline with deferred creation by its handle. May return nullptr + // if failed to create the pipeline. + ID3D12PipelineState* GetD3D12PipelineByHandle(void* handle) const { + return reinterpret_cast(handle)->state; } private: @@ -237,7 +236,7 @@ class PipelineCache { const RenderTargetCache::PipelineRenderTarget render_targets[5], PipelineRuntimeDescription& runtime_description_out); - ID3D12PipelineState* CreateD3D12PipelineState( + ID3D12PipelineState* CreateD3D12Pipeline( const PipelineRuntimeDescription& runtime_description); D3D12CommandProcessor& command_processor_; @@ -255,9 +254,9 @@ class PipelineCache { IDxcUtils* dxc_utils_ = nullptr; IDxcCompiler* dxc_compiler_ = nullptr; - // All loaded shaders mapped by their guest hash key. + // Ucode hash -> shader. std::unordered_map> - shader_map_; + shaders_; struct LayoutUID { size_t uid; @@ -285,21 +284,20 @@ class PipelineCache { // Xenos pixel shader provided. std::vector depth_only_pixel_shader_; - struct PipelineState { + struct Pipeline { // nullptr if creation has failed. ID3D12PipelineState* state; PipelineRuntimeDescription description; }; - // All previously generated pipeline state objects identified by hash and the - // description. - std::unordered_multimap> - pipeline_states_; + pipelines_; - // Previously used pipeline state object. 
This matches our current state - // settings and allows us to quickly(ish) reuse the pipeline state if no - // registers have changed. - PipelineState* current_pipeline_state_ = nullptr; + // Previously used pipeline. This matches our current state settings and + // allows us to quickly(ish) reuse the pipeline if no registers have been + // changed. + Pipeline* current_pipeline_ = nullptr; // Currently open shader storage path. std::filesystem::path shader_storage_root_; @@ -309,10 +307,9 @@ class PipelineCache { FILE* shader_storage_file_ = nullptr; bool shader_storage_file_flush_needed_ = false; - // Pipeline state storage output stream, for preload in the next emulator - // runs. - FILE* pipeline_state_storage_file_ = nullptr; - bool pipeline_state_storage_file_flush_needed_ = false; + // Pipeline storage output stream, for preload in the next emulator runs. + FILE* pipeline_storage_file_ = nullptr; + bool pipeline_storage_file_flush_needed_ = false; // Thread for asynchronous writing to the storage streams. void StorageWriteThread(); @@ -322,28 +319,27 @@ class PipelineCache { // thread is notified about its change via storage_write_request_cond_. std::deque> storage_write_shader_queue_; - std::deque storage_write_pipeline_state_queue_; + std::deque storage_write_pipeline_queue_; bool storage_write_flush_shaders_ = false; - bool storage_write_flush_pipeline_states_ = false; + bool storage_write_flush_pipelines_ = false; bool storage_write_thread_shutdown_ = false; std::unique_ptr storage_write_thread_; - // Pipeline state object creation threads. + // Pipeline creation threads. void CreationThread(size_t thread_index); - void CreateQueuedPipelineStatesOnProcessorThread(); + void CreateQueuedPipelinesOnProcessorThread(); std::mutex creation_request_lock_; std::condition_variable creation_request_cond_; // Protected with creation_request_lock_, notify_one creation_request_cond_ // when set. - std::deque creation_queue_; - // Number of threads that are currently creating a pipeline state object - - // incremented when a pipeline state object is dequeued (the completion event - // can't be triggered before this is zero). Protected with - // creation_request_lock_. + std::deque creation_queue_; + // Number of threads that are currently creating a pipeline - incremented when + // a pipeline is dequeued (the completion event can't be triggered before this + // is zero). Protected with creation_request_lock_. size_t creation_threads_busy_ = 0; - // Manual-reset event set when the last queued pipeline state object is - // created and there are no more pipeline state objects to create. This is - // triggered by the thread creating the last pipeline state object. + // Manual-reset event set when the last queued pipeline is created and there + // are no more pipelines to create. This is triggered by the thread creating + // the last pipeline. std::unique_ptr creation_completion_event_; // Whether setting the event on completion is queued. Protected with // creation_request_lock_, notify_one creation_request_cond_ when set. 
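CreateD3D12Pipeline declared above ultimately comes down to filling a D3D12_GRAPHICS_PIPELINE_STATE_DESC and calling ID3D12Device::CreateGraphicsPipelineState, returning nullptr on failure so the handle stays usable but marked as failed. A hedged sketch of that final step under simple fixed state; the helper name is hypothetical and most guest-derived fields the real function translates are left at defaults here.

```cpp
#include <d3d12.h>

// Illustrative only: build one graphics PSO from already-translated bytecode.
ID3D12PipelineState* CreateGraphicsPipelineSketch(
    ID3D12Device* device, ID3D12RootSignature* root_signature,
    D3D12_SHADER_BYTECODE vs, D3D12_SHADER_BYTECODE ps, DXGI_FORMAT rtv_format,
    DXGI_FORMAT dsv_format) {
  D3D12_GRAPHICS_PIPELINE_STATE_DESC desc = {};
  desc.pRootSignature = root_signature;
  desc.VS = vs;
  desc.PS = ps;
  desc.BlendState.RenderTarget[0].RenderTargetWriteMask =
      D3D12_COLOR_WRITE_ENABLE_ALL;
  desc.SampleMask = 0xFFFFFFFFu;
  desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID;
  desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE;
  desc.RasterizerState.DepthClipEnable = TRUE;
  desc.DepthStencilState.DepthEnable = TRUE;
  desc.DepthStencilState.DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL;
  desc.DepthStencilState.DepthFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL;
  desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
  desc.NumRenderTargets = 1;
  desc.RTVFormats[0] = rtv_format;
  desc.DSVFormat = dsv_format;
  desc.SampleDesc.Count = 1;
  ID3D12PipelineState* state = nullptr;
  if (FAILED(
          device->CreateGraphicsPipelineState(&desc, IID_PPV_ARGS(&state)))) {
    return nullptr;  // The caller logs and treats the pipeline as unusable.
  }
  return state;
}
```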
diff --git a/src/xenia/gpu/d3d12/premake5.lua b/src/xenia/gpu/d3d12/premake5.lua index b4b9f3ecb..fa82fdb6c 100644 --- a/src/xenia/gpu/d3d12/premake5.lua +++ b/src/xenia/gpu/d3d12/premake5.lua @@ -25,15 +25,6 @@ project("xenia-gpu-d3d12-trace-viewer") kind("WindowedApp") language("C++") links({ - "aes_128", - "capstone", - "dxbc", - "fmt", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", "xenia-apu", "xenia-apu-nop", "xenia-base", @@ -49,6 +40,17 @@ project("xenia-gpu-d3d12-trace-viewer") "xenia-ui-d3d12", "xenia-vfs", "xenia-patcher", + }) + links({ + "aes_128", + "capstone", + "dxbc", + "fmt", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", "xxhash", }) files({ @@ -71,15 +73,6 @@ project("xenia-gpu-d3d12-trace-dump") kind("ConsoleApp") language("C++") links({ - "aes_128", - "capstone", - "dxbc", - "fmt", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", "xenia-apu", "xenia-apu-nop", "xenia-base", @@ -95,6 +88,17 @@ project("xenia-gpu-d3d12-trace-dump") "xenia-ui-d3d12", "xenia-vfs", "xenia-patcher", + }) + links({ + "aes_128", + "capstone", + "dxbc", + "fmt", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", "xxhash", }) files({ @@ -109,4 +113,4 @@ project("xenia-gpu-d3d12-trace-dump") "2>&1", "1>scratch/stdout-trace-dump.txt", }) - end \ No newline at end of file + end diff --git a/src/xenia/gpu/d3d12/primitive_converter.cc b/src/xenia/gpu/d3d12/primitive_converter.cc index d4f989123..90ba11ac5 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.cc +++ b/src/xenia/gpu/d3d12/primitive_converter.cc @@ -454,8 +454,8 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives( // again and again and exit. if (!conversion_needed || converted_index_count == 0) { converted_indices.gpu_address = 0; - converted_indices_cache_.insert( - std::make_pair(converted_indices.key.value, converted_indices)); + converted_indices_cache_.emplace(converted_indices.key.value, + converted_indices); memory_regions_used_ |= memory_regions_used_bits; return converted_index_count == 0 ? ConversionResult::kPrimitiveEmpty : ConversionResult::kConversionNotNeeded; @@ -670,8 +670,8 @@ PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives( // Cache and return the indices. converted_indices.gpu_address = gpu_address; - converted_indices_cache_.insert( - std::make_pair(converted_indices.key.value, converted_indices)); + converted_indices_cache_.emplace(converted_indices.key.value, + converted_indices); memory_regions_used_ |= memory_regions_used_bits; gpu_address_out = gpu_address; index_count_out = converted_index_count; diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index b2c964a55..66ef2ba9f 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -277,20 +277,19 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { return false; } - // Create the EDRAM load/store pipeline state objects. + // Create the EDRAM load/store pipelines. 
for (uint32_t i = 0; i < uint32_t(EdramLoadStoreMode::kCount); ++i) { const EdramLoadStoreModeInfo& mode_info = edram_load_store_mode_info_[i]; - edram_load_pipelines_[i] = ui::d3d12::util::CreateComputePipelineState( + edram_load_pipelines_[i] = ui::d3d12::util::CreateComputePipeline( device, mode_info.load_shader, mode_info.load_shader_size, edram_load_store_root_signature_); - edram_store_pipelines_[i] = ui::d3d12::util::CreateComputePipelineState( + edram_store_pipelines_[i] = ui::d3d12::util::CreateComputePipeline( device, mode_info.store_shader, mode_info.store_shader_size, edram_load_store_root_signature_); if (edram_load_pipelines_[i] == nullptr || edram_store_pipelines_[i] == nullptr) { - XELOGE( - "Failed to create the EDRAM load/store pipeline states for mode {}", - i); + XELOGE("Failed to create the EDRAM load/store pipelines for mode {}", + i); Shutdown(); return false; } @@ -299,7 +298,7 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { } } - // Create the resolve root signatures and pipeline state objects. + // Create the resolve root signatures and pipelines. D3D12_ROOT_PARAMETER resolve_root_parameters[3]; // Copying root signature. @@ -369,7 +368,7 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { return false; } - // Copying pipeline state objects. + // Copying pipelines. uint32_t resolution_scale = resolution_scale_2x_ ? 2 : 1; for (size_t i = 0; i < size_t(draw_util::ResolveCopyShaderIndex::kCount); ++i) { @@ -381,63 +380,61 @@ bool RenderTargetCache::Initialize(const TextureCache& texture_cache) { continue; } const auto& resolve_copy_shader = resolve_copy_shaders_[i]; - ID3D12PipelineState* resolve_copy_pipeline_state = - ui::d3d12::util::CreateComputePipelineState( + ID3D12PipelineState* resolve_copy_pipeline = + ui::d3d12::util::CreateComputePipeline( device, resolve_copy_shader.first, resolve_copy_shader.second, resolve_copy_root_signature_); - if (resolve_copy_pipeline_state == nullptr) { - XELOGE("Failed to create {} resolve copy pipeline state", + if (resolve_copy_pipeline == nullptr) { + XELOGE("Failed to create {} resolve copy pipeline", resolve_copy_shader_info.debug_name); } - resolve_copy_pipeline_state->SetName(reinterpret_cast( + resolve_copy_pipeline->SetName(reinterpret_cast( xe::to_utf16(resolve_copy_shader_info.debug_name).c_str())); - resolve_copy_pipeline_states_[i] = resolve_copy_pipeline_state; + resolve_copy_pipelines_[i] = resolve_copy_pipeline; } - // Clearing pipeline state objects. - resolve_clear_32bpp_pipeline_state_ = - ui::d3d12::util::CreateComputePipelineState( - device, - resolution_scale_2x_ ? resolve_clear_32bpp_2xres_cs - : resolve_clear_32bpp_cs, - resolution_scale_2x_ ? sizeof(resolve_clear_32bpp_2xres_cs) - : sizeof(resolve_clear_32bpp_cs), - resolve_clear_root_signature_); - if (resolve_clear_32bpp_pipeline_state_ == nullptr) { - XELOGE("Failed to create the 32bpp resolve clear pipeline state"); + // Clearing pipelines. + resolve_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( + device, + resolution_scale_2x_ ? resolve_clear_32bpp_2xres_cs + : resolve_clear_32bpp_cs, + resolution_scale_2x_ ? 
sizeof(resolve_clear_32bpp_2xres_cs) + : sizeof(resolve_clear_32bpp_cs), + resolve_clear_root_signature_); + if (resolve_clear_32bpp_pipeline_ == nullptr) { + XELOGE("Failed to create the 32bpp resolve clear pipeline"); Shutdown(); return false; } - resolve_clear_32bpp_pipeline_state_->SetName(L"Resolve Clear 32bpp"); - resolve_clear_64bpp_pipeline_state_ = - ui::d3d12::util::CreateComputePipelineState( - device, - resolution_scale_2x_ ? resolve_clear_64bpp_2xres_cs - : resolve_clear_64bpp_cs, - resolution_scale_2x_ ? sizeof(resolve_clear_64bpp_2xres_cs) - : sizeof(resolve_clear_64bpp_cs), - resolve_clear_root_signature_); - if (resolve_clear_64bpp_pipeline_state_ == nullptr) { - XELOGE("Failed to create the 64bpp resolve clear pipeline state"); + resolve_clear_32bpp_pipeline_->SetName(L"Resolve Clear 32bpp"); + resolve_clear_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( + device, + resolution_scale_2x_ ? resolve_clear_64bpp_2xres_cs + : resolve_clear_64bpp_cs, + resolution_scale_2x_ ? sizeof(resolve_clear_64bpp_2xres_cs) + : sizeof(resolve_clear_64bpp_cs), + resolve_clear_root_signature_); + if (resolve_clear_64bpp_pipeline_ == nullptr) { + XELOGE("Failed to create the 64bpp resolve clear pipeline"); Shutdown(); return false; } - resolve_clear_64bpp_pipeline_state_->SetName(L"Resolve Clear 64bpp"); + resolve_clear_64bpp_pipeline_->SetName(L"Resolve Clear 64bpp"); if (!edram_rov_used_) { assert_false(resolution_scale_2x_); - resolve_clear_depth_24_32_pipeline_state_ = - ui::d3d12::util::CreateComputePipelineState( + resolve_clear_depth_24_32_pipeline_ = + ui::d3d12::util::CreateComputePipeline( device, resolve_clear_depth_24_32_cs, sizeof(resolve_clear_depth_24_32_cs), resolve_clear_root_signature_); - if (resolve_clear_depth_24_32_pipeline_state_ == nullptr) { + if (resolve_clear_depth_24_32_pipeline_ == nullptr) { XELOGE( "Failed to create the 24-bit and 32-bit depth resolve clear pipeline " "state"); Shutdown(); return false; } - resolve_clear_64bpp_pipeline_state_->SetName( + resolve_clear_64bpp_pipeline_->SetName( L"Resolve Clear 24-bit & 32-bit Depth"); } @@ -451,12 +448,12 @@ void RenderTargetCache::Shutdown() { edram_snapshot_restore_pool_.reset(); ui::d3d12::util::ReleaseAndNull(edram_snapshot_download_buffer_); - ui::d3d12::util::ReleaseAndNull(resolve_clear_depth_24_32_pipeline_state_); - ui::d3d12::util::ReleaseAndNull(resolve_clear_64bpp_pipeline_state_); - ui::d3d12::util::ReleaseAndNull(resolve_clear_32bpp_pipeline_state_); + ui::d3d12::util::ReleaseAndNull(resolve_clear_depth_24_32_pipeline_); + ui::d3d12::util::ReleaseAndNull(resolve_clear_64bpp_pipeline_); + ui::d3d12::util::ReleaseAndNull(resolve_clear_32bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(resolve_clear_root_signature_); - for (size_t i = 0; i < xe::countof(resolve_copy_pipeline_states_); ++i) { - ui::d3d12::util::ReleaseAndNull(resolve_copy_pipeline_states_[i]); + for (size_t i = 0; i < xe::countof(resolve_copy_pipelines_); ++i) { + ui::d3d12::util::ReleaseAndNull(resolve_copy_pipelines_[i]); } ui::d3d12::util::ReleaseAndNull(resolve_copy_root_signature_); for (uint32_t i = 0; i < uint32_t(EdramLoadStoreMode::kCount); ++i) { @@ -1209,8 +1206,8 @@ bool RenderTargetCache::Resolve(const Memory& memory, 0, sizeof(copy_shader_constants) / sizeof(uint32_t), ©_shader_constants, 0); } - command_processor_.SetComputePipelineState( - resolve_copy_pipeline_states_[size_t(copy_shader)]); + command_processor_.SetComputePipeline( + resolve_copy_pipelines_[size_t(copy_shader)]); 
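Each resolve pass above ends the same way: push the pass constants as root constants, bind the matching compute pipeline, submit pending barriers, then dispatch enough thread groups to cover the target region. A small sketch of the group-count arithmetic (round the region up to whole groups); the 8x8 group size is an assumption purely for illustration, not the shaders' actual layout.

```cpp
#include <cstdint>
#include <utility>

// Number of thread groups needed to cover `extent` items when each group
// processes `group_size` of them: round up so partial tiles are still covered.
constexpr uint32_t GroupCount(uint32_t extent, uint32_t group_size) {
  return (extent + group_size - 1) / group_size;
}

// Example: a 1280x720 clear region with an assumed 8x8 thread group layout.
constexpr std::pair<uint32_t, uint32_t> kClearGroups = {
    GroupCount(1280, 8),  // 160 groups along X
    GroupCount(720, 8),   // 90 groups along Y
};
static_assert(kClearGroups.first == 160 && kClearGroups.second == 90,
              "round-up group count");
```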
command_processor_.SubmitBarriers(); command_list.D3DDispatch(copy_group_count_x, copy_group_count_y, 1); @@ -1279,9 +1276,9 @@ bool RenderTargetCache::Resolve(const Memory& memory, command_list.D3DSetComputeRoot32BitConstants( 0, sizeof(depth_clear_constants) / sizeof(uint32_t), &depth_clear_constants, 0); - command_processor_.SetComputePipelineState( - clear_float32_depth ? resolve_clear_depth_24_32_pipeline_state_ - : resolve_clear_32bpp_pipeline_state_); + command_processor_.SetComputePipeline( + clear_float32_depth ? resolve_clear_depth_24_32_pipeline_ + : resolve_clear_32bpp_pipeline_); command_processor_.SubmitBarriers(); command_list.D3DDispatch(clear_group_count.first, clear_group_count.second, 1); @@ -1301,10 +1298,10 @@ bool RenderTargetCache::Resolve(const Memory& memory, 0, sizeof(color_clear_constants) / sizeof(uint32_t), &color_clear_constants, 0); } - command_processor_.SetComputePipelineState( + command_processor_.SetComputePipeline( resolve_info.color_edram_info.format_is_64bpp - ? resolve_clear_64bpp_pipeline_state_ - : resolve_clear_32bpp_pipeline_state_); + ? resolve_clear_64bpp_pipeline_ + : resolve_clear_32bpp_pipeline_); command_processor_.SubmitBarriers(); command_list.D3DDispatch(clear_group_count.first, clear_group_count.second, 1); @@ -1816,7 +1813,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget( render_target->footprints, nullptr, nullptr, ©_buffer_size); render_target->copy_buffer_size = uint32_t(copy_buffer_size); - render_targets_.insert(std::make_pair(key.value, render_target)); + render_targets_.emplace(key.value, render_target); COUNT_profile_set("gpu/render_target_cache/render_targets", render_targets_.size()); #if 0 @@ -2015,8 +2012,7 @@ void RenderTargetCache::StoreRenderTargetsToEdram() { 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); EdramLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, render_target->key.format); - command_processor_.SetComputePipelineState( - edram_store_pipelines_[size_t(mode)]); + command_processor_.SetComputePipeline(edram_store_pipelines_[size_t(mode)]); // 1 group per 80x16 samples. command_list.D3DDispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1); @@ -2140,8 +2136,7 @@ void RenderTargetCache::LoadRenderTargetsFromEdram( 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); EdramLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, render_target->key.format); - command_processor_.SetComputePipelineState( - edram_load_pipelines_[size_t(mode)]); + command_processor_.SetComputePipeline(edram_load_pipelines_[size_t(mode)]); // 1 group per 80x16 samples. command_list.D3DDispatch(render_target->key.width_ss_div_80, edram_rows, 1); diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index 0def0d25c..6d20e8d52 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -237,14 +237,13 @@ class D3D12CommandProcessor; // get each of the 4 host pixels for each sample. 
class RenderTargetCache { public: - // Direct3D 12 debug layer does some kaschenit-style trolling by giving errors - // that contradict each other when you use null RTV descriptors - if you set - // a valid format in RTVFormats in the pipeline state, it says that null - // descriptors can only be used if the format in the pipeline state is - // DXGI_FORMAT_UNKNOWN, however, if DXGI_FORMAT_UNKNOWN is set, it complains - // that the format in the pipeline doesn't match the RTV format. So we have to - // make render target bindings consecutive and remap the output indices in - // pixel shaders. + // Direct3D 12 debug layer is giving errors that contradict each other when + // you use null RTV descriptors - if you set a valid format in RTVFormats in + // the pipeline state, it says that null descriptors can only be used if the + // format in the pipeline state is DXGI_FORMAT_UNKNOWN, however, if + // DXGI_FORMAT_UNKNOWN is set, it complains that the format in the pipeline + // state doesn't match the RTV format. So we have to make render target + // bindings consecutive and remap the output indices in pixel shaders. struct PipelineRenderTarget { uint32_t guest_render_target; DXGI_FORMAT format; @@ -304,8 +303,7 @@ class RenderTargetCache { // performance difference, but with EDRAM loads/stores less conversion should // be performed by the shaders if D24S8 is emulated as D24_UNORM_S8_UINT, and // it's probably more accurate. - static inline DXGI_FORMAT GetDepthDXGIFormat( - xenos::DepthRenderTargetFormat format) { + static DXGI_FORMAT GetDepthDXGIFormat(xenos::DepthRenderTargetFormat format) { return format == xenos::DepthRenderTargetFormat::kD24FS8 ? DXGI_FORMAT_D32_FLOAT_S8X24_UINT : DXGI_FORMAT_D24_UNORM_S8_UINT; @@ -537,7 +535,7 @@ class RenderTargetCache { // 16: - EDRAM pitch in tiles. uint32_t base_samples_2x_depth_pitch; }; - // EDRAM pipeline states for the RTV/DSV path. + // EDRAM pipelines for the RTV/DSV path. static const EdramLoadStoreModeInfo edram_load_store_mode_info_[size_t(EdramLoadStoreMode::kCount)]; ID3D12PipelineState* @@ -546,20 +544,20 @@ class RenderTargetCache { ID3D12PipelineState* edram_store_pipelines_[size_t(EdramLoadStoreMode::kCount)] = {}; - // Resolve root signatures and pipeline state objects. + // Resolve root signatures and pipelines. ID3D12RootSignature* resolve_copy_root_signature_ = nullptr; static const std::pair resolve_copy_shaders_[size_t(draw_util::ResolveCopyShaderIndex::kCount)]; - ID3D12PipelineState* resolve_copy_pipeline_states_[size_t( + ID3D12PipelineState* resolve_copy_pipelines_[size_t( draw_util::ResolveCopyShaderIndex::kCount)] = {}; ID3D12RootSignature* resolve_clear_root_signature_ = nullptr; // Clearing 32bpp color, depth with ROV, or unorm depth without ROV. - ID3D12PipelineState* resolve_clear_32bpp_pipeline_state_ = nullptr; + ID3D12PipelineState* resolve_clear_32bpp_pipeline_ = nullptr; // Clearing 64bpp color. - ID3D12PipelineState* resolve_clear_64bpp_pipeline_state_ = nullptr; + ID3D12PipelineState* resolve_clear_64bpp_pipeline_ = nullptr; // Clearing float depth without ROV, both the float24 and the host float32 // versions. - ID3D12PipelineState* resolve_clear_depth_24_32_pipeline_state_ = nullptr; + ID3D12PipelineState* resolve_clear_depth_24_32_pipeline_ = nullptr; // FIXME(Triang3l): Investigate what's wrong with placed RTV/DSV aliasing on // Nvidia Maxwell 1st generation and older. 
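The comment reworded above is the reason PipelineRenderTarget carries a guest_render_target index: since null RTV descriptors cannot be mixed with typed formats in the pipeline state, only the render targets actually written are bound, packed into consecutive slots, and pixel shader outputs are remapped to the packed slots. A simplified sketch of that compaction under assumed types (`PackedRenderTarget`, `PackRenderTargets` are illustrative names, not the real API):

```cpp
#include <cstdint>
#include <dxgiformat.h>

// Assumed, simplified mirror of the PipelineRenderTarget idea: which guest
// RT an output slot maps to, and the format actually bound there.
struct PackedRenderTarget {
  uint32_t guest_render_target;  // 0..3 guest index this host slot represents
  DXGI_FORMAT format;
};

// Pack only the guest render targets that are actually used, so the PSO never
// needs a "null" RTV slot between two valid ones. Returns the packed slot
// count; remap_out[guest] gives the host slot for each guest RT.
uint32_t PackRenderTargets(const DXGI_FORMAT guest_formats[4],
                           uint32_t used_mask, PackedRenderTarget packed[4],
                           uint32_t remap_out[4]) {
  uint32_t count = 0;
  for (uint32_t guest = 0; guest < 4; ++guest) {
    remap_out[guest] = UINT32_MAX;  // Not bound.
    if (!(used_mask & (1u << guest))) {
      continue;
    }
    packed[count] = {guest, guest_formats[guest]};
    remap_out[guest] = count++;
  }
  return count;  // Becomes NumRenderTargets; RTVFormats[i] = packed[i].format.
}
```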
diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index c8b1e6297..44d76c9ed 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -918,27 +918,24 @@ bool TextureCache::Initialize(bool edram_rov_used) { return false; } - // Create the loading pipeline state objects. + // Create the loading pipelines. for (uint32_t i = 0; i < uint32_t(LoadMode::kCount); ++i) { const LoadModeInfo& mode_info = load_mode_info_[i]; - load_pipeline_states_[i] = ui::d3d12::util::CreateComputePipelineState( + load_pipelines_[i] = ui::d3d12::util::CreateComputePipeline( device, mode_info.shader, mode_info.shader_size, load_root_signature_); - if (load_pipeline_states_[i] == nullptr) { - XELOGE( - "Failed to create the texture loading pipeline state object for mode " - "{}", - i); + if (load_pipelines_[i] == nullptr) { + XELOGE("Failed to create the texture loading pipeline for mode {}", i); Shutdown(); return false; } if (IsResolutionScale2X() && mode_info.shader_2x != nullptr) { - load_pipeline_states_2x_[i] = ui::d3d12::util::CreateComputePipelineState( + load_pipelines_2x_[i] = ui::d3d12::util::CreateComputePipeline( device, mode_info.shader_2x, mode_info.shader_2x_size, load_root_signature_); - if (load_pipeline_states_2x_[i] == nullptr) { + if (load_pipelines_2x_[i] == nullptr) { XELOGE( - "Failed to create the 2x-scaled texture loading pipeline state " - "for mode {}", + "Failed to create the 2x-scaled texture loading pipeline for mode " + "{}", i); Shutdown(); return false; @@ -1024,8 +1021,8 @@ void TextureCache::Shutdown() { ui::d3d12::util::ReleaseAndNull(null_srv_descriptor_heap_); for (uint32_t i = 0; i < uint32_t(LoadMode::kCount); ++i) { - ui::d3d12::util::ReleaseAndNull(load_pipeline_states_2x_[i]); - ui::d3d12::util::ReleaseAndNull(load_pipeline_states_[i]); + ui::d3d12::util::ReleaseAndNull(load_pipelines_2x_[i]); + ui::d3d12::util::ReleaseAndNull(load_pipelines_[i]); } ui::d3d12::util::ReleaseAndNull(load_root_signature_); @@ -1892,7 +1889,7 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) { if (IsResolutionScale2X() && key.tiled) { LoadMode load_mode = GetLoadMode(key); if (load_mode != LoadMode::kUnknown && - load_pipeline_states_2x_[uint32_t(load_mode)] != nullptr) { + load_pipelines_2x_[uint32_t(load_mode)] != nullptr) { uint32_t base_size = 0, mip_size = 0; texture_util::GetTextureTotalSize( key.dimension, key.width, key.height, key.depth, key.format, @@ -2047,7 +2044,7 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) { } texture->base_watch_handle = nullptr; texture->mip_watch_handle = nullptr; - textures_.insert(std::make_pair(map_key, texture)); + textures_.emplace(map_key, texture); COUNT_profile_set("gpu/texture_cache/textures", textures_.size()); textures_total_size_ += texture->resource_size; COUNT_profile_set("gpu/texture_cache/total_size_mb", @@ -2079,10 +2076,10 @@ bool TextureCache::LoadTextureData(Texture* texture) { return false; } bool scaled_resolve = texture->key.scaled_resolve ? true : false; - ID3D12PipelineState* pipeline_state = - scaled_resolve ? load_pipeline_states_2x_[uint32_t(load_mode)] - : load_pipeline_states_[uint32_t(load_mode)]; - if (pipeline_state == nullptr) { + ID3D12PipelineState* pipeline = scaled_resolve + ? 
load_pipelines_2x_[uint32_t(load_mode)] + : load_pipelines_[uint32_t(load_mode)]; + if (pipeline == nullptr) { return false; } const LoadModeInfo& load_mode_info = load_mode_info_[uint32_t(load_mode)]; @@ -2296,7 +2293,7 @@ bool TextureCache::LoadTextureData(Texture* texture) { load_mode_info.srv_bpe_log2); } } - command_processor_.SetComputePipelineState(pipeline_state); + command_processor_.SetComputePipeline(pipeline); command_list.D3DSetComputeRootSignature(load_root_signature_); command_list.D3DSetComputeRootDescriptorTable(2, descriptor_dest.second); @@ -2597,7 +2594,7 @@ uint32_t TextureCache::FindOrCreateTextureDescriptor(Texture& texture, } device->CreateShaderResourceView( texture.resource, &desc, GetTextureDescriptorCPUHandle(descriptor_index)); - texture.srv_descriptors.insert({descriptor_key, descriptor_index}); + texture.srv_descriptors.emplace(descriptor_key, descriptor_index); return descriptor_index; } diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 1345d8faf..85131f25d 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -106,18 +106,18 @@ class TextureCache { bool operator!=(const TextureKey& key) const { return GetMapKey() != key.GetMapKey() || bucket_key != key.bucket_key; } - inline uint64_t GetMapKey() const { + uint64_t GetMapKey() const { return uint64_t(map_key[0]) | (uint64_t(map_key[1]) << 32); } - inline void SetMapKey(uint64_t key) { + void SetMapKey(uint64_t key) { map_key[0] = uint32_t(key); map_key[1] = uint32_t(key >> 32); } - inline bool IsInvalid() const { + bool IsInvalid() const { // Zero base and zero width is enough for a binding to be invalid. return map_key[0] == 0; } - inline void MakeInvalid() { + void MakeInvalid() { // Reset all for a stable hash. SetMapKey(0); bucket_key = 0; @@ -222,9 +222,7 @@ class TextureCache { void MarkRangeAsResolved(uint32_t start_unscaled, uint32_t length_unscaled); - inline bool IsResolutionScale2X() const { - return scaled_resolve_buffer_ != nullptr; - } + bool IsResolutionScale2X() const { return scaled_resolve_buffer_ != nullptr; } ID3D12Resource* GetScaledResolveBuffer() const { return scaled_resolve_buffer_; } @@ -233,7 +231,7 @@ class TextureCache { uint32_t length_unscaled); void UseScaledResolveBufferForReading(); void UseScaledResolveBufferForWriting(); - inline void MarkScaledResolveBufferUAVWritesCommitNeeded() { + void MarkScaledResolveBufferUAVWritesCommitNeeded() { if (scaled_resolve_buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { scaled_resolve_buffer_uav_writes_commit_needed_ = true; } @@ -432,7 +430,7 @@ class TextureCache { // Whether the signed version of the texture has a different representation on // the host than its unsigned version (for example, if it's a fixed-point // texture emulated with a larger host pixel format). - static inline bool IsSignedVersionSeparate(xenos::TextureFormat format) { + static bool IsSignedVersionSeparate(xenos::TextureFormat format) { const HostFormat& host_format = host_formats_[uint32_t(format)]; return host_format.load_mode_snorm != LoadMode::kUnknown && host_format.load_mode_snorm != host_format.load_mode; @@ -441,26 +439,24 @@ class TextureCache { // of block-compressed textures with 4x4-aligned dimensions on PC). 
static bool IsDecompressionNeeded(xenos::TextureFormat format, uint32_t width, uint32_t height); - static inline DXGI_FORMAT GetDXGIResourceFormat(xenos::TextureFormat format, - uint32_t width, - uint32_t height) { + static DXGI_FORMAT GetDXGIResourceFormat(xenos::TextureFormat format, + uint32_t width, uint32_t height) { const HostFormat& host_format = host_formats_[uint32_t(format)]; return IsDecompressionNeeded(format, width, height) ? host_format.dxgi_format_uncompressed : host_format.dxgi_format_resource; } - static inline DXGI_FORMAT GetDXGIResourceFormat(TextureKey key) { + static DXGI_FORMAT GetDXGIResourceFormat(TextureKey key) { return GetDXGIResourceFormat(key.format, key.width, key.height); } - static inline DXGI_FORMAT GetDXGIUnormFormat(xenos::TextureFormat format, - uint32_t width, - uint32_t height) { + static DXGI_FORMAT GetDXGIUnormFormat(xenos::TextureFormat format, + uint32_t width, uint32_t height) { const HostFormat& host_format = host_formats_[uint32_t(format)]; return IsDecompressionNeeded(format, width, height) ? host_format.dxgi_format_uncompressed : host_format.dxgi_format_unorm; } - static inline DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) { + static DXGI_FORMAT GetDXGIUnormFormat(TextureKey key) { return GetDXGIUnormFormat(key.format, key.width, key.height); } @@ -550,9 +546,9 @@ class TextureCache { static const LoadModeInfo load_mode_info_[]; ID3D12RootSignature* load_root_signature_ = nullptr; - ID3D12PipelineState* load_pipeline_states_[size_t(LoadMode::kCount)] = {}; - // Load pipeline state objects for 2x-scaled resolved targets. - ID3D12PipelineState* load_pipeline_states_2x_[size_t(LoadMode::kCount)] = {}; + ID3D12PipelineState* load_pipelines_[size_t(LoadMode::kCount)] = {}; + // Load pipelines for 2x-scaled resolved targets. + ID3D12PipelineState* load_pipelines_2x_[size_t(LoadMode::kCount)] = {}; std::unordered_multimap textures_; uint64_t textures_total_size_ = 0; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 6aaa1b856..202d34965 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -111,6 +111,34 @@ int32_t FloatToD3D11Fixed16p8(float f32) { return result.s; } +void GetScissor(const RegisterFile& regs, Scissor& scissor_out) { + // FIXME(Triang3l): Screen scissor isn't applied here, but it seems to be + // unused on Xbox 360 Direct3D 9. 
+ auto pa_sc_window_scissor_tl = regs.Get(); + auto pa_sc_window_scissor_br = regs.Get(); + uint32_t tl_x = pa_sc_window_scissor_tl.tl_x; + uint32_t tl_y = pa_sc_window_scissor_tl.tl_y; + uint32_t br_x = pa_sc_window_scissor_br.br_x; + uint32_t br_y = pa_sc_window_scissor_br.br_y; + if (!pa_sc_window_scissor_tl.window_offset_disable) { + auto pa_sc_window_offset = regs.Get(); + tl_x = uint32_t(std::max( + int32_t(tl_x) + pa_sc_window_offset.window_x_offset, int32_t(0))); + tl_y = uint32_t(std::max( + int32_t(tl_y) + pa_sc_window_offset.window_y_offset, int32_t(0))); + br_x = uint32_t(std::max( + int32_t(br_x) + pa_sc_window_offset.window_x_offset, int32_t(0))); + br_y = uint32_t(std::max( + int32_t(br_y) + pa_sc_window_offset.window_y_offset, int32_t(0))); + } + br_x = std::max(br_x, tl_x); + br_y = std::max(br_y, tl_y); + scissor_out.left = tl_x; + scissor_out.top = tl_y; + scissor_out.width = br_x - tl_x; + scissor_out.height = br_y - tl_y; +} + xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index edb880ab0..7ef3186a0 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -33,6 +33,14 @@ namespace draw_util { // for use with the top-left rasterization rule later. int32_t FloatToD3D11Fixed16p8(float f32); +struct Scissor { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; +}; +void GetScissor(const RegisterFile& regs, Scissor& scissor_out); + // To avoid passing values that the shader won't understand (even though // Direct3D 9 shouldn't pass them anyway). xenos::CopySampleSelect SanitizeCopySampleSelect( diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 74faf6e13..b2d24f89b 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -68,32 +68,34 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( break; case AluVectorOpcode::kMul: case AluVectorOpcode::kMad: { - bool is_mad = instr.vector_opcode == AluVectorOpcode::kMad; - if (is_mad) { - DxbcOpMAd(per_component_dest, operands[0], operands[1], operands[2]); - } else { - DxbcOpMul(per_component_dest, operands[0], operands[1]); - } - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - uint32_t absolute_different = + // Not using DXBC mad to prevent fused multiply-add (mul followed by add + // may be optimized into non-fused mad by the driver in the identical + // operands case also). + DxbcOpMul(per_component_dest, operands[0], operands[1]); + uint32_t multiplicands_different = used_result_components & - ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( + ~instr.vector_operands[0].GetIdenticalComponents( instr.vector_operands[1]); - if (absolute_different) { + if (multiplicands_different) { + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); - DxbcOpMin(DxbcDest::R(is_zero_temp, absolute_different), + DxbcOpMin(DxbcDest::R(is_zero_temp, multiplicands_different), operands[0].Abs(), operands[1].Abs()); // min isn't required to flush denormals, eq is. - DxbcOpEq(DxbcDest::R(is_zero_temp, absolute_different), + DxbcOpEq(DxbcDest::R(is_zero_temp, multiplicands_different), DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, absolute_different), - DxbcSrc::R(is_zero_temp), - is_mad ? 
operands[2] : DxbcSrc::LF(0.0f), + // Not replacing true `0 + term` with movc of the term because +0 + -0 + // should result in +0, not -0. + DxbcOpMovC(DxbcDest::R(system_temp_result_, multiplicands_different), + DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f), DxbcSrc::R(system_temp_result_)); // Release is_zero_temp. PopSystemTemp(); } + if (instr.vector_opcode == AluVectorOpcode::kMad) { + DxbcOpAdd(per_component_dest, DxbcSrc::R(system_temp_result_), + operands[2]); + } } break; case AluVectorOpcode::kMax: @@ -179,69 +181,40 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( component_count = 4; } result_swizzle = DxbcSrc::kXXXX; - uint32_t absolute_different = - uint32_t((1 << component_count) - 1) & - ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( - instr.vector_operands[1]); - if (absolute_different) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - // Add component products only if non-zero. For dp4, 16 scalar - // operations in the worst case (as opposed to always 20 for - // eq/movc/eq/movc/dp4 or min/eq/movc/movc/dp4 for preparing operands - // for dp4). - DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0001), - operands[0].SelectFromSwizzled(0), - operands[1].SelectFromSwizzled(0)); - if (absolute_different & 0b0001) { - DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0010), - operands[0].SelectFromSwizzled(0).Abs(), - operands[1].SelectFromSwizzled(0).Abs()); - DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0010), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + uint32_t different = uint32_t((1 << component_count) - 1) & + ~instr.vector_operands[0].GetIdenticalComponents( + instr.vector_operands[1]); + for (uint32_t i = 0; i < component_count; ++i) { + DxbcOpMul(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001), + operands[0].SelectFromSwizzled(i), + operands[1].SelectFromSwizzled(i)); + if ((different & (1 << i)) != 0) { + // Shader Model 3: +-0 or denormal * anything = +0 (also not replacing + // true `0 + term` with movc of the term because +0 + -0 should result + // in +0, not -0). + DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), + operands[0].SelectFromSwizzled(i).Abs(), + operands[1].SelectFromSwizzled(i).Abs()); + DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), - DxbcSrc::LF(0.0f), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); - } - for (uint32_t i = 1; i < component_count; ++i) { - bool component_different = (absolute_different & (1 << i)) != 0; - DxbcOpMAd(DxbcDest::R(system_temp_result_, - component_different ? 0b0010 : 0b0001), - operands[0].SelectFromSwizzled(i), - operands[1].SelectFromSwizzled(i), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); - if (component_different) { - DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), - operands[0].SelectFromSwizzled(i).Abs(), - operands[1].SelectFromSwizzled(i).Abs()); - DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcOpMovC(DxbcDest::R(system_temp_result_, i ? 0b0010 : 0b0001), DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), - DxbcSrc::LF(0.0f)); - DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), - DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), - DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), - DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); - } + DxbcSrc::LF(0.0f), + DxbcSrc::R(system_temp_result_, + i ? 
DxbcSrc::kYYYY : DxbcSrc::kXXXX)); } - } else { - if (component_count == 2) { - DxbcOpDP2(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); - } else if (component_count == 3) { - DxbcOpDP3(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); - } else { - assert_true(component_count == 4); - DxbcOpDP4(DxbcDest::R(system_temp_result_, 0b0001), operands[0], - operands[1]); + if (i) { + // Not using DXBC dp# to avoid fused multiply-add, PC GPUs are scalar + // as of 2020 anyway, and not using mad for the same reason (mul + // followed by add may be optimized into non-fused mad by the driver + // in the identical operands case also). + DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); } } if (component_count == 2) { - // Add the third operand. Since floating-point addition isn't - // associative, even though adding this in multiply-add for the first - // component would be faster, it's safer to add here, in the end. DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), operands[2].SelectFromSwizzled(0)); @@ -592,14 +565,13 @@ void DxbcShaderTranslator::ProcessVectorAluOperation( DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f)); } if (used_result_components & 0b0010) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010), operands[0].SelectFromSwizzled(1), operands[1].SelectFromSwizzled(1)); - if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents( + if (!(instr.vector_operands[0].GetIdenticalComponents( instr.vector_operands[1]) & 0b0010)) { + // Shader Model 3: +-0 or denormal * anything = +0. DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), operands[0].SelectFromSwizzled(1).Abs(), operands[1].SelectFromSwizzled(1).Abs()); @@ -700,8 +672,7 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( DxbcOpMul(ps_dest, operand_0_a, operand_0_b); if (instr.scalar_operands[0].components[0] != instr.scalar_operands[0].components[1]) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), operand_0_b.Abs()); @@ -714,58 +685,50 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( PopSystemTemp(); } break; - case AluScalarOpcode::kMulsPrev: { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. - uint32_t is_zero_temp = PushSystemTemp(); - DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), - ps_src.Abs()); - // min isn't required to flush denormals, eq is. - DxbcOpEq(DxbcDest::R(is_zero_temp, 0b0001), - DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f)); - DxbcOpMul(ps_dest, operand_0_a, ps_src); - DxbcOpMovC(ps_dest, DxbcSrc::R(is_zero_temp, DxbcSrc::kXXXX), - DxbcSrc::LF(0.0f), ps_src); - // Release is_zero_temp. - PopSystemTemp(); - } break; + case AluScalarOpcode::kMulsPrev: case AluScalarOpcode::kMulsPrev2: { uint32_t test_temp = PushSystemTemp(); - // Check if need to select the src0.a * ps case. - // ps != -FLT_MAX. 
- DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX)); - // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since -FLT_MAX - // is already loaded to an SGPR, this is also false if it's NaN. - DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(), - DxbcSrc::LF(-FLT_MAX)); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - // isfinite(src0.b). - DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(), - DxbcSrc::LF(-FLT_MAX)); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked - // for NaN). - DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), operand_0_b); - DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), - DxbcSrc::R(test_temp, DxbcSrc::kXXXX), - DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); - DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX)); - // Shader Model 3: 0 or denormal * anything = 0. - // ps is already known to be not NaN or Infinity, so multiplying it by 0 - // will result in 0. However, src0.a can be anything, so the result should - // be zero if ps is zero. - // FIXME(Triang3l): Signed zero needs research and handling. - DxbcOpEq(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(0.0f)); + if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) { + // Check if need to select the src0.a * ps case. + // ps != -FLT_MAX. + DxbcOpNE(DxbcDest::R(test_temp, 0b0001), ps_src, DxbcSrc::LF(-FLT_MAX)); + // isfinite(ps), or |ps| <= FLT_MAX, or -|ps| >= -FLT_MAX, since + // -FLT_MAX is already loaded to an SGPR, this is also false if it's + // NaN. + DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -ps_src.Abs(), + DxbcSrc::LF(-FLT_MAX)); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + // isfinite(src0.b). + DxbcOpGE(DxbcDest::R(test_temp, 0b0010), -operand_0_b.Abs(), + DxbcSrc::LF(-FLT_MAX)); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + // src0.b > 0 (need !(src0.b <= 0), but src0.b has already been checked + // for NaN). + DxbcOpLT(DxbcDest::R(test_temp, 0b0010), DxbcSrc::LF(0.0f), + operand_0_b); + DxbcOpAnd(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), + DxbcSrc::R(test_temp, DxbcSrc::kYYYY)); + DxbcOpIf(true, DxbcSrc::R(test_temp, DxbcSrc::kXXXX)); + } + // Shader Model 3: +-0 or denormal * anything = +0. + DxbcOpMin(DxbcDest::R(test_temp, 0b0001), operand_0_a.Abs(), + ps_src.Abs()); + // min isn't required to flush denormals, eq is. + DxbcOpEq(DxbcDest::R(test_temp, 0b0001), + DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f)); DxbcOpMul(ps_dest, operand_0_a, ps_src); DxbcOpMovC(ps_dest, DxbcSrc::R(test_temp, DxbcSrc::kXXXX), DxbcSrc::LF(0.0f), ps_src); - DxbcOpElse(); - DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX)); - DxbcOpEndIf(); + if (instr.scalar_opcode == AluScalarOpcode::kMulsPrev2) { + DxbcOpElse(); + DxbcOpMov(ps_dest, DxbcSrc::LF(-FLT_MAX)); + DxbcOpEndIf(); + } // Release test_temp. 
PopSystemTemp(); } break; @@ -1023,11 +986,10 @@ void DxbcShaderTranslator::ProcessScalarAluOperation( case AluScalarOpcode::kMulsc0: case AluScalarOpcode::kMulsc1: DxbcOpMul(ps_dest, operand_0_a, operand_1); - if (!(instr.scalar_operands[0].GetAbsoluteIdenticalComponents( + if (!(instr.scalar_operands[0].GetIdenticalComponents( instr.scalar_operands[1]) & 0b0001)) { - // Shader Model 3: 0 or denormal * anything = 0. - // FIXME(Triang3l): Signed zero needs research and handling. + // Shader Model 3: +-0 or denormal * anything = +0. uint32_t is_zero_temp = PushSystemTemp(); DxbcOpMin(DxbcDest::R(is_zero_temp, 0b0001), operand_0_a.Abs(), operand_1.Abs()); diff --git a/src/xenia/gpu/dxbc_shader_translator_fetch.cc b/src/xenia/gpu/dxbc_shader_translator_fetch.cc index 0a86f7ff6..76eed4d10 100644 --- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc +++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc @@ -99,8 +99,8 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( DxbcOpAnd(address_dest, fetch_constant_src.SelectFromSwizzled(0), DxbcSrc::LU(~uint32_t(3))); } - // Add the word offset from the instruction, plus the offset of the first - // needed word within the element. + // Add the word offset from the instruction (signed), plus the offset of the + // first needed word within the element. uint32_t first_word_index; xe::bit_scan_forward(needed_words, &first_word_index); int32_t first_word_buffer_offset = @@ -1730,10 +1730,10 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( } uint32_t texture_binding_index_unsigned = FindOrAddTextureBinding(tfetch_index, srv_dimension, false); - const TextureBinding& texture_binding_unsigned = - texture_bindings_[texture_binding_index_unsigned]; uint32_t texture_binding_index_signed = FindOrAddTextureBinding(tfetch_index, srv_dimension, true); + const TextureBinding& texture_binding_unsigned = + texture_bindings_[texture_binding_index_unsigned]; const TextureBinding& texture_binding_signed = texture_bindings_[texture_binding_index_signed]; DxbcSrc srv_unsigned(DxbcSrc::LF(0.0f)), srv_signed(DxbcSrc::LF(0.0f)); diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index e54792a27..04bc8024b 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -135,7 +135,7 @@ X_STATUS GraphicsSystem::Setup(cpu::Processor* processor, })); // As we run vblank interrupts the debugger must be able to suspend us. vsync_worker_thread_->set_can_debugger_suspend(true); - vsync_worker_thread_->set_name("GraphicsSystem Vsync"); + vsync_worker_thread_->set_name("GPU VSync"); vsync_worker_thread_->Create(); if (cvars::trace_gpu_stream) { diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 2c25e682d..23998c307 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -65,17 +65,17 @@ enum class InstructionStorageTarget { // disassembly (because oPts.x000 will be assembled, but oPts.x00_ has both // skipped components and zeros, which cannot be encoded, and therefore it will // not). 
-constexpr uint32_t GetInstructionStorageTargetUsedComponents( +constexpr uint32_t GetInstructionStorageTargetUsedComponentCount( InstructionStorageTarget target) { switch (target) { case InstructionStorageTarget::kNone: - return 0b0000; + return 0; case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex: - return 0b0111; + return 3; case InstructionStorageTarget::kDepth: - return 0b0001; + return 1; default: - return 0b1111; + return 4; } } @@ -136,8 +136,9 @@ struct InstructionResult { // Returns the write mask containing only components actually present in the // target. uint32_t GetUsedWriteMask() const { - return original_write_mask & - GetInstructionStorageTargetUsedComponents(storage_target); + uint32_t target_component_count = + GetInstructionStorageTargetUsedComponentCount(storage_target); + return original_write_mask & ((1 << target_component_count) - 1); } // True if the components are in their 'standard' swizzle arrangement (xyzw). bool IsStandardSwizzle() const { @@ -161,6 +162,28 @@ struct InstructionResult { } return used_components; } + // Returns which components of the used write mask are constant, and what + // values they have. + uint32_t GetUsedConstantComponents(uint32_t& constant_values_out) const { + uint32_t constant_components = 0; + uint32_t constant_values = 0; + uint32_t used_write_mask = GetUsedWriteMask(); + for (uint32_t i = 0; i < 4; ++i) { + if (!(used_write_mask & (1 << i))) { + continue; + } + SwizzleSource component = components[i]; + if (component >= SwizzleSource::kX && component <= SwizzleSource::kW) { + continue; + } + constant_components |= 1 << i; + if (component == SwizzleSource::k1) { + constant_values |= 1 << i; + } + } + constant_values_out = constant_values; + return constant_components; + } }; enum class InstructionStorageSource { @@ -212,14 +235,18 @@ struct InstructionOperand { return false; } - // Returns which components of two operands are identical, but may have - // different signs (for simplicity of usage with GetComponent, treating the - // rightmost component as replicated). - uint32_t GetAbsoluteIdenticalComponents( - const InstructionOperand& other) const { + // Returns which components of two operands will always be bitwise equal + // (disregarding component_count for simplicity of usage with GetComponent, + // treating the rightmost component as replicated). This, strictly with all + // conditions, must be used when emulating Shader Model 3 +-0 * x = +0 + // multiplication behavior with IEEE-compliant multiplication (because + // -0 * |-0|, or -0 * +0, is -0, while the result must be +0). + uint32_t GetIdenticalComponents(const InstructionOperand& other) const { if (storage_source != other.storage_source || storage_index != other.storage_index || - storage_addressing_mode != other.storage_addressing_mode) { + storage_addressing_mode != other.storage_addressing_mode || + is_negated != other.is_negated || + is_absolute_value != other.is_absolute_value) { return 0; } uint32_t identical_components = 0; @@ -229,16 +256,6 @@ struct InstructionOperand { } return identical_components; } - // Returns which components of two operands will always be bitwise equal, but - // may have different signs (disregarding component_count for simplicity of - // usage with GetComponent, treating the rightmost component as replicated). 
- uint32_t GetIdenticalComponents(const InstructionOperand& other) const { - if (is_negated != other.is_negated || - is_absolute_value != other.is_absolute_value) { - return 0; - } - return GetAbsoluteIdenticalComponents(other); - } }; struct ParsedExecInstruction { diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h index 496836a38..98719b670 100644 --- a/src/xenia/gpu/shared_memory.h +++ b/src/xenia/gpu/shared_memory.h @@ -25,6 +25,9 @@ namespace gpu { // system page size granularity. class SharedMemory { public: + static constexpr uint32_t kBufferSizeLog2 = 29; + static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2; + virtual ~SharedMemory(); // Call in the implementation-specific ClearCache. virtual void ClearCache(); @@ -98,9 +101,6 @@ class SharedMemory { // destructor. void ShutdownCommon(); - static constexpr uint32_t kBufferSizeLog2 = 29; - static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2; - // Sparse allocations are 4 MB, so not too many of them are allocated, but // also not to waste too much memory for padding (with 16 MB there's too // much). diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index c0c035167..21ccbaff9 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -800,13 +800,26 @@ static_assert_size(TextureFetchInstruction, 12); // Both are valid only within the current ALU clause. They are not modified // when the instruction that would write them fails its predication check. // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for -// multiplication (0 or denormal * anything = 0) wherever it's present (mul, -// mad, dp, etc.) and for NaN in min/max. It's very important to respect this -// rule for multiplication, as games often rely on it in vector normalization -// (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in -// games - causes white screen in Halo 3, white specular on characters in GTA -// IV. -// TODO(Triang3l): Investigate signed zero handling in multiplication. +// multiplication (+-0 or denormal * anything = +0) wherever it's present +// (mul, mad, dp, etc.) and for NaN in min/max. It's very important to respect +// this rule for multiplication, as games often rely on it in vector +// normalization (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of +// things in games - causes white screen in Halo 3, white specular on +// characters in GTA IV. The result is always positive zero in this case, no +// matter what the signs of the other operands are, according to R5xx +// Acceleration section 8.7.5 "Legacy multiply behavior" and testing on +// Adreno 200. This means that the following need to be taken into account +// (according to 8.7.2 "ALU Non-Transcendental Floating Point"): +// - +0 * -0 is -0 with IEEE conformance, however, with this legacy SM3 +// handling, it should result in +0. +// - +0 + -0 is +0, so multiply-add should not be replaced with conditional +// move of the third operand in case of zero multiplicands, because the term +// may be -0, while the result should be +0 in this case. +// http://developer.amd.com/wordpress/media/2013/10/R5xx_Acceleration_v1.5.pdf +// Multiply-add also appears to be not fused (the SM3 behavior instruction on +// GCN is called v_mad_legacy_f32, not v_fma_legacy_f32) - shader translators +// should not use instructions that may be interpreted by the host GPU as +// fused multiply-add. 
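To make the legacy multiplication rule described in the comment above concrete, here is a minimal scalar sketch in plain C++ (purely illustrative - the DXBC translator instead emits a regular mul guarded by min/eq/movc over the operands' absolute values, since, as noted in the translator code, min is not required to flush denormals while eq is):

```cpp
#include <cmath>

// Shader Model 3 "legacy" multiply: if either operand is +/-0 or a denormal,
// the result is +0 regardless of the other operand's value (even NaN or
// infinity) and regardless of the operands' signs.
float LegacyMul(float a, float b) {
  int class_a = std::fpclassify(a);
  int class_b = std::fpclassify(b);
  if (class_a == FP_ZERO || class_a == FP_SUBNORMAL || class_b == FP_ZERO ||
      class_b == FP_SUBNORMAL) {
    return 0.0f;  // Always +0, never -0.
  }
  return a * b;
}

// Legacy multiply-add: the multiply obeys the rule above, and the addition is
// a separate, non-fused IEEE add. Since +0 + -0 == +0, `0 * x + c` must not be
// collapsed into just `c` - c may be -0 while the correct result is +0.
float LegacyMad(float a, float b, float c) { return LegacyMul(a, b) + c; }
```

For instance, `LegacyMad(0.0f, INFINITY, -0.0f)` yields +0 under these rules, which is exactly the case that replacing the multiply-add with a conditional move of the third operand would get wrong.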
enum class AluScalarOpcode : uint32_t { // Floating-Point Add diff --git a/src/xenia/gpu/vulkan/premake5.lua b/src/xenia/gpu/vulkan/premake5.lua index 14259b183..58512ec17 100644 --- a/src/xenia/gpu/vulkan/premake5.lua +++ b/src/xenia/gpu/vulkan/premake5.lua @@ -30,17 +30,6 @@ project("xenia-gpu-vulkan-trace-viewer") kind("WindowedApp") language("C++") links({ - "aes_128", - "capstone", - "fmt", - "glslang-spirv", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", - "spirv-tools", - "volk", "xenia-apu", "xenia-apu-nop", "xenia-base", @@ -57,6 +46,19 @@ project("xenia-gpu-vulkan-trace-viewer") "xenia-ui-vulkan", "xenia-vfs", "xenia-patcher", + }) + links({ + "aes_128", + "capstone", + "fmt", + "glslang-spirv", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", + "spirv-tools", + "volk", "xxhash", }) defines({ @@ -98,17 +100,6 @@ project("xenia-gpu-vulkan-trace-dump") kind("ConsoleApp") language("C++") links({ - "aes_128", - "capstone", - "fmt", - "glslang-spirv", - "imgui", - "libavcodec", - "libavutil", - "mspack", - "snappy", - "spirv-tools", - "volk", "xenia-apu", "xenia-apu-nop", "xenia-base", @@ -125,6 +116,19 @@ project("xenia-gpu-vulkan-trace-dump") "xenia-ui-vulkan", "xenia-vfs", "xenia-patcher", + }) + links({ + "aes_128", + "capstone", + "fmt", + "glslang-spirv", + "imgui", + "libavcodec", + "libavutil", + "mspack", + "snappy", + "spirv-tools", + "volk", "xxhash", }) defines({ diff --git a/src/xenia/hid/premake5.lua b/src/xenia/hid/premake5.lua index 152887e2b..348e12371 100644 --- a/src/xenia/hid/premake5.lua +++ b/src/xenia/hid/premake5.lua @@ -41,11 +41,11 @@ project("xenia-hid-demo") filter("platforms:Linux") links({ + "SDL2", + "vulkan", "X11", "xcb", "X11-xcb", - "vulkan", - "SDL2", }) filter("platforms:Windows") diff --git a/src/xenia/kernel/kernel_state.cc b/src/xenia/kernel/kernel_state.cc index 01fed1e7f..dd0d7ec5f 100644 --- a/src/xenia/kernel/kernel_state.cc +++ b/src/xenia/kernel/kernel_state.cc @@ -359,7 +359,7 @@ void KernelState::SetExecutableModule(object_ref module) { } return 0; })); - dispatch_thread_->set_name("Kernel Dispatch Thread"); + dispatch_thread_->set_name("Kernel Dispatch"); dispatch_thread_->Create(); } } diff --git a/src/xenia/kernel/xam/xam_content.cc b/src/xenia/kernel/xam/xam_content.cc index f12612b10..cc42bfcb6 100644 --- a/src/xenia/kernel/xam/xam_content.cc +++ b/src/xenia/kernel/xam/xam_content.cc @@ -8,6 +8,7 @@ */ #include "xenia/base/logging.h" +#include "xenia/base/math.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/util/shim_utils.h" #include "xenia/kernel/xam/xam_private.h" @@ -235,7 +236,8 @@ dword_result_t XamContentCreateDeviceEnumerator(dword_t content_type, xe::store_and_swap(&dev->device_type, dummy_device_info_.device_type); xe::store_and_swap(&dev->total_bytes, dummy_device_info_.total_bytes); xe::store_and_swap(&dev->free_bytes, dummy_device_info_.free_bytes); - xe::copy_and_swap(dev->name, dummy_device_info_.name, 28); + xe::copy_and_swap(dev->name, dummy_device_info_.name, + xe::countof(dev->name)); } *handle_out = e->handle(); diff --git a/src/xenia/kernel/xam/xam_info.cc b/src/xenia/kernel/xam/xam_info.cc index 758dde9c2..f0c28c14a 100644 --- a/src/xenia/kernel/xam/xam_info.cc +++ b/src/xenia/kernel/xam/xam_info.cc @@ -9,6 +9,7 @@ #include "xenia/base/logging.h" #include "xenia/base/cvar.h" +#include "xenia/base/string_util.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/user_module.h" #include "xenia/kernel/util/shim_utils.h" @@ -77,15 +78,15 @@ static 
SYSTEMTIME xeGetLocalSystemTime(uint64_t filetime) { void XamFormatDateString(dword_t unk, qword_t filetime, lpvoid_t output_buffer, dword_t output_count) { - std::memset(output_buffer, 0, output_count * 2); + std::memset(output_buffer, 0, output_count * sizeof(char16_t)); // TODO: implement this for other platforms #if XE_PLATFORM_WIN32 auto st = xeGetLocalSystemTime(filetime); // TODO: format this depending on users locale? auto str = fmt::format(u"{:02d}/{:02d}/{}", st.wMonth, st.wDay, st.wYear); - auto copy_length = std::min(size_t(output_count), str.size()) * 2; - xe::copy_and_swap(output_buffer.as(), str.c_str(), copy_length); + xe::string_util::copy_and_swap_truncating(output_buffer.as(), str, + output_count); #else assert_always(); #endif @@ -94,15 +95,15 @@ DECLARE_XAM_EXPORT1(XamFormatDateString, kNone, kImplemented); void XamFormatTimeString(dword_t unk, qword_t filetime, lpvoid_t output_buffer, dword_t output_count) { - std::memset(output_buffer, 0, output_count * 2); + std::memset(output_buffer, 0, output_count * sizeof(char16_t)); // TODO: implement this for other platforms #if XE_PLATFORM_WIN32 auto st = xeGetLocalSystemTime(filetime); // TODO: format this depending on users locale? auto str = fmt::format(u"{:02d}:{:02d}", st.wHour, st.wMinute); - auto copy_count = std::min(size_t(output_count), str.size()); - xe::copy_and_swap(output_buffer.as(), str.c_str(), copy_count); + xe::string_util::copy_and_swap_truncating(output_buffer.as(), str, + output_count); #else assert_always(); #endif @@ -124,9 +125,8 @@ dword_result_t keXamBuildResourceLocator(uint64_t module, path = fmt::format(u"section://{:X},{}#{}", (uint32_t)module, container, resource); } - auto copy_count = std::min(size_t(buffer_count), path.size()); - xe::copy_and_swap(buffer_ptr.as(), path.c_str(), copy_count); - (buffer_ptr.as())[copy_count] = 0; + xe::string_util::copy_and_swap_truncating(buffer_ptr.as(), path, + buffer_count); return 0; } diff --git a/src/xenia/kernel/xam/xam_net.cc b/src/xenia/kernel/xam/xam_net.cc index ddeccd9e9..ed7e4a023 100644 --- a/src/xenia/kernel/xam/xam_net.cc +++ b/src/xenia/kernel/xam/xam_net.cc @@ -984,8 +984,7 @@ dword_result_t NetDll___WSAFDIsSet(dword_t socket_handle, DECLARE_XAM_EXPORT1(NetDll___WSAFDIsSet, kNetworking, kImplemented); void RegisterNetExports(xe::cpu::ExportResolver* export_resolver, - KernelState* kernel_state) { -} + KernelState* kernel_state) {} } // namespace xam } // namespace kernel diff --git a/src/xenia/kernel/xam/xam_ui.cc b/src/xenia/kernel/xam/xam_ui.cc index 6d2fc7ea4..1723a0949 100644 --- a/src/xenia/kernel/xam/xam_ui.cc +++ b/src/xenia/kernel/xam/xam_ui.cc @@ -9,6 +9,7 @@ #include "third_party/imgui/imgui.h" #include "xenia/base/logging.h" +#include "xenia/base/string_util.h" #include "xenia/emulator.h" #include "xenia/kernel/kernel_flags.h" #include "xenia/kernel/kernel_state.h" @@ -188,8 +189,8 @@ class KeyboardInputDialog : public xe::ui::ImGuiDialog { *out_text_ = default_text; } text_buffer_.resize(max_length); - std::strncpy(text_buffer_.data(), default_text_.c_str(), - std::min(text_buffer_.size() - 1, default_text_.size())); + xe::string_util::copy_truncating(text_buffer_.data(), default_text_, + text_buffer_.size()); } void OnDraw(ImGuiIO& io) override { diff --git a/src/xenia/kernel/xam/xam_user.cc b/src/xenia/kernel/xam/xam_user.cc index 3e58639a9..9c746548c 100644 --- a/src/xenia/kernel/xam/xam_user.cc +++ b/src/xenia/kernel/xam/xam_user.cc @@ -10,6 +10,8 @@ #include #include "xenia/base/logging.h" +#include "xenia/base/math.h" 
+#include "xenia/base/string_util.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/util/shim_utils.h" #include "xenia/kernel/xam/xam_private.h" @@ -168,7 +170,8 @@ X_HRESULT_result_t XamUserGetSigninInfo(dword_t user_index, dword_t flags, const auto& user_profile = kernel_state()->user_profile(); info->xuid = user_profile->xuid(); info->signin_state = user_profile->signin_state(); - std::strncpy(info->name, user_profile->name().data(), 15); + xe::string_util::copy_truncating(info->name, user_profile->name(), + xe::countof(info->name)); return X_E_SUCCESS; } DECLARE_XAM_EXPORT1(XamUserGetSigninInfo, kUserProfiles, kImplemented); @@ -187,10 +190,8 @@ dword_result_t XamUserGetName(dword_t user_index, lpstring_t buffer, const auto& user_name = user_profile->name(); // Real XAM will only copy a maximum of 15 characters out. - size_t copy_length = std::min( - {size_t(15), user_name.size(), static_cast(buffer_len) - 1}); - std::memcpy(buffer, user_name.data(), copy_length); - buffer[copy_length] = '\0'; + xe::string_util::copy_truncating(buffer, user_name, + std::min(buffer_len.value(), uint32_t(15))); return X_ERROR_SUCCESS; } DECLARE_XAM_EXPORT1(XamUserGetName, kUserProfiles, kImplemented); diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 6ec46617e..f4ab5cec4 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -226,19 +226,21 @@ DECLARE_XBOXKRNL_EXPORT1(KeSetCurrentStackPointers, kThreading, kImplemented); dword_result_t KeSetAffinityThread(lpvoid_t thread_ptr, dword_t affinity, lpdword_t previous_affinity_ptr) { - uint32_t previous_affinity = 0; - + // The Xbox 360, according to disassembly of KeSetAffinityThread, unlike + // Windows NT, stores the previous affinity via the pointer provided as an + // argument, not in the return value - the return value is used for the + // result. + if (!affinity) { + return X_STATUS_INVALID_PARAMETER; + } auto thread = XObject::GetNativeObject(kernel_state(), thread_ptr); if (thread) { - previous_affinity = thread->affinity(); + if (previous_affinity_ptr) { + *previous_affinity_ptr = uint32_t(1) << thread->active_cpu(); + } thread->SetAffinity(affinity); } - - if (previous_affinity_ptr) { - *previous_affinity_ptr = previous_affinity; - } - - return (uint32_t)affinity; + return X_STATUS_SUCCESS; } DECLARE_XBOXKRNL_EXPORT1(KeSetAffinityThread, kThreading, kImplemented); diff --git a/src/xenia/kernel/xthread.cc b/src/xenia/kernel/xthread.cc index 118d0b0a2..b14462a75 100644 --- a/src/xenia/kernel/xthread.cc +++ b/src/xenia/kernel/xthread.cc @@ -157,11 +157,17 @@ void XThread::set_name(const std::string_view name) { } } -uint8_t next_cpu = 0; -uint8_t GetFakeCpuNumber(uint8_t proc_mask) { +static uint8_t next_cpu = 0; +static uint8_t GetFakeCpuNumber(uint8_t proc_mask) { + // NOTE: proc_mask is logical processors, not physical processors or cores. if (!proc_mask) { next_cpu = (next_cpu + 1) % 6; return next_cpu; // is this reasonable? + // TODO(Triang3l): Does the following apply here? + // https://docs.microsoft.com/en-us/windows/win32/dxtecharts/coding-for-multiple-cores + // "On Xbox 360, you must explicitly assign software threads to a particular + // hardware thread by using XSetThreadProcessor. Otherwise, all child + // threads will stay on the same hardware thread as the parent." 
} assert_false(proc_mask & 0xC0); @@ -206,6 +212,7 @@ void XThread::InitializeGuestObject() { // 0xA88 = APC // 0x18 = timer xe::store_and_swap(p + 0x09C, 0xFDFFD7FF); + // current_cpu is expected to be initialized externally via SetActiveCpu. xe::store_and_swap(p + 0x0D0, stack_base_); xe::store_and_swap(p + 0x130, Clock::QueryGuestSystemTime()); xe::store_and_swap(p + 0x144, guest_object() + 0x144); @@ -347,6 +354,12 @@ X_STATUS XThread::Create() { // Exports use this to get the kernel. thread_state_->context()->kernel_state = kernel_state_; + uint8_t cpu_index = GetFakeCpuNumber( + static_cast(creation_params_.creation_flags >> 24)); + + // Initialize the KTHREAD object. + InitializeGuestObject(); + X_KPCR* pcr = memory()->TranslateVirtual(pcr_address_); pcr->tls_ptr = tls_static_address_; @@ -356,14 +369,11 @@ X_STATUS XThread::Create() { pcr->stack_base_ptr = stack_base_; pcr->stack_end_ptr = stack_limit_; - uint8_t proc_mask = - static_cast(creation_params_.creation_flags >> 24); + pcr->dpc_active = 0; // DPC active bool? - pcr->current_cpu = GetFakeCpuNumber(proc_mask); // Current CPU(?) - pcr->dpc_active = 0; // DPC active bool? - - // Initialize the KTHREAD object. - InitializeGuestObject(); + // Assign the thread to the logical processor, and also set up the current CPU + // in KPCR and KTHREAD. + SetActiveCpu(cpu_index); // Always retain when starting - the thread owns itself until exited. RetainHandle(); @@ -416,10 +426,6 @@ X_STATUS XThread::Create() { return X_STATUS_NO_MEMORY; } - if (!cvars::ignore_thread_affinities) { - thread_->set_affinity_mask(proc_mask); - } - // Set the thread name based on host ID (for easier debugging). if (thread_name_.empty()) { set_name(fmt::format("XThread{:04X}", thread_->system_id())); @@ -712,37 +718,36 @@ void XThread::SetPriority(int32_t increment) { } void XThread::SetAffinity(uint32_t affinity) { - // Affinity mask, as in SetThreadAffinityMask. - // Xbox thread IDs: - // 0 - core 0, thread 0 - user - // 1 - core 0, thread 1 - user - // 2 - core 1, thread 0 - sometimes xcontent - // 3 - core 1, thread 1 - user - // 4 - core 2, thread 0 - xaudio - // 5 - core 2, thread 1 - user - // TODO(benvanik): implement better thread distribution. - // NOTE: these are logical processors, not physical processors or cores. + SetActiveCpu(GetFakeCpuNumber(affinity)); +} + +uint8_t XThread::active_cpu() const { + const X_KPCR& pcr = *memory()->TranslateVirtual(pcr_address_); + return pcr.current_cpu; +} + +void XThread::SetActiveCpu(uint8_t cpu_index) { + // May be called during thread creation - don't skip if current == new. 
+ + assert_true(cpu_index < 6); + + X_KPCR& pcr = *memory()->TranslateVirtual(pcr_address_); + pcr.current_cpu = cpu_index; + + if (is_guest_thread()) { + X_KTHREAD& thread_object = + *memory()->TranslateVirtual(guest_object()); + thread_object.current_cpu = cpu_index; + } + if (xe::threading::logical_processor_count() < 6) { XELOGW("Too few processors - scheduling will be wonky"); } - SetActiveCpu(GetFakeCpuNumber(affinity)); - affinity_ = affinity; if (!cvars::ignore_thread_affinities) { - thread_->set_affinity_mask(affinity); + thread_->set_affinity_mask(uint64_t(1) << cpu_index); } } -uint32_t XThread::active_cpu() const { - uint8_t* pcr = memory()->TranslateVirtual(pcr_address_); - return xe::load_and_swap(pcr + 0x10C); -} - -void XThread::SetActiveCpu(uint32_t cpu_index) { - assert_true(cpu_index < 6); - uint8_t* pcr = memory()->TranslateVirtual(pcr_address_); - xe::store_and_swap(pcr + 0x10C, cpu_index); -} - bool XThread::GetTLSValue(uint32_t slot, uint32_t* value_out) { if (slot * 4 > tls_total_size_) { return false; diff --git a/src/xenia/kernel/xthread.h b/src/xenia/kernel/xthread.h index 84abfd027..3c8e4ecaa 100644 --- a/src/xenia/kernel/xthread.h +++ b/src/xenia/kernel/xthread.h @@ -88,7 +88,8 @@ struct X_KTHREAD { char unk_10[0xAC]; // 0x10 uint8_t suspend_count; // 0xBC uint8_t unk_BD; // 0xBD - uint16_t unk_BE; // 0xBE + uint8_t unk_BE; // 0xBE + uint8_t current_cpu; // 0xBF char unk_C0[0x70]; // 0xC0 xe::be create_time; // 0x130 xe::be exit_time; // 0x138 @@ -171,10 +172,17 @@ class XThread : public XObject, public cpu::Thread { int32_t priority() const { return priority_; } int32_t QueryPriority(); void SetPriority(int32_t increment); - uint32_t affinity() const { return affinity_; } + + // Xbox thread IDs: + // 0 - core 0, thread 0 - user + // 1 - core 0, thread 1 - user + // 2 - core 1, thread 0 - sometimes xcontent + // 3 - core 1, thread 1 - user + // 4 - core 2, thread 0 - xaudio + // 5 - core 2, thread 1 - user void SetAffinity(uint32_t affinity); - uint32_t active_cpu() const; - void SetActiveCpu(uint32_t cpu_index); + uint8_t active_cpu() const; + void SetActiveCpu(uint8_t cpu_index); bool GetTLSValue(uint32_t slot, uint32_t* value_out); bool SetTLSValue(uint32_t slot, uint32_t value); @@ -226,7 +234,6 @@ class XThread : public XObject, public cpu::Thread { bool running_ = false; int32_t priority_ = 0; - uint32_t affinity_ = 0; xe::global_critical_region global_critical_region_; std::atomic irql_ = {0}; diff --git a/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc b/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc index b9e23dc93..5c0a104e5 100644 --- a/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc +++ b/src/xenia/ui/d3d12/d3d12_immediate_drawer.cc @@ -118,15 +118,15 @@ bool D3D12ImmediateDrawer::Initialize() { return false; } - // Create the pipeline states. - D3D12_GRAPHICS_PIPELINE_STATE_DESC pipeline_state_desc = {}; - pipeline_state_desc.pRootSignature = root_signature_; - pipeline_state_desc.VS.pShaderBytecode = immediate_vs; - pipeline_state_desc.VS.BytecodeLength = sizeof(immediate_vs); - pipeline_state_desc.PS.pShaderBytecode = immediate_ps; - pipeline_state_desc.PS.BytecodeLength = sizeof(immediate_ps); + // Create the pipelines. 
+ D3D12_GRAPHICS_PIPELINE_STATE_DESC pipeline_desc = {}; + pipeline_desc.pRootSignature = root_signature_; + pipeline_desc.VS.pShaderBytecode = immediate_vs; + pipeline_desc.VS.BytecodeLength = sizeof(immediate_vs); + pipeline_desc.PS.pShaderBytecode = immediate_ps; + pipeline_desc.PS.BytecodeLength = sizeof(immediate_ps); D3D12_RENDER_TARGET_BLEND_DESC& pipeline_blend_desc = - pipeline_state_desc.BlendState.RenderTarget[0]; + pipeline_desc.BlendState.RenderTarget[0]; pipeline_blend_desc.BlendEnable = TRUE; pipeline_blend_desc.SrcBlend = D3D12_BLEND_SRC_ALPHA; pipeline_blend_desc.DestBlend = D3D12_BLEND_INV_SRC_ALPHA; @@ -138,11 +138,11 @@ bool D3D12ImmediateDrawer::Initialize() { pipeline_blend_desc.RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_RED | D3D12_COLOR_WRITE_ENABLE_GREEN | D3D12_COLOR_WRITE_ENABLE_BLUE; - pipeline_state_desc.SampleMask = UINT_MAX; - pipeline_state_desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID; - pipeline_state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; - pipeline_state_desc.RasterizerState.FrontCounterClockwise = FALSE; - pipeline_state_desc.RasterizerState.DepthClipEnable = TRUE; + pipeline_desc.SampleMask = UINT_MAX; + pipeline_desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID; + pipeline_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; + pipeline_desc.RasterizerState.FrontCounterClockwise = FALSE; + pipeline_desc.RasterizerState.DepthClipEnable = TRUE; D3D12_INPUT_ELEMENT_DESC pipeline_input_elements[3] = {}; pipeline_input_elements[0].SemanticName = "POSITION"; pipeline_input_elements[0].Format = DXGI_FORMAT_R32G32_FLOAT; @@ -154,26 +154,24 @@ bool D3D12ImmediateDrawer::Initialize() { pipeline_input_elements[2].Format = DXGI_FORMAT_R8G8B8A8_UNORM; pipeline_input_elements[2].AlignedByteOffset = offsetof(ImmediateVertex, color); - pipeline_state_desc.InputLayout.pInputElementDescs = pipeline_input_elements; - pipeline_state_desc.InputLayout.NumElements = + pipeline_desc.InputLayout.pInputElementDescs = pipeline_input_elements; + pipeline_desc.InputLayout.NumElements = UINT(xe::countof(pipeline_input_elements)); - pipeline_state_desc.PrimitiveTopologyType = - D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; - pipeline_state_desc.NumRenderTargets = 1; - pipeline_state_desc.RTVFormats[0] = D3D12Context::kSwapChainFormat; - pipeline_state_desc.SampleDesc.Count = 1; + pipeline_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + pipeline_desc.NumRenderTargets = 1; + pipeline_desc.RTVFormats[0] = D3D12Context::kSwapChainFormat; + pipeline_desc.SampleDesc.Count = 1; if (FAILED(device->CreateGraphicsPipelineState( - &pipeline_state_desc, IID_PPV_ARGS(&pipeline_state_triangle_)))) { + &pipeline_desc, IID_PPV_ARGS(&pipeline_triangle_)))) { XELOGE( "Failed to create the Direct3D 12 immediate drawer triangle pipeline " "state"); Shutdown(); return false; } - pipeline_state_desc.PrimitiveTopologyType = - D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE; + pipeline_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE; if (FAILED(device->CreateGraphicsPipelineState( - &pipeline_state_desc, IID_PPV_ARGS(&pipeline_state_line_)))) { + &pipeline_desc, IID_PPV_ARGS(&pipeline_line_)))) { XELOGE( "Failed to create the Direct3D 12 immediate drawer line pipeline " "state"); @@ -267,8 +265,8 @@ void D3D12ImmediateDrawer::Shutdown() { util::ReleaseAndNull(sampler_heap_); - util::ReleaseAndNull(pipeline_state_line_); - util::ReleaseAndNull(pipeline_state_triangle_); + util::ReleaseAndNull(pipeline_line_); + util::ReleaseAndNull(pipeline_triangle_); 
util::ReleaseAndNull(root_signature_); } @@ -611,17 +609,17 @@ void D3D12ImmediateDrawer::Draw(const ImmediateDraw& draw) { uint32_t(sampler_index))); } - // Set the primitive type and the pipeline state for it. + // Set the primitive type and the pipeline for it. D3D_PRIMITIVE_TOPOLOGY primitive_topology; - ID3D12PipelineState* pipeline_state; + ID3D12PipelineState* pipeline; switch (draw.primitive_type) { case ImmediatePrimitiveType::kLines: primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINELIST; - pipeline_state = pipeline_state_line_; + pipeline = pipeline_line_; break; case ImmediatePrimitiveType::kTriangles: primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; - pipeline_state = pipeline_state_triangle_; + pipeline = pipeline_triangle_; break; default: assert_unhandled_case(draw.primitive_type); @@ -630,7 +628,7 @@ void D3D12ImmediateDrawer::Draw(const ImmediateDraw& draw) { if (current_primitive_topology_ != primitive_topology) { current_primitive_topology_ = primitive_topology; current_command_list_->IASetPrimitiveTopology(primitive_topology); - current_command_list_->SetPipelineState(pipeline_state); + current_command_list_->SetPipelineState(pipeline); } // Draw. diff --git a/src/xenia/ui/d3d12/d3d12_immediate_drawer.h b/src/xenia/ui/d3d12/d3d12_immediate_drawer.h index 4300af76e..fbc362f59 100644 --- a/src/xenia/ui/d3d12/d3d12_immediate_drawer.h +++ b/src/xenia/ui/d3d12/d3d12_immediate_drawer.h @@ -105,8 +105,8 @@ class D3D12ImmediateDrawer : public ImmediateDrawer { kCount }; - ID3D12PipelineState* pipeline_state_triangle_ = nullptr; - ID3D12PipelineState* pipeline_state_line_ = nullptr; + ID3D12PipelineState* pipeline_triangle_ = nullptr; + ID3D12PipelineState* pipeline_line_ = nullptr; ID3D12DescriptorHeap* sampler_heap_ = nullptr; D3D12_CPU_DESCRIPTOR_HANDLE sampler_heap_cpu_start_; diff --git a/src/xenia/ui/d3d12/d3d12_provider.h b/src/xenia/ui/d3d12/d3d12_provider.h index 0e70def17..255d42a3d 100644 --- a/src/xenia/ui/d3d12/d3d12_provider.h +++ b/src/xenia/ui/d3d12/d3d12_provider.h @@ -46,22 +46,22 @@ class D3D12Provider : public GraphicsProvider { uint32_t GetRTVDescriptorSize() const { return descriptor_size_rtv_; } uint32_t GetDSVDescriptorSize() const { return descriptor_size_dsv_; } template - inline T OffsetViewDescriptor(T start, uint32_t index) const { + T OffsetViewDescriptor(T start, uint32_t index) const { start.ptr += index * descriptor_size_view_; return start; } template - inline T OffsetSamplerDescriptor(T start, uint32_t index) const { + T OffsetSamplerDescriptor(T start, uint32_t index) const { start.ptr += index * descriptor_size_sampler_; return start; } template - inline T OffsetRTVDescriptor(T start, uint32_t index) const { + T OffsetRTVDescriptor(T start, uint32_t index) const { start.ptr += index * descriptor_size_rtv_; return start; } template - inline T OffsetDSVDescriptor(T start, uint32_t index) const { + T OffsetDSVDescriptor(T start, uint32_t index) const { start.ptr += index * descriptor_size_dsv_; return start; } @@ -91,32 +91,30 @@ class D3D12Provider : public GraphicsProvider { } // Proxies for Direct3D 12 functions since they are loaded dynamically. 
- inline HRESULT SerializeRootSignature(const D3D12_ROOT_SIGNATURE_DESC* desc, - D3D_ROOT_SIGNATURE_VERSION version, - ID3DBlob** blob_out, - ID3DBlob** error_blob_out) const { + HRESULT SerializeRootSignature(const D3D12_ROOT_SIGNATURE_DESC* desc, + D3D_ROOT_SIGNATURE_VERSION version, + ID3DBlob** blob_out, + ID3DBlob** error_blob_out) const { return pfn_d3d12_serialize_root_signature_(desc, version, blob_out, error_blob_out); } - inline HRESULT Disassemble(const void* src_data, size_t src_data_size, - UINT flags, const char* comments, - ID3DBlob** disassembly_out) const { + HRESULT Disassemble(const void* src_data, size_t src_data_size, UINT flags, + const char* comments, ID3DBlob** disassembly_out) const { if (!pfn_d3d_disassemble_) { return E_NOINTERFACE; } return pfn_d3d_disassemble_(src_data, src_data_size, flags, comments, disassembly_out); } - inline HRESULT DxbcConverterCreateInstance(const CLSID& rclsid, - const IID& riid, - void** ppv) const { + HRESULT DxbcConverterCreateInstance(const CLSID& rclsid, const IID& riid, + void** ppv) const { if (!pfn_dxilconv_dxc_create_instance_) { return E_NOINTERFACE; } return pfn_dxilconv_dxc_create_instance_(rclsid, riid, ppv); } - inline HRESULT DxcCreateInstance(const CLSID& rclsid, const IID& riid, - void** ppv) const { + HRESULT DxcCreateInstance(const CLSID& rclsid, const IID& riid, + void** ppv) const { if (!pfn_dxcompiler_dxc_create_instance_) { return E_NOINTERFACE; } diff --git a/src/xenia/ui/d3d12/d3d12_util.cc b/src/xenia/ui/d3d12/d3d12_util.cc index 710d3b6db..caea2b296 100644 --- a/src/xenia/ui/d3d12/d3d12_util.cc +++ b/src/xenia/ui/d3d12/d3d12_util.cc @@ -47,7 +47,7 @@ ID3D12RootSignature* CreateRootSignature( return root_signature; } -ID3D12PipelineState* CreateComputePipelineState( +ID3D12PipelineState* CreateComputePipeline( ID3D12Device* device, const void* shader, size_t shader_size, ID3D12RootSignature* root_signature) { D3D12_COMPUTE_PIPELINE_STATE_DESC desc; diff --git a/src/xenia/ui/d3d12/d3d12_util.h b/src/xenia/ui/d3d12/d3d12_util.h index 5bce23568..6798f4f1c 100644 --- a/src/xenia/ui/d3d12/d3d12_util.h +++ b/src/xenia/ui/d3d12/d3d12_util.h @@ -27,7 +27,7 @@ extern const D3D12_HEAP_PROPERTIES kHeapPropertiesUpload; extern const D3D12_HEAP_PROPERTIES kHeapPropertiesReadback; template -inline bool ReleaseAndNull(T& object) { +bool ReleaseAndNull(T& object) { if (object != nullptr) { object->Release(); object = nullptr; @@ -39,9 +39,10 @@ inline bool ReleaseAndNull(T& object) { ID3D12RootSignature* CreateRootSignature(const D3D12Provider& provider, const D3D12_ROOT_SIGNATURE_DESC& desc); -ID3D12PipelineState* CreateComputePipelineState( - ID3D12Device* device, const void* shader, size_t shader_size, - ID3D12RootSignature* root_signature); +ID3D12PipelineState* CreateComputePipeline(ID3D12Device* device, + const void* shader, + size_t shader_size, + ID3D12RootSignature* root_signature); constexpr DXGI_FORMAT GetUintPow2DXGIFormat(uint32_t element_size_bytes_log2) { switch (element_size_bytes_log2) { diff --git a/src/xenia/ui/graphics_upload_buffer_pool.cc b/src/xenia/ui/graphics_upload_buffer_pool.cc index 2a780b0c9..5eb04fba3 100644 --- a/src/xenia/ui/graphics_upload_buffer_pool.cc +++ b/src/xenia/ui/graphics_upload_buffer_pool.cc @@ -71,7 +71,7 @@ void GraphicsUploadBufferPool::FlushWrites() { GraphicsUploadBufferPool::Page* GraphicsUploadBufferPool::Request( uint64_t submission_index, size_t size, size_t alignment, size_t& offset_out) { - assert_not_zero(alignment); + alignment = std::max(alignment, size_t(1)); 
assert_true(xe::is_pow2(alignment)); size = xe::align(size, alignment); assert_true(size <= page_size_); @@ -126,7 +126,7 @@ GraphicsUploadBufferPool::Page* GraphicsUploadBufferPool::Request( GraphicsUploadBufferPool::Page* GraphicsUploadBufferPool::RequestPartial( uint64_t submission_index, size_t size, size_t alignment, size_t& offset_out, size_t& size_out) { - assert_not_zero(alignment); + alignment = std::max(alignment, size_t(1)); assert_true(xe::is_pow2(alignment)); size = xe::align(size, alignment); size = std::min(size, page_size_); diff --git a/third_party/SDL2-static.lua b/third_party/SDL2-static.lua index a9206e300..447ceb325 100644 --- a/third_party/SDL2-static.lua +++ b/third_party/SDL2-static.lua @@ -18,7 +18,7 @@ project("SDL2") "SDL2/include", }) buildoptions({ - "/wd4828", -- illegal characters in file + "/wd4828", -- illegal characters in file https://bugzilla.libsdl.org/show_bug.cgi?id=5333 }) files({ -- 1:1 from SDL.vcxproj file diff --git a/third_party/premake-cmake b/third_party/premake-cmake new file mode 160000 index 000000000..26fbbb996 --- /dev/null +++ b/third_party/premake-cmake @@ -0,0 +1 @@ +Subproject commit 26fbbb9962aefcb1c24aff1e7952033ce1361190 diff --git a/third_party/spirv-tools.lua b/third_party/spirv-tools.lua index bf900a6e9..0e6335b98 100644 --- a/third_party/spirv-tools.lua +++ b/third_party/spirv-tools.lua @@ -73,4 +73,4 @@ project("spirv-tools") buildoptions({ "/wd4800", -- Forcing value to bool 'true' or 'false' "/wd4996", -- Call to 'std::equal' with parameters that may be unsafe - }) \ No newline at end of file + }) diff --git a/tools/build/premake b/tools/build/premake index 14e3d5ebc..9113958a5 100644 --- a/tools/build/premake +++ b/tools/build/premake @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python3 # Copyright 2015 Ben Vanik. All Rights Reserved. @@ -107,13 +107,14 @@ def has_bin(bin): return None -def shell_call(command, throw_on_error=True, stdout_path=None): +def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False): """Executes a shell command. Args: command: Command to execute, as a list of parameters. throw_on_error: Whether to throw an error or return the status code. stdout_path: File path to write stdout output to. + stderr_path: File path to write stderr output to. Returns: If throw_on_error is False the status code of the call will be returned. @@ -121,17 +122,22 @@ def shell_call(command, throw_on_error=True, stdout_path=None): stdout_file = None if stdout_path: stdout_file = open(stdout_path, 'w') + stderr_file = None + if stderr_path: + stderr_file = open(stderr_path, 'w') result = 0 try: if throw_on_error: result = 1 - subprocess.check_call(command, shell=False, stdout=stdout_file) + subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) result = 0 else: - result = subprocess.call(command, shell=False, stdout=stdout_file) + result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) finally: if stdout_file: stdout_file.close() + if stderr_file: + stderr_file.close() return result @@ -196,42 +202,5 @@ def import_subprocess_environment(args): os.environ[var.upper()] = setting break -def git_submodule_update(): - """Runs a full recursive git submodule init and update. - - Older versions of git do not support 'update --init --recursive'. We could - check and run it on versions that do support it and speed things up a bit. 
- """ - if True: - shell_call([ - 'git', - 'submodule', - 'update', - '--init', - '--recursive', - ]) - else: - shell_call([ - 'git', - 'submodule', - 'init', - ]) - shell_call([ - 'git', - 'submodule', - 'foreach', - '--recursive', - 'git', - 'submodule', - 'init', - ]) - shell_call([ - 'git', - 'submodule', - 'update', - '--recursive', - ]) - - if __name__ == '__main__': main() diff --git a/xenia-build b/xenia-build index 19bcd0307..ec3c00345 100755 --- a/xenia-build +++ b/xenia-build @@ -34,8 +34,11 @@ def main(): # Check git exists. if not has_bin('git'): - print('ERROR: git must be installed and on PATH.') - sys.exit(1) + print('WARNING: Git should be installed and on PATH. Version info will be omitted from all binaries!') + print('') + elif not git_is_repository(): + print('WARNING: The source tree is unversioned. Version info will be omitted from all binaries!') + print('') # Check python version. if not sys.version_info[:2] >= (3, 6): @@ -85,6 +88,16 @@ def main(): sys.exit(return_code) +def print_box(msg): + """Prints an important message inside a box + """ + print( + '┌{0:─^{2}}╖\n' + '│{1: ^{2}}║\n' + '╘{0:═^{2}}╝\n' + .format('', msg, len(msg) + 2)) + + def import_vs_environment(): """Finds the installed Visual Studio version and imports interesting environment variables into os.environ. @@ -150,6 +163,7 @@ def import_subprocess_environment(args): os.environ[var.upper()] = setting break + def has_bin(binary): """Checks whether the given binary is present. @@ -185,13 +199,14 @@ def get_bin(binary): return None -def shell_call(command, throw_on_error=True, stdout_path=None, shell=False): +def shell_call(command, throw_on_error=True, stdout_path=None, stderr_path=None, shell=False): """Executes a shell command. Args: command: Command to execute, as a list of parameters. throw_on_error: Whether to throw an error or return the status code. stdout_path: File path to write stdout output to. + stderr_path: File path to write stderr output to. Returns: If throw_on_error is False the status code of the call will be returned. @@ -199,21 +214,49 @@ def shell_call(command, throw_on_error=True, stdout_path=None, shell=False): stdout_file = None if stdout_path: stdout_file = open(stdout_path, 'w') + stderr_file = None + if stderr_path: + stderr_file = open(stderr_path, 'w') result = 0 try: if throw_on_error: result = 1 - subprocess.check_call(command, shell=shell, stdout=stdout_file) + subprocess.check_call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) result = 0 else: - result = subprocess.call(command, shell=shell, stdout=stdout_file) + result = subprocess.call(command, shell=shell, stdout=stdout_file, stderr=stderr_file) finally: if stdout_file: stdout_file.close() + if stderr_file: + stderr_file.close() return result -def get_git_head_info(): +def generate_version_h(): + """Generates a build/version.h file that contains current git info. + """ + if git_is_repository(): + (branch_name, commit, commit_short) = git_get_head_info() + else: + branch_name = 'tarball' + commit = ':(-dont-do-this' + commit_short = ':(' + + contents = '''// Autogenerated by `xb premake`. 
@@ -247,58 +290,28 @@
   return branch_name, commit, commit_short
 
 
-def generate_version_h():
-  """Generates a build/version.h file that contains current git info.
+def git_is_repository():
+  """Checks if git is available and this source tree is versioned.
   """
-  (branch_name, commit, commit_short) = get_git_head_info()
-  contents = '''// Autogenerated by `xb premake`.
-    #ifndef GENERATED_VERSION_H_
-    #define GENERATED_VERSION_H_
-    #define XE_BUILD_BRANCH "%s"
-    #define XE_BUILD_COMMIT "%s"
-    #define XE_BUILD_COMMIT_SHORT "%s"
-    #define XE_BUILD_DATE __DATE__
-    #endif  // GENERATED_VERSION_H_
-    ''' % (branch_name, commit, commit_short)
-  with open('build/version.h', 'w') as f:
-    f.write(contents)
+  if not has_bin('git'):
+    return False
+  return shell_call([
+      'git',
+      'rev-parse',
+      '--is-inside-work-tree',
+  ], throw_on_error=False, stdout_path=os.devnull, stderr_path=os.devnull) == 0
 
 
 def git_submodule_update():
   """Runs a full recursive git submodule init and update.
-
-  Older versions of git do not support 'update --init --recursive'. We could
-  check and run it on versions that do support it and speed things up a bit.
   """
-  if True:
-    shell_call([
-        'git',
-        'submodule',
-        'update',
-        '--init',
-        '--recursive',
-    ])
-  else:
-    shell_call([
-        'git',
-        'submodule',
-        'init',
-    ])
-    shell_call([
-        'git',
-        'submodule',
-        'foreach',
-        '--recursive',
-        'git',
-        'submodule',
-        'init',
-    ])
-    shell_call([
-        'git',
-        'submodule',
-        'update',
-        '--recursive',
-    ])
+  shell_call([
+      'git',
+      'submodule',
+      'update',
+      '--init',
+      '--recursive',
+  ])
 
 
 def get_clang_format_binary():
@@ -370,9 +383,9 @@ def run_platform_premake(cc='clang', devenv=None):
     if 'VSVERSION' in os.environ:
       vs_version = os.environ['VSVERSION']
-    return run_premake('windows', 'vs' + vs_version)
+    return run_premake('windows', devenv or ('vs' + vs_version))
   else:
-    return run_premake('linux', devenv == 'codelite' and devenv or 'gmake2', cc)
+    return run_premake('linux', devenv or 'gmake2', cc)
 
 
 def run_premake_export_commands():
@@ -406,6 +419,43 @@ def get_build_bin_path(args):
   return os.path.join(self_path, 'build', 'bin', platform.capitalize(),
                       args['config'].capitalize())
 
 
+def create_clion_workspace():
+  """Creates some basic workspace information inside the .idea directory for first start.
+  """
+  if os.path.exists('.idea'):
+    # No first start
+    return False
+  print('Generating CLion workspace files...')
+  # Might become easier in the future: https://youtrack.jetbrains.com/issue/CPP-7911
+
+  # Set the location of the CMakeLists.txt
+  os.mkdir('.idea')
+  with open(os.path.join('.idea', 'misc.xml'), 'w') as f:
+    f.write("""
+
+
+
+
+
+""")
+
+  # Set available configurations
+  # TODO Find a way to trigger a cmake reload
+  with open(os.path.join('.idea', 'workspace.xml'), 'w') as f:
+    f.write("""
+
+
+
+
+
+
+
+
+""")
+
+  return True
+
+
 def discover_commands(subparsers):
   """Looks for all commands and returns a dictionary of them.
   In the future commands could be discovered on disk.
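The DevenvCommand changes further below push their reload reminder through the `print_box()` helper added earlier in this patch. Restated as a small stand-alone sketch, with the rendered output shown in comments (the message here is shortened for the example):

```python
def print_box(msg):
  # Same formatting trick as the helper in this patch: '{0:─^{2}}' centers an
  # empty string in a field of len(msg) + 2 columns, padding it with '─';
  # the middle line centers msg itself with spaces.
  print(
      '┌{0:─^{2}}╖\n'
      '│{1: ^{2}}║\n'
      '╘{0:═^{2}}╝\n'
      .format('', msg, len(msg) + 2))

print_box('Reload CMake Project')
# ┌──────────────────────╖
# │ Reload CMake Project ║
# ╘══════════════════════╝
```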
@@ -491,7 +541,10 @@ class SetupCommand(Command):
     # Setup submodules.
     print('- git submodule init / update...')
-    git_submodule_update()
+    if git_is_repository():
+      git_submodule_update()
+    else:
+      print('WARNING: Git not available or not a repository. Dependencies may be missing.')
     print('')
 
     print('- running premake...')
@@ -1445,8 +1498,13 @@ class DevenvCommand(Command):
   def execute(self, args, pass_args, cwd):
     devenv = None
+    show_reload_prompt = False
     if sys.platform == 'win32':
       print('Launching Visual Studio...')
+    elif has_bin('clion') or has_bin('clion.sh'):
+      print('Launching CLion...')
+      show_reload_prompt = create_clion_workspace()
+      devenv = 'cmake'
     else:
       print('Launching CodeLite...')
       devenv = 'codelite'
@@ -1457,11 +1515,23 @@
     print('')
     print('- launching devenv...')
+    if show_reload_prompt:
+      print_box('Please run "File ⇒ ↺ Reload CMake Project" from inside the IDE!')
     if sys.platform == 'win32':
       shell_call([
           'devenv',
           'build\\xenia.sln',
       ])
+    elif has_bin('clion'):
+      shell_call([
+          'clion',
+          '.',
+      ])
+    elif has_bin('clion.sh'):
+      shell_call([
+          'clion.sh',
+          '.',
+      ])
     else:
       shell_call([
           'codelite',