diff --git a/Source/Core/Common/ChunkFile.h b/Source/Core/Common/ChunkFile.h index e657cd2217..b72c3ed818 100644 --- a/Source/Core/Common/ChunkFile.h +++ b/Source/Core/Common/ChunkFile.h @@ -32,11 +32,6 @@ #include "Common/Inline.h" #include "Common/Logging/Log.h" -// XXX: Replace this with std::is_trivially_copyable<T> once we stop using volatile -// on things that are put in savestates, as volatile types are not trivially copyable. -template <typename T> -constexpr bool IsTriviallyCopyable = std::is_trivially_copyable<std::remove_volatile_t<T>>::value; - // Wrapper class class PointerWrap { @@ -181,13 +176,13 @@ public: DoArray(x.data(), static_cast<u32>(x.size())); } - template <typename T, typename std::enable_if_t<IsTriviallyCopyable<T>, int> = 0> + template <typename T, typename std::enable_if_t<std::is_trivially_copyable_v<T>, int> = 0> void DoArray(T* x, u32 count) { DoVoid(x, count * sizeof(T)); } - template <typename T, typename std::enable_if_t<!IsTriviallyCopyable<T>, int> = 0> + template <typename T, typename std::enable_if_t<!std::is_trivially_copyable_v<T>, int> = 0> void DoArray(T* x, u32 count) { for (u32 i = 0; i < count; ++i) @@ -230,7 +225,7 @@ public: template <typename T> void Do(T& x) { - static_assert(IsTriviallyCopyable<T>, "Only sane for trivially copyable types"); + static_assert(std::is_trivially_copyable_v<T>, "Only sane for trivially copyable types"); // Note: // Usually we can just use x = **ptr, etc. However, this doesn't work // for unions containing BitFields (long story, stupid language rules) diff --git a/Source/Core/Core/HW/MMIO.cpp b/Source/Core/Core/HW/MMIO.cpp index 95fc143129..41a378990c 100644 --- a/Source/Core/Core/HW/MMIO.cpp +++ b/Source/Core/Core/HW/MMIO.cpp @@ -101,20 +101,10 @@ ReadHandlingMethod<T>* DirectRead(const T* addr, u32 mask) return new DirectHandlingMethod<T>(const_cast<T*>(addr), mask); } template <typename T> -ReadHandlingMethod<T>* DirectRead(volatile const T* addr, u32 mask) -{ - return new DirectHandlingMethod<T>((T*)addr, mask); -} -template <typename T> WriteHandlingMethod<T>* DirectWrite(T* addr, u32 mask) { return new DirectHandlingMethod<T>(addr, mask); } -template <typename T> -WriteHandlingMethod<T>* DirectWrite(volatile T* addr, u32 mask) -{ - return new DirectHandlingMethod<T>((T*)addr, mask); -} // Complex: holds a lambda that is called when a read or a write is executed. // This gives complete control to the user as to what is going to happen during diff --git a/Source/Core/Core/HW/MMIOHandlers.h b/Source/Core/Core/HW/MMIOHandlers.h index c9b6017554..300ed2909b 100644 --- a/Source/Core/Core/HW/MMIOHandlers.h +++ b/Source/Core/Core/HW/MMIOHandlers.h @@ -46,11 +46,7 @@ WriteHandlingMethod<T>* Nop(); template <typename T> ReadHandlingMethod<T>* DirectRead(const T* addr, u32 mask = 0xFFFFFFFF); template <typename T> -ReadHandlingMethod<T>* DirectRead(volatile const T* addr, u32 mask = 0xFFFFFFFF); -template <typename T> WriteHandlingMethod<T>* DirectWrite(T* addr, u32 mask = 0xFFFFFFFF); -template <typename T> -WriteHandlingMethod<T>* DirectWrite(volatile T* addr, u32 mask = 0xFFFFFFFF); // Complex: use when no other handling method fits your needs. 
These allow you // to directly provide a function that will be called when a read/write needs @@ -204,9 +200,7 @@ private: MaybeExtern template ReadHandlingMethod<T>* Constant<T>(T value); \ MaybeExtern template WriteHandlingMethod<T>* Nop<T>(); \ MaybeExtern template ReadHandlingMethod<T>* DirectRead(const T* addr, u32 mask); \ - MaybeExtern template ReadHandlingMethod<T>* DirectRead(volatile const T* addr, u32 mask); \ MaybeExtern template WriteHandlingMethod<T>* DirectWrite(T* addr, u32 mask); \ - MaybeExtern template WriteHandlingMethod<T>* DirectWrite(volatile T* addr, u32 mask); \ MaybeExtern template ReadHandlingMethod<T>* ComplexRead<T>(std::function<T(u32)>); \ MaybeExtern template WriteHandlingMethod<T>* ComplexWrite<T>(std::function<void(u32, T)>); \ MaybeExtern template ReadHandlingMethod<T>* InvalidRead<T>(); \ diff --git a/Source/Core/VideoBackends/D3D/D3DPerfQuery.cpp b/Source/Core/VideoBackends/D3D/D3DPerfQuery.cpp index a05077e14b..cf0e66aed5 100644 --- a/Source/Core/VideoBackends/D3D/D3DPerfQuery.cpp +++ b/Source/Core/VideoBackends/D3D/D3DPerfQuery.cpp @@ -27,11 +27,13 @@ PerfQuery::~PerfQuery() = default; void PerfQuery::EnableQuery(PerfQueryGroup type) { + const u32 query_count = m_query_count.load(std::memory_order_relaxed); + // Is this sane? - if (m_query_count > m_query_buffer.size() / 2) + if (query_count > m_query_buffer.size() / 2) WeakFlush(); - if (m_query_buffer.size() == m_query_count) + if (m_query_buffer.size() == query_count) { // TODO FlushOne(); @@ -41,12 +43,12 @@ void PerfQuery::EnableQuery(PerfQueryGroup type) // start query if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) { - auto& entry = m_query_buffer[(m_query_read_pos + m_query_count) % m_query_buffer.size()]; + auto& entry = m_query_buffer[(m_query_read_pos + query_count) % m_query_buffer.size()]; D3D::context->Begin(entry.query.Get()); entry.query_type = type; - ++m_query_count; + m_query_count.fetch_add(1, std::memory_order_relaxed); } } @@ -55,7 +57,8 @@ void PerfQuery::DisableQuery(PerfQueryGroup type) // stop query if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) { - auto& entry = m_query_buffer[(m_query_read_pos + m_query_count + m_query_buffer.size() - 1) % + auto& entry = m_query_buffer[(m_query_read_pos + m_query_count.load(std::memory_order_relaxed) + + m_query_buffer.size() - 1) % m_query_buffer.size()]; D3D::context->End(entry.query.Get()); } @@ -63,8 +66,9 @@ void PerfQuery::DisableQuery(PerfQueryGroup type) void PerfQuery::ResetQuery() { - m_query_count = 0; - std::fill(std::begin(m_results), std::end(m_results), 0); + m_query_count.store(0, std::memory_order_relaxed); + for (size_t i = 0; i < m_results.size(); ++i) + m_results[i].store(0, std::memory_order_relaxed); } u32 PerfQuery::GetQueryResult(PerfQueryType type) @@ -72,13 +76,22 @@ u32 PerfQuery::GetQueryResult(PerfQueryType type) u32 result = 0; if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC) - result = m_results[PQG_ZCOMP_ZCOMPLOC]; + { + result = m_results[PQG_ZCOMP_ZCOMPLOC].load(std::memory_order_relaxed); + } else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT) - result = m_results[PQG_ZCOMP]; + { + result = m_results[PQG_ZCOMP].load(std::memory_order_relaxed); + } else if (type == PQ_BLEND_INPUT) - result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC]; + { + result = m_results[PQG_ZCOMP].load(std::memory_order_relaxed) + + m_results[PQG_ZCOMP_ZCOMPLOC].load(std::memory_order_relaxed); + } else if (type == PQ_EFB_COPY_CLOCKS) - result = m_results[PQG_EFB_COPY_CLOCKS]; + { 
+ result = m_results[PQG_EFB_COPY_CLOCKS].load(std::memory_order_relaxed); + } return result; } @@ -98,11 +111,13 @@ void PerfQuery::FlushOne() // NOTE: Reported pixel metrics should be referenced to native resolution // TODO: Dropping the lower 2 bits from this count should be closer to actual // hardware behavior when drawing triangles. - m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() * - EFB_HEIGHT / g_renderer->GetTargetHeight()); + const u64 native_res_result = result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / + g_renderer->GetTargetHeight(); + m_results[entry.query_type].fetch_add(static_cast<u32>(native_res_result), + std::memory_order_relaxed); m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size(); - --m_query_count; + m_query_count.fetch_sub(1, std::memory_order_relaxed); } // TODO: could selectively flush things, but I don't think that will do much @@ -125,11 +140,13 @@ void PerfQuery::WeakFlush() if (hr == S_OK) { // NOTE: Reported pixel metrics should be referenced to native resolution - m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() * - EFB_HEIGHT / g_renderer->GetTargetHeight()); + const u64 native_res_result = result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / + g_renderer->GetTargetHeight(); + m_results[entry.query_type].store(static_cast<u32>(native_res_result), + std::memory_order_relaxed); m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size(); - --m_query_count; + m_query_count.fetch_sub(1, std::memory_order_relaxed); } else { @@ -140,7 +157,7 @@ void PerfQuery::WeakFlush() bool PerfQuery::IsFlushed() const { - return 0 == m_query_count; + return m_query_count.load(std::memory_order_relaxed) == 0; } } // namespace DX11 diff --git a/Source/Core/VideoBackends/D3D12/D3D12PerfQuery.cpp b/Source/Core/VideoBackends/D3D12/D3D12PerfQuery.cpp index d6b6b6a193..69c78bd850 100644 --- a/Source/Core/VideoBackends/D3D12/D3D12PerfQuery.cpp +++ b/Source/Core/VideoBackends/D3D12/D3D12PerfQuery.cpp @@ -52,10 +52,11 @@ void PerfQuery::EnableQuery(PerfQueryGroup type) { // Block if there are no free slots. // Otherwise, try to keep half of them available. 
- if (m_query_count > m_query_buffer.size() / 2) + const u32 query_count = m_query_count.load(std::memory_order_relaxed); + if (query_count > m_query_buffer.size() / 2) { const bool do_resolve = m_unresolved_queries > m_query_buffer.size() / 2; - const bool blocking = m_query_count == PERF_QUERY_BUFFER_SIZE; + const bool blocking = query_count == PERF_QUERY_BUFFER_SIZE; PartialFlush(do_resolve, blocking); } @@ -83,19 +84,20 @@ void PerfQuery::DisableQuery(PerfQueryGroup type) g_dx_context->GetCommandList()->EndQuery(m_query_heap.Get(), D3D12_QUERY_TYPE_OCCLUSION, m_query_next_pos); m_query_next_pos = (m_query_next_pos + 1) % PERF_QUERY_BUFFER_SIZE; - m_query_count++; + m_query_count.fetch_add(1, std::memory_order_relaxed); m_unresolved_queries++; } } void PerfQuery::ResetQuery() { - m_query_count = 0; + m_query_count.store(0, std::memory_order_relaxed); m_unresolved_queries = 0; m_query_resolve_pos = 0; m_query_readback_pos = 0; m_query_next_pos = 0; - std::fill(std::begin(m_results), std::end(m_results), 0); + for (size_t i = 0; i < m_results.size(); ++i) + m_results[i].store(0, std::memory_order_relaxed); for (auto& entry : m_query_buffer) { entry.fence_value = 0; @@ -108,13 +110,22 @@ u32 PerfQuery::GetQueryResult(PerfQueryType type) { u32 result = 0; if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC) - result = m_results[PQG_ZCOMP_ZCOMPLOC]; + { + result = m_results[PQG_ZCOMP_ZCOMPLOC].load(std::memory_order_relaxed); + } else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT) - result = m_results[PQG_ZCOMP]; + { + result = m_results[PQG_ZCOMP].load(std::memory_order_relaxed); + } else if (type == PQ_BLEND_INPUT) - result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC]; + { + result = m_results[PQG_ZCOMP].load(std::memory_order_relaxed) + + m_results[PQG_ZCOMP_ZCOMPLOC].load(std::memory_order_relaxed); + } else if (type == PQ_EFB_COPY_CLOCKS) - result = m_results[PQG_EFB_COPY_CLOCKS]; + { + result = m_results[PQG_EFB_COPY_CLOCKS].load(std::memory_order_relaxed); + } return result / 4; } @@ -127,7 +138,7 @@ void PerfQuery::FlushResults() bool PerfQuery::IsFlushed() const { - return m_query_count == 0; + return m_query_count.load(std::memory_order_relaxed) == 0; } void PerfQuery::ResolveQueries() @@ -165,7 +176,7 @@ void PerfQuery::ReadbackQueries(bool blocking) u64 completed_fence_counter = g_dx_context->GetCompletedFenceValue(); // Need to save these since ProcessResults will modify them. - const u32 outstanding_queries = m_query_count; + const u32 outstanding_queries = m_query_count.load(std::memory_order_relaxed); u32 readback_count = 0; for (u32 i = 0; i < outstanding_queries; i++) { @@ -203,7 +214,7 @@ void PerfQuery::ReadbackQueries(bool blocking) void PerfQuery::AccumulateQueriesFromBuffer(u32 query_count) { // Should be at maximum query_count queries pending. 
- ASSERT(query_count <= m_query_count && + ASSERT(query_count <= m_query_count.load(std::memory_order_relaxed) && (m_query_readback_pos + query_count) <= PERF_QUERY_BUFFER_SIZE); const D3D12_RANGE read_range = {m_query_readback_pos * sizeof(PerfQueryDataType), @@ -231,16 +242,18 @@ void PerfQuery::AccumulateQueriesFromBuffer(u32 query_count) std::memcpy(&result, mapped_ptr + (index * sizeof(PerfQueryDataType)), sizeof(result)); // NOTE: Reported pixel metrics should be referenced to native resolution - m_results[entry.query_type] += - static_cast<u32>(static_cast<u64>(result) * EFB_WIDTH / g_renderer->GetTargetWidth() * - EFB_HEIGHT / g_renderer->GetTargetHeight()); + const u64 native_res_result = static_cast<u64>(result) * EFB_WIDTH / + g_renderer->GetTargetWidth() * EFB_HEIGHT / + g_renderer->GetTargetHeight(); + m_results[entry.query_type].fetch_add(static_cast<u32>(native_res_result), + std::memory_order_relaxed); } constexpr D3D12_RANGE write_range = {0, 0}; m_query_readback_buffer->Unmap(0, &write_range); m_query_readback_pos = (m_query_readback_pos + query_count) % PERF_QUERY_BUFFER_SIZE; - m_query_count -= query_count; + m_query_count.fetch_sub(query_count, std::memory_order_relaxed); } void PerfQuery::PartialFlush(bool resolve, bool blocking) diff --git a/Source/Core/VideoBackends/OGL/OGLPerfQuery.cpp b/Source/Core/VideoBackends/OGL/OGLPerfQuery.cpp index e708f93e10..2718dd49af 100644 --- a/Source/Core/VideoBackends/OGL/OGLPerfQuery.cpp +++ b/Source/Core/VideoBackends/OGL/OGLPerfQuery.cpp @@ -43,7 +43,7 @@ void PerfQuery::DisableQuery(PerfQueryGroup type) bool PerfQuery::IsFlushed() const { - return 0 == m_query_count; + return m_query_count.load(std::memory_order_relaxed) == 0; } // TODO: could selectively flush things, but I don't think that will do much @@ -54,8 +54,9 @@ void PerfQuery::FlushResults() void PerfQuery::ResetQuery() { - m_query_count = 0; - std::fill(std::begin(m_results), std::end(m_results), 0); + m_query_count.store(0, std::memory_order_relaxed); + for (size_t i = 0; i < m_results.size(); ++i) + m_results[i].store(0, std::memory_order_relaxed); } u32 PerfQuery::GetQueryResult(PerfQueryType type) @@ -64,19 +65,20 @@ u32 PerfQuery::GetQueryResult(PerfQueryType type) if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC) { - result = m_results[PQG_ZCOMP_ZCOMPLOC]; + result = m_results[PQG_ZCOMP_ZCOMPLOC].load(std::memory_order_relaxed); } else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT) { - result = m_results[PQG_ZCOMP]; + result = m_results[PQG_ZCOMP].load(std::memory_order_relaxed); } else if (type == PQ_BLEND_INPUT) { - result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC]; + result = m_results[PQG_ZCOMP].load(std::memory_order_relaxed) + + m_results[PQG_ZCOMP_ZCOMPLOC].load(std::memory_order_relaxed); } else if (type == PQ_EFB_COPY_CLOCKS) { - result = m_results[PQG_EFB_COPY_CLOCKS]; + result = m_results[PQG_EFB_COPY_CLOCKS].load(std::memory_order_relaxed); } return result; @@ -97,11 +99,13 @@ PerfQueryGL::~PerfQueryGL() void PerfQueryGL::EnableQuery(PerfQueryGroup type) { + const u32 query_count = m_query_count.load(std::memory_order_relaxed); + // Is this sane? 
- if (m_query_count > m_query_buffer.size() / 2) + if (query_count > m_query_buffer.size() / 2) WeakFlush(); - if (m_query_buffer.size() == m_query_count) + if (m_query_buffer.size() == query_count) { FlushOne(); // ERROR_LOG_FMT(VIDEO, "Flushed query buffer early!"); @@ -110,12 +114,12 @@ void PerfQueryGL::EnableQuery(PerfQueryGroup type) // start query if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) { - auto& entry = m_query_buffer[(m_query_read_pos + m_query_count) % m_query_buffer.size()]; + auto& entry = m_query_buffer[(m_query_read_pos + query_count) % m_query_buffer.size()]; glBeginQuery(m_query_type, entry.query_id); entry.query_type = type; - ++m_query_count; + m_query_count.fetch_add(1, std::memory_order_relaxed); } } void PerfQueryGL::DisableQuery(PerfQueryGroup type) @@ -164,10 +168,10 @@ void PerfQueryGL::FlushOne() if (g_ActiveConfig.iMultisamples > 1) result /= g_ActiveConfig.iMultisamples; - m_results[entry.query_type] += result; + m_results[entry.query_type].fetch_add(result, std::memory_order_relaxed); m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size(); - --m_query_count; + m_query_count.fetch_sub(1, std::memory_order_relaxed); } // TODO: could selectively flush things, but I don't think that will do much @@ -191,11 +195,12 @@ PerfQueryGLESNV::~PerfQueryGLESNV() void PerfQueryGLESNV::EnableQuery(PerfQueryGroup type) { + const u32 query_count = m_query_count.load(std::memory_order_relaxed); // Is this sane? - if (m_query_count > m_query_buffer.size() / 2) + if (query_count > m_query_buffer.size() / 2) WeakFlush(); - if (m_query_buffer.size() == m_query_count) + if (m_query_buffer.size() == query_count) { FlushOne(); // ERROR_LOG_FMT(VIDEO, "Flushed query buffer early!"); @@ -204,12 +209,12 @@ void PerfQueryGLESNV::EnableQuery(PerfQueryGroup type) // start query if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) { - auto& entry = m_query_buffer[(m_query_read_pos + m_query_count) % m_query_buffer.size()]; + auto& entry = m_query_buffer[(m_query_read_pos + query_count) % m_query_buffer.size()]; glBeginOcclusionQueryNV(entry.query_id); entry.query_type = type; - ++m_query_count; + m_query_count.fetch_add(1, std::memory_order_relaxed); } } void PerfQueryGLESNV::DisableQuery(PerfQueryGroup type) @@ -251,11 +256,13 @@ void PerfQueryGLESNV::FlushOne() // NOTE: Reported pixel metrics should be referenced to native resolution // TODO: Dropping the lower 2 bits from this count should be closer to actual // hardware behavior when drawing triangles. - m_results[entry.query_type] += static_cast<u64>(result) * EFB_WIDTH * EFB_HEIGHT / - (g_renderer->GetTargetWidth() * g_renderer->GetTargetHeight()); + const u64 native_res_result = static_cast<u64>(result) * EFB_WIDTH * EFB_HEIGHT / + (g_renderer->GetTargetWidth() * g_renderer->GetTargetHeight()); + m_results[entry.query_type].fetch_add(static_cast<u32>(native_res_result), + std::memory_order_relaxed); m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size(); - --m_query_count; + m_query_count.fetch_sub(1, std::memory_order_relaxed); } // TODO: could selectively flush things, but I don't think that will do much diff --git a/Source/Core/VideoBackends/Vulkan/VKPerfQuery.cpp b/Source/Core/VideoBackends/Vulkan/VKPerfQuery.cpp index c8fcd05842..d4f442c54d 100644 --- a/Source/Core/VideoBackends/Vulkan/VKPerfQuery.cpp +++ b/Source/Core/VideoBackends/Vulkan/VKPerfQuery.cpp @@ -43,8 +43,9 @@ void PerfQuery::EnableQuery(PerfQueryGroup type) { // Block if there are no free slots. 
// Otherwise, try to keep half of them available. - if (m_query_count > m_query_buffer.size() / 2) - PartialFlush(m_query_count == PERF_QUERY_BUFFER_SIZE); + const u32 query_count = m_query_count.load(std::memory_order_relaxed); + if (query_count > m_query_buffer.size() / 2) + PartialFlush(query_count == PERF_QUERY_BUFFER_SIZE); // Ensure command buffer is ready to go before beginning the query, that way we don't submit // a buffer with open queries. @@ -73,16 +74,17 @@ void PerfQuery::DisableQuery(PerfQueryGroup type) { vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos); m_query_next_pos = (m_query_next_pos + 1) % PERF_QUERY_BUFFER_SIZE; - m_query_count++; + m_query_count.fetch_add(1, std::memory_order_relaxed); } } void PerfQuery::ResetQuery() { - m_query_count = 0; + m_query_count.store(0, std::memory_order_relaxed); m_query_readback_pos = 0; m_query_next_pos = 0; - std::fill(std::begin(m_results), std::end(m_results), 0); + for (size_t i = 0; i < m_results.size(); ++i) + m_results[i].store(0, std::memory_order_relaxed); // Reset entire query pool, ensuring all queries are ready to write to. StateTracker::GetInstance()->EndRenderPass(); @@ -96,13 +98,22 @@ u32 PerfQuery::GetQueryResult(PerfQueryType type) { u32 result = 0; if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC) - result = m_results[PQG_ZCOMP_ZCOMPLOC]; + { + result = m_results[PQG_ZCOMP_ZCOMPLOC].load(std::memory_order_relaxed); + } else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT) - result = m_results[PQG_ZCOMP]; + { + result = m_results[PQG_ZCOMP].load(std::memory_order_relaxed); + } else if (type == PQ_BLEND_INPUT) - result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC]; + { + result = m_results[PQG_ZCOMP].load(std::memory_order_relaxed) + + m_results[PQG_ZCOMP_ZCOMPLOC].load(std::memory_order_relaxed); + } else if (type == PQ_EFB_COPY_CLOCKS) - result = m_results[PQG_EFB_COPY_CLOCKS]; + { + result = m_results[PQG_EFB_COPY_CLOCKS].load(std::memory_order_relaxed); + } return result / 4; } @@ -115,7 +126,7 @@ void PerfQuery::FlushResults() bool PerfQuery::IsFlushed() const { - return m_query_count == 0; + return m_query_count.load(std::memory_order_relaxed) == 0; } bool PerfQuery::CreateQueryPool() @@ -144,7 +155,7 @@ void PerfQuery::ReadbackQueries() const u64 completed_fence_counter = g_command_buffer_mgr->GetCompletedFenceCounter(); // Need to save these since ProcessResults will modify them. - const u32 outstanding_queries = m_query_count; + const u32 outstanding_queries = m_query_count.load(std::memory_order_relaxed); u32 readback_count = 0; for (u32 i = 0; i < outstanding_queries; i++) { @@ -171,7 +182,7 @@ void PerfQuery::ReadbackQueries() void PerfQuery::ReadbackQueries(u32 query_count) { // Should be at maximum query_count queries pending. - ASSERT(query_count <= m_query_count && + ASSERT(query_count <= m_query_count.load(std::memory_order_relaxed) && (m_query_readback_pos + query_count) <= PERF_QUERY_BUFFER_SIZE); // Read back from the GPU. 
@@ -194,13 +205,15 @@ void PerfQuery::ReadbackQueries(u32 query_count) entry.has_value = false; // NOTE: Reported pixel metrics should be referenced to native resolution - m_results[entry.query_type] += - static_cast<u32>(static_cast<u64>(m_query_result_buffer[i]) * EFB_WIDTH / - g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight()); + const u64 native_res_result = static_cast<u64>(m_query_result_buffer[i]) * EFB_WIDTH / + g_renderer->GetTargetWidth() * EFB_HEIGHT / + g_renderer->GetTargetHeight(); + m_results[entry.query_type].fetch_add(static_cast<u32>(native_res_result), + std::memory_order_relaxed); } m_query_readback_pos = (m_query_readback_pos + query_count) % PERF_QUERY_BUFFER_SIZE; - m_query_count -= query_count; + m_query_count.fetch_sub(query_count, std::memory_order_relaxed); } void PerfQuery::PartialFlush(bool blocking) diff --git a/Source/Core/VideoCommon/CommandProcessor.cpp b/Source/Core/VideoCommon/CommandProcessor.cpp index 14b865386e..4cdb9bd9fc 100644 --- a/Source/Core/VideoCommon/CommandProcessor.cpp +++ b/Source/Core/VideoCommon/CommandProcessor.cpp @@ -119,11 +119,11 @@ void Init() m_tokenReg = 0; memset(&fifo, 0, sizeof(fifo)); - fifo.bFF_Breakpoint = 0; - fifo.bFF_HiWatermark = 0; - fifo.bFF_HiWatermarkInt = 0; - fifo.bFF_LoWatermark = 0; - fifo.bFF_LoWatermarkInt = 0; + fifo.bFF_Breakpoint.store(0, std::memory_order_relaxed); + fifo.bFF_HiWatermark.store(0, std::memory_order_relaxed); + fifo.bFF_HiWatermarkInt.store(0, std::memory_order_relaxed); + fifo.bFF_LoWatermark.store(0, std::memory_order_relaxed); + fifo.bFF_LoWatermarkInt.store(0, std::memory_order_relaxed); s_interrupt_set.Clear(); s_interrupt_waiting.Clear(); @@ -368,7 +368,7 @@ void GatherPipeBursted() } // If the game is running close to overflowing, make the exception checking more frequent. 
- if (fifo.bFF_HiWatermark) + if (fifo.bFF_HiWatermark.load(std::memory_order_relaxed) != 0) CoreTiming::ForceExceptionCheck(0); fifo.CPReadWriteDistance.fetch_add(GATHER_PIPE_SIZE, std::memory_order_seq_cst); @@ -427,47 +427,53 @@ bool IsInterruptWaiting() void SetCPStatusFromGPU() { // breakpoint - if (fifo.bFF_BPEnable) + const bool breakpoint = fifo.bFF_Breakpoint.load(std::memory_order_relaxed); + if (fifo.bFF_BPEnable.load(std::memory_order_relaxed) != 0) { if (fifo.CPBreakpoint.load(std::memory_order_relaxed) == fifo.CPReadPointer.load(std::memory_order_relaxed)) { - if (!fifo.bFF_Breakpoint) + if (!breakpoint) { DEBUG_LOG_FMT(COMMANDPROCESSOR, "Hit breakpoint at {}", fifo.CPReadPointer.load(std::memory_order_relaxed)); - fifo.bFF_Breakpoint = true; + fifo.bFF_Breakpoint.store(1, std::memory_order_relaxed); } } else { - if (fifo.bFF_Breakpoint) + if (breakpoint) { DEBUG_LOG_FMT(COMMANDPROCESSOR, "Cleared breakpoint at {}", fifo.CPReadPointer.load(std::memory_order_relaxed)); + fifo.bFF_Breakpoint.store(0, std::memory_order_relaxed); } - fifo.bFF_Breakpoint = false; } } else { - if (fifo.bFF_Breakpoint) + if (breakpoint) { DEBUG_LOG_FMT(COMMANDPROCESSOR, "Cleared breakpoint at {}", fifo.CPReadPointer.load(std::memory_order_relaxed)); + fifo.bFF_Breakpoint = false; } - fifo.bFF_Breakpoint = false; } // overflow & underflow check - fifo.bFF_HiWatermark = - (fifo.CPReadWriteDistance.load(std::memory_order_relaxed) > fifo.CPHiWatermark); - fifo.bFF_LoWatermark = - (fifo.CPReadWriteDistance.load(std::memory_order_relaxed) < fifo.CPLoWatermark); + fifo.bFF_HiWatermark.store( + (fifo.CPReadWriteDistance.load(std::memory_order_relaxed) > fifo.CPHiWatermark), + std::memory_order_relaxed); + fifo.bFF_LoWatermark.store( + (fifo.CPReadWriteDistance.load(std::memory_order_relaxed) < fifo.CPLoWatermark), + std::memory_order_relaxed); - bool bpInt = fifo.bFF_Breakpoint && fifo.bFF_BPInt; - bool ovfInt = fifo.bFF_HiWatermark && fifo.bFF_HiWatermarkInt; - bool undfInt = fifo.bFF_LoWatermark && fifo.bFF_LoWatermarkInt; + bool bpInt = fifo.bFF_Breakpoint.load(std::memory_order_relaxed) && + fifo.bFF_BPInt.load(std::memory_order_relaxed); + bool ovfInt = fifo.bFF_HiWatermark.load(std::memory_order_relaxed) && + fifo.bFF_HiWatermarkInt.load(std::memory_order_relaxed); + bool undfInt = fifo.bFF_LoWatermark.load(std::memory_order_relaxed) && + fifo.bFF_LoWatermarkInt.load(std::memory_order_relaxed); bool interrupt = (bpInt || ovfInt || undfInt) && m_CPCtrlReg.GPReadEnable; @@ -493,14 +499,19 @@ void SetCPStatusFromGPU() void SetCPStatusFromCPU() { // overflow & underflow check - fifo.bFF_HiWatermark = - (fifo.CPReadWriteDistance.load(std::memory_order_relaxed) > fifo.CPHiWatermark); - fifo.bFF_LoWatermark = - (fifo.CPReadWriteDistance.load(std::memory_order_relaxed) < fifo.CPLoWatermark); + fifo.bFF_HiWatermark.store( + (fifo.CPReadWriteDistance.load(std::memory_order_relaxed) > fifo.CPHiWatermark), + std::memory_order_relaxed); + fifo.bFF_LoWatermark.store( + (fifo.CPReadWriteDistance.load(std::memory_order_relaxed) < fifo.CPLoWatermark), + std::memory_order_relaxed); - bool bpInt = fifo.bFF_Breakpoint && fifo.bFF_BPInt; - bool ovfInt = fifo.bFF_HiWatermark && fifo.bFF_HiWatermarkInt; - bool undfInt = fifo.bFF_LoWatermark && fifo.bFF_LoWatermarkInt; + bool bpInt = fifo.bFF_Breakpoint.load(std::memory_order_relaxed) && + fifo.bFF_BPInt.load(std::memory_order_relaxed); + bool ovfInt = fifo.bFF_HiWatermark.load(std::memory_order_relaxed) && + fifo.bFF_HiWatermarkInt.load(std::memory_order_relaxed); + bool 
undfInt = fifo.bFF_LoWatermark.load(std::memory_order_relaxed) && + fifo.bFF_LoWatermarkInt.load(std::memory_order_relaxed); bool interrupt = (bpInt || ovfInt || undfInt) && m_CPCtrlReg.GPReadEnable; @@ -526,14 +537,15 @@ void SetCPStatusFromCPU() void SetCpStatusRegister() { // Here always there is one fifo attached to the GPU - m_CPStatusReg.Breakpoint = fifo.bFF_Breakpoint; + m_CPStatusReg.Breakpoint = fifo.bFF_Breakpoint.load(std::memory_order_relaxed); m_CPStatusReg.ReadIdle = !fifo.CPReadWriteDistance.load(std::memory_order_relaxed) || (fifo.CPReadPointer.load(std::memory_order_relaxed) == fifo.CPWritePointer.load(std::memory_order_relaxed)); m_CPStatusReg.CommandIdle = !fifo.CPReadWriteDistance.load(std::memory_order_relaxed) || - Fifo::AtBreakpoint() || !fifo.bFF_GPReadEnable; - m_CPStatusReg.UnderflowLoWatermark = fifo.bFF_LoWatermark; - m_CPStatusReg.OverflowHiWatermark = fifo.bFF_HiWatermark; + Fifo::AtBreakpoint() || + !fifo.bFF_GPReadEnable.load(std::memory_order_relaxed); + m_CPStatusReg.UnderflowLoWatermark = fifo.bFF_LoWatermark.load(std::memory_order_relaxed); + m_CPStatusReg.OverflowHiWatermark = fifo.bFF_HiWatermark.load(std::memory_order_relaxed); DEBUG_LOG_FMT(COMMANDPROCESSOR, "\t Read from STATUS_REGISTER : {:04x}", m_CPStatusReg.Hex); DEBUG_LOG_FMT( @@ -545,15 +557,15 @@ void SetCpStatusRegister() void SetCpControlRegister() { - fifo.bFF_BPInt = m_CPCtrlReg.BPInt; - fifo.bFF_BPEnable = m_CPCtrlReg.BPEnable; - fifo.bFF_HiWatermarkInt = m_CPCtrlReg.FifoOverflowIntEnable; - fifo.bFF_LoWatermarkInt = m_CPCtrlReg.FifoUnderflowIntEnable; - fifo.bFF_GPLinkEnable = m_CPCtrlReg.GPLinkEnable; + fifo.bFF_BPInt.store(m_CPCtrlReg.BPInt, std::memory_order_relaxed); + fifo.bFF_BPEnable.store(m_CPCtrlReg.BPEnable, std::memory_order_relaxed); + fifo.bFF_HiWatermarkInt.store(m_CPCtrlReg.FifoOverflowIntEnable, std::memory_order_relaxed); + fifo.bFF_LoWatermarkInt.store(m_CPCtrlReg.FifoUnderflowIntEnable, std::memory_order_relaxed); + fifo.bFF_GPLinkEnable.store(m_CPCtrlReg.GPLinkEnable, std::memory_order_relaxed); - if (fifo.bFF_GPReadEnable && !m_CPCtrlReg.GPReadEnable) + if (fifo.bFF_GPReadEnable.load(std::memory_order_relaxed) && !m_CPCtrlReg.GPReadEnable) { - fifo.bFF_GPReadEnable = m_CPCtrlReg.GPReadEnable; + fifo.bFF_GPReadEnable.store(m_CPCtrlReg.GPReadEnable, std::memory_order_relaxed); Fifo::FlushGpu(); } else @@ -562,8 +574,10 @@ void SetCpControlRegister() } DEBUG_LOG_FMT(COMMANDPROCESSOR, "\t GPREAD {} | BP {} | Int {} | OvF {} | UndF {} | LINK {}", - fifo.bFF_GPReadEnable ? "ON" : "OFF", fifo.bFF_BPEnable ? "ON" : "OFF", - fifo.bFF_BPInt ? "ON" : "OFF", m_CPCtrlReg.FifoOverflowIntEnable ? "ON" : "OFF", + fifo.bFF_GPReadEnable.load(std::memory_order_relaxed) ? "ON" : "OFF", + fifo.bFF_BPEnable.load(std::memory_order_relaxed) ? "ON" : "OFF", + fifo.bFF_BPInt.load(std::memory_order_relaxed) ? "ON" : "OFF", + m_CPCtrlReg.FifoOverflowIntEnable ? "ON" : "OFF", m_CPCtrlReg.FifoUnderflowIntEnable ? "ON" : "OFF", m_CPCtrlReg.GPLinkEnable ? "ON" : "OFF"); } @@ -588,32 +602,35 @@ void HandleUnknownOpcode(u8 cmd_byte, void* buffer, bool preprocess) cmd_byte, buffer, preprocess ? 
"preprocess=true" : "preprocess=false"); { - PanicAlertFmt( - "Illegal command {:02x}\n" - "CPBase: {:#010x}\n" - "CPEnd: {:#010x}\n" - "CPHiWatermark: {:#010x}\n" - "CPLoWatermark: {:#010x}\n" - "CPReadWriteDistance: {:#010x}\n" - "CPWritePointer: {:#010x}\n" - "CPReadPointer: {:#010x}\n" - "CPBreakpoint: {:#010x}\n" - "bFF_GPReadEnable: {}\n" - "bFF_BPEnable: {}\n" - "bFF_BPInt: {}\n" - "bFF_Breakpoint: {}\n" - "bFF_GPLinkEnable: {}\n" - "bFF_HiWatermarkInt: {}\n" - "bFF_LoWatermarkInt: {}\n", - cmd_byte, fifo.CPBase.load(std::memory_order_relaxed), - fifo.CPEnd.load(std::memory_order_relaxed), fifo.CPHiWatermark, fifo.CPLoWatermark, - fifo.CPReadWriteDistance.load(std::memory_order_relaxed), - fifo.CPWritePointer.load(std::memory_order_relaxed), - fifo.CPReadPointer.load(std::memory_order_relaxed), - fifo.CPBreakpoint.load(std::memory_order_relaxed), fifo.bFF_GPReadEnable ? "true" : "false", - fifo.bFF_BPEnable ? "true" : "false", fifo.bFF_BPInt ? "true" : "false", - fifo.bFF_Breakpoint ? "true" : "false", fifo.bFF_GPLinkEnable ? "true" : "false", - fifo.bFF_HiWatermarkInt ? "true" : "false", fifo.bFF_LoWatermarkInt ? "true" : "false"); + PanicAlertFmt("Illegal command {:02x}\n" + "CPBase: {:#010x}\n" + "CPEnd: {:#010x}\n" + "CPHiWatermark: {:#010x}\n" + "CPLoWatermark: {:#010x}\n" + "CPReadWriteDistance: {:#010x}\n" + "CPWritePointer: {:#010x}\n" + "CPReadPointer: {:#010x}\n" + "CPBreakpoint: {:#010x}\n" + "bFF_GPReadEnable: {}\n" + "bFF_BPEnable: {}\n" + "bFF_BPInt: {}\n" + "bFF_Breakpoint: {}\n" + "bFF_GPLinkEnable: {}\n" + "bFF_HiWatermarkInt: {}\n" + "bFF_LoWatermarkInt: {}\n", + cmd_byte, fifo.CPBase.load(std::memory_order_relaxed), + fifo.CPEnd.load(std::memory_order_relaxed), fifo.CPHiWatermark, + fifo.CPLoWatermark, fifo.CPReadWriteDistance.load(std::memory_order_relaxed), + fifo.CPWritePointer.load(std::memory_order_relaxed), + fifo.CPReadPointer.load(std::memory_order_relaxed), + fifo.CPBreakpoint.load(std::memory_order_relaxed), + fifo.bFF_GPReadEnable.load(std::memory_order_relaxed) ? "true" : "false", + fifo.bFF_BPEnable.load(std::memory_order_relaxed) ? "true" : "false", + fifo.bFF_BPInt.load(std::memory_order_relaxed) ? "true" : "false", + fifo.bFF_Breakpoint.load(std::memory_order_relaxed) ? "true" : "false", + fifo.bFF_GPLinkEnable.load(std::memory_order_relaxed) ? "true" : "false", + fifo.bFF_HiWatermarkInt.load(std::memory_order_relaxed) ? "true" : "false", + fifo.bFF_LoWatermarkInt.load(std::memory_order_relaxed) ? 
"true" : "false"); } } diff --git a/Source/Core/VideoCommon/CommandProcessor.h b/Source/Core/VideoCommon/CommandProcessor.h index bc0a8ede49..20f2bc243c 100644 --- a/Source/Core/VideoCommon/CommandProcessor.h +++ b/Source/Core/VideoCommon/CommandProcessor.h @@ -29,17 +29,17 @@ struct SCPFifoStruct std::atomic<u32> CPBreakpoint; std::atomic<u32> SafeCPReadPointer; - volatile u32 bFF_GPLinkEnable; - volatile u32 bFF_GPReadEnable; - volatile u32 bFF_BPEnable; - volatile u32 bFF_BPInt; - volatile u32 bFF_Breakpoint; + std::atomic<u32> bFF_GPLinkEnable; + std::atomic<u32> bFF_GPReadEnable; + std::atomic<u32> bFF_BPEnable; + std::atomic<u32> bFF_BPInt; + std::atomic<u32> bFF_Breakpoint; - volatile u32 bFF_LoWatermarkInt; - volatile u32 bFF_HiWatermarkInt; + std::atomic<u32> bFF_LoWatermarkInt; + std::atomic<u32> bFF_HiWatermarkInt; - volatile u32 bFF_LoWatermark; - volatile u32 bFF_HiWatermark; + std::atomic<u32> bFF_LoWatermark; + std::atomic<u32> bFF_HiWatermark; void DoState(PointerWrap& p); }; diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp index f12776449c..cfc126cfc1 100644 --- a/Source/Core/VideoCommon/Fifo.cpp +++ b/Source/Core/VideoCommon/Fifo.cpp @@ -139,7 +139,7 @@ void Shutdown() void ExitGpuLoop() { // This should break the wait loop in CPU thread - CommandProcessor::fifo.bFF_GPReadEnable = false; + CommandProcessor::fifo.bFF_GPReadEnable.store(0, std::memory_order_relaxed); FlushGpu(); // Terminate GPU thread loop @@ -327,7 +327,8 @@ void RunGpuLoop() CommandProcessor::SetCPStatusFromGPU(); // check if we are able to run this buffer - while (!CommandProcessor::IsInterruptWaiting() && fifo.bFF_GPReadEnable && + while (!CommandProcessor::IsInterruptWaiting() && + fifo.bFF_GPReadEnable.load(std::memory_order_relaxed) && fifo.CPReadWriteDistance.load(std::memory_order_relaxed) && !AtBreakpoint()) { if (param.bSyncGPU && s_sync_ticks.load() < param.iSyncGpuMinDistance) @@ -415,8 +416,9 @@ void GpuMaySleep() bool AtBreakpoint() { CommandProcessor::SCPFifoStruct& fifo = CommandProcessor::fifo; - return fifo.bFF_BPEnable && (fifo.CPReadPointer.load(std::memory_order_relaxed) == - fifo.CPBreakpoint.load(std::memory_order_relaxed)); + return fifo.bFF_BPEnable.load(std::memory_order_relaxed) && + (fifo.CPReadPointer.load(std::memory_order_relaxed) == + fifo.CPBreakpoint.load(std::memory_order_relaxed)); } void RunGpu() @@ -446,8 +448,9 @@ static int RunGpuOnCpu(int ticks) CommandProcessor::SCPFifoStruct& fifo = CommandProcessor::fifo; bool reset_simd_state = false; int available_ticks = int(ticks * SConfig::GetInstance().fSyncGpuOverclock) + s_sync_ticks.load(); - while (fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance.load(std::memory_order_relaxed) && - !AtBreakpoint() && available_ticks >= 0) + while (fifo.bFF_GPReadEnable.load(std::memory_order_relaxed) && + fifo.CPReadWriteDistance.load(std::memory_order_relaxed) && !AtBreakpoint() && + available_ticks >= 0) { if (s_use_deterministic_gpu_thread) { diff --git a/Source/Core/VideoCommon/PerfQueryBase.h b/Source/Core/VideoCommon/PerfQueryBase.h index de99b215c9..9e29e238a6 100644 --- a/Source/Core/VideoCommon/PerfQueryBase.h +++ b/Source/Core/VideoCommon/PerfQueryBase.h @@ -4,7 +4,10 @@ #pragma once +#include <array> +#include <atomic> #include <memory> + #include "Common/CommonTypes.h" enum PerfQueryType @@ -61,9 +64,8 @@ public: virtual bool IsFlushed() const { return true; } protected: - // TODO: sloppy - volatile u32 m_query_count; - volatile u32 m_results[PQG_NUM_MEMBERS]; + std::atomic<u32> 
m_query_count; + std::array<std::atomic<u32>, PQG_NUM_MEMBERS> m_results; }; extern std::unique_ptr<PerfQueryBase> g_perf_query;
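
The change applied throughout this patch is the same in every file: fields that were previously declared `volatile u32` and shared between the CPU and GPU threads (the FIFO breakpoint/watermark flags, the perf-query counters) become `std::atomic<u32>`, plain reads become `.load(std::memory_order_relaxed)`, plain writes become `.store(..., std::memory_order_relaxed)`, and increments/decrements become `fetch_add`/`fetch_sub`. `volatile` never provided atomicity or inter-thread ordering in C++; it only forced each access to touch memory. Relaxed ordering is used throughout, which is appropriate when the value itself carries no ordering obligations (statistics, polling heuristics) and any required happens-before relationships come from other synchronization. Dropping the last `volatile` users is also what lets ChunkFile.h delete its `IsTriviallyCopyable` workaround and use `std::is_trivially_copyable_v` directly.

Below is a minimal, self-contained sketch of the before/after pattern for a producer/consumer thread pair. The type and function names are illustrative placeholders, not identifiers from the Dolphin sources.

#include <atomic>
#include <cstdint>

struct FifoFlagsExample
{
  // Before: volatile std::uint32_t read_enable;
  // volatile gives no atomicity and no cross-thread ordering, and a volatile
  // member makes the enclosing type non-trivially-copyable for savestates.
  std::atomic<std::uint32_t> read_enable{0};
  std::atomic<std::uint32_t> pending_queries{0};
};

// CPU-thread side: enable reading and queue one query.
inline void ProducerSide(FifoFlagsExample& flags)
{
  flags.read_enable.store(1, std::memory_order_relaxed);
  flags.pending_queries.fetch_add(1, std::memory_order_relaxed);
}

// GPU-thread side: poll the flag and drain one query if any are pending.
inline bool ConsumerSide(FifoFlagsExample& flags)
{
  if (flags.read_enable.load(std::memory_order_relaxed) == 0)
    return false;
  if (flags.pending_queries.load(std::memory_order_relaxed) == 0)
    return false;
  flags.pending_queries.fetch_sub(1, std::memory_order_relaxed);
  return true;
}

A related detail when reading the ResetQuery() hunks: std::fill would still compile for an array of std::atomic<u32> (the assignment operator is seq_cst), but it cannot take an explicit memory order, which is presumably why those hunks switch to an element-by-element store(0, std::memory_order_relaxed) loop.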