diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h
index 46354eb52..92712e240 100644
--- a/src/common/gsvector_neon.h
+++ b/src/common/gsvector_neon.h
@@ -279,7 +279,7 @@ public:
   {
     constexpr int bit1 = ((mask & 2) * 3) << 1;
     constexpr int bit0 = (mask & 1) * 3;
-    return blend16(v);
+    return blend16 < bit1 | bit0 > (v);
   }
 
   ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
@@ -2610,6 +2610,39 @@ public:
 #endif
   }
 
+  /// Returns the sum of all four lanes.
+  ALWAYS_INLINE float addv() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return vaddvq_f32(v4s);
+#else
+    float32x2_t tmp = vadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (x+z, y+w)
+    return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
+#endif
+  }
+
+  /// Returns the smallest of the four lanes.
+  ALWAYS_INLINE float minv() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return vminvq_f32(v4s);
+#else
+    float32x2_t tmp = vmin_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (min(x, z), min(y, w))
+    return vget_lane_f32(vmin_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
+#endif
+  }
+
+  /// Returns the largest of the four lanes.
+  ALWAYS_INLINE float maxv() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return vmaxvq_f32(v4s);
+#else
+    float32x2_t tmp = vmax_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (max(x, z), max(y, w))
+    return vget_lane_f32(vmax_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
+#endif
+  }
+
   ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
 
   ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h
index 2269ef79e..547a37ce9 100644
--- a/src/common/gsvector_nosimd.h
+++ b/src/common/gsvector_nosimd.h
@@ -1859,6 +1859,11 @@ public:
 
   ALWAYS_INLINE float dot(const GSVector4& v) const { return (x * v.x) + (y * v.y) + (z * v.z) + (w * v.w); }
 
+  /// Horizontal reductions over all four lanes: sum, minimum and maximum.
+  ALWAYS_INLINE float addv() const { return (x + y + z + w); }
+  ALWAYS_INLINE float minv() const { return std::min(x, std::min(y, std::min(z, w))); }
+  ALWAYS_INLINE float maxv() const { return std::max(x, std::max(y, std::max(z, w))); }
+
   GSVector4 sat(const GSVector4& min, const GSVector4& max) const
   {
     return GSVector4(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y), std::clamp(z, min.z, max.z),
diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h
index 7e9fb6761..4c93b72fb 100644
--- a/src/common/gsvector_sse.h
+++ b/src/common/gsvector_sse.h
@@ -2039,6 +2039,12 @@ public:
 
 #ifdef CPU_ARCH_SSE41
   ALWAYS_INLINE float dot(const GSVector4& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0xf1)); }
+  /// Returns the sum of all four lanes.
+  ALWAYS_INLINE float addv() const
+  {
+    const __m128 pairs = _mm_hadd_ps(m, m);
+    return _mm_cvtss_f32(_mm_hadd_ps(pairs, pairs));
+  }
 #else
   float dot(const GSVector4& v) const
   {
@@ -2047,8 +2053,31 @@ public:
     tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
     return _mm_cvtss_f32(tmp);
   }
+  /// Returns the sum of all four lanes.
+  float addv() const
+  {
+    __m128 tmp = _mm_add_ps(m, _mm_movehl_ps(m, m)); // (x+z, y+w, ..., ...)
+    tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
+    return _mm_cvtss_f32(tmp);
+  }
 #endif
 
+  /// Returns the smallest of the four lanes.
+  ALWAYS_INLINE float minv() const
+  {
+    __m128 v = _mm_min_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2)));
+    v = _mm_min_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
+    return _mm_cvtss_f32(v);
+  }
+
+  /// Returns the largest of the four lanes.
+  ALWAYS_INLINE float maxv() const
+  {
+    __m128 v = _mm_max_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2)));
+    v = _mm_max_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));
+    return _mm_cvtss_f32(v);
+  }
+
   ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const
   {
     return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max));
diff --git a/src/core/imgui_overlays.cpp b/src/core/imgui_overlays.cpp
index b73471a62..8c11e9da2 100644
--- a/src/core/imgui_overlays.cpp
+++ b/src/core/imgui_overlays.cpp
@@ -129,32 +129,6 @@ static InputOverlayState s_input_overlay_state = {};
 
 } // namespace ImGuiManager
 
-static std::tuple GetMinMax(std::span values)
-{
-  GSVector4 vmin(GSVector4::load(values.data()));
-  GSVector4 vmax(vmin);
-
-  const u32 count = static_cast(values.size());
-  const u32 aligned_count = Common::AlignDownPow2(count, 4);
-  u32 i = 4;
-  for (; i < aligned_count; i += 4)
-  {
-    const GSVector4 v(GSVector4::load(&values[i]));
-    vmin = vmin.min(v);
-    vmax = vmax.max(v);
-  }
-
-  float min = std::min(vmin.x, std::min(vmin.y, std::min(vmin.z, vmin.w)));
-  float max = std::max(vmax.x, std::max(vmax.y, std::max(vmax.z, vmax.w)));
-  for (; i < count; i++)
-  {
-    min = std::min(min, values[i]);
-    max = std::max(max, values[i]);
-  }
-
-  return std::tie(min, max);
-}
-
 bool ImGuiManager::AreAnyDebugWindowsEnabled(const SettingsInterface& si)
 {
 #ifndef __ANDROID__
@@ -731,7 +705,24 @@ void ImGuiManager::DrawFrameTimeOverlay(float& position_y, float scale, float ma
   {
     ImGui::PushFont(fixed_font, fixed_font_size, fixed_font_weight);
 
-    auto [min, max] = GetMinMax(PerformanceCounters::GetFrameTimeHistory());
+    // Find the min/max frame time four samples at a time; the sample count is asserted to be a
+    // multiple of 4, so there is no scalar tail. LLVM likes to unroll this... whatever.
+    float min, max;
+    {
+      const PerformanceCounters::FrameTimeHistory& history = PerformanceCounters::GetFrameTimeHistory();
+      static_assert((PerformanceCounters::NUM_FRAME_TIME_SAMPLES % 4) == 0);
+      GSVector4 vmin = GSVector4::load(history.data());
+      GSVector4 vmax = vmin;
+      for (size_t i = 4; i < history.size(); i += 4)
+      {
+        const GSVector4 v = GSVector4::load(&history[i]);
+        vmin = vmin.min(v);
+        vmax = vmax.max(v);
+      }
+
+      min = vmin.minv();
+      max = vmax.maxv();
+    }
 
     // add a little bit of space either side, so we're not constantly resizing
     if ((max - min) < 4.0f)
diff --git a/src/core/performance_counters.cpp b/src/core/performance_counters.cpp
index aa6f5aad2..c3d79fba0 100644
--- a/src/core/performance_counters.cpp
+++ b/src/core/performance_counters.cpp
@@ -22,7 +22,7 @@ namespace PerformanceCounters {
 
 namespace {
 
-struct State
+struct ALIGN_TO_CACHE_LINE State
 {
   Timer::Value last_update_time;
   Timer::Value last_frame_time;
@@ -55,7 +55,7 @@ struct State
   float accumulated_gpu_time;
   float gpu_usage;
 
-  FrameTimeHistory frame_time_history;
+  alignas(VECTOR_ALIGNMENT) FrameTimeHistory frame_time_history;
   u32 frame_time_history_pos;
 };
 
@@ -63,7 +63,7 @@ struct State
 
 static constexpr const float PERFORMANCE_COUNTER_UPDATE_INTERVAL = 1.0f;
 
-ALIGN_TO_CACHE_LINE State s_state = {};
+State s_state = {};
 
 } // namespace PerformanceCounters
 
diff --git a/src/core/performance_counters.h b/src/core/performance_counters.h
index f122db0c2..29d63b27b 100644
--- a/src/core/performance_counters.h
+++ b/src/core/performance_counters.h
@@ -9,7 +9,8 @@ class GPUBackend;
 
 namespace PerformanceCounters {
 
-inline constexpr u32 NUM_FRAME_TIME_SAMPLES = 150;
+// Must stay a multiple of 4: the frame time overlay reads the history four samples at a time.
+inline constexpr u32 NUM_FRAME_TIME_SAMPLES = 152;
 using FrameTimeHistory = std::array;
 
 float GetFPS();