PerformanceCounters: Align to 4 elements

And use minv()/maxv().
This commit is contained in:
Stenzek 2025-08-02 13:33:23 +10:00
parent b81dfa205c
commit 6c9d339855
No known key found for this signature in database
6 changed files with 81 additions and 32 deletions

View File

@ -279,7 +279,7 @@ public:
{
constexpr int bit1 = ((mask & 2) * 3) << 1;
constexpr int bit0 = (mask & 1) * 3;
return blend16<bit1 | bit0>(v);
return blend16 < bit1 | bit0 > (v);
}
ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
@ -2610,6 +2610,36 @@ public:
#endif
}
// Horizontal add: returns the sum of the four float lanes of v4s.
ALWAYS_INLINE float addv() const
{
#ifdef CPU_ARCH_ARM64
  return vaddvq_f32(v4s);
#else
  // Fold the high half onto the low half, then add the two remaining lanes together.
  const float32x2_t halves = vadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (x+z, y+w)
  const float32x2_t total = vadd_f32(halves, vdup_lane_f32(halves, 1));       // lane 0 = (x+z) + (y+w)
  return vget_lane_f32(total, 0);
#endif
}
// Horizontal minimum: returns the smallest of the four float lanes of v4s.
ALWAYS_INLINE float minv() const
{
#ifdef CPU_ARCH_ARM64
  return vminvq_f32(v4s);
#else
  // Fold the high half onto the low half, then compare the two remaining lanes.
  float32x2_t tmp = vmin_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (min(x,z), min(y,w))
  return vget_lane_f32(vmin_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
#endif
}
// Horizontal maximum: returns the largest of the four float lanes of v4s.
ALWAYS_INLINE float maxv() const
{
#ifdef CPU_ARCH_ARM64
  return vmaxvq_f32(v4s);
#else
  // Fold the high half onto the low half, then compare the two remaining lanes.
  float32x2_t tmp = vmax_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (max(x,z), max(y,w))
  return vget_lane_f32(vmax_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
#endif
}
ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const

View File

@ -1859,6 +1859,10 @@ public:
// Four-component dot product of this vector with v, accumulated in x, y, z, w order.
ALWAYS_INLINE float dot(const GSVector4& v) const { return (x * v.x) + (y * v.y) + (z * v.z) + (w * v.w); }
// Horizontal add: sums the four components in x, y, z, w order.
ALWAYS_INLINE float addv() const
{
  float sum = x + y;
  sum = sum + z;
  sum = sum + w;
  return sum;
}
// Horizontal minimum: returns the smallest of x, y, z and w.
ALWAYS_INLINE float minv() const
{
  // Same right-to-left nesting as a single chained std::min expression.
  const float zw = std::min(z, w);
  const float yzw = std::min(y, zw);
  return std::min(x, yzw);
}
// Horizontal maximum: returns the largest of x, y, z and w.
ALWAYS_INLINE float maxv() const
{
  // Same right-to-left nesting as a single chained std::max expression.
  const float zw = std::max(z, w);
  const float yzw = std::max(y, zw);
  return std::max(x, yzw);
}
GSVector4 sat(const GSVector4& min, const GSVector4& max) const
{
return GSVector4(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y), std::clamp(z, min.z, max.z),

View File

@ -2039,6 +2039,11 @@ public:
#ifdef CPU_ARCH_SSE41
// Four-component dot product with v. Mask 0xf1: multiply all four lanes, write the sum to lane 0.
ALWAYS_INLINE float dot(const GSVector4& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0xf1)); }
// Horizontal add: sums all four lanes with two rounds of pairwise addition.
ALWAYS_INLINE float addv() const
{
  __m128 sum = _mm_hadd_ps(m, m); // (x+y, z+w, x+y, z+w)
  sum = _mm_hadd_ps(sum, sum);    // lane 0 = (x+y) + (z+w)
  return _mm_cvtss_f32(sum);
}
#else
float dot(const GSVector4& v) const
{
@ -2047,8 +2052,28 @@ public:
tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
return _mm_cvtss_f32(tmp);
}
float addv() const
{
__m128 tmp = _mm_add_ps(m, _mm_movehl_ps(m, m)); // (x+z, y+w, ..., ...)
tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
return _mm_cvtss_f32(tmp);
}
#endif
// Horizontal minimum: returns the smallest of the four float lanes of m.
ALWAYS_INLINE float minv() const
{
  const __m128 upper = _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2));         // (z, w, z, w)
  const __m128 pairs = _mm_min_ps(m, upper);                                   // (min(x,z), min(y,w), ..., ...)
  const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1)); // broadcast min(y,w)
  return _mm_cvtss_f32(_mm_min_ps(pairs, lane1));
}
// Horizontal maximum: returns the largest of the four float lanes of m.
ALWAYS_INLINE float maxv() const
{
  const __m128 upper = _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2));         // (z, w, z, w)
  const __m128 pairs = _mm_max_ps(m, upper);                                   // (max(x,z), max(y,w), ..., ...)
  const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1)); // broadcast max(y,w)
  return _mm_cvtss_f32(_mm_max_ps(pairs, lane1));
}
ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const
{
return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max));

View File

@ -129,32 +129,6 @@ static InputOverlayState s_input_overlay_state = {};
} // namespace ImGuiManager
// Returns {minimum, maximum} of the floats in `values`.
//
// Full groups of four floats are reduced through GSVector4; any leftover elements are
// folded in one at a time. Spans with fewer than four elements are handled entirely in
// scalar code, because the initial vector load would otherwise read past the end of the
// span. An empty span yields {0.0f, 0.0f}.
static std::tuple<float, float> GetMinMax(std::span<const float> values)
{
  const u32 count = static_cast<u32>(values.size());

  // Too short for even one vector load: reduce whatever is there with scalar compares.
  if (count < 4)
  {
    if (count == 0)
      return std::make_tuple(0.0f, 0.0f);

    float min = values[0];
    float max = values[0];
    for (u32 i = 1; i < count; i++)
    {
      min = std::min(min, values[i]);
      max = std::max(max, values[i]);
    }
    return std::make_tuple(min, max);
  }

  // Seed both accumulators with the first four values.
  GSVector4 vmin(GSVector4::load<false>(values.data()));
  GSVector4 vmax(vmin);

  // Process the remaining full groups of four.
  const u32 aligned_count = Common::AlignDownPow2(count, 4);
  u32 i = 4;
  for (; i < aligned_count; i += 4)
  {
    const GSVector4 v(GSVector4::load<false>(&values[i]));
    vmin = vmin.min(v);
    vmax = vmax.max(v);
  }

  // Collapse the four lanes of each accumulator to a single value.
  float min = std::min(vmin.x, std::min(vmin.y, std::min(vmin.z, vmin.w)));
  float max = std::max(vmax.x, std::max(vmax.y, std::max(vmax.z, vmax.w)));

  // Fold in the 0-3 trailing elements that did not fill a whole vector.
  for (; i < count; i++)
  {
    min = std::min(min, values[i]);
    max = std::max(max, values[i]);
  }

  return std::make_tuple(min, max);
}
bool ImGuiManager::AreAnyDebugWindowsEnabled(const SettingsInterface& si)
{
#ifndef __ANDROID__
@ -731,7 +705,23 @@ void ImGuiManager::DrawFrameTimeOverlay(float& position_y, float scale, float ma
{
ImGui::PushFont(fixed_font, fixed_font_size, fixed_font_weight);
auto [min, max] = GetMinMax(PerformanceCounters::GetFrameTimeHistory());
// Find the smallest and largest frame time in the history, four samples per iteration.
// LLVM likes to unroll this... whatever.
float min, max;
{
  const PerformanceCounters::FrameTimeHistory& history = PerformanceCounters::GetFrameTimeHistory();

  // The loop below has no scalar tail, so the sample count must be a multiple of four.
  static_assert((PerformanceCounters::NUM_FRAME_TIME_SAMPLES % 4) == 0);

  // Seed both accumulators with the first four samples, then fold in the rest.
  GSVector4 vmin = GSVector4::load<false>(history.data());
  GSVector4 vmax = vmin;
  for (size_t i = 4; i < history.size(); i += 4)
  {
    const GSVector4 v = GSVector4::load<false>(&history[i]);
    vmin = vmin.min(v);
    vmax = vmax.max(v);
  }

  // Collapse each accumulator's four lanes to a single value.
  min = vmin.minv();
  max = vmax.maxv();
}
// add a little bit of space either side, so we're not constantly resizing
if ((max - min) < 4.0f)

View File

@ -22,7 +22,7 @@ namespace PerformanceCounters {
namespace {
struct State
struct ALIGN_TO_CACHE_LINE State
{
Timer::Value last_update_time;
Timer::Value last_frame_time;
@ -55,7 +55,7 @@ struct State
float accumulated_gpu_time;
float gpu_usage;
FrameTimeHistory frame_time_history;
alignas(VECTOR_ALIGNMENT) FrameTimeHistory frame_time_history;
u32 frame_time_history_pos;
};
@ -63,7 +63,7 @@ struct State
static constexpr const float PERFORMANCE_COUNTER_UPDATE_INTERVAL = 1.0f;
ALIGN_TO_CACHE_LINE State s_state = {};
State s_state = {};
} // namespace PerformanceCounters

View File

@ -9,7 +9,7 @@ class GPUBackend;
namespace PerformanceCounters
{
inline constexpr u32 NUM_FRAME_TIME_SAMPLES = 150;
inline constexpr u32 NUM_FRAME_TIME_SAMPLES = 152;
using FrameTimeHistory = std::array<float, NUM_FRAME_TIME_SAMPLES>;
float GetFPS();