PerformanceCounters: Align to 4 elements
And use minv()/maxv().
This commit is contained in:
parent
b81dfa205c
commit
6c9d339855
|
@ -279,7 +279,7 @@ public:
|
|||
{
|
||||
constexpr int bit1 = ((mask & 2) * 3) << 1;
|
||||
constexpr int bit0 = (mask & 1) * 3;
|
||||
return blend16<bit1 | bit0>(v);
|
||||
return blend16 < bit1 | bit0 > (v);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
|
||||
|
@ -2610,6 +2610,36 @@ public:
|
|||
#endif
|
||||
}
|
||||
|
||||
// Horizontal add: returns the scalar sum of the four float lanes of v4s.
ALWAYS_INLINE float addv() const
{
#ifdef CPU_ARCH_ARM64
  // Single across-vector add intrinsic.
  return vaddvq_f32(v4s);
#else
  // Fold the high half onto the low half, then add the two surviving lanes together.
  const float32x2_t folded = vadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (x+z, y+w)
  const float32x2_t total = vadd_f32(folded, vdup_lane_f32(folded, 1));       // lane 0 = (x+z)+(y+w)
  return vget_lane_f32(total, 0);
#endif
}
|
||||
|
||||
// Horizontal minimum: returns the smallest of the four float lanes of v4s.
ALWAYS_INLINE float minv() const
{
#ifdef CPU_ARCH_ARM64
  // Single across-vector minimum intrinsic.
  return vminvq_f32(v4s);
#else
  // Fold the high half onto the low half, then reduce the two surviving lanes.
  const float32x2_t folded = vmin_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (min(x,z), min(y,w))
  const float32x2_t lowest = vmin_f32(folded, vdup_lane_f32(folded, 1));      // lane 0 = overall minimum
  return vget_lane_f32(lowest, 0);
#endif
}
|
||||
|
||||
// Horizontal maximum: returns the largest of the four float lanes of v4s.
ALWAYS_INLINE float maxv() const
{
#ifdef CPU_ARCH_ARM64
  // Single across-vector maximum intrinsic.
  return vmaxvq_f32(v4s);
#else
  // Fold the high half onto the low half, then reduce the two surviving lanes.
  const float32x2_t folded = vmax_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (max(x,z), max(y,w))
  const float32x2_t highest = vmax_f32(folded, vdup_lane_f32(folded, 1));     // lane 0 = overall maximum
  return vget_lane_f32(highest, 0);
#endif
}
|
||||
|
||||
// Saturates this vector between a and b: applies max(a) first (lower bound), then min(b) (upper bound).
ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
|
||||
|
||||
ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
|
||||
|
|
|
@ -1859,6 +1859,10 @@ public:
|
|||
|
||||
// Four-component dot product of this vector with v, computed component-wise without intrinsics.
ALWAYS_INLINE float dot(const GSVector4& v) const { return (x * v.x) + (y * v.y) + (z * v.z) + (w * v.w); }
|
||||
|
||||
// Horizontal add: returns the sum of the four components, evaluated left to right.
ALWAYS_INLINE float addv() const { return (x + y + z + w); }
|
||||
// Horizontal minimum: returns the smallest of x, y, z and w via nested std::min calls.
ALWAYS_INLINE float minv() const { return std::min(x, std::min(y, std::min(z, w))); }
|
||||
// Horizontal maximum: returns the largest of x, y, z and w via nested std::max calls.
ALWAYS_INLINE float maxv() const { return std::max(x, std::max(y, std::max(z, w))); }
|
||||
|
||||
GSVector4 sat(const GSVector4& min, const GSVector4& max) const
|
||||
{
|
||||
return GSVector4(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y), std::clamp(z, min.z, max.z),
|
||||
|
|
|
@ -2039,6 +2039,11 @@ public:
|
|||
|
||||
#ifdef CPU_ARCH_SSE41
|
||||
// Four-lane dot product using _mm_dp_ps. Mask 0xf1: the high nibble (0xf) multiplies all four lanes,
// the low nibble (0x1) writes the sum to lane 0, which _mm_cvtss_f32 then extracts.
ALWAYS_INLINE float dot(const GSVector4& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0xf1)); }
|
||||
// Horizontal add: returns the sum of all four lanes using two horizontal-add steps.
ALWAYS_INLINE float addv() const
{
  // First pass yields (x+y, z+w, x+y, z+w); the second leaves (x+y)+(z+w) in every lane.
  const __m128 pair_sums = _mm_hadd_ps(m, m);
  const __m128 total = _mm_hadd_ps(pair_sums, pair_sums);
  return _mm_cvtss_f32(total);
}
|
||||
#else
|
||||
float dot(const GSVector4& v) const
|
||||
{
|
||||
|
@ -2047,8 +2052,28 @@ public:
|
|||
tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
|
||||
return _mm_cvtss_f32(tmp);
|
||||
}
|
||||
float addv() const
|
||||
{
|
||||
__m128 tmp = _mm_add_ps(m, _mm_movehl_ps(m, m)); // (x+z, y+w, ..., ...)
|
||||
tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
|
||||
return _mm_cvtss_f32(tmp);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Horizontal minimum: returns the smallest of the four lanes of m.
ALWAYS_INLINE float minv() const
{
  const __m128 upper = _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2)); // (z, w, z, w)
  const __m128 folded = _mm_min_ps(m, upper);                         // (min(x,z), min(y,w), ..., ...)

  // Broadcast lane 1 so the final min leaves the overall minimum in lane 0.
  const __m128 lane1 = _mm_shuffle_ps(folded, folded, _MM_SHUFFLE(1, 1, 1, 1));
  return _mm_cvtss_f32(_mm_min_ps(folded, lane1));
}
|
||||
|
||||
// Horizontal maximum: returns the largest of the four lanes of m.
ALWAYS_INLINE float maxv() const
{
  const __m128 upper = _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2)); // (z, w, z, w)
  const __m128 folded = _mm_max_ps(m, upper);                         // (max(x,z), max(y,w), ..., ...)

  // Broadcast lane 1 so the final max leaves the overall maximum in lane 0.
  const __m128 lane1 = _mm_shuffle_ps(folded, folded, _MM_SHUFFLE(1, 1, 1, 1));
  return _mm_cvtss_f32(_mm_max_ps(folded, lane1));
}
|
||||
|
||||
ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const
|
||||
{
|
||||
return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max));
|
||||
|
|
|
@ -129,32 +129,6 @@ static InputOverlayState s_input_overlay_state = {};
|
|||
|
||||
} // namespace ImGuiManager
|
||||
|
||||
static std::tuple<float, float> GetMinMax(std::span<const float> values)
|
||||
{
|
||||
GSVector4 vmin(GSVector4::load<false>(values.data()));
|
||||
GSVector4 vmax(vmin);
|
||||
|
||||
const u32 count = static_cast<u32>(values.size());
|
||||
const u32 aligned_count = Common::AlignDownPow2(count, 4);
|
||||
u32 i = 4;
|
||||
for (; i < aligned_count; i += 4)
|
||||
{
|
||||
const GSVector4 v(GSVector4::load<false>(&values[i]));
|
||||
vmin = vmin.min(v);
|
||||
vmax = vmax.max(v);
|
||||
}
|
||||
|
||||
float min = std::min(vmin.x, std::min(vmin.y, std::min(vmin.z, vmin.w)));
|
||||
float max = std::max(vmax.x, std::max(vmax.y, std::max(vmax.z, vmax.w)));
|
||||
for (; i < count; i++)
|
||||
{
|
||||
min = std::min(min, values[i]);
|
||||
max = std::max(max, values[i]);
|
||||
}
|
||||
|
||||
return std::tie(min, max);
|
||||
}
|
||||
|
||||
bool ImGuiManager::AreAnyDebugWindowsEnabled(const SettingsInterface& si)
|
||||
{
|
||||
#ifndef __ANDROID__
|
||||
|
@ -731,7 +705,23 @@ void ImGuiManager::DrawFrameTimeOverlay(float& position_y, float scale, float ma
|
|||
{
|
||||
ImGui::PushFont(fixed_font, fixed_font_size, fixed_font_weight);
|
||||
|
||||
auto [min, max] = GetMinMax(PerformanceCounters::GetFrameTimeHistory());
|
||||
// LLVM likes to unroll this... whatever.
|
||||
float min, max;
|
||||
{
|
||||
const PerformanceCounters::FrameTimeHistory& history = PerformanceCounters::GetFrameTimeHistory();
|
||||
static_assert((PerformanceCounters::NUM_FRAME_TIME_SAMPLES % 4) == 0);
|
||||
GSVector4 vmin = GSVector4::load<false>(history.data());
|
||||
GSVector4 vmax = vmin;
|
||||
for (size_t i = 4; i < history.size(); i += 4)
|
||||
{
|
||||
const GSVector4 v = GSVector4::load<false>(&history[i]);
|
||||
vmin = vmin.min(v);
|
||||
vmax = vmax.max(v);
|
||||
}
|
||||
|
||||
min = vmin.minv();
|
||||
max = vmin.maxv();
|
||||
}
|
||||
|
||||
// add a little bit of space either side, so we're not constantly resizing
|
||||
if ((max - min) < 4.0f)
|
||||
|
|
|
@ -22,7 +22,7 @@ namespace PerformanceCounters {
|
|||
|
||||
namespace {
|
||||
|
||||
struct State
|
||||
struct ALIGN_TO_CACHE_LINE State
|
||||
{
|
||||
Timer::Value last_update_time;
|
||||
Timer::Value last_frame_time;
|
||||
|
@ -55,7 +55,7 @@ struct State
|
|||
float accumulated_gpu_time;
|
||||
float gpu_usage;
|
||||
|
||||
FrameTimeHistory frame_time_history;
|
||||
alignas(VECTOR_ALIGNMENT) FrameTimeHistory frame_time_history;
|
||||
u32 frame_time_history_pos;
|
||||
};
|
||||
|
||||
|
@ -63,7 +63,7 @@ struct State
|
|||
|
||||
static constexpr const float PERFORMANCE_COUNTER_UPDATE_INTERVAL = 1.0f;
|
||||
|
||||
ALIGN_TO_CACHE_LINE State s_state = {};
|
||||
State s_state = {};
|
||||
|
||||
} // namespace PerformanceCounters
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ class GPUBackend;
|
|||
|
||||
namespace PerformanceCounters
|
||||
{
|
||||
inline constexpr u32 NUM_FRAME_TIME_SAMPLES = 150;
|
||||
inline constexpr u32 NUM_FRAME_TIME_SAMPLES = 152;
|
||||
using FrameTimeHistory = std::array<float, NUM_FRAME_TIME_SAMPLES>;
|
||||
|
||||
float GetFPS();
|
||||
|
|
Loading…
Reference in New Issue