Misc: Fix alignment errors on ARM32
This commit is contained in:
parent bb24d406f2 · commit 5c03e1d940

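What the change does: the GSVector load/store helpers gain a template<bool aligned> parameter so every call site states whether its pointer meets the vector's natural alignment. On ARM32, NEON intrinsics such as vld1_s32/vst1_s32 take element-typed pointers, and the compiler may assume element alignment, so unaligned accesses can fault; the new !aligned path goes through byte-wide vld1_s8/vst1_s8, which assumes only 1-byte alignment. A minimal standalone sketch of the pattern (not the project's actual class; CPU_ARCH_ARM32 is the project's own macro, assumed defined on 32-bit ARM builds):

    #include <arm_neon.h>
    #include <cstdint>

    // Sketch: callers pick the path at compile time, e.g. load_s32x2<false>(p)
    // for a pointer that may not be 4-byte aligned.
    template<bool aligned>
    static inline int32x2_t load_s32x2(const void* p)
    {
    #ifdef CPU_ARCH_ARM32 // assumption: set by the build system on 32-bit ARM
      if constexpr (!aligned)
        return vreinterpret_s32_s8(vld1_s8(static_cast<const int8_t*>(p))); // byte lanes, no alignment assumption
    #endif
      return vld1_s32(static_cast<const int32_t*>(p)); // compiler may assume 4-byte alignment
    }

On 64-bit ARM and x86 the fallback path is used unconditionally, since unaligned vector accesses are safe there.
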
@@ -15,8 +15,8 @@ static void YUVToRGB_Vector(const std::array<s16, 64>& Crblk, const std::array<s
   const GSVector4i addval = signed_output ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
   for (u32 y = 0; y < 8; y++)
   {
-    const GSVector4i Cr = GSVector4i::loadl(&Crblk[(y / 2) * 8]).s16to32();
-    const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(y / 2) * 8]).s16to32();
+    const GSVector4i Cr = GSVector4i::loadl<false>(&Crblk[(y / 2) * 8]).s16to32();
+    const GSVector4i Cb = GSVector4i::loadl<false>(&Cbblk[(y / 2) * 8]).s16to32();
     const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);

     // BT.601 YUV->RGB coefficients, rounding formula from Mednafen.

@@ -53,7 +53,7 @@ GSMatrix2x2 GSMatrix2x2::Rotation(float angle_in_radians)

 GSVector2 GSMatrix2x2::row(size_t i) const
 {
-  return GSVector2::load(&E[i][0]);
+  return GSVector2::load<true>(&E[i][0]);
 }

 GSVector2 GSMatrix2x2::col(size_t i) const

@@ -35,5 +35,5 @@ public:

   void store(void* m);

-  float E[2][2];
+  alignas(8) float E[2][2];
 };

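Why the alignas(8) pairs with the load<true> above (an inference from the diff, not stated in the commit message): load<true> performs an aligned 64-bit load of one row, and a plain float[2][2] is only guaranteed 4-byte alignment by the ABI. Roughly:

    // Hedged illustration: without the attribute, rows are only float-aligned,
    // so an aligned 8-byte vector load of &E[i][0] is not guaranteed safe.
    static_assert(alignof(float[2][2]) == alignof(float)); // typically 4
    // alignas(8) raises the member's alignment so each 8-byte row
    // (E[0] at offset 0, E[1] at offset 8) starts on an 8-byte boundary.
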
@@ -690,7 +690,16 @@ public:

   ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }

-  ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); }
+  template<bool aligned>
+  ALWAYS_INLINE static GSVector2i load(const void* p)
+  {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
+#endif
+
+    return GSVector2i(vld1_s32((const int32_t*)p));
+  }

   ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
   {

@@ -698,7 +707,19 @@ public:
     std::memcpy(p, &val, sizeof(s32));
   }

-  ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { vst1_s32((int32_t*)p, v.v2s); }
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
+  {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
+      return;
+    }
+#endif
+
+    vst1_s32((int32_t*)p, v.v2s);
+  }

   ALWAYS_INLINE void operator&=(const GSVector2i& v)
   {

@@ -903,9 +924,30 @@ public:

   ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }

-  ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32(static_cast<const float*>(p))); }
+  template<bool aligned>
+  ALWAYS_INLINE static GSVector2 load(const void* p)
+  {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
+#endif
+
+    return GSVector2(vld1_f32(static_cast<const float*>(p)));
+  }

-  ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32(static_cast<float*>(p), v.v2s); }
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2& v)
+  {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
+      return;
+    }
+#endif
+
+    vst1_f32(static_cast<float*>(p), v.v2s);
+  }

   ALWAYS_INLINE GSVector2 operator-() const { return neg(); }

@@ -2134,13 +2176,25 @@ public:

   ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadl(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0)));
+#endif
+
     return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
   }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadh(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
+#endif
+
     return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
   }

@@ -2149,6 +2203,11 @@ public:
   template<bool aligned>
   ALWAYS_INLINE static GSVector4i load(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p)));
+#endif
+
     return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
   }

@@ -2167,19 +2226,45 @@ public:
     std::memcpy(p, &val, sizeof(u32));
   }

+  template<bool aligned>
   ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
+      return;
+    }
+#endif
+
     vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
   }

+  template<bool aligned>
   ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
+      return;
+    }
+#endif
+
     vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
   }

   template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s));
+      return;
+    }
+#endif
+
     vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
   }

@@ -2652,8 +2737,14 @@ public:

   ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4 loadl(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0)));
+#endif
+
     return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
   }

@@ -2662,32 +2753,55 @@ public:
   template<bool aligned>
   ALWAYS_INLINE static GSVector4 load(const void* p)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+      return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p)));
+#endif
+
     return GSVector4(vld1q_f32((const float*)p));
   }

   ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }

+  template<bool aligned>
   ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
   {
-#ifdef CPU_ARCH_ARM64
-    vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
-#else
-    vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s)));
-#endif
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
+      return;
+    }
+#endif
+
+    vst1_f32((float*)p, vget_low_f32(v.v4s));
   }

+  template<bool aligned>
   ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
   {
-#ifdef CPU_ARCH_ARM64
-    vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
-#else
-    vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s)));
-#endif
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
+      return;
+    }
+#endif
+
+    vst1_f32((float*)p, vget_high_f32(v.v4s));
   }

   template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4& v)
   {
+#ifdef CPU_ARCH_ARM32
+    if constexpr (!aligned)
+    {
+      vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s));
+      return;
+    }
+#endif
+
     vst1q_f32((float*)p, v.v4s);
   }

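With the NEON wrappers above in place, call sites state alignment explicitly. A hedged usage sketch (the buffers are invented for illustration; the functions are the ones from the diff):

    alignas(16) float aligned4[4]; // 16-byte aligned: the <true> paths are valid
    float maybe_unaligned[2];      // only float-aligned: use the <false> paths

    const GSVector4 v = GSVector4::load<true>(aligned4);
    GSVector4::storel<false>(maybe_unaligned, v); // low two floats, byte-safe on ARM32
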
@@ -467,6 +467,7 @@ public:

   ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(v, 0); }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector2i load(const void* p)
   {
     GSVector2i ret;

@@ -474,7 +475,11 @@ public:
     return ret;
   }

-  ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.S32, sizeof(S32)); }
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
+  {
+    std::memcpy(p, v.S32, sizeof(S32));
+  }

   ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); }

@@ -658,6 +663,7 @@ public:
     return ret;
   }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector2 load(const void* p)
   {
     GSVector2 ret;

@@ -665,7 +671,11 @@ public:
     return ret;
   }

-  ALWAYS_INLINE static void store(void* p, const GSVector2& v) { std::memcpy(p, &v.F32, sizeof(F32)); }
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2& v)
+  {
+    std::memcpy(p, &v.F32, sizeof(F32));
+  }

   ALWAYS_INLINE GSVector2 operator-() const { return neg(); }

@@ -1530,6 +1540,7 @@ public:

   ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(v, 0, 0, 0); }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadl(const void* p)
   {
     GSVector4i ret;

@@ -1538,6 +1549,7 @@ public:
     return ret;
   }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadh(const void* p)
   {
     GSVector4i ret;

@@ -1546,7 +1558,11 @@ public:
     return ret;
   }

-  ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); }
+  template<bool aligned>
+  ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
+  {
+    return loadh<true>(&v);
+  }

   template<bool aligned>
   ALWAYS_INLINE static GSVector4i load(const void* p)

@@ -1558,9 +1574,17 @@ public:

   ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.S32, sizeof(v.S32)); }

-  ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[0], sizeof(s32) * 2); }
+  template<bool aligned>
+  ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
+  {
+    std::memcpy(p, &v.S32[0], sizeof(s32) * 2);
+  }

-  ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[2], sizeof(s32) * 2); }
+  template<bool aligned>
+  ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
+  {
+    std::memcpy(p, &v.S32[2], sizeof(s32) * 2);
+  }

   template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4i& v)

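Note (reader's inference, not from the commit message): in this scalar fallback the aligned parameter is simply ignored, since std::memcpy is alignment-safe by definition; the template exists only to keep one call-site syntax across the NEON, SSE, and no-SIMD backends.
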
@@ -1958,6 +1982,7 @@ public:
     return ret;
   }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4 loadl(const void* p)
   {
     GSVector4 ret;

@@ -1977,9 +2002,17 @@ public:

   ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); }

-  ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { std::memcpy(p, &v.x, sizeof(float) * 2); }
+  template<bool aligned>
+  ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
+  {
+    std::memcpy(p, &v.x, sizeof(float) * 2);
+  }

-  ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { std::memcpy(p, &v.z, sizeof(float) * 2); }
+  template<bool aligned>
+  ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
+  {
+    std::memcpy(p, &v.z, sizeof(float) * 2);
+  }

   template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector4& v)

@@ -585,12 +585,19 @@ public:

   ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); }
   ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(_mm_cvtsi32_si128(v)); }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector2i load(const void* p)
   {
     return GSVector2i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
   }

-  ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
+  template<bool aligned>
+  ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
+  {
+    _mm_storel_epi64(static_cast<__m128i*>(p), v.m);
+  }

   ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); }

   ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v)

@@ -806,11 +813,13 @@ public:

   ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector2 load(const void* p)
   {
     return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
   }

+  template<bool aligned>
   ALWAYS_INLINE static void store(void* p, const GSVector2& v)
   {
     _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));

@@ -1711,16 +1720,19 @@ public:
   ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); }
   ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(_mm_cvtsi32_si128(v)); }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadl(const void* p)
   {
     return GSVector4i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
   }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadh(const void* p)
   {
     return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast<const __m64*>(p))));
   }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
   {
     return GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m));

@@ -1734,7 +1746,14 @@ public:
   }

   ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); }
-  ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
+
+  template<bool aligned>
+  ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
+  {
+    _mm_storel_epi64(static_cast<__m128i*>(p), v.m);
+  }

+  template<bool aligned>
   ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
   {
     _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m));

@@ -2115,6 +2134,7 @@ public:

   ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }

+  template<bool aligned>
   ALWAYS_INLINE static GSVector4 loadl(const void* p)
   {
     return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));

@@ -2127,10 +2147,14 @@ public:
   }

   ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }

+  template<bool aligned>
   ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
   {
     _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
   }

+  template<bool aligned>
   ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
   {
     _mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));

@@ -549,7 +549,9 @@ u8* Bus::GetLUTFastmemPointer(u32 address, u8* ram_ptr)

 void Bus::MapFastmemViews()
 {
+#ifdef ENABLE_MMAP_FASTMEM
   Assert(s_fastmem_ram_views.empty());
+#endif

   const CPUFastmemMode mode = g_settings.cpu_fastmem_mode;
   if (mode == CPUFastmemMode::MMap)

@@ -345,8 +345,8 @@ protected:
   // However, usually it'll undershoot not overshoot. If we wanted to make this more accurate, we'd need to intersect
   // the edges with the clip rectangle.
   // TODO: Coordinates are exclusive, so off by one here...
-  const GSVector2i clamp_min = GSVector2i::load(&m_clamped_drawing_area.x);
-  const GSVector2i clamp_max = GSVector2i::load(&m_clamped_drawing_area.z);
+  const GSVector2i clamp_min = GSVector2i::load<true>(&m_clamped_drawing_area.x);
+  const GSVector2i clamp_max = GSVector2i::load<true>(&m_clamped_drawing_area.z);
   v1 = v1.sat_s32(clamp_min, clamp_max);
   v2 = v2.sat_s32(clamp_min, clamp_max);
   v3 = v3.sat_s32(clamp_min, clamp_max);

@@ -2570,9 +2570,9 @@ void GPU_HW::LoadVertices()
       }

       // Cull polygons which are too large.
-      const GSVector2 v0f = GSVector2::load(&vertices[0].x);
-      const GSVector2 v1f = GSVector2::load(&vertices[1].x);
-      const GSVector2 v2f = GSVector2::load(&vertices[2].x);
+      const GSVector2 v0f = GSVector2::load<false>(&vertices[0].x);
+      const GSVector2 v1f = GSVector2::load<false>(&vertices[1].x);
+      const GSVector2 v2f = GSVector2::load<false>(&vertices[2].x);
       const GSVector2 min_pos_12 = v1f.min(v2f);
       const GSVector2 max_pos_12 = v1f.max(v2f);
       const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f))))

@@ -2617,7 +2617,7 @@ void GPU_HW::LoadVertices()
       // quads
       if (rc.quad_polygon)
       {
-        const GSVector2 v3f = GSVector2::load(&vertices[3].x);
+        const GSVector2 v3f = GSVector2::load<false>(&vertices[3].x);
         const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f))))
                                            .add32(GSVector4i::cxpr(0, 0, 1, 1));
         const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area);

@@ -2845,9 +2845,9 @@ void GPU_HW::LoadVertices()
       {
         GPUBackendDrawLineCommand* cmd = m_sw_renderer->NewDrawLineCommand(2);
         FillDrawCommand(cmd, rc);
-        GSVector4i::storel(&cmd->vertices[0], bounds);
+        GSVector4i::storel<false>(&cmd->vertices[0], bounds);
         cmd->vertices[0].color = start_color;
-        GSVector4i::storeh(&cmd->vertices[1], bounds);
+        GSVector4i::storeh<false>(&cmd->vertices[1], bounds);
         cmd->vertices[1].color = end_color;
         m_sw_renderer->PushCommand(cmd);
       }

@@ -2870,7 +2870,7 @@ void GPU_HW::LoadVertices()
       {
         cmd = m_sw_renderer->NewDrawLineCommand(num_vertices);
         FillDrawCommand(cmd, rc);
-        GSVector2i::store(&cmd->vertices[0].x, start_pos);
+        GSVector2i::store<false>(&cmd->vertices[0].x, start_pos);
         cmd->vertices[0].color = start_color;
       }
       else

@@ -2905,7 +2905,7 @@ void GPU_HW::LoadVertices()

       if (cmd)
       {
-        GSVector2i::store(&cmd->vertices[i], end_pos);
+        GSVector2i::store<false>(&cmd->vertices[i], end_pos);
         cmd->vertices[i].color = end_color;
       }
     }

@@ -2978,7 +2978,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)

   const GPUTextureMode tmode = m_draw_mode.mode_reg.texture_mode;
   const u32 xshift = (tmode >= GPUTextureMode::Direct16Bit) ? 0 : (2 - static_cast<u8>(tmode));
-  const GSVector4i page_offset = GSVector4i::loadl(m_current_texture_page_offset).xyxy();
+  const GSVector4i page_offset = GSVector4i::loadl<true>(m_current_texture_page_offset).xyxy();

   uv_rect = uv_rect.blend32<5>(uv_rect.srl32(xshift)); // shift only goes on the x
   uv_rect = uv_rect.add32(page_offset);                // page offset

@@ -317,7 +317,7 @@ private:
   GSVector4i m_vram_dirty_write_rect = INVALID_RECT; // TODO: Don't use in TC mode, should be kept at zero.
   GSVector4i m_current_uv_rect = INVALID_RECT;
   GSVector4i m_current_draw_rect = INVALID_RECT;
-  s32 m_current_texture_page_offset[2] = {};
+  alignas(8) s32 m_current_texture_page_offset[2] = {};

   std::unique_ptr<GPUPipeline> m_wireframe_pipeline;

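This pairs with the loadl<true> in CheckForTexPageOverlap above (again an inference from the diff): loadl<true> may issue a 64-bit element load on ARM32, so the two-s32 field must genuinely be 8-byte aligned. A hedged sketch of the relationship:

    alignas(8) s32 offset[2] = {}; // as in the hunk above: 8-byte-aligned storage
    // loadl<true> can use an aligned 64-bit NEON load underneath, which is
    // only guaranteed safe because of the alignas on the source.
    const GSVector4i page_offset = GSVector4i::loadl<true>(offset).xyxy();
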
@@ -3305,8 +3305,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash,
   // TODO: Use rects instead of fullscreen tris, maybe avoid the copy..
   alignas(VECTOR_ALIGNMENT) float uniforms[4];
   GSVector2 texture_size = GSVector2(GSVector2i(entry->texture->GetWidth(), entry->texture->GetHeight()));
-  GSVector2::store(&uniforms[0], texture_size);
-  GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
+  GSVector2::store<true>(&uniforms[0], texture_size);
+  GSVector2::store<true>(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
   g_gpu_device->InvalidateRenderTarget(s_state.replacement_texture_render_target.get());
   g_gpu_device->SetRenderTarget(s_state.replacement_texture_render_target.get());
   g_gpu_device->SetViewportAndScissor(0, 0, new_width, new_height);

@@ -3325,8 +3325,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash,

     const GSVector4i dst_rect = GSVector4i(GSVector4(si.dst_rect) * max_scale_v);
     texture_size = GSVector2(GSVector2i(temp_texture->GetWidth(), temp_texture->GetHeight()));
-    GSVector2::store(&uniforms[0], texture_size);
-    GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
+    GSVector2::store<true>(&uniforms[0], texture_size);
+    GSVector2::store<true>(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
     g_gpu_device->SetViewportAndScissor(dst_rect);
     g_gpu_device->SetTextureSampler(0, temp_texture.get(), g_gpu_device->GetNearestSampler());
     g_gpu_device->SetPipeline(si.invert_alpha ? s_state.replacement_semitransparent_draw_pipeline.get() :

@@ -537,7 +537,7 @@ void GPU_SW::DispatchRenderCommand()
         vert->x = m_drawing_offset.x + vp.x;
         vert->y = m_drawing_offset.y + vp.y;
         vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
-        positions[i] = GSVector2i::load(&vert->x);
+        positions[i] = GSVector2i::load<false>(&vert->x);
       }

       // Cull polygons which are too large.

@@ -686,8 +686,8 @@ void GPU_SW::DispatchRenderCommand()
        cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
      }

-     const GSVector4i v0 = GSVector4i::loadl(&cmd->vertices[0].x);
-     const GSVector4i v1 = GSVector4i::loadl(&cmd->vertices[1].x);
+     const GSVector4i v0 = GSVector4i::loadl<false>(&cmd->vertices[0].x);
+     const GSVector4i v1 = GSVector4i::loadl<false>(&cmd->vertices[1].x);
      const GSVector4i rect = v0.min_s32(v1).xyxy(v0.max_s32(v1)).add32(GSVector4i::cxpr(0, 0, 1, 1));
      const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);

@@ -711,7 +711,7 @@ void GPU_SW::DispatchRenderCommand()

      u32 buffer_pos = 0;
      const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
-     const GSVector2i draw_offset = GSVector2i::load(&m_drawing_offset.x);
+     const GSVector2i draw_offset = GSVector2i::load<false>(&m_drawing_offset.x);
      GSVector2i start_pos = GSVector2i(start_vp.x, start_vp.y).add32(draw_offset);
      u32 start_color = m_render_command.color_for_first_vertex;

@@ -740,9 +740,9 @@ void GPU_SW::DispatchRenderCommand()
        GPUBackendDrawLineCommand::Vertex* out_vertex = &cmd->vertices[out_vertex_count];
        out_vertex_count += 2;

-       GSVector2i::store(&out_vertex[0].x, start_pos);
+       GSVector2i::store<false>(&out_vertex[0].x, start_pos);
        out_vertex[0].color = start_color;
-       GSVector2i::store(&out_vertex[1].x, end_pos);
+       GSVector2i::store<false>(&out_vertex[1].x, end_pos);
        out_vertex[1].color = end_color;
      }

@@ -397,7 +397,7 @@ ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
 {
   if (x <= (VRAM_WIDTH - 4))
   {
-    return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
+    return GSVector4i::loadl<false>(&g_vram[y * VRAM_WIDTH + x]).u16to32();
   }
   else
   {

@@ -415,7 +415,7 @@ ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
   const GSVector4i packed_color = color.pu32();
   if (x <= (VRAM_WIDTH - 4))
   {
-    GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color);
+    GSVector4i::storel<false>(&g_vram[y * VRAM_WIDTH + x], packed_color);
   }
   else
   {

@@ -711,10 +711,10 @@ void MDEC::CopyOutBlock(void* param, TickCount ticks, TickCount ticks_late)

       for (u32 index = 0; index < s_state.block_rgb.size(); index += 16)
       {
-        const GSVector4i rgbx0 = GSVector4i::load<false>(&s_state.block_rgb[index]);
-        const GSVector4i rgbx1 = GSVector4i::load<false>(&s_state.block_rgb[index + 4]);
-        const GSVector4i rgbx2 = GSVector4i::load<false>(&s_state.block_rgb[index + 8]);
-        const GSVector4i rgbx3 = GSVector4i::load<false>(&s_state.block_rgb[index + 12]);
+        const GSVector4i rgbx0 = GSVector4i::load<true>(&s_state.block_rgb[index]);
+        const GSVector4i rgbx1 = GSVector4i::load<true>(&s_state.block_rgb[index + 4]);
+        const GSVector4i rgbx2 = GSVector4i::load<true>(&s_state.block_rgb[index + 8]);
+        const GSVector4i rgbx3 = GSVector4i::load<true>(&s_state.block_rgb[index + 12]);

         GSVector4i::store<true>(&rgbp[0], rgbx0.shuffle8(mask00) | rgbx1.shuffle8(mask01));
         GSVector4i::store<true>(&rgbp[4], rgbx1.shuffle8(mask11) | rgbx2.shuffle8(mask12));

@@ -1048,8 +1048,8 @@ void MDEC::YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const
   const GSVector4i addval = s_state.status.data_output_signed ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
   for (u32 y = 0; y < 8; y++)
   {
-    const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
-    const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
+    const GSVector4i Cr = GSVector4i::loadl<false>(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
+    const GSVector4i Cb = GSVector4i::loadl<false>(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
     const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);

     // BT.601 YUV->RGB coefficients, rounding formula from Mednafen.

@@ -2318,7 +2318,7 @@ void SPU::ProcessReverb(s32 left_in, s32 right_in, s32* left_out, s32* right_out
     srcs = GSVector4i::load<false>(&src[8]);
     acc = acc.add32(GSVector4i::load<true>(&resample_coeff[8]).mul32l(srcs.s16to32()));
     acc = acc.add32(GSVector4i::load<true>(&resample_coeff[12]).mul32l(srcs.uph64().s16to32()));
-    srcs = GSVector4i::loadl(&src[16]);
+    srcs = GSVector4i::loadl<false>(&src[16]);
     acc = acc.add32(GSVector4i::load<true>(&resample_coeff[16]).mul32l(srcs.s16to32()));

     out[channel] = std::clamp<s32>(acc.addv_s32() >> 14, -32768, 32767);

@@ -1683,7 +1683,7 @@ GPUDevice::PresentResult PostProcessing::ReShadeFXShader::Apply(GPUTexture* inpu

       case SourceOptionType::ViewportOffset:
       {
-        GSVector4::storel(dst, GSVector4(final_rect));
+        GSVector4::storel<false>(dst, GSVector4(final_rect));
       }
       break;
