Misc: Fix alignment errors on ARM32

This commit is contained in:
Stenzek 2024-11-22 18:44:10 +10:00
parent bb24d406f2
commit 5c03e1d940
No known key found for this signature in database
16 changed files with 230 additions and 57 deletions

View File

@ -15,8 +15,8 @@ static void YUVToRGB_Vector(const std::array<s16, 64>& Crblk, const std::array<s
const GSVector4i addval = signed_output ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
for (u32 y = 0; y < 8; y++)
{
const GSVector4i Cr = GSVector4i::loadl(&Crblk[(y / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(y / 2) * 8]).s16to32();
const GSVector4i Cr = GSVector4i::loadl<false>(&Crblk[(y / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl<false>(&Cbblk[(y / 2) * 8]).s16to32();
const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);
// BT.601 YUV->RGB coefficients, rounding formula from Mednafen.

View File

@ -53,7 +53,7 @@ GSMatrix2x2 GSMatrix2x2::Rotation(float angle_in_radians)
GSVector2 GSMatrix2x2::row(size_t i) const
{
return GSVector2::load(&E[i][0]);
return GSVector2::load<true>(&E[i][0]);
}
GSVector2 GSMatrix2x2::col(size_t i) const

View File

@ -35,5 +35,5 @@ public:
void store(void* m);
float E[2][2];
alignas(8) float E[2][2];
};

View File

@ -690,7 +690,16 @@ public:
ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); }
template<bool aligned>
ALWAYS_INLINE static GSVector2i load(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
#endif
return GSVector2i(vld1_s32((const int32_t*)p));
}
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
{
@ -698,7 +707,19 @@ public:
std::memcpy(p, &val, sizeof(s32));
}
ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { vst1_s32((int32_t*)p, v.v2s); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
return;
}
#endif
vst1_s32((int32_t*)p, v.v2s);
}
ALWAYS_INLINE void operator&=(const GSVector2i& v)
{
@ -903,9 +924,30 @@ public:
ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }
ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32(static_cast<const float*>(p))); }
template<bool aligned>
ALWAYS_INLINE static GSVector2 load(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
#endif
return GSVector2(vld1_f32(static_cast<const float*>(p)));
}
ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32(static_cast<float*>(p), v.v2s); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
return;
}
#endif
vst1_f32(static_cast<float*>(p), v.v2s);
}
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
@ -2134,13 +2176,25 @@ public:
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0)));
#endif
return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
#endif
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
}
@ -2149,6 +2203,11 @@ public:
template<bool aligned>
ALWAYS_INLINE static GSVector4i load(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p)));
#endif
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
}
@ -2167,19 +2226,45 @@ public:
std::memcpy(p, &val, sizeof(u32));
}
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
return;
}
#endif
vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
}
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
return;
}
#endif
vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s));
return;
}
#endif
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
}
@ -2652,8 +2737,14 @@ public:
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
template<bool aligned>
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0)));
#endif
return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
}
@ -2662,32 +2753,55 @@ public:
template<bool aligned>
ALWAYS_INLINE static GSVector4 load(const void* p)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p)));
#endif
return GSVector4(vld1q_f32((const float*)p));
}
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
#else
vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s)));
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
return;
}
#endif
vst1_f32((float*)p, vget_low_f32(v.v4s));
}
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
#else
vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s)));
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
return;
}
#endif
vst1_f32((float*)p, vget_high_f32(v.v4s));
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM32
if constexpr (!aligned)
{
vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s));
return;
}
#endif
vst1q_f32((float*)p, v.v4s);
}

View File

@ -467,6 +467,7 @@ public:
ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(v, 0); }
template<bool aligned>
ALWAYS_INLINE static GSVector2i load(const void* p)
{
GSVector2i ret;
@ -474,7 +475,11 @@ public:
return ret;
}
ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.S32, sizeof(S32)); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
{
std::memcpy(p, v.S32, sizeof(S32));
}
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); }
@ -658,6 +663,7 @@ public:
return ret;
}
template<bool aligned>
ALWAYS_INLINE static GSVector2 load(const void* p)
{
GSVector2 ret;
@ -665,7 +671,11 @@ public:
return ret;
}
ALWAYS_INLINE static void store(void* p, const GSVector2& v) { std::memcpy(p, &v.F32, sizeof(F32)); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
{
std::memcpy(p, &v.F32, sizeof(F32));
}
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
@ -1530,6 +1540,7 @@ public:
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(v, 0, 0, 0); }
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
GSVector4i ret;
@ -1538,6 +1549,7 @@ public:
return ret;
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
GSVector4i ret;
@ -1546,7 +1558,11 @@ public:
return ret;
}
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); }
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
{
return loadh<true>(&v);
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i load(const void* p)
@ -1558,9 +1574,17 @@ public:
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.S32, sizeof(v.S32)); }
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[0], sizeof(s32) * 2); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
{
std::memcpy(p, &v.S32[0], sizeof(s32) * 2);
}
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[2], sizeof(s32) * 2); }
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
std::memcpy(p, &v.S32[2], sizeof(s32) * 2);
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
@ -1958,6 +1982,7 @@ public:
return ret;
}
template<bool aligned>
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
GSVector4 ret;
@ -1977,9 +2002,17 @@ public:
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); }
ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { std::memcpy(p, &v.x, sizeof(float) * 2); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
std::memcpy(p, &v.x, sizeof(float) * 2);
}
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { std::memcpy(p, &v.z, sizeof(float) * 2); }
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
std::memcpy(p, &v.z, sizeof(float) * 2);
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4& v)

View File

@ -585,12 +585,19 @@ public:
ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); }
ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(_mm_cvtsi32_si128(v)); }
template<bool aligned>
ALWAYS_INLINE static GSVector2i load(const void* p)
{
return GSVector2i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
}
ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
{
_mm_storel_epi64(static_cast<__m128i*>(p), v.m);
}
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); }
ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v)
@ -806,11 +813,13 @@ public:
ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); }
template<bool aligned>
ALWAYS_INLINE static GSVector2 load(const void* p)
{
return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
}
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
{
_mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
@ -1711,16 +1720,19 @@ public:
ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); }
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(_mm_cvtsi32_si128(v)); }
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
return GSVector4i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast<const __m64*>(p))));
}
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
{
return GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m));
@ -1734,7 +1746,14 @@ public:
}
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); }
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
{
_mm_storel_epi64(static_cast<__m128i*>(p), v.m);
}
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
_mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m));
@ -2115,6 +2134,7 @@ public:
ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }
template<bool aligned>
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
@ -2127,10 +2147,14 @@ public:
}
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
_mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
}
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
_mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));

View File

@ -549,7 +549,9 @@ u8* Bus::GetLUTFastmemPointer(u32 address, u8* ram_ptr)
void Bus::MapFastmemViews()
{
#ifdef ENABLE_MMAP_FASTMEM
Assert(s_fastmem_ram_views.empty());
#endif
const CPUFastmemMode mode = g_settings.cpu_fastmem_mode;
if (mode == CPUFastmemMode::MMap)

View File

@ -345,8 +345,8 @@ protected:
// However, usually it'll undershoot not overshoot. If we wanted to make this more accurate, we'd need to intersect
// the edges with the clip rectangle.
// TODO: Coordinates are exclusive, so off by one here...
const GSVector2i clamp_min = GSVector2i::load(&m_clamped_drawing_area.x);
const GSVector2i clamp_max = GSVector2i::load(&m_clamped_drawing_area.z);
const GSVector2i clamp_min = GSVector2i::load<true>(&m_clamped_drawing_area.x);
const GSVector2i clamp_max = GSVector2i::load<true>(&m_clamped_drawing_area.z);
v1 = v1.sat_s32(clamp_min, clamp_max);
v2 = v2.sat_s32(clamp_min, clamp_max);
v3 = v3.sat_s32(clamp_min, clamp_max);

View File

@ -2570,9 +2570,9 @@ void GPU_HW::LoadVertices()
}
// Cull polygons which are too large.
const GSVector2 v0f = GSVector2::load(&vertices[0].x);
const GSVector2 v1f = GSVector2::load(&vertices[1].x);
const GSVector2 v2f = GSVector2::load(&vertices[2].x);
const GSVector2 v0f = GSVector2::load<false>(&vertices[0].x);
const GSVector2 v1f = GSVector2::load<false>(&vertices[1].x);
const GSVector2 v2f = GSVector2::load<false>(&vertices[2].x);
const GSVector2 min_pos_12 = v1f.min(v2f);
const GSVector2 max_pos_12 = v1f.max(v2f);
const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f))))
@ -2617,7 +2617,7 @@ void GPU_HW::LoadVertices()
// quads
if (rc.quad_polygon)
{
const GSVector2 v3f = GSVector2::load(&vertices[3].x);
const GSVector2 v3f = GSVector2::load<false>(&vertices[3].x);
const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f))))
.add32(GSVector4i::cxpr(0, 0, 1, 1));
const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area);
@ -2845,9 +2845,9 @@ void GPU_HW::LoadVertices()
{
GPUBackendDrawLineCommand* cmd = m_sw_renderer->NewDrawLineCommand(2);
FillDrawCommand(cmd, rc);
GSVector4i::storel(&cmd->vertices[0], bounds);
GSVector4i::storel<false>(&cmd->vertices[0], bounds);
cmd->vertices[0].color = start_color;
GSVector4i::storeh(&cmd->vertices[1], bounds);
GSVector4i::storeh<false>(&cmd->vertices[1], bounds);
cmd->vertices[1].color = end_color;
m_sw_renderer->PushCommand(cmd);
}
@ -2870,7 +2870,7 @@ void GPU_HW::LoadVertices()
{
cmd = m_sw_renderer->NewDrawLineCommand(num_vertices);
FillDrawCommand(cmd, rc);
GSVector2i::store(&cmd->vertices[0].x, start_pos);
GSVector2i::store<false>(&cmd->vertices[0].x, start_pos);
cmd->vertices[0].color = start_color;
}
else
@ -2905,7 +2905,7 @@ void GPU_HW::LoadVertices()
if (cmd)
{
GSVector2i::store(&cmd->vertices[i], end_pos);
GSVector2i::store<false>(&cmd->vertices[i], end_pos);
cmd->vertices[i].color = end_color;
}
}
@ -2978,7 +2978,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
const GPUTextureMode tmode = m_draw_mode.mode_reg.texture_mode;
const u32 xshift = (tmode >= GPUTextureMode::Direct16Bit) ? 0 : (2 - static_cast<u8>(tmode));
const GSVector4i page_offset = GSVector4i::loadl(m_current_texture_page_offset).xyxy();
const GSVector4i page_offset = GSVector4i::loadl<true>(m_current_texture_page_offset).xyxy();
uv_rect = uv_rect.blend32<5>(uv_rect.srl32(xshift)); // shift only goes on the x
uv_rect = uv_rect.add32(page_offset); // page offset

View File

@ -317,7 +317,7 @@ private:
GSVector4i m_vram_dirty_write_rect = INVALID_RECT; // TODO: Don't use in TC mode, should be kept at zero.
GSVector4i m_current_uv_rect = INVALID_RECT;
GSVector4i m_current_draw_rect = INVALID_RECT;
s32 m_current_texture_page_offset[2] = {};
alignas(8) s32 m_current_texture_page_offset[2] = {};
std::unique_ptr<GPUPipeline> m_wireframe_pipeline;

View File

@ -3305,8 +3305,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash,
// TODO: Use rects instead of fullscreen tris, maybe avoid the copy..
alignas(VECTOR_ALIGNMENT) float uniforms[4];
GSVector2 texture_size = GSVector2(GSVector2i(entry->texture->GetWidth(), entry->texture->GetHeight()));
GSVector2::store(&uniforms[0], texture_size);
GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
GSVector2::store<true>(&uniforms[0], texture_size);
GSVector2::store<true>(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
g_gpu_device->InvalidateRenderTarget(s_state.replacement_texture_render_target.get());
g_gpu_device->SetRenderTarget(s_state.replacement_texture_render_target.get());
g_gpu_device->SetViewportAndScissor(0, 0, new_width, new_height);
@ -3325,8 +3325,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash,
const GSVector4i dst_rect = GSVector4i(GSVector4(si.dst_rect) * max_scale_v);
texture_size = GSVector2(GSVector2i(temp_texture->GetWidth(), temp_texture->GetHeight()));
GSVector2::store(&uniforms[0], texture_size);
GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
GSVector2::store<true>(&uniforms[0], texture_size);
GSVector2::store<true>(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size);
g_gpu_device->SetViewportAndScissor(dst_rect);
g_gpu_device->SetTextureSampler(0, temp_texture.get(), g_gpu_device->GetNearestSampler());
g_gpu_device->SetPipeline(si.invert_alpha ? s_state.replacement_semitransparent_draw_pipeline.get() :

View File

@ -537,7 +537,7 @@ void GPU_SW::DispatchRenderCommand()
vert->x = m_drawing_offset.x + vp.x;
vert->y = m_drawing_offset.y + vp.y;
vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
positions[i] = GSVector2i::load(&vert->x);
positions[i] = GSVector2i::load<false>(&vert->x);
}
// Cull polygons which are too large.
@ -686,8 +686,8 @@ void GPU_SW::DispatchRenderCommand()
cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
}
const GSVector4i v0 = GSVector4i::loadl(&cmd->vertices[0].x);
const GSVector4i v1 = GSVector4i::loadl(&cmd->vertices[1].x);
const GSVector4i v0 = GSVector4i::loadl<false>(&cmd->vertices[0].x);
const GSVector4i v1 = GSVector4i::loadl<false>(&cmd->vertices[1].x);
const GSVector4i rect = v0.min_s32(v1).xyxy(v0.max_s32(v1)).add32(GSVector4i::cxpr(0, 0, 1, 1));
const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
@ -711,7 +711,7 @@ void GPU_SW::DispatchRenderCommand()
u32 buffer_pos = 0;
const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
const GSVector2i draw_offset = GSVector2i::load(&m_drawing_offset.x);
const GSVector2i draw_offset = GSVector2i::load<false>(&m_drawing_offset.x);
GSVector2i start_pos = GSVector2i(start_vp.x, start_vp.y).add32(draw_offset);
u32 start_color = m_render_command.color_for_first_vertex;
@ -740,9 +740,9 @@ void GPU_SW::DispatchRenderCommand()
GPUBackendDrawLineCommand::Vertex* out_vertex = &cmd->vertices[out_vertex_count];
out_vertex_count += 2;
GSVector2i::store(&out_vertex[0].x, start_pos);
GSVector2i::store<false>(&out_vertex[0].x, start_pos);
out_vertex[0].color = start_color;
GSVector2i::store(&out_vertex[1].x, end_pos);
GSVector2i::store<false>(&out_vertex[1].x, end_pos);
out_vertex[1].color = end_color;
}

View File

@ -397,7 +397,7 @@ ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
{
if (x <= (VRAM_WIDTH - 4))
{
return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
return GSVector4i::loadl<false>(&g_vram[y * VRAM_WIDTH + x]).u16to32();
}
else
{
@ -415,7 +415,7 @@ ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
const GSVector4i packed_color = color.pu32();
if (x <= (VRAM_WIDTH - 4))
{
GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color);
GSVector4i::storel<false>(&g_vram[y * VRAM_WIDTH + x], packed_color);
}
else
{

View File

@ -711,10 +711,10 @@ void MDEC::CopyOutBlock(void* param, TickCount ticks, TickCount ticks_late)
for (u32 index = 0; index < s_state.block_rgb.size(); index += 16)
{
const GSVector4i rgbx0 = GSVector4i::load<false>(&s_state.block_rgb[index]);
const GSVector4i rgbx1 = GSVector4i::load<false>(&s_state.block_rgb[index + 4]);
const GSVector4i rgbx2 = GSVector4i::load<false>(&s_state.block_rgb[index + 8]);
const GSVector4i rgbx3 = GSVector4i::load<false>(&s_state.block_rgb[index + 12]);
const GSVector4i rgbx0 = GSVector4i::load<true>(&s_state.block_rgb[index]);
const GSVector4i rgbx1 = GSVector4i::load<true>(&s_state.block_rgb[index + 4]);
const GSVector4i rgbx2 = GSVector4i::load<true>(&s_state.block_rgb[index + 8]);
const GSVector4i rgbx3 = GSVector4i::load<true>(&s_state.block_rgb[index + 12]);
GSVector4i::store<true>(&rgbp[0], rgbx0.shuffle8(mask00) | rgbx1.shuffle8(mask01));
GSVector4i::store<true>(&rgbp[4], rgbx1.shuffle8(mask11) | rgbx2.shuffle8(mask12));
@ -1048,8 +1048,8 @@ void MDEC::YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const
const GSVector4i addval = s_state.status.data_output_signed ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
for (u32 y = 0; y < 8; y++)
{
const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
const GSVector4i Cr = GSVector4i::loadl<false>(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl<false>(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);
// BT.601 YUV->RGB coefficients, rounding formula from Mednafen.

View File

@ -2318,7 +2318,7 @@ void SPU::ProcessReverb(s32 left_in, s32 right_in, s32* left_out, s32* right_out
srcs = GSVector4i::load<false>(&src[8]);
acc = acc.add32(GSVector4i::load<true>(&resample_coeff[8]).mul32l(srcs.s16to32()));
acc = acc.add32(GSVector4i::load<true>(&resample_coeff[12]).mul32l(srcs.uph64().s16to32()));
srcs = GSVector4i::loadl(&src[16]);
srcs = GSVector4i::loadl<false>(&src[16]);
acc = acc.add32(GSVector4i::load<true>(&resample_coeff[16]).mul32l(srcs.s16to32()));
out[channel] = std::clamp<s32>(acc.addv_s32() >> 14, -32768, 32767);

View File

@ -1683,7 +1683,7 @@ GPUDevice::PresentResult PostProcessing::ReShadeFXShader::Apply(GPUTexture* inpu
case SourceOptionType::ViewportOffset:
{
GSVector4::storel(dst, GSVector4(final_rect));
GSVector4::storel<false>(dst, GSVector4(final_rect));
}
break;