From 5c03e1d9405b634e342f481f6ee9b7f1c6524fa4 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Fri, 22 Nov 2024 18:44:10 +1000 Subject: [PATCH] Misc: Fix alignment errors on ARM32 --- src/common-tests/gsvector_yuvtorgb_test.cpp | 4 +- src/common/gsvector.cpp | 2 +- src/common/gsvector.h | 2 +- src/common/gsvector_neon.h | 138 ++++++++++++++++++-- src/common/gsvector_nosimd.h | 47 ++++++- src/common/gsvector_sse.h | 28 +++- src/core/bus.cpp | 2 + src/core/gpu.h | 4 +- src/core/gpu_hw.cpp | 18 +-- src/core/gpu_hw.h | 2 +- src/core/gpu_hw_texture_cache.cpp | 8 +- src/core/gpu_sw.cpp | 12 +- src/core/gpu_sw_rasterizer.inl | 4 +- src/core/mdec.cpp | 12 +- src/core/spu.cpp | 2 +- src/util/postprocessing_shader_fx.cpp | 2 +- 16 files changed, 230 insertions(+), 57 deletions(-) diff --git a/src/common-tests/gsvector_yuvtorgb_test.cpp b/src/common-tests/gsvector_yuvtorgb_test.cpp index 3153b2822..edd435c11 100644 --- a/src/common-tests/gsvector_yuvtorgb_test.cpp +++ b/src/common-tests/gsvector_yuvtorgb_test.cpp @@ -15,8 +15,8 @@ static void YUVToRGB_Vector(const std::array& Crblk, const std::array(&Crblk[(y / 2) * 8]).s16to32(); + const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(y / 2) * 8]).s16to32(); const GSVector4i Y = GSVector4i::load(&Yblk[y * 8]); // BT.601 YUV->RGB coefficients, rounding formula from Mednafen. diff --git a/src/common/gsvector.cpp b/src/common/gsvector.cpp index 448ddcae8..392b9da44 100644 --- a/src/common/gsvector.cpp +++ b/src/common/gsvector.cpp @@ -53,7 +53,7 @@ GSMatrix2x2 GSMatrix2x2::Rotation(float angle_in_radians) GSVector2 GSMatrix2x2::row(size_t i) const { - return GSVector2::load(&E[i][0]); + return GSVector2::load(&E[i][0]); } GSVector2 GSMatrix2x2::col(size_t i) const diff --git a/src/common/gsvector.h b/src/common/gsvector.h index b343a43c9..3bccfa3ca 100644 --- a/src/common/gsvector.h +++ b/src/common/gsvector.h @@ -35,5 +35,5 @@ public: void store(void* m); - float E[2][2]; + alignas(8) float E[2][2]; }; diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h index 9a558269f..c8efec076 100644 --- a/src/common/gsvector_neon.h +++ b/src/common/gsvector_neon.h @@ -690,7 +690,16 @@ public: ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); } - ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(vld1_s32((const int32_t*)p)); } + template + ALWAYS_INLINE static GSVector2i load(const void* p) + { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p))); +#endif + + return GSVector2i(vld1_s32((const int32_t*)p)); + } ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { @@ -698,7 +707,19 @@ public: std::memcpy(p, &val, sizeof(s32)); } - ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { vst1_s32((int32_t*)p, v.v2s); } + template + ALWAYS_INLINE static void store(void* p, const GSVector2i& v) + { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + { + vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s)); + return; + } +#endif + + vst1_s32((int32_t*)p, v.v2s); + } ALWAYS_INLINE void operator&=(const GSVector2i& v) { @@ -903,9 +924,30 @@ public: ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); } - ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(vld1_f32(static_cast(p))); } + template + ALWAYS_INLINE static GSVector2 load(const void* p) + { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p))); +#endif - ALWAYS_INLINE static void store(void* p, const GSVector2& v) { vst1_f32(static_cast(p), v.v2s); } + return GSVector2(vld1_f32(static_cast(p))); + } + + template + ALWAYS_INLINE static void store(void* p, const GSVector2& v) + { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + { + vst1_s8(static_cast(p), vreinterpret_s8_f32(v.v2s)); + return; + } +#endif + + vst1_f32(static_cast(p), v.v2s); + } ALWAYS_INLINE GSVector2 operator-() const { return neg(); } @@ -2134,13 +2176,25 @@ public: ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); } + template ALWAYS_INLINE static GSVector4i loadl(const void* p) { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0))); +#endif + return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0))); } + template ALWAYS_INLINE static GSVector4i loadh(const void* p) { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p)))); +#endif + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p)))); } @@ -2149,6 +2203,11 @@ public: template ALWAYS_INLINE static GSVector4i load(const void* p) { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p))); +#endif + return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p))); } @@ -2167,19 +2226,45 @@ public: std::memcpy(p, &val, sizeof(u32)); } + template ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + { + vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s))); + return; + } +#endif + vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s))); } + template ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + { + vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s))); + return; + } +#endif + vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s))); } template ALWAYS_INLINE static void store(void* p, const GSVector4i& v) { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + { + vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s)); + return; + } +#endif + vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s)); } @@ -2652,8 +2737,14 @@ public: ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); } + template ALWAYS_INLINE static GSVector4 loadl(const void* p) { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0))); +#endif + return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0))); } @@ -2662,32 +2753,55 @@ public: template ALWAYS_INLINE static GSVector4 load(const void* p) { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p))); +#endif + return GSVector4(vld1q_f32((const float*)p)); } ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); } + template ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { -#ifdef CPU_ARCH_ARM64 - vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s))); -#else - vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s))); +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + { + vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s))); + return; + } #endif + + vst1_f32((float*)p, vget_low_f32(v.v4s)); } + template ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { -#ifdef CPU_ARCH_ARM64 - vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s))); -#else - vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s))); +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + { + vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s))); + return; + } #endif + + vst1_f32((float*)p, vget_high_f32(v.v4s)); } template ALWAYS_INLINE static void store(void* p, const GSVector4& v) { +#ifdef CPU_ARCH_ARM32 + if constexpr (!aligned) + { + vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s)); + return; + } +#endif + vst1q_f32((float*)p, v.v4s); } diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h index b7a719858..6d73592d9 100644 --- a/src/common/gsvector_nosimd.h +++ b/src/common/gsvector_nosimd.h @@ -467,6 +467,7 @@ public: ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(v, 0); } + template ALWAYS_INLINE static GSVector2i load(const void* p) { GSVector2i ret; @@ -474,7 +475,11 @@ public: return ret; } - ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { std::memcpy(p, v.S32, sizeof(S32)); } + template + ALWAYS_INLINE static void store(void* p, const GSVector2i& v) + { + std::memcpy(p, v.S32, sizeof(S32)); + } ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); } @@ -658,6 +663,7 @@ public: return ret; } + template ALWAYS_INLINE static GSVector2 load(const void* p) { GSVector2 ret; @@ -665,7 +671,11 @@ public: return ret; } - ALWAYS_INLINE static void store(void* p, const GSVector2& v) { std::memcpy(p, &v.F32, sizeof(F32)); } + template + ALWAYS_INLINE static void store(void* p, const GSVector2& v) + { + std::memcpy(p, &v.F32, sizeof(F32)); + } ALWAYS_INLINE GSVector2 operator-() const { return neg(); } @@ -1530,6 +1540,7 @@ public: ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(v, 0, 0, 0); } + template ALWAYS_INLINE static GSVector4i loadl(const void* p) { GSVector4i ret; @@ -1538,6 +1549,7 @@ public: return ret; } + template ALWAYS_INLINE static GSVector4i loadh(const void* p) { GSVector4i ret; @@ -1546,7 +1558,11 @@ public: return ret; } - ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); } + template + ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) + { + return loadh(&v); + } template ALWAYS_INLINE static GSVector4i load(const void* p) @@ -1558,9 +1574,17 @@ public: ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.S32, sizeof(v.S32)); } - ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[0], sizeof(s32) * 2); } + template + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) + { + std::memcpy(p, &v.S32[0], sizeof(s32) * 2); + } - ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.S32[2], sizeof(s32) * 2); } + template + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) + { + std::memcpy(p, &v.S32[2], sizeof(s32) * 2); + } template ALWAYS_INLINE static void store(void* p, const GSVector4i& v) @@ -1958,6 +1982,7 @@ public: return ret; } + template ALWAYS_INLINE static GSVector4 loadl(const void* p) { GSVector4 ret; @@ -1977,9 +2002,17 @@ public: ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); } - ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { std::memcpy(p, &v.x, sizeof(float) * 2); } + template + ALWAYS_INLINE static void storel(void* p, const GSVector4& v) + { + std::memcpy(p, &v.x, sizeof(float) * 2); + } - ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { std::memcpy(p, &v.z, sizeof(float) * 2); } + template + ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) + { + std::memcpy(p, &v.z, sizeof(float) * 2); + } template ALWAYS_INLINE static void store(void* p, const GSVector4& v) diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h index 86517ed56..c37d50d2f 100644 --- a/src/common/gsvector_sse.h +++ b/src/common/gsvector_sse.h @@ -585,12 +585,19 @@ public: ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); } ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(_mm_cvtsi32_si128(v)); } + + template ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(_mm_loadl_epi64(static_cast(p))); } - ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } + template + ALWAYS_INLINE static void store(void* p, const GSVector2i& v) + { + _mm_storel_epi64(static_cast<__m128i*>(p), v.m); + } + ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); } ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v) @@ -806,11 +813,13 @@ public: ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); } + template ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast(p)))); } + template ALWAYS_INLINE static void store(void* p, const GSVector2& v) { _mm_store_sd(static_cast(p), _mm_castps_pd(v.m)); @@ -1711,16 +1720,19 @@ public: ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); } ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(_mm_cvtsi32_si128(v)); } + template ALWAYS_INLINE static GSVector4i loadl(const void* p) { return GSVector4i(_mm_loadl_epi64(static_cast(p))); } + template ALWAYS_INLINE static GSVector4i loadh(const void* p) { return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast(p)))); } + template ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(_mm_unpacklo_epi64(_mm_setzero_si128(), v.m)); @@ -1734,7 +1746,14 @@ public: } ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); } - ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); } + + template + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) + { + _mm_storel_epi64(static_cast<__m128i*>(p), v.m); + } + + template ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m)); @@ -2115,6 +2134,7 @@ public: ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); } + template ALWAYS_INLINE static GSVector4 loadl(const void* p) { return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast(p)))); @@ -2127,10 +2147,14 @@ public: } ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast(p), v.m); } + + template ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { _mm_store_sd(static_cast(p), _mm_castps_pd(v.m)); } + + template ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { _mm_storeh_pd(static_cast(p), _mm_castps_pd(v.m)); diff --git a/src/core/bus.cpp b/src/core/bus.cpp index 2c7bcb3ca..2c04e8680 100644 --- a/src/core/bus.cpp +++ b/src/core/bus.cpp @@ -549,7 +549,9 @@ u8* Bus::GetLUTFastmemPointer(u32 address, u8* ram_ptr) void Bus::MapFastmemViews() { +#ifdef ENABLE_MMAP_FASTMEM Assert(s_fastmem_ram_views.empty()); +#endif const CPUFastmemMode mode = g_settings.cpu_fastmem_mode; if (mode == CPUFastmemMode::MMap) diff --git a/src/core/gpu.h b/src/core/gpu.h index a4f858794..a98b20cbb 100644 --- a/src/core/gpu.h +++ b/src/core/gpu.h @@ -345,8 +345,8 @@ protected: // However, usually it'll undershoot not overshoot. If we wanted to make this more accurate, we'd need to intersect // the edges with the clip rectangle. // TODO: Coordinates are exclusive, so off by one here... - const GSVector2i clamp_min = GSVector2i::load(&m_clamped_drawing_area.x); - const GSVector2i clamp_max = GSVector2i::load(&m_clamped_drawing_area.z); + const GSVector2i clamp_min = GSVector2i::load(&m_clamped_drawing_area.x); + const GSVector2i clamp_max = GSVector2i::load(&m_clamped_drawing_area.z); v1 = v1.sat_s32(clamp_min, clamp_max); v2 = v2.sat_s32(clamp_min, clamp_max); v3 = v3.sat_s32(clamp_min, clamp_max); diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index 13dd40ebd..3b93b2f3d 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -2570,9 +2570,9 @@ void GPU_HW::LoadVertices() } // Cull polygons which are too large. - const GSVector2 v0f = GSVector2::load(&vertices[0].x); - const GSVector2 v1f = GSVector2::load(&vertices[1].x); - const GSVector2 v2f = GSVector2::load(&vertices[2].x); + const GSVector2 v0f = GSVector2::load(&vertices[0].x); + const GSVector2 v1f = GSVector2::load(&vertices[1].x); + const GSVector2 v2f = GSVector2::load(&vertices[2].x); const GSVector2 min_pos_12 = v1f.min(v2f); const GSVector2 max_pos_12 = v1f.max(v2f); const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f)))) @@ -2617,7 +2617,7 @@ void GPU_HW::LoadVertices() // quads if (rc.quad_polygon) { - const GSVector2 v3f = GSVector2::load(&vertices[3].x); + const GSVector2 v3f = GSVector2::load(&vertices[3].x); const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f)))) .add32(GSVector4i::cxpr(0, 0, 1, 1)); const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area); @@ -2845,9 +2845,9 @@ void GPU_HW::LoadVertices() { GPUBackendDrawLineCommand* cmd = m_sw_renderer->NewDrawLineCommand(2); FillDrawCommand(cmd, rc); - GSVector4i::storel(&cmd->vertices[0], bounds); + GSVector4i::storel(&cmd->vertices[0], bounds); cmd->vertices[0].color = start_color; - GSVector4i::storeh(&cmd->vertices[1], bounds); + GSVector4i::storeh(&cmd->vertices[1], bounds); cmd->vertices[1].color = end_color; m_sw_renderer->PushCommand(cmd); } @@ -2870,7 +2870,7 @@ void GPU_HW::LoadVertices() { cmd = m_sw_renderer->NewDrawLineCommand(num_vertices); FillDrawCommand(cmd, rc); - GSVector2i::store(&cmd->vertices[0].x, start_pos); + GSVector2i::store(&cmd->vertices[0].x, start_pos); cmd->vertices[0].color = start_color; } else @@ -2905,7 +2905,7 @@ void GPU_HW::LoadVertices() if (cmd) { - GSVector2i::store(&cmd->vertices[i], end_pos); + GSVector2i::store(&cmd->vertices[i], end_pos); cmd->vertices[i].color = end_color; } } @@ -2978,7 +2978,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect) const GPUTextureMode tmode = m_draw_mode.mode_reg.texture_mode; const u32 xshift = (tmode >= GPUTextureMode::Direct16Bit) ? 0 : (2 - static_cast(tmode)); - const GSVector4i page_offset = GSVector4i::loadl(m_current_texture_page_offset).xyxy(); + const GSVector4i page_offset = GSVector4i::loadl(m_current_texture_page_offset).xyxy(); uv_rect = uv_rect.blend32<5>(uv_rect.srl32(xshift)); // shift only goes on the x uv_rect = uv_rect.add32(page_offset); // page offset diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h index a35f88bad..46bdc78fa 100644 --- a/src/core/gpu_hw.h +++ b/src/core/gpu_hw.h @@ -317,7 +317,7 @@ private: GSVector4i m_vram_dirty_write_rect = INVALID_RECT; // TODO: Don't use in TC mode, should be kept at zero. GSVector4i m_current_uv_rect = INVALID_RECT; GSVector4i m_current_draw_rect = INVALID_RECT; - s32 m_current_texture_page_offset[2] = {}; + alignas(8) s32 m_current_texture_page_offset[2] = {}; std::unique_ptr m_wireframe_pipeline; diff --git a/src/core/gpu_hw_texture_cache.cpp b/src/core/gpu_hw_texture_cache.cpp index 1efc0988d..f7f825887 100644 --- a/src/core/gpu_hw_texture_cache.cpp +++ b/src/core/gpu_hw_texture_cache.cpp @@ -3305,8 +3305,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash, // TODO: Use rects instead of fullscreen tris, maybe avoid the copy.. alignas(VECTOR_ALIGNMENT) float uniforms[4]; GSVector2 texture_size = GSVector2(GSVector2i(entry->texture->GetWidth(), entry->texture->GetHeight())); - GSVector2::store(&uniforms[0], texture_size); - GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size); + GSVector2::store(&uniforms[0], texture_size); + GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size); g_gpu_device->InvalidateRenderTarget(s_state.replacement_texture_render_target.get()); g_gpu_device->SetRenderTarget(s_state.replacement_texture_render_target.get()); g_gpu_device->SetViewportAndScissor(0, 0, new_width, new_height); @@ -3325,8 +3325,8 @@ void GPUTextureCache::ApplyTextureReplacements(SourceKey key, HashType tex_hash, const GSVector4i dst_rect = GSVector4i(GSVector4(si.dst_rect) * max_scale_v); texture_size = GSVector2(GSVector2i(temp_texture->GetWidth(), temp_texture->GetHeight())); - GSVector2::store(&uniforms[0], texture_size); - GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size); + GSVector2::store(&uniforms[0], texture_size); + GSVector2::store(&uniforms[2], GSVector2::cxpr(1.0f) / texture_size); g_gpu_device->SetViewportAndScissor(dst_rect); g_gpu_device->SetTextureSampler(0, temp_texture.get(), g_gpu_device->GetNearestSampler()); g_gpu_device->SetPipeline(si.invert_alpha ? s_state.replacement_semitransparent_draw_pipeline.get() : diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp index 10eb5a5bc..068db4164 100644 --- a/src/core/gpu_sw.cpp +++ b/src/core/gpu_sw.cpp @@ -537,7 +537,7 @@ void GPU_SW::DispatchRenderCommand() vert->x = m_drawing_offset.x + vp.x; vert->y = m_drawing_offset.y + vp.y; vert->texcoord = textured ? Truncate16(FifoPop()) : 0; - positions[i] = GSVector2i::load(&vert->x); + positions[i] = GSVector2i::load(&vert->x); } // Cull polygons which are too large. @@ -686,8 +686,8 @@ void GPU_SW::DispatchRenderCommand() cmd->vertices[1].y = m_drawing_offset.y + end_pos.y; } - const GSVector4i v0 = GSVector4i::loadl(&cmd->vertices[0].x); - const GSVector4i v1 = GSVector4i::loadl(&cmd->vertices[1].x); + const GSVector4i v0 = GSVector4i::loadl(&cmd->vertices[0].x); + const GSVector4i v1 = GSVector4i::loadl(&cmd->vertices[1].x); const GSVector4i rect = v0.min_s32(v1).xyxy(v0.max_s32(v1)).add32(GSVector4i::cxpr(0, 0, 1, 1)); const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area); @@ -711,7 +711,7 @@ void GPU_SW::DispatchRenderCommand() u32 buffer_pos = 0; const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]}; - const GSVector2i draw_offset = GSVector2i::load(&m_drawing_offset.x); + const GSVector2i draw_offset = GSVector2i::load(&m_drawing_offset.x); GSVector2i start_pos = GSVector2i(start_vp.x, start_vp.y).add32(draw_offset); u32 start_color = m_render_command.color_for_first_vertex; @@ -740,9 +740,9 @@ void GPU_SW::DispatchRenderCommand() GPUBackendDrawLineCommand::Vertex* out_vertex = &cmd->vertices[out_vertex_count]; out_vertex_count += 2; - GSVector2i::store(&out_vertex[0].x, start_pos); + GSVector2i::store(&out_vertex[0].x, start_pos); out_vertex[0].color = start_color; - GSVector2i::store(&out_vertex[1].x, end_pos); + GSVector2i::store(&out_vertex[1].x, end_pos); out_vertex[1].color = end_color; } diff --git a/src/core/gpu_sw_rasterizer.inl b/src/core/gpu_sw_rasterizer.inl index 128f3af9c..3343862a9 100644 --- a/src/core/gpu_sw_rasterizer.inl +++ b/src/core/gpu_sw_rasterizer.inl @@ -397,7 +397,7 @@ ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y) { if (x <= (VRAM_WIDTH - 4)) { - return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32(); + return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32(); } else { @@ -415,7 +415,7 @@ ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color) const GSVector4i packed_color = color.pu32(); if (x <= (VRAM_WIDTH - 4)) { - GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color); + GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color); } else { diff --git a/src/core/mdec.cpp b/src/core/mdec.cpp index de2029833..f5a54f060 100644 --- a/src/core/mdec.cpp +++ b/src/core/mdec.cpp @@ -711,10 +711,10 @@ void MDEC::CopyOutBlock(void* param, TickCount ticks, TickCount ticks_late) for (u32 index = 0; index < s_state.block_rgb.size(); index += 16) { - const GSVector4i rgbx0 = GSVector4i::load(&s_state.block_rgb[index]); - const GSVector4i rgbx1 = GSVector4i::load(&s_state.block_rgb[index + 4]); - const GSVector4i rgbx2 = GSVector4i::load(&s_state.block_rgb[index + 8]); - const GSVector4i rgbx3 = GSVector4i::load(&s_state.block_rgb[index + 12]); + const GSVector4i rgbx0 = GSVector4i::load(&s_state.block_rgb[index]); + const GSVector4i rgbx1 = GSVector4i::load(&s_state.block_rgb[index + 4]); + const GSVector4i rgbx2 = GSVector4i::load(&s_state.block_rgb[index + 8]); + const GSVector4i rgbx3 = GSVector4i::load(&s_state.block_rgb[index + 12]); GSVector4i::store(&rgbp[0], rgbx0.shuffle8(mask00) | rgbx1.shuffle8(mask01)); GSVector4i::store(&rgbp[4], rgbx1.shuffle8(mask11) | rgbx2.shuffle8(mask12)); @@ -1048,8 +1048,8 @@ void MDEC::YUVToRGB_New(u32 xx, u32 yy, const std::array& Crblk, const const GSVector4i addval = s_state.status.data_output_signed ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080); for (u32 y = 0; y < 8; y++) { - const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32(); - const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32(); + const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32(); + const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32(); const GSVector4i Y = GSVector4i::load(&Yblk[y * 8]); // BT.601 YUV->RGB coefficients, rounding formula from Mednafen. diff --git a/src/core/spu.cpp b/src/core/spu.cpp index 4800ec264..f24c8c8b2 100644 --- a/src/core/spu.cpp +++ b/src/core/spu.cpp @@ -2318,7 +2318,7 @@ void SPU::ProcessReverb(s32 left_in, s32 right_in, s32* left_out, s32* right_out srcs = GSVector4i::load(&src[8]); acc = acc.add32(GSVector4i::load(&resample_coeff[8]).mul32l(srcs.s16to32())); acc = acc.add32(GSVector4i::load(&resample_coeff[12]).mul32l(srcs.uph64().s16to32())); - srcs = GSVector4i::loadl(&src[16]); + srcs = GSVector4i::loadl(&src[16]); acc = acc.add32(GSVector4i::load(&resample_coeff[16]).mul32l(srcs.s16to32())); out[channel] = std::clamp(acc.addv_s32() >> 14, -32768, 32767); diff --git a/src/util/postprocessing_shader_fx.cpp b/src/util/postprocessing_shader_fx.cpp index 6486ce9ae..dc4234048 100644 --- a/src/util/postprocessing_shader_fx.cpp +++ b/src/util/postprocessing_shader_fx.cpp @@ -1683,7 +1683,7 @@ GPUDevice::PresentResult PostProcessing::ReShadeFXShader::Apply(GPUTexture* inpu case SourceOptionType::ViewportOffset: { - GSVector4::storel(dst, GSVector4(final_rect)); + GSVector4::storel(dst, GSVector4(final_rect)); } break;