diff --git a/pcsx2/GS/GSVector4i.h b/pcsx2/GS/GSVector4i.h index 5037706a32..07c4044a36 100644 --- a/pcsx2/GS/GSVector4i.h +++ b/pcsx2/GS/GSVector4i.h @@ -177,6 +177,16 @@ public: return sat_i32(a); } + __forceinline bool rintersects(const GSVector4i& v) const + { + return !rintersect(v).rempty(); + } + + __forceinline bool rcontains(const GSVector4i& v) const + { + return rintersect(v).eq(v); + } + template GSVector4i _ralign_helper(const GSVector4i& mask) const { @@ -671,42 +681,68 @@ public: return GSVector4i(_mm_slli_si128(m, i)); } - template - __forceinline GSVector4i sra16() const - { - return GSVector4i(_mm_srai_epi16(m, i)); - } - - template - __forceinline GSVector4i sra32() const - { - return GSVector4i(_mm_srai_epi32(m, i)); - } - - __forceinline GSVector4i sra32(int i) const - { - return GSVector4i(_mm_srai_epi32(m, i)); - } - -#if _M_SSE >= 0x501 - __forceinline GSVector4i srav32(const GSVector4i& v) const - { - return GSVector4i(_mm_srav_epi32(m, v.m)); - } -#endif - - template + template __forceinline GSVector4i sll16() const { return GSVector4i(_mm_slli_epi16(m, i)); } - template + __forceinline GSVector4i sll16(s32 i) const + { + return GSVector4i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); + } + +#if _M_SSE >= 0x501 + __forceinline GSVector4i sllv16(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi16(m, v.m)); } +#endif + + template + __forceinline GSVector4i srl16() const + { + return GSVector4i(_mm_srli_epi16(m, i)); + } + + __forceinline GSVector4i srl16(s32 i) const + { + return GSVector4i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); + } + +#if _M_SSE >= 0x501 + __forceinline GSVector4i srlv16(const GSVector4i& v) const + { + return GSVector4i(_mm_srlv_epi16(m, v.m)); + } +#endif + + template + __forceinline GSVector4i sra16() const + { + return GSVector4i(_mm_srai_epi16(m, i)); + } + + __forceinline GSVector4i sra16(s32 i) const + { + return GSVector4i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); + } + +#if _M_SSE >= 0x501 + __forceinline GSVector4i srav16(const GSVector4i& v) const + { + return GSVector4i(_mm_srav_epi16(m, v.m)); + } +#endif + + template __forceinline GSVector4i sll32() const { return GSVector4i(_mm_slli_epi32(m, i)); } + __forceinline GSVector4i sll32(s32 i) const + { + return GSVector4i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); + } + #if _M_SSE >= 0x501 __forceinline GSVector4i sllv32(const GSVector4i& v) const { @@ -714,32 +750,15 @@ public: } #endif - template - __forceinline GSVector4i sll64() const - { - return GSVector4i(_mm_slli_epi64(m, i)); - } - - template - __forceinline GSVector4i srl16() const - { - return GSVector4i(_mm_srli_epi16(m, i)); - } - - __forceinline GSVector4i srl16(int i) const - { - return GSVector4i(_mm_srli_epi16(m, i)); - } - - template + template __forceinline GSVector4i srl32() const { return GSVector4i(_mm_srli_epi32(m, i)); } - __forceinline GSVector4i srl32(int i) const + __forceinline GSVector4i srl32(s32 i) const { - return GSVector4i(_mm_srli_epi32(m, i)); + return GSVector4i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); } #if _M_SSE >= 0x501 @@ -749,6 +768,72 @@ public: } #endif + template + __forceinline GSVector4i sra32() const + { + return GSVector4i(_mm_srai_epi32(m, i)); + } + + __forceinline GSVector4i sra32(s32 i) const + { + return GSVector4i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); + } + +#if _M_SSE >= 0x501 + __forceinline GSVector4i srav32(const GSVector4i& v) const + { + return GSVector4i(_mm_srav_epi32(m, v.m)); + } +#endif + + template + __forceinline GSVector4i sll64() const + { + return GSVector4i(_mm_slli_epi64(m, i)); + } + + __forceinline GSVector4i sll64(s32 i) const + { + return GSVector4i(_mm_sll_epi64(m, _mm_cvtsi32_si128(i))); + } + +#if _M_SSE >= 0x501 + __forceinline GSVector4i sllv64(const GSVector4i& v) const + { + return GSVector4i(_mm_sllv_epi64(m, v.m)); + } +#endif + + template + __forceinline GSVector4i srl64() const + { + return GSVector4i(_mm_srli_epi64(m, i)); + } + + __forceinline GSVector4i srl64(s32 i) const + { + return GSVector4i(_mm_srl_epi64(m, _mm_cvtsi32_si128(i))); + } + +#if _M_SSE >= 0x501 + __forceinline GSVector4i srlv64(const GSVector4i& v) const + { + return GSVector4i(_mm_srlv_epi64(m, v.m)); + } +#endif + + __forceinline GSVector4i sra64(s32 i) const + { + return GSVector4i(_mm_sra_epi64(m, _mm_cvtsi32_si128(i))); + } + +#if _M_SSE >= 0x501 + __forceinline GSVector4i srav64(const GSVector4i& v) const + { + return GSVector4i(_mm_srav_epi64(m, v.m)); + } +#endif + template __forceinline GSVector4i srl64() const { @@ -967,6 +1052,21 @@ public: return GSVector4i(_mm_cmpgt_epi32(m, v.m)); } + __forceinline GSVector4i ge8(const GSVector4i& v) const + { + return ~GSVector4i(_mm_cmplt_epi8(m, v.m)); + } + + __forceinline GSVector4i ge16(const GSVector4i& v) const + { + return ~GSVector4i(_mm_cmplt_epi16(m, v.m)); + } + + __forceinline GSVector4i ge32(const GSVector4i& v) const + { + return ~GSVector4i(_mm_cmplt_epi32(m, v.m)); + } + __forceinline GSVector4i lt8(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi8(m, v.m)); @@ -982,6 +1082,19 @@ public: return GSVector4i(_mm_cmplt_epi32(m, v.m)); } + __forceinline GSVector4i le8(const GSVector4i& v) const + { + return ~GSVector4i(_mm_cmpgt_epi8(m, v.m)); + } + __forceinline GSVector4i le16(const GSVector4i& v) const + { + return ~GSVector4i(_mm_cmpgt_epi16(m, v.m)); + } + __forceinline GSVector4i le32(const GSVector4i& v) const + { + return ~GSVector4i(_mm_cmpgt_epi32(m, v.m)); + } + __forceinline GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(_mm_andnot_si128(v.m, m)); diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index c5b13a4ae6..e71311ae9e 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -103,21 +103,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData& data, GSScanlineLocalData if (global.sel.mmin && global.sel.lcm) { -#if defined(__GNUC__) && _M_SSE >= 0x501 - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80286 - // - // GCC 4.9/5/6 doesn't generate correct AVX2 code for extract32<0>. It is fixed in GCC7 - // Intrinsic code is _mm_cvtsi128_si32(_mm256_castsi256_si128(m)) - // It seems recent Clang got _mm256_cvtsi256_si32(m) instead. I don't know about GCC. - // - // Generated code keep the integer in an XMM register but bit [64:32] aren't cleared. - // So the srl16 shift will be huge and v will be 0. - // - int lod_x = global.lod.i.x0; - GSVector4i v = global.t.minmax.srl16(lod_x); -#else GSVector4i v = global.t.minmax.srl16(global.lod.i.extract32<0>()); //.x); -#endif v = v.upl16(v);