GS: Tidy up shifts in GSVector4i

This commit is contained in:
Stenzek 2024-06-08 23:10:18 +10:00 committed by Connor McLaughlin
parent 4731c6d290
commit 94fc34dd62
2 changed files with 159 additions and 60 deletions

View File

@ -177,6 +177,16 @@ public:
return sat_i32(a);
}
// Reports whether rintersect(v) yields a non-empty rectangle.
__forceinline bool rintersects(const GSVector4i& v) const
{
	const GSVector4i overlap = rintersect(v);
	return !overlap.rempty();
}
// Reports whether intersecting with v gives back v itself (compared with eq()).
__forceinline bool rcontains(const GSVector4i& v) const
{
	const GSVector4i clipped = rintersect(v);
	return clipped.eq(v);
}
template <Align_Mode mode>
GSVector4i _ralign_helper(const GSVector4i& mask) const
{
@ -671,42 +681,68 @@ public:
return GSVector4i(_mm_slli_si128(m, i));
}
// Arithmetic (sign-propagating) right shift of every 16-bit lane by the compile-time count i.
template <int i>
__forceinline GSVector4i sra16() const
{
	const __m128i shifted = _mm_srai_epi16(m, i);
	return GSVector4i(shifted);
}
// Arithmetic (sign-propagating) right shift of every 32-bit lane by the compile-time count i.
template <int i>
__forceinline GSVector4i sra32() const
{
	const __m128i shifted = _mm_srai_epi32(m, i);
	return GSVector4i(shifted);
}
// Arithmetic right shift of every 32-bit lane by the run-time count i.
__forceinline GSVector4i sra32(int i) const
{
	const __m128i shifted = _mm_srai_epi32(m, i);
	return GSVector4i(shifted);
}
#if _M_SSE >= 0x501
// Arithmetic right shift of every 32-bit lane by the count in the matching 32-bit lane of v.
__forceinline GSVector4i srav32(const GSVector4i& v) const
{
	const __m128i shifted = _mm_srav_epi32(m, v.m);
	return GSVector4i(shifted);
}
#endif
// Left shift of every 16-bit lane by the compile-time count i (zero fill).
// Only one template header is kept: two stacked headers on a single member are ill-formed.
template <s32 i>
__forceinline GSVector4i sll16() const
{
	return GSVector4i(_mm_slli_epi16(m, i));
}
// Left shift of every 16-bit lane by the run-time count i (zero fill).
// The count is moved into the low 32 bits of an XMM register (upper bits zeroed), so a
// negative i is seen as a huge unsigned count and yields zero.
// This is a plain overload: a leading template header would make parameter `i` shadow
// the template parameter and fail to compile.
__forceinline GSVector4i sll16(s32 i) const
{
	return GSVector4i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i)));
}
#if _M_SSE >= 0x501
// Left shift of every 16-bit lane by the count in the matching 16-bit lane of v.
// Counts above 15 produce zero.
__forceinline GSVector4i sllv16(const GSVector4i& v) const
{
#if defined(__AVX512BW__) && defined(__AVX512VL__)
	// The native 16-bit variable shift is an AVX-512BW/VL instruction, not AVX2.
	return GSVector4i(_mm_sllv_epi16(m, v.m));
#else
	// AVX2 fallback: treat even and odd 16-bit lanes as 32-bit values and recombine.
	const __m128i low_half = _mm_set1_epi32(0x0000FFFF);
	const __m128i even = _mm_and_si128(
		_mm_sllv_epi32(_mm_and_si128(m, low_half), _mm_and_si128(v.m, low_half)), low_half);
	const __m128i odd = _mm_slli_epi32(
		_mm_sllv_epi32(_mm_srli_epi32(m, 16), _mm_srli_epi32(v.m, 16)), 16);
	return GSVector4i(_mm_or_si128(even, odd));
#endif
}
#endif
// Logical (zero-fill) right shift of every 16-bit lane by the compile-time count i.
template <s32 i>
__forceinline GSVector4i srl16() const
{
	const __m128i shifted = _mm_srli_epi16(m, i);
	return GSVector4i(shifted);
}
// Logical (zero-fill) right shift of every 16-bit lane by the run-time count i.
// The count is passed through the low 32 bits of an XMM register.
__forceinline GSVector4i srl16(s32 i) const
{
	const __m128i count = _mm_cvtsi32_si128(i);
	const __m128i shifted = _mm_srl_epi16(m, count);
	return GSVector4i(shifted);
}
#if _M_SSE >= 0x501
// Logical right shift of every 16-bit lane by the count in the matching 16-bit lane of v.
// Counts above 15 produce zero.
__forceinline GSVector4i srlv16(const GSVector4i& v) const
{
#if defined(__AVX512BW__) && defined(__AVX512VL__)
	// The native 16-bit variable shift is an AVX-512BW/VL instruction, not AVX2.
	return GSVector4i(_mm_srlv_epi16(m, v.m));
#else
	// AVX2 fallback: shift even lanes in the low half and odd lanes in the high half
	// of each 32-bit element, then merge.
	const __m128i low_half = _mm_set1_epi32(0x0000FFFF);
	const __m128i even = _mm_srlv_epi32(_mm_and_si128(m, low_half), _mm_and_si128(v.m, low_half));
	const __m128i odd = _mm_andnot_si128(
		low_half, _mm_srlv_epi32(_mm_andnot_si128(low_half, m), _mm_srli_epi32(v.m, 16)));
	return GSVector4i(_mm_or_si128(even, odd));
#endif
}
#endif
// Arithmetic (sign-propagating) right shift of every 16-bit lane by the compile-time count i.
template <s32 i>
__forceinline GSVector4i sra16() const
{
	const __m128i shifted = _mm_srai_epi16(m, i);
	return GSVector4i(shifted);
}
// Arithmetic right shift of every 16-bit lane by the run-time count i.
// The count is passed through the low 32 bits of an XMM register.
__forceinline GSVector4i sra16(s32 i) const
{
	const __m128i count = _mm_cvtsi32_si128(i);
	const __m128i shifted = _mm_sra_epi16(m, count);
	return GSVector4i(shifted);
}
#if _M_SSE >= 0x501
// Arithmetic right shift of every 16-bit lane by the count in the matching 16-bit lane of v.
// Counts above 15 fill the lane with its sign bit.
__forceinline GSVector4i srav16(const GSVector4i& v) const
{
#if defined(__AVX512BW__) && defined(__AVX512VL__)
	// The native 16-bit variable shift is an AVX-512BW/VL instruction, not AVX2.
	return GSVector4i(_mm_srav_epi16(m, v.m));
#else
	// AVX2 fallback: move each 16-bit lane to the top of a 32-bit element so the
	// 32-bit arithmetic shift propagates the correct sign, then merge the halves.
	const __m128i low_half = _mm_set1_epi32(0x0000FFFF);
	const __m128i even = _mm_srli_epi32(
		_mm_srav_epi32(_mm_slli_epi32(m, 16), _mm_and_si128(v.m, low_half)), 16);
	const __m128i odd = _mm_andnot_si128(low_half, _mm_srav_epi32(m, _mm_srli_epi32(v.m, 16)));
	return GSVector4i(_mm_or_si128(even, odd));
#endif
}
#endif
// Left shift of every 32-bit lane by the compile-time count i (zero fill).
template <s32 i>
__forceinline GSVector4i sll32() const
{
	const __m128i shifted = _mm_slli_epi32(m, i);
	return GSVector4i(shifted);
}
// Left shift of every 32-bit lane by the run-time count i (zero fill).
// The count is passed through the low 32 bits of an XMM register.
__forceinline GSVector4i sll32(s32 i) const
{
	const __m128i count = _mm_cvtsi32_si128(i);
	const __m128i shifted = _mm_sll_epi32(m, count);
	return GSVector4i(shifted);
}
#if _M_SSE >= 0x501
__forceinline GSVector4i sllv32(const GSVector4i& v) const
{
@ -714,32 +750,15 @@ public:
}
#endif
// Left shift of every 64-bit lane by the compile-time count i (zero fill).
template <int i>
__forceinline GSVector4i sll64() const
{
	const __m128i shifted = _mm_slli_epi64(m, i);
	return GSVector4i(shifted);
}
// Logical (zero-fill) right shift of every 16-bit lane by the compile-time count i.
template <int i>
__forceinline GSVector4i srl16() const
{
	const __m128i shifted = _mm_srli_epi16(m, i);
	return GSVector4i(shifted);
}
// Logical (zero-fill) right shift of every 16-bit lane by the run-time count i.
__forceinline GSVector4i srl16(int i) const
{
	const __m128i shifted = _mm_srli_epi16(m, i);
	return GSVector4i(shifted);
}
// Logical (zero-fill) right shift of every 32-bit lane by the compile-time count i.
// Only one template header is kept: two stacked headers on a single member are ill-formed.
template <s32 i>
__forceinline GSVector4i srl32() const
{
	return GSVector4i(_mm_srli_epi32(m, i));
}
// Logical (zero-fill) right shift of every 32-bit lane by the run-time count i.
// A single signature and a single return are kept. The count goes through the low
// 32 bits of an XMM register, matching the other run-time shift overloads, rather
// than being fed to the immediate-form intrinsic.
__forceinline GSVector4i srl32(s32 i) const
{
	return GSVector4i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i)));
}
#if _M_SSE >= 0x501
@ -749,6 +768,72 @@ public:
}
#endif
// Arithmetic (sign-propagating) right shift of every 32-bit lane by the compile-time count i.
template <s32 i>
__forceinline GSVector4i sra32() const
{
	const __m128i shifted = _mm_srai_epi32(m, i);
	return GSVector4i(shifted);
}
// Arithmetic right shift of every 32-bit lane by the run-time count i.
// The count is passed through the low 32 bits of an XMM register.
__forceinline GSVector4i sra32(s32 i) const
{
	const __m128i count = _mm_cvtsi32_si128(i);
	const __m128i shifted = _mm_sra_epi32(m, count);
	return GSVector4i(shifted);
}
#if _M_SSE >= 0x501
// Arithmetic right shift of every 32-bit lane by the count in the matching 32-bit lane of v.
__forceinline GSVector4i srav32(const GSVector4i& v) const
{
	const __m128i shifted = _mm_srav_epi32(m, v.m);
	return GSVector4i(shifted);
}
#endif
// Left shift of every 64-bit lane by the compile-time count i (zero fill).
template <s64 i>
__forceinline GSVector4i sll64() const
{
	const __m128i shifted = _mm_slli_epi64(m, i);
	return GSVector4i(shifted);
}
// Left shift of every 64-bit lane by the run-time count i (zero fill).
// The count is passed through the low 32 bits of an XMM register.
__forceinline GSVector4i sll64(s32 i) const
{
	const __m128i count = _mm_cvtsi32_si128(i);
	const __m128i shifted = _mm_sll_epi64(m, count);
	return GSVector4i(shifted);
}
#if _M_SSE >= 0x501
// Left shift of every 64-bit lane by the count in the matching 64-bit lane of v.
__forceinline GSVector4i sllv64(const GSVector4i& v) const
{
	const __m128i shifted = _mm_sllv_epi64(m, v.m);
	return GSVector4i(shifted);
}
#endif
// Logical (zero-fill) right shift of every 64-bit lane by the compile-time count i.
template <s64 i>
__forceinline GSVector4i srl64() const
{
	const __m128i shifted = _mm_srli_epi64(m, i);
	return GSVector4i(shifted);
}
// Logical (zero-fill) right shift of every 64-bit lane by the run-time count i.
// The count is passed through the low 32 bits of an XMM register.
__forceinline GSVector4i srl64(s32 i) const
{
	const __m128i count = _mm_cvtsi32_si128(i);
	const __m128i shifted = _mm_srl_epi64(m, count);
	return GSVector4i(shifted);
}
#if _M_SSE >= 0x501
// Logical right shift of every 64-bit lane by the count in the matching 64-bit lane of v.
__forceinline GSVector4i srlv64(const GSVector4i& v) const
{
	const __m128i shifted = _mm_srlv_epi64(m, v.m);
	return GSVector4i(shifted);
}
#endif
// Arithmetic (sign-propagating) right shift of every 64-bit lane by the run-time count i.
// Counts above 63 (including a negative i, which is zero-extended into the count
// register) fill each lane with its sign bit.
__forceinline GSVector4i sra64(s32 i) const
{
#if defined(__AVX512F__) && defined(__AVX512VL__)
	// The native 64-bit arithmetic shift only exists with AVX-512F/VL.
	return GSVector4i(_mm_sra_epi64(m, _mm_cvtsi32_si128(i)));
#else
	// SSE2 fallback: do a logical shift, then OR the sign into the vacated high bits.
	const __m128i count = _mm_cvtsi32_si128(i);
	// Broadcast the sign of each 64-bit lane across the whole lane.
	const __m128i sign = _mm_shuffle_epi32(_mm_srai_epi32(m, 31), _MM_SHUFFLE(3, 3, 1, 1));
	// Ones in the bit positions that still hold shifted-down data.
	const __m128i kept_bits = _mm_srl_epi64(_mm_set1_epi32(-1), count);
	const __m128i logical = _mm_srl_epi64(m, count);
	return GSVector4i(_mm_or_si128(logical, _mm_andnot_si128(kept_bits, sign)));
#endif
}
#if _M_SSE >= 0x501
// Arithmetic right shift of every 64-bit lane by the count in the matching 64-bit lane of v.
// Counts above 63 fill the lane with its sign bit.
__forceinline GSVector4i srav64(const GSVector4i& v) const
{
#if defined(__AVX512F__) && defined(__AVX512VL__)
	// The native 64-bit variable arithmetic shift only exists with AVX-512F/VL.
	return GSVector4i(_mm_srav_epi64(m, v.m));
#else
	// AVX2 fallback: do a logical variable shift, then OR the sign into the vacated high bits.
	// Broadcast the sign of each 64-bit lane across the whole lane.
	const __m128i sign = _mm_shuffle_epi32(_mm_srai_epi32(m, 31), _MM_SHUFFLE(3, 3, 1, 1));
	// Ones in the bit positions that still hold shifted-down data.
	const __m128i kept_bits = _mm_srlv_epi64(_mm_set1_epi32(-1), v.m);
	const __m128i logical = _mm_srlv_epi64(m, v.m);
	return GSVector4i(_mm_or_si128(logical, _mm_andnot_si128(kept_bits, sign)));
#endif
}
#endif
template <int i>
__forceinline GSVector4i srl64() const
{
@ -967,6 +1052,21 @@ public:
return GSVector4i(_mm_cmpgt_epi32(m, v.m));
}
// Per-byte "this >= v", formed by applying operator~ to the signed less-than compare mask.
__forceinline GSVector4i ge8(const GSVector4i& v) const
{
	const GSVector4i less(_mm_cmplt_epi8(m, v.m));
	return ~less;
}
// Per-16-bit-lane "this >= v", formed by applying operator~ to the signed less-than compare mask.
__forceinline GSVector4i ge16(const GSVector4i& v) const
{
	const GSVector4i less(_mm_cmplt_epi16(m, v.m));
	return ~less;
}
// Per-32-bit-lane "this >= v", formed by applying operator~ to the signed less-than compare mask.
__forceinline GSVector4i ge32(const GSVector4i& v) const
{
	const GSVector4i less(_mm_cmplt_epi32(m, v.m));
	return ~less;
}
__forceinline GSVector4i lt8(const GSVector4i& v) const
{
return GSVector4i(_mm_cmplt_epi8(m, v.m));
@ -982,6 +1082,19 @@ public:
return GSVector4i(_mm_cmplt_epi32(m, v.m));
}
// Per-byte "this <= v", formed by applying operator~ to the signed greater-than compare mask.
__forceinline GSVector4i le8(const GSVector4i& v) const
{
	const GSVector4i greater(_mm_cmpgt_epi8(m, v.m));
	return ~greater;
}
// Per-16-bit-lane "this <= v", formed by applying operator~ to the signed greater-than compare mask.
__forceinline GSVector4i le16(const GSVector4i& v) const
{
	const GSVector4i greater(_mm_cmpgt_epi16(m, v.m));
	return ~greater;
}
// Per-32-bit-lane "this <= v", formed by applying operator~ to the signed greater-than compare mask.
__forceinline GSVector4i le32(const GSVector4i& v) const
{
	const GSVector4i greater(_mm_cmpgt_epi32(m, v.m));
	return ~greater;
}
__forceinline GSVector4i andnot(const GSVector4i& v) const
{
return GSVector4i(_mm_andnot_si128(v.m, m));

View File

@ -103,21 +103,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData& data, GSScanlineLocalData
if (global.sel.mmin && global.sel.lcm)
{
#if defined(__GNUC__) && _M_SSE >= 0x501
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80286
//
// GCC 4.9/5/6 doesn't generate correct AVX2 code for extract32<0>. It is fixed in GCC7
// Intrinsic code is _mm_cvtsi128_si32(_mm256_castsi256_si128(m))
// It seems recent Clang got _mm256_cvtsi256_si32(m) instead. I don't know about GCC.
//
// The generated code keeps the integer in an XMM register, but bits [63:32] aren't cleared,
// so the srl16 shift count will be huge and v will be 0.
//
int lod_x = global.lod.i.x0;
GSVector4i v = global.t.minmax.srl16(lod_x);
#else
GSVector4i v = global.t.minmax.srl16(global.lod.i.extract32<0>()); //.x);
#endif
v = v.upl16(v);