diff --git a/pcsx2/GS/GSVector4.h b/pcsx2/GS/GSVector4.h
index 0a256258f1..13f4073cbb 100644
--- a/pcsx2/GS/GSVector4.h
+++ b/pcsx2/GS/GSVector4.h
@@ -205,6 +205,17 @@ public:
 		return m;
 	}
 
+	/// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks we don't need the whole vector
+	/// Useful for e.g. preventing clang from optimizing shuffles that remove possibly-denormal garbage data from vectors before computing with them
+	__forceinline GSVector4 noopt()
+	{
+		// Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the future the implementation should be updated
+#ifdef __clang__
+		__asm__("":"+x"(m)::);
+#endif
+		return *this;
+	}
+
 	__forceinline uint32 rgba32() const
 	{
 		return GSVector4i(*this).rgba32();
diff --git a/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp b/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp
index 306dffa316..a3d59c4ecb 100644
--- a/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp
+++ b/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp
@@ -202,9 +202,9 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 				GSVector4 q = stq.wwww();
 
 				if (accurate_stq)
-					stq = (stq.xyww() / q).xyww(q);
+					stq = (stq.xyww() / q).noopt().xyww(q);
 				else
-					stq = (stq.xyww() * q.rcpnr()).xyww(q);
+					stq = (stq.xyww() * q.rcpnr()).noopt().xyww(q);
 
 				tmin = tmin.min(stq);
 				tmax = tmax.max(stq);
@@ -256,19 +256,22 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 				GSVector4 stq0 = GSVector4::cast(c0);
 				GSVector4 stq1 = GSVector4::cast(c1);
 
+				GSVector4 q = stq0.wwww(stq1);
+
+				// Each vertex keeps its own Q in zw in both branches below (stq0 from stq0, stq1 from stq1)
 				if (accurate_stq)
 				{
-					GSVector4 q = stq0.wwww(stq1);
+					GSVector4 st = stq0.xyxy(stq1) / q;
 
-					stq0 = (stq0.xyww() / q.xxxx()).xyww(stq0);
-					stq1 = (stq1.xyww() / q.zzzz()).xyww(stq1);
+					stq0 = st.xyww(stq0);
+					stq1 = st.zwww(stq1);
 				}
 				else
 				{
-					GSVector4 q = stq0.wwww(stq1).rcpnr();
+					GSVector4 st = stq0.xyxy(stq1) * q.rcpnr();
 
-					stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0);
-					stq1 = (stq1.xyww() * q.zzzz()).xyww(stq1);
+					stq0 = st.xyww(stq0);
+					stq1 = st.zwww(stq1);
 				}
 
 				tmin = tmin.min(stq0.min(stq1));
@@ -331,19 +334,20 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 
 				if (accurate_stq)
 				{
-					GSVector4 q = stq0.wwww(stq1).xzww(stq2);
+					GSVector4 st01 = stq0.xyxy(stq1) / stq0.wwww(stq1);
 
-					stq0 = (stq0.xyww() / q.xxxx()).xyww(stq0);
-					stq1 = (stq1.xyww() / q.yyyy()).xyww(stq1);
-					stq2 = (stq2.xyww() / q.zzzz()).xyww(stq2);
+					stq0 = st01.xyww(stq0);
+					stq1 = st01.zwww(stq1);
+					stq2 = (stq2.xyww() / stq2.wwww()).noopt().xyww(stq2);
 				}
 				else
 				{
 					GSVector4 q = stq0.wwww(stq1).xzww(stq2).rcpnr();
+					GSVector4 st01 = stq0.xyxy(stq1) * q.xxyy();
 
-					stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0);
-					stq1 = (stq1.xyww() * q.yyyy()).xyww(stq1);
-					stq2 = (stq2.xyww() * q.zzzz()).xyww(stq2);
+					stq0 = st01.xyww(stq0);
+					stq1 = st01.zwww(stq1);
+					stq2 = (stq2.xyww() * q.zzzz()).noopt().xyww(stq2);
 				}
 
 				tmin = tmin.min(stq2).min(stq0.min(stq1));
@@ -410,17 +414,17 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
 
 				if (accurate_stq)
 				{
-					GSVector4 q = stq1.wwww();
+					GSVector4 st = stq0.xyxy(stq1) / stq1.wwww();
 
-					stq0 = (stq0.xyww() / q).xyww(stq1);
-					stq1 = (stq1.xyww() / q).xyww(stq1);
+					stq0 = st.xyww(stq1);
+					stq1 = st.zwww(stq1);
 				}
 				else
 				{
-					GSVector4 q = stq1.wwww().rcpnr();
+					GSVector4 st = stq0.xyxy(stq1) * stq1.wwww().rcpnr();
 
-					stq0 = (stq0.xyww() * q).xyww(stq1);
-					stq1 = (stq1.xyww() * q).xyww(stq1);
+					stq0 = st.xyww(stq1);
+					stq1 = st.zwww(stq1);
 				}
 
 				tmin = tmin.min(stq0.min(stq1));