GSVector: SIMD-ify YUVToRGB

Almost a 10x speedup on the function alone, 15% in practice in-game.
2024-07-10 15:08:56 +10:00 · 2024-07-10 15:08:56 +10:00 · 9be7a37416
parent 18b0b11094
commit 9be7a37416
1 changed files with 28 additions and 15 deletions
--- a/src/core/mdec.cpp
+++ b/src/core/mdec.cpp
@ -978,25 +978,38 @@ void MDEC::IDCT_New(s16* blk)
 void MDEC::YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk,
                        const std::array<s16, 64>& Yblk)
 {
-  const s32 addval = s_state.status.data_output_signed ? 0 : 0x80;
+  const GSVector4i addval = s_state.status.data_output_signed ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
  for (u32 y = 0; y < 8; y++)
  {
-    for (u32 x = 0; x < 8; x++)
-    {
-      const s32 Cr = Crblk[((x + xx) / 2) + ((y + yy) / 2) * 8];
-      const s32 Cb = Cbblk[((x + xx) / 2) + ((y + yy) / 2) * 8];
-      const s32 Y = Yblk[x + y * 8];
+    const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).i16to32();
+    const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).i16to32();
+    const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);

-      // BT.601 YUV->RGB coefficients, rounding from Mednafen.
-      const s32 r = std::clamp(SignExtendN<9, s32>(Y + (((359 * Cr) + 0x80) >> 8)), -128, 127) + addval;
-      const s32 g =
-        std::clamp(SignExtendN<9, s32>(Y + ((((-88 * Cb) & ~0x1F) + ((-183 * Cr) & ~0x07) + 0x80) >> 8)), -128, 127) +
-        addval;
-      const s32 b = std::clamp(SignExtendN<9, s32>(Y + (((454 * Cb) + 0x80) >> 8)), -128, 127) + addval;
+    // BT.601 YUV->RGB coefficients, rounding formula from Mednafen.
+    // r = clamp(sext9(Y + (((359 * Cr) + 0x80) >> 8)), -128, 127) + addval;
+    // g = clamp(sext9(Y + ((((-88 * Cb) & ~0x1F) + ((-183 * Cr) & ~0x07) + 0x80) >> 8)), -128, 127) + addval
+    // b = clamp(sext9(Y + (((454 * Cb) + 0x80) >> 8)), -128, 127) + addval

-      s_state.block_rgb[(x + xx) + ((y + yy) * 16)] =
-        static_cast<u32>(r) | (static_cast<u32>(g) << 8) | (static_cast<u32>(b) << 16);
-    }
+    // Need to do the multiply as 32-bit, since 127 * 359 is greater than INT16_MAX.
+    // upl16(self) = interleave XYZW0000 -> XXYYZZWW.
+    const GSVector4i Crmul = Cr.mul32l(GSVector4i::cxpr(359)).add16(GSVector4i::cxpr(0x80)).sra32<8>().ps32();
+    const GSVector4i Cbmul = Cb.mul32l(GSVector4i::cxpr(454)).add16(GSVector4i::cxpr(0x80)).sra32<8>().ps32();
+    const GSVector4i CrCbmul = (Cb.mul32l(GSVector4i::cxpr(-88)) & GSVector4i::cxpr(~0x1F))
+                                 .add32(Cr.mul32l(GSVector4i::cxpr(-183)) & GSVector4i::cxpr(~0x07))
+                                 .add32(GSVector4i::cxpr(0x80))
+                                 .sra32<8>()
+                                 .ps32();
+    const GSVector4i r = Crmul.upl16(Crmul).add16(Y).sll16<7>().sra16<7>().ps16().add8(addval);
+    const GSVector4i g = CrCbmul.upl16(CrCbmul).add16(Y).sll16<7>().sra16<7>().ps16().add8(addval);
+    const GSVector4i b = Cbmul.upl16(Cbmul).add16(Y).sll16<7>().sra16<7>().ps16().add8(addval);
+    const GSVector4i rg = r.upl8(g);
+    const GSVector4i b0 = b.upl8();
+    const GSVector4i rgblow = rg.upl16(b0);
+    const GSVector4i rgbhigh = rg.uph16(b0);
+
+    u32* const out_row = &s_state.block_rgb[xx + ((y + yy) * 16)];
+    GSVector4i::store<false>(&out_row[0], rgblow);
+    GSVector4i::store<false>(&out_row[4], rgbhigh);
  }
 }