MDEC: Redo IDCT and conversion

RE2 backgrounds look better now (mainly due to the 24bpp->16bpp conversion). Also implements signed output (untested).
2022-12-14 17:28:34 +10:00 · 2022-12-14 17:28:34 +10:00 · 8ab46d0713
parent 1905ce3e01
commit 8ab46d0713
1 changed files with 28 additions and 31 deletions
--- a/src/core/mdec.cpp
+++ b/src/core/mdec.cpp
@ -5,6 +5,7 @@
 #include "common/log.h"
 #include "cpu_core.h"
 #include "dma.h"
+#include "gpu_types.h"
 #include "host.h"
 #include "imgui.h"
 #include "interrupt_controller.h"
@ -517,22 +518,22 @@ void MDEC::CopyOutBlock()

    case DataOutputDepth_15Bit:
    {
-      const u16 a = ZeroExtend16(m_status.data_output_bit15.GetValue());
+      const u32 a = ZeroExtend32(m_status.data_output_bit15.GetValue()) << 15;
      for (u32 i = 0; i < static_cast<u32>(m_block_rgb.size());)
      {
        u32 color = m_block_rgb[i++];
-        u16 r = Truncate16((color >> 3) & 0x1Fu);
-        u16 g = Truncate16((color >> 11) & 0x1Fu);
-        u16 b = Truncate16((color >> 19) & 0x1Fu);
-        const u16 color15a = r | (g << 5) | (b << 10) | (a << 15);
+        u32 r = VRAMConvert8To5(color & 0xFFu);
+        u32 g = VRAMConvert8To5((color >> 8) & 0xFFu);
+        u32 b = VRAMConvert8To5((color >> 16) & 0xFFu);
+        const u32 color15a = r | (g << 5) | (b << 10) | a;

        color = m_block_rgb[i++];
-        r = Truncate16((color >> 3) & 0x1Fu);
-        g = Truncate16((color >> 11) & 0x1Fu);
-        b = Truncate16((color >> 19) & 0x1Fu);
-        const u16 color15b = r | (g << 5) | (b << 10) | (a << 15);
+        r = VRAMConvert8To5(color & 0xFFu);
+        g = VRAMConvert8To5((color >> 8) & 0xFFu);
+        b = VRAMConvert8To5((color >> 16) & 0xFFu);
+        const u32 color15b = r | (g << 5) | (b << 10) | a;

-        m_data_out_fifo.Push(ZeroExtend32(color15a) | (ZeroExtend32(color15b) << 16));
+        m_data_out_fifo.Push(color15a | (color15b << 16));
      }
    }
    break;
@ -587,7 +588,7 @@ bool MDEC::rl_decode_block(s16* blk, const u8* qt)
    val = std::clamp(val, -0x400, 0x3FF);
    if (m_current_q_scale > 0)
      blk[zagzig[m_current_coefficient]] = static_cast<s16>(val);
-    else if (m_current_q_scale == 0)
+    else
      blk[m_current_coefficient] = static_cast<s16>(val);
  }

@ -610,7 +611,7 @@ bool MDEC::rl_decode_block(s16* blk, const u8* qt)
      val = std::clamp(val, -0x400, 0x3FF);
      if (m_current_q_scale > 0)
        blk[zagzig[m_current_coefficient]] = static_cast<s16>(val);
-      else if (m_current_q_scale == 0)
+      else
        blk[m_current_coefficient] = static_cast<s16>(val);
    }

@ -626,27 +627,27 @@ bool MDEC::rl_decode_block(s16* blk, const u8* qt)

 void MDEC::IDCT(s16* blk)
 {
-  std::array<s64, 64> temp_buffer;
+  std::array<s32, 64> temp;
  for (u32 x = 0; x < 8; x++)
  {
    for (u32 y = 0; y < 8; y++)
    {
-      s64 sum = 0;
-      for (u32 u = 0; u < 8; u++)
-        sum += s32(blk[u * 8 + x]) * s32(m_scale_table[u * 8 + y]);
-      temp_buffer[x + y * 8] = sum;
+      // TODO: We could alter zigzag and invert scale_table to get these in row-major order,
+      // in which case we could do optimize this to a vector multiply.
+      s32 sum = 0;
+      for (u32 z = 0; z < 8; z++)
+        sum += s32(blk[y + z * 8]) * s32(m_scale_table[x + z * 8] / 8);
+      temp[x + y * 8] = static_cast<s32>((sum + 0xfff) / 0x2000);
    }
  }
  for (u32 x = 0; x < 8; x++)
  {
    for (u32 y = 0; y < 8; y++)
    {
-      s64 sum = 0;
-      for (u32 u = 0; u < 8; u++)
-        sum += s64(temp_buffer[u + y * 8]) * s32(m_scale_table[u * 8 + x]);
-
-      blk[x + y * 8] =
-        static_cast<s16>(std::clamp<s32>(SignExtendN<9, s32>((sum >> 32) + ((sum >> 31) & 1)), -128, 127));
+      s32 sum = 0;
+      for (u32 z = 0; z < 8; z++)
+        sum += temp[y + z * 8] * s32(m_scale_table[x + z * 8] / 8);
+      blk[x + y * 8] = static_cast<s16>(std::clamp<s32>((sum + 0xfff) / 0x2000, -128, 127));
    }
  }
 }
@ -654,6 +655,7 @@ void MDEC::IDCT(s16* blk)
 void MDEC::yuv_to_rgb(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk,
                      const std::array<s16, 64>& Yblk)
 {
+  const s16 addval = m_status.data_output_signed ? 0 : 0x80;
  for (u32 y = 0; y < 8; y++)
  {
    for (u32 x = 0; x < 8; x++)
@ -666,14 +668,9 @@ void MDEC::yuv_to_rgb(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const st
      B = static_cast<s16>(1.772f * static_cast<float>(B));

      s16 Y = Yblk[x + y * 8];
-      R = static_cast<s16>(std::clamp(static_cast<int>(Y) + R, -128, 127));
-      G = static_cast<s16>(std::clamp(static_cast<int>(Y) + G, -128, 127));
-      B = static_cast<s16>(std::clamp(static_cast<int>(Y) + B, -128, 127));
-
-      // TODO: Signed output
-      R += 128;
-      G += 128;
-      B += 128;
+      R = static_cast<s16>(std::clamp(static_cast<int>(Y) + R, -128, 127)) + addval;
+      G = static_cast<s16>(std::clamp(static_cast<int>(Y) + G, -128, 127)) + addval;
+      B = static_cast<s16>(std::clamp(static_cast<int>(Y) + B, -128, 127)) + addval;

      m_block_rgb[(x + xx) + ((y + yy) * 16)] = ZeroExtend32(static_cast<u16>(R)) |
                                                (ZeroExtend32(static_cast<u16>(G)) << 8) |