same for mdec, nothing to see here

2020-04-09 16:04:43 -04:00 · 2020-04-09 16:04:43 -04:00 · d8fad23324
parent e2389e25d7
commit d8fad23324
1 changed files with 204 additions and 205 deletions
--- a/psx/octoshock/psx/mdec.cpp
+++ b/psx/octoshock/psx/mdec.cpp
@ -1,25 +1,25 @@
-/******************************************************************************/
-/* Mednafen Sony PS1 Emulation Module                                         */
-/******************************************************************************/
-/* mdec.cpp:
-**  Copyright (C) 2011-2016 Mednafen Team
-**
-** This program is free software; you can redistribute it and/or
-** modify it under the terms of the GNU General Public License
-** as published by the Free Software Foundation; either version 2
-** of the License, or (at your option) any later version.
-**
-** This program is distributed in the hope that it will be useful,
-** but WITHOUT ANY WARRANTY; without even the implied warranty of
-** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-** GNU General Public License for more details.
-**
-** You should have received a copy of the GNU General Public License
-** along with this program; if not, write to the Free Software Foundation, Inc.,
-** 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-*/
-
-#pragma GCC optimize ("unroll-loops")
+/******************************************************************************/
+/* Mednafen Sony PS1 Emulation Module                                         */
+/******************************************************************************/
+/* mdec.cpp:
+**  Copyright (C) 2011-2016 Mednafen Team
+**
+** This program is free software; you can redistribute it and/or
+** modify it under the terms of the GNU General Public License
+** as published by the Free Software Foundation; either version 2
+** of the License, or (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software Foundation, Inc.,
+** 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+*/
+
+#pragma GCC optimize ("unroll-loops")

 /*
 MDEC_READ_FIFO(tfr) vs InCounter vs MDEC_DMACanRead() is a bit fragile right now.  Actually, the entire horrible state machine monstrosity is fragile.
@ -73,8 +73,8 @@
 #include <emmintrin.h>
 #endif

-#if 0 //defined(HAVE_NEON_INTRINSICS)
-#include <arm_neon.h>
+#if 0 //defined(HAVE_NEON_INTRINSICS)
+#include <arm_neon.h>
 #endif

 #if defined(HAVE_ALTIVEC_INTRINSICS) && defined(HAVE_ALTIVEC_H)
@ -90,7 +90,7 @@ namespace MDFN_IEN_PSX

 static int32 ClockCounter;
 static unsigned MDRPhase;
-static FastFIFO<uint32, 0x20> InFIFO;
+static FastFIFO<uint32, 0x20> InFIFO;
 static FastFIFO<uint32, 0x20> OutFIFO;

 static int8 block_y[8][8];
@ -242,125 +242,125 @@ static INLINE int8 Mask9ClampS8(int32 v)
 return v;
 }

-////////////////////////
-//
-//
-#pragma GCC push_options
-
-#if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)))
-//
-//
-//
-#pragma GCC target("sse2")
-template<typename T>
-static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
-{
-	for(unsigned col = 0; col < 8; col++)
-	{
-		__m128i c =  _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);
-
-		for(unsigned x = 0; x < 8; x++)
-		{
-			__m128i sum;
-			__m128i m;
-			alignas(16) int32 tmp[4];
-
-			m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
-			sum = _mm_madd_epi16(m, c);
-			sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
-			sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));
-
-			//_mm_store_ss((float *)&tmp[0], (__m128)sum);
-			_mm_store_si128((__m128i*)tmp, sum);
-
-			if(sizeof(T) == 1)
-				out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
-			else
-				out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
-		}
-	}
-}
-//
-//
-//
-#elif 0 //defined(HAVE_NEON_INTRINSICS)
-//
-//
-//
-template<typename T>
-static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
-{
-	for(unsigned col = 0; col < 8; col++)
-	{
-		register int16x4_t c0 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 0, sizeof(int16x4_t)));
-		register int16x4_t c1 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 4, sizeof(int16x4_t)));
-		int32 buf[8];
-
-		for(unsigned x = 0; x < 8; x++)
-		{
-			register int32x4_t accum;
-			register int32x2_t sum2;
-
-			accum = vdupq_n_s32(0);
-			accum = vmlal_s16(accum, c0, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 0, sizeof(int16x4_t))));
-			accum = vmlal_s16(accum, c1, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 4, sizeof(int16x4_t))));
-			sum2 = vadd_s32(vget_high_s32(accum), vget_low_s32(accum));
-			sum2 = vpadd_s32(sum2, sum2);
-			vst1_lane_s32(buf + x, sum2, 0);
-		}
-
-		for(unsigned x = 0; x < 8; x++)
-		{
-			if(sizeof(T) == 1)
-				out_coeff[(col * 8) + x] = Mask9ClampS8((buf[x] + 0x4000) >> 15);
-			else
-				out_coeff[(x * 8) + col] = (buf[x] + 0x4000) >> 15;
-		}
-	}
-}
-//
-//
-//
-#else
-//
-//
-//
-template<typename T>
-static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
-{
-	for(unsigned col = 0; col < 8; col++)
-	{
-		for(unsigned x = 0; x < 8; x++)
-		{
-			int32 sum = 0;
-
-			for(unsigned u = 0; u < 8; u++)
-			{
-				sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]);
-			}
-
-			if(sizeof(T) == 1)
-				out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15);
-			else
-				out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15;
-		}
-	}
-}
-//
-//
-//
-#endif
-
-static NO_INLINE void IDCT(int16 *in_coeff, int8 *out_coeff)
-{
-	alignas(16) int16 tmpbuf[64];
-
-	IDCT_1D_Multi<int16>(in_coeff, tmpbuf);
-	IDCT_1D_Multi<int8>(tmpbuf, out_coeff);
-}
-#pragma GCC pop_options
-//
-//
+////////////////////////
+//
+//
+#pragma GCC push_options
+
+#if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)))
+//
+//
+//
+#pragma GCC target("sse2")
+template<typename T>
+static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
+{
+ for(unsigned col = 0; col < 8; col++)
+ {
+  __m128i c =  _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);
+
+  for(unsigned x = 0; x < 8; x++)
+  {
+   __m128i sum;
+   __m128i m;
+   alignas(16) int32 tmp[4];
+
+   m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
+   sum = _mm_madd_epi16(m, c);
+   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
+   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));
+
+   //_mm_store_ss((float *)&tmp[0], (__m128)sum);
+   _mm_store_si128((__m128i*)tmp, sum);
+
+   if(sizeof(T) == 1)
+    out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
+   else
+    out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
+  }
+ }
+}
+//
+//
+//
+#elif 0 //defined(HAVE_NEON_INTRINSICS)
+//
+//
+//
+template<typename T>
+static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
+{
+ for(unsigned col = 0; col < 8; col++)
+ {
+  register int16x4_t c0 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 0, sizeof(int16x4_t)));
+  register int16x4_t c1 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 4, sizeof(int16x4_t)));
+  int32 buf[8];
+
+  for(unsigned x = 0; x < 8; x++)
+  {
+   register int32x4_t accum;
+   register int32x2_t sum2;
+
+   accum = vdupq_n_s32(0);
+   accum = vmlal_s16(accum, c0, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 0, sizeof(int16x4_t))));
+   accum = vmlal_s16(accum, c1, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 4, sizeof(int16x4_t))));
+   sum2 = vadd_s32(vget_high_s32(accum), vget_low_s32(accum));
+   sum2 = vpadd_s32(sum2, sum2);
+   vst1_lane_s32(buf + x, sum2, 0);
+  }
+
+  for(unsigned x = 0; x < 8; x++)
+  {
+   if(sizeof(T) == 1)
+    out_coeff[(col * 8) + x] = Mask9ClampS8((buf[x] + 0x4000) >> 15);
+   else
+    out_coeff[(x * 8) + col] = (buf[x] + 0x4000) >> 15;
+  }
+ }
+}
+//
+//
+//
+#else
+//
+//
+//
+template<typename T>
+static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
+{
+ for(unsigned col = 0; col < 8; col++)
+ {
+  for(unsigned x = 0; x < 8; x++)
+  {
+   int32 sum = 0;
+
+   for(unsigned u = 0; u < 8; u++)
+   {
+    sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]);
+   }
+
+   if(sizeof(T) == 1)
+    out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15);
+   else
+    out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15;
+  }
+ }
+}
+//
+//
+//
+#endif
+
+static NO_INLINE void IDCT(int16 *in_coeff, int8 *out_coeff)
+{
+ alignas(16) int16 tmpbuf[64];
+
+ IDCT_1D_Multi<int16>(in_coeff, tmpbuf);
+ IDCT_1D_Multi<int8>(tmpbuf, out_coeff);
+}
+#pragma GCC pop_options
+//
+//
 ///////////////////////

 static INLINE void YCbCr_to_RGB(const int8 y, const int8 cb, const int8 cr, int &r, int &g, int &b)
@ -519,9 +519,9 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
    int ci = sign_10_to_s16(V & 0x3FF);
    int tmp;

-    if(q != 0)
-     tmp = (int32)((uint32)(ci * q) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
-    else
+    if(q != 0)
+     tmp = (int32)((uint32)(ci * q) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
+    else
     tmp = (uint32)(ci * 2) << 4;

    // Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8?
@ -552,9 +552,9 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
     int ci = sign_10_to_s16(V & 0x3FF);
     int tmp;

-     if(q != 0)
-      tmp = (int32)((uint32)((ci * q) >> 3) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
-     else
+     if(q != 0)
+      tmp = (int32)((uint32)((ci * q) >> 3) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
+     else
      tmp = (uint32)(ci * 2) << 4;

     // Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8?
@ -572,16 +572,16 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)

   switch(DecodeWB)
   {
-    case 0: IDCT(Coeff, MDAP(block_cr)); break;
-    case 1: IDCT(Coeff, MDAP(block_cb)); break;
-    case 2: IDCT(Coeff, MDAP(block_y)); break;
-    case 3: IDCT(Coeff, MDAP(block_y)); break;
-    case 4: IDCT(Coeff, MDAP(block_y)); break;
+    case 0: IDCT(Coeff, MDAP(block_cr)); break;
+    case 1: IDCT(Coeff, MDAP(block_cb)); break;
+    case 2: IDCT(Coeff, MDAP(block_y)); break;
+    case 3: IDCT(Coeff, MDAP(block_y)); break;
+    case 4: IDCT(Coeff, MDAP(block_y)); break;
    case 5: IDCT(Coeff, MDAP(block_y)); break;
   }   

-   // Timing in the PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is
-   // about 512.
+   // Timing in the PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is
+   // about 512.
   *eat_cycles += 512;

   if(DecodeWB >= 2)
@ -604,7 +604,7 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
 //
 #define MDEC_WAIT_COND(n)  { case __COUNTER__: if(!(n)) { MDRPhase = __COUNTER__ - MDRPhaseBias - 1; return; } }

-#define MDEC_WRITE_FIFO(n) { MDEC_WAIT_COND(OutFIFO.CanWrite()); OutFIFO.Write(n);  }
+#define MDEC_WRITE_FIFO(n) { MDEC_WAIT_COND(OutFIFO.CanWrite()); OutFIFO.Write(n);  }
 #define MDEC_READ_FIFO(n)  { MDEC_WAIT_COND(InFIFO.CanRead()); n = InFIFO.Read(); }
 #define MDEC_EAT_CLOCKS(n) { ClockCounter -= (n); MDEC_WAIT_COND(ClockCounter > 0); }

@ -682,7 +682,7 @@ MDFN_FASTCALL void MDEC_Run(int32 clocks)
     PixelBufferReadOffset = 0;
     while(PixelBufferReadOffset < PixelBufferCount32)
     {
-      MDEC_WRITE_FIFO(MDFN_de32lsb<true>(&PixelBuffer.pix32[PixelBufferReadOffset++]));
+      MDEC_WRITE_FIFO((MDFN_de32lsb<true>(&PixelBuffer.pix32[PixelBufferReadOffset++])));
     }
    } while(InCounter != 0xFFFF);
   }
@ -746,52 +746,51 @@ MDFN_FASTCALL void MDEC_Run(int32 clocks)
 }
 #endif

-
-MDFN_FASTCALL void MDEC_DMAWrite(uint32 V)
-{
- if(InFIFO.CanWrite())
- {
-  InFIFO.Write(V);
-  MDEC_Run(0);
- }
- else
- {
-  PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA write when input FIFO is full!!\n");
- }
-}
-
-MDFN_FASTCALL uint32 MDEC_DMARead(uint32* offs)
-{
- uint32 V = 0;
-
- *offs = 0;
-
- if(MDFN_LIKELY(OutFIFO.CanRead()))
- {
-  V = OutFIFO.Read();
-
-  *offs = (RAMOffsetY & 0x7) * RAMOffsetWWS;
-
-  if(RAMOffsetY & 0x08)
-  {
-   *offs = (*offs - RAMOffsetWWS*7);
-  }
-
-  RAMOffsetCounter--;
-  if(!RAMOffsetCounter)
-  {
-   RAMOffsetCounter = RAMOffsetWWS;
-   RAMOffsetY++;
-  }
-
-  MDEC_Run(0);
- }
- else
- {
-  PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA read when output FIFO is empty!\n");
- }
-
- return(V);
+MDFN_FASTCALL void MDEC_DMAWrite(uint32 V)
+{
+ if(InFIFO.CanWrite())
+ {
+  InFIFO.Write(V);
+  MDEC_Run(0);
+ }
+ else
+ {
+  PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA write when input FIFO is full!!\n");
+ }
+}
+
+MDFN_FASTCALL uint32 MDEC_DMARead(uint32* offs)
+{
+ uint32 V = 0;
+
+ *offs = 0;
+
+ if(MDFN_LIKELY(OutFIFO.CanRead()))
+ {
+  V = OutFIFO.Read();
+
+  *offs = (RAMOffsetY & 0x7) * RAMOffsetWWS;
+
+  if(RAMOffsetY & 0x08)
+  {
+   *offs = (*offs - RAMOffsetWWS*7);
+  }
+
+  RAMOffsetCounter--;
+  if(!RAMOffsetCounter)
+  {
+   RAMOffsetCounter = RAMOffsetWWS;
+   RAMOffsetY++;
+  }
+
+  MDEC_Run(0);
+ }
+ else
+ {
+  PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA read when output FIFO is empty!\n");
+ }
+
+ return(V);
 }

 bool MDEC_DMACanWrite(void)