update to Mednafen 0.9.39-unstable : mdec

This commit is contained in:
zeromus 2020-04-05 23:13:06 -04:00
parent efb61b42b9
commit 827115ee9f
3 changed files with 175 additions and 84 deletions

View File

@ -90,7 +90,7 @@
[OK] psx/timer : Major functional changes [OK] psx/timer : Major functional changes
[NO] psx/timer : Added loadstate sanity checks [NO] psx/timer : Added loadstate sanity checks
0.9.38.7 -> 0.9.39-unstable 0.9.38.7 -> 0.9.39-unstable
[OK] psx/cdc : promote "Fantastic Pinball Kyutenkai" logic to unconditional; add MDFN_COLD [OK] psx/cdc : "Fantastic Pinball Kyutenkai" enable; add MDFN_COLD
[OK] psx/cpu : various improvements [OK] psx/cpu : various improvements
[OK] psx/fastfifo : .h copyright header [OK] psx/fastfifo : .h copyright header
[NO] psx/debug : (file not used) [NO] psx/debug : (file not used)
@ -101,4 +101,5 @@
[OK] psx/gte : (c), CR 30 change, add inlines [OK] psx/gte : (c), CR 30 change, add inlines
[OK] psx/input/* : (c) [OK] psx/input/* : (c)
[OK] psx/irq : (c) [OK] psx/irq : (c)
[OK] psx/masmem : (c) [OK] psx/masmem : (c)
[OK] psx/mdec :

View File

@ -1,19 +1,25 @@
/* Mednafen - Multi-system Emulator /******************************************************************************/
* /* Mednafen Sony PS1 Emulation Module */
* This program is free software; you can redistribute it and/or modify /******************************************************************************/
* it under the terms of the GNU General Public License as published by /* mdec.cpp:
* the Free Software Foundation; either version 2 of the License, or ** Copyright (C) 2011-2016 Mednafen Team
* (at your option) any later version. **
* ** This program is free software; you can redistribute it and/or
* This program is distributed in the hope that it will be useful, ** modify it under the terms of the GNU General Public License
* but WITHOUT ANY WARRANTY; without even the implied warranty of ** as published by the Free Software Foundation; either version 2
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** of the License, or (at your option) any later version.
* GNU General Public License for more details. **
* ** This program is distributed in the hope that it will be useful,
* You should have received a copy of the GNU General Public License ** but WITHOUT ANY WARRANTY; without even the implied warranty of
* along with this program; if not, write to the Free Software ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ** GNU General Public License for more details.
*/ **
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation, Inc.,
** 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#pragma GCC optimize ("unroll-loops")
/* /*
MDEC_READ_FIFO(tfr) vs InCounter vs MDEC_DMACanRead() is a bit fragile right now. Actually, the entire horrible state machine monstrosity is fragile. MDEC_READ_FIFO(tfr) vs InCounter vs MDEC_DMACanRead() is a bit fragile right now. Actually, the entire horrible state machine monstrosity is fragile.
@ -62,11 +68,15 @@
#include "masmem.h" #include "masmem.h"
#if defined(__SSE2__) #if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)))
#include <xmmintrin.h> #include <xmmintrin.h>
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#if 0 //defined(__ARM_NEON__)
#include <arm_neon.h>
#endif
#if defined(ARCH_POWERPC_ALTIVEC) && defined(HAVE_ALTIVEC_H) #if defined(ARCH_POWERPC_ALTIVEC) && defined(HAVE_ALTIVEC_H)
#include <altivec.h> #include <altivec.h>
#endif #endif
@ -228,65 +238,127 @@ static INLINE int8 Mask9ClampS8(int32 v)
return v; return v;
} }
template<typename T> ////////////////////////
static void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff) //
{ //
#if defined(__SSE2__) #pragma GCC push_options
{
for(unsigned col = 0; col < 8; col++) #if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)))
{ //
__m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]); //
//
for(unsigned x = 0; x < 8; x++) #pragma GCC target("sse2")
{ template<typename T>
__m128i sum; static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
__m128i m; {
int32 tmp[4] MDFN_ALIGN(16); for(unsigned col = 0; col < 8; col++)
{
m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]); __m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);
sum = _mm_madd_epi16(m, c);
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6))); for(unsigned x = 0; x < 8; x++)
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2))); {
__m128i sum;
//_mm_store_ss((float *)&tmp[0], (__m128)sum); __m128i m;
_mm_store_si128((__m128i*)tmp, sum); alignas(16) int32 tmp[4];
if(sizeof(T) == 1) m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15); sum = _mm_madd_epi16(m, c);
else sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15; sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));
}
} //_mm_store_ss((float *)&tmp[0], (__m128)sum);
} _mm_store_si128((__m128i*)tmp, sum);
#else
for(unsigned col = 0; col < 8; col++) if(sizeof(T) == 1)
{ out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
for(unsigned x = 0; x < 8; x++) else
{ out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
int32 sum = 0; }
}
for(unsigned u = 0; u < 8; u++) }
{ //
sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]); //
} //
#elif 0 //defined(__ARM_NEON__)
if(sizeof(T) == 1) //
out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15); //
else //
out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15; template<typename T>
} static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
} {
#endif for(unsigned col = 0; col < 8; col++)
} {
register int16x4_t c0 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 0, sizeof(int16x4_t)));
static void IDCT(int16 *in_coeff, int8 *out_coeff) NO_INLINE; register int16x4_t c1 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 4, sizeof(int16x4_t)));
static void IDCT(int16 *in_coeff, int8 *out_coeff) int32 buf[8];
{
EW_VAR_ALIGN(16) int16 tmpbuf[64]; for(unsigned x = 0; x < 8; x++)
{
IDCT_1D_Multi<int16>(in_coeff, tmpbuf); register int32x4_t accum;
IDCT_1D_Multi<int8>(tmpbuf, out_coeff); register int32x2_t sum2;
}
accum = vdupq_n_s32(0);
accum = vmlal_s16(accum, c0, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 0, sizeof(int16x4_t))));
accum = vmlal_s16(accum, c1, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 4, sizeof(int16x4_t))));
sum2 = vadd_s32(vget_high_s32(accum), vget_low_s32(accum));
sum2 = vpadd_s32(sum2, sum2);
vst1_lane_s32(buf + x, sum2, 0);
}
for(unsigned x = 0; x < 8; x++)
{
if(sizeof(T) == 1)
out_coeff[(col * 8) + x] = Mask9ClampS8((buf[x] + 0x4000) >> 15);
else
out_coeff[(x * 8) + col] = (buf[x] + 0x4000) >> 15;
}
}
}
//
//
//
#else
//
//
//
//
// Scalar implementation of one IDCT pass over an 8x8 block of coefficients.
//
// For each group of 8 consecutive input values (selected by `col`) and each
// group of 8 consecutive IDCTMatrix entries (selected by `x`), the 8 products
// are summed, 0x4000 is added, and the result is shifted right by 15.
//
// in_coeff  - 64 input coefficients, read as in_coeff[(col * 8) + u].
// out_coeff - 64 output values.  When T is one byte wide the value is passed
//             through Mask9ClampS8() and stored at [(col * 8) + x]; otherwise
//             it is stored transposed, at [(x * 8) + col].
//
template<typename T>
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
 for(unsigned col = 0; col < 8; col++)
 {
  // Start of the 8 input values consumed by this outer iteration.
  const int16 *src = in_coeff + (col * 8);

  for(unsigned x = 0; x < 8; x++)
  {
   int32 acc = 0;

   for(unsigned u = 0; u < 8; u++)
    acc += (src[u] * IDCTMatrix[(x * 8) + u]);

   // Round (add 0x4000) and scale down by 15 bits.
   const int32 scaled = (acc + 0x4000) >> 15;

   if(sizeof(T) == 1)
    out_coeff[(col * 8) + x] = Mask9ClampS8(scaled);
   else
    out_coeff[(x * 8) + col] = scaled;
  }
 }
}
//
//
//
#endif
//
// Two-pass IDCT of one 8x8 block.
//
// in_coeff  - 64 int16 input coefficients.
// out_coeff - 64 int8 output values.
//
// The first pass writes int16 intermediates into a 16-byte-aligned scratch
// buffer; the second pass reads that buffer and produces the int8 results.
// NO_INLINE is a project macro (presumably it keeps this function out of line
// - confirm against its definition).
//
static void IDCT(int16 *in_coeff, int8 *out_coeff) NO_INLINE;
static void IDCT(int16 *in_coeff, int8 *out_coeff)
{
 alignas(16) int16 intermediate[64];

 // Pass 1: int16 output.
 IDCT_1D_Multi<int16>(in_coeff, intermediate);

 // Pass 2: int8 output.
 IDCT_1D_Multi<int8>(intermediate, out_coeff);
}
#pragma GCC pop_options
//
//
///////////////////////
static INLINE void YCbCr_to_RGB(const int8 y, const int8 cb, const int8 cr, int &r, int &g, int &b) static INLINE void YCbCr_to_RGB(const int8 y, const int8 cb, const int8 cr, int &r, int &g, int &b)
{ {
@ -505,12 +577,9 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
case 5: IDCT(Coeff, &block_y[0][0]); break; case 5: IDCT(Coeff, &block_y[0][0]); break;
} }
// // Timing in the PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is
// Timing in the actual PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is // about 512.
// about 512. We'll go with a lower value here to be conservative due to timing granularity and other timing deficiencies in Mednafen. BUT, don't *eat_cycles += 512;
// go lower than 460, or Parasite Eve 2's 3D models will stutter like crazy during FMV-background sequences.
//
*eat_cycles += 474;
if(DecodeWB >= 2) if(DecodeWB >= 2)
{ {

View File

@ -1,3 +1,24 @@
/******************************************************************************/
/* Mednafen Sony PS1 Emulation Module */
/******************************************************************************/
/* mdec.h:
** Copyright (C) 2011-2016 Mednafen Team
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the GNU General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation, Inc.,
** 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#ifndef __MDFN_PSX_MDEC_H #ifndef __MDFN_PSX_MDEC_H
#define __MDFN_PSX_MDEC_H #define __MDFN_PSX_MDEC_H