same for mdec, nothing to see here

This commit is contained in:
zeromus 2020-04-09 16:04:43 -04:00
parent e2389e25d7
commit d8fad23324
1 changed files with 204 additions and 205 deletions

View File

@ -1,25 +1,25 @@
/******************************************************************************/
/* Mednafen Sony PS1 Emulation Module */
/******************************************************************************/
/* mdec.cpp:
** Copyright (C) 2011-2016 Mednafen Team
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the GNU General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation, Inc.,
** 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#pragma GCC optimize ("unroll-loops")
/******************************************************************************/
/* Mednafen Sony PS1 Emulation Module */
/******************************************************************************/
/* mdec.cpp:
** Copyright (C) 2011-2016 Mednafen Team
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the GNU General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation, Inc.,
** 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#pragma GCC optimize ("unroll-loops")
/*
MDEC_READ_FIFO(tfr) vs InCounter vs MDEC_DMACanRead() is a bit fragile right now. Actually, the entire horrible state machine monstrosity is fragile.
@ -73,8 +73,8 @@
#include <emmintrin.h>
#endif
#if 0 //defined(HAVE_NEON_INTRINSICS)
#include <arm_neon.h>
#if 0 //defined(HAVE_NEON_INTRINSICS)
#include <arm_neon.h>
#endif
#if defined(HAVE_ALTIVEC_INTRINSICS) && defined(HAVE_ALTIVEC_H)
@ -90,7 +90,7 @@ namespace MDFN_IEN_PSX
static int32 ClockCounter;
static unsigned MDRPhase;
static FastFIFO<uint32, 0x20> InFIFO;
static FastFIFO<uint32, 0x20> InFIFO;
static FastFIFO<uint32, 0x20> OutFIFO;
static int8 block_y[8][8];
@ -242,125 +242,125 @@ static INLINE int8 Mask9ClampS8(int32 v)
return v;
}
////////////////////////
//
//
#pragma GCC push_options
#if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)))
//
//
//
#pragma GCC target("sse2")
template<typename T>
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
for(unsigned col = 0; col < 8; col++)
{
__m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);
for(unsigned x = 0; x < 8; x++)
{
__m128i sum;
__m128i m;
alignas(16) int32 tmp[4];
m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
sum = _mm_madd_epi16(m, c);
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));
//_mm_store_ss((float *)&tmp[0], (__m128)sum);
_mm_store_si128((__m128i*)tmp, sum);
if(sizeof(T) == 1)
out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
else
out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
}
}
}
//
//
//
#elif 0 //defined(HAVE_NEON_INTRINSICS)
//
//
//
template<typename T>
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
for(unsigned col = 0; col < 8; col++)
{
register int16x4_t c0 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 0, sizeof(int16x4_t)));
register int16x4_t c1 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 4, sizeof(int16x4_t)));
int32 buf[8];
for(unsigned x = 0; x < 8; x++)
{
register int32x4_t accum;
register int32x2_t sum2;
accum = vdupq_n_s32(0);
accum = vmlal_s16(accum, c0, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 0, sizeof(int16x4_t))));
accum = vmlal_s16(accum, c1, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 4, sizeof(int16x4_t))));
sum2 = vadd_s32(vget_high_s32(accum), vget_low_s32(accum));
sum2 = vpadd_s32(sum2, sum2);
vst1_lane_s32(buf + x, sum2, 0);
}
for(unsigned x = 0; x < 8; x++)
{
if(sizeof(T) == 1)
out_coeff[(col * 8) + x] = Mask9ClampS8((buf[x] + 0x4000) >> 15);
else
out_coeff[(x * 8) + col] = (buf[x] + 0x4000) >> 15;
}
}
}
//
//
//
#else
//
//
//
template<typename T>
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
for(unsigned col = 0; col < 8; col++)
{
for(unsigned x = 0; x < 8; x++)
{
int32 sum = 0;
for(unsigned u = 0; u < 8; u++)
{
sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]);
}
if(sizeof(T) == 1)
out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15);
else
out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15;
}
}
}
//
//
//
#endif
static NO_INLINE void IDCT(int16 *in_coeff, int8 *out_coeff)
{
alignas(16) int16 tmpbuf[64];
IDCT_1D_Multi<int16>(in_coeff, tmpbuf);
IDCT_1D_Multi<int8>(tmpbuf, out_coeff);
}
#pragma GCC pop_options
//
//
////////////////////////
//
//
#pragma GCC push_options
#if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)))
//
//
//
#pragma GCC target("sse2")
template<typename T>
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
for(unsigned col = 0; col < 8; col++)
{
__m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);
for(unsigned x = 0; x < 8; x++)
{
__m128i sum;
__m128i m;
alignas(16) int32 tmp[4];
m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
sum = _mm_madd_epi16(m, c);
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));
//_mm_store_ss((float *)&tmp[0], (__m128)sum);
_mm_store_si128((__m128i*)tmp, sum);
if(sizeof(T) == 1)
out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
else
out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
}
}
}
//
//
//
#elif 0 //defined(HAVE_NEON_INTRINSICS)
//
//
//
template<typename T>
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
for(unsigned col = 0; col < 8; col++)
{
register int16x4_t c0 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 0, sizeof(int16x4_t)));
register int16x4_t c1 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 4, sizeof(int16x4_t)));
int32 buf[8];
for(unsigned x = 0; x < 8; x++)
{
register int32x4_t accum;
register int32x2_t sum2;
accum = vdupq_n_s32(0);
accum = vmlal_s16(accum, c0, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 0, sizeof(int16x4_t))));
accum = vmlal_s16(accum, c1, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 4, sizeof(int16x4_t))));
sum2 = vadd_s32(vget_high_s32(accum), vget_low_s32(accum));
sum2 = vpadd_s32(sum2, sum2);
vst1_lane_s32(buf + x, sum2, 0);
}
for(unsigned x = 0; x < 8; x++)
{
if(sizeof(T) == 1)
out_coeff[(col * 8) + x] = Mask9ClampS8((buf[x] + 0x4000) >> 15);
else
out_coeff[(x * 8) + col] = (buf[x] + 0x4000) >> 15;
}
}
}
//
//
//
#else
//
//
//
template<typename T>
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
for(unsigned col = 0; col < 8; col++)
{
for(unsigned x = 0; x < 8; x++)
{
int32 sum = 0;
for(unsigned u = 0; u < 8; u++)
{
sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]);
}
if(sizeof(T) == 1)
out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15);
else
out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15;
}
}
}
//
//
//
#endif
static NO_INLINE void IDCT(int16 *in_coeff, int8 *out_coeff)
{
alignas(16) int16 tmpbuf[64];
IDCT_1D_Multi<int16>(in_coeff, tmpbuf);
IDCT_1D_Multi<int8>(tmpbuf, out_coeff);
}
#pragma GCC pop_options
//
//
///////////////////////
static INLINE void YCbCr_to_RGB(const int8 y, const int8 cb, const int8 cr, int &r, int &g, int &b)
@ -519,9 +519,9 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
int ci = sign_10_to_s16(V & 0x3FF);
int tmp;
if(q != 0)
tmp = (int32)((uint32)(ci * q) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
else
if(q != 0)
tmp = (int32)((uint32)(ci * q) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
else
tmp = (uint32)(ci * 2) << 4;
// Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8?
@ -552,9 +552,9 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
int ci = sign_10_to_s16(V & 0x3FF);
int tmp;
if(q != 0)
tmp = (int32)((uint32)((ci * q) >> 3) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
else
if(q != 0)
tmp = (int32)((uint32)((ci * q) >> 3) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
else
tmp = (uint32)(ci * 2) << 4;
// Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8?
@ -572,16 +572,16 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
switch(DecodeWB)
{
case 0: IDCT(Coeff, MDAP(block_cr)); break;
case 1: IDCT(Coeff, MDAP(block_cb)); break;
case 2: IDCT(Coeff, MDAP(block_y)); break;
case 3: IDCT(Coeff, MDAP(block_y)); break;
case 4: IDCT(Coeff, MDAP(block_y)); break;
case 0: IDCT(Coeff, MDAP(block_cr)); break;
case 1: IDCT(Coeff, MDAP(block_cb)); break;
case 2: IDCT(Coeff, MDAP(block_y)); break;
case 3: IDCT(Coeff, MDAP(block_y)); break;
case 4: IDCT(Coeff, MDAP(block_y)); break;
case 5: IDCT(Coeff, MDAP(block_y)); break;
}
// Timing in the PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is
// about 512.
// Timing in the PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is
// about 512.
*eat_cycles += 512;
if(DecodeWB >= 2)
@ -604,7 +604,7 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
//
#define MDEC_WAIT_COND(n) { case __COUNTER__: if(!(n)) { MDRPhase = __COUNTER__ - MDRPhaseBias - 1; return; } }
#define MDEC_WRITE_FIFO(n) { MDEC_WAIT_COND(OutFIFO.CanWrite()); OutFIFO.Write(n); }
#define MDEC_WRITE_FIFO(n) { MDEC_WAIT_COND(OutFIFO.CanWrite()); OutFIFO.Write(n); }
#define MDEC_READ_FIFO(n) { MDEC_WAIT_COND(InFIFO.CanRead()); n = InFIFO.Read(); }
#define MDEC_EAT_CLOCKS(n) { ClockCounter -= (n); MDEC_WAIT_COND(ClockCounter > 0); }
@ -682,7 +682,7 @@ MDFN_FASTCALL void MDEC_Run(int32 clocks)
PixelBufferReadOffset = 0;
while(PixelBufferReadOffset < PixelBufferCount32)
{
MDEC_WRITE_FIFO(MDFN_de32lsb<true>(&PixelBuffer.pix32[PixelBufferReadOffset++]));
MDEC_WRITE_FIFO((MDFN_de32lsb<true>(&PixelBuffer.pix32[PixelBufferReadOffset++])));
}
} while(InCounter != 0xFFFF);
}
@ -746,52 +746,51 @@ MDFN_FASTCALL void MDEC_Run(int32 clocks)
}
#endif
MDFN_FASTCALL void MDEC_DMAWrite(uint32 V)
{
if(InFIFO.CanWrite())
{
InFIFO.Write(V);
MDEC_Run(0);
}
else
{
PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA write when input FIFO is full!!\n");
}
}
MDFN_FASTCALL uint32 MDEC_DMARead(uint32* offs)
{
uint32 V = 0;
*offs = 0;
if(MDFN_LIKELY(OutFIFO.CanRead()))
{
V = OutFIFO.Read();
*offs = (RAMOffsetY & 0x7) * RAMOffsetWWS;
if(RAMOffsetY & 0x08)
{
*offs = (*offs - RAMOffsetWWS*7);
}
RAMOffsetCounter--;
if(!RAMOffsetCounter)
{
RAMOffsetCounter = RAMOffsetWWS;
RAMOffsetY++;
}
MDEC_Run(0);
}
else
{
PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA read when output FIFO is empty!\n");
}
return(V);
MDFN_FASTCALL void MDEC_DMAWrite(uint32 V)
{
if(InFIFO.CanWrite())
{
InFIFO.Write(V);
MDEC_Run(0);
}
else
{
PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA write when input FIFO is full!!\n");
}
}
MDFN_FASTCALL uint32 MDEC_DMARead(uint32* offs)
{
uint32 V = 0;
*offs = 0;
if(MDFN_LIKELY(OutFIFO.CanRead()))
{
V = OutFIFO.Read();
*offs = (RAMOffsetY & 0x7) * RAMOffsetWWS;
if(RAMOffsetY & 0x08)
{
*offs = (*offs - RAMOffsetWWS*7);
}
RAMOffsetCounter--;
if(!RAMOffsetCounter)
{
RAMOffsetCounter = RAMOffsetWWS;
RAMOffsetY++;
}
MDEC_Run(0);
}
else
{
PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA read when output FIFO is empty!\n");
}
return(V);
}
bool MDEC_DMACanWrite(void)