same for mdec, nothing to see here
This commit is contained in:
parent
e2389e25d7
commit
d8fad23324
|
@ -1,25 +1,25 @@
|
|||
/******************************************************************************/
|
||||
/* Mednafen Sony PS1 Emulation Module */
|
||||
/******************************************************************************/
|
||||
/* mdec.cpp:
|
||||
** Copyright (C) 2011-2016 Mednafen Team
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or
|
||||
** modify it under the terms of the GNU General Public License
|
||||
** as published by the Free Software Foundation; either version 2
|
||||
** of the License, or (at your option) any later version.
|
||||
**
|
||||
** This program is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
** GNU General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU General Public License
|
||||
** along with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
** 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
#pragma GCC optimize ("unroll-loops")
|
||||
/******************************************************************************/
|
||||
/* Mednafen Sony PS1 Emulation Module */
|
||||
/******************************************************************************/
|
||||
/* mdec.cpp:
|
||||
** Copyright (C) 2011-2016 Mednafen Team
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or
|
||||
** modify it under the terms of the GNU General Public License
|
||||
** as published by the Free Software Foundation; either version 2
|
||||
** of the License, or (at your option) any later version.
|
||||
**
|
||||
** This program is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
** GNU General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU General Public License
|
||||
** along with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
** 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
#pragma GCC optimize ("unroll-loops")
|
||||
|
||||
/*
|
||||
MDEC_READ_FIFO(tfr) vs InCounter vs MDEC_DMACanRead() is a bit fragile right now. Actually, the entire horrible state machine monstrosity is fragile.
|
||||
|
@ -73,8 +73,8 @@
|
|||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#if 0 //defined(HAVE_NEON_INTRINSICS)
|
||||
#include <arm_neon.h>
|
||||
#if 0 //defined(HAVE_NEON_INTRINSICS)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_ALTIVEC_INTRINSICS) && defined(HAVE_ALTIVEC_H)
|
||||
|
@ -90,7 +90,7 @@ namespace MDFN_IEN_PSX
|
|||
|
||||
static int32 ClockCounter;
|
||||
static unsigned MDRPhase;
|
||||
static FastFIFO<uint32, 0x20> InFIFO;
|
||||
static FastFIFO<uint32, 0x20> InFIFO;
|
||||
static FastFIFO<uint32, 0x20> OutFIFO;
|
||||
|
||||
static int8 block_y[8][8];
|
||||
|
@ -242,125 +242,125 @@ static INLINE int8 Mask9ClampS8(int32 v)
|
|||
return v;
|
||||
}
|
||||
|
||||
////////////////////////
|
||||
//
|
||||
//
|
||||
#pragma GCC push_options
|
||||
|
||||
#if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)))
|
||||
//
|
||||
//
|
||||
//
|
||||
#pragma GCC target("sse2")
|
||||
template<typename T>
|
||||
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
|
||||
{
|
||||
for(unsigned col = 0; col < 8; col++)
|
||||
{
|
||||
__m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);
|
||||
|
||||
for(unsigned x = 0; x < 8; x++)
|
||||
{
|
||||
__m128i sum;
|
||||
__m128i m;
|
||||
alignas(16) int32 tmp[4];
|
||||
|
||||
m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
|
||||
sum = _mm_madd_epi16(m, c);
|
||||
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
|
||||
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));
|
||||
|
||||
//_mm_store_ss((float *)&tmp[0], (__m128)sum);
|
||||
_mm_store_si128((__m128i*)tmp, sum);
|
||||
|
||||
if(sizeof(T) == 1)
|
||||
out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
|
||||
else
|
||||
out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
//
|
||||
//
|
||||
#elif 0 //defined(HAVE_NEON_INTRINSICS)
|
||||
//
|
||||
//
|
||||
//
|
||||
template<typename T>
|
||||
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
|
||||
{
|
||||
for(unsigned col = 0; col < 8; col++)
|
||||
{
|
||||
register int16x4_t c0 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 0, sizeof(int16x4_t)));
|
||||
register int16x4_t c1 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 4, sizeof(int16x4_t)));
|
||||
int32 buf[8];
|
||||
|
||||
for(unsigned x = 0; x < 8; x++)
|
||||
{
|
||||
register int32x4_t accum;
|
||||
register int32x2_t sum2;
|
||||
|
||||
accum = vdupq_n_s32(0);
|
||||
accum = vmlal_s16(accum, c0, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 0, sizeof(int16x4_t))));
|
||||
accum = vmlal_s16(accum, c1, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 4, sizeof(int16x4_t))));
|
||||
sum2 = vadd_s32(vget_high_s32(accum), vget_low_s32(accum));
|
||||
sum2 = vpadd_s32(sum2, sum2);
|
||||
vst1_lane_s32(buf + x, sum2, 0);
|
||||
}
|
||||
|
||||
for(unsigned x = 0; x < 8; x++)
|
||||
{
|
||||
if(sizeof(T) == 1)
|
||||
out_coeff[(col * 8) + x] = Mask9ClampS8((buf[x] + 0x4000) >> 15);
|
||||
else
|
||||
out_coeff[(x * 8) + col] = (buf[x] + 0x4000) >> 15;
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
//
|
||||
//
|
||||
#else
|
||||
//
|
||||
//
|
||||
//
|
||||
template<typename T>
|
||||
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
|
||||
{
|
||||
for(unsigned col = 0; col < 8; col++)
|
||||
{
|
||||
for(unsigned x = 0; x < 8; x++)
|
||||
{
|
||||
int32 sum = 0;
|
||||
|
||||
for(unsigned u = 0; u < 8; u++)
|
||||
{
|
||||
sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]);
|
||||
}
|
||||
|
||||
if(sizeof(T) == 1)
|
||||
out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15);
|
||||
else
|
||||
out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15;
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
//
|
||||
//
|
||||
#endif
|
||||
|
||||
static NO_INLINE void IDCT(int16 *in_coeff, int8 *out_coeff)
|
||||
{
|
||||
alignas(16) int16 tmpbuf[64];
|
||||
|
||||
IDCT_1D_Multi<int16>(in_coeff, tmpbuf);
|
||||
IDCT_1D_Multi<int8>(tmpbuf, out_coeff);
|
||||
}
|
||||
#pragma GCC pop_options
|
||||
//
|
||||
//
|
||||
////////////////////////
|
||||
//
|
||||
//
|
||||
#pragma GCC push_options
|
||||
|
||||
#if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)))
|
||||
//
|
||||
//
|
||||
//
|
||||
#pragma GCC target("sse2")
|
||||
template<typename T>
|
||||
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
|
||||
{
|
||||
for(unsigned col = 0; col < 8; col++)
|
||||
{
|
||||
__m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);
|
||||
|
||||
for(unsigned x = 0; x < 8; x++)
|
||||
{
|
||||
__m128i sum;
|
||||
__m128i m;
|
||||
alignas(16) int32 tmp[4];
|
||||
|
||||
m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
|
||||
sum = _mm_madd_epi16(m, c);
|
||||
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
|
||||
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));
|
||||
|
||||
//_mm_store_ss((float *)&tmp[0], (__m128)sum);
|
||||
_mm_store_si128((__m128i*)tmp, sum);
|
||||
|
||||
if(sizeof(T) == 1)
|
||||
out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
|
||||
else
|
||||
out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
//
|
||||
//
|
||||
#elif 0 //defined(HAVE_NEON_INTRINSICS)
|
||||
//
|
||||
//
|
||||
//
|
||||
template<typename T>
|
||||
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
|
||||
{
|
||||
for(unsigned col = 0; col < 8; col++)
|
||||
{
|
||||
register int16x4_t c0 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 0, sizeof(int16x4_t)));
|
||||
register int16x4_t c1 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 4, sizeof(int16x4_t)));
|
||||
int32 buf[8];
|
||||
|
||||
for(unsigned x = 0; x < 8; x++)
|
||||
{
|
||||
register int32x4_t accum;
|
||||
register int32x2_t sum2;
|
||||
|
||||
accum = vdupq_n_s32(0);
|
||||
accum = vmlal_s16(accum, c0, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 0, sizeof(int16x4_t))));
|
||||
accum = vmlal_s16(accum, c1, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 4, sizeof(int16x4_t))));
|
||||
sum2 = vadd_s32(vget_high_s32(accum), vget_low_s32(accum));
|
||||
sum2 = vpadd_s32(sum2, sum2);
|
||||
vst1_lane_s32(buf + x, sum2, 0);
|
||||
}
|
||||
|
||||
for(unsigned x = 0; x < 8; x++)
|
||||
{
|
||||
if(sizeof(T) == 1)
|
||||
out_coeff[(col * 8) + x] = Mask9ClampS8((buf[x] + 0x4000) >> 15);
|
||||
else
|
||||
out_coeff[(x * 8) + col] = (buf[x] + 0x4000) >> 15;
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
//
|
||||
//
|
||||
#else
|
||||
//
|
||||
//
|
||||
//
|
||||
template<typename T>
|
||||
static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
|
||||
{
|
||||
for(unsigned col = 0; col < 8; col++)
|
||||
{
|
||||
for(unsigned x = 0; x < 8; x++)
|
||||
{
|
||||
int32 sum = 0;
|
||||
|
||||
for(unsigned u = 0; u < 8; u++)
|
||||
{
|
||||
sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]);
|
||||
}
|
||||
|
||||
if(sizeof(T) == 1)
|
||||
out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15);
|
||||
else
|
||||
out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15;
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
//
|
||||
//
|
||||
#endif
|
||||
|
||||
static NO_INLINE void IDCT(int16 *in_coeff, int8 *out_coeff)
|
||||
{
|
||||
alignas(16) int16 tmpbuf[64];
|
||||
|
||||
IDCT_1D_Multi<int16>(in_coeff, tmpbuf);
|
||||
IDCT_1D_Multi<int8>(tmpbuf, out_coeff);
|
||||
}
|
||||
#pragma GCC pop_options
|
||||
//
|
||||
//
|
||||
///////////////////////
|
||||
|
||||
static INLINE void YCbCr_to_RGB(const int8 y, const int8 cb, const int8 cr, int &r, int &g, int &b)
|
||||
|
@ -519,9 +519,9 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
|
|||
int ci = sign_10_to_s16(V & 0x3FF);
|
||||
int tmp;
|
||||
|
||||
if(q != 0)
|
||||
tmp = (int32)((uint32)(ci * q) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
|
||||
else
|
||||
if(q != 0)
|
||||
tmp = (int32)((uint32)(ci * q) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
|
||||
else
|
||||
tmp = (uint32)(ci * 2) << 4;
|
||||
|
||||
// Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8?
|
||||
|
@ -552,9 +552,9 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
|
|||
int ci = sign_10_to_s16(V & 0x3FF);
|
||||
int tmp;
|
||||
|
||||
if(q != 0)
|
||||
tmp = (int32)((uint32)((ci * q) >> 3) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
|
||||
else
|
||||
if(q != 0)
|
||||
tmp = (int32)((uint32)((ci * q) >> 3) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
|
||||
else
|
||||
tmp = (uint32)(ci * 2) << 4;
|
||||
|
||||
// Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8?
|
||||
|
@ -572,16 +572,16 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
|
|||
|
||||
switch(DecodeWB)
|
||||
{
|
||||
case 0: IDCT(Coeff, MDAP(block_cr)); break;
|
||||
case 1: IDCT(Coeff, MDAP(block_cb)); break;
|
||||
case 2: IDCT(Coeff, MDAP(block_y)); break;
|
||||
case 3: IDCT(Coeff, MDAP(block_y)); break;
|
||||
case 4: IDCT(Coeff, MDAP(block_y)); break;
|
||||
case 0: IDCT(Coeff, MDAP(block_cr)); break;
|
||||
case 1: IDCT(Coeff, MDAP(block_cb)); break;
|
||||
case 2: IDCT(Coeff, MDAP(block_y)); break;
|
||||
case 3: IDCT(Coeff, MDAP(block_y)); break;
|
||||
case 4: IDCT(Coeff, MDAP(block_y)); break;
|
||||
case 5: IDCT(Coeff, MDAP(block_y)); break;
|
||||
}
|
||||
|
||||
// Timing in the PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is
|
||||
// about 512.
|
||||
// Timing in the PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is
|
||||
// about 512.
|
||||
*eat_cycles += 512;
|
||||
|
||||
if(DecodeWB >= 2)
|
||||
|
@ -604,7 +604,7 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
|
|||
//
|
||||
#define MDEC_WAIT_COND(n) { case __COUNTER__: if(!(n)) { MDRPhase = __COUNTER__ - MDRPhaseBias - 1; return; } }
|
||||
|
||||
#define MDEC_WRITE_FIFO(n) { MDEC_WAIT_COND(OutFIFO.CanWrite()); OutFIFO.Write(n); }
|
||||
#define MDEC_WRITE_FIFO(n) { MDEC_WAIT_COND(OutFIFO.CanWrite()); OutFIFO.Write(n); }
|
||||
#define MDEC_READ_FIFO(n) { MDEC_WAIT_COND(InFIFO.CanRead()); n = InFIFO.Read(); }
|
||||
#define MDEC_EAT_CLOCKS(n) { ClockCounter -= (n); MDEC_WAIT_COND(ClockCounter > 0); }
|
||||
|
||||
|
@ -682,7 +682,7 @@ MDFN_FASTCALL void MDEC_Run(int32 clocks)
|
|||
PixelBufferReadOffset = 0;
|
||||
while(PixelBufferReadOffset < PixelBufferCount32)
|
||||
{
|
||||
MDEC_WRITE_FIFO(MDFN_de32lsb<true>(&PixelBuffer.pix32[PixelBufferReadOffset++]));
|
||||
MDEC_WRITE_FIFO((MDFN_de32lsb<true>(&PixelBuffer.pix32[PixelBufferReadOffset++])));
|
||||
}
|
||||
} while(InCounter != 0xFFFF);
|
||||
}
|
||||
|
@ -746,52 +746,51 @@ MDFN_FASTCALL void MDEC_Run(int32 clocks)
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
MDFN_FASTCALL void MDEC_DMAWrite(uint32 V)
|
||||
{
|
||||
if(InFIFO.CanWrite())
|
||||
{
|
||||
InFIFO.Write(V);
|
||||
MDEC_Run(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA write when input FIFO is full!!\n");
|
||||
}
|
||||
}
|
||||
|
||||
MDFN_FASTCALL uint32 MDEC_DMARead(uint32* offs)
|
||||
{
|
||||
uint32 V = 0;
|
||||
|
||||
*offs = 0;
|
||||
|
||||
if(MDFN_LIKELY(OutFIFO.CanRead()))
|
||||
{
|
||||
V = OutFIFO.Read();
|
||||
|
||||
*offs = (RAMOffsetY & 0x7) * RAMOffsetWWS;
|
||||
|
||||
if(RAMOffsetY & 0x08)
|
||||
{
|
||||
*offs = (*offs - RAMOffsetWWS*7);
|
||||
}
|
||||
|
||||
RAMOffsetCounter--;
|
||||
if(!RAMOffsetCounter)
|
||||
{
|
||||
RAMOffsetCounter = RAMOffsetWWS;
|
||||
RAMOffsetY++;
|
||||
}
|
||||
|
||||
MDEC_Run(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA read when output FIFO is empty!\n");
|
||||
}
|
||||
|
||||
return(V);
|
||||
MDFN_FASTCALL void MDEC_DMAWrite(uint32 V)
|
||||
{
|
||||
if(InFIFO.CanWrite())
|
||||
{
|
||||
InFIFO.Write(V);
|
||||
MDEC_Run(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA write when input FIFO is full!!\n");
|
||||
}
|
||||
}
|
||||
|
||||
MDFN_FASTCALL uint32 MDEC_DMARead(uint32* offs)
|
||||
{
|
||||
uint32 V = 0;
|
||||
|
||||
*offs = 0;
|
||||
|
||||
if(MDFN_LIKELY(OutFIFO.CanRead()))
|
||||
{
|
||||
V = OutFIFO.Read();
|
||||
|
||||
*offs = (RAMOffsetY & 0x7) * RAMOffsetWWS;
|
||||
|
||||
if(RAMOffsetY & 0x08)
|
||||
{
|
||||
*offs = (*offs - RAMOffsetWWS*7);
|
||||
}
|
||||
|
||||
RAMOffsetCounter--;
|
||||
if(!RAMOffsetCounter)
|
||||
{
|
||||
RAMOffsetCounter = RAMOffsetWWS;
|
||||
RAMOffsetY++;
|
||||
}
|
||||
|
||||
MDEC_Run(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA read when output FIFO is empty!\n");
|
||||
}
|
||||
|
||||
return(V);
|
||||
}
|
||||
|
||||
bool MDEC_DMACanWrite(void)
|
||||
|
|
Loading…
Reference in New Issue