diff --git a/psx/octoshock/psx/mdec.cpp b/psx/octoshock/psx/mdec.cpp index ff88243974..52028ee12e 100644 --- a/psx/octoshock/psx/mdec.cpp +++ b/psx/octoshock/psx/mdec.cpp @@ -1,25 +1,25 @@ -/******************************************************************************/ -/* Mednafen Sony PS1 Emulation Module */ -/******************************************************************************/ -/* mdec.cpp: -** Copyright (C) 2011-2016 Mednafen Team -** -** This program is free software; you can redistribute it and/or -** modify it under the terms of the GNU General Public License -** as published by the Free Software Foundation; either version 2 -** of the License, or (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software Foundation, Inc., -** 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -*/ - -#pragma GCC optimize ("unroll-loops") +/******************************************************************************/ +/* Mednafen Sony PS1 Emulation Module */ +/******************************************************************************/ +/* mdec.cpp: +** Copyright (C) 2011-2016 Mednafen Team +** +** This program is free software; you can redistribute it and/or +** modify it under the terms of the GNU General Public License +** as published by the Free Software Foundation; either version 2 +** of the License, or (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, Inc., +** 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +#pragma GCC optimize ("unroll-loops") /* MDEC_READ_FIFO(tfr) vs InCounter vs MDEC_DMACanRead() is a bit fragile right now. Actually, the entire horrible state machine monstrosity is fragile. @@ -73,8 +73,8 @@ #include #endif -#if 0 //defined(HAVE_NEON_INTRINSICS) -#include +#if 0 //defined(HAVE_NEON_INTRINSICS) +#include #endif #if defined(HAVE_ALTIVEC_INTRINSICS) && defined(HAVE_ALTIVEC_H) @@ -90,7 +90,7 @@ namespace MDFN_IEN_PSX static int32 ClockCounter; static unsigned MDRPhase; -static FastFIFO InFIFO; +static FastFIFO InFIFO; static FastFIFO OutFIFO; static int8 block_y[8][8]; @@ -242,125 +242,125 @@ static INLINE int8 Mask9ClampS8(int32 v) return v; } -//////////////////////// -// -// -#pragma GCC push_options - -#if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))) -// -// -// -#pragma GCC target("sse2") -template -static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff) -{ - for(unsigned col = 0; col < 8; col++) - { - __m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]); - - for(unsigned x = 0; x < 8; x++) - { - __m128i sum; - __m128i m; - alignas(16) int32 tmp[4]; - - m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]); - sum = _mm_madd_epi16(m, c); - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6))); - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2))); - - //_mm_store_ss((float *)&tmp[0], (__m128)sum); - _mm_store_si128((__m128i*)tmp, sum); - - if(sizeof(T) == 1) - out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15); - else - out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15; - } - } -} -// -// -// -#elif 0 //defined(HAVE_NEON_INTRINSICS) -// -// -// -template -static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff) -{ - for(unsigned col = 0; col < 8; col++) - { - register int16x4_t c0 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 0, sizeof(int16x4_t))); - register int16x4_t c1 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 4, sizeof(int16x4_t))); - int32 buf[8]; - - for(unsigned x = 0; x < 8; x++) - { - register int32x4_t accum; - register int32x2_t sum2; - - accum = vdupq_n_s32(0); - accum = vmlal_s16(accum, c0, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 0, sizeof(int16x4_t)))); - accum = vmlal_s16(accum, c1, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 4, sizeof(int16x4_t)))); - sum2 = vadd_s32(vget_high_s32(accum), vget_low_s32(accum)); - sum2 = vpadd_s32(sum2, sum2); - vst1_lane_s32(buf + x, sum2, 0); - } - - for(unsigned x = 0; x < 8; x++) - { - if(sizeof(T) == 1) - out_coeff[(col * 8) + x] = Mask9ClampS8((buf[x] + 0x4000) >> 15); - else - out_coeff[(x * 8) + col] = (buf[x] + 0x4000) >> 15; - } - } -} -// -// -// -#else -// -// -// -template -static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff) -{ - for(unsigned col = 0; col < 8; col++) - { - for(unsigned x = 0; x < 8; x++) - { - int32 sum = 0; - - for(unsigned u = 0; u < 8; u++) - { - sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]); - } - - if(sizeof(T) == 1) - out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15); - else - out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15; - } - } -} -// -// -// -#endif - -static NO_INLINE void IDCT(int16 *in_coeff, int8 *out_coeff) -{ - alignas(16) int16 tmpbuf[64]; - - IDCT_1D_Multi(in_coeff, tmpbuf); - IDCT_1D_Multi(tmpbuf, out_coeff); -} -#pragma GCC pop_options -// -// +//////////////////////// +// +// +#pragma GCC push_options + +#if defined(__SSE2__) || (defined(ARCH_X86) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))) +// +// +// +#pragma GCC target("sse2") +template +static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff) +{ + for(unsigned col = 0; col < 8; col++) + { + __m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]); + + for(unsigned x = 0; x < 8; x++) + { + __m128i sum; + __m128i m; + alignas(16) int32 tmp[4]; + + m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]); + sum = _mm_madd_epi16(m, c); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6))); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2))); + + //_mm_store_ss((float *)&tmp[0], (__m128)sum); + _mm_store_si128((__m128i*)tmp, sum); + + if(sizeof(T) == 1) + out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15); + else + out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15; + } + } +} +// +// +// +#elif 0 //defined(HAVE_NEON_INTRINSICS) +// +// +// +template +static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff) +{ + for(unsigned col = 0; col < 8; col++) + { + register int16x4_t c0 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 0, sizeof(int16x4_t))); + register int16x4_t c1 = vld1_s16(MDFN_ASSUME_ALIGNED(in_coeff + col * 8 + 4, sizeof(int16x4_t))); + int32 buf[8]; + + for(unsigned x = 0; x < 8; x++) + { + register int32x4_t accum; + register int32x2_t sum2; + + accum = vdupq_n_s32(0); + accum = vmlal_s16(accum, c0, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 0, sizeof(int16x4_t)))); + accum = vmlal_s16(accum, c1, vld1_s16(MDFN_ASSUME_ALIGNED(IDCTMatrix + x * 8 + 4, sizeof(int16x4_t)))); + sum2 = vadd_s32(vget_high_s32(accum), vget_low_s32(accum)); + sum2 = vpadd_s32(sum2, sum2); + vst1_lane_s32(buf + x, sum2, 0); + } + + for(unsigned x = 0; x < 8; x++) + { + if(sizeof(T) == 1) + out_coeff[(col * 8) + x] = Mask9ClampS8((buf[x] + 0x4000) >> 15); + else + out_coeff[(x * 8) + col] = (buf[x] + 0x4000) >> 15; + } + } +} +// +// +// +#else +// +// +// +template +static INLINE void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff) +{ + for(unsigned col = 0; col < 8; col++) + { + for(unsigned x = 0; x < 8; x++) + { + int32 sum = 0; + + for(unsigned u = 0; u < 8; u++) + { + sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]); + } + + if(sizeof(T) == 1) + out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15); + else + out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15; + } + } +} +// +// +// +#endif + +static NO_INLINE void IDCT(int16 *in_coeff, int8 *out_coeff) +{ + alignas(16) int16 tmpbuf[64]; + + IDCT_1D_Multi(in_coeff, tmpbuf); + IDCT_1D_Multi(tmpbuf, out_coeff); +} +#pragma GCC pop_options +// +// /////////////////////// static INLINE void YCbCr_to_RGB(const int8 y, const int8 cb, const int8 cr, int &r, int &g, int &b) @@ -519,9 +519,9 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles) int ci = sign_10_to_s16(V & 0x3FF); int tmp; - if(q != 0) - tmp = (int32)((uint32)(ci * q) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0); - else + if(q != 0) + tmp = (int32)((uint32)(ci * q) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0); + else tmp = (uint32)(ci * 2) << 4; // Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8? @@ -552,9 +552,9 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles) int ci = sign_10_to_s16(V & 0x3FF); int tmp; - if(q != 0) - tmp = (int32)((uint32)((ci * q) >> 3) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0); - else + if(q != 0) + tmp = (int32)((uint32)((ci * q) >> 3) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0); + else tmp = (uint32)(ci * 2) << 4; // Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8? @@ -572,16 +572,16 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles) switch(DecodeWB) { - case 0: IDCT(Coeff, MDAP(block_cr)); break; - case 1: IDCT(Coeff, MDAP(block_cb)); break; - case 2: IDCT(Coeff, MDAP(block_y)); break; - case 3: IDCT(Coeff, MDAP(block_y)); break; - case 4: IDCT(Coeff, MDAP(block_y)); break; + case 0: IDCT(Coeff, MDAP(block_cr)); break; + case 1: IDCT(Coeff, MDAP(block_cb)); break; + case 2: IDCT(Coeff, MDAP(block_y)); break; + case 3: IDCT(Coeff, MDAP(block_y)); break; + case 4: IDCT(Coeff, MDAP(block_y)); break; case 5: IDCT(Coeff, MDAP(block_y)); break; } - // Timing in the PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is - // about 512. + // Timing in the PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is + // about 512. *eat_cycles += 512; if(DecodeWB >= 2) @@ -604,7 +604,7 @@ static INLINE void WriteImageData(uint16 V, int32* eat_cycles) // #define MDEC_WAIT_COND(n) { case __COUNTER__: if(!(n)) { MDRPhase = __COUNTER__ - MDRPhaseBias - 1; return; } } -#define MDEC_WRITE_FIFO(n) { MDEC_WAIT_COND(OutFIFO.CanWrite()); OutFIFO.Write(n); } +#define MDEC_WRITE_FIFO(n) { MDEC_WAIT_COND(OutFIFO.CanWrite()); OutFIFO.Write(n); } #define MDEC_READ_FIFO(n) { MDEC_WAIT_COND(InFIFO.CanRead()); n = InFIFO.Read(); } #define MDEC_EAT_CLOCKS(n) { ClockCounter -= (n); MDEC_WAIT_COND(ClockCounter > 0); } @@ -682,7 +682,7 @@ MDFN_FASTCALL void MDEC_Run(int32 clocks) PixelBufferReadOffset = 0; while(PixelBufferReadOffset < PixelBufferCount32) { - MDEC_WRITE_FIFO(MDFN_de32lsb(&PixelBuffer.pix32[PixelBufferReadOffset++])); + MDEC_WRITE_FIFO((MDFN_de32lsb(&PixelBuffer.pix32[PixelBufferReadOffset++]))); } } while(InCounter != 0xFFFF); } @@ -746,52 +746,51 @@ MDFN_FASTCALL void MDEC_Run(int32 clocks) } #endif - -MDFN_FASTCALL void MDEC_DMAWrite(uint32 V) -{ - if(InFIFO.CanWrite()) - { - InFIFO.Write(V); - MDEC_Run(0); - } - else - { - PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA write when input FIFO is full!!\n"); - } -} - -MDFN_FASTCALL uint32 MDEC_DMARead(uint32* offs) -{ - uint32 V = 0; - - *offs = 0; - - if(MDFN_LIKELY(OutFIFO.CanRead())) - { - V = OutFIFO.Read(); - - *offs = (RAMOffsetY & 0x7) * RAMOffsetWWS; - - if(RAMOffsetY & 0x08) - { - *offs = (*offs - RAMOffsetWWS*7); - } - - RAMOffsetCounter--; - if(!RAMOffsetCounter) - { - RAMOffsetCounter = RAMOffsetWWS; - RAMOffsetY++; - } - - MDEC_Run(0); - } - else - { - PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA read when output FIFO is empty!\n"); - } - - return(V); +MDFN_FASTCALL void MDEC_DMAWrite(uint32 V) +{ + if(InFIFO.CanWrite()) + { + InFIFO.Write(V); + MDEC_Run(0); + } + else + { + PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA write when input FIFO is full!!\n"); + } +} + +MDFN_FASTCALL uint32 MDEC_DMARead(uint32* offs) +{ + uint32 V = 0; + + *offs = 0; + + if(MDFN_LIKELY(OutFIFO.CanRead())) + { + V = OutFIFO.Read(); + + *offs = (RAMOffsetY & 0x7) * RAMOffsetWWS; + + if(RAMOffsetY & 0x08) + { + *offs = (*offs - RAMOffsetWWS*7); + } + + RAMOffsetCounter--; + if(!RAMOffsetCounter) + { + RAMOffsetCounter = RAMOffsetWWS; + RAMOffsetY++; + } + + MDEC_Run(0); + } + else + { + PSX_DBG(PSX_DBG_WARNING, "[MDEC] DMA read when output FIFO is empty!\n"); + } + + return(V); } bool MDEC_DMACanWrite(void)