From 2e6b9383787b11ec3b134b55d3f63657999f11ab Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 8 Apr 2022 03:43:07 -0700 Subject: [PATCH] Render3D / SoftRasterizer: Add some more NEON optimizations. --- desmume/src/OGLRender.h | 4 +- desmume/src/rasterize.cpp | 80 +++++++++++++++++++++++++++++++++++++-- desmume/src/rasterize.h | 73 +++++++++++++++++++++++------------ desmume/src/render3D.cpp | 47 +++++++++++++++++++++-- desmume/src/render3D.h | 16 ++++++-- 5 files changed, 183 insertions(+), 37 deletions(-) diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index bedd963d6..3c8785a95 100755 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -1,7 +1,7 @@ /* Copyright (C) 2006 yopyop Copyright (C) 2006-2007 shash - Copyright (C) 2008-2021 DeSmuME team + Copyright (C) 2008-2022 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -683,6 +683,8 @@ public: class OpenGLRenderer : public Render3D_AVX2 #elif defined(ENABLE_SSE2) class OpenGLRenderer : public Render3D_SSE2 +#elif defined(ENABLE_NEON_A64) +class OpenGLRenderer : public Render3D_NEON #elif defined(ENABLE_ALTIVEC) class OpenGLRenderer : public Render3D_AltiVec #else diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index a9b939882..7fa5c66b4 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1,5 +1,5 @@ /* - Copyright (C) 2009-2021 DeSmuME team + Copyright (C) 2009-2022 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1423,6 +1423,8 @@ static Render3D* SoftRasterizerRendererCreate() return new SoftRasterizerRenderer_AVX2; #elif defined(ENABLE_SSE2) return new SoftRasterizerRenderer_SSE2; +#elif defined(ENABLE_NEON_A64) + return new SoftRasterizerRenderer_NEON; #elif defined(ENABLE_ALTIVEC) return new SoftRasterizerRenderer_AltiVec; #else @@ -1438,6 +1440,8 @@ static void SoftRasterizerRendererDestroy() SoftRasterizerRenderer_AVX2 *oldRenderer = (SoftRasterizerRenderer_AVX2 *)CurrentRenderer; #elif defined(ENABLE_SSE2) SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer; +#elif defined(ENABLE_NEON_A64) + SoftRasterizerRenderer_NEON *oldRenderer = (SoftRasterizerRenderer_NEON *)CurrentRenderer; #elif defined(ENABLE_ALTIVEC) SoftRasterizerRenderer_AltiVec *oldRenderer = (SoftRasterizerRenderer_AltiVec *)CurrentRenderer; #else @@ -2501,7 +2505,7 @@ Render3DError SoftRasterizerRenderer::SetFramebufferSize(size_t w, size_t h) return RENDER3DERROR_NOERR; } -#if defined(ENABLE_AVX) || defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) +#if defined(ENABLE_AVX) || defined(ENABLE_SSE2) || defined(ENABLE_NEON_A64) || defined(ENABLE_ALTIVEC) template SoftRasterizer_SIMD::SoftRasterizer_SIMD() @@ -2611,7 +2615,7 @@ Render3DError SoftRasterizer_SIMD::SetFramebufferSize(size_t w, size_ return RENDER3DERROR_NOERR; } -#endif +#endif // defined(ENABLE_AVX) || defined(ENABLE_SSE2) || defined(ENABLE_NEON_A64) || defined(ENABLE_ALTIVEC) #if defined(ENABLE_AVX2) @@ -2687,6 +2691,74 @@ void SoftRasterizerRenderer_SSE2::ClearUsingValues_Execute(const size_t startPix } } +#elif defined(ENABLE_NEON_A64) + +void SoftRasterizerRenderer_NEON::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) +{ + this->_clearColor_v128u32x4.val[0] = vdupq_n_u32(clearColor6665.color); + this->_clearColor_v128u32x4.val[1] = this->_clearColor_v128u32x4.val[0]; + this->_clearColor_v128u32x4.val[2] = this->_clearColor_v128u32x4.val[0]; + this->_clearColor_v128u32x4.val[3] = this->_clearColor_v128u32x4.val[0]; + + this->_clearDepth_v128u32x4.val[0] = vdupq_n_u32(clearAttributes.depth); + this->_clearDepth_v128u32x4.val[1] = this->_clearDepth_v128u32x4.val[0]; + this->_clearDepth_v128u32x4.val[2] = this->_clearDepth_v128u32x4.val[0]; + this->_clearDepth_v128u32x4.val[3] = this->_clearDepth_v128u32x4.val[0]; + + this->_clearAttrOpaquePolyID_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.opaquePolyID); + this->_clearAttrOpaquePolyID_v128u8x4.val[1] = this->_clearAttrOpaquePolyID_v128u8x4.val[0]; + this->_clearAttrOpaquePolyID_v128u8x4.val[2] = this->_clearAttrOpaquePolyID_v128u8x4.val[0]; + this->_clearAttrOpaquePolyID_v128u8x4.val[3] = this->_clearAttrOpaquePolyID_v128u8x4.val[0]; + + this->_clearAttrTranslucentPolyID_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.translucentPolyID); + this->_clearAttrTranslucentPolyID_v128u8x4.val[1] = this->_clearAttrTranslucentPolyID_v128u8x4.val[0]; + this->_clearAttrTranslucentPolyID_v128u8x4.val[2] = this->_clearAttrTranslucentPolyID_v128u8x4.val[0]; + this->_clearAttrTranslucentPolyID_v128u8x4.val[3] = this->_clearAttrTranslucentPolyID_v128u8x4.val[0]; + + this->_clearAttrStencil_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.stencil); + this->_clearAttrStencil_v128u8x4.val[1] = this->_clearAttrStencil_v128u8x4.val[0]; + this->_clearAttrStencil_v128u8x4.val[2] = this->_clearAttrStencil_v128u8x4.val[0]; + this->_clearAttrStencil_v128u8x4.val[3] = this->_clearAttrStencil_v128u8x4.val[0]; + + this->_clearAttrIsFogged_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.isFogged); + this->_clearAttrIsFogged_v128u8x4.val[1] = this->_clearAttrIsFogged_v128u8x4.val[0]; + this->_clearAttrIsFogged_v128u8x4.val[2] = this->_clearAttrIsFogged_v128u8x4.val[0]; + this->_clearAttrIsFogged_v128u8x4.val[3] = this->_clearAttrIsFogged_v128u8x4.val[0]; + + this->_clearAttrIsTranslucentPoly_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.isTranslucentPoly); + this->_clearAttrIsTranslucentPoly_v128u8x4.val[1] = this->_clearAttrIsTranslucentPoly_v128u8x4.val[0]; + this->_clearAttrIsTranslucentPoly_v128u8x4.val[2] = this->_clearAttrIsTranslucentPoly_v128u8x4.val[0]; + this->_clearAttrIsTranslucentPoly_v128u8x4.val[3] = this->_clearAttrIsTranslucentPoly_v128u8x4.val[0]; + + this->_clearAttrPolyFacing_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.polyFacing); + this->_clearAttrPolyFacing_v128u8x4.val[1] = this->_clearAttrPolyFacing_v128u8x4.val[0]; + this->_clearAttrPolyFacing_v128u8x4.val[2] = this->_clearAttrPolyFacing_v128u8x4.val[0]; + this->_clearAttrPolyFacing_v128u8x4.val[3] = this->_clearAttrPolyFacing_v128u8x4.val[0]; +} + +void SoftRasterizerRenderer_NEON::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) +{ + for (size_t i = startPixel; i < endPixel; i+=(sizeof(v128u8)*4)) + { + vst1q_u32_x4((u32 *)(this->_framebufferColor + i) + 0, this->_clearColor_v128u32x4); + vst1q_u32_x4((u32 *)(this->_framebufferColor + i) + 16, this->_clearColor_v128u32x4); + vst1q_u32_x4((u32 *)(this->_framebufferColor + i) + 32, this->_clearColor_v128u32x4); + vst1q_u32_x4((u32 *)(this->_framebufferColor + i) + 48, this->_clearColor_v128u32x4); + + vst1q_u32_x4((this->_framebufferAttributes->depth + i) + 0, this->_clearDepth_v128u32x4); + vst1q_u32_x4((this->_framebufferAttributes->depth + i) + 16, this->_clearDepth_v128u32x4); + vst1q_u32_x4((this->_framebufferAttributes->depth + i) + 32, this->_clearDepth_v128u32x4); + vst1q_u32_x4((this->_framebufferAttributes->depth + i) + 48, this->_clearDepth_v128u32x4); + + vst1q_u8_x4((this->_framebufferAttributes->opaquePolyID + i), this->_clearAttrOpaquePolyID_v128u8x4); + vst1q_u8_x4((this->_framebufferAttributes->translucentPolyID + i), this->_clearAttrTranslucentPolyID_v128u8x4); + vst1q_u8_x4((this->_framebufferAttributes->stencil + i), this->_clearAttrStencil_v128u8x4); + vst1q_u8_x4((this->_framebufferAttributes->isFogged + i), this->_clearAttrIsFogged_v128u8x4); + vst1q_u8_x4((this->_framebufferAttributes->isTranslucentPoly + i), this->_clearAttrIsTranslucentPoly_v128u8x4); + vst1q_u8_x4((this->_framebufferAttributes->polyFacing + i), this->_clearAttrPolyFacing_v128u8x4); + } +} + #elif defined(ENABLE_ALTIVEC) void SoftRasterizerRenderer_AltiVec::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) @@ -2727,7 +2799,7 @@ void SoftRasterizerRenderer_AltiVec::LoadClearValues(const FragmentColor &clearC void SoftRasterizerRenderer_AltiVec::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) { - for (size_t i = startPixel; i < endPixel; i+=16) + for (size_t i = startPixel; i < endPixel; i+=sizeof(v128u8)) { vec_st(this->_clearColor_v128u32, (i * 4) + 0, this->_framebufferColor); vec_st(this->_clearColor_v128u32, (i * 4) + 16, this->_framebufferColor); diff --git a/desmume/src/rasterize.h b/desmume/src/rasterize.h index 85c342331..6006f91e1 100644 --- a/desmume/src/rasterize.h +++ b/desmume/src/rasterize.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2009-2021 DeSmuME team + Copyright (C) 2009-2022 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,9 +21,6 @@ #include "render3D.h" #include "gfx3d.h" -#ifdef ENABLE_SSE2 -#include -#endif #define SOFTRASTERIZER_MAX_THREADS 32 @@ -138,6 +135,8 @@ public: class SoftRasterizerRenderer : public Render3D_AVX2 #elif defined(ENABLE_SSE2) class SoftRasterizerRenderer : public Render3D_SSE2 +#elif defined(ENABLE_NEON_A64) +class SoftRasterizerRenderer : public Render3D_NEON #elif defined(ENABLE_ALTIVEC) class SoftRasterizerRenderer : public Render3D_AltiVec #else @@ -218,26 +217,6 @@ template class SoftRasterizer_SIMD : public SoftRasterizerRenderer { protected: -#if defined(ENABLE_AVX2) - v256u32 _clearColor_v256u32; - v256u32 _clearDepth_v256u32; - v256u8 _clearAttrOpaquePolyID_v256u8; - v256u8 _clearAttrTranslucentPolyID_v256u8; - v256u8 _clearAttrStencil_v256u8; - v256u8 _clearAttrIsFogged_v256u8; - v256u8 _clearAttrIsTranslucentPoly_v256u8; - v256u8 _clearAttrPolyFacing_v256u8; -#elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) - v128u32 _clearColor_v128u32; - v128u32 _clearDepth_v128u32; - v128u8 _clearAttrOpaquePolyID_v128u8; - v128u8 _clearAttrTranslucentPolyID_v128u8; - v128u8 _clearAttrStencil_v128u8; - v128u8 _clearAttrIsFogged_v128u8; - v128u8 _clearAttrIsTranslucentPoly_v128u8; - v128u8 _clearAttrPolyFacing_v128u8; -#endif - virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) = 0; virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); @@ -251,6 +230,15 @@ public: class SoftRasterizerRenderer_AVX2 : public SoftRasterizer_SIMD<32> { protected: + v256u32 _clearColor_v256u32; + v256u32 _clearDepth_v256u32; + v256u8 _clearAttrOpaquePolyID_v256u8; + v256u8 _clearAttrTranslucentPolyID_v256u8; + v256u8 _clearAttrStencil_v256u8; + v256u8 _clearAttrIsFogged_v256u8; + v256u8 _clearAttrIsTranslucentPoly_v256u8; + v256u8 _clearAttrPolyFacing_v256u8; + virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); public: @@ -261,6 +249,34 @@ public: class SoftRasterizerRenderer_SSE2 : public SoftRasterizer_SIMD<16> { protected: + v128u32 _clearColor_v128u32; + v128u32 _clearDepth_v128u32; + v128u8 _clearAttrOpaquePolyID_v128u8; + v128u8 _clearAttrTranslucentPolyID_v128u8; + v128u8 _clearAttrStencil_v128u8; + v128u8 _clearAttrIsFogged_v128u8; + v128u8 _clearAttrIsTranslucentPoly_v128u8; + v128u8 _clearAttrPolyFacing_v128u8; + + virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); + +public: + virtual void ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel); +}; + +#elif defined(ENABLE_NEON_A64) +class SoftRasterizerRenderer_NEON : public SoftRasterizer_SIMD<16> +{ +protected: + uint32x4x4_t _clearColor_v128u32x4; + uint32x4x4_t _clearDepth_v128u32x4; + uint8x16x4_t _clearAttrOpaquePolyID_v128u8x4; + uint8x16x4_t _clearAttrTranslucentPolyID_v128u8x4; + uint8x16x4_t _clearAttrStencil_v128u8x4; + uint8x16x4_t _clearAttrIsFogged_v128u8x4; + uint8x16x4_t _clearAttrIsTranslucentPoly_v128u8x4; + uint8x16x4_t _clearAttrPolyFacing_v128u8x4; + virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); public: @@ -271,6 +287,15 @@ public: class SoftRasterizerRenderer_AltiVec : public SoftRasterizer_SIMD<16> { protected: + v128u32 _clearColor_v128u32; + v128u32 _clearDepth_v128u32; + v128u8 _clearAttrOpaquePolyID_v128u8; + v128u8 _clearAttrTranslucentPolyID_v128u8; + v128u8 _clearAttrStencil_v128u8; + v128u8 _clearAttrIsFogged_v128u8; + v128u8 _clearAttrIsTranslucentPoly_v128u8; + v128u8 _clearAttrPolyFacing_v128u8; + virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); public: diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 9b2e16a0f..abc3b9243 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -20,10 +20,6 @@ #include -#ifdef ENABLE_SSE2 -#include -#endif - #include "utils/bits.h" #include "MMU.h" #include "NDSSystem.h" @@ -779,6 +775,7 @@ Render3DError Render3D_SIMD::SetFramebufferSize(size_t w, size_t h) } #if defined(ENABLE_AVX2) + void Render3D_AVX2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog) { const __m256i calcDepthConstants = _mm256_set1_epi32(0x01FF0200); @@ -822,7 +819,9 @@ void Render3D_AVX2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u _mm256_store_si256( (__m256i *)(outFog + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) ); } } + #elif defined(ENABLE_SSE2) + void Render3D_SSE2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog) { const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200); @@ -866,7 +865,46 @@ void Render3D_SSE2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u _mm_store_si128((__m128i *)(outFog + i), _mm_packs_epi16(clearFogLo, clearFogHi)); } } + +#elif defined(ENABLE_NEON_A64) + +void Render3D_NEON::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog) +{ + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16)) + { + // Copy the colors to the color buffer. + vst1q_u16_x2( outColor16 + i, vld1q_u16_x2(inColor16 + i) ); + + // Write the depth values to the depth buffer using the following formula from GBATEK. + // 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane + // D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF); + // + // For now, let's forget GBATEK (which could be wrong) and try using a simpified formula: + // D24 = (D15 * 0x0200) + 0x01FF; + + const uint16x8x2_t clearDepth = vld1q_u16_x2(inDepth16 + i); + + const uint16x8x2_t clearDepthValue = { + vandq_u16(clearDepth.val[0], vdupq_n_u16(0x7FFF)), + vandq_u16(clearDepth.val[1], vdupq_n_u16(0x7FFF)) + }; + + const uint32x4x4_t calcDepth = { + vmlal_n_u16( vdupq_n_u32(0x000001FF), vget_low_u16(clearDepthValue.val[0]), 0x0200 ), + vmlal_n_u16( vdupq_n_u32(0x000001FF), vget_high_u16(clearDepthValue.val[0]), 0x0200 ), + vmlal_n_u16( vdupq_n_u32(0x000001FF), vget_low_u16(clearDepthValue.val[1]), 0x0200 ), + vmlal_n_u16( vdupq_n_u32(0x000001FF), vget_high_u16(clearDepthValue.val[1]), 0x0200 ) + }; + + vst1q_u32_x4(outDepth24 + i, calcDepth); + + // Write the fog flags to the fog flag buffer. + vst1q_u8( outFog + i, vuzp1q_u16(vshrq_n_u16(clearDepth.val[0], 15), vshrq_n_u16(clearDepth.val[1], 15)) ); + } +} + #elif defined(ENABLE_ALTIVEC) + void Render3D_AltiVec::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog) { const v128u16 calcDepthMul = ((v128u16){0x0200,0,0x0200,0,0x0200,0,0x0200,0}); @@ -915,6 +953,7 @@ void Render3D_AltiVec::_ClearImageBaseLoop(const u16 *__restrict inColor16, cons vec_st( vec_pack(clearFogLo, clearFogHi), 0, outFog + i ); } } + #endif template Render3D_SIMD<16>::Render3D_SIMD(); diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h index 149926cc7..8dfe55dff 100644 --- a/desmume/src/render3D.h +++ b/desmume/src/render3D.h @@ -1,6 +1,6 @@ /* Copyright (C) 2006-2007 shash - Copyright (C) 2007-2019 DeSmuME team + Copyright (C) 2007-2022 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -192,7 +192,7 @@ protected: CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; CACHE_ALIGN u32 clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; - + virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); template void _ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); @@ -286,7 +286,7 @@ public: class Render3D_AVX2 : public Render3D_SIMD<32> { -public: +public: virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); }; @@ -294,7 +294,15 @@ public: class Render3D_SSE2 : public Render3D_SIMD<16> { -public: +public: + virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); +}; + +#elif defined(ENABLE_NEON_A64) + +class Render3D_NEON : public Render3D_SIMD<16> +{ +public: virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); };