Render3D / SoftRasterizer: Add some more NEON optimizations.

This commit is contained in:
rogerman 2022-04-08 03:43:07 -07:00
parent e8de3db99c
commit 2e6b938378
5 changed files with 183 additions and 37 deletions

View File

@ -1,7 +1,7 @@
/*
Copyright (C) 2006 yopyop
Copyright (C) 2006-2007 shash
Copyright (C) 2008-2021 DeSmuME team
Copyright (C) 2008-2022 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -683,6 +683,8 @@ public:
class OpenGLRenderer : public Render3D_AVX2
#elif defined(ENABLE_SSE2)
class OpenGLRenderer : public Render3D_SSE2
#elif defined(ENABLE_NEON_A64)
class OpenGLRenderer : public Render3D_NEON
#elif defined(ENABLE_ALTIVEC)
class OpenGLRenderer : public Render3D_AltiVec
#else

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2009-2021 DeSmuME team
Copyright (C) 2009-2022 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -1423,6 +1423,8 @@ static Render3D* SoftRasterizerRendererCreate()
return new SoftRasterizerRenderer_AVX2;
#elif defined(ENABLE_SSE2)
return new SoftRasterizerRenderer_SSE2;
#elif defined(ENABLE_NEON_A64)
return new SoftRasterizerRenderer_NEON;
#elif defined(ENABLE_ALTIVEC)
return new SoftRasterizerRenderer_AltiVec;
#else
@ -1438,6 +1440,8 @@ static void SoftRasterizerRendererDestroy()
SoftRasterizerRenderer_AVX2 *oldRenderer = (SoftRasterizerRenderer_AVX2 *)CurrentRenderer;
#elif defined(ENABLE_SSE2)
SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer;
#elif defined(ENABLE_NEON_A64)
SoftRasterizerRenderer_NEON *oldRenderer = (SoftRasterizerRenderer_NEON *)CurrentRenderer;
#elif defined(ENABLE_ALTIVEC)
SoftRasterizerRenderer_AltiVec *oldRenderer = (SoftRasterizerRenderer_AltiVec *)CurrentRenderer;
#else
@ -2501,7 +2505,7 @@ Render3DError SoftRasterizerRenderer::SetFramebufferSize(size_t w, size_t h)
return RENDER3DERROR_NOERR;
}
#if defined(ENABLE_AVX) || defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC)
#if defined(ENABLE_AVX) || defined(ENABLE_SSE2) || defined(ENABLE_NEON_A64) || defined(ENABLE_ALTIVEC)
template <size_t SIMDBYTES>
SoftRasterizer_SIMD<SIMDBYTES>::SoftRasterizer_SIMD()
@ -2611,7 +2615,7 @@ Render3DError SoftRasterizer_SIMD<SIMDBYTES>::SetFramebufferSize(size_t w, size_
return RENDER3DERROR_NOERR;
}
#endif
#endif // defined(ENABLE_AVX) || defined(ENABLE_SSE2) || defined(ENABLE_NEON_A64) || defined(ENABLE_ALTIVEC)
#if defined(ENABLE_AVX2)
@ -2687,6 +2691,74 @@ void SoftRasterizerRenderer_SSE2::ClearUsingValues_Execute(const size_t startPix
}
}
#elif defined(ENABLE_NEON_A64)
void SoftRasterizerRenderer_NEON::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes)
{
this->_clearColor_v128u32x4.val[0] = vdupq_n_u32(clearColor6665.color);
this->_clearColor_v128u32x4.val[1] = this->_clearColor_v128u32x4.val[0];
this->_clearColor_v128u32x4.val[2] = this->_clearColor_v128u32x4.val[0];
this->_clearColor_v128u32x4.val[3] = this->_clearColor_v128u32x4.val[0];
this->_clearDepth_v128u32x4.val[0] = vdupq_n_u32(clearAttributes.depth);
this->_clearDepth_v128u32x4.val[1] = this->_clearDepth_v128u32x4.val[0];
this->_clearDepth_v128u32x4.val[2] = this->_clearDepth_v128u32x4.val[0];
this->_clearDepth_v128u32x4.val[3] = this->_clearDepth_v128u32x4.val[0];
this->_clearAttrOpaquePolyID_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.opaquePolyID);
this->_clearAttrOpaquePolyID_v128u8x4.val[1] = this->_clearAttrOpaquePolyID_v128u8x4.val[0];
this->_clearAttrOpaquePolyID_v128u8x4.val[2] = this->_clearAttrOpaquePolyID_v128u8x4.val[0];
this->_clearAttrOpaquePolyID_v128u8x4.val[3] = this->_clearAttrOpaquePolyID_v128u8x4.val[0];
this->_clearAttrTranslucentPolyID_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.translucentPolyID);
this->_clearAttrTranslucentPolyID_v128u8x4.val[1] = this->_clearAttrTranslucentPolyID_v128u8x4.val[0];
this->_clearAttrTranslucentPolyID_v128u8x4.val[2] = this->_clearAttrTranslucentPolyID_v128u8x4.val[0];
this->_clearAttrTranslucentPolyID_v128u8x4.val[3] = this->_clearAttrTranslucentPolyID_v128u8x4.val[0];
this->_clearAttrStencil_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.stencil);
this->_clearAttrStencil_v128u8x4.val[1] = this->_clearAttrStencil_v128u8x4.val[0];
this->_clearAttrStencil_v128u8x4.val[2] = this->_clearAttrStencil_v128u8x4.val[0];
this->_clearAttrStencil_v128u8x4.val[3] = this->_clearAttrStencil_v128u8x4.val[0];
this->_clearAttrIsFogged_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.isFogged);
this->_clearAttrIsFogged_v128u8x4.val[1] = this->_clearAttrIsFogged_v128u8x4.val[0];
this->_clearAttrIsFogged_v128u8x4.val[2] = this->_clearAttrIsFogged_v128u8x4.val[0];
this->_clearAttrIsFogged_v128u8x4.val[3] = this->_clearAttrIsFogged_v128u8x4.val[0];
this->_clearAttrIsTranslucentPoly_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.isTranslucentPoly);
this->_clearAttrIsTranslucentPoly_v128u8x4.val[1] = this->_clearAttrIsTranslucentPoly_v128u8x4.val[0];
this->_clearAttrIsTranslucentPoly_v128u8x4.val[2] = this->_clearAttrIsTranslucentPoly_v128u8x4.val[0];
this->_clearAttrIsTranslucentPoly_v128u8x4.val[3] = this->_clearAttrIsTranslucentPoly_v128u8x4.val[0];
this->_clearAttrPolyFacing_v128u8x4.val[0] = vdupq_n_u8(clearAttributes.polyFacing);
this->_clearAttrPolyFacing_v128u8x4.val[1] = this->_clearAttrPolyFacing_v128u8x4.val[0];
this->_clearAttrPolyFacing_v128u8x4.val[2] = this->_clearAttrPolyFacing_v128u8x4.val[0];
this->_clearAttrPolyFacing_v128u8x4.val[3] = this->_clearAttrPolyFacing_v128u8x4.val[0];
}
void SoftRasterizerRenderer_NEON::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel)
{
for (size_t i = startPixel; i < endPixel; i+=(sizeof(v128u8)*4))
{
vst1q_u32_x4((u32 *)(this->_framebufferColor + i) + 0, this->_clearColor_v128u32x4);
vst1q_u32_x4((u32 *)(this->_framebufferColor + i) + 16, this->_clearColor_v128u32x4);
vst1q_u32_x4((u32 *)(this->_framebufferColor + i) + 32, this->_clearColor_v128u32x4);
vst1q_u32_x4((u32 *)(this->_framebufferColor + i) + 48, this->_clearColor_v128u32x4);
vst1q_u32_x4((this->_framebufferAttributes->depth + i) + 0, this->_clearDepth_v128u32x4);
vst1q_u32_x4((this->_framebufferAttributes->depth + i) + 16, this->_clearDepth_v128u32x4);
vst1q_u32_x4((this->_framebufferAttributes->depth + i) + 32, this->_clearDepth_v128u32x4);
vst1q_u32_x4((this->_framebufferAttributes->depth + i) + 48, this->_clearDepth_v128u32x4);
vst1q_u8_x4((this->_framebufferAttributes->opaquePolyID + i), this->_clearAttrOpaquePolyID_v128u8x4);
vst1q_u8_x4((this->_framebufferAttributes->translucentPolyID + i), this->_clearAttrTranslucentPolyID_v128u8x4);
vst1q_u8_x4((this->_framebufferAttributes->stencil + i), this->_clearAttrStencil_v128u8x4);
vst1q_u8_x4((this->_framebufferAttributes->isFogged + i), this->_clearAttrIsFogged_v128u8x4);
vst1q_u8_x4((this->_framebufferAttributes->isTranslucentPoly + i), this->_clearAttrIsTranslucentPoly_v128u8x4);
vst1q_u8_x4((this->_framebufferAttributes->polyFacing + i), this->_clearAttrPolyFacing_v128u8x4);
}
}
#elif defined(ENABLE_ALTIVEC)
void SoftRasterizerRenderer_AltiVec::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes)
@ -2727,7 +2799,7 @@ void SoftRasterizerRenderer_AltiVec::LoadClearValues(const FragmentColor &clearC
void SoftRasterizerRenderer_AltiVec::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel)
{
for (size_t i = startPixel; i < endPixel; i+=16)
for (size_t i = startPixel; i < endPixel; i+=sizeof(v128u8))
{
vec_st(this->_clearColor_v128u32, (i * 4) + 0, this->_framebufferColor);
vec_st(this->_clearColor_v128u32, (i * 4) + 16, this->_framebufferColor);

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2009-2021 DeSmuME team
Copyright (C) 2009-2022 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -21,9 +21,6 @@
#include "render3D.h"
#include "gfx3d.h"
#ifdef ENABLE_SSE2
#include <emmintrin.h>
#endif
#define SOFTRASTERIZER_MAX_THREADS 32
@ -138,6 +135,8 @@ public:
class SoftRasterizerRenderer : public Render3D_AVX2
#elif defined(ENABLE_SSE2)
class SoftRasterizerRenderer : public Render3D_SSE2
#elif defined(ENABLE_NEON_A64)
class SoftRasterizerRenderer : public Render3D_NEON
#elif defined(ENABLE_ALTIVEC)
class SoftRasterizerRenderer : public Render3D_AltiVec
#else
@ -218,26 +217,6 @@ template <size_t SIMDBYTES>
class SoftRasterizer_SIMD : public SoftRasterizerRenderer
{
protected:
#if defined(ENABLE_AVX2)
v256u32 _clearColor_v256u32;
v256u32 _clearDepth_v256u32;
v256u8 _clearAttrOpaquePolyID_v256u8;
v256u8 _clearAttrTranslucentPolyID_v256u8;
v256u8 _clearAttrStencil_v256u8;
v256u8 _clearAttrIsFogged_v256u8;
v256u8 _clearAttrIsTranslucentPoly_v256u8;
v256u8 _clearAttrPolyFacing_v256u8;
#elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC)
v128u32 _clearColor_v128u32;
v128u32 _clearDepth_v128u32;
v128u8 _clearAttrOpaquePolyID_v128u8;
v128u8 _clearAttrTranslucentPolyID_v128u8;
v128u8 _clearAttrStencil_v128u8;
v128u8 _clearAttrIsFogged_v128u8;
v128u8 _clearAttrIsTranslucentPoly_v128u8;
v128u8 _clearAttrPolyFacing_v128u8;
#endif
virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) = 0;
virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes);
@ -251,6 +230,15 @@ public:
class SoftRasterizerRenderer_AVX2 : public SoftRasterizer_SIMD<32>
{
protected:
v256u32 _clearColor_v256u32;
v256u32 _clearDepth_v256u32;
v256u8 _clearAttrOpaquePolyID_v256u8;
v256u8 _clearAttrTranslucentPolyID_v256u8;
v256u8 _clearAttrStencil_v256u8;
v256u8 _clearAttrIsFogged_v256u8;
v256u8 _clearAttrIsTranslucentPoly_v256u8;
v256u8 _clearAttrPolyFacing_v256u8;
virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes);
public:
@ -261,6 +249,34 @@ public:
class SoftRasterizerRenderer_SSE2 : public SoftRasterizer_SIMD<16>
{
protected:
v128u32 _clearColor_v128u32;
v128u32 _clearDepth_v128u32;
v128u8 _clearAttrOpaquePolyID_v128u8;
v128u8 _clearAttrTranslucentPolyID_v128u8;
v128u8 _clearAttrStencil_v128u8;
v128u8 _clearAttrIsFogged_v128u8;
v128u8 _clearAttrIsTranslucentPoly_v128u8;
v128u8 _clearAttrPolyFacing_v128u8;
virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes);
public:
virtual void ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel);
};
#elif defined(ENABLE_NEON_A64)
class SoftRasterizerRenderer_NEON : public SoftRasterizer_SIMD<16>
{
protected:
uint32x4x4_t _clearColor_v128u32x4;
uint32x4x4_t _clearDepth_v128u32x4;
uint8x16x4_t _clearAttrOpaquePolyID_v128u8x4;
uint8x16x4_t _clearAttrTranslucentPolyID_v128u8x4;
uint8x16x4_t _clearAttrStencil_v128u8x4;
uint8x16x4_t _clearAttrIsFogged_v128u8x4;
uint8x16x4_t _clearAttrIsTranslucentPoly_v128u8x4;
uint8x16x4_t _clearAttrPolyFacing_v128u8x4;
virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes);
public:
@ -271,6 +287,15 @@ public:
class SoftRasterizerRenderer_AltiVec : public SoftRasterizer_SIMD<16>
{
protected:
v128u32 _clearColor_v128u32;
v128u32 _clearDepth_v128u32;
v128u8 _clearAttrOpaquePolyID_v128u8;
v128u8 _clearAttrTranslucentPolyID_v128u8;
v128u8 _clearAttrStencil_v128u8;
v128u8 _clearAttrIsFogged_v128u8;
v128u8 _clearAttrIsTranslucentPoly_v128u8;
v128u8 _clearAttrPolyFacing_v128u8;
virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes);
public:

View File

@ -20,10 +20,6 @@
#include <string.h>
#ifdef ENABLE_SSE2
#include <emmintrin.h>
#endif
#include "utils/bits.h"
#include "MMU.h"
#include "NDSSystem.h"
@ -779,6 +775,7 @@ Render3DError Render3D_SIMD<SIMDBYTES>::SetFramebufferSize(size_t w, size_t h)
}
#if defined(ENABLE_AVX2)
void Render3D_AVX2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
{
const __m256i calcDepthConstants = _mm256_set1_epi32(0x01FF0200);
@ -822,7 +819,9 @@ void Render3D_AVX2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u
_mm256_store_si256( (__m256i *)(outFog + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) );
}
}
#elif defined(ENABLE_SSE2)
void Render3D_SSE2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
{
const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200);
@ -866,7 +865,46 @@ void Render3D_SSE2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u
_mm_store_si128((__m128i *)(outFog + i), _mm_packs_epi16(clearFogLo, clearFogHi));
}
}
#elif defined(ENABLE_NEON_A64)
void Render3D_NEON::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16))
{
// Copy the colors to the color buffer.
vst1q_u16_x2( outColor16 + i, vld1q_u16_x2(inColor16 + i) );
// Write the depth values to the depth buffer using the following formula from GBATEK.
// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
// D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF);
//
// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
// D24 = (D15 * 0x0200) + 0x01FF;
const uint16x8x2_t clearDepth = vld1q_u16_x2(inDepth16 + i);
const uint16x8x2_t clearDepthValue = {
vandq_u16(clearDepth.val[0], vdupq_n_u16(0x7FFF)),
vandq_u16(clearDepth.val[1], vdupq_n_u16(0x7FFF))
};
const uint32x4x4_t calcDepth = {
vmlal_n_u16( vdupq_n_u32(0x000001FF), vget_low_u16(clearDepthValue.val[0]), 0x0200 ),
vmlal_n_u16( vdupq_n_u32(0x000001FF), vget_high_u16(clearDepthValue.val[0]), 0x0200 ),
vmlal_n_u16( vdupq_n_u32(0x000001FF), vget_low_u16(clearDepthValue.val[1]), 0x0200 ),
vmlal_n_u16( vdupq_n_u32(0x000001FF), vget_high_u16(clearDepthValue.val[1]), 0x0200 )
};
vst1q_u32_x4(outDepth24 + i, calcDepth);
// Write the fog flags to the fog flag buffer.
vst1q_u8( outFog + i, vuzp1q_u16(vshrq_n_u16(clearDepth.val[0], 15), vshrq_n_u16(clearDepth.val[1], 15)) );
}
}
#elif defined(ENABLE_ALTIVEC)
void Render3D_AltiVec::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
{
const v128u16 calcDepthMul = ((v128u16){0x0200,0,0x0200,0,0x0200,0,0x0200,0});
@ -915,6 +953,7 @@ void Render3D_AltiVec::_ClearImageBaseLoop(const u16 *__restrict inColor16, cons
vec_st( vec_pack(clearFogLo, clearFogHi), 0, outFog + i );
}
}
#endif
template Render3D_SIMD<16>::Render3D_SIMD();

View File

@ -1,6 +1,6 @@
/*
Copyright (C) 2006-2007 shash
Copyright (C) 2007-2019 DeSmuME team
Copyright (C) 2007-2022 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -192,7 +192,7 @@ protected:
CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN u32 clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
template<bool ISCOLORBLANK, bool ISDEPTHBLANK> void _ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16,
u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
@ -286,7 +286,7 @@ public:
class Render3D_AVX2 : public Render3D_SIMD<32>
{
public:
public:
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
};
@ -294,7 +294,15 @@ public:
class Render3D_SSE2 : public Render3D_SIMD<16>
{
public:
public:
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
};
#elif defined(ENABLE_NEON_A64)
class Render3D_NEON : public Render3D_SIMD<16>
{
public:
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
};