diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index 3febe67f3..bcc228cdc 100755 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -616,11 +616,15 @@ public: void SetDeposterizeBuffer(void *dstBuffer, void *workingBuffer); void SetUpscalingBuffer(void *upscaleBuffer); }; - -#if defined(ENABLE_SSE2) -class OpenGLRenderer : public Render3D_SSE2 -#else -class OpenGLRenderer : public Render3D + +#if defined(ENABLE_AVX2) +class OpenGLRenderer : public Render3D_AVX2 +#elif defined(ENABLE_SSE2) +class OpenGLRenderer : public Render3D_SSE2 +#elif defined(ENABLE_ALTIVEC) +class OpenGLRenderer : public Render3D_Altivec +#else +class OpenGLRenderer : public Render3D #endif { private: diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 23d1d92aa..29dda14fc 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1118,29 +1118,38 @@ static void* SoftRasterizer_RunRenderEdgeMarkAndFog(void *arg) static void* SoftRasterizer_RunClearUsingValues(void *arg) { SoftRasterizerClearParam *param = (SoftRasterizerClearParam *)arg; - param->renderer->ClearUsingValuesLoop(param->startPixel, param->endPixel); + param->renderer->ClearUsingValues_Execute(param->startPixel, param->endPixel); return NULL; } static Render3D* SoftRasterizerRendererCreate() -{ -#if defined(ENABLE_SSE2) - return new SoftRasterizerRenderer_SSE2; -#else - return new SoftRasterizerRenderer; +{ +#if defined(ENABLE_AVX2) + return new SoftRasterizerRenderer_AVX2; +#elif defined(ENABLE_SSE2) + return new SoftRasterizerRenderer_SSE2; +#elif defined(ENABLE_ALTIVEC) + return new SoftRasterizerRenderer_Altivec; +#else + return new SoftRasterizerRenderer; #endif } static void SoftRasterizerRendererDestroy() { if (CurrentRenderer != BaseRenderer) - { -#if defined(ENABLE_SSE2) - SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer; -#else - SoftRasterizerRenderer *oldRenderer = (SoftRasterizerRenderer *)CurrentRenderer; -#endif + { +#if defined(ENABLE_AVX2) + SoftRasterizerRenderer_AVX2 *oldRenderer = (SoftRasterizerRenderer_AVX2 *)CurrentRenderer; +#elif defined(ENABLE_SSE2) + SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer; +#elif defined(ENABLE_ALTIVEC) + SoftRasterizerRenderer_Altivec *oldRenderer = (SoftRasterizerRenderer_Altivec *)CurrentRenderer; +#else + SoftRasterizerRenderer *oldRenderer = (SoftRasterizerRenderer *)CurrentRenderer; +#endif + CurrentRenderer = BaseRenderer; delete oldRenderer; } @@ -2136,12 +2145,12 @@ Render3DError SoftRasterizerRenderer::ClearUsingImage(const u16 *__restrict colo return RENDER3DERROR_NOERR; } -void SoftRasterizerRenderer::ClearUsingValuesLoop(const size_t startPixel, const size_t endPixel) +void SoftRasterizerRenderer::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) { for (size_t i = startPixel; i < endPixel; i++) { - this->_framebufferAttributes->SetAtIndex(i, this->_clearAttributes); this->_framebufferColor[i] = this->_clearColor6665; + this->_framebufferAttributes->SetAtIndex(i, this->_clearAttributes); } } @@ -2161,7 +2170,7 @@ Render3DError SoftRasterizerRenderer::ClearUsingValues(const FragmentColor &clea } else { - this->ClearUsingValuesLoop(0, this->_framebufferPixCount); + this->ClearUsingValues_Execute(0, this->_framebufferPixCount); } if (doMultithreadedClear) @@ -2327,11 +2336,12 @@ Render3DError SoftRasterizerRenderer::SetFramebufferSize(size_t w, size_t h) } return RENDER3DERROR_NOERR; -} - -#ifdef ENABLE_SSE2 +} -SoftRasterizerRenderer_SSE2::SoftRasterizerRenderer_SSE2() +#if defined(ENABLE_AVX2) || defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) + +template +SoftRasterizer_SIMD::SoftRasterizer_SIMD() { if (_threadCount == 0) { @@ -2341,7 +2351,7 @@ SoftRasterizerRenderer_SSE2::SoftRasterizerRenderer_SSE2() } else { - const size_t pixelsPerThread = ((_framebufferSIMDPixCount / 16) / _threadCount) * 16; + const size_t pixelsPerThread = ((_framebufferSIMDPixCount / SIMDBYTES) / _threadCount) * SIMDBYTES; for (size_t i = 0; i < _threadCount; i++) { @@ -2352,37 +2362,10 @@ SoftRasterizerRenderer_SSE2::SoftRasterizerRenderer_SSE2() } } -void SoftRasterizerRenderer_SSE2::ClearUsingValuesLoop(const size_t startPixel, const size_t endPixel) +template +Render3DError SoftRasterizer_SIMD::ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) { - for (size_t i = startPixel; i < endPixel; i+=16) - { - _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 0), this->_clearColor_v128u32); - _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 4), this->_clearColor_v128u32); - _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 8), this->_clearColor_v128u32); - _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 12), this->_clearColor_v128u32); - - _mm_stream_si128((__m128i *)(this->_framebufferAttributes->depth + i + 0), this->_clearDepth_v128u32); - _mm_stream_si128((__m128i *)(this->_framebufferAttributes->depth + i + 4), this->_clearDepth_v128u32); - _mm_stream_si128((__m128i *)(this->_framebufferAttributes->depth + i + 8), this->_clearDepth_v128u32); - _mm_stream_si128((__m128i *)(this->_framebufferAttributes->depth + i + 12), this->_clearDepth_v128u32); - - _mm_stream_si128((__m128i *)(this->_framebufferAttributes->opaquePolyID + i), this->_clearAttrOpaquePolyID_v128u8); - _mm_stream_si128((__m128i *)(this->_framebufferAttributes->translucentPolyID + i), this->_clearAttrTranslucentPolyID_v128u8); - _mm_stream_si128((__m128i *)(this->_framebufferAttributes->stencil + i), this->_clearAttrStencil_v128u8); - _mm_stream_si128((__m128i *)(this->_framebufferAttributes->isFogged + i), this->_clearAttrIsFogged_v128u8); - _mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), this->_clearAttrIsTranslucentPoly_v128u8); - } -} - -Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) -{ - this->_clearColor_v128u32 = _mm_set1_epi32(clearColor6665.color); - this->_clearDepth_v128u32 = _mm_set1_epi32(clearAttributes.depth); - this->_clearAttrOpaquePolyID_v128u8 = _mm_set1_epi8(clearAttributes.opaquePolyID); - this->_clearAttrTranslucentPolyID_v128u8 = _mm_set1_epi8(clearAttributes.translucentPolyID); - this->_clearAttrStencil_v128u8 = _mm_set1_epi8(clearAttributes.stencil); - this->_clearAttrIsFogged_v128u8 = _mm_set1_epi8(clearAttributes.isFogged); - this->_clearAttrIsTranslucentPoly_v128u8 = _mm_set1_epi8(clearAttributes.isTranslucentPoly); + this->LoadClearValues(clearColor6665, clearAttributes); const bool doMultithreadedClear = (this->_threadCount > 0); @@ -2395,16 +2378,14 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor } else { - this->ClearUsingValuesLoop(0, this->_framebufferSIMDPixCount); - } - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (size_t i = this->_framebufferSIMDPixCount; i < this->_framebufferPixCount; i++) - { - this->_framebufferColor[i] = clearColor6665; - this->_framebufferAttributes->SetAtIndex(i, clearAttributes); + this->ClearUsingValues_Execute(0, this->_framebufferSIMDPixCount); + } + +#pragma LOOPVECTORIZE_DISABLE + for (size_t i = this->_framebufferSIMDPixCount; i < this->_framebufferPixCount; i++) + { + this->_framebufferColor[i] = clearColor6665; + this->_framebufferAttributes->SetAtIndex(i, clearAttributes); } if (doMultithreadedClear) @@ -2413,14 +2394,15 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor { this->_task[threadIndex].finish(); } - } - - return RENDER3DERROR_NOERR; + } + + return RENDER3DERROR_NOERR; } -Render3DError SoftRasterizerRenderer_SSE2::SetFramebufferSize(size_t w, size_t h) +template +Render3DError SoftRasterizer_SIMD::SetFramebufferSize(size_t w, size_t h) { - Render3DError error = Render3D_SSE2::SetFramebufferSize(w, h); + Render3DError error = Render3D_SIMD::SetFramebufferSize(w, h); if (error != RENDER3DERROR_NOERR) { return RENDER3DERROR_NOERR; @@ -2444,7 +2426,7 @@ Render3DError SoftRasterizerRenderer_SSE2::SetFramebufferSize(size_t w, size_t h } else { - const size_t pixelsPerThread = ((pixCount / 16) / this->_threadCount) * 16; + const size_t pixelsPerThread = ((pixCount / SIMDBYTES) / this->_threadCount) * SIMDBYTES; this->_customLinesPerThread = h / this->_threadCount; this->_customPixelsPerThread = pixelsPerThread / this->_threadCount; @@ -2460,6 +2442,113 @@ Render3DError SoftRasterizerRenderer_SSE2::SetFramebufferSize(size_t w, size_t h } return RENDER3DERROR_NOERR; +} + +#endif + +#if defined(ENABLE_AVX2) + +void SoftRasterizerRenderer_AVX2::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) +{ + this->_clearColor_v256u32 = _mm256_set1_epi32(clearColor6665.color); + this->_clearDepth_v256u32 = _mm256_set1_epi32(clearAttributes.depth); + this->_clearAttrOpaquePolyID_v256u8 = _mm256_set1_epi8(clearAttributes.opaquePolyID); + this->_clearAttrTranslucentPolyID_v256u8 = _mm256_set1_epi8(clearAttributes.translucentPolyID); + this->_clearAttrStencil_v256u8 = _mm256_set1_epi8(clearAttributes.stencil); + this->_clearAttrIsFogged_v256u8 = _mm256_set1_epi8(clearAttributes.isFogged); + this->_clearAttrIsTranslucentPoly_v256u8 = _mm256_set1_epi8(clearAttributes.isTranslucentPoly); +} + +void SoftRasterizerRenderer_AVX2::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) +{ + for (size_t i = startPixel; i < endPixel; i+=32) + { + _mm256_stream_si256((v256u32 *)(this->_framebufferColor + i + 0), this->_clearColor_v256u32); + _mm256_stream_si256((v256u32 *)(this->_framebufferColor + i + 8), this->_clearColor_v256u32); + _mm256_stream_si256((v256u32 *)(this->_framebufferColor + i + 16), this->_clearColor_v256u32); + _mm256_stream_si256((v256u32 *)(this->_framebufferColor + i + 24), this->_clearColor_v256u32); + + _mm256_stream_si256((v256u32 *)(this->_framebufferAttributes->depth + i + 0), this->_clearDepth_v256u32); + _mm256_stream_si256((v256u32 *)(this->_framebufferAttributes->depth + i + 8), this->_clearDepth_v256u32); + _mm256_stream_si256((v256u32 *)(this->_framebufferAttributes->depth + i + 16), this->_clearDepth_v256u32); + _mm256_stream_si256((v256u32 *)(this->_framebufferAttributes->depth + i + 24), this->_clearDepth_v256u32); + + _mm256_stream_si256((v256u8 *)(this->_framebufferAttributes->opaquePolyID + i), this->_clearAttrOpaquePolyID_v256u8); + _mm256_stream_si256((v256u8 *)(this->_framebufferAttributes->translucentPolyID + i), this->_clearAttrTranslucentPolyID_v256u8); + _mm256_stream_si256((v256u8 *)(this->_framebufferAttributes->stencil + i), this->_clearAttrStencil_v256u8); + _mm256_stream_si256((v256u8 *)(this->_framebufferAttributes->isFogged + i), this->_clearAttrIsFogged_v256u8); + _mm256_stream_si256((v256u8 *)(this->_framebufferAttributes->isTranslucentPoly + i), this->_clearAttrIsTranslucentPoly_v256u8); + } +} + +#elif defined(ENABLE_SSE2) + +void SoftRasterizerRenderer_SSE2::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) +{ + this->_clearColor_v128u32 = _mm_set1_epi32(clearColor6665.color); + this->_clearDepth_v128u32 = _mm_set1_epi32(clearAttributes.depth); + this->_clearAttrOpaquePolyID_v128u8 = _mm_set1_epi8(clearAttributes.opaquePolyID); + this->_clearAttrTranslucentPolyID_v128u8 = _mm_set1_epi8(clearAttributes.translucentPolyID); + this->_clearAttrStencil_v128u8 = _mm_set1_epi8(clearAttributes.stencil); + this->_clearAttrIsFogged_v128u8 = _mm_set1_epi8(clearAttributes.isFogged); + this->_clearAttrIsTranslucentPoly_v128u8 = _mm_set1_epi8(clearAttributes.isTranslucentPoly); +} + +void SoftRasterizerRenderer_SSE2::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) +{ + for (size_t i = startPixel; i < endPixel; i+=16) + { + _mm_stream_si128((v128u32 *)(this->_framebufferColor + i + 0), this->_clearColor_v128u32); + _mm_stream_si128((v128u32 *)(this->_framebufferColor + i + 4), this->_clearColor_v128u32); + _mm_stream_si128((v128u32 *)(this->_framebufferColor + i + 8), this->_clearColor_v128u32); + _mm_stream_si128((v128u32 *)(this->_framebufferColor + i + 12), this->_clearColor_v128u32); + + _mm_stream_si128((v128u32 *)(this->_framebufferAttributes->depth + i + 0), this->_clearDepth_v128u32); + _mm_stream_si128((v128u32 *)(this->_framebufferAttributes->depth + i + 4), this->_clearDepth_v128u32); + _mm_stream_si128((v128u32 *)(this->_framebufferAttributes->depth + i + 8), this->_clearDepth_v128u32); + _mm_stream_si128((v128u32 *)(this->_framebufferAttributes->depth + i + 12), this->_clearDepth_v128u32); + + _mm_stream_si128((v128u8 *)(this->_framebufferAttributes->opaquePolyID + i), this->_clearAttrOpaquePolyID_v128u8); + _mm_stream_si128((v128u8 *)(this->_framebufferAttributes->translucentPolyID + i), this->_clearAttrTranslucentPolyID_v128u8); + _mm_stream_si128((v128u8 *)(this->_framebufferAttributes->stencil + i), this->_clearAttrStencil_v128u8); + _mm_stream_si128((v128u8 *)(this->_framebufferAttributes->isFogged + i), this->_clearAttrIsFogged_v128u8); + _mm_stream_si128((v128u8 *)(this->_framebufferAttributes->isTranslucentPoly + i), this->_clearAttrIsTranslucentPoly_v128u8); + } +} + +#elif defined(ENABLE_ALTIVEC) + +void SoftRasterizerRenderer_Altivec::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) +{ + this->_clearColor_v128u32 = vec_splat_u32(clearColor6665.color); + this->_clearDepth_v128u32 = vec_splat_u32(clearAttributes.depth); + this->_clearAttrOpaquePolyID_v128u8 = vec_splat_u8(clearAttributes.opaquePolyID); + this->_clearAttrTranslucentPolyID_v128u8 = vec_splat_u8(clearAttributes.translucentPolyID); + this->_clearAttrStencil_v128u8 = vec_splat_u8(clearAttributes.stencil); + this->_clearAttrIsFogged_v128u8 = vec_splat_u8(clearAttributes.isFogged); + this->_clearAttrIsTranslucentPoly_v128u8 = vec_splat_u8(clearAttributes.isTranslucentPoly); +} + +void SoftRasterizerRenderer_Altivec::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) +{ + for (size_t i = startPixel; i < endPixel; i+=16) + { + vec_st(this->_clearColor_v128u32, i + 0, this->_framebufferColor); + vec_st(this->_clearColor_v128u32, i + 16, this->_framebufferColor); + vec_st(this->_clearColor_v128u32, i + 32, this->_framebufferColor); + vec_st(this->_clearColor_v128u32, i + 48, this->_framebufferColor); + + vec_st(this->_clearDepth_v128u32, i + 0, this->_framebufferAttributes->depth); + vec_st(this->_clearDepth_v128u32, i + 16, this->_framebufferAttributes->depth); + vec_st(this->_clearDepth_v128u32, i + 32, this->_framebufferAttributes->depth); + vec_st(this->_clearDepth_v128u32, i + 48, this->_framebufferAttributes->depth); + + vec_st(this->_clearAttrOpaquePolyID_v128u8, i, this->_framebufferAttributes->opaquePolyID); + vec_st(this->_clearAttrTranslucentPolyID_v128u8, i, this->_framebufferAttributes->translucentPolyID); + vec_st(this->_clearAttrStencil_v128u8, i, this->_framebufferAttributes->stencil); + vec_st(this->_clearAttrIsFogged_v128u8, i, this->_framebufferAttributes->isFogged); + vec_st(this->_clearAttrIsTranslucentPoly_v128u8, i, this->_framebufferAttributes->isTranslucentPoly); + } } -#endif // ENABLE_SSE2 +#endif diff --git a/desmume/src/rasterize.h b/desmume/src/rasterize.h index 733202df9..73014cb56 100644 --- a/desmume/src/rasterize.h +++ b/desmume/src/rasterize.h @@ -125,8 +125,12 @@ public: template FORCEINLINE void Render(); }; -#if defined(ENABLE_SSE2) -class SoftRasterizerRenderer : public Render3D_SSE2 +#if defined(ENABLE_AVX2) +class SoftRasterizerRenderer : public Render3D_AVX2 +#elif defined(ENABLE_SSE2) +class SoftRasterizerRenderer : public Render3D_SSE2 +#elif defined(ENABLE_ALTIVEC) +class SoftRasterizerRenderer : public Render3D_Altivec #else class SoftRasterizerRenderer : public Render3D #endif @@ -205,15 +209,23 @@ public: virtual Render3DError Render(const GFX3D &engine); virtual Render3DError RenderFinish(); virtual Render3DError RenderFlush(bool willFlushBuffer32, bool willFlushBuffer16); - virtual void ClearUsingValuesLoop(const size_t startPixel, const size_t endPixel); + virtual void ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel); virtual Render3DError SetFramebufferSize(size_t w, size_t h); -}; - -#ifdef ENABLE_SSE2 - -class SoftRasterizerRenderer_SSE2 : public SoftRasterizerRenderer +}; + +template +class SoftRasterizer_SIMD : public SoftRasterizerRenderer { protected: +#if defined(ENABLE_AVX2) + v256u32 _clearColor_v256u32; + v256u32 _clearDepth_v256u32; + v256u8 _clearAttrOpaquePolyID_v256u8; + v256u8 _clearAttrTranslucentPolyID_v256u8; + v256u8 _clearAttrStencil_v256u8; + v256u8 _clearAttrIsFogged_v256u8; + v256u8 _clearAttrIsTranslucentPoly_v256u8; +#elif defined(ENABLE_SSE2) || defined(ENABLE_ALTIVEC) v128u32 _clearColor_v128u32; v128u32 _clearDepth_v128u32; v128u8 _clearAttrOpaquePolyID_v128u8; @@ -221,14 +233,45 @@ protected: v128u8 _clearAttrStencil_v128u8; v128u8 _clearAttrIsFogged_v128u8; v128u8 _clearAttrIsTranslucentPoly_v128u8; - - virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); +#endif + + virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) = 0; + virtual Render3DError ClearUsingValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); public: - SoftRasterizerRenderer_SSE2(); + SoftRasterizer_SIMD(); - virtual void ClearUsingValuesLoop(const size_t startPixel, const size_t endPixel); virtual Render3DError SetFramebufferSize(size_t w, size_t h); +}; + +#if defined(ENABLE_AVX2) +class SoftRasterizerRenderer_AVX2 : public SoftRasterizer_SIMD<32> +{ +protected: + virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); + +public: + virtual void ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel); +}; + +#elif defined(ENABLE_SSE2) +class SoftRasterizerRenderer_SSE2 : public SoftRasterizer_SIMD<16> +{ +protected: + virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); + +public: + virtual void ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel); +}; + +#elif defined(ENABLE_ALTIVEC) +class SoftRasterizerRenderer_Altivec : public SoftRasterizer_SIMD<16> +{ +protected: + virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); + +public: + virtual void ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel); }; #endif diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 7795dc481..f4231f769 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -113,43 +113,6 @@ void FragmentAttributesBuffer::SetAtIndex(const size_t index, const FragmentAttr this->isTranslucentPoly[index] = attr.isTranslucentPoly; } -void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr) -{ - size_t i = 0; - -#ifdef ENABLE_SSE2 - const __m128i attrDepth_vec128 = _mm_set1_epi32(attr.depth); - const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(attr.opaquePolyID); - const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(attr.translucentPolyID); - const __m128i attrStencil_vec128 = _mm_set1_epi8(attr.stencil); - const __m128i attrIsFogged_vec128 = _mm_set1_epi8(attr.isFogged); - const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(attr.isTranslucentPoly); - - const size_t sseCount = count - (count % 16); - for (; i < sseCount; i += 16) - { - _mm_stream_si128((__m128i *)(this->depth + 0), attrDepth_vec128); - _mm_stream_si128((__m128i *)(this->depth + 4), attrDepth_vec128); - _mm_stream_si128((__m128i *)(this->depth + 8), attrDepth_vec128); - _mm_stream_si128((__m128i *)(this->depth + 12), attrDepth_vec128); - - _mm_stream_si128((__m128i *)this->opaquePolyID, attrOpaquePolyID_vec128); - _mm_stream_si128((__m128i *)this->translucentPolyID, attrTranslucentPolyID_vec128); - _mm_stream_si128((__m128i *)this->stencil, attrStencil_vec128); - _mm_stream_si128((__m128i *)this->isFogged, attrIsFogged_vec128); - _mm_stream_si128((__m128i *)this->isTranslucentPoly, attrIsTranslucentPoly_vec128); - } -#endif - -#ifdef ENABLE_SSE2 -#pragma LOOPVECTORIZE_DISABLE -#endif - for (; i < count; i++) - { - this->SetAtIndex(i, attr); - } -} - Render3DTexture::Render3DTexture(TEXIMAGE_PARAM texAttributes, u32 palAttributes) : TextureStore(texAttributes, palAttributes) { _isSamplingEnabled = true; @@ -688,12 +651,14 @@ Render3DError Render3D::VramReconfigureSignal() return RENDER3DERROR_NOERR; } -Render3D_SIMD128::Render3D_SIMD128() +template +Render3D_SIMD::Render3D_SIMD() { - _framebufferSIMDPixCount = _framebufferPixCount - (_framebufferPixCount % 16); + _framebufferSIMDPixCount = (SIMDBYTES > 0) ? _framebufferPixCount - (_framebufferPixCount % SIMDBYTES) : _framebufferPixCount; } -Render3DError Render3D_SIMD128::SetFramebufferSize(size_t w, size_t h) +template +Render3DError Render3D_SIMD::SetFramebufferSize(size_t w, size_t h) { Render3DError error = this->Render3D::SetFramebufferSize(w, h); if (error != RENDER3DERROR_NOERR) @@ -701,32 +666,18 @@ Render3DError Render3D_SIMD128::SetFramebufferSize(size_t w, size_t h) return RENDER3DERROR_NOERR; } - this->_framebufferSIMDPixCount = this->_framebufferPixCount - (this->_framebufferPixCount % 16); + this->_framebufferSIMDPixCount = (SIMDBYTES > 0) ? this->_framebufferPixCount - (this->_framebufferPixCount % SIMDBYTES) : _framebufferPixCount; return error; } -Render3D_SIMD256::Render3D_SIMD256() -{ - _framebufferSIMDPixCount = _framebufferPixCount - (_framebufferPixCount % 32); -} - -Render3DError Render3D_SIMD256::SetFramebufferSize(size_t w, size_t h) -{ - Render3DError error = this->Render3D::SetFramebufferSize(w, h); - if (error != RENDER3DERROR_NOERR) - { - return RENDER3DERROR_NOERR; - } - - this->_framebufferSIMDPixCount = this->_framebufferPixCount - (this->_framebufferPixCount % 32); - - return error; -} - -#ifdef ENABLE_SSE2 +#if defined(ENABLE_AVX2) || defined(ENABLE_SSE2) +#if defined(ENABLE_AVX2) +Render3DError Render3D_AVX2::ClearFramebuffer(const GFX3D_State &renderState) +#elif defined(ENABLE_SSE2) Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) +#endif { Render3DError error = RENDER3DERROR_NOERR; @@ -910,4 +861,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) return error; } -#endif // ENABLE_SSE2 +#endif // defined(ENABLE_AVX2) || defined(ENABLE_SSE2) + +template Render3D_SIMD<0>::Render3D_SIMD(); +template Render3D_SIMD<16>::Render3D_SIMD(); +template Render3D_SIMD<32>::Render3D_SIMD(); diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h index e0f47a705..bd9e6031c 100644 --- a/desmume/src/render3D.h +++ b/desmume/src/render3D.h @@ -96,7 +96,6 @@ struct FragmentAttributesBuffer ~FragmentAttributesBuffer(); void SetAtIndex(const size_t index, const FragmentAttributes &attr); - void SetAll(const FragmentAttributes &attr); }; struct Render3DDeviceInfo @@ -248,29 +247,35 @@ public: Render3DTexture* GetTextureByPolygonRenderIndex(size_t polyRenderIndex) const; }; -class Render3D_SIMD128 : public Render3D +template +class Render3D_SIMD : public Render3D { public: - Render3D_SIMD128(); - - virtual Render3DError SetFramebufferSize(size_t w, size_t h); -}; - -class Render3D_SIMD256 : public Render3D -{ -public: - Render3D_SIMD256(); + Render3D_SIMD(); virtual Render3DError SetFramebufferSize(size_t w, size_t h); }; + +#if defined(ENABLE_AVX2) + +class Render3D_AVX2 : public Render3D_SIMD<32> +{ +public: + virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState); +}; + +#elif defined(ENABLE_SSE2) -#ifdef ENABLE_SSE2 - -class Render3D_SSE2 : public Render3D_SIMD128 +class Render3D_SSE2 : public Render3D_SIMD<16> { public: virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState); -}; +}; + +#elif defined(ENABLE_ALTIVEC) + +class Render3D_Altivec : public Render3D_SIMD<16> +{}; #endif