From c49a480d51b4a44c6873734a1cc8905f6685933f Mon Sep 17 00:00:00 2001 From: rogerman Date: Sat, 4 Sep 2021 19:31:55 -0700 Subject: [PATCH] SoftRasterizer: Due to how the class inheritance works, SoftRasterizer_AVX actually requires AVX2 instead of just AVX. And in reality, AVX2 is just more practical than AVX for what we're doing here. - Also do some minor refactoring of Render3D. --- desmume/src/OGLRender.h | 4 +- desmume/src/rasterize.cpp | 14 +- desmume/src/rasterize.h | 12 +- desmume/src/render3D.cpp | 338 +++++++++++--------------------------- desmume/src/render3D.h | 15 +- 5 files changed, 116 insertions(+), 267 deletions(-) diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index 5ef0bcf90..c9ca4794f 100755 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -679,8 +679,8 @@ public: void SetUpscalingBuffer(void *upscaleBuffer); }; -#if defined(ENABLE_AVX) -class OpenGLRenderer : public Render3D_AVX +#if defined(ENABLE_AVX2) +class OpenGLRenderer : public Render3D_AVX2 #elif defined(ENABLE_SSE2) class OpenGLRenderer : public Render3D_SSE2 #elif defined(ENABLE_ALTIVEC) diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 5c4609ec8..c7c4b5bb4 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1419,8 +1419,8 @@ static void* SoftRasterizer_RunClearUsingValues(void *arg) static Render3D* SoftRasterizerRendererCreate() { -#if defined(ENABLE_AVX) - return new SoftRasterizerRenderer_AVX; +#if defined(ENABLE_AVX2) + return new SoftRasterizerRenderer_AVX2; #elif defined(ENABLE_SSE2) return new SoftRasterizerRenderer_SSE2; #elif defined(ENABLE_ALTIVEC) @@ -1434,8 +1434,8 @@ static void SoftRasterizerRendererDestroy() { if (CurrentRenderer != BaseRenderer) { -#if defined(ENABLE_AVX) - SoftRasterizerRenderer_AVX *oldRenderer = (SoftRasterizerRenderer_AVX *)CurrentRenderer; +#if defined(ENABLE_AVX2) + SoftRasterizerRenderer_AVX2 *oldRenderer = (SoftRasterizerRenderer_AVX2 *)CurrentRenderer; #elif defined(ENABLE_SSE2) SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer; #elif defined(ENABLE_ALTIVEC) @@ -2606,9 +2606,9 @@ Render3DError SoftRasterizer_SIMD::SetFramebufferSize(size_t w, size_ #endif -#if defined(ENABLE_AVX) +#if defined(ENABLE_AVX2) -void SoftRasterizerRenderer_AVX::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) +void SoftRasterizerRenderer_AVX2::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) { this->_clearColor_v256u32 = _mm256_set1_epi32(clearColor6665.color); this->_clearDepth_v256u32 = _mm256_set1_epi32(clearAttributes.depth); @@ -2620,7 +2620,7 @@ void SoftRasterizerRenderer_AVX::LoadClearValues(const FragmentColor &clearColor this->_clearAttrPolyFacing_v256u8 = _mm256_set1_epi8(clearAttributes.polyFacing); } -void SoftRasterizerRenderer_AVX::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) +void SoftRasterizerRenderer_AVX2::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) { for (size_t i = startPixel; i < endPixel; i+=sizeof(v256u8)) { diff --git a/desmume/src/rasterize.h b/desmume/src/rasterize.h index 419793352..85e0976dc 100644 --- a/desmume/src/rasterize.h +++ b/desmume/src/rasterize.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2009-2019 DeSmuME team + Copyright (C) 2009-2021 DeSmuME team This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -134,8 +134,8 @@ public: template FORCEINLINE void Render(); }; -#if defined(ENABLE_AVX) -class SoftRasterizerRenderer : public Render3D_AVX +#if defined(ENABLE_AVX2) +class SoftRasterizerRenderer : public Render3D_AVX2 #elif defined(ENABLE_SSE2) class SoftRasterizerRenderer : public Render3D_SSE2 #elif defined(ENABLE_ALTIVEC) @@ -218,7 +218,7 @@ template class SoftRasterizer_SIMD : public SoftRasterizerRenderer { protected: -#if defined(ENABLE_AVX) +#if defined(ENABLE_AVX2) v256u32 _clearColor_v256u32; v256u32 _clearDepth_v256u32; v256u8 _clearAttrOpaquePolyID_v256u8; @@ -247,8 +247,8 @@ public: virtual Render3DError SetFramebufferSize(size_t w, size_t h); }; -#if defined(ENABLE_AVX) -class SoftRasterizerRenderer_AVX : public SoftRasterizer_SIMD<32> +#if defined(ENABLE_AVX2) +class SoftRasterizerRenderer_AVX2 : public SoftRasterizer_SIMD<32> { protected: virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 1bd08bd99..890f62203 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -503,6 +503,16 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram return RENDER3DERROR_NOERR; } +void Render3D::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog) +{ + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++) + { + outColor16[i] = inColor16[i]; + outDepth24[i] = DS_DEPTH15TO24(inDepth16[i]); + outFog[i] = BIT15(inDepth16[i]); + } +} + template void Render3D::_ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog) @@ -585,12 +595,7 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState) if (xScroll == 0 && yScroll == 0) { - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++) - { - this->clearImageColor16Buffer[i] = clearColorBuffer[i]; - this->clearImageDepthBuffer[i] = DS_DEPTH15TO24(clearDepthBuffer[i]); - this->clearImageFogBuffer[i] = BIT15(clearDepthBuffer[i]); - } + this->_ClearImageBaseLoop(clearColorBuffer, clearDepthBuffer, this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer); } else { @@ -773,252 +778,95 @@ Render3DError Render3D_SIMD::SetFramebufferSize(size_t w, size_t h) return error; } -#if defined(ENABLE_AVX) || defined(ENABLE_SSE2) - -#if defined(ENABLE_AVX) -Render3DError Render3D_AVX::ClearFramebuffer(const GFX3D_State &renderState) -#elif defined(ENABLE_SSE2) -Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) -#endif +#if defined(ENABLE_AVX2) +void Render3D_AVX2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog) { - Render3DError error = RENDER3DERROR_NOERR; + const __m256i calcDepthConstants = _mm256_set1_epi32(0x01FF0200); - if (renderState.enableClearImage) + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v256u16)) { - //the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers) - //uses the scroll registers in the main game engine - const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2]; - const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3]; - const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET - const u8 xScroll = scrollBits & 0xFF; - const u8 yScroll = (scrollBits >> 8) & 0xFF; + // Copy the colors to the color buffer. + _mm256_store_si256( (__m256i *)(outColor16 + i) + 0, _mm256_load_si256((__m256i *)(inColor16 + i) + 0) ); + _mm256_store_si256( (__m256i *)(outColor16 + i) + 1, _mm256_load_si256((__m256i *)(inColor16 + i) + 1) ); -#ifdef ENABLE_AVX2 - const __m256i calcDepthConstants = _mm256_set1_epi32(0x01FF0200); -#else - const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200); -#endif - if (xScroll == 0 && yScroll == 0) - { -#ifdef ENABLE_AVX2 - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v256u16)) - { - // Copy the colors to the color buffer. - _mm256_store_si256( (__m256i *)(this->clearImageColor16Buffer + i) + 0, _mm256_load_si256((__m256i *)(clearColorBuffer + i) + 0) ); - _mm256_store_si256( (__m256i *)(this->clearImageColor16Buffer + i) + 1, _mm256_load_si256((__m256i *)(clearColorBuffer + i) + 1) ); - - // Write the depth values to the depth buffer using the following formula from GBATEK. - // 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane - // D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF); - // - // For now, let's forget GBATEK (which could be wrong) and try using a simpified formula: - // D24 = (D15 * 0x0200) + 0x01FF; - const __m256i clearDepthLo = _mm256_load_si256((__m256i *)(clearDepthBuffer + i) + 0); - const __m256i clearDepthHi = _mm256_load_si256((__m256i *)(clearDepthBuffer + i) + 1); - - const __m256i clearDepthValueLo = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthLo, _mm256_set1_epi16(0x7FFF)), 0xD8 ); - const __m256i clearDepthValueHi = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthHi, _mm256_set1_epi16(0x7FFF)), 0xD8 ); - - __m256i calcDepth0 = _mm256_unpacklo_epi16(clearDepthValueLo, _mm256_set1_epi16(1)); - __m256i calcDepth1 = _mm256_unpackhi_epi16(clearDepthValueLo, _mm256_set1_epi16(1)); - __m256i calcDepth2 = _mm256_unpacklo_epi16(clearDepthValueHi, _mm256_set1_epi16(1)); - __m256i calcDepth3 = _mm256_unpackhi_epi16(clearDepthValueHi, _mm256_set1_epi16(1)); - - calcDepth0 = _mm256_madd_epi16(calcDepth0, calcDepthConstants); - calcDepth1 = _mm256_madd_epi16(calcDepth1, calcDepthConstants); - calcDepth2 = _mm256_madd_epi16(calcDepth2, calcDepthConstants); - calcDepth3 = _mm256_madd_epi16(calcDepth3, calcDepthConstants); - - _mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 0, calcDepth0); - _mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 1, calcDepth1); - _mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 2, calcDepth2); - _mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 3, calcDepth3); - - // Write the fog flags to the fog flag buffer. - const __m256i clearFogLo = _mm256_srli_epi16(clearDepthLo, 15); - const __m256i clearFogHi = _mm256_srli_epi16(clearDepthHi, 15); - _mm256_store_si256( (__m256i *)(this->clearImageFogBuffer + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) ); - } -#else - for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16)) - { - // Copy the colors to the color buffer. - _mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i) + 0, _mm_load_si128((__m128i *)(clearColorBuffer + i) + 0) ); - _mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i) + 1, _mm_load_si128((__m128i *)(clearColorBuffer + i) + 1) ); - - // Write the depth values to the depth buffer using the following formula from GBATEK. - // 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane - // D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF); - // - // For now, let's forget GBATEK (which could be wrong) and try using a simpified formula: - // D24 = (D15 * 0x0200) + 0x01FF; - const __m128i clearDepthLo = _mm_load_si128((__m128i *)(clearDepthBuffer + i) + 0); - const __m128i clearDepthHi = _mm_load_si128((__m128i *)(clearDepthBuffer + i) + 1); - - const __m128i clearDepthValueLo = _mm_and_si128(clearDepthLo, _mm_set1_epi16(0x7FFF)); - const __m128i clearDepthValueHi = _mm_and_si128(clearDepthHi, _mm_set1_epi16(0x7FFF)); - - __m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValueLo, _mm_set1_epi16(1)); - __m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValueLo, _mm_set1_epi16(1)); - __m128i calcDepth2 = _mm_unpacklo_epi16(clearDepthValueHi, _mm_set1_epi16(1)); - __m128i calcDepth3 = _mm_unpackhi_epi16(clearDepthValueHi, _mm_set1_epi16(1)); - - calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants); - calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants); - calcDepth2 = _mm_madd_epi16(calcDepth2, calcDepthConstants); - calcDepth3 = _mm_madd_epi16(calcDepth3, calcDepthConstants); - - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 0, calcDepth0); - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 1, calcDepth1); - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 2, calcDepth2); - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 3, calcDepth3); - - // Write the fog flags to the fog flag buffer. - const __m128i clearFogLo = _mm_srli_epi16(clearDepthLo, 15); - const __m128i clearFogHi = _mm_srli_epi16(clearDepthHi, 15); - _mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packs_epi16(clearFogLo, clearFogHi)); - } -#endif - } - else - { - // FIXME: Fix SSE2 support for scrolled clear images. - // The depth-related code below doesn't actually work, and I don't know why - // this is, so just use the scalar version for now. - // - rogerman, 2018/09/19 - /* - const size_t shiftCount = xScroll & 0x07; - - for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++) - { - const size_t y = ((iy + yScroll) & 0xFF) << 8; - - for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex += 8, ix += 8) - { - const size_t x = (ix + xScroll) & 0xFF; - - __m128i clearColor; - __m128i clearDepth_vec128; - - if (shiftCount == 0) - { - const size_t srcIndex = y | x; - - clearColor = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex)); - clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex)); - } - else - { - const size_t x1 = x & 0xF8; - const size_t x0 = (x1 == 0) ? (GPU_FRAMEBUFFER_NATIVE_WIDTH - 8) : x1 - 8; - const size_t srcIndex0 = y | x0; - const size_t srcIndex1 = y | x1; - - const __m128i clearColor0 = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex0)); - const __m128i clearColor1 = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex1)); - const __m128i clearDepth0 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex0)); - const __m128i clearDepth1 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex1)); - - switch (shiftCount) - { - case 1: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 14); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 14); - break; - - case 2: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 12); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 12); - break; - - case 3: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 10); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 10); - break; - - case 4: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 8); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 8); - break; - - case 5: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 6); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 6); - break; - - case 6: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 4); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 4); - break; - - case 7: - clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 2); - clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 2); - break; - - default: - clearColor = _mm_setzero_si128(); - clearDepth_vec128 = _mm_setzero_si128(); - break; - } - } - - const __m128i clearDepthValue = _mm_and_si128(clearDepth_vec128, _mm_set1_epi16(0x7FFF)); - const __m128i depthPlusOne = _mm_srli_epi16( _mm_adds_epu16(clearDepthValue, _mm_set1_epi16(1)), 15); - const __m128i clearFog = _mm_srli_epi16(clearDepth_vec128, 15); - - __m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValue, depthPlusOne); - __m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValue, depthPlusOne); - calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants); - calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants); - - _mm_store_si128((__m128i *)(this->clearImageColor16Buffer + dstIndex), clearColor); - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + dstIndex + 0), calcDepth0); - _mm_store_si128((__m128i *)(this->clearImageDepthBuffer + dstIndex + 4), calcDepth1); - _mm_storel_epi64((__m128i *)(this->clearImageFogBuffer + dstIndex), _mm_packs_epi16(clearFog, _mm_setzero_si128())); - } - } - */ - const bool isClearColorBlank = (clearColorBuffer >= (u16 *)MMU.blank_memory); - const bool isClearDepthBlank = (clearDepthBuffer >= (u16 *)MMU.blank_memory); - - if (!isClearColorBlank && !isClearDepthBlank) - { - this->_ClearImageScrolledLoop(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, - this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer); - } - else if (isClearColorBlank) - { - this->_ClearImageScrolledLoop< true, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, - this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer); - } - else if (isClearDepthBlank) - { - this->_ClearImageScrolledLoop(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, - this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer); - } - else - { - this->_ClearImageScrolledLoop< true, true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer, - this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer); - } - } + // Write the depth values to the depth buffer using the following formula from GBATEK. + // 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane + // D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF); + // + // For now, let's forget GBATEK (which could be wrong) and try using a simpified formula: + // D24 = (D15 * 0x0200) + 0x01FF; + const __m256i clearDepthLo = _mm256_load_si256((__m256i *)(inDepth16 + i) + 0); + const __m256i clearDepthHi = _mm256_load_si256((__m256i *)(inDepth16 + i) + 1); - error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->_clearAttributes.opaquePolyID); - if (error != RENDER3DERROR_NOERR) - { - error = this->ClearUsingValues(this->_clearColor6665, this->_clearAttributes); - } + const __m256i clearDepthValueLo = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthLo, _mm256_set1_epi16(0x7FFF)), 0xD8 ); + const __m256i clearDepthValueHi = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthHi, _mm256_set1_epi16(0x7FFF)), 0xD8 ); + + __m256i calcDepth0 = _mm256_unpacklo_epi16(clearDepthValueLo, _mm256_set1_epi16(1)); + __m256i calcDepth1 = _mm256_unpackhi_epi16(clearDepthValueLo, _mm256_set1_epi16(1)); + __m256i calcDepth2 = _mm256_unpacklo_epi16(clearDepthValueHi, _mm256_set1_epi16(1)); + __m256i calcDepth3 = _mm256_unpackhi_epi16(clearDepthValueHi, _mm256_set1_epi16(1)); + + calcDepth0 = _mm256_madd_epi16(calcDepth0, calcDepthConstants); + calcDepth1 = _mm256_madd_epi16(calcDepth1, calcDepthConstants); + calcDepth2 = _mm256_madd_epi16(calcDepth2, calcDepthConstants); + calcDepth3 = _mm256_madd_epi16(calcDepth3, calcDepthConstants); + + _mm256_store_si256((__m256i *)(outDepth24 + i) + 0, calcDepth0); + _mm256_store_si256((__m256i *)(outDepth24 + i) + 1, calcDepth1); + _mm256_store_si256((__m256i *)(outDepth24 + i) + 2, calcDepth2); + _mm256_store_si256((__m256i *)(outDepth24 + i) + 3, calcDepth3); + + // Write the fog flags to the fog flag buffer. + const __m256i clearFogLo = _mm256_srli_epi16(clearDepthLo, 15); + const __m256i clearFogHi = _mm256_srli_epi16(clearDepthHi, 15); + _mm256_store_si256( (__m256i *)(outFog + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) ); } - else - { - error = this->ClearUsingValues(this->_clearColor6665, this->_clearAttributes); - } - - return error; } - -#endif // defined(ENABLE_AVX) || defined(ENABLE_SSE2) +#elif defined(ENABLE_SSE2) +void Render3D_SSE2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog) +{ + const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200); + + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16)) + { + // Copy the colors to the color buffer. + _mm_store_si128( (__m128i *)(outColor16 + i) + 0, _mm_load_si128((__m128i *)(inColor16 + i) + 0) ); + _mm_store_si128( (__m128i *)(outColor16 + i) + 1, _mm_load_si128((__m128i *)(inColor16 + i) + 1) ); + + // Write the depth values to the depth buffer using the following formula from GBATEK. + // 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane + // D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF); + // + // For now, let's forget GBATEK (which could be wrong) and try using a simpified formula: + // D24 = (D15 * 0x0200) + 0x01FF; + const __m128i clearDepthLo = _mm_load_si128((__m128i *)(inDepth16 + i) + 0); + const __m128i clearDepthHi = _mm_load_si128((__m128i *)(inDepth16 + i) + 1); + + const __m128i clearDepthValueLo = _mm_and_si128(clearDepthLo, _mm_set1_epi16(0x7FFF)); + const __m128i clearDepthValueHi = _mm_and_si128(clearDepthHi, _mm_set1_epi16(0x7FFF)); + + __m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValueLo, _mm_set1_epi16(1)); + __m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValueLo, _mm_set1_epi16(1)); + __m128i calcDepth2 = _mm_unpacklo_epi16(clearDepthValueHi, _mm_set1_epi16(1)); + __m128i calcDepth3 = _mm_unpackhi_epi16(clearDepthValueHi, _mm_set1_epi16(1)); + + calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants); + calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants); + calcDepth2 = _mm_madd_epi16(calcDepth2, calcDepthConstants); + calcDepth3 = _mm_madd_epi16(calcDepth3, calcDepthConstants); + + _mm_store_si128((__m128i *)(outDepth24 + i) + 0, calcDepth0); + _mm_store_si128((__m128i *)(outDepth24 + i) + 1, calcDepth1); + _mm_store_si128((__m128i *)(outDepth24 + i) + 2, calcDepth2); + _mm_store_si128((__m128i *)(outDepth24 + i) + 3, calcDepth3); + + // Write the fog flags to the fog flag buffer. + const __m128i clearFogLo = _mm_srli_epi16(clearDepthLo, 15); + const __m128i clearFogHi = _mm_srli_epi16(clearDepthHi, 15); + _mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packs_epi16(clearFogLo, clearFogHi)); + } +} +#endif template Render3D_SIMD<16>::Render3D_SIMD(); template Render3D_SIMD<32>::Render3D_SIMD(); diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h index 587affc22..d169da18c 100644 --- a/desmume/src/render3D.h +++ b/desmume/src/render3D.h @@ -192,7 +192,8 @@ protected: CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; CACHE_ALIGN u32 clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; - + + virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); template void _ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); @@ -281,20 +282,20 @@ public: virtual Render3DError SetFramebufferSize(size_t w, size_t h); }; -#if defined(ENABLE_AVX) +#if defined(ENABLE_AVX2) -class Render3D_AVX : public Render3D_SIMD<32> +class Render3D_AVX2 : public Render3D_SIMD<32> { -public: - virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState); +public: + virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); }; #elif defined(ENABLE_SSE2) class Render3D_SSE2 : public Render3D_SIMD<16> { -public: - virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState); +public: + virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); }; #elif defined(ENABLE_ALTIVEC)