diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index c9ca4794f..86fb084a8 100755 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -684,7 +684,7 @@ class OpenGLRenderer : public Render3D_AVX2 #elif defined(ENABLE_SSE2) class OpenGLRenderer : public Render3D_SSE2 #elif defined(ENABLE_ALTIVEC) -class OpenGLRenderer : public Render3D_Altivec +class OpenGLRenderer : public Render3D_AltiVec #else class OpenGLRenderer : public Render3D #endif diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index 161030004..f4604f84b 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -417,7 +417,7 @@ static void makeTables() // Is GBATEK actually correct here? Let's try using a simplified formula and see if it's // more accurate. - dsDepthExtend_15bit_to_24bit[i] = LE_TO_LOCAL_32( (i*0x0200) + 0x01FF ); + dsDepthExtend_15bit_to_24bit[i] = (i * 0x0200) + 0x01FF; } for (size_t i = 0; i < 65536; i++) diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index c7c4b5bb4..bde5e9484 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1424,7 +1424,7 @@ static Render3D* SoftRasterizerRendererCreate() #elif defined(ENABLE_SSE2) return new SoftRasterizerRenderer_SSE2; #elif defined(ENABLE_ALTIVEC) - return new SoftRasterizerRenderer_Altivec; + return new SoftRasterizerRenderer_AltiVec; #else return new SoftRasterizerRenderer; #endif @@ -1439,7 +1439,7 @@ static void SoftRasterizerRendererDestroy() #elif defined(ENABLE_SSE2) SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer; #elif defined(ENABLE_ALTIVEC) - SoftRasterizerRenderer_Altivec *oldRenderer = (SoftRasterizerRenderer_Altivec *)CurrentRenderer; + SoftRasterizerRenderer_AltiVec *oldRenderer = (SoftRasterizerRenderer_AltiVec *)CurrentRenderer; #else SoftRasterizerRenderer *oldRenderer = (SoftRasterizerRenderer *)CurrentRenderer; #endif @@ -2682,7 +2682,7 @@ void 
SoftRasterizerRenderer_SSE2::ClearUsingValues_Execute(const size_t startPix #elif defined(ENABLE_ALTIVEC) -void SoftRasterizerRenderer_Altivec::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) +void SoftRasterizerRenderer_AltiVec::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes) { this->_clearColor_v128u32 = (v128u32){clearColor6665.color,clearColor6665.color,clearColor6665.color,clearColor6665.color}; this->_clearDepth_v128u32 = (v128u32){clearAttributes.depth,clearAttributes.depth,clearAttributes.depth,clearAttributes.depth}; @@ -2718,7 +2718,7 @@ void SoftRasterizerRenderer_Altivec::LoadClearValues(const FragmentColor &clearC clearAttributes.polyFacing,clearAttributes.polyFacing,clearAttributes.polyFacing,clearAttributes.polyFacing}; } -void SoftRasterizerRenderer_Altivec::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) +void SoftRasterizerRenderer_AltiVec::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel) { for (size_t i = startPixel; i < endPixel; i+=16) { diff --git a/desmume/src/rasterize.h b/desmume/src/rasterize.h index 85e0976dc..85c342331 100644 --- a/desmume/src/rasterize.h +++ b/desmume/src/rasterize.h @@ -139,7 +139,7 @@ class SoftRasterizerRenderer : public Render3D_AVX2 #elif defined(ENABLE_SSE2) class SoftRasterizerRenderer : public Render3D_SSE2 #elif defined(ENABLE_ALTIVEC) -class SoftRasterizerRenderer : public Render3D_Altivec +class SoftRasterizerRenderer : public Render3D_AltiVec #else class SoftRasterizerRenderer : public Render3D #endif @@ -268,7 +268,7 @@ public: }; #elif defined(ENABLE_ALTIVEC) -class SoftRasterizerRenderer_Altivec : public SoftRasterizer_SIMD<16> +class SoftRasterizerRenderer_AltiVec : public SoftRasterizer_SIMD<16> { protected: virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes); diff --git a/desmume/src/render3D.cpp 
b/desmume/src/render3D.cpp index 890f62203..447d2af33 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -507,9 +507,9 @@ void Render3D::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *_ { for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++) { - outColor16[i] = inColor16[i]; - outDepth24[i] = DS_DEPTH15TO24(inDepth16[i]); - outFog[i] = BIT15(inDepth16[i]); + outColor16[i] = LE_TO_LOCAL_16(inColor16[i]); + outDepth24[i] = DS_DEPTH15TO24( LE_TO_LOCAL_16(inDepth16[i]) ); + outFog[i] = BIT15( LE_TO_LOCAL_16(inDepth16[i]) ); } } @@ -558,7 +558,7 @@ void Render3D::_ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const // - Blazer Drive if (!ISCOLORBLANK) { - outColor16[dstIndex] = inColor16[srcIndex]; + outColor16[dstIndex] = LE_TO_LOCAL_16(inColor16[srcIndex]); } // Clear image depth buffer, where the first 15 bits are converted to @@ -571,8 +571,8 @@ void Render3D::_ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const // - The Chronicles of Narnia: The Lion, the Witch and the Wardrobe if (!ISDEPTHBLANK) { - outDepth24[dstIndex] = DS_DEPTH15TO24(inDepth16[srcIndex]); - outFog[dstIndex] = BIT15(inDepth16[srcIndex]); + outDepth24[dstIndex] = DS_DEPTH15TO24( LE_TO_LOCAL_16(inDepth16[srcIndex]) ); + outFog[dstIndex] = BIT15( LE_TO_LOCAL_16(inDepth16[srcIndex]) ); } } } @@ -863,7 +863,56 @@ void Render3D_SSE2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u // Write the fog flags to the fog flag buffer. 
const __m128i clearFogLo = _mm_srli_epi16(clearDepthLo, 15); const __m128i clearFogHi = _mm_srli_epi16(clearDepthHi, 15); - _mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packs_epi16(clearFogLo, clearFogHi)); + _mm_store_si128((__m128i *)(outFog + i), _mm_packs_epi16(clearFogLo, clearFogHi)); + } +} +#elif defined(ENABLE_ALTIVEC) +void Render3D_AltiVec::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog) +{ + const v128u16 calcDepthMul = ((v128u16){0x0200,0,0x0200,0,0x0200,0,0x0200,0}); + const v128u32 calcDepthAdd = ((v128u32){0x01FF,0x01FF,0x01FF,0x01FF}); + + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16)) + { + // Copy the colors to the color buffer. + v128u16 inColor16SwappedLo = vec_ld( 0, inColor16 + i); + v128u16 inColor16SwappedHi = vec_ld(16, inColor16 + i); + + inColor16SwappedLo = vec_perm(inColor16SwappedLo, inColor16SwappedLo, ((v128u8){1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14})); + inColor16SwappedHi = vec_perm(inColor16SwappedHi, inColor16SwappedHi, ((v128u8){1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14})); + + vec_st(inColor16SwappedLo, 0, outColor16 + i); + vec_st(inColor16SwappedHi, 16, outColor16 + i); + + // Write the depth values to the depth buffer using the following formula from GBATEK. 
+ // 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane + // D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF); + // + // For now, let's forget GBATEK (which could be wrong) and try using a simplified formula: + // D24 = (D15 * 0x0200) + 0x01FF; + v128u16 clearDepthLo = vec_ld( 0, inDepth16 + i); + v128u16 clearDepthHi = vec_ld(16, inDepth16 + i); + + clearDepthLo = vec_perm(clearDepthLo, clearDepthLo, ((v128u8){1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14})); + clearDepthHi = vec_perm(clearDepthHi, clearDepthHi, ((v128u8){1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14})); + + const v128u16 clearDepthValueLo = vec_and(clearDepthLo, ((v128u16){0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF})); + const v128u16 clearDepthValueHi = vec_and(clearDepthHi, ((v128u16){0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF})); + + const v128u16 calcDepth0 = vec_perm(((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}), clearDepthValueLo, ((v128u8){0x10,0x11,0,0, 0x12,0x13,0,0, 0x14,0x15,0,0, 0x16,0x17,0,0})); + const v128u16 calcDepth1 = vec_perm(((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}), clearDepthValueLo, ((v128u8){0x18,0x19,0,0, 0x1A,0x1B,0,0, 0x1C,0x1D,0,0, 0x1E,0x1F,0,0})); + const v128u16 calcDepth2 = vec_perm(((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}), clearDepthValueHi, ((v128u8){0x10,0x11,0,0, 0x12,0x13,0,0, 0x14,0x15,0,0, 0x16,0x17,0,0})); + const v128u16 calcDepth3 = vec_perm(((v128u8){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}), clearDepthValueHi, ((v128u8){0x18,0x19,0,0, 0x1A,0x1B,0,0, 0x1C,0x1D,0,0, 0x1E,0x1F,0,0})); + + vec_st( vec_msum(calcDepth0, calcDepthMul, calcDepthAdd), 0, outDepth24 + i); + vec_st( vec_msum(calcDepth1, calcDepthMul, calcDepthAdd), 16, outDepth24 + i); + vec_st( vec_msum(calcDepth2, calcDepthMul, calcDepthAdd), 32, outDepth24 + i); + vec_st( vec_msum(calcDepth3, calcDepthMul, calcDepthAdd), 48, outDepth24 + i); + + // Write the fog flags to the fog flag buffer.
+ const v128u16 clearFogLo = vec_sr(clearDepthLo, ((v128u16){15,15,15,15,15,15,15,15})); + const v128u16 clearFogHi = vec_sr(clearDepthHi, ((v128u16){15,15,15,15,15,15,15,15})); + vec_st( vec_pack(clearFogLo, clearFogHi), 0, outFog + i ); } } #endif diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h index d169da18c..149926cc7 100644 --- a/desmume/src/render3D.h +++ b/desmume/src/render3D.h @@ -300,8 +300,11 @@ public: #elif defined(ENABLE_ALTIVEC) -class Render3D_Altivec : public Render3D_SIMD<16> -{}; +class Render3D_AltiVec : public Render3D_SIMD<16> +{ +public: + virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog); +}; #endif