SoftRasterizer: Due to how the class inheritance works, SoftRasterizer_AVX actually requires AVX2 instead of just AVX. And in reality, AVX2 is just more practical than AVX for what we're doing here.
- Also do some minor refactoring of Render3D.
This commit is contained in:
parent
92fcd1313e
commit
c49a480d51
|
@ -679,8 +679,8 @@ public:
|
|||
void SetUpscalingBuffer(void *upscaleBuffer);
|
||||
};
|
||||
|
||||
#if defined(ENABLE_AVX)
|
||||
class OpenGLRenderer : public Render3D_AVX
|
||||
#if defined(ENABLE_AVX2)
|
||||
class OpenGLRenderer : public Render3D_AVX2
|
||||
#elif defined(ENABLE_SSE2)
|
||||
class OpenGLRenderer : public Render3D_SSE2
|
||||
#elif defined(ENABLE_ALTIVEC)
|
||||
|
|
|
@ -1419,8 +1419,8 @@ static void* SoftRasterizer_RunClearUsingValues(void *arg)
|
|||
|
||||
static Render3D* SoftRasterizerRendererCreate()
|
||||
{
|
||||
#if defined(ENABLE_AVX)
|
||||
return new SoftRasterizerRenderer_AVX;
|
||||
#if defined(ENABLE_AVX2)
|
||||
return new SoftRasterizerRenderer_AVX2;
|
||||
#elif defined(ENABLE_SSE2)
|
||||
return new SoftRasterizerRenderer_SSE2;
|
||||
#elif defined(ENABLE_ALTIVEC)
|
||||
|
@ -1434,8 +1434,8 @@ static void SoftRasterizerRendererDestroy()
|
|||
{
|
||||
if (CurrentRenderer != BaseRenderer)
|
||||
{
|
||||
#if defined(ENABLE_AVX)
|
||||
SoftRasterizerRenderer_AVX *oldRenderer = (SoftRasterizerRenderer_AVX *)CurrentRenderer;
|
||||
#if defined(ENABLE_AVX2)
|
||||
SoftRasterizerRenderer_AVX2 *oldRenderer = (SoftRasterizerRenderer_AVX2 *)CurrentRenderer;
|
||||
#elif defined(ENABLE_SSE2)
|
||||
SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer;
|
||||
#elif defined(ENABLE_ALTIVEC)
|
||||
|
@ -2606,9 +2606,9 @@ Render3DError SoftRasterizer_SIMD<SIMDBYTES>::SetFramebufferSize(size_t w, size_
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_AVX)
|
||||
#if defined(ENABLE_AVX2)
|
||||
|
||||
void SoftRasterizerRenderer_AVX::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes)
|
||||
void SoftRasterizerRenderer_AVX2::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes)
|
||||
{
|
||||
this->_clearColor_v256u32 = _mm256_set1_epi32(clearColor6665.color);
|
||||
this->_clearDepth_v256u32 = _mm256_set1_epi32(clearAttributes.depth);
|
||||
|
@ -2620,7 +2620,7 @@ void SoftRasterizerRenderer_AVX::LoadClearValues(const FragmentColor &clearColor
|
|||
this->_clearAttrPolyFacing_v256u8 = _mm256_set1_epi8(clearAttributes.polyFacing);
|
||||
}
|
||||
|
||||
void SoftRasterizerRenderer_AVX::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel)
|
||||
void SoftRasterizerRenderer_AVX2::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel)
|
||||
{
|
||||
for (size_t i = startPixel; i < endPixel; i+=sizeof(v256u8))
|
||||
{
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2009-2019 DeSmuME team
|
||||
Copyright (C) 2009-2021 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -134,8 +134,8 @@ public:
|
|||
template<bool SLI, bool USELINEHACK> FORCEINLINE void Render();
|
||||
};
|
||||
|
||||
#if defined(ENABLE_AVX)
|
||||
class SoftRasterizerRenderer : public Render3D_AVX
|
||||
#if defined(ENABLE_AVX2)
|
||||
class SoftRasterizerRenderer : public Render3D_AVX2
|
||||
#elif defined(ENABLE_SSE2)
|
||||
class SoftRasterizerRenderer : public Render3D_SSE2
|
||||
#elif defined(ENABLE_ALTIVEC)
|
||||
|
@ -218,7 +218,7 @@ template <size_t SIMDBYTES>
|
|||
class SoftRasterizer_SIMD : public SoftRasterizerRenderer
|
||||
{
|
||||
protected:
|
||||
#if defined(ENABLE_AVX)
|
||||
#if defined(ENABLE_AVX2)
|
||||
v256u32 _clearColor_v256u32;
|
||||
v256u32 _clearDepth_v256u32;
|
||||
v256u8 _clearAttrOpaquePolyID_v256u8;
|
||||
|
@ -247,8 +247,8 @@ public:
|
|||
virtual Render3DError SetFramebufferSize(size_t w, size_t h);
|
||||
};
|
||||
|
||||
#if defined(ENABLE_AVX)
|
||||
class SoftRasterizerRenderer_AVX : public SoftRasterizer_SIMD<32>
|
||||
#if defined(ENABLE_AVX2)
|
||||
class SoftRasterizerRenderer_AVX2 : public SoftRasterizer_SIMD<32>
|
||||
{
|
||||
protected:
|
||||
virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes);
|
||||
|
|
|
@ -503,6 +503,16 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram
|
|||
return RENDER3DERROR_NOERR;
|
||||
}
|
||||
|
||||
void Render3D::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
|
||||
{
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
|
||||
{
|
||||
outColor16[i] = inColor16[i];
|
||||
outDepth24[i] = DS_DEPTH15TO24(inDepth16[i]);
|
||||
outFog[i] = BIT15(inDepth16[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool ISCOLORBLANK, bool ISDEPTHBLANK>
|
||||
void Render3D::_ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16,
|
||||
u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
|
||||
|
@ -585,12 +595,7 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
|
||||
if (xScroll == 0 && yScroll == 0)
|
||||
{
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
|
||||
{
|
||||
this->clearImageColor16Buffer[i] = clearColorBuffer[i];
|
||||
this->clearImageDepthBuffer[i] = DS_DEPTH15TO24(clearDepthBuffer[i]);
|
||||
this->clearImageFogBuffer[i] = BIT15(clearDepthBuffer[i]);
|
||||
}
|
||||
this->_ClearImageBaseLoop(clearColorBuffer, clearDepthBuffer, this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -773,252 +778,95 @@ Render3DError Render3D_SIMD<SIMDBYTES>::SetFramebufferSize(size_t w, size_t h)
|
|||
return error;
|
||||
}
|
||||
|
||||
#if defined(ENABLE_AVX) || defined(ENABLE_SSE2)
|
||||
|
||||
#if defined(ENABLE_AVX)
|
||||
Render3DError Render3D_AVX::ClearFramebuffer(const GFX3D_State &renderState)
|
||||
#elif defined(ENABLE_SSE2)
|
||||
Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
|
||||
#endif
|
||||
#if defined(ENABLE_AVX2)
|
||||
void Render3D_AVX2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
|
||||
{
|
||||
Render3DError error = RENDER3DERROR_NOERR;
|
||||
const __m256i calcDepthConstants = _mm256_set1_epi32(0x01FF0200);
|
||||
|
||||
if (renderState.enableClearImage)
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v256u16))
|
||||
{
|
||||
//the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers)
|
||||
//uses the scroll registers in the main game engine
|
||||
const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2];
|
||||
const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3];
|
||||
const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
|
||||
const u8 xScroll = scrollBits & 0xFF;
|
||||
const u8 yScroll = (scrollBits >> 8) & 0xFF;
|
||||
// Copy the colors to the color buffer.
|
||||
_mm256_store_si256( (__m256i *)(outColor16 + i) + 0, _mm256_load_si256((__m256i *)(inColor16 + i) + 0) );
|
||||
_mm256_store_si256( (__m256i *)(outColor16 + i) + 1, _mm256_load_si256((__m256i *)(inColor16 + i) + 1) );
|
||||
|
||||
#ifdef ENABLE_AVX2
|
||||
const __m256i calcDepthConstants = _mm256_set1_epi32(0x01FF0200);
|
||||
#else
|
||||
const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200);
|
||||
#endif
|
||||
if (xScroll == 0 && yScroll == 0)
|
||||
{
|
||||
#ifdef ENABLE_AVX2
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v256u16))
|
||||
{
|
||||
// Copy the colors to the color buffer.
|
||||
_mm256_store_si256( (__m256i *)(this->clearImageColor16Buffer + i) + 0, _mm256_load_si256((__m256i *)(clearColorBuffer + i) + 0) );
|
||||
_mm256_store_si256( (__m256i *)(this->clearImageColor16Buffer + i) + 1, _mm256_load_si256((__m256i *)(clearColorBuffer + i) + 1) );
|
||||
|
||||
// Write the depth values to the depth buffer using the following formula from GBATEK.
|
||||
// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
|
||||
// D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF);
|
||||
//
|
||||
// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
|
||||
// D24 = (D15 * 0x0200) + 0x01FF;
|
||||
const __m256i clearDepthLo = _mm256_load_si256((__m256i *)(clearDepthBuffer + i) + 0);
|
||||
const __m256i clearDepthHi = _mm256_load_si256((__m256i *)(clearDepthBuffer + i) + 1);
|
||||
|
||||
const __m256i clearDepthValueLo = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthLo, _mm256_set1_epi16(0x7FFF)), 0xD8 );
|
||||
const __m256i clearDepthValueHi = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthHi, _mm256_set1_epi16(0x7FFF)), 0xD8 );
|
||||
|
||||
__m256i calcDepth0 = _mm256_unpacklo_epi16(clearDepthValueLo, _mm256_set1_epi16(1));
|
||||
__m256i calcDepth1 = _mm256_unpackhi_epi16(clearDepthValueLo, _mm256_set1_epi16(1));
|
||||
__m256i calcDepth2 = _mm256_unpacklo_epi16(clearDepthValueHi, _mm256_set1_epi16(1));
|
||||
__m256i calcDepth3 = _mm256_unpackhi_epi16(clearDepthValueHi, _mm256_set1_epi16(1));
|
||||
|
||||
calcDepth0 = _mm256_madd_epi16(calcDepth0, calcDepthConstants);
|
||||
calcDepth1 = _mm256_madd_epi16(calcDepth1, calcDepthConstants);
|
||||
calcDepth2 = _mm256_madd_epi16(calcDepth2, calcDepthConstants);
|
||||
calcDepth3 = _mm256_madd_epi16(calcDepth3, calcDepthConstants);
|
||||
|
||||
_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 0, calcDepth0);
|
||||
_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 1, calcDepth1);
|
||||
_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 2, calcDepth2);
|
||||
_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 3, calcDepth3);
|
||||
|
||||
// Write the fog flags to the fog flag buffer.
|
||||
const __m256i clearFogLo = _mm256_srli_epi16(clearDepthLo, 15);
|
||||
const __m256i clearFogHi = _mm256_srli_epi16(clearDepthHi, 15);
|
||||
_mm256_store_si256( (__m256i *)(this->clearImageFogBuffer + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) );
|
||||
}
|
||||
#else
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16))
|
||||
{
|
||||
// Copy the colors to the color buffer.
|
||||
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i) + 0, _mm_load_si128((__m128i *)(clearColorBuffer + i) + 0) );
|
||||
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i) + 1, _mm_load_si128((__m128i *)(clearColorBuffer + i) + 1) );
|
||||
|
||||
// Write the depth values to the depth buffer using the following formula from GBATEK.
|
||||
// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
|
||||
// D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF);
|
||||
//
|
||||
// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
|
||||
// D24 = (D15 * 0x0200) + 0x01FF;
|
||||
const __m128i clearDepthLo = _mm_load_si128((__m128i *)(clearDepthBuffer + i) + 0);
|
||||
const __m128i clearDepthHi = _mm_load_si128((__m128i *)(clearDepthBuffer + i) + 1);
|
||||
|
||||
const __m128i clearDepthValueLo = _mm_and_si128(clearDepthLo, _mm_set1_epi16(0x7FFF));
|
||||
const __m128i clearDepthValueHi = _mm_and_si128(clearDepthHi, _mm_set1_epi16(0x7FFF));
|
||||
|
||||
__m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValueLo, _mm_set1_epi16(1));
|
||||
__m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValueLo, _mm_set1_epi16(1));
|
||||
__m128i calcDepth2 = _mm_unpacklo_epi16(clearDepthValueHi, _mm_set1_epi16(1));
|
||||
__m128i calcDepth3 = _mm_unpackhi_epi16(clearDepthValueHi, _mm_set1_epi16(1));
|
||||
|
||||
calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants);
|
||||
calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants);
|
||||
calcDepth2 = _mm_madd_epi16(calcDepth2, calcDepthConstants);
|
||||
calcDepth3 = _mm_madd_epi16(calcDepth3, calcDepthConstants);
|
||||
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 0, calcDepth0);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 1, calcDepth1);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 2, calcDepth2);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 3, calcDepth3);
|
||||
|
||||
// Write the fog flags to the fog flag buffer.
|
||||
const __m128i clearFogLo = _mm_srli_epi16(clearDepthLo, 15);
|
||||
const __m128i clearFogHi = _mm_srli_epi16(clearDepthHi, 15);
|
||||
_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packs_epi16(clearFogLo, clearFogHi));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
// FIXME: Fix SSE2 support for scrolled clear images.
|
||||
// The depth-related code below doesn't actually work, and I don't know why
|
||||
// this is, so just use the scalar version for now.
|
||||
// - rogerman, 2018/09/19
|
||||
/*
|
||||
const size_t shiftCount = xScroll & 0x07;
|
||||
|
||||
for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
{
|
||||
const size_t y = ((iy + yScroll) & 0xFF) << 8;
|
||||
|
||||
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex += 8, ix += 8)
|
||||
{
|
||||
const size_t x = (ix + xScroll) & 0xFF;
|
||||
|
||||
__m128i clearColor;
|
||||
__m128i clearDepth_vec128;
|
||||
|
||||
if (shiftCount == 0)
|
||||
{
|
||||
const size_t srcIndex = y | x;
|
||||
|
||||
clearColor = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex));
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex));
|
||||
}
|
||||
else
|
||||
{
|
||||
const size_t x1 = x & 0xF8;
|
||||
const size_t x0 = (x1 == 0) ? (GPU_FRAMEBUFFER_NATIVE_WIDTH - 8) : x1 - 8;
|
||||
const size_t srcIndex0 = y | x0;
|
||||
const size_t srcIndex1 = y | x1;
|
||||
|
||||
const __m128i clearColor0 = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex0));
|
||||
const __m128i clearColor1 = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex1));
|
||||
const __m128i clearDepth0 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex0));
|
||||
const __m128i clearDepth1 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex1));
|
||||
|
||||
switch (shiftCount)
|
||||
{
|
||||
case 1:
|
||||
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 14);
|
||||
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 14);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 12);
|
||||
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 12);
|
||||
break;
|
||||
|
||||
case 3:
|
||||
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 10);
|
||||
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 10);
|
||||
break;
|
||||
|
||||
case 4:
|
||||
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 8);
|
||||
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 8);
|
||||
break;
|
||||
|
||||
case 5:
|
||||
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 6);
|
||||
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 6);
|
||||
break;
|
||||
|
||||
case 6:
|
||||
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 4);
|
||||
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 4);
|
||||
break;
|
||||
|
||||
case 7:
|
||||
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 2);
|
||||
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 2);
|
||||
break;
|
||||
|
||||
default:
|
||||
clearColor = _mm_setzero_si128();
|
||||
clearDepth_vec128 = _mm_setzero_si128();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const __m128i clearDepthValue = _mm_and_si128(clearDepth_vec128, _mm_set1_epi16(0x7FFF));
|
||||
const __m128i depthPlusOne = _mm_srli_epi16( _mm_adds_epu16(clearDepthValue, _mm_set1_epi16(1)), 15);
|
||||
const __m128i clearFog = _mm_srli_epi16(clearDepth_vec128, 15);
|
||||
|
||||
__m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValue, depthPlusOne);
|
||||
__m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValue, depthPlusOne);
|
||||
calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants);
|
||||
calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants);
|
||||
|
||||
_mm_store_si128((__m128i *)(this->clearImageColor16Buffer + dstIndex), clearColor);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + dstIndex + 0), calcDepth0);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + dstIndex + 4), calcDepth1);
|
||||
_mm_storel_epi64((__m128i *)(this->clearImageFogBuffer + dstIndex), _mm_packs_epi16(clearFog, _mm_setzero_si128()));
|
||||
}
|
||||
}
|
||||
*/
|
||||
const bool isClearColorBlank = (clearColorBuffer >= (u16 *)MMU.blank_memory);
|
||||
const bool isClearDepthBlank = (clearDepthBuffer >= (u16 *)MMU.blank_memory);
|
||||
|
||||
if (!isClearColorBlank && !isClearDepthBlank)
|
||||
{
|
||||
this->_ClearImageScrolledLoop<false, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
|
||||
this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
|
||||
}
|
||||
else if (isClearColorBlank)
|
||||
{
|
||||
this->_ClearImageScrolledLoop< true, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
|
||||
this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
|
||||
}
|
||||
else if (isClearDepthBlank)
|
||||
{
|
||||
this->_ClearImageScrolledLoop<false, true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
|
||||
this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
this->_ClearImageScrolledLoop< true, true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
|
||||
this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
|
||||
}
|
||||
}
|
||||
// Write the depth values to the depth buffer using the following formula from GBATEK.
|
||||
// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
|
||||
// D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF);
|
||||
//
|
||||
// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
|
||||
// D24 = (D15 * 0x0200) + 0x01FF;
|
||||
const __m256i clearDepthLo = _mm256_load_si256((__m256i *)(inDepth16 + i) + 0);
|
||||
const __m256i clearDepthHi = _mm256_load_si256((__m256i *)(inDepth16 + i) + 1);
|
||||
|
||||
error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->_clearAttributes.opaquePolyID);
|
||||
if (error != RENDER3DERROR_NOERR)
|
||||
{
|
||||
error = this->ClearUsingValues(this->_clearColor6665, this->_clearAttributes);
|
||||
}
|
||||
const __m256i clearDepthValueLo = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthLo, _mm256_set1_epi16(0x7FFF)), 0xD8 );
|
||||
const __m256i clearDepthValueHi = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthHi, _mm256_set1_epi16(0x7FFF)), 0xD8 );
|
||||
|
||||
__m256i calcDepth0 = _mm256_unpacklo_epi16(clearDepthValueLo, _mm256_set1_epi16(1));
|
||||
__m256i calcDepth1 = _mm256_unpackhi_epi16(clearDepthValueLo, _mm256_set1_epi16(1));
|
||||
__m256i calcDepth2 = _mm256_unpacklo_epi16(clearDepthValueHi, _mm256_set1_epi16(1));
|
||||
__m256i calcDepth3 = _mm256_unpackhi_epi16(clearDepthValueHi, _mm256_set1_epi16(1));
|
||||
|
||||
calcDepth0 = _mm256_madd_epi16(calcDepth0, calcDepthConstants);
|
||||
calcDepth1 = _mm256_madd_epi16(calcDepth1, calcDepthConstants);
|
||||
calcDepth2 = _mm256_madd_epi16(calcDepth2, calcDepthConstants);
|
||||
calcDepth3 = _mm256_madd_epi16(calcDepth3, calcDepthConstants);
|
||||
|
||||
_mm256_store_si256((__m256i *)(outDepth24 + i) + 0, calcDepth0);
|
||||
_mm256_store_si256((__m256i *)(outDepth24 + i) + 1, calcDepth1);
|
||||
_mm256_store_si256((__m256i *)(outDepth24 + i) + 2, calcDepth2);
|
||||
_mm256_store_si256((__m256i *)(outDepth24 + i) + 3, calcDepth3);
|
||||
|
||||
// Write the fog flags to the fog flag buffer.
|
||||
const __m256i clearFogLo = _mm256_srli_epi16(clearDepthLo, 15);
|
||||
const __m256i clearFogHi = _mm256_srli_epi16(clearDepthHi, 15);
|
||||
_mm256_store_si256( (__m256i *)(outFog + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) );
|
||||
}
|
||||
else
|
||||
{
|
||||
error = this->ClearUsingValues(this->_clearColor6665, this->_clearAttributes);
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
#endif // defined(ENABLE_AVX) || defined(ENABLE_SSE2)
|
||||
#elif defined(ENABLE_SSE2)
|
||||
void Render3D_SSE2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
|
||||
{
|
||||
const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200);
|
||||
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16))
|
||||
{
|
||||
// Copy the colors to the color buffer.
|
||||
_mm_store_si128( (__m128i *)(outColor16 + i) + 0, _mm_load_si128((__m128i *)(inColor16 + i) + 0) );
|
||||
_mm_store_si128( (__m128i *)(outColor16 + i) + 1, _mm_load_si128((__m128i *)(inColor16 + i) + 1) );
|
||||
|
||||
// Write the depth values to the depth buffer using the following formula from GBATEK.
|
||||
// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
|
||||
// D24 = (D15 * 0x0200) + (((D15 + 1) >> 15) * 0x01FF);
|
||||
//
|
||||
// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
|
||||
// D24 = (D15 * 0x0200) + 0x01FF;
|
||||
const __m128i clearDepthLo = _mm_load_si128((__m128i *)(inDepth16 + i) + 0);
|
||||
const __m128i clearDepthHi = _mm_load_si128((__m128i *)(inDepth16 + i) + 1);
|
||||
|
||||
const __m128i clearDepthValueLo = _mm_and_si128(clearDepthLo, _mm_set1_epi16(0x7FFF));
|
||||
const __m128i clearDepthValueHi = _mm_and_si128(clearDepthHi, _mm_set1_epi16(0x7FFF));
|
||||
|
||||
__m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValueLo, _mm_set1_epi16(1));
|
||||
__m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValueLo, _mm_set1_epi16(1));
|
||||
__m128i calcDepth2 = _mm_unpacklo_epi16(clearDepthValueHi, _mm_set1_epi16(1));
|
||||
__m128i calcDepth3 = _mm_unpackhi_epi16(clearDepthValueHi, _mm_set1_epi16(1));
|
||||
|
||||
calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants);
|
||||
calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants);
|
||||
calcDepth2 = _mm_madd_epi16(calcDepth2, calcDepthConstants);
|
||||
calcDepth3 = _mm_madd_epi16(calcDepth3, calcDepthConstants);
|
||||
|
||||
_mm_store_si128((__m128i *)(outDepth24 + i) + 0, calcDepth0);
|
||||
_mm_store_si128((__m128i *)(outDepth24 + i) + 1, calcDepth1);
|
||||
_mm_store_si128((__m128i *)(outDepth24 + i) + 2, calcDepth2);
|
||||
_mm_store_si128((__m128i *)(outDepth24 + i) + 3, calcDepth3);
|
||||
|
||||
// Write the fog flags to the fog flag buffer.
|
||||
const __m128i clearFogLo = _mm_srli_epi16(clearDepthLo, 15);
|
||||
const __m128i clearFogHi = _mm_srli_epi16(clearDepthHi, 15);
|
||||
_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packs_epi16(clearFogLo, clearFogHi));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template Render3D_SIMD<16>::Render3D_SIMD();
|
||||
template Render3D_SIMD<32>::Render3D_SIMD();
|
||||
|
|
|
@ -192,7 +192,8 @@ protected:
|
|||
CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
|
||||
CACHE_ALIGN u32 clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
|
||||
CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
|
||||
|
||||
|
||||
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
|
||||
template<bool ISCOLORBLANK, bool ISDEPTHBLANK> void _ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16,
|
||||
u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
|
||||
|
||||
|
@ -281,20 +282,20 @@ public:
|
|||
virtual Render3DError SetFramebufferSize(size_t w, size_t h);
|
||||
};
|
||||
|
||||
#if defined(ENABLE_AVX)
|
||||
#if defined(ENABLE_AVX2)
|
||||
|
||||
class Render3D_AVX : public Render3D_SIMD<32>
|
||||
class Render3D_AVX2 : public Render3D_SIMD<32>
|
||||
{
|
||||
public:
|
||||
virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);
|
||||
public:
|
||||
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
|
||||
};
|
||||
|
||||
#elif defined(ENABLE_SSE2)
|
||||
|
||||
class Render3D_SSE2 : public Render3D_SIMD<16>
|
||||
{
|
||||
public:
|
||||
virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);
|
||||
public:
|
||||
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
|
||||
};
|
||||
|
||||
#elif defined(ENABLE_ALTIVEC)
|
||||
|
|
Loading…
Reference in New Issue