SoftRasterizer: Due to how the class inheritance works, SoftRasterizer_AVX actually requires AVX2 instead of just AVX. And in reality, AVX2 is just more practical than AVX for what we're doing here.

- Also do some minor refactoring of Render3D.
This commit is contained in:
rogerman 2021-09-04 19:31:55 -07:00
parent 92fcd1313e
commit c49a480d51
5 changed files with 116 additions and 267 deletions

View File

@ -679,8 +679,8 @@ public:
void SetUpscalingBuffer(void *upscaleBuffer);
};
#if defined(ENABLE_AVX)
class OpenGLRenderer : public Render3D_AVX
#if defined(ENABLE_AVX2)
class OpenGLRenderer : public Render3D_AVX2
#elif defined(ENABLE_SSE2)
class OpenGLRenderer : public Render3D_SSE2
#elif defined(ENABLE_ALTIVEC)

View File

@ -1419,8 +1419,8 @@ static void* SoftRasterizer_RunClearUsingValues(void *arg)
static Render3D* SoftRasterizerRendererCreate()
{
#if defined(ENABLE_AVX)
return new SoftRasterizerRenderer_AVX;
#if defined(ENABLE_AVX2)
return new SoftRasterizerRenderer_AVX2;
#elif defined(ENABLE_SSE2)
return new SoftRasterizerRenderer_SSE2;
#elif defined(ENABLE_ALTIVEC)
@ -1434,8 +1434,8 @@ static void SoftRasterizerRendererDestroy()
{
if (CurrentRenderer != BaseRenderer)
{
#if defined(ENABLE_AVX)
SoftRasterizerRenderer_AVX *oldRenderer = (SoftRasterizerRenderer_AVX *)CurrentRenderer;
#if defined(ENABLE_AVX2)
SoftRasterizerRenderer_AVX2 *oldRenderer = (SoftRasterizerRenderer_AVX2 *)CurrentRenderer;
#elif defined(ENABLE_SSE2)
SoftRasterizerRenderer_SSE2 *oldRenderer = (SoftRasterizerRenderer_SSE2 *)CurrentRenderer;
#elif defined(ENABLE_ALTIVEC)
@ -2606,9 +2606,9 @@ Render3DError SoftRasterizer_SIMD<SIMDBYTES>::SetFramebufferSize(size_t w, size_
#endif
#if defined(ENABLE_AVX)
#if defined(ENABLE_AVX2)
void SoftRasterizerRenderer_AVX::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes)
void SoftRasterizerRenderer_AVX2::LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes)
{
this->_clearColor_v256u32 = _mm256_set1_epi32(clearColor6665.color);
this->_clearDepth_v256u32 = _mm256_set1_epi32(clearAttributes.depth);
@ -2620,7 +2620,7 @@ void SoftRasterizerRenderer_AVX::LoadClearValues(const FragmentColor &clearColor
this->_clearAttrPolyFacing_v256u8 = _mm256_set1_epi8(clearAttributes.polyFacing);
}
void SoftRasterizerRenderer_AVX::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel)
void SoftRasterizerRenderer_AVX2::ClearUsingValues_Execute(const size_t startPixel, const size_t endPixel)
{
for (size_t i = startPixel; i < endPixel; i+=sizeof(v256u8))
{

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2009-2019 DeSmuME team
Copyright (C) 2009-2021 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -134,8 +134,8 @@ public:
template<bool SLI, bool USELINEHACK> FORCEINLINE void Render();
};
#if defined(ENABLE_AVX)
class SoftRasterizerRenderer : public Render3D_AVX
#if defined(ENABLE_AVX2)
class SoftRasterizerRenderer : public Render3D_AVX2
#elif defined(ENABLE_SSE2)
class SoftRasterizerRenderer : public Render3D_SSE2
#elif defined(ENABLE_ALTIVEC)
@ -218,7 +218,7 @@ template <size_t SIMDBYTES>
class SoftRasterizer_SIMD : public SoftRasterizerRenderer
{
protected:
#if defined(ENABLE_AVX)
#if defined(ENABLE_AVX2)
v256u32 _clearColor_v256u32;
v256u32 _clearDepth_v256u32;
v256u8 _clearAttrOpaquePolyID_v256u8;
@ -247,8 +247,8 @@ public:
virtual Render3DError SetFramebufferSize(size_t w, size_t h);
};
#if defined(ENABLE_AVX)
class SoftRasterizerRenderer_AVX : public SoftRasterizer_SIMD<32>
#if defined(ENABLE_AVX2)
class SoftRasterizerRenderer_AVX2 : public SoftRasterizer_SIMD<32>
{
protected:
virtual void LoadClearValues(const FragmentColor &clearColor6665, const FragmentAttributes &clearAttributes);

View File

@ -503,6 +503,16 @@ Render3DError Render3D::FlushFramebuffer(const FragmentColor *__restrict srcFram
return RENDER3DERROR_NOERR;
}
void Render3D::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
{
outColor16[i] = inColor16[i];
outDepth24[i] = DS_DEPTH15TO24(inDepth16[i]);
outFog[i] = BIT15(inDepth16[i]);
}
}
template <bool ISCOLORBLANK, bool ISDEPTHBLANK>
void Render3D::_ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16,
u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
@ -585,12 +595,7 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
if (xScroll == 0 && yScroll == 0)
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
{
this->clearImageColor16Buffer[i] = clearColorBuffer[i];
this->clearImageDepthBuffer[i] = DS_DEPTH15TO24(clearDepthBuffer[i]);
this->clearImageFogBuffer[i] = BIT15(clearDepthBuffer[i]);
}
this->_ClearImageBaseLoop(clearColorBuffer, clearDepthBuffer, this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
}
else
{
@ -773,39 +778,16 @@ Render3DError Render3D_SIMD<SIMDBYTES>::SetFramebufferSize(size_t w, size_t h)
return error;
}
#if defined(ENABLE_AVX) || defined(ENABLE_SSE2)
#if defined(ENABLE_AVX)
Render3DError Render3D_AVX::ClearFramebuffer(const GFX3D_State &renderState)
#elif defined(ENABLE_SSE2)
Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
#endif
#if defined(ENABLE_AVX2)
void Render3D_AVX2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
{
Render3DError error = RENDER3DERROR_NOERR;
if (renderState.enableClearImage)
{
//the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers)
//uses the scroll registers in the main game engine
const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2];
const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3];
const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
const u8 xScroll = scrollBits & 0xFF;
const u8 yScroll = (scrollBits >> 8) & 0xFF;
#ifdef ENABLE_AVX2
const __m256i calcDepthConstants = _mm256_set1_epi32(0x01FF0200);
#else
const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200);
#endif
if (xScroll == 0 && yScroll == 0)
{
#ifdef ENABLE_AVX2
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v256u16))
{
// Copy the colors to the color buffer.
_mm256_store_si256( (__m256i *)(this->clearImageColor16Buffer + i) + 0, _mm256_load_si256((__m256i *)(clearColorBuffer + i) + 0) );
_mm256_store_si256( (__m256i *)(this->clearImageColor16Buffer + i) + 1, _mm256_load_si256((__m256i *)(clearColorBuffer + i) + 1) );
_mm256_store_si256( (__m256i *)(outColor16 + i) + 0, _mm256_load_si256((__m256i *)(inColor16 + i) + 0) );
_mm256_store_si256( (__m256i *)(outColor16 + i) + 1, _mm256_load_si256((__m256i *)(inColor16 + i) + 1) );
// Write the depth values to the depth buffer using the following formula from GBATEK.
// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
@ -813,8 +795,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
//
// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
// D24 = (D15 * 0x0200) + 0x01FF;
const __m256i clearDepthLo = _mm256_load_si256((__m256i *)(clearDepthBuffer + i) + 0);
const __m256i clearDepthHi = _mm256_load_si256((__m256i *)(clearDepthBuffer + i) + 1);
const __m256i clearDepthLo = _mm256_load_si256((__m256i *)(inDepth16 + i) + 0);
const __m256i clearDepthHi = _mm256_load_si256((__m256i *)(inDepth16 + i) + 1);
const __m256i clearDepthValueLo = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthLo, _mm256_set1_epi16(0x7FFF)), 0xD8 );
const __m256i clearDepthValueHi = _mm256_permute4x64_epi64( _mm256_and_si256(clearDepthHi, _mm256_set1_epi16(0x7FFF)), 0xD8 );
@ -829,22 +811,27 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
calcDepth2 = _mm256_madd_epi16(calcDepth2, calcDepthConstants);
calcDepth3 = _mm256_madd_epi16(calcDepth3, calcDepthConstants);
_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 0, calcDepth0);
_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 1, calcDepth1);
_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 2, calcDepth2);
_mm256_store_si256((__m256i *)(this->clearImageDepthBuffer + i) + 3, calcDepth3);
_mm256_store_si256((__m256i *)(outDepth24 + i) + 0, calcDepth0);
_mm256_store_si256((__m256i *)(outDepth24 + i) + 1, calcDepth1);
_mm256_store_si256((__m256i *)(outDepth24 + i) + 2, calcDepth2);
_mm256_store_si256((__m256i *)(outDepth24 + i) + 3, calcDepth3);
// Write the fog flags to the fog flag buffer.
const __m256i clearFogLo = _mm256_srli_epi16(clearDepthLo, 15);
const __m256i clearFogHi = _mm256_srli_epi16(clearDepthHi, 15);
_mm256_store_si256( (__m256i *)(this->clearImageFogBuffer + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) );
_mm256_store_si256( (__m256i *)(outFog + i), _mm256_permute4x64_epi64(_mm256_packus_epi16(clearFogLo, clearFogHi), 0xD8) );
}
#else
}
#elif defined(ENABLE_SSE2)
void Render3D_SSE2::_ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog)
{
const __m128i calcDepthConstants = _mm_set1_epi32(0x01FF0200);
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i+=sizeof(v128u16))
{
// Copy the colors to the color buffer.
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i) + 0, _mm_load_si128((__m128i *)(clearColorBuffer + i) + 0) );
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i) + 1, _mm_load_si128((__m128i *)(clearColorBuffer + i) + 1) );
_mm_store_si128( (__m128i *)(outColor16 + i) + 0, _mm_load_si128((__m128i *)(inColor16 + i) + 0) );
_mm_store_si128( (__m128i *)(outColor16 + i) + 1, _mm_load_si128((__m128i *)(inColor16 + i) + 1) );
// Write the depth values to the depth buffer using the following formula from GBATEK.
// 15-bit to 24-bit depth formula from http://problemkaputt.de/gbatek.htm#ds3drearplane
@ -852,8 +839,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
//
// For now, let's forget GBATEK (which could be wrong) and try using a simpified formula:
// D24 = (D15 * 0x0200) + 0x01FF;
const __m128i clearDepthLo = _mm_load_si128((__m128i *)(clearDepthBuffer + i) + 0);
const __m128i clearDepthHi = _mm_load_si128((__m128i *)(clearDepthBuffer + i) + 1);
const __m128i clearDepthLo = _mm_load_si128((__m128i *)(inDepth16 + i) + 0);
const __m128i clearDepthHi = _mm_load_si128((__m128i *)(inDepth16 + i) + 1);
const __m128i clearDepthValueLo = _mm_and_si128(clearDepthLo, _mm_set1_epi16(0x7FFF));
const __m128i clearDepthValueHi = _mm_and_si128(clearDepthHi, _mm_set1_epi16(0x7FFF));
@ -868,157 +855,18 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
calcDepth2 = _mm_madd_epi16(calcDepth2, calcDepthConstants);
calcDepth3 = _mm_madd_epi16(calcDepth3, calcDepthConstants);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 0, calcDepth0);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 1, calcDepth1);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 2, calcDepth2);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i) + 3, calcDepth3);
_mm_store_si128((__m128i *)(outDepth24 + i) + 0, calcDepth0);
_mm_store_si128((__m128i *)(outDepth24 + i) + 1, calcDepth1);
_mm_store_si128((__m128i *)(outDepth24 + i) + 2, calcDepth2);
_mm_store_si128((__m128i *)(outDepth24 + i) + 3, calcDepth3);
// Write the fog flags to the fog flag buffer.
const __m128i clearFogLo = _mm_srli_epi16(clearDepthLo, 15);
const __m128i clearFogHi = _mm_srli_epi16(clearDepthHi, 15);
_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packs_epi16(clearFogLo, clearFogHi));
}
}
#endif
}
else
{
// FIXME: Fix SSE2 support for scrolled clear images.
// The depth-related code below doesn't actually work, and I don't know why
// this is, so just use the scalar version for now.
// - rogerman, 2018/09/19
/*
const size_t shiftCount = xScroll & 0x07;
for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
{
const size_t y = ((iy + yScroll) & 0xFF) << 8;
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex += 8, ix += 8)
{
const size_t x = (ix + xScroll) & 0xFF;
__m128i clearColor;
__m128i clearDepth_vec128;
if (shiftCount == 0)
{
const size_t srcIndex = y | x;
clearColor = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex));
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex));
}
else
{
const size_t x1 = x & 0xF8;
const size_t x0 = (x1 == 0) ? (GPU_FRAMEBUFFER_NATIVE_WIDTH - 8) : x1 - 8;
const size_t srcIndex0 = y | x0;
const size_t srcIndex1 = y | x1;
const __m128i clearColor0 = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex0));
const __m128i clearColor1 = _mm_load_si128((__m128i *)(clearColorBuffer + srcIndex1));
const __m128i clearDepth0 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex0));
const __m128i clearDepth1 = _mm_load_si128((__m128i *)(clearDepthBuffer + srcIndex1));
switch (shiftCount)
{
case 1:
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 14);
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 14);
break;
case 2:
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 12);
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 12);
break;
case 3:
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 10);
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 10);
break;
case 4:
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 8);
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 8);
break;
case 5:
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 6);
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 6);
break;
case 6:
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 4);
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 4);
break;
case 7:
clearColor = _mm_alignr_epi8(clearColor1, clearColor0, 2);
clearDepth_vec128 = _mm_alignr_epi8(clearDepth1, clearDepth0, 2);
break;
default:
clearColor = _mm_setzero_si128();
clearDepth_vec128 = _mm_setzero_si128();
break;
}
}
const __m128i clearDepthValue = _mm_and_si128(clearDepth_vec128, _mm_set1_epi16(0x7FFF));
const __m128i depthPlusOne = _mm_srli_epi16( _mm_adds_epu16(clearDepthValue, _mm_set1_epi16(1)), 15);
const __m128i clearFog = _mm_srli_epi16(clearDepth_vec128, 15);
__m128i calcDepth0 = _mm_unpacklo_epi16(clearDepthValue, depthPlusOne);
__m128i calcDepth1 = _mm_unpackhi_epi16(clearDepthValue, depthPlusOne);
calcDepth0 = _mm_madd_epi16(calcDepth0, calcDepthConstants);
calcDepth1 = _mm_madd_epi16(calcDepth1, calcDepthConstants);
_mm_store_si128((__m128i *)(this->clearImageColor16Buffer + dstIndex), clearColor);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + dstIndex + 0), calcDepth0);
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + dstIndex + 4), calcDepth1);
_mm_storel_epi64((__m128i *)(this->clearImageFogBuffer + dstIndex), _mm_packs_epi16(clearFog, _mm_setzero_si128()));
}
}
*/
const bool isClearColorBlank = (clearColorBuffer >= (u16 *)MMU.blank_memory);
const bool isClearDepthBlank = (clearDepthBuffer >= (u16 *)MMU.blank_memory);
if (!isClearColorBlank && !isClearDepthBlank)
{
this->_ClearImageScrolledLoop<false, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
}
else if (isClearColorBlank)
{
this->_ClearImageScrolledLoop< true, false>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
}
else if (isClearDepthBlank)
{
this->_ClearImageScrolledLoop<false, true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
}
else
{
this->_ClearImageScrolledLoop< true, true>(xScroll, yScroll, clearColorBuffer, clearDepthBuffer,
this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer);
}
}
error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->_clearAttributes.opaquePolyID);
if (error != RENDER3DERROR_NOERR)
{
error = this->ClearUsingValues(this->_clearColor6665, this->_clearAttributes);
}
}
else
{
error = this->ClearUsingValues(this->_clearColor6665, this->_clearAttributes);
}
return error;
}
#endif // defined(ENABLE_AVX) || defined(ENABLE_SSE2)
template Render3D_SIMD<16>::Render3D_SIMD();
template Render3D_SIMD<32>::Render3D_SIMD();

View File

@ -193,6 +193,7 @@ protected:
CACHE_ALIGN u32 clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
template<bool ISCOLORBLANK, bool ISDEPTHBLANK> void _ClearImageScrolledLoop(const u8 xScroll, const u8 yScroll, const u16 *__restrict inColor16, const u16 *__restrict inDepth16,
u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
@ -281,12 +282,12 @@ public:
virtual Render3DError SetFramebufferSize(size_t w, size_t h);
};
#if defined(ENABLE_AVX)
#if defined(ENABLE_AVX2)
class Render3D_AVX : public Render3D_SIMD<32>
class Render3D_AVX2 : public Render3D_SIMD<32>
{
public:
virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
};
#elif defined(ENABLE_SSE2)
@ -294,7 +295,7 @@ public:
class Render3D_SSE2 : public Render3D_SIMD<16>
{
public:
virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);
virtual void _ClearImageBaseLoop(const u16 *__restrict inColor16, const u16 *__restrict inDepth16, u16 *__restrict outColor16, u32 *__restrict outDepth24, u8 *__restrict outFog);
};
#elif defined(ENABLE_ALTIVEC)