GPU:
- Do some code cleanup, especially with the SSE2-related code.
- Fix a potential crash in the OpenGL renderer when changing the framebuffer size.
This commit is contained in:
parent
e53e289e77
commit
55516fc20e
|
@ -2206,7 +2206,7 @@ PLAIN_CLEAR:
|
|||
if (gpu->LayersEnable[4])
|
||||
{
|
||||
//n.b. - this is clearing the sprite line buffer to the background color,
|
||||
memset_u16(gpu->sprColor, backdrop_color, GPU_FRAMEBUFFER_NATIVE_WIDTH);
|
||||
memset_u16_fast<GPU_FRAMEBUFFER_NATIVE_WIDTH>(gpu->sprColor, backdrop_color);
|
||||
|
||||
//zero 06-may-09: I properly supported window color effects for backdrop, but I am not sure
|
||||
//how it interacts with this. I wish we knew why we needed this
|
||||
|
@ -2531,14 +2531,14 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
|
|||
{
|
||||
if (factor < 16)
|
||||
{
|
||||
#ifdef ENABLE_SSE2
|
||||
static size_t ssePixCount = pixCount - (pixCount % 4);
|
||||
static const __m128i colorMask = _mm_set1_epi16(0x7FFF);
|
||||
size_t i = 0;
|
||||
|
||||
for (size_t i = 0; i < ssePixCount; i += 8)
|
||||
#ifdef ENABLE_SSE2
|
||||
const size_t ssePixCount = pixCount - (pixCount % 8);
|
||||
for (; i < ssePixCount; i += 8)
|
||||
{
|
||||
__m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i));
|
||||
dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask);
|
||||
dstColor_vec128 = _mm_and_si128(dstColor_vec128, _mm_set1_epi16(0x7FFF));
|
||||
|
||||
dstLine[i+7] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ];
|
||||
dstLine[i+6] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ];
|
||||
|
@ -2549,17 +2549,11 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
|
|||
dstLine[i+1] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ];
|
||||
dstLine[i+0] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ];
|
||||
}
|
||||
|
||||
for (size_t i = ssePixCount; i < pixCount; i++)
|
||||
{
|
||||
dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ];
|
||||
}
|
||||
#else
|
||||
for (size_t i = 0; i < pixCount; i++)
|
||||
{
|
||||
dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ];
|
||||
}
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -2573,14 +2567,14 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
|
|||
{
|
||||
if (factor < 16)
|
||||
{
|
||||
#ifdef ENABLE_SSE2
|
||||
static size_t ssePixCount = pixCount - (pixCount % 4);
|
||||
static const __m128i colorMask = _mm_set1_epi16(0x7FFF);
|
||||
size_t i = 0;
|
||||
|
||||
for (size_t i = 0; i < ssePixCount; i += 8)
|
||||
#ifdef ENABLE_SSE2
|
||||
const size_t ssePixCount = pixCount - (pixCount % 8);
|
||||
for (; i < ssePixCount; i += 8)
|
||||
{
|
||||
__m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i));
|
||||
dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask);
|
||||
dstColor_vec128 = _mm_and_si128(dstColor_vec128, _mm_set1_epi16(0x7FFF));
|
||||
|
||||
dstLine[i+7] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ];
|
||||
dstLine[i+6] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ];
|
||||
|
@ -2591,17 +2585,11 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
|
|||
dstLine[i+1] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ];
|
||||
dstLine[i+0] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ];
|
||||
}
|
||||
|
||||
for (size_t i = ssePixCount; i < pixCount; i++)
|
||||
{
|
||||
dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ];
|
||||
}
|
||||
#else
|
||||
for (size_t i = 0; i < pixCount; i++)
|
||||
{
|
||||
dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ];
|
||||
}
|
||||
#endif
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -2614,7 +2602,6 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod
|
|||
case GPUMasterBrightMode_Reserved:
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<size_t WIN_NUM>
|
||||
|
@ -2818,10 +2805,21 @@ void GPU_RenderLine(NDS_Screen *screen, const u16 l, bool skip)
|
|||
{
|
||||
//this has not been tested since the dma timing for dispfifo was changed around the time of
|
||||
//newemuloop. it may not work.
|
||||
for (size_t i = 0; i < 128; i++)
|
||||
#ifdef ENABLE_SSE2
|
||||
const __m128i fifoMask = _mm_set1_epi32(0x7FFF7FFF);
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(__m128i); i++)
|
||||
{
|
||||
__m128i fifoColor = _mm_set_epi32(DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv(), DISP_FIFOrecv());
|
||||
fifoColor = _mm_shuffle_epi32(fifoColor, 0x1B); // We need to shuffle the four FIFO values back into the correct order, since they were originally loaded in reverse order.
|
||||
|
||||
((__m128i *)dstLine)[i] = fifoColor & fifoMask;
|
||||
}
|
||||
#else
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16) / sizeof(u32); i++)
|
||||
{
|
||||
((u32 *)dstLine)[i] = DISP_FIFOrecv() & 0x7FFF7FFF;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (_gpuFramebufferWidth != GPU_FRAMEBUFFER_NATIVE_WIDTH)
|
||||
{
|
||||
|
|
|
@ -893,7 +893,6 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned
|
|||
this->versionRevision = revision;
|
||||
}
|
||||
|
||||
#if defined(ENABLE_SSSE3) && defined(LOCAL_LE)
|
||||
Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
|
||||
{
|
||||
// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
|
||||
|
@ -905,7 +904,10 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
|
|||
|
||||
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
|
||||
{
|
||||
for (size_t x = 0; x < ssePixCount; x+=4, ir+=4, iw+=4)
|
||||
size_t x = 0;
|
||||
|
||||
#if defined(ENABLE_SSSE3) && defined(LOCAL_LE)
|
||||
for (; x < ssePixCount; x += 4, ir += 4, iw += 4)
|
||||
{
|
||||
// Convert to RGBA6665
|
||||
__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
|
||||
|
@ -923,65 +925,42 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
|
|||
color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir));
|
||||
|
||||
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R
|
||||
b = _mm_slli_epi32(b, 7); // Shift to B
|
||||
b = _mm_slli_si128(b, 7); // Shift to B
|
||||
|
||||
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G
|
||||
g = _mm_srli_epi32(g, 6); // Shift in G
|
||||
g = _mm_srli_si128(g, 6); // Shift in G
|
||||
|
||||
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B
|
||||
r = _mm_srli_epi32(r, 19); // Shift to R
|
||||
r = _mm_srli_si128(r, 19); // Shift to R
|
||||
|
||||
a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
|
||||
a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A
|
||||
a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A
|
||||
|
||||
color = _mm_or_si128(b, g);
|
||||
color = _mm_or_si128(color, r);
|
||||
color = _mm_or_si128(color, a);
|
||||
color = b | g | r | a;
|
||||
|
||||
// All the colors are currently placed every other 16 bits, so we need to swizzle them
|
||||
// to the lower 64 bits of our vector before we store them back to memory.
|
||||
color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
|
||||
_mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color);
|
||||
}
|
||||
#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
|
||||
|
||||
for (size_t x = ssePixCount; x < pixCount; x++, ir++, iw++)
|
||||
{
|
||||
dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color);
|
||||
dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].g >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].r >> 3) & 0x1F) |
|
||||
((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
|
||||
}
|
||||
}
|
||||
|
||||
return RENDER3DERROR_NOERR;
|
||||
}
|
||||
|
||||
#else // Code path where SSSE3 or little-endian is not supported
|
||||
|
||||
Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
|
||||
{
|
||||
// Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL
|
||||
// stores pixels using a flipped Y-coordinate, so this needs to be flipped back
|
||||
// to the DS Y-coordinate.
|
||||
for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2))
|
||||
{
|
||||
for (size_t x = 0; x < this->_framebufferWidth; x++, ir++, iw++)
|
||||
for (; x < pixCount; x++, ir++, iw++)
|
||||
{
|
||||
// Use the correct endian format since OpenGL uses the native endian of
|
||||
// the architecture it is running on.
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
#ifdef LOCAL_BE
|
||||
dstRGBA6665[iw].color = BGRA8888_32_To_RGBA6665_32(this->_framebufferColor[ir].color);
|
||||
dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].g >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].r >> 3) & 0x1F) |
|
||||
dstRGBA5551[iw] = R5G5B5TORGB15( (this->_framebufferColor[ir].b >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].g >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].r >> 3) & 0x1F) |
|
||||
((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
|
||||
#else
|
||||
dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color);
|
||||
dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].g >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].r >> 3) & 0x1F) |
|
||||
dstRGBA5551[iw] = R5G5B5TORGB15( (this->_framebufferColor[ir].b >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].g >> 3) & 0x1F,
|
||||
(this->_framebufferColor[ir].r >> 3) & 0x1F) |
|
||||
((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000);
|
||||
#endif
|
||||
}
|
||||
|
@ -990,8 +969,6 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA
|
|||
return RENDER3DERROR_NOERR;
|
||||
}
|
||||
|
||||
#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE)
|
||||
|
||||
OpenGLRenderer_1_2::~OpenGLRenderer_1_2()
|
||||
{
|
||||
glFinish();
|
||||
|
@ -1902,7 +1879,7 @@ Render3DError OpenGLRenderer_1_2::UploadClearImage(const u16 *__restrict colorBu
|
|||
}
|
||||
else
|
||||
{
|
||||
for (size_t i = 0; i < this->_framebufferWidth * this->_framebufferHeight; i++)
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
|
||||
{
|
||||
OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8;
|
||||
}
|
||||
|
@ -2782,6 +2759,11 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h)
|
|||
return OGLERROR_NOERR;
|
||||
}
|
||||
|
||||
if (!BEGINGL())
|
||||
{
|
||||
return OGLERROR_BEGINGL_FAILED;
|
||||
}
|
||||
|
||||
if (this->isFBOSupported)
|
||||
{
|
||||
glActiveTextureARB(GL_TEXTURE0_ARB + OGLTextureUnitID_GColor);
|
||||
|
@ -2843,6 +2825,8 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h)
|
|||
|
||||
free_aligned(oldFramebufferColor);
|
||||
|
||||
ENDGL();
|
||||
|
||||
return OGLERROR_NOERR;
|
||||
}
|
||||
|
||||
|
@ -2892,7 +2876,7 @@ Render3DError OpenGLRenderer_1_3::UploadClearImage(const u16 *__restrict colorBu
|
|||
}
|
||||
else
|
||||
{
|
||||
for (size_t i = 0; i < this->_framebufferWidth * this->_framebufferHeight; i++)
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
|
||||
{
|
||||
OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8;
|
||||
}
|
||||
|
@ -2931,6 +2915,11 @@ Render3DError OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h)
|
|||
return OGLERROR_NOERR;
|
||||
}
|
||||
|
||||
if (!BEGINGL())
|
||||
{
|
||||
return OGLERROR_BEGINGL_FAILED;
|
||||
}
|
||||
|
||||
if (this->isFBOSupported)
|
||||
{
|
||||
glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor);
|
||||
|
@ -2992,6 +2981,8 @@ Render3DError OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h)
|
|||
|
||||
free_aligned(oldFramebufferColor);
|
||||
|
||||
ENDGL();
|
||||
|
||||
return OGLERROR_NOERR;
|
||||
}
|
||||
|
||||
|
|
|
@ -1148,7 +1148,7 @@ Render3DError OpenGLRenderer_3_2::BeginRender(const GFX3D &engine)
|
|||
{
|
||||
OGLRenderRef &OGLRef = *this->ref;
|
||||
|
||||
if(!BEGINGL())
|
||||
if (!BEGINGL())
|
||||
{
|
||||
return OGLERROR_BEGINGL_FAILED;
|
||||
}
|
||||
|
@ -1560,6 +1560,11 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h)
|
|||
return OGLERROR_NOERR;
|
||||
}
|
||||
|
||||
if (!BEGINGL())
|
||||
{
|
||||
return OGLERROR_BEGINGL_FAILED;
|
||||
}
|
||||
|
||||
glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor);
|
||||
glBindTexture(GL_TEXTURE_2D, OGLRef.texGDepthStencilID);
|
||||
glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, w, h, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL);
|
||||
|
@ -1615,5 +1620,7 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h)
|
|||
|
||||
free_aligned(oldFramebufferColor);
|
||||
|
||||
ENDGL();
|
||||
|
||||
return OGLERROR_NOERR;
|
||||
}
|
||||
|
|
|
@ -125,23 +125,37 @@ static void memset_u16(void *dst, const u16 val, const size_t length)
|
|||
__m128i *dst_vec128 = (__m128i *)dst;
|
||||
const __m128i val_vec128 = _mm_set1_epi16(val);
|
||||
const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
|
||||
//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
|
||||
|
||||
for (size_t i = 0; i < length_vec128; i++)
|
||||
_mm_stream_si128(dst_vec128 + i, val_vec128);
|
||||
}
|
||||
|
||||
template <size_t LENGTH>
|
||||
static void memset_u16_fast(void *dst, const u16 val)
|
||||
{
|
||||
__m128i *dst_vec128 = (__m128i *)dst;
|
||||
const __m128i val_vec128 = _mm_set1_epi16(val);
|
||||
MACRODO_N(LENGTH / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128));
|
||||
}
|
||||
|
||||
static void memset_u32(void *dst, const u32 val, const size_t length)
|
||||
{
|
||||
__m128i *dst_vec128 = (__m128i *)dst;
|
||||
const __m128i val_vec128 = _mm_set1_epi32(val);
|
||||
const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val));
|
||||
//MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128));
|
||||
|
||||
for (size_t i = 0; i < length_vec128; i++)
|
||||
_mm_stream_si128(dst_vec128 + i, val_vec128);
|
||||
}
|
||||
|
||||
template <size_t LENGTH>
|
||||
static void memset_u32_fast(void *dst, const u32 val)
|
||||
{
|
||||
__m128i *dst_vec128 = (__m128i *)dst;
|
||||
const __m128i val_vec128 = _mm_set1_epi32(val);
|
||||
MACRODO_N(LENGTH / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128));
|
||||
}
|
||||
|
||||
#else //no sse2
|
||||
|
||||
static void memset_u16(void *dst, const u16 val, const size_t length)
|
||||
|
@ -150,7 +164,6 @@ static void memset_u16(void *dst, const u16 val, const size_t length)
|
|||
u64 *dst_u64 = (u64 *)dst;
|
||||
const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
|
||||
const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
|
||||
//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
|
||||
|
||||
for (size_t i = 0; i < length_u64; i++)
|
||||
dst_u64[i] = val_u64;
|
||||
|
@ -160,13 +173,25 @@ static void memset_u16(void *dst, const u16 val, const size_t length)
|
|||
#endif
|
||||
}
|
||||
|
||||
template <size_t LENGTH>
|
||||
static void memset_u16_fast(void *dst, const u16 val)
|
||||
{
|
||||
#ifdef HOST_64
|
||||
u64 *dst_u64 = (u64 *)dst;
|
||||
const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
|
||||
MACRODO_N(LENGTH / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64));
|
||||
#else
|
||||
for (size_t i = 0; i < LENGTH; i++)
|
||||
((u16 *)dst)[i] = val;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void memset_u32(void *dst, const u32 val, const size_t length)
|
||||
{
|
||||
#ifdef HOST_64
|
||||
u64 *dst_u64 = (u64 *)dst;
|
||||
const u64 val_u64 = ((u64)val << 32) | (u64)val;
|
||||
const size_t length_u64 = length / (sizeof(val_u64) / sizeof(val));
|
||||
//MACRODO_N(length_u64, (dst_u64[X] = val_u64));
|
||||
|
||||
for (size_t i = 0; i < length_u64; i++)
|
||||
dst_u64[i] = val_u64;
|
||||
|
@ -176,7 +201,20 @@ static void memset_u32(void *dst, const u32 val, const size_t length)
|
|||
#endif
|
||||
}
|
||||
|
||||
template <size_t LENGTH>
|
||||
static void memset_u32_fast(void *dst, const u32 val)
|
||||
{
|
||||
#ifdef HOST_64
|
||||
u64 *dst_u64 = (u64 *)dst;
|
||||
const u64 val_u64 = ((u64)val << 32) | (u64)val;
|
||||
MACRODO_N(LENGTH / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64));
|
||||
#else
|
||||
for (size_t i = 0; i < LENGTH; i++)
|
||||
((u16 *)dst)[i] = val;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // ENABLE_SSE2
|
||||
|
||||
// NOSSE version always used in gfx3d.cpp
|
||||
void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);
|
||||
|
@ -233,8 +271,6 @@ FORCEINLINE void MatrixMultiply(float * matrix, const float * rightMatrix)
|
|||
_mm_store_ps(matrix+12,row3);
|
||||
}
|
||||
|
||||
|
||||
|
||||
FORCEINLINE void MatrixMultVec4x4(const float *matrix, float *vecPtr)
|
||||
{
|
||||
_mm_store_ps(vecPtr,_util_MatrixMultVec4x4_((SSE_MATRIX)matrix,_mm_load_ps(vecPtr)));
|
||||
|
@ -311,18 +347,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
|
|||
_mm_store_ps(matrix+12,_mm_div_ps(_mm_load_ps(matrix+12),val));
|
||||
}
|
||||
|
||||
//WARNING: I do not think this is as fast as a memset, for some reason.
|
||||
//at least in vc2005 with sse enabled. better figure out why before using it
|
||||
template<int NUM>
|
||||
static FORCEINLINE void memset_u8(void* _dst, u8 val)
|
||||
{
|
||||
memset(_dst,val,NUM);
|
||||
//const u8* dst = (u8*)_dst;
|
||||
//u32 u32val = (val<<24)|(val<<16)|(val<<8)|val;
|
||||
//const __m128i temp = _mm_set_epi32(u32val,u32val,u32val,u32val);
|
||||
//MACRODO_N(NUM/16,_mm_store_si128((__m128i*)(dst+(X)*16), temp));
|
||||
}
|
||||
|
||||
#else //no sse
|
||||
|
||||
void MatrixMultVec4x4 (const float *matrix, float *vecPtr);
|
||||
|
@ -345,12 +369,6 @@ FORCEINLINE void vector_fix2float(float* matrix, const float divisor)
|
|||
matrix[i] /= divisor;
|
||||
}
|
||||
|
||||
template<int NUM>
|
||||
static FORCEINLINE void memset_u8(void* dst, u8 val)
|
||||
{
|
||||
memset(dst,val,NUM);
|
||||
}
|
||||
|
||||
#endif //switched SSE functions
|
||||
|
||||
void MatrixMultVec4x4 (const s32 *matrix, s32 *vecPtr);
|
||||
|
@ -360,5 +378,5 @@ void MatrixMultVec4x4_M2(const s32 *matrix, s32 *vecPtr);
|
|||
void MatrixMultiply(s32* matrix, const s32* rightMatrix);
|
||||
void MatrixScale(s32 *matrix, const s32 *ptr);
|
||||
void MatrixTranslate(s32 *matrix, const s32 *ptr);
|
||||
#endif
|
||||
|
||||
#endif // MATRIX_H
|
||||
|
|
|
@ -577,7 +577,6 @@ public:
|
|||
FragmentColor shaderOutput;
|
||||
bool isOpaquePixel;
|
||||
|
||||
//FragmentColor &dstColor = this->_softRender->GetFramebuffer()[fragmentIndex];
|
||||
u32 &dstAttributeDepth = this->_softRender->_framebufferAttributes->depth[fragmentIndex];
|
||||
u8 &dstAttributeOpaquePolyID = this->_softRender->_framebufferAttributes->opaquePolyID[fragmentIndex];
|
||||
u8 &dstAttributeTranslucentPolyID = this->_softRender->_framebufferAttributes->translucentPolyID[fragmentIndex];
|
||||
|
@ -2063,9 +2062,6 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
|
|||
convertedClearColor.g = GFX3D_5TO6(clearColor.g);
|
||||
convertedClearColor.b = GFX3D_5TO6(clearColor.b);
|
||||
|
||||
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
|
||||
const size_t ssePixCount = pixCount - (pixCount % 16);
|
||||
|
||||
const __m128i color_vec128 = _mm_set1_epi32(convertedClearColor.color);
|
||||
const __m128i attrDepth_vec128 = _mm_set1_epi32(clearAttributes.depth);
|
||||
const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(clearAttributes.opaquePolyID);
|
||||
|
@ -2074,7 +2070,11 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
|
|||
const __m128i attrIsFogged_vec128 = _mm_set1_epi8(clearAttributes.isFogged);
|
||||
const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(clearAttributes.isTranslucentPoly);
|
||||
|
||||
for (size_t i = 0; i < ssePixCount; i += 16)
|
||||
size_t i = 0;
|
||||
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
|
||||
const size_t ssePixCount = pixCount - (pixCount % 16);
|
||||
|
||||
for (; i < ssePixCount; i += 16)
|
||||
{
|
||||
_mm_stream_si128((__m128i *)(this->_framebufferColor + i + 0), color_vec128);
|
||||
_mm_stream_si128((__m128i *)(this->_framebufferColor + i + 4), color_vec128);
|
||||
|
@ -2093,7 +2093,7 @@ Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor
|
|||
_mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
|
||||
}
|
||||
|
||||
for (size_t i = ssePixCount; i < pixCount; i++)
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
this->_framebufferColor[i] = convertedClearColor;
|
||||
this->_framebufferAttributes->SetAtIndex(i, clearAttributes);
|
||||
|
|
|
@ -158,9 +158,9 @@ void FragmentAttributesBuffer::SetAtIndex(const size_t index, const FragmentAttr
|
|||
|
||||
void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
|
||||
{
|
||||
#ifdef ENABLE_SSE2
|
||||
const size_t sseCount = count - (count % 16);
|
||||
size_t i = 0;
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
const __m128i attrDepth_vec128 = _mm_set1_epi32(attr.depth);
|
||||
const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(attr.opaquePolyID);
|
||||
const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(attr.translucentPolyID);
|
||||
|
@ -168,7 +168,8 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
|
|||
const __m128i attrIsFogged_vec128 = _mm_set1_epi8(attr.isFogged);
|
||||
const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(attr.isTranslucentPoly);
|
||||
|
||||
for (size_t i = 0; i < sseCount; i += 16)
|
||||
const size_t sseCount = count - (count % 16);
|
||||
for (; i < sseCount; i += 16)
|
||||
{
|
||||
_mm_stream_si128((__m128i *)(this->depth + 0), attrDepth_vec128);
|
||||
_mm_stream_si128((__m128i *)(this->depth + 4), attrDepth_vec128);
|
||||
|
@ -181,17 +182,12 @@ void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
|
|||
_mm_stream_si128((__m128i *)this->isFogged, attrIsFogged_vec128);
|
||||
_mm_stream_si128((__m128i *)this->isTranslucentPoly, attrIsTranslucentPoly_vec128);
|
||||
}
|
||||
|
||||
for (size_t i = sseCount; i < count; i++)
|
||||
{
|
||||
this->SetAtIndex(i, attr);
|
||||
}
|
||||
#else
|
||||
for (size_t i = 0; i < count; i++)
|
||||
{
|
||||
this->SetAtIndex(i, attr);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; i < count; i++)
|
||||
{
|
||||
this->SetAtIndex(i, attr);
|
||||
}
|
||||
}
|
||||
|
||||
Render3D::Render3D()
|
||||
|
@ -345,26 +341,39 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
const u8 xScroll = scrollBits & 0xFF;
|
||||
const u8 yScroll = (scrollBits >> 8) & 0xFF;
|
||||
|
||||
for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
if (xScroll == 0 && yScroll == 0)
|
||||
{
|
||||
const size_t y = ((iy + yScroll) & 0xFF) << 8;
|
||||
|
||||
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++)
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
|
||||
{
|
||||
const size_t x = (ix + xScroll) & 0xFF;
|
||||
const size_t srcIndex = y | x;
|
||||
this->clearImageColor16Buffer[i] = clearColorBuffer[i];
|
||||
this->clearImageDepthBuffer[i] = dsDepthToD24_LUT[clearDepthBuffer[i] & 0x7FFF];
|
||||
this->clearImageFogBuffer[i] = BIT15(clearDepthBuffer[i]);
|
||||
this->clearImagePolyIDBuffer[i] = clearFragment.opaquePolyID;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
{
|
||||
const size_t y = ((iy + yScroll) & 0xFF) << 8;
|
||||
|
||||
//this is tested by harry potter and the order of the phoenix.
|
||||
//TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles)
|
||||
//(or use a special zero fill in the bulk clearing above)
|
||||
this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex];
|
||||
|
||||
//this is tested quite well in the sonic chronicles main map mode
|
||||
//where depth values are used for trees etc you can walk behind
|
||||
this->clearImageDepthBuffer[dstIndex] = dsDepthToD24_LUT[clearDepthBuffer[srcIndex] & 0x7FFF];
|
||||
|
||||
this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]);
|
||||
this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID;
|
||||
for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++)
|
||||
{
|
||||
const size_t x = (ix + xScroll) & 0xFF;
|
||||
const size_t srcIndex = y | x;
|
||||
|
||||
//this is tested by harry potter and the order of the phoenix.
|
||||
//TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles)
|
||||
//(or use a special zero fill in the bulk clearing above)
|
||||
this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex];
|
||||
|
||||
//this is tested quite well in the sonic chronicles main map mode
|
||||
//where depth values are used for trees etc you can walk behind
|
||||
this->clearImageDepthBuffer[dstIndex] = dsDepthToD24_LUT[clearDepthBuffer[srcIndex] & 0x7FFF];
|
||||
|
||||
this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]);
|
||||
this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -470,11 +479,13 @@ Render3DError Render3D::VramReconfigureSignal()
|
|||
|
||||
Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
|
||||
{
|
||||
static const __m128i zeroColor = _mm_set1_epi32(0);
|
||||
const __m128i zero_vec128 = _mm_setzero_si128();
|
||||
|
||||
size_t i = 0;
|
||||
const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
|
||||
const size_t ssePixCount = pixCount - (pixCount % 4);
|
||||
|
||||
for (size_t i = 0; i < ssePixCount; i += 4)
|
||||
for (; i < ssePixCount; i += 4)
|
||||
{
|
||||
// Copy the framebufferColor buffer
|
||||
__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
|
||||
|
@ -482,16 +493,16 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6
|
|||
|
||||
// Convert to RGBA5551
|
||||
__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R
|
||||
r = _mm_srli_epi32(r, 1); // Shift to R
|
||||
r = _mm_srli_si128(r, 1); // Shift to R
|
||||
|
||||
__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G
|
||||
g = _mm_srli_epi32(g, 4); // Shift in G
|
||||
g = _mm_srli_si128(g, 4); // Shift in G
|
||||
|
||||
__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B
|
||||
b = _mm_srli_epi32(b, 7); // Shift to B
|
||||
b = _mm_srli_si128(b, 7); // Shift to B
|
||||
|
||||
__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
|
||||
a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A
|
||||
a = _mm_cmpgt_epi32(a, zero_vec128); // Determine A
|
||||
|
||||
// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
|
||||
// 16-bit. Since SSE2 only has packssdw (signed 16-bit pack), then the alpha bit
|
||||
|
@ -504,21 +515,18 @@ Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6
|
|||
// alpha vector with the post-packed color vector to get the final color.
|
||||
|
||||
a = _mm_and_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A
|
||||
a = _mm_packs_epi32(a, zeroColor); // Pack 32-bit down to 16-bit
|
||||
a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be
|
||||
a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit
|
||||
a = _mm_slli_si128(a, 1); // Shift the A bit back to where it needs to be
|
||||
|
||||
// Assemble the RGB colors
|
||||
color = _mm_or_si128(r, g);
|
||||
color = _mm_or_si128(color, b);
|
||||
|
||||
// Pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
|
||||
color = _mm_packs_epi32(color, zeroColor);
|
||||
color = _mm_or_si128(color, a);
|
||||
// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
|
||||
color = r | g | b;
|
||||
color = _mm_packs_epi32(color, zero_vec128);
|
||||
color |= a;
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
|
||||
}
|
||||
|
||||
for (size_t i = ssePixCount; i < pixCount; i++)
|
||||
for (; i < pixCount; i++)
|
||||
{
|
||||
dstRGBA6665[i] = this->_framebufferColor[i];
|
||||
dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
|
||||
|
@ -560,59 +568,49 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
|
||||
if (xScroll == 0 && yScroll == 0)
|
||||
{
|
||||
const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
|
||||
const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
|
||||
const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
|
||||
|
||||
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i += 16)
|
||||
{
|
||||
static const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
|
||||
static const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
|
||||
|
||||
// Copy the colors to the color buffer. Since we can only copy 8 elements at once,
|
||||
// we need to load-store twice.
|
||||
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i + 8), _mm_load_si128((__m128i *)(clearColorBuffer + i + 8)) );
|
||||
_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i), _mm_load_si128((__m128i *)(clearColorBuffer + i)) );
|
||||
|
||||
// Write the depth values to the depth buffer.
|
||||
__m128i clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8));
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
|
||||
__m128i clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8));
|
||||
__m128i clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
|
||||
clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, depthBitMask_vec128);
|
||||
clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, depthBitMask_vec128);
|
||||
|
||||
__m128i depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 12), depthValue_vec128);
|
||||
|
||||
depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 8), depthValue_vec128);
|
||||
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, depthBitMask_vec128);
|
||||
|
||||
depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 7)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 6)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 5)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 4)]);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i + 4), depthValue_vec128);
|
||||
|
||||
depthValue_vec128 = _mm_set_epi32(dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 3)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 2)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 1)],
|
||||
dsDepthToD24_LUT[_mm_extract_epi16(clearDepth_vec128, 0)]);
|
||||
_mm_store_si128((__m128i *)(this->clearImageDepthBuffer + i), depthValue_vec128);
|
||||
this->clearImageDepthBuffer[i+15] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 7)];
|
||||
this->clearImageDepthBuffer[i+14] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 6)];
|
||||
this->clearImageDepthBuffer[i+13] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 5)];
|
||||
this->clearImageDepthBuffer[i+12] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 4)];
|
||||
this->clearImageDepthBuffer[i+11] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 3)];
|
||||
this->clearImageDepthBuffer[i+10] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 2)];
|
||||
this->clearImageDepthBuffer[i+ 9] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 1)];
|
||||
this->clearImageDepthBuffer[i+ 8] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 0)];
|
||||
this->clearImageDepthBuffer[i+ 7] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 7)];
|
||||
this->clearImageDepthBuffer[i+ 6] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 6)];
|
||||
this->clearImageDepthBuffer[i+ 5] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 5)];
|
||||
this->clearImageDepthBuffer[i+ 4] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 4)];
|
||||
this->clearImageDepthBuffer[i+ 3] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 3)];
|
||||
this->clearImageDepthBuffer[i+ 2] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 2)];
|
||||
this->clearImageDepthBuffer[i+ 1] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 1)];
|
||||
this->clearImageDepthBuffer[i+ 0] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 0)];
|
||||
|
||||
// Write the fog flags to the fog flag buffer.
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); // Read the upper values
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
|
||||
const __m128i clearDepthFogBit_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // Save the upper bits in another register
|
||||
clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8));
|
||||
clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i));
|
||||
clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, fogBufferBitMask_vec128);
|
||||
clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, fogBufferBitMask_vec128);
|
||||
clearDepthHi_vec128 = _mm_srli_si128(clearDepthHi_vec128, 15);
|
||||
clearDepthLo_vec128 = _mm_srli_si128(clearDepthLo_vec128, 15);
|
||||
|
||||
clearDepth_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); // Read the lower values
|
||||
clearDepth_vec128 = _mm_and_si128(clearDepth_vec128, fogBufferBitMask_vec128);
|
||||
clearDepth_vec128 = _mm_srli_epi16(clearDepth_vec128, 15); // These are the lower bits
|
||||
|
||||
_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepth_vec128, clearDepthFogBit_vec128));
|
||||
_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepthLo_vec128, clearDepthHi_vec128));
|
||||
|
||||
// This one is easy. Just set the values in the polygon ID buffer.
|
||||
_mm_store_si128((__m128i *)(this->clearImagePolyIDBuffer + i), opaquePolyID_vec128);
|
||||
|
@ -620,8 +618,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
|
|||
}
|
||||
else
|
||||
{
|
||||
static const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
static const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
|
||||
const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
|
||||
const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
|
||||
|
||||
for (size_t dstIndex = 0, iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
|
||||
|
|
Loading…
Reference in New Issue