GPU: Try to clean up GPUEngineBase::_SpriteRenderPerform() a little bit.

This commit is contained in:
rogerman 2017-08-19 20:21:09 -07:00
parent a9f2e53c25
commit f02210bfdb
2 changed files with 205 additions and 299 deletions

View File

@ -3677,38 +3677,72 @@ void GPUEngineBase::_LineExtRot(GPUEngineCompositorInfo &compInfo, bool &outUseC
// SPRITE RENDERING -HELPER FUNCTIONS- // SPRITE RENDERING -HELPER FUNCTIONS-
/*****************************************************************************/ /*****************************************************************************/
// Writes (or skips) a single sprite pixel at column frameX of the current sprite line.
// Shared per-pixel helper for the sprite renderers.
//
// Template parameters:
//   ISDEBUGRENDER   - when true, only the color is written to dst[frameX]; the priority
//                     test and the dst_alpha/typeTab/prioTab/_sprNum updates are bypassed.
//   ISOBJMODEBITMAP - when true, *srcPalette is taken as the pixel color itself and
//                     palIndex is NOT used as an index (it is stored into dst_alpha);
//                     when false, the color is srcPalette[palIndex].
//
// Parameters:
//   frameX     - index into dst, dst_alpha, typeTab, prioTab, _sprWin and _sprNum.
//   srcPalette - bitmap mode: pointer to the single source color;
//                palette mode: base of the palette that palIndex selects from.
//   palIndex   - palette mode: color index, where 0 means "no pixel";
//                bitmap mode: value copied into dst_alpha[frameX].
//   objMode    - OBJ mode of the sprite; OBJMode_Window is special-cased below, otherwise
//                it is what gets recorded in typeTab for palette-mode pixels.
//   prio       - sprite priority; the pixel is written only if prio < prioTab[frameX].
//   spriteNum  - value recorded in this->_sprNum[frameX] when the pixel is written.
//   dst, dst_alpha, typeTab, prioTab - per-column output buffers updated on a write.
template <bool ISDEBUGRENDER, bool ISOBJMODEBITMAP>
FORCEINLINE void GPUEngineBase::_RenderSpriteUpdatePixel(size_t frameX,
const u16 *__restrict srcPalette, const u8 palIndex, const OBJMode objMode, const u8 prio, const u8 spriteNum,
u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
{
// Transparent pixel: nothing is written. In bitmap mode this is decided by bit 15 (0x8000)
// of the source color being clear; in palette mode by a palette index of 0.
if ( (ISOBJMODEBITMAP && ((*srcPalette & 0x8000) == 0)) || (!ISOBJMODEBITMAP && (palIndex == 0)) )
{
return;
}
// Debug render: store the color unconditionally and stop. Because this check comes before
// the OBJ window check, a debug render of a window-mode sprite writes its color to dst and
// never touches _sprWin.
if (ISDEBUGRENDER)
{
// Bitmap colors are used as-is (no LE_TO_LOCAL_16 is applied here); palette entries go
// through LE_TO_LOCAL_16 -- presumably a little-endian-to-host conversion, TODO confirm.
dst[frameX] = (ISOBJMODEBITMAP) ? *srcPalette : LE_TO_LOCAL_16(srcPalette[palIndex]);
return;
}
// OBJ window sprites (palette mode only) just flag the column in _sprWin; no color,
// alpha, type, priority or sprite number is recorded for them.
if ( !ISOBJMODEBITMAP && (objMode == OBJMode_Window) )
{
this->_sprWin[frameX] = 1;
return;
}
// Normal render: strictly-less comparison, so a pixel already recorded at this column with
// an equal prioTab value is kept and this one is dropped.
if (prio < prioTab[frameX])
{
dst[frameX] = (ISOBJMODEBITMAP) ? *srcPalette : LE_TO_LOCAL_16(srcPalette[palIndex]);
// Bitmap pixels record the caller-supplied palIndex value as alpha; palette pixels record 0xFF.
dst_alpha[frameX] = (ISOBJMODEBITMAP) ? palIndex : 0xFF;
// Bitmap pixels are always tagged OBJMode_Bitmap regardless of the objMode argument.
typeTab[frameX] = (ISOBJMODEBITMAP) ? OBJMode_Bitmap : objMode;
prioTab[frameX] = prio;
this->_sprNum[frameX] = spriteNum;
}
}
/* if i understand it correct, and it fixes some sprite problems in chameleon shot */ /* if i understand it correct, and it fixes some sprite problems in chameleon shot */
/* we have a 15 bit color, and should use the pal entry bits as alpha ?*/ /* we have a 15 bit color, and should use the pal entry bits as alpha ?*/
/* http://nocash.emubase.de/gbatek.htm#dsvideoobjs */ /* http://nocash.emubase.de/gbatek.htm#dsvideoobjs */
template <bool ISDEBUGRENDER> template <bool ISDEBUGRENDER>
void GPUEngineBase::_RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha) void GPUEngineBase::_RenderSpriteBMP(const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep,
const u8 spriteAlpha, const OBJMode objMode, const u8 prio, const u8 spriteNum,
u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
{ {
const u16 *__restrict bmpBuffer = (u16 *)MMU_gpu_map(srcadr); const u16 *__restrict vramBuffer = (u16 *)MMU_gpu_map(objAddress);
size_t i = 0; size_t i = 0;
#ifdef ENABLE_SSE2 #ifdef ENABLE_SSE2
if (xdir == 1) if (readXStep == 1)
{ {
if (ISDEBUGRENDER) if (ISDEBUGRENDER)
{ {
const size_t ssePixCount = lg - (lg % 8); const size_t ssePixCount = length - (length % 8);
for (; i < ssePixCount; i += 8, x += 8, sprX += 8) for (; i < ssePixCount; i += 8, spriteX += 8, frameX += 8)
{ {
const __m128i color_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x)); const __m128i color_vec128 = _mm_loadu_si128((__m128i *)(vramBuffer + spriteX));
const __m128i alphaCompare = _mm_cmpeq_epi16( _mm_srli_epi16(color_vec128, 15), _mm_set1_epi16(0x0001) ); const __m128i alphaCompare = _mm_cmpeq_epi16( _mm_srli_epi16(color_vec128, 15), _mm_set1_epi16(0x0001) );
_mm_storeu_si128( (__m128i *)(dst + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX)), color_vec128, alphaCompare) ); _mm_storeu_si128( (__m128i *)(dst + frameX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + frameX)), color_vec128, alphaCompare) );
} }
} }
else else
{ {
const __m128i prio_vec128 = _mm_set1_epi8(prio); const __m128i prio_vec128 = _mm_set1_epi8(prio);
const size_t ssePixCount = lg - (lg % 16); const size_t ssePixCount = length - (length % 16);
for (; i < ssePixCount; i += 16, x += 16, sprX += 16) for (; i < ssePixCount; i += 16, spriteX += 16, frameX += 16)
{ {
const __m128i prioTab_vec128 = _mm_loadu_si128((__m128i *)(prioTab + sprX)); const __m128i prioTab_vec128 = _mm_loadu_si128((__m128i *)(prioTab + frameX));
const __m128i colorLo_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x)); const __m128i colorLo_vec128 = _mm_loadu_si128((__m128i *)(vramBuffer + spriteX));
const __m128i colorHi_vec128 = _mm_loadu_si128((__m128i *)(bmpBuffer + x + 8)); const __m128i colorHi_vec128 = _mm_loadu_si128((__m128i *)(vramBuffer + spriteX + 8));
const __m128i prioCompare = _mm_cmplt_epi8(prio_vec128, prioTab_vec128); const __m128i prioCompare = _mm_cmplt_epi8(prio_vec128, prioTab_vec128);
const __m128i alphaCompare = _mm_cmpeq_epi8( _mm_packs_epi16(_mm_srli_epi16(colorLo_vec128, 15), _mm_srli_epi16(colorHi_vec128, 15)), _mm_set1_epi8(0x01) ); const __m128i alphaCompare = _mm_cmpeq_epi8( _mm_packs_epi16(_mm_srli_epi16(colorLo_vec128, 15), _mm_srli_epi16(colorHi_vec128, 15)), _mm_set1_epi8(0x01) );
@ -3720,106 +3754,53 @@ void GPUEngineBase::_RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u8
// Just in case you're wondering why we're not using maskmovdqu, but instead using movdqu+pblendvb+movdqu, it's because // Just in case you're wondering why we're not using maskmovdqu, but instead using movdqu+pblendvb+movdqu, it's because
// maskmovdqu won't keep the data in cache, and we really need the data in cache since we're about to render the sprite // maskmovdqu won't keep the data in cache, and we really need the data in cache since we're about to render the sprite
// to the framebuffer. In addition, the maskmovdqu instruction can be brutally slow on many non-Intel CPUs. // to the framebuffer. In addition, the maskmovdqu instruction can be brutally slow on many non-Intel CPUs.
_mm_storeu_si128( (__m128i *)(dst + sprX + 0), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX + 0)), colorLo_vec128, combinedLoCompare) ); _mm_storeu_si128( (__m128i *)(dst + frameX + 0), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + frameX + 0)), colorLo_vec128, combinedLoCompare) );
_mm_storeu_si128( (__m128i *)(dst + sprX + 8), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + sprX + 8)), colorHi_vec128, combinedHiCompare) ); _mm_storeu_si128( (__m128i *)(dst + frameX + 8), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst + frameX + 8)), colorHi_vec128, combinedHiCompare) );
_mm_storeu_si128( (__m128i *)(dst_alpha + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst_alpha + sprX)), _mm_set1_epi8(alpha + 1), combinedPackedCompare) ); _mm_storeu_si128( (__m128i *)(dst_alpha + frameX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(dst_alpha + frameX)), _mm_set1_epi8(spriteAlpha + 1), combinedPackedCompare) );
_mm_storeu_si128( (__m128i *)(typeTab + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(typeTab + sprX)), _mm_set1_epi8(OBJMode_Bitmap), combinedPackedCompare) ); _mm_storeu_si128( (__m128i *)(typeTab + frameX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(typeTab + frameX)), _mm_set1_epi8(OBJMode_Bitmap), combinedPackedCompare) );
_mm_storeu_si128( (__m128i *)(prioTab + sprX), _mm_blendv_epi8(prioTab_vec128, prio_vec128, combinedPackedCompare) ); _mm_storeu_si128( (__m128i *)(prioTab + frameX), _mm_blendv_epi8(prioTab_vec128, prio_vec128, combinedPackedCompare) );
_mm_storeu_si128( (__m128i *)(this->_sprNum + sprX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(this->_sprNum + sprX)), _mm_set1_epi8(spriteNum), combinedPackedCompare) ); _mm_storeu_si128( (__m128i *)(this->_sprNum + frameX), _mm_blendv_epi8(_mm_loadu_si128((__m128i *)(this->_sprNum + frameX)), _mm_set1_epi8(spriteNum), combinedPackedCompare) );
} }
} }
} }
#endif #endif
for (; i < lg; i++, sprX++, x += xdir) for (; i < length; i++, frameX++, spriteX+=readXStep)
{ {
const u16 color = LE_TO_LOCAL_16(bmpBuffer[x]); const u16 vramColor = LE_TO_LOCAL_16(vramBuffer[spriteX]);
this->_RenderSpriteUpdatePixel<ISDEBUGRENDER, true>(frameX, &vramColor, spriteAlpha+1, OBJMode_Bitmap, prio, spriteNum, dst, dst_alpha, typeTab, prioTab);
//a cleared alpha bit suppresses the pixel from processing entirely; it doesnt exist
if (ISDEBUGRENDER)
{
if (color & 0x8000)
{
dst[sprX] = color;
}
}
else
{
if ((color & 0x8000) && (prio < prioTab[sprX]))
{
dst[sprX] = color;
dst_alpha[sprX] = alpha+1;
typeTab[sprX] = OBJMode_Bitmap;
prioTab[sprX] = prio;
this->_sprNum[sprX] = spriteNum;
}
}
} }
} }
template<bool ISDEBUGRENDER, bool ISWINDOW> template<bool ISDEBUGRENDER>
void GPUEngineBase::_RenderSprite256(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha) void GPUEngineBase::_RenderSprite256(const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep,
const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum,
u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
{ {
for (size_t i = 0; i < lg; i++, ++sprX, x += xdir) for (size_t i = 0; i < length; i++, frameX++, spriteX+=readXStep)
{ {
const u32 adr = srcadr + (u32)( (x & 0x7) + ((x & 0xFFF8) << 3) ); const u32 palIndexAddress = objAddress + (u32)( (spriteX & 0x0007) + ((spriteX & 0xFFF8) << 3) );
const u8 *__restrict src = (u8 *)MMU_gpu_map(adr); const u8 *__restrict palIndexBuffer = (u8 *)MMU_gpu_map(palIndexAddress);
const u8 palette_entry = *src; const u8 idx8 = *palIndexBuffer;
//a zero value suppresses the pixel from processing entirely; it doesnt exist this->_RenderSpriteUpdatePixel<ISDEBUGRENDER, false>(frameX, palColorBuffer, idx8, objMode, prio, spriteNum, dst, dst_alpha, typeTab, prioTab);
if (palette_entry > 0)
{
if (ISWINDOW)
{
this->_sprWin[sprX] = 1;
}
else if (ISDEBUGRENDER)
{
dst[sprX] = LE_TO_LOCAL_16(pal[palette_entry]);
}
else if (prio < prioTab[sprX])
{
dst[sprX] = LE_TO_LOCAL_16(pal[palette_entry]);
dst_alpha[sprX] = 0xFF;
typeTab[sprX] = (alpha ? OBJMode_Transparent : OBJMode_Normal);
prioTab[sprX] = prio;
this->_sprNum[sprX] = spriteNum;
}
}
} }
} }
template<bool ISDEBUGRENDER, bool ISWINDOW> template<bool ISDEBUGRENDER>
void GPUEngineBase::_RenderSprite16(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha) void GPUEngineBase::_RenderSprite16(const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep,
const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum,
u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab)
{ {
for (size_t i = 0; i < lg; i++, ++sprX, x += xdir) for (size_t i = 0; i < length; i++, frameX++, spriteX+=readXStep)
{ {
const u16 x1 = x >> 1; const u32 spriteX_word = spriteX >> 1;
const u32 adr = srcadr + (x1 & 0x3) + ((x1 & 0xFFFC) << 3); const u32 palIndexAddress = objAddress + (spriteX_word & 0x0003) + ((spriteX_word & 0xFFFC) << 3);
const u8 *__restrict src = (u8 *)MMU_gpu_map(adr); const u8 *__restrict palIndexBuffer = (u8 *)MMU_gpu_map(palIndexAddress);
const u8 palette = *src; const u8 palIndex = *palIndexBuffer;
const u8 palette_entry = (x & 1) ? palette >> 4 : palette & 0xF; const u8 idx4 = (spriteX & 1) ? palIndex >> 4 : palIndex & 0x0F;
//a zero value suppresses the pixel from processing entirely; it doesnt exist this->_RenderSpriteUpdatePixel<ISDEBUGRENDER, false>(frameX, palColorBuffer, idx4, objMode, prio, spriteNum, dst, dst_alpha, typeTab, prioTab);
if (palette_entry > 0)
{
if (ISWINDOW)
{
this->_sprWin[sprX] = 1;
}
else if (ISDEBUGRENDER)
{
dst[sprX] = LE_TO_LOCAL_16(pal[palette_entry]);
}
else if (prio < prioTab[sprX])
{
dst[sprX] = LE_TO_LOCAL_16(pal[palette_entry]);
dst_alpha[sprX] = 0xFF;
typeTab[sprX] = (alpha ? OBJMode_Transparent : OBJMode_Normal);
prioTab[sprX] = prio;
this->_sprNum[sprX] = spriteNum;
}
}
} }
} }
@ -3959,9 +3940,9 @@ void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16
const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT; const IOREG_DISPCNT &DISPCNT = this->_IORegisterMap->DISPCNT;
size_t cost = 0; size_t cost = 0;
for (size_t i = 0; i < 128; i++) for (size_t spriteNum = 0; spriteNum < 128; spriteNum++)
{ {
OAMAttributes spriteInfo = this->_oamList[i]; OAMAttributes spriteInfo = this->_oamList[spriteNum];
//for each sprite: //for each sprite:
if (cost >= 2130) if (cost >= 2130)
@ -3985,50 +3966,46 @@ void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16
const OBJMode objMode = (OBJMode)spriteInfo.Mode; const OBJMode objMode = (OBJMode)spriteInfo.Mode;
SpriteSize sprSize; SpriteSize sprSize;
s32 sprX; s32 frameX;
s32 sprY; s32 frameY;
s32 x; s32 spriteX;
s32 y; s32 spriteY;
s32 lg; s32 length;
s32 xdir; s32 readXStep;
u8 prio = spriteInfo.Priority; u8 prio = spriteInfo.Priority;
u16 *__restrict pal;
u8 *__restrict src;
u32 srcadr;
if (spriteInfo.RotScale != 0) if (spriteInfo.RotScale != 0)
{ {
s32 fieldX, fieldY, auxX, auxY, realX, realY, offset; s32 fieldX, fieldY, auxX, auxY, realX, realY;
u8 blockparameter; u8 blockparameter;
s16 dx, dmx, dy, dmy; s16 dx, dmx, dy, dmy;
u16 colour;
// Get sprite positions and size // Get sprite positions and size
sprX = spriteInfo.X; frameX = spriteInfo.X;
sprY = spriteInfo.Y; frameY = spriteInfo.Y;
sprSize = GPUEngineBase::_sprSizeTab[spriteInfo.Size][spriteInfo.Shape]; sprSize = GPUEngineBase::_sprSizeTab[spriteInfo.Size][spriteInfo.Shape];
// Copy sprite size, to check change it if needed // Copy sprite size, to check change it if needed
fieldX = sprSize.width; fieldX = sprSize.width;
fieldY = sprSize.height; fieldY = sprSize.height;
lg = sprSize.width; length = sprSize.width;
// If we are using double size mode, double our control vars // If we are using double size mode, double our control vars
if (spriteInfo.DoubleSize != 0) if (spriteInfo.DoubleSize != 0)
{ {
fieldX <<= 1; fieldX <<= 1;
fieldY <<= 1; fieldY <<= 1;
lg <<= 1; length <<= 1;
} }
//check if the sprite is visible y-wise. unfortunately our logic for x and y is different due to our scanline based rendering //check if the sprite is visible y-wise. unfortunately our logic for x and y is different due to our scanline based rendering
//tested thoroughly by many large sprites in Super Robot Wars K which wrap around the screen //tested thoroughly by many large sprites in Super Robot Wars K which wrap around the screen
y = (compInfo.line.indexNative - sprY) & 0xFF; spriteY = (compInfo.line.indexNative - frameY) & 0xFF;
if (y >= fieldY) if (spriteY >= fieldY)
continue; continue;
//check if sprite is visible x-wise. //check if sprite is visible x-wise.
if ((sprX == GPU_FRAMEBUFFER_NATIVE_WIDTH) || (sprX + fieldX <= 0)) if ((frameX == GPU_FRAMEBUFFER_NATIVE_WIDTH) || (frameX + fieldX <= 0))
continue; continue;
cost += (sprSize.width * 2) + 10; cost += (sprSize.width * 2) + 10;
@ -4043,86 +4020,36 @@ void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16
dmy = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+3].attr3); dmy = LE_TO_LOCAL_16((s16)this->_oamList[blockparameter+3].attr3);
// Calculate fixed point 8.8 start offsets // Calculate fixed point 8.8 start offsets
realX = (sprSize.width << 7) - (fieldX >> 1)*dx - (fieldY >> 1)*dmx + y*dmx; realX = (sprSize.width << 7) - (fieldX >> 1)*dx - (fieldY >> 1)*dmx + spriteY*dmx;
realY = (sprSize.height << 7) - (fieldX >> 1)*dy - (fieldY >> 1)*dmy + y*dmy; realY = (sprSize.height << 7) - (fieldX >> 1)*dy - (fieldY >> 1)*dmy + spriteY*dmy;
if (sprX < 0) if (frameX < 0)
{ {
// If sprite is not in the window // If sprite is not in the window
if (sprX + fieldX <= 0) if (frameX + fieldX <= 0)
continue; continue;
// Otherwise, is partially visible // Otherwise, is partially visible
lg += sprX; length += frameX;
realX -= sprX*dx; realX -= frameX*dx;
realY -= sprX*dy; realY -= frameX*dy;
sprX = 0; frameX = 0;
} }
else else
{ {
if (sprX + fieldX > GPU_FRAMEBUFFER_NATIVE_WIDTH) if (frameX + fieldX > GPU_FRAMEBUFFER_NATIVE_WIDTH)
lg = GPU_FRAMEBUFFER_NATIVE_WIDTH - sprX; length = GPU_FRAMEBUFFER_NATIVE_WIDTH - frameX;
} }
// If we are using 1 palette of 256 colours if (objMode == OBJMode_Bitmap) // Rotozoomed direct color
if (spriteInfo.PaletteMode == PaletteMode_1x256)
{
src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBoundary));
// If extended palettes are set, use them
pal = (DISPCNT.ExOBJPalette_Enable) ? (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*ADDRESS_STEP_512B)) : this->_paletteOBJ;
for (size_t j = 0; j < lg; ++j, ++sprX)
{
// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
auxX = (realX >> 8);
auxY = (realY >> 8);
if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
{
if (MODE == SpriteRenderMode_Sprite2D)
offset = (auxX&0x7) + ((auxX&0xFFF8)<<3) + ((auxY>>3)<<10) + ((auxY&0x7)*8);
else
offset = (auxX&0x7) + ((auxX&0xFFF8)<<3) + ((auxY>>3)*sprSize.width*8) + ((auxY&0x7)*8);
colour = src[offset];
if (colour > 0)
{
if (objMode == OBJMode_Window)
{
this->_sprWin[sprX] = 1;
}
else if (ISDEBUGRENDER)
{
dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
}
else if (prio < prioTab[sprX])
{
dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
dst_alpha[sprX] = 0xFF;
typeTab[sprX] = objMode;
prioTab[sprX] = prio;
this->_sprNum[sprX] = i;
}
}
}
// Add the rotation/scale coefficients, here the rotation/scaling is performed
realX += dx;
realY += dy;
}
}
// Rotozoomed direct color
else if (objMode == OBJMode_Bitmap)
{ {
//transparent (i think, dont bother to render?) if alpha is 0 //transparent (i think, dont bother to render?) if alpha is 0
if (spriteInfo.PaletteIndex == 0) if (spriteInfo.PaletteIndex == 0)
continue; continue;
srcadr = this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, 0); const u32 objAddress = this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, 0);
for (size_t j = 0; j < lg; ++j, ++sprX) for (size_t j = 0; j < length; ++j, ++frameX)
{ {
// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data // Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
auxX = realX >> 8; auxX = realX >> 8;
@ -4133,35 +4060,25 @@ void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16
if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height) if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
{ {
size_t objOffset = 0;
if (DISPCNT.OBJ_BMP_2D_dim) if (DISPCNT.OBJ_BMP_2D_dim)
//tested by knights in the nightmare
offset = ((this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, auxY) - srcadr) / 2) + auxX;
else //tested by lego indiana jones (somehow?)
//tested by buffy sacrifice damage blood splatters in corner
offset = auxX + (auxY * sprSize.width);
const u32 finalAddr = srcadr + (offset << 1);
u16 *mem = (u16 *)MMU_gpu_map(finalAddr);
colour = LE_TO_LOCAL_16(*mem);
if (ISDEBUGRENDER)
{ {
if (colour & 0x8000) //tested by knights in the nightmare
{ objOffset = ((this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, auxY) - objAddress) / 2) + auxX;
dst[sprX] = colour;
}
} }
else else
{ {
if ((colour & 0x8000) && (prio < prioTab[sprX])) //tested by lego indiana jones (somehow?)
{ //tested by buffy sacrifice damage blood splatters in corner
dst[sprX] = colour; objOffset = (auxY * sprSize.width) + auxX;
dst_alpha[sprX] = spriteInfo.PaletteIndex;
typeTab[sprX] = objMode;
prioTab[sprX] = prio;
this->_sprNum[sprX] = i;
}
} }
const u32 vramAddress = objAddress + (objOffset << 1);
const u16 *vramBuffer = (u16 *)MMU_gpu_map(vramAddress);
const u16 vramColor = LE_TO_LOCAL_16(*vramBuffer);
this->_RenderSpriteUpdatePixel<ISDEBUGRENDER, true>(frameX, &vramColor, spriteInfo.PaletteIndex, OBJMode_Bitmap, prio, spriteNum, dst, dst_alpha, typeTab, prioTab);
} }
// Add the rotation/scale coefficients, here the rotation/scaling is performed // Add the rotation/scale coefficients, here the rotation/scaling is performed
@ -4169,21 +4086,39 @@ void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16
realY += dy; realY += dy;
} }
} }
// Rotozoomed 16/16 palette else if (spriteInfo.PaletteMode == PaletteMode_1x256) // If we are using 1 palette of 256 colours
else
{ {
if (MODE == SpriteRenderMode_Sprite2D) const u32 objAddress = this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBoundary);
{ const u8 *__restrict palIndexBuffer = (u8 *)MMU_gpu_map(objAddress);
src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << 5)); const u16 *__restrict palColorBuffer = (DISPCNT.ExOBJPalette_Enable) ? (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*ADDRESS_STEP_512B)) : this->_paletteOBJ;
}
else
{
src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBoundary));
}
pal = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4); for (size_t j = 0; j < length; ++j, ++frameX)
{
// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
auxX = (realX >> 8);
auxY = (realY >> 8);
for (size_t j = 0; j < lg; ++j, ++sprX) if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
{
const size_t palOffset = (MODE == SpriteRenderMode_Sprite2D) ? (auxX&0x7) + ((auxX&0xFFF8)<<3) + ((auxY>>3)<<10) + ((auxY&0x7)*8) :
(auxX&0x7) + ((auxX&0xFFF8)<<3) + ((auxY>>3)*sprSize.width*8) + ((auxY&0x7)*8);
const u8 idx8 = palIndexBuffer[palOffset];
this->_RenderSpriteUpdatePixel<ISDEBUGRENDER, false>(frameX, palColorBuffer, idx8, objMode, prio, spriteNum, dst, dst_alpha, typeTab, prioTab);
}
// Add the rotation/scale coefficients, here the rotation/scaling is performed
realX += dx;
realY += dy;
}
}
else // Rotozoomed 16/16 palette
{
const u32 objAddress = (MODE == SpriteRenderMode_Sprite2D) ? this->_sprMem + (spriteInfo.TileIndex << 5) : this->_sprMem + (spriteInfo.TileIndex << compInfo.renderState.spriteBoundary);
const u8 *__restrict palIndexBuffer = (u8 *)MMU_gpu_map(objAddress);
const u16 *__restrict palColorBuffer = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4);
for (size_t j = 0; j < length; ++j, ++frameX)
{ {
// Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data // Get the integer part of the fixed point 8.8, and check if it lies inside the sprite data
auxX = realX >> 8; auxX = realX >> 8;
@ -4191,36 +4126,12 @@ void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16
if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height) if (auxX >= 0 && auxY >= 0 && auxX < sprSize.width && auxY < sprSize.height)
{ {
if (MODE == SpriteRenderMode_Sprite2D) const size_t palOffset = (MODE == SpriteRenderMode_Sprite2D) ? ((auxX>>1)&0x3) + (((auxX>>1)&0xFFFC)<<3) + ((auxY>>3)<<10) + ((auxY&0x7)*4) :
offset = ((auxX>>1)&0x3) + (((auxX>>1)&0xFFFC)<<3) + ((auxY>>3)<<10) + ((auxY&0x7)*4); ((auxX>>1)&0x3) + (((auxX>>1)&0xFFFC)<<3) + ((auxY>>3)*sprSize.width*4) + ((auxY&0x7)*4);
else const u8 palIndex = palIndexBuffer[palOffset];
offset = ((auxX>>1)&0x3) + (((auxX>>1)&0xFFFC)<<3) + ((auxY>>3)*sprSize.width)*4 + ((auxY&0x7)*4); const u8 idx4 = (auxX & 1) ? palIndex >> 4 : palIndex & 0x0F;
colour = src[offset]; this->_RenderSpriteUpdatePixel<ISDEBUGRENDER, false>(frameX, palColorBuffer, idx4, objMode, prio, spriteNum, dst, dst_alpha, typeTab, prioTab);
// Get 4bits value from the readed 8bits
if (auxX&1) colour >>= 4;
else colour &= 0xF;
if (colour > 0)
{
if (objMode == OBJMode_Window)
{
this->_sprWin[sprX] = 1;
}
else if (ISDEBUGRENDER)
{
dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
}
else if (prio < prioTab[sprX])
{
dst[sprX] = LE_TO_LOCAL_16(pal[colour]);
dst_alpha[sprX] = 0xFF;
typeTab[sprX] = objMode;
prioTab[sprX] = prio;
this->_sprNum[sprX] = i;
}
}
} }
// Add the rotation/scale coeficients, here the rotation/scaling is performed // Add the rotation/scale coeficients, here the rotation/scaling is performed
@ -4231,7 +4142,7 @@ void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16
} }
else //NOT rotozoomed else //NOT rotozoomed
{ {
if (!this->_ComputeSpriteVars(compInfo, spriteInfo, sprSize, sprX, sprY, x, y, lg, xdir)) if (!this->_ComputeSpriteVars(compInfo, spriteInfo, sprSize, frameX, frameY, spriteX, spriteY, length, readXStep))
continue; continue;
cost += sprSize.width; cost += sprSize.width;
@ -4242,11 +4153,15 @@ void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16
if (spriteInfo.PaletteIndex == 0) if (spriteInfo.PaletteIndex == 0)
continue; continue;
srcadr = this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, y); const u32 objAddress = this->_SpriteAddressBMP(compInfo, spriteInfo, sprSize, spriteY);
this->_RenderSpriteBMP<ISDEBUGRENDER>(compInfo, i, dst, srcadr, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, spriteInfo.PaletteIndex); this->_RenderSpriteBMP<ISDEBUGRENDER>(objAddress, length, frameX, spriteX, readXStep,
spriteInfo.PaletteIndex, OBJMode_Bitmap, prio, spriteNum,
dst, dst_alpha, typeTab, prioTab);
const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(srcadr) - MMU.ARM9_LCD) / sizeof(u16); // When rendering at a custom framebuffer size, save a copy of the OBJ address as a reference
// for reading the custom VRAM.
const size_t vramPixel = (size_t)((u8 *)MMU_gpu_map(objAddress) - MMU.ARM9_LCD) / sizeof(u16);
if (vramPixel < (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES * 4)) if (vramPixel < (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES * 4))
{ {
const size_t blockID = vramPixel / (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES); const size_t blockID = vramPixel / (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_VRAM_BLOCK_LINES);
@ -4256,41 +4171,31 @@ void GPUEngineBase::_SpriteRenderPerform(GPUEngineCompositorInfo &compInfo, u16
if (!GPU->GetEngineMain()->isLineCaptureNative[blockID][blockLine] && (linePixel == 0)) if (!GPU->GetEngineMain()->isLineCaptureNative[blockID][blockLine] && (linePixel == 0))
{ {
this->vramBlockOBJAddress = srcadr; this->vramBlockOBJAddress = objAddress;
} }
} }
} }
else if (spriteInfo.PaletteMode == PaletteMode_1x256) //256 colors; handles OBJ windows too else if (spriteInfo.PaletteMode == PaletteMode_1x256) //256 colors; handles OBJ windows too
{ {
if (MODE == SpriteRenderMode_Sprite2D) const u32 objAddress = (MODE == SpriteRenderMode_Sprite2D) ? this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((spriteY>>3)<<10) + ((spriteY&0x7)*8) :
srcadr = this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((y>>3)<<10) + ((y&0x7)*8); this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((spriteY>>3)*sprSize.width*8) + ((spriteY&0x7)*8);
else
srcadr = this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((y>>3)*sprSize.width*8) + ((y&0x7)*8);
pal = (DISPCNT.ExOBJPalette_Enable) ? (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*ADDRESS_STEP_512B)) : this->_paletteOBJ; const u16 *__restrict palColorBuffer = (DISPCNT.ExOBJPalette_Enable) ? (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*ADDRESS_STEP_512B)) : this->_paletteOBJ;
if (objMode == OBJMode_Window) this->_RenderSprite256<ISDEBUGRENDER>(objAddress, length, frameX, spriteX, readXStep,
this->_RenderSprite256<ISDEBUGRENDER,true>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent)); palColorBuffer, objMode, prio, spriteNum,
else dst, dst_alpha, typeTab, prioTab);
this->_RenderSprite256<ISDEBUGRENDER,false>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent));
} }
else // 16 colors; handles OBJ windows too else // 16 colors; handles OBJ windows too
{ {
if (MODE == SpriteRenderMode_Sprite2D) const u32 objAddress = (MODE == SpriteRenderMode_Sprite2D) ? this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((spriteY>>3)<<10) + ((spriteY&0x7)*4) :
{ this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((spriteY>>3)*sprSize.width*4) + ((spriteY&0x7)*4);
srcadr = this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((y>>3)<<10) + ((y&0x7)*4);
}
else
{
srcadr = this->_sprMem + (spriteInfo.TileIndex<<compInfo.renderState.spriteBoundary) + ((y>>3)*sprSize.width*4) + ((y&0x7)*4);
}
pal = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4); const u16 *__restrict palColorBuffer = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4);
if (objMode == OBJMode_Window) this->_RenderSprite16<ISDEBUGRENDER>(objAddress, length, frameX, spriteX, readXStep,
this->_RenderSprite16<ISDEBUGRENDER, true>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent)); palColorBuffer, objMode, prio, spriteNum,
else dst, dst_alpha, typeTab, prioTab);
this->_RenderSprite16<ISDEBUGRENDER, false>(compInfo, i, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, (objMode == OBJMode_Transparent));
} }
} }
} }

View File

@ -867,7 +867,7 @@ typedef union
struct struct
{ {
u16 Y:8; // 0- 7: Sprite Y-coordinate; 0...255 u16 Y:8; // 0- 7: Sprite Y-coordinate location within the framebuffer; 0...255
u16 RotScale:1; // 8: Perform rotation/scaling; 0=Disable, 1=Enable u16 RotScale:1; // 8: Perform rotation/scaling; 0=Disable, 1=Enable
u16 Disable:1; // 9: OBJ disable flag, only if Bit8 is cleared; 0=Perform render, 1=Do not perform render u16 Disable:1; // 9: OBJ disable flag, only if Bit8 is cleared; 0=Perform render, 1=Do not perform render
u16 Mode:2; // 10-11: OBJ mode; 0=Normal, 1=Transparent, 2=Window, 3=Bitmap u16 Mode:2; // 10-11: OBJ mode; 0=Normal, 1=Transparent, 2=Window, 3=Bitmap
@ -885,7 +885,7 @@ typedef union
}; };
}; };
s16 X:9; // 16-24: Sprite X-coordinate; 0...511 s16 X:9; // 16-24: Sprite X-coordinate location within the framebuffer; 0...511
u16 RotScaleIndex:3; // 25-27: Rotation/scaling parameter selection; 0...31 u16 RotScaleIndex:3; // 25-27: Rotation/scaling parameter selection; 0...31
u16 HFlip:1; // 28: Flip sprite horizontally; 0=Normal, 1=Flip u16 HFlip:1; // 28: Flip sprite horizontally; 0=Normal, 1=Flip
u16 VFlip:1; // 29: Flip sprite vertically; 0=Normal, 1=Flip u16 VFlip:1; // 29: Flip sprite vertically; 0=Normal, 1=Flip
@ -907,7 +907,7 @@ typedef union
struct struct
{ {
u16 Y:8; // 0- 7: Sprite Y-coordinate; 0...255 u16 Y:8; // 0- 7: Sprite Y-coordinate location within the framebuffer; 0...255
u16 Shape:2; // 14-15: OBJ shape; 0=Square, 1=Horizontal, 2=Vertical, 3=Prohibited u16 Shape:2; // 14-15: OBJ shape; 0=Square, 1=Horizontal, 2=Vertical, 3=Prohibited
u16 PaletteMode:1; // 13: Color/palette select; 0=16 palettes of 16 colors each, 1=Single palette of 256 colors u16 PaletteMode:1; // 13: Color/palette select; 0=16 palettes of 16 colors each, 1=Single palette of 256 colors
u16 Mosaic:1; // 12: Mosaic render: 0=Disable, 1=Enable u16 Mosaic:1; // 12: Mosaic render: 0=Disable, 1=Enable
@ -936,7 +936,7 @@ typedef union
u16 VFlip:1; // 29: Flip sprite vertically; 0=Normal, 1=Flip u16 VFlip:1; // 29: Flip sprite vertically; 0=Normal, 1=Flip
u16 HFlip:1; // 28: Flip sprite horizontally; 0=Normal, 1=Flip u16 HFlip:1; // 28: Flip sprite horizontally; 0=Normal, 1=Flip
u16 RotScaleIndex:3; // 25-27: Rotation/scaling parameter selection; 0...31 u16 RotScaleIndex:3; // 25-27: Rotation/scaling parameter selection; 0...31
s16 X:9; // 16-24: Sprite X-coordinate; 0...511 s16 X:9; // 16-24: Sprite X-coordinate location within the framebuffer; 0...511
// 32-47: Whenever this is used, you will need to explicitly convert endianness. // 32-47: Whenever this is used, you will need to explicitly convert endianness.
u16 PaletteIndex:4; // 44-47: Palette index; 0...15 u16 PaletteIndex:4; // 44-47: Palette index; 0...15
@ -1436,9 +1436,10 @@ protected:
template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST> FORCEINLINE void _PixelComposite16_SSE2(GPUEngineCompositorInfo &compInfo, const bool didAllPixelsPass, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, const __m128i &srcEffectEnableMask); template<GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST> FORCEINLINE void _PixelComposite16_SSE2(GPUEngineCompositorInfo &compInfo, const bool didAllPixelsPass, const __m128i &passMask8, const __m128i &src3, const __m128i &src2, const __m128i &src1, const __m128i &src0, const __m128i &srcEffectEnableMask);
#endif #endif
template<bool ISDEBUGRENDER> void _RenderSpriteBMP(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha); template<bool ISDEBUGRENDER, bool ISOBJMODEBITMAP> FORCEINLINE void _RenderSpriteUpdatePixel(size_t frameX, const u16 *__restrict srcPalette, const u8 palIndex, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
template<bool ISDEBUGRENDER, bool ISWINDOW> void _RenderSprite256(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha); template<bool ISDEBUGRENDER> void _RenderSpriteBMP(const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u8 spriteAlpha, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
template<bool ISDEBUGRENDER, bool ISWINDOW> void _RenderSprite16(GPUEngineCompositorInfo &compInfo, const u8 spriteNum, u16 *__restrict dst, const u32 srcadr, const u16 *__restrict pal, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha); template<bool ISDEBUGRENDER> void _RenderSprite256(const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
template<bool ISDEBUGRENDER> void _RenderSprite16(const u32 objAddress, const size_t length, size_t frameX, size_t spriteX, const s32 readXStep, const u16 *__restrict palColorBuffer, const OBJMode objMode, const u8 prio, const u8 spriteNum, u16 *__restrict dst, u8 *__restrict dst_alpha, u8 *__restrict typeTab, u8 *__restrict prioTab);
void _RenderSpriteWin(const u8 *src, const bool col256, const size_t lg, size_t sprX, size_t x, const s32 xdir); void _RenderSpriteWin(const u8 *src, const bool col256, const size_t lg, size_t sprX, size_t x, const s32 xdir);
bool _ComputeSpriteVars(GPUEngineCompositorInfo &compInfo, const OAMAttributes &spriteInfo, SpriteSize &sprSize, s32 &sprX, s32 &sprY, s32 &x, s32 &y, s32 &lg, s32 &xdir); bool _ComputeSpriteVars(GPUEngineCompositorInfo &compInfo, const OAMAttributes &spriteInfo, SpriteSize &sprSize, s32 &sprX, s32 &sprY, s32 &x, s32 &y, s32 &lg, s32 &xdir);