diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index c427220a1..4a4c8da94 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -26,6 +26,15 @@ #include #include +#ifdef ENABLE_SSE2 +#include +#endif + +#ifdef ENABLE_SSSE3 +#include +#endif + +#include "common.h" #include "MMU.h" #include "FIFO.h" #include "debug.h" @@ -54,10 +63,7 @@ GPU::MosaicLookup GPU::mosaicLookup; //#define DEBUG_TRI -//this value should be 32-byte aligned u16 *GPU_screen = NULL; -//and this is the raw pointer -u16 *GPU_screen_raw = NULL; static size_t _gpuFramebufferWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH; static size_t _gpuFramebufferHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT; @@ -70,9 +76,9 @@ static size_t _gpuDstPitchIndex[GPU_FRAMEBUFFER_NATIVE_WIDTH]; // Key: Source static size_t _gpuDstLineCount[GPU_FRAMEBUFFER_NATIVE_HEIGHT]; // Key: Source line index / Value: Number of destination lines for the source line static size_t _gpuDstLineIndex[GPU_FRAMEBUFFER_NATIVE_HEIGHT]; // Key: Source line index / Value: First destination line that maps to the source line -CACHE_ALIGN u8 sprWin[GPU_FRAMEBUFFER_NATIVE_WIDTH * 8]; +static CACHE_ALIGN u8 sprWin[GPU_FRAMEBUFFER_NATIVE_WIDTH * 8]; -u16 gpu_angle = 0; +static u16 gpu_angle = 0; const SpriteSize sprSizeTab[4][4] = { @@ -110,7 +116,7 @@ const short sizeTab[8][4][2] = static u8 *win_empty = NULL; static CACHE_ALIGN u16 fadeInColors[17][0x8000]; -CACHE_ALIGN u16 fadeOutColors[17][0x8000]; +static CACHE_ALIGN u16 fadeOutColors[17][0x8000]; //this should be public, because it gets used somewhere else CACHE_ALIGN u8 gpuBlendTable555[17][17][32][32]; @@ -264,15 +270,14 @@ void GPU_DeInit(GPU *gpu) { if (gpu == &GPU_main || gpu == &GPU_sub) return; - free(gpu->tempScanlineBufferRaw); + free_aligned(gpu->tempScanlineBuffer); gpu->tempScanlineBuffer = NULL; - gpu->tempScanlineBufferRaw = NULL; - free(gpu->bgPixels); + free_aligned(gpu->bgPixels); gpu->bgPixels = NULL; - free(gpu->h_win[0]); + free_aligned(gpu->h_win[0]); gpu->h_win[0] = 
NULL; - free(gpu->h_win[1]); + free_aligned(gpu->h_win[1]); gpu->h_win[1] = NULL; free(gpu); @@ -608,7 +613,7 @@ FORCEINLINE void GPU::renderline_checkWindows(const size_t dstX, bool &draw, boo /*****************************************************************************/ template -FORCEINLINE FASTCALL void GPU::_master_setFinal3dColor(const size_t dstX, u16 &outDst, u8 *bgPixelsLine, const FragmentColor src) +FORCEINLINE FASTCALL void GPU::_master_setFinal3dColor(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, const FragmentColor src) { u8 alpha = src.a; u16 final; @@ -635,7 +640,7 @@ FORCEINLINE FASTCALL void GPU::_master_setFinal3dColor(const size_t dstX, u16 &o //if the layer underneath is a blend bottom layer, then 3d always alpha blends with it COLOR c2, cfinal; - c2.val = outDst; + c2.val = dstLine[dstX]; cfinal.bits.red = ((src.r * alpha) + ((c2.bits.red<<1) * (32 - alpha)))>>6; cfinal.bits.green = ((src.g * alpha) + ((c2.bits.green<<1) * (32 - alpha)))>>6; @@ -662,12 +667,12 @@ FORCEINLINE FASTCALL void GPU::_master_setFinal3dColor(const size_t dstX, u16 &o } } - outDst = final | 0x8000; + dstLine[dstX] = final | 0x8000; bgPixelsLine[dstX] = 0; } template -FORCEINLINE FASTCALL bool GPU::_master_setFinalBGColor(const u16 *dstLine, const u8 *bgPixelsLine, u16 &outColor, const size_t dstX) +FORCEINLINE FASTCALL bool GPU::_master_setFinalBGColor(const size_t dstX, const u16 *dstLine, const u8 *bgPixelsLine, u16 &outColor) { //no further analysis for no special effects. on backdrops. just draw it. 
if ((FUNC == NoBlend) && BACKDROP) return true; @@ -707,8 +712,10 @@ FORCEINLINE FASTCALL bool GPU::_master_setFinalBGColor(const u16 *dstLine, const } template -static FORCEINLINE void _master_setFinalOBJColor(GPU *gpu, u16 *dstLine, u8 *bgPixelsLine, u16 color, const u8 alpha, const u8 type, const size_t dstX) +FORCEINLINE FASTCALL void GPU::_master_setFinalOBJColor(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, const u16 src, const u8 alpha, const u8 type) { + u16 finalDstColor = src; + const bool isObjTranslucentType = type == GPU_OBJ_MODE_Transparent || type == GPU_OBJ_MODE_Bitmap; bool windowDraw = true; @@ -716,7 +723,7 @@ static FORCEINLINE void _master_setFinalOBJColor(GPU *gpu, u16 *dstLine, u8 *bgP if (WINDOW) { - gpu->renderline_checkWindows(dstX, windowDraw, windowEffectSatisfied); + renderline_checkWindows(dstX, windowDraw, windowEffectSatisfied); if (!windowDraw) return; } @@ -724,24 +731,24 @@ static FORCEINLINE void _master_setFinalOBJColor(GPU *gpu, u16 *dstLine, u8 *bgP //if the window effect is satisfied, then we can do color effects to modify the color if (windowEffectSatisfied) { - const bool firstTargetSatisfied = gpu->blend1; const size_t bg_under = bgPixelsLine[dstX]; - const bool secondTargetSatisfied = (bg_under != 4) && gpu->blend2[bg_under]; + const bool firstTargetSatisfied = blend1; + const bool secondTargetSatisfied = (bg_under != 4) && blend2[bg_under]; BlendFunc selectedFunc = NoBlend; - u8 eva = gpu->BLDALPHA_EVA; - u8 evb = gpu->BLDALPHA_EVB; + u8 eva = BLDALPHA_EVA; + u8 evb = BLDALPHA_EVB; //if normal BLDCNT layer target conditions are met, then we can use the BLDCNT-specified color effect if (FUNC == Blend) { //blending requires first and second target screens to be satisfied - if(firstTargetSatisfied && secondTargetSatisfied) selectedFunc = FUNC; + if (firstTargetSatisfied && secondTargetSatisfied) selectedFunc = FUNC; } else { //brightness up and down requires only the first target screen to be satisfied - 
if(firstTargetSatisfied) selectedFunc = FUNC; + if (firstTargetSatisfied) selectedFunc = FUNC; } //translucent-capable OBJ are forcing the function to blend when the second target is satisfied @@ -764,15 +771,15 @@ static FORCEINLINE void _master_setFinalOBJColor(GPU *gpu, u16 *dstLine, u8 *bgP break; case Increase: - color = gpu->currentFadeInColors[color&0x7FFF]; + finalDstColor = currentFadeInColors[src & 0x7FFF]; break; case Decrease: - color = gpu->currentFadeOutColors[color&0x7FFF]; + finalDstColor = currentFadeOutColors[src & 0x7FFF]; break; case Blend: - color = _blend(color, dstLine[dstX], &gpuBlendTable555[eva][evb]); + finalDstColor = _blend(src, dstLine[dstX], &gpuBlendTable555[eva][evb]); break; default: @@ -780,70 +787,70 @@ static FORCEINLINE void _master_setFinalOBJColor(GPU *gpu, u16 *dstLine, u8 *bgP } } - dstLine[dstX] = color | 0x8000; + dstLine[dstX] = finalDstColor | 0x8000; bgPixelsLine[dstX] = 4; } //FUNCNUM is only set for backdrop, for an optimization of looking it up early template -FORCEINLINE void GPU::setFinalColorBG(u16 *dstLine, u8 *bgPixelsLine, u16 color, const size_t dstX) +FORCEINLINE void GPU::setFinalColorBG(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, u16 src) { //It is not safe to assert this here. //This is probably the best place to enforce it, since almost every single color that comes in here //will be pulled from a palette that needs the top bit stripped off anyway. - //assert((color&0x8000)==0); - if (!BACKDROP) color &= 0x7FFF; //but for the backdrop we can easily guarantee earlier that theres no bit here + //assert((src&0x8000)==0); + if (!BACKDROP) src &= 0x7FFF; //but for the backdrop we can easily guarantee earlier that theres no bit here bool draw = false; const int test = (BACKDROP) ? 
FUNCNUM : setFinalColorBck_funcNum; switch (test) { - case 0: draw = _master_setFinalBGColor(dstLine, bgPixelsLine, color, dstX); break; - case 1: draw = _master_setFinalBGColor(dstLine, bgPixelsLine, color, dstX); break; - case 2: draw = _master_setFinalBGColor(dstLine, bgPixelsLine, color, dstX); break; - case 3: draw = _master_setFinalBGColor(dstLine, bgPixelsLine, color, dstX); break; - case 4: draw = _master_setFinalBGColor(dstLine, bgPixelsLine, color, dstX); break; - case 5: draw = _master_setFinalBGColor(dstLine, bgPixelsLine, color, dstX); break; - case 6: draw = _master_setFinalBGColor(dstLine, bgPixelsLine, color, dstX); break; - case 7: draw = _master_setFinalBGColor(dstLine, bgPixelsLine, color, dstX); break; + case 0: draw = _master_setFinalBGColor(dstX, dstLine, bgPixelsLine, src); break; + case 1: draw = _master_setFinalBGColor(dstX, dstLine, bgPixelsLine, src); break; + case 2: draw = _master_setFinalBGColor(dstX, dstLine, bgPixelsLine, src); break; + case 3: draw = _master_setFinalBGColor(dstX, dstLine, bgPixelsLine, src); break; + case 4: draw = _master_setFinalBGColor(dstX, dstLine, bgPixelsLine, src); break; + case 5: draw = _master_setFinalBGColor(dstX, dstLine, bgPixelsLine, src); break; + case 6: draw = _master_setFinalBGColor(dstX, dstLine, bgPixelsLine, src); break; + case 7: draw = _master_setFinalBGColor(dstX, dstLine, bgPixelsLine, src); break; default: break; }; if (BACKDROP || draw) //backdrop must always be drawn { - dstLine[dstX] = color | 0x8000; + dstLine[dstX] = src | 0x8000; if (!BACKDROP) bgPixelsLine[dstX] = currBgNum; //lets do this in the backdrop drawing loop, should be faster } } -FORCEINLINE void GPU::setFinalColor3d(const size_t dstX, u16 &outDst, u8 *bgPixelsLine, const FragmentColor src) +FORCEINLINE void GPU::setFinalColor3d(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, const FragmentColor src) { switch (setFinalColor3d_funcNum) { - case 0x0: _master_setFinal3dColor(dstX, outDst, bgPixelsLine, src); break; - case 
0x1: _master_setFinal3dColor(dstX, outDst, bgPixelsLine, src); break; - case 0x2: _master_setFinal3dColor(dstX, outDst, bgPixelsLine, src); break; - case 0x3: _master_setFinal3dColor(dstX, outDst, bgPixelsLine, src); break; - case 0x4: _master_setFinal3dColor(dstX, outDst, bgPixelsLine, src); break; - case 0x5: _master_setFinal3dColor(dstX, outDst, bgPixelsLine, src); break; - case 0x6: _master_setFinal3dColor(dstX, outDst, bgPixelsLine, src); break; - case 0x7: _master_setFinal3dColor(dstX, outDst, bgPixelsLine, src); break; + case 0x0: _master_setFinal3dColor(dstX, dstLine, bgPixelsLine, src); break; + case 0x1: _master_setFinal3dColor(dstX, dstLine, bgPixelsLine, src); break; + case 0x2: _master_setFinal3dColor(dstX, dstLine, bgPixelsLine, src); break; + case 0x3: _master_setFinal3dColor(dstX, dstLine, bgPixelsLine, src); break; + case 0x4: _master_setFinal3dColor(dstX, dstLine, bgPixelsLine, src); break; + case 0x5: _master_setFinal3dColor(dstX, dstLine, bgPixelsLine, src); break; + case 0x6: _master_setFinal3dColor(dstX, dstLine, bgPixelsLine, src); break; + case 0x7: _master_setFinal3dColor(dstX, dstLine, bgPixelsLine, src); break; }; } -FORCEINLINE void setFinalColorSpr(GPU *gpu, u16 *dstLine, u8 *bgPixelsLine, u16 color, const u8 alpha, const u8 type, const size_t dstX) +FORCEINLINE void GPU::setFinalColorSpr(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, const u16 src, const u8 alpha, const u8 type) { - switch (gpu->setFinalColorSpr_funcNum) + switch (setFinalColorSpr_funcNum) { - case 0x0: _master_setFinalOBJColor(gpu, dstLine, bgPixelsLine, color, alpha, type, dstX); break; - case 0x1: _master_setFinalOBJColor(gpu, dstLine, bgPixelsLine, color, alpha, type, dstX); break; - case 0x2: _master_setFinalOBJColor(gpu, dstLine, bgPixelsLine, color, alpha, type, dstX); break; - case 0x3: _master_setFinalOBJColor(gpu, dstLine, bgPixelsLine, color, alpha, type, dstX); break; - case 0x4: _master_setFinalOBJColor(gpu, dstLine, bgPixelsLine, color, alpha, type, 
dstX); break; - case 0x5: _master_setFinalOBJColor(gpu, dstLine, bgPixelsLine, color, alpha, type, dstX); break; - case 0x6: _master_setFinalOBJColor(gpu, dstLine, bgPixelsLine, color, alpha, type, dstX); break; - case 0x7: _master_setFinalOBJColor(gpu, dstLine, bgPixelsLine, color, alpha, type, dstX); break; + case 0x0: _master_setFinalOBJColor(dstX, dstLine, bgPixelsLine, src, alpha, type); break; + case 0x1: _master_setFinalOBJColor(dstX, dstLine, bgPixelsLine, src, alpha, type); break; + case 0x2: _master_setFinalOBJColor(dstX, dstLine, bgPixelsLine, src, alpha, type); break; + case 0x3: _master_setFinalOBJColor(dstX, dstLine, bgPixelsLine, src, alpha, type); break; + case 0x4: _master_setFinalOBJColor(dstX, dstLine, bgPixelsLine, src, alpha, type); break; + case 0x5: _master_setFinalOBJColor(dstX, dstLine, bgPixelsLine, src, alpha, type); break; + case 0x6: _master_setFinalOBJColor(dstX, dstLine, bgPixelsLine, src, alpha, type); break; + case 0x7: _master_setFinalOBJColor(dstX, dstLine, bgPixelsLine, src, alpha, type); break; }; } @@ -868,10 +875,10 @@ FORCEINLINE void GPU::___setFinalColorBck(u16 color, const size_t srcX, const bo { for (size_t p = 0; p < _gpuDstPitchCount[srcX]; p++) { - setFinalColorBG(currDst + (line * _gpuFramebufferWidth), + setFinalColorBG(_gpuDstPitchIndex[srcX] + p, + currDst + (line * _gpuFramebufferWidth), bgPixels + (line * _gpuFramebufferWidth), - color, - _gpuDstPitchIndex[srcX] + p); + color); } } } @@ -902,10 +909,10 @@ FORCEINLINE void GPU::___setFinalColorBck(u16 color, const size_t srcX, const bo { for (size_t p = 0; p < _gpuDstPitchCount[srcX]; p++) { - setFinalColorBG(currDst + (line * _gpuFramebufferWidth), + setFinalColorBG(_gpuDstPitchIndex[srcX] + p, + currDst + (line * _gpuFramebufferWidth), bgPixels + (line * _gpuFramebufferWidth), - color, - _gpuDstPitchIndex[srcX] + p); + color); } } } @@ -1225,18 +1232,18 @@ FORCEINLINE void rot_BMP_map(GPU *gpu, const s32 auxX, const s32 auxY, const int typedef void 
(*rot_fun)(GPU *gpu, const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *pal, const size_t i); template -FORCEINLINE void rot_scale_op(GPU *gpu, s32 X, s32 Y, s16 PA, s16 PB, s16 PC, s16 PD, u16 LG, s32 wh, s32 ht, u32 map, u32 tile, const u16 *pal) +FORCEINLINE void rot_scale_op(GPU *gpu, const BGxPARMS &param, const u16 LG, const s32 wh, const s32 ht, const u32 map, const u32 tile, const u16 *pal) { ROTOCOORD x, y; - x.val = X; - y.val = Y; + x.val = param.BGxX; + y.val = param.BGxY; - const s32 dx = (s32)PA; - const s32 dy = (s32)PC; + const s32 dx = (s32)param.BGxPA; + const s32 dy = (s32)param.BGxPC; // as an optimization, specially handle the fairly common case of // "unrotated + unscaled + no boundary checking required" - if (dx==0x100 && dy==0) + if (dx == GPU_FRAMEBUFFER_NATIVE_WIDTH && dy == 0) { s32 auxX = (WRAP) ? x.bits.Integer & (wh-1) : x.bits.Integer; const s32 auxY = (WRAP) ? y.bits.Integer & (ht-1) : y.bits.Integer; @@ -1267,30 +1274,30 @@ FORCEINLINE void rot_scale_op(GPU *gpu, s32 X, s32 Y, s16 PA, s16 PB, s16 PC, s1 } template -FORCEINLINE void apply_rot_fun(GPU *gpu, s32 X, s32 Y, s16 PA, s16 PB, s16 PC, s16 PD, u16 LG, u32 map, u32 tile, const u16 *pal) +FORCEINLINE void apply_rot_fun(GPU *gpu, const BGxPARMS &param, const u16 LG, const u32 map, const u32 tile, const u16 *pal) { struct _BGxCNT *bgCnt = &(gpu->dispx_st)->dispx_BGxCNT[gpu->currBgNum].bits; s32 wh = gpu->BGSize[gpu->currBgNum][0]; s32 ht = gpu->BGSize[gpu->currBgNum][1]; if (bgCnt->PaletteSet_Wrap) - rot_scale_op(gpu, X, Y, PA, PB, PC, PD, LG, wh, ht, map, tile, pal); + rot_scale_op(gpu, param, LG, wh, ht, map, tile, pal); else - rot_scale_op(gpu, X, Y, PA, PB, PC, PD, LG, wh, ht, map, tile, pal); + rot_scale_op(gpu, param, LG, wh, ht, map, tile, pal); } template -FORCEINLINE void rotBG2(GPU *gpu, s32 X, s32 Y, s16 PA, s16 PB, s16 PC, s16 PD, u16 LG) +FORCEINLINE void rotBG2(GPU *gpu, const BGxPARMS &param, const u16 LG) { const size_t num = 
gpu->currBgNum; const u16 *pal = (u16 *)(MMU.ARM9_VMEM + gpu->core * ADDRESS_STEP_1KB); // printf("rot mode\n"); - apply_rot_fun< rot_tiled_8bit_entry >(gpu,X,Y,PA,PB,PC,PD,LG, gpu->BG_map_ram[num], gpu->BG_tile_ram[num], pal); + apply_rot_fun< rot_tiled_8bit_entry >(gpu, param, LG, gpu->BG_map_ram[num], gpu->BG_tile_ram[num], pal); } template -FORCEINLINE void extRotBG2(GPU * gpu, s32 X, s32 Y, s16 PA, s16 PB, s16 PC, s16 PD, s16 LG) +FORCEINLINE void extRotBG2(GPU *gpu, const BGxPARMS &param, const u16 LG) { const size_t num = gpu->currBgNum; struct _DISPCNT * dispCnt = &(gpu->dispx_st)->dispx_DISPCNT.bits; @@ -1304,26 +1311,26 @@ FORCEINLINE void extRotBG2(GPU * gpu, s32 X, s32 Y, s16 PA, s16 PB, s16 PC, s16 if (pal == NULL) return; // 16 bit bgmap entries if(dispCnt->ExBGxPalette_Enable) - apply_rot_fun< rot_tiled_16bit_entry >(gpu,X,Y,PA,PB,PC,PD,LG, gpu->BG_map_ram[num], gpu->BG_tile_ram[num], pal); + apply_rot_fun< rot_tiled_16bit_entry >(gpu, param, LG, gpu->BG_map_ram[num], gpu->BG_tile_ram[num], pal); else - apply_rot_fun< rot_tiled_16bit_entry >(gpu,X,Y,PA,PB,PC,PD,LG, gpu->BG_map_ram[num], gpu->BG_tile_ram[num], pal); + apply_rot_fun< rot_tiled_16bit_entry >(gpu, param, LG, gpu->BG_map_ram[num], gpu->BG_tile_ram[num], pal); break; case BGType_AffineExt_256x1: // 256 colors pal = (u16 *)(MMU.ARM9_VMEM + gpu->core * ADDRESS_STEP_1KB); - apply_rot_fun< rot_256_map >(gpu,X,Y,PA,PB,PC,PD,LG, gpu->BG_bmp_ram[num], 0, pal); + apply_rot_fun< rot_256_map >(gpu, param, LG, gpu->BG_bmp_ram[num], 0, pal); break; case BGType_AffineExt_Direct: // direct colors / BMP - apply_rot_fun< rot_BMP_map >(gpu,X,Y,PA,PB,PC,PD,LG, gpu->BG_bmp_ram[num], 0, NULL); + apply_rot_fun< rot_BMP_map >(gpu, param, LG, gpu->BG_bmp_ram[num], 0, NULL); break; case BGType_Large8bpp: // large screen 256 colors pal = (u16 *)(MMU.ARM9_VMEM + gpu->core * ADDRESS_STEP_1KB); - apply_rot_fun< rot_256_map >(gpu,X,Y,PA,PB,PC,PD,LG, gpu->BG_bmp_large_ram[num], 0, pal); + apply_rot_fun< rot_256_map >(gpu, 
param, LG, gpu->BG_bmp_large_ram[num], 0, pal); break; default: @@ -1357,66 +1364,38 @@ template void lineText(GPU *gpu) } template void lineRot(GPU *gpu) -{ - BGxPARMS *parms; - if (gpu->currBgNum == 2) - { - parms = &(gpu->dispx_st)->dispx_BG2PARMS; - } - else - { - parms = &(gpu->dispx_st)->dispx_BG3PARMS; - } - +{ if (gpu->debug) { - s32 wh = gpu->BGSize[gpu->currBgNum][0]; - rotBG2(gpu, 0, (s16)gpu->currLine*GPU_FRAMEBUFFER_NATIVE_WIDTH, 256, 0, 0, -77, wh); + static const BGxPARMS debugParams = {256, 0, 0, -77, 0, (s16)gpu->currLine*GPU_FRAMEBUFFER_NATIVE_WIDTH}; + const s32 wh = gpu->BGSize[gpu->currBgNum][0]; + rotBG2(gpu, debugParams, wh); } else { - rotBG2(gpu, - parms->BGxX, - parms->BGxY, - parms->BGxPA, - parms->BGxPB, - parms->BGxPC, - parms->BGxPD, - 256); - parms->BGxX += parms->BGxPB; - parms->BGxY += parms->BGxPD; + BGxPARMS &params = (gpu->currBgNum == 2) ? (gpu->dispx_st)->dispx_BG2PARMS : (gpu->dispx_st)->dispx_BG3PARMS; + + rotBG2(gpu, params, 256); + params.BGxX += params.BGxPB; + params.BGxY += params.BGxPD; } } template void lineExtRot(GPU *gpu) { - BGxPARMS *parms; - if (gpu->currBgNum == 2) - { - parms = &(gpu->dispx_st)->dispx_BG2PARMS; - } - else - { - parms = &(gpu->dispx_st)->dispx_BG3PARMS; - } - if (gpu->debug) { - s32 wh = gpu->BGSize[gpu->currBgNum][0]; - extRotBG2(gpu, 0, (s16)gpu->currLine*GPU_FRAMEBUFFER_NATIVE_WIDTH, 256, 0, 0, -77, wh); + static BGxPARMS debugParams = {256, 0, 0, -77, 0, (s16)gpu->currLine*GPU_FRAMEBUFFER_NATIVE_WIDTH}; + const s32 wh = gpu->BGSize[gpu->currBgNum][0]; + extRotBG2(gpu, debugParams, wh); } else { - extRotBG2(gpu, - parms->BGxX, - parms->BGxY, - parms->BGxPA, - parms->BGxPB, - parms->BGxPC, - parms->BGxPD, - 256); - parms->BGxX += parms->BGxPB; - parms->BGxY += parms->BGxPD; + BGxPARMS &params = (gpu->currBgNum == 2) ? 
(gpu->dispx_st)->dispx_BG2PARMS : (gpu->dispx_st)->dispx_BG3PARMS; + + extRotBG2(gpu, params, 256); + params.BGxX += params.BGxPB; + params.BGxY += params.BGxPD; } } @@ -1942,6 +1921,7 @@ int Screen_Init() { MainScreen.gpu = GPU_Init(GPUCOREID_MAIN); SubScreen.gpu = GPU_Init(GPUCOREID_SUB); + gfx3d_init(); disp_fifo.head = disp_fifo.tail = 0; @@ -1949,7 +1929,6 @@ int Screen_Init() osd = new OSDCLASS(-1); delete previousOSD; - gfx3d_init(); GPU_SetFramebufferSize(GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT); return 0; @@ -1957,11 +1936,15 @@ int Screen_Init() void Screen_Reset(void) { + gfx3d_reset(); GPU_Reset(MainScreen.gpu); GPU_Reset(SubScreen.gpu); MainScreen.offset = 0; SubScreen.offset = _gpuFramebufferHeight; - memset_u16(GPU_screen, 0x7FFF, _gpuFramebufferWidth * _gpuFramebufferHeight * 2); + + memset(GPU_screen, 0xFF, _gpuFramebufferWidth * _gpuFramebufferHeight * 2 * sizeof(u16)); + memset(gfx3d_colorRGBA6665, 0, _gpuFramebufferWidth * _gpuFramebufferHeight * sizeof(FragmentColor)); + memset(gfx3d_colorRGBA5551, 0, _gpuFramebufferWidth * _gpuFramebufferHeight * sizeof(u16)); disp_fifo.head = disp_fifo.tail = 0; osd->clear(); @@ -1979,11 +1962,10 @@ void Screen_DeInit(void) delete osd; osd = NULL; - free(GPU_screen_raw); - GPU_screen_raw = NULL; + free_aligned(GPU_screen); GPU_screen = NULL; - free(win_empty); + free_aligned(win_empty); win_empty = NULL; } @@ -2004,20 +1986,35 @@ void GPU_SetFramebufferSize(size_t w, size_t h) return; } - _gpuFramebufferWidth = w; - _gpuFramebufferHeight = h; - _gpuWidthScale = (float)w / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH; - _gpuHeightScale = (float)h / (float)GPU_FRAMEBUFFER_NATIVE_HEIGHT; - _gpuLargestDstLineCount = (size_t)ceilf(_gpuHeightScale); + // Check if we're calling this function from initialization. + // If we're not initializing, we need to finish rendering first. 
+ if (gfx3d_colorRGBA6665 != NULL && gfx3d_colorRGBA5551 != NULL) + { + CurrentRenderer->RenderFinish(); + } - GPU_screen_raw = (u16 *)realloc(GPU_screen_raw, w * h * sizeof(u16) * 2 + 32); - GPU_screen = (u16*)(((uintptr_t)GPU_screen_raw+32) & ~31); - MainScreen.offset = (MainScreen.offset == 0) ? 0 : _gpuFramebufferHeight; - SubScreen.offset = (SubScreen.offset == 0) ? 0 : _gpuFramebufferHeight; + const float newGpuWidthScale = (float)w / (float)GPU_FRAMEBUFFER_NATIVE_WIDTH; + const float newGpuHeightScale = (float)h / (float)GPU_FRAMEBUFFER_NATIVE_HEIGHT; + const float newGpuLargestDstLineCount = (size_t)ceilf(newGpuHeightScale); + const size_t windowBufferSize = w * sizeof(u8); + + u16 *oldGPUScreenPtr = GPU_screen; + FragmentColor *oldColorRGBA6665Buffer = gfx3d_colorRGBA6665; + u16 *oldColorRGBA5551Buffer = gfx3d_colorRGBA5551; + u16 *oldMainScreenTempScanlineBuffer = MainScreen.gpu->tempScanlineBuffer; + u16 *oldSubScreenTempScanlineBuffer = SubScreen.gpu->tempScanlineBuffer; + u8 *oldMainScreenBGPixels = MainScreen.gpu->bgPixels; + u8 *oldSubScreenBGPixels = SubScreen.gpu->bgPixels; + + u8 *oldWinEmptyPtr = win_empty; + u8 *oldMainScreenHWin0 = MainScreen.gpu->h_win[0]; + u8 *oldMainScreenHWin1 = MainScreen.gpu->h_win[1]; + u8 *oldSubScreenHWin0 = SubScreen.gpu->h_win[0]; + u8 *oldSubScreenHWin1 = SubScreen.gpu->h_win[1]; for (size_t srcX = 0, currentPitchCount = 0; srcX < GPU_FRAMEBUFFER_NATIVE_WIDTH; srcX++) { - const size_t pitch = (size_t)ceilf((srcX+1) * _gpuWidthScale) - currentPitchCount; + const size_t pitch = (size_t)ceilf((srcX+1) * newGpuWidthScale) - currentPitchCount; _gpuDstPitchCount[srcX] = pitch; _gpuDstPitchIndex[srcX] = currentPitchCount; currentPitchCount += pitch; @@ -2025,36 +2022,72 @@ void GPU_SetFramebufferSize(size_t w, size_t h) for (size_t srcY = 0, currentLineCount = 0; srcY < GPU_FRAMEBUFFER_NATIVE_HEIGHT; srcY++) { - const size_t lineCount = (size_t)ceilf((srcY+1) * _gpuHeightScale) - currentLineCount; + const size_t lineCount 
= (size_t)ceilf((srcY+1) * newGpuHeightScale) - currentLineCount; _gpuDstLineCount[srcY] = lineCount; _gpuDstLineIndex[srcY] = currentLineCount; currentLineCount += lineCount; } - MainScreen.gpu->tempScanlineBufferRaw = (u16 *)realloc(MainScreen.gpu->tempScanlineBufferRaw, w * _gpuLargestDstLineCount * sizeof(u16) + 32); - SubScreen.gpu->tempScanlineBufferRaw = (u16 *)realloc(SubScreen.gpu->tempScanlineBufferRaw, w * _gpuLargestDstLineCount * sizeof(u16) + 32); - MainScreen.gpu->tempScanlineBuffer = (u16*)(((uintptr_t)MainScreen.gpu->tempScanlineBufferRaw+32) & ~31); - SubScreen.gpu->tempScanlineBuffer = (u16*)(((uintptr_t)SubScreen.gpu->tempScanlineBufferRaw+32) & ~31); - MainScreen.gpu->bgPixels = (u8 *)realloc(MainScreen.gpu->bgPixels, w * _gpuLargestDstLineCount * 4 * sizeof(u8)); // yes indeed, this is oversized. map debug tools try to write to it - SubScreen.gpu->bgPixels = (u8 *)realloc(SubScreen.gpu->bgPixels, w * _gpuLargestDstLineCount * 4 * sizeof(u8)); // yes indeed, this is oversized. 
map debug tools try to write to it + u16 *newGPUScreenPtr = (u16 *)malloc_alignedCacheLine(w * h * sizeof(u16) * 2); + memset_u16(newGPUScreenPtr, 0x7FFF, w * h * 2); - const size_t windowBufferSize = w * sizeof(u8); - const u8 *oldWinEmptyPtr = win_empty; - win_empty = (u8 *)realloc(win_empty, windowBufferSize); - memset(win_empty, 0, windowBufferSize); + FragmentColor *newColorRGBA6665Buffer = (FragmentColor *)malloc_alignedCacheLine(w * h * sizeof(FragmentColor)); + u16 *newColorRGBA5551 = (u16 *)malloc_alignedCacheLine(w * h * sizeof(u16)); - MainScreen.gpu->h_win[0] = (u8 *)realloc(MainScreen.gpu->h_win[0], windowBufferSize); - MainScreen.gpu->h_win[1] = (u8 *)realloc(MainScreen.gpu->h_win[1], windowBufferSize); - SubScreen.gpu->h_win[0] = (u8 *)realloc(SubScreen.gpu->h_win[0], windowBufferSize); - SubScreen.gpu->h_win[1] = (u8 *)realloc(SubScreen.gpu->h_win[1], windowBufferSize); + u16 *newMainScreenTempScanlineBuffer = (u16 *)malloc_alignedCacheLine(w * newGpuLargestDstLineCount * sizeof(u16)); + u16 *newSubScreenTempScanlineBuffer = (u16 *)malloc_alignedCacheLine(w * newGpuLargestDstLineCount * sizeof(u16)); + u8 *newMainScreenBGPixels = (u8 *)malloc_alignedCacheLine(w * newGpuLargestDstLineCount * 4 * sizeof(u8)); // yes indeed, this is oversized. map debug tools try to write to it + u8 *newSubScreenBGPixels = (u8 *)malloc_alignedCacheLine(w * newGpuLargestDstLineCount * 4 * sizeof(u8)); // yes indeed, this is oversized. map debug tools try to write to it - MainScreen.gpu->curr_win[0] = (MainScreen.gpu->curr_win[0] == NULL || MainScreen.gpu->curr_win[0] == oldWinEmptyPtr) ? win_empty : MainScreen.gpu->h_win[0]; - MainScreen.gpu->curr_win[1] = (MainScreen.gpu->curr_win[1] == NULL || MainScreen.gpu->curr_win[1] == oldWinEmptyPtr) ? win_empty : MainScreen.gpu->h_win[1]; - SubScreen.gpu->curr_win[0] = (SubScreen.gpu->curr_win[0] == NULL || SubScreen.gpu->curr_win[0] == oldWinEmptyPtr) ? 
win_empty : SubScreen.gpu->h_win[0]; - SubScreen.gpu->curr_win[1] = (SubScreen.gpu->curr_win[1] == NULL || SubScreen.gpu->curr_win[1] == oldWinEmptyPtr) ? win_empty : SubScreen.gpu->h_win[1]; + u8 *newWinEmptyPtr = (u8 *)malloc_alignedCacheLine(windowBufferSize); + u8 *newMainScreenHWin0 = (u8 *)malloc_alignedCacheLine(windowBufferSize); + u8 *newMainScreenHWin1 = (u8 *)malloc_alignedCacheLine(windowBufferSize); + u8 *newSubScreenHWin0 = (u8 *)malloc_alignedCacheLine(windowBufferSize); + u8 *newSubScreenHWin1 = (u8 *)malloc_alignedCacheLine(windowBufferSize); + memset(newWinEmptyPtr, 0, windowBufferSize); - memset_u16(GPU_screen, 0x7FFF, w * h * 2); - gfx3d_setFramebufferSize(_gpuFramebufferWidth, _gpuFramebufferHeight); + _gpuFramebufferWidth = w; + _gpuFramebufferHeight = h; + _gpuWidthScale = newGpuWidthScale; + _gpuHeightScale = newGpuHeightScale; + _gpuLargestDstLineCount = newGpuLargestDstLineCount; + + MainScreen.gpu->curr_win[0] = (MainScreen.gpu->curr_win[0] == NULL || MainScreen.gpu->curr_win[0] == oldWinEmptyPtr) ? newWinEmptyPtr : newMainScreenHWin0; + MainScreen.gpu->curr_win[1] = (MainScreen.gpu->curr_win[1] == NULL || MainScreen.gpu->curr_win[1] == oldWinEmptyPtr) ? newWinEmptyPtr : newMainScreenHWin1; + SubScreen.gpu->curr_win[0] = (SubScreen.gpu->curr_win[0] == NULL || SubScreen.gpu->curr_win[0] == oldWinEmptyPtr) ? newWinEmptyPtr : newSubScreenHWin0; + SubScreen.gpu->curr_win[1] = (SubScreen.gpu->curr_win[1] == NULL || SubScreen.gpu->curr_win[1] == oldWinEmptyPtr) ? 
newWinEmptyPtr : newSubScreenHWin1; + + win_empty = newWinEmptyPtr; + MainScreen.gpu->h_win[0] = newMainScreenHWin0; + MainScreen.gpu->h_win[1] = newMainScreenHWin1; + SubScreen.gpu->h_win[0] = newSubScreenHWin0; + SubScreen.gpu->h_win[1] = newSubScreenHWin1; + + MainScreen.gpu->tempScanlineBuffer = newMainScreenTempScanlineBuffer; + SubScreen.gpu->tempScanlineBuffer = newSubScreenTempScanlineBuffer; + MainScreen.gpu->bgPixels = newMainScreenBGPixels; + SubScreen.gpu->bgPixels = newSubScreenBGPixels; + GPU_screen = newGPUScreenPtr; + gfx3d_colorRGBA6665 = newColorRGBA6665Buffer; + gfx3d_colorRGBA5551 = newColorRGBA5551; + + MainScreen.offset = (MainScreen.offset == 0) ? 0 : h; + SubScreen.offset = (SubScreen.offset == 0) ? 0 : h; + + CurrentRenderer->SetFramebufferSize(w, h); + + free_aligned(oldGPUScreenPtr); + free_aligned(oldColorRGBA6665Buffer); + free_aligned(oldColorRGBA5551Buffer); + free_aligned(oldWinEmptyPtr); + free_aligned(oldMainScreenHWin0); + free_aligned(oldMainScreenHWin1); + free_aligned(oldSubScreenHWin0); + free_aligned(oldSubScreenHWin1); + free_aligned(oldMainScreenTempScanlineBuffer); + free_aligned(oldSubScreenTempScanlineBuffer); + free_aligned(oldMainScreenBGPixels); + free_aligned(oldSubScreenBGPixels); } /*****************************************************************************/ @@ -2112,17 +2145,11 @@ void GPU_set_DISPCAPCNT(u32 val) static void GPU_RenderLine_layer(GPU *gpu, const u16 l) { - CACHE_ALIGN u16 spr[GPU_FRAMEBUFFER_NATIVE_WIDTH]; - CACHE_ALIGN u8 sprAlpha[GPU_FRAMEBUFFER_NATIVE_WIDTH]; - CACHE_ALIGN u8 sprType[GPU_FRAMEBUFFER_NATIVE_WIDTH]; - CACHE_ALIGN u8 sprPrio[GPU_FRAMEBUFFER_NATIVE_WIDTH]; - const size_t pixCount = _gpuFramebufferWidth * _gpuDstLineCount[l]; u16 *dstLine = gpu->currDst; struct _DISPCNT *dispCnt = &(gpu->dispx_st)->dispx_DISPCNT.bits; itemsForPriority_t *item; - bool BG_enabled = true; gpu->currentFadeInColors = &fadeInColors[gpu->BLDY_EVY][0]; gpu->currentFadeOutColors = 
&fadeOutColors[gpu->BLDY_EVY][0]; @@ -2163,9 +2190,9 @@ PLAIN_CLEAR: memset(gpu->bgPixels, 5, pixCount); // init background color & priorities - memset(sprAlpha, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); - memset(sprType, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); - memset(sprPrio, 0xFF, GPU_FRAMEBUFFER_NATIVE_WIDTH); + memset(gpu->sprAlpha, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); + memset(gpu->sprType, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); + memset(gpu->sprPrio, 0xFF, GPU_FRAMEBUFFER_NATIVE_WIDTH); memset(sprWin, 0, _gpuFramebufferWidth); // init pixels priorities @@ -2179,18 +2206,18 @@ PLAIN_CLEAR: if (gpu->LayersEnable[4]) { //n.b. - this is clearing the sprite line buffer to the background color, - memset_u16(spr, backdrop_color, GPU_FRAMEBUFFER_NATIVE_WIDTH); + memset_u16(gpu->sprColor, backdrop_color, GPU_FRAMEBUFFER_NATIVE_WIDTH); //zero 06-may-09: I properly supported window color effects for backdrop, but I am not sure //how it interacts with this. I wish we knew why we needed this - gpu->spriteRender(spr, sprAlpha, sprType, sprPrio); - mosaicSpriteLine(gpu, l, spr, sprAlpha, sprType, sprPrio); + gpu->spriteRender(gpu->sprColor, gpu->sprAlpha, gpu->sprType, gpu->sprPrio); + mosaicSpriteLine(gpu, l, gpu->sprColor, gpu->sprAlpha, gpu->sprType, gpu->sprPrio); for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i++) { // assign them to the good priority item - const size_t prio = sprPrio[i]; + const size_t prio = gpu->sprPrio[i]; if (prio >= 4) continue; item = &(gpu->itemsForPriority[prio]); @@ -2198,12 +2225,11 @@ PLAIN_CLEAR: item->nbPixelsX++; } } - - if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3]) - BG_enabled = false; for (size_t j = 0; j < 8; j++) gpu->blend2[j] = (gpu->BLDCNT & (0x100 << j)) != 0; + + const bool BG_enabled = gpu->LayersEnable[0] || gpu->LayersEnable[1] || gpu->LayersEnable[2] || gpu->LayersEnable[3]; // paint lower priorities first // then higher priorities on top @@ -2225,41 +2251,35 @@ PLAIN_CLEAR: struct 
_BGxCNT *bgCnt = &(gpu->dispx_st)->dispx_BGxCNT[layerNum].bits; gpu->curr_mosaic_enabled = bgCnt->Mosaic_Enable; - if (gpu->core == GPUCOREID_MAIN) + if (gpu->core == GPUCOREID_MAIN && layerNum == 0 && dispCnt->BG0_3D) { - if (layerNum == 0 && dispCnt->BG0_3D) + gpu->currBgNum = 0; + + const u16 hofs = (u16)( ((float)gpu->getHOFS(layerNum) * _gpuWidthScale) + 0.5f ); + + for (size_t line = 0; line < _gpuDstLineCount[l]; line++) { - gpu->currBgNum = 0; + const FragmentColor *srcLine = gfx3d_GetLineDataRGBA6665(_gpuDstLineIndex[l] + line); - const u16 hofs = (u16)( ((float)gpu->getHOFS(layerNum) * _gpuWidthScale) + 0.5f ); - u16 *oldDstLine = dstLine; - - for (size_t lineIndex = 0; lineIndex < _gpuDstLineCount[l]; lineIndex++, dstLine += _gpuFramebufferWidth) + for (size_t dstX = 0; dstX < _gpuFramebufferWidth; dstX++) { - const FragmentColor *colorLine = gfx3d_GetLineDataRGBA6665(_gpuDstLineIndex[l] + lineIndex); - - for (size_t k = 0; k < _gpuFramebufferWidth; k++) + size_t srcX = dstX + hofs; + if (srcX >= _gpuFramebufferWidth * 2) { - size_t q = k + hofs; - if (q >= _gpuFramebufferWidth * 2) - { - q -= _gpuFramebufferWidth * 2; - } - - if (q >= _gpuFramebufferWidth || colorLine[q].a == 0) - continue; - - gpu->setFinalColor3d(k, - dstLine[k], - gpu->bgPixels + (lineIndex * _gpuFramebufferWidth), - colorLine[q]); + srcX -= _gpuFramebufferWidth * 2; } + + if (srcX >= _gpuFramebufferWidth || srcLine[srcX].a == 0) + continue; + + gpu->setFinalColor3d(dstX, + dstLine + (line * _gpuFramebufferWidth), + gpu->bgPixels + (line * _gpuFramebufferWidth), + srcLine[srcX]); } - - dstLine = oldDstLine; - - continue; } + + continue; } //useful for debugging individual layers @@ -2288,13 +2308,12 @@ PLAIN_CLEAR: { for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) { - setFinalColorSpr(gpu, - gpu->currDst + (line * _gpuFramebufferWidth), - gpu->bgPixels + (line * _gpuFramebufferWidth), - spr[x], - sprAlpha[x], - sprType[x], - _gpuDstPitchIndex[x] + p); + 
gpu->setFinalColorSpr(_gpuDstPitchIndex[x] + p, + gpu->currDst + (line * _gpuFramebufferWidth), + gpu->bgPixels + (line * _gpuFramebufferWidth), + gpu->sprColor[x], + gpu->sprAlpha[x], + gpu->sprType[x]); } } } @@ -2302,7 +2321,7 @@ PLAIN_CLEAR: } } -template static void GPU_RenderLine_DispCapture(u16 l) +template static void GPU_RenderLine_DispCapture(const u16 l) { //this macro takes advantage of the fact that there are only two possible values for capx #define CAPCOPY(SRC, DST, SETALPHABIT) \ @@ -2373,7 +2392,7 @@ template static void GPU_RenderLine_DispCapture(u16 l) case 0: // Capture screen (BG + OBJ + 3D) { //INFO("Capture screen (BG + OBJ + 3D)\n"); - const u16 *src = gpu->tempScanline; + const u16 *src = gpu->currDst; CAPCOPY(src, cap_dst, true); } break; @@ -2381,8 +2400,8 @@ template static void GPU_RenderLine_DispCapture(u16 l) case 1: // Capture 3D { //INFO("Capture 3D\n"); - const u16 *colorLine = gfx3d_GetLineDataRGBA5551(l); - CAPCOPY(colorLine, cap_dst, false); + const u16 *src = gfx3d_GetLineDataRGBA5551(l); + CAPCOPY(src, cap_dst, false); } break; } @@ -2418,7 +2437,7 @@ template static void GPU_RenderLine_DispCapture(u16 l) if (gpu->dispCapCnt.srcA == 0) { // Capture screen (BG + OBJ + 3D) - srcA = gpu->tempScanline; + srcA = gpu->currDst; } else { @@ -2512,10 +2531,35 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod { if (factor < 16) { - for (size_t i = 0; i < pixCount; ++i) +#ifdef ENABLE_SSE2 + static size_t ssePixCount = pixCount - (pixCount % 4); + static const __m128i colorMask = _mm_set1_epi16(0x7FFF); + + for (size_t i = 0; i < ssePixCount; i += 8) { - dstLine[i] = fadeInColors[factor][dstLine[i]&0x7FFF]; + __m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i)); + dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask); + + dstLine[i+7] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ]; + dstLine[i+6] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ]; + 
dstLine[i+5] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 5) ]; + dstLine[i+4] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 4) ]; + dstLine[i+3] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 3) ]; + dstLine[i+2] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 2) ]; + dstLine[i+1] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ]; + dstLine[i+0] = fadeInColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ]; } + + for (size_t i = ssePixCount; i < pixCount; i++) + { + dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ]; + } +#else + for (size_t i = 0; i < pixCount; i++) + { + dstLine[i] = fadeInColors[factor][ dstLine[i] & 0x7FFF ]; + } +#endif } else { @@ -2529,10 +2573,35 @@ static INLINE void GPU_RenderLine_MasterBrightness(const GPUMasterBrightMode mod { if (factor < 16) { - for (size_t i = 0; i < pixCount; ++i) +#ifdef ENABLE_SSE2 + static size_t ssePixCount = pixCount - (pixCount % 4); + static const __m128i colorMask = _mm_set1_epi16(0x7FFF); + + for (size_t i = 0; i < ssePixCount; i += 8) { - dstLine[i] = fadeOutColors[factor][dstLine[i]&0x7FFF]; + __m128i dstColor_vec128 = _mm_load_si128((__m128i *)(dstLine + i)); + dstColor_vec128 = _mm_and_si128(dstColor_vec128, colorMask); + + dstLine[i+7] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 7) ]; + dstLine[i+6] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 6) ]; + dstLine[i+5] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 5) ]; + dstLine[i+4] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 4) ]; + dstLine[i+3] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 3) ]; + dstLine[i+2] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 2) ]; + dstLine[i+1] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 1) ]; + dstLine[i+0] = fadeOutColors[factor][ _mm_extract_epi16(dstColor_vec128, 0) ]; } + + for (size_t i = ssePixCount; i < pixCount; i++) + { + dstLine[i] = 
fadeOutColors[factor][ dstLine[i] & 0x7FFF ]; + } +#else + for (size_t i = 0; i < pixCount; i++) + { + dstLine[i] = fadeOutColors[factor][ dstLine[i] & 0x7FFF ]; + } +#endif } else { @@ -2695,12 +2764,12 @@ void GPU_RenderLine(NDS_Screen *screen, const u16 l, bool skip) if (gpu->dispMode == GPUDisplayMode_Normal) { //optimization: render straight to the output buffer when thats what we are going to end up displaying anyway - gpu->tempScanline = gpu->currDst = dstLine; + gpu->currDst = dstLine; } else { //otherwise, we need to go to a temp buffer - gpu->tempScanline = gpu->currDst = gpu->tempScanlineBuffer; + gpu->currDst = gpu->tempScanlineBuffer; } GPU_RenderLine_layer(gpu, l); @@ -2731,14 +2800,16 @@ void GPU_RenderLine(NDS_Screen *screen, const u16 l, bool skip) { const u16 color = LE_TO_LOCAL_16(src[x]); - for (size_t line = 0; line < _gpuDstLineCount[l]; line++) + for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) { - for (size_t p = 0; p < _gpuDstPitchCount[x]; p++) - { - dstLine[(line * _gpuFramebufferWidth) + (_gpuDstPitchIndex[x] + p)] = color; - } + dstLine[_gpuDstPitchIndex[x] + p] = color; } } + + for (size_t line = 1; line < dstLineCount; line++) + { + memcpy(dstLine + (line * _gpuFramebufferWidth), dstLine, _gpuFramebufferWidth * sizeof(u16)); + } } } break; @@ -2751,6 +2822,22 @@ void GPU_RenderLine(NDS_Screen *screen, const u16 l, bool skip) { ((u32 *)dstLine)[i] = DISP_FIFOrecv() & 0x7FFF7FFF; } + + if (_gpuFramebufferWidth != GPU_FRAMEBUFFER_NATIVE_WIDTH) + { + for (size_t i = GPU_FRAMEBUFFER_NATIVE_WIDTH - 1; i < GPU_FRAMEBUFFER_NATIVE_WIDTH; i--) + { + for (size_t p = _gpuDstPitchCount[i] - 1; p < _gpuDstPitchCount[i]; p--) + { + dstLine[_gpuDstPitchIndex[i] + p] = dstLine[i]; + } + } + + for (size_t line = 1; line < dstLineCount; line++) + { + memcpy(dstLine + (line * _gpuFramebufferWidth), dstLine, _gpuFramebufferWidth * sizeof(u16)); + } + } } break; } @@ -2875,12 +2962,12 @@ void GPU::refreshAffineStartRegs(const int num, const int xy) 
return; } - BGxPARMS *parms = (num == 2) ? &(dispx_st)->dispx_BG2PARMS : &(dispx_st)->dispx_BG3PARMS; + BGxPARMS *params = (num == 2) ? &(dispx_st)->dispx_BG2PARMS : &(dispx_st)->dispx_BG3PARMS; if (xy == 0) - parms->BGxX = affineInfo[num-2].x; + params->BGxX = affineInfo[num-2].x; else - parms->BGxY = affineInfo[num-2].y; + params->BGxY = affineInfo[num-2].y; } template void GPU::modeRender(const size_t layer) diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index b3ee85bca..75c323aab 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -621,6 +621,11 @@ struct GPU //this indicates whether this gpu is handling debug tools bool debug; + CACHE_ALIGN u16 sprColor[GPU_FRAMEBUFFER_NATIVE_WIDTH]; + CACHE_ALIGN u8 sprAlpha[GPU_FRAMEBUFFER_NATIVE_WIDTH]; + CACHE_ALIGN u8 sprType[GPU_FRAMEBUFFER_NATIVE_WIDTH]; + CACHE_ALIGN u8 sprPrio[GPU_FRAMEBUFFER_NATIVE_WIDTH]; + _BGxCNT & bgcnt(int num) { return (dispx_st)->dispx_BGxCNT[num].bits; } _DISPCNT & dispCnt() { return dispx_st->dispx_DISPCNT.bits; } template void modeRender(const size_t layer); @@ -705,14 +710,8 @@ struct GPU u8 BLDY_EVY; u16 *currentFadeInColors, *currentFadeOutColors; bool blend2[8]; - - //this should be suitably aligned for SSE2 (32bytes) + u16 *tempScanlineBuffer; - //this is the raw unadjusted pointer - u16 *tempScanlineBufferRaw; - - u16 *tempScanline; - GPUMasterBrightMode MasterBrightMode; u32 MasterBrightFactor; @@ -748,11 +747,14 @@ struct GPU u16 blend(const u16 colA, const u16 colB); template - FORCEINLINE FASTCALL bool _master_setFinalBGColor(const u16 *dstLine, const u8 *bgPixelsLine, u16 &outColor, const size_t dstX); + FORCEINLINE FASTCALL bool _master_setFinalBGColor(const size_t dstX, const u16 *dstLine, const u8 *bgPixelsLine, u16 &outColor); template - FORCEINLINE FASTCALL void _master_setFinal3dColor(const size_t dstX, u16 &outDst, u8 *bgPixelsLine, const FragmentColor src); - + FORCEINLINE FASTCALL void _master_setFinal3dColor(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, 
const FragmentColor src); + + template + FORCEINLINE FASTCALL void _master_setFinalOBJColor(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, const u16 src, const u8 alpha, const u8 type); + int setFinalColorBck_funcNum; int bgFunc; int setFinalColor3d_funcNum; @@ -774,9 +776,10 @@ struct GPU } - void setFinalColor3d(const size_t dstX, u16 &outDst, u8 *bgPixelsLine, const FragmentColor src); + void setFinalColor3d(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, const FragmentColor src); + void setFinalColorSpr(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, const u16 src, const u8 alpha, const u8 type); - template void setFinalColorBG(u16 *dstLine, u8 *bgPixelsLine, u16 color, const size_t dstX); + template void setFinalColorBG(const size_t dstX, u16 *dstLine, u8 *bgPixelsLine, u16 src); template FORCEINLINE void __setFinalColorBck(u16 color, const size_t srcX, const bool opaque); template FORCEINLINE void ___setFinalColorBck(u16 color, const size_t srcX, const bool opaque); diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp index f899c3a65..343188821 100644 --- a/desmume/src/NDSSystem.cpp +++ b/desmume/src/NDSSystem.cpp @@ -2504,7 +2504,6 @@ void NDS_Reset() memcpy(&TSCal, firmware->getTouchCalibrate(), sizeof(TSCalInfo)); Screen_Reset(); - gfx3d_reset(); WIFI_Reset(); memcpy(FW_Mac, (MMU.fw.data + 0x36), 6); diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 0d4218284..87f80de83 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -23,6 +23,7 @@ #include #include +#include "common.h" #include "debug.h" #include "gfx3d.h" #include "NDSSystem.h" @@ -892,78 +893,73 @@ void OpenGLRenderer::SetVersion(unsigned int major, unsigned int minor, unsigned this->versionRevision = revision; } -#if defined(ENABLE_SSE2) && defined(ENABLE_SSSE3) && defined(LOCAL_LE) +#if defined(ENABLE_SSSE3) && defined(LOCAL_LE) Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict 
dstRGBA5551) { // Convert from 32-bit BGRA8888 format to 32-bit RGBA6665 reversed format. OpenGL // stores pixels using a flipped Y-coordinate, so this needs to be flipped back // to the DS Y-coordinate. - if ((this->_framebufferWidth % 4) == 0) + const size_t pixCount = this->_framebufferWidth; + const size_t ssePixCount = pixCount - (pixCount % 4); + + for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) { - for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) + for (size_t x = 0; x < ssePixCount; x+=4, ir+=4, iw+=4) { - for (size_t x = 0; x < this->_framebufferWidth; x+=4, ir+=4, iw+=4) - { - // Convert to RGBA6665 - __m128i v = _mm_load_si128((__m128i *)(this->_framebufferColor + ir)); - v = _mm_srli_epi32(v, 2); - - __m128i a = _mm_srli_epi32(v, 1); // Special handling for 5-bit alpha - a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000)); - - v = _mm_and_si128(v, _mm_set1_epi32(0x003F3F3F)); - v = _mm_or_si128(v, a); - v = _mm_shuffle_epi8(v, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA - _mm_store_si128((__m128i *)(dstRGBA6665 + iw), v); - - // Convert to RGBA5551 - v = _mm_load_si128((__m128i *)(this->_framebufferColor + ir)); - - __m128i b = _mm_and_si128(v, _mm_set1_epi32(0x000000F8)); // Read from R - b = _mm_slli_epi32(b, 7); // Shift to B - - __m128i g = _mm_and_si128(v, _mm_set1_epi32(0x0000F800)); // Read from G - g = _mm_srli_epi32(g, 6); // Shift in G - - __m128i r = _mm_and_si128(v, _mm_set1_epi32(0x00F80000)); // Read from B - r = _mm_srli_epi32(r, 19); // Shift to R - - a = _mm_and_si128(v, _mm_set1_epi32(0xFF000000)); // Read from A - a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A - a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A - - v = b; - v = 
_mm_add_epi32(v, g); - v = _mm_add_epi32(v, r); - v = _mm_add_epi32(v, a); - - // All the colors are currently placed every other 16 bits, so we need to swizzle them - // to the lower 64 bits of our vector before we store them back to memory. - v = _mm_shuffle_epi8(v, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); - _mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), v); - } + // Convert to RGBA6665 + __m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir)); + color = _mm_srli_epi32(color, 2); + + __m128i a = _mm_srli_epi32(color, 1); // Special handling for 5-bit alpha + a = _mm_and_si128(a, _mm_set1_epi32(0x1F000000)); + + color = _mm_and_si128(color, _mm_set1_epi32(0x003F3F3F)); + color = _mm_or_si128(color, a); + color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2)); // Swizzle RGBA to BGRA + _mm_store_si128((__m128i *)(dstRGBA6665 + iw), color); + + // Convert to RGBA5551 + color = _mm_load_si128((__m128i *)(this->_framebufferColor + ir)); + + __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x000000F8)); // Read from R + b = _mm_slli_epi32(b, 7); // Shift to B + + __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x0000F800)); // Read from G + g = _mm_srli_epi32(g, 6); // Shift in G + + __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x00F80000)); // Read from B + r = _mm_srli_epi32(r, 19); // Shift to R + + a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A + a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A + a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A + + color = b; + color = _mm_or_si128(color, g); + color = _mm_or_si128(color, r); + color = _mm_or_si128(color, a); + + // All the colors are currently placed every other 16 bits, so we need to swizzle them + // to the lower 64 bits of our vector before we store them back to memory. 
+ color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + _mm_storel_epi64((__m128i *)(dstRGBA5551 + iw), color); } - } - else - { - for (size_t y = 0, ir = 0, iw = ((this->_framebufferHeight - 1) * this->_framebufferWidth); y < this->_framebufferHeight; y++, iw -= (this->_framebufferWidth * 2)) + + for (size_t x = ssePixCount; x < pixCount; x++, ir++, iw++) { - for (size_t x = 0; x < this->_framebufferWidth; x++, ir++, iw++) - { - dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color); - dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F, - (this->_framebufferColor[ir].g >> 3) & 0x1F, - (this->_framebufferColor[ir].r >> 3) & 0x1F) | - ((this->_framebufferColor[ir].a == 0) ? 0x0000 : 0x8000); - } + dstRGBA6665[iw].color = BGRA8888_32Rev_To_RGBA6665_32Rev(this->_framebufferColor[ir].color); + dstRGBA5551[iw] = R5G5B5TORGB15((this->_framebufferColor[ir].b >> 3) & 0x1F, + (this->_framebufferColor[ir].g >> 3) & 0x1F, + (this->_framebufferColor[ir].r >> 3) & 0x1F) | + ((this->_framebufferColor[ir].a == 0) ? 
0x0000 : 0x8000); } } return RENDER3DERROR_NOERR; } -#else // Code path where SSE2, SSSE3, or little-endian is not supported +#else // Code path where SSSE3 or little-endian is not supported Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551) { @@ -995,7 +991,7 @@ Render3DError OpenGLRenderer::FlushFramebuffer(FragmentColor *__restrict dstRGBA return RENDER3DERROR_NOERR; } -#endif // defined(ENABLE_SSE2) && defined(ENABLE_SSSE3) && defined(LOCAL_LE) +#endif // defined(ENABLE_SSSE3) && defined(LOCAL_LE) OpenGLRenderer_1_2::~OpenGLRenderer_1_2() { @@ -1487,7 +1483,7 @@ Render3DError OpenGLRenderer_1_2::CreateFBOs() glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthStencilID); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); @@ -1495,28 +1491,28 @@ Render3DError OpenGLRenderer_1_2::CreateFBOs() glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_MODE, GL_NONE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8_EXT, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_DEPTH_STENCIL_EXT, GL_UNSIGNED_INT_24_8_EXT, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8_EXT, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_DEPTH_STENCIL_EXT, GL_UNSIGNED_INT_24_8_EXT, NULL); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthID); 
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIPolyID); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIFogAttrID); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); glBindTexture(GL_TEXTURE_2D, 0); @@ -1891,13 +1887,13 @@ Render3DError OpenGLRenderer_1_2::DestroyToonTable() return OGLERROR_NOERR; } -Render3DError OpenGLRenderer_1_2::UploadClearImage(const u16 
*__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) +Render3DError OpenGLRenderer_1_2::UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) { OGLRenderRef &OGLRef = *this->ref; if (this->isShaderSupported) { - for (size_t i = 0; i < GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT; i++) + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++) { OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8; OGLRef.workingCIDepthBuffer[i] = depthBuffer[i] | 0xFF000000; @@ -1916,20 +1912,20 @@ Render3DError OpenGLRenderer_1_2::UploadClearImage(const u16 *__restrict colorBu glActiveTextureARB(GL_TEXTURE0_ARB); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIColorID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, colorBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, colorBuffer); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthStencilID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_DEPTH_STENCIL_EXT, GL_UNSIGNED_INT_24_8_EXT, OGLRef.workingCIDepthStencilBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_DEPTH_STENCIL_EXT, GL_UNSIGNED_INT_24_8_EXT, OGLRef.workingCIDepthStencilBuffer); if (this->isShaderSupported) { glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIDepthBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, 
OGLRef.workingCIDepthBuffer); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIFogAttrID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIFogAttributesBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIFogAttributesBuffer); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIPolyID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIPolyIDBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIPolyIDBuffer); } glBindTexture(GL_TEXTURE_2D, 0); @@ -2371,7 +2367,7 @@ Render3DError OpenGLRenderer_1_2::UpdateToonTable(const u16 *toonTableBuffer) return OGLERROR_NOERR; } -Render3DError OpenGLRenderer_1_2::ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) +Render3DError OpenGLRenderer_1_2::ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) { if (!this->isFBOSupported) { @@ -2397,22 +2393,22 @@ Render3DError OpenGLRenderer_1_2::ClearUsingImage(const u16 *__restrict colorBuf // Blit the working depth buffer glReadBuffer(GL_COLOR_ATTACHMENT1_EXT); glDrawBuffer(GL_COLOR_ATTACHMENT1_EXT); - glBlitFramebufferEXT(0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); + glBlitFramebufferEXT(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); // Blit the polygon ID buffer 
glReadBuffer(GL_COLOR_ATTACHMENT2_EXT); glDrawBuffer(GL_COLOR_ATTACHMENT2_EXT); - glBlitFramebufferEXT(0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); + glBlitFramebufferEXT(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); // Blit the fog buffer glReadBuffer(GL_COLOR_ATTACHMENT3_EXT); glDrawBuffer(GL_COLOR_ATTACHMENT3_EXT); - glBlitFramebufferEXT(0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); + glBlitFramebufferEXT(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); // Blit the color buffer. Do this last so that color attachment 0 is set to the read FBO. glReadBuffer(GL_COLOR_ATTACHMENT0_EXT); glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT); - glBlitFramebufferEXT(0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT, GL_NEAREST); + glBlitFramebufferEXT(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT, GL_NEAREST); glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, OGLRef.fboRenderID); glDrawBuffers(4, RenderDrawList); @@ -2485,7 +2481,7 @@ Render3DError OpenGLRenderer_1_2::ClearUsingValues(const FragmentColor &clearCol glClear(GL_COLOR_BUFFER_BIT); glDrawBuffer(GL_COLOR_ATTACHMENT3_EXT); // texGFogAttrID - glClearColor((clearAttributes.isFogged) ? 
1.0 : 0.0, 0.0, 0.0, 1.0); + glClearColor(clearAttributes.isFogged, 0.0, 0.0, 1.0); glClear(GL_COLOR_BUFFER_BIT); glDrawBuffers(4, RenderDrawList); @@ -2682,8 +2678,8 @@ Render3DError OpenGLRenderer_1_2::SetupTexture(const POLY &thePoly, bool enableT Render3DError OpenGLRenderer_1_2::SetupViewport(const u32 viewportValue) { - const GLfloat wScalar = this->_framebufferWidth / GFX3D_FRAMEBUFFER_WIDTH; - const GLfloat hScalar = this->_framebufferHeight / GFX3D_FRAMEBUFFER_HEIGHT; + const GLfloat wScalar = this->_framebufferWidth / GPU_FRAMEBUFFER_NATIVE_WIDTH; + const GLfloat hScalar = this->_framebufferHeight / GPU_FRAMEBUFFER_NATIVE_HEIGHT; VIEWPORT viewport; viewport.decode(viewportValue); @@ -2782,21 +2778,11 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h) { OGLRenderRef &OGLRef = *this->ref; - if (w < GFX3D_FRAMEBUFFER_WIDTH || h < GFX3D_FRAMEBUFFER_HEIGHT) + if (w < GPU_FRAMEBUFFER_NATIVE_WIDTH || h < GPU_FRAMEBUFFER_NATIVE_HEIGHT) { return OGLERROR_NOERR; } - this->_framebufferWidth = w; - this->_framebufferHeight = h; - this->_framebufferColorSizeBytes = w * h * sizeof(FragmentColor); - this->_framebufferColor = (FragmentColor *)realloc(this->_framebufferColor, this->_framebufferColorSizeBytes); - - if (oglrender_framebufferDidResizeCallback != NULL) - { - oglrender_framebufferDidResizeCallback(w, h); - } - if (this->isFBOSupported) { glActiveTextureARB(GL_TEXTURE0_ARB + OGLTextureUnitID_GColor); @@ -2836,11 +2822,28 @@ Render3DError OpenGLRenderer_1_2::SetFramebufferSize(size_t w, size_t h) glRenderbufferStorageMultisampleEXT(GL_RENDERBUFFER_EXT, maxSamples, GL_DEPTH24_STENCIL8_EXT, w, h); } + const size_t newFramebufferColorSizeBytes = w * h * sizeof(FragmentColor); + FragmentColor *oldFramebufferColor = this->_framebufferColor; + FragmentColor *newFramebufferColor = (FragmentColor *)malloc_alignedCacheLine(newFramebufferColorSizeBytes); + memset(newFramebufferColor, 0, newFramebufferColorSizeBytes); + if (this->isPBOSupported) { 
- glBufferData(GL_PIXEL_PACK_BUFFER_ARB, this->_framebufferColorSizeBytes, NULL, GL_STREAM_READ); + glBufferData(GL_PIXEL_PACK_BUFFER_ARB, newFramebufferColorSizeBytes, newFramebufferColor, GL_STREAM_READ); } + this->_framebufferWidth = w; + this->_framebufferHeight = h; + this->_framebufferColorSizeBytes = newFramebufferColorSizeBytes; + this->_framebufferColor = newFramebufferColor; + + if (oglrender_framebufferDidResizeCallback != NULL) + { + oglrender_framebufferDidResizeCallback(w, h); + } + + free_aligned(oldFramebufferColor); + return OGLERROR_NOERR; } @@ -2874,13 +2877,13 @@ Render3DError OpenGLRenderer_1_3::UpdateToonTable(const u16 *toonTableBuffer) return OGLERROR_NOERR; } -Render3DError OpenGLRenderer_1_3::UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) +Render3DError OpenGLRenderer_1_3::UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) { OGLRenderRef &OGLRef = *this->ref; if (this->isShaderSupported) { - for (size_t i = 0; i < GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT; i++) + for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++) { OGLRef.workingCIDepthStencilBuffer[i] = depthBuffer[i] << 8; OGLRef.workingCIDepthBuffer[i] = depthBuffer[i] | 0xFF000000; @@ -2899,20 +2902,20 @@ Render3DError OpenGLRenderer_1_3::UploadClearImage(const u16 *__restrict colorBu glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIColorID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, colorBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, colorBuffer); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthStencilID); - 
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_DEPTH_STENCIL_EXT, GL_UNSIGNED_INT_24_8_EXT, OGLRef.workingCIDepthStencilBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_DEPTH_STENCIL_EXT, GL_UNSIGNED_INT_24_8_EXT, OGLRef.workingCIDepthStencilBuffer); if (this->isShaderSupported) { glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIDepthBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIDepthBuffer); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIFogAttrID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIFogAttributesBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIFogAttributesBuffer); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIPolyID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIPolyIDBuffer); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIPolyIDBuffer); } glBindTexture(GL_TEXTURE_2D, 0); @@ -2924,21 +2927,11 @@ Render3DError OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h) { OGLRenderRef &OGLRef = *this->ref; - if (w < GFX3D_FRAMEBUFFER_WIDTH || h < GFX3D_FRAMEBUFFER_HEIGHT) + if (w < GPU_FRAMEBUFFER_NATIVE_WIDTH || h < GPU_FRAMEBUFFER_NATIVE_HEIGHT) { return OGLERROR_NOERR; } - this->_framebufferWidth = w; - this->_framebufferHeight = h; - 
this->_framebufferColorSizeBytes = w * h * sizeof(FragmentColor); - this->_framebufferColor = (FragmentColor *)realloc(this->_framebufferColor, this->_framebufferColorSizeBytes); - - if (oglrender_framebufferDidResizeCallback != NULL) - { - oglrender_framebufferDidResizeCallback(w, h); - } - if (this->isFBOSupported) { glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor); @@ -2978,11 +2971,28 @@ Render3DError OpenGLRenderer_1_3::SetFramebufferSize(size_t w, size_t h) glRenderbufferStorageMultisampleEXT(GL_RENDERBUFFER_EXT, maxSamples, GL_DEPTH24_STENCIL8_EXT, w, h); } + const size_t newFramebufferColorSizeBytes = w * h * sizeof(FragmentColor); + FragmentColor *oldFramebufferColor = this->_framebufferColor; + FragmentColor *newFramebufferColor = (FragmentColor *)malloc_alignedCacheLine(newFramebufferColorSizeBytes); + memset(newFramebufferColor, 0, newFramebufferColorSizeBytes); + if (this->isPBOSupported) { - glBufferData(GL_PIXEL_PACK_BUFFER_ARB, this->_framebufferColorSizeBytes, NULL, GL_STREAM_READ); + glBufferData(GL_PIXEL_PACK_BUFFER_ARB, newFramebufferColorSizeBytes, newFramebufferColor, GL_STREAM_READ); } + this->_framebufferWidth = w; + this->_framebufferHeight = h; + this->_framebufferColorSizeBytes = newFramebufferColorSizeBytes; + this->_framebufferColor = newFramebufferColor; + + if (oglrender_framebufferDidResizeCallback != NULL) + { + oglrender_framebufferDidResizeCallback(w, h); + } + + free_aligned(oldFramebufferColor); + return OGLERROR_NOERR; } diff --git a/desmume/src/OGLRender.h b/desmume/src/OGLRender.h index 92a634018..1a68cbc90 100644 --- a/desmume/src/OGLRender.h +++ b/desmume/src/OGLRender.h @@ -489,10 +489,10 @@ struct OGLRenderRef // Client-side Buffers GLfloat *color4fBuffer; GLushort *vertIndexBuffer; - CACHE_ALIGN GLuint workingCIDepthBuffer[GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT]; - CACHE_ALIGN GLuint workingCIDepthStencilBuffer[GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT]; - CACHE_ALIGN GLuint 
workingCIFogAttributesBuffer[GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT]; - CACHE_ALIGN GLuint workingCIPolyIDBuffer[GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT]; + CACHE_ALIGN GLuint workingCIDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; + CACHE_ALIGN GLuint workingCIDepthStencilBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; + CACHE_ALIGN GLuint workingCIFogAttributesBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; + CACHE_ALIGN GLuint workingCIPolyIDBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; // Vertex Attributes Pointers GLvoid *vtxPtrPosition; @@ -552,7 +552,13 @@ FORCEINLINE u32 BGRA8888_32_To_RGBA6665_32(const u32 srcPix); FORCEINLINE u32 BGRA8888_32Rev_To_RGBA6665_32Rev(const u32 srcPix); bool IsVersionSupported(unsigned int checkVersionMajor, unsigned int checkVersionMinor, unsigned int checkVersionRevision); +#if defined(ENABLE_SSSE3) +class OpenGLRenderer : public Render3D_SSSE3 +#elif defined(ENABLE_SSE2) +class OpenGLRenderer : public Render3D_SSE2 +#else class OpenGLRenderer : public Render3D +#endif { private: // Driver's OpenGL Version @@ -608,7 +614,7 @@ protected: virtual Render3DError InitGeometryProgramShaderLocations() = 0; virtual Render3DError CreateToonTable() = 0; virtual Render3DError DestroyToonTable() = 0; - virtual Render3DError UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) = 0; + virtual Render3DError UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) = 0; virtual void GetExtensionSet(std::set *oglExtensionSet) = 0; virtual Render3DError ExpandFreeTextures() = 0; @@ -665,7 +671,7 @@ protected: virtual Render3DError CreateToonTable(); virtual Render3DError DestroyToonTable(); - virtual Render3DError 
UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); + virtual Render3DError UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual void GetExtensionSet(std::set *oglExtensionSet); virtual Render3DError ExpandFreeTextures(); @@ -679,7 +685,7 @@ protected: virtual Render3DError RenderGeometry(const GFX3D_State &renderState, const POLYLIST *polyList, const INDEXLIST *indexList); virtual Render3DError EndRender(const u64 frameCount); - virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); + virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; virtual void SetPolygonIndex(const size_t index); @@ -703,7 +709,7 @@ class OpenGLRenderer_1_3 : public OpenGLRenderer_1_2 { protected: virtual Render3DError CreateToonTable(); - virtual Render3DError UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); + virtual Render3DError UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); public: virtual Render3DError UpdateToonTable(const u16 *toonTableBuffer); diff --git a/desmume/src/OGLRender_3_2.cpp b/desmume/src/OGLRender_3_2.cpp index 27e6280b2..ba9e4b193 100644 --- a/desmume/src/OGLRender_3_2.cpp +++ b/desmume/src/OGLRender_3_2.cpp @@ -22,6 +22,7 @@ #include #include +#include "common.h" #include "debug.h" 
#include "gfx3d.h" #include "NDSSystem.h" @@ -660,7 +661,7 @@ Render3DError OpenGLRenderer_3_2::CreateFBOs() glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthStencilID); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); @@ -668,28 +669,28 @@ Render3DError OpenGLRenderer_3_2::CreateFBOs() glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_MODE, GL_NONE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthID); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIPolyID); 
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); glBindTexture(GL_TEXTURE_2D, OGLRef.texCIFogAttrID); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL); glBindTexture(GL_TEXTURE_2D, 0); @@ -1332,7 +1333,7 @@ Render3DError OpenGLRenderer_3_2::UpdateToonTable(const u16 *toonTableBuffer) return OGLERROR_NOERR; } -Render3DError OpenGLRenderer_3_2::ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) +Render3DError OpenGLRenderer_3_2::ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) { OGLRenderRef &OGLRef = *this->ref; @@ -1344,22 +1345,22 @@ Render3DError OpenGLRenderer_3_2::ClearUsingImage(const u16 *__restrict colorBuf // Blit the working depth buffer glReadBuffer(GL_COLOR_ATTACHMENT1); glDrawBuffer(GL_COLOR_ATTACHMENT1); - glBlitFramebuffer(0, 0, 
GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); + glBlitFramebuffer(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); // Blit the polygon ID buffer glReadBuffer(GL_COLOR_ATTACHMENT2); glDrawBuffer(GL_COLOR_ATTACHMENT2); - glBlitFramebuffer(0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); + glBlitFramebuffer(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); // Blit the fog buffer glReadBuffer(GL_COLOR_ATTACHMENT3); glDrawBuffer(GL_COLOR_ATTACHMENT3); - glBlitFramebuffer(0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); + glBlitFramebuffer(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT, GL_NEAREST); // Blit the color buffer. Do this last so that color attachment 0 is set to the read FBO. 
glReadBuffer(GL_COLOR_ATTACHMENT0); glDrawBuffer(GL_COLOR_ATTACHMENT0); - glBlitFramebuffer(0, 0, GFX3D_FRAMEBUFFER_WIDTH, GFX3D_FRAMEBUFFER_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT, GL_NEAREST); + glBlitFramebuffer(0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, 0, this->_framebufferWidth, this->_framebufferHeight, GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT, GL_NEAREST); glBindFramebuffer(GL_FRAMEBUFFER, OGLRef.fboRenderID); glDrawBuffers(4, RenderDrawList); @@ -1408,7 +1409,7 @@ Render3DError OpenGLRenderer_3_2::ClearUsingValues(const FragmentColor &clearCol const GLfloat oglColor[4] = {divide5bitBy31_LUT[clearColor.r], divide5bitBy31_LUT[clearColor.g], divide5bitBy31_LUT[clearColor.b], divide5bitBy31_LUT[clearColor.a]}; const GLfloat oglDepth[4] = {(GLfloat)(clearAttributes.depth & 0x000000FF)/255.0f, (GLfloat)((clearAttributes.depth >> 8) & 0x000000FF)/255.0f, (GLfloat)((clearAttributes.depth >> 16) & 0x000000FF)/255.0f, 1.0}; const GLfloat oglPolyID[4] = {(GLfloat)clearAttributes.opaquePolyID/63.0f, 0.0, 0.0, 1.0}; - const GLfloat oglFogAttr[4] = {(clearAttributes.isFogged) ? 
1.0 : 0.0, 0.0, 0.0, 1.0}; + const GLfloat oglFogAttr[4] = {clearAttributes.isFogged, 0.0, 0.0, 1.0}; glClearBufferfi(GL_DEPTH_STENCIL, 0, (GLfloat)clearAttributes.depth / (GLfloat)0x00FFFFFF, 0); glClearBufferfv(GL_COLOR, 0, oglColor); // texGColorID @@ -1554,21 +1555,11 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h) { OGLRenderRef &OGLRef = *this->ref; - if (w < GFX3D_FRAMEBUFFER_WIDTH || h < GFX3D_FRAMEBUFFER_HEIGHT) + if (w < GPU_FRAMEBUFFER_NATIVE_WIDTH || h < GPU_FRAMEBUFFER_NATIVE_HEIGHT) { return OGLERROR_NOERR; } - this->_framebufferWidth = w; - this->_framebufferHeight = h; - this->_framebufferColorSizeBytes = w * h * sizeof(FragmentColor); - this->_framebufferColor = (FragmentColor *)realloc(this->_framebufferColor, this->_framebufferColorSizeBytes); - - if (oglrender_framebufferDidResizeCallback != NULL) - { - oglrender_framebufferDidResizeCallback(w, h); - } - glActiveTexture(GL_TEXTURE0 + OGLTextureUnitID_GColor); glBindTexture(GL_TEXTURE_2D, OGLRef.texGDepthStencilID); glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, w, h, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL); @@ -1605,7 +1596,24 @@ Render3DError OpenGLRenderer_3_2::SetFramebufferSize(size_t w, size_t h) glRenderbufferStorageMultisample(GL_RENDERBUFFER, maxSamples, GL_DEPTH24_STENCIL8, w, h); } - glBufferData(GL_PIXEL_PACK_BUFFER, this->_framebufferColorSizeBytes, NULL, GL_STREAM_READ); + const size_t newFramebufferColorSizeBytes = w * h * sizeof(FragmentColor); + FragmentColor *oldFramebufferColor = this->_framebufferColor; + FragmentColor *newFramebufferColor = (FragmentColor *)malloc_alignedCacheLine(newFramebufferColorSizeBytes); + memset(newFramebufferColor, 0, newFramebufferColorSizeBytes); + + glBufferData(GL_PIXEL_PACK_BUFFER, newFramebufferColorSizeBytes, newFramebufferColor, GL_STREAM_READ); + + this->_framebufferWidth = w; + this->_framebufferHeight = h; + this->_framebufferColorSizeBytes = newFramebufferColorSizeBytes; + this->_framebufferColor 
= newFramebufferColor; + + if (oglrender_framebufferDidResizeCallback != NULL) + { + oglrender_framebufferDidResizeCallback(w, h); + } + + free_aligned(oldFramebufferColor); return OGLERROR_NOERR; } diff --git a/desmume/src/OGLRender_3_2.h b/desmume/src/OGLRender_3_2.h index 14228f181..e8eff75a8 100644 --- a/desmume/src/OGLRender_3_2.h +++ b/desmume/src/OGLRender_3_2.h @@ -88,7 +88,7 @@ protected: virtual Render3DError CreateToonTable(); virtual Render3DError DestroyToonTable(); virtual Render3DError UpdateToonTable(const u16 *toonTableBuffer); - virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); + virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; virtual void SetPolygonIndex(const size_t index); diff --git a/desmume/src/common.cpp b/desmume/src/common.cpp index 0ef65b71d..66468f685 100644 --- a/desmume/src/common.cpp +++ b/desmume/src/common.cpp @@ -24,6 +24,9 @@ #include #include #include +#include + +static std::map _alignedPtrList; // Key: Aligned pointer / Value: Original pointer char *trim(char *s, int len) { @@ -677,3 +680,66 @@ msgBoxInterface msgBoxFake = { }; msgBoxInterface *msgbox = &msgBoxFake; + +void* malloc_aligned(size_t length, size_t alignment) +{ + const uintptr_t ptrOffset = alignment; // This value must be a power of 2, or this function will fail. 
+ const uintptr_t ptrOffsetMask = ~(ptrOffset - 1); + + void *originalPtr = malloc(length + ptrOffset); + if (originalPtr == NULL) + { + return originalPtr; + } + + void *alignedPtr = (void *)(((uintptr_t)originalPtr + ptrOffset) & ptrOffsetMask); + _alignedPtrList[alignedPtr] = originalPtr; + + return alignedPtr; +} + +void* malloc_aligned16(size_t length) +{ + return malloc_aligned(length, 16); +} + +void* malloc_aligned32(size_t length) +{ + return malloc_aligned(length, 32); +} + +void* malloc_aligned64(size_t length) +{ + return malloc_aligned(length, 64); +} + +void* malloc_alignedCacheLine(size_t length) +{ +#if defined(HOST_32) + return malloc_aligned32(length); +#elif defined(HOST_64) + return malloc_aligned64(length); +#else + return malloc_aligned16(length); +#endif +} + +void free_aligned(void *ptr) +{ + if (ptr == NULL) + { + return; + } + + // If the input pointer is aligned through malloc_aligned(), + // then retrieve the original pointer first. Otherwise, this + // function behaves like the usual free(). 
+ void *originalPtr = ptr; + if (_alignedPtrList.find(ptr) != _alignedPtrList.end()) + { + originalPtr = _alignedPtrList[ptr]; + _alignedPtrList.erase(ptr); + } + + free(originalPtr); +} diff --git a/desmume/src/common.h b/desmume/src/common.h index 66d229949..290f9f6ba 100644 --- a/desmume/src/common.h +++ b/desmume/src/common.h @@ -91,4 +91,11 @@ extern int NDS_WritePNG(const char *fname, u16 *data); extern int NDS_WriteBMP(const char *filename, u16 *data); extern int NDS_WriteBMP_32bppBuffer(int width, int height, const void* buf, const char *filename); +void* malloc_aligned(size_t length, size_t alignment); +void* malloc_aligned16(size_t length); +void* malloc_aligned32(size_t length); +void* malloc_aligned64(size_t length); +void* malloc_alignedCacheLine(size_t length); +void free_aligned(void *ptr); + #endif diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index 97655fe34..1e58bdd3d 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -37,6 +37,7 @@ #include #include "armcpu.h" +#include "common.h" #include "debug.h" #include "driver.h" #include "emufile.h" @@ -312,8 +313,6 @@ static float normalTable[1024]; // Color buffer that is filled by the 3D renderer and is read by the GPU engine. 
FragmentColor *gfx3d_colorRGBA6665 = NULL; u16 *gfx3d_colorRGBA5551 = NULL; -static size_t gfx3d_framebufferWidth = GFX3D_FRAMEBUFFER_WIDTH; -static size_t gfx3d_framebufferHeight = GFX3D_FRAMEBUFFER_HEIGHT; // Matrix stack handling CACHE_ALIGN MatrixStack mtxStack[4] = { @@ -550,7 +549,6 @@ void gfx3d_init() makeTables(); Render3D_Init(); - gfx3d_setFramebufferSize(gfx3d_framebufferWidth, gfx3d_framebufferHeight); gfx3d_reset(); } @@ -566,10 +564,10 @@ void gfx3d_deinit() vertlists = NULL; vertlist = NULL; - free(gfx3d_colorRGBA6665); + free_aligned(gfx3d_colorRGBA6665); gfx3d_colorRGBA6665 = NULL; - free(gfx3d_colorRGBA5551); + free_aligned(gfx3d_colorRGBA5551); gfx3d_colorRGBA5551 = NULL; } @@ -645,9 +643,6 @@ void gfx3d_reset() last_s = 0; viewport = 0xBFFF0000; - memset(gfx3d_colorRGBA6665, 0, gfx3d_framebufferWidth * gfx3d_framebufferHeight * sizeof(FragmentColor)); - memset(gfx3d_colorRGBA5551, 0, gfx3d_framebufferWidth * gfx3d_framebufferHeight * sizeof(u16)); - gfx3d.state.clearDepth = DS_DEPTH15TO24(0x7FFF); clInd2 = 0; @@ -659,38 +654,6 @@ void gfx3d_reset() CurrentRenderer->Reset(); } -size_t gfx3d_getFramebufferWidth() -{ - return gfx3d_framebufferWidth; -} - -size_t gfx3d_getFramebufferHeight() -{ - return gfx3d_framebufferHeight; -} - -void gfx3d_setFramebufferSize(size_t w, size_t h) -{ - if (w < GFX3D_FRAMEBUFFER_WIDTH || h < GFX3D_FRAMEBUFFER_HEIGHT) - { - return; - } - - // Check if we're calling this function from initialization. - // If we're not initializing, we need to finish rendering first. 
- if (gfx3d_colorRGBA6665 != NULL && gfx3d_colorRGBA5551 != NULL) - { - CurrentRenderer->RenderFinish(); - } - - gfx3d_framebufferWidth = w; - gfx3d_framebufferHeight = h; - gfx3d_colorRGBA6665 = (FragmentColor *)realloc(gfx3d_colorRGBA6665, w * h * sizeof(FragmentColor)); - gfx3d_colorRGBA5551 = (u16 *)realloc(gfx3d_colorRGBA5551, w * h * sizeof(u16)); - - CurrentRenderer->SetFramebufferSize(w, h); -} - //================================================================================= Geometry Engine //================================================================================= //================================================================================= @@ -2351,7 +2314,7 @@ void gfx3d_VBlankEndSignal(bool skipFrame) if (!CommonSettings.showGpu.main) { - memset(gfx3d_colorRGBA6665, 0, sizeof(gfx3d_framebufferWidth * gfx3d_framebufferHeight * sizeof(FragmentColor))); + memset(gfx3d_colorRGBA6665, 0, sizeof(GPU_GetFramebufferWidth() * GPU_GetFramebufferHeight() * sizeof(FragmentColor))); return; } @@ -2466,13 +2429,13 @@ void gfx3d_glGetLightColor(const size_t index, u32 &dst) const FragmentColor* gfx3d_GetLineDataRGBA6665(const size_t line) { CurrentRenderer->RenderFinish(); - return (gfx3d_colorRGBA6665 + (line * gfx3d_framebufferWidth)); + return (gfx3d_colorRGBA6665 + (line * GPU_GetFramebufferWidth())); } const u16* gfx3d_GetLineDataRGBA5551(const size_t line) { CurrentRenderer->RenderFinish(); - return (gfx3d_colorRGBA5551 + (line * gfx3d_framebufferWidth)); + return (gfx3d_colorRGBA5551 + (line * GPU_GetFramebufferWidth())); } @@ -2562,7 +2525,7 @@ SFORMAT SF_GFX3D[]={ { "GTVC", 4, 1, &tempVertInfo.count}, { "GTVM", 4, 4, tempVertInfo.map}, { "GTVF", 4, 1, &tempVertInfo.first}, - { "G3CX", 1, 4*GFX3D_FRAMEBUFFER_WIDTH*GFX3D_FRAMEBUFFER_HEIGHT, gfx3d_colorRGBA6665}, + { "G3CX", 1, 4*GPU_FRAMEBUFFER_NATIVE_WIDTH*GPU_FRAMEBUFFER_NATIVE_HEIGHT, gfx3d_colorRGBA6665}, { 0 } }; diff --git a/desmume/src/gfx3d.h b/desmume/src/gfx3d.h index 
a30903cd8..422d7607a 100644 --- a/desmume/src/gfx3d.h +++ b/desmume/src/gfx3d.h @@ -28,10 +28,6 @@ class EMUFILE; -// Pixel dimensions of the NDS 3D framebuffer -#define GFX3D_FRAMEBUFFER_WIDTH 256 -#define GFX3D_FRAMEBUFFER_HEIGHT 192 - //geometry engine command numbers #define GFX3D_NOP 0x00 #define GFX3D_MTX_MODE 0x10 @@ -249,10 +245,6 @@ void gfx3d_init(); void gfx3d_deinit(); void gfx3d_reset(); -size_t gfx3d_getFramebufferWidth(); -size_t gfx3d_getFramebufferHeight(); -void gfx3d_setFramebufferSize(size_t w, size_t h); - typedef struct { u8 enableLightFlags; diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h index 683ab75d3..ba27a3c9f 100644 --- a/desmume/src/matrix.h +++ b/desmume/src/matrix.h @@ -122,39 +122,24 @@ FORCEINLINE s32 s32floor(double d) static void memset_u16(void *dst, const u16 val, const size_t length) { -#if defined(__GNUC__) || defined(__INTEL_COMPILER) __m128i *dst_vec128 = (__m128i *)dst; const __m128i val_vec128 = _mm_set1_epi16(val); const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val)); //MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128)); for (size_t i = 0; i < length_vec128; i++) - dst_vec128[i] = val_vec128; -#else - const u32 val_u32 = ((u32)val << 16) | (u32)val; - __m128 val_vec128; val_vec128.m128_i32[0] = val_u32; - const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val)); - //MACRODO_N(length_vec128,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), val_vec128)); - MACRODO_N(length_vec128, _mm_store_ps1((float*)((u8*)dst+(X)*16), val_vec128)); -#endif + _mm_stream_si128(dst_vec128 + i, val_vec128); } static void memset_u32(void *dst, const u32 val, const size_t length) { -#if defined(__GNUC__) || defined(__INTEL_COMPILER) __m128i *dst_vec128 = (__m128i *)dst; const __m128i val_vec128 = _mm_set1_epi32(val); const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val)); //MACRODO_N(length_vec128, (dst_vec128[X] = val_vec128)); for (size_t i = 0; i < length_vec128; i++) - 
dst_vec128[i] = val_vec128; -#else - __m128 val_vec128; val_vec128.m128_i32[0] = val; - const size_t length_vec128 = length / (sizeof(val_vec128) / sizeof(val)); - //MACRODO_N(length_vec128,_mm_store_si128((__m128i*)((u8*)dst+(X)*16), val_vec128)); - MACRODO_N(length_vec128, _mm_store_ps1((float*)((u8*)dst+(X)*16), val_vec128)); -#endif + _mm_stream_si128(dst_vec128 + i, val_vec128); } #else //no sse2 diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 7090fd32a..16af83f57 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -45,6 +45,14 @@ #include #endif +#ifdef ENABLE_SSE2 +#include +#endif + +#ifdef ENABLE_SSSE3 +#include +#endif + #include "bits.h" #include "common.h" #include "matrix.h" @@ -563,30 +571,38 @@ public: } template - FORCEINLINE void pixel(const PolygonAttributes &polyAttr, FragmentAttributes &dstAttributes, FragmentColor &dstColor, float r, float g, float b, float invu, float invv, float w, float z) + FORCEINLINE void pixel(const PolygonAttributes &polyAttr, const size_t fragmentIndex, FragmentColor &dstColor, float r, float g, float b, float invu, float invv, float w, float z) { FragmentColor srcColor; FragmentColor shaderOutput; bool isOpaquePixel; + //FragmentColor &dstColor = this->_softRender->GetFramebuffer()[fragmentIndex]; + u32 &dstAttributeDepth = this->_softRender->_framebufferAttributes->depth[fragmentIndex]; + u8 &dstAttributeOpaquePolyID = this->_softRender->_framebufferAttributes->opaquePolyID[fragmentIndex]; + u8 &dstAttributeTranslucentPolyID = this->_softRender->_framebufferAttributes->translucentPolyID[fragmentIndex]; + u8 &dstAttributeStencil = this->_softRender->_framebufferAttributes->stencil[fragmentIndex]; + u8 &dstAttributeIsFogged = this->_softRender->_framebufferAttributes->isFogged[fragmentIndex]; + u8 &dstAttributeIsTranslucentPoly = this->_softRender->_framebufferAttributes->isTranslucentPoly[fragmentIndex]; + // not sure about the w-buffer depth value: this value was chosen 
to make the skybox, castle window decals, and water level render correctly in SM64 // hack: when using z-depth, drop some LSBs so that the overworld map in Dragon Quest IV shows up correctly - const u32 depth = (gfx3d.renderState.wbuffer) ? u32floor(4096*w) : DS_DEPTH15TO24( u32floor(z*0x7FFF) ) & 0x00FFFFFC; + const u32 newDepth = (gfx3d.renderState.wbuffer) ? u32floor(4096*w) : DS_DEPTH15TO24( u32floor(z*0x7FFF) ) & 0x00FFFFFC; // run the depth test if (polyAttr.enableDepthEqualTest) { - const u32 minDepth = max(0x00000000, dstAttributes.depth - SOFTRASTERIZER_DEPTH_EQUAL_TEST_TOLERANCE); - const u32 maxDepth = min(0x00FFFFFF, dstAttributes.depth + SOFTRASTERIZER_DEPTH_EQUAL_TEST_TOLERANCE); + const u32 minDepth = max(0x00000000, dstAttributeDepth - SOFTRASTERIZER_DEPTH_EQUAL_TEST_TOLERANCE); + const u32 maxDepth = min(0x00FFFFFF, dstAttributeDepth + SOFTRASTERIZER_DEPTH_EQUAL_TEST_TOLERANCE); - if (depth < minDepth || depth > maxDepth) + if (newDepth < minDepth || newDepth > maxDepth) { goto depth_fail; } } else { - if (depth >= dstAttributes.depth) + if (newDepth >= dstAttributeDepth) { goto depth_fail; } @@ -601,7 +617,7 @@ public: } else { - if (dstAttributes.stencil == 0) + if (dstAttributeStencil == 0) { goto rejected_fragment; } @@ -609,7 +625,7 @@ public: //shadow polys have a special check here to keep from self-shadowing when user //has tried to prevent it from happening //if this isnt here, then the vehicle select in mariokart will look terrible - if (dstAttributes.opaquePolyID == polyAttr.polygonID) + if (dstAttributeOpaquePolyID == polyAttr.polygonID) { goto rejected_fragment; } @@ -643,31 +659,31 @@ public: isOpaquePixel = (shaderOutput.a == 0x1F); if (isOpaquePixel) { - dstAttributes.opaquePolyID = polyAttr.polygonID; - dstAttributes.isTranslucentPoly = polyAttr.isTranslucent; - dstAttributes.isFogged = polyAttr.enableRenderFog; + dstAttributeOpaquePolyID = polyAttr.polygonID; + dstAttributeIsTranslucentPoly = polyAttr.isTranslucent; + 
dstAttributeIsFogged = polyAttr.enableRenderFog; dstColor = shaderOutput; } else { //dont overwrite pixels on translucent polys with the same polyids - if (dstAttributes.translucentPolyID == polyAttr.polygonID) + if (dstAttributeTranslucentPolyID == polyAttr.polygonID) goto rejected_fragment; //originally we were using a test case of shadows-behind-trees in sm64ds //but, it looks bad in that game. this is actually correct //if this isnt correct, then complex shape cart shadows in mario kart don't work right - dstAttributes.translucentPolyID = polyAttr.polygonID; + dstAttributeTranslucentPolyID = polyAttr.polygonID; //alpha blending and write color alphaBlend(dstColor, shaderOutput); - dstAttributes.isFogged = (dstAttributes.isFogged && polyAttr.enableRenderFog); + dstAttributeIsFogged = (dstAttributeIsFogged && polyAttr.enableRenderFog); } //depth writing if (isOpaquePixel || polyAttr.enableAlphaDepthWrite) - dstAttributes.depth = depth; + dstAttributeDepth = newDepth; //shadow cases: (need multi-bit stencil buffer to cope with all of these, especially the mariokart complex shadows) //1. sm64 (standing near signs and blocks) @@ -678,14 +694,14 @@ public: goto done; depth_fail: if (isShadowPolygon && polyAttr.polygonID == 0) - dstAttributes.stencil++; + dstAttributeStencil++; rejected_fragment: done: ; - if (isShadowPolygon && polyAttr.polygonID != 0 && dstAttributes.stencil) - dstAttributes.stencil--; + if (isShadowPolygon && polyAttr.polygonID != 0 && dstAttributeStencil) + dstAttributeStencil--; } //draws a single scanline @@ -729,16 +745,16 @@ public: (pRight->color[1].curr - color[1]) * invWidth, (pRight->color[2].curr - color[2]) * invWidth }; - int adr = (pLeft->Y*framebufferWidth)+XStart; + size_t adr = (pLeft->Y*framebufferWidth)+XStart; //CONSIDER: in case some other math is wrong (shouldve been clipped OK), we might go out of bounds here. //better check the Y value. 
- if (RENDERER && (pLeft->Y<0 || pLeft->Y > (framebufferHeight - 1))) + if (RENDERER && (pLeft->Y < 0 || pLeft->Y > (framebufferHeight - 1))) { printf("rasterizer rendering at y=%d! oops!\n",pLeft->Y); return; } - if (!RENDERER && (pLeft->Y<0 || pLeft->Y >= framebufferHeight)) + if (!RENDERER && (pLeft->Y < 0 || pLeft->Y >= framebufferHeight)) { printf("rasterizer rendering at y=%d! oops!\n",pLeft->Y); return; @@ -746,7 +762,7 @@ public: int x = XStart; - if (x<0) + if (x < 0) { if (RENDERER && !lineHack) { @@ -766,7 +782,7 @@ public: } if (x+width > framebufferWidth) { - if (RENDERER && !lineHack) + if (RENDERER && !lineHack && framebufferWidth == GPU_FRAMEBUFFER_NATIVE_WIDTH) { printf("rasterizer rendering at x=%d! oops!\n",x+width-1); return; @@ -776,7 +792,7 @@ public: while (width-- > 0) { - pixel(polyAttr, this->_softRender->_framebufferAttributes[adr], dstColor[adr], color[0], color[1], color[2], u, v, 1.0f/invw, z); + pixel(polyAttr, adr, dstColor[adr], color[0], color[1], color[2], u, v, 1.0f/invw, z); adr++; x++; @@ -1123,14 +1139,26 @@ void _HACK_Viewer_ExecUnit() static Render3D* SoftRasterizerRendererCreate() { +#if defined(ENABLE_SSSE3) + return new SoftRasterizerRenderer_SSSE3; +#elif defined(ENABLE_SSE2) + return new SoftRasterizerRenderer_SSE2; +#else return new SoftRasterizerRenderer; +#endif } static void SoftRasterizerRendererDestroy() { if (CurrentRenderer != BaseRenderer) { +#if defined(ENABLE_SSSE3) + delete (SoftRasterizerRenderer_SSSE3 *)CurrentRenderer; +#elif defined(ENABLE_SSE2) + delete (SoftRasterizerRenderer_SSE2 *)CurrentRenderer; +#else delete (SoftRasterizerRenderer *)CurrentRenderer; +#endif CurrentRenderer = BaseRenderer; } } @@ -1226,7 +1254,8 @@ SoftRasterizerRenderer::~SoftRasterizerRenderer() delete[] postprocessParam; postprocessParam = NULL; - free(_framebufferAttributes); + delete _framebufferAttributes; + _framebufferAttributes = NULL; } Render3DError SoftRasterizerRenderer::InitTables() @@ -1291,8 +1320,8 @@ size_t 
SoftRasterizerRenderer::performClipping(const VERTLIST *vertList, const P template void SoftRasterizerRenderer::performViewportTransforms() { - const float xfactor = (float)this->_framebufferWidth/(float)GFX3D_FRAMEBUFFER_WIDTH; - const float yfactor = (float)this->_framebufferHeight/(float)GFX3D_FRAMEBUFFER_HEIGHT; + const float xfactor = (float)this->_framebufferWidth/(float)GPU_FRAMEBUFFER_NATIVE_WIDTH; + const float yfactor = (float)this->_framebufferHeight/(float)GPU_FRAMEBUFFER_NATIVE_HEIGHT; const float xmax = (float)this->_framebufferWidth-(CUSTOM?0.001f:0); //fudge factor to keep from overrunning render buffers const float ymax = (float)this->_framebufferHeight-(CUSTOM?0.001f:0); @@ -1561,11 +1590,10 @@ Render3DError SoftRasterizerRenderer::RenderEdgeMarking(const u16 *colorTable, c { for (size_t x = 0; x < this->_framebufferWidth; x++, i++) { - const FragmentAttributes dstAttributes = this->_framebufferAttributes[i]; - const u8 polyID = dstAttributes.opaquePolyID; + const u8 polyID = this->_framebufferAttributes->opaquePolyID[i]; if (this->edgeMarkDisabled[polyID>>3]) continue; - if (dstAttributes.isTranslucentPoly) continue; + if (this->_framebufferAttributes->isTranslucentPoly[i] != 0) continue; // > is used instead of != to prevent double edges // between overlapping polys of different IDs. 
@@ -1575,7 +1603,7 @@ Render3DError SoftRasterizerRenderer::RenderEdgeMarking(const u16 *colorTable, c const FragmentColor edgeColor = this->edgeMarkTable[polyID>>3]; #define PIXOFFSET(dx,dy) ((dx)+(this->_framebufferWidth*(dy))) -#define ISEDGE(dx,dy) ((x+(dx) < this->_framebufferWidth) && (y+(dy) < this->_framebufferHeight) && polyID > this->_framebufferAttributes[i+PIXOFFSET(dx,dy)].opaquePolyID) +#define ISEDGE(dx,dy) ((x+(dx) < this->_framebufferWidth) && (y+(dy) < this->_framebufferHeight) && polyID > this->_framebufferAttributes->opaquePolyID[i+PIXOFFSET(dx,dy)]) #define DRAWEDGE(dx,dy) alphaBlend(_framebufferColor[i+PIXOFFSET(dx,dy)], edgeColor) bool upleft = ISEDGE(-1,-1); @@ -1717,10 +1745,9 @@ Render3DError SoftRasterizerRenderer::RenderFog(const u8 *densityTable, const u3 { for (size_t i = 0; i < framebufferFragmentCount; i++) { - const FragmentAttributes &destFragment = this->_framebufferAttributes[i]; - const size_t fogIndex = destFragment.depth >> 9; + const size_t fogIndex = this->_framebufferAttributes->depth[i] >> 9; assert(fogIndex < 32768); - const u8 fog = (destFragment.isFogged) ? this->fogTable[fogIndex] : 0; + const u8 fog = (this->_framebufferAttributes->isFogged[i] != 0) ? this->fogTable[fogIndex] : 0; FragmentColor &destFragmentColor = this->_framebufferColor[i]; destFragmentColor.r = ((128-fog)*destFragmentColor.r + r*fog)>>7; @@ -1733,10 +1760,9 @@ Render3DError SoftRasterizerRenderer::RenderFog(const u8 *densityTable, const u3 { for (size_t i = 0; i < framebufferFragmentCount; i++) { - const FragmentAttributes &destFragment = this->_framebufferAttributes[i]; - const size_t fogIndex = destFragment.depth >> 9; + const size_t fogIndex = this->_framebufferAttributes->depth[i] >> 9; assert(fogIndex < 32768); - const u8 fog = (destFragment.isFogged) ? this->fogTable[fogIndex] : 0; + const u8 fog = (this->_framebufferAttributes->isFogged[i] != 0) ? 
this->fogTable[fogIndex] : 0; FragmentColor &destFragmentColor = this->_framebufferColor[i]; destFragmentColor.a = ((128-fog)*destFragmentColor.a + a*fog)>>7; @@ -1753,9 +1779,8 @@ Render3DError SoftRasterizerRenderer::RenderEdgeMarkingAndFog(const SoftRasteriz for (size_t x = 0; x < this->_framebufferWidth; x++, i++) { FragmentColor &dstColor = this->_framebufferColor[i]; - const FragmentAttributes dstAttributes = this->_framebufferAttributes[i]; - const u32 depth = dstAttributes.depth; - const u8 polyID = dstAttributes.opaquePolyID; + const u32 depth = this->_framebufferAttributes->depth[i]; + const u8 polyID = this->_framebufferAttributes->opaquePolyID[i]; // TODO: New edge marking algorithm which tests both polyID and depth, but only checks 4 surrounding pixels. Can we keep this one? if (param.enableEdgeMarking) @@ -1769,15 +1794,19 @@ Render3DError SoftRasterizerRenderer::RenderEdgeMarkingAndFog(const SoftRasteriz // - the character edges in-level are clearly transparent, and also show well through shield powerups. 
FragmentColor edgeColor = this->edgeMarkTable[polyID>>3]; - bool right = false; - bool down = false; - bool left = false; + bool upleft = false; bool up = false; + bool upright = false; + bool left = false; + bool right = false; + bool downleft = false; + bool down = false; + bool downright = false; #define PIXOFFSET(dx,dy) ((dx)+(this->_framebufferWidth*(dy))) -#define ISEDGE(dx,dy) ((x+(dx) < this->_framebufferWidth) && (y+(dy) < this->_framebufferHeight) && polyID != this->_framebufferAttributes[i+PIXOFFSET(dx,dy)].opaquePolyID && depth >= this->_framebufferAttributes[i+PIXOFFSET(dx,dy)].depth) +#define ISEDGE(dx,dy) ((x+(dx) < this->_framebufferWidth) && (y+(dy) < this->_framebufferHeight) && polyID != this->_framebufferAttributes->opaquePolyID[i+PIXOFFSET(dx,dy)] && depth >= this->_framebufferAttributes->depth[i+PIXOFFSET(dx,dy)]) - if (this->edgeMarkDisabled[polyID>>3] || dstAttributes.isTranslucentPoly) + if (this->edgeMarkDisabled[polyID>>3] || this->_framebufferAttributes->isTranslucentPoly[i] != 0) goto END_EDGE_MARK; up = ISEDGE( 0,-1); @@ -1787,22 +1816,22 @@ Render3DError SoftRasterizerRenderer::RenderEdgeMarkingAndFog(const SoftRasteriz if (right) { - edgeColor = this->edgeMarkTable[this->_framebufferAttributes[i+PIXOFFSET( 1, 0)].opaquePolyID >> 3]; + edgeColor = this->edgeMarkTable[this->_framebufferAttributes->opaquePolyID[i+PIXOFFSET( 1, 0)] >> 3]; alphaBlend(dstColor, edgeColor); } else if (down) { - edgeColor = this->edgeMarkTable[this->_framebufferAttributes[i+PIXOFFSET( 0, 1)].opaquePolyID >> 3]; + edgeColor = this->edgeMarkTable[this->_framebufferAttributes->opaquePolyID[i+PIXOFFSET( 0, 1)] >> 3]; alphaBlend(dstColor, edgeColor); } else if (left) { - edgeColor = this->edgeMarkTable[this->_framebufferAttributes[i+PIXOFFSET(-1, 0)].opaquePolyID >> 3]; + edgeColor = this->edgeMarkTable[this->_framebufferAttributes->opaquePolyID[i+PIXOFFSET(-1, 0)] >> 3]; alphaBlend(dstColor, edgeColor); } else if (up) { - edgeColor = 
this->edgeMarkTable[this->_framebufferAttributes[i+PIXOFFSET( 0,-1)].opaquePolyID >> 3]; + edgeColor = this->edgeMarkTable[this->_framebufferAttributes->opaquePolyID[i+PIXOFFSET( 0,-1)] >> 3]; alphaBlend(dstColor, edgeColor); } @@ -1822,7 +1851,7 @@ END_EDGE_MARK: ; const size_t fogIndex = depth >> 9; assert(fogIndex < 32768); - const u8 fog = (dstAttributes.isFogged) ? this->fogTable[fogIndex] : 0; + const u8 fog = (this->_framebufferAttributes->isFogged[i] != 0) ? this->fogTable[fogIndex] : 0; if (!param.fogAlphaOnly) { @@ -1858,26 +1887,26 @@ Render3DError SoftRasterizerRenderer::UpdateToonTable(const u16 *toonTableBuffer return RENDER3DERROR_NOERR; } -Render3DError SoftRasterizerRenderer::ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) +Render3DError SoftRasterizerRenderer::ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) { - const float lineDecrement = ((float)GFX3D_FRAMEBUFFER_HEIGHT / (float)this->_framebufferHeight) + 0.000001; - const float readIncrement = ((float)GFX3D_FRAMEBUFFER_WIDTH / (float)this->_framebufferWidth) + 0.000001; - float line = GFX3D_FRAMEBUFFER_HEIGHT - 1.0 + lineDecrement; - float readLocation = (GFX3D_FRAMEBUFFER_HEIGHT - 1) * GFX3D_FRAMEBUFFER_WIDTH; + const float lineDecrement = ((float)GPU_FRAMEBUFFER_NATIVE_HEIGHT / (float)this->_framebufferHeight) + 0.000001; + const float readIncrement = ((float)GPU_FRAMEBUFFER_NATIVE_WIDTH / (float)this->_framebufferWidth) + 0.000001; + float line = GPU_FRAMEBUFFER_NATIVE_HEIGHT - 1.0 + lineDecrement; + float readLocation = (GPU_FRAMEBUFFER_NATIVE_HEIGHT - 1) * GPU_FRAMEBUFFER_NATIVE_WIDTH; // The clear image buffer is y-flipped, so we need to flip it back to normal here. 
- for (size_t y = 0, iw = 0; y < this->_framebufferHeight; y++, readLocation = ((size_t)line * GFX3D_FRAMEBUFFER_WIDTH)) + for (size_t y = 0, iw = 0; y < this->_framebufferHeight; y++, readLocation = ((size_t)line * GPU_FRAMEBUFFER_NATIVE_WIDTH)) { for (size_t x = 0; x < this->_framebufferWidth; x++, iw++, readLocation += readIncrement) { const size_t ir = (size_t)readLocation; this->_framebufferColor[iw].color = RGB15TO6665(colorBuffer[ir] & 0x7FFF, (colorBuffer[ir] >> 15) * 0x1F); - this->_framebufferAttributes[iw].isFogged = fogBuffer[ir]; - this->_framebufferAttributes[iw].depth = depthBuffer[ir]; - this->_framebufferAttributes[iw].opaquePolyID = polyIDBuffer[ir]; - this->_framebufferAttributes[iw].translucentPolyID = kUnsetTranslucentPolyID; - this->_framebufferAttributes[iw].isTranslucentPoly = false; - this->_framebufferAttributes[iw].stencil = 0; + this->_framebufferAttributes->isFogged[iw] = fogBuffer[ir]; + this->_framebufferAttributes->depth[iw] = depthBuffer[ir]; + this->_framebufferAttributes->opaquePolyID[iw] = polyIDBuffer[ir]; + this->_framebufferAttributes->translucentPolyID[iw] = kUnsetTranslucentPolyID; + this->_framebufferAttributes->isTranslucentPoly[iw] = 0; + this->_framebufferAttributes->stencil[iw] = 0; } line -= lineDecrement; @@ -1888,15 +1917,14 @@ Render3DError SoftRasterizerRenderer::ClearUsingImage(const u16 *__restrict colo Render3DError SoftRasterizerRenderer::ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const { - FragmentColor convertedClearColor; + FragmentColor convertedClearColor = clearColor; convertedClearColor.r = GFX3D_5TO6(clearColor.r); convertedClearColor.g = GFX3D_5TO6(clearColor.g); convertedClearColor.b = GFX3D_5TO6(clearColor.b); - convertedClearColor.a = clearColor.a; for (size_t i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++) { - this->_framebufferAttributes[i] = clearAttributes; + this->_framebufferAttributes->SetAtIndex(i, clearAttributes); 
this->_framebufferColor[i] = convertedClearColor; } @@ -2009,16 +2037,75 @@ Render3DError SoftRasterizerRenderer::RenderFinish() Render3DError SoftRasterizerRenderer::SetFramebufferSize(size_t w, size_t h) { - if (w < GFX3D_FRAMEBUFFER_WIDTH || h < GFX3D_FRAMEBUFFER_HEIGHT) + if (w < GPU_FRAMEBUFFER_NATIVE_WIDTH || h < GPU_FRAMEBUFFER_NATIVE_HEIGHT) { return RENDER3DERROR_NOERR; } - + + const size_t newFramebufferColorSizeBytes = w * h * sizeof(FragmentColor); + FragmentColor *oldFramebufferColor = this->_framebufferColor; + FragmentColor *newFramebufferColor = (FragmentColor *)malloc_alignedCacheLine(newFramebufferColorSizeBytes); + FragmentAttributesBuffer *oldFramebufferAttributes = this->_framebufferAttributes; + FragmentAttributesBuffer *newFramebufferAttributes = new FragmentAttributesBuffer(w * h); + this->_framebufferWidth = w; this->_framebufferHeight = h; - this->_framebufferColorSizeBytes = w * h * sizeof(FragmentColor); - this->_framebufferColor = (FragmentColor *)realloc(this->_framebufferColor, this->_framebufferColorSizeBytes); - this->_framebufferAttributes = (FragmentAttributes *)realloc(this->_framebufferAttributes, w * h * sizeof(FragmentAttributes)); + this->_framebufferColorSizeBytes = newFramebufferColorSizeBytes; + this->_framebufferColor = newFramebufferColor; + this->_framebufferAttributes = newFramebufferAttributes; + + free_aligned(oldFramebufferColor); + delete oldFramebufferAttributes; return RENDER3DERROR_NOERR; } + +#ifdef ENABLE_SSE2 + +Render3DError SoftRasterizerRenderer_SSE2::ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const +{ + FragmentColor convertedClearColor = clearColor; + convertedClearColor.r = GFX3D_5TO6(clearColor.r); + convertedClearColor.g = GFX3D_5TO6(clearColor.g); + convertedClearColor.b = GFX3D_5TO6(clearColor.b); + + const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight; + const size_t ssePixCount = pixCount - (pixCount % 16); + + const __m128i 
color_vec128 = _mm_set1_epi32(convertedClearColor.color); + const __m128i attrDepth_vec128 = _mm_set1_epi32(clearAttributes.depth); + const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(clearAttributes.opaquePolyID); + const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(clearAttributes.translucentPolyID); + const __m128i attrStencil_vec128 = _mm_set1_epi8(clearAttributes.stencil); + const __m128i attrIsFogged_vec128 = _mm_set1_epi8(clearAttributes.isFogged); + const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(clearAttributes.isTranslucentPoly); + + for (size_t i = 0; i < ssePixCount; i += 16) + { + _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 0), color_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 4), color_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 8), color_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferColor + i + 12), color_vec128); + + _mm_stream_si128((__m128i *)(this->_framebufferAttributes->depth + i + 0), attrDepth_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferAttributes->depth + i + 4), attrDepth_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferAttributes->depth + i + 8), attrDepth_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferAttributes->depth + i + 12), attrDepth_vec128); + + _mm_stream_si128((__m128i *)(this->_framebufferAttributes->opaquePolyID + i), attrOpaquePolyID_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferAttributes->translucentPolyID + i), attrTranslucentPolyID_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferAttributes->stencil + i), attrStencil_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferAttributes->isFogged + i), attrIsFogged_vec128); + _mm_stream_si128((__m128i *)(this->_framebufferAttributes->isTranslucentPoly + i), attrIsTranslucentPoly_vec128); + } + + for (size_t i = ssePixCount; i < pixCount; i++) + { + this->_framebufferColor[i] = convertedClearColor; + 
this->_framebufferAttributes->SetAtIndex(i, clearAttributes); + } + + return RENDER3DERROR_NOERR; +} + +#endif // ENABLE_SSE2 diff --git a/desmume/src/rasterize.h b/desmume/src/rasterize.h index 15d2fa9c3..5d9ff907a 100644 --- a/desmume/src/rasterize.h +++ b/desmume/src/rasterize.h @@ -39,7 +39,13 @@ struct SoftRasterizerPostProcessParams bool fogAlphaOnly; }; +#if defined(ENABLE_SSSE3) +class SoftRasterizerRenderer : public Render3D_SSSE3 +#elif defined(ENABLE_SSE2) +class SoftRasterizerRenderer : public Render3D_SSE2 +#else class SoftRasterizerRenderer : public Render3D +#endif { protected: GFX3D_Clipper clipper; @@ -62,7 +68,7 @@ protected: virtual Render3DError RenderFog(const u8 *densityTable, const u32 color, const u32 offset, const u8 shift, const bool alphaOnly); virtual Render3DError EndRender(const u64 frameCount); - virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); + virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; public: @@ -70,7 +76,7 @@ public: size_t _clippedPolyCount; FragmentColor toonColor32LUT[32]; GFX3D_Clipper::TClippedPoly *clippedPolys; - FragmentAttributes *_framebufferAttributes; + FragmentAttributesBuffer *_framebufferAttributes; TexCacheItem *polyTexKeys[POLYLIST_SIZE]; bool polyVisible[POLYLIST_SIZE]; bool polyBackfacing[POLYLIST_SIZE]; @@ -96,4 +102,22 @@ public: virtual Render3DError SetFramebufferSize(size_t w, size_t h); }; +#ifdef ENABLE_SSE2 + +class SoftRasterizerRenderer_SSE2 : public SoftRasterizerRenderer +{ + virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; +}; + #endif + +#ifdef 
ENABLE_SSSE3 + +class SoftRasterizerRenderer_SSSE3 : public SoftRasterizerRenderer_SSE2 +{ + +}; + +#endif + +#endif // _RASTERIZE_H_ diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index 6c11bf030..0c27e2392 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -20,11 +20,21 @@ #include +#ifdef ENABLE_SSE2 +#include +#endif + +#ifdef ENABLE_SSSE3 +#include +#endif + #include "bits.h" +#include "common.h" #include "gfx3d.h" #include "MMU.h" #include "texcache.h" + static CACHE_ALIGN u32 dsDepthToD24_LUT[32768] = {0}; int cur3DCore = GPU3D_NULL; @@ -85,7 +95,7 @@ bool NDS_3D_ChangeCore(int newCore) return result; } - Render3DError error = newRenderer->SetFramebufferSize(gfx3d_getFramebufferWidth(), gfx3d_getFramebufferHeight()); + Render3DError error = newRenderer->SetFramebufferSize(GPU_GetFramebufferWidth(), GPU_GetFramebufferHeight()); if (error != RENDER3DERROR_NOERR) { return result; @@ -114,6 +124,76 @@ void Render3DBaseDestroy() } } +FragmentAttributesBuffer::FragmentAttributesBuffer(size_t newCount) +{ + count = newCount; + + depth = (u32 *)malloc_alignedCacheLine(count * sizeof(u32)); + opaquePolyID = (u8 *)malloc_alignedCacheLine(count * sizeof(u8)); + translucentPolyID = (u8 *)malloc_alignedCacheLine(count * sizeof(u8)); + stencil = (u8 *)malloc_alignedCacheLine(count * sizeof(u8)); + isFogged = (u8 *)malloc_alignedCacheLine(count * sizeof(u8)); + isTranslucentPoly = (u8 *)malloc_alignedCacheLine(count * sizeof(u8)); +} + +FragmentAttributesBuffer::~FragmentAttributesBuffer() +{ + free_aligned(depth); + free_aligned(opaquePolyID); + free_aligned(translucentPolyID); + free_aligned(stencil); + free_aligned(isFogged); + free_aligned(isTranslucentPoly); +} + +void FragmentAttributesBuffer::SetAtIndex(const size_t index, const FragmentAttributes &attr) +{ + this->depth[index] = attr.depth; + this->opaquePolyID[index] = attr.opaquePolyID; + this->translucentPolyID[index] = attr.translucentPolyID; + this->stencil[index] = 
attr.stencil; + this->isFogged[index] = attr.isFogged; + this->isTranslucentPoly[index] = attr.isTranslucentPoly; +} + +void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr) +{ +#ifdef ENABLE_SSE2 + const size_t sseCount = count - (count % 16); + + const __m128i attrDepth_vec128 = _mm_set1_epi32(attr.depth); + const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(attr.opaquePolyID); + const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(attr.translucentPolyID); + const __m128i attrStencil_vec128 = _mm_set1_epi8(attr.stencil); + const __m128i attrIsFogged_vec128 = _mm_set1_epi8(attr.isFogged); + const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(attr.isTranslucentPoly); + + for (size_t i = 0; i < sseCount; i += 16) + { + _mm_stream_si128((__m128i *)(this->depth + 0), attrDepth_vec128); + _mm_stream_si128((__m128i *)(this->depth + 4), attrDepth_vec128); + _mm_stream_si128((__m128i *)(this->depth + 8), attrDepth_vec128); + _mm_stream_si128((__m128i *)(this->depth + 12), attrDepth_vec128); + + _mm_stream_si128((__m128i *)this->opaquePolyID, attrOpaquePolyID_vec128); + _mm_stream_si128((__m128i *)this->translucentPolyID, attrTranslucentPolyID_vec128); + _mm_stream_si128((__m128i *)this->stencil, attrStencil_vec128); + _mm_stream_si128((__m128i *)this->isFogged, attrIsFogged_vec128); + _mm_stream_si128((__m128i *)this->isTranslucentPoly, attrIsTranslucentPoly_vec128); + } + + for (size_t i = sseCount; i < count; i++) + { + this->SetAtIndex(i, attr); + } +#else + for (size_t i = 0; i < count; i++) + { + this->SetAtIndex(i, attr); + } +#endif +} + Render3D::Render3D() { _renderID = RENDERID_NULL; @@ -131,8 +211,8 @@ Render3D::Render3D() needTableInit = false; } - _framebufferWidth = GFX3D_FRAMEBUFFER_WIDTH; - _framebufferHeight = GFX3D_FRAMEBUFFER_HEIGHT; + _framebufferWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH; + _framebufferHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT; _framebufferColorSizeBytes = 0; _framebufferColor = NULL; @@ -141,7 +221,7 @@ 
Render3D::Render3D() Render3D::~Render3D() { - free(_framebufferColor); + free_aligned(_framebufferColor); TexCache_Reset(); } @@ -172,15 +252,21 @@ size_t Render3D::GetFramebufferHeight() Render3DError Render3D::SetFramebufferSize(size_t w, size_t h) { - if (w < GFX3D_FRAMEBUFFER_WIDTH || h < GFX3D_FRAMEBUFFER_HEIGHT) + if (w < GPU_FRAMEBUFFER_NATIVE_WIDTH || h < GPU_FRAMEBUFFER_NATIVE_HEIGHT) { return RENDER3DERROR_NOERR; } + const size_t newFramebufferColorSizeBytes = w * h * sizeof(FragmentColor); + FragmentColor *oldFramebufferColor = this->_framebufferColor; + FragmentColor *newFramebufferColor = (FragmentColor *)malloc_alignedCacheLine(newFramebufferColorSizeBytes); + this->_framebufferWidth = w; this->_framebufferHeight = h; - this->_framebufferColorSizeBytes = w * h * sizeof(FragmentColor); - this->_framebufferColor = (FragmentColor *)realloc(this->_framebufferColor, this->_framebufferColorSizeBytes); + this->_framebufferColorSizeBytes = newFramebufferColorSizeBytes; + this->_framebufferColor = newFramebufferColor; + + free_aligned(oldFramebufferColor); return RENDER3DERROR_NOERR; } @@ -214,6 +300,7 @@ Render3DError Render3D::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, { memcpy(dstRGBA6665, this->_framebufferColor, this->_framebufferColorSizeBytes); + // Convert to RGBA5551 for (size_t i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++) { dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 
0x0000 : 0x8000); @@ -245,7 +332,7 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState) clearFragment.translucentPolyID = kUnsetTranslucentPolyID; clearFragment.depth = renderState.clearDepth; clearFragment.stencil = 0; - clearFragment.isTranslucentPoly = false; + clearFragment.isTranslucentPoly = 0; clearFragment.isFogged = BIT15(renderState.clearColor); if (renderState.enableClearImage) @@ -258,33 +345,33 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState) const u8 xScroll = scrollBits & 0xFF; const u8 yScroll = (scrollBits >> 8) & 0xFF; - size_t dd = (GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT) - GFX3D_FRAMEBUFFER_WIDTH; + size_t dstIndex = (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT) - GPU_FRAMEBUFFER_NATIVE_WIDTH; - for (size_t iy = 0; iy < GFX3D_FRAMEBUFFER_HEIGHT; iy++) + for (size_t iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++) { const size_t y = ((iy + yScroll) & 0xFF) << 8; - for (size_t ix = 0; ix < GFX3D_FRAMEBUFFER_WIDTH; ix++) + for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; ix++) { const size_t x = (ix + xScroll) & 0xFF; - const size_t adr = y + x; + const size_t srcIndex = y | x; //this is tested by harry potter and the order of the phoenix. 
//TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles) //(or use a special zero fill in the bulk clearing above) - this->clearImageColor16Buffer[dd] = clearColorBuffer[adr]; + this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex]; //this is tested quite well in the sonic chronicles main map mode //where depth values are used for trees etc you can walk behind - this->clearImageDepthBuffer[dd] = dsDepthToD24_LUT[clearDepthBuffer[adr] & 0x7FFF]; + this->clearImageDepthBuffer[dstIndex] = dsDepthToD24_LUT[clearDepthBuffer[srcIndex] & 0x7FFF]; - this->clearImageFogBuffer[dd] = BIT15(clearDepthBuffer[adr]); - this->clearImagePolyIDBuffer[dd] = clearFragment.opaquePolyID; + this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]); + this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID; - dd++; + dstIndex++; } - dd -= GFX3D_FRAMEBUFFER_WIDTH * 2; + dstIndex -= GPU_FRAMEBUFFER_NATIVE_WIDTH * 2; } error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer); @@ -301,7 +388,7 @@ Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState) return error; } -Render3DError Render3D::ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) +Render3DError Render3D::ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer) { return RENDER3DERROR_NOERR; } @@ -384,3 +471,311 @@ Render3DError Render3D::VramReconfigureSignal() TexCache_Invalidate(); return RENDER3DERROR_NOERR; } + +#ifdef ENABLE_SSE2 + +Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) +{ + Render3DError error = RENDER3DERROR_NOERR; + + FragmentColor clearColor; + clearColor.r = renderState.clearColor & 0x1F; + 
clearColor.g = (renderState.clearColor >> 5) & 0x1F; + clearColor.b = (renderState.clearColor >> 10) & 0x1F; + clearColor.a = (renderState.clearColor >> 16) & 0x1F; + + FragmentAttributes clearFragment; + clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F; + //special value for uninitialized translucent polyid. without this, fires in spiderman2 dont display + //I am not sure whether it is right, though. previously this was cleared to 0, as a guess, + //but in spiderman2 some fires with polyid 0 try to render on top of the background + clearFragment.translucentPolyID = kUnsetTranslucentPolyID; + clearFragment.depth = renderState.clearDepth; + clearFragment.stencil = 0; + clearFragment.isTranslucentPoly = 0; + clearFragment.isFogged = BIT15(renderState.clearColor); + + if (renderState.enableClearImage) + { + //the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers) + //uses the scroll registers in the main game engine + const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2]; + const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3]; + const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET + const u8 xScroll = scrollBits & 0xFF; + const u8 yScroll = (scrollBits >> 8) & 0xFF; + + size_t dstIndex = (GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT) - GPU_FRAMEBUFFER_NATIVE_WIDTH; + + static const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); + static const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF); + const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID); + + for (size_t iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++) + { + const size_t y = ((iy + yScroll) & 0xFF) << 8; + __m128i y_vec128 = _mm_set1_epi16(y); + + for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; ix += 8) + { + __m128i addr_vec128 = _mm_set1_epi16(ix + xScroll); + addr_vec128 = _mm_add_epi16(addr_vec128, 
addrOffset); + addr_vec128 = _mm_and_si128(addr_vec128, addrRolloverMask); + addr_vec128 = _mm_or_si128(addr_vec128, y_vec128); + + this->clearImageColor16Buffer[dstIndex+7] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 7)]; + this->clearImageColor16Buffer[dstIndex+6] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 6)]; + this->clearImageColor16Buffer[dstIndex+5] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 5)]; + this->clearImageColor16Buffer[dstIndex+4] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 4)]; + this->clearImageColor16Buffer[dstIndex+3] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 3)]; + this->clearImageColor16Buffer[dstIndex+2] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 2)]; + this->clearImageColor16Buffer[dstIndex+1] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 1)]; + this->clearImageColor16Buffer[dstIndex+0] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 0)]; + + this->clearImageDepthBuffer[dstIndex+7] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] & 0x7FFF]; + this->clearImageDepthBuffer[dstIndex+6] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] & 0x7FFF]; + this->clearImageDepthBuffer[dstIndex+5] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] & 0x7FFF]; + this->clearImageDepthBuffer[dstIndex+4] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] & 0x7FFF]; + this->clearImageDepthBuffer[dstIndex+3] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] & 0x7FFF]; + this->clearImageDepthBuffer[dstIndex+2] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] & 0x7FFF]; + this->clearImageDepthBuffer[dstIndex+1] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] & 0x7FFF]; + this->clearImageDepthBuffer[dstIndex+0] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] & 0x7FFF]; + + this->clearImageFogBuffer[dstIndex+7] = BIT15( 
clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] ); + this->clearImageFogBuffer[dstIndex+6] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] ); + this->clearImageFogBuffer[dstIndex+5] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] ); + this->clearImageFogBuffer[dstIndex+4] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] ); + this->clearImageFogBuffer[dstIndex+3] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] ); + this->clearImageFogBuffer[dstIndex+2] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] ); + this->clearImageFogBuffer[dstIndex+1] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] ); + this->clearImageFogBuffer[dstIndex+0] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] ); + + _mm_storel_epi64((__m128i *)(this->clearImagePolyIDBuffer + dstIndex), opaquePolyID_vec128); + + dstIndex += 8; + } + + dstIndex -= GPU_FRAMEBUFFER_NATIVE_WIDTH * 2; + } + + error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer); + if (error != RENDER3DERROR_NOERR) + { + error = this->ClearUsingValues(clearColor, clearFragment); + } + } + else + { + error = this->ClearUsingValues(clearColor, clearFragment); + } + + return error; +} + +#endif // ENABLE_SSE2 + +#ifdef ENABLE_SSSE3 + +Render3DError Render3D_SSSE3::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551) +{ + // Convert to RGBA5551 + const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight; + const size_t ssePixCount = pixCount - (pixCount % 4); + + for (size_t i = 0; i < ssePixCount; i += 4) + { + __m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i)); + _mm_store_si128((__m128i *)(dstRGBA6665 + i), color); + + __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R + r = _mm_srli_epi32(r, 1); // Shift to R + + __m128i g = _mm_and_si128(color, 
_mm_set1_epi32(0x00003E00)); // Read from G
+		g = _mm_srli_epi32(g, 4); // Shift in G
+		
+		__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B
+		b = _mm_srli_epi32(b, 7); // Shift to B
+		
+		__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
+		a = _mm_cmpgt_epi32(a, _mm_set1_epi32(0x00000000)); // Determine A
+		a = _mm_and_si128(a, _mm_set1_epi32(0x00008000)); // Mask to A
+		
+		color = r;
+		color = _mm_or_si128(color, g);
+		color = _mm_or_si128(color, b);
+		color = _mm_or_si128(color, a);
+		
+		// All the colors are currently placed every other 16 bits, so we need to swizzle them
+		// to the lower 64 bits of our vector before we store them back to memory.
+		color = _mm_shuffle_epi8(color, _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
+		_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
+	}
+	
+	for (size_t i = ssePixCount; i < pixCount; i++)
+	{
+		dstRGBA5551[i] = R6G6B6TORGB15(this->_framebufferColor[i].r, this->_framebufferColor[i].g, this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
+	}
+	
+	return RENDER3DERROR_NOERR;
+}
+
+// Clears the renderer's buffers for a new frame (SSSE3 code path).
+//
+// The clear color (5 bits each for r/g/b/a), the opaque poly ID and the fog flag are
+// unpacked from renderState.clearColor; the clear depth comes from renderState.clearDepth.
+//
+// If renderState.enableClearImage is set, the clear image is read from texture slot 2
+// (color) and texture slot 3 (15-bit depth, with the fog flag in bit 15), offset by the
+// CLRIMAGE_OFFSET scroll register with 8-bit wraparound on both axes. The result is
+// written into the clearImage* member buffers and handed to ClearUsingImage(). Rows are
+// written bottom-up: source row iy lands in destination row
+// (GPU_FRAMEBUFFER_NATIVE_HEIGHT - 1 - iy).
+//
+// Returns the result of ClearUsingImage(). If that reports an error, or if the clear
+// image is disabled, returns the result of ClearUsingValues(clearColor, clearFragment).
+Render3DError Render3D_SSSE3::ClearFramebuffer(const GFX3D_State &renderState)
+{
+	FragmentColor clearColor;
+	clearColor.r = renderState.clearColor & 0x1F;
+	clearColor.g = (renderState.clearColor >> 5) & 0x1F;
+	clearColor.b = (renderState.clearColor >> 10) & 0x1F;
+	clearColor.a = (renderState.clearColor >> 16) & 0x1F;
+	
+	FragmentAttributes clearFragment;
+	clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F;
+	//special value for uninitialized translucent polyid. without this, fires in spiderman2 dont display
+	//I am not sure whether it is right, though. previously this was cleared to 0, as a guess,
+	//but in spiderman2 some fires with polyid 0 try to render on top of the background
+	clearFragment.translucentPolyID = kUnsetTranslucentPolyID;
+	clearFragment.depth = renderState.clearDepth;
+	clearFragment.stencil = 0;
+	clearFragment.isTranslucentPoly = 0;
+	clearFragment.isFogged = BIT15(renderState.clearColor);
+	
+	if (!renderState.enableClearImage)
+	{
+		return this->ClearUsingValues(clearColor, clearFragment);
+	}
+	
+	//the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers)
+	//uses the scroll registers in the main game engine
+	const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2];
+	const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3];
+	const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
+	const u8 xScroll = scrollBits & 0xFF;
+	const u8 yScroll = (scrollBits >> 8) & 0xFF;
+	
+	// Every pixel of the clear image gets the same opaque poly ID.
+	const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
+	
+	if (xScroll == 0 && yScroll == 0)
+	{
+		// No scrolling: source pixels are contiguous, so process 16 pixels per iteration.
+		// NOTE(review): the aligned loads below assume both texture slot pointers are
+		// 16-byte aligned -- confirm against how MMU.texInfo.textureSlotAddr is set up.
+		const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
+		
+		for (size_t iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
+		{
+			const size_t dstRowIndex = (GPU_FRAMEBUFFER_NATIVE_HEIGHT - 1 - iy) * GPU_FRAMEBUFFER_NATIVE_WIDTH;
+			
+			for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; ix += 16)
+			{
+				const size_t addr = (iy << 8) | ix;
+				const size_t dstIndex = dstRowIndex + ix;
+				
+				// Color is copied through unchanged.
+				_mm_store_si128((__m128i *)(this->clearImageColor16Buffer + dstIndex), _mm_load_si128((const __m128i *)(clearColorBuffer + addr)));
+				_mm_store_si128((__m128i *)(this->clearImageColor16Buffer + dstIndex + 8), _mm_load_si128((const __m128i *)(clearColorBuffer + addr + 8)));
+				
+				// Load the 16 depth texels once; both the depth and the fog flag come from them.
+				const __m128i texelLo_vec128 = _mm_load_si128((const __m128i *)(clearDepthBuffer + addr));
+				const __m128i texelHi_vec128 = _mm_load_si128((const __m128i *)(clearDepthBuffer + addr + 8));
+				
+				// The low 15 bits of each texel index the depth conversion table.
+				// _mm_extract_epi16 needs a compile-time lane index, hence the unrolled form.
+				const __m128i depthLo_vec128 = _mm_and_si128(texelLo_vec128, depthBitMask_vec128);
+				const __m128i depthHi_vec128 = _mm_and_si128(texelHi_vec128, depthBitMask_vec128);
+				
+				this->clearImageDepthBuffer[dstIndex+ 0] = dsDepthToD24_LUT[_mm_extract_epi16(depthLo_vec128, 0)];
+				this->clearImageDepthBuffer[dstIndex+ 1] = dsDepthToD24_LUT[_mm_extract_epi16(depthLo_vec128, 1)];
+				this->clearImageDepthBuffer[dstIndex+ 2] = dsDepthToD24_LUT[_mm_extract_epi16(depthLo_vec128, 2)];
+				this->clearImageDepthBuffer[dstIndex+ 3] = dsDepthToD24_LUT[_mm_extract_epi16(depthLo_vec128, 3)];
+				this->clearImageDepthBuffer[dstIndex+ 4] = dsDepthToD24_LUT[_mm_extract_epi16(depthLo_vec128, 4)];
+				this->clearImageDepthBuffer[dstIndex+ 5] = dsDepthToD24_LUT[_mm_extract_epi16(depthLo_vec128, 5)];
+				this->clearImageDepthBuffer[dstIndex+ 6] = dsDepthToD24_LUT[_mm_extract_epi16(depthLo_vec128, 6)];
+				this->clearImageDepthBuffer[dstIndex+ 7] = dsDepthToD24_LUT[_mm_extract_epi16(depthLo_vec128, 7)];
+				this->clearImageDepthBuffer[dstIndex+ 8] = dsDepthToD24_LUT[_mm_extract_epi16(depthHi_vec128, 0)];
+				this->clearImageDepthBuffer[dstIndex+ 9] = dsDepthToD24_LUT[_mm_extract_epi16(depthHi_vec128, 1)];
+				this->clearImageDepthBuffer[dstIndex+10] = dsDepthToD24_LUT[_mm_extract_epi16(depthHi_vec128, 2)];
+				this->clearImageDepthBuffer[dstIndex+11] = dsDepthToD24_LUT[_mm_extract_epi16(depthHi_vec128, 3)];
+				this->clearImageDepthBuffer[dstIndex+12] = dsDepthToD24_LUT[_mm_extract_epi16(depthHi_vec128, 4)];
+				this->clearImageDepthBuffer[dstIndex+13] = dsDepthToD24_LUT[_mm_extract_epi16(depthHi_vec128, 5)];
+				this->clearImageDepthBuffer[dstIndex+14] = dsDepthToD24_LUT[_mm_extract_epi16(depthHi_vec128, 6)];
+				this->clearImageDepthBuffer[dstIndex+15] = dsDepthToD24_LUT[_mm_extract_epi16(depthHi_vec128, 7)];
+				
+				// Bit 15 of each texel is the fog flag. Shifting right by 15 leaves 0 or 1 in
+				// every 16-bit lane; _mm_packus_epi16 then narrows the 16 words to 16 bytes in
+				// pixel order (bytes 0-7 from the first operand, bytes 8-15 from the second),
+				// so fog byte k lines up with color/depth element k.
+				const __m128i fogLo_vec128 = _mm_srli_epi16(texelLo_vec128, 15);
+				const __m128i fogHi_vec128 = _mm_srli_epi16(texelHi_vec128, 15);
+				_mm_store_si128((__m128i *)(this->clearImageFogBuffer + dstIndex), _mm_packus_epi16(fogLo_vec128, fogHi_vec128));
+				
+				_mm_store_si128((__m128i *)(this->clearImagePolyIDBuffer + dstIndex), opaquePolyID_vec128);
+			}
+		}
+	}
+	else
+	{
+		// Scrolled: the source x coordinate wraps at 256 within a row, so source pixels are
+		// not necessarily contiguous. Compute 8 source addresses at a time and gather.
+		const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+		const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
+		
+		for (size_t iy = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
+		{
+			const size_t y = ((iy + yScroll) & 0xFF) << 8;
+			const __m128i y_vec128 = _mm_set1_epi16((short)y);
+			const size_t dstRowIndex = (GPU_FRAMEBUFFER_NATIVE_HEIGHT - 1 - iy) * GPU_FRAMEBUFFER_NATIVE_WIDTH;
+			
+			for (size_t ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; ix += 8)
+			{
+				const size_t dstIndex = dstRowIndex + ix;
+				
+				// addr lane k = (wrapped source row << 8) | ((ix + k + xScroll) & 0xFF)
+				__m128i addr_vec128 = _mm_set1_epi16((short)(ix + xScroll));
+				addr_vec128 = _mm_add_epi16(addr_vec128, addrOffset);
+				addr_vec128 = _mm_and_si128(addr_vec128, addrRolloverMask);
+				addr_vec128 = _mm_or_si128(addr_vec128, y_vec128);
+				
+				this->clearImageColor16Buffer[dstIndex+0] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 0)];
+				this->clearImageColor16Buffer[dstIndex+1] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 1)];
+				this->clearImageColor16Buffer[dstIndex+2] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 2)];
+				this->clearImageColor16Buffer[dstIndex+3] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 3)];
+				this->clearImageColor16Buffer[dstIndex+4] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 4)];
+				this->clearImageColor16Buffer[dstIndex+5] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 5)];
+				this->clearImageColor16Buffer[dstIndex+6] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 6)];
+				this->clearImageColor16Buffer[dstIndex+7] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 7)];
+				
+				this->clearImageDepthBuffer[dstIndex+0] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] & 0x7FFF];
+				this->clearImageDepthBuffer[dstIndex+1] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] & 0x7FFF];
+				this->clearImageDepthBuffer[dstIndex+2] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] & 0x7FFF];
+				this->clearImageDepthBuffer[dstIndex+3] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] & 0x7FFF];
+				this->clearImageDepthBuffer[dstIndex+4] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] & 0x7FFF];
+				this->clearImageDepthBuffer[dstIndex+5] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] & 0x7FFF];
+				this->clearImageDepthBuffer[dstIndex+6] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] & 0x7FFF];
+				this->clearImageDepthBuffer[dstIndex+7] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] & 0x7FFF];
+				
+				this->clearImageFogBuffer[dstIndex+0] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] );
+				this->clearImageFogBuffer[dstIndex+1] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] );
+				this->clearImageFogBuffer[dstIndex+2] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] );
+				this->clearImageFogBuffer[dstIndex+3] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] );
+				this->clearImageFogBuffer[dstIndex+4] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] );
+				this->clearImageFogBuffer[dstIndex+5] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] );
+				this->clearImageFogBuffer[dstIndex+6] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] );
+				this->clearImageFogBuffer[dstIndex+7] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] );
+				
+				// Only the low 8 bytes (8 pixels) of the poly ID vector are stored here.
+				_mm_storel_epi64((__m128i *)(this->clearImagePolyIDBuffer + dstIndex), opaquePolyID_vec128);
+			}
+		}
+	}
+	
+	// Fall back to a flat clear if the image-based clear reports an error.
+	Render3DError error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
+	if (error != RENDER3DERROR_NOERR)
+	{
+		error = this->ClearUsingValues(clearColor, clearFragment);
+	}
+	
+	return error;
+}
+
+#endif // ENABLE_SSSE3
diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h
index eb24738ff..a967f0c02 100644
--- a/desmume/src/render3D.h
+++ b/desmume/src/render3D.h
@@ -77,8 +77,25 @@ struct FragmentAttributes
 	u8 opaquePolyID;
 	u8 translucentPolyID;
 	u8 stencil;
-	bool isFogged;
-	bool isTranslucentPoly;
+	u8 isFogged;
+	u8 isTranslucentPoly;
+};
+
+struct FragmentAttributesBuffer // One pointer per FragmentAttributes field plus a count; presumably parallel arrays of `count` elements -- confirm in render3D.cpp
+{
+	size_t count;
+	u32 *depth;
+	u8 *opaquePolyID;
+	u8 *translucentPolyID;
+	u8 *stencil;
+	u8 *isFogged;
+	u8 *isTranslucentPoly;
+	
+	FragmentAttributesBuffer(size_t newCount);
+	~FragmentAttributesBuffer(); // NOTE(review): no copy constructor/assignment is declared; if this frees the arrays, copying the struct would double-free -- confirm
+	
+	void SetAtIndex(const size_t index, const FragmentAttributes &attr);
+	void SetAll(const FragmentAttributes &attr);
 };
 
 class Render3D
@@ -92,10 +109,10 @@ protected:
 	size_t _framebufferColorSizeBytes;
 	FragmentColor *_framebufferColor;
 	
-	CACHE_ALIGN u16 clearImageColor16Buffer[GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT];
-	CACHE_ALIGN u32 clearImageDepthBuffer[GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT];
-	CACHE_ALIGN bool clearImageFogBuffer[GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT];
-	CACHE_ALIGN u8 clearImagePolyIDBuffer[GFX3D_FRAMEBUFFER_WIDTH * GFX3D_FRAMEBUFFER_HEIGHT];
+	CACHE_ALIGN u16 clearImageColor16Buffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
+	CACHE_ALIGN u32
clearImageDepthBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; + CACHE_ALIGN u8 clearImageFogBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; + CACHE_ALIGN u8 clearImagePolyIDBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT]; virtual Render3DError BeginRender(const GFX3D &engine); virtual Render3DError RenderGeometry(const GFX3D_State &renderState, const POLYLIST *polyList, const INDEXLIST *indexList); @@ -104,7 +121,7 @@ protected: virtual Render3DError EndRender(const u64 frameCount); virtual Render3DError FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551); - virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const bool *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); + virtual Render3DError ClearUsingImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer); virtual Render3DError ClearUsingValues(const FragmentColor &clearColor, const FragmentAttributes &clearAttributes) const; virtual Render3DError SetupPolygon(const POLY &thePoly); @@ -138,4 +155,27 @@ public: virtual Render3DError SetFramebufferSize(size_t w, size_t h); // Called whenever the output framebuffer size changes. 
};
 
+#ifdef ENABLE_SSE2
+
+class Render3D_SSE2 : public Render3D // Render3D variant compiled only when ENABLE_SSE2 is defined; overrides ClearFramebuffer() only
+{
+public:
+	virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState);
+};
+
 #endif
+
+#ifdef ENABLE_SSSE3
+
+class Render3D_SSSE3 : public Render3D_SSE2 // Compiled only when ENABLE_SSSE3 is defined; derives from the SSE2 variant and overrides both functions below
+{
+protected:
+	virtual Render3DError FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551); // The definition in render3D.cpp converts this->_framebufferColor into dstRGBA5551
+	
+public:
+	virtual Render3DError ClearFramebuffer(const GFX3D_State &renderState); // Builds the clear color/attributes from renderState and clears via ClearUsingImage() or ClearUsingValues()
+};
+
+#endif
+
+#endif // RENDER3D_H
diff --git a/desmume/src/types.h b/desmume/src/types.h
index 9071069cb..9b4838c67 100644
--- a/desmume/src/types.h
+++ b/desmume/src/types.h
@@ -86,6 +86,14 @@
 #undef ENABLE_SSE2
 #endif
 
+#ifndef ENABLE_SSE2 // Disable SSE3 whenever SSE2 is disabled
+#undef ENABLE_SSE3
+#endif
+
+#ifndef ENABLE_SSE3 // Likewise disable SSSE3 whenever SSE3 is disabled
+#undef ENABLE_SSSE3
+#endif
+
 #ifdef _MSC_VER
 #define strcasecmp(x,y) _stricmp(x,y)
 #define strncasecmp(x, y, l) strnicmp(x, y, l)
@@ -119,7 +127,12 @@
 #else
 #define DS_ALIGN(X)
 #endif
+
+#ifdef HOST_64 // HOST_64 builds align CACHE_ALIGN data to 64 bytes; all other builds keep 32
+#define CACHE_ALIGN DS_ALIGN(64)
+#else
 #define CACHE_ALIGN DS_ALIGN(32)
+#endif
 //use this for example when you want a byte value to be better-aligned
 #define FAST_ALIGN DS_ALIGN(4)
 //---------------------------------------------
diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp
index 11f02a83d..45d9ec074 100644
--- a/desmume/src/windows/main.cpp
+++ b/desmume/src/windows/main.cpp
@@ -4045,7 +4045,7 @@ void CloseRom()
 
 	// clear screen so the last frame we rendered doesn't stick around
 	// (TODO: maybe NDS_Reset should do this?)
-	memset(GPU_screen, 0xFF, sizeof(GPU_screen));
+	memset(GPU_screen, 0xFF, GPU_GetFramebufferWidth() * GPU_GetFramebufferHeight() * 2 * sizeof(u16)); // GPU_screen is a u16 pointer, so sizeof(GPU_screen) was only the pointer size; the "* 2" presumably covers both screens -- confirm
 	InvalidateRect(MainWindow->getHWnd(), NULL, TRUE); // make sure the window refreshes with the cleared screen