From 7e3f1d85ae9fcfc136ec4db590d37f1c6ace8d7d Mon Sep 17 00:00:00 2001 From: rogerman Date: Sat, 5 Sep 2015 22:35:34 +0000 Subject: [PATCH] GPU / MMU: - Do SSE2 optimization for direct-color sprite renders. - Make ARM9_LCD cache-aligned. Allows for SSE2 to perform aligned load/stores on certain operations, improving performance. - Further templatize some methods. - Do some misc. code cleanup. --- desmume/src/GPU.cpp | 404 +++++++++++++++++++++----------------- desmume/src/GPU.h | 23 ++- desmume/src/MMU.cpp | 323 +++++++++++++++--------------- desmume/src/MMU.h | 68 ++++--- desmume/src/NDSSystem.cpp | 2 +- desmume/src/render3D.cpp | 12 +- 6 files changed, 454 insertions(+), 378 deletions(-) diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 47d73e7ae..53b525941 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -159,63 +159,6 @@ FORCEINLINE void rot_BMP_map(GPUEngineBase *gpu, const s32 auxX, const s32 auxY, gpu->___setFinalColorBck(color, i, ((color & 0x8000) != 0)); } -typedef void (*rot_fun)(GPUEngineBase *gpu, const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *pal, const size_t i); - -template -void rot_scale_op(GPUEngineBase *gpu, const BGxPARMS ¶m, const u16 LG, const s32 wh, const s32 ht, const u32 map, const u32 tile, const u16 *pal) -{ - ROTOCOORD x, y; - x.val = param.BGxX; - y.val = param.BGxY; - - const s32 dx = (s32)param.BGxPA; - const s32 dy = (s32)param.BGxPC; - - // as an optimization, specially handle the fairly common case of - // "unrotated + unscaled + no boundary checking required" - if (dx == GPU_FRAMEBUFFER_NATIVE_WIDTH && dy == 0) - { - s32 auxX = (WRAP) ? x.bits.Integer & (wh-1) : x.bits.Integer; - const s32 auxY = (WRAP) ? 
y.bits.Integer & (ht-1) : y.bits.Integer; - - if (WRAP || (auxX + LG < wh && auxX >= 0 && auxY < ht && auxY >= 0)) - { - for (size_t i = 0; i < LG; i++) - { - fun(gpu, auxX, auxY, wh, map, tile, pal, i); - auxX++; - - if (WRAP) - auxX = auxX & (wh-1); - } - - return; - } - } - - for (size_t i = 0; i < LG; i++, x.val += dx, y.val += dy) - { - const s32 auxX = (WRAP) ? x.bits.Integer & (wh-1) : x.bits.Integer; - const s32 auxY = (WRAP) ? y.bits.Integer & (ht-1) : y.bits.Integer; - - if (WRAP || ((auxX >= 0) && (auxX < wh) && (auxY >= 0) && (auxY < ht))) - fun(gpu, auxX, auxY, wh, map, tile, pal, i); - } -} - -template -void apply_rot_fun(GPUEngineBase *gpu, const BGxPARMS ¶m, const u16 LG, const u32 map, const u32 tile, const u16 *pal) -{ - struct _BGxCNT *bgCnt = &(gpu->dispx_st)->dispx_BGxCNT[LAYERID].bits; - s32 wh = gpu->BGSize[LAYERID][0]; - s32 ht = gpu->BGSize[LAYERID][1]; - - if (bgCnt->PaletteSet_Wrap) - rot_scale_op(gpu, param, LG, wh, ht, map, tile, pal); - else - rot_scale_op(gpu, param, LG, wh, ht, map, tile, pal); -} - void gpu_savestate(EMUFILE* os) { const GPUEngineA *mainEngine = GPU->GetEngineMain(); @@ -351,6 +294,9 @@ void GPUEngineBase::_InitLUTs() GPUEngineBase::GPUEngineBase() { + _paletteBG = NULL; + _paletteOBJ = NULL; + debug = false; _InitLUTs(); workingScanline = NULL; @@ -419,7 +365,7 @@ void GPUEngineBase::_Reset_Base() this->_bgPrio[1] = 0; this->_bgPrio[2] = 0; this->_bgPrio[3] = 0; - this->_bgPrio[4] = 0xFF; + this->_bgPrio[4] = 0x7F; this->_bg0HasHighestPrio = true; @@ -677,54 +623,55 @@ void GPUEngineBase::SetVideoProp(const u32 ctrlBits) this->_sprEnable = cnt->OBJ_Enable; - this->SetBGProp(3, T1ReadWord(MMU.ARM9_REG, this->_engineID * ADDRESS_STEP_4KB + 14)); - this->SetBGProp(2, T1ReadWord(MMU.ARM9_REG, this->_engineID * ADDRESS_STEP_4KB + 12)); - this->SetBGProp(1, T1ReadWord(MMU.ARM9_REG, this->_engineID * ADDRESS_STEP_4KB + 10)); - this->SetBGProp(0, T1ReadWord(MMU.ARM9_REG, this->_engineID * ADDRESS_STEP_4KB + 8)); + 
this->SetBGProp( T1ReadWord(MMU.ARM9_REG, this->_engineID * ADDRESS_STEP_4KB + 14) ); + this->SetBGProp( T1ReadWord(MMU.ARM9_REG, this->_engineID * ADDRESS_STEP_4KB + 12) ); + this->SetBGProp( T1ReadWord(MMU.ARM9_REG, this->_engineID * ADDRESS_STEP_4KB + 10) ); + this->SetBGProp( T1ReadWord(MMU.ARM9_REG, this->_engineID * ADDRESS_STEP_4KB + 8) ); } //this handles writing in BGxCNT -void GPUEngineBase::SetBGProp(const size_t num, const u16 ctrlBits) +template +void GPUEngineBase::SetBGProp(const u16 ctrlBits) { - struct _BGxCNT *cnt = &((this->dispx_st)->dispx_BGxCNT[num].bits); + struct _BGxCNT *cnt = &((this->dispx_st)->dispx_BGxCNT[LAYERID].bits); struct _DISPCNT *dispCnt = &(this->dispx_st)->dispx_DISPCNT.bits; - this->dispx_st->dispx_BGxCNT[num].val = LE_TO_LOCAL_16(ctrlBits); + this->dispx_st->dispx_BGxCNT[LAYERID].val = LE_TO_LOCAL_16(ctrlBits); this->ResortBGLayers(); if (this->_engineID == GPUEngineID_Sub) { - this->_BG_tile_ram[num] = MMU_BBG; - this->_BG_bmp_ram[num] = MMU_BBG; - this->_BG_bmp_large_ram[num] = MMU_BBG; - this->_BG_map_ram[num] = MMU_BBG; + this->_BG_tile_ram[LAYERID] = MMU_BBG; + this->_BG_bmp_ram[LAYERID] = MMU_BBG; + this->_BG_bmp_large_ram[LAYERID] = MMU_BBG; + this->_BG_map_ram[LAYERID] = MMU_BBG; } else { - this->_BG_tile_ram[num] = MMU_ABG + dispCnt->CharacBase_Block * ADDRESS_STEP_64KB; - this->_BG_bmp_ram[num] = MMU_ABG; - this->_BG_bmp_large_ram[num] = MMU_ABG; - this->_BG_map_ram[num] = MMU_ABG + dispCnt->ScreenBase_Block * ADDRESS_STEP_64KB; + this->_BG_tile_ram[LAYERID] = MMU_ABG + dispCnt->CharacBase_Block * ADDRESS_STEP_64KB; + this->_BG_bmp_ram[LAYERID] = MMU_ABG; + this->_BG_bmp_large_ram[LAYERID] = MMU_ABG; + this->_BG_map_ram[LAYERID] = MMU_ABG + dispCnt->ScreenBase_Block * ADDRESS_STEP_64KB; } - this->_BG_tile_ram[num] += (cnt->CharacBase_Block * ADDRESS_STEP_16KB); - this->_BG_bmp_ram[num] += (cnt->ScreenBase_Block * ADDRESS_STEP_16KB); - this->_BG_map_ram[num] += (cnt->ScreenBase_Block * ADDRESS_STEP_2KB); + 
this->_BG_tile_ram[LAYERID] += (cnt->CharacBase_Block * ADDRESS_STEP_16KB); + this->_BG_bmp_ram[LAYERID] += (cnt->ScreenBase_Block * ADDRESS_STEP_16KB); + this->_BG_map_ram[LAYERID] += (cnt->ScreenBase_Block * ADDRESS_STEP_2KB); - switch (num) + switch (LAYERID) { case 0: case 1: - this->BGExtPalSlot[num] = cnt->PaletteSet_Wrap * 2 + num; + this->BGExtPalSlot[LAYERID] = cnt->PaletteSet_Wrap * 2 + LAYERID; break; default: - this->BGExtPalSlot[num] = (u8)num; + this->BGExtPalSlot[LAYERID] = (u8)LAYERID; break; } - BGType mode = GPUEngineBase::_mode2type[dispCnt->BG_Mode][num]; + BGType mode = GPUEngineBase::_mode2type[dispCnt->BG_Mode][LAYERID]; //clarify affine ext modes if (mode == BGType_AffineExt) @@ -746,12 +693,12 @@ void GPUEngineBase::SetBGProp(const size_t num, const u16 ctrlBits) } } - this->_BGTypes[num] = mode; + this->_BGTypes[LAYERID] = mode; - this->BGSize[num][0] = GPUEngineBase::_sizeTab[mode][cnt->ScreenSize][0]; - this->BGSize[num][1] = GPUEngineBase::_sizeTab[mode][cnt->ScreenSize][1]; + this->BGSize[LAYERID][0] = GPUEngineBase::_sizeTab[mode][cnt->ScreenSize][0]; + this->BGSize[LAYERID][1] = GPUEngineBase::_sizeTab[mode][cnt->ScreenSize][1]; - this->_bgPrio[num] = (ctrlBits & 0x3); + this->_bgPrio[LAYERID] = (ctrlBits & 0x3); } template @@ -789,6 +736,7 @@ void GPUEngineBase::SetLayerEnableState(const size_t layerIndex, bool theState) // ROUTINES FOR INSIDE / OUTSIDE WINDOW CHECKS /*****************************************************************************/ +// check whether (x,y) is within the rectangle (including wraparounds) template u8 GPUEngineBase::_WithinRect(const size_t x) const { @@ -1100,11 +1048,11 @@ FORCEINLINE void GPUEngineBase::_SetFinalColorSprite(const size_t srcX, const si template FORCEINLINE void GPUEngineBase::____setFinalColorBck(const u16 color, const size_t srcX) { - u16 *dstLine = this->currDst; - u8 *bgLine = this->_bgPixels; - if (ISCUSTOMRENDERINGNEEDED) { + u16 *dstLine = this->currDst; + u8 *bgLine = 
this->_bgPixels; + const NDSDisplayInfo &dispInfo = GPU->GetDisplayInfo(); for (size_t line = 0; line < _gpuDstLineCount[this->currLine]; line++) @@ -1130,8 +1078,8 @@ FORCEINLINE void GPUEngineBase::____setFinalColorBck(const u16 color, const size { this->_SetFinalColorBG(srcX, srcX, - dstLine, - bgLine, + this->currDst, + this->_bgPixels, color); } } @@ -1217,7 +1165,7 @@ void GPUEngineBase::_MosaicSpriteLinePixel(const size_t x, u16 l, u16 *dst, u8 * dst[x] = LE_TO_LOCAL_16(objColor.color); dst_alpha[x] = objColor.alpha; - if (!objColor.opaque) prioTab[x] = 0xFF; + if (!objColor.opaque) prioTab[x] = 0x7F; } void GPUEngineBase::_MosaicSpriteLine(u16 l, u16 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab) @@ -1228,6 +1176,61 @@ void GPUEngineBase::_MosaicSpriteLine(u16 l, u16 *dst, u8 *dst_alpha, u8 *typeTa this->_MosaicSpriteLinePixel(i, l, dst, dst_alpha, typeTab, prioTab); } +template +void GPUEngineBase::_rot_scale_op(const BGxPARMS &param, const u16 LG, const s32 wh, const s32 ht, const u32 map, const u32 tile, const u16 *pal) +{ + ROTOCOORD x, y; + x.val = param.BGxX; + y.val = param.BGxY; + + const s32 dx = (s32)param.BGxPA; + const s32 dy = (s32)param.BGxPC; + + // as an optimization, specially handle the fairly common case of + // "unrotated + unscaled + no boundary checking required" + if (dx == GPU_FRAMEBUFFER_NATIVE_WIDTH && dy == 0) + { + s32 auxX = (WRAP) ? x.bits.Integer & (wh-1) : x.bits.Integer; + const s32 auxY = (WRAP) ? y.bits.Integer & (ht-1) : y.bits.Integer; + + if (WRAP || (auxX + LG < wh && auxX >= 0 && auxY < ht && auxY >= 0)) + { + for (size_t i = 0; i < LG; i++) + { + fun(this, auxX, auxY, wh, map, tile, pal, i); + auxX++; + + if (WRAP) + auxX = auxX & (wh-1); + } + + return; + } + } + + for (size_t i = 0; i < LG; i++, x.val += dx, y.val += dy) + { + const s32 auxX = (WRAP) ? x.bits.Integer & (wh-1) : x.bits.Integer; + const s32 auxY = (WRAP) ? 
y.bits.Integer & (ht-1) : y.bits.Integer; + + if (WRAP || ((auxX >= 0) && (auxX < wh) && (auxY >= 0) && (auxY < ht))) + fun(this, auxX, auxY, wh, map, tile, pal, i); + } +} + +template +void GPUEngineBase::_apply_rot_fun(const BGxPARMS &param, const u16 LG, const u32 map, const u32 tile, const u16 *pal) +{ + struct _BGxCNT *bgCnt = &(this->dispx_st)->dispx_BGxCNT[LAYERID].bits; + s32 wh = this->BGSize[LAYERID][0]; + s32 ht = this->BGSize[LAYERID][1]; + + if (bgCnt->PaletteSet_Wrap) + this->_rot_scale_op(param, LG, wh, ht, map, tile, pal); + else + this->_rot_scale_op(param, LG, wh, ht, map, tile, pal); +} + template void GPUEngineBase::_LineLarge8bpp() { @@ -1250,12 +1253,10 @@ void GPUEngineBase::_LineLarge8bpp() u32 tmp_map = this->_BG_bmp_large_ram[LAYERID] + lg * YBG; u8 *map = (u8 *)MMU_gpu_map(tmp_map); - const u16 *pal = (u16 *)(MMU.ARM9_VMEM + this->_engineID * ADDRESS_STEP_1KB); - for (size_t x = 0; x < lg; ++x, ++XBG) { XBG &= wmask; - const u16 color = LE_TO_LOCAL_16( pal[map[XBG]] ); + const u16 color = LE_TO_LOCAL_16( this->_paletteBG[map[XBG]] ); this->__setFinalColorBck(color,x,(color!=0)); } } @@ -1295,7 +1296,7 @@ void GPUEngineBase::_RenderLine_TextBG(u16 XBG, u16 YBG, u16 LG) if (!bgCnt->Palette_256) // color: 16 palette entries { - const u16 *pal = (u16 *)(MMU.ARM9_VMEM + this->_engineID * ADDRESS_STEP_1KB); + const u16 *pal = this->_paletteBG; yoff = ((YBG&7)<<2); xfin = 8 - (xoff&7); @@ -1360,11 +1361,7 @@ void GPUEngineBase::_RenderLine_TextBG(u16 XBG, u16 YBG, u16 LG) } else //256-color BG { - const u16 *pal = (dispCnt->ExBGxPalette_Enable) ? (u16 *)MMU.ExtPal[this->_engineID][this->BGExtPalSlot[LAYERID]] : (u16 *)(MMU.ARM9_VMEM + this->_engineID * ADDRESS_STEP_1KB); - if (pal == NULL) - { - return; - } + const u16 *pal = (dispCnt->ExBGxPalette_Enable) ? 
(u16 *)MMU.ExtPal[this->_engineID][this->BGExtPalSlot[LAYERID]] : this->_paletteBG; yoff = ((YBG&7)<<3); xfin = 8 - (xoff&7); @@ -1407,9 +1404,8 @@ void GPUEngineBase::_RenderLine_TextBG(u16 XBG, u16 YBG, u16 LG) template void GPUEngineBase::_RotBG2(const BGxPARMS &param, const u16 LG) { - const u16 *pal = (u16 *)(MMU.ARM9_VMEM + this->_engineID * ADDRESS_STEP_1KB); // printf("rot mode\n"); - apply_rot_fun< LAYERID, rot_tiled_8bit_entry >(this, param, LG, this->_BG_map_ram[LAYERID], this->_BG_tile_ram[LAYERID], pal); + this->_apply_rot_fun< LAYERID, rot_tiled_8bit_entry >(param, LG, this->_BG_map_ram[LAYERID], this->_BG_tile_ram[LAYERID], this->_paletteBG); } template @@ -1417,41 +1413,43 @@ void GPUEngineBase::_ExtRotBG2(const BGxPARMS &param, const u16 LG) { struct _DISPCNT *dispCnt = &(this->dispx_st)->dispx_DISPCNT.bits; - u16 *pal = NULL; + u16 *pal = this->_paletteBG; switch (this->_BGTypes[LAYERID]) { case BGType_AffineExt_256x16: // 16 bit bgmap entries - pal = (dispCnt->ExBGxPalette_Enable) ? 
(u16 *)(MMU.ExtPal[this->_engineID][this->BGExtPalSlot[LAYERID]]) : (u16 *)(MMU.ARM9_VMEM + this->_engineID * ADDRESS_STEP_1KB); - if (pal == NULL) return; - - if(dispCnt->ExBGxPalette_Enable) - apply_rot_fun< LAYERID, rot_tiled_16bit_entry >(this, param, LG, this->_BG_map_ram[LAYERID], this->_BG_tile_ram[LAYERID], pal); + { + if (dispCnt->ExBGxPalette_Enable) + { + pal = (u16 *)(MMU.ExtPal[this->_engineID][this->BGExtPalSlot[LAYERID]]); + this->_apply_rot_fun< LAYERID, rot_tiled_16bit_entry >(param, LG, this->_BG_map_ram[LAYERID], this->_BG_tile_ram[LAYERID], pal); + } else - apply_rot_fun< LAYERID, rot_tiled_16bit_entry >(this, param, LG, this->_BG_map_ram[LAYERID], this->_BG_tile_ram[LAYERID], pal); + { + this->_apply_rot_fun< LAYERID, rot_tiled_16bit_entry >(param, LG, this->_BG_map_ram[LAYERID], this->_BG_tile_ram[LAYERID], pal); + } break; + } case BGType_AffineExt_256x1: // 256 colors - pal = (u16 *)(MMU.ARM9_VMEM + this->_engineID * ADDRESS_STEP_1KB); - apply_rot_fun< LAYERID, rot_256_map >(this, param, LG, this->_BG_bmp_ram[LAYERID], 0, pal); + this->_apply_rot_fun< LAYERID, rot_256_map >(param, LG, this->_BG_bmp_ram[LAYERID], 0, pal); break; case BGType_AffineExt_Direct: // direct colors / BMP { if (ISCUSTOMRENDERINGNEEDED && (LAYERID == this->vramBGLayer)) { - apply_rot_fun< LAYERID, rot_BMP_map >(this, param, LG, this->_BG_bmp_ram[LAYERID], 0, NULL); + this->_apply_rot_fun< LAYERID, rot_BMP_map >(param, LG, this->_BG_bmp_ram[LAYERID], 0, pal); } else { - apply_rot_fun< LAYERID, rot_BMP_map >(this, param, LG, this->_BG_bmp_ram[LAYERID], 0, NULL); + this->_apply_rot_fun< LAYERID, rot_BMP_map >(param, LG, this->_BG_bmp_ram[LAYERID], 0, pal); } break; } case BGType_Large8bpp: // large screen 256 colors - pal = (u16 *)(MMU.ARM9_VMEM + this->_engineID * ADDRESS_STEP_1KB); - apply_rot_fun< LAYERID, rot_256_map >(this, param, LG, this->_BG_bmp_large_ram[LAYERID], 0, pal); + this->_apply_rot_fun< LAYERID, rot_256_map >(param, LG, 
this->_BG_bmp_large_ram[LAYERID], 0, pal); break; default: @@ -1526,9 +1524,54 @@ void GPUEngineBase::_LineExtRot() /* http://nocash.emubase.de/gbatek.htm#dsvideoobjs */ void GPUEngineBase::_RenderSpriteBMP(const u8 spriteNum, const u16 l, u16 *dst, const u32 srcadr, u8 *dst_alpha, u8 *typeTab, u8 *prioTab, const u8 prio, const size_t lg, size_t sprX, size_t x, const s32 xdir, const u8 alpha) { - for (size_t i = 0; i < lg; i++, ++sprX, x += xdir) + const u16 *bmpBuffer = (u16 *)MMU_gpu_map(srcadr); + size_t i = 0; + +#ifdef ENABLE_SSE2 + if (xdir == 1) { - const u16 color = LE_TO_LOCAL_16( *(u16 *)MMU_gpu_map(srcadr + (x << 1)) ); + const __m128i prio_vec128 = _mm_set1_epi8(prio); + + const size_t ssePixCount = lg - (lg % 16); + for (; i < ssePixCount; i += 16, x += 16, sprX += 16) + { + __m128i prioTab_vec128 = _mm_load_si128((__m128i *)(prioTab + sprX)); + const __m128i prioCompare = _mm_cmplt_epi8(prio_vec128, prioTab_vec128); + + __m128i colorLo_vec128 = _mm_load_si128((__m128i *)(bmpBuffer + x)); + __m128i colorHi_vec128 = _mm_load_si128((__m128i *)(bmpBuffer + x + 8)); + + const __m128i colorAlphaLo_vec128 = _mm_and_si128(colorLo_vec128, _mm_set1_epi16(0x8000)); + const __m128i colorAlphaHi_vec128 = _mm_and_si128(colorHi_vec128, _mm_set1_epi16(0x8000)); + + const __m128i colorAlphaLoCompare = _mm_cmpeq_epi16(colorAlphaLo_vec128, _mm_set1_epi16(0x8000)); + const __m128i colorAlphaHiCompare = _mm_cmpeq_epi16(colorAlphaHi_vec128, _mm_set1_epi16(0x8000)); + const __m128i colorAlphaPackedCompare = _mm_cmpeq_epi8( _mm_packs_epi16(colorAlphaLoCompare, colorAlphaHiCompare), _mm_set1_epi8(0xFF) ); + + const __m128i combinedPackedCompare = _mm_and_si128(prioCompare, colorAlphaPackedCompare); + const __m128i combinedLoCompare = _mm_cmpeq_epi16( _mm_unpacklo_epi8(combinedPackedCompare, _mm_setzero_si128()), _mm_set1_epi16(0x00FF) ); + const __m128i combinedHiCompare = _mm_cmpeq_epi16( _mm_unpackhi_epi8(combinedPackedCompare, _mm_setzero_si128()), _mm_set1_epi16(0x00FF) 
); + + colorLo_vec128 = _mm_or_si128( _mm_and_si128(combinedLoCompare, colorLo_vec128), _mm_andnot_si128(combinedLoCompare, _mm_load_si128((__m128i *)(dst + sprX))) ); + colorHi_vec128 = _mm_or_si128( _mm_and_si128(combinedHiCompare, colorHi_vec128), _mm_andnot_si128(combinedHiCompare, _mm_load_si128((__m128i *)(dst + sprX + 8))) ); + const __m128i dstAlpha_vec128 = _mm_or_si128( _mm_and_si128(combinedPackedCompare, _mm_set1_epi8(alpha + 1)), _mm_andnot_si128(combinedPackedCompare, _mm_load_si128((__m128i *)(dst_alpha + sprX))) ); + const __m128i dstTypeTab_vec128 = _mm_or_si128( _mm_and_si128(combinedPackedCompare, _mm_set1_epi8(3)), _mm_andnot_si128(combinedPackedCompare, _mm_load_si128((__m128i *)(typeTab + sprX))) ); + prioTab_vec128 = _mm_or_si128( _mm_and_si128(combinedPackedCompare, prio_vec128), _mm_andnot_si128(combinedPackedCompare, prioTab_vec128) ); + const __m128i sprNum_vec128 = _mm_or_si128( _mm_and_si128(combinedPackedCompare, _mm_set1_epi8(spriteNum)), _mm_andnot_si128(combinedPackedCompare, _mm_load_si128((__m128i *)(this->_sprNum + sprX))) ); + + _mm_store_si128((__m128i *)(dst + sprX), colorLo_vec128); + _mm_store_si128((__m128i *)(dst + sprX + 8), colorHi_vec128); + _mm_store_si128((__m128i *)(dst_alpha + sprX), dstAlpha_vec128); + _mm_store_si128((__m128i *)(typeTab + sprX), dstTypeTab_vec128); + _mm_store_si128((__m128i *)(prioTab + sprX), prioTab_vec128); + _mm_store_si128((__m128i *)(this->_sprNum + sprX), sprNum_vec128); + } + } +#endif + + for (; i < lg; i++, sprX++, x += xdir) + { + const u16 color = LE_TO_LOCAL_16(bmpBuffer[x]); //a cleared alpha bit suppresses the pixel from processing entirely; it doesnt exist if ((color & 0x8000) && (prio < prioTab[sprX])) @@ -1708,12 +1751,11 @@ void GPUEngineBase::SpriteRender(u16 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioT template void GPUEngineBase::_SpriteRenderPerform(u16 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab) { - u16 l = currLine; + u16 l = this->currLine; size_t cost = 0; struct 
_DISPCNT *dispCnt = &(this->dispx_st)->dispx_DISPCNT.bits; - u8 block = this->_sprBoundary; - + for (size_t i = 0; i < 128; i++) { const OAMAttributes &spriteInfo = this->_oamList[i]; @@ -1733,6 +1775,7 @@ void GPUEngineBase::_SpriteRenderPerform(u16 *dst, u8 *dst_alpha, u8 *typeTab, u s32 sprX, sprY, x, y, lg; s32 xdir; u8 prio; + u16 *pal; u8 *src; u32 srcadr; @@ -1746,7 +1789,6 @@ void GPUEngineBase::_SpriteRenderPerform(u16 *dst, u8 *dst_alpha, u8 *typeTab, u { s32 fieldX, fieldY, auxX, auxY, realX, realY, offset; u8 blockparameter; - u16 *pal; s16 dx, dmx, dy, dmy; u16 colour; @@ -1818,13 +1860,10 @@ void GPUEngineBase::_SpriteRenderPerform(u16 *dst, u8 *dst_alpha, u8 *typeTab, u // If we are using 1 palette of 256 colours if (spriteInfo.Depth) { - src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << block)); + src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << this->_sprBoundary)); // If extended palettes are set, use them - if (dispCnt->ExOBJPalette_Enable) - pal = (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*0x200)); - else - pal = (u16 *)(MMU.ARM9_VMEM + 0x200 + this->_engineID * ADDRESS_STEP_1KB); + pal = (dispCnt->ExOBJPalette_Enable) ? 
(u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*0x200)) : this->_paletteOBJ; for (size_t j = 0; j < lg; ++j, ++sprX) { @@ -1908,13 +1947,13 @@ void GPUEngineBase::_SpriteRenderPerform(u16 *dst, u8 *dst_alpha, u8 *typeTab, u if (MODE == SpriteRenderMode_Sprite2D) { src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << 5)); - pal = (u16 *)(MMU.ARM9_VMEM + 0x200 + (this->_engineID * ADDRESS_STEP_1KB) + (spriteInfo.PaletteIndex * 32)); } else { src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex << this->_sprBoundary)); - pal = (u16 *)(MMU.ARM9_VMEM + 0x200 + (this->_engineID * ADDRESS_STEP_1KB) + (spriteInfo.PaletteIndex * 32)); } + + pal = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4); for (size_t j = 0; j < lg; ++j, ++sprX) { @@ -1977,9 +2016,9 @@ void GPUEngineBase::_SpriteRenderPerform(u16 *dst, u8 *dst_alpha, u8 *typeTab, u else { if (spriteInfo.Depth) - src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex<>3)*sprSize.x*8) + ((y&0x7)*8)); + src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex<_sprBoundary) + ((y>>3)*sprSize.x*8) + ((y&0x7)*8)); else - src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex<>3)*sprSize.x*4) + ((y&0x7)*4)); + src = (u8 *)MMU_gpu_map(this->_sprMem + (spriteInfo.TileIndex<_sprBoundary) + ((y>>3)*sprSize.x*4) + ((y&0x7)*4)); } this->_RenderSpriteWin(src, (spriteInfo.Depth != 0), lg, sprX, x, xdir); @@ -1999,9 +2038,9 @@ void GPUEngineBase::_SpriteRenderPerform(u16 *dst, u8 *dst_alpha, u8 *typeTab, u if (MODE == SpriteRenderMode_Sprite2D) srcadr = this->_sprMem + ((spriteInfo.TileIndex)<<5) + ((y>>3)<<10) + ((y&0x7)*8); else - srcadr = this->_sprMem + (spriteInfo.TileIndex<>3)*sprSize.x*8) + ((y&0x7)*8); + srcadr = this->_sprMem + (spriteInfo.TileIndex<_sprBoundary) + ((y>>3)*sprSize.x*8) + ((y&0x7)*8); - const u16 *pal = (dispCnt->ExOBJPalette_Enable) ? 
(u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*0x200)) : (u16 *)(MMU.ARM9_VMEM + 0x200 + this->_engineID * ADDRESS_STEP_1KB); + pal = (dispCnt->ExOBJPalette_Enable) ? (u16 *)(MMU.ObjExtPal[this->_engineID][0]+(spriteInfo.PaletteIndex*0x200)) : this->_paletteOBJ; this->_RenderSprite256(i, l, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, spriteInfo.Mode == 1); } else // 16 colors @@ -2012,10 +2051,10 @@ void GPUEngineBase::_SpriteRenderPerform(u16 *dst, u8 *dst_alpha, u8 *typeTab, u } else { - srcadr = this->_sprMem + (spriteInfo.TileIndex<>3)*sprSize.x*4) + ((y&0x7)*4); + srcadr = this->_sprMem + (spriteInfo.TileIndex<_sprBoundary) + ((y>>3)*sprSize.x*4) + ((y&0x7)*4); } - const u16 *pal = (u16 *)(MMU.ARM9_VMEM + 0x200 + this->_engineID * ADDRESS_STEP_1KB) + (spriteInfo.PaletteIndex << 4); + pal = this->_paletteOBJ + (spriteInfo.PaletteIndex << 4); this->_RenderSprite16(l, dst, srcadr, pal, dst_alpha, typeTab, prioTab, prio, lg, sprX, x, xdir, spriteInfo.Mode == 1); } } @@ -2259,7 +2298,7 @@ void GPUEngineBase::UpdateVRAM3DUsageProperties_OBJLayer(const size_t bankIndex, if ( (spriteInfo.RotScale != 2) && ((spriteInfo.RotScale & 1) == 0) && (spriteInfo.Mode == 3) && (spriteInfo.PaletteIndex != 0) ) { - const u32 vramAddress = ( (spriteInfo.TileIndex & 0x1F) * 0x10 ) + ( (spriteInfo.TileIndex & ~0x1F) * 0x80 ); + const u32 vramAddress = ((spriteInfo.TileIndex & 0x1F) << 5) + ((spriteInfo.TileIndex & ~0x1F) << 7); const SpriteSize sprSize = GPUEngineBase::_sprSizeTab[spriteInfo.Size][spriteInfo.Shape]; if( (vramAddress == (mainEngine->dispCapCnt.writeOffset * ADDRESS_STEP_32KB)) && (sprSize.x == 64) && (sprSize.y == 64) ) @@ -2272,58 +2311,62 @@ void GPUEngineBase::UpdateVRAM3DUsageProperties_OBJLayer(const size_t bankIndex, } } -u32 GPUEngineBase::getAffineStart(const size_t layer, int xy) +template +u32 GPUEngineBase::getAffineStart() { - if (xy == 0) - return affineInfo[layer-2].x; + if (SET_XY == 0) + return 
this->affineInfo[LAYERID-2].x; else - return affineInfo[layer-2].y; + return this->affineInfo[LAYERID-2].y; } -void GPUEngineBase::setAffineStartWord(const size_t layer, int xy, u16 val, int word) +template +void GPUEngineBase::setAffineStartWord(u16 val) { - u32 curr = getAffineStart(layer, xy); + u32 curr = this->getAffineStart(); - if (word == 0) + if (!HIWORD) curr = (curr & 0xFFFF0000) | val; else curr = (curr & 0x0000FFFF) | (((u32)val) << 16); - setAffineStart(layer, xy, curr); + this->setAffineStart(curr); } -void GPUEngineBase::setAffineStart(const size_t layer, int xy, u32 val) +template +void GPUEngineBase::setAffineStart(u32 val) { - if (xy == 0) - affineInfo[layer-2].x = val; + if (SET_XY == 0) + this->affineInfo[LAYERID-2].x = val; else - affineInfo[layer-2].y = val; + this->affineInfo[LAYERID-2].y = val; - refreshAffineStartRegs(layer, xy); + this->refreshAffineStartRegs(); } -void GPUEngineBase::refreshAffineStartRegs(const int num, const int xy) +template +void GPUEngineBase::refreshAffineStartRegs() { - if (num == -1) + if (LAYERID == -1) { - refreshAffineStartRegs(2, xy); - refreshAffineStartRegs(3, xy); + this->refreshAffineStartRegs(); + this->refreshAffineStartRegs(); return; } - if (xy == -1) + if (SET_XY == -1) { - refreshAffineStartRegs(num, 0); - refreshAffineStartRegs(num, 1); + this->refreshAffineStartRegs(); + this->refreshAffineStartRegs(); return; } - BGxPARMS *params = (num == 2) ? &(dispx_st)->dispx_BG2PARMS : &(dispx_st)->dispx_BG3PARMS; + BGxPARMS *params = (LAYERID == GPULayerID_BG2) ? 
&(dispx_st)->dispx_BG2PARMS : &(dispx_st)->dispx_BG3PARMS; - if (xy == 0) - params->BGxX = affineInfo[num-2].x; + if (SET_XY == 0) + params->BGxX = this->affineInfo[LAYERID-2].x; else - params->BGxY = affineInfo[num-2].y; + params->BGxY = this->affineInfo[LAYERID-2].y; } template @@ -2737,6 +2780,8 @@ void GPUEngineBase::REG_DISPx_pack_test() GPUEngineA::GPUEngineA() { _engineID = GPUEngineID_Main; + _paletteBG = (u16 *)MMU.ARM9_VMEM; + _paletteOBJ = (u16 *)(MMU.ARM9_VMEM + 0x200); _oamList = (OAMAttributes *)(MMU.ARM9_OAM); _sprMem = MMU_AOBJ; dispx_st = (REG_DISPx *)MMU.ARM9_REG; @@ -2888,7 +2933,7 @@ void GPUEngineA::RenderLine(const u16 l, bool skip) //bubble bobble revolution classic mode //NOTE: //I am REALLY unsatisfied with this logic now. But it seems to be working.. - this->refreshAffineStartRegs(-1,-1); + this->refreshAffineStartRegs<(GPULayerID)-1, -1>(); } if (skip) @@ -3019,7 +3064,7 @@ void GPUEngineA::_RenderLine_Layer(const u16 l, u16 *dstLine, const size_t dstLi this->_currentFadeInColors = &GPUEngineBase::_fadeInColors[this->_BLDY_EVY][0]; this->_currentFadeOutColors = &GPUEngineBase::_fadeOutColors[this->_BLDY_EVY][0]; - const u16 backdrop_color = T1ReadWord(MMU.ARM9_VMEM, 0) & 0x7FFF; + const u16 backdrop_color = LE_TO_LOCAL_16(this->_paletteBG[0]) & 0x7FFF; //we need to write backdrop colors in the same way as we do BG pixels in order to do correct window processing //this is currently eating up 2fps or so. it is a reasonable candidate for optimization. 
@@ -3056,7 +3101,7 @@ void GPUEngineA::_RenderLine_Layer(const u16 l, u16 *dstLine, const size_t dstLi // init background color & priorities memset(this->_sprAlpha, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); memset(this->_sprType, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); - memset(this->_sprPrio, 0xFF, GPU_FRAMEBUFFER_NATIVE_WIDTH); + memset(this->_sprPrio, 0x7F, GPU_FRAMEBUFFER_NATIVE_WIDTH); memset(this->_sprWin, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); // init pixels priorities @@ -3275,7 +3320,6 @@ void GPUEngineA::_RenderLine_DisplayCapture(const u16 l) cap_dst_adr &= 0x1FFFF; cap_dst_adr += vramWriteBlock * GPU_VRAM_BLOCK_LINES * GPU_FRAMEBUFFER_NATIVE_WIDTH * sizeof(u16); - // TODO: Make MMU.blank_memory and MMU.ARM9_LCD 16-byte aligned so that we can use aligned load/store for better performance. const u16 *cap_src = (u16 *)MMU.blank_memory; u16 *cap_dst = (u16 *)(MMU.ARM9_LCD + cap_dst_adr); @@ -3507,7 +3551,7 @@ void GPUEngineA::_RenderLine_DispCapture_Copy(const u16 *__restrict src, u16 *__ if (CAPTUREFROMNATIVESRC) { #ifdef ENABLE_SSE2 - MACRODO_N(CAPTURELENGTH / (sizeof(__m128i) / sizeof(u16)), _mm_storeu_si128((__m128i *)dst + X, _mm_or_si128( _mm_loadu_si128( (__m128i *)src + X), alpha_vec128 ) )); + MACRODO_N(CAPTURELENGTH / (sizeof(__m128i) / sizeof(u16)), _mm_store_si128((__m128i *)dst + X, _mm_or_si128( _mm_load_si128( (__m128i *)src + X), alpha_vec128 ) )); #else for (size_t i = 0; i < CAPTURELENGTH; i++) { @@ -3742,7 +3786,7 @@ void GPUEngineA::_RenderLine_DispCapture_Blend(const u16 *__restrict srcA, const srcA[_gpuDstPitchIndex[i+1]], srcA[_gpuDstPitchIndex[i+0]]); - __m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? _mm_loadu_si128((__m128i *)(srcB + i)) : _mm_set_epi16(srcB[_gpuDstPitchIndex[i+7]], + __m128i srcB_vec128 = (CAPTUREFROMNATIVESRCB) ? 
_mm_load_si128((__m128i *)(srcB + i)) : _mm_set_epi16(srcB[_gpuDstPitchIndex[i+7]], srcB[_gpuDstPitchIndex[i+6]], srcB[_gpuDstPitchIndex[i+5]], srcB[_gpuDstPitchIndex[i+4]], @@ -3751,7 +3795,7 @@ void GPUEngineA::_RenderLine_DispCapture_Blend(const u16 *__restrict srcA, const srcB[_gpuDstPitchIndex[i+1]], srcB[_gpuDstPitchIndex[i+0]]); - _mm_storeu_si128( (__m128i *)(dst + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); + _mm_store_si128( (__m128i *)(dst + i), this->_RenderLine_DispCapture_BlendFunc_SSE2(srcA_vec128, srcB_vec128, blendEVA_vec128, blendEVB_vec128) ); } #else for (size_t i = 0; i < CAPTURELENGTH; i++) @@ -3788,6 +3832,8 @@ void GPUEngineA::_RenderLine_DispCapture_Blend(const u16 *__restrict srcA, const GPUEngineB::GPUEngineB() { _engineID = GPUEngineID_Sub; + _paletteBG = (u16 *)(MMU.ARM9_VMEM + ADDRESS_STEP_1KB); + _paletteOBJ = (u16 *)(MMU.ARM9_VMEM + ADDRESS_STEP_1KB + 0x200); _oamList = (OAMAttributes *)(MMU.ARM9_OAM + ADDRESS_STEP_1KB); _sprMem = MMU_BOBJ; dispx_st = (REG_DISPx *)(&MMU.ARM9_REG[REG_DISPB]); @@ -3837,7 +3883,7 @@ void GPUEngineB::RenderLine(const u16 l, bool skip) //bubble bobble revolution classic mode //NOTE: //I am REALLY unsatisfied with this logic now. But it seems to be working.. - this->refreshAffineStartRegs(-1,-1); + this->refreshAffineStartRegs<(GPULayerID)-1, -1>(); } if (skip) @@ -3934,7 +3980,7 @@ void GPUEngineB::_RenderLine_Layer(const u16 l, u16 *dstLine, const size_t dstLi this->_currentFadeInColors = &GPUEngineBase::_fadeInColors[this->_BLDY_EVY][0]; this->_currentFadeOutColors = &GPUEngineBase::_fadeOutColors[this->_BLDY_EVY][0]; - const u16 backdrop_color = T1ReadWord(MMU.ARM9_VMEM, ADDRESS_STEP_1KB) & 0x7FFF; + const u16 backdrop_color = LE_TO_LOCAL_16(this->_paletteBG[0]) & 0x7FFF; //we need to write backdrop colors in the same way as we do BG pixels in order to do correct window processing //this is currently eating up 2fps or so. 
it is a reasonable candidate for optimization. @@ -3971,7 +4017,7 @@ void GPUEngineB::_RenderLine_Layer(const u16 l, u16 *dstLine, const size_t dstLi // init background color & priorities memset(this->_sprAlpha, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); memset(this->_sprType, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); - memset(this->_sprPrio, 0xFF, GPU_FRAMEBUFFER_NATIVE_WIDTH); + memset(this->_sprPrio, 0x7F, GPU_FRAMEBUFFER_NATIVE_WIDTH); memset(this->_sprWin, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH); // init pixels priorities @@ -4029,9 +4075,6 @@ void GPUEngineB::_RenderLine_Layer(const u16 l, u16 *dstLine, const size_t dstLi struct _BGxCNT *bgCnt = &(this->dispx_st)->dispx_BGxCNT[layerID].bits; this->_curr_mosaic_enabled = bgCnt->Mosaic_Enable; - //useful for debugging individual layers - //if(this->core == GPUEngineID_Sub || layerNum != 2) continue; - #ifndef DISABLE_MOSAIC if (this->_curr_mosaic_enabled) { @@ -4531,10 +4574,6 @@ void GPUSubsystem::RenderLine(const u16 l, bool skip) this->_engineSub->RenderLine(l, skip); } - if (l == 191) - { - - } } void GPUSubsystem::ClearWithColor(const u16 colorBGRA5551) @@ -4581,3 +4620,18 @@ void NDSDisplay::SetEngineByID(const GPUEngineID theID) this->_gpu = (theID == GPUEngineID_Main) ? 
(GPUEngineBase *)GPU->GetEngineMain() : (GPUEngineBase *)GPU->GetEngineSub(); this->_gpu->SetDisplayByID(this->_ID); } + +template void GPUEngineBase::setAffineStart(u32 val); +template void GPUEngineBase::setAffineStart(u32 val); +template void GPUEngineBase::setAffineStart(u32 val); +template void GPUEngineBase::setAffineStart(u32 val); + +template void GPUEngineBase::setAffineStartWord(u16 val); +template void GPUEngineBase::setAffineStartWord(u16 val); +template void GPUEngineBase::setAffineStartWord(u16 val); +template void GPUEngineBase::setAffineStartWord(u16 val); + +template void GPUEngineBase::setAffineStartWord(u16 val); +template void GPUEngineBase::setAffineStartWord(u16 val); +template void GPUEngineBase::setAffineStartWord(u16 val); +template void GPUEngineBase::setAffineStartWord(u16 val); diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index 941207610..f89a7a95d 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -30,6 +30,7 @@ #include #endif +class GPUEngineBase; class EMUFILE; struct MMU_struct; @@ -44,6 +45,8 @@ struct MMU_struct; void gpu_savestate(EMUFILE* os); bool gpu_loadstate(EMUFILE* is, int size); +typedef void (*rot_fun)(GPUEngineBase *gpu, const s32 auxX, const s32 auxY, const int lg, const u32 map, const u32 tile, const u16 *pal, const size_t i); + /******************************************************************************* this structure is for display control, it holds flags for general display @@ -751,12 +754,14 @@ protected: } _mosaicColors; GPUEngineID _engineID; + u16 *_paletteBG; + u16 *_paletteOBJ; + OAMAttributes *_oamList; + u32 _sprMem; u8 _bgPrio[5]; bool _bg0HasHighestPrio; - OAMAttributes *_oamList; - u32 _sprMem; u8 _sprBoundary; u8 _sprBMPBoundary; u8 _sprBMPMode; @@ -833,6 +838,9 @@ protected: void _MosaicSpriteLinePixel(const size_t x, u16 l, u16 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab); void _MosaicSpriteLine(u16 l, u16 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab); + template void 
_rot_scale_op(const BGxPARMS ¶m, const u16 LG, const s32 wh, const s32 ht, const u32 map, const u32 tile, const u16 *pal); + template void _apply_rot_fun(const BGxPARMS ¶m, const u16 LG, const u32 map, const u32 tile, const u16 *pal); + template void _LineLarge8bpp(); template void _RenderLine_TextBG(u16 XBG, u16 YBG, u16 LG); @@ -843,7 +851,6 @@ protected: template void _LineRot(); template void _LineExtRot(); - // check whether (x,y) is within the rectangle (including wraparounds) template u8 _WithinRect(const size_t x) const; template void _RenderLine_CheckWindows(const size_t srcX, bool &draw, bool &effect) const; @@ -890,7 +897,7 @@ public: void SetupFinalPixelBlitter(); void SetVideoProp(const u32 ctrlBits); - void SetBGProp(const size_t num, const u16 ctrlBits); + template void SetBGProp(const u16 ctrlBits); template void RenderLine(const u16 l, bool skip); @@ -945,10 +952,10 @@ public: void UpdateVRAM3DUsageProperties_BGLayer(const size_t bankIndex, VRAM3DUsageProperties &outProperty); void UpdateVRAM3DUsageProperties_OBJLayer(const size_t bankIndex, VRAM3DUsageProperties &outProperty); - void setAffineStart(const size_t layer, int xy, u32 val); - void setAffineStartWord(const size_t layer, int xy, u16 val, int word); - u32 getAffineStart(const size_t layer, int xy); - void refreshAffineStartRegs(const int num, const int xy); + template void setAffineStart(u32 val); + template void setAffineStartWord(u16 val); + template u32 getAffineStart(); + template void refreshAffineStartRegs(); void SpriteRender(u16 *dst, u8 *dst_alpha, u8 *typeTab, u8 *prioTab); void ModeRenderDebug(const GPULayerID layerID); diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp index 9ddbfcd10..9fdcc7c36 100644 --- a/desmume/src/MMU.cpp +++ b/desmume/src/MMU.cpp @@ -305,7 +305,7 @@ struct TVramBankInfo { u8 page_addr, num_pages; }; -static const TVramBankInfo vram_bank_info[VRAM_BANKS] = { +static const TVramBankInfo vram_bank_info[VRAM_BANK_COUNT] = { {0,8}, {8,8}, {16,8}, @@ 
-483,7 +483,7 @@ std::string VramConfiguration::describePurpose(Purpose p) { std::string VramConfiguration::describe() { std::stringstream ret; - for(int i=0;i +static inline void MMU_VRAMmapRefreshBank() { - int block = bank; - if(bank >= VRAM_BANK_H) block++; - - u8 VRAMBankCnt = T1ReadByte(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x240 + block); + const size_t block = (VRAMBANK >= VRAM_BANK_H) ? VRAMBANK + 1 : VRAMBANK; + + VRAMCNT VRAMBankCnt; + VRAMBankCnt.value = T1ReadByte(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x240 + block); //do nothing if the bank isnt enabled - u8 en = VRAMBankCnt & 0x80; - if(!en) return; + if(VRAMBankCnt.Enable == 0) return; - int mst,ofs=0; - switch(bank) { + switch(VRAMBANK) { case VRAM_BANK_A: case VRAM_BANK_B: - mst = VRAMBankCnt & 3; - ofs = (VRAMBankCnt>>3) & 3; - switch(mst) + assert(VRAMBankCnt.MST == VRAMBankCnt.MST_ABHI); + switch(VRAMBankCnt.MST_ABHI) { case 0: //LCDC - vramConfiguration.banks[bank].purpose = VramConfiguration::LCDC; - MMU_vram_lcdc(bank); - if(ofs != 0) PROGINFO("Bank %i: MST %i OFS %i\n", mst, ofs); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::LCDC; + MMU_vram_lcdc(VRAMBANK); + if(VRAMBankCnt.OFS != 0) PROGINFO("Bank %i: MST %i OFS %i\n", VRAMBankCnt.MST_ABHI, VRAMBankCnt.OFS); break; case 1: //ABG - vramConfiguration.banks[bank].purpose = VramConfiguration::ABG; - MMU_vram_arm9(bank,VRAM_PAGE_ABG+ofs*8); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::ABG; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_ABG+VRAMBankCnt.OFS*8); break; case 2: //AOBJ - vramConfiguration.banks[bank].purpose = VramConfiguration::AOBJ; - switch(ofs) { + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::AOBJ; + switch(VRAMBankCnt.OFS) { case 0: case 1: - MMU_vram_arm9(bank,VRAM_PAGE_AOBJ+ofs*8); + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_AOBJ+VRAMBankCnt.OFS*8); break; default: - PROGINFO("Unsupported ofs setting %d for engine A OBJ vram bank %c\n", ofs, 'A'+bank); + PROGINFO("Unsupported ofs setting %d for engine 
A OBJ vram bank %c\n", VRAMBankCnt.OFS, 'A'+VRAMBANK); } break; case 3: //texture - vramConfiguration.banks[bank].purpose = VramConfiguration::TEX; - MMU.texInfo.textureSlotAddr[ofs] = MMU_vram_physical(vram_bank_info[bank].page_addr); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::TEX; + MMU.texInfo.textureSlotAddr[VRAMBankCnt.OFS] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr); break; default: goto unsupported_mst; } @@ -564,78 +561,75 @@ static inline void MMU_VRAMmapRefreshBank(const int bank) case VRAM_BANK_C: case VRAM_BANK_D: - mst = VRAMBankCnt & 7; - ofs = (VRAMBankCnt>>3) & 3; - switch(mst) + switch(VRAMBankCnt.MST) { case 0: //LCDC - vramConfiguration.banks[bank].purpose = VramConfiguration::LCDC; - MMU_vram_lcdc(bank); - if(ofs != 0) PROGINFO("Bank %i: MST %i OFS %i\n", mst, ofs); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::LCDC; + MMU_vram_lcdc(VRAMBANK); + if(VRAMBankCnt.OFS != 0) PROGINFO("Bank %i: MST %i OFS %i\n", VRAMBankCnt.MST, VRAMBankCnt.OFS); break; case 1: //ABG - vramConfiguration.banks[bank].purpose = VramConfiguration::ABG; - MMU_vram_arm9(bank,VRAM_PAGE_ABG+ofs*8); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::ABG; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_ABG+VRAMBankCnt.OFS*8); break; case 2: //arm7 - vramConfiguration.banks[bank].purpose = VramConfiguration::ARM7; - if(bank == 2) T1WriteByte(MMU.MMU_MEM[ARMCPU_ARM7][0x40], 0x240, T1ReadByte(MMU.MMU_MEM[ARMCPU_ARM7][0x40], 0x240) | 1); - if(bank == 3) T1WriteByte(MMU.MMU_MEM[ARMCPU_ARM7][0x40], 0x240, T1ReadByte(MMU.MMU_MEM[ARMCPU_ARM7][0x40], 0x240) | 2); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::ARM7; + if(VRAMBANK == 2) T1WriteByte(MMU.MMU_MEM[ARMCPU_ARM7][0x40], 0x240, T1ReadByte(MMU.MMU_MEM[ARMCPU_ARM7][0x40], 0x240) | 1); + if(VRAMBANK == 3) T1WriteByte(MMU.MMU_MEM[ARMCPU_ARM7][0x40], 0x240, T1ReadByte(MMU.MMU_MEM[ARMCPU_ARM7][0x40], 0x240) | 2); //printf("DING!\n"); - switch(ofs) { + 
switch(VRAMBankCnt.OFS) { case 0: case 1: - vram_arm7_map[ofs] = vram_bank_info[bank].page_addr; + vram_arm7_map[VRAMBankCnt.OFS] = vram_bank_info[VRAMBANK].page_addr; break; default: - PROGINFO("Unsupported ofs setting %d for arm7 vram bank %c\n", ofs, 'A'+bank); + PROGINFO("Unsupported ofs setting %d for arm7 vram bank %c\n", VRAMBankCnt.OFS, 'A'+VRAMBANK); } break; case 3: //texture - vramConfiguration.banks[bank].purpose = VramConfiguration::TEX; - MMU.texInfo.textureSlotAddr[ofs] = MMU_vram_physical(vram_bank_info[bank].page_addr); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::TEX; + MMU.texInfo.textureSlotAddr[VRAMBankCnt.OFS] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr); break; case 4: //BGB or BOBJ - if(bank == VRAM_BANK_C) { - vramConfiguration.banks[bank].purpose = VramConfiguration::BBG; - MMU_vram_arm9(bank,VRAM_PAGE_BBG); //BBG + if(VRAMBANK == VRAM_BANK_C) { + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::BBG; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_BBG); //BBG } else { - vramConfiguration.banks[bank].purpose = VramConfiguration::BOBJ; - MMU_vram_arm9(bank,VRAM_PAGE_BOBJ); //BOBJ + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::BOBJ; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_BOBJ); //BOBJ } - if(ofs != 0) PROGINFO("Bank %i: MST %i OFS %i\n", mst, ofs); + if(VRAMBankCnt.OFS != 0) PROGINFO("Bank %i: MST %i OFS %i\n", VRAMBankCnt.MST, VRAMBankCnt.OFS); break; default: goto unsupported_mst; } break; case VRAM_BANK_E: - mst = VRAMBankCnt & 7; - if(((VRAMBankCnt>>3)&3) != 0) PROGINFO("Bank %i: MST %i OFS %i\n", mst, ofs); - switch(mst) { + if(VRAMBankCnt.OFS != 0) PROGINFO("Bank %i: MST %i OFS %i\n", VRAMBankCnt.MST, VRAMBankCnt.OFS); + switch(VRAMBankCnt.MST) { case 0: //LCDC - vramConfiguration.banks[bank].purpose = VramConfiguration::LCDC; - MMU_vram_lcdc(bank); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::LCDC; + MMU_vram_lcdc(VRAMBANK); break; case 1: //ABG - 
vramConfiguration.banks[bank].purpose = VramConfiguration::ABG; - MMU_vram_arm9(bank,VRAM_PAGE_ABG); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::ABG; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_ABG); break; case 2: //AOBJ - vramConfiguration.banks[bank].purpose = VramConfiguration::AOBJ; - MMU_vram_arm9(bank,VRAM_PAGE_AOBJ); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::AOBJ; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_AOBJ); break; case 3: //texture palette - vramConfiguration.banks[bank].purpose = VramConfiguration::TEXPAL; - MMU.texInfo.texPalSlot[0] = MMU_vram_physical(vram_bank_info[bank].page_addr); - MMU.texInfo.texPalSlot[1] = MMU_vram_physical(vram_bank_info[bank].page_addr+1); - MMU.texInfo.texPalSlot[2] = MMU_vram_physical(vram_bank_info[bank].page_addr+2); - MMU.texInfo.texPalSlot[3] = MMU_vram_physical(vram_bank_info[bank].page_addr+3); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::TEXPAL; + MMU.texInfo.texPalSlot[0] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr); + MMU.texInfo.texPalSlot[1] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr+1); + MMU.texInfo.texPalSlot[2] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr+2); + MMU.texInfo.texPalSlot[3] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr+3); break; case 4: //A BG extended palette - vramConfiguration.banks[bank].purpose = VramConfiguration::ABGEXTPAL; - MMU.ExtPal[0][0] = MMU_vram_physical(vram_bank_info[bank].page_addr); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::ABGEXTPAL; + MMU.ExtPal[0][0] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr); MMU.ExtPal[0][1] = MMU.ExtPal[0][0] + ADDRESS_STEP_8KB; MMU.ExtPal[0][2] = MMU.ExtPal[0][1] + ADDRESS_STEP_8KB; MMU.ExtPal[0][3] = MMU.ExtPal[0][2] + ADDRESS_STEP_8KB; @@ -646,50 +640,48 @@ static inline void MMU_VRAMmapRefreshBank(const int bank) case VRAM_BANK_F: case VRAM_BANK_G: { - mst = VRAMBankCnt & 7; - ofs = (VRAMBankCnt>>3) & 3; const int 
pageofslut[] = {0,1,4,5}; - const int pageofs = pageofslut[ofs]; - switch(mst) + const int pageofs = pageofslut[VRAMBankCnt.OFS]; + switch(VRAMBankCnt.MST) { case 0: //LCDC - vramConfiguration.banks[bank].purpose = VramConfiguration::LCDC; - MMU_vram_lcdc(bank); - if(ofs != 0) PROGINFO("Bank %i: MST %i OFS %i\n", mst, ofs); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::LCDC; + MMU_vram_lcdc(VRAMBANK); + if(VRAMBankCnt.OFS != 0) PROGINFO("Bank %i: MST %i OFS %i\n", VRAMBankCnt.MST, VRAMBankCnt.OFS); break; case 1: //ABG - vramConfiguration.banks[bank].purpose = VramConfiguration::ABG; - MMU_vram_arm9(bank,VRAM_PAGE_ABG+pageofs); - MMU_vram_arm9(bank,VRAM_PAGE_ABG+pageofs+2); //unexpected mirroring (required by spyro eternal night) + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::ABG; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_ABG+pageofs); + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_ABG+pageofs+2); //unexpected mirroring (required by spyro eternal night) break; case 2: //AOBJ - vramConfiguration.banks[bank].purpose = VramConfiguration::AOBJ; - MMU_vram_arm9(bank,VRAM_PAGE_AOBJ+pageofs); - MMU_vram_arm9(bank,VRAM_PAGE_AOBJ+pageofs+2); //unexpected mirroring - I have no proof, but it is inferred from the ABG above + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::AOBJ; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_AOBJ+pageofs); + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_AOBJ+pageofs+2); //unexpected mirroring - I have no proof, but it is inferred from the ABG above break; case 3: //texture palette - vramConfiguration.banks[bank].purpose = VramConfiguration::TEXPAL; - MMU.texInfo.texPalSlot[pageofs] = MMU_vram_physical(vram_bank_info[bank].page_addr); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::TEXPAL; + MMU.texInfo.texPalSlot[pageofs] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr); break; case 4: //A BG extended palette - switch(ofs) { + switch(VRAMBankCnt.OFS) { case 0: case 1: - vramConfiguration.banks[bank].purpose = 
VramConfiguration::ABGEXTPAL; - MMU.ExtPal[0][ofs*2] = MMU_vram_physical(vram_bank_info[bank].page_addr); - MMU.ExtPal[0][ofs*2+1] = MMU.ExtPal[0][ofs*2] + ADDRESS_STEP_8KB; + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::ABGEXTPAL; + MMU.ExtPal[0][VRAMBankCnt.OFS*2] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr); + MMU.ExtPal[0][VRAMBankCnt.OFS*2+1] = MMU.ExtPal[0][VRAMBankCnt.OFS*2] + ADDRESS_STEP_8KB; break; default: - vramConfiguration.banks[bank].purpose = VramConfiguration::INVALID; - PROGINFO("Unsupported ofs setting %d for engine A bgextpal vram bank %c\n", ofs, 'A'+bank); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::INVALID; + PROGINFO("Unsupported ofs setting %d for engine A bgextpal vram bank %c\n", VRAMBankCnt.OFS, 'A'+VRAMBANK); break; } break; case 5: //A OBJ extended palette - vramConfiguration.banks[bank].purpose = VramConfiguration::AOBJEXTPAL; - MMU.ObjExtPal[0][0] = MMU_vram_physical(vram_bank_info[bank].page_addr); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::AOBJEXTPAL; + MMU.ObjExtPal[0][0] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr); MMU.ObjExtPal[0][1] = MMU.ObjExtPal[0][1] + ADDRESS_STEP_8KB; - if(ofs != 0) PROGINFO("Bank %i: MST %i OFS %i\n", mst, ofs); + if(VRAMBankCnt.OFS != 0) PROGINFO("Bank %i: MST %i OFS %i\n", VRAMBankCnt.MST, VRAMBankCnt.OFS); break; default: goto unsupported_mst; } @@ -697,22 +689,22 @@ static inline void MMU_VRAMmapRefreshBank(const int bank) } case VRAM_BANK_H: - mst = VRAMBankCnt & 3; - if(((VRAMBankCnt>>3)&3) != 0) PROGINFO("Bank %i: MST %i OFS %i\n", mst, ofs); - switch(mst) + assert(VRAMBankCnt.MST == VRAMBankCnt.MST_ABHI); + if(VRAMBankCnt.OFS != 0) PROGINFO("Bank %i: MST %i OFS %i\n", VRAMBankCnt.MST_ABHI, VRAMBankCnt.OFS); + switch(VRAMBankCnt.MST_ABHI) { case 0: //LCDC - vramConfiguration.banks[bank].purpose = VramConfiguration::LCDC; - MMU_vram_lcdc(bank); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::LCDC; + 
MMU_vram_lcdc(VRAMBANK); break; case 1: //BBG - vramConfiguration.banks[bank].purpose = VramConfiguration::BBG; - MMU_vram_arm9(bank,VRAM_PAGE_BBG); - MMU_vram_arm9(bank,VRAM_PAGE_BBG + 4); //unexpected mirroring + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::BBG; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_BBG); + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_BBG + 4); //unexpected mirroring break; case 2: //B BG extended palette - vramConfiguration.banks[bank].purpose = VramConfiguration::BBGEXTPAL; - MMU.ExtPal[1][0] = MMU_vram_physical(vram_bank_info[bank].page_addr); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::BBGEXTPAL; + MMU.ExtPal[1][0] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr); MMU.ExtPal[1][1] = MMU.ExtPal[1][0] + ADDRESS_STEP_8KB; MMU.ExtPal[1][2] = MMU.ExtPal[1][1] + ADDRESS_STEP_8KB; MMU.ExtPal[1][3] = MMU.ExtPal[1][2] + ADDRESS_STEP_8KB; @@ -722,27 +714,27 @@ static inline void MMU_VRAMmapRefreshBank(const int bank) break; case VRAM_BANK_I: - mst = VRAMBankCnt & 3; - if(((VRAMBankCnt>>3)&3) != 0) PROGINFO("Bank %i: MST %i OFS %i\n", mst, ofs); - switch(mst) + assert(VRAMBankCnt.MST == VRAMBankCnt.MST_ABHI); + if(VRAMBankCnt.OFS != 0) PROGINFO("Bank %i: MST %i OFS %i\n", VRAMBankCnt.MST_ABHI, VRAMBankCnt.OFS); + switch(VRAMBankCnt.MST_ABHI) { case 0: //LCDC - vramConfiguration.banks[bank].purpose = VramConfiguration::LCDC; - MMU_vram_lcdc(bank); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::LCDC; + MMU_vram_lcdc(VRAMBANK); break; case 1: //BBG - vramConfiguration.banks[bank].purpose = VramConfiguration::BBG; - MMU_vram_arm9(bank,VRAM_PAGE_BBG+2); - MMU_vram_arm9(bank,VRAM_PAGE_BBG+3); //unexpected mirroring + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::BBG; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_BBG+2); + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_BBG+3); //unexpected mirroring break; case 2: //BOBJ - vramConfiguration.banks[bank].purpose = VramConfiguration::BOBJ; - 
MMU_vram_arm9(bank,VRAM_PAGE_BOBJ); - MMU_vram_arm9(bank,VRAM_PAGE_BOBJ+1); //FF3 end scene (lens flare sprite) needs this as it renders a sprite off the end of the 16KB and back around + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::BOBJ; + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_BOBJ); + MMU_vram_arm9(VRAMBANK,VRAM_PAGE_BOBJ+1); //FF3 end scene (lens flare sprite) needs this as it renders a sprite off the end of the 16KB and back around break; case 3: //B OBJ extended palette - vramConfiguration.banks[bank].purpose = VramConfiguration::BOBJEXTPAL; - MMU.ObjExtPal[1][0] = MMU_vram_physical(vram_bank_info[bank].page_addr); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::BOBJEXTPAL; + MMU.ObjExtPal[1][0] = MMU_vram_physical(vram_bank_info[VRAMBANK].page_addr); MMU.ObjExtPal[1][1] = MMU.ObjExtPal[1][1] + ADDRESS_STEP_8KB; break; default: goto unsupported_mst; @@ -750,15 +742,15 @@ static inline void MMU_VRAMmapRefreshBank(const int bank) break; - } //switch(bank) + } //switch(VRAMBANK) - vramConfiguration.banks[bank].ofs = ofs; + vramConfiguration.banks[VRAMBANK].ofs = VRAMBankCnt.OFS; return; unsupported_mst: - vramConfiguration.banks[bank].purpose = VramConfiguration::INVALID; - PROGINFO("Unsupported mst setting %d for vram bank %c\n", mst, 'A'+bank); + vramConfiguration.banks[VRAMBANK].purpose = VramConfiguration::INVALID; + PROGINFO("Unsupported mst setting %d for vram bank %c\n", VRAMBankCnt.MST, 'A'+VRAMBANK); } void MMU_VRAM_unmap_all() @@ -821,19 +813,19 @@ static inline void MMU_VRAMmapControl(u8 block, u8 VRAMBankCnt) //goblet of fire "care of magical creatures" maps I and D to BOBJ (the I is an accident) //and requires A to override it. //This may create other bugs.... 
- MMU_VRAMmapRefreshBank(VRAM_BANK_I); - MMU_VRAMmapRefreshBank(VRAM_BANK_H); - MMU_VRAMmapRefreshBank(VRAM_BANK_G); - MMU_VRAMmapRefreshBank(VRAM_BANK_F); - MMU_VRAMmapRefreshBank(VRAM_BANK_E); + MMU_VRAMmapRefreshBank(); + MMU_VRAMmapRefreshBank(); + MMU_VRAMmapRefreshBank(); + MMU_VRAMmapRefreshBank(); + MMU_VRAMmapRefreshBank(); //zero 21-jun-2012 //tomwi's streaming music demo sets A and D to ABG (the A is an accident). //in this case, D should get priority. //this is somewhat risky. will it break other things? - MMU_VRAMmapRefreshBank(VRAM_BANK_A); - MMU_VRAMmapRefreshBank(VRAM_BANK_B); - MMU_VRAMmapRefreshBank(VRAM_BANK_C); - MMU_VRAMmapRefreshBank(VRAM_BANK_D); + MMU_VRAMmapRefreshBank(); + MMU_VRAMmapRefreshBank(); + MMU_VRAMmapRefreshBank(); + MMU_VRAMmapRefreshBank(); //printf(vramConfiguration.describe().c_str()); //printf("vram remapped at vcount=%d\n",nds.VCount); @@ -912,6 +904,8 @@ void MMU_Init(void) LOG("MMU init\n"); memset(&MMU, 0, sizeof(MMU_struct)); + + MMU.blank_memory = &MMU.ARM9_LCD[0xA4000]; //MMU.DTCMRegion = 0x027C0000; //even though apps may change dtcm immediately upon startup, this is the correct hardware starting value: @@ -961,7 +955,6 @@ void MMU_Reset() memset(MMU.ARM9_VMEM, 0, sizeof(MMU.ARM9_VMEM)); memset(MMU.MAIN_MEM, 0, sizeof(MMU.MAIN_MEM)); - memset(MMU.blank_memory, 0, sizeof(MMU.blank_memory)); memset(MMU.UNUSED_RAM, 0, sizeof(MMU.UNUSED_RAM)); memset(MMU.MORE_UNUSED_RAM, 0, sizeof(MMU.UNUSED_RAM)); @@ -3591,22 +3584,22 @@ void FASTCALL _MMU_ARM9_write16(u32 adr, u16 val) val &= 0x7F7F; break; - case REG_DISPA_BG2XL: mainEngine->setAffineStartWord(2,0,val,0); break; - case REG_DISPA_BG2XH: mainEngine->setAffineStartWord(2,0,val,1); break; - case REG_DISPA_BG2YL: mainEngine->setAffineStartWord(2,1,val,0); break; - case REG_DISPA_BG2YH: mainEngine->setAffineStartWord(2,1,val,1); break; - case REG_DISPA_BG3XL: mainEngine->setAffineStartWord(3,0,val,0); break; - case REG_DISPA_BG3XH: 
mainEngine->setAffineStartWord(3,0,val,1); break; - case REG_DISPA_BG3YL: mainEngine->setAffineStartWord(3,1,val,0); break; - case REG_DISPA_BG3YH: mainEngine->setAffineStartWord(3,1,val,1); break; - case REG_DISPB_BG2XL: subEngine->setAffineStartWord(2,0,val,0); break; - case REG_DISPB_BG2XH: subEngine->setAffineStartWord(2,0,val,1); break; - case REG_DISPB_BG2YL: subEngine->setAffineStartWord(2,1,val,0); break; - case REG_DISPB_BG2YH: subEngine->setAffineStartWord(2,1,val,1); break; - case REG_DISPB_BG3XL: subEngine->setAffineStartWord(3,0,val,0); break; - case REG_DISPB_BG3XH: subEngine->setAffineStartWord(3,0,val,1); break; - case REG_DISPB_BG3YL: subEngine->setAffineStartWord(3,1,val,0); break; - case REG_DISPB_BG3YH: subEngine->setAffineStartWord(3,1,val,1); break; + case REG_DISPA_BG2XL: mainEngine->setAffineStartWord(val); break; + case REG_DISPA_BG2XH: mainEngine->setAffineStartWord(val); break; + case REG_DISPA_BG2YL: mainEngine->setAffineStartWord(val); break; + case REG_DISPA_BG2YH: mainEngine->setAffineStartWord(val); break; + case REG_DISPA_BG3XL: mainEngine->setAffineStartWord(val); break; + case REG_DISPA_BG3XH: mainEngine->setAffineStartWord(val); break; + case REG_DISPA_BG3YL: mainEngine->setAffineStartWord(val); break; + case REG_DISPA_BG3YH: mainEngine->setAffineStartWord(val); break; + case REG_DISPB_BG2XL: subEngine->setAffineStartWord(val); break; + case REG_DISPB_BG2XH: subEngine->setAffineStartWord(val); break; + case REG_DISPB_BG2YL: subEngine->setAffineStartWord(val); break; + case REG_DISPB_BG2YH: subEngine->setAffineStartWord(val); break; + case REG_DISPB_BG3XL: subEngine->setAffineStartWord(val); break; + case REG_DISPB_BG3XH: subEngine->setAffineStartWord(val); break; + case REG_DISPB_BG3YL: subEngine->setAffineStartWord(val); break; + case REG_DISPB_BG3YH: subEngine->setAffineStartWord(val); break; case REG_DISPA_DISP3DCNT: writereg_DISP3DCNT(16,adr,val); return; @@ -3815,42 +3808,42 @@ void FASTCALL _MMU_ARM9_write16(u32 adr, u16 
val) case REG_DISPA_BG0CNT : //GPULOG("MAIN BG0 SETPROP 16B %08X\r\n", val); - mainEngine->SetBGProp(0, val); + mainEngine->SetBGProp(val); T1WriteWord(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x8, val); return; case REG_DISPA_BG1CNT : //GPULOG("MAIN BG1 SETPROP 16B %08X\r\n", val); - mainEngine->SetBGProp(1, val); + mainEngine->SetBGProp(val); T1WriteWord(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0xA, val); return; case REG_DISPA_BG2CNT : //GPULOG("MAIN BG2 SETPROP 16B %08X\r\n", val); - mainEngine->SetBGProp(2, val); + mainEngine->SetBGProp(val); T1WriteWord(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0xC, val); return; case REG_DISPA_BG3CNT : //GPULOG("MAIN BG3 SETPROP 16B %08X\r\n", val); - mainEngine->SetBGProp(3, val); + mainEngine->SetBGProp(val); T1WriteWord(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0xE, val); return; case REG_DISPB_BG0CNT : //GPULOG("SUB BG0 SETPROP 16B %08X\r\n", val); - subEngine->SetBGProp(0, val); + subEngine->SetBGProp(val); T1WriteWord(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x1008, val); return; case REG_DISPB_BG1CNT : //GPULOG("SUB BG1 SETPROP 16B %08X\r\n", val); - subEngine->SetBGProp(1, val); + subEngine->SetBGProp(val); T1WriteWord(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x100A, val); return; case REG_DISPB_BG2CNT : //GPULOG("SUB BG2 SETPROP 16B %08X\r\n", val); - subEngine->SetBGProp(2, val); + subEngine->SetBGProp(val); T1WriteWord(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x100C, val); return; case REG_DISPB_BG3CNT : //GPULOG("SUB BG3 SETPROP 16B %08X\r\n", val); - subEngine->SetBGProp(3, val); + subEngine->SetBGProp(val); T1WriteWord(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x100E, val); return; @@ -4133,28 +4126,28 @@ void FASTCALL _MMU_ARM9_write32(u32 adr, u32 val) MMU_new.gxstat.write32(val); break; case REG_DISPA_BG2XL: - mainEngine->setAffineStart(2,0,val); + mainEngine->setAffineStart(val); return; case REG_DISPA_BG2YL: - mainEngine->setAffineStart(2,1,val); + mainEngine->setAffineStart(val); return; case REG_DISPB_BG2XL: - subEngine->setAffineStart(2,0,val); + 
subEngine->setAffineStart(val); return; case REG_DISPB_BG2YL: - subEngine->setAffineStart(2,1,val); + subEngine->setAffineStart(val); return; case REG_DISPA_BG3XL: - mainEngine->setAffineStart(3,0,val); + mainEngine->setAffineStart(val); return; case REG_DISPA_BG3YL: - mainEngine->setAffineStart(3,1,val); + mainEngine->setAffineStart(val); return; case REG_DISPB_BG3XL: - subEngine->setAffineStart(3,0,val); + subEngine->setAffineStart(val); return; case REG_DISPB_BG3YL: - subEngine->setAffineStart(3,1,val); + subEngine->setAffineStart(val); return; // Alpha test reference value - Parameters:1 @@ -4363,24 +4356,24 @@ void FASTCALL _MMU_ARM9_write32(u32 adr, u32 val) return; case REG_DISPA_BG0CNT : - mainEngine->SetBGProp(0, (val & 0xFFFF)); - mainEngine->SetBGProp(1, (val >> 16)); + mainEngine->SetBGProp(val & 0xFFFF); + mainEngine->SetBGProp(val >> 16); //if((val>>16)==0x400) emu_halt(); T1WriteLong(MMU.ARM9_REG, 8, val); return; case REG_DISPA_BG2CNT : - mainEngine->SetBGProp(2, (val & 0xFFFF)); - mainEngine->SetBGProp(3, (val >> 16)); + mainEngine->SetBGProp(val & 0xFFFF); + mainEngine->SetBGProp(val >> 16); T1WriteLong(MMU.ARM9_REG, 0xC, val); return; case REG_DISPB_BG0CNT : - subEngine->SetBGProp(0, (val & 0xFFFF)); - subEngine->SetBGProp(1, (val >> 16)); + subEngine->SetBGProp(val & 0xFFFF); + subEngine->SetBGProp(val >> 16); T1WriteLong(MMU.ARM9_REG, 0x1008, val); return; case REG_DISPB_BG2CNT : - subEngine->SetBGProp(2, (val & 0xFFFF)); - subEngine->SetBGProp(3, (val >> 16)); + subEngine->SetBGProp(val & 0xFFFF); + subEngine->SetBGProp(val >> 16); T1WriteLong(MMU.ARM9_REG, 0x100C, val); return; case REG_DISPA_DISPMMEMFIFO: diff --git a/desmume/src/MMU.h b/desmume/src/MMU.h index 7df997253..5a381f118 100644 --- a/desmume/src/MMU.h +++ b/desmume/src/MMU.h @@ -312,6 +312,28 @@ struct GCBUS_Controller eCardMode mode; //probably only one of these }; +typedef union +{ + u8 value; + + struct + { + unsigned MST:3; + unsigned OFS:2; + unsigned :2; + unsigned Enable:1; 
+ }; + + struct + { + unsigned MST_ABHI:2; + unsigned :1; + unsigned OFS_ABHI:2; + unsigned :2; + unsigned Enable_ABHI:1; + }; +} VRAMCNT; + #define DUP2(x) x, x #define DUP4(x) x, x, x, x #define DUP8(x) x, x, x, x, x, x, x, x @@ -328,18 +350,14 @@ struct MMU_struct u8 MAIN_MEM[16*1024*1024]; //expanded from 8MB to 16MB to support dsi u8 ARM9_REG[0x1000000]; //this variable is evil and should be removed by correctly emulating all registers. u8 ARM9_BIOS[0x8000]; - u8 ARM9_VMEM[0x800]; + CACHE_ALIGN u8 ARM9_VMEM[0x800]; + + //an extra 128KB for blank memory, directly after arm9_lcd, so that + //we can easily map things to the end of arm9_lcd to represent + //an unmapped state + CACHE_ALIGN u8 ARM9_LCD[0xA4000 + 0x20000]; + u8 *blank_memory; - #include "PACKED.h" - struct { - u8 ARM9_LCD[0xA4000]; - //an extra 128KB for blank memory, directly after arm9_lcd, so that - //we can easily map things to the end of arm9_lcd to represent - //an unmapped state - u8 blank_memory[0x20000]; - }; - #include "PACKED_END.h" - u8 ARM9_OAM[0x800]; u8* ExtPal[2][4]; @@ -519,16 +537,20 @@ extern const armcpu_memory_iface arm9_base_memory_iface; extern const armcpu_memory_iface arm7_base_memory_iface; extern const armcpu_memory_iface arm9_direct_memory_iface; -#define VRAM_BANKS 9 -#define VRAM_BANK_A 0 -#define VRAM_BANK_B 1 -#define VRAM_BANK_C 2 -#define VRAM_BANK_D 3 -#define VRAM_BANK_E 4 -#define VRAM_BANK_F 5 -#define VRAM_BANK_G 6 -#define VRAM_BANK_H 7 -#define VRAM_BANK_I 8 +enum VRAMBankID +{ + VRAM_BANK_A = 0, + VRAM_BANK_B = 1, + VRAM_BANK_C = 2, + VRAM_BANK_D = 3, + VRAM_BANK_E = 4, + VRAM_BANK_F = 5, + VRAM_BANK_G = 6, + VRAM_BANK_H = 7, + VRAM_BANK_I = 8, + + VRAM_BANK_COUNT = 9 +}; #define VRAM_PAGE_ABG 0 #define VRAM_PAGE_BBG 128 @@ -545,10 +567,10 @@ struct VramConfiguration { struct BankInfo { Purpose purpose; int ofs; - } banks[VRAM_BANKS]; + } banks[VRAM_BANK_COUNT]; inline void clear() { - for(int i=0;iclearImageColor16Buffer + i + 8), _mm_loadu_si128((__m128i 
*)(clearColorBuffer + i + 8)) ); - _mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i), _mm_loadu_si128((__m128i *)(clearColorBuffer + i)) ); + _mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i + 8), _mm_load_si128((__m128i *)(clearColorBuffer + i + 8)) ); + _mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i), _mm_load_si128((__m128i *)(clearColorBuffer + i)) ); // Write the depth values to the depth buffer. - __m128i clearDepthHi_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i + 8)); - __m128i clearDepthLo_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i)); + __m128i clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); + __m128i clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, depthBitMask_vec128); clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, depthBitMask_vec128); @@ -602,8 +602,8 @@ Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState) this->clearImageDepthBuffer[i+ 0] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 0)]; // Write the fog flags to the fog flag buffer. - clearDepthHi_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i + 8)); - clearDepthLo_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i)); + clearDepthHi_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i + 8)); + clearDepthLo_vec128 = _mm_load_si128((__m128i *)(clearDepthBuffer + i)); clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, fogBufferBitMask_vec128); clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, fogBufferBitMask_vec128); clearDepthHi_vec128 = _mm_srli_epi16(clearDepthHi_vec128, 15);