diff --git a/desmume/ChangeLog b/desmume/ChangeLog index 8b0e4e081..39a15d31f 100644 --- a/desmume/ChangeLog +++ b/desmume/ChangeLog @@ -21,6 +21,8 @@ Graphics: bug: fix 256B granularity sprite addressing for sub gpu bug: fix 128-wide captures bug: fix color overflow in capture blending + bug: fix disp fifo capture + bug: fix simultaneous vram display and capture via same bank bug: swrast: add clear image and scroll emulation bug: swrast: fixes to shadow rendering diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 977bc4258..891d2fee5 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -53,7 +53,8 @@ GPU::MosaicLookup GPU::mosaicLookup; //#define DEBUG_TRI CACHE_ALIGN u8 GPU_screen[4*256*192]; -CACHE_ALIGN u8 *GPU_tempScanline; +u8 *GPU_tempScanline; +CACHE_ALIGN u16 GPU_tempScanlineBuffer[256]; CACHE_ALIGN u8 sprWin[256]; @@ -2536,16 +2537,15 @@ template static void GPU_ligne_DispCapture(u16 l) //INFO("Capture source is SourceB\n"); switch (gpu->dispCapCnt.srcB) { - case 0: // Capture VRAM - { - //INFO("Capture VRAM\n"); - CAPCOPY(cap_src,cap_dst); - } + case 0: + //Capture VRAM + CAPCOPY(cap_src,cap_dst); break; - case 1: // Capture Main Memory Display FIFO - { - //INFO("Capture Main Memory Display FIFO\n"); - } + case 1: + //capture dispfifo + //(not yet tested) + for(int i=0; i < 128; i++) + T1WriteLong(cap_dst, i << 2, DISP_FIFOrecv()); break; } } @@ -2566,50 +2566,56 @@ template static void GPU_ligne_DispCapture(u16 l) gfx3d_GetLineData(l, &srcA, NULL); } + static u16 fifoLine[256]; + if (gpu->dispCapCnt.srcB == 0) // VRAM screen srcB = (u16 *)cap_src; else - srcB = NULL; // DISP FIFOS - - if ((srcA) && (srcB)) { - const int todo = (gpu->dispCapCnt.capx==DISPCAPCNT::_128?128:256); + //fifo - tested by splinter cell chaos theory thermal view + srcB = fifoLine; + for (int i=0; i < 128; i++) + T1WriteLong((u8*)srcB, i << 2, DISP_FIFOrecv()); + } - for(u16 i = 0; i < todo; i++) + + const int todo = 
(gpu->dispCapCnt.capx==DISPCAPCNT::_128?128:256); + + for(u16 i = 0; i < todo; i++) + { + u16 a,r,g,b; + + u16 a_alpha = srcA[i] & 0x8000; + u16 b_alpha = srcB[i] & 0x8000; + + if(a_alpha) { - u16 a,r,g,b; + a = 0x8000; + r = ((srcA[i] & 0x1F) * gpu->dispCapCnt.EVA); + g = (((srcA[i] >> 5) & 0x1F) * gpu->dispCapCnt.EVA); + b = (((srcA[i] >> 10) & 0x1F) * gpu->dispCapCnt.EVA); + } + else + a = r = g = b = 0; - u16 a_alpha = srcA[i] & 0x8000; - u16 b_alpha = srcB[i] & 0x8000; - - if(a_alpha) - { - a = 0x8000; - r = ((srcA[i] & 0x1F) * gpu->dispCapCnt.EVA); - g = (((srcA[i] >> 5) & 0x1F) * gpu->dispCapCnt.EVA); - b = (((srcA[i] >> 10) & 0x1F) * gpu->dispCapCnt.EVA); - } - else - a = r = g = b = 0; - - if(b_alpha) - { - a = 0x8000; - r += ((srcB[i] & 0x1F) * gpu->dispCapCnt.EVB); - g += (((srcB[i] >> 5) & 0x1F) * gpu->dispCapCnt.EVB); - b += (((srcB[i] >> 10) & 0x1F) * gpu->dispCapCnt.EVB); - } - - r >>= 4; - g >>= 4; - b >>= 4; - - r = std::min((u16)31,r); - g = std::min((u16)31,g); - b = std::min((u16)31,b); - - T2WriteWord(cap_dst, i << 1, a | (b << 10) | (g << 5) | r); + if(b_alpha) + { + a = 0x8000; + r += ((srcB[i] & 0x1F) * gpu->dispCapCnt.EVB); + g += (((srcB[i] >> 5) & 0x1F) * gpu->dispCapCnt.EVB); + b += (((srcB[i] >> 10) & 0x1F) * gpu->dispCapCnt.EVB); } + + r >>= 4; + g >>= 4; + b >>= 4; + + //freedom wings sky will overflow while doing some fsaa/motionblur effect without this + r = std::min((u16)31,r); + g = std::min((u16)31,g); + b = std::min((u16)31,b); + + T2WriteWord(cap_dst, i << 1, a | (b << 10) | (g << 5) | r); } } break; @@ -2821,17 +2827,17 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip) gpu->setup_windows<0>(); gpu->setup_windows<1>(); - //always generate the 2d+3d, no matter what we're displaying, since we may need to capture it - //(if this seems inefficient in some cases, consider that the speed in those cases is not really a problem) - GPU_tempScanline = screen->gpu->currDst = (u8 *)(GPU_screen) + (screen->offset + l) * 512; - 
GPU_ligne_layer(screen, l); - - if (gpu->core == GPU_MAIN) - { - GPU_ligne_DispCapture(l); - if (l == 191) { disp_fifo.head = disp_fifo.tail = 0; } + //generate the 2d engine output + if(gpu->dispMode == 1) { + //optimization: render straight to the output buffer when thats what we are going to end up displaying anyway + GPU_tempScanline = screen->gpu->currDst = (u8 *)(GPU_screen) + (screen->offset + l) * 512; + } else { + //otherwise, we need to go to a temp buffer + GPU_tempScanline = screen->gpu->currDst = (u8 *)GPU_tempScanlineBuffer; } + GPU_ligne_layer(screen, l); + switch (gpu->dispMode) { case 0: // Display Off(Display white) @@ -2847,7 +2853,7 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip) //do nothing: it has already been generated into the right place break; - case 2: // Display framebuffer + case 2: // Display vram framebuffer { u8 * dst = GPU_screen + (screen->offset + l) * 512; u8 * src = gpu->VRAMaddr + (l*512); @@ -2856,6 +2862,8 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip) break; case 3: // Display memory FIFO { + //this has not been tested since the dma timing for dispfifo was changed around the time of + //newemuloop. it may not work. u8 * dst = GPU_screen + (screen->offset + l) * 512; for (int i=0; i < 128; i++) T1WriteLong(dst, i << 2, DISP_FIFOrecv() & 0x7FFF7FFF); @@ -2863,6 +2871,17 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip) break; } + //capture after displaying so that we can safely display vram before overwriting it here + if (gpu->core == GPU_MAIN) + { + //BUG!!! if someone is capturing and displaying both from the fifo, then it will have been + //consumed above by the display before we get here + //(is that even legal? 
i think so) + GPU_ligne_DispCapture(l); + if (l == 191) { disp_fifo.head = disp_fifo.tail = 0; } + } + + GPU_ligne_MasterBrightness(screen, l); } diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp index 4ad7c0952..d164b71b6 100644 --- a/desmume/src/MMU.cpp +++ b/desmume/src/MMU.cpp @@ -1090,15 +1090,12 @@ void FASTCALL MMU_doDMA(u32 num) taille = (MMU.DMACrt[PROCNUM][num]&0x1FFFFF); if(taille == 0) taille = 0x200000; //according to gbatek.. - //THIS IS A BIG HACK - // If we are in "Main memory display" mode just copy an entire - // screen (256x192 pixels). - // Reference: http://nocash.emubase.de/gbatek.htm#dsvideocaptureandmainmemorydisplaymode - // (under DISP_MMEM_FIFO) - if ((MMU.DMAStartTime[PROCNUM][num]==EDMAMode_MemDisplay) && // Must be in main memory display mode - (taille==4) && // Word must be 4 - (((MMU.DMACrt[PROCNUM][num]>>26)&1) == 1)) // Transfer mode must be 32bit wide - taille = 24576; //256*192/2; + //for main memory display fifo dmas, check for normal conditions and then dma all 128 words at once + //(theyll get sent to the fifo, which can handle more than it ought to be able to) + if ((MMU.DMAStartTime[PROCNUM][num]==EDMAMode_MemDisplay) && + (taille==4) && + (((MMU.DMACrt[PROCNUM][num]>>26)&1) == 1)) + taille = 128; if(MMU.DMAStartTime[PROCNUM][num] == EDMAMode_Card) taille *= 0x80; diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp index e2ac30a23..99bf8b711 100644 --- a/desmume/src/NDSSystem.cpp +++ b/desmume/src/NDSSystem.cpp @@ -1973,7 +1973,10 @@ static void execHardware_hstart() if(nds.VCount<192) { //this is hacky. - //there is a corresponding hack in doDMA + //there is a corresponding hack in doDMA. + //it should be driven by a fifo (and generate just in time as the scanline is displayed) + //but that isnt even possible until we have some sort of sub-scanline timing. + //it may not be necessary. 
execHardware_doAllDma(EDMAMode_MemDisplay); } diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 111220403..c324ed889 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -345,7 +345,6 @@ struct Shader { mode = (polyattr>>4)&0x3; //if there is no texture set, then set to the mode which doesnt even use a texture - //unless we're in shadow if(sampler.texFormat == 0 && mode != 3) mode = 4; }