diff --git a/desmume/ChangeLog b/desmume/ChangeLog
index 8b0e4e081..39a15d31f 100644
--- a/desmume/ChangeLog
+++ b/desmume/ChangeLog
@@ -21,6 +21,8 @@ Graphics:
  bug: fix 256B granularity sprite addressing for sub gpu
  bug: fix 128-wide captures
  bug: fix color overflow in capture blending
+ bug: fix disp fifo capture
+ bug: fix simultaneous vram display and capture via same bank
  bug: swrast: add clear image and scroll emulation
  bug: swrast: fixes to shadow rendering
   
diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp
index 977bc4258..891d2fee5 100644
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
@@ -53,7 +53,8 @@ GPU::MosaicLookup GPU::mosaicLookup;
 //#define DEBUG_TRI
 
 CACHE_ALIGN u8 GPU_screen[4*256*192];
-CACHE_ALIGN u8 *GPU_tempScanline;
+u8 *GPU_tempScanline;
+CACHE_ALIGN u16 GPU_tempScanlineBuffer[256];
 
 CACHE_ALIGN u8 sprWin[256];
 
@@ -2536,16 +2537,15 @@ template<bool SKIP> static void GPU_ligne_DispCapture(u16 l)
 						//INFO("Capture source is SourceB\n");
 						switch (gpu->dispCapCnt.srcB)
 						{
-							case 0:			// Capture VRAM
-								{
-									//INFO("Capture VRAM\n");
-									CAPCOPY(cap_src,cap_dst);
-								}
+							case 0:	
+								//Capture VRAM
+								CAPCOPY(cap_src,cap_dst);
 								break;
-							case 1:			// Capture Main Memory Display FIFO
-								{
-									//INFO("Capture Main Memory Display FIFO\n");
-								}
+							case 1:
+								//capture dispfifo
+								//(not yet tested)
+								for(int i=0; i < 128; i++)
+									T1WriteLong(cap_dst, i << 2, DISP_FIFOrecv());
 								break;
 						}
 					}
@@ -2566,50 +2566,56 @@ template<bool SKIP> static void GPU_ligne_DispCapture(u16 l)
 							gfx3d_GetLineData(l, &srcA, NULL);
 						}
 
+						static u16 fifoLine[256];
+
 						if (gpu->dispCapCnt.srcB == 0)			// VRAM screen
 							srcB = (u16 *)cap_src;
 						else
-							srcB = NULL; // DISP FIFOS
-
-						if ((srcA) && (srcB))
 						{
-							const int todo = (gpu->dispCapCnt.capx==DISPCAPCNT::_128?128:256);
+							//fifo - tested by splinter cell chaos theory thermal view
+							srcB = fifoLine;
+							for (int i=0; i < 128; i++)
+								T1WriteLong((u8*)srcB, i << 2, DISP_FIFOrecv());
+						}
 
-							for(u16 i = 0; i < todo; i++) 
+
+						const int todo = (gpu->dispCapCnt.capx==DISPCAPCNT::_128?128:256);
+
+						for(u16 i = 0; i < todo; i++) 
+						{
+							u16 a,r,g,b;
+
+							u16 a_alpha = srcA[i] & 0x8000;
+							u16 b_alpha = srcB[i] & 0x8000;
+
+							if(a_alpha)
 							{
-								u16 a,r,g,b;
+								a = 0x8000;
+								r = ((srcA[i] & 0x1F) * gpu->dispCapCnt.EVA);
+								g = (((srcA[i] >>  5) & 0x1F) * gpu->dispCapCnt.EVA);
+								b = (((srcA[i] >>  10) & 0x1F) * gpu->dispCapCnt.EVA);
+							} 
+							else
+								a = r = g = b = 0;
 
-								u16 a_alpha = srcA[i] & 0x8000;
-								u16 b_alpha = srcB[i] & 0x8000;
-
-								if(a_alpha)
-								{
-									a = 0x8000;
-									r = ((srcA[i] & 0x1F) * gpu->dispCapCnt.EVA);
-									g = (((srcA[i] >>  5) & 0x1F) * gpu->dispCapCnt.EVA);
-									b = (((srcA[i] >>  10) & 0x1F) * gpu->dispCapCnt.EVA);
-								} 
-								else
-									a = r = g = b = 0;
-
-								if(b_alpha)
-								{
-									a = 0x8000;
-									r += ((srcB[i] & 0x1F) * gpu->dispCapCnt.EVB);
-									g += (((srcB[i] >>  5) & 0x1F) * gpu->dispCapCnt.EVB);
-									b += (((srcB[i] >> 10) & 0x1F) * gpu->dispCapCnt.EVB);
-								}
-
-								r >>= 4;
-								g >>= 4;
-								b >>= 4;
-
-								r = std::min((u16)31,r);
-								g = std::min((u16)31,g);
-								b = std::min((u16)31,b);
-
-								T2WriteWord(cap_dst, i << 1, a | (b << 10) | (g << 5) | r);
+							if(b_alpha)
+							{
+								a = 0x8000;
+								r += ((srcB[i] & 0x1F) * gpu->dispCapCnt.EVB);
+								g += (((srcB[i] >>  5) & 0x1F) * gpu->dispCapCnt.EVB);
+								b += (((srcB[i] >> 10) & 0x1F) * gpu->dispCapCnt.EVB);
 							}
+
+							r >>= 4;
+							g >>= 4;
+							b >>= 4;
+
+							//freedom wings sky will overflow while doing some fsaa/motionblur effect without this
+							r = std::min((u16)31,r);
+							g = std::min((u16)31,g);
+							b = std::min((u16)31,b);
+
+							T2WriteWord(cap_dst, i << 1, a | (b << 10) | (g << 5) | r);
 						}
 					}
 				break;
@@ -2821,17 +2827,17 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip)
 	gpu->setup_windows<0>();
 	gpu->setup_windows<1>();
 
-	//always generate the 2d+3d, no matter what we're displaying, since we may need to capture it
-	//(if this seems inefficient in some cases, consider that the speed in those cases is not really a problem)
-	GPU_tempScanline = screen->gpu->currDst = (u8 *)(GPU_screen) + (screen->offset + l) * 512;
-	GPU_ligne_layer(screen, l);
-
-	if (gpu->core == GPU_MAIN) 
-	{
-		GPU_ligne_DispCapture<false>(l);
-		if (l == 191) { disp_fifo.head = disp_fifo.tail = 0; }
+	//generate the 2d engine output
+	if(gpu->dispMode == 1) {
+		//optimization: render straight to the output buffer when thats what we are going to end up displaying anyway
+		GPU_tempScanline = screen->gpu->currDst = (u8 *)(GPU_screen) + (screen->offset + l) * 512;
+	} else {
+		//otherwise, we need to go to a temp buffer
+		GPU_tempScanline = screen->gpu->currDst = (u8 *)GPU_tempScanlineBuffer;
 	}
 
+	GPU_ligne_layer(screen, l);
+
 	switch (gpu->dispMode)
 	{
 		case 0: // Display Off(Display white)
@@ -2847,7 +2853,7 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip)
 			//do nothing: it has already been generated into the right place
 			break;
 
-		case 2: // Display framebuffer
+		case 2: // Display vram framebuffer
 			{
 				u8 * dst = GPU_screen + (screen->offset + l) * 512;
 				u8 * src = gpu->VRAMaddr + (l*512);
@@ -2856,6 +2862,8 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip)
 			break;
 		case 3: // Display memory FIFO
 			{
+				//this has not been tested since the dma timing for dispfifo was changed around the time of
+				//newemuloop. it may not work.
 				u8 * dst =  GPU_screen + (screen->offset + l) * 512;
 				for (int i=0; i < 128; i++)
 					T1WriteLong(dst, i << 2, DISP_FIFOrecv() & 0x7FFF7FFF);
@@ -2863,6 +2871,17 @@ void GPU_ligne(NDS_Screen * screen, u16 l, bool skip)
 			break;
 	}
 
+	//capture after displaying so that we can safely display vram before overwriting it here
+	if (gpu->core == GPU_MAIN) 
+	{
+		//BUG!!! if someone is capturing and displaying both from the fifo, then it will have been 
+		//consumed above by the display before we get here
+		//(is that even legal? i think so)
+		GPU_ligne_DispCapture<false>(l);
+		if (l == 191) { disp_fifo.head = disp_fifo.tail = 0; }
+	}
+
+
 	GPU_ligne_MasterBrightness(screen, l);
 }
 
diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp
index 4ad7c0952..d164b71b6 100644
--- a/desmume/src/MMU.cpp
+++ b/desmume/src/MMU.cpp
@@ -1090,15 +1090,12 @@ void FASTCALL MMU_doDMA(u32 num)
 	taille = (MMU.DMACrt[PROCNUM][num]&0x1FFFFF);
 	if(taille == 0) taille = 0x200000; //according to gbatek..
 	
-	//THIS IS A BIG HACK
-	// If we are in "Main memory display" mode just copy an entire 
-	// screen (256x192 pixels). 
-	//    Reference:  http://nocash.emubase.de/gbatek.htm#dsvideocaptureandmainmemorydisplaymode
-	//       (under DISP_MMEM_FIFO)
-	if ((MMU.DMAStartTime[PROCNUM][num]==EDMAMode_MemDisplay) &&		// Must be in main memory display mode
-		(taille==4) &&							// Word must be 4
-		(((MMU.DMACrt[PROCNUM][num]>>26)&1) == 1))	// Transfer mode must be 32bit wide
-		taille = 24576; //256*192/2;
+	//for main memory display fifo dmas, check for normal conditions and then dma all 128 bytes at once
+	//(theyll get sent to the fifo, which can handle more than it ought to be able to)
+	if ((MMU.DMAStartTime[PROCNUM][num]==EDMAMode_MemDisplay) &&
+		(taille==4) &&
+		(((MMU.DMACrt[PROCNUM][num]>>26)&1) == 1))
+		taille = 128; 
 	
 	if(MMU.DMAStartTime[PROCNUM][num] == EDMAMode_Card)
 		taille *= 0x80;
diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp
index e2ac30a23..99bf8b711 100644
--- a/desmume/src/NDSSystem.cpp
+++ b/desmume/src/NDSSystem.cpp
@@ -1973,7 +1973,10 @@ static void execHardware_hstart()
 	if(nds.VCount<192)
 	{
 		//this is hacky.
-		//there is a corresponding hack in doDMA
+		//there is a corresponding hack in doDMA.
+		//it should be driven by a fifo (and generate just in time as the scanline is displayed)
+		//but that isnt even possible until we have some sort of sub-scanline timing.
+		//it may not be necessary.
 		execHardware_doAllDma(EDMAMode_MemDisplay);
 	}
 
diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp
index 111220403..c324ed889 100644
--- a/desmume/src/rasterize.cpp
+++ b/desmume/src/rasterize.cpp
@@ -345,7 +345,6 @@ struct Shader
 	{
 		mode = (polyattr>>4)&0x3;
 		//if there is no texture set, then set to the mode which doesnt even use a texture
-		//unless we're in shadow 
 		if(sampler.texFormat == 0 && mode != 3)
 			mode = 4;
 	}