diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp
index 84c392deb..569b674a2 100644
--- a/desmume/src/OGLRender.cpp
+++ b/desmume/src/OGLRender.cpp
@@ -78,8 +78,8 @@ static void ENDGL() {
 #define CTASSERT(x) typedef char __assert ## y[(x) ? 1 : -1]
 #endif
 
-static ALIGN(16) unsigned char GPU_screen3D [256*256*4]={0};
-static ALIGN(16) unsigned char GPU_screenStencil[256*256]={0};
+static ALIGN(16) unsigned char GPU_screen3D [256*256*4];
+static ALIGN(16) unsigned char GPU_screenStencil[256*256];
 
 static const unsigned short map3d_cull[4] = {GL_FRONT_AND_BACK, GL_FRONT, GL_BACK, 0};
 static const int texEnv[4] = { GL_MODULATE, GL_DECAL, GL_MODULATE, GL_MODULATE };
@@ -1161,6 +1161,16 @@ static void GL_ReadFramebuffer()
 	glReadPixels(0,0,256,192,GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, GPU_screenStencil);
 	ENDGL();
 
+	//convert the pixels to a different format which is more convenient
+	//is it safe to modify the screen buffer? if not, we could make a temp copy
+	for(int i=0;i<256*192;i++) {
+		int t = i<<2;
+		u32 &u32screen3D = *(u32*)&GPU_screen3D[t];
+		u32screen3D>>=3;
+		u32screen3D &= 0x1F1F1F1F;
+	}
+
+
 	//debug: view depth buffer via color buffer for debugging
 	//int ctr=0;
 	//for(ctr=0;ctr<256*192;ctr++) {
@@ -1207,7 +1217,8 @@ static void GetLineCaptured(int line, u16* dst)
 		u32 g = screen3D[t+1];
 		u32 b = screen3D[t+2];
 
-		dst[i] = ((b>>3)<<10) | ((g>>3)<<5) | (r>>3) | 0x8000;
+		//if this math strikes you as wrong, be sure to look at GL_ReadFramebuffer() where the pixel format in screen3D is changed
+		dst[i] = (b<<10) | (g<<5) | (r>>3) | 0x8000;
 	}
 }
 
@@ -1233,7 +1244,7 @@ static void GetLine (int line, u16* dst)
 	//in fact, we are going to do that to fix some problems.
 	//but beware that it i figure it might could CAUSE some problems
 
-	//this alpha compositing blending logic isnt thought through at all
+	//this alpha compositing blending logic isnt thought through very much
 	//someone needs to think about what bitdepth it should take place at and how to do it efficiently
 
 	for(int i = 0; i < 256; i++)
@@ -1243,31 +1254,42 @@ static void GetLine (int line, u16* dst)
 		//you would use this if you wanted to use the stencil buffer to make decisions here
 		if(!stencil) continue;
 
+		u16 oldcolor = dst[i];
+
 		int t=i<<2;
-		u32 r = screen3D[t+0];
-		u32 g = screen3D[t+1];
-		u32 b = screen3D[t+2];
-		u32 a = screen3D[t+3];
-
-		u32 oldcolor = RGB15TO32(dst[i],0);
-		u32 oldr = oldcolor&0xFF;
-		u32 oldg = (oldcolor>>8)&0xFF;
-		u32 oldb = (oldcolor>>16)&0xFF;
-
-		r = (r*a + oldr*(255-a)) >> 8;
-		g = (g*a + oldg*(255-a)) >> 8;
-		b = (b*a + oldb*(255-a)) >> 8;
-
-		r=std::min((u32)255,r);
-		g=std::min((u32)255,g);
-		b=std::min((u32)255,b);
-
-		//debug: display alpha channel
+		u32 dstpixel;
+
+		//old debug reminder: display alpha channel
 		//u32 r = screen3D[t+3];
 		//u32 g = screen3D[t+3];
 		//u32 b = screen3D[t+3];
 
-		dst[i] = ((b>>3)<<10) | ((g>>3)<<5) | (r>>3);
+		//if this math strikes you as wrong, be sure to look at GL_ReadFramebuffer() where the pixel format in screen3D is changed
+
+		u32 a = screen3D[t+3];
+
+		typedef u8 mixtbl[32][32];
+		mixtbl & mix = mixTable555[a];
+
+		//r
+		u32 newpix = screen3D[t+0];
+		u32 oldpix = oldcolor&0x1F;
+		newpix = mix[newpix][oldpix];
+		dstpixel = newpix;
+
+		//g
+		newpix = screen3D[t+1];
+		oldpix = (oldcolor>>5)&0x1F;
+		newpix = mix[newpix][oldpix];
+		dstpixel |= (newpix<<5);
+
+		//b
+		newpix = screen3D[t+2];
+		oldpix = (oldcolor>>10)&0x1F;
+		newpix = mix[newpix][oldpix];
+		dstpixel |= (newpix<<10);
+
+		dst[i] = dstpixel;
 	}
 }
 
diff --git a/desmume/src/armcpu.cpp b/desmume/src/armcpu.cpp
index 5492528b5..f6426175d 100644
--- a/desmume/src/armcpu.cpp
+++ b/desmume/src/armcpu.cpp
@@ -502,7 +502,8 @@ u32 armcpu_exec()
 {
 	u32 c = 1;
 
-	assert(ARMPROC.instruct_adr!=0x00000000);
+	//this assert is annoying. but sometimes it is handy.
+	//assert(ARMPROC.instruct_adr!=0x00000000);
 
 #ifdef GDB_STUB
 	if (ARMPROC.stalled)
diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp
index f1f910e68..e301b3897 100644
--- a/desmume/src/gfx3d.cpp
+++ b/desmume/src/gfx3d.cpp
@@ -39,6 +39,7 @@ GFX3D gfx3d;
 
 //tables that are provided to anyone
 CACHE_ALIGN u32 color_15bit_to_24bit[32768];
+CACHE_ALIGN u8 mixTable555[32][32][32];
 
 //is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX
 CACHE_ALIGN const int material_5bit_to_31bit[] = {
@@ -184,6 +185,13 @@ static void makeTables() {
 
 	for (int i = 0; i < 1024; i++)
 		normalTable[i] = ((signed short)(i<<6)) / (float)(1<<15);
+
+	for(int r=0;r<=31;r++)
+		for(int oldr=0;oldr<=31;oldr++)
+			for(int a=0;a<=31;a++) {
+				int temp = (r*a + oldr*(31-a)) / 31;
+				mixTable555[a][r][oldr] = temp;
+			}
 }
 
 void gfx3d_init()
diff --git a/desmume/src/gfx3d.h b/desmume/src/gfx3d.h
index 3647e44a2..49209fbfb 100644
--- a/desmume/src/gfx3d.h
+++ b/desmume/src/gfx3d.h
@@ -131,6 +131,7 @@ extern GFX3D gfx3d;
 #define RGB15TO32_DIRECT(col,alpha8) ( ((alpha8)<<24) | (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
 
 extern CACHE_ALIGN u32 color_15bit_to_24bit[32768];
+extern CACHE_ALIGN u8 mixTable555[32][32][32];
 extern CACHE_ALIGN const int material_5bit_to_31bit[32];
 extern CACHE_ALIGN const u8 material_5bit_to_8bit[32];
 extern CACHE_ALIGN const u8 material_3bit_to_8bit[8];
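Note (not part of the patch): the in-place conversion added to GL_ReadFramebuffer relies on a small bit trick: shifting the packed RGBA8 word right by 3 and masking with 0x1F1F1F1F reduces all four 8-bit channels to their top 5 bits in one operation. A minimal standalone sketch follows; the sample pixel values and the byte-wise check are illustrative only, and alignas(4) stands in for the ALIGN(16) guarantee on GPU_screen3D.

// sketch: reduce one packed RGBA8 pixel to 5 bits per channel in place,
// the same way the new loop in GL_ReadFramebuffer does for the whole buffer
#include <cassert>
#include <cstdint>
#include <cstdio>

typedef uint8_t u8;
typedef uint32_t u32;

int main()
{
	// one RGBA8 pixel as it sits in GPU_screen3D: r,g,b,a bytes in memory
	// (aligned so the u32 type-pun below is safe, mirroring the aligned buffer)
	alignas(4) u8 pixel[4] = { 200, 100, 50, 255 };

	// >>3 moves the top 5 bits of every byte down; the 0x1F1F1F1F mask then
	// discards the 3 bits that leaked in from the neighboring channel
	u32 &packed = *(u32*)&pixel[0];
	packed >>= 3;
	packed &= 0x1F1F1F1F;

	// each byte should now equal its original value >> 3, on either endianness
	assert(pixel[0] == 200>>3 && pixel[1] == 100>>3 &&
	       pixel[2] ==  50>>3 && pixel[3] == 255>>3);

	printf("5-bit channels: %d %d %d %d\n", pixel[0], pixel[1], pixel[2], pixel[3]);
	return 0;
}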
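Note (not part of the patch): the mixTable555 table added in gfx3d.cpp lets GetLine replace the old per-channel multiply/shift alpha blend with one table lookup per channel, entirely in 5-bit space. The sketch below mirrors the patch's table layout and blend order (mixTable555[a][new][old]); the helper names, the sample pixel, and the arithmetic cross-check are made up for illustration.

// sketch: build the 32x32x32 blend table and composite one RGBA pixel
// (already reduced to 5 bits per channel, as GL_ReadFramebuffer now does)
// over an existing RGB555 destination pixel
#include <cassert>
#include <cstdint>
#include <cstdio>

typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;

static u8 mixTable555[32][32][32];

static void makeMixTable()
{
	// mixTable555[a][new][old] = (new*a + old*(31-a)) / 31, all values 0..31
	for(int r=0;r<=31;r++)
		for(int oldr=0;oldr<=31;oldr++)
			for(int a=0;a<=31;a++)
				mixTable555[a][r][oldr] = (u8)((r*a + oldr*(31-a)) / 31);
}

// screen3D points at 4 bytes (r,g,b,a), each already in 0..31
static u16 blendOver555(const u8 *screen3D, u16 oldcolor)
{
	u32 a = screen3D[3];
	typedef u8 mixtbl[32][32];
	mixtbl &mix = mixTable555[a];          // pick the plane for this alpha once

	u32 r = mix[screen3D[0]][ oldcolor      & 0x1F];
	u32 g = mix[screen3D[1]][(oldcolor>>5)  & 0x1F];
	u32 b = mix[screen3D[2]][(oldcolor>>10) & 0x1F];
	return (u16)((b<<10) | (g<<5) | r);
}

int main()
{
	makeMixTable();

	const u8  pixel[4] = { 31, 16, 0, 20 };        // 5-bit r,g,b and alpha
	const u16 old555   = (10<<10) | (20<<5) | 5;   // arbitrary framebuffer pixel

	u16 out = blendOver555(pixel, old555);

	// cross-check against the formula the table was built from
	u32 a = pixel[3];
	u32 r = (pixel[0]*a + ( old555      & 0x1F)*(31-a)) / 31;
	u32 g = (pixel[1]*a + ((old555>>5)  & 0x1F)*(31-a)) / 31;
	u32 b = (pixel[2]*a + ((old555>>10) & 0x1F)*(31-a)) / 31;
	assert(out == (u16)((b<<10) | (g<<5) | r));

	printf("blended RGB555 = 0x%04x\n", (unsigned)out);
	return 0;
}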