diff --git a/GPU2D.cpp b/GPU2D.cpp
index 306a5227..6c73d940 100644
--- a/GPU2D.cpp
+++ b/GPU2D.cpp
@@ -174,6 +174,57 @@ void GPU2D::DrawScanline(u32 line)
 // temp. hax
 #define DrawBG_Text DrawBG_Text_4bpp
 
+template<u32 bgmode> // bgmode: BG mode number; DrawScanline_Mode1 instantiates this for each value of (DispCnt & 0x7) from 0 to 5
+void GPU2D::DrawScanlineBGMode(u32 line, u32* spritebuf, u16* dst) // draws the enabled BG layers and sprites of scanline 'line' into dst
+{
+    for (int i = 3; i >= 0; i--) // one pass per value 3..0 of the low 2 bits of BGCnt (presumably the priority level, lowest priority first -- TODO confirm)
+    {
+        if ((BGCnt[3] & 0x3) == i) // BG3 belongs to this pass
+        {
+            if (DispCnt & 0x0800) // BG3 is drawn only when DispCnt bit 11 is set
+            {
+                if (bgmode >= 3)
+                    {} // todo: ext (modes 3-5; nothing is drawn for BG3 yet)
+                else if (bgmode >= 1)
+                    {} // todo: rotscale (modes 1-2; nothing is drawn for BG3 yet)
+                else
+                    DrawBG_Text(line, dst, 3); // mode 0: BG3 is a text BG
+            }
+        }
+        if ((BGCnt[2] & 0x3) == i) // BG2 belongs to this pass
+        {
+            if (DispCnt & 0x0400) // BG2 is drawn only when DispCnt bit 10 is set
+            {
+                if (bgmode == 5)
+                    {} // todo: ext (mode 5; nothing is drawn for BG2 yet)
+                else if (bgmode == 4 || bgmode == 2)
+                    {} // todo: rotscale (modes 2 and 4; nothing is drawn for BG2 yet)
+                else
+                    DrawBG_Text(line, dst, 2); // modes 0, 1 and 3: BG2 is a text BG
+            }
+        }
+        if ((BGCnt[1] & 0x3) == i) // BG1 belongs to this pass
+        {
+            if (DispCnt & 0x0200) // BG1 is drawn only when DispCnt bit 9 is set
+            {
+                DrawBG_Text(line, dst, 1); // BG1 is drawn as a text BG in every mode handled here
+            }
+        }
+        if ((BGCnt[0] & 0x3) == i) // BG0 belongs to this pass
+        {
+            if (DispCnt & 0x0100) // BG0 is drawn only when DispCnt bit 8 is set
+            {
+                if ((!Num) && (DispCnt & 0x8)) // Num == 0 with DispCnt bit 3 set (presumably BG0 shows the 3D layer here -- TODO confirm)
+                    {} // TODO: nothing is drawn for BG0 in this case yet
+                else
+                    DrawBG_Text(line, dst, 0); // otherwise BG0 is a text BG
+            }
+        }
+        if (DispCnt & 0x1000) // sprites are merged only when DispCnt bit 12 is set
+            InterleaveSprites(spritebuf, 0x8000 | (i<<16), dst); // runs after the BGs of this pass; second argument carries i in bits 16 and up
+    }
+}
+
 void GPU2D::DrawScanline_Mode1(u32 line, u16* dst)
 {
     u32 backdrop;
@@ -193,61 +244,12 @@ void GPU2D::DrawScanline_Mode1(u32 line, u16* dst)
 
     switch (DispCnt & 0x7)
     {
-    case 0:
-        for (int i = 3; i >= 0; i--)
-        {
-            if ((BGCnt[3] & 0x3) == i)
-            {
-                if (DispCnt & 0x0800)
-                    DrawBG_Text(line, dst, 3);
-            }
-            if ((BGCnt[2] & 0x3) == i)
-            {
-                if (DispCnt & 0x0400)
-                    DrawBG_Text(line, dst, 2);
-            }
-            if ((BGCnt[1] & 0x3) == i)
-            {
-                if (DispCnt & 0x0200)
-                    DrawBG_Text(line, dst, 1);
-            }
-            if ((BGCnt[0] & 0x3) == i)
-            {
-                if (DispCnt & 0x0100)
-                    DrawBG_Text(line, dst, 0);
-            }
-            if (DispCnt & 0x1000)
-                InterleaveSprites(spritebuf, 0x8000 | (i<<16), dst);
-        }
-        break;
-
-    case 5:
-        for (int i = 3; i >= 0; i--)
-        {
-            if ((BGCnt[3] & 0x3) == i)
-            {
-                //if (DispCnt & 0x0800)
-                    // ext todo
-            }
-            if ((BGCnt[2] & 0x3) == i)
-            {
-                //if (DispCnt & 0x0400)
-                    // ext todo
-            }
-            if ((BGCnt[1] & 0x3) == i)
-            {
-                if (DispCnt & 0x0200)
-                    DrawBG_Text(line, dst, 1);
-            }
-            if ((BGCnt[0] & 0x3) == i)
-            {
-                if (DispCnt & 0x0100)
-                    DrawBG_Text(line, dst, 0);
-            }
-            if (DispCnt & 0x1000)
-                InterleaveSprites(spritebuf, 0x8000 | (i<<16), dst);
-        }
-        break;
+    case 0: DrawScanlineBGMode<0>(line, spritebuf, dst); break;
+    case 1: DrawScanlineBGMode<1>(line, spritebuf, dst); break;
+    case 2: DrawScanlineBGMode<2>(line, spritebuf, dst); break;
+    case 3: DrawScanlineBGMode<3>(line, spritebuf, dst); break;
+    case 4: DrawScanlineBGMode<4>(line, spritebuf, dst); break;
+    case 5: DrawScanlineBGMode<5>(line, spritebuf, dst); break;
     }
 
     // debug crap
@@ -255,8 +257,7 @@ void GPU2D::DrawScanline_Mode1(u32 line, u16* dst)
     //    dst[i] = *(u16*)&GPU::Palette[Num*0x400 + (i>>4)*2 + (line>>4)*32];
 }
 
-// char   06218000
-// screen 06208000
+
 void GPU2D::DrawBG_Text_4bpp(u32 line, u16* dst, u32 bgnum)
 {
     u16 bgcnt = BGCnt[bgnum];
@@ -304,42 +305,84 @@ void GPU2D::DrawBG_Text_4bpp(u32 line, u16* dst, u32 bgnum)
     u16* curpal;
     u8* pixels;
 
-    // preload shit as needed
-    if (xoff & 0x7)
+    if (bgcnt & 0x0080)
     {
-        // load a new tile
-        curtile = tilemap[((xoff & 0xFF) >> 3) + ((xoff & widexmask) << 2)];
-        curpal = pal + ((curtile & 0xF000) >> 8);
-        pixels = tileset + ((curtile & 0x03FF) << 5) + ((yoff & 0x7) << 2);
-        pixels += ((xoff & 0x7) >> 1);
-    }
+        // 256-color
 
-    for (int i = 0; i < 256; i++)
+        // preload shit as needed
+        if (xoff & 0x7)
+        {
+            // load a new tile
+            curtile = tilemap[((xoff & 0xFF) >> 3) + ((xoff & widexmask) << 2)];
+            curpal = pal;// + ((curtile & 0xF000) >> 8); // TODO: this applies to ext palettes
+            pixels = tileset + ((curtile & 0x03FF) << 6);
+            pixels += (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 3);
+        }
+
+        for (int i = 0; i < 256; i++)
+        {
+            if (!(xoff & 0x7))
+            {
+                // load a new tile
+                curtile = tilemap[((xoff & 0xFF) >> 3) + ((xoff & widexmask) << 2)];
+                curpal = pal;// + ((curtile & 0xF000) >> 8);
+                pixels = tileset + ((curtile & 0x03FF) << 6);
+                pixels += (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 3);
+            }
+
+            // draw pixel
+            u8 color;
+            u32 tilexoff = (curtile & 0x0400) ? (7-(xoff&0x7)) : (xoff&0x7);
+            color = pixels[tilexoff];
+
+            if (color)
+                dst[i] = curpal[color];
+
+            xoff++;
+        }
+    }
+    else
     {
-        if (!(xoff & 0x7))
+        // 16-color
+
+        // preload shit as needed
+        if (xoff & 0x7)
         {
             // load a new tile
             curtile = tilemap[((xoff & 0xFF) >> 3) + ((xoff & widexmask) << 2)];
             curpal = pal + ((curtile & 0xF000) >> 8);
-            pixels = tileset + ((curtile & 0x03FF) << 5) + ((yoff & 0x7) << 2);
+            pixels = tileset + ((curtile & 0x03FF) << 5);
+            pixels += (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 2);
         }
 
-        // draw pixel
-        u8 color;
-        if (xoff & 0x1)
+        for (int i = 0; i < 256; i++)
         {
-            color = *pixels >> 4;
-            pixels++;
-        }
-        else
-        {
-            color = *pixels & 0x0F;
-        }
+            if (!(xoff & 0x7))
+            {
+                // load a new tile
+                curtile = tilemap[((xoff & 0xFF) >> 3) + ((xoff & widexmask) << 2)];
+                curpal = pal + ((curtile & 0xF000) >> 8);
+                pixels = tileset + ((curtile & 0x03FF) << 5);
+                pixels += (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 2);
+            }
 
-        if (color)
-            dst[i] = curpal[color];
+            // draw pixel
+            u8 color;
+            u32 tilexoff = (curtile & 0x0400) ? (7-(xoff&0x7)) : (xoff&0x7);
+            if (tilexoff & 0x1)
+            {
+                color = pixels[tilexoff>>1] >> 4;
+            }
+            else
+            {
+                color = pixels[tilexoff>>1] & 0x0F;
+            }
 
-        xoff++;
+            if (color)
+                dst[i] = curpal[color];
+
+            xoff++;
+        }
     }
 }
 
diff --git a/GPU2D.h b/GPU2D.h
index b65644cd..f26e92ab 100644
--- a/GPU2D.h
+++ b/GPU2D.h
@@ -48,6 +48,7 @@ private:
     u16 BGXPos[4];
     u16 BGYPos[4];
 
+    template<u32 bgmode> void DrawScanlineBGMode(u32 line, u32* spritebuf, u16* dst);
     void DrawScanline_Mode1(u32 line, u16* dst);
 
     void DrawBG_Text_4bpp(u32 line, u16* dst, u32 num);
diff --git a/NDS.cpp b/NDS.cpp
index 1e320c8b..413409aa 100644
--- a/NDS.cpp
+++ b/NDS.cpp
@@ -274,7 +274,7 @@ void Reset()
     // test
     //LoadROM();
     //LoadFirmware();
-    NDSCart::LoadROM("rom/sm64ds.nds");
+    NDSCart::LoadROM("rom/nsmb.nds");
 
     Running = true; // hax
 }
diff --git a/melonDS.depend b/melonDS.depend
index 767e4b7a..66d5ae20 100644
--- a/melonDS.depend
+++ b/melonDS.depend
@@ -10,7 +10,7 @@
 
 1481161027 c:\documents\sources\melonds\types.h
 
-1485982397 source:c:\documents\sources\melonds\nds.cpp
+1485988849 source:c:\documents\sources\melonds\nds.cpp
 	<stdio.h>
 	<string.h>
 	"NDS.h"
@@ -86,13 +86,13 @@
 	"NDS.h"
 	"SPI.h"
 
-1485971044 source:c:\documents\sources\melonds\gpu2d.cpp
+1485988879 source:c:\documents\sources\melonds\gpu2d.cpp
 	<stdio.h>
 	<string.h>
 	"NDS.h"
 	"GPU.h"
 
-1484969589 c:\documents\sources\melonds\gpu2d.h
+1485985325 c:\documents\sources\melonds\gpu2d.h
 
 1481040524 c:\documents\sources\melonds\wifi.h