Merge pull request #930 from PatrickvL/nv2a_work

NV2A work enabling more OpenGL code
2018-02-19 20:58:25 +00:00 · 2018-02-19 20:58:25 +00:00 · a9f2f78b04
parent c101a21240 25ee163a86
commit a9f2f78b04
23 changed files with 5149 additions and 718 deletions
--- a/build/win32/Cxbx.vcxproj
+++ b/build/win32/Cxbx.vcxproj
@ -264,8 +264,16 @@
    <ClInclude Include="..\..\src\devices\SMBus.h" />
    <ClInclude Include="..\..\src\devices\SMCDevice.h" />
    <ClInclude Include="..\..\src\devices\SMDevice.h" />
+    <ClInclude Include="..\..\src\devices\video\glextensions.h" />
+    <ClInclude Include="..\..\src\devices\video\gloffscreen.h" />
    <ClInclude Include="..\..\src\devices\video\nv2a.h" />
+    <ClInclude Include="..\..\src\devices\video\nv2a_debug.h" />
    <ClInclude Include="..\..\src\devices\video\nv2a_int.h" />
+    <ClInclude Include="..\..\src\devices\video\nv2a_psh.h" />
+    <ClInclude Include="..\..\src\devices\video\nv2a_shaders.h" />
+    <ClInclude Include="..\..\src\devices\video\nv2a_shaders_common.h" />
+    <ClInclude Include="..\..\src\devices\video\nv2a_vsh.h" />
+    <ClInclude Include="..\..\src\devices\video\queue.h" />
    <ClInclude Include="..\..\src\devices\video\swizzle.h" />
    <ClInclude Include="..\..\src\devices\video\vga.h" />
    <ClInclude Include="..\..\src\devices\Xbox.h" />
@ -632,7 +640,13 @@
    <ClCompile Include="..\..\src\devices\SMBus.cpp" />
    <ClCompile Include="..\..\src\devices\SMCDevice.cpp" />
    <ClCompile Include="..\..\src\devices\SMDevice.cpp" />
+    <ClCompile Include="..\..\src\devices\video\glextensions.c" />
+    <ClCompile Include="..\..\src\devices\video\gloffscreen.c" />
    <ClCompile Include="..\..\src\devices\video\nv2a.cpp" />
+    <ClCompile Include="..\..\src\devices\video\nv2a_debug.c" />
+    <ClCompile Include="..\..\src\devices\video\nv2a_psh.cpp" />
+    <ClCompile Include="..\..\src\devices\video\nv2a_shaders.cpp" />
+    <ClCompile Include="..\..\src\devices\video\nv2a_vsh.cpp" />
    <ClCompile Include="..\..\src\devices\video\swizzle.cpp" />
    <ClCompile Include="..\..\src\devices\Xbox.cpp" />
  </ItemGroup>
--- a/build/win32/Cxbx.vcxproj.filters
+++ b/build/win32/Cxbx.vcxproj.filters
@ -217,18 +217,12 @@
    <ClCompile Include="..\..\src\devices\SMDevice.cpp">
      <Filter>Hardware</Filter>
    </ClCompile>
-    <ClCompile Include="..\..\src\devices\video\nv2a.cpp">
-      <Filter>Hardware</Filter>
-    </ClCompile>
    <ClCompile Include="..\..\src\Common\CxbxDebugger.cpp">
      <Filter>Shared</Filter>
    </ClCompile>
    <ClCompile Include="..\..\src\devices\Xbox.cpp">
      <Filter>Hardware</Filter>
    </ClCompile>
-    <ClCompile Include="..\..\src\devices\EmuNVNet.cpp">
-      <Filter>Hardware</Filter>
-    </ClCompile>
    <ClCompile Include="..\..\src\Common\XbePrinter.cpp">
      <Filter>Shared</Filter>
    </ClCompile>
@ -241,12 +235,36 @@
    <ClCompile Include="..\..\src\CxbxKrnl\crc32c.cpp">
      <Filter>Shared</Filter>
    </ClCompile>
-    <ClCompile Include="..\..\src\devices\video\swizzle.cpp">
-      <Filter>Hardware</Filter>
-    </ClCompile>
    <ClCompile Include="..\..\src\CxbxKrnl\EmuKrnlKi.cpp">
      <Filter>Kernel</Filter>
    </ClCompile>
+    <ClCompile Include="..\..\src\devices\EmuNVNet.cpp">
+      <Filter>Hardware\Video</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\devices\video\glextensions.c">
+      <Filter>Hardware\Video</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\devices\video\gloffscreen.c">
+      <Filter>Hardware\Video</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\devices\video\nv2a.cpp">
+      <Filter>Hardware\Video</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\devices\video\nv2a_debug.c">
+      <Filter>Hardware\Video</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\devices\video\nv2a_psh.cpp">
+      <Filter>Hardware\Video</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\devices\video\nv2a_shaders.cpp">
+      <Filter>Hardware\Video</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\devices\video\nv2a_vsh.cpp">
+      <Filter>Hardware\Video</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\devices\video\swizzle.cpp">
+      <Filter>Hardware\Video</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\..\src\Cxbx\DlgControllerConfig.h">
@ -447,27 +465,15 @@
    <ClInclude Include="..\..\src\devices\SMDevice.h">
      <Filter>Hardware</Filter>
    </ClInclude>
-    <ClInclude Include="..\..\src\devices\video\nv2a.h">
-      <Filter>Hardware</Filter>
-    </ClInclude>
    <ClInclude Include="..\..\src\devices\LED.h">
      <Filter>Hardware</Filter>
    </ClInclude>
    <ClInclude Include="..\..\src\devices\Xbox.h">
      <Filter>Hardware</Filter>
    </ClInclude>
-    <ClInclude Include="..\..\src\devices\EmuNVNet.h">
-      <Filter>Hardware</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\devices\video\nv2a_int.h">
-      <Filter>Hardware</Filter>
-    </ClInclude>
    <ClInclude Include="..\..\src\Common\CxbxDebugger.h">
      <Filter>Shared</Filter>
    </ClInclude>
-    <ClInclude Include="..\..\src\devices\video\vga.h">
-      <Filter>Hardware</Filter>
-    </ClInclude>
    <ClInclude Include="..\..\src\Common\XbePrinter.h">
      <Filter>Shared</Filter>
    </ClInclude>
@ -483,8 +489,44 @@
    <ClInclude Include="..\..\src\CxbxKrnl\EmuKrnlKi.h">
      <Filter>Kernel</Filter>
    </ClInclude>
+    <ClInclude Include="..\..\src\devices\EmuNVNet.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\glextensions.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\gloffscreen.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\nv2a.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\nv2a_debug.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\nv2a_int.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\nv2a_psh.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\nv2a_shaders.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\nv2a_shaders_common.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\nv2a_vsh.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
    <ClInclude Include="..\..\src\devices\video\swizzle.h">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\vga.h">
+      <Filter>Hardware\Video</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\devices\video\queue.h">
+      <Filter>Hardware\Video</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
@ -741,67 +783,67 @@
      <Filter>HLEDatabase\XOnline</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_DEBUG.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PBUS.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PCOUNTER.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PCRTC.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PFB.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PFIFO.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PGRAPH.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PMC.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PRAMDAC.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PRAMIN.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PRMA.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PRMCIO.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PRMDIO.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PRMFB.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PRMVIO.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PSTRAPS.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PTIMER.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PTV.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PVIDEO.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_PVPE.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
    <None Include="..\..\src\devices\video\EmuNV2A_USER.cpp">
-      <Filter>Hardware</Filter>
+      <Filter>Hardware\Video</Filter>
    </None>
  </ItemGroup>
  <ItemGroup>
@ -847,6 +889,9 @@
    <Filter Include="Hardware">
      <UniqueIdentifier>{922ab09b-aa8e-41bb-b781-58654160ee3d}</UniqueIdentifier>
    </Filter>
+    <Filter Include="Hardware\Video">
+      <UniqueIdentifier>{deba5d3e-9a1a-4099-bc91-12737a48272e}</UniqueIdentifier>
+    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ResourceCompile Include="..\..\resource\Cxbx.rc">
--- a/src/CxbxKrnl/CxbxKrnl.cpp
+++ b/src/CxbxKrnl/CxbxKrnl.cpp
@ -1083,17 +1083,18 @@ __declspec(noreturn) void CxbxKrnlInit

 	SetupXboxDeviceTypes();

-	InitXboxHardware(HardwareModel::Revision1_5); // TODO : Make configurable
-
-	// Now the hardware devices exist, couple the EEPROM buffer to it's device
-	g_EEPROM->SetEEPROM((uint8_t*)EEPROM);
-
 	if (bLLE_GPU)
 	{
 		DbgPrintf("INIT: Initializing OpenGL.\n");
 		InitOpenGLContext();
 	}
-	else
+
+	InitXboxHardware(HardwareModel::Revision1_5); // TODO : Make configurable
+
+	// Now the hardware devices exist, couple the EEPROM buffer to its device
+	g_EEPROM->SetEEPROM((uint8_t*)EEPROM);
+
+	if (!bLLE_GPU)
 	{
 		DbgPrintf("INIT: Initializing Direct3D.\n");
 		XTL::EmuD3DInit();
--- a/src/devices/video/EmuNV2A_PFIFO.cpp
+++ b/src/devices/video/EmuNV2A_PFIFO.cpp
@ -366,9 +366,7 @@ int pfifo_puller_thread(NV2AState *d)

 	Cache1State *state = &(d->pfifo.cache1);

-#ifdef COMPILE_OPENGL
-	glo_set_current(d->pgraph.gl_context);
-#endif
+	// glo_set_current(d->pgraph.gl_context);

 	std::unique_lock<std::mutex> cache_unique_lock(d->pfifo.cache1.cache_lock, std::defer_lock);

@ -381,9 +379,9 @@ int pfifo_puller_thread(NV2AState *d)

 			if (d->exiting) {
 				cache_unique_lock.unlock(); // UNTESTED
-#ifdef COMPILE_OPENGL
-				glo_set_current(NULL);
-#endif
+
+				// glo_set_current(NULL);
+
 				return 0;
 			}
 		}
--- a/src/devices/video/EmuNV2A_PGRAPH.cpp
+++ b/src/devices/video/EmuNV2A_PGRAPH.cpp
--- a/src/devices/video/g-lru-cache.c
+++ b/src/devices/video/g-lru-cache.c
@ -0,0 +1,368 @@
+/* g-lru-cache.c
+ *
+ * Copyright (C) 2009 - Christian Hergert
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ * 
+ * This is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* 
+ * Ideally, you want to use fast_get. This is because we are using a
+ * GRWLock, which is indeed slower than a mutex if you have lots of writer
+ * acquisitions. This doesn't make it a true LRU, though, as the oldest
+ * retrieval from storage is the first item evicted.
+ */
+
+#include "g-lru-cache.h"
+
+/* Looks up the GLruCachePrivate instance data that class_init registered
+ * for the G_TYPE_LRU_CACHE type on the given object. */
+#define LRU_CACHE_PRIVATE(object)          \
+    (G_TYPE_INSTANCE_GET_PRIVATE((object), \
+    G_TYPE_LRU_CACHE,                      \
+    GLruCachePrivate))
+
+/* Per-instance state of a GLruCache. */
+struct _GLruCachePrivate
+{
+	GRWLock         rw_lock;           /* taken by get/evict/clear/set_max_size around table and list access */
+	guint           max_size;          /* entry count at which a miss evicts the oldest entry first (1024 by default) */
+	gboolean        fast_get;          /* TRUE: cache hits do not move the entry to the head of the list */
+	
+	GHashTable     *hash_table;        /* key -> cached value */
+	GEqualFunc      key_equal_func;    /* compares a lookup key against the keys kept in the list */
+	GCopyFunc       key_copy_func;     /* optional: copies a key before it is inserted into hash_table */
+	GList          *newest;            /* head of the key list (most recently added/promoted) */
+	GList          *oldest;            /* tail of the key list (next eviction victim) */
+	
+	GLookupFunc     retrieve_func;     /* called on a cache miss to produce the value */
+	
+	gpointer        user_data;         /* passed to retrieve_func and key_copy_func */
+	GDestroyNotify  user_destroy_func; /* called on user_data when the cache is finalized */
+};
+
+/* Registers GLruCache as a GObject subclass; this provides
+ * g_lru_cache_get_type() and the g_lru_cache_parent_class used by finalize. */
+G_DEFINE_TYPE (GLruCache, g_lru_cache, G_TYPE_OBJECT);
+
+/*
+ * GObject finalize handler.
+ *
+ * Releases everything the instance owns: the user data (through
+ * user_destroy_func, when both are set), the hash table (which runs the
+ * key/value destroy functions given at construction), the key list and
+ * the reader/writer lock, then chains up to the parent class.
+ */
+static void
+g_lru_cache_finalize (GObject *object)
+{
+	GLruCachePrivate *priv = LRU_CACHE_PRIVATE (object);
+
+	if (priv->user_data && priv->user_destroy_func)
+		priv->user_destroy_func (priv->user_data);
+
+	priv->user_data = NULL;
+	priv->user_destroy_func = NULL;
+
+	/* The table is only created by g_lru_cache_new_full(); an instance
+	 * built directly with g_object_new() never gets one. */
+	if (priv->hash_table != NULL)
+		g_hash_table_destroy (priv->hash_table);
+	priv->hash_table = NULL;
+
+	/* Only the list nodes are freed here; the list does not own its keys. */
+	g_list_free (priv->newest);
+	priv->newest = NULL;
+	priv->oldest = NULL;
+
+	/* Pairs with the g_rw_lock_init() done in g_lru_cache_init(). */
+	g_rw_lock_clear (&priv->rw_lock);
+
+	G_OBJECT_CLASS (g_lru_cache_parent_class)->finalize (object);
+}
+
+/* Class initializer: installs the finalize handler and reserves room for
+ * the per-instance GLruCachePrivate data. */
+static void
+g_lru_cache_class_init (GLruCacheClass *klass)
+{
+	GObjectClass *gobject_class;
+
+	gobject_class = G_OBJECT_CLASS (klass);
+	g_type_class_add_private (gobject_class, sizeof (GLruCachePrivate));
+	gobject_class->finalize = g_lru_cache_finalize;
+}
+
+/* Instance initializer: binds the private data and applies the defaults
+ * (limit of 1024 entries, fast_get disabled, lock initialized). */
+static void
+g_lru_cache_init (GLruCache *self)
+{
+	GLruCachePrivate *priv = LRU_CACHE_PRIVATE (self);
+
+	self->priv = priv;
+
+	priv->max_size = 1024;
+	priv->fast_get = FALSE;
+	g_rw_lock_init (&priv->rw_lock);
+}
+
+/*
+ * Drops up to @n entries from the tail (oldest end) of the key list and
+ * removes the matching hash table entries.  Stops early when the list
+ * runs out.  The caller must already hold the writer lock.
+ */
+static void
+g_lru_cache_evict_n_oldest_locked (GLruCache *self, gint n)
+{
+	GLruCachePrivate *priv = self->priv;
+	gint              remaining;
+
+	for (remaining = n; remaining > 0; remaining--)
+	{
+		GList *tail = priv->oldest;
+		GList *before;
+
+		if (tail == NULL)
+			return;
+
+		/* Detach the tail node; its predecessor becomes the new tail. */
+		before = tail->prev;
+		if (before != NULL)
+			before->next = NULL;
+		priv->oldest = before;
+
+		g_hash_table_remove (priv->hash_table, tail->data);
+
+		if (tail == priv->newest)
+			priv->newest = NULL;
+
+		/* The node's data is not freed here; the hash table owns it. */
+		g_list_free1 (tail);
+	}
+
+#if DEBUG
+	g_assert (g_hash_table_size (priv->hash_table) == g_list_length (priv->newest));
+#endif
+}
+
+/*
+ * Convenience constructor: same as g_lru_cache_new_full() with no key or
+ * value copy/destroy functions and both type arguments set to 0.
+ */
+GLruCache*
+g_lru_cache_new (GHashFunc      key_hash_func,
+                 GEqualFunc     key_equal_func,
+                 GLookupFunc    retrieve_func,
+                 gpointer       user_data,
+                 GDestroyNotify user_destroy_func)
+{
+	GLruCache *cache;
+
+	cache = g_lru_cache_new_full (0, NULL, NULL,   /* key: type, copy, destroy */
+	                              0, NULL, NULL,   /* value: type, copy, destroy */
+	                              key_hash_func,
+	                              key_equal_func,
+	                              retrieve_func,
+	                              user_data,
+	                              user_destroy_func);
+
+	return cache;
+}
+
+/*
+ * Creates a cache.
+ *
+ * key_hash_func, key_equal_func, key_destroy_func and value_destroy_func
+ * are handed to g_hash_table_new_full().  key_copy_func, when set, is used
+ * to copy a key before it is stored.  retrieve_func is called on a cache
+ * miss with user_data; user_destroy_func is applied to user_data when the
+ * cache is finalized.
+ *
+ * key_type, value_type and value_copy_func are accepted but not used by
+ * this implementation.
+ */
+GLruCache*
+g_lru_cache_new_full (GType          key_type,
+                      GCopyFunc      key_copy_func,
+                      GDestroyNotify key_destroy_func,
+                      GType          value_type,
+                      GCopyFunc      value_copy_func,
+                      GDestroyNotify value_destroy_func,
+                      GHashFunc      key_hash_func,
+                      GEqualFunc     key_equal_func,
+                      GLookupFunc    retrieve_func,
+                      gpointer       user_data,
+                      GDestroyNotify user_destroy_func)
+{
+	GLruCache *self = g_object_new (G_TYPE_LRU_CACHE, NULL);
+
+	self->priv->hash_table = g_hash_table_new_full (key_hash_func,
+	                                                key_equal_func,
+	                                                key_destroy_func,
+	                                                value_destroy_func);
+
+	/* GHashTable accepts a NULL equality function and then compares keys
+	 * by pointer.  The cache calls the stored comparator directly, so
+	 * substitute the equivalent g_direct_equal instead of keeping NULL. */
+	self->priv->key_equal_func = key_equal_func ? key_equal_func : g_direct_equal;
+	self->priv->key_copy_func = key_copy_func;
+	self->priv->retrieve_func = retrieve_func;
+	self->priv->user_data = user_data;
+	self->priv->user_destroy_func = user_destroy_func;
+
+	return self;
+}
+
+/*
+ * Changes the maximum number of entries the cache may hold.
+ *
+ * If the cache currently holds more than @max_size entries, the oldest
+ * ones are evicted until it fits.  Entries are never evicted merely
+ * because the limit shrank while the cache was still below the new limit.
+ */
+void
+g_lru_cache_set_max_size (GLruCache *self, guint max_size)
+{
+	g_return_if_fail (G_IS_LRU_CACHE (self));
+
+	GLruCachePrivate *priv = self->priv;
+	guint size;
+
+	g_rw_lock_writer_lock (&priv->rw_lock);
+
+	priv->max_size = max_size;
+
+	/* Evict only the real excess, measured under the lock. */
+	size = g_hash_table_size (priv->hash_table);
+	if (size > max_size)
+		g_lru_cache_evict_n_oldest_locked (self, (gint) (size - max_size));
+
+	g_rw_lock_writer_unlock (&priv->rw_lock);
+}
+
+/* Returns the configured entry limit, or (guint) -1 when @self is not a
+ * GLruCache. */
+guint
+g_lru_cache_get_max_size (GLruCache *self)
+{
+	guint limit;
+
+	g_return_val_if_fail (G_IS_LRU_CACHE (self), -1);
+
+	limit = self->priv->max_size;
+	return limit;
+}
+
+/* Returns the number of entries currently in the hash table, or
+ * (guint) -1 when @self is not a GLruCache.  The lock is not taken. */
+guint
+g_lru_cache_get_size (GLruCache *self)
+{
+	GHashTable *table;
+
+	g_return_val_if_fail (G_IS_LRU_CACHE (self), -1);
+
+	table = self->priv->hash_table;
+	return g_hash_table_size (table);
+}
+
+/*
+ * Returns the value cached for @key, fetching it through retrieve_func on
+ * a miss.
+ *
+ * On a miss the oldest entry is evicted first if the cache is at
+ * capacity, then retrieve_func (key, user_data, &err) is called with the
+ * writer lock held.  If it reports an error, the error is propagated to
+ * @error and whatever retrieve_func returned is passed back without being
+ * cached.  A NULL result is returned but not cached, because a stored NULL
+ * cannot be told apart from a miss on the next lookup.
+ *
+ * The key stored in the list is the same pointer the hash table owns (the
+ * copy made by key_copy_func when one is set), so the list never refers
+ * to memory owned by the caller.
+ *
+ * On a hit, unless fast_get is enabled, the entry is moved to the head of
+ * the list so that it becomes the most recently used one.
+ */
+gpointer
+g_lru_cache_get (GLruCache *self, gpointer key, GError **error)
+{
+	g_return_val_if_fail (G_IS_LRU_CACHE (self), NULL);
+
+	GLruCachePrivate *priv = self->priv;
+	GEqualFunc equal = priv->key_equal_func;
+	GError *retrieve_error = NULL;
+	gboolean promote;
+	gpointer value;
+
+	g_rw_lock_reader_lock (&priv->rw_lock);
+
+	value = g_hash_table_lookup (priv->hash_table, key);
+
+	/* Decide while the list is protected whether the entry needs to be
+	 * moved; newest can be NULL if the cache was emptied concurrently. */
+	promote = value != NULL &&
+	          !priv->fast_get &&
+	          priv->newest != NULL &&
+	          !equal (key, priv->newest->data);
+
+#if DEBUG
+	if (value)
+		g_debug ("Cache Hit!");
+	else
+		g_debug ("Cache miss");
+#endif
+
+	g_rw_lock_reader_unlock (&priv->rw_lock);
+
+	if (value == NULL)
+	{
+		g_rw_lock_writer_lock (&priv->rw_lock);
+
+		/* Another thread may have stored the value between the two
+		 * locks; if so, use its result instead of retrieving again. */
+		value = g_hash_table_lookup (priv->hash_table, key);
+
+		if (value == NULL)
+		{
+			if (g_hash_table_size (priv->hash_table) >= priv->max_size)
+			{
+#if DEBUG
+				g_debug ("We are at capacity, must evict oldest");
+#endif
+				g_lru_cache_evict_n_oldest_locked (self, 1);
+			}
+
+#if DEBUG
+			g_debug ("Retrieving value from external resource");
+#endif
+
+			value = priv->retrieve_func (key, priv->user_data, &retrieve_error);
+
+			if (G_UNLIKELY (retrieve_error != NULL))
+			{
+				/* value is likely NULL, but stay transparent and
+				 * hand it back unchanged. */
+				g_propagate_error (error, retrieve_error);
+			}
+			else if (value != NULL)
+			{
+				gpointer stored_key = key;
+
+				if (priv->key_copy_func)
+					stored_key = priv->key_copy_func (key, priv->user_data);
+
+				g_hash_table_insert (priv->hash_table, stored_key, value);
+
+				priv->newest = g_list_prepend (priv->newest, stored_key);
+
+				if (priv->oldest == NULL)
+					priv->oldest = priv->newest;
+			}
+		}
+#if DEBUG
+		else g_debug ("Lost storage race with another thread");
+#endif
+
+		g_rw_lock_writer_unlock (&priv->rw_lock);
+	}
+
+	/* fast_get means that we do not reposition the item to the head
+	 * of the list. it essentially makes the lru, a lru from storage,
+	 * not lru to user.
+	 */
+
+	else if (promote)
+	{
+#if DEBUG
+		g_debug ("Making item most recent");
+#endif
+
+		g_rw_lock_writer_lock (&priv->rw_lock);
+
+		GList *node;
+
+		for (node = priv->newest; node != NULL; node = node->next)
+		{
+			if (equal (key, node->data))
+				break;
+		}
+
+		/* The entry may have been evicted, or already promoted, since
+		 * the reader lock was released. */
+		if (node != NULL && node != priv->newest)
+		{
+			if (priv->oldest == node)
+				priv->oldest = node->prev;
+
+			/* Unlink the node and splice it back in as the new head.
+			 * The node was not the head, so the remaining list is
+			 * non-empty. */
+			priv->newest = g_list_remove_link (priv->newest, node);
+			node->next = priv->newest;
+			priv->newest->prev = node;
+			priv->newest = node;
+		}
+
+		g_rw_lock_writer_unlock (&priv->rw_lock);
+	}
+
+	return value;
+}
+
+/*
+ * Removes the entry for @key, if any, from both the key list and the
+ * hash table.
+ *
+ * The list node is located and unlinked before the hash table entry is
+ * removed, so that key comparisons never touch a key the hash table has
+ * already destroyed.
+ */
+void
+g_lru_cache_evict (GLruCache *self, gpointer key)
+{
+	g_return_if_fail (G_IS_LRU_CACHE (self));
+
+	GLruCachePrivate *priv = self->priv;
+	GEqualFunc equal = priv->key_equal_func;
+	GList *node;
+
+	g_rw_lock_writer_lock (&priv->rw_lock);
+
+	for (node = priv->newest; node != NULL; node = node->next)
+	{
+		if (equal (key, node->data))
+			break;
+	}
+
+	if (node != NULL)
+	{
+		/* Keep the tail pointer valid when the victim is the oldest. */
+		if (priv->oldest == node)
+			priv->oldest = node->prev;
+
+		priv->newest = g_list_remove_link (priv->newest, node);
+		g_list_free1 (node); /* node->data is owned by hashtable */
+	}
+
+	g_hash_table_remove (priv->hash_table, key);
+
+	g_rw_lock_writer_unlock (&priv->rw_lock);
+}
+
+/* Empties the cache: removes every hash table entry and frees the key
+ * list, all under the writer lock. */
+void
+g_lru_cache_clear (GLruCache *self)
+{
+	GLruCachePrivate *priv;
+
+	g_return_if_fail (G_IS_LRU_CACHE (self));
+
+	priv = self->priv;
+
+	g_rw_lock_writer_lock (&priv->rw_lock);
+
+	g_hash_table_remove_all (priv->hash_table);
+	g_list_free (priv->newest);
+	priv->oldest = priv->newest = NULL;
+
+	g_rw_lock_writer_unlock (&priv->rw_lock);
+}
+
+/* Enables or disables fast_get (TRUE: hits do not reorder the list). */
+void
+g_lru_cache_set_fast_get (GLruCache *self, gboolean fast_get)
+{
+	GLruCachePrivate *priv;
+
+	g_return_if_fail (G_IS_LRU_CACHE (self));
+
+	priv = self->priv;
+	priv->fast_get = fast_get;
+}
+
+/* Returns the current fast_get setting, or FALSE when @self is not a
+ * GLruCache. */
+gboolean
+g_lru_cache_get_fast_get (GLruCache *self)
+{
+	gboolean enabled;
+
+	g_return_val_if_fail (G_IS_LRU_CACHE (self), FALSE);
+
+	enabled = self->priv->fast_get;
+	return enabled;
+}
+
--- a/src/devices/video/g-lru-cache.h
+++ b/src/devices/video/g-lru-cache.h
@ -0,0 +1,97 @@
+/* g-lru-cache.h
+ *
+ * Copyright (C) 2009 - Christian Hergert
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ * 
+ * This is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __G_LRU_CACHE_H__
+#define __G_LRU_CACHE_H__
+
+/* Library headers are included before any linkage block is opened, so
+ * that they are never compiled inside extern "C" by a C++ translation
+ * unit. */
+#include <glib.h>
+#include <glib-object.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+G_BEGIN_DECLS
+
+/* Standard GObject type, cast and type-check macros for GLruCache. */
+#define G_TYPE_LRU_CACHE		(g_lru_cache_get_type ())
+#define G_LRU_CACHE(obj)		(G_TYPE_CHECK_INSTANCE_CAST ((obj), G_TYPE_LRU_CACHE, GLruCache))
+#define G_LRU_CACHE_CONST(obj)		(G_TYPE_CHECK_INSTANCE_CAST ((obj), G_TYPE_LRU_CACHE, GLruCache const))
+#define G_LRU_CACHE_CLASS(klass)	(G_TYPE_CHECK_CLASS_CAST ((klass), G_TYPE_LRU_CACHE, GLruCacheClass))
+#define G_IS_LRU_CACHE(obj)		(G_TYPE_CHECK_INSTANCE_TYPE ((obj), G_TYPE_LRU_CACHE))
+#define G_IS_LRU_CACHE_CLASS(klass)	(G_TYPE_CHECK_CLASS_TYPE ((klass), G_TYPE_LRU_CACHE))
+#define G_LRU_CACHE_GET_CLASS(obj)	(G_TYPE_INSTANCE_GET_CLASS ((obj), G_TYPE_LRU_CACHE, GLruCacheClass))
+/* Casts a function to GLookupFunc; the argument is parenthesized so that
+ * the cast applies to the whole expression passed in. */
+#define G_LOOKUP_FUNC(func)             ((GLookupFunc)(func))
+
+typedef struct _GLruCache		GLruCache;
+typedef struct _GLruCacheClass		GLruCacheClass;
+typedef struct _GLruCachePrivate	GLruCachePrivate;
+
+/* Callback invoked on a cache miss: produces the value for @key.
+ * @user_data is the pointer given when the cache was created; a failure
+ * is reported by setting @error. */
+typedef gpointer (*GLookupFunc) (gpointer key, gpointer user_data, GError **error);
+
+/* Instance structure; all state lives behind the opaque priv pointer. */
+struct _GLruCache
+{
+	GObject parent;
+	
+	GLruCachePrivate *priv;
+};
+
+/* Class structure; adds nothing to GObjectClass. */
+struct _GLruCacheClass
+{
+	GObjectClass parent_class;
+};
+
+/* Returns the GType of GLruCache. */
+GType      g_lru_cache_get_type     (void) G_GNUC_CONST;
+
+/* Creates a cache with no key/value copy or destroy functions; forwards
+ * to g_lru_cache_new_full(). */
+GLruCache* g_lru_cache_new          (GHashFunc      key_hash_func,
+                                     GEqualFunc     key_equal_func,
+                                     GLookupFunc    retrieve_func,
+                                     gpointer       user_data,
+                                     GDestroyNotify user_destroy_func);
+
+/* Creates a cache.  The hash/equal/destroy functions configure the
+ * underlying hash table, key_copy_func (optional) copies keys before they
+ * are stored, retrieve_func is called on a miss with user_data, and
+ * user_destroy_func is applied to user_data when the cache is finalized.
+ * The implementation does not use key_type, value_type or
+ * value_copy_func. */
+GLruCache* g_lru_cache_new_full     (GType          key_type,
+                                     GCopyFunc      key_copy_func,
+                                     GDestroyNotify key_destroy_func,
+                                     GType          value_type,
+                                     GCopyFunc      value_copy_func,
+                                     GDestroyNotify value_destroy_func,
+                                     GHashFunc      key_hash_func,
+                                     GEqualFunc     key_equal_func,
+                                     GLookupFunc    retrieve_func,
+                                     gpointer       user_data,
+                                     GDestroyNotify user_destroy_func);
+
+/* Sets / returns the maximum number of entries.  Lowering the limit may
+ * evict the oldest entries. */
+void       g_lru_cache_set_max_size (GLruCache *self, guint max_size);
+guint      g_lru_cache_get_max_size (GLruCache *self);
+
+/* Returns the number of entries currently stored. */
+guint      g_lru_cache_get_size     (GLruCache *self);
+
+/* get: returns the value for key, calling retrieve_func on a miss; an
+ * error set by retrieve_func is propagated to error.
+ * evict: removes the entry for key.
+ * clear: removes every entry. */
+gpointer   g_lru_cache_get          (GLruCache *self, gpointer key, GError **error);
+void       g_lru_cache_evict        (GLruCache *self, gpointer key);
+void       g_lru_cache_clear        (GLruCache *self);
+
+/* fast_get: when TRUE, a cache hit does not move the entry to the head of
+ * the list. */
+gboolean   g_lru_cache_get_fast_get (GLruCache *self);
+void       g_lru_cache_set_fast_get (GLruCache *self, gboolean fast_get);
+
+G_END_DECLS
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __G_LRU_CACHE_H__ */
--- a/src/devices/video/glextensions.c
+++ b/src/devices/video/glextensions.c
@ -0,0 +1,46 @@
+/*
+ * QEMU OpenGL extensions
+ *
+ * Copyright (c) 2015 espes
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "gloffscreen.h"
+#include "glextensions.h"
+
+/* Function pointers for GL extension entry points.  They are only defined
+ * when __APPLE__ is set and are filled in by glextensions_init(). */
+#ifdef __APPLE__
+void (*glFrameTerminatorGREMEDY)(void);
+
+void (*glDebugMessageInsert) (GLenum source, GLenum type, GLuint id,
+                              GLenum severity, GLsizei length,
+                              const GLchar *buf);
+void (*glPushDebugGroup)(GLenum source, GLuint id, GLsizei length,
+                         const GLchar *message);
+void (*glPopDebugGroup)(void);
+void (*glObjectLabel)(GLenum identifier, GLuint name, GLsizei length,
+                      const GLchar *label);
+
+#endif
+
+/*
+ * Resolves the GL extension entry points by name through
+ * glo_get_extension_proc() and stores them in the function pointers
+ * above.  Does nothing unless __APPLE__ is defined.
+ *
+ * NOTE(review): the results are stored unchecked; presumably a pointer is
+ * left NULL when the extension is unavailable - confirm against
+ * glo_get_extension_proc().
+ */
+void glextensions_init(void)
+{
+#ifdef __APPLE__
+    glFrameTerminatorGREMEDY =
+        glo_get_extension_proc("glFrameTerminatorGREMEDY");
+    glDebugMessageInsert = glo_get_extension_proc("glDebugMessageInsert");
+    glPushDebugGroup = glo_get_extension_proc("glPushDebugGroup");
+    glPopDebugGroup = glo_get_extension_proc("glPopDebugGroup");
+    glObjectLabel = glo_get_extension_proc("glObjectLabel");
+#endif
+}
--- a/src/devices/video/glextensions.h
+++ b/src/devices/video/glextensions.h
@ -0,0 +1,54 @@
+/*
+ * QEMU OpenGL extensions
+ *
+ * Copyright (c) 2015 espes
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef GLEXTEENSIONS_H_
+#define GLEXTEENSIONS_H_
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* When __APPLE__ is defined, this header declares the extension function
+ * pointers defined in glextensions.c together with the debug-related enum
+ * values used with them. */
+#ifdef __APPLE__
+#include "gloffscreen.h"
+extern void (*glFrameTerminatorGREMEDY)(void);
+
+#define GL_DEBUG_SOURCE_APPLICATION       0x824A
+#define GL_DEBUG_TYPE_MARKER              0x8268
+#define GL_DEBUG_SEVERITY_NOTIFICATION    0x826B
+#define GL_DEBUG_OUTPUT                   0x92E0
+
+extern void (*glDebugMessageInsert) (GLenum source, GLenum type, GLuint id,
+                                     GLenum severity, GLsizei length,
+                                     const GLchar *buf);
+extern void (*glPushDebugGroup)(GLenum source, GLuint id, GLsizei length,
+                                const GLchar *message);
+extern void (*glPopDebugGroup)(void);
+extern void (*glObjectLabel)(GLenum identifier, GLuint name, GLsizei length,
+                             const GLchar *label);
+
+#endif
+
+/* Fills in the extension function pointers declared above; a no-op unless
+ * __APPLE__ is defined. */
+void glextensions_init(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/src/devices/video/gloffscreen.c
+++ b/src/devices/video/gloffscreen.c
@ -0,0 +1,193 @@
+/*
+ *  Offscreen OpenGL abstraction layer - Common utilities
+ *
+ *  Copyright (c) 2010 Intel
+ *  Written by:
+ *    Gordon Williams <gordon.williams@collabora.co.uk>
+ *    Ian Molton <ian.molton@collabora.co.uk>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include "gloffscreen.h"
+
+
+/*
+ * Read a width x height rectangle starting at window position (0,0) into
+ * `data`, storing the rows top-down (glReadPixels itself delivers them
+ * bottom-up).
+ *
+ *   gl_format, gl_type  - passed straight to glReadPixels()
+ *   bytes_per_pixel     - size of one pixel in `data`
+ *   stride              - distance in bytes between consecutive rows in
+ *                         `data`; must be a multiple of bytes_per_pixel
+ *   width, height       - size of the rectangle in pixels
+ *   data                - destination buffer, at least stride * height bytes
+ *
+ * GL_PACK_ROW_LENGTH and GL_PACK_ALIGNMENT are saved on entry and restored
+ * before returning.
+ */
+void glo_readpixels(GLenum gl_format, GLenum gl_type,
+                    unsigned int bytes_per_pixel, unsigned int stride,
+                    unsigned int width, unsigned int height, void *data)
+{
+    /* TODO: weird strides */
+    assert(stride % bytes_per_pixel == 0);
+
+    /* Save the caller's GL pack state before we ReadPixels() */
+    int rl, pa;
+    glGetIntegerv(GL_PACK_ROW_LENGTH, &rl);
+    glGetIntegerv(GL_PACK_ALIGNMENT, &pa);
+    glPixelStorei(GL_PACK_ROW_LENGTH, stride / bytes_per_pixel);
+    glPixelStorei(GL_PACK_ALIGNMENT, 1);
+
+#ifdef GETCONTENTS_INDIVIDUAL
+    GLubyte *b = (GLubyte *) data;
+    int irow;
+
+    for (irow = height - 1; irow >= 0; irow--) {
+        glReadPixels(0, irow, width, 1, gl_format, gl_type, b);
+        b += stride;
+    }
+#else
+    /* Faster buffer flip: read everything at once, then swap rows in place */
+    const size_t row_bytes = (size_t) width * bytes_per_pixel;
+    GLubyte *tmp = (GLubyte *) malloc(row_bytes);
+
+    if (tmp != NULL) {
+        glReadPixels(0, 0, width, height, gl_format, gl_type, data);
+
+        /* Only form the last-row pointer when there is a row to swap with;
+         * height == 0 would otherwise wrap the unsigned subtraction. */
+        if (height > 1) {
+            GLubyte *b = (GLubyte *) data;
+            GLubyte *c = b + (size_t) stride * (height - 1);
+            unsigned int irow;
+
+            for (irow = 0; irow < height / 2; irow++) {
+                memcpy(tmp, b, row_bytes);
+                memcpy(b, c, row_bytes);
+                memcpy(c, tmp, row_bytes);
+                b += stride;
+                c -= stride;
+            }
+        }
+        free(tmp);
+    } else {
+        /* No scratch row available: fall back to reading one row at a time,
+         * bottom row of the framebuffer last, which needs no temporary. */
+        GLubyte *b = (GLubyte *) data;
+        unsigned int irow = height;
+
+        while (irow-- > 0) {
+            glReadPixels(0, (GLint) irow, width, 1, gl_format, gl_type, b);
+            b += stride;
+        }
+    }
+#endif
+
+    /* Restore GL state */
+    glPixelStorei(GL_PACK_ROW_LENGTH, rl);
+    glPixelStorei(GL_PACK_ALIGNMENT, pa);
+}
+
+
+/*
+ * Report whether the current GL context advertises the extension `ext_name`.
+ *
+ * The extension count is queried with glGetIntegerv(GL_NUM_EXTENSIONS) and
+ * each name is fetched with glGetStringi(); names are compared exactly.
+ * Returns true on a match, false otherwise. Requires a current context.
+ */
+bool glo_check_extension(const char* ext_name)
+{
+    GLint num_extensions = 0;
+    GLint i;
+
+    /* GL_NUM_EXTENSIONS is a query token, not the count itself. Iterating up
+     * to the token's numeric value would index past the last extension and
+     * leave GL_INVALID_VALUE in the GL error state. */
+    glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions);
+
+    for (i = 0; i < num_extensions; i++) {
+        const char *ext = (const char *) glGetStringi(GL_EXTENSIONS, (GLuint) i);
+        if (!ext) break;
+        if (strcmp(ext, ext_name) == 0) return true;
+    }
+    return false;
+}
+
+
+
+
+
+
+
+
+
+
+/*
+ *  Offscreen OpenGL abstraction layer - CGL (Apple) specific
+ *
+ *  Copyright (c) 2013 Wayo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//#include <stdlib.h>
+//#include <stdio.h>
+//#include <string.h>
+//#include <dlfcn.h>
+//
+//
+//#include <OpenGL/OpenGL.h>
+//#include <OpenGL/CGLTypes.h>
+//#include <OpenGL/CGLCurrent.h>
+//
+//#include "gloffscreen.h"
+//
+//struct _GloContext {
+//  CGLContextObj     cglContext;
+//};
+//
+///* Create an OpenGL context for a certain pixel format. formatflags are from 
+// * the GLO_ constants */
+//GloContext *glo_context_create(void)
+//{
+//    CGLError err;
+//
+//    GloContext *context = (GloContext *)malloc(sizeof(GloContext));
+//
+//    /* pixel format attributes */
+//    CGLPixelFormatAttribute attributes[] = {
+//        kCGLPFAAccelerated,
+//        kCGLPFAOpenGLProfile,
+//        (CGLPixelFormatAttribute)kCGLOGLPVersion_GL3_Core,
+//        (CGLPixelFormatAttribute)0
+//    };
+//
+//    CGLPixelFormatObj pix;
+//    GLint num;
+//    err = CGLChoosePixelFormat(attributes, &pix, &num);
+//    if (err) return NULL;
+//
+//    err = CGLCreateContext(pix, NULL, &context->cglContext);
+//    if (err) return NULL;
+//
+//    CGLDestroyPixelFormat(pix);
+//
+//    glo_set_current(context);
+//
+//    return context;
+//}
+//
+//void* glo_get_extension_proc(const char* ext_proc)
+//{
+//    return dlsym(RTLD_NEXT, ext_proc);
+//}
+//
+///* Set current context */
+//void glo_set_current(GloContext *context)
+//{
+//    if (context == NULL) {
+//        CGLSetCurrentContext(NULL);
+//    } else {
+//        CGLSetCurrentContext(context->cglContext);
+//    }
+//}
+//
+///* Destroy a previously created OpenGL context */
+//void glo_context_destroy(GloContext *context)
+//{
+//    if (!context) return;
+//    glo_set_current(NULL);
+//    CGLDestroyContext(context->cglContext);
+//}
--- a/src/devices/video/gloffscreen.h
+++ b/src/devices/video/gloffscreen.h
@ -0,0 +1,76 @@
+/*
+ *  Offscreen OpenGL abstraction layer
+ *
+ *  Copyright (c) 2010 Intel
+ *  Written by:
+ *    Gordon Williams <gordon.williams@collabora.co.uk>
+ *    Ian Molton <ian.molton@collabora.co.uk>
+ *  Copyright (c) 2013 Wayo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef GLOFFSCREEN_H_
+#define GLOFFSCREEN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+
+// #ifdef __APPLE__
+// #include <OpenGL/gl3.h>
+// #include <OpenGL/glext.h>
+// #else
+// #include <GL/glew.h>
+// #include <GL/gl.h>
+// #endif
+
+//#include <SDL.h>
+#include <GL/glew.h>
+
+/* Opaque handle holding the platform-specific data of an OpenGL context */
+struct _GloContext;
+typedef struct _GloContext GloContext;
+
+/* Make `context` the current OpenGL context */
+void glo_set_current(GloContext *context);
+
+/* Check GL Extensions.
+ * glo_check_extension() returns true when the current context lists an
+ * extension whose name equals ext_name exactly.
+ * glo_get_extension_proc() looks up an entry point by name; the result is
+ * assigned to function pointers and tested for NULL by its callers. */
+bool glo_check_extension(const char* ext_name);
+void* glo_get_extension_proc(const char* extProc);
+
+/* Create an OpenGL context */
+GloContext *glo_context_create(void);
+
+/* Destroy a previously created OpenGL context */
+void glo_context_destroy(GloContext *context);
+
+ /* Read back a width x height pixel rectangle into `data`.
+  * `stride` is the distance between rows of `data` in bytes and must be a
+  * multiple of bytes_per_pixel.
+  * Note that this is top-down, not bottom-up as glReadPixels would do. */
+void glo_readpixels(GLenum gl_format, GLenum gl_type,
+                    unsigned int bytes_per_pixel, unsigned int stride,
+                    unsigned int width, unsigned int height, void *data);
+
+
+#ifdef __cplusplus
+}
+#endif
+ 
+#endif /* GLOFFSCREEN_H_ */
--- a/src/devices/video/nv2a.cpp
+++ b/src/devices/video/nv2a.cpp
@ -67,13 +67,34 @@ namespace xboxkrnl
 #include "vga.h"
 #include "nv2a.h" // For NV2AState
 #include "nv2a_int.h" // from https://github.com/espes/xqemu/tree/xbox/hw/xbox
-
 //#include <gl\glew.h>
 #include <gl\GL.h>
 #include <gl\GLU.h>
 #include <cassert>
 //#include <gl\glut.h>

+// glib types
+// Local stand-ins carrying the glib type names, so that no glib header is
+// needed. NOTE(review): presumably required by code ported from xqemu --
+// confirm which of these are still referenced.
+typedef char gchar;
+typedef int gint;
+typedef unsigned int guint;
+typedef unsigned int guint32;
+typedef const void *gconstpointer;
+typedef gint   gboolean;
+typedef void* gpointer;
+
+typedef guint32 GQuark;
+
+typedef struct _GError GError;
+
+// Error record: a domain identifier, a numeric code and a message string.
+struct _GError
+{
+	GQuark       domain;
+	gint         code;
+	gchar       *message;
+};
+
+#include "glextensions.h" // for glextensions_init
+

 static void update_irq(NV2AState *d)
 {
@ -151,7 +172,12 @@ static inline uint32_t ldl_le_p(const void *p)
 	return *(uint32_t*)p;
 }

-static inline void stl_le_p(uint32_t *p, uint32 v)
+// Store the 64-bit value v at *p.
+// NOTE(review): this is a plain host-order store; the "_le_" in the name is
+// only honoured on a little-endian host -- confirm that is the only target.
+static inline void stq_le_p(uint64_t *p, uint64_t v)
+{
+	*p = v;
+}
+
+static inline void stl_le_p(uint32_t *p, uint32_t v)
 {
 	*p = v;
 }
@ -618,6 +644,7 @@ void CxbxReserveNV2AMemory(NV2AState *d)
 NV2ADevice::NV2ADevice()
 {
 	m_nv2a_state = new NV2AState();
+	m_nv2a_state->pgraph.opengl_enabled = bLLE_GPU;
 	pgraph_init(m_nv2a_state);
 }

--- a/src/devices/video/nv2a.h
+++ b/src/devices/video/nv2a.h
@ -46,6 +46,9 @@

 #include "swizzle.h"
 #include "nv2a_int.h"
+#include "nv2a_debug.h" // For HWADDR_PRIx, NV2A_DPRINTF, NV2A_GL_DPRINTF, etc.
+#include "gloffscreen.h" // For glo_readpixels
+#include "nv2a_shaders.h" // For ShaderBinding

 #define NV2A_ADDR  0xFD000000
 #define NV2A_SIZE             0x01000000
@ -61,19 +64,33 @@

 typedef xbaddr hwaddr; // Compatibility; Cxbx uses xbaddr, xqemu and OpenXbox use hwaddr 
 typedef uint32_t value_t; // Compatibility; Cxbx values are uint32_t (xqemu and OpenXbox use uint64_t)
-#define NV2A_DPRINTF(...) printf("[0x????] NV2A: " ## __VA_ARGS__) // Compatibility; TODO : Replace this by something equivalent
-#define NV2A_GL_DPRINTF EmuWarning // Compatibility; TODO : Replace this by something equivalent
+
+// ARRAY_SIZE(a) yields the number of elements of the array `a` as a
+// compile-time constant of type size_t.
+#ifdef __cplusplus
+// C++ variant: the helper is only declared for real array references, so
+// passing a pointer fails to compile instead of giving a wrong count.
+template <size_t N> struct ArraySizeHelper { char _[N]; };
+template <typename T, size_t N>
+ArraySizeHelper<N> makeArraySizeHelper(T(&)[N]);
+#  define ARRAY_SIZE(a)  sizeof(makeArraySizeHelper(a))
+#else
+// The expression ARRAY_SIZE(a) is a compile-time constant of type
+// size_t which represents the number of elements of the given
+// array. You should only use ARRAY_SIZE on statically allocated
+// arrays.
+// The second division is by 1 for a real array and by 0 (a compile-time
+// error) when sizeof(a) is not a multiple of the element size, which catches
+// some pointer arguments. A C cast is used here: static_cast is C++-only
+// and must not appear in the C branch.
+
+#define ARRAY_SIZE(a)                               \
+  ((sizeof(a) / sizeof(*(a))) /                     \
+  (size_t)(!(sizeof(a) % sizeof(*(a)))))
+#endif
+
 #define VSH_TOKEN_SIZE 4 // Compatibility; TODO : Move this to nv2a_vsh.h
 #define MAX(a,b) ((a)>(b) ? (a) : (b)) // Compatibility
 #define MIN(a,b) ((a)<(b) ? (a) : (b)) // Compatibility
-#undef COMPILE_OPENGL // Compatibility; define this to include all OpenGL calls
-#define HWADDR_PRIx "p" // Compatibility
+
 #define g_free(x) free(x) // Compatibility
 #define g_malloc(x) malloc(x) // Compatibility
 #define g_malloc0(x) calloc(1, x) // Compatibility
 #define g_realloc(x, y) realloc(x, y) // Compatibility

-#define USE_TEXTURE_CACHE
+#undef USE_TEXTURE_CACHE

 // Public Domain ffs Implementation
 // See: http://snipplr.com/view/22147/stringsh-implementation/
@ -301,6 +318,7 @@ typedef struct GraphicsContext {


 typedef struct PGRAPHState {
+	bool opengl_enabled; // == bLLE_GPU
 	std::mutex pgraph_lock;

 	uint32_t pending_interrupts;
@ -333,12 +351,14 @@ typedef struct PGRAPHState {
 	SurfaceShape last_surface_shape;

 	xbaddr dma_a, dma_b;
-	//GLruCache *texture_cache;
+#ifdef USE_TEXTURE_CACHE
+	GLruCache *texture_cache;
+#endif
 	bool texture_dirty[NV2A_MAX_TEXTURES];
 	TextureBinding *texture_binding[NV2A_MAX_TEXTURES];

 	//GHashTable *shader_cache;
-	//ShaderBinding *shader_binding;
+	ShaderBinding *shader_binding;

 	bool texture_matrix_enable[NV2A_MAX_TEXTURES];

--- a/src/devices/video/nv2a_debug.c
+++ b/src/devices/video/nv2a_debug.c
@ -0,0 +1,94 @@
+/*
+ * QEMU Geforce NV2A debug helpers
+ *
+ * Copyright (c) 2015 Jannik Vogel
+ * Copyright (c) 2012 espes
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "nv2a_debug.h"
+
+#ifdef DEBUG_NV2A_GL
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#include "gl/glextensions.h"
+
+/*
+ * Format a message printf-style and insert it into the GL debug stream as an
+ * application marker (when glDebugMessageInsert is available). When `cc` is
+ * true the message is additionally written to stdout followed by a newline.
+ *
+ * Messages longer than the 1024-byte buffer are truncated; a formatting
+ * error yields an empty message.
+ */
+void gl_debug_message(bool cc, const char *fmt, ...)
+{
+    char buffer[1024];
+    int len;
+    va_list ap;
+
+    va_start(ap, fmt);
+    len = vsnprintf(buffer, sizeof(buffer), fmt, ap);
+    va_end(ap);
+
+    /* vsnprintf returns the length the full output would have had, or a
+     * negative value on error; clamp to what is really in the buffer so the
+     * length handed on below never exceeds the stored text. */
+    if (len < 0) {
+        buffer[0] = '\0';
+        len = 0;
+    } else if ((size_t) len >= sizeof(buffer)) {
+        len = (int) sizeof(buffer) - 1;
+    }
+
+    if(glDebugMessageInsert) {
+        glDebugMessageInsert(GL_DEBUG_SOURCE_APPLICATION, GL_DEBUG_TYPE_MARKER,
+                             0, GL_DEBUG_SEVERITY_NOTIFICATION, len, buffer);
+    }
+    if (cc) {
+        fwrite(buffer, sizeof(char), (size_t) len, stdout);
+        fputc('\n', stdout);
+    }
+}
+
+/*
+ * Open a GL debug group whose label is formatted printf-style.
+ *
+ * Asserts that no GL error is pending before the group is entered. Labels
+ * longer than the 1024-byte buffer are truncated; a formatting error yields
+ * an empty label so that the push still happens and stays balanced with
+ * gl_debug_group_end().
+ */
+void gl_debug_group_begin(const char *fmt, ...)
+{
+    char buffer[1024];
+    int len;
+    va_list ap;
+
+    va_start(ap, fmt);
+    len = vsnprintf(buffer, sizeof(buffer), fmt, ap);
+    va_end(ap);
+
+    /* Clamp to the text actually stored (vsnprintf reports the untruncated
+     * length, or a negative value on error). */
+    if (len < 0) {
+        buffer[0] = '\0';
+        len = 0;
+    } else if ((size_t) len >= sizeof(buffer)) {
+        len = (int) sizeof(buffer) - 1;
+    }
+
+    /* Check for errors before entering group */
+    assert(glGetError() == GL_NO_ERROR);
+
+    if (glPushDebugGroup) {
+        glPushDebugGroup(GL_DEBUG_SOURCE_APPLICATION, 0, len, buffer);
+    }
+}
+
+/*
+ * Close the debug group opened by gl_debug_group_begin(). Asserts that the
+ * GL calls made inside the group left no error pending.
+ */
+void gl_debug_group_end(void)
+{
+    /* Check for errors when leaving group */
+    assert(glGetError() == GL_NO_ERROR);
+
+    /* The entry point is optional; without it there is nothing to pop. */
+    if (!glPopDebugGroup) {
+        return;
+    }
+    glPopDebugGroup();
+}
+
+/*
+ * Attach a printf-style formatted label to the GL object `name` of kind
+ * `target` (when glObjectLabel is available).
+ *
+ * Labels longer than the 1024-byte buffer are truncated; a formatting error
+ * yields an empty label.
+ */
+void gl_debug_label(GLenum target, GLuint name, const char *fmt, ...)
+{
+    char buffer[1024];
+    int len;
+    va_list ap;
+
+    va_start(ap, fmt);
+    len = vsnprintf(buffer, sizeof(buffer), fmt, ap);
+    va_end(ap);
+
+    /* Clamp to the text actually stored (vsnprintf reports the untruncated
+     * length, or a negative value on error). */
+    if (len < 0) {
+        buffer[0] = '\0';
+        len = 0;
+    } else if ((size_t) len >= sizeof(buffer)) {
+        len = (int) sizeof(buffer) - 1;
+    }
+
+    if (glObjectLabel) {
+        glObjectLabel(target, name, len, buffer);
+    }
+}
+
+#endif
--- a/src/devices/video/nv2a_debug.h
+++ b/src/devices/video/nv2a_debug.h
@ -0,0 +1,62 @@
+/*
+ * QEMU Geforce NV2A debug helpers
+ *
+ * Copyright (c) 2015 Jannik Vogel
+ * Copyright (c) 2012 espes
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_NV2A_DEBUG_H
+#define HW_NV2A_DEBUG_H
+
+/* printf conversion used when formatting hwaddr values in hexadecimal.
+ * NOTE(review): "x" expects an unsigned int argument -- confirm that this
+ * matches the width of hwaddr in this code base. */
+#define HWADDR_PRIx "x"
+
+/* DEBUG_NV2A is defined unconditionally here, so NV2A_DPRINTF always expands
+ * to the printf variant; remove the define to compile the traces out.
+ * `format` must be a string literal (it is concatenated with the prefix). */
+#define DEBUG_NV2A
+#ifdef DEBUG_NV2A
+# define NV2A_DPRINTF(format, ...)       printf("[0x????] NV2A: " format, ## __VA_ARGS__)
+#else
+# define NV2A_DPRINTF(format, ...)       do { } while (0)
+#endif
+
+/* Uncomment the define below to route the NV2A_GL_* macros to the gl_debug_*
+ * helpers; it is disabled by default. */
+// #define DEBUG_NV2A_GL
+#ifdef DEBUG_NV2A_GL
+
+#include <stdbool.h>
+// #include "gl/gloffscreen.h"
+
+/* NOTE(review): the prototypes below use GLenum/GLuint, so a GL header must
+ * already have been included before this file when DEBUG_NV2A_GL is on. */
+void gl_debug_message(bool cc, const char *fmt, ...);
+void gl_debug_group_begin(const char *fmt, ...);
+void gl_debug_group_end(void);
+void gl_debug_label(GLenum target, GLuint name, const char *fmt, ...);
+
+/* Each macro prepends "nv2a: " to the (string literal) format and forwards
+ * to the matching helper. */
+# define NV2A_GL_DPRINTF(cc, format, ...) \
+    gl_debug_message(cc, "nv2a: " format, ## __VA_ARGS__)
+# define NV2A_GL_DGROUP_BEGIN(format, ...) \
+    gl_debug_group_begin("nv2a: " format, ## __VA_ARGS__)
+# define NV2A_GL_DGROUP_END() \
+    gl_debug_group_end()
+# define NV2A_GL_DLABEL(target, name, format, ...)  \
+    gl_debug_label(target, name, "nv2a: { " format " }", ## __VA_ARGS__)
+
+#else
+/* GL debugging disabled: NV2A_GL_DPRINTF still prints through NV2A_DPRINTF
+ * when `cc` is true; groups and labels expand to nothing. */
+# define NV2A_GL_DPRINTF(cc, format, ...)          do { \
+        if (cc) NV2A_DPRINTF(format "\n", ##__VA_ARGS__ ); \
+    } while (0)
+# define NV2A_GL_DGROUP_BEGIN(format, ...)         do { } while (0)
+# define NV2A_GL_DGROUP_END()                      do { } while (0)
+# define NV2A_GL_DLABEL(target, name, format, ...) do { } while (0)
+#endif
+
+#endif
--- a/src/devices/video/nv2a_psh.cpp
+++ b/src/devices/video/nv2a_psh.cpp
@ -0,0 +1,865 @@
+/*
+ * QEMU Geforce NV2A pixel shader translation
+ *
+ * Copyright (c) 2013 espes
+ * Copyright (c) 2015 Jannik Vogel
+ *
+ * Based on:
+ * Cxbx, PixelShader.cpp
+ * Copyright (c) 2004 Aaron Robinson <caustik@caustik.com>
+ *                    Kingofc <kingofc@freenet.de>
+ * Xeon, XBD3DPixelShader.cpp
+ * Copyright (c) 2003 _SF_
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <stdint.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#include "nv2a_shaders_common.h"
+#include "nv2a_psh.h"
+
+// Thin replacements for the QEMU QString helpers, backed by std::string.
+
+// Append printf-style formatted text to *str. The required length is measured
+// first, so long expressions are appended in full instead of being cut off at
+// a fixed buffer size. A formatting error appends nothing.
+static void qstring_append_fmt_impl(std::string *str, const char *fmt, ...)
+{
+    va_list args;
+    va_list measure;
+
+    va_start(args, fmt);
+    va_copy(measure, args);
+    const int len = vsnprintf(NULL, 0, fmt, measure);
+    va_end(measure);
+
+    if (len > 0) {
+        // One extra byte for the terminator vsnprintf always writes.
+        std::string buf((size_t)len + 1, '\0');
+        vsnprintf(&buf[0], buf.size(), fmt, args);
+        str->append(buf.c_str(), (size_t)len);
+    }
+    va_end(args);
+}
+
+// qstring_append_fmt(str, fmt, ...): append formatted text to the
+// std::string that `str` points to.
+#define qstring_append_fmt(str, ...) \
+    qstring_append_fmt_impl((str), __VA_ARGS__)
+
+// qstring_get_str(str): C string view of the std::string `str` points to.
+#define qstring_get_str(str) ((str)->c_str())
+
+// Build a new heap-allocated std::string from a printf-style format.
+// The caller owns the result and must delete it.
+// The needed length is measured first so that long generated expressions are
+// not silently truncated; a formatting error yields an empty string.
+static std::string *qstring_from_fmt(const char *fmt, ...)
+{
+    va_list args;
+    va_list measure;
+
+    va_start(args, fmt);
+    va_copy(measure, args);
+    const int len = vsnprintf(NULL, 0, fmt, measure);
+    va_end(measure);
+
+    std::string *res = new std::string();
+    if (len > 0) {
+        // One extra byte for the terminator vsnprintf always writes.
+        res->resize((size_t)len + 1);
+        vsnprintf(&(*res)[0], res->size(), fmt, args);
+        res->resize((size_t)len);
+    }
+    va_end(args);
+    return res;
+}
+
+/*
+ * This implements translation of register combiners into glsl
+ * fragment shaders, but all terminology is in terms of Xbox DirectX
+ * pixel shaders, since I wanted to be lazy while referencing existing
+ * work / stealing code.
+ *
+ * For some background, see the OpenGL extension:
+ * https://www.opengl.org/registry/specs/NV/register_combiners.txt
+ */
+
+// Texture modes selectable per texture stage. The column on the right marks,
+// for stages 0..3, where a mode is valid ('*') or not ('-').
+enum PS_TEXTUREMODES
+{                                 // valid in stage 0 1 2 3
+    PS_TEXTUREMODES_NONE=                 0x00L, // * * * *
+    PS_TEXTUREMODES_PROJECT2D=            0x01L, // * * * *
+    PS_TEXTUREMODES_PROJECT3D=            0x02L, // * * * *
+    PS_TEXTUREMODES_CUBEMAP=              0x03L, // * * * *
+    PS_TEXTUREMODES_PASSTHRU=             0x04L, // * * * *
+    PS_TEXTUREMODES_CLIPPLANE=            0x05L, // * * * *
+    PS_TEXTUREMODES_BUMPENVMAP=           0x06L, // - * * *
+    PS_TEXTUREMODES_BUMPENVMAP_LUM=       0x07L, // - * * *
+    PS_TEXTUREMODES_BRDF=                 0x08L, // - - * *
+    PS_TEXTUREMODES_DOT_ST=               0x09L, // - - * *
+    PS_TEXTUREMODES_DOT_ZW=               0x0aL, // - - * *
+    PS_TEXTUREMODES_DOT_RFLCT_DIFF=       0x0bL, // - - * -
+    PS_TEXTUREMODES_DOT_RFLCT_SPEC=       0x0cL, // - - - *
+    PS_TEXTUREMODES_DOT_STR_3D=           0x0dL, // - - - *
+    PS_TEXTUREMODES_DOT_STR_CUBE=         0x0eL, // - - - *
+    PS_TEXTUREMODES_DPNDNT_AR=            0x0fL, // - * * *
+    PS_TEXTUREMODES_DPNDNT_GB=            0x10L, // - * * *
+    PS_TEXTUREMODES_DOTPRODUCT=           0x11L, // - * * -
+    PS_TEXTUREMODES_DOT_RFLCT_SPEC_CONST= 0x12L, // - - - *
+    // 0x13-0x1f reserved
+};
+
+// Mapping applied to a combiner input before it is used (see
+// get_input_var()). The values occupy bits 5-7, so they can be OR-ed with a
+// PS_REGISTER value. The formulas on the right describe the mapping;
+// NOTE(review): get_input_var() currently emits them without the max(0,x)
+// clamp.
+enum PS_INPUTMAPPING
+{
+    PS_INPUTMAPPING_UNSIGNED_IDENTITY= 0x00L, // max(0,x)         OK for final combiner
+    PS_INPUTMAPPING_UNSIGNED_INVERT=   0x20L, // 1 - max(0,x)     OK for final combiner
+    PS_INPUTMAPPING_EXPAND_NORMAL=     0x40L, // 2*max(0,x) - 1   invalid for final combiner
+    PS_INPUTMAPPING_EXPAND_NEGATE=     0x60L, // 1 - 2*max(0,x)   invalid for final combiner
+    PS_INPUTMAPPING_HALFBIAS_NORMAL=   0x80L, // max(0,x) - 1/2   invalid for final combiner
+    PS_INPUTMAPPING_HALFBIAS_NEGATE=   0xa0L, // 1/2 - max(0,x)   invalid for final combiner
+    PS_INPUTMAPPING_SIGNED_IDENTITY=   0xc0L, // x                invalid for final combiner
+    PS_INPUTMAPPING_SIGNED_NEGATE=     0xe0L, // -x               invalid for final combiner
+};
+
+// Register selectors understood by get_var(); 'r' / 'w' mark whether a
+// register may be read and/or written. ZERO and DISCARD share value 0: it
+// reads as zero and discards writes. The last four entries are constants
+// built by combining ZERO with an input mapping.
+enum PS_REGISTER
+{
+    PS_REGISTER_ZERO=              0x00L, // r
+    PS_REGISTER_DISCARD=           0x00L, // w
+    PS_REGISTER_C0=                0x01L, // r
+    PS_REGISTER_C1=                0x02L, // r
+    PS_REGISTER_FOG=               0x03L, // r
+    PS_REGISTER_V0=                0x04L, // r/w
+    PS_REGISTER_V1=                0x05L, // r/w
+    PS_REGISTER_T0=                0x08L, // r/w
+    PS_REGISTER_T1=                0x09L, // r/w
+    PS_REGISTER_T2=                0x0aL, // r/w
+    PS_REGISTER_T3=                0x0bL, // r/w
+    PS_REGISTER_R0=                0x0cL, // r/w
+    PS_REGISTER_R1=                0x0dL, // r/w
+    PS_REGISTER_V1R0_SUM=          0x0eL, // r
+    PS_REGISTER_EF_PROD=           0x0fL, // r
+
+    PS_REGISTER_ONE=               PS_REGISTER_ZERO | PS_INPUTMAPPING_UNSIGNED_INVERT, // OK for final combiner
+    PS_REGISTER_NEGATIVE_ONE=      PS_REGISTER_ZERO | PS_INPUTMAPPING_EXPAND_NORMAL,   // invalid for final combiner
+    PS_REGISTER_ONE_HALF=          PS_REGISTER_ZERO | PS_INPUTMAPPING_HALFBIAS_NEGATE, // invalid for final combiner
+    PS_REGISTER_NEGATIVE_ONE_HALF= PS_REGISTER_ZERO | PS_INPUTMAPPING_HALFBIAS_NORMAL, // invalid for final combiner
+};
+
+// Flag bits tested against PixelShader::flags. get_var() uses the UNIQUE_C0 /
+// UNIQUE_C1 bits to decide whether a stage reads its own constant or the
+// shared stage-0 one.
+enum PS_COMBINERCOUNTFLAGS
+{
+    PS_COMBINERCOUNT_MUX_LSB=     0x0000L, // mux on r0.a lsb
+    PS_COMBINERCOUNT_MUX_MSB=     0x0001L, // mux on r0.a msb
+
+    PS_COMBINERCOUNT_SAME_C0=     0x0000L, // c0 same in each stage
+    PS_COMBINERCOUNT_UNIQUE_C0=   0x0010L, // c0 unique in each stage
+
+    PS_COMBINERCOUNT_SAME_C1=     0x0000L, // c1 same in each stage
+    PS_COMBINERCOUNT_UNIQUE_C1=   0x0100L  // c1 unique in each stage
+};
+
+// Output configuration of a combiner stage. The first group (IDENTITY ..
+// SHIFTRIGHT_1) is the output mapping handled by get_output(); the remaining
+// values are independent flag bits selecting blue-to-alpha copies, the AB / CD
+// operation (multiply or dot product) and the third output (sum or mux).
+enum PS_COMBINEROUTPUT
+{
+    PS_COMBINEROUTPUT_IDENTITY=            0x00L, // y = x
+    PS_COMBINEROUTPUT_BIAS=                0x08L, // y = x - 0.5
+    PS_COMBINEROUTPUT_SHIFTLEFT_1=         0x10L, // y = x*2
+    PS_COMBINEROUTPUT_SHIFTLEFT_1_BIAS=    0x18L, // y = (x - 0.5)*2
+    PS_COMBINEROUTPUT_SHIFTLEFT_2=         0x20L, // y = x*4
+    PS_COMBINEROUTPUT_SHIFTRIGHT_1=        0x30L, // y = x/2
+
+    PS_COMBINEROUTPUT_AB_BLUE_TO_ALPHA=    0x80L, // RGB only
+
+    PS_COMBINEROUTPUT_CD_BLUE_TO_ALPHA=    0x40L, // RGB only
+
+    PS_COMBINEROUTPUT_AB_MULTIPLY=         0x00L,
+    PS_COMBINEROUTPUT_AB_DOT_PRODUCT=      0x02L, // RGB only
+
+    PS_COMBINEROUTPUT_CD_MULTIPLY=         0x00L,
+    PS_COMBINEROUTPUT_CD_DOT_PRODUCT=      0x01L, // RGB only
+
+    PS_COMBINEROUTPUT_AB_CD_SUM=           0x00L, // 3rd output is AB+CD
+    PS_COMBINEROUTPUT_AB_CD_MUX=           0x04L, // 3rd output is MUX(AB,CD) based on R0.a
+};
+
+// Channel selector of a combiner input. Value 0 has two names because its
+// meaning depends on the combiner half: get_input_var() emits ".rgb" for it
+// in the RGB half and ".b" in the alpha half; ALPHA always selects ".a".
+enum PS_CHANNEL
+{
+    PS_CHANNEL_RGB=   0x00, // used as RGB source
+    PS_CHANNEL_BLUE=  0x00, // used as ALPHA source
+    PS_CHANNEL_ALPHA= 0x10, // used as RGB or ALPHA source
+};
+
+
+// Flag bits configuring the final combiner's V1+R0 sum input.
+// NOTE(review): presumably decoded into FCInputInfo::clamp_sum / inv_v1 /
+// inv_r0 -- the decoding is not visible in this part of the file.
+enum PS_FINALCOMBINERSETTING
+{
+    PS_FINALCOMBINERSETTING_CLAMP_SUM=     0x80, // V1+R0 sum clamped to [0,1]
+
+    PS_FINALCOMBINERSETTING_COMPLEMENT_V1= 0x40, // unsigned invert mapping
+
+    PS_FINALCOMBINERSETTING_COMPLEMENT_R0= 0x20, // unsigned invert mapping
+};
+
+
+
+// Structures to describe the PS definition
+
+// One decoded combiner input.
+struct InputInfo {
+    // reg:  PS_REGISTER selector (passed to get_var())
+    // mod:  PS_INPUTMAPPING applied to the value (see get_input_var())
+    // chan: PS_CHANNEL selecting the component(s) read
+    int reg, mod, chan;
+    bool invert; // NOTE(review): not read by the code visible here
+};
+
+// The four inputs A, B, C, D of one combiner half (RGB or alpha);
+// add_stage_code() combines them as A*B (or dot) and C*D (or dot).
+struct InputVarInfo {
+    struct InputInfo a, b, c, d;
+};
+
+// Inputs and settings of the final combiner.
+struct FCInputInfo {
+    struct InputInfo a, b, c, d, e, f, g;
+    int c0, c1;
+    //uint32_t c0_value, c1_value;
+    // Set by get_var() when the final combiner (cur_stage == 8) reads C0 / C1.
+    bool c0_used, c1_used;
+    // NOTE(review): presumably mirror the PS_FINALCOMBINERSETTING bits -- the
+    // code filling them is not visible here.
+    bool v1r0_sum, clamp_sum, inv_v1, inv_r0, enabled;
+};
+
+// Decoded output configuration of one combiner half.
+// ab, cd, muxsum are PS_REGISTER destinations (resolved with get_var());
+// ab_op / cd_op are compared against the PS_COMBINEROUTPUT *_DOT_PRODUCT
+// flags; mapping is the PS_COMBINEROUTPUT output mapping given to
+// get_output().
+struct OutputInfo {
+    int ab, cd, muxsum, flags, ab_op, cd_op, muxsum_op,
+        mapping, ab_alphablue, cd_alphablue;
+};
+
+// One general combiner stage: inputs and outputs of its RGB and alpha halves.
+struct PSStageInfo {
+    struct InputVarInfo rgb_input, alpha_input;
+    struct OutputInfo rgb_output, alpha_output;
+    int c0, c1;
+    //uint32_t c0_value, c1_value;
+    // Set by get_var() when generated code references this stage's C0 / C1.
+    bool c0_used, c1_used;
+};
+
+// Working state of one pixel-shader translation.
+struct PixelShader {
+    PshState state;
+
+    // flags is tested against the PS_COMBINERCOUNT_* bits in get_var().
+    int num_stages, flags;
+    struct PSStageInfo stage[8];
+    struct FCInputInfo final_input;
+    int tex_modes[4], input_tex[4];
+
+    //uint32_t dot_mapping, input_texture;
+
+    // varE / varF: expressions multiplied when PS_REGISTER_EF_PROD is read.
+    std::string *varE, *varF;
+    // Generated shader code; add_stage_code() appends to it.
+    std::string *code;
+    // Stage being translated; get_var() treats 8 as the final combiner.
+    int cur_stage;
+
+    // De-duplicated names recorded by add_var_ref() / add_const_ref():
+    // up to 32 entries of at most 31 characters each.
+    int num_var_refs;
+    char var_refs[32][32];
+    int num_const_refs;
+    char const_refs[32][32];
+};
+
+// Record that the generated code references the variable `var`.
+// Names are stored once; a name already present is ignored. The table holds
+// a fixed number of fixed-size entries, so a name that would not fit (table
+// full or name too long) trips an assert and is dropped instead of being
+// written past the end of the array.
+static void add_var_ref(struct PixelShader *ps, const char *var)
+{
+    const int max_refs = (int)(sizeof(ps->var_refs) / sizeof(ps->var_refs[0]));
+    const size_t max_len = sizeof(ps->var_refs[0]);
+    int i;
+
+    for (i = 0; i < ps->num_var_refs; i++) {
+        if (strcmp(ps->var_refs[i], var) == 0) return;
+    }
+
+    if (ps->num_var_refs >= max_refs || strlen(var) >= max_len) {
+        assert(false);
+        return;
+    }
+    // Length was checked above, so the copy (including the NUL) fits.
+    strcpy(ps->var_refs[ps->num_var_refs++], var);
+}
+
+// Record that the generated code references the constant `var`.
+// Names are stored once; a name already present is ignored. The table holds
+// a fixed number of fixed-size entries, so a name that would not fit (table
+// full or name too long) trips an assert and is dropped instead of being
+// written past the end of the array.
+static void add_const_ref(struct PixelShader *ps, const char *var)
+{
+    const int max_refs = (int)(sizeof(ps->const_refs) / sizeof(ps->const_refs[0]));
+    const size_t max_len = sizeof(ps->const_refs[0]);
+    int i;
+
+    for (i = 0; i < ps->num_const_refs; i++) {
+        if (strcmp(ps->const_refs[i], var) == 0) return;
+    }
+
+    if (ps->num_const_refs >= max_refs || strlen(var) >= max_len) {
+        assert(false);
+        return;
+    }
+    // Length was checked above, so the copy (including the NUL) fits.
+    strcpy(ps->const_refs[ps->num_const_refs++], var);
+}
+
+// Get the code for a variable used in the program.
+//
+// ps      - translation state; references to r0/r1 and to constants are
+//           recorded in it, and the c0_used / c1_used flags are updated
+// reg     - PS_REGISTER selector
+// is_dest - true when the register is written; only matters for
+//           PS_REGISTER_DISCARD, which yields "" as a destination and "0.0"
+//           as a source
+//
+// Returns a newly allocated string owned by the caller. An unknown selector
+// asserts; with asserts compiled out it is treated like ZERO/DISCARD so the
+// function always returns a valid string.
+static std::string* get_var(struct PixelShader *ps, int reg, bool is_dest)
+{
+    switch (reg) {
+    case PS_REGISTER_DISCARD:
+        if (is_dest) {
+            return new std::string("");
+        } else {
+            return new std::string("0.0");
+        }
+        break;
+    case PS_REGISTER_C0:
+        /* TODO: should the final stage really always be unique? */
+        if (ps->flags & PS_COMBINERCOUNT_UNIQUE_C0 || ps->cur_stage == 8) {
+            std::string *name = qstring_from_fmt("c_%d_%d", ps->cur_stage, 0);
+            add_const_ref(ps, qstring_get_str(name));
+            if (ps->cur_stage == 8) {
+                ps->final_input.c0_used = true;
+            } else {
+                ps->stage[ps->cur_stage].c0_used = true;
+            }
+            return name;
+        } else {  // Same c0
+            add_const_ref(ps, "c_0_0");
+            ps->stage[0].c0_used = true;
+            return new std::string("c_0_0");
+        }
+        break;
+    case PS_REGISTER_C1:
+        if (ps->flags & PS_COMBINERCOUNT_UNIQUE_C1 || ps->cur_stage == 8) {
+            std::string *name = qstring_from_fmt("c_%d_%d", ps->cur_stage, 1);
+            add_const_ref(ps, qstring_get_str(name));
+            if (ps->cur_stage == 8) {
+                ps->final_input.c1_used = true;
+            } else {
+                ps->stage[ps->cur_stage].c1_used = true;
+            }
+            return name;
+        } else {  // Same c1
+            add_const_ref(ps, "c_0_1");
+            ps->stage[0].c1_used = true;
+            return new std::string("c_0_1");
+        }
+        break;
+    case PS_REGISTER_FOG:
+        return new std::string("pFog");
+    case PS_REGISTER_V0:
+        return new std::string("v0");
+    case PS_REGISTER_V1:
+        return new std::string("v1");
+    case PS_REGISTER_T0:
+        return new std::string("t0");
+    case PS_REGISTER_T1:
+        return new std::string("t1");
+    case PS_REGISTER_T2:
+        return new std::string("t2");
+    case PS_REGISTER_T3:
+        return new std::string("t3");
+    case PS_REGISTER_R0:
+        add_var_ref(ps, "r0");
+        return new std::string("r0");
+    case PS_REGISTER_R1:
+        add_var_ref(ps, "r1");
+        return new std::string("r1");
+    case PS_REGISTER_V1R0_SUM:
+        add_var_ref(ps, "r0");
+        return new std::string("(v1 + r0)");
+    case PS_REGISTER_EF_PROD:
+        return qstring_from_fmt("(%s * %s)", qstring_get_str(ps->varE),
+                                qstring_get_str(ps->varF));
+    default:
+        assert(false);
+        break;
+    }
+
+    // Only reached for an unknown selector when asserts are disabled: never
+    // fall off the end of a function that must return a pointer.
+    return new std::string(is_dest ? "" : "0.0");
+}
+
+// Get input variable code.
+//
+// Resolves the register of `in`, appends the channel selector (".rgb", ".b"
+// or ".a") and wraps the result in the expression for the input mapping
+// in.mod. is_alpha selects the alpha half of the combiner, where channel 0
+// means blue. No selector is appended to the literal "0.0", nor to an
+// EF_PROD expression that already contains ".a".
+//
+// Returns a newly allocated string owned by the caller. An unknown mapping
+// asserts; with asserts compiled out the unmapped register expression is
+// returned.
+static std::string* get_input_var(struct PixelShader *ps, struct InputInfo in, bool is_alpha)
+{
+    std::string *reg = get_var(ps, in.reg, false);
+
+    if (strcmp(qstring_get_str(reg), "0.0") != 0
+        && (in.reg != PS_REGISTER_EF_PROD
+            || strstr(qstring_get_str(reg), ".a") == NULL)) {
+        switch (in.chan) {
+        case PS_CHANNEL_RGB:
+            if (is_alpha) {
+                reg->append(".b");
+            } else {
+                reg->append(".rgb");
+            }
+            break;
+        case PS_CHANNEL_ALPHA:
+            reg->append(".a");
+            break;
+        default:
+            assert(false);
+            break;
+        }
+    }
+
+    std::string *res = NULL;
+    switch (in.mod) {
+    case PS_INPUTMAPPING_SIGNED_IDENTITY:
+    case PS_INPUTMAPPING_UNSIGNED_IDENTITY:
+        return reg;
+    case PS_INPUTMAPPING_UNSIGNED_INVERT:
+        res = qstring_from_fmt("(1.0 - %s)", qstring_get_str(reg));
+        break;
+    case PS_INPUTMAPPING_EXPAND_NORMAL: // TODO: Change to max(0, x)??
+        res = qstring_from_fmt("(2.0 * %s - 1.0)", qstring_get_str(reg));
+        break;
+    case PS_INPUTMAPPING_EXPAND_NEGATE:
+        res = qstring_from_fmt("(1.0 - 2.0 * %s)", qstring_get_str(reg));
+        break;
+    case PS_INPUTMAPPING_HALFBIAS_NORMAL:
+        res = qstring_from_fmt("(%s - 0.5)", qstring_get_str(reg));
+        break;
+    case PS_INPUTMAPPING_HALFBIAS_NEGATE:
+        res = qstring_from_fmt("(0.5 - %s)", qstring_get_str(reg));
+        break;
+    case PS_INPUTMAPPING_SIGNED_NEGATE:
+        res = qstring_from_fmt("-%s", qstring_get_str(reg));
+        break;
+    default:
+        // Unknown mapping: hand back the plain register expression rather
+        // than deleting it and returning an uninitialized pointer.
+        assert(false);
+        return reg;
+    }
+
+    delete reg;
+    return res;
+}
+
+// Get code for the output mapping of a stage.
+//
+// reg     - expression to map (not modified, not freed)
+// mapping - one of the PS_COMBINEROUTPUT_IDENTITY .. SHIFTRIGHT_1 values
+//
+// Always returns a newly allocated string that the caller owns independently
+// of `reg`. An unknown mapping asserts; with asserts compiled out the
+// expression is passed through unchanged.
+static std::string* get_output(std::string *reg, int mapping)
+{
+    std::string *res = NULL;
+
+    switch (mapping) {
+    case PS_COMBINEROUTPUT_IDENTITY:
+        // y = x. Return a copy so every case hands out a fresh object; the
+        // break keeps this case from falling into BIAS below.
+        res = new std::string(*reg);
+        break;
+    case PS_COMBINEROUTPUT_BIAS:
+        res = qstring_from_fmt("(%s - 0.5)", qstring_get_str(reg));
+        break;
+    case PS_COMBINEROUTPUT_SHIFTLEFT_1:
+        res = qstring_from_fmt("(%s * 2.0)", qstring_get_str(reg));
+        break;
+    case PS_COMBINEROUTPUT_SHIFTLEFT_1_BIAS:
+        res = qstring_from_fmt("((%s - 0.5) * 2.0)", qstring_get_str(reg));
+        break;
+    case PS_COMBINEROUTPUT_SHIFTLEFT_2:
+        res = qstring_from_fmt("(%s * 4.0)", qstring_get_str(reg));
+        break;
+    case PS_COMBINEROUTPUT_SHIFTRIGHT_1:
+        res = qstring_from_fmt("(%s / 2.0)", qstring_get_str(reg));
+        break;
+    default:
+        assert(false);
+        res = new std::string(*reg);
+        break;
+    }
+
+    return res;
+}
+
+// Add the GLSL code for a stage
+static void add_stage_code(struct PixelShader *ps,
+                           struct InputVarInfo input, struct OutputInfo output,
+                           const char *write_mask, bool is_alpha)
+{
+    std::string *a = get_input_var(ps, input.a, is_alpha);
+    std::string *b = get_input_var(ps, input.b, is_alpha);
+    std::string *c = get_input_var(ps, input.c, is_alpha);
+    std::string *d = get_input_var(ps, input.d, is_alpha);
+
+    const char *caster = "";
+    if (strlen(write_mask) == 3) {
+        caster = "vec3";
+    }
+
+    std::string *ab;
+    if (output.ab_op == PS_COMBINEROUTPUT_AB_DOT_PRODUCT) {
+        ab = qstring_from_fmt("dot(%s, %s)",
+                              qstring_get_str(a), qstring_get_str(b));
+    } else {
+        ab = qstring_from_fmt("(%s * %s)",
+                              qstring_get_str(a), qstring_get_str(b));
+    }
+
+    std::string *cd;
+    if (output.cd_op == PS_COMBINEROUTPUT_CD_DOT_PRODUCT) {
+        cd = qstring_from_fmt("dot(%s, %s)",
+                              qstring_get_str(c), qstring_get_str(d));
+    } else {
+        cd = qstring_from_fmt("(%s * %s)",
+                              qstring_get_str(c), qstring_get_str(d));
+    }
+
+    std::string *ab_mapping = get_output(ab, output.mapping);
+    std::string *cd_mapping = get_output(cd, output.mapping);
+    std::string *ab_dest = get_var(ps, output.ab, true);
+    std::string *cd_dest = get_var(ps, output.cd, true);
+    std::string *sum_dest = get_var(ps, output.muxsum, true);
+
+    if (ab_dest->length()) {
+        qstring_append_fmt(ps->code, "%s.%s = %s(%s);\n",
+                           qstring_get_str(ab_dest), write_mask, caster, qstring_get_str(ab_mapping));
+    } else {
+        // QDECREF(ab_dest);
+        // QINCREF(ab_mapping);
+        delete ab_dest;
+        ab_dest = new std::string(*ab_mapping);
+    }
+
+    if (cd_dest->length()) {
+        qstring_append_fmt(ps->code, "%s.%s = %s(%s);\n",
+                           qstring_get_str(cd_dest), write_mask, caster, qstring_get_str(cd_mapping));
+    } else {
+        // QDECREF(cd_dest);
+        // QINCREF(cd_mapping);
+        delete cd_dest;
+        cd_dest = new std::string(*cd_mapping);
+    }
+
+    if (!is_alpha && output.flags & PS_COMBINEROUTPUT_AB_BLUE_TO_ALPHA) {
+        qstring_append_fmt(ps->code, "%s.a = %s.b;\n",
+                           qstring_get_str(ab_dest), qstring_get_str(ab_dest));
+    }
+    if (!is_alpha && output.flags & PS_COMBINEROUTPUT_CD_BLUE_TO_ALPHA) {
+        qstring_append_fmt(ps->code, "%s.a = %s.b;\n",
+                           qstring_get_str(cd_dest), qstring_get_str(cd_dest));
+    }
+
+    std::string *sum;
+    if (output.muxsum_op == PS_COMBINEROUTPUT_AB_CD_SUM) {
+        sum = qstring_from_fmt("(%s + %s)", qstring_get_str(ab), qstring_get_str(cd));
+    } else {
+        sum = qstring_from_fmt("((r0.a >= 0.5) ? %s : %s)",
+                               qstring_get_str(cd), qstring_get_str(ab));
+    }
+
+    std::string *sum_mapping = get_output(sum, output.mapping);
+    if (sum_dest->length()) {
+        qstring_append_fmt(ps->code, "%s.%s = %s(%s);\n",
+                           qstring_get_str(sum_dest), write_mask, caster, qstring_get_str(sum_mapping));
+    }
+
+    delete a;
+    delete b;
+    delete c;
+    delete d;
+    delete ab;
+    delete cd;
+    delete ab_mapping;
+    delete cd_mapping;
+    delete ab_dest;
+    delete cd_dest;
+    delete sum_dest;
+    delete sum;
+    delete sum_mapping;
+}
+
+// Add code for the final combiner stage:
+//   r0.rgb = D + mix(C, B, A)
+//   r0.a   = G (first component)
+static void add_final_stage_code(struct PixelShader *ps, struct FCInputInfo _final)
+{
+    // E and F are resolved first and parked on the shader for the duration of
+    // this function (presumably so get_input_var can refer to them while the
+    // other inputs are resolved - confirm against get_input_var).
+    ps->varE = get_input_var(ps, _final.e, false);
+    ps->varF = get_input_var(ps, _final.f, false);
+
+    std::string *in_a = get_input_var(ps, _final.a, false);
+    std::string *in_b = get_input_var(ps, _final.b, false);
+    std::string *in_c = get_input_var(ps, _final.c, false);
+    std::string *in_d = get_input_var(ps, _final.d, false);
+    std::string *in_g = get_input_var(ps, _final.g, false);
+
+    // The result is written to r0, so make sure it gets declared.
+    add_var_ref(ps, "r0");
+    qstring_append_fmt(ps->code, "r0.rgb = %s + mix(vec3(%s), vec3(%s), vec3(%s));\n",
+                       qstring_get_str(in_d), qstring_get_str(in_c),
+                       qstring_get_str(in_b), qstring_get_str(in_a));
+    /* FIXME: Is .x correct here? */
+    qstring_append_fmt(ps->code, "r0.a = vec3(%s).x;\n", qstring_get_str(in_g));
+
+    delete in_a;
+    delete in_b;
+    delete in_c;
+    delete in_d;
+    delete in_g;
+
+    // E/F are only valid while this function runs; drop them again.
+    delete ps->varE;
+    delete ps->varF;
+    ps->varF = NULL;
+    ps->varE = NULL;
+}
+
+
+
+// Translate a decoded PixelShader into the text of a GLSL 3.30 fragment shader.
+//
+// Three buffers are filled and concatenated at the end:
+//   preflight - global declarations (vertex data input, fragColor, uniforms)
+//   vars      - statements at the top of main(): perspective-corrected
+//               inputs, texture fetches and register declarations
+//   ps->code  - the combiner stages, final combiner and alpha test
+//
+// Returns a newly allocated string owned by the caller. ps->code is allocated
+// and deleted in here; the pointer is not reset afterwards.
+static std::string* psh_convert(struct PixelShader *ps)
+{
+    int i;
+
+    std::string *preflight = new std::string();
+    preflight->append(STRUCT_VERTEX_DATA);
+    preflight->append("noperspective in VertexData g_vtx;\n");
+    preflight->append("#define vtx g_vtx\n");
+    preflight->append("\n");
+    preflight->append("out vec4 fragColor;\n");
+    preflight->append("\n");
+    preflight->append("uniform vec4 fogColor;\n");
+
+    /* calculate perspective-correct inputs */
+    std::string *vars = new std::string();
+    vars->append("vec4 pD0 = vtx.D0 / vtx.inv_w;\n");
+    vars->append("vec4 pD1 = vtx.D1 / vtx.inv_w;\n");
+    vars->append("vec4 pB0 = vtx.B0 / vtx.inv_w;\n");
+    vars->append("vec4 pB1 = vtx.B1 / vtx.inv_w;\n");
+    vars->append("vec4 pFog = vec4(fogColor.rgb, clamp(vtx.Fog / vtx.inv_w, 0.0, 1.0));\n");
+    vars->append("vec4 pT0 = vtx.T0 / vtx.inv_w;\n");
+    vars->append("vec4 pT1 = vtx.T1 / vtx.inv_w;\n");
+    vars->append("vec4 pT2 = vtx.T2 / vtx.inv_w;\n");
+    vars->append("vec4 pT3 = vtx.T3 / vtx.inv_w;\n");
+    vars->append("\n");
+    vars->append("vec4 v0 = pD0;\n");
+    vars->append("vec4 v1 = pD1;\n");
+
+    ps->code = new std::string();
+
+    /* Emit the definition of t0..t3 for each of the four texture stages */
+    for (i = 0; i < 4; i++) {
+
+        /* Stays NULL for modes that do not sample texSamp<i> */
+        const char *sampler_type = NULL;
+
+        switch (ps->tex_modes[i]) {
+        case PS_TEXTUREMODES_NONE:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_NONE */\n",
+                               i);
+            break;
+        case PS_TEXTUREMODES_PROJECT2D:
+            if (ps->state.rect_tex[i]) {
+                sampler_type = "sampler2DRect";
+            } else {
+                sampler_type = "sampler2D";
+            }
+            qstring_append_fmt(vars, "vec4 t%d = textureProj(texSamp%d, pT%d.xyw);\n",
+                               i, i, i);
+            break;
+        case PS_TEXTUREMODES_PROJECT3D:
+            sampler_type = "sampler3D";
+            qstring_append_fmt(vars, "vec4 t%d = textureProj(texSamp%d, pT%d.xyzw);\n",
+                               i, i, i);
+            break;
+        case PS_TEXTUREMODES_CUBEMAP:
+            sampler_type = "samplerCube";
+            qstring_append_fmt(vars, "vec4 t%d = texture(texSamp%d, pT%d.xyz / pT%d.w);\n",
+                               i, i, i, i);
+            break;
+        case PS_TEXTUREMODES_PASSTHRU:
+            qstring_append_fmt(vars, "vec4 t%d = pT%d;\n", i, i);
+            break;
+        case PS_TEXTUREMODES_CLIPPLANE: {
+            int j;
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_CLIPPLANE */\n",
+                               i);
+            /* One discard test per coordinate; compare_mode picks the operator */
+            for (j = 0; j < 4; j++) {
+                qstring_append_fmt(vars, "  if(pT%d.%c %s 0.0) { discard; };\n",
+                                   i, "xyzw"[j],
+                                   ps->state.compare_mode[i][j] ? ">=" : "<");
+            }
+            break;
+        }
+        case PS_TEXTUREMODES_BUMPENVMAP:
+            assert(!ps->state.rect_tex[i]);
+            sampler_type = "sampler2D";
+            qstring_append_fmt(preflight, "uniform mat2 bumpMat%d;\n", i);
+            /* FIXME: Do bumpMat swizzle on CPU before upload */
+            qstring_append_fmt(vars, "vec4 t%d = texture(texSamp%d, pT%d.xy + t%d.rg * mat2(bumpMat%d[0].xy,bumpMat%d[1].yx));\n",
+                               i, i, i, ps->input_tex[i], i, i);
+            break;
+        case PS_TEXTUREMODES_BUMPENVMAP_LUM:
+            qstring_append_fmt(preflight, "uniform float bumpScale%d;\n", i);
+            qstring_append_fmt(preflight, "uniform float bumpOffset%d;\n", i);
+            /* The luminance scale goes into ps->code, which is placed after
+             * vars in main(), i.e. after the fetch emitted further below */
+            qstring_append_fmt(ps->code, "/* BUMPENVMAP_LUM for stage %d */\n", i);
+            qstring_append_fmt(ps->code, "t%d = t%d * (bumpScale%d * t%d.b + bumpOffset%d);\n",
+                               i, i, i, ps->input_tex[i], i);
+            /* Now the same as BUMPENVMAP */
+            assert(!ps->state.rect_tex[i]);
+            sampler_type = "sampler2D";
+            qstring_append_fmt(preflight, "uniform mat2 bumpMat%d;\n", i);
+            /* FIXME: Do bumpMat swizzle on CPU before upload */
+            qstring_append_fmt(vars, "vec4 t%d = texture(texSamp%d, pT%d.xy + t%d.rg * mat2(bumpMat%d[0].xy,bumpMat%d[1].yx));\n",
+                               i, i, i, ps->input_tex[i], i, i);
+            break;
+        case PS_TEXTUREMODES_BRDF:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_BRDF */\n",
+                               i);
+            assert(false); /* Unimplemented */
+            break;
+        case PS_TEXTUREMODES_DOT_ST:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_DOT_ST */\n",
+                               i);
+            assert(false); /* Unimplemented */
+            break;
+        case PS_TEXTUREMODES_DOT_ZW:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_DOT_ZW */\n",
+                               i);
+            assert(false); /* Unimplemented */
+            break;
+        case PS_TEXTUREMODES_DOT_RFLCT_DIFF:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_DOT_RFLCT_DIFF */\n",
+                               i);
+            assert(false); /* Unimplemented */
+            break;
+        case PS_TEXTUREMODES_DOT_RFLCT_SPEC:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_DOT_RFLCT_SPEC */\n",
+                               i);
+            assert(false); /* Unimplemented */
+            break;
+        case PS_TEXTUREMODES_DOT_STR_3D:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_DOT_STR_3D */\n",
+                               i);
+            assert(false); /* Unimplemented */
+            break;
+        case PS_TEXTUREMODES_DOT_STR_CUBE:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_DOT_STR_CUBE */\n",
+                               i);
+            assert(false); /* Unimplemented */
+            break;
+        case PS_TEXTUREMODES_DPNDNT_AR:
+            assert(!ps->state.rect_tex[i]);
+            sampler_type = "sampler2D";
+            qstring_append_fmt(vars, "vec4 t%d = texture(texSamp%d, t%d.ar);\n",
+                               i, i, ps->input_tex[i]);
+            break;
+        case PS_TEXTUREMODES_DPNDNT_GB:
+            assert(!ps->state.rect_tex[i]);
+            sampler_type = "sampler2D";
+            qstring_append_fmt(vars, "vec4 t%d = texture(texSamp%d, t%d.gb);\n",
+                               i, i, ps->input_tex[i]);
+            break;
+        case PS_TEXTUREMODES_DOTPRODUCT:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(dot(pT%d.xyz, t%d.rgb));\n",
+                               i, i, ps->input_tex[i]);
+            break;
+        case PS_TEXTUREMODES_DOT_RFLCT_SPEC_CONST:
+            qstring_append_fmt(vars, "vec4 t%d = vec4(0.0); /* PS_TEXTUREMODES_DOT_RFLCT_SPEC_CONST */\n",
+                               i);
+            assert(false); /* Unimplemented */
+            break;
+        default:
+            fprintf(stderr, "Unknown ps tex mode: 0x%x\n", ps->tex_modes[i]);
+            assert(false);
+            break;
+        }
+        
+        if (sampler_type != NULL) {
+            qstring_append_fmt(preflight, "uniform %s texSamp%d;\n", sampler_type, i);
+
+            /* As this means a texture fetch does happen, do alphakill */
+            if (ps->state.alphakill[i]) {
+                qstring_append_fmt(vars, "if (t%d.a == 0.0) { discard; };\n",
+                                   i);
+            }
+        }
+    }
+
+    /* General combiner stages: RGB half first, then the alpha half */
+    for (i = 0; i < ps->num_stages; i++) {
+        ps->cur_stage = i;
+        qstring_append_fmt(ps->code, "// Stage %d\n", i);
+        add_stage_code(ps, ps->stage[i].rgb_input, ps->stage[i].rgb_output, "rgb", false);
+        add_stage_code(ps, ps->stage[i].alpha_input, ps->stage[i].alpha_output, "a", true);
+    }
+
+    if (ps->final_input.enabled) {
+        /* cur_stage 8 is used while the final combiner is generated */
+        ps->cur_stage = 8;
+        ps->code->append("// Final Combiner\n");
+        add_final_stage_code(ps, ps->final_input);
+    }
+
+    /* Declare every register referenced while generating the stages; r0.a
+     * starts as t0.a when texture stage 0 is active, otherwise as 1.0 */
+    for (i = 0; i < ps->num_var_refs; i++) {
+        qstring_append_fmt(vars, "vec4 %s;\n", ps->var_refs[i]);
+        if (strcmp(ps->var_refs[i], "r0") == 0) {
+            if (ps->tex_modes[0] != PS_TEXTUREMODES_NONE) {
+                vars->append("r0.a = t0.a;\n");
+            } else {
+                vars->append("r0.a = 1.0;\n");
+            }
+        }
+    }
+    /* Every referenced constant becomes a vec4 uniform */
+    for (i = 0; i < ps->num_const_refs; i++) {
+        qstring_append_fmt(preflight, "uniform vec4 %s;\n", ps->const_refs[i]);
+    }
+
+    /* Alpha test: nothing is emitted for ALWAYS, NEVER discards
+     * unconditionally, the other functions compare r0.a against alphaRef */
+    if (ps->state.alpha_test && ps->state.alpha_func != ALPHA_FUNC_ALWAYS) {
+        qstring_append_fmt(preflight, "uniform float alphaRef;\n");
+        if (ps->state.alpha_func == ALPHA_FUNC_NEVER) {
+            ps->code->append("discard;\n");
+        } else {
+            const char* alpha_op;
+            switch (ps->state.alpha_func) {
+            case ALPHA_FUNC_LESS: alpha_op = "<"; break;
+            case ALPHA_FUNC_EQUAL: alpha_op = "=="; break;
+            case ALPHA_FUNC_LEQUAL: alpha_op = "<="; break;
+            case ALPHA_FUNC_GREATER: alpha_op = ">"; break;
+            case ALPHA_FUNC_NOTEQUAL: alpha_op = "!="; break;
+            case ALPHA_FUNC_GEQUAL: alpha_op = ">="; break;
+            default:
+                /* NOTE(review): alpha_op stays unset here if asserts are compiled out */
+                assert(false);
+                break;
+            }
+            qstring_append_fmt(ps->code, "if (!(r0.a %s alphaRef)) discard;\n",
+                               alpha_op);
+        }
+    }
+
+    /* Assemble the shader: declarations, then main() = vars + code.
+     * NOTE(review): r0 is written to fragColor unconditionally, but it is only
+     * declared above when it appears in var_refs - confirm it always does. */
+    std::string *final = new std::string();
+    final->append("#version 330\n\n");
+    final->append(qstring_get_str(preflight));
+    final->append("void main() {\n");
+    final->append(qstring_get_str(vars));
+    final->append(qstring_get_str(ps->code));
+    final->append("fragColor = r0;\n");
+    final->append("}\n");
+
+    delete preflight;
+    delete vars;
+    delete ps->code;
+
+    return final;
+}
+
+// Split one packed combiner input byte into its fields. The fields keep
+// their original bit positions (they are masked, not shifted down).
+static void parse_input(struct InputInfo *var, int value)
+{
+    enum {
+        INPUT_REG_MASK  = 0x0F, /* bits 0-3: register */
+        INPUT_CHAN_MASK = 0x10, /* bit 4:    channel */
+        INPUT_MOD_MASK  = 0xE0  /* bits 5-7: input mapping */
+    };
+
+    var->reg  = value & INPUT_REG_MASK;
+    var->chan = value & INPUT_CHAN_MASK;
+    var->mod  = value & INPUT_MOD_MASK;
+}
+
+// Unpack a 32-bit combiner input word into its four inputs, one byte each:
+// D is the lowest byte, then C, B, and A in the highest byte.
+static void parse_combiner_inputs(uint32_t value,
+                                struct InputInfo *a, struct InputInfo *b,
+                                struct InputInfo *c, struct InputInfo *d)
+{
+    /* Ordered from the least significant byte upwards */
+    struct InputInfo *const by_byte[4] = { d, c, b, a };
+
+    for (int n = 0; n < 4; n++) {
+        parse_input(by_byte[n], (value >> (8 * n)) & 0xFF);
+    }
+}
+
+// Unpack a 32-bit combiner output word. The low three nibbles name the CD, AB
+// and mux/sum destination registers; everything from bit 12 upwards is a flag
+// field that is stored whole and also split into its parts. The parts are
+// masked only, so they keep their bit positions within the flag field.
+static void parse_combiner_output(uint32_t value, struct OutputInfo *out)
+{
+    const int flag_bits = value >> 12;
+
+    /* destination registers */
+    out->cd     = (value >> 0) & 0xF;
+    out->ab     = (value >> 4) & 0xF;
+    out->muxsum = (value >> 8) & 0xF;
+
+    /* operation and mapping flags */
+    out->flags        = flag_bits;
+    out->cd_op        = flag_bits & 0x01;
+    out->ab_op        = flag_bits & 0x02;
+    out->muxsum_op    = flag_bits & 0x04;
+    out->mapping      = flag_bits & 0x38;
+    out->cd_alphablue = flag_bits & 0x40;
+    out->ab_alphablue = flag_bits & 0x80;
+}
+
+// Decode the raw register-combiner state in `state` into a PixelShader
+// description and translate it into GLSL fragment shader source.
+//
+// Returns a newly allocated string (from psh_convert) owned by the caller.
+std::string *psh_translate(const PshState state)
+{
+    int i;
+    struct PixelShader ps;
+    memset(&ps, 0, sizeof(ps));
+
+    ps.state = state;
+
+    /* Low byte: number of general combiner stages; the rest: flags */
+    ps.num_stages = state.combiner_control & 0xFF;
+    ps.flags = state.combiner_control >> 8;
+
+    /* The count comes from an 8-bit field, but PshState only carries a fixed
+     * number of stages. Clamp it so a bogus register value cannot make the
+     * loops below (and in psh_convert) index past the per-stage arrays. */
+    const int max_stages =
+        (int)(sizeof(state.rgb_inputs) / sizeof(state.rgb_inputs[0]));
+    if (ps.num_stages > max_stages) {
+        ps.num_stages = max_stages;
+    }
+
+    /* Five bits of texture mode per texture stage */
+    for (i = 0; i < 4; i++) {
+        ps.tex_modes[i] = (state.shader_stage_program >> (i * 5)) & 0x1F;
+    }
+
+    /* Which earlier texture stage feeds each stage: stage 0 has none, stage 1
+     * always uses stage 0, stages 2 and 3 are configurable */
+    ps.input_tex[0] = -1;
+    ps.input_tex[1] = 0;
+    ps.input_tex[2] = (state.other_stage_input >> 16) & 0xF;
+    ps.input_tex[3] = (state.other_stage_input >> 20) & 0xF;
+    for (i = 0; i < ps.num_stages; i++) {
+        parse_combiner_inputs(state.rgb_inputs[i],
+            &ps.stage[i].rgb_input.a, &ps.stage[i].rgb_input.b,
+            &ps.stage[i].rgb_input.c, &ps.stage[i].rgb_input.d);
+        parse_combiner_inputs(state.alpha_inputs[i],
+            &ps.stage[i].alpha_input.a, &ps.stage[i].alpha_input.b,
+            &ps.stage[i].alpha_input.c, &ps.stage[i].alpha_input.d);
+
+        parse_combiner_output(state.rgb_outputs[i], &ps.stage[i].rgb_output);
+        parse_combiner_output(state.alpha_outputs[i], &ps.stage[i].alpha_output);
+        //ps.stage[i].c0 = (pDef->PSC0Mapping >> (i * 4)) & 0xF;
+        //ps.stage[i].c1 = (pDef->PSC1Mapping >> (i * 4)) & 0xF;
+        //ps.stage[i].c0_value = constant_0[i];
+        //ps.stage[i].c1_value = constant_1[i];
+    }
+
+    /* The final combiner is enabled when either of its input words is non-zero */
+    struct InputInfo blank;
+    ps.final_input.enabled = state.final_inputs_0 || state.final_inputs_1;
+    if (ps.final_input.enabled) {
+        parse_combiner_inputs(state.final_inputs_0,
+                              &ps.final_input.a, &ps.final_input.b,
+                              &ps.final_input.c, &ps.final_input.d);
+        /* The second word holds E, F and G; its lowest byte carries the
+         * settings flags instead of a fourth input, so it is parsed into a
+         * throw-away slot */
+        parse_combiner_inputs(state.final_inputs_1,
+                              &ps.final_input.e, &ps.final_input.f,
+                              &ps.final_input.g, &blank);
+        int flags = state.final_inputs_1 & 0xFF;
+        ps.final_input.clamp_sum = flags & PS_FINALCOMBINERSETTING_CLAMP_SUM;
+        ps.final_input.inv_v1 = flags & PS_FINALCOMBINERSETTING_COMPLEMENT_V1;
+        ps.final_input.inv_r0 = flags & PS_FINALCOMBINERSETTING_COMPLEMENT_R0;
+        //ps.final_input.c0 = (pDef->PSFinalCombinerConstants >> 0) & 0xF;
+        //ps.final_input.c1 = (pDef->PSFinalCombinerConstants >> 4) & 0xF;
+        //ps.final_input.c0_value = final_constant_0;
+        //ps.final_input.c1_value = final_constant_1;
+    }
+
+
+
+    return psh_convert(&ps);
+}
--- a/src/devices/video/nv2a_psh.h
+++ b/src/devices/video/nv2a_psh.h
@ -0,0 +1,59 @@
+/*
+ * QEMU Geforce NV2A pixel shader translation
+ *
+ * Copyright (c) 2013 espes
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef HW_NV2A_PSH_H
+#define HW_NV2A_PSH_H
+
+#include <stdint.h>
+#include <string>
+
+/* Alpha test comparison applied to r0.a against the alpha reference value. */
+enum PshAlphaFunc {
+    ALPHA_FUNC_NEVER    = 0, /* always discard */
+    ALPHA_FUNC_LESS     = 1, /* pass if r0.a <  alphaRef */
+    ALPHA_FUNC_EQUAL    = 2, /* pass if r0.a == alphaRef */
+    ALPHA_FUNC_LEQUAL   = 3, /* pass if r0.a <= alphaRef */
+    ALPHA_FUNC_GREATER  = 4, /* pass if r0.a >  alphaRef */
+    ALPHA_FUNC_NOTEQUAL = 5, /* pass if r0.a != alphaRef */
+    ALPHA_FUNC_GEQUAL   = 6, /* pass if r0.a >= alphaRef */
+    ALPHA_FUNC_ALWAYS   = 7, /* never discard */
+};
+
+/* Raw pixel shader state handed to psh_translate(). */
+typedef struct PshState {
+    /* fragment shader - register combiner stuff */
+    uint32_t combiner_control;     /* low byte: number of combiner stages, rest: flags */
+    uint32_t shader_stage_program; /* 5 bits of texture mode per texture stage */
+    uint32_t other_stage_input;    /* bits 16-19 / 20-23: input texture of stage 2 / 3 */
+    uint32_t final_inputs_0;       /* final combiner inputs A, B, C, D (one byte each) */
+    uint32_t final_inputs_1;       /* final combiner inputs E, F, G; low byte: settings flags */
+
+    /* packed per-stage combiner input/output words */
+    uint32_t rgb_inputs[8], rgb_outputs[8];
+    uint32_t alpha_inputs[8], alpha_outputs[8];
+
+    bool rect_tex[4];        /* per texture stage: sample PROJECT2D through a sampler2DRect */
+    bool compare_mode[4][4]; /* per stage and coordinate: CLIPPLANE test uses ">=" (true) or "<" */
+    bool alphakill[4];       /* per texture stage: discard when the fetched alpha is 0.0 */
+
+    bool alpha_test;              /* enables the alpha test on r0.a */
+    enum PshAlphaFunc alpha_func; /* comparison used by the alpha test */
+} PshState;
+
+/* Translate the given pixel shader state into GLSL fragment shader source.
+ * The returned string is allocated with new; the caller is responsible for
+ * deleting it. */
+std::string *psh_translate(const PshState state);
+
+#endif
--- a/src/devices/video/nv2a_shaders.cpp
+++ b/src/devices/video/nv2a_shaders.cpp
@ -0,0 +1,950 @@
+/*
+ * QEMU Geforce NV2A shader generator
+ *
+ * Copyright (c) 2015 espes
+ * Copyright (c) 2015 Jannik Vogel
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+/* Two-level stringification: stringify(x) macro-expands x first and then
+ * turns the result into a string literal; xstringify(x) stringizes its
+ * argument exactly as written. */
+#define stringify(x) xstringify(x)
+#define xstringify(x) #x
+
+#include "nv2a_debug.h"
+#include "nv2a_shaders_common.h"
+#include "nv2a_shaders.h"
+
+// Append printf-style formatted text to the std::string that `str` points to.
+//
+// The output is measured first and then formatted into a buffer of exactly
+// the required size, so long generated GLSL lines are never silently
+// truncated (a fixed 128-byte buffer used to cut them off). Nothing is
+// appended if formatting fails.
+//
+// NOTE: the format arguments are evaluated twice, so they must not have side
+// effects.
+#define qstring_append_fmt(str, ...) do { \
+    int qsaf_len = snprintf(NULL, 0, __VA_ARGS__); \
+    if (qsaf_len > 0) { \
+        std::string qsaf_buf((size_t)qsaf_len + 1, '\0'); \
+        snprintf(&qsaf_buf[0], qsaf_buf.size(), __VA_ARGS__); \
+        (str)->append(qsaf_buf.c_str(), (size_t)qsaf_len); \
+    } \
+} while (0)
+
+// NUL-terminated C string view of the std::string that `str` points to.
+#define qstring_get_str(str) ((str)->c_str())
+
+/* Pick the GL primitive type to draw with and, when that is not enough to
+ * reproduce the requested primitive/polygon mode, generate a GLSL 3.30
+ * geometry shader that does the conversion.
+ *
+ * polygon_front_mode / polygon_back_mode: fill mode; both must currently match.
+ * primitive_mode: the primitive type being drawn.
+ * gl_primitive_mode: receives the GL primitive type to issue the draw with.
+ *
+ * Returns NULL when no geometry shader is needed, otherwise a newly allocated
+ * string with the shader source, owned by the caller. */
+static std::string* generate_geometry_shader(
+                                      enum ShaderPolygonMode polygon_front_mode,
+                                      enum ShaderPolygonMode polygon_back_mode,
+                                      enum ShaderPrimitiveMode primitive_mode,
+                                      GLenum *gl_primitive_mode)
+{
+
+    /* FIXME: Missing support for 2-sided-poly mode */
+    assert(polygon_front_mode == polygon_back_mode);
+    enum ShaderPolygonMode polygon_mode = polygon_front_mode;
+
+    /* POINT mode shouldn't require any special work */
+    if (polygon_mode == POLY_MODE_POINT) {
+        *gl_primitive_mode = GL_POINTS;
+        return NULL;
+    }
+
+    /* Handle LINE and FILL mode */
+    /* These three are filled in by the cases that need a geometry shader */
+    const char *layout_in = NULL;
+    const char *layout_out = NULL;
+    const char *body = NULL;
+    switch (primitive_mode) {
+    /* Points and lines map directly onto GL primitives */
+    case PRIM_TYPE_POINTS: *gl_primitive_mode = GL_POINTS; return NULL;
+    case PRIM_TYPE_LINES: *gl_primitive_mode = GL_LINES; return NULL;
+    case PRIM_TYPE_LINE_LOOP: *gl_primitive_mode = GL_LINE_LOOP; return NULL;
+    case PRIM_TYPE_LINE_STRIP: *gl_primitive_mode = GL_LINE_STRIP; return NULL;
+    /* Triangle types only need a shader in LINE mode, to emit the outline */
+    case PRIM_TYPE_TRIANGLES:
+        *gl_primitive_mode = GL_TRIANGLES;
+        if (polygon_mode == POLY_MODE_FILL) { return NULL; }
+        assert(polygon_mode == POLY_MODE_LINE);
+        layout_in = "layout(triangles) in;\n";
+        layout_out = "layout(line_strip, max_vertices = 4) out;\n";
+        body = "  emit_vertex(0);\n"
+               "  emit_vertex(1);\n"
+               "  emit_vertex(2);\n"
+               "  emit_vertex(0);\n"
+               "  EndPrimitive();\n";
+        break;
+    case PRIM_TYPE_TRIANGLE_STRIP:
+        *gl_primitive_mode = GL_TRIANGLE_STRIP;
+        if (polygon_mode == POLY_MODE_FILL) { return NULL; }
+        assert(polygon_mode == POLY_MODE_LINE);
+        layout_in = "layout(triangles) in;\n";
+        layout_out = "layout(line_strip, max_vertices = 4) out;\n";
+        /* Imagine a quad made of a tristrip, the comments tell you which
+         * vertex we are using */
+        body = "  if ((gl_PrimitiveIDIn & 1) == 0) {\n"
+               "    if (gl_PrimitiveIDIn == 0) {\n"
+               "      emit_vertex(0);\n" /* bottom right */
+               "    }\n"
+               "    emit_vertex(1);\n" /* top right */
+               "    emit_vertex(2);\n" /* bottom left */
+               "    emit_vertex(0);\n" /* bottom right */
+               "  } else {\n"
+               "    emit_vertex(2);\n" /* bottom left */
+               "    emit_vertex(1);\n" /* top left */
+               "    emit_vertex(0);\n" /* top right */
+               "  }\n"
+               "  EndPrimitive();\n";
+        break;
+    case PRIM_TYPE_TRIANGLE_FAN:
+        *gl_primitive_mode = GL_TRIANGLE_FAN;
+        if (polygon_mode == POLY_MODE_FILL) { return NULL; }
+        assert(polygon_mode == POLY_MODE_LINE);
+        layout_in = "layout(triangles) in;\n";
+        layout_out = "layout(line_strip, max_vertices = 4) out;\n";
+        body = "  if (gl_PrimitiveIDIn == 0) {\n"
+               "    emit_vertex(0);\n"
+               "  }\n"
+               "  emit_vertex(1);\n"
+               "  emit_vertex(2);\n"
+               "  emit_vertex(0);\n"
+               "  EndPrimitive();\n";
+        break;
+    /* Quads are drawn as lines-with-adjacency (four vertices per primitive)
+     * and always converted by the geometry shader */
+    case PRIM_TYPE_QUADS:
+        *gl_primitive_mode = GL_LINES_ADJACENCY;
+        layout_in = "layout(lines_adjacency) in;\n";
+        if (polygon_mode == POLY_MODE_LINE) {
+            layout_out = "layout(line_strip, max_vertices = 5) out;\n";
+            body = "  emit_vertex(0);\n"
+                   "  emit_vertex(1);\n"
+                   "  emit_vertex(2);\n"
+                   "  emit_vertex(3);\n"
+                   "  emit_vertex(0);\n"
+                   "  EndPrimitive();\n";
+        } else if (polygon_mode == POLY_MODE_FILL) {
+            layout_out = "layout(triangle_strip, max_vertices = 4) out;\n";
+            body = "  emit_vertex(0);\n"
+                   "  emit_vertex(1);\n"
+                   "  emit_vertex(3);\n"
+                   "  emit_vertex(2);\n"
+                   "  EndPrimitive();\n";
+        } else {
+            assert(false);
+            return NULL;
+        }
+        break;
+    /* Quad strips use the strip variant; the generated shader returns early
+     * for odd primitive IDs */
+    case PRIM_TYPE_QUAD_STRIP:
+        *gl_primitive_mode = GL_LINE_STRIP_ADJACENCY;
+        layout_in = "layout(lines_adjacency) in;\n";
+        if (polygon_mode == POLY_MODE_LINE) {
+            layout_out = "layout(line_strip, max_vertices = 5) out;\n";
+            body = "  if ((gl_PrimitiveIDIn & 1) != 0) { return; }\n"
+                   "  if (gl_PrimitiveIDIn == 0) {\n"
+                   "    emit_vertex(0);\n"
+                   "  }\n"
+                   "  emit_vertex(1);\n"
+                   "  emit_vertex(3);\n"
+                   "  emit_vertex(2);\n"
+                   "  emit_vertex(0);\n"
+                   "  EndPrimitive();\n";
+        } else if (polygon_mode == POLY_MODE_FILL) {
+            layout_out = "layout(triangle_strip, max_vertices = 4) out;\n";
+            body = "  if ((gl_PrimitiveIDIn & 1) != 0) { return; }\n"
+                   "  emit_vertex(0);\n"
+                   "  emit_vertex(1);\n"
+                   "  emit_vertex(2);\n"
+                   "  emit_vertex(3);\n"
+                   "  EndPrimitive();\n";
+        } else {
+            assert(false);
+            return NULL;
+        }
+        break;
+    /* Polygons need no shader: line loop in LINE mode, triangle fan in FILL.
+     * NOTE(review): *gl_primitive_mode is left unset on the assert path. */
+    case PRIM_TYPE_POLYGON:
+        if (polygon_mode == POLY_MODE_LINE) {
+            *gl_primitive_mode = GL_LINE_LOOP;
+        } else if (polygon_mode == POLY_MODE_FILL) {
+            *gl_primitive_mode = GL_TRIANGLE_FAN;
+        } else {
+            assert(false);
+        }
+        return NULL;
+    default:
+        assert(false);
+        return NULL;
+    }
+
+    /* generate a geometry shader to support deprecated primitive types */
+    assert(layout_in);
+    assert(layout_out);
+    assert(body);
+    std::string* s = new std::string("#version 330\n"
+                                  "\n");
+    s->append(layout_in);
+    s->append(layout_out);
+    /* Common prologue: emit_vertex(index) forwards position, point size and
+     * the interpolated vertex data of input vertex `index` */
+    s->append("\n"
+                      STRUCT_VERTEX_DATA
+                      "noperspective in VertexData v_vtx[];\n"
+                      "noperspective out VertexData g_vtx;\n"
+                      "\n"
+                      "void emit_vertex(int index) {\n"
+                      "  gl_Position = gl_in[index].gl_Position;\n"
+                      "  gl_PointSize = gl_in[index].gl_PointSize;\n"
+                      "  g_vtx = v_vtx[index];\n"
+                      "  EmitVertex();\n"
+                      "}\n"
+                      "\n"
+                      "void main() {\n");
+    s->append(body);
+    s->append("}\n");
+
+    return s;
+}
+
+/* Append GLSL that declares `output` (of GLSL type `type`) as `input`
+ * transformed by the skinning matrices named `matrix`<n>, with `swizzle`
+ * selecting the components that are kept.
+ *
+ * count == 0: no skinning, a single transform by matrix 0.
+ * mix:        tweening between matrix 1 and matrix 0 by weight.x (only
+ *             implemented for count == 2).
+ * otherwise:  sum of `count` transforms, each scaled by one weight component
+ *             (untested, asserts after emitting). */
+static void append_skinning_code(std::string* str, bool mix,
+                                 unsigned int count, const char* type,
+                                 const char* output, const char* input,
+                                 const char* matrix, const char* swizzle)
+{
+    /* Skinning disabled: plain transform */
+    if (count == 0) {
+        qstring_append_fmt(str, "%s %s = (%s * %s0).%s;\n",
+                           type, output, input, matrix, swizzle);
+        return;
+    }
+
+    /* Start from zero and accumulate */
+    qstring_append_fmt(str, "%s %s = %s(0.0);\n", type, output, type);
+
+    if (!mix) {
+        /* Individual matrices */
+        for (unsigned int m = 0; m < count; m++) {
+            qstring_append_fmt(str, "%s += (%s * %s%d * weight.%c).%s;\n",
+                               output, input, matrix, m, "xyzw"[m],
+                               swizzle);
+        }
+        assert(false); /* FIXME: Untested */
+        return;
+    }
+
+    /* Tweening */
+    if (count != 2) {
+        /* FIXME: Not sure how blend weights are calculated */
+        assert(false);
+        return;
+    }
+    qstring_append_fmt(str,
+                       "%s += mix((%s * %s1).%s,\n"
+                       "          (%s * %s0).%s, weight.x);\n",
+                       output,
+                       input, matrix, swizzle,
+                       input, matrix, swizzle);
+}
+
+/* String-literal builders for GLSL source text. The index is macro-expanded
+ * and then stringized, so an argument such as idx+1 ends up as text (for
+ * example "c[N+1]") and the addition is left to the GLSL compiler. */
+
+/* GLSL expression for entry idx of the array c / ltctxa */
+#define GLSL_C(idx) "c[" stringify(idx) "]"
+#define GLSL_LTCTXA(idx) "ltctxa[" stringify(idx) "]"
+
+/* GLSL mat4 constructor over the four consecutive entries c[idx]..c[idx+3] */
+#define GLSL_C_MAT4(idx) \
+    "mat4(" GLSL_C(idx) ", " GLSL_C(idx+1) ", " \
+            GLSL_C(idx+2) ", " GLSL_C(idx+3) ")"
+
+/* One GLSL "#define a b" line; b must already be a string literal */
+#define GLSL_DEFINE(a, b) "#define " stringify(a) " " b "\n"
+
+static void generate_fixed_function(const ShaderState state,
+                                    std::string *header, std::string *body)
+{
+    int i, j;
+
+    /* generate vertex shader mimicking fixed function */
+    header->append("#define position      v0\n"
+"#define weight        v1\n"
+"#define normal        v2.xyz\n"
+"#define diffuse       v3\n"
+"#define specular      v4\n"
+"#define fogCoord      v5.x\n"
+"#define pointSize     v6\n"
+"#define backDiffuse   v7\n"
+"#define backSpecular  v8\n"
+"#define texture0      v9\n"
+"#define texture1      v10\n"
+"#define texture2      v11\n"
+"#define texture3      v12\n"
+"#define reserved1     v13\n"
+"#define reserved2     v14\n"
+"#define reserved3     v15\n"
+"\n"
+"uniform vec4 ltctxa[" stringify(NV2A_LTCTXA_COUNT) "];\n"
+"uniform vec4 ltctxb[" stringify(NV2A_LTCTXB_COUNT) "];\n"
+"uniform vec4 ltc1[" stringify(NV2A_LTC1_COUNT) "];\n"
+"\n"
+GLSL_DEFINE(projectionMat, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_PMAT0))
+GLSL_DEFINE(compositeMat, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_CMAT0))
+"\n"
+GLSL_DEFINE(texPlaneS0, GLSL_C(NV_IGRAPH_XF_XFCTX_TG0MAT + 0))
+GLSL_DEFINE(texPlaneT0, GLSL_C(NV_IGRAPH_XF_XFCTX_TG0MAT + 1))
+GLSL_DEFINE(texPlaneQ0, GLSL_C(NV_IGRAPH_XF_XFCTX_TG0MAT + 2))
+GLSL_DEFINE(texPlaneR0, GLSL_C(NV_IGRAPH_XF_XFCTX_TG0MAT + 3))
+"\n"
+GLSL_DEFINE(texPlaneS1, GLSL_C(NV_IGRAPH_XF_XFCTX_TG1MAT + 0))
+GLSL_DEFINE(texPlaneT1, GLSL_C(NV_IGRAPH_XF_XFCTX_TG1MAT + 1))
+GLSL_DEFINE(texPlaneQ1, GLSL_C(NV_IGRAPH_XF_XFCTX_TG1MAT + 2))
+GLSL_DEFINE(texPlaneR1, GLSL_C(NV_IGRAPH_XF_XFCTX_TG1MAT + 3))
+"\n"
+GLSL_DEFINE(texPlaneS2, GLSL_C(NV_IGRAPH_XF_XFCTX_TG2MAT + 0))
+GLSL_DEFINE(texPlaneT2, GLSL_C(NV_IGRAPH_XF_XFCTX_TG2MAT + 1))
+GLSL_DEFINE(texPlaneQ2, GLSL_C(NV_IGRAPH_XF_XFCTX_TG2MAT + 2))
+GLSL_DEFINE(texPlaneR2, GLSL_C(NV_IGRAPH_XF_XFCTX_TG2MAT + 3))
+"\n"
+GLSL_DEFINE(texPlaneS3, GLSL_C(NV_IGRAPH_XF_XFCTX_TG3MAT + 0))
+GLSL_DEFINE(texPlaneT3, GLSL_C(NV_IGRAPH_XF_XFCTX_TG3MAT + 1))
+GLSL_DEFINE(texPlaneQ3, GLSL_C(NV_IGRAPH_XF_XFCTX_TG3MAT + 2))
+GLSL_DEFINE(texPlaneR3, GLSL_C(NV_IGRAPH_XF_XFCTX_TG3MAT + 3))
+"\n"
+GLSL_DEFINE(modelViewMat0, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_MMAT0))
+GLSL_DEFINE(modelViewMat1, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_MMAT1))
+GLSL_DEFINE(modelViewMat2, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_MMAT2))
+GLSL_DEFINE(modelViewMat3, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_MMAT3))
+"\n"
+GLSL_DEFINE(invModelViewMat0, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_IMMAT0))
+GLSL_DEFINE(invModelViewMat1, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_IMMAT1))
+GLSL_DEFINE(invModelViewMat2, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_IMMAT2))
+GLSL_DEFINE(invModelViewMat3, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_IMMAT3))
+"\n"
+GLSL_DEFINE(eyePosition, GLSL_C(NV_IGRAPH_XF_XFCTX_EYEP))
+"\n"
+"#define lightAmbientColor(i) "
+    "ltctxb[" stringify(NV_IGRAPH_XF_LTCTXB_L0_AMB) " + (i)*6].xyz\n"
+"#define lightDiffuseColor(i) "
+    "ltctxb[" stringify(NV_IGRAPH_XF_LTCTXB_L0_DIF) " + (i)*6].xyz\n"
+"#define lightSpecularColor(i) "
+    "ltctxb[" stringify(NV_IGRAPH_XF_LTCTXB_L0_SPC) " + (i)*6].xyz\n"
+"\n"
+"#define lightSpotFalloff(i) "
+    "ltctxa[" stringify(NV_IGRAPH_XF_LTCTXA_L0_K) " + (i)*2].xyz\n"
+"#define lightSpotDirection(i) "
+    "ltctxa[" stringify(NV_IGRAPH_XF_LTCTXA_L0_SPT) " + (i)*2]\n"
+"\n"
+"#define lightLocalRange(i) "
+    "ltc1[" stringify(NV_IGRAPH_XF_LTC1_r0) " + (i)].x\n"
+"\n"
+GLSL_DEFINE(sceneAmbientColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_FR_AMB) ".xyz")
+"\n"
+"uniform mat4 invViewport;\n"
+"\n");
+
+    /* Skinning */
+    unsigned int count;
+    bool mix;
+    switch (state.skinning) {
+    case SKINNING_OFF:
+        mix = false; count = 0; break;
+    case SKINNING_1WEIGHTS:
+        mix = true; count = 2; break;
+    case SKINNING_2WEIGHTS:
+        mix = true; count = 3; break;
+    case SKINNING_3WEIGHTS:
+        mix = true; count = 4; break;
+    case SKINNING_2WEIGHTS2MATRICES:
+        mix = false; count = 2; break;
+    case SKINNING_3WEIGHTS3MATRICES:
+        mix = false; count = 3; break;
+    case SKINNING_4WEIGHTS4MATRICES:
+        mix = false; count = 4; break;
+    default:
+        assert(false);
+        break;
+    }
+    qstring_append_fmt(body, "/* Skinning mode %d */\n",
+                       state.skinning);
+
+    append_skinning_code(body, mix, count, "vec4",
+                         "tPosition", "position",
+                         "modelViewMat", "xyzw");
+    append_skinning_code(body, mix, count, "vec3",
+                         "tNormal", "vec4(normal, 0.0)",
+                         "invModelViewMat", "xyz");
+
+    /* Normalization */
+    if (state.normalization) {
+        body->append("tNormal = normalize(tNormal);\n");
+    }
+
+    /* Texgen */
+    for (i = 0; i < NV2A_MAX_TEXTURES; i++) {
+        qstring_append_fmt(body, "/* Texgen for stage %d */\n",
+                           i);
+        /* Set each component individually */
+        /* FIXME: could be nicer if some channels share the same texgen */
+        for (j = 0; j < 4; j++) {
+            /* TODO: TexGen View Model missing! */
+            char c = "xyzw"[j];
+            char cSuffix = "STRQ"[j];
+            switch (state.texgen[i][j]) {
+            case TEXGEN_DISABLE:
+                qstring_append_fmt(body, "oT%d.%c = texture%d.%c;\n",
+                                   i, c, i, c);
+                break;
+            case TEXGEN_EYE_LINEAR:
+                qstring_append_fmt(body, "oT%d.%c = dot(texPlane%c%d, tPosition);\n",
+                                   i, c, cSuffix, i);
+                break;
+            case TEXGEN_OBJECT_LINEAR:
+                qstring_append_fmt(body, "oT%d.%c = dot(texPlane%c%d, position);\n",
+                                   i, c, cSuffix, i);
+                assert(false); /* Untested */
+                break;
+            case TEXGEN_SPHERE_MAP:
+                assert(i < 2);  /* Channels S,T only! */
+                body->append("{\n");
+                /* FIXME: u, r and m only have to be calculated once */
+                body->append("  vec3 u = normalize(tPosition.xyz);\n");
+                //FIXME: tNormal before or after normalization? Always normalize?
+                body->append("  vec3 r = reflect(u, tNormal);\n");
+
+                /* FIXME: This would consume 1 division fewer and *might* be
+                 *        faster than length:
+                 *   // [z=1/(2*x) => z=1/x*0.5]
+                 *   vec3 ro = r + vec3(0.0, 0.0, 1.0);
+                 *   float m = inversesqrt(dot(ro,ro))*0.5;
+                 */
+
+                body->append("  float invM = 1.0 / (2.0 * length(r + vec3(0.0, 0.0, 1.0)));\n");
+                qstring_append_fmt(body, "  oT%d.%c = r.%c * invM + 0.5;\n",
+                                   i, c, c);
+                body->append("}\n");
+                assert(false); /* Untested */
+                break;
+            case TEXGEN_REFLECTION_MAP:
+                assert(i < 3); /* Channels S,T,R only! */
+                body->append("{\n");
+                /* FIXME: u and r only have to be calculated once, can share the one from SPHERE_MAP */
+                body->append("  vec3 u = normalize(tPosition.xyz);\n");
+                body->append("  vec3 r = reflect(u, tNormal);\n");
+                qstring_append_fmt(body, "  oT%d.%c = r.%c;\n",
+                                   i, c, c);
+                body->append("}\n");
+                break;
+            case TEXGEN_NORMAL_MAP:
+                assert(i < 3); /* Channels S,T,R only! */
+                qstring_append_fmt(body, "oT%d.%c = tNormal.%c;\n",
+                                   i, c, c);
+                break;
+            default:
+                assert(false);
+                break;
+            }
+        }
+    }
+
+    /* Apply texture matrices */
+    for (i = 0; i < NV2A_MAX_TEXTURES; i++) {
+        if (state.texture_matrix_enable[i]) {
+            qstring_append_fmt(body,
+                               "oT%d = oT%d * texMat%d;\n",
+                               i, i, i);
+        }
+    }
+
+    /* Lighting */
+    if (state.lighting) {
+
+        //FIXME: Do 2 passes if we want 2 sided-lighting?
+        body->append("oD0 = vec4(sceneAmbientColor, diffuse.a);\n");
+        body->append("oD1 = vec4(0.0, 0.0, 0.0, specular.a);\n");
+
+        for (i = 0; i < NV2A_MAX_LIGHTS; i++) {
+            if (state.light[i] == LIGHT_OFF) {
+                continue;
+            }
+
+            /* FIXME: It seems that we only have to handle the surface colors if
+             *        they are not part of the material [= vertex colors].
+             *        If they are material the cpu will premultiply light
+             *        colors
+             */
+
+            qstring_append_fmt(body, "/* Light %d */ {\n", i);
+
+            if (state.light[i] == LIGHT_LOCAL
+                    || state.light[i] == LIGHT_SPOT) {
+
+                qstring_append_fmt(header,
+                    "uniform vec3 lightLocalPosition%d;\n"
+                    "uniform vec3 lightLocalAttenuation%d;\n",
+                    i, i);
+                qstring_append_fmt(body,
+                    "  vec3 VP = lightLocalPosition%d - tPosition.xyz/tPosition.w;\n"
+                    "  float d = length(VP);\n"
+//FIXME: if (d > lightLocalRange) { .. don't process this light .. } /* inclusive?! */ - what about directional lights?
+                    "  VP = normalize(VP);\n"
+                    "  float attenuation = 1.0 / (lightLocalAttenuation%d.x\n"
+                    "                               + lightLocalAttenuation%d.y * d\n"
+                    "                               + lightLocalAttenuation%d.z * d * d);\n"
+                    "  vec3 halfVector = normalize(VP + eyePosition.xyz / eyePosition.w);\n" /* FIXME: Not sure if eyePosition is correct */
+                    "  float nDotVP = max(0.0, dot(tNormal, VP));\n"
+                    "  float nDotHV = max(0.0, dot(tNormal, halfVector));\n",
+                    i, i, i, i);
+
+            }
+
+            switch(state.light[i]) {
+            case LIGHT_INFINITE:
+
+                /* lightLocalRange will be 1e+30 here */
+
+                qstring_append_fmt(header,
+                    "uniform vec3 lightInfiniteHalfVector%d;\n"
+                    "uniform vec3 lightInfiniteDirection%d;\n",
+                    i, i);
+                qstring_append_fmt(body,
+                    "  float attenuation = 1.0;\n"
+                    "  float nDotVP = max(0.0, dot(tNormal, normalize(vec3(lightInfiniteDirection%d))));\n"
+                    "  float nDotHV = max(0.0, dot(tNormal, vec3(lightInfiniteHalfVector%d)));\n",
+                    i, i);
+
+                /* FIXME: Do specular */
+
+                /* FIXME: tBackDiffuse */
+
+                break;
+            case LIGHT_LOCAL:
+                /* Everything done already */
+                break;
+            case LIGHT_SPOT:
+                assert(false);
+                /*FIXME: calculate falloff */
+                break;
+            default:
+                assert(false);
+                break;
+            }
+
+            qstring_append_fmt(body,
+                "  float pf;\n"
+                "  if (nDotVP == 0.0) {\n"
+                "    pf = 0.0;\n"
+                "  } else {\n"
+                "    pf = pow(nDotHV, /* specular(l, m, n, l1, m1, n1) */ 0.001);\n"
+                "  }\n"
+                "  vec3 lightAmbient = lightAmbientColor(%d) * attenuation;\n"
+                "  vec3 lightDiffuse = lightDiffuseColor(%d) * attenuation * nDotVP;\n"
+                "  vec3 lightSpecular = lightSpecularColor(%d) * pf;\n",
+                i, i, i);
+
+            body->append("  oD0.xyz += lightAmbient;\n");
+            body->append("  oD0.xyz += diffuse.xyz * lightDiffuse;\n");
+            body->append("  oD1.xyz += specular.xyz * lightSpecular;\n");
+            body->append("}\n");
+        }
+    } else {
+        body->append("  oD0 = diffuse;\n");
+        body->append("  oD1 = specular;\n");
+    }
+    body->append("  oB0 = backDiffuse;\n");
+    body->append("  oB1 = backSpecular;\n");
+
+    /* Fog */
+    if (state.fog_enable) {
+
+        /* From: https://www.opengl.org/registry/specs/NV/fog_distance.txt */
+        switch(state.foggen) {
+        case FOGGEN_SPEC_ALPHA:
+            /* FIXME: Do we have to clamp here? */
+            body->append("  float fogDistance = clamp(specular.a, 0.0, 1.0);\n");
+            break;
+        case FOGGEN_RADIAL:
+            body->append("  float fogDistance = length(tPosition.xyz);\n");
+            break;
+        case FOGGEN_PLANAR:
+        case FOGGEN_ABS_PLANAR:
+            body->append("  float fogDistance = dot(fogPlane.xyz, tPosition.xyz) + fogPlane.w;\n");
+            if (state.foggen == FOGGEN_ABS_PLANAR) {
+                body->append("  fogDistance = abs(fogDistance);\n");
+            }
+            break;
+        case FOGGEN_FOG_X:
+            body->append("  float fogDistance = fogCoord;\n");
+            break;
+        default:
+            assert(false);
+            break;
+        }
+
+    }
+
+    /* If skinning is off the composite matrix already includes the MV matrix */
+    if (state.skinning == SKINNING_OFF) {
+        body->append("  tPosition = position;\n");
+    }
+
+    body->append(
+        "   oPos = invViewport * (tPosition * compositeMat);\n"
+        "   oPos.z = oPos.z * 2.0 - oPos.w;\n");
+
+    body->append("  vtx.inv_w = 1.0 / oPos.w;\n");
+
+}
+/*
+ * Build the complete GLSL source of the vertex shader for `state`.
+ *
+ * The "header" string receives the #version line, the uniform declarations,
+ * the global output registers (oPos, oD0, ... oT3) with their default values,
+ * the VertexData struct, the output variable "<vtx_prefix>_vtx" (aliased to
+ * "vtx") and one "in vec4 v<n>" per vertex attribute.  The "body" string
+ * receives main(): either the fixed-function code or the translated vertex
+ * program, followed by the fog factor computation and the final copy of the
+ * output registers into vtx / gl_Position / gl_PointSize.
+ *
+ * state       shader state; fixed_function is tested before vertex_program,
+ *             and the function asserts if neither is set
+ * vtx_prefix  single character used to name the output variable
+ *
+ * Returns the heap-allocated header string with the body appended; the body
+ * string is deleted here.  generate_shaders() deletes the returned string.
+ */
+static std::string *generate_vertex_shader(const ShaderState state,
+                                       char vtx_prefix)
+{
+    int i;
+    std::string *header = new std::string(
+"#version 330\n"
+"\n"
+"uniform vec2 clipRange;\n"
+"uniform vec2 surfaceSize;\n"
+"\n"
+/* All constants in 1 array declaration */
+"uniform vec4 c[" stringify(NV2A_VERTEXSHADER_CONSTANTS) "];\n"
+"\n"
+"uniform vec4 fogColor;\n"
+"uniform float fogParam[2];\n"
+"\n"
+
+GLSL_DEFINE(fogPlane, GLSL_C(NV_IGRAPH_XF_XFCTX_FOG))
+GLSL_DEFINE(texMat0, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T0MAT))
+GLSL_DEFINE(texMat1, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T1MAT))
+GLSL_DEFINE(texMat2, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T2MAT))
+GLSL_DEFINE(texMat3, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T3MAT))
+/* Output registers are plain globals with defaults; the end of main() copies them into vtx and gl_* */
+"\n"
+"vec4 oPos = vec4(0.0,0.0,0.0,1.0);\n"
+"vec4 oD0 = vec4(0.0,0.0,0.0,1.0);\n"
+"vec4 oD1 = vec4(0.0,0.0,0.0,1.0);\n"
+"vec4 oB0 = vec4(0.0,0.0,0.0,1.0);\n"
+"vec4 oB1 = vec4(0.0,0.0,0.0,1.0);\n"
+"vec4 oPts = vec4(0.0,0.0,0.0,1.0);\n"
+/* FIXME: NV_vertex_program says: "FOGC is the transformed vertex's fog
+ * coordinate. The register's first floating-point component is interpolated
+ * across the assembled primitive during rasterization and used as the fog
+ * distance to compute per-fragment the fog factor when fog is enabled.
+ * However, if both fog and vertex program mode are enabled, but the FOGC
+ * vertex result register is not written, the fog factor is overridden to
+ * 1.0. The register's other three components are ignored."
+ *
+ * That probably means it will read back as vec4(0.0, 0.0, 0.0, 1.0) but
+ * will be set to 1.0 AFTER the VP if it was never written?
+ * We should test on real hardware..
+ *
+ * We'll force 1.0 for oFog.x for now.
+ */
+"vec4 oFog = vec4(1.0,0.0,0.0,1.0);\n"
+"vec4 oT0 = vec4(0.0,0.0,0.0,1.0);\n"
+"vec4 oT1 = vec4(0.0,0.0,0.0,1.0);\n"
+"vec4 oT2 = vec4(0.0,0.0,0.0,1.0);\n"
+"vec4 oT3 = vec4(0.0,0.0,0.0,1.0);\n"
+"\n"
+STRUCT_VERTEX_DATA);
+    /* Declare the output as "<vtx_prefix>_vtx" and alias it to "vtx" so the generated code never spells the prefix */
+    qstring_append_fmt(header, "noperspective out VertexData %c_vtx;\n",
+                       vtx_prefix);
+    qstring_append_fmt(header, "#define vtx %c_vtx\n",
+                       vtx_prefix);
+    header->append("\n");
+    for(i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
+        qstring_append_fmt(header, "in vec4 v%d;\n", i);
+    }
+    header->append("\n");
+    /* Header and body stay separate: generate_fixed_function() adds uniform declarations to header while emitting statements into body */
+    std::string *body = new std::string("void main() {\n");
+
+    if (state.fixed_function) {
+        generate_fixed_function(state, header, body);
+
+    } else if (state.vertex_program) {
+        vsh_translate(VSH_VERSION_XVS,
+                      (uint32_t*)state.program_data,
+                      state.program_length,
+                      state.z_perspective,
+                      header, body);
+    } else {
+        assert(false);
+    }
+
+
+    /* Fog */
+
+    if (state.fog_enable) {
+
+        if (state.vertex_program) {
+            /* FIXME: Does foggen do something here? Let's do some tracking..
+             *
+             *   "RollerCoaster Tycoon" has
+             *      state.vertex_program = true; state.foggen == FOGGEN_PLANAR
+             *      but expects oFog.x as fogdistance?! Writes oFog.xyzw = v0.z
+             */
+            body->append("  float fogDistance = oFog.x;\n");
+        }
+        /* On the fixed-function path generate_fixed_function() has already declared fogDistance */
+        /* FIXME: Do this per pixel? */
+
+        switch (state.fog_mode) {
+        case FOG_MODE_LINEAR:
+        case FOG_MODE_LINEAR_ABS:
+
+            /* f = (end - d) / (end - start)
+             *    fogParam[1] = 1 / (end - start)
+             *    fogParam[0] = 1 + end * fogParam[1];
+             */
+
+            body->append("  float fogFactor = fogParam[0] + fogDistance * fogParam[1];\n");
+            body->append("  fogFactor -= 1.0;\n"); /* FIXME: WHHYYY?!! */
+            break;
+        case FOG_MODE_EXP:
+        case FOG_MODE_EXP_ABS:
+
+            /* f = 1 / (e^(d * density))
+             *    fogParam[1] = -density / (2 * ln(256))
+             *    fogParam[0] = 1.5
+             */
+
+            body->append("  float fogFactor = fogParam[0] + exp2(fogDistance * fogParam[1] * 16.0);\n");
+            body->append("  fogFactor -= 1.5;\n"); /* FIXME: WHHYYY?!! */
+            break;
+        case FOG_MODE_EXP2:
+        case FOG_MODE_EXP2_ABS:
+
+            /* f = 1 / (e^((d * density)^2))
+             *    fogParam[1] = -density / (2 * sqrt(ln(256)))
+             *    fogParam[0] = 1.5
+             */
+
+            body->append("  float fogFactor = fogParam[0] + exp2(-fogDistance * fogDistance * fogParam[1] * fogParam[1] * 32.0);\n");
+            body->append("  fogFactor -= 1.5;\n"); /* FIXME: WHHYYY?!! */
+            break;
+        default:
+            assert(false);
+            break;
+        }
+        /* Calculate absolute for the modes which need it */
+        switch (state.fog_mode) {
+        case FOG_MODE_LINEAR_ABS:
+        case FOG_MODE_EXP_ABS:
+        case FOG_MODE_EXP2_ABS:
+            body->append("  fogFactor = abs(fogFactor);\n");
+            break;
+        default:
+            break;
+        }
+        /* FIXME: What about fog alpha?! */
+        body->append("  oFog.xyzw = vec4(fogFactor);\n");
+    } else {
+        /* FIXME: Is the fog still calculated / passed somehow?!
+         */
+        body->append("  oFog.xyzw = vec4(1.0);\n");
+    }
+    /* Every varying is pre-multiplied by vtx.inv_w, matching the "noperspective" qualifier on the output declared above */
+    /* Set outputs */
+    body->append("\n"
+                      "  vtx.D0 = clamp(oD0, 0.0, 1.0) * vtx.inv_w;\n"
+                      "  vtx.D1 = clamp(oD1, 0.0, 1.0) * vtx.inv_w;\n"
+                      "  vtx.B0 = clamp(oB0, 0.0, 1.0) * vtx.inv_w;\n"
+                      "  vtx.B1 = clamp(oB1, 0.0, 1.0) * vtx.inv_w;\n"
+                      "  vtx.Fog = oFog.x * vtx.inv_w;\n"
+                      "  vtx.T0 = oT0 * vtx.inv_w;\n"
+                      "  vtx.T1 = oT1 * vtx.inv_w;\n"
+                      "  vtx.T2 = oT2 * vtx.inv_w;\n"
+                      "  vtx.T3 = oT3 * vtx.inv_w;\n"
+                      "  gl_Position = oPos;\n"
+                      "  gl_PointSize = oPts.x;\n"
+                      "\n"
+                      "}\n");
+
+
+    /* Return combined header + source */
+    header->append(qstring_get_str(body));
+    delete body;
+    return header;
+}
+
+/**
+ * Compile one GLSL shader stage.
+ *
+ * @param gl_shader_type  GL stage enum passed to glCreateShader().
+ * @param code            NUL-terminated GLSL source text.
+ * @param name            Human-readable stage name, used in log messages only.
+ * @return                The compiled shader object.  If compilation fails,
+ *                        the driver's info log (when there is one) is printed
+ *                        to stderr and the process is terminated with abort().
+ */
+static GLuint create_gl_shader(GLenum gl_shader_type,
+                               const char *code,
+                               const char *name)
+{
+    NV2A_GL_DGROUP_BEGIN("Creating new %s", name);
+
+    NV2A_DPRINTF("compile new %s, code:\n%s\n", name, code);
+
+    GLuint shader = glCreateShader(gl_shader_type);
+    glShaderSource(shader, 1, &code, 0);
+    glCompileShader(shader);
+
+    /* Check it compiled */
+    GLint compiled = 0;
+    glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
+    if (!compiled) {
+        /* A driver may report a zero-length log, and the allocation itself
+         * can fail; in both cases print a placeholder instead of handing a
+         * NULL pointer to the "%s" conversion. */
+        GLint log_length = 0;
+        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length);
+
+        GLchar *log = NULL;
+        if (log_length > 0) {
+            /* calloc so the buffer is a valid empty string even if the
+             * driver writes nothing into it */
+            log = (GLchar*)calloc((size_t)log_length, sizeof(GLchar));
+        }
+        if (log) {
+            glGetShaderInfoLog(shader, log_length, NULL, log);
+        }
+        fprintf(stderr, "nv2a: %s compilation failed: %s\n", name,
+                log ? log : "(no info log available)");
+        free(log);
+
+        NV2A_GL_DGROUP_END();
+        abort();
+    }
+
+    NV2A_GL_DGROUP_END();
+
+    return shader;
+}
+
+/**
+ * Build the GL program that implements `state` and look up its uniforms.
+ *
+ * Generates and compiles an optional geometry shader, the vertex shader and
+ * the fragment shader, binds the vertex attributes "v0".."vN" to locations
+ * 0..N, links and validates the program, points every "texSamp<n>" sampler
+ * at texture unit n, and caches the uniform locations in a new ShaderBinding.
+ * The program is left current (glUseProgram) on return.
+ *
+ * A compile, link, validation or allocation failure is reported on stderr
+ * and terminates the process with abort().
+ *
+ * @param state  Pipeline state to generate shaders for.
+ * @return       calloc()-allocated binding.  Locations of uniforms that the
+ *               linked program does not contain are -1.
+ */
+ShaderBinding* generate_shaders(const ShaderState state)
+{
+    int i, j;
+    char tmp[64];
+
+    char vtx_prefix;
+    GLuint program = glCreateProgram();
+
+    /* Create an optional geometry shader and find primitive type */
+
+    GLenum gl_primitive_mode;
+    GLuint geometry_shader = 0; /* 0 = no geometry stage; glDeleteShader(0) is a no-op */
+    std::string* geometry_shader_code =
+        generate_geometry_shader(state.polygon_front_mode,
+                                 state.polygon_back_mode,
+                                 state.primitive_mode,
+                                 &gl_primitive_mode);
+    if (geometry_shader_code) {
+        geometry_shader = create_gl_shader(GL_GEOMETRY_SHADER,
+                                           qstring_get_str(geometry_shader_code),
+                                           "geometry shader");
+        glAttachShader(program, geometry_shader);
+
+        delete geometry_shader_code;
+
+        /* NOTE(review): the prefix names the vertex stage's output variable;
+         * presumably 'v' feeds the geometry stage and 'g' feeds the fragment
+         * stage directly -- confirm against those generators. */
+        vtx_prefix = 'v';
+    } else {
+        vtx_prefix = 'g';
+    }
+
+    /* create the vertex shader */
+
+    std::string *vertex_shader_code = generate_vertex_shader(state, vtx_prefix);
+    GLuint vertex_shader = create_gl_shader(GL_VERTEX_SHADER,
+                                            qstring_get_str(vertex_shader_code),
+                                            "vertex shader");
+    glAttachShader(program, vertex_shader);
+    delete vertex_shader_code;
+
+
+    /* Bind attributes for vertices (must happen before linking) */
+    for(i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
+        snprintf(tmp, sizeof(tmp), "v%d", i);
+        glBindAttribLocation(program, i, tmp);
+    }
+
+
+    /* generate a fragment shader from register combiners */
+
+    std::string *fragment_shader_code = psh_translate(state.psh);
+    GLuint fragment_shader = create_gl_shader(GL_FRAGMENT_SHADER,
+                                              qstring_get_str(fragment_shader_code),
+                                              "fragment shader");
+    glAttachShader(program, fragment_shader);
+
+    delete fragment_shader_code;
+
+
+    /* link the program */
+    glLinkProgram(program);
+    GLint linked = 0;
+    glGetProgramiv(program, GL_LINK_STATUS, &linked);
+    if(!linked) {
+        GLchar log[2048] = "";
+        glGetProgramInfoLog(program, (GLsizei)sizeof(log), NULL, log);
+        fprintf(stderr, "nv2a: shader linking failed: %s\n", log);
+        abort();
+    }
+
+    /* The shader names are not stored anywhere, so nobody could delete them
+     * later.  Flag them for deletion now: they remain attached and GL frees
+     * them together with the program. */
+    glDeleteShader(geometry_shader);
+    glDeleteShader(vertex_shader);
+    glDeleteShader(fragment_shader);
+
+    glUseProgram(program);
+
+    /* set texture samplers: sampler n reads from texture unit n */
+    for (i = 0; i < NV2A_MAX_TEXTURES; i++) {
+        char samplerName[16];
+        snprintf(samplerName, sizeof(samplerName), "texSamp%d", i);
+        GLint texSampLoc = glGetUniformLocation(program, samplerName);
+        if (texSampLoc >= 0) {
+            glUniform1i(texSampLoc, i);
+        }
+    }
+
+    /* validate the program (after the samplers are assigned) */
+    glValidateProgram(program);
+    GLint valid = 0;
+    glGetProgramiv(program, GL_VALIDATE_STATUS, &valid);
+    if (!valid) {
+        GLchar log[1024] = "";
+        glGetProgramInfoLog(program, (GLsizei)sizeof(log), NULL, log);
+        fprintf(stderr, "nv2a: shader validation failed: %s\n", log);
+        abort();
+    }
+
+    ShaderBinding* ret = (ShaderBinding*)calloc(1, sizeof(ShaderBinding));
+    if (!ret) {
+        fprintf(stderr, "nv2a: out of memory allocating shader binding\n");
+        abort();
+    }
+    ret->gl_program = program;
+    ret->gl_primitive_mode = gl_primitive_mode;
+
+    /* lookup fragment shader uniforms */
+    for (i=0; i<=8; i++) {
+        for (j=0; j<2; j++) {
+            snprintf(tmp, sizeof(tmp), "c_%d_%d", i, j);
+            ret->psh_constant_loc[i][j] = glGetUniformLocation(program, tmp);
+        }
+    }
+    ret->alpha_ref_loc = glGetUniformLocation(program, "alphaRef");
+
+    /* Bump uniforms are only looked up for stages 1 and up.  Mark slot 0 as
+     * "not found" explicitly: calloc's 0 is a valid uniform location. */
+    ret->bump_mat_loc[0] = -1;
+    ret->bump_scale_loc[0] = -1;
+    ret->bump_offset_loc[0] = -1;
+    for (i = 1; i < NV2A_MAX_TEXTURES; i++) {
+        snprintf(tmp, sizeof(tmp), "bumpMat%d", i);
+        ret->bump_mat_loc[i] = glGetUniformLocation(program, tmp);
+        snprintf(tmp, sizeof(tmp), "bumpScale%d", i);
+        ret->bump_scale_loc[i] = glGetUniformLocation(program, tmp);
+        snprintf(tmp, sizeof(tmp), "bumpOffset%d", i);
+        ret->bump_offset_loc[i] = glGetUniformLocation(program, tmp);
+    }
+
+    /* lookup vertex shader uniforms */
+    for(i = 0; i < NV2A_VERTEXSHADER_CONSTANTS; i++) {
+        snprintf(tmp, sizeof(tmp), "c[%d]", i);
+        ret->vsh_constant_loc[i] = glGetUniformLocation(program, tmp);
+    }
+    ret->surface_size_loc = glGetUniformLocation(program, "surfaceSize");
+    ret->clip_range_loc = glGetUniformLocation(program, "clipRange");
+    ret->fog_color_loc = glGetUniformLocation(program, "fogColor");
+    ret->fog_param_loc[0] = glGetUniformLocation(program, "fogParam[0]");
+    ret->fog_param_loc[1] = glGetUniformLocation(program, "fogParam[1]");
+
+    /* lookup fixed-function uniforms */
+    ret->inv_viewport_loc = glGetUniformLocation(program, "invViewport");
+    for (i = 0; i < NV2A_LTCTXA_COUNT; i++) {
+        snprintf(tmp, sizeof(tmp), "ltctxa[%d]", i);
+        ret->ltctxa_loc[i] = glGetUniformLocation(program, tmp);
+    }
+    for (i = 0; i < NV2A_LTCTXB_COUNT; i++) {
+        snprintf(tmp, sizeof(tmp), "ltctxb[%d]", i);
+        ret->ltctxb_loc[i] = glGetUniformLocation(program, tmp);
+    }
+    for (i = 0; i < NV2A_LTC1_COUNT; i++) {
+        snprintf(tmp, sizeof(tmp), "ltc1[%d]", i);
+        ret->ltc1_loc[i] = glGetUniformLocation(program, tmp);
+    }
+    for (i = 0; i < NV2A_MAX_LIGHTS; i++) {
+        snprintf(tmp, sizeof(tmp), "lightInfiniteHalfVector%d", i);
+        ret->light_infinite_half_vector_loc[i] = glGetUniformLocation(program, tmp);
+        snprintf(tmp, sizeof(tmp), "lightInfiniteDirection%d", i);
+        ret->light_infinite_direction_loc[i] = glGetUniformLocation(program, tmp);
+
+        snprintf(tmp, sizeof(tmp), "lightLocalPosition%d", i);
+        ret->light_local_position_loc[i] = glGetUniformLocation(program, tmp);
+        snprintf(tmp, sizeof(tmp), "lightLocalAttenuation%d", i);
+        ret->light_local_attenuation_loc[i] = glGetUniformLocation(program, tmp);
+    }
+
+    return ret;
+}
--- a/src/devices/video/nv2a_shaders.h
+++ b/src/devices/video/nv2a_shaders.h
@ -0,0 +1,115 @@
+/*
+ * QEMU Geforce NV2A shader generator
+ *
+ * Copyright (c) 2015 espes
+ * Copyright (c) 2015 Jannik Vogel
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_NV2A_SHADERS_H
+#define HW_NV2A_SHADERS_H
+
+//#include <SDL.h>
+#include <GL/glew.h>
+
+#include "nv2a_vsh.h"
+#include "nv2a_psh.h"
+#include "nv2a_int.h"
+
+/* Primitive kind stored in ShaderState::primitive_mode; generate_shaders() passes it to generate_geometry_shader(). */
+enum ShaderPrimitiveMode {
+    PRIM_TYPE_NONE,
+    PRIM_TYPE_POINTS,
+    PRIM_TYPE_LINES,
+    PRIM_TYPE_LINE_LOOP,
+    PRIM_TYPE_LINE_STRIP,
+    PRIM_TYPE_TRIANGLES,
+    PRIM_TYPE_TRIANGLE_STRIP,
+    PRIM_TYPE_TRIANGLE_FAN,
+    PRIM_TYPE_QUADS,
+    PRIM_TYPE_QUAD_STRIP,
+    PRIM_TYPE_POLYGON,
+};
+/* Polygon mode stored in ShaderState::polygon_front_mode / polygon_back_mode; also an input of generate_geometry_shader(). */
+enum ShaderPolygonMode {
+    POLY_MODE_FILL,
+    POLY_MODE_POINT,
+    POLY_MODE_LINE,
+};
+/* Everything the shader generators read to build one GL program; generate_shaders() takes it by value. */
+typedef struct ShaderState {
+    PshState psh;  /* fragment stage state, handed to psh_translate() */
+
+    bool texture_matrix_enable[4];  /* per stage n: emit "oT<n> = oT<n> * texMat<n>" */
+    enum VshTexgen texgen[4][4];  /* indexed [stage][component], components in x,y,z,w order */
+
+    bool fog_enable;
+    enum VshFoggen foggen;  /* fixed function: how fogDistance is derived */
+    enum VshFogMode fog_mode;  /* formula that turns fogDistance into the fog factor */
+
+    enum VshSkinning skinning;
+
+    bool normalization;  /* fixed function: normalize tNormal */
+
+    bool lighting;
+    enum VshLight light[NV2A_MAX_LIGHTS];  /* LIGHT_OFF entries are skipped by the generator */
+
+    bool fixed_function;  /* tested before vertex_program by generate_vertex_shader() */
+
+    /* vertex program */
+    bool vertex_program;
+    uint32_t program_data[NV2A_MAX_TRANSFORM_PROGRAM_LENGTH][VSH_TOKEN_SIZE];  /* passed to vsh_translate() */
+    int program_length;
+    bool z_perspective;
+
+    /* primitive format for geometry shader */
+    enum ShaderPolygonMode polygon_front_mode;
+    enum ShaderPolygonMode polygon_back_mode;
+    enum ShaderPrimitiveMode primitive_mode;
+} ShaderState;
+/* A linked GL program together with the uniform locations that generate_shaders() resolved for it. */
+typedef struct ShaderBinding {
+    GLuint gl_program;
+    GLenum gl_primitive_mode;  /* value written by generate_geometry_shader() through its out parameter */
+
+    GLint psh_constant_loc[9][2];  /* uniforms "c_<i>_<j>" */
+    GLint alpha_ref_loc;  /* uniform "alphaRef" */
+
+    GLint bump_mat_loc[NV2A_MAX_TEXTURES];  /* "bumpMat<n>"; generate_shaders() only queries n >= 1 */
+    GLint bump_scale_loc[NV2A_MAX_TEXTURES];  /* "bumpScale<n>", n >= 1 */
+    GLint bump_offset_loc[NV2A_MAX_TEXTURES];  /* "bumpOffset<n>", n >= 1 */
+
+    GLint surface_size_loc;  /* "surfaceSize" */
+    GLint clip_range_loc;  /* "clipRange" */
+
+    GLint vsh_constant_loc[NV2A_VERTEXSHADER_CONSTANTS];  /* "c[<n>]" */
+
+    GLint inv_viewport_loc;  /* "invViewport" */
+    GLint ltctxa_loc[NV2A_LTCTXA_COUNT];  /* "ltctxa[<n>]" */
+    GLint ltctxb_loc[NV2A_LTCTXB_COUNT];  /* "ltctxb[<n>]" */
+    GLint ltc1_loc[NV2A_LTC1_COUNT];  /* "ltc1[<n>]" */
+
+    GLint fog_color_loc;  /* "fogColor" */
+    GLint fog_param_loc[2];  /* "fogParam[0]" and "fogParam[1]" */
+    GLint light_infinite_half_vector_loc[NV2A_MAX_LIGHTS];  /* "lightInfiniteHalfVector<n>" */
+    GLint light_infinite_direction_loc[NV2A_MAX_LIGHTS];  /* "lightInfiniteDirection<n>" */
+    GLint light_local_position_loc[NV2A_MAX_LIGHTS];  /* "lightLocalPosition<n>" */
+    GLint light_local_attenuation_loc[NV2A_MAX_LIGHTS];  /* "lightLocalAttenuation<n>" */
+
+} ShaderBinding;
+/* Compiles, links and validates a GL program for `state`; returns a calloc()-allocated binding and abort()s on compile, link or validation failure. */
+ShaderBinding* generate_shaders(const ShaderState state);
+
+#endif
--- a/src/devices/video/nv2a_shaders_common.h
+++ b/src/devices/video/nv2a_shaders_common.h
@ -0,0 +1,37 @@
+/*
+ * QEMU Geforce NV2A shader common definitions
+ *
+ * Copyright (c) 2015 espes
+ * Copyright (c) 2015 Jannik Vogel
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_NV2A_SHADERS_COMMON_H
+#define HW_NV2A_SHADERS_COMMON_H
+/* GLSL source text declaring struct VertexData; generate_vertex_shader() pastes it into the vertex shader header. */
+#define STRUCT_VERTEX_DATA "struct VertexData {\n" \
+                           "  float inv_w;\n" \
+                           "  vec4 D0;\n" \
+                           "  vec4 D1;\n" \
+                           "  vec4 B0;\n" \
+                           "  vec4 B1;\n" \
+                           "  float Fog;\n" \
+                           "  vec4 T0;\n" \
+                           "  vec4 T1;\n" \
+                           "  vec4 T2;\n" \
+                           "  vec4 T3;\n" \
+                           "};\n"
+
+#endif
--- a/src/devices/video/nv2a_vsh.cpp
+++ b/src/devices/video/nv2a_vsh.cpp
@ -0,0 +1,772 @@
+/*
+ * QEMU Geforce NV2A vertex shader translation
+ *
+ * Copyright (c) 2014 Jannik Vogel
+ * Copyright (c) 2012 espes
+ *
+ * Based on:
+ * Cxbx, VertexShader.cpp
+ * Copyright (c) 2004 Aaron Robinson <caustik@caustik.com>
+ *                    Kingofc <kingofc@freenet.de>
+ * Dxbx, uPushBuffer.pas
+ * Copyright (c) 2007 Shadow_tj, PatrickvL
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <string>
+
+#include "nv2a_shaders_common.h"
+#include "nv2a_vsh.h"
+
+/* Offset that convert_c_register() adds to the decoded constant register
+ * number (documented there as mapping -96..95 onto 0..191). */
+#define VSH_D3DSCM_CORRECTION 96
+
+/* Source selector for an instruction input, taken from the 2-bit
+ * FLD_A_MUX / FLD_B_MUX / FLD_C_MUX fields. decode_opcode_input() prints
+ * PARAM_R as "R<n>", PARAM_V as "v<n>" and PARAM_C as "c[<n>]"; any other
+ * value (including PARAM_UNKNOWN) is reported as an error there. */
+typedef enum {
+    PARAM_UNKNOWN = 0,
+    PARAM_R,
+    PARAM_V,
+    PARAM_C
+} VshParameterType;
+
+/* Value of the 1-bit FLD_OUT_ORB field. decode_opcode() writes to "c<n>"
+ * for OUTPUT_C and to an entry of out_reg_name[] otherwise. */
+typedef enum {
+    OUTPUT_C = 0,
+    OUTPUT_O
+} VshOutputType;
+
+/* Value of the 1-bit FLD_OUT_MUX field: tells decode_opcode() whether the
+ * MAC or the ILU result is the one that is also written to the output
+ * register selected by FLD_OUT_ADDRESS. */
+typedef enum {
+    OMUX_MAC = 0,
+    OMUX_ILU
+} VshOutputMux;
+
+/* ILU opcodes as stored in the 3-bit FLD_ILU field. The numeric values are
+ * used directly as indices into ilu_opcode[] and ilu_force_scalar[], so the
+ * order here must match those tables. */
+typedef enum {
+    ILU_NOP = 0,
+    ILU_MOV,
+    ILU_RCP,
+    ILU_RCC,
+    ILU_RSQ,
+    ILU_EXP,
+    ILU_LOG,
+    ILU_LIT
+} VshILU;
+
+/* MAC opcodes as stored in the 4-bit FLD_MAC field. The numeric values
+ * (MAC_NOP is 0) index mac_opcode_params[] and mac_opcode[], so the order
+ * here must match those tables. Only 14 of the 16 encodable values are
+ * named. */
+typedef enum {
+    MAC_NOP,
+    MAC_MOV,
+    MAC_MUL,
+    MAC_ADD,
+    MAC_MAD,
+    MAC_DP3,
+    MAC_DPH,
+    MAC_DP4,
+    MAC_DST,
+    MAC_MIN,
+    MAC_MAX,
+    MAC_SLT,
+    MAC_SGE,
+    MAC_ARL
+} VshMAC;
+
+/* Component selector stored in each 2-bit *_SWZ_* field. decode_swizzle()
+ * uses the value as an index into the string "xyzw". */
+typedef enum {
+    SWIZZLE_X = 0,
+    SWIZZLE_Y,
+    SWIZZLE_Z,
+    SWIZZLE_W
+} VshSwizzle;
+
+
+/* Describes where one bit-field lives inside a 4-DWORD instruction token. */
+typedef struct VshFieldMapping {
+    VshFieldName field_name; /* field this row describes */
+    uint8_t subtoken;        /* index of the DWORD within the token */
+    uint8_t start_bit;       /* position of the field's lowest bit */
+    uint8_t bit_length;      /* width of the field in bits */
+} VshFieldMapping;
+
+/* Bit layout of every instruction field. vsh_get_field() indexes this table
+ * directly with the VshFieldName value, so the rows must stay in exactly the
+ * same order as the enumerators of VshFieldName. */
+static const VshFieldMapping field_mapping[] = {
+    // Field Name         DWORD BitPos BitSize
+    {  FLD_ILU,              1,   25,     3 },
+    {  FLD_MAC,              1,   21,     4 },
+    {  FLD_CONST,            1,   13,     8 },
+    {  FLD_V,                1,    9,     4 },
+    // INPUT A
+    {  FLD_A_NEG,            1,    8,     1 },
+    {  FLD_A_SWZ_X,          1,    6,     2 },
+    {  FLD_A_SWZ_Y,          1,    4,     2 },
+    {  FLD_A_SWZ_Z,          1,    2,     2 },
+    {  FLD_A_SWZ_W,          1,    0,     2 },
+    {  FLD_A_R,              2,   28,     4 },
+    {  FLD_A_MUX,            2,   26,     2 },
+    // INPUT B
+    {  FLD_B_NEG,            2,   25,     1 },
+    {  FLD_B_SWZ_X,          2,   23,     2 },
+    {  FLD_B_SWZ_Y,          2,   21,     2 },
+    {  FLD_B_SWZ_Z,          2,   19,     2 },
+    {  FLD_B_SWZ_W,          2,   17,     2 },
+    {  FLD_B_R,              2,   13,     4 },
+    {  FLD_B_MUX,            2,   11,     2 },
+    // INPUT C
+    {  FLD_C_NEG,            2,   10,     1 },
+    {  FLD_C_SWZ_X,          2,    8,     2 },
+    {  FLD_C_SWZ_Y,          2,    6,     2 },
+    {  FLD_C_SWZ_Z,          2,    4,     2 },
+    {  FLD_C_SWZ_W,          2,    2,     2 },
+    // The register number of input C straddles two DWORDs: decode_token()
+    // recombines it as (HIGH << 2) | LOW.
+    {  FLD_C_R_HIGH,         2,    0,     2 },
+    {  FLD_C_R_LOW,          3,   30,     2 },
+    {  FLD_C_MUX,            3,   28,     2 },
+    // Output
+    {  FLD_OUT_MAC_MASK,     3,   24,     4 },
+    {  FLD_OUT_R,            3,   20,     4 },
+    {  FLD_OUT_ILU_MASK,     3,   16,     4 },
+    {  FLD_OUT_O_MASK,       3,   12,     4 },
+    {  FLD_OUT_ORB,          3,   11,     1 },
+    {  FLD_OUT_ADDRESS,      3,    3,     8 },
+    {  FLD_OUT_MUX,          3,    2,     1 },
+    // Other
+    {  FLD_A0X,              3,    1,     1 },
+    {  FLD_FINAL,            3,    0,     1 }
+};
+
+
+/* Records which of the three instruction inputs an opcode consumes. */
+typedef struct VshOpcodeParams {
+    bool A; /* opcode reads input A */
+    bool B; /* opcode reads input B */
+    bool C; /* opcode reads input C */
+} VshOpcodeParams;
+
+/* Compiled out: every ILU opcode other than NOP reads only input C, and
+ * decode_token() hard-codes that instead of consulting this table. */
+#if 0
+static const VshOpcodeParams ilu_opcode_params[] = {
+    /* ILU OP       ParamA ParamB ParamC */
+    /* ILU_NOP */ { false, false, false }, // Dxbx note : Unused
+    /* ILU_MOV */ { false, false, true  },
+    /* ILU_RCP */ { false, false, true  },
+    /* ILU_RCC */ { false, false, true  },
+    /* ILU_RSQ */ { false, false, true  },
+    /* ILU_EXP */ { false, false, true  },
+    /* ILU_LOG */ { false, false, true  },
+    /* ILU_LIT */ { false, false, true  },
+};
+#endif
+
+/* Inputs consumed by each MAC opcode, indexed by VshMAC value. decode_token()
+ * uses it to decide which operands to print. Note that ADD takes A and C
+ * (not B). The table has 14 rows; FLD_MAC can encode 16 values. */
+static const VshOpcodeParams mac_opcode_params[] = {
+    /* MAC OP      ParamA  ParamB ParamC */
+    /* MAC_NOP */ { false, false, false }, // Dxbx note : Unused
+    /* MAC_MOV */ { true,  false, false },
+    /* MAC_MUL */ { true,  true,  false },
+    /* MAC_ADD */ { true,  false, true  },
+    /* MAC_MAD */ { true,  true,  true  },
+    /* MAC_DP3 */ { true,  true,  false },
+    /* MAC_DPH */ { true,  true,  false },
+    /* MAC_DP4 */ { true,  true,  false },
+    /* MAC_DST */ { true,  true,  false },
+    /* MAC_MIN */ { true,  true,  false },
+    /* MAC_MAX */ { true,  true,  false },
+    /* MAC_SLT */ { true,  true,  false },
+    /* MAC_SGE */ { true,  true,  false },
+    /* MAC_ARL */ { true,  false, false },
+};
+
+
+/* Write-mask text indexed by a 4-bit mask (bit 3 = x ... bit 0 = w). Every
+ * entry starts with a comma because decode_opcode() pastes it straight after
+ * the destination register, turning it into the separate `mask` argument of
+ * the GLSL macros defined in vsh_header. */
+static const char* mask_str[] = {
+            // xyzw xyzw
+    ",",     // 0000 ____
+    ",w",   // 0001 ___w
+    ",z",   // 0010 __z_
+    ",zw",  // 0011 __zw
+    ",y",   // 0100 _y__
+    ",yw",  // 0101 _y_w
+    ",yz",  // 0110 _yz_
+    ",yzw", // 0111 _yzw
+    ",x",   // 1000 x___
+    ",xw",  // 1001 x__w
+    ",xz",  // 1010 x_z_
+    ",xzw", // 1011 x_zw
+    ",xy",  // 1100 xy__
+    ",xyw", // 1101 xy_w
+    ",xyz", // 1110 xyz_
+    ",xyzw" // 1111 xyzw
+};
+
+/* Note: OpenGL seems to be case-sensitive, and requires upper-case opcodes! */
+/* Printed name of each MAC opcode, indexed by VshMAC value. The names match
+ * the macros defined in vsh_header. The ARL entry is never emitted verbatim:
+ * decode_opcode() recognises it with strcmp() and prints "ARL(A0...)"
+ * instead. */
+static const char* mac_opcode[] = {
+    "NOP",
+    "MOV",
+    "MUL",
+    "ADD",
+    "MAD",
+    "DP3",
+    "DPH",
+    "DP4",
+    "DST",
+    "MIN",
+    "MAX",
+    "SLT",
+    "SGE",
+    "ARL A0.x", // Dxbx note : Alias for "mov a0.x"
+};
+
+/* Printed name of each ILU opcode, indexed by VshILU value (the 3-bit
+ * FLD_ILU field, so all eight encodings are covered). The names match the
+ * macros defined in vsh_header. */
+static const char* ilu_opcode[] = {
+    "NOP",
+    "MOV",
+    "RCP",
+    "RCC",
+    "RSQ",
+    "EXP",
+    "LOG",
+    "LIT",
+};
+
+/* For each ILU opcode (indexed by VshILU value): true when the opcode works
+ * on a single component, in which case decode_swizzle() replicates input C's
+ * X selector into all four lanes. Read-only lookup data, hence const. */
+static const bool ilu_force_scalar[] = {
+    false, /* ILU_NOP */
+    false, /* ILU_MOV */
+    true,  /* ILU_RCP */
+    true,  /* ILU_RCC */
+    true,  /* ILU_RSQ */
+    true,  /* ILU_EXP */
+    true,  /* ILU_LOG */
+    false, /* ILU_LIT */
+};
+
+/* Names of the output registers, indexed by the low four bits of
+ * FLD_OUT_ADDRESS (see decode_opcode()). "???" marks slots that have no name
+ * in this table. */
+static const char* out_reg_name[] = {
+    "oPos",
+    "???",
+    "???",
+    "oD0",
+    "oD1",
+    "oFog",
+    "oPts",
+    "oB0",
+    "oB1",
+    "oT0",
+    "oT1",
+    "oT2",
+    "oT3",
+    "???",
+    "???",
+    "A0.x",
+};
+
+
+
+// Retrieves a number of bits in the instruction token
+//
+//   shader_token - the DWORDs making up one instruction
+//   subtoken     - index of the DWORD to read
+//   start_bit    - position of the lowest bit of the field
+//   bit_length   - width of the field in bits
+//
+// Returns the field value, right-aligned. Shifting a 32-bit value by 32 or
+// more is undefined, so out-of-range positions and widths are handled
+// explicitly: a start_bit beyond the DWORD yields 0, and a width of 32 or
+// more keeps every remaining bit.
+static int vsh_get_from_token(const uint32_t *shader_token,
+                              uint8_t subtoken,
+                              uint8_t start_bit,
+                              uint8_t bit_length)
+{
+    if (start_bit >= 32) {
+        return 0;
+    }
+
+    const uint32_t shifted = shader_token[subtoken] >> start_bit;
+    const uint32_t keep = (bit_length >= 32)
+        ? 0xFFFFFFFFu
+        : ~(0xFFFFFFFFu << bit_length);
+
+    return (int)(shifted & keep);
+}
+
+/* Reads one named field out of an instruction token.
+ *
+ * The field's DWORD index, bit position and width are looked up in
+ * field_mapping[] using field_name as the row index; the extracted value is
+ * narrowed to 8 bits, which is wide enough for every row of that table. */
+uint8_t vsh_get_field(const uint32_t *shader_token, VshFieldName field_name)
+{
+    const VshFieldMapping &where = field_mapping[field_name];
+    const int raw = vsh_get_from_token(shader_token,
+                                       where.subtoken,
+                                       where.start_bit,
+                                       where.bit_length);
+    return (uint8_t)raw;
+}
+
+
+// Converts the C register address to disassembly format
+//
+// Bits 5..7 of c_reg select a bank of 32 registers and bits 0..4 the register
+// inside it. The bank is first rebased by -3 and the result is then shifted
+// up by VSH_D3DSCM_CORRECTION.
+static int16_t convert_c_register(const int16_t c_reg)
+{
+    const int bank = (c_reg >> 5) & 7;
+    const int index_in_bank = c_reg & 31;
+    const int rebased = (bank - 3) * 32 + index_in_bank;
+
+    /* to map -96..95 to 0..191 */
+    return (int16_t)(rebased + VSH_D3DSCM_CORRECTION); //FIXME: = c_reg?!
+}
+
+/*
+ * Builds the swizzle suffix for one instruction input.
+ *
+ *   shader_token  - the DWORDs of the instruction, as read by vsh_get_field()
+ *   swizzle_field - the input's *_SWZ_X field; the Y, Z and W selectors are
+ *                   read from the three field ids that follow it
+ *
+ * Returns a heap-allocated string that the caller must delete: "" for the
+ * identity swizzle, otherwise "." followed by one to four component letters,
+ * with trailing repeats of the last component collapsed (".xxxx" -> ".x").
+ */
+static std::string* decode_swizzle(
+    const uint32_t *shader_token,
+    VshFieldName    swizzle_field
+    )
+{
+    static const char swizzle_str[] = "xyzw";
+    uint8_t x, y, z, w;
+
+    /* some microcode instructions force a scalar value */
+    if (swizzle_field == FLD_C_SWZ_X
+        && ilu_force_scalar[vsh_get_field(shader_token, FLD_ILU)]) {
+        x = y = z = w = vsh_get_field(shader_token, swizzle_field);
+    } else {
+        int swizzle_field_i = (int)swizzle_field;
+        x = vsh_get_field(shader_token, (VshFieldName) swizzle_field_i++);
+        y = vsh_get_field(shader_token, (VshFieldName) swizzle_field_i++);
+        z = vsh_get_field(shader_token, (VshFieldName) swizzle_field_i++);
+        w = vsh_get_field(shader_token, (VshFieldName) swizzle_field_i);
+    }
+
+    std::string *ret = new std::string();
+
+    if (x == SWIZZLE_X && y == SWIZZLE_Y
+        && z == SWIZZLE_Z && w == SWIZZLE_W) {
+        /* Don't print the swizzle if it's .xyzw */
+        return ret;
+    }
+
+    /* Append the characters one at a time. Constructing the string from a
+     * braced list that ends in '\0' selects the initializer_list constructor
+     * and stores the NUL as a real character; once the operand list is handed
+     * on through c_str(), everything after that NUL is lost. */
+    ret->push_back('.');
+    ret->push_back(swizzle_str[x]);
+
+    /* Don't print duplicates: stop as soon as all remaining selectors repeat
+     * the one printed last */
+    if (!(x == y && y == z && z == w)) {
+        ret->push_back(swizzle_str[y]);
+        if (!(y == z && z == w)) {
+            ret->push_back(swizzle_str[z]);
+            if (z != w) {
+                ret->push_back(swizzle_str[w]); // Normal swizzle mask
+            }
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Decodes one vertex shader opcode parameter into a string.
+ *
+ *   shader_token - the DWORDs of the instruction
+ *   param        - where the input comes from (R, v or c register)
+ *   neg_field    - the input's *_NEG field; its swizzle fields are the field
+ *                  ids that directly follow it
+ *   reg_num      - R-register number, only used when param is PARAM_R
+ *
+ * Returns a heap-allocated string that the caller must delete, of the form
+ * [-]<register>[.<swizzle>]. An unknown param is reported on stderr and trips
+ * an assert; with asserts disabled the register part is simply left empty.
+ */
+static std::string* decode_opcode_input(const uint32_t *shader_token,
+                                    VshParameterType param,
+                                    VshFieldName neg_field,
+                                    int reg_num)
+{
+    std::string *ret_str = new std::string();
+
+    if (vsh_get_field(shader_token, neg_field) > 0) {
+        ret_str->append("-");
+    }
+
+    /* PARAM_R uses the supplied reg_num, but the other two need to be
+     * determined */
+    char tmp[40] = ""; /* stays a valid empty string if no case fills it */
+    switch (param) {
+    case PARAM_R:
+        snprintf(tmp, sizeof(tmp), "R%d", reg_num);
+        break;
+    case PARAM_V:
+        reg_num = vsh_get_field(shader_token, FLD_V);
+        snprintf(tmp, sizeof(tmp), "v%d", reg_num);
+        break;
+    case PARAM_C:
+        reg_num = convert_c_register(vsh_get_field(shader_token, FLD_CONST));
+        if (vsh_get_field(shader_token, FLD_A0X) > 0) {
+            //FIXME: does this really require the "correction" done in convert_c_register?!
+            snprintf(tmp, sizeof(tmp), "c[A0+%d]", reg_num);
+        } else {
+            snprintf(tmp, sizeof(tmp), "c[%d]", reg_num);
+        }
+        break;
+    default:
+        /* %x expects an unsigned int, so convert the enum explicitly */
+        fprintf(stderr, "Unknown vs param: 0x%x\n", (unsigned int)param);
+        assert(false);
+        break;
+    }
+    ret_str->append(tmp);
+
+    {
+        /* swizzle bits are next to the neg bit */
+        std::string *swizzle_str = decode_swizzle(shader_token, (VshFieldName)(neg_field+1));
+        ret_str->append(*swizzle_str);
+        delete swizzle_str;
+    }
+
+    return ret_str;
+}
+
+
+/*
+ * Emits the macro call(s) for one half (MAC or ILU) of an instruction.
+ *
+ *   shader_token - the DWORDs of the instruction
+ *   out_mux      - which half is being printed
+ *   mask         - write mask for the temporary (R) register destination
+ *   opcode       - printed opcode name
+ *   inputs       - already formatted operand list, starting with ", "
+ *
+ * Up to two lines are produced: one writing the R register (or A0 for ARL),
+ * and one writing the output/constant register when FLD_OUT_MUX selects this
+ * half and FLD_OUT_O_MASK is non-zero. Returns a heap-allocated string that
+ * the caller must delete.
+ */
+static std::string* decode_opcode(
+    const uint32_t *shader_token,
+    VshOutputMux    out_mux,
+    uint32_t        mask,
+    const char     *opcode,
+    const char     *inputs
+    )
+{
+    char line[128];
+    std::string *text = new std::string();
+    int dest_reg = vsh_get_field(shader_token, FLD_OUT_R);
+
+    /* Test for paired opcodes (in other words : Are both <> NOP?) */
+    if (out_mux == OMUX_MAC) {
+        if (vsh_get_field(shader_token, FLD_ILU) != ILU_NOP
+            && dest_reg == 1) {
+            /* Ignore paired MAC opcodes that write to R1 */
+            mask = 0;
+        }
+    } else if (out_mux == OMUX_ILU) {
+        if (vsh_get_field(shader_token, FLD_MAC) != MAC_NOP) {
+            /* Paired ILU opcodes can only write to R1 */
+            dest_reg = 1;
+        }
+    }
+
+    /* First line: the temporary register (or address register) write */
+    const bool is_arl = (strcmp(opcode, mac_opcode[MAC_ARL]) == 0);
+    if (is_arl) {
+        snprintf(line, sizeof(line), "  ARL(A0%s);\n", inputs);
+        text->append(line);
+    } else if (mask > 0) {
+        snprintf(line, sizeof(line), "  %s(R%d%s%s);\n",
+                 opcode, dest_reg, mask_str[mask], inputs);
+        text->append(line);
+    }
+
+    /* See if we must add a muxed opcode too, and only if it's not masked
+     * away: */
+    const uint8_t out_mask = vsh_get_field(shader_token, FLD_OUT_O_MASK);
+    if (vsh_get_field(shader_token, FLD_OUT_MUX) != out_mux
+        || out_mask == 0) {
+        return text;
+    }
+
+    const uint8_t out_address = vsh_get_field(shader_token, FLD_OUT_ADDRESS);
+
+    *text += "  ";
+    *text += opcode;
+    *text += "(";
+    if (vsh_get_field(shader_token, FLD_OUT_ORB) == OUTPUT_C) {
+        /* TODO : Emulate writeable const registers */
+        snprintf(line, sizeof(line), "c%d", convert_c_register(out_address));
+        *text += line;
+    } else {
+        *text += out_reg_name[out_address & 0xF];
+    }
+    *text += mask_str[out_mask];
+    *text += inputs;
+    *text += ");\n";
+
+    return text;
+}
+
+
+/*
+ * Translates one 4-DWORD instruction into the text of its macro call(s).
+ *
+ * An instruction can carry a MAC operation, an ILU operation, or both. The
+ * MAC part (if any) is printed first, followed by the ILU part. Returns a
+ * heap-allocated string (possibly empty) that the caller must delete.
+ */
+static std::string* decode_token(const uint32_t *shader_token)
+{
+    std::string *ret;
+
+    /* Since it's potentially used twice, decode input C once: */
+    std::string *input_c =
+        decode_opcode_input(shader_token,
+                            (VshParameterType) vsh_get_field(shader_token, FLD_C_MUX),
+                            FLD_C_NEG,
+                            (vsh_get_field(shader_token, FLD_C_R_HIGH) << 2)
+                                | vsh_get_field(shader_token, FLD_C_R_LOW));
+
+    /* See what MAC opcode is written to (if not masked away): */
+    const uint8_t mac = vsh_get_field(shader_token, FLD_MAC);
+    if (mac > MAC_ARL) {
+        /* FLD_MAC is 4 bits wide, but the opcode tables stop at MAC_ARL:
+         * never index them with the two unassigned encodings. */
+        fprintf(stderr, "Unknown vs MAC opcode: 0x%x\n", (unsigned int)mac);
+        assert(false);
+        ret = new std::string();
+    } else if (mac != MAC_NOP) {
+        const VshOpcodeParams &uses = mac_opcode_params[mac];
+        std::string inputs_mac;
+
+        if (uses.A) {
+            std::string *input_a =
+                decode_opcode_input(shader_token,
+                                    (VshParameterType) vsh_get_field(shader_token, FLD_A_MUX),
+                                    FLD_A_NEG,
+                                    vsh_get_field(shader_token, FLD_A_R));
+            inputs_mac.append(", ");
+            inputs_mac.append(*input_a);
+            delete input_a;
+        }
+        if (uses.B) {
+            std::string *input_b =
+                decode_opcode_input(shader_token,
+                                    (VshParameterType) vsh_get_field(shader_token, FLD_B_MUX),
+                                    FLD_B_NEG,
+                                    vsh_get_field(shader_token, FLD_B_R));
+            inputs_mac.append(", ");
+            inputs_mac.append(*input_b);
+            delete input_b;
+        }
+        if (uses.C) {
+            inputs_mac.append(", ");
+            inputs_mac.append(*input_c);
+        }
+
+        /* Then prepend these inputs with the actual opcode, mask, and input : */
+        ret = decode_opcode(shader_token,
+                            OMUX_MAC,
+                            vsh_get_field(shader_token, FLD_OUT_MAC_MASK),
+                            mac_opcode[mac],
+                            inputs_mac.c_str());
+    } else {
+        ret = new std::string();
+    }
+
+    /* See if a ILU opcode is present too (FLD_ILU is 3 bits wide and all
+     * eight encodings have a table entry, so no range check is needed): */
+    const uint8_t ilu = vsh_get_field(shader_token, FLD_ILU);
+    if (ilu != ILU_NOP) {
+        std::string inputs_c(", ");
+        inputs_c.append(*input_c);
+
+        /* Append the ILU opcode, mask and (the already determined) input C: */
+        std::string *ilu_op =
+            decode_opcode(shader_token,
+                          OMUX_ILU,
+                          vsh_get_field(shader_token, FLD_OUT_ILU_MASK),
+                          ilu_opcode[ilu],
+                          inputs_c.c_str());
+
+        ret->append(*ilu_op);
+        delete ilu_op;
+    }
+
+    delete input_c;
+
+    return ret;
+}
+
+/* GLSL preamble that vsh_translate() appends to the shader header.
+ *
+ * It declares the address register A0 and the temporaries R0..R11, aliases
+ * R12 to oPos, and defines one macro plus one helper function for every
+ * opcode name listed in mac_opcode[] / ilu_opcode[]. The macros take the
+ * destination and its write mask as separate arguments: decode_opcode()
+ * prints "OP(R<n>,<mask>, <inputs>)", relying on the leading comma of the
+ * mask_str[] entries to split them.
+ *
+ * The text is emitted verbatim at runtime, so every character inside the
+ * string pieces below is part of the generated shader. */
+static const char* vsh_header =
+    "\n"
+    "int A0 = 0;\n"
+    "\n"
+    "vec4 R0 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R1 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R2 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R3 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R4 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R5 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R6 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R7 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R8 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R9 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R10 = vec4(0.0,0.0,0.0,0.0);\n"
+    "vec4 R11 = vec4(0.0,0.0,0.0,0.0);\n"
+    "#define R12 oPos\n" /* R12 is a mirror of oPos */
+    "\n"
+
+    /* See:
+     * http://msdn.microsoft.com/en-us/library/windows/desktop/bb174703%28v=vs.85%29.aspx
+     * https://www.opengl.org/registry/specs/NV/vertex_program1_1.txt
+     */
+    "\n"
+//QQQ #ifdef NICE_CODE
+    "/* Converts the input to vec4, pads with last component */\n"
+    "vec4 _in(float v) { return vec4(v); }\n"
+    "vec4 _in(vec2 v) { return v.xyyy; }\n"
+    "vec4 _in(vec3 v) { return v.xyzz; }\n"
+    "vec4 _in(vec4 v) { return v.xyzw; }\n"
+//#else
+//    "/* Make sure input is always a vec4 */\n"
+//   "#define _in(v) vec4(v)\n"
+//#endif
+    "\n"
+    "#define INFINITY (1.0 / 0.0)\n"
+    "\n"
+    "#define MOV(dest, mask, src) dest.mask = _MOV(_in(src)).mask\n"
+    "vec4 _MOV(vec4 src)\n"
+    "{\n"
+    "  return src;\n"
+    "}\n"
+    "\n"
+    "#define MUL(dest, mask, src0, src1) dest.mask = _MUL(_in(src0), _in(src1)).mask\n"
+    "vec4 _MUL(vec4 src0, vec4 src1)\n" 
+    "{\n"
+    "  return src0 * src1;\n"
+    "}\n"
+    "\n"
+    "#define ADD(dest, mask, src0, src1) dest.mask = _ADD(_in(src0), _in(src1)).mask\n"
+    "vec4 _ADD(vec4 src0, vec4 src1)\n" 
+    "{\n"
+    "  return src0 + src1;\n"
+    "}\n"
+    "\n"
+    "#define MAD(dest, mask, src0, src1, src2) dest.mask = _MAD(_in(src0), _in(src1), _in(src2)).mask\n"
+    "vec4 _MAD(vec4 src0, vec4 src1, vec4 src2)\n" 
+    "{\n"
+    "  return src0 * src1 + src2;\n"
+    "}\n"
+    "\n"
+    "#define DP3(dest, mask, src0, src1) dest.mask = _DP3(_in(src0), _in(src1)).mask\n"
+    "vec4 _DP3(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(dot(src0.xyz, src1.xyz));\n"
+    "}\n"
+    "\n"
+    "#define DPH(dest, mask, src0, src1) dest.mask = _DPH(_in(src0), _in(src1)).mask\n"
+    "vec4 _DPH(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(dot(vec4(src0.xyz, 1.0), src1));\n"
+    "}\n"
+    "\n"
+    "#define DP4(dest, mask, src0, src1) dest.mask = _DP4(_in(src0), _in(src1)).mask\n"
+    "vec4 _DP4(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(dot(src0, src1));\n"
+    "}\n"
+    "\n"
+    "#define DST(dest, mask, src0, src1) dest.mask = _DST(_in(src0), _in(src1)).mask\n"
+    "vec4 _DST(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(1.0,\n"
+    "              src0.y * src1.y,\n"
+    "              src0.z,\n"
+    "              src1.w);\n"
+    "}\n"
+    "\n"
+    "#define MIN(dest, mask, src0, src1) dest.mask = _MIN(_in(src0), _in(src1)).mask\n"
+    "vec4 _MIN(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return min(src0, src1);\n"
+    "}\n"
+    "\n"
+    "#define MAX(dest, mask, src0, src1) dest.mask = _MAX(_in(src0), _in(src1)).mask\n"
+    "vec4 _MAX(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return max(src0, src1);\n"
+    "}\n"
+    "\n"
+    "#define SLT(dest, mask, src0, src1) dest.mask = _SLT(_in(src0), _in(src1)).mask\n"
+    "vec4 _SLT(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(lessThan(src0, src1));\n"
+    "}\n"
+    "\n"
+    /* ARL is the only macro without a mask argument: it writes the integer
+     * address register rather than a vec4 */
+    "#define ARL(dest, src) dest = _ARL(_in(src).x)\n"
+    "int _ARL(float src)\n"
+    "{\n"
+    "  return int(floor(src));\n"
+    "}\n"
+    "\n"
+    "#define SGE(dest, mask, src0, src1) dest.mask = _SGE(_in(src0), _in(src1)).mask\n"
+    "vec4 _SGE(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(greaterThanEqual(src0, src1));\n"
+    "}\n"
+    "\n"
+    /* The scalar ILU helpers below take only the x component of their input
+     * and broadcast the result to all four lanes */
+    "#define RCP(dest, mask, src) dest.mask = _RCP(_in(src).x).mask\n"
+    "vec4 _RCP(float src)\n"
+    "{\n"
+    "  return vec4(1.0 / src);\n"
+    "}\n"
+    "\n"
+    "#define RCC(dest, mask, src) dest.mask = _RCC(_in(src).x).mask\n"
+    "vec4 _RCC(float src)\n"
+    "{\n"
+    "  float t = 1.0 / src;\n"
+    "  if (t > 0.0) {\n"
+    "    t = clamp(t, 5.42101e-020, 1.884467e+019);\n"
+    "  } else {\n"
+    "    t = clamp(t, -1.884467e+019, -5.42101e-020);\n"
+    "  }\n"
+    "  return vec4(t);\n"
+    "}\n"
+    "\n"
+    "#define RSQ(dest, mask, src) dest.mask = _RSQ(_in(src).x).mask\n"
+    "vec4 _RSQ(float src)\n"
+    "{\n"
+    "  if (src == 0.0) { return vec4(INFINITY); }\n"
+    "  if (isinf(src)) { return vec4(0.0); }\n"
+    "  return vec4(inversesqrt(abs(src)));\n"
+    "}\n"
+    "\n"
+    "#define EXP(dest, mask, src) dest.mask = _EXP(_in(src).x).mask\n"
+    "vec4 _EXP(float src)\n"
+    "{\n"
+    "  return vec4(exp2(src));\n"
+    "}\n"
+    "\n"
+    "#define LOG(dest, mask, src) dest.mask = _LOG(_in(src).x).mask\n"
+    "vec4 _LOG(float src)\n"
+    "{\n"
+    "  return vec4(log2(src));\n"
+    "}\n"
+    "\n"
+    "#define LIT(dest, mask, src) dest.mask = _LIT(_in(src)).mask\n"
+    "vec4 _LIT(vec4 src)\n"
+    "{\n"
+    "  vec4 s = src;\n"
+    "  float epsilon = 1.0 / 256.0;\n"
+    "  s.w = clamp(s.w, -(128.0 - epsilon), 128.0 - epsilon);\n"
+    "  s.x = max(s.x, 0.0);\n"
+    "  s.y = max(s.y, 0.0);\n"
+    "  vec4 t = vec4(1.0, 0.0, 0.0, 1.0);\n"
+    "  t.y = s.x;\n"
+    /* Two spellings of s.y raised to the power s.w; the exp2/log2 form is
+     * the one compiled in */
+#if 1
+    "  t.z = (s.x > 0.0) ? exp2(s.w * log2(s.y)) : 0.0;\n"
+#else
+    "  t.z = (s.x > 0.0) ? pow(s.y, s.w) : 0.0;\n"
+#endif
+    "  return t;\n"
+    "}\n";
+
+/*
+ * Translates a vertex program into GLSL text.
+ *
+ *   version       - shader version word; not used by the translation
+ *   tokens        - program words, VSH_TOKEN_SIZE DWORDs per slot
+ *   length        - maximum number of slots to read
+ *   z_perspective - when true, oPos.z is replaced by oPos.w
+ *   header        - receives the register declarations and opcode helpers
+ *   body          - receives one commented block per slot, followed by the
+ *                   fix-ups that rewrite oPos
+ *
+ * Translation stops after the first slot whose FLD_FINAL bit is set; an
+ * assert checks that such a slot was found within `length` slots.
+ */
+void vsh_translate(uint16_t version,
+                   const uint32_t *tokens,
+                   unsigned int length,
+                   bool z_perspective,
+                   std::string *header, std::string *body)
+{
+    (void)version; /* kept for interface compatibility */
+
+    char buf[128];
+    header->append(vsh_header);
+
+    bool has_final = false;
+    for (unsigned int slot = 0; slot < length; slot++) {
+        const uint32_t* cur_token = &tokens[slot * VSH_TOKEN_SIZE];
+        std::string *token_str = decode_token(cur_token);
+        /* slot is unsigned, so print it with %u */
+        snprintf(buf, sizeof(buf),
+            "  /* Slot %u: 0x%08X 0x%08X 0x%08X 0x%08X */",
+            slot, cur_token[0],cur_token[1],cur_token[2],cur_token[3]);
+        body->append(buf);
+        body->append("\n");
+        body->append(*token_str);
+        body->append("\n");
+        delete token_str;
+
+        if (vsh_get_field(cur_token, FLD_FINAL)) {
+            has_final = true;
+            break;
+        }
+    }
+    assert(has_final);
+
+    /* pre-divide and output the generated W so we can do perspective correct
+     * interpolation manually. OpenGL can't, since we give it a W of 1 to work
+     * around the perspective divide */
+    body->append(
+        "  if (oPos.w == 0.0 || isinf(oPos.w)) {\n"
+        "    vtx.inv_w = 1.0;\n"
+        "  } else {\n"
+        "    vtx.inv_w = 1.0 / oPos.w;\n"
+        "  }\n"
+    );
+
+    body->append(
+        /* the shaders leave the result in screen space, while
+         * opengl expects it in clip space.
+         * TODO: the pixel-center co-ordinate differences should be handled
+         */
+        "  oPos.x = 2.0 * (oPos.x - surfaceSize.x * 0.5) / surfaceSize.x;\n"
+        "  oPos.y = -2.0 * (oPos.y - surfaceSize.y * 0.5) / surfaceSize.y;\n"
+    );
+    if (z_perspective) {
+        body->append("  oPos.z = oPos.w;\n");
+    }
+    body->append(
+        /* Map the clip range into clip space so z is clipped correctly.
+         * Note this makes the values in the depth buffer wrong. This should be
+         * handled with gl_ClipDistance instead, but that has performance issues
+         * on OS X.
+         */
+        "  if (clipRange.y != clipRange.x) {\n"
+        "    oPos.z = (oPos.z - 0.5 * (clipRange.x + clipRange.y)) / (0.5 * (clipRange.y - clipRange.x));\n"
+        "  }\n"
+
+        /* Correct for the perspective divide */
+        "  if (oPos.w < 0.0) {\n"
+            /* undo the perspective divide in the case where the point would be
+             * clipped so opengl can clip it correctly */
+        "    oPos.xyz *= oPos.w;\n"
+        "  } else {\n"
+            /* we don't want the OpenGL perspective divide to happen, but we
+             * can't multiply by W because it could be meaningless here */
+        "    oPos.w = 1.0;\n"
+        "  }\n"
+    );
+
+}
+
--- a/src/devices/video/nv2a_vsh.h
+++ b/src/devices/video/nv2a_vsh.h
@ -0,0 +1,142 @@
+/*
+ * QEMU Geforce NV2A vertex shader translation
+ *
+ * Copyright (c) 2012 espes
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef HW_NV2A_VSH_H
+#define HW_NV2A_VSH_H
+
+#include <stdint.h>
+#include <string>
+
+/* Light type selector; values run 0..3 starting at LIGHT_OFF. Not referenced
+ * in the vertex program translator itself -- presumably consumed by the
+ * fixed-function shader generator; verify against its users. */
+enum VshLight {
+    LIGHT_OFF,
+    LIGHT_INFINITE,
+    LIGHT_LOCAL,
+    LIGHT_SPOT
+};
+
+/* Texture coordinate generation mode; values run 0..5 starting at
+ * TEXGEN_DISABLE. Users are outside this file -- semantics per mode should
+ * be confirmed there. */
+enum VshTexgen {
+    TEXGEN_DISABLE,
+    TEXGEN_EYE_LINEAR,
+    TEXGEN_OBJECT_LINEAR,
+    TEXGEN_SPHERE_MAP,
+    TEXGEN_NORMAL_MAP,
+    TEXGEN_REFLECTION_MAP,
+};
+
+/* Fog mode; values run 0..7. Slots 2 and 6 are placeholders for encodings
+ * that do not exist, which keeps the remaining enumerators at their
+ * intended numeric values. */
+enum VshFogMode {
+    FOG_MODE_LINEAR,
+    FOG_MODE_EXP,
+    FOG_MODE_ERROR2, /* Doesn't exist */
+    FOG_MODE_EXP2,
+    FOG_MODE_LINEAR_ABS,
+    FOG_MODE_EXP_ABS,
+    FOG_MODE_ERROR6, /* Doesn't exist */
+    FOG_MODE_EXP2_ABS
+};
+
+/* Fog coordinate source; values run 0..6. FOGGEN_ERROR4 and FOGGEN_ERROR5
+ * look like placeholders that keep FOGGEN_FOG_X at value 6 -- TODO confirm
+ * against the users of this enum. */
+enum VshFoggen {
+    FOGGEN_SPEC_ALPHA,
+    FOGGEN_RADIAL,
+    FOGGEN_PLANAR,
+    FOGGEN_ABS_PLANAR,
+    FOGGEN_ERROR4,
+    FOGGEN_ERROR5,
+    FOGGEN_FOG_X
+};
+
+/* Vertex skinning mode; values run 0..6 starting at SKINNING_OFF. The names
+ * encode a weight count and, for the last three, a matrix count; the exact
+ * behaviour per mode is defined by code outside this file. */
+enum VshSkinning {
+    SKINNING_OFF,
+    SKINNING_1WEIGHTS,
+    SKINNING_2WEIGHTS,
+    SKINNING_3WEIGHTS,
+    SKINNING_2WEIGHTS2MATRICES,
+    SKINNING_3WEIGHTS3MATRICES,
+    SKINNING_4WEIGHTS4MATRICES,
+};
+
+// vs.1.1, not an official value
+#define VSH_VERSION_VS                     0xF078
+
+// Xbox vertex shader
+#define VSH_VERSION_XVS                    0x2078
+
+// Xbox vertex state shader
+#define VSH_VERSION_XVSS                   0x7378
+
+// Xbox vertex read/write shader
+#define VSH_VERSION_XVSW                   0x7778
+
+// Number of 32-bit words per instruction slot; vsh_translate() steps through
+// the program in units of this many DWORDs
+#define VSH_TOKEN_SIZE 4
+
+/* Names of the bit-fields of an instruction token.
+ *
+ * The enumerator values are used as row indices into the field layout table
+ * in nv2a_vsh.cpp, so the order here must match that table exactly. The
+ * decoder also relies on each input's NEG field being directly followed by
+ * its SWZ_X, SWZ_Y, SWZ_Z and SWZ_W fields, in that order. */
+typedef enum {
+    FLD_ILU = 0,
+    FLD_MAC,
+    FLD_CONST,
+    FLD_V,
+    // Input A
+    FLD_A_NEG,
+    FLD_A_SWZ_X,
+    FLD_A_SWZ_Y,
+    FLD_A_SWZ_Z,
+    FLD_A_SWZ_W,
+    FLD_A_R,
+    FLD_A_MUX,
+    // Input B
+    FLD_B_NEG,
+    FLD_B_SWZ_X,
+    FLD_B_SWZ_Y,
+    FLD_B_SWZ_Z,
+    FLD_B_SWZ_W,
+    FLD_B_R,
+    FLD_B_MUX,
+    // Input C
+    FLD_C_NEG,
+    FLD_C_SWZ_X,
+    FLD_C_SWZ_Y,
+    FLD_C_SWZ_Z,
+    FLD_C_SWZ_W,
+    FLD_C_R_HIGH,
+    FLD_C_R_LOW,
+    FLD_C_MUX,
+    // Output
+    FLD_OUT_MAC_MASK,
+    FLD_OUT_R,
+    FLD_OUT_ILU_MASK,
+    FLD_OUT_O_MASK,
+    FLD_OUT_ORB,
+    FLD_OUT_ADDRESS,
+    FLD_OUT_MUX,
+    // Relative addressing
+    FLD_A0X,
+    // Final instruction
+    FLD_FINAL
+} VshFieldName;
+
+/* Extracts the bit-field `field_name` from the instruction whose DWORDs start
+ * at `shader_token` and returns it as an 8-bit value. */
+uint8_t vsh_get_field(const uint32_t *shader_token, VshFieldName field_name);
+
+/* Translates a vertex program of up to `length` slots (VSH_TOKEN_SIZE DWORDs
+ * each, starting at `tokens`) into GLSL text. Declarations and helper
+ * functions are appended to `*header`, the translated instructions and the
+ * final position fix-ups to `*body`. `z_perspective` makes the generated code
+ * replace oPos.z with oPos.w. */
+void vsh_translate(uint16_t version,
+                   const uint32_t *tokens,
+                   unsigned int length,
+                   bool z_perspective,
+                   std::string *header, std::string *body);
+
+
+#endif
--- a/src/devices/video/queue.h
+++ b/src/devices/video/queue.h
@ -0,0 +1,414 @@
+/*      $NetBSD: queue.h,v 1.52 2009/04/20 09:56:08 mschuett Exp $ */
+
+/*
+ * QEMU version: Copy from netbsd, removed debug code, removed some of
+ * the implementations.  Left in singly-linked lists, lists, simple
+ * queues, and tail queues.
+ */
+
+/*
+ * Copyright (c) 1991, 1993
+ *      The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *      @(#)queue.h     8.5 (Berkeley) 8/20/94
+ */
+
+#ifndef QEMU_SYS_QUEUE_H_
+#define QEMU_SYS_QUEUE_H_
+
+/*
+ * This file defines four types of data structures: singly-linked lists,
+ * lists, simple queues, and tail queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The
+ * elements are singly linked for minimum space and pointer manipulation
+ * overhead at the expense of O(n) removal for arbitrary elements. New
+ * elements can be added to the list after an existing element or at the
+ * head of the list.  Elements being removed from the head of the list
+ * should use the explicit macro for this purpose for optimum
+ * efficiency. A singly-linked list may only be traversed in the forward
+ * direction.  Singly-linked lists are ideal for applications with large
+ * datasets and few or no removals or for implementing a LIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A simple queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are singly
+ * linked to save space, so elements can only be removed from the
+ * head of the list. New elements can be added to the list after
+ * an existing element, at the head of the list, or at the end of the
+ * list. A simple queue may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ */
+
+// #include "qemu/atomic.h" /* for smp_wmb() */
+
+/*
+ * List definitions.
+ *
+ * QLIST_HEAD(name, type) declares `struct name`, a list head that holds a
+ * single pointer to the first `struct type` element.
+ * QLIST_HEAD_INITIALIZER(head) is a static initializer for an empty head;
+ * its argument is not used in the expansion.
+ * QLIST_ENTRY(type) declares the anonymous link structure to embed in each
+ * element: le_next points to the following element, and le_prev points to
+ * the pointer that refers to this element (either the head's lh_first or the
+ * previous element's le_next).
+ */
+#define QLIST_HEAD(name, type)                                          \
+struct name {                                                           \
+        struct type *lh_first;  /* first element */                     \
+}
+
+#define QLIST_HEAD_INITIALIZER(head)                                    \
+        { NULL }
+
+#define QLIST_ENTRY(type)                                               \
+struct {                                                                \
+        struct type *le_next;   /* next element */                      \
+        struct type **le_prev;  /* address of previous next element */  \
+}
+
+/*
+ * List functions.
+ *
+ * In all of these, `field` is the name of the QLIST_ENTRY member inside the
+ * element type. Most macros expand their arguments more than once, so the
+ * arguments should be free of side effects.
+ *
+ * QLIST_INIT(head)
+ *      Makes head an empty list.
+ * QLIST_INSERT_AFTER(listelm, elm, field)
+ *      Links elm directly after listelm.
+ * QLIST_INSERT_BEFORE(listelm, elm, field)
+ *      Links elm directly before listelm. No head argument is needed
+ *      because listelm's le_prev already addresses the pointer to patch.
+ * QLIST_INSERT_HEAD(head, elm, field)
+ *      Links elm as the new first element.
+ * QLIST_INSERT_HEAD_RCU(head, elm, field)
+ *      Like QLIST_INSERT_HEAD, but fills in elm's links first and calls
+ *      smp_wmb() before and after storing elm into the head.
+ *      NOTE: smp_wmb() is not defined by this header (the "qemu/atomic.h"
+ *      include above is commented out), so code using this macro has to
+ *      provide it.
+ * QLIST_REMOVE(elm, field)
+ *      Unlinks elm from its list. elm's own le_next/le_prev are left
+ *      unchanged.
+ * QLIST_FOREACH(var, head, field)
+ *      Loop header that visits every element from first to last.
+ * QLIST_FOREACH_SAFE(var, head, field, next_var)
+ *      Same traversal, but the successor is stored in next_var before the
+ *      loop body runs, so the body may unlink var.
+ */
+#define QLIST_INIT(head) do {                                           \
+        (head)->lh_first = NULL;                                        \
+} while (/*CONSTCOND*/0)
+
+#define QLIST_INSERT_AFTER(listelm, elm, field) do {                    \
+        if (((elm)->field.le_next = (listelm)->field.le_next) != NULL)  \
+                (listelm)->field.le_next->field.le_prev =               \
+                    &(elm)->field.le_next;                              \
+        (listelm)->field.le_next = (elm);                               \
+        (elm)->field.le_prev = &(listelm)->field.le_next;               \
+} while (/*CONSTCOND*/0)
+
+#define QLIST_INSERT_BEFORE(listelm, elm, field) do {                   \
+        (elm)->field.le_prev = (listelm)->field.le_prev;                \
+        (elm)->field.le_next = (listelm);                               \
+        *(listelm)->field.le_prev = (elm);                              \
+        (listelm)->field.le_prev = &(elm)->field.le_next;               \
+} while (/*CONSTCOND*/0)
+
+#define QLIST_INSERT_HEAD(head, elm, field) do {                        \
+        if (((elm)->field.le_next = (head)->lh_first) != NULL)          \
+                (head)->lh_first->field.le_prev = &(elm)->field.le_next;\
+        (head)->lh_first = (elm);                                       \
+        (elm)->field.le_prev = &(head)->lh_first;                       \
+} while (/*CONSTCOND*/0)
+
+#define QLIST_INSERT_HEAD_RCU(head, elm, field) do {                    \
+        (elm)->field.le_prev = &(head)->lh_first;                       \
+        (elm)->field.le_next = (head)->lh_first;                        \
+        smp_wmb(); /* fill elm before linking it */                     \
+        if ((head)->lh_first != NULL)  {                                \
+            (head)->lh_first->field.le_prev = &(elm)->field.le_next;    \
+        }                                                               \
+        (head)->lh_first = (elm);                                       \
+        smp_wmb();                                                      \
+} while (/* CONSTCOND*/0)
+
+#define QLIST_REMOVE(elm, field) do {                                   \
+        if ((elm)->field.le_next != NULL)                               \
+                (elm)->field.le_next->field.le_prev =                   \
+                    (elm)->field.le_prev;                               \
+        *(elm)->field.le_prev = (elm)->field.le_next;                   \
+} while (/*CONSTCOND*/0)
+
+#define QLIST_FOREACH(var, head, field)                                 \
+        for ((var) = ((head)->lh_first);                                \
+                (var);                                                  \
+                (var) = ((var)->field.le_next))
+
+#define QLIST_FOREACH_SAFE(var, head, field, next_var)                  \
+        for ((var) = ((head)->lh_first);                                \
+                (var) && ((next_var) = ((var)->field.le_next), 1);      \
+                (var) = (next_var))
+
+/*
+ * List access methods.
+ *
+ * QLIST_EMPTY(head) is true when the list has no elements, QLIST_FIRST(head)
+ * yields the first element (NULL when the list is empty) and
+ * QLIST_NEXT(elm, field) yields the element after elm (NULL for the last
+ * element).
+ */
+#define QLIST_EMPTY(head)                ((head)->lh_first == NULL)
+#define QLIST_FIRST(head)                ((head)->lh_first)
+#define QLIST_NEXT(elm, field)           ((elm)->field.le_next)
+
+
+/*
+ * Singly-linked List definitions.
+ *
+ * QSLIST_HEAD(name, type) declares `struct name`, a head holding a pointer
+ * to the first `struct type` element.
+ * QSLIST_HEAD_INITIALIZER(head) is a static initializer for an empty head;
+ * its argument is not used in the expansion.
+ * QSLIST_ENTRY(type) declares the anonymous link structure to embed in each
+ * element; it contains only the forward pointer sle_next.
+ */
+#define QSLIST_HEAD(name, type)                                          \
+struct name {                                                           \
+        struct type *slh_first; /* first element */                     \
+}
+
+#define QSLIST_HEAD_INITIALIZER(head)                                    \
+        { NULL }
+
+#define QSLIST_ENTRY(type)                                               \
+struct {                                                                \
+        struct type *sle_next;  /* next element */                      \
+}
+
+/*
+ * Singly-linked List functions.
+ *
+ * `field` is the name of the QSLIST_ENTRY member inside the element type.
+ * Several macros expand their arguments more than once, so the arguments
+ * should be free of side effects.
+ *
+ * QSLIST_INIT(head)
+ *      Makes head an empty list.
+ * QSLIST_INSERT_AFTER(slistelm, elm, field)
+ *      Links elm directly after slistelm.
+ * QSLIST_INSERT_HEAD(head, elm, field)
+ *      Links elm as the new first element.
+ * QSLIST_REMOVE_HEAD(head, field)
+ *      Unlinks the first element. The list must not be empty: slh_first is
+ *      dereferenced without a NULL check.
+ * QSLIST_REMOVE_AFTER(slistelm, field)
+ *      Unlinks the element that follows slistelm. slistelm must have a
+ *      successor: that successor is dereferenced without a NULL check.
+ * QSLIST_FOREACH(var, head, field)
+ *      Loop header that visits every element from first to last.
+ * QSLIST_FOREACH_SAFE(var, head, field, tvar)
+ *      Same traversal, but the successor is stored in tvar before the loop
+ *      body runs, so the body may unlink var.
+ */
+#define QSLIST_INIT(head) do {                                           \
+        (head)->slh_first = NULL;                                       \
+} while (/*CONSTCOND*/0)
+
+#define QSLIST_INSERT_AFTER(slistelm, elm, field) do {                   \
+        (elm)->field.sle_next = (slistelm)->field.sle_next;             \
+        (slistelm)->field.sle_next = (elm);                             \
+} while (/*CONSTCOND*/0)
+
+#define QSLIST_INSERT_HEAD(head, elm, field) do {                        \
+        (elm)->field.sle_next = (head)->slh_first;                      \
+        (head)->slh_first = (elm);                                      \
+} while (/*CONSTCOND*/0)
+
+#define QSLIST_REMOVE_HEAD(head, field) do {                             \
+        (head)->slh_first = (head)->slh_first->field.sle_next;          \
+} while (/*CONSTCOND*/0)
+
+#define QSLIST_REMOVE_AFTER(slistelm, field) do {                        \
+        (slistelm)->field.sle_next =                                    \
+            QSLIST_NEXT(QSLIST_NEXT((slistelm), field), field);           \
+} while (/*CONSTCOND*/0)
+
+#define QSLIST_FOREACH(var, head, field)                                 \
+        for((var) = (head)->slh_first; (var); (var) = (var)->field.sle_next)
+
+#define QSLIST_FOREACH_SAFE(var, head, field, tvar)                      \
+        for ((var) = QSLIST_FIRST((head));                               \
+            (var) && ((tvar) = QSLIST_NEXT((var), field), 1);            \
+            (var) = (tvar))
+
+/*
+ * Singly-linked List access methods.
+ *
+ * QSLIST_EMPTY(head) is true when the list has no elements,
+ * QSLIST_FIRST(head) yields the first element (NULL when the list is empty)
+ * and QSLIST_NEXT(elm, field) yields the element after elm (NULL for the
+ * last element).
+ */
+#define QSLIST_EMPTY(head)       ((head)->slh_first == NULL)
+#define QSLIST_FIRST(head)       ((head)->slh_first)
+#define QSLIST_NEXT(elm, field)  ((elm)->field.sle_next)
+
+
+/*
+ * Simple queue definitions.
+ *
+ * QSIMPLEQ_HEAD(name, type) declares `struct name`, a queue head with a
+ * pointer to the first element (sqh_first) and a pointer to the last
+ * forward pointer in the queue (sqh_last): the sqe_next of the last
+ * element, or sqh_first itself while the queue is empty.
+ * QSIMPLEQ_HEAD_INITIALIZER(head) is a static initializer for an empty
+ * queue; head must be the head object itself (not a pointer to it), because
+ * the initializer takes the address of its sqh_first member.
+ * QSIMPLEQ_ENTRY(type) declares the anonymous link structure to embed in
+ * each element; it contains only the forward pointer sqe_next.
+ */
+#define QSIMPLEQ_HEAD(name, type)                                       \
+struct name {                                                           \
+    struct type *sqh_first;    /* first element */                      \
+    struct type **sqh_last;    /* addr of last next element */          \
+}
+
+#define QSIMPLEQ_HEAD_INITIALIZER(head)                                 \
+    { NULL, &(head).sqh_first }
+
+#define QSIMPLEQ_ENTRY(type)                                            \
+struct {                                                                \
+    struct type *sqe_next;    /* next element */                        \
+}
+
+/*
+ * Simple queue functions.
+ *
+ * `field` is the name of the QSIMPLEQ_ENTRY member inside the element type.
+ * Most macros expand their arguments more than once, so the arguments
+ * should be free of side effects.
+ *
+ * QSIMPLEQ_INIT(head)
+ *      Makes head an empty queue (sqh_last points back at sqh_first).
+ * QSIMPLEQ_INSERT_HEAD(head, elm, field)
+ *      Links elm as the new first element; if the queue was empty, elm also
+ *      becomes the tail.
+ * QSIMPLEQ_INSERT_TAIL(head, elm, field)
+ *      Appends elm through sqh_last, without walking the queue.
+ * QSIMPLEQ_INSERT_AFTER(head, listelm, elm, field)
+ *      Links elm directly after listelm and updates the tail pointer when
+ *      listelm was the last element.
+ * QSIMPLEQ_REMOVE_HEAD(head, field)
+ *      Unlinks the first element. The queue must not be empty: sqh_first is
+ *      dereferenced without a NULL check.
+ * QSIMPLEQ_REMOVE(head, elm, type, field)
+ *      Unlinks elm. Unless elm is the first element, the queue is walked
+ *      from the front to find elm's predecessor. elm must be in the queue:
+ *      the walk has no end-of-queue check. The expansion declares a local
+ *      variable named curelm.
+ * QSIMPLEQ_FOREACH(var, head, field)
+ *      Loop header that visits every element from first to last.
+ */
+#define QSIMPLEQ_INIT(head) do {                                        \
+    (head)->sqh_first = NULL;                                           \
+    (head)->sqh_last = &(head)->sqh_first;                              \
+} while (/*CONSTCOND*/0)
+
+#define QSIMPLEQ_INSERT_HEAD(head, elm, field) do {                     \
+    if (((elm)->field.sqe_next = (head)->sqh_first) == NULL)            \
+        (head)->sqh_last = &(elm)->field.sqe_next;                      \
+    (head)->sqh_first = (elm);                                          \
+} while (/*CONSTCOND*/0)
+
+#define QSIMPLEQ_INSERT_TAIL(head, elm, field) do {                     \
+    (elm)->field.sqe_next = NULL;                                       \
+    *(head)->sqh_last = (elm);                                          \
+    (head)->sqh_last = &(elm)->field.sqe_next;                          \
+} while (/*CONSTCOND*/0)
+
+#define QSIMPLEQ_INSERT_AFTER(head, listelm, elm, field) do {           \
+    if (((elm)->field.sqe_next = (listelm)->field.sqe_next) == NULL)    \
+        (head)->sqh_last = &(elm)->field.sqe_next;                      \
+    (listelm)->field.sqe_next = (elm);                                  \
+} while (/*CONSTCOND*/0)
+
+#define QSIMPLEQ_REMOVE_HEAD(head, field) do {                          \
+    if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\
+        (head)->sqh_last = &(head)->sqh_first;                          \
+} while (/*CONSTCOND*/0)
+
+#define QSIMPLEQ_REMOVE(head, elm, type, field) do {                    \
+    if ((head)->sqh_first == (elm)) {                                   \
+        QSIMPLEQ_REMOVE_HEAD((head), field);                            \
+    } else {                                                            \
+        struct type *curelm = (head)->sqh_first;                        \
+        while (curelm->field.sqe_next != (elm))                         \
+            curelm = curelm->field.sqe_next;                            \
+        if ((curelm->field.sqe_next =                                   \
+            curelm->field.sqe_next->field.sqe_next) == NULL)            \
+                (head)->sqh_last = &(curelm)->field.sqe_next;           \
+    }                                                                   \
+} while (/*CONSTCOND*/0)
+
+#define QSIMPLEQ_FOREACH(var, head, field)                              \
+    for ((var) = ((head)->sqh_first);                                   \
+        (var);                                                          \
+        (var) = ((var)->field.sqe_next))
+
+/*
+ * QSIMPLEQ_FOREACH_SAFE(var, head, field, next)
+ *      Loop header that visits every element from first to last. The
+ *      successor of var is stored in next before the loop body runs, so the
+ *      body may unlink var. `next` is parenthesized everywhere it is
+ *      expanded, matching the other *_FOREACH_SAFE macros in this file, so
+ *      that any lvalue expression can be passed for it.
+ */
+#define QSIMPLEQ_FOREACH_SAFE(var, head, field, next)                   \
+    for ((var) = ((head)->sqh_first);                                   \
+        (var) && ((next) = ((var)->field.sqe_next), 1);                 \
+        (var) = (next))
+
+/*
+ * QSIMPLEQ_CONCAT(head1, head2)
+ *      Appends all elements of head2 to the end of head1 and leaves head2
+ *      empty. Does nothing when head2 is already empty.
+ */
+#define QSIMPLEQ_CONCAT(head1, head2) do {                              \
+    if (!QSIMPLEQ_EMPTY((head2))) {                                     \
+        *(head1)->sqh_last = (head2)->sqh_first;                        \
+        (head1)->sqh_last = (head2)->sqh_last;                          \
+        QSIMPLEQ_INIT((head2));                                         \
+    }                                                                   \
+} while (/*CONSTCOND*/0)
+
+/*
+ * QSIMPLEQ_LAST(head, type, field)
+ *      Yields the last element, or NULL when the queue is empty. The
+ *      element address is recovered from sqh_last (which points at the last
+ *      element's sqe_next) by subtracting the offset of `field` within
+ *      `struct type`; this works because sqe_next is the first member of
+ *      the QSIMPLEQ_ENTRY structure.
+ *      NOTE: the expansion uses offsetof, which this header does not
+ *      provide; the including file needs <stddef.h> (or equivalent).
+ */
+#define QSIMPLEQ_LAST(head, type, field)                                \
+    (QSIMPLEQ_EMPTY((head)) ?                                           \
+        NULL :                                                          \
+            ((struct type *)(void *)                                    \
+        ((char *)((head)->sqh_last) - offsetof(struct type, field))))
+
+/*
+ * Simple queue access methods.
+ *
+ * QSIMPLEQ_EMPTY(head) is true when the queue has no elements,
+ * QSIMPLEQ_FIRST(head) yields the first element (NULL when the queue is
+ * empty) and QSIMPLEQ_NEXT(elm, field) yields the element after elm (NULL
+ * for the last element).
+ */
+#define QSIMPLEQ_EMPTY(head)        ((head)->sqh_first == NULL)
+#define QSIMPLEQ_FIRST(head)        ((head)->sqh_first)
+#define QSIMPLEQ_NEXT(elm, field)   ((elm)->field.sqe_next)
+
+
+/*
+ * Tail queue definitions.
+ *
+ * Q_TAILQ_HEAD(name, type, qual) declares `struct name`, a queue head with
+ * a pointer to the first element (tqh_first) and a pointer to the last
+ * forward pointer in the queue (tqh_last): the tqe_next of the last
+ * element, or tqh_first itself while the queue is empty. `type` is the
+ * complete element type and `qual` is a qualifier placed on both members.
+ * QTAILQ_HEAD(name, type) is the usual form: element type `struct type`
+ * and an empty qualifier.
+ * QTAILQ_HEAD_INITIALIZER(head) is a static initializer for an empty
+ * queue; head must be the head object itself (not a pointer to it).
+ * Q_TAILQ_ENTRY(type, qual) / QTAILQ_ENTRY(type) declare the anonymous link
+ * structure to embed in each element: tqe_next points to the following
+ * element, and tqe_prev points to the pointer that refers to this element.
+ * The head and entry structures deliberately have the same two-pointer
+ * layout; QTAILQ_LAST, QTAILQ_PREV and QTAILQ_FOREACH_REVERSE depend on it.
+ */
+#define Q_TAILQ_HEAD(name, type, qual)                                  \
+struct name {                                                           \
+        qual type *tqh_first;           /* first element */             \
+        qual type *qual *tqh_last;      /* addr of last next element */ \
+}
+#define QTAILQ_HEAD(name, type)  Q_TAILQ_HEAD(name, struct type,)
+
+#define QTAILQ_HEAD_INITIALIZER(head)                                   \
+        { NULL, &(head).tqh_first }
+
+#define Q_TAILQ_ENTRY(type, qual)                                       \
+struct {                                                                \
+        qual type *tqe_next;            /* next element */              \
+        qual type *qual *tqe_prev;      /* address of previous next element */\
+}
+#define QTAILQ_ENTRY(type)       Q_TAILQ_ENTRY(struct type,)
+
+/*
+ * Tail queue functions.
+ *
+ * `field` is the name of the QTAILQ_ENTRY member inside the element type.
+ * Most macros expand their arguments more than once, so the arguments
+ * should be free of side effects.
+ *
+ * QTAILQ_INIT(head)
+ *      Makes head an empty queue (tqh_last points back at tqh_first).
+ * QTAILQ_INSERT_HEAD(head, elm, field)
+ *      Links elm as the new first element; if the queue was empty, elm also
+ *      becomes the tail.
+ * QTAILQ_INSERT_TAIL(head, elm, field)
+ *      Appends elm through tqh_last, without walking the queue.
+ * QTAILQ_INSERT_AFTER(head, listelm, elm, field)
+ *      Links elm directly after listelm and updates the tail pointer when
+ *      listelm was the last element.
+ * QTAILQ_INSERT_BEFORE(listelm, elm, field)
+ *      Links elm directly before listelm. No head argument is needed
+ *      because listelm's tqe_prev already addresses the pointer to patch.
+ * QTAILQ_REMOVE(head, elm, field)
+ *      Unlinks elm and updates the tail pointer when elm was the last
+ *      element. elm's own tqe_next/tqe_prev are left unchanged.
+ * QTAILQ_FOREACH(var, head, field)
+ *      Loop header that visits every element from first to last.
+ * QTAILQ_FOREACH_SAFE(var, head, field, next_var)
+ *      Same traversal, but the successor is stored in next_var before the
+ *      loop body runs, so the body may unlink var.
+ * QTAILQ_FOREACH_REVERSE(var, head, headname, field)
+ *      Loop header that visits every element from last to first. headname
+ *      is the struct tag that was given to QTAILQ_HEAD. Each step casts a
+ *      pointer to a tqe_next/tqh_first member to `struct headname *` and
+ *      reads its tqh_last, which overlays the entry's tqe_prev because head
+ *      and entry share the same two-pointer layout.
+ *      NOTE(review): this cast-based access may conflict with strict
+ *      aliasing rules on optimizing compilers - confirm build flags.
+ */
+#define QTAILQ_INIT(head) do {                                          \
+        (head)->tqh_first = NULL;                                       \
+        (head)->tqh_last = &(head)->tqh_first;                          \
+} while (/*CONSTCOND*/0)
+
+#define QTAILQ_INSERT_HEAD(head, elm, field) do {                       \
+        if (((elm)->field.tqe_next = (head)->tqh_first) != NULL)        \
+                (head)->tqh_first->field.tqe_prev =                     \
+                    &(elm)->field.tqe_next;                             \
+        else                                                            \
+                (head)->tqh_last = &(elm)->field.tqe_next;              \
+        (head)->tqh_first = (elm);                                      \
+        (elm)->field.tqe_prev = &(head)->tqh_first;                     \
+} while (/*CONSTCOND*/0)
+
+#define QTAILQ_INSERT_TAIL(head, elm, field) do {                       \
+        (elm)->field.tqe_next = NULL;                                   \
+        (elm)->field.tqe_prev = (head)->tqh_last;                       \
+        *(head)->tqh_last = (elm);                                      \
+        (head)->tqh_last = &(elm)->field.tqe_next;                      \
+} while (/*CONSTCOND*/0)
+
+#define QTAILQ_INSERT_AFTER(head, listelm, elm, field) do {             \
+        if (((elm)->field.tqe_next = (listelm)->field.tqe_next) != NULL)\
+                (elm)->field.tqe_next->field.tqe_prev =                 \
+                    &(elm)->field.tqe_next;                             \
+        else                                                            \
+                (head)->tqh_last = &(elm)->field.tqe_next;              \
+        (listelm)->field.tqe_next = (elm);                              \
+        (elm)->field.tqe_prev = &(listelm)->field.tqe_next;             \
+} while (/*CONSTCOND*/0)
+
+#define QTAILQ_INSERT_BEFORE(listelm, elm, field) do {                  \
+        (elm)->field.tqe_prev = (listelm)->field.tqe_prev;              \
+        (elm)->field.tqe_next = (listelm);                              \
+        *(listelm)->field.tqe_prev = (elm);                             \
+        (listelm)->field.tqe_prev = &(elm)->field.tqe_next;             \
+} while (/*CONSTCOND*/0)
+
+#define QTAILQ_REMOVE(head, elm, field) do {                            \
+        if (((elm)->field.tqe_next) != NULL)                            \
+                (elm)->field.tqe_next->field.tqe_prev =                 \
+                    (elm)->field.tqe_prev;                              \
+        else                                                            \
+                (head)->tqh_last = (elm)->field.tqe_prev;               \
+        *(elm)->field.tqe_prev = (elm)->field.tqe_next;                 \
+} while (/*CONSTCOND*/0)
+
+#define QTAILQ_FOREACH(var, head, field)                                \
+        for ((var) = ((head)->tqh_first);                               \
+                (var);                                                  \
+                (var) = ((var)->field.tqe_next))
+
+#define QTAILQ_FOREACH_SAFE(var, head, field, next_var)                 \
+        for ((var) = ((head)->tqh_first);                               \
+                (var) && ((next_var) = ((var)->field.tqe_next), 1);     \
+                (var) = (next_var))
+
+#define QTAILQ_FOREACH_REVERSE(var, head, headname, field)              \
+        for ((var) = (*(((struct headname *)((head)->tqh_last))->tqh_last));    \
+                (var);                                                  \
+                (var) = (*(((struct headname *)((var)->field.tqe_prev))->tqh_last)))
+
+/*
+ * Tail queue access methods.
+ *
+ * QTAILQ_EMPTY(head) is true when the queue has no elements,
+ * QTAILQ_FIRST(head) yields the first element (NULL when the queue is
+ * empty) and QTAILQ_NEXT(elm, field) yields the element after elm (NULL for
+ * the last element).
+ *
+ * QTAILQ_LAST(head, headname) yields the last element and
+ * QTAILQ_PREV(elm, headname, field) the element before elm; both evaluate
+ * to NULL when there is no such element. headname is the struct tag that
+ * was given to QTAILQ_HEAD. Both work by viewing the link structure that
+ * tqh_last / tqe_prev points into as a `struct headname` and following its
+ * tqh_last, which relies on head and entry sharing the same two-pointer
+ * layout.
+ */
+#define QTAILQ_EMPTY(head)               ((head)->tqh_first == NULL)
+#define QTAILQ_FIRST(head)               ((head)->tqh_first)
+#define QTAILQ_NEXT(elm, field)          ((elm)->field.tqe_next)
+
+#define QTAILQ_LAST(head, headname) \
+        (*(((struct headname *)((head)->tqh_last))->tqh_last))
+#define QTAILQ_PREV(elm, headname, field) \
+        (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+#endif  /* !QEMU_SYS_QUEUE_H_ */