From 2d53c2dd7da1303da8a66ab775ff90943138c1b5 Mon Sep 17 00:00:00 2001
From: espes <nielkie2@gmail.com>
Date: Tue, 13 Nov 2012 01:18:46 +1100
Subject: [PATCH] xbox: broken vertex shader translation, and other shit that
 doesn't work.

---
 hw/i386/Makefile.objs |   2 +-
 hw/nv2a.c             | 820 +++++++++++++++++++++++++++++++++---------
 hw/nv2a_vsh.c         | 682 +++++++++++++++++++++++++++++++++++
 hw/nv2a_vsh.h         |  46 +++
 4 files changed, 1373 insertions(+), 177 deletions(-)
 create mode 100644 hw/nv2a_vsh.c
 create mode 100644 hw/nv2a_vsh.h

diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
index f8bf4ae5bc..d96ae1cbdf 100644
--- a/hw/i386/Makefile.objs
+++ b/hw/i386/Makefile.objs
@@ -12,6 +12,6 @@ obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pt.o xen_pt_config_init.o xen_pt_msi.o
 obj-y += kvm/
 obj-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
 
-obj-$(CONFIG_XBOX) += xbox.o xbox_pci.o acpi_xbox.o amd_smbus.o nv2a.o mcpx.o smbus_xbox_smc.o smbus_cx25871.o smbus_adm1032.o
+obj-$(CONFIG_XBOX) += xbox.o xbox_pci.o acpi_xbox.o amd_smbus.o nv2a.o nv2a_vsh.o mcpx.o smbus_xbox_smc.o smbus_cx25871.o smbus_adm1032.o
 
 obj-y := $(addprefix ../,$(obj-y))
diff --git a/hw/nv2a.c b/hw/nv2a.c
index 4c307cfdb4..0c72c537a6 100644
--- a/hw/nv2a.c
+++ b/hw/nv2a.c
@@ -3,20 +3,18 @@
  *
  * Copyright (c) 2012 espes
  *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2 as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
  *
- * This library is distributed in the hope that it will be useful,
+ * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>
- *
- * Contributions after 2012-01-13 are licensed under the terms of the
- * GNU GPL, version 2 or (at your option) any later version.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 #include "hw.h"
 #include "pc.h"
@@ -24,6 +22,8 @@
 #include "pci.h"
 #include "vga.h"
 #include "vga_int.h"
+#include "qstring.h"
+#include "nv2a_vsh.h"
 
 #ifdef __APPLE__
 #include <OpenGL/gl.h>
@@ -176,6 +176,9 @@
 #define NV_PFIFO_CACHE1_DMA_SUBROUTINE                   0x0000124C
 #   define NV_PFIFO_CACHE1_DMA_SUBROUTINE_RETURN_OFFSET       0x1FFFFFFC
 #   define NV_PFIFO_CACHE1_DMA_SUBROUTINE_STATE                (1 << 0)
+#define NV_PFIFO_CACHE1_PULL0                            0x00001250
+#   define NV_PFIFO_CACHE1_PULL0_ACCESS                        (1 << 0)
+#define NV_PFIFO_CACHE1_ENGINE                           0x00001280
 #define NV_PFIFO_CACHE1_DMA_DCOUNT                       0x000012A0
 #   define NV_PFIFO_CACHE1_DMA_DCOUNT_VALUE                   0x00001FFC
 #define NV_PFIFO_CACHE1_DMA_GET_JMP_SHADOW               0x000012A4
@@ -342,10 +345,23 @@
 #   define NV097_SET_CONTEXT_DMA_A                            0x00970184
 #   define NV097_SET_CONTEXT_DMA_B                            0x00970188
 #   define NV097_SET_CONTEXT_DMA_STATE                        0x00970190
-#   define NV097_SET_CONTEXT_DMA_VERTEX_A                     0x0097019c
-#   define NV097_SET_CONTEXT_DMA_VERTEX_B                     0x009701a0
-#   define NV097_SET_CONTEXT_DMA_SEMAPHORE                    0x009701a4
-#   define NV097_SET_BEGIN_END                                0x009717fc
+#   define NV097_SET_CONTEXT_DMA_VERTEX_A                     0x0097019C
+#   define NV097_SET_CONTEXT_DMA_VERTEX_B                     0x009701A0
+#   define NV097_SET_CONTEXT_DMA_SEMAPHORE                    0x009701A4
+#   define NV097_SET_TRANSFORM_PROGRAM                        0x00970B00
+#   define NV097_SET_TRANSFORM_CONSTANT                       0x00970B80
+#   define NV097_SET_VERTEX_DATA_ARRAY_OFFSET                 0x00971720
+#   define NV097_SET_VERTEX_DATA_ARRAY_FORMAT                 0x00971760
+#       define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE            0x0000000F
+#           define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_D3D     0
+#           define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S1         1
+#           define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_F          2
+#           define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_OGL     3
+#           define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S32K       5
+#           define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_CMP        6
+#       define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_SIZE            0x000000F0
+#       define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_STRIDE          0xFFFFFF00
+#   define NV097_SET_BEGIN_END                                0x009717fC
 #       define NV097_SET_BEGIN_END_OP_END                         0x00
 #       define NV097_SET_BEGIN_END_OP_POINTS                      0x01
 #       define NV097_SET_BEGIN_END_OP_LINES                       0x02
@@ -357,8 +373,40 @@
 #       define NV097_SET_BEGIN_END_OP_QUADS                       0x08
 #       define NV097_SET_BEGIN_END_OP_QUAD_STRIP                  0x09
 #       define NV097_SET_BEGIN_END_OP_POLYGON                     0x0A
-#   define NV097_SET_SEMAPHORE_OFFSET                         0x00971d6c
-#   define NV097_BACK_END_WRITE_SEMAPHORE_RELEASE             0x00971d70
+#   define NV097_ARRAY_ELEMENT16                              0x00971800
+#   define NV097_ARRAY_ELEMENT32                              0x00971808
+#   define NV097_DRAW_ARRAYS                                  0x00971810
+#   define NV097_INLINE_ARRAY                                 0x00971818
+#   define NV097_SET_SEMAPHORE_OFFSET                         0x00971D6C
+#   define NV097_BACK_END_WRITE_SEMAPHORE_RELEASE             0x00971D70
+#   define NV097_SET_ZSTENCIL_CLEAR_VALUE                     0x00971D8C
+#   define NV097_SET_COLOR_CLEAR_VALUE                        0x00971D90
+#   define NV097_CLEAR_SURFACE                                0x00971D94
+#       define NV097_CLEAR_SURFACE_Z                              (1 << 0)
+#       define NV097_CLEAR_SURFACE_STENCIL                        (1 << 1)
+#       define NV097_CLEAR_SURFACE_R                              (1 << 4)
+#       define NV097_CLEAR_SURFACE_G                              (1 << 5)
+#       define NV097_CLEAR_SURFACE_B                              (1 << 6)
+#       define NV097_CLEAR_SURFACE_A                              (1 << 7)
+#   define NV097_SET_TRANSFORM_EXECUTION_MODE                 0x00971E94
+#   define NV097_SET_TRANSFORM_PROGRAM_CXT_WRITE_EN           0x00971E98
+#   define NV097_SET_TRANSFORM_PROGRAM_LOAD                   0x00971E9C
+#   define NV097_SET_TRANSFORM_PROGRAM_START                  0x00971EA0
+#   define NV097_SET_TRANSFORM_CONSTANT_LOAD                  0x00971EA4
+
+static const GLenum kelvin_primitive_map[] = { /* index: SET_BEGIN_END op */
+    0, /* 0x00 OP_END: placeholder, END never reaches the lookup */
+    GL_POINTS, /* 0x01 OP_POINTS */
+    GL_LINES, /* 0x02 OP_LINES */
+    GL_LINE_LOOP, /* 0x03 */
+    GL_LINE_STRIP, /* 0x04 */
+    GL_TRIANGLES, /* 0x05 */
+    GL_TRIANGLE_STRIP, /* 0x06 */
+    GL_TRIANGLE_FAN, /* 0x07 */
+    GL_QUADS, /* 0x08 OP_QUADS */
+    GL_QUAD_STRIP, /* 0x09 OP_QUAD_STRIP */
+    GL_POLYGON, /* 0x0A OP_POLYGON */
+};
 
 
 #define NV_MEMORY_TO_MEMORY_FORMAT                           0x00000039
@@ -371,6 +419,12 @@
 #define NV2A_CRYSTAL_FREQ 13500000
 #define NV2A_NUM_CHANNELS 32
 #define NV2A_NUM_SUBCHANNELS 8
+#define NV2A_MAX_PUSHBUFFER_METHOD 2048
+
+#define NV2A_VERTEXSHADER_SLOTS  32 /*???*/
+#define NV2A_MAX_VERTEXSHADER_LENGTH 136
+#define NV2A_VERTEXSHADER_CONSTANTS 192
+#define NV2A_VERTEXSHADER_ATTRIBUTES 16
 
 
 
@@ -379,7 +433,7 @@ enum FifoMode {
     FIFO_DMA = 1,
 };
 
-enum RAMHTEngine {
+enum FIFOEngine {
     ENGINE_SOFTWARE = 0,
     ENGINE_GRAPHICS = 1,
     ENGINE_DVD = 2,
@@ -390,7 +444,7 @@ enum RAMHTEngine {
 typedef struct RAMHTEntry {
     uint32_t handle;
     hwaddr instance;
-    enum RAMHTEngine engine;
+    enum FIFOEngine engine;
     unsigned int channel_id : 5;
     bool valid;
 } RAMHTEntry;
@@ -403,37 +457,87 @@ typedef struct DMAObject {
 } DMAObject;
 
 
+
+
+typedef struct VertexAttribute { /* decoded SET_VERTEX_DATA_ARRAY_FORMAT */
+    GLenum gl_type; /* GL component type passed to glVertexAttribPointer */
+    GLboolean gl_normalize; /* GL_TRUE only for the unsigned-byte formats */
+    unsigned int size; /* size in bytes of one component of gl_type */
+    unsigned int count; /* number of components; 0 disables the attribute */
+    uint32_t stride; /* FORMAT_STRIDE field, passed to GL as the stride */
+} VertexAttribute;
+
+typedef struct VertexShaderConstant {
+    uint32_t data[16]; /* raw dwords; GL is handed data as 4 floats */
+} VertexShaderConstant;
+
+typedef struct VertexShader { /* one vertex program slot */
+    unsigned int program_length; /* dwords stored in program_data */
+    uint32_t program_data[NV2A_MAX_VERTEXSHADER_LENGTH]; /* raw microcode */
+    /* set when the slot is reloaded; the GL program is rebuilt on next bind */
+    bool dirty;
+    GLuint gl_program; /* ARB program name from glGenProgramsARB */
+} VertexShader;
+
+typedef struct KelvinState { /* per-object state of an NV_KELVIN_PRIMITIVE */
+    hwaddr dma_notifies; /* the dma_* fields hold SET_CONTEXT_DMA_* params */
+    hwaddr dma_a;
+    hwaddr dma_b;
+    hwaddr dma_state;
+    hwaddr dma_vertex_a;
+    hwaddr dma_vertex_b;
+    hwaddr dma_semaphore;
+    unsigned int semaphore_offset;
+
+    unsigned int vertexshader_start_slot; /* slot bound when drawing */
+    unsigned int vertexshader_load_slot; /* slot receiving program dwords */
+    VertexShader vertexshaders[NV2A_VERTEXSHADER_SLOTS];
+
+    unsigned int constant_load_slot; /* constant receiving uploaded dwords */
+    VertexShaderConstant constants[NV2A_VERTEXSHADER_CONSTANTS];
+
+    /* primitive type of the draw being accumulated (set on BEGIN) */
+    GLenum gl_primitive_mode;
+
+    VertexAttribute vertex_attributes[NV2A_VERTEXSHADER_ATTRIBUTES];
+
+    struct {
+        uint32_t offset; /* low 31 bits of SET_VERTEX_DATA_ARRAY_OFFSET */
+        bool dma_select; /* top bit: true selects dma_vertex_b */
+    } vertex_attribute_offsets[NV2A_VERTEXSHADER_ATTRIBUTES];
+
+    unsigned int inline_vertex_data_offset; /* dwords used in the buffer */
+    uint32_t inline_vertex_data[NV2A_MAX_PUSHBUFFER_METHOD];
+
+    unsigned int array_batch_offset; /* indices used in array_batch */
+    uint32_t array_batch[NV2A_MAX_PUSHBUFFER_METHOD];
+
+    bool use_vertex_program; /* SET_TRANSFORM_EXECUTION_MODE selected mode 2 */
+    bool enable_vertex_program_write; /* from ..._PROGRAM_CXT_WRITE_EN */
+} KelvinState;
+
 typedef struct GraphicsObject {
     uint8_t graphics_class;
     union {
         struct {
             hwaddr dma_notifies;
-        } memory_to_memory_format;
+        } m2mf;
 
-        struct {
-            hwaddr dma_notifies;
-            hwaddr dma_a;
-            hwaddr dma_b;
-            hwaddr dma_state;
-            hwaddr dma_vertex_a;
-            hwaddr dma_vertex_b;
-            hwaddr dma_semaphore;
-            unsigned int semaphore_offset;
-        } kelvin_primitive;
+        KelvinState kelvin;
     } data;
 } GraphicsObject;
 
-typedef struct GraphicsSubchannelData {
+typedef struct GraphicsSubchannel {
     hwaddr object_instance;
     GraphicsObject object;
     uint32_t object_cache[5];
-} GraphicsSubchannelData;
+} GraphicsSubchannel;
 
 typedef struct GraphicsContext {
     bool channel_3d;
     unsigned int subchannel;
 
-    GraphicsSubchannelData subchannel_data[NV2A_NUM_SUBCHANNELS];
+    GraphicsSubchannel subchannel_data[NV2A_NUM_SUBCHANNELS];
 
 
 
@@ -450,10 +554,9 @@ typedef struct Cache1State {
     unsigned int channel_id;
     enum FifoMode mode;
 
+    /* Pusher state */
     bool push_enabled;
     bool dma_push_enabled;
-
-    /* Pusher state */
     hwaddr dma_instance;
     bool method_nonincreasing;
     unsigned int method : 14;
@@ -468,8 +571,9 @@ typedef struct Cache1State {
     uint32_t error;
 
     /* Puller state */
-    uint8_t bound_engines[NV2A_NUM_SUBCHANNELS];
-    unsigned int last_engine : 5;
+    bool pull_enabled;
+    enum FIFOEngine bound_engines[NV2A_NUM_SUBCHANNELS];
+    enum FIFOEngine last_engine;
 } Cache1State;
 
 typedef struct ChannelControl {
@@ -487,6 +591,7 @@ typedef struct NV2AState {
     VGACommonState vga;
 
     MemoryRegion vram;
+    uint8_t *vram_ptr;
     MemoryRegion ramin;
     uint8_t *ramin_ptr;
 
@@ -661,6 +766,7 @@ static DMAObject nv2a_load_dma_object(NV2AState *d,
 static GraphicsObject nv2a_load_graphics_object(NV2AState *d,
                                                 hwaddr address)
 {
+    int i;
     uint8_t *obj_ptr;
     uint32_t switch1, switch2, switch3;
 
@@ -670,27 +776,247 @@ static GraphicsObject nv2a_load_graphics_object(NV2AState *d,
     switch2 = le32_to_cpupu((uint32_t*)(obj_ptr+4));
     switch3 = le32_to_cpupu((uint32_t*)(obj_ptr+8));
 
-    return (GraphicsObject){
-        .graphics_class = switch1 & NV_PGRAPH_CTX_SWITCH1_GRCLASS,
-    };
+    /* Zero the whole object: the Kelvin handlers read offsets, lengths and
+     * flags (program_length, array_batch_offset, ...) before writing them. */
+    GraphicsObject ret;
+    memset(&ret, 0, sizeof(ret));
+    ret.graphics_class = switch1 & NV_PGRAPH_CTX_SWITCH1_GRCLASS;
+
+    /* init graphics object */
+    KelvinState *kelvin;
+    switch (ret.graphics_class) {
+    case NV_KELVIN_PRIMITIVE:
+        kelvin = &ret.data.kelvin;
+        /* generate vertex programs */
+        for (i=0; i<NV2A_VERTEXSHADER_SLOTS; i++) {
+            VertexShader *shader = &kelvin->vertexshaders[i];
+            glGenProgramsARB(1, &shader->gl_program);
+        }
+        assert(glGetError() == GL_NO_ERROR);
+        break;
+    default:
+        break;
+    }
+    return ret;
 }
 
 
+/* Point every enabled attribute at the packed inline vertex data and
+ * return the size in bytes of one vertex; unused attributes are disabled. */
+static unsigned int kelvin_bind_inline_vertex_data(KelvinState *kelvin)
+{
+    int i;
+    unsigned int offset = 0;
+    for (i=0; i<NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
+        VertexAttribute *attribute = &kelvin->vertex_attributes[i];
+        if (attribute->count) {
+            /* offset counts bytes, so step through the dword buffer as bytes */
+            glVertexAttribPointer(i,
+                attribute->count,
+                attribute->gl_type,
+                attribute->gl_normalize,
+                attribute->stride,
+                (uint8_t *)kelvin->inline_vertex_data + offset);
+            glEnableVertexAttribArray(i);
+            offset += attribute->size * attribute->count;
+        } else {
+            glDisableVertexAttribArray(i);
+        }
+    }
+    return offset;
+}
+
+/* Bind each enabled attribute to its vertex buffer in guest VRAM, located
+ * through vertex DMA object A or B; attributes with count 0 are disabled. */
+static void kelvin_bind_vertex_attribute_offsets(NV2AState *d,
+                                                 KelvinState *kelvin)
+{
+    int i;
+    for (i=0; i<NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
+        VertexAttribute *attribute = &kelvin->vertex_attributes[i];
+        if (attribute->count) {
+            bool use_b = kelvin->vertex_attribute_offsets[i].dma_select;
+            DMAObject vertex_dma = nv2a_load_dma_object(d,
+                use_b ? kelvin->dma_vertex_b : kelvin->dma_vertex_a);
+            uint32_t offset = kelvin->vertex_attribute_offsets[i].offset;
+            assert(offset < vertex_dma.limit);
+            if (vertex_dma.dma_class == NV_DMA_IN_MEMORY_CLASS) {
+                glVertexAttribPointer(i,
+                    attribute->count,
+                    attribute->gl_type,
+                    attribute->gl_normalize,
+                    attribute->stride,
+                    d->vram_ptr + vertex_dma.start + offset);
+                /* enable it, as kelvin_bind_inline_vertex_data does */
+                glEnableVertexAttribArray(i);
+            } else {
+                assert(false);
+            }
+        } else {
+            glDisableVertexAttribArray(i);
+        }
+    }
+}
+
+/* Bind the vertex program in the start slot, re-translating and uploading
+ * it first if its microcode changed, then upload all program constants. */
+static void kelvin_bind_vertexshader(KelvinState *kelvin)
+{
+    int i;
+    VertexShader *shader;
+
+    assert(kelvin->use_vertex_program);
+    /* TODO */
+    assert(!kelvin->enable_vertex_program_write);
+
+    shader = &kelvin->vertexshaders[kelvin->vertexshader_start_slot];
+    glBindProgramARB(GL_VERTEX_PROGRAM_ARB, shader->gl_program);
+
+    if (shader->dirty) {
+        QString *shader_code = vsh_translate(VSH_VERSION_XVS,
+                                             shader->program_data,
+                                             shader->program_length);
+        const char* shader_code_str = qstring_get_str(shader_code);
+
+        NV2A_DPRINTF("nv2a bind shader %d, code:\n%s\n",
+                     kelvin->vertexshader_start_slot,
+                     shader_code_str);
+
+        glProgramStringARB(GL_VERTEX_PROGRAM_ARB,
+                           GL_PROGRAM_FORMAT_ASCII_ARB,
+                           strlen(shader_code_str),
+                           shader_code_str);
+
+        /* Check it compiled */
+        GLint pos;
+        glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
+        if (pos != -1) {
+            fprintf(stderr, "nv2a: Shader compilation failed:\n"
+                            "      pos %d, %s\n",
+                    pos, glGetString(GL_PROGRAM_ERROR_STRING_ARB));
+            fprintf(stderr, "ucode:\n");
+            for (i=0; i<shader->program_length; i++) {
+                fprintf(stderr, "    0x%08x,\n", shader->program_data[i]);
+            }
+            abort();
+        }
+
+        /* Check we're within resource limits; query the vertex target,
+         * since that is where the program was just loaded */
+        GLint native;
+        glGetProgramivARB(GL_VERTEX_PROGRAM_ARB,
+                          GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB,
+                          &native);
+        assert(native);
+        QDECREF(shader_code);
+        shader->dirty = false;
+    }
+
+    /* load constants */
+    for (i=0; i<NV2A_VERTEXSHADER_CONSTANTS; i++) {
+        VertexShaderConstant *constant = &kelvin->constants[i];
+        glProgramEnvParameter4fvARB(GL_VERTEX_PROGRAM_ARB,
+                                    i,
+                                    (const GLfloat*)constant->data);
+    }
+
+    assert(glGetError() == GL_NO_ERROR);
+}
+
+
+/* Create the GL context for a graphics context, assert that the needed
+ * extensions and attribute count are available, and build its offscreen
+ * framebuffer. No GL context is left current on return. */
+static void nv2a_pgraph_context_init(GraphicsContext *context)
+{
+    /* TODO: context creation on linux */
+    CGLPixelFormatAttribute attributes[] = {
+        kCGLPFAAccelerated,
+        (CGLPixelFormatAttribute)0
+    };
+
+    CGLPixelFormatObj pix;
+    GLint num;
+    CGLChoosePixelFormat(attributes, &pix, &num);
+    CGLCreateContext(pix, NULL, &context->gl_context);
+    CGLDestroyPixelFormat(pix);
+
+    CGLSetCurrentContext(context->gl_context);
+
+    /* Check context capabilities */
+    const GLubyte *extensions;
+    extensions = glGetString (GL_EXTENSIONS);
+
+    assert(gluCheckExtension((const GLubyte*)"GL_EXT_framebuffer_object",
+                             extensions));
+
+    assert(gluCheckExtension((const GLubyte*)"GL_ARB_vertex_program",
+                             extensions));
+
+    GLint max_vertex_attributes;
+    glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &max_vertex_attributes);
+    assert(max_vertex_attributes >= NV2A_VERTEXSHADER_ATTRIBUTES);
+
+    /* render target: a framebuffer with one 640x480 RGBA8 renderbuffer */
+    glGenFramebuffersEXT(1, &context->gl_framebuffer);
+    glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, context->gl_framebuffer);
+
+    glGenRenderbuffersEXT(1, &context->gl_renderbuffer);
+    glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, context->gl_renderbuffer);
+    glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGBA8,
+                             640, 480);
+    glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT,
+                                 GL_COLOR_ATTACHMENT0_EXT,
+                                 GL_RENDERBUFFER_EXT,
+                                 context->gl_renderbuffer);
+
+    assert(glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT)
+            == GL_FRAMEBUFFER_COMPLETE_EXT);
+
+    assert(glGetError() == GL_NO_ERROR);
+
+    /* clear the current context again */
+    CGLSetCurrentContext(NULL);
+}
+
+/* Select which GL context is current.
+ * A non-NULL `context` makes its gl_context current;
+ * a NULL `context` clears the current GL context. */
+static void nv2a_pgraph_context_set_current(GraphicsContext *context)
+{
+    /* the ternary keeps a NULL context from being dereferenced */
+    CGLSetCurrentContext(context ? context->gl_context : NULL);
+}
+
+/* Delete the renderbuffer and framebuffer, then the GL context itself. */
+static void nv2a_pgraph_context_destroy(GraphicsContext *context)
+{
+    /* make this context current before deleting its GL objects */
+    CGLSetCurrentContext(context->gl_context);
+    glDeleteRenderbuffersEXT(1, &context->gl_renderbuffer);
+    glDeleteFramebuffersEXT(1, &context->gl_framebuffer);
+    /* clear the current context before destroying it */
+    CGLSetCurrentContext(NULL);
+    CGLDestroyContext(context->gl_context);
+}
+
 static void nv2a_pgraph_method(NV2AState *d,
                                unsigned int subchannel,
                                unsigned int method,
                                uint32_t parameter)
 {
     //assert(d->pgraph.channel_valid);
-
     GraphicsContext *context = &d->pgraph.context[d->pgraph.channel_id];
-    GraphicsSubchannelData *subchannel_data =
+    GraphicsSubchannel *subchannel_data =
         &context->subchannel_data[subchannel];
     GraphicsObject *object = &subchannel_data->object;
 
     NV2A_DPRINTF("nv2a pgraph method: 0x%x, 0x%x, 0x%x\n",
                  subchannel, method, parameter);
 
+
+    nv2a_pgraph_context_set_current(context);
+
     if (method == NV_SET_OBJECT) {
         subchannel_data->object_instance = parameter;
         *object = nv2a_load_graphics_object(d, parameter);
@@ -698,91 +1024,241 @@ static void nv2a_pgraph_method(NV2AState *d,
         return;
     }
 
+
+    KelvinState *kelvin = &object->data.kelvin;
+
     DMAObject dma_semaphore;
-
-    switch ((object->graphics_class << 16) | method) {
-        case NV_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY:
-            object->data.memory_to_memory_format.dma_notifies = parameter;
-            break;
+    unsigned int slot;
+    VertexAttribute *vertex_attribute;
+    VertexShader *vertexshader;
+    VertexShaderConstant *constant;
 
 
-        case NV097_NO_OPERATION:
-            break;
-        case NV097_WAIT_FOR_IDLE:
-            break;
-        case NV097_SET_CONTEXT_DMA_NOTIFIES:
-            object->data.kelvin_primitive.dma_notifies = parameter;
-            break;
-        case NV097_SET_CONTEXT_DMA_A:
-            object->data.kelvin_primitive.dma_a = parameter;
-            break;
-        case NV097_SET_CONTEXT_DMA_B:
-            object->data.kelvin_primitive.dma_b = parameter;
-            break;
-        case NV097_SET_CONTEXT_DMA_STATE:
-            object->data.kelvin_primitive.dma_state = parameter;
-            break;
-        case NV097_SET_CONTEXT_DMA_VERTEX_A:
-            object->data.kelvin_primitive.dma_vertex_a = parameter;
-            break;
-        case NV097_SET_CONTEXT_DMA_VERTEX_B:
-            object->data.kelvin_primitive.dma_vertex_b = parameter;
-            break;
-        case NV097_SET_CONTEXT_DMA_SEMAPHORE:
-            object->data.kelvin_primitive.dma_semaphore = parameter;
-            break;
+    uint32_t class_method = (object->graphics_class << 16) | method;
+    switch (class_method) {
+    case NV_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY:
+        object->data.m2mf.dma_notifies = parameter;
+        break;
 
-        case NV097_SET_BEGIN_END:
-            if (parameter == NV097_SET_BEGIN_END_OP_END) {
-                glEnd();
-            } else {
-                GLenum mode_map[] = {
-                    0,
-                    GL_POINTS,
-                    GL_LINES,
-                    GL_LINE_LOOP,
-                    GL_LINE_STRIP,
-                    GL_TRIANGLES,
-                    GL_TRIANGLE_STRIP,
-                    GL_TRIANGLE_FAN,
-                    GL_QUADS,
-                    GL_QUAD_STRIP,
-                    GL_POLYGON,
-                };
-                assert(parameter <= NV097_SET_BEGIN_END_OP_POLYGON);
 
-                glBegin(mode_map[parameter]);
-            }
+    case NV097_NO_OPERATION:
+        break;
+    case NV097_WAIT_FOR_IDLE:
+        break;
+    case NV097_SET_CONTEXT_DMA_NOTIFIES:
+        kelvin->dma_notifies = parameter;
+        break;
+    case NV097_SET_CONTEXT_DMA_A:
+        kelvin->dma_a = parameter;
+        break;
+    case NV097_SET_CONTEXT_DMA_B:
+        kelvin->dma_b = parameter;
+        break;
+    case NV097_SET_CONTEXT_DMA_STATE:
+        kelvin->dma_state = parameter;
+        break;
+    case NV097_SET_CONTEXT_DMA_VERTEX_A:
+        kelvin->dma_vertex_a = parameter;
+        break;
+    case NV097_SET_CONTEXT_DMA_VERTEX_B:
+        kelvin->dma_vertex_b = parameter;
+        break;
+    case NV097_SET_CONTEXT_DMA_SEMAPHORE:
+        kelvin->dma_semaphore = parameter;
+        break;
 
-            assert(e == GL_NO_ERROR);
+    case NV097_SET_TRANSFORM_PROGRAM ...
+            NV097_SET_TRANSFORM_PROGRAM + 0x7c:
+
+        slot = (class_method - NV097_SET_TRANSFORM_PROGRAM) / 4;
+        /* TODO: It should still work using a non-increasing slot??? */
+
+        vertexshader = &kelvin->vertexshaders[kelvin->vertexshader_load_slot];
+        assert(vertexshader->program_length < NV2A_MAX_VERTEXSHADER_LENGTH);
+        vertexshader->program_data[
+            vertexshader->program_length++] = parameter;
+        break;
+
+    case NV097_SET_TRANSFORM_CONSTANT ...
+            NV097_SET_TRANSFORM_CONSTANT + 0x7c:
+        /* 32 dwords = 8 vec4s from constant_load_slot (TODO: verify on HW) */
+        slot = (class_method - NV097_SET_TRANSFORM_CONSTANT) / 4;
+        slot += kelvin->constant_load_slot * 4; /* flat dword index */
+        assert(slot / 4 < NV2A_VERTEXSHADER_CONSTANTS);
+        constant = &kelvin->constants[slot / 4];
+        constant->data[slot % 4] = parameter; /* component, always in bounds */
+        break;
+
+    case NV097_SET_VERTEX_DATA_ARRAY_FORMAT ...
+            NV097_SET_VERTEX_DATA_ARRAY_FORMAT + 0x3c:
+
+        slot = (class_method - NV097_SET_VERTEX_DATA_ARRAY_FORMAT) / 4;
+        vertex_attribute = &kelvin->vertex_attributes[slot];
+
+        switch (parameter & NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE) {
+        case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_D3D:
+        case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_OGL:
+            vertex_attribute->gl_type = GL_UNSIGNED_BYTE;
+            vertex_attribute->gl_normalize = GL_TRUE;
+            vertex_attribute->size = 1;
             break;
-        case NV097_SET_SEMAPHORE_OFFSET:
-            object->data.kelvin_primitive.semaphore_offset = parameter;
+        case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S1:
+            vertex_attribute->gl_type = GL_SHORT;
+            vertex_attribute->gl_normalize = GL_FALSE;
+            vertex_attribute->size = 2;
             break;
-        case NV097_BACK_END_WRITE_SEMAPHORE_RELEASE:
-            dma_semaphore = nv2a_load_dma_object(d,
-                                object->data.kelvin_primitive.dma_semaphore);
-
-            assert(object->data.kelvin_primitive.semaphore_offset
-                    < dma_semaphore.limit);
-
-            stl_le_phys(dma_semaphore.start
-                         + object->data.kelvin_primitive.semaphore_offset,
-                        parameter);
+        case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_F:
+            vertex_attribute->gl_type = GL_FLOAT;
+            vertex_attribute->gl_normalize = GL_FALSE;
+            vertex_attribute->size = 4;
+            break;
+        case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S32K:
+            vertex_attribute->gl_type = GL_UNSIGNED_SHORT;
+            vertex_attribute->gl_normalize = GL_FALSE;
+            vertex_attribute->size = 2;
             break;
         default:
-            NV2A_DPRINTF("    unhandled  (0x%02x 0x%08x  -  0x%x)\n",
-                         object->graphics_class, method, parameter);
+            assert(false);
             break;
+        }
+        vertex_attribute->count =
+            (parameter & NV097_SET_VERTEX_DATA_ARRAY_FORMAT_SIZE) >> 4;
+        vertex_attribute->stride =
+            (parameter & NV097_SET_VERTEX_DATA_ARRAY_FORMAT_STRIDE) >> 8;
+
+        break;
+    case NV097_SET_VERTEX_DATA_ARRAY_OFFSET ...
+            NV097_SET_VERTEX_DATA_ARRAY_OFFSET + 0x3c:
+
+        slot = (class_method - NV097_SET_VERTEX_DATA_ARRAY_OFFSET) / 4;
+
+        kelvin->vertex_attribute_offsets[slot].dma_select =
+            parameter & 0x80000000;
+        kelvin->vertex_attribute_offsets[slot].offset =
+            parameter & 0x7fffffff;
+
+        break;
+
+    case NV097_SET_BEGIN_END:
+        if (parameter == NV097_SET_BEGIN_END_OP_END) {
+            if (kelvin->use_vertex_program) {
+                glEnable(GL_VERTEX_PROGRAM_ARB);
+                kelvin_bind_vertexshader(kelvin);
+            } else {
+                glDisable(GL_VERTEX_PROGRAM_ARB);
+            }
+
+            if (kelvin->inline_vertex_data_offset) {
+                unsigned int vertex_size =
+                    kelvin_bind_inline_vertex_data(kelvin);
+                unsigned int vertex_count =
+                    kelvin->inline_vertex_data_offset*4 / vertex_size;
+                glDrawArrays(kelvin->gl_primitive_mode,
+                             0, vertex_count);
+            } else if (kelvin->array_batch_offset) {
+                kelvin_bind_vertex_attribute_offsets(d, kelvin);
+
+                glDrawElements(kelvin->gl_primitive_mode,
+                               kelvin->array_batch_offset,
+                               GL_UNSIGNED_INT,
+                               kelvin->array_batch);
+            } else {
+                assert(false);
+            }
+            assert(glGetError() == GL_NO_ERROR);
+        } else {
+            assert(parameter <= NV097_SET_BEGIN_END_OP_POLYGON);
+
+            kelvin->gl_primitive_mode = kelvin_primitive_map[parameter];
+
+            kelvin->array_batch_offset = 0;
+            kelvin->inline_vertex_data_offset = 0;
+        }
+        break;
+    case NV097_ARRAY_ELEMENT16:
+        /* each parameter packs two 16-bit indices, so two free entries
+         * are required, not one */
+        assert(kelvin->array_batch_offset + 1 < NV2A_MAX_PUSHBUFFER_METHOD);
+        kelvin->array_batch[kelvin->array_batch_offset++] = parameter & 0xFFFF;
+        kelvin->array_batch[kelvin->array_batch_offset++] = parameter >> 16;
+        break;
+    case NV097_ARRAY_ELEMENT32:
+        assert(kelvin->array_batch_offset < NV2A_MAX_PUSHBUFFER_METHOD);
+        kelvin->array_batch[
+            kelvin->array_batch_offset++] = parameter;
+        break;
+    case NV097_INLINE_ARRAY:
+        assert(kelvin->inline_vertex_data_offset < NV2A_MAX_PUSHBUFFER_METHOD);
+        kelvin->inline_vertex_data[
+            kelvin->inline_vertex_data_offset++] = parameter;
+        break;
+
+    case NV097_SET_SEMAPHORE_OFFSET:
+        kelvin->semaphore_offset = parameter;
+        break;
+    case NV097_BACK_END_WRITE_SEMAPHORE_RELEASE:
+        dma_semaphore = nv2a_load_dma_object(d, kelvin->dma_semaphore);
+
+        assert(kelvin->semaphore_offset < dma_semaphore.limit);
+
+        stl_le_phys(dma_semaphore.start + kelvin->semaphore_offset,
+                    parameter);
+        break;
+
+    case NV097_CLEAR_SURFACE:
+        /* QQQ */
+        printf("------------------CLEAR 0x%x---------------\n", parameter);
+        glClearColor(1, 0, 0, 1);
+
+        GLbitfield gl_mask = 0;
+        if (parameter & NV097_CLEAR_SURFACE_Z) {
+            gl_mask |= GL_DEPTH_BUFFER_BIT;
+        }
+        if (parameter & NV097_CLEAR_SURFACE_STENCIL) {
+            gl_mask |= GL_STENCIL_BUFFER_BIT;
+        }
+        if (parameter & (
+                NV097_CLEAR_SURFACE_R | NV097_CLEAR_SURFACE_G
+                | NV097_CLEAR_SURFACE_B | NV097_CLEAR_SURFACE_A)) {
+            gl_mask |= GL_COLOR_BUFFER_BIT;
+        }
+        glClear(gl_mask);
+        break;
+
+    case NV097_SET_TRANSFORM_EXECUTION_MODE:
+        kelvin->use_vertex_program = (parameter & 3) == 2;
+        break;
+    case NV097_SET_TRANSFORM_PROGRAM_CXT_WRITE_EN:
+        kelvin->enable_vertex_program_write = parameter;
+        break;
+    case NV097_SET_TRANSFORM_PROGRAM_LOAD:
+        assert(parameter < NV2A_VERTEXSHADER_SLOTS);
+        kelvin->vertexshader_load_slot = parameter;
+        kelvin->vertexshaders[parameter].program_length = 0; /* ??? */
+        kelvin->vertexshaders[parameter].dirty = true;
+        break;
+    case NV097_SET_TRANSFORM_PROGRAM_START:
+        assert(parameter < NV2A_VERTEXSHADER_SLOTS);
+        kelvin->vertexshader_start_slot = parameter;
+        break;
+    case NV097_SET_TRANSFORM_CONSTANT_LOAD:
+        assert(parameter < NV2A_VERTEXSHADER_CONSTANTS);
+        kelvin->constant_load_slot = parameter;
+        printf("load to %d\n", parameter);
+        break;
+
+    default:
+        NV2A_DPRINTF("    unhandled  (0x%02x 0x%08x)\n",
+                     object->graphics_class, method);
+        break;
     }
 }
 
-
-static void nv2a_cache_push(NV2AState *d,
-                            unsigned int subchannel,
-                            unsigned int method,
-                            uint32_t parameter,
-                            bool nonincreasing)
+static void nv2a_fifo_cache1_push(NV2AState *d,
+                                  unsigned int subchannel,
+                                  unsigned int method,
+                                  uint32_t parameter,
+                                  bool nonincreasing)
 {
     Cache1State *state = &d->pfifo.cache1;
 
@@ -805,6 +1281,11 @@ static void nv2a_cache_push(NV2AState *d,
             assert(false);
             break;
         case ENGINE_GRAPHICS:
+            /*if (!d->pgraph.channel_valid) {
+                d->pgraph.pending_interrupts |= NV_PGRAPH_INTR_CONTEXT_SWITCH;
+                nv2a_update_irq(d);
+                return -1;
+            }*/
             nv2a_pgraph_method(d, subchannel, 0, entry.instance);
             break;
         default:
@@ -847,7 +1328,7 @@ static void nv2a_cache_push(NV2AState *d,
 
 }
 
-static void nv2a_run_pusher(NV2AState *d) {
+static void nv2a_fifo_run_pusher(NV2AState *d) {
     uint8_t channel_id;
     ChannelControl *control;
     Cache1State *state;
@@ -892,8 +1373,8 @@ static void nv2a_run_pusher(NV2AState *d) {
             /* data word of methods command */
             state->data_shadow = word;
 
-            nv2a_cache_push(d, state->subchannel, state->method, word,
-                            state->method_nonincreasing);
+            nv2a_fifo_cache1_push(d, state->subchannel, state->method, word,
+                                  state->method_nonincreasing);
 
             if (!state->method_nonincreasing) {
                 state->method += 4;
@@ -963,61 +1444,6 @@ static void nv2a_run_pusher(NV2AState *d) {
 
 
 
-static void nv2a_pgraph_context_init(GraphicsContext *context)
-{
-    /* TODO: context creation on linux */
-    CGLPixelFormatAttribute attributes[] = {
-        kCGLPFAAccelerated,
-        (CGLPixelFormatAttribute)0
-    };
-
-    CGLPixelFormatObj pix;
-    GLint num;
-    CGLChoosePixelFormat(attributes, &pix, &num);
-    CGLCreateContext(pix, NULL, &context->gl_context);
-    CGLDestroyPixelFormat(pix);
-
-    CGLSetCurrentContext(context->gl_context);
-
-
-    const GLubyte *extensions;
-    extensions = glGetString (GL_EXTENSIONS);
-
-    assert(gluCheckExtension((const GLubyte*)"GL_EXT_framebuffer_object",
-                             extensions));
-
-    glGenFramebuffersEXT(1, &context->gl_framebuffer);
-    glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, context->gl_framebuffer);
-
-    glGenRenderbuffersEXT(1, &context->gl_renderbuffer);
-    glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, context->gl_renderbuffer);
-    glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGBA8,
-                             640, 480);
-    glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT,
-                                 GL_COLOR_ATTACHMENT0_EXT,
-                                 GL_RENDERBUFFER_EXT,
-                                 context->gl_renderbuffer);
-
-    assert(glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT)
-            == GL_FRAMEBUFFER_COMPLETE_EXT);
-
-    assert(glGetError() == GL_NO_ERROR);
-}
-
-static void nv2a_pgraph_context_set_current(GraphicsContext *context)
-{
-    printf("set current\n");
-    CGLSetCurrentContext(context->gl_context);
-}
-
-static void nv2a_pgraph_context_destroy(GraphicsContext *context)
-{
-    glDeleteRenderbuffersEXT(1, &context->gl_renderbuffer);
-    glDeleteFramebuffersEXT(1, &context->gl_framebuffer);
-
-    CGLDestroyContext(context->gl_context);
-}
-
 
 
 
@@ -1119,6 +1545,7 @@ static void nv2a_pbus_write(void *opaque, hwaddr addr,
 static uint64_t nv2a_pfifo_read(void *opaque,
                                   hwaddr addr, unsigned int size)
 {
+    int i;
     NV2AState *d = opaque;
 
     uint64_t r = 0;
@@ -1202,6 +1629,14 @@ static uint64_t nv2a_pfifo_read(void *opaque,
         r = d->pfifo.cache1.subroutine_return
             | d->pfifo.cache1.subroutine_active;
         break;
+    case NV_PFIFO_CACHE1_PULL0:
+        r = d->pfifo.cache1.pull_enabled;
+        break;
+    case NV_PFIFO_CACHE1_ENGINE:
+        for (i=0; i<NV2A_NUM_SUBCHANNELS; i++) {
+            r |= d->pfifo.cache1.bound_engines[i] << (i*2);
+        }
+        break;
     case NV_PFIFO_CACHE1_DMA_DCOUNT:
         r = d->pfifo.cache1.dcount;
         break;
@@ -1224,6 +1659,7 @@ static uint64_t nv2a_pfifo_read(void *opaque,
 static void nv2a_pfifo_write(void *opaque, hwaddr addr,
                                uint64_t val, unsigned int size)
 {
+    int i;
     NV2AState *d = opaque;
 
     NV2A_DPRINTF("nv2a PFIFO: [0x%llx] = 0x%02llx\n", addr, val);
@@ -1283,7 +1719,7 @@ static void nv2a_pfifo_write(void *opaque, hwaddr addr,
         break;
     case NV_PFIFO_CACHE1_DMA_PUSH:
         d->pfifo.cache1.dma_push_enabled =
-            val & NV_PFIFO_CACHE1_DMA_PUSH_ACCESS;
+            (val & NV_PFIFO_CACHE1_DMA_PUSH_ACCESS);
         break;
     case NV_PFIFO_CACHE1_DMA_STATE:
         d->pfifo.cache1.method_nonincreasing =
@@ -1312,6 +1748,15 @@ static void nv2a_pfifo_write(void *opaque, hwaddr addr,
         d->pfifo.cache1.subroutine_active =
             (val & NV_PFIFO_CACHE1_DMA_SUBROUTINE_STATE);
         break;
+    case NV_PFIFO_CACHE1_PULL0:
+        d->pfifo.cache1.pull_enabled =
+            (val & NV_PFIFO_CACHE1_PULL0_ACCESS);
+        break;
+    case NV_PFIFO_CACHE1_ENGINE:
+        for (i=0; i<NV2A_NUM_SUBCHANNELS; i++) {
+            d->pfifo.cache1.bound_engines[i] = (val >> (i*2)) & 3;
+        }
+        break;
     case NV_PFIFO_CACHE1_DMA_DCOUNT:
         d->pfifo.cache1.dcount =
             (val & NV_PFIFO_CACHE1_DMA_DCOUNT_VALUE);
@@ -1601,6 +2046,7 @@ static void nv2a_pgraph_write(void *opaque, hwaddr addr,
         d->pgraph.channel_valid = (val & NV_PGRAPH_CTX_CONTROL_CHID);
         break;
     case NV_PGRAPH_CTX_USER:
+        printf("ppp ctx_user %d\n", (bool)(val & NV_PGRAPH_CTX_USER_CHANNEL_3D));
         d->pgraph.channel_id = (val & NV_PGRAPH_CTX_USER_CHID) >> 24;
 
         d->pgraph.context[d->pgraph.channel_id].channel_3d =
@@ -1608,19 +2054,24 @@ static void nv2a_pgraph_write(void *opaque, hwaddr addr,
         d->pgraph.context[d->pgraph.channel_id].subchannel =
             (val & NV_PGRAPH_CTX_USER_SUBCH) >> 13;
 
-        nv2a_pgraph_context_set_current(
-            &d->pgraph.context[d->pgraph.channel_id]);
+        /* QQQ */
+        d->pgraph.context[d->pgraph.channel_id].channel_3d = true;
 
         break;
     case NV_PGRAPH_CHANNEL_CTX_TABLE:
+        printf("ppp11 %llx - %x\n", val,
+            le32_to_cpupu((uint32_t*)(d->ramin_ptr+val)));
         d->pgraph.context_table = val & NV_PGRAPH_CHANNEL_CTX_TABLE_INST;
         break;
     case NV_PGRAPH_CHANNEL_CTX_POINTER:
+        printf("ppp1 %llx\n", val);
         d->pgraph.context_pointer = val & NV_PGRAPH_CHANNEL_CTX_POINTER_INST;
         break;
     case NV_PGRAPH_CHANNEL_CTX_TRIGGER:
         if (val & NV_PGRAPH_CHANNEL_CTX_TRIGGER_READ_IN) {
             /* do stuff ... */
+            printf("ppp %llx\n", d->pgraph.context_pointer);
+            printf("ppp_ %x\n", le32_to_cpupu((uint32_t*)(d->ramin_ptr+d->pgraph.context_pointer)));
         }
         if (val & NV_PGRAPH_CHANNEL_CTX_TRIGGER_WRITE_OUT) {
             /* do stuff ... */
@@ -1873,7 +2324,7 @@ static void nv2a_user_write(void *opaque, hwaddr addr,
             control->dma_put = val;
 
             if (d->pfifo.cache1.push_enabled) {
-                nv2a_run_pusher(d);
+                nv2a_fifo_run_pusher(d);
             }
             break;
         case NV_USER_DMA_GET:
@@ -2102,6 +2553,23 @@ static int nv2a_get_bpp(VGACommonState *s)
 static void nv2a_vga_update(void *opaque)
 {
     NV2AState *d = NV2A_DEVICE(opaque);
+
+    GraphicsContext *context = &d->pgraph.context[d->pgraph.channel_id];
+    if (context->channel_3d) {
+        printf("3d ping! %d\n", nv2a_get_bpp(&d->vga));
+
+        nv2a_pgraph_context_set_current(context);
+
+        //glClearColor(1, 0, 0, 1);
+        //glClear(GL_COLOR_BUFFER_BIT);
+        glReadPixels(0, 0, 640, 480, GL_RGBA, GL_UNSIGNED_BYTE,
+            d->vga.vram_ptr);
+        assert(glGetError() == GL_NO_ERROR);
+        memory_region_set_dirty(&d->vga.vram, 0, 640*480*4);
+
+        nv2a_pgraph_context_set_current(NULL);
+    }
+
     d->vga.update(&d->vga);
 
     d->pcrtc.pending_interrupts |= NV_PCRTC_INTR_0_VBLANK;
@@ -2180,6 +2648,7 @@ static int nv2a_initfn(PCIDevice *dev)
                              0x100000);
     memory_region_add_subregion(&d->mmio, 0x700000, &d->ramin);
 
+    d->vram_ptr = memory_region_get_ram_ptr(&d->vram);
     d->ramin_ptr = memory_region_get_ram_ptr(&d->ramin);
 
 
@@ -2188,7 +2657,6 @@ static int nv2a_initfn(PCIDevice *dev)
         nv2a_pgraph_context_init(&d->pgraph.context[i]);
     }
 
-
     return 0;
 }
 
diff --git a/hw/nv2a_vsh.c b/hw/nv2a_vsh.c
new file mode 100644
index 0000000000..14dc5353fc
--- /dev/null
+++ b/hw/nv2a_vsh.c
@@ -0,0 +1,682 @@
+/*
+ * QEMU Geforce NV2A vertex shader translation
+ *
+ * Copyright (c) 2012 espes
+ *
+ * Based on:
+ * Cxbx, VertexShader.cpp
+ * Copyright (c) 2004 Aaron Robinson <caustik@caustik.com>
+ *                    Kingofc <kingofc@freenet.de>
+ * Dxbx, uPushBuffer.pas
+ * Copyright (c) 2007 Shadow_tj, PatrickvL
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <assert.h>
+
+#include "nv2a_vsh.h"
+
+
+#define VSH_TOKEN_SIZE 4
+
+
+typedef enum { // bit-fields of one microcode token; each value indexes field_mapping[]
+    FLD_ILU = 0, // ILU opcode, decoded as a VshILU
+    FLD_MAC, // MAC opcode, decoded as a VshMAC
+    FLD_CONST, // constant register number, used when an input is PARAM_C
+    FLD_V, // input register number, used when an input is PARAM_V
+    // Input A: NEG must be directly followed by SWZ_X..SWZ_W, the decoders step through them with +1
+    FLD_A_NEG,
+    FLD_A_SWZ_X,
+    FLD_A_SWZ_Y,
+    FLD_A_SWZ_Z,
+    FLD_A_SWZ_W,
+    FLD_A_R, // temporary register number, used when the input is PARAM_R
+    FLD_A_MUX, // register file of the input, decoded as a VshParameterType
+    // Input B (same layout and ordering requirement as input A)
+    FLD_B_NEG,
+    FLD_B_SWZ_X,
+    FLD_B_SWZ_Y,
+    FLD_B_SWZ_Z,
+    FLD_B_SWZ_W,
+    FLD_B_R,
+    FLD_B_MUX,
+    // Input C (same ordering requirement; its register number is split in two fields)
+    FLD_C_NEG,
+    FLD_C_SWZ_X,
+    FLD_C_SWZ_Y,
+    FLD_C_SWZ_Z,
+    FLD_C_SWZ_W,
+    FLD_C_R_HIGH, // decode_token combines these as (HIGH << 2) | LOW
+    FLD_C_R_LOW,
+    FLD_C_MUX,
+    // Output
+    FLD_OUT_MAC_MASK, // write mask of the MAC result into a temporary register
+    FLD_OUT_R, // temporary register number written by MAC/ILU
+    FLD_OUT_ILU_MASK, // write mask of the ILU result into a temporary register
+    FLD_OUT_O_MASK, // write mask of the output/constant register write
+    FLD_OUT_ORB, // compared against VshOutputType (constant vs. output register)
+    FLD_OUT_ADDRESS, // number of the output or constant register written
+    FLD_OUT_MUX, // compared against VshOutputMux: which unit feeds the output write
+    // Relative addressing: non-zero makes a constant input relative to A0
+    FLD_A0X,
+    // Final instruction: non-zero makes vsh_translate() stop after this token
+    FLD_FINAL
+} VshFieldName;
+
+
+typedef enum { /* register file of an input; raw value of the FLD_x_MUX fields */
+    PARAM_UNKNOWN = 0, /* not handled: decode_opcode_input() asserts on it */
+    PARAM_R, /* temporary register, printed as "R<n>" */
+    PARAM_V, /* input register, printed as "v<n>" */
+    PARAM_C /* constant register, printed as "c[...]" */
+} VshParameterType;
+
+typedef enum { /* compared against the FLD_OUT_ORB field */
+    OUTPUT_C = 0, /* the output write targets a constant register */
+    OUTPUT_O /* the output write targets an entry of out_reg_name[] */
+} VshOutputType;
+
+typedef enum { /* compared against the FLD_OUT_MUX field */
+    OMUX_MAC = 0, /* the MAC result is written to the output register */
+    OMUX_ILU /* the ILU result is written to the output register */
+} VshOutputMux;
+
+typedef enum { /* raw FLD_ILU value; indexes ilu_opcode[] and ilu_force_scalar[] */
+    ILU_NOP = 0, /* no ILU operation in this token */
+    ILU_MOV,
+    ILU_RCP,
+    ILU_RCC,
+    ILU_RSQ,
+    ILU_EXP,
+    ILU_LOG,
+    ILU_LIT /* last value: FLD_ILU is 3 bits wide, so all 8 encodings are named */
+} VshILU;
+
+typedef enum { /* raw FLD_MAC value; indexes mac_opcode[] and mac_opcode_params[] */
+    MAC_NOP, /* = 0: no MAC operation in this token */
+    MAC_MOV,
+    MAC_MUL,
+    MAC_ADD,
+    MAC_MAD,
+    MAC_DP3,
+    MAC_DPH,
+    MAC_DP4,
+    MAC_DST,
+    MAC_MIN,
+    MAC_MAX,
+    MAC_SLT,
+    MAC_SGE,
+    MAC_ARL /* = 13; FLD_MAC is 4 bits wide, so 14 and 15 have no enumerator */
+} VshMAC;
+
+typedef enum { /* 2-bit component selector; decode_swizzle() uses it to index "xyzw" */
+    SWIZZLE_X = 0,
+    SWIZZLE_Y,
+    SWIZZLE_Z,
+    SWIZZLE_W
+} VshSwizzle;
+
+
+typedef struct VshFieldMapping { /* location of one bit-field inside a token */
+    VshFieldName field_name; /* the field this row describes */
+    uint8_t subtoken; /* index of the 32-bit word within the token */
+    uint8_t start_bit; /* position of the field's lowest bit in that word */
+    uint8_t bit_length; /* width of the field in bits */
+} VshFieldMapping;
+
+static const VshFieldMapping field_mapping[] = { // indexed by VshFieldName: keep rows in enum order
+    // Field Name         DWORD BitPos BitSize   (no field lives in DWORD 0)
+    {  FLD_ILU,              1,   25,     3 },
+    {  FLD_MAC,              1,   21,     4 },
+    {  FLD_CONST,            1,   13,     8 },
+    {  FLD_V,                1,    9,     4 },
+    // INPUT A
+    {  FLD_A_NEG,            1,    8,     1 },
+    {  FLD_A_SWZ_X,          1,    6,     2 },
+    {  FLD_A_SWZ_Y,          1,    4,     2 },
+    {  FLD_A_SWZ_Z,          1,    2,     2 },
+    {  FLD_A_SWZ_W,          1,    0,     2 },
+    {  FLD_A_R,              2,   28,     4 },
+    {  FLD_A_MUX,            2,   26,     2 },
+    // INPUT B
+    {  FLD_B_NEG,            2,   25,     1 },
+    {  FLD_B_SWZ_X,          2,   23,     2 },
+    {  FLD_B_SWZ_Y,          2,   21,     2 },
+    {  FLD_B_SWZ_Z,          2,   19,     2 },
+    {  FLD_B_SWZ_W,          2,   17,     2 },
+    {  FLD_B_R,              2,   13,     4 },
+    {  FLD_B_MUX,            2,   11,     2 },
+    // INPUT C (its register number straddles DWORD 2 and DWORD 3)
+    {  FLD_C_NEG,            2,   10,     1 },
+    {  FLD_C_SWZ_X,          2,    8,     2 },
+    {  FLD_C_SWZ_Y,          2,    6,     2 },
+    {  FLD_C_SWZ_Z,          2,    4,     2 },
+    {  FLD_C_SWZ_W,          2,    2,     2 },
+    {  FLD_C_R_HIGH,         2,    0,     2 },
+    {  FLD_C_R_LOW,          3,   30,     2 },
+    {  FLD_C_MUX,            3,   28,     2 },
+    // Output
+    {  FLD_OUT_MAC_MASK,     3,   24,     4 },
+    {  FLD_OUT_R,            3,   20,     4 },
+    {  FLD_OUT_ILU_MASK,     3,   16,     4 },
+    {  FLD_OUT_O_MASK,       3,   12,     4 },
+    {  FLD_OUT_ORB,          3,   11,     1 },
+    {  FLD_OUT_ADDRESS,      3,    3,     8 },
+    {  FLD_OUT_MUX,          3,    2,     1 },
+    // Other
+    {  FLD_A0X,              3,    1,     1 },
+    {  FLD_FINAL,            3,    0,     1 }
+};
+
+
+typedef struct VshOpcodeParams { /* which of the three inputs an opcode consumes */
+    bool A; /* true if the opcode reads input A */
+    bool B; /* true if the opcode reads input B */
+    bool C; /* true if the opcode reads input C */
+} VshOpcodeParams;
+
+static const VshOpcodeParams ilu_opcode_params[] = { // one row per VshILU value; not referenced in this file
+    /* ILU OP       ParamA ParamB ParamC    (decode_token always passes input C to ILU opcodes) */
+    /* ILU_NOP */ { false, false, false }, // Dxbx note : Unused
+    /* ILU_MOV */ { false, false, true  },
+    /* ILU_RCP */ { false, false, true  },
+    /* ILU_RCC */ { false, false, true  },
+    /* ILU_RSQ */ { false, false, true  },
+    /* ILU_EXP */ { false, false, true  },
+    /* ILU_LOG */ { false, false, true  },
+    /* ILU_LIT */ { false, false, true  },
+};
+
+static const VshOpcodeParams mac_opcode_params[] = { // indexed by VshMAC: one row per enumerator
+    /* MAC OP      ParamA  ParamB ParamC    (decode_token appends the inputs in A, B, C order) */
+    /* MAC_NOP */ { false, false, false }, // Dxbx note : Unused
+    /* MAC_MOV */ { true,  false, false },
+    /* MAC_MUL */ { true,  true,  false },
+    /* MAC_ADD */ { true,  false, true  }, // note: ADD takes inputs A and C, not A and B
+    /* MAC_MAD */ { true,  true,  true  },
+    /* MAC_DP3 */ { true,  true,  false },
+    /* MAC_DPH */ { true,  true,  false },
+    /* MAC_DP4 */ { true,  true,  false },
+    /* MAC_DST */ { true,  true,  false },
+    /* MAC_MIN */ { true,  true,  false },
+    /* MAC_MAX */ { true,  true,  false },
+    /* MAC_SLT */ { true,  true,  false },
+    /* MAC_SGE */ { true,  true,  false },
+    /* MAC_ARL */ { true,  false, false },
+};
+
+
+
+static const char* mask_str[] = { // write-mask suffix, indexed by a 4-bit mask (x is the top bit)
+            // xyzw xyzw
+    "",     // 0000 ____
+    ".w",   // 0001 ___w
+    ".z",   // 0010 __z_
+    ".zw",  // 0011 __zw
+    ".y",   // 0100 _y__
+    ".yw",  // 0101 _y_w
+    ".yz",  // 0110 _yz_
+    ".yzw", // 0111 _yzw
+    ".x",   // 1000 x___
+    ".xw",  // 1001 x__w
+    ".xz",  // 1010 x_z_
+    ".xzw", // 1011 x_zw
+    ".xy",  // 1100 xy__
+    ".xyw", // 1101 xy_w
+    ".xyz", // 1110 xyz_
+    ""//.xyzw  1111 xyzw (a full mask is written without suffix)
+};
+
+/* Note: OpenGL seems to be case-sensitive, and requires upper-case opcodes! */
+static const char* mac_opcode[] = { // ARB mnemonics, indexed by VshMAC
+    "NOP",
+    "MOV",
+    "MUL",
+    "ADD",
+    "MAD",
+    "DP3",
+    "DPH",
+    "DP4",
+    "DST",
+    "MIN",
+    "MAX",
+    "SLT",
+    "SGE",
+    "ARL A0.x", // Dxbx note : Alias for "mov a0.x"; decode_opcode() matches this exact string and adds no "R<n>" destination
+};
+
+static const char* ilu_opcode[] = { // ARB mnemonics, indexed by VshILU
+    "NOP",
+    "MOV",
+    "RCP",
+    "RCP", // Was RCC: ILU_RCC is emitted as a plain RCP
+    "RSQ",
+    "EXP",
+    "LOG",
+    "LIT",
+};
+
+static const bool ilu_force_scalar[] = { /* indexed by VshILU: does input C collapse to one component? */
+    false, /* ILU_NOP */
+    false, /* ILU_MOV */
+    true,  /* ILU_RCP */
+    true,  /* ILU_RCC */
+    true,  /* ILU_RSQ */
+    true,  /* ILU_EXP */
+    true,  /* ILU_LOG */
+    false, /* ILU_LIT */
+};
+
+static const char* out_reg_name[] = { /* indexed by the low 4 bits of FLD_OUT_ADDRESS */
+    "R12", // "oPos": position writes go to temporary R12; vsh_translate() copies R12 to oPos at the end
+    "???",
+    "???",
+    "oD0",
+    "oD1",
+    "oFog",
+    "oPts",
+    "oB0",
+    "oB1",
+    "oT0",
+    "oT1",
+    "oT2",
+    "oT3",
+    "???",
+    "???",
+    "A0.x",
+};
+
+
+
+/* Extracts bit_length bits, starting at start_bit, from word subtoken. */
+static int vsh_get_from_token(uint32_t *shader_token,
+                              uint8_t subtoken,
+                              uint8_t start_bit, uint8_t bit_length)
+{
+    const uint32_t low_bits = ~(0xFFFFFFFF << bit_length);
+    return (shader_token[subtoken] >> start_bit) & low_bits;
+}
+/* Reads the field field_name of a token, located through field_mapping[]. */
+static uint8_t vsh_get_field(uint32_t *shader_token, VshFieldName field_name)
+{
+    const VshFieldMapping *map = &field_mapping[field_name];
+    int raw = vsh_get_from_token(shader_token, map->subtoken,
+                                 map->start_bit, map->bit_length);
+    return (uint8_t)raw;
+}
+
+
+/* Converts a raw constant register number into the index used for c[]. */
+static int16_t convert_c_register(const int16_t c_reg)
+{
+    const int bank = (c_reg >> 5) & 7, offset = c_reg & 31;
+    /* signed register number first, then corrected to map -96..95 to 0..191 */
+    return (int16_t)((bank - 3) * 32 + offset + VSH_D3DSCM_CORRECTION);
+}
+
+
+
+/* Formats the source swizzle whose X selector is swizzle_field (Y, Z, W are
+ * the next three VshFieldName values). Returns a new QString: "" for .xyzw,
+ * one letter when all selectors are equal, else all four letters. */
+static QString* decode_swizzle(uint32_t *shader_token,
+                               VshFieldName swizzle_field)
+{
+    const char* swizzle_str = "xyzw";
+    VshSwizzle x, y, z, w;
+
+    /* some microcode instructions force a scalar value */
+    if (swizzle_field == FLD_C_SWZ_X
+        && ilu_force_scalar[vsh_get_field(shader_token, FLD_ILU)]) {
+        x = y = z = w = vsh_get_field(shader_token, swizzle_field);
+    } else {
+        x = vsh_get_field(shader_token, swizzle_field++);
+        y = vsh_get_field(shader_token, swizzle_field++);
+        z = vsh_get_field(shader_token, swizzle_field++);
+        w = vsh_get_field(shader_token, swizzle_field);
+    }
+
+    if (x == SWIZZLE_X && y == SWIZZLE_Y
+        && z == SWIZZLE_Z && w == SWIZZLE_W) {
+        /* Don't print the swizzle if it's .xyzw */
+        return qstring_from_str("");
+    } else if (x == y && y == z && z == w) {
+        /* All four selectors are equal: print the component once */
+        return qstring_from_str((char[]){'.', swizzle_str[x], '\0'});
+    } else {
+        /* ARB vertex programs only accept one- or four-component swizzle
+         * suffixes, so every other combination is written out in full; a
+         * shortened form such as ".xx" for .xxyy would be rejected. */
+        return qstring_from_str((char[]){'.',
+                                       swizzle_str[x], swizzle_str[y],
+                                       swizzle_str[z], swizzle_str[w],
+                                       '\0'});
+    }
+}
+
+static QString* decode_opcode_input(uint32_t *shader_token,
+                                    VshParameterType param,
+                                    VshFieldName neg_field,
+                                    int reg_num)
+{
+    /* Decodes one source operand (input A, B or C) into ARB text: optional
+     * '-', register name, swizzle. reg_num is only used for PARAM_R; the
+     * returned QString is new and must be released by the caller. */
+    QString *ret_str = qstring_new();
+
+    if (vsh_get_field(shader_token, neg_field) > 0) {
+        qstring_append_chr(ret_str, '-');
+    }
+
+    /* PARAM_R uses the supplied reg_num, but the other two need to be
+     * determined */
+    char tmp[40] = ""; /* stays empty if param is invalid and asserts are off */
+    switch (param) {
+    case PARAM_R:
+        snprintf(tmp, sizeof(tmp), "R%d", reg_num);
+        break;
+    case PARAM_V:
+        reg_num = vsh_get_field(shader_token, FLD_V);
+        snprintf(tmp, sizeof(tmp), "v%d", reg_num);
+        break;
+    case PARAM_C:
+        reg_num = convert_c_register(vsh_get_field(shader_token, FLD_CONST));
+        if (vsh_get_field(shader_token, FLD_A0X) > 0) {
+            /* ARB relative addressing must name the component (A0.x).
+             * NOTE(review): ARB also limits the offset to -64..63. */
+            snprintf(tmp, sizeof(tmp), "c[A0.x+%d]", reg_num);
+        } else {
+            snprintf(tmp, sizeof(tmp), "c[%d]", reg_num);
+        }
+        break;
+    default:
+        assert(false);
+    }
+    qstring_append(ret_str, tmp);
+
+    {
+        /* swizzle bits are next to the neg bit */
+        QString *swizzle_str = decode_swizzle(shader_token, neg_field+1);
+        qstring_append(ret_str, qstring_get_str(swizzle_str));
+        QDECREF(swizzle_str);
+    }
+
+    return ret_str;
+}
+
+
+/* Emits the ARB statement(s) for one unit (MAC or ILU) of a token.
+ *  - out_mux: which unit opcode belongs to
+ *  - mask:    write mask for the temporary register; 0 means no such write
+ *  - opcode:  ARB mnemonic from mac_opcode[] or ilu_opcode[]
+ *  - inputs:  operand list, each operand preceded by ", "
+ * A second statement targeting an output register or constant is added when
+ * FLD_OUT_MUX selects this unit and FLD_OUT_O_MASK is non-zero.
+ * Returns a new QString; inputs is only read. */
+static QString* decode_opcode(uint32_t *shader_token,
+                              VshOutputMux out_mux,
+                              uint32_t mask,
+                              const char* opcode,
+                              QString *inputs)
+{
+    QString *ret = qstring_new();
+    const char *operands = qstring_get_str(inputs);
+    int reg_num = vsh_get_field(shader_token, FLD_OUT_R);
+    uint8_t out_mask = vsh_get_field(shader_token, FLD_OUT_O_MASK);
+    uint8_t out_addr = vsh_get_field(shader_token, FLD_OUT_ADDRESS);
+
+    if (out_mux == OMUX_MAC) {
+        /* Ignore paired MAC opcodes (ILU is not NOP) that write to R1 */
+        if (vsh_get_field(shader_token, FLD_ILU) != ILU_NOP && reg_num == 1) {
+            mask = 0;
+        }
+    } else if (out_mux == OMUX_ILU
+               && vsh_get_field(shader_token, FLD_MAC) != MAC_NOP) {
+        /* Paired ILU opcodes can only write to R1 */
+        reg_num = 1;
+    }
+
+    /* Write to the temporary register, unless it is masked away */
+    if (mask > 0) {
+        qstring_append(ret, opcode);
+        /* the ARL mnemonic already names its destination (A0.x) */
+        if (strcmp(opcode, mac_opcode[MAC_ARL]) != 0) {
+            qstring_append(ret, " R");
+            qstring_append_int(ret, reg_num);
+            qstring_append(ret, mask_str[mask]);
+        }
+        qstring_append(ret, operands);
+        qstring_append(ret, ";\n");
+    }
+
+    /* Write to an output register or constant: only for the unit chosen by
+     * FLD_OUT_MUX, and only if the write is not masked away */
+    if (vsh_get_field(shader_token, FLD_OUT_MUX) == out_mux && out_mask != 0) {
+        qstring_append(ret, opcode);
+        if (vsh_get_field(shader_token, FLD_OUT_ORB) == OUTPUT_C) {
+            /* TODO : Emulate writeable const registers */
+            qstring_append(ret, " c");
+            qstring_append_int(ret, convert_c_register(out_addr));
+        } else {
+            qstring_append_chr(ret, ' ');
+            qstring_append(ret, out_reg_name[out_addr & 0xF]);
+        }
+        qstring_append(ret, mask_str[out_mask]);
+        qstring_append(ret, operands);
+        qstring_append(ret, ";\n");
+    }
+
+    return ret;
+}
+
+
+/* Translates one 4-dword microcode token into zero or more ARB statements
+ * (the MAC operation first, then the ILU one). Returns a new QString. */
+static QString* decode_token(uint32_t *shader_token)
+{
+    QString *ret;
+
+    /* Since it's potentially used twice, decode input C once: */
+    QString *input_c =
+        decode_opcode_input(shader_token,
+                            vsh_get_field(shader_token, FLD_C_MUX),
+                            FLD_C_NEG,
+                            (vsh_get_field(shader_token, FLD_C_R_HIGH) << 2)
+                                | vsh_get_field(shader_token, FLD_C_R_LOW));
+
+    /* FLD_MAC is 4 bits wide but only MAC_NOP..MAC_ARL are defined; skip
+     * the undefined encodings rather than index past the opcode tables. */
+    VshMAC mac = vsh_get_field(shader_token, FLD_MAC);
+    if (mac != MAC_NOP && mac <= MAC_ARL) {
+        QString *inputs_mac = qstring_new();
+        if (mac_opcode_params[mac].A) {
+            QString *input_a =
+                decode_opcode_input(shader_token,
+                                    vsh_get_field(shader_token, FLD_A_MUX),
+                                    FLD_A_NEG,
+                                    vsh_get_field(shader_token, FLD_A_R));
+            qstring_append(inputs_mac, ", ");
+            qstring_append(inputs_mac, qstring_get_str(input_a));
+            QDECREF(input_a);
+        }
+        if (mac_opcode_params[mac].B) {
+            QString *input_b =
+                decode_opcode_input(shader_token,
+                                    vsh_get_field(shader_token, FLD_B_MUX),
+                                    FLD_B_NEG,
+                                    vsh_get_field(shader_token, FLD_B_R));
+            qstring_append(inputs_mac, ", ");
+            qstring_append(inputs_mac, qstring_get_str(input_b));
+            QDECREF(input_b);
+        }
+        if (mac_opcode_params[mac].C) {
+            qstring_append(inputs_mac, ", ");
+            qstring_append(inputs_mac, qstring_get_str(input_c));
+        }
+
+        /* Then prepend these inputs with the actual opcode, mask, and input : */
+        ret = decode_opcode(shader_token, OMUX_MAC,
+                            vsh_get_field(shader_token, FLD_OUT_MAC_MASK),
+                            mac_opcode[mac], inputs_mac);
+        QDECREF(inputs_mac); /* decode_opcode() copied the text it needed */
+    } else {
+        ret = qstring_new();
+    }
+
+    /* See if a ILU opcode is present too: */
+    VshILU ilu = vsh_get_field(shader_token, FLD_ILU);
+    if (ilu != ILU_NOP) {
+        QString *inputs_c = qstring_from_str(", ");
+        qstring_append(inputs_c, qstring_get_str(input_c));
+
+        /* Append the ILU opcode, mask and (the already determined) input C: */
+        QString *ilu_op =
+            decode_opcode(shader_token, OMUX_ILU,
+                          vsh_get_field(shader_token, FLD_OUT_ILU_MASK),
+                          ilu_opcode[ilu], inputs_c);
+
+        qstring_append(ret, qstring_get_str(ilu_op));
+
+        QDECREF(inputs_c);
+        QDECREF(ilu_op);
+    }
+
+    QDECREF(input_c);
+
+    return ret;
+}
+
+/* Vertex shader header, mapping Xbox1 registers to the ARB syntax (original
+ * version by KingOfC). The Xbox inputs v0..v15 are currently all aliased to
+ * the generic attributes vertex.attrib[0..15]. The disabled (#if 0) variant
+ * instead aliases v0..v11 to conventional attributes (position, normal,
+ * colors, fog and texture coordinates) wherever one exists, which would let
+ * the same shader serve both immediate and deferred mode rendering and
+ * coexist with the OpenGL fixed-function pipeline.
+ * Note that oPos is declared here but the decoder writes positions to R12
+ * (see out_reg_name); vsh_translate() copies R12 to oPos at the end.
+ */
+static const char* vsh_header =
+    "!!ARBvp1.0\n"
+    "TEMP R0,R1,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12;\n"
+    "ADDRESS A0;\n"
+#if 0
+    "ATTRIB v0 = vertex.position;" // (See "conventional" note above)
+    "ATTRIB v1 = vertex.%s;" // Note : We replace this with "weight" or "attrib[1]" depending GL_ARB_vertex_blend
+    "ATTRIB v2 = vertex.normal;"
+    "ATTRIB v3 = vertex.color.primary;"
+    "ATTRIB v4 = vertex.color.secondary;"
+    "ATTRIB v5 = vertex.fogcoord;"
+    "ATTRIB v6 = vertex.attrib[6];"
+    "ATTRIB v7 = vertex.attrib[7];"
+    "ATTRIB v8 = vertex.texcoord[0];"
+    "ATTRIB v9 = vertex.texcoord[1];"
+    "ATTRIB v10 = vertex.texcoord[2];"
+    "ATTRIB v11 = vertex.texcoord[3];"
+#else
+    "ATTRIB v0 = vertex.attrib[0];\n"
+    "ATTRIB v1 = vertex.attrib[1];\n"
+    "ATTRIB v2 = vertex.attrib[2];\n"
+    "ATTRIB v3 = vertex.attrib[3];\n"
+    "ATTRIB v4 = vertex.attrib[4];\n"
+    "ATTRIB v5 = vertex.attrib[5];\n"
+    "ATTRIB v6 = vertex.attrib[6];\n"
+    "ATTRIB v7 = vertex.attrib[7];\n"
+    "ATTRIB v8 = vertex.attrib[8];\n"
+    "ATTRIB v9 = vertex.attrib[9];\n"
+    "ATTRIB v10 = vertex.attrib[10];\n"
+    "ATTRIB v11 = vertex.attrib[11];\n"
+#endif
+    "ATTRIB v12 = vertex.attrib[12];\n"
+    "ATTRIB v13 = vertex.attrib[13];\n"
+    "ATTRIB v14 = vertex.attrib[14];\n"
+    "ATTRIB v15 = vertex.attrib[15];\n"
+    "OUTPUT oPos = result.position;\n"
+    "OUTPUT oD0 = result.color.front.primary;\n"
+    "OUTPUT oD1 = result.color.front.secondary;\n"
+    "OUTPUT oB0 = result.color.back.primary;\n"
+    "OUTPUT oB1 = result.color.back.secondary;\n"
+    "OUTPUT oPts = result.pointsize;\n"
+    "OUTPUT oFog = result.fogcoord;\n"
+    "OUTPUT oT0 = result.texcoord[0];\n"
+    "OUTPUT oT1 = result.texcoord[1];\n"
+    "OUTPUT oT2 = result.texcoord[2];\n"
+    "OUTPUT oT3 = result.texcoord[3];\n"
+    /* All constants in 1 array declaration (requires NV_gpu_program4?) */
+    "PARAM c[] = { program.env[0..191] };\n"
+    "PARAM mvp[4] = { state.matrix.mvp };\n";
+
+
+/* Translates NV2A vertex shader microcode into an ARB vertex program.
+ * tokens points to tokens_length 32-bit words; every instruction takes
+ * VSH_TOKEN_SIZE of them. Decoding stops after the instruction flagged
+ * FLD_FINAL or when fewer than VSH_TOKEN_SIZE words remain. version is
+ * currently unused. Returns a new QString holding the program text. */
+QString* vsh_translate(uint16_t version,
+                       uint32_t *tokens, unsigned int tokens_length)
+{
+    QString *ret = qstring_from_str(vsh_header);
+
+    uint32_t *cur_token = tokens;
+    /* only decode complete tokens: never read past tokens_length words */
+    while (cur_token - tokens + VSH_TOKEN_SIZE <= tokens_length) {
+        QString *token_str = decode_token(cur_token);
+        qstring_append(ret, qstring_get_str(token_str));
+        QDECREF(token_str);
+
+        if (vsh_get_field(cur_token, FLD_FINAL)) {
+            break;
+        }
+        cur_token += VSH_TOKEN_SIZE;
+    }
+
+    /* Note : Since we replaced oPos with r12 in the above decoding,
+     * we have to assign oPos at the end; This can be done in two ways;
+     * 1) When the shader is complete (including transformations),
+     *    we could just do a 'MOV oPos, R12;' and be done with it.
+     * 2) In case of D3DFVF_XYZRHW, it seems the NV2A applies the mvp
+     *    (model/view/projection) matrix transformation AFTER executing
+     *    the shader (but OpenGL expects *the*shader* to handle this
+     *    transformation).
+     * We cannot discern these two situations yet. For now the shader is
+     * assumed to be complete: R12 is only remapped in Z and copied to oPos.
+     * TODO : What should we do about normals, eye-space lighting and all that?
+     */
+    qstring_append(ret,
+    /* (Dxbx instead transformed R12 by mvp[0..3] with four DP4 instructions
+     * at this point; that variant is not emitted here.) */
+
+    /* Z coord [0;1]->[-1;1] mapping, see comment in transform_projection
+     * in state.c
+     *
+     * Basically we want (in homogeneous coordinates) z = z * 2 - 1. However,
+     * shaders are run before the homogeneous divide, so we have to take the w
+     * into account: z = ((z / w) * 2 - 1) * w, which is the same as
+     * z = z * 2 - w.
+     */
+        "# Apply Z coord mapping\n"
+        "ADD R12.z, R12.z, R12.z;\n"
+        "ADD R12.z, R12.z, -R12.w;\n"
+
+        "# End of shader:\n"
+        "MOV oPos, R12;\n"
+        "END"
+    );
+    return ret;
+}
diff --git a/hw/nv2a_vsh.h b/hw/nv2a_vsh.h
new file mode 100644
index 0000000000..9e34981551
--- /dev/null
+++ b/hw/nv2a_vsh.h
@@ -0,0 +1,46 @@
+/*
+ * QEMU Geforce NV2A vertex shader translation
+ *
+ * Copyright (c) 2012 espes
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef HW_NV2A_VSH_H
+#define HW_NV2A_VSH_H
+
+#include "qstring.h"
+
+// vs.1.1, not an official value
+#define VSH_VERSION_VS                     0xF078
+
+// Xbox vertex shader
+#define VSH_VERSION_XVS                    0x2078
+
+// Xbox vertex state shader
+#define VSH_VERSION_XVSS                   0x7378
+
+// Xbox vertex read/write shader
+#define VSH_VERSION_XVSW                   0x7778
+
+
+#define VSH_D3DSCM_CORRECTION 96 /* added to constant register numbers so that c[] indices start at 0 */
+/* Translates tokens_length 32-bit words of NV2A microcode into ARB text. */
+QString* vsh_translate(uint16_t version,
+                       uint32_t *tokens, unsigned int tokens_length);
+
+
+#endif
\ No newline at end of file