From 2c4571a3cdc48786568971500f2c4fccf57a7469 Mon Sep 17 00:00:00 2001
From: espes <espes@pequalsnp.com>
Date: Sun, 25 May 2014 20:58:17 +1000
Subject: [PATCH] Initial integration of JayFoxRox's GLSL vertex program
 translator

---
 hw/xbox/Makefile.objs |   2 +-
 hw/xbox/nv2a.c        | 497 +++++++++++++++++-------------------------
 hw/xbox/nv2a_vsh.c    | 479 +++++++++++++++++++++++++---------------
 hw/xbox/nv2a_vsh.h    |  48 ++++
 hw/xbox/swizzle.c     | 109 +++++++++
 hw/xbox/swizzle.h     |  34 +++
 6 files changed, 691 insertions(+), 478 deletions(-)
 create mode 100644 hw/xbox/swizzle.c
 create mode 100644 hw/xbox/swizzle.h

diff --git a/hw/xbox/Makefile.objs b/hw/xbox/Makefile.objs
index 72d8b187f0..3478ce5d4b 100644
--- a/hw/xbox/Makefile.objs
+++ b/hw/xbox/Makefile.objs
@@ -2,7 +2,7 @@ obj-y += xbox.o chihiro.o
 obj-y += xbox_pci.o acpi_xbox.o
 obj-y += amd_smbus.o smbus_xbox_smc.o smbus_cx25871.o smbus_adm1032.o
 obj-y += nvnet.o
-obj-y += nv2a.o nv2a_vsh.o nv2a_psh.o
+obj-y += nv2a.o nv2a_vsh.o nv2a_psh.o swizzle.o
 obj-y += mcpx_apu.o mcpx_aci.o
 obj-y += lpc47m157.o
 obj-y += xid.o
diff --git a/hw/xbox/nv2a.c b/hw/xbox/nv2a.c
index eb9720774e..7f70ac5066 100644
--- a/hw/xbox/nv2a.c
+++ b/hw/xbox/nv2a.c
@@ -28,6 +28,7 @@
 #include "qapi/qmp/qstring.h"
 #include "gl/gloffscreen.h"
 
+#include "hw/xbox/swizzle.h"
 #include "hw/xbox/u_format_r11g11b10f.h"
 #include "hw/xbox/nv2a_vsh.h"
 #include "hw/xbox/nv2a_psh.h"
@@ -692,8 +693,7 @@ static const ColorFormatInfo kelvin_color_format_map[66] = {
 #define NV2A_NUM_SUBCHANNELS 8
 
 #define NV2A_MAX_BATCH_LENGTH 0xFFFF
-#define NV2A_VERTEXSHADER_SLOTS  32 /*???*/
-#define NV2A_MAX_VERTEXSHADER_LENGTH 136
+#define NV2A_MAX_TRANSFORM_PROGRAM_LENGTH 136
 #define NV2A_VERTEXSHADER_CONSTANTS 192
 #define NV2A_VERTEXSHADER_ATTRIBUTES 16
 #define NV2A_MAX_TEXTURES 4
@@ -771,14 +771,6 @@ typedef struct VertexShaderConstant {
     uint32 data[4];
 } VertexShaderConstant;
 
-typedef struct VertexShader {
-    bool dirty;
-    unsigned int program_length;
-    uint32_t program_data[NV2A_MAX_VERTEXSHADER_LENGTH];
-
-    GLuint gl_program;
-} VertexShader;
-
 typedef struct Texture {
     bool dirty;
     bool enabled;
@@ -818,9 +810,12 @@ typedef struct ShaderState {
 
     bool rect_tex[4];
 
-    /* vertex shader */
+
     bool fixed_function;
 
+    bool vertex_program;
+    uint32_t program_data[NV2A_MAX_TRANSFORM_PROGRAM_LENGTH];
+    int program_length;
 } ShaderState;
 
 typedef struct Surface {
@@ -842,28 +837,6 @@ typedef struct KelvinState {
     hwaddr dma_vertex_a, dma_vertex_b;
     hwaddr dma_semaphore;
     unsigned int semaphore_offset;
-
-    GLenum gl_primitive_mode;
-
-    bool enable_vertex_program_write;
-
-    unsigned int vertexshader_start_slot;
-    unsigned int vertexshader_load_slot;
-    VertexShader vertexshaders[NV2A_VERTEXSHADER_SLOTS];
-
-    unsigned int constant_load_slot;
-    VertexShaderConstant constants[NV2A_VERTEXSHADER_CONSTANTS];
-
-    VertexAttribute vertex_attributes[NV2A_VERTEXSHADER_ATTRIBUTES];
-
-    unsigned int inline_array_length;
-    uint32_t inline_array[NV2A_MAX_BATCH_LENGTH];
-
-    unsigned int inline_elements_length;
-    uint32_t inline_elements[NV2A_MAX_BATCH_LENGTH];
-
-    unsigned int inline_buffer_length;
-    InlineVertexBufferEntry inline_buffer[NV2A_MAX_BATCH_LENGTH];
 } KelvinState;
 
 typedef struct ContextSurfaces2DState {
@@ -956,6 +929,29 @@ typedef struct PGRAPHState {
     GraphicsSubchannel subchannel_data[NV2A_NUM_SUBCHANNELS];
 
 
+    GLenum gl_primitive_mode;
+
+    bool enable_vertex_program_write;
+
+    unsigned int program_start;
+    unsigned int program_load;
+    uint32_t program_data[NV2A_MAX_TRANSFORM_PROGRAM_LENGTH];
+
+    unsigned int constant_load_slot;
+    VertexShaderConstant constants[NV2A_VERTEXSHADER_CONSTANTS];
+
+    VertexAttribute vertex_attributes[NV2A_VERTEXSHADER_ATTRIBUTES];
+
+    unsigned int inline_array_length;
+    uint32_t inline_array[NV2A_MAX_BATCH_LENGTH];
+
+    unsigned int inline_elements_length;
+    uint32_t inline_elements[NV2A_MAX_BATCH_LENGTH];
+
+    unsigned int inline_buffer_length;
+    InlineVertexBufferEntry inline_buffer[NV2A_MAX_BATCH_LENGTH];
+
+
     uint32_t regs[0x2000];
 } PGRAPHState;
 
@@ -1211,7 +1207,6 @@ static void *nv_dma_map(NV2AState *d, hwaddr dma_obj_address, hwaddr *len)
 static void load_graphics_object(NV2AState *d, hwaddr instance_address,
                                  GraphicsObject *obj)
 {
-    int i;
     uint8_t *obj_ptr;
     uint32_t switch1, switch2, switch3;
 
@@ -1226,21 +1221,9 @@ static void load_graphics_object(NV2AState *d, hwaddr instance_address,
     obj->graphics_class = switch1 & NV_PGRAPH_CTX_SWITCH1_GRCLASS;
 
     /* init graphics object */
-    KelvinState *kelvin;
     switch (obj->graphics_class) {
     case NV_KELVIN_PRIMITIVE:
-        kelvin = &obj->data.kelvin;
-
-        /* generate vertex programs */
-        for (i = 0; i < NV2A_VERTEXSHADER_SLOTS; i++) {
-            VertexShader *shader = &kelvin->vertexshaders[i];
-            glGenProgramsARB(1, &shader->gl_program);
-        }
-        assert(glGetError() == GL_NO_ERROR);
-
-        /* temp hack? */
-        kelvin->vertex_attributes[NV2A_VERTEX_ATTR_DIFFUSE].inline_value = 0xFFFFFFF;
-
+        // kelvin->vertex_attributes[NV2A_VERTEX_ATTR_DIFFUSE].inline_value = 0xFFFFFFF;
         break;
     default:
         break;
@@ -1260,19 +1243,21 @@ static GraphicsObject* lookup_graphics_object(PGRAPHState *s,
 }
 
 
-static void kelvin_bind_converted_vertex_attributes(NV2AState *d,
+static void pgraph_bind_converted_vertex_attributes(NV2AState *d,
                                                     KelvinState *kelvin,
                                                     bool inline_data,
                                                     unsigned int num_elements)
 {
     int i, j;
-    for (i=0; i<NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
-        VertexAttribute *attribute = &kelvin->vertex_attributes[i];
-        if (attribute->count && attribute->needs_conversion) {
+    PGRAPHState *pg = &d->pgraph;
 
+    for (i=0; i<NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
+        VertexAttribute *attribute = &pg->vertex_attributes[i];
+        if (attribute->count && attribute->needs_conversion) {
+            NV2A_DPRINTF("converted %d\n", i);
             uint8_t *data;
             if (inline_data) {
-                data = (uint8_t*)kelvin->inline_array
+                data = (uint8_t*)pg->inline_array
                         + attribute->inline_array_offset;
             } else {
                 hwaddr dma_len;
@@ -1322,12 +1307,13 @@ static void kelvin_bind_converted_vertex_attributes(NV2AState *d,
     }
 }
 
-static unsigned int kelvin_bind_inline_array(KelvinState *kelvin)
+static unsigned int pgraph_bind_inline_array(PGRAPHState *pg)
 {
     int i;
+
     unsigned int offset = 0;
     for (i=0; i<NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
-        VertexAttribute *attribute = &kelvin->vertex_attributes[i];
+        VertexAttribute *attribute = &pg->vertex_attributes[i];
         if (attribute->count) {
 
             glEnableVertexAttribArray(i);
@@ -1340,7 +1326,7 @@ static unsigned int kelvin_bind_inline_array(KelvinState *kelvin)
                     attribute->gl_type,
                     attribute->gl_normalize,
                     attribute->stride,
-                    (uint8_t*)kelvin->inline_array + offset);
+                    (uint8_t*)pg->inline_array + offset);
             }
 
             offset += attribute->size * attribute->count;
@@ -1349,13 +1335,13 @@ static unsigned int kelvin_bind_inline_array(KelvinState *kelvin)
     return offset;
 }
 
-static void kelvin_bind_vertex_attributes(NV2AState *d,
-                                                 KelvinState *kelvin)
+static void pgraph_bind_vertex_attributes(NV2AState *d, KelvinState *kelvin)
 {
     int i;
+    PGRAPHState *pg = &d->pgraph;
 
     for (i=0; i<NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
-        VertexAttribute *attribute = &kelvin->vertex_attributes[i];
+        VertexAttribute *attribute = &pg->vertex_attributes[i];
         if (attribute->count) {
             glEnableVertexAttribArray(i);
 
@@ -1387,157 +1373,6 @@ static void kelvin_bind_vertex_attributes(NV2AState *d,
     }
 }
 
-static void kelvin_bind_vertex_program(KelvinState *kelvin)
-{
-    int i;
-    VertexShader *shader;
-
-    shader = &kelvin->vertexshaders[kelvin->vertexshader_start_slot];
-
-    glBindProgramARB(GL_VERTEX_PROGRAM_ARB, shader->gl_program);
-
-    if (shader->dirty) {
-        QString *program_code = vsh_translate(VSH_VERSION_XVS,
-                                             shader->program_data,
-                                             shader->program_length);
-        const char* program_code_str = qstring_get_str(program_code);
-
-        NV2A_DPRINTF("bind vertex program %d, code:\n%s\n",
-                     kelvin->vertexshader_start_slot,
-                     program_code_str);
-
-        glProgramStringARB(GL_VERTEX_PROGRAM_ARB,
-                           GL_PROGRAM_FORMAT_ASCII_ARB,
-                           strlen(program_code_str),
-                           program_code_str);
-
-        /* Check it compiled */
-        GLint pos;
-        glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
-        if (pos != -1) {
-            fprintf(stderr, "nv2a: vertex program compilation failed:\n"
-                            "      pos %d, %s\n",
-                    pos, glGetString(GL_PROGRAM_ERROR_STRING_ARB));
-            fprintf(stderr, "ucode:\n");
-            for (i=0; i<shader->program_length; i++) {
-                fprintf(stderr, "    0x%08x,\n", shader->program_data[i]);
-            }
-            abort();
-        }
-
-        /* Check we're within resource limits */
-        GLint native;
-        glGetProgramivARB(GL_VERTEX_PROGRAM_ARB,
-                          GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB,
-                          &native);
-        assert(native);
-
-        assert(glGetError() == GL_NO_ERROR);
-
-        QDECREF(program_code);
-        shader->dirty = false;
-    }
-
-    /* load constants */
-    for (i=0; i<NV2A_VERTEXSHADER_CONSTANTS; i++) {
-        VertexShaderConstant *constant = &kelvin->constants[i];
-        if (!constant->dirty) continue;
-
-        glProgramEnvParameter4fvARB(GL_VERTEX_PROGRAM_ARB,
-                                    i,
-                                    (const GLfloat*)constant->data);
-        constant->dirty = false;
-    }
-
-    assert(glGetError() == GL_NO_ERROR);
-}
-
-
-static void unswizzle_rect(
-    uint8_t *src_buf,
-    unsigned int width,
-    unsigned int height,
-    unsigned int depth,
-    uint8_t *dst_buf,
-    unsigned int pitch,
-    unsigned int bytes_per_pixel)
-{
-    unsigned int offset_u = 0, offset_v = 0, offset_w = 0;
-    uint32_t mask_u = 0, mask_v = 0, mask_w = 0;
-
-    unsigned int i = 1, j = 1;
-
-    while( (i <= width) || (i <= height) || (i <= depth) ) {
-        if(i < width) {
-            mask_u |= j;
-            j<<=1;
-        }
-        if(i < height) {
-            mask_v |= j;
-            j<<=1;
-        }
-        if(i < depth) {
-            mask_w |= j;
-            j<<=1;
-        }
-        i<<=1;
-    }
-
-    uint32_t start_u = 0;
-    uint32_t start_v = 0;
-    uint32_t start_w = 0;
-    uint32_t mask_max = 0;
-
-    // get the biggest mask
-    if(mask_u > mask_v)
-        mask_max = mask_u;
-    else
-        mask_max = mask_v;
-    if(mask_w > mask_max)
-        mask_max = mask_w;
-
-    for(i = 1; i <= mask_max; i<<=1) {
-        if(i<=mask_u) {
-            if(mask_u & i) start_u |= (offset_u & i);
-            else offset_u <<= 1;
-        }
-
-        if(i <= mask_v) {
-            if(mask_v & i) start_v |= (offset_v & i);
-            else offset_v<<=1;
-        }
-
-        if(i <= mask_w) {
-            if(mask_w & i) start_w |= (offset_w & i);
-            else offset_w <<= 1;
-        }
-    }
-
-    uint32_t w = start_w;
-    unsigned int z;
-    for(z=0; z<depth; z++) {
-        uint32_t v = start_v;
-
-        unsigned int y;
-        for(y=0; y<height; y++) {
-            uint32_t u = start_u;
-
-            unsigned int x;
-            for (x=0; x<width; x++) {
-                memcpy(dst_buf,
-                       src_buf + ( (u|v|w)*bytes_per_pixel ),
-                       bytes_per_pixel);
-                dst_buf += bytes_per_pixel;
-
-                u = (u - mask_u) & mask_u;
-            }
-            dst_buf += pitch - width * bytes_per_pixel;
-
-            v = (v - mask_v) & mask_v;
-        }
-        w = (w - mask_w) & mask_w;
-    }
-}
 
 static void pgraph_bind_textures(NV2AState *d)
 {
@@ -1707,12 +1542,14 @@ static GLuint generate_shaders(ShaderState state)
 
     GLuint program = glCreateProgram();
 
+
+    /* create the vertex shader */
+
+    QString *vertex_shader_code = NULL;
+    const char *vertex_shader_code_str = NULL;
     if (state.fixed_function) {
         /* generate vertex shader mimicking fixed function */
-        GLuint vertex_shader = glCreateShader(GL_VERTEX_SHADER);
-        glAttachShader(program, vertex_shader);
-
-        const char *vertex_shader_code =
+        vertex_shader_code_str =
 "attribute vec4 position;\n"
 "attribute vec3 normal;\n"
 "attribute vec4 diffuse;\n"
@@ -1739,10 +1576,21 @@ static GLuint generate_shaders(ShaderState state)
 "   gl_TexCoord[3] = multiTexCoord3;\n"
 "}\n";
 
-        glShaderSource(vertex_shader, 1, &vertex_shader_code, 0);
+    } else if (state.vertex_program) {
+        vertex_shader_code = vsh_translate(VSH_VERSION_XVS,
+                                           state.program_data,
+                                           state.program_length);
+        vertex_shader_code_str = qstring_get_str(vertex_shader_code);
+    }
+
+    if (vertex_shader_code_str) {
+        GLuint vertex_shader = glCreateShader(GL_VERTEX_SHADER);
+        glAttachShader(program, vertex_shader); 
+
+        glShaderSource(vertex_shader, 1, &vertex_shader_code_str, 0);
         glCompileShader(vertex_shader);
 
-        NV2A_DPRINTF("bind new vertex shader, code:\n%s\n", vertex_shader_code);
+        NV2A_DPRINTF("bind new vertex shader, code:\n%s\n", vertex_shader_code_str);
 
         /* Check it compiled */
         GLint compiled = 0;
@@ -1754,6 +1602,15 @@ static GLuint generate_shaders(ShaderState state)
             abort();
         }
 
+        if (vertex_shader_code) {
+            QDECREF(vertex_shader_code);
+            vertex_shader_code = NULL;
+        }
+    }
+
+
+    if (state.fixed_function) {
+        /* bind fixed function vertex attributes */
         glBindAttribLocation(program, NV2A_VERTEX_ATTR_POSITION, "position");
         glBindAttribLocation(program, NV2A_VERTEX_ATTR_DIFFUSE, "diffuse");
         glBindAttribLocation(program, NV2A_VERTEX_ATTR_SPECULAR, "specular");
@@ -1762,10 +1619,17 @@ static GLuint generate_shaders(ShaderState state)
         glBindAttribLocation(program, NV2A_VERTEX_ATTR_TEXTURE1, "multiTexCoord1");
         glBindAttribLocation(program, NV2A_VERTEX_ATTR_TEXTURE2, "multiTexCoord2");
         glBindAttribLocation(program, NV2A_VERTEX_ATTR_TEXTURE3, "multiTexCoord3");
+    } else if (state.vertex_program) {
+        /* bind the v0..v15 attributes used by the transform program */
+        char tmp[8];
+        for(i = 0; i < 16; i++) {
+            snprintf(tmp, sizeof(tmp), "v%d", i);
+            glBindAttribLocation(program, i, tmp);
+        }
     }
 
 
-    /* generate a fragment hader from register combiners */
+    /* generate a fragment shader from register combiners */
     GLuint fragment_shader = glCreateShader(GL_FRAGMENT_SHADER);
     glAttachShader(program, fragment_shader);
 
@@ -1823,6 +1687,7 @@ static GLuint generate_shaders(ShaderState state)
         }
     }
 
+    /* validate the program */
     glValidateProgram(program);
     GLint valid = 0;
     glGetProgramiv(program, GL_VALIDATE_STATUS, &valid);
@@ -1840,6 +1705,9 @@ static void pgraph_bind_shaders(PGRAPHState *pg)
 {
     int i;
 
+    bool vertex_program = GET_MASK(pg->regs[NV_PGRAPH_CSV0_D],
+                                   NV_PGRAPH_CSV0_D_MODE) == 2;
+
     bool fixed_function = GET_MASK(pg->regs[NV_PGRAPH_CSV0_D],
                                    NV_PGRAPH_CSV0_D_MODE) == 0;
 
@@ -1854,8 +1722,30 @@ static void pgraph_bind_shaders(PGRAPHState *pg)
 
             /* fixed function stuff */
             .fixed_function = fixed_function,
+
+            /* vertex program stuff */
+            .vertex_program = vertex_program,
         };
 
+        state.program_length = 0;
+        memset(state.program_data, 0, sizeof(state.program_data));
+
+        if (vertex_program) {
+            /* copy tokens from program_start through the FLD_FINAL token */
+            for (i = pg->program_start;
+                    i < NV2A_MAX_TRANSFORM_PROGRAM_LENGTH;
+                    i += VSH_TOKEN_SIZE) {
+                uint32_t *cur_token = pg->program_data + i;
+                memcpy(state.program_data + state.program_length,
+                       cur_token,
+                       VSH_TOKEN_SIZE * sizeof(uint32_t));
+                state.program_length += VSH_TOKEN_SIZE;
+
+                if (vsh_get_field(cur_token, FLD_FINAL)) {
+                    break;
+                }
+            }
+        }
 
         for (i = 0; i < 8; i++) {
             state.rgb_inputs[i] = pg->regs[NV_PGRAPH_COMBINECOLORI0 + i * 4];
@@ -1921,9 +1811,11 @@ static void pgraph_bind_shaders(PGRAPHState *pg)
         }
     }
 
-    /* update fixed function composite matrix */
     if (fixed_function) {
+        /* update fixed function composite matrix */
+
         GLint comLoc = glGetUniformLocation(pg->gl_program, "composite");
+        assert(comLoc != -1);
         glUniformMatrix4fv(comLoc, 1, GL_FALSE, pg->composite_matrix);
 
 
@@ -1951,8 +1843,23 @@ static void pgraph_bind_shaders(PGRAPHState *pg)
         };
 
         GLint viewLoc = glGetUniformLocation(pg->gl_program, "invViewport");
+        assert(viewLoc != -1);
         glUniformMatrix4fv(viewLoc, 1, GL_FALSE, &invViewport[0]);
 
+    } else if (vertex_program) {
+        /* update vertex program constants */
+
+        for (i=0; i<NV2A_VERTEXSHADER_CONSTANTS; i++) {
+            VertexShaderConstant *constant = &pg->constants[i];
+
+            char tmp[8];
+            snprintf(tmp, sizeof(tmp), "c[%d]", i);
+            GLint loc = glGetUniformLocation(pg->gl_program, tmp);
+            //assert(loc != -1);
+            if (loc != -1) {
+                glUniform4fv(loc, 1, (const GLfloat*)constant->data);
+            }
+        }
     }
 
     pg->shaders_dirty = false;
@@ -2196,7 +2103,6 @@ static void pgraph_method(NV2AState *d,
 
     unsigned int slot;
     VertexAttribute *vertex_attribute;
-    VertexShader *vertexshader;
     VertexShaderConstant *constant;
 
     PGRAPHState *pg = &d->pgraph;
@@ -2481,8 +2387,8 @@ static void pgraph_method(NV2AState *d,
         slot = (class_method - NV097_SET_VIEWPORT_OFFSET) / 4;
 
         /* populate magic viewport offset constant */
-        kelvin->constants[59].data[slot] = parameter;
-        kelvin->constants[59].dirty = true;
+        pg->constants[59].data[slot] = parameter;
+        pg->constants[59].dirty = true;
         break;
 
     case NV097_SET_COMBINER_FACTOR0 ...
@@ -2519,30 +2425,31 @@ static void pgraph_method(NV2AState *d,
         slot = (class_method - NV097_SET_VIEWPORT_SCALE) / 4;
 
         /* populate magic viewport scale constant */
-        kelvin->constants[58].data[slot] = parameter;
-        kelvin->constants[58].dirty = true;
+        pg->constants[58].data[slot] = parameter;
+        pg->constants[58].dirty = true;
         break;
 
     case NV097_SET_TRANSFORM_PROGRAM ...
             NV097_SET_TRANSFORM_PROGRAM + 0x7c:
 
-        slot = (class_method - NV097_SET_TRANSFORM_PROGRAM) / 4;
-        /* TODO: It should still work using a non-increasing slot??? */
+        // slot = (class_method - NV097_SET_TRANSFORM_PROGRAM) / 4;
 
-        vertexshader = &kelvin->vertexshaders[kelvin->vertexshader_load_slot];
-        assert(vertexshader->program_length < NV2A_MAX_VERTEXSHADER_LENGTH);
-        vertexshader->program_data[
-            vertexshader->program_length++] = parameter;
+        assert(pg->program_load < NV2A_MAX_TRANSFORM_PROGRAM_LENGTH);
+        pg->program_data[pg->program_load++] = parameter;
+        pg->shaders_dirty = true;
         break;
 
     case NV097_SET_TRANSFORM_CONSTANT ...
             NV097_SET_TRANSFORM_CONSTANT + 0x7c:
 
-        slot = (class_method - NV097_SET_TRANSFORM_CONSTANT) / 4;
+        // slot = (class_method - NV097_SET_TRANSFORM_CONSTANT) / 4;
 
-        constant = &kelvin->constants[kelvin->constant_load_slot+slot/4];
-        constant->data[slot%4] = parameter;
+        assert((pg->constant_load_slot/4) < NV2A_VERTEXSHADER_CONSTANTS);
+        constant = &pg->constants[pg->constant_load_slot/4];
+        constant->data[pg->constant_load_slot%4] = parameter;
         constant->dirty = true;
+
+        pg->constant_load_slot++;
         break;
 
     case NV097_SET_VERTEX4F ...
@@ -2550,16 +2457,16 @@ static void pgraph_method(NV2AState *d,
 
         slot = (class_method - NV097_SET_VERTEX4F) / 4;
 
-        assert(kelvin->inline_buffer_length < NV2A_MAX_BATCH_LENGTH);
+        assert(pg->inline_buffer_length < NV2A_MAX_BATCH_LENGTH);
         
         InlineVertexBufferEntry *entry =
-            &kelvin->inline_buffer[kelvin->inline_buffer_length];
+            &pg->inline_buffer[pg->inline_buffer_length];
 
         entry->position[slot] = parameter;
         if (slot == 3) {
             entry->diffuse =
-                kelvin->vertex_attributes[NV2A_VERTEX_ATTR_DIFFUSE].inline_value;
-            kelvin->inline_buffer_length++;
+                pg->vertex_attributes[NV2A_VERTEX_ATTR_DIFFUSE].inline_value;
+            pg->inline_buffer_length++;
         }
         break;
     }
@@ -2568,7 +2475,7 @@ static void pgraph_method(NV2AState *d,
             NV097_SET_VERTEX_DATA_ARRAY_FORMAT + 0x3c:
 
         slot = (class_method - NV097_SET_VERTEX_DATA_ARRAY_FORMAT) / 4;
-        vertex_attribute = &kelvin->vertex_attributes[slot];
+        vertex_attribute = &pg->vertex_attributes[slot];
 
         vertex_attribute->format =
             GET_MASK(parameter, NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE);
@@ -2633,26 +2540,27 @@ static void pgraph_method(NV2AState *d,
 
         slot = (class_method - NV097_SET_VERTEX_DATA_ARRAY_OFFSET) / 4;
 
-        kelvin->vertex_attributes[slot].dma_select =
+        pg->vertex_attributes[slot].dma_select =
             parameter & 0x80000000;
-        kelvin->vertex_attributes[slot].offset =
+        pg->vertex_attributes[slot].offset =
             parameter & 0x7fffffff;
 
-        kelvin->vertex_attributes[slot].converted_elements = 0;
+        pg->vertex_attributes[slot].converted_elements = 0;
 
         break;
 
     case NV097_SET_BEGIN_END:
         if (parameter == NV097_SET_BEGIN_END_OP_END) {
 
-            if (kelvin->inline_buffer_length) {
+            if (pg->inline_buffer_length) {
                 glEnableVertexAttribArray(NV2A_VERTEX_ATTR_POSITION);
                 glVertexAttribPointer(NV2A_VERTEX_ATTR_POSITION,
                         4,
                         GL_FLOAT,
                         GL_FALSE,
                         sizeof(InlineVertexBufferEntry),
-                        kelvin->inline_buffer);
+                        pg->inline_buffer);
+
 
                 glEnableVertexAttribArray(NV2A_VERTEX_ATTR_DIFFUSE);
                 glVertexAttribPointer(NV2A_VERTEX_ATTR_DIFFUSE,
@@ -2660,36 +2568,38 @@ static void pgraph_method(NV2AState *d,
                         GL_UNSIGNED_BYTE,
                         GL_TRUE,
                         sizeof(InlineVertexBufferEntry),
-                        &kelvin->inline_buffer[0].diffuse);
+                        &pg->inline_buffer[0].diffuse);
 
-                glDrawArrays(kelvin->gl_primitive_mode,
-                             0, kelvin->inline_buffer_length);
-            } else if (kelvin->inline_array_length) {
+                glDrawArrays(pg->gl_primitive_mode,
+                             0, pg->inline_buffer_length);
+            } else if (pg->inline_array_length) {
                 unsigned int vertex_size =
-                    kelvin_bind_inline_array(kelvin);
+                    pgraph_bind_inline_array(pg);
                 unsigned int index_count =
-                    kelvin->inline_array_length*4 / vertex_size;
+                    pg->inline_array_length*4 / vertex_size;
                 
-                kelvin_bind_converted_vertex_attributes(d, kelvin,
-                    true, index_count);
-                glDrawArrays(kelvin->gl_primitive_mode,
+                NV2A_DPRINTF("draw inline array %d, %d\n", vertex_size, index_count);
+
+                pgraph_bind_converted_vertex_attributes(d,
+                    kelvin, true, index_count);
+                glDrawArrays(pg->gl_primitive_mode,
                              0, index_count);
-            } else if (kelvin->inline_elements_length) {
+            } else if (pg->inline_elements_length) {
 
 
                 uint32_t max_element = 0;
                 uint32_t min_element = (uint32_t)-1;
-                for (i=0; i<kelvin->inline_elements_length; i++) {
-                    max_element = MAX(kelvin->inline_elements[i], max_element);
-                    min_element = MIN(kelvin->inline_elements[i], min_element);
+                for (i=0; i<pg->inline_elements_length; i++) {
+                    max_element = MAX(pg->inline_elements[i], max_element);
+                    min_element = MIN(pg->inline_elements[i], min_element);
                 }
 
-                kelvin_bind_converted_vertex_attributes(d, kelvin,
-                    false, max_element+1);
-                glDrawElements(kelvin->gl_primitive_mode,
-                               kelvin->inline_elements_length,
+                pgraph_bind_converted_vertex_attributes(d,
+                    kelvin, false, max_element+1);
+                glDrawElements(pg->gl_primitive_mode,
+                               pg->inline_elements_length,
                                GL_UNSIGNED_INT,
-                               kelvin->inline_elements);
+                               pg->inline_elements);
             }/* else {
                 assert(false);
             }*/
@@ -2699,26 +2609,18 @@ static void pgraph_method(NV2AState *d,
 
             pgraph_update_surface(d, true);
 
-            bool use_vertex_program = GET_MASK(pg->regs[NV_PGRAPH_CSV0_D],
-                                               NV_PGRAPH_CSV0_D_MODE) == 2;
-            if (use_vertex_program) {
-                glEnable(GL_VERTEX_PROGRAM_ARB);
-                kelvin_bind_vertex_program(kelvin);
-            } else {
-                glDisable(GL_VERTEX_PROGRAM_ARB);
-            }
-
             pgraph_bind_shaders(pg);
 
             pgraph_bind_textures(d);
-            kelvin_bind_vertex_attributes(d, kelvin);
+            pgraph_bind_vertex_attributes(d, kelvin);
 
 
-            kelvin->gl_primitive_mode = kelvin_primitive_map[parameter];
 
-            kelvin->inline_elements_length = 0;
-            kelvin->inline_array_length = 0;
-            kelvin->inline_buffer_length = 0;
+            pg->gl_primitive_mode = kelvin_primitive_map[parameter];
+
+            pg->inline_elements_length = 0;
+            pg->inline_array_length = 0;
+            pg->inline_buffer_length = 0;
         }
         pg->surface_color.draw_dirty = true;
         break;
@@ -2788,38 +2690,38 @@ static void pgraph_method(NV2AState *d,
         break;
 
     case NV097_ARRAY_ELEMENT16:
-        assert(kelvin->inline_elements_length < NV2A_MAX_BATCH_LENGTH);
-        kelvin->inline_elements[
-            kelvin->inline_elements_length++] = parameter & 0xFFFF;
-        kelvin->inline_elements[
-            kelvin->inline_elements_length++] = parameter >> 16;
+        assert(pg->inline_elements_length < NV2A_MAX_BATCH_LENGTH);
+        pg->inline_elements[
+            pg->inline_elements_length++] = parameter & 0xFFFF;
+        pg->inline_elements[
+            pg->inline_elements_length++] = parameter >> 16;
         break;
     case NV097_ARRAY_ELEMENT32:
-        assert(kelvin->inline_elements_length < NV2A_MAX_BATCH_LENGTH);
-        kelvin->inline_elements[
-            kelvin->inline_elements_length++] = parameter;
+        assert(pg->inline_elements_length < NV2A_MAX_BATCH_LENGTH);
+        pg->inline_elements[
+            pg->inline_elements_length++] = parameter;
         break;
     case NV097_DRAW_ARRAYS: {
         unsigned int start = GET_MASK(parameter, NV097_DRAW_ARRAYS_START_INDEX);
         unsigned int count = GET_MASK(parameter, NV097_DRAW_ARRAYS_COUNT)+1;
 
 
-        kelvin_bind_converted_vertex_attributes(d, kelvin,
+        pgraph_bind_converted_vertex_attributes(d, kelvin,
             false, start + count);
-        glDrawArrays(kelvin->gl_primitive_mode, start, count);
+        glDrawArrays(pg->gl_primitive_mode, start, count);
 
         break;
     }
     case NV097_INLINE_ARRAY:
-        assert(kelvin->inline_array_length < NV2A_MAX_BATCH_LENGTH);
-        kelvin->inline_array[
-            kelvin->inline_array_length++] = parameter;
+        assert(pg->inline_array_length < NV2A_MAX_BATCH_LENGTH);
+        pg->inline_array[
+            pg->inline_array_length++] = parameter;
         break;
 
     case NV097_SET_VERTEX_DATA4UB ...
             NV097_SET_VERTEX_DATA4UB + 0x3c:
         slot = (class_method - NV097_SET_VERTEX_DATA4UB) / 4;
-        kelvin->vertex_attributes[slot].inline_value = parameter;
+        pg->vertex_attributes[slot].inline_value = parameter;
         break;
 
     case NV097_SET_SEMAPHORE_OFFSET:
@@ -2946,27 +2848,20 @@ static void pgraph_method(NV2AState *d,
                  GET_MASK(parameter, NV_097_SET_TRANSFORM_EXECUTION_MODE_RANGE_MODE));
         break;
     case NV097_SET_TRANSFORM_PROGRAM_CXT_WRITE_EN:
-        kelvin->enable_vertex_program_write = parameter;
+        pg->enable_vertex_program_write = parameter;
         break;
     case NV097_SET_TRANSFORM_PROGRAM_LOAD:
-        assert(parameter < NV2A_VERTEXSHADER_SLOTS);
-        kelvin->vertexshader_load_slot = parameter;
-        kelvin->vertexshaders[parameter].program_length = 0; /* ??? */
-        kelvin->vertexshaders[parameter].dirty = true;
+        assert(parameter < NV2A_MAX_TRANSFORM_PROGRAM_LENGTH);
+        pg->program_load = parameter * VSH_TOKEN_SIZE;
         break;
     case NV097_SET_TRANSFORM_PROGRAM_START:
-        assert(parameter < NV2A_VERTEXSHADER_SLOTS);
-        /* if the shader changed, dirty all the constants */
-        if (parameter != kelvin->vertexshader_start_slot) {
-            for (i=0; i<NV2A_VERTEXSHADER_CONSTANTS; i++) {
-                kelvin->constants[i].dirty = true;
-            }
-        }
-        kelvin->vertexshader_start_slot = parameter;
+        assert(parameter < NV2A_MAX_TRANSFORM_PROGRAM_LENGTH);
+        pg->program_start = parameter * VSH_TOKEN_SIZE;
+        pg->shaders_dirty = true;
         break;
     case NV097_SET_TRANSFORM_CONSTANT_LOAD:
         assert(parameter < NV2A_VERTEXSHADER_CONSTANTS);
-        kelvin->constant_load_slot = parameter;
+        pg->constant_load_slot = parameter * 4;
         NV2A_DPRINTF("load to %d\n", parameter);
         break;
 
diff --git a/hw/xbox/nv2a_vsh.c b/hw/xbox/nv2a_vsh.c
index eb3aefe988..bd1a6b419e 100644
--- a/hw/xbox/nv2a_vsh.c
+++ b/hw/xbox/nv2a_vsh.c
@@ -1,6 +1,7 @@
 /*
  * QEMU Geforce NV2A vertex shader translation
  *
+ * Copyright (c) 2014 Jannik Vogel
  * Copyright (c) 2012 espes
  *
  * Based on:
@@ -33,52 +34,6 @@
 
 #define VSH_D3DSCM_CORRECTION 96
 
-#define VSH_TOKEN_SIZE 4
-
-typedef enum {
-    FLD_ILU = 0,
-    FLD_MAC,
-    FLD_CONST,
-    FLD_V,
-    // Input A
-    FLD_A_NEG,
-    FLD_A_SWZ_X,
-    FLD_A_SWZ_Y,
-    FLD_A_SWZ_Z,
-    FLD_A_SWZ_W,
-    FLD_A_R,
-    FLD_A_MUX,
-    // Input B
-    FLD_B_NEG,
-    FLD_B_SWZ_X,
-    FLD_B_SWZ_Y,
-    FLD_B_SWZ_Z,
-    FLD_B_SWZ_W,
-    FLD_B_R,
-    FLD_B_MUX,
-    // Input C
-    FLD_C_NEG,
-    FLD_C_SWZ_X,
-    FLD_C_SWZ_Y,
-    FLD_C_SWZ_Z,
-    FLD_C_SWZ_W,
-    FLD_C_R_HIGH,
-    FLD_C_R_LOW,
-    FLD_C_MUX,
-    // Output
-    FLD_OUT_MAC_MASK,
-    FLD_OUT_R,
-    FLD_OUT_ILU_MASK,
-    FLD_OUT_O_MASK,
-    FLD_OUT_ORB,
-    FLD_OUT_ADDRESS,
-    FLD_OUT_MUX,
-    // Relative addressing
-    FLD_A0X,
-    // Final instruction
-    FLD_FINAL
-} VshFieldName;
-
 
 typedef enum {
     PARAM_UNKNOWN = 0,
@@ -222,7 +177,6 @@ static const VshOpcodeParams mac_opcode_params[] = {
 };
 
 
-
 static const char* mask_str[] = {
             // xyzw xyzw
     "",     // 0000 ____
@@ -240,7 +194,7 @@ static const char* mask_str[] = {
     ".xy",  // 1100 xy__
     ".xyw", // 1101 xy_w
     ".xyz", // 1110 xyz_
-    ""//.xyzw  1111 xyzw
+    ".xyzw" // 1111 xyzw
 };
 
 /* Note: OpenGL seems to be case-sensitive, and requires upper-case opcodes! */
@@ -265,7 +219,7 @@ static const char* ilu_opcode[] = {
     "NOP",
     "MOV",
     "RCP",
-    "RCP", // Was RCC
+    "RCC",
     "RSQ",
     "EXP",
     "LOG",
@@ -284,7 +238,7 @@ static bool ilu_force_scalar[] = {
 };
 
 static const char* out_reg_name[] = {
-    "R12", // "oPos",
+    "oPos",
     "???",
     "???",
     "oD0",
@@ -312,7 +266,8 @@ static int vsh_get_from_token(uint32_t *shader_token,
 {
     return (shader_token[subtoken] >> start_bit) & ~(0xFFFFFFFF << bit_length);
 }
-static uint8_t vsh_get_field(uint32_t *shader_token, VshFieldName field_name)
+
+uint8_t vsh_get_field(uint32_t *shader_token, VshFieldName field_name)
 {
 
     return (uint8_t)(vsh_get_from_token(shader_token,
@@ -327,7 +282,7 @@ static int16_t convert_c_register(const int16_t c_reg)
 {
     int16_t r = ((((c_reg >> 5) & 7) - 3) * 32) + (c_reg & 31);
     r += VSH_D3DSCM_CORRECTION; /* to map -96..95 to 0..191 */
-    return r;
+    return r; //FIXME: = c_reg?!
 }
 
 
@@ -341,7 +296,7 @@ static QString* decode_swizzle(uint32_t *shader_token,
     /* some microcode instructions force a scalar value */
     if (swizzle_field == FLD_C_SWZ_X
         && ilu_force_scalar[vsh_get_field(shader_token, FLD_ILU)]) {
-        x = y = z = w = x = vsh_get_field(shader_token, swizzle_field);
+        x = y = z = w = vsh_get_field(shader_token, swizzle_field);
     } else {
         x = vsh_get_field(shader_token, swizzle_field++);
         y = vsh_get_field(shader_token, swizzle_field++);
@@ -352,21 +307,21 @@ static QString* decode_swizzle(uint32_t *shader_token,
     if (x == SWIZZLE_X && y == SWIZZLE_Y
         && z == SWIZZLE_Z && w == SWIZZLE_W) {
         /* Don't print the swizzle if it's .xyzw */
-        return qstring_from_str("");
+        return qstring_from_str(""); // Will turn ".xyzw" into "."
     /* Don't print duplicates */
     } else if (x == y && y == z && z == w) {
         return qstring_from_str((char[]){'.', swizzle_str[x], '\0'});
-    } else if (x == y && z == w) {
+    } else if (y == z && z == w) {
         return qstring_from_str((char[]){'.',
             swizzle_str[x], swizzle_str[y], '\0'});
-    } /*else if (z == w) {
+    } else if (z == w) {
         return qstring_from_str((char[]){'.',
             swizzle_str[x], swizzle_str[y], swizzle_str[z], '\0'});
-    }*/ else {
+    } else {
         return qstring_from_str((char[]){'.',
                                        swizzle_str[x], swizzle_str[y],
                                        swizzle_str[z], swizzle_str[w],
-                                       '\0'});
+                                       '\0'}); // Normal swizzle mask
     }
 }
 
@@ -400,12 +355,14 @@ static QString* decode_opcode_input(uint32_t *shader_token,
     case PARAM_C:
         reg_num = convert_c_register(vsh_get_field(shader_token, FLD_CONST));
         if (vsh_get_field(shader_token, FLD_A0X) > 0) {
+            //FIXME: does this really require the "correction" done in convert_c_register?!
             snprintf(tmp, sizeof(tmp), "c[A0+%d]", reg_num);
         } else {
             snprintf(tmp, sizeof(tmp), "c[%d]", reg_num);
         }
         break;
     default:
+        printf("Param: 0x%x\n", param);
         assert(false);
     }
     qstring_append(ret_str, tmp);
@@ -444,16 +401,18 @@ static QString* decode_opcode(uint32_t *shader_token,
 
     if (mask > 0) {
         if (strcmp(opcode, mac_opcode[MAC_ARL]) == 0) {
-            qstring_append(ret, opcode);
+            qstring_append(ret, "  ARL(A0"); /* vsh_header declares "int A0"; GLSL is case-sensitive */
             qstring_append(ret, qstring_get_str(inputs));
-            qstring_append(ret, ";\n");
+            qstring_append(ret, ");\n"); /* close the ARL( macro call opened above */
         } else {
+            qstring_append(ret, "  ");
             qstring_append(ret, opcode);
-            qstring_append(ret, " R");
+            qstring_append(ret, "(");
+            qstring_append(ret, "R");
             qstring_append_int(ret, reg_num);
             qstring_append(ret, mask_str[mask]);
             qstring_append(ret, qstring_get_str(inputs));
-            qstring_append(ret, ";\n");
+            qstring_append(ret, ");\n");
         }
     }
 
@@ -462,15 +421,17 @@ static QString* decode_opcode(uint32_t *shader_token,
         /* Only if it's not masked away: */
         && vsh_get_field(shader_token, FLD_OUT_O_MASK) != 0) {
 
+        qstring_append(ret, "  ");
         qstring_append(ret, opcode);
+        qstring_append(ret, "(");
+
         if (vsh_get_field(shader_token, FLD_OUT_ORB) == OUTPUT_C) {
             /* TODO : Emulate writeable const registers */
-            qstring_append(ret, " c");
+            qstring_append(ret, "c");
             qstring_append_int(ret,
                 convert_c_register(
                     vsh_get_field(shader_token, FLD_OUT_ADDRESS)));
         } else {
-            qstring_append_chr(ret, ' ');
             qstring_append(ret,
                 out_reg_name[
                     vsh_get_field(shader_token, FLD_OUT_ADDRESS) & 0xF]);
@@ -479,7 +440,7 @@ static QString* decode_opcode(uint32_t *shader_token,
             mask_str[
                 vsh_get_field(shader_token, FLD_OUT_O_MASK)]);
         qstring_append(ret, qstring_get_str(inputs));
-        qstring_append(ret, ";\n");
+        qstring_append(ret, ");\n");
     }
 
     return ret;
@@ -563,141 +524,285 @@ static QString* decode_token(uint32_t *shader_token)
     return ret;
 }
 
-/* Vertex shader header, mapping Xbox1 registers to the ARB syntax (original
- * version by KingOfC). Note about the use of 'conventional' attributes in here:
- * Since we prefer to use only one shader for both immediate and deferred mode
- * rendering, we alias all attributes to conventional inputs as much as possible.
- * Only when there's no conventional attribute available, we use generic
- * attributes. So in the following header, we use conventional attributes first,
- * and generic attributes for the rest of the vertex attribute slots. This makes
- * it possible to support immediate and deferred mode rendering with the same
- * shader, and the use of the OpenGL fixed-function pipeline without a shader.
- */
 static const char* vsh_header =
-    "!!ARBvp1.0\n"
-    "TEMP R0,R1,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12;\n"
-    "ADDRESS A0;\n"
+    "#version 110\n"
+    "\n"
+    "attribute vec4 v0;\n"
+    "attribute vec4 v1;\n"
+    "attribute vec4 v2;\n"
+    "attribute vec4 v3;\n"
+    "attribute vec4 v4;\n"
+    "attribute vec4 v5;\n"
+    "attribute vec4 v6;\n"
+    "attribute vec4 v7;\n"
+    "attribute vec4 v8;\n"
+    "attribute vec4 v9;\n"
+    "attribute vec4 v10;\n"
+    "attribute vec4 v11;\n"
+    "attribute vec4 v12;\n"
+    "attribute vec4 v13;\n"
+    "attribute vec4 v14;\n"
+    "attribute vec4 v15;\n"
+    "\n"
+    //FIXME: What is a0 initialized as?
+    "int A0 = 0;\n"
+    "\n"
+    //FIXME: I just assumed this is true for all registers?!
+    "vec4 R0 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R1 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R2 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R3 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R4 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R5 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R6 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R7 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R8 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R9 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R10 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R11 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 R12 = vec4(0.0,0.0,0.0,1.0);\n"
+    "\n"
+    "#define oPos R12\n" /* oPos is a mirror of R12 */
+    "vec4 oD0 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 oD1 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 oB0 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 oB1 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 oPts = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 oFog = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 oT0 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 oT1 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 oT2 = vec4(0.0,0.0,0.0,1.0);\n"
+    "vec4 oT3 = vec4(0.0,0.0,0.0,1.0);\n"
+    "\n"
+
+    /* All constants in 1 array declaration */
+   "uniform vec4 c[192];\n"
+   "#define viewport_scale c[58]\n"
+   "#define viewport_offset c[59]\n"
+   "uniform vec2 cliprange;\n"
+
+    /* See:
+     * http://msdn.microsoft.com/en-us/library/windows/desktop/bb174703%28v=vs.85%29.aspx
+     * https://www.opengl.org/registry/specs/NV/vertex_program1_1.txt
+     */
+    "/* Converts number of components of rvalue to lvalue */\n"
+    "float _out(float l, vec4 r) { return r.x; }\n"
+    "vec2 _out(vec2 l, vec4 r) { return r.xy; }\n"
+    "vec3 _out(vec3 l, vec4 r) { return r.xyz; }\n"
+    "vec4 _out(vec4 l, vec4 r) { return r.xyzw; }\n"
+    "\n"
+//QQQ #ifdef NICE_CODE
+    "/* Converts the input to vec4, pads with last component */\n"
+    "vec4 _in(float v) { return vec4(v); }\n"
+    "vec4 _in(vec2 v) { return v.xyyy; }\n"
+    "vec4 _in(vec3 v) { return v.xyzz; }\n"
+    "vec4 _in(vec4 v) { return v.xyzw; }\n"
+//#else
+//    "/* Make sure input is always a vec4 */\n"
+//   "#define _in(v) vec4(v)\n"
+//#endif
+    "\n"
+    "#define MOV(dest, src) dest = _out(dest,_MOV(_in(src)))\n"
+    "vec4 _MOV(vec4 src)\n"
+    "{\n"
+    "  return src;\n"
+    "}\n"
+    "\n"
+    "#define MUL(dest, src0, src1) dest = _out(dest,_MUL(_in(src0), _in(src1)))\n"
+    "vec4 _MUL(vec4 src0, vec4 src1)\n" 
+    "{\n"
+    "  return src0 * src1;\n"
+    "}\n"
+    "\n"
+    "#define ADD(dest, src0, src1) dest = _out(dest,_ADD(_in(src0), _in(src1)))\n"
+    "vec4 _ADD(vec4 src0, vec4 src1)\n" 
+    "{\n"
+    "  return src0 + src1;\n"
+    "}\n"
+    "\n"
+    "#define MAD(dest, src0, src1, src2) dest = _out(dest,_MAD(_in(src0), _in(src1), _in(src2)))\n"
+    "vec4 _MAD(vec4 src0, vec4 src1, vec4 src2)\n" 
+    "{\n"
+    "  return src0 * src1 + src2;\n"
+    "}\n"
+    "\n"
+    "#define DP3(dest, src0, src1) dest = _out(dest,_DP3(_in(src0), _in(src1)))\n"
+    "vec4 _DP3(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(dot(src0.xyz, src1.xyz));\n"
+    "}\n"
+    "\n"
+    "#define DPH(dest, src0, src1) dest = _out(dest,_DPH(_in(src0), _in(src1)))\n"
+    "vec4 _DPH(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(dot(vec4(src0.xyz, 1.0), src1));\n"
+    "}\n"
+    "\n"
+    "#define DP4(dest, src0, src1) dest = _out(dest,_DP4(_in(src0), _in(src1)))\n"
+    "vec4 _DP4(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(dot(src0, src1));\n"
+    "}\n"
+    "\n"
+    "#define DST(dest, src0, src1) dest = _out(dest,_DST(_in(src0), _in(src1)))\n"
+    "vec4 _DST(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(1.0,\n"
+    "              src0.y * src1.y,\n"
+    "              src0.z,\n"
+    "              src1.w);\n"
+    "}\n"
+    "\n"
+    "#define MIN(dest, src0, src1) dest = _out(dest,_MIN(_in(src0), _in(src1)))\n"
+    "vec4 _MIN(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return min(src0, src1);\n"
+    "}\n"
+    "\n"
+    "#define MAX(dest, src0, src1) dest = _out(dest,_MAX(_in(src0), _in(src1)))\n"
+    "vec4 _MAX(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return max(src0, src1);\n"
+    "}\n"
+    "\n"
+    "#define SLT(dest, src0, src1) dest = _out(dest,_SLT(_in(src0), _in(src1)))\n"
+    "vec4 _SLT(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(lessThan(src0, src1));\n"
+    "}\n"
+    "\n"
+    "#define ARL(dest, src) dest = _ARL(_in(src).x)\n" /* only the x component feeds the address register */
+    "int _ARL(float src)\n"
+    "{\n"
+    "  return int(floor(src));\n" /* ARL loads floor(src); int() alone truncates toward zero */
+    "}\n"
+    "\n"
+    "#define SGE(dest, src0, src1) dest = _out(dest,_SGE(_in(src0), _in(src1)))\n"
+    "vec4 _SGE(vec4 src0, vec4 src1)\n"
+    "{\n"
+    "  return vec4(greaterThanEqual(src0, src1));\n"
+    "}\n"
+    "\n"
+    "#define RCP(dest, src) dest = _out(dest,_RCP(_in(src).x))\n"
+    "vec4 _RCP(float src)\n"
+    "{\n"
+    "  return vec4(1.0 / src);\n"
+    "}\n"
+    "\n"
+    "#define RCC(dest, src) dest = _out(dest,_RCC(_in(src).x))\n"
+    "vec4 _RCC(float src)\n"
+    "{\n"
+    "  float t = 1.0 / src;\n"
+    "  if (t > 0.0) {\n"
+    "    t = clamp(t, 5.42101e-020, 1.884467e+019);\n"
+    "  } else {\n"
+    "    t = clamp(t, -1.884467e+019, -5.42101e-020);\n"
+    "  }\n"
+    "  return vec4(t);\n"
+    "}\n"
+    "\n"
+    "#define RSQ(dest, src) dest = _out(dest,_RSQ(_in(src).x))\n" /* scalar op: x in, result replicated */
+    "vec4 _RSQ(float src)\n"
+    "{\n"
+    "  return vec4(inversesqrt(abs(src)));\n" /* RSQ operates on |src|; inversesqrt(<0) is undefined */
+    "}\n"
+    "\n"
+    "#define EXP(dest, src) dest = _out(dest,_EXP(_in(src).x))\n"
+    "vec4 _EXP(float src)\n"
+    "{\n"
+    "  return vec4(exp2(src));\n"
+    "}\n"
+    "\n"
+    "#define LOG(dest, src) dest = _out(dest,_LOG(_in(src).x))\n"
+    "vec4 _LOG(float src)\n"
+    "{\n"
+    "  return vec4(log2(src));\n"
+    "}\n"
+    "\n"
+    "#define LIT(dest, src) dest = _out(dest,_LIT(_in(src)))\n"
+    "vec4 _LIT(vec4 src)\n"
+    "{\n"
+    "  vec4 t = vec4(1.0, 0.0, 0.0, 1.0);\n"
+    "  float power = src.w;\n"
 #if 0
-    "ATTRIB v0 = vertex.position;" // (See "conventional" note above)
-    "ATTRIB v1 = vertex.%s;" // Note : We replace this with "weight" or "attrib[1]" depending GL_ARB_vertex_blend
-    "ATTRIB v2 = vertex.normal;"
-    "ATTRIB v3 = vertex.color.primary;"
-    "ATTRIB v4 = vertex.color.secondary;"
-    "ATTRIB v5 = vertex.fogcoord;"
-    "ATTRIB v6 = vertex.attrib[6];"
-    "ATTRIB v7 = vertex.attrib[7];"
-    "ATTRIB v8 = vertex.texcoord[0];"
-    "ATTRIB v9 = vertex.texcoord[1];"
-    "ATTRIB v10 = vertex.texcoord[2];"
-    "ATTRIB v11 = vertex.texcoord[3];"
-#else
-    "ATTRIB v0 = vertex.attrib[0];\n"
-    "ATTRIB v1 = vertex.attrib[1];\n"
-    "ATTRIB v2 = vertex.attrib[2];\n"
-    "ATTRIB v3 = vertex.attrib[3];\n"
-    "ATTRIB v4 = vertex.attrib[4];\n"
-    "ATTRIB v5 = vertex.attrib[5];\n"
-    "ATTRIB v6 = vertex.attrib[6];\n"
-    "ATTRIB v7 = vertex.attrib[7];\n"
-    "ATTRIB v8 = vertex.attrib[8];\n"
-    "ATTRIB v9 = vertex.attrib[9];\n"
-    "ATTRIB v10 = vertex.attrib[10];\n"
-    "ATTRIB v11 = vertex.attrib[11];\n"
+    //XXX: Limitation for 8.8 fixed point
+    "  power = max(power, -127.9961);\n"
+    "  power = min(power, 127.9961);\n"
 #endif
-    "ATTRIB v12 = vertex.attrib[12];\n"
-    "ATTRIB v13 = vertex.attrib[13];\n"
-    "ATTRIB v14 = vertex.attrib[14];\n"
-    "ATTRIB v15 = vertex.attrib[15];\n"
-    "OUTPUT oPos = result.position;\n"
-    "OUTPUT oD0 = result.color.front.primary;\n"
-    "OUTPUT oD1 = result.color.front.secondary;\n"
-    "OUTPUT oB0 = result.color.back.primary;\n"
-    "OUTPUT oB1 = result.color.back.secondary;\n"
-    "OUTPUT oPts = result.pointsize;\n"
-    "OUTPUT oFog = result.fogcoord;\n"
-    "OUTPUT oT0 = result.texcoord[0];\n"
-    "OUTPUT oT1 = result.texcoord[1];\n"
-    "OUTPUT oT2 = result.texcoord[2];\n"
-    "OUTPUT oT3 = result.texcoord[3];\n"
-
-    /* All constants in 1 array declaration (requires NV_gpu_program4?) */
-    "PARAM c[] = { program.env[0..191] };\n"
-
-    /* w component of outputs are expected to be initialised to 1 */
-    "MOV R12, 0.0;\n"
-    "MOV R12.w, 1.0;\n"
-    "MOV oD0.w, 1.0;\n"
-    "MOV oD1.w, 1.0;\n"
-    "MOV oB0.w, 1.0;\n"
-    "MOV oB1.w, 1.0;\n"
-    "MOV oT0.w, 1.0;\n"
-    "MOV oT1.w, 1.0;\n"
-    "MOV oT2.w, 1.0;\n"
-    "MOV oT3.w, 1.0;\n";
-
+    "  if (src.x > 0.0) {\n"
+    "    t.y = src.x;\n"
+    "    if (src.y > 0.0) {\n"
+    //XXX: Allowed approximation is EXP(power * LOG(src.y))
+    "      t.z = pow(src.y, power);\n"
+    "    }\n"
+    "  }\n"
+    "  return t;\n"
+    "}\n";
 
 QString* vsh_translate(uint16_t version,
                        uint32_t *tokens, unsigned int tokens_length)
 {
-    QString *ret = qstring_from_str(vsh_header);
-    
+    QString *body = qstring_from_str("\n");
+    QString *header = qstring_from_str(vsh_header);
+
+
+    bool has_final = false;
     uint32_t *cur_token = tokens;
+    unsigned int slot;
     while (cur_token-tokens < tokens_length) {
+        slot = (cur_token-tokens) / VSH_TOKEN_SIZE;
         QString *token_str = decode_token(cur_token);
-        qstring_append(ret, qstring_get_str(token_str));
+        qstring_append_fmt(body,
+                           "  /* Slot %d: 0x%08X 0x%08X 0x%08X 0x%08X */",
+                           slot,
+                           cur_token[0],cur_token[1],cur_token[2],cur_token[3]);
+        qstring_append(body, "\n");
+        qstring_append(body, qstring_get_str(token_str));
+        qstring_append(body, "\n");
         QDECREF(token_str);
 
         if (vsh_get_field(cur_token, FLD_FINAL)) {
+            has_final = true;
             break;
         }
         cur_token += VSH_TOKEN_SIZE;
     }
-
-    /* Note : Since we replaced oPos with r12 in the above decoding,
-     * we have to assign oPos at the end; This can be done in two ways;
-     * 1) When the shader is complete (including transformations),
-     *    we could just do a 'MOV oPos, R12;' and be done with it.
-     * 2) In case of D3DFVF_XYZRHW, it seems the NV2A applies the mvp
-     *    (model/view/projection) matrix transformation AFTER executing
-     *    the shader (but OpenGL expects *the*shader* to handle this
-     *    transformation).
-     * Until we can discern these two situations, we apply the matrix 
-     * transformation :
-     * TODO : What should we do about normals, eye-space lighting and all that?
-     */
-    qstring_append(ret,
-/*
-    '# Dxbx addition : Transform the vertex to clip coordinates :'
-    "DP4 R0.x, mvp[0], R12;"
-    "DP4 R0.y, mvp[1], R12;"
-    "DP4 R0.z, mvp[2], R12;"
-    "DP4 R0.w, mvp[3], R12;"
-    "MOV R12, R0;"
-*/
+    assert(has_final);
 
 
+    qstring_append(body,
         /* the shaders leave the result in screen space, while
          * opengl expects it in clip coordinates.
          * Use the magic viewport constants for now,
-         * but they're not necessarily present.
-         * Same idea as above I think, but dono what the mvp stuff is about...
+         * but they're not necessarily present...
         */
-        "# un-screenspace transform\n"
-        "ADD R12, R12, -c[59];\n"
-        "RCP R1.x, c[58].x;\n"
-        "RCP R1.y, c[58].y;\n"
 
-        /* scale_z = view_z == 0 ? 1 : (1 / view_z) */
-        "ABS R1.z, c[58].z;\n"
-        "SGE R1.z, -R1.z, 0;\n"
-        "ADD R1.z, R1.z, c[58].z;\n"
-        "RCP R1.z, R1.z;\n"
+        "  /* Un-screenspace transform */\n"
+        "  oPos.xyz = oPos.xyz - viewport_offset.xyz;\n"
+        "  vec3 tmp = vec3(1.0);\n"
 
-        "MUL R12.xyz, R12, R1;\n"
-        "MOV R12.w, 1.0;\n"
+        /* FIXME: old comment was "scale_z = view_z == 0 ? 1 : (1 / view_z)" */
+        "  if (viewport_scale.x != 0.0) { tmp.x /= viewport_scale.x; }\n"
+        "  if (viewport_scale.y != 0.0) { tmp.y /= viewport_scale.y; }\n"
+        "  if (viewport_scale.z != 0.0) { tmp.z /= viewport_scale.z; }\n"
 
-        /* undo the perspective divide? */
-        //"MUL R12.xyz, R12, R12.w;\n"
+        "  oPos.xyz *= tmp.xyz;\n"
+        "  oPos.w = 1.0;\n" //This breaks 2D? Maybe w is zero?
+        "\n"
+#if 0
+//FIXME: Use surface width / height / zeta max
+      "R12.z /= 16777215.0;\n" // Z[0;1]
+      "R12.z *= (cliprange.y - cliprange.x) / 16777215.0;\n" // Scale so [0;zmax] -> [0;cliprange_size]
+      "R12.z -= cliprange.x / 16777215.0;\n" // Move down so [clipmin_min;clipmin_max]
+      // X = [0;surface_width]; Y = [surface_height;0]; Z = [0;1]; W = ???
+      "R12.xyz = R12.xyz / vec3(640.0,480.0,1.0);\n"
+      // X,Z = [0;1]; Y = [1;0]; W = ???
+      "R12.xyz = R12.xyz * vec3(2.0) - vec3(1.0);\n"
+      "R12.y *= -1.0;\n"
+      "R12.w = 1.0;\n"
+      // X,Y,Z = [-1;+1]; W = 1
+        "\n"
+#endif
 
         /* Z coord [0;1]->[-1;1] mapping, see comment in transform_projection
          * in state.c
@@ -711,9 +816,31 @@ QString* vsh_translate(uint16_t version,
         //"ADD R12.z, R12.z, R12.z;\n"
         //"ADD R12.z, R12.z, -R12.w;\n"
 
-        "# End of shader:\n"
-        "MOV oPos, R12;\n"
-        "END"
+
+        "  /* Set outputs */\n"
+        "  gl_Position = oPos;\n"
+        "  gl_FrontColor = oD0;\n"
+        "  gl_FrontSecondaryColor = oD1;\n"
+        "  gl_BackColor = oB0;\n"
+        "  gl_BackSecondaryColor = oB1;\n"
+        "  gl_PointSize = oPts.x;\n"
+        "  gl_FogFragCoord = oFog.x;\n"
+        "  gl_TexCoord[0] = oT0;\n"
+        "  gl_TexCoord[1] = oT1;\n"
+        "  gl_TexCoord[2] = oT2;\n"
+        "  gl_TexCoord[3] = oT3;\n"
+        "\n"
     );
+
+    QString *ret = qstring_new();
+    qstring_append(ret, qstring_get_str(header));
+    qstring_append(ret,"\n"
+                       "void main(void)\n"
+                       "{\n");
+    qstring_append(ret, qstring_get_str(body));
+    qstring_append(ret,"}\n");
+    QDECREF(header);
+    QDECREF(body);
     return ret;
 }
+
diff --git a/hw/xbox/nv2a_vsh.h b/hw/xbox/nv2a_vsh.h
index a2f2abce05..5dd4b6394e 100644
--- a/hw/xbox/nv2a_vsh.h
+++ b/hw/xbox/nv2a_vsh.h
@@ -36,6 +36,54 @@
 // Xbox vertex read/write shader
 #define VSH_VERSION_XVSW                   0x7778
 
+#define VSH_TOKEN_SIZE 4
+
+typedef enum { /* field selectors accepted by vsh_get_field() */
+    FLD_ILU = 0, /* value indexes ilu_force_scalar[] in decode_swizzle() */
+    FLD_MAC,
+    FLD_CONST, /* raw constant index; remapped by convert_c_register() */
+    FLD_V,
+    // Input A
+    FLD_A_NEG,
+    FLD_A_SWZ_X, /* *_SWZ_X..W must stay consecutive: decode_swizzle() steps through them with ++ */
+    FLD_A_SWZ_Y,
+    FLD_A_SWZ_Z,
+    FLD_A_SWZ_W,
+    FLD_A_R,
+    FLD_A_MUX,
+    // Input B
+    FLD_B_NEG,
+    FLD_B_SWZ_X,
+    FLD_B_SWZ_Y,
+    FLD_B_SWZ_Z,
+    FLD_B_SWZ_W,
+    FLD_B_R,
+    FLD_B_MUX,
+    // Input C
+    FLD_C_NEG,
+    FLD_C_SWZ_X, /* decode_swizzle() replicates this one component for force-scalar ILU ops */
+    FLD_C_SWZ_Y,
+    FLD_C_SWZ_Z,
+    FLD_C_SWZ_W,
+    FLD_C_R_HIGH,
+    FLD_C_R_LOW,
+    FLD_C_MUX,
+    // Output
+    FLD_OUT_MAC_MASK,
+    FLD_OUT_R,
+    FLD_OUT_ILU_MASK,
+    FLD_OUT_O_MASK, /* indexes mask_str[]; 0 means the output write is masked away */
+    FLD_OUT_ORB, /* compared with OUTPUT_C to choose constant vs. output register */
+    FLD_OUT_ADDRESS, /* destination index: c register, or out_reg_name[] entry (low 4 bits) */
+    FLD_OUT_MUX,
+    // Relative addressing
+    FLD_A0X, /* non-zero: the constant operand is emitted as c[A0+n] */
+    // Final instruction
+    FLD_FINAL /* non-zero on the last instruction; vsh_translate() stops after it */
+} VshFieldName;
+
+uint8_t vsh_get_field(uint32_t *shader_token, VshFieldName field_name);
+
 QString* vsh_translate(uint16_t version,
                        uint32_t *tokens, unsigned int tokens_length);
 
diff --git a/hw/xbox/swizzle.c b/hw/xbox/swizzle.c
new file mode 100644
index 0000000000..0d80da9366
--- /dev/null
+++ b/hw/xbox/swizzle.c
@@ -0,0 +1,109 @@
+/*
+ * QEMU texture swizzling routines
+ *
+ * Copyright (c) 2013 espes
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include <stdint.h>
+#include <string.h>
+/* Copy a swizzled width x height x depth texture from src_buf into dst_buf as linear rows. */
+void unswizzle_rect(
+    uint8_t *src_buf,             /* swizzled source texels; only read */
+    unsigned int width,           /* texels per row */
+    unsigned int height,          /* rows per slice */
+    unsigned int depth,           /* number of slices */
+    uint8_t *dst_buf,             /* linear output, filled row by row, slice by slice */
+    unsigned int pitch,           /* bytes from one destination row to the next */
+    unsigned int bytes_per_pixel) /* bytes copied per texel */
+{
+    unsigned int offset_u = 0, offset_v = 0, offset_w = 0;
+    uint32_t mask_u = 0, mask_v = 0, mask_w = 0;
+    /* i walks the powers of two; j is the next unassigned bit of the swizzled index. */
+    unsigned int i = 1, j = 1;
+    /* Deal index bits out in u, v, w order to every axis that is still larger than i. */
+    while( (i <= width) || (i <= height) || (i <= depth) ) {
+        if(i < width) {
+            mask_u |= j;
+            j<<=1;
+        }
+        if(i < height) {
+            mask_v |= j;
+            j<<=1;
+        }
+        if(i < depth) {
+            mask_w |= j;
+            j<<=1;
+        }
+        i<<=1;
+    }
+    /* NOTE(review): offset_* are never non-zero, so start_* stay 0 and the loop below has no effect. */
+    uint32_t start_u = 0;
+    uint32_t start_v = 0;
+    uint32_t start_w = 0;
+    uint32_t mask_max = 0;
+
+    // get the biggest mask
+    if(mask_u > mask_v)
+        mask_max = mask_u;
+    else
+        mask_max = mask_v;
+    if(mask_w > mask_max)
+        mask_max = mask_w;
+
+    for(i = 1; i <= mask_max; i<<=1) {
+        if(i<=mask_u) {
+            if(mask_u & i) start_u |= (offset_u & i);
+            else offset_u <<= 1;
+        }
+
+        if(i <= mask_v) {
+            if(mask_v & i) start_v |= (offset_v & i);
+            else offset_v<<=1;
+        }
+
+        if(i <= mask_w) {
+            if(mask_w & i) start_w |= (offset_w & i);
+            else offset_w <<= 1;
+        }
+    }
+    /* Walk the destination linearly; u, v, w hold each axis' share of the swizzled source index. */
+    uint32_t w = start_w;
+    unsigned int z;
+    for(z=0; z<depth; z++) {
+        uint32_t v = start_v;
+
+        unsigned int y;
+        for(y=0; y<height; y++) {
+            uint32_t u = start_u;
+
+            unsigned int x;
+            for (x=0; x<width; x++) {
+                memcpy(dst_buf,
+                       src_buf + ( (u|v|w)*bytes_per_pixel ), /* u, v, w use disjoint bits, so OR combines them */
+                       bytes_per_pixel);
+                dst_buf += bytes_per_pixel;
+
+                u = (u - mask_u) & mask_u; /* step to the next value within mask_u's bits (wraps to 0) */
+            }
+            dst_buf += pitch - width * bytes_per_pixel; /* skip the unused tail of the destination row */
+
+            v = (v - mask_v) & mask_v; /* same masked increment for the row index */
+        }
+        w = (w - mask_w) & mask_w; /* same masked increment for the slice index */
+    }
+}
\ No newline at end of file
diff --git a/hw/xbox/swizzle.h b/hw/xbox/swizzle.h
new file mode 100644
index 0000000000..6aba83e7f3
--- /dev/null
+++ b/hw/xbox/swizzle.h
@@ -0,0 +1,34 @@
+/*
+ * QEMU texture swizzling routines
+ *
+ * Copyright (c) 2013 espes
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef HW_XBOX_SWIZZLE_H
+#define HW_XBOX_SWIZZLE_H
+
+#include <stdint.h>
+
+/* Copy the swizzled width x height x depth texture at src_buf into dst_buf.
+ * dst_buf receives linear rows that start `pitch` bytes apart; each texel
+ * is bytes_per_pixel bytes. Neither buffer is allocated or freed here. */
+void unswizzle_rect(uint8_t *src_buf,
+    unsigned int width, unsigned int height, unsigned int depth,
+    uint8_t *dst_buf, unsigned int pitch, unsigned int bytes_per_pixel);
+
+#endif /* HW_XBOX_SWIZZLE_H */
\ No newline at end of file