Basic cubemap support, though the sampling code is wrong.

2015-03-22 19:15:22 -07:00 · 2015-03-22 19:15:22 -07:00 · b392afbfae
parent 7677d6ea9f
commit b392afbfae
8 changed files with 265 additions and 19 deletions
--- a/src/xenia/gpu/gl4/gl4_shader.cc
+++ b/src/xenia/gpu/gl4/gl4_shader.cc
@ -74,6 +74,59 @@ std::string GL4Shader::GetHeader() {
  return header;
 }

+// Returns the GLSL helper source that PrepareVertexShader() and
+// PreparePixelShader() append to the code they generate. At the moment it
+// only contains cube(), the helper that TranslateALU_CUBEv() emits calls to.
+std::string GL4Shader::GetFooter() {
+  // http://www.nvidia.com/object/cube_map_ogl_tutorial.html
+  // http://developer.amd.com/wordpress/media/2012/10/R600_Instruction_Set_Architecture.pdf
+  // src0 = Rn.zzxy, src1 = Rn.yxzz
+  // dst.W = FaceId;
+  // dst.Z = 2.0f * MajorAxis;
+  // dst.Y = S cube coordinate;
+  // dst.X = T cube coordinate;
+  /*
+  major axis
+  direction     target                                sc     tc    ma
+  ----------   ------------------------------------   ---    ---   ---
+  +rx          GL_TEXTURE_CUBE_MAP_POSITIVE_X_EXT=0   -rz    -ry   rx
+  -rx          GL_TEXTURE_CUBE_MAP_NEGATIVE_X_EXT=1   +rz    -ry   rx
+  +ry          GL_TEXTURE_CUBE_MAP_POSITIVE_Y_EXT=2   +rx    +rz   ry
+  -ry          GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT=3   +rx    -rz   ry
+  +rz          GL_TEXTURE_CUBE_MAP_POSITIVE_Z_EXT=4   +rx    -ry   rz
+  -rz          GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT=5   -rx    -ry   rz
+  */
+  // Implementation notes for the GLSL below:
+  // - src0 is accepted but unused; the direction is rebuilt from src1 alone
+  //   by undoing its .yxzz swizzle (src = Rn.xyz).
+  // - The major axis is picked with >= (X wins over Y wins over Z) so that
+  //   ties select an axis that really is the largest. With strict > a vector
+  //   such as (1, 1, 0) fell through to the Z faces with ma == 0 and the
+  //   divisions below produced inf/NaN. Only the zero vector still has
+  //   ma == 0.
+  // - sc/tc keep the sign of the source components as listed in the table
+  //   above; only ma is taken as a magnitude. Using abs() for sc/tc limited
+  //   s and t to half of each face.
+  // - s and t are remapped from [-1, 1] to [0, 1] before being returned as
+  //   (t, s, 2 * ma, face_id).
+  static const std::string footer =
+      "vec4 cube(vec4 src0, vec4 src1) {\n"
+      "  vec3 src = vec3(src1.y, src1.x, src1.z);\n"
+      "  vec3 abs_src = abs(src);\n"
+      "  int face_id;\n"
+      "  float sc;\n"
+      "  float tc;\n"
+      "  float ma;\n"
+      "  if (abs_src.x >= abs_src.y && abs_src.x >= abs_src.z) {\n"
+      "    if (src.x > 0.0) {\n"
+      "      face_id = 0; sc = -src.z; tc = -src.y; ma = abs_src.x;\n"
+      "    } else {\n"
+      "      face_id = 1; sc =  src.z; tc = -src.y; ma = abs_src.x;\n"
+      "    }\n"
+      "  } else if (abs_src.y >= abs_src.z) {\n"
+      "    if (src.y > 0.0) {\n"
+      "      face_id = 2; sc =  src.x; tc =  src.z; ma = abs_src.y;\n"
+      "    } else {\n"
+      "      face_id = 3; sc =  src.x; tc = -src.z; ma = abs_src.y;\n"
+      "    }\n"
+      "  } else {\n"
+      "    if (src.z > 0.0) {\n"
+      "      face_id = 4; sc =  src.x; tc = -src.y; ma = abs_src.z;\n"
+      "    } else {\n"
+      "      face_id = 5; sc = -src.x; tc = -src.y; ma = abs_src.z;\n"
+      "    }\n"
+      "  }\n"
+      "  float s = (sc / ma + 1.0) / 2.0;\n"
+      "  float t = (tc / ma + 1.0) / 2.0;\n"
+      "  return vec4(t, s, 2.0 * ma, float(face_id));\n"
+      "}\n";
+  return footer;
+}
+
 bool GL4Shader::PrepareVertexArrayObject() {
  glCreateVertexArrays(1, &vao_);

@ -182,7 +235,6 @@ bool GL4Shader::PrepareVertexShader(
    PLOGE("Unable to prepare vertex shader array object");
    return false;
  }
-
  std::string apply_transform =
      "vec4 applyTransform(const in StateData state, vec4 pos) {\n"
      "  if (state.vtx_fmt.w == 0.0) {\n"
@ -221,7 +273,8 @@ bool GL4Shader::PrepareVertexShader(
      "  processVertex(state);\n"
      "  gl_Position = applyTransform(state, gl_Position);\n"
      "  draw_id = gl_DrawIDARB;\n"
-      "}\n";
+      "}\n" +
+      GetFooter();

  std::string translated_source =
      shader_translator_.TranslateVertexShader(this, program_cntl);
@ -273,7 +326,8 @@ bool GL4Shader::PreparePixelShader(
      "  if (state.alpha_test.x != 0.0) {\n"
      "    applyAlphaTest(int(state.alpha_test.y), state.alpha_test.z);\n"
      "  }\n"
-      "}\n";
+      "}\n" +
+      GetFooter();

  std::string translated_source =
      shader_translator_.TranslatePixelShader(this, program_cntl);
--- a/src/xenia/gpu/gl4/gl4_shader.h
+++ b/src/xenia/gpu/gl4/gl4_shader.h
@ -34,6 +34,7 @@ class GL4Shader : public Shader {

 protected:
  std::string GetHeader();
+  std::string GetFooter();
  bool PrepareVertexArrayObject();
  bool CompileProgram(std::string source);

--- a/src/xenia/gpu/gl4/gl4_shader_translator.cc
+++ b/src/xenia/gpu/gl4/gl4_shader_translator.cc
@ -675,7 +675,16 @@ bool GL4ShaderTranslator::TranslateALU_DOT2ADDv(const instr_alu_t& alu) {
  return true;
 }

-// CUBEv
+bool GL4ShaderTranslator::TranslateALU_CUBEv(const instr_alu_t& alu) {  // Translates CUBEv into a GLSL "cube(a, b)" call.
+  BeginAppendVectorOp(alu);  // NOTE(review): presumably opens the destination write -- confirm.
+  Append("cube(");  // cube() is the GLSL helper supplied by GL4Shader::GetFooter().
+  AppendVectorOpSrcReg(alu, 1);  // First vector operand, becomes cube()'s src0.
+  Append(", ");
+  AppendVectorOpSrcReg(alu, 2);  // Second vector operand, becomes cube()'s src1.
+  Append(")");
+  EndAppendVectorOp(alu);  // NOTE(review): presumably finishes the statement -- confirm.
+  return true;  // This translation has no failure path.
+}

 bool GL4ShaderTranslator::TranslateALU_MAX4v(const instr_alu_t& alu) {
  BeginAppendVectorOp(alu);
@ -1206,7 +1215,7 @@ bool GL4ShaderTranslator::TranslateALU(const instr_alu_t* alu, int sync) {
      ALU_INSTR_IMPL(DOT4v, 2),              // 15
      ALU_INSTR_IMPL(DOT3v, 2),              // 16
      ALU_INSTR_IMPL(DOT2ADDv, 3),           // 17 -- ???
-      ALU_INSTR(CUBEv, 2),                   // 18
+      ALU_INSTR_IMPL(CUBEv, 2),              // 18
      ALU_INSTR_IMPL(MAX4v, 1),              // 19
      ALU_INSTR_IMPL(PRED_SETE_PUSHv, 2),    // 20
      ALU_INSTR_IMPL(PRED_SETNE_PUSHv, 2),   // 21
@ -1876,15 +1885,29 @@ bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex,
  // Translate.
  // TODO(benvanik): if sampler == null, set to invalid color.
  Append("  if (state.texture_samplers[%d].x != 0) {\n", tex->const_idx & 0xF);
-  Append("    t = texture(");
-  Append("%s(state.texture_samplers[%d])", sampler_type, tex->const_idx & 0xF);
-  Append(", r%u.", tex->src_reg);
-  src_swiz = tex->src_swiz;
-  for (int i = 0; i < src_component_count; i++) {
-    Append("%c", chan_names[src_swiz & 0x3]);
-    src_swiz >>= 2;
+  if (tex->dimension == DIMENSION_CUBE) {
+    Append("    t.xyz = r%u.", tex->src_reg);
+    src_swiz = tex->src_swiz;
+    for (int i = 0; i < src_component_count; i++) {
+      Append("%c", chan_names[src_swiz & 0x3]);
+      src_swiz >>= 2;
+    }
+    Append(";\n");
+    // TODO(benvanik): undo CUBEv logic on t? (s,t,faceid)
+    Append("    t = texture(%s(state.texture_samplers[%d]), t.xyz);\n",
+           sampler_type, tex->const_idx & 0xF);
+  } else {
+    Append("    t = texture(");
+    Append("%s(state.texture_samplers[%d])", sampler_type,
+           tex->const_idx & 0xF);
+    Append(", r%u.", tex->src_reg);
+    src_swiz = tex->src_swiz;
+    for (int i = 0; i < src_component_count; i++) {
+      Append("%c", chan_names[src_swiz & 0x3]);
+      src_swiz >>= 2;
+    }
+    Append(");\n");
  }
-  Append(");\n");
  Append("  } else {\n");
  Append("    t = vec4(r%u.", tex->src_reg);
  src_swiz = tex->src_swiz;
--- a/src/xenia/gpu/gl4/gl4_shader_translator.h
+++ b/src/xenia/gpu/gl4/gl4_shader_translator.h
@ -85,7 +85,7 @@ class GL4ShaderTranslator {
  bool TranslateALU_DOT4v(const ucode::instr_alu_t& alu);
  bool TranslateALU_DOT3v(const ucode::instr_alu_t& alu);
  bool TranslateALU_DOT2ADDv(const ucode::instr_alu_t& alu);
-  // CUBEv
+  bool TranslateALU_CUBEv(const ucode::instr_alu_t& alu);
  bool TranslateALU_MAX4v(const ucode::instr_alu_t& alu);
  bool TranslateALU_PRED_SETXX_PUSHv(const ucode::instr_alu_t& alu,
                                     const char* op);
--- a/src/xenia/gpu/gl4/texture_cache.cc
+++ b/src/xenia/gpu/gl4/texture_cache.cc
@ -128,7 +128,8 @@ static const TextureConfig texture_configs[64] = {
     GL_INVALID_ENUM, GL_INVALID_ENUM},
    {TextureFormat::k_32_32_32_FLOAT, GL_RGB32F, GL_RGB, GL_FLOAT},
    {TextureFormat::k_DXT3A, GL_INVALID_ENUM, GL_INVALID_ENUM, GL_INVALID_ENUM},
-    {TextureFormat::k_DXT5A, GL_INVALID_ENUM, GL_INVALID_ENUM, GL_INVALID_ENUM},
+    {TextureFormat::k_DXT5A, GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,
+     GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_UNSIGNED_BYTE},
    {TextureFormat::k_CTX1, GL_INVALID_ENUM, GL_INVALID_ENUM, GL_INVALID_ENUM},
    {TextureFormat::k_DXT3A_AS_1_1_1_1, GL_INVALID_ENUM, GL_INVALID_ENUM,
     GL_INVALID_ENUM},
@ -470,9 +471,11 @@ TextureCache::TextureEntry* TextureCache::LookupOrInsertTexture(
    case Dimension::k2D:
      uploaded = UploadTexture2D(entry->handle, texture_info);
      break;
+    case Dimension::kCube:
+      uploaded = UploadTextureCube(entry->handle, texture_info);
+      break;
    case Dimension::k1D:
    case Dimension::k3D:
-    case Dimension::kCube:
      assert_unhandled_case(texture_info.dimension);
      return false;
  }
@ -773,6 +776,110 @@ bool TextureCache::UploadTexture2D(GLuint texture,
  return true;
 }

+// Uploads all six faces of a cube map texture into |texture|.
+//
+// The source data is read from the translated guest address, byte-swapped
+// (and untiled when texture_info.is_tiled) face by face into the scratch
+// buffer, and then handed to GL as a pixel unpack buffer for one 6-layer
+// glTextureSubImage3D / glCompressedTextureSubImage3D call.
+//
+// texture:      GL texture to allocate storage for and fill.
+// texture_info: texture description; size_cube, output_length and
+//               format_info are read.
+// Returns false when the texture format has no GL mapping, true otherwise.
+bool TextureCache::UploadTextureCube(GLuint texture,
+                                     const TextureInfo& texture_info) {
+  const auto host_address = memory_->Translate(texture_info.guest_address);
+
+  const auto& config =
+      texture_configs[uint32_t(texture_info.format_info->format)];
+  if (config.format == GL_INVALID_ENUM) {
+    assert_always("Unhandled texture format");
+    return false;
+  }
+
+  const auto& size = texture_info.size_cube;
+  const auto& format = *texture_info.format_info;
+
+  // size.block_width/block_height are padded up to whole 32x32 block tiles
+  // and describe the source layout. The scratch allocation below is only
+  // output_length bytes, which is computed from the unpadded block counts,
+  // so the copy loops must be limited to the output extents. Iterating over
+  // the padded counts wrote past the end of every face (and of the
+  // allocation) whenever the texture was not a multiple of 32 blocks.
+  const uint32_t copy_block_width = std::min<uint32_t>(
+      size.block_width, size.output_width / format.block_width);
+  const uint32_t copy_block_height = std::min<uint32_t>(
+      size.block_height, size.output_height / format.block_height);
+
+  size_t unpack_length = texture_info.output_length;
+  glTextureStorage2D(texture, 1, config.internal_format, size.output_width,
+                     size.output_height);
+
+  auto allocation = scratch_buffer_->Acquire(unpack_length);
+  if (!texture_info.is_tiled) {
+    if (size.input_pitch == size.output_pitch &&
+        size.input_face_length == size.output_face_length) {
+      // Fast path copy entire image. Only valid when both the row pitch and
+      // the per-face stride match, otherwise faces 1..5 would be read from
+      // the wrong source offsets.
+      TextureSwap(texture_info.endianness, allocation.host_ptr, host_address,
+                  unpack_length);
+    } else {
+      // Slow path copy row-by-row because strides differ.
+      // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does
+      // this exact thing under the covers, so we just always do it here.
+      const uint8_t* src_face = host_address;
+      uint8_t* dest_face = reinterpret_cast<uint8_t*>(allocation.host_ptr);
+      const uint32_t pitch = std::min(size.input_pitch, size.output_pitch);
+      for (int face = 0; face < 6; ++face) {
+        const uint8_t* src = src_face;
+        uint8_t* dest = dest_face;
+        for (uint32_t y = 0; y < copy_block_height; y++) {
+          TextureSwap(texture_info.endianness, dest, src, pitch);
+          src += size.input_pitch;
+          dest += size.output_pitch;
+        }
+        // Re-base on the face strides: source faces include tile padding
+        // rows that the destination faces do not have.
+        src_face += size.input_face_length;
+        dest_face += size.output_face_length;
+      }
+    }
+  } else {
+    // TODO(benvanik): optimize this inner loop (or work by tiles).
+    const uint8_t* src = host_address;
+    uint8_t* dest = reinterpret_cast<uint8_t*>(allocation.host_ptr);
+    uint32_t bytes_per_block =
+        format.block_width * format.block_height * format.bits_per_pixel / 8;
+    // Tiled textures can be packed; get the offset into the packed texture.
+    uint32_t offset_x;
+    uint32_t offset_y;
+    TextureInfo::GetPackedTileOffset(texture_info, &offset_x, &offset_y);
+    auto bpp = (bytes_per_block >> 2) +
+               ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
+    for (int face = 0; face < 6; ++face) {
+      for (uint32_t y = 0, output_base_offset = 0; y < copy_block_height;
+           y++, output_base_offset += size.output_pitch) {
+        auto input_base_offset = TextureInfo::TiledOffset2DOuter(
+            offset_y + y, (size.input_width / format.block_width), bpp);
+        for (uint32_t x = 0, output_offset = output_base_offset;
+             x < copy_block_width; x++, output_offset += bytes_per_block) {
+          auto input_offset =
+              TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp,
+                                              input_base_offset) >>
+              bpp;
+          TextureSwap(texture_info.endianness, dest + output_offset,
+                      src + input_offset * bytes_per_block, bytes_per_block);
+        }
+      }
+      src += size.input_face_length;
+      dest += size.output_face_length;
+    }
+  }
+  size_t unpack_offset = allocation.offset;
+  scratch_buffer_->Commit(std::move(allocation));
+  // TODO(benvanik): avoid flush on entire buffer by using another texture
+  // buffer.
+  scratch_buffer_->Flush();
+
+  // With the unpack buffer bound, the "pointer" argument of the upload calls
+  // is interpreted as a byte offset into that buffer.
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle());
+  if (texture_info.is_compressed()) {
+    glCompressedTextureSubImage3D(
+        texture, 0, 0, 0, 0, size.output_width, size.output_height, 6,
+        config.format, static_cast<GLsizei>(unpack_length),
+        reinterpret_cast<void*>(unpack_offset));
+  } else {
+    // Most of these don't seem to have an effect on compressed images.
+    // glPixelStorei(GL_UNPACK_SWAP_BYTES, GL_TRUE);
+    // glPixelStorei(GL_UNPACK_ALIGNMENT, texture_info.texel_pitch);
+    // glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width);
+    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
+    glTextureSubImage3D(texture, 0, 0, 0, 0, size.output_width,
+                        size.output_height, 6, config.format, config.type,
+                        reinterpret_cast<void*>(unpack_offset));
+  }
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+  return true;
+}
+
 }  // namespace gl4
 }  // namespace gpu
 }  // namespace xe
--- a/src/xenia/gpu/gl4/texture_cache.h
+++ b/src/xenia/gpu/gl4/texture_cache.h
@ -91,6 +91,7 @@ class TextureCache {
  void EvictTexture(TextureEntry* entry);

  bool UploadTexture2D(GLuint texture, const TextureInfo& texture_info);
+  bool UploadTextureCube(GLuint texture, const TextureInfo& texture_info);

  Memory* memory_;
  CircularBuffer* scratch_buffer_;
--- a/src/xenia/gpu/texture_info.cc
+++ b/src/xenia/gpu/texture_info.cc
@ -121,11 +121,15 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
      info.height = fetch.size_2d.height;
      break;
    case Dimension::k3D:
-    case Dimension::kCube:
      info.width = fetch.size_3d.width;
      info.height = fetch.size_3d.height;
      info.depth = fetch.size_3d.depth;
      break;
+    case Dimension::kCube:
+      info.width = fetch.size_stack.width;
+      info.height = fetch.size_stack.height;
+      info.depth = fetch.size_stack.depth;
+      break;
  }
  info.format_info = FormatInfo::Get(fetch.format);
  info.endianness = static_cast<Endian>(fetch.endianness);
@ -152,8 +156,8 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
      // TODO(benvanik): calculate size.
      return false;
    case Dimension::kCube:
-      // TODO(benvanik): calculate size.
-      return false;
+      info.CalculateTextureSizesCube(fetch);
+      break;
  }

  return true;
@ -208,6 +212,49 @@ void TextureInfo::CalculateTextureSizes2D(const xe_gpu_texture_fetch_t& fetch) {
  output_length = size_2d.output_pitch * block_height;
 }

+// Computes the cube map layout for this texture from |fetch| and stores it in
+// size_cube, input_length and output_length.
+//
+// The "input" values describe the source data including padding to whole
+// 32x32 block tiles (and 256 byte row alignment for linear textures); the
+// "output" values describe the block-aligned data without that padding.
+// Both lengths cover all six faces.
+void TextureInfo::CalculateTextureSizesCube(
+    const xe_gpu_texture_fetch_t& fetch) {
+  assert_true(fetch.size_stack.depth + 1 == 6);
+  size_cube.logical_width = 1 + fetch.size_stack.width;
+  size_cube.logical_height = 1 + fetch.size_stack.height;
+
+  // w/h in blocks must be a multiple of block size.
+  uint32_t block_width =
+      poly::round_up(size_cube.logical_width, format_info->block_width) /
+      format_info->block_width;
+  uint32_t block_height =
+      poly::round_up(size_cube.logical_height, format_info->block_height) /
+      format_info->block_height;
+
+  // Tiles are 32x32 blocks. All textures must be multiples of tile dimensions.
+  // Integer ceiling division: exact for every uint32_t block count in range
+  // and does not depend on std::ceilf, which not every standard library
+  // provides in namespace std.
+  uint32_t tile_width = (block_width + 31) / 32;
+  uint32_t tile_height = (block_height + 31) / 32;
+  size_cube.block_width = tile_width * 32;
+  size_cube.block_height = tile_height * 32;
+
+  uint32_t bytes_per_block = format_info->block_width *
+                             format_info->block_height *
+                             format_info->bits_per_pixel / 8;
+  uint32_t byte_pitch = size_cube.block_width * bytes_per_block;
+  if (!is_tiled) {
+    // Each row must be a multiple of 256 in linear textures.
+    byte_pitch = poly::round_up(byte_pitch, 256);
+  }
+
+  size_cube.input_width = size_cube.block_width * format_info->block_width;
+  size_cube.input_height = size_cube.block_height * format_info->block_height;
+
+  size_cube.output_width = block_width * format_info->block_width;
+  size_cube.output_height = block_height * format_info->block_height;
+
+  size_cube.input_pitch = byte_pitch;
+  size_cube.output_pitch = block_width * bytes_per_block;
+
+  // Input faces span the tile-padded row count; output faces only the
+  // unpadded one.
+  size_cube.input_face_length = size_cube.input_pitch * size_cube.block_height;
+  input_length = size_cube.input_face_length * 6;
+  size_cube.output_face_length = size_cube.output_pitch * block_height;
+  output_length = size_cube.output_face_length * 6;
+}
+
 void TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
                                      uint32_t* out_offset_x,
                                      uint32_t* out_offset_y) {
--- a/src/xenia/gpu/texture_info.h
+++ b/src/xenia/gpu/texture_info.h
@ -187,6 +187,18 @@ struct TextureInfo {
    struct {
    } size_3d;
    struct {  // Cube map layout; filled in by CalculateTextureSizesCube().
+      uint32_t logical_width;       // Face width in texels: 1 + fetch.size_stack.width.
+      uint32_t logical_height;      // Face height in texels: 1 + fetch.size_stack.height.
+      uint32_t block_width;         // Width in blocks, padded up to whole 32-block tiles.
+      uint32_t block_height;        // Height in blocks, padded up to whole 32-block tiles.
+      uint32_t input_width;         // Tile-padded width in texels (block_width * format block width).
+      uint32_t input_height;        // Tile-padded height in texels (block_height * format block height).
+      uint32_t input_pitch;         // Source bytes per block row; rounded up to 256 when not tiled.
+      uint32_t output_width;        // Block-aligned width in texels, without tile padding.
+      uint32_t output_height;       // Block-aligned height in texels, without tile padding.
+      uint32_t output_pitch;        // Unpacked bytes per block row, without tile padding.
+      uint32_t input_face_length;   // Source bytes per face: input_pitch * block_height.
+      uint32_t output_face_length;  // Unpacked bytes per face: output_pitch * unpadded block rows.
    } size_cube;
  };

@ -209,6 +221,7 @@ struct TextureInfo {
 private:
  void CalculateTextureSizes1D(const xenos::xe_gpu_texture_fetch_t& fetch);
  void CalculateTextureSizes2D(const xenos::xe_gpu_texture_fetch_t& fetch);
+  void CalculateTextureSizesCube(const xenos::xe_gpu_texture_fetch_t& fetch);
 };

 }  // namespace gpu