From b392afbfae5ea60a8cc5682e804b0e4abf58aec4 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Sun, 22 Mar 2015 19:15:22 -0700
Subject: [PATCH] Basic cubemap support, though the sampling code is wrong.

---
Review notes (below the "---" marker, so not part of the commit message):
 * cube() derives sc/tc from the signed source components, as its face table
   requires, and resolves ties between equal axes to a real major axis.
 * UploadTextureCube() copies only the blocks that fit in an output face, so
   it stays inside the scratch allocation, and uses the bulk copy only when
   the guest and host face layouts are identical.
 * Casts carry explicit template arguments.
 * CalculateTextureSizesCube() uses integer ceil-division for tile counts.
 * The blob ids on the "index" lines still describe the original revision.

 src/xenia/gpu/gl4/gl4_shader.cc            |  65 ++++++++++-
 src/xenia/gpu/gl4/gl4_shader.h             |   1 +
 src/xenia/gpu/gl4/gl4_shader_translator.cc |  45 ++++++--
 src/xenia/gpu/gl4/gl4_shader_translator.h  |   2 +-
 src/xenia/gpu/gl4/texture_cache.cc         | 124 ++++++++++++++++++++-
 src/xenia/gpu/gl4/texture_cache.h          |   1 +
 src/xenia/gpu/texture_info.cc              |  56 +++++++++-
 src/xenia/gpu/texture_info.h               |  13 +++
 8 files changed, 288 insertions(+), 19 deletions(-)

diff --git a/src/xenia/gpu/gl4/gl4_shader.cc b/src/xenia/gpu/gl4/gl4_shader.cc
index bdf6f4ec3..125315e5e 100644
--- a/src/xenia/gpu/gl4/gl4_shader.cc
+++ b/src/xenia/gpu/gl4/gl4_shader.cc
@@ -74,6 +74,64 @@ std::string GL4Shader::GetHeader() {
   return header;
 }
 
+// Returns GLSL helper functions that are appended to the shader source built
+// in PrepareVertexShader/PreparePixelShader. Currently only cube(), which
+// emulates the CUBEv vector ALU op, is provided.
+std::string GL4Shader::GetFooter() {
+  // http://www.nvidia.com/object/cube_map_ogl_tutorial.html
+  // http://developer.amd.com/wordpress/media/2012/10/R600_Instruction_Set_Architecture.pdf
+  // src0 = Rn.zzxy, src1 = Rn.yxzz
+  // dst.W = FaceId;
+  // dst.Z = 2.0f * MajorAxis;
+  // dst.Y = S cube coordinate;
+  // dst.X = T cube coordinate;
+  /*
+     major axis
+     direction   target                                sc   tc   ma
+     ----------  ------------------------------------  ---  ---  ---
+      +rx        GL_TEXTURE_CUBE_MAP_POSITIVE_X_EXT=0  -rz  -ry   rx
+      -rx        GL_TEXTURE_CUBE_MAP_NEGATIVE_X_EXT=1  +rz  -ry   rx
+      +ry        GL_TEXTURE_CUBE_MAP_POSITIVE_Y_EXT=2  +rx  +rz   ry
+      -ry        GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT=3  +rx  -rz   ry
+      +rz        GL_TEXTURE_CUBE_MAP_POSITIVE_Z_EXT=4  +rx  -ry   rz
+      -rz        GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT=5  -rx  -ry   rz
+  */
+  // sc/tc keep the sign of the source components (see table above); ma is the
+  // magnitude of the major axis so s/t land in [0, 1].
+  static const std::string footer =
+      "vec4 cube(vec4 src0, vec4 src1) {\n"
+      "  vec3 src = vec3(src1.y, src1.x, src1.z);\n"
+      "  vec3 abs_src = abs(src);\n"
+      "  int face_id;\n"
+      "  float sc;\n"
+      "  float tc;\n"
+      "  float ma;\n"
+      "  if (abs_src.x >= abs_src.y && abs_src.x >= abs_src.z) {\n"
+      "    if (src.x > 0.0) {\n"
+      "      face_id = 0; sc = -src.z; tc = -src.y; ma = abs_src.x;\n"
+      "    } else {\n"
+      "      face_id = 1; sc = src.z; tc = -src.y; ma = abs_src.x;\n"
+      "    }\n"
+      "  } else if (abs_src.y >= abs_src.z) {\n"
+      "    if (src.y > 0.0) {\n"
+      "      face_id = 2; sc = src.x; tc = src.z; ma = abs_src.y;\n"
+      "    } else {\n"
+      "      face_id = 3; sc = src.x; tc = -src.z; ma = abs_src.y;\n"
+      "    }\n"
+      "  } else {\n"
+      "    if (src.z > 0.0) {\n"
+      "      face_id = 4; sc = src.x; tc = -src.y; ma = abs_src.z;\n"
+      "    } else {\n"
+      "      face_id = 5; sc = -src.x; tc = -src.y; ma = abs_src.z;\n"
+      "    }\n"
+      "  }\n"
+      "  float s = (sc / ma + 1.0) / 2.0;\n"
+      "  float t = (tc / ma + 1.0) / 2.0;\n"
+      "  return vec4(t, s, 2.0 * ma, float(face_id));\n"
+      "}\n";
+  return footer;
+}
+
 bool GL4Shader::PrepareVertexArrayObject() {
   glCreateVertexArrays(1, &vao_);
 
@@ -182,7 +240,6 @@ bool GL4Shader::PrepareVertexShader(
     PLOGE("Unable to prepare vertex shader array object");
     return false;
   }
-
   std::string apply_transform =
       "vec4 applyTransform(const in StateData state, vec4 pos) {\n"
       "  if (state.vtx_fmt.w == 0.0) {\n"
@@ -221,7 +278,8 @@ bool GL4Shader::PrepareVertexShader(
       "  processVertex(state);\n"
       "  gl_Position = applyTransform(state, gl_Position);\n"
       "  draw_id = gl_DrawIDARB;\n"
-      "}\n";
+      "}\n" +
+      GetFooter();
 
   std::string translated_source =
       shader_translator_.TranslateVertexShader(this, program_cntl);
@@ -273,7 +331,8 @@ bool GL4Shader::PreparePixelShader(
       "  if (state.alpha_test.x != 0.0) {\n"
       "    applyAlphaTest(int(state.alpha_test.y), state.alpha_test.z);\n"
       "  }\n"
-      "}\n";
+      "}\n" +
+      GetFooter();
 
   std::string translated_source =
       shader_translator_.TranslatePixelShader(this, program_cntl);
diff --git a/src/xenia/gpu/gl4/gl4_shader.h b/src/xenia/gpu/gl4/gl4_shader.h
index 5c9b4eb24..3c7e203a0 100644
--- a/src/xenia/gpu/gl4/gl4_shader.h
+++ b/src/xenia/gpu/gl4/gl4_shader.h
@@ -34,6 +34,7 @@ class GL4Shader : public Shader {
 
  protected:
   std::string GetHeader();
+  std::string GetFooter();
   bool PrepareVertexArrayObject();
   bool CompileProgram(std::string source);
 
diff --git a/src/xenia/gpu/gl4/gl4_shader_translator.cc b/src/xenia/gpu/gl4/gl4_shader_translator.cc
index bc527eb2d..ec48a6319 100644
--- a/src/xenia/gpu/gl4/gl4_shader_translator.cc
+++ b/src/xenia/gpu/gl4/gl4_shader_translator.cc
@@ -675,7 +675,18 @@ bool GL4ShaderTranslator::TranslateALU_DOT2ADDv(const instr_alu_t& alu) {
   return true;
 }
 
-// CUBEv
+// CUBEv: emits a call to the cube() GLSL helper (see GL4Shader::GetFooter)
+// with the two vector source operands.
+bool GL4ShaderTranslator::TranslateALU_CUBEv(const instr_alu_t& alu) {
+  BeginAppendVectorOp(alu);
+  Append("cube(");
+  AppendVectorOpSrcReg(alu, 1);
+  Append(", ");
+  AppendVectorOpSrcReg(alu, 2);
+  Append(")");
+  EndAppendVectorOp(alu);
+  return true;
+}
 
 bool GL4ShaderTranslator::TranslateALU_MAX4v(const instr_alu_t& alu) {
   BeginAppendVectorOp(alu);
@@ -1206,7 +1217,7 @@ bool GL4ShaderTranslator::TranslateALU(const instr_alu_t* alu, int sync) {
       ALU_INSTR_IMPL(DOT4v, 2),              // 15
       ALU_INSTR_IMPL(DOT3v, 2),              // 16
       ALU_INSTR_IMPL(DOT2ADDv, 3),           // 17 -- ???
-      ALU_INSTR(CUBEv, 2),                   // 18
+      ALU_INSTR_IMPL(CUBEv, 2),              // 18
       ALU_INSTR_IMPL(MAX4v, 1),              // 19
       ALU_INSTR_IMPL(PRED_SETE_PUSHv, 2),    // 20
       ALU_INSTR_IMPL(PRED_SETNE_PUSHv, 2),   // 21
@@ -1876,15 +1887,29 @@ bool GL4ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex,
   // Translate.
   // TODO(benvanik): if sampler == null, set to invalid color.
   Append("  if (state.texture_samplers[%d].x != 0) {\n", tex->const_idx & 0xF);
-  Append("    t = texture(");
-  Append("%s(state.texture_samplers[%d])", sampler_type, tex->const_idx & 0xF);
-  Append(", r%u.", tex->src_reg);
-  src_swiz = tex->src_swiz;
-  for (int i = 0; i < src_component_count; i++) {
-    Append("%c", chan_names[src_swiz & 0x3]);
-    src_swiz >>= 2;
+  if (tex->dimension == DIMENSION_CUBE) {
+    Append("    t.xyz = r%u.", tex->src_reg);
+    src_swiz = tex->src_swiz;
+    for (int i = 0; i < src_component_count; i++) {
+      Append("%c", chan_names[src_swiz & 0x3]);
+      src_swiz >>= 2;
+    }
+    Append(";\n");
+    // TODO(benvanik): undo CUBEv logic on t? (s,t,faceid)
+    Append("    t = texture(%s(state.texture_samplers[%d]), t.xyz);\n",
+           sampler_type, tex->const_idx & 0xF);
+  } else {
+    Append("    t = texture(");
+    Append("%s(state.texture_samplers[%d])", sampler_type,
+           tex->const_idx & 0xF);
+    Append(", r%u.", tex->src_reg);
+    src_swiz = tex->src_swiz;
+    for (int i = 0; i < src_component_count; i++) {
+      Append("%c", chan_names[src_swiz & 0x3]);
+      src_swiz >>= 2;
+    }
+    Append(");\n");
   }
-  Append(");\n");
   Append("  } else {\n");
   Append("    t = vec4(r%u.", tex->src_reg);
   src_swiz = tex->src_swiz;
diff --git a/src/xenia/gpu/gl4/gl4_shader_translator.h b/src/xenia/gpu/gl4/gl4_shader_translator.h
index 663e72684..8684f3787 100644
--- a/src/xenia/gpu/gl4/gl4_shader_translator.h
+++ b/src/xenia/gpu/gl4/gl4_shader_translator.h
@@ -85,7 +85,7 @@ class GL4ShaderTranslator {
   bool TranslateALU_DOT4v(const ucode::instr_alu_t& alu);
   bool TranslateALU_DOT3v(const ucode::instr_alu_t& alu);
   bool TranslateALU_DOT2ADDv(const ucode::instr_alu_t& alu);
-  // CUBEv
+  bool TranslateALU_CUBEv(const ucode::instr_alu_t& alu);
   bool TranslateALU_MAX4v(const ucode::instr_alu_t& alu);
   bool TranslateALU_PRED_SETXX_PUSHv(const ucode::instr_alu_t& alu,
                                      const char* op);
diff --git a/src/xenia/gpu/gl4/texture_cache.cc b/src/xenia/gpu/gl4/texture_cache.cc
index e6afa5304..f458182b8 100644
--- a/src/xenia/gpu/gl4/texture_cache.cc
+++ b/src/xenia/gpu/gl4/texture_cache.cc
@@ -128,7 +128,8 @@ static const TextureConfig texture_configs[64] = {
      GL_INVALID_ENUM, GL_INVALID_ENUM},
     {TextureFormat::k_32_32_32_FLOAT, GL_RGB32F, GL_RGB, GL_FLOAT},
     {TextureFormat::k_DXT3A, GL_INVALID_ENUM, GL_INVALID_ENUM, GL_INVALID_ENUM},
-    {TextureFormat::k_DXT5A, GL_INVALID_ENUM, GL_INVALID_ENUM, GL_INVALID_ENUM},
+    {TextureFormat::k_DXT5A, GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,
+     GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_UNSIGNED_BYTE},
     {TextureFormat::k_CTX1, GL_INVALID_ENUM, GL_INVALID_ENUM, GL_INVALID_ENUM},
     {TextureFormat::k_DXT3A_AS_1_1_1_1, GL_INVALID_ENUM, GL_INVALID_ENUM,
      GL_INVALID_ENUM},
@@ -470,9 +471,11 @@ TextureCache::TextureEntry* TextureCache::LookupOrInsertTexture(
     case Dimension::k2D:
       uploaded = UploadTexture2D(entry->handle, texture_info);
       break;
+    case Dimension::kCube:
+      uploaded = UploadTextureCube(entry->handle, texture_info);
+      break;
     case Dimension::k1D:
     case Dimension::k3D:
-    case Dimension::kCube:
       assert_unhandled_case(texture_info.dimension);
       return false;
   }
@@ -773,6 +776,123 @@ bool TextureCache::UploadTexture2D(GLuint texture,
   return true;
 }
 
+// Uploads the six faces of a cube map from guest memory into |texture|.
+// Faces are endian-swapped (and untiled when needed) into the scratch buffer
+// as six tightly packed images and handed to GL via the pixel unpack buffer.
+// Returns false if texture_configs has no GL format for the guest format.
+bool TextureCache::UploadTextureCube(GLuint texture,
+                                     const TextureInfo& texture_info) {
+  const auto host_address = memory_->Translate(texture_info.guest_address);
+
+  const auto& config =
+      texture_configs[uint32_t(texture_info.format_info->format)];
+  if (config.format == GL_INVALID_ENUM) {
+    assert_always("Unhandled texture format");
+    return false;
+  }
+
+  size_t unpack_length = texture_info.output_length;
+  glTextureStorage2D(texture, 1, config.internal_format,
+                     texture_info.size_cube.output_width,
+                     texture_info.size_cube.output_height);
+
+  // Guest faces are padded to whole 32x32 block tiles but output faces are
+  // not. Only the blocks that fit in an output face may be copied, otherwise
+  // the copy runs past the end of the scratch allocation.
+  const uint32_t copy_width = texture_info.size_cube.output_width /
+                              texture_info.format_info->block_width;
+  const uint32_t copy_height = texture_info.size_cube.output_height /
+                               texture_info.format_info->block_height;
+
+  auto allocation = scratch_buffer_->Acquire(unpack_length);
+  const uint8_t* src = host_address;
+  uint8_t* dest = reinterpret_cast<uint8_t*>(allocation.host_ptr);
+  if (!texture_info.is_tiled) {
+    if (texture_info.size_cube.input_pitch ==
+            texture_info.size_cube.output_pitch &&
+        texture_info.size_cube.input_face_length ==
+            texture_info.size_cube.output_face_length) {
+      // Fast path copy entire image; requires identical face layouts.
+      TextureSwap(texture_info.endianness, dest, src, unpack_length);
+    } else {
+      // Slow path copy row-by-row because strides differ.
+      // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does
+      // this exact thing under the covers, so we just always do it here.
+      uint32_t pitch = std::min(texture_info.size_cube.input_pitch,
+                                texture_info.size_cube.output_pitch);
+      for (int face = 0; face < 6; ++face) {
+        const uint8_t* src_row = src;
+        uint8_t* dest_row = dest;
+        for (uint32_t y = 0; y < copy_height; y++) {
+          TextureSwap(texture_info.endianness, dest_row, src_row, pitch);
+          src_row += texture_info.size_cube.input_pitch;
+          dest_row += texture_info.size_cube.output_pitch;
+        }
+        src += texture_info.size_cube.input_face_length;
+        dest += texture_info.size_cube.output_face_length;
+      }
+    }
+  } else {
+    // TODO(benvanik): optimize this inner loop (or work by tiles).
+    uint32_t bytes_per_block = texture_info.format_info->block_width *
+                               texture_info.format_info->block_height *
+                               texture_info.format_info->bits_per_pixel / 8;
+    // Tiled textures can be packed; get the offset into the packed texture.
+    uint32_t offset_x;
+    uint32_t offset_y;
+    TextureInfo::GetPackedTileOffset(texture_info, &offset_x, &offset_y);
+    auto bpp = (bytes_per_block >> 2) +
+               ((bytes_per_block >> 1) >> (bytes_per_block >> 2));
+    for (int face = 0; face < 6; ++face) {
+      for (uint32_t y = 0, output_base_offset = 0; y < copy_height;
+           y++, output_base_offset += texture_info.size_cube.output_pitch) {
+        auto input_base_offset = TextureInfo::TiledOffset2DOuter(
+            offset_y + y, (texture_info.size_cube.input_width /
+                           texture_info.format_info->block_width),
+            bpp);
+        for (uint32_t x = 0, output_offset = output_base_offset;
+             x < copy_width; x++, output_offset += bytes_per_block) {
+          auto input_offset =
+              TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp,
+                                              input_base_offset) >>
+              bpp;
+          TextureSwap(texture_info.endianness, dest + output_offset,
+                      src + input_offset * bytes_per_block, bytes_per_block);
+        }
+      }
+      src += texture_info.size_cube.input_face_length;
+      dest += texture_info.size_cube.output_face_length;
+    }
+  }
+  size_t unpack_offset = allocation.offset;
+  scratch_buffer_->Commit(std::move(allocation));
+  // TODO(benvanik): avoid flush on entire buffer by using another texture
+  // buffer.
+  scratch_buffer_->Flush();
+
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, scratch_buffer_->handle());
+  if (texture_info.is_compressed()) {
+    glCompressedTextureSubImage3D(
+        texture, 0, 0, 0, 0, texture_info.size_cube.output_width,
+        texture_info.size_cube.output_height, 6, config.format,
+        static_cast<GLsizei>(unpack_length),
+        reinterpret_cast<void*>(unpack_offset));
+  } else {
+    // Most of these don't seem to have an effect on compressed images.
+    // glPixelStorei(GL_UNPACK_SWAP_BYTES, GL_TRUE);
+    // glPixelStorei(GL_UNPACK_ALIGNMENT, texture_info.texel_pitch);
+    // glPixelStorei(GL_UNPACK_ROW_LENGTH, texture_info.size_2d.input_width);
+    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
+    glTextureSubImage3D(texture, 0, 0, 0, 0,
+                        texture_info.size_cube.output_width,
+                        texture_info.size_cube.output_height, 6, config.format,
+                        config.type, reinterpret_cast<void*>(unpack_offset));
+  }
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+  return true;
+}
+
 }  // namespace gl4
 }  // namespace gpu
 }  // namespace xe
diff --git a/src/xenia/gpu/gl4/texture_cache.h b/src/xenia/gpu/gl4/texture_cache.h
index cc2997d57..55fcd4313 100644
--- a/src/xenia/gpu/gl4/texture_cache.h
+++ b/src/xenia/gpu/gl4/texture_cache.h
@@ -91,6 +91,7 @@ class TextureCache {
   void EvictTexture(TextureEntry* entry);
 
   bool UploadTexture2D(GLuint texture, const TextureInfo& texture_info);
+  bool UploadTextureCube(GLuint texture, const TextureInfo& texture_info);
 
   Memory* memory_;
   CircularBuffer* scratch_buffer_;
diff --git a/src/xenia/gpu/texture_info.cc b/src/xenia/gpu/texture_info.cc
index 96b14ea7a..ac431c4d4 100644
--- a/src/xenia/gpu/texture_info.cc
+++ b/src/xenia/gpu/texture_info.cc
@@ -121,11 +121,15 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
       info.height = fetch.size_2d.height;
       break;
     case Dimension::k3D:
-    case Dimension::kCube:
       info.width = fetch.size_3d.width;
       info.height = fetch.size_3d.height;
       info.depth = fetch.size_3d.depth;
       break;
+    case Dimension::kCube:
+      info.width = fetch.size_stack.width;
+      info.height = fetch.size_stack.height;
+      info.depth = fetch.size_stack.depth;
+      break;
   }
   info.format_info = FormatInfo::Get(fetch.format);
   info.endianness = static_cast<Endian>(fetch.endianness);
@@ -152,8 +156,8 @@ bool TextureInfo::Prepare(const xe_gpu_texture_fetch_t& fetch,
       // TODO(benvanik): calculate size.
       return false;
     case Dimension::kCube:
-      // TODO(benvanik): calculate size.
-      return false;
+      info.CalculateTextureSizesCube(fetch);
+      break;
   }
 
   return true;
@@ -208,6 +212,52 @@ void TextureInfo::CalculateTextureSizes2D(const xe_gpu_texture_fetch_t& fetch) {
   output_length = size_2d.output_pitch * block_height;
 }
 
+// Computes size_cube plus input_length/output_length for a cube map described
+// by |fetch|. Guest (input) faces are padded to whole 32x32 block tiles while
+// host (output) faces are only rounded up to whole blocks.
+void TextureInfo::CalculateTextureSizesCube(const xe_gpu_texture_fetch_t& fetch) {
+  assert_true(fetch.size_stack.depth + 1 == 6);
+  size_cube.logical_width = 1 + fetch.size_stack.width;
+  size_cube.logical_height = 1 + fetch.size_stack.height;
+
+  // w/h in blocks must be a multiple of block size.
+  uint32_t block_width =
+      poly::round_up(size_cube.logical_width, format_info->block_width) /
+      format_info->block_width;
+  uint32_t block_height =
+      poly::round_up(size_cube.logical_height, format_info->block_height) /
+      format_info->block_height;
+
+  // Tiles are 32x32 blocks. All textures must be multiples of tile dimensions.
+  uint32_t tile_width = (block_width + 31) / 32;
+  uint32_t tile_height = (block_height + 31) / 32;
+  size_cube.block_width = tile_width * 32;
+  size_cube.block_height = tile_height * 32;
+
+  uint32_t bytes_per_block = format_info->block_width *
+                             format_info->block_height *
+                             format_info->bits_per_pixel / 8;
+  uint32_t byte_pitch = tile_width * 32 * bytes_per_block;
+  if (!is_tiled) {
+    // Each row must be a multiple of 256 in linear textures.
+    byte_pitch = poly::round_up(byte_pitch, 256);
+  }
+
+  size_cube.input_width = tile_width * 32 * format_info->block_width;
+  size_cube.input_height = tile_height * 32 * format_info->block_height;
+
+  size_cube.output_width = block_width * format_info->block_width;
+  size_cube.output_height = block_height * format_info->block_height;
+
+  size_cube.input_pitch = byte_pitch;
+  size_cube.output_pitch = block_width * bytes_per_block;
+
+  size_cube.input_face_length = size_cube.input_pitch * size_cube.block_height;
+  input_length = size_cube.input_face_length * 6;
+  size_cube.output_face_length = size_cube.output_pitch * block_height;
+  output_length = size_cube.output_face_length * 6;
+}
+
 void TextureInfo::GetPackedTileOffset(const TextureInfo& texture_info,
                                       uint32_t* out_offset_x,
                                       uint32_t* out_offset_y) {
diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h
index 031dc7add..d6823fdfc 100644
--- a/src/xenia/gpu/texture_info.h
+++ b/src/xenia/gpu/texture_info.h
@@ -187,6 +187,18 @@ struct TextureInfo {
     struct {
     } size_3d;
     struct {
+      uint32_t logical_width;
+      uint32_t logical_height;
+      uint32_t block_width;
+      uint32_t block_height;
+      uint32_t input_width;
+      uint32_t input_height;
+      uint32_t input_pitch;
+      uint32_t output_width;
+      uint32_t output_height;
+      uint32_t output_pitch;
+      uint32_t input_face_length;
+      uint32_t output_face_length;
     } size_cube;
   };
 
@@ -209,6 +221,7 @@ struct TextureInfo {
  private:
   void CalculateTextureSizes1D(const xenos::xe_gpu_texture_fetch_t& fetch);
   void CalculateTextureSizes2D(const xenos::xe_gpu_texture_fetch_t& fetch);
+  void CalculateTextureSizesCube(const xenos::xe_gpu_texture_fetch_t& fetch);
 };
 
 }  // namespace gpu