diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h
index 824a1d746..662ceced7 100644
--- a/src/xenia/base/math.h
+++ b/src/xenia/base/math.h
@@ -64,6 +64,25 @@ constexpr uint32_t select_bits(uint32_t value, uint32_t a, uint32_t b) {
   return (value & make_bitmask(a, b)) >> a;
 }
 
+// Returns the number of set bits in v. Adjacent bit fields are summed in
+// parallel (2-bit, 4-bit, then 8-bit sums); the multiply folds the byte sums.
+inline uint32_t bit_count(uint32_t v) {
+  v = v - ((v >> 1) & 0x55555555);
+  v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+  return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
+}
+
+// 64-bit overload of bit_count; the result is in the range [0, 64].
+inline uint32_t bit_count(uint64_t v) {
+  v = (v & 0x5555555555555555ULL) + ((v >> 1) & 0x5555555555555555ULL);
+  v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
+  v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+  v = v + (v >> 8);
+  v = v + (v >> 16);
+  v = (v + (v >> 32)) & 0x0000007F;
+  return static_cast<uint32_t>(v);
+}
+
 // lzcnt instruction, typed for integers of all sizes.
 // The number of leading zero bits in the value parameter. If value is zero, the
 // return value is the size of the input operand (8, 16, 32, or 64). If the most
diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h
index e3e4d7072..476369e53 100644
--- a/src/xenia/gpu/shader.h
+++ b/src/xenia/gpu/shader.h
@@ -504,8 +504,11 @@ class Shader {
     // Each bit corresponds to a storage index [0-31].
     uint32_t int_bitmap;
     // Bitmap of all kConstantBool registers read by the shader.
-    // Each bit corresponds to a storage index [0-31].
-    uint32_t bool_bitmap;
+    // Each bit corresponds to a storage index [0-255].
+    uint32_t bool_bitmap[256 / 32];
+
+    // Computed byte count of all registers required when packed.
+    uint32_t packed_byte_length;
   };
 
   Shader(ShaderType shader_type, uint64_t ucode_data_hash,
diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc
index 68a70d5fb..1c9f31962 100644
--- a/src/xenia/gpu/shader_translator.cc
+++ b/src/xenia/gpu/shader_translator.cc
@@ -125,6 +125,24 @@ bool ShaderTranslator::Translate(Shader* shader) {
 
   TranslateBlocks();
 
+  // Compute total bytes used by the register map.
+  // This saves us work later when we need to pack them.
+  constant_register_map_.packed_byte_length = 0;
+  for (int i = 0; i < 4; ++i) {
+    // Each bit indicates a vec4 (4 floats).
+    constant_register_map_.packed_byte_length +=
+        4 * 4 * xe::bit_count(constant_register_map_.float_bitmap[i]);
+  }
+  // Each bit indicates a single word.
+  constant_register_map_.packed_byte_length +=
+      4 * xe::bit_count(constant_register_map_.int_bitmap);
+  // Direct map between words and words we upload (all 256 / 32 of them).
+  for (int i = 0; i < 256 / 32; ++i) {
+    if (constant_register_map_.bool_bitmap[i]) {
+      constant_register_map_.packed_byte_length += 4;
+    }
+  }
+
   shader->errors_ = std::move(errors_);
   shader->translated_binary_ = CompleteTranslation();
   shader->ucode_disassembly_ = ucode_disasm_buffer_.to_string();
@@ -490,7 +508,8 @@ void ShaderTranslator::TranslateControlFlowCondExec(
   i.instruction_count = cf.count();
   i.type = ParsedExecInstruction::Type::kConditional;
   i.bool_constant_index = cf.bool_address();
-  constant_register_map_.bool_bitmap |= 1 << i.bool_constant_index;
+  constant_register_map_.bool_bitmap[i.bool_constant_index / 32] |=
+      1u << (i.bool_constant_index % 32);
   i.condition = cf.condition();
   switch (cf.opcode()) {
     case ControlFlowOpcode::kCondExec:
@@ -567,7 +586,8 @@ void ShaderTranslator::TranslateControlFlowCondCall(
   } else {
     i.type = ParsedCallInstruction::Type::kConditional;
     i.bool_constant_index = cf.bool_address();
-    constant_register_map_.bool_bitmap |= 1 << i.bool_constant_index;
+    constant_register_map_.bool_bitmap[i.bool_constant_index / 32] |=
+        1u << (i.bool_constant_index % 32);
     i.condition = cf.condition();
   }
 
@@ -599,7 +619,8 @@ void ShaderTranslator::TranslateControlFlowCondJmp(
   } else {
     i.type = ParsedJumpInstruction::Type::kConditional;
     i.bool_constant_index = cf.bool_address();
-    constant_register_map_.bool_bitmap |= 1 << i.bool_constant_index;
+    constant_register_map_.bool_bitmap[i.bool_constant_index / 32] |=
+        1u << (i.bool_constant_index % 32);
     i.condition = cf.condition();
   }
 