diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 31f2a680e..deb7fca35 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -875,7 +875,7 @@ void DxbcShaderTranslator::StartTranslation() { // depends on the guest code (thus no guarantees), initialize everything // now (except for pv, it's an internal temporary variable, not accessible // by the guest). - system_temp_pv_ = PushSystemTemp(); + system_temp_result_ = PushSystemTemp(); system_temp_ps_pc_p0_a0_ = PushSystemTemp(0b1111); system_temp_aL_ = PushSystemTemp(0b1111); system_temp_loop_count_ = PushSystemTemp(0b1111); @@ -1089,7 +1089,7 @@ void DxbcShaderTranslator::CompleteShaderCode() { DxbcOpEndLoop(); // Release the following system temporary values so epilogue can reuse them: - // - system_temp_pv_. + // - system_temp_result_. // - system_temp_ps_pc_p0_a0_. // - system_temp_aL_. // - system_temp_loop_count_. @@ -1306,6 +1306,96 @@ void DxbcShaderTranslator::EmitInstructionDisassembly() { length_dwords * sizeof(uint32_t) - length - 1); } +DxbcShaderTranslator::DxbcSrc DxbcShaderTranslator::LoadOperand( + const InstructionOperand& operand, uint32_t needed_components, + bool& temp_pushed_out) { + temp_pushed_out = false; + + uint32_t first_needed_component; + if (!xe::bit_scan_forward(needed_components, &first_needed_component)) { + return DxbcSrc::LF(0.0f); + } + + DxbcIndex index(operand.storage_index); + switch (operand.storage_addressing_mode) { + case InstructionStorageAddressingMode::kStatic: + break; + case InstructionStorageAddressingMode::kAddressAbsolute: + index = DxbcIndex(system_temp_ps_pc_p0_a0_, 3, operand.storage_index); + break; + case InstructionStorageAddressingMode::kAddressRelative: + index = DxbcIndex(system_temp_aL_, 0, operand.storage_index); + break; + } + + DxbcSrc src(DxbcSrc::LF(0.0f)); + switch (operand.storage_source) { + case InstructionStorageSource::kRegister: { + if 
(uses_register_dynamic_addressing()) { + // Load x#[#] to r# because x#[#] can be used only with mov. + uint32_t temp = PushSystemTemp(); + temp_pushed_out = true; + uint32_t used_swizzle_components = 0; + for (uint32_t i = 0; i < uint32_t(operand.component_count); ++i) { + if (!(needed_components & (1 << i))) { + continue; + } + SwizzleSource component = operand.GetComponent(i); + assert_true(component >= SwizzleSource::kX && + component <= SwizzleSource::kW); + used_swizzle_components |= + 1 << (uint32_t(component) - uint32_t(SwizzleSource::kX)); + } + assert_not_zero(used_swizzle_components); + DxbcOpMov(DxbcDest::R(temp, used_swizzle_components), + DxbcSrc::X(0, index)); + src = DxbcSrc::R(temp); + } else { + assert_true(operand.storage_addressing_mode == + InstructionStorageAddressingMode::kStatic); + src = DxbcSrc::R(index.index_); + } + } break; + case InstructionStorageSource::kConstantFloat: { + if (cbuffer_index_float_constants_ == kCbufferIndexUnallocated) { + cbuffer_index_float_constants_ = cbuffer_count_++; + } + if (operand.storage_addressing_mode == + InstructionStorageAddressingMode::kStatic) { + uint32_t float_constant_index = + constant_register_map().GetPackedFloatConstantIndex( + operand.storage_index); + assert_true(float_constant_index != UINT32_MAX); + if (float_constant_index == UINT32_MAX) { + return DxbcSrc::LF(0.0f); + } + index.index_ = float_constant_index; + } else { + assert_true(constant_register_map().float_dynamic_addressing); + } + src = DxbcSrc::CB(cbuffer_index_float_constants_, + uint32_t(CbufferRegister::kFloatConstants), index); + } break; + default: + assert_unhandled_case(operand.storage_source); + return DxbcSrc::LF(0.0f); + } + + // Swizzle, skipping unneeded components similar to how FXC skips components, + // by replacing them with the leftmost used one. + uint32_t swizzle = 0; + for (uint32_t i = 0; i < 4; ++i) { + SwizzleSource component = operand.GetComponent( + (needed_components & (1 << i)) ? 
i : first_needed_component); + assert_true(component >= SwizzleSource::kX && + component <= SwizzleSource::kW); + swizzle |= (uint32_t(component) - uint32_t(SwizzleSource::kX)) << (i * 2); + } + src = src.Swizzle(swizzle); + + return src.WithModifiers(operand.is_absolute_value, operand.is_negated); +} + void DxbcShaderTranslator::LoadDxbcSourceOperand( const InstructionOperand& operand, DxbcSourceOperand& dxbc_operand) { // Initialize the values to their defaults. @@ -1693,306 +1783,151 @@ void DxbcShaderTranslator::UnloadDxbcSourceOperand( } void DxbcShaderTranslator::StoreResult(const InstructionResult& result, - uint32_t reg, bool replicate_x, + const DxbcSrc& src, bool can_store_memexport_address) { uint32_t used_write_mask = result.GetUsedWriteMask(); - if (result.storage_target == InstructionStorageTarget::kNone || - !result.GetUsedWriteMask()) { + if (!used_write_mask) { return; } - // Validate memexport writes (Halo 3 has some weird invalid ones). - if (result.storage_target == InstructionStorageTarget::kExportAddress) { - if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > kMaxMemExports || - system_temps_memexport_address_[memexport_alloc_current_count_ - 1] == - UINT32_MAX) { + // Get the destination address and type. + DxbcDest dest(DxbcDest::Null()); + bool is_clamped = result.is_clamped; + switch (result.storage_target) { + case InstructionStorageTarget::kNone: return; - } - } else if (result.storage_target == InstructionStorageTarget::kExportData) { - if (memexport_alloc_current_count_ == 0 || - memexport_alloc_current_count_ > kMaxMemExports || - system_temps_memexport_data_[memexport_alloc_current_count_ - 1] - [result.storage_index] == UINT32_MAX) { - return; - } - } - - uint32_t saturate_bit = - ENCODE_D3D10_SB_INSTRUCTION_SATURATE(result.is_clamped); - - // Scalar targets get only one component. - // TODO(Triang3l): It's not replicated, it's X specifically. 
- if (result.storage_target == InstructionStorageTarget::kDepth) { - assert_not_zero(used_write_mask & 0b0001); - SwizzleSource component = result.components[0]; - if (replicate_x && component <= SwizzleSource::kW) { - component = SwizzleSource::kX; - } - // Both r[imm32] and imm32 operands are 2 tokens long. - switch (result.storage_target) { - case InstructionStorageTarget::kDepth: - assert_true(writes_depth()); - if (writes_depth()) { - if (edram_rov_used_) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_INSTRUCTION_SATURATE(1) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_rov_depth_stencil_); - } else { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_INSTRUCTION_SATURATE(1) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(4)); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_OUTPUT_DEPTH, 0)); - } + case InstructionStorageTarget::kRegister: + if (uses_register_dynamic_addressing()) { + DxbcIndex register_index(result.storage_index); + switch (result.storage_addressing_mode) { + case InstructionStorageAddressingMode::kStatic: + break; + case InstructionStorageAddressingMode::kAddressAbsolute: + register_index = + DxbcIndex(system_temp_ps_pc_p0_a0_, 3, result.storage_index); + break; + case InstructionStorageAddressingMode::kAddressRelative: + register_index = + DxbcIndex(system_temp_aL_, 0, result.storage_index); + break; } - break; - default: - assert_unhandled_case(result.storage_target); + dest = DxbcDest::X(0, register_index); + } else { + assert_true(result.storage_addressing_mode == + InstructionStorageAddressingMode::kStatic); + dest = DxbcDest::R(result.storage_index); + } + break; + case InstructionStorageTarget::kInterpolator: + dest = DxbcDest::O(uint32_t(InOutRegister::kVSDSOutInterpolators) + 
+ result.storage_index); + break; + case InstructionStorageTarget::kPosition: + dest = DxbcDest::R(system_temp_position_); + break; + case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex: + assert_zero(used_write_mask & 0b1000); + dest = DxbcDest::R(system_temp_point_size_edge_flag_kill_vertex_); + break; + case InstructionStorageTarget::kExportAddress: + // Validate memexport writes (Halo 3 has some weird invalid ones). + if (!can_store_memexport_address || memexport_alloc_current_count_ == 0 || + memexport_alloc_current_count_ > kMaxMemExports || + system_temps_memexport_address_[memexport_alloc_current_count_ - 1] == + UINT32_MAX) { return; - } - if (component <= SwizzleSource::kW) { - shader_code_.push_back(EncodeVectorSelectOperand( - D3D10_SB_OPERAND_TYPE_TEMP, uint32_t(component), 1)); - shader_code_.push_back(reg); - } else { - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(component == SwizzleSource::k1 ? 0x3F800000 : 0); - } - ++stat_.instruction_count; - ++stat_.mov_instruction_count; + } + dest = DxbcDest::R( + system_temps_memexport_address_[memexport_alloc_current_count_ - 1]); + break; + case InstructionStorageTarget::kExportData: { + // Validate memexport writes (Halo 3 has some weird invalid ones). + if (memexport_alloc_current_count_ == 0 || + memexport_alloc_current_count_ > kMaxMemExports || + system_temps_memexport_data_[memexport_alloc_current_count_ - 1] + [result.storage_index] == UINT32_MAX) { + return; + } + dest = DxbcDest::R( + system_temps_memexport_data_[memexport_alloc_current_count_ - 1] + [result.storage_index]); + // Mark that the eM# has been written to and needs to be exported. 
+ assert_not_zero(used_write_mask); + uint32_t memexport_index = memexport_alloc_current_count_ - 1; + DxbcOpOr(DxbcDest::R(system_temp_memexport_written_, + 1 << (memexport_index >> 2)), + DxbcSrc::R(system_temp_memexport_written_) + .Select(memexport_index >> 2), + DxbcSrc::LU(uint32_t(1) << (result.storage_index + + ((memexport_index & 3) << 3)))); + } break; + case InstructionStorageTarget::kColor: + assert_not_zero(used_write_mask); + assert_true(writes_color_target(result.storage_index)); + dest = DxbcDest::R(system_temps_color_[result.storage_index]); + if (edram_rov_used_) { + // For ROV output, mark that the color has been written to. + // According to: + // https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color + // if a color target hasn't been written to - including due to flow + // control - the render target must not be modified (the unwritten + // components of a written target are undefined, not sure if this + // behavior is respected on the real GPU, but the ROV code currently + // doesn't preserve unmodified components). + DxbcOpOr(DxbcDest::R(system_temp_rov_params_, 0b0001), + DxbcSrc::R(system_temp_rov_params_, DxbcSrc::kXXXX), + DxbcSrc::LU(uint32_t(1) << (8 + result.storage_index))); + } + break; + case InstructionStorageTarget::kDepth: + // Writes X to scalar oDepth or to X of system_temp_rov_depth_stencil_, no + // additional swizzling needed. + assert_true(used_write_mask == 0b0001); + assert_true(writes_depth()); + if (edram_rov_used_) { + dest = DxbcDest::R(system_temp_rov_depth_stencil_); + } else { + dest = DxbcDest::ODepth(); + } + // Depth outside [0, 1] is not safe for use with the ROV code. Though 20e4 + // float depth can store values below 2, it's a very unusual case. + // Direct3D 10+ SV_Depth, however, can accept any values, including + // specials, when the depth buffer is floating-point. 
+ is_clamped = true; + break; + } + if (dest.type_ == DxbcOperandType::kNull) { return; } - // Get the write masks and data required for loading of both the swizzled part - // and the constant (zero/one) part. The write mask is treated also as a read - // mask in DXBC, and `mov r0.zw, r1.xyzw` actually means r0.zw = r1.zw, not - // r0.zw = r1.xy. - uint32_t swizzle_mask = 0; - uint32_t swizzle_components = 0; - uint32_t constant_mask = 0; - uint32_t constant_values = 0; + // Write. + uint32_t src_additional_swizzle = 0; + uint32_t constant_mask = 0, constant_1_mask = 0; for (uint32_t i = 0; i < 4; ++i) { if (!(used_write_mask & (1 << i))) { continue; } SwizzleSource component = result.components[i]; - if (component <= SwizzleSource::kW) { - swizzle_mask |= 1 << i; - // If replicating X, just keep zero swizzle (XXXX). - if (!replicate_x) { - swizzle_components |= uint32_t(component) << (i * 2); - } + if (component >= SwizzleSource::kX && component <= SwizzleSource::kW) { + src_additional_swizzle |= + (uint32_t(component) - uint32_t(SwizzleSource::kX)) << (i * 2); } else { constant_mask |= 1 << i; - constant_values |= (component == SwizzleSource::k1 ? 1 : 0) << i; - } - } - - bool is_static = result.storage_addressing_mode == - InstructionStorageAddressingMode::kStatic; - // If the index is dynamic, choose where it's taken from. - uint32_t dynamic_address_register, dynamic_address_component; - if (result.storage_addressing_mode == - InstructionStorageAddressingMode::kAddressRelative) { - // Addressed by aL.x. - dynamic_address_register = system_temp_aL_; - dynamic_address_component = 0; - } else { - // Addressed by a0. - dynamic_address_register = system_temp_ps_pc_p0_a0_; - dynamic_address_component = 3; - } - - // Store both parts of the write (i == 0 - swizzled, i == 1 - constant). - for (uint32_t i = 0; i < 2; ++i) { - uint32_t mask = i == 0 ? 
swizzle_mask : constant_mask; - if (mask == 0) { - continue; - } - - // r# for the swizzled part, 4-component imm32 for the constant part. - uint32_t source_length = i != 0 ? 5 : 2; - switch (result.storage_target) { - case InstructionStorageTarget::kRegister: - if (uses_register_dynamic_addressing()) { - ++stat_.instruction_count; - ++stat_.array_instruction_count; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH((is_static ? 4 : 6) + - source_length) | - saturate_bit); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D10_SB_OPERAND_TYPE_INDEXABLE_TEMP, mask, 2, - D3D10_SB_OPERAND_INDEX_IMMEDIATE32, - is_static ? D3D10_SB_OPERAND_INDEX_IMMEDIATE32 - : D3D10_SB_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE)); - shader_code_.push_back(0); - shader_code_.push_back(result.storage_index); - if (!is_static) { - shader_code_.push_back(EncodeVectorSelectOperand( - D3D10_SB_OPERAND_TYPE_TEMP, dynamic_address_component, 1)); - shader_code_.push_back(dynamic_address_register); - } - } else { - assert_true(is_static); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) | - saturate_bit); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back(result.storage_index); - } - break; - - case InstructionStorageTarget::kInterpolator: - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) | - saturate_bit); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_OUTPUT, mask, 1)); - shader_code_.push_back(uint32_t(InOutRegister::kVSDSOutInterpolators) + - uint32_t(result.storage_index)); - break; - - case 
InstructionStorageTarget::kPosition: - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) | - saturate_bit); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back(system_temp_position_); - break; - - case InstructionStorageTarget::kPointSizeEdgeFlagKillVertex: - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) | - saturate_bit); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back(system_temp_point_size_edge_flag_kill_vertex_); - break; - - case InstructionStorageTarget::kExportAddress: - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) | - saturate_bit); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back( - system_temps_memexport_address_[memexport_alloc_current_count_ - - 1]); - break; - - case InstructionStorageTarget::kExportData: - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) | - saturate_bit); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back( - system_temps_memexport_data_[memexport_alloc_current_count_ - 1] - [uint32_t(result.storage_index)]); - break; - - case InstructionStorageTarget::kColor: - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - shader_code_.push_back( - 
ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + source_length) | - saturate_bit); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, mask, 1)); - shader_code_.push_back(system_temps_color_[result.storage_index]); - break; - - default: - continue; - } - - if (i == 0) { - // Copy from the source r#. - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, swizzle_components, 1)); - shader_code_.push_back(reg); - } else { - // Load constants. - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - for (uint32_t j = 0; j < 4; ++j) { - shader_code_.push_back((constant_values & (1 << j)) ? 0x3F800000 : 0); + if (component == SwizzleSource::k1) { + constant_1_mask |= 1 << i; } } } - - if (result.storage_target == InstructionStorageTarget::kExportData) { - // Mark that the eM# has been written to and needs to be exported. - uint32_t memexport_index = memexport_alloc_current_count_ - 1; - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 1 << (memexport_index >> 2), 1)); - shader_code_.push_back(system_temp_memexport_written_); - shader_code_.push_back(EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, - memexport_index >> 2, 1)); - shader_code_.push_back(system_temp_memexport_written_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back( - uint32_t(1) << (result.storage_index + ((memexport_index & 3) << 3))); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; + if (used_write_mask != constant_mask) { + DxbcOpMov(dest.Mask(used_write_mask & ~constant_mask), + src.SwizzleSwizzled(src_additional_swizzle), is_clamped); } - - if (edram_rov_used_ && - result.storage_target == 
InstructionStorageTarget::kColor) { - // For ROV output, mark that the color has been written to. - // According to: - // https://docs.microsoft.com/en-us/windows/desktop/direct3dhlsl/dx9-graphics-reference-asm-ps-registers-output-color - // if a color target has been written to - including due to flow control - - // the render target must not be modified (the unwritten components of a - // written target are undefined, not sure if this behavior is respected on - // the real GPU, but the ROV code currently uses pre-packed masks to keep - // the old values, so preservation of components is not done). - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_rov_params_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_rov_params_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(1 << (8 + result.storage_index)); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; + if (constant_mask) { + DxbcOpMov(dest.Mask(constant_mask), + DxbcSrc::LF(float(constant_1_mask & 1), + float((constant_1_mask >> 1) & 1), + float((constant_1_mask >> 2) & 1), + float((constant_1_mask >> 3) & 1))); } } @@ -2192,8 +2127,8 @@ void DxbcShaderTranslator::ProcessLoopStartInstruction( EmitInstructionDisassembly(); } - // Count (as uint) in bits 0:7 of the loop constant, initial aL in 8:15. - // Starting from vector 2 because of bool constants. + // Count (unsigned) in bits 0:7 of the loop constant, initial aL (unsigned) in + // 8:15. Starting from vector 2 because of bool constants. 
if (cbuffer_index_bool_loop_constants_ == kCbufferIndexUnallocated) { cbuffer_index_bool_loop_constants_ = cbuffer_count_++; } @@ -2280,12 +2215,12 @@ void DxbcShaderTranslator::ProcessLoopEndInstruction( { // Continue case. uint32_t aL_add_temp = PushSystemTemp(); - // Extract the value to add to aL (in bits 16:23 of the loop constant). - // Starting from vector 2 because of bool constants. + // Extract the value to add to aL (signed, in bits 16:23 of the loop + // constant). Starting from vector 2 because of bool constants. if (cbuffer_index_bool_loop_constants_ == kCbufferIndexUnallocated) { cbuffer_index_bool_loop_constants_ = cbuffer_count_++; } - DxbcOpUBFE(DxbcDest::R(aL_add_temp, 0b0001), DxbcSrc::LU(8), + DxbcOpIBFE(DxbcDest::R(aL_add_temp, 0b0001), DxbcSrc::LU(8), DxbcSrc::LU(16), DxbcSrc::CB(cbuffer_index_bool_loop_constants_, uint32_t(CbufferRegister::kBoolLoopConstants), diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 3fff2c561..6c3c06670 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -764,7 +764,7 @@ class DxbcShaderTranslator : public ShaderTranslator { if (index_dimension > 1) { operand_token |= uint32_t(index_2d_.GetRepresentation()) << 25; if (index_dimension > 2) { - operand_token |= uint32_t(index_2d_.GetRepresentation()) << 28; + operand_token |= uint32_t(index_3d_.GetRepresentation()) << 28; } } } @@ -1084,12 +1084,15 @@ class DxbcShaderTranslator : public ShaderTranslator { kDefault = 10, kDiscard = 13, kDiv = 14, + kDP2 = 15, + kDP3 = 16, kDP4 = 17, kElse = 18, kEndIf = 21, kEndLoop = 22, kEndSwitch = 23, kEq = 24, + kFrc = 26, kFToI = 27, kFToU = 28, kGE = 29, @@ -1118,6 +1121,7 @@ class DxbcShaderTranslator : public ShaderTranslator { kRet = 62, kRetC = 63, kRoundNE = 64, + kRoundNI = 65, kRoundZ = 67, kSwitch = 76, kULT = 79, @@ -1291,6 +1295,32 @@ class DxbcShaderTranslator : public ShaderTranslator { DxbcEmitAluOp(DxbcOpcode::kDiv, 
0b00, dest, src0, src1, saturate); ++stat_.float_instruction_count; } + void DxbcOpDP2(const DxbcDest& dest, const DxbcSrc& src0, const DxbcSrc& src1, + bool saturate = false) { + uint32_t operands_length = + dest.GetLength() + src0.GetLength(0b0011) + src1.GetLength(0b0011); + shader_code_.reserve(shader_code_.size() + 1 + operands_length); + shader_code_.push_back( + DxbcOpcodeToken(DxbcOpcode::kDP2, operands_length, saturate)); + dest.Write(shader_code_); + src0.Write(shader_code_, false, 0b0011); + src1.Write(shader_code_, false, 0b0011); + ++stat_.instruction_count; + ++stat_.float_instruction_count; + } + void DxbcOpDP3(const DxbcDest& dest, const DxbcSrc& src0, const DxbcSrc& src1, + bool saturate = false) { + uint32_t operands_length = + dest.GetLength() + src0.GetLength(0b0111) + src1.GetLength(0b0111); + shader_code_.reserve(shader_code_.size() + 1 + operands_length); + shader_code_.push_back( + DxbcOpcodeToken(DxbcOpcode::kDP3, operands_length, saturate)); + dest.Write(shader_code_); + src0.Write(shader_code_, false, 0b0111); + src1.Write(shader_code_, false, 0b0111); + ++stat_.instruction_count; + ++stat_.float_instruction_count; + } void DxbcOpDP4(const DxbcDest& dest, const DxbcSrc& src0, const DxbcSrc& src1, bool saturate = false) { uint32_t operands_length = @@ -1325,6 +1355,11 @@ class DxbcShaderTranslator : public ShaderTranslator { DxbcEmitAluOp(DxbcOpcode::kEq, 0b00, dest, src0, src1); ++stat_.float_instruction_count; } + void DxbcOpFrc(const DxbcDest& dest, const DxbcSrc& src, + bool saturate = false) { + DxbcEmitAluOp(DxbcOpcode::kFrc, 0b0, dest, src, saturate); + ++stat_.float_instruction_count; + } void DxbcOpFToI(const DxbcDest& dest, const DxbcSrc& src) { DxbcEmitAluOp(DxbcOpcode::kFToI, 0b0, dest, src); ++stat_.conversion_instruction_count; @@ -1471,6 +1506,11 @@ class DxbcShaderTranslator : public ShaderTranslator { DxbcEmitAluOp(DxbcOpcode::kRoundNE, 0b0, dest, src, saturate); ++stat_.float_instruction_count; } + void 
DxbcOpRoundNI(const DxbcDest& dest, const DxbcSrc& src, + bool saturate = false) { + DxbcEmitAluOp(DxbcOpcode::kRoundNI, 0b0, dest, src, saturate); + ++stat_.float_instruction_count; + } void DxbcOpRoundZ(const DxbcDest& dest, const DxbcSrc& src, bool saturate = false) { DxbcEmitAluOp(DxbcOpcode::kRoundZ, 0b0, dest, src, saturate); @@ -2027,6 +2067,14 @@ class DxbcShaderTranslator : public ShaderTranslator { // as shader messages, from instruction_disassembly_buffer_. void EmitInstructionDisassembly(); + // Converts a shader translator source operand to a DXBC emitter operand, or + // returns a zero literal operand if it's not going to be referenced. This may + // allocate a temporary register and emit instructions if the operand can't be + // used directly with most DXBC instructions (for example, an indexable GPR). + // In that case, temp_pushed_out is set to true, and PopSystemTemp must be + // called once the operand is no longer needed. + DxbcSrc LoadOperand(const InstructionOperand& operand, + uint32_t needed_components, bool& temp_pushed_out); // Abstract 4-component vector source operand. // TODO(Triang3l): Remove after fully moving to the new emitter. struct DxbcSourceOperand { @@ -2085,11 +2133,12 @@ class DxbcShaderTranslator : public ShaderTranslator { // TODO(Triang3l): Remove after fully moving to the new emitter. void UnloadDxbcSourceOperand(const DxbcSourceOperand& operand); - // Writes xyzw or xxxx of the specified r# to the destination. - // can_store_memexport_address is for safety, to allow only proper MADs with - // a stream constant to write to eA. - void StoreResult(const InstructionResult& result, uint32_t reg, - bool replicate_x, bool can_store_memexport_address = false); + // Writes the specified source (src must be usable as a vector `mov` source, + // including to x#) to an instruction storage target. + // can_store_memexport_address is for safety, to allow only proper MADs with a + // stream constant to write to eA.
+ void StoreResult(const InstructionResult& result, const DxbcSrc& src, + bool can_store_memexport_address = false); // The nesting of `if` instructions is the following: // - pc checks (labels). @@ -2150,12 +2199,12 @@ class DxbcShaderTranslator : public ShaderTranslator { TextureFilter min_filter, TextureFilter mip_filter, AnisoFilter aniso_filter); - // Converts (S, T, face index) in the specified temporary register to a 3D - // cubemap coordinate. - void ArrayCoordToCubeDirection(uint32_t reg); + // Converts (array S + 1, array T + 1, face index) in the specified temporary + // register to a 3D cubemap coordinate. + void TfetchCubeCoordToCubeDirection(uint32_t reg); - bool ProcessVectorAluOperation(const ParsedAluInstruction& instr, - bool& replicate_result_x, + void ProcessVectorAluOperation(const ParsedAluInstruction& instr, + uint32_t& result_swizzle, bool& predicate_written); bool ProcessScalarAluOperation(const ParsedAluInstruction& instr, bool& predicate_written); @@ -2334,9 +2383,9 @@ class DxbcShaderTranslator : public ShaderTranslator { // eM# in each `alloc export`, or UINT32_MAX if not used. uint32_t system_temps_memexport_data_[kMaxMemExports][5]; - // Vector ALU result or fetch scratch (since Xenos write masks can contain + // Vector ALU or fetch result/scratch (since Xenos write masks can contain // swizzles). - uint32_t system_temp_pv_; + uint32_t system_temp_result_; // Temporary register ID for previous scalar result, program counter, // predicate and absolute address register. 
uint32_t system_temp_ps_pc_p0_a0_; diff --git a/src/xenia/gpu/dxbc_shader_translator_alu.cc b/src/xenia/gpu/dxbc_shader_translator_alu.cc index 6b253dd2e..c7380a3c0 100644 --- a/src/xenia/gpu/dxbc_shader_translator_alu.cc +++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc @@ -12,22 +12,26 @@ #include "third_party/dxbc/d3d12TokenizedProgramFormat.hpp" #include "xenia/base/assert.h" +#include "xenia/base/math.h" namespace xe { namespace gpu { using namespace ucode; -bool DxbcShaderTranslator::ProcessVectorAluOperation( - const ParsedAluInstruction& instr, bool& replicate_result_x, +void DxbcShaderTranslator::ProcessVectorAluOperation( + const ParsedAluInstruction& instr, uint32_t& result_swizzle, bool& predicate_written) { - replicate_result_x = false; + result_swizzle = DxbcSrc::kXYZW; predicate_written = false; - if (!instr.vector_and_constant_result.GetUsedWriteMask() && + uint32_t used_result_components = + instr.vector_and_constant_result.GetUsedResultComponents(); + if (!used_result_components && !AluVectorOpHasSideEffects(instr.vector_opcode)) { - return false; + return; } + // Load operands. // A small shortcut, operands of cube are the same, but swizzled. uint32_t operand_count; if (instr.vector_opcode == AluVectorOpcode::kCube) { @@ -35,1265 +39,627 @@ bool DxbcShaderTranslator::ProcessVectorAluOperation( } else { operand_count = instr.vector_operand_count; } - DxbcSourceOperand dxbc_operands[3]; - // Whether the operand is the same as any previous operand, and thus is loaded - // only once. 
- bool operands_duplicate[3] = {}; - uint32_t operand_length_sums[3]; + uint32_t operand_needed_components[3]; for (uint32_t i = 0; i < operand_count; ++i) { - const InstructionOperand& operand = instr.vector_operands[i]; - for (uint32_t j = 0; j < i; ++j) { - if (operand.GetIdenticalComponents(instr.vector_operands[j]) == 0b1111) { - operands_duplicate[i] = true; - dxbc_operands[i] = dxbc_operands[j]; - break; - } - } - if (!operands_duplicate[i]) { - LoadDxbcSourceOperand(operand, dxbc_operands[i]); - } - operand_length_sums[i] = DxbcSourceOperandLength(dxbc_operands[i]); - if (i != 0) { - operand_length_sums[i] += operand_length_sums[i - 1]; - } + operand_needed_components[i] = ucode::GetAluVectorOpNeededSourceComponents( + instr.vector_opcode, i + 1, used_result_components); } + // .zzxy - don't need the first component. + if (instr.vector_opcode == AluVectorOpcode::kCube) { + operand_needed_components[0] &= 0b1101; + } + DxbcSrc operands[3]{DxbcSrc::LF(0.0f), DxbcSrc::LF(0.0f), DxbcSrc::LF(0.0f)}; + uint32_t operand_temps = 0; + for (uint32_t i = 0; i < operand_count; ++i) { + bool operand_temp_pushed = false; + operands[i] = + LoadOperand(instr.vector_operands[i], operand_needed_components[i], + operand_temp_pushed); + operand_temps += uint32_t(operand_temp_pushed); + } + // Don't return without PopSystemTemp(operand_temps) from now on! - // So the same code can be used for instructions with the same format. 
- static const uint32_t kCoreOpcodes[] = { - D3D10_SB_OPCODE_ADD, - D3D10_SB_OPCODE_MUL, - D3D10_SB_OPCODE_MAX, - D3D10_SB_OPCODE_MIN, - D3D10_SB_OPCODE_EQ, - D3D10_SB_OPCODE_LT, - D3D10_SB_OPCODE_GE, - D3D10_SB_OPCODE_NE, - D3D10_SB_OPCODE_FRC, - D3D10_SB_OPCODE_ROUND_Z, - D3D10_SB_OPCODE_ROUND_NI, - D3D10_SB_OPCODE_MAD, - D3D10_SB_OPCODE_EQ, - D3D10_SB_OPCODE_GE, - D3D10_SB_OPCODE_LT, - D3D10_SB_OPCODE_DP4, - D3D10_SB_OPCODE_DP3, - D3D10_SB_OPCODE_DP2, - 0, - 0, - D3D10_SB_OPCODE_EQ, - D3D10_SB_OPCODE_NE, - D3D10_SB_OPCODE_LT, - D3D10_SB_OPCODE_GE, - D3D10_SB_OPCODE_EQ, - D3D10_SB_OPCODE_LT, - D3D10_SB_OPCODE_GE, - D3D10_SB_OPCODE_NE, - 0, - D3D10_SB_OPCODE_MAX, - }; - - bool translated = true; + DxbcDest per_component_dest( + DxbcDest::R(system_temp_result_, used_result_components)); switch (instr.vector_opcode) { case AluVectorOpcode::kAdd: - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ADD) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[1])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - UseDxbcSourceOperand(dxbc_operands[1]); - ++stat_.instruction_count; - ++stat_.float_instruction_count; + DxbcOpAdd(per_component_dest, operands[0], operands[1]); break; - - case AluVectorOpcode::kMul: { - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MUL) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[1])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - UseDxbcSourceOperand(dxbc_operands[1]); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - if (instr.vector_operands[0].GetAbsoluteIdenticalComponents( - instr.vector_operands[1]) != 0b1111) { - // Reproduce Shader Model 3 multiplication behavior 
(0 * anything = 0), - // flushing denormals (must be done using eq - doing bitwise comparison - // doesn't flush denormals). - // With Shader Model 4 behavior, Halo 3 has a significant portion of the - // image missing because rcp(0) is multiplied by 0, which results in NaN - // rather than 0. - uint32_t is_subnormal_temp = PushSystemTemp(); - // Get the non-NaN multiplicand closer to zero to check if any of them - // is zero. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MIN) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + DxbcSourceOperandLength(dxbc_operands[0], false, true) + - DxbcSourceOperandLength(dxbc_operands[1], false, true))); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(is_subnormal_temp); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, 4, false, true); - UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 4, false, true); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Check if any multiplicand is zero (min isn't required to flush - // denormals in the result). - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_EQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(is_subnormal_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(is_subnormal_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Zero the result if any multiplicand is zero. 
- shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(is_subnormal_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - // Release is_subnormal_temp. + case AluVectorOpcode::kMul: + case AluVectorOpcode::kMad: { + bool is_mad = instr.vector_opcode == AluVectorOpcode::kMad; + if (is_mad) { + DxbcOpMAd(per_component_dest, operands[0], operands[1], operands[2]); + } else { + DxbcOpMul(per_component_dest, operands[0], operands[1]); + } + // Shader Model 3: 0 or denormal * anything = 0. + // FIXME(Triang3l): Signed zero needs research and handling. + uint32_t absolute_different = + used_result_components & + ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]); + if (absolute_different) { + uint32_t is_zero_temp = PushSystemTemp(); + DxbcOpMin(DxbcDest::R(is_zero_temp, absolute_different), + operands[0].Abs(), operands[1].Abs()); + // min isn't required to flush denormals, eq is. + DxbcOpEq(DxbcDest::R(is_zero_temp, absolute_different), + DxbcSrc::R(is_zero_temp), DxbcSrc::LF(0.0f)); + DxbcOpMovC(DxbcDest::R(system_temp_result_, absolute_different), + DxbcSrc::R(is_zero_temp), + is_mad ? operands[2] : DxbcSrc::LF(0.0f), + DxbcSrc::R(system_temp_result_)); + // Release is_zero_temp. 
PopSystemTemp(); } } break; case AluVectorOpcode::kMax: - case AluVectorOpcode::kMin: + case AluVectorOpcode::kMin: { // max is commonly used as mov. - if (operands_duplicate[1]) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - } else { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE( - kCoreOpcodes[uint32_t(instr.vector_opcode)]) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[1])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - UseDxbcSourceOperand(dxbc_operands[1]); - ++stat_.instruction_count; - ++stat_.float_instruction_count; + uint32_t identical = instr.vector_operands[0].GetIdenticalComponents( + instr.vector_operands[1]) & + used_result_components; + uint32_t different = used_result_components & ~identical; + if (different) { + // Shader Model 3 NaN behavior (a op b ? a : b, not fmax/fmin). 
+ if (instr.vector_opcode == AluVectorOpcode::kMin) { + DxbcOpLT(DxbcDest::R(system_temp_result_, different), operands[0], + operands[1]); + } else { + DxbcOpGE(DxbcDest::R(system_temp_result_, different), operands[0], + operands[1]); + } + DxbcOpMovC(DxbcDest::R(system_temp_result_, different), + DxbcSrc::R(system_temp_result_), operands[0], operands[1]); } - break; - - case AluVectorOpcode::kSeq: - case AluVectorOpcode::kSgt: - case AluVectorOpcode::kSge: - case AluVectorOpcode::kSne: - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE( - kCoreOpcodes[uint32_t(instr.vector_opcode)]) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[1])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - if (instr.vector_opcode == AluVectorOpcode::kSgt) { - // lt in DXBC, not gt. - UseDxbcSourceOperand(dxbc_operands[1]); - UseDxbcSourceOperand(dxbc_operands[0]); - } else { - UseDxbcSourceOperand(dxbc_operands[0]); - UseDxbcSourceOperand(dxbc_operands[1]); - } - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Convert 0xFFFFFFFF to 1.0f. 
- shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0x3F800000); - shader_code_.push_back(0x3F800000); - shader_code_.push_back(0x3F800000); - shader_code_.push_back(0x3F800000); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - break; - - case AluVectorOpcode::kFrc: - case AluVectorOpcode::kTrunc: - case AluVectorOpcode::kFloor: - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE( - kCoreOpcodes[uint32_t(instr.vector_opcode)]) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - break; - - case AluVectorOpcode::kMad: { - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MAD) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[2])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - UseDxbcSourceOperand(dxbc_operands[1]); - UseDxbcSourceOperand(dxbc_operands[2]); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - if (instr.vector_operands[0].GetAbsoluteIdenticalComponents( - instr.vector_operands[1]) != 0b1111) { - // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0). 
- // If any operand is zero or denormalized, just leave the addition part. - uint32_t is_subnormal_temp = PushSystemTemp(); - // Get the non-NaN multiplicand closer to zero to check if any of them - // is zero. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MIN) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + DxbcSourceOperandLength(dxbc_operands[0], false, true) + - DxbcSourceOperandLength(dxbc_operands[1], false, true))); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(is_subnormal_temp); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, 4, false, true); - UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 4, false, true); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Check if any multiplicand is zero (min isn't required to flush - // denormals in the result). - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_EQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(is_subnormal_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(is_subnormal_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Zero the multiplication part if any multiplicand is zero. 
- shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 7 + DxbcSourceOperandLength(dxbc_operands[2]))); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(is_subnormal_temp); - UseDxbcSourceOperand(dxbc_operands[2]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - // Release is_subnormal_temp. - PopSystemTemp(); + if (identical) { + DxbcOpMov(DxbcDest::R(system_temp_result_, identical), operands[0]); } } break; - // Using true eq to compare with zero because it handles denormals and -0. + case AluVectorOpcode::kSeq: + DxbcOpEq(per_component_dest, operands[0], operands[1]); + DxbcOpAnd(per_component_dest, DxbcSrc::R(system_temp_result_), + DxbcSrc::LF(1.0f)); + break; + case AluVectorOpcode::kSgt: + DxbcOpLT(per_component_dest, operands[1], operands[0]); + DxbcOpAnd(per_component_dest, DxbcSrc::R(system_temp_result_), + DxbcSrc::LF(1.0f)); + break; + case AluVectorOpcode::kSge: + DxbcOpGE(per_component_dest, operands[0], operands[1]); + DxbcOpAnd(per_component_dest, DxbcSrc::R(system_temp_result_), + DxbcSrc::LF(1.0f)); + break; + case AluVectorOpcode::kSne: + DxbcOpNE(per_component_dest, operands[0], operands[1]); + DxbcOpAnd(per_component_dest, DxbcSrc::R(system_temp_result_), + DxbcSrc::LF(1.0f)); + break; + + case AluVectorOpcode::kFrc: + DxbcOpFrc(per_component_dest, operands[0]); + break; + case AluVectorOpcode::kTrunc: + DxbcOpRoundZ(per_component_dest, operands[0]); + break; + case AluVectorOpcode::kFloor: + DxbcOpRoundNI(per_component_dest, operands[0]); + break; + case AluVectorOpcode::kCndEq: + 
DxbcOpEq(per_component_dest, operands[0], DxbcSrc::LF(0.0f)); + DxbcOpMovC(per_component_dest, DxbcSrc::R(system_temp_result_), + operands[1], operands[2]); + break; case AluVectorOpcode::kCndGe: + DxbcOpGE(per_component_dest, operands[0], DxbcSrc::LF(0.0f)); + DxbcOpMovC(per_component_dest, DxbcSrc::R(system_temp_result_), + operands[1], operands[2]); + break; case AluVectorOpcode::kCndGt: - // dest = src0 op 0.0 ? src1 : src2 - // Compare src0 to zero. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE( - kCoreOpcodes[uint32_t(instr.vector_opcode)]) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 8 + operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - if (instr.vector_opcode != AluVectorOpcode::kCndGt) { - // lt in DXBC, not gt. - UseDxbcSourceOperand(dxbc_operands[0]); - } - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - if (instr.vector_opcode == AluVectorOpcode::kCndGt) { - UseDxbcSourceOperand(dxbc_operands[0]); - } - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Select src1 or src2. 
- shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 5 + operand_length_sums[2] - operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[1]); - UseDxbcSourceOperand(dxbc_operands[2]); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; + DxbcOpLT(per_component_dest, DxbcSrc::LF(0.0f), operands[0]); + DxbcOpMovC(per_component_dest, DxbcSrc::R(system_temp_result_), + operands[1], operands[2]); break; case AluVectorOpcode::kDp4: case AluVectorOpcode::kDp3: case AluVectorOpcode::kDp2Add: { - if (instr.vector_operands[0].GetAbsoluteIdenticalComponents( - instr.vector_operands[1]) != 0b1111) { - // The operands are the same when calculating vector length, no need to - // emulate 0 * anything = 0 in this case. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE( - kCoreOpcodes[uint32_t(instr.vector_opcode)]) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[1])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - UseDxbcSourceOperand(dxbc_operands[1]); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - } else { - uint32_t operand_mask; - if (instr.vector_opcode == AluVectorOpcode::kDp2Add) { - operand_mask = 0b0011; - } else if (instr.vector_opcode == AluVectorOpcode::kDp3) { - operand_mask = 0b0111; - } else { - operand_mask = 0b1111; - } - // Load the operands into pv and a temp register, zeroing if the other - // operand is zero or denormalized, reproducing the Shader Model 3 - // multiplication behavior (0 * anything = 0). 
- uint32_t src1_temp = PushSystemTemp(); - // Load the first operand into pv. - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_EQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 8 + DxbcSourceOperandLength(dxbc_operands[1]))); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D10_SB_OPERAND_TYPE_TEMP, operand_mask, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[1]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 10 + operand_length_sums[0])); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D10_SB_OPERAND_TYPE_TEMP, operand_mask, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - UseDxbcSourceOperand(dxbc_operands[0]); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - // Load the second operand into src1_temp. 
- shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_EQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 8 + operand_length_sums[0])); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D10_SB_OPERAND_TYPE_TEMP, operand_mask, 1)); - shader_code_.push_back(src1_temp); - UseDxbcSourceOperand(dxbc_operands[0]); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 10 + DxbcSourceOperandLength(dxbc_operands[1]))); - shader_code_.push_back(EncodeVectorMaskedOperand( - D3D10_SB_OPERAND_TYPE_TEMP, operand_mask, 1)); - shader_code_.push_back(src1_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(src1_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - UseDxbcSourceOperand(dxbc_operands[1]); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - // Calculate the dot product. 
- shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE( - kCoreOpcodes[uint32_t(instr.vector_opcode)]) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(src1_temp); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Release src1_temp. - PopSystemTemp(); - } - // Add src2.x for dp2add. + uint32_t component_count; if (instr.vector_opcode == AluVectorOpcode::kDp2Add) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ADD) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 5 + DxbcSourceOperandLength(dxbc_operands[2]))); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[2], kSwizzleXXXX); - ++stat_.instruction_count; - ++stat_.float_instruction_count; + component_count = 2; + } else if (instr.vector_opcode == AluVectorOpcode::kDp3) { + component_count = 3; + } else { + component_count = 4; + } + result_swizzle = DxbcSrc::kXXXX; + uint32_t absolute_different = + uint32_t((1 << component_count) - 1) & + ~instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]); + if (absolute_different) { + // Shader Model 3: 0 or denormal * anything = 0. + // FIXME(Triang3l): Signed zero needs research and handling. + // Add component products only if non-zero. 
For dp4, 16 scalar + // operations in the worst case (as opposed to always 20 for + // eq/movc/eq/movc/dp4 or min/eq/movc/movc/dp4 for preparing operands + // for dp4). + DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0001), + operands[0].SelectFromSwizzled(0), + operands[1].SelectFromSwizzled(0)); + if (absolute_different & 0b0001) { + DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0010), + operands[0].SelectFromSwizzled(0).Abs(), + operands[1].SelectFromSwizzled(0).Abs()); + DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0010), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + DxbcSrc::LF(0.0f)); + DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + DxbcSrc::LF(0.0f), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + } + for (uint32_t i = 1; i < component_count; ++i) { + bool component_different = (absolute_different & (1 << i)) != 0; + DxbcOpMAd(DxbcDest::R(system_temp_result_, + component_different ? 0b0010 : 0b0001), + operands[0].SelectFromSwizzled(i), + operands[1].SelectFromSwizzled(i), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + if (component_different) { + DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), + operands[0].SelectFromSwizzled(i).Abs(), + operands[1].SelectFromSwizzled(i).Abs()); + DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), + DxbcSrc::LF(0.0f)); + DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); + } + } + } else { + if (component_count == 2) { + DxbcOpDP2(DxbcDest::R(system_temp_result_, 0b0001), operands[0], + operands[1]); + } else if (component_count == 3) { + DxbcOpDP3(DxbcDest::R(system_temp_result_, 0b0001), operands[0], + operands[1]); + } else { + assert_true(component_count == 4); + DxbcOpDP4(DxbcDest::R(system_temp_result_, 0b0001), operands[0], + 
operands[1]); + } + } + if (component_count == 2) { + // Add the third operand. Since floating-point addition isn't + // associative, even though adding this in multiply-add for the first + // component would be faster, it's safer to add here, in the end. + DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + operands[2].SelectFromSwizzled(0)); } - break; - } - - case AluVectorOpcode::kCube: { - // 3D cubemap direction -> (T, S, 2.0 * major axis, face ID). - // src0 is the direction swizzled as .zzxy, src1 is the same direction as - // .yxzz, but we don't need it. - // - // If the major axis is X (X >= Y && X >= Z): - // * T is -Y. - // * S is -Z for positive X, +Z for negative X. - // * Face is 0 for positive X, 1 for negative X. - // Otherwise, if the major axis is Y (Y >= Z): - // * T is +Z for positive Y, -Z for negative Y. - // * S is +X. - // * Face is 2 for positive Y, 3 for negative Y. - // Otherwise, if the major axis is Z: - // * T is -Y. - // * S is +X for positive Z, -X for negative Z. - // * Face is 4 for positive Z, 5 for negative Z. - - // For making swizzle masks when using src0. - const uint32_t cube_src0_x = 2; - const uint32_t cube_src0_y = 3; - const uint32_t cube_src0_z = 1; - - // Used for various masks, as 0xFFFFFFFF/0, 2.0/0.0. - uint32_t cube_mask_temp = PushSystemTemp(); - - // 1) Choose which axis is the major one - resulting in (0xFFFFFFFF, 0, 0) - // for X major axis, (0, 0xFFFFFFFF, 0) for Y, (0, 0, 0xFFFFFFFF) for Z. - - // Mask = (X >= Y, Y >= Z, Z >= Z, X >= Z), let's hope nothing passes NaN - // in Z. 
- // ge mask, |src.xyzx|, |src.yzzz| - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_GE) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + 2 * DxbcSourceOperandLength(dxbc_operands[0], false, true))); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(cube_mask_temp); - UseDxbcSourceOperand(dxbc_operands[0], - cube_src0_x | (cube_src0_y << 2) | - (cube_src0_z << 4) | (cube_src0_x << 6), - 4, false, true); - UseDxbcSourceOperand(dxbc_operands[0], - cube_src0_y | (cube_src0_z << 2) | - (cube_src0_z << 4) | (cube_src0_z << 6), - 4, false, true); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - - // Mask = (X >= Y && X >= Z, Y >= Z, Z >= Z, unused). - // and mask.x, mask.x, mask.w - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(cube_mask_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // If X is MA, Y and Z can't be MA. 
- // movc mask._yz_, mask._xx_, l(_, 0, 0, _), mask._yz_ - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(12)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0110, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(cube_mask_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - - // If Y is MA, Z can't be MA. - // movc mask.z, mask.y, l(0), mask.z - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0100, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); - shader_code_.push_back(cube_mask_temp); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - - // 2) Get T and S as if the major axis was positive (sign changing for - // negative major axis will be done later). - - uint32_t minus_src0_length = - DxbcSourceOperandLength(dxbc_operands[0], true); - - // T is +Z if Y is major, -Y otherwise. 
- // movc pv.x, mask.y, src.z, -src.y - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 5 + operand_length_sums[0] + minus_src0_length)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(cube_mask_temp); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, cube_src0_z); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, cube_src0_y, true); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - - // S is -Z if X is major, +X otherwise. - // movc pv.y, mask.x, -src.z, src.x - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5 + minus_src0_length + - operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(cube_mask_temp); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, cube_src0_z, true); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, cube_src0_x); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - - // 3) Get 2.0 * major axis. - - // Convert the mask to float and double it (because we need 2 * MA). 
- // and mask.xyz_, mask.xyz_, l(0x40000000, 0x40000000, 0x40000000, _) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0x40000000); - shader_code_.push_back(0x40000000); - shader_code_.push_back(0x40000000); - shader_code_.push_back(0x40000000); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // Select 2.0 * needed component (mask always has 2.0 in one component and - // 0.0 in the rest). - // dp3 pv.__z_, src.xyz_, mask.xyz_ - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DP3) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 5 + operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0100, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0], cube_src0_x | (cube_src0_y << 2) | - (cube_src0_z << 4)); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(cube_mask_temp); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - - // 4) Check whether the major axis is negative and get the face index. - - // Test if the major axis is negative. 
- // lt mask.w, pv.z, l(0.0) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_LT) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - - // Convert the negative mask to float the same way (multiplied by 2) - // because it will be used in bitwise operations with other mask - // components. - // and mask.w, mask.w, l(0x40000000) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0x40000000); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // Get the face index. If major axis is X, it's 0, if it's Y, it's 2, if - // Z, it's 4, but also, being negative also adds 1 to the index. Since YZW - // of the mask contain 2.0 for whether YZ are the major axis and the major - // axis is negative, the factor is divided by 2. 
- // dp3 pv.___w, mask.yzw_, l(1.0, 2.0, 0.5, _) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_DP3) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b11111001, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0x3F800000); - shader_code_.push_back(0x40000000); - shader_code_.push_back(0x3F000000); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - - // 5) Flip axes if the major axis is negative - if major axis is Y, flip - // T, otherwise flip S. - - // S needs to flipped if the major axis is X or Z, so make an X || Z mask. - // or mask.x, mask.x, mask.z - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); - shader_code_.push_back(cube_mask_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // Don't flip anything if the major axis is positive (AND 2.0 and 2.0 if - // it's negative). 
- // and mask.xy__, mask.xy__, mask.ww__ - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(cube_mask_temp); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - - // Flip T or S. - // movc pv.xy__, mask.yx__, -pv.xy__, pv.xy__ - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b11100001, 1)); - shader_code_.push_back(cube_mask_temp); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1) | - ENCODE_D3D10_SB_OPERAND_EXTENDED(1)); - shader_code_.push_back(ENCODE_D3D10_SB_EXTENDED_OPERAND_MODIFIER( - D3D10_SB_OPERAND_MODIFIER_NEG)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - - // 6) Move T and S to the proper coordinate system. - - // Subtract abs(2.0 * major axis) from T and S. 
- // add pv.xy__, pv.xy__, -|pv.zz__| - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ADD) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1) | - ENCODE_D3D10_SB_OPERAND_EXTENDED(1)); - shader_code_.push_back(ENCODE_D3D10_SB_EXTENDED_OPERAND_MODIFIER( - D3D10_SB_OPERAND_MODIFIER_ABSNEG)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - - // Release cube_mask_temp. - PopSystemTemp(); } break; - case AluVectorOpcode::kMax4: - replicate_result_x = true; - // pv.xy = max(src0.xy, src0.zw) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MAX) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + 2 * operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - UseDxbcSourceOperand(dxbc_operands[0], 0b01001110); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // pv.x = max(pv.x, pv.y) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MAX) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - 
++stat_.float_instruction_count; - break; + case AluVectorOpcode::kCube: { + // operands[0] is .z_xy. + // Result is T coordinate, S coordinate, 2 * major axis, face ID. + constexpr uint32_t kCubeX = 2, kCubeY = 3, kCubeZ = 0; + DxbcSrc cube_x_src(operands[0].SelectFromSwizzled(kCubeX)); + DxbcSrc cube_y_src(operands[0].SelectFromSwizzled(kCubeY)); + DxbcSrc cube_z_src(operands[0].SelectFromSwizzled(kCubeZ)); + // result.xy = bool2(abs(z) >= abs(x), abs(z) >= abs(y)) + DxbcOpGE(DxbcDest::R(system_temp_result_, 0b0011), cube_z_src.Abs(), + operands[0].SwizzleSwizzled(kCubeX | (kCubeY << 2)).Abs()); + // result.x = abs(z) >= abs(x) && abs(z) >= abs(y) + DxbcOpAnd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); + DxbcDest tc_dest(DxbcDest::R(system_temp_result_, 0b0001)); + DxbcDest sc_dest(DxbcDest::R(system_temp_result_, 0b0010)); + DxbcDest ma_dest(DxbcDest::R(system_temp_result_, 0b0100)); + DxbcDest id_dest(DxbcDest::R(system_temp_result_, 0b1000)); + DxbcOpIf(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + { + // Z is the major axis. + // z < 0 needed for SC and ID, but the last to use is ID. + uint32_t ma_neg_component = (used_result_components & 0b1000) ? 
3 : 1; + if (used_result_components & 0b1010) { + DxbcOpLT(DxbcDest::R(system_temp_result_, 1 << ma_neg_component), + cube_z_src, DxbcSrc::LF(0.0f)); + } + if (used_result_components & 0b0001) { + DxbcOpMov(tc_dest, -cube_y_src); + } + if (used_result_components & 0b0010) { + DxbcOpMovC(sc_dest, + DxbcSrc::R(system_temp_result_).Select(ma_neg_component), + -cube_x_src, cube_x_src); + } + if (used_result_components & 0b0100) { + DxbcOpMul(ma_dest, DxbcSrc::LF(2.0f), cube_z_src); + } + if (used_result_components & 0b1000) { + DxbcOpMovC(id_dest, + DxbcSrc::R(system_temp_result_).Select(ma_neg_component), + DxbcSrc::LF(5.0f), DxbcSrc::LF(4.0f)); + } + } + DxbcOpElse(); + { + // result.x = abs(y) >= abs(x) + DxbcOpGE(DxbcDest::R(system_temp_result_, 0b0001), cube_y_src.Abs(), + cube_x_src.Abs()); + DxbcOpIf(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + { + // Y is the major axis. + // y < 0 needed for TC and ID, but the last to use is ID. + uint32_t ma_neg_component = (used_result_components & 0b1000) ? 3 : 0; + if (used_result_components & 0b1001) { + DxbcOpLT(DxbcDest::R(system_temp_result_, 1 << ma_neg_component), + cube_y_src, DxbcSrc::LF(0.0f)); + } + if (used_result_components & 0b0001) { + DxbcOpMovC(tc_dest, + DxbcSrc::R(system_temp_result_).Select(ma_neg_component), + -cube_z_src, cube_z_src); + } + if (used_result_components & 0b0010) { + DxbcOpMov(sc_dest, cube_x_src); + } + if (used_result_components & 0b0100) { + DxbcOpMul(ma_dest, DxbcSrc::LF(2.0f), cube_y_src); + } + if (used_result_components & 0b1000) { + DxbcOpMovC(id_dest, + DxbcSrc::R(system_temp_result_).Select(ma_neg_component), + DxbcSrc::LF(3.0f), DxbcSrc::LF(2.0f)); + } + } + DxbcOpElse(); + { + // X is the major axis. + // x < 0 needed for SC and ID, but the last to use is ID. + uint32_t ma_neg_component = (used_result_components & 0b1000) ? 
3 : 1; + if (used_result_components & 0b1010) { + DxbcOpLT(DxbcDest::R(system_temp_result_, 1 << ma_neg_component), + cube_x_src, DxbcSrc::LF(0.0f)); + } + if (used_result_components & 0b0001) { + DxbcOpMov(tc_dest, -cube_y_src); + } + if (used_result_components & 0b0010) { + DxbcOpMovC(sc_dest, + DxbcSrc::R(system_temp_result_).Select(ma_neg_component), + cube_z_src, -cube_z_src); + } + if (used_result_components & 0b0100) { + DxbcOpMul(ma_dest, DxbcSrc::LF(2.0f), cube_x_src); + } + if (used_result_components & 0b1000) { + DxbcOpAnd(id_dest, + DxbcSrc::R(system_temp_result_).Select(ma_neg_component), + DxbcSrc::LF(1.0f)); + } + } + DxbcOpEndIf(); + } + DxbcOpEndIf(); + } break; + + case AluVectorOpcode::kMax4: { + result_swizzle = DxbcSrc::kXXXX; + // Find max of all different components of the first operand. + // FIXME(Triang3l): Not caring about NaN because no info about the + // correct order, just using SM4 max here, which replaces them with the + // non-NaN component (however, there's one nice thing about it is that it + // may be compiled into max3 + max on GCN). 
+ uint32_t remaining_components = 0; + for (uint32_t i = 0; i < 4; ++i) { + remaining_components |= 1 << ((operands[0].swizzle_ >> (i * 2)) & 3); + } + uint32_t unique_component_0; + xe::bit_scan_forward(remaining_components, &unique_component_0); + remaining_components &= ~uint32_t(1 << unique_component_0); + if (remaining_components) { + uint32_t unique_component_1; + xe::bit_scan_forward(remaining_components, &unique_component_1); + remaining_components &= ~uint32_t(1 << unique_component_1); + DxbcOpMax(DxbcDest::R(system_temp_result_, 0b0001), + operands[0].Select(unique_component_0), + operands[0].Select(unique_component_1)); + while (remaining_components) { + uint32_t unique_component; + xe::bit_scan_forward(remaining_components, &unique_component); + remaining_components &= ~uint32_t(1 << unique_component); + DxbcOpMax(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + operands[0].Select(unique_component)); + } + } else { + DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), + operands[0].Select(unique_component_0)); + } + } break; case AluVectorOpcode::kSetpEqPush: + predicate_written = true; + result_swizzle = DxbcSrc::kXXXX; + // result.xy = src0.xw == 0.0 (x only if needed). + DxbcOpEq(DxbcDest::R(system_temp_result_, + used_result_components ? 0b0011 : 0b0010), + operands[0].SwizzleSwizzled(0b1100), DxbcSrc::LF(0.0f)); + // result.zw = src1.xw == 0.0 (z only if needed). + DxbcOpEq(DxbcDest::R(system_temp_result_, + used_result_components ? 0b1100 : 0b1000), + operands[1].SwizzleSwizzled(0b11000000), DxbcSrc::LF(0.0f)); + // p0 = src0.w == 0.0 && src1.w == 0.0 + DxbcOpAnd(DxbcDest::R(system_temp_ps_pc_p0_a0_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + DxbcSrc::R(system_temp_result_, DxbcSrc::kWWWW)); + if (used_result_components) { + // result = (src0.x == 0.0 && src1.x == 0.0) ? 
0.0 : src0.x + 1.0 + DxbcOpAnd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ)); + // If the condition is true, 1 will be added to make it 0. + DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(-1.0f), operands[0].SelectFromSwizzled(0)); + DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(1.0f)); + } + break; case AluVectorOpcode::kSetpNePush: + predicate_written = true; + result_swizzle = DxbcSrc::kXXXX; + // result.xy = src0.xw == 0.0 (x only if needed). + DxbcOpEq(DxbcDest::R(system_temp_result_, + used_result_components ? 0b0011 : 0b0010), + operands[0].SwizzleSwizzled(0b1100), DxbcSrc::LF(0.0f)); + // result.zw = src1.xw != 0.0 (z only if needed). + DxbcOpNE(DxbcDest::R(system_temp_result_, + used_result_components ? 0b1100 : 0b1000), + operands[1].SwizzleSwizzled(0b11000000), DxbcSrc::LF(0.0f)); + // p0 = src0.w == 0.0 && src1.w != 0.0 + DxbcOpAnd(DxbcDest::R(system_temp_ps_pc_p0_a0_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + DxbcSrc::R(system_temp_result_, DxbcSrc::kWWWW)); + if (used_result_components) { + // result = (src0.x == 0.0 && src1.x != 0.0) ? 0.0 : src0.x + 1.0 + DxbcOpAnd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ)); + // If the condition is true, 1 will be added to make it 0. 
+ DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(-1.0f), operands[0].SelectFromSwizzled(0)); + DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(1.0f)); + } + break; case AluVectorOpcode::kSetpGtPush: + predicate_written = true; + result_swizzle = DxbcSrc::kXXXX; + // result.xy = src0.xw == 0.0 (x only if needed). + DxbcOpEq(DxbcDest::R(system_temp_result_, + used_result_components ? 0b0011 : 0b0010), + operands[0].SwizzleSwizzled(0b1100), DxbcSrc::LF(0.0f)); + // result.zw = src1.xw > 0.0 (z only if needed). + DxbcOpLT(DxbcDest::R(system_temp_result_, + used_result_components ? 0b1100 : 0b1000), + DxbcSrc::LF(0.0f), operands[1].SwizzleSwizzled(0b11000000)); + // p0 = src0.w == 0.0 && src1.w > 0.0 + DxbcOpAnd(DxbcDest::R(system_temp_ps_pc_p0_a0_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + DxbcSrc::R(system_temp_result_, DxbcSrc::kWWWW)); + if (used_result_components) { + // result = (src0.x == 0.0 && src1.x > 0.0) ? 0.0 : src0.x + 1.0 + DxbcOpAnd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ)); + // If the condition is true, 1 will be added to make it 0. 
+ DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(-1.0f), operands[0].SelectFromSwizzled(0)); + DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(1.0f)); + } + break; case AluVectorOpcode::kSetpGePush: predicate_written = true; - replicate_result_x = true; - // pv.xy = (src0.x == 0.0, src0.w == 0.0) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_EQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 8 + operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0], 0b11001100); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // pv.zw = (src1.x op 0.0, src1.w op 0.0) - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE( - kCoreOpcodes[uint32_t(instr.vector_opcode)]) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 8 + DxbcSourceOperandLength(dxbc_operands[1]))); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1100, 1)); - shader_code_.push_back(system_temp_pv_); - if (instr.vector_opcode != AluVectorOpcode::kSetpGtPush) { - // lt in DXBC, not gt. - UseDxbcSourceOperand(dxbc_operands[1], 0b11000000); + result_swizzle = DxbcSrc::kXXXX; + // result.xy = src0.xw == 0.0 (x only if needed). + DxbcOpEq(DxbcDest::R(system_temp_result_, + used_result_components ? 0b0011 : 0b0010), + operands[0].SwizzleSwizzled(0b1100), DxbcSrc::LF(0.0f)); + // result.zw = src1.xw >= 0.0 (z only if needed). + DxbcOpGE(DxbcDest::R(system_temp_result_, + used_result_components ? 
0b1100 : 0b1000), + operands[1].SwizzleSwizzled(0b11000000), DxbcSrc::LF(0.0f)); + // p0 = src0.w == 0.0 && src1.w >= 0.0 + DxbcOpAnd(DxbcDest::R(system_temp_ps_pc_p0_a0_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY), + DxbcSrc::R(system_temp_result_, DxbcSrc::kWWWW)); + if (used_result_components) { + // result = (src0.x == 0.0 && src1.x >= 0.0) ? 0.0 : src0.x + 1.0 + DxbcOpAnd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ)); + // If the condition is true, 1 will be added to make it 0. + DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(-1.0f), operands[0].SelectFromSwizzled(0)); + DxbcOpAdd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(1.0f)); } - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - shader_code_.push_back(0); - if (instr.vector_opcode == AluVectorOpcode::kSetpGtPush) { - UseDxbcSourceOperand(dxbc_operands[1], 0b11000000); - } - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // p0 = src0.w == 0.0 && src1.w op 0.0 - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0100, 1)); - shader_code_.push_back(system_temp_ps_pc_p0_a0_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - // pv.x = src0.x == 0.0 && src1.x op 0.0 - 
shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 2, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - // pv.x = (src0.x == 0.0 && src1.x op 0.0) ? -1.0 : src0.x - // (1.0 is going to be added, thus -1.0) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 7 + operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0xBF800000u); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, 0); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - // pv.x += 1.0 - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ADD) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0x3F800000u); - ++stat_.instruction_count; - ++stat_.float_instruction_count; break; case 
AluVectorOpcode::kKillEq: - case AluVectorOpcode::kKillGt: - case AluVectorOpcode::kKillGe: - case AluVectorOpcode::kKillNe: - replicate_result_x = true; - // pv = src0 op src1 - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE( - kCoreOpcodes[uint32_t(instr.vector_opcode)]) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[1])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - if (instr.vector_opcode == AluVectorOpcode::kKillGt) { - // lt in DXBC, not gt. - UseDxbcSourceOperand(dxbc_operands[1]); - UseDxbcSourceOperand(dxbc_operands[0]); - } else { - UseDxbcSourceOperand(dxbc_operands[0]); - UseDxbcSourceOperand(dxbc_operands[1]); - } - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // pv = any(src0 op src1) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0011, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back(EncodeVectorSwizzledOperand( - D3D10_SB_OPERAND_TYPE_TEMP, 0b01001110, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_OR) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - 
shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - // Convert 0xFFFFFFFF to 1.0f. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_AND) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0x3F800000); - ++stat_.instruction_count; - ++stat_.uint_instruction_count; - // Discard. - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE( - edram_rov_used_ ? D3D10_SB_OPCODE_RETC - : D3D10_SB_OPCODE_DISCARD) | - ENCODE_D3D10_SB_INSTRUCTION_TEST_BOOLEAN( - D3D10_SB_INSTRUCTION_TEST_NONZERO) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; + result_swizzle = DxbcSrc::kXXXX; + DxbcOpEq(DxbcDest::R(system_temp_result_), operands[0], operands[1]); + DxbcOpOr(DxbcDest::R(system_temp_result_, 0b0011), + DxbcSrc::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, 0b1110)); + DxbcOpOr(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); if (edram_rov_used_) { - ++stat_.dynamic_flow_control_count; + DxbcOpRetC(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + } else { + DxbcOpDiscard(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + } + if (used_result_components) { + DxbcOpAnd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(1.0f)); + } + break; + case AluVectorOpcode::kKillGt: + 
result_swizzle = DxbcSrc::kXXXX; + DxbcOpLT(DxbcDest::R(system_temp_result_), operands[1], operands[0]); + DxbcOpOr(DxbcDest::R(system_temp_result_, 0b0011), + DxbcSrc::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, 0b1110)); + DxbcOpOr(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); + if (edram_rov_used_) { + DxbcOpRetC(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + } else { + DxbcOpDiscard(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + } + if (used_result_components) { + DxbcOpAnd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(1.0f)); + } + break; + case AluVectorOpcode::kKillGe: + result_swizzle = DxbcSrc::kXXXX; + DxbcOpGE(DxbcDest::R(system_temp_result_), operands[0], operands[1]); + DxbcOpOr(DxbcDest::R(system_temp_result_, 0b0011), + DxbcSrc::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, 0b1110)); + DxbcOpOr(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); + if (edram_rov_used_) { + DxbcOpRetC(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + } else { + DxbcOpDiscard(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + } + if (used_result_components) { + DxbcOpAnd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(1.0f)); + } + break; + case AluVectorOpcode::kKillNe: + result_swizzle = DxbcSrc::kXXXX; + DxbcOpNE(DxbcDest::R(system_temp_result_), operands[0], operands[1]); + DxbcOpOr(DxbcDest::R(system_temp_result_, 0b0011), + DxbcSrc::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, 0b1110)); + DxbcOpOr(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); + if 
(edram_rov_used_) { + DxbcOpRetC(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + } else { + DxbcOpDiscard(true, DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX)); + } + if (used_result_components) { + DxbcOpAnd(DxbcDest::R(system_temp_result_, 0b0001), + DxbcSrc::R(system_temp_result_, DxbcSrc::kXXXX), + DxbcSrc::LF(1.0f)); } break; - case AluVectorOpcode::kDst: { - // Not shortening so there are no write-read dependencies and less scalar - // operations. - // pv.x = 1.0 - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0x3F800000); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - // pv.y = src0.y * src1.y - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MUL) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[1])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, 1); - UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 1); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents( - instr.vector_operands[1]) & - 0b0010)) { - // Reproduce Shader Model 3 multiplication behavior (0 * anything = 0). - // This is an attenuation calculation function, so infinity is probably - // not very unlikely. - uint32_t is_subnormal_temp = PushSystemTemp(); - // Get the non-NaN multiplicand closer to zero to check if any of them - // is zero. 
- shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MIN) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + DxbcSourceOperandLength(dxbc_operands[0], false, true) + - DxbcSourceOperandLength(dxbc_operands[1], false, true))); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(is_subnormal_temp); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, 1, false, true); - UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 1, false, true); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Check if any multiplicand is zero (min isn't required to flush - // denormals in the result). - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_EQ) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(is_subnormal_temp); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(is_subnormal_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // Set pv.y to zero if any multiplicand is zero. 
- shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(is_subnormal_temp); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.movc_instruction_count; - // Release is_subnormal_temp. - PopSystemTemp(); + case AluVectorOpcode::kDst: // result = (1.0, src0.y * src1.y, src0.z, src1.w); only the components set in used_result_components are written. + if (used_result_components & 0b0001) { + DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0001), DxbcSrc::LF(1.0f)); } - // pv.z = src0.z - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0100, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, 2); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - // pv.w = src1.w - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + DxbcSourceOperandLength(dxbc_operands[1]))); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[1], kSwizzleXYZW, 3); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - } break; + if (used_result_components & 0b0010) { + // Shader Model 3: 0 or denormal * anything = 0. + // FIXME(Triang3l): Signed zero needs research and handling. 
+ DxbcOpMul(DxbcDest::R(system_temp_result_, 0b0010), + operands[0].SelectFromSwizzled(1), + operands[1].SelectFromSwizzled(1)); + if (!(instr.vector_operands[0].GetAbsoluteIdenticalComponents( + instr.vector_operands[1]) & + 0b0010)) { + DxbcOpMin(DxbcDest::R(system_temp_result_, 0b0100), + operands[0].SelectFromSwizzled(1).Abs(), + operands[1].SelectFromSwizzled(1).Abs()); + // min isn't required to flush denormals, eq is. result.z is only scratch here; the real z is written below when it is requested. + DxbcOpEq(DxbcDest::R(system_temp_result_, 0b0100), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), + DxbcSrc::LF(0.0f)); + DxbcOpMovC(DxbcDest::R(system_temp_result_, 0b0010), + DxbcSrc::R(system_temp_result_, DxbcSrc::kZZZZ), + DxbcSrc::LF(0.0f), + DxbcSrc::R(system_temp_result_, DxbcSrc::kYYYY)); + } + } + if (used_result_components & 0b0100) {  // result.z = src0.z + DxbcOpMov(DxbcDest::R(system_temp_result_, 0b0100), + operands[0].SelectFromSwizzled(2)); + } + if (used_result_components & 0b1000) {  // result.w = src1.w (swizzled component 3, not 2) + DxbcOpMov(DxbcDest::R(system_temp_result_, 0b1000), + operands[1].SelectFromSwizzled(3)); + } + break; case AluVectorOpcode::kMaxA: - // The `a0 = int(clamp(floor(src0.w + 0.5), -256.0, 255.0))` part. - // - // Using specifically floor(src0.w + 0.5) rather than round(src0.w) - // because the R600 ISA reference and MSDN say so - this makes a - // difference at 0.5 because round_ni rounds to the nearest even. - // There's one deviation from the R600 specification though - the value is - // clamped to 255 rather than set to -256 if it's over 255. We don't know - // yet which is the correct - the mova_int description, for example, says - // "clamp" explicitly. MSDN, however, says the value should actually be - // clamped. 
- // http://web.archive.org/web/20100705151335/http://msdn.microsoft.com:80/en-us/library/bb313931.aspx - // - // pv.x (temporary) = src0.w + 0.5 - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ADD) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 5 + operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0], kSwizzleXYZW, 3); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0x3F000000u); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // pv.x = floor(src0.w + 0.5) - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_ROUND_NI) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // pv.x = max(floor(src0.w + 0.5), -256.0) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MAX) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0xC3800000u); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // pv.x = clamp(floor(src0.w + 0.5), -256.0, 255.0) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MIN) | - 
ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - shader_code_.push_back( - EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); - shader_code_.push_back(0x437F0000u); - ++stat_.instruction_count; - ++stat_.float_instruction_count; - // a0 = int(clamp(floor(src0.w + 0.5), -256.0, 255.0)) - shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_FTOI) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); - shader_code_.push_back(system_temp_ps_pc_p0_a0_); - shader_code_.push_back( - EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); - ++stat_.instruction_count; - ++stat_.conversion_instruction_count; - // The `pv = max(src0, src1)` part. 
- if (operands_duplicate[1]) { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOV) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[0])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - ++stat_.instruction_count; - ++stat_.mov_instruction_count; - } else { - shader_code_.push_back( - ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MAX) | - ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH( - 3 + operand_length_sums[1])); - shader_code_.push_back( - EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); - UseDxbcSourceOperand(dxbc_operands[0]); - UseDxbcSourceOperand(dxbc_operands[1]); - ++stat_.instruction_count; - ++stat_.float_instruction_count; + DxbcOpAdd(DxbcDest::R(system_temp_ps_pc_p0_a0_, 0b1000), + operands[0].SelectFromSwizzled(3), DxbcSrc::LF(0.5f)); + DxbcOpRoundNI(DxbcDest::R(system_temp_ps_pc_p0_a0_, 0b1000), + DxbcSrc::R(system_temp_ps_pc_p0_a0_, DxbcSrc::kWWWW)); + DxbcOpMax(DxbcDest::R(system_temp_ps_pc_p0_a0_, 0b1000), + DxbcSrc::R(system_temp_ps_pc_p0_a0_, DxbcSrc::kWWWW), + DxbcSrc::LF(-256.0f)); + DxbcOpMin(DxbcDest::R(system_temp_ps_pc_p0_a0_, 0b1000), + DxbcSrc::R(system_temp_ps_pc_p0_a0_, DxbcSrc::kWWWW), + DxbcSrc::LF(255.0f)); + DxbcOpFToI(DxbcDest::R(system_temp_ps_pc_p0_a0_, 0b1000), + DxbcSrc::R(system_temp_ps_pc_p0_a0_, DxbcSrc::kWWWW)); + if (used_result_components) { + uint32_t identical = instr.vector_operands[0].GetIdenticalComponents( + instr.vector_operands[1]) & + used_result_components; + uint32_t different = used_result_components & ~identical; + if (different) { + // Shader Model 3 NaN behavior (a op b ? a : b, not fmax/fmin). 
+ DxbcOpGE(DxbcDest::R(system_temp_result_, different), operands[0], + operands[1]); + DxbcOpMovC(DxbcDest::R(system_temp_result_, different), + DxbcSrc::R(system_temp_result_), operands[0], operands[1]); + } + if (identical) { + DxbcOpMov(DxbcDest::R(system_temp_result_, identical), operands[0]); + } } break; default: assert_unhandled_case(instr.vector_opcode); - translated = false; - break; + EmitTranslationError("Unknown ALU vector operation"); + DxbcOpMov(DxbcDest::R(system_temp_result_), DxbcSrc::LF(0.0f)); } - for (uint32_t i = 0; i < operand_count; ++i) { - uint32_t operand_index = operand_count - 1 - i; - if (!operands_duplicate[operand_index]) { - UnloadDxbcSourceOperand(dxbc_operands[operand_index]); - } - } - - return translated; + PopSystemTemp(operand_temps); } bool DxbcShaderTranslator::ProcessScalarAluOperation( @@ -2446,20 +1812,19 @@ void DxbcShaderTranslator::ProcessAluInstruction( bool predicate_written_vector = false; // Whether the result is only in X and all components should be remapped to X // while storing. 
- bool replicate_vector_x = false; - bool store_vector = ProcessVectorAluOperation(instr, replicate_vector_x, - predicate_written_vector); + uint32_t vector_result_swizzle = DxbcSrc::kXYZW; + ProcessVectorAluOperation(instr, vector_result_swizzle, + predicate_written_vector); bool predicate_written_scalar = false; bool store_scalar = ProcessScalarAluOperation(instr, predicate_written_scalar); - if (store_vector) { - StoreResult(instr.vector_and_constant_result, system_temp_pv_, - replicate_vector_x, - instr.GetMemExportStreamConstant() != UINT32_MAX); - } + StoreResult(instr.vector_and_constant_result, + DxbcSrc::R(system_temp_result_, vector_result_swizzle), + instr.GetMemExportStreamConstant() != UINT32_MAX); if (store_scalar) { - StoreResult(instr.scalar_result, system_temp_ps_pc_p0_a0_, true); + StoreResult(instr.scalar_result, + DxbcSrc::R(system_temp_ps_pc_p0_a0_, DxbcSrc::kXXXX)); } if (predicate_written_vector || predicate_written_scalar) { diff --git a/src/xenia/gpu/dxbc_shader_translator_fetch.cc b/src/xenia/gpu/dxbc_shader_translator_fetch.cc index 4de3326c9..9ccc212bd 100644 --- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc +++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc @@ -42,7 +42,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index, shader_code_.push_back(temp1); shader_code_.push_back( EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); shader_code_.push_back(8); @@ -74,7 +74,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index, shader_code_.push_back(8); shader_code_.push_back( EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( 
EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); shader_code_.push_back(temp1); @@ -91,7 +91,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index, shader_code_.push_back(temp2); shader_code_.push_back( EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); shader_code_.push_back(16); @@ -189,7 +189,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index, ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, write_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); shader_code_.push_back(temp2); @@ -198,7 +198,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index, shader_code_.push_back(temp1); shader_code_.push_back( EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; ++stat_.movc_instruction_count; @@ -212,7 +212,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index, shader_code_.push_back(temp1); shader_code_.push_back( EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); shader_code_.push_back(16); @@ -244,7 +244,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index, shader_code_.push_back(16); shader_code_.push_back( EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); 
- shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); shader_code_.push_back(temp1); @@ -257,7 +257,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index, ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, write_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); shader_code_.push_back(temp2); @@ -266,7 +266,7 @@ void DxbcShaderTranslator::SwapVertexData(uint32_t vfetch_index, shader_code_.push_back(temp1); shader_code_.push_back( EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; ++stat_.movc_instruction_count; @@ -342,7 +342,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( 5 + DxbcSourceOperandLength(index_operand))); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); UseDxbcSourceOperand(index_operand, kSwizzleXYZW, 0); shader_code_.push_back( EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); @@ -353,10 +353,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; 
++stat_.conversion_instruction_count; } else { @@ -365,7 +365,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( 3 + DxbcSourceOperandLength(index_operand))); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); UseDxbcSourceOperand(index_operand, kSwizzleXYZW, 0); ++stat_.instruction_count; ++stat_.conversion_instruction_count; @@ -390,7 +390,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSelectOperand( D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, (vfetch_index & 1) * 2, 3)); shader_code_.push_back(cbuffer_index_fetch_constants_); @@ -407,16 +407,16 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); shader_code_.push_back(instr.attributes.stride * 4); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; ++stat_.uint_instruction_count; @@ -426,10 +426,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); shader_code_.push_back( 
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); shader_code_.push_back(instr.attributes.offset * 4); @@ -444,7 +444,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0010, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSelectOperand( D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, kSysConst_Flags_Comp, 3)); shader_code_.push_back(cbuffer_index_system_constants_); @@ -462,7 +462,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3)); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; ++stat_.dynamic_flow_control_count; @@ -471,10 +471,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); shader_code_.push_back(EncodeVectorMaskedOperand( D3D10_SB_OPERAND_TYPE_TEMP, (1 << load_dword_count) - 1, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D11_SB_OPERAND_TYPE_UNORDERED_ACCESS_VIEW, kSwizzleXYZW & ((1 << (load_dword_count * 2)) - 1), 2)); @@ -492,10 +492,10 
@@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); shader_code_.push_back(EncodeVectorMaskedOperand( D3D10_SB_OPERAND_TYPE_TEMP, (1 << load_dword_count) - 1, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_RESOURCE, kSwizzleXYZW & ((1 << (load_dword_count * 2)) - 1), 2)); @@ -607,7 +607,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(15)); shader_code_.push_back(EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, result_write_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); shader_code_.push_back(extract_widths[0]); @@ -622,7 +622,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( shader_code_.push_back(extract_offsets[3]); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, extract_swizzle, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; if (extract_signed) { ++stat_.int_instruction_count; @@ -639,10 +639,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); shader_code_.push_back(EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, result_write_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + 
shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; ++stat_.conversion_instruction_count; } else if (normalize_scales[0] != 0.0f) { @@ -655,10 +655,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); shader_code_.push_back(EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, result_write_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; ++stat_.conversion_instruction_count; if (!instr.attributes.is_integer) { @@ -667,10 +667,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); shader_code_.push_back(EncodeVectorMaskedOperand( D3D10_SB_OPERAND_TYPE_TEMP, result_write_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); for (uint32_t i = 0; i < 4; ++i) { @@ -687,10 +687,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); shader_code_.push_back(EncodeVectorMaskedOperand( D3D10_SB_OPERAND_TYPE_TEMP, result_write_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( 
D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); shader_code_.push_back(0xBF800000u); @@ -710,7 +710,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); shader_code_.push_back(EncodeVectorMaskedOperand( D3D10_SB_OPERAND_TYPE_TEMP, 0b1111 & ~result_write_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); shader_code_.push_back(0); @@ -727,10 +727,10 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(10)); shader_code_.push_back(EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, result_write_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); uint32_t exp_adjust_scale = @@ -743,7 +743,7 @@ void DxbcShaderTranslator::ProcessVertexFetchInstruction( ++stat_.float_instruction_count; } - StoreResult(instr.result, system_temp_pv_, false); + StoreResult(instr.result, DxbcSrc::R(system_temp_result_)); } uint32_t DxbcShaderTranslator::FindOrAddTextureSRV(uint32_t fetch_constant, @@ -852,9 +852,9 @@ uint32_t DxbcShaderTranslator::FindOrAddSamplerBinding( return sampler_register; } -void DxbcShaderTranslator::ArrayCoordToCubeDirection(uint32_t reg) { - // This does the reverse of what the cube vector ALU instruction does, but - // assuming S and T are normalized. +void DxbcShaderTranslator::TfetchCubeCoordToCubeDirection(uint32_t reg) { + // This does the reverse of what's done by the ALU sequence for cubemap + // coordinate calculation. 
// // The major axis depends on the face index (passed as a float in reg.z): // +X for 0, -X for 1, +Y for 2, -Y for 3, +Z for 4, -Z for 5. @@ -872,8 +872,8 @@ void DxbcShaderTranslator::ArrayCoordToCubeDirection(uint32_t reg) { // * Y is -T. // * Z is 1.0 or -1.0. - // Make 0, not 0.5, the center of S and T. - // mad reg.xy__, reg.xy__, l(2.0, 2.0, _, _), l(-1.0, -1.0, _, _) + // Make 0, not 1.5, the center of S and T. + // mad reg.xy__, reg.xy__, l(2.0, 2.0, _, _), l(-3.0, -3.0, _, _) shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MAD) | ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(15)); shader_code_.push_back( @@ -890,8 +890,8 @@ void DxbcShaderTranslator::ArrayCoordToCubeDirection(uint32_t reg) { shader_code_.push_back(0x3F800000u); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); - shader_code_.push_back(0xBF800000u); - shader_code_.push_back(0xBF800000u); + shader_code_.push_back(0xC0400000u); + shader_code_.push_back(0xC0400000u); shader_code_.push_back(0); shader_code_.push_back(0); ++stat_.instruction_count; @@ -1194,7 +1194,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(8)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_IMMEDIATE32, kSwizzleXYZW, 0)); shader_code_.push_back(0); @@ -2149,7 +2149,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, coord_mask, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); 
shader_code_.push_back(coord_temp); @@ -2157,12 +2157,13 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ++stat_.float_instruction_count; } else { if (instr.dimension == TextureDimension::kCube) { - // Convert cubemap coordinates passed as 2D array texture coordinates to - // a 3D direction. We can't use a 2D array to emulate cubemaps because - // at the edges, especially in pixel shader helper invocations, the - // major axis changes, causing S/T to jump between 0 and 1, breaking - // gradient calculation and causing the 1x1 mipmap to be sampled. - ArrayCoordToCubeDirection(coord_temp); + // Convert cubemap coordinates passed as 2D array texture coordinates + // plus 1 in ST to a 3D direction. We can't use a 2D array to emulate + // cubemaps because at the edges, especially in pixel shader helper + // invocations, the major axis changes, causing S/T to jump between 0 + // and 1, breaking gradient calculation and causing the 1x1 mipmap to be + // sampled. + TfetchCubeCoordToCubeDirection(coord_temp); } // Bias the register LOD if fetching with explicit LOD (so this is not @@ -2237,7 +2238,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(11)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); shader_code_.push_back(coord_temp); @@ -2260,10 +2261,10 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); shader_code_.push_back(EncodeVectorMaskedOperand( D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); - shader_code_.push_back(system_temp_pv_); + 
shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); shader_code_.push_back( @@ -2277,7 +2278,8 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( for (uint32_t j = 0; j < 2; ++j) { uint32_t srv_index_current = i ? srv_indices_stacked[j] : srv_indices[j]; - uint32_t target_temp_sign = j ? signed_value_temp : system_temp_pv_; + uint32_t target_temp_sign = + j ? signed_value_temp : system_temp_result_; for (uint32_t k = 0; k < (vol_filter_lerp_temp != UINT32_MAX ? 2u : 1u); ++k) { uint32_t target_temp_current = @@ -2564,7 +2566,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1 << i, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); shader_code_.push_back(sign_temp); @@ -2573,7 +2575,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( shader_code_.push_back(signed_value_temp); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, i, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; ++stat_.movc_instruction_count; @@ -2603,7 +2605,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( shader_code_.push_back(sign_temp); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, i, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeScalarOperand(D3D10_SB_OPERAND_TYPE_IMMEDIATE32, 0)); shader_code_.push_back(0x40000000u); @@ -2619,7 +2621,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(9)); shader_code_.push_back( 
EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1 << i, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 1, 1)); shader_code_.push_back(sign_temp); @@ -2628,7 +2630,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( shader_code_.push_back(sign_temp); shader_code_.push_back( EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, i, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); ++stat_.instruction_count; ++stat_.movc_instruction_count; @@ -2661,7 +2663,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ++stat_.dynamic_flow_control_count; // Degamma the channel. - ConvertPWLGamma(false, system_temp_pv_, i, system_temp_pv_, i, + ConvertPWLGamma(false, system_temp_result_, i, system_temp_result_, i, sign_temp, 0, sign_temp, 1); // Close the gamma conditional. @@ -2733,10 +2735,10 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); shader_code_.push_back(exp_adjust_temp); @@ -2774,7 +2776,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + operand_length)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0101, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); UseDxbcSourceOperand(operand, 0b01010000); 
++stat_.instruction_count; ++stat_.float_instruction_count; @@ -2784,7 +2786,7 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(3 + operand_length)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1010, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); UseDxbcSourceOperand(operand, 0b01010000); ++stat_.instruction_count; ++stat_.float_instruction_count; @@ -2857,10 +2859,10 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); shader_code_.push_back( EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1111, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back(EncodeVectorSwizzledOperand( D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); - shader_code_.push_back(system_temp_pv_); + shader_code_.push_back(system_temp_result_); shader_code_.push_back( EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b01000100, 1)); shader_code_.push_back(exp_bias_temp); @@ -2898,7 +2900,9 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( } if (store_result) { - StoreResult(instr.result, system_temp_pv_, replicate_result); + StoreResult(instr.result, + DxbcSrc::R(system_temp_result_, + replicate_result ? DxbcSrc::kXXXX : DxbcSrc::kXYZW)); } } diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 25941c4bd..ac89bdc0a 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -289,7 +289,7 @@ struct ParsedLoopStartInstruction { uint32_t dword_index = 0; // Integer constant register that holds the loop parameters. - // Byte-wise: [loop count, start, step [-128, 127], ?] + // 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step. uint32_t loop_constant_index = 0; // Whether to reuse the current aL instead of reset it to loop start. 
bool is_repeat = false; @@ -311,7 +311,7 @@ struct ParsedLoopEndInstruction { bool predicate_condition = false; // Integer constant register that holds the loop parameters. - // Byte-wise: [loop count, start, step [-128, 127], ?] + // 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step. uint32_t loop_constant_index = 0; // Target address of the start of the loop body. diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index b588f6776..051cbf7df 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -256,7 +256,7 @@ struct ControlFlowLoopStartInstruction { // Whether to reuse the current aL instead of reset it to loop start. bool is_repeat() const { return is_repeat_; } // Integer constant register that holds the loop parameters. - // Byte-wise: [loop count, start, step [-128, 127], ?] + // 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step. uint32_t loop_id() const { return loop_id_; } private: @@ -281,7 +281,7 @@ struct ControlFlowLoopEndInstruction { // Target address of the start of the loop body. uint32_t address() const { return address_; } // Integer constant register that holds the loop parameters. - // Byte-wise: [loop count, start, step [-128, 127], ?] + // 0:7 - uint8 loop count, 8:15 - uint8 start aL, 16:23 - int8 aL step. uint32_t loop_id() const { return loop_id_; } // Break from the loop if the predicate matches the expected value. bool is_predicated_break() const { return is_predicated_break_; } @@ -667,11 +667,13 @@ static_assert_size(TextureFetchInstruction, 12); // Both are valid only within the current ALU clause. They are not modified // when the instruction that would write them fails its predication check. // - Direct3D 9 rules (like in GCN v_*_legacy_f32 instructions) for -// multiplication (0 * anything = 0) wherever it's present (mul, mad, dp, -// etc.) and for NaN in min/max. 
It's very important to respect this rule for -// multiplication, as games often rely on it in vector normalization (rcp and -// mul), Infinity * 0 resulting in NaN breaks a lot of things in games - -// causes white screen in Halo 3, white specular on characters in GTA IV. +// multiplication (0 or denormal * anything = 0) wherever it's present (mul, +// mad, dp, etc.) and for NaN in min/max. It's very important to respect this +// rule for multiplication, as games often rely on it in vector normalization +// (rcp and mul), Infinity * 0 resulting in NaN breaks a lot of things in +// games - causes white screen in Halo 3, white specular on characters in GTA +// IV. +// TODO(Triang3l): Investigate signed zero handling in multiplication. enum class AluScalarOpcode : uint32_t { // Floating-Point Add @@ -1145,7 +1147,7 @@ enum class AluVectorOpcode : uint32_t { // cube/CUBEv dest, src0, src1 // dest.x = T cube coordinate; // dest.y = S cube coordinate; - // dest.z = 2.0 * MajorAxis; + // dest.z = 2.0 * major axis; // dest.w = FaceID; // https://developer.amd.com/wordpress/media/2012/12/AMD_Southern_Islands_Instruction_Set_Architecture.pdf // if (abs(z) >= abs(x) && abs(z) >= abs(y)) { @@ -1167,6 +1169,16 @@ enum class AluVectorOpcode : uint32_t { // Expects src0.zzxy and src1.yxzz swizzles. // FaceID is D3DCUBEMAP_FACES: // https://msdn.microsoft.com/en-us/library/windows/desktop/bb172528(v=vs.85).aspx + // Used like: + // cube r0, source.zzxy, source.yxz + // rcp r0.z, r0_abs.z + // mad r0.xy, r0, r0.zzzw, 1.5f + // tfetchCube r0, r0.yxw, tf0 + // http://web.archive.org/web/20100705154143/http://msdn.microsoft.com/en-us/library/bb313921.aspx + // On GCN, the sequence is the same, so GCN documentation can be used as a + // reference (tfetchCube doesn't accept the UV as if the texture was a 2D + // array in XY exactly, to get texture array UV, 1 must be subtracted from its + // XY inputs). 
kCube = 18, // Four-Element Maximum @@ -1293,12 +1305,20 @@ enum class AluVectorOpcode : uint32_t { // Per-Component Floating-Point Maximum with Copy To Integer in AR // maxa dest, src0, src1 // This is a combined max + mova/MOVAv. - // int result = (int)floor(src0.w + 0.5); - // a0 = clamp(result, -256, 255); + // a0 = (int)clamp(floor(src0.w + 0.5), -256.0, 255.0); // dest.x = src0.x >= src1.x ? src0.x : src1.x; // dest.y = src0.y >= src1.y ? src0.y : src1.y; // dest.z = src0.z >= src1.z ? src0.z : src1.z; // dest.w = src0.w >= src1.w ? src0.w : src1.w; + // The MSDN documentation specifies clamp as: + // if (!(SQResultF >= -256.0)) { + // SQResultF = -256.0; + // } + // if (SQResultF > 255.0) { + // SQResultF = 255.0; + // } + // http://web.archive.org/web/20100705151335/http://msdn.microsoft.com:80/en-us/library/bb313931.aspx + // However, using NaN as an address would be unusual. kMaxA = 29, }; @@ -1329,6 +1349,7 @@ constexpr bool AluVectorOpHasSideEffects(AluVectorOpcode vector_opcode) { // (doesn't check the operand count though). constexpr uint32_t GetAluVectorOpUsedSourceComponents( AluVectorOpcode vector_opcode, uint32_t src_index) { + assert_not_zero(src_index); switch (vector_opcode) { case AluVectorOpcode::kDp3: return 0b0111; @@ -1353,27 +1374,30 @@ constexpr uint32_t GetAluVectorOpUsedSourceComponents( // components specified in the write mask are needed, but there are instructions // with special behavior for certain components. constexpr uint32_t GetAluVectorOpNeededSourceComponents( - AluVectorOpcode vector_opcode, uint32_t src_index, uint32_t write_mask) { - uint32_t components = write_mask; + AluVectorOpcode vector_opcode, uint32_t src_index, + uint32_t used_result_components) { + assert_not_zero(src_index); + uint32_t components = used_result_components; switch (vector_opcode) { case AluVectorOpcode::kDp4: case AluVectorOpcode::kMax4: - components = write_mask ? 0b1111 : 0; + components = used_result_components ?
0b1111 : 0; break; case AluVectorOpcode::kDp3: - components = write_mask ? 0b0111 : 0; + components = used_result_components ? 0b0111 : 0; break; case AluVectorOpcode::kDp2Add: - components = write_mask ? (src_index == 3 ? 0b0001 : 0b0011) : 0; + components = + used_result_components ? (src_index == 3 ? 0b0001 : 0b0011) : 0; break; case AluVectorOpcode::kCube: - components = write_mask ? 0b1111 : 0; + components = used_result_components ? 0b1111 : 0; break; case AluVectorOpcode::kSetpEqPush: case AluVectorOpcode::kSetpNePush: case AluVectorOpcode::kSetpGtPush: case AluVectorOpcode::kSetpGePush: - components = write_mask ? 0b1001 : 0b1000; + components = used_result_components ? 0b1001 : 0b1000; break; case AluVectorOpcode::kKillEq: case AluVectorOpcode::kKillGt: