diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 980a09e81..20d478d0e 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1347,15 +1347,15 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( // = false: multiply the Z coordinate by 1/W0. // VTX_W0_FMT = true: the incoming W0 is not 1/W0. Perform the reciprocal to // get 1/W0. - float vtx_xy_fmt = (pa_cl_vte_cntl & (1 << 8)) ? 1.0f : 0.0f; - float vtx_z_fmt = (pa_cl_vte_cntl & (1 << 9)) ? 1.0f : 0.0f; - float vtx_w0_fmt = (pa_cl_vte_cntl & (1 << 10)) ? 1.0f : 0.0f; - dirty |= system_constants_.mul_rcp_w[0] != vtx_xy_fmt; - dirty |= system_constants_.mul_rcp_w[1] != vtx_z_fmt; - dirty |= system_constants_.mul_rcp_w[2] != vtx_w0_fmt; - system_constants_.mul_rcp_w[0] = vtx_xy_fmt; - system_constants_.mul_rcp_w[1] = vtx_z_fmt; - system_constants_.mul_rcp_w[2] = vtx_w0_fmt; + uint32_t vtx_xy_fmt = (pa_cl_vte_cntl >> 8) & 1; + uint32_t vtx_z_fmt = (pa_cl_vte_cntl >> 9) & 1; + uint32_t vtx_w0_fmt = (pa_cl_vte_cntl >> 10) & 1; + dirty |= system_constants_.vertex_w_format[0] != vtx_xy_fmt; + dirty |= system_constants_.vertex_w_format[1] != vtx_z_fmt; + dirty |= system_constants_.vertex_w_format[2] != vtx_w0_fmt; + system_constants_.vertex_w_format[0] = vtx_xy_fmt; + system_constants_.vertex_w_format[1] = vtx_z_fmt; + system_constants_.vertex_w_format[2] = vtx_w0_fmt; // Conversion to Direct3D 12 normalized device coordinates. // See viewport configuration in UpdateFixedFunctionState for explanations. diff --git a/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli b/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli index dccaa8342..09a47c0ef 100644 --- a/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli +++ b/src/xenia/gpu/d3d12/shaders/xenos_draw.hlsli @@ -3,7 +3,7 @@ cbuffer xe_system_cbuffer : register(b0) { // vec4 0 - float3 xe_mul_rcp_w; + uint3 xe_vertex_w_format; uint xe_vertex_base_index; // vec4 1 float3 xe_ndc_scale; diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 9d218ddd3..b69f77b92 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -517,7 +517,85 @@ void DxbcShaderTranslator::StartTranslation() { } void DxbcShaderTranslator::CompleteVertexShader() { - // TODO(Triang3l): vtx_fmt. + // Revert getting the reciprocal of W and dividing XY by W if needed. + // TODO(Triang3l): Check if having XY or Z pre-divided by W should enable + // affine interpolation. + rdef_constants_used_ |= 1ull + << uint32_t(RdefConstantIndex::kSysVertexWFormat); + uint32_t w_format_temp = PushSystemTemp(); + // If the shader has returned 1/W, restore W. First take the reciprocal, which + // may be either W (what we need) or 1/W, depending on the vertex W format. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D11_SB_OPCODE_RCP) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(5)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0001, 1)); + shader_code_.push_back(w_format_temp); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_position_); + ++stat_.instruction_count; + ++stat_.float_instruction_count; + // Then, if the shader returns 1/W (vtx_w0_fmt is 0), write 1/(1/W) to the + // position. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(11)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b1000, 1)); + shader_code_.push_back(system_temp_position_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, + kSysConst_VertexWFormat_Comp + 2, 3)); + shader_code_.push_back(uint32_t(RdefConstantBufferIndex::kSystemConstants)); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_VertexWFormat_Vec); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_position_); + shader_code_.push_back( + EncodeVectorSelectOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0, 1)); + shader_code_.push_back(w_format_temp); + ++stat_.instruction_count; + ++stat_.movc_instruction_count; + // Multiply XYZ by W in case the shader returns XYZ/W and we'll need to + // restore XYZ. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MUL) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(7)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1)); + shader_code_.push_back(w_format_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(system_temp_position_); + shader_code_.push_back( + EncodeVectorReplicatedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 3, 1)); + shader_code_.push_back(system_temp_position_); + ++stat_.instruction_count; + ++stat_.float_instruction_count; + // If vtx_xy_fmt and/or vtx_z_fmt are 1, XY and/or Z are pre-divided by W. + // Restore them in this case. + shader_code_.push_back(ENCODE_D3D10_SB_OPCODE_TYPE(D3D10_SB_OPCODE_MOVC) | + ENCODE_D3D10_SB_TOKENIZED_INSTRUCTION_LENGTH(11)); + shader_code_.push_back( + EncodeVectorMaskedOperand(D3D10_SB_OPERAND_TYPE_TEMP, 0b0111, 1)); + shader_code_.push_back(system_temp_position_); + shader_code_.push_back(EncodeVectorSwizzledOperand( + D3D10_SB_OPERAND_TYPE_CONSTANT_BUFFER, + kSysConst_VertexWFormat_Comp | (kSysConst_VertexWFormat_Comp << 2) | + ((kSysConst_VertexWFormat_Comp + 1) << 4), + 3)); + shader_code_.push_back(uint32_t(RdefConstantBufferIndex::kSystemConstants)); + shader_code_.push_back(uint32_t(CbufferRegister::kSystemConstants)); + shader_code_.push_back(kSysConst_VertexWFormat_Vec); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(w_format_temp); + shader_code_.push_back( + EncodeVectorSwizzledOperand(D3D10_SB_OPERAND_TYPE_TEMP, kSwizzleXYZW, 1)); + shader_code_.push_back(system_temp_position_); + ++stat_.instruction_count; + ++stat_.movc_instruction_count; + // Release w_format_temp. + PopSystemTemp(); // Apply scale for drawing without a viewport. rdef_constants_used_ |= 1ull << uint32_t(RdefConstantIndex::kSysNDCScale); @@ -5976,6 +6054,7 @@ const DxbcShaderTranslator::RdefType DxbcShaderTranslator::rdef_types_[size_t( {"float4", 1, 3, 1, 4, 0, 0, RdefTypeIndex::kUnknown, nullptr}, {"int", 0, 2, 1, 1, 0, 0, RdefTypeIndex::kUnknown, nullptr}, {"uint", 0, 19, 1, 1, 0, 0, RdefTypeIndex::kUnknown, nullptr}, + {"uint3", 1, 19, 1, 3, 0, 0, RdefTypeIndex::kUnknown, nullptr}, {"uint4", 1, 19, 1, 4, 0, 0, RdefTypeIndex::kUnknown, nullptr}, {nullptr, 1, 19, 1, 4, 8, 0, RdefTypeIndex::kUint4, nullptr}, {nullptr, 1, 19, 1, 4, 32, 0, RdefTypeIndex::kUint4, nullptr}, @@ -5991,7 +6070,7 @@ const DxbcShaderTranslator::RdefConstant DxbcShaderTranslator::RdefConstantIndex::kCount)] = { // SYSTEM CONSTANTS MUST BE UPDATED IF THEIR LAYOUT CHANGES! // System constants vec4 0. - {"xe_mul_rcp_w", RdefTypeIndex::kFloat3, 0, 12}, + {"xe_vertex_w_format", RdefTypeIndex::kUint3, 0, 12}, {"xe_vertex_base_index", RdefTypeIndex::kUint, 12, 4}, // System constants vec4 1. {"xe_ndc_scale", RdefTypeIndex::kFloat3, 16, 12}, diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 838f1028e..51b5b0133 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -33,7 +33,7 @@ class DxbcShaderTranslator : public ShaderTranslator { // - d3d12/shaders/xenos_draw.hlsli (for geometry shaders). struct SystemConstants { // vec4 0 - float mul_rcp_w[3]; + uint32_t vertex_w_format[3]; uint32_t vertex_base_index; // vec4 1 @@ -139,8 +139,8 @@ class DxbcShaderTranslator : public ShaderTranslator { }; enum : uint32_t { - kSysConst_MulRcpW_Vec = 0, - kSysConst_MulRcpW_Comp = 0, + kSysConst_VertexWFormat_Vec = 0, + kSysConst_VertexWFormat_Comp = 0, kSysConst_VertexBaseIndex_Vec = 0, kSysConst_VertexBaseIndex_Comp = 3, @@ -397,6 +397,7 @@ class DxbcShaderTranslator : public ShaderTranslator { kFloat4, kInt, kUint, + kUint3, kUint4, // Bool constants. kUint4Array8, @@ -438,7 +439,7 @@ class DxbcShaderTranslator : public ShaderTranslator { enum class RdefConstantIndex { kSystemConstantFirst, - kSysMulRcpW = kSystemConstantFirst, + kSysVertexWFormat = kSystemConstantFirst, kSysVertexBaseIndex, kSysNDCScale, kSysVertexIndexEndian,