diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 7fd7b347e..f221243a0 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -1333,14 +1333,16 @@ void DxbcShaderTranslator::StartTranslation() { system_temp_position_ = PushSystemTemp(0b1111); } else if (IsDxbcPixelShader()) { if (edram_rov_used_) { + // Will be initialized unconditionally. system_temp_rov_params_ = PushSystemTemp(); - // If the shader doesn't write to depth, StartPixelShader will load the - // depth/stencil for early test, so no need to initialize. If it does, - // initialize it to something consistent - depth must be written on every - // shader execution path (at least in PC ps_3_0 and later shader models) - // and to make compilation easier. + // If the shader doesn't write to oDepth, each component will be written + // to if depth/stencil is enabled and the respective sample is covered - + // so need to initialize now because the first writes will be conditional. + // If the shader writes to oDepth, this is oDepth of the shader, written + // by the guest code, so initialize because assumptions can't be made + // about the integrity of the guest code. system_temp_rov_depth_stencil_ = - PushSystemTemp(writes_depth() ? 0b0001 : 0); + PushSystemTemp(writes_depth() ? 0b0001 : 0b1111); } for (uint32_t i = 0; i < 4; ++i) { if (writes_color_target(i)) { @@ -1377,7 +1379,10 @@ void DxbcShaderTranslator::StartTranslation() { } } - // Allocate system temporary variables for the translated code. + // Allocate system temporary variables for the translated code. Since access + // depends on the guest code (thus no guarantees), initialize everything + // now (except for pv, it's an internal temporary variable, not accessible + // by the guest). system_temp_pv_ = PushSystemTemp(); system_temp_ps_pc_p0_a0_ = PushSystemTemp(0b1111); system_temp_aL_ = PushSystemTemp(0b1111); diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index 245eab7c0..1bac1ccd7 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -49,6 +49,10 @@ namespace gpu { // optimization possibilities as this may result in false dependencies. Always // mov l(0, 0, 0, 0) to such components before potential branching - // PushSystemTemp accepts a zero mask for this purpose. +// +// Clamping must be done first to the lower bound (using max), then to the upper +// bound (using min), to match the saturate modifier behavior, which results in +// 0 for NaN. class DxbcShaderTranslator : public ShaderTranslator { public: DxbcShaderTranslator(uint32_t vendor_id, bool edram_rov_used); @@ -1626,7 +1630,8 @@ class DxbcShaderTranslator : public ShaderTranslator { uint32_t temp2_component); // Packs a float32x4 color value to 32bpp or a 64bpp in color_temp to // packed_temp.packed_temp_components, using 2 temporary VGPR. color_temp and - // packed_temp may be the same if packed_temp_components is 0. + // packed_temp may be the same if packed_temp_components is 0. If the format + // is 32bpp, will still the high part to break register dependency. void ROV_PackPreClampedColor(uint32_t rt_index, uint32_t color_temp, uint32_t packed_temp, uint32_t packed_temp_components, uint32_t temp1, @@ -2002,7 +2007,8 @@ class DxbcShaderTranslator : public ShaderTranslator { // W - Base-relative resolution-scaled EDRAM offset for 64bpp color data, in // dwords. uint32_t system_temp_rov_params_; - // ROV only - new depth/stencil data. 4 VGPRs. + // ROV only - new depth/stencil data. 4 VGPRs when not writing to oDepth, 1 + // VGPR when writing to oDepth. // When not writing to oDepth: New per-sample depth/stencil values, generated // during early depth/stencil test (actual writing checks coverage bits). // When writing to oDepth: X also used to hold the depth written by the diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index 5ed21df4a..61e9e9fcc 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -880,6 +880,10 @@ void DxbcShaderTranslator::ROV_PackPreClampedColor( DxbcDest temp2_dest(DxbcDest::R(temp2, 1 << temp2_component)); DxbcSrc temp2_src(DxbcSrc::R(temp2).Select(temp2_component)); + // Break register dependency after 32bpp cases. + DxbcOpMov(DxbcDest::R(packed_temp, 1 << (packed_temp_components + 1)), + DxbcSrc::LU(0)); + // Choose the packing based on the render target's format. system_constants_used_ |= 1ull << kSysConst_EDRAMRTFormatFlags_Index; DxbcOpSwitch(DxbcSrc::CB(cbuffer_index_system_constants_,