diff --git a/src/xenia/gpu/dxbc_shader_translator_fetch.cc b/src/xenia/gpu/dxbc_shader_translator_fetch.cc index b4813b381..8e81d0831 100644 --- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc +++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc @@ -17,6 +17,7 @@ #include "xenia/base/math.h" #include "xenia/base/string.h" #include "xenia/gpu/dxbc_shader_translator.h" +#include "xenia/gpu/gpu_flags.h" namespace xe { namespace gpu { @@ -630,21 +631,44 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( if (instr.opcode == FetchOpcode::kGetTextureGradients) { // Handle before doing anything that actually needs the texture. bool grad_operand_temp_pushed = false; - DxbcSrc grad_operand = - LoadOperand(instr.operands[0], 0b0011, grad_operand_temp_pushed); - if (used_result_components & 0b0101) { - DxbcOpDerivRTXFine( - DxbcDest::R(system_temp_result_, used_result_components & 0b0101), - grad_operand.SwizzleSwizzled(0b010000)); + DxbcSrc grad_operand = LoadOperand( + instr.operands[0], + ((used_result_nonzero_components & 0b0011) ? 0b0001 : 0) | + ((used_result_nonzero_components & 0b1100) ? 0b0010 : 0), + grad_operand_temp_pushed); + if (used_result_nonzero_components & 0b0101) { + DxbcOpDerivRTXCoarse(DxbcDest::R(system_temp_result_, + used_result_nonzero_components & 0b0101), + grad_operand.SwizzleSwizzled(0b010000)); } - if (used_result_components & 0b1010) { - DxbcOpDerivRTYFine( - DxbcDest::R(system_temp_result_, used_result_components & 0b1010), - grad_operand.SwizzleSwizzled(0b01000000)); + if (used_result_nonzero_components & 0b1010) { + DxbcOpDerivRTYCoarse(DxbcDest::R(system_temp_result_, + used_result_nonzero_components & 0b1010), + grad_operand.SwizzleSwizzled(0b01000000)); } if (grad_operand_temp_pushed) { PopSystemTemp(); } + if (!edram_rov_used_ && cvars::ssaa_scale_gradients) { + // Scale the gradients to guest pixels with SSAA. + uint32_t ssaa_scale_temp = PushSystemTemp(); + system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index; + DxbcOpMovC(DxbcDest::R(ssaa_scale_temp, + (used_result_nonzero_components & 0b0011) | + (used_result_nonzero_components >> 2)), + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_SampleCountLog2_Vec, + kSysConst_SampleCountLog2_Comp | + ((kSysConst_SampleCountLog2_Comp + 1) << 2)), + DxbcSrc::LF(2.0f), DxbcSrc::LF(1.0f)); + DxbcOpMul( + DxbcDest::R(system_temp_result_, used_result_nonzero_components), + DxbcSrc::R(system_temp_result_), + DxbcSrc::R(ssaa_scale_temp, 0b01000100)); + // Release ssaa_scale_temp. + PopSystemTemp(); + } StoreResult(instr.result, DxbcSrc::R(system_temp_result_)); return; } @@ -1387,12 +1411,27 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( DxbcOpExp(lod_dest, lod_src); // FIXME(Triang3l): Gradient exponent adjustment is currently not done // in getCompTexLOD, so don't do it here too. + bool ssaa_scale_gradients = + !instr.attributes.use_register_gradients && !edram_rov_used_ && + cvars::ssaa_scale_gradients; #if 0 // Extract gradient exponent biases from the fetch constant and merge // them with the LOD bias. DxbcOpIBFE(DxbcDest::R(grad_h_lod_temp, 0b0011), DxbcSrc::LU(5), DxbcSrc::LU(22, 27, 0, 0), RequestTextureFetchConstantWord(tfetch_index, 4)); + if (ssaa_scale_gradients) { + // Adjust the gradient scales to include the SSAA scale. + system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index; + DxbcOpIAdd(DxbcDest::R(grad_h_lod_temp, 0b0011), + DxbcSrc::R(grad_h_lod_temp), + DxbcSrc::CB( + cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_SampleCountLog2_Vec, + kSysConst_SampleCountLog2_Comp | + ((kSysConst_SampleCountLog2_Comp + 1) << 2))); + } DxbcOpIMAd(DxbcDest::R(grad_h_lod_temp, 0b0011), DxbcSrc::R(grad_h_lod_temp), DxbcSrc::LI(int32_t(1) << 23), DxbcSrc::LF(1.0f)); @@ -1400,6 +1439,32 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( DxbcSrc::R(grad_h_lod_temp, DxbcSrc::kYYYY)); DxbcOpMul(lod_dest, lod_src, DxbcSrc::R(grad_h_lod_temp, DxbcSrc::kXXXX)); +#else + if (ssaa_scale_gradients) { + // Adjust the gradient scales in each direction to include the SSAA + // scale - for ddy scale, grad_v_temp.w, not grad_h_lod_temp.w, must + // be used. + // ddy. + system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index; + DxbcOpMovC(DxbcDest::R(grad_v_temp, 0b1000), + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_SampleCountLog2_Vec) + .Select(kSysConst_SampleCountLog2_Comp + 1), + DxbcSrc::LF(2.0f), DxbcSrc::LF(1.0f)); + DxbcOpMul(DxbcDest::R(grad_v_temp, 0b1000), lod_src, + DxbcSrc::R(grad_v_temp, DxbcSrc::kWWWW)); + // ddx (after ddy handling, because the ddy code uses lod_src, and + // it's being overwritten now). + system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index; + DxbcOpIf(true, + DxbcSrc::CB(cbuffer_index_system_constants_, + uint32_t(CbufferRegister::kSystemConstants), + kSysConst_SampleCountLog2_Vec) + .Select(kSysConst_SampleCountLog2_Comp)); + DxbcOpMul(lod_dest, lod_src, DxbcSrc::LF(2.0f)); + DxbcOpEndIf(); + } #endif // Obtain the gradients and apply biases to them. if (instr.attributes.use_register_gradients) { @@ -1458,8 +1523,12 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction( DxbcSrc::R(grad_v_temp), DxbcSrc::R(grad_v_temp, DxbcSrc::kWWWW)); #else - DxbcOpMul(DxbcDest::R(grad_v_temp, grad_mask), - DxbcSrc::R(grad_v_temp), lod_src); + // With SSAA gradient scaling, the scale is separate in each + // direction. + DxbcOpMul( + DxbcDest::R(grad_v_temp, grad_mask), DxbcSrc::R(grad_v_temp), + ssaa_scale_gradients ? DxbcSrc::R(grad_v_temp, DxbcSrc::kWWWW) + : lod_src); #endif } if (instr.dimension == xenos::FetchOpDimension::k1D) { diff --git a/src/xenia/gpu/gpu_flags.cc b/src/xenia/gpu/gpu_flags.cc index 07eff0bc8..9a234712c 100644 --- a/src/xenia/gpu/gpu_flags.cc +++ b/src/xenia/gpu/gpu_flags.cc @@ -40,6 +40,14 @@ DEFINE_bool( "be fully covered when MSAA is used with fullscreen passes.", "GPU"); +DEFINE_bool( + ssaa_scale_gradients, true, + "When using SSAA instead of native MSAA, adjust texture coordinate " + "derivatives used for mipmap selection, and getGradients results, to guest " + "pixels as if true MSAA rather than SSAA was used.\n" + "Reduces bandwidth usage of texture fetching.", + "GPU"); + DEFINE_string( depth_float24_conversion, "", "Method for converting 32-bit Z values to 20e4 floating point when using " diff --git a/src/xenia/gpu/gpu_flags.h b/src/xenia/gpu/gpu_flags.h index 2405dc23c..f1710d107 100644 --- a/src/xenia/gpu/gpu_flags.h +++ b/src/xenia/gpu/gpu_flags.h @@ -22,6 +22,8 @@ DECLARE_bool(gpu_allow_invalid_fetch_constants); DECLARE_bool(half_pixel_offset); +DECLARE_bool(ssaa_scale_gradients); + DECLARE_string(depth_float24_conversion); DECLARE_int32(query_occlusion_fake_sample_count); diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h index 4570f9515..f60b5c6ed 100644 --- a/src/xenia/gpu/ucode.h +++ b/src/xenia/gpu/ucode.h @@ -551,6 +551,12 @@ enum class FetchOpcode : uint32_t { kGetTextureComputedLod = 17, // Source is 2-component. XZ = ddx(source.xy), YW = ddy(source.xy). + // TODO(Triang3l): Verify whether it's coarse or fine (on Adreno 200, for + // instance). This is using the texture unit, where the LOD is computed for + // the whole quad (according to the Direct3D 11.3 specification), so likely + // coarse; ddx / ddy from the Shader Model 4 era is also compiled by FXC to + // deriv_rtx/rty_coarse when targeting Shader Model 5, and on TeraScale, + // coarse / fine selection only appeared on Direct3D 11 GPUs. kGetTextureGradients = 18, // Gets the weights used in a bilinear fetch.