[GPU] Scale gradients by SSAA factor

This commit is contained in:
Triang3l 2020-12-20 16:35:15 +03:00
parent e6fa0ad139
commit c7fbe0e6d5
4 changed files with 97 additions and 12 deletions

View File

@ -17,6 +17,7 @@
#include "xenia/base/math.h"
#include "xenia/base/string.h"
#include "xenia/gpu/dxbc_shader_translator.h"
#include "xenia/gpu/gpu_flags.h"
namespace xe {
namespace gpu {
@ -630,21 +631,44 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
if (instr.opcode == FetchOpcode::kGetTextureGradients) {
// Handle before doing anything that actually needs the texture.
bool grad_operand_temp_pushed = false;
DxbcSrc grad_operand =
LoadOperand(instr.operands[0], 0b0011, grad_operand_temp_pushed);
if (used_result_components & 0b0101) {
DxbcOpDerivRTXFine(
DxbcDest::R(system_temp_result_, used_result_components & 0b0101),
DxbcSrc grad_operand = LoadOperand(
instr.operands[0],
((used_result_nonzero_components & 0b0011) ? 0b0001 : 0) |
((used_result_nonzero_components & 0b1100) ? 0b0010 : 0),
grad_operand_temp_pushed);
if (used_result_nonzero_components & 0b0101) {
DxbcOpDerivRTXCoarse(DxbcDest::R(system_temp_result_,
used_result_nonzero_components & 0b0101),
grad_operand.SwizzleSwizzled(0b010000));
}
if (used_result_components & 0b1010) {
DxbcOpDerivRTYFine(
DxbcDest::R(system_temp_result_, used_result_components & 0b1010),
if (used_result_nonzero_components & 0b1010) {
DxbcOpDerivRTYCoarse(DxbcDest::R(system_temp_result_,
used_result_nonzero_components & 0b1010),
grad_operand.SwizzleSwizzled(0b01000000));
}
if (grad_operand_temp_pushed) {
PopSystemTemp();
}
if (!edram_rov_used_ && cvars::ssaa_scale_gradients) {
// Scale the gradients to guest pixels with SSAA.
uint32_t ssaa_scale_temp = PushSystemTemp();
system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
DxbcOpMovC(DxbcDest::R(ssaa_scale_temp,
(used_result_nonzero_components & 0b0011) |
(used_result_nonzero_components >> 2)),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_SampleCountLog2_Vec,
kSysConst_SampleCountLog2_Comp |
((kSysConst_SampleCountLog2_Comp + 1) << 2)),
DxbcSrc::LF(2.0f), DxbcSrc::LF(1.0f));
DxbcOpMul(
DxbcDest::R(system_temp_result_, used_result_nonzero_components),
DxbcSrc::R(system_temp_result_),
DxbcSrc::R(ssaa_scale_temp, 0b01000100));
// Release ssaa_scale_temp.
PopSystemTemp();
}
StoreResult(instr.result, DxbcSrc::R(system_temp_result_));
return;
}
@ -1387,12 +1411,27 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
DxbcOpExp(lod_dest, lod_src);
// FIXME(Triang3l): Gradient exponent adjustment is currently not done
// in getCompTexLOD, so don't do it here too.
bool ssaa_scale_gradients =
!instr.attributes.use_register_gradients && !edram_rov_used_ &&
cvars::ssaa_scale_gradients;
#if 0
// Extract gradient exponent biases from the fetch constant and merge
// them with the LOD bias.
DxbcOpIBFE(DxbcDest::R(grad_h_lod_temp, 0b0011), DxbcSrc::LU(5),
DxbcSrc::LU(22, 27, 0, 0),
RequestTextureFetchConstantWord(tfetch_index, 4));
if (ssaa_scale_gradients) {
// Adjust the gradient scales to include the SSAA scale.
system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
DxbcOpIAdd(DxbcDest::R(grad_h_lod_temp, 0b0011),
DxbcSrc::R(grad_h_lod_temp),
DxbcSrc::CB(
cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_SampleCountLog2_Vec,
kSysConst_SampleCountLog2_Comp |
((kSysConst_SampleCountLog2_Comp + 1) << 2)));
}
DxbcOpIMAd(DxbcDest::R(grad_h_lod_temp, 0b0011),
DxbcSrc::R(grad_h_lod_temp), DxbcSrc::LI(int32_t(1) << 23),
DxbcSrc::LF(1.0f));
@ -1400,6 +1439,32 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
DxbcSrc::R(grad_h_lod_temp, DxbcSrc::kYYYY));
DxbcOpMul(lod_dest, lod_src,
DxbcSrc::R(grad_h_lod_temp, DxbcSrc::kXXXX));
#else
if (ssaa_scale_gradients) {
// Adjust the gradient scales in each direction to include the SSAA
// scale - for ddy scale, grad_v_temp.w, not grad_h_lod_temp.w, must
// be used.
// ddy.
system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
DxbcOpMovC(DxbcDest::R(grad_v_temp, 0b1000),
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_SampleCountLog2_Vec)
.Select(kSysConst_SampleCountLog2_Comp + 1),
DxbcSrc::LF(2.0f), DxbcSrc::LF(1.0f));
DxbcOpMul(DxbcDest::R(grad_v_temp, 0b1000), lod_src,
DxbcSrc::R(grad_v_temp, DxbcSrc::kWWWW));
// ddx (after ddy handling, because the ddy code uses lod_src, and
// it's being overwritten now).
system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
DxbcOpIf(true,
DxbcSrc::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_SampleCountLog2_Vec)
.Select(kSysConst_SampleCountLog2_Comp));
DxbcOpMul(lod_dest, lod_src, DxbcSrc::LF(2.0f));
DxbcOpEndIf();
}
#endif
// Obtain the gradients and apply biases to them.
if (instr.attributes.use_register_gradients) {
@ -1458,8 +1523,12 @@ void DxbcShaderTranslator::ProcessTextureFetchInstruction(
DxbcSrc::R(grad_v_temp),
DxbcSrc::R(grad_v_temp, DxbcSrc::kWWWW));
#else
DxbcOpMul(DxbcDest::R(grad_v_temp, grad_mask),
DxbcSrc::R(grad_v_temp), lod_src);
// With SSAA gradient scaling, the scale is separate in each
// direction.
DxbcOpMul(
DxbcDest::R(grad_v_temp, grad_mask), DxbcSrc::R(grad_v_temp),
ssaa_scale_gradients ? DxbcSrc::R(grad_v_temp, DxbcSrc::kWWWW)
: lod_src);
#endif
}
if (instr.dimension == xenos::FetchOpDimension::k1D) {

View File

@ -40,6 +40,14 @@ DEFINE_bool(
"be fully covered when MSAA is used with fullscreen passes.",
"GPU");
// Configuration variable (category "GPU", enabled by default) that makes the
// shader translator rescale texture coordinate derivatives when SSAA is used
// in place of native MSAA. The DXBC translator reads it as
// cvars::ssaa_scale_gradients and only applies the scaling when its
// edram_rov_used_ flag is false.
DEFINE_bool(
ssaa_scale_gradients, true,
"When using SSAA instead of native MSAA, adjust texture coordinate "
"derivatives used for mipmap selection, and getGradients results, to guest "
"pixels as if true MSAA rather than SSAA was used.\n"
"Reduces bandwidth usage of texture fetching.",
"GPU");
DEFINE_string(
depth_float24_conversion, "",
"Method for converting 32-bit Z values to 20e4 floating point when using "

View File

@ -22,6 +22,8 @@ DECLARE_bool(gpu_allow_invalid_fetch_constants);
DECLARE_bool(half_pixel_offset);
DECLARE_bool(ssaa_scale_gradients);
DECLARE_string(depth_float24_conversion);
DECLARE_int32(query_occlusion_fake_sample_count);

View File

@ -551,6 +551,12 @@ enum class FetchOpcode : uint32_t {
kGetTextureComputedLod = 17,
// Source is 2-component. XZ = ddx(source.xy), YW = ddy(source.xy).
// TODO(Triang3l): Verify whether the derivatives are coarse or fine (on
// Adreno 200, for instance). This instruction uses the texture unit, where
// the LOD is computed for the whole quad (according to the Direct3D 11.3
// specification), so they are likely coarse. Additionally, ddx / ddy from the
// Shader Model 4 era are compiled by FXC to deriv_rtx/rty_coarse when
// targeting Shader Model 5, and on TeraScale, coarse / fine selection only
// appeared on Direct3D 11 GPUs.
kGetTextureGradients = 18,
// Gets the weights used in a bilinear fetch.