diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index ffc9044916..b2a1166fd6 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -381,7 +381,7 @@ void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type, // Declare samplers out.Write("SamplerState samp[8] : register(s0);\n" "\n" - "Texture2DArray Tex[8] : register(t0);\n"); + "Texture2DArray tex[8] : register(t0);\n"); } out.Write("\n"); @@ -428,7 +428,9 @@ void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type, "#define bpmem_tevind(i) (bpmem_pack1[(i)].z)\n" "#define bpmem_iref(i) (bpmem_pack1[(i)].w)\n" "#define bpmem_tevorder(i) (bpmem_pack2[(i)].x)\n" - "#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)\n\n"); + "#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)\n" + "#define samp_texmode0(i) (bpmem_pack2[(i)].z)\n" + "#define samp_texmode1(i) (bpmem_pack2[(i)].w)\n\n"); if (host_config.per_pixel_lighting) { @@ -534,14 +536,183 @@ void UpdateBoundingBox(float2 rawpos) {{ )", fmt::arg("efb_height", EFB_HEIGHT), fmt::arg("efb_scale", I_EFBSCALE)); } + + { + if (api_type == APIType::OpenGL || api_type == APIType::Vulkan) + { + out.Write(R"( +int4 readTexture(in sampler2DArray tex, uint u, uint v, int layer, int lod) {{ + return iround(texelFetch(tex, int3(u, v, layer), lod) * 255.0); +}} + +int4 readTextureLinear(in sampler2DArray tex, uint2 uv1, uint2 uv2, int layer, int lod, int2 frac_uv) {{)"); + } + else if (api_type == APIType::D3D) + { + out.Write(R"( +int4 readTexture(in Texture2DArray tex, uint u, uint v, int layer, int lod) {{ + return iround(tex.Load(int4(u, v, layer, lod)) * 255.0); +}} + +int4 readTextureLinear(in Texture2DArray tex, uint2 uv1, uint2 uv2, int layer, int lod, int2 frac_uv) {{)"); + } + + out.Write(R"( + int4 result = + readTexture(tex, uv1.x, uv1.y, layer, lod) * (128 - frac_uv.x) * (128 - frac_uv.y) + + readTexture(tex, uv2.x, uv1.y, layer, lod) * ( frac_uv.x) * (128 - frac_uv.y) + + readTexture(tex, uv1.x, uv2.y, layer, lod) * (128 - frac_uv.x) * ( frac_uv.y) + + readTexture(tex, uv2.x, uv2.y, layer, lod) * ( frac_uv.x) * ( frac_uv.y); + return result >> 14; +}} +)"); + + out.Write(R"( +uint WrapCoord(int coord, uint wrap, int size) {{ + switch (wrap) {{ + case {:s}: + default: // confirmed that clamp is used for invalid (3) via hardware test + return uint(clamp(coord, 0, size - 1)); + case {:s}: + return uint(coord & (size - 1)); + case {:s}: + if ((coord & size) != 0) {{ + coord = ~coord; + }} + return uint(coord & (size - 1)); + }} +}} +)", + WrapMode::Clamp, WrapMode::Repeat, WrapMode::Mirror); + } + + if (api_type == APIType::OpenGL || api_type == APIType::Vulkan) + { + out.Write("\nint4 sampleTexture(uint texmap, in sampler2DArray tex, int2 uv, int layer) {{\n"); + } + else if (api_type == APIType::D3D) + { + out.Write("\nint4 sampleTexture(uint texmap, in Texture2DArray tex, in SamplerState tex_samp, " + "int2 uv, int layer) {{\n"); + } + + { + out.Write(R"( + uint texmode0 = samp_texmode0(texmap); + uint texmode1 = samp_texmode1(texmap); + int size_s = )" I_TEXDIMS R"([texmap].x; + int size_t = )" I_TEXDIMS R"([texmap].y; + + uint wrap_s = {}; + uint wrap_t = {}; + bool mag_linear = {} != 0u; + bool mipmap_linear = {} != 0u; + bool min_linear = {} != 0u; + bool diag_lod = {} != 0u; + int lod_bias = {}; + // uint max_aniso = TODO; + bool lod_clamp = {} != 0u; + int min_lod = int({}); + int max_lod = int({}); +)", + BitfieldExtract<&SamplerState::TM0::wrap_u>("texmode0"), + BitfieldExtract<&SamplerState::TM0::wrap_v>("texmode0"), + BitfieldExtract<&SamplerState::TM0::mag_filter>("texmode0"), + BitfieldExtract<&SamplerState::TM0::mipmap_filter>("texmode0"), + BitfieldExtract<&SamplerState::TM0::min_filter>("texmode0"), + BitfieldExtract<&SamplerState::TM0::diag_lod>("texmode0"), + BitfieldExtract<&SamplerState::TM0::lod_bias>("texmode0"), + // BitfieldExtract<&SamplerState::TM0::max_aniso>("texmode0"), + BitfieldExtract<&SamplerState::TM0::lod_clamp>("texmode0"), + BitfieldExtract<&SamplerState::TM1::min_lod>("texmode1"), + BitfieldExtract<&SamplerState::TM1::max_lod>("texmode1")); + + if (api_type == APIType::OpenGL || api_type == APIType::Vulkan) + out.Write(R"( + float2 uv_delta_x = abs(dFdx(float2(uv))); + float2 uv_delta_y = abs(dFdy(float2(uv))); +)"); + else if (api_type == APIType::D3D) + out.Write(R"( + float2 uv_delta_x = abs(ddx(float2(uv))); + float2 uv_delta_y = abs(ddy(float2(uv))); +)"); + + // TODO: LOD bias is normally S2.5 (Dolphin uses S7.8 for arbitrary mipmap detection and higher + // IRs), but (at least per the software renderer) actual LOD is S28.4. How does this work? + // Also, note that we can make some assumptions due to use of a SamplerState version of the BP + // configuration, which tidies things compared to whatever nonsense games can put in. + // TODO: This doesn't support diagonal LOD + out.Write(R"( + float2 uv_delta = max(uv_delta_x, uv_delta_y); + float max_delta = max(uv_delta.x / 128.0, uv_delta.y / 128.0); + // log2(x) is undefined if x <= 0, but in practice it seems log2(0) is -infinity, which becomes INT_MIN. + // If lod_bias is negative, adding it to INT_MIN causes an underflow, resulting in a large positive value. + // Hardware testing indicates that min_lod should be used when the derivative is 0. + int lod = max_delta == 0.0 ? min_lod : int(floor(log2(max_delta) * 16.0)) + (lod_bias >> 4); + + bool is_linear = (lod > 0) ? min_linear : mag_linear; + lod = clamp(lod, min_lod, max_lod); + int base_lod = lod >> 4; + int frac_lod = lod & 15; + if (!mipmap_linear && frac_lod >= 8) {{ + // Round to nearest LOD in point mode + base_lod++; + }} + + if (is_linear) {{ + uint2 texuv1 = uint2( + WrapCoord(((uv.x >> base_lod) - 64) >> 7, wrap_s, size_s >> base_lod), + WrapCoord(((uv.y >> base_lod) - 64) >> 7, wrap_t, size_t >> base_lod)); + uint2 texuv2 = uint2( + WrapCoord(((uv.x >> base_lod) + 64) >> 7, wrap_s, size_s >> base_lod), + WrapCoord(((uv.y >> base_lod) + 64) >> 7, wrap_t, size_t >> base_lod)); + int2 frac_uv = int2(((uv.x >> base_lod) - 64) & 0x7f, ((uv.y >> base_lod) - 64) & 0x7f); + + int4 result = readTextureLinear(tex, texuv1, texuv2, layer, base_lod, frac_uv); + + if (frac_lod != 0 && mipmap_linear) {{ + texuv1 = uint2( + WrapCoord(((uv.x >> (base_lod + 1)) - 64) >> 7, wrap_s, size_s >> (base_lod + 1)), + WrapCoord(((uv.y >> (base_lod + 1)) - 64) >> 7, wrap_t, size_t >> (base_lod + 1))); + texuv2 = uint2( + WrapCoord(((uv.x >> (base_lod + 1)) + 64) >> 7, wrap_s, size_s >> (base_lod + 1)), + WrapCoord(((uv.y >> (base_lod + 1)) + 64) >> 7, wrap_t, size_t >> (base_lod + 1))); + frac_uv = int2(((uv.x >> (base_lod + 1)) - 64) & 0x7f, ((uv.y >> (base_lod + 1)) - 64) & 0x7f); + + result *= 16 - frac_lod; + result += readTextureLinear(tex, texuv1, texuv2, layer, base_lod + 1, frac_uv) * frac_lod; + result >>= 4; + }} + + return result; + }} else {{ + uint2 texuv = uint2( + WrapCoord(uv.x >> (7 + base_lod), wrap_s, size_s >> base_lod), + WrapCoord(uv.y >> (7 + base_lod), wrap_t, size_t >> base_lod)); + + int4 result = readTexture(tex, texuv.x, texuv.y, layer, base_lod); + + if (frac_lod != 0 && mipmap_linear) {{ + texuv = uint2( + WrapCoord(uv.x >> (7 + base_lod + 1), wrap_s, size_s >> (base_lod + 1)), + WrapCoord(uv.y >> (7 + base_lod + 1), wrap_t, size_t >> (base_lod + 1))); + + result *= 16 - frac_lod; + result += readTexture(tex, texuv.x, texuv.y, layer, base_lod + 1) * frac_lod; + result >>= 4; + }} + return result; + }} +}} +)"); + } } static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, int n, APIType api_type, bool stereo); static void WriteTevRegular(ShaderCode& out, std::string_view components, TevBias bias, TevOp op, bool clamp, TevScale scale, bool alpha); -static void SampleTexture(ShaderCode& out, std::string_view texcoords, std::string_view texswap, - int texmap, bool stereo, APIType api_type); static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType api_type, bool per_pixel_depth, bool use_dual_source); static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data); @@ -568,6 +739,17 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos WriteBitfieldExtractHeader(out, api_type, host_config); WritePixelShaderCommonHeader(out, api_type, host_config, uid_data->bounding_box); + if (api_type == APIType::OpenGL || api_type == APIType::Vulkan) + { + out.Write("\n#define sampleTextureWrapper(texmap, uv, layer) " + "sampleTexture(texmap, samp[texmap], uv, layer)\n"); + } + else if (api_type == APIType::D3D) + { + out.Write("\n#define sampleTextureWrapper(texmap, uv, layer) " + "sampleTexture(texmap, tex[texmap], samp[texmap], uv, layer)\n"); + } + if (uid_data->forced_early_z && g_ActiveConfig.backend_info.bSupportsEarlyZ) { // Zcomploc (aka early_ztest) is a way to control whether depth test is done before @@ -755,6 +937,8 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos out.Write(",\n in uint layer : SV_RenderTargetArrayIndex\n"); out.Write(" ) {{\n"); } + if (!stereo) + out.Write("\tint layer = 0;\n"); out.Write("\tint4 c0 = " I_COLORS "[1], c1 = " I_COLORS "[2], c2 = " I_COLORS "[3], prev = " I_COLORS "[0];\n" @@ -835,8 +1019,8 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos out.Write("\ttempcoord = fixpoint_uv{} >> " I_INDTEXSCALE "[{}].{};\n", texcoord, i / 2, (i & 1) ? "zw" : "xy"); - out.Write("\tint3 iindtex{} = ", i); - SampleTexture(out, "float2(tempcoord)", "abg", texmap, stereo, api_type); + out.Write("\tint3 iindtex{0} = sampleTextureWrapper({1}u, tempcoord, layer).abg;\n", i, + texmap); } } @@ -1244,8 +1428,8 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i '\0', }; - out.Write("\ttextemp = "); - SampleTexture(out, "float2(tevcoord.xy)", texswap, stage.tevorders_texmap, stereo, api_type); + out.Write("\ttextemp = sampleTextureWrapper({0}u, tevcoord.xy, layer).{1};\n", + stage.tevorders_texmap, texswap); } else if (uid_data->genMode_numtexgens == 0) { @@ -1429,25 +1613,6 @@ static void WriteTevRegular(ShaderCode& out, std::string_view components, TevBia out.Write("){}", tev_scale_table_right[u32(scale)]); } -static void SampleTexture(ShaderCode& out, std::string_view texcoords, std::string_view texswap, - int texmap, bool stereo, APIType api_type) -{ - out.SetConstantsUsed(C_TEXDIMS + texmap, C_TEXDIMS + texmap); - - if (api_type == APIType::D3D) - { - out.Write("iround(255.0 * Tex[{}].Sample(samp[{}], float3({}.xy / float2(" I_TEXDIMS - "[{}].xy * 128), {}))).{};\n", - texmap, texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap); - } - else - { - out.Write("iround(255.0 * texture(samp[{}], float3({}.xy / float2(" I_TEXDIMS - "[{}].xy * 128), {}))).{};\n", - texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap); - } -} - constexpr std::array tev_alpha_funcs_table{ "(false)", // CompareMode::Never "(prev.a < {})", // CompareMode::Less diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp index 833bbe1d0f..36fc6addc1 100644 --- a/Source/Core/VideoCommon/UberShaderPixel.cpp +++ b/Source/Core/VideoCommon/UberShaderPixel.cpp @@ -226,17 +226,17 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, { // Doesn't look like DirectX supports this. Oh well the code path is here just in case it // supports this in the future. - out.Write("int4 sampleTexture(uint sampler_num, float3 uv) {{\n"); + out.Write("int4 sampleTextureWrapper(uint texmap, int2 uv, int layer) {{\n"); if (api_type == APIType::OpenGL || api_type == APIType::Vulkan) - out.Write(" return iround(texture(samp[sampler_num], uv) * 255.0);\n"); + out.Write(" return sampleTexture(texmap, samp[texmap], uv, layer);\n"); else if (api_type == APIType::D3D) - out.Write(" return iround(Tex[sampler_num].Sample(samp[sampler_num], uv) * 255.0);\n"); + out.Write(" return sampleTexture(texmap, tex[texmap], samp[texmap], uv, layer);\n"); out.Write("}}\n\n"); } else { - out.Write("int4 sampleTexture(uint sampler_num, float3 uv) {{\n" - " // This is messy, but DirectX, OpenGL 3.3 and OpenGL ES 3.0 doesn't support " + out.Write("int4 sampleTextureWrapper(uint sampler_num, int2 uv, int layer) {{\n" + " // This is messy, but DirectX, OpenGL 3.3, and OpenGL ES 3.0 don't support " "dynamic indexing of the sampler array\n" " // With any luck the shader compiler will optimise this if the hardware supports " "dynamic indexing.\n" @@ -244,9 +244,14 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, for (int i = 0; i < 8; i++) { if (api_type == APIType::OpenGL || api_type == APIType::Vulkan) - out.Write(" case {}u: return iround(texture(samp[{}], uv) * 255.0);\n", i, i); + { + out.Write(" case {0}u: return sampleTexture({0}u, samp[{0}u], uv, layer);\n", i); + } else if (api_type == APIType::D3D) - out.Write(" case {}u: return iround(Tex[{}].Sample(samp[{}], uv) * 255.0);\n", i, i, i); + { + out.Write(" case {0}u: return sampleTexture({0}u, tex[{0}u], samp[{0}u], uv, layer);\n", + i); + } } out.Write(" }}\n" "}}\n\n"); @@ -284,8 +289,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, // ====================== // Indirect Lookup // ====================== - const auto LookupIndirectTexture = [&out, stereo](std::string_view out_var_name, - std::string_view in_index_name) { + const auto LookupIndirectTexture = [&out](std::string_view out_var_name, + std::string_view in_index_name) { // in_index_name is the indirect stage, not the tev stage // bpmem_iref is packed differently from RAS1_IREF // This function assumes bpmem_iref is nonzero (i.e. matrix is not off, and the @@ -301,11 +306,9 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, " else\n" " fixedPoint_uv = fixedPoint_uv >> " I_INDTEXSCALE "[{} >> 1].zw;\n" "\n" - " {} = sampleTexture(texmap, float3(float2(fixedPoint_uv) / float2(" I_TEXDIMS - "[texmap].xy * 128), {})).abg;\n" - "}}", - in_index_name, in_index_name, in_index_name, in_index_name, out_var_name, - stereo ? "float(layer)" : "0.0"); + " {} = sampleTextureWrapper(texmap, fixedPoint_uv, layer).abg;\n" + "}}\n", + in_index_name, in_index_name, in_index_name, in_index_name, out_var_name); }; // ====================== @@ -729,6 +732,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, out.Write(",\n in uint layer : SV_RenderTargetArrayIndex\n"); out.Write("\n ) {{\n"); } + if (!stereo) + out.Write(" int layer = 0;\n"); out.Write(" int3 tevcoord = int3(0, 0, 0);\n" " State s;\n" @@ -820,7 +825,7 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, // For the undefined case, we just skip applying the indirect operation, which is close enough. // Viewtiful Joe hits the undefined case (bug 12525). // Wrapping and add to previous still apply in this case (and when the stage is disabled). - out.Write(" if (bpmem_iref(bt) != 0u) {{"); + out.Write(" if (bpmem_iref(bt) != 0u) {{\n"); out.Write(" int3 indcoord;\n"); LookupIndirectTexture("indcoord", "bt"); out.Write(" if (bs != 0u)\n" @@ -910,11 +915,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, " uint sampler_num = {};\n", BitfieldExtract<&TwoTevStageOrders::texmap0>("ss.order")); out.Write("\n" - " float2 uv = (float2(tevcoord.xy)) / float2(" I_TEXDIMS - "[sampler_num].xy * 128);\n"); - out.Write(" int4 color = sampleTexture(sampler_num, float3(uv, {}));\n", - stereo ? "float(layer)" : "0.0"); - out.Write(" uint swap = {};\n", + " int4 color = sampleTextureWrapper(sampler_num, tevcoord.xy, layer);\n" + " uint swap = {};\n", BitfieldExtract<&TevStageCombiner::AlphaCombiner::tswap>("ss.ac")); out.Write(" s.TexColor = Swizzle(swap, color);\n"); out.Write(" }} else {{\n"