From 9634329033402f02ab3759b3daf8985f01fd7f11 Mon Sep 17 00:00:00 2001 From: PatrickvL Date: Tue, 17 Dec 2019 15:27:46 +0100 Subject: [PATCH] Avoid regex_replace by cutting up HLSL template into two raw strings. --- .../Direct3D9/CxbxVertexShaderTemplate.hlsl | 353 +++++++++--------- src/core/hle/D3D8/XbVertexShader.cpp | 11 +- 2 files changed, 183 insertions(+), 181 deletions(-) diff --git a/src/core/hle/D3D8/Direct3D9/CxbxVertexShaderTemplate.hlsl b/src/core/hle/D3D8/Direct3D9/CxbxVertexShaderTemplate.hlsl index 9a582ab70..569ea6127 100644 --- a/src/core/hle/D3D8/Direct3D9/CxbxVertexShaderTemplate.hlsl +++ b/src/core/hle/D3D8/Direct3D9/CxbxVertexShaderTemplate.hlsl @@ -1,13 +1,13 @@ -// This starts the raw string (comment to get syntax highlighting, UNCOMMENT to compile) : -R"DELIMITER( -// Xbox HLSL vertex shader (template populated at runtime) +// This starts the raw string (comment to get syntax highlighting, UNCOMMENT to compile) : +R"DELIMITER(// Xbox HLSL vertex shader (template populated at runtime) + struct VS_INPUT { float4 v[16] : TEXCOORD; }; // Output registers -struct VS_OUTPUT +struct VS_OUTPUT { float4 oPos : POSITION; // Homogeneous clip space position float4 oD0 : COLOR0; // Primary color (front-facing) @@ -21,98 +21,98 @@ struct VS_OUTPUT float4 oT2 : TEXCOORD2; // Texture coordinate set 2 float4 oT3 : TEXCOORD3; // Texture coordinate set 3 }; - -#define X_D3DSCM_CORRECTION 96 // Add 96 to arrive at the range 0..191 (instead of -96..95) -#define X_D3DVS_CONSTREG_COUNT 192 - -// Xbox constant registers + +#define X_D3DSCM_CORRECTION 96 // Add 96 to arrive at the range 0..191 (instead of -96..95) +#define X_D3DVS_CONSTREG_COUNT 192 + +// Xbox constant registers uniform float4 C[X_D3DVS_CONSTREG_COUNT] : register(c0); - -// Vertex input overrides for SetVertexData4f support -uniform float4 vOverrideValue[16] : register(c192); -uniform float4 vOverridePacked[4] : register(c208); - -uniform float4 xboxViewportScale : register(c212); -uniform float4 xboxViewportOffset : register(c213); - -// Overloaded casts, assuring all inputs are treated as float4 -float4 _tof4(float src) { return float4(src, src, src, src); } -float4 _tof4(float2 src) { return src.xyyy; } -float4 _tof4(float3 src) { return src.xyzz; } -float4 _tof4(float4 src) { return src; } -float4 _ssss(float s) { return float4(s, s, s, s); } // a scalar output replicated across a 4-component vector + +// Vertex input overrides for SetVertexData4f support +uniform float4 vOverrideValue[16] : register(c192); +uniform float4 vOverridePacked[4] : register(c208); + +uniform float4 xboxViewportScale : register(c212); +uniform float4 xboxViewportOffset : register(c213); + +// Overloaded casts, assuring all inputs are treated as float4 +float4 _tof4(float src) { return float4(src, src, src, src); } +float4 _tof4(float2 src) { return src.xyyy; } +float4 _tof4(float3 src) { return src.xyzz; } +float4 _tof4(float4 src) { return src; } +float4 _ssss(float s) { return float4(s, s, s, s); } // a scalar output replicated across a 4-component vector #define _scalar(src) _tof4(src).x /* a scalar input */ - -float4 c(int register_number) -{ + +float4 c(int register_number) +{ // Map Xbox [-96, 95] to Host [0, 191] // Account for Xbox's negative constant indexes register_number += X_D3DSCM_CORRECTION; - if (register_number < 0) - return 0; - - if (register_number >= X_D3DVS_CONSTREG_COUNT) // X_D3DVS_CONSTREG_COUNT - return 0; - - return C[register_number]; -} - + if (register_number < 0) + return 0; + + if (register_number >= X_D3DVS_CONSTREG_COUNT) // X_D3DVS_CONSTREG_COUNT + return 0; + + return C[register_number]; +} + // Due to rounding differences with the Xbox (and increased precision on PC?) // some titles produce values just below the threshold of the next integer. // We can add a small bias to make sure it's bumped over the threshold // Test Case: Azurik (divides indexes 755, then scales them back in the vertex shader) -#define BIAS 0.0001 -// TODO : Use 0.001 like xqemu? - -// 2.14.1.11 Vertex Program Floating Point Requirements -// The floor operations used by the ARL and EXP instructions must -// operate identically. Specifically, the EXP instruction's floor(t.x) -// intermediate result must exactly match the integer stored in the -// address register by the ARL instruction. -float x_floor(float src) -{ - return floor(src + BIAS); -} - -// http://xboxdevwiki.net/NV2A/Vertex_Shader -// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_vertex_program.txt -// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_vertex_program1_1.txt +#define BIAS 0.0001 +// TODO : Use 0.001 like xqemu? + +// 2.14.1.11 Vertex Program Floating Point Requirements +// The floor operations used by the ARL and EXP instructions must +// operate identically. Specifically, the EXP instruction's floor(t.x) +// intermediate result must exactly match the integer stored in the +// address register by the ARL instruction. +float x_floor(float src) +{ + return floor(src + BIAS); +} + +// http://xboxdevwiki.net/NV2A/Vertex_Shader +// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_vertex_program.txt +// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_vertex_program1_1.txt // Functions for MAC ('Multiply And Accumulate') opcodes - -// 2.14.1.10.1 ARL: Address Register Load + +// 2.14.1.10.1 ARL: Address Register Load // The address register should be floored -#define x_arl(dest, mask, src0) dest.mask = x_floor(_tof4(src0).x).mask - +#define x_arl(dest, mask, src0) dest.mask = x_floor(_tof4(src0).x).mask + // 2.14.1.10.2 MOV: Move #define x_mov(dest, mask, src0) dest.mask = (_tof4(src0)).mask - + // 2.14.1.10.3 MUL: Multiply -#define x_mul(dest, mask, src0, src1) dest.mask = (_tof4(src0) * _tof4(src1)).mask - +#define x_mul(dest, mask, src0, src1) dest.mask = (_tof4(src0) * _tof4(src1)).mask + // 2.14.1.10.4 ADD: Add -#define x_add(dest, mask, src0, src1) dest.mask = (_tof4(src0) + _tof4(src1)).mask - +#define x_add(dest, mask, src0, src1) dest.mask = (_tof4(src0) + _tof4(src1)).mask + // 2.14.1.10.5 MAD: Multiply and Add -#define x_mad(dest, mask, src0, src1, src2) dest.mask = (_tof4(src0) * _tof4(src1) + _tof4(src2)).mask - -// 2.14.1.10.8 DP3: Three-Component Dot Product -#define x_dp3(dest, mask, src0, src1) dest.mask = _ssss(dot(_tof4(src0).xyz, _tof4(src1).xyz)).mask - -// 2.14.1.10.9 DP4: Four-Component Dot Product -#define x_dp4(dest, mask, src0, src1) dest.mask = _ssss(dot(_tof4(src0), _tof4(src1))).mask - +#define x_mad(dest, mask, src0, src1, src2) dest.mask = (_tof4(src0) * _tof4(src1) + _tof4(src2)).mask + +// 2.14.1.10.8 DP3: Three-Component Dot Product +#define x_dp3(dest, mask, src0, src1) dest.mask = _ssss(dot(_tof4(src0).xyz, _tof4(src1).xyz)).mask + +// 2.14.1.10.9 DP4: Four-Component Dot Product +#define x_dp4(dest, mask, src0, src1) dest.mask = _ssss(dot(_tof4(src0), _tof4(src1))).mask + // 2.14.1.10.10 DST: Distance Vector -#define x_dst(dest, mask, src0, src1) dest.mask = dst(_tof4(src0), _tof4(src1)).mask /* equals { dest.x = 1; dest.y = src0.y * src1.y; dest.z = src0.z; dest.w = src1.w; } */ - +#define x_dst(dest, mask, src0, src1) dest.mask = dst(_tof4(src0), _tof4(src1)).mask /* equals { dest.x = 1; dest.y = src0.y * src1.y; dest.z = src0.z; dest.w = src1.w; } */ + // 2.14.1.10.11 MIN: Minimum -#define x_min(dest, mask, src0, src1) dest.mask = min(_tof4(src0), _tof4(src1)).mask - +#define x_min(dest, mask, src0, src1) dest.mask = min(_tof4(src0), _tof4(src1)).mask + // 2.14.1.10.12 MAX: Maximum -#define x_max(dest, mask, src0, src1) dest.mask = max(_tof4(src0), _tof4(src1)).mask - +#define x_max(dest, mask, src0, src1) dest.mask = max(_tof4(src0), _tof4(src1)).mask + // 2.14.1.10.13 SLT: Set On Less Than -#define x_slt(dest, mask, src0, src1) dest.mask = _slt(_tof4(src0), _tof4(src1)).mask +#define x_slt(dest, mask, src0, src1) dest.mask = _slt(_tof4(src0), _tof4(src1)).mask float4 _slt(float4 src0, float4 src1) { float4 dest; @@ -124,7 +124,7 @@ float4 _slt(float4 src0, float4 src1) } // 2.14.1.10.14 SGE: Set On Greater or Equal Than -#define x_sge(dest, mask, src0, src1) dest.mask = _sge(_tof4(src0), _tof4(src1)).mask +#define x_sge(dest, mask, src0, src1) dest.mask = _sge(_tof4(src0), _tof4(src1)).mask float4 _sge(float4 src0, float4 src1) { float4 dest; @@ -134,92 +134,92 @@ float4 _sge(float4 src0, float4 src1) dest.w = (src0.w >= src1.w) ? 1 : 0; return dest; } - -// 2.14.1.10.18 DPH: Homogeneous Dot Product -#define x_dph(dest, mask, src0, src1) dest.mask = _ssss(_dph(_tof4(src0), _tof4(src1))).mask -float _dph(float4 src0, float4 src1) -{ - return dot(src0.xyz, src1.xyz) + src1.w; -} - + +// 2.14.1.10.18 DPH: Homogeneous Dot Product +#define x_dph(dest, mask, src0, src1) dest.mask = _ssss(_dph(_tof4(src0), _tof4(src1))).mask +float _dph(float4 src0, float4 src1) +{ + return dot(src0.xyz, src1.xyz) + src1.w; +} + // Xbox ILU Functions - -// 2.14.1.10.6 RCP: Reciprocal -#define x_rcp(dest, mask, src0) dest.mask = _ssss(_rcp(_scalar(src0))).mask -float _rcp(float src) -{ -#if 0 // TODO : Enable - if (src == 1) return 1; - if (src == 0) return 1.#INF; -#endif + +// 2.14.1.10.6 RCP: Reciprocal +#define x_rcp(dest, mask, src0) dest.mask = _ssss(_rcp(_scalar(src0))).mask +float _rcp(float src) +{ +#if 0 // TODO : Enable + if (src == 1) return 1; + if (src == 0) return 1.#INF; +#endif return 1/ src; -} - +} + // 2.14.1.10.7 RSQ: Reciprocal Square Root -#define x_rsq(dest, mask, src0) dest.mask = _ssss(_rsq(_scalar(src0))).mask -float _rsq(float src) -{ - float a = abs(src); -#if 0 // TODO : Enable - if (a == 1) return 1; - if (a == 0) return 1.#INF; -#endif +#define x_rsq(dest, mask, src0) dest.mask = _ssss(_rsq(_scalar(src0))).mask +float _rsq(float src) +{ + float a = abs(src); +#if 0 // TODO : Enable + if (a == 1) return 1; + if (a == 0) return 1.#INF; +#endif return rsqrt(a); -} - +} + // 2.14.1.10.15 EXP: Exponential Base 2 -#define x_expp(dest, mask, src0) dest.mask = _expp(_scalar(src0)).mask +#define x_expp(dest, mask, src0) dest.mask = _expp(_scalar(src0)).mask float4 _expp(float src) { float floor_src = x_floor(src); - - float4 dest; + + float4 dest; dest.x = exp2(floor_src); dest.y = src - floor_src; dest.z = exp2(src); dest.w = 1; - return dest; + return dest; } - + // 2.14.1.10.16 LOG: Logarithm Base 2 -#define x_logp(dest, mask, src0) dest.mask = _logp(_scalar(src0)).mask +#define x_logp(dest, mask, src0) dest.mask = _logp(_scalar(src0)).mask float4 _logp(float src) -{ +{ float4 dest; -#if 0 // TODO : Enable - float t = abs(src); - if (t != 0) { - if (t == 1.#INF) { - dest.x = 1.#INF; - dest.y = 1; - dest.z = 1.#INF; - } else { -#endif - float exponent = floor(log2(src)); // TODO : x_floor - float mantissa = 1 / exp2(exponent); - float z = log2(src); // TODO : exponent + log2(mantissa); // TODO : Or log2(t)? - // TODO : float exponent = frexp(src + BIAS, /*out*/mantissa); +#if 0 // TODO : Enable + float t = abs(src); + if (t != 0) { + if (t == 1.#INF) { + dest.x = 1.#INF; + dest.y = 1; + dest.z = 1.#INF; + } else { +#endif + float exponent = floor(log2(src)); // TODO : x_floor + float mantissa = 1 / exp2(exponent); + float z = log2(src); // TODO : exponent + log2(mantissa); // TODO : Or log2(t)? + // TODO : float exponent = frexp(src + BIAS, /*out*/mantissa); dest.x = exponent; dest.y = mantissa; dest.z = z; -#if 0 - } +#if 0 + } } else { - dest.x = -1.#INF; - dest.y = 1; - dest.z = -1.#INF; - } -#endif + dest.x = -1.#INF; + dest.y = 1; + dest.z = -1.#INF; + } +#endif dest.w = 1; return dest; } - -// 2.14.1.10.17 LIT: Light Coefficients + +// 2.14.1.10.17 LIT: Light Coefficients #define x_lit(dest, mask, src) dest.mask = _lit(_tof4(src)).mask float4 _lit(float4 src0) { - const float epsilon = 1.0f / 256.0f; + const float epsilon = 1.0f / 256.0f; float diffuse = src0.x; float blinn = src0.y; @@ -228,15 +228,15 @@ float4 _lit(float4 src0) float4 dest; dest.x = 1; dest.y = max(0, diffuse); - dest.z = diffuse > 0 ? exp2(specPower * log(blinn)) : 0; + dest.z = diffuse > 0 ? exp2(specPower * log(blinn)) : 0; // TODO : Use dest.z = (diffuse > 0) && (blinn > 0) ? pow(blinn, specPower) : 0; dest.w = 1; - + return dest; } -// 2.14.1.10.19 RCC: Reciprocal Clamped -#define x_rcc(dest, mask, src0) dest.mask = _ssss(_rcc(_scalar(src0))).mask +// 2.14.1.10.19 RCC: Reciprocal Clamped +#define x_rcc(dest, mask, src0) dest.mask = _ssss(_rcc(_scalar(src0))).mask float _rcc(float src) { // Calculate the reciprocal @@ -247,20 +247,20 @@ float _rcc(float src) ? clamp(r, 5.42101e-020f, 1.84467e+019f) // the IEEE 32-bit binary values 0x1F800000 and 0x5F800000 : clamp(r, -1.84467e+019f, -5.42101e-020f); // the IEEE 32-bit binary values 0xDF800000 and 0x9F800000 } - + float4 reverseScreenspaceTransform(float4 oPos) { // On Xbox, oPos should contain the vertex position in screenspace - // We need to reverse this transformation + // We need to reverse this transformation // Conventionally, each Xbox Vertex Shader includes instructions like this // mul oPos.xyz, r12, c-38 // +rcc r1.x, r12.w // mad oPos.xyz, r12, r1.x, c-37 - // where c-37 and c-38 are reserved transform values + // where c-37 and c-38 are reserved transform values - oPos.xyz -= xboxViewportOffset.xyz; // reverse offset + oPos.xyz -= xboxViewportOffset.xyz; // reverse offset oPos.xyz *= oPos.w; // reverse perspective divide - oPos.xyz /= xboxViewportScale.xyz; // reverse scale + oPos.xyz /= xboxViewportScale.xyz; // reverse scale return oPos; } @@ -270,7 +270,7 @@ VS_OUTPUT main(const VS_INPUT xIn) // Output variables float4 oPos, oD0, oD1, oB0, oB1, oT0, oT1, oT2, oT3; oPos = oD0 = oD1 = oB0 = oB1 = oT0 = oT1 = oT2 = oT3 = float4(0, 0, 0, 1); // Pre-initialize w component of outputs to 1 - + // Single component outputs float4 oFog, oPts; // x is write-only on Xbox. Use float4 as some games use incorrect masks oFog = oPts = 0; @@ -283,37 +283,38 @@ VS_OUTPUT main(const VS_INPUT xIn) r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = r8 = r9 = r10 = r11 = float4(0, 0, 0, 0); #define r12 oPos // oPos and r12 are two ways of accessing the same register on Xbox - // Input registerss - float4 v[16]; - # define v0 v[0] - # define v1 v[1] - # define v2 v[2] - # define v3 v[3] - # define v4 v[4] - # define v5 v[5] - # define v6 v[6] - # define v7 v[7] - # define v8 v[8] - # define v9 v[9] - # define v10 v[10] - # define v11 v[11] - # define v12 v[12] - # define v13 v[13] - # define v14 v[14] - # define v15 v[15] - - // View 4 packed overrides as an array of 16 floats - float vOverride[16] = (float[16])vOverridePacked; - - // Initialize input registers from the vertex buffer - // Or use an override value set with SetVertexData4f - for(int i = 0; i < 16; i++){ - v[i] = vOverride[i] ? vOverrideValue[i] : xIn.v[i]; - } + // Input registerss + float4 v[16]; + # define v0 v[0] + # define v1 v[1] + # define v2 v[2] + # define v3 v[3] + # define v4 v[4] + # define v5 v[5] + # define v6 v[6] + # define v7 v[7] + # define v8 v[8] + # define v9 v[9] + # define v10 v[10] + # define v11 v[11] + # define v12 v[12] + # define v13 v[13] + # define v14 v[14] + # define v15 v[15] + + // View 4 packed overrides as an array of 16 floats + float vOverride[16] = (float[16])vOverridePacked; + + // Initialize input registers from the vertex buffer + // Or use an override value set with SetVertexData4f + for(int i = 0; i < 16; i++){ + v[i] = vOverride[i] ? vOverrideValue[i] : xIn.v[i]; + } + + // Xbox shader program)DELIMITER", /* This terminates the header raw string" // */ + +R"DELIMITER( - // Xbox shader program -// - // Copy variables to output struct VS_OUTPUT xOut; @@ -332,4 +333,4 @@ VS_OUTPUT main(const VS_INPUT xIn) return xOut; } -// End of vertex shader )DELIMITER" /* This terminates the raw string" // */ +// End of vertex shader footer)DELIMITER" /* This terminates the footer raw string" // */ diff --git a/src/core/hle/D3D8/XbVertexShader.cpp b/src/core/hle/D3D8/XbVertexShader.cpp index dc1fe276a..91de74208 100644 --- a/src/core/hle/D3D8/XbVertexShader.cpp +++ b/src/core/hle/D3D8/XbVertexShader.cpp @@ -36,7 +36,6 @@ #include "XbD3D8Types.h" // For X_D3DVSDE_* #include -#include #include #include #include @@ -1663,9 +1662,10 @@ extern HRESULT EmuRecompileVshFunction if (!SUCCEEDED(hRet)) return hRet; - static std::string hlsl_template = - #include "core\hle\D3D8\Direct3D9\CxbxVertexShaderTemplate.hlsl" // Note : This included .hlsl defines a raw string - ; + // Include HLSL header and footer as raw strings : + static std::string hlsl_template[2] = { + #include "core\hle\D3D8\Direct3D9\CxbxVertexShaderTemplate.hlsl" + }; // Decode the vertex shader program tokens into an intermediate representation pToken = (uint32_t*)((uintptr_t)pXboxFunction + sizeof(XTL::X_VSH_SHADER_HEADER)); @@ -1678,6 +1678,7 @@ extern HRESULT EmuRecompileVshFunction *pXboxFunctionSize = (intptr_t)pToken - (intptr_t)pXboxFunction; auto hlsl_stream = std::stringstream(); + hlsl_stream << hlsl_template[0]; // Start with the HLSL template header if (!VshDecoder.BuildShader(hlsl_stream)) { // Do not attempt to compile empty shaders // This is a declaration only shader, so there is no function to recompile @@ -1685,8 +1686,8 @@ extern HRESULT EmuRecompileVshFunction return D3D_OK; } + hlsl_stream << hlsl_template[1]; // Finish with the HLSL template footer std::string hlsl_str = hlsl_stream.str(); - hlsl_str = std::regex_replace(hlsl_template, std::regex("// "), hlsl_str, std::regex_constants::format_first_only); DbgVshPrintf("--- HLSL conversion ---\n"); DbgVshPrintf(DebugPrependLineNumbers(hlsl_str).c_str());