Avoid regex_replace by cutting up HLSL template into two raw strings.

This commit is contained in:
PatrickvL 2019-12-17 15:27:46 +01:00 committed by patrickvl
parent 016f8361b5
commit 9634329033
2 changed files with 183 additions and 181 deletions

View File

@ -1,13 +1,13 @@
// This starts the raw string (comment to get syntax highlighting, UNCOMMENT to compile) :
R"DELIMITER(
// Xbox HLSL vertex shader (template populated at runtime)
// This starts the raw string (comment to get syntax highlighting, UNCOMMENT to compile) :
R"DELIMITER(// Xbox HLSL vertex shader (template populated at runtime)
// Vertex shader input : 16 float4 values, declared with the TEXCOORD semantic
struct VS_INPUT
{
float4 v[16] : TEXCOORD;
};
// Output registers
struct VS_OUTPUT
struct VS_OUTPUT
{
float4 oPos : POSITION; // Homogeneous clip space position
float4 oD0 : COLOR0; // Primary color (front-facing)
@ -21,98 +21,98 @@ struct VS_OUTPUT
float4 oT2 : TEXCOORD2; // Texture coordinate set 2
float4 oT3 : TEXCOORD3; // Texture coordinate set 3
};
#define X_D3DSCM_CORRECTION 96 // Add 96 to arrive at the range 0..191 (instead of -96..95)
#define X_D3DVS_CONSTREG_COUNT 192
// Xbox constant registers
#define X_D3DSCM_CORRECTION 96 // Add 96 to arrive at the range 0..191 (instead of -96..95)
#define X_D3DVS_CONSTREG_COUNT 192
// Xbox constant registers
uniform float4 C[X_D3DVS_CONSTREG_COUNT] : register(c0);
// Vertex input overrides for SetVertexData4f support
uniform float4 vOverrideValue[16] : register(c192);
uniform float4 vOverridePacked[4] : register(c208);
uniform float4 xboxViewportScale : register(c212);
uniform float4 xboxViewportOffset : register(c213);
// Overloaded casts, assuring all inputs are treated as float4
float4 _tof4(float src) { return float4(src, src, src, src); }
float4 _tof4(float2 src) { return src.xyyy; }
float4 _tof4(float3 src) { return src.xyzz; }
float4 _tof4(float4 src) { return src; }
float4 _ssss(float s) { return float4(s, s, s, s); } // a scalar output replicated across a 4-component vector
// Vertex input overrides for SetVertexData4f support
uniform float4 vOverrideValue[16] : register(c192);
uniform float4 vOverridePacked[4] : register(c208);
uniform float4 xboxViewportScale : register(c212);
uniform float4 xboxViewportOffset : register(c213);
// Overloaded casts, assuring all inputs are treated as float4
float4 _tof4(float src) { return float4(src, src, src, src); }
float4 _tof4(float2 src) { return src.xyyy; }
float4 _tof4(float3 src) { return src.xyzz; }
float4 _tof4(float4 src) { return src; }
float4 _ssss(float s) { return float4(s, s, s, s); } // a scalar output replicated across a 4-component vector
#define _scalar(src) _tof4(src).x /* a scalar input */
float4 c(int register_number)
{
float4 c(int register_number)
{
// Map Xbox [-96, 95] to Host [0, 191]
// Account for Xbox's negative constant indexes
register_number += X_D3DSCM_CORRECTION;
if (register_number < 0)
return 0;
if (register_number >= X_D3DVS_CONSTREG_COUNT) // X_D3DVS_CONSTREG_COUNT
return 0;
return C[register_number];
}
if (register_number < 0)
return 0;
if (register_number >= X_D3DVS_CONSTREG_COUNT) // X_D3DVS_CONSTREG_COUNT
return 0;
return C[register_number];
}
// Due to rounding differences with the Xbox (and increased precision on PC?)
// some titles produce values just below the threshold of the next integer.
// We can add a small bias to make sure it's bumped over the threshold
// Test Case: Azurik (divides indexes 755, then scales them back in the vertex shader)
#define BIAS 0.0001
// TODO : Use 0.001 like xqemu?
// 2.14.1.11 Vertex Program Floating Point Requirements
// The floor operations used by the ARL and EXP instructions must
// operate identically. Specifically, the EXP instruction's floor(t.x)
// intermediate result must exactly match the integer stored in the
// address register by the ARL instruction.
float x_floor(float src)
{
return floor(src + BIAS);
}
// http://xboxdevwiki.net/NV2A/Vertex_Shader
// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_vertex_program.txt
// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_vertex_program1_1.txt
#define BIAS 0.0001
// TODO : Use 0.001 like xqemu?
// 2.14.1.11 Vertex Program Floating Point Requirements
// The floor operations used by the ARL and EXP instructions must
// operate identically. Specifically, the EXP instruction's floor(t.x)
// intermediate result must exactly match the integer stored in the
// address register by the ARL instruction.
float x_floor(float src)
{
return floor(src + BIAS);
}
// http://xboxdevwiki.net/NV2A/Vertex_Shader
// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_vertex_program.txt
// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_vertex_program1_1.txt
// Functions for MAC ('Multiply And Accumulate') opcodes
// 2.14.1.10.1 ARL: Address Register Load
// 2.14.1.10.1 ARL: Address Register Load
// The address register should be floored
#define x_arl(dest, mask, src0) dest.mask = x_floor(_tof4(src0).x).mask
#define x_arl(dest, mask, src0) dest.mask = x_floor(_tof4(src0).x).mask
// 2.14.1.10.2 MOV: Move
#define x_mov(dest, mask, src0) dest.mask = (_tof4(src0)).mask
// 2.14.1.10.3 MUL: Multiply
#define x_mul(dest, mask, src0, src1) dest.mask = (_tof4(src0) * _tof4(src1)).mask
#define x_mul(dest, mask, src0, src1) dest.mask = (_tof4(src0) * _tof4(src1)).mask
// 2.14.1.10.4 ADD: Add
#define x_add(dest, mask, src0, src1) dest.mask = (_tof4(src0) + _tof4(src1)).mask
#define x_add(dest, mask, src0, src1) dest.mask = (_tof4(src0) + _tof4(src1)).mask
// 2.14.1.10.5 MAD: Multiply and Add
#define x_mad(dest, mask, src0, src1, src2) dest.mask = (_tof4(src0) * _tof4(src1) + _tof4(src2)).mask
// 2.14.1.10.8 DP3: Three-Component Dot Product
#define x_dp3(dest, mask, src0, src1) dest.mask = _ssss(dot(_tof4(src0).xyz, _tof4(src1).xyz)).mask
// 2.14.1.10.9 DP4: Four-Component Dot Product
#define x_dp4(dest, mask, src0, src1) dest.mask = _ssss(dot(_tof4(src0), _tof4(src1))).mask
#define x_mad(dest, mask, src0, src1, src2) dest.mask = (_tof4(src0) * _tof4(src1) + _tof4(src2)).mask
// 2.14.1.10.8 DP3: Three-Component Dot Product
#define x_dp3(dest, mask, src0, src1) dest.mask = _ssss(dot(_tof4(src0).xyz, _tof4(src1).xyz)).mask
// 2.14.1.10.9 DP4: Four-Component Dot Product
#define x_dp4(dest, mask, src0, src1) dest.mask = _ssss(dot(_tof4(src0), _tof4(src1))).mask
// 2.14.1.10.10 DST: Distance Vector
#define x_dst(dest, mask, src0, src1) dest.mask = dst(_tof4(src0), _tof4(src1)).mask /* equals { dest.x = 1; dest.y = src0.y * src1.y; dest.z = src0.z; dest.w = src1.w; } */
#define x_dst(dest, mask, src0, src1) dest.mask = dst(_tof4(src0), _tof4(src1)).mask /* equals { dest.x = 1; dest.y = src0.y * src1.y; dest.z = src0.z; dest.w = src1.w; } */
// 2.14.1.10.11 MIN: Minimum
#define x_min(dest, mask, src0, src1) dest.mask = min(_tof4(src0), _tof4(src1)).mask
#define x_min(dest, mask, src0, src1) dest.mask = min(_tof4(src0), _tof4(src1)).mask
// 2.14.1.10.12 MAX: Maximum
#define x_max(dest, mask, src0, src1) dest.mask = max(_tof4(src0), _tof4(src1)).mask
#define x_max(dest, mask, src0, src1) dest.mask = max(_tof4(src0), _tof4(src1)).mask
// 2.14.1.10.13 SLT: Set On Less Than
#define x_slt(dest, mask, src0, src1) dest.mask = _slt(_tof4(src0), _tof4(src1)).mask
#define x_slt(dest, mask, src0, src1) dest.mask = _slt(_tof4(src0), _tof4(src1)).mask
float4 _slt(float4 src0, float4 src1)
{
float4 dest;
@ -124,7 +124,7 @@ float4 _slt(float4 src0, float4 src1)
}
// 2.14.1.10.14 SGE: Set On Greater or Equal Than
#define x_sge(dest, mask, src0, src1) dest.mask = _sge(_tof4(src0), _tof4(src1)).mask
#define x_sge(dest, mask, src0, src1) dest.mask = _sge(_tof4(src0), _tof4(src1)).mask
float4 _sge(float4 src0, float4 src1)
{
float4 dest;
@ -134,92 +134,92 @@ float4 _sge(float4 src0, float4 src1)
dest.w = (src0.w >= src1.w) ? 1 : 0;
return dest;
}
// 2.14.1.10.18 DPH: Homogeneous Dot Product
#define x_dph(dest, mask, src0, src1) dest.mask = _ssss(_dph(_tof4(src0), _tof4(src1))).mask
float _dph(float4 src0, float4 src1)
{
return dot(src0.xyz, src1.xyz) + src1.w;
}
// 2.14.1.10.18 DPH: Homogeneous Dot Product
#define x_dph(dest, mask, src0, src1) dest.mask = _ssss(_dph(_tof4(src0), _tof4(src1))).mask
float _dph(float4 src0, float4 src1)
{
return dot(src0.xyz, src1.xyz) + src1.w;
}
// Xbox ILU Functions
// 2.14.1.10.6 RCP: Reciprocal
#define x_rcp(dest, mask, src0) dest.mask = _ssss(_rcp(_scalar(src0))).mask
float _rcp(float src)
{
#if 0 // TODO : Enable
if (src == 1) return 1;
if (src == 0) return 1.#INF;
#endif
// 2.14.1.10.6 RCP: Reciprocal
#define x_rcp(dest, mask, src0) dest.mask = _ssss(_rcp(_scalar(src0))).mask
float _rcp(float src)
{
#if 0 // TODO : Enable
if (src == 1) return 1;
if (src == 0) return 1.#INF;
#endif
return 1/ src;
}
}
// 2.14.1.10.7 RSQ: Reciprocal Square Root
#define x_rsq(dest, mask, src0) dest.mask = _ssss(_rsq(_scalar(src0))).mask
float _rsq(float src)
{
float a = abs(src);
#if 0 // TODO : Enable
if (a == 1) return 1;
if (a == 0) return 1.#INF;
#endif
#define x_rsq(dest, mask, src0) dest.mask = _ssss(_rsq(_scalar(src0))).mask
float _rsq(float src)
{
float a = abs(src);
#if 0 // TODO : Enable
if (a == 1) return 1;
if (a == 0) return 1.#INF;
#endif
return rsqrt(a);
}
}
// 2.14.1.10.15 EXP: Exponential Base 2
#define x_expp(dest, mask, src0) dest.mask = _expp(_scalar(src0)).mask
#define x_expp(dest, mask, src0) dest.mask = _expp(_scalar(src0)).mask
float4 _expp(float src)
{
float floor_src = x_floor(src);
float4 dest;
float4 dest;
dest.x = exp2(floor_src);
dest.y = src - floor_src;
dest.z = exp2(src);
dest.w = 1;
return dest;
return dest;
}
// 2.14.1.10.16 LOG: Logarithm Base 2
#define x_logp(dest, mask, src0) dest.mask = _logp(_scalar(src0)).mask
#define x_logp(dest, mask, src0) dest.mask = _logp(_scalar(src0)).mask
float4 _logp(float src)
{
{
float4 dest;
#if 0 // TODO : Enable
float t = abs(src);
if (t != 0) {
if (t == 1.#INF) {
dest.x = 1.#INF;
dest.y = 1;
dest.z = 1.#INF;
} else {
#endif
float exponent = floor(log2(src)); // TODO : x_floor
float mantissa = 1 / exp2(exponent);
float z = log2(src); // TODO : exponent + log2(mantissa); // TODO : Or log2(t)?
// TODO : float exponent = frexp(src + BIAS, /*out*/mantissa);
#if 0 // TODO : Enable
float t = abs(src);
if (t != 0) {
if (t == 1.#INF) {
dest.x = 1.#INF;
dest.y = 1;
dest.z = 1.#INF;
} else {
#endif
float exponent = floor(log2(src)); // TODO : x_floor
float mantissa = 1 / exp2(exponent);
float z = log2(src); // TODO : exponent + log2(mantissa); // TODO : Or log2(t)?
// TODO : float exponent = frexp(src + BIAS, /*out*/mantissa);
dest.x = exponent;
dest.y = mantissa;
dest.z = z;
#if 0
}
#if 0
}
} else {
dest.x = -1.#INF;
dest.y = 1;
dest.z = -1.#INF;
}
#endif
dest.x = -1.#INF;
dest.y = 1;
dest.z = -1.#INF;
}
#endif
dest.w = 1;
return dest;
}
// 2.14.1.10.17 LIT: Light Coefficients
// 2.14.1.10.17 LIT: Light Coefficients
#define x_lit(dest, mask, src) dest.mask = _lit(_tof4(src)).mask
float4 _lit(float4 src0)
{
const float epsilon = 1.0f / 256.0f;
const float epsilon = 1.0f / 256.0f;
float diffuse = src0.x;
float blinn = src0.y;
@ -228,15 +228,15 @@ float4 _lit(float4 src0)
float4 dest;
dest.x = 1;
dest.y = max(0, diffuse);
dest.z = diffuse > 0 ? exp2(specPower * log(blinn)) : 0;
dest.z = diffuse > 0 ? exp2(specPower * log(blinn)) : 0;
// TODO : Use dest.z = (diffuse > 0) && (blinn > 0) ? pow(blinn, specPower) : 0;
dest.w = 1;
return dest;
}
// 2.14.1.10.19 RCC: Reciprocal Clamped
#define x_rcc(dest, mask, src0) dest.mask = _ssss(_rcc(_scalar(src0))).mask
// 2.14.1.10.19 RCC: Reciprocal Clamped
#define x_rcc(dest, mask, src0) dest.mask = _ssss(_rcc(_scalar(src0))).mask
float _rcc(float src)
{
// Calculate the reciprocal
@ -247,20 +247,20 @@ float _rcc(float src)
? clamp(r, 5.42101e-020f, 1.84467e+019f) // the IEEE 32-bit binary values 0x1F800000 and 0x5F800000
: clamp(r, -1.84467e+019f, -5.42101e-020f); // the IEEE 32-bit binary values 0xDF800000 and 0x9F800000
}
float4 reverseScreenspaceTransform(float4 oPos)
{
// On Xbox, oPos should contain the vertex position in screenspace
// We need to reverse this transformation
// We need to reverse this transformation
// Conventionally, each Xbox Vertex Shader includes instructions like this
// mul oPos.xyz, r12, c-38
// +rcc r1.x, r12.w
// mad oPos.xyz, r12, r1.x, c-37
// where c-37 and c-38 are reserved transform values
// where c-37 and c-38 are reserved transform values
oPos.xyz -= xboxViewportOffset.xyz; // reverse offset
oPos.xyz -= xboxViewportOffset.xyz; // reverse offset
oPos.xyz *= oPos.w; // reverse perspective divide
oPos.xyz /= xboxViewportScale.xyz; // reverse scale
oPos.xyz /= xboxViewportScale.xyz; // reverse scale
return oPos;
}
@ -270,7 +270,7 @@ VS_OUTPUT main(const VS_INPUT xIn)
// Output variables
float4 oPos, oD0, oD1, oB0, oB1, oT0, oT1, oT2, oT3;
oPos = oD0 = oD1 = oB0 = oB1 = oT0 = oT1 = oT2 = oT3 = float4(0, 0, 0, 1); // Pre-initialize w component of outputs to 1
// Single component outputs
float4 oFog, oPts; // x is write-only on Xbox. Use float4 as some games use incorrect masks
oFog = oPts = 0;
@ -283,37 +283,38 @@ VS_OUTPUT main(const VS_INPUT xIn)
r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = r8 = r9 = r10 = r11 = float4(0, 0, 0, 0);
#define r12 oPos // oPos and r12 are two ways of accessing the same register on Xbox
// Input registers
float4 v[16];
# define v0 v[0]
# define v1 v[1]
# define v2 v[2]
# define v3 v[3]
# define v4 v[4]
# define v5 v[5]
# define v6 v[6]
# define v7 v[7]
# define v8 v[8]
# define v9 v[9]
# define v10 v[10]
# define v11 v[11]
# define v12 v[12]
# define v13 v[13]
# define v14 v[14]
# define v15 v[15]
// View 4 packed overrides as an array of 16 floats
float vOverride[16] = (float[16])vOverridePacked;
// Initialize input registers from the vertex buffer
// Or use an override value set with SetVertexData4f
for(int i = 0; i < 16; i++){
v[i] = vOverride[i] ? vOverrideValue[i] : xIn.v[i];
}
// Input registers
float4 v[16];
# define v0 v[0]
# define v1 v[1]
# define v2 v[2]
# define v3 v[3]
# define v4 v[4]
# define v5 v[5]
# define v6 v[6]
# define v7 v[7]
# define v8 v[8]
# define v9 v[9]
# define v10 v[10]
# define v11 v[11]
# define v12 v[12]
# define v13 v[13]
# define v14 v[14]
# define v15 v[15]
// View 4 packed overrides as an array of 16 floats
float vOverride[16] = (float[16])vOverridePacked;
// Initialize input registers from the vertex buffer
// Or use an override value set with SetVertexData4f
for(int i = 0; i < 16; i++){
v[i] = vOverride[i] ? vOverrideValue[i] : xIn.v[i];
}
// Xbox shader program)DELIMITER", /* This terminates the header raw string" // */
R"DELIMITER(
// Xbox shader program
// <Xbox Shader>
// Copy variables to output struct
VS_OUTPUT xOut;
@ -332,4 +333,4 @@ VS_OUTPUT main(const VS_INPUT xIn)
return xOut;
}
// End of vertex shader )DELIMITER" /* This terminates the raw string" // */
// End of vertex shader footer)DELIMITER" /* This terminates the footer raw string" // */

View File

@ -36,7 +36,6 @@
#include "XbD3D8Types.h" // For X_D3DVSDE_*
#include <sstream>
#include <regex>
#include <unordered_map>
#include <array>
#include <bitset>
@ -1663,9 +1662,10 @@ extern HRESULT EmuRecompileVshFunction
if (!SUCCEEDED(hRet)) return hRet;
static std::string hlsl_template =
#include "core\hle\D3D8\Direct3D9\CxbxVertexShaderTemplate.hlsl" // Note : This included .hlsl defines a raw string
;
// Include HLSL header and footer as raw strings :
static std::string hlsl_template[2] = {
#include "core\hle\D3D8\Direct3D9\CxbxVertexShaderTemplate.hlsl"
};
// Decode the vertex shader program tokens into an intermediate representation
pToken = (uint32_t*)((uintptr_t)pXboxFunction + sizeof(XTL::X_VSH_SHADER_HEADER));
@ -1678,6 +1678,7 @@ extern HRESULT EmuRecompileVshFunction
*pXboxFunctionSize = (intptr_t)pToken - (intptr_t)pXboxFunction;
auto hlsl_stream = std::stringstream();
hlsl_stream << hlsl_template[0]; // Start with the HLSL template header
if (!VshDecoder.BuildShader(hlsl_stream)) {
// Do not attempt to compile empty shaders
// This is a declaration only shader, so there is no function to recompile
@ -1685,8 +1686,8 @@ extern HRESULT EmuRecompileVshFunction
return D3D_OK;
}
hlsl_stream << hlsl_template[1]; // Finish with the HLSL template footer
std::string hlsl_str = hlsl_stream.str();
hlsl_str = std::regex_replace(hlsl_template, std::regex("// <Xbox Shader>"), hlsl_str, std::regex_constants::format_first_only);
DbgVshPrintf("--- HLSL conversion ---\n");
DbgVshPrintf(DebugPrependLineNumbers(hlsl_str).c_str());