Initial port of zfreeze branch (3.5-1729)

Initial port of the original zfreeze branch (3.5-1729) by neobrain into
the most recent build of Dolphin.

Makes Rogue Squadron 2 very playable at full speed thanks to recent core
speedups made to Dolphin. Works on the DirectX video plugin only for now.

Enjoy, and Merry Xmas!!
This commit is contained in:
NanoByte011 2014-12-25 00:34:22 -07:00 committed by Scott Mansell
parent 4984215971
commit 937844b9e3
12 changed files with 173 additions and 4 deletions

View File

@ -178,9 +178,51 @@ void VertexManager::vFlush(bool useDstAlpha)
}
u32 stride = VertexLoaderManager::GetCurrentVertexFormat()->GetVertexStride();
u32 indices = IndexGenerator::GetIndexLen();
PrepareDrawBuffers(stride);
if (!bpmem.genMode.zfreeze && indices >= 3)
{
	// While zfreeze is off, remember the depth plane of the last triangle in this
	// batch so that it can be reused if zfreeze is enabled in the following flush.
	float vtx[9];
	float out[12];
	// Lookup vertices of the last rendered triangle and software-transform them.
	for (unsigned int i = 0; i < 3; ++i)
	{
		const int base_index = GetIndexBuffer()[indices - 3 + i];
		u8* vtx_ptr = &((u8*)GetVertexBuffer())[base_index * stride];
		// The first three floats of the vertex are read as its position.
		// NOTE(review): assumes every vertex format stores a float3 position first - confirm.
		vtx[0 + i * 3] = ((float*)vtx_ptr)[0];
		vtx[1 + i * 3] = ((float*)vtx_ptr)[1];
		vtx[2 + i * 3] = ((float*)vtx_ptr)[2];
		VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]);
		// Perspective divide followed by the viewport scale.
		// viewport offset ignored because we only look at coordinate differences.
		out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd;
		out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht;
		out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ;
	}
	// Edge deltas of the triangle (x, y and depth).
	float dx31 = out[8] - out[0];
	float dx12 = out[0] - out[4];
	float dy12 = out[1] - out[5];
	float dy31 = out[9] - out[1];
	float DF31 = out[10] - out[2];
	float DF21 = out[6] - out[2];
	// (a, b, c) is the cross product of the two edges, i.e. the plane normal.
	float a = DF31 * -dy12 - DF21 * dy31;
	float b = dx31 * DF21 + dx12 * DF31;
	float c = -dx12 * dy31 - dx31 * -dy12;
	// A triangle with zero area in x/y yields c == 0. Dividing by it would feed
	// inf/NaN slopes to the pixel shader, so keep the previously stored slope instead.
	if (c != 0.0f)
	{
		float slope_dfdx = -a / c;
		float slope_dfdy = -b / c;
		float slope_f0 = out[2];
		PixelShaderManager::SetZSlopeChanged(slope_dfdx, slope_dfdy, slope_f0);
	}
}
VertexLoaderManager::GetCurrentVertexFormat()->SetupVertexPointers();
g_renderer->ApplyState(useDstAlpha);

View File

@ -22,6 +22,7 @@ public:
protected:
virtual void ResetBuffer(u32 stride) override;
// Returns a pointer to the first element of LocalIBuffer (16-bit indices).
u16* GetIndexBuffer() { return &LocalIBuffer[0]; }
// Returns a pointer to the first byte of LocalVBuffer (raw vertex data).
u8* GetVertexBuffer() { return &LocalVBuffer[0]; }
private:

View File

@ -42,6 +42,13 @@ static size_t s_index_offset;
VertexManager::VertexManager()
{
	// Size both local buffers up front; the shared buffer pointers set below
	// point into the vertex storage, so it must be allocated before they are set.
	LocalVBuffer.resize(MAXVBUFFERSIZE);
	LocalIBuffer.resize(MAXIBUFFERSIZE);

	u8* const vertex_begin = LocalVBuffer.data();
	s_pBaseBufferPointer = vertex_begin;
	s_pCurBufferPointer = vertex_begin;
	s_pEndBufferPointer = vertex_begin + LocalVBuffer.size();

	CreateDeviceObjects();
}
@ -131,6 +138,7 @@ void VertexManager::vFlush(bool useDstAlpha)
{
GLVertexFormat *nativeVertexFmt = (GLVertexFormat*)VertexLoaderManager::GetCurrentVertexFormat();
u32 stride = nativeVertexFmt->GetVertexStride();
u32 indices = IndexGenerator::GetIndexLen();
if (m_last_vao != nativeVertexFmt->VAO)
{
@ -140,6 +148,47 @@ void VertexManager::vFlush(bool useDstAlpha)
PrepareDrawBuffers(stride);
if (!bpmem.genMode.zfreeze && indices >= 3)
{
	// While zfreeze is off, remember the depth plane of the last triangle in this
	// batch so that it can be reused if zfreeze is enabled in the following flush.
	float vtx[9];
	float out[12];
	// Lookup vertices of the last rendered triangle and software-transform them.
	for (unsigned int i = 0; i < 3; ++i)
	{
		const int base_index = GetIndexBuffer()[indices - 3 + i];
		u8* vtx_ptr = &((u8*)GetVertexBuffer())[base_index * stride];
		// The first three floats of the vertex are read as its position.
		// NOTE(review): assumes every vertex format stores a float3 position first - confirm.
		vtx[0 + i * 3] = ((float*)vtx_ptr)[0];
		vtx[1 + i * 3] = ((float*)vtx_ptr)[1];
		vtx[2 + i * 3] = ((float*)vtx_ptr)[2];
		VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]);
		// Perspective divide followed by the viewport scale.
		// viewport offset ignored because we only look at coordinate differences.
		out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd;
		out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht;
		out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ;
	}
	// Edge deltas of the triangle (x, y and depth).
	float dx31 = out[8] - out[0];
	float dx12 = out[0] - out[4];
	float dy12 = out[1] - out[5];
	float dy31 = out[9] - out[1];
	float DF31 = out[10] - out[2];
	float DF21 = out[6] - out[2];
	// (a, b, c) is the cross product of the two edges, i.e. the plane normal.
	float a = DF31 * -dy12 - DF21 * dy31;
	float b = dx31 * DF21 + dx12 * DF31;
	float c = -dx12 * dy31 - dx31 * -dy12;
	// A triangle with zero area in x/y yields c == 0. Dividing by it would feed
	// inf/NaN slopes to the pixel shader, so keep the previously stored slope instead.
	if (c != 0.0f)
	{
		float slope_dfdx = -a / c;
		float slope_dfdy = -b / c;
		float slope_f0 = out[2];
		PixelShaderManager::SetZSlopeChanged(slope_dfdx, slope_dfdy, slope_f0);
	}
}
// Makes sure we can actually do Dual source blending
bool dualSourcePossible = g_ActiveConfig.backend_info.bSupportsDualSourceBlend;

View File

@ -42,10 +42,15 @@ public:
GLuint m_last_vao;
protected:
virtual void ResetBuffer(u32 stride) override;
// Start of the std::vector-backed index storage (LocalIBuffer).
u16* GetIndexBuffer() { return LocalIBuffer.data(); }
// Start of the std::vector-backed vertex storage (LocalVBuffer).
u8* GetVertexBuffer() { return LocalVBuffer.data(); }
private:
void Draw(u32 stride);
void vFlush(bool useDstAlpha) override;
void PrepareDrawBuffers(u32 stride);
std::vector<u8> LocalVBuffer;
std::vector<u16> LocalIBuffer;
};
}

View File

@ -23,6 +23,7 @@ struct PixelShaderConstants
int4 fogcolor;
int4 fogi;
float4 fogf[2];
float4 zslope;
};
struct VertexShaderConstants

View File

@ -228,6 +228,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
"\tint4 " I_FOGCOLOR";\n"
"\tint4 " I_FOGI";\n"
"\tfloat4 " I_FOGF"[2];\n"
"\tfloat4 " I_ZSLOPE";\n"
"};\n");
if (g_ActiveConfig.bEnablePixelLighting)
@ -269,7 +270,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
out.Write("};\n");
const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED);
const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z);
const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) || bpmem.genMode.zfreeze;
if (forced_early_z)
{
@ -538,10 +539,20 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
uid_data->fast_depth_calc = g_ActiveConfig.bFastDepthCalc;
uid_data->early_ztest = bpmem.UseEarlyDepthTest();
uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
uid_data->zfreeze = bpmem.genMode.zfreeze;
// Note: z-textures are not written to depth buffer if early depth test is used
if (per_pixel_depth && bpmem.UseEarlyDepthTest())
out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");
{
if (bpmem.genMode.zfreeze)
{
out.Write("\tdepth = " I_ZSLOPE".z + " I_ZSLOPE".x * (clipPos.x / clipPos.w) + " I_ZSLOPE".y * (clipPos.y / clipPos.w);\n");
}
else
{
out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");
}
}
// Note: depth texture output is only written to depth buffer if late depth test is used
// theoretical final depth value is used for fog calculation, though, so we have to emulate ztextures anyway
@ -555,7 +566,16 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
}
if (per_pixel_depth && bpmem.UseLateDepthTest())
out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");
{
if (bpmem.genMode.zfreeze)
{
out.Write("\tdepth = " I_ZSLOPE".z + " I_ZSLOPE".x * (clipPos.x / clipPos.w) + " I_ZSLOPE".y * (clipPos.y / clipPos.w);\n");
}
else
{
out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");
}
}
if (dstAlphaMode == DSTALPHA_ALPHA_PASS)
{

View File

@ -21,8 +21,9 @@
#define C_FOGCOLOR (C_INDTEXMTX + 6) //27
#define C_FOGI (C_FOGCOLOR + 1) //28
#define C_FOGF (C_FOGI + 1) //29
#define C_ZSLOPE (C_FOGF + 1) //30
#define C_PENVCONST_END (C_FOGF + 2)
#define C_PENVCONST_END (C_ZSLOPE + 2)
// Different ways to achieve rendering with destination alpha
enum DSTALPHA_MODE
@ -62,6 +63,7 @@ struct pixel_shader_uid_data
u32 forced_early_z : 1;
u32 early_ztest : 1;
u32 bounding_box : 1;
u32 zfreeze : 1;
u32 texMtxInfo_n_projection : 8; // 8x1 bit
u32 tevindref_bi0 : 3;

View File

@ -14,6 +14,8 @@
bool PixelShaderManager::s_bFogRangeAdjustChanged;
bool PixelShaderManager::s_bViewPortChanged;
bool PixelShaderManager::s_bZSlopeChanged;
static float zslope[3];
std::array<int4,4> PixelShaderManager::s_tev_color;
std::array<int4,4> PixelShaderManager::s_tev_konst_color;
@ -48,6 +50,7 @@ void PixelShaderManager::Dirty()
SetDestAlpha();
SetZTextureBias();
SetViewportChanged();
SetZSlopeChanged(0, 0, 1);
SetIndTexScaleChanged(false);
SetIndTexScaleChanged(true);
SetIndMatrixChanged(0);
@ -112,6 +115,17 @@ void PixelShaderManager::SetConstants()
dirty = true;
s_bViewPortChanged = false;
}
// Copy the pending z-slope values (stored by SetZSlopeChanged) into the
// constant block, then flag the block dirty and clear the pending flag.
if (s_bZSlopeChanged)
{
constants.zslope[0] = zslope[0];
constants.zslope[1] = zslope[1];
constants.zslope[2] = zslope[2];
// Only three values are tracked; the fourth component is always written as zero.
constants.zslope[3] = 0;
dirty = true;
s_bZSlopeChanged = false;
}
}
void PixelShaderManager::SetTevColor(int index, int component, s32 value)
@ -168,6 +182,14 @@ void PixelShaderManager::SetViewportChanged()
s_bFogRangeAdjustChanged = true; // TODO: Shouldn't be necessary with an accurate fog range adjust implementation
}
// Stores a new depth plane (x slope, y slope, base depth) and marks it as
// pending so that the next SetConstants() call copies it into the constants.
void PixelShaderManager::SetZSlopeChanged(float dfdx, float dfdy, float f0)
{
	const float plane[3] = { dfdx, dfdy, f0 };
	for (int component = 0; component < 3; ++component)
		zslope[component] = plane[component];

	s_bZSlopeChanged = true;
}
void PixelShaderManager::SetIndTexScaleChanged(bool high)
{
constants.indtexscale[high][0] = bpmem.texscale[high].ss0;

View File

@ -36,6 +36,7 @@ public:
static void SetTexDims(int texmapid, u32 width, u32 height, u32 wraps, u32 wrapt);
static void SetZTextureBias();
static void SetViewportChanged();
static void SetZSlopeChanged(float dfdx, float dfdy, float f0);
static void SetIndMatrixChanged(int matrixidx);
static void SetTevKSelChanged(int id);
static void SetZTextureTypeChanged();
@ -50,6 +51,7 @@ public:
static bool s_bFogRangeAdjustChanged;
static bool s_bViewPortChanged;
static bool s_bZSlopeChanged;
// These colors aren't available from global BP state,
// hence we keep a copy of them around.

View File

@ -291,6 +291,7 @@ static inline void AssignVSOutputMembers(T& object, const char* a, const char* b
#define I_FOGCOLOR "cfogcolor"
#define I_FOGI "cfogi"
#define I_FOGF "cfogf"
#define I_ZSLOPE "czslope"
#define I_POSNORMALMATRIX "cpnmtx"
#define I_PROJECTION "cproj"

View File

@ -690,6 +690,24 @@ void VertexShaderManager::ResetView()
bProjectionChanged = true;
}
// Software-transforms one position: first by the position matrix selected by
// the current PosNormalMtxIdx, then by g_fProjectionMatrix.
// data: 3 input floats (x, y, z); out: 4 output floats (x, y, z, w).
void VertexShaderManager::TransformToClipSpace(const float* data, float *out)
{
	// Each matrix row is 4 floats wide; the index selects the first of three rows.
	const float* const pos_rows = (const float *)xfmem.posMatrices + g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4;
	const float* const proj_rows = &g_fProjectionMatrix[0];

	// 3x4 affine transform: three rows, translation in the fourth column.
	float transformed[3];
	for (int row = 0; row < 3; ++row)
	{
		const float* const m = pos_rows + row * 4;
		transformed[row] = data[0] * m[0] + data[1] * m[1] + data[2] * m[2] + m[3];
	}

	// TODO: this requires g_fProjectionMatrix to be up to date, which is not really a good design decision.
	// 4x4 projection applied to (transformed, 1).
	for (int row = 0; row < 4; ++row)
	{
		const float* const p = proj_rows + row * 4;
		out[row] = transformed[0] * p[0] + transformed[1] * p[1] + transformed[2] * p[2] + p[3];
	}
}
void VertexShaderManager::DoState(PointerWrap &p)
{
p.Do(g_fProjectionMatrix);

View File

@ -34,6 +34,12 @@ public:
static void RotateView(float x, float y);
static void ResetView();
// data: 3 floats representing the X, Y and Z vertex model coordinates
// out: 4 floats which will be initialized with the corresponding clip space coordinates
// NOTE: g_fProjectionMatrix must be up to date when this is called
// (i.e. VertexShaderManager::SetConstants needs to be called before using this!)
static void TransformToClipSpace(const float* data, float *out);
static VertexShaderConstants constants;
static bool dirty;
};