Initial port of zfreeze branch (3.5-1729)

Initial port of the original zfreeze branch (3.5-1729) by neobrain into the most recent build of Dolphin. Makes Rogue Squadron 2 very playable at full speed thanks to recent core speedups made to Dolphin. Works on the DirectX Video plugin only for now. Enjoy, and Merry Xmas!
2014-12-25 00:34:22 -07:00 · 2014-12-25 00:34:22 -07:00 · 937844b9e3
parent 4984215971
commit 937844b9e3
12 changed files with 173 additions and 4 deletions
--- a/Source/Core/VideoBackends/D3D/VertexManager.cpp
+++ b/Source/Core/VideoBackends/D3D/VertexManager.cpp
@ -178,9 +178,51 @@ void VertexManager::vFlush(bool useDstAlpha)
 	}
 	u32 stride = VertexLoaderManager::GetCurrentVertexFormat()->GetVertexStride();
 	u32 indices = IndexGenerator::GetIndexLen();
 	PrepareDrawBuffers(stride);
 	if (!bpmem.genMode.zfreeze && indices >= 3)
 	{
 		float vtx[9];
 		float out[12];
 		// Lookup vertices of the last rendered triangle and software-transform them
 		// This allows us to determine the depth slope, which will be used if zfreeze
 		// is enabled in the following flush.
 		for (unsigned int i = 0; i < 3; ++i)
 		{
 			// The last three entries of the index buffer select the vertices to inspect.
 			const int base_index = GetIndexBuffer()[indices - 3 + i];
 			u8* vtx_ptr = &((u8*)GetVertexBuffer())[base_index * stride];
 			// NOTE(review): assumes the position is stored as three floats at the start of each vertex - confirm against the vertex format.
 			vtx[0 + i * 3] = ((float*)vtx_ptr)[0];
 			vtx[1 + i * 3] = ((float*)vtx_ptr)[1];
 			vtx[2 + i * 3] = ((float*)vtx_ptr)[2];
 			VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]);
 			// Perspective divide followed by viewport scaling.
 			// viewport offset ignored because we only look at coordinate differences.
 			out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd;
 			out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht;
 			out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ;
 		}
 		// Edge deltas between the three transformed vertices (x, y and depth).
 		float dx31 = out[8] - out[0];
 		float dx12 = out[0] - out[4];
 		float dy12 = out[1] - out[5];
 		float dy31 = out[9] - out[1];
 		float DF31 = out[10] - out[2];
 		float DF21 = out[6] - out[2];
 		float a = DF31 * -dy12 - DF21 * dy31;
 		float b = dx31 * DF21 + dx12 * DF31;
 		// c is the 2D cross product of two triangle edges, i.e. twice the signed screen-space area.
 		float c = -dx12 * dy31 - dx31 * -dy12;
 		// Degenerate (zero-area) triangles give c == 0; dividing by it would upload inf/NaN
 		// slopes to the pixel shader, so keep the previously recorded slope in that case.
 		if (c != 0.0f)
 		{
 			float slope_dfdx = -a / c;
 			float slope_dfdy = -b / c;
 			float slope_f0 = out[2];
 			PixelShaderManager::SetZSlopeChanged(slope_dfdx, slope_dfdy, slope_f0);
 		}
 	}
 	VertexLoaderManager::GetCurrentVertexFormat()->SetupVertexPointers();
 	g_renderer->ApplyState(useDstAlpha);
--- a/Source/Core/VideoBackends/D3D/VertexManager.h
+++ b/Source/Core/VideoBackends/D3D/VertexManager.h
@ -22,6 +22,7 @@ public:
 protected:
 	virtual void ResetBuffer(u32 stride) override;
 	// Returns the address of element 0 of LocalIBuffer, viewed as 16-bit indices.
 	u16* GetIndexBuffer() { return &LocalIBuffer[0]; }
 	// Returns the address of element 0 of LocalVBuffer as a raw byte pointer.
 	u8* GetVertexBuffer() { return &LocalVBuffer[0]; }
 private:
--- a/Source/Core/VideoBackends/OGL/VertexManager.cpp
+++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp
@ -42,6 +42,13 @@ static size_t s_index_offset;
 VertexManager::VertexManager()
 {
 	// Size both local staging buffers first so the pointers taken below stay valid.
 	LocalVBuffer.resize(MAXVBUFFERSIZE);
 	LocalIBuffer.resize(MAXIBUFFERSIZE);
 	// Base and current write pointer both start at the beginning of the vertex
 	// buffer; the end pointer sits one past its last byte.
 	s_pBaseBufferPointer = LocalVBuffer.data();
 	s_pCurBufferPointer = s_pBaseBufferPointer;
 	s_pEndBufferPointer = s_pBaseBufferPointer + LocalVBuffer.size();
 	CreateDeviceObjects();
 }
@ -131,6 +138,7 @@ void VertexManager::vFlush(bool useDstAlpha)
 {
 	GLVertexFormat *nativeVertexFmt = (GLVertexFormat*)VertexLoaderManager::GetCurrentVertexFormat();
 	u32 stride  = nativeVertexFmt->GetVertexStride();
 	u32 indices = IndexGenerator::GetIndexLen();
 	if (m_last_vao != nativeVertexFmt->VAO)
 	{
@ -140,6 +148,47 @@ void VertexManager::vFlush(bool useDstAlpha)
 	PrepareDrawBuffers(stride);
 	if (!bpmem.genMode.zfreeze && indices >= 3)
 	{
 		float vtx[9];
 		float out[12];
 		// Lookup vertices of the last rendered triangle and software-transform them
 		// This allows us to determine the depth slope, which will be used if zfreeze
 		// is enabled in the following flush.
 		for (unsigned int i = 0; i < 3; ++i)
 		{
 			// The last three entries of the index buffer select the vertices to inspect.
 			const int base_index = GetIndexBuffer()[indices - 3 + i];
 			u8* vtx_ptr = &((u8*)GetVertexBuffer())[base_index * stride];
 			// NOTE(review): assumes the position is stored as three floats at the start of each vertex - confirm against the vertex format.
 			vtx[0 + i * 3] = ((float*)vtx_ptr)[0];
 			vtx[1 + i * 3] = ((float*)vtx_ptr)[1];
 			vtx[2 + i * 3] = ((float*)vtx_ptr)[2];
 			VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]);
 			// Perspective divide followed by viewport scaling.
 			// viewport offset ignored because we only look at coordinate differences.
 			out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd;
 			out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht;
 			out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ;
 		}
 		// Edge deltas between the three transformed vertices (x, y and depth).
 		float dx31 = out[8] - out[0];
 		float dx12 = out[0] - out[4];
 		float dy12 = out[1] - out[5];
 		float dy31 = out[9] - out[1];
 		float DF31 = out[10] - out[2];
 		float DF21 = out[6] - out[2];
 		float a = DF31 * -dy12 - DF21 * dy31;
 		float b = dx31 * DF21 + dx12 * DF31;
 		// c is the 2D cross product of two triangle edges, i.e. twice the signed screen-space area.
 		float c = -dx12 * dy31 - dx31 * -dy12;
 		// Degenerate (zero-area) triangles give c == 0; dividing by it would upload inf/NaN
 		// slopes to the pixel shader, so keep the previously recorded slope in that case.
 		if (c != 0.0f)
 		{
 			float slope_dfdx = -a / c;
 			float slope_dfdy = -b / c;
 			float slope_f0 = out[2];
 			PixelShaderManager::SetZSlopeChanged(slope_dfdx, slope_dfdy, slope_f0);
 		}
 	}
 	// Makes sure we can actually do Dual source blending
 	bool dualSourcePossible = g_ActiveConfig.backend_info.bSupportsDualSourceBlend;
--- a/Source/Core/VideoBackends/OGL/VertexManager.h
+++ b/Source/Core/VideoBackends/OGL/VertexManager.h
@ -42,10 +42,15 @@ public:
 	GLuint m_last_vao;
 protected:
 	virtual void ResetBuffer(u32 stride) override;
 	// Returns a pointer to the first element of the LocalIBuffer vector (16-bit indices).
 	u16* GetIndexBuffer() { return &LocalIBuffer[0]; }
 	// Returns a pointer to the first byte of the LocalVBuffer vector.
 	u8* GetVertexBuffer() { return &LocalVBuffer[0]; }
 private:
 	void Draw(u32 stride);
 	void vFlush(bool useDstAlpha) override;
 	void PrepareDrawBuffers(u32 stride);
 	std::vector<u8> LocalVBuffer;
 	std::vector<u16> LocalIBuffer;
 };
 }
--- a/Source/Core/VideoCommon/ConstantManager.h
+++ b/Source/Core/VideoCommon/ConstantManager.h
@ -23,6 +23,7 @@ struct PixelShaderConstants
 	int4 fogcolor;
 	int4 fogi;
 	float4 fogf[2];
 	float4 zslope;
 };
 struct VertexShaderConstants
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@ -228,6 +228,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
 		"\tint4 " I_FOGCOLOR";\n"
 		"\tint4 " I_FOGI";\n"
 		"\tfloat4 " I_FOGF"[2];\n"
 		"\tfloat4 " I_ZSLOPE";\n"
 		"};\n");
 	if (g_ActiveConfig.bEnablePixelLighting)
@ -269,7 +270,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
 	out.Write("};\n");
 	const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED);
-	const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z);
+	const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) || bpmem.genMode.zfreeze;
 	if (forced_early_z)
 	{
@ -538,10 +539,20 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
 	uid_data->fast_depth_calc = g_ActiveConfig.bFastDepthCalc;
 	uid_data->early_ztest = bpmem.UseEarlyDepthTest();
 	uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
 	uid_data->zfreeze = bpmem.genMode.zfreeze;
 	// Note: z-textures are not written to depth buffer if early depth test is used
 	if (per_pixel_depth && bpmem.UseEarlyDepthTest())
-		out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");
+	{
 		if (bpmem.genMode.zfreeze)
 		{
 			out.Write("\tdepth = " I_ZSLOPE".z + " I_ZSLOPE".x * (clipPos.x / clipPos.w) + " I_ZSLOPE".y * (clipPos.y / clipPos.w);\n");
 		}
 		else
 		{
 			out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");
 		}
 	}
 	// Note: depth texture output is only written to depth buffer if late depth test is used
 	// theoretical final depth value is used for fog calculation, though, so we have to emulate ztextures anyway
@ -555,7 +566,16 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
 	}
 	if (per_pixel_depth && bpmem.UseLateDepthTest())
-		out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");
+	{
 		if (bpmem.genMode.zfreeze)
 		{
 			out.Write("\tdepth = " I_ZSLOPE".z + " I_ZSLOPE".x * (clipPos.x / clipPos.w) + " I_ZSLOPE".y * (clipPos.y / clipPos.w);\n");
 		}
 		else
 		{
 			out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");
 		}
 	}
 	if (dstAlphaMode == DSTALPHA_ALPHA_PASS)
 	{
--- a/Source/Core/VideoCommon/PixelShaderGen.h
+++ b/Source/Core/VideoCommon/PixelShaderGen.h
@ -21,8 +21,9 @@
 #define C_FOGCOLOR      (C_INDTEXMTX + 6)   //27
 #define C_FOGI          (C_FOGCOLOR + 1)    //28
 #define C_FOGF          (C_FOGI + 1)        //29
 // cfogf is declared as float4[2] and the previous end marker was C_FOGF + 2, so it
 // spans two slots (29-30); czslope has to start after both to avoid aliasing cfogf[1].
 #define C_ZSLOPE        (C_FOGF + 2)        //31
-#define C_PENVCONST_END (C_FOGF + 2)
+#define C_PENVCONST_END (C_ZSLOPE + 1)
 // Different ways to achieve rendering with destination alpha
 enum DSTALPHA_MODE
@ -62,6 +63,7 @@ struct pixel_shader_uid_data
 	u32 forced_early_z : 1;
 	u32 early_ztest : 1;
 	u32 bounding_box : 1;
 	u32 zfreeze : 1;
 	u32 texMtxInfo_n_projection : 8; // 8x1 bit
 	u32 tevindref_bi0 : 3;
--- a/Source/Core/VideoCommon/PixelShaderManager.cpp
+++ b/Source/Core/VideoCommon/PixelShaderManager.cpp
@ -14,6 +14,8 @@
 bool PixelShaderManager::s_bFogRangeAdjustChanged;
 bool PixelShaderManager::s_bViewPortChanged;
 bool PixelShaderManager::s_bZSlopeChanged;
 static float zslope[3];
 std::array<int4,4> PixelShaderManager::s_tev_color;
 std::array<int4,4> PixelShaderManager::s_tev_konst_color;
@ -48,6 +50,7 @@ void PixelShaderManager::Dirty()
 	SetDestAlpha();
 	SetZTextureBias();
 	SetViewportChanged();
 	SetZSlopeChanged(0, 0, 1);
 	SetIndTexScaleChanged(false);
 	SetIndTexScaleChanged(true);
 	SetIndMatrixChanged(0);
@ -112,6 +115,17 @@ void PixelShaderManager::SetConstants()
 		dirty = true;
 		s_bViewPortChanged = false;
 	}
 	if (s_bZSlopeChanged)
 	{
 		constants.zslope[0] = zslope[0];
 		constants.zslope[1] = zslope[1];
 		constants.zslope[2] = zslope[2];
 		constants.zslope[3] = 0;
 		dirty = true;
 		s_bZSlopeChanged = false;
 	}
 }
 void PixelShaderManager::SetTevColor(int index, int component, s32 value)
@ -168,6 +182,14 @@ void PixelShaderManager::SetViewportChanged()
 	s_bFogRangeAdjustChanged = true; // TODO: Shouldn't be necessary with an accurate fog range adjust implementation
 }
 // Records the depth-plane coefficients (d/dx, d/dy and the constant term) in the
 // file-local zslope array and raises s_bZSlopeChanged.
 void PixelShaderManager::SetZSlopeChanged(float dfdx, float dfdy, float f0)
 {
 	const float plane[3] = { dfdx, dfdy, f0 };
 	for (int component = 0; component < 3; ++component)
 		zslope[component] = plane[component];
 	s_bZSlopeChanged = true;
 }
 void PixelShaderManager::SetIndTexScaleChanged(bool high)
 {
 	constants.indtexscale[high][0] = bpmem.texscale[high].ss0;
--- a/Source/Core/VideoCommon/PixelShaderManager.h
+++ b/Source/Core/VideoCommon/PixelShaderManager.h
@ -36,6 +36,7 @@ public:
 	static void SetTexDims(int texmapid, u32 width, u32 height, u32 wraps, u32 wrapt);
 	static void SetZTextureBias();
 	static void SetViewportChanged();
 	static void SetZSlopeChanged(float dfdx, float dfdy, float f0);
 	static void SetIndMatrixChanged(int matrixidx);
 	static void SetTevKSelChanged(int id);
 	static void SetZTextureTypeChanged();
@ -50,6 +51,7 @@ public:
 	static bool s_bFogRangeAdjustChanged;
 	static bool s_bViewPortChanged;
 	static bool s_bZSlopeChanged;
 	// These colors aren't available from global BP state,
 	// hence we keep a copy of them around.
--- a/Source/Core/VideoCommon/ShaderGenCommon.h
+++ b/Source/Core/VideoCommon/ShaderGenCommon.h
@ -291,6 +291,7 @@ static inline void AssignVSOutputMembers(T& object, const char* a, const char* b
 #define I_FOGCOLOR      "cfogcolor"
 #define I_FOGI          "cfogi"
 #define I_FOGF          "cfogf"
 #define I_ZSLOPE        "czslope"
 #define I_POSNORMALMATRIX       "cpnmtx"
 #define I_PROJECTION            "cproj"
--- a/Source/Core/VideoCommon/VertexShaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexShaderManager.cpp
@ -690,6 +690,24 @@ void VertexShaderManager::ResetView()
 	bProjectionChanged = true;
 }
 // Software-transforms one position (data: x, y, z) by the currently selected position
 // matrix and then by g_fProjectionMatrix, writing four clip-space floats to out.
 void VertexShaderManager::TransformToClipSpace(const float* data, float *out)
 {
 	// Rows are read four floats apart; the position matrix contributes three rows.
 	const float *pos_mtx = (const float *)xfmem.posMatrices + g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4;
 	const float *proj_mtx = &g_fProjectionMatrix[0];
 	// Stage 1: apply the three position-matrix rows (3x3 part plus a translation column).
 	float view_pos[3];
 	for (int row = 0; row < 3; ++row)
 	{
 		const float *m = pos_mtx + row * 4;
 		view_pos[row] = data[0] * m[0] + data[1] * m[1] + data[2] * m[2] + m[3];
 	}
 	// TODO: this requires g_fProjectionMatrix to be up to date, which is not really a good design decision.
 	// Stage 2: apply the four projection rows, with an implicit w of 1.
 	for (int row = 0; row < 4; ++row)
 	{
 		const float *p = proj_mtx + row * 4;
 		out[row] = view_pos[0] * p[0] + view_pos[1] * p[1] + view_pos[2] * p[2] + p[3];
 	}
 }
 void VertexShaderManager::DoState(PointerWrap &p)
 {
 	p.Do(g_fProjectionMatrix);
--- a/Source/Core/VideoCommon/VertexShaderManager.h
+++ b/Source/Core/VideoCommon/VertexShaderManager.h
@ -34,6 +34,12 @@ public:
 	static void RotateView(float x, float y);
 	static void ResetView();
 	// data: 3 floats representing the X, Y and Z vertex model coordinates
 	// out: 4 floats which will be initialized with the corresponding clip space coordinates
 	// NOTE: g_fProjectionMatrix must be up to date when this is called
 	//		(i.e. VertexShaderManager::SetConstants needs to be called before using this!)
 	static void TransformToClipSpace(const float* data, float *out);
 	static VertexShaderConstants constants;
 	static bool dirty;
 };