From 937844b9e339d1f6b819e90eeb9bde65f45e833f Mon Sep 17 00:00:00 2001 From: NanoByte011 Date: Thu, 25 Dec 2014 00:34:22 -0700 Subject: [PATCH 01/11] Initial port of zfreeze branch (3.5-1729) Initial port of original zfreeze branch (3.5-1729) by neobrain into most recent build of Dolphin. Makes Rogue Squadron 2 very playable at full speed thanks to recent core speedups made to Dolphin. Works on DirectX Video plugin only for now. Enjoy! and Merry Xmas!! --- .../Core/VideoBackends/D3D/VertexManager.cpp | 42 ++++++++++++++++ Source/Core/VideoBackends/D3D/VertexManager.h | 1 + .../Core/VideoBackends/OGL/VertexManager.cpp | 49 +++++++++++++++++++ Source/Core/VideoBackends/OGL/VertexManager.h | 5 ++ Source/Core/VideoCommon/ConstantManager.h | 1 + Source/Core/VideoCommon/PixelShaderGen.cpp | 26 ++++++++-- Source/Core/VideoCommon/PixelShaderGen.h | 4 +- .../Core/VideoCommon/PixelShaderManager.cpp | 22 +++++++++ Source/Core/VideoCommon/PixelShaderManager.h | 2 + Source/Core/VideoCommon/ShaderGenCommon.h | 1 + .../Core/VideoCommon/VertexShaderManager.cpp | 18 +++++++ Source/Core/VideoCommon/VertexShaderManager.h | 6 +++ 12 files changed, 173 insertions(+), 4 deletions(-) diff --git a/Source/Core/VideoBackends/D3D/VertexManager.cpp b/Source/Core/VideoBackends/D3D/VertexManager.cpp index 5bd39d45b2..8f925452c2 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.cpp +++ b/Source/Core/VideoBackends/D3D/VertexManager.cpp @@ -178,9 +178,51 @@ void VertexManager::vFlush(bool useDstAlpha) } u32 stride = VertexLoaderManager::GetCurrentVertexFormat()->GetVertexStride(); + u32 indices = IndexGenerator::GetIndexLen(); PrepareDrawBuffers(stride); + if (!bpmem.genMode.zfreeze && indices >= 3) + { + float vtx[9]; + float out[12]; + + // Lookup vertices of the last rendered triangle and software-transform them + // This allows us to determine the depth slope, which will be used if zfreeze + // is enabled in the following flush. 
+ for (unsigned int i = 0; i < 3; ++i) + { + const int base_index = GetIndexBuffer()[indices - 3 + i]; + u8* vtx_ptr = &((u8*)GetVertexBuffer())[base_index * stride]; + vtx[0 + i * 3] = ((float*)vtx_ptr)[0]; + vtx[1 + i * 3] = ((float*)vtx_ptr)[1]; + vtx[2 + i * 3] = ((float*)vtx_ptr)[2]; + + VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); + + // viewport offset ignored because we only look at coordinate differences. + out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd; + out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht; + out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ; + } + float dx31 = out[8] - out[0]; + float dx12 = out[0] - out[4]; + float dy12 = out[1] - out[5]; + float dy31 = out[9] - out[1]; + + float DF31 = out[10] - out[2]; + float DF21 = out[6] - out[2]; + float a = DF31 * -dy12 - DF21 * dy31; + float b = dx31 * DF21 + dx12 * DF31; + float c = -dx12 * dy31 - dx31 * -dy12; + + float slope_dfdx = -a / c; + float slope_dfdy = -b / c; + float slope_f0 = out[2]; + + PixelShaderManager::SetZSlopeChanged(slope_dfdx, slope_dfdy, slope_f0); + } + VertexLoaderManager::GetCurrentVertexFormat()->SetupVertexPointers(); g_renderer->ApplyState(useDstAlpha); diff --git a/Source/Core/VideoBackends/D3D/VertexManager.h b/Source/Core/VideoBackends/D3D/VertexManager.h index 0b124d7512..38fcd088fd 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.h +++ b/Source/Core/VideoBackends/D3D/VertexManager.h @@ -22,6 +22,7 @@ public: protected: virtual void ResetBuffer(u32 stride) override; u16* GetIndexBuffer() { return &LocalIBuffer[0]; } + u8* GetVertexBuffer() { return &LocalVBuffer[0]; } private: diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index 1a162b1cde..d3a8d91bca 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -42,6 +42,13 @@ 
static size_t s_index_offset; VertexManager::VertexManager() { + LocalVBuffer.resize(MAXVBUFFERSIZE); + + s_pCurBufferPointer = s_pBaseBufferPointer = &LocalVBuffer[0]; + s_pEndBufferPointer = s_pBaseBufferPointer + LocalVBuffer.size(); + + LocalIBuffer.resize(MAXIBUFFERSIZE); + CreateDeviceObjects(); } @@ -131,6 +138,7 @@ void VertexManager::vFlush(bool useDstAlpha) { GLVertexFormat *nativeVertexFmt = (GLVertexFormat*)VertexLoaderManager::GetCurrentVertexFormat(); u32 stride = nativeVertexFmt->GetVertexStride(); + u32 indices = IndexGenerator::GetIndexLen(); if (m_last_vao != nativeVertexFmt->VAO) { @@ -140,6 +148,47 @@ void VertexManager::vFlush(bool useDstAlpha) PrepareDrawBuffers(stride); + if (!bpmem.genMode.zfreeze && indices >= 3) + { + float vtx[9]; + float out[12]; + + // Lookup vertices of the last rendered triangle and software-transform them + // This allows us to determine the depth slope, which will be used if zfreeze + // is enabled in the following flush. + for (unsigned int i = 0; i < 3; ++i) + { + const int base_index = GetIndexBuffer()[indices - 3 + i]; + u8* vtx_ptr = &((u8*)GetVertexBuffer())[base_index * stride]; + vtx[0 + i * 3] = ((float*)vtx_ptr)[0]; + vtx[1 + i * 3] = ((float*)vtx_ptr)[1]; + vtx[2 + i * 3] = ((float*)vtx_ptr)[2]; + + VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); + + // viewport offset ignored because we only look at coordinate differences. 
+ out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd; + out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht; + out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ; + } + float dx31 = out[8] - out[0]; + float dx12 = out[0] - out[4]; + float dy12 = out[1] - out[5]; + float dy31 = out[9] - out[1]; + + float DF31 = out[10] - out[2]; + float DF21 = out[6] - out[2]; + float a = DF31 * -dy12 - DF21 * dy31; + float b = dx31 * DF21 + dx12 * DF31; + float c = -dx12 * dy31 - dx31 * -dy12; + + float slope_dfdx = -a / c; + float slope_dfdy = -b / c; + float slope_f0 = out[2]; + + PixelShaderManager::SetZSlopeChanged(slope_dfdx, slope_dfdy, slope_f0); + } + // Makes sure we can actually do Dual source blending bool dualSourcePossible = g_ActiveConfig.backend_info.bSupportsDualSourceBlend; diff --git a/Source/Core/VideoBackends/OGL/VertexManager.h b/Source/Core/VideoBackends/OGL/VertexManager.h index 1f527fd9c0..0e9efd9c83 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.h +++ b/Source/Core/VideoBackends/OGL/VertexManager.h @@ -42,10 +42,15 @@ public: GLuint m_last_vao; protected: virtual void ResetBuffer(u32 stride) override; + u16* GetIndexBuffer() { return &LocalIBuffer[0]; } + u8* GetVertexBuffer() { return &LocalVBuffer[0]; } private: void Draw(u32 stride); void vFlush(bool useDstAlpha) override; void PrepareDrawBuffers(u32 stride); + + std::vector LocalVBuffer; + std::vector LocalIBuffer; }; } diff --git a/Source/Core/VideoCommon/ConstantManager.h b/Source/Core/VideoCommon/ConstantManager.h index 9bfce8aac1..b7b3d6664c 100644 --- a/Source/Core/VideoCommon/ConstantManager.h +++ b/Source/Core/VideoCommon/ConstantManager.h @@ -23,6 +23,7 @@ struct PixelShaderConstants int4 fogcolor; int4 fogi; float4 fogf[2]; + float4 zslope; }; struct VertexShaderConstants diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index edc67cc83c..7afb21056c 100644 
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -228,6 +228,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T "\tint4 " I_FOGCOLOR";\n" "\tint4 " I_FOGI";\n" "\tfloat4 " I_FOGF"[2];\n" + "\tfloat4 " I_ZSLOPE";\n" "};\n"); if (g_ActiveConfig.bEnablePixelLighting) @@ -269,7 +270,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T out.Write("};\n"); const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED); - const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z); + const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) || bpmem.genMode.zfreeze; if (forced_early_z) { @@ -538,10 +539,20 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T uid_data->fast_depth_calc = g_ActiveConfig.bFastDepthCalc; uid_data->early_ztest = bpmem.UseEarlyDepthTest(); uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel; + uid_data->zfreeze = bpmem.genMode.zfreeze; // Note: z-textures are not written to depth buffer if early depth test is used if (per_pixel_depth && bpmem.UseEarlyDepthTest()) - out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n"); + { + if (bpmem.genMode.zfreeze) + { + out.Write("\tdepth = " I_ZSLOPE".z + " I_ZSLOPE".x * (clipPos.x / clipPos.w) + " I_ZSLOPE".y * (clipPos.y / clipPos.w);\n"); + } + else + { + out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n"); + } + } // Note: depth texture output is only written to depth buffer if late depth test is used // theoretical final depth value is used for fog calculation, though, so we have to emulate 
ztextures anyway @@ -555,7 +566,16 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T } if (per_pixel_depth && bpmem.UseLateDepthTest()) - out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n"); + { + if (bpmem.genMode.zfreeze) + { + out.Write("\tdepth = " I_ZSLOPE".z + " I_ZSLOPE".x * (clipPos.x / clipPos.w) + " I_ZSLOPE".y * (clipPos.y / clipPos.w);\n"); + } + else + { + out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n"); + } + } if (dstAlphaMode == DSTALPHA_ALPHA_PASS) { diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h index 784523087a..c889bd62a0 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.h +++ b/Source/Core/VideoCommon/PixelShaderGen.h @@ -21,8 +21,9 @@ #define C_FOGCOLOR (C_INDTEXMTX + 6) //27 #define C_FOGI (C_FOGCOLOR + 1) //28 #define C_FOGF (C_FOGI + 1) //29 +#define C_ZSLOPE (C_FOGF + 1) //30 -#define C_PENVCONST_END (C_FOGF + 2) +#define C_PENVCONST_END (C_ZSLOPE + 2) // Different ways to achieve rendering with destination alpha enum DSTALPHA_MODE @@ -62,6 +63,7 @@ struct pixel_shader_uid_data u32 forced_early_z : 1; u32 early_ztest : 1; u32 bounding_box : 1; + u32 zfreeze : 1; u32 texMtxInfo_n_projection : 8; // 8x1 bit u32 tevindref_bi0 : 3; diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp index 0c6d4b73b3..b55147eb15 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/PixelShaderManager.cpp @@ -14,6 +14,8 @@ bool PixelShaderManager::s_bFogRangeAdjustChanged; bool PixelShaderManager::s_bViewPortChanged; +bool PixelShaderManager::s_bZSlopeChanged; +static float zslope[3]; std::array PixelShaderManager::s_tev_color; std::array PixelShaderManager::s_tev_konst_color; @@ -48,6 +50,7 @@ void PixelShaderManager::Dirty() SetDestAlpha(); SetZTextureBias(); SetViewportChanged(); + SetZSlopeChanged(0, 0, 1); SetIndTexScaleChanged(false); SetIndTexScaleChanged(true); 
SetIndMatrixChanged(0); @@ -112,6 +115,17 @@ void PixelShaderManager::SetConstants() dirty = true; s_bViewPortChanged = false; } + + if (s_bZSlopeChanged) + { + constants.zslope[0] = zslope[0]; + constants.zslope[1] = zslope[1]; + constants.zslope[2] = zslope[2]; + constants.zslope[3] = 0; + + dirty = true; + s_bZSlopeChanged = false; + } } void PixelShaderManager::SetTevColor(int index, int component, s32 value) @@ -168,6 +182,14 @@ void PixelShaderManager::SetViewportChanged() s_bFogRangeAdjustChanged = true; // TODO: Shouldn't be necessary with an accurate fog range adjust implementation } +void PixelShaderManager::SetZSlopeChanged(float dfdx, float dfdy, float f0) +{ + zslope[0] = dfdx; + zslope[1] = dfdy; + zslope[2] = f0; + s_bZSlopeChanged = true; +} + void PixelShaderManager::SetIndTexScaleChanged(bool high) { constants.indtexscale[high][0] = bpmem.texscale[high].ss0; diff --git a/Source/Core/VideoCommon/PixelShaderManager.h b/Source/Core/VideoCommon/PixelShaderManager.h index ebf299d9fc..16c760f70f 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.h +++ b/Source/Core/VideoCommon/PixelShaderManager.h @@ -36,6 +36,7 @@ public: static void SetTexDims(int texmapid, u32 width, u32 height, u32 wraps, u32 wrapt); static void SetZTextureBias(); static void SetViewportChanged(); + static void SetZSlopeChanged(float dfdx, float dfdy, float f0); static void SetIndMatrixChanged(int matrixidx); static void SetTevKSelChanged(int id); static void SetZTextureTypeChanged(); @@ -50,6 +51,7 @@ public: static bool s_bFogRangeAdjustChanged; static bool s_bViewPortChanged; + static bool s_bZSlopeChanged; // These colors aren't available from global BP state, // hence we keep a copy of them around. 
diff --git a/Source/Core/VideoCommon/ShaderGenCommon.h b/Source/Core/VideoCommon/ShaderGenCommon.h index 571f8db5c5..dd80fd3987 100644 --- a/Source/Core/VideoCommon/ShaderGenCommon.h +++ b/Source/Core/VideoCommon/ShaderGenCommon.h @@ -291,6 +291,7 @@ static inline void AssignVSOutputMembers(T& object, const char* a, const char* b #define I_FOGCOLOR "cfogcolor" #define I_FOGI "cfogi" #define I_FOGF "cfogf" +#define I_ZSLOPE "czslope" #define I_POSNORMALMATRIX "cpnmtx" #define I_PROJECTION "cproj" diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp index 4ca20a21f4..a745f7004f 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.cpp +++ b/Source/Core/VideoCommon/VertexShaderManager.cpp @@ -690,6 +690,24 @@ void VertexShaderManager::ResetView() bProjectionChanged = true; } +void VertexShaderManager::TransformToClipSpace(const float* data, float *out) +{ + const float *world_matrix = (const float *)xfmem.posMatrices + g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4; + const float *proj_matrix = &g_fProjectionMatrix[0]; + + float t[3]; + t[0] = data[0] * world_matrix[0] + data[1] * world_matrix[1] + data[2] * world_matrix[2] + world_matrix[3]; + t[1] = data[0] * world_matrix[4] + data[1] * world_matrix[5] + data[2] * world_matrix[6] + world_matrix[7]; + t[2] = data[0] * world_matrix[8] + data[1] * world_matrix[9] + data[2] * world_matrix[10] + world_matrix[11]; + + // TODO: this requires g_fProjectionMatrix to be up to date, which is not really a good design decision. 
+ + out[0] = t[0] * proj_matrix[0] + t[1] * proj_matrix[1] + t[2] * proj_matrix[2] + proj_matrix[3]; + out[1] = t[0] * proj_matrix[4] + t[1] * proj_matrix[5] + t[2] * proj_matrix[6] + proj_matrix[7]; + out[2] = t[0] * proj_matrix[8] + t[1] * proj_matrix[9] + t[2] * proj_matrix[10] + proj_matrix[11]; + out[3] = t[0] * proj_matrix[12] + t[1] * proj_matrix[13] + t[2] * proj_matrix[14] + proj_matrix[15]; +} + void VertexShaderManager::DoState(PointerWrap &p) { p.Do(g_fProjectionMatrix); diff --git a/Source/Core/VideoCommon/VertexShaderManager.h b/Source/Core/VideoCommon/VertexShaderManager.h index d99f07fe21..229ba1f599 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.h +++ b/Source/Core/VideoCommon/VertexShaderManager.h @@ -34,6 +34,12 @@ public: static void RotateView(float x, float y); static void ResetView(); + // data: 3 floats representing the X, Y and Z vertex model coordinates + // out: 4 floats which will be initialized with the corresponding clip space coordinates + // NOTE: g_fProjectionMatrix must be up to date when this is called + // (i.e. VertexShaderManager::SetConstants needs to be called before using this!) + static void TransformToClipSpace(const float* data, float *out); + static VertexShaderConstants constants; static bool dirty; }; From 613781c7650d3cbd494a212eacdff10ea5140894 Mon Sep 17 00:00:00 2001 From: NanoByte011 Date: Fri, 26 Dec 2014 01:25:24 -0700 Subject: [PATCH 02/11] Cleanup and refactor of zfreeze port Based on the feedback from pull request #1767 I have put in most of degasus's suggestions in here now. I think we have a real winner here as moving the code to VertexManagerBase for a function has allowed OGL to utilize zfreeze now :) Correct use of the vertex pointer has also corrected most of the issue found in pull request #1767 that JMC47 stated. Which also for me now has Mario Tennis working with no polygon spikes on the characters anymore! Shadows are still an issue and probably in the other games with shadow problems. 
Rebel Strike also seems better but random skybox glitches can show up. --- .../Core/VideoBackends/D3D/VertexManager.cpp | 41 +--------------- Source/Core/VideoBackends/D3D/VertexManager.h | 1 - .../Core/VideoBackends/OGL/VertexManager.cpp | 48 +------------------ Source/Core/VideoBackends/OGL/VertexManager.h | 6 +-- Source/Core/VideoCommon/PixelShaderGen.h | 4 +- .../Core/VideoCommon/PixelShaderManager.cpp | 26 +++------- Source/Core/VideoCommon/PixelShaderManager.h | 3 +- Source/Core/VideoCommon/VertexManagerBase.cpp | 40 ++++++++++++++++ Source/Core/VideoCommon/VertexManagerBase.h | 2 + 9 files changed, 57 insertions(+), 114 deletions(-) diff --git a/Source/Core/VideoBackends/D3D/VertexManager.cpp b/Source/Core/VideoBackends/D3D/VertexManager.cpp index 8f925452c2..5f878cc29b 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.cpp +++ b/Source/Core/VideoBackends/D3D/VertexManager.cpp @@ -178,49 +178,12 @@ void VertexManager::vFlush(bool useDstAlpha) } u32 stride = VertexLoaderManager::GetCurrentVertexFormat()->GetVertexStride(); - u32 indices = IndexGenerator::GetIndexLen(); PrepareDrawBuffers(stride); - if (!bpmem.genMode.zfreeze && indices >= 3) + if (!bpmem.genMode.zfreeze && IndexGenerator::GetIndexLen() >= 3) { - float vtx[9]; - float out[12]; - - // Lookup vertices of the last rendered triangle and software-transform them - // This allows us to determine the depth slope, which will be used if zfreeze - // is enabled in the following flush. - for (unsigned int i = 0; i < 3; ++i) - { - const int base_index = GetIndexBuffer()[indices - 3 + i]; - u8* vtx_ptr = &((u8*)GetVertexBuffer())[base_index * stride]; - vtx[0 + i * 3] = ((float*)vtx_ptr)[0]; - vtx[1 + i * 3] = ((float*)vtx_ptr)[1]; - vtx[2 + i * 3] = ((float*)vtx_ptr)[2]; - - VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); - - // viewport offset ignored because we only look at coordinate differences. 
- out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd; - out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht; - out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ; - } - float dx31 = out[8] - out[0]; - float dx12 = out[0] - out[4]; - float dy12 = out[1] - out[5]; - float dy31 = out[9] - out[1]; - - float DF31 = out[10] - out[2]; - float DF21 = out[6] - out[2]; - float a = DF31 * -dy12 - DF21 * dy31; - float b = dx31 * DF21 + dx12 * DF31; - float c = -dx12 * dy31 - dx31 * -dy12; - - float slope_dfdx = -a / c; - float slope_dfdy = -b / c; - float slope_f0 = out[2]; - - PixelShaderManager::SetZSlopeChanged(slope_dfdx, slope_dfdy, slope_f0); + CalculateZSlope(stride); } VertexLoaderManager::GetCurrentVertexFormat()->SetupVertexPointers(); diff --git a/Source/Core/VideoBackends/D3D/VertexManager.h b/Source/Core/VideoBackends/D3D/VertexManager.h index 38fcd088fd..0b124d7512 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.h +++ b/Source/Core/VideoBackends/D3D/VertexManager.h @@ -22,7 +22,6 @@ public: protected: virtual void ResetBuffer(u32 stride) override; u16* GetIndexBuffer() { return &LocalIBuffer[0]; } - u8* GetVertexBuffer() { return &LocalVBuffer[0]; } private: diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index d3a8d91bca..427a5ecee3 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -42,13 +42,6 @@ static size_t s_index_offset; VertexManager::VertexManager() { - LocalVBuffer.resize(MAXVBUFFERSIZE); - - s_pCurBufferPointer = s_pBaseBufferPointer = &LocalVBuffer[0]; - s_pEndBufferPointer = s_pBaseBufferPointer + LocalVBuffer.size(); - - LocalIBuffer.resize(MAXIBUFFERSIZE); - CreateDeviceObjects(); } @@ -138,7 +131,6 @@ void VertexManager::vFlush(bool useDstAlpha) { GLVertexFormat *nativeVertexFmt = 
(GLVertexFormat*)VertexLoaderManager::GetCurrentVertexFormat(); u32 stride = nativeVertexFmt->GetVertexStride(); - u32 indices = IndexGenerator::GetIndexLen(); if (m_last_vao != nativeVertexFmt->VAO) { @@ -148,45 +140,9 @@ void VertexManager::vFlush(bool useDstAlpha) PrepareDrawBuffers(stride); - if (!bpmem.genMode.zfreeze && indices >= 3) + if (!bpmem.genMode.zfreeze && IndexGenerator::GetIndexLen() >= 3) { - float vtx[9]; - float out[12]; - - // Lookup vertices of the last rendered triangle and software-transform them - // This allows us to determine the depth slope, which will be used if zfreeze - // is enabled in the following flush. - for (unsigned int i = 0; i < 3; ++i) - { - const int base_index = GetIndexBuffer()[indices - 3 + i]; - u8* vtx_ptr = &((u8*)GetVertexBuffer())[base_index * stride]; - vtx[0 + i * 3] = ((float*)vtx_ptr)[0]; - vtx[1 + i * 3] = ((float*)vtx_ptr)[1]; - vtx[2 + i * 3] = ((float*)vtx_ptr)[2]; - - VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); - - // viewport offset ignored because we only look at coordinate differences. 
- out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd; - out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht; - out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ; - } - float dx31 = out[8] - out[0]; - float dx12 = out[0] - out[4]; - float dy12 = out[1] - out[5]; - float dy31 = out[9] - out[1]; - - float DF31 = out[10] - out[2]; - float DF21 = out[6] - out[2]; - float a = DF31 * -dy12 - DF21 * dy31; - float b = dx31 * DF21 + dx12 * DF31; - float c = -dx12 * dy31 - dx31 * -dy12; - - float slope_dfdx = -a / c; - float slope_dfdy = -b / c; - float slope_f0 = out[2]; - - PixelShaderManager::SetZSlopeChanged(slope_dfdx, slope_dfdy, slope_f0); + CalculateZSlope(stride); } // Makes sure we can actually do Dual source blending diff --git a/Source/Core/VideoBackends/OGL/VertexManager.h b/Source/Core/VideoBackends/OGL/VertexManager.h index 0e9efd9c83..f0c6ae9109 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.h +++ b/Source/Core/VideoBackends/OGL/VertexManager.h @@ -42,15 +42,11 @@ public: GLuint m_last_vao; protected: virtual void ResetBuffer(u32 stride) override; - u16* GetIndexBuffer() { return &LocalIBuffer[0]; } - u8* GetVertexBuffer() { return &LocalVBuffer[0]; } + private: void Draw(u32 stride); void vFlush(bool useDstAlpha) override; void PrepareDrawBuffers(u32 stride); - - std::vector LocalVBuffer; - std::vector LocalIBuffer; }; } diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h index c889bd62a0..eb787fdc81 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.h +++ b/Source/Core/VideoCommon/PixelShaderGen.h @@ -20,8 +20,8 @@ #define C_INDTEXMTX (C_INDTEXSCALE + 2) //21 #define C_FOGCOLOR (C_INDTEXMTX + 6) //27 #define C_FOGI (C_FOGCOLOR + 1) //28 -#define C_FOGF (C_FOGI + 1) //29 -#define C_ZSLOPE (C_FOGF + 1) //30 +#define C_FOGF (C_FOGI + 2) //29 +#define C_ZSLOPE (C_FOGF + 1) //31 #define C_PENVCONST_END (C_ZSLOPE + 2) diff --git 
a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp index b55147eb15..d0004a0921 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/PixelShaderManager.cpp @@ -14,8 +14,6 @@ bool PixelShaderManager::s_bFogRangeAdjustChanged; bool PixelShaderManager::s_bViewPortChanged; -bool PixelShaderManager::s_bZSlopeChanged; -static float zslope[3]; std::array PixelShaderManager::s_tev_color; std::array PixelShaderManager::s_tev_konst_color; @@ -50,7 +48,7 @@ void PixelShaderManager::Dirty() SetDestAlpha(); SetZTextureBias(); SetViewportChanged(); - SetZSlopeChanged(0, 0, 1); + SetZSlope(0, 0, 1); SetIndTexScaleChanged(false); SetIndTexScaleChanged(true); SetIndMatrixChanged(0); @@ -115,17 +113,6 @@ void PixelShaderManager::SetConstants() dirty = true; s_bViewPortChanged = false; } - - if (s_bZSlopeChanged) - { - constants.zslope[0] = zslope[0]; - constants.zslope[1] = zslope[1]; - constants.zslope[2] = zslope[2]; - constants.zslope[3] = 0; - - dirty = true; - s_bZSlopeChanged = false; - } } void PixelShaderManager::SetTevColor(int index, int component, s32 value) @@ -182,12 +169,13 @@ void PixelShaderManager::SetViewportChanged() s_bFogRangeAdjustChanged = true; // TODO: Shouldn't be necessary with an accurate fog range adjust implementation } -void PixelShaderManager::SetZSlopeChanged(float dfdx, float dfdy, float f0) +void PixelShaderManager::SetZSlope(float dfdx, float dfdy, float f0) { - zslope[0] = dfdx; - zslope[1] = dfdy; - zslope[2] = f0; - s_bZSlopeChanged = true; + constants.zslope[0] = dfdx; + constants.zslope[1] = dfdy; + constants.zslope[2] = f0; + constants.zslope[3] = 0; + dirty = true; } void PixelShaderManager::SetIndTexScaleChanged(bool high) diff --git a/Source/Core/VideoCommon/PixelShaderManager.h b/Source/Core/VideoCommon/PixelShaderManager.h index 16c760f70f..faa15cff7e 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.h +++ 
b/Source/Core/VideoCommon/PixelShaderManager.h @@ -36,7 +36,7 @@ public: static void SetTexDims(int texmapid, u32 width, u32 height, u32 wraps, u32 wrapt); static void SetZTextureBias(); static void SetViewportChanged(); - static void SetZSlopeChanged(float dfdx, float dfdy, float f0); + static void SetZSlope(float dfdx, float dfdy, float f0); static void SetIndMatrixChanged(int matrixidx); static void SetTevKSelChanged(int id); static void SetZTextureTypeChanged(); @@ -51,7 +51,6 @@ public: static bool s_bFogRangeAdjustChanged; static bool s_bViewPortChanged; - static bool s_bZSlopeChanged; // These colors aren't available from global BP state, // hence we keep a copy of them around. diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index 38cfd19630..80ea3b5bb9 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -241,3 +241,43 @@ void VertexManager::DoState(PointerWrap& p) { g_vertex_manager->vDoState(p); } + +void VertexManager::CalculateZSlope(u32 stride) +{ + float vtx[9]; + float out[12]; + + // Lookup vertices of the last rendered triangle and software-transform them + // This allows us to determine the depth slope, which will be used if zfreeze + // is enabled in the following flush. + for (unsigned int i = 0; i < 3; ++i) + { + u8* vtx_ptr = s_pCurBufferPointer - stride * (3 - i); + vtx[0 + i * 3] = ((float*)vtx_ptr)[0]; + vtx[1 + i * 3] = ((float*)vtx_ptr)[1]; + vtx[2 + i * 3] = ((float*)vtx_ptr)[2]; + + VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); + + // viewport offset ignored because we only look at coordinate differences. 
+ out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd; + out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht; + out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ; + } + float dx31 = out[8] - out[0]; + float dx12 = out[0] - out[4]; + float dy12 = out[1] - out[5]; + float dy31 = out[9] - out[1]; + + float DF31 = out[10] - out[2]; + float DF21 = out[6] - out[2]; + float a = DF31 * -dy12 - DF21 * dy31; + float b = dx31 * DF21 + dx12 * DF31; + float c = -dx12 * dy31 - dx31 * -dy12; + + float slope_dfdx = -a / c; + float slope_dfdy = -b / c; + float slope_f0 = out[2]; + + PixelShaderManager::SetZSlope(slope_dfdx, slope_dfdy, slope_f0); +} diff --git a/Source/Core/VideoCommon/VertexManagerBase.h b/Source/Core/VideoCommon/VertexManagerBase.h index c854cd3586..524f3e5a0c 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.h +++ b/Source/Core/VideoCommon/VertexManagerBase.h @@ -41,6 +41,8 @@ public: static void DoState(PointerWrap& p); + static void CalculateZSlope(u32 stride); + protected: virtual void vDoState(PointerWrap& p) { } From 418296961cba39e4ae5bc441b3f52bf1459ad2bb Mon Sep 17 00:00:00 2001 From: Scott Mansell Date: Fri, 2 Jan 2015 23:55:41 +1300 Subject: [PATCH 03/11] Fix various issues with zfreeze implementation. Results are still not correct, but things are getting closer. * Don't cull CULLALL primitives so early so they can be used as reference planes. * Convert CalculateZSlope to screenspace coordinates. * Convert Pixelshader to screenspace coordinates (instead of worldspace xy coordinates, which is totally wrong) * Divide depth by 2^24 instead of clamping to 0.0-1.0 as was done before. Progress: * Rogue Squadron 2/3 appear correct in game (videos in rs2 save file selection are missing) * Shadows draw 100% correctly in NHL 2003. * Mario golf menu renders correctly. * NFS: HP2, shadows sometimes render on top of car or below the road. 
* Mario Tennis, courts and shadows render correctly, but at wrong depth * Blood Omen 2, doesn't work. --- Source/Core/VideoBackends/D3D/VertexManager.cpp | 4 ++++ Source/Core/VideoBackends/OGL/VertexManager.cpp | 4 ++++ Source/Core/VideoCommon/PixelShaderGen.cpp | 4 ++-- Source/Core/VideoCommon/VertexLoaderManager.cpp | 5 +---- Source/Core/VideoCommon/VertexManagerBase.cpp | 9 +++++---- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Source/Core/VideoBackends/D3D/VertexManager.cpp b/Source/Core/VideoBackends/D3D/VertexManager.cpp index 5f878cc29b..2c38ac9d22 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.cpp +++ b/Source/Core/VideoBackends/D3D/VertexManager.cpp @@ -186,6 +186,10 @@ void VertexManager::vFlush(bool useDstAlpha) CalculateZSlope(stride); } + // if cull mode is CULL_ALL, ignore triangles and quads + if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) + return; + VertexLoaderManager::GetCurrentVertexFormat()->SetupVertexPointers(); g_renderer->ApplyState(useDstAlpha); diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index 427a5ecee3..859a3b8db4 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -145,6 +145,10 @@ void VertexManager::vFlush(bool useDstAlpha) CalculateZSlope(stride); } + // if cull mode is CULL_ALL, ignore triangles and quads + if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) + return; + // Makes sure we can actually do Dual source blending bool dualSourcePossible = g_ActiveConfig.backend_info.bSupportsDualSourceBlend; diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index 7afb21056c..8bc7a7980b 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -546,7 +546,7 @@ static inline void 
GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T { if (bpmem.genMode.zfreeze) { - out.Write("\tdepth = " I_ZSLOPE".z + " I_ZSLOPE".x * (clipPos.x / clipPos.w) + " I_ZSLOPE".y * (clipPos.y / clipPos.w);\n"); + out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * rawpos.x + " I_ZSLOPE".y * rawpos.y) / float(0xffffff);\n"); } else { @@ -569,7 +569,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T { if (bpmem.genMode.zfreeze) { - out.Write("\tdepth = " I_ZSLOPE".z + " I_ZSLOPE".x * (clipPos.x / clipPos.w) + " I_ZSLOPE".y * (clipPos.y / clipPos.w);\n"); + out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * rawpos.x + " I_ZSLOPE".y * rawpos.y) / float(0xffffff);\n"); } else { diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index 786cb45a96..9cc8861186 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -149,11 +149,8 @@ int RunVertices(int vtx_attr_group, int primitive, int count, DataReader src, bo if ((int)src.size() < size) return -1; - if (skip_drawing || (bpmem.genMode.cullmode == GenMode::CULL_ALL && primitive < 5)) - { - // if cull mode is CULL_ALL, ignore triangles and quads + if (skip_drawing) return size; - } // If the native vertex format changed, force a flush. if (loader->m_native_vertex_format != s_current_vtx_fmt) diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index 80ea3b5bb9..bcdaf466a4 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -259,11 +259,12 @@ void VertexManager::CalculateZSlope(u32 stride) VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); - // viewport offset ignored because we only look at coordinate differences. 
- out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd; - out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht; + // Transform to Screenspace + out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd + (xfmem.viewport.xOrig - 342); + out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht + (xfmem.viewport.yOrig - 342); out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ; } + float dx31 = out[8] - out[0]; float dx12 = out[0] - out[4]; float dy12 = out[1] - out[5]; @@ -277,7 +278,7 @@ void VertexManager::CalculateZSlope(u32 stride) float slope_dfdx = -a / c; float slope_dfdy = -b / c; - float slope_f0 = out[2]; + float slope_f0 = out[2] - (out[0] * slope_dfdx + out[1] * slope_dfdy); PixelShaderManager::SetZSlope(slope_dfdx, slope_dfdy, slope_f0); } From 88c7afd315d06c27e459a8352f753cdeca5a1fe5 Mon Sep 17 00:00:00 2001 From: Scott Mansell Date: Sat, 3 Jan 2015 06:06:56 +1300 Subject: [PATCH 04/11] Make zfreeze use screenspace coordinates independent of IR. OpenGL requires the y coordinates to be flipped. Also refactored PixelGen code to remove duplicate code. 
--- Source/Core/VideoBackends/D3D/Render.cpp | 4 ++ Source/Core/VideoBackends/OGL/Render.cpp | 5 +++ Source/Core/VideoCommon/ConstantManager.h | 1 + Source/Core/VideoCommon/PixelShaderGen.cpp | 43 ++++++++++++------- Source/Core/VideoCommon/PixelShaderGen.h | 3 +- .../Core/VideoCommon/PixelShaderManager.cpp | 15 +++++++ Source/Core/VideoCommon/PixelShaderManager.h | 2 + Source/Core/VideoCommon/ShaderGenCommon.h | 1 + 8 files changed, 57 insertions(+), 17 deletions(-) diff --git a/Source/Core/VideoBackends/D3D/Render.cpp b/Source/Core/VideoBackends/D3D/Render.cpp index 268a408022..ea6bb924b6 100644 --- a/Source/Core/VideoBackends/D3D/Render.cpp +++ b/Source/Core/VideoBackends/D3D/Render.cpp @@ -33,6 +33,7 @@ #include "VideoCommon/ImageWrite.h" #include "VideoCommon/OnScreenDisplay.h" #include "VideoCommon/PixelEngine.h" +#include "VideoCommon/PixelShaderManager.h" #include "VideoCommon/Statistics.h" #include "VideoCommon/VertexShaderManager.h" #include "VideoCommon/VideoConfig.h" @@ -231,6 +232,7 @@ Renderer::Renderer(void *&window_handle) s_last_stereo_mode = g_ActiveConfig.iStereoMode > 0; s_last_xfb_mode = g_ActiveConfig.bUseRealXFB; CalculateTargetSize(s_backbuffer_width, s_backbuffer_height); + PixelShaderManager::SetEfbScaleChanged(); SetupDeviceObjects(); @@ -946,6 +948,8 @@ void Renderer::SwapImpl(u32 xfbAddr, u32 fbWidth, u32 fbStride, u32 fbHeight, co s_last_stereo_mode = g_ActiveConfig.iStereoMode > 0; CalculateTargetSize(s_backbuffer_width, s_backbuffer_height); + PixelShaderManager::SetEfbScaleChanged(); + D3D::context->OMSetRenderTargets(1, &D3D::GetBackBuffer()->GetRTV(), nullptr); delete g_framebuffer_manager; diff --git a/Source/Core/VideoBackends/OGL/Render.cpp b/Source/Core/VideoBackends/OGL/Render.cpp index 961d50f9c7..3d1991fe3f 100644 --- a/Source/Core/VideoBackends/OGL/Render.cpp +++ b/Source/Core/VideoBackends/OGL/Render.cpp @@ -43,6 +43,7 @@ #include "VideoCommon/ImageWrite.h" #include "VideoCommon/OnScreenDisplay.h" #include 
"VideoCommon/PixelEngine.h" +#include "VideoCommon/PixelShaderManager.h" #include "VideoCommon/Statistics.h" #include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VertexShaderGen.h" @@ -618,6 +619,8 @@ Renderer::Renderer() s_last_efb_scale = g_ActiveConfig.iEFBScale; CalculateTargetSize(s_backbuffer_width, s_backbuffer_height); + PixelShaderManager::SetEfbScaleChanged(); + // Because of the fixed framebuffer size we need to disable the resolution // options while running g_Config.bRunning = true; @@ -1681,6 +1684,8 @@ void Renderer::SwapImpl(u32 xfbAddr, u32 fbWidth, u32 fbStride, u32 fbHeight, co delete g_framebuffer_manager; g_framebuffer_manager = new FramebufferManager(s_target_width, s_target_height, s_MSAASamples); + + PixelShaderManager::SetEfbScaleChanged(); } } diff --git a/Source/Core/VideoCommon/ConstantManager.h b/Source/Core/VideoCommon/ConstantManager.h index b7b3d6664c..8fc1c221ed 100644 --- a/Source/Core/VideoCommon/ConstantManager.h +++ b/Source/Core/VideoCommon/ConstantManager.h @@ -24,6 +24,7 @@ struct PixelShaderConstants int4 fogi; float4 fogf[2]; float4 zslope; + float4 efbscale; }; struct VertexShaderConstants diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index 8bc7a7980b..5899f0100b 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -144,6 +144,7 @@ template static inline void WriteTevRegular(T& out, const char* compone template static inline void SampleTexture(T& out, const char *texcoords, const char *texswap, int texmap, API_TYPE ApiType); template static inline void WriteAlphaTest(T& out, pixel_shader_uid_data* uid_data, API_TYPE ApiType,DSTALPHA_MODE dstAlphaMode, bool per_pixel_depth); template static inline void WriteFog(T& out, pixel_shader_uid_data* uid_data); +template static inline void WritePerPixelDepth(T& out, pixel_shader_uid_data* uid_data, API_TYPE ApiType); template static inline void 
GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_TYPE ApiType, u32 components) @@ -229,6 +230,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T "\tint4 " I_FOGI";\n" "\tfloat4 " I_FOGF"[2];\n" "\tfloat4 " I_ZSLOPE";\n" + "\tfloat4 " I_EFBSCALE";\n" "};\n"); if (g_ActiveConfig.bEnablePixelLighting) @@ -544,14 +546,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T // Note: z-textures are not written to depth buffer if early depth test is used if (per_pixel_depth && bpmem.UseEarlyDepthTest()) { - if (bpmem.genMode.zfreeze) - { - out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * rawpos.x + " I_ZSLOPE".y * rawpos.y) / float(0xffffff);\n"); - } - else - { - out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n"); - } + WritePerPixelDepth(out, uid_data, ApiType); } // Note: depth texture output is only written to depth buffer if late depth test is used @@ -567,14 +562,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T if (per_pixel_depth && bpmem.UseLateDepthTest()) { - if (bpmem.genMode.zfreeze) - { - out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * rawpos.x + " I_ZSLOPE".y * rawpos.y) / float(0xffffff);\n"); - } - else - { - out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n"); - } + WritePerPixelDepth(out, uid_data, ApiType); } if (dstAlphaMode == DSTALPHA_ALPHA_PASS) @@ -1115,6 +1103,29 @@ static inline void WriteFog(T& out, pixel_shader_uid_data* uid_data) out.Write("\tprev.rgb = (prev.rgb * (256 - ifog) + " I_FOGCOLOR".rgb * ifog) >> 8;\n"); } +template +static inline void WritePerPixelDepth(T& out, pixel_shader_uid_data* uid_data, API_TYPE ApiType) +{ + if (bpmem.genMode.zfreeze) + { + out.SetConstantsUsed(C_ZSLOPE, C_ZSLOPE); + out.SetConstantsUsed(C_EFBSCALE, C_EFBSCALE); + + out.Write("\tfloat2 screenpos = rawpos.xy * " I_EFBSCALE".xy;\n"); + + // Opengl has reversed vertical screenspace coordiantes + if(ApiType == 
API_OPENGL) + out.Write("\tscreenpos.y = %i - screenpos.y - 1;\n", EFB_HEIGHT); + + out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * screenpos.x + " I_ZSLOPE".y * screenpos.y) / float(0xffffff);\n"); + } + else + { + out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n"); + } +} + + void GetPixelShaderUid(PixelShaderUid& object, DSTALPHA_MODE dstAlphaMode, API_TYPE ApiType, u32 components) { GeneratePixelShader(object, dstAlphaMode, ApiType, components); diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h index eb787fdc81..f8631e73a8 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.h +++ b/Source/Core/VideoCommon/PixelShaderGen.h @@ -22,8 +22,9 @@ #define C_FOGI (C_FOGCOLOR + 1) //28 #define C_FOGF (C_FOGI + 2) //29 #define C_ZSLOPE (C_FOGF + 1) //31 +#define C_EFBSCALE (C_ZSLOPE + 1) //32 -#define C_PENVCONST_END (C_ZSLOPE + 2) +#define C_PENVCONST_END (C_EFBSCALE + 1) // Different ways to achieve rendering with destination alpha enum DSTALPHA_MODE diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp index d0004a0921..a94dfaf991 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/PixelShaderManager.cpp @@ -14,6 +14,7 @@ bool PixelShaderManager::s_bFogRangeAdjustChanged; bool PixelShaderManager::s_bViewPortChanged; +bool PixelShaderManager::s_bEFBScaleChanged; std::array PixelShaderManager::s_tev_color; std::array PixelShaderManager::s_tev_konst_color; @@ -48,6 +49,7 @@ void PixelShaderManager::Dirty() SetDestAlpha(); SetZTextureBias(); SetViewportChanged(); + SetEfbScaleChanged(); SetZSlope(0, 0, 1); SetIndTexScaleChanged(false); SetIndTexScaleChanged(true); @@ -113,6 +115,13 @@ void PixelShaderManager::SetConstants() dirty = true; s_bViewPortChanged = false; } + + if (s_bEFBScaleChanged) { + constants.efbscale[0] = 1.0f / float(Renderer::EFBToScaledXf(1)); + constants.efbscale[1] = 1.0f / 
float(Renderer::EFBToScaledYf(1)); + dirty = true; + s_bEFBScaleChanged = false; + } } void PixelShaderManager::SetTevColor(int index, int component, s32 value) @@ -169,6 +178,12 @@ void PixelShaderManager::SetViewportChanged() s_bFogRangeAdjustChanged = true; // TODO: Shouldn't be necessary with an accurate fog range adjust implementation } +void PixelShaderManager::SetEfbScaleChanged() +{ + s_bEFBScaleChanged = true; + s_bViewPortChanged = true; +} + void PixelShaderManager::SetZSlope(float dfdx, float dfdy, float f0) { constants.zslope[0] = dfdx; diff --git a/Source/Core/VideoCommon/PixelShaderManager.h b/Source/Core/VideoCommon/PixelShaderManager.h index faa15cff7e..421fd3393c 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.h +++ b/Source/Core/VideoCommon/PixelShaderManager.h @@ -36,6 +36,7 @@ public: static void SetTexDims(int texmapid, u32 width, u32 height, u32 wraps, u32 wrapt); static void SetZTextureBias(); static void SetViewportChanged(); + static void SetEfbScaleChanged(); static void SetZSlope(float dfdx, float dfdy, float f0); static void SetIndMatrixChanged(int matrixidx); static void SetTevKSelChanged(int id); @@ -51,6 +52,7 @@ public: static bool s_bFogRangeAdjustChanged; static bool s_bViewPortChanged; + static bool s_bEFBScaleChanged; // These colors aren't available from global BP state, // hence we keep a copy of them around. 
diff --git a/Source/Core/VideoCommon/ShaderGenCommon.h b/Source/Core/VideoCommon/ShaderGenCommon.h index dd80fd3987..4698392f9e 100644 --- a/Source/Core/VideoCommon/ShaderGenCommon.h +++ b/Source/Core/VideoCommon/ShaderGenCommon.h @@ -292,6 +292,7 @@ static inline void AssignVSOutputMembers(T& object, const char* a, const char* b #define I_FOGI "cfogi" #define I_FOGF "cfogf" #define I_ZSLOPE "czslope" +#define I_EFBSCALE "cefbscale" #define I_POSNORMALMATRIX "cpnmtx" #define I_PROJECTION "cproj" From 6d5065c58d06e14a0c92632ce794958ddff977f3 Mon Sep 17 00:00:00 2001 From: Scott Mansell Date: Sat, 3 Jan 2015 09:30:52 +1300 Subject: [PATCH 05/11] Fix pixelshader constant offsets. --- Source/Core/VideoCommon/PixelShaderGen.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h index f8631e73a8..b37fa59ab3 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.h +++ b/Source/Core/VideoCommon/PixelShaderGen.h @@ -20,8 +20,8 @@ #define C_INDTEXMTX (C_INDTEXSCALE + 2) //21 #define C_FOGCOLOR (C_INDTEXMTX + 6) //27 #define C_FOGI (C_FOGCOLOR + 1) //28 -#define C_FOGF (C_FOGI + 2) //29 -#define C_ZSLOPE (C_FOGF + 1) //31 +#define C_FOGF (C_FOGI + 1) //29 +#define C_ZSLOPE (C_FOGF + 2) //31 #define C_EFBSCALE (C_ZSLOPE + 1) //32 #define C_PENVCONST_END (C_EFBSCALE + 1) From add59b3bea032e331363dcf28361f8e0de65e43d Mon Sep 17 00:00:00 2001 From: NanoByte011 Date: Tue, 13 Jan 2015 02:55:25 -0700 Subject: [PATCH 06/11] Fixes Mario Tennis Gimmick Courts and adds support for FastDepthCalc - Calculate ZSlope every flush but only set PixelShader Constant on Reset Buffer when zfreeze - Fixed another Pixel Shader bug in D3D that was giving me grief --- .../Core/VideoBackends/D3D/VertexManager.cpp | 9 ++++---- .../Core/VideoBackends/OGL/VertexManager.cpp | 9 ++++---- Source/Core/VideoCommon/PixelShaderGen.cpp | 18 ++++++++++----- .../Core/VideoCommon/PixelShaderManager.cpp | 5 +++-- 
Source/Core/VideoCommon/VertexManagerBase.cpp | 22 +++++++++++++------ Source/Core/VideoCommon/VertexManagerBase.h | 12 ++++++++-- .../Core/VideoCommon/VertexShaderManager.cpp | 3 +-- 7 files changed, 52 insertions(+), 26 deletions(-) diff --git a/Source/Core/VideoBackends/D3D/VertexManager.cpp b/Source/Core/VideoBackends/D3D/VertexManager.cpp index 2c38ac9d22..8546ed8ca7 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.cpp +++ b/Source/Core/VideoBackends/D3D/VertexManager.cpp @@ -181,12 +181,10 @@ void VertexManager::vFlush(bool useDstAlpha) PrepareDrawBuffers(stride); - if (!bpmem.genMode.zfreeze && IndexGenerator::GetIndexLen() >= 3) - { + if (!bpmem.genMode.zfreeze) CalculateZSlope(stride); - } - // if cull mode is CULL_ALL, ignore triangles and quads + // If cull mode is CULL_ALL, do not render these triangles if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) return; @@ -202,6 +200,9 @@ void VertexManager::ResetBuffer(u32 stride) { s_pCurBufferPointer = s_pBaseBufferPointer; IndexGenerator::Start(GetIndexBuffer()); + + if (bpmem.genMode.zfreeze) + PixelShaderManager::SetZSlope(ZSlope.dfdx, ZSlope.dfdy, ZSlope.f0); } } // namespace diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index 859a3b8db4..81c377fd02 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -89,6 +89,9 @@ void VertexManager::ResetBuffer(u32 stride) buffer = s_indexBuffer->Map(MAXIBUFFERSIZE * sizeof(u16)); IndexGenerator::Start((u16*)buffer.first); s_index_offset = buffer.second; + + if (bpmem.genMode.zfreeze) + PixelShaderManager::SetZSlope(ZSlope.dfdx, ZSlope.dfdy, ZSlope.f0); } void VertexManager::Draw(u32 stride) @@ -140,12 +143,10 @@ void VertexManager::vFlush(bool useDstAlpha) PrepareDrawBuffers(stride); - if (!bpmem.genMode.zfreeze && IndexGenerator::GetIndexLen() >= 3) - { + if (!bpmem.genMode.zfreeze) 
CalculateZSlope(stride); - } - // if cull mode is CULL_ALL, ignore triangles and quads + // If cull mode is CULL_ALL, do not render these triangles if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) return; diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index 5899f0100b..bd57f0c888 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -271,7 +271,11 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T GenerateVSOutputMembers(out, ApiType); out.Write("};\n"); - const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED); + const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() + && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) + // We can't allow early_ztest for zfreeze because a reference poly is used + // to control the depth and we need a depth test after the alpha test. + && !bpmem.genMode.zfreeze; const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) || bpmem.genMode.zfreeze; if (forced_early_z) @@ -365,7 +369,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T out.Write("void main(\n"); out.Write(" out float4 ocol0 : SV_Target0,%s%s\n in float4 rawpos : SV_Position,\n", dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND ? "\n out float4 ocol1 : SV_Target1," : "", - per_pixel_depth ? "\n out float depth : SV_Depth," : ""); + (per_pixel_depth && bpmem.zmode.testenable) ? 
"\n out float depth : SV_Depth," : ""); out.Write(" in centroid float4 colors_0 : COLOR0,\n"); out.Write(" in centroid float4 colors_1 : COLOR1\n"); @@ -1023,7 +1027,11 @@ static inline void WriteAlphaTest(T& out, pixel_shader_uid_data* uid_data, API_T // Tests seem to have proven that writing depth even when the alpha test fails is more // important that a reliable alpha test, so we just force the alpha test to always succeed. // At least this seems to be less buggy. - uid_data->alpha_test_use_zcomploc_hack = bpmem.UseEarlyDepthTest() && bpmem.zmode.updateenable && !g_ActiveConfig.backend_info.bSupportsEarlyZ; + uid_data->alpha_test_use_zcomploc_hack = bpmem.UseEarlyDepthTest() + && bpmem.zmode.updateenable + && !g_ActiveConfig.backend_info.bSupportsEarlyZ + && !bpmem.genMode.zfreeze; // Might not be neccessary + if (!uid_data->alpha_test_use_zcomploc_hack) { out.Write("\t\tdiscard;\n"); @@ -1114,10 +1122,10 @@ static inline void WritePerPixelDepth(T& out, pixel_shader_uid_data* uid_data, A out.Write("\tfloat2 screenpos = rawpos.xy * " I_EFBSCALE".xy;\n"); // Opengl has reversed vertical screenspace coordiantes - if(ApiType == API_OPENGL) + if (ApiType == API_OPENGL) out.Write("\tscreenpos.y = %i - screenpos.y - 1;\n", EFB_HEIGHT); - out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * screenpos.x + " I_ZSLOPE".y * screenpos.y) / float(0xffffff);\n"); + out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * screenpos.x + " I_ZSLOPE".y * screenpos.y) / float(0xFFFFFF);\n"); } else { diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp index a94dfaf991..f80fc114ce 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/PixelShaderManager.cpp @@ -50,7 +50,7 @@ void PixelShaderManager::Dirty() SetZTextureBias(); SetViewportChanged(); SetEfbScaleChanged(); - SetZSlope(0, 0, 1); + SetZSlope(0, 0, (float)0xFFFFFF); SetIndTexScaleChanged(false); SetIndTexScaleChanged(true); 
SetIndMatrixChanged(0); @@ -116,7 +116,8 @@ void PixelShaderManager::SetConstants() s_bViewPortChanged = false; } - if (s_bEFBScaleChanged) { + if (s_bEFBScaleChanged) + { constants.efbscale[0] = 1.0f / float(Renderer::EFBToScaledXf(1)); constants.efbscale[1] = 1.0f / float(Renderer::EFBToScaledYf(1)); dirty = true; diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index bcdaf466a4..23eb770c6d 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -25,6 +25,8 @@ u8 *VertexManager::s_pEndBufferPointer; PrimitiveType VertexManager::current_primitive_type; +Slope VertexManager::ZSlope; + bool VertexManager::IsFlushed; static const PrimitiveType primitive_from_gx[8] = { @@ -246,6 +248,8 @@ void VertexManager::CalculateZSlope(u32 stride) { float vtx[9]; float out[12]; + float viewOffset[2] = { xfmem.viewport.xOrig - bpmem.scissorOffset.x * 2, + xfmem.viewport.yOrig - bpmem.scissorOffset.y * 2}; // Lookup vertices of the last rendered triangle and software-transform them // This allows us to determine the depth slope, which will be used if zfreeze @@ -260,9 +264,11 @@ void VertexManager::CalculateZSlope(u32 stride) VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); // Transform to Screenspace - out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd + (xfmem.viewport.xOrig - 342); - out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht + (xfmem.viewport.yOrig - 342); - out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ; + float w = out[3 + i * 4]; + + out[0 + i * 4] = out[0 + i * 4] / w * xfmem.viewport.wd + viewOffset[0]; + out[1 + i * 4] = out[1 + i * 4] / w * xfmem.viewport.ht + viewOffset[1]; + out[2 + i * 4] = out[2 + i * 4] / w * xfmem.viewport.zRange + xfmem.viewport.farZ; } float dx31 = out[8] - out[0]; @@ -276,9 +282,11 @@ void 
VertexManager::CalculateZSlope(u32 stride) float b = dx31 * DF21 + dx12 * DF31; float c = -dx12 * dy31 - dx31 * -dy12; - float slope_dfdx = -a / c; - float slope_dfdy = -b / c; - float slope_f0 = out[2] - (out[0] * slope_dfdx + out[1] * slope_dfdy); + // Stop divide by zero + if (c == 0) + return; - PixelShaderManager::SetZSlope(slope_dfdx, slope_dfdy, slope_f0); + ZSlope.dfdx = -a / c; + ZSlope.dfdy = -b / c; + ZSlope.f0 = out[2] - (out[0] * ZSlope.dfdx + out[1] * ZSlope.dfdy); } diff --git a/Source/Core/VideoCommon/VertexManagerBase.h b/Source/Core/VideoCommon/VertexManagerBase.h index 524f3e5a0c..143e6b811c 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.h +++ b/Source/Core/VideoCommon/VertexManagerBase.h @@ -14,6 +14,13 @@ enum PrimitiveType { PRIMITIVE_TRIANGLES, }; +struct Slope +{ + float dfdx; + float dfdy; + float f0; +}; + class VertexManager { private: @@ -41,8 +48,6 @@ public: static void DoState(PointerWrap& p); - static void CalculateZSlope(u32 stride); - protected: virtual void vDoState(PointerWrap& p) { } @@ -57,6 +62,9 @@ protected: static u32 GetRemainingSize(); static u32 GetRemainingIndices(int primitive); + static Slope ZSlope; + static void CalculateZSlope(u32 stride); + private: static bool IsFlushed; diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp index a745f7004f..5320e0af2e 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.cpp +++ b/Source/Core/VideoCommon/VertexShaderManager.cpp @@ -692,6 +692,7 @@ void VertexShaderManager::ResetView() void VertexShaderManager::TransformToClipSpace(const float* data, float *out) { + // Can we use constants.posnormalmatrix here instead? 
const float *world_matrix = (const float *)xfmem.posMatrices + g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4; const float *proj_matrix = &g_fProjectionMatrix[0]; @@ -700,8 +701,6 @@ void VertexShaderManager::TransformToClipSpace(const float* data, float *out) t[1] = data[0] * world_matrix[4] + data[1] * world_matrix[5] + data[2] * world_matrix[6] + world_matrix[7]; t[2] = data[0] * world_matrix[8] + data[1] * world_matrix[9] + data[2] * world_matrix[10] + world_matrix[11]; - // TODO: this requires g_fProjectionMatrix to be up to date, which is not really a good design decision. - out[0] = t[0] * proj_matrix[0] + t[1] * proj_matrix[1] + t[2] * proj_matrix[2] + proj_matrix[3]; out[1] = t[0] * proj_matrix[4] + t[1] * proj_matrix[5] + t[2] * proj_matrix[6] + proj_matrix[7]; out[2] = t[0] * proj_matrix[8] + t[1] * proj_matrix[9] + t[2] * proj_matrix[10] + proj_matrix[11]; From 128d3036564e99fd3c8d7e6ae45eb17e43d0c95f Mon Sep 17 00:00:00 2001 From: Scott Mansell Date: Fri, 16 Jan 2015 04:01:00 +1300 Subject: [PATCH 07/11] Reduce number of divisions in screenspace transform. This is closer to what the hardware does anyway. 
--- Source/Core/VideoCommon/VertexManagerBase.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index 23eb770c6d..c9f5e2c714 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -264,11 +264,11 @@ void VertexManager::CalculateZSlope(u32 stride) VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); // Transform to Screenspace - float w = out[3 + i * 4]; + float inv_w = 1.0f / out[3 + i * 4]; - out[0 + i * 4] = out[0 + i * 4] / w * xfmem.viewport.wd + viewOffset[0]; - out[1 + i * 4] = out[1 + i * 4] / w * xfmem.viewport.ht + viewOffset[1]; - out[2 + i * 4] = out[2 + i * 4] / w * xfmem.viewport.zRange + xfmem.viewport.farZ; + out[0 + i * 4] = out[0 + i * 4] * inv_w * xfmem.viewport.wd + viewOffset[0]; + out[1 + i * 4] = out[1 + i * 4] * inv_w * xfmem.viewport.ht + viewOffset[1]; + out[2 + i * 4] = out[2 + i * 4] * inv_w * xfmem.viewport.zRange + xfmem.viewport.farZ; } float dx31 = out[8] - out[0]; From e88c02dece0e5e8d3b019fd0137eab790d6d2036 Mon Sep 17 00:00:00 2001 From: Scott Mansell Date: Fri, 16 Jan 2015 05:29:39 +1300 Subject: [PATCH 08/11] Ensure that ZSlopes save/restore state correctly. Had to re-do *ShaderManager so they saved their constant arrays instead of completely rebuilding them on restore state. 
--- Source/Core/Core/State.cpp | 2 +- .../VideoCommon/GeometryShaderManager.cpp | 22 +++--- .../Core/VideoCommon/PixelShaderManager.cpp | 70 +++++++------------ Source/Core/VideoCommon/PixelShaderManager.h | 6 -- Source/Core/VideoCommon/VertexManagerBase.cpp | 1 + .../Core/VideoCommon/VertexShaderManager.cpp | 51 ++++++++------ 6 files changed, 72 insertions(+), 80 deletions(-) diff --git a/Source/Core/Core/State.cpp b/Source/Core/Core/State.cpp index a63fafc2a5..2b3ce95e08 100644 --- a/Source/Core/Core/State.cpp +++ b/Source/Core/Core/State.cpp @@ -64,7 +64,7 @@ static Common::Event g_compressAndDumpStateSyncEvent; static std::thread g_save_thread; // Don't forget to increase this after doing changes on the savestate system -static const u32 STATE_VERSION = 38; +static const u32 STATE_VERSION = 39; enum { diff --git a/Source/Core/VideoCommon/GeometryShaderManager.cpp b/Source/Core/VideoCommon/GeometryShaderManager.cpp index 3e96592625..0a07cf683c 100644 --- a/Source/Core/VideoCommon/GeometryShaderManager.cpp +++ b/Source/Core/VideoCommon/GeometryShaderManager.cpp @@ -26,7 +26,11 @@ void GeometryShaderManager::Init() { memset(&constants, 0, sizeof(constants)); - Dirty(); + // Init any intial constants which aren't zero when bpmem is zero. + SetViewportChanged(); + SetProjectionChanged(); + + dirty = true; } void GeometryShaderManager::Shutdown() @@ -35,12 +39,9 @@ void GeometryShaderManager::Shutdown() void GeometryShaderManager::Dirty() { - SetViewportChanged(); - SetProjectionChanged(); - SetLinePtWidthChanged(); - - for (int i = 0; i < 8; i++) - SetTexCoordChanged(i); + // This function is called after a savestate is loaded. 
+ // Any constants that can changed based on settings should be re-calculated + s_projection_changed = true; dirty = true; } @@ -110,9 +111,14 @@ void GeometryShaderManager::SetTexCoordChanged(u8 texmapid) void GeometryShaderManager::DoState(PointerWrap &p) { + p.Do(s_projection_changed); + p.Do(s_viewport_changed); + + p.Do(constants); + if (p.GetMode() == PointerWrap::MODE_READ) { - // Reload current state from global GPU state + // Fixup the current state from global GPU state // NOTE: This requires that all GPU memory has been loaded already. Dirty(); } diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp index f80fc114ce..b1a68248ee 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/PixelShaderManager.cpp @@ -14,10 +14,6 @@ bool PixelShaderManager::s_bFogRangeAdjustChanged; bool PixelShaderManager::s_bViewPortChanged; -bool PixelShaderManager::s_bEFBScaleChanged; - -std::array PixelShaderManager::s_tev_color; -std::array PixelShaderManager::s_tev_konst_color; PixelShaderConstants PixelShaderManager::constants; bool PixelShaderManager::dirty; @@ -25,34 +21,12 @@ bool PixelShaderManager::dirty; void PixelShaderManager::Init() { memset(&constants, 0, sizeof(constants)); - memset(s_tev_color.data(), 0, sizeof(s_tev_color)); - memset(s_tev_konst_color.data(), 0, sizeof(s_tev_konst_color)); - Dirty(); -} - -void PixelShaderManager::Dirty() -{ + // Init any intial constants which aren't zero when bpmem is zero. 
s_bFogRangeAdjustChanged = true; - s_bViewPortChanged = true; + s_bViewPortChanged = false; - for (unsigned index = 0; index < s_tev_color.size(); ++index) - { - for (int comp = 0; comp < 4; ++comp) - { - SetTevColor(index, comp, s_tev_color[index][comp]); - SetTevKonstColor(index, comp, s_tev_konst_color[index][comp]); - } - } - - SetAlpha(); - SetDestAlpha(); - SetZTextureBias(); - SetViewportChanged(); SetEfbScaleChanged(); - SetZSlope(0, 0, (float)0xFFFFFF); - SetIndTexScaleChanged(false); - SetIndTexScaleChanged(true); SetIndMatrixChanged(0); SetIndMatrixChanged(1); SetIndMatrixChanged(2); @@ -65,8 +39,20 @@ void PixelShaderManager::Dirty() SetTexCoordChanged(5); SetTexCoordChanged(6); SetTexCoordChanged(7); - SetFogColorChanged(); + + dirty = true; +} + +void PixelShaderManager::Dirty() +{ + // This function is called after a savestate is loaded. + // Any constants that can changed based on settings should be re-calculated + s_bFogRangeAdjustChanged = true; + + SetEfbScaleChanged(); SetFogParamChanged(); + + dirty = true; } void PixelShaderManager::Shutdown() @@ -115,20 +101,12 @@ void PixelShaderManager::SetConstants() dirty = true; s_bViewPortChanged = false; } - - if (s_bEFBScaleChanged) - { - constants.efbscale[0] = 1.0f / float(Renderer::EFBToScaledXf(1)); - constants.efbscale[1] = 1.0f / float(Renderer::EFBToScaledYf(1)); - dirty = true; - s_bEFBScaleChanged = false; - } } void PixelShaderManager::SetTevColor(int index, int component, s32 value) { auto& c = constants.colors[index]; - c[component] = s_tev_color[index][component] = value; + c[component] = value; dirty = true; PRIM_LOG("tev color%d: %d %d %d %d\n", index, c[0], c[1], c[2], c[3]); @@ -137,7 +115,7 @@ void PixelShaderManager::SetTevColor(int index, int component, s32 value) void PixelShaderManager::SetTevKonstColor(int index, int component, s32 value) { auto& c = constants.kcolors[index]; - c[component] = s_tev_konst_color[index][component] = value; + c[component] = value; dirty = true; 
PRIM_LOG("tev konst color%d: %d %d %d %d\n", index, c[0], c[1], c[2], c[3]); @@ -181,8 +159,9 @@ void PixelShaderManager::SetViewportChanged() void PixelShaderManager::SetEfbScaleChanged() { - s_bEFBScaleChanged = true; - s_bViewPortChanged = true; + constants.efbscale[0] = 1.0f / float(Renderer::EFBToScaledXf(1)); + constants.efbscale[1] = 1.0f / float(Renderer::EFBToScaledYf(1)); + dirty = true; } void PixelShaderManager::SetZSlope(float dfdx, float dfdy, float f0) @@ -190,7 +169,6 @@ void PixelShaderManager::SetZSlope(float dfdx, float dfdy, float f0) constants.zslope[0] = dfdx; constants.zslope[1] = dfdy; constants.zslope[2] = f0; - constants.zslope[3] = 0; dirty = true; } @@ -304,12 +282,14 @@ void PixelShaderManager::SetFogRangeAdjustChanged() void PixelShaderManager::DoState(PointerWrap &p) { - p.DoArray(s_tev_color); - p.DoArray(s_tev_konst_color); + p.Do(s_bFogRangeAdjustChanged); + p.Do(s_bViewPortChanged); + + p.Do(constants); if (p.GetMode() == PointerWrap::MODE_READ) { - // Reload current state from global GPU state + // Fixup the current state from global GPU state // NOTE: This requires that all GPU memory has been loaded already. Dirty(); } diff --git a/Source/Core/VideoCommon/PixelShaderManager.h b/Source/Core/VideoCommon/PixelShaderManager.h index 421fd3393c..0f3dba3699 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.h +++ b/Source/Core/VideoCommon/PixelShaderManager.h @@ -52,10 +52,4 @@ public: static bool s_bFogRangeAdjustChanged; static bool s_bViewPortChanged; - static bool s_bEFBScaleChanged; - - // These colors aren't available from global BP state, - // hence we keep a copy of them around. 
- static std::array s_tev_color; - static std::array s_tev_konst_color; }; diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index c9f5e2c714..3f6c672c65 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -241,6 +241,7 @@ void VertexManager::Flush() void VertexManager::DoState(PointerWrap& p) { + p.Do(ZSlope); g_vertex_manager->vDoState(p); } diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp index 5320e0af2e..c6cf8edce5 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.cpp +++ b/Source/Core/VideoCommon/VertexShaderManager.cpp @@ -167,7 +167,21 @@ static void ViewportCorrectionMatrix(Matrix44& result) void VertexShaderManager::Init() { - Dirty(); + // Initialize state tracking variables + nTransformMatricesChanged[0] = -1; + nTransformMatricesChanged[1] = -1; + nNormalMatricesChanged[0] = -1; + nNormalMatricesChanged[1] = -1; + nPostTransformMatricesChanged[0] = -1; + nPostTransformMatricesChanged[1] = -1; + nLightsChanged[0] = -1; + nLightsChanged[1] = -1; + nMaterialsChanged = BitSet32(0); + bTexMatricesChanged[0] = false; + bTexMatricesChanged[1] = false; + bPosNormalMatrixChanged = false; + bProjectionChanged = true; + bViewportChanged = false; memset(&xfmem, 0, sizeof(xfmem)); memset(&constants, 0 , sizeof(constants)); @@ -178,6 +192,8 @@ void VertexShaderManager::Init() memset(g_fProjectionMatrix, 0, sizeof(g_fProjectionMatrix)); for (int i = 0; i < 4; ++i) g_fProjectionMatrix[i*5] = 1.0f; + + dirty = true; } void VertexShaderManager::Shutdown() @@ -186,26 +202,10 @@ void VertexShaderManager::Shutdown() void VertexShaderManager::Dirty() { - nTransformMatricesChanged[0] = 0; - nTransformMatricesChanged[1] = 256; - - nNormalMatricesChanged[0] = 0; - nNormalMatricesChanged[1] = 96; - - nPostTransformMatricesChanged[0] = 0; - nPostTransformMatricesChanged[1] = 256; - - 
nLightsChanged[0] = 0; - nLightsChanged[1] = 0x80; - - bPosNormalMatrixChanged = true; - bTexMatricesChanged[0] = true; - bTexMatricesChanged[1] = true; - + // This function is called after a savestate is loaded. + // Any constants that can changed based on settings should be re-calculated bProjectionChanged = true; - nMaterialsChanged = BitSet32::AllTrue(4); - dirty = true; } @@ -715,8 +715,19 @@ void VertexShaderManager::DoState(PointerWrap &p) p.Do(s_viewInvRotationMatrix); p.Do(s_fViewTranslationVector); p.Do(s_fViewRotation); + + p.Do(nTransformMatricesChanged); + p.Do(nNormalMatricesChanged); + p.Do(nPostTransformMatricesChanged); + p.Do(nLightsChanged); + + p.Do(nMaterialsChanged); + p.Do(bTexMatricesChanged); + p.Do(bPosNormalMatrixChanged); + p.Do(bProjectionChanged); + p.Do(bViewportChanged); + p.Do(constants); - p.Do(dirty); if (p.GetMode() == PointerWrap::MODE_READ) { From daf760b20245c2e397a98da94e80c15be9090657 Mon Sep 17 00:00:00 2001 From: Scott Mansell Date: Fri, 23 Jan 2015 04:38:36 +1300 Subject: [PATCH 09/11] A few small cleanups based on code review. 
--- Source/Core/VideoCommon/PixelShaderGen.cpp | 16 ++++++++-------- Source/Core/VideoCommon/PixelShaderGen.h | 3 +++ Source/Core/VideoCommon/VertexManagerBase.cpp | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index bd57f0c888..7c76628dd4 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -272,10 +272,10 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T out.Write("};\n"); const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() - && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) - // We can't allow early_ztest for zfreeze because a reference poly is used - // to control the depth and we need a depth test after the alpha test. - && !bpmem.genMode.zfreeze; + && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) + // We can't allow early_ztest for zfreeze because depth is overridden per-pixel. + // This means it's impossible for zcomploc to be emulated on a zfrozen polygon. + && !bpmem.genMode.zfreeze; const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) || bpmem.genMode.zfreeze; if (forced_early_z) @@ -1028,9 +1028,9 @@ static inline void WriteAlphaTest(T& out, pixel_shader_uid_data* uid_data, API_T // important that a reliable alpha test, so we just force the alpha test to always succeed. // At least this seems to be less buggy. 
uid_data->alpha_test_use_zcomploc_hack = bpmem.UseEarlyDepthTest() - && bpmem.zmode.updateenable - && !g_ActiveConfig.backend_info.bSupportsEarlyZ - && !bpmem.genMode.zfreeze; // Might not be neccessary + && bpmem.zmode.updateenable + && !g_ActiveConfig.backend_info.bSupportsEarlyZ + && !bpmem.genMode.zfreeze; if (!uid_data->alpha_test_use_zcomploc_hack) { @@ -1123,7 +1123,7 @@ static inline void WritePerPixelDepth(T& out, pixel_shader_uid_data* uid_data, A // Opengl has reversed vertical screenspace coordiantes if (ApiType == API_OPENGL) - out.Write("\tscreenpos.y = %i - screenpos.y - 1;\n", EFB_HEIGHT); + out.Write("\tscreenpos.y = %i - screenpos.y;\n", EFB_HEIGHT); out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * screenpos.x + " I_ZSLOPE".y * screenpos.y) / float(0xFFFFFF);\n"); } diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h index b37fa59ab3..6d063ca419 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.h +++ b/Source/Core/VideoCommon/PixelShaderGen.h @@ -64,7 +64,10 @@ struct pixel_shader_uid_data u32 forced_early_z : 1; u32 early_ztest : 1; u32 bounding_box : 1; + + // TODO: 31 bits of padding is a waste. Can we free up some bits elseware? 
u32 zfreeze : 1; + u32 pad : 31; u32 texMtxInfo_n_projection : 8; // 8x1 bit u32 tevindref_bi0 : 3; diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index 3f6c672c65..4c13a26736 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -250,7 +250,7 @@ void VertexManager::CalculateZSlope(u32 stride) float vtx[9]; float out[12]; float viewOffset[2] = { xfmem.viewport.xOrig - bpmem.scissorOffset.x * 2, - xfmem.viewport.yOrig - bpmem.scissorOffset.y * 2}; + xfmem.viewport.yOrig - bpmem.scissorOffset.y * 2}; // Lookup vertices of the last rendered triangle and software-transform them // This allows us to determine the depth slope, which will be used if zfreeze From 5510c86b8133ec734ba490b700d4f432d98f71a7 Mon Sep 17 00:00:00 2001 From: Scott Mansell Date: Sat, 24 Jan 2015 03:15:09 +1300 Subject: [PATCH 10/11] Move Zfreeze code out of individual backends into VideoCommon Also: * Implement support for per-vertex PosMatrixIndex * Only update zslope constant once when zfreeze is activated. * Added a bunch of comments. 
--- .../VideoBackends/D3D/NativeVertexFormat.cpp | 2 +- .../Core/VideoBackends/D3D/VertexManager.cpp | 10 ---- .../VideoBackends/OGL/NativeVertexFormat.cpp | 2 +- .../Core/VideoBackends/OGL/VertexManager.cpp | 10 ---- Source/Core/VideoBackends/OGL/VertexManager.h | 2 - Source/Core/VideoCommon/NativeVertexFormat.h | 5 +- Source/Core/VideoCommon/VertexManagerBase.cpp | 52 ++++++++++++++++--- Source/Core/VideoCommon/VertexManagerBase.h | 4 +- .../Core/VideoCommon/VertexShaderManager.cpp | 8 +-- Source/Core/VideoCommon/VertexShaderManager.h | 4 +- 10 files changed, 59 insertions(+), 40 deletions(-) diff --git a/Source/Core/VideoBackends/D3D/NativeVertexFormat.cpp b/Source/Core/VideoBackends/D3D/NativeVertexFormat.cpp index a0762578e1..abd9c569b2 100644 --- a/Source/Core/VideoBackends/D3D/NativeVertexFormat.cpp +++ b/Source/Core/VideoBackends/D3D/NativeVertexFormat.cpp @@ -59,7 +59,7 @@ DXGI_FORMAT VarToD3D(VarType t, int size, bool integer) void D3DVertexFormat::Initialize(const PortableVertexDeclaration &_vtx_decl) { - vertex_stride = _vtx_decl.stride; + vtx_decl = _vtx_decl; memset(m_elems, 0, sizeof(m_elems)); const AttributeFormat* format = &_vtx_decl.position; diff --git a/Source/Core/VideoBackends/D3D/VertexManager.cpp b/Source/Core/VideoBackends/D3D/VertexManager.cpp index 8546ed8ca7..5bd39d45b2 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.cpp +++ b/Source/Core/VideoBackends/D3D/VertexManager.cpp @@ -181,13 +181,6 @@ void VertexManager::vFlush(bool useDstAlpha) PrepareDrawBuffers(stride); - if (!bpmem.genMode.zfreeze) - CalculateZSlope(stride); - - // If cull mode is CULL_ALL, do not render these triangles - if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) - return; - VertexLoaderManager::GetCurrentVertexFormat()->SetupVertexPointers(); g_renderer->ApplyState(useDstAlpha); @@ -200,9 +193,6 @@ void VertexManager::ResetBuffer(u32 stride) { s_pCurBufferPointer = s_pBaseBufferPointer; 
IndexGenerator::Start(GetIndexBuffer()); - - if (bpmem.genMode.zfreeze) - PixelShaderManager::SetZSlope(ZSlope.dfdx, ZSlope.dfdy, ZSlope.f0); } } // namespace diff --git a/Source/Core/VideoBackends/OGL/NativeVertexFormat.cpp b/Source/Core/VideoBackends/OGL/NativeVertexFormat.cpp index 4207a5a4af..8b1e4ded24 100644 --- a/Source/Core/VideoBackends/OGL/NativeVertexFormat.cpp +++ b/Source/Core/VideoBackends/OGL/NativeVertexFormat.cpp @@ -58,7 +58,7 @@ static void SetPointer(u32 attrib, u32 stride, const AttributeFormat &format) void GLVertexFormat::Initialize(const PortableVertexDeclaration &_vtx_decl) { this->vtx_decl = _vtx_decl; - vertex_stride = vtx_decl.stride; + u32 vertex_stride = _vtx_decl.stride; // We will not allow vertex components causing uneven strides. if (vertex_stride & 3) diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index 81c377fd02..1a162b1cde 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -89,9 +89,6 @@ void VertexManager::ResetBuffer(u32 stride) buffer = s_indexBuffer->Map(MAXIBUFFERSIZE * sizeof(u16)); IndexGenerator::Start((u16*)buffer.first); s_index_offset = buffer.second; - - if (bpmem.genMode.zfreeze) - PixelShaderManager::SetZSlope(ZSlope.dfdx, ZSlope.dfdy, ZSlope.f0); } void VertexManager::Draw(u32 stride) @@ -143,13 +140,6 @@ void VertexManager::vFlush(bool useDstAlpha) PrepareDrawBuffers(stride); - if (!bpmem.genMode.zfreeze) - CalculateZSlope(stride); - - // If cull mode is CULL_ALL, do not render these triangles - if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) - return; - // Makes sure we can actually do Dual source blending bool dualSourcePossible = g_ActiveConfig.backend_info.bSupportsDualSourceBlend; diff --git a/Source/Core/VideoBackends/OGL/VertexManager.h b/Source/Core/VideoBackends/OGL/VertexManager.h index f0c6ae9109..ab400cf43d 100644 --- 
a/Source/Core/VideoBackends/OGL/VertexManager.h +++ b/Source/Core/VideoBackends/OGL/VertexManager.h @@ -13,8 +13,6 @@ namespace OGL { class GLVertexFormat : public NativeVertexFormat { - PortableVertexDeclaration vtx_decl; - public: GLVertexFormat(); ~GLVertexFormat(); diff --git a/Source/Core/VideoCommon/NativeVertexFormat.h b/Source/Core/VideoCommon/NativeVertexFormat.h index 024f4f070d..cefda66c52 100644 --- a/Source/Core/VideoCommon/NativeVertexFormat.h +++ b/Source/Core/VideoCommon/NativeVertexFormat.h @@ -109,7 +109,8 @@ public: virtual void Initialize(const PortableVertexDeclaration &vtx_decl) = 0; virtual void SetupVertexPointers() = 0; - u32 GetVertexStride() const { return vertex_stride; } + u32 GetVertexStride() const { return vtx_decl.stride; } + PortableVertexDeclaration GetVertexDeclaration() const { return vtx_decl; } // TODO: move this under private: u32 m_components; // VB_HAS_X. Bitmask telling what vertex components are present. @@ -118,5 +119,5 @@ protected: // Let subclasses construct. 
NativeVertexFormat() {} - u32 vertex_stride; + PortableVertexDeclaration vtx_decl; }; diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index 4c13a26736..75f6de97f8 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -12,6 +12,7 @@ #include "VideoCommon/RenderBase.h" #include "VideoCommon/Statistics.h" #include "VideoCommon/TextureCacheBase.h" +#include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VertexShaderManager.h" #include "VideoCommon/VideoConfig.h" @@ -220,6 +221,30 @@ void VertexManager::Flush() GeometryShaderManager::SetConstants(); PixelShaderManager::SetConstants(); + // Calculate ZSlope for zfreeze + if (!bpmem.genMode.zfreeze) + { + // Must be done after VertexShaderManager::SetConstants() + CalculateZSlope(VertexLoaderManager::GetCurrentVertexFormat()); + } + else if (ZSlope.dirty) // or apply any dirty ZSlopes + { + PixelShaderManager::SetZSlope(ZSlope.dfdx, ZSlope.dfdy, ZSlope.f0); + ZSlope.dirty = false; + } + + // If cull mode is CULL_ALL, we shouldn't render any triangles/quads (points and lines don't get culled) + // vertex loader has already converted any quads into triangles, so we just check for triangles. + // TODO: These culled primites need to get this far through the pipeline to be used as zfreeze refrence + // planes. But currently we apply excessive processing and store the vertices in buffers on the + // video card, which is a waste of bandwidth. 
+ if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) + { + GFX_DEBUGGER_PAUSE_AT(NEXT_FLUSH, true); + IsFlushed = true; + return; + } + bool useDstAlpha = !g_ActiveConfig.bDstAlphaPass && bpmem.dstalpha.enable && bpmem.blendmode.alphaupdate && @@ -245,24 +270,34 @@ void VertexManager::DoState(PointerWrap& p) g_vertex_manager->vDoState(p); } -void VertexManager::CalculateZSlope(u32 stride) +void VertexManager::CalculateZSlope(NativeVertexFormat *format) { float vtx[9]; float out[12]; float viewOffset[2] = { xfmem.viewport.xOrig - bpmem.scissorOffset.x * 2, xfmem.viewport.yOrig - bpmem.scissorOffset.y * 2}; + // Global matrix ID. + u32 mtxIdx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx; + PortableVertexDeclaration vert_decl = format->GetVertexDeclaration(); + size_t posOff = vert_decl.position.offset; + size_t mtxOff = vert_decl.posmtx.offset; + // Lookup vertices of the last rendered triangle and software-transform them - // This allows us to determine the depth slope, which will be used if zfreeze + // This allows us to determine the depth slope, which will be used if z--freeze // is enabled in the following flush. for (unsigned int i = 0; i < 3; ++i) { - u8* vtx_ptr = s_pCurBufferPointer - stride * (3 - i); - vtx[0 + i * 3] = ((float*)vtx_ptr)[0]; - vtx[1 + i * 3] = ((float*)vtx_ptr)[1]; - vtx[2 + i * 3] = ((float*)vtx_ptr)[2]; + u8* vtx_ptr = s_pCurBufferPointer - vert_decl.stride * (3 - i); + vtx[0 + i * 3] = ((float*)(vtx_ptr + posOff))[0]; + vtx[1 + i * 3] = ((float*)(vtx_ptr + posOff))[1]; + vtx[2 + i * 3] = ((float*)(vtx_ptr + posOff))[2]; - VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); + // If this vertex format has per-vertex position matrix IDs, look it up. 
+ if(vert_decl.posmtx.enable) + mtxIdx = *((u32*)(vtx_ptr + mtxOff)); + + VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4], mtxIdx); // Transform to Screenspace float inv_w = 1.0f / out[3 + i * 4]; @@ -283,11 +318,12 @@ void VertexManager::CalculateZSlope(u32 stride) float b = dx31 * DF21 + dx12 * DF31; float c = -dx12 * dy31 - dx31 * -dy12; - // Stop divide by zero + // Sometimes we process de-generate triangles. Stop any divide by zeros if (c == 0) return; ZSlope.dfdx = -a / c; ZSlope.dfdy = -b / c; ZSlope.f0 = out[2] - (out[0] * ZSlope.dfdx + out[1] * ZSlope.dfdy); + ZSlope.dirty = true; } diff --git a/Source/Core/VideoCommon/VertexManagerBase.h b/Source/Core/VideoCommon/VertexManagerBase.h index 143e6b811c..9bcd71d3b3 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.h +++ b/Source/Core/VideoCommon/VertexManagerBase.h @@ -4,6 +4,7 @@ #include "Common/CommonFuncs.h" #include "Common/CommonTypes.h" #include "VideoCommon/DataReader.h" +#include "VideoCommon/NativeVertexFormat.h" class NativeVertexFormat; class PointerWrap; @@ -19,6 +20,7 @@ struct Slope float dfdx; float dfdy; float f0; + bool dirty; }; class VertexManager @@ -63,7 +65,7 @@ protected: static u32 GetRemainingIndices(int primitive); static Slope ZSlope; - static void CalculateZSlope(u32 stride); + static void CalculateZSlope(NativeVertexFormat *format); private: static bool IsFlushed; diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp index c6cf8edce5..92c1fffa34 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.cpp +++ b/Source/Core/VideoCommon/VertexShaderManager.cpp @@ -690,10 +690,12 @@ void VertexShaderManager::ResetView() bProjectionChanged = true; } -void VertexShaderManager::TransformToClipSpace(const float* data, float *out) +void VertexShaderManager::TransformToClipSpace(const float* data, float *out, u32 MtxIdx) { - // Can we use constants.posnormalmatrix here instead? 
- const float *world_matrix = (const float *)xfmem.posMatrices + g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4; + const float *world_matrix = (const float *)xfmem.posMatrices + (MtxIdx & 0x3f) * 4; + // We use the projection matrix calculated by VertexShaderManager, because it + // includes any free look transformations. + // Make sure VertexShaderManager::SetConstants() has been called first. const float *proj_matrix = &g_fProjectionMatrix[0]; float t[3]; diff --git a/Source/Core/VideoCommon/VertexShaderManager.h b/Source/Core/VideoCommon/VertexShaderManager.h index 229ba1f599..9689cd8238 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.h +++ b/Source/Core/VideoCommon/VertexShaderManager.h @@ -34,11 +34,11 @@ public: static void RotateView(float x, float y); static void ResetView(); - // data: 3 floats representing the X, Y and Z vertex model coordinates + // data: 3 floats representing the X, Y and Z vertex model coordinates and the posmatrix index. // out: 4 floats which will be initialized with the corresponding clip space coordinates // NOTE: g_fProjectionMatrix must be up to date when this is called // (i.e. VertexShaderManager::SetConstants needs to be called before using this!) - static void TransformToClipSpace(const float* data, float *out); + static void TransformToClipSpace(const float* data, float *out, u32 mtxIdx); static VertexShaderConstants constants; static bool dirty; From 14baf038e70a598a6af1ce71719bb557425941f9 Mon Sep 17 00:00:00 2001 From: Scott Mansell Date: Sat, 24 Jan 2015 14:37:20 +1300 Subject: [PATCH 11/11] Stop doing nasty shit to OpenGL stream buffers. Instead we keep the loaded vertices in CPU memory. 
--- .../Core/VideoBackends/OGL/VertexManager.cpp | 27 ++++-- Source/Core/VideoBackends/OGL/VertexManager.h | 4 + Source/Core/VideoCommon/NativeVertexFormat.h | 2 +- .../Core/VideoCommon/VertexLoaderManager.cpp | 6 +- Source/Core/VideoCommon/VertexManagerBase.cpp | 90 ++++++++++--------- Source/Core/VideoCommon/VertexManagerBase.h | 4 +- 6 files changed, 79 insertions(+), 54 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index 1a162b1cde..5b89e0507e 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -43,6 +43,8 @@ static size_t s_index_offset; VertexManager::VertexManager() { CreateDeviceObjects(); + CpuVBuffer.resize(MAX_VBUFFER_SIZE); + CpuIBuffer.resize(MAX_IBUFFER_SIZE); } VertexManager::~VertexManager() @@ -81,14 +83,25 @@ void VertexManager::PrepareDrawBuffers(u32 stride) void VertexManager::ResetBuffer(u32 stride) { - auto buffer = s_vertexBuffer->Map(MAXVBUFFERSIZE, stride); - s_pCurBufferPointer = s_pBaseBufferPointer = buffer.first; - s_pEndBufferPointer = buffer.first + MAXVBUFFERSIZE; - s_baseVertex = buffer.second / stride; + if (CullAll) + { + // This buffer isn't getting sent to the GPU. Just allocate it on the cpu. 
+ s_pCurBufferPointer = s_pBaseBufferPointer = CpuVBuffer.data(); + s_pEndBufferPointer = s_pBaseBufferPointer + CpuVBuffer.size(); - buffer = s_indexBuffer->Map(MAXIBUFFERSIZE * sizeof(u16)); - IndexGenerator::Start((u16*)buffer.first); - s_index_offset = buffer.second; + IndexGenerator::Start((u16*)CpuIBuffer.data()); + } + else + { + auto buffer = s_vertexBuffer->Map(MAXVBUFFERSIZE, stride); + s_pCurBufferPointer = s_pBaseBufferPointer = buffer.first; + s_pEndBufferPointer = buffer.first + MAXVBUFFERSIZE; + s_baseVertex = buffer.second / stride; + + buffer = s_indexBuffer->Map(MAXIBUFFERSIZE * sizeof(u16)); + IndexGenerator::Start((u16*)buffer.first); + s_index_offset = buffer.second; + } } void VertexManager::Draw(u32 stride) diff --git a/Source/Core/VideoBackends/OGL/VertexManager.h b/Source/Core/VideoBackends/OGL/VertexManager.h index ab400cf43d..ba6e49c466 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.h +++ b/Source/Core/VideoBackends/OGL/VertexManager.h @@ -45,6 +45,10 @@ private: void Draw(u32 stride); void vFlush(bool useDstAlpha) override; void PrepareDrawBuffers(u32 stride); + + // Alternative buffers in CPU memory for primitives we are going to discard. + std::vector CpuVBuffer; + std::vector CpuIBuffer; }; } diff --git a/Source/Core/VideoCommon/NativeVertexFormat.h b/Source/Core/VideoCommon/NativeVertexFormat.h index cefda66c52..5a77c3eb03 100644 --- a/Source/Core/VideoCommon/NativeVertexFormat.h +++ b/Source/Core/VideoCommon/NativeVertexFormat.h @@ -110,7 +110,7 @@ public: virtual void SetupVertexPointers() = 0; u32 GetVertexStride() const { return vtx_decl.stride; } - PortableVertexDeclaration GetVertexDeclaration() const { return vtx_decl; } + const PortableVertexDeclaration& GetVertexDeclaration() const { return vtx_decl; } // TODO: move this under private: u32 m_components; // VB_HAS_X. Bitmask telling what vertex components are present. 
diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index 9cc8861186..e0115a0d29 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -157,8 +157,12 @@ int RunVertices(int vtx_attr_group, int primitive, int count, DataReader src, bo VertexManager::Flush(); s_current_vtx_fmt = loader->m_native_vertex_format; + // if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads. + // They still need to go through vertex loading, because we need to calculate a zfreeze reference slope. + bool cullall = (bpmem.genMode.cullmode == GenMode::CULL_ALL && primitive < 5); + DataReader dst = VertexManager::PrepareForAdditionalData(primitive, count, - loader->m_native_vtx_decl.stride); + loader->m_native_vtx_decl.stride, cullall); count = loader->RunVertices(primitive, count, src, dst); diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index 75f6de97f8..0c5ccdd10d 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -29,6 +29,7 @@ PrimitiveType VertexManager::current_primitive_type; Slope VertexManager::ZSlope; bool VertexManager::IsFlushed; +bool VertexManager::CullAll; static const PrimitiveType primitive_from_gx[8] = { PRIMITIVE_TRIANGLES, // GX_DRAW_QUADS @@ -44,6 +45,7 @@ static const PrimitiveType primitive_from_gx[8] = { VertexManager::VertexManager() { IsFlushed = true; + CullAll = false; } VertexManager::~VertexManager() @@ -55,7 +57,7 @@ u32 VertexManager::GetRemainingSize() return (u32)(s_pEndBufferPointer - s_pCurBufferPointer); } -DataReader VertexManager::PrepareForAdditionalData(int primitive, u32 count, u32 stride) +DataReader VertexManager::PrepareForAdditionalData(int primitive, u32 count, u32 stride, bool cullall) { // The SSE vertex loader can write up to 4 bytes past the end u32 const needed_vertex_bytes = count 
* stride + 4; @@ -81,6 +83,8 @@ DataReader VertexManager::PrepareForAdditionalData(int primitive, u32 count, u32 "Increase MAXVBUFFERSIZE or we need primitive breaking after all."); } + CullAll = cullall; + // need to alloc new buffer if (IsFlushed) { @@ -192,34 +196,36 @@ void VertexManager::Flush() (int)bpmem.genMode.numtexgens, (u32)bpmem.dstalpha.enable, (bpmem.alpha_test.hex>>16)&0xff); #endif - BitSet32 usedtextures; - for (u32 i = 0; i < bpmem.genMode.numtevstages + 1u; ++i) - if (bpmem.tevorders[i / 2].getEnable(i & 1)) - usedtextures[bpmem.tevorders[i/2].getTexMap(i & 1)] = true; - - if (bpmem.genMode.numindstages > 0) - for (unsigned int i = 0; i < bpmem.genMode.numtevstages + 1u; ++i) - if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < bpmem.genMode.numindstages) - usedtextures[bpmem.tevindref.getTexMap(bpmem.tevind[i].bt)] = true; - - for (unsigned int i : usedtextures) + // If the primitave is marked CullAll. All we need to do is update the vertex constants and calculate the zfreeze refrence slope + if (!CullAll) { - g_renderer->SetSamplerState(i & 3, i >> 2); - const TextureCache::TCacheEntryBase* tentry = TextureCache::Load(i); + BitSet32 usedtextures; + for (u32 i = 0; i < bpmem.genMode.numtevstages + 1u; ++i) + if (bpmem.tevorders[i / 2].getEnable(i & 1)) + usedtextures[bpmem.tevorders[i/2].getTexMap(i & 1)] = true; - if (tentry) + if (bpmem.genMode.numindstages > 0) + for (unsigned int i = 0; i < bpmem.genMode.numtevstages + 1u; ++i) + if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < bpmem.genMode.numindstages) + usedtextures[bpmem.tevindref.getTexMap(bpmem.tevind[i].bt)] = true; + + for (unsigned int i : usedtextures) { - // 0s are probably for no manual wrapping needed. 
- PixelShaderManager::SetTexDims(i, tentry->native_width, tentry->native_height, 0, 0); + g_renderer->SetSamplerState(i & 3, i >> 2); + const TextureCache::TCacheEntryBase* tentry = TextureCache::Load(i); + + if (tentry) + { + // 0s are probably for no manual wrapping needed. + PixelShaderManager::SetTexDims(i, tentry->native_width, tentry->native_height, 0, 0); + } + else + ERROR_LOG(VIDEO, "error loading texture"); } - else - ERROR_LOG(VIDEO, "error loading texture"); } - // set global constants + // set global vertex constants VertexShaderManager::SetConstants(); - GeometryShaderManager::SetConstants(); - PixelShaderManager::SetConstants(); // Calculate ZSlope for zfreeze if (!bpmem.genMode.zfreeze) @@ -227,41 +233,37 @@ void VertexManager::Flush() // Must be done after VertexShaderManager::SetConstants() CalculateZSlope(VertexLoaderManager::GetCurrentVertexFormat()); } - else if (ZSlope.dirty) // or apply any dirty ZSlopes + else if (ZSlope.dirty && !CullAll) // or apply any dirty ZSlopes { PixelShaderManager::SetZSlope(ZSlope.dfdx, ZSlope.dfdy, ZSlope.f0); ZSlope.dirty = false; } - // If cull mode is CULL_ALL, we shouldn't render any triangles/quads (points and lines don't get culled) - // vertex loader has already converted any quads into triangles, so we just check for triangles. - // TODO: These culled primites need to get this far through the pipeline to be used as zfreeze refrence - // planes. But currently we apply excessive processing and store the vertices in buffers on the - // video card, which is a waste of bandwidth. 
- if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) + if (!CullAll) { - GFX_DEBUGGER_PAUSE_AT(NEXT_FLUSH, true); - IsFlushed = true; - return; + // set the rest of the global constants + GeometryShaderManager::SetConstants(); + PixelShaderManager::SetConstants(); + + bool useDstAlpha = !g_ActiveConfig.bDstAlphaPass && + bpmem.dstalpha.enable && + bpmem.blendmode.alphaupdate && + bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24; + + if (PerfQueryBase::ShouldEmulate()) + g_perf_query->EnableQuery(bpmem.zcontrol.early_ztest ? PQG_ZCOMP_ZCOMPLOC : PQG_ZCOMP); + g_vertex_manager->vFlush(useDstAlpha); + if (PerfQueryBase::ShouldEmulate()) + g_perf_query->DisableQuery(bpmem.zcontrol.early_ztest ? PQG_ZCOMP_ZCOMPLOC : PQG_ZCOMP); } - bool useDstAlpha = !g_ActiveConfig.bDstAlphaPass && - bpmem.dstalpha.enable && - bpmem.blendmode.alphaupdate && - bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24; - - if (PerfQueryBase::ShouldEmulate()) - g_perf_query->EnableQuery(bpmem.zcontrol.early_ztest ? PQG_ZCOMP_ZCOMPLOC : PQG_ZCOMP); - g_vertex_manager->vFlush(useDstAlpha); - if (PerfQueryBase::ShouldEmulate()) - g_perf_query->DisableQuery(bpmem.zcontrol.early_ztest ? PQG_ZCOMP_ZCOMPLOC : PQG_ZCOMP); - GFX_DEBUGGER_PAUSE_AT(NEXT_FLUSH, true); if (xfmem.numTexGen.numTexGens != bpmem.genMode.numtexgens) ERROR_LOG(VIDEO, "xf.numtexgens (%d) does not match bp.numtexgens (%d). Error in command stream.", xfmem.numTexGen.numTexGens, bpmem.genMode.numtexgens.Value()); IsFlushed = true; + CullAll = false; } void VertexManager::DoState(PointerWrap& p) @@ -279,7 +281,7 @@ void VertexManager::CalculateZSlope(NativeVertexFormat *format) // Global matrix ID. 
u32 mtxIdx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx; - PortableVertexDeclaration vert_decl = format->GetVertexDeclaration(); + const PortableVertexDeclaration vert_decl = format->GetVertexDeclaration(); size_t posOff = vert_decl.position.offset; size_t mtxOff = vert_decl.posmtx.offset; diff --git a/Source/Core/VideoCommon/VertexManagerBase.h b/Source/Core/VideoCommon/VertexManagerBase.h index 9bcd71d3b3..4369438bc5 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.h +++ b/Source/Core/VideoCommon/VertexManagerBase.h @@ -41,7 +41,7 @@ public: // needs to be virtual for DX11's dtor virtual ~VertexManager(); - static DataReader PrepareForAdditionalData(int primitive, u32 count, u32 stride); + static DataReader PrepareForAdditionalData(int primitive, u32 count, u32 stride, bool cullall); static void FlushData(u32 count, u32 stride); static void Flush(); @@ -67,6 +67,8 @@ protected: static Slope ZSlope; static void CalculateZSlope(NativeVertexFormat *format); + static bool CullAll; + private: static bool IsFlushed;