diff --git a/Source/Core/Common/Src/VideoBackendBase.h b/Source/Core/Common/Src/VideoBackendBase.h index 311fa66104..f1c8b0a131 100644 --- a/Source/Core/Common/Src/VideoBackendBase.h +++ b/Source/Core/Common/Src/VideoBackendBase.h @@ -22,6 +22,7 @@ #include #include "ChunkFile.h" +#include "../../VideoCommon/Src/PerfQueryBase.h" typedef void (*writeFn16)(const u16,const u32); typedef void (*writeFn32)(const u32,const u32); @@ -107,6 +108,7 @@ public: virtual void Video_EndField() = 0; virtual u32 Video_AccessEFB(EFBAccessType, u32, u32, u32) = 0; + virtual u32 Video_GetQueryResult(PerfQueryType type) = 0; virtual void Video_AddMessage(const char* pstr, unsigned int milliseconds) = 0; virtual void Video_ClearMessages() = 0; @@ -156,8 +158,10 @@ class VideoBackendHardware : public VideoBackend void Video_ExitLoop(); void Video_BeginField(u32, FieldType, u32, u32); void Video_EndField(); - u32 Video_AccessEFB(EFBAccessType, u32, u32, u32); + u32 Video_AccessEFB(EFBAccessType, u32, u32, u32); + u32 Video_GetQueryResult(PerfQueryType type); + void Video_AddMessage(const char* pstr, unsigned int milliseconds); void Video_ClearMessages(); bool Video_Screenshot(const char* filename); diff --git a/Source/Core/VideoCommon/CMakeLists.txt b/Source/Core/VideoCommon/CMakeLists.txt index cba3666399..89b6828281 100644 --- a/Source/Core/VideoCommon/CMakeLists.txt +++ b/Source/Core/VideoCommon/CMakeLists.txt @@ -16,6 +16,7 @@ set(SRCS Src/BPFunctions.cpp Src/OpcodeDecoding.cpp Src/OpenCL.cpp Src/OpenCL/OCLTextureDecoder.cpp + Src/PerfQueryBase.cpp Src/PixelEngine.cpp Src/PixelShaderGen.cpp Src/PixelShaderManager.cpp diff --git a/Source/Core/VideoCommon/Src/BPMemory.h b/Source/Core/VideoCommon/Src/BPMemory.h index 85e95f4f8b..0ee00f911e 100644 --- a/Source/Core/VideoCommon/Src/BPMemory.h +++ b/Source/Core/VideoCommon/Src/BPMemory.h @@ -62,7 +62,7 @@ #define BPMEM_COPYFILTER1 0x54 #define BPMEM_CLEARBBOX1 0x55 #define BPMEM_CLEARBBOX2 0x56 -#define BPMEM_UNKNOWN_57 0x57 +#define BPMEM_CLEAR_PIXEL_PERF 0x57 #define BPMEM_REVBITS 0x58 #define BPMEM_SCISSOROFFSET 0x59 #define BPMEM_PRELOAD_ADDR 0x60 diff --git a/Source/Core/VideoCommon/Src/BPStructs.cpp b/Source/Core/VideoCommon/Src/BPStructs.cpp index 0d7d741a92..fa6bc08966 100644 --- a/Source/Core/VideoCommon/Src/BPStructs.cpp +++ b/Source/Core/VideoCommon/Src/BPStructs.cpp @@ -31,6 +31,7 @@ #include "VertexShaderManager.h" #include "Thread.h" #include "HW/Memmap.h" +#include "PerfQueryBase.h" using namespace BPFunctions; @@ -62,7 +63,6 @@ void RenderToXFB(const BPCmd &bp, const EFBRectangle &rc, float yScale, float xf { Renderer::RenderToXFB(xfbAddr, dstWidth, dstHeight, rc, gamma); } - void BPWritten(const BPCmd& bp) { /* @@ -144,7 +144,8 @@ void BPWritten(const BPCmd& bp) || bp.address == BPMEM_LOADTLUT0 || bp.address == BPMEM_LOADTLUT1 || bp.address == BPMEM_TEXINVALIDATE - || bp.address == BPMEM_PRELOAD_MODE)) + || bp.address == BPMEM_PRELOAD_MODE + || bp.address == BPMEM_CLEAR_PIXEL_PERF)) { return; } @@ -484,9 +485,10 @@ void BPWritten(const BPCmd& bp) case BPMEM_IND_IMASK: // Index Mask ? case BPMEM_REVBITS: // Always set to 0x0F when GX_InitRevBits() is called. break; - - case BPMEM_UNKNOWN_57: // Sunshine alternates this register between values 0x000 and 0xAAA - DEBUG_LOG(VIDEO, "Unknown BP Reg 0x57: %08x", bp.newvalue); + + case BPMEM_CLEAR_PIXEL_PERF: + // GXClearPixMetric writes 0xAAA here, Sunshine alternates this register between values 0x000 and 0xAAA + g_perf_query->ResetQuery(); break; case BPMEM_PRELOAD_ADDR: diff --git a/Source/Core/VideoCommon/Src/MainBase.cpp b/Source/Core/VideoCommon/Src/MainBase.cpp index a881bebfb2..1472367f21 100644 --- a/Source/Core/VideoCommon/Src/MainBase.cpp +++ b/Source/Core/VideoCommon/Src/MainBase.cpp @@ -21,6 +21,10 @@ volatile u32 s_swapRequested = false; u32 s_efbAccessRequested = false; volatile u32 s_FifoShuttingDown = false; +std::condition_variable s_perf_query_cond; +std::mutex s_perf_query_lock; +static volatile bool s_perf_query_requested; + static volatile struct { u32 xfbAddr; @@ -169,6 +173,43 @@ u32 VideoBackendHardware::Video_AccessEFB(EFBAccessType type, u32 x, u32 y, u32 return 0; } +static bool QueryResultIsReady() +{ + return !s_perf_query_requested || s_FifoShuttingDown; +} + +void VideoFifo_CheckPerfQueryRequest() +{ + if (s_perf_query_requested) + { + g_perf_query->FlushResults(); + + { + std::lock_guard lk(s_perf_query_lock); + s_perf_query_requested = false; + } + + s_perf_query_cond.notify_one(); + } +} + +u32 VideoBackendHardware::Video_GetQueryResult(PerfQueryType type) +{ + // TODO: Is this check sane? + if (!g_perf_query->IsFlushed()) + { + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread) + { + s_perf_query_requested = true; + std::unique_lock lk(s_perf_query_lock); + s_perf_query_cond.wait(lk, QueryResultIsReady); + } + else + g_perf_query->FlushResults(); + } + + return g_perf_query->GetQueryResult(type); +} void VideoBackendHardware::InitializeShared() { @@ -176,6 +217,7 @@ void VideoBackendHardware::InitializeShared() s_swapRequested = 0; s_efbAccessRequested = 0; + s_perf_query_requested = false; s_FifoShuttingDown = 0; memset((void*)&s_beginFieldArgs, 0, sizeof(s_beginFieldArgs)); memset(&s_accessEFBArgs, 0, sizeof(s_accessEFBArgs)); @@ -238,6 +280,7 @@ void VideoFifo_CheckAsyncRequest() { VideoFifo_CheckSwapRequest(); VideoFifo_CheckEFBAccess(); + VideoFifo_CheckPerfQueryRequest(); } void VideoBackendHardware::Video_GatherPipeBursted() diff --git a/Source/Core/VideoCommon/Src/OnScreenDisplay.h b/Source/Core/VideoCommon/Src/OnScreenDisplay.h index 3777e2b5d3..80187b8ac3 100644 --- a/Source/Core/VideoCommon/Src/OnScreenDisplay.h +++ b/Source/Core/VideoCommon/Src/OnScreenDisplay.h @@ -22,7 +22,7 @@ namespace OSD { // On-screen message display -void AddMessage(const char* str, u32 ms); +void AddMessage(const char* str, u32 ms = 2000); void DrawMessages(); // draw the current messages on the screen. Only call once per frame. void ClearMessages(); diff --git a/Source/Core/VideoCommon/Src/PerfQueryBase.cpp b/Source/Core/VideoCommon/Src/PerfQueryBase.cpp new file mode 100644 index 0000000000..c537d176f6 --- /dev/null +++ b/Source/Core/VideoCommon/Src/PerfQueryBase.cpp @@ -0,0 +1,3 @@ +#include "PerfQueryBase.h" + +PerfQueryBase* g_perf_query = 0; diff --git a/Source/Core/VideoCommon/Src/PerfQueryBase.h b/Source/Core/VideoCommon/Src/PerfQueryBase.h new file mode 100644 index 0000000000..b979449edb --- /dev/null +++ b/Source/Core/VideoCommon/Src/PerfQueryBase.h @@ -0,0 +1,54 @@ +#ifndef _PERFQUERY_BASE_H_ +#define _PERFQUERY_BASE_H_ + +#include "CommonTypes.h" + +enum PerfQueryType +{ + PQ_ZCOMP_INPUT_ZCOMPLOC = 0, + PQ_ZCOMP_OUTPUT_ZCOMPLOC, + PQ_ZCOMP_INPUT, + PQ_ZCOMP_OUTPUT, + PQ_BLEND_INPUT, + PQ_EFB_COPY_CLOCKS, + PQ_NUM_MEMBERS +}; + +enum PerfQueryGroup +{ + PQG_ZCOMP_ZCOMPLOC, + PQG_ZCOMP, + PQG_EFB_COPY_CLOCKS, + PQG_NUM_MEMBERS, +}; + +class PerfQueryBase +{ +public: + PerfQueryBase() {}; + virtual ~PerfQueryBase() {} + + // Begin querying the specified value for the following host GPU commands + virtual void EnableQuery(PerfQueryGroup type) {} + + // Stop querying the specified value for the following host GPU commands + virtual void DisableQuery(PerfQueryGroup type) {} + + // Reset query counters to zero and drop any pending queries + virtual void ResetQuery() {} + + // Return the measured value for the specified query type + // NOTE: Called from CPU thread + virtual u32 GetQueryResult(PerfQueryType type) { return 0; } + + // Request the value of any pending queries - causes a pipeline flush and thus should be used carefully! + virtual void FlushResults() {} + + // True if there are no further pending query results + // NOTE: Called from CPU thread + virtual bool IsFlushed() const { return true; } +}; + +extern PerfQueryBase* g_perf_query; + +#endif // _PERFQUERY_H_ diff --git a/Source/Core/VideoCommon/Src/PixelEngine.cpp b/Source/Core/VideoCommon/Src/PixelEngine.cpp index a488e52b42..e5ba554678 100644 --- a/Source/Core/VideoCommon/Src/PixelEngine.cpp +++ b/Source/Core/VideoCommon/Src/PixelEngine.cpp @@ -28,10 +28,13 @@ #include "ConfigManager.h" #include "PixelEngine.h" +#include "RenderBase.h" #include "CommandProcessor.h" #include "HW/ProcessorInterface.h" #include "DLCache.h" #include "State.h" +#include "PerfQueryBase.h" + namespace PixelEngine { @@ -255,23 +258,59 @@ void Read16(u16& _uReturnValue, const u32 _iAddress) break; } - case PE_PERF_0L: - case PE_PERF_0H: - case PE_PERF_1L: - case PE_PERF_1H: - case PE_PERF_2L: - case PE_PERF_2H: - case PE_PERF_3L: - case PE_PERF_3H: - case PE_PERF_4L: - case PE_PERF_4H: - case PE_PERF_5L: - case PE_PERF_5H: - INFO_LOG(PIXELENGINE, "(r16) perf counter @ %08x", _iAddress); - // git r90a2096a24f4 (svn r3663) added the PE_PERF cases, without setting - // _uReturnValue to anything, this reverts to the previous behaviour which allows - // The timer in SMS:Scrubbing Serena Beach to countdown correctly - _uReturnValue = 1; + // NOTE(neobrain): only PE_PERF_ZCOMP_OUTPUT is implemented in D3D11, but the other values shouldn't be contradictionary to the value of that register (i.e. INPUT registers should always be greater or equal to their corresponding OUTPUT registers). + case PE_PERF_ZCOMP_INPUT_ZCOMPLOC_L: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_ZCOMP_INPUT_ZCOMPLOC) & 0xFFFF; + break; + + case PE_PERF_ZCOMP_INPUT_ZCOMPLOC_H: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_ZCOMP_INPUT_ZCOMPLOC) >> 16; + break; + + case PE_PERF_ZCOMP_OUTPUT_ZCOMPLOC_L: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_ZCOMP_OUTPUT_ZCOMPLOC) & 0xFFFF; + break; + + case PE_PERF_ZCOMP_OUTPUT_ZCOMPLOC_H: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_ZCOMP_OUTPUT_ZCOMPLOC) >> 16; + break; + + case PE_PERF_ZCOMP_INPUT_L: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_ZCOMP_INPUT) & 0xFFFF; + break; + + case PE_PERF_ZCOMP_INPUT_H: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_ZCOMP_INPUT) >> 16; + break; + + case PE_PERF_ZCOMP_OUTPUT_L: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_ZCOMP_OUTPUT) & 0xFFFF; + break; + + case PE_PERF_ZCOMP_OUTPUT_H: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_ZCOMP_OUTPUT) >> 16; + break; + + case PE_PERF_BLEND_INPUT_L: + // Super Mario Sunshine uses this register in episode 6 of Sirena Beach: + // The amount of remaining goop is determined by checking how many pixels reach the blending stage. + // Once this register falls below a particular value (around 0x90), the game regards the challenge finished. + // In very old builds, Dolphin only returned 0. That caused the challenge to be immediately finished without any goop being cleaned (the timer just didn't even start counting from 3:00:00). + // Later builds returned 1 for the high register. That caused the timer to actually count down, but made the challenge unbeatable because the game always thought you didn't clear any goop at all. + // Note that currently this functionality is only implemented in the D3D11 backend. + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_BLEND_INPUT) & 0xFFFF; + break; + + case PE_PERF_BLEND_INPUT_H: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_BLEND_INPUT) >> 16; + break; + + case PE_PERF_EFB_COPY_CLOCKS_L: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_EFB_COPY_CLOCKS) & 0xFFFF; + break; + + case PE_PERF_EFB_COPY_CLOCKS_H: + _uReturnValue = g_video_backend->Video_GetQueryResult(PQ_EFB_COPY_CLOCKS) >> 16; break; default: diff --git a/Source/Core/VideoCommon/Src/PixelEngine.h b/Source/Core/VideoCommon/Src/PixelEngine.h index 64f959009f..eaf55f0031 100644 --- a/Source/Core/VideoCommon/Src/PixelEngine.h +++ b/Source/Core/VideoCommon/Src/PixelEngine.h @@ -36,19 +36,20 @@ enum PE_BBOX_TOP = 0x14, // Flip Top PE_BBOX_BOTTOM = 0x16, // Flip Bottom - // These have not yet been RE:d. They are the perf counters. - PE_PERF_0L = 0x18, - PE_PERF_0H = 0x1a, - PE_PERF_1L = 0x1c, - PE_PERF_1H = 0x1e, - PE_PERF_2L = 0x20, - PE_PERF_2H = 0x22, - PE_PERF_3L = 0x24, - PE_PERF_3H = 0x26, - PE_PERF_4L = 0x28, - PE_PERF_4H = 0x2a, - PE_PERF_5L = 0x2c, - PE_PERF_5H = 0x2e, + // NOTE: Order not verified + // These indicate the number of quads that are being used as input/output for each particular stage + PE_PERF_ZCOMP_INPUT_ZCOMPLOC_L = 0x18, + PE_PERF_ZCOMP_INPUT_ZCOMPLOC_H = 0x1a, + PE_PERF_ZCOMP_OUTPUT_ZCOMPLOC_L = 0x1c, + PE_PERF_ZCOMP_OUTPUT_ZCOMPLOC_H = 0x1e, + PE_PERF_ZCOMP_INPUT_L = 0x20, + PE_PERF_ZCOMP_INPUT_H = 0x22, + PE_PERF_ZCOMP_OUTPUT_L = 0x24, + PE_PERF_ZCOMP_OUTPUT_H = 0x26, + PE_PERF_BLEND_INPUT_L = 0x28, + PE_PERF_BLEND_INPUT_H = 0x2a, + PE_PERF_EFB_COPY_CLOCKS_L = 0x2c, + PE_PERF_EFB_COPY_CLOCKS_H = 0x2e, }; namespace PixelEngine diff --git a/Source/Core/VideoCommon/Src/RenderBase.h b/Source/Core/VideoCommon/Src/RenderBase.h index 8b5505e082..55678f3f5a 100644 --- a/Source/Core/VideoCommon/Src/RenderBase.h +++ b/Source/Core/VideoCommon/Src/RenderBase.h @@ -52,6 +52,15 @@ public: Renderer(); virtual ~Renderer(); + enum PixelPerfQuery { + PP_ZCOMP_INPUT_ZCOMPLOC, + PP_ZCOMP_OUTPUT_ZCOMPLOC, + PP_ZCOMP_INPUT, + PP_ZCOMP_OUTPUT, + PP_BLEND_INPUT, + PP_EFB_COPY_CLOCKS + }; + virtual void SetColorMask() = 0; virtual void SetBlendMode(bool forceUpdate) = 0; virtual void SetScissorRect(const TargetRectangle& rc) = 0; diff --git a/Source/Core/VideoCommon/Src/VertexManagerBase.cpp b/Source/Core/VideoCommon/Src/VertexManagerBase.cpp index cd5a01c6b3..520fc21090 100644 --- a/Source/Core/VideoCommon/Src/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/Src/VertexManagerBase.cpp @@ -257,7 +257,9 @@ void VertexManager::Flush() //if (g_nativeVertexFmt) g_nativeVertexFmt->SetupVertexPointers(); + g_renderer->ResumePixelPerf(false); g_vertex_manager->Draw(stride, false); + g_renderer->PausePixelPerf(false); // run through vertex groups again to set alpha if (false == g_ActiveConfig.bDstAlphaPass && bpmem.dstalpha.enable && bpmem.blendmode.alphaupdate) diff --git a/Source/Core/VideoCommon/Src/VideoConfig.h b/Source/Core/VideoCommon/Src/VideoConfig.h index 183fcf784c..0531918183 100644 --- a/Source/Core/VideoCommon/Src/VideoConfig.h +++ b/Source/Core/VideoCommon/Src/VideoConfig.h @@ -115,7 +115,7 @@ struct VideoConfig int iAnaglyphStereoSeparation; int iAnaglyphFocalAngle; bool b3DVision; - + // Hacks bool bEFBAccessEnable; bool bDlistCachingEnable; diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj b/Source/Core/VideoCommon/VideoCommon.vcxproj index 0c1b13f301..f785cb5c84 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcxproj +++ b/Source/Core/VideoCommon/VideoCommon.vcxproj @@ -196,6 +196,7 @@ + @@ -244,6 +245,7 @@ + diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters index 6038d5e9e2..330b23d370 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters +++ b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters @@ -101,6 +101,9 @@ Base + + Base + Base @@ -237,6 +240,9 @@ Base + + Base + Base @@ -285,4 +291,4 @@ {e2a527a2-ccc8-4ab8-a93e-dd2628c0f3b6} - \ No newline at end of file + diff --git a/Source/Plugins/Plugin_VideoDX11/Plugin_VideoDX11.vcxproj b/Source/Plugins/Plugin_VideoDX11/Plugin_VideoDX11.vcxproj index 52d1c37aed..aab9345ef7 100644 --- a/Source/Plugins/Plugin_VideoDX11/Plugin_VideoDX11.vcxproj +++ b/Source/Plugins/Plugin_VideoDX11/Plugin_VideoDX11.vcxproj @@ -199,6 +199,7 @@ + @@ -228,6 +229,7 @@ + diff --git a/Source/Plugins/Plugin_VideoDX11/Plugin_VideoDX11.vcxproj.filters b/Source/Plugins/Plugin_VideoDX11/Plugin_VideoDX11.vcxproj.filters index 6492e887ca..4b8efac92b 100644 --- a/Source/Plugins/Plugin_VideoDX11/Plugin_VideoDX11.vcxproj.filters +++ b/Source/Plugins/Plugin_VideoDX11/Plugin_VideoDX11.vcxproj.filters @@ -57,6 +57,9 @@ Render + + Render + @@ -117,6 +120,9 @@ Render + + Render + diff --git a/Source/Plugins/Plugin_VideoDX11/Src/PerfQuery.cpp b/Source/Plugins/Plugin_VideoDX11/Src/PerfQuery.cpp new file mode 100644 index 0000000000..b859d50ec6 --- /dev/null +++ b/Source/Plugins/Plugin_VideoDX11/Src/PerfQuery.cpp @@ -0,0 +1,150 @@ +#include "RenderBase.h" + +#include "D3DBase.h" +#include "PerfQuery.h" + +namespace DX11 { + +PerfQuery::PerfQuery() + : m_query_read_pos() + , m_query_count() +{ + for (int i = 0; i != ARRAYSIZE(m_query_buffer); ++i) + { + D3D11_QUERY_DESC qdesc = CD3D11_QUERY_DESC(D3D11_QUERY_OCCLUSION, 0); + D3D::device->CreateQuery(&qdesc, &m_query_buffer[i].query); + } + ResetQuery(); +} + +PerfQuery::~PerfQuery() +{ + for (int i = 0; i != ARRAYSIZE(m_query_buffer); ++i) + { + // TODO: EndQuery? + m_query_buffer[i].query->Release(); + } +} + +void PerfQuery::EnableQuery(PerfQueryGroup type) +{ + // Is this sane? + if (m_query_count > ARRAYSIZE(m_query_buffer) / 2) + WeakFlush(); + + if (ARRAYSIZE(m_query_buffer) == m_query_count) + { + // TODO + FlushOne(); + ERROR_LOG(VIDEO, "flushed query buffer early!"); + } + + // start query + if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) + { + auto& entry = m_query_buffer[(m_query_read_pos + m_query_count) % ARRAYSIZE(m_query_buffer)]; + + D3D::context->Begin(entry.query); + entry.query_type = type; + + ++m_query_count; + } +} + +void PerfQuery::DisableQuery(PerfQueryGroup type) +{ + // stop query + if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) + { + auto& entry = m_query_buffer[(m_query_read_pos + m_query_count + ARRAYSIZE(m_query_buffer)-1) % ARRAYSIZE(m_query_buffer)]; + D3D::context->End(entry.query); + } +} + +void PerfQuery::ResetQuery() +{ + m_query_count = 0; + std::fill_n(m_results, ARRAYSIZE(m_results), 0); +} + +u32 PerfQuery::GetQueryResult(PerfQueryType type) +{ + u32 result = 0; + + if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC) + { + result = m_results[PQG_ZCOMP_ZCOMPLOC]; + } + else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT) + { + result = m_results[PQG_ZCOMP]; + } + else if (type == PQ_BLEND_INPUT) + { + result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC]; + } + else if (type == PQ_EFB_COPY_CLOCKS) + { + result = m_results[PQG_EFB_COPY_CLOCKS]; + } + + return result / 4; +} + +void PerfQuery::FlushOne() +{ + auto& entry = m_query_buffer[m_query_read_pos]; + + UINT64 result = 0; + HRESULT hr = S_FALSE; + while (hr != S_OK) + { + // TODO: Might cause us to be stuck in an infinite loop! + hr = D3D::context->GetData(entry.query, &result, sizeof(result), 0); + } + + // NOTE: Reported pixel metrics should be referenced to native resolution + m_results[entry.query_type] += (u64)result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight(); + + m_query_read_pos = (m_query_read_pos + 1) % ARRAYSIZE(m_query_buffer); + --m_query_count; +} + +// TODO: could selectively flush things, but I don't think that will do much +void PerfQuery::FlushResults() +{ + while (!IsFlushed()) + FlushOne(); +} + +void PerfQuery::WeakFlush() +{ + while (!IsFlushed()) + { + auto& entry = m_query_buffer[m_query_read_pos]; + + UINT64 result = 0; + HRESULT hr = D3D::context->GetData(entry.query, &result, sizeof(result), D3D11_ASYNC_GETDATA_DONOTFLUSH); + + if (hr == S_OK) + { + // NOTE: Reported pixel metrics should be referenced to native resolution + m_results[entry.query_type] += (u64)result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight(); + + m_query_read_pos = (m_query_read_pos + 1) % ARRAYSIZE(m_query_buffer); + --m_query_count; + } + else + { + break; + } + } +} + +bool PerfQuery::IsFlushed() const +{ + return 0 == m_query_count; +} + + +} // namespace diff --git a/Source/Plugins/Plugin_VideoDX11/Src/PerfQuery.h b/Source/Plugins/Plugin_VideoDX11/Src/PerfQuery.h new file mode 100644 index 0000000000..b3709d1013 --- /dev/null +++ b/Source/Plugins/Plugin_VideoDX11/Src/PerfQuery.h @@ -0,0 +1,46 @@ +#ifndef _PERFQUERY_H_ +#define _PERFQUERY_H_ + +#include "PerfQueryBase.h" + +namespace DX11 { + +class PerfQuery : public PerfQueryBase +{ +public: + PerfQuery(); + ~PerfQuery(); + + void EnableQuery(PerfQueryGroup type); + void DisableQuery(PerfQueryGroup type); + void ResetQuery(); + u32 GetQueryResult(PerfQueryType type); + void FlushResults(); + bool IsFlushed() const; + +private: + struct ActiveQuery + { + ID3D11Query* query; + PerfQueryGroup query_type; + }; + + void WeakFlush(); + + // Only use when non-empty + void FlushOne(); + + // when testing in SMS: 64 was too small, 128 was ok + static const int PERF_QUERY_BUFFER_SIZE = 512; + + ActiveQuery m_query_buffer[PERF_QUERY_BUFFER_SIZE]; + int m_query_read_pos; + + // TODO: sloppy + volatile int m_query_count; + volatile u32 m_results[PQG_NUM_MEMBERS]; +}; + +} // namespace + +#endif // _PERFQUERY_H_ diff --git a/Source/Plugins/Plugin_VideoDX11/Src/Render.cpp b/Source/Plugins/Plugin_VideoDX11/Src/Render.cpp index 44f4f1b15b..2dcfcd041c 100644 --- a/Source/Plugins/Plugin_VideoDX11/Src/Render.cpp +++ b/Source/Plugins/Plugin_VideoDX11/Src/Render.cpp @@ -65,6 +65,7 @@ ID3D11RasterizerState* resetraststate = NULL; static ID3D11Texture2D* s_screenshot_texture = NULL; + // GX pipeline state struct { diff --git a/Source/Plugins/Plugin_VideoDX11/Src/VertexManager.cpp b/Source/Plugins/Plugin_VideoDX11/Src/VertexManager.cpp index ab8ed68654..6991b11690 100644 --- a/Source/Plugins/Plugin_VideoDX11/Src/VertexManager.cpp +++ b/Source/Plugins/Plugin_VideoDX11/Src/VertexManager.cpp @@ -208,7 +208,6 @@ void VertexManager::Draw(UINT stride) if (IndexGenerator::GetNumLines() > 0 || IndexGenerator::GetNumPoints() > 0) ((DX11::Renderer*)g_renderer)->RestoreCull(); } - void VertexManager::vFlush() { if (LocalVBuffer == s_pCurBufferPointer) return; @@ -274,8 +273,10 @@ void VertexManager::vFlush() unsigned int stride = g_nativeVertexFmt->GetVertexStride(); g_nativeVertexFmt->SetupVertexPointers(); g_renderer->ApplyState(useDstAlpha); - + + g_perf_query->EnableQuery(bpmem.zcontrol.early_ztest ? PQG_ZCOMP_ZCOMPLOC : PQG_ZCOMP); Draw(stride); + g_perf_query->DisableQuery(bpmem.zcontrol.early_ztest ? PQG_ZCOMP_ZCOMPLOC : PQG_ZCOMP); GFX_DEBUGGER_PAUSE_AT(NEXT_FLUSH, true); diff --git a/Source/Plugins/Plugin_VideoDX11/Src/main.cpp b/Source/Plugins/Plugin_VideoDX11/Src/main.cpp index b7dd9101d3..af4d57dbf9 100644 --- a/Source/Plugins/Plugin_VideoDX11/Src/main.cpp +++ b/Source/Plugins/Plugin_VideoDX11/Src/main.cpp @@ -42,6 +42,7 @@ #include "D3DUtil.h" #include "D3DBase.h" +#include "PerfQuery.h" #include "PixelShaderCache.h" #include "TextureCache.h" #include "VertexManager.h" @@ -185,6 +186,7 @@ void VideoBackend::Video_Prepare() g_renderer = new Renderer; g_texture_cache = new TextureCache; g_vertex_manager = new VertexManager; + g_perf_query = new PerfQuery; VertexShaderCache::Init(); PixelShaderCache::Init(); D3D::InitUtils(); @@ -227,6 +229,7 @@ void VideoBackend::Shutdown() D3D::ShutdownUtils(); PixelShaderCache::Shutdown(); VertexShaderCache::Shutdown(); + delete g_perf_query; delete g_vertex_manager; delete g_texture_cache; delete g_renderer; diff --git a/Source/Plugins/Plugin_VideoDX9/Src/main.cpp b/Source/Plugins/Plugin_VideoDX9/Src/main.cpp index 3c00e1d619..970d4c7085 100644 --- a/Source/Plugins/Plugin_VideoDX9/Src/main.cpp +++ b/Source/Plugins/Plugin_VideoDX9/Src/main.cpp @@ -57,6 +57,7 @@ #include "ConfigManager.h" #include "VideoBackend.h" +#include "PerfQueryBase.h" namespace DX9 { @@ -97,8 +98,6 @@ void InitBackendInfo() g_Config.backend_info.bSupports3DVision = true; g_Config.backend_info.bSupportsDualSourceBlend = false; g_Config.backend_info.bSupportsFormatReinterpretation = true; - - g_Config.backend_info.bSupportsPixelLighting = C_PLIGHTS + 40 <= maxConstants && C_PMATERIALS + 4 <= maxConstants; // adapters @@ -172,6 +171,7 @@ void VideoBackend::Video_Prepare() g_vertex_manager = new VertexManager; g_renderer = new Renderer; g_texture_cache = new TextureCache; + g_perf_query = new PerfQueryBase; // VideoCommon BPInit(); Fifo_Init(); @@ -209,6 +209,7 @@ void VideoBackend::Shutdown() // internal interfaces PixelShaderCache::Shutdown(); VertexShaderCache::Shutdown(); + delete g_perf_query; delete g_texture_cache; delete g_renderer; delete g_vertex_manager; diff --git a/Source/Plugins/Plugin_VideoOGL/CMakeLists.txt b/Source/Plugins/Plugin_VideoOGL/CMakeLists.txt index 43b127f5ef..fe83deb68b 100644 --- a/Source/Plugins/Plugin_VideoOGL/CMakeLists.txt +++ b/Source/Plugins/Plugin_VideoOGL/CMakeLists.txt @@ -2,6 +2,7 @@ set(SRCS Src/FramebufferManager.cpp Src/GLUtil.cpp Src/main.cpp Src/NativeVertexFormat.cpp + Src/PerfQuery.cpp Src/PixelShaderCache.cpp Src/PostProcessing.cpp Src/RasterFont.cpp diff --git a/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcxproj b/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcxproj index d74bc2edb5..1b71329917 100644 --- a/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcxproj +++ b/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcxproj @@ -200,6 +200,7 @@ + @@ -222,6 +223,7 @@ + diff --git a/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcxproj.filters b/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcxproj.filters index cd170691da..f423a77f0b 100644 --- a/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcxproj.filters +++ b/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcxproj.filters @@ -36,6 +36,9 @@ Render + + Render + @@ -72,6 +75,9 @@ Render + + Render + diff --git a/Source/Plugins/Plugin_VideoOGL/Src/PerfQuery.cpp b/Source/Plugins/Plugin_VideoOGL/Src/PerfQuery.cpp new file mode 100644 index 0000000000..95311eb98d --- /dev/null +++ b/Source/Plugins/Plugin_VideoOGL/Src/PerfQuery.cpp @@ -0,0 +1,133 @@ +#include "RenderBase.h" +#include "GLUtil.h" +#include "PerfQuery.h" + +namespace OGL +{ + +PerfQuery::PerfQuery() + : m_query_read_pos() + , m_query_count() +{ + for (int i = 0; i != ARRAYSIZE(m_query_buffer); ++i) + glGenQueries(1, &m_query_buffer[i].query_id); + + ResetQuery(); +} + +PerfQuery::~PerfQuery() +{ + for (int i = 0; i != ARRAYSIZE(m_query_buffer); ++i) + glDeleteQueries(1, &m_query_buffer[i].query_id); +} + +void PerfQuery::EnableQuery(PerfQueryGroup type) +{ + // Is this sane? + if (m_query_count > ARRAYSIZE(m_query_buffer) / 2) + WeakFlush(); + + if (ARRAYSIZE(m_query_buffer) == m_query_count) + { + FlushOne(); + //ERROR_LOG(VIDEO, "flushed query buffer early!"); + } + + // start query + if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) + { + auto& entry = m_query_buffer[(m_query_read_pos + m_query_count) % ARRAYSIZE(m_query_buffer)]; + + glBeginQuery(GL_SAMPLES_PASSED, entry.query_id); + entry.query_type = type; + + ++m_query_count; + } +} + +void PerfQuery::DisableQuery(PerfQueryGroup type) +{ + // stop query + if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) + { + glEndQuery(GL_SAMPLES_PASSED); + } +} + +bool PerfQuery::IsFlushed() const +{ + return 0 == m_query_count; +} + +void PerfQuery::FlushOne() +{ + auto& entry = m_query_buffer[m_query_read_pos]; + + GLuint result = 0; + glGetQueryObjectuiv(entry.query_id, GL_QUERY_RESULT, &result); + + // NOTE: Reported pixel metrics should be referenced to native resolution + m_results[entry.query_type] += (u64)result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight(); + + m_query_read_pos = (m_query_read_pos + 1) % ARRAYSIZE(m_query_buffer); + --m_query_count; +} + +// TODO: could selectively flush things, but I don't think that will do much +void PerfQuery::FlushResults() +{ + while (!IsFlushed()) + FlushOne(); +} + +void PerfQuery::WeakFlush() +{ + while (!IsFlushed()) + { + auto& entry = m_query_buffer[m_query_read_pos]; + + GLuint result = GL_FALSE; + glGetQueryObjectuiv(entry.query_id, GL_QUERY_RESULT_AVAILABLE, &result); + + if (GL_TRUE == result) + { + FlushOne(); + } + else + { + break; + } + } +} + +void PerfQuery::ResetQuery() +{ + m_query_count = 0; + std::fill_n(m_results, ARRAYSIZE(m_results), 0); +} + +u32 PerfQuery::GetQueryResult(PerfQueryType type) +{ + u32 result = 0; + + if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC) + { + result = m_results[PQG_ZCOMP_ZCOMPLOC]; + } + else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT) + { + result = m_results[PQG_ZCOMP]; + } + else if (type == PQ_BLEND_INPUT) + { + result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC]; + } + else if (type == PQ_EFB_COPY_CLOCKS) + { + result = m_results[PQG_EFB_COPY_CLOCKS]; + } + + return result / 4; +} + +} // namespace diff --git a/Source/Plugins/Plugin_VideoOGL/Src/PerfQuery.h b/Source/Plugins/Plugin_VideoOGL/Src/PerfQuery.h new file mode 100644 index 0000000000..34c64e43a1 --- /dev/null +++ b/Source/Plugins/Plugin_VideoOGL/Src/PerfQuery.h @@ -0,0 +1,46 @@ +#ifndef _PERFQUERY_H_ +#define _PERFQUERY_H_ + +#include "PerfQueryBase.h" + +namespace OGL { + +class PerfQuery : public PerfQueryBase +{ +public: + PerfQuery(); + ~PerfQuery(); + + void EnableQuery(PerfQueryGroup type); + void DisableQuery(PerfQueryGroup type); + void ResetQuery(); + u32 GetQueryResult(PerfQueryType type); + void FlushResults(); + bool IsFlushed() const; + +private: + struct ActiveQuery + { + GLuint query_id; + PerfQueryGroup query_type; + }; + + // when testing in SMS: 64 was too small, 128 was ok + static const int PERF_QUERY_BUFFER_SIZE = 512; + + void WeakFlush(); + // Only use when non-empty + void FlushOne(); + + // This contains gl query objects with unretrieved results. + ActiveQuery m_query_buffer[PERF_QUERY_BUFFER_SIZE]; + int m_query_read_pos; + + // TODO: sloppy + volatile int m_query_count; + volatile u32 m_results[PQG_NUM_MEMBERS]; +}; + +} // namespace + +#endif // _PERFQUERY_H_ diff --git a/Source/Plugins/Plugin_VideoOGL/Src/Render.cpp b/Source/Plugins/Plugin_VideoOGL/Src/Render.cpp index b5393cdbde..dca8e4e30e 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/Render.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/Render.cpp @@ -1419,6 +1419,7 @@ void Renderer::SetDepthMode() else { // if the test is disabled write is disabled too + // TODO: When PE performance metrics are being emulated via occlusion queries, we should (probably?) enable depth test with depth function ALWAYS here glDisable(GL_DEPTH_TEST); glDepthMask(GL_FALSE); } diff --git a/Source/Plugins/Plugin_VideoOGL/Src/VertexManager.cpp b/Source/Plugins/Plugin_VideoOGL/Src/VertexManager.cpp index 6ea10fc077..87257f15da 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/VertexManager.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/VertexManager.cpp @@ -40,6 +40,7 @@ #include "OpcodeDecoding.h" #include "FileUtil.h" #include "Debugger.h" +#include "PerfQueryBase.h" #include "main.h" @@ -217,7 +218,10 @@ void VertexManager::vFlush() if (ps) PixelShaderCache::SetCurrentShader(ps->glprogid); // Lego Star Wars crashes here. if (vs) VertexShaderCache::SetCurrentShader(vs->glprogid); + g_perf_query->EnableQuery(bpmem.zcontrol.early_ztest ? PQG_ZCOMP_ZCOMPLOC : PQG_ZCOMP); Draw(); + g_perf_query->DisableQuery(bpmem.zcontrol.early_ztest ? PQG_ZCOMP_ZCOMPLOC : PQG_ZCOMP); + //ERROR_LOG(VIDEO, "PerfQuery result: %d", g_perf_query->GetQueryResult(bpmem.zcontrol.early_ztest ? PQ_ZCOMP_OUTPUT_ZCOMPLOC : PQ_ZCOMP_OUTPUT)); // run through vertex groups again to set alpha if (useDstAlpha && !dualSourcePossible) diff --git a/Source/Plugins/Plugin_VideoOGL/Src/main.cpp b/Source/Plugins/Plugin_VideoOGL/Src/main.cpp index 99e01bfb3d..854554d968 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/main.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/main.cpp @@ -92,6 +92,7 @@ Make AA apply instantly during gameplay if possible #include "FramebufferManager.h" #include "Core.h" #include "Host.h" +#include "PerfQuery.h" #include "VideoState.h" #include "VideoBackend.h" @@ -194,6 +195,7 @@ void VideoBackend::Video_Prepare() BPInit(); g_vertex_manager = new VertexManager; + g_perf_query = new PerfQuery; Fifo_Init(); // must be done before OpcodeDecoder_Init() OpcodeDecoder_Init(); VertexShaderCache::Init(); diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/BPMemLoader.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/BPMemLoader.cpp index f05f9f8b57..0d7c77c767 100644 --- a/Source/Plugins/Plugin_VideoSoftware/Src/BPMemLoader.cpp +++ b/Source/Plugins/Plugin_VideoSoftware/Src/BPMemLoader.cpp @@ -90,6 +90,21 @@ void SWBPWritten(int address, int newvalue) SWPixelEngine::pereg.boxBottom = newvalue >> 10; SWPixelEngine::pereg.boxTop = newvalue & 0x3ff; break; + case BPMEM_CLEAR_PIXEL_PERF: + // TODO: I didn't test if the value written to this register affects the amount of cleared registers + SWPixelEngine::pereg.perfZcompInputZcomplocLo = 0; + SWPixelEngine::pereg.perfZcompInputZcomplocHi = 0; + SWPixelEngine::pereg.perfZcompOutputZcomplocLo = 0; + SWPixelEngine::pereg.perfZcompOutputZcomplocHi = 0; + SWPixelEngine::pereg.perfZcompInputLo = 0; + SWPixelEngine::pereg.perfZcompInputHi = 0; + SWPixelEngine::pereg.perfZcompOutputLo = 0; + SWPixelEngine::pereg.perfZcompOutputHi = 0; + SWPixelEngine::pereg.perfBlendInputLo = 0; + SWPixelEngine::pereg.perfBlendInputHi = 0; + SWPixelEngine::pereg.perfEfbCopyClocksLo = 0; + SWPixelEngine::pereg.perfEfbCopyClocksHi = 0; + break; case BPMEM_LOADTLUT0: // This one updates bpmem.tlutXferSrc, no need to do anything here. break; case BPMEM_LOADTLUT1: // Load a Texture Look Up Table diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp index 5eed912dcb..badb123fa0 100644 --- a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp +++ b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp @@ -23,6 +23,7 @@ #include "BPMemLoader.h" #include "XFMemLoader.h" #include "Tev.h" +#include "SWPixelEngine.h" #include "SWStatistics.h" #include "SWVideoConfig.h" @@ -149,9 +150,15 @@ inline void Draw(s32 x, s32 y, s32 xi, s32 yi) if (bpmem.zcontrol.early_ztest && bpmem.zmode.testenable && g_SWVideoConfig.bZComploc) { - // early z - if (!EfbInterface::ZCompare(x, y, z)) - return; + // TODO: Test if perf regs are incremented even if test is disabled + SWPixelEngine::pereg.IncZInputQuadCount(true); + if (bpmem.zmode.testenable) + { + // early z + if (!EfbInterface::ZCompare(x, y, z)) + return; + } + SWPixelEngine::pereg.IncZOutputQuadCount(true); } RasterBlockPixel& pixel = rasterBlock.Pixel[xi][yi]; diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/SWPixelEngine.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/SWPixelEngine.cpp index e9d2c1fe96..118e59629e 100644 --- a/Source/Plugins/Plugin_VideoSoftware/Src/SWPixelEngine.cpp +++ b/Source/Plugins/Plugin_VideoSoftware/Src/SWPixelEngine.cpp @@ -79,7 +79,7 @@ void Read16(u16& _uReturnValue, const u32 _iAddress) u16 address = _iAddress & 0xFFF; - if (address <= 0x16) + if (address <= 0x2e) _uReturnValue = ((u16*)&pereg)[address >> 1]; } @@ -111,7 +111,7 @@ void Write16(const u16 _iValue, const u32 _iAddress) } break; default: - if (address <= 0x16) + if (address <= 0x2e) ((u16*)&pereg)[address >> 1] = _iValue; break; } diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/SWPixelEngine.h b/Source/Plugins/Plugin_VideoSoftware/Src/SWPixelEngine.h index 6a87143e8c..351e53456d 100644 --- a/Source/Plugins/Plugin_VideoSoftware/Src/SWPixelEngine.h +++ b/Source/Plugins/Plugin_VideoSoftware/Src/SWPixelEngine.h @@ -38,6 +38,21 @@ namespace SWPixelEngine PE_BBOX_RIGHT = 0x012, // Flip Right PE_BBOX_TOP = 0x014, // Flip Top PE_BBOX_BOTTOM = 0x016, // Flip Bottom + + // NOTE: Order not verified + // These indicate the number of quads that are being used as input/output for each particular stage + PE_PERF_ZCOMP_INPUT_ZCOMPLOC_L = 0x18, + PE_PERF_ZCOMP_INPUT_ZCOMPLOC_H = 0x1a, + PE_PERF_ZCOMP_OUTPUT_ZCOMPLOC_L = 0x1c, + PE_PERF_ZCOMP_OUTPUT_ZCOMPLOC_H = 0x1e, + PE_PERF_ZCOMP_INPUT_L = 0x20, + PE_PERF_ZCOMP_INPUT_H = 0x22, + PE_PERF_ZCOMP_OUTPUT_L = 0x24, + PE_PERF_ZCOMP_OUTPUT_H = 0x26, + PE_PERF_BLEND_INPUT_L = 0x28, + PE_PERF_BLEND_INPUT_H = 0x2a, + PE_PERF_EFB_COPY_CLOCKS_L = 0x2c, + PE_PERF_EFB_COPY_CLOCKS_H = 0x2e, }; union UPEZConfReg @@ -125,10 +140,72 @@ namespace SWPixelEngine UPECtrlReg ctrl; u16 unk0; u16 token; + u16 boxLeft; u16 boxRight; u16 boxTop; u16 boxBottom; + + u16 perfZcompInputZcomplocLo; + u16 perfZcompInputZcomplocHi; + u16 perfZcompOutputZcomplocLo; + u16 perfZcompOutputZcomplocHi; + u16 perfZcompInputLo; + u16 perfZcompInputHi; + u16 perfZcompOutputLo; + u16 perfZcompOutputHi; + u16 perfBlendInputLo; + u16 perfBlendInputHi; + u16 perfEfbCopyClocksLo; + u16 perfEfbCopyClocksHi; + + // NOTE: hardware doesn't process individual pixels but quads instead. Current software renderer architecture works on pixels though, so we have this "quad" hack here to only increment the registers on every fourth rendered pixel + void IncZInputQuadCount(bool early_ztest) + { + static int quad = 0; + if (++quad != 3) + return; + quad = 0; + + if (early_ztest) + { + if (++perfZcompInputZcomplocLo == 0) + perfZcompInputZcomplocHi++; + } + else + { + if (++perfZcompInputLo == 0) + perfZcompInputHi++; + } + } + void IncZOutputQuadCount(bool early_ztest) + { + static int quad = 0; + if (++quad != 3) + return; + quad = 0; + + if (early_ztest) + { + if (++perfZcompOutputZcomplocLo == 0) + perfZcompOutputZcomplocHi++; + } + else + { + if (++perfZcompOutputLo == 0) + perfZcompOutputHi++; + } + } + void IncBlendInputQuadCount() + { + static int quad = 0; + if (++quad != 3) + return; + quad = 0; + + if (++perfBlendInputLo == 0) + perfBlendInputHi++; + } }; extern PEReg pereg; diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/SWmain.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/SWmain.cpp index 0e2d6afa72..9d15619f55 100644 --- a/Source/Plugins/Plugin_VideoSoftware/Src/SWmain.cpp +++ b/Source/Plugins/Plugin_VideoSoftware/Src/SWmain.cpp @@ -225,6 +225,12 @@ u32 VideoSoftware::Video_AccessEFB(EFBAccessType type, u32 x, u32 y, u32 InputDa return value; } +u32 VideoSoftware::Video_GetQueryResult(PerfQueryType type) +{ + // TODO: + return 0; +} + bool VideoSoftware::Video_Screenshot(const char *_szFilename) { return false; diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp index 0f4b4fce20..3d5c0d7724 100644 --- a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp +++ b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp @@ -20,6 +20,7 @@ #include "Tev.h" #include "EfbInterface.h" #include "TextureSampler.h" +#include "SWPixelEngine.h" #include "SWStatistics.h" #include "SWVideoConfig.h" #include "DebugUtil.h" @@ -787,8 +788,13 @@ void Tev::Draw() bool late_ztest = !bpmem.zcontrol.early_ztest || !g_SWVideoConfig.bZComploc; if (late_ztest && bpmem.zmode.testenable) { - if (!EfbInterface::ZCompare(Position[0], Position[1], Position[2])) - return; + // TODO: Check against hw if these values get incremented even if depth testing is disabled + SWPixelEngine::pereg.IncZInputQuadCount(false); + + if (!EfbInterface::ZCompare(Position[0], Position[1], Position[2])) + return; + + SWPixelEngine::pereg.IncZOutputQuadCount(false); } #if ALLOW_TEV_DUMPS @@ -812,6 +818,7 @@ void Tev::Draw() #endif INCSTAT(swstats.thisFrame.tevPixelsOut); + SWPixelEngine::pereg.IncBlendInputQuadCount(); EfbInterface::BlendTev(Position[0], Position[1], output); } diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/VideoBackend.h b/Source/Plugins/Plugin_VideoSoftware/Src/VideoBackend.h index 3e1490031f..86fdc37b2a 100644 --- a/Source/Plugins/Plugin_VideoSoftware/Src/VideoBackend.h +++ b/Source/Plugins/Plugin_VideoSoftware/Src/VideoBackend.h @@ -26,7 +26,9 @@ class VideoSoftware : public VideoBackend void Video_ExitLoop(); void Video_BeginField(u32, FieldType, u32, u32); void Video_EndField(); + u32 Video_AccessEFB(EFBAccessType, u32, u32, u32); + u32 Video_GetQueryResult(PerfQueryType type); void Video_AddMessage(const char* pstr, unsigned int milliseconds); void Video_ClearMessages();