From 63c62b277d6850279bd2a982535d5abde824e6e2 Mon Sep 17 00:00:00 2001 From: comex Date: Sun, 24 Aug 2014 23:53:28 -0400 Subject: [PATCH 01/10] Some changes to VertexLoaderManager: - Lazily create the native vertex format (which involves GL calls) from RunVertices rather than RefreshLoader itself, freeing the latter to be run from the CPU thread (hopefully). - In order to avoid useless allocations while doing so, store the native format inside the VertexLoader rather than using a cache entry. - Wrap the s_vertex_loader_map in a lock, for similar reasons. --- Source/Core/VideoCommon/VertexLoader.cpp | 20 +++++ Source/Core/VideoCommon/VertexLoader.h | 8 ++ .../Core/VideoCommon/VertexLoaderManager.cpp | 88 ++++++++----------- 3 files changed, 64 insertions(+), 52 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index 7f7dbc9030..7876e050c2 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -548,6 +548,7 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr) m_compiledCode = nullptr; m_numLoadedVertices = 0; m_VertexSize = 0; + m_native_vertex_format = nullptr; loop_counter = 0; VertexLoader_Normal::Init(); VertexLoader_Position::Init(); @@ -1035,3 +1036,22 @@ void VertexLoader::AppendToString(std::string *dest) const } dest->append(StringFromFormat(" - %i v\n", m_numLoadedVertices)); } + +NativeVertexFormat* VertexLoader::GetNativeVertexFormat() +{ + if (m_native_vertex_format) + return m_native_vertex_format; + auto& native = s_native_vertex_map[m_native_vtx_decl]; + if (!native) + { + auto raw_pointer = g_vertex_manager->CreateNativeVertexFormat(); + native = std::unique_ptr(raw_pointer); + native->Initialize(m_native_vtx_decl); + native->m_components = m_native_components; + } + m_native_vertex_format = native.get(); + return native.get(); + +} + +std::map> VertexLoader::s_native_vertex_map; diff --git 
a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h index 7be8385879..0bca6f5e5c 100644 --- a/Source/Core/VideoCommon/VertexLoader.h +++ b/Source/Core/VideoCommon/VertexLoader.h @@ -8,6 +8,8 @@ // Metroid Prime: P I16-flt N I16-s16 T0 I16-u16 T1 i16-flt #include +#include +#include #include #include "Common/CommonTypes.h" @@ -114,6 +116,9 @@ public: void AppendToString(std::string *dest) const; int GetNumLoadedVerts() const { return m_numLoadedVertices; } + NativeVertexFormat* GetNativeVertexFormat(); + static void ClearNativeVertexFormatCache() { s_native_vertex_map.clear(); } + private: int m_VertexSize; // number of bytes of a raw GC vertex. Computed by CompileVertexTranslator. @@ -135,6 +140,9 @@ private: int m_numLoadedVertices; + NativeVertexFormat* m_native_vertex_format; + static std::map> s_native_vertex_map; + void SetVAT(const VAT& vat); void CompileVertexTranslator(); diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index dbf22c9c3d..b3a1c97c3f 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -24,8 +25,7 @@ static int s_attr_dirty; // bitfield static NativeVertexFormat* s_current_vtx_fmt; -typedef std::pair VertexLoaderCacheItem; -static VertexLoaderCacheItem s_VertexLoaders[8]; +static VertexLoader* s_VertexLoaders[8]; namespace std { @@ -41,14 +41,13 @@ struct hash } -typedef std::unordered_map VertexLoaderMap; -typedef std::map> NativeVertexLoaderMap; +typedef std::unordered_map> VertexLoaderMap; namespace VertexLoaderManager { -static VertexLoaderMap s_VertexLoaderMap; -static NativeVertexLoaderMap s_native_vertex_map; +static std::mutex s_vertex_loader_map_lock; +static VertexLoaderMap s_vertex_loader_map; // TODO - change into array of pointers. Keep a map of all seen so far. 
void Init() @@ -56,20 +55,16 @@ void Init() MarkAllDirty(); for (auto& map_entry : s_VertexLoaders) { - map_entry.first = nullptr; - map_entry.second = nullptr; + map_entry = nullptr; } RecomputeCachedArraybases(); } void Shutdown() { - for (auto& map_entry : s_VertexLoaderMap) - { - delete map_entry.second.first; - } - s_VertexLoaderMap.clear(); - s_native_vertex_map.clear(); + std::lock_guard lk(s_vertex_loader_map_lock); + s_vertex_loader_map.clear(); + VertexLoader::ClearNativeVertexFormatCache(); } namespace @@ -87,14 +82,15 @@ struct entry void AppendListToString(std::string *dest) { + std::lock_guard lk(s_vertex_loader_map_lock); std::vector entries; size_t total_size = 0; - for (const auto& map_entry : s_VertexLoaderMap) + for (const auto& map_entry : s_vertex_loader_map) { entry e; - map_entry.second.first->AppendToString(&e.text); - e.num_verts = map_entry.second.first->GetNumLoadedVerts(); + map_entry.second->AppendToString(&e.text); + e.num_verts = map_entry.second->GetNumLoadedVerts(); entries.push_back(e); total_size += e.text.size() + 1; } @@ -111,54 +107,39 @@ void MarkAllDirty() s_attr_dirty = 0xff; } -static NativeVertexFormat* GetNativeVertexFormat(const PortableVertexDeclaration& format, - u32 components) -{ - auto& native = s_native_vertex_map[format]; - if (!native) - { - auto raw_pointer = g_vertex_manager->CreateNativeVertexFormat(); - native = std::unique_ptr(raw_pointer); - native->Initialize(format); - native->m_components = components; - } - return native.get(); -} - -static VertexLoaderCacheItem RefreshLoader(int vtx_attr_group) +static VertexLoader* RefreshLoader(int vtx_attr_group) { + VertexLoader* loader; if ((s_attr_dirty >> vtx_attr_group) & 1) { VertexLoaderUID uid(g_VtxDesc, g_VtxAttr[vtx_attr_group]); - VertexLoaderMap::iterator iter = s_VertexLoaderMap.find(uid); - if (iter != s_VertexLoaderMap.end()) + std::lock_guard lk(s_vertex_loader_map_lock); + VertexLoaderMap::iterator iter = s_vertex_loader_map.find(uid); + if (iter != 
s_vertex_loader_map.end()) { - s_VertexLoaders[vtx_attr_group] = iter->second; + loader = iter->second.get(); } else { - VertexLoader* loader = new VertexLoader(g_VtxDesc, g_VtxAttr[vtx_attr_group]); - - NativeVertexFormat* vtx_fmt = GetNativeVertexFormat( - loader->GetNativeVertexDeclaration(), - loader->GetNativeComponents()); - - s_VertexLoaderMap[uid] = std::make_pair(loader, vtx_fmt); - s_VertexLoaders[vtx_attr_group] = std::make_pair(loader, vtx_fmt); + loader = new VertexLoader(g_VtxDesc, g_VtxAttr[vtx_attr_group]); + s_vertex_loader_map[uid] = std::unique_ptr(loader); INCSTAT(stats.numVertexLoaders); } + s_VertexLoaders[vtx_attr_group] = loader; + s_attr_dirty &= ~(1 << vtx_attr_group); + } else { + loader = s_VertexLoaders[vtx_attr_group]; } - s_attr_dirty &= ~(1 << vtx_attr_group); - return s_VertexLoaders[vtx_attr_group]; + return loader; } bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, bool skip_drawing) { if (!count) return true; - auto loader = RefreshLoader(vtx_attr_group); + VertexLoader* loader = RefreshLoader(vtx_attr_group); - size_t size = count * loader.first->GetVertexSize(); + size_t size = count * loader->GetVertexSize(); if (buf_size < size) return false; @@ -169,15 +150,18 @@ bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, return true; } + NativeVertexFormat* native = loader->GetNativeVertexFormat(); + + // If the native vertex format changed, force a flush. 
- if (loader.second != s_current_vtx_fmt) + if (native != s_current_vtx_fmt) VertexManager::Flush(); - s_current_vtx_fmt = loader.second; + s_current_vtx_fmt = native; VertexManager::PrepareForAdditionalData(primitive, count, - loader.first->GetNativeVertexDeclaration().stride); + loader->GetNativeVertexDeclaration().stride); - loader.first->RunVertices(g_VtxAttr[vtx_attr_group], primitive, count); + loader->RunVertices(g_VtxAttr[vtx_attr_group], primitive, count); IndexGenerator::AddIndices(primitive, count); @@ -188,7 +172,7 @@ bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, int GetVertexSize(int vtx_attr_group) { - return RefreshLoader(vtx_attr_group).first->GetVertexSize(); + return RefreshLoader(vtx_attr_group)->GetVertexSize(); } NativeVertexFormat* GetCurrentVertexFormat() From f8452ff50103a503546e57ec8c6fc7fd5fa92a29 Mon Sep 17 00:00:00 2001 From: comex Date: Tue, 26 Aug 2014 23:17:51 -0400 Subject: [PATCH 02/10] Fix threading issue with vertex loader JIT. VertexLoader::VertexLoader was setting loop_counter, a *static* variable, to 0. This was nonsensical, but harmless until I started to run it on a separate thread, where it had a chance of interfering with a running vertex translator. Switch to just using a register for the loop counter. --- Source/Core/VideoCommon/VertexLoader.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index 7876e050c2..ebc57d8577 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -38,9 +38,6 @@ static u8 s_curtexmtx[8]; static int s_texmtxwrite = 0; static int s_texmtxread = 0; -static int loop_counter; - - // Vertex loaders read these. Although the scale ones should be baked into the shader. 
int tcIndex; int colIndex; @@ -549,7 +546,6 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr) m_numLoadedVertices = 0; m_VertexSize = 0; m_native_vertex_format = nullptr; - loop_counter = 0; VertexLoader_Normal::Init(); VertexLoader_Position::Init(); VertexLoader_TextCoord::Init(); @@ -585,8 +581,11 @@ void VertexLoader::CompileVertexTranslator() PanicAlert("Trying to recompile a vertex translator"); m_compiledCode = GetCodePtr(); - // We don't use any callee saved registers or anything but RAX. - ABI_PushRegistersAndAdjustStack(0, 8); + // We only use RAX (caller saved) and RBX (callee saved). + ABI_PushRegistersAndAdjustStack(1 << RBX, 8); + + // save count + MOV(64, R(RBX), R(ABI_PARAM1)); // Start loop here const u8 *loop_start = GetCodePtr(); @@ -843,11 +842,10 @@ void VertexLoader::CompileVertexTranslator() #ifdef USE_VERTEX_LOADER_JIT // End loop here - MOV(64, R(RAX), Imm64((u64)&loop_counter)); - SUB(32, MatR(RAX), Imm8(1)); + SUB(64, R(RBX), Imm8(1)); J_CC(CC_NZ, loop_start); - ABI_PopRegistersAndAdjustStack(0, 8); + ABI_PopRegistersAndAdjustStack(1 << RBX, 8); RET(); #endif } @@ -913,8 +911,7 @@ void VertexLoader::ConvertVertices ( int count ) #ifdef USE_VERTEX_LOADER_JIT if (count > 0) { - loop_counter = count; - ((void (*)())(void*)m_compiledCode)(); + ((void (*)(int))(void*)m_compiledCode)(count); } #else for (int s = 0; s < count; s++) From 90638c6806ac2c92b211e41b2abd257b4266f488 Mon Sep 17 00:00:00 2001 From: comex Date: Wed, 27 Aug 2014 22:37:08 -0400 Subject: [PATCH 03/10] Switch to an unordered_map as a micro-optimization. 
--- Source/Core/VideoCommon/NativeVertexFormat.h | 16 +++++++++++++++- Source/Core/VideoCommon/VertexLoader.cpp | 2 +- Source/Core/VideoCommon/VertexLoader.h | 4 ++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/Source/Core/VideoCommon/NativeVertexFormat.h b/Source/Core/VideoCommon/NativeVertexFormat.h index ebc95cdc45..8778423d3f 100644 --- a/Source/Core/VideoCommon/NativeVertexFormat.h +++ b/Source/Core/VideoCommon/NativeVertexFormat.h @@ -4,7 +4,7 @@ #pragma once -#include "Common/CommonTypes.h" +#include "Common/Hash.h" // m_components enum @@ -87,6 +87,20 @@ struct PortableVertexDeclaration } }; +namespace std +{ + +template <> +struct hash +{ + size_t operator()(const PortableVertexDeclaration& decl) const + { + return HashFletcher((u8 *) &decl, sizeof(decl)); + } +}; + +} + // The implementation of this class is specific for GL/DX, so NativeVertexFormat.cpp // is in the respective backend, not here in VideoCommon. diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index ebc57d8577..ab78e85f9e 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -1051,4 +1051,4 @@ NativeVertexFormat* VertexLoader::GetNativeVertexFormat() } -std::map> VertexLoader::s_native_vertex_map; +std::unordered_map> VertexLoader::s_native_vertex_map; diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h index 0bca6f5e5c..c81be52556 100644 --- a/Source/Core/VideoCommon/VertexLoader.h +++ b/Source/Core/VideoCommon/VertexLoader.h @@ -8,9 +8,9 @@ // Metroid Prime: P I16-flt N I16-s16 T0 I16-u16 T1 i16-flt #include -#include #include #include +#include #include "Common/CommonTypes.h" #include "Common/x64Emitter.h" @@ -141,7 +141,7 @@ private: int m_numLoadedVertices; NativeVertexFormat* m_native_vertex_format; - static std::map> s_native_vertex_map; + static std::unordered_map> s_native_vertex_map; void SetVAT(const VAT& vat); From 
f0131c2e09faba0d65fa5b7360898e514e3636f5 Mon Sep 17 00:00:00 2001 From: comex Date: Wed, 27 Aug 2014 13:26:06 -0400 Subject: [PATCH 04/10] Mechanical changes to move most CP state to a struct rather than separate globals. The next commit will add a separate copy of the struct and the ability for LoadCPReg to work on it. --- .../VideoBackends/Software/CPMemLoader.cpp | 22 +++---- .../VideoBackends/Software/SWVertexLoader.cpp | 66 +++++++++---------- Source/Core/VideoBackends/Software/SWmain.cpp | 9 +-- .../VideoBackends/Software/XFMemLoader.cpp | 2 +- Source/Core/VideoCommon/CPMemory.cpp | 21 +++--- Source/Core/VideoCommon/CPMemory.h | 27 +++++--- Source/Core/VideoCommon/VertexLoader.cpp | 4 +- .../Core/VideoCommon/VertexLoaderManager.cpp | 44 ++++++------- .../Core/VideoCommon/VertexLoader_Color.cpp | 12 ++-- .../Core/VideoCommon/VertexLoader_Normal.cpp | 2 +- .../VideoCommon/VertexLoader_Position.cpp | 4 +- .../VideoCommon/VertexLoader_TextCoord.cpp | 6 +- Source/Core/VideoCommon/VertexShaderGen.cpp | 4 +- .../Core/VideoCommon/VertexShaderManager.cpp | 54 +++++++-------- Source/Core/VideoCommon/VideoState.cpp | 15 +---- Source/Core/VideoCommon/XFStructs.cpp | 2 +- 16 files changed, 146 insertions(+), 148 deletions(-) diff --git a/Source/Core/VideoBackends/Software/CPMemLoader.cpp b/Source/Core/VideoBackends/Software/CPMemLoader.cpp index 8e78059616..21d3861d90 100644 --- a/Source/Core/VideoBackends/Software/CPMemLoader.cpp +++ b/Source/Core/VideoBackends/Software/CPMemLoader.cpp @@ -13,46 +13,46 @@ void SWLoadCPReg(u32 sub_cmd, u32 value) switch (sub_cmd & 0xF0) { case 0x30: - MatrixIndexA.Hex = value; + g_main_cp_state.matrix_index_a.Hex = value; break; case 0x40: - MatrixIndexB.Hex = value; + g_main_cp_state.matrix_index_b.Hex = value; break; case 0x50: - g_VtxDesc.Hex &= ~0x1FFFF; // keep the Upper bits - g_VtxDesc.Hex |= value; + g_main_cp_state.vtx_desc.Hex &= ~0x1FFFF; // keep the Upper bits + g_main_cp_state.vtx_desc.Hex |= value; break; case 0x60: - 
g_VtxDesc.Hex &= 0x1FFFF; // keep the lower 17Bits - g_VtxDesc.Hex |= (u64)value << 17; + g_main_cp_state.vtx_desc.Hex &= 0x1FFFF; // keep the lower 17Bits + g_main_cp_state.vtx_desc.Hex |= (u64)value << 17; break; case 0x70: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g0.Hex = value; + g_main_cp_state.vtx_attr[sub_cmd & 7].g0.Hex = value; break; case 0x80: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g1.Hex = value; + g_main_cp_state.vtx_attr[sub_cmd & 7].g1.Hex = value; break; case 0x90: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g2.Hex = value; + g_main_cp_state.vtx_attr[sub_cmd & 7].g2.Hex = value; break; // Pointers to vertex arrays in GC RAM case 0xA0: - arraybases[sub_cmd & 0xF] = value; + g_main_cp_state.array_bases[sub_cmd & 0xF] = value; cached_arraybases[sub_cmd & 0xF] = Memory::GetPointer(value); break; case 0xB0: - arraystrides[sub_cmd & 0xF] = value & 0xFF; + g_main_cp_state.array_strides[sub_cmd & 0xF] = value & 0xFF; break; } } diff --git a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp index dcd71d35a4..e4848ceb34 100644 --- a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp +++ b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp @@ -39,7 +39,7 @@ SWVertexLoader::~SWVertexLoader() void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) { - m_CurrentVat = &g_VtxAttr[attributeIndex]; + m_CurrentVat = &g_main_cp_state.vtx_attr[attributeIndex]; posScale = 1.0f / float(1 << m_CurrentVat->g0.PosFrac); tcScale[0] = 1.0f / float(1 << m_CurrentVat->g0.Tex0Frac); @@ -53,20 +53,20 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) //TexMtx const u64 tmDesc[8] = { - g_VtxDesc.Tex0MatIdx, g_VtxDesc.Tex1MatIdx, g_VtxDesc.Tex2MatIdx, g_VtxDesc.Tex3MatIdx, - g_VtxDesc.Tex4MatIdx, g_VtxDesc.Tex5MatIdx, g_VtxDesc.Tex6MatIdx, g_VtxDesc.Tex7MatIdx + g_main_cp_state.vtx_desc.Tex0MatIdx, g_main_cp_state.vtx_desc.Tex1MatIdx, 
g_main_cp_state.vtx_desc.Tex2MatIdx, g_main_cp_state.vtx_desc.Tex3MatIdx, + g_main_cp_state.vtx_desc.Tex4MatIdx, g_main_cp_state.vtx_desc.Tex5MatIdx, g_main_cp_state.vtx_desc.Tex6MatIdx, g_main_cp_state.vtx_desc.Tex7MatIdx }; // Colors - const u64 colDesc[2] = {g_VtxDesc.Color0, g_VtxDesc.Color1}; + const u64 colDesc[2] = {g_main_cp_state.vtx_desc.Color0, g_main_cp_state.vtx_desc.Color1}; colElements[0] = m_CurrentVat->g0.Color0Elements; colElements[1] = m_CurrentVat->g0.Color1Elements; const u32 colComp[2] = {m_CurrentVat->g0.Color0Comp, m_CurrentVat->g0.Color1Comp}; // TextureCoord const u64 tcDesc[8] = { - g_VtxDesc.Tex0Coord, g_VtxDesc.Tex1Coord, g_VtxDesc.Tex2Coord, g_VtxDesc.Tex3Coord, - g_VtxDesc.Tex4Coord, g_VtxDesc.Tex5Coord, g_VtxDesc.Tex6Coord, g_VtxDesc.Tex7Coord + g_main_cp_state.vtx_desc.Tex0Coord, g_main_cp_state.vtx_desc.Tex1Coord, g_main_cp_state.vtx_desc.Tex2Coord, g_main_cp_state.vtx_desc.Tex3Coord, + g_main_cp_state.vtx_desc.Tex4Coord, g_main_cp_state.vtx_desc.Tex5Coord, g_main_cp_state.vtx_desc.Tex6Coord, g_main_cp_state.vtx_desc.Tex7Coord }; const u32 tcElements[8] = { m_CurrentVat->g0.Tex0CoordElements, m_CurrentVat->g1.Tex1CoordElements, m_CurrentVat->g1.Tex2CoordElements, @@ -89,15 +89,15 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) // Reset vertex // matrix index from xf regs or cp memory? 
- if (xfmem.MatrixIndexA.PosNormalMtxIdx != MatrixIndexA.PosNormalMtxIdx || - xfmem.MatrixIndexA.Tex0MtxIdx != MatrixIndexA.Tex0MtxIdx || - xfmem.MatrixIndexA.Tex1MtxIdx != MatrixIndexA.Tex1MtxIdx || - xfmem.MatrixIndexA.Tex2MtxIdx != MatrixIndexA.Tex2MtxIdx || - xfmem.MatrixIndexA.Tex3MtxIdx != MatrixIndexA.Tex3MtxIdx || - xfmem.MatrixIndexB.Tex4MtxIdx != MatrixIndexB.Tex4MtxIdx || - xfmem.MatrixIndexB.Tex5MtxIdx != MatrixIndexB.Tex5MtxIdx || - xfmem.MatrixIndexB.Tex6MtxIdx != MatrixIndexB.Tex6MtxIdx || - xfmem.MatrixIndexB.Tex7MtxIdx != MatrixIndexB.Tex7MtxIdx) + if (xfmem.MatrixIndexA.PosNormalMtxIdx != g_main_cp_state.matrix_index_a.PosNormalMtxIdx || + xfmem.MatrixIndexA.Tex0MtxIdx != g_main_cp_state.matrix_index_a.Tex0MtxIdx || + xfmem.MatrixIndexA.Tex1MtxIdx != g_main_cp_state.matrix_index_a.Tex1MtxIdx || + xfmem.MatrixIndexA.Tex2MtxIdx != g_main_cp_state.matrix_index_a.Tex2MtxIdx || + xfmem.MatrixIndexA.Tex3MtxIdx != g_main_cp_state.matrix_index_a.Tex3MtxIdx || + xfmem.MatrixIndexB.Tex4MtxIdx != g_main_cp_state.matrix_index_b.Tex4MtxIdx || + xfmem.MatrixIndexB.Tex5MtxIdx != g_main_cp_state.matrix_index_b.Tex5MtxIdx || + xfmem.MatrixIndexB.Tex6MtxIdx != g_main_cp_state.matrix_index_b.Tex6MtxIdx || + xfmem.MatrixIndexB.Tex7MtxIdx != g_main_cp_state.matrix_index_b.Tex7MtxIdx) { WARN_LOG(VIDEO, "Matrix indices don't match"); @@ -118,18 +118,18 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) m_Vertex.texMtx[6] = xfmem.MatrixIndexB.Tex6MtxIdx; m_Vertex.texMtx[7] = xfmem.MatrixIndexB.Tex7MtxIdx; #else - m_Vertex.posMtx = MatrixIndexA.PosNormalMtxIdx; - m_Vertex.texMtx[0] = MatrixIndexA.Tex0MtxIdx; - m_Vertex.texMtx[1] = MatrixIndexA.Tex1MtxIdx; - m_Vertex.texMtx[2] = MatrixIndexA.Tex2MtxIdx; - m_Vertex.texMtx[3] = MatrixIndexA.Tex3MtxIdx; - m_Vertex.texMtx[4] = MatrixIndexB.Tex4MtxIdx; - m_Vertex.texMtx[5] = MatrixIndexB.Tex5MtxIdx; - m_Vertex.texMtx[6] = MatrixIndexB.Tex6MtxIdx; - m_Vertex.texMtx[7] = MatrixIndexB.Tex7MtxIdx; + 
m_Vertex.posMtx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx; + m_Vertex.texMtx[0] = g_main_cp_state.matrix_index_a.Tex0MtxIdx; + m_Vertex.texMtx[1] = g_main_cp_state.matrix_index_a.Tex1MtxIdx; + m_Vertex.texMtx[2] = g_main_cp_state.matrix_index_a.Tex2MtxIdx; + m_Vertex.texMtx[3] = g_main_cp_state.matrix_index_a.Tex3MtxIdx; + m_Vertex.texMtx[4] = g_main_cp_state.matrix_index_b.Tex4MtxIdx; + m_Vertex.texMtx[5] = g_main_cp_state.matrix_index_b.Tex5MtxIdx; + m_Vertex.texMtx[6] = g_main_cp_state.matrix_index_b.Tex6MtxIdx; + m_Vertex.texMtx[7] = g_main_cp_state.matrix_index_b.Tex7MtxIdx; #endif - if (g_VtxDesc.PosMatIdx != NOT_PRESENT) + if (g_main_cp_state.vtx_desc.PosMatIdx != NOT_PRESENT) { AddAttributeLoader(LoadPosMtx); m_VertexSize++; @@ -145,17 +145,17 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) } // Write vertex position loader - m_positionLoader = VertexLoader_Position::GetFunction(g_VtxDesc.Position, m_CurrentVat->g0.PosFormat, m_CurrentVat->g0.PosElements); - m_VertexSize += VertexLoader_Position::GetSize(g_VtxDesc.Position, m_CurrentVat->g0.PosFormat, m_CurrentVat->g0.PosElements); + m_positionLoader = VertexLoader_Position::GetFunction(g_main_cp_state.vtx_desc.Position, m_CurrentVat->g0.PosFormat, m_CurrentVat->g0.PosElements); + m_VertexSize += VertexLoader_Position::GetSize(g_main_cp_state.vtx_desc.Position, m_CurrentVat->g0.PosFormat, m_CurrentVat->g0.PosElements); AddAttributeLoader(LoadPosition); // Normals - if (g_VtxDesc.Normal != NOT_PRESENT) + if (g_main_cp_state.vtx_desc.Normal != NOT_PRESENT) { - m_VertexSize += VertexLoader_Normal::GetSize(g_VtxDesc.Normal, + m_VertexSize += VertexLoader_Normal::GetSize(g_main_cp_state.vtx_desc.Normal, m_CurrentVat->g0.NormalFormat, m_CurrentVat->g0.NormalElements, m_CurrentVat->g0.NormalIndex3); - m_normalLoader = VertexLoader_Normal::GetFunction(g_VtxDesc.Normal, + m_normalLoader = VertexLoader_Normal::GetFunction(g_main_cp_state.vtx_desc.Normal, m_CurrentVat->g0.NormalFormat, 
m_CurrentVat->g0.NormalElements, m_CurrentVat->g0.NormalIndex3); if (m_normalLoader == nullptr) @@ -234,8 +234,8 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) // special case if only pos and tex coord 0 and tex coord input is AB11 m_TexGenSpecialCase = - ((g_VtxDesc.Hex & 0x60600L) == g_VtxDesc.Hex) && // only pos and tex coord 0 - (g_VtxDesc.Tex0Coord != NOT_PRESENT) && + ((g_main_cp_state.vtx_desc.Hex & 0x60600L) == g_main_cp_state.vtx_desc.Hex) && // only pos and tex coord 0 + (g_main_cp_state.vtx_desc.Tex0Coord != NOT_PRESENT) && (xfmem.texMtxInfo[0].projection == XF_TEXPROJ_ST); m_SetupUnit->Init(primitiveType); @@ -252,7 +252,7 @@ void SWVertexLoader::LoadVertex() // transform input data TransformUnit::TransformPosition(&m_Vertex, outVertex); - if (g_VtxDesc.Normal != NOT_PRESENT) + if (g_main_cp_state.vtx_desc.Normal != NOT_PRESENT) { TransformUnit::TransformNormal(&m_Vertex, m_CurrentVat->g0.NormalElements, outVertex); } diff --git a/Source/Core/VideoBackends/Software/SWmain.cpp b/Source/Core/VideoBackends/Software/SWmain.cpp index 25d9e0d543..9063cece5e 100644 --- a/Source/Core/VideoBackends/Software/SWmain.cpp +++ b/Source/Core/VideoBackends/Software/SWmain.cpp @@ -116,14 +116,7 @@ void VideoSoftware::DoState(PointerWrap& p) p.DoPOD(swstats); // CP Memory - p.DoArray(arraybases, 16); - p.DoArray(arraystrides, 16); - p.Do(MatrixIndexA); - p.Do(MatrixIndexB); - p.Do(g_VtxDesc.Hex); - p.DoArray(g_VtxAttr, 8); - p.DoMarker("CP Memory"); - + DoCPState(p); } void VideoSoftware::CheckInvalidState() diff --git a/Source/Core/VideoBackends/Software/XFMemLoader.cpp b/Source/Core/VideoBackends/Software/XFMemLoader.cpp index 08e03ad8d8..1ad1804f72 100644 --- a/Source/Core/VideoBackends/Software/XFMemLoader.cpp +++ b/Source/Core/VideoBackends/Software/XFMemLoader.cpp @@ -74,7 +74,7 @@ void SWLoadIndexedXF(u32 val, int array) int size = ((val >> 12) & 0xF) + 1; //load stuff from array to address in xf mem - u32 *pData = 
(u32*)Memory::GetPointer(arraybases[array] + arraystrides[array]*index); + u32 *pData = (u32*)Memory::GetPointer(g_main_cp_state.array_bases[array] + g_main_cp_state.array_strides[array]*index); // byteswap data u32 buffer[16]; diff --git a/Source/Core/VideoCommon/CPMemory.cpp b/Source/Core/VideoCommon/CPMemory.cpp index 752063065b..22bc95c57d 100644 --- a/Source/Core/VideoCommon/CPMemory.cpp +++ b/Source/Core/VideoCommon/CPMemory.cpp @@ -2,17 +2,22 @@ // Licensed under GPLv2 // Refer to the license.txt file included. +#include "Common/ChunkFile.h" #include "Common/CommonTypes.h" #include "VideoCommon/CPMemory.h" // CP state u8 *cached_arraybases[16]; -// STATE_TO_SAVE -u32 arraybases[16]; -u32 arraystrides[16]; -TMatrixIndexA MatrixIndexA; -TMatrixIndexB MatrixIndexB; -TVtxDesc g_VtxDesc; -// Most games only use the first VtxAttr and simply reconfigure it all the time as needed. -VAT g_VtxAttr[8]; +CPState g_main_cp_state; + +void DoCPState(PointerWrap& p) +{ + p.DoArray(g_main_cp_state.array_bases, 16); + p.DoArray(g_main_cp_state.array_strides, 16); + p.Do(g_main_cp_state.matrix_index_a); + p.Do(g_main_cp_state.matrix_index_b); + p.Do(g_main_cp_state.vtx_desc.Hex); + p.DoArray(g_main_cp_state.vtx_attr, 8); + p.DoMarker("CP Memory"); +} diff --git a/Source/Core/VideoCommon/CPMemory.h b/Source/Core/VideoCommon/CPMemory.h index cac82e0be0..9232a6b4f8 100644 --- a/Source/Core/VideoCommon/CPMemory.h +++ b/Source/Core/VideoCommon/CPMemory.h @@ -231,12 +231,6 @@ union TMatrixIndexB #pragma pack() -extern u32 arraybases[16]; -extern u8 *cached_arraybases[16]; -extern u32 arraystrides[16]; -extern TMatrixIndexA MatrixIndexA; -extern TMatrixIndexB MatrixIndexB; - struct VAT { UVAT_group0 g0; @@ -244,8 +238,25 @@ struct VAT UVAT_group2 g2; }; -extern TVtxDesc g_VtxDesc; -extern VAT g_VtxAttr[8]; +// STATE_TO_SAVE +struct CPState final +{ + u32 array_bases[16]; + u32 array_strides[16]; + TMatrixIndexA matrix_index_a; + TMatrixIndexB matrix_index_b; + TVtxDesc vtx_desc; + // 
Most games only use the first VtxAttr and simply reconfigure it all the time as needed. + VAT vtx_attr[8]; +}; + +class PointerWrap; + +extern void DoCPState(PointerWrap& p); + +extern CPState g_main_cp_state; + +extern u8 *cached_arraybases[16]; // Might move this into its own file later. void LoadCPReg(u32 SubCmd, u32 Value); diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index ab78e85f9e..b2f3cd7a4e 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -33,7 +33,7 @@ // Matrix components are first in GC format but later in PC format - we need to store it temporarily // when decoding each vertex. -static u8 s_curposmtx = MatrixIndexA.PosNormalMtxIdx; +static u8 s_curposmtx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx; static u8 s_curtexmtx[8]; static int s_texmtxwrite = 0; static int s_texmtxread = 0; @@ -87,7 +87,7 @@ static void LOADERDECL PosMtx_Write() DataWrite(0); // Resetting current position matrix to default is needed for bbox to behave - s_curposmtx = (u8) MatrixIndexA.PosNormalMtxIdx; + s_curposmtx = (u8) g_main_cp_state.matrix_index_a.PosNormalMtxIdx; } static void LOADERDECL UpdateBoundingBoxPrepare() diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index b3a1c97c3f..88c8b2d668 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -112,7 +112,7 @@ static VertexLoader* RefreshLoader(int vtx_attr_group) VertexLoader* loader; if ((s_attr_dirty >> vtx_attr_group) & 1) { - VertexLoaderUID uid(g_VtxDesc, g_VtxAttr[vtx_attr_group]); + VertexLoaderUID uid(g_main_cp_state.vtx_desc, g_main_cp_state.vtx_attr[vtx_attr_group]); std::lock_guard lk(s_vertex_loader_map_lock); VertexLoaderMap::iterator iter = s_vertex_loader_map.find(uid); if (iter != s_vertex_loader_map.end()) @@ -121,7 +121,7 @@ static VertexLoader* RefreshLoader(int 
vtx_attr_group) } else { - loader = new VertexLoader(g_VtxDesc, g_VtxAttr[vtx_attr_group]); + loader = new VertexLoader(g_main_cp_state.vtx_desc, g_main_cp_state.vtx_attr[vtx_attr_group]); s_vertex_loader_map[uid] = std::unique_ptr(loader); INCSTAT(stats.numVertexLoaders); } @@ -161,7 +161,7 @@ bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, VertexManager::PrepareForAdditionalData(primitive, count, loader->GetNativeVertexDeclaration().stride); - loader->RunVertices(g_VtxAttr[vtx_attr_group], primitive, count); + loader->RunVertices(g_main_cp_state.vtx_attr[vtx_attr_group], primitive, count); IndexGenerator::AddIndices(primitive, count); @@ -195,65 +195,65 @@ void LoadCPReg(u32 sub_cmd, u32 value) break; case 0x50: - g_VtxDesc.Hex &= ~0x1FFFF; // keep the Upper bits - g_VtxDesc.Hex |= value; + g_main_cp_state.vtx_desc.Hex &= ~0x1FFFF; // keep the Upper bits + g_main_cp_state.vtx_desc.Hex |= value; s_attr_dirty = 0xFF; break; case 0x60: - g_VtxDesc.Hex &= 0x1FFFF; // keep the lower 17Bits - g_VtxDesc.Hex |= (u64)value << 17; + g_main_cp_state.vtx_desc.Hex &= 0x1FFFF; // keep the lower 17Bits + g_main_cp_state.vtx_desc.Hex |= (u64)value << 17; s_attr_dirty = 0xFF; break; case 0x70: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g0.Hex = value; + g_main_cp_state.vtx_attr[sub_cmd & 7].g0.Hex = value; s_attr_dirty |= 1 << (sub_cmd & 7); break; case 0x80: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g1.Hex = value; + g_main_cp_state.vtx_attr[sub_cmd & 7].g1.Hex = value; s_attr_dirty |= 1 << (sub_cmd & 7); break; case 0x90: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g2.Hex = value; + g_main_cp_state.vtx_attr[sub_cmd & 7].g2.Hex = value; s_attr_dirty |= 1 << (sub_cmd & 7); break; // Pointers to vertex arrays in GC RAM case 0xA0: - arraybases[sub_cmd & 0xF] = value; + g_main_cp_state.array_bases[sub_cmd & 0xF] = value; cached_arraybases[sub_cmd & 0xF] = Memory::GetPointer(value); break; case 0xB0: - 
arraystrides[sub_cmd & 0xF] = value & 0xFF; + g_main_cp_state.array_strides[sub_cmd & 0xF] = value & 0xFF; break; } } void FillCPMemoryArray(u32 *memory) { - memory[0x30] = MatrixIndexA.Hex; - memory[0x40] = MatrixIndexB.Hex; - memory[0x50] = (u32)g_VtxDesc.Hex; - memory[0x60] = (u32)(g_VtxDesc.Hex >> 17); + memory[0x30] = g_main_cp_state.matrix_index_a.Hex; + memory[0x40] = g_main_cp_state.matrix_index_b.Hex; + memory[0x50] = (u32)g_main_cp_state.vtx_desc.Hex; + memory[0x60] = (u32)(g_main_cp_state.vtx_desc.Hex >> 17); for (int i = 0; i < 8; ++i) { - memory[0x70 + i] = g_VtxAttr[i].g0.Hex; - memory[0x80 + i] = g_VtxAttr[i].g1.Hex; - memory[0x90 + i] = g_VtxAttr[i].g2.Hex; + memory[0x70 + i] = g_main_cp_state.vtx_attr[i].g0.Hex; + memory[0x80 + i] = g_main_cp_state.vtx_attr[i].g1.Hex; + memory[0x90 + i] = g_main_cp_state.vtx_attr[i].g2.Hex; } for (int i = 0; i < 16; ++i) { - memory[0xA0 + i] = arraybases[i]; - memory[0xB0 + i] = arraystrides[i]; + memory[0xA0 + i] = g_main_cp_state.array_bases[i]; + memory[0xB0 + i] = g_main_cp_state.array_strides[i]; } } @@ -261,6 +261,6 @@ void RecomputeCachedArraybases() { for (int i = 0; i < 16; i++) { - cached_arraybases[i] = Memory::GetPointer(arraybases[i]); + cached_arraybases[i] = Memory::GetPointer(g_main_cp_state.array_bases[i]); } } diff --git a/Source/Core/VideoCommon/VertexLoader_Color.cpp b/Source/Core/VideoCommon/VertexLoader_Color.cpp index 13b01f0bdb..0120e3bea5 100644 --- a/Source/Core/VideoCommon/VertexLoader_Color.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Color.cpp @@ -117,7 +117,7 @@ template void Color_ReadIndex_16b_565() { auto const Index = DataRead(); - u16 val = Common::swap16(*(const u16 *)(cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]))); + u16 val = Common::swap16(*(const u16 *)(cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]))); _SetCol565(val); } @@ -125,7 +125,7 @@ template void 
Color_ReadIndex_24b_888() { auto const Index = DataRead(); - const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]); + const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]); _SetCol(_Read24(iAddress)); } @@ -133,7 +133,7 @@ template void Color_ReadIndex_32b_888x() { auto const Index = DataRead(); - const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]); + const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]); _SetCol(_Read24(iAddress)); } @@ -141,7 +141,7 @@ template void Color_ReadIndex_16b_4444() { auto const Index = DataRead(); - u16 val = *(const u16 *)(cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex])); + u16 val = *(const u16 *)(cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex])); _SetCol4444(val); } @@ -149,7 +149,7 @@ template void Color_ReadIndex_24b_6666() { auto const Index = DataRead(); - const u8* pData = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]) - 1; + const u8* pData = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]) - 1; u32 val = Common::swap32(pData); _SetCol6666(val); } @@ -158,7 +158,7 @@ template void Color_ReadIndex_32b_8888() { auto const Index = DataRead(); - const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]); + const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]); _SetCol(_Read32(iAddress)); } diff --git a/Source/Core/VideoCommon/VertexLoader_Normal.cpp b/Source/Core/VideoCommon/VertexLoader_Normal.cpp index 0fb24d9264..3d58592d70 100644 --- 
a/Source/Core/VideoCommon/VertexLoader_Normal.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Normal.cpp @@ -80,7 +80,7 @@ __forceinline void Normal_Index_Offset() auto const index = DataRead(); auto const data = reinterpret_cast(cached_arraybases[ARRAY_NORMAL] - + (index * arraystrides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset); + + (index * g_main_cp_state.array_strides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset); ReadIndirect(data); } diff --git a/Source/Core/VideoCommon/VertexLoader_Position.cpp b/Source/Core/VideoCommon/VertexLoader_Position.cpp index 0c27b2b6fb..a38d429d58 100644 --- a/Source/Core/VideoCommon/VertexLoader_Position.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Position.cpp @@ -91,7 +91,7 @@ void LOADERDECL Pos_ReadIndex() static_assert(N <= 3, "N > 3 is not sane!"); auto const index = DataRead(); - auto const data = reinterpret_cast(cached_arraybases[ARRAY_POSITION] + (index * arraystrides[ARRAY_POSITION])); + auto const data = reinterpret_cast(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); auto const scale = posScale; DataWriter dst; @@ -109,7 +109,7 @@ template void LOADERDECL Pos_ReadIndex_Float_SSSE3() { auto const index = DataRead(); - const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (index * arraystrides[ARRAY_POSITION])); + const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); GC_ALIGNED128(const __m128i a = _mm_loadu_si128((__m128i*)pData)); GC_ALIGNED128(__m128i b = _mm_shuffle_epi8(a, three ? 
kMaskSwap32_3 : kMaskSwap32_2)); _mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b); diff --git a/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp b/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp index 14b7efb451..25114ac82d 100644 --- a/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp +++ b/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp @@ -73,7 +73,7 @@ void LOADERDECL TexCoord_ReadIndex() auto const index = DataRead(); auto const data = reinterpret_cast(cached_arraybases[ARRAY_TEXCOORD0 + tcIndex] - + (index * arraystrides[ARRAY_TEXCOORD0 + tcIndex])); + + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + tcIndex])); auto const scale = tcScale[tcIndex]; DataWriter dst; @@ -94,7 +94,7 @@ void LOADERDECL TexCoord_ReadIndex_Short2_SSE4() // Heavy in ZWW auto const index = DataRead(); - const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); + const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex])); const __m128i a = _mm_cvtsi32_si128(*pData); const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2); const __m128i c = _mm_cvtepi16_epi32(b); @@ -117,7 +117,7 @@ void LOADERDECL TexCoord_ReadIndex_Float2_SSSE3() static_assert(!std::numeric_limits::is_signed, "Only unsigned I is sane!"); auto const index = DataRead(); - const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); + const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex])); GC_ALIGNED128(const __m128i a = _mm_loadl_epi64((__m128i*)pData)); GC_ALIGNED128(const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32)); _mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b); diff --git a/Source/Core/VideoCommon/VertexShaderGen.cpp 
b/Source/Core/VideoCommon/VertexShaderGen.cpp index 293b75cc78..e1526af7fa 100644 --- a/Source/Core/VideoCommon/VertexShaderGen.cpp +++ b/Source/Core/VideoCommon/VertexShaderGen.cpp @@ -245,8 +245,8 @@ static inline void GenerateVertexShader(T& out, u32 components, API_TYPE api_typ // donko - this has caused problems in some games. removed for now. bool texGenSpecialCase = false; /*bool texGenSpecialCase = - ((g_VtxDesc.Hex & 0x60600L) == g_VtxDesc.Hex) && // only pos and tex coord 0 - (g_VtxDesc.Tex0Coord != NOT_PRESENT) && + ((g_main_cp_state.vtx_desc.Hex & 0x60600L) == g_main_cp_state.vtx_desc.Hex) && // only pos and tex coord 0 + (g_main_cp_state.vtx_desc.Tex0Coord != NOT_PRESENT) && (xfmem.texcoords[0].texmtxinfo.inputform == XF_TEXINPUT_AB11); */ diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp index 70e203d99e..022bf7683d 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.cpp +++ b/Source/Core/VideoCommon/VertexShaderManager.cpp @@ -329,8 +329,8 @@ void VertexShaderManager::SetConstants() { bPosNormalMatrixChanged = false; - const float *pos = (const float *)xfmem.posMatrices + MatrixIndexA.PosNormalMtxIdx * 4; - const float *norm = (const float *)xfmem.normalMatrices + 3 * (MatrixIndexA.PosNormalMtxIdx & 31); + const float *pos = (const float *)xfmem.posMatrices + g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4; + const float *norm = (const float *)xfmem.normalMatrices + 3 * (g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 31); memcpy(constants.posnormalmatrix, pos, 3*16); memcpy(constants.posnormalmatrix[3], norm, 12); @@ -344,10 +344,10 @@ void VertexShaderManager::SetConstants() bTexMatricesChanged[0] = false; const float *fptrs[] = { - (const float *)&xfmem.posMatrices[MatrixIndexA.Tex0MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexA.Tex1MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexA.Tex2MtxIdx * 4], - (const float 
*)&xfmem.posMatrices[MatrixIndexA.Tex3MtxIdx * 4] + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_a.Tex0MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_a.Tex1MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_a.Tex2MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_a.Tex3MtxIdx * 4] }; for (int i = 0; i < 4; ++i) @@ -361,10 +361,10 @@ void VertexShaderManager::SetConstants() { bTexMatricesChanged[1] = false; const float *fptrs[] = { - (const float *)&xfmem.posMatrices[MatrixIndexB.Tex4MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexB.Tex5MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexB.Tex6MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexB.Tex7MtxIdx * 4] + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_b.Tex4MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_b.Tex5MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_b.Tex6MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_b.Tex7MtxIdx * 4] }; for (int i = 0; i < 4; ++i) @@ -536,26 +536,26 @@ void VertexShaderManager::SetConstants() void VertexShaderManager::InvalidateXFRange(int start, int end) { - if (((u32)start >= (u32)MatrixIndexA.PosNormalMtxIdx * 4 && - (u32)start < (u32)MatrixIndexA.PosNormalMtxIdx * 4 + 12) || - ((u32)start >= XFMEM_NORMALMATRICES + ((u32)MatrixIndexA.PosNormalMtxIdx & 31) * 3 && - (u32)start < XFMEM_NORMALMATRICES + ((u32)MatrixIndexA.PosNormalMtxIdx & 31) * 3 + 9)) + if (((u32)start >= (u32)g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4 && + (u32)start < (u32)g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4 + 12) || + ((u32)start >= XFMEM_NORMALMATRICES + ((u32)g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 31) * 3 && + (u32)start < XFMEM_NORMALMATRICES + ((u32)g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 31) * 3 + 9)) { bPosNormalMatrixChanged = 
true; } - if (((u32)start >= (u32)MatrixIndexA.Tex0MtxIdx*4 && (u32)start < (u32)MatrixIndexA.Tex0MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexA.Tex1MtxIdx*4 && (u32)start < (u32)MatrixIndexA.Tex1MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexA.Tex2MtxIdx*4 && (u32)start < (u32)MatrixIndexA.Tex2MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexA.Tex3MtxIdx*4 && (u32)start < (u32)MatrixIndexA.Tex3MtxIdx*4+12)) + if (((u32)start >= (u32)g_main_cp_state.matrix_index_a.Tex0MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_a.Tex0MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_a.Tex1MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_a.Tex1MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_a.Tex2MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_a.Tex2MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_a.Tex3MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_a.Tex3MtxIdx*4+12)) { bTexMatricesChanged[0] = true; } - if (((u32)start >= (u32)MatrixIndexB.Tex4MtxIdx*4 && (u32)start < (u32)MatrixIndexB.Tex4MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexB.Tex5MtxIdx*4 && (u32)start < (u32)MatrixIndexB.Tex5MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexB.Tex6MtxIdx*4 && (u32)start < (u32)MatrixIndexB.Tex6MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexB.Tex7MtxIdx*4 && (u32)start < (u32)MatrixIndexB.Tex7MtxIdx*4+12)) + if (((u32)start >= (u32)g_main_cp_state.matrix_index_b.Tex4MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_b.Tex4MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_b.Tex5MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_b.Tex5MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_b.Tex6MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_b.Tex6MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_b.Tex7MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_b.Tex7MtxIdx*4+12)) { bTexMatricesChanged[1] = 
true; } @@ -628,23 +628,23 @@ void VertexShaderManager::InvalidateXFRange(int start, int end) void VertexShaderManager::SetTexMatrixChangedA(u32 Value) { - if (MatrixIndexA.Hex != Value) + if (g_main_cp_state.matrix_index_a.Hex != Value) { VertexManager::Flush(); - if (MatrixIndexA.PosNormalMtxIdx != (Value&0x3f)) + if (g_main_cp_state.matrix_index_a.PosNormalMtxIdx != (Value&0x3f)) bPosNormalMatrixChanged = true; bTexMatricesChanged[0] = true; - MatrixIndexA.Hex = Value; + g_main_cp_state.matrix_index_a.Hex = Value; } } void VertexShaderManager::SetTexMatrixChangedB(u32 Value) { - if (MatrixIndexB.Hex != Value) + if (g_main_cp_state.matrix_index_b.Hex != Value) { VertexManager::Flush(); bTexMatricesChanged[1] = true; - MatrixIndexB.Hex = Value; + g_main_cp_state.matrix_index_b.Hex = Value; } } diff --git a/Source/Core/VideoCommon/VideoState.cpp b/Source/Core/VideoCommon/VideoState.cpp index a8d5ddcf04..72232e2737 100644 --- a/Source/Core/VideoCommon/VideoState.cpp +++ b/Source/Core/VideoCommon/VideoState.cpp @@ -22,13 +22,7 @@ static void DoState(PointerWrap &p) p.DoMarker("BP Memory"); // CP Memory - p.DoArray(arraybases, 16); - p.DoArray(arraystrides, 16); - p.Do(MatrixIndexA); - p.Do(MatrixIndexB); - p.Do(g_VtxDesc.Hex); - p.DoArray(g_VtxAttr, 8); - p.DoMarker("CP Memory"); + DoCPState(p); // XF Memory p.Do(xfmem); @@ -73,11 +67,6 @@ void VideoCommon_RunLoop(bool enable) void VideoCommon_Init() { - memset(arraybases, 0, sizeof(arraybases)); - memset(arraystrides, 0, sizeof(arraystrides)); - memset(&MatrixIndexA, 0, sizeof(MatrixIndexA)); - memset(&MatrixIndexB, 0, sizeof(MatrixIndexB)); - memset(&g_VtxDesc, 0, sizeof(g_VtxDesc)); - memset(g_VtxAttr, 0, sizeof(g_VtxAttr)); + memset(&g_main_cp_state, 0, sizeof(g_main_cp_state)); memset(texMem, 0, TMEM_SIZE); } diff --git a/Source/Core/VideoCommon/XFStructs.cpp b/Source/Core/VideoCommon/XFStructs.cpp index 650a4a913b..a0941a0133 100644 --- a/Source/Core/VideoCommon/XFStructs.cpp +++ 
b/Source/Core/VideoCommon/XFStructs.cpp @@ -252,7 +252,7 @@ void LoadIndexedXF(u32 val, int refarray) //load stuff from array to address in xf mem u32* currData = (u32*)(&xfmem) + address; - u32* newData = (u32*)Memory::GetPointer(arraybases[refarray] + arraystrides[refarray] * index); + u32* newData = (u32*)Memory::GetPointer(g_main_cp_state.array_bases[refarray] + g_main_cp_state.array_strides[refarray] * index); bool changed = false; for (int i = 0; i < size; ++i) { From e86ddacb18affaef2d773b0c86fdf57f93d594c4 Mon Sep 17 00:00:00 2001 From: comex Date: Wed, 27 Aug 2014 13:38:00 -0400 Subject: [PATCH 05/10] Changes to allow LoadCPReg to work in a preprocess mode which affects a separate state. This state will be used to calculate sizes for skipping over commands on a separate thread. An alternative to having these state variables would be to have the preprocessor stash "state as we go" somewhere, but I think that would be much uglier. GetVertexSize now takes an extra argument to determine which state to use, as does FifoCommandRunnable, which calls it. While I'm modifying FifoCommandRunnable, I also change it to take a buffer and size as parameters rather than using g_pVideoData, which will also be necessary later. I also get rid of an unused overload. 
--- Source/Core/VideoCommon/CPMemory.cpp | 10 +++ Source/Core/VideoCommon/CPMemory.h | 11 ++- .../Core/VideoCommon/VertexLoaderManager.cpp | 80 ++++++++++--------- Source/Core/VideoCommon/VertexLoaderManager.h | 2 +- Source/Core/VideoCommon/VideoState.cpp | 1 + 5 files changed, 64 insertions(+), 40 deletions(-) diff --git a/Source/Core/VideoCommon/CPMemory.cpp b/Source/Core/VideoCommon/CPMemory.cpp index 22bc95c57d..739a3e66e3 100644 --- a/Source/Core/VideoCommon/CPMemory.cpp +++ b/Source/Core/VideoCommon/CPMemory.cpp @@ -10,9 +10,12 @@ u8 *cached_arraybases[16]; CPState g_main_cp_state; +CPState g_preprocess_cp_state; void DoCPState(PointerWrap& p) { + // We don't save g_preprocess_cp_state separately because the GPU should be + // synced around state save/load. p.DoArray(g_main_cp_state.array_bases, 16); p.DoArray(g_main_cp_state.array_strides, 16); p.Do(g_main_cp_state.matrix_index_a); @@ -20,4 +23,11 @@ void DoCPState(PointerWrap& p) p.Do(g_main_cp_state.vtx_desc.Hex); p.DoArray(g_main_cp_state.vtx_attr, 8); p.DoMarker("CP Memory"); + if (p.mode == PointerWrap::MODE_READ) + CopyPreprocessCPStateFromMain(); +} + +void CopyPreprocessCPStateFromMain() +{ + memcpy(&g_preprocess_cp_state, &g_main_cp_state, sizeof(CPState)); } diff --git a/Source/Core/VideoCommon/CPMemory.h b/Source/Core/VideoCommon/CPMemory.h index 9232a6b4f8..ae8ff08303 100644 --- a/Source/Core/VideoCommon/CPMemory.h +++ b/Source/Core/VideoCommon/CPMemory.h @@ -238,6 +238,8 @@ struct VAT UVAT_group2 g2; }; +class VertexLoader; + // STATE_TO_SAVE struct CPState final { @@ -248,18 +250,25 @@ struct CPState final TVtxDesc vtx_desc; // Most games only use the first VtxAttr and simply reconfigure it all the time as needed. 
VAT vtx_attr[8]; + + // Attributes that actually belong to VertexLoaderManager: + int attr_dirty; // bitfield + VertexLoader* vertex_loaders[8]; }; class PointerWrap; extern void DoCPState(PointerWrap& p); +extern void CopyPreprocessCPStateFromMain(); + extern CPState g_main_cp_state; +extern CPState g_preprocess_cp_state; extern u8 *cached_arraybases[16]; // Might move this into its own file later. -void LoadCPReg(u32 SubCmd, u32 Value); +void LoadCPReg(u32 SubCmd, u32 Value, bool is_preprocess = false); // Fills memory with data from CP regs void FillCPMemoryArray(u32 *memory); diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index 88c8b2d668..ca925d0302 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -21,12 +21,8 @@ #include "VideoCommon/VertexShaderManager.h" #include "VideoCommon/VideoCommon.h" -static int s_attr_dirty; // bitfield - static NativeVertexFormat* s_current_vtx_fmt; -static VertexLoader* s_VertexLoaders[8]; - namespace std { @@ -53,10 +49,10 @@ static VertexLoaderMap s_vertex_loader_map; void Init() { MarkAllDirty(); - for (auto& map_entry : s_VertexLoaders) - { + for (auto& map_entry : g_main_cp_state.vertex_loaders) + map_entry = nullptr; + for (auto& map_entry : g_preprocess_cp_state.vertex_loaders) map_entry = nullptr; - } RecomputeCachedArraybases(); } @@ -104,15 +100,16 @@ void AppendListToString(std::string *dest) void MarkAllDirty() { - s_attr_dirty = 0xff; + g_main_cp_state.attr_dirty = 0xff; + g_preprocess_cp_state.attr_dirty = 0xff; } -static VertexLoader* RefreshLoader(int vtx_attr_group) +static VertexLoader* RefreshLoader(int vtx_attr_group, CPState* state) { VertexLoader* loader; - if ((s_attr_dirty >> vtx_attr_group) & 1) + if ((state->attr_dirty >> vtx_attr_group) & 1) { - VertexLoaderUID uid(g_main_cp_state.vtx_desc, g_main_cp_state.vtx_attr[vtx_attr_group]); + VertexLoaderUID uid(state->vtx_desc, 
state->vtx_attr[vtx_attr_group]); std::lock_guard lk(s_vertex_loader_map_lock); VertexLoaderMap::iterator iter = s_vertex_loader_map.find(uid); if (iter != s_vertex_loader_map.end()) @@ -121,14 +118,14 @@ static VertexLoader* RefreshLoader(int vtx_attr_group) } else { - loader = new VertexLoader(g_main_cp_state.vtx_desc, g_main_cp_state.vtx_attr[vtx_attr_group]); + loader = new VertexLoader(state->vtx_desc, state->vtx_attr[vtx_attr_group]); s_vertex_loader_map[uid] = std::unique_ptr(loader); INCSTAT(stats.numVertexLoaders); } - s_VertexLoaders[vtx_attr_group] = loader; - s_attr_dirty &= ~(1 << vtx_attr_group); + state->vertex_loaders[vtx_attr_group] = loader; + state->attr_dirty &= ~(1 << vtx_attr_group); } else { - loader = s_VertexLoaders[vtx_attr_group]; + loader = state->vertex_loaders[vtx_attr_group]; } return loader; } @@ -137,7 +134,10 @@ bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, { if (!count) return true; - VertexLoader* loader = RefreshLoader(vtx_attr_group); + + CPState* state = &g_main_cp_state; + + VertexLoader* loader = RefreshLoader(vtx_attr_group, state); size_t size = count * loader->GetVertexSize(); if (buf_size < size) @@ -152,7 +152,6 @@ bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, NativeVertexFormat* native = loader->GetNativeVertexFormat(); - // If the native vertex format changed, force a flush. 
if (native != s_current_vtx_fmt) VertexManager::Flush(); @@ -161,7 +160,7 @@ bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, VertexManager::PrepareForAdditionalData(primitive, count, loader->GetNativeVertexDeclaration().stride); - loader->RunVertices(g_main_cp_state.vtx_attr[vtx_attr_group], primitive, count); + loader->RunVertices(state->vtx_attr[vtx_attr_group], primitive, count); IndexGenerator::AddIndices(primitive, count); @@ -170,9 +169,9 @@ bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, return true; } -int GetVertexSize(int vtx_attr_group) +int GetVertexSize(int vtx_attr_group, bool preprocess) { - return RefreshLoader(vtx_attr_group)->GetVertexSize(); + return RefreshLoader(vtx_attr_group, preprocess ? &g_preprocess_cp_state : &g_main_cp_state)->GetVertexSize(); } NativeVertexFormat* GetCurrentVertexFormat() @@ -182,56 +181,61 @@ NativeVertexFormat* GetCurrentVertexFormat() } // namespace -void LoadCPReg(u32 sub_cmd, u32 value) +void LoadCPReg(u32 sub_cmd, u32 value, bool is_preprocess) { + bool update_global_state = !is_preprocess; + CPState* state = is_preprocess ? 
&g_preprocess_cp_state : &g_main_cp_state; switch (sub_cmd & 0xF0) { case 0x30: - VertexShaderManager::SetTexMatrixChangedA(value); + if (update_global_state) + VertexShaderManager::SetTexMatrixChangedA(value); break; case 0x40: - VertexShaderManager::SetTexMatrixChangedB(value); + if (update_global_state) + VertexShaderManager::SetTexMatrixChangedB(value); break; case 0x50: - g_main_cp_state.vtx_desc.Hex &= ~0x1FFFF; // keep the Upper bits - g_main_cp_state.vtx_desc.Hex |= value; - s_attr_dirty = 0xFF; + state->vtx_desc.Hex &= ~0x1FFFF; // keep the Upper bits + state->vtx_desc.Hex |= value; + state->attr_dirty = 0xFF; break; case 0x60: - g_main_cp_state.vtx_desc.Hex &= 0x1FFFF; // keep the lower 17Bits - g_main_cp_state.vtx_desc.Hex |= (u64)value << 17; - s_attr_dirty = 0xFF; + state->vtx_desc.Hex &= 0x1FFFF; // keep the lower 17Bits + state->vtx_desc.Hex |= (u64)value << 17; + state->attr_dirty = 0xFF; break; case 0x70: _assert_((sub_cmd & 0x0F) < 8); - g_main_cp_state.vtx_attr[sub_cmd & 7].g0.Hex = value; - s_attr_dirty |= 1 << (sub_cmd & 7); + state->vtx_attr[sub_cmd & 7].g0.Hex = value; + state->attr_dirty |= 1 << (sub_cmd & 7); break; case 0x80: _assert_((sub_cmd & 0x0F) < 8); - g_main_cp_state.vtx_attr[sub_cmd & 7].g1.Hex = value; - s_attr_dirty |= 1 << (sub_cmd & 7); + state->vtx_attr[sub_cmd & 7].g1.Hex = value; + state->attr_dirty |= 1 << (sub_cmd & 7); break; case 0x90: _assert_((sub_cmd & 0x0F) < 8); - g_main_cp_state.vtx_attr[sub_cmd & 7].g2.Hex = value; - s_attr_dirty |= 1 << (sub_cmd & 7); + state->vtx_attr[sub_cmd & 7].g2.Hex = value; + state->attr_dirty |= 1 << (sub_cmd & 7); break; // Pointers to vertex arrays in GC RAM case 0xA0: - g_main_cp_state.array_bases[sub_cmd & 0xF] = value; - cached_arraybases[sub_cmd & 0xF] = Memory::GetPointer(value); + state->array_bases[sub_cmd & 0xF] = value; + if (update_global_state) + cached_arraybases[sub_cmd & 0xF] = Memory::GetPointer(value); break; case 0xB0: - g_main_cp_state.array_strides[sub_cmd & 0xF] = 
value & 0xFF; + state->array_strides[sub_cmd & 0xF] = value & 0xFF; break; } } diff --git a/Source/Core/VideoCommon/VertexLoaderManager.h b/Source/Core/VideoCommon/VertexLoaderManager.h index 32a64b6cd8..8995ad2d7a 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.h +++ b/Source/Core/VideoCommon/VertexLoaderManager.h @@ -16,7 +16,7 @@ namespace VertexLoaderManager void MarkAllDirty(); - int GetVertexSize(int vtx_attr_group); + int GetVertexSize(int vtx_attr_group, bool preprocess); // Returns false if buf_size is insufficient. bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, bool skip_drawing = false); diff --git a/Source/Core/VideoCommon/VideoState.cpp b/Source/Core/VideoCommon/VideoState.cpp index 72232e2737..dd0eb2fb88 100644 --- a/Source/Core/VideoCommon/VideoState.cpp +++ b/Source/Core/VideoCommon/VideoState.cpp @@ -68,5 +68,6 @@ void VideoCommon_RunLoop(bool enable) void VideoCommon_Init() { memset(&g_main_cp_state, 0, sizeof(g_main_cp_state)); + memset(&g_preprocess_cp_state, 0, sizeof(g_preprocess_cp_state)); memset(texMem, 0, TMEM_SIZE); } From 0ae9e398c8e0f808eb4da6cc6f5e3cd553e975a1 Mon Sep 17 00:00:00 2001 From: comex Date: Tue, 26 Aug 2014 13:37:32 -0400 Subject: [PATCH 06/10] Rejigger some FIFO buffer variables to be more rational. videoBuffer -> s_video_buffer size -> s_video_buffer_write_ptr g_pVideoData -> g_video_buffer_read_ptr (impl moved to Fifo.cpp) This eradicates the wonderful use of 'size' as a global name, and makes it clear that s_video_buffer_write_ptr and g_video_buffer_read_ptr are the two ends of the FIFO buffer s_video_buffer. Oh, and remove a useless namespace {}. 
--- .../VideoBackends/Software/OpcodeDecoder.cpp | 12 ++-- .../Software/SWCommandProcessor.cpp | 8 +-- Source/Core/VideoCommon/DataReader.h | 18 +++--- Source/Core/VideoCommon/Fifo.cpp | 56 ++++++++++--------- Source/Core/VideoCommon/OpcodeDecoding.cpp | 43 +++++++------- .../VideoCommon/VertexLoaderTest.cpp | 2 +- 6 files changed, 70 insertions(+), 69 deletions(-) diff --git a/Source/Core/VideoBackends/Software/OpcodeDecoder.cpp b/Source/Core/VideoBackends/Software/OpcodeDecoder.cpp index 77deeed01f..66816e6626 100644 --- a/Source/Core/VideoBackends/Software/OpcodeDecoder.cpp +++ b/Source/Core/VideoBackends/Software/OpcodeDecoder.cpp @@ -57,7 +57,7 @@ static void DecodePrimitiveStream(u32 iBufferSize) { while (streamSize > 0 && iBufferSize >= vertexSize) { - g_pVideoData += vertexSize; + g_video_buffer_read_ptr += vertexSize; iBufferSize -= vertexSize; streamSize--; } @@ -94,26 +94,26 @@ static void ReadXFData(u32 iBufferSize) static void ExecuteDisplayList(u32 addr, u32 count) { - u8 *videoDataSave = g_pVideoData; + u8 *videoDataSave = g_video_buffer_read_ptr; u8 *dlStart = Memory::GetPointer(addr); - g_pVideoData = dlStart; + g_video_buffer_read_ptr = dlStart; while (OpcodeDecoder::CommandRunnable(count)) { OpcodeDecoder::Run(count); // if data was read by the opcode decoder then the video data pointer changed - u32 readCount = (u32)(g_pVideoData - dlStart); - dlStart = g_pVideoData; + u32 readCount = (u32)(g_video_buffer_read_ptr - dlStart); + dlStart = g_video_buffer_read_ptr; _assert_msg_(VIDEO, count >= readCount, "Display list underrun"); count -= readCount; } - g_pVideoData = videoDataSave; + g_video_buffer_read_ptr = videoDataSave; } static void DecodeStandard(u32 bufferSize) diff --git a/Source/Core/VideoBackends/Software/SWCommandProcessor.cpp b/Source/Core/VideoBackends/Software/SWCommandProcessor.cpp index 5f227d4b5c..56832eb786 100644 --- a/Source/Core/VideoBackends/Software/SWCommandProcessor.cpp +++ 
b/Source/Core/VideoBackends/Software/SWCommandProcessor.cpp @@ -57,7 +57,7 @@ void DoState(PointerWrap &p) p.Do(interruptWaiting); // Is this right? - p.DoArray(g_pVideoData,writePos); + p.DoArray(g_video_buffer_read_ptr,writePos); } static void UpdateInterrupts_Wrapper(u64 userdata, int cyclesLate) @@ -95,7 +95,7 @@ void Init() interruptSet = false; interruptWaiting = false; - g_pVideoData = nullptr; + g_video_buffer_read_ptr = nullptr; g_bSkipCurrentFrame = false; } @@ -311,7 +311,7 @@ bool RunBuffer() _dbg_assert_(COMMANDPROCESSOR, writePos >= readPos); - g_pVideoData = &commandBuffer[readPos]; + g_video_buffer_read_ptr = &commandBuffer[readPos]; u32 availableBytes = writePos - readPos; @@ -322,7 +322,7 @@ bool RunBuffer() OpcodeDecoder::Run(availableBytes); // if data was read by the opcode decoder then the video data pointer changed - readPos = (u32)(g_pVideoData - &commandBuffer[0]); + readPos = (u32)(g_video_buffer_read_ptr - &commandBuffer[0]); _dbg_assert_(VIDEO, writePos >= readPos); availableBytes = writePos - readPos; } diff --git a/Source/Core/VideoCommon/DataReader.h b/Source/Core/VideoCommon/DataReader.h index 85beec3a11..7f317b177c 100644 --- a/Source/Core/VideoCommon/DataReader.h +++ b/Source/Core/VideoCommon/DataReader.h @@ -6,7 +6,7 @@ #include "VideoCommon/VertexManagerBase.h" -extern u8* g_pVideoData; +extern u8* g_video_buffer_read_ptr; #if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__) #include @@ -14,20 +14,20 @@ extern u8* g_pVideoData; __forceinline void DataSkip(u32 skip) { - g_pVideoData += skip; + g_video_buffer_read_ptr += skip; } // probably unnecessary template __forceinline void DataSkip() { - g_pVideoData += count; + g_video_buffer_read_ptr += count; } template __forceinline T DataPeek(int _uOffset) { - auto const result = Common::FromBigEndian(*reinterpret_cast(g_pVideoData + _uOffset)); + auto const result = Common::FromBigEndian(*reinterpret_cast(g_video_buffer_read_ptr + _uOffset)); return result; } @@ -58,8 +58,8 
@@ __forceinline T DataRead() class DataReader { public: - inline DataReader() : buffer(g_pVideoData), offset(0) {} - inline ~DataReader() { g_pVideoData += offset; } + inline DataReader() : buffer(g_video_buffer_read_ptr), offset(0) {} + inline ~DataReader() { g_video_buffer_read_ptr += offset; } template inline T Read() { const T result = Common::FromBigEndian(*(T*)(buffer + offset)); @@ -94,14 +94,14 @@ __forceinline u32 DataReadU32() __forceinline u32 DataReadU32Unswapped() { - u32 tmp = *(u32*)g_pVideoData; - g_pVideoData += 4; + u32 tmp = *(u32*)g_video_buffer_read_ptr; + g_video_buffer_read_ptr += 4; return tmp; } __forceinline u8* DataGetPosition() { - return g_pVideoData; + return g_video_buffer_read_ptr; } template diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp index 4e1b52dfa1..e8ba7a2b22 100644 --- a/Source/Core/VideoCommon/Fifo.cpp +++ b/Source/Core/VideoCommon/Fifo.cpp @@ -22,21 +22,22 @@ bool g_bSkipCurrentFrame = false; -namespace -{ static volatile bool GpuRunningState = false; static volatile bool EmuRunningState = false; static std::mutex m_csHWVidOccupied; // STATE_TO_SAVE -static u8 *videoBuffer; -static int size = 0; -} // namespace +static u8* s_video_buffer; +static u8* s_video_buffer_write_ptr; + +// Note: during display list execution, temporarily points to the list instead +// of inside s_video_buffer. 
+u8* g_video_buffer_read_ptr; void Fifo_DoState(PointerWrap &p) { - p.DoArray(videoBuffer, FIFO_SIZE); - p.Do(size); - p.DoPointer(g_pVideoData, videoBuffer); + p.DoArray(s_video_buffer, FIFO_SIZE); + p.DoPointer(s_video_buffer_write_ptr, s_video_buffer); + p.DoPointer(g_video_buffer_read_ptr, s_video_buffer); p.Do(g_bSkipCurrentFrame); } @@ -61,8 +62,8 @@ void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock) void Fifo_Init() { - videoBuffer = (u8*)AllocateMemoryPages(FIFO_SIZE); - size = 0; + s_video_buffer = (u8*)AllocateMemoryPages(FIFO_SIZE); + s_video_buffer_write_ptr = s_video_buffer; GpuRunningState = false; Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); } @@ -70,18 +71,18 @@ void Fifo_Init() void Fifo_Shutdown() { if (GpuRunningState) PanicAlert("Fifo shutting down while active"); - FreeMemoryPages(videoBuffer, FIFO_SIZE); - videoBuffer = nullptr; + FreeMemoryPages(s_video_buffer, FIFO_SIZE); + s_video_buffer = nullptr; } u8* GetVideoBufferStartPtr() { - return videoBuffer; + return s_video_buffer; } u8* GetVideoBufferEndPtr() { - return &videoBuffer[size]; + return s_video_buffer_write_ptr; } void Fifo_SetRendering(bool enabled) @@ -111,26 +112,27 @@ void EmulatorState(bool running) // Description: RunGpuLoop() sends data through this function. 
void ReadDataFromFifo(u8* _uData, u32 len) { - if (size + len >= FIFO_SIZE) + if (len > (s_video_buffer + FIFO_SIZE - s_video_buffer_write_ptr)) { - int pos = (int)(g_pVideoData - videoBuffer); - size -= pos; - if (size + len > FIFO_SIZE) + size_t size = s_video_buffer_write_ptr - g_video_buffer_read_ptr; + if (len > FIFO_SIZE - size) { - PanicAlert("FIFO out of bounds (size = %i, len = %i at %08x)", size, len, pos); + PanicAlert("FIFO out of bounds (existing %lu + new %lu > %lu)", (unsigned long) size, (unsigned long) len, (unsigned long) FIFO_SIZE); + return; } - memmove(&videoBuffer[0], &videoBuffer[pos], size); - g_pVideoData = videoBuffer; + memmove(s_video_buffer, g_video_buffer_read_ptr, size); + s_video_buffer_write_ptr = s_video_buffer + size; + g_video_buffer_read_ptr = s_video_buffer; } - // Copy new video instructions to videoBuffer for future use in rendering the new picture - memcpy(videoBuffer + size, _uData, len); - size += len; + // Copy new video instructions to s_video_buffer for future use in rendering the new picture + memcpy(s_video_buffer_write_ptr, _uData, len); + s_video_buffer_write_ptr += len; } void ResetVideoBuffer() { - g_pVideoData = videoBuffer; - size = 0; + g_video_buffer_read_ptr = s_video_buffer; + s_video_buffer_write_ptr = s_video_buffer; } @@ -181,7 +183,7 @@ void RunGpuLoop() Common::AtomicStore(fifo.CPReadPointer, readPtr); Common::AtomicAdd(fifo.CPReadWriteDistance, -32); - if ((GetVideoBufferEndPtr() - g_pVideoData) == 0) + if ((GetVideoBufferEndPtr() - g_video_buffer_read_ptr) == 0) Common::AtomicStore(fifo.SafeCPReadPointer, fifo.CPReadPointer); } diff --git a/Source/Core/VideoCommon/OpcodeDecoding.cpp b/Source/Core/VideoCommon/OpcodeDecoding.cpp index e9a20a526a..fe70bcf492 100644 --- a/Source/Core/VideoCommon/OpcodeDecoding.cpp +++ b/Source/Core/VideoCommon/OpcodeDecoding.cpp @@ -31,12 +31,11 @@ #include "VideoCommon/XFMemory.h" -u8* g_pVideoData = nullptr; bool g_bRecordFifoData = false; static u32 
InterpretDisplayList(u32 address, u32 size) { - u8* old_pVideoData = g_pVideoData; + u8* old_pVideoData = g_video_buffer_read_ptr; u8* startAddress = Memory::GetPointer(address); u32 cycles = 0; @@ -44,12 +43,12 @@ static u32 InterpretDisplayList(u32 address, u32 size) // Avoid the crash if Memory::GetPointer failed .. if (startAddress != nullptr) { - g_pVideoData = startAddress; + g_video_buffer_read_ptr = startAddress; // temporarily swap dl and non-dl (small "hack" for the stats) Statistics::SwapDL(); - u8 *end = g_pVideoData + size; + u8 *end = g_video_buffer_read_ptr + size; cycles = OpcodeDecoder_Run(end); INCSTAT(stats.thisFrame.numDListsCalled); @@ -58,7 +57,7 @@ static u32 InterpretDisplayList(u32 address, u32 size) } // reset to the old pointer - g_pVideoData = old_pVideoData; + g_video_buffer_read_ptr = old_pVideoData; return cycles; } @@ -107,8 +106,8 @@ static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) static u32 Decode(u8* end) { - u8 *opcodeStart = g_pVideoData; - if (g_pVideoData == end) + u8 *opcodeStart = g_video_buffer_read_ptr; + if (g_video_buffer_read_ptr == end) return 0; u8 cmd_byte = DataReadU8(); @@ -121,7 +120,7 @@ static u32 Decode(u8* end) case GX_LOAD_CP_REG: //0x08 { - if (end - g_pVideoData < 1 + 4) + if (end - g_video_buffer_read_ptr < 1 + 4) return 0; cycles = 12; u8 sub_cmd = DataReadU8(); @@ -133,11 +132,11 @@ static u32 Decode(u8* end) case GX_LOAD_XF_REG: { - if (end - g_pVideoData < 4) + if (end - g_video_buffer_read_ptr < 4) return 0; u32 Cmd2 = DataReadU32(); int transfer_size = ((Cmd2 >> 16) & 15) + 1; - if ((size_t) (end - g_pVideoData) < transfer_size * sizeof(u32)) + if ((size_t) (end - g_video_buffer_read_ptr) < transfer_size * sizeof(u32)) return 0; cycles = 18 + 6 * transfer_size; u32 xf_address = Cmd2 & 0xFFFF; @@ -148,25 +147,25 @@ static u32 Decode(u8* end) break; case GX_LOAD_INDX_A: //used for position matrices - if (end - g_pVideoData < 4) + if (end - g_video_buffer_read_ptr < 4) return 0; 
cycles = 6; LoadIndexedXF(DataReadU32(), 0xC); break; case GX_LOAD_INDX_B: //used for normal matrices - if (end - g_pVideoData < 4) + if (end - g_video_buffer_read_ptr < 4) return 0; cycles = 6; LoadIndexedXF(DataReadU32(), 0xD); break; case GX_LOAD_INDX_C: //used for postmatrices - if (end - g_pVideoData < 4) + if (end - g_video_buffer_read_ptr < 4) return 0; cycles = 6; LoadIndexedXF(DataReadU32(), 0xE); break; case GX_LOAD_INDX_D: //used for lights - if (end - g_pVideoData < 4) + if (end - g_video_buffer_read_ptr < 4) return 0; cycles = 6; LoadIndexedXF(DataReadU32(), 0xF); @@ -174,7 +173,7 @@ static u32 Decode(u8* end) case GX_CMD_CALL_DL: { - if (end - g_pVideoData < 8) + if (end - g_video_buffer_read_ptr < 8) return 0; u32 address = DataReadU32(); u32 count = DataReadU32(); @@ -196,7 +195,7 @@ static u32 Decode(u8* end) // In skipped_frame case: We have to let BP writes through because they set // tokens and stuff. TODO: Call a much simplified LoadBPReg instead. { - if (end - g_pVideoData < 4) + if (end - g_video_buffer_read_ptr < 4) return 0; cycles = 12; u32 bp_cmd = DataReadU32(); @@ -211,7 +210,7 @@ static u32 Decode(u8* end) { cycles = 1600; // load vertices - if (end - g_pVideoData < 2) + if (end - g_video_buffer_read_ptr < 2) return 0; u16 numVertices = DataReadU16(); @@ -219,7 +218,7 @@ static u32 Decode(u8* end) cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, numVertices, - end - g_pVideoData, + end - g_video_buffer_read_ptr, g_bSkipCurrentFrame)) { return 0; @@ -235,14 +234,14 @@ static u32 Decode(u8* end) // Display lists get added directly into the FIFO stream if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) - FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_pVideoData - opcodeStart)); + FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_video_buffer_read_ptr - opcodeStart)); return cycles; } void OpcodeDecoder_Init() { - g_pVideoData = 
GetVideoBufferStartPtr(); + g_video_buffer_read_ptr = GetVideoBufferStartPtr(); } @@ -255,11 +254,11 @@ u32 OpcodeDecoder_Run(u8* end) u32 totalCycles = 0; while (true) { - u8* old = g_pVideoData; + u8* old = g_video_buffer_read_ptr; u32 cycles = Decode(end); if (cycles == 0) { - g_pVideoData = old; + g_video_buffer_read_ptr = old; break; } totalCycles += cycles; diff --git a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp index 1126cbd73a..576e9a8a53 100644 --- a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp +++ b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp @@ -74,7 +74,7 @@ protected: void ResetPointers() { - g_pVideoData = &input_memory[0]; + g_video_buffer_read_ptr = &input_memory[0]; VertexManager::s_pCurBufferPointer = &output_memory[0]; m_input_pos = m_output_pos = 0; } From 2d4b7c59005d516fc6d21ec48689c062b6ff6c16 Mon Sep 17 00:00:00 2001 From: comex Date: Tue, 26 Aug 2014 21:18:47 -0400 Subject: [PATCH 07/10] Make ReadDataFromFifo static. --- Source/Core/VideoCommon/Fifo.cpp | 2 +- Source/Core/VideoCommon/Fifo.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp index e8ba7a2b22..b6412c796c 100644 --- a/Source/Core/VideoCommon/Fifo.cpp +++ b/Source/Core/VideoCommon/Fifo.cpp @@ -110,7 +110,7 @@ void EmulatorState(bool running) // Description: RunGpuLoop() sends data through this function. 
-void ReadDataFromFifo(u8* _uData, u32 len) +static void ReadDataFromFifo(u8* _uData, u32 len) { if (len > (s_video_buffer + FIFO_SIZE - s_video_buffer_write_ptr)) { diff --git a/Source/Core/VideoCommon/Fifo.h b/Source/Core/VideoCommon/Fifo.h index 389336be0a..66399680c1 100644 --- a/Source/Core/VideoCommon/Fifo.h +++ b/Source/Core/VideoCommon/Fifo.h @@ -23,8 +23,6 @@ u8* GetVideoBufferEndPtr(); void Fifo_DoState(PointerWrap &f); void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock); -void ReadDataFromFifo(u8* _uData, u32 len); - void RunGpu(); void RunGpuLoop(); void ExitGpuLoop(); From 65af90669bd5f9e02bbaa994d51d5c83d147b868 Mon Sep 17 00:00:00 2001 From: comex Date: Wed, 27 Aug 2014 22:56:19 -0400 Subject: [PATCH 08/10] Add the 'desynced GPU thread' mode. It's a relatively big commit (less big with -w), but it's hard to test any of this separately... The basic problem is that in netplay or movies, the state of the CPU must be deterministic, including when the game receives notification that the GPU has processed FIFO data. Dual core mode notifies the game whenever the GPU thread actually gets around to doing the work, so it isn't deterministic. Single core mode is deterministic because it notifies the game 'instantly' (after processing the data synchronously), but it's too slow for many systems and games. My old dc-netplay branch worked as follows: everything worked as normal except the state of the CP registers was a lie, and the CPU thread only delivered results when idle detection triggered (waiting for the GPU if they weren't ready at that point). Usually, a game is idle iff all the work for the frame has been done, except for a small amount of work depending on the GPU result, so neither the CPU nor the GPU waiting on the other affected performance much.
However, it's possible that the game could be waiting for some earlier interrupt, and any of several games which, for whatever reason, never went into a detectable idle (even when I tried to improve the detection) would never receive results at all. (The current method should have better compatibility, but it also has slightly higher overhead and breaks some other things, so I want to reimplement this, hopefully with less impact on the code, in the future.) With this commit, the basic idea is that the CPU thread acts as if the work has been done instantly, like single core mode, but actually hands it off asynchronously to the GPU thread (after backing up some data that the game might change in memory before it's actually done). Since the work isn't done, any feedback from the GPU to the CPU, such as real XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is broken; but most games work with these options disabled, and there is no need to try to detect what the CPU thread is doing. Technically: when the flag g_use_deterministic_gpu_thread (currently stuck on) is on, the CPU thread calls RunGpu like in single core mode. This function synchronously copies the data from the FIFO to the internal video buffer and updates the CP registers, interrupts, etc. However, instead of the regular ReadDataFromFifo followed by running the opcode decoder, it runs ReadDataFromFifoOnCPU -> OpcodeDecoder_Preprocess, which relatively quickly scans through the FIFO data, detects SetFinish calls etc., which are immediately fired, and saves certain associated data from memory (e.g. display lists) in AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at the moment), before handing the data off to the GPU thread to actually render. That makes up the bulk of this commit. In various circumstances, including the aforementioned EFB pokes and performance queries as well as swap requests (i.e. 
the end of a frame - we don't want the CPU potentially pumping out frames too quickly and the GPU falling behind*), SyncGPU is called to wait for actual completion. The overhead mainly comes from OpcodeDecoder_Preprocess (which is, again, synchronous), as well as the actual copying. Currently, display lists and such are escrowed from main memory even though they usually won't change over the course of a frame, and textures are not even though they might, resulting in a small chance of graphical glitches. When the texture locking (i.e. fault on write) code lands, I can make this all correct and maybe a little faster. * This suggests an alternate determinism method of just delaying results until a short time before the end of each frame. For all I know this might mostly work - I haven't tried it - but if any significant work hinges on the completion of render to texture etc., the frame will be missed. --- Source/Core/VideoCommon/BPMemory.h | 1 + Source/Core/VideoCommon/BPStructs.cpp | 29 +- Source/Core/VideoCommon/BPStructs.h | 1 - Source/Core/VideoCommon/CommandProcessor.cpp | 44 +-- Source/Core/VideoCommon/CommandProcessor.h | 1 + Source/Core/VideoCommon/DataReader.h | 10 +- Source/Core/VideoCommon/Fifo.cpp | 265 +++++++++++++++---- Source/Core/VideoCommon/Fifo.h | 21 ++ Source/Core/VideoCommon/MainBase.cpp | 5 + Source/Core/VideoCommon/OpcodeDecoding.cpp | 178 +++++++++---- Source/Core/VideoCommon/OpcodeDecoding.h | 1 + Source/Core/VideoCommon/XFMemory.h | 1 + Source/Core/VideoCommon/XFStructs.cpp | 22 +- 13 files changed, 444 insertions(+), 135 deletions(-) diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h index 0d50f3ac51..346af479f2 100644 --- a/Source/Core/VideoCommon/BPMemory.h +++ b/Source/Core/VideoCommon/BPMemory.h @@ -1085,5 +1085,6 @@ struct BPMemory extern BPMemory bpmem; void LoadBPReg(u32 value0); +void LoadBPRegPreprocess(u32 value0); void GetBPRegInfo(const u8* data, std::string* name, std::string* desc); diff --git
a/Source/Core/VideoCommon/BPStructs.cpp b/Source/Core/VideoCommon/BPStructs.cpp index 308badae6e..152d15a7d5 100644 --- a/Source/Core/VideoCommon/BPStructs.cpp +++ b/Source/Core/VideoCommon/BPStructs.cpp @@ -173,7 +173,8 @@ static void BPWritten(const BPCmd& bp) switch (bp.newvalue & 0xFF) { case 0x02: - PixelEngine::SetFinish(); // may generate interrupt + if (!g_use_deterministic_gpu_thread) + PixelEngine::SetFinish(); // may generate interrupt DEBUG_LOG(VIDEO, "GXSetDrawDone SetPEFinish (value: 0x%02X)", (bp.newvalue & 0xFFFF)); return; @@ -183,11 +184,13 @@ static void BPWritten(const BPCmd& bp) } return; case BPMEM_PE_TOKEN_ID: // Pixel Engine Token ID - PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), false); + if (!g_use_deterministic_gpu_thread) + PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), false); DEBUG_LOG(VIDEO, "SetPEToken 0x%04x", (bp.newvalue & 0xFFFF)); return; case BPMEM_PE_TOKEN_INT_ID: // Pixel Engine Interrupt Token ID - PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), true); + if (!g_use_deterministic_gpu_thread) + PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), true); DEBUG_LOG(VIDEO, "SetPEToken + INT 0x%04x", (bp.newvalue & 0xFFFF)); return; @@ -685,6 +688,26 @@ void LoadBPReg(u32 value0) BPWritten(bp); } +void LoadBPRegPreprocess(u32 value0) +{ + int regNum = value0 >> 24; + // masking could hypothetically be a problem + u32 newval = value0 & 0xffffff; + switch (regNum) + { + case BPMEM_SETDRAWDONE: + if ((newval & 0xff) == 0x02) + PixelEngine::SetFinish(); + break; + case BPMEM_PE_TOKEN_ID: + PixelEngine::SetToken(newval & 0xffff, false); + break; + case BPMEM_PE_TOKEN_INT_ID: // Pixel Engine Interrupt Token ID + PixelEngine::SetToken(newval & 0xffff, true); + break; + } +} + void GetBPRegInfo(const u8* data, std::string* name, std::string* desc) { const char* no_yes[2] = { "No", "Yes" }; diff --git a/Source/Core/VideoCommon/BPStructs.h b/Source/Core/VideoCommon/BPStructs.h index 2a99443346..a1dc48b821 
100644 --- a/Source/Core/VideoCommon/BPStructs.h +++ b/Source/Core/VideoCommon/BPStructs.h @@ -7,5 +7,4 @@ #include "VideoCommon/BPMemory.h" void BPInit(); -void LoadBPReg(u32 value0); void BPReload(); diff --git a/Source/Core/VideoCommon/CommandProcessor.cpp b/Source/Core/VideoCommon/CommandProcessor.cpp index aa9f8c4f28..6f8997cc58 100644 --- a/Source/Core/VideoCommon/CommandProcessor.cpp +++ b/Source/Core/VideoCommon/CommandProcessor.cpp @@ -77,7 +77,7 @@ void DoState(PointerWrap &p) p.Do(interruptFinishWaiting); } -UNUSED static inline void WriteLow(volatile u32& _reg, u16 lowbits) +static inline void WriteLow(volatile u32& _reg, u16 lowbits) { Common::AtomicStore(_reg, (_reg & 0xFFFF0000) | lowbits); } @@ -159,9 +159,8 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) { FIFO_WRITE_POINTER_LO, MMIO::Utils::LowPart(&fifo.CPWritePointer), false, true }, { FIFO_WRITE_POINTER_HI, MMIO::Utils::HighPart(&fifo.CPWritePointer) }, // FIFO_READ_POINTER has different code for single/dual core. - { FIFO_BP_LO, MMIO::Utils::LowPart(&fifo.CPBreakpoint), false, true }, - { FIFO_BP_HI, MMIO::Utils::HighPart(&fifo.CPBreakpoint) }, }; + for (auto& mapped_var : directly_mapped_vars) { u16 wmask = mapped_var.writes_align_to_32_bytes ? 0xFFE0 : 0xFFFF; @@ -173,6 +172,19 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) ); } + mmio->Register(base | FIFO_BP_LO, + MMIO::DirectRead(MMIO::Utils::LowPart(&fifo.CPBreakpoint)), + MMIO::ComplexWrite([](u32, u16 val) { + WriteLow(fifo.CPBreakpoint, val & 0xffe0); + }) + ); + mmio->Register(base | FIFO_BP_HI, + MMIO::DirectRead(MMIO::Utils::HighPart(&fifo.CPBreakpoint)), + MMIO::ComplexWrite([](u32, u16 val) { + WriteHigh(fifo.CPBreakpoint, val); + }) + ); + // Timing and metrics MMIOs are stubbed with fixed values. 
struct { u32 addr; @@ -216,8 +228,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) UCPCtrlReg tmp(val); m_CPCtrlReg.Hex = tmp.Hex; SetCpControlRegister(); - if (!IsOnThread()) - RunGpu(); + RunGpu(); }) ); @@ -227,8 +238,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) UCPClearReg tmp(val); m_CPClearReg.Hex = tmp.Hex; SetCpClearRegister(); - if (!IsOnThread()) - RunGpu(); + RunGpu(); }) ); @@ -260,6 +270,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) : MMIO::DirectRead(MMIO::Utils::HighPart(&fifo.CPReadWriteDistance)), MMIO::ComplexWrite([](u32, u16 val) { WriteHigh(fifo.CPReadWriteDistance, val); + SyncGPU(SYNC_GPU_OTHER); if (fifo.CPReadWriteDistance == 0) { GPFifo::ResetGatherPipe(); @@ -269,8 +280,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) { ResetVideoBuffer(); } - if (!IsOnThread()) - RunGpu(); + RunGpu(); }) ); mmio->Register(base | FIFO_READ_POINTER_LO, @@ -298,11 +308,7 @@ void STACKALIGN GatherPipeBursted() // if we aren't linked, we don't care about gather pipe data if (!m_CPCtrlReg.GPLinkEnable) { - if (!IsOnThread()) - { - RunGpu(); - } - else + if (IsOnThread() && !g_use_deterministic_gpu_thread) { // In multibuffer mode is not allowed write in the same FIFO attached to the GPU. // Fix Pokemon XD in DC mode. 
@@ -313,6 +319,10 @@ void STACKALIGN GatherPipeBursted() ProcessFifoAllDistance(); } } + else + { + RunGpu(); + } return; } @@ -327,8 +337,7 @@ void STACKALIGN GatherPipeBursted() Common::AtomicAdd(fifo.CPReadWriteDistance, GATHER_PIPE_SIZE); - if (!IsOnThread()) - RunGpu(); + RunGpu(); _assert_msg_(COMMANDPROCESSOR, fifo.CPReadWriteDistance <= fifo.CPEnd - fifo.CPBase, "FIFO is overflowed by GatherPipe !\nCPU thread is too fast!"); @@ -358,7 +367,8 @@ void UpdateInterrupts(u64 userdata) void UpdateInterruptsFromVideoBackend(u64 userdata) { - CoreTiming::ScheduleEvent_Threadsafe(0, et_UpdateInterrupts, userdata); + if (!g_use_deterministic_gpu_thread) + CoreTiming::ScheduleEvent_Threadsafe(0, et_UpdateInterrupts, userdata); } void SetCPStatusFromGPU() diff --git a/Source/Core/VideoCommon/CommandProcessor.h b/Source/Core/VideoCommon/CommandProcessor.h index b29816e8fe..0dad1578af 100644 --- a/Source/Core/VideoCommon/CommandProcessor.h +++ b/Source/Core/VideoCommon/CommandProcessor.h @@ -16,6 +16,7 @@ namespace CommandProcessor { extern SCPFifoStruct fifo; //This one is shared between gfx thread and emulator thread. + extern volatile bool isPossibleWaitingSetDrawDone; //This one is used for sync gfx thread and emulator thread. 
extern volatile bool interruptSet; extern volatile bool interruptWaiting; diff --git a/Source/Core/VideoCommon/DataReader.h b/Source/Core/VideoCommon/DataReader.h index 7f317b177c..fcb89d36e4 100644 --- a/Source/Core/VideoCommon/DataReader.h +++ b/Source/Core/VideoCommon/DataReader.h @@ -25,9 +25,9 @@ __forceinline void DataSkip() } template -__forceinline T DataPeek(int _uOffset) +__forceinline T DataPeek(int _uOffset, u8** bufp = &g_video_buffer_read_ptr) { - auto const result = Common::FromBigEndian(*reinterpret_cast(g_video_buffer_read_ptr + _uOffset)); + auto const result = Common::FromBigEndian(*reinterpret_cast(*bufp + _uOffset)); return result; } @@ -48,10 +48,10 @@ __forceinline u32 DataPeek32(int _uOffset) } template -__forceinline T DataRead() +__forceinline T DataRead(u8** bufp = &g_video_buffer_read_ptr) { - auto const result = DataPeek(0); - DataSkip(); + auto const result = DataPeek(0, bufp); + *bufp += sizeof(T); return result; } diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp index b6412c796c..87764ec85c 100644 --- a/Source/Core/VideoCommon/Fifo.cpp +++ b/Source/Core/VideoCommon/Fifo.cpp @@ -25,19 +25,46 @@ bool g_bSkipCurrentFrame = false; static volatile bool GpuRunningState = false; static volatile bool EmuRunningState = false; static std::mutex m_csHWVidOccupied; -// STATE_TO_SAVE -static u8* s_video_buffer; -static u8* s_video_buffer_write_ptr; -// Note: during display list execution, temporarily points to the list instead -// of inside s_video_buffer. +// Most of this array is unlikely to be faulted in... 
+static u8 s_fifo_aux_data[FIFO_SIZE]; +static u8* s_fifo_aux_write_ptr; +static u8* s_fifo_aux_read_ptr; + +bool g_use_deterministic_gpu_thread = true; // XXX + +// STATE_TO_SAVE +static std::mutex s_video_buffer_lock; +static std::condition_variable s_video_buffer_cond; +static u8* s_video_buffer; u8* g_video_buffer_read_ptr; +static std::atomic s_video_buffer_write_ptr; +static std::atomic s_video_buffer_seen_ptr; +u8* g_video_buffer_pp_read_ptr; +// The read_ptr is always owned by the GPU thread. In normal mode, so is the +// write_ptr, despite it being atomic. In g_use_deterministic_gpu_thread mode, +// things get a bit more complicated: +// - The seen_ptr is written by the GPU thread, and points to what it's already +// processed as much of as possible - in the case of a partial command which +// caused it to stop, not the same as the read ptr. It's written by the GPU, +// under the lock, and updating the cond. +// - The write_ptr is written by the CPU thread after it copies data from the +// FIFO. Maybe someday it will be under the lock. For now, because RunGpuLoop +// polls, it's just atomic. +// - The pp_read_ptr is the CPU preprocessing version of the read_ptr. void Fifo_DoState(PointerWrap &p) { p.DoArray(s_video_buffer, FIFO_SIZE); - p.DoPointer(s_video_buffer_write_ptr, s_video_buffer); + u8* write_ptr = s_video_buffer_write_ptr; + p.DoPointer(write_ptr, s_video_buffer); + s_video_buffer_write_ptr = write_ptr; p.DoPointer(g_video_buffer_read_ptr, s_video_buffer); + if (p.mode == PointerWrap::MODE_READ && g_use_deterministic_gpu_thread) + { + // We're good and paused, right? 
+ s_video_buffer_seen_ptr = g_video_buffer_pp_read_ptr = g_video_buffer_read_ptr; + } p.Do(g_bSkipCurrentFrame); } @@ -45,6 +72,7 @@ void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock) { if (doLock) { + SyncGPU(SYNC_GPU_OTHER); EmulatorState(false); if (!Core::IsGPUThread()) m_csHWVidOccupied.lock(); @@ -63,7 +91,7 @@ void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock) void Fifo_Init() { s_video_buffer = (u8*)AllocateMemoryPages(FIFO_SIZE); - s_video_buffer_write_ptr = s_video_buffer; + ResetVideoBuffer(); GpuRunningState = false; Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); } @@ -73,6 +101,12 @@ void Fifo_Shutdown() if (GpuRunningState) PanicAlert("Fifo shutting down while active"); FreeMemoryPages(s_video_buffer, FIFO_SIZE); s_video_buffer = nullptr; + s_video_buffer_write_ptr = nullptr; + g_video_buffer_pp_read_ptr = nullptr; + g_video_buffer_read_ptr = nullptr; + s_video_buffer_seen_ptr = nullptr; + s_fifo_aux_write_ptr = nullptr; + s_fifo_aux_read_ptr = nullptr; } u8* GetVideoBufferStartPtr() @@ -108,6 +142,66 @@ void EmulatorState(bool running) EmuRunningState = running; } +void SyncGPU(SyncGPUReason reason, bool may_move_read_ptr) +{ + if (g_use_deterministic_gpu_thread && GpuRunningState) + { + std::unique_lock lk(s_video_buffer_lock); + u8* write_ptr = s_video_buffer_write_ptr; + s_video_buffer_cond.wait(lk, [&]() { + return !GpuRunningState || s_video_buffer_seen_ptr == write_ptr; + }); + if (!GpuRunningState) + return; + + // Opportunistically reset FIFOs so we don't wrap around. 
+ if (may_move_read_ptr && s_fifo_aux_write_ptr != s_fifo_aux_read_ptr) + PanicAlert("aux fifo not synced (%p, %p)", s_fifo_aux_write_ptr, s_fifo_aux_read_ptr); + + memmove(s_fifo_aux_data, s_fifo_aux_read_ptr, s_fifo_aux_write_ptr - s_fifo_aux_read_ptr); + s_fifo_aux_write_ptr -= (s_fifo_aux_read_ptr - s_fifo_aux_data); + s_fifo_aux_read_ptr = s_fifo_aux_data; + + if (may_move_read_ptr) + { + // what's left over in the buffer + size_t size = write_ptr - g_video_buffer_pp_read_ptr; + + memmove(s_video_buffer, g_video_buffer_pp_read_ptr, size); + // This change always decreases the pointers. We write seen_ptr + // after write_ptr here, and read it before in RunGpuLoop, so + // 'write_ptr > seen_ptr' there cannot become spuriously true. + s_video_buffer_write_ptr = write_ptr = s_video_buffer + size; + g_video_buffer_pp_read_ptr = s_video_buffer; + g_video_buffer_read_ptr = s_video_buffer; + s_video_buffer_seen_ptr = write_ptr; + } + } +} + +void PushFifoAuxBuffer(void* ptr, size_t size) +{ + if (size > (size_t) (s_fifo_aux_data + FIFO_SIZE - s_fifo_aux_write_ptr)) + { + SyncGPU(SYNC_GPU_AUX_SPACE, /* may_move_read_ptr */ false); + if (size > (size_t) (s_fifo_aux_data + FIFO_SIZE - s_fifo_aux_write_ptr)) + { + // That will sync us up to the last 32 bytes, so this short region + // of FIFO would have to point to a 2MB display list or something. + PanicAlert("absurdly large aux buffer"); + return; + } + } + memcpy(s_fifo_aux_write_ptr, ptr, size); + s_fifo_aux_write_ptr += size; +} + +void* PopFifoAuxBuffer(size_t size) +{ + void* ret = s_fifo_aux_read_ptr; + s_fifo_aux_read_ptr += size; + return ret; +} // Description: RunGpuLoop() sends data through this function. static void ReadDataFromFifo(u8* _uData, u32 len) @@ -129,10 +223,42 @@ static void ReadDataFromFifo(u8* _uData, u32 len) s_video_buffer_write_ptr += len; } +// The deterministic_gpu_thread version. 
+static void ReadDataFromFifoOnCPU(u8* _uData, u32 len) +{ + u8 *write_ptr = s_video_buffer_write_ptr; + if (len > (s_video_buffer + FIFO_SIZE - write_ptr)) + { + // We can't wrap around while the GPU is working on the data. + // This should be very rare due to the reset in SyncGPU. + SyncGPU(SYNC_GPU_WRAPAROUND); + if (g_video_buffer_pp_read_ptr != g_video_buffer_read_ptr) + { + PanicAlert("desynced read pointers"); + return; + } + write_ptr = s_video_buffer_write_ptr; + size_t size = write_ptr - g_video_buffer_pp_read_ptr; + if (len > FIFO_SIZE - size) + { + PanicAlert("FIFO out of bounds (existing %lu + new %lu > %lu)", (unsigned long) size, (unsigned long) len, (unsigned long) FIFO_SIZE); + return; + } + } + memcpy(write_ptr, _uData, len); + OpcodeDecoder_Preprocess(write_ptr + len); + // This would have to be locked if the GPU thread didn't spin. + s_video_buffer_write_ptr = write_ptr + len; +} + void ResetVideoBuffer() { g_video_buffer_read_ptr = s_video_buffer; s_video_buffer_write_ptr = s_video_buffer; + s_video_buffer_seen_ptr = s_video_buffer; + g_video_buffer_pp_read_ptr = s_video_buffer; + s_fifo_aux_write_ptr = s_fifo_aux_data; + s_fifo_aux_read_ptr = s_fifo_aux_data; } @@ -150,53 +276,75 @@ void RunGpuLoop() g_video_backend->PeekMessages(); VideoFifo_CheckAsyncRequest(); - - CommandProcessor::SetCPStatusFromGPU(); - - Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); - - // check if we are able to run this buffer - while (GpuRunningState && EmuRunningState && !CommandProcessor::interruptWaiting && fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint()) + if (g_use_deterministic_gpu_thread) { - fifo.isGpuReadingData = true; - CommandProcessor::isPossibleWaitingSetDrawDone = fifo.bFF_GPLinkEnable ? true : false; - - if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU || Common::AtomicLoad(CommandProcessor::VITicks) > CommandProcessor::m_cpClockOrigin) + // All the fifo/CP stuff is on the CPU. 
We just need to run the opcode decoder. + u8* seen_ptr = s_video_buffer_seen_ptr; + u8* write_ptr = s_video_buffer_write_ptr; + // See comment in SyncGPU + if (write_ptr > seen_ptr) { - u32 readPtr = fifo.CPReadPointer; - u8 *uData = Memory::GetPointer(readPtr); + OpcodeDecoder_Run(write_ptr); - if (readPtr == fifo.CPEnd) - readPtr = fifo.CPBase; - else - readPtr += 32; - - _assert_msg_(COMMANDPROCESSOR, (s32)fifo.CPReadWriteDistance - 32 >= 0 , - "Negative fifo.CPReadWriteDistance = %i in FIFO Loop !\nThat can produce instability in the game. Please report it.", fifo.CPReadWriteDistance - 32); - - ReadDataFromFifo(uData, 32); - - cyclesExecuted = OpcodeDecoder_Run(GetVideoBufferEndPtr()); - - if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) >= cyclesExecuted) - Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted); - - Common::AtomicStore(fifo.CPReadPointer, readPtr); - Common::AtomicAdd(fifo.CPReadWriteDistance, -32); - if ((GetVideoBufferEndPtr() - g_video_buffer_read_ptr) == 0) - Common::AtomicStore(fifo.SafeCPReadPointer, fifo.CPReadPointer); + { + std::lock_guard vblk(s_video_buffer_lock); + s_video_buffer_seen_ptr = write_ptr; + s_video_buffer_cond.notify_all(); + } } - + } + else + { CommandProcessor::SetCPStatusFromGPU(); - // This call is pretty important in DualCore mode and must be called in the FIFO Loop. - // If we don't, s_swapRequested or s_efbAccessRequested won't be set to false - // leading the CPU thread to wait in Video_BeginField or Video_AccessEFB thus slowing things down. 
- VideoFifo_CheckAsyncRequest(); - CommandProcessor::isPossibleWaitingSetDrawDone = false; - } + Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); - fifo.isGpuReadingData = false; + // check if we are able to run this buffer + while (GpuRunningState && EmuRunningState && !CommandProcessor::interruptWaiting && fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint()) + { + fifo.isGpuReadingData = true; + CommandProcessor::isPossibleWaitingSetDrawDone = fifo.bFF_GPLinkEnable ? true : false; + + if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU || Common::AtomicLoad(CommandProcessor::VITicks) > CommandProcessor::m_cpClockOrigin) + { + u32 readPtr = fifo.CPReadPointer; + u8 *uData = Memory::GetPointer(readPtr); + + if (readPtr == fifo.CPEnd) + readPtr = fifo.CPBase; + else + readPtr += 32; + + _assert_msg_(COMMANDPROCESSOR, (s32)fifo.CPReadWriteDistance - 32 >= 0 , + "Negative fifo.CPReadWriteDistance = %i in FIFO Loop !\nThat can produce instability in the game. Please report it.", fifo.CPReadWriteDistance - 32); + + ReadDataFromFifo(uData, 32); + + u8* write_ptr = s_video_buffer_write_ptr; + + cyclesExecuted = OpcodeDecoder_Run(write_ptr); + + + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) >= cyclesExecuted) + Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted); + + Common::AtomicStore(fifo.CPReadPointer, readPtr); + Common::AtomicAdd(fifo.CPReadWriteDistance, -32); + if ((write_ptr - g_video_buffer_read_ptr) == 0) + Common::AtomicStore(fifo.SafeCPReadPointer, fifo.CPReadPointer); + } + + CommandProcessor::SetCPStatusFromGPU(); + + // This call is pretty important in DualCore mode and must be called in the FIFO Loop. + // If we don't, s_swapRequested or s_efbAccessRequested won't be set to false + // leading the CPU thread to wait in Video_BeginField or Video_AccessEFB thus slowing things down. 
+ VideoFifo_CheckAsyncRequest(); + CommandProcessor::isPossibleWaitingSetDrawDone = false; + } + + fifo.isGpuReadingData = false; + } if (EmuRunningState) { @@ -219,6 +367,8 @@ void RunGpuLoop() } } } + // wake up SyncGPU if we were interrupted + s_video_buffer_cond.notify_all(); } @@ -230,16 +380,27 @@ bool AtBreakpoint() void RunGpu() { + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread && + !g_use_deterministic_gpu_thread) + return; + SCPFifoStruct &fifo = CommandProcessor::fifo; while (fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint() ) { u8 *uData = Memory::GetPointer(fifo.CPReadPointer); - FPURoundMode::SaveSIMDState(); - FPURoundMode::LoadDefaultSIMDState(); - ReadDataFromFifo(uData, 32); - OpcodeDecoder_Run(GetVideoBufferEndPtr()); - FPURoundMode::LoadSIMDState(); + if (g_use_deterministic_gpu_thread) + { + ReadDataFromFifoOnCPU(uData, 32); + } + else + { + FPURoundMode::SaveSIMDState(); + FPURoundMode::LoadDefaultSIMDState(); + ReadDataFromFifo(uData, 32); + OpcodeDecoder_Run(s_video_buffer_write_ptr); + FPURoundMode::LoadSIMDState(); + } //DEBUG_LOG(COMMANDPROCESSOR, "Fifo wraps to base"); diff --git a/Source/Core/VideoCommon/Fifo.h b/Source/Core/VideoCommon/Fifo.h index 66399680c1..175d6b6e4a 100644 --- a/Source/Core/VideoCommon/Fifo.h +++ b/Source/Core/VideoCommon/Fifo.h @@ -13,6 +13,11 @@ class PointerWrap; extern bool g_bSkipCurrentFrame; +// This could be in SCoreStartupParameter, but it depends on multiple settings +// and can change at runtime. +extern bool g_use_deterministic_gpu_thread; +extern std::atomic g_video_buffer_write_ptr_xthread; +extern u8* g_video_buffer_pp_read_ptr; void Fifo_Init(); void Fifo_Shutdown(); @@ -23,6 +28,22 @@ u8* GetVideoBufferEndPtr(); void Fifo_DoState(PointerWrap &f); void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock); +// Used for diagnostics. 
+enum SyncGPUReason { + SYNC_GPU_NONE, + SYNC_GPU_OTHER, + SYNC_GPU_WRAPAROUND, + SYNC_GPU_EFB_POKE, + SYNC_GPU_PERFQUERY, + SYNC_GPU_SWAP, + SYNC_GPU_AUX_SPACE, +}; +// In g_use_deterministic_gpu_thread mode, waits for the GPU to be done with pending work. +void SyncGPU(SyncGPUReason reason, bool may_move_read_ptr = true); + +void PushFifoAuxBuffer(void* ptr, size_t size); +void* PopFifoAuxBuffer(size_t size); + void RunGpu(); void RunGpuLoop(); void ExitGpuLoop(); diff --git a/Source/Core/VideoCommon/MainBase.cpp b/Source/Core/VideoCommon/MainBase.cpp index 99fac60eae..082c81c2ea 100644 --- a/Source/Core/VideoCommon/MainBase.cpp +++ b/Source/Core/VideoCommon/MainBase.cpp @@ -118,6 +118,7 @@ void VideoBackendHardware::Video_EndField() { if (s_BackendInitialized) { + SyncGPU(SYNC_GPU_SWAP); s_swapRequested.Set(); } } @@ -153,6 +154,8 @@ u32 VideoBackendHardware::Video_AccessEFB(EFBAccessType type, u32 x, u32 y, u32 { if (s_BackendInitialized && g_ActiveConfig.bEFBAccessEnable) { + SyncGPU(SYNC_GPU_EFB_POKE); + s_accessEFBArgs.type = type; s_accessEFBArgs.x = x; s_accessEFBArgs.y = y; @@ -194,6 +197,8 @@ u32 VideoBackendHardware::Video_GetQueryResult(PerfQueryType type) return 0; } + SyncGPU(SYNC_GPU_PERFQUERY); + // TODO: Is this check sane? 
if (!g_perf_query->IsFlushed()) { diff --git a/Source/Core/VideoCommon/OpcodeDecoding.cpp b/Source/Core/VideoCommon/OpcodeDecoding.cpp index fe70bcf492..1bb5fae940 100644 --- a/Source/Core/VideoCommon/OpcodeDecoding.cpp +++ b/Source/Core/VideoCommon/OpcodeDecoding.cpp @@ -24,6 +24,7 @@ #include "VideoCommon/DataReader.h" #include "VideoCommon/Fifo.h" #include "VideoCommon/OpcodeDecoding.h" +#include "VideoCommon/PixelEngine.h" #include "VideoCommon/Statistics.h" #include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VideoCommon.h" @@ -36,7 +37,12 @@ bool g_bRecordFifoData = false; static u32 InterpretDisplayList(u32 address, u32 size) { u8* old_pVideoData = g_video_buffer_read_ptr; - u8* startAddress = Memory::GetPointer(address); + u8* startAddress; + + if (g_use_deterministic_gpu_thread) + startAddress = (u8*) PopFifoAuxBuffer(size); + else + startAddress = Memory::GetPointer(address); u32 cycles = 0; @@ -62,11 +68,29 @@ static u32 InterpretDisplayList(u32 address, u32 size) return cycles; } +static void InterpretDisplayListPreprocess(u32 address, u32 size) +{ + u8* old_read_ptr = g_video_buffer_pp_read_ptr; + u8* startAddress = Memory::GetPointer(address); + + PushFifoAuxBuffer(startAddress, size); + + if (startAddress != nullptr) + { + g_video_buffer_pp_read_ptr = startAddress; + + u8 *end = startAddress + size; + OpcodeDecoder_Preprocess(end); + } + + g_video_buffer_pp_read_ptr = old_read_ptr; +} + static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) { // TODO(Omega): Maybe dump FIFO to file on this error std::string temp = StringFromFormat( - "GFX FIFO: Unknown Opcode (0x%x @ %p).\n" + "GFX FIFO: Unknown Opcode (0x%x @ %p, preprocessing=%s).\n" "This means one of the following:\n" "* The emulated GPU got desynced, disabling dual core can help\n" "* Command stream corrupted by some spurious memory bug\n" @@ -74,7 +98,8 @@ static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) "* Some other sort of bug\n\n" "Dolphin 
will now likely crash or hang. Enjoy." , cmd_byte, - buffer); + buffer, + preprocess ? "yes" : "no"); Host_SysMessage(temp.c_str()); INFO_LOG(VIDEO, "%s", temp.c_str()); { @@ -104,14 +129,16 @@ static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) } } +template static u32 Decode(u8* end) { - u8 *opcodeStart = g_video_buffer_read_ptr; - if (g_video_buffer_read_ptr == end) + u8 *opcodeStart = *bufp; + if (*bufp == end) return 0; - u8 cmd_byte = DataReadU8(); + u8 cmd_byte = DataRead(bufp); u32 cycles; + int refarray; switch (cmd_byte) { case GX_NOP: @@ -120,64 +147,72 @@ static u32 Decode(u8* end) case GX_LOAD_CP_REG: //0x08 { - if (end - g_video_buffer_read_ptr < 1 + 4) + if (end - *bufp < 1 + 4) return 0; cycles = 12; - u8 sub_cmd = DataReadU8(); - u32 value = DataReadU32(); - LoadCPReg(sub_cmd, value); - INCSTAT(stats.thisFrame.numCPLoads); + u8 sub_cmd = DataRead(bufp); + u32 value = DataRead(bufp); + LoadCPReg(sub_cmd, value, is_preprocess); + if (!is_preprocess) + INCSTAT(stats.thisFrame.numCPLoads); } break; case GX_LOAD_XF_REG: { - if (end - g_video_buffer_read_ptr < 4) + if (end - *bufp < 4) return 0; - u32 Cmd2 = DataReadU32(); + u32 Cmd2 = DataRead(bufp); int transfer_size = ((Cmd2 >> 16) & 15) + 1; - if ((size_t) (end - g_video_buffer_read_ptr) < transfer_size * sizeof(u32)) + if ((size_t) (end - *bufp) < transfer_size * sizeof(u32)) return 0; cycles = 18 + 6 * transfer_size; - u32 xf_address = Cmd2 & 0xFFFF; - LoadXFReg(transfer_size, xf_address); + if (!is_preprocess) + { + u32 xf_address = Cmd2 & 0xFFFF; + LoadXFReg(transfer_size, xf_address); - INCSTAT(stats.thisFrame.numXFLoads); + INCSTAT(stats.thisFrame.numXFLoads); + } + else + { + *bufp += transfer_size * sizeof(u32); + } } break; case GX_LOAD_INDX_A: //used for position matrices - if (end - g_video_buffer_read_ptr < 4) - return 0; - cycles = 6; - LoadIndexedXF(DataReadU32(), 0xC); - break; + refarray = 0xC; + goto load_indx; case GX_LOAD_INDX_B: //used for normal matrices - if 
(end - g_video_buffer_read_ptr < 4) - return 0; - cycles = 6; - LoadIndexedXF(DataReadU32(), 0xD); - break; + refarray = 0xD; + goto load_indx; case GX_LOAD_INDX_C: //used for postmatrices - if (end - g_video_buffer_read_ptr < 4) - return 0; - cycles = 6; - LoadIndexedXF(DataReadU32(), 0xE); - break; + refarray = 0xE; + goto load_indx; case GX_LOAD_INDX_D: //used for lights - if (end - g_video_buffer_read_ptr < 4) + refarray = 0xF; + goto load_indx; + load_indx: + if (end - *bufp < 4) return 0; cycles = 6; - LoadIndexedXF(DataReadU32(), 0xF); + if (is_preprocess) + PreprocessIndexedXF(DataRead(bufp), refarray); + else + LoadIndexedXF(DataRead(bufp), refarray); break; case GX_CMD_CALL_DL: { - if (end - g_video_buffer_read_ptr < 8) + if (end - *bufp < 8) return 0; - u32 address = DataReadU32(); - u32 count = DataReadU32(); - cycles = 6 + InterpretDisplayList(address, count); + u32 address = DataRead(bufp); + u32 count = DataRead(bufp); + if (is_preprocess) + InterpretDisplayListPreprocess(address, count); + else + cycles = 6 + InterpretDisplayList(address, count); } break; @@ -195,12 +230,19 @@ static u32 Decode(u8* end) // In skipped_frame case: We have to let BP writes through because they set // tokens and stuff. TODO: Call a much simplified LoadBPReg instead. 
{ - if (end - g_video_buffer_read_ptr < 4) + if (end - *bufp < 4) return 0; cycles = 12; - u32 bp_cmd = DataReadU32(); - LoadBPReg(bp_cmd); - INCSTAT(stats.thisFrame.numBPLoads); + u32 bp_cmd = DataRead(bufp); + if (is_preprocess) + { + LoadBPRegPreprocess(bp_cmd); + } + else + { + LoadBPReg(bp_cmd); + INCSTAT(stats.thisFrame.numBPLoads); + } } break; @@ -210,33 +252,43 @@ static u32 Decode(u8* end) { cycles = 1600; // load vertices - if (end - g_video_buffer_read_ptr < 2) + if (end - *bufp < 2) return 0; - u16 numVertices = DataReadU16(); + u16 num_vertices = DataRead(bufp); - if (!VertexLoaderManager::RunVertices( - cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) - (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, - numVertices, - end - g_video_buffer_read_ptr, - g_bSkipCurrentFrame)) + if (is_preprocess) { - return 0; + size_t size = num_vertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK, is_preprocess); + if ((size_t) (end - *bufp) < size) + return 0; + *bufp += size; + } + else + { + if (!VertexLoaderManager::RunVertices( + cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) + (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, + num_vertices, + end - *bufp, + g_bSkipCurrentFrame)) + return 0; } } else { - UnknownOpcode(cmd_byte, opcodeStart, false); + UnknownOpcode(cmd_byte, opcodeStart, is_preprocess); cycles = 1; } break; } // Display lists get added directly into the FIFO stream - if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) - FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_video_buffer_read_ptr - opcodeStart)); + if (!is_preprocess && g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) + FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(*bufp - opcodeStart)); - return cycles; + // In is_preprocess mode, we don't actually care about cycles, at least for + // now... make sure the compiler realizes that. + return is_preprocess ? 
1 : cycles; } void OpcodeDecoder_Init() @@ -255,7 +307,7 @@ u32 OpcodeDecoder_Run(u8* end) while (true) { u8* old = g_video_buffer_read_ptr; - u32 cycles = Decode(end); + u32 cycles = Decode(end); if (cycles == 0) { g_video_buffer_read_ptr = old; @@ -265,3 +317,17 @@ u32 OpcodeDecoder_Run(u8* end) } return totalCycles; } + +void OpcodeDecoder_Preprocess(u8 *end) +{ + while (true) + { + u8* old = g_video_buffer_pp_read_ptr; + u32 cycles = Decode(end); + if (cycles == 0) + { + g_video_buffer_pp_read_ptr = old; + break; + } + } +} diff --git a/Source/Core/VideoCommon/OpcodeDecoding.h b/Source/Core/VideoCommon/OpcodeDecoding.h index 1702969825..e5b1b23e89 100644 --- a/Source/Core/VideoCommon/OpcodeDecoding.h +++ b/Source/Core/VideoCommon/OpcodeDecoding.h @@ -39,3 +39,4 @@ extern bool g_bRecordFifoData; void OpcodeDecoder_Init(); void OpcodeDecoder_Shutdown(); u32 OpcodeDecoder_Run(u8* end); +void OpcodeDecoder_Preprocess(u8* write_ptr); diff --git a/Source/Core/VideoCommon/XFMemory.h b/Source/Core/VideoCommon/XFMemory.h index c9d4d35216..33077aa69b 100644 --- a/Source/Core/VideoCommon/XFMemory.h +++ b/Source/Core/VideoCommon/XFMemory.h @@ -275,3 +275,4 @@ extern XFMemory xfmem; void LoadXFReg(u32 transferSize, u32 address); void LoadIndexedXF(u32 val, int array); +void PreprocessIndexedXF(u32 val, int refarray); diff --git a/Source/Core/VideoCommon/XFStructs.cpp b/Source/Core/VideoCommon/XFStructs.cpp index a0941a0133..0552aa0986 100644 --- a/Source/Core/VideoCommon/XFStructs.cpp +++ b/Source/Core/VideoCommon/XFStructs.cpp @@ -6,6 +6,7 @@ #include "Core/HW/Memmap.h" #include "VideoCommon/CPMemory.h" #include "VideoCommon/DataReader.h" +#include "VideoCommon/Fifo.h" #include "VideoCommon/PixelShaderManager.h" #include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VertexShaderManager.h" @@ -252,7 +253,15 @@ void LoadIndexedXF(u32 val, int refarray) //load stuff from array to address in xf mem u32* currData = (u32*)(&xfmem) + address; - u32* newData = 
(u32*)Memory::GetPointer(g_main_cp_state.array_bases[refarray] + g_main_cp_state.array_strides[refarray] * index); + u32* newData; + if (g_use_deterministic_gpu_thread) + { + newData = (u32*)PopFifoAuxBuffer(size * sizeof(u32)); + } + else + { + newData = (u32*)Memory::GetPointer(g_main_cp_state.array_bases[refarray] + g_main_cp_state.array_strides[refarray] * index); + } bool changed = false; for (int i = 0; i < size; ++i) { @@ -269,3 +278,14 @@ void LoadIndexedXF(u32 val, int refarray) currData[i] = Common::swap32(newData[i]); } } + +void PreprocessIndexedXF(u32 val, int refarray) +{ + int index = val >> 16; + int size = ((val >> 12) & 0xF) + 1; + + u32* new_data = (u32*)Memory::GetPointer(g_preprocess_cp_state.array_bases[refarray] + g_preprocess_cp_state.array_strides[refarray] * index); + + size_t buf_size = size * sizeof(u32); + PushFifoAuxBuffer(new_data, buf_size); +} From 3a2048ea570b428ea34adbfea6b71280e1ae5cc0 Mon Sep 17 00:00:00 2001 From: comex Date: Sat, 6 Sep 2014 17:26:40 -0400 Subject: [PATCH 09/10] Add a central variable g_want_determinism which controls whether to try to make things deterministic. It now affects the GPU determinism mode as well as some miscellaneous things that were calling IsNetPlayRunning. Probably incomplete. Notably, this can change while paused, if the user starts recording a movie. The movie code appears to have been missing locking between setting g_playMode and doing other things, which probably had a small chance of causing crashes or even desynced movies; fix that with PauseAndLock. The next commit will add a hidden config variable to override GPU determinism mode. 
--- Source/Core/Core/Core.cpp | 31 +++++++++++++++++++ Source/Core/Core/Core.h | 5 +++ Source/Core/Core/HW/WiimoteEmu/WiimoteEmu.cpp | 4 +-- Source/Core/Core/IPC_HLE/WII_Socket.cpp | 14 ++++++--- Source/Core/Core/IPC_HLE/WII_Socket.h | 2 ++ Source/Core/Core/Movie.cpp | 9 ++++++ Source/Core/VideoCommon/Fifo.cpp | 28 ++++++++++++++++- Source/Core/VideoCommon/Fifo.h | 1 + Source/Core/VideoCommon/MainBase.cpp | 5 +++ Source/Core/VideoCommon/VideoBackendBase.h | 4 +++ 10 files changed, 95 insertions(+), 8 deletions(-) diff --git a/Source/Core/Core/Core.cpp b/Source/Core/Core/Core.cpp index 5b6294d2c7..cce3576100 100644 --- a/Source/Core/Core/Core.cpp +++ b/Source/Core/Core/Core.cpp @@ -48,6 +48,7 @@ #include "Core/HW/VideoInterface.h" #include "Core/HW/Wiimote.h" #include "Core/IPC_HLE/WII_IPC_HLE_Device_usb.h" +#include "Core/IPC_HLE/WII_Socket.h" #include "Core/PowerPC/PowerPC.h" #ifdef USE_GDBSTUB @@ -65,6 +66,8 @@ bool g_aspect_wide; namespace Core { +bool g_want_determinism; + // Declarations and definitions static Common::Timer s_timer; static volatile u32 s_drawn_frame = 0; @@ -177,6 +180,8 @@ bool Init() s_emu_thread.join(); } + Core::UpdateWantDeterminism(/*initial*/ true); + INFO_LOG(OSREPORT, "Starting core = %s mode", _CoreParameter.bWii ? "Wii" : "GameCube"); INFO_LOG(OSREPORT, "CPU Thread separate = %s", @@ -564,6 +569,9 @@ void RequestRefreshInfo() bool PauseAndLock(bool doLock, bool unpauseOnUnlock) { + if (!IsRunning()) + return true; + // let's support recursive locking to simplify things on the caller's side, // and let's do it at this outer level in case the individual systems don't support it. if (doLock ? s_pause_and_lock_depth++ : --s_pause_and_lock_depth) @@ -702,4 +710,27 @@ void SetOnStoppedCallback(StoppedCallbackFunc callback) s_on_stopped_callback = callback; } +void UpdateWantDeterminism(bool initial) +{ + // For now, this value is not itself configurable. Instead, individual + // settings that depend on it, such as GPU determinism mode,
should have + // override options for testing. + bool new_want_determinism = + Movie::IsPlayingInput() || + Movie::IsRecordingInput() || + NetPlay::IsNetPlayRunning(); + if (new_want_determinism != g_want_determinism || initial) + { + WARN_LOG(COMMON, "Want determinism <- %s", new_want_determinism ? "true" : "false"); + + bool was_unpaused = Core::PauseAndLock(true); + + g_want_determinism = new_want_determinism; + WiiSockMan::GetInstance().UpdateWantDeterminism(new_want_determinism); + g_video_backend->UpdateWantDeterminism(new_want_determinism); + + Core::PauseAndLock(false, was_unpaused); + } +} + } // Core diff --git a/Source/Core/Core/Core.h b/Source/Core/Core/Core.h index 2e9ccddfca..08ed7f1081 100644 --- a/Source/Core/Core/Core.h +++ b/Source/Core/Core/Core.h @@ -23,6 +23,8 @@ extern bool g_aspect_wide; namespace Core { +extern bool g_want_determinism; + bool GetIsFramelimiterTempDisabled(); void SetIsFramelimiterTempDisabled(bool disable); @@ -79,4 +81,7 @@ bool PauseAndLock(bool doLock, bool unpauseOnUnlock=true); typedef void(*StoppedCallbackFunc)(void); void SetOnStoppedCallback(StoppedCallbackFunc callback); +// Run on the GUI thread when the factors change. +void UpdateWantDeterminism(bool initial = false); + } // namespace diff --git a/Source/Core/Core/HW/WiimoteEmu/WiimoteEmu.cpp b/Source/Core/Core/HW/WiimoteEmu/WiimoteEmu.cpp index cc3e5cdcf0..da24bbb280 100644 --- a/Source/Core/Core/HW/WiimoteEmu/WiimoteEmu.cpp +++ b/Source/Core/Core/HW/WiimoteEmu/WiimoteEmu.cpp @@ -331,7 +331,7 @@ bool Wiimote::Step() m_rumble->controls[0]->control_ref->State(m_rumble_on); // when a movie is active, this button status update is disabled (moved), because movies only record data reports.
- if (!(Movie::IsMovieActive()) || NetPlay::IsNetPlayRunning()) + if (!Core::g_want_determinism) { UpdateButtonsStatus(); } @@ -385,7 +385,7 @@ void Wiimote::UpdateButtonsStatus() void Wiimote::GetCoreData(u8* const data) { // when a movie is active, the button update happens here instead of Wiimote::Step, to avoid potential desync issues. - if (Movie::IsMovieActive() || NetPlay::IsNetPlayRunning()) + if (Core::g_want_determinism) { UpdateButtonsStatus(); } diff --git a/Source/Core/Core/IPC_HLE/WII_Socket.cpp b/Source/Core/Core/IPC_HLE/WII_Socket.cpp index ce46a0fb3a..b5a130c2c6 100644 --- a/Source/Core/Core/IPC_HLE/WII_Socket.cpp +++ b/Source/Core/Core/IPC_HLE/WII_Socket.cpp @@ -4,8 +4,7 @@ #include -#include "Core/Movie.h" -#include "Core/NetPlayProto.h" +#include "Core/Core.h" #include "Core/IPC_HLE/WII_IPC_HLE.h" #include "Core/IPC_HLE/WII_IPC_HLE_Device.h" #include "Core/IPC_HLE/WII_Socket.h" // No Wii socket support while using NetPlay or TAS @@ -559,9 +558,7 @@ void WiiSockMan::AddSocket(s32 fd) s32 WiiSockMan::NewSocket(s32 af, s32 type, s32 protocol) { - if (NetPlay::IsNetPlayRunning() || - Movie::IsRecordingInput() || - Movie::IsPlayingInput()) + if (Core::g_want_determinism) { return SO_ENOMEM; } @@ -664,5 +661,12 @@ void WiiSockMan::Convert(sockaddr_in const & from, WiiSockAddrIn& to, s32 addrle to.len = addrlen; } +void WiiSockMan::UpdateWantDeterminism(bool want) +{ + // If we switched into movie recording, kill existing sockets. 
+ if (want) + Clean(); +} + #undef ERRORCODE #undef EITHER diff --git a/Source/Core/Core/IPC_HLE/WII_Socket.h b/Source/Core/Core/IPC_HLE/WII_Socket.h index f9b72f5425..abed7d9f29 100644 --- a/Source/Core/Core/IPC_HLE/WII_Socket.h +++ b/Source/Core/Core/IPC_HLE/WII_Socket.h @@ -242,6 +242,8 @@ public: } } + void UpdateWantDeterminism(bool want); + private: WiiSockMan() = default; diff --git a/Source/Core/Core/Movie.cpp b/Source/Core/Core/Movie.cpp index 5cba50a883..a06a7ca25b 100644 --- a/Source/Core/Core/Movie.cpp +++ b/Source/Core/Core/Movie.cpp @@ -437,6 +437,8 @@ bool BeginRecordingInput(int controllers) if (s_playMode != MODE_NONE || controllers == 0) return false; + bool was_unpaused = Core::PauseAndLock(true); + s_numPads = controllers; g_currentFrame = g_totalFrames = 0; g_currentLagCount = s_totalLagCount = 0; @@ -487,6 +489,10 @@ bool BeginRecordingInput(int controllers) s_currentByte = s_totalBytes = 0; + Core::UpdateWantDeterminism(); + + Core::PauseAndLock(false, was_unpaused); + Core::DisplayMessage("Starting movie recording", 2000); return true; } @@ -764,6 +770,8 @@ bool PlayInput(const std::string& filename) s_playMode = MODE_PLAYING; + Core::UpdateWantDeterminism(); + s_totalBytes = g_recordfd.GetSize() - 256; EnsureTmpInputSize((size_t)s_totalBytes); g_recordfd.ReadArray(tmpInput, (size_t)s_totalBytes); @@ -1097,6 +1105,7 @@ void EndPlayInput(bool cont) s_rerecords = 0; s_currentByte = 0; s_playMode = MODE_NONE; + Core::UpdateWantDeterminism(); Core::DisplayMessage("Movie End.", 2000); s_bRecordingFromSaveState = false; // we don't clear these things because otherwise we can't resume playback if we load a movie state later diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp index 87764ec85c..a47438c41e 100644 --- a/Source/Core/VideoCommon/Fifo.cpp +++ b/Source/Core/VideoCommon/Fifo.cpp @@ -11,13 +11,16 @@ #include "Core/ConfigManager.h" #include "Core/Core.h" #include "Core/CoreTiming.h" +#include "Core/NetPlayProto.h" 
#include "Core/HW/Memmap.h" #include "VideoCommon/CommandProcessor.h" +#include "VideoCommon/CPMemory.h" #include "VideoCommon/DataReader.h" #include "VideoCommon/Fifo.h" #include "VideoCommon/OpcodeDecoding.h" #include "VideoCommon/PixelEngine.h" +#include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VideoConfig.h" bool g_bSkipCurrentFrame = false; @@ -31,7 +34,7 @@ static u8 s_fifo_aux_data[FIFO_SIZE]; static u8* s_fifo_aux_write_ptr; static u8* s_fifo_aux_read_ptr; -bool g_use_deterministic_gpu_thread = true; // XXX +bool g_use_deterministic_gpu_thread; // STATE_TO_SAVE static std::mutex s_video_buffer_lock; @@ -413,3 +416,26 @@ void RunGpu() } CommandProcessor::SetCPStatusFromGPU(); } + +void Fifo_UpdateWantDeterminism(bool want) +{ + // We are paused (or not running at all yet) and have m_csHWVidOccupied, so + // it should be safe to change this. + g_use_deterministic_gpu_thread = want && SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread; + + // Hack: For now movies are an exception to this being on (but not + // to wanting determinism in general). Once vertex arrays are + // fixed, there should be no reason to want this off for movies by + // default, so this can be removed. + if (NetPlay::IsNetPlayRunning()) + g_use_deterministic_gpu_thread = false; + + if (g_use_deterministic_gpu_thread) + { + // These haven't been updated in non-deterministic mode. + s_video_buffer_seen_ptr = g_video_buffer_pp_read_ptr = g_video_buffer_read_ptr; + CopyPreprocessCPStateFromMain(); + VertexLoaderManager::MarkAllDirty(); + } + +} diff --git a/Source/Core/VideoCommon/Fifo.h b/Source/Core/VideoCommon/Fifo.h index 175d6b6e4a..40a5ad84b7 100644 --- a/Source/Core/VideoCommon/Fifo.h +++ b/Source/Core/VideoCommon/Fifo.h @@ -27,6 +27,7 @@ u8* GetVideoBufferEndPtr(); void Fifo_DoState(PointerWrap &f); void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock); +void Fifo_UpdateWantDeterminism(bool want); // Used for diagnostics. 
enum SyncGPUReason { diff --git a/Source/Core/VideoCommon/MainBase.cpp b/Source/Core/VideoCommon/MainBase.cpp index 082c81c2ea..d775cfe64c 100644 --- a/Source/Core/VideoCommon/MainBase.cpp +++ b/Source/Core/VideoCommon/MainBase.cpp @@ -309,3 +309,8 @@ void VideoBackendHardware::RegisterCPMMIO(MMIO::Mapping* mmio, u32 base) CommandProcessor::RegisterMMIO(mmio, base); } +void VideoBackendHardware::UpdateWantDeterminism(bool want) +{ + Fifo_UpdateWantDeterminism(want); +} + diff --git a/Source/Core/VideoCommon/VideoBackendBase.h b/Source/Core/VideoCommon/VideoBackendBase.h index 6ab9fce8f2..7d62dda37b 100644 --- a/Source/Core/VideoCommon/VideoBackendBase.h +++ b/Source/Core/VideoCommon/VideoBackendBase.h @@ -116,6 +116,8 @@ public: virtual void DoState(PointerWrap &p) = 0; virtual void CheckInvalidState() = 0; + + virtual void UpdateWantDeterminism(bool want) {} }; extern std::vector g_available_video_backends; @@ -151,6 +153,8 @@ class VideoBackendHardware : public VideoBackend void PauseAndLock(bool doLock, bool unpauseOnUnlock=true) override; void DoState(PointerWrap &p) override; + void UpdateWantDeterminism(bool want) override; + bool m_invalid; public: From 6c0a68d50765c24a205f7133205b54c26ae1bbfb Mon Sep 17 00:00:00 2001 From: comex Date: Sat, 6 Sep 2014 17:43:43 -0400 Subject: [PATCH 10/10] Add the override config option. I hate the config code, but now is not the time to fix it... 
--- Source/Core/Core/BootManager.cpp | 18 ++++++++++++ Source/Core/Core/ConfigManager.cpp | 2 ++ Source/Core/Core/CoreParameter.h | 13 +++++++++ Source/Core/VideoCommon/Fifo.cpp | 47 +++++++++++++++++++++--------- 4 files changed, 66 insertions(+), 14 deletions(-) diff --git a/Source/Core/Core/BootManager.cpp b/Source/Core/Core/BootManager.cpp index 8d9a305c29..bfea949301 100644 --- a/Source/Core/Core/BootManager.cpp +++ b/Source/Core/Core/BootManager.cpp @@ -55,10 +55,24 @@ struct ConfigCache unsigned int framelimit, frameSkip; TEXIDevices m_EXIDevice[MAX_EXI_CHANNELS]; std::string strBackend, sBackend; + std::string m_strGPUDeterminismMode; bool bSetFramelimit, bSetEXIDevice[MAX_EXI_CHANNELS], bSetVolume, bSetPads[MAX_SI_CHANNELS], bSetWiimoteSource[MAX_BBMOTES], bSetFrameSkip; }; static ConfigCache config_cache; +static GPUDeterminismMode ParseGPUDeterminismMode(const std::string& mode) +{ + if (mode == "auto") + return GPU_DETERMINISM_AUTO; + if (mode == "none") + return GPU_DETERMINISM_NONE; + if (mode == "fake-completion") + return GPU_DETERMINISM_FAKE_COMPLETION; + + NOTICE_LOG(BOOT, "Unknown GPU determinism mode %s", mode.c_str()); + return GPU_DETERMINISM_AUTO; +} + // Boot the ISO or file bool BootCore(const std::string& _rFilename) { @@ -109,6 +123,7 @@ bool BootCore(const std::string& _rFilename) config_cache.bMergeBlocks = StartUp.bMergeBlocks; config_cache.bDSPHLE = StartUp.bDSPHLE; config_cache.strBackend = StartUp.m_strVideoBackend; + config_cache.m_strGPUDeterminismMode = StartUp.m_strGPUDeterminismMode; config_cache.m_EnableJIT = SConfig::GetInstance().m_DSPEnableJIT; config_cache.bDSPThread = StartUp.bDSPThread; config_cache.Volume = SConfig::GetInstance().m_Volume; @@ -168,6 +183,8 @@ bool BootCore(const std::string& _rFilename) dsp_section->Get("EnableJIT", &SConfig::GetInstance().m_DSPEnableJIT, SConfig::GetInstance().m_DSPEnableJIT); dsp_section->Get("Backend", &SConfig::GetInstance().sBackend, SConfig::GetInstance().sBackend); 
VideoBackend::ActivateBackend(StartUp.m_strVideoBackend); + core_section->Get("GPUDeterminismMode", &StartUp.m_strGPUDeterminismMode, StartUp.m_strGPUDeterminismMode); + StartUp.m_GPUDeterminismMode = ParseGPUDeterminismMode(StartUp.m_strGPUDeterminismMode); for (unsigned int i = 0; i < MAX_SI_CHANNELS; ++i) { @@ -277,6 +294,7 @@ void Stop() StartUp.bDSPHLE = config_cache.bDSPHLE; StartUp.bDSPThread = config_cache.bDSPThread; StartUp.m_strVideoBackend = config_cache.strBackend; + StartUp.m_strGPUDeterminismMode = config_cache.m_strGPUDeterminismMode; VideoBackend::ActivateBackend(StartUp.m_strVideoBackend); StartUp.bHLE_BS2 = config_cache.bHLE_BS2; SConfig::GetInstance().sBackend = config_cache.sBackend; diff --git a/Source/Core/Core/ConfigManager.cpp b/Source/Core/Core/ConfigManager.cpp index 5b672305ed..3942fd87c9 100644 --- a/Source/Core/Core/ConfigManager.cpp +++ b/Source/Core/Core/ConfigManager.cpp @@ -317,6 +317,7 @@ void SConfig::SaveCoreSettings(IniFile& ini) core->Set("FrameLimit", m_Framelimit); core->Set("FrameSkip", m_FrameSkip); core->Set("GFXBackend", m_LocalCoreStartupParameter.m_strVideoBackend); + core->Set("GPUDeterminismMode", m_LocalCoreStartupParameter.m_strGPUDeterminismMode); } void SConfig::SaveMovieSettings(IniFile& ini) @@ -542,6 +543,7 @@ void SConfig::LoadCoreSettings(IniFile& ini) core->Get("FrameLimit", &m_Framelimit, 1); // auto frame limit by default core->Get("FrameSkip", &m_FrameSkip, 0); core->Get("GFXBackend", &m_LocalCoreStartupParameter.m_strVideoBackend, ""); + core->Get("GPUDeterminismMode", &m_LocalCoreStartupParameter.m_strGPUDeterminismMode, "auto"); } void SConfig::LoadMovieSettings(IniFile& ini) diff --git a/Source/Core/Core/CoreParameter.h b/Source/Core/Core/CoreParameter.h index b6008b3145..898c949714 100644 --- a/Source/Core/Core/CoreParameter.h +++ b/Source/Core/Core/CoreParameter.h @@ -97,6 +97,15 @@ enum Hotkey NUM_HOTKEYS, }; +enum GPUDeterminismMode +{ + GPU_DETERMINISM_AUTO, + GPU_DETERMINISM_NONE, + // This is 
currently the only mode. There will probably be at least + // one more at some point. + GPU_DETERMINISM_FAKE_COMPLETION, +}; + struct SCoreStartupParameter { // Settings @@ -200,6 +209,10 @@ struct SCoreStartupParameter EBootType m_BootType; std::string m_strVideoBackend; + std::string m_strGPUDeterminismMode; + + // set based on the string version + GPUDeterminismMode m_GPUDeterminismMode; // files std::string m_strFilename; diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp index a47438c41e..44322d4d3b 100644 --- a/Source/Core/VideoCommon/Fifo.cpp +++ b/Source/Core/VideoCommon/Fifo.cpp @@ -421,21 +421,40 @@ void Fifo_UpdateWantDeterminism(bool want) { // We are paused (or not running at all yet) and have m_csHWVidOccupied, so // it should be safe to change this. - g_use_deterministic_gpu_thread = want && SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread; - - // Hack: For now movies are an exception to this being on (but not - // to wanting determinism in general). Once vertex arrays are - // fixed, there should be no reason to want this off for movies by - // default, so this can be removed. - if (NetPlay::IsNetPlayRunning()) - g_use_deterministic_gpu_thread = false; - - if (g_use_deterministic_gpu_thread) + const SCoreStartupParameter& param = SConfig::GetInstance().m_LocalCoreStartupParameter; + bool gpu_thread; + switch (param.m_GPUDeterminismMode) { - // These haven't been updated in non-deterministic mode. - s_video_buffer_seen_ptr = g_video_buffer_pp_read_ptr = g_video_buffer_read_ptr; - CopyPreprocessCPStateFromMain(); - VertexLoaderManager::MarkAllDirty(); + case GPU_DETERMINISM_AUTO: + gpu_thread = want; + + // Hack: For now movies are an exception to this being on (but not + // to wanting determinism in general). Once vertex arrays are + // fixed, there should be no reason to want this off for movies by + // default, so this can be removed. 
+ if (!NetPlay::IsNetPlayRunning()) + gpu_thread = false; + + break; + case GPU_DETERMINISM_NONE: + gpu_thread = false; + break; + case GPU_DETERMINISM_FAKE_COMPLETION: + gpu_thread = true; + break; } + gpu_thread = gpu_thread && SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread; + + if (g_use_deterministic_gpu_thread != gpu_thread) + { + g_use_deterministic_gpu_thread = gpu_thread; + if (gpu_thread) + { + // These haven't been updated in non-deterministic mode. + s_video_buffer_seen_ptr = g_video_buffer_pp_read_ptr = g_video_buffer_read_ptr; + CopyPreprocessCPStateFromMain(); + VertexLoaderManager::MarkAllDirty(); + } + } }