diff --git a/Source/Core/Core/BootManager.cpp b/Source/Core/Core/BootManager.cpp index 8d9a305c29..bfea949301 100644 --- a/Source/Core/Core/BootManager.cpp +++ b/Source/Core/Core/BootManager.cpp @@ -55,10 +55,24 @@ struct ConfigCache unsigned int framelimit, frameSkip; TEXIDevices m_EXIDevice[MAX_EXI_CHANNELS]; std::string strBackend, sBackend; + std::string m_strGPUDeterminismMode; bool bSetFramelimit, bSetEXIDevice[MAX_EXI_CHANNELS], bSetVolume, bSetPads[MAX_SI_CHANNELS], bSetWiimoteSource[MAX_BBMOTES], bSetFrameSkip; }; static ConfigCache config_cache; +static GPUDeterminismMode ParseGPUDeterminismMode(const std::string& mode) +{ + if (mode == "auto") + return GPU_DETERMINISM_AUTO; + if (mode == "none") + return GPU_DETERMINISM_NONE; + if (mode == "fake-completion") + return GPU_DETERMINISM_FAKE_COMPLETION; + + NOTICE_LOG(BOOT, "Unknown GPU determinism mode %s", mode.c_str()); + return GPU_DETERMINISM_AUTO; +} + // Boot the ISO or file bool BootCore(const std::string& _rFilename) { @@ -109,6 +123,7 @@ bool BootCore(const std::string& _rFilename) config_cache.bMergeBlocks = StartUp.bMergeBlocks; config_cache.bDSPHLE = StartUp.bDSPHLE; config_cache.strBackend = StartUp.m_strVideoBackend; + config_cache.m_strGPUDeterminismMode = StartUp.m_strGPUDeterminismMode; config_cache.m_EnableJIT = SConfig::GetInstance().m_DSPEnableJIT; config_cache.bDSPThread = StartUp.bDSPThread; config_cache.Volume = SConfig::GetInstance().m_Volume; @@ -168,6 +183,8 @@ bool BootCore(const std::string& _rFilename) dsp_section->Get("EnableJIT", &SConfig::GetInstance().m_DSPEnableJIT, SConfig::GetInstance().m_DSPEnableJIT); dsp_section->Get("Backend", &SConfig::GetInstance().sBackend, SConfig::GetInstance().sBackend); VideoBackend::ActivateBackend(StartUp.m_strVideoBackend); + core_section->Get("GPUDeterminismMode", &StartUp.m_strGPUDeterminismMode, StartUp.m_strGPUDeterminismMode); + StartUp.m_GPUDeterminismMode = ParseGPUDeterminismMode(StartUp.m_strGPUDeterminismMode); for (unsigned int i 
= 0; i < MAX_SI_CHANNELS; ++i) { @@ -277,6 +294,7 @@ void Stop() StartUp.bDSPHLE = config_cache.bDSPHLE; StartUp.bDSPThread = config_cache.bDSPThread; StartUp.m_strVideoBackend = config_cache.strBackend; + StartUp.m_strGPUDeterminismMode = config_cache.m_strGPUDeterminismMode; VideoBackend::ActivateBackend(StartUp.m_strVideoBackend); StartUp.bHLE_BS2 = config_cache.bHLE_BS2; SConfig::GetInstance().sBackend = config_cache.sBackend; diff --git a/Source/Core/Core/ConfigManager.cpp b/Source/Core/Core/ConfigManager.cpp index 5b672305ed..3942fd87c9 100644 --- a/Source/Core/Core/ConfigManager.cpp +++ b/Source/Core/Core/ConfigManager.cpp @@ -317,6 +317,7 @@ void SConfig::SaveCoreSettings(IniFile& ini) core->Set("FrameLimit", m_Framelimit); core->Set("FrameSkip", m_FrameSkip); core->Set("GFXBackend", m_LocalCoreStartupParameter.m_strVideoBackend); + core->Set("GPUDeterminismMode", m_LocalCoreStartupParameter.m_strGPUDeterminismMode); } void SConfig::SaveMovieSettings(IniFile& ini) @@ -542,6 +543,7 @@ void SConfig::LoadCoreSettings(IniFile& ini) core->Get("FrameLimit", &m_Framelimit, 1); // auto frame limit by default core->Get("FrameSkip", &m_FrameSkip, 0); core->Get("GFXBackend", &m_LocalCoreStartupParameter.m_strVideoBackend, ""); + core->Get("GPUDeterminismMode", &m_LocalCoreStartupParameter.m_strGPUDeterminismMode, "auto"); } void SConfig::LoadMovieSettings(IniFile& ini) diff --git a/Source/Core/Core/Core.cpp b/Source/Core/Core/Core.cpp index 5b6294d2c7..cce3576100 100644 --- a/Source/Core/Core/Core.cpp +++ b/Source/Core/Core/Core.cpp @@ -48,6 +48,7 @@ #include "Core/HW/VideoInterface.h" #include "Core/HW/Wiimote.h" #include "Core/IPC_HLE/WII_IPC_HLE_Device_usb.h" +#include "Core/IPC_HLE/WII_Socket.h" #include "Core/PowerPC/PowerPC.h" #ifdef USE_GDBSTUB @@ -65,6 +66,8 @@ bool g_aspect_wide; namespace Core { +bool g_want_determinism; + // Declarations and definitions static Common::Timer s_timer; static volatile u32 s_drawn_frame = 0; @@ -177,6 +180,8 @@ bool Init() 
s_emu_thread.join(); } + Core::UpdateWantDeterminism(/*initial*/ true); + INFO_LOG(OSREPORT, "Starting core = %s mode", _CoreParameter.bWii ? "Wii" : "GameCube"); INFO_LOG(OSREPORT, "CPU Thread separate = %s", @@ -564,6 +569,9 @@ void RequestRefreshInfo() bool PauseAndLock(bool doLock, bool unpauseOnUnlock) { + if (!IsRunning()) + return true; + // let's support recursive locking to simplify things on the caller's side, // and let's do it at this outer level in case the individual systems don't support it. if (doLock ? s_pause_and_lock_depth++ : --s_pause_and_lock_depth) @@ -702,4 +710,27 @@ void SetOnStoppedCallback(StoppedCallbackFunc callback) s_on_stopped_callback = callback; } +void UpdateWantDeterminism(bool initial) +{ + // For now, this value is not itself configurable. Instead, individual + // settings that depend on it, such as GPU determinism mode, should have + // override options for testing. + bool new_want_determinism = + Movie::IsPlayingInput() || + Movie::IsRecordingInput() || + NetPlay::IsNetPlayRunning(); + if (new_want_determinism != g_want_determinism || initial) + { + WARN_LOG(COMMON, "Want determinism <- %s", new_want_determinism ? 
"true" : "false"); + + bool was_unpaused = Core::PauseAndLock(true); + + g_want_determinism = new_want_determinism; + WiiSockMan::GetInstance().UpdateWantDeterminism(new_want_determinism); + g_video_backend->UpdateWantDeterminism(new_want_determinism); + + Core::PauseAndLock(false, was_unpaused); + } +} + } // Core diff --git a/Source/Core/Core/Core.h b/Source/Core/Core/Core.h index 2e9ccddfca..08ed7f1081 100644 --- a/Source/Core/Core/Core.h +++ b/Source/Core/Core/Core.h @@ -23,6 +23,8 @@ extern bool g_aspect_wide; namespace Core { +extern bool g_want_determinism; + bool GetIsFramelimiterTempDisabled(); void SetIsFramelimiterTempDisabled(bool disable); @@ -79,4 +81,7 @@ bool PauseAndLock(bool doLock, bool unpauseOnUnlock=true); typedef void(*StoppedCallbackFunc)(void); void SetOnStoppedCallback(StoppedCallbackFunc callback); +// Run on the GUI thread when the factors change. +void UpdateWantDeterminism(bool initial = false); + } // namespace diff --git a/Source/Core/Core/CoreParameter.h b/Source/Core/Core/CoreParameter.h index b6008b3145..898c949714 100644 --- a/Source/Core/Core/CoreParameter.h +++ b/Source/Core/Core/CoreParameter.h @@ -97,6 +97,15 @@ enum Hotkey NUM_HOTKEYS, }; +enum GPUDeterminismMode +{ + GPU_DETERMINISM_AUTO, + GPU_DETERMINISM_NONE, + // This is currently the only mode. There will probably be at least + // one more at some point. 
+ GPU_DETERMINISM_FAKE_COMPLETION, +}; + struct SCoreStartupParameter { // Settings @@ -200,6 +209,10 @@ struct SCoreStartupParameter EBootType m_BootType; std::string m_strVideoBackend; + std::string m_strGPUDeterminismMode; + + // set based on the string version + GPUDeterminismMode m_GPUDeterminismMode; // files std::string m_strFilename; diff --git a/Source/Core/Core/HW/WiimoteEmu/WiimoteEmu.cpp b/Source/Core/Core/HW/WiimoteEmu/WiimoteEmu.cpp index cc3e5cdcf0..da24bbb280 100644 --- a/Source/Core/Core/HW/WiimoteEmu/WiimoteEmu.cpp +++ b/Source/Core/Core/HW/WiimoteEmu/WiimoteEmu.cpp @@ -331,7 +331,7 @@ bool Wiimote::Step() m_rumble->controls[0]->control_ref->State(m_rumble_on); // when a movie is active, this button status update is disabled (moved), because movies only record data reports. - if (!(Movie::IsMovieActive()) || NetPlay::IsNetPlayRunning()) + if (!Core::g_want_determinism) { UpdateButtonsStatus(); } @@ -385,7 +385,7 @@ void Wiimote::UpdateButtonsStatus() void Wiimote::GetCoreData(u8* const data) { // when a movie is active, the button update happens here instead of Wiimote::Step, to avoid potential desync issues. 
- if (Movie::IsMovieActive() || NetPlay::IsNetPlayRunning()) + if (Core::g_want_determinism) { UpdateButtonsStatus(); } diff --git a/Source/Core/Core/IPC_HLE/WII_Socket.cpp b/Source/Core/Core/IPC_HLE/WII_Socket.cpp index ce46a0fb3a..b5a130c2c6 100644 --- a/Source/Core/Core/IPC_HLE/WII_Socket.cpp +++ b/Source/Core/Core/IPC_HLE/WII_Socket.cpp @@ -4,8 +4,7 @@ #include -#include "Core/Movie.h" -#include "Core/NetPlayProto.h" +#include "Core/Core.h" #include "Core/IPC_HLE/WII_IPC_HLE.h" #include "Core/IPC_HLE/WII_IPC_HLE_Device.h" #include "Core/IPC_HLE/WII_Socket.h" // No Wii socket support while using NetPlay or TAS @@ -559,9 +558,7 @@ void WiiSockMan::AddSocket(s32 fd) s32 WiiSockMan::NewSocket(s32 af, s32 type, s32 protocol) { - if (NetPlay::IsNetPlayRunning() || - Movie::IsRecordingInput() || - Movie::IsPlayingInput()) + if (Core::g_want_determinism) { return SO_ENOMEM; } @@ -664,5 +661,12 @@ void WiiSockMan::Convert(sockaddr_in const & from, WiiSockAddrIn& to, s32 addrle to.len = addrlen; } +void WiiSockMan::UpdateWantDeterminism(bool want) +{ + // If determinism is now wanted (movie recording/playback or NetPlay), kill existing sockets. 
+ if (want) + Clean(); +} + #undef ERRORCODE #undef EITHER diff --git a/Source/Core/Core/IPC_HLE/WII_Socket.h b/Source/Core/Core/IPC_HLE/WII_Socket.h index f9b72f5425..abed7d9f29 100644 --- a/Source/Core/Core/IPC_HLE/WII_Socket.h +++ b/Source/Core/Core/IPC_HLE/WII_Socket.h @@ -242,6 +242,8 @@ public: } } + void UpdateWantDeterminism(bool want); + private: WiiSockMan() = default; diff --git a/Source/Core/Core/Movie.cpp b/Source/Core/Core/Movie.cpp index 5cba50a883..a06a7ca25b 100644 --- a/Source/Core/Core/Movie.cpp +++ b/Source/Core/Core/Movie.cpp @@ -437,6 +437,8 @@ bool BeginRecordingInput(int controllers) if (s_playMode != MODE_NONE || controllers == 0) return false; + bool was_unpaused = Core::PauseAndLock(true); + s_numPads = controllers; g_currentFrame = g_totalFrames = 0; g_currentLagCount = s_totalLagCount = 0; @@ -487,6 +489,10 @@ bool BeginRecordingInput(int controllers) s_currentByte = s_totalBytes = 0; + Core::UpdateWantDeterminism(); + + Core::PauseAndLock(false, was_unpaused); + Core::DisplayMessage("Starting movie recording", 2000); return true; } @@ -764,6 +770,8 @@ bool PlayInput(const std::string& filename) s_playMode = MODE_PLAYING; + Core::UpdateWantDeterminism(); + s_totalBytes = g_recordfd.GetSize() - 256; EnsureTmpInputSize((size_t)s_totalBytes); g_recordfd.ReadArray(tmpInput, (size_t)s_totalBytes); @@ -1097,6 +1105,7 @@ void EndPlayInput(bool cont) s_rerecords = 0; s_currentByte = 0; s_playMode = MODE_NONE; + Core::UpdateWantDeterminism(); Core::DisplayMessage("Movie End.", 2000); s_bRecordingFromSaveState = false; // we don't clear these things because otherwise we can't resume playback if we load a movie state later diff --git a/Source/Core/VideoBackends/Software/CPMemLoader.cpp b/Source/Core/VideoBackends/Software/CPMemLoader.cpp index 8e78059616..21d3861d90 100644 --- a/Source/Core/VideoBackends/Software/CPMemLoader.cpp +++ b/Source/Core/VideoBackends/Software/CPMemLoader.cpp @@ -13,46 +13,46 @@ void SWLoadCPReg(u32 sub_cmd, u32 value) 
switch (sub_cmd & 0xF0) { case 0x30: - MatrixIndexA.Hex = value; + g_main_cp_state.matrix_index_a.Hex = value; break; case 0x40: - MatrixIndexB.Hex = value; + g_main_cp_state.matrix_index_b.Hex = value; break; case 0x50: - g_VtxDesc.Hex &= ~0x1FFFF; // keep the Upper bits - g_VtxDesc.Hex |= value; + g_main_cp_state.vtx_desc.Hex &= ~0x1FFFF; // keep the Upper bits + g_main_cp_state.vtx_desc.Hex |= value; break; case 0x60: - g_VtxDesc.Hex &= 0x1FFFF; // keep the lower 17Bits - g_VtxDesc.Hex |= (u64)value << 17; + g_main_cp_state.vtx_desc.Hex &= 0x1FFFF; // keep the lower 17Bits + g_main_cp_state.vtx_desc.Hex |= (u64)value << 17; break; case 0x70: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g0.Hex = value; + g_main_cp_state.vtx_attr[sub_cmd & 7].g0.Hex = value; break; case 0x80: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g1.Hex = value; + g_main_cp_state.vtx_attr[sub_cmd & 7].g1.Hex = value; break; case 0x90: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g2.Hex = value; + g_main_cp_state.vtx_attr[sub_cmd & 7].g2.Hex = value; break; // Pointers to vertex arrays in GC RAM case 0xA0: - arraybases[sub_cmd & 0xF] = value; + g_main_cp_state.array_bases[sub_cmd & 0xF] = value; cached_arraybases[sub_cmd & 0xF] = Memory::GetPointer(value); break; case 0xB0: - arraystrides[sub_cmd & 0xF] = value & 0xFF; + g_main_cp_state.array_strides[sub_cmd & 0xF] = value & 0xFF; break; } } diff --git a/Source/Core/VideoBackends/Software/OpcodeDecoder.cpp b/Source/Core/VideoBackends/Software/OpcodeDecoder.cpp index 77deeed01f..66816e6626 100644 --- a/Source/Core/VideoBackends/Software/OpcodeDecoder.cpp +++ b/Source/Core/VideoBackends/Software/OpcodeDecoder.cpp @@ -57,7 +57,7 @@ static void DecodePrimitiveStream(u32 iBufferSize) { while (streamSize > 0 && iBufferSize >= vertexSize) { - g_pVideoData += vertexSize; + g_video_buffer_read_ptr += vertexSize; iBufferSize -= vertexSize; streamSize--; } @@ -94,26 +94,26 @@ static void ReadXFData(u32 iBufferSize) 
static void ExecuteDisplayList(u32 addr, u32 count) { - u8 *videoDataSave = g_pVideoData; + u8 *videoDataSave = g_video_buffer_read_ptr; u8 *dlStart = Memory::GetPointer(addr); - g_pVideoData = dlStart; + g_video_buffer_read_ptr = dlStart; while (OpcodeDecoder::CommandRunnable(count)) { OpcodeDecoder::Run(count); // if data was read by the opcode decoder then the video data pointer changed - u32 readCount = (u32)(g_pVideoData - dlStart); - dlStart = g_pVideoData; + u32 readCount = (u32)(g_video_buffer_read_ptr - dlStart); + dlStart = g_video_buffer_read_ptr; _assert_msg_(VIDEO, count >= readCount, "Display list underrun"); count -= readCount; } - g_pVideoData = videoDataSave; + g_video_buffer_read_ptr = videoDataSave; } static void DecodeStandard(u32 bufferSize) diff --git a/Source/Core/VideoBackends/Software/SWCommandProcessor.cpp b/Source/Core/VideoBackends/Software/SWCommandProcessor.cpp index 5f227d4b5c..56832eb786 100644 --- a/Source/Core/VideoBackends/Software/SWCommandProcessor.cpp +++ b/Source/Core/VideoBackends/Software/SWCommandProcessor.cpp @@ -57,7 +57,7 @@ void DoState(PointerWrap &p) p.Do(interruptWaiting); // Is this right? 
- p.DoArray(g_pVideoData,writePos); + p.DoArray(g_video_buffer_read_ptr,writePos); } static void UpdateInterrupts_Wrapper(u64 userdata, int cyclesLate) @@ -95,7 +95,7 @@ void Init() interruptSet = false; interruptWaiting = false; - g_pVideoData = nullptr; + g_video_buffer_read_ptr = nullptr; g_bSkipCurrentFrame = false; } @@ -311,7 +311,7 @@ bool RunBuffer() _dbg_assert_(COMMANDPROCESSOR, writePos >= readPos); - g_pVideoData = &commandBuffer[readPos]; + g_video_buffer_read_ptr = &commandBuffer[readPos]; u32 availableBytes = writePos - readPos; @@ -322,7 +322,7 @@ bool RunBuffer() OpcodeDecoder::Run(availableBytes); // if data was read by the opcode decoder then the video data pointer changed - readPos = (u32)(g_pVideoData - &commandBuffer[0]); + readPos = (u32)(g_video_buffer_read_ptr - &commandBuffer[0]); _dbg_assert_(VIDEO, writePos >= readPos); availableBytes = writePos - readPos; } diff --git a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp index dcd71d35a4..e4848ceb34 100644 --- a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp +++ b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp @@ -39,7 +39,7 @@ SWVertexLoader::~SWVertexLoader() void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) { - m_CurrentVat = &g_VtxAttr[attributeIndex]; + m_CurrentVat = &g_main_cp_state.vtx_attr[attributeIndex]; posScale = 1.0f / float(1 << m_CurrentVat->g0.PosFrac); tcScale[0] = 1.0f / float(1 << m_CurrentVat->g0.Tex0Frac); @@ -53,20 +53,20 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) //TexMtx const u64 tmDesc[8] = { - g_VtxDesc.Tex0MatIdx, g_VtxDesc.Tex1MatIdx, g_VtxDesc.Tex2MatIdx, g_VtxDesc.Tex3MatIdx, - g_VtxDesc.Tex4MatIdx, g_VtxDesc.Tex5MatIdx, g_VtxDesc.Tex6MatIdx, g_VtxDesc.Tex7MatIdx + g_main_cp_state.vtx_desc.Tex0MatIdx, g_main_cp_state.vtx_desc.Tex1MatIdx, g_main_cp_state.vtx_desc.Tex2MatIdx, g_main_cp_state.vtx_desc.Tex3MatIdx, + 
g_main_cp_state.vtx_desc.Tex4MatIdx, g_main_cp_state.vtx_desc.Tex5MatIdx, g_main_cp_state.vtx_desc.Tex6MatIdx, g_main_cp_state.vtx_desc.Tex7MatIdx }; // Colors - const u64 colDesc[2] = {g_VtxDesc.Color0, g_VtxDesc.Color1}; + const u64 colDesc[2] = {g_main_cp_state.vtx_desc.Color0, g_main_cp_state.vtx_desc.Color1}; colElements[0] = m_CurrentVat->g0.Color0Elements; colElements[1] = m_CurrentVat->g0.Color1Elements; const u32 colComp[2] = {m_CurrentVat->g0.Color0Comp, m_CurrentVat->g0.Color1Comp}; // TextureCoord const u64 tcDesc[8] = { - g_VtxDesc.Tex0Coord, g_VtxDesc.Tex1Coord, g_VtxDesc.Tex2Coord, g_VtxDesc.Tex3Coord, - g_VtxDesc.Tex4Coord, g_VtxDesc.Tex5Coord, g_VtxDesc.Tex6Coord, g_VtxDesc.Tex7Coord + g_main_cp_state.vtx_desc.Tex0Coord, g_main_cp_state.vtx_desc.Tex1Coord, g_main_cp_state.vtx_desc.Tex2Coord, g_main_cp_state.vtx_desc.Tex3Coord, + g_main_cp_state.vtx_desc.Tex4Coord, g_main_cp_state.vtx_desc.Tex5Coord, g_main_cp_state.vtx_desc.Tex6Coord, g_main_cp_state.vtx_desc.Tex7Coord }; const u32 tcElements[8] = { m_CurrentVat->g0.Tex0CoordElements, m_CurrentVat->g1.Tex1CoordElements, m_CurrentVat->g1.Tex2CoordElements, @@ -89,15 +89,15 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) // Reset vertex // matrix index from xf regs or cp memory? 
- if (xfmem.MatrixIndexA.PosNormalMtxIdx != MatrixIndexA.PosNormalMtxIdx || - xfmem.MatrixIndexA.Tex0MtxIdx != MatrixIndexA.Tex0MtxIdx || - xfmem.MatrixIndexA.Tex1MtxIdx != MatrixIndexA.Tex1MtxIdx || - xfmem.MatrixIndexA.Tex2MtxIdx != MatrixIndexA.Tex2MtxIdx || - xfmem.MatrixIndexA.Tex3MtxIdx != MatrixIndexA.Tex3MtxIdx || - xfmem.MatrixIndexB.Tex4MtxIdx != MatrixIndexB.Tex4MtxIdx || - xfmem.MatrixIndexB.Tex5MtxIdx != MatrixIndexB.Tex5MtxIdx || - xfmem.MatrixIndexB.Tex6MtxIdx != MatrixIndexB.Tex6MtxIdx || - xfmem.MatrixIndexB.Tex7MtxIdx != MatrixIndexB.Tex7MtxIdx) + if (xfmem.MatrixIndexA.PosNormalMtxIdx != g_main_cp_state.matrix_index_a.PosNormalMtxIdx || + xfmem.MatrixIndexA.Tex0MtxIdx != g_main_cp_state.matrix_index_a.Tex0MtxIdx || + xfmem.MatrixIndexA.Tex1MtxIdx != g_main_cp_state.matrix_index_a.Tex1MtxIdx || + xfmem.MatrixIndexA.Tex2MtxIdx != g_main_cp_state.matrix_index_a.Tex2MtxIdx || + xfmem.MatrixIndexA.Tex3MtxIdx != g_main_cp_state.matrix_index_a.Tex3MtxIdx || + xfmem.MatrixIndexB.Tex4MtxIdx != g_main_cp_state.matrix_index_b.Tex4MtxIdx || + xfmem.MatrixIndexB.Tex5MtxIdx != g_main_cp_state.matrix_index_b.Tex5MtxIdx || + xfmem.MatrixIndexB.Tex6MtxIdx != g_main_cp_state.matrix_index_b.Tex6MtxIdx || + xfmem.MatrixIndexB.Tex7MtxIdx != g_main_cp_state.matrix_index_b.Tex7MtxIdx) { WARN_LOG(VIDEO, "Matrix indices don't match"); @@ -118,18 +118,18 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) m_Vertex.texMtx[6] = xfmem.MatrixIndexB.Tex6MtxIdx; m_Vertex.texMtx[7] = xfmem.MatrixIndexB.Tex7MtxIdx; #else - m_Vertex.posMtx = MatrixIndexA.PosNormalMtxIdx; - m_Vertex.texMtx[0] = MatrixIndexA.Tex0MtxIdx; - m_Vertex.texMtx[1] = MatrixIndexA.Tex1MtxIdx; - m_Vertex.texMtx[2] = MatrixIndexA.Tex2MtxIdx; - m_Vertex.texMtx[3] = MatrixIndexA.Tex3MtxIdx; - m_Vertex.texMtx[4] = MatrixIndexB.Tex4MtxIdx; - m_Vertex.texMtx[5] = MatrixIndexB.Tex5MtxIdx; - m_Vertex.texMtx[6] = MatrixIndexB.Tex6MtxIdx; - m_Vertex.texMtx[7] = MatrixIndexB.Tex7MtxIdx; + 
m_Vertex.posMtx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx; + m_Vertex.texMtx[0] = g_main_cp_state.matrix_index_a.Tex0MtxIdx; + m_Vertex.texMtx[1] = g_main_cp_state.matrix_index_a.Tex1MtxIdx; + m_Vertex.texMtx[2] = g_main_cp_state.matrix_index_a.Tex2MtxIdx; + m_Vertex.texMtx[3] = g_main_cp_state.matrix_index_a.Tex3MtxIdx; + m_Vertex.texMtx[4] = g_main_cp_state.matrix_index_b.Tex4MtxIdx; + m_Vertex.texMtx[5] = g_main_cp_state.matrix_index_b.Tex5MtxIdx; + m_Vertex.texMtx[6] = g_main_cp_state.matrix_index_b.Tex6MtxIdx; + m_Vertex.texMtx[7] = g_main_cp_state.matrix_index_b.Tex7MtxIdx; #endif - if (g_VtxDesc.PosMatIdx != NOT_PRESENT) + if (g_main_cp_state.vtx_desc.PosMatIdx != NOT_PRESENT) { AddAttributeLoader(LoadPosMtx); m_VertexSize++; @@ -145,17 +145,17 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) } // Write vertex position loader - m_positionLoader = VertexLoader_Position::GetFunction(g_VtxDesc.Position, m_CurrentVat->g0.PosFormat, m_CurrentVat->g0.PosElements); - m_VertexSize += VertexLoader_Position::GetSize(g_VtxDesc.Position, m_CurrentVat->g0.PosFormat, m_CurrentVat->g0.PosElements); + m_positionLoader = VertexLoader_Position::GetFunction(g_main_cp_state.vtx_desc.Position, m_CurrentVat->g0.PosFormat, m_CurrentVat->g0.PosElements); + m_VertexSize += VertexLoader_Position::GetSize(g_main_cp_state.vtx_desc.Position, m_CurrentVat->g0.PosFormat, m_CurrentVat->g0.PosElements); AddAttributeLoader(LoadPosition); // Normals - if (g_VtxDesc.Normal != NOT_PRESENT) + if (g_main_cp_state.vtx_desc.Normal != NOT_PRESENT) { - m_VertexSize += VertexLoader_Normal::GetSize(g_VtxDesc.Normal, + m_VertexSize += VertexLoader_Normal::GetSize(g_main_cp_state.vtx_desc.Normal, m_CurrentVat->g0.NormalFormat, m_CurrentVat->g0.NormalElements, m_CurrentVat->g0.NormalIndex3); - m_normalLoader = VertexLoader_Normal::GetFunction(g_VtxDesc.Normal, + m_normalLoader = VertexLoader_Normal::GetFunction(g_main_cp_state.vtx_desc.Normal, m_CurrentVat->g0.NormalFormat, 
m_CurrentVat->g0.NormalElements, m_CurrentVat->g0.NormalIndex3); if (m_normalLoader == nullptr) @@ -234,8 +234,8 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) // special case if only pos and tex coord 0 and tex coord input is AB11 m_TexGenSpecialCase = - ((g_VtxDesc.Hex & 0x60600L) == g_VtxDesc.Hex) && // only pos and tex coord 0 - (g_VtxDesc.Tex0Coord != NOT_PRESENT) && + ((g_main_cp_state.vtx_desc.Hex & 0x60600L) == g_main_cp_state.vtx_desc.Hex) && // only pos and tex coord 0 + (g_main_cp_state.vtx_desc.Tex0Coord != NOT_PRESENT) && (xfmem.texMtxInfo[0].projection == XF_TEXPROJ_ST); m_SetupUnit->Init(primitiveType); @@ -252,7 +252,7 @@ void SWVertexLoader::LoadVertex() // transform input data TransformUnit::TransformPosition(&m_Vertex, outVertex); - if (g_VtxDesc.Normal != NOT_PRESENT) + if (g_main_cp_state.vtx_desc.Normal != NOT_PRESENT) { TransformUnit::TransformNormal(&m_Vertex, m_CurrentVat->g0.NormalElements, outVertex); } diff --git a/Source/Core/VideoBackends/Software/SWmain.cpp b/Source/Core/VideoBackends/Software/SWmain.cpp index 25d9e0d543..9063cece5e 100644 --- a/Source/Core/VideoBackends/Software/SWmain.cpp +++ b/Source/Core/VideoBackends/Software/SWmain.cpp @@ -116,14 +116,7 @@ void VideoSoftware::DoState(PointerWrap& p) p.DoPOD(swstats); // CP Memory - p.DoArray(arraybases, 16); - p.DoArray(arraystrides, 16); - p.Do(MatrixIndexA); - p.Do(MatrixIndexB); - p.Do(g_VtxDesc.Hex); - p.DoArray(g_VtxAttr, 8); - p.DoMarker("CP Memory"); - + DoCPState(p); } void VideoSoftware::CheckInvalidState() diff --git a/Source/Core/VideoBackends/Software/XFMemLoader.cpp b/Source/Core/VideoBackends/Software/XFMemLoader.cpp index 08e03ad8d8..1ad1804f72 100644 --- a/Source/Core/VideoBackends/Software/XFMemLoader.cpp +++ b/Source/Core/VideoBackends/Software/XFMemLoader.cpp @@ -74,7 +74,7 @@ void SWLoadIndexedXF(u32 val, int array) int size = ((val >> 12) & 0xF) + 1; //load stuff from array to address in xf mem - u32 *pData = 
(u32*)Memory::GetPointer(arraybases[array] + arraystrides[array]*index); + u32 *pData = (u32*)Memory::GetPointer(g_main_cp_state.array_bases[array] + g_main_cp_state.array_strides[array]*index); // byteswap data u32 buffer[16]; diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h index 0d50f3ac51..346af479f2 100644 --- a/Source/Core/VideoCommon/BPMemory.h +++ b/Source/Core/VideoCommon/BPMemory.h @@ -1085,5 +1085,6 @@ struct BPMemory extern BPMemory bpmem; void LoadBPReg(u32 value0); +void LoadBPRegPreprocess(u32 value0); void GetBPRegInfo(const u8* data, std::string* name, std::string* desc); diff --git a/Source/Core/VideoCommon/BPStructs.cpp b/Source/Core/VideoCommon/BPStructs.cpp index 308badae6e..152d15a7d5 100644 --- a/Source/Core/VideoCommon/BPStructs.cpp +++ b/Source/Core/VideoCommon/BPStructs.cpp @@ -173,7 +173,8 @@ static void BPWritten(const BPCmd& bp) switch (bp.newvalue & 0xFF) { case 0x02: - PixelEngine::SetFinish(); // may generate interrupt + if (!g_use_deterministic_gpu_thread) + PixelEngine::SetFinish(); // may generate interrupt DEBUG_LOG(VIDEO, "GXSetDrawDone SetPEFinish (value: 0x%02X)", (bp.newvalue & 0xFFFF)); return; @@ -183,11 +184,13 @@ static void BPWritten(const BPCmd& bp) } return; case BPMEM_PE_TOKEN_ID: // Pixel Engine Token ID - PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), false); + if (!g_use_deterministic_gpu_thread) + PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), false); DEBUG_LOG(VIDEO, "SetPEToken 0x%04x", (bp.newvalue & 0xFFFF)); return; case BPMEM_PE_TOKEN_INT_ID: // Pixel Engine Interrupt Token ID - PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), true); + if (!g_use_deterministic_gpu_thread) + PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), true); DEBUG_LOG(VIDEO, "SetPEToken + INT 0x%04x", (bp.newvalue & 0xFFFF)); return; @@ -685,6 +688,26 @@ void LoadBPReg(u32 value0) BPWritten(bp); } +void LoadBPRegPreprocess(u32 value0) +{ + int regNum = value0 >> 24; 
+ // masking could hypothetically be a problem + u32 newval = value0 & 0xffffff; + switch (regNum) + { + case BPMEM_SETDRAWDONE: + if ((newval & 0xff) == 0x02) + PixelEngine::SetFinish(); + break; + case BPMEM_PE_TOKEN_ID: + PixelEngine::SetToken(newval & 0xffff, false); + break; + case BPMEM_PE_TOKEN_INT_ID: // Pixel Engine Interrupt Token ID + PixelEngine::SetToken(newval & 0xffff, true); + break; + } +} + void GetBPRegInfo(const u8* data, std::string* name, std::string* desc) { const char* no_yes[2] = { "No", "Yes" }; diff --git a/Source/Core/VideoCommon/BPStructs.h b/Source/Core/VideoCommon/BPStructs.h index 2a99443346..a1dc48b821 100644 --- a/Source/Core/VideoCommon/BPStructs.h +++ b/Source/Core/VideoCommon/BPStructs.h @@ -7,5 +7,4 @@ #include "VideoCommon/BPMemory.h" void BPInit(); -void LoadBPReg(u32 value0); void BPReload(); diff --git a/Source/Core/VideoCommon/CPMemory.cpp b/Source/Core/VideoCommon/CPMemory.cpp index 752063065b..739a3e66e3 100644 --- a/Source/Core/VideoCommon/CPMemory.cpp +++ b/Source/Core/VideoCommon/CPMemory.cpp @@ -2,17 +2,32 @@ // Licensed under GPLv2 // Refer to the license.txt file included. +#include "Common/ChunkFile.h" #include "Common/CommonTypes.h" #include "VideoCommon/CPMemory.h" // CP state u8 *cached_arraybases[16]; -// STATE_TO_SAVE -u32 arraybases[16]; -u32 arraystrides[16]; -TMatrixIndexA MatrixIndexA; -TMatrixIndexB MatrixIndexB; -TVtxDesc g_VtxDesc; -// Most games only use the first VtxAttr and simply reconfigure it all the time as needed. -VAT g_VtxAttr[8]; +CPState g_main_cp_state; +CPState g_preprocess_cp_state; + +void DoCPState(PointerWrap& p) +{ + // We don't save g_preprocess_cp_state separately because the GPU should be + // synced around state save/load. 
+ p.DoArray(g_main_cp_state.array_bases, 16); + p.DoArray(g_main_cp_state.array_strides, 16); + p.Do(g_main_cp_state.matrix_index_a); + p.Do(g_main_cp_state.matrix_index_b); + p.Do(g_main_cp_state.vtx_desc.Hex); + p.DoArray(g_main_cp_state.vtx_attr, 8); + p.DoMarker("CP Memory"); + if (p.mode == PointerWrap::MODE_READ) + CopyPreprocessCPStateFromMain(); +} + +void CopyPreprocessCPStateFromMain() +{ + memcpy(&g_preprocess_cp_state, &g_main_cp_state, sizeof(CPState)); +} diff --git a/Source/Core/VideoCommon/CPMemory.h b/Source/Core/VideoCommon/CPMemory.h index cac82e0be0..ae8ff08303 100644 --- a/Source/Core/VideoCommon/CPMemory.h +++ b/Source/Core/VideoCommon/CPMemory.h @@ -231,12 +231,6 @@ union TMatrixIndexB #pragma pack() -extern u32 arraybases[16]; -extern u8 *cached_arraybases[16]; -extern u32 arraystrides[16]; -extern TMatrixIndexA MatrixIndexA; -extern TMatrixIndexB MatrixIndexB; - struct VAT { UVAT_group0 g0; @@ -244,11 +238,37 @@ struct VAT UVAT_group2 g2; }; -extern TVtxDesc g_VtxDesc; -extern VAT g_VtxAttr[8]; +class VertexLoader; + +// STATE_TO_SAVE +struct CPState final +{ + u32 array_bases[16]; + u32 array_strides[16]; + TMatrixIndexA matrix_index_a; + TMatrixIndexB matrix_index_b; + TVtxDesc vtx_desc; + // Most games only use the first VtxAttr and simply reconfigure it all the time as needed. + VAT vtx_attr[8]; + + // Attributes that actually belong to VertexLoaderManager: + int attr_dirty; // bitfield + VertexLoader* vertex_loaders[8]; +}; + +class PointerWrap; + +extern void DoCPState(PointerWrap& p); + +extern void CopyPreprocessCPStateFromMain(); + +extern CPState g_main_cp_state; +extern CPState g_preprocess_cp_state; + +extern u8 *cached_arraybases[16]; // Might move this into its own file later. 
-void LoadCPReg(u32 SubCmd, u32 Value); +void LoadCPReg(u32 SubCmd, u32 Value, bool is_preprocess = false); // Fills memory with data from CP regs void FillCPMemoryArray(u32 *memory); diff --git a/Source/Core/VideoCommon/CommandProcessor.cpp b/Source/Core/VideoCommon/CommandProcessor.cpp index aa9f8c4f28..6f8997cc58 100644 --- a/Source/Core/VideoCommon/CommandProcessor.cpp +++ b/Source/Core/VideoCommon/CommandProcessor.cpp @@ -77,7 +77,7 @@ void DoState(PointerWrap &p) p.Do(interruptFinishWaiting); } -UNUSED static inline void WriteLow(volatile u32& _reg, u16 lowbits) +static inline void WriteLow(volatile u32& _reg, u16 lowbits) { Common::AtomicStore(_reg, (_reg & 0xFFFF0000) | lowbits); } @@ -159,9 +159,8 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) { FIFO_WRITE_POINTER_LO, MMIO::Utils::LowPart(&fifo.CPWritePointer), false, true }, { FIFO_WRITE_POINTER_HI, MMIO::Utils::HighPart(&fifo.CPWritePointer) }, // FIFO_READ_POINTER has different code for single/dual core. - { FIFO_BP_LO, MMIO::Utils::LowPart(&fifo.CPBreakpoint), false, true }, - { FIFO_BP_HI, MMIO::Utils::HighPart(&fifo.CPBreakpoint) }, }; + for (auto& mapped_var : directly_mapped_vars) { u16 wmask = mapped_var.writes_align_to_32_bytes ? 0xFFE0 : 0xFFFF; @@ -173,6 +172,19 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) ); } + mmio->Register(base | FIFO_BP_LO, + MMIO::DirectRead(MMIO::Utils::LowPart(&fifo.CPBreakpoint)), + MMIO::ComplexWrite([](u32, u16 val) { + WriteLow(fifo.CPBreakpoint, val & 0xffe0); + }) + ); + mmio->Register(base | FIFO_BP_HI, + MMIO::DirectRead(MMIO::Utils::HighPart(&fifo.CPBreakpoint)), + MMIO::ComplexWrite([](u32, u16 val) { + WriteHigh(fifo.CPBreakpoint, val); + }) + ); + // Timing and metrics MMIOs are stubbed with fixed values. 
struct { u32 addr; @@ -216,8 +228,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) UCPCtrlReg tmp(val); m_CPCtrlReg.Hex = tmp.Hex; SetCpControlRegister(); - if (!IsOnThread()) - RunGpu(); + RunGpu(); }) ); @@ -227,8 +238,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) UCPClearReg tmp(val); m_CPClearReg.Hex = tmp.Hex; SetCpClearRegister(); - if (!IsOnThread()) - RunGpu(); + RunGpu(); }) ); @@ -260,6 +270,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) : MMIO::DirectRead(MMIO::Utils::HighPart(&fifo.CPReadWriteDistance)), MMIO::ComplexWrite([](u32, u16 val) { WriteHigh(fifo.CPReadWriteDistance, val); + SyncGPU(SYNC_GPU_OTHER); if (fifo.CPReadWriteDistance == 0) { GPFifo::ResetGatherPipe(); @@ -269,8 +280,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) { ResetVideoBuffer(); } - if (!IsOnThread()) - RunGpu(); + RunGpu(); }) ); mmio->Register(base | FIFO_READ_POINTER_LO, @@ -298,11 +308,7 @@ void STACKALIGN GatherPipeBursted() // if we aren't linked, we don't care about gather pipe data if (!m_CPCtrlReg.GPLinkEnable) { - if (!IsOnThread()) - { - RunGpu(); - } - else + if (IsOnThread() && !g_use_deterministic_gpu_thread) { // In multibuffer mode is not allowed write in the same FIFO attached to the GPU. // Fix Pokemon XD in DC mode. 
@@ -313,6 +319,10 @@ void STACKALIGN GatherPipeBursted() ProcessFifoAllDistance(); } } + else + { + RunGpu(); + } return; } @@ -327,8 +337,7 @@ void STACKALIGN GatherPipeBursted() Common::AtomicAdd(fifo.CPReadWriteDistance, GATHER_PIPE_SIZE); - if (!IsOnThread()) - RunGpu(); + RunGpu(); _assert_msg_(COMMANDPROCESSOR, fifo.CPReadWriteDistance <= fifo.CPEnd - fifo.CPBase, "FIFO is overflowed by GatherPipe !\nCPU thread is too fast!"); @@ -358,7 +367,8 @@ void UpdateInterrupts(u64 userdata) void UpdateInterruptsFromVideoBackend(u64 userdata) { - CoreTiming::ScheduleEvent_Threadsafe(0, et_UpdateInterrupts, userdata); + if (!g_use_deterministic_gpu_thread) + CoreTiming::ScheduleEvent_Threadsafe(0, et_UpdateInterrupts, userdata); } void SetCPStatusFromGPU() diff --git a/Source/Core/VideoCommon/CommandProcessor.h b/Source/Core/VideoCommon/CommandProcessor.h index b29816e8fe..0dad1578af 100644 --- a/Source/Core/VideoCommon/CommandProcessor.h +++ b/Source/Core/VideoCommon/CommandProcessor.h @@ -16,6 +16,7 @@ namespace CommandProcessor { extern SCPFifoStruct fifo; //This one is shared between gfx thread and emulator thread. + extern volatile bool isPossibleWaitingSetDrawDone; //This one is used for sync gfx thread and emulator thread. 
extern volatile bool interruptSet; extern volatile bool interruptWaiting; diff --git a/Source/Core/VideoCommon/DataReader.h b/Source/Core/VideoCommon/DataReader.h index 85beec3a11..fcb89d36e4 100644 --- a/Source/Core/VideoCommon/DataReader.h +++ b/Source/Core/VideoCommon/DataReader.h @@ -6,7 +6,7 @@ #include "VideoCommon/VertexManagerBase.h" -extern u8* g_pVideoData; +extern u8* g_video_buffer_read_ptr; #if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__) #include @@ -14,20 +14,20 @@ extern u8* g_pVideoData; __forceinline void DataSkip(u32 skip) { - g_pVideoData += skip; + g_video_buffer_read_ptr += skip; } // probably unnecessary template __forceinline void DataSkip() { - g_pVideoData += count; + g_video_buffer_read_ptr += count; } template -__forceinline T DataPeek(int _uOffset) +__forceinline T DataPeek(int _uOffset, u8** bufp = &g_video_buffer_read_ptr) { - auto const result = Common::FromBigEndian(*reinterpret_cast(g_pVideoData + _uOffset)); + auto const result = Common::FromBigEndian(*reinterpret_cast(*bufp + _uOffset)); return result; } @@ -48,18 +48,18 @@ __forceinline u32 DataPeek32(int _uOffset) } template -__forceinline T DataRead() +__forceinline T DataRead(u8** bufp = &g_video_buffer_read_ptr) { - auto const result = DataPeek(0); - DataSkip(); + auto const result = DataPeek(0, bufp); + *bufp += sizeof(T); return result; } class DataReader { public: - inline DataReader() : buffer(g_pVideoData), offset(0) {} - inline ~DataReader() { g_pVideoData += offset; } + inline DataReader() : buffer(g_video_buffer_read_ptr), offset(0) {} + inline ~DataReader() { g_video_buffer_read_ptr += offset; } template inline T Read() { const T result = Common::FromBigEndian(*(T*)(buffer + offset)); @@ -94,14 +94,14 @@ __forceinline u32 DataReadU32() __forceinline u32 DataReadU32Unswapped() { - u32 tmp = *(u32*)g_pVideoData; - g_pVideoData += 4; + u32 tmp = *(u32*)g_video_buffer_read_ptr; + g_video_buffer_read_ptr += 4; return tmp; } __forceinline u8* 
DataGetPosition() { - return g_pVideoData; + return g_video_buffer_read_ptr; } template diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp index 4e1b52dfa1..44322d4d3b 100644 --- a/Source/Core/VideoCommon/Fifo.cpp +++ b/Source/Core/VideoCommon/Fifo.cpp @@ -11,32 +11,63 @@ #include "Core/ConfigManager.h" #include "Core/Core.h" #include "Core/CoreTiming.h" +#include "Core/NetPlayProto.h" #include "Core/HW/Memmap.h" #include "VideoCommon/CommandProcessor.h" +#include "VideoCommon/CPMemory.h" #include "VideoCommon/DataReader.h" #include "VideoCommon/Fifo.h" #include "VideoCommon/OpcodeDecoding.h" #include "VideoCommon/PixelEngine.h" +#include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VideoConfig.h" bool g_bSkipCurrentFrame = false; -namespace -{ static volatile bool GpuRunningState = false; static volatile bool EmuRunningState = false; static std::mutex m_csHWVidOccupied; + +// Most of this array is unlikely to be faulted in... +static u8 s_fifo_aux_data[FIFO_SIZE]; +static u8* s_fifo_aux_write_ptr; +static u8* s_fifo_aux_read_ptr; + +bool g_use_deterministic_gpu_thread; + // STATE_TO_SAVE -static u8 *videoBuffer; -static int size = 0; -} // namespace +static std::mutex s_video_buffer_lock; +static std::condition_variable s_video_buffer_cond; +static u8* s_video_buffer; +u8* g_video_buffer_read_ptr; +static std::atomic s_video_buffer_write_ptr; +static std::atomic s_video_buffer_seen_ptr; +u8* g_video_buffer_pp_read_ptr; +// The read_ptr is always owned by the GPU thread. In normal mode, so is the +// write_ptr, despite it being atomic. In g_use_deterministic_gpu_thread mode, +// things get a bit more complicated: +// - The seen_ptr is written by the GPU thread, and points to what it's already +// processed as much of as possible - in the case of a partial command which +// caused it to stop, not the same as the read ptr. It's written by the GPU, +// under the lock, and updating the cond. 
+// - The write_ptr is written by the CPU thread after it copies data from the +// FIFO. Maybe someday it will be under the lock. For now, because RunGpuLoop +// polls, it's just atomic. +// - The pp_read_ptr is the CPU preprocessing version of the read_ptr. void Fifo_DoState(PointerWrap &p) { - p.DoArray(videoBuffer, FIFO_SIZE); - p.Do(size); - p.DoPointer(g_pVideoData, videoBuffer); + p.DoArray(s_video_buffer, FIFO_SIZE); + u8* write_ptr = s_video_buffer_write_ptr; + p.DoPointer(write_ptr, s_video_buffer); + s_video_buffer_write_ptr = write_ptr; + p.DoPointer(g_video_buffer_read_ptr, s_video_buffer); + if (p.mode == PointerWrap::MODE_READ && g_use_deterministic_gpu_thread) + { + // We're good and paused, right? + s_video_buffer_seen_ptr = g_video_buffer_pp_read_ptr = g_video_buffer_read_ptr; + } p.Do(g_bSkipCurrentFrame); } @@ -44,6 +75,7 @@ void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock) { if (doLock) { + SyncGPU(SYNC_GPU_OTHER); EmulatorState(false); if (!Core::IsGPUThread()) m_csHWVidOccupied.lock(); @@ -61,8 +93,8 @@ void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock) void Fifo_Init() { - videoBuffer = (u8*)AllocateMemoryPages(FIFO_SIZE); - size = 0; + s_video_buffer = (u8*)AllocateMemoryPages(FIFO_SIZE); + ResetVideoBuffer(); GpuRunningState = false; Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); } @@ -70,18 +102,24 @@ void Fifo_Init() void Fifo_Shutdown() { if (GpuRunningState) PanicAlert("Fifo shutting down while active"); - FreeMemoryPages(videoBuffer, FIFO_SIZE); - videoBuffer = nullptr; + FreeMemoryPages(s_video_buffer, FIFO_SIZE); + s_video_buffer = nullptr; + s_video_buffer_write_ptr = nullptr; + g_video_buffer_pp_read_ptr = nullptr; + g_video_buffer_read_ptr = nullptr; + s_video_buffer_seen_ptr = nullptr; + s_fifo_aux_write_ptr = nullptr; + s_fifo_aux_read_ptr = nullptr; } u8* GetVideoBufferStartPtr() { - return videoBuffer; + return s_video_buffer; } u8* GetVideoBufferEndPtr() { - return 
&videoBuffer[size]; + return s_video_buffer_write_ptr; } void Fifo_SetRendering(bool enabled) @@ -107,30 +145,123 @@ void EmulatorState(bool running) EmuRunningState = running; } +void SyncGPU(SyncGPUReason reason, bool may_move_read_ptr) +{ + if (g_use_deterministic_gpu_thread && GpuRunningState) + { + std::unique_lock lk(s_video_buffer_lock); + u8* write_ptr = s_video_buffer_write_ptr; + s_video_buffer_cond.wait(lk, [&]() { + return !GpuRunningState || s_video_buffer_seen_ptr == write_ptr; + }); + if (!GpuRunningState) + return; + + // Opportunistically reset FIFOs so we don't wrap around. + if (may_move_read_ptr && s_fifo_aux_write_ptr != s_fifo_aux_read_ptr) + PanicAlert("aux fifo not synced (%p, %p)", s_fifo_aux_write_ptr, s_fifo_aux_read_ptr); + + memmove(s_fifo_aux_data, s_fifo_aux_read_ptr, s_fifo_aux_write_ptr - s_fifo_aux_read_ptr); + s_fifo_aux_write_ptr -= (s_fifo_aux_read_ptr - s_fifo_aux_data); + s_fifo_aux_read_ptr = s_fifo_aux_data; + + if (may_move_read_ptr) + { + // what's left over in the buffer + size_t size = write_ptr - g_video_buffer_pp_read_ptr; + + memmove(s_video_buffer, g_video_buffer_pp_read_ptr, size); + // This change always decreases the pointers. We write seen_ptr + // after write_ptr here, and read it before in RunGpuLoop, so + // 'write_ptr > seen_ptr' there cannot become spuriously true. + s_video_buffer_write_ptr = write_ptr = s_video_buffer + size; + g_video_buffer_pp_read_ptr = s_video_buffer; + g_video_buffer_read_ptr = s_video_buffer; + s_video_buffer_seen_ptr = write_ptr; + } + } +} + +void PushFifoAuxBuffer(void* ptr, size_t size) +{ + if (size > (size_t) (s_fifo_aux_data + FIFO_SIZE - s_fifo_aux_write_ptr)) + { + SyncGPU(SYNC_GPU_AUX_SPACE, /* may_move_read_ptr */ false); + if (size > (size_t) (s_fifo_aux_data + FIFO_SIZE - s_fifo_aux_write_ptr)) + { + // That will sync us up to the last 32 bytes, so this short region + // of FIFO would have to point to a 2MB display list or something. 
+ PanicAlert("absurdly large aux buffer"); + return; + } + } + memcpy(s_fifo_aux_write_ptr, ptr, size); + s_fifo_aux_write_ptr += size; +} + +void* PopFifoAuxBuffer(size_t size) +{ + void* ret = s_fifo_aux_read_ptr; + s_fifo_aux_read_ptr += size; + return ret; +} // Description: RunGpuLoop() sends data through this function. -void ReadDataFromFifo(u8* _uData, u32 len) +static void ReadDataFromFifo(u8* _uData, u32 len) { - if (size + len >= FIFO_SIZE) + if (len > (s_video_buffer + FIFO_SIZE - s_video_buffer_write_ptr)) { - int pos = (int)(g_pVideoData - videoBuffer); - size -= pos; - if (size + len > FIFO_SIZE) + size_t size = s_video_buffer_write_ptr - g_video_buffer_read_ptr; + if (len > FIFO_SIZE - size) { - PanicAlert("FIFO out of bounds (size = %i, len = %i at %08x)", size, len, pos); + PanicAlert("FIFO out of bounds (existing %lu + new %lu > %lu)", (unsigned long) size, (unsigned long) len, (unsigned long) FIFO_SIZE); + return; } - memmove(&videoBuffer[0], &videoBuffer[pos], size); - g_pVideoData = videoBuffer; + memmove(s_video_buffer, g_video_buffer_read_ptr, size); + s_video_buffer_write_ptr = s_video_buffer + size; + g_video_buffer_read_ptr = s_video_buffer; } - // Copy new video instructions to videoBuffer for future use in rendering the new picture - memcpy(videoBuffer + size, _uData, len); - size += len; + // Copy new video instructions to s_video_buffer for future use in rendering the new picture + memcpy(s_video_buffer_write_ptr, _uData, len); + s_video_buffer_write_ptr += len; +} + +// The deterministic_gpu_thread version. +static void ReadDataFromFifoOnCPU(u8* _uData, u32 len) +{ + u8 *write_ptr = s_video_buffer_write_ptr; + if (len > (s_video_buffer + FIFO_SIZE - write_ptr)) + { + // We can't wrap around while the GPU is working on the data. + // This should be very rare due to the reset in SyncGPU. 
+ SyncGPU(SYNC_GPU_WRAPAROUND); + if (g_video_buffer_pp_read_ptr != g_video_buffer_read_ptr) + { + PanicAlert("desynced read pointers"); + return; + } + write_ptr = s_video_buffer_write_ptr; + size_t size = write_ptr - g_video_buffer_pp_read_ptr; + if (len > FIFO_SIZE - size) + { + PanicAlert("FIFO out of bounds (existing %lu + new %lu > %lu)", (unsigned long) size, (unsigned long) len, (unsigned long) FIFO_SIZE); + return; + } + } + memcpy(write_ptr, _uData, len); + OpcodeDecoder_Preprocess(write_ptr + len); + // This would have to be locked if the GPU thread didn't spin. + s_video_buffer_write_ptr = write_ptr + len; } void ResetVideoBuffer() { - g_pVideoData = videoBuffer; - size = 0; + g_video_buffer_read_ptr = s_video_buffer; + s_video_buffer_write_ptr = s_video_buffer; + s_video_buffer_seen_ptr = s_video_buffer; + g_video_buffer_pp_read_ptr = s_video_buffer; + s_fifo_aux_write_ptr = s_fifo_aux_data; + s_fifo_aux_read_ptr = s_fifo_aux_data; } @@ -148,53 +279,75 @@ void RunGpuLoop() g_video_backend->PeekMessages(); VideoFifo_CheckAsyncRequest(); - - CommandProcessor::SetCPStatusFromGPU(); - - Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); - - // check if we are able to run this buffer - while (GpuRunningState && EmuRunningState && !CommandProcessor::interruptWaiting && fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint()) + if (g_use_deterministic_gpu_thread) { - fifo.isGpuReadingData = true; - CommandProcessor::isPossibleWaitingSetDrawDone = fifo.bFF_GPLinkEnable ? true : false; - - if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU || Common::AtomicLoad(CommandProcessor::VITicks) > CommandProcessor::m_cpClockOrigin) + // All the fifo/CP stuff is on the CPU. We just need to run the opcode decoder. 
+ u8* seen_ptr = s_video_buffer_seen_ptr; + u8* write_ptr = s_video_buffer_write_ptr; + // See comment in SyncGPU + if (write_ptr > seen_ptr) { - u32 readPtr = fifo.CPReadPointer; - u8 *uData = Memory::GetPointer(readPtr); + OpcodeDecoder_Run(write_ptr); - if (readPtr == fifo.CPEnd) - readPtr = fifo.CPBase; - else - readPtr += 32; - - _assert_msg_(COMMANDPROCESSOR, (s32)fifo.CPReadWriteDistance - 32 >= 0 , - "Negative fifo.CPReadWriteDistance = %i in FIFO Loop !\nThat can produce instability in the game. Please report it.", fifo.CPReadWriteDistance - 32); - - ReadDataFromFifo(uData, 32); - - cyclesExecuted = OpcodeDecoder_Run(GetVideoBufferEndPtr()); - - if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) >= cyclesExecuted) - Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted); - - Common::AtomicStore(fifo.CPReadPointer, readPtr); - Common::AtomicAdd(fifo.CPReadWriteDistance, -32); - if ((GetVideoBufferEndPtr() - g_pVideoData) == 0) - Common::AtomicStore(fifo.SafeCPReadPointer, fifo.CPReadPointer); + { + std::lock_guard vblk(s_video_buffer_lock); + s_video_buffer_seen_ptr = write_ptr; + s_video_buffer_cond.notify_all(); + } } - + } + else + { CommandProcessor::SetCPStatusFromGPU(); - // This call is pretty important in DualCore mode and must be called in the FIFO Loop. - // If we don't, s_swapRequested or s_efbAccessRequested won't be set to false - // leading the CPU thread to wait in Video_BeginField or Video_AccessEFB thus slowing things down. 
- VideoFifo_CheckAsyncRequest(); - CommandProcessor::isPossibleWaitingSetDrawDone = false; - } + Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); - fifo.isGpuReadingData = false; + // check if we are able to run this buffer + while (GpuRunningState && EmuRunningState && !CommandProcessor::interruptWaiting && fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint()) + { + fifo.isGpuReadingData = true; + CommandProcessor::isPossibleWaitingSetDrawDone = fifo.bFF_GPLinkEnable ? true : false; + + if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU || Common::AtomicLoad(CommandProcessor::VITicks) > CommandProcessor::m_cpClockOrigin) + { + u32 readPtr = fifo.CPReadPointer; + u8 *uData = Memory::GetPointer(readPtr); + + if (readPtr == fifo.CPEnd) + readPtr = fifo.CPBase; + else + readPtr += 32; + + _assert_msg_(COMMANDPROCESSOR, (s32)fifo.CPReadWriteDistance - 32 >= 0 , + "Negative fifo.CPReadWriteDistance = %i in FIFO Loop !\nThat can produce instability in the game. Please report it.", fifo.CPReadWriteDistance - 32); + + ReadDataFromFifo(uData, 32); + + u8* write_ptr = s_video_buffer_write_ptr; + + cyclesExecuted = OpcodeDecoder_Run(write_ptr); + + + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) >= cyclesExecuted) + Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted); + + Common::AtomicStore(fifo.CPReadPointer, readPtr); + Common::AtomicAdd(fifo.CPReadWriteDistance, -32); + if ((write_ptr - g_video_buffer_read_ptr) == 0) + Common::AtomicStore(fifo.SafeCPReadPointer, fifo.CPReadPointer); + } + + CommandProcessor::SetCPStatusFromGPU(); + + // This call is pretty important in DualCore mode and must be called in the FIFO Loop. + // If we don't, s_swapRequested or s_efbAccessRequested won't be set to false + // leading the CPU thread to wait in Video_BeginField or Video_AccessEFB thus slowing things down. 
+ VideoFifo_CheckAsyncRequest(); + CommandProcessor::isPossibleWaitingSetDrawDone = false; + } + + fifo.isGpuReadingData = false; + } if (EmuRunningState) { @@ -217,6 +370,8 @@ void RunGpuLoop() } } } + // wake up SyncGPU if we were interrupted + s_video_buffer_cond.notify_all(); } @@ -228,16 +383,27 @@ bool AtBreakpoint() void RunGpu() { + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread && + !g_use_deterministic_gpu_thread) + return; + SCPFifoStruct &fifo = CommandProcessor::fifo; while (fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint() ) { u8 *uData = Memory::GetPointer(fifo.CPReadPointer); - FPURoundMode::SaveSIMDState(); - FPURoundMode::LoadDefaultSIMDState(); - ReadDataFromFifo(uData, 32); - OpcodeDecoder_Run(GetVideoBufferEndPtr()); - FPURoundMode::LoadSIMDState(); + if (g_use_deterministic_gpu_thread) + { + ReadDataFromFifoOnCPU(uData, 32); + } + else + { + FPURoundMode::SaveSIMDState(); + FPURoundMode::LoadDefaultSIMDState(); + ReadDataFromFifo(uData, 32); + OpcodeDecoder_Run(s_video_buffer_write_ptr); + FPURoundMode::LoadSIMDState(); + } //DEBUG_LOG(COMMANDPROCESSOR, "Fifo wraps to base"); @@ -250,3 +416,45 @@ void RunGpu() } CommandProcessor::SetCPStatusFromGPU(); } + +void Fifo_UpdateWantDeterminism(bool want) +{ + // We are paused (or not running at all yet) and have m_csHWVidOccupied, so + // it should be safe to change this. + const SCoreStartupParameter& param = SConfig::GetInstance().m_LocalCoreStartupParameter; + bool gpu_thread; + switch (param.m_GPUDeterminismMode) + { + case GPU_DETERMINISM_AUTO: + gpu_thread = want; + + // Hack: For now movies are an exception to this being on (but not + // to wanting determinism in general). Once vertex arrays are + // fixed, there should be no reason to want this off for movies by + // default, so this can be removed. 
+ if (!NetPlay::IsNetPlayRunning()) + gpu_thread = false; + + break; + case GPU_DETERMINISM_NONE: + gpu_thread = false; + break; + case GPU_DETERMINISM_FAKE_COMPLETION: + gpu_thread = true; + break; + } + + gpu_thread = gpu_thread && SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread; + + if (g_use_deterministic_gpu_thread != gpu_thread) + { + g_use_deterministic_gpu_thread = gpu_thread; + if (gpu_thread) + { + // These haven't been updated in non-deterministic mode. + s_video_buffer_seen_ptr = g_video_buffer_pp_read_ptr = g_video_buffer_read_ptr; + CopyPreprocessCPStateFromMain(); + VertexLoaderManager::MarkAllDirty(); + } + } +} diff --git a/Source/Core/VideoCommon/Fifo.h b/Source/Core/VideoCommon/Fifo.h index 389336be0a..40a5ad84b7 100644 --- a/Source/Core/VideoCommon/Fifo.h +++ b/Source/Core/VideoCommon/Fifo.h @@ -13,6 +13,11 @@ class PointerWrap; extern bool g_bSkipCurrentFrame; +// This could be in SCoreStartupParameter, but it depends on multiple settings +// and can change at runtime. +extern bool g_use_deterministic_gpu_thread; +extern std::atomic g_video_buffer_write_ptr_xthread; +extern u8* g_video_buffer_pp_read_ptr; void Fifo_Init(); void Fifo_Shutdown(); @@ -22,8 +27,23 @@ u8* GetVideoBufferEndPtr(); void Fifo_DoState(PointerWrap &f); void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock); +void Fifo_UpdateWantDeterminism(bool want); -void ReadDataFromFifo(u8* _uData, u32 len); +// Used for diagnostics. +enum SyncGPUReason { + SYNC_GPU_NONE, + SYNC_GPU_OTHER, + SYNC_GPU_WRAPAROUND, + SYNC_GPU_EFB_POKE, + SYNC_GPU_PERFQUERY, + SYNC_GPU_SWAP, + SYNC_GPU_AUX_SPACE, +}; +// In g_use_deterministic_gpu_thread mode, waits for the GPU to be done with pending work. 
+void SyncGPU(SyncGPUReason reason, bool may_move_read_ptr = true); + +void PushFifoAuxBuffer(void* ptr, size_t size); +void* PopFifoAuxBuffer(size_t size); void RunGpu(); void RunGpuLoop(); diff --git a/Source/Core/VideoCommon/MainBase.cpp b/Source/Core/VideoCommon/MainBase.cpp index 99fac60eae..d775cfe64c 100644 --- a/Source/Core/VideoCommon/MainBase.cpp +++ b/Source/Core/VideoCommon/MainBase.cpp @@ -118,6 +118,7 @@ void VideoBackendHardware::Video_EndField() { if (s_BackendInitialized) { + SyncGPU(SYNC_GPU_SWAP); s_swapRequested.Set(); } } @@ -153,6 +154,8 @@ u32 VideoBackendHardware::Video_AccessEFB(EFBAccessType type, u32 x, u32 y, u32 { if (s_BackendInitialized && g_ActiveConfig.bEFBAccessEnable) { + SyncGPU(SYNC_GPU_EFB_POKE); + s_accessEFBArgs.type = type; s_accessEFBArgs.x = x; s_accessEFBArgs.y = y; @@ -194,6 +197,8 @@ u32 VideoBackendHardware::Video_GetQueryResult(PerfQueryType type) return 0; } + SyncGPU(SYNC_GPU_PERFQUERY); + // TODO: Is this check sane? if (!g_perf_query->IsFlushed()) { @@ -304,3 +309,8 @@ void VideoBackendHardware::RegisterCPMMIO(MMIO::Mapping* mmio, u32 base) CommandProcessor::RegisterMMIO(mmio, base); } +void VideoBackendHardware::UpdateWantDeterminism(bool want) +{ + Fifo_UpdateWantDeterminism(want); +} + diff --git a/Source/Core/VideoCommon/NativeVertexFormat.h b/Source/Core/VideoCommon/NativeVertexFormat.h index ebc95cdc45..8778423d3f 100644 --- a/Source/Core/VideoCommon/NativeVertexFormat.h +++ b/Source/Core/VideoCommon/NativeVertexFormat.h @@ -4,7 +4,7 @@ #pragma once -#include "Common/CommonTypes.h" +#include "Common/Hash.h" // m_components enum @@ -87,6 +87,20 @@ struct PortableVertexDeclaration } }; +namespace std +{ + +template <> +struct hash +{ + size_t operator()(const PortableVertexDeclaration& decl) const + { + return HashFletcher((u8 *) &decl, sizeof(decl)); + } +}; + +} + // The implementation of this class is specific for GL/DX, so NativeVertexFormat.cpp // is in the respective backend, not here in VideoCommon. 
diff --git a/Source/Core/VideoCommon/OpcodeDecoding.cpp b/Source/Core/VideoCommon/OpcodeDecoding.cpp index e9a20a526a..1bb5fae940 100644 --- a/Source/Core/VideoCommon/OpcodeDecoding.cpp +++ b/Source/Core/VideoCommon/OpcodeDecoding.cpp @@ -24,6 +24,7 @@ #include "VideoCommon/DataReader.h" #include "VideoCommon/Fifo.h" #include "VideoCommon/OpcodeDecoding.h" +#include "VideoCommon/PixelEngine.h" #include "VideoCommon/Statistics.h" #include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VideoCommon.h" @@ -31,25 +32,29 @@ #include "VideoCommon/XFMemory.h" -u8* g_pVideoData = nullptr; bool g_bRecordFifoData = false; static u32 InterpretDisplayList(u32 address, u32 size) { - u8* old_pVideoData = g_pVideoData; - u8* startAddress = Memory::GetPointer(address); + u8* old_pVideoData = g_video_buffer_read_ptr; + u8* startAddress; + + if (g_use_deterministic_gpu_thread) + startAddress = (u8*) PopFifoAuxBuffer(size); + else + startAddress = Memory::GetPointer(address); u32 cycles = 0; // Avoid the crash if Memory::GetPointer failed .. 
if (startAddress != nullptr) { - g_pVideoData = startAddress; + g_video_buffer_read_ptr = startAddress; // temporarily swap dl and non-dl (small "hack" for the stats) Statistics::SwapDL(); - u8 *end = g_pVideoData + size; + u8 *end = g_video_buffer_read_ptr + size; cycles = OpcodeDecoder_Run(end); INCSTAT(stats.thisFrame.numDListsCalled); @@ -58,16 +63,34 @@ static u32 InterpretDisplayList(u32 address, u32 size) } // reset to the old pointer - g_pVideoData = old_pVideoData; + g_video_buffer_read_ptr = old_pVideoData; return cycles; } +static void InterpretDisplayListPreprocess(u32 address, u32 size) +{ + u8* old_read_ptr = g_video_buffer_pp_read_ptr; + u8* startAddress = Memory::GetPointer(address); + + PushFifoAuxBuffer(startAddress, size); + + if (startAddress != nullptr) + { + g_video_buffer_pp_read_ptr = startAddress; + + u8 *end = startAddress + size; + OpcodeDecoder_Preprocess(end); + } + + g_video_buffer_pp_read_ptr = old_read_ptr; +} + static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) { // TODO(Omega): Maybe dump FIFO to file on this error std::string temp = StringFromFormat( - "GFX FIFO: Unknown Opcode (0x%x @ %p).\n" + "GFX FIFO: Unknown Opcode (0x%x @ %p, preprocessing=%s).\n" "This means one of the following:\n" "* The emulated GPU got desynced, disabling dual core can help\n" "* Command stream corrupted by some spurious memory bug\n" @@ -75,7 +98,8 @@ static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) "* Some other sort of bug\n\n" "Dolphin will now likely crash or hang. Enjoy." , cmd_byte, - buffer); + buffer, + preprocess ? 
"yes" : "no"); Host_SysMessage(temp.c_str()); INFO_LOG(VIDEO, "%s", temp.c_str()); { @@ -105,14 +129,16 @@ static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) } } +template static u32 Decode(u8* end) { - u8 *opcodeStart = g_pVideoData; - if (g_pVideoData == end) + u8 *opcodeStart = *bufp; + if (*bufp == end) return 0; - u8 cmd_byte = DataReadU8(); + u8 cmd_byte = DataRead(bufp); u32 cycles; + int refarray; switch (cmd_byte) { case GX_NOP: @@ -121,64 +147,72 @@ static u32 Decode(u8* end) case GX_LOAD_CP_REG: //0x08 { - if (end - g_pVideoData < 1 + 4) + if (end - *bufp < 1 + 4) return 0; cycles = 12; - u8 sub_cmd = DataReadU8(); - u32 value = DataReadU32(); - LoadCPReg(sub_cmd, value); - INCSTAT(stats.thisFrame.numCPLoads); + u8 sub_cmd = DataRead(bufp); + u32 value = DataRead(bufp); + LoadCPReg(sub_cmd, value, is_preprocess); + if (!is_preprocess) + INCSTAT(stats.thisFrame.numCPLoads); } break; case GX_LOAD_XF_REG: { - if (end - g_pVideoData < 4) + if (end - *bufp < 4) return 0; - u32 Cmd2 = DataReadU32(); + u32 Cmd2 = DataRead(bufp); int transfer_size = ((Cmd2 >> 16) & 15) + 1; - if ((size_t) (end - g_pVideoData) < transfer_size * sizeof(u32)) + if ((size_t) (end - *bufp) < transfer_size * sizeof(u32)) return 0; cycles = 18 + 6 * transfer_size; - u32 xf_address = Cmd2 & 0xFFFF; - LoadXFReg(transfer_size, xf_address); + if (!is_preprocess) + { + u32 xf_address = Cmd2 & 0xFFFF; + LoadXFReg(transfer_size, xf_address); - INCSTAT(stats.thisFrame.numXFLoads); + INCSTAT(stats.thisFrame.numXFLoads); + } + else + { + *bufp += transfer_size * sizeof(u32); + } } break; case GX_LOAD_INDX_A: //used for position matrices - if (end - g_pVideoData < 4) - return 0; - cycles = 6; - LoadIndexedXF(DataReadU32(), 0xC); - break; + refarray = 0xC; + goto load_indx; case GX_LOAD_INDX_B: //used for normal matrices - if (end - g_pVideoData < 4) - return 0; - cycles = 6; - LoadIndexedXF(DataReadU32(), 0xD); - break; + refarray = 0xD; + goto load_indx; case GX_LOAD_INDX_C: 
//used for postmatrices - if (end - g_pVideoData < 4) - return 0; - cycles = 6; - LoadIndexedXF(DataReadU32(), 0xE); - break; + refarray = 0xE; + goto load_indx; case GX_LOAD_INDX_D: //used for lights - if (end - g_pVideoData < 4) + refarray = 0xF; + goto load_indx; + load_indx: + if (end - *bufp < 4) return 0; cycles = 6; - LoadIndexedXF(DataReadU32(), 0xF); + if (is_preprocess) + PreprocessIndexedXF(DataRead(bufp), refarray); + else + LoadIndexedXF(DataRead(bufp), refarray); break; case GX_CMD_CALL_DL: { - if (end - g_pVideoData < 8) + if (end - *bufp < 8) return 0; - u32 address = DataReadU32(); - u32 count = DataReadU32(); - cycles = 6 + InterpretDisplayList(address, count); + u32 address = DataRead(bufp); + u32 count = DataRead(bufp); + if (is_preprocess) + InterpretDisplayListPreprocess(address, count); + else + cycles = 6 + InterpretDisplayList(address, count); } break; @@ -196,12 +230,19 @@ static u32 Decode(u8* end) // In skipped_frame case: We have to let BP writes through because they set // tokens and stuff. TODO: Call a much simplified LoadBPReg instead. 
{ - if (end - g_pVideoData < 4) + if (end - *bufp < 4) return 0; cycles = 12; - u32 bp_cmd = DataReadU32(); - LoadBPReg(bp_cmd); - INCSTAT(stats.thisFrame.numBPLoads); + u32 bp_cmd = DataRead(bufp); + if (is_preprocess) + { + LoadBPRegPreprocess(bp_cmd); + } + else + { + LoadBPReg(bp_cmd); + INCSTAT(stats.thisFrame.numBPLoads); + } } break; @@ -211,38 +252,48 @@ static u32 Decode(u8* end) { cycles = 1600; // load vertices - if (end - g_pVideoData < 2) + if (end - *bufp < 2) return 0; - u16 numVertices = DataReadU16(); + u16 num_vertices = DataRead(bufp); - if (!VertexLoaderManager::RunVertices( - cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) - (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, - numVertices, - end - g_pVideoData, - g_bSkipCurrentFrame)) + if (is_preprocess) { - return 0; + size_t size = num_vertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK, is_preprocess); + if ((size_t) (end - *bufp) < size) + return 0; + *bufp += size; + } + else + { + if (!VertexLoaderManager::RunVertices( + cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) + (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, + num_vertices, + end - *bufp, + g_bSkipCurrentFrame)) + return 0; } } else { - UnknownOpcode(cmd_byte, opcodeStart, false); + UnknownOpcode(cmd_byte, opcodeStart, is_preprocess); cycles = 1; } break; } // Display lists get added directly into the FIFO stream - if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) - FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_pVideoData - opcodeStart)); + if (!is_preprocess && g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) + FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(*bufp - opcodeStart)); - return cycles; + // In is_preprocess mode, we don't actually care about cycles, at least for + // now... make sure the compiler realizes that. + return is_preprocess ? 
1 : cycles; } void OpcodeDecoder_Init() { - g_pVideoData = GetVideoBufferStartPtr(); + g_video_buffer_read_ptr = GetVideoBufferStartPtr(); } @@ -255,14 +306,28 @@ u32 OpcodeDecoder_Run(u8* end) u32 totalCycles = 0; while (true) { - u8* old = g_pVideoData; - u32 cycles = Decode(end); + u8* old = g_video_buffer_read_ptr; + u32 cycles = Decode(end); if (cycles == 0) { - g_pVideoData = old; + g_video_buffer_read_ptr = old; break; } totalCycles += cycles; } return totalCycles; } + +void OpcodeDecoder_Preprocess(u8 *end) +{ + while (true) + { + u8* old = g_video_buffer_pp_read_ptr; + u32 cycles = Decode(end); + if (cycles == 0) + { + g_video_buffer_pp_read_ptr = old; + break; + } + } +} diff --git a/Source/Core/VideoCommon/OpcodeDecoding.h b/Source/Core/VideoCommon/OpcodeDecoding.h index 1702969825..e5b1b23e89 100644 --- a/Source/Core/VideoCommon/OpcodeDecoding.h +++ b/Source/Core/VideoCommon/OpcodeDecoding.h @@ -39,3 +39,4 @@ extern bool g_bRecordFifoData; void OpcodeDecoder_Init(); void OpcodeDecoder_Shutdown(); u32 OpcodeDecoder_Run(u8* end); +void OpcodeDecoder_Preprocess(u8* write_ptr); diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index 7f7dbc9030..b2f3cd7a4e 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -33,14 +33,11 @@ // Matrix components are first in GC format but later in PC format - we need to store it temporarily // when decoding each vertex. -static u8 s_curposmtx = MatrixIndexA.PosNormalMtxIdx; +static u8 s_curposmtx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx; static u8 s_curtexmtx[8]; static int s_texmtxwrite = 0; static int s_texmtxread = 0; -static int loop_counter; - - // Vertex loaders read these. Although the scale ones should be baked into the shader. 
int tcIndex; int colIndex; @@ -90,7 +87,7 @@ static void LOADERDECL PosMtx_Write() DataWrite(0); // Resetting current position matrix to default is needed for bbox to behave - s_curposmtx = (u8) MatrixIndexA.PosNormalMtxIdx; + s_curposmtx = (u8) g_main_cp_state.matrix_index_a.PosNormalMtxIdx; } static void LOADERDECL UpdateBoundingBoxPrepare() @@ -548,7 +545,7 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr) m_compiledCode = nullptr; m_numLoadedVertices = 0; m_VertexSize = 0; - loop_counter = 0; + m_native_vertex_format = nullptr; VertexLoader_Normal::Init(); VertexLoader_Position::Init(); VertexLoader_TextCoord::Init(); @@ -584,8 +581,11 @@ void VertexLoader::CompileVertexTranslator() PanicAlert("Trying to recompile a vertex translator"); m_compiledCode = GetCodePtr(); - // We don't use any callee saved registers or anything but RAX. - ABI_PushRegistersAndAdjustStack(0, 8); + // We only use RAX (caller saved) and RBX (callee saved). + ABI_PushRegistersAndAdjustStack(1 << RBX, 8); + + // save count + MOV(64, R(RBX), R(ABI_PARAM1)); // Start loop here const u8 *loop_start = GetCodePtr(); @@ -842,11 +842,10 @@ void VertexLoader::CompileVertexTranslator() #ifdef USE_VERTEX_LOADER_JIT // End loop here - MOV(64, R(RAX), Imm64((u64)&loop_counter)); - SUB(32, MatR(RAX), Imm8(1)); + SUB(64, R(RBX), Imm8(1)); J_CC(CC_NZ, loop_start); - ABI_PopRegistersAndAdjustStack(0, 8); + ABI_PopRegistersAndAdjustStack(1 << RBX, 8); RET(); #endif } @@ -912,8 +911,7 @@ void VertexLoader::ConvertVertices ( int count ) #ifdef USE_VERTEX_LOADER_JIT if (count > 0) { - loop_counter = count; - ((void (*)())(void*)m_compiledCode)(); + ((void (*)(int))(void*)m_compiledCode)(count); } #else for (int s = 0; s < count; s++) @@ -1035,3 +1033,22 @@ void VertexLoader::AppendToString(std::string *dest) const } dest->append(StringFromFormat(" - %i v\n", m_numLoadedVertices)); } + +NativeVertexFormat* VertexLoader::GetNativeVertexFormat() +{ + if (m_native_vertex_format) + 
return m_native_vertex_format; + auto& native = s_native_vertex_map[m_native_vtx_decl]; + if (!native) + { + auto raw_pointer = g_vertex_manager->CreateNativeVertexFormat(); + native = std::unique_ptr(raw_pointer); + native->Initialize(m_native_vtx_decl); + native->m_components = m_native_components; + } + m_native_vertex_format = native.get(); + return native.get(); + +} + +std::unordered_map> VertexLoader::s_native_vertex_map; diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h index 7be8385879..c81be52556 100644 --- a/Source/Core/VideoCommon/VertexLoader.h +++ b/Source/Core/VideoCommon/VertexLoader.h @@ -8,7 +8,9 @@ // Metroid Prime: P I16-flt N I16-s16 T0 I16-u16 T1 i16-flt #include +#include #include +#include #include "Common/CommonTypes.h" #include "Common/x64Emitter.h" @@ -114,6 +116,9 @@ public: void AppendToString(std::string *dest) const; int GetNumLoadedVerts() const { return m_numLoadedVertices; } + NativeVertexFormat* GetNativeVertexFormat(); + static void ClearNativeVertexFormatCache() { s_native_vertex_map.clear(); } + private: int m_VertexSize; // number of bytes of a raw GC vertex. Computed by CompileVertexTranslator. 
@@ -135,6 +140,9 @@ private: int m_numLoadedVertices; + NativeVertexFormat* m_native_vertex_format; + static std::unordered_map> s_native_vertex_map; + void SetVAT(const VAT& vat); void CompileVertexTranslator(); diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index dbf22c9c3d..ca925d0302 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -20,13 +21,8 @@ #include "VideoCommon/VertexShaderManager.h" #include "VideoCommon/VideoCommon.h" -static int s_attr_dirty; // bitfield - static NativeVertexFormat* s_current_vtx_fmt; -typedef std::pair VertexLoaderCacheItem; -static VertexLoaderCacheItem s_VertexLoaders[8]; - namespace std { @@ -41,35 +37,30 @@ struct hash } -typedef std::unordered_map VertexLoaderMap; -typedef std::map> NativeVertexLoaderMap; +typedef std::unordered_map> VertexLoaderMap; namespace VertexLoaderManager { -static VertexLoaderMap s_VertexLoaderMap; -static NativeVertexLoaderMap s_native_vertex_map; +static std::mutex s_vertex_loader_map_lock; +static VertexLoaderMap s_vertex_loader_map; // TODO - change into array of pointers. Keep a map of all seen so far. 
void Init() { MarkAllDirty(); - for (auto& map_entry : s_VertexLoaders) - { - map_entry.first = nullptr; - map_entry.second = nullptr; - } + for (auto& map_entry : g_main_cp_state.vertex_loaders) + map_entry = nullptr; + for (auto& map_entry : g_preprocess_cp_state.vertex_loaders) + map_entry = nullptr; RecomputeCachedArraybases(); } void Shutdown() { - for (auto& map_entry : s_VertexLoaderMap) - { - delete map_entry.second.first; - } - s_VertexLoaderMap.clear(); - s_native_vertex_map.clear(); + std::lock_guard lk(s_vertex_loader_map_lock); + s_vertex_loader_map.clear(); + VertexLoader::ClearNativeVertexFormatCache(); } namespace @@ -87,14 +78,15 @@ struct entry void AppendListToString(std::string *dest) { + std::lock_guard lk(s_vertex_loader_map_lock); std::vector entries; size_t total_size = 0; - for (const auto& map_entry : s_VertexLoaderMap) + for (const auto& map_entry : s_vertex_loader_map) { entry e; - map_entry.second.first->AppendToString(&e.text); - e.num_verts = map_entry.second.first->GetNumLoadedVerts(); + map_entry.second->AppendToString(&e.text); + e.num_verts = map_entry.second->GetNumLoadedVerts(); entries.push_back(e); total_size += e.text.size() + 1; } @@ -108,57 +100,46 @@ void AppendListToString(std::string *dest) void MarkAllDirty() { - s_attr_dirty = 0xff; + g_main_cp_state.attr_dirty = 0xff; + g_preprocess_cp_state.attr_dirty = 0xff; } -static NativeVertexFormat* GetNativeVertexFormat(const PortableVertexDeclaration& format, - u32 components) +static VertexLoader* RefreshLoader(int vtx_attr_group, CPState* state) { - auto& native = s_native_vertex_map[format]; - if (!native) + VertexLoader* loader; + if ((state->attr_dirty >> vtx_attr_group) & 1) { - auto raw_pointer = g_vertex_manager->CreateNativeVertexFormat(); - native = std::unique_ptr(raw_pointer); - native->Initialize(format); - native->m_components = components; - } - return native.get(); -} - -static VertexLoaderCacheItem RefreshLoader(int vtx_attr_group) -{ - if ((s_attr_dirty >> 
vtx_attr_group) & 1) - { - VertexLoaderUID uid(g_VtxDesc, g_VtxAttr[vtx_attr_group]); - VertexLoaderMap::iterator iter = s_VertexLoaderMap.find(uid); - if (iter != s_VertexLoaderMap.end()) + VertexLoaderUID uid(state->vtx_desc, state->vtx_attr[vtx_attr_group]); + std::lock_guard lk(s_vertex_loader_map_lock); + VertexLoaderMap::iterator iter = s_vertex_loader_map.find(uid); + if (iter != s_vertex_loader_map.end()) { - s_VertexLoaders[vtx_attr_group] = iter->second; + loader = iter->second.get(); } else { - VertexLoader* loader = new VertexLoader(g_VtxDesc, g_VtxAttr[vtx_attr_group]); - - NativeVertexFormat* vtx_fmt = GetNativeVertexFormat( - loader->GetNativeVertexDeclaration(), - loader->GetNativeComponents()); - - s_VertexLoaderMap[uid] = std::make_pair(loader, vtx_fmt); - s_VertexLoaders[vtx_attr_group] = std::make_pair(loader, vtx_fmt); + loader = new VertexLoader(state->vtx_desc, state->vtx_attr[vtx_attr_group]); + s_vertex_loader_map[uid] = std::unique_ptr(loader); INCSTAT(stats.numVertexLoaders); } + state->vertex_loaders[vtx_attr_group] = loader; + state->attr_dirty &= ~(1 << vtx_attr_group); + } else { + loader = state->vertex_loaders[vtx_attr_group]; } - s_attr_dirty &= ~(1 << vtx_attr_group); - return s_VertexLoaders[vtx_attr_group]; + return loader; } bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, bool skip_drawing) { if (!count) return true; - auto loader = RefreshLoader(vtx_attr_group); - size_t size = count * loader.first->GetVertexSize(); + CPState* state = &g_main_cp_state; + + VertexLoader* loader = RefreshLoader(vtx_attr_group, state); + + size_t size = count * loader->GetVertexSize(); if (buf_size < size) return false; @@ -169,15 +150,17 @@ bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, return true; } + NativeVertexFormat* native = loader->GetNativeVertexFormat(); + // If the native vertex format changed, force a flush. 
- if (loader.second != s_current_vtx_fmt) + if (native != s_current_vtx_fmt) VertexManager::Flush(); - s_current_vtx_fmt = loader.second; + s_current_vtx_fmt = native; VertexManager::PrepareForAdditionalData(primitive, count, - loader.first->GetNativeVertexDeclaration().stride); + loader->GetNativeVertexDeclaration().stride); - loader.first->RunVertices(g_VtxAttr[vtx_attr_group], primitive, count); + loader->RunVertices(state->vtx_attr[vtx_attr_group], primitive, count); IndexGenerator::AddIndices(primitive, count); @@ -186,9 +169,9 @@ bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, return true; } -int GetVertexSize(int vtx_attr_group) +int GetVertexSize(int vtx_attr_group, bool preprocess) { - return RefreshLoader(vtx_attr_group).first->GetVertexSize(); + return RefreshLoader(vtx_attr_group, preprocess ? &g_preprocess_cp_state : &g_main_cp_state)->GetVertexSize(); } NativeVertexFormat* GetCurrentVertexFormat() @@ -198,78 +181,83 @@ NativeVertexFormat* GetCurrentVertexFormat() } // namespace -void LoadCPReg(u32 sub_cmd, u32 value) +void LoadCPReg(u32 sub_cmd, u32 value, bool is_preprocess) { + bool update_global_state = !is_preprocess; + CPState* state = is_preprocess ? 
&g_preprocess_cp_state : &g_main_cp_state; switch (sub_cmd & 0xF0) { case 0x30: - VertexShaderManager::SetTexMatrixChangedA(value); + if (update_global_state) + VertexShaderManager::SetTexMatrixChangedA(value); break; case 0x40: - VertexShaderManager::SetTexMatrixChangedB(value); + if (update_global_state) + VertexShaderManager::SetTexMatrixChangedB(value); break; case 0x50: - g_VtxDesc.Hex &= ~0x1FFFF; // keep the Upper bits - g_VtxDesc.Hex |= value; - s_attr_dirty = 0xFF; + state->vtx_desc.Hex &= ~0x1FFFF; // keep the Upper bits + state->vtx_desc.Hex |= value; + state->attr_dirty = 0xFF; break; case 0x60: - g_VtxDesc.Hex &= 0x1FFFF; // keep the lower 17Bits - g_VtxDesc.Hex |= (u64)value << 17; - s_attr_dirty = 0xFF; + state->vtx_desc.Hex &= 0x1FFFF; // keep the lower 17Bits + state->vtx_desc.Hex |= (u64)value << 17; + state->attr_dirty = 0xFF; break; case 0x70: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g0.Hex = value; - s_attr_dirty |= 1 << (sub_cmd & 7); + state->vtx_attr[sub_cmd & 7].g0.Hex = value; + state->attr_dirty |= 1 << (sub_cmd & 7); break; case 0x80: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g1.Hex = value; - s_attr_dirty |= 1 << (sub_cmd & 7); + state->vtx_attr[sub_cmd & 7].g1.Hex = value; + state->attr_dirty |= 1 << (sub_cmd & 7); break; case 0x90: _assert_((sub_cmd & 0x0F) < 8); - g_VtxAttr[sub_cmd & 7].g2.Hex = value; - s_attr_dirty |= 1 << (sub_cmd & 7); + state->vtx_attr[sub_cmd & 7].g2.Hex = value; + state->attr_dirty |= 1 << (sub_cmd & 7); break; // Pointers to vertex arrays in GC RAM case 0xA0: - arraybases[sub_cmd & 0xF] = value; - cached_arraybases[sub_cmd & 0xF] = Memory::GetPointer(value); + state->array_bases[sub_cmd & 0xF] = value; + if (update_global_state) + cached_arraybases[sub_cmd & 0xF] = Memory::GetPointer(value); break; case 0xB0: - arraystrides[sub_cmd & 0xF] = value & 0xFF; + state->array_strides[sub_cmd & 0xF] = value & 0xFF; break; } } void FillCPMemoryArray(u32 *memory) { - memory[0x30] = 
MatrixIndexA.Hex; - memory[0x40] = MatrixIndexB.Hex; - memory[0x50] = (u32)g_VtxDesc.Hex; - memory[0x60] = (u32)(g_VtxDesc.Hex >> 17); + memory[0x30] = g_main_cp_state.matrix_index_a.Hex; + memory[0x40] = g_main_cp_state.matrix_index_b.Hex; + memory[0x50] = (u32)g_main_cp_state.vtx_desc.Hex; + memory[0x60] = (u32)(g_main_cp_state.vtx_desc.Hex >> 17); for (int i = 0; i < 8; ++i) { - memory[0x70 + i] = g_VtxAttr[i].g0.Hex; - memory[0x80 + i] = g_VtxAttr[i].g1.Hex; - memory[0x90 + i] = g_VtxAttr[i].g2.Hex; + memory[0x70 + i] = g_main_cp_state.vtx_attr[i].g0.Hex; + memory[0x80 + i] = g_main_cp_state.vtx_attr[i].g1.Hex; + memory[0x90 + i] = g_main_cp_state.vtx_attr[i].g2.Hex; } for (int i = 0; i < 16; ++i) { - memory[0xA0 + i] = arraybases[i]; - memory[0xB0 + i] = arraystrides[i]; + memory[0xA0 + i] = g_main_cp_state.array_bases[i]; + memory[0xB0 + i] = g_main_cp_state.array_strides[i]; } } @@ -277,6 +265,6 @@ void RecomputeCachedArraybases() { for (int i = 0; i < 16; i++) { - cached_arraybases[i] = Memory::GetPointer(arraybases[i]); + cached_arraybases[i] = Memory::GetPointer(g_main_cp_state.array_bases[i]); } } diff --git a/Source/Core/VideoCommon/VertexLoaderManager.h b/Source/Core/VideoCommon/VertexLoaderManager.h index 32a64b6cd8..8995ad2d7a 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.h +++ b/Source/Core/VideoCommon/VertexLoaderManager.h @@ -16,7 +16,7 @@ namespace VertexLoaderManager void MarkAllDirty(); - int GetVertexSize(int vtx_attr_group); + int GetVertexSize(int vtx_attr_group, bool preprocess); // Returns false if buf_size is insufficient. 
bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size, bool skip_drawing = false); diff --git a/Source/Core/VideoCommon/VertexLoader_Color.cpp b/Source/Core/VideoCommon/VertexLoader_Color.cpp index 13b01f0bdb..0120e3bea5 100644 --- a/Source/Core/VideoCommon/VertexLoader_Color.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Color.cpp @@ -117,7 +117,7 @@ template void Color_ReadIndex_16b_565() { auto const Index = DataRead(); - u16 val = Common::swap16(*(const u16 *)(cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]))); + u16 val = Common::swap16(*(const u16 *)(cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]))); _SetCol565(val); } @@ -125,7 +125,7 @@ template void Color_ReadIndex_24b_888() { auto const Index = DataRead(); - const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]); + const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]); _SetCol(_Read24(iAddress)); } @@ -133,7 +133,7 @@ template void Color_ReadIndex_32b_888x() { auto const Index = DataRead(); - const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]); + const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]); _SetCol(_Read24(iAddress)); } @@ -141,7 +141,7 @@ template void Color_ReadIndex_16b_4444() { auto const Index = DataRead(); - u16 val = *(const u16 *)(cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex])); + u16 val = *(const u16 *)(cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex])); _SetCol4444(val); } @@ -149,7 +149,7 @@ template void Color_ReadIndex_24b_6666() { auto const Index = DataRead(); - const u8* pData = cached_arraybases[ARRAY_COLOR+colIndex] + 
(Index * arraystrides[ARRAY_COLOR+colIndex]) - 1; + const u8* pData = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]) - 1; u32 val = Common::swap32(pData); _SetCol6666(val); } @@ -158,7 +158,7 @@ template void Color_ReadIndex_32b_8888() { auto const Index = DataRead(); - const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]); + const u8 *iAddress = cached_arraybases[ARRAY_COLOR+colIndex] + (Index * g_main_cp_state.array_strides[ARRAY_COLOR+colIndex]); _SetCol(_Read32(iAddress)); } diff --git a/Source/Core/VideoCommon/VertexLoader_Normal.cpp b/Source/Core/VideoCommon/VertexLoader_Normal.cpp index 0fb24d9264..3d58592d70 100644 --- a/Source/Core/VideoCommon/VertexLoader_Normal.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Normal.cpp @@ -80,7 +80,7 @@ __forceinline void Normal_Index_Offset() auto const index = DataRead(); auto const data = reinterpret_cast(cached_arraybases[ARRAY_NORMAL] - + (index * arraystrides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset); + + (index * g_main_cp_state.array_strides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset); ReadIndirect(data); } diff --git a/Source/Core/VideoCommon/VertexLoader_Position.cpp b/Source/Core/VideoCommon/VertexLoader_Position.cpp index 0c27b2b6fb..a38d429d58 100644 --- a/Source/Core/VideoCommon/VertexLoader_Position.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Position.cpp @@ -91,7 +91,7 @@ void LOADERDECL Pos_ReadIndex() static_assert(N <= 3, "N > 3 is not sane!"); auto const index = DataRead(); - auto const data = reinterpret_cast(cached_arraybases[ARRAY_POSITION] + (index * arraystrides[ARRAY_POSITION])); + auto const data = reinterpret_cast(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); auto const scale = posScale; DataWriter dst; @@ -109,7 +109,7 @@ template void LOADERDECL Pos_ReadIndex_Float_SSSE3() { auto const index = DataRead(); - const u32* pData = (const u32 
*)(cached_arraybases[ARRAY_POSITION] + (index * arraystrides[ARRAY_POSITION])); + const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); GC_ALIGNED128(const __m128i a = _mm_loadu_si128((__m128i*)pData)); GC_ALIGNED128(__m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2)); _mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b); diff --git a/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp b/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp index 14b7efb451..25114ac82d 100644 --- a/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp +++ b/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp @@ -73,7 +73,7 @@ void LOADERDECL TexCoord_ReadIndex() auto const index = DataRead(); auto const data = reinterpret_cast(cached_arraybases[ARRAY_TEXCOORD0 + tcIndex] - + (index * arraystrides[ARRAY_TEXCOORD0 + tcIndex])); + + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + tcIndex])); auto const scale = tcScale[tcIndex]; DataWriter dst; @@ -94,7 +94,7 @@ void LOADERDECL TexCoord_ReadIndex_Short2_SSE4() // Heavy in ZWW auto const index = DataRead(); - const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); + const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex])); const __m128i a = _mm_cvtsi32_si128(*pData); const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2); const __m128i c = _mm_cvtepi16_epi32(b); @@ -117,7 +117,7 @@ void LOADERDECL TexCoord_ReadIndex_Float2_SSSE3() static_assert(!std::numeric_limits::is_signed, "Only unsigned I is sane!"); auto const index = DataRead(); - const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); + const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * 
g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex])); GC_ALIGNED128(const __m128i a = _mm_loadl_epi64((__m128i*)pData)); GC_ALIGNED128(const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32)); _mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b); diff --git a/Source/Core/VideoCommon/VertexShaderGen.cpp b/Source/Core/VideoCommon/VertexShaderGen.cpp index 293b75cc78..e1526af7fa 100644 --- a/Source/Core/VideoCommon/VertexShaderGen.cpp +++ b/Source/Core/VideoCommon/VertexShaderGen.cpp @@ -245,8 +245,8 @@ static inline void GenerateVertexShader(T& out, u32 components, API_TYPE api_typ // donko - this has caused problems in some games. removed for now. bool texGenSpecialCase = false; /*bool texGenSpecialCase = - ((g_VtxDesc.Hex & 0x60600L) == g_VtxDesc.Hex) && // only pos and tex coord 0 - (g_VtxDesc.Tex0Coord != NOT_PRESENT) && + ((g_main_cp_state.vtx_desc.Hex & 0x60600L) == g_main_cp_state.vtx_desc.Hex) && // only pos and tex coord 0 + (g_main_cp_state.vtx_desc.Tex0Coord != NOT_PRESENT) && (xfmem.texcoords[0].texmtxinfo.inputform == XF_TEXINPUT_AB11); */ diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp index 70e203d99e..022bf7683d 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.cpp +++ b/Source/Core/VideoCommon/VertexShaderManager.cpp @@ -329,8 +329,8 @@ void VertexShaderManager::SetConstants() { bPosNormalMatrixChanged = false; - const float *pos = (const float *)xfmem.posMatrices + MatrixIndexA.PosNormalMtxIdx * 4; - const float *norm = (const float *)xfmem.normalMatrices + 3 * (MatrixIndexA.PosNormalMtxIdx & 31); + const float *pos = (const float *)xfmem.posMatrices + g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4; + const float *norm = (const float *)xfmem.normalMatrices + 3 * (g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 31); memcpy(constants.posnormalmatrix, pos, 3*16); memcpy(constants.posnormalmatrix[3], norm, 12); @@ -344,10 +344,10 @@ void 
VertexShaderManager::SetConstants() bTexMatricesChanged[0] = false; const float *fptrs[] = { - (const float *)&xfmem.posMatrices[MatrixIndexA.Tex0MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexA.Tex1MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexA.Tex2MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexA.Tex3MtxIdx * 4] + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_a.Tex0MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_a.Tex1MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_a.Tex2MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_a.Tex3MtxIdx * 4] }; for (int i = 0; i < 4; ++i) @@ -361,10 +361,10 @@ void VertexShaderManager::SetConstants() { bTexMatricesChanged[1] = false; const float *fptrs[] = { - (const float *)&xfmem.posMatrices[MatrixIndexB.Tex4MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexB.Tex5MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexB.Tex6MtxIdx * 4], - (const float *)&xfmem.posMatrices[MatrixIndexB.Tex7MtxIdx * 4] + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_b.Tex4MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_b.Tex5MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_b.Tex6MtxIdx * 4], + (const float *)&xfmem.posMatrices[g_main_cp_state.matrix_index_b.Tex7MtxIdx * 4] }; for (int i = 0; i < 4; ++i) @@ -536,26 +536,26 @@ void VertexShaderManager::SetConstants() void VertexShaderManager::InvalidateXFRange(int start, int end) { - if (((u32)start >= (u32)MatrixIndexA.PosNormalMtxIdx * 4 && - (u32)start < (u32)MatrixIndexA.PosNormalMtxIdx * 4 + 12) || - ((u32)start >= XFMEM_NORMALMATRICES + ((u32)MatrixIndexA.PosNormalMtxIdx & 31) * 3 && - (u32)start < XFMEM_NORMALMATRICES + ((u32)MatrixIndexA.PosNormalMtxIdx & 31) * 3 + 9)) + if (((u32)start >= (u32)g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4 && + (u32)start < 
(u32)g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4 + 12) || + ((u32)start >= XFMEM_NORMALMATRICES + ((u32)g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 31) * 3 && + (u32)start < XFMEM_NORMALMATRICES + ((u32)g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 31) * 3 + 9)) { bPosNormalMatrixChanged = true; } - if (((u32)start >= (u32)MatrixIndexA.Tex0MtxIdx*4 && (u32)start < (u32)MatrixIndexA.Tex0MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexA.Tex1MtxIdx*4 && (u32)start < (u32)MatrixIndexA.Tex1MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexA.Tex2MtxIdx*4 && (u32)start < (u32)MatrixIndexA.Tex2MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexA.Tex3MtxIdx*4 && (u32)start < (u32)MatrixIndexA.Tex3MtxIdx*4+12)) + if (((u32)start >= (u32)g_main_cp_state.matrix_index_a.Tex0MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_a.Tex0MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_a.Tex1MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_a.Tex1MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_a.Tex2MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_a.Tex2MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_a.Tex3MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_a.Tex3MtxIdx*4+12)) { bTexMatricesChanged[0] = true; } - if (((u32)start >= (u32)MatrixIndexB.Tex4MtxIdx*4 && (u32)start < (u32)MatrixIndexB.Tex4MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexB.Tex5MtxIdx*4 && (u32)start < (u32)MatrixIndexB.Tex5MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexB.Tex6MtxIdx*4 && (u32)start < (u32)MatrixIndexB.Tex6MtxIdx*4+12) || - ((u32)start >= (u32)MatrixIndexB.Tex7MtxIdx*4 && (u32)start < (u32)MatrixIndexB.Tex7MtxIdx*4+12)) + if (((u32)start >= (u32)g_main_cp_state.matrix_index_b.Tex4MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_b.Tex4MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_b.Tex5MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_b.Tex5MtxIdx*4+12) 
|| + ((u32)start >= (u32)g_main_cp_state.matrix_index_b.Tex6MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_b.Tex6MtxIdx*4+12) || + ((u32)start >= (u32)g_main_cp_state.matrix_index_b.Tex7MtxIdx*4 && (u32)start < (u32)g_main_cp_state.matrix_index_b.Tex7MtxIdx*4+12)) { bTexMatricesChanged[1] = true; } @@ -628,23 +628,23 @@ void VertexShaderManager::InvalidateXFRange(int start, int end) void VertexShaderManager::SetTexMatrixChangedA(u32 Value) { - if (MatrixIndexA.Hex != Value) + if (g_main_cp_state.matrix_index_a.Hex != Value) { VertexManager::Flush(); - if (MatrixIndexA.PosNormalMtxIdx != (Value&0x3f)) + if (g_main_cp_state.matrix_index_a.PosNormalMtxIdx != (Value&0x3f)) bPosNormalMatrixChanged = true; bTexMatricesChanged[0] = true; - MatrixIndexA.Hex = Value; + g_main_cp_state.matrix_index_a.Hex = Value; } } void VertexShaderManager::SetTexMatrixChangedB(u32 Value) { - if (MatrixIndexB.Hex != Value) + if (g_main_cp_state.matrix_index_b.Hex != Value) { VertexManager::Flush(); bTexMatricesChanged[1] = true; - MatrixIndexB.Hex = Value; + g_main_cp_state.matrix_index_b.Hex = Value; } } diff --git a/Source/Core/VideoCommon/VideoBackendBase.h b/Source/Core/VideoCommon/VideoBackendBase.h index 6ab9fce8f2..7d62dda37b 100644 --- a/Source/Core/VideoCommon/VideoBackendBase.h +++ b/Source/Core/VideoCommon/VideoBackendBase.h @@ -116,6 +116,8 @@ public: virtual void DoState(PointerWrap &p) = 0; virtual void CheckInvalidState() = 0; + + virtual void UpdateWantDeterminism(bool want) {} }; extern std::vector g_available_video_backends; @@ -151,6 +153,8 @@ class VideoBackendHardware : public VideoBackend void PauseAndLock(bool doLock, bool unpauseOnUnlock=true) override; void DoState(PointerWrap &p) override; + void UpdateWantDeterminism(bool want) override; + bool m_invalid; public: diff --git a/Source/Core/VideoCommon/VideoState.cpp b/Source/Core/VideoCommon/VideoState.cpp index a8d5ddcf04..dd0eb2fb88 100644 --- a/Source/Core/VideoCommon/VideoState.cpp +++ 
b/Source/Core/VideoCommon/VideoState.cpp @@ -22,13 +22,7 @@ static void DoState(PointerWrap &p) p.DoMarker("BP Memory"); // CP Memory - p.DoArray(arraybases, 16); - p.DoArray(arraystrides, 16); - p.Do(MatrixIndexA); - p.Do(MatrixIndexB); - p.Do(g_VtxDesc.Hex); - p.DoArray(g_VtxAttr, 8); - p.DoMarker("CP Memory"); + DoCPState(p); // XF Memory p.Do(xfmem); @@ -73,11 +67,7 @@ void VideoCommon_RunLoop(bool enable) void VideoCommon_Init() { - memset(arraybases, 0, sizeof(arraybases)); - memset(arraystrides, 0, sizeof(arraystrides)); - memset(&MatrixIndexA, 0, sizeof(MatrixIndexA)); - memset(&MatrixIndexB, 0, sizeof(MatrixIndexB)); - memset(&g_VtxDesc, 0, sizeof(g_VtxDesc)); - memset(g_VtxAttr, 0, sizeof(g_VtxAttr)); + memset(&g_main_cp_state, 0, sizeof(g_main_cp_state)); + memset(&g_preprocess_cp_state, 0, sizeof(g_preprocess_cp_state)); memset(texMem, 0, TMEM_SIZE); } diff --git a/Source/Core/VideoCommon/XFMemory.h b/Source/Core/VideoCommon/XFMemory.h index c9d4d35216..33077aa69b 100644 --- a/Source/Core/VideoCommon/XFMemory.h +++ b/Source/Core/VideoCommon/XFMemory.h @@ -275,3 +275,4 @@ extern XFMemory xfmem; void LoadXFReg(u32 transferSize, u32 address); void LoadIndexedXF(u32 val, int array); +void PreprocessIndexedXF(u32 val, int refarray); diff --git a/Source/Core/VideoCommon/XFStructs.cpp b/Source/Core/VideoCommon/XFStructs.cpp index 650a4a913b..0552aa0986 100644 --- a/Source/Core/VideoCommon/XFStructs.cpp +++ b/Source/Core/VideoCommon/XFStructs.cpp @@ -6,6 +6,7 @@ #include "Core/HW/Memmap.h" #include "VideoCommon/CPMemory.h" #include "VideoCommon/DataReader.h" +#include "VideoCommon/Fifo.h" #include "VideoCommon/PixelShaderManager.h" #include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VertexShaderManager.h" @@ -252,7 +253,15 @@ void LoadIndexedXF(u32 val, int refarray) //load stuff from array to address in xf mem u32* currData = (u32*)(&xfmem) + address; - u32* newData = (u32*)Memory::GetPointer(arraybases[refarray] + arraystrides[refarray] * index); 
+ u32* newData; + if (g_use_deterministic_gpu_thread) + { + newData = (u32*)PopFifoAuxBuffer(size * sizeof(u32)); + } + else + { + newData = (u32*)Memory::GetPointer(g_main_cp_state.array_bases[refarray] + g_main_cp_state.array_strides[refarray] * index); + } bool changed = false; for (int i = 0; i < size; ++i) { @@ -269,3 +278,14 @@ void LoadIndexedXF(u32 val, int refarray) currData[i] = Common::swap32(newData[i]); } } + +void PreprocessIndexedXF(u32 val, int refarray) +{ + int index = val >> 16; + int size = ((val >> 12) & 0xF) + 1; + + u32* new_data = (u32*)Memory::GetPointer(g_preprocess_cp_state.array_bases[refarray] + g_preprocess_cp_state.array_strides[refarray] * index); + + size_t buf_size = size * sizeof(u32); + PushFifoAuxBuffer(new_data, buf_size); +} diff --git a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp index 1126cbd73a..576e9a8a53 100644 --- a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp +++ b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp @@ -74,7 +74,7 @@ protected: void ResetPointers() { - g_pVideoData = &input_memory[0]; + g_video_buffer_read_ptr = &input_memory[0]; VertexManager::s_pCurBufferPointer = &output_memory[0]; m_input_pos = m_output_pos = 0; }