From 608f9bcd6724c77fef9be8e45ec8f14b49e2b3fb Mon Sep 17 00:00:00 2001 From: comex Date: Mon, 1 Sep 2014 01:11:32 -0400 Subject: [PATCH] Refactor opcode decoding a bit to kill FifoCommandRunnable. Separated out from my gpu-determinism branch by request. It's not a big commit; I just like to write long commit messages. The main reason to kill it is hopefully a slight performance improvement from avoiding the double switch (especially in single core mode); however, this also improves cycle calculation, as described below. - FifoCommandRunnable is removed; in its stead, Decode returns the number of cycles (which only matters for "sync" GPU mode), or 0 if there was not enough data, and is also responsible for unknown opcode alerts. Decode and DecodeSemiNop are almost identical, so the latter is replaced with a skipped_frame parameter to Decode. Doesn't mean we can't improve skipped_frame mode to do less work; if, at such a point, branching on it has too much overhead (it certainly won't now), it can always be changed to a template parameter. - FifoCommandRunnable used a fixed, large cycle count for display lists, regardless of the contents. Presumably the actual hardware's processing time is mostly the processing time of whatever commands are in the list, and with this change InterpretDisplayList can just return the list's cycle count to be added to the total. (Since the calculation for this is part of Decode, it didn't seem easy to split this change up.) To facilitate this, Decode also gains an explicit 'end' parameter in lieu of FifoCommandRunnable's call to GetVideoBufferEndPtr, which can point to there or to the end of a display list (or elsewhere in gpu-determinism, but that's another story). Also, as a small optimization, InterpretDisplayList now calls OpcodeDecoder_Run rather than having its own Decode loop, to allow Decode to be inlined (haven't checked whether this actually happens though). skipped_frame mode still does not traverse display lists and uses the old fake value of 45 cycles. degasus has suggested that this hack is not essential for performance and can be removed, but I want to separate any potential performance impact of that from this commit. --- Source/Core/VideoCommon/Fifo.cpp | 4 +- Source/Core/VideoCommon/OpcodeDecoding.cpp | 370 ++++++------------ Source/Core/VideoCommon/OpcodeDecoding.h | 3 +- .../Core/VideoCommon/VertexLoaderManager.cpp | 13 +- Source/Core/VideoCommon/VertexLoaderManager.h | 3 +- 5 files changed, 123 insertions(+), 270 deletions(-) diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp index 165cd8e239..fb47c73e49 100644 --- a/Source/Core/VideoCommon/Fifo.cpp +++ b/Source/Core/VideoCommon/Fifo.cpp @@ -173,7 +173,7 @@ void RunGpuLoop() ReadDataFromFifo(uData, 32); - cyclesExecuted = OpcodeDecoder_Run(g_bSkipCurrentFrame); + cyclesExecuted = OpcodeDecoder_Run(g_bSkipCurrentFrame, GetVideoBufferEndPtr()); if (Core::g_CoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) > cyclesExecuted) Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted); @@ -235,7 +235,7 @@ void RunGpu() FPURoundMode::SaveSIMDState(); FPURoundMode::LoadDefaultSIMDState(); ReadDataFromFifo(uData, 32); - OpcodeDecoder_Run(g_bSkipCurrentFrame); + OpcodeDecoder_Run(g_bSkipCurrentFrame, GetVideoBufferEndPtr()); FPURoundMode::LoadSIMDState(); //DEBUG_LOG(COMMANDPROCESSOR, "Fifo wraps to base"); diff --git a/Source/Core/VideoCommon/OpcodeDecoding.cpp b/Source/Core/VideoCommon/OpcodeDecoding.cpp index ff9cd8e735..fe644db21e 100644 --- a/Source/Core/VideoCommon/OpcodeDecoding.cpp +++ b/Source/Core/VideoCommon/OpcodeDecoding.cpp @@ -75,13 +75,13 @@ static DataReadU32xNfunc DataReadU32xFuncs[16] = { DataReadU32xN<16> }; -static void Decode(); - -void InterpretDisplayList(u32 address, u32 size) +static u32 InterpretDisplayList(u32 address, u32 size) { u8* old_pVideoData = g_pVideoData; u8* startAddress = Memory::GetPointer(address); + u32 cycles = 0; + // Avoid the crash if Memory::GetPointer failed .. if (startAddress != nullptr) { @@ -91,10 +91,7 @@ void InterpretDisplayList(u32 address, u32 size) Statistics::SwapDL(); u8 *end = g_pVideoData + size; - while (g_pVideoData < end) - { - Decode(); - } + cycles = OpcodeDecoder_Run(false, end); INCSTAT(stats.thisFrame.numDListsCalled); // un-swap @@ -103,186 +100,71 @@ void InterpretDisplayList(u32 address, u32 size) // reset to the old pointer g_pVideoData = old_pVideoData; + + return cycles; } -static u32 FifoCommandRunnable(u32 &command_size) +static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) { - u32 cycleTime = 0; - u32 buffer_size = (u32)(GetVideoBufferEndPtr() - g_pVideoData); - if (buffer_size == 0) - return 0; // can't peek - - u8 cmd_byte = DataPeek8(0); - - switch (cmd_byte) + // TODO(Omega): Maybe dump FIFO to file on this error + std::string temp = StringFromFormat( + "GFX FIFO: Unknown Opcode (0x%x @ %p).\n" + "This means one of the following:\n" + "* The emulated GPU got desynced, disabling dual core can help\n" + "* Command stream corrupted by some spurious memory bug\n" + "* This really is an unknown opcode (unlikely)\n" + "* Some other sort of bug\n\n" + "Dolphin will now likely crash or hang. Enjoy." , + cmd_byte, + buffer); + Host_SysMessage(temp.c_str()); + INFO_LOG(VIDEO, "%s", temp.c_str()); { - case GX_NOP: // Hm, this means that we scan over nop streams pretty slowly... - command_size = 1; - cycleTime = 6; - break; - case GX_CMD_INVL_VC: // Invalidate Vertex Cache - no parameters - command_size = 1; - cycleTime = 6; - break; - case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that - command_size = 1; - cycleTime = 6; - break; + SCPFifoStruct &fifo = CommandProcessor::fifo; - case GX_LOAD_BP_REG: - command_size = 5; - cycleTime = 12; - break; + std::string tmp = StringFromFormat( + "Illegal command %02x\n" + "CPBase: 0x%08x\n" + "CPEnd: 0x%08x\n" + "CPHiWatermark: 0x%08x\n" + "CPLoWatermark: 0x%08x\n" + "CPReadWriteDistance: 0x%08x\n" + "CPWritePointer: 0x%08x\n" + "CPReadPointer: 0x%08x\n" + "CPBreakpoint: 0x%08x\n" + "bFF_GPReadEnable: %s\n" + "bFF_BPEnable: %s\n" + "bFF_BPInt: %s\n" + "bFF_Breakpoint: %s\n" + ,cmd_byte, fifo.CPBase, fifo.CPEnd, fifo.CPHiWatermark, fifo.CPLoWatermark, fifo.CPReadWriteDistance + ,fifo.CPWritePointer, fifo.CPReadPointer, fifo.CPBreakpoint, fifo.bFF_GPReadEnable ? "true" : "false" + ,fifo.bFF_BPEnable ? "true" : "false" ,fifo.bFF_BPInt ? "true" : "false" + ,fifo.bFF_Breakpoint ? "true" : "false"); - case GX_LOAD_CP_REG: - command_size = 6; - cycleTime = 12; - break; - - case GX_LOAD_INDX_A: - case GX_LOAD_INDX_B: - case GX_LOAD_INDX_C: - case GX_LOAD_INDX_D: - command_size = 5; - cycleTime = 6; // TODO - break; - - case GX_CMD_CALL_DL: - { - // FIXME: Calculate the cycle time of the display list. - //u32 address = DataPeek32(1); - //u32 size = DataPeek32(5); - //u8* old_pVideoData = g_pVideoData; - //u8* startAddress = Memory::GetPointer(address); - - //// Avoid the crash if Memory::GetPointer failed .. - //if (startAddress != 0) - //{ - // g_pVideoData = startAddress; - // u8 *end = g_pVideoData + size; - // u32 step = 0; - // while (g_pVideoData < end) - // { - // cycleTime += FifoCommandRunnable(step); - // g_pVideoData += step; - // } - //} - //else - //{ - // cycleTime = 45; - //} - - //// reset to the old pointer - //g_pVideoData = old_pVideoData; - command_size = 9; - cycleTime = 45; // This is unverified - } - break; - - case GX_LOAD_XF_REG: - { - // check if we can read the header - if (buffer_size >= 5) - { - command_size = 1 + 4; - u32 Cmd2 = DataPeek32(1); - int transfer_size = ((Cmd2 >> 16) & 15) + 1; - command_size += transfer_size * 4; - cycleTime = 18 + 6 * transfer_size; - } - else - { - return 0; - } - } - break; - - default: - if ((cmd_byte & 0xC0) == 0x80) - { - // check if we can read the header - if (buffer_size >= 3) - { - command_size = 1 + 2; - u16 numVertices = DataPeek16(1); - command_size += numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK); - cycleTime = 1600; // This depends on the number of pixels rendered - } - else - { - return 0; - } - } - else - { - // TODO(Omega): Maybe dump FIFO to file on this error - std::string temp = StringFromFormat( - "GFX FIFO: Unknown Opcode (0x%x).\n" - "This means one of the following:\n" - "* The emulated GPU got desynced, disabling dual core can help\n" - "* Command stream corrupted by some spurious memory bug\n" - "* This really is an unknown opcode (unlikely)\n" - "* Some other sort of bug\n\n" - "Dolphin will now likely crash or hang. Enjoy." , cmd_byte); - Host_SysMessage(temp.c_str()); - INFO_LOG(VIDEO, "%s", temp.c_str()); - { - SCPFifoStruct &fifo = CommandProcessor::fifo; - - std::string tmp = StringFromFormat( - "Illegal command %02x\n" - "CPBase: 0x%08x\n" - "CPEnd: 0x%08x\n" - "CPHiWatermark: 0x%08x\n" - "CPLoWatermark: 0x%08x\n" - "CPReadWriteDistance: 0x%08x\n" - "CPWritePointer: 0x%08x\n" - "CPReadPointer: 0x%08x\n" - "CPBreakpoint: 0x%08x\n" - "bFF_GPReadEnable: %s\n" - "bFF_BPEnable: %s\n" - "bFF_BPInt: %s\n" - "bFF_Breakpoint: %s\n" - ,cmd_byte, fifo.CPBase, fifo.CPEnd, fifo.CPHiWatermark, fifo.CPLoWatermark, fifo.CPReadWriteDistance - ,fifo.CPWritePointer, fifo.CPReadPointer, fifo.CPBreakpoint, fifo.bFF_GPReadEnable ? "true" : "false" - ,fifo.bFF_BPEnable ? "true" : "false" ,fifo.bFF_BPInt ? "true" : "false" - ,fifo.bFF_Breakpoint ? "true" : "false"); - - Host_SysMessage(tmp.c_str()); - INFO_LOG(VIDEO, "%s", tmp.c_str()); - } - } - break; + Host_SysMessage(tmp.c_str()); + INFO_LOG(VIDEO, "%s", tmp.c_str()); } - - if (command_size > buffer_size) - return 0; - - // INFO_LOG("OP detected: cmd_byte 0x%x size %i buffer %i",cmd_byte, command_size, buffer_size); - if (cycleTime == 0) - cycleTime = 6; - - return cycleTime; } -static u32 FifoCommandRunnable() -{ - u32 command_size = 0; - return FifoCommandRunnable(command_size); -} - -static void Decode() +static u32 Decode(u8* end, bool skipped_frame) { u8 *opcodeStart = g_pVideoData; + if (g_pVideoData == end) + return 0; - int cmd_byte = DataReadU8(); + u8 cmd_byte = DataReadU8(); + u32 cycles; switch (cmd_byte) { case GX_NOP: + cycles = 6; // Hm, this means that we scan over nop streams pretty slowly... break; case GX_LOAD_CP_REG: //0x08 { + if (end - g_pVideoData < 1 + 4) + return 0; + cycles = 12; u8 sub_cmd = DataReadU8(); u32 value = DataReadU32(); LoadCPReg(sub_cmd, value); @@ -292,8 +174,13 @@ static void Decode() case GX_LOAD_XF_REG: { + if (end - g_pVideoData < 4) + return 0; u32 Cmd2 = DataReadU32(); int transfer_size = ((Cmd2 >> 16) & 15) + 1; + if ((size_t) (end - g_pVideoData) < transfer_size * sizeof(u32)) + return 0; + cycles = 18 + 6 * transfer_size; u32 xf_address = Cmd2 & 0xFFFF; GC_ALIGNED128(u32 data_buffer[16]); DataReadU32xFuncs[transfer_size-1](data_buffer); @@ -304,36 +191,60 @@ static void Decode() break; case GX_LOAD_INDX_A: //used for position matrices + if (end - g_pVideoData < 4) + return 0; + cycles = 6; LoadIndexedXF(DataReadU32(), 0xC); break; case GX_LOAD_INDX_B: //used for normal matrices + if (end - g_pVideoData < 4) + return 0; + cycles = 6; LoadIndexedXF(DataReadU32(), 0xD); break; case GX_LOAD_INDX_C: //used for postmatrices + if (end - g_pVideoData < 4) + return 0; + cycles = 6; LoadIndexedXF(DataReadU32(), 0xE); break; case GX_LOAD_INDX_D: //used for lights + if (end - g_pVideoData < 4) + return 0; + cycles = 6; LoadIndexedXF(DataReadU32(), 0xF); break; case GX_CMD_CALL_DL: { + if (end - g_pVideoData < 8) + return 0; u32 address = DataReadU32(); u32 count = DataReadU32(); - InterpretDisplayList(address, count); + if (skipped_frame) + cycles = 45; // xxx + else + cycles = 6 + InterpretDisplayList(address, count); } break; case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that + cycles = 6; DEBUG_LOG(VIDEO, "GX 0x44: %08x", cmd_byte); break; case GX_CMD_INVL_VC: // Invalidate Vertex Cache + cycles = 6; DEBUG_LOG(VIDEO, "Invalidate (vertex cache?)"); break; case GX_LOAD_BP_REG: //0x61 + // In skipped_frame case: We have to let BP writes through because they set + // tokens and stuff. TODO: Call a much simplified LoadBPReg instead. { + if (end - g_pVideoData < 4) + return 0; + cycles = 12; u32 bp_cmd = DataReadU32(); LoadBPReg(bp_cmd); INCSTAT(stats.thisFrame.numBPLoads); @@ -344,18 +255,33 @@ static void Decode() default: if ((cmd_byte & 0xC0) == 0x80) { - // load vertices (use computed vertex size from FifoCommandRunnable above) + cycles = 1600; + // load vertices + if (end - g_pVideoData < 2) + return 0; u16 numVertices = DataReadU16(); - VertexLoaderManager::RunVertices( - cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) - (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, - numVertices); + if (skipped_frame) + { + size_t size = numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK); + if ((size_t) (end - g_pVideoData) < size) + return 0; + DataSkip((u32)size); + } + else + { + if (!VertexLoaderManager::RunVertices( + cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) + (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, + numVertices, + end - g_pVideoData)) + return 0; + } } else { - ERROR_LOG(VIDEO, "OpcodeDecoding::Decode: Illegal command %02x", cmd_byte); - break; + UnknownOpcode(cmd_byte, opcodeStart, false); + cycles = 1; } break; } @@ -363,89 +289,8 @@ static void Decode() // Display lists get added directly into the FIFO stream if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_pVideoData - opcodeStart)); -} -static void DecodeSemiNop() -{ - u8 *opcodeStart = g_pVideoData; - - int cmd_byte = DataReadU8(); - switch (cmd_byte) - { - case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that - case GX_CMD_INVL_VC: // Invalidate Vertex Cache - case GX_NOP: - break; - - case GX_LOAD_CP_REG: //0x08 - // We have to let CP writes through because they determine the size of vertices. - { - u8 sub_cmd = DataReadU8(); - u32 value = DataReadU32(); - LoadCPReg(sub_cmd, value); - INCSTAT(stats.thisFrame.numCPLoads); - } - break; - - case GX_LOAD_XF_REG: - { - u32 Cmd2 = DataReadU32(); - int transfer_size = ((Cmd2 >> 16) & 15) + 1; - u32 address = Cmd2 & 0xFFFF; - GC_ALIGNED128(u32 data_buffer[16]); - DataReadU32xFuncs[transfer_size-1](data_buffer); - LoadXFReg(transfer_size, address, data_buffer); - INCSTAT(stats.thisFrame.numXFLoads); - } - break; - - case GX_LOAD_INDX_A: //used for position matrices - LoadIndexedXF(DataReadU32(), 0xC); - break; - case GX_LOAD_INDX_B: //used for normal matrices - LoadIndexedXF(DataReadU32(), 0xD); - break; - case GX_LOAD_INDX_C: //used for postmatrices - LoadIndexedXF(DataReadU32(), 0xE); - break; - case GX_LOAD_INDX_D: //used for lights - LoadIndexedXF(DataReadU32(), 0xF); - break; - - case GX_CMD_CALL_DL: - // Hm, wonder if any games put tokens in display lists - in that case, - // we'll have to parse them too. - DataSkip(8); - break; - - case GX_LOAD_BP_REG: //0x61 - // We have to let BP writes through because they set tokens and stuff. - // TODO: Call a much simplified LoadBPReg instead. - { - u32 bp_cmd = DataReadU32(); - LoadBPReg(bp_cmd); - INCSTAT(stats.thisFrame.numBPLoads); - } - break; - - // draw primitives - default: - if ((cmd_byte & 0xC0) == 0x80) - { - // load vertices (use computed vertex size from FifoCommandRunnable above) - u16 numVertices = DataReadU16(); - DataSkip(numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK)); - } - else - { - ERROR_LOG(VIDEO, "OpcodeDecoding::Decode: Illegal command %02x", cmd_byte); - break; - } - break; - } - - if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) - FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_pVideoData - opcodeStart)); + return cycles; } void OpcodeDecoder_Init() @@ -466,15 +311,18 @@ void OpcodeDecoder_Shutdown() { } -u32 OpcodeDecoder_Run(bool skipped_frame) +u32 OpcodeDecoder_Run(bool skipped_frame, u8* end) { u32 totalCycles = 0; while (true) { - u32 cycles = FifoCommandRunnable(); + u8* old = g_pVideoData; + u32 cycles = Decode(end, skipped_frame); if (cycles == 0) + { + g_pVideoData = old; break; - skipped_frame ? DecodeSemiNop() : Decode(); + } totalCycles += cycles; } return totalCycles; diff --git a/Source/Core/VideoCommon/OpcodeDecoding.h b/Source/Core/VideoCommon/OpcodeDecoding.h index baf44bd40f..2944863bbc 100644 --- a/Source/Core/VideoCommon/OpcodeDecoding.h +++ b/Source/Core/VideoCommon/OpcodeDecoding.h @@ -38,5 +38,4 @@ extern bool g_bRecordFifoData; void OpcodeDecoder_Init(); void OpcodeDecoder_Shutdown(); -u32 OpcodeDecoder_Run(bool skipped_frame); -void InterpretDisplayList(u32 address, u32 size); +u32 OpcodeDecoder_Run(bool skipped_frame, u8* end); diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index b74fc029db..7ff21d103b 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -151,17 +151,21 @@ static VertexLoaderCacheItem RefreshLoader(int vtx_attr_group) return s_VertexLoaders[vtx_attr_group]; } -void RunVertices(int vtx_attr_group, int primitive, int count) +bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size) { if (!count) - return; + return true; auto loader = RefreshLoader(vtx_attr_group); + size_t size = count * loader.first->GetVertexSize(); + if (buf_size < size) + return false; + if (bpmem.genMode.cullmode == GenMode::CULL_ALL && primitive < 5) { // if cull mode is CULL_ALL, ignore triangles and quads - DataSkip(count * loader.first->GetVertexSize()); - return; + DataSkip((u32)size); + return true; } // If the native vertex format changed, force a flush. @@ -178,6 +182,7 @@ void RunVertices(int vtx_attr_group, int primitive, int count) ADDSTAT(stats.thisFrame.numPrims, count); INCSTAT(stats.thisFrame.numPrimitiveJoins); + return true; } int GetVertexSize(int vtx_attr_group) diff --git a/Source/Core/VideoCommon/VertexLoaderManager.h b/Source/Core/VideoCommon/VertexLoaderManager.h index 17bf13c3ba..ff3485b9d5 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.h +++ b/Source/Core/VideoCommon/VertexLoaderManager.h @@ -17,7 +17,8 @@ namespace VertexLoaderManager void MarkAllDirty(); int GetVertexSize(int vtx_attr_group); - void RunVertices(int vtx_attr_group, int primitive, int count); + // Returns false if buf_size is insufficient. + bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size); // For debugging void AppendListToString(std::string *dest);