Refactor opcode decoding a bit to kill FifoCommandRunnable.

Separated out from my gpu-determinism branch by request.  It's not a big
commit; I just like to write long commit messages.

The main reason to kill it is hopefully a slight performance improvement
from avoiding the double switch (especially in single core mode);
however, this also improves cycle calculation, as described below.

- FifoCommandRunnable is removed; in its stead, Decode returns the
number of cycles (which only matters for "sync" GPU mode), or 0 if there
was not enough data, and is also responsible for unknown opcode alerts.

Decode and DecodeSemiNop are almost identical, so the latter is replaced
with a skipped_frame parameter to Decode.  This doesn't mean we can't
improve skipped_frame mode to do less work later; if, at that point,
branching on it has too much overhead (it certainly won't now), it can
always be changed to a template parameter.

- FifoCommandRunnable used a fixed, large cycle count for display lists,
regardless of the contents.  Presumably the actual hardware's processing
time is mostly the processing time of whatever commands are in the list,
and with this change InterpretDisplayList can just return the list's
cycle count to be added to the total.  (Since the calculation for this
is part of Decode, it didn't seem easy to split this change up.)

To facilitate this, Decode also gains an explicit 'end' parameter in
lieu of FifoCommandRunnable's call to GetVideoBufferEndPtr; 'end' can
be the pointer GetVideoBufferEndPtr returns or the end of a display list
(or elsewhere in gpu-determinism, but that's another story).  Also, as a
small optimization, InterpretDisplayList now calls OpcodeDecoder_Run
rather than having its own Decode loop, to allow Decode to be inlined
(I haven't checked whether this actually happens, though).

skipped_frame mode still does not traverse display lists and uses the
old fake value of 45 cycles.  degasus has suggested that this hack is
not essential for performance and can be removed, but I want to separate
any potential performance impact of that from this commit.
This commit is contained in:
comex 2014-09-01 01:11:32 -04:00
parent f8e24de833
commit 608f9bcd67
5 changed files with 123 additions and 270 deletions

View File

@ -173,7 +173,7 @@ void RunGpuLoop()
ReadDataFromFifo(uData, 32); ReadDataFromFifo(uData, 32);
cyclesExecuted = OpcodeDecoder_Run(g_bSkipCurrentFrame); cyclesExecuted = OpcodeDecoder_Run(g_bSkipCurrentFrame, GetVideoBufferEndPtr());
if (Core::g_CoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) > cyclesExecuted) if (Core::g_CoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) > cyclesExecuted)
Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted); Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted);
@ -235,7 +235,7 @@ void RunGpu()
FPURoundMode::SaveSIMDState(); FPURoundMode::SaveSIMDState();
FPURoundMode::LoadDefaultSIMDState(); FPURoundMode::LoadDefaultSIMDState();
ReadDataFromFifo(uData, 32); ReadDataFromFifo(uData, 32);
OpcodeDecoder_Run(g_bSkipCurrentFrame); OpcodeDecoder_Run(g_bSkipCurrentFrame, GetVideoBufferEndPtr());
FPURoundMode::LoadSIMDState(); FPURoundMode::LoadSIMDState();
//DEBUG_LOG(COMMANDPROCESSOR, "Fifo wraps to base"); //DEBUG_LOG(COMMANDPROCESSOR, "Fifo wraps to base");

View File

@ -75,13 +75,13 @@ static DataReadU32xNfunc DataReadU32xFuncs[16] = {
DataReadU32xN<16> DataReadU32xN<16>
}; };
static void Decode(); static u32 InterpretDisplayList(u32 address, u32 size)
void InterpretDisplayList(u32 address, u32 size)
{ {
u8* old_pVideoData = g_pVideoData; u8* old_pVideoData = g_pVideoData;
u8* startAddress = Memory::GetPointer(address); u8* startAddress = Memory::GetPointer(address);
u32 cycles = 0;
// Avoid the crash if Memory::GetPointer failed .. // Avoid the crash if Memory::GetPointer failed ..
if (startAddress != nullptr) if (startAddress != nullptr)
{ {
@ -91,10 +91,7 @@ void InterpretDisplayList(u32 address, u32 size)
Statistics::SwapDL(); Statistics::SwapDL();
u8 *end = g_pVideoData + size; u8 *end = g_pVideoData + size;
while (g_pVideoData < end) cycles = OpcodeDecoder_Run(false, end);
{
Decode();
}
INCSTAT(stats.thisFrame.numDListsCalled); INCSTAT(stats.thisFrame.numDListsCalled);
// un-swap // un-swap
@ -103,127 +100,23 @@ void InterpretDisplayList(u32 address, u32 size)
// reset to the old pointer // reset to the old pointer
g_pVideoData = old_pVideoData; g_pVideoData = old_pVideoData;
return cycles;
} }
static u32 FifoCommandRunnable(u32 &command_size) static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess)
{ {
u32 cycleTime = 0;
u32 buffer_size = (u32)(GetVideoBufferEndPtr() - g_pVideoData);
if (buffer_size == 0)
return 0; // can't peek
u8 cmd_byte = DataPeek8(0);
switch (cmd_byte)
{
case GX_NOP: // Hm, this means that we scan over nop streams pretty slowly...
command_size = 1;
cycleTime = 6;
break;
case GX_CMD_INVL_VC: // Invalidate Vertex Cache - no parameters
command_size = 1;
cycleTime = 6;
break;
case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that
command_size = 1;
cycleTime = 6;
break;
case GX_LOAD_BP_REG:
command_size = 5;
cycleTime = 12;
break;
case GX_LOAD_CP_REG:
command_size = 6;
cycleTime = 12;
break;
case GX_LOAD_INDX_A:
case GX_LOAD_INDX_B:
case GX_LOAD_INDX_C:
case GX_LOAD_INDX_D:
command_size = 5;
cycleTime = 6; // TODO
break;
case GX_CMD_CALL_DL:
{
// FIXME: Calculate the cycle time of the display list.
//u32 address = DataPeek32(1);
//u32 size = DataPeek32(5);
//u8* old_pVideoData = g_pVideoData;
//u8* startAddress = Memory::GetPointer(address);
//// Avoid the crash if Memory::GetPointer failed ..
//if (startAddress != 0)
//{
// g_pVideoData = startAddress;
// u8 *end = g_pVideoData + size;
// u32 step = 0;
// while (g_pVideoData < end)
// {
// cycleTime += FifoCommandRunnable(step);
// g_pVideoData += step;
// }
//}
//else
//{
// cycleTime = 45;
//}
//// reset to the old pointer
//g_pVideoData = old_pVideoData;
command_size = 9;
cycleTime = 45; // This is unverified
}
break;
case GX_LOAD_XF_REG:
{
// check if we can read the header
if (buffer_size >= 5)
{
command_size = 1 + 4;
u32 Cmd2 = DataPeek32(1);
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
command_size += transfer_size * 4;
cycleTime = 18 + 6 * transfer_size;
}
else
{
return 0;
}
}
break;
default:
if ((cmd_byte & 0xC0) == 0x80)
{
// check if we can read the header
if (buffer_size >= 3)
{
command_size = 1 + 2;
u16 numVertices = DataPeek16(1);
command_size += numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK);
cycleTime = 1600; // This depends on the number of pixels rendered
}
else
{
return 0;
}
}
else
{
// TODO(Omega): Maybe dump FIFO to file on this error // TODO(Omega): Maybe dump FIFO to file on this error
std::string temp = StringFromFormat( std::string temp = StringFromFormat(
"GFX FIFO: Unknown Opcode (0x%x).\n" "GFX FIFO: Unknown Opcode (0x%x @ %p).\n"
"This means one of the following:\n" "This means one of the following:\n"
"* The emulated GPU got desynced, disabling dual core can help\n" "* The emulated GPU got desynced, disabling dual core can help\n"
"* Command stream corrupted by some spurious memory bug\n" "* Command stream corrupted by some spurious memory bug\n"
"* This really is an unknown opcode (unlikely)\n" "* This really is an unknown opcode (unlikely)\n"
"* Some other sort of bug\n\n" "* Some other sort of bug\n\n"
"Dolphin will now likely crash or hang. Enjoy." , cmd_byte); "Dolphin will now likely crash or hang. Enjoy." ,
cmd_byte,
buffer);
Host_SysMessage(temp.c_str()); Host_SysMessage(temp.c_str());
INFO_LOG(VIDEO, "%s", temp.c_str()); INFO_LOG(VIDEO, "%s", temp.c_str());
{ {
@ -251,38 +144,27 @@ static u32 FifoCommandRunnable(u32 &command_size)
Host_SysMessage(tmp.c_str()); Host_SysMessage(tmp.c_str());
INFO_LOG(VIDEO, "%s", tmp.c_str()); INFO_LOG(VIDEO, "%s", tmp.c_str());
} }
}
break;
}
if (command_size > buffer_size)
return 0;
// INFO_LOG("OP detected: cmd_byte 0x%x size %i buffer %i",cmd_byte, command_size, buffer_size);
if (cycleTime == 0)
cycleTime = 6;
return cycleTime;
} }
static u32 FifoCommandRunnable() static u32 Decode(u8* end, bool skipped_frame)
{
u32 command_size = 0;
return FifoCommandRunnable(command_size);
}
static void Decode()
{ {
u8 *opcodeStart = g_pVideoData; u8 *opcodeStart = g_pVideoData;
if (g_pVideoData == end)
return 0;
int cmd_byte = DataReadU8(); u8 cmd_byte = DataReadU8();
u32 cycles;
switch (cmd_byte) switch (cmd_byte)
{ {
case GX_NOP: case GX_NOP:
cycles = 6; // Hm, this means that we scan over nop streams pretty slowly...
break; break;
case GX_LOAD_CP_REG: //0x08 case GX_LOAD_CP_REG: //0x08
{ {
if (end - g_pVideoData < 1 + 4)
return 0;
cycles = 12;
u8 sub_cmd = DataReadU8(); u8 sub_cmd = DataReadU8();
u32 value = DataReadU32(); u32 value = DataReadU32();
LoadCPReg(sub_cmd, value); LoadCPReg(sub_cmd, value);
@ -292,8 +174,13 @@ static void Decode()
case GX_LOAD_XF_REG: case GX_LOAD_XF_REG:
{ {
if (end - g_pVideoData < 4)
return 0;
u32 Cmd2 = DataReadU32(); u32 Cmd2 = DataReadU32();
int transfer_size = ((Cmd2 >> 16) & 15) + 1; int transfer_size = ((Cmd2 >> 16) & 15) + 1;
if ((size_t) (end - g_pVideoData) < transfer_size * sizeof(u32))
return 0;
cycles = 18 + 6 * transfer_size;
u32 xf_address = Cmd2 & 0xFFFF; u32 xf_address = Cmd2 & 0xFFFF;
GC_ALIGNED128(u32 data_buffer[16]); GC_ALIGNED128(u32 data_buffer[16]);
DataReadU32xFuncs[transfer_size-1](data_buffer); DataReadU32xFuncs[transfer_size-1](data_buffer);
@ -304,36 +191,60 @@ static void Decode()
break; break;
case GX_LOAD_INDX_A: //used for position matrices case GX_LOAD_INDX_A: //used for position matrices
if (end - g_pVideoData < 4)
return 0;
cycles = 6;
LoadIndexedXF(DataReadU32(), 0xC); LoadIndexedXF(DataReadU32(), 0xC);
break; break;
case GX_LOAD_INDX_B: //used for normal matrices case GX_LOAD_INDX_B: //used for normal matrices
if (end - g_pVideoData < 4)
return 0;
cycles = 6;
LoadIndexedXF(DataReadU32(), 0xD); LoadIndexedXF(DataReadU32(), 0xD);
break; break;
case GX_LOAD_INDX_C: //used for postmatrices case GX_LOAD_INDX_C: //used for postmatrices
if (end - g_pVideoData < 4)
return 0;
cycles = 6;
LoadIndexedXF(DataReadU32(), 0xE); LoadIndexedXF(DataReadU32(), 0xE);
break; break;
case GX_LOAD_INDX_D: //used for lights case GX_LOAD_INDX_D: //used for lights
if (end - g_pVideoData < 4)
return 0;
cycles = 6;
LoadIndexedXF(DataReadU32(), 0xF); LoadIndexedXF(DataReadU32(), 0xF);
break; break;
case GX_CMD_CALL_DL: case GX_CMD_CALL_DL:
{ {
if (end - g_pVideoData < 8)
return 0;
u32 address = DataReadU32(); u32 address = DataReadU32();
u32 count = DataReadU32(); u32 count = DataReadU32();
InterpretDisplayList(address, count); if (skipped_frame)
cycles = 45; // xxx
else
cycles = 6 + InterpretDisplayList(address, count);
} }
break; break;
case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that
cycles = 6;
DEBUG_LOG(VIDEO, "GX 0x44: %08x", cmd_byte); DEBUG_LOG(VIDEO, "GX 0x44: %08x", cmd_byte);
break; break;
case GX_CMD_INVL_VC: // Invalidate Vertex Cache case GX_CMD_INVL_VC: // Invalidate Vertex Cache
cycles = 6;
DEBUG_LOG(VIDEO, "Invalidate (vertex cache?)"); DEBUG_LOG(VIDEO, "Invalidate (vertex cache?)");
break; break;
case GX_LOAD_BP_REG: //0x61 case GX_LOAD_BP_REG: //0x61
// In skipped_frame case: We have to let BP writes through because they set
// tokens and stuff. TODO: Call a much simplified LoadBPReg instead.
{ {
if (end - g_pVideoData < 4)
return 0;
cycles = 12;
u32 bp_cmd = DataReadU32(); u32 bp_cmd = DataReadU32();
LoadBPReg(bp_cmd); LoadBPReg(bp_cmd);
INCSTAT(stats.thisFrame.numBPLoads); INCSTAT(stats.thisFrame.numBPLoads);
@ -344,18 +255,33 @@ static void Decode()
default: default:
if ((cmd_byte & 0xC0) == 0x80) if ((cmd_byte & 0xC0) == 0x80)
{ {
// load vertices (use computed vertex size from FifoCommandRunnable above) cycles = 1600;
// load vertices
if (end - g_pVideoData < 2)
return 0;
u16 numVertices = DataReadU16(); u16 numVertices = DataReadU16();
VertexLoaderManager::RunVertices( if (skipped_frame)
cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) {
(cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, size_t size = numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK);
numVertices); if ((size_t) (end - g_pVideoData) < size)
return 0;
DataSkip((u32)size);
} }
else else
{ {
ERROR_LOG(VIDEO, "OpcodeDecoding::Decode: Illegal command %02x", cmd_byte); if (!VertexLoaderManager::RunVertices(
break; cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7)
(cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT,
numVertices,
end - g_pVideoData))
return 0;
}
}
else
{
UnknownOpcode(cmd_byte, opcodeStart, false);
cycles = 1;
} }
break; break;
} }
@ -363,89 +289,8 @@ static void Decode()
// Display lists get added directly into the FIFO stream // Display lists get added directly into the FIFO stream
if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL)
FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_pVideoData - opcodeStart)); FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_pVideoData - opcodeStart));
}
static void DecodeSemiNop() return cycles;
{
u8 *opcodeStart = g_pVideoData;
int cmd_byte = DataReadU8();
switch (cmd_byte)
{
case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that
case GX_CMD_INVL_VC: // Invalidate Vertex Cache
case GX_NOP:
break;
case GX_LOAD_CP_REG: //0x08
// We have to let CP writes through because they determine the size of vertices.
{
u8 sub_cmd = DataReadU8();
u32 value = DataReadU32();
LoadCPReg(sub_cmd, value);
INCSTAT(stats.thisFrame.numCPLoads);
}
break;
case GX_LOAD_XF_REG:
{
u32 Cmd2 = DataReadU32();
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
u32 address = Cmd2 & 0xFFFF;
GC_ALIGNED128(u32 data_buffer[16]);
DataReadU32xFuncs[transfer_size-1](data_buffer);
LoadXFReg(transfer_size, address, data_buffer);
INCSTAT(stats.thisFrame.numXFLoads);
}
break;
case GX_LOAD_INDX_A: //used for position matrices
LoadIndexedXF(DataReadU32(), 0xC);
break;
case GX_LOAD_INDX_B: //used for normal matrices
LoadIndexedXF(DataReadU32(), 0xD);
break;
case GX_LOAD_INDX_C: //used for postmatrices
LoadIndexedXF(DataReadU32(), 0xE);
break;
case GX_LOAD_INDX_D: //used for lights
LoadIndexedXF(DataReadU32(), 0xF);
break;
case GX_CMD_CALL_DL:
// Hm, wonder if any games put tokens in display lists - in that case,
// we'll have to parse them too.
DataSkip(8);
break;
case GX_LOAD_BP_REG: //0x61
// We have to let BP writes through because they set tokens and stuff.
// TODO: Call a much simplified LoadBPReg instead.
{
u32 bp_cmd = DataReadU32();
LoadBPReg(bp_cmd);
INCSTAT(stats.thisFrame.numBPLoads);
}
break;
// draw primitives
default:
if ((cmd_byte & 0xC0) == 0x80)
{
// load vertices (use computed vertex size from FifoCommandRunnable above)
u16 numVertices = DataReadU16();
DataSkip(numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK));
}
else
{
ERROR_LOG(VIDEO, "OpcodeDecoding::Decode: Illegal command %02x", cmd_byte);
break;
}
break;
}
if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL)
FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_pVideoData - opcodeStart));
} }
void OpcodeDecoder_Init() void OpcodeDecoder_Init()
@ -466,15 +311,18 @@ void OpcodeDecoder_Shutdown()
{ {
} }
u32 OpcodeDecoder_Run(bool skipped_frame) u32 OpcodeDecoder_Run(bool skipped_frame, u8* end)
{ {
u32 totalCycles = 0; u32 totalCycles = 0;
while (true) while (true)
{ {
u32 cycles = FifoCommandRunnable(); u8* old = g_pVideoData;
u32 cycles = Decode(end, skipped_frame);
if (cycles == 0) if (cycles == 0)
{
g_pVideoData = old;
break; break;
skipped_frame ? DecodeSemiNop() : Decode(); }
totalCycles += cycles; totalCycles += cycles;
} }
return totalCycles; return totalCycles;

View File

@ -38,5 +38,4 @@ extern bool g_bRecordFifoData;
void OpcodeDecoder_Init(); void OpcodeDecoder_Init();
void OpcodeDecoder_Shutdown(); void OpcodeDecoder_Shutdown();
u32 OpcodeDecoder_Run(bool skipped_frame); u32 OpcodeDecoder_Run(bool skipped_frame, u8* end);
void InterpretDisplayList(u32 address, u32 size);

View File

@ -151,17 +151,21 @@ static VertexLoaderCacheItem RefreshLoader(int vtx_attr_group)
return s_VertexLoaders[vtx_attr_group]; return s_VertexLoaders[vtx_attr_group];
} }
void RunVertices(int vtx_attr_group, int primitive, int count) bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size)
{ {
if (!count) if (!count)
return; return true;
auto loader = RefreshLoader(vtx_attr_group); auto loader = RefreshLoader(vtx_attr_group);
size_t size = count * loader.first->GetVertexSize();
if (buf_size < size)
return false;
if (bpmem.genMode.cullmode == GenMode::CULL_ALL && primitive < 5) if (bpmem.genMode.cullmode == GenMode::CULL_ALL && primitive < 5)
{ {
// if cull mode is CULL_ALL, ignore triangles and quads // if cull mode is CULL_ALL, ignore triangles and quads
DataSkip(count * loader.first->GetVertexSize()); DataSkip((u32)size);
return; return true;
} }
// If the native vertex format changed, force a flush. // If the native vertex format changed, force a flush.
@ -178,6 +182,7 @@ void RunVertices(int vtx_attr_group, int primitive, int count)
ADDSTAT(stats.thisFrame.numPrims, count); ADDSTAT(stats.thisFrame.numPrims, count);
INCSTAT(stats.thisFrame.numPrimitiveJoins); INCSTAT(stats.thisFrame.numPrimitiveJoins);
return true;
} }
int GetVertexSize(int vtx_attr_group) int GetVertexSize(int vtx_attr_group)

View File

@ -17,7 +17,8 @@ namespace VertexLoaderManager
void MarkAllDirty(); void MarkAllDirty();
int GetVertexSize(int vtx_attr_group); int GetVertexSize(int vtx_attr_group);
void RunVertices(int vtx_attr_group, int primitive, int count); // Returns false if buf_size is insufficient.
bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size);
// For debugging // For debugging
void AppendListToString(std::string *dest); void AppendListToString(std::string *dest);