Fully implemented the display list cache, with vertex data included, and added it to all the plugins.

Still experimental and not totally optimized, but it should bring a nice speed-up.
Please test for regressions and errors. And please, Linux people, fix scons :)

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6149 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
Rodolfo Osvaldo Bogado 2010-08-29 23:08:56 +00:00
parent 4229d9e01e
commit 3e7dafbbd7
18 changed files with 207 additions and 76 deletions

View File

@ -106,6 +106,16 @@ void XEmitter::ABI_CallFunctionCCP(void *func, u32 param1, u32 param2, void *par
ABI_RestoreStack(3 * 4);
}
// Emits a call to func(param1, param2, param3, param4) with all arguments passed on the
// stack: three 32-bit constants followed by a pointer constant (32-bit build path).
void XEmitter::ABI_CallFunctionCCCP(void *func, u32 param1, u32 param2,u32 param3, void *param4) {
	// Four arguments of four bytes each.
	const int kArgBytes = 4 * 4;

	ABI_AlignStack(kArgBytes);

	// Push in reverse order so that param1 is the last value pushed.
	PUSH(32, Imm32((u32)param4));
	PUSH(32, Imm32(param3));
	PUSH(32, Imm32(param2));
	PUSH(32, Imm32(param1));

	CALL(func);

	// Undo the stack adjustment made for the arguments.
	ABI_RestoreStack(kArgBytes);
}
// Pass a register as a parameter.
void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
ABI_AlignStack(1 * 4);
@ -236,6 +246,14 @@ void XEmitter::ABI_CallFunctionCCP(void *func, u32 param1, u32 param2, void *par
CALL(func);
}
void XEmitter::ABI_CallFunctionCCCP(void *func, u32 param1, u32 param2, u32 param3, void *param4) {
MOV(32, R(ABI_PARAM1), Imm32(param1));
MOV(32, R(ABI_PARAM2), Imm32(param2));
MOV(32, R(ABI_PARAM3), Imm32(param3));
MOV(64, R(ABI_PARAM4), Imm64((u64)param4));
CALL(func);
}
// Pass a register as a parameter.
void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
if (reg1 != ABI_PARAM1)

View File

@ -22,7 +22,7 @@ static const char ID[4] = {'D', 'C', 'A', 'C'};
// Update this to the current SVN revision every time you change shader generation code.
// We don't automatically get this from SVN_REV because that would mean regenerating the
// shader cache for every revision, graphics-related or not, which is simply annoying.
const int version = 6139;
const int version = 6148;
LinearDiskCache::LinearDiskCache()
: file_(NULL), num_entries_(0) {

View File

@ -599,6 +599,7 @@ public:
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2);
void ABI_CallFunctionCCC(void *func, u32 param1, u32 param2, u32 param3);
void ABI_CallFunctionCCP(void *func, u32 param1, u32 param2, void *param3);
void ABI_CallFunctionCCCP(void *func, u32 param1, u32 param2,u32 param3, void *param4);
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2);
void ABI_CallFunctionA(void *func, const Gen::OpArg &arg1);

View File

@ -31,16 +31,14 @@
#include "CPMemory.h"
#include "BPMemory.h"
#include "VertexManager.h"
#include "VertexLoaderManager.h"
#include "NativeVertexWriter.h"
#include "x64Emitter.h"
#include "ABI.h"
#include "DLCache.h"
#define DL_CODE_CACHE_SIZE (1024*1024*16)
#define DL_STATIC_DATA_SIZE (1024*1024*4)
extern int frameCount;
using namespace Gen;
@ -64,13 +62,21 @@ struct VDataHashRegion
u32 start_address;
int size;
};
typedef u8* DataPointer;
typedef std::map<u8, DataPointer> VdataMap;
struct CachedDisplayList
{
CachedDisplayList()
: uncachable(false),
pass(DLPASS_ANALYZE),
next_check(1)
: uncachable(false),
pass(DLPASS_ANALYZE),
next_check(1),
BufferCount(0),
num_xf_reg(0),
num_cp_reg(0),
num_bp_reg(0),
num_index_xf(0),
num_draw_call(0)
{
frame_count = frameCount;
}
@ -83,16 +89,20 @@ struct CachedDisplayList
int check;
int next_check;
u32 vdata_hash;
std::vector<VDataHashRegion> hash_regions;
int frame_count;
// ... Something containing cached vertex buffers here ...
u8 BufferCount;
VdataMap Vdata;
int num_xf_reg;
int num_cp_reg;
int num_bp_reg;
int num_index_xf;
int num_draw_call;
// Compile the commands themselves down to native code.
const u8 *compiled_code;
const u8* compiled_code;
};
// We want to allow caching DLs that start at the same address but have different lengths,
@ -105,28 +115,18 @@ inline u64 CreateMapId(u32 address, u32 size)
typedef std::map<u64, CachedDisplayList> DLMap;
static DLMap dl_map;
static u8 *dlcode_cache;
static u8 *static_data_buffer;
static u8 *static_data_ptr;
static DataPointer dlcode_cache;
static Gen::XEmitter emitter;
// Hands out the next chunk of the static data buffer. The requested size is padded up
// to a multiple of four bytes before the cursor is advanced. Chunks are never released
// individually; everything gets free'd when the cache is cleared.
u8 *AllocStaticData(int size)
{
	const int padded_size = (size + 3) & ~3;
	u8 *const chunk = static_data_ptr;
	static_data_ptr = chunk + padded_size;
	return chunk;
}
// First pass - analyze
bool AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
{
int num_xf_reg = 0;
int num_cp_reg = 0;
//int num_bp_reg = 0; // unused?
int num_bp_reg = 0;
int num_index_xf = 0;
//int num_draw_call = 0; // unused?
int num_draw_call = 0;
u8* old_pVideoData = g_pVideoData;
u8* startAddress = Memory_GetPtr(address);
@ -216,6 +216,7 @@ bool AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
u32 bp_cmd = DataReadU32();
LoadBPReg(bp_cmd);
INCSTAT(stats.thisFrame.numBPLoads);
num_bp_reg++;
}
break;
@ -230,6 +231,7 @@ bool AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7)
(cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT,
numVertices);
num_draw_call++;
}
else
{
@ -244,7 +246,11 @@ bool AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
// un-swap
Statistics::SwapDL();
}
dl->num_bp_reg = num_bp_reg;
dl->num_cp_reg = num_cp_reg;
dl->num_draw_call = num_draw_call;
dl->num_index_xf = num_index_xf;
dl->num_xf_reg = num_xf_reg;
// reset to the old pointer
g_pVideoData = old_pVideoData;
return true;
@ -308,13 +314,14 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
u32 xf_address = Cmd2 & 0xFFFF;
// TODO - speed this up. pshufb?
u8 *real_data_buffer = AllocStaticData(4 * transfer_size);
u32 *data_buffer = (u32 *)real_data_buffer;
DataPointer real_data_buffer = (DataPointer) new u8[transfer_size * 4];
u32 *data_buffer = (u32*)real_data_buffer;
for (int i = 0; i < transfer_size; i++)
data_buffer[i] = DataReadU32();
LoadXFReg(transfer_size, xf_address, data_buffer);
INCSTAT(stats.thisFrame.numXFLoads);
dl->Vdata[dl->BufferCount] = real_data_buffer;
dl->BufferCount++;
// Compile
emitter.ABI_CallFunctionCCP((void *)&LoadXFReg, transfer_size, xf_address, data_buffer);
}
@ -396,24 +403,23 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
u64 pre_draw_video_data = (u64)g_pVideoData;
u8* StartAddress = VertexManager::s_pBaseBufferPointer;
VertexManager::Flush();
VertexLoaderManager::RunVertices(
cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7)
(cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT,
numVertices);
u8* EndAddress = VertexManager::s_pCurBufferPointer;
u32 Vdatasize = (u32)(EndAddress - StartAddress);
if (size > 0)
{
// Compile
#ifdef _M_X64
emitter.MOV(64, R(RAX), Imm64(pre_draw_video_data));
emitter.MOV(64, M(&g_pVideoData), R(RAX));
#else
emitter.MOV(32, R(EAX), Imm32((u32)pre_draw_video_data));
emitter.MOV(32, M(&g_pVideoData), R(EAX));
#endif
emitter.ABI_CallFunctionCCC(
(void *)&VertexLoaderManager::RunVertices,
cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7)
(cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT,
numVertices);
DataPointer NewData = (DataPointer)new u8[Vdatasize];
memcpy(NewData,StartAddress,Vdatasize);
dl->Vdata[dl->BufferCount] = NewData;
dl->BufferCount++;
emitter.ABI_CallFunctionCCCP((void *)&VertexLoaderManager::RunCompiledVertices,cmd_byte & GX_VAT_MASK, (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, numVertices, NewData);
}
}
else
{
@ -424,6 +430,9 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
}
}
emitter.ABI_EmitEpilogue(4);
INCSTAT(stats.numDListsCalled);
INCSTAT(stats.thisFrame.numDListsCalled);
Statistics::SwapDL();
}
g_pVideoData = old_pVideoData;
return true;
@ -431,27 +440,34 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
// Sets up the display list cache: allocates the executable code cache (and the static
// data buffer), then points the emitter and the static data cursor at their starts.
void Init()
{
dlcode_cache = (u8 *)AllocateExecutableMemory(DL_CODE_CACHE_SIZE, false); // Don't need low memory.
static_data_buffer = (u8 *)AllocateMemoryPages(DL_STATIC_DATA_SIZE);
static_data_ptr = static_data_buffer;
dlcode_cache = (DataPointer)AllocateExecutableMemory(DL_CODE_CACHE_SIZE, false); // Don't need low memory.
// Compiled display lists are emitted starting at the beginning of the code cache.
emitter.SetCodePtr(dlcode_cache);
}
// Tears down the display list cache: drops every cached entry via Clear(), releases
// the memory pages backing the cache and nulls the code cache pointer.
void Shutdown()
{
// Free the per-entry buffers first, while dl_map still references them.
Clear();
FreeMemoryPages(dlcode_cache, DL_CODE_CACHE_SIZE);
FreeMemoryPages(static_data_buffer, DL_STATIC_DATA_SIZE);
FreeMemoryPages(dlcode_cache, DL_CODE_CACHE_SIZE);
dlcode_cache = NULL;
}
// Empties the cache: frees every data buffer owned by each cached display list,
// removes all entries from dl_map and rewinds the emitter to the start of the
// code cache.
void Clear()
{
DLMap::iterator iter = dl_map.begin();
while (iter != dl_map.end()) {
CachedDisplayList &entry = iter->second;
VdataMap::iterator viter = entry.Vdata.begin();
while (viter != entry.Vdata.end())
{
DataPointer &ventry = viter->second;
// The buffers stored in Vdata are allocated with new u8[], hence delete [].
delete [] ventry;
// Post-increment passes the current node to erase() after viter has already moved on.
entry.Vdata.erase(viter++);
}
iter++;
}
dl_map.clear();
// Reset the cache pointers.
emitter.SetCodePtr(dlcode_cache);
static_data_ptr = static_data_buffer;
emitter.SetCodePtr(dlcode_cache);
}
void ProgressiveCleanup()
@ -462,6 +478,13 @@ void ProgressiveCleanup()
int limit = iter->second.uncachable ? 1200 : 400;
if (entry.frame_count < frameCount - limit) {
// entry.Destroy();
VdataMap::iterator viter = entry.Vdata.begin();
while (viter != entry.Vdata.end())
{
DataPointer &ventry = viter->second;
delete [] ventry;
entry.Vdata.erase(viter++);
}
dl_map.erase(iter++); // (this is gcc standard!)
}
else
@ -478,8 +501,6 @@ bool HandleDisplayList(u32 address, u32 size)
// right now...
//Fixed DlistCaching now is fully functional benefits still marginal but when vertex data is stored here the story will be diferent :)
//to test remove the next line;
return false;
if(size == 0) return false;
u64 dl_id = DLCache::CreateMapId(address, size);
@ -492,7 +513,7 @@ bool HandleDisplayList(u32 address, u32 size)
if (dl.uncachable)
{
// We haven't compiled it - let's return false so it gets
// interpreted.
// interpreted.
return false;
}
@ -507,7 +528,7 @@ bool HandleDisplayList(u32 address, u32 size)
if (dl.dl_hash != GetHash64(Memory_GetPtr(address), size,0))
{
// PanicAlert("uncachable %08x", address);
dl.uncachable = true;
dl.uncachable = true;
return false;
}
DLCache::CompileAndRunDisplayList(address, size, &dl);
@ -522,15 +543,37 @@ bool HandleDisplayList(u32 address, u32 size)
if (dl.dl_hash != GetHash64(Memory_GetPtr(address), size,0))
{
dl.uncachable = true;
DLCache::VdataMap::iterator viter = dl.Vdata.begin();
while (viter != dl.Vdata.end())
{
DLCache::DataPointer &ventry = viter->second;
delete [] ventry;
dl.Vdata.erase(viter++);
}
dl.BufferCount = 0;
return false;
}
dl.check = dl.next_check;
//dl.next_check *= 2;
dl.next_check *= 2;
if (dl.next_check > 1024)
dl.next_check = 1024;
}
dl.frame_count= frameCount;
u8 *old_datareader = g_pVideoData;
((void (*)())(void*)(dl.compiled_code))();
Statistics::SwapDL();
ADDSTAT(stats.thisFrame.numCPLoadsInDL,dl.num_cp_reg);
ADDSTAT(stats.thisFrame.numXFLoadsInDL,dl.num_xf_reg);
ADDSTAT(stats.thisFrame.numBPLoadsInDL,dl.num_bp_reg);
ADDSTAT(stats.thisFrame.numCPLoads,dl.num_cp_reg);
ADDSTAT(stats.thisFrame.numXFLoads,dl.num_xf_reg);
ADDSTAT(stats.thisFrame.numBPLoads,dl.num_bp_reg);
INCSTAT(stats.numDListsCalled);
INCSTAT(stats.thisFrame.numDListsCalled);
Statistics::SwapDL();
g_pVideoData = old_datareader;
break;
}

View File

@ -23,6 +23,7 @@ namespace VertexManager
{
void AddVertices(int primitive, int numvertices);
void AddCompiledVertices(int primitive, int numvertices, u8* Vdata);
void Flush(); // flushes the current buffer
int GetRemainingSize(); // remaining space in the current buffer.
int GetRemainingVertices(int primitive); // remaining number of vertices that can be processed in one AddVertices call

View File

@ -680,6 +680,63 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
VertexManager::AddVertices(primitive, count - startv + extraverts);
}
// Submits `count` vertices whose data is copied verbatim from `Data` into the vertex
// manager's buffer, rather than being decoded from the command stream.
//
// vtx_attr_group - index into g_VtxAttr used to reload the position/texcoord Frac values.
// primitive      - primitive type, passed on to VertexManager::AddVertices.
// count          - number of vertices to submit.
// Data           - source buffer; native_stride * count bytes are read from it.
void VertexLoader::RunCompiledVertices(int vtx_attr_group, int primitive, int count, u8* Data)
{
DVSTARTPROFILE();
m_numLoadedVertices += count;
// Flush if our vertex format is different from the currently set.
if (g_nativeVertexFmt != NULL && g_nativeVertexFmt != m_NativeFmt)
{
// We really must flush here. It's possible that the native representations
// of the two vtx formats are the same, but we have no way to easily check that
// now.
VertexManager::Flush();
// Also move the Set() here?
}
g_nativeVertexFmt = m_NativeFmt;
// NOTE(review): the comment below says "none", but skipping the geometry suggests
// cullmode 3 means "cull everything" - confirm against the BP register definition.
if (bpmem.genMode.cullmode == 3 && primitive < 5)
{
// if cull mode is none, ignore triangles and quads
// NOTE(review): DataSkip presumably advances the command stream reader, yet the
// vertices here come from Data - confirm this skip is intended on this path.
DataSkip(count * m_VertexSize);
return;
}
m_NativeFmt->EnableComponents(m_NativeFmt->m_components);
// Load position and texcoord scale factors.
m_VtxAttr.PosFrac = g_VtxAttr[vtx_attr_group].g0.PosFrac;
m_VtxAttr.texCoord[0].Frac = g_VtxAttr[vtx_attr_group].g0.Tex0Frac;
m_VtxAttr.texCoord[1].Frac = g_VtxAttr[vtx_attr_group].g1.Tex1Frac;
m_VtxAttr.texCoord[2].Frac = g_VtxAttr[vtx_attr_group].g1.Tex2Frac;
m_VtxAttr.texCoord[3].Frac = g_VtxAttr[vtx_attr_group].g1.Tex3Frac;
m_VtxAttr.texCoord[4].Frac = g_VtxAttr[vtx_attr_group].g2.Tex4Frac;
m_VtxAttr.texCoord[5].Frac = g_VtxAttr[vtx_attr_group].g2.Tex5Frac;
m_VtxAttr.texCoord[6].Frac = g_VtxAttr[vtx_attr_group].g2.Tex6Frac;
m_VtxAttr.texCoord[7].Frac = g_VtxAttr[vtx_attr_group].g2.Tex7Frac;
pVtxAttr = &m_VtxAttr;
// Position scale is 1 / 2^PosFrac.
posScale = 1.0f / float(1 << m_VtxAttr.PosFrac);
// The if guards only the tcScale loop; the colElements loop below always runs.
if (m_NativeFmt->m_components & VB_HAS_UVALL)
for (int i = 0; i < 8; i++)
tcScale[i] = texCoordFrac[m_VtxAttr.texCoord[i].Frac];
for (int i = 0; i < 2; i++)
colElements[i] = m_VtxAttr.color[i].Elements;
// Make room first if the remaining buffer space is smaller than the data to copy.
if(VertexManager::GetRemainingSize() < native_stride * count)
VertexManager::Flush();
// Copy the vertex data as-is and advance the write pointer past it.
memcpy_gc(VertexManager::s_pCurBufferPointer, Data, native_stride * count);
VertexManager::s_pCurBufferPointer += native_stride * count;
VertexManager::AddVertices(primitive, count);
}
void VertexLoader::SetVAT(u32 _group0, u32 _group1, u32 _group2)
{
VAT vat;

View File

@ -84,6 +84,7 @@ public:
int GetVertexSize() const {return m_VertexSize;}
void RunVertices(int vtx_attr_group, int primitive, int count);
void RunCompiledVertices(int vtx_attr_group, int primitive, int count, u8* Data);
// For debugging / profiling
void AppendToString(std::string *dest) const;

View File

@ -143,6 +143,14 @@ void RunVertices(int vtx_attr_group, int primitive, int count)
g_VertexLoaders[vtx_attr_group]->RunVertices(vtx_attr_group, primitive, count);
}
// Forwards a block of already-converted vertex data to the loader registered for
// vtx_attr_group. Does nothing when there are no vertices or no source buffer.
void RunCompiledVertices(int vtx_attr_group, int primitive, int count, u8* Data)
{
	const bool has_work = (count != 0) && (Data != 0);
	if (has_work)
	{
		// Refresh the loader for this attribute group before using it.
		RefreshLoader(vtx_attr_group);
		g_VertexLoaders[vtx_attr_group]->RunCompiledVertices(vtx_attr_group, primitive, count, Data);
	}
}
int GetVertexSize(int vtx_attr_group)
{
RefreshLoader(vtx_attr_group);

View File

@ -30,6 +30,7 @@ namespace VertexLoaderManager
int GetVertexSize(int vtx_attr_group);
void RunVertices(int vtx_attr_group, int primitive, int count);
void RunCompiledVertices(int vtx_attr_group, int primitive, int count, u8* Data);
// For debugging
void AppendListToString(std::string *dest);

View File

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="9.00"
Version="9,00"
Name="VideoCommon"
ProjectGUID="{E5D1F0C0-AA07-4841-A4EB-4CF4DAA6B0FA}"
RootNamespace="VideoCommon"
@ -634,6 +634,14 @@
RelativePath=".\Src\DataReader.h"
>
</File>
<File
RelativePath=".\Src\DLCache.cpp"
>
</File>
<File
RelativePath=".\Src\DLCache.h"
>
</File>
<File
RelativePath=".\Src\VertexLoader.cpp"
>

View File

@ -44,6 +44,7 @@
#include "OnScreenDisplay.h"
#include "FBManager.h"
#include "Fifo.h"
#include "DLCache.h"
#include <strsafe.h>
@ -982,6 +983,7 @@ void Renderer::Swap(u32 xfbAddr, FieldType field, u32 fbWidth, u32 fbHeight,cons
OSD::DrawMessages();
D3D::EndFrame();
frameCount++;
DLCache::ProgressiveCleanup();
TextureCache::Cleanup();
// enable any configuration changes

View File

@ -123,6 +123,7 @@ bool Init()
LIBuffer = new u16[MAXIBUFFERSIZE];
PIBuffer = new u16[MAXIBUFFERSIZE];
s_pCurBufferPointer = LocalVBuffer;
s_pBaseBufferPointer = LocalVBuffer;
Flushed=false;
CreateDeviceObjects();

View File

@ -53,6 +53,7 @@
#include "W32Util/Misc.h"
#include "EmuWindow.h"
#include "FBManager.h"
#include "DLCache.h"
#if defined(DEBUGFAST)
@ -83,11 +84,6 @@ static volatile struct
static volatile EFBAccessType s_AccessEFBType;
// Stub: ignores both arguments and unconditionally returns false.
// NOTE(review): presumably false tells the caller to interpret the display list itself - confirm.
bool HandleDisplayList(u32 address, u32 size)
{
return false;
}
bool IsD3D()
{
return true;
@ -234,6 +230,7 @@ void Video_Prepare()
PixelShaderManager::Init();
CommandProcessor::Init();
PixelEngine::Init();
DLCache::Init();
// tell the host that the window is ready
g_VideoInitialize.pCoreMessage(WM_USER_CREATE);
@ -246,6 +243,7 @@ void Shutdown()
s_swapRequested = FALSE;
// VideoCommon
DLCache::Shutdown();
CommandProcessor::Shutdown();
PixelShaderManager::Shutdown();
VertexShaderManager::Shutdown();

View File

@ -47,6 +47,7 @@
#include "FramebufferManager.h"
#include "Fifo.h"
#include "TextureConverter.h"
#include "DLCache.h"
#include "debugger/debugger.h"
@ -1193,6 +1194,7 @@ void Renderer::Swap(u32 xfbAddr, FieldType field, u32 fbWidth, u32 fbHeight,cons
D3D::EndFrame();
frameCount++;
DLCache::ProgressiveCleanup();
TextureCache::Cleanup();
// Make any new configuration settings active.

View File

@ -65,6 +65,7 @@ bool Init()
LIBuffer = new u16[MAXIBUFFERSIZE];
PIBuffer = new u16[MAXIBUFFERSIZE];
s_pCurBufferPointer = LocalVBuffer;
s_pBaseBufferPointer = LocalVBuffer;
Flushed=false;
IndexGenerator::Start(TIBuffer,LIBuffer,PIBuffer);
return true;

View File

@ -52,6 +52,7 @@ GFXConfigDialogDX *m_ConfigFrame = NULL;
#include "VideoState.h"
#include "XFBConvert.h"
#include "render.h"
#include "DLCache.h"
HINSTANCE g_hInstance = NULL;
SVideoInitialize g_VideoInitialize;
@ -72,11 +73,6 @@ static volatile struct
static volatile EFBAccessType s_AccessEFBType;
// Stub: ignores both arguments and unconditionally returns false.
// NOTE(review): presumably false tells the caller to interpret the display list itself - confirm.
bool HandleDisplayList(u32 address, u32 size)
{
return false;
}
bool IsD3D()
{
return true;
@ -249,7 +245,7 @@ void Video_Prepare()
PixelShaderManager::Init();
CommandProcessor::Init();
PixelEngine::Init();
DLCache::Init();
// Tell the host the window is ready
g_VideoInitialize.pCoreMessage(WM_USER_CREATE);
}
@ -259,6 +255,7 @@ void Shutdown()
s_efbAccessRequested = FALSE;
s_FifoShuttingDown = FALSE;
s_swapRequested = FALSE;
DLCache::Shutdown();
Fifo_Shutdown();
CommandProcessor::Shutdown();
VertexManager::Shutdown();

View File

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="9.00"
Version="9,00"
Name="Plugin_VideoOGL"
ProjectGUID="{CFDCEE0E-FA45-4F72-9FCC-0B88F5A75160}"
RootNamespace="Plugin_VideoOGL"
@ -717,14 +717,6 @@
<Filter
Name="Decoder"
>
<File
RelativePath=".\Src\DLCache.cpp"
>
</File>
<File
RelativePath=".\Src\DLCache.h"
>
</File>
<File
RelativePath=".\Src\NativeVertexFormat.cpp"
>