GSdx: added a shortcut in GSState::Transfer for the most frequent vertex format I found (helps quite a lot), less thread-syncing for the sw renderer, and the bios boot logo was fixed (just had to clear the memory on reset).

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5072 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2012-01-18 11:47:31 +00:00
parent da4ea83134
commit 9aabcc1701
27 changed files with 1549 additions and 528 deletions

View File

@ -28,8 +28,8 @@ const GSVector4i GPULocalMemory::m_xxbx(0x00007c00);
const GSVector4i GPULocalMemory::m_xgxx(0x000003e0);
const GSVector4i GPULocalMemory::m_rxxx(0x0000001f);
#define VM_SIZE ((1 << (12 + 11)) * sizeof(uint16))
#define VM_ALLOC_SIZE (VM_SIZE * 2)
#define VM_REAL_SIZE ((1 << (12 + 11)) * sizeof(uint16))
#define VM_ALLOC_SIZE (VM_REAL_SIZE * 2)
#define TEX_ALLOC_SIZE (256 * 256 * (1 + 1 + 4) * 32)
GPULocalMemory::GPULocalMemory()
@ -39,7 +39,7 @@ GPULocalMemory::GPULocalMemory()
//
int size = VM_SIZE;
int size = VM_REAL_SIZE;
m_vm = (uint16*)vmalloc(VM_ALLOC_SIZE, false);

View File

@ -207,7 +207,7 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1)
s_gs = NULL;
}
if(renderer == 12)
if(renderer == 15)
{
#ifdef _WINDOWS

View File

@ -90,6 +90,11 @@ enum GIF_REG
GIF_REG_NOP = 0x0f,
};
// Identifiers for multi-register GIF tag layouts that are handled as a unit.
// NOTE(review): GIF_REG_STQRGBAXYZF2 presumably names the register triple that
// GIFPath::SetTag recognizes as TYPE_STQRGBAXYZF2 (descriptor 0x00040102 with
// nreg == 3) - confirm against the users of this enum.
enum GIF_REG_COMPLEX
{
GIF_REG_STQRGBAXYZF2 = 0x00,
};
enum GIF_A_D_REG
{
GIF_A_D_REG_PRIM = 0x00,
@ -1093,9 +1098,11 @@ __aligned(struct, 32) GIFPath
uint32 reg;
uint32 nreg;
uint32 nloop;
uint32 adonly;
uint32 type;
GSVector4i regs;
enum {TYPE_UNKNOWN, TYPE_ADONLY, TYPE_STQRGBAXYZF2};
void SetTag(const void* mem)
{
GSVector4i v = GSVector4i::load<false>(mem);
@ -1104,7 +1111,9 @@ __aligned(struct, 32) GIFPath
regs = v.uph8(v >> 4) & 0x0f0f0f0f;
nreg = tag.NREG ? tag.NREG : 16;
nloop = tag.NLOOP;
adonly = regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1;
type = TYPE_UNKNOWN;
if(regs.u32[0] == 0x00040102 && nreg == 3) type = TYPE_STQRGBAXYZF2;
else if(regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1) type = TYPE_ADONLY;
}
__forceinline uint8 GetReg()

View File

@ -729,7 +729,6 @@ void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t cou
m_vb = NULL;
m_vertex.start = 0;
m_vertex.count = 0;
m_vertex.limit = std::max<int>(count * 3 / 2, 11000);
}
@ -798,7 +797,7 @@ void GSDevice11::IASetIndexBuffer(const void* index, size_t count)
m_ib_old = m_ib;
m_ib = NULL;
m_index.count = 0;
m_index.start = 0;
m_index.limit = std::max<int>(count * 3 / 2, 11000);
}
@ -904,7 +903,11 @@ void GSDevice11::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
{
PSSetShaderResource(0, sr0);
PSSetShaderResource(1, sr1);
PSSetShaderResource(2, NULL);
for(int i = 2; i < countof(m_state.ps_srv); i++)
{
PSSetShaderResource(i, NULL);
}
}
void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
@ -913,6 +916,13 @@ void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
if(sr) srv = *(GSTexture11*)sr;
PSSetShaderResourceView(i, srv);
}
void GSDevice11::PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv)
{
ASSERT(i < countof(m_state.ps_srv));
if(m_state.ps_srv[i] != srv)
{
m_state.ps_srv[i] = srv;
@ -944,14 +954,14 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
if(m_srv_changed)
{
m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv);
m_ctx->PSSetShaderResources(0, countof(m_state.ps_srv), m_state.ps_srv);
m_srv_changed = false;
}
if(m_ss_changed)
{
m_ctx->PSSetSamplers(0, 3, m_state.ps_ss);
m_ctx->PSSetSamplers(0, countof(m_state.ps_ss), m_state.ps_ss);
m_ss_changed = false;
}
@ -1036,6 +1046,8 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector
m_ctx->OMSetRenderTargets(1, &rtv, dsv);
}
memset(m_state.uav, 0, sizeof(m_state.uav));
if(m_state.viewport != rt->GetSize())
{
m_state.viewport = rt->GetSize();
@ -1064,6 +1076,52 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector
}
}
void GSDevice11::OMSetRenderTargets(const GSVector2i& rtsize, ID3D11UnorderedAccessView** uav, int count, const GSVector4i* scissor)
{
for(int i = 0; i < count; i++)
{
if(m_state.uav[i] != uav[i])
{
memcpy(m_state.uav, uav, sizeof(uav[0]) * count);
memset(m_state.uav + count, 0, sizeof(m_state.uav) - sizeof(uav[0]) * count);
m_ctx->OMSetRenderTargetsAndUnorderedAccessViews(0, NULL, NULL, 0, count, uav, NULL);
break;
}
}
m_state.rtv = NULL;
m_state.dsv = NULL;
if(m_state.viewport != rtsize)
{
m_state.viewport = rtsize;
D3D11_VIEWPORT vp;
memset(&vp, 0, sizeof(vp));
vp.TopLeftX = 0;
vp.TopLeftY = 0;
vp.Width = (float)rtsize.x;
vp.Height = (float)rtsize.y;
vp.MinDepth = 0.0f;
vp.MaxDepth = 1.0f;
m_ctx->RSSetViewports(1, &vp);
}
GSVector4i r = scissor ? *scissor : GSVector4i(rtsize).zwxy();
if(!m_state.scissor.eq(r))
{
m_state.scissor = r;
m_ctx->RSSetScissorRects(1, r);
}
}
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il)
{
HRESULT hr;
@ -1135,6 +1193,38 @@ HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MAC
return hr;
}
// Compiles the geometry shader entry point `entry` from resource `id` and creates it
// with a stream output declaration (`layout`, `count` entries).
// Compiler messages, if any, are printed to stdout. Returns the HRESULT of the first
// failing step, or of the shader creation call.
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count)
{
	vector<D3D11_SHADER_MACRO> defines;

	PrepareShaderMacro(defines, macro);

	CComPtr<ID3D11Blob> bytecode, messages;

	HRESULT hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &defines[0], NULL, entry, m_shader.gs.c_str(), 0, 0, NULL, &bytecode, &messages, NULL);

	// the message blob is printed whenever it is present, before the result is checked

	if(messages)
	{
		printf("%s\n", (const char*)messages->GetBufferPointer());
	}

	if(FAILED(hr))
	{
		return hr;
	}

	return m_dev->CreateGeometryShaderWithStreamOutput((void*)bytecode->GetBufferPointer(), bytecode->GetBufferSize(), layout, count, NULL, 0, D3D11_SO_NO_RASTERIZED_STREAM, NULL, gs);
}
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps)
{
HRESULT hr;

View File

@ -60,7 +60,7 @@ class GSDevice11 : public GSDeviceDX
ID3D11VertexShader* vs;
ID3D11Buffer* vs_cb;
ID3D11GeometryShader* gs;
ID3D11ShaderResourceView* ps_srv[3];
ID3D11ShaderResourceView* ps_srv[16];
ID3D11PixelShader* ps;
ID3D11Buffer* ps_cb;
ID3D11SamplerState* ps_ss[3];
@ -73,6 +73,7 @@ class GSDevice11 : public GSDeviceDX
float bf;
ID3D11RenderTargetView* rtv;
ID3D11DepthStencilView* dsv;
ID3D11UnorderedAccessView* uav[8];
} m_state;
public: // TODO
@ -178,6 +179,7 @@ public:
void GSSetShader(ID3D11GeometryShader* gs);
void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1);
void PSSetShaderResource(int i, GSTexture* sr);
void PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv);
void PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb);
void PSSetSamplerState(ID3D11SamplerState* ss0, ID3D11SamplerState* ss1, ID3D11SamplerState* ss2 = NULL);
void CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv);
@ -186,6 +188,7 @@ public:
void OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref);
void OMSetBlendState(ID3D11BlendState* bs, float bf);
void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL);
void OMSetRenderTargets(const GSVector2i& rtsize, ID3D11UnorderedAccessView** uav, int count, const GSVector4i* scissor = NULL);
void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim);
void SetupVS(VSSelector sel, const VSConstantBuffer* cb);
@ -202,6 +205,7 @@ public:
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs);
HRESULT CompileShader(const char* fn, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs);

View File

@ -53,7 +53,8 @@ public:
GSOffset* fb;
GSOffset* zb;
GSOffset* tex;
GSPixelOffset4* fzb;
GSPixelOffset* fzb;
GSPixelOffset4* fzb4;
} offset;
GSDrawingContext()

View File

@ -473,6 +473,62 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
return o;
}
// Returns the address lookup tables for the given frame buffer / z buffer pair.
// Tables are built on first request and cached in m_pomap under a key made of
// FBP, ZBP, FBW and the two pixel formats; later requests return the cached object.
GSPixelOffset* GSLocalMemory::GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF)
{
	const uint32 fbp = FRAME.Block();
	const uint32 zbp = ZBUF.Block();
	const uint32 fpsm = FRAME.PSM;
	const uint32 zpsm = ZBUF.PSM;
	const uint32 bw = FRAME.FBW;

	ASSERT(m_psm[fpsm].trbpp > 8 || m_psm[zpsm].trbpp > 8);

	// "(psm & 0x0f) ^ ((psm & 0x30) >> 2)" folds a format id into 4 bits
	// (per the original note these are unique for render target formats only)

	const uint32 fkey = (fpsm & 0x0f) ^ ((fpsm & 0x30) >> 2);
	const uint32 zkey = (zpsm & 0x0f) ^ ((zpsm & 0x30) >> 2);

	const uint32 hash = (FRAME.FBP << 0) | (ZBUF.ZBP << 9) | (bw << 18) | (fkey << 24) | (zkey << 28);

	hash_map<uint32, GSPixelOffset*>::iterator cached = m_pomap.find(hash);

	if(cached != m_pomap.end())
	{
		return cached->second;
	}

	GSPixelOffset* po = (GSPixelOffset*)_aligned_malloc(sizeof(GSPixelOffset), 32);

	po->hash = hash;
	po->fbp = fbp;
	po->zbp = zbp;
	po->fpsm = fpsm;
	po->zpsm = zpsm;
	po->bw = bw;

	pixelAddress fpa = m_psm[fpsm].pa;
	pixelAddress zpa = m_psm[zpsm].pa;

	// shift is 1 when bpp is 32, 0 for narrower formats
	// NOTE(review): presumably converts pixel addresses to 16 bit units - confirm

	const int fs = m_psm[fpsm].bpp >> 5;
	const int zs = m_psm[zpsm].bpp >> 5;

	for(int n = 0; n < 2048; n++)
	{
		// row: address of pixel (0, n), x = frame buffer, y = z buffer

		po->row[n].x = (int)fpa(0, n, fbp, bw) << fs;
		po->row[n].y = (int)zpa(0, n, zbp, bw) << zs;

		// col: horizontal offset of column n taken from the first rowOffset table

		po->col[n].x = m_psm[fpsm].rowOffset[0][n] << fs;
		po->col[n].y = m_psm[zpsm].rowOffset[0][n] << zs;
	}

	m_pomap[hash] = po;

	return po;
}
GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF)
{
uint32 fbp = FRAME.Block();

View File

@ -56,6 +56,16 @@ public:
uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL);
};
// Address lookup tables for one frame buffer / z buffer pair,
// built and cached by GSLocalMemory::GetPixelOffset.
struct GSPixelOffset
{
	// 16 bit offsets (m_vm16[...])

	GSVector2i row[2048]; // indexed by y, x = frame buffer | y = z buffer
	GSVector2i col[2048]; // indexed by x, x = frame buffer | y = z buffer

	uint32 hash; // key this object is stored under in the offset cache

	// the buffer parameters the tables were generated from

	uint32 fbp;
	uint32 zbp;
	uint32 fpsm;
	uint32 zpsm;
	uint32 bw;
};
struct GSPixelOffset4
{
// 16 bit offsets (m_vm16[...])
@ -158,6 +168,7 @@ protected:
//
hash_map<uint32, GSOffset*> m_omap;
hash_map<uint32, GSPixelOffset*> m_pomap;
hash_map<uint32, GSPixelOffset4*> m_po4map;
hash_map<uint64, vector<GSVector2i>*> m_p2tmap;
@ -166,6 +177,7 @@ public:
virtual ~GSLocalMemory();
GSOffset* GetOffset(uint32 bp, uint32 bw, uint32 psm);
GSPixelOffset* GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF);
GSPixelOffset4* GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF);
vector<GSVector2i>* GetPage2TileMap(const GIFRegTEX0& TEX0);

View File

@ -35,7 +35,7 @@ public:
enum counter_t
{
Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad,
Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, SyncPoint,
CounterLast,
};

View File

@ -40,7 +40,7 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe
m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false);
m_edge.count = 0;
m_myscanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64);
m_scanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64);
int row = 0;
@ -48,14 +48,14 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe
{
for(int i = 0; i < threads; i++, row++)
{
m_myscanline[row] = i == id ? 1 : 0;
m_scanline[row] = i == id ? 1 : 0;
}
}
}
GSRasterizer::~GSRasterizer()
{
_aligned_free(m_myscanline);
_aligned_free(m_scanline);
if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSVertexSW) * 2048);
@ -66,7 +66,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top) const
{
ASSERT(top >= 0 && top < 2048);
return m_myscanline[top >> THREAD_HEIGHT] != 0;
return m_scanline[top >> THREAD_HEIGHT] != 0;
}
bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
@ -78,7 +78,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
while(top < bottom)
{
if(m_myscanline[top++])
if(m_scanline[top++])
{
return true;
}
@ -91,9 +91,9 @@ int GSRasterizer::FindMyNextScanline(int top) const
{
int i = top >> THREAD_HEIGHT;
if(m_myscanline[i] == 0)
if(m_scanline[i] == 0)
{
while(m_myscanline[++i] == 0);
while(m_scanline[++i] == 0);
top = i << THREAD_HEIGHT;
}
@ -904,11 +904,20 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS
//
GSRasterizerList::GSRasterizerList()
: GSJobQueue<shared_ptr<GSRasterizerData> >()
, m_sync_count(0)
, m_syncpoint_count(0)
// Builds the lookup table that maps a scanline band (y >> THREAD_HEIGHT) to the index
// of the worker that owns it; bands are handed out round-robin across `threads` workers.
//
// threads - number of workers the bands are distributed over
// perfmon - performance monitor stored in m_perfmon
GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon)
	: m_perfmon(perfmon)
{
	// the table is allocated with 16 bytes of padding after the last band

	const int entries = (2048 >> THREAD_HEIGHT) + 16;

	m_scanline = (uint8*)_aligned_malloc(entries, 64);

	// The fill is bounded by the allocation itself instead of by `threads`, so a large
	// thread count cannot write past the padding and a non-positive count cannot loop
	// forever; every entry gets the same "row % threads" value the nested loops produced.

	for(int row = 0; row < entries; row++)
	{
		m_scanline[row] = (uint8)(threads > 0 ? row % threads : 0);
	}
}
GSRasterizerList::~GSRasterizerList()
@ -917,31 +926,54 @@ GSRasterizerList::~GSRasterizerList()
{
delete *i;
}
_aligned_free(m_scanline);
}
void GSRasterizerList::Queue(shared_ptr<GSRasterizerData> data)
{
// disable dispatcher thread for now and pass-through directly,
// would only be relevant if data->syncpoint was utilized more,
// it would hide the syncing latency from the main gs thread
if(data->syncpoint)
{
Sync();
}
// Push(data);
GSVector4i r = data->bbox.rintersect(data->scissor);
Process(data); m_count++;
ASSERT(r.top >= 0 && r.top < 2048 && r.bottom >= 0 && r.bottom < 2048);
int top = r.top >> THREAD_HEIGHT;
int bottom = std::min<int>((r.bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT, top + m_workers.size());
while(top < bottom)
{
m_workers[m_scanline[top++]]->Push(data);
}
}
void GSRasterizerList::Sync()
{
if(GetCount() == 0) return;
if(!IsSynced())
{
for(size_t i = 0; i < m_workers.size(); i++)
{
m_workers[i]->Wait();
}
Wait(); // first dispatch all items to workers
m_perfmon->Put(GSPerfMon::SyncPoint, 1);
}
}
bool GSRasterizerList::IsSynced() const
{
for(size_t i = 0; i < m_workers.size(); i++)
{
m_workers[i]->Wait(); // then wait all workers to finish their jobs
if(!m_workers[i]->IsEmpty())
{
return false;
}
}
m_sync_count++;
return true;
}
int GSRasterizerList::GetPixels(bool reset)
@ -956,24 +988,6 @@ int GSRasterizerList::GetPixels(bool reset)
return pixels;
}
// Dispatches one item to every worker. When the item is flagged as a sync point,
// all workers are waited on first and the sync point counter is incremented.
void GSRasterizerList::Process(shared_ptr<GSRasterizerData>& item)
{
	const size_t workers = m_workers.size();

	if(item->syncpoint)
	{
		for(size_t w = 0; w < workers; w++)
		{
			m_workers[w]->Wait();
		}

		m_syncpoint_count++;
	}

	for(size_t w = 0; w < workers; w++)
	{
		m_workers[w]->Push(item);
	}
}
// GSRasterizerList::GSWorker
GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r)
@ -994,16 +1008,6 @@ int GSRasterizerList::GSWorker::GetPixels(bool reset)
return m_r->GetPixels(reset);
}
void GSRasterizerList::GSWorker::Push(const shared_ptr<GSRasterizerData>& item)
{
GSVector4i r = item->bbox.rintersect(item->scissor);
if(m_r->IsOneOfMyScanlines(r.top, r.bottom))
{
GSJobQueue<shared_ptr<GSRasterizerData> >::Push(item);
}
}
void GSRasterizerList::GSWorker::Process(shared_ptr<GSRasterizerData>& item)
{
m_r->Draw(item.get());

View File

@ -109,6 +109,7 @@ public:
virtual void Queue(shared_ptr<GSRasterizerData> data) = 0;
virtual void Sync() = 0;
virtual bool IsSynced() const = 0;
virtual int GetPixels(bool reset = true) = 0;
};
@ -119,7 +120,7 @@ protected:
IDrawScanline* m_ds;
int m_id;
int m_threads;
uint8* m_myscanline;
uint8* m_scanline;
GSVector4i m_scissor;
GSVector4 m_fscissor_x;
GSVector4 m_fscissor_y;
@ -155,12 +156,12 @@ public:
void Queue(shared_ptr<GSRasterizerData> data);
void Sync() {}
bool IsSynced() const {return true;}
int GetPixels(bool reset);
};
class GSRasterizerList
: public IRasterizer
, private GSJobQueue<shared_ptr<GSRasterizerData> >
{
protected:
class GSWorker : public GSJobQueue<shared_ptr<GSRasterizerData> >
@ -175,17 +176,14 @@ protected:
// GSJobQueue
void Push(const shared_ptr<GSRasterizerData>& item);
void Process(shared_ptr<GSRasterizerData>& item);
};
GSPerfMon* m_perfmon;
vector<GSWorker*> m_workers;
uint8* m_scanline;
GSRasterizerList();
// GSJobQueue
void Process(shared_ptr<GSRasterizerData>& item);
GSRasterizerList(int threads, GSPerfMon* perfmon);
public:
virtual ~GSRasterizerList();
@ -200,7 +198,7 @@ public:
}
else
{
GSRasterizerList* rl = new GSRasterizerList();
GSRasterizerList* rl = new GSRasterizerList(threads, perfmon);
for(int i = 0; i < threads; i++)
{
@ -211,12 +209,10 @@ public:
}
}
int m_sync_count;
int m_syncpoint_count;
// IRasterizer
void Queue(shared_ptr<GSRasterizerData> data);
void Sync();
bool IsSynced() const;
int GetPixels(bool reset);
};

View File

@ -304,6 +304,8 @@ void GSRenderer::VSync(int field)
ResetDevice();
}
m_dev->AgePool();
// osd
if((m_perfmon.GetFrame() & 0x1f) == 0)
@ -332,7 +334,7 @@ void GSRenderer::VSync(int field)
s2.c_str(),
theApp.m_gs_interlace[m_interlace].name.c_str(),
theApp.m_gs_aspectratio[m_aspectratio].name.c_str(),
(int)m_perfmon.Get(GSPerfMon::Quad),
(int)m_perfmon.Get(GSPerfMon::SyncPoint),
(int)m_perfmon.Get(GSPerfMon::Prim),
(int)m_perfmon.Get(GSPerfMon::Draw),
m_perfmon.CPU(),

View File

@ -23,7 +23,7 @@
#include "GSRendererCS.h"
GSRendererCS::GSRendererCS()
: GSRenderer(new GSVertexTraceCS(this), sizeof(GSVertex))
: GSRenderer(new GSVertexTraceDX11(this), sizeof(GSVertexHW11))
{
m_nativeres = true;
@ -41,27 +41,72 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
if(!__super::CreateDevice(dev_unk))
return false;
HRESULT hr;
D3D11_DEPTH_STENCIL_DESC dsd;
D3D11_BLEND_DESC bsd;
D3D11_SAMPLER_DESC sd;
D3D11_BUFFER_DESC bd;
D3D11_TEXTURE2D_DESC td;
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
D3D_FEATURE_LEVEL level;
((GSDeviceDX*)dev_unk)->GetFeatureLevel(level);
if(level < D3D_FEATURE_LEVEL_10_0)
if(level < D3D_FEATURE_LEVEL_11_0)
return false;
HRESULT hr;
GSDevice11* dev = (GSDevice11*)dev_unk;
D3D11_BUFFER_DESC bd;
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
ID3D11DeviceContext* ctx = *dev;
delete dev->CreateRenderTarget(1024, 1024, false);
// empty depth stencil state
memset(&dsd, 0, sizeof(dsd));
dsd.StencilEnable = false;
dsd.DepthEnable = false;
hr = (*dev)->CreateDepthStencilState(&dsd, &m_dss);
if(FAILED(hr)) return false;
// empty blend state
memset(&bsd, 0, sizeof(bsd));
bsd.RenderTarget[0].BlendEnable = false;
hr = (*dev)->CreateBlendState(&bsd, &m_bs);
if(FAILED(hr)) return false;
// point sampler
memset(&sd, 0, sizeof(sd));
sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT;
sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
sd.MaxLOD = FLT_MAX;
sd.MaxAnisotropy = 16;
sd.ComparisonFunc = D3D11_COMPARISON_NEVER;
hr = (*dev)->CreateSamplerState(&sd, &m_ss);
if(FAILED(hr)) return false;
// video memory (4MB)
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = 4 * 1024 * 1024;
bd.StructureByteStride = 4;
bd.Usage = D3D11_USAGE_DEFAULT;
bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
@ -81,35 +126,32 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav);
if(FAILED(hr)) return false;
/*
memset(&td, 0, sizeof(td));
// vertex buffer
td.Width = PAGE_SIZE;
td.Height = MAX_PAGES;
td.Format = DXGI_FORMAT_R8_UINT;
td.MipLevels = 1;
td.ArraySize = 1;
td.SampleDesc.Count = 1;
td.SampleDesc.Quality = 0;
td.Usage = D3D11_USAGE_DEFAULT;
td.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(GSVertex) * 10000;
bd.StructureByteStride = sizeof(GSVertex);
bd.Usage = D3D11_USAGE_DYNAMIC;
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_vb);
hr = (*dev)->CreateTexture2D(&td, NULL, &m_vm);
if(FAILED(hr)) return false;
// index buffer
memset(&uavd, 0, sizeof(uavd));
memset(&bd, 0, sizeof(bd));
uavd.Format = DXGI_FORMAT_R8_UINT;
uavd.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D;
bd.ByteWidth = sizeof(uint32) * 10000 * 3;
bd.Usage = D3D11_USAGE_DYNAMIC;
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_ib);
hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav);
if(FAILED(hr)) return false;
*/
// one page, for copying between cpu<->gpu
memset(&bd, 0, sizeof(bd));
@ -121,10 +163,69 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb);
if(FAILED(hr)) return false;
/*
memset(&td, 0, sizeof(td));
td.Width = PAGE_SIZE;
td.Height = 1;
td.Format = DXGI_FORMAT_R8_UINT;
td.MipLevels = 1;
td.ArraySize = 1;
td.SampleDesc.Count = 1;
td.SampleDesc.Quality = 0;
td.Usage = D3D11_USAGE_STAGING;
td.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
hr = (*dev)->CreateTexture2D(&td, NULL, &m_pb);
if(FAILED(hr)) return false;
*/
// VSConstantBuffer
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(VSConstantBuffer);
bd.Usage = D3D11_USAGE_DEFAULT;
bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_vs_cb);
if(FAILED(hr)) return false;
// PSConstantBuffer
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(PSConstantBuffer);
bd.Usage = D3D11_USAGE_DEFAULT;
bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_ps_cb);
if(FAILED(hr)) return false;
//
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = 14 * sizeof(float) * 200000;
bd.Usage = D3D11_USAGE_DEFAULT;
bd.BindFlags = D3D11_BIND_STREAM_OUTPUT | D3D11_BIND_SHADER_RESOURCE;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_sob);
//
return true;
}
// Forwards the vsync notification for `field` to the base class implementation.
// The commented-out printf is a leftover frame counter trace.
void GSRendererCS::VSync(int field)
{
__super::VSync(field);
//printf("%lld\n", m_perfmon.GetFrame());
}
GSTexture* GSRendererCS::GetOutput(int i)
{
// TODO: create a compute shader which unswizzles the frame from m_vm to the output texture
@ -135,205 +236,342 @@ GSTexture* GSRendererCS::GetOutput(int i)
template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererCS::ConvertVertex(size_t dst_index, size_t src_index)
{
// TODO: vertex format more fitting as the input for the compute shader
GSVertex* s = (GSVertex*)((GSVertexHW11*)m_vertex.buff + src_index);
GSVertexHW11* d = (GSVertexHW11*)m_vertex.buff + dst_index;
if(src_index != dst_index)
GSVector4i v0 = ((GSVector4i*)s)[0];
GSVector4i v1 = ((GSVector4i*)s)[1];
if(tme && fst)
{
GSVertex v = ((GSVertex*)m_vertex.buff)[src_index];
// TODO: modify VertexTrace to read uv from v1.u16[0], v1.u16[1], then this step is not needed
((GSVertex*)m_vertex.buff)[dst_index] = v;
v0 = GSVector4i::cast(GSVector4(v1.uph16()).xyzw(GSVector4::cast(v0))); // uv => st
}
((GSVector4i*)d)[0] = v0;
((GSVector4i*)d)[1] = v1;
}
void GSRendererCS::Draw()
{
HRESULT hr;
GSDrawingEnvironment& env = m_env;
GSDrawingContext* context = m_context;
GSVector2i rtsize(2048, 2048);
GSVector4i scissor = GSVector4i(context->scissor.in).rintersect(GSVector4i(rtsize).zwxy());
GSVector4i bbox = GSVector4i(m_vt->m_min.p.floor().xyxy(m_vt->m_max.p.ceil()));
GSVector4i r = bbox.rintersect(scissor);
uint32 fm = context->FRAME.FBMSK;
uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
if(fm != 0xffffffff)
{
Write(context->offset.fb, r);
// TODO: m_tc->InvalidateVideoMem(context->offset.fb, r, false);
}
if(zm != 0xffffffff)
{
Write(context->offset.zb, r);
// TODO: m_tc->InvalidateVideoMem(context->offset.zb, r, false);
}
if(PRIM->TME)
{
m_mem.m_clut.Read32(context->TEX0, env.TEXA);
GSVector4i r;
GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt->IsLinear());
// TODO: unswizzle pages of r to a texture, check m_vm_valid, bit not set cpu->gpu, set gpu->gpu
// TODO: Write transfer should directly write to m_vm, then Read/Write syncing won't be necessary, clut must be updated with the gpu also
// TODO: tex = m_tc->LookupSource(context->TEX0, env.TEXA, r);
// if(!tex) return;
}
//
GSDevice11* dev = (GSDevice11*)m_dev;
ID3D11DeviceContext* ctx = *dev;
D3D11_BUFFER_DESC bd;
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
D3D11_MAPPED_SUBRESOURCE map;
dev->BeginScene();
CComPtr<ID3D11ShaderResourceView> vb_srv;
CComPtr<ID3D11ShaderResourceView> ib_srv;
// SetupOM
// TODO: cache these in hash_maps
ID3D11UnorderedAccessView* uavs[] = {m_vm_uav};
CComPtr<ID3D11Buffer> fbr, fbc, zbr, zbc;
CComPtr<ID3D11ShaderResourceView> fbr_srv, fbc_srv, zbr_srv, zbc_srv;
dev->OMSetDepthStencilState(m_dss, 0);
dev->OMSetBlendState(m_bs, 0);
dev->OMSetRenderTargets(rtsize, uavs, countof(uavs), &scissor);
// TODO: grow m_vb, m_ib if needed
// SetupIA
if(m_vertex.next > 10000) return;
if(m_index.tail > 30000) return;
D3D11_PRIMITIVE_TOPOLOGY topology;
// TODO: fill/advance/discardwhenfull, as in GSDevice11::IASetVertexBuffer/IASetIndexBuffer
hr = ctx->Map(m_vb, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around
if(FAILED(hr)) return;
memcpy(map.pData, m_vertex.buff, sizeof(GSVertex) * m_vertex.next);
ctx->Unmap(m_vb, 0);
//
hr = ctx->Map(m_ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around
if(FAILED(hr)) return;
memcpy(map.pData, m_index.buff, sizeof(uint32) * m_index.tail);
ctx->Unmap(m_ib, 0);
// TODO: UpdateResource might be faster, based on my exprience with the real vertex buffer, write-no-overwrite/discarded dynamic buffer + map is better
//
memset(&srvd, 0, sizeof(srvd));
srvd.Format = DXGI_FORMAT_UNKNOWN;
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
srvd.Buffer.FirstElement = 0;
srvd.Buffer.NumElements = m_vertex.next;
hr = (*dev)->CreateShaderResourceView(m_vb, &srvd, &vb_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg
memset(&srvd, 0, sizeof(srvd));
srvd.Format = DXGI_FORMAT_R32_UINT;
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
srvd.Buffer.FirstElement = 0;
srvd.Buffer.NumElements = m_index.tail;
hr = (*dev)->CreateShaderResourceView(m_ib, &srvd, &ib_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg
// fzb offsets
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(int) * 4096;
bd.StructureByteStride = sizeof(int);
bd.Usage = D3D11_USAGE_IMMUTABLE;
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
D3D11_SUBRESOURCE_DATA data;
memset(&data, 0, sizeof(data));
data.pSysMem = m_context->offset.fb->pixel.row;
hr = (*dev)->CreateBuffer(&bd, &data, &fbr);
data.pSysMem = m_context->offset.fb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats
hr = (*dev)->CreateBuffer(&bd, &data, &fbc);
data.pSysMem = m_context->offset.zb->pixel.row;
hr = (*dev)->CreateBuffer(&bd, &data, &zbr);
data.pSysMem = m_context->offset.zb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats
hr = (*dev)->CreateBuffer(&bd, &data, &zbc);
// TODO: D3D10_SHADER_MACRO (primclass, less frequently changing drawing attribs, etc.)
uint32 sel = 0; // TODO
hash_map<uint32, CComPtr<ID3D11ComputeShader> >::iterator i = m_cs.find(sel);
CComPtr<ID3D11ComputeShader> cs;
if(i == m_cs.end())
switch(m_vt->m_primclass)
{
// hr = dev->CompileShader(IDR_CS_FX, "cs_main", NULL, &cs);
hr = dev->CompileShader("E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\cs.fx", "cs_main", NULL, &cs);
if(FAILED(hr)) return;
m_cs[sel] = cs;
}
else
{
cs = i->second;
case GS_POINT_CLASS:
topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST;
break;
case GS_TRIANGLE_CLASS:
topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
break;
default:
__assume(0);
}
//
dev->IASetVertexBuffer(m_vertex.buff, sizeof(GSVertexHW11), m_vertex.next);
dev->IASetIndexBuffer(m_index.buff, m_index.tail);
dev->IASetPrimitiveTopology(topology);
dev->CSSetShaderUAV(0, m_vm_uav);
// SetupVS
dev->CSSetShaderSRV(0, vb_srv);
dev->CSSetShaderSRV(1, ib_srv);
dev->CSSetShaderSRV(2, fbr_srv);
dev->CSSetShaderSRV(3, fbc_srv);
dev->CSSetShaderSRV(4, zbr_srv);
dev->CSSetShaderSRV(5, zbc_srv);
VSSelector vs_sel;
dev->CSSetShader(cs);
vs_sel.tme = PRIM->TME;
vs_sel.fst = PRIM->FST;
GSVector4i bbox = GSVector4i(0, 0, 640, 512); // TODO: vertex trace
VSConstantBuffer vs_cb;
GSVector4i r = bbox.ralign<Align_Outside>(GSVector2i(16, 8));
float sx = 2.0f / (rtsize.x << 4);
float sy = 2.0f / (rtsize.y << 4);
//float sx = 1.0f / 16;
//float sy = 1.0f / 16;
float ox = (float)(int)context->XYOFFSET.OFX;
float oy = (float)(int)context->XYOFFSET.OFY;
bool fb = true; // TODO: frame buffer used
bool zb = true; // TODO: z-buffer used
vs_cb.VertexScale = GSVector4(sx, -sy, 0.0f, 0.0f);
vs_cb.VertexOffset = GSVector4(ox * sx + 1, -(oy * sy + 1), 0.0f, -1.0f);
//vs_cb.VertexScale = GSVector4(sx, sy, 0.0f, 0.0f);
//vs_cb.VertexOffset = GSVector4(ox * sx, oy * sy, 0.0f, -1.0f);
if(fb) Write(m_context->offset.fb, r);
if(zb) Write(m_context->offset.zb, r);
{
hash_map<uint32, GSVertexShader11 >::const_iterator i = m_vs.find(vs_sel);
// TODO: constant buffer (frequently chaning drawing attribs)
// TODO: texture (implement texture cache)
// TODO: clut to a palette texture (should be texture1d, not simply buffer, it is random accessed)
// TODO: CSSetShaderSRV(6 7 8 ..., texture level 0 1 2 ...) or use Texture3D?
// TODO: invalidate texture cache
if(i == m_vs.end())
{
string str[2];
/*
CComPtr<ID3D11Query> q;
str[0] = format("%d", vs_sel.tme);
str[1] = format("%d", vs_sel.fst);
D3D11_QUERY_DESC qd;
memset(&qd, 0, sizeof(qd));
qd.Query = D3D11_QUERY_EVENT;
D3D11_SHADER_MACRO macro[] =
{
{"VS_TME", str[0].c_str()},
{"VS_FST", str[1].c_str()},
{NULL, NULL},
};
hr = (*dev)->CreateQuery(&qd, &q);
D3D11_INPUT_ELEMENT_DESC layout[] =
{
{"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 8, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0},
};
ctx->Begin(q);
*/
GSVertexShader11 vs;
printf("[%lld] dispatch %05x %d %05x %d %05x %d %dx%d | %d %d %d\n",
__rdtsc(),
m_context->FRAME.Block(), m_context->FRAME.PSM,
m_context->ZBUF.Block(), m_context->ZBUF.PSM,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
PRIM->PRIM, m_vertex.next, m_index.tail);
dev->CompileShader(IDR_CS_FX, "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il);
GSVector4i rsize = r.rsize();
m_vs[vs_sel] = vs;
dev->Dispatch(rsize.z >> 4, rsize.w >> 3, 1); // TODO: pass upper-left corner offset (r.xy) in a const buffer
i = m_vs.find(vs_sel);
}
/*
ctx->End(q);
ctx->UpdateSubresource(m_vs_cb, 0, NULL, &vs_cb, 0, 0); // TODO: only update if changed
uint64 t0 = __rdtsc();
dev->VSSetShader(i->second.vs, m_vs_cb);
BOOL b;
dev->IASetInputLayout(i->second.il);
}
while(S_OK != ctx->GetData(q, &b, sizeof(BOOL), 0)) {}
// SetupGS
printf("%lld\n", __rdtsc() - t0);
*/
GSSelector gs_sel;
gs_sel.iip = PRIM->IIP;
gs_sel.prim = m_vt->m_primclass;
CComPtr<ID3D11GeometryShader> gs;
{
hash_map<uint32, CComPtr<ID3D11GeometryShader> >::const_iterator i = m_gs.find(gs_sel);
if(i != m_gs.end())
{
gs = i->second;
}
else
{
string str[2];
str[0] = format("%d", gs_sel.iip);
str[1] = format("%d", gs_sel.prim);
D3D11_SHADER_MACRO macro[] =
{
{"GS_IIP", str[0].c_str()},
{"GS_PRIM", str[1].c_str()},
{NULL, NULL},
};
/*
D3D11_SO_DECLARATION_ENTRY layout[] =
{
{0, "SV_Position", 0, 0, 4, 0},
{0, "TEXCOORD", 0, 0, 2, 0},
{0, "TEXCOORD", 1, 0, 4, 0},
{0, "COLOR", 0, 0, 4, 0},
};
*/
dev->CompileShader(IDR_CS_FX, "gs_main", macro, &gs);//, layout, countof(layout));
m_gs[gs_sel] = gs;
}
}
dev->GSSetShader(gs);
// SetupPS
PSSelector ps_sel;
PSConstantBuffer ps_cb;
hash_map<uint32, CComPtr<ID3D11PixelShader> >::const_iterator i = m_ps.find(ps_sel);
if(i == m_ps.end())
{
string str[15];
str[0] = format("%d", 0);
D3D11_SHADER_MACRO macro[] =
{
{"PS_TODO", str[0].c_str()},
{NULL, NULL},
};
CComPtr<ID3D11PixelShader> ps;
dev->CompileShader(IDR_CS_FX, "ps_main", macro, &ps);
m_ps[ps_sel] = ps;
i = m_ps.find(ps_sel);
}
ctx->UpdateSubresource(m_ps_cb, 0, NULL, &ps_cb, 0, 0); // TODO: only update if changed
dev->PSSetSamplerState(m_ss, NULL, NULL);
dev->PSSetShader(i->second, m_ps_cb);
// Offset
OffsetBuffer* fzbo = NULL;
GetOffsetBuffer(&fzbo);
dev->PSSetShaderResourceView(0, fzbo->row_view);
dev->PSSetShaderResourceView(1, fzbo->col_view);
// TODO: 2 palette
// TODO: 3, 4, ... texture levels
//ID3D11Buffer* tmp[] = {m_sob};
//ctx->SOSetTargets(countof(tmp), tmp, NULL);
dev->DrawIndexedPrimitive();
//ctx->SOSetTargets(0, NULL, NULL);
if(0)
{
HRESULT hr;
D3D11_BUFFER_DESC bd;
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = 14 * sizeof(float) * 200000;
bd.Usage = D3D11_USAGE_STAGING;
bd.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
CComPtr<ID3D11Buffer> sob;
hr = (*dev)->CreateBuffer(&bd, NULL, &sob);
ctx->CopyResource(sob, m_sob);
D3D11_MAPPED_SUBRESOURCE map;
if(SUCCEEDED(ctx->Map(sob, 0, D3D11_MAP_READ, 0, &map)))
{
float* f = (float*)map.pData;
for(int i = 0; i < 12; i++, f += 14)
printf("%f %f %f %f\n%f %f\n%f %f %f %f\n%f %f %f %f\n",
f[0], f[1], f[2], f[3],
f[4], f[5],
f[6], f[7], f[8], f[9],
f[10], f[11], f[12], f[13]);
ctx->Unmap(sob, 0);
}
}
if(1)
{
//Read(m_mem.GetOffset(0, 16, PSM_PSMCT32), GSVector4i(0, 0, 1024, 1024), false);
//
if(fm != 0xffffffff) Read(context->offset.fb, r, false);
//
if(zm != 0xffffffff) Read(context->offset.zb, r, false);
std::string s;
s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM);
//
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
s = format("c:\\temp1\\_%05d_f%lld_zt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM);
//
m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
//m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024);
s_n++;
}
dev->EndScene();
}
void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
{
GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated
Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated (important)
// TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target
// TODO: invalidate texture cache
@ -356,6 +594,10 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r)
memset(&box, 0, sizeof(box));
box.right = 1;
box.bottom = 1;
box.back = 1;
uint32* pages = o->GetPages(r);
for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
@ -370,10 +612,20 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r)
m_vm_valid[row] |= col;
box.left = page * PAGE_SIZE;
box.right = box.left + PAGE_SIZE;
box.right = (page + 1) * PAGE_SIZE;
ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + box.left, 0, 0);
ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0);
/*
// m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4
box.left = 0;
box.right = PAGE_SIZE;
box.top = page;
box.bottom = box.top + 1;
ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0);
*/
if(0)
printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page);
}
}
@ -391,6 +643,10 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
memset(&box, 0, sizeof(box));
box.right = 1;
box.bottom = 1;
box.back = 1;
uint32* pages = o->GetPages(r);
for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
@ -402,21 +658,34 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
if(m_vm_valid[row] & col)
{
if(invalidate) m_vm_valid[row] ^= col;
if(invalidate)
{
m_vm_valid[row] ^= col;
}
box.left = page * PAGE_SIZE;
box.right = box.left + PAGE_SIZE;
box.right = (page + 1) * PAGE_SIZE;
ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box);
/*
// m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4
box.left = 0;
box.right = PAGE_SIZE;
box.top = page;
box.bottom = box.top + 1;
ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box);
*/
D3D11_MAPPED_SUBRESOURCE map;
if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ_WRITE, 0, &map)))
if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ, 0, &map)))
{
memcpy(m_mem.m_vm8 + box.left, map.pData, PAGE_SIZE);
memcpy(m_mem.m_vm8 + page * PAGE_SIZE, map.pData, PAGE_SIZE);
ctx->Unmap(m_pb, 0);
if(0)
printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page);
}
}
@ -424,3 +693,64 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
delete [] pages;
}
// Returns, through *fzbo, the cached GPU copies of the row/column tables of the
// current context's frame/z-buffer pixel offset (m_context->offset.fzb).
//
// Entries are cached in m_offset, keyed by fzb->hash. On a cache miss two
// immutable buffers of 2048 GSVector2i elements (row table, column table) and
// one R32G32_SINT shader resource view for each are created and stored.
//
// Returns true on success. Returns false as soon as any D3D object fails to be
// created; in that case nothing is cached and *fzbo is left untouched.
bool GSRendererCS::GetOffsetBuffer(OffsetBuffer** fzbo)
{
	GSDevice11* dev = (GSDevice11*)m_dev;

	hash_map<uint32, OffsetBuffer>::iterator entry = m_offset.find(m_context->offset.fzb->hash);

	if(entry != m_offset.end())
	{
		// already built for this frame/z-buffer combination

		*fzbo = &entry->second;

		return true;
	}

	OffsetBuffer created;

	// both tables share the same buffer description, only the initial data differs

	D3D11_BUFFER_DESC buffer_desc;

	memset(&buffer_desc, 0, sizeof(buffer_desc));

	buffer_desc.ByteWidth = sizeof(GSVector2i) * 2048;
	buffer_desc.Usage = D3D11_USAGE_IMMUTABLE;
	buffer_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;

	D3D11_SUBRESOURCE_DATA initial;

	memset(&initial, 0, sizeof(initial));

	initial.pSysMem = m_context->offset.fzb->row;

	if(FAILED((*dev)->CreateBuffer(&buffer_desc, &initial, &created.row)))
	{
		return false;
	}

	initial.pSysMem = m_context->offset.fzb->col;

	if(FAILED((*dev)->CreateBuffer(&buffer_desc, &initial, &created.col)))
	{
		return false;
	}

	// typed views over the two buffers, one element per table entry

	D3D11_SHADER_RESOURCE_VIEW_DESC view_desc;

	memset(&view_desc, 0, sizeof(view_desc));

	view_desc.Format = DXGI_FORMAT_R32G32_SINT;
	view_desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
	view_desc.Buffer.FirstElement = 0;
	view_desc.Buffer.NumElements = 2048;

	if(FAILED((*dev)->CreateShaderResourceView(created.row, &view_desc, &created.row_view)))
	{
		return false;
	}

	if(FAILED((*dev)->CreateShaderResourceView(created.col, &view_desc, &created.col_view)))
	{
		return false;
	}

	// store a copy in the cache and hand out a pointer to the cached copy, not to the local

	m_offset[m_context->offset.fzb->hash] = created;

	entry = m_offset.find(m_context->offset.fzb->hash);

	*fzbo = &entry->second;

	return true;
}

View File

@ -26,28 +26,105 @@
class GSRendererCS : public GSRenderer
{
class GSVertexTraceCS : public GSVertexTrace
struct VSSelector
{
public:
GSVertexTraceCS(const GSState* state) : GSVertexTrace(state) {}
union
{
struct
{
uint32 tme:1;
uint32 fst:1;
};
uint32 key;
};
operator uint32() {return key & 0x3;}
VSSelector() : key(0) {}
};
__aligned(struct, 32) VSConstantBuffer
{
GSVector4 VertexScale;
GSVector4 VertexOffset;
};
// Key used to look up a compiled geometry shader variant in m_gs.
// The bit-fields overlay `key`, so filling in the fields builds the key directly.
struct GSSelector
{
union
{
struct
{
// 1 bit, filled from PRIM->IIP by Draw(); passed to the shader as the GS_IIP macro
uint32 iip:1;
// 2 bits, filled from m_vt->m_primclass by Draw(); passed to the shader as the GS_PRIM macro
uint32 prim:2;
};
uint32 key;
};
// Only the low three bits of key are returned.
// NOTE(review): assumes the compiler allocates bit-fields starting at the least significant bit, so that 0x7 covers iip and prim - confirm for every supported compiler.
operator uint32() {return key & 0x7;}
// starts with every field cleared
GSSelector() : key(0) {}
};
// Key used to look up a compiled pixel shader variant in m_ps.
// Currently a placeholder: it has a single one-bit field named TODO and no real options yet.
struct PSSelector
{
union
{
struct
{
// placeholder bit, no meaning assigned yet
uint32 TODO:1;
};
uint32 key;
};
// Only the lowest bit of key is returned.
// NOTE(review): assumes the bit-field occupies the least significant bit of key - confirm for every supported compiler.
operator uint32() {return key & 0x1;}
// starts with every field cleared
PSSelector() : key(0) {}
};
__aligned(struct, 32) PSConstantBuffer
{
GSVector4 TODO;
};
CComPtr<ID3D11DepthStencilState> m_dss;
CComPtr<ID3D11BlendState> m_bs;
CComPtr<ID3D11SamplerState> m_ss;
CComPtr<ID3D11Buffer> m_vm;
//CComPtr<ID3D11Texture2D> m_vm;
CComPtr<ID3D11UnorderedAccessView> m_vm_uav;
CComPtr<ID3D11Buffer> m_vb;
CComPtr<ID3D11Buffer> m_ib;
CComPtr<ID3D11Buffer> m_pb;
hash_map<uint32, CComPtr<ID3D11ComputeShader> > m_cs;
uint32 m_vm_valid[16];
CComPtr<ID3D11Buffer> m_pb;
//CComPtr<ID3D11Texture2D> m_pb;
hash_map<uint32, GSVertexShader11 > m_vs;
CComPtr<ID3D11Buffer> m_vs_cb;
hash_map<uint32, CComPtr<ID3D11GeometryShader> > m_gs;
hash_map<uint32, CComPtr<ID3D11PixelShader> > m_ps;
CComPtr<ID3D11Buffer> m_ps_cb;
CComPtr<ID3D11Buffer> m_sob;
void Write(GSOffset* o, const GSVector4i& r);
void Read(GSOffset* o, const GSVector4i& r, bool invalidate);
struct OffsetBuffer
{
CComPtr<ID3D11Buffer> row, col;
CComPtr<ID3D11ShaderResourceView> row_view, col_view;
};
hash_map<uint32, OffsetBuffer> m_offset;
bool GetOffsetBuffer(OffsetBuffer** fzbo);
protected:
template<uint32 prim, uint32 tme, uint32 fst>
void ConvertVertex(size_t dst_index, size_t src_index);
bool CreateDevice(GSDevice* dev);
void VSync(int field);
GSTexture* GetOutput(int i);
void Draw();
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);

View File

@ -233,7 +233,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
}
}
if (env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST)
if(env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST)
{
ps_sel.colclip = 1;
}

View File

@ -101,19 +101,18 @@ void GSRendererHW::Reset()
void GSRendererHW::VSync(int field)
{
GSRenderer::VSync(field);
m_tc->IncAge();
m_dev->AgePool();
m_skip = 0;
if(m_reset)
{
m_tc->RemoveAll();
m_reset = false;
}
GSRenderer::VSync(field);
m_tc->IncAge();
m_skip = 0;
}
void GSRendererHW::ResetDevice()

View File

@ -22,6 +22,10 @@
#include "stdafx.h"
#include "GSRendererSW.h"
#define LOG 0
static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
GSRendererSW::GSRendererSW(int threads)
@ -60,10 +64,9 @@ GSRendererSW::~GSRendererSW()
void GSRendererSW::Reset()
{
// TODO: GSreset can come from the main thread too => crash
// m_tc->RemoveAll();
Sync(-1);
m_reset = true;
m_tc->RemoveAll();
GSRenderer::Reset();
}
@ -72,6 +75,93 @@ void GSRendererSW::VSync(int field)
{
Sync(0); // IncAge might delete a cached texture in use
if(LOG)
{
fprintf(s_fp, "%lld\n", m_perfmon.GetFrame());
GSVector4i dr = GetDisplayRect();
GSVector4i fr = GetFrameRect();
GSVector2i ds = GetDeviceSize();
fprintf(s_fp, "dr %d %d %d %d, fr %d %d %d %d, ds %d %d\n",
dr.x, dr.y, dr.z, dr.w,
fr.x, fr.y, fr.z, fr.w,
ds.x, ds.y);
for(int i = 0; i < 2; i++)
{
if(i == 0 && !m_regs->PMODE.EN1) continue;
if(i == 1 && !m_regs->PMODE.EN2) continue;
fprintf(s_fp, "DISPFB[%d] BP=%05x BW=%d PSM=%d DBX=%d DBY=%d\n",
i,
m_regs->DISP[i].DISPFB.Block(),
m_regs->DISP[i].DISPFB.FBW,
m_regs->DISP[i].DISPFB.PSM,
m_regs->DISP[i].DISPFB.DBX,
m_regs->DISP[i].DISPFB.DBY
);
fprintf(s_fp, "DISPLAY[%d] DX=%d DY=%d DW=%d DH=%d MAGH=%d MAGV=%d\n",
i,
m_regs->DISP[i].DISPLAY.DX,
m_regs->DISP[i].DISPLAY.DY,
m_regs->DISP[i].DISPLAY.DW,
m_regs->DISP[i].DISPLAY.DH,
m_regs->DISP[i].DISPLAY.MAGH,
m_regs->DISP[i].DISPLAY.MAGV
);
}
fprintf(s_fp, "PMODE EN1=%d EN2=%d CRTMD=%d MMOD=%d AMOD=%d SLBG=%d ALP=%d\n",
m_regs->PMODE.EN1,
m_regs->PMODE.EN2,
m_regs->PMODE.CRTMD,
m_regs->PMODE.MMOD,
m_regs->PMODE.AMOD,
m_regs->PMODE.SLBG,
m_regs->PMODE.ALP
);
fprintf(s_fp, "SMODE1 %08x_%08x\n",
m_regs->SMODE1.u32[0],
m_regs->SMODE1.u32[1]
);
fprintf(s_fp, "SMODE2 INT=%d FFMD=%d DPMS=%d\n",
m_regs->SMODE2.INT,
m_regs->SMODE2.FFMD,
m_regs->SMODE2.DPMS
);
fprintf(s_fp, "SRFSH %08x_%08x\n",
m_regs->SRFSH.u32[0],
m_regs->SRFSH.u32[1]
);
fprintf(s_fp, "SYNCH1 %08x_%08x\n",
m_regs->SYNCH1.u32[0],
m_regs->SYNCH1.u32[1]
);
fprintf(s_fp, "SYNCH2 %08x_%08x\n",
m_regs->SYNCH2.u32[0],
m_regs->SYNCH2.u32[1]
);
fprintf(s_fp, "SYNCV %08x_%08x\n",
m_regs->SYNCV.u32[0],
m_regs->SYNCV.u32[1]
);
fprintf(s_fp, "CSR %08x_%08x\n",
m_regs->CSR.u32[0],
m_regs->CSR.u32[1]
);
fflush(s_fp);
}
/*
int draw[8], sum = 0;
@ -87,20 +177,12 @@ void GSRendererSW::VSync(int field)
draw[0], draw[1], draw[2], draw[3], draw[4], draw[5], draw[6], draw[7], sum);
//
printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0;
printf("m_syncpoint_count = %d\n", ((GSRasterizerList*)m_rl)->m_syncpoint_count); ((GSRasterizerList*)m_rl)->m_syncpoint_count = 0;
*/
GSRenderer::VSync(field);
m_tc->IncAge();
if(m_reset)
{
m_tc->RemoveAll();
m_reset = false;
}
// if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats();
}
@ -197,10 +279,6 @@ void GSRendererSW::ConvertVertex(size_t dst_index, size_t src_index)
}
}
#define LOG 0
FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
void GSRendererSW::Draw()
{
SharedData* sd = new SharedData(this);
@ -265,89 +343,18 @@ void GSRendererSW::Draw()
m_tc->InvalidatePages(zb_pages, context->offset.zb->psm);
}
// set data->syncpoint
if(m_fzb != context->offset.fzb)
if(CheckTargetPages(fb_pages, zb_pages, r))
{
// hmm, what if "r" gets bigger next time and slips through unchecked, need to trace that too
sd->syncpoint = true; // TODO
if(!sd->syncpoint)
{
if(fb_pages == NULL)
{
fb_pages = context->offset.fb->GetPages(r);
}
if(CheckTargetPages<0xffffffff>(fb_pages))
{
sd->syncpoint = true;
if(LOG) fprintf(s_fp, "syncpoint 0\n");
}
}
if(!sd->syncpoint)
{
if(zb_pages == NULL)
{
zb_pages = context->offset.zb->GetPages(r);
}
if(CheckTargetPages<0xffffffff>(zb_pages))
{
sd->syncpoint = true;
if(LOG) fprintf(s_fp, "syncpoint 1\n");
}
}
if(!sd->syncpoint)
{
if(LOG) fprintf(s_fp, "no syncpoint *\n");
}
m_fzb = context->offset.fzb;
sd->syncpoint = true;
}
else
{
// chross-check frame and z-buffer pages, they cannot overlap with eachother and with previous batches in queue,
// m_fzb filters out most of these cases, only have to be careful when the addresses stay the same and the output
// is mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300)
if(!sd->syncpoint)
{
if(gd.sel.fwrite)
{
if(CheckTargetPages<0xffff0000>(fb_pages)) // already used as a z-buffer
{
sd->syncpoint = true;
if(LOG) fprintf(s_fp, "syncpoint 2\n");
}
}
}
if(!sd->syncpoint)
{
if(gd.sel.zwrite)
{
if(CheckTargetPages<0x0000ffff>(zb_pages)) // already used as a frame buffer
{
sd->syncpoint = true;
if(LOG) fprintf(s_fp, "syncpoint 3\n");
}
}
}
}
//
sd->UseTargetPages(fb_pages, zb_pages);
//
if(LOG) {fprintf(s_fp, "queue %05x %d %05x %d %05x %d %dx%d | %d %d %d\n",
m_context->FRAME.Block(), m_context->FRAME.PSM,
m_context->ZBUF.Block(), m_context->ZBUF.PSM,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
PRIM->PRIM, sd->vertex_count, sd->index_count); fflush(s_fp);}
if(s_dump)
{
@ -404,12 +411,6 @@ void GSRendererSW::Draw()
}
else
{
if(LOG) fprintf(s_fp, "queue %05x %d %05x %d %05x %d %dx%d | %d %d %d\n",
m_context->FRAME.Block(), m_context->FRAME.PSM,
m_context->ZBUF.Block(), m_context->ZBUF.PSM,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
PRIM->PRIM, sd->vertex_count, sd->index_count);
m_rl->Queue(data);
}
@ -435,36 +436,42 @@ void GSRendererSW::Sync(int reason)
m_rl->Sync();
s_n++;
if(0)
{
std::string s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM);
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
s_n++;
}
t = __rdtsc() - t;
if(LOG) fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, m_rl->GetPixels(), t > 10000000 ? '*' : ' ');
int pixels = m_rl->GetPixels();
m_perfmon.Put(GSPerfMon::Fillrate, m_rl->GetPixels());
if(LOG) {fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, pixels, t > 10000000 ? '*' : ' '); fflush(s_fp);}
m_perfmon.Put(GSPerfMon::Fillrate, pixels);
}
void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
{
GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
uint32* RESTRICT p = m_tmp_pages;
o->GetPages(r, p);
o->GetPages(r, m_tmp_pages);
// check if the changing pages either used as a texture or a target
for(; *p != GSOffset::EOP; p++)
if(!m_rl->IsSynced())
{
uint32 page = *p;
//while(m_fzb_pages[page] | m_tex_pages[page]) _mm_pause();
if(m_fzb_pages[page] | m_tex_pages[page])
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
{
Sync(5);
if(m_fzb_pages[*p] | m_tex_pages[*p])
{
Sync(5);
break;
break;
}
}
}
@ -473,21 +480,20 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
{
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
uint32* RESTRICT p = m_tmp_pages;
o->GetPages(r, p);
for(; *p != GSOffset::EOP; p++)
if(!m_rl->IsSynced())
{
//while(m_fzb_pages[*p]) _mm_pause();
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
if(m_fzb_pages[*p])
o->GetPages(r, m_tmp_pages);
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
{
Sync(6);
if(m_fzb_pages[*p])
{
Sync(6);
break;
break;
}
}
}
}
@ -505,15 +511,16 @@ void GSRendererSW::UsePages(const uint32* pages, int type)
}
else
{
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
if(!m_rl->IsSynced())
{
//while(m_fzb_pages[*p]) _mm_pause();
if(m_fzb_pages[*p]) // currently being drawn to? => sync (could even spin and wait until it hits 0, not sure if it's worth though, or just create 512 condvars? :D)
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
{
Sync(7);
if(m_fzb_pages[*p]) // currently being drawn to? => sync
{
Sync(7);
break;
break;
}
}
}
@ -548,13 +555,150 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type)
}
}
template<uint32 mask> bool GSRendererSW::CheckTargetPages(const uint32* pages)
bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r)
{
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
bool synced = m_rl->IsSynced();
if(m_fzb != m_context->offset.fzb4)
{
if(mask != 0xffffffff ? (m_fzb_pages[*p] & mask) : m_fzb_pages[*p])
// targets changed, check everything
m_fzb = m_context->offset.fzb4;
m_fzb_bbox = r;
if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r);
if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r);
memset(m_fzb_cur_pages, 0, sizeof(m_fzb_cur_pages));
uint32 used = 0;
for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
{
return true;
uint32 i = *p;
uint32 row = i >> 5;
uint32 col = 1 << (i & 31);
m_fzb_cur_pages[row] |= col;
used |= m_fzb_pages[i];
}
for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
{
uint32 i = *p;
uint32 row = i >> 5;
uint32 col = 1 << (i & 31);
m_fzb_cur_pages[row] |= col;
used |= m_fzb_pages[i];
}
if(!synced)
{
if(used)
{
if(LOG) {fprintf(s_fp, "syncpoint 0\n"); fflush(s_fp);}
return true;
}
if(LOG) {fprintf(s_fp, "no syncpoint *\n"); fflush(s_fp);}
}
}
else
{
// same target, only check new areas and cross-rendering between frame and z-buffer
GSVector4i bbox = m_fzb_bbox.runion(r);
bool check = !m_fzb_bbox.eq(bbox);
m_fzb_bbox = bbox;
if(check)
{
// drawing area is larger than previous time, check new parts only to avoid false positives (m_fzb_cur_pages guards)
if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r);
if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r);
uint32 used = 0;
for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
{
uint32 i = *p;
uint32 row = i >> 5;
uint32 col = 1 << (i & 31);
if((m_fzb_cur_pages[row] & col) == 0)
{
m_fzb_cur_pages[row] |= col;
used |= m_fzb_pages[i];
}
}
for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
{
uint32 i = *p;
uint32 row = i >> 5;
uint32 col = 1 << (i & 31);
if((m_fzb_cur_pages[row] & col) == 0)
{
m_fzb_cur_pages[row] |= col;
used |= m_fzb_pages[i];
}
}
if(!synced)
{
if(used)
{
if(LOG) {fprintf(s_fp, "syncpoint 1\n"); fflush(s_fp);}
return true;
}
}
}
if(!synced)
{
// cross-check frame and z-buffer pages, they cannot overlap with each other and with previous batches in queue,
// have to be careful when the two buffers are mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300)
if(fb_pages)
{
for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
{
if(m_fzb_pages[*p] & 0xffff0000)
{
if(LOG) {fprintf(s_fp, "syncpoint 2\n"); fflush(s_fp);}
return true;
}
}
}
if(zb_pages)
{
for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
{
if(m_fzb_pages[*p] & 0x0000ffff)
{
if(LOG) {fprintf(s_fp, "syncpoint 3\n"); fflush(s_fp);}
return true;
}
}
}
}
}
@ -577,8 +721,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
gd.zbr = context->offset.zb->pixel.row;
gd.fbc = context->offset.fb->pixel.col[0];
gd.zbc = context->offset.zb->pixel.col[0];
gd.fzbr = context->offset.fzb->row;
gd.fzbc = context->offset.fzb->col;
gd.fzbr = context->offset.fzb4->row;
gd.fzbc = context->offset.fzb4->col;
gd.sel.key = 0;
@ -1117,8 +1261,8 @@ GSRendererSW::SharedData::~SharedData()
}
}
delete m_fb_pages;
delete m_zb_pages;
delete [] m_fb_pages;
delete [] m_zb_pages;
for(size_t i = 0; i < countof(m_tex_pages) && m_tex_pages[i] != NULL; i++)
{
@ -1153,8 +1297,10 @@ void GSRendererSW::SharedData::UseSourcePages(GSTextureCacheSW::Texture* t, int
{
ASSERT(m_tex_pages[level] == NULL);
m_tex_pages[level] = t->m_pages.n;
const uint32* pages = t->m_pages.n;
m_tex_pages[level] = pages;
m_tex_pages[level + 1] = NULL;
m_parent->UsePages(t->m_pages.n, 2);
m_parent->UsePages(pages, 2);
}

View File

@ -48,8 +48,9 @@ protected:
GSTextureCacheSW* m_tc;
GSTexture* m_texture[2];
uint8* m_output;
bool m_reset;
GSPixelOffset4* m_fzb;
GSVector4i m_fzb_bbox;
uint32 m_fzb_cur_pages[16];
uint32 m_fzb_pages[512]; // uint16 frame/zbuf pages interleaved
uint16 m_tex_pages[512];
uint32 m_tmp_pages[512 + 1];
@ -66,7 +67,7 @@ protected:
void UsePages(const uint32* pages, int type);
void ReleasePages(const uint32* pages, int type);
template<uint32 mask> bool CheckTargetPages(const uint32* pages);
bool CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r);
bool GetScanlineGlobalData(SharedData* data);

View File

@ -209,6 +209,9 @@ void GSState::SetFrameSkip(int skip)
void GSState::Reset()
{
printf("GS reset\n");
memset(m_mem.m_vm8, 0, m_mem.m_vmsize);
memset(&m_path[0], 0, sizeof(m_path[0]) * countof(m_path));
memset(&m_v, 0, sizeof(m_v));
@ -253,6 +256,7 @@ void GSState::ResetHandlers()
m_fpGIFRegHandlerXYZ[P][1] = &GSState::GIFRegHandlerXYZF2<P, 1>; \
m_fpGIFRegHandlerXYZ[P][2] = &GSState::GIFRegHandlerXYZ2<P, 0>; \
m_fpGIFRegHandlerXYZ[P][3] = &GSState::GIFRegHandlerXYZ2<P, 1>; \
m_fpGIFPackedRegHandlerSTQRGBAXYZF2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZF2<P>; \
SetHandlerXYZ(GS_POINTLIST);
SetHandlerXYZ(GS_LINELIST);
@ -546,6 +550,36 @@ void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r)
{
}
// Bulk handler for packed GIF data laid out as repeating STQ, RGBA, XYZF2 triples.
//
// r    - first packed register of the run
// size - number of packed registers in the run; must be a non-zero multiple of 3
//
// Each triple is converted into the two halves of m_v (m_v.m[0] and m_v.m[1]) and
// then submitted with VertexKick<prim>, so one call emits size / 3 vertices.
template<uint32 prim>
void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size)
{
ASSERT(size > 0 && size % 3 == 0);
const GIFPackedReg* RESTRICT r_end = r + size;
while(r < r_end)
{
// r[0] is the STQ register: S/T are read from its first 64-bit half, Q from its second
GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
// r[1] is the RGBA register: keep the low byte of each 32-bit lane, then narrow twice
// NOTE(review): presumably ps32().pu16() packs the four components into one 32-bit RGBA value - confirm against GSVector4i
GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
// first half of the vertex: ST in the low 64 bits, then RGBA and Q interleaved above it
m_v.m[0] = st.upl64(rgba.upl32(q));
// r[2] is the XYZF2 register: X/Y are read from its first 64-bit half, Z/F from its second
GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]);
GSVector4i zf = GSVector4i::loadl(&r[2].u64[1]);
// NOTE(review): appears to bring the 16-bit X and Y next to each other and append the current m_v.UV value - confirm lane layout
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV));
// shift both lanes right by 4, then mask the first lane with 0x00ffffff and the second with 0x000000ff
// NOTE(review): presumably these are the 24-bit Z and 8-bit fog fields - confirm against the packed XYZF2 layout
zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
// second half of the vertex: xy and zf interleaved per 32-bit lane
m_v.m[1] = xy.upl32(zf);
// the kick receives the skip flag taken from the XYZF2 register
VertexKick<prim>(r[2].XYZF2.Skip());
// advance to the next STQ/RGBA/XYZF2 triple
r += 3;
}
// r now equals r_end, so r[-3] is the STQ register of the last triple processed
m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time
}
// GIFRegHandler*
void GSState::GIFRegHandlerNull(const GIFReg* RESTRICT r)
@ -1037,7 +1071,8 @@ template<int i> void GSState::GIFRegHandlerFRAME(const GIFReg* RESTRICT r)
{
m_env.CTXT[i].offset.fb = m_mem.GetOffset(r->FRAME.Block(), r->FRAME.FBW, r->FRAME.PSM);
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), r->FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(r->FRAME, m_env.CTXT[i].ZBUF);
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF);
}
m_env.CTXT[i].FRAME = (GSVector4i)r->FRAME;
@ -1075,7 +1110,8 @@ template<int i> void GSState::GIFRegHandlerZBUF(const GIFReg* RESTRICT r)
if((m_env.CTXT[i].ZBUF.u32[0] ^ ZBUF.u32[0]) & 0x3f0001ff) // ZBP PSM
{
m_env.CTXT[i].offset.zb = m_mem.GetOffset(ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, ZBUF.PSM);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, ZBUF);
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF);
}
m_env.CTXT[i].ZBUF = (GSVector4i)ZBUF;
@ -1726,8 +1762,28 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
{
size -= total;
if(path.adonly)
switch(path.type)
{
case GIFPath::TYPE_UNKNOWN:
{
uint32 reg = 0;
do
{
(this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem);
mem += sizeof(GIFPackedReg);
reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg
}
while(--total > 0);
}
break;
case GIFPath::TYPE_ADONLY: // very common
do
{
(this->*m_fpGIFRegHandlers[((GIFPackedReg*)mem)->A_D.ADDR])(&((GIFPackedReg*)mem)->r);
@ -1735,20 +1791,20 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
mem += sizeof(GIFPackedReg);
}
while(--total > 0);
}
else
{
uint32 reg = 0;
do
{
(this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem);
break;
mem += sizeof(GIFPackedReg);
case GIFPath::TYPE_STQRGBAXYZF2: // majority of the vertices are formatted like this
reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg
}
while(--total > 0);
(this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2])((GIFPackedReg*)mem, total);
mem += total * sizeof(GIFPackedReg);
break;
default:
__assume(0);
}
path.nloop = 0;
@ -2070,7 +2126,8 @@ int GSState::Defrost(const GSFreezeData* fd)
m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
m_env.CTXT[i].offset.tex = m_mem.GetOffset(m_env.CTXT[i].TEX0.TBP0, m_env.CTXT[i].TEX0.TBW, m_env.CTXT[i].TEX0.PSM);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
}
UpdateScissor();
@ -2116,6 +2173,8 @@ void GSState::UpdateVertexKick()
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = m_fpGIFRegHandlerXYZ[prim][2];
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = m_fpGIFRegHandlerXYZ[prim][3];
m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[prim];
m_cvf = m_cv[prim][PRIM->TME][PRIM->FST];
}

View File

@ -59,6 +59,13 @@ class GSState : public GSAlignedClass<32>
GIFRegHandler m_fpGIFRegHandlers[256];
GIFRegHandler m_fpGIFRegHandlerXYZ[8][4];
typedef void (GSState::*GIFPackedRegHandlerC)(const GIFPackedReg* RESTRICT r, uint32 size);
GIFPackedRegHandlerC m_fpGIFPackedRegHandlersC[1];
GIFPackedRegHandlerC m_fpGIFPackedRegHandlerSTQRGBAXYZF2[8];
template<uint32 prim> void GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size);
template<int i> void ApplyTEX0(GIFRegTEX0& TEX0);
void ApplyPRIM(const GIFRegPRIM& PRIM);

View File

@ -167,6 +167,18 @@ GSTexture11::operator ID3D11ShaderResourceView*()
return m_srv;
}
// Converts the texture to its unordered access view.
// The view is created on first use and kept in m_uav; creation is only attempted
// when both the device and the texture are set. If the view could not be created,
// or the device/texture is missing, the returned pointer is whatever m_uav holds (null).
GSTexture11::operator ID3D11UnorderedAccessView*()
{
	if(m_uav)
	{
		// already created by an earlier call

		return m_uav;
	}

	if(m_dev && m_texture)
	{
		ASSERT(!m_msaa);

		m_dev->CreateUnorderedAccessView(m_texture, NULL, &m_uav);
	}

	return m_uav;
}
GSTexture11::operator ID3D11RenderTargetView*()
{
ASSERT(m_dev);

View File

@ -30,6 +30,7 @@ class GSTexture11 : public GSTexture
CComPtr<ID3D11Texture2D> m_texture;
D3D11_TEXTURE2D_DESC m_desc;
CComPtr<ID3D11ShaderResourceView> m_srv;
CComPtr<ID3D11UnorderedAccessView> m_uav;
CComPtr<ID3D11RenderTargetView> m_rtv;
CComPtr<ID3D11DepthStencilView> m_dsv;
@ -43,6 +44,7 @@ public:
operator ID3D11Texture2D*();
operator ID3D11ShaderResourceView*();
operator ID3D11UnorderedAccessView*();
operator ID3D11RenderTargetView*();
operator ID3D11DepthStencilView*();
};

View File

@ -281,6 +281,8 @@ GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int
{
return NULL;
}
m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); // new frame buffers after reset should be cleared, don't display memory garbage
}
else
{

View File

@ -30,6 +30,7 @@ WakeAllConditionVariablePtr pWakeAllConditionVariable;
SleepConditionVariableSRWPtr pSleepConditionVariableSRW;
InitializeSRWLockPtr pInitializeSRWLock;;
AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive;
TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;
ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
class InitCondVar
@ -47,6 +48,7 @@ public:
pSleepConditionVariableSRW = (SleepConditionVariableSRWPtr)GetProcAddress(m_kernel32, "SleepConditionVariableSRW");
pInitializeSRWLock = (InitializeSRWLockPtr)GetProcAddress(m_kernel32, "InitializeSRWLock");
pAcquireSRWLockExclusive = (AcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "AcquireSRWLockExclusive");
pTryAcquireSRWLockExclusive = (TryAcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockExclusive");
pReleaseSRWLockExclusive = (ReleaseSRWLockExclusivePtr)GetProcAddress(m_kernel32, "ReleaseSRWLockExclusive");
}

View File

@ -21,6 +21,8 @@
#pragma once
#include "GSdx.h"
#ifdef _WINDOWS
typedef void (WINAPI * InitializeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable);
@ -29,7 +31,7 @@ typedef void (WINAPI * WakeAllConditionVariablePtr)(CONDITION_VARIABLE* Conditio
typedef void (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags);
typedef void (WINAPI * InitializeSRWLockPtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * AcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock);
typedef BOOLEAN (WINAPI * TryAcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock);
extern InitializeConditionVariablePtr pInitializeConditionVariable;
extern WakeConditionVariablePtr pWakeConditionVariable;
@ -37,7 +39,7 @@ extern WakeAllConditionVariablePtr pWakeAllConditionVariable;
extern SleepConditionVariableSRWPtr pSleepConditionVariableSRW;
extern InitializeSRWLockPtr pInitializeSRWLock;;
extern AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive;
extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
extern TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
class GSThread
{
@ -92,7 +94,7 @@ public:
GSCondVarLock() {pInitializeSRWLock(&m_lock);}
void Lock() {pAcquireSRWLockExclusive(&m_lock);}
void Unlock() {pReleaseSRWLockExclusive(&m_lock);}
bool TryLock() {return pTryAcquireSRWLockExclusive(&m_lock) == TRUE;} void Unlock() {pReleaseSRWLockExclusive(&m_lock);}
operator SRWLOCK* () {return &m_lock;}
};
@ -114,7 +116,6 @@ public:
#include <pthread.h>
#include <semaphore.h>
#include "GSdx.h"
class GSThread
{
@ -191,6 +192,7 @@ public:
}
void Lock() {pthread_mutex_lock(&m_mutex);}
bool TryLock() {return pthread_mutex_trylock(&m_mutex) == 0;}
void Unlock() {pthread_mutex_unlock(&m_mutex);}
operator pthread_mutex_t* () {return &m_mutex;}
@ -254,10 +256,10 @@ public:
template<class T> class GSJobQueue : private GSThread
{
protected:
int m_count;
queue<T> m_queue;
volatile long m_count; // NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent)
volatile bool m_exit;
struct {GSCritSec lock; GSEvent notempty; volatile long count;} m_ev;
struct {GSCritSec lock; GSEvent notempty;} m_ev;
struct {GSCondVar notempty, empty; GSCondVarLock lock; bool available;} m_cv;
void ThreadProc()
@ -285,6 +287,8 @@ protected:
m_queue.pop();
m_count--;
if(m_queue.empty())
{
m_cv.empty.Set();
@ -318,7 +322,7 @@ protected:
m_queue.pop();
_InterlockedDecrement(&m_ev.count);
m_count--;
}
}
}
@ -328,16 +332,14 @@ public:
: m_count(0)
, m_exit(false)
{
m_ev.count = 0;
m_cv.available = !!theApp.GetConfig("condvar", 1);
#ifdef _WINDOWS
m_cv.available = pInitializeConditionVariable != NULL;
#elif defined(_LINUX)
//m_cv.available = true;
m_cv.available = !!theApp.GetConfig("condvar", 1);
if(pInitializeConditionVariable == NULL)
{
m_cv.available = false;
}
#endif
@ -358,12 +360,14 @@ public:
}
}
int GetCount() const
bool IsEmpty() const
{
return m_count;
ASSERT(m_count >= 0);
return m_count == 0;
}
virtual void Push(const T& item)
void Push(const T& item)
{
if(m_cv.available)
{
@ -371,6 +375,8 @@ public:
m_queue.push(item);
m_count++;
m_cv.lock.Unlock();
m_cv.notempty.Set();
@ -381,35 +387,34 @@ public:
m_queue.push(item);
_InterlockedIncrement(&m_ev.count);
m_count++;
m_ev.notempty.Set();
}
m_count++;
}
virtual void Wait()
void Wait()
{
if(m_cv.available)
{
m_cv.lock.Lock();
while(!m_queue.empty())
if(m_count > 0)
{
m_cv.empty.Wait(m_cv.lock);
}
m_cv.lock.Lock();
m_cv.lock.Unlock();
while(!m_queue.empty())
{
m_cv.empty.Wait(m_cv.lock);
}
ASSERT(m_count == 0);
m_cv.lock.Unlock();
}
}
else
{
// NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent)
while(m_ev.count > 0) _mm_pause();
while(m_count > 0) _mm_pause();
}
m_count++;
}
virtual void Process(T& item) = 0;

View File

@ -1024,6 +1024,10 @@
RelativePath=".\GSRenderer.cpp"
>
</File>
<File
RelativePath=".\GSRendererCS.cpp"
>
</File>
<File
RelativePath=".\GSRendererDX.cpp"
>
@ -1630,6 +1634,10 @@
RelativePath=".\GSRenderer.h"
>
</File>
<File
RelativePath=".\GSRendererCS.h"
>
</File>
<File
RelativePath=".\GSRendererDX.h"
>

View File

@ -1,73 +1,270 @@
struct Vertex
#ifndef VS_TME
#define VS_TME 1
#define VS_FST 1
#endif
#ifndef GS_IIP
#define GS_IIP 0
#define GS_PRIM 3
#endif
//
globallycoherent RWByteAddressBuffer VideoMemory : register(u0);
//globallycoherent RWTexture2D<uint> VideoMemory : register(u0); // 8192 * 512 R8_UINT
Buffer<int2> FZBufRow : register(t0);
Buffer<int2> FZBufCol : register(t1);
Texture2D<float4> Palette : register(t2);
Texture2D<float4> TextureL0 : register(t3);
Texture2D<float4> TextureL1 : register(t4);
Texture2D<float4> TextureL2 : register(t5);
Texture2D<float4> TextureL3 : register(t6);
Texture2D<float4> TextureL4 : register(t7);
Texture2D<float4> TextureL5 : register(t8);
Texture2D<float4> TextureL6 : register(t9);
cbuffer VSConstantBuffer : register(c0)
{
float2 st;
uint c;
float q;
uint xy, z;
uint uv, f;
float4 VertexScale;
float4 VertexOffset;
};
RWByteAddressBuffer VideoMemory : register(u0);
StructuredBuffer<Vertex> VertexBuffer : register(t0);
Buffer<uint> IndexBuffer : register(t1);
Buffer<int> FrameRowOffset : register(t2);
Buffer<int> FrameColOffset : register(t3);
Buffer<int> ZBufRowOffset : register(t4);
Buffer<int> ZBufColOffset : register(t5);
cbuffer DrawingEnvironment : register(c0)
cbuffer PSConstantBuffer : register(c0)
{
// TODO
};
// one group is 16x8 pixels and one thread does 2 pixels, otherwise could not read-merge-write 16-bit targets safely
// neighbouring pixels are next to each other in memory, at least we don't have to calculate the address twice
// TODO: they say groupshared memory is faster, try unswizzling the corresponding chunk of memory initially (how to do that once by only one thread?) then write-back when finished, unless it was untouched
[numthreads(8, 8, 1)]
void cs_main(uint3 gid : SV_GroupID, uint3 tid : SV_GroupThreadID)
struct VS_INPUT
{
uint count;
uint2 p : POSITION0;
uint z : POSITION1;
float2 st : TEXCOORD0;
float q : TEXCOORD1;
uint2 uv : TEXCOORD2;
float4 c : COLOR0;
float4 f : COLOR1;
};
IndexBuffer.GetDimensions(count);
// Per-vertex data produced by vs_main and consumed by the gs_main variants.
struct VS_OUTPUT
{
// position, computed by vs_main as input.p * VertexScale - VertexOffset
float4 p : SV_Position;
// input.z split into two floats: x = low 16 bits, y = high 16 bits
float2 z : TEXCOORD0;
// xy = texture coordinate (uv or st), z = input.f.r, w = q (1.0f when VS_FST is set or VS_TME is off)
float4 t : TEXCOORD1;
// vertex color, copied from input.c
float4 c : COLOR0;
};
// #if GS_PRIM == 2 (triangle)
// Per-vertex data emitted by the gs_main variants and read by ps_main.
// Same fields as VS_OUTPUT plus the primitive id.
struct GS_OUTPUT
{
// position, passed through from VS_OUTPUT.p
float4 p : SV_Position;
// 32-bit depth as two 16-bit halves (x = low, y = high); ps_main recombines it as y * 0x10000 + x
float2 z : TEXCOORD0;
// texture coordinate data, passed through from VS_OUTPUT.t
float4 t : TEXCOORD1;
// vertex color; gs_main may overwrite it with another vertex's color when GS_IIP == 0
float4 c : COLOR0;
// SV_PrimitiveID value received by gs_main, stored on every emitted vertex
uint id : SV_PrimitiveID;
};
for(uint i = 0; i < count; i += 3)
VS_OUTPUT vs_main(VS_INPUT input)
{
VS_OUTPUT output;
output.p = float4(input.p, 0.0f, 0.0f) * VertexScale - VertexOffset;
output.z = float2(input.z & 0xffff, input.z >> 16);
if(VS_TME)
{
Vertex v0 = VertexBuffer[IndexBuffer[i + 0]];
Vertex v1 = VertexBuffer[IndexBuffer[i + 1]];
Vertex v2 = VertexBuffer[IndexBuffer[i + 2]];
uint x = gid.x + tid.x * 2;
uint y = gid.y + tid.y;
uint fa = FrameRowOffset[y] + FrameColOffset[x];
uint za = ZBufRowOffset[y] + ZBufColOffset[x];
// TODO: quickly reject if x, y is outside the triangle
// TODO: calculate interpolated values at x, y
// TODO: run the GS pipeline
// TODO: repeat for x+1, y
// TODO: output two pixels (might be better to process a single pixel, more threads, if there is no 16-bit target involved)
// testing...
uint4 c = VideoMemory.Load4(fa); // does this load 4*4 bytes? or 4 bytes each expanded uint?
c = (v0.c >> uint4(0, 8, 16, 24)) & 0xff; // => ushr r1.yzw, r1.xxxx, l(0, 8, 16, 24), v0.c auto-converted to uint4 and per-component shift in one instruction, SSE is embarrassed
VideoMemory.Store4(fa, c); // same question, 4*4 bytes or compressed to uint
if(VS_FST)
{
output.t.xy = input.uv;
output.t.w = 1.0f;
}
else
{
output.t.xy = input.st;
output.t.w = input.q;
}
}
else
{
output.t.xy = 0;
output.t.w = 1.0f;
}
// #endif
output.c = input.c;
output.t.z = input.f.r;
return output;
}
// TODO: DrawPoint (this is going to be a waste of resources)
// TODO: DrawLine (line hit-test, will it work?)
// TODO: DrawSprite (similar to DrawTriangle)
// TODO: if read-backs are too slow, implement GSState::Write/FlushWrite/Read/clut.Write in a compute shader
// TODO: unswizzle pages from VideoMemory to the texture cache (if they are marked as valid, otherwise upload from GSLocalMemory::m_vm8)
#if GS_PRIM == 0
[maxvertexcount(1)]
void gs_main(point VS_OUTPUT input[1], inout PointStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
{
	// Point primitive: forward the single vertex unchanged and tag it with the primitive id.

	const VS_OUTPUT v = input[0];

	GS_OUTPUT o;

	o.id = id;
	o.p = v.p;
	o.z = v.z;
	o.t = v.t;
	o.c = v.c;

	stream.Append(o);
}
#elif GS_PRIM == 1
[maxvertexcount(2)]
void gs_main(line VS_OUTPUT input[2], inout LineStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
{
	// Line primitive: forward both vertices and tag each with the primitive id.

	for(int v = 0; v < 2; v++)
	{
		GS_OUTPUT o;

		o.id = id;
		o.p = input[v].p;
		o.z = input[v].z;
		o.t = input[v].t;

#if GS_IIP == 0
		// GS_IIP == 0: every vertex takes the color of the last vertex of the line
		o.c = input[1].c;
#else
		o.c = input[v].c;
#endif

		stream.Append(o);
	}
}
#elif GS_PRIM == 2
[maxvertexcount(3)]
void gs_main(triangle VS_OUTPUT input[3], inout TriangleStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
{
	// Triangle primitive: forward the three vertices and tag each with the primitive id.

	for(int i = 0; i < 3; i++)
	{
		GS_OUTPUT output;

		output.p = input[i].p;
		output.z = input[i].z;
		output.t = input[i].t;
		output.id = id;

#if GS_IIP == 0
		// GS_IIP == 0: all three vertices must carry the color of the last vertex (input[2]).
		// The previous guard "i != 1" left vertex 1 with its own color, so the triangle
		// was not uniformly colored; assigning unconditionally covers every vertex.
		output.c = input[2].c;
#else
		output.c = input[i].c;
#endif

		stream.Append(output);
	}
}
#elif GS_PRIM == 3
[maxvertexcount(4)]
void gs_main(line VS_OUTPUT input[2], inout TriangleStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
{
	// Sprite: the two input vertices are opposite corners (left-top, right-bottom).
	// Expand them into a 4-vertex strip; z and t.zw always come from the second vertex.

	const VS_OUTPUT first = input[0];
	const VS_OUTPUT second = input[1];

	// right-bottom corner is the second vertex as-is

	GS_OUTPUT rb;

	rb.id = id;
	rb.p = second.p;
	rb.z = second.z;
	rb.t = second.t;
	rb.c = second.c;

	// left-top corner: position and t.xy from the first vertex, the rest from the second

	GS_OUTPUT lt;

	lt.id = id;
	lt.p = first.p;
	lt.z = second.z;
	lt.t = float4(first.t.xy, second.t.zw);

#if GS_IIP == 0
	lt.c = second.c;
#else
	lt.c = first.c;
#endif

	// the remaining two corners swap the y components of position and texture coordinate

	GS_OUTPUT lb = lt;

	lb.p.y = rb.p.y;
	lb.t.y = rb.t.y;

	GS_OUTPUT rt = rb;

	rt.p.y = lt.p.y;
	rt.t.y = lt.t.y;

	stream.Append(lt);
	stream.Append(lb);
	stream.Append(rt);
	stream.Append(rb);
}
#endif
// Packs a float4 color into one uint: each component is scaled by 0xff, truncated to uint,
// and placed in its own byte (r = bits 0-7, g = 8-15, b = 16-23, a = 24-31).
uint CompressColor(float4 f)
{
	// is there a faster way?

	uint4 bytes = (uint4)(f * 0xff);

	return (bytes.r << 0) | (bytes.g << 8) | (bytes.b << 16) | (bytes.a << 24);
}
// Pixel shader: does a manual depth test and writes color and depth straight into
// the VideoMemory byte-address buffer. It returns nothing, so there is no render-target output.
void ps_main(GS_OUTPUT input)
{
// pack the interpolated color into one uint (see CompressColor)
uint c = CompressColor(input.c);
// rebuild the 32-bit depth from its two 16-bit halves (z.x = low, z.y = high)
uint z = (uint)(input.z.y * 0x10000 + input.z.x);
// integer pixel coordinates from SV_Position
uint x = (uint)input.p.x;
uint y = (uint)input.p.y;
// row + column lookup; .x is used below as the color address and .y as the depth address
uint2 addr = FZBufRow[y] + FZBufCol[x]; // 16-bit address
// NOTE(review): 'unaligned' is computed but never read in the active code below
uint2 unaligned = addr.xy & 1; // 16-bit formats can address into the middle of an uint (smallest word size for VideoMemory)
// drop the low bit and scale by 2 to get a 4-byte aligned byte offset
addr = (addr & ~1) * 2;
//DeviceMemoryBarrier();
// depth test against the value currently stored: fragments with smaller z are dropped
// NOTE(review): the load / compare / store sequence is not atomic and the barrier above is
// commented out; presumably overlapping fragments can race here -- confirm
uint zd = VideoMemory.Load(addr.y);
if(z < zd) discard;
VideoMemory.Store(addr.y, z);
VideoMemory.Store(addr.x, c);
// Disabled alternative below: indexes VideoMemory with 2D coordinates one byte at a time;
// presumably matches the commented-out RWTexture2D<uint> declaration of VideoMemory -- confirm
/*
addr <<= 1;
uint2 fa0 = uint2(addr.x & 0x1fff, addr.x >> 13);
uint2 fa1 = fa0 + uint2(1, 0);
uint2 fa2 = fa0 + uint2(2, 0);
uint2 fa3 = fa0 + uint2(3, 0);
uint2 za0 = uint2(addr.y & 0x1fff, addr.y >> 13);
uint2 za1 = za0 + uint2(1, 0);
uint2 za2 = za0 + uint2(2, 0);
uint2 za3 = za0 + uint2(3, 0);
DeviceMemoryBarrier();
uint zd =
(VideoMemory[za0] << 0) |
(VideoMemory[za1] << 8) |
(VideoMemory[za2] << 16) |
(VideoMemory[za3] << 24);
if(zd >= z) discard;
VideoMemory[za0] = (z >> 0) & 0xff;
VideoMemory[za1] = (z >> 8) & 0xff;
VideoMemory[za2] = (z >> 16) & 0xff;
VideoMemory[za3] = (z >> 24) & 0xff;
DeviceMemoryBarrier();
VideoMemory[fa0] = (c >> 0) & 0xff;
VideoMemory[fa1] = (c >> 8) & 0xff;
VideoMemory[fa2] = (c >> 16) & 0xff;
VideoMemory[fa3] = (c >> 24) & 0xff;
*/
}