From 9aabcc1701ea32315129a1ca26034c759c105979 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Wed, 18 Jan 2012 11:47:31 +0000 Subject: [PATCH] GSdx: added a shortcut in GSState::Transfer for the most frequent vertex format I found (helps quite a lot), less thread-syncing for the sw renderer, and the bios boot logo was fixed (just had to clear the memory on reset). git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5072 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GPULocalMemory.cpp | 6 +- plugins/GSdx/GS.cpp | 2 +- plugins/GSdx/GS.h | 13 +- plugins/GSdx/GSDevice11.cpp | 100 ++++- plugins/GSdx/GSDevice11.h | 6 +- plugins/GSdx/GSDrawingContext.h | 3 +- plugins/GSdx/GSLocalMemory.cpp | 56 +++ plugins/GSdx/GSLocalMemory.h | 12 + plugins/GSdx/GSPerfMon.h | 2 +- plugins/GSdx/GSRasterizer.cpp | 100 ++--- plugins/GSdx/GSRasterizer.h | 20 +- plugins/GSdx/GSRenderer.cpp | 6 +- plugins/GSdx/GSRendererCS.cpp | 706 +++++++++++++++++++++++--------- plugins/GSdx/GSRendererCS.h | 95 ++++- plugins/GSdx/GSRendererDX.cpp | 2 +- plugins/GSdx/GSRendererHW.cpp | 13 +- plugins/GSdx/GSRendererSW.cpp | 428 ++++++++++++------- plugins/GSdx/GSRendererSW.h | 5 +- plugins/GSdx/GSState.cpp | 89 +++- plugins/GSdx/GSState.h | 7 + plugins/GSdx/GSTexture11.cpp | 12 + plugins/GSdx/GSTexture11.h | 2 + plugins/GSdx/GSTextureCache.cpp | 2 + plugins/GSdx/GSThread.cpp | 2 + plugins/GSdx/GSThread.h | 69 ++-- plugins/GSdx/GSdx_vs2008.vcproj | 8 + plugins/GSdx/res/cs.fx | 311 +++++++++++--- 27 files changed, 1549 insertions(+), 528 deletions(-) diff --git a/plugins/GSdx/GPULocalMemory.cpp b/plugins/GSdx/GPULocalMemory.cpp index 820b0534dd..c74a76c441 100644 --- a/plugins/GSdx/GPULocalMemory.cpp +++ b/plugins/GSdx/GPULocalMemory.cpp @@ -28,8 +28,8 @@ const GSVector4i GPULocalMemory::m_xxbx(0x00007c00); const GSVector4i GPULocalMemory::m_xgxx(0x000003e0); const GSVector4i GPULocalMemory::m_rxxx(0x0000001f); -#define VM_SIZE ((1 << (12 + 11)) * sizeof(uint16)) -#define VM_ALLOC_SIZE (VM_SIZE * 2) +#define VM_REAL_SIZE ((1 << (12 + 11)) * sizeof(uint16)) +#define VM_ALLOC_SIZE (VM_REAL_SIZE * 2) #define TEX_ALLOC_SIZE (256 * 256 * (1 + 1 + 4) * 32) GPULocalMemory::GPULocalMemory() @@ -39,7 +39,7 @@ GPULocalMemory::GPULocalMemory() // - int size = VM_SIZE; + int size = VM_REAL_SIZE; m_vm = (uint16*)vmalloc(VM_ALLOC_SIZE, false); diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp index c8ed7c5128..0b7df5e807 100644 --- a/plugins/GSdx/GS.cpp +++ b/plugins/GSdx/GS.cpp @@ -207,7 +207,7 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1) s_gs = NULL; } - if(renderer == 12) + if(renderer == 15) { #ifdef _WINDOWS diff --git a/plugins/GSdx/GS.h b/plugins/GSdx/GS.h index 9b10f47ac9..f5d928bd43 100644 --- a/plugins/GSdx/GS.h +++ b/plugins/GSdx/GS.h @@ -90,6 +90,11 @@ enum GIF_REG GIF_REG_NOP = 0x0f, }; +enum GIF_REG_COMPLEX +{ + GIF_REG_STQRGBAXYZF2 = 0x00, +}; + enum GIF_A_D_REG { GIF_A_D_REG_PRIM = 0x00, @@ -1093,9 +1098,11 @@ __aligned(struct, 32) GIFPath uint32 reg; uint32 nreg; uint32 nloop; - uint32 adonly; + uint32 type; GSVector4i regs; + enum {TYPE_UNKNOWN, TYPE_ADONLY, TYPE_STQRGBAXYZF2}; + void SetTag(const void* mem) { GSVector4i v = GSVector4i::load(mem); @@ -1104,7 +1111,9 @@ __aligned(struct, 32) GIFPath regs = v.uph8(v >> 4) & 0x0f0f0f0f; nreg = tag.NREG ? tag.NREG : 16; nloop = tag.NLOOP; - adonly = regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1; + type = TYPE_UNKNOWN; + if(regs.u32[0] == 0x00040102 && nreg == 3) type = TYPE_STQRGBAXYZF2; + else if(regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1) type = TYPE_ADONLY; } __forceinline uint8 GetReg() diff --git a/plugins/GSdx/GSDevice11.cpp b/plugins/GSdx/GSDevice11.cpp index 7e253f04e3..f2f1fff651 100644 --- a/plugins/GSdx/GSDevice11.cpp +++ b/plugins/GSdx/GSDevice11.cpp @@ -729,7 +729,6 @@ void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t cou m_vb = NULL; m_vertex.start = 0; - m_vertex.count = 0; m_vertex.limit = std::max(count * 3 / 2, 11000); } @@ -798,7 +797,7 @@ void GSDevice11::IASetIndexBuffer(const void* index, size_t count) m_ib_old = m_ib; m_ib = NULL; - m_index.count = 0; + m_index.start = 0; m_index.limit = std::max(count * 3 / 2, 11000); } @@ -904,7 +903,11 @@ void GSDevice11::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1) { PSSetShaderResource(0, sr0); PSSetShaderResource(1, sr1); - PSSetShaderResource(2, NULL); + + for(int i = 2; i < countof(m_state.ps_srv); i++) + { + PSSetShaderResource(i, NULL); + } } void GSDevice11::PSSetShaderResource(int i, GSTexture* sr) @@ -913,6 +916,13 @@ void GSDevice11::PSSetShaderResource(int i, GSTexture* sr) if(sr) srv = *(GSTexture11*)sr; + PSSetShaderResourceView(i, srv); +} + +void GSDevice11::PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv) +{ + ASSERT(i < countof(m_state.ps_srv)); + if(m_state.ps_srv[i] != srv) { m_state.ps_srv[i] = srv; @@ -944,14 +954,14 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb) if(m_srv_changed) { - m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv); + m_ctx->PSSetShaderResources(0, countof(m_state.ps_srv), m_state.ps_srv); m_srv_changed = false; } if(m_ss_changed) { - m_ctx->PSSetSamplers(0, 3, m_state.ps_ss); + m_ctx->PSSetSamplers(0, countof(m_state.ps_ss), m_state.ps_ss); m_ss_changed = false; } @@ -1036,6 +1046,8 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector m_ctx->OMSetRenderTargets(1, &rtv, dsv); } + memset(m_state.uav, 0, sizeof(m_state.uav)); + if(m_state.viewport != rt->GetSize()) { m_state.viewport = rt->GetSize(); @@ -1064,6 +1076,52 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector } } +void GSDevice11::OMSetRenderTargets(const GSVector2i& rtsize, ID3D11UnorderedAccessView** uav, int count, const GSVector4i* scissor) +{ + for(int i = 0; i < count; i++) + { + if(m_state.uav[i] != uav[i]) + { + memcpy(m_state.uav, uav, sizeof(uav[0]) * count); + memset(m_state.uav + count, 0, sizeof(m_state.uav) - sizeof(uav[0]) * count); + + m_ctx->OMSetRenderTargetsAndUnorderedAccessViews(0, NULL, NULL, 0, count, uav, NULL); + + break; + } + } + + m_state.rtv = NULL; + m_state.dsv = NULL; + + if(m_state.viewport != rtsize) + { + m_state.viewport = rtsize; + + D3D11_VIEWPORT vp; + + memset(&vp, 0, sizeof(vp)); + + vp.TopLeftX = 0; + vp.TopLeftY = 0; + vp.Width = (float)rtsize.x; + vp.Height = (float)rtsize.y; + vp.MinDepth = 0.0f; + vp.MaxDepth = 1.0f; + + m_ctx->RSSetViewports(1, &vp); + } + + GSVector4i r = scissor ? *scissor : GSVector4i(rtsize).zwxy(); + + if(!m_state.scissor.eq(r)) + { + m_state.scissor = r; + + m_ctx->RSSetScissorRects(1, r); + } +} + HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il) { HRESULT hr; @@ -1135,6 +1193,38 @@ HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MAC return hr; } +HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count) +{ + HRESULT hr; + + vector m; + + PrepareShaderMacro(m, macro); + + CComPtr shader, error; + + hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.gs.c_str(), 0, 0, NULL, &shader, &error, NULL); + + if(error) + { + printf("%s\n", (const char*)error->GetBufferPointer()); + } + + if(FAILED(hr)) + { + return hr; + } + + hr = m_dev->CreateGeometryShaderWithStreamOutput((void*)shader->GetBufferPointer(), shader->GetBufferSize(), layout, count, NULL, 0, D3D11_SO_NO_RASTERIZED_STREAM, NULL, gs); + + if(FAILED(hr)) + { + return hr; + } + + return hr; +} + HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps) { HRESULT hr; diff --git a/plugins/GSdx/GSDevice11.h b/plugins/GSdx/GSDevice11.h index 443e6f43d3..f46cf7eafe 100644 --- a/plugins/GSdx/GSDevice11.h +++ b/plugins/GSdx/GSDevice11.h @@ -60,7 +60,7 @@ class GSDevice11 : public GSDeviceDX ID3D11VertexShader* vs; ID3D11Buffer* vs_cb; ID3D11GeometryShader* gs; - ID3D11ShaderResourceView* ps_srv[3]; + ID3D11ShaderResourceView* ps_srv[16]; ID3D11PixelShader* ps; ID3D11Buffer* ps_cb; ID3D11SamplerState* ps_ss[3]; @@ -73,6 +73,7 @@ class GSDevice11 : public GSDeviceDX float bf; ID3D11RenderTargetView* rtv; ID3D11DepthStencilView* dsv; + ID3D11UnorderedAccessView* uav[8]; } m_state; public: // TODO @@ -178,6 +179,7 @@ public: void GSSetShader(ID3D11GeometryShader* gs); void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1); void PSSetShaderResource(int i, GSTexture* sr); + void PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv); void PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb); void PSSetSamplerState(ID3D11SamplerState* ss0, ID3D11SamplerState* ss1, ID3D11SamplerState* ss2 = NULL); void CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv); @@ -186,6 +188,7 @@ public: void OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref); void OMSetBlendState(ID3D11BlendState* bs, float bf); void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL); + void OMSetRenderTargets(const GSVector2i& rtsize, ID3D11UnorderedAccessView** uav, int count, const GSVector4i* scissor = NULL); void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim); void SetupVS(VSSelector sel, const VSConstantBuffer* cb); @@ -202,6 +205,7 @@ public: HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il); HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs); + HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count); HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps); HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs); HRESULT CompileShader(const char* fn, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs); diff --git a/plugins/GSdx/GSDrawingContext.h b/plugins/GSdx/GSDrawingContext.h index d3a7f8b996..73f3206c08 100644 --- a/plugins/GSdx/GSDrawingContext.h +++ b/plugins/GSdx/GSDrawingContext.h @@ -53,7 +53,8 @@ public: GSOffset* fb; GSOffset* zb; GSOffset* tex; - GSPixelOffset4* fzb; + GSPixelOffset* fzb; + GSPixelOffset4* fzb4; } offset; GSDrawingContext() diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index 4bffdf475a..c74a609364 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -473,6 +473,62 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm) return o; } +GSPixelOffset* GSLocalMemory::GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF) +{ + uint32 fbp = FRAME.Block(); + uint32 zbp = ZBUF.Block(); + uint32 fpsm = FRAME.PSM; + uint32 zpsm = ZBUF.PSM; + uint32 bw = FRAME.FBW; + + ASSERT(m_psm[fpsm].trbpp > 8 || m_psm[zpsm].trbpp > 8); + + // "(psm & 0x0f) ^ ((psm & 0xf0) >> 2)" creates 4 bit unique identifiers for render target formats (only) + + uint32 fpsm_hash = (fpsm & 0x0f) ^ ((fpsm & 0x30) >> 2); + uint32 zpsm_hash = (zpsm & 0x0f) ^ ((zpsm & 0x30) >> 2); + + uint32 hash = (FRAME.FBP << 0) | (ZBUF.ZBP << 9) | (bw << 18) | (fpsm_hash << 24) | (zpsm_hash << 28); + + hash_map::iterator i = m_pomap.find(hash); + + if(i != m_pomap.end()) + { + return i->second; + } + + GSPixelOffset* o = (GSPixelOffset*)_aligned_malloc(sizeof(GSPixelOffset), 32); + + o->hash = hash; + o->fbp = fbp; + o->zbp = zbp; + o->fpsm = fpsm; + o->zpsm = zpsm; + o->bw = bw; + + pixelAddress fpa = m_psm[fpsm].pa; + pixelAddress zpa = m_psm[zpsm].pa; + + int fs = m_psm[fpsm].bpp >> 5; + int zs = m_psm[zpsm].bpp >> 5; + + for(int i = 0; i < 2048; i++) + { + o->row[i].x = (int)fpa(0, i, fbp, bw) << fs; + o->row[i].y = (int)zpa(0, i, zbp, bw) << zs; + } + + for(int i = 0; i < 2048; i++) + { + o->col[i].x = m_psm[fpsm].rowOffset[0][i] << fs; + o->col[i].y = m_psm[zpsm].rowOffset[0][i] << zs; + } + + m_pomap[hash] = o; + + return o; +} + GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF) { uint32 fbp = FRAME.Block(); diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h index e76bde3f00..383cca4e5f 100644 --- a/plugins/GSdx/GSLocalMemory.h +++ b/plugins/GSdx/GSLocalMemory.h @@ -56,6 +56,16 @@ public: uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL); }; +struct GSPixelOffset +{ + // 16 bit offsets (m_vm16[...]) + + GSVector2i row[2048]; // f yn | z yn + GSVector2i col[2048]; // f xn | z xn + uint32 hash; + uint32 fbp, zbp, fpsm, zpsm, bw; +}; + struct GSPixelOffset4 { // 16 bit offsets (m_vm16[...]) @@ -158,6 +168,7 @@ protected: // hash_map m_omap; + hash_map m_pomap; hash_map m_po4map; hash_map*> m_p2tmap; @@ -166,6 +177,7 @@ public: virtual ~GSLocalMemory(); GSOffset* GetOffset(uint32 bp, uint32 bw, uint32 psm); + GSPixelOffset* GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF); GSPixelOffset4* GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF); vector* GetPage2TileMap(const GIFRegTEX0& TEX0); diff --git a/plugins/GSdx/GSPerfMon.h b/plugins/GSdx/GSPerfMon.h index 907af994bf..f9b023b932 100644 --- a/plugins/GSdx/GSPerfMon.h +++ b/plugins/GSdx/GSPerfMon.h @@ -35,7 +35,7 @@ public: enum counter_t { - Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, + Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, SyncPoint, CounterLast, }; diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index 1872b6844a..2c05114e52 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -40,7 +40,7 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false); m_edge.count = 0; - m_myscanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64); + m_scanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64); int row = 0; @@ -48,14 +48,14 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe { for(int i = 0; i < threads; i++, row++) { - m_myscanline[row] = i == id ? 1 : 0; + m_scanline[row] = i == id ? 1 : 0; } } } GSRasterizer::~GSRasterizer() { - _aligned_free(m_myscanline); + _aligned_free(m_scanline); if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSVertexSW) * 2048); @@ -66,7 +66,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top) const { ASSERT(top >= 0 && top < 2048); - return m_myscanline[top >> THREAD_HEIGHT] != 0; + return m_scanline[top >> THREAD_HEIGHT] != 0; } bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const @@ -78,7 +78,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const while(top < bottom) { - if(m_myscanline[top++]) + if(m_scanline[top++]) { return true; } @@ -91,9 +91,9 @@ int GSRasterizer::FindMyNextScanline(int top) const { int i = top >> THREAD_HEIGHT; - if(m_myscanline[i] == 0) + if(m_scanline[i] == 0) { - while(m_myscanline[++i] == 0); + while(m_scanline[++i] == 0); top = i << THREAD_HEIGHT; } @@ -904,11 +904,20 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS // -GSRasterizerList::GSRasterizerList() - : GSJobQueue >() - , m_sync_count(0) - , m_syncpoint_count(0) +GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon) + : m_perfmon(perfmon) { + m_scanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64); + + int row = 0; + + while(row < (2048 >> THREAD_HEIGHT)) + { + for(int i = 0; i < threads; i++, row++) + { + m_scanline[row] = i; + } + } } GSRasterizerList::~GSRasterizerList() @@ -917,31 +926,54 @@ GSRasterizerList::~GSRasterizerList() { delete *i; } + + _aligned_free(m_scanline); } void GSRasterizerList::Queue(shared_ptr data) { - // disable dispatcher thread for now and pass-through directly, - // would only be relevant if data->syncpoint was utilized more, - // it would hide the syncing latency from the main gs thread + if(data->syncpoint) + { + Sync(); + } - // Push(data); + GSVector4i r = data->bbox.rintersect(data->scissor); - Process(data); m_count++; + ASSERT(r.top >= 0 && r.top < 2048 && r.bottom >= 0 && r.bottom < 2048); + + int top = r.top >> THREAD_HEIGHT; + int bottom = std::min((r.bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT, top + m_workers.size()); + + while(top < bottom) + { + m_workers[m_scanline[top++]]->Push(data); + } } void GSRasterizerList::Sync() { - if(GetCount() == 0) return; + if(!IsSynced()) + { + for(size_t i = 0; i < m_workers.size(); i++) + { + m_workers[i]->Wait(); + } - Wait(); // first dispatch all items to workers + m_perfmon->Put(GSPerfMon::SyncPoint, 1); + } +} +bool GSRasterizerList::IsSynced() const +{ for(size_t i = 0; i < m_workers.size(); i++) { - m_workers[i]->Wait(); // then wait all workers to finish their jobs + if(!m_workers[i]->IsEmpty()) + { + return false; + } } - m_sync_count++; + return true; } int GSRasterizerList::GetPixels(bool reset) @@ -956,24 +988,6 @@ int GSRasterizerList::GetPixels(bool reset) return pixels; } -void GSRasterizerList::Process(shared_ptr& item) -{ - if(item->syncpoint) - { - for(size_t i = 0; i < m_workers.size(); i++) - { - m_workers[i]->Wait(); - } - - m_syncpoint_count++; - } - - for(size_t i = 0; i < m_workers.size(); i++) - { - m_workers[i]->Push(item); - } -} - // GSRasterizerList::GSWorker GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r) @@ -994,16 +1008,6 @@ int GSRasterizerList::GSWorker::GetPixels(bool reset) return m_r->GetPixels(reset); } -void GSRasterizerList::GSWorker::Push(const shared_ptr& item) -{ - GSVector4i r = item->bbox.rintersect(item->scissor); - - if(m_r->IsOneOfMyScanlines(r.top, r.bottom)) - { - GSJobQueue >::Push(item); - } -} - void GSRasterizerList::GSWorker::Process(shared_ptr& item) { m_r->Draw(item.get()); diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 71b2dd4ad1..84bd01f31e 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -109,6 +109,7 @@ public: virtual void Queue(shared_ptr data) = 0; virtual void Sync() = 0; + virtual bool IsSynced() const = 0; virtual int GetPixels(bool reset = true) = 0; }; @@ -119,7 +120,7 @@ protected: IDrawScanline* m_ds; int m_id; int m_threads; - uint8* m_myscanline; + uint8* m_scanline; GSVector4i m_scissor; GSVector4 m_fscissor_x; GSVector4 m_fscissor_y; @@ -155,12 +156,12 @@ public: void Queue(shared_ptr data); void Sync() {} + bool IsSynced() const {return true;} int GetPixels(bool reset); }; class GSRasterizerList : public IRasterizer - , private GSJobQueue > { protected: class GSWorker : public GSJobQueue > @@ -175,17 +176,14 @@ protected: // GSJobQueue - void Push(const shared_ptr& item); void Process(shared_ptr& item); }; + GSPerfMon* m_perfmon; vector m_workers; + uint8* m_scanline; - GSRasterizerList(); - - // GSJobQueue - - void Process(shared_ptr& item); + GSRasterizerList(int threads, GSPerfMon* perfmon); public: virtual ~GSRasterizerList(); @@ -200,7 +198,7 @@ public: } else { - GSRasterizerList* rl = new GSRasterizerList(); + GSRasterizerList* rl = new GSRasterizerList(threads, perfmon); for(int i = 0; i < threads; i++) { @@ -211,12 +209,10 @@ public: } } - int m_sync_count; - int m_syncpoint_count; - // IRasterizer void Queue(shared_ptr data); void Sync(); + bool IsSynced() const; int GetPixels(bool reset); }; diff --git a/plugins/GSdx/GSRenderer.cpp b/plugins/GSdx/GSRenderer.cpp index 0fe8ab8961..151de0b311 100644 --- a/plugins/GSdx/GSRenderer.cpp +++ b/plugins/GSdx/GSRenderer.cpp @@ -257,7 +257,7 @@ bool GSRenderer::Merge(int field) { int field2 = 1 - ((m_interlace - 1) & 1); int mode = (m_interlace - 1) >> 1; - + m_dev->Interlace(ds, field ^ field2, mode, tex[1] ? tex[1]->GetScale().y : tex[0]->GetScale().y); } @@ -304,6 +304,8 @@ void GSRenderer::VSync(int field) ResetDevice(); } + m_dev->AgePool(); + // osd if((m_perfmon.GetFrame() & 0x1f) == 0) @@ -332,7 +334,7 @@ void GSRenderer::VSync(int field) s2.c_str(), theApp.m_gs_interlace[m_interlace].name.c_str(), theApp.m_gs_aspectratio[m_aspectratio].name.c_str(), - (int)m_perfmon.Get(GSPerfMon::Quad), + (int)m_perfmon.Get(GSPerfMon::SyncPoint), (int)m_perfmon.Get(GSPerfMon::Prim), (int)m_perfmon.Get(GSPerfMon::Draw), m_perfmon.CPU(), diff --git a/plugins/GSdx/GSRendererCS.cpp b/plugins/GSdx/GSRendererCS.cpp index a244081feb..1c864d09fb 100644 --- a/plugins/GSdx/GSRendererCS.cpp +++ b/plugins/GSdx/GSRendererCS.cpp @@ -23,7 +23,7 @@ #include "GSRendererCS.h" GSRendererCS::GSRendererCS() - : GSRenderer(new GSVertexTraceCS(this), sizeof(GSVertex)) + : GSRenderer(new GSVertexTraceDX11(this), sizeof(GSVertexHW11)) { m_nativeres = true; @@ -41,27 +41,72 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk) if(!__super::CreateDevice(dev_unk)) return false; + HRESULT hr; + + D3D11_DEPTH_STENCIL_DESC dsd; + D3D11_BLEND_DESC bsd; + D3D11_SAMPLER_DESC sd; + D3D11_BUFFER_DESC bd; + D3D11_TEXTURE2D_DESC td; + D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; + D3D_FEATURE_LEVEL level; ((GSDeviceDX*)dev_unk)->GetFeatureLevel(level); - if(level < D3D_FEATURE_LEVEL_10_0) + if(level < D3D_FEATURE_LEVEL_11_0) return false; - HRESULT hr; - GSDevice11* dev = (GSDevice11*)dev_unk; - D3D11_BUFFER_DESC bd; - D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; - D3D11_SHADER_RESOURCE_VIEW_DESC srvd; + ID3D11DeviceContext* ctx = *dev; + + delete dev->CreateRenderTarget(1024, 1024, false); + + // empty depth stencil state + + memset(&dsd, 0, sizeof(dsd)); + + dsd.StencilEnable = false; + dsd.DepthEnable = false; + + hr = (*dev)->CreateDepthStencilState(&dsd, &m_dss); + + if(FAILED(hr)) return false; + + // empty blend state + + memset(&bsd, 0, sizeof(bsd)); + + bsd.RenderTarget[0].BlendEnable = false; + + hr = (*dev)->CreateBlendState(&bsd, &m_bs); + + if(FAILED(hr)) return false; + + // point sampler + + memset(&sd, 0, sizeof(sd)); + + sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT; + + sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP; + sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP; + sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP; + + sd.MaxLOD = FLT_MAX; + sd.MaxAnisotropy = 16; + sd.ComparisonFunc = D3D11_COMPARISON_NEVER; + + hr = (*dev)->CreateSamplerState(&sd, &m_ss); + + if(FAILED(hr)) return false; // video memory (4MB) memset(&bd, 0, sizeof(bd)); bd.ByteWidth = 4 * 1024 * 1024; - bd.StructureByteStride = 4; bd.Usage = D3D11_USAGE_DEFAULT; bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS; bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; @@ -81,35 +126,32 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk) hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav); if(FAILED(hr)) return false; +/* + memset(&td, 0, sizeof(td)); - // vertex buffer + td.Width = PAGE_SIZE; + td.Height = MAX_PAGES; + td.Format = DXGI_FORMAT_R8_UINT; + td.MipLevels = 1; + td.ArraySize = 1; + td.SampleDesc.Count = 1; + td.SampleDesc.Quality = 0; + td.Usage = D3D11_USAGE_DEFAULT; + td.BindFlags = D3D11_BIND_UNORDERED_ACCESS; - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = sizeof(GSVertex) * 10000; - bd.StructureByteStride = sizeof(GSVertex); - bd.Usage = D3D11_USAGE_DYNAMIC; - bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; - bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_vb); + hr = (*dev)->CreateTexture2D(&td, NULL, &m_vm); if(FAILED(hr)) return false; - // index buffer + memset(&uavd, 0, sizeof(uavd)); - memset(&bd, 0, sizeof(bd)); + uavd.Format = DXGI_FORMAT_R8_UINT; + uavd.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D; - bd.ByteWidth = sizeof(uint32) * 10000 * 3; - bd.Usage = D3D11_USAGE_DYNAMIC; - bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_ib); + hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav); if(FAILED(hr)) return false; - +*/ // one page, for copying between cpu<->gpu memset(&bd, 0, sizeof(bd)); @@ -121,10 +163,69 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk) hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb); if(FAILED(hr)) return false; +/* + memset(&td, 0, sizeof(td)); + + td.Width = PAGE_SIZE; + td.Height = 1; + td.Format = DXGI_FORMAT_R8_UINT; + td.MipLevels = 1; + td.ArraySize = 1; + td.SampleDesc.Count = 1; + td.SampleDesc.Quality = 0; + td.Usage = D3D11_USAGE_STAGING; + td.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + + hr = (*dev)->CreateTexture2D(&td, NULL, &m_pb); + + if(FAILED(hr)) return false; +*/ + // VSConstantBuffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(VSConstantBuffer); + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_vs_cb); + + if(FAILED(hr)) return false; + + // PSConstantBuffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(PSConstantBuffer); + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_ps_cb); + + if(FAILED(hr)) return false; + + // + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = 14 * sizeof(float) * 200000; + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_STREAM_OUTPUT | D3D11_BIND_SHADER_RESOURCE; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_sob); + + // return true; } +void GSRendererCS::VSync(int field) +{ + __super::VSync(field); + + //printf("%lld\n", m_perfmon.GetFrame()); +} + GSTexture* GSRendererCS::GetOutput(int i) { // TODO: create a compute shader which unswizzles the frame from m_vm to the output texture @@ -135,205 +236,342 @@ GSTexture* GSRendererCS::GetOutput(int i) template void GSRendererCS::ConvertVertex(size_t dst_index, size_t src_index) { - // TODO: vertex format more fitting as the input for the compute shader + GSVertex* s = (GSVertex*)((GSVertexHW11*)m_vertex.buff + src_index); + GSVertexHW11* d = (GSVertexHW11*)m_vertex.buff + dst_index; - if(src_index != dst_index) + GSVector4i v0 = ((GSVector4i*)s)[0]; + GSVector4i v1 = ((GSVector4i*)s)[1]; + + if(tme && fst) { - GSVertex v = ((GSVertex*)m_vertex.buff)[src_index]; + // TODO: modify VertexTrace to read uv from v1.u16[0], v1.u16[1], then this step is not needed - ((GSVertex*)m_vertex.buff)[dst_index] = v; + v0 = GSVector4i::cast(GSVector4(v1.uph16()).xyzw(GSVector4::cast(v0))); // uv => st } + + ((GSVector4i*)d)[0] = v0; + ((GSVector4i*)d)[1] = v1; } void GSRendererCS::Draw() { - HRESULT hr; + GSDrawingEnvironment& env = m_env; + GSDrawingContext* context = m_context; + + GSVector2i rtsize(2048, 2048); + GSVector4i scissor = GSVector4i(context->scissor.in).rintersect(GSVector4i(rtsize).zwxy()); + GSVector4i bbox = GSVector4i(m_vt->m_min.p.floor().xyxy(m_vt->m_max.p.ceil())); + GSVector4i r = bbox.rintersect(scissor); + + uint32 fm = context->FRAME.FBMSK; + uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; + + if(fm != 0xffffffff) + { + Write(context->offset.fb, r); + + // TODO: m_tc->InvalidateVideoMem(context->offset.fb, r, false); + } + + if(zm != 0xffffffff) + { + Write(context->offset.zb, r); + + // TODO: m_tc->InvalidateVideoMem(context->offset.zb, r, false); + } + + if(PRIM->TME) + { + m_mem.m_clut.Read32(context->TEX0, env.TEXA); + + GSVector4i r; + + GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt->IsLinear()); + + // TODO: unswizzle pages of r to a texture, check m_vm_valid, bit not set cpu->gpu, set gpu->gpu + + // TODO: Write transfer should directly write to m_vm, then Read/Write syncing won't be necessary, clut must be updated with the gpu also + + // TODO: tex = m_tc->LookupSource(context->TEX0, env.TEXA, r); + + // if(!tex) return; + } + + // GSDevice11* dev = (GSDevice11*)m_dev; - + ID3D11DeviceContext* ctx = *dev; - D3D11_BUFFER_DESC bd; - D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; - D3D11_SHADER_RESOURCE_VIEW_DESC srvd; - D3D11_MAPPED_SUBRESOURCE map; + dev->BeginScene(); - CComPtr vb_srv; - CComPtr ib_srv; + // SetupOM - // TODO: cache these in hash_maps + ID3D11UnorderedAccessView* uavs[] = {m_vm_uav}; - CComPtr fbr, fbc, zbr, zbc; - CComPtr fbr_srv, fbc_srv, zbr_srv, zbc_srv; - - // TODO: grow m_vb, m_ib if needed - - if(m_vertex.next > 10000) return; - if(m_index.tail > 30000) return; - - // TODO: fill/advance/discardwhenfull, as in GSDevice11::IASetVertexBuffer/IASetIndexBuffer - - hr = ctx->Map(m_vb, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around - - if(FAILED(hr)) return; - - memcpy(map.pData, m_vertex.buff, sizeof(GSVertex) * m_vertex.next); - - ctx->Unmap(m_vb, 0); - - // - - hr = ctx->Map(m_ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around - - if(FAILED(hr)) return; - - memcpy(map.pData, m_index.buff, sizeof(uint32) * m_index.tail); - - ctx->Unmap(m_ib, 0); - - // TODO: UpdateResource might be faster, based on my exprience with the real vertex buffer, write-no-overwrite/discarded dynamic buffer + map is better - - // - - memset(&srvd, 0, sizeof(srvd)); - - srvd.Format = DXGI_FORMAT_UNKNOWN; - srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; - srvd.Buffer.FirstElement = 0; - srvd.Buffer.NumElements = m_vertex.next; - - hr = (*dev)->CreateShaderResourceView(m_vb, &srvd, &vb_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg - - memset(&srvd, 0, sizeof(srvd)); - - srvd.Format = DXGI_FORMAT_R32_UINT; - srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; - srvd.Buffer.FirstElement = 0; - srvd.Buffer.NumElements = m_index.tail; - - hr = (*dev)->CreateShaderResourceView(m_ib, &srvd, &ib_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg - - // fzb offsets - - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = sizeof(int) * 4096; - bd.StructureByteStride = sizeof(int); - bd.Usage = D3D11_USAGE_IMMUTABLE; - bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; - - D3D11_SUBRESOURCE_DATA data; - - memset(&data, 0, sizeof(data)); - - data.pSysMem = m_context->offset.fb->pixel.row; - - hr = (*dev)->CreateBuffer(&bd, &data, &fbr); - - data.pSysMem = m_context->offset.fb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats + dev->OMSetDepthStencilState(m_dss, 0); + dev->OMSetBlendState(m_bs, 0); + dev->OMSetRenderTargets(rtsize, uavs, countof(uavs), &scissor); - hr = (*dev)->CreateBuffer(&bd, &data, &fbc); + // SetupIA - data.pSysMem = m_context->offset.zb->pixel.row; - - hr = (*dev)->CreateBuffer(&bd, &data, &zbr); + D3D11_PRIMITIVE_TOPOLOGY topology; - data.pSysMem = m_context->offset.zb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats - - hr = (*dev)->CreateBuffer(&bd, &data, &zbc); - - // TODO: D3D10_SHADER_MACRO (primclass, less frequently changing drawing attribs, etc.) - - uint32 sel = 0; // TODO - - hash_map >::iterator i = m_cs.find(sel); - - CComPtr cs; - - if(i == m_cs.end()) + switch(m_vt->m_primclass) { - // hr = dev->CompileShader(IDR_CS_FX, "cs_main", NULL, &cs); - hr = dev->CompileShader("E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\cs.fx", "cs_main", NULL, &cs); - - if(FAILED(hr)) return; - - m_cs[sel] = cs; + case GS_POINT_CLASS: + topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST; + break; + case GS_TRIANGLE_CLASS: + topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + break; + default: + __assume(0); } - else + + dev->IASetVertexBuffer(m_vertex.buff, sizeof(GSVertexHW11), m_vertex.next); + dev->IASetIndexBuffer(m_index.buff, m_index.tail); + dev->IASetPrimitiveTopology(topology); + + // SetupVS + + VSSelector vs_sel; + + vs_sel.tme = PRIM->TME; + vs_sel.fst = PRIM->FST; + + VSConstantBuffer vs_cb; + + float sx = 2.0f / (rtsize.x << 4); + float sy = 2.0f / (rtsize.y << 4); + //float sx = 1.0f / 16; + //float sy = 1.0f / 16; + float ox = (float)(int)context->XYOFFSET.OFX; + float oy = (float)(int)context->XYOFFSET.OFY; + + vs_cb.VertexScale = GSVector4(sx, -sy, 0.0f, 0.0f); + vs_cb.VertexOffset = GSVector4(ox * sx + 1, -(oy * sy + 1), 0.0f, -1.0f); + //vs_cb.VertexScale = GSVector4(sx, sy, 0.0f, 0.0f); + //vs_cb.VertexOffset = GSVector4(ox * sx, oy * sy, 0.0f, -1.0f); + { - cs = i->second; + hash_map::const_iterator i = m_vs.find(vs_sel); + + if(i == m_vs.end()) + { + string str[2]; + + str[0] = format("%d", vs_sel.tme); + str[1] = format("%d", vs_sel.fst); + + D3D11_SHADER_MACRO macro[] = + { + {"VS_TME", str[0].c_str()}, + {"VS_FST", str[1].c_str()}, + {NULL, NULL}, + }; + + D3D11_INPUT_ELEMENT_DESC layout[] = + { + {"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 8, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0}, + }; + + GSVertexShader11 vs; + + dev->CompileShader(IDR_CS_FX, "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il); + + m_vs[vs_sel] = vs; + + i = m_vs.find(vs_sel); + } + + ctx->UpdateSubresource(m_vs_cb, 0, NULL, &vs_cb, 0, 0); // TODO: only update if changed + + dev->VSSetShader(i->second.vs, m_vs_cb); + + dev->IASetInputLayout(i->second.il); } + + // SetupGS + + GSSelector gs_sel; + + gs_sel.iip = PRIM->IIP; + gs_sel.prim = m_vt->m_primclass; + + CComPtr gs; + + { + hash_map >::const_iterator i = m_gs.find(gs_sel); + + if(i != m_gs.end()) + { + gs = i->second; + } + else + { + string str[2]; + + str[0] = format("%d", gs_sel.iip); + str[1] = format("%d", gs_sel.prim); + + D3D11_SHADER_MACRO macro[] = + { + {"GS_IIP", str[0].c_str()}, + {"GS_PRIM", str[1].c_str()}, + {NULL, NULL}, + }; + /* + D3D11_SO_DECLARATION_ENTRY layout[] = + { + {0, "SV_Position", 0, 0, 4, 0}, + {0, "TEXCOORD", 0, 0, 2, 0}, + {0, "TEXCOORD", 1, 0, 4, 0}, + {0, "COLOR", 0, 0, 4, 0}, + }; + */ + dev->CompileShader(IDR_CS_FX, "gs_main", macro, &gs);//, layout, countof(layout)); + + m_gs[gs_sel] = gs; + } + } + + dev->GSSetShader(gs); + + // SetupPS + + PSSelector ps_sel; + PSConstantBuffer ps_cb; + + hash_map >::const_iterator i = m_ps.find(ps_sel); + + if(i == m_ps.end()) + { + string str[15]; + + str[0] = format("%d", 0); + + D3D11_SHADER_MACRO macro[] = + { + {"PS_TODO", str[0].c_str()}, + {NULL, NULL}, + }; + + CComPtr ps; + + dev->CompileShader(IDR_CS_FX, "ps_main", macro, &ps); + + m_ps[ps_sel] = ps; + + i = m_ps.find(ps_sel); + } + + ctx->UpdateSubresource(m_ps_cb, 0, NULL, &ps_cb, 0, 0); // TODO: only update if changed + + dev->PSSetSamplerState(m_ss, NULL, NULL); + + dev->PSSetShader(i->second, m_ps_cb); + + // Offset + + OffsetBuffer* fzbo = NULL; - // + GetOffsetBuffer(&fzbo); - dev->CSSetShaderUAV(0, m_vm_uav); - - dev->CSSetShaderSRV(0, vb_srv); - dev->CSSetShaderSRV(1, ib_srv); - dev->CSSetShaderSRV(2, fbr_srv); - dev->CSSetShaderSRV(3, fbc_srv); - dev->CSSetShaderSRV(4, zbr_srv); - dev->CSSetShaderSRV(5, zbc_srv); - - dev->CSSetShader(cs); + dev->PSSetShaderResourceView(0, fzbo->row_view); + dev->PSSetShaderResourceView(1, fzbo->col_view); - GSVector4i bbox = GSVector4i(0, 0, 640, 512); // TODO: vertex trace + // TODO: 2 palette + // TODO: 3, 4, ... texture levels - GSVector4i r = bbox.ralign(GSVector2i(16, 8)); + //ID3D11Buffer* tmp[] = {m_sob}; - bool fb = true; // TODO: frame buffer used - bool zb = true; // TODO: z-buffer used + //ctx->SOSetTargets(countof(tmp), tmp, NULL); - if(fb) Write(m_context->offset.fb, r); - if(zb) Write(m_context->offset.zb, r); + dev->DrawIndexedPrimitive(); - // TODO: constant buffer (frequently chaning drawing attribs) - // TODO: texture (implement texture cache) - // TODO: clut to a palette texture (should be texture1d, not simply buffer, it is random accessed) - // TODO: CSSetShaderSRV(6 7 8 ..., texture level 0 1 2 ...) or use Texture3D? - // TODO: invalidate texture cache + //ctx->SOSetTargets(0, NULL, NULL); - /* - CComPtr q; + if(0) + { + HRESULT hr; - D3D11_QUERY_DESC qd; - memset(&qd, 0, sizeof(qd)); - qd.Query = D3D11_QUERY_EVENT; + D3D11_BUFFER_DESC bd; - hr = (*dev)->CreateQuery(&qd, &q); + memset(&bd, 0, sizeof(bd)); - ctx->Begin(q); - */ - - printf("[%lld] dispatch %05x %d %05x %d %05x %d %dx%d | %d %d %d\n", - __rdtsc(), - m_context->FRAME.Block(), m_context->FRAME.PSM, - m_context->ZBUF.Block(), m_context->ZBUF.PSM, - PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, - PRIM->PRIM, m_vertex.next, m_index.tail); + bd.ByteWidth = 14 * sizeof(float) * 200000; + bd.Usage = D3D11_USAGE_STAGING; + bd.CPUAccessFlags = D3D11_CPU_ACCESS_READ; - GSVector4i rsize = r.rsize(); + CComPtr sob; - dev->Dispatch(rsize.z >> 4, rsize.w >> 3, 1); // TODO: pass upper-left corner offset (r.xy) in a const buffer + hr = (*dev)->CreateBuffer(&bd, NULL, &sob); - /* - ctx->End(q); + ctx->CopyResource(sob, m_sob); - uint64 t0 = __rdtsc(); + D3D11_MAPPED_SUBRESOURCE map; - BOOL b; + if(SUCCEEDED(ctx->Map(sob, 0, D3D11_MAP_READ, 0, &map))) + { + float* f = (float*)map.pData; - while(S_OK != ctx->GetData(q, &b, sizeof(BOOL), 0)) {} + for(int i = 0; i < 12; i++, f += 14) + printf("%f %f %f %f\n%f %f\n%f %f %f %f\n%f %f %f %f\n", + f[0], f[1], f[2], f[3], + f[4], f[5], + f[6], f[7], f[8], f[9], + f[10], f[11], f[12], f[13]); - printf("%lld\n", __rdtsc() - t0); - */ + ctx->Unmap(sob, 0); + } + + } + + if(1) + { + //Read(m_mem.GetOffset(0, 16, PSM_PSMCT32), GSVector4i(0, 0, 1024, 1024), false); + + // + if(fm != 0xffffffff) Read(context->offset.fb, r, false); + // + if(zm != 0xffffffff) Read(context->offset.zb, r, false); + + std::string s; + + s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM); + + // + m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + + s = format("c:\\temp1\\_%05d_f%lld_zt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM); + + // + m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + + //m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024); + + s_n++; + } + + dev->EndScene(); } void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) { GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); - Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated + Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated (important) // TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target // TODO: invalidate texture cache @@ -356,6 +594,10 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r) memset(&box, 0, sizeof(box)); + box.right = 1; + box.bottom = 1; + box.back = 1; + uint32* pages = o->GetPages(r); for(size_t i = 0; pages[i] != GSOffset::EOP; i++) @@ -370,10 +612,20 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r) m_vm_valid[row] |= col; box.left = page * PAGE_SIZE; - box.right = box.left + PAGE_SIZE; + box.right = (page + 1) * PAGE_SIZE; - ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + box.left, 0, 0); + ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0); +/* + // m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4 + box.left = 0; + box.right = PAGE_SIZE; + box.top = page; + box.bottom = box.top + 1; + + ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0); +*/ + if(0) printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); } } @@ -391,6 +643,10 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate) memset(&box, 0, sizeof(box)); + box.right = 1; + box.bottom = 1; + box.back = 1; + uint32* pages = o->GetPages(r); for(size_t i = 0; pages[i] != GSOffset::EOP; i++) @@ -402,21 +658,34 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate) if(m_vm_valid[row] & col) { - if(invalidate) m_vm_valid[row] ^= col; + if(invalidate) + { + m_vm_valid[row] ^= col; + } box.left = page * PAGE_SIZE; - box.right = box.left + PAGE_SIZE; + box.right = (page + 1) * PAGE_SIZE; ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box); +/* + // m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4 + box.left = 0; + box.right = PAGE_SIZE; + box.top = page; + box.bottom = box.top + 1; + + ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box); +*/ D3D11_MAPPED_SUBRESOURCE map; - if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ_WRITE, 0, &map))) + if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ, 0, &map))) { - memcpy(m_mem.m_vm8 + box.left, map.pData, PAGE_SIZE); + memcpy(m_mem.m_vm8 + page * PAGE_SIZE, map.pData, PAGE_SIZE); ctx->Unmap(m_pb, 0); - + + if(0) printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); } } @@ -424,3 +693,64 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate) delete [] pages; } + +bool GSRendererCS::GetOffsetBuffer(OffsetBuffer** fzbo) +{ + HRESULT hr; + + GSDevice11* dev = (GSDevice11*)m_dev; + + D3D11_BUFFER_DESC bd; + D3D11_SHADER_RESOURCE_VIEW_DESC srvd; + D3D11_SUBRESOURCE_DATA data; + + hash_map::iterator i = m_offset.find(m_context->offset.fzb->hash); + + if(i == m_offset.end()) + { + OffsetBuffer ob; + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(GSVector2i) * 2048; + bd.Usage = D3D11_USAGE_IMMUTABLE; + bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; + + memset(&data, 0, sizeof(data)); + + data.pSysMem = m_context->offset.fzb->row; + + hr = (*dev)->CreateBuffer(&bd, &data, &ob.row); + + if(FAILED(hr)) return false; + + data.pSysMem = m_context->offset.fzb->col; + + hr = (*dev)->CreateBuffer(&bd, &data, &ob.col); + + if(FAILED(hr)) return false; + + memset(&srvd, 0, sizeof(srvd)); + + srvd.Format = DXGI_FORMAT_R32G32_SINT; + srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; + srvd.Buffer.FirstElement = 0; + srvd.Buffer.NumElements = 2048; + + hr = (*dev)->CreateShaderResourceView(ob.row, &srvd, &ob.row_view); + + if(FAILED(hr)) return false; + + hr = (*dev)->CreateShaderResourceView(ob.col, &srvd, &ob.col_view); + + if(FAILED(hr)) return false; + + m_offset[m_context->offset.fzb->hash] = ob; + + i = m_offset.find(m_context->offset.fzb->hash); + } + + *fzbo = &i->second; + + return true; +} diff --git a/plugins/GSdx/GSRendererCS.h b/plugins/GSdx/GSRendererCS.h index 42f45d58af..f51b44bb10 100644 --- a/plugins/GSdx/GSRendererCS.h +++ b/plugins/GSdx/GSRendererCS.h @@ -26,28 +26,105 @@ class GSRendererCS : public GSRenderer { - class GSVertexTraceCS : public GSVertexTrace + struct VSSelector { - public: - GSVertexTraceCS(const GSState* state) : GSVertexTrace(state) {} + union + { + struct + { + uint32 tme:1; + uint32 fst:1; + }; + + uint32 key; + }; + + operator uint32() {return key & 0x3;} + + VSSelector() : key(0) {} }; + __aligned(struct, 32) VSConstantBuffer + { + GSVector4 VertexScale; + GSVector4 VertexOffset; + }; + + struct GSSelector + { + union + { + struct + { + uint32 iip:1; + uint32 prim:2; + }; + + uint32 key; + }; + + operator uint32() {return key & 0x7;} + + GSSelector() : key(0) {} + }; + + struct PSSelector + { + union + { + struct + { + uint32 TODO:1; + }; + + uint32 key; + }; + + operator uint32() {return key & 0x1;} + + PSSelector() : key(0) {} + }; + + __aligned(struct, 32) PSConstantBuffer + { + GSVector4 TODO; + }; + + CComPtr m_dss; + CComPtr m_bs; + CComPtr m_ss; CComPtr m_vm; + //CComPtr m_vm; CComPtr m_vm_uav; - CComPtr m_vb; - CComPtr m_ib; - CComPtr m_pb; - hash_map > m_cs; uint32 m_vm_valid[16]; + CComPtr m_pb; + //CComPtr m_pb; + hash_map m_vs; + CComPtr m_vs_cb; + hash_map > m_gs; + hash_map > m_ps; + CComPtr m_ps_cb; + CComPtr m_sob; void Write(GSOffset* o, const GSVector4i& r); void Read(GSOffset* o, const GSVector4i& r, bool invalidate); - + + struct OffsetBuffer + { + CComPtr row, col; + CComPtr row_view, col_view; + }; + + hash_map m_offset; + + bool GetOffsetBuffer(OffsetBuffer** fzbo); + protected: template void ConvertVertex(size_t dst_index, size_t src_index); - + bool CreateDevice(GSDevice* dev); + void VSync(int field); GSTexture* GetOutput(int i); void Draw(); void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); diff --git a/plugins/GSdx/GSRendererDX.cpp b/plugins/GSdx/GSRendererDX.cpp index 0a887490e4..e8a941c1d3 100644 --- a/plugins/GSdx/GSRendererDX.cpp +++ b/plugins/GSdx/GSRendererDX.cpp @@ -233,7 +233,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc } } - if (env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST) + if(env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST) { ps_sel.colclip = 1; } diff --git a/plugins/GSdx/GSRendererHW.cpp b/plugins/GSdx/GSRendererHW.cpp index 921641b721..07ef77abc5 100644 --- a/plugins/GSdx/GSRendererHW.cpp +++ b/plugins/GSdx/GSRendererHW.cpp @@ -101,19 +101,18 @@ void GSRendererHW::Reset() void GSRendererHW::VSync(int field) { - GSRenderer::VSync(field); - - m_tc->IncAge(); - m_dev->AgePool(); - - m_skip = 0; - if(m_reset) { m_tc->RemoveAll(); m_reset = false; } + + GSRenderer::VSync(field); + + m_tc->IncAge(); + + m_skip = 0; } void GSRendererHW::ResetDevice() diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index d6494f58c3..393395299e 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -22,6 +22,10 @@ #include "stdafx.h" #include "GSRendererSW.h" +#define LOG 0 + +static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; + const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f); GSRendererSW::GSRendererSW(int threads) @@ -60,10 +64,9 @@ GSRendererSW::~GSRendererSW() void GSRendererSW::Reset() { - // TODO: GSreset can come from the main thread too => crash - // m_tc->RemoveAll(); + Sync(-1); - m_reset = true; + m_tc->RemoveAll(); GSRenderer::Reset(); } @@ -72,6 +75,93 @@ void GSRendererSW::VSync(int field) { Sync(0); // IncAge might delete a cached texture in use + if(LOG) + { + fprintf(s_fp, "%lld\n", m_perfmon.GetFrame()); + + GSVector4i dr = GetDisplayRect(); + GSVector4i fr = GetFrameRect(); + GSVector2i ds = GetDeviceSize(); + + fprintf(s_fp, "dr %d %d %d %d, fr %d %d %d %d, ds %d %d\n", + dr.x, dr.y, dr.z, dr.w, + fr.x, fr.y, fr.z, fr.w, + ds.x, ds.y); + + for(int i = 0; i < 2; i++) + { + if(i == 0 && !m_regs->PMODE.EN1) continue; + if(i == 1 && !m_regs->PMODE.EN2) continue; + + fprintf(s_fp, "DISPFB[%d] BP=%05x BW=%d PSM=%d DBX=%d DBY=%d\n", + i, + m_regs->DISP[i].DISPFB.Block(), + m_regs->DISP[i].DISPFB.FBW, + m_regs->DISP[i].DISPFB.PSM, + m_regs->DISP[i].DISPFB.DBX, + m_regs->DISP[i].DISPFB.DBY + ); + + fprintf(s_fp, "DISPLAY[%d] DX=%d DY=%d DW=%d DH=%d MAGH=%d MAGV=%d\n", + i, + m_regs->DISP[i].DISPLAY.DX, + m_regs->DISP[i].DISPLAY.DY, + m_regs->DISP[i].DISPLAY.DW, + m_regs->DISP[i].DISPLAY.DH, + m_regs->DISP[i].DISPLAY.MAGH, + m_regs->DISP[i].DISPLAY.MAGV + ); + } + + fprintf(s_fp, "PMODE EN1=%d EN2=%d CRTMD=%d MMOD=%d AMOD=%d SLBG=%d ALP=%d\n", + m_regs->PMODE.EN1, + m_regs->PMODE.EN2, + m_regs->PMODE.CRTMD, + m_regs->PMODE.MMOD, + m_regs->PMODE.AMOD, + m_regs->PMODE.SLBG, + m_regs->PMODE.ALP + ); + + fprintf(s_fp, "SMODE1 %08x_%08x\n", + m_regs->SMODE1.u32[0], + m_regs->SMODE1.u32[1] + ); + + fprintf(s_fp, "SMODE2 INT=%d FFMD=%d DPMS=%d\n", + m_regs->SMODE2.INT, + m_regs->SMODE2.FFMD, + m_regs->SMODE2.DPMS + ); + + fprintf(s_fp, "SRFSH %08x_%08x\n", + m_regs->SRFSH.u32[0], + m_regs->SRFSH.u32[1] + ); + + fprintf(s_fp, "SYNCH1 %08x_%08x\n", + m_regs->SYNCH1.u32[0], + m_regs->SYNCH1.u32[1] + ); + + fprintf(s_fp, "SYNCH2 %08x_%08x\n", + m_regs->SYNCH2.u32[0], + m_regs->SYNCH2.u32[1] + ); + + fprintf(s_fp, "SYNCV %08x_%08x\n", + m_regs->SYNCV.u32[0], + m_regs->SYNCV.u32[1] + ); + + fprintf(s_fp, "CSR %08x_%08x\n", + m_regs->CSR.u32[0], + m_regs->CSR.u32[1] + ); + + fflush(s_fp); + } + /* int draw[8], sum = 0; @@ -87,20 +177,12 @@ void GSRendererSW::VSync(int field) draw[0], draw[1], draw[2], draw[3], draw[4], draw[5], draw[6], draw[7], sum); // - printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0; - printf("m_syncpoint_count = %d\n", ((GSRasterizerList*)m_rl)->m_syncpoint_count); ((GSRasterizerList*)m_rl)->m_syncpoint_count = 0; */ + GSRenderer::VSync(field); m_tc->IncAge(); - if(m_reset) - { - m_tc->RemoveAll(); - - m_reset = false; - } - // if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats(); } @@ -197,10 +279,6 @@ void GSRendererSW::ConvertVertex(size_t dst_index, size_t src_index) } } -#define LOG 0 - -FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; - void GSRendererSW::Draw() { SharedData* sd = new SharedData(this); @@ -265,89 +343,18 @@ void GSRendererSW::Draw() m_tc->InvalidatePages(zb_pages, context->offset.zb->psm); } - // set data->syncpoint - - if(m_fzb != context->offset.fzb) + if(CheckTargetPages(fb_pages, zb_pages, r)) { - // hmm, what if "r" gets bigger next time and slips through unchecked, need to trace that too - - sd->syncpoint = true; // TODO - - if(!sd->syncpoint) - { - if(fb_pages == NULL) - { - fb_pages = context->offset.fb->GetPages(r); - } - - if(CheckTargetPages<0xffffffff>(fb_pages)) - { - sd->syncpoint = true; - - if(LOG) fprintf(s_fp, "syncpoint 0\n"); - } - } - - if(!sd->syncpoint) - { - if(zb_pages == NULL) - { - zb_pages = context->offset.zb->GetPages(r); - } - - if(CheckTargetPages<0xffffffff>(zb_pages)) - { - sd->syncpoint = true; - - if(LOG) fprintf(s_fp, "syncpoint 1\n"); - } - } - - if(!sd->syncpoint) - { - if(LOG) fprintf(s_fp, "no syncpoint *\n"); - } - - m_fzb = context->offset.fzb; + sd->syncpoint = true; } - else - { - // chross-check frame and z-buffer pages, they cannot overlap with eachother and with previous batches in queue, - // m_fzb filters out most of these cases, only have to be careful when the addresses stay the same and the output - // is mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300) - - if(!sd->syncpoint) - { - if(gd.sel.fwrite) - { - if(CheckTargetPages<0xffff0000>(fb_pages)) // already used as a z-buffer - { - sd->syncpoint = true; - - if(LOG) fprintf(s_fp, "syncpoint 2\n"); - } - } - } - - if(!sd->syncpoint) - { - if(gd.sel.zwrite) - { - if(CheckTargetPages<0x0000ffff>(zb_pages)) // already used as a frame buffer - { - sd->syncpoint = true; - - if(LOG) fprintf(s_fp, "syncpoint 3\n"); - } - } - } - } - - // sd->UseTargetPages(fb_pages, zb_pages); - // + if(LOG) {fprintf(s_fp, "queue %05x %d %05x %d %05x %d %dx%d | %d %d %d\n", + m_context->FRAME.Block(), m_context->FRAME.PSM, + m_context->ZBUF.Block(), m_context->ZBUF.PSM, + PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, + PRIM->PRIM, sd->vertex_count, sd->index_count); fflush(s_fp);} if(s_dump) { @@ -404,12 +411,6 @@ void GSRendererSW::Draw() } else { - if(LOG) fprintf(s_fp, "queue %05x %d %05x %d %05x %d %dx%d | %d %d %d\n", - m_context->FRAME.Block(), m_context->FRAME.PSM, - m_context->ZBUF.Block(), m_context->ZBUF.PSM, - PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, - PRIM->PRIM, sd->vertex_count, sd->index_count); - m_rl->Queue(data); } @@ -435,36 +436,42 @@ void GSRendererSW::Sync(int reason) m_rl->Sync(); - s_n++; + if(0) + { + std::string s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM); + + m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + + s_n++; + } t = __rdtsc() - t; - if(LOG) fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, m_rl->GetPixels(), t > 10000000 ? '*' : ' '); + int pixels = m_rl->GetPixels(); - m_perfmon.Put(GSPerfMon::Fillrate, m_rl->GetPixels()); + if(LOG) {fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, pixels, t > 10000000 ? '*' : ' '); fflush(s_fp);} + + m_perfmon.Put(GSPerfMon::Fillrate, pixels); } void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) { GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); - uint32* RESTRICT p = m_tmp_pages; - - o->GetPages(r, p); + o->GetPages(r, m_tmp_pages); // check if the changing pages either used as a texture or a target - for(; *p != GSOffset::EOP; p++) + if(!m_rl->IsSynced()) { - uint32 page = *p; - - //while(m_fzb_pages[page] | m_tex_pages[page]) _mm_pause(); - - if(m_fzb_pages[page] | m_tex_pages[page]) + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) { - Sync(5); + if(m_fzb_pages[*p] | m_tex_pages[*p]) + { + Sync(5); - break; + break; + } } } @@ -473,21 +480,20 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) { - GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); - - uint32* RESTRICT p = m_tmp_pages; - - o->GetPages(r, p); - - for(; *p != GSOffset::EOP; p++) + if(!m_rl->IsSynced()) { - //while(m_fzb_pages[*p]) _mm_pause(); + GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); - if(m_fzb_pages[*p]) + o->GetPages(r, m_tmp_pages); + + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) { - Sync(6); + if(m_fzb_pages[*p]) + { + Sync(6); - break; + break; + } } } } @@ -505,15 +511,16 @@ void GSRendererSW::UsePages(const uint32* pages, int type) } else { - for(const uint32* p = pages; *p != GSOffset::EOP; p++) + if(!m_rl->IsSynced()) { - //while(m_fzb_pages[*p]) _mm_pause(); - - if(m_fzb_pages[*p]) // currently being drawn to? => sync (could even spin and wait until it hits 0, not sure if it's worth though, or just create 512 condvars? :D) + for(const uint32* p = pages; *p != GSOffset::EOP; p++) { - Sync(7); + if(m_fzb_pages[*p]) // currently being drawn to? => sync + { + Sync(7); - break; + break; + } } } @@ -548,13 +555,150 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type) } } -template bool GSRendererSW::CheckTargetPages(const uint32* pages) +bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r) { - for(const uint32* p = pages; *p != GSOffset::EOP; p++) + bool synced = m_rl->IsSynced(); + + if(m_fzb != m_context->offset.fzb4) { - if(mask != 0xffffffff ? (m_fzb_pages[*p] & mask) : m_fzb_pages[*p]) + // targets changed, check everything + + m_fzb = m_context->offset.fzb4; + m_fzb_bbox = r; + + if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r); + if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r); + + memset(m_fzb_cur_pages, 0, sizeof(m_fzb_cur_pages)); + + uint32 used = 0; + + for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++) { - return true; + uint32 i = *p; + + uint32 row = i >> 5; + uint32 col = 1 << (i & 31); + + m_fzb_cur_pages[row] |= col; + + used |= m_fzb_pages[i]; + } + + for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++) + { + uint32 i = *p; + + uint32 row = i >> 5; + uint32 col = 1 << (i & 31); + + m_fzb_cur_pages[row] |= col; + + used |= m_fzb_pages[i]; + } + + if(!synced) + { + if(used) + { + if(LOG) {fprintf(s_fp, "syncpoint 0\n"); fflush(s_fp);} + + return true; + } + + if(LOG) {fprintf(s_fp, "no syncpoint *\n"); fflush(s_fp);} + } + } + else + { + // same target, only check new areas and cross-rendering between frame and z-buffer + + GSVector4i bbox = m_fzb_bbox.runion(r); + + bool check = !m_fzb_bbox.eq(bbox); + + m_fzb_bbox = bbox; + + if(check) + { + // drawing area is larger than previous time, check new parts only to avoid false positives (m_fzb_cur_pages guards) + + if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r); + if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r); + + uint32 used = 0; + + for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++) + { + uint32 i = *p; + + uint32 row = i >> 5; + uint32 col = 1 << (i & 31); + + if((m_fzb_cur_pages[row] & col) == 0) + { + m_fzb_cur_pages[row] |= col; + + used |= m_fzb_pages[i]; + } + } + + for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++) + { + uint32 i = *p; + + uint32 row = i >> 5; + uint32 col = 1 << (i & 31); + + if((m_fzb_cur_pages[row] & col) == 0) + { + m_fzb_cur_pages[row] |= col; + + used |= m_fzb_pages[i]; + } + } + + if(!synced) + { + if(used) + { + if(LOG) {fprintf(s_fp, "syncpoint 1\n"); fflush(s_fp);} + + return true; + } + } + } + + if(!synced) + { + // chross-check frame and z-buffer pages, they cannot overlap with eachother and with previous batches in queue, + // have to be careful when the two buffers are mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300) + + if(fb_pages) + { + for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++) + { + if(m_fzb_pages[*p] & 0xffff0000) + { + if(LOG) {fprintf(s_fp, "syncpoint 2\n"); fflush(s_fp);} + + return true; + } + } + } + + if(zb_pages) + { + for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++) + { + if(m_fzb_pages[*p] & 0x0000ffff) + { + if(LOG) {fprintf(s_fp, "syncpoint 3\n"); fflush(s_fp);} + + return true; + } + } + } } } @@ -577,8 +721,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.zbr = context->offset.zb->pixel.row; gd.fbc = context->offset.fb->pixel.col[0]; gd.zbc = context->offset.zb->pixel.col[0]; - gd.fzbr = context->offset.fzb->row; - gd.fzbc = context->offset.fzb->col; + gd.fzbr = context->offset.fzb4->row; + gd.fzbc = context->offset.fzb4->col; gd.sel.key = 0; @@ -1117,8 +1261,8 @@ GSRendererSW::SharedData::~SharedData() } } - delete m_fb_pages; - delete m_zb_pages; + delete [] m_fb_pages; + delete [] m_zb_pages; for(size_t i = 0; i < countof(m_tex_pages) && m_tex_pages[i] != NULL; i++) { @@ -1153,8 +1297,10 @@ void GSRendererSW::SharedData::UseSourcePages(GSTextureCacheSW::Texture* t, int { ASSERT(m_tex_pages[level] == NULL); - m_tex_pages[level] = t->m_pages.n; + const uint32* pages = t->m_pages.n; + + m_tex_pages[level] = pages; m_tex_pages[level + 1] = NULL; - m_parent->UsePages(t->m_pages.n, 2); + m_parent->UsePages(pages, 2); } diff --git a/plugins/GSdx/GSRendererSW.h b/plugins/GSdx/GSRendererSW.h index ee68de3611..8c9914530a 100644 --- a/plugins/GSdx/GSRendererSW.h +++ b/plugins/GSdx/GSRendererSW.h @@ -48,8 +48,9 @@ protected: GSTextureCacheSW* m_tc; GSTexture* m_texture[2]; uint8* m_output; - bool m_reset; GSPixelOffset4* m_fzb; + GSVector4i m_fzb_bbox; + uint32 m_fzb_cur_pages[16]; uint32 m_fzb_pages[512]; // uint16 frame/zbuf pages interleaved uint16 m_tex_pages[512]; uint32 m_tmp_pages[512 + 1]; @@ -66,7 +67,7 @@ protected: void UsePages(const uint32* pages, int type); void ReleasePages(const uint32* pages, int type); - template bool CheckTargetPages(const uint32* pages); + bool CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r); bool GetScanlineGlobalData(SharedData* data); diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index f1259f76ca..4522e85011 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -209,6 +209,9 @@ void GSState::SetFrameSkip(int skip) void GSState::Reset() { + printf("GS reset\n"); + + memset(m_mem.m_vm8, 0, m_mem.m_vmsize); memset(&m_path[0], 0, sizeof(m_path[0]) * countof(m_path)); memset(&m_v, 0, sizeof(m_v)); @@ -253,6 +256,7 @@ void GSState::ResetHandlers() m_fpGIFRegHandlerXYZ[P][1] = &GSState::GIFRegHandlerXYZF2; \ m_fpGIFRegHandlerXYZ[P][2] = &GSState::GIFRegHandlerXYZ2; \ m_fpGIFRegHandlerXYZ[P][3] = &GSState::GIFRegHandlerXYZ2; \ + m_fpGIFPackedRegHandlerSTQRGBAXYZF2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZF2

; \ SetHandlerXYZ(GS_POINTLIST); SetHandlerXYZ(GS_LINELIST); @@ -546,6 +550,36 @@ void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r) { } +template +void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size) +{ + ASSERT(size > 0 && size % 3 == 0); + + const GIFPackedReg* RESTRICT r_end = r + size; + + while(r < r_end) + { + GSVector4i st = GSVector4i::loadl(&r[0].u64[0]); + GSVector4i q = GSVector4i::loadl(&r[0].u64[1]); + GSVector4i rgba = (GSVector4i::load(&r[1]) & GSVector4i::x000000ff()).ps32().pu16(); + + m_v.m[0] = st.upl64(rgba.upl32(q)); + + GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]); + GSVector4i zf = GSVector4i::loadl(&r[2].u64[1]); + xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV)); + zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff()); + + m_v.m[1] = xy.upl32(zf); + + VertexKick(r[2].XYZF2.Skip()); + + r += 3; + } + + m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time +} + // GIFRegHandler* void GSState::GIFRegHandlerNull(const GIFReg* RESTRICT r) @@ -1037,7 +1071,8 @@ template void GSState::GIFRegHandlerFRAME(const GIFReg* RESTRICT r) { m_env.CTXT[i].offset.fb = m_mem.GetOffset(r->FRAME.Block(), r->FRAME.FBW, r->FRAME.PSM); m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), r->FRAME.FBW, m_env.CTXT[i].ZBUF.PSM); - m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF); + m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(r->FRAME, m_env.CTXT[i].ZBUF); + m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF); } m_env.CTXT[i].FRAME = (GSVector4i)r->FRAME; @@ -1075,7 +1110,8 @@ template void GSState::GIFRegHandlerZBUF(const GIFReg* RESTRICT r) if((m_env.CTXT[i].ZBUF.u32[0] ^ ZBUF.u32[0]) & 0x3f0001ff) // ZBP PSM { m_env.CTXT[i].offset.zb = m_mem.GetOffset(ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, ZBUF.PSM); - m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF); + m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, ZBUF); + m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF); } m_env.CTXT[i].ZBUF = (GSVector4i)ZBUF; @@ -1726,8 +1762,28 @@ template void GSState::Transfer(const uint8* mem, uint32 size) { size -= total; - if(path.adonly) + switch(path.type) { + case GIFPath::TYPE_UNKNOWN: + + { + uint32 reg = 0; + + do + { + (this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem); + + mem += sizeof(GIFPackedReg); + + reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg + } + while(--total > 0); + } + + break; + + case GIFPath::TYPE_ADONLY: // very common + do { (this->*m_fpGIFRegHandlers[((GIFPackedReg*)mem)->A_D.ADDR])(&((GIFPackedReg*)mem)->r); @@ -1735,20 +1791,20 @@ template void GSState::Transfer(const uint8* mem, uint32 size) mem += sizeof(GIFPackedReg); } while(--total > 0); - } - else - { - uint32 reg = 0; - do - { - (this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem); + break; + + case GIFPath::TYPE_STQRGBAXYZF2: // majority of the vertices are formatted like this - mem += sizeof(GIFPackedReg); + (this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2])((GIFPackedReg*)mem, total); - reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg - } - while(--total > 0); + mem += total * sizeof(GIFPackedReg); + + break; + + default: + + __assume(0); } path.nloop = 0; @@ -2070,7 +2126,8 @@ int GSState::Defrost(const GSFreezeData* fd) m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM); m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM); m_env.CTXT[i].offset.tex = m_mem.GetOffset(m_env.CTXT[i].TEX0.TBP0, m_env.CTXT[i].TEX0.TBW, m_env.CTXT[i].TEX0.PSM); - m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF); + m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF); + m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF); } UpdateScissor(); @@ -2116,6 +2173,8 @@ void GSState::UpdateVertexKick() m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = m_fpGIFRegHandlerXYZ[prim][2]; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = m_fpGIFRegHandlerXYZ[prim][3]; + m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[prim]; + m_cvf = m_cv[prim][PRIM->TME][PRIM->FST]; } diff --git a/plugins/GSdx/GSState.h b/plugins/GSdx/GSState.h index 0624f39267..52a8dbe81e 100644 --- a/plugins/GSdx/GSState.h +++ b/plugins/GSdx/GSState.h @@ -59,6 +59,13 @@ class GSState : public GSAlignedClass<32> GIFRegHandler m_fpGIFRegHandlers[256]; GIFRegHandler m_fpGIFRegHandlerXYZ[8][4]; + typedef void (GSState::*GIFPackedRegHandlerC)(const GIFPackedReg* RESTRICT r, uint32 size); + + GIFPackedRegHandlerC m_fpGIFPackedRegHandlersC[1]; + GIFPackedRegHandlerC m_fpGIFPackedRegHandlerSTQRGBAXYZF2[8]; + + template void GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size); + template void ApplyTEX0(GIFRegTEX0& TEX0); void ApplyPRIM(const GIFRegPRIM& PRIM); diff --git a/plugins/GSdx/GSTexture11.cpp b/plugins/GSdx/GSTexture11.cpp index da0c285f05..9f199fbb36 100644 --- a/plugins/GSdx/GSTexture11.cpp +++ b/plugins/GSdx/GSTexture11.cpp @@ -167,6 +167,18 @@ GSTexture11::operator ID3D11ShaderResourceView*() return m_srv; } +GSTexture11::operator ID3D11UnorderedAccessView*() +{ + if(!m_uav && m_dev && m_texture) + { + ASSERT(!m_msaa); + + m_dev->CreateUnorderedAccessView(m_texture, NULL, &m_uav); + } + + return m_uav; +} + GSTexture11::operator ID3D11RenderTargetView*() { ASSERT(m_dev); diff --git a/plugins/GSdx/GSTexture11.h b/plugins/GSdx/GSTexture11.h index 1f78df5f34..2d287ac4f9 100644 --- a/plugins/GSdx/GSTexture11.h +++ b/plugins/GSdx/GSTexture11.h @@ -30,6 +30,7 @@ class GSTexture11 : public GSTexture CComPtr m_texture; D3D11_TEXTURE2D_DESC m_desc; CComPtr m_srv; + CComPtr m_uav; CComPtr m_rtv; CComPtr m_dsv; @@ -43,6 +44,7 @@ public: operator ID3D11Texture2D*(); operator ID3D11ShaderResourceView*(); + operator ID3D11UnorderedAccessView*(); operator ID3D11RenderTargetView*(); operator ID3D11DepthStencilView*(); }; diff --git a/plugins/GSdx/GSTextureCache.cpp b/plugins/GSdx/GSTextureCache.cpp index 8a9e572cee..49743f557c 100644 --- a/plugins/GSdx/GSTextureCache.cpp +++ b/plugins/GSdx/GSTextureCache.cpp @@ -281,6 +281,8 @@ GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int { return NULL; } + + m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); // new frame buffers after reset should be cleared, don't display memory garbage } else { diff --git a/plugins/GSdx/GSThread.cpp b/plugins/GSdx/GSThread.cpp index 54284285f0..3f0eda6921 100644 --- a/plugins/GSdx/GSThread.cpp +++ b/plugins/GSdx/GSThread.cpp @@ -30,6 +30,7 @@ WakeAllConditionVariablePtr pWakeAllConditionVariable; SleepConditionVariableSRWPtr pSleepConditionVariableSRW; InitializeSRWLockPtr pInitializeSRWLock;; AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive; +TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive; ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive; class InitCondVar @@ -47,6 +48,7 @@ public: pSleepConditionVariableSRW = (SleepConditionVariableSRWPtr)GetProcAddress(m_kernel32, "SleepConditionVariableSRW"); pInitializeSRWLock = (InitializeSRWLockPtr)GetProcAddress(m_kernel32, "InitializeSRWLock"); pAcquireSRWLockExclusive = (AcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "AcquireSRWLockExclusive"); + pTryAcquireSRWLockExclusive = (TryAcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockExclusive"); pReleaseSRWLockExclusive = (ReleaseSRWLockExclusivePtr)GetProcAddress(m_kernel32, "ReleaseSRWLockExclusive"); } diff --git a/plugins/GSdx/GSThread.h b/plugins/GSdx/GSThread.h index 99247d8431..d53faf04e6 100644 --- a/plugins/GSdx/GSThread.h +++ b/plugins/GSdx/GSThread.h @@ -21,6 +21,8 @@ #pragma once +#include "GSdx.h" + #ifdef _WINDOWS typedef void (WINAPI * InitializeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable); @@ -29,7 +31,7 @@ typedef void (WINAPI * WakeAllConditionVariablePtr)(CONDITION_VARIABLE* Conditio typedef void (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags); typedef void (WINAPI * InitializeSRWLockPtr)(SRWLOCK* SRWLock); typedef void (WINAPI * AcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock); -typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock); +typedef BOOLEAN (WINAPI * TryAcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock); extern InitializeConditionVariablePtr pInitializeConditionVariable; extern WakeConditionVariablePtr pWakeConditionVariable; @@ -37,7 +39,7 @@ extern WakeAllConditionVariablePtr pWakeAllConditionVariable; extern SleepConditionVariableSRWPtr pSleepConditionVariableSRW; extern InitializeSRWLockPtr pInitializeSRWLock;; extern AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive; -extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive; +extern TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive; class GSThread { @@ -92,7 +94,7 @@ public: GSCondVarLock() {pInitializeSRWLock(&m_lock);} void Lock() {pAcquireSRWLockExclusive(&m_lock);} - void Unlock() {pReleaseSRWLockExclusive(&m_lock);} + bool TryLock() {return pTryAcquireSRWLockExclusive(&m_lock) == TRUE;} void Unlock() {pReleaseSRWLockExclusive(&m_lock);} operator SRWLOCK* () {return &m_lock;} }; @@ -114,7 +116,6 @@ public: #include #include -#include "GSdx.h" class GSThread { @@ -191,6 +192,7 @@ public: } void Lock() {pthread_mutex_lock(&m_mutex);} + bool TryLock() {return pthread_mutex_trylock(&m_mutex) == 0;} void Unlock() {pthread_mutex_unlock(&m_mutex);} operator pthread_mutex_t* () {return &m_mutex;} @@ -254,10 +256,10 @@ public: template class GSJobQueue : private GSThread { protected: - int m_count; queue m_queue; + volatile long m_count; // NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent) volatile bool m_exit; - struct {GSCritSec lock; GSEvent notempty; volatile long count;} m_ev; + struct {GSCritSec lock; GSEvent notempty;} m_ev; struct {GSCondVar notempty, empty; GSCondVarLock lock; bool available;} m_cv; void ThreadProc() @@ -285,6 +287,8 @@ protected: m_queue.pop(); + m_count--; + if(m_queue.empty()) { m_cv.empty.Set(); @@ -318,7 +322,7 @@ protected: m_queue.pop(); - _InterlockedDecrement(&m_ev.count); + m_count--; } } } @@ -328,16 +332,14 @@ public: : m_count(0) , m_exit(false) { - m_ev.count = 0; + m_cv.available = !!theApp.GetConfig("condvar", 1); #ifdef _WINDOWS - m_cv.available = pInitializeConditionVariable != NULL; - - #elif defined(_LINUX) - - //m_cv.available = true; - m_cv.available = !!theApp.GetConfig("condvar", 1); + if(pInitializeConditionVariable == NULL) + { + m_cv.available = false; + } #endif @@ -358,12 +360,14 @@ public: } } - int GetCount() const + bool IsEmpty() const { - return m_count; + ASSERT(m_count >= 0); + + return m_count == 0; } - virtual void Push(const T& item) + void Push(const T& item) { if(m_cv.available) { @@ -371,6 +375,8 @@ public: m_queue.push(item); + m_count++; + m_cv.lock.Unlock(); m_cv.notempty.Set(); @@ -381,35 +387,34 @@ public: m_queue.push(item); - _InterlockedIncrement(&m_ev.count); + m_count++; m_ev.notempty.Set(); } - - m_count++; } - virtual void Wait() + void Wait() { if(m_cv.available) { - m_cv.lock.Lock(); - - while(!m_queue.empty()) + if(m_count > 0) { - m_cv.empty.Wait(m_cv.lock); - } + m_cv.lock.Lock(); - m_cv.lock.Unlock(); + while(!m_queue.empty()) + { + m_cv.empty.Wait(m_cv.lock); + } + + ASSERT(m_count == 0); + + m_cv.lock.Unlock(); + } } else { - // NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent) - - while(m_ev.count > 0) _mm_pause(); + while(m_count > 0) _mm_pause(); } - - m_count++; } virtual void Process(T& item) = 0; diff --git a/plugins/GSdx/GSdx_vs2008.vcproj b/plugins/GSdx/GSdx_vs2008.vcproj index 8a4b1a71dc..67186e4337 100644 --- a/plugins/GSdx/GSdx_vs2008.vcproj +++ b/plugins/GSdx/GSdx_vs2008.vcproj @@ -1024,6 +1024,10 @@ RelativePath=".\GSRenderer.cpp" > + + @@ -1630,6 +1634,10 @@ RelativePath=".\GSRenderer.h" > + + diff --git a/plugins/GSdx/res/cs.fx b/plugins/GSdx/res/cs.fx index 7579753e77..8bbedf4177 100644 --- a/plugins/GSdx/res/cs.fx +++ b/plugins/GSdx/res/cs.fx @@ -1,73 +1,270 @@ -struct Vertex +#ifndef VS_TME +#define VS_TME 1 +#define VS_FST 1 +#endif + +#ifndef GS_IIP +#define GS_IIP 0 +#define GS_PRIM 3 +#endif + + +// +globallycoherent RWByteAddressBuffer VideoMemory : register(u0); + +//globallycoherent RWTexture2D VideoMemory : register(u0); // 8192 * 512 R8_UINT + +Buffer FZBufRow : register(t0); +Buffer FZBufCol : register(t1); +Texture2D Palette : register(t2); +Texture2D TextureL0 : register(t3); +Texture2D TextureL1 : register(t4); +Texture2D TextureL2 : register(t5); +Texture2D TextureL3 : register(t6); +Texture2D TextureL4 : register(t7); +Texture2D TextureL5 : register(t8); +Texture2D TextureL6 : register(t9); + +cbuffer VSConstantBuffer : register(c0) { - float2 st; - uint c; - float q; - uint xy, z; - uint uv, f; + float4 VertexScale; + float4 VertexOffset; }; -RWByteAddressBuffer VideoMemory : register(u0); - -StructuredBuffer VertexBuffer : register(t0); -Buffer IndexBuffer : register(t1); - -Buffer FrameRowOffset : register(t2); -Buffer FrameColOffset : register(t3); -Buffer ZBufRowOffset : register(t4); -Buffer ZBufColOffset : register(t5); - -cbuffer DrawingEnvironment : register(c0) +cbuffer PSConstantBuffer : register(c0) { // TODO }; -// one group is 16x8 pixels and one thread does 2 pixels, otherwise could not read-merge-write 16-bit targets safely -// neighburing pixels are next to eachother in memory, at least we don't have to calculate the address twice - -// TODO: they say groupshared memory is faster, try unswizzling the corresponding chunk of memory initially (how to do that once by only one thread?) then write-back when finished, unless it was untouched - -[numthreads(8, 8, 1)] -void cs_main(uint3 gid : SV_GroupID, uint3 tid : SV_GroupThreadID) +struct VS_INPUT { - uint count; + uint2 p : POSITION0; + uint z : POSITION1; + float2 st : TEXCOORD0; + float q : TEXCOORD1; + uint2 uv : TEXCOORD2; + float4 c : COLOR0; + float4 f : COLOR1; +}; - IndexBuffer.GetDimensions(count); +struct VS_OUTPUT +{ + float4 p : SV_Position; + float2 z : TEXCOORD0; + float4 t : TEXCOORD1; + float4 c : COLOR0; +}; - // #if GS_PRIM == 2 (triangle) +struct GS_OUTPUT +{ + float4 p : SV_Position; + float2 z : TEXCOORD0; + float4 t : TEXCOORD1; + float4 c : COLOR0; + uint id : SV_PrimitiveID; +}; - for(uint i = 0; i < count; i += 3) +VS_OUTPUT vs_main(VS_INPUT input) +{ + VS_OUTPUT output; + + output.p = float4(input.p, 0.0f, 0.0f) * VertexScale - VertexOffset; + + output.z = float2(input.z & 0xffff, input.z >> 16); + + if(VS_TME) { - Vertex v0 = VertexBuffer[IndexBuffer[i + 0]]; - Vertex v1 = VertexBuffer[IndexBuffer[i + 1]]; - Vertex v2 = VertexBuffer[IndexBuffer[i + 2]]; - - uint x = gid.x + tid.x * 2; - uint y = gid.y + tid.y; - - uint fa = FrameRowOffset[y] + FrameColOffset[x]; - uint za = ZBufRowOffset[y] + ZBufColOffset[x]; - - // TODO: quickly reject if x, y is outside the triangle - // TODO: calculate interpolated values at x, y - // TODO: run the GS pipeline - // TODO: repeat for x+1, y - // TODO: output two pixels (might be better to process a single pixel, more threads, if there is no 16-bit target involved) - - // testing... - - uint4 c = VideoMemory.Load4(fa); // does this load 4*4 bytes? or 4 bytes each expanded uint? - - c = (v0.c >> uint4(0, 8, 16, 24)) & 0xff; // => ushr r1.yzw, r1.xxxx, l(0, 8, 16, 24), v0.c auto-converted to uint4 and per-component shift in one instruction, SSE is embarrassed - - VideoMemory.Store4(fa, c); // same question, 4*4 bytes or compressed to uint + if(VS_FST) + { + output.t.xy = input.uv; + output.t.w = 1.0f; + } + else + { + output.t.xy = input.st; + output.t.w = input.q; + } + } + else + { + output.t.xy = 0; + output.t.w = 1.0f; } - // #endif + output.c = input.c; + output.t.z = input.f.r; + + return output; } -// TODO: DrawPoint (this is going to be a waste of resources) -// TODO: DrawLine (line hit-test, will it work?) -// TODO: DrawSprite (similar to DrawTriangle) -// TODO: if read-backs are too slow, implement GSState::Write/FlushWrite/Read/clut.Write in a compute shader -// TODO: unswizzle pages from VideoMemory to the texture cache (if they are marked as valid, otherwise upload from GSLocalMemory::m_vm8) + +#if GS_PRIM == 0 + +[maxvertexcount(1)] +void gs_main(point VS_OUTPUT input[1], inout PointStream stream, uint id : SV_PrimitiveID) +{ + GS_OUTPUT output; + + output.p = input[0].p; + output.z = input[0].z; + output.t = input[0].t; + output.c = input[0].c; + output.id = id; + + stream.Append(output); +} + +#elif GS_PRIM == 1 + +[maxvertexcount(2)] +void gs_main(line VS_OUTPUT input[2], inout LineStream stream, uint id : SV_PrimitiveID) +{ + for(int i = 0; i < 2; i++) + { + GS_OUTPUT output; + + output.p = input[i].p; + output.z = input[i].z; + output.t = input[i].t; + output.c = input[i].c; + output.id = id; + + #if GS_IIP == 0 + if(i != 1) output.c = input[1].c; + #endif + + stream.Append(output); + } +} + +#elif GS_PRIM == 2 + +[maxvertexcount(3)] +void gs_main(triangle VS_OUTPUT input[3], inout TriangleStream stream, uint id : SV_PrimitiveID) +{ + for(int i = 0; i < 3; i++) + { + GS_OUTPUT output; + + output.p = input[i].p; + output.z = input[i].z; + output.t = input[i].t; + output.c = input[i].c; + output.id = id; + + #if GS_IIP == 0 + if(i != 1) output.c = input[2].c; + #endif + + stream.Append(output); + } +} + +#elif GS_PRIM == 3 + +[maxvertexcount(4)] +void gs_main(line VS_OUTPUT input[2], inout TriangleStream stream, uint id : SV_PrimitiveID) +{ + GS_OUTPUT lt, rb, lb, rt; + + lt.p = input[0].p; + lt.z = input[1].z; + lt.t.xy = input[0].t.xy; + lt.t.zw = input[1].t.zw; + lt.c = input[0].c; + lt.id = id; + + #if GS_IIP == 0 + lt.c = input[1].c; + #endif + + rb.p = input[1].p; + rb.z = input[1].z; + rb.t = input[1].t; + rb.c = input[1].c; + rb.id = id; + + lb = lt; + lb.p.y = rb.p.y; + lb.t.y = rb.t.y; + + rt = rb; + rt.p.y = lt.p.y; + rt.t.y = lt.t.y; + + stream.Append(lt); + stream.Append(lb); + stream.Append(rt); + stream.Append(rb); +} + +#endif + +uint CompressColor(float4 f) +{ + // is there a faster way? + + uint4 c = (uint4)(f * 0xff) << uint4(0, 8, 16, 24); + + return c.r | c.g | c.b | c.a; +} + +void ps_main(GS_OUTPUT input) +{ + uint c = CompressColor(input.c); + uint z = (uint)(input.z.y * 0x10000 + input.z.x); + + uint x = (uint)input.p.x; + uint y = (uint)input.p.y; + + uint2 addr = FZBufRow[y] + FZBufCol[x]; // 16-bit address + + uint2 unaligned = addr.xy & 1; // 16-bit formats can address into the middle of an uint (smallest word size for VideoMemory) + + addr = (addr & ~1) * 2; + + //DeviceMemoryBarrier(); + + uint zd = VideoMemory.Load(addr.y); + + if(z < zd) discard; + + VideoMemory.Store(addr.y, z); + VideoMemory.Store(addr.x, c); + +/* + addr <<= 1; + + uint2 fa0 = uint2(addr.x & 0x1fff, addr.x >> 13); + uint2 fa1 = fa0 + uint2(1, 0); + uint2 fa2 = fa0 + uint2(2, 0); + uint2 fa3 = fa0 + uint2(3, 0); + + uint2 za0 = uint2(addr.y & 0x1fff, addr.y >> 13); + uint2 za1 = za0 + uint2(1, 0); + uint2 za2 = za0 + uint2(2, 0); + uint2 za3 = za0 + uint2(3, 0); + + DeviceMemoryBarrier(); + + uint zd = + (VideoMemory[za0] << 0) | + (VideoMemory[za1] << 8) | + (VideoMemory[za2] << 16) | + (VideoMemory[za3] << 24); + + if(zd >= z) discard; + + VideoMemory[za0] = (z >> 0) & 0xff; + VideoMemory[za1] = (z >> 8) & 0xff; + VideoMemory[za2] = (z >> 16) & 0xff; + VideoMemory[za3] = (z >> 24) & 0xff; + + DeviceMemoryBarrier(); + + VideoMemory[fa0] = (c >> 0) & 0xff; + VideoMemory[fa1] = (c >> 8) & 0xff; + VideoMemory[fa2] = (c >> 16) & 0xff; + VideoMemory[fa3] = (c >> 24) & 0xff; +*/ +}