diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp index f79b4759b9..c8ed7c5128 100644 --- a/plugins/GSdx/GS.cpp +++ b/plugins/GSdx/GS.cpp @@ -33,6 +33,7 @@ #include "GSRendererDX11.h" #include "GSDevice9.h" #include "GSDevice11.h" +#include "GSRendererCS.h" #include "GSSettingsDlg.h" static HRESULT s_hr = E_FAIL; @@ -206,41 +207,64 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1) s_gs = NULL; } - switch(renderer / 3) + if(renderer == 12) { - default: - #ifdef _WINDOWS - case 0: dev = new GSDevice9(); break; - case 1: dev = new GSDevice11(); break; - #endif - case 2: dev = new GSDeviceSDL(); break; - case 3: dev = new GSDeviceNull(); break; - } + #ifdef _WINDOWS + + dev = new GSDevice11(); - if(dev == NULL) - { - return -1; - } + if(dev == NULL) + { + return -1; + } - if(s_gs == NULL) + if(s_gs == NULL) + { + s_gs = new GSRendererCS(); + + s_renderer = renderer; + } + + #endif + } + else { - switch(renderer % 3) + switch(renderer / 3) { default: #ifdef _WINDOWS - case 0: - s_gs = (renderer / 3) == 0 ? (GSRenderer*)new GSRendererDX9() : (GSRenderer*)new GSRendererDX11(); - break; + case 0: dev = new GSDevice9(); break; + case 1: dev = new GSDevice11(); break; #endif - case 1: - s_gs = new GSRendererSW(threads); - break; - case 2: - s_gs = new GSRendererNull(); - break; + case 2: dev = new GSDeviceSDL(); break; + case 3: dev = new GSDeviceNull(); break; } - s_renderer = renderer; + if(dev == NULL) + { + return -1; + } + + if(s_gs == NULL) + { + switch(renderer % 3) + { + default: + #ifdef _WINDOWS + case 0: + s_gs = (renderer / 3) == 0 ? 
(GSRenderer*)new GSRendererDX9() : (GSRenderer*)new GSRendererDX11(); + break; + #endif + case 1: + s_gs = new GSRendererSW(threads); + break; + case 2: + s_gs = new GSRendererNull(); + break; + } + + s_renderer = renderer; + } } } catch(std::exception& ex) diff --git a/plugins/GSdx/GS.h b/plugins/GSdx/GS.h index 385aa685b5..9b10f47ac9 100644 --- a/plugins/GSdx/GS.h +++ b/plugins/GSdx/GS.h @@ -28,8 +28,14 @@ #define PLUGIN_VERSION 16 -#define MAX_PAGES 512 -#define MAX_BLOCKS 16384 +#define VM_SIZE 4194304 +#define PAGE_SIZE 8192 +#define BLOCK_SIZE 256 +#define COLUMN_SIZE 64 + +#define MAX_PAGES (VM_SIZE / PAGE_SIZE) +#define MAX_BLOCKS (VM_SIZE / BLOCK_SIZE) +#define MAX_COLUMNS (VM_SIZE / COLUMN_SIZE) //if defined, will send much info in reply to the API title info queri from PCSX2 //default should be undefined diff --git a/plugins/GSdx/GSDevice11.cpp b/plugins/GSdx/GSDevice11.cpp index ddca54752e..7e253f04e3 100644 --- a/plugins/GSdx/GSDevice11.cpp +++ b/plugins/GSdx/GSDevice11.cpp @@ -144,7 +144,7 @@ bool GSDevice11::Create(GSWnd* wnd) for(int i = 0; i < countof(m_convert.ps); i++) { - hr = CompileShader(IDR_CONVERT_FX, format("ps_main%d", i), NULL, &m_convert.ps[i]); + hr = CompileShader(IDR_CONVERT_FX, format("ps_main%d", i).c_str(), NULL, &m_convert.ps[i]); } memset(&dsd, 0, sizeof(dsd)); @@ -172,7 +172,7 @@ bool GSDevice11::Create(GSWnd* wnd) for(int i = 0; i < countof(m_merge.ps); i++) { - hr = CompileShader(IDR_MERGE_FX, format("ps_main%d", i), NULL, &m_merge.ps[i]); + hr = CompileShader(IDR_MERGE_FX, format("ps_main%d", i).c_str(), NULL, &m_merge.ps[i]); } memset(&bsd, 0, sizeof(bsd)); @@ -200,7 +200,7 @@ bool GSDevice11::Create(GSWnd* wnd) for(int i = 0; i < countof(m_interlace.ps); i++) { - hr = CompileShader(IDR_INTERLACE_FX, format("ps_main%d", i), NULL, &m_interlace.ps[i]); + hr = CompileShader(IDR_INTERLACE_FX, format("ps_main%d", i).c_str(), NULL, &m_interlace.ps[i]); } // fxaa @@ -360,6 +360,11 @@ void GSDevice11::DrawIndexedPrimitive() 
m_ctx->DrawIndexed(m_index.count, m_index.start, m_vertex.start); } +void GSDevice11::Dispatch(uint32 x, uint32 y, uint32 z) +{ + m_ctx->Dispatch(x, y, z); +} + void GSDevice11::ClearRenderTarget(GSTexture* t, const GSVector4& c) { m_ctx->ClearRenderTargetView(*(GSTexture11*)t, c.v); @@ -937,7 +942,7 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb) m_ctx->PSSetShader(ps, NULL, 0); } - if (m_srv_changed) + if(m_srv_changed) { m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv); @@ -959,6 +964,38 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb) } } +void GSDevice11::CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv) +{ + // TODO: if(m_state.cs_srv[i] != srv) + { + // TODO: m_state.cs_srv[i] = srv; + + m_ctx->CSSetShaderResources(i, 1, &srv); + } +} + +void GSDevice11::CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav) +{ + // TODO: if(m_state.cs_uav[i] != uav) + { + // TODO: m_state.cs_uav[i] = uav; + + // uint32 count[] = {-1}; + + m_ctx->CSSetUnorderedAccessViews(i, 1, &uav, NULL); + } +} + +void GSDevice11::CSSetShader(ID3D11ComputeShader* cs) +{ + if(m_state.cs != cs) + { + m_state.cs = cs; + + m_ctx->CSSetShader(cs, NULL, 0); + } +} + void GSDevice11::OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref) { if(m_state.dss != dss || m_state.sref != sref) @@ -1027,7 +1064,7 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector } } -HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il) +HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il) { HRESULT hr; @@ -1037,7 +1074,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M CComPtr shader, error; - hr = 
D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry.c_str(), m_shader.vs.c_str(), 0, 0, NULL, &shader, &error, NULL); + hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.vs.c_str(), 0, 0, NULL, &shader, &error, NULL); if(error) { @@ -1066,7 +1103,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M return hr; } -HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs) +HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs) { HRESULT hr; @@ -1076,7 +1113,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M CComPtr shader, error; - hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry.c_str(), m_shader.gs.c_str(), 0, 0, NULL, &shader, &error, NULL); + hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.gs.c_str(), 0, 0, NULL, &shader, &error, NULL); if(error) { @@ -1098,7 +1135,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M return hr; } -HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps) +HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps) { HRESULT hr; @@ -1108,7 +1145,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M CComPtr shader, error; - hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry.c_str(), m_shader.ps.c_str(), 0, 0, NULL, &shader, &error, NULL); + hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.ps.c_str(), 0, 0, NULL, &shader, &error, 
NULL); if(error) { @@ -1120,7 +1157,71 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M return hr; } - hr = m_dev->CreatePixelShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(),NULL, ps); + hr = m_dev->CreatePixelShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(),NULL, ps); + + if(FAILED(hr)) + { + return hr; + } + + return hr; +} + +HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs) +{ + HRESULT hr; + + vector m; + + PrepareShaderMacro(m, macro); + + CComPtr shader, error; + + hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.ps.c_str(), 0, 0, NULL, &shader, &error, NULL); + + if(error) + { + printf("%s\n", (const char*)error->GetBufferPointer()); + } + + if(FAILED(hr)) + { + return hr; + } + + hr = m_dev->CreateComputeShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(),NULL, cs); + + if(FAILED(hr)) + { + return hr; + } + + return hr; +} + +HRESULT GSDevice11::CompileShader(const char* fn, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs) +{ + HRESULT hr; + + vector m; + + PrepareShaderMacro(m, macro); + + CComPtr shader, error; + + hr = D3DX11CompileFromFile(fn, &m[0], NULL, entry, m_shader.cs.c_str(), 0, 0, NULL, &shader, &error, NULL); + + if(error) + { + printf("%s\n", (const char*)error->GetBufferPointer()); + } + + if(FAILED(hr)) + { + return hr; + } + + hr = m_dev->CreateComputeShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(),NULL, cs); if(FAILED(hr)) { diff --git a/plugins/GSdx/GSDevice11.h b/plugins/GSdx/GSDevice11.h index a370a24b0a..443e6f43d3 100644 --- a/plugins/GSdx/GSDevice11.h +++ b/plugins/GSdx/GSDevice11.h @@ -64,6 +64,7 @@ class GSDevice11 : public GSDeviceDX ID3D11PixelShader* ps; ID3D11Buffer* ps_cb; ID3D11SamplerState* ps_ss[3]; + ID3D11ComputeShader* cs; GSVector2i viewport; GSVector4i scissor; 
ID3D11DepthStencilState* dss; @@ -145,6 +146,7 @@ public: void DrawPrimitive(); void DrawIndexedPrimitive(); + void Dispatch(uint32 x, uint32 y, uint32 z); void ClearRenderTarget(GSTexture* t, const GSVector4& c); void ClearRenderTarget(GSTexture* t, uint32 c); @@ -178,6 +180,9 @@ public: void PSSetShaderResource(int i, GSTexture* sr); void PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb); void PSSetSamplerState(ID3D11SamplerState* ss0, ID3D11SamplerState* ss1, ID3D11SamplerState* ss2 = NULL); + void CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv); + void CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav); + void CSSetShader(ID3D11ComputeShader* cs); void OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref); void OMSetBlendState(ID3D11BlendState* bs, float bf); void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL); @@ -195,8 +200,10 @@ public: operator ID3D11Device*() {return m_dev;} operator ID3D11DeviceContext*() {return m_ctx;} - HRESULT CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il); - HRESULT CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs); - HRESULT CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps); + HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il); + HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs); + HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps); + HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs); + HRESULT CompileShader(const char* fn, const char* entry, D3D11_SHADER_MACRO* macro, 
ID3D11ComputeShader** cs); }; diff --git a/plugins/GSdx/GSDeviceDX.cpp b/plugins/GSdx/GSDeviceDX.cpp index c09d272ccb..160e3130d6 100644 --- a/plugins/GSdx/GSDeviceDX.cpp +++ b/plugins/GSdx/GSDeviceDX.cpp @@ -67,18 +67,21 @@ bool GSDeviceDX::SetFeatureLevel(D3D_FEATURE_LEVEL level, bool compat_mode) m_shader.vs = "vs_4_0"; m_shader.gs = "gs_4_0"; m_shader.ps = "ps_4_0"; + m_shader.cs = "cs_4_0"; break; case D3D_FEATURE_LEVEL_10_1: m_shader.model = "0x401"; m_shader.vs = "vs_4_1"; m_shader.gs = "gs_4_1"; m_shader.ps = "ps_4_1"; + m_shader.cs = "cs_4_1"; break; case D3D_FEATURE_LEVEL_11_0: m_shader.model = "0x500"; m_shader.vs = "vs_5_0"; m_shader.gs = "gs_5_0"; m_shader.ps = "ps_5_0"; + m_shader.cs = "cs_5_0"; break; default: ASSERT(0); diff --git a/plugins/GSdx/GSDeviceDX.h b/plugins/GSdx/GSDeviceDX.h index 6de181fa01..9d2f954472 100644 --- a/plugins/GSdx/GSDeviceDX.h +++ b/plugins/GSdx/GSDeviceDX.h @@ -266,7 +266,7 @@ public: #pragma pack(pop) protected: - struct {D3D_FEATURE_LEVEL level; string model, vs, gs, ps;} m_shader; + struct {D3D_FEATURE_LEVEL level; string model, vs, gs, ps, cs;} m_shader; uint32 m_msaa; DXGI_SAMPLE_DESC m_msaa_desc; @@ -277,6 +277,7 @@ public: virtual ~GSDeviceDX(); bool SetFeatureLevel(D3D_FEATURE_LEVEL level, bool compat_mode); + void GetFeatureLevel(D3D_FEATURE_LEVEL& level) const {level = m_shader.level;} virtual void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim) = 0; virtual void SetupVS(VSSelector sel, const VSConstantBuffer* cb) = 0; diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index bb990a657a..4bffdf475a 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -500,6 +500,11 @@ GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const G GSPixelOffset4* o = (GSPixelOffset4*)_aligned_malloc(sizeof(GSPixelOffset4), 32); o->hash = hash; + o->fbp = fbp; + o->zbp = zbp; + o->fpsm = fpsm; + o->zpsm = zpsm; + 
o->bw = bw; pixelAddress fpa = m_psm[fpsm].pa; pixelAddress zpa = m_psm[zpsm].pa; diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h index a43b8d65e0..e76bde3f00 100644 --- a/plugins/GSdx/GSLocalMemory.h +++ b/plugins/GSdx/GSLocalMemory.h @@ -63,6 +63,7 @@ struct GSPixelOffset4 GSVector2i row[2048]; // f yn | z yn (n = 0 1 2 ...) GSVector2i col[512]; // f xn | z xn (n = 0 4 8 ...) uint32 hash; + uint32 fbp, zbp, fpsm, zpsm, bw; }; class GSLocalMemory : public GSBlock diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index f280934ad4..1872b6844a 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -64,11 +64,15 @@ GSRasterizer::~GSRasterizer() bool GSRasterizer::IsOneOfMyScanlines(int top) const { + ASSERT(top >= 0 && top < 2048); + return m_myscanline[top >> THREAD_HEIGHT] != 0; } bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const { + ASSERT(top >= 0 && top < 2048 && bottom >= 0 && bottom < 2048); + top = top >> THREAD_HEIGHT; bottom = (bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT; @@ -187,12 +191,12 @@ void GSRasterizer::Draw(GSRasterizerData* data) if(index != NULL) { - do {DrawSprite(vertex, index, data->solidrect); index += 2;} + do {DrawSprite(vertex, index); index += 2;} while(index < index_end); } else { - do {DrawSprite(vertex, tmp_index, data->solidrect); vertex += 2;} + do {DrawSprite(vertex, tmp_index); vertex += 2;} while(vertex < vertex_end); } @@ -407,7 +411,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index) GSVector4 tbf = y0011.xzxz(y1221).ceil(); GSVector4 tbmax = tbf.max(m_fscissor_y); GSVector4 tbmin = tbf.min(m_fscissor_y); - GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin)); + GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin)); // max(y0, t) max(y1, t) min(y1, b) min(y2, b) dv[0] = v1 - v0; dv[1] = v2 - v0; @@ -565,7 +569,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co m_edge.count += e - 
&m_edge.buff[m_edge.count]; } -void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index, bool solidrect) +void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index) { const GSVertexSW& v0 = vertex[index[0]]; const GSVertexSW& v1 = vertex[index[1]]; @@ -589,7 +593,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index, boo GSVertexSW scan = v[0]; - if(solidrect) + if(m_ds->IsSolidRect()) { if(m_threads == 1) { @@ -904,7 +908,6 @@ GSRasterizerList::GSRasterizerList() : GSJobQueue >() , m_sync_count(0) , m_syncpoint_count(0) - , m_solidrect_count(0) { } @@ -955,11 +958,6 @@ int GSRasterizerList::GetPixels(bool reset) void GSRasterizerList::Process(shared_ptr& item) { - if(item->solidrect) - { - m_solidrect_count++; - } - if(item->syncpoint) { for(size_t i = 0; i < m_workers.size(); i++) diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index eab733b4ea..71b2dd4ad1 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -39,7 +39,6 @@ public: int vertex_count; uint32* index; int index_count; - bool solidrect; bool syncpoint; uint64 frame; @@ -52,7 +51,6 @@ public: , vertex_count(0) , index(NULL) , index_count(0) - , solidrect(false) , syncpoint(false) , frame(0) { @@ -101,6 +99,7 @@ public: #endif __forceinline bool HasEdge() const {return m_de != NULL;} + __forceinline bool IsSolidRect() const {return m_dr != NULL;} }; class IRasterizer : public GSAlignedClass<32> @@ -133,7 +132,7 @@ protected: void DrawPoint(const GSVertexSW* vertex, int vertex_count, const uint32* index, int index_count); void DrawLine(const GSVertexSW* vertex, const uint32* index); void DrawTriangle(const GSVertexSW* vertex, const uint32* index); - void DrawSprite(const GSVertexSW* vertex, const uint32* index, bool solidrect); + void DrawSprite(const GSVertexSW* vertex, const uint32* index); __forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& edge, const GSVertexSW& dedge, 
const GSVertexSW& dscan, const GSVector4& p0); @@ -214,7 +213,6 @@ public: int m_sync_count; int m_syncpoint_count; - int m_solidrect_count; // IRasterizer diff --git a/plugins/GSdx/GSRendererCS.cpp b/plugins/GSdx/GSRendererCS.cpp new file mode 100644 index 0000000000..a244081feb --- /dev/null +++ b/plugins/GSdx/GSRendererCS.cpp @@ -0,0 +1,426 @@ +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSRendererCS.h" + +GSRendererCS::GSRendererCS() + : GSRenderer(new GSVertexTraceCS(this), sizeof(GSVertex)) +{ + m_nativeres = true; + + InitConvertVertex(GSRendererCS); + + memset(m_vm_valid, 0, sizeof(m_vm_valid)); +} + +GSRendererCS::~GSRendererCS() +{ +} + +bool GSRendererCS::CreateDevice(GSDevice* dev_unk) +{ + if(!__super::CreateDevice(dev_unk)) + return false; + + D3D_FEATURE_LEVEL level; + + ((GSDeviceDX*)dev_unk)->GetFeatureLevel(level); + + if(level < D3D_FEATURE_LEVEL_10_0) + return false; + + HRESULT hr; + + GSDevice11* dev = (GSDevice11*)dev_unk; + + D3D11_BUFFER_DESC bd; + D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; + D3D11_SHADER_RESOURCE_VIEW_DESC srvd; + + // video memory (4MB) + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = 4 * 1024 * 1024; + bd.StructureByteStride = 4; + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS; + bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_vm); + + if(FAILED(hr)) return false; + + memset(&uavd, 0, sizeof(uavd)); + + uavd.Format = DXGI_FORMAT_R32_TYPELESS; + uavd.Buffer.FirstElement = 0; + uavd.Buffer.NumElements = 1024 * 1024; + uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW; + uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; + + hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav); + + if(FAILED(hr)) return false; + + // vertex buffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(GSVertex) * 10000; + bd.StructureByteStride = sizeof(GSVertex); + bd.Usage = D3D11_USAGE_DYNAMIC; + bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; + bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_vb); + + if(FAILED(hr)) return false; + + // index buffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(uint32) * 10000 * 3; + bd.Usage = D3D11_USAGE_DYNAMIC; + 
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_ib); + + if(FAILED(hr)) return false; + + // one page, for copying between cpu<->gpu + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = PAGE_SIZE; + bd.Usage = D3D11_USAGE_STAGING; + bd.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb); + + if(FAILED(hr)) return false; + + return true; +} + +GSTexture* GSRendererCS::GetOutput(int i) +{ + // TODO: create a compute shader which unswizzles the frame from m_vm to the output texture + + return NULL; +} + +template +void GSRendererCS::ConvertVertex(size_t dst_index, size_t src_index) +{ + // TODO: vertex format more fitting as the input for the compute shader + + if(src_index != dst_index) + { + GSVertex v = ((GSVertex*)m_vertex.buff)[src_index]; + + ((GSVertex*)m_vertex.buff)[dst_index] = v; + } +} + +void GSRendererCS::Draw() +{ + HRESULT hr; + + GSDevice11* dev = (GSDevice11*)m_dev; + + ID3D11DeviceContext* ctx = *dev; + + D3D11_BUFFER_DESC bd; + D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; + D3D11_SHADER_RESOURCE_VIEW_DESC srvd; + D3D11_MAPPED_SUBRESOURCE map; + + CComPtr vb_srv; + CComPtr ib_srv; + + // TODO: cache these in hash_maps + + CComPtr fbr, fbc, zbr, zbc; + CComPtr fbr_srv, fbc_srv, zbr_srv, zbc_srv; + + // TODO: grow m_vb, m_ib if needed + + if(m_vertex.next > 10000) return; + if(m_index.tail > 30000) return; + + // TODO: fill/advance/discardwhenfull, as in GSDevice11::IASetVertexBuffer/IASetIndexBuffer + + hr = ctx->Map(m_vb, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around + + if(FAILED(hr)) return; + + memcpy(map.pData, m_vertex.buff, sizeof(GSVertex) * m_vertex.next); + + ctx->Unmap(m_vb, 0); + + // + + hr = ctx->Map(m_ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around + + if(FAILED(hr)) return; + + 
memcpy(map.pData, m_index.buff, sizeof(uint32) * m_index.tail); + + ctx->Unmap(m_ib, 0); + + // TODO: UpdateSubresource might be faster, based on my experience with the real vertex buffer, write-no-overwrite/discarded dynamic buffer + map is better + + // + + memset(&srvd, 0, sizeof(srvd)); + + srvd.Format = DXGI_FORMAT_UNKNOWN; + srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; + srvd.Buffer.FirstElement = 0; + srvd.Buffer.NumElements = m_vertex.next; + + hr = (*dev)->CreateShaderResourceView(m_vb, &srvd, &vb_srv); // TODO: have to create this dynamically in Draw() or pass the start/count in a const reg + + memset(&srvd, 0, sizeof(srvd)); + + srvd.Format = DXGI_FORMAT_R32_UINT; + srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; + srvd.Buffer.FirstElement = 0; + srvd.Buffer.NumElements = m_index.tail; + + hr = (*dev)->CreateShaderResourceView(m_ib, &srvd, &ib_srv); // TODO: have to create this dynamically in Draw() or pass the start/count in a const reg + + // fzb offsets + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(int) * 4096; + bd.StructureByteStride = sizeof(int); + bd.Usage = D3D11_USAGE_IMMUTABLE; + bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; + + D3D11_SUBRESOURCE_DATA data; + + memset(&data, 0, sizeof(data)); + + data.pSysMem = m_context->offset.fb->pixel.row; + + hr = (*dev)->CreateBuffer(&bd, &data, &fbr); + + data.pSysMem = m_context->offset.fb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats + + hr = (*dev)->CreateBuffer(&bd, &data, &fbc); + + data.pSysMem = m_context->offset.zb->pixel.row; + + hr = (*dev)->CreateBuffer(&bd, &data, &zbr); + + data.pSysMem = m_context->offset.zb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats + + hr = (*dev)->CreateBuffer(&bd, &data, &zbc); /* NOTE(review): hr from the CreateShaderResourceView/CreateBuffer calls in this section is assigned but never tested */ + + // TODO: D3D10_SHADER_MACRO (primclass, less frequently changing drawing attribs, etc.) 
+ + uint32 sel = 0; // TODO + + hash_map >::iterator i = m_cs.find(sel); + + CComPtr cs; + + if(i == m_cs.end()) + { + // hr = dev->CompileShader(IDR_CS_FX, "cs_main", NULL, &cs); + hr = dev->CompileShader("E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\cs.fx", "cs_main", NULL, &cs); /* FIXME: absolute path on drive E: - only resolves on a machine with this exact directory layout; the resource-based call above is the portable form */ + + if(FAILED(hr)) return; + + m_cs[sel] = cs; + } + else + { + cs = i->second; + } + + // + + dev->CSSetShaderUAV(0, m_vm_uav); + + dev->CSSetShaderSRV(0, vb_srv); + dev->CSSetShaderSRV(1, ib_srv); + dev->CSSetShaderSRV(2, fbr_srv); + dev->CSSetShaderSRV(3, fbc_srv); + dev->CSSetShaderSRV(4, zbr_srv); + dev->CSSetShaderSRV(5, zbc_srv); /* NOTE(review): no CreateShaderResourceView call for fbr_srv/fbc_srv/zbr_srv/zbc_srv is visible in this function, so slots 2-5 appear to be bound to NULL views - confirm */ + + dev->CSSetShader(cs); + + GSVector4i bbox = GSVector4i(0, 0, 640, 512); // TODO: vertex trace + + GSVector4i r = bbox.ralign(GSVector2i(16, 8)); + + bool fb = true; // TODO: frame buffer used + bool zb = true; // TODO: z-buffer used + + if(fb) Write(m_context->offset.fb, r); + if(zb) Write(m_context->offset.zb, r); + + // TODO: constant buffer (frequently changing drawing attribs) + // TODO: texture (implement texture cache) + // TODO: clut to a palette texture (should be texture1d, not simply buffer, it is random accessed) + // TODO: CSSetShaderSRV(6 7 8 ..., texture level 0 1 2 ...) or use Texture3D? + // TODO: invalidate texture cache + + /* + CComPtr q; + + D3D11_QUERY_DESC qd; + memset(&qd, 0, sizeof(qd)); + qd.Query = D3D11_QUERY_EVENT; + + hr = (*dev)->CreateQuery(&qd, &q); + + ctx->Begin(q); + */ + + printf("[%lld] dispatch %05x %d %05x %d %05x %d %dx%d | %d %d %d\n", + __rdtsc(), + m_context->FRAME.Block(), m_context->FRAME.PSM, + m_context->ZBUF.Block(), m_context->ZBUF.PSM, + PRIM->TME ? 
m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, + PRIM->PRIM, m_vertex.next, m_index.tail); + + GSVector4i rsize = r.rsize(); + + dev->Dispatch(rsize.z >> 4, rsize.w >> 3, 1); // TODO: pass upper-left corner offset (r.xy) in a const buffer + + /* + ctx->End(q); + + uint64 t0 = __rdtsc(); + + BOOL b; + + while(S_OK != ctx->GetData(q, &b, sizeof(BOOL), 0)) {} + + printf("%lld\n", __rdtsc() - t0); + */ +} + +void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) +{ + GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); + + Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated + + // TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target + // TODO: invalidate texture cache +} + +void GSRendererCS::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) +{ + GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); + + Read(o, r, false); +} + +void GSRendererCS::Write(GSOffset* o, const GSVector4i& r) +{ + GSDevice11* dev = (GSDevice11*)m_dev; + + ID3D11DeviceContext* ctx = *dev; + + D3D11_BOX box; + + memset(&box, 0, sizeof(box)); + + uint32* pages = o->GetPages(r); + + for(size_t i = 0; pages[i] != GSOffset::EOP; i++) + { + uint32 page = pages[i]; + + uint32 row = page >> 5; + uint32 col = 1 << (page & 31); + + if((m_vm_valid[row] & col) == 0) + { + m_vm_valid[row] |= col; + + box.left = page * PAGE_SIZE; + box.right = box.left + PAGE_SIZE; + + ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + box.left, 0, 0); + + printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); + } + } + + delete [] pages; +} + +void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate) +{ + GSDevice11* dev = (GSDevice11*)m_dev; + + ID3D11DeviceContext* ctx = *dev; + + D3D11_BOX box; + + memset(&box, 0, sizeof(box)); + + uint32* pages 
= o->GetPages(r); + + for(size_t i = 0; pages[i] != GSOffset::EOP; i++) + { + uint32 page = pages[i]; + + uint32 row = page >> 5; + uint32 col = 1 << (page & 31); + + if(m_vm_valid[row] & col) + { + if(invalidate) m_vm_valid[row] ^= col; + + box.left = page * PAGE_SIZE; + box.right = box.left + PAGE_SIZE; + + ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box); + + D3D11_MAPPED_SUBRESOURCE map; + + if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ_WRITE, 0, &map))) + { + memcpy(m_mem.m_vm8 + box.left, map.pData, PAGE_SIZE); + + ctx->Unmap(m_pb, 0); + + printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); + } + } + } + + delete [] pages; +} diff --git a/plugins/GSdx/GSRendererCS.h b/plugins/GSdx/GSRendererCS.h new file mode 100644 index 0000000000..42f45d58af --- /dev/null +++ b/plugins/GSdx/GSRendererCS.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +#include "GSRenderer.h" +#include "GSDevice11.h" + +class GSRendererCS : public GSRenderer +{ + class GSVertexTraceCS : public GSVertexTrace + { + public: + GSVertexTraceCS(const GSState* state) : GSVertexTrace(state) {} + }; + + CComPtr m_vm; + CComPtr m_vm_uav; + CComPtr m_vb; + CComPtr m_ib; + CComPtr m_pb; + hash_map > m_cs; + uint32 m_vm_valid[16]; + + void Write(GSOffset* o, const GSVector4i& r); + void Read(GSOffset* o, const GSVector4i& r, bool invalidate); + +protected: + template + void ConvertVertex(size_t dst_index, size_t src_index); + + bool CreateDevice(GSDevice* dev); + GSTexture* GetOutput(int i); + void Draw(); + void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); + void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut); + +public: + GSRendererCS(); + virtual ~GSRendererCS(); +}; diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index ca9e73cbfe..d6494f58c3 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -89,8 +89,7 @@ void GSRendererSW::VSync(int field) // printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0; printf("m_syncpoint_count = %d\n", ((GSRasterizerList*)m_rl)->m_syncpoint_count); ((GSRasterizerList*)m_rl)->m_syncpoint_count = 0; - printf("m_solidrect_count = %d\n", ((GSRasterizerList*)m_rl)->m_solidrect_count); ((GSRasterizerList*)m_rl)->m_solidrect_count = 0; -*/ + */ GSRenderer::VSync(field); m_tc->IncAge(); @@ -198,29 +197,38 @@ void GSRendererSW::ConvertVertex(size_t dst_index, size_t src_index) } } +#define LOG 0 + +FILE* s_fp = LOG ? 
fopen("c:\\temp1\\_.txt", "w") : NULL; + void GSRendererSW::Draw() { SharedData* sd = new SharedData(this); shared_ptr data(sd); - if(!GetScanlineGlobalData(sd)) return; + sd->primclass = m_vt->m_primclass; + sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_vertex.next + sizeof(uint32) * m_index.tail, 32); + sd->vertex = (GSVertexSW*)sd->buff; + sd->vertex_count = m_vertex.next; + sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * m_vertex.next); + sd->index_count = m_index.tail; - data->primclass = m_vt->m_primclass; - data->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_vertex.next + sizeof(uint32) * m_index.tail, 32); - data->vertex = (GSVertexSW*)data->buff; - data->vertex_count = m_vertex.next; - data->index = (uint32*)(data->buff + sizeof(GSVertexSW) * m_vertex.next); - data->index_count = m_index.tail; - - memcpy(data->vertex, m_vertex.buff, sizeof(GSVertexSW) * m_vertex.next); - memcpy(data->index, m_index.buff, sizeof(uint32) * m_index.tail); + memcpy(sd->vertex, m_vertex.buff, sizeof(GSVertexSW) * m_vertex.next); + memcpy(sd->index, m_index.buff, sizeof(uint32) * m_index.tail); for(size_t i = 0; i < m_index.tail; i++) { ASSERT(((GSVertexSW*)m_vertex.buff + m_index.buff[i])->_pad.u32[0] == 0x12345678); } + // TODO: delay texture update, do it later along with the syncing on the dispatcher thread, then this thread does not have to wait and can continue assembling more jobs + // TODO: if(any texture page is used as a target) GSRasterizerData::syncpoint = true; + // TODO: virtual void GSRasterizerData::Update() {texture[all levels]->Update();}, call it from the dispatcher thread before sending to workers + // TODO: m_tc->InvalidatePages must be called after texture->Update, move that inside GSRasterizerData::Update too + + if(!GetScanlineGlobalData(sd)) return; + // const GSDrawingContext* context = m_context; @@ -232,10 +240,9 @@ void GSRendererSW::Draw() scissor.z = std::min(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game 
that overflows and check which one is the right behaviour - data->scissor = scissor; - data->bbox = bbox; - data->solidrect = gd.sel.IsSolidRect(); - data->frame = m_perfmon.GetFrame(); + sd->scissor = scissor; + sd->bbox = bbox; + sd->frame = m_perfmon.GetFrame(); // @@ -262,41 +269,75 @@ void GSRendererSW::Draw() if(m_fzb != context->offset.fzb) { - m_fzb = context->offset.fzb; + // hmm, what if "r" gets bigger next time and slips through unchecked, need to trace that too - data->syncpoint = true; - } + sd->syncpoint = true; // TODO - // - chross-check frame and z-buffer pages, they cannot overlap with eachother and with previous batches in queue - // - m_fzb filters out most of these cases, only have to be careful when the addresses stay the same and the output is mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300) - - if(!data->syncpoint) - { - if(gd.sel.fwrite) + if(!sd->syncpoint) { - for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++) + if(fb_pages == NULL) { - if(m_fzb_pages[*p] & 0xffff0000) // already used as a z-buffer + fb_pages = context->offset.fb->GetPages(r); + } + + if(CheckTargetPages<0xffffffff>(fb_pages)) + { + sd->syncpoint = true; + + if(LOG) fprintf(s_fp, "syncpoint 0\n"); + } + } + + if(!sd->syncpoint) + { + if(zb_pages == NULL) + { + zb_pages = context->offset.zb->GetPages(r); + } + + if(CheckTargetPages<0xffffffff>(zb_pages)) + { + sd->syncpoint = true; + + if(LOG) fprintf(s_fp, "syncpoint 1\n"); + } + } + + if(!sd->syncpoint) + { + if(LOG) fprintf(s_fp, "no syncpoint *\n"); + } + + m_fzb = context->offset.fzb; + } + else + { + // chross-check frame and z-buffer pages, they cannot overlap with eachother and with previous batches in queue, + // m_fzb filters out most of these cases, only have to be careful when the addresses stay the same and the output + // is mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300) + + if(!sd->syncpoint) + { + if(gd.sel.fwrite) + { + 
if(CheckTargetPages<0xffff0000>(fb_pages)) // already used as a z-buffer { - data->syncpoint = true; - - break; + sd->syncpoint = true; + + if(LOG) fprintf(s_fp, "syncpoint 2\n"); } } } - } - if(!data->syncpoint) - { - if(gd.sel.zwrite) + if(!sd->syncpoint) { - for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++) + if(gd.sel.zwrite) { - if(m_fzb_pages[*p] & 0x0000ffff) // already used as a frame buffer + if(CheckTargetPages<0x0000ffff>(zb_pages)) // already used as a frame buffer { - data->syncpoint = true; + sd->syncpoint = true; - break; + if(LOG) fprintf(s_fp, "syncpoint 3\n"); } } } @@ -363,6 +404,12 @@ void GSRendererSW::Draw() } else { + if(LOG) fprintf(s_fp, "queue %05x %d %05x %d %05x %d %dx%d | %d %d %d\n", + m_context->FRAME.Block(), m_context->FRAME.PSM, + m_context->ZBUF.Block(), m_context->ZBUF.PSM, + PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, + PRIM->PRIM, sd->vertex_count, sd->index_count); + m_rl->Queue(data); } @@ -384,8 +431,16 @@ void GSRendererSW::Sync(int reason) GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync); + uint64 t = __rdtsc(); + m_rl->Sync(); + s_n++; + + t = __rdtsc() - t; + + if(LOG) fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, m_rl->GetPixels(), t > 10000000 ? 
'*' : ' '); + m_perfmon.Put(GSPerfMon::Fillrate, m_rl->GetPixels()); } @@ -397,8 +452,6 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS o->GetPages(r, p); - m_tc->InvalidatePages(p, o->psm); - // check if the changing pages either used as a texture or a target for(; *p != GSOffset::EOP; p++) @@ -414,6 +467,8 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS break; } } + + m_tc->InvalidatePages(m_tmp_pages, o->psm); // if texture update runs on a thread and Sync(5) happens then this must come later } void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) @@ -493,6 +548,19 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type) } } +template bool GSRendererSW::CheckTargetPages(const uint32* pages) +{ + for(const uint32* p = pages; *p != GSOffset::EOP; p++) + { + if(mask != 0xffffffff ? (m_fzb_pages[*p] & mask) : m_fzb_pages[*p]) + { + return true; + } + } + + return false; +} + #include "GSTextureSW.h" bool GSRendererSW::GetScanlineGlobalData(SharedData* data) @@ -811,19 +879,19 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) { // skip per pixel division if q is constant - GSVertexSW* RESTRICT v = (GSVertexSW*)m_vertex.buff; + GSVertexSW* RESTRICT v = data->vertex; if(m_vt->m_eq.q) { gd.sel.fst = 1; - const GSVector4& t = v[m_index.buff[0]].t; + const GSVector4& t = v[data->index[0]].t; if(t.z != 1.0f) { GSVector4 w = t.zzzz().rcpnr(); - for(int i = 0, j = m_vertex.next; i < j; i++) + for(int i = 0, j = data->vertex_count; i < j; i++) { GSVector4 t = v[i].t; @@ -835,7 +903,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) { gd.sel.fst = 1; - for(int i = 0, j = m_vertex.next; i < j; i += 2) + for(int i = 0, j = data->vertex_count; i < j; i += 2) { GSVector4 t0 = v[i + 0].t; GSVector4 t1 = v[i + 1].t; @@ -856,9 +924,9 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) GSVector4 half(0x8000, 0x8000); - 
GSVertexSW* RESTRICT v = (GSVertexSW*)m_vertex.buff; + GSVertexSW* RESTRICT v = data->vertex; - for(int i = 0, j = m_vertex.next; i < j; i++) + for(int i = 0, j = data->vertex_count; i < j; i++) { GSVector4 t = v[i].t; @@ -1051,12 +1119,12 @@ GSRendererSW::SharedData::~SharedData() delete m_fb_pages; delete m_zb_pages; - + for(size_t i = 0; i < countof(m_tex_pages) && m_tex_pages[i] != NULL; i++) { m_parent->ReleasePages(m_tex_pages[i], 2); } - + if(global.clut) _aligned_free(global.clut); if(global.dimx) _aligned_free(global.dimx); } diff --git a/plugins/GSdx/GSRendererSW.h b/plugins/GSdx/GSRendererSW.h index 45af7a9440..ee68de3611 100644 --- a/plugins/GSdx/GSRendererSW.h +++ b/plugins/GSdx/GSRendererSW.h @@ -66,6 +66,7 @@ protected: void UsePages(const uint32* pages, int type); void ReleasePages(const uint32* pages, int type); + template bool CheckTargetPages(const uint32* pages); bool GetScanlineGlobalData(SharedData* data); diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index 6f3f0e1bb2..f1259f76ca 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -671,11 +671,6 @@ template void GSState::ApplyTEX0(GIFRegTEX0& TEX0) TEX0.CPSM &= 0xa; // 1010b - if((TEX0.TBW & 1) && (TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT4)) - { - TEX0.TBW &= ~1; // GS User 2.6 - } - if((TEX0.u32[0] ^ m_env.CTXT[i].TEX0.u32[0]) & 0x3ffffff) // TBP0 TBW PSM { m_env.CTXT[i].offset.tex = m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM); @@ -709,6 +704,13 @@ template void GSState::GIFRegHandlerTEX0(const GIFReg* RESTRICT r) if(TEX0.TW > 10) TEX0.TW = 10; if(TEX0.TH > 10) TEX0.TH = 10; + if((TEX0.TBW & 1) && (TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT4)) + { + ASSERT(TEX0.TBW == 1); // TODO + + TEX0.TBW &= ~1; // GS User 2.6 + } + ApplyTEX0(TEX0); if(m_env.CTXT[i].TEX1.MTBA) @@ -1265,32 +1267,34 @@ void GSState::FlushPrim() size_t head = m_vertex.head; size_t tail = m_vertex.tail; size_t next = m_vertex.next; + size_t unused = 0; if(tail > head) { 
switch(PRIM->PRIM) { case GS_POINTLIST: + ASSERT(0); break; case GS_LINELIST: case GS_LINESTRIP: case GS_SPRITE: - if(tail > head + 0) memcpy(&buff[stride * 0], &m_vertex.buff[stride * (head + 0)], stride); - break; case GS_TRIANGLELIST: case GS_TRIANGLESTRIP: - if(tail > head + 0) memcpy(&buff[stride * 0], &m_vertex.buff[stride * (head + 0)], stride); - if(tail > head + 1) memcpy(&buff[stride * 1], &m_vertex.buff[stride * (head + 1)], stride); + unused = tail - head; + memcpy(buff, &m_vertex.buff[stride * head], stride * unused); break; case GS_TRIANGLEFAN: - if(tail > head + 0) memcpy(&buff[stride * 0], &m_vertex.buff[stride * (head + 0)], stride); - if(tail > head + 1) memcpy(&buff[stride * 1], &m_vertex.buff[stride * (tail - 1)], stride); + memcpy(buff, &m_vertex.buff[stride * head], stride); unused = 1; + if(tail - 1 > head) {memcpy(&buff[stride], &m_vertex.buff[stride * (tail - 1)], stride); unused = 2;} break; case GS_INVALID: break; default: __assume(0); } + + ASSERT(unused < GSUtil::GetVertexCount(PRIM->PRIM)); } if(GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt < 3 && GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt < 3) @@ -1308,34 +1312,19 @@ void GSState::FlushPrim() m_index.tail = 0; m_vertex.head = 0; - m_vertex.tail = 0; - m_vertex.next = 0; - if(tail > head) + if(unused > 0) { - switch(PRIM->PRIM) - { - case GS_POINTLIST: - break; - case GS_LINELIST: - case GS_LINESTRIP: - case GS_SPRITE: - if(tail > head + 0) {memcpy(&m_vertex.buff[stride * 0], &buff[stride * 0], stride); m_vertex.tail++;} - break; - case GS_TRIANGLELIST: - case GS_TRIANGLESTRIP: - case GS_TRIANGLEFAN: - if(tail > head + 0) {memcpy(&m_vertex.buff[stride * 0], &buff[stride * 0], stride); m_vertex.tail++;} - if(tail > head + 1) {memcpy(&m_vertex.buff[stride * 1], &buff[stride * 1], stride); m_vertex.tail++;} - break; - case GS_INVALID: - break; - default: - __assume(0); - } + memcpy(m_vertex.buff, buff, stride * unused); + m_vertex.tail = unused; m_vertex.next = next > head ? 
next - head : 0; } + else + { + m_vertex.tail = 0; + m_vertex.next = 0; + } } } @@ -1380,6 +1369,15 @@ void GSState::Write(const uint8* mem, int len) m_tr.start = m_tr.end = m_tr.total; m_perfmon.Put(GSPerfMon::Swizzle, len); + + /* + static int n = 0; + string s; + s = format("c:\\temp1\\[%04d]_%05x_%d_%d_%d_%d_%d_%d.bmp", + n++, (int)m_env.BITBLTBUF.DBP, (int)m_env.BITBLTBUF.DBW, (int)m_env.BITBLTBUF.DPSM, + r.left, r.top, r.right, r.bottom); + m_mem.SaveBMP(s, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM, r.right, r.bottom); + */ } else { diff --git a/plugins/GSdx/GSTextureCacheSW.cpp b/plugins/GSdx/GSTextureCacheSW.cpp index 4e24b14aa0..b0d6d83fbf 100644 --- a/plugins/GSdx/GSTextureCacheSW.cpp +++ b/plugins/GSdx/GSTextureCacheSW.cpp @@ -95,7 +95,7 @@ void GSTextureCacheSW::InvalidatePages(const uint32* pages, uint32 psm) { Texture* t = *i; - if(GSUtil::HasSharedBits(psm, t->m_TEX0.PSM)) + if(GSUtil::HasSharedBits(psm, t->m_sharedbits)) { uint32* RESTRICT valid = t->m_valid; @@ -181,6 +181,8 @@ GSTextureCacheSW::Texture::Texture(GSState* state, uint32 tw0, const GIFRegTEX0& memset(m_valid, 0, sizeof(m_valid)); memset(m_pages.bm, 0, sizeof(m_pages.bm)); + m_sharedbits = GSUtil::HasSharedBitsPtr(m_TEX0.PSM); + m_offset = m_state->m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM); m_pages.n = m_offset->GetPages(GSVector4i(0, 0, 1 << TEX0.TW, 1 << TEX0.TH)); diff --git a/plugins/GSdx/GSTextureCacheSW.h b/plugins/GSdx/GSTextureCacheSW.h index fc642afc11..8c80456c16 100644 --- a/plugins/GSdx/GSTextureCacheSW.h +++ b/plugins/GSdx/GSTextureCacheSW.h @@ -41,6 +41,7 @@ public: vector* m_p2t; uint32 m_valid[MAX_PAGES]; struct {uint32 bm[16]; const uint32* n;} m_pages; + const uint32* RESTRICT m_sharedbits; // m_valid // fast mode: each uint32 bits map to the 32 blocks of that page diff --git a/plugins/GSdx/GSThread.h b/plugins/GSdx/GSThread.h index d19d758c31..99247d8431 100644 --- a/plugins/GSdx/GSThread.h +++ b/plugins/GSdx/GSThread.h @@ -275,19 +275,15 @@ 
protected: if(m_exit) {m_cv.lock.Unlock(); return;} } - { - // NOTE: this is scoped because we must make sure the last item is no longer around when Wait detects an empty queue + T& item = m_queue.front(); - T item = m_queue.front(); + m_cv.lock.Unlock(); - m_cv.lock.Unlock(); + Process(item); - Process(item); + m_cv.lock.Lock(); - m_cv.lock.Lock(); - - m_queue.pop(); - } + m_queue.pop(); if(m_queue.empty()) { @@ -312,23 +308,18 @@ protected: m_ev.lock.Lock(); } - { - // NOTE: this is scoped because we must make sure the last item is no longer around when Wait detects an empty queue + T& item = m_queue.front(); - T item = m_queue.front(); + m_ev.lock.Unlock(); - m_ev.lock.Unlock(); + Process(item); - Process(item); + m_ev.lock.Lock(); - m_ev.lock.Lock(); - - m_queue.pop(); - } + m_queue.pop(); _InterlockedDecrement(&m_ev.count); } - } } diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index d9a2998d38..3600d7e335 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -161,6 +161,16 @@ int GSUtil::GetVertexCount(uint32 prim) return s_maps.VertexCountField[prim]; } +const uint32* GSUtil::HasSharedBitsPtr(uint32 dpsm) +{ + return s_maps.SharedBitsField[dpsm]; +} + +bool GSUtil::HasSharedBits(uint32 spsm, const uint32* RESTRICT ptr) +{ + return (ptr[spsm >> 5] & (1 << (spsm & 0x1f))) == 0; +} + bool GSUtil::HasSharedBits(uint32 spsm, uint32 dpsm) { return (s_maps.SharedBitsField[dpsm][spsm >> 5] & (1 << (spsm & 0x1f))) == 0; diff --git a/plugins/GSdx/GSUtil.h b/plugins/GSdx/GSUtil.h index f834255d2b..7d863e0b18 100644 --- a/plugins/GSdx/GSUtil.h +++ b/plugins/GSdx/GSUtil.h @@ -31,6 +31,8 @@ public: static GS_PRIM_CLASS GetPrimClass(uint32 prim); static int GetVertexCount(uint32 prim); + static const uint32* HasSharedBitsPtr(uint32 dpsm); + static bool HasSharedBits(uint32 spsm, const uint32* ptr); static bool HasSharedBits(uint32 spsm, uint32 dpsm); static bool HasSharedBits(uint32 sbp, uint32 spsm, uint32 dbp, uint32 dpsm); static bool 
HasCompatibleBits(uint32 spsm, uint32 dpsm); diff --git a/plugins/GSdx/GSdx.rc b/plugins/GSdx/GSdx.rc index 9cc9fc1ec0..5c7c472309 100644 --- a/plugins/GSdx/GSdx.rc +++ b/plugins/GSdx/GSdx.rc @@ -57,6 +57,7 @@ IDR_TFX_FX RCDATA "res\\tfx.fx" IDR_MERGE_FX RCDATA "res\\merge.fx" IDR_INTERLACE_FX RCDATA "res\\interlace.fx" IDR_FXAA_FX RCDATA "res\\fxaa.fx" +IDR_CS_FX RCDATA "res\\cs.fx" ///////////////////////////////////////////////////////////////////////////// // diff --git a/plugins/GSdx/GSdx.vcxproj b/plugins/GSdx/GSdx.vcxproj index 7745046397..b33203905f 100644 --- a/plugins/GSdx/GSdx.vcxproj +++ b/plugins/GSdx/GSdx.vcxproj @@ -531,6 +531,7 @@ AssemblyAndSourceCode + @@ -1658,6 +1659,7 @@ + @@ -1727,6 +1729,7 @@ + diff --git a/plugins/GSdx/GSdx.vcxproj.filters b/plugins/GSdx/GSdx.vcxproj.filters index 59464dc543..53417912be 100644 --- a/plugins/GSdx/GSdx.vcxproj.filters +++ b/plugins/GSdx/GSdx.vcxproj.filters @@ -324,6 +324,9 @@ Source Files + + Source Files + @@ -647,6 +650,9 @@ Header Files + + Header Files + @@ -677,6 +683,9 @@ Shaders + + Shaders + diff --git a/plugins/GSdx/res/cs.fx b/plugins/GSdx/res/cs.fx new file mode 100644 index 0000000000..7579753e77 --- /dev/null +++ b/plugins/GSdx/res/cs.fx @@ -0,0 +1,73 @@ +struct Vertex +{ + float2 st; + uint c; + float q; + uint xy, z; + uint uv, f; +}; + +RWByteAddressBuffer VideoMemory : register(u0); + +StructuredBuffer VertexBuffer : register(t0); +Buffer IndexBuffer : register(t1); + +Buffer FrameRowOffset : register(t2); +Buffer FrameColOffset : register(t3); +Buffer ZBufRowOffset : register(t4); +Buffer ZBufColOffset : register(t5); + +cbuffer DrawingEnvironment : register(c0) +{ + // TODO +}; + +// one group is 16x8 pixels and one thread does 2 pixels, otherwise could not read-merge-write 16-bit targets safely +// neighburing pixels are next to eachother in memory, at least we don't have to calculate the address twice + +// TODO: they say groupshared memory is faster, try unswizzling the corresponding 
chunk of memory initially (how to do that once by only one thread?) then write-back when finished, unless it was untouched + +[numthreads(8, 8, 1)] +void cs_main(uint3 gid : SV_GroupID, uint3 tid : SV_GroupThreadID) +{ + uint count; + + IndexBuffer.GetDimensions(count); + + // #if GS_PRIM == 2 (triangle) + + for(uint i = 0; i < count; i += 3) + { + Vertex v0 = VertexBuffer[IndexBuffer[i + 0]]; + Vertex v1 = VertexBuffer[IndexBuffer[i + 1]]; + Vertex v2 = VertexBuffer[IndexBuffer[i + 2]]; + + uint x = gid.x + tid.x * 2; + uint y = gid.y + tid.y; + + uint fa = FrameRowOffset[y] + FrameColOffset[x]; + uint za = ZBufRowOffset[y] + ZBufColOffset[x]; + + // TODO: quickly reject if x, y is outside the triangle + // TODO: calculate interpolated values at x, y + // TODO: run the GS pipeline + // TODO: repeat for x+1, y + // TODO: output two pixels (might be better to process a single pixel, more threads, if there is no 16-bit target involved) + + // testing... + + uint4 c = VideoMemory.Load4(fa); // does this load 4*4 bytes? or 4 bytes each expanded uint? + + c = (v0.c >> uint4(0, 8, 16, 24)) & 0xff; // => ushr r1.yzw, r1.xxxx, l(0, 8, 16, 24), v0.c auto-converted to uint4 and per-component shift in one instruction, SSE is embarrassed + + VideoMemory.Store4(fa, c); // same question, 4*4 bytes or compressed to uint + } + + // #endif +} + +// TODO: DrawPoint (this is going to be a waste of resources) +// TODO: DrawLine (line hit-test, will it work?) 
+// TODO: DrawSprite (similar to DrawTriangle) +// TODO: if read-backs are too slow, implement GSState::Write/FlushWrite/Read/clut.Write in a compute shader +// TODO: unswizzle pages from VideoMemory to the texture cache (if they are marked as valid, otherwise upload from GSLocalMemory::m_vm8) diff --git a/plugins/GSdx/resource.h b/plugins/GSdx/resource.h index 43d9df57fe..b3e8f02077 100644 --- a/plugins/GSdx/resource.h +++ b/plugins/GSdx/resource.h @@ -81,12 +81,13 @@ #define IDR_INTERLACE_FX 10003 #define IDD_CONFIG2 10004 #define IDR_FXAA_FX 10005 +#define IDR_CS_FX 10006 // Next default values for new objects // #ifdef APSTUDIO_INVOKED #ifndef APSTUDIO_READONLY_SYMBOLS -#define _APS_NEXT_RESOURCE_VALUE 10006 +#define _APS_NEXT_RESOURCE_VALUE 10007 #define _APS_NEXT_COMMAND_VALUE 32771 #define _APS_NEXT_CONTROL_VALUE 2050 #define _APS_NEXT_SYMED_VALUE 5000