GSdx: nothing really new, just testing the compute shader, if you are an expert take a look and tell me your opinion :P

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5068 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2012-01-13 18:10:05 +00:00
parent 481f1fdda2
commit da4ea83134
25 changed files with 949 additions and 160 deletions

View File

@ -33,6 +33,7 @@
#include "GSRendererDX11.h"
#include "GSDevice9.h"
#include "GSDevice11.h"
#include "GSRendererCS.h"
#include "GSSettingsDlg.h"
static HRESULT s_hr = E_FAIL;
@ -206,41 +207,64 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1)
s_gs = NULL;
}
switch(renderer / 3)
if(renderer == 12)
{
default:
#ifdef _WINDOWS
case 0: dev = new GSDevice9(); break;
case 1: dev = new GSDevice11(); break;
#endif
case 2: dev = new GSDeviceSDL(); break;
case 3: dev = new GSDeviceNull(); break;
}
#ifdef _WINDOWS
dev = new GSDevice11();
if(dev == NULL)
{
return -1;
}
if(dev == NULL)
{
return -1;
}
if(s_gs == NULL)
if(s_gs == NULL)
{
s_gs = new GSRendererCS();
s_renderer = renderer;
}
#endif
}
else
{
switch(renderer % 3)
switch(renderer / 3)
{
default:
#ifdef _WINDOWS
case 0:
s_gs = (renderer / 3) == 0 ? (GSRenderer*)new GSRendererDX9() : (GSRenderer*)new GSRendererDX11();
break;
case 0: dev = new GSDevice9(); break;
case 1: dev = new GSDevice11(); break;
#endif
case 1:
s_gs = new GSRendererSW(threads);
break;
case 2:
s_gs = new GSRendererNull();
break;
case 2: dev = new GSDeviceSDL(); break;
case 3: dev = new GSDeviceNull(); break;
}
s_renderer = renderer;
if(dev == NULL)
{
return -1;
}
if(s_gs == NULL)
{
switch(renderer % 3)
{
default:
#ifdef _WINDOWS
case 0:
s_gs = (renderer / 3) == 0 ? (GSRenderer*)new GSRendererDX9() : (GSRenderer*)new GSRendererDX11();
break;
#endif
case 1:
s_gs = new GSRendererSW(threads);
break;
case 2:
s_gs = new GSRendererNull();
break;
}
s_renderer = renderer;
}
}
}
catch(std::exception& ex)

View File

@ -28,8 +28,14 @@
#define PLUGIN_VERSION 16
#define MAX_PAGES 512
#define MAX_BLOCKS 16384
#define VM_SIZE 4194304
#define PAGE_SIZE 8192
#define BLOCK_SIZE 256
#define COLUMN_SIZE 64
#define MAX_PAGES (VM_SIZE / PAGE_SIZE)
#define MAX_BLOCKS (VM_SIZE / BLOCK_SIZE)
#define MAX_COLUMNS (VM_SIZE / COLUMN_SIZE)
// If defined, sends much more information in reply to the API title info query from PCSX2.
// Should be left undefined by default.

View File

@ -144,7 +144,7 @@ bool GSDevice11::Create(GSWnd* wnd)
for(int i = 0; i < countof(m_convert.ps); i++)
{
hr = CompileShader(IDR_CONVERT_FX, format("ps_main%d", i), NULL, &m_convert.ps[i]);
hr = CompileShader(IDR_CONVERT_FX, format("ps_main%d", i).c_str(), NULL, &m_convert.ps[i]);
}
memset(&dsd, 0, sizeof(dsd));
@ -172,7 +172,7 @@ bool GSDevice11::Create(GSWnd* wnd)
for(int i = 0; i < countof(m_merge.ps); i++)
{
hr = CompileShader(IDR_MERGE_FX, format("ps_main%d", i), NULL, &m_merge.ps[i]);
hr = CompileShader(IDR_MERGE_FX, format("ps_main%d", i).c_str(), NULL, &m_merge.ps[i]);
}
memset(&bsd, 0, sizeof(bsd));
@ -200,7 +200,7 @@ bool GSDevice11::Create(GSWnd* wnd)
for(int i = 0; i < countof(m_interlace.ps); i++)
{
hr = CompileShader(IDR_INTERLACE_FX, format("ps_main%d", i), NULL, &m_interlace.ps[i]);
hr = CompileShader(IDR_INTERLACE_FX, format("ps_main%d", i).c_str(), NULL, &m_interlace.ps[i]);
}
// fxaa
@ -360,6 +360,11 @@ void GSDevice11::DrawIndexedPrimitive()
m_ctx->DrawIndexed(m_index.count, m_index.start, m_vertex.start);
}
// Forwards a compute dispatch to the device context.
// x, y, z are passed unchanged to ID3D11DeviceContext::Dispatch as the thread group counts.
void GSDevice11::Dispatch(uint32 x, uint32 y, uint32 z)
{
m_ctx->Dispatch(x, y, z);
}
void GSDevice11::ClearRenderTarget(GSTexture* t, const GSVector4& c)
{
m_ctx->ClearRenderTargetView(*(GSTexture11*)t, c.v);
@ -937,7 +942,7 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
m_ctx->PSSetShader(ps, NULL, 0);
}
if (m_srv_changed)
if(m_srv_changed)
{
m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv);
@ -959,6 +964,38 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
}
}
void GSDevice11::CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv)
{
// TODO: if(m_state.cs_srv[i] != srv)
{
// TODO: m_state.cs_srv[i] = srv;
m_ctx->CSSetShaderResources(i, 1, &srv);
}
}
void GSDevice11::CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav)
{
// TODO: if(m_state.cs_uav[i] != uav)
{
// TODO: m_state.cs_uav[i] = uav;
// uint32 count[] = {-1};
m_ctx->CSSetUnorderedAccessViews(i, 1, &uav, NULL);
}
}
// Makes cs the active compute shader, skipping the API call when the cached
// state (m_state.cs) already holds the same shader.
void GSDevice11::CSSetShader(ID3D11ComputeShader* cs)
{
	if(m_state.cs == cs)
	{
		return; // already bound
	}

	m_state.cs = cs;

	m_ctx->CSSetShader(cs, NULL, 0);
}
void GSDevice11::OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref)
{
if(m_state.dss != dss || m_state.sref != sref)
@ -1027,7 +1064,7 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector
}
}
HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il)
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il)
{
HRESULT hr;
@ -1037,7 +1074,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M
CComPtr<ID3D11Blob> shader, error;
hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry.c_str(), m_shader.vs.c_str(), 0, 0, NULL, &shader, &error, NULL);
hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.vs.c_str(), 0, 0, NULL, &shader, &error, NULL);
if(error)
{
@ -1066,7 +1103,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M
return hr;
}
HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs)
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs)
{
HRESULT hr;
@ -1076,7 +1113,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M
CComPtr<ID3D11Blob> shader, error;
hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry.c_str(), m_shader.gs.c_str(), 0, 0, NULL, &shader, &error, NULL);
hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.gs.c_str(), 0, 0, NULL, &shader, &error, NULL);
if(error)
{
@ -1098,7 +1135,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M
return hr;
}
HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps)
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps)
{
HRESULT hr;
@ -1108,7 +1145,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M
CComPtr<ID3D11Blob> shader, error;
hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry.c_str(), m_shader.ps.c_str(), 0, 0, NULL, &shader, &error, NULL);
hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.ps.c_str(), 0, 0, NULL, &shader, &error, NULL);
if(error)
{
@ -1120,7 +1157,71 @@ HRESULT GSDevice11::CompileShader(uint32 id, const string& entry, D3D11_SHADER_M
return hr;
}
hr = m_dev->CreatePixelShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(),NULL, ps);
hr = m_dev->CreatePixelShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(),NULL, ps);
if(FAILED(hr))
{
return hr;
}
return hr;
}
// Compiles a compute shader from an embedded resource and creates the shader object.
//
// id     - resource identifier of the effect source
// entry  - name of the entry point function
// macro  - optional extra macro definitions, merged by PrepareShaderMacro
// cs     - receives the created compute shader
//
// Returns the HRESULT of the failing step, or of CreateComputeShader on success.
// Compiler diagnostics, if any, are printed to stdout.
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs)
{
	HRESULT hr;

	vector<D3D11_SHADER_MACRO> m;

	PrepareShaderMacro(m, macro);

	CComPtr<ID3D11Blob> shader, error;

	// A compute shader has to be compiled against the compute profile (cs_x_y),
	// not the pixel shader profile; this matches the file based overload.

	hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.cs.c_str(), 0, 0, NULL, &shader, &error, NULL);

	if(error)
	{
		printf("%s\n", (const char*)error->GetBufferPointer());
	}

	if(FAILED(hr))
	{
		return hr;
	}

	return m_dev->CreateComputeShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(), NULL, cs);
}
HRESULT GSDevice11::CompileShader(const char* fn, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs)
{
HRESULT hr;
vector<D3D11_SHADER_MACRO> m;
PrepareShaderMacro(m, macro);
CComPtr<ID3D11Blob> shader, error;
hr = D3DX11CompileFromFile(fn, &m[0], NULL, entry, m_shader.cs.c_str(), 0, 0, NULL, &shader, &error, NULL);
if(error)
{
printf("%s\n", (const char*)error->GetBufferPointer());
}
if(FAILED(hr))
{
return hr;
}
hr = m_dev->CreateComputeShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(),NULL, cs);
if(FAILED(hr))
{

View File

@ -64,6 +64,7 @@ class GSDevice11 : public GSDeviceDX
ID3D11PixelShader* ps;
ID3D11Buffer* ps_cb;
ID3D11SamplerState* ps_ss[3];
ID3D11ComputeShader* cs;
GSVector2i viewport;
GSVector4i scissor;
ID3D11DepthStencilState* dss;
@ -145,6 +146,7 @@ public:
void DrawPrimitive();
void DrawIndexedPrimitive();
void Dispatch(uint32 x, uint32 y, uint32 z);
void ClearRenderTarget(GSTexture* t, const GSVector4& c);
void ClearRenderTarget(GSTexture* t, uint32 c);
@ -178,6 +180,9 @@ public:
void PSSetShaderResource(int i, GSTexture* sr);
void PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb);
void PSSetSamplerState(ID3D11SamplerState* ss0, ID3D11SamplerState* ss1, ID3D11SamplerState* ss2 = NULL);
void CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv);
void CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav);
void CSSetShader(ID3D11ComputeShader* cs);
void OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref);
void OMSetBlendState(ID3D11BlendState* bs, float bf);
void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL);
@ -195,8 +200,10 @@ public:
operator ID3D11Device*() {return m_dev;}
operator ID3D11DeviceContext*() {return m_ctx;}
HRESULT CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il);
HRESULT CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs);
HRESULT CompileShader(uint32 id, const string& entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs);
HRESULT CompileShader(const char* fn, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs);
};

View File

@ -67,18 +67,21 @@ bool GSDeviceDX::SetFeatureLevel(D3D_FEATURE_LEVEL level, bool compat_mode)
m_shader.vs = "vs_4_0";
m_shader.gs = "gs_4_0";
m_shader.ps = "ps_4_0";
m_shader.cs = "cs_4_0";
break;
case D3D_FEATURE_LEVEL_10_1:
m_shader.model = "0x401";
m_shader.vs = "vs_4_1";
m_shader.gs = "gs_4_1";
m_shader.ps = "ps_4_1";
m_shader.cs = "cs_4_1";
break;
case D3D_FEATURE_LEVEL_11_0:
m_shader.model = "0x500";
m_shader.vs = "vs_5_0";
m_shader.gs = "gs_5_0";
m_shader.ps = "ps_5_0";
m_shader.cs = "cs_5_0";
break;
default:
ASSERT(0);

View File

@ -266,7 +266,7 @@ public:
#pragma pack(pop)
protected:
struct {D3D_FEATURE_LEVEL level; string model, vs, gs, ps;} m_shader;
struct {D3D_FEATURE_LEVEL level; string model, vs, gs, ps, cs;} m_shader;
uint32 m_msaa;
DXGI_SAMPLE_DESC m_msaa_desc;
@ -277,6 +277,7 @@ public:
virtual ~GSDeviceDX();
bool SetFeatureLevel(D3D_FEATURE_LEVEL level, bool compat_mode);
void GetFeatureLevel(D3D_FEATURE_LEVEL& level) const {level = m_shader.level;}
virtual void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim) = 0;
virtual void SetupVS(VSSelector sel, const VSConstantBuffer* cb) = 0;

View File

@ -500,6 +500,11 @@ GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const G
GSPixelOffset4* o = (GSPixelOffset4*)_aligned_malloc(sizeof(GSPixelOffset4), 32);
o->hash = hash;
o->fbp = fbp;
o->zbp = zbp;
o->fpsm = fpsm;
o->zpsm = zpsm;
o->bw = bw;
pixelAddress fpa = m_psm[fpsm].pa;
pixelAddress zpa = m_psm[zpsm].pa;

View File

@ -63,6 +63,7 @@ struct GSPixelOffset4
GSVector2i row[2048]; // f yn | z yn (n = 0 1 2 ...)
GSVector2i col[512]; // f xn | z xn (n = 0 4 8 ...)
uint32 hash;
uint32 fbp, zbp, fpsm, zpsm, bw;
};
class GSLocalMemory : public GSBlock

View File

@ -64,11 +64,15 @@ GSRasterizer::~GSRasterizer()
// Tells whether the m_myscanline entry covering row `top` is set.
// Rows are grouped in bands of (1 << THREAD_HEIGHT) lines.
bool GSRasterizer::IsOneOfMyScanlines(int top) const
{
	ASSERT(top >= 0 && top < 2048);

	const int band = top >> THREAD_HEIGHT;

	return m_myscanline[band] != 0;
}
bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
{
ASSERT(top >= 0 && top < 2048 && bottom >= 0 && bottom < 2048);
top = top >> THREAD_HEIGHT;
bottom = (bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT;
@ -187,12 +191,12 @@ void GSRasterizer::Draw(GSRasterizerData* data)
if(index != NULL)
{
do {DrawSprite(vertex, index, data->solidrect); index += 2;}
do {DrawSprite(vertex, index); index += 2;}
while(index < index_end);
}
else
{
do {DrawSprite(vertex, tmp_index, data->solidrect); vertex += 2;}
do {DrawSprite(vertex, tmp_index); vertex += 2;}
while(vertex < vertex_end);
}
@ -407,7 +411,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
GSVector4 tbf = y0011.xzxz(y1221).ceil();
GSVector4 tbmax = tbf.max(m_fscissor_y);
GSVector4 tbmin = tbf.min(m_fscissor_y);
GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin));
GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin)); // max(y0, t) max(y1, t) min(y1, b) min(y2, b)
dv[0] = v1 - v0;
dv[1] = v2 - v0;
@ -565,7 +569,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
m_edge.count += e - &m_edge.buff[m_edge.count];
}
void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index, bool solidrect)
void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index)
{
const GSVertexSW& v0 = vertex[index[0]];
const GSVertexSW& v1 = vertex[index[1]];
@ -589,7 +593,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index, boo
GSVertexSW scan = v[0];
if(solidrect)
if(m_ds->IsSolidRect())
{
if(m_threads == 1)
{
@ -904,7 +908,6 @@ GSRasterizerList::GSRasterizerList()
: GSJobQueue<shared_ptr<GSRasterizerData> >()
, m_sync_count(0)
, m_syncpoint_count(0)
, m_solidrect_count(0)
{
}
@ -955,11 +958,6 @@ int GSRasterizerList::GetPixels(bool reset)
void GSRasterizerList::Process(shared_ptr<GSRasterizerData>& item)
{
if(item->solidrect)
{
m_solidrect_count++;
}
if(item->syncpoint)
{
for(size_t i = 0; i < m_workers.size(); i++)

View File

@ -39,7 +39,6 @@ public:
int vertex_count;
uint32* index;
int index_count;
bool solidrect;
bool syncpoint;
uint64 frame;
@ -52,7 +51,6 @@ public:
, vertex_count(0)
, index(NULL)
, index_count(0)
, solidrect(false)
, syncpoint(false)
, frame(0)
{
@ -101,6 +99,7 @@ public:
#endif
__forceinline bool HasEdge() const {return m_de != NULL;}
__forceinline bool IsSolidRect() const {return m_dr != NULL;}
};
class IRasterizer : public GSAlignedClass<32>
@ -133,7 +132,7 @@ protected:
void DrawPoint(const GSVertexSW* vertex, int vertex_count, const uint32* index, int index_count);
void DrawLine(const GSVertexSW* vertex, const uint32* index);
void DrawTriangle(const GSVertexSW* vertex, const uint32* index);
void DrawSprite(const GSVertexSW* vertex, const uint32* index, bool solidrect);
void DrawSprite(const GSVertexSW* vertex, const uint32* index);
__forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& edge, const GSVertexSW& dedge, const GSVertexSW& dscan, const GSVector4& p0);
@ -214,7 +213,6 @@ public:
int m_sync_count;
int m_syncpoint_count;
int m_solidrect_count;
// IRasterizer

View File

@ -0,0 +1,426 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSRendererCS.h"
// Constructs the compute shader renderer. The base class receives a fresh
// GSVertexTraceCS and the size of one GSVertex.
GSRendererCS::GSRendererCS()
: GSRenderer(new GSVertexTraceCS(this), sizeof(GSVertex))
{
m_nativeres = true;
// NOTE(review): InitConvertVertex is a macro defined elsewhere; presumably it registers
// the ConvertVertex<prim, tme, fst> instantiations of this class - confirm.
InitConvertVertex(GSRendererCS);
// start with every page bit cleared (see Write/Read for how the bits are used)
memset(m_vm_valid, 0, sizeof(m_vm_valid));
}
// Nothing is released explicitly here; the body is intentionally empty.
GSRendererCS::~GSRendererCS()
{
}
// Initializes the base renderer and creates the GPU resources used by the
// compute shader path:
//   m_vm / m_vm_uav - raw buffer holding a copy of local memory plus its UAV
//   m_vb, m_ib      - dynamic vertex (structured) and index buffers, bound as shader resources
//   m_pb            - one page sized staging buffer for cpu<->gpu copies
//
// dev_unk is treated as a GSDevice11. Returns false if the base class fails,
// the feature level is below 10.0, or any resource cannot be created.
bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
{
	if(!__super::CreateDevice(dev_unk))
		return false;

	D3D_FEATURE_LEVEL level;

	((GSDeviceDX*)dev_unk)->GetFeatureLevel(level);

	if(level < D3D_FEATURE_LEVEL_10_0)
		return false;

	HRESULT hr;

	GSDevice11* dev = (GSDevice11*)dev_unk;

	D3D11_BUFFER_DESC bd;
	D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;

	// video memory (4MB)

	memset(&bd, 0, sizeof(bd));

	bd.ByteWidth = VM_SIZE;
	bd.StructureByteStride = 4;
	bd.Usage = D3D11_USAGE_DEFAULT;
	bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
	bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;

	hr = (*dev)->CreateBuffer(&bd, NULL, &m_vm);

	if(FAILED(hr)) return false;

	memset(&uavd, 0, sizeof(uavd));

	uavd.Format = DXGI_FORMAT_R32_TYPELESS;
	uavd.Buffer.FirstElement = 0;
	uavd.Buffer.NumElements = VM_SIZE / 4; // raw view, counted in 32-bit elements
	uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
	uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;

	hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav);

	if(FAILED(hr)) return false;

	// vertex buffer (the 10000 vertex limit is enforced in Draw)

	memset(&bd, 0, sizeof(bd));

	bd.ByteWidth = sizeof(GSVertex) * 10000;
	bd.StructureByteStride = sizeof(GSVertex);
	bd.Usage = D3D11_USAGE_DYNAMIC;
	bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
	bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
	bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;

	hr = (*dev)->CreateBuffer(&bd, NULL, &m_vb);

	if(FAILED(hr)) return false;

	// index buffer (the 30000 index limit is enforced in Draw)

	memset(&bd, 0, sizeof(bd));

	bd.ByteWidth = sizeof(uint32) * 10000 * 3;
	bd.Usage = D3D11_USAGE_DYNAMIC;
	bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
	bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;

	hr = (*dev)->CreateBuffer(&bd, NULL, &m_ib);

	if(FAILED(hr)) return false;

	// one page, for copying between cpu<->gpu

	memset(&bd, 0, sizeof(bd));

	bd.ByteWidth = PAGE_SIZE;
	bd.Usage = D3D11_USAGE_STAGING;
	bd.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;

	hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb);

	if(FAILED(hr)) return false;

	return true;
}
// Not implemented yet: always returns NULL, the parameter i is ignored.
GSTexture* GSRendererCS::GetOutput(int i)
{
// TODO: create a compute shader which unswizzles the frame from m_vm to the output texture
return NULL;
}
// Copies the vertex at src_index over the one at dst_index inside m_vertex.buff.
// The vertex data itself is left in its GSVertex form; the template arguments are unused for now.
template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererCS::ConvertVertex(size_t dst_index, size_t src_index)
{
	// TODO: vertex format more fitting as the input for the compute shader

	if(src_index == dst_index)
	{
		return; // already in place
	}

	GSVertex* vertices = (GSVertex*)m_vertex.buff;

	GSVertex tmp = vertices[src_index];

	vertices[dst_index] = tmp;
}
void GSRendererCS::Draw()
{
HRESULT hr;
GSDevice11* dev = (GSDevice11*)m_dev;
ID3D11DeviceContext* ctx = *dev;
D3D11_BUFFER_DESC bd;
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
D3D11_MAPPED_SUBRESOURCE map;
CComPtr<ID3D11ShaderResourceView> vb_srv;
CComPtr<ID3D11ShaderResourceView> ib_srv;
// TODO: cache these in hash_maps
CComPtr<ID3D11Buffer> fbr, fbc, zbr, zbc;
CComPtr<ID3D11ShaderResourceView> fbr_srv, fbc_srv, zbr_srv, zbc_srv;
// TODO: grow m_vb, m_ib if needed
if(m_vertex.next > 10000) return;
if(m_index.tail > 30000) return;
// TODO: fill/advance/discardwhenfull, as in GSDevice11::IASetVertexBuffer/IASetIndexBuffer
hr = ctx->Map(m_vb, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around
if(FAILED(hr)) return;
memcpy(map.pData, m_vertex.buff, sizeof(GSVertex) * m_vertex.next);
ctx->Unmap(m_vb, 0);
//
hr = ctx->Map(m_ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around
if(FAILED(hr)) return;
memcpy(map.pData, m_index.buff, sizeof(uint32) * m_index.tail);
ctx->Unmap(m_ib, 0);
// TODO: UpdateResource might be faster, based on my exprience with the real vertex buffer, write-no-overwrite/discarded dynamic buffer + map is better
//
memset(&srvd, 0, sizeof(srvd));
srvd.Format = DXGI_FORMAT_UNKNOWN;
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
srvd.Buffer.FirstElement = 0;
srvd.Buffer.NumElements = m_vertex.next;
hr = (*dev)->CreateShaderResourceView(m_vb, &srvd, &vb_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg
memset(&srvd, 0, sizeof(srvd));
srvd.Format = DXGI_FORMAT_R32_UINT;
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
srvd.Buffer.FirstElement = 0;
srvd.Buffer.NumElements = m_index.tail;
hr = (*dev)->CreateShaderResourceView(m_ib, &srvd, &ib_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg
// fzb offsets
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(int) * 4096;
bd.StructureByteStride = sizeof(int);
bd.Usage = D3D11_USAGE_IMMUTABLE;
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
D3D11_SUBRESOURCE_DATA data;
memset(&data, 0, sizeof(data));
data.pSysMem = m_context->offset.fb->pixel.row;
hr = (*dev)->CreateBuffer(&bd, &data, &fbr);
data.pSysMem = m_context->offset.fb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats
hr = (*dev)->CreateBuffer(&bd, &data, &fbc);
data.pSysMem = m_context->offset.zb->pixel.row;
hr = (*dev)->CreateBuffer(&bd, &data, &zbr);
data.pSysMem = m_context->offset.zb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats
hr = (*dev)->CreateBuffer(&bd, &data, &zbc);
// TODO: D3D10_SHADER_MACRO (primclass, less frequently changing drawing attribs, etc.)
uint32 sel = 0; // TODO
hash_map<uint32, CComPtr<ID3D11ComputeShader> >::iterator i = m_cs.find(sel);
CComPtr<ID3D11ComputeShader> cs;
if(i == m_cs.end())
{
// hr = dev->CompileShader(IDR_CS_FX, "cs_main", NULL, &cs);
hr = dev->CompileShader("E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\cs.fx", "cs_main", NULL, &cs);
if(FAILED(hr)) return;
m_cs[sel] = cs;
}
else
{
cs = i->second;
}
//
dev->CSSetShaderUAV(0, m_vm_uav);
dev->CSSetShaderSRV(0, vb_srv);
dev->CSSetShaderSRV(1, ib_srv);
dev->CSSetShaderSRV(2, fbr_srv);
dev->CSSetShaderSRV(3, fbc_srv);
dev->CSSetShaderSRV(4, zbr_srv);
dev->CSSetShaderSRV(5, zbc_srv);
dev->CSSetShader(cs);
GSVector4i bbox = GSVector4i(0, 0, 640, 512); // TODO: vertex trace
GSVector4i r = bbox.ralign<Align_Outside>(GSVector2i(16, 8));
bool fb = true; // TODO: frame buffer used
bool zb = true; // TODO: z-buffer used
if(fb) Write(m_context->offset.fb, r);
if(zb) Write(m_context->offset.zb, r);
// TODO: constant buffer (frequently chaning drawing attribs)
// TODO: texture (implement texture cache)
// TODO: clut to a palette texture (should be texture1d, not simply buffer, it is random accessed)
// TODO: CSSetShaderSRV(6 7 8 ..., texture level 0 1 2 ...) or use Texture3D?
// TODO: invalidate texture cache
/*
CComPtr<ID3D11Query> q;
D3D11_QUERY_DESC qd;
memset(&qd, 0, sizeof(qd));
qd.Query = D3D11_QUERY_EVENT;
hr = (*dev)->CreateQuery(&qd, &q);
ctx->Begin(q);
*/
printf("[%lld] dispatch %05x %d %05x %d %05x %d %dx%d | %d %d %d\n",
__rdtsc(),
m_context->FRAME.Block(), m_context->FRAME.PSM,
m_context->ZBUF.Block(), m_context->ZBUF.PSM,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
PRIM->PRIM, m_vertex.next, m_index.tail);
GSVector4i rsize = r.rsize();
dev->Dispatch(rsize.z >> 4, rsize.w >> 3, 1); // TODO: pass upper-left corner offset (r.xy) in a const buffer
/*
ctx->End(q);
uint64 t0 = __rdtsc();
BOOL b;
while(S_OK != ctx->GetData(q, &b, sizeof(BOOL), 0)) {}
printf("%lld\n", __rdtsc() - t0);
*/
}
// Reads the pages covered by r in the destination buffer described by BITBLTBUF
// (DBP/DBW/DPSM) back through Read, with invalidation requested.
void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
{
	// TODO: fully overwritten pages are not needed to be read, only invalidated

	Read(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), r, true);

	// TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target
	// TODO: invalidate texture cache
}
// Reads the pages covered by r in the source buffer described by BITBLTBUF
// (SBP/SBW/SPSM) back through Read, without invalidation. The clut argument is not used.
void GSRendererCS::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
{
	Read(m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM), r, false);
}
// Uploads every page touched by rectangle r of buffer o from m_mem.m_vm8 into
// m_vm, unless the page is already flagged in m_vm_valid. Uploaded pages get
// their bit set.
//
// m_vm_valid holds one bit per page: word = page >> 5, bit = page & 31.
void GSRendererCS::Write(GSOffset* o, const GSVector4i& r)
{
	GSDevice11* dev = (GSDevice11*)m_dev;

	ID3D11DeviceContext* ctx = *dev;

	D3D11_BOX box;

	memset(&box, 0, sizeof(box));

	// A buffer is addressed as a one dimensional resource. A box whose bottom/back
	// do not exceed top/front is empty and turns the update into a no-op.

	box.bottom = 1;
	box.back = 1;

	uint32* pages = o->GetPages(r);

	for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
	{
		uint32 page = pages[i];

		uint32 row = page >> 5;
		uint32 col = 1u << (page & 31); // unsigned literal, bit 31 must not go through a signed shift

		if((m_vm_valid[row] & col) == 0)
		{
			m_vm_valid[row] |= col;

			box.left = page * PAGE_SIZE;
			box.right = box.left + PAGE_SIZE;

			ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + box.left, 0, 0);

			printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page);
		}
	}

	delete [] pages;
}
// Copies every page touched by rectangle r of buffer o that is flagged in
// m_vm_valid from m_vm back into m_mem.m_vm8, going through the one page
// staging buffer m_pb. When invalidate is true the page bit is cleared.
//
// m_vm_valid holds one bit per page: word = page >> 5, bit = page & 31.
void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
{
	GSDevice11* dev = (GSDevice11*)m_dev;

	ID3D11DeviceContext* ctx = *dev;

	D3D11_BOX box;

	memset(&box, 0, sizeof(box));

	// A buffer is addressed as a one dimensional resource. A box whose bottom/back
	// do not exceed top/front is empty and turns the copy into a no-op.

	box.bottom = 1;
	box.back = 1;

	uint32* pages = o->GetPages(r);

	for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
	{
		uint32 page = pages[i];

		uint32 row = page >> 5;
		uint32 col = 1u << (page & 31); // unsigned literal, bit 31 must not go through a signed shift

		if(m_vm_valid[row] & col)
		{
			if(invalidate) m_vm_valid[row] ^= col; // the bit is known to be set, so this clears it

			box.left = page * PAGE_SIZE;
			box.right = box.left + PAGE_SIZE;

			ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box);

			D3D11_MAPPED_SUBRESOURCE map;

			if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ_WRITE, 0, &map)))
			{
				memcpy(m_mem.m_vm8 + box.left, map.pData, PAGE_SIZE);

				ctx->Unmap(m_pb, 0);

				printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page);
			}
		}
	}

	delete [] pages;
}

View File

@ -0,0 +1,59 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#pragma once
#include "GSRenderer.h"
#include "GSDevice11.h"
// Experimental renderer that draws through a Direct3D 11 compute shader.
// It keeps its own GPU-side buffer for local memory (m_vm) and tracks, per page,
// whether that buffer has received the page contents.
class GSRendererCS : public GSRenderer
{
// Vertex trace used by this renderer; adds nothing to GSVertexTrace yet.
class GSVertexTraceCS : public GSVertexTrace
{
public:
GSVertexTraceCS(const GSState* state) : GSVertexTrace(state) {}
};
// GPU buffer mirroring local memory, and the raw unordered access view over it
CComPtr<ID3D11Buffer> m_vm;
CComPtr<ID3D11UnorderedAccessView> m_vm_uav;
// dynamic vertex and index buffers, read by the shader as shader resources
CComPtr<ID3D11Buffer> m_vb;
CComPtr<ID3D11Buffer> m_ib;
// staging buffer of one page, used for copying between cpu and gpu
CComPtr<ID3D11Buffer> m_pb;
// compiled compute shaders, keyed by a selector value
hash_map<uint32, CComPtr<ID3D11ComputeShader> > m_cs;
// one bit per page (word = page >> 5, bit = page & 31); set once the page was uploaded to m_vm
uint32 m_vm_valid[16];
// uploads the not yet valid pages covered by r to m_vm and marks them valid
void Write(GSOffset* o, const GSVector4i& r);
// copies the valid pages covered by r back to system memory, optionally clearing their valid bit
void Read(GSOffset* o, const GSVector4i& r, bool invalidate);
protected:
template<uint32 prim, uint32 tme, uint32 fst>
void ConvertVertex(size_t dst_index, size_t src_index);
bool CreateDevice(GSDevice* dev);
GSTexture* GetOutput(int i);
void Draw();
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut);
public:
GSRendererCS();
virtual ~GSRendererCS();
};

View File

@ -89,8 +89,7 @@ void GSRendererSW::VSync(int field)
//
printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0;
printf("m_syncpoint_count = %d\n", ((GSRasterizerList*)m_rl)->m_syncpoint_count); ((GSRasterizerList*)m_rl)->m_syncpoint_count = 0;
printf("m_solidrect_count = %d\n", ((GSRasterizerList*)m_rl)->m_solidrect_count); ((GSRasterizerList*)m_rl)->m_solidrect_count = 0;
*/
*/
GSRenderer::VSync(field);
m_tc->IncAge();
@ -198,29 +197,38 @@ void GSRendererSW::ConvertVertex(size_t dst_index, size_t src_index)
}
}
#define LOG 0
FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
void GSRendererSW::Draw()
{
SharedData* sd = new SharedData(this);
shared_ptr<GSRasterizerData> data(sd);
if(!GetScanlineGlobalData(sd)) return;
sd->primclass = m_vt->m_primclass;
sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_vertex.next + sizeof(uint32) * m_index.tail, 32);
sd->vertex = (GSVertexSW*)sd->buff;
sd->vertex_count = m_vertex.next;
sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * m_vertex.next);
sd->index_count = m_index.tail;
data->primclass = m_vt->m_primclass;
data->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_vertex.next + sizeof(uint32) * m_index.tail, 32);
data->vertex = (GSVertexSW*)data->buff;
data->vertex_count = m_vertex.next;
data->index = (uint32*)(data->buff + sizeof(GSVertexSW) * m_vertex.next);
data->index_count = m_index.tail;
memcpy(data->vertex, m_vertex.buff, sizeof(GSVertexSW) * m_vertex.next);
memcpy(data->index, m_index.buff, sizeof(uint32) * m_index.tail);
memcpy(sd->vertex, m_vertex.buff, sizeof(GSVertexSW) * m_vertex.next);
memcpy(sd->index, m_index.buff, sizeof(uint32) * m_index.tail);
for(size_t i = 0; i < m_index.tail; i++)
{
ASSERT(((GSVertexSW*)m_vertex.buff + m_index.buff[i])->_pad.u32[0] == 0x12345678);
}
// TODO: delay texture update, do it later along with the syncing on the dispatcher thread, then this thread does not have to wait and can continue assembling more jobs
// TODO: if(any texture page is used as a target) GSRasterizerData::syncpoint = true;
// TODO: virtual void GSRasterizerData::Update() {texture[all levels]->Update();}, call it from the dispatcher thread before sending to workers
// TODO: m_tc->InvalidatePages must be called after texture->Update, move that inside GSRasterizerData::Update too
if(!GetScanlineGlobalData(sd)) return;
//
const GSDrawingContext* context = m_context;
@ -232,10 +240,9 @@ void GSRendererSW::Draw()
scissor.z = std::min<int>(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
data->scissor = scissor;
data->bbox = bbox;
data->solidrect = gd.sel.IsSolidRect();
data->frame = m_perfmon.GetFrame();
sd->scissor = scissor;
sd->bbox = bbox;
sd->frame = m_perfmon.GetFrame();
//
@ -262,41 +269,75 @@ void GSRendererSW::Draw()
if(m_fzb != context->offset.fzb)
{
m_fzb = context->offset.fzb;
// hmm, what if "r" gets bigger next time and slips through unchecked, need to trace that too
data->syncpoint = true;
}
sd->syncpoint = true; // TODO
// - chross-check frame and z-buffer pages, they cannot overlap with eachother and with previous batches in queue
// - m_fzb filters out most of these cases, only have to be careful when the addresses stay the same and the output is mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300)
if(!data->syncpoint)
{
if(gd.sel.fwrite)
if(!sd->syncpoint)
{
for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
if(fb_pages == NULL)
{
if(m_fzb_pages[*p] & 0xffff0000) // already used as a z-buffer
fb_pages = context->offset.fb->GetPages(r);
}
if(CheckTargetPages<0xffffffff>(fb_pages))
{
sd->syncpoint = true;
if(LOG) fprintf(s_fp, "syncpoint 0\n");
}
}
if(!sd->syncpoint)
{
if(zb_pages == NULL)
{
zb_pages = context->offset.zb->GetPages(r);
}
if(CheckTargetPages<0xffffffff>(zb_pages))
{
sd->syncpoint = true;
if(LOG) fprintf(s_fp, "syncpoint 1\n");
}
}
if(!sd->syncpoint)
{
if(LOG) fprintf(s_fp, "no syncpoint *\n");
}
m_fzb = context->offset.fzb;
}
else
{
// cross-check frame and z-buffer pages: they cannot overlap with each other or with previous batches in the queue.
// m_fzb filters out most of these cases; we only have to be careful when the addresses stay the same and the output
// is mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300)
if(!sd->syncpoint)
{
if(gd.sel.fwrite)
{
if(CheckTargetPages<0xffff0000>(fb_pages)) // already used as a z-buffer
{
data->syncpoint = true;
break;
sd->syncpoint = true;
if(LOG) fprintf(s_fp, "syncpoint 2\n");
}
}
}
}
if(!data->syncpoint)
{
if(gd.sel.zwrite)
if(!sd->syncpoint)
{
for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
if(gd.sel.zwrite)
{
if(m_fzb_pages[*p] & 0x0000ffff) // already used as a frame buffer
if(CheckTargetPages<0x0000ffff>(zb_pages)) // already used as a frame buffer
{
data->syncpoint = true;
sd->syncpoint = true;
break;
if(LOG) fprintf(s_fp, "syncpoint 3\n");
}
}
}
@ -363,6 +404,12 @@ void GSRendererSW::Draw()
}
else
{
if(LOG) fprintf(s_fp, "queue %05x %d %05x %d %05x %d %dx%d | %d %d %d\n",
m_context->FRAME.Block(), m_context->FRAME.PSM,
m_context->ZBUF.Block(), m_context->ZBUF.PSM,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
PRIM->PRIM, sd->vertex_count, sd->index_count);
m_rl->Queue(data);
}
@ -384,8 +431,16 @@ void GSRendererSW::Sync(int reason)
GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync);
uint64 t = __rdtsc();
m_rl->Sync();
s_n++;
t = __rdtsc() - t;
if(LOG) fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, m_rl->GetPixels(), t > 10000000 ? '*' : ' ');
m_perfmon.Put(GSPerfMon::Fillrate, m_rl->GetPixels());
}
@ -397,8 +452,6 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
o->GetPages(r, p);
m_tc->InvalidatePages(p, o->psm);
// check if the changing pages either used as a texture or a target
for(; *p != GSOffset::EOP; p++)
@ -414,6 +467,8 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
break;
}
}
m_tc->InvalidatePages(m_tmp_pages, o->psm); // if texture update runs on a thread and Sync(5) happens then this must come later
}
void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
@ -493,6 +548,19 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type)
}
}
template<uint32 mask> bool GSRendererSW::CheckTargetPages(const uint32* pages)
{
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
{
if(mask != 0xffffffff ? (m_fzb_pages[*p] & mask) : m_fzb_pages[*p])
{
return true;
}
}
return false;
}
#include "GSTextureSW.h"
bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
@ -811,19 +879,19 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
{
// skip per pixel division if q is constant
GSVertexSW* RESTRICT v = (GSVertexSW*)m_vertex.buff;
GSVertexSW* RESTRICT v = data->vertex;
if(m_vt->m_eq.q)
{
gd.sel.fst = 1;
const GSVector4& t = v[m_index.buff[0]].t;
const GSVector4& t = v[data->index[0]].t;
if(t.z != 1.0f)
{
GSVector4 w = t.zzzz().rcpnr();
for(int i = 0, j = m_vertex.next; i < j; i++)
for(int i = 0, j = data->vertex_count; i < j; i++)
{
GSVector4 t = v[i].t;
@ -835,7 +903,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
{
gd.sel.fst = 1;
for(int i = 0, j = m_vertex.next; i < j; i += 2)
for(int i = 0, j = data->vertex_count; i < j; i += 2)
{
GSVector4 t0 = v[i + 0].t;
GSVector4 t1 = v[i + 1].t;
@ -856,9 +924,9 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
GSVector4 half(0x8000, 0x8000);
GSVertexSW* RESTRICT v = (GSVertexSW*)m_vertex.buff;
GSVertexSW* RESTRICT v = data->vertex;
for(int i = 0, j = m_vertex.next; i < j; i++)
for(int i = 0, j = data->vertex_count; i < j; i++)
{
GSVector4 t = v[i].t;
@ -1051,12 +1119,12 @@ GSRendererSW::SharedData::~SharedData()
delete m_fb_pages;
delete m_zb_pages;
for(size_t i = 0; i < countof(m_tex_pages) && m_tex_pages[i] != NULL; i++)
{
m_parent->ReleasePages(m_tex_pages[i], 2);
}
if(global.clut) _aligned_free(global.clut);
if(global.dimx) _aligned_free(global.dimx);
}

View File

@ -66,6 +66,7 @@ protected:
void UsePages(const uint32* pages, int type);
void ReleasePages(const uint32* pages, int type);
template<uint32 mask> bool CheckTargetPages(const uint32* pages);
bool GetScanlineGlobalData(SharedData* data);

View File

@ -671,11 +671,6 @@ template<int i> void GSState::ApplyTEX0(GIFRegTEX0& TEX0)
TEX0.CPSM &= 0xa; // 1010b
if((TEX0.TBW & 1) && (TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT4))
{
TEX0.TBW &= ~1; // GS User 2.6
}
if((TEX0.u32[0] ^ m_env.CTXT[i].TEX0.u32[0]) & 0x3ffffff) // TBP0 TBW PSM
{
m_env.CTXT[i].offset.tex = m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
@ -709,6 +704,13 @@ template<int i> void GSState::GIFRegHandlerTEX0(const GIFReg* RESTRICT r)
if(TEX0.TW > 10) TEX0.TW = 10;
if(TEX0.TH > 10) TEX0.TH = 10;
if((TEX0.TBW & 1) && (TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT4))
{
ASSERT(TEX0.TBW == 1); // TODO
TEX0.TBW &= ~1; // GS User 2.6
}
ApplyTEX0<i>(TEX0);
if(m_env.CTXT[i].TEX1.MTBA)
@ -1265,32 +1267,34 @@ void GSState::FlushPrim()
size_t head = m_vertex.head;
size_t tail = m_vertex.tail;
size_t next = m_vertex.next;
size_t unused = 0;
if(tail > head)
{
switch(PRIM->PRIM)
{
case GS_POINTLIST:
ASSERT(0);
break;
case GS_LINELIST:
case GS_LINESTRIP:
case GS_SPRITE:
if(tail > head + 0) memcpy(&buff[stride * 0], &m_vertex.buff[stride * (head + 0)], stride);
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
if(tail > head + 0) memcpy(&buff[stride * 0], &m_vertex.buff[stride * (head + 0)], stride);
if(tail > head + 1) memcpy(&buff[stride * 1], &m_vertex.buff[stride * (head + 1)], stride);
unused = tail - head;
memcpy(buff, &m_vertex.buff[stride * head], stride * unused);
break;
case GS_TRIANGLEFAN:
if(tail > head + 0) memcpy(&buff[stride * 0], &m_vertex.buff[stride * (head + 0)], stride);
if(tail > head + 1) memcpy(&buff[stride * 1], &m_vertex.buff[stride * (tail - 1)], stride);
memcpy(buff, &m_vertex.buff[stride * head], stride); unused = 1;
if(tail - 1 > head) {memcpy(&buff[stride], &m_vertex.buff[stride * (tail - 1)], stride); unused = 2;}
break;
case GS_INVALID:
break;
default:
__assume(0);
}
ASSERT(unused < GSUtil::GetVertexCount(PRIM->PRIM));
}
if(GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt < 3 && GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt < 3)
@ -1308,34 +1312,19 @@ void GSState::FlushPrim()
m_index.tail = 0;
m_vertex.head = 0;
m_vertex.tail = 0;
m_vertex.next = 0;
if(tail > head)
if(unused > 0)
{
switch(PRIM->PRIM)
{
case GS_POINTLIST:
break;
case GS_LINELIST:
case GS_LINESTRIP:
case GS_SPRITE:
if(tail > head + 0) {memcpy(&m_vertex.buff[stride * 0], &buff[stride * 0], stride); m_vertex.tail++;}
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
if(tail > head + 0) {memcpy(&m_vertex.buff[stride * 0], &buff[stride * 0], stride); m_vertex.tail++;}
if(tail > head + 1) {memcpy(&m_vertex.buff[stride * 1], &buff[stride * 1], stride); m_vertex.tail++;}
break;
case GS_INVALID:
break;
default:
__assume(0);
}
memcpy(m_vertex.buff, buff, stride * unused);
m_vertex.tail = unused;
m_vertex.next = next > head ? next - head : 0;
}
else
{
m_vertex.tail = 0;
m_vertex.next = 0;
}
}
}
@ -1380,6 +1369,15 @@ void GSState::Write(const uint8* mem, int len)
m_tr.start = m_tr.end = m_tr.total;
m_perfmon.Put(GSPerfMon::Swizzle, len);
/*
static int n = 0;
string s;
s = format("c:\\temp1\\[%04d]_%05x_%d_%d_%d_%d_%d_%d.bmp",
n++, (int)m_env.BITBLTBUF.DBP, (int)m_env.BITBLTBUF.DBW, (int)m_env.BITBLTBUF.DPSM,
r.left, r.top, r.right, r.bottom);
m_mem.SaveBMP(s, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM, r.right, r.bottom);
*/
}
else
{

View File

@ -95,7 +95,7 @@ void GSTextureCacheSW::InvalidatePages(const uint32* pages, uint32 psm)
{
Texture* t = *i;
if(GSUtil::HasSharedBits(psm, t->m_TEX0.PSM))
if(GSUtil::HasSharedBits(psm, t->m_sharedbits))
{
uint32* RESTRICT valid = t->m_valid;
@ -181,6 +181,8 @@ GSTextureCacheSW::Texture::Texture(GSState* state, uint32 tw0, const GIFRegTEX0&
memset(m_valid, 0, sizeof(m_valid));
memset(m_pages.bm, 0, sizeof(m_pages.bm));
m_sharedbits = GSUtil::HasSharedBitsPtr(m_TEX0.PSM);
m_offset = m_state->m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
m_pages.n = m_offset->GetPages(GSVector4i(0, 0, 1 << TEX0.TW, 1 << TEX0.TH));

View File

@ -41,6 +41,7 @@ public:
vector<GSVector2i>* m_p2t;
uint32 m_valid[MAX_PAGES];
struct {uint32 bm[16]; const uint32* n;} m_pages;
const uint32* RESTRICT m_sharedbits;
// m_valid
// fast mode: each uint32 bits map to the 32 blocks of that page

View File

@ -275,19 +275,15 @@ protected:
if(m_exit) {m_cv.lock.Unlock(); return;}
}
{
// NOTE: this is scoped because we must make sure the last item is no longer around when Wait detects an empty queue
T& item = m_queue.front();
T item = m_queue.front();
m_cv.lock.Unlock();
m_cv.lock.Unlock();
Process(item);
Process(item);
m_cv.lock.Lock();
m_cv.lock.Lock();
m_queue.pop();
}
m_queue.pop();
if(m_queue.empty())
{
@ -312,23 +308,18 @@ protected:
m_ev.lock.Lock();
}
{
// NOTE: this is scoped because we must make sure the last item is no longer around when Wait detects an empty queue
T& item = m_queue.front();
T item = m_queue.front();
m_ev.lock.Unlock();
m_ev.lock.Unlock();
Process(item);
Process(item);
m_ev.lock.Lock();
m_ev.lock.Lock();
m_queue.pop();
}
m_queue.pop();
_InterlockedDecrement(&m_ev.count);
}
}
}

View File

@ -161,6 +161,16 @@ int GSUtil::GetVertexCount(uint32 prim)
return s_maps.VertexCountField[prim];
}
// Returns a pointer to the s_maps.SharedBitsField row for the given dpsm.
// The row can be cached by the caller and later passed to
// HasSharedBits(spsm, ptr), which indexes it by spsm, instead of looking the
// row up again for every query.
const uint32* GSUtil::HasSharedBitsPtr(uint32 dpsm)
{
	const uint32* row = s_maps.SharedBitsField[dpsm];

	return row;
}
// Tests bit "spsm" of the bitfield row "ptr" (as returned by HasSharedBitsPtr).
//
// spsm - bit index; spsm >> 5 selects the 32-bit word, spsm & 0x1f the bit in it.
// ptr  - bitfield row to test.
//
// Returns true when the selected bit is CLEAR (a cleared bit is what this
// function reports as "has shared bits").
bool GSUtil::HasSharedBits(uint32 spsm, const uint32* RESTRICT ptr)
{
	// Build the bit mask from an unsigned 1: with a plain int, a bit index of 31
	// would shift into the sign bit, which is a signed overflow.
	return (ptr[spsm >> 5] & (1u << (spsm & 0x1f))) == 0;
}
bool GSUtil::HasSharedBits(uint32 spsm, uint32 dpsm)
{
return (s_maps.SharedBitsField[dpsm][spsm >> 5] & (1 << (spsm & 0x1f))) == 0;

View File

@ -31,6 +31,8 @@ public:
static GS_PRIM_CLASS GetPrimClass(uint32 prim);
static int GetVertexCount(uint32 prim);
static const uint32* HasSharedBitsPtr(uint32 dpsm);
static bool HasSharedBits(uint32 spsm, const uint32* ptr);
static bool HasSharedBits(uint32 spsm, uint32 dpsm);
static bool HasSharedBits(uint32 sbp, uint32 spsm, uint32 dbp, uint32 dpsm);
static bool HasCompatibleBits(uint32 spsm, uint32 dpsm);

View File

@ -57,6 +57,7 @@ IDR_TFX_FX RCDATA "res\\tfx.fx"
IDR_MERGE_FX RCDATA "res\\merge.fx"
IDR_INTERLACE_FX RCDATA "res\\interlace.fx"
IDR_FXAA_FX RCDATA "res\\fxaa.fx"
IDR_CS_FX RCDATA "res\\cs.fx"
/////////////////////////////////////////////////////////////////////////////
//

View File

@ -531,6 +531,7 @@
<AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">AssemblyAndSourceCode</AssemblerOutput>
</ClCompile>
<ClCompile Include="GSRenderer.cpp" />
<ClCompile Include="GSRendererCS.cpp" />
<ClCompile Include="GSRendererDX.cpp" />
<ClCompile Include="GSRendererDX11.cpp" />
<ClCompile Include="GSRendererDX9.cpp" />
@ -1658,6 +1659,7 @@
<ClInclude Include="GSPerfMon.h" />
<ClInclude Include="GSRasterizer.h" />
<ClInclude Include="GSRenderer.h" />
<ClInclude Include="GSRendererCS.h" />
<ClInclude Include="GSRendererDX.h" />
<ClInclude Include="GSRendererDX11.h" />
<ClInclude Include="GSRendererDX9.h" />
@ -1727,6 +1729,7 @@
</ItemGroup>
<ItemGroup>
<None Include="GSdx.def" />
<None Include="res\cs.fx" />
<None Include="res\fxaa.fx" />
<None Include="res\logo10.bmp" />
<None Include="res\logo9.bmp" />

View File

@ -324,6 +324,9 @@
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSRendererCS.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="GS.h">
@ -647,6 +650,9 @@
<ClInclude Include="config.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="GSRendererCS.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="res\logo10.bmp">
@ -677,6 +683,9 @@
<None Include="res\fxaa.fx">
<Filter>Shaders</Filter>
</None>
<None Include="res\cs.fx">
<Filter>Shaders</Filter>
</None>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="GSdx.rc">

73
plugins/GSdx/res/cs.fx Normal file
View File

@ -0,0 +1,73 @@
// Element type of VertexBuffer (StructuredBuffer<Vertex>); 32 bytes per vertex
// (float2 + uint + float + 4 x uint).
// NOTE(review): the field meanings below are inferred from the names only -
// confirm them against the CPU-side vertex struct that fills the buffer.
struct Vertex
{
	float2 st; // presumably texture coordinates
	uint c;    // four 8-bit channels; cs_main unpacks them with shifts of 0/8/16/24
	float q;
	uint xy;   // presumably packed x/y position
	uint z;
	uint uv;   // presumably packed u/v
	uint f;
};
// Resources used by cs_main.

// Read/write raw buffer; cs_main addresses it with the offsets computed from the
// Frame*/ZBuf* tables below.
RWByteAddressBuffer VideoMemory : register(u0);
// Vertices of the batch, fetched through IndexBuffer.
StructuredBuffer<Vertex> VertexBuffer : register(t0);
// Indices into VertexBuffer; cs_main consumes them three at a time (triangles).
Buffer<uint> IndexBuffer : register(t1);
// Address lookup tables: cs_main forms an address as RowOffset[y] + ColOffset[x],
// once for the frame buffer and once for the z-buffer.
Buffer<int> FrameRowOffset : register(t2);
Buffer<int> FrameColOffset : register(t3);
Buffer<int> ZBufRowOffset : register(t4);
Buffer<int> ZBufColOffset : register(t5);
// Placeholder for the per-draw state; currently empty.
// NOTE(review): constant buffers are normally bound to b# registers - confirm
// that c0 is intended and accepted by the shader compiler in use.
cbuffer DrawingEnvironment : register(c0)
{
// TODO
};
// one group is 16x8 pixels and one thread does 2 pixels, otherwise 16-bit targets could not be read-merged-written safely
// neighbouring pixels are next to each other in memory, so at least we don't have to calculate the address twice
// TODO: groupshared memory is said to be faster; try unswizzling the corresponding chunk of memory into it initially (how to do that once, by only one thread?), then write it back when finished, unless it was untouched
// Experimental compute-shader rasterizer entry point (test code).
//
// Thread layout: 8x8 threads per group, each thread owning two horizontally
// adjacent pixels, so one group covers a 16x8 pixel tile.
//
// gid - group index (SV_GroupID), i.e. which 16x8 tile this group handles
// tid - thread index inside the group (SV_GroupThreadID)
//
// For now every thread walks the whole index list as triangles and writes the
// unpacked color of the first vertex to VideoMemory at its frame address.
[numthreads(8, 8, 1)]
void cs_main(uint3 gid : SV_GroupID, uint3 tid : SV_GroupThreadID)
{
	uint count;

	IndexBuffer.GetDimensions(count);

	// Pixel position of this thread: the group index has to be scaled by the tile
	// size (16x8), otherwise neighbouring groups would overlap each other.
	uint x = gid.x * 16 + tid.x * 2;
	uint y = gid.y * 8 + tid.y;

	// The addresses depend only on x, y, so compute them once instead of per triangle.
	uint fa = FrameRowOffset[y] + FrameColOffset[x];
	uint za = ZBufRowOffset[y] + ZBufColOffset[x]; // not used yet, needed by the z-test below

	// #if GS_PRIM == 2 (triangle)

	// i + 2 < count: never fetch a trailing, incomplete triangle
	for(uint i = 0; i + 2 < count; i += 3)
	{
		Vertex v0 = VertexBuffer[IndexBuffer[i + 0]];
		Vertex v1 = VertexBuffer[IndexBuffer[i + 1]];
		Vertex v2 = VertexBuffer[IndexBuffer[i + 2]];

		// TODO: quickly reject if x, y is outside the triangle
		// TODO: calculate interpolated values at x, y
		// TODO: run the GS pipeline
		// TODO: repeat for x+1, y
		// TODO: output two pixels (might be better to process a single pixel, more threads, if there is no 16-bit target involved)

		// testing...

		// NOTE(review): per the RWByteAddressBuffer documentation Load4/Store4 transfer
		// four 32-bit values (16 bytes) starting at the byte address, they do not
		// pack/unpack a single uint - confirm, and use Load/Store for one 32-bit pixel.
		uint4 c = VideoMemory.Load4(fa);

		// v0.c is auto-converted to uint4 and shifted per component in one instruction
		// (=> ushr r1.yzw, r1.xxxx, l(0, 8, 16, 24)), SSE is embarrassed
		c = (v0.c >> uint4(0, 8, 16, 24)) & 0xff;

		VideoMemory.Store4(fa, c);
	}

	// #endif
}
// TODO: DrawPoint (this is going to be a waste of resources)
// TODO: DrawLine (line hit-test, will it work?)
// TODO: DrawSprite (similar to DrawTriangle)
// TODO: if read-backs are too slow, implement GSState::Write/FlushWrite/Read/clut.Write in a compute shader
// TODO: unswizzle pages from VideoMemory to the texture cache (if they are marked as valid, otherwise upload from GSLocalMemory::m_vm8)

View File

@ -81,12 +81,13 @@
#define IDR_INTERLACE_FX 10003
#define IDD_CONFIG2 10004
#define IDR_FXAA_FX 10005
#define IDR_CS_FX 10006
// Next default values for new objects
//
#ifdef APSTUDIO_INVOKED
#ifndef APSTUDIO_READONLY_SYMBOLS
#define _APS_NEXT_RESOURCE_VALUE 10006
#define _APS_NEXT_RESOURCE_VALUE 10007
#define _APS_NEXT_COMMAND_VALUE 32771
#define _APS_NEXT_CONTROL_VALUE 2050
#define _APS_NEXT_SYMED_VALUE 5000