mirror of https://github.com/PCSX2/pcsx2.git
GSdx: added a shortcut in GSState::Transfer for the most frequent vertex format I found (helps quite a lot), less thread-syncing for the sw renderer, and the bios boot logo was fixed (just had to clear the memory on reset).
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5072 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
da4ea83134
commit
9aabcc1701
|
@ -28,8 +28,8 @@ const GSVector4i GPULocalMemory::m_xxbx(0x00007c00);
|
|||
const GSVector4i GPULocalMemory::m_xgxx(0x000003e0);
|
||||
const GSVector4i GPULocalMemory::m_rxxx(0x0000001f);
|
||||
|
||||
#define VM_SIZE ((1 << (12 + 11)) * sizeof(uint16))
|
||||
#define VM_ALLOC_SIZE (VM_SIZE * 2)
|
||||
#define VM_REAL_SIZE ((1 << (12 + 11)) * sizeof(uint16))
|
||||
#define VM_ALLOC_SIZE (VM_REAL_SIZE * 2)
|
||||
#define TEX_ALLOC_SIZE (256 * 256 * (1 + 1 + 4) * 32)
|
||||
|
||||
GPULocalMemory::GPULocalMemory()
|
||||
|
@ -39,7 +39,7 @@ GPULocalMemory::GPULocalMemory()
|
|||
|
||||
//
|
||||
|
||||
int size = VM_SIZE;
|
||||
int size = VM_REAL_SIZE;
|
||||
|
||||
m_vm = (uint16*)vmalloc(VM_ALLOC_SIZE, false);
|
||||
|
||||
|
|
|
@ -207,7 +207,7 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1)
|
|||
s_gs = NULL;
|
||||
}
|
||||
|
||||
if(renderer == 12)
|
||||
if(renderer == 15)
|
||||
{
|
||||
#ifdef _WINDOWS
|
||||
|
||||
|
|
|
@ -90,6 +90,11 @@ enum GIF_REG
|
|||
GIF_REG_NOP = 0x0f,
|
||||
};
|
||||
|
||||
// Identifiers for composite GIF register formats, kept separate from the single register ids in GIF_REG.
// NOTE(review): GIF_REG_STQRGBAXYZF2 presumably names the ST + RGBAQ + XYZF2 register sequence that
// GIFPath::SetTag classifies as TYPE_STQRGBAXYZF2 - confirm against the code that consumes this value.
enum GIF_REG_COMPLEX
|
||||
{
|
||||
GIF_REG_STQRGBAXYZF2 = 0x00,
|
||||
};
|
||||
|
||||
enum GIF_A_D_REG
|
||||
{
|
||||
GIF_A_D_REG_PRIM = 0x00,
|
||||
|
@ -1093,9 +1098,11 @@ __aligned(struct, 32) GIFPath
|
|||
uint32 reg;
|
||||
uint32 nreg;
|
||||
uint32 nloop;
|
||||
uint32 adonly;
|
||||
uint32 type;
|
||||
GSVector4i regs;
|
||||
|
||||
enum {TYPE_UNKNOWN, TYPE_ADONLY, TYPE_STQRGBAXYZF2};
|
||||
|
||||
void SetTag(const void* mem)
|
||||
{
|
||||
GSVector4i v = GSVector4i::load<false>(mem);
|
||||
|
@ -1104,7 +1111,9 @@ __aligned(struct, 32) GIFPath
|
|||
regs = v.uph8(v >> 4) & 0x0f0f0f0f;
|
||||
nreg = tag.NREG ? tag.NREG : 16;
|
||||
nloop = tag.NLOOP;
|
||||
adonly = regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1;
|
||||
type = TYPE_UNKNOWN;
|
||||
if(regs.u32[0] == 0x00040102 && nreg == 3) type = TYPE_STQRGBAXYZF2;
|
||||
else if(regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1) type = TYPE_ADONLY;
|
||||
}
|
||||
|
||||
__forceinline uint8 GetReg()
|
||||
|
|
|
@ -729,7 +729,6 @@ void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t cou
|
|||
m_vb = NULL;
|
||||
|
||||
m_vertex.start = 0;
|
||||
m_vertex.count = 0;
|
||||
m_vertex.limit = std::max<int>(count * 3 / 2, 11000);
|
||||
}
|
||||
|
||||
|
@ -798,7 +797,7 @@ void GSDevice11::IASetIndexBuffer(const void* index, size_t count)
|
|||
m_ib_old = m_ib;
|
||||
m_ib = NULL;
|
||||
|
||||
m_index.count = 0;
|
||||
m_index.start = 0;
|
||||
m_index.limit = std::max<int>(count * 3 / 2, 11000);
|
||||
}
|
||||
|
||||
|
@ -904,7 +903,11 @@ void GSDevice11::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
|
|||
{
|
||||
PSSetShaderResource(0, sr0);
|
||||
PSSetShaderResource(1, sr1);
|
||||
PSSetShaderResource(2, NULL);
|
||||
|
||||
for(int i = 2; i < countof(m_state.ps_srv); i++)
|
||||
{
|
||||
PSSetShaderResource(i, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
|
||||
|
@ -913,6 +916,13 @@ void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
|
|||
|
||||
if(sr) srv = *(GSTexture11*)sr;
|
||||
|
||||
PSSetShaderResourceView(i, srv);
|
||||
}
|
||||
|
||||
void GSDevice11::PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv)
|
||||
{
|
||||
ASSERT(i < countof(m_state.ps_srv));
|
||||
|
||||
if(m_state.ps_srv[i] != srv)
|
||||
{
|
||||
m_state.ps_srv[i] = srv;
|
||||
|
@ -944,14 +954,14 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
|
|||
|
||||
if(m_srv_changed)
|
||||
{
|
||||
m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv);
|
||||
m_ctx->PSSetShaderResources(0, countof(m_state.ps_srv), m_state.ps_srv);
|
||||
|
||||
m_srv_changed = false;
|
||||
}
|
||||
|
||||
if(m_ss_changed)
|
||||
{
|
||||
m_ctx->PSSetSamplers(0, 3, m_state.ps_ss);
|
||||
m_ctx->PSSetSamplers(0, countof(m_state.ps_ss), m_state.ps_ss);
|
||||
|
||||
m_ss_changed = false;
|
||||
}
|
||||
|
@ -1036,6 +1046,8 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector
|
|||
m_ctx->OMSetRenderTargets(1, &rtv, dsv);
|
||||
}
|
||||
|
||||
memset(m_state.uav, 0, sizeof(m_state.uav));
|
||||
|
||||
if(m_state.viewport != rt->GetSize())
|
||||
{
|
||||
m_state.viewport = rt->GetSize();
|
||||
|
@ -1064,6 +1076,52 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector
|
|||
}
|
||||
}
|
||||
|
||||
void GSDevice11::OMSetRenderTargets(const GSVector2i& rtsize, ID3D11UnorderedAccessView** uav, int count, const GSVector4i* scissor)
|
||||
{
|
||||
for(int i = 0; i < count; i++)
|
||||
{
|
||||
if(m_state.uav[i] != uav[i])
|
||||
{
|
||||
memcpy(m_state.uav, uav, sizeof(uav[0]) * count);
|
||||
memset(m_state.uav + count, 0, sizeof(m_state.uav) - sizeof(uav[0]) * count);
|
||||
|
||||
m_ctx->OMSetRenderTargetsAndUnorderedAccessViews(0, NULL, NULL, 0, count, uav, NULL);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
m_state.rtv = NULL;
|
||||
m_state.dsv = NULL;
|
||||
|
||||
if(m_state.viewport != rtsize)
|
||||
{
|
||||
m_state.viewport = rtsize;
|
||||
|
||||
D3D11_VIEWPORT vp;
|
||||
|
||||
memset(&vp, 0, sizeof(vp));
|
||||
|
||||
vp.TopLeftX = 0;
|
||||
vp.TopLeftY = 0;
|
||||
vp.Width = (float)rtsize.x;
|
||||
vp.Height = (float)rtsize.y;
|
||||
vp.MinDepth = 0.0f;
|
||||
vp.MaxDepth = 1.0f;
|
||||
|
||||
m_ctx->RSSetViewports(1, &vp);
|
||||
}
|
||||
|
||||
GSVector4i r = scissor ? *scissor : GSVector4i(rtsize).zwxy();
|
||||
|
||||
if(!m_state.scissor.eq(r))
|
||||
{
|
||||
m_state.scissor = r;
|
||||
|
||||
m_ctx->RSSetScissorRects(1, r);
|
||||
}
|
||||
}
|
||||
|
||||
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il)
|
||||
{
|
||||
HRESULT hr;
|
||||
|
@ -1135,6 +1193,38 @@ HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MAC
|
|||
return hr;
|
||||
}
|
||||
|
||||
// Compiles the geometry shader 'entry' from the resource 'id' and creates it with a stream output declaration.
//
// id     - resource identifier of the shader source inside this module
// entry  - entry point name
// macro  - macros handed to PrepareShaderMacro before compiling
// gs     - receives the created geometry shader
// layout - stream output declaration entries
// count  - number of entries in 'layout'
//
// Returns the HRESULT of the compile step when it failed, otherwise the HRESULT of the shader creation.

HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count)
{
	vector<D3D11_SHADER_MACRO> defines;

	PrepareShaderMacro(defines, macro);

	CComPtr<ID3D11Blob> bytecode, messages;

	HRESULT hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &defines[0], NULL, entry, m_shader.gs.c_str(), 0, 0, NULL, &bytecode, &messages, NULL);

	// whatever the compiler reported is printed, even when the compilation succeeded

	if(messages)
	{
		printf("%s\n", (const char*)messages->GetBufferPointer());
	}

	if(!FAILED(hr))
	{
		hr = m_dev->CreateGeometryShaderWithStreamOutput((void*)bytecode->GetBufferPointer(), bytecode->GetBufferSize(), layout, count, NULL, 0, D3D11_SO_NO_RASTERIZED_STREAM, NULL, gs);
	}

	return hr;
}
|
||||
|
||||
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps)
|
||||
{
|
||||
HRESULT hr;
|
||||
|
|
|
@ -60,7 +60,7 @@ class GSDevice11 : public GSDeviceDX
|
|||
ID3D11VertexShader* vs;
|
||||
ID3D11Buffer* vs_cb;
|
||||
ID3D11GeometryShader* gs;
|
||||
ID3D11ShaderResourceView* ps_srv[3];
|
||||
ID3D11ShaderResourceView* ps_srv[16];
|
||||
ID3D11PixelShader* ps;
|
||||
ID3D11Buffer* ps_cb;
|
||||
ID3D11SamplerState* ps_ss[3];
|
||||
|
@ -73,6 +73,7 @@ class GSDevice11 : public GSDeviceDX
|
|||
float bf;
|
||||
ID3D11RenderTargetView* rtv;
|
||||
ID3D11DepthStencilView* dsv;
|
||||
ID3D11UnorderedAccessView* uav[8];
|
||||
} m_state;
|
||||
|
||||
public: // TODO
|
||||
|
@ -178,6 +179,7 @@ public:
|
|||
void GSSetShader(ID3D11GeometryShader* gs);
|
||||
void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1);
|
||||
void PSSetShaderResource(int i, GSTexture* sr);
|
||||
void PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv);
|
||||
void PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb);
|
||||
void PSSetSamplerState(ID3D11SamplerState* ss0, ID3D11SamplerState* ss1, ID3D11SamplerState* ss2 = NULL);
|
||||
void CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv);
|
||||
|
@ -186,6 +188,7 @@ public:
|
|||
void OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref);
|
||||
void OMSetBlendState(ID3D11BlendState* bs, float bf);
|
||||
void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL);
|
||||
void OMSetRenderTargets(const GSVector2i& rtsize, ID3D11UnorderedAccessView** uav, int count, const GSVector4i* scissor = NULL);
|
||||
|
||||
void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim);
|
||||
void SetupVS(VSSelector sel, const VSConstantBuffer* cb);
|
||||
|
@ -202,6 +205,7 @@ public:
|
|||
|
||||
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il);
|
||||
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs);
|
||||
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count);
|
||||
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps);
|
||||
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs);
|
||||
HRESULT CompileShader(const char* fn, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs);
|
||||
|
|
|
@ -53,7 +53,8 @@ public:
|
|||
GSOffset* fb;
|
||||
GSOffset* zb;
|
||||
GSOffset* tex;
|
||||
GSPixelOffset4* fzb;
|
||||
GSPixelOffset* fzb;
|
||||
GSPixelOffset4* fzb4;
|
||||
} offset;
|
||||
|
||||
GSDrawingContext()
|
||||
|
|
|
@ -473,6 +473,62 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
|
|||
return o;
|
||||
}
|
||||
|
||||
// Returns the table of frame and z buffer pixel offsets for the given FRAME/ZBUF pair.
//
// Tables are cached in m_pomap. The key packs FBP, ZBP, FBW and a 4 bit code per pixel format,
// a later request that produces the same key gets the same pointer back. A missing table is
// allocated with _aligned_malloc, filled in and stored in the map before it is returned.

GSPixelOffset* GSLocalMemory::GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF)
{
	const uint32 fbp = FRAME.Block();
	const uint32 zbp = ZBUF.Block();
	const uint32 fpsm = FRAME.PSM;
	const uint32 zpsm = ZBUF.PSM;
	const uint32 bw = FRAME.FBW;

	ASSERT(m_psm[fpsm].trbpp > 8 || m_psm[zpsm].trbpp > 8);

	// "(psm & 0x0f) ^ ((psm & 0x30) >> 2)" folds the format into 4 bits,
	// per the original note the result is unique for render target formats (only)

	const uint32 fcode = (fpsm & 0x0f) ^ ((fpsm & 0x30) >> 2);
	const uint32 zcode = (zpsm & 0x0f) ^ ((zpsm & 0x30) >> 2);

	const uint32 key = FRAME.FBP | (ZBUF.ZBP << 9) | (bw << 18) | (fcode << 24) | (zcode << 28);

	hash_map<uint32, GSPixelOffset*>::iterator cached = m_pomap.find(key);

	if(cached != m_pomap.end())
	{
		return cached->second;
	}

	GSPixelOffset* po = (GSPixelOffset*)_aligned_malloc(sizeof(GSPixelOffset), 32);

	po->hash = key;
	po->fbp = fbp;
	po->zbp = zbp;
	po->fpsm = fpsm;
	po->zpsm = zpsm;
	po->bw = bw;

	pixelAddress faddr = m_psm[fpsm].pa;
	pixelAddress zaddr = m_psm[zpsm].pa;

	// every stored offset is shifted left by (bpp >> 5) of its format

	const int fshift = m_psm[fpsm].bpp >> 5;
	const int zshift = m_psm[zpsm].bpp >> 5;

	// row part: address of the first pixel of each line, x = frame, y = z buffer

	for(int y = 0; y < 2048; y++)
	{
		po->row[y].x = (int)faddr(0, y, fbp, bw) << fshift;
		po->row[y].y = (int)zaddr(0, y, zbp, bw) << zshift;
	}

	// column part: taken from rowOffset[0] of each format, x = frame, y = z buffer

	for(int x = 0; x < 2048; x++)
	{
		po->col[x].x = m_psm[fpsm].rowOffset[0][x] << fshift;
		po->col[x].y = m_psm[zpsm].rowOffset[0][x] << zshift;
	}

	m_pomap[key] = po;

	return po;
}
|
||||
|
||||
GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF)
|
||||
{
|
||||
uint32 fbp = FRAME.Block();
|
||||
|
|
|
@ -56,6 +56,16 @@ public:
|
|||
uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL);
|
||||
};
|
||||
|
||||
struct GSPixelOffset
|
||||
{
|
||||
// 16 bit offsets (m_vm16[...])
|
||||
|
||||
GSVector2i row[2048]; // f yn | z yn
|
||||
GSVector2i col[2048]; // f xn | z xn
|
||||
uint32 hash;
|
||||
uint32 fbp, zbp, fpsm, zpsm, bw;
|
||||
};
|
||||
|
||||
struct GSPixelOffset4
|
||||
{
|
||||
// 16 bit offsets (m_vm16[...])
|
||||
|
@ -158,6 +168,7 @@ protected:
|
|||
//
|
||||
|
||||
hash_map<uint32, GSOffset*> m_omap;
|
||||
hash_map<uint32, GSPixelOffset*> m_pomap;
|
||||
hash_map<uint32, GSPixelOffset4*> m_po4map;
|
||||
hash_map<uint64, vector<GSVector2i>*> m_p2tmap;
|
||||
|
||||
|
@ -166,6 +177,7 @@ public:
|
|||
virtual ~GSLocalMemory();
|
||||
|
||||
GSOffset* GetOffset(uint32 bp, uint32 bw, uint32 psm);
|
||||
GSPixelOffset* GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF);
|
||||
GSPixelOffset4* GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF);
|
||||
vector<GSVector2i>* GetPage2TileMap(const GIFRegTEX0& TEX0);
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ public:
|
|||
|
||||
enum counter_t
|
||||
{
|
||||
Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad,
|
||||
Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, SyncPoint,
|
||||
CounterLast,
|
||||
};
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe
|
|||
m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false);
|
||||
m_edge.count = 0;
|
||||
|
||||
m_myscanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64);
|
||||
m_scanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64);
|
||||
|
||||
int row = 0;
|
||||
|
||||
|
@ -48,14 +48,14 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe
|
|||
{
|
||||
for(int i = 0; i < threads; i++, row++)
|
||||
{
|
||||
m_myscanline[row] = i == id ? 1 : 0;
|
||||
m_scanline[row] = i == id ? 1 : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GSRasterizer::~GSRasterizer()
|
||||
{
|
||||
_aligned_free(m_myscanline);
|
||||
_aligned_free(m_scanline);
|
||||
|
||||
if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSVertexSW) * 2048);
|
||||
|
||||
|
@ -66,7 +66,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top) const
|
|||
{
|
||||
ASSERT(top >= 0 && top < 2048);
|
||||
|
||||
return m_myscanline[top >> THREAD_HEIGHT] != 0;
|
||||
return m_scanline[top >> THREAD_HEIGHT] != 0;
|
||||
}
|
||||
|
||||
bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
|
||||
|
@ -78,7 +78,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
|
|||
|
||||
while(top < bottom)
|
||||
{
|
||||
if(m_myscanline[top++])
|
||||
if(m_scanline[top++])
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
@ -91,9 +91,9 @@ int GSRasterizer::FindMyNextScanline(int top) const
|
|||
{
|
||||
int i = top >> THREAD_HEIGHT;
|
||||
|
||||
if(m_myscanline[i] == 0)
|
||||
if(m_scanline[i] == 0)
|
||||
{
|
||||
while(m_myscanline[++i] == 0);
|
||||
while(m_scanline[++i] == 0);
|
||||
|
||||
top = i << THREAD_HEIGHT;
|
||||
}
|
||||
|
@ -904,11 +904,20 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS
|
|||
|
||||
//
|
||||
|
||||
GSRasterizerList::GSRasterizerList()
|
||||
: GSJobQueue<shared_ptr<GSRasterizerData> >()
|
||||
, m_sync_count(0)
|
||||
, m_syncpoint_count(0)
|
||||
// Builds the table that maps a scanline group (row >> THREAD_HEIGHT) to a worker index.
//
// threads - number of workers the rows are distributed over, round-robin (row % threads)
// perfmon - performance monitor stored in m_perfmon
//
// The table is released with _aligned_free in the destructor.

GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon)
	: m_perfmon(perfmon)
{
	// the allocation keeps 16 spare bytes after the last row, they get the same pattern as the rows

	const int entries = (2048 >> THREAD_HEIGHT) + 16;

	m_scanline = (uint8*)_aligned_malloc(entries, 64);

	// a non-positive count would never advance through the table, fall back to a single worker

	if(threads < 1)
	{
		threads = 1;
	}

	// bounded by the allocation size, so a large thread count cannot write past the buffer

	for(int row = 0; row < entries; row++)
	{
		m_scanline[row] = (uint8)(row % threads);
	}
}
|
||||
|
||||
GSRasterizerList::~GSRasterizerList()
|
||||
|
@ -917,31 +926,54 @@ GSRasterizerList::~GSRasterizerList()
|
|||
{
|
||||
delete *i;
|
||||
}
|
||||
|
||||
_aligned_free(m_scanline);
|
||||
}
|
||||
|
||||
void GSRasterizerList::Queue(shared_ptr<GSRasterizerData> data)
|
||||
{
|
||||
// disable dispatcher thread for now and pass-through directly,
|
||||
// would only be relevant if data->syncpoint was utilized more,
|
||||
// it would hide the syncing latency from the main gs thread
|
||||
if(data->syncpoint)
|
||||
{
|
||||
Sync();
|
||||
}
|
||||
|
||||
// Push(data);
|
||||
GSVector4i r = data->bbox.rintersect(data->scissor);
|
||||
|
||||
Process(data); m_count++;
|
||||
ASSERT(r.top >= 0 && r.top < 2048 && r.bottom >= 0 && r.bottom < 2048);
|
||||
|
||||
int top = r.top >> THREAD_HEIGHT;
|
||||
int bottom = std::min<int>((r.bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT, top + m_workers.size());
|
||||
|
||||
while(top < bottom)
|
||||
{
|
||||
m_workers[m_scanline[top++]]->Push(data);
|
||||
}
|
||||
}
|
||||
|
||||
void GSRasterizerList::Sync()
|
||||
{
|
||||
if(GetCount() == 0) return;
|
||||
if(!IsSynced())
|
||||
{
|
||||
for(size_t i = 0; i < m_workers.size(); i++)
|
||||
{
|
||||
m_workers[i]->Wait();
|
||||
}
|
||||
|
||||
Wait(); // first dispatch all items to workers
|
||||
m_perfmon->Put(GSPerfMon::SyncPoint, 1);
|
||||
}
|
||||
}
|
||||
|
||||
bool GSRasterizerList::IsSynced() const
|
||||
{
|
||||
for(size_t i = 0; i < m_workers.size(); i++)
|
||||
{
|
||||
m_workers[i]->Wait(); // then wait all workers to finish their jobs
|
||||
if(!m_workers[i]->IsEmpty())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
m_sync_count++;
|
||||
return true;
|
||||
}
|
||||
|
||||
int GSRasterizerList::GetPixels(bool reset)
|
||||
|
@ -956,24 +988,6 @@ int GSRasterizerList::GetPixels(bool reset)
|
|||
return pixels;
|
||||
}
|
||||
|
||||
void GSRasterizerList::Process(shared_ptr<GSRasterizerData>& item)
|
||||
{
|
||||
if(item->syncpoint)
|
||||
{
|
||||
for(size_t i = 0; i < m_workers.size(); i++)
|
||||
{
|
||||
m_workers[i]->Wait();
|
||||
}
|
||||
|
||||
m_syncpoint_count++;
|
||||
}
|
||||
|
||||
for(size_t i = 0; i < m_workers.size(); i++)
|
||||
{
|
||||
m_workers[i]->Push(item);
|
||||
}
|
||||
}
|
||||
|
||||
// GSRasterizerList::GSWorker
|
||||
|
||||
GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r)
|
||||
|
@ -994,16 +1008,6 @@ int GSRasterizerList::GSWorker::GetPixels(bool reset)
|
|||
return m_r->GetPixels(reset);
|
||||
}
|
||||
|
||||
void GSRasterizerList::GSWorker::Push(const shared_ptr<GSRasterizerData>& item)
|
||||
{
|
||||
GSVector4i r = item->bbox.rintersect(item->scissor);
|
||||
|
||||
if(m_r->IsOneOfMyScanlines(r.top, r.bottom))
|
||||
{
|
||||
GSJobQueue<shared_ptr<GSRasterizerData> >::Push(item);
|
||||
}
|
||||
}
|
||||
|
||||
void GSRasterizerList::GSWorker::Process(shared_ptr<GSRasterizerData>& item)
|
||||
{
|
||||
m_r->Draw(item.get());
|
||||
|
|
|
@ -109,6 +109,7 @@ public:
|
|||
|
||||
virtual void Queue(shared_ptr<GSRasterizerData> data) = 0;
|
||||
virtual void Sync() = 0;
|
||||
virtual bool IsSynced() const = 0;
|
||||
virtual int GetPixels(bool reset = true) = 0;
|
||||
};
|
||||
|
||||
|
@ -119,7 +120,7 @@ protected:
|
|||
IDrawScanline* m_ds;
|
||||
int m_id;
|
||||
int m_threads;
|
||||
uint8* m_myscanline;
|
||||
uint8* m_scanline;
|
||||
GSVector4i m_scissor;
|
||||
GSVector4 m_fscissor_x;
|
||||
GSVector4 m_fscissor_y;
|
||||
|
@ -155,12 +156,12 @@ public:
|
|||
|
||||
void Queue(shared_ptr<GSRasterizerData> data);
|
||||
void Sync() {}
|
||||
bool IsSynced() const {return true;}
|
||||
int GetPixels(bool reset);
|
||||
};
|
||||
|
||||
class GSRasterizerList
|
||||
: public IRasterizer
|
||||
, private GSJobQueue<shared_ptr<GSRasterizerData> >
|
||||
{
|
||||
protected:
|
||||
class GSWorker : public GSJobQueue<shared_ptr<GSRasterizerData> >
|
||||
|
@ -175,17 +176,14 @@ protected:
|
|||
|
||||
// GSJobQueue
|
||||
|
||||
void Push(const shared_ptr<GSRasterizerData>& item);
|
||||
void Process(shared_ptr<GSRasterizerData>& item);
|
||||
};
|
||||
|
||||
GSPerfMon* m_perfmon;
|
||||
vector<GSWorker*> m_workers;
|
||||
uint8* m_scanline;
|
||||
|
||||
GSRasterizerList();
|
||||
|
||||
// GSJobQueue
|
||||
|
||||
void Process(shared_ptr<GSRasterizerData>& item);
|
||||
GSRasterizerList(int threads, GSPerfMon* perfmon);
|
||||
|
||||
public:
|
||||
virtual ~GSRasterizerList();
|
||||
|
@ -200,7 +198,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
GSRasterizerList* rl = new GSRasterizerList();
|
||||
GSRasterizerList* rl = new GSRasterizerList(threads, perfmon);
|
||||
|
||||
for(int i = 0; i < threads; i++)
|
||||
{
|
||||
|
@ -211,12 +209,10 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
int m_sync_count;
|
||||
int m_syncpoint_count;
|
||||
|
||||
// IRasterizer
|
||||
|
||||
void Queue(shared_ptr<GSRasterizerData> data);
|
||||
void Sync();
|
||||
bool IsSynced() const;
|
||||
int GetPixels(bool reset);
|
||||
};
|
||||
|
|
|
@ -304,6 +304,8 @@ void GSRenderer::VSync(int field)
|
|||
ResetDevice();
|
||||
}
|
||||
|
||||
m_dev->AgePool();
|
||||
|
||||
// osd
|
||||
|
||||
if((m_perfmon.GetFrame() & 0x1f) == 0)
|
||||
|
@ -332,7 +334,7 @@ void GSRenderer::VSync(int field)
|
|||
s2.c_str(),
|
||||
theApp.m_gs_interlace[m_interlace].name.c_str(),
|
||||
theApp.m_gs_aspectratio[m_aspectratio].name.c_str(),
|
||||
(int)m_perfmon.Get(GSPerfMon::Quad),
|
||||
(int)m_perfmon.Get(GSPerfMon::SyncPoint),
|
||||
(int)m_perfmon.Get(GSPerfMon::Prim),
|
||||
(int)m_perfmon.Get(GSPerfMon::Draw),
|
||||
m_perfmon.CPU(),
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
#include "GSRendererCS.h"
|
||||
|
||||
GSRendererCS::GSRendererCS()
|
||||
: GSRenderer(new GSVertexTraceCS(this), sizeof(GSVertex))
|
||||
: GSRenderer(new GSVertexTraceDX11(this), sizeof(GSVertexHW11))
|
||||
{
|
||||
m_nativeres = true;
|
||||
|
||||
|
@ -41,27 +41,72 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
|
|||
if(!__super::CreateDevice(dev_unk))
|
||||
return false;
|
||||
|
||||
HRESULT hr;
|
||||
|
||||
D3D11_DEPTH_STENCIL_DESC dsd;
|
||||
D3D11_BLEND_DESC bsd;
|
||||
D3D11_SAMPLER_DESC sd;
|
||||
D3D11_BUFFER_DESC bd;
|
||||
D3D11_TEXTURE2D_DESC td;
|
||||
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
|
||||
|
||||
D3D_FEATURE_LEVEL level;
|
||||
|
||||
((GSDeviceDX*)dev_unk)->GetFeatureLevel(level);
|
||||
|
||||
if(level < D3D_FEATURE_LEVEL_10_0)
|
||||
if(level < D3D_FEATURE_LEVEL_11_0)
|
||||
return false;
|
||||
|
||||
HRESULT hr;
|
||||
|
||||
GSDevice11* dev = (GSDevice11*)dev_unk;
|
||||
|
||||
D3D11_BUFFER_DESC bd;
|
||||
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
|
||||
D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
|
||||
ID3D11DeviceContext* ctx = *dev;
|
||||
|
||||
delete dev->CreateRenderTarget(1024, 1024, false);
|
||||
|
||||
// empty depth stencil state
|
||||
|
||||
memset(&dsd, 0, sizeof(dsd));
|
||||
|
||||
dsd.StencilEnable = false;
|
||||
dsd.DepthEnable = false;
|
||||
|
||||
hr = (*dev)->CreateDepthStencilState(&dsd, &m_dss);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
// empty blend state
|
||||
|
||||
memset(&bsd, 0, sizeof(bsd));
|
||||
|
||||
bsd.RenderTarget[0].BlendEnable = false;
|
||||
|
||||
hr = (*dev)->CreateBlendState(&bsd, &m_bs);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
// point sampler
|
||||
|
||||
memset(&sd, 0, sizeof(sd));
|
||||
|
||||
sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT;
|
||||
|
||||
sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
|
||||
sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
|
||||
sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
|
||||
|
||||
sd.MaxLOD = FLT_MAX;
|
||||
sd.MaxAnisotropy = 16;
|
||||
sd.ComparisonFunc = D3D11_COMPARISON_NEVER;
|
||||
|
||||
hr = (*dev)->CreateSamplerState(&sd, &m_ss);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
// video memory (4MB)
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
|
||||
bd.ByteWidth = 4 * 1024 * 1024;
|
||||
bd.StructureByteStride = 4;
|
||||
bd.Usage = D3D11_USAGE_DEFAULT;
|
||||
bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
|
||||
bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
|
||||
|
@ -81,35 +126,32 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
|
|||
hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
/*
|
||||
memset(&td, 0, sizeof(td));
|
||||
|
||||
// vertex buffer
|
||||
td.Width = PAGE_SIZE;
|
||||
td.Height = MAX_PAGES;
|
||||
td.Format = DXGI_FORMAT_R8_UINT;
|
||||
td.MipLevels = 1;
|
||||
td.ArraySize = 1;
|
||||
td.SampleDesc.Count = 1;
|
||||
td.SampleDesc.Quality = 0;
|
||||
td.Usage = D3D11_USAGE_DEFAULT;
|
||||
td.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
|
||||
bd.ByteWidth = sizeof(GSVertex) * 10000;
|
||||
bd.StructureByteStride = sizeof(GSVertex);
|
||||
bd.Usage = D3D11_USAGE_DYNAMIC;
|
||||
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
|
||||
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
|
||||
bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, NULL, &m_vb);
|
||||
hr = (*dev)->CreateTexture2D(&td, NULL, &m_vm);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
// index buffer
|
||||
memset(&uavd, 0, sizeof(uavd));
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
uavd.Format = DXGI_FORMAT_R8_UINT;
|
||||
uavd.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D;
|
||||
|
||||
bd.ByteWidth = sizeof(uint32) * 10000 * 3;
|
||||
bd.Usage = D3D11_USAGE_DYNAMIC;
|
||||
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
|
||||
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, NULL, &m_ib);
|
||||
hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
*/
|
||||
// one page, for copying between cpu<->gpu
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
|
@ -121,10 +163,69 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
|
|||
hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
/*
|
||||
memset(&td, 0, sizeof(td));
|
||||
|
||||
td.Width = PAGE_SIZE;
|
||||
td.Height = 1;
|
||||
td.Format = DXGI_FORMAT_R8_UINT;
|
||||
td.MipLevels = 1;
|
||||
td.ArraySize = 1;
|
||||
td.SampleDesc.Count = 1;
|
||||
td.SampleDesc.Quality = 0;
|
||||
td.Usage = D3D11_USAGE_STAGING;
|
||||
td.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
|
||||
|
||||
hr = (*dev)->CreateTexture2D(&td, NULL, &m_pb);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
*/
|
||||
// VSConstantBuffer
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
|
||||
bd.ByteWidth = sizeof(VSConstantBuffer);
|
||||
bd.Usage = D3D11_USAGE_DEFAULT;
|
||||
bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, NULL, &m_vs_cb);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
// PSConstantBuffer
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
|
||||
bd.ByteWidth = sizeof(PSConstantBuffer);
|
||||
bd.Usage = D3D11_USAGE_DEFAULT;
|
||||
bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, NULL, &m_ps_cb);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
//
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
|
||||
bd.ByteWidth = 14 * sizeof(float) * 200000;
|
||||
bd.Usage = D3D11_USAGE_DEFAULT;
|
||||
bd.BindFlags = D3D11_BIND_STREAM_OUTPUT | D3D11_BIND_SHADER_RESOURCE;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, NULL, &m_sob);
|
||||
|
||||
//
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// VSync handler of the compute shader renderer: forwards 'field' to the base class implementation
// and does nothing else, the frame counter printout below is disabled.
void GSRendererCS::VSync(int field)
|
||||
{
|
||||
__super::VSync(field);
|
||||
|
||||
//printf("%lld\n", m_perfmon.GetFrame());
|
||||
}
|
||||
|
||||
GSTexture* GSRendererCS::GetOutput(int i)
|
||||
{
|
||||
// TODO: create a compute shader which unswizzles the frame from m_vm to the output texture
|
||||
|
@ -135,205 +236,342 @@ GSTexture* GSRendererCS::GetOutput(int i)
|
|||
template<uint32 prim, uint32 tme, uint32 fst>
|
||||
void GSRendererCS::ConvertVertex(size_t dst_index, size_t src_index)
|
||||
{
|
||||
// TODO: vertex format more fitting as the input for the compute shader
|
||||
GSVertex* s = (GSVertex*)((GSVertexHW11*)m_vertex.buff + src_index);
|
||||
GSVertexHW11* d = (GSVertexHW11*)m_vertex.buff + dst_index;
|
||||
|
||||
if(src_index != dst_index)
|
||||
GSVector4i v0 = ((GSVector4i*)s)[0];
|
||||
GSVector4i v1 = ((GSVector4i*)s)[1];
|
||||
|
||||
if(tme && fst)
|
||||
{
|
||||
GSVertex v = ((GSVertex*)m_vertex.buff)[src_index];
|
||||
// TODO: modify VertexTrace to read uv from v1.u16[0], v1.u16[1], then this step is not needed
|
||||
|
||||
((GSVertex*)m_vertex.buff)[dst_index] = v;
|
||||
v0 = GSVector4i::cast(GSVector4(v1.uph16()).xyzw(GSVector4::cast(v0))); // uv => st
|
||||
}
|
||||
|
||||
((GSVector4i*)d)[0] = v0;
|
||||
((GSVector4i*)d)[1] = v1;
|
||||
}
|
||||
|
||||
void GSRendererCS::Draw()
|
||||
{
|
||||
HRESULT hr;
|
||||
GSDrawingEnvironment& env = m_env;
|
||||
GSDrawingContext* context = m_context;
|
||||
|
||||
GSVector2i rtsize(2048, 2048);
|
||||
GSVector4i scissor = GSVector4i(context->scissor.in).rintersect(GSVector4i(rtsize).zwxy());
|
||||
GSVector4i bbox = GSVector4i(m_vt->m_min.p.floor().xyxy(m_vt->m_max.p.ceil()));
|
||||
GSVector4i r = bbox.rintersect(scissor);
|
||||
|
||||
uint32 fm = context->FRAME.FBMSK;
|
||||
uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
|
||||
|
||||
if(fm != 0xffffffff)
|
||||
{
|
||||
Write(context->offset.fb, r);
|
||||
|
||||
// TODO: m_tc->InvalidateVideoMem(context->offset.fb, r, false);
|
||||
}
|
||||
|
||||
if(zm != 0xffffffff)
|
||||
{
|
||||
Write(context->offset.zb, r);
|
||||
|
||||
// TODO: m_tc->InvalidateVideoMem(context->offset.zb, r, false);
|
||||
}
|
||||
|
||||
if(PRIM->TME)
|
||||
{
|
||||
m_mem.m_clut.Read32(context->TEX0, env.TEXA);
|
||||
|
||||
GSVector4i r;
|
||||
|
||||
GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt->IsLinear());
|
||||
|
||||
// TODO: unswizzle pages of r to a texture, check m_vm_valid, bit not set cpu->gpu, set gpu->gpu
|
||||
|
||||
// TODO: Write transfer should directly write to m_vm, then Read/Write syncing won't be necessary, clut must be updated with the gpu also
|
||||
|
||||
// TODO: tex = m_tc->LookupSource(context->TEX0, env.TEXA, r);
|
||||
|
||||
// if(!tex) return;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
GSDevice11* dev = (GSDevice11*)m_dev;
|
||||
|
||||
ID3D11DeviceContext* ctx = *dev;
|
||||
|
||||
D3D11_BUFFER_DESC bd;
|
||||
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
|
||||
D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
|
||||
D3D11_MAPPED_SUBRESOURCE map;
|
||||
dev->BeginScene();
|
||||
|
||||
CComPtr<ID3D11ShaderResourceView> vb_srv;
|
||||
CComPtr<ID3D11ShaderResourceView> ib_srv;
|
||||
// SetupOM
|
||||
|
||||
// TODO: cache these in hash_maps
|
||||
ID3D11UnorderedAccessView* uavs[] = {m_vm_uav};
|
||||
|
||||
CComPtr<ID3D11Buffer> fbr, fbc, zbr, zbc;
|
||||
CComPtr<ID3D11ShaderResourceView> fbr_srv, fbc_srv, zbr_srv, zbc_srv;
|
||||
dev->OMSetDepthStencilState(m_dss, 0);
|
||||
dev->OMSetBlendState(m_bs, 0);
|
||||
dev->OMSetRenderTargets(rtsize, uavs, countof(uavs), &scissor);
|
||||
|
||||
// TODO: grow m_vb, m_ib if needed
|
||||
// SetupIA
|
||||
|
||||
if(m_vertex.next > 10000) return;
|
||||
if(m_index.tail > 30000) return;
|
||||
D3D11_PRIMITIVE_TOPOLOGY topology;
|
||||
|
||||
// TODO: fill/advance/discardwhenfull, as in GSDevice11::IASetVertexBuffer/IASetIndexBuffer
|
||||
|
||||
hr = ctx->Map(m_vb, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around
|
||||
|
||||
if(FAILED(hr)) return;
|
||||
|
||||
memcpy(map.pData, m_vertex.buff, sizeof(GSVertex) * m_vertex.next);
|
||||
|
||||
ctx->Unmap(m_vb, 0);
|
||||
|
||||
//
|
||||
|
||||
hr = ctx->Map(m_ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around
|
||||
|
||||
if(FAILED(hr)) return;
|
||||
|
||||
memcpy(map.pData, m_index.buff, sizeof(uint32) * m_index.tail);
|
||||
|
||||
ctx->Unmap(m_ib, 0);
|
||||
|
||||
// TODO: UpdateResource might be faster, based on my experience with the real vertex buffer, write-no-overwrite/discarded dynamic buffer + map is better
|
||||
|
||||
//
|
||||
|
||||
memset(&srvd, 0, sizeof(srvd));
|
||||
|
||||
srvd.Format = DXGI_FORMAT_UNKNOWN;
|
||||
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
|
||||
srvd.Buffer.FirstElement = 0;
|
||||
srvd.Buffer.NumElements = m_vertex.next;
|
||||
|
||||
hr = (*dev)->CreateShaderResourceView(m_vb, &srvd, &vb_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg
|
||||
|
||||
memset(&srvd, 0, sizeof(srvd));
|
||||
|
||||
srvd.Format = DXGI_FORMAT_R32_UINT;
|
||||
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
|
||||
srvd.Buffer.FirstElement = 0;
|
||||
srvd.Buffer.NumElements = m_index.tail;
|
||||
|
||||
hr = (*dev)->CreateShaderResourceView(m_ib, &srvd, &ib_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg
|
||||
|
||||
// fzb offsets
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
|
||||
bd.ByteWidth = sizeof(int) * 4096;
|
||||
bd.StructureByteStride = sizeof(int);
|
||||
bd.Usage = D3D11_USAGE_IMMUTABLE;
|
||||
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
|
||||
|
||||
D3D11_SUBRESOURCE_DATA data;
|
||||
|
||||
memset(&data, 0, sizeof(data));
|
||||
|
||||
data.pSysMem = m_context->offset.fb->pixel.row;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, &data, &fbr);
|
||||
|
||||
data.pSysMem = m_context->offset.fb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, &data, &fbc);
|
||||
|
||||
data.pSysMem = m_context->offset.zb->pixel.row;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, &data, &zbr);
|
||||
|
||||
data.pSysMem = m_context->offset.zb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, &data, &zbc);
|
||||
|
||||
// TODO: D3D10_SHADER_MACRO (primclass, less frequently changing drawing attribs, etc.)
|
||||
|
||||
uint32 sel = 0; // TODO
|
||||
|
||||
hash_map<uint32, CComPtr<ID3D11ComputeShader> >::iterator i = m_cs.find(sel);
|
||||
|
||||
CComPtr<ID3D11ComputeShader> cs;
|
||||
|
||||
if(i == m_cs.end())
|
||||
switch(m_vt->m_primclass)
|
||||
{
|
||||
// hr = dev->CompileShader(IDR_CS_FX, "cs_main", NULL, &cs);
|
||||
hr = dev->CompileShader("E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\cs.fx", "cs_main", NULL, &cs);
|
||||
|
||||
if(FAILED(hr)) return;
|
||||
|
||||
m_cs[sel] = cs;
|
||||
}
|
||||
else
|
||||
{
|
||||
cs = i->second;
|
||||
case GS_POINT_CLASS:
|
||||
topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
|
||||
break;
|
||||
default:
|
||||
__assume(0);
|
||||
}
|
||||
|
||||
//
|
||||
dev->IASetVertexBuffer(m_vertex.buff, sizeof(GSVertexHW11), m_vertex.next);
|
||||
dev->IASetIndexBuffer(m_index.buff, m_index.tail);
|
||||
dev->IASetPrimitiveTopology(topology);
|
||||
|
||||
dev->CSSetShaderUAV(0, m_vm_uav);
|
||||
// SetupVS
|
||||
|
||||
dev->CSSetShaderSRV(0, vb_srv);
|
||||
dev->CSSetShaderSRV(1, ib_srv);
|
||||
dev->CSSetShaderSRV(2, fbr_srv);
|
||||
dev->CSSetShaderSRV(3, fbc_srv);
|
||||
dev->CSSetShaderSRV(4, zbr_srv);
|
||||
dev->CSSetShaderSRV(5, zbc_srv);
|
||||
VSSelector vs_sel;
|
||||
|
||||
dev->CSSetShader(cs);
|
||||
vs_sel.tme = PRIM->TME;
|
||||
vs_sel.fst = PRIM->FST;
|
||||
|
||||
GSVector4i bbox = GSVector4i(0, 0, 640, 512); // TODO: vertex trace
|
||||
VSConstantBuffer vs_cb;
|
||||
|
||||
GSVector4i r = bbox.ralign<Align_Outside>(GSVector2i(16, 8));
|
||||
float sx = 2.0f / (rtsize.x << 4);
|
||||
float sy = 2.0f / (rtsize.y << 4);
|
||||
//float sx = 1.0f / 16;
|
||||
//float sy = 1.0f / 16;
|
||||
float ox = (float)(int)context->XYOFFSET.OFX;
|
||||
float oy = (float)(int)context->XYOFFSET.OFY;
|
||||
|
||||
bool fb = true; // TODO: frame buffer used
|
||||
bool zb = true; // TODO: z-buffer used
|
||||
vs_cb.VertexScale = GSVector4(sx, -sy, 0.0f, 0.0f);
|
||||
vs_cb.VertexOffset = GSVector4(ox * sx + 1, -(oy * sy + 1), 0.0f, -1.0f);
|
||||
//vs_cb.VertexScale = GSVector4(sx, sy, 0.0f, 0.0f);
|
||||
//vs_cb.VertexOffset = GSVector4(ox * sx, oy * sy, 0.0f, -1.0f);
|
||||
|
||||
if(fb) Write(m_context->offset.fb, r);
|
||||
if(zb) Write(m_context->offset.zb, r);
|
||||
{
|
||||
hash_map<uint32, GSVertexShader11 >::const_iterator i = m_vs.find(vs_sel);
|
||||
|
||||
// TODO: constant buffer (frequently changing drawing attribs)
|
||||
// TODO: texture (implement texture cache)
|
||||
// TODO: clut to a palette texture (should be texture1d, not simply buffer, it is random accessed)
|
||||
// TODO: CSSetShaderSRV(6 7 8 ..., texture level 0 1 2 ...) or use Texture3D?
|
||||
// TODO: invalidate texture cache
|
||||
if(i == m_vs.end())
|
||||
{
|
||||
string str[2];
|
||||
|
||||
/*
|
||||
CComPtr<ID3D11Query> q;
|
||||
str[0] = format("%d", vs_sel.tme);
|
||||
str[1] = format("%d", vs_sel.fst);
|
||||
|
||||
D3D11_QUERY_DESC qd;
|
||||
memset(&qd, 0, sizeof(qd));
|
||||
qd.Query = D3D11_QUERY_EVENT;
|
||||
D3D11_SHADER_MACRO macro[] =
|
||||
{
|
||||
{"VS_TME", str[0].c_str()},
|
||||
{"VS_FST", str[1].c_str()},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
hr = (*dev)->CreateQuery(&qd, &q);
|
||||
D3D11_INPUT_ELEMENT_DESC layout[] =
|
||||
{
|
||||
{"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0},
|
||||
{"COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 8, D3D11_INPUT_PER_VERTEX_DATA, 0},
|
||||
{"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0},
|
||||
{"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0},
|
||||
{"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0},
|
||||
{"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0},
|
||||
{"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0},
|
||||
};
|
||||
|
||||
ctx->Begin(q);
|
||||
*/
|
||||
GSVertexShader11 vs;
|
||||
|
||||
printf("[%lld] dispatch %05x %d %05x %d %05x %d %dx%d | %d %d %d\n",
|
||||
__rdtsc(),
|
||||
m_context->FRAME.Block(), m_context->FRAME.PSM,
|
||||
m_context->ZBUF.Block(), m_context->ZBUF.PSM,
|
||||
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
|
||||
PRIM->PRIM, m_vertex.next, m_index.tail);
|
||||
dev->CompileShader(IDR_CS_FX, "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il);
|
||||
|
||||
GSVector4i rsize = r.rsize();
|
||||
m_vs[vs_sel] = vs;
|
||||
|
||||
dev->Dispatch(rsize.z >> 4, rsize.w >> 3, 1); // TODO: pass upper-left corner offset (r.xy) in a const buffer
|
||||
i = m_vs.find(vs_sel);
|
||||
}
|
||||
|
||||
/*
|
||||
ctx->End(q);
|
||||
ctx->UpdateSubresource(m_vs_cb, 0, NULL, &vs_cb, 0, 0); // TODO: only update if changed
|
||||
|
||||
uint64 t0 = __rdtsc();
|
||||
dev->VSSetShader(i->second.vs, m_vs_cb);
|
||||
|
||||
BOOL b;
|
||||
dev->IASetInputLayout(i->second.il);
|
||||
}
|
||||
|
||||
while(S_OK != ctx->GetData(q, &b, sizeof(BOOL), 0)) {}
|
||||
// SetupGS
|
||||
|
||||
printf("%lld\n", __rdtsc() - t0);
|
||||
*/
|
||||
GSSelector gs_sel;
|
||||
|
||||
gs_sel.iip = PRIM->IIP;
|
||||
gs_sel.prim = m_vt->m_primclass;
|
||||
|
||||
CComPtr<ID3D11GeometryShader> gs;
|
||||
|
||||
{
|
||||
hash_map<uint32, CComPtr<ID3D11GeometryShader> >::const_iterator i = m_gs.find(gs_sel);
|
||||
|
||||
if(i != m_gs.end())
|
||||
{
|
||||
gs = i->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
string str[2];
|
||||
|
||||
str[0] = format("%d", gs_sel.iip);
|
||||
str[1] = format("%d", gs_sel.prim);
|
||||
|
||||
D3D11_SHADER_MACRO macro[] =
|
||||
{
|
||||
{"GS_IIP", str[0].c_str()},
|
||||
{"GS_PRIM", str[1].c_str()},
|
||||
{NULL, NULL},
|
||||
};
|
||||
/*
|
||||
D3D11_SO_DECLARATION_ENTRY layout[] =
|
||||
{
|
||||
{0, "SV_Position", 0, 0, 4, 0},
|
||||
{0, "TEXCOORD", 0, 0, 2, 0},
|
||||
{0, "TEXCOORD", 1, 0, 4, 0},
|
||||
{0, "COLOR", 0, 0, 4, 0},
|
||||
};
|
||||
*/
|
||||
dev->CompileShader(IDR_CS_FX, "gs_main", macro, &gs);//, layout, countof(layout));
|
||||
|
||||
m_gs[gs_sel] = gs;
|
||||
}
|
||||
}
|
||||
|
||||
dev->GSSetShader(gs);
|
||||
|
||||
// SetupPS
|
||||
|
||||
PSSelector ps_sel;
|
||||
PSConstantBuffer ps_cb;
|
||||
|
||||
hash_map<uint32, CComPtr<ID3D11PixelShader> >::const_iterator i = m_ps.find(ps_sel);
|
||||
|
||||
if(i == m_ps.end())
|
||||
{
|
||||
string str[15];
|
||||
|
||||
str[0] = format("%d", 0);
|
||||
|
||||
D3D11_SHADER_MACRO macro[] =
|
||||
{
|
||||
{"PS_TODO", str[0].c_str()},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
CComPtr<ID3D11PixelShader> ps;
|
||||
|
||||
dev->CompileShader(IDR_CS_FX, "ps_main", macro, &ps);
|
||||
|
||||
m_ps[ps_sel] = ps;
|
||||
|
||||
i = m_ps.find(ps_sel);
|
||||
}
|
||||
|
||||
ctx->UpdateSubresource(m_ps_cb, 0, NULL, &ps_cb, 0, 0); // TODO: only update if changed
|
||||
|
||||
dev->PSSetSamplerState(m_ss, NULL, NULL);
|
||||
|
||||
dev->PSSetShader(i->second, m_ps_cb);
|
||||
|
||||
// Offset
|
||||
|
||||
OffsetBuffer* fzbo = NULL;
|
||||
|
||||
GetOffsetBuffer(&fzbo);
|
||||
|
||||
dev->PSSetShaderResourceView(0, fzbo->row_view);
|
||||
dev->PSSetShaderResourceView(1, fzbo->col_view);
|
||||
|
||||
// TODO: 2 palette
|
||||
// TODO: 3, 4, ... texture levels
|
||||
|
||||
//ID3D11Buffer* tmp[] = {m_sob};
|
||||
|
||||
//ctx->SOSetTargets(countof(tmp), tmp, NULL);
|
||||
|
||||
dev->DrawIndexedPrimitive();
|
||||
|
||||
//ctx->SOSetTargets(0, NULL, NULL);
|
||||
|
||||
if(0)
|
||||
{
|
||||
HRESULT hr;
|
||||
|
||||
D3D11_BUFFER_DESC bd;
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
|
||||
bd.ByteWidth = 14 * sizeof(float) * 200000;
|
||||
bd.Usage = D3D11_USAGE_STAGING;
|
||||
bd.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
|
||||
|
||||
CComPtr<ID3D11Buffer> sob;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, NULL, &sob);
|
||||
|
||||
ctx->CopyResource(sob, m_sob);
|
||||
|
||||
D3D11_MAPPED_SUBRESOURCE map;
|
||||
|
||||
if(SUCCEEDED(ctx->Map(sob, 0, D3D11_MAP_READ, 0, &map)))
|
||||
{
|
||||
float* f = (float*)map.pData;
|
||||
|
||||
for(int i = 0; i < 12; i++, f += 14)
|
||||
printf("%f %f %f %f\n%f %f\n%f %f %f %f\n%f %f %f %f\n",
|
||||
f[0], f[1], f[2], f[3],
|
||||
f[4], f[5],
|
||||
f[6], f[7], f[8], f[9],
|
||||
f[10], f[11], f[12], f[13]);
|
||||
|
||||
ctx->Unmap(sob, 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(1)
|
||||
{
|
||||
//Read(m_mem.GetOffset(0, 16, PSM_PSMCT32), GSVector4i(0, 0, 1024, 1024), false);
|
||||
|
||||
//
|
||||
if(fm != 0xffffffff) Read(context->offset.fb, r, false);
|
||||
//
|
||||
if(zm != 0xffffffff) Read(context->offset.zb, r, false);
|
||||
|
||||
std::string s;
|
||||
|
||||
s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM);
|
||||
|
||||
//
|
||||
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
|
||||
|
||||
s = format("c:\\temp1\\_%05d_f%lld_zt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM);
|
||||
|
||||
//
|
||||
m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
|
||||
|
||||
//m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024);
|
||||
|
||||
s_n++;
|
||||
}
|
||||
|
||||
dev->EndScene();
|
||||
}
|
||||
|
||||
void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
|
||||
{
|
||||
GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
|
||||
|
||||
Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated
|
||||
Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated (important)
|
||||
|
||||
// TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target
|
||||
// TODO: invalidate texture cache
|
||||
|
@ -356,6 +594,10 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r)
|
|||
|
||||
memset(&box, 0, sizeof(box));
|
||||
|
||||
box.right = 1;
|
||||
box.bottom = 1;
|
||||
box.back = 1;
|
||||
|
||||
uint32* pages = o->GetPages(r);
|
||||
|
||||
for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
|
||||
|
@ -370,10 +612,20 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r)
|
|||
m_vm_valid[row] |= col;
|
||||
|
||||
box.left = page * PAGE_SIZE;
|
||||
box.right = box.left + PAGE_SIZE;
|
||||
box.right = (page + 1) * PAGE_SIZE;
|
||||
|
||||
ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + box.left, 0, 0);
|
||||
ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0);
|
||||
/*
|
||||
// m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4
|
||||
|
||||
box.left = 0;
|
||||
box.right = PAGE_SIZE;
|
||||
box.top = page;
|
||||
box.bottom = box.top + 1;
|
||||
|
||||
ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0);
|
||||
*/
|
||||
if(0)
|
||||
printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page);
|
||||
}
|
||||
}
|
||||
|
@ -391,6 +643,10 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
|
|||
|
||||
memset(&box, 0, sizeof(box));
|
||||
|
||||
box.right = 1;
|
||||
box.bottom = 1;
|
||||
box.back = 1;
|
||||
|
||||
uint32* pages = o->GetPages(r);
|
||||
|
||||
for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
|
||||
|
@ -402,21 +658,34 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
|
|||
|
||||
if(m_vm_valid[row] & col)
|
||||
{
|
||||
if(invalidate) m_vm_valid[row] ^= col;
|
||||
if(invalidate)
|
||||
{
|
||||
m_vm_valid[row] ^= col;
|
||||
}
|
||||
|
||||
box.left = page * PAGE_SIZE;
|
||||
box.right = box.left + PAGE_SIZE;
|
||||
box.right = (page + 1) * PAGE_SIZE;
|
||||
|
||||
ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box);
|
||||
/*
|
||||
// m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4
|
||||
|
||||
box.left = 0;
|
||||
box.right = PAGE_SIZE;
|
||||
box.top = page;
|
||||
box.bottom = box.top + 1;
|
||||
|
||||
ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box);
|
||||
*/
|
||||
D3D11_MAPPED_SUBRESOURCE map;
|
||||
|
||||
if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ_WRITE, 0, &map)))
|
||||
if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ, 0, &map)))
|
||||
{
|
||||
memcpy(m_mem.m_vm8 + box.left, map.pData, PAGE_SIZE);
|
||||
memcpy(m_mem.m_vm8 + page * PAGE_SIZE, map.pData, PAGE_SIZE);
|
||||
|
||||
ctx->Unmap(m_pb, 0);
|
||||
|
||||
if(0)
|
||||
printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page);
|
||||
}
|
||||
}
|
||||
|
@ -424,3 +693,64 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
|
|||
|
||||
delete [] pages;
|
||||
}
|
||||
|
||||
// Returns, through *fzbo, the cached GPU copy of the current context's
// frame/z-buffer offset tables (m_context->offset.fzb), creating it on first use.
//
// The cache is m_offset, keyed by m_context->offset.fzb->hash. On a miss, two
// immutable buffers are created from fzb->row and fzb->col, one shader resource
// view is created per buffer, and the result is stored in m_offset.
//
// Returns true on success. Returns false if any CreateBuffer or
// CreateShaderResourceView call fails; in that case nothing is added to
// m_offset and *fzbo is left untouched, so callers must check the result
// before dereferencing *fzbo.
bool GSRendererCS::GetOffsetBuffer(OffsetBuffer** fzbo)
|
||||
{
|
||||
HRESULT hr;
|
||||
|
||||
GSDevice11* dev = (GSDevice11*)m_dev;
|
||||
|
||||
D3D11_BUFFER_DESC bd;
|
||||
D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
|
||||
D3D11_SUBRESOURCE_DATA data;
|
||||
|
||||
// look the table up by the hash of the current fzb offset object
hash_map<uint32, OffsetBuffer>::iterator i = m_offset.find(m_context->offset.fzb->hash);
|
||||
|
||||
if(i == m_offset.end())
|
||||
{
|
||||
OffsetBuffer ob;
|
||||
|
||||
memset(&bd, 0, sizeof(bd));
|
||||
|
||||
// the same description is used for both the row and the col buffer:
// 2048 GSVector2i entries, contents fixed at creation time
// NOTE(review): assumes fzb->row and fzb->col each provide at least 2048 GSVector2i entries - confirm against GSOffset
bd.ByteWidth = sizeof(GSVector2i) * 2048;
|
||||
bd.Usage = D3D11_USAGE_IMMUTABLE;
|
||||
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
|
||||
|
||||
memset(&data, 0, sizeof(data));
|
||||
|
||||
// row table -> ob.row
data.pSysMem = m_context->offset.fzb->row;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, &data, &ob.row);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
// col table -> ob.col (data is reused, only the source pointer changes)
data.pSysMem = m_context->offset.fzb->col;
|
||||
|
||||
hr = (*dev)->CreateBuffer(&bd, &data, &ob.col);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
memset(&srvd, 0, sizeof(srvd));
|
||||
|
||||
// one view description shared by both buffers: a typed buffer view of
// 2048 elements, each two 32-bit signed integers
srvd.Format = DXGI_FORMAT_R32G32_SINT;
|
||||
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
|
||||
srvd.Buffer.FirstElement = 0;
|
||||
srvd.Buffer.NumElements = 2048;
|
||||
|
||||
hr = (*dev)->CreateShaderResourceView(ob.row, &srvd, &ob.row_view);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
hr = (*dev)->CreateShaderResourceView(ob.col, &srvd, &ob.col_view);
|
||||
|
||||
if(FAILED(hr)) return false;
|
||||
|
||||
// only a fully built entry is cached
m_offset[m_context->offset.fzb->hash] = ob;
|
||||
|
||||
// re-query so that i refers to the entry stored in the map, not the local ob
i = m_offset.find(m_context->offset.fzb->hash);
|
||||
}
|
||||
|
||||
// hand out a pointer to the entry stored in m_offset
*fzbo = &i->second;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -26,28 +26,105 @@
|
|||
|
||||
class GSRendererCS : public GSRenderer
|
||||
{
|
||||
class GSVertexTraceCS : public GSVertexTrace
|
||||
struct VSSelector
|
||||
{
|
||||
public:
|
||||
GSVertexTraceCS(const GSState* state) : GSVertexTrace(state) {}
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint32 tme:1;
|
||||
uint32 fst:1;
|
||||
};
|
||||
|
||||
uint32 key;
|
||||
};
|
||||
|
||||
operator uint32() {return key & 0x3;}
|
||||
|
||||
VSSelector() : key(0) {}
|
||||
};
|
||||
|
||||
// CPU-side copy of the vertex shader constants.
// GSRendererCS::Draw() sets VertexScale to (sx, -sy, 0, 0) and VertexOffset to
// (ox * sx + 1, -(oy * sy + 1), 0, -1), where sx/sy are derived from the render
// target size and ox/oy from the context's XYOFFSET register.
// NOTE(review): __aligned(struct, 32) is a project macro; presumably it requests 32-byte alignment - confirm against its definition.
__aligned(struct, 32) VSConstantBuffer
|
||||
{
|
||||
GSVector4 VertexScale;
|
||||
GSVector4 VertexOffset;
|
||||
};
|
||||
|
||||
// Selector used as the lookup key for the geometry shader cache (m_gs).
// The bit-fields and `key` share storage through the union, so the selector
// can be filled field by field and then read back as one integer.
struct GSSelector
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
// set from PRIM->IIP in Draw()
uint32 iip:1;
|
||||
// set from m_vt->m_primclass in Draw()
uint32 prim:2;
|
||||
};
|
||||
|
||||
uint32 key;
|
||||
};
|
||||
|
||||
// 1 + 2 = 3 selector bits, hence the 0x7 mask
// NOTE(review): assumes the compiler packs the bit-fields into the low bits of key - confirm for the target compilers
operator uint32() {return key & 0x7;}
|
||||
|
||||
// start with all selector bits cleared
GSSelector() : key(0) {}
|
||||
};
|
||||
|
||||
// Selector used as the lookup key for the pixel shader cache (m_ps).
// Currently a placeholder: it carries a single bit named TODO.
struct PSSelector
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
// placeholder bit, no drawing attribute is encoded yet
uint32 TODO:1;
|
||||
};
|
||||
|
||||
uint32 key;
|
||||
};
|
||||
|
||||
// a single selector bit, hence the 0x1 mask
operator uint32() {return key & 0x1;}
|
||||
|
||||
// start with all selector bits cleared
PSSelector() : key(0) {}
|
||||
};
|
||||
|
||||
// CPU-side copy of the pixel shader constants; GSRendererCS::Draw() uploads an
// instance of it to m_ps_cb with UpdateSubresource.
// Currently a placeholder holding one vector named TODO.
// NOTE(review): __aligned(struct, 32) is a project macro; presumably it requests 32-byte alignment - confirm against its definition.
__aligned(struct, 32) PSConstantBuffer
|
||||
{
|
||||
GSVector4 TODO;
|
||||
};
|
||||
|
||||
CComPtr<ID3D11DepthStencilState> m_dss;
|
||||
CComPtr<ID3D11BlendState> m_bs;
|
||||
CComPtr<ID3D11SamplerState> m_ss;
|
||||
CComPtr<ID3D11Buffer> m_vm;
|
||||
//CComPtr<ID3D11Texture2D> m_vm;
|
||||
CComPtr<ID3D11UnorderedAccessView> m_vm_uav;
|
||||
CComPtr<ID3D11Buffer> m_vb;
|
||||
CComPtr<ID3D11Buffer> m_ib;
|
||||
CComPtr<ID3D11Buffer> m_pb;
|
||||
hash_map<uint32, CComPtr<ID3D11ComputeShader> > m_cs;
|
||||
uint32 m_vm_valid[16];
|
||||
CComPtr<ID3D11Buffer> m_pb;
|
||||
//CComPtr<ID3D11Texture2D> m_pb;
|
||||
hash_map<uint32, GSVertexShader11 > m_vs;
|
||||
CComPtr<ID3D11Buffer> m_vs_cb;
|
||||
hash_map<uint32, CComPtr<ID3D11GeometryShader> > m_gs;
|
||||
hash_map<uint32, CComPtr<ID3D11PixelShader> > m_ps;
|
||||
CComPtr<ID3D11Buffer> m_ps_cb;
|
||||
CComPtr<ID3D11Buffer> m_sob;
|
||||
|
||||
void Write(GSOffset* o, const GSVector4i& r);
|
||||
void Read(GSOffset* o, const GSVector4i& r, bool invalidate);
|
||||
|
||||
// GPU-side copy of one frame/z-buffer offset table pair, as built by
// GetOffsetBuffer() and cached in m_offset.
struct OffsetBuffer
|
||||
{
|
||||
// buffers created from the fzb offset's row and col tables
CComPtr<ID3D11Buffer> row, col;
|
||||
// views over row and col; Draw() binds them to pixel shader resource slots 0 and 1
CComPtr<ID3D11ShaderResourceView> row_view, col_view;
|
||||
};
|
||||
|
||||
hash_map<uint32, OffsetBuffer> m_offset;
|
||||
|
||||
bool GetOffsetBuffer(OffsetBuffer** fzbo);
|
||||
|
||||
protected:
|
||||
template<uint32 prim, uint32 tme, uint32 fst>
|
||||
void ConvertVertex(size_t dst_index, size_t src_index);
|
||||
|
||||
bool CreateDevice(GSDevice* dev);
|
||||
void VSync(int field);
|
||||
GSTexture* GetOutput(int i);
|
||||
void Draw();
|
||||
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
|
||||
|
|
|
@ -233,7 +233,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
|
|||
}
|
||||
}
|
||||
|
||||
if (env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST)
|
||||
if(env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST)
|
||||
{
|
||||
ps_sel.colclip = 1;
|
||||
}
|
||||
|
|
|
@ -101,19 +101,18 @@ void GSRendererHW::Reset()
|
|||
|
||||
void GSRendererHW::VSync(int field)
|
||||
{
|
||||
GSRenderer::VSync(field);
|
||||
|
||||
m_tc->IncAge();
|
||||
m_dev->AgePool();
|
||||
|
||||
m_skip = 0;
|
||||
|
||||
if(m_reset)
|
||||
{
|
||||
m_tc->RemoveAll();
|
||||
|
||||
m_reset = false;
|
||||
}
|
||||
|
||||
GSRenderer::VSync(field);
|
||||
|
||||
m_tc->IncAge();
|
||||
|
||||
m_skip = 0;
|
||||
}
|
||||
|
||||
void GSRendererHW::ResetDevice()
|
||||
|
|
|
@ -22,6 +22,10 @@
|
|||
#include "stdafx.h"
|
||||
#include "GSRendererSW.h"
|
||||
|
||||
#define LOG 0
|
||||
|
||||
static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
|
||||
|
||||
const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
|
||||
|
||||
GSRendererSW::GSRendererSW(int threads)
|
||||
|
@ -60,10 +64,9 @@ GSRendererSW::~GSRendererSW()
|
|||
|
||||
void GSRendererSW::Reset()
|
||||
{
|
||||
// TODO: GSreset can come from the main thread too => crash
|
||||
// m_tc->RemoveAll();
|
||||
Sync(-1);
|
||||
|
||||
m_reset = true;
|
||||
m_tc->RemoveAll();
|
||||
|
||||
GSRenderer::Reset();
|
||||
}
|
||||
|
@ -72,6 +75,93 @@ void GSRendererSW::VSync(int field)
|
|||
{
|
||||
Sync(0); // IncAge might delete a cached texture in use
|
||||
|
||||
if(LOG)
|
||||
{
|
||||
fprintf(s_fp, "%lld\n", m_perfmon.GetFrame());
|
||||
|
||||
GSVector4i dr = GetDisplayRect();
|
||||
GSVector4i fr = GetFrameRect();
|
||||
GSVector2i ds = GetDeviceSize();
|
||||
|
||||
fprintf(s_fp, "dr %d %d %d %d, fr %d %d %d %d, ds %d %d\n",
|
||||
dr.x, dr.y, dr.z, dr.w,
|
||||
fr.x, fr.y, fr.z, fr.w,
|
||||
ds.x, ds.y);
|
||||
|
||||
for(int i = 0; i < 2; i++)
|
||||
{
|
||||
if(i == 0 && !m_regs->PMODE.EN1) continue;
|
||||
if(i == 1 && !m_regs->PMODE.EN2) continue;
|
||||
|
||||
fprintf(s_fp, "DISPFB[%d] BP=%05x BW=%d PSM=%d DBX=%d DBY=%d\n",
|
||||
i,
|
||||
m_regs->DISP[i].DISPFB.Block(),
|
||||
m_regs->DISP[i].DISPFB.FBW,
|
||||
m_regs->DISP[i].DISPFB.PSM,
|
||||
m_regs->DISP[i].DISPFB.DBX,
|
||||
m_regs->DISP[i].DISPFB.DBY
|
||||
);
|
||||
|
||||
fprintf(s_fp, "DISPLAY[%d] DX=%d DY=%d DW=%d DH=%d MAGH=%d MAGV=%d\n",
|
||||
i,
|
||||
m_regs->DISP[i].DISPLAY.DX,
|
||||
m_regs->DISP[i].DISPLAY.DY,
|
||||
m_regs->DISP[i].DISPLAY.DW,
|
||||
m_regs->DISP[i].DISPLAY.DH,
|
||||
m_regs->DISP[i].DISPLAY.MAGH,
|
||||
m_regs->DISP[i].DISPLAY.MAGV
|
||||
);
|
||||
}
|
||||
|
||||
fprintf(s_fp, "PMODE EN1=%d EN2=%d CRTMD=%d MMOD=%d AMOD=%d SLBG=%d ALP=%d\n",
|
||||
m_regs->PMODE.EN1,
|
||||
m_regs->PMODE.EN2,
|
||||
m_regs->PMODE.CRTMD,
|
||||
m_regs->PMODE.MMOD,
|
||||
m_regs->PMODE.AMOD,
|
||||
m_regs->PMODE.SLBG,
|
||||
m_regs->PMODE.ALP
|
||||
);
|
||||
|
||||
fprintf(s_fp, "SMODE1 %08x_%08x\n",
|
||||
m_regs->SMODE1.u32[0],
|
||||
m_regs->SMODE1.u32[1]
|
||||
);
|
||||
|
||||
fprintf(s_fp, "SMODE2 INT=%d FFMD=%d DPMS=%d\n",
|
||||
m_regs->SMODE2.INT,
|
||||
m_regs->SMODE2.FFMD,
|
||||
m_regs->SMODE2.DPMS
|
||||
);
|
||||
|
||||
fprintf(s_fp, "SRFSH %08x_%08x\n",
|
||||
m_regs->SRFSH.u32[0],
|
||||
m_regs->SRFSH.u32[1]
|
||||
);
|
||||
|
||||
fprintf(s_fp, "SYNCH1 %08x_%08x\n",
|
||||
m_regs->SYNCH1.u32[0],
|
||||
m_regs->SYNCH1.u32[1]
|
||||
);
|
||||
|
||||
fprintf(s_fp, "SYNCH2 %08x_%08x\n",
|
||||
m_regs->SYNCH2.u32[0],
|
||||
m_regs->SYNCH2.u32[1]
|
||||
);
|
||||
|
||||
fprintf(s_fp, "SYNCV %08x_%08x\n",
|
||||
m_regs->SYNCV.u32[0],
|
||||
m_regs->SYNCV.u32[1]
|
||||
);
|
||||
|
||||
fprintf(s_fp, "CSR %08x_%08x\n",
|
||||
m_regs->CSR.u32[0],
|
||||
m_regs->CSR.u32[1]
|
||||
);
|
||||
|
||||
fflush(s_fp);
|
||||
}
|
||||
|
||||
/*
|
||||
int draw[8], sum = 0;
|
||||
|
||||
|
@ -87,20 +177,12 @@ void GSRendererSW::VSync(int field)
|
|||
draw[0], draw[1], draw[2], draw[3], draw[4], draw[5], draw[6], draw[7], sum);
|
||||
|
||||
//
|
||||
printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0;
|
||||
printf("m_syncpoint_count = %d\n", ((GSRasterizerList*)m_rl)->m_syncpoint_count); ((GSRasterizerList*)m_rl)->m_syncpoint_count = 0;
|
||||
*/
|
||||
|
||||
GSRenderer::VSync(field);
|
||||
|
||||
m_tc->IncAge();
|
||||
|
||||
if(m_reset)
|
||||
{
|
||||
m_tc->RemoveAll();
|
||||
|
||||
m_reset = false;
|
||||
}
|
||||
|
||||
// if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats();
|
||||
}
|
||||
|
||||
|
@ -197,10 +279,6 @@ void GSRendererSW::ConvertVertex(size_t dst_index, size_t src_index)
|
|||
}
|
||||
}
|
||||
|
||||
#define LOG 0
|
||||
|
||||
FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
|
||||
|
||||
void GSRendererSW::Draw()
|
||||
{
|
||||
SharedData* sd = new SharedData(this);
|
||||
|
@ -265,89 +343,18 @@ void GSRendererSW::Draw()
|
|||
m_tc->InvalidatePages(zb_pages, context->offset.zb->psm);
|
||||
}
|
||||
|
||||
// set data->syncpoint
|
||||
|
||||
if(m_fzb != context->offset.fzb)
|
||||
if(CheckTargetPages(fb_pages, zb_pages, r))
|
||||
{
|
||||
// hmm, what if "r" gets bigger next time and slips through unchecked, need to trace that too
|
||||
|
||||
sd->syncpoint = true; // TODO
|
||||
|
||||
if(!sd->syncpoint)
|
||||
{
|
||||
if(fb_pages == NULL)
|
||||
{
|
||||
fb_pages = context->offset.fb->GetPages(r);
|
||||
}
|
||||
|
||||
if(CheckTargetPages<0xffffffff>(fb_pages))
|
||||
{
|
||||
sd->syncpoint = true;
|
||||
|
||||
if(LOG) fprintf(s_fp, "syncpoint 0\n");
|
||||
}
|
||||
}
|
||||
|
||||
if(!sd->syncpoint)
|
||||
{
|
||||
if(zb_pages == NULL)
|
||||
{
|
||||
zb_pages = context->offset.zb->GetPages(r);
|
||||
}
|
||||
|
||||
if(CheckTargetPages<0xffffffff>(zb_pages))
|
||||
{
|
||||
sd->syncpoint = true;
|
||||
|
||||
if(LOG) fprintf(s_fp, "syncpoint 1\n");
|
||||
}
|
||||
}
|
||||
|
||||
if(!sd->syncpoint)
|
||||
{
|
||||
if(LOG) fprintf(s_fp, "no syncpoint *\n");
|
||||
}
|
||||
|
||||
m_fzb = context->offset.fzb;
|
||||
sd->syncpoint = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// cross-check frame and z-buffer pages, they cannot overlap with each other and with previous batches in queue,
|
||||
// m_fzb filters out most of these cases, only have to be careful when the addresses stay the same and the output
|
||||
// is mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300)
|
||||
|
||||
if(!sd->syncpoint)
|
||||
{
|
||||
if(gd.sel.fwrite)
|
||||
{
|
||||
if(CheckTargetPages<0xffff0000>(fb_pages)) // already used as a z-buffer
|
||||
{
|
||||
sd->syncpoint = true;
|
||||
|
||||
if(LOG) fprintf(s_fp, "syncpoint 2\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!sd->syncpoint)
|
||||
{
|
||||
if(gd.sel.zwrite)
|
||||
{
|
||||
if(CheckTargetPages<0x0000ffff>(zb_pages)) // already used as a frame buffer
|
||||
{
|
||||
sd->syncpoint = true;
|
||||
|
||||
if(LOG) fprintf(s_fp, "syncpoint 3\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
sd->UseTargetPages(fb_pages, zb_pages);
|
||||
|
||||
//
|
||||
if(LOG) {fprintf(s_fp, "queue %05x %d %05x %d %05x %d %dx%d | %d %d %d\n",
|
||||
m_context->FRAME.Block(), m_context->FRAME.PSM,
|
||||
m_context->ZBUF.Block(), m_context->ZBUF.PSM,
|
||||
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
|
||||
PRIM->PRIM, sd->vertex_count, sd->index_count); fflush(s_fp);}
|
||||
|
||||
if(s_dump)
|
||||
{
|
||||
|
@ -404,12 +411,6 @@ void GSRendererSW::Draw()
|
|||
}
|
||||
else
|
||||
{
|
||||
if(LOG) fprintf(s_fp, "queue %05x %d %05x %d %05x %d %dx%d | %d %d %d\n",
|
||||
m_context->FRAME.Block(), m_context->FRAME.PSM,
|
||||
m_context->ZBUF.Block(), m_context->ZBUF.PSM,
|
||||
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
|
||||
PRIM->PRIM, sd->vertex_count, sd->index_count);
|
||||
|
||||
m_rl->Queue(data);
|
||||
}
|
||||
|
||||
|
@ -435,36 +436,42 @@ void GSRendererSW::Sync(int reason)
|
|||
|
||||
m_rl->Sync();
|
||||
|
||||
s_n++;
|
||||
if(0)
|
||||
{
|
||||
std::string s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM);
|
||||
|
||||
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
|
||||
|
||||
s_n++;
|
||||
}
|
||||
|
||||
t = __rdtsc() - t;
|
||||
|
||||
if(LOG) fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, m_rl->GetPixels(), t > 10000000 ? '*' : ' ');
|
||||
int pixels = m_rl->GetPixels();
|
||||
|
||||
m_perfmon.Put(GSPerfMon::Fillrate, m_rl->GetPixels());
|
||||
if(LOG) {fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, pixels, t > 10000000 ? '*' : ' '); fflush(s_fp);}
|
||||
|
||||
m_perfmon.Put(GSPerfMon::Fillrate, pixels);
|
||||
}
|
||||
|
||||
void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
|
||||
{
|
||||
GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
|
||||
|
||||
uint32* RESTRICT p = m_tmp_pages;
|
||||
|
||||
o->GetPages(r, p);
|
||||
o->GetPages(r, m_tmp_pages);
|
||||
|
||||
// check if the changing pages either used as a texture or a target
|
||||
|
||||
for(; *p != GSOffset::EOP; p++)
|
||||
if(!m_rl->IsSynced())
|
||||
{
|
||||
uint32 page = *p;
|
||||
|
||||
//while(m_fzb_pages[page] | m_tex_pages[page]) _mm_pause();
|
||||
|
||||
if(m_fzb_pages[page] | m_tex_pages[page])
|
||||
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
Sync(5);
|
||||
if(m_fzb_pages[*p] | m_tex_pages[*p])
|
||||
{
|
||||
Sync(5);
|
||||
|
||||
break;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -473,21 +480,20 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
|
|||
|
||||
void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
|
||||
{
|
||||
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
|
||||
|
||||
uint32* RESTRICT p = m_tmp_pages;
|
||||
|
||||
o->GetPages(r, p);
|
||||
|
||||
for(; *p != GSOffset::EOP; p++)
|
||||
if(!m_rl->IsSynced())
|
||||
{
|
||||
//while(m_fzb_pages[*p]) _mm_pause();
|
||||
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
|
||||
|
||||
if(m_fzb_pages[*p])
|
||||
o->GetPages(r, m_tmp_pages);
|
||||
|
||||
for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
Sync(6);
|
||||
if(m_fzb_pages[*p])
|
||||
{
|
||||
Sync(6);
|
||||
|
||||
break;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -505,15 +511,16 @@ void GSRendererSW::UsePages(const uint32* pages, int type)
|
|||
}
|
||||
else
|
||||
{
|
||||
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
|
||||
if(!m_rl->IsSynced())
|
||||
{
|
||||
//while(m_fzb_pages[*p]) _mm_pause();
|
||||
|
||||
if(m_fzb_pages[*p]) // currently being drawn to? => sync (could even spin and wait until it hits 0, not sure if it's worth though, or just create 512 condvars? :D)
|
||||
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
Sync(7);
|
||||
if(m_fzb_pages[*p]) // currently being drawn to? => sync
|
||||
{
|
||||
Sync(7);
|
||||
|
||||
break;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -548,13 +555,150 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type)
|
|||
}
|
||||
}
|
||||
|
||||
template<uint32 mask> bool GSRendererSW::CheckTargetPages(const uint32* pages)
|
||||
bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r)
|
||||
{
|
||||
for(const uint32* p = pages; *p != GSOffset::EOP; p++)
|
||||
bool synced = m_rl->IsSynced();
|
||||
|
||||
if(m_fzb != m_context->offset.fzb4)
|
||||
{
|
||||
if(mask != 0xffffffff ? (m_fzb_pages[*p] & mask) : m_fzb_pages[*p])
|
||||
// targets changed, check everything
|
||||
|
||||
m_fzb = m_context->offset.fzb4;
|
||||
m_fzb_bbox = r;
|
||||
|
||||
if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r);
|
||||
if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r);
|
||||
|
||||
memset(m_fzb_cur_pages, 0, sizeof(m_fzb_cur_pages));
|
||||
|
||||
uint32 used = 0;
|
||||
|
||||
for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
return true;
|
||||
uint32 i = *p;
|
||||
|
||||
uint32 row = i >> 5;
|
||||
uint32 col = 1 << (i & 31);
|
||||
|
||||
m_fzb_cur_pages[row] |= col;
|
||||
|
||||
used |= m_fzb_pages[i];
|
||||
}
|
||||
|
||||
for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
uint32 i = *p;
|
||||
|
||||
uint32 row = i >> 5;
|
||||
uint32 col = 1 << (i & 31);
|
||||
|
||||
m_fzb_cur_pages[row] |= col;
|
||||
|
||||
used |= m_fzb_pages[i];
|
||||
}
|
||||
|
||||
if(!synced)
|
||||
{
|
||||
if(used)
|
||||
{
|
||||
if(LOG) {fprintf(s_fp, "syncpoint 0\n"); fflush(s_fp);}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if(LOG) {fprintf(s_fp, "no syncpoint *\n"); fflush(s_fp);}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// same target, only check new areas and cross-rendering between frame and z-buffer
|
||||
|
||||
GSVector4i bbox = m_fzb_bbox.runion(r);
|
||||
|
||||
bool check = !m_fzb_bbox.eq(bbox);
|
||||
|
||||
m_fzb_bbox = bbox;
|
||||
|
||||
if(check)
|
||||
{
|
||||
// drawing area is larger than previous time, check new parts only to avoid false positives (m_fzb_cur_pages guards)
|
||||
|
||||
if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r);
|
||||
if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r);
|
||||
|
||||
uint32 used = 0;
|
||||
|
||||
for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
uint32 i = *p;
|
||||
|
||||
uint32 row = i >> 5;
|
||||
uint32 col = 1 << (i & 31);
|
||||
|
||||
if((m_fzb_cur_pages[row] & col) == 0)
|
||||
{
|
||||
m_fzb_cur_pages[row] |= col;
|
||||
|
||||
used |= m_fzb_pages[i];
|
||||
}
|
||||
}
|
||||
|
||||
for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
uint32 i = *p;
|
||||
|
||||
uint32 row = i >> 5;
|
||||
uint32 col = 1 << (i & 31);
|
||||
|
||||
if((m_fzb_cur_pages[row] & col) == 0)
|
||||
{
|
||||
m_fzb_cur_pages[row] |= col;
|
||||
|
||||
used |= m_fzb_pages[i];
|
||||
}
|
||||
}
|
||||
|
||||
if(!synced)
|
||||
{
|
||||
if(used)
|
||||
{
|
||||
if(LOG) {fprintf(s_fp, "syncpoint 1\n"); fflush(s_fp);}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!synced)
|
||||
{
|
||||
// cross-check frame and z-buffer pages, they cannot overlap with each other and with previous batches in queue,
|
||||
// have to be careful when the two buffers are mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300)
|
||||
|
||||
if(fb_pages)
|
||||
{
|
||||
for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
if(m_fzb_pages[*p] & 0xffff0000)
|
||||
{
|
||||
if(LOG) {fprintf(s_fp, "syncpoint 2\n"); fflush(s_fp);}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(zb_pages)
|
||||
{
|
||||
for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
|
||||
{
|
||||
if(m_fzb_pages[*p] & 0x0000ffff)
|
||||
{
|
||||
if(LOG) {fprintf(s_fp, "syncpoint 3\n"); fflush(s_fp);}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -577,8 +721,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
|
|||
gd.zbr = context->offset.zb->pixel.row;
|
||||
gd.fbc = context->offset.fb->pixel.col[0];
|
||||
gd.zbc = context->offset.zb->pixel.col[0];
|
||||
gd.fzbr = context->offset.fzb->row;
|
||||
gd.fzbc = context->offset.fzb->col;
|
||||
gd.fzbr = context->offset.fzb4->row;
|
||||
gd.fzbc = context->offset.fzb4->col;
|
||||
|
||||
gd.sel.key = 0;
|
||||
|
||||
|
@ -1117,8 +1261,8 @@ GSRendererSW::SharedData::~SharedData()
|
|||
}
|
||||
}
|
||||
|
||||
delete m_fb_pages;
|
||||
delete m_zb_pages;
|
||||
delete [] m_fb_pages;
|
||||
delete [] m_zb_pages;
|
||||
|
||||
for(size_t i = 0; i < countof(m_tex_pages) && m_tex_pages[i] != NULL; i++)
|
||||
{
|
||||
|
@ -1153,8 +1297,10 @@ void GSRendererSW::SharedData::UseSourcePages(GSTextureCacheSW::Texture* t, int
|
|||
{
|
||||
ASSERT(m_tex_pages[level] == NULL);
|
||||
|
||||
m_tex_pages[level] = t->m_pages.n;
|
||||
const uint32* pages = t->m_pages.n;
|
||||
|
||||
m_tex_pages[level] = pages;
|
||||
m_tex_pages[level + 1] = NULL;
|
||||
|
||||
m_parent->UsePages(t->m_pages.n, 2);
|
||||
m_parent->UsePages(pages, 2);
|
||||
}
|
||||
|
|
|
@ -48,8 +48,9 @@ protected:
|
|||
GSTextureCacheSW* m_tc;
|
||||
GSTexture* m_texture[2];
|
||||
uint8* m_output;
|
||||
bool m_reset;
|
||||
GSPixelOffset4* m_fzb;
|
||||
GSVector4i m_fzb_bbox;
|
||||
uint32 m_fzb_cur_pages[16];
|
||||
uint32 m_fzb_pages[512]; // uint16 frame/zbuf pages interleaved
|
||||
uint16 m_tex_pages[512];
|
||||
uint32 m_tmp_pages[512 + 1];
|
||||
|
@ -66,7 +67,7 @@ protected:
|
|||
|
||||
void UsePages(const uint32* pages, int type);
|
||||
void ReleasePages(const uint32* pages, int type);
|
||||
template<uint32 mask> bool CheckTargetPages(const uint32* pages);
|
||||
bool CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r);
|
||||
|
||||
bool GetScanlineGlobalData(SharedData* data);
|
||||
|
||||
|
|
|
@ -209,6 +209,9 @@ void GSState::SetFrameSkip(int skip)
|
|||
|
||||
void GSState::Reset()
|
||||
{
|
||||
printf("GS reset\n");
|
||||
|
||||
memset(m_mem.m_vm8, 0, m_mem.m_vmsize);
|
||||
memset(&m_path[0], 0, sizeof(m_path[0]) * countof(m_path));
|
||||
memset(&m_v, 0, sizeof(m_v));
|
||||
|
||||
|
@ -253,6 +256,7 @@ void GSState::ResetHandlers()
|
|||
m_fpGIFRegHandlerXYZ[P][1] = &GSState::GIFRegHandlerXYZF2<P, 1>; \
|
||||
m_fpGIFRegHandlerXYZ[P][2] = &GSState::GIFRegHandlerXYZ2<P, 0>; \
|
||||
m_fpGIFRegHandlerXYZ[P][3] = &GSState::GIFRegHandlerXYZ2<P, 1>; \
|
||||
m_fpGIFPackedRegHandlerSTQRGBAXYZF2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZF2<P>; \
|
||||
|
||||
SetHandlerXYZ(GS_POINTLIST);
|
||||
SetHandlerXYZ(GS_LINELIST);
|
||||
|
@ -546,6 +550,36 @@ void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r)
|
|||
{
|
||||
}
|
||||
|
||||
template<uint32 prim>
|
||||
void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size)
|
||||
{
|
||||
ASSERT(size > 0 && size % 3 == 0);
|
||||
|
||||
const GIFPackedReg* RESTRICT r_end = r + size;
|
||||
|
||||
while(r < r_end)
|
||||
{
|
||||
GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
|
||||
GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
|
||||
GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
|
||||
|
||||
m_v.m[0] = st.upl64(rgba.upl32(q));
|
||||
|
||||
GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]);
|
||||
GSVector4i zf = GSVector4i::loadl(&r[2].u64[1]);
|
||||
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV));
|
||||
zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
|
||||
|
||||
m_v.m[1] = xy.upl32(zf);
|
||||
|
||||
VertexKick<prim>(r[2].XYZF2.Skip());
|
||||
|
||||
r += 3;
|
||||
}
|
||||
|
||||
m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time
|
||||
}
|
||||
|
||||
// GIFRegHandler*
|
||||
|
||||
void GSState::GIFRegHandlerNull(const GIFReg* RESTRICT r)
|
||||
|
@ -1037,7 +1071,8 @@ template<int i> void GSState::GIFRegHandlerFRAME(const GIFReg* RESTRICT r)
|
|||
{
|
||||
m_env.CTXT[i].offset.fb = m_mem.GetOffset(r->FRAME.Block(), r->FRAME.FBW, r->FRAME.PSM);
|
||||
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), r->FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
|
||||
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF);
|
||||
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(r->FRAME, m_env.CTXT[i].ZBUF);
|
||||
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF);
|
||||
}
|
||||
|
||||
m_env.CTXT[i].FRAME = (GSVector4i)r->FRAME;
|
||||
|
@ -1075,7 +1110,8 @@ template<int i> void GSState::GIFRegHandlerZBUF(const GIFReg* RESTRICT r)
|
|||
if((m_env.CTXT[i].ZBUF.u32[0] ^ ZBUF.u32[0]) & 0x3f0001ff) // ZBP PSM
|
||||
{
|
||||
m_env.CTXT[i].offset.zb = m_mem.GetOffset(ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, ZBUF.PSM);
|
||||
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF);
|
||||
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, ZBUF);
|
||||
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF);
|
||||
}
|
||||
|
||||
m_env.CTXT[i].ZBUF = (GSVector4i)ZBUF;
|
||||
|
@ -1726,8 +1762,28 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
|
|||
{
|
||||
size -= total;
|
||||
|
||||
if(path.adonly)
|
||||
switch(path.type)
|
||||
{
|
||||
case GIFPath::TYPE_UNKNOWN:
|
||||
|
||||
{
|
||||
uint32 reg = 0;
|
||||
|
||||
do
|
||||
{
|
||||
(this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem);
|
||||
|
||||
mem += sizeof(GIFPackedReg);
|
||||
|
||||
reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg
|
||||
}
|
||||
while(--total > 0);
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case GIFPath::TYPE_ADONLY: // very common
|
||||
|
||||
do
|
||||
{
|
||||
(this->*m_fpGIFRegHandlers[((GIFPackedReg*)mem)->A_D.ADDR])(&((GIFPackedReg*)mem)->r);
|
||||
|
@ -1735,20 +1791,20 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
|
|||
mem += sizeof(GIFPackedReg);
|
||||
}
|
||||
while(--total > 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32 reg = 0;
|
||||
|
||||
do
|
||||
{
|
||||
(this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem);
|
||||
break;
|
||||
|
||||
mem += sizeof(GIFPackedReg);
|
||||
case GIFPath::TYPE_STQRGBAXYZF2: // majority of the vertices are formatted like this
|
||||
|
||||
reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg
|
||||
}
|
||||
while(--total > 0);
|
||||
(this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2])((GIFPackedReg*)mem, total);
|
||||
|
||||
mem += total * sizeof(GIFPackedReg);
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
__assume(0);
|
||||
}
|
||||
|
||||
path.nloop = 0;
|
||||
|
@ -2070,7 +2126,8 @@ int GSState::Defrost(const GSFreezeData* fd)
|
|||
m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
|
||||
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
|
||||
m_env.CTXT[i].offset.tex = m_mem.GetOffset(m_env.CTXT[i].TEX0.TBP0, m_env.CTXT[i].TEX0.TBW, m_env.CTXT[i].TEX0.PSM);
|
||||
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
|
||||
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
|
||||
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
|
||||
}
|
||||
|
||||
UpdateScissor();
|
||||
|
@ -2116,6 +2173,8 @@ void GSState::UpdateVertexKick()
|
|||
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = m_fpGIFRegHandlerXYZ[prim][2];
|
||||
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = m_fpGIFRegHandlerXYZ[prim][3];
|
||||
|
||||
m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[prim];
|
||||
|
||||
m_cvf = m_cv[prim][PRIM->TME][PRIM->FST];
|
||||
}
|
||||
|
||||
|
|
|
@ -59,6 +59,13 @@ class GSState : public GSAlignedClass<32>
|
|||
GIFRegHandler m_fpGIFRegHandlers[256];
|
||||
GIFRegHandler m_fpGIFRegHandlerXYZ[8][4];
|
||||
|
||||
typedef void (GSState::*GIFPackedRegHandlerC)(const GIFPackedReg* RESTRICT r, uint32 size);
|
||||
|
||||
GIFPackedRegHandlerC m_fpGIFPackedRegHandlersC[1];
|
||||
GIFPackedRegHandlerC m_fpGIFPackedRegHandlerSTQRGBAXYZF2[8];
|
||||
|
||||
template<uint32 prim> void GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size);
|
||||
|
||||
template<int i> void ApplyTEX0(GIFRegTEX0& TEX0);
|
||||
void ApplyPRIM(const GIFRegPRIM& PRIM);
|
||||
|
||||
|
|
|
@ -167,6 +167,18 @@ GSTexture11::operator ID3D11ShaderResourceView*()
|
|||
return m_srv;
|
||||
}
|
||||
|
||||
GSTexture11::operator ID3D11UnorderedAccessView*()
|
||||
{
|
||||
if(!m_uav && m_dev && m_texture)
|
||||
{
|
||||
ASSERT(!m_msaa);
|
||||
|
||||
m_dev->CreateUnorderedAccessView(m_texture, NULL, &m_uav);
|
||||
}
|
||||
|
||||
return m_uav;
|
||||
}
|
||||
|
||||
GSTexture11::operator ID3D11RenderTargetView*()
|
||||
{
|
||||
ASSERT(m_dev);
|
||||
|
|
|
@ -30,6 +30,7 @@ class GSTexture11 : public GSTexture
|
|||
CComPtr<ID3D11Texture2D> m_texture;
|
||||
D3D11_TEXTURE2D_DESC m_desc;
|
||||
CComPtr<ID3D11ShaderResourceView> m_srv;
|
||||
CComPtr<ID3D11UnorderedAccessView> m_uav;
|
||||
CComPtr<ID3D11RenderTargetView> m_rtv;
|
||||
CComPtr<ID3D11DepthStencilView> m_dsv;
|
||||
|
||||
|
@ -43,6 +44,7 @@ public:
|
|||
|
||||
operator ID3D11Texture2D*();
|
||||
operator ID3D11ShaderResourceView*();
|
||||
operator ID3D11UnorderedAccessView*();
|
||||
operator ID3D11RenderTargetView*();
|
||||
operator ID3D11DepthStencilView*();
|
||||
};
|
||||
|
|
|
@ -281,6 +281,8 @@ GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int
|
|||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); // new frame buffers after reset should be cleared, don't display memory garbage
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -30,6 +30,7 @@ WakeAllConditionVariablePtr pWakeAllConditionVariable;
|
|||
SleepConditionVariableSRWPtr pSleepConditionVariableSRW;
|
||||
InitializeSRWLockPtr pInitializeSRWLock;;
|
||||
AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive;
|
||||
TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;
|
||||
ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
|
||||
|
||||
class InitCondVar
|
||||
|
@ -47,6 +48,7 @@ public:
|
|||
pSleepConditionVariableSRW = (SleepConditionVariableSRWPtr)GetProcAddress(m_kernel32, "SleepConditionVariableSRW");
|
||||
pInitializeSRWLock = (InitializeSRWLockPtr)GetProcAddress(m_kernel32, "InitializeSRWLock");
|
||||
pAcquireSRWLockExclusive = (AcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "AcquireSRWLockExclusive");
|
||||
pTryAcquireSRWLockExclusive = (TryAcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockExclusive");
|
||||
pReleaseSRWLockExclusive = (ReleaseSRWLockExclusivePtr)GetProcAddress(m_kernel32, "ReleaseSRWLockExclusive");
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,8 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "GSdx.h"
|
||||
|
||||
#ifdef _WINDOWS
|
||||
|
||||
typedef void (WINAPI * InitializeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable);
|
||||
|
@ -29,7 +31,7 @@ typedef void (WINAPI * WakeAllConditionVariablePtr)(CONDITION_VARIABLE* Conditio
|
|||
typedef void (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags);
|
||||
typedef void (WINAPI * InitializeSRWLockPtr)(SRWLOCK* SRWLock);
|
||||
typedef void (WINAPI * AcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);
|
||||
typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock);
|
||||
typedef BOOLEAN (WINAPI * TryAcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock);
|
||||
|
||||
extern InitializeConditionVariablePtr pInitializeConditionVariable;
|
||||
extern WakeConditionVariablePtr pWakeConditionVariable;
|
||||
|
@ -37,7 +39,7 @@ extern WakeAllConditionVariablePtr pWakeAllConditionVariable;
|
|||
extern SleepConditionVariableSRWPtr pSleepConditionVariableSRW;
|
||||
extern InitializeSRWLockPtr pInitializeSRWLock;;
|
||||
extern AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive;
|
||||
extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
|
||||
extern TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
|
||||
|
||||
class GSThread
|
||||
{
|
||||
|
@ -92,7 +94,7 @@ public:
|
|||
GSCondVarLock() {pInitializeSRWLock(&m_lock);}
|
||||
|
||||
void Lock() {pAcquireSRWLockExclusive(&m_lock);}
|
||||
void Unlock() {pReleaseSRWLockExclusive(&m_lock);}
|
||||
bool TryLock() {return pTryAcquireSRWLockExclusive(&m_lock) == TRUE;} void Unlock() {pReleaseSRWLockExclusive(&m_lock);}
|
||||
|
||||
operator SRWLOCK* () {return &m_lock;}
|
||||
};
|
||||
|
@ -114,7 +116,6 @@ public:
|
|||
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
#include "GSdx.h"
|
||||
|
||||
class GSThread
|
||||
{
|
||||
|
@ -191,6 +192,7 @@ public:
|
|||
}
|
||||
|
||||
void Lock() {pthread_mutex_lock(&m_mutex);}
|
||||
bool TryLock() {return pthread_mutex_trylock(&m_mutex) == 0;}
|
||||
void Unlock() {pthread_mutex_unlock(&m_mutex);}
|
||||
|
||||
operator pthread_mutex_t* () {return &m_mutex;}
|
||||
|
@ -254,10 +256,10 @@ public:
|
|||
template<class T> class GSJobQueue : private GSThread
|
||||
{
|
||||
protected:
|
||||
int m_count;
|
||||
queue<T> m_queue;
|
||||
volatile long m_count; // NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent)
|
||||
volatile bool m_exit;
|
||||
struct {GSCritSec lock; GSEvent notempty; volatile long count;} m_ev;
|
||||
struct {GSCritSec lock; GSEvent notempty;} m_ev;
|
||||
struct {GSCondVar notempty, empty; GSCondVarLock lock; bool available;} m_cv;
|
||||
|
||||
void ThreadProc()
|
||||
|
@ -285,6 +287,8 @@ protected:
|
|||
|
||||
m_queue.pop();
|
||||
|
||||
m_count--;
|
||||
|
||||
if(m_queue.empty())
|
||||
{
|
||||
m_cv.empty.Set();
|
||||
|
@ -318,7 +322,7 @@ protected:
|
|||
|
||||
m_queue.pop();
|
||||
|
||||
_InterlockedDecrement(&m_ev.count);
|
||||
m_count--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -328,16 +332,14 @@ public:
|
|||
: m_count(0)
|
||||
, m_exit(false)
|
||||
{
|
||||
m_ev.count = 0;
|
||||
m_cv.available = !!theApp.GetConfig("condvar", 1);
|
||||
|
||||
#ifdef _WINDOWS
|
||||
|
||||
m_cv.available = pInitializeConditionVariable != NULL;
|
||||
|
||||
#elif defined(_LINUX)
|
||||
|
||||
//m_cv.available = true;
|
||||
m_cv.available = !!theApp.GetConfig("condvar", 1);
|
||||
if(pInitializeConditionVariable == NULL)
|
||||
{
|
||||
m_cv.available = false;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -358,12 +360,14 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
int GetCount() const
|
||||
bool IsEmpty() const
|
||||
{
|
||||
return m_count;
|
||||
ASSERT(m_count >= 0);
|
||||
|
||||
return m_count == 0;
|
||||
}
|
||||
|
||||
virtual void Push(const T& item)
|
||||
void Push(const T& item)
|
||||
{
|
||||
if(m_cv.available)
|
||||
{
|
||||
|
@ -371,6 +375,8 @@ public:
|
|||
|
||||
m_queue.push(item);
|
||||
|
||||
m_count++;
|
||||
|
||||
m_cv.lock.Unlock();
|
||||
|
||||
m_cv.notempty.Set();
|
||||
|
@ -381,35 +387,34 @@ public:
|
|||
|
||||
m_queue.push(item);
|
||||
|
||||
_InterlockedIncrement(&m_ev.count);
|
||||
m_count++;
|
||||
|
||||
m_ev.notempty.Set();
|
||||
}
|
||||
|
||||
m_count++;
|
||||
}
|
||||
|
||||
virtual void Wait()
|
||||
void Wait()
|
||||
{
|
||||
if(m_cv.available)
|
||||
{
|
||||
m_cv.lock.Lock();
|
||||
|
||||
while(!m_queue.empty())
|
||||
if(m_count > 0)
|
||||
{
|
||||
m_cv.empty.Wait(m_cv.lock);
|
||||
}
|
||||
m_cv.lock.Lock();
|
||||
|
||||
m_cv.lock.Unlock();
|
||||
while(!m_queue.empty())
|
||||
{
|
||||
m_cv.empty.Wait(m_cv.lock);
|
||||
}
|
||||
|
||||
ASSERT(m_count == 0);
|
||||
|
||||
m_cv.lock.Unlock();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent)
|
||||
|
||||
while(m_ev.count > 0) _mm_pause();
|
||||
while(m_count > 0) _mm_pause();
|
||||
}
|
||||
|
||||
m_count++;
|
||||
}
|
||||
|
||||
virtual void Process(T& item) = 0;
|
||||
|
|
|
@ -1024,6 +1024,10 @@
|
|||
RelativePath=".\GSRenderer.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSRendererCS.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSRendererDX.cpp"
|
||||
>
|
||||
|
@ -1630,6 +1634,10 @@
|
|||
RelativePath=".\GSRenderer.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSRendererCS.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSRendererDX.h"
|
||||
>
|
||||
|
|
|
@ -1,73 +1,270 @@
|
|||
struct Vertex
|
||||
#ifndef VS_TME
|
||||
#define VS_TME 1
|
||||
#define VS_FST 1
|
||||
#endif
|
||||
|
||||
#ifndef GS_IIP
|
||||
#define GS_IIP 0
|
||||
#define GS_PRIM 3
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
globallycoherent RWByteAddressBuffer VideoMemory : register(u0);
|
||||
|
||||
//globallycoherent RWTexture2D<uint> VideoMemory : register(u0); // 8192 * 512 R8_UINT
|
||||
|
||||
Buffer<int2> FZBufRow : register(t0);
|
||||
Buffer<int2> FZBufCol : register(t1);
|
||||
Texture2D<float4> Palette : register(t2);
|
||||
Texture2D<float4> TextureL0 : register(t3);
|
||||
Texture2D<float4> TextureL1 : register(t4);
|
||||
Texture2D<float4> TextureL2 : register(t5);
|
||||
Texture2D<float4> TextureL3 : register(t6);
|
||||
Texture2D<float4> TextureL4 : register(t7);
|
||||
Texture2D<float4> TextureL5 : register(t8);
|
||||
Texture2D<float4> TextureL6 : register(t9);
|
||||
|
||||
cbuffer VSConstantBuffer : register(c0)
|
||||
{
|
||||
float2 st;
|
||||
uint c;
|
||||
float q;
|
||||
uint xy, z;
|
||||
uint uv, f;
|
||||
float4 VertexScale;
|
||||
float4 VertexOffset;
|
||||
};
|
||||
|
||||
RWByteAddressBuffer VideoMemory : register(u0);
|
||||
|
||||
StructuredBuffer<Vertex> VertexBuffer : register(t0);
|
||||
Buffer<uint> IndexBuffer : register(t1);
|
||||
|
||||
Buffer<int> FrameRowOffset : register(t2);
|
||||
Buffer<int> FrameColOffset : register(t3);
|
||||
Buffer<int> ZBufRowOffset : register(t4);
|
||||
Buffer<int> ZBufColOffset : register(t5);
|
||||
|
||||
cbuffer DrawingEnvironment : register(c0)
|
||||
cbuffer PSConstantBuffer : register(c0)
|
||||
{
|
||||
// TODO
|
||||
};
|
||||
|
||||
// one group is 16x8 pixels and one thread does 2 pixels, otherwise could not read-merge-write 16-bit targets safely
|
||||
// neighboring pixels are next to each other in memory, at least we don't have to calculate the address twice
|
||||
|
||||
// TODO: they say groupshared memory is faster, try unswizzling the corresponding chunk of memory initially (how to do that once by only one thread?) then write-back when finished, unless it was untouched
|
||||
|
||||
[numthreads(8, 8, 1)]
|
||||
void cs_main(uint3 gid : SV_GroupID, uint3 tid : SV_GroupThreadID)
|
||||
struct VS_INPUT
|
||||
{
|
||||
uint count;
|
||||
uint2 p : POSITION0;
|
||||
uint z : POSITION1;
|
||||
float2 st : TEXCOORD0;
|
||||
float q : TEXCOORD1;
|
||||
uint2 uv : TEXCOORD2;
|
||||
float4 c : COLOR0;
|
||||
float4 f : COLOR1;
|
||||
};
|
||||
|
||||
IndexBuffer.GetDimensions(count);
|
||||
struct VS_OUTPUT
|
||||
{
|
||||
float4 p : SV_Position;
|
||||
float2 z : TEXCOORD0;
|
||||
float4 t : TEXCOORD1;
|
||||
float4 c : COLOR0;
|
||||
};
|
||||
|
||||
// #if GS_PRIM == 2 (triangle)
|
||||
struct GS_OUTPUT
|
||||
{
|
||||
float4 p : SV_Position;
|
||||
float2 z : TEXCOORD0;
|
||||
float4 t : TEXCOORD1;
|
||||
float4 c : COLOR0;
|
||||
uint id : SV_PrimitiveID;
|
||||
};
|
||||
|
||||
for(uint i = 0; i < count; i += 3)
|
||||
VS_OUTPUT vs_main(VS_INPUT input)
|
||||
{
|
||||
VS_OUTPUT output;
|
||||
|
||||
output.p = float4(input.p, 0.0f, 0.0f) * VertexScale - VertexOffset;
|
||||
|
||||
output.z = float2(input.z & 0xffff, input.z >> 16);
|
||||
|
||||
if(VS_TME)
|
||||
{
|
||||
Vertex v0 = VertexBuffer[IndexBuffer[i + 0]];
|
||||
Vertex v1 = VertexBuffer[IndexBuffer[i + 1]];
|
||||
Vertex v2 = VertexBuffer[IndexBuffer[i + 2]];
|
||||
|
||||
uint x = gid.x + tid.x * 2;
|
||||
uint y = gid.y + tid.y;
|
||||
|
||||
uint fa = FrameRowOffset[y] + FrameColOffset[x];
|
||||
uint za = ZBufRowOffset[y] + ZBufColOffset[x];
|
||||
|
||||
// TODO: quickly reject if x, y is outside the triangle
|
||||
// TODO: calculate interpolated values at x, y
|
||||
// TODO: run the GS pipeline
|
||||
// TODO: repeat for x+1, y
|
||||
// TODO: output two pixels (might be better to process a single pixel, more threads, if there is no 16-bit target involved)
|
||||
|
||||
// testing...
|
||||
|
||||
uint4 c = VideoMemory.Load4(fa); // does this load 4*4 bytes? or 4 bytes each expanded uint?
|
||||
|
||||
c = (v0.c >> uint4(0, 8, 16, 24)) & 0xff; // => ushr r1.yzw, r1.xxxx, l(0, 8, 16, 24), v0.c auto-converted to uint4 and per-component shift in one instruction, SSE is embarrassed
|
||||
|
||||
VideoMemory.Store4(fa, c); // same question, 4*4 bytes or compressed to uint
|
||||
if(VS_FST)
|
||||
{
|
||||
output.t.xy = input.uv;
|
||||
output.t.w = 1.0f;
|
||||
}
|
||||
else
|
||||
{
|
||||
output.t.xy = input.st;
|
||||
output.t.w = input.q;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
output.t.xy = 0;
|
||||
output.t.w = 1.0f;
|
||||
}
|
||||
|
||||
// #endif
|
||||
output.c = input.c;
|
||||
output.t.z = input.f.r;
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
// TODO: DrawPoint (this is going to be a waste of resources)
|
||||
// TODO: DrawLine (line hit-test, will it work?)
|
||||
// TODO: DrawSprite (similar to DrawTriangle)
|
||||
// TODO: if read-backs are too slow, implement GSState::Write/FlushWrite/Read/clut.Write in a compute shader
|
||||
// TODO: unswizzle pages from VideoMemory to the texture cache (if they are marked as valid, otherwise upload from GSLocalMemory::m_vm8)
|
||||
|
||||
#if GS_PRIM == 0
|
||||
|
||||
[maxvertexcount(1)]
|
||||
void gs_main(point VS_OUTPUT input[1], inout PointStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
|
||||
{
|
||||
GS_OUTPUT output;
|
||||
|
||||
output.p = input[0].p;
|
||||
output.z = input[0].z;
|
||||
output.t = input[0].t;
|
||||
output.c = input[0].c;
|
||||
output.id = id;
|
||||
|
||||
stream.Append(output);
|
||||
}
|
||||
|
||||
#elif GS_PRIM == 1
|
||||
|
||||
[maxvertexcount(2)]
|
||||
void gs_main(line VS_OUTPUT input[2], inout LineStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
|
||||
{
|
||||
for(int i = 0; i < 2; i++)
|
||||
{
|
||||
GS_OUTPUT output;
|
||||
|
||||
output.p = input[i].p;
|
||||
output.z = input[i].z;
|
||||
output.t = input[i].t;
|
||||
output.c = input[i].c;
|
||||
output.id = id;
|
||||
|
||||
#if GS_IIP == 0
|
||||
if(i != 1) output.c = input[1].c;
|
||||
#endif
|
||||
|
||||
stream.Append(output);
|
||||
}
|
||||
}
|
||||
|
||||
#elif GS_PRIM == 2
|
||||
|
||||
[maxvertexcount(3)]
|
||||
void gs_main(triangle VS_OUTPUT input[3], inout TriangleStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
|
||||
{
|
||||
for(int i = 0; i < 3; i++)
|
||||
{
|
||||
GS_OUTPUT output;
|
||||
|
||||
output.p = input[i].p;
|
||||
output.z = input[i].z;
|
||||
output.t = input[i].t;
|
||||
output.c = input[i].c;
|
||||
output.id = id;
|
||||
|
||||
#if GS_IIP == 0
|
||||
if(i != 1) output.c = input[2].c;
|
||||
#endif
|
||||
|
||||
stream.Append(output);
|
||||
}
|
||||
}
|
||||
|
||||
#elif GS_PRIM == 3
|
||||
|
||||
[maxvertexcount(4)]
|
||||
void gs_main(line VS_OUTPUT input[2], inout TriangleStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
|
||||
{
|
||||
GS_OUTPUT lt, rb, lb, rt;
|
||||
|
||||
lt.p = input[0].p;
|
||||
lt.z = input[1].z;
|
||||
lt.t.xy = input[0].t.xy;
|
||||
lt.t.zw = input[1].t.zw;
|
||||
lt.c = input[0].c;
|
||||
lt.id = id;
|
||||
|
||||
#if GS_IIP == 0
|
||||
lt.c = input[1].c;
|
||||
#endif
|
||||
|
||||
rb.p = input[1].p;
|
||||
rb.z = input[1].z;
|
||||
rb.t = input[1].t;
|
||||
rb.c = input[1].c;
|
||||
rb.id = id;
|
||||
|
||||
lb = lt;
|
||||
lb.p.y = rb.p.y;
|
||||
lb.t.y = rb.t.y;
|
||||
|
||||
rt = rb;
|
||||
rt.p.y = lt.p.y;
|
||||
rt.t.y = lt.t.y;
|
||||
|
||||
stream.Append(lt);
|
||||
stream.Append(lb);
|
||||
stream.Append(rt);
|
||||
stream.Append(rb);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
uint CompressColor(float4 f)
|
||||
{
|
||||
// is there a faster way?
|
||||
|
||||
uint4 c = (uint4)(f * 0xff) << uint4(0, 8, 16, 24);
|
||||
|
||||
return c.r | c.g | c.b | c.a;
|
||||
}
|
||||
|
||||
void ps_main(GS_OUTPUT input)
|
||||
{
|
||||
uint c = CompressColor(input.c);
|
||||
uint z = (uint)(input.z.y * 0x10000 + input.z.x);
|
||||
|
||||
uint x = (uint)input.p.x;
|
||||
uint y = (uint)input.p.y;
|
||||
|
||||
uint2 addr = FZBufRow[y] + FZBufCol[x]; // 16-bit address
|
||||
|
||||
uint2 unaligned = addr.xy & 1; // 16-bit formats can address into the middle of an uint (smallest word size for VideoMemory)
|
||||
|
||||
addr = (addr & ~1) * 2;
|
||||
|
||||
//DeviceMemoryBarrier();
|
||||
|
||||
uint zd = VideoMemory.Load(addr.y);
|
||||
|
||||
if(z < zd) discard;
|
||||
|
||||
VideoMemory.Store(addr.y, z);
|
||||
VideoMemory.Store(addr.x, c);
|
||||
|
||||
/*
|
||||
addr <<= 1;
|
||||
|
||||
uint2 fa0 = uint2(addr.x & 0x1fff, addr.x >> 13);
|
||||
uint2 fa1 = fa0 + uint2(1, 0);
|
||||
uint2 fa2 = fa0 + uint2(2, 0);
|
||||
uint2 fa3 = fa0 + uint2(3, 0);
|
||||
|
||||
uint2 za0 = uint2(addr.y & 0x1fff, addr.y >> 13);
|
||||
uint2 za1 = za0 + uint2(1, 0);
|
||||
uint2 za2 = za0 + uint2(2, 0);
|
||||
uint2 za3 = za0 + uint2(3, 0);
|
||||
|
||||
DeviceMemoryBarrier();
|
||||
|
||||
uint zd =
|
||||
(VideoMemory[za0] << 0) |
|
||||
(VideoMemory[za1] << 8) |
|
||||
(VideoMemory[za2] << 16) |
|
||||
(VideoMemory[za3] << 24);
|
||||
|
||||
if(zd >= z) discard;
|
||||
|
||||
VideoMemory[za0] = (z >> 0) & 0xff;
|
||||
VideoMemory[za1] = (z >> 8) & 0xff;
|
||||
VideoMemory[za2] = (z >> 16) & 0xff;
|
||||
VideoMemory[za3] = (z >> 24) & 0xff;
|
||||
|
||||
DeviceMemoryBarrier();
|
||||
|
||||
VideoMemory[fa0] = (c >> 0) & 0xff;
|
||||
VideoMemory[fa1] = (c >> 8) & 0xff;
|
||||
VideoMemory[fa2] = (c >> 16) & 0xff;
|
||||
VideoMemory[fa3] = (c >> 24) & 0xff;
|
||||
*/
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue