GS/HW: Implement multi stretch for DX11/DX12/OpenGL

This commit is contained in:
Stenzek 2023-03-02 20:53:28 +10:00 committed by refractionpcsx2
parent 8505e9203a
commit b8a86baec7
7 changed files with 331 additions and 15 deletions

View File

@ -798,6 +798,84 @@ void GSDevice11::UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GS
StretchRect(sTex, GSVector4::zero(), dTex, dRect, m_convert.ps[static_cast<int>(shader)].get(), m_merge.cb.get(), nullptr, false);
}
// Draws a list of stretch rects into dTex with the given convert shader.
//
// Consecutive rects that share the same source texture and filter mode are grouped
// and handed to DoMultiStretchRects() as one batch, so each run costs a single draw.
//
// rects     - rects to draw.
// num_rects - number of entries in rects; the call is a no-op when it is zero.
// dTex      - render target that receives the rects.
// shader    - index into m_convert.ps selecting the pixel shader.
void GSDevice11::DrawMultiStretchRects(const MultiStretchRect* rects, u32 num_rects, GSTexture* dTex, ShaderConvert shader)
{
	// rects[0] seeds the first batch below, so an empty list has to bail out before it is read.
	if (num_rects == 0)
		return;

	// Pipeline state shared by every batch.
	IASetInputLayout(m_convert.il.get());
	IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
	VSSetShader(m_convert.vs.get(), nullptr);
	GSSetShader(nullptr, nullptr);
	PSSetShader(m_convert.ps[static_cast<int>(shader)].get(), nullptr);
	OMSetDepthStencilState(m_convert.dss.get(), 0);
	OMSetBlendState(nullptr, 0.0f);
	OMSetRenderTargets(dTex, nullptr);

	const GSVector2 ds(static_cast<float>(dTex->GetWidth()), static_cast<float>(dTex->GetHeight()));

	GSTexture* last_tex = rects[0].src;
	bool last_linear = rects[0].linear;
	u32 first = 0;
	u32 count = 1;

	for (u32 i = 1; i < num_rects; i++)
	{
		if (rects[i].src == last_tex && rects[i].linear == last_linear)
		{
			count++;
			continue;
		}

		// Texture or filter changed: flush the run collected so far and start a new one at i.
		DoMultiStretchRects(rects + first, count, ds);
		last_tex = rects[i].src;
		last_linear = rects[i].linear;
		first += count;
		count = 1;
	}

	// Flush the trailing run.
	DoMultiStretchRects(rects + first, count, ds);
}
// Uploads one quad per rect and draws the whole batch with a single indexed triangle strip.
//
// Every rect in the batch is drawn with the texture and filter mode of rects[0].
//
// rects     - batch of rects to draw.
// num_rects - number of entries in rects; the call is a no-op when it is zero.
// ds        - size used to convert dst_rect to clip space; the caller in this file
//             passes the destination texture's width/height.
//
// The batch is silently skipped if either the vertex or the index buffer cannot be mapped.
void GSDevice11::DoMultiStretchRects(const MultiStretchRect* rects, u32 num_rects, const GSVector2& ds)
{
	if (num_rects == 0)
		return;

	// Don't use primitive restart here, it ends up slower on some drivers.
	const u32 vertex_reserve_size = num_rects * 4;
	const u32 index_reserve_size = num_rects * 6;

	// Both map helpers return nullptr when the request is larger than the buffer or Map() fails,
	// so the pointers must be checked before anything is written through them.
	GSVertexPT1* verts = static_cast<GSVertexPT1*>(IAMapVertexBuffer(sizeof(GSVertexPT1), vertex_reserve_size));
	if (!verts)
		return;

	u32* idx = IAMapIndexBuffer(index_reserve_size);
	if (!idx)
	{
		// The vertex buffer is already mapped at this point; release it with no vertices committed.
		IAUnmapVertexBuffer(sizeof(GSVertexPT1), 0);
		return;
	}

	u32 icount = 0;
	u32 vcount = 0;
	for (u32 i = 0; i < num_rects; i++)
	{
		const GSVector4& sRect = rects[i].src_rect;
		const GSVector4& dRect = rects[i].dst_rect;

		// Scale by 2/ds and offset into the -1..1 range; Y is negated relative to dst_rect.
		const float left = dRect.x * 2 / ds.x - 1.0f;
		const float top = 1.0f - dRect.y * 2 / ds.y;
		const float right = dRect.z * 2 / ds.x - 1.0f;
		const float bottom = 1.0f - dRect.w * 2 / ds.y;

		const u32 vstart = vcount;
		verts[vcount++] = {GSVector4(left, top, 0.5f, 1.0f), GSVector2(sRect.x, sRect.y)};
		verts[vcount++] = {GSVector4(right, top, 0.5f, 1.0f), GSVector2(sRect.z, sRect.y)};
		verts[vcount++] = {GSVector4(left, bottom, 0.5f, 1.0f), GSVector2(sRect.x, sRect.w)};
		verts[vcount++] = {GSVector4(right, bottom, 0.5f, 1.0f), GSVector2(sRect.z, sRect.w)};

		// The first index (for every quad after the first) and the last index are repeated,
		// which joins consecutive quads in one strip through degenerate triangles.
		if (i > 0)
			idx[icount++] = vstart;

		idx[icount++] = vstart;
		idx[icount++] = vstart + 1;
		idx[icount++] = vstart + 2;
		idx[icount++] = vstart + 3;
		idx[icount++] = vstart + 3;
	}

	IAUnmapVertexBuffer(sizeof(GSVertexPT1), vcount);
	IAUnmapIndexBuffer(icount);

	PSSetShaderResource(0, rects[0].src);
	PSSetSamplerState(rects[0].linear ? m_convert.ln.get() : m_convert.pt.get());

	DrawIndexedPrimitive();
}
void GSDevice11::DoMerge(GSTexture* sTex[3], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, const GSRegPMODE& PMODE, const GSRegEXTBUF& EXTBUF, const GSVector4& c, const bool linear)
{
const GSVector4 full_r(0.0f, 0.0f, 1.0f, 1.0f);
@ -991,11 +1069,11 @@ void GSDevice11::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vert
//
}
bool GSDevice11::IASetVertexBuffer(const void* vertex, u32 stride, u32 count)
void* GSDevice11::IAMapVertexBuffer(u32 stride, u32 count)
{
const u32 size = stride * count;
if (size > VERTEX_BUFFER_SIZE)
return false;
return nullptr;
D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE;
@ -1010,10 +1088,13 @@ bool GSDevice11::IASetVertexBuffer(const void* vertex, u32 stride, u32 count)
D3D11_MAPPED_SUBRESOURCE m;
if (FAILED(m_ctx->Map(m_vb.get(), 0, type, 0, &m)))
return false;
return nullptr;
GSVector4i::storent(static_cast<u8*>(m.pData) + m_vertex.start * stride, vertex, count * stride);
return static_cast<u8*>(m.pData) + (m_vertex.start * stride);
}
void GSDevice11::IAUnmapVertexBuffer(u32 stride, u32 count)
{
m_ctx->Unmap(m_vb.get(), 0);
if (m_state.vb_stride != stride)
@ -1024,13 +1105,24 @@ bool GSDevice11::IASetVertexBuffer(const void* vertex, u32 stride, u32 count)
}
m_vertex.count = count;
}
// Copies count vertices of the given stride into the vertex buffer.
// Returns false when IAMapVertexBuffer() cannot provide a destination, true otherwise.
bool GSDevice11::IASetVertexBuffer(const void* vertex, u32 stride, u32 count)
{
	if (void* const dst = IAMapVertexBuffer(stride, count))
	{
		GSVector4i::storent(dst, vertex, count * stride);
		IAUnmapVertexBuffer(stride, count);
		return true;
	}

	return false;
}
bool GSDevice11::IASetIndexBuffer(const void* index, u32 count)
u32* GSDevice11::IAMapIndexBuffer(u32 count)
{
if (count > (INDEX_BUFFER_SIZE / sizeof(u32)))
return false;
return nullptr;
D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE;
@ -1046,11 +1138,25 @@ bool GSDevice11::IASetIndexBuffer(const void* index, u32 count)
D3D11_MAPPED_SUBRESOURCE m;
if (FAILED(m_ctx->Map(m_ib.get(), 0, type, 0, &m)))
return false;
return nullptr;
std::memcpy((u8*)m.pData + m_index.start * sizeof(u32), index, count * sizeof(u32));
return static_cast<u32*>(m.pData) + m_index.start;
}
// Ends a mapping started with IAMapIndexBuffer() and records how many indices were written.
void GSDevice11::IAUnmapIndexBuffer(u32 count)
{
	// Remember the number of indices for the next indexed draw.
	m_index.count = count;

	m_ctx->Unmap(m_ib.get(), 0);
}
bool GSDevice11::IASetIndexBuffer(const void* index, u32 count)
{
u32* map = IAMapIndexBuffer(count);
if (!map)
return false;
std::memcpy(map, index, count * sizeof(u32));
IAUnmapIndexBuffer(count);
return true;
}

View File

@ -278,11 +278,19 @@ public:
void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, ID3D11PixelShader* ps, ID3D11Buffer* ps_cb, ID3D11BlendState* bs, bool linear = true);
void PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, PresentShader shader, float shaderTime, bool linear) override;
void UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) override;
void DrawMultiStretchRects(const MultiStretchRect* rects, u32 num_rects, GSTexture* dTex, ShaderConvert shader) override;
void DoMultiStretchRects(const MultiStretchRect* rects, u32 num_rects, const GSVector2& ds);
void SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm);
void* IAMapVertexBuffer(u32 stride, u32 count);
void IAUnmapVertexBuffer(u32 stride, u32 count);
bool IASetVertexBuffer(const void* vertex, u32 stride, u32 count);
u32* IAMapIndexBuffer(u32 count);
void IAUnmapIndexBuffer(u32 count);
bool IASetIndexBuffer(const void* index, u32 count);
void IASetInputLayout(ID3D11InputLayout* layout);
void IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY topology);

View File

@ -478,10 +478,123 @@ void GSDevice12::UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GS
m_convert[static_cast<int>(shader)].get(), false);
}
void GSDevice12::BeginRenderPassForStretchRect(GSTexture12* dTex, const GSVector4i& dtex_rc, const GSVector4i& dst_rc)
void GSDevice12::DrawMultiStretchRects(
const MultiStretchRect* rects, u32 num_rects, GSTexture* dTex, ShaderConvert shader)
{
const bool is_whole_target = dst_rc.eq(dtex_rc);
const D3D12_RENDER_PASS_BEGINNING_ACCESS_TYPE load_op = is_whole_target ? D3D12_RENDER_PASS_BEGINNING_ACCESS_TYPE_DISCARD : GetLoadOpForTexture(dTex);
GSTexture* last_tex = rects[0].src;
bool last_linear = rects[0].linear;
u32 first = 0;
u32 count = 1;
// Make sure all textures are in shader read only layout, so we don't need to break
// the render pass to transition.
for (u32 i = 0; i < num_rects; i++)
{
GSTexture12* const stex = static_cast<GSTexture12*>(rects[i].src);
stex->CommitClear();
if (stex->GetResourceState() != D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE)
{
EndRenderPass();
stex->TransitionToState(D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
}
}
for (u32 i = 1; i < num_rects; i++)
{
if (rects[i].src == last_tex && rects[i].linear == last_linear)
{
count++;
continue;
}
DoMultiStretchRects(rects + first, count, static_cast<GSTexture12*>(dTex), shader);
last_tex = rects[i].src;
last_linear = rects[i].linear;
first += count;
count = 1;
}
DoMultiStretchRects(rects + first, count, static_cast<GSTexture12*>(dTex), shader);
}
void GSDevice12::DoMultiStretchRects(
const MultiStretchRect* rects, u32 num_rects, GSTexture12* dTex, ShaderConvert shader)
{
// Set up vertices first.
const u32 vertex_reserve_size = num_rects * 4 * sizeof(GSVertexPT1);
const u32 index_reserve_size = num_rects * 6 * sizeof(u32);
if (!m_vertex_stream_buffer.ReserveMemory(vertex_reserve_size, sizeof(GSVertexPT1)) ||
!m_index_stream_buffer.ReserveMemory(index_reserve_size, sizeof(u32)))
{
ExecuteCommandListAndRestartRenderPass(false, "Uploading bytes to vertex buffer");
if (!m_vertex_stream_buffer.ReserveMemory(vertex_reserve_size, sizeof(GSVertexPT1)) ||
!m_index_stream_buffer.ReserveMemory(index_reserve_size, sizeof(u32)))
{
pxFailRel("Failed to reserve space for vertices");
}
}
// Pain in the arse because the primitive topology for the pipelines is all triangle strips.
// Don't use primitive restart here, it ends up slower on some drivers.
const GSVector2 ds(static_cast<float>(dTex->GetWidth()), static_cast<float>(dTex->GetHeight()));
GSVertexPT1* verts = reinterpret_cast<GSVertexPT1*>(m_vertex_stream_buffer.GetCurrentHostPointer());
u32* idx = reinterpret_cast<u32*>(m_index_stream_buffer.GetCurrentHostPointer());
u32 icount = 0;
u32 vcount = 0;
for (u32 i = 0; i < num_rects; i++)
{
const GSVector4& sRect = rects[i].src_rect;
const GSVector4& dRect = rects[i].dst_rect;
const float left = dRect.x * 2 / ds.x - 1.0f;
const float top = 1.0f - dRect.y * 2 / ds.y;
const float right = dRect.z * 2 / ds.x - 1.0f;
const float bottom = 1.0f - dRect.w * 2 / ds.y;
const u32 vstart = vcount;
verts[vcount++] = {GSVector4(left, top, 0.5f, 1.0f), GSVector2(sRect.x, sRect.y)};
verts[vcount++] = {GSVector4(right, top, 0.5f, 1.0f), GSVector2(sRect.z, sRect.y)};
verts[vcount++] = {GSVector4(left, bottom, 0.5f, 1.0f), GSVector2(sRect.x, sRect.w)};
verts[vcount++] = {GSVector4(right, bottom, 0.5f, 1.0f), GSVector2(sRect.z, sRect.w)};
if (i > 0)
idx[icount++] = vstart;
idx[icount++] = vstart;
idx[icount++] = vstart + 1;
idx[icount++] = vstart + 2;
idx[icount++] = vstart + 3;
idx[icount++] = vstart + 3;
};
m_vertex.start = m_vertex_stream_buffer.GetCurrentOffset() / sizeof(GSVertexPT1);
m_vertex.count = vcount;
m_index.start = m_index_stream_buffer.GetCurrentOffset() / sizeof(u32);
m_index.count = icount;
m_vertex_stream_buffer.CommitMemory(vcount * sizeof(GSVertexPT1));
m_index_stream_buffer.CommitMemory(icount * sizeof(u32));
SetVertexBuffer(m_vertex_stream_buffer.GetGPUPointer(), m_vertex_stream_buffer.GetSize(), sizeof(GSVertexPT1));
SetIndexBuffer(m_index_stream_buffer.GetGPUPointer(), m_index_stream_buffer.GetSize(), DXGI_FORMAT_R32_UINT);
// Even though we're batching, a cmdbuffer submit could've messed this up.
const GSVector4i rc(dTex->GetRect());
OMSetRenderTargets(dTex, nullptr, rc);
if (!InRenderPass())
BeginRenderPassForStretchRect(dTex, rc, rc, false);
SetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
SetUtilityTexture(rects[0].src, rects[0].linear ? m_linear_sampler_cpu : m_point_sampler_cpu);
SetPipeline(m_convert[static_cast<int>(shader)].get());
if (ApplyUtilityState())
DrawIndexedPrimitive();
}
void GSDevice12::BeginRenderPassForStretchRect(
GSTexture12* dTex, const GSVector4i& dtex_rc, const GSVector4i& dst_rc, bool allow_discard)
{
const D3D12_RENDER_PASS_BEGINNING_ACCESS_TYPE load_op = (allow_discard && dst_rc.eq(dtex_rc)) ?
D3D12_RENDER_PASS_BEGINNING_ACCESS_TYPE_DISCARD :
GetLoadOpForTexture(dTex);
dTex->SetState(GSTexture::State::Dirty);
if (dTex->GetType() != GSTexture::Type::DepthStencil)
@ -495,8 +608,8 @@ void GSDevice12::BeginRenderPassForStretchRect(GSTexture12* dTex, const GSVector
else
{
const float clear_depth = dTex->GetClearDepth();
BeginRenderPass(D3D12_RENDER_PASS_BEGINNING_ACCESS_TYPE_NO_ACCESS, D3D12_RENDER_PASS_ENDING_ACCESS_TYPE_NO_ACCESS,
load_op, D3D12_RENDER_PASS_ENDING_ACCESS_TYPE_PRESERVE,
BeginRenderPass(D3D12_RENDER_PASS_BEGINNING_ACCESS_TYPE_NO_ACCESS,
D3D12_RENDER_PASS_ENDING_ACCESS_TYPE_NO_ACCESS, load_op, D3D12_RENDER_PASS_ENDING_ACCESS_TYPE_PRESERVE,
D3D12_RENDER_PASS_BEGINNING_ACCESS_TYPE_NO_ACCESS, D3D12_RENDER_PASS_ENDING_ACCESS_TYPE_NO_ACCESS,
GSVector4::zero(), clear_depth);
}

View File

@ -254,8 +254,11 @@ public:
void PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect,
PresentShader shader, float shaderTime, bool linear) override;
void UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) override;
void DrawMultiStretchRects(const MultiStretchRect* rects, u32 num_rects, GSTexture* dTex, ShaderConvert shader) override;
void DoMultiStretchRects(const MultiStretchRect* rects, u32 num_rects, GSTexture12* dTex, ShaderConvert shader);
void BeginRenderPassForStretchRect(GSTexture12* dTex, const GSVector4i& dtex_rc, const GSVector4i& dst_rc);
void BeginRenderPassForStretchRect(
GSTexture12* dTex, const GSVector4i& dtex_rc, const GSVector4i& dst_rc, bool allow_discard = true);
void DoStretchRect(GSTexture12* sTex, const GSVector4& sRect, GSTexture12* dTex, const GSVector4& dRect,
const ID3D12PipelineState* pipeline, bool linear);
void DrawStretchRect(const GSVector4& sRect, const GSVector4& dRect, const GSVector2i& ds);

View File

@ -1310,6 +1310,90 @@ void GSDeviceOGL::DrawStretchRect(const GSVector4& sRect, const GSVector4& dRect
DrawPrimitive();
}
// Draws a list of stretch rects into dTex with the given convert shader.
//
// Consecutive rects that share the same source texture and filter mode are grouped
// and handed to DoMultiStretchRects() as one batch, so each run costs a single draw.
//
// rects     - rects to draw.
// num_rects - number of entries in rects; the call is a no-op when it is zero.
// dTex      - render target that receives the rects.
// shader    - index into m_convert.ps selecting the program to bind.
void GSDeviceOGL::DrawMultiStretchRects(
	const MultiStretchRect* rects, u32 num_rects, GSTexture* dTex, ShaderConvert shader)
{
	// rects[0] seeds the first batch below, so an empty list has to bail out before it is read.
	if (num_rects == 0)
		return;

	// Pipeline state shared by every batch.
	IASetPrimitiveTopology(GL_TRIANGLE_STRIP);
	OMSetDepthStencilState(m_convert.dss);
	OMSetBlendState(false);
	OMSetColorMaskState();
	OMSetRenderTargets(dTex, nullptr);
	m_convert.ps[static_cast<int>(shader)].Bind();

	const GSVector2 ds(static_cast<float>(dTex->GetWidth()), static_cast<float>(dTex->GetHeight()));

	GSTexture* last_tex = rects[0].src;
	bool last_linear = rects[0].linear;
	u32 first = 0;
	u32 count = 1;

	for (u32 i = 1; i < num_rects; i++)
	{
		if (rects[i].src == last_tex && rects[i].linear == last_linear)
		{
			count++;
			continue;
		}

		// Texture or filter changed: flush the run collected so far and start a new one at i.
		DoMultiStretchRects(rects + first, count, ds);
		last_tex = rects[i].src;
		last_linear = rects[i].linear;
		first += count;
		count = 1;
	}

	// Flush the trailing run.
	DoMultiStretchRects(rects + first, count, ds);
}
void GSDeviceOGL::DoMultiStretchRects(const MultiStretchRect* rects, u32 num_rects, const GSVector2& ds)
{
const u32 vertex_reserve_size = num_rects * 4 * sizeof(GSVertexPT1);
const u32 index_reserve_size = num_rects * 6 * sizeof(u32);
auto vertex_map = m_vertex_stream_buffer->Map(sizeof(GSVertexPT1), vertex_reserve_size);
auto index_map = m_index_stream_buffer->Map(sizeof(u32), index_reserve_size);
m_vertex.start = vertex_map.index_aligned;
m_index.start = index_map.index_aligned;
// Don't use primitive restart here, it ends up slower on some drivers.
GSVertexPT1* verts = reinterpret_cast<GSVertexPT1*>(vertex_map.pointer);
u32* idx = reinterpret_cast<u32*>(index_map.pointer);
u32 icount = 0;
u32 vcount = 0;
for (u32 i = 0; i < num_rects; i++)
{
const GSVector4& sRect = rects[i].src_rect;
const GSVector4& dRect = rects[i].dst_rect;
const float left = dRect.x * 2 / ds.x - 1.0f;
const float right = dRect.z * 2 / ds.x - 1.0f;
const float top = -1.0f + dRect.y * 2 / ds.y;
const float bottom = -1.0f + dRect.w * 2 / ds.y;
const u32 vstart = vcount;
verts[vcount++] = { GSVector4(left , top , 0.0f, 0.0f) , GSVector2(sRect.x , sRect.y) };
verts[vcount++] = { GSVector4(right , top , 0.0f, 0.0f) , GSVector2(sRect.z , sRect.y) };
verts[vcount++] = { GSVector4(left , bottom, 0.0f, 0.0f) , GSVector2(sRect.x , sRect.w) };
verts[vcount++] = { GSVector4(right , bottom, 0.0f, 0.0f) , GSVector2(sRect.z , sRect.w) };
if (i > 0)
idx[icount++] = vstart;
idx[icount++] = vstart;
idx[icount++] = vstart + 1;
idx[icount++] = vstart + 2;
idx[icount++] = vstart + 3;
idx[icount++] = vstart + 3;
};
m_vertex.count = vcount;
m_index.count = icount;
m_vertex_stream_buffer->Unmap(vcount * sizeof(GSVertexPT1));
m_index_stream_buffer->Unmap(icount * sizeof(u32));
PSSetShaderResource(0, rects[0].src);
PSSetSamplerState(rects[0].linear ? m_convert.ln : m_convert.pt);
DrawIndexedPrimitive();
}
void GSDeviceOGL::DoMerge(GSTexture* sTex[3], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, const GSRegPMODE& PMODE, const GSRegEXTBUF& EXTBUF, const GSVector4& c, const bool linear)
{
GL_PUSH("DoMerge");

View File

@ -338,6 +338,8 @@ public:
void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, const GL::Program& ps, bool alpha_blend, OMColorMaskSelector cms, bool linear = true);
void PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, PresentShader shader, float shaderTime, bool linear) final;
void UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) final;
void DrawMultiStretchRects(const MultiStretchRect* rects, u32 num_rects, GSTexture* dTex, ShaderConvert shader) final;
void DoMultiStretchRects(const MultiStretchRect* rects, u32 num_rects, const GSVector2& ds);
void RenderHW(GSHWDrawConfig& config) final;
void SendHWDraw(const GSHWDrawConfig& config, bool needs_barrier);

View File

@ -651,7 +651,7 @@ void GSDeviceVK::DoMultiStretchRects(
// Even though we're batching, a cmdbuffer submit could've messed this up.
const GSVector4i rc(dTex->GetRect());
OMSetRenderTargets(dTex, nullptr, dTex->GetRect(), false);
OMSetRenderTargets(dTex, nullptr, rc, false);
if (!InRenderPass() || !CheckRenderPassArea(rc))
BeginRenderPassForStretchRect(dTex, rc, rc, false);
SetUtilityTexture(rects[0].src, rects[0].linear ? m_linear_sampler : m_point_sampler);