MultiDrawIndirect draw batching - now down to <20us per draw.

This commit is contained in:
Ben Vanik 2015-01-04 11:20:42 -08:00
parent eda38a7428
commit 4fcf9c6229
10 changed files with 1120 additions and 601 deletions

View File

@ -35,6 +35,8 @@ class CircularBuffer {
void Shutdown();
GLuint handle() const { return buffer_; }
GLuint64 gpu_handle() const { return gpu_base_; }
size_t capacity() const { return capacity_; }
bool CanAcquire(size_t length);
Allocation Acquire(size_t length);

File diff suppressed because it is too large Load Diff

View File

@ -18,6 +18,7 @@
#include <vector>
#include <xenia/gpu/gl4/circular_buffer.h>
#include <xenia/gpu/gl4/draw_batcher.h>
#include <xenia/gpu/gl4/gl_context.h>
#include <xenia/gpu/gl4/gl4_shader.h>
#include <xenia/gpu/gl4/texture_cache.h>
@ -41,73 +42,6 @@ struct SwapParameters {
GLenum attachment;
};
// Block of per-draw values laid out for consumption by the shaders.
// This must match the layout in gl4_shader.cc.
struct UniformDataBlock {
// Four floats, addressable either as an array (v) or by component (x/y/z/w).
union float4 {
float v[4];
struct {
float x, y, z, w;
};
};
float4 window_offset; // tx,ty,sx,sy
float4 window_scissor; // x0,y0,x1,y1
float4 vtx_fmt;
float4 viewport_offset; // tx,ty,tz,?
float4 viewport_scale; // sx,sy,sz,?
// TODO(benvanik): vertex format xyzw?
float4 alpha_test; // alpha test enable, func, ref, ?
// TODO(benvanik): pack tightly
// One 64-bit handle per sampler slot (32 slots).
uint64_t texture_samplers[32];
// Register data from 0x4000 to 0x4927.
// UpdateConstants relies on the packing of these.
struct {
// SHADER_CONSTANT_000_X...
float4 float_consts[512];
// SHADER_CONSTANT_FETCH_00_0 is omitted
// SHADER_CONSTANT_BOOL_000_031...
int32_t bool_consts[8];
// SHADER_CONSTANT_LOOP_00...
int32_t loop_consts[32];
};
};
// Compile-time guard on the total block size.
// NOTE(review): the 16k bound presumably mirrors a GL buffer binding limit -
// confirm.
static_assert(sizeof(UniformDataBlock) <= 16 * 1024, "Need <=16k uniform data");
// Description of a single draw: primitive type, index range, optional index
// buffer, sampler inputs and a pointer to the per-draw state block.
// TODO(benvanik): move more of the enums in here?
struct DrawCommand {
PrimitiveType prim_type;
uint32_t start_index;
uint32_t min_index;
uint32_t max_index;
uint32_t index_count;
uint32_t base_vertex;
// Index buffer, if present.
// If index_count > 0 but buffer is nullptr then auto draw.
struct {
// Source index data and its size.
// NOTE(review): size is presumably in bytes - confirm.
const uint8_t* address;
size_t size;
xenos::Endian endianness;
xenos::IndexFormat format;
size_t buffer_offset;
} index_buffer;
// Texture samplers.
struct SamplerInput {
uint32_t input_index;
// TextureResource* texture;
// SamplerStateResource* sampler_state;
};
// One entry per sampler slot for each shader stage.
SamplerInput vertex_shader_samplers[32];
SamplerInput pixel_shader_samplers[32];
// NOTE: do not read from this - the mapped memory is likely write combined.
UniformDataBlock* state_data;
};
class CommandProcessor {
public:
CommandProcessor(GL4GraphicsSystem* graphics_system);
@ -241,22 +175,19 @@ class CommandProcessor {
bool LoadShader(ShaderType shader_type, const uint32_t* address,
uint32_t dword_count);
void PrepareDraw(DrawCommand* draw_command);
bool IssueDraw(DrawCommand* draw_command);
UpdateStatus UpdateRenderTargets(DrawCommand* draw_command);
UpdateStatus UpdateState(DrawCommand* draw_command);
UpdateStatus UpdateViewportState(DrawCommand* draw_command);
UpdateStatus UpdateRasterizerState(DrawCommand* draw_command);
UpdateStatus UpdateBlendState(DrawCommand* draw_command);
UpdateStatus UpdateDepthStencilState(DrawCommand* draw_command);
UpdateStatus UpdateConstants(DrawCommand* draw_command);
UpdateStatus UpdateShaders(DrawCommand* draw_command);
UpdateStatus PopulateIndexBuffer(DrawCommand* draw_command);
UpdateStatus PopulateVertexBuffers(DrawCommand* draw_command);
UpdateStatus PopulateSamplers(DrawCommand* draw_command);
UpdateStatus PopulateSampler(DrawCommand* draw_command,
const Shader::SamplerDesc& desc);
bool IssueCopy(DrawCommand* draw_command);
bool IssueDraw();
UpdateStatus UpdateShaders(PrimitiveType prim_type);
UpdateStatus UpdateRenderTargets();
UpdateStatus UpdateState();
UpdateStatus UpdateViewportState();
UpdateStatus UpdateRasterizerState();
UpdateStatus UpdateBlendState();
UpdateStatus UpdateDepthStencilState();
UpdateStatus PopulateIndexBuffer();
UpdateStatus PopulateVertexBuffers();
UpdateStatus PopulateSamplers();
UpdateStatus PopulateSampler(const Shader::SamplerDesc& desc);
bool IssueCopy();
CachedFramebuffer* GetFramebuffer(GLuint color_targets[4],
GLuint depth_target);
@ -306,21 +237,23 @@ class CommandProcessor {
std::vector<CachedDepthRenderTarget> cached_depth_render_targets_;
std::vector<std::unique_ptr<CachedPipeline>> all_pipelines_;
std::unordered_map<uint64_t, CachedPipeline*> cached_pipelines_;
GLuint vertex_array_;
GLuint point_list_geometry_program_;
GLuint rect_list_geometry_program_;
GLuint quad_list_geometry_program_;
struct {
xenos::IndexFormat format;
xenos::Endian endianness;
uint32_t count;
uint32_t guest_base;
size_t length;
uint32_t max_index_found;
} index_buffer_info_;
uint32_t draw_index_count_;
TextureCache texture_cache_;
DrawBatcher draw_batcher_;
CircularBuffer scratch_buffer_;
struct ScratchBufferStats {
size_t total_state_data_size = 0;
size_t total_indices_size = 0;
size_t total_vertices_size = 0;
} scratch_buffer_stats_;
DrawCommand draw_command_;
private:
bool SetShadowRegister(uint32_t& dest, uint32_t register_name);
@ -341,7 +274,6 @@ class CommandProcessor {
void Reset() { std::memset(this, 0, sizeof(*this)); }
} update_render_targets_regs_;
struct UpdateViewportStateRegisters {
//
UpdateViewportStateRegisters() { Reset(); }
void Reset() { std::memset(this, 0, sizeof(*this)); }
} update_viewport_state_regs_;
@ -367,7 +299,6 @@ class CommandProcessor {
UpdateDepthStencilStateRegisters() { Reset(); }
void Reset() { std::memset(this, 0, sizeof(*this)); }
} update_depth_stencil_state_regs_;
// TODO(benvanik): constant bitmask?
struct UpdateShadersRegisters {
PrimitiveType prim_type;
uint32_t sq_program_cntl;
@ -380,9 +311,6 @@ class CommandProcessor {
vertex_shader = pixel_shader = nullptr;
}
} update_shaders_regs_;
// ib
// vb
// samplers
};
} // namespace gl4

View File

@ -0,0 +1,384 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2014 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include <xenia/gpu/gl4/draw_batcher.h>
#include <poly/cxx_compat.h>
#include <poly/math.h>
#include <xenia/gpu/gl4/gl4_gpu-private.h>
#include <xenia/gpu/gpu-private.h>
namespace xe {
namespace gpu {
namespace gl4 {
using namespace xe::gpu::xenos;
extern "C" GLEWContext* glewGetContext();
// Capacity passed to the circular buffer that holds indirect draw commands
// (the buffer bound to GL_DRAW_INDIRECT_BUFFER in Initialize).
const size_t kCommandBufferCapacity = 16 * (1024 * 1024);
// Alignment passed to the command buffer; BeginDraw also rounds the
// per-command stride up to this value.
const size_t kCommandBufferAlignment = 4;
// Capacity passed to the circular buffer that holds per-draw state blocks.
const size_t kStateBufferCapacity = 64 * (1024 * 1024);
// Alignment passed to the state buffer. Flush binds sub-ranges of that buffer
// with glBindBufferRange(GL_SHADER_STORAGE_BUFFER, ...).
// NOTE(review): 256 presumably satisfies the implementation's required offset
// alignment for such bindings - confirm.
const size_t kStateBufferAlignment = 256;
// Constructs a batcher that reads guest GPU registers from |register_file|.
// No GL objects are touched here; Initialize() must be called before use.
DrawBatcher::DrawBatcher(RegisterFile* register_file)
    : register_file_(register_file),
      command_buffer_(kCommandBufferCapacity, kCommandBufferAlignment),
      state_buffer_(kStateBufferCapacity, kStateBufferAlignment),
      array_data_buffer_(nullptr),
      has_bindless_mdi_(false),
      draw_open_(false) {
  // Nothing is in flight yet.
  std::memset(&active_draw_, 0, sizeof(active_draw_));

  // Start with an empty batch. UINTPTR_MAX marks "no range recorded yet" and
  // is what CommitDraw checks to detect the first draw of a batch.
  std::memset(&batch_state_, 0, sizeof(batch_state_));
  batch_state_.state_range_start = UINTPTR_MAX;
  batch_state_.command_range_start = UINTPTR_MAX;
  // Strides have not been computed; force BeginDraw to do so.
  batch_state_.needs_reconfigure = true;
}
// Creates the command and state buffers and binds the command buffer as the
// indirect draw source. |array_data_buffer| is remembered so that Flush() can
// flush it together with the batcher's own buffers.
// Returns false if either buffer fails to initialize.
bool DrawBatcher::Initialize(CircularBuffer* array_data_buffer) {
  array_data_buffer_ = array_data_buffer;

  // The state buffer is only initialized if the command buffer succeeded.
  const bool buffers_ready =
      command_buffer_.Initialize() && state_buffer_.Initialize();
  if (!buffers_ready) {
    return false;
  }

  glBindBuffer(GL_DRAW_INDIRECT_BUFFER, command_buffer_.handle());

  // Opt in to the NVIDIA bindless path only when vendor extensions are allowed
  // and the extension is actually reported.
  if (FLAGS_vendor_gl_extensions && GLEW_NV_bindless_multi_draw_indirect) {
    has_bindless_mdi_ = true;
  }
  return true;
}
// Shuts down the two circular buffers owned by the batcher. The array data
// buffer passed to Initialize() is not owned and is left untouched.
void DrawBatcher::Shutdown() {
command_buffer_.Shutdown();
state_buffer_.Shutdown();
}
// Switches the batch to a new shader pair / pipeline object.
// If |pipeline| differs from the current one, pending draws are flushed first
// (they were recorded against the old pipeline). Returns false if that flush
// fails, in which case the batch keeps its previous pipeline.
bool DrawBatcher::ReconfigurePipeline(GL4Shader* vertex_shader,
                                      GL4Shader* pixel_shader,
                                      GLuint pipeline) {
  const bool pipeline_changed = batch_state_.pipeline != pipeline;
  if (pipeline_changed) {
    if (!Flush(FlushMode::kReconfigure)) {
      return false;
    }
    batch_state_.pipeline = pipeline;
    batch_state_.pixel_shader = pixel_shader;
    batch_state_.vertex_shader = vertex_shader;
  }
  return true;
}
// Opens a non-indexed draw of |index_count| vertices.
// A batch can only hold draws of one primitive type and one indexed-ness, so
// pending draws are flushed first if this draw does not fit the current batch.
// Returns false if flushing or allocating the draw fails.
bool DrawBatcher::BeginDrawArrays(PrimitiveType prim_type,
                                  uint32_t index_count) {
  assert_false(draw_open_);

  const bool fits_current_batch =
      batch_state_.prim_type == prim_type && !batch_state_.indexed;
  if (!fits_current_batch && !Flush(FlushMode::kReconfigure)) {
    return false;
  }
  batch_state_.indexed = false;
  batch_state_.prim_type = prim_type;

  if (!BeginDraw()) {
    return false;
  }

  // Fill in the indirect command for a single, non-instanced draw.
  auto arrays_cmd = active_draw_.draw_arrays_cmd;
  arrays_cmd->count = index_count;
  arrays_cmd->instance_count = 1;
  arrays_cmd->first_index = 0;
  arrays_cmd->base_instance = 0;
  return true;
}
// Opens an indexed draw of |index_count| indices of the given format.
// Pending draws are flushed first if the batch holds draws of a different
// primitive type, non-indexed draws, or indices of a different GL type.
// Returns false if flushing or allocating the draw fails.
bool DrawBatcher::BeginDrawElements(PrimitiveType prim_type,
                                    uint32_t index_count,
                                    IndexFormat index_format) {
  assert_false(draw_open_);

  GLenum index_type = GL_UNSIGNED_SHORT;
  if (index_format == IndexFormat::kInt32) {
    index_type = GL_UNSIGNED_INT;
  }

  const bool fits_current_batch = batch_state_.prim_type == prim_type &&
                                  batch_state_.indexed &&
                                  batch_state_.index_type == index_type;
  if (!fits_current_batch && !Flush(FlushMode::kReconfigure)) {
    return false;
  }
  batch_state_.index_type = index_type;
  batch_state_.indexed = true;
  batch_state_.prim_type = prim_type;

  if (!BeginDraw()) {
    return false;
  }

  // Only a zero guest index offset is expected here.
  uint32_t start_index = register_file_->values[XE_GPU_REG_VGT_INDX_OFFSET].u32;
  assert_zero(start_index);

  // Fill in the indirect command for a single, non-instanced draw.
  auto elements_cmd = active_draw_.draw_elements_cmd;
  elements_cmd->count = index_count;
  elements_cmd->instance_count = 1;
  elements_cmd->first_index = start_index;
  elements_cmd->base_vertex = 0;
  elements_cmd->base_instance = 0;

  // The bindless variant has an extra reserved field after the base command.
  if (has_bindless_mdi_) {
    active_draw_.draw_elements_bindless_cmd->reserved_zero = 0;
  }
  return true;
}
// Allocates the command and state blocks for a new draw and points
// active_draw_ at them.
//
// If the batch was reconfigured since the last draw, the per-draw command and
// state strides are recomputed first. If either circular buffer cannot hold
// another block, pending draws are flushed to make room.
//
// Returns false if such a flush fails; previously the flush result was ignored
// and the function went on to acquire from a buffer that had no room. On
// failure no draw is left open.
bool DrawBatcher::BeginDraw() {
  draw_open_ = true;

  if (batch_state_.needs_reconfigure) {
    batch_state_.needs_reconfigure = false;
    // Have been reconfigured since last draw - need to compute state size.
    // Layout:
    //   [draw command]
    //   [common header]
    //   [consts]
    // Padded to max.
    GLsizei command_size = 0;
    if (has_bindless_mdi_) {
      command_size = batch_state_.indexed
                         ? sizeof(DrawElementsIndirectBindlessCommandNV)
                         : sizeof(DrawArraysIndirectBindlessCommandNV);
    } else {
      command_size = batch_state_.indexed ? sizeof(DrawElementsIndirectCommand)
                                          : sizeof(DrawArraysIndirectCommand);
    }
    batch_state_.command_stride =
        poly::round_up(command_size, GLsizei(kCommandBufferAlignment));

    GLsizei header_size = sizeof(CommonHeader);
    // TODO(benvanik): consts sizing. The constants currently live inside
    // CommonHeader. Once they become variable length, compute
    // float/bool/loop const sizes here, record their offsets in
    // batch_state_.{float,bool,loop}_consts_offset (starting right after the
    // header) and add the total to the state stride.
    GLsizei consts_size = 0;
    batch_state_.state_stride = header_size + consts_size;
  }

  // Allocate a command data block.
  // We should treat it as write-only.
  if (!command_buffer_.CanAcquire(batch_state_.command_stride)) {
    if (!Flush(FlushMode::kMakeCoherent)) {
      // Flush() discards the open draw when it fails; make sure the batcher is
      // left closed regardless so the caller can start over.
      draw_open_ = false;
      return false;
    }
  }
  active_draw_.command_allocation =
      command_buffer_.Acquire(batch_state_.command_stride);
  assert_not_null(active_draw_.command_allocation.host_ptr);

  // Allocate a state data block.
  // We should treat it as write-only.
  if (!state_buffer_.CanAcquire(batch_state_.state_stride)) {
    if (!Flush(FlushMode::kMakeCoherent)) {
      draw_open_ = false;
      return false;
    }
  }
  active_draw_.state_allocation =
      state_buffer_.Acquire(batch_state_.state_stride);
  assert_not_null(active_draw_.state_allocation.host_ptr);

  // command_address aliases the typed command pointers through the union, so
  // this single store makes all of them point at the new command block.
  active_draw_.command_address =
      reinterpret_cast<uintptr_t>(active_draw_.command_allocation.host_ptr);
  auto state_host_ptr =
      reinterpret_cast<uintptr_t>(active_draw_.state_allocation.host_ptr);
  active_draw_.header = reinterpret_cast<CommonHeader*>(state_host_ptr);
  // TODO(benvanik): once constants are split out of the header, also set up
  // float/bool/loop const pointers at state_host_ptr + the offsets stored in
  // batch_state_.
  return true;
}
// Abandons the draw opened by BeginDrawArrays/BeginDrawElements, handing its
// command and state allocations back to their buffers. Does nothing when no
// draw is open.
void DrawBatcher::DiscardDraw() {
  if (draw_open_) {
    draw_open_ = false;
    command_buffer_.Discard(std::move(active_draw_.command_allocation));
    state_buffer_.Discard(std::move(active_draw_.state_allocation));
  }
}
// Finalizes the open draw: snapshots the shader constants into its state
// block, extends the batch's command/state ranges to cover it, commits both
// allocations and bumps the batch draw count. Always returns true.
bool DrawBatcher::CommitDraw() {
  assert_true(draw_open_);
  draw_open_ = false;

  // Copy over required constants.
  CopyConstants();

  auto& command_alloc = active_draw_.command_allocation;
  auto& state_alloc = active_draw_.state_allocation;

  // UINTPTR_MAX means no draw has been recorded in this batch yet, so this
  // draw's allocations define where the batch ranges begin.
  const bool first_in_batch = batch_state_.state_range_start == UINTPTR_MAX;
  if (first_in_batch) {
    batch_state_.state_range_start = state_alloc.offset;
    batch_state_.command_range_start = command_alloc.offset;
  }
  batch_state_.state_range_length += state_alloc.aligned_length;
  batch_state_.command_range_length += command_alloc.aligned_length;

  command_buffer_.Commit(std::move(command_alloc));
  state_buffer_.Commit(std::move(state_alloc));

  batch_state_.draw_count++;
  return true;
}
// Issues all committed draws of the current batch as one multi-draw-indirect
// call and resets the batch.
//
// |mode| == FlushMode::kReconfigure additionally marks the batch as needing
// its strides recomputed on the next BeginDraw.
//
// Returns false if the batch's primitive type cannot be drawn. In that case
// any open draw is discarded and the pending draws are dropped. Previously
// the batch was left intact on this path, so every later Flush() hit the same
// unsupported batch and failed again.
bool DrawBatcher::Flush(FlushMode mode) {
  bool batch_issued = true;

  if (batch_state_.draw_count) {
    SCOPE_profile_cpu_f("gpu");

    assert_not_zero(batch_state_.command_stride);
    assert_not_zero(batch_state_.state_stride);

    // Flush pending buffer changes.
    command_buffer_.Flush();
    state_buffer_.Flush();
    array_data_buffer_->Flush();

    // State data is indexed by draw ID.
    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, state_buffer_.handle(),
                      batch_state_.state_range_start,
                      batch_state_.state_range_length);

    // Map the guest primitive type onto the GL primitive to draw with.
    GLenum prim_type = 0;
    bool prim_type_supported = true;
    switch (batch_state_.prim_type) {
      case PrimitiveType::kPointList:
        prim_type = GL_POINTS;
        break;
      case PrimitiveType::kLineList:
        prim_type = GL_LINES;
        break;
      case PrimitiveType::kLineStrip:
        prim_type = GL_LINE_STRIP;
        break;
      case PrimitiveType::kLineLoop:
        prim_type = GL_LINE_LOOP;
        break;
      case PrimitiveType::kTriangleList:
        prim_type = GL_TRIANGLES;
        break;
      case PrimitiveType::kTriangleStrip:
        prim_type = GL_TRIANGLE_STRIP;
        break;
      case PrimitiveType::kTriangleFan:
        prim_type = GL_TRIANGLE_FAN;
        break;
      case PrimitiveType::kRectangleList:
        prim_type = GL_TRIANGLE_STRIP;
        // Rect lists aren't culled. There may be other things they skip too.
        // assert_true(
        //     (register_file_->values[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 &
        //      0x3) == 0);
        break;
      case PrimitiveType::kQuadList:
        prim_type = GL_LINES_ADJACENCY;
        break;
      default:
      case PrimitiveType::kUnknown0x07:
        prim_type_supported = false;
        break;
    }

    if (prim_type_supported) {
      // The indirect "pointer" is a byte offset into the buffer bound to
      // GL_DRAW_INDIRECT_BUFFER (the command buffer).
      void* indirect_offset =
          reinterpret_cast<void*>(batch_state_.command_range_start);
      if (has_bindless_mdi_) {
        int vertex_buffer_count =
            batch_state_.vertex_shader->buffer_inputs().total_elements_count;
        assert_true(vertex_buffer_count < 8);
        if (batch_state_.indexed) {
          glMultiDrawElementsIndirectBindlessNV(
              prim_type, batch_state_.index_type, indirect_offset,
              batch_state_.draw_count, batch_state_.command_stride,
              vertex_buffer_count);
        } else {
          glMultiDrawArraysIndirectBindlessNV(
              prim_type, indirect_offset, batch_state_.draw_count,
              batch_state_.command_stride, vertex_buffer_count);
        }
      } else {
        if (batch_state_.indexed) {
          glMultiDrawElementsIndirect(prim_type, batch_state_.index_type,
                                      indirect_offset, batch_state_.draw_count,
                                      batch_state_.command_stride);
        } else {
          glMultiDrawArraysIndirect(prim_type, indirect_offset,
                                    batch_state_.draw_count,
                                    batch_state_.command_stride);
        }
      }
    } else {
      // Scoped enums are not promoted when passed through varargs, so convert
      // explicitly for the %d format.
      XELOGE("unsupported primitive type %d",
             static_cast<int>(batch_state_.prim_type));
      assert_unhandled_case(batch_state_.prim_type);
      DiscardDraw();
      batch_issued = false;
    }

    // The batch is finished either way: it was drawn, or it can never be.
    batch_state_.command_range_start = UINTPTR_MAX;
    batch_state_.command_range_length = 0;
    batch_state_.state_range_start = UINTPTR_MAX;
    batch_state_.state_range_length = 0;
    batch_state_.draw_count = 0;
  }

  if (!batch_issued) {
    return false;
  }

  if (mode == FlushMode::kReconfigure) {
    // Reset - we'll update it as soon as we have all the information.
    batch_state_.needs_reconfigure = true;
  }
  return true;
}
// Snapshots the guest shader constant registers (float, bool and loop
// constants) into the state block of the draw being committed.
void DrawBatcher::CopyConstants() {
  // TODO(benvanik): partial updates, etc. We could use shader constant access
  // knowledge that we get at compile time to only upload those constants
  // required. If we did this as a variable length then we could really cut
  // down on state block sizes.
  auto* state_header = active_draw_.header;
  const auto& regs = register_file_->values;

  // Each destination array's own size decides how many bytes are copied; the
  // source is the first register of the corresponding range.
  std::memcpy(state_header->float_consts,
              &regs[XE_GPU_REG_SHADER_CONSTANT_000_X].f32,
              sizeof(state_header->float_consts));
  std::memcpy(state_header->bool_consts,
              &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].f32,
              sizeof(state_header->bool_consts));
  std::memcpy(state_header->loop_consts,
              &regs[XE_GPU_REG_SHADER_CONSTANT_LOOP_00].f32,
              sizeof(state_header->loop_consts));
}
} // namespace gl4
} // namespace gpu
} // namespace xe

View File

@ -0,0 +1,230 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2014 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_GL4_GL4_STATE_DATA_BUILDER_H_
#define XENIA_GPU_GL4_GL4_STATE_DATA_BUILDER_H_
#include <xenia/common.h>
#include <xenia/gpu/gl4/circular_buffer.h>
#include <xenia/gpu/gl4/gl_context.h>
#include <xenia/gpu/gl4/gl4_shader.h>
#include <xenia/gpu/register_file.h>
#include <xenia/gpu/xenos.h>
namespace xe {
namespace gpu {
namespace gl4 {
// Four floats, addressable either as an array (v) or by component (x/y/z/w).
union float4 {
float v[4];
struct {
float x, y, z, w;
};
};
// The structures below are written into the indirect command buffer, so they
// are packed to 4 bytes to avoid compiler-inserted padding.
#pragma pack(push, 4)
// Command record for a non-indexed indirect draw (glMultiDrawArraysIndirect).
struct DrawArraysIndirectCommand {
GLuint count;
GLuint instance_count;
GLuint first_index;
GLuint base_instance;
};
// Command record for an indexed indirect draw (glMultiDrawElementsIndirect).
struct DrawElementsIndirectCommand {
GLuint count;
GLuint instance_count;
GLuint first_index;
GLint base_vertex;
GLuint base_instance;
};
// A buffer reference used by the NV bindless commands: a slot index plus a GPU
// address and length.
struct BindlessPtrNV {
GLuint index;
GLuint reserved_zero;
GLuint64 address;
GLuint64 length;
};
// Bindless variant of the non-indexed command: the base command followed by up
// to 8 vertex buffer references.
struct DrawArraysIndirectBindlessCommandNV {
DrawArraysIndirectCommand cmd;
// NOTE: per the original author the extension spec is wrong here; the
// reserved field is deliberately left out of this structure.
// GLuint reserved_zero;
BindlessPtrNV vertex_buffers[8];
};
// Bindless variant of the indexed command: the base command, a reserved word,
// the index buffer reference and up to 8 vertex buffer references.
struct DrawElementsIndirectBindlessCommandNV {
DrawElementsIndirectCommand cmd;
GLuint reserved_zero;
BindlessPtrNV index_buffer;
BindlessPtrNV vertex_buffers[8];
};
#pragma pack(pop)
// Accumulates compatible draws and issues them together as a single
// multi-draw-indirect call.
//
// Usage per draw: BeginDrawArrays()/BeginDrawElements(), then the set_*()
// methods to fill in the draw's state block and buffers, then CommitDraw() (or
// DiscardDraw() to abandon it). Flush() submits everything committed so far.
//
// The set_*() methods write through pointers into the currently open draw and
// must only be called between a successful Begin* and Commit/Discard.
class DrawBatcher {
 public:
  // Why a flush is being requested. Only kReconfigure changes Flush()'s
  // behavior: it forces stride recomputation on the next draw.
  enum class FlushMode {
    kMakeCoherent,
    kStateChange,
    kReconfigure,
  };

  // |register_file| is the guest register file that constants and draw
  // parameters are read from. Explicit so that a RegisterFile* is never
  // silently converted into a batcher.
  explicit DrawBatcher(RegisterFile* register_file);

  bool Initialize(CircularBuffer* array_data_buffer);
  void Shutdown();

  // Primitive type of the draws in the current batch.
  PrimitiveType prim_type() const { return batch_state_.prim_type; }

  // Stored in window_offset.xy (the scalar pair lives in .zw).
  void set_window_offset(uint32_t x, uint32_t y) {
    active_draw_.header->window_offset.x = float(x);
    active_draw_.header->window_offset.y = float(y);
  }
  // Stored as (left, top, right, bottom) in window_scissor.xyzw.
  void set_window_scissor(uint32_t left, uint32_t top, uint32_t right,
                          uint32_t bottom) {
    active_draw_.header->window_scissor.x = float(left);
    active_draw_.header->window_scissor.y = float(top);
    active_draw_.header->window_scissor.z = float(right);
    active_draw_.header->window_scissor.w = float(bottom);
  }
  // Stored in window_offset.zw (the offset pair lives in .xy).
  void set_window_scalar(float width_scalar, float height_scalar) {
    active_draw_.header->window_offset.z = width_scalar;
    active_draw_.header->window_offset.w = height_scalar;
  }
  // Writes viewport_offset.xyz; .w is left untouched.
  void set_viewport_offset(float offset_x, float offset_y, float offset_z) {
    active_draw_.header->viewport_offset.x = offset_x;
    active_draw_.header->viewport_offset.y = offset_y;
    active_draw_.header->viewport_offset.z = offset_z;
  }
  // Writes viewport_scale.xyz; .w is left untouched.
  void set_viewport_scale(float scale_x, float scale_y, float scale_z) {
    active_draw_.header->viewport_scale.x = scale_x;
    active_draw_.header->viewport_scale.y = scale_y;
    active_draw_.header->viewport_scale.z = scale_z;
  }
  // |xy| is replicated into both the x and y lanes of vtx_fmt.
  void set_vtx_fmt(float xy, float z, float w) {
    active_draw_.header->vtx_fmt.x = xy;
    active_draw_.header->vtx_fmt.y = xy;
    active_draw_.header->vtx_fmt.z = z;
    active_draw_.header->vtx_fmt.w = w;
  }
  // Stored as (enabled ? 1 : 0, func, ref) in alpha_test.xyz.
  void set_alpha_test(bool enabled, uint32_t func, float ref) {
    active_draw_.header->alpha_test.x = enabled ? 1.0f : 0.0f;
    active_draw_.header->alpha_test.y = float(func);
    active_draw_.header->alpha_test.z = ref;
  }
  // |index| must be within the 32 sampler slots of the header; it is not
  // range checked here.
  void set_texture_sampler(int index, GLuint64 handle) {
    active_draw_.header->texture_samplers[index] = handle;
  }

  // Points the open indexed draw at its index data.
  void set_index_buffer(const CircularBuffer::Allocation& allocation) {
    if (has_bindless_mdi_) {
      // Bindless: the command carries the GPU address of the indices.
      auto& ptr = active_draw_.draw_elements_bindless_cmd->index_buffer;
      ptr.reserved_zero = 0;
      ptr.index = 0;
      ptr.address = allocation.gpu_ptr;
      ptr.length = allocation.length;
    } else {
      // Offset is used in glDrawElements. The allocation's byte offset is
      // converted into an element index using the batch's index size.
      auto& cmd = active_draw_.draw_elements_cmd;
      size_t index_size = batch_state_.index_type == GL_UNSIGNED_SHORT ? 2 : 4;
      cmd->first_index = GLuint(allocation.offset / index_size);
    }
  }

  // Points vertex buffer slot |index| of the open draw at
  // |allocation| + |offset|. Only has an effect on the bindless path; |stride|
  // is currently unused.
  void set_vertex_buffer(int index, GLsizei offset, GLsizei stride,
                         const CircularBuffer::Allocation& allocation) {
    if (has_bindless_mdi_) {
      BindlessPtrNV* ptr;
      if (batch_state_.indexed) {
        ptr = &active_draw_.draw_elements_bindless_cmd->vertex_buffers[index];
      } else {
        ptr = &active_draw_.draw_arrays_bindless_cmd->vertex_buffers[index];
      }
      ptr->reserved_zero = 0;
      ptr->index = index;
      ptr->address = allocation.gpu_ptr + offset;
      ptr->length = allocation.length - offset;
    }
  }

  bool ReconfigurePipeline(GL4Shader* vertex_shader, GL4Shader* pixel_shader,
                           GLuint pipeline);

  bool BeginDrawArrays(PrimitiveType prim_type, uint32_t index_count);
  bool BeginDrawElements(PrimitiveType prim_type, uint32_t index_count,
                         xenos::IndexFormat index_format);
  void DiscardDraw();
  bool CommitDraw();
  bool Flush(FlushMode mode);

 private:
  bool BeginDraw();
  void CopyConstants();

  RegisterFile* register_file_;
  // Indirect draw commands, one per committed draw.
  CircularBuffer command_buffer_;
  // Per-draw state blocks (CommonHeader), one per committed draw.
  CircularBuffer state_buffer_;
  // Not owned; flushed together with the buffers above in Flush().
  CircularBuffer* array_data_buffer_;

  // True when the NV bindless multi-draw-indirect path is in use.
  bool has_bindless_mdi_;

  // Configuration shared by every draw in the current batch plus the extent
  // of the batch inside the command/state buffers.
  struct BatchState {
    // Strides must be recomputed by the next BeginDraw().
    bool needs_reconfigure;
    PrimitiveType prim_type;
    bool indexed;
    GLenum index_type;
    GL4Shader* vertex_shader;
    GL4Shader* pixel_shader;
    GLuint pipeline;

    GLsizei command_stride;
    GLsizei state_stride;
    GLsizei float_consts_offset;
    GLsizei bool_consts_offset;
    GLsizei loop_consts_offset;

    // Start is UINTPTR_MAX while the batch is empty.
    uintptr_t command_range_start;
    uintptr_t command_range_length;
    uintptr_t state_range_start;
    uintptr_t state_range_length;
    GLsizei draw_count;
  } batch_state_;

  // This must match GL4Shader's header.
  struct CommonHeader {
    float4 window_offset;    // tx,ty,sx,sy
    float4 window_scissor;   // x0,y0,x1,y1
    float4 viewport_offset;  // tx,ty,tz,?
    float4 viewport_scale;   // sx,sy,sz,?
    float4 vtx_fmt;          //
    float4 alpha_test;       // alpha test enable, func, ref, ?

    // TODO(benvanik): pack tightly
    GLuint64 texture_samplers[32];

    float4 float_consts[512];
    uint32_t bool_consts[8];
    uint32_t loop_consts[32];
  };

  // The draw currently being built between Begin* and Commit/Discard.
  struct {
    CircularBuffer::Allocation command_allocation;
    CircularBuffer::Allocation state_allocation;

    // All views of the same command block; BeginDraw() sets command_address
    // and the typed pointers alias it.
    union {
      DrawArraysIndirectCommand* draw_arrays_cmd;
      DrawElementsIndirectCommand* draw_elements_cmd;
      DrawArraysIndirectBindlessCommandNV* draw_arrays_bindless_cmd;
      DrawElementsIndirectBindlessCommandNV* draw_elements_bindless_cmd;
      uintptr_t command_address;
    };

    CommonHeader* header;
  } active_draw_;
  bool draw_open_;
};
} // namespace gl4
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_GL4_GL4_STATE_DATA_BUILDER_H_

View File

@ -11,6 +11,7 @@
#include <poly/cxx_compat.h>
#include <poly/math.h>
#include <xenia/gpu/gl4/gl4_gpu-private.h>
#include <xenia/gpu/gl4/gl4_shader_translator.h>
#include <xenia/gpu/gpu-private.h>
@ -18,6 +19,8 @@ namespace xe {
namespace gpu {
namespace gl4 {
using namespace xe::gpu::xenos;
extern "C" GLEWContext* glewGetContext();
// Stateful, but minimally.
@ -25,41 +28,147 @@ thread_local GL4ShaderTranslator shader_translator_;
GL4Shader::GL4Shader(ShaderType shader_type, uint64_t data_hash,
const uint32_t* dword_ptr, uint32_t dword_count)
: Shader(shader_type, data_hash, dword_ptr, dword_count), program_(0) {}
: Shader(shader_type, data_hash, dword_ptr, dword_count),
program_(0),
vao_(0) {}
GL4Shader::~GL4Shader() { glDeleteProgram(program_); }
GL4Shader::~GL4Shader() {
glDeleteProgram(program_);
glDeleteVertexArrays(1, &vao_);
}
// GLSL preamble shared by the generated vertex and pixel shader sources: the
// version/extension directives, default layouts, the StateData and VertexData
// structures and the State buffer block at binding 0.
// NOTE(review): StateData here contains fetch_consts, while the
// UniformDataBlock struct in command_processor.h notes that the fetch
// constants are omitted - confirm the two layouts actually agree.
const std::string header =
"#version 450\n"
"#extension all : warn\n"
"#extension GL_ARB_bindless_texture : require\n"
"#extension GL_ARB_explicit_uniform_location : require\n"
"#extension GL_ARB_shading_language_420pack : require\n"
"#extension GL_ARB_shader_storage_buffer_object : require\n"
"precision highp float;\n"
"precision highp int;\n"
"layout(std140, column_major) uniform;\n"
"layout(std430, column_major) buffer;\n"
"struct StateData {\n"
" vec4 window_offset;\n"
" vec4 window_scissor;\n"
" vec4 vtx_fmt;\n"
" vec4 viewport_offset;\n"
" vec4 viewport_scale;\n"
" vec4 alpha_test;\n"
" uvec2 texture_samplers[32];\n"
" vec4 float_consts[512];\n"
" uint fetch_consts[32 * 6];\n"
" int bool_consts[8];\n"
" int loop_consts[32];\n"
"};\n"
"struct VertexData {\n"
" vec4 o[16];\n"
"};\n"
"\n"
"layout(binding = 0) buffer State {\n"
" StateData state;\n"
"};\n";
// Returns the GLSL preamble prepended to every generated shader: the
// version/extension directives, default layouts, the per-draw StateData
// structure, the State buffer block (an array of StateData at binding 0) and
// the VertexData structure. The text is built once and a copy is returned on
// each call.
std::string GL4Shader::GetHeader() {
static const std::string header =
"#version 450\n"
"#extension all : warn\n"
"#extension GL_ARB_bindless_texture : require\n"
"#extension GL_ARB_explicit_uniform_location : require\n"
"#extension GL_ARB_shader_draw_parameters : require\n"
"#extension GL_ARB_shader_storage_buffer_object : require\n"
"#extension GL_ARB_shading_language_420pack : require\n"
"precision highp float;\n"
"precision highp int;\n"
"layout(std140, column_major) uniform;\n"
"layout(std430, column_major) buffer;\n"
"\n"
// This must match DrawBatcher::CommonHeader.
"struct StateData {\n"
" vec4 window_offset;\n"
" vec4 window_scissor;\n"
" vec4 viewport_offset;\n"
" vec4 viewport_scale;\n"
" vec4 vtx_fmt;\n"
" vec4 alpha_test;\n"
// TODO(benvanik): variable length.
" uvec2 texture_samplers[32];\n"
" vec4 float_consts[512];\n"
" int bool_consts[8];\n"
" int loop_consts[32];\n"
"};\n"
// One StateData entry per draw in the batch; the shader mains generated in
// this file index it with the draw ID.
"layout(binding = 0) buffer State {\n"
" StateData states[];\n"
"};\n"
"\n"
"struct VertexData {\n"
" vec4 o[16];\n"
"};\n";
return header;
}
bool GL4Shader::PrepareVertexArrayObject() {
glCreateVertexArrays(1, &vao_);
bool has_bindless_vbos = false;
if (FLAGS_vendor_gl_extensions && GLEW_NV_vertex_buffer_unified_memory) {
has_bindless_vbos = true;
// Nasty, but no DSA for this.
glBindVertexArray(vao_);
glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
}
uint32_t el_index = 0;
for (uint32_t buffer_index = 0; buffer_index < buffer_inputs_.count;
++buffer_index) {
const auto& desc = buffer_inputs_.descs[buffer_index];
for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
const auto& el = desc.elements[i];
auto comp_count = GetVertexFormatComponentCount(el.format);
GLenum comp_type;
switch (el.format) {
case VertexFormat::k_8_8_8_8:
comp_type = el.is_signed ? GL_BYTE : GL_UNSIGNED_BYTE;
break;
case VertexFormat::k_2_10_10_10:
comp_type = el.is_signed ? GL_INT_2_10_10_10_REV
: GL_UNSIGNED_INT_2_10_10_10_REV;
break;
case VertexFormat::k_10_11_11:
assert_false(el.is_signed);
comp_type = GL_UNSIGNED_INT_10F_11F_11F_REV;
break;
/*case VertexFormat::k_11_11_10:
break;*/
case VertexFormat::k_16_16:
comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT;
break;
case VertexFormat::k_16_16_FLOAT:
comp_type = GL_HALF_FLOAT;
break;
case VertexFormat::k_16_16_16_16:
comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT;
break;
case VertexFormat::k_16_16_16_16_FLOAT:
comp_type = GL_HALF_FLOAT;
break;
case VertexFormat::k_32:
comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT;
break;
case VertexFormat::k_32_32:
comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT;
break;
case VertexFormat::k_32_32_32_32:
comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT;
break;
case VertexFormat::k_32_FLOAT:
comp_type = GL_FLOAT;
break;
case VertexFormat::k_32_32_FLOAT:
comp_type = GL_FLOAT;
break;
case VertexFormat::k_32_32_32_FLOAT:
comp_type = GL_FLOAT;
break;
case VertexFormat::k_32_32_32_32_FLOAT:
comp_type = GL_FLOAT;
break;
default:
assert_unhandled_case(el.format);
return false;
}
glEnableVertexArrayAttrib(vao_, el_index);
if (has_bindless_vbos) {
// NOTE: MultiDrawIndirectBindlessMumble doesn't handle separate
// vertex bindings/formats.
glVertexAttribFormat(el_index, comp_count, comp_type, el.is_normalized,
el.offset_words * 4);
glVertexArrayVertexBuffer(vao_, el_index, 0, 0, desc.stride_words * 4);
} else {
glVertexArrayAttribBinding(vao_, el_index, buffer_index);
glVertexArrayAttribFormat(vao_, el_index, comp_count, comp_type,
el.is_normalized, el.offset_words * 4);
}
}
}
if (has_bindless_vbos) {
glBindVertexArray(0);
}
return true;
}
bool GL4Shader::PrepareVertexShader(
const xenos::xe_gpu_program_cntl_t& program_cntl) {
@ -68,8 +177,14 @@ bool GL4Shader::PrepareVertexShader(
}
has_prepared_ = true;
// Build static vertex array descriptor.
if (!PrepareVertexArrayObject()) {
PLOGE("Unable to prepare vertex shader array object");
return false;
}
std::string apply_transform =
"vec4 applyTransform(vec4 pos) {\n"
"vec4 applyTransform(const in StateData state, vec4 pos) {\n"
" // Clip->NDC with perspective divide.\n"
" // We do this here because it's programmable on the 360.\n"
" float w = pos.w;\n"
@ -107,14 +222,15 @@ bool GL4Shader::PrepareVertexShader(
" return pos;\n"
"}\n";
std::string source =
header + apply_transform +
GetHeader() + apply_transform +
"out gl_PerVertex {\n"
" vec4 gl_Position;\n"
" float gl_PointSize;\n"
" float gl_ClipDistance[];\n"
"};\n"
"layout(location = 0) out VertexData vtx;\n"
"void processVertex();\n"
"layout(location = 0) flat out uint draw_id;\n"
"layout(location = 1) out VertexData vtx;\n"
"void processVertex(const in StateData state);\n"
"void main() {\n" +
(alloc_counts().positions ? " gl_Position = vec4(0.0, 0.0, 0.0, 1.0);\n"
: "") +
@ -122,8 +238,10 @@ bool GL4Shader::PrepareVertexShader(
" for (int i = 0; i < vtx.o.length(); ++i) {\n"
" vtx.o[i] = vec4(0.0, 0.0, 0.0, 0.0);\n"
" }\n"
" processVertex();\n"
" gl_Position = applyTransform(gl_Position);\n"
" const StateData state = states[gl_DrawIDARB];\n"
" processVertex(state);\n"
" gl_Position = applyTransform(state, gl_Position);\n"
" draw_id = gl_DrawIDARB;\n"
"}\n";
std::string translated_source =
@ -149,12 +267,14 @@ bool GL4Shader::PreparePixelShader(
}
has_prepared_ = true;
std::string source = header +
"layout(location = 0) in VertexData vtx;\n"
std::string source = GetHeader() +
"layout(location = 0) flat in uint draw_id;\n"
"layout(location = 1) in VertexData vtx;\n"
"layout(location = 0) out vec4 oC[4];\n"
"void processFragment();\n"
"void processFragment(const in StateData state);\n"
"void main() {\n" +
" processFragment();\n"
" const StateData state = states[draw_id];\n"
" processFragment(state);\n"
"}\n";
std::string translated_source =

View File

@ -10,6 +10,8 @@
#ifndef XENIA_GPU_GL4_GL4_SHADER_H_
#define XENIA_GPU_GL4_GL4_SHADER_H_
#include <string>
#include <xenia/common.h>
#include <xenia/gpu/gl4/gl_context.h>
#include <xenia/gpu/shader.h>
@ -25,14 +27,18 @@ class GL4Shader : public Shader {
~GL4Shader() override;
GLuint program() const { return program_; }
GLuint vao() const { return vao_; }
bool PrepareVertexShader(const xenos::xe_gpu_program_cntl_t& program_cntl);
bool PreparePixelShader(const xenos::xe_gpu_program_cntl_t& program_cntl);
protected:
std::string GetHeader();
bool PrepareVertexArrayObject();
bool CompileProgram(std::string source);
GLuint program_;
GLuint vao_;
};
} // namespace gl4

View File

@ -91,7 +91,7 @@ std::string GL4ShaderTranslator::TranslateVertexShader(
const auto& alloc_counts = vertex_shader->alloc_counts();
// Vertex shader main() header.
Append("void processVertex() {\n");
Append("void processVertex(const in StateData state) {\n");
// Add temporaries for any registers we may use.
uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs;
@ -126,7 +126,7 @@ std::string GL4ShaderTranslator::TranslatePixelShader(
// (and less than the number of required registers), things may die.
// Pixel shader main() header.
Append("void processFragment() {\n");
Append("void processFragment(const in StateData state) {\n");
// Add temporary registers.
uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs;

View File

@ -132,12 +132,15 @@ std::unique_ptr<GLContext> GLContext::CreateShared() {
GLContextLock context_lock(this);
int context_flags = 0;
//int profile = WGL_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB;
int profile = WGL_CONTEXT_CORE_PROFILE_BIT_ARB;
#if DEBUG
context_flags |= WGL_CONTEXT_DEBUG_BIT_ARB;
#endif // DEBUG
int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, //
WGL_CONTEXT_MINOR_VERSION_ARB, 5, //
WGL_CONTEXT_FLAGS_ARB, context_flags, //
#endif // DEBUG
int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, //
WGL_CONTEXT_MINOR_VERSION_ARB, 5, //
WGL_CONTEXT_FLAGS_ARB, context_flags, //
WGL_CONTEXT_PROFILE_MASK_ARB, profile, //
0};
new_glrc = wglCreateContextAttribsARB(dc_, glrc_, attrib_list);
if (!new_glrc) {

View File

@ -5,6 +5,8 @@
'circular_buffer.h',
'command_processor.cc',
'command_processor.h',
'draw_batcher.cc',
'draw_batcher.h',
'gl4_gpu-private.h',
'gl4_gpu.cc',
'gl4_gpu.h',