GL context on command processor.

Ben Vanik 2014-12-23 20:32:41 -08:00
parent e8de42d9ea
commit 02d52167d3
7 changed files with 569 additions and 62 deletions

View File

@ -12,6 +12,7 @@
#include <algorithm>
#include <poly/logging.h>
#include <poly/math.h>
#include <xenia/gpu/gl4/gl4_graphics_system.h>
#include <xenia/gpu/gpu-private.h>
#include <xenia/gpu/xenos.h>
@ -27,6 +28,8 @@ namespace gl4 {
using namespace xe::gpu::xenos;
extern "C" extern "C" GLEWContext* glewGetContext();
CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system)
: memory_(graphics_system->memory()),
membase_(graphics_system->memory()->membase()),
@ -46,6 +49,7 @@ CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system)
bin_mask_(0xFFFFFFFFull),
active_vertex_shader_(nullptr),
active_pixel_shader_(nullptr) {
std::memset(&draw_command_, 0, sizeof(draw_command_));
LARGE_INTEGER perf_counter;
QueryPerformanceCounter(&perf_counter);
time_base_ = perf_counter.QuadPart;
@ -59,33 +63,37 @@ uint64_t CommandProcessor::QueryTime() {
return perf_counter.QuadPart - time_base_;
}
void CommandProcessor::Initialize(uint32_t ptr, uint32_t page_count) {
primary_buffer_ptr_ = ptr;
// Not sure this is correct, but it's a way to take the page_count back to
// the number of bytes allocated by the physical alloc.
uint32_t original_size = 1 << (0x1C - page_count - 1);
primary_buffer_size_ = original_size;
read_ptr_index_ = 0;
bool CommandProcessor::Initialize(std::unique_ptr<GLContext> context) {
context_ = std::move(context);
worker_running_ = true;
worker_thread_ = std::thread([this]() {
poly::threading::set_name("GL4 Worker");
xe::Profiler::ThreadEnter("GL4 Worker");
context_->MakeCurrent();
WorkerMain();
xe::Profiler::ThreadExit();
});
return true;
}
void CommandProcessor::Shutdown() {
worker_running_ = false;
SetEvent(write_ptr_index_event_);
worker_thread_.join();
context_.reset();
all_shaders_.clear();
shader_cache_.clear();
}
void CommandProcessor::WorkerMain() {
if (!SetupGL()) {
PFATAL("Unable to setup command processor GL state");
return;
}
while (worker_running_) {
uint32_t write_ptr_index = write_ptr_index_.load();
while (write_ptr_index == 0xBAADF00D ||
@ -94,6 +102,7 @@ void CommandProcessor::WorkerMain() {
// We wait a short bit here to yield time. Since we are also running the
// main window display we don't want to pause too long, though.
// YieldProcessor();
PrepareForWait();
const int wait_time_ms = 5;
if (WaitForSingleObject(write_ptr_index_event_, wait_time_ms) ==
WAIT_TIMEOUT) {
@ -117,6 +126,28 @@ void CommandProcessor::WorkerMain() {
read_ptr_index_);
}
}
ShutdownGL();
}
bool CommandProcessor::SetupGL() {
// Uniform buffer that stores the per-draw state (constants, etc).
glGenBuffers(1, &uniform_data_buffer_);
glNamedBufferStorage(uniform_data_buffer_, 16 * 1024, nullptr, GL_MAP_WRITE_BIT);
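// Note: the GL_MAP_WRITE_BIT storage flag is what later allows UpdateState to
// map this buffer with glMapNamedBufferRange for writing; immutable storage
// can only be mapped with the access bits it was created with.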
return true;
}
void CommandProcessor::ShutdownGL() {
glDeleteBuffers(1, &uniform_data_buffer_);
}
void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t page_count) {
primary_buffer_ptr_ = ptr;
// Not sure this is correct, but it's a way to take the page_count back to
// the number of bytes allocated by the physical alloc.
uint32_t original_size = 1 << (0x1C - page_count - 1);
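// For example, a hypothetical page_count of 0x0C gives
// 1 << (0x1C - 0x0C - 1) = 1 << 0xF = 32KB.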
primary_buffer_size_ = original_size;
}
void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
@ -162,6 +193,8 @@ void CommandProcessor::WriteRegister(uint32_t packet_ptr, uint32_t index,
}
void CommandProcessor::MakeCoherent() {
SCOPE_profile_cpu_f("gpu");
// Status host often has 0x01000000 or 0x03000000.
// This is likely toggling VC (vertex cache) or TC (texture cache).
// Or, it also has a direction in here maybe - there is probably
@ -189,6 +222,16 @@ void CommandProcessor::MakeCoherent() {
regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32 = status_host;
}
void CommandProcessor::PrepareForWait() {
SCOPE_profile_cpu_f("gpu");
// TODO(benvanik): fences and fancy stuff. We should figure out a way to
// make interrupt callbacks from the GPU so that we don't have to do a full
// synchronize here.
// glFlush();
glFinish();
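// glFinish blocks until the GL server has completed all prior commands, a much
// heavier sync than the commented-out glFlush; tolerable here since the worker
// is about to sleep anyway.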
}
class CommandProcessor::RingbufferReader {
public:
RingbufferReader(uint8_t* membase, uint32_t base_ptr, uint32_t ptr_mask,
@ -274,6 +317,8 @@ void CommandProcessor::ExecutePrimaryBuffer(uint32_t start_index,
}
void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t length) {
SCOPE_profile_cpu_f("gpu");
XETRACECP("[%.8X] ExecuteIndirectBuffer(%dw)", ptr, length);
// Execute commands!
@ -625,6 +670,7 @@ bool CommandProcessor::ExecutePacketType3_WAIT_REG_MEM(RingbufferReader* reader,
if (!matched) {
// Wait.
if (wait >= 0x100) {
PrepareForWait();
Sleep(wait / 0x100);
} else {
SwitchToThread();
@ -790,14 +836,19 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader,
uint32_t dword1 = reader->Read();
uint32_t index_count = dword1 >> 16;
auto prim_type = static_cast<PrimitiveType>(dword1 & 0x3F);
uint32_t index_base = 0;
uint32_t index_size = 0;
Endian index_endianness = Endian::kUnspecified;
bool index_32bit = false;
uint32_t src_sel = (dword1 >> 6) & 0x3;
if (src_sel == 0x0) {
// Indexed draw.
uint32_t index_base = reader->Read();
uint32_t index_size = reader->Read();
auto endianness = static_cast<Endian>(index_size >> 30);
index_base = reader->Read();
index_size = reader->Read();
index_endianness = static_cast<Endian>(index_size >> 30);
index_size &= 0x00FFFFFF;
bool index_32bit = (dword1 >> 11) & 0x1;
index_32bit = (dword1 >> 11) & 0x1;
index_size *= index_32bit ? 4 : 2;
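// index_size appears to arrive as an element count; the multiply above turns
// it into a byte size.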
} else if (src_sel == 0x2) {
// Auto draw.
@ -805,33 +856,31 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader,
// Unknown source select.
assert_always();
}
// if (!driver_->PrepareDraw(draw_command_)) {
// draw_command_.prim_type = prim_type;
// draw_command_.start_index = 0;
// draw_command_.index_count = index_count;
// draw_command_.base_vertex = 0;
// if (src_sel == 0x0) {
// // Indexed draw.
// // TODO(benvanik): detect subregions of larger index
// buffers!
// driver_->PrepareDrawIndexBuffer(
// draw_command_, index_base, index_size,
// endianness,
// index_32bit ? INDEX_FORMAT_32BIT : INDEX_FORMAT_16BIT);
// } else if (src_sel == 0x2) {
// // Auto draw.
// draw_command_.index_buffer = nullptr;
// } else {
// // Unknown source select.
// assert_always();
// }
// driver_->Draw(draw_command_);
// } else {
// if (src_sel == 0x0) {
// reader->Advance(2); // skip
// }
// }
return true;
if (!PrepareDraw(&draw_command_)) {
PLOGE("Invalid DRAW_INDX; ignoring");
return false;
}
draw_command_.prim_type = prim_type;
draw_command_.start_index = 0;
draw_command_.index_count = index_count;
draw_command_.base_vertex = 0;
if (src_sel == 0x0) {
// Indexed draw.
// TODO(benvanik): detect subregions of larger index buffers
/*driver_->PrepareDrawIndexBuffer(
draw_command_, index_base, index_size,
endianness,
index_32bit ? INDEX_FORMAT_32BIT : INDEX_FORMAT_16BIT);*/
draw_command_.index_buffer = nullptr;
} else if (src_sel == 0x2) {
// Auto draw.
draw_command_.index_buffer = nullptr;
} else {
// Unknown source select.
assert_always();
}
return IssueDraw(&draw_command_);
}
bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader,
@ -849,16 +898,17 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader,
bool index_32bit = (dword0 >> 11) & 0x1;
uint32_t indices_size = index_count * (index_32bit ? 4 : 2);
reader->CheckRead(indices_size / sizeof(uint32_t));
/*if (!driver_->PrepareDraw(draw_command_)) {
uint32_t index_ptr = reader->ptr();
reader->Advance(count - 1);
if (!PrepareDraw(&draw_command_)) {
return false;
}
draw_command_.prim_type = prim_type;
draw_command_.start_index = 0;
draw_command_.index_count = index_count;
draw_command_.base_vertex = 0;
draw_command_.index_buffer = nullptr;
driver_->Draw(draw_command_);
}*/
reader->Advance(count - 1);
return true;
return IssueDraw(&draw_command_);
}
bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingbufferReader* reader,
@ -967,6 +1017,8 @@ bool CommandProcessor::ExecutePacketType3_INVALIDATE_STATE(
bool CommandProcessor::LoadShader(ShaderType shader_type,
const uint32_t* address,
uint32_t dword_count) {
SCOPE_profile_cpu_f("gpu");
// Hash the input memory and lookup the shader.
GL4Shader* shader_ptr = nullptr;
uint64_t hash = XXH64(address, dword_count * sizeof(uint32_t), 0);
@ -1004,6 +1056,344 @@ bool CommandProcessor::LoadShader(ShaderType shader_type,
return true;
}
bool CommandProcessor::PrepareDraw(DrawCommand* draw_command) {
SCOPE_profile_cpu_f("gpu");
auto& regs = *register_file_;
auto& cmd = *draw_command;
// Reset the things we don't modify so that we have clean state.
cmd.prim_type = PrimitiveType::kPointList;
cmd.index_count = 0;
cmd.index_buffer = nullptr;
// Generic stuff.
cmd.start_index = regs[XE_GPU_REG_VGT_INDX_OFFSET].u32;
cmd.base_vertex = 0;
if (!UpdateState(draw_command)) {
return false;
}
if (!UpdateRenderTargets()) {
return false;
}
return true;
}
bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
// Much of this state machine is extracted from:
// https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c
// http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html
// http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
auto& regs = *register_file_;
union float4 {
float v[4];
struct {
float x, y, z, w;
};
};
struct UniformDataBlock {
float4 window_offset; // tx,ty,?,?
float4 window_scissor; // x0,y0,x1,y1
float4 viewport_offset; // tx,ty,tz,?
float4 viewport_scale; // sx,sy,sz,?
// TODO(benvanik): vertex format xyzw?
float4 alpha_test; // alpha test enable, func, ref, ?
// Register data from 0x4000 to 0x4927.
// SHADER_CONSTANT_000_X...
float4 float_consts[512];
// SHADER_CONSTANT_FETCH_00_0...
uint32_t fetch_consts[32 * 6];
// SHADER_CONSTANT_BOOL_000_031...
int32_t bool_consts[8];
// SHADER_CONSTANT_LOOP_00...
int32_t loop_consts[32];
};
static_assert(sizeof(UniformDataBlock) <= 16 * 1024,
"Need <=16k uniform data");
auto buffer_ptr = reinterpret_cast<UniformDataBlock*>(
glMapNamedBufferRange(uniform_data_buffer_, 0, 16 * 1024,
GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT));
if (!buffer_ptr) {
PLOGE("Unable to map uniform data buffer");
return false;
}
// Window parameters.
// See r200UpdateWindow:
// https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c
uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32;
buffer_ptr->window_offset.x = float(window_offset & 0x7FFF);
buffer_ptr->window_offset.y = float((window_offset >> 16) & 0x7FFF);
uint32_t window_scissor_tl = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32;
uint32_t window_scissor_br = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32;
buffer_ptr->window_scissor.x = float(window_scissor_tl & 0x7FFF);
buffer_ptr->window_scissor.y = float((window_scissor_tl >> 16) & 0x7FFF);
buffer_ptr->window_scissor.z = float(window_scissor_br & 0x7FFF);
buffer_ptr->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF);
// Viewport scaling. Only enabled if the flags are all set.
buffer_ptr->viewport_scale.x =
regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; // 640
buffer_ptr->viewport_offset.x =
regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; // 640
buffer_ptr->viewport_scale.y =
regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; // -360
buffer_ptr->viewport_offset.y =
regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; // 360
buffer_ptr->viewport_scale.z = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; // 1
buffer_ptr->viewport_offset.z =
regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; // 0
// Whether each of the viewport settings is enabled.
// We require it to be all or nothing right now.
uint32_t vte_control = regs[XE_GPU_REG_PA_CL_VTE_CNTL].u32;
bool vport_xscale_enable = (vte_control & (1 << 0)) > 0;
bool vport_xoffset_enable = (vte_control & (1 << 1)) > 0;
bool vport_yscale_enable = (vte_control & (1 << 2)) > 0;
bool vport_yoffset_enable = (vte_control & (1 << 3)) > 0;
bool vport_zscale_enable = (vte_control & (1 << 4)) > 0;
bool vport_zoffset_enable = (vte_control & (1 << 5)) > 0;
assert_true(vport_xscale_enable == vport_yscale_enable &&
vport_yscale_enable == vport_zscale_enable &&
vport_zscale_enable == vport_xoffset_enable &&
vport_xoffset_enable == vport_yoffset_enable &&
vport_yoffset_enable == vport_zoffset_enable);
// TODO(benvanik): pass to shaders? disable transform? etc?
glViewport(0, 0, 1280, 720);
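// Hard-coded to 1280x720 for now; the viewport scale/offset registers above
// are only forwarded to the uniform block rather than applied here.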
// Copy over all constants.
// TODO(benvanik): partial updates, etc. We could use shader constant access
// knowledge that we get at compile time to only upload those constants
// required.
std::memcpy(
&buffer_ptr->float_consts, &regs[XE_GPU_REG_SHADER_CONSTANT_000_X].f32,
sizeof(buffer_ptr->float_consts) + sizeof(buffer_ptr->fetch_consts) +
sizeof(buffer_ptr->loop_consts) + sizeof(buffer_ptr->bool_consts));
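// This single copy assumes the float/fetch/bool/loop constant registers are
// contiguous in the register file (starting at SHADER_CONSTANT_000_X) and in
// the same order as the struct members above; the sizeof sum just totals the
// byte count.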
// Scissoring.
int32_t screen_scissor_tl = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32;
int32_t screen_scissor_br = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32;
if (screen_scissor_tl != 0 && screen_scissor_br != 0x20002000) {
glEnable(GL_SCISSOR_TEST);
// TODO(benvanik): signed?
int32_t screen_scissor_x = screen_scissor_tl & 0x7FFF;
int32_t screen_scissor_y = (screen_scissor_tl >> 16) & 0x7FFF;
int32_t screen_scissor_w = (screen_scissor_br & 0x7FFF) - screen_scissor_x;
int32_t screen_scissor_h =
((screen_scissor_br >> 16) & 0x7FFF) - screen_scissor_y;
glScissor(screen_scissor_x, screen_scissor_y, screen_scissor_w,
screen_scissor_h);
} else {
glDisable(GL_SCISSOR_TEST);
}
// Rasterizer state.
uint32_t mode_control = regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32;
if (draw_command->prim_type == PrimitiveType::kRectangleList) {
// Rect lists aren't culled. There may be other things they skip too.
glDisable(GL_CULL_FACE);
} else {
switch (mode_control & 0x3) {
case 0:
glDisable(GL_CULL_FACE);
break;
case 1:
glEnable(GL_CULL_FACE);
glCullFace(GL_FRONT);
break;
case 2:
glEnable(GL_CULL_FACE);
glCullFace(GL_BACK);
break;
}
}
if (mode_control & 0x4) {
glFrontFace(GL_CW);
} else {
glFrontFace(GL_CCW);
}
// TODO(benvanik): wireframe mode.
// glPolygonMode(GL_FRONT_AND_BACK, GL_LINE);
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
// Alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE
// Deprecated in GL, implemented in shader.
// if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard;
uint32_t color_control = regs[XE_GPU_REG_RB_COLORCONTROL].u32;
buffer_ptr->alpha_test.x =
(color_control & 0x4) ? 1.0f : 0.0f;  // ALPHATESTENABLE
buffer_ptr->alpha_test.y = float(color_control & 0x3); // ALPHAFUNC
buffer_ptr->alpha_test.z = regs[XE_GPU_REG_RB_ALPHA_REF].f32;
static const GLenum blend_map[] = {
/* 0 */ GL_ZERO,
/* 1 */ GL_ONE,
/* 2 */ GL_ZERO, // ?
/* 3 */ GL_ZERO, // ?
/* 4 */ GL_SRC_COLOR,
/* 5 */ GL_ONE_MINUS_SRC_COLOR,
/* 6 */ GL_SRC_ALPHA,
/* 7 */ GL_ONE_MINUS_SRC_ALPHA,
/* 8 */ GL_DST_COLOR,
/* 9 */ GL_ONE_MINUS_DST_COLOR,
/* 10 */ GL_DST_ALPHA,
/* 11 */ GL_ONE_MINUS_DST_ALPHA,
/* 12 */ GL_CONSTANT_COLOR,
/* 13 */ GL_ONE_MINUS_CONSTANT_COLOR,
/* 14 */ GL_CONSTANT_ALPHA,
/* 15 */ GL_ONE_MINUS_CONSTANT_ALPHA,
/* 16 */ GL_SRC_ALPHA_SATURATE,
};
static const GLenum blend_op_map[] = {
/* 0 */ GL_FUNC_ADD,
/* 1 */ GL_FUNC_SUBTRACT,
/* 2 */ GL_MIN,
/* 3 */ GL_MAX,
/* 4 */ GL_FUNC_REVERSE_SUBTRACT,
};
uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32;
uint32_t blend_control[4] = {
regs[XE_GPU_REG_RB_BLENDCONTROL_0].u32,
regs[XE_GPU_REG_RB_BLENDCONTROL_1].u32,
regs[XE_GPU_REG_RB_BLENDCONTROL_2].u32,
regs[XE_GPU_REG_RB_BLENDCONTROL_3].u32,
};
for (int n = 0; n < poly::countof(blend_control); n++) {
// A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND
auto src_blend = blend_map[(blend_control[n] & 0x0000001F) >> 0];
// A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND
auto dest_blend = blend_map[(blend_control[n] & 0x00001F00) >> 8];
// A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN
auto blend_op = blend_op_map[(blend_control[n] & 0x000000E0) >> 5];
// A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND
auto src_blend_alpha = blend_map[(blend_control[n] & 0x001F0000) >> 16];
// A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND
auto dest_blend_alpha = blend_map[(blend_control[n] & 0x1F000000) >> 24];
// A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN
auto blend_op_alpha = blend_op_map[(blend_control[n] & 0x00E00000) >> 21];
// A2XX_RB_COLOR_MASK_WRITE_*
uint32_t write_mask = (color_mask >> (n * 4)) & 0xF;
// A2XX_RB_COLORCONTROL_BLEND_DISABLE ?? Can't find this!
// Just guess based on actions.
bool blend_enable =
!((src_blend == GL_ONE) && (dest_blend == GL_ZERO) &&
(blend_op == GL_FUNC_ADD) && (src_blend_alpha == GL_ONE) &&
(dest_blend_alpha == GL_ZERO) && (blend_op_alpha == GL_FUNC_ADD));
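// ONE * src + ZERO * dst with FUNC_ADD simply writes the source color, so
// that combination is taken to mean blending is disabled.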
if (blend_enable) {
glEnablei(GL_BLEND, n);
glBlendEquationSeparatei(n, blend_op, blend_op_alpha);
glBlendFuncSeparatei(n, src_blend, dest_blend, src_blend_alpha,
dest_blend_alpha);
} else {
glDisablei(GL_BLEND, n);
}
}
float blend_color[4] = {
regs[XE_GPU_REG_RB_BLEND_RED].f32, regs[XE_GPU_REG_RB_BLEND_GREEN].f32,
regs[XE_GPU_REG_RB_BLEND_BLUE].f32, regs[XE_GPU_REG_RB_BLEND_ALPHA].f32,
};
glBlendColor(blend_color[0], blend_color[1], blend_color[2], blend_color[3]);
static const GLenum compare_func_map[] = {
/* 0 */ GL_NEVER,
/* 1 */ GL_LESS,
/* 2 */ GL_EQUAL,
/* 3 */ GL_LEQUAL,
/* 4 */ GL_GREATER,
/* 5 */ GL_NOTEQUAL,
/* 6 */ GL_GEQUAL,
/* 7 */ GL_ALWAYS,
};
static const GLenum stencil_op_map[] = {
/* 0 */ GL_KEEP,
/* 1 */ GL_ZERO,
/* 2 */ GL_REPLACE,
/* 3 */ GL_INCR_WRAP,
/* 4 */ GL_DECR_WRAP,
/* 5 */ GL_INVERT,
/* 6 */ GL_INCR,
/* 7 */ GL_DECR,
};
uint32_t depth_control = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32;
// A2XX_RB_DEPTHCONTROL_Z_ENABLE
if (depth_control & 0x00000002) {
glEnable(GL_DEPTH_TEST);
} else {
glDisable(GL_DEPTH_TEST);
}
// A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE
glDepthMask((depth_control & 0x00000004) ? GL_TRUE : GL_FALSE);
// A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE
// ?
// A2XX_RB_DEPTHCONTROL_ZFUNC
glDepthFunc(compare_func_map[(depth_control & 0x00000070) >> 4]);
// A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE
if (depth_control & 0x00000001) {
glEnable(GL_STENCIL_TEST);
} else {
glDisable(GL_STENCIL_TEST);
}
uint32_t stencil_ref_mask = regs[XE_GPU_REG_RB_STENCILREFMASK].u32;
// RB_STENCILREFMASK_STENCILREF
uint32_t stencil_ref = (stencil_ref_mask & 0x000000FF);
// RB_STENCILREFMASK_STENCILMASK
uint32_t stencil_read_mask = (stencil_ref_mask & 0x0000FF00) >> 8;
// RB_STENCILREFMASK_STENCILWRITEMASK
glStencilMask((stencil_ref_mask & 0x00FF0000) >> 16);
// A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE
bool backface_enabled = (depth_control & 0x00000080) != 0;
if (backface_enabled) {
// A2XX_RB_DEPTHCONTROL_STENCILFUNC
glStencilFuncSeparate(GL_FRONT,
compare_func_map[(depth_control & 0x00000700) >> 8],
stencil_ref, stencil_read_mask);
// A2XX_RB_DEPTHCONTROL_STENCILFAIL
// A2XX_RB_DEPTHCONTROL_STENCILZFAIL
// A2XX_RB_DEPTHCONTROL_STENCILZPASS
glStencilOpSeparate(GL_FRONT,
stencil_op_map[(depth_control & 0x00003800) >> 11],
stencil_op_map[(depth_control & 0x000E0000) >> 17],
stencil_op_map[(depth_control & 0x0001C000) >> 14]);
// A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF
glStencilFuncSeparate(GL_BACK,
compare_func_map[(depth_control & 0x00700000) >> 20],
stencil_ref, stencil_read_mask);
// A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF
// A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF
// A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF
glStencilOpSeparate(GL_BACK,
stencil_op_map[(depth_control & 0x03800000) >> 23],
stencil_op_map[(depth_control & 0xE0000000) >> 29],
stencil_op_map[(depth_control & 0x1C000000) >> 26]);
} else {
// Backfaces disabled - treat backfaces as frontfaces.
glStencilFunc(compare_func_map[(depth_control & 0x00000700) >> 8],
stencil_ref, stencil_read_mask);
glStencilOp(stencil_op_map[(depth_control & 0x00003800) >> 11],
stencil_op_map[(depth_control & 0x000E0000) >> 17],
stencil_op_map[(depth_control & 0x0001C000) >> 14]);
}
glUnmapNamedBuffer(uniform_data_buffer_);
return true;
}
bool CommandProcessor::UpdateRenderTargets() {
auto& regs = *register_file_;
return true;
}
bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
SCOPE_profile_cpu_f("gpu");
return true;
}
} // namespace gl4
} // namespace gpu
} // namespace xe

View File

@ -12,10 +12,12 @@
#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <unordered_map>
#include <vector>
#include <xenia/gpu/gl4/gl_context.h>
#include <xenia/gpu/gl4/gl4_shader.h>
#include <xenia/gpu/register_file.h>
#include <xenia/gpu/xenos.h>
@ -27,6 +29,42 @@ namespace gl4 {
class GL4GraphicsSystem;
// TODO(benvanik): move more of the enums in here?
struct DrawCommand {
PrimitiveType prim_type;
uint32_t start_index;
uint32_t index_count;
uint32_t base_vertex;
GL4Shader* vertex_shader;
GL4Shader* pixel_shader;
// Index buffer, if present.
// If index_count > 0 but buffer is nullptr then auto draw.
//IndexBufferResource* index_buffer;
void* index_buffer;
// Vertex buffers.
struct {
uint32_t input_index;
//VertexBufferResource* buffer;
uint32_t stride;
uint32_t offset;
} vertex_buffers[96];
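// 96 slots: presumably one per possible vertex fetch
// (32 fetch constants x 3 fetches each).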
size_t vertex_buffer_count;
// Texture samplers.
struct SamplerInput {
uint32_t input_index;
//TextureResource* texture;
//SamplerStateResource* sampler_state;
};
SamplerInput vertex_shader_samplers[32];
size_t vertex_shader_sampler_count;
SamplerInput pixel_shader_samplers[32];
size_t pixel_shader_sampler_count;
};
class CommandProcessor {
public:
CommandProcessor(GL4GraphicsSystem* graphics_system);
@ -38,8 +76,10 @@ class CommandProcessor {
uint32_t counter() const { return counter_; }
void increment_counter() { counter_++; }
void Initialize(uint32_t ptr, uint32_t page_count);
bool Initialize(std::unique_ptr<GLContext> context);
void Shutdown();
void InitializeRingBuffer(uint32_t ptr, uint32_t page_count);
void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size);
void UpdateWritePointer(uint32_t value);
@ -48,9 +88,12 @@ class CommandProcessor {
class RingbufferReader;
void WorkerMain();
bool SetupGL();
void ShutdownGL();
void WriteRegister(uint32_t packet_ptr, uint32_t index, uint32_t value);
void MakeCoherent();
void PrepareForWait();
void ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length);
@ -113,6 +156,11 @@ class CommandProcessor {
bool LoadShader(ShaderType shader_type, const uint32_t* address,
uint32_t dword_count);
bool PrepareDraw(DrawCommand* draw_command);
bool UpdateState(DrawCommand* draw_command);
bool UpdateRenderTargets();
bool IssueDraw(DrawCommand* draw_command);
Memory* memory_;
uint8_t* membase_;
GL4GraphicsSystem* graphics_system_;
@ -120,7 +168,7 @@ class CommandProcessor {
std::thread worker_thread_;
std::atomic<bool> worker_running_;
std::unique_ptr<GLContext> context_;
std::function<void()> swap_handler_;
uint64_t time_base_;
@ -143,6 +191,10 @@ class CommandProcessor {
std::unordered_map<uint64_t, GL4Shader*> shader_cache_;
GL4Shader* active_vertex_shader_;
GL4Shader* active_pixel_shader_;
GLuint uniform_data_buffer_;
DrawCommand draw_command_;
};
} // namespace gl4

View File

@ -32,9 +32,19 @@ X_STATUS GL4GraphicsSystem::Setup() {
// This must happen on the UI thread.
poly::threading::Fence control_ready_fence;
auto loop = emulator_->main_window()->loop();
std::unique_ptr<GLContext> processor_context;
loop->Post([&]() {
// Setup the GL control that actually does the drawing.
// We run here in the loop and only touch it (and its context) on this
// thread. That means some sync-fu when we want to swap.
control_ = std::make_unique<WGLControl>(loop);
emulator_->main_window()->AddChild(control_.get());
// Setup the GL context the command processor will do all its drawing in.
// It's shared with the control context so that we can resolve framebuffers
// from it.
processor_context = control_->context()->CreateShared();
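// Note: GL sharing covers textures, renderbuffers, and buffer objects;
// container objects such as FBOs and VAOs remain per-context.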
control_ready_fence.Signal();
});
control_ready_fence.Wait();
@ -42,6 +52,10 @@ X_STATUS GL4GraphicsSystem::Setup() {
// Create command processor. This will spin up a thread to process all
// incoming ringbuffer packets.
command_processor_ = std::make_unique<CommandProcessor>(this);
if (!command_processor_->Initialize(std::move(processor_context))) {
PLOGE("Unable to initialize command processor");
return X_STATUS_UNSUCCESSFUL;
}
command_processor_->set_swap_handler(
std::bind(&GL4GraphicsSystem::SwapHandler, this));
@ -76,7 +90,7 @@ void GL4GraphicsSystem::Shutdown() {
void GL4GraphicsSystem::InitializeRingBuffer(uint32_t ptr,
uint32_t page_count) {
command_processor_->Initialize(ptr, page_count);
command_processor_->InitializeRingBuffer(ptr, page_count);
}
void GL4GraphicsSystem::EnableReadPointerWriteBack(uint32_t ptr,

View File

@ -9,6 +9,7 @@
#include <xenia/gpu/gl4/gl_context.h>
#include <poly/assert.h>
#include <poly/logging.h>
namespace xe {
@ -20,17 +21,26 @@ thread_local WGLEWContext* tls_wglew_context_ = nullptr;
extern "C" GLEWContext* glewGetContext() { return tls_glew_context_; }
extern "C" WGLEWContext* wglewGetContext() { return tls_wglew_context_; }
GLContext::GLContext() : dc_(nullptr), glrc_(nullptr) {}
GLContext::GLContext() : hwnd_(nullptr), dc_(nullptr), glrc_(nullptr) {}
GLContext::GLContext(HWND hwnd, HGLRC glrc)
: hwnd_(hwnd), dc_(nullptr), glrc_(glrc) {
dc_ = GetDC(hwnd);
}
GLContext::~GLContext() {
wglMakeCurrent(nullptr, nullptr);
if (glrc_) {
wglDeleteContext(glrc_);
}
if (dc_) {
ReleaseDC(hwnd_, dc_);
}
}
bool GLContext::Initialize(HDC dc) {
dc_ = dc;
bool GLContext::Initialize(HWND hwnd) {
hwnd_ = hwnd;
dc_ = GetDC(hwnd);
PIXELFORMATDESCRIPTOR pfd = {0};
pfd.nSize = sizeof(pfd);
@ -59,6 +69,7 @@ bool GLContext::Initialize(HDC dc) {
tls_glew_context_ = &glew_context_;
tls_wglew_context_ = &wglew_context_;
glewExperimental = GL_TRUE;
if (glewInit() != GLEW_OK) {
PLOGE("Unable to initialize GLEW");
return false;
@ -73,11 +84,10 @@ bool GLContext::Initialize(HDC dc) {
return false;
}
int context_flags = WGL_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB;
int context_flags = 0;
#if DEBUG
context_flags |= WGL_CONTEXT_DEBUG_BIT_ARB;
#endif // DEBUG
#endif // DEBUG
int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, //
WGL_CONTEXT_MINOR_VERSION_ARB, 5, //
WGL_CONTEXT_FLAGS_ARB, context_flags, //
@ -99,6 +109,45 @@ bool GLContext::Initialize(HDC dc) {
return true;
}
std::unique_ptr<GLContext> GLContext::CreateShared() {
assert_not_null(glrc_);
int context_flags = 0;
#if DEBUG
context_flags |= WGL_CONTEXT_DEBUG_BIT_ARB;
#endif // DEBUG
int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, //
WGL_CONTEXT_MINOR_VERSION_ARB, 5, //
WGL_CONTEXT_FLAGS_ARB, context_flags, //
0};
auto new_glrc = wglCreateContextAttribsARB(dc_, glrc_, attrib_list);
if (!new_glrc) {
PLOGE("Could not create shared context");
return nullptr;
}
auto new_context = std::make_unique<GLContext>(hwnd_, new_glrc);
if (!new_context->MakeCurrent()) {
PLOGE("Could not make new GL context current");
return nullptr;
}
glewExperimental = GL_TRUE;
if (glewInit() != GLEW_OK) {
PLOGE("Unable to initialize GLEW");
return nullptr;
}
if (wglewInit() != GLEW_OK) {
PLOGE("Unable to initialize WGLEW");
return nullptr;
}
new_context->ClearCurrent();
MakeCurrent();
return new_context;
}
bool GLContext::MakeCurrent() {
if (!wglMakeCurrent(dc_, glrc_)) {
return false;

View File

@ -10,6 +10,8 @@
#ifndef XENIA_GPU_GL4_GL_CONTEXT_H_
#define XENIA_GPU_GL4_GL_CONTEXT_H_
#include <memory>
#include <third_party/GL/glew.h>
#include <third_party/GL/wglew.h>
@ -20,16 +22,20 @@ namespace gl4 {
class GLContext {
public:
GLContext();
GLContext(HWND hwnd, HGLRC glrc);
~GLContext();
bool Initialize(HDC dc);
bool Initialize(HWND hwnd);
HDC dc() const { return dc_; }
std::unique_ptr<GLContext> CreateShared();
bool MakeCurrent();
void ClearCurrent();
private:
HWND hwnd_;
HDC dc_;
HGLRC glrc_;

View File

@ -56,13 +56,7 @@ bool WGLControl::Create() {
return false;
}
HDC dc = GetDC(hwnd_);
if (!dc) {
PLOGE("No DC for WGL window");
return false;
}
if (!context_.Initialize(dc)) {
if (!context_.Initialize(hwnd_)) {
PFATAL("Unable to initialize GL context");
return false;
}

View File

@ -100,7 +100,8 @@ void Shader::GatherExec(const instr_cf_exec_t* cf) {
uint32_t alu_off = (cf->address + i);
int sync = sequence & 0x2;
if (sequence & 0x1) {
auto fetch = reinterpret_cast<const instr_fetch_t*>(&data_[alu_off * 3]);
auto fetch =
reinterpret_cast<const instr_fetch_t*>(data_.data() + alu_off * 3);
switch (fetch->opc) {
case VTX_FETCH:
GatherVertexFetch(&fetch->vtx);
@ -121,7 +122,8 @@ void Shader::GatherExec(const instr_cf_exec_t* cf) {
}
} else {
// TODO(benvanik): gather registers used, predicate bits used, etc.
auto alu = reinterpret_cast<const instr_alu_t*>(&data_[alu_off * 3]);
auto alu =
reinterpret_cast<const instr_alu_t*>(data_.data() + alu_off * 3);
if (alu->vector_write_mask) {
if (alu->export_data && alu->vector_dest == 63) {
alloc_counts_.point_size = true;