GL context on command processor.
This commit is contained in:
parent
e8de42d9ea
commit
02d52167d3
|
@ -12,6 +12,7 @@
|
|||
#include <algorithm>
|
||||
|
||||
#include <poly/logging.h>
|
||||
#include <poly/math.h>
|
||||
#include <xenia/gpu/gl4/gl4_graphics_system.h>
|
||||
#include <xenia/gpu/gpu-private.h>
|
||||
#include <xenia/gpu/xenos.h>
|
||||
|
@ -27,6 +28,8 @@ namespace gl4 {
|
|||
|
||||
using namespace xe::gpu::xenos;
|
||||
|
||||
extern "C" extern "C" GLEWContext* glewGetContext();
|
||||
|
||||
CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system)
|
||||
: memory_(graphics_system->memory()),
|
||||
membase_(graphics_system->memory()->membase()),
|
||||
|
@ -46,6 +49,7 @@ CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system)
|
|||
bin_mask_(0xFFFFFFFFull),
|
||||
active_vertex_shader_(nullptr),
|
||||
active_pixel_shader_(nullptr) {
|
||||
std::memset(&draw_command_, 0, sizeof(draw_command_));
|
||||
LARGE_INTEGER perf_counter;
|
||||
QueryPerformanceCounter(&perf_counter);
|
||||
time_base_ = perf_counter.QuadPart;
|
||||
|
@ -59,33 +63,37 @@ uint64_t CommandProcessor::QueryTime() {
|
|||
return perf_counter.QuadPart - time_base_;
|
||||
}
|
||||
|
||||
void CommandProcessor::Initialize(uint32_t ptr, uint32_t page_count) {
|
||||
primary_buffer_ptr_ = ptr;
|
||||
// Not sure this is correct, but it's a way to take the page_count back to
|
||||
// the number of bytes allocated by the physical alloc.
|
||||
uint32_t original_size = 1 << (0x1C - page_count - 1);
|
||||
primary_buffer_size_ = original_size;
|
||||
read_ptr_index_ = 0;
|
||||
bool CommandProcessor::Initialize(std::unique_ptr<GLContext> context) {
|
||||
context_ = std::move(context);
|
||||
|
||||
worker_running_ = true;
|
||||
worker_thread_ = std::thread([this]() {
|
||||
poly::threading::set_name("GL4 Worker");
|
||||
xe::Profiler::ThreadEnter("GL4 Worker");
|
||||
context_->MakeCurrent();
|
||||
WorkerMain();
|
||||
xe::Profiler::ThreadExit();
|
||||
});
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void CommandProcessor::Shutdown() {
|
||||
worker_running_ = false;
|
||||
SetEvent(write_ptr_index_event_);
|
||||
worker_thread_.join();
|
||||
context_.reset();
|
||||
|
||||
all_shaders_.clear();
|
||||
shader_cache_.clear();
|
||||
}
|
||||
|
||||
void CommandProcessor::WorkerMain() {
|
||||
if (!SetupGL()) {
|
||||
PFATAL("Unable to setup command processor GL state");
|
||||
return;
|
||||
}
|
||||
|
||||
while (worker_running_) {
|
||||
uint32_t write_ptr_index = write_ptr_index_.load();
|
||||
while (write_ptr_index == 0xBAADF00D ||
|
||||
|
@ -94,6 +102,7 @@ void CommandProcessor::WorkerMain() {
|
|||
// We wait a short bit here to yield time. Since we are also running the
|
||||
// main window display we don't want to pause too long, though.
|
||||
// YieldProcessor();
|
||||
PrepareForWait();
|
||||
const int wait_time_ms = 5;
|
||||
if (WaitForSingleObject(write_ptr_index_event_, wait_time_ms) ==
|
||||
WAIT_TIMEOUT) {
|
||||
|
@ -117,6 +126,28 @@ void CommandProcessor::WorkerMain() {
|
|||
read_ptr_index_);
|
||||
}
|
||||
}
|
||||
|
||||
ShutdownGL();
|
||||
}
|
||||
|
||||
bool CommandProcessor::SetupGL() {
|
||||
// Uniform buffer that stores the per-draw state (constants, etc).
|
||||
glGenBuffers(1, &uniform_data_buffer_);
|
||||
glNamedBufferStorage(uniform_data_buffer_, 16 * 1024, nullptr, GL_MAP_WRITE_BIT);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void CommandProcessor::ShutdownGL() {
|
||||
glDeleteBuffers(1, &uniform_data_buffer_);
|
||||
}
|
||||
|
||||
void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t page_count) {
|
||||
primary_buffer_ptr_ = ptr;
|
||||
// Not sure this is correct, but it's a way to take the page_count back to
|
||||
// the number of bytes allocated by the physical alloc.
|
||||
uint32_t original_size = 1 << (0x1C - page_count - 1);
|
||||
primary_buffer_size_ = original_size;
|
||||
}
|
||||
|
||||
void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
|
||||
|
@ -162,6 +193,8 @@ void CommandProcessor::WriteRegister(uint32_t packet_ptr, uint32_t index,
|
|||
}
|
||||
|
||||
void CommandProcessor::MakeCoherent() {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
// Status host often has 0x01000000 or 0x03000000.
|
||||
// This is likely toggling VC (vertex cache) or TC (texture cache).
|
||||
// Or, it also has a direction in here maybe - there is probably
|
||||
|
@ -189,6 +222,16 @@ void CommandProcessor::MakeCoherent() {
|
|||
regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32 = status_host;
|
||||
}
|
||||
|
||||
void CommandProcessor::PrepareForWait() {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
// TODO(benvanik): fences and fancy stuff. We should figure out a way to
|
||||
// make interrupt callbacks from the GPU so that we don't have to do a full
|
||||
// synchronize here.
|
||||
// glFlush();
|
||||
glFinish();
|
||||
}
|
||||
|
||||
class CommandProcessor::RingbufferReader {
|
||||
public:
|
||||
RingbufferReader(uint8_t* membase, uint32_t base_ptr, uint32_t ptr_mask,
|
||||
|
@ -274,6 +317,8 @@ void CommandProcessor::ExecutePrimaryBuffer(uint32_t start_index,
|
|||
}
|
||||
|
||||
void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t length) {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
XETRACECP("[%.8X] ExecuteIndirectBuffer(%dw)", ptr, length);
|
||||
|
||||
// Execute commands!
|
||||
|
@ -625,6 +670,7 @@ bool CommandProcessor::ExecutePacketType3_WAIT_REG_MEM(RingbufferReader* reader,
|
|||
if (!matched) {
|
||||
// Wait.
|
||||
if (wait >= 0x100) {
|
||||
PrepareForWait();
|
||||
Sleep(wait / 0x100);
|
||||
} else {
|
||||
SwitchToThread();
|
||||
|
@ -790,14 +836,19 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader,
|
|||
uint32_t dword1 = reader->Read();
|
||||
uint32_t index_count = dword1 >> 16;
|
||||
auto prim_type = static_cast<PrimitiveType>(dword1 & 0x3F);
|
||||
|
||||
uint32_t index_base = 0;
|
||||
uint32_t index_size = 0;
|
||||
Endian index_endianness = Endian::kUnspecified;
|
||||
bool index_32bit = false;
|
||||
uint32_t src_sel = (dword1 >> 6) & 0x3;
|
||||
if (src_sel == 0x0) {
|
||||
// Indexed draw.
|
||||
uint32_t index_base = reader->Read();
|
||||
uint32_t index_size = reader->Read();
|
||||
auto endianness = static_cast<Endian>(index_size >> 30);
|
||||
index_base = reader->Read();
|
||||
index_size = reader->Read();
|
||||
index_endianness = static_cast<Endian>(index_size >> 30);
|
||||
index_size &= 0x00FFFFFF;
|
||||
bool index_32bit = (dword1 >> 11) & 0x1;
|
||||
index_32bit = (dword1 >> 11) & 0x1;
|
||||
index_size *= index_32bit ? 4 : 2;
|
||||
} else if (src_sel == 0x2) {
|
||||
// Auto draw.
|
||||
|
@ -805,33 +856,31 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader,
|
|||
// Unknown source select.
|
||||
assert_always();
|
||||
}
|
||||
// if (!driver_->PrepareDraw(draw_command_)) {
|
||||
// draw_command_.prim_type = prim_type;
|
||||
// draw_command_.start_index = 0;
|
||||
// draw_command_.index_count = index_count;
|
||||
// draw_command_.base_vertex = 0;
|
||||
// if (src_sel == 0x0) {
|
||||
// // Indexed draw.
|
||||
// // TODO(benvanik): detect subregions of larger index
|
||||
// buffers!
|
||||
// driver_->PrepareDrawIndexBuffer(
|
||||
// draw_command_, index_base, index_size,
|
||||
// endianness,
|
||||
// index_32bit ? INDEX_FORMAT_32BIT : INDEX_FORMAT_16BIT);
|
||||
// } else if (src_sel == 0x2) {
|
||||
// // Auto draw.
|
||||
// draw_command_.index_buffer = nullptr;
|
||||
// } else {
|
||||
// // Unknown source select.
|
||||
// assert_always();
|
||||
// }
|
||||
// driver_->Draw(draw_command_);
|
||||
// } else {
|
||||
// if (src_sel == 0x0) {
|
||||
// reader->Advance(2); // skip
|
||||
// }
|
||||
// }
|
||||
return true;
|
||||
|
||||
if (!PrepareDraw(&draw_command_)) {
|
||||
PLOGE("Invalid DRAW_INDX; ignoring");
|
||||
return false;
|
||||
}
|
||||
draw_command_.prim_type = prim_type;
|
||||
draw_command_.start_index = 0;
|
||||
draw_command_.index_count = index_count;
|
||||
draw_command_.base_vertex = 0;
|
||||
if (src_sel == 0x0) {
|
||||
// Indexed draw.
|
||||
// TODO(benvanik): detect subregions of larger index buffers
|
||||
/*driver_->PrepareDrawIndexBuffer(
|
||||
draw_command_, index_base, index_size,
|
||||
endianness,
|
||||
index_32bit ? INDEX_FORMAT_32BIT : INDEX_FORMAT_16BIT);*/
|
||||
draw_command_.index_buffer = nullptr;
|
||||
} else if (src_sel == 0x2) {
|
||||
// Auto draw.
|
||||
draw_command_.index_buffer = nullptr;
|
||||
} else {
|
||||
// Unknown source select.
|
||||
assert_always();
|
||||
}
|
||||
return IssueDraw(&draw_command_);
|
||||
}
|
||||
|
||||
bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader,
|
||||
|
@ -849,16 +898,17 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader,
|
|||
bool index_32bit = (dword0 >> 11) & 0x1;
|
||||
uint32_t indices_size = index_count * (index_32bit ? 4 : 2);
|
||||
reader->CheckRead(indices_size / sizeof(uint32_t));
|
||||
/*if (!driver_->PrepareDraw(draw_command_)) {
|
||||
uint32_t index_ptr = reader->ptr();
|
||||
reader->Advance(count - 1);
|
||||
if (!PrepareDraw(&draw_command_)) {
|
||||
return false;
|
||||
}
|
||||
draw_command_.prim_type = prim_type;
|
||||
draw_command_.start_index = 0;
|
||||
draw_command_.index_count = index_count;
|
||||
draw_command_.base_vertex = 0;
|
||||
draw_command_.index_buffer = nullptr;
|
||||
driver_->Draw(draw_command_);
|
||||
}*/
|
||||
reader->Advance(count - 1);
|
||||
return true;
|
||||
return IssueDraw(&draw_command_);
|
||||
}
|
||||
|
||||
bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingbufferReader* reader,
|
||||
|
@ -967,6 +1017,8 @@ bool CommandProcessor::ExecutePacketType3_INVALIDATE_STATE(
|
|||
bool CommandProcessor::LoadShader(ShaderType shader_type,
|
||||
const uint32_t* address,
|
||||
uint32_t dword_count) {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
// Hash the input memory and lookup the shader.
|
||||
GL4Shader* shader_ptr = nullptr;
|
||||
uint64_t hash = XXH64(address, dword_count * sizeof(uint32_t), 0);
|
||||
|
@ -1004,6 +1056,344 @@ bool CommandProcessor::LoadShader(ShaderType shader_type,
|
|||
return true;
|
||||
}
|
||||
|
||||
bool CommandProcessor::PrepareDraw(DrawCommand* draw_command) {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
auto& regs = *register_file_;
|
||||
auto& cmd = *draw_command;
|
||||
|
||||
// Reset the things we don't modify so that we have clean state.
|
||||
cmd.prim_type = PrimitiveType::kPointList;
|
||||
cmd.index_count = 0;
|
||||
cmd.index_buffer = nullptr;
|
||||
|
||||
// Generic stuff.
|
||||
cmd.start_index = regs[XE_GPU_REG_VGT_INDX_OFFSET].u32;
|
||||
cmd.base_vertex = 0;
|
||||
|
||||
if (!UpdateState(draw_command)) {
|
||||
return false;
|
||||
}
|
||||
if (!UpdateRenderTargets()) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CommandProcessor::UpdateState(DrawCommand* draw_command) {
|
||||
// Much of this state machine is extracted from:
|
||||
// https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c
|
||||
// http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html
|
||||
// http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
|
||||
|
||||
auto& regs = *register_file_;
|
||||
|
||||
union float4 {
|
||||
float v[4];
|
||||
struct {
|
||||
float x, y, z, w;
|
||||
};
|
||||
};
|
||||
struct UniformDataBlock {
|
||||
float4 window_offset; // tx,ty,?,?
|
||||
float4 window_scissor; // x0,y0,x1,y1
|
||||
float4 viewport_offset; // tx,ty,tz,?
|
||||
float4 viewport_scale; // sx,sy,sz,?
|
||||
// TODO(benvanik): vertex format xyzw?
|
||||
|
||||
float4 alpha_test; // alpha test enable, func, ref, ?
|
||||
|
||||
// Register data from 0x4000 to 0x4927.
|
||||
// SHADER_CONSTANT_000_X...
|
||||
float4 float_consts[512];
|
||||
// SHADER_CONSTANT_FETCH_00_0...
|
||||
uint32_t fetch_consts[32 * 6];
|
||||
// SHADER_CONSTANT_BOOL_000_031...
|
||||
int32_t bool_consts[8];
|
||||
// SHADER_CONSTANT_LOOP_00...
|
||||
int32_t loop_consts[32];
|
||||
};
|
||||
static_assert(sizeof(UniformDataBlock) <= 16 * 1024,
|
||||
"Need <=16k uniform data");
|
||||
|
||||
auto buffer_ptr = reinterpret_cast<UniformDataBlock*>(
|
||||
glMapNamedBufferRange(uniform_data_buffer_, 0, 0,
|
||||
GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT));
|
||||
if (!buffer_ptr) {
|
||||
PLOGE("Unable to map uniform data buffer");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Window parameters.
|
||||
// See r200UpdateWindow:
|
||||
// https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c
|
||||
uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32;
|
||||
buffer_ptr->window_offset.x = float(window_offset & 0x7FFF);
|
||||
buffer_ptr->window_offset.y = float((window_offset >> 16) & 0x7FFF);
|
||||
uint32_t window_scissor_tl = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32;
|
||||
uint32_t window_scissor_br = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32;
|
||||
buffer_ptr->window_scissor.x = float(window_scissor_tl & 0x7FFF);
|
||||
buffer_ptr->window_scissor.y = float((window_scissor_tl >> 16) & 0x7FFF);
|
||||
buffer_ptr->window_scissor.z = float(window_scissor_br & 0x7FFF);
|
||||
buffer_ptr->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF);
|
||||
|
||||
// Viewport scaling. Only enabled if the flags are all set.
|
||||
buffer_ptr->viewport_scale.x =
|
||||
regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; // 640
|
||||
buffer_ptr->viewport_offset.x =
|
||||
regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; // 640
|
||||
buffer_ptr->viewport_scale.y =
|
||||
regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; // -360
|
||||
buffer_ptr->viewport_offset.y =
|
||||
regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; // 360
|
||||
buffer_ptr->viewport_scale.z = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; // 1
|
||||
buffer_ptr->viewport_offset.z =
|
||||
regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; // 0
|
||||
|
||||
// Whether each of the viewport settings is enabled.
|
||||
// We require it to be all or nothing right now.
|
||||
uint32_t vte_control = regs[XE_GPU_REG_PA_CL_VTE_CNTL].u32;
|
||||
bool vport_xscale_enable = (vte_control & (1 << 0)) > 0;
|
||||
bool vport_xoffset_enable = (vte_control & (1 << 1)) > 0;
|
||||
bool vport_yscale_enable = (vte_control & (1 << 2)) > 0;
|
||||
bool vport_yoffset_enable = (vte_control & (1 << 3)) > 0;
|
||||
bool vport_zscale_enable = (vte_control & (1 << 4)) > 0;
|
||||
bool vport_zoffset_enable = (vte_control & (1 << 5)) > 0;
|
||||
assert_true(vport_xscale_enable == vport_yscale_enable ==
|
||||
vport_zscale_enable == vport_xoffset_enable ==
|
||||
vport_yoffset_enable == vport_zoffset_enable);
|
||||
// TODO(benvanik): pass to shaders? disable transform? etc?
|
||||
glViewport(0, 0, 1280, 720);
|
||||
|
||||
// Copy over all constants.
|
||||
// TODO(benvanik): partial updates, etc. We could use shader constant access
|
||||
// knowledge that we get at compile time to only upload those constants
|
||||
// required.
|
||||
std::memcpy(
|
||||
&buffer_ptr->float_consts, ®s[XE_GPU_REG_SHADER_CONSTANT_000_X].f32,
|
||||
sizeof(buffer_ptr->float_consts) + sizeof(buffer_ptr->fetch_consts) +
|
||||
sizeof(buffer_ptr->loop_consts) + sizeof(buffer_ptr->bool_consts));
|
||||
|
||||
// Scissoring.
|
||||
int32_t screen_scissor_tl = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32;
|
||||
int32_t screen_scissor_br = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32;
|
||||
if (screen_scissor_tl != 0 && screen_scissor_br != 0x20002000) {
|
||||
glEnable(GL_SCISSOR_TEST);
|
||||
// TODO(benvanik): signed?
|
||||
int32_t screen_scissor_x = screen_scissor_tl & 0x7FFF;
|
||||
int32_t screen_scissor_y = (screen_scissor_tl >> 16) & 0x7FFF;
|
||||
int32_t screen_scissor_w = screen_scissor_br & 0x7FFF - screen_scissor_x;
|
||||
int32_t screen_scissor_h =
|
||||
(screen_scissor_br >> 16) & 0x7FFF - screen_scissor_y;
|
||||
glScissor(screen_scissor_x, screen_scissor_y, screen_scissor_w,
|
||||
screen_scissor_h);
|
||||
} else {
|
||||
glDisable(GL_SCISSOR_TEST);
|
||||
}
|
||||
|
||||
// Rasterizer state.
|
||||
uint32_t mode_control = regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32;
|
||||
if (draw_command->prim_type == PrimitiveType::kRectangleList) {
|
||||
// Rect lists aren't culled. There may be other things they skip too.
|
||||
glDisable(GL_CULL_FACE);
|
||||
} else {
|
||||
switch (mode_control & 0x3) {
|
||||
case 0:
|
||||
glDisable(GL_CULL_FACE);
|
||||
break;
|
||||
case 1:
|
||||
glEnable(GL_CULL_FACE);
|
||||
glCullFace(GL_FRONT);
|
||||
break;
|
||||
case 2:
|
||||
glEnable(GL_CULL_FACE);
|
||||
glCullFace(GL_BACK);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (mode_control & 0x4) {
|
||||
glFrontFace(GL_CW);
|
||||
} else {
|
||||
glFrontFace(GL_CCW);
|
||||
}
|
||||
// TODO(benvanik): wireframe mode.
|
||||
// glPolygonMode(GL_FRONT_AND_BACK, GL_LINE);
|
||||
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
|
||||
|
||||
// Alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE
|
||||
// Deprecated in GL, implemented in shader.
|
||||
// if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard;
|
||||
uint32_t color_control = regs[XE_GPU_REG_RB_COLORCONTROL].u32;
|
||||
buffer_ptr->alpha_test.x =
|
||||
(color_control & 0x4) ? 1.0f : 0.0f; // ALPAHTESTENABLE
|
||||
buffer_ptr->alpha_test.y = float(color_control & 0x3); // ALPHAFUNC
|
||||
buffer_ptr->alpha_test.z = regs[XE_GPU_REG_RB_ALPHA_REF].f32;
|
||||
|
||||
static const GLenum blend_map[] = {
|
||||
/* 0 */ GL_ZERO,
|
||||
/* 1 */ GL_ONE,
|
||||
/* 2 */ GL_ZERO, // ?
|
||||
/* 3 */ GL_ZERO, // ?
|
||||
/* 4 */ GL_SRC_COLOR,
|
||||
/* 5 */ GL_ONE_MINUS_SRC_COLOR,
|
||||
/* 6 */ GL_SRC_ALPHA,
|
||||
/* 7 */ GL_ONE_MINUS_SRC_ALPHA,
|
||||
/* 8 */ GL_DST_COLOR,
|
||||
/* 9 */ GL_ONE_MINUS_DST_COLOR,
|
||||
/* 10 */ GL_DST_ALPHA,
|
||||
/* 11 */ GL_ONE_MINUS_DST_ALPHA,
|
||||
/* 12 */ GL_CONSTANT_COLOR,
|
||||
/* 13 */ GL_ONE_MINUS_CONSTANT_COLOR,
|
||||
/* 14 */ GL_CONSTANT_ALPHA,
|
||||
/* 15 */ GL_ONE_MINUS_CONSTANT_ALPHA,
|
||||
/* 16 */ GL_SRC_ALPHA_SATURATE,
|
||||
};
|
||||
static const GLenum blend_op_map[] = {
|
||||
/* 0 */ GL_FUNC_ADD,
|
||||
/* 1 */ GL_FUNC_SUBTRACT,
|
||||
/* 2 */ GL_MIN,
|
||||
/* 3 */ GL_MAX,
|
||||
/* 4 */ GL_FUNC_REVERSE_SUBTRACT,
|
||||
};
|
||||
uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32;
|
||||
uint32_t blend_control[4] = {
|
||||
regs[XE_GPU_REG_RB_BLENDCONTROL_0].u32,
|
||||
regs[XE_GPU_REG_RB_BLENDCONTROL_1].u32,
|
||||
regs[XE_GPU_REG_RB_BLENDCONTROL_2].u32,
|
||||
regs[XE_GPU_REG_RB_BLENDCONTROL_3].u32,
|
||||
};
|
||||
for (int n = 0; n < poly::countof(blend_control); n++) {
|
||||
// A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND
|
||||
auto src_blend = blend_map[(blend_control[n] & 0x0000001F) >> 0];
|
||||
// A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND
|
||||
auto dest_blend = blend_map[(blend_control[n] & 0x00001F00) >> 8];
|
||||
// A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN
|
||||
auto blend_op = blend_op_map[(blend_control[n] & 0x000000E0) >> 5];
|
||||
// A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND
|
||||
auto src_blend_alpha = blend_map[(blend_control[n] & 0x001F0000) >> 16];
|
||||
// A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND
|
||||
auto dest_blend_alpha = blend_map[(blend_control[n] & 0x1F000000) >> 24];
|
||||
// A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN
|
||||
auto blend_op_alpha = blend_op_map[(blend_control[n] & 0x00E00000) >> 21];
|
||||
// A2XX_RB_COLOR_MASK_WRITE_*
|
||||
uint32_t write_mask = (color_mask >> (n * 4)) & 0xF;
|
||||
// A2XX_RB_COLORCONTROL_BLEND_DISABLE ?? Can't find this!
|
||||
// Just guess based on actions.
|
||||
bool blend_enable =
|
||||
!((src_blend == GL_ONE) && (dest_blend == GL_ZERO) &&
|
||||
(blend_op == GL_FUNC_ADD) && (src_blend_alpha == GL_ONE) &&
|
||||
(dest_blend_alpha == GL_ZERO) && (blend_op_alpha == GL_FUNC_ADD));
|
||||
if (blend_enable) {
|
||||
glEnablei(GL_BLEND, n);
|
||||
glBlendEquationSeparatei(n, blend_op, blend_op_alpha);
|
||||
glBlendFuncSeparatei(n, src_blend, dest_blend, src_blend_alpha,
|
||||
dest_blend_alpha);
|
||||
} else {
|
||||
glDisablei(GL_BLEND, n);
|
||||
}
|
||||
}
|
||||
float blend_color[4] = {
|
||||
regs[XE_GPU_REG_RB_BLEND_RED].f32, regs[XE_GPU_REG_RB_BLEND_GREEN].f32,
|
||||
regs[XE_GPU_REG_RB_BLEND_BLUE].f32, regs[XE_GPU_REG_RB_BLEND_ALPHA].f32,
|
||||
};
|
||||
glBlendColor(blend_color[0], blend_color[1], blend_color[2], blend_color[3]);
|
||||
|
||||
static const GLenum compare_func_map[] = {
|
||||
/* 0 */ GL_NEVER,
|
||||
/* 1 */ GL_LESS,
|
||||
/* 2 */ GL_EQUAL,
|
||||
/* 3 */ GL_LEQUAL,
|
||||
/* 4 */ GL_GREATER,
|
||||
/* 5 */ GL_NOTEQUAL,
|
||||
/* 6 */ GL_GEQUAL,
|
||||
/* 7 */ GL_ALWAYS,
|
||||
};
|
||||
static const GLenum stencil_op_map[] = {
|
||||
/* 0 */ GL_KEEP,
|
||||
/* 1 */ GL_ZERO,
|
||||
/* 2 */ GL_REPLACE,
|
||||
/* 3 */ GL_INCR_WRAP,
|
||||
/* 4 */ GL_DECR_WRAP,
|
||||
/* 5 */ GL_INVERT,
|
||||
/* 6 */ GL_INCR,
|
||||
/* 7 */ GL_DECR,
|
||||
};
|
||||
uint32_t depth_control = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32;
|
||||
// A2XX_RB_DEPTHCONTROL_Z_ENABLE
|
||||
if (depth_control & 0x00000002) {
|
||||
glEnable(GL_DEPTH_TEST);
|
||||
} else {
|
||||
glDisable(GL_DEPTH_TEST);
|
||||
}
|
||||
// A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE
|
||||
glDepthMask((depth_control & 0x00000004) ? GL_TRUE : GL_FALSE);
|
||||
// A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE
|
||||
// ?
|
||||
// A2XX_RB_DEPTHCONTROL_ZFUNC
|
||||
glDepthFunc(compare_func_map[(depth_control & 0x00000070) >> 4]);
|
||||
// A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE
|
||||
if (depth_control & 0x00000001) {
|
||||
glEnable(GL_STENCIL_TEST);
|
||||
} else {
|
||||
glDisable(GL_STENCIL_TEST);
|
||||
}
|
||||
uint32_t stencil_ref_mask = regs[XE_GPU_REG_RB_STENCILREFMASK].u32;
|
||||
// RB_STENCILREFMASK_STENCILREF
|
||||
uint32_t stencil_ref = (stencil_ref_mask & 0x000000FF);
|
||||
// RB_STENCILREFMASK_STENCILMASK
|
||||
uint32_t stencil_read_mask = (stencil_ref_mask & 0x0000FF00) >> 8;
|
||||
// RB_STENCILREFMASK_STENCILWRITEMASK
|
||||
glStencilMask((stencil_ref_mask & 0x00FF0000) >> 16);
|
||||
// A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE
|
||||
bool backface_enabled = (depth_control & 0x00000080) != 0;
|
||||
if (backface_enabled) {
|
||||
// A2XX_RB_DEPTHCONTROL_STENCILFUNC
|
||||
glStencilFuncSeparate(GL_FRONT,
|
||||
compare_func_map[(depth_control & 0x00000700) >> 8],
|
||||
stencil_ref, stencil_read_mask);
|
||||
// A2XX_RB_DEPTHCONTROL_STENCILFAIL
|
||||
// A2XX_RB_DEPTHCONTROL_STENCILZFAIL
|
||||
// A2XX_RB_DEPTHCONTROL_STENCILZPASS
|
||||
glStencilOpSeparate(GL_FRONT,
|
||||
stencil_op_map[(depth_control & 0x00003800) >> 11],
|
||||
stencil_op_map[(depth_control & 0x000E0000) >> 17],
|
||||
stencil_op_map[(depth_control & 0x0001C000) >> 14]);
|
||||
// A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF
|
||||
glStencilFuncSeparate(GL_BACK,
|
||||
compare_func_map[(depth_control & 0x00700000) >> 20],
|
||||
stencil_ref, stencil_read_mask);
|
||||
// A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF
|
||||
// A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF
|
||||
// A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF
|
||||
glStencilOpSeparate(GL_BACK,
|
||||
stencil_op_map[(depth_control & 0x03800000) >> 23],
|
||||
stencil_op_map[(depth_control & 0xE0000000) >> 29],
|
||||
stencil_op_map[(depth_control & 0x1C000000) >> 26]);
|
||||
} else {
|
||||
// Backfaces disabled - treat backfaces as frontfaces.
|
||||
glStencilFunc(compare_func_map[(depth_control & 0x00000700) >> 8],
|
||||
stencil_ref, stencil_read_mask);
|
||||
glStencilOp(stencil_op_map[(depth_control & 0x00003800) >> 11],
|
||||
stencil_op_map[(depth_control & 0x000E0000) >> 17],
|
||||
stencil_op_map[(depth_control & 0x0001C000) >> 14]);
|
||||
}
|
||||
|
||||
glUnmapNamedBuffer(uniform_data_buffer_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CommandProcessor::UpdateRenderTargets() {
|
||||
auto& regs = *register_file_;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CommandProcessor::IssueDraw(DrawCommand* draw_command) {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace gl4
|
||||
} // namespace gpu
|
||||
} // namespace xe
|
||||
|
|
|
@ -12,10 +12,12 @@
|
|||
|
||||
#include <atomic>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include <xenia/gpu/gl4/gl_context.h>
|
||||
#include <xenia/gpu/gl4/gl4_shader.h>
|
||||
#include <xenia/gpu/register_file.h>
|
||||
#include <xenia/gpu/xenos.h>
|
||||
|
@ -27,6 +29,42 @@ namespace gl4 {
|
|||
|
||||
class GL4GraphicsSystem;
|
||||
|
||||
// TODO(benvanik): move more of the enums in here?
|
||||
struct DrawCommand {
|
||||
PrimitiveType prim_type;
|
||||
uint32_t start_index;
|
||||
uint32_t index_count;
|
||||
uint32_t base_vertex;
|
||||
|
||||
GL4Shader* vertex_shader;
|
||||
GL4Shader* pixel_shader;
|
||||
|
||||
// Index buffer, if present.
|
||||
// If index_count > 0 but buffer is nullptr then auto draw.
|
||||
//IndexBufferResource* index_buffer;
|
||||
void* index_buffer;
|
||||
|
||||
// Vertex buffers.
|
||||
struct {
|
||||
uint32_t input_index;
|
||||
//VertexBufferResource* buffer;
|
||||
uint32_t stride;
|
||||
uint32_t offset;
|
||||
} vertex_buffers[96];
|
||||
size_t vertex_buffer_count;
|
||||
|
||||
// Texture samplers.
|
||||
struct SamplerInput {
|
||||
uint32_t input_index;
|
||||
//TextureResource* texture;
|
||||
//SamplerStateResource* sampler_state;
|
||||
};
|
||||
SamplerInput vertex_shader_samplers[32];
|
||||
size_t vertex_shader_sampler_count;
|
||||
SamplerInput pixel_shader_samplers[32];
|
||||
size_t pixel_shader_sampler_count;
|
||||
};
|
||||
|
||||
class CommandProcessor {
|
||||
public:
|
||||
CommandProcessor(GL4GraphicsSystem* graphics_system);
|
||||
|
@ -38,8 +76,10 @@ class CommandProcessor {
|
|||
uint32_t counter() const { return counter_; }
|
||||
void increment_counter() { counter_++; }
|
||||
|
||||
void Initialize(uint32_t ptr, uint32_t page_count);
|
||||
bool Initialize(std::unique_ptr<GLContext> context);
|
||||
void Shutdown();
|
||||
|
||||
void InitializeRingBuffer(uint32_t ptr, uint32_t page_count);
|
||||
void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size);
|
||||
|
||||
void UpdateWritePointer(uint32_t value);
|
||||
|
@ -48,9 +88,12 @@ class CommandProcessor {
|
|||
class RingbufferReader;
|
||||
|
||||
void WorkerMain();
|
||||
bool SetupGL();
|
||||
void ShutdownGL();
|
||||
|
||||
void WriteRegister(uint32_t packet_ptr, uint32_t index, uint32_t value);
|
||||
void MakeCoherent();
|
||||
void PrepareForWait();
|
||||
|
||||
void ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
|
||||
void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length);
|
||||
|
@ -113,6 +156,11 @@ class CommandProcessor {
|
|||
bool LoadShader(ShaderType shader_type, const uint32_t* address,
|
||||
uint32_t dword_count);
|
||||
|
||||
bool PrepareDraw(DrawCommand* draw_command);
|
||||
bool UpdateState(DrawCommand* draw_command);
|
||||
bool UpdateRenderTargets();
|
||||
bool IssueDraw(DrawCommand* draw_command);
|
||||
|
||||
Memory* memory_;
|
||||
uint8_t* membase_;
|
||||
GL4GraphicsSystem* graphics_system_;
|
||||
|
@ -120,7 +168,7 @@ class CommandProcessor {
|
|||
|
||||
std::thread worker_thread_;
|
||||
std::atomic<bool> worker_running_;
|
||||
|
||||
std::unique_ptr<GLContext> context_;
|
||||
std::function<void()> swap_handler_;
|
||||
|
||||
uint64_t time_base_;
|
||||
|
@ -143,6 +191,10 @@ class CommandProcessor {
|
|||
std::unordered_map<uint64_t, GL4Shader*> shader_cache_;
|
||||
GL4Shader* active_vertex_shader_;
|
||||
GL4Shader* active_pixel_shader_;
|
||||
|
||||
GLuint uniform_data_buffer_;
|
||||
|
||||
DrawCommand draw_command_;
|
||||
};
|
||||
|
||||
} // namespace gl4
|
||||
|
|
|
@ -32,9 +32,19 @@ X_STATUS GL4GraphicsSystem::Setup() {
|
|||
// This must happen on the UI thread.
|
||||
poly::threading::Fence control_ready_fence;
|
||||
auto loop = emulator_->main_window()->loop();
|
||||
std::unique_ptr<GLContext> processor_context;
|
||||
loop->Post([&]() {
|
||||
// Setup the GL control that actually does the drawing.
|
||||
// We run here in the loop and only touch it (and its context) on this
|
||||
// thread. That means some sync-fu when we want to swap.
|
||||
control_ = std::make_unique<WGLControl>(loop);
|
||||
emulator_->main_window()->AddChild(control_.get());
|
||||
|
||||
// Setup the GL context the command processor will do all its drawing in.
|
||||
// It's shared with the control context so that we can resolve framebuffers
|
||||
// from it.
|
||||
processor_context = control_->context()->CreateShared();
|
||||
|
||||
control_ready_fence.Signal();
|
||||
});
|
||||
control_ready_fence.Wait();
|
||||
|
@ -42,6 +52,10 @@ X_STATUS GL4GraphicsSystem::Setup() {
|
|||
// Create command processor. This will spin up a thread to process all
|
||||
// incoming ringbuffer packets.
|
||||
command_processor_ = std::make_unique<CommandProcessor>(this);
|
||||
if (!command_processor_->Initialize(std::move(processor_context))) {
|
||||
PLOGE("Unable to initialize command processor");
|
||||
return X_STATUS_UNSUCCESSFUL;
|
||||
}
|
||||
command_processor_->set_swap_handler(
|
||||
std::bind(&GL4GraphicsSystem::SwapHandler, this));
|
||||
|
||||
|
@ -76,7 +90,7 @@ void GL4GraphicsSystem::Shutdown() {
|
|||
|
||||
void GL4GraphicsSystem::InitializeRingBuffer(uint32_t ptr,
|
||||
uint32_t page_count) {
|
||||
command_processor_->Initialize(ptr, page_count);
|
||||
command_processor_->InitializeRingBuffer(ptr, page_count);
|
||||
}
|
||||
|
||||
void GL4GraphicsSystem::EnableReadPointerWriteBack(uint32_t ptr,
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#include <xenia/gpu/gl4/gl_context.h>
|
||||
|
||||
#include <poly/assert.h>
|
||||
#include <poly/logging.h>
|
||||
|
||||
namespace xe {
|
||||
|
@ -20,17 +21,26 @@ thread_local WGLEWContext* tls_wglew_context_ = nullptr;
|
|||
extern "C" GLEWContext* glewGetContext() { return tls_glew_context_; }
|
||||
extern "C" WGLEWContext* wglewGetContext() { return tls_wglew_context_; }
|
||||
|
||||
GLContext::GLContext() : dc_(nullptr), glrc_(nullptr) {}
|
||||
GLContext::GLContext() : hwnd_(nullptr), dc_(nullptr), glrc_(nullptr) {}
|
||||
|
||||
GLContext::GLContext(HWND hwnd, HGLRC glrc)
|
||||
: hwnd_(hwnd), dc_(nullptr), glrc_(glrc) {
|
||||
dc_ = GetDC(hwnd);
|
||||
}
|
||||
|
||||
GLContext::~GLContext() {
|
||||
wglMakeCurrent(nullptr, nullptr);
|
||||
if (glrc_) {
|
||||
wglDeleteContext(glrc_);
|
||||
}
|
||||
if (dc_) {
|
||||
ReleaseDC(hwnd_, dc_);
|
||||
}
|
||||
}
|
||||
|
||||
bool GLContext::Initialize(HDC dc) {
|
||||
dc_ = dc;
|
||||
bool GLContext::Initialize(HWND hwnd) {
|
||||
hwnd_ = hwnd;
|
||||
dc_ = GetDC(hwnd);
|
||||
|
||||
PIXELFORMATDESCRIPTOR pfd = {0};
|
||||
pfd.nSize = sizeof(pfd);
|
||||
|
@ -59,6 +69,7 @@ bool GLContext::Initialize(HDC dc) {
|
|||
|
||||
tls_glew_context_ = &glew_context_;
|
||||
tls_wglew_context_ = &wglew_context_;
|
||||
glewExperimental = GL_TRUE;
|
||||
if (glewInit() != GLEW_OK) {
|
||||
PLOGE("Unable to initialize GLEW");
|
||||
return false;
|
||||
|
@ -73,11 +84,10 @@ bool GLContext::Initialize(HDC dc) {
|
|||
return false;
|
||||
}
|
||||
|
||||
int context_flags = WGL_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB;
|
||||
int context_flags = 0;
|
||||
#if DEBUG
|
||||
context_flags |= WGL_CONTEXT_DEBUG_BIT_ARB;
|
||||
#endif // DEBUG
|
||||
|
||||
#endif // DEBUG
|
||||
int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, //
|
||||
WGL_CONTEXT_MINOR_VERSION_ARB, 5, //
|
||||
WGL_CONTEXT_FLAGS_ARB, context_flags, //
|
||||
|
@ -99,6 +109,45 @@ bool GLContext::Initialize(HDC dc) {
|
|||
return true;
|
||||
}
|
||||
|
||||
std::unique_ptr<GLContext> GLContext::CreateShared() {
|
||||
assert_not_null(glrc_);
|
||||
|
||||
int context_flags = 0;
|
||||
#if DEBUG
|
||||
context_flags |= WGL_CONTEXT_DEBUG_BIT_ARB;
|
||||
#endif // DEBUG
|
||||
int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, //
|
||||
WGL_CONTEXT_MINOR_VERSION_ARB, 5, //
|
||||
WGL_CONTEXT_FLAGS_ARB, context_flags, //
|
||||
0};
|
||||
auto new_glrc = wglCreateContextAttribsARB(dc_, glrc_, attrib_list);
|
||||
if (!new_glrc) {
|
||||
PLOGE("Could not create shared context");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto new_context = std::make_unique<GLContext>(hwnd_, new_glrc);
|
||||
if (!new_context->MakeCurrent()) {
|
||||
PLOGE("Could not make new GL context current");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
glewExperimental = GL_TRUE;
|
||||
if (glewInit() != GLEW_OK) {
|
||||
PLOGE("Unable to initialize GLEW");
|
||||
return nullptr;
|
||||
}
|
||||
if (wglewInit() != GLEW_OK) {
|
||||
PLOGE("Unable to initialize WGLEW");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
new_context->ClearCurrent();
|
||||
MakeCurrent();
|
||||
|
||||
return new_context;
|
||||
}
|
||||
|
||||
bool GLContext::MakeCurrent() {
|
||||
if (!wglMakeCurrent(dc_, glrc_)) {
|
||||
return false;
|
||||
|
|
|
@ -10,6 +10,8 @@
|
|||
#ifndef XENIA_GPU_GL4_GL_CONTEXT_H_
|
||||
#define XENIA_GPU_GL4_GL_CONTEXT_H_
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include <third_party/GL/glew.h>
|
||||
#include <third_party/GL/wglew.h>
|
||||
|
||||
|
@ -20,16 +22,20 @@ namespace gl4 {
|
|||
class GLContext {
|
||||
public:
|
||||
GLContext();
|
||||
GLContext(HWND hwnd, HGLRC glrc);
|
||||
~GLContext();
|
||||
|
||||
bool Initialize(HDC dc);
|
||||
bool Initialize(HWND hwnd);
|
||||
|
||||
HDC dc() const { return dc_; }
|
||||
|
||||
std::unique_ptr<GLContext> CreateShared();
|
||||
|
||||
bool MakeCurrent();
|
||||
void ClearCurrent();
|
||||
|
||||
private:
|
||||
HWND hwnd_;
|
||||
HDC dc_;
|
||||
HGLRC glrc_;
|
||||
|
||||
|
|
|
@ -56,13 +56,7 @@ bool WGLControl::Create() {
|
|||
return false;
|
||||
}
|
||||
|
||||
HDC dc = GetDC(hwnd_);
|
||||
if (!dc) {
|
||||
PLOGE("No DC for WGL window");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!context_.Initialize(dc)) {
|
||||
if (!context_.Initialize(hwnd_)) {
|
||||
PFATAL("Unable to initialize GL context");
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -100,7 +100,8 @@ void Shader::GatherExec(const instr_cf_exec_t* cf) {
|
|||
uint32_t alu_off = (cf->address + i);
|
||||
int sync = sequence & 0x2;
|
||||
if (sequence & 0x1) {
|
||||
auto fetch = reinterpret_cast<const instr_fetch_t*>(&data_[alu_off * 3]);
|
||||
auto fetch =
|
||||
reinterpret_cast<const instr_fetch_t*>(data_.data() + alu_off * 3);
|
||||
switch (fetch->opc) {
|
||||
case VTX_FETCH:
|
||||
GatherVertexFetch(&fetch->vtx);
|
||||
|
@ -121,7 +122,8 @@ void Shader::GatherExec(const instr_cf_exec_t* cf) {
|
|||
}
|
||||
} else {
|
||||
// TODO(benvanik): gather registers used, predicate bits used, etc.
|
||||
auto alu = reinterpret_cast<const instr_alu_t*>(&data_[alu_off * 3]);
|
||||
auto alu =
|
||||
reinterpret_cast<const instr_alu_t*>(data_.data() + alu_off * 3);
|
||||
if (alu->vector_write_mask) {
|
||||
if (alu->export_data && alu->vector_dest == 63) {
|
||||
alloc_counts_.point_size = true;
|
||||
|
|
Loading…
Reference in New Issue