Better vertex buffer construction, supporting packed data.

This commit is contained in:
Ben Vanik 2014-01-18 00:12:57 -08:00
parent ba9d343c51
commit 99c72f24f2
5 changed files with 241 additions and 164 deletions

View File

@ -139,9 +139,12 @@ int D3D11GraphicsDriver::SetupDraw(XE_GPU_PRIMITIVE_TYPE prim_type) {
case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_STRIP:
primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP;
break;
case XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST:
XELOGW("D3D11: faking RECTANGLE_LIST as a tri list");
primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
break;
case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_FAN:
case XE_GPU_PRIMITIVE_TYPE_UNKNOWN_07:
case XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST:
case XE_GPU_PRIMITIVE_TYPE_LINE_LOOP:
XELOGE("D3D11: unsupported primitive type %d", prim_type);
return 1;
@ -429,6 +432,17 @@ int D3D11GraphicsDriver::BindShaders() {
}
int D3D11GraphicsDriver::PrepareFetchers() {
XEASSERTNOTNULL(state_.vertex_shader);
auto inputs = state_.vertex_shader->GetVertexBufferInputs();
for (size_t n = 0; n < inputs->count; n++) {
auto input = inputs->descs[n];
if (PrepareVertexBuffer(input)) {
XELOGE("D3D11: unable to prepare vertex buffer");
return 1;
}
}
// TODO(benvanik): rewrite by sampler
RegisterFile& rf = register_file_;
for (int n = 0; n < 32; n++) {
int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + n * 6;
@ -437,38 +451,37 @@ int D3D11GraphicsDriver::PrepareFetchers() {
if (PrepareTextureFetcher(n, &group->texture_fetch)) {
return 1;
}
} else {
// TODO(benvanik): verify register numbering.
if (group->type_0 == 0x3) {
if (PrepareVertexFetcher(n * 3 + 0, &group->vertex_fetch_0)) {
return 1;
}
}
if (group->type_1 == 0x3) {
if (PrepareVertexFetcher(n * 3 + 1, &group->vertex_fetch_1)) {
return 1;
}
}
if (group->type_2 == 0x3) {
if (PrepareVertexFetcher(n * 3 + 2, &group->vertex_fetch_2)) {
return 1;
}
}
}
}
return 0;
}
int D3D11GraphicsDriver::PrepareVertexFetcher(
int fetch_slot, xe_gpu_vertex_fetch_t* fetch) {
uint32_t address = (fetch->address << 2) + address_translation_;
uint32_t size_dwords = fetch->size;
int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) {
RegisterFile& rf = register_file_;
int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6;
xe_gpu_fetch_group_t* group = (xe_gpu_fetch_group_t*)&rf.values[r];
xe_gpu_vertex_fetch_t* fetch = NULL;
switch (desc.fetch_slot % 3) {
case 0:
fetch = &group->vertex_fetch_0;
break;
case 1:
fetch = &group->vertex_fetch_1;
break;
case 2:
fetch = &group->vertex_fetch_2;
break;
}
XEASSERTNOTNULL(fetch);
// If this assert doesn't hold, maybe we just abort?
XEASSERT(fetch->type == 0x3);
XEASSERTNOTZERO(fetch->size);
ID3D11Buffer* buffer = 0;
D3D11_BUFFER_DESC buffer_desc;
xe_zero_struct(&buffer_desc, sizeof(buffer_desc));
buffer_desc.ByteWidth = size_dwords * 4;
buffer_desc.ByteWidth = fetch->size * 4;
buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
@ -484,15 +497,23 @@ int D3D11GraphicsDriver::PrepareVertexFetcher(
XESAFERELEASE(buffer);
return 1;
}
uint32_t* src = (uint32_t*)memory_->Translate(address);
uint32_t* dest = (uint32_t*)res.pData;
for (uint32_t n = 0; n < size_dwords; n++) {
// union {
// uint32_t i;
// float f;
// } d = {XESWAP32(src[n])};
// XELOGGPU("v%.3d %0.8X %g", n, d.i, d.f);
dest[n] = XESWAP32(src[n]);
uint32_t address = (fetch->address << 2) + address_translation_;
uint8_t* src = (uint8_t*)memory_->Translate(address);
uint8_t* dest = (uint8_t*)res.pData;
// TODO(benvanik): rewrite to be faster/special case common/etc
for (size_t n = 0; n < desc.element_count; n++) {
auto& el = desc.elements[n];
uint32_t stride = desc.stride_words;
uint32_t count = fetch->size / stride;
uint32_t* src_ptr = (uint32_t*)(src + el.offset_words * 4);
uint32_t* dest_ptr = (uint32_t*)(dest + el.offset_words * 4);
uint32_t o = 0;
for (uint32_t i = 0; i < count; i++) {
for (uint32_t j = 0; j < el.size_words; j++) {
dest_ptr[o + j] = XESWAP32(src_ptr[o + j]);
}
o += stride;
}
}
context_->Unmap(buffer, 0);
@ -500,14 +521,10 @@ int D3D11GraphicsDriver::PrepareVertexFetcher(
if (!vs) {
return 1;
}
const instr_fetch_vtx_t* vtx = vs->GetFetchVtxBySlot(fetch_slot);
if (!vtx->must_be_one) {
return 1;
}
// TODO(benvanik): always dword aligned?
uint32_t stride = vtx->stride * 4;
uint32_t stride = desc.stride_words * 4;
uint32_t offset = 0;
int vb_slot = 95 - fetch_slot;
int vb_slot = desc.input_index;
context_->IASetVertexBuffers(vb_slot, 1, &buffer, &stride, &offset);
buffer->Release();

View File

@ -13,6 +13,7 @@
#include <xenia/core.h>
#include <xenia/gpu/graphics_driver.h>
#include <xenia/gpu/shader.h>
#include <xenia/gpu/d3d11/d3d11_gpu-private.h>
#include <xenia/gpu/xenos/xenos.h>
@ -56,8 +57,7 @@ private:
int UpdateConstantBuffers();
int BindShaders();
int PrepareFetchers();
int PrepareVertexFetcher(
int fetch_slot, xenos::xe_gpu_vertex_fetch_t* fetch);
int PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc);
int PrepareTextureFetcher(
int fetch_slot, xenos::xe_gpu_texture_fetch_t* fetch);
int PrepareIndexBuffer(

View File

@ -184,90 +184,74 @@ int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) {
}
// Create input layout.
size_t element_count = fetch_vtxs_.size();
size_t element_count = 0;
for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) {
element_count += vtx_buffer_inputs_.descs[n].element_count;
}
D3D11_INPUT_ELEMENT_DESC* element_descs =
(D3D11_INPUT_ELEMENT_DESC*)xe_alloca(
sizeof(D3D11_INPUT_ELEMENT_DESC) * element_count);
int n = 0;
for (std::vector<instr_fetch_vtx_t>::iterator it = fetch_vtxs_.begin();
it != fetch_vtxs_.end(); ++it, ++n) {
const instr_fetch_vtx_t& vtx = *it;
uint32_t el_index = 0;
for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) {
auto& input = vtx_buffer_inputs_.descs[n];
for (uint32_t m = 0; m < input.element_count; m++) {
auto& el = input.elements[m];
uint32_t vb_slot = input.input_index;
uint32_t num_format_all = el.vtx_fetch.num_format_all;
uint32_t format_comp_all = el.vtx_fetch.format_comp_all;
DXGI_FORMAT vtx_format;
switch (vtx.format) {
case FMT_1_REVERSE:
vtx_format = DXGI_FORMAT_R1_UNORM; // ?
break;
case FMT_8:
if (!vtx.num_format_all) {
vtx_format = vtx.format_comp_all ?
DXGI_FORMAT_R8_SNORM : DXGI_FORMAT_R8_UNORM;
} else {
vtx_format = vtx.format_comp_all ?
DXGI_FORMAT_R8_SINT : DXGI_FORMAT_R8_UINT;
}
break;
switch (el.format) {
case FMT_8_8_8_8:
if (!vtx.num_format_all) {
vtx_format = vtx.format_comp_all ?
if (!num_format_all) {
vtx_format = format_comp_all ?
DXGI_FORMAT_R8G8B8A8_SNORM : DXGI_FORMAT_R8G8B8A8_UNORM;
} else {
vtx_format = vtx.format_comp_all ?
vtx_format = format_comp_all ?
DXGI_FORMAT_R8G8B8A8_SINT : DXGI_FORMAT_R8G8B8A8_UINT;
}
break;
case FMT_2_10_10_10:
if (!vtx.num_format_all) {
if (!num_format_all) {
vtx_format = DXGI_FORMAT_R10G10B10A2_UNORM;
} else {
vtx_format = DXGI_FORMAT_R10G10B10A2_UINT;
}
break;
case FMT_8_8:
if (!vtx.num_format_all) {
vtx_format = vtx.format_comp_all ?
DXGI_FORMAT_R8G8_SNORM : DXGI_FORMAT_R8G8_UNORM;
} else {
vtx_format = vtx.format_comp_all ?
DXGI_FORMAT_R8G8_SINT : DXGI_FORMAT_R8G8_UINT;
}
break;
case FMT_16:
if (!vtx.num_format_all) {
vtx_format = vtx.format_comp_all ?
DXGI_FORMAT_R16_SNORM : DXGI_FORMAT_R16_UNORM;
} else {
vtx_format = vtx.format_comp_all ?
DXGI_FORMAT_R16_SINT : DXGI_FORMAT_R16_UINT;
}
break;
// DXGI_FORMAT_R11G11B10_FLOAT?
case FMT_16_16:
if (!vtx.num_format_all) {
vtx_format = vtx.format_comp_all ?
if (!num_format_all) {
vtx_format = format_comp_all ?
DXGI_FORMAT_R16G16_SNORM : DXGI_FORMAT_R16G16_UNORM;
} else {
vtx_format = vtx.format_comp_all ?
vtx_format = format_comp_all ?
DXGI_FORMAT_R16G16_SINT : DXGI_FORMAT_R16G16_UINT;
}
break;
case FMT_16_16_16_16:
if (!vtx.num_format_all) {
vtx_format = vtx.format_comp_all ?
if (!num_format_all) {
vtx_format = format_comp_all ?
DXGI_FORMAT_R16G16B16A16_SNORM : DXGI_FORMAT_R16G16B16A16_UNORM;
} else {
vtx_format = vtx.format_comp_all ?
vtx_format = format_comp_all ?
DXGI_FORMAT_R16G16B16A16_SINT : DXGI_FORMAT_R16G16B16A16_UINT;
}
break;
case FMT_16_16_FLOAT:
vtx_format = DXGI_FORMAT_R16G16_FLOAT;
break;
case FMT_16_16_16_16_FLOAT:
vtx_format = DXGI_FORMAT_R16G16B16A16_FLOAT;
break;
case FMT_32:
vtx_format = vtx.format_comp_all ?
vtx_format = format_comp_all ?
DXGI_FORMAT_R32_SINT : DXGI_FORMAT_R32_UINT;
break;
case FMT_32_32:
vtx_format = vtx.format_comp_all ?
vtx_format = format_comp_all ?
DXGI_FORMAT_R32G32_SINT : DXGI_FORMAT_R32G32_UINT;
break;
case FMT_32_32_32_32:
vtx_format = vtx.format_comp_all ?
vtx_format = format_comp_all ?
DXGI_FORMAT_R32G32B32A32_SINT : DXGI_FORMAT_R32G32B32A32_UINT;
break;
case FMT_32_FLOAT:
@ -276,27 +260,25 @@ int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) {
case FMT_32_32_FLOAT:
vtx_format = DXGI_FORMAT_R32G32_FLOAT;
break;
case FMT_32_32_32_32_FLOAT:
vtx_format = DXGI_FORMAT_R32G32B32A32_FLOAT;
break;
case FMT_32_32_32_FLOAT:
vtx_format = DXGI_FORMAT_R32G32B32_FLOAT;
break;
case FMT_32_32_32_32_FLOAT:
vtx_format = DXGI_FORMAT_R32G32B32A32_FLOAT;
break;
default:
XEASSERTALWAYS();
break;
}
element_descs[n].SemanticName = "XE_VF";
element_descs[n].SemanticIndex = n;
element_descs[n].Format = vtx_format;
// Pick slot in same way that driver does.
// CONST(31, 2) = reg 31, index 2 = rf([31] * 6 + [2] * 2)
uint32_t fetch_slot = vtx.const_index * 3 + vtx.const_index_sel;
uint32_t vb_slot = 95 - fetch_slot;
element_descs[n].InputSlot = vb_slot;
element_descs[n].AlignedByteOffset = vtx.offset * 4;
element_descs[n].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
element_descs[n].InstanceDataStepRate = 0;
element_descs[el_index].SemanticName = "XE_VF";
element_descs[el_index].SemanticIndex = el_index;
element_descs[el_index].Format = vtx_format;
element_descs[el_index].InputSlot = vb_slot;
element_descs[el_index].AlignedByteOffset = el.offset_words * 4;
element_descs[el_index].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
element_descs[el_index].InstanceDataStepRate = 0;
el_index++;
}
}
hr = device_->CreateInputLayout(
element_descs,
@ -337,13 +319,17 @@ const char* D3D11VertexShader::Translate(xe_gpu_program_cntl_t* program_cntl) {
// Add vertex shader input.
output->append(
"struct VS_INPUT {\n");
int n = 0;
for (std::vector<instr_fetch_vtx_t>::iterator it = fetch_vtxs_.begin();
it != fetch_vtxs_.end(); ++it, ++n) {
const instr_fetch_vtx_t& vtx = *it;
uint32_t el_index = 0;
for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) {
auto& input = vtx_buffer_inputs_.descs[n];
for (uint32_t m = 0; m < input.element_count; m++) {
auto& el = input.elements[m];
auto& vtx = el.vtx_fetch;
uint32_t fetch_slot = vtx.const_index * 3 + vtx.const_index_sel;
output->append(
" float4 vf%u_%d : XE_VF%u;\n", fetch_slot, vtx.offset, n);
" float4 vf%u_%d : XE_VF%u;\n", fetch_slot, vtx.offset, el_index);
el_index++;
}
}
output->append(
"};\n");

View File

@ -23,7 +23,7 @@ Shader::Shader(
uint64_t hash) :
type_(type), hash_(hash), is_prepared_(false), disasm_src_(NULL) {
xe_zero_struct(&alloc_counts_, sizeof(alloc_counts_));
xe_zero_struct(fetch_vtx_slots_, sizeof(fetch_vtx_slots_));
xe_zero_struct(&vtx_buffer_inputs_, sizeof(vtx_buffer_inputs_));
// Verify.
dword_count_ = length / 4;
@ -146,14 +146,72 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) {
// num_format_all ? integer : fraction
// exp_adjust_all - [-32,31] - (2^exp_adjust_all)*fetch - 0 = default
fetch_vtxs_.push_back(*vtx);
uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel;
fetch_vtx_slots_[fetch_slot] = *vtx;
auto& inputs = vtx_buffer_inputs_;
vtx_buffer_element_t* el = NULL;
for (size_t n = 0; n < inputs.count; n++) {
auto& input = inputs.descs[n];
if (input.fetch_slot == fetch_slot) {
XEASSERT(input.element_count + 1 < XECOUNT(input.elements));
// It may not hold that all strides are equal, but I hope it does.
XEASSERT(!vtx->stride || input.stride_words == vtx->stride);
el = &input.elements[input.element_count++];
break;
}
}
if (!el) {
XEASSERTNOTZERO(vtx->stride);
XEASSERT(inputs.count + 1 < XECOUNT(inputs.descs));
auto& input = inputs.descs[inputs.count++];
input.input_index = inputs.count - 1;
input.fetch_slot = fetch_slot;
input.stride_words = vtx->stride;
el = &input.elements[input.element_count++];
}
const instr_fetch_vtx_t* Shader::GetFetchVtxBySlot(uint32_t fetch_slot) {
return &fetch_vtx_slots_[fetch_slot];
el->vtx_fetch = *vtx;
el->format = vtx->format;
el->offset_words = vtx->offset;
el->size_words = 0;
switch (el->format) {
case FMT_8_8_8_8:
case FMT_2_10_10_10:
case FMT_10_11_11:
case FMT_11_11_10:
el->size_words = 1;
break;
case FMT_16_16:
case FMT_16_16_FLOAT:
el->size_words = 1;
break;
case FMT_16_16_16_16:
case FMT_16_16_16_16_FLOAT:
el->size_words = 2;
break;
case FMT_32:
case FMT_32_FLOAT:
el->size_words = 1;
break;
case FMT_32_32:
case FMT_32_32_FLOAT:
el->size_words = 2;
break;
case FMT_32_32_32_FLOAT:
el->size_words = 3;
break;
case FMT_32_32_32_32:
case FMT_32_32_32_32_FLOAT:
el->size_words = 4;
break;
default:
XELOGE("Unknown vertex format: %d", el->format);
XEASSERTALWAYS();
break;
}
}
// Returns the vertex buffer inputs gathered from this shader's vertex fetch
// instructions. The returned pointer refers to a member of this Shader.
const Shader::vtx_buffer_inputs_t* Shader::GetVertexBufferInputs() {
return &vtx_buffer_inputs_;
}
void Shader::GatherTextureFetch(const xenos::instr_fetch_tex_t* tex) {

View File

@ -34,7 +34,24 @@ public:
const char* disasm_src() const { return disasm_src_; }
const xenos::instr_fetch_vtx_t* GetFetchVtxBySlot(uint32_t fetch_slot);
// One element read out of a vertex buffer, gathered from a single vertex
// fetch instruction.
typedef struct {
// Copy of the fetch instruction this element was gathered from.
xenos::instr_fetch_vtx_t vtx_fetch;
// Vertex format copied from the fetch instruction (FMT_* value).
uint32_t format;
// Element offset in 32-bit words, copied from the fetch instruction; users
// multiply by 4 to obtain a byte offset.
uint32_t offset_words;
// Element size in 32-bit words, derived from format; left at 0 when the
// format is not recognized.
uint32_t size_words;
} vtx_buffer_element_t;
// Describes one vertex buffer input of a shader. All elements fetched from
// the same fetch slot are grouped under a single desc.
typedef struct {
// Index of this desc within vtx_buffer_inputs_t::descs; the D3D11 driver
// also uses it as the vertex buffer input slot.
uint32_t input_index;
// Fetch slot, computed as const_index * 3 + const_index_sel of the fetch
// instruction.
uint32_t fetch_slot;
// Vertex stride in 32-bit words, taken from the first fetch instruction
// gathered for this slot.
uint32_t stride_words;
// Number of valid entries in elements.
uint32_t element_count;
vtx_buffer_element_t elements[16];
} vtx_buffer_desc_t;
// The set of vertex buffer inputs gathered for a shader, one desc per
// distinct fetch slot.
typedef struct {
// Number of valid entries in descs.
uint32_t count;
vtx_buffer_desc_t descs[16];
} vtx_buffer_inputs_t;
const vtx_buffer_inputs_t* GetVertexBufferInputs();
typedef struct {
uint32_t positions;
@ -62,8 +79,7 @@ protected:
alloc_counts_t alloc_counts_;
std::vector<xenos::instr_cf_exec_t> execs_;
std::vector<xenos::instr_cf_alloc_t> allocs_;
std::vector<xenos::instr_fetch_vtx_t> fetch_vtxs_;
xenos::instr_fetch_vtx_t fetch_vtx_slots_[96];
vtx_buffer_inputs_t vtx_buffer_inputs_;
std::vector<xenos::instr_fetch_tex_t> fetch_texs_;
};