From 99c72f24f2362c9b06c6073d950c4ce1c4904e03 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sat, 18 Jan 2014 00:12:57 -0800 Subject: [PATCH] Better vertex buffer construction, supporting packed data. --- src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 93 ++++---- src/xenia/gpu/d3d11/d3d11_graphics_driver.h | 4 +- src/xenia/gpu/d3d11/d3d11_shader.cc | 216 +++++++++---------- src/xenia/gpu/shader.cc | 70 +++++- src/xenia/gpu/shader.h | 22 +- 5 files changed, 241 insertions(+), 164 deletions(-) diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index c160eb943..dda1d3a7f 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -139,9 +139,12 @@ int D3D11GraphicsDriver::SetupDraw(XE_GPU_PRIMITIVE_TYPE prim_type) { case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_STRIP: primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP; break; + case XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST: + XELOGW("D3D11: faking RECTANGLE_LIST as a tri list"); + primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + break; case XE_GPU_PRIMITIVE_TYPE_TRIANGLE_FAN: case XE_GPU_PRIMITIVE_TYPE_UNKNOWN_07: - case XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST: case XE_GPU_PRIMITIVE_TYPE_LINE_LOOP: XELOGE("D3D11: unsupported primitive type %d", prim_type); return 1; @@ -429,6 +432,17 @@ int D3D11GraphicsDriver::BindShaders() { } int D3D11GraphicsDriver::PrepareFetchers() { + XEASSERTNOTNULL(state_.vertex_shader); + auto inputs = state_.vertex_shader->GetVertexBufferInputs(); + for (size_t n = 0; n < inputs->count; n++) { + auto input = inputs->descs[n]; + if (PrepareVertexBuffer(input)) { + XELOGE("D3D11: unable to prepare vertex buffer"); + return 1; + } + } + + // TODO(benvanik): rewrite by sampler RegisterFile& rf = register_file_; for (int n = 0; n < 32; n++) { int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + n * 6; @@ -437,38 +451,37 @@ int D3D11GraphicsDriver::PrepareFetchers() { if (PrepareTextureFetcher(n, &group->texture_fetch)) { return 1; } - } else { - // TODO(benvanik): verify register numbering. - if (group->type_0 == 0x3) { - if (PrepareVertexFetcher(n * 3 + 0, &group->vertex_fetch_0)) { - return 1; - } - } - if (group->type_1 == 0x3) { - if (PrepareVertexFetcher(n * 3 + 1, &group->vertex_fetch_1)) { - return 1; - } - } - if (group->type_2 == 0x3) { - if (PrepareVertexFetcher(n * 3 + 2, &group->vertex_fetch_2)) { - return 1; - } - } } } return 0; } -int D3D11GraphicsDriver::PrepareVertexFetcher( - int fetch_slot, xe_gpu_vertex_fetch_t* fetch) { - uint32_t address = (fetch->address << 2) + address_translation_; - uint32_t size_dwords = fetch->size; +int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) { + RegisterFile& rf = register_file_; + int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; + xe_gpu_fetch_group_t* group = (xe_gpu_fetch_group_t*)&rf.values[r]; + xe_gpu_vertex_fetch_t* fetch = NULL; + switch (desc.fetch_slot % 3) { + case 0: + fetch = &group->vertex_fetch_0; + break; + case 1: + fetch = &group->vertex_fetch_1; + break; + case 2: + fetch = &group->vertex_fetch_2; + break; + } + XEASSERTNOTNULL(fetch); + // If this assert doesn't hold, maybe we just abort? + XEASSERT(fetch->type == 0x3); + XEASSERTNOTZERO(fetch->size); ID3D11Buffer* buffer = 0; D3D11_BUFFER_DESC buffer_desc; xe_zero_struct(&buffer_desc, sizeof(buffer_desc)); - buffer_desc.ByteWidth = size_dwords * 4; + buffer_desc.ByteWidth = fetch->size * 4; buffer_desc.Usage = D3D11_USAGE_DYNAMIC; buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER; buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; @@ -484,15 +497,23 @@ int D3D11GraphicsDriver::PrepareVertexFetcher( XESAFERELEASE(buffer); return 1; } - uint32_t* src = (uint32_t*)memory_->Translate(address); - uint32_t* dest = (uint32_t*)res.pData; - for (uint32_t n = 0; n < size_dwords; n++) { - // union { - // uint32_t i; - // float f; - // } d = {XESWAP32(src[n])}; - // XELOGGPU("v%.3d %0.8X %g", n, d.i, d.f); - dest[n] = XESWAP32(src[n]); + uint32_t address = (fetch->address << 2) + address_translation_; + uint8_t* src = (uint8_t*)memory_->Translate(address); + uint8_t* dest = (uint8_t*)res.pData; + // TODO(benvanik): rewrite to be faster/special case common/etc + for (size_t n = 0; n < desc.element_count; n++) { + auto& el = desc.elements[n]; + uint32_t stride = desc.stride_words; + uint32_t count = fetch->size / stride; + uint32_t* src_ptr = (uint32_t*)(src + el.offset_words * 4); + uint32_t* dest_ptr = (uint32_t*)(dest + el.offset_words * 4); + uint32_t o = 0; + for (uint32_t i = 0; i < count; i++) { + for (uint32_t j = 0; j < el.size_words; j++) { + dest_ptr[o + j] = XESWAP32(src_ptr[o + j]); + } + o += stride; + } } context_->Unmap(buffer, 0); @@ -500,14 +521,10 @@ int D3D11GraphicsDriver::PrepareVertexFetcher( if (!vs) { return 1; } - const instr_fetch_vtx_t* vtx = vs->GetFetchVtxBySlot(fetch_slot); - if (!vtx->must_be_one) { - return 1; - } // TODO(benvanik): always dword aligned? - uint32_t stride = vtx->stride * 4; + uint32_t stride = desc.stride_words * 4; uint32_t offset = 0; - int vb_slot = 95 - fetch_slot; + int vb_slot = desc.input_index; context_->IASetVertexBuffers(vb_slot, 1, &buffer, &stride, &offset); buffer->Release(); diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h index 7a9714880..75db4e7b0 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -56,8 +57,7 @@ private: int UpdateConstantBuffers(); int BindShaders(); int PrepareFetchers(); - int PrepareVertexFetcher( - int fetch_slot, xenos::xe_gpu_vertex_fetch_t* fetch); + int PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc); int PrepareTextureFetcher( int fetch_slot, xenos::xe_gpu_texture_fetch_t* fetch); int PrepareIndexBuffer( diff --git a/src/xenia/gpu/d3d11/d3d11_shader.cc b/src/xenia/gpu/d3d11/d3d11_shader.cc index 2f114dfe7..7b2e14b6d 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader.cc +++ b/src/xenia/gpu/d3d11/d3d11_shader.cc @@ -184,119 +184,101 @@ int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) { } // Create input layout. - size_t element_count = fetch_vtxs_.size(); + size_t element_count = 0; + for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) { + element_count += vtx_buffer_inputs_.descs[n].element_count; + } D3D11_INPUT_ELEMENT_DESC* element_descs = (D3D11_INPUT_ELEMENT_DESC*)xe_alloca( sizeof(D3D11_INPUT_ELEMENT_DESC) * element_count); - int n = 0; - for (std::vector::iterator it = fetch_vtxs_.begin(); - it != fetch_vtxs_.end(); ++it, ++n) { - const instr_fetch_vtx_t& vtx = *it; - DXGI_FORMAT vtx_format; - switch (vtx.format) { - case FMT_1_REVERSE: - vtx_format = DXGI_FORMAT_R1_UNORM; // ? - break; - case FMT_8: - if (!vtx.num_format_all) { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R8_SNORM : DXGI_FORMAT_R8_UNORM; - } else { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R8_SINT : DXGI_FORMAT_R8_UINT; + uint32_t el_index = 0; + for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) { + auto& input = vtx_buffer_inputs_.descs[n]; + for (uint32_t m = 0; m < input.element_count; m++) { + auto& el = input.elements[m]; + uint32_t vb_slot = input.input_index; + uint32_t num_format_all = el.vtx_fetch.num_format_all; + uint32_t format_comp_all = el.vtx_fetch.format_comp_all; + DXGI_FORMAT vtx_format; + switch (el.format) { + case FMT_8_8_8_8: + if (!num_format_all) { + vtx_format = format_comp_all ? + DXGI_FORMAT_R8G8B8A8_SNORM : DXGI_FORMAT_R8G8B8A8_UNORM; + } else { + vtx_format = format_comp_all ? + DXGI_FORMAT_R8G8B8A8_SINT : DXGI_FORMAT_R8G8B8A8_UINT; + } + break; + case FMT_2_10_10_10: + if (!num_format_all) { + vtx_format = DXGI_FORMAT_R10G10B10A2_UNORM; + } else { + vtx_format = DXGI_FORMAT_R10G10B10A2_UINT; + } + break; + // DXGI_FORMAT_R11G11B10_FLOAT? + case FMT_16_16: + if (!num_format_all) { + vtx_format = format_comp_all ? + DXGI_FORMAT_R16G16_SNORM : DXGI_FORMAT_R16G16_UNORM; + } else { + vtx_format = format_comp_all ? + DXGI_FORMAT_R16G16_SINT : DXGI_FORMAT_R16G16_UINT; + } + break; + case FMT_16_16_16_16: + if (!num_format_all) { + vtx_format = format_comp_all ? + DXGI_FORMAT_R16G16B16A16_SNORM : DXGI_FORMAT_R16G16B16A16_UNORM; + } else { + vtx_format = format_comp_all ? + DXGI_FORMAT_R16G16B16A16_SINT : DXGI_FORMAT_R16G16B16A16_UINT; + } + break; + case FMT_16_16_FLOAT: + vtx_format = DXGI_FORMAT_R16G16_FLOAT; + break; + case FMT_16_16_16_16_FLOAT: + vtx_format = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + case FMT_32: + vtx_format = format_comp_all ? + DXGI_FORMAT_R32_SINT : DXGI_FORMAT_R32_UINT; + break; + case FMT_32_32: + vtx_format = format_comp_all ? + DXGI_FORMAT_R32G32_SINT : DXGI_FORMAT_R32G32_UINT; + break; + case FMT_32_32_32_32: + vtx_format = format_comp_all ? + DXGI_FORMAT_R32G32B32A32_SINT : DXGI_FORMAT_R32G32B32A32_UINT; + break; + case FMT_32_FLOAT: + vtx_format = DXGI_FORMAT_R32_FLOAT; + break; + case FMT_32_32_FLOAT: + vtx_format = DXGI_FORMAT_R32G32_FLOAT; + break; + case FMT_32_32_32_FLOAT: + vtx_format = DXGI_FORMAT_R32G32B32_FLOAT; + break; + case FMT_32_32_32_32_FLOAT: + vtx_format = DXGI_FORMAT_R32G32B32A32_FLOAT; + break; + default: + XEASSERTALWAYS(); + break; } - break; - case FMT_8_8_8_8: - if (!vtx.num_format_all) { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R8G8B8A8_SNORM : DXGI_FORMAT_R8G8B8A8_UNORM; - } else { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R8G8B8A8_SINT : DXGI_FORMAT_R8G8B8A8_UINT; - } - break; - case FMT_2_10_10_10: - if (!vtx.num_format_all) { - vtx_format = DXGI_FORMAT_R10G10B10A2_UNORM; - } else { - vtx_format = DXGI_FORMAT_R10G10B10A2_UINT; - } - break; - case FMT_8_8: - if (!vtx.num_format_all) { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R8G8_SNORM : DXGI_FORMAT_R8G8_UNORM; - } else { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R8G8_SINT : DXGI_FORMAT_R8G8_UINT; - } - break; - case FMT_16: - if (!vtx.num_format_all) { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R16_SNORM : DXGI_FORMAT_R16_UNORM; - } else { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R16_SINT : DXGI_FORMAT_R16_UINT; - } - break; - case FMT_16_16: - if (!vtx.num_format_all) { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R16G16_SNORM : DXGI_FORMAT_R16G16_UNORM; - } else { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R16G16_SINT : DXGI_FORMAT_R16G16_UINT; - } - break; - case FMT_16_16_16_16: - if (!vtx.num_format_all) { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R16G16B16A16_SNORM : DXGI_FORMAT_R16G16B16A16_UNORM; - } else { - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R16G16B16A16_SINT : DXGI_FORMAT_R16G16B16A16_UINT; - } - break; - case FMT_32: - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R32_SINT : DXGI_FORMAT_R32_UINT; - break; - case FMT_32_32: - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R32G32_SINT : DXGI_FORMAT_R32G32_UINT; - break; - case FMT_32_32_32_32: - vtx_format = vtx.format_comp_all ? - DXGI_FORMAT_R32G32B32A32_SINT : DXGI_FORMAT_R32G32B32A32_UINT; - break; - case FMT_32_FLOAT: - vtx_format = DXGI_FORMAT_R32_FLOAT; - break; - case FMT_32_32_FLOAT: - vtx_format = DXGI_FORMAT_R32G32_FLOAT; - break; - case FMT_32_32_32_32_FLOAT: - vtx_format = DXGI_FORMAT_R32G32B32A32_FLOAT; - break; - case FMT_32_32_32_FLOAT: - vtx_format = DXGI_FORMAT_R32G32B32_FLOAT; - break; - default: - XEASSERTALWAYS(); - break; + element_descs[el_index].SemanticName = "XE_VF"; + element_descs[el_index].SemanticIndex = el_index; + element_descs[el_index].Format = vtx_format; + element_descs[el_index].InputSlot = vb_slot; + element_descs[el_index].AlignedByteOffset = el.offset_words * 4; + element_descs[el_index].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA; + element_descs[el_index].InstanceDataStepRate = 0; + el_index++; } - element_descs[n].SemanticName = "XE_VF"; - element_descs[n].SemanticIndex = n; - element_descs[n].Format = vtx_format; - // Pick slot in same way that driver does. - // CONST(31, 2) = reg 31, index 2 = rf([31] * 6 + [2] * 2) - uint32_t fetch_slot = vtx.const_index * 3 + vtx.const_index_sel; - uint32_t vb_slot = 95 - fetch_slot; - element_descs[n].InputSlot = vb_slot; - element_descs[n].AlignedByteOffset = vtx.offset * 4; - element_descs[n].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA; - element_descs[n].InstanceDataStepRate = 0; } hr = device_->CreateInputLayout( element_descs, @@ -337,13 +319,17 @@ const char* D3D11VertexShader::Translate(xe_gpu_program_cntl_t* program_cntl) { // Add vertex shader input. output->append( "struct VS_INPUT {\n"); - int n = 0; - for (std::vector::iterator it = fetch_vtxs_.begin(); - it != fetch_vtxs_.end(); ++it, ++n) { - const instr_fetch_vtx_t& vtx = *it; - uint32_t fetch_slot = vtx.const_index * 3 + vtx.const_index_sel; - output->append( - " float4 vf%u_%d : XE_VF%u;\n", fetch_slot, vtx.offset, n); + uint32_t el_index = 0; + for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) { + auto& input = vtx_buffer_inputs_.descs[n]; + for (uint32_t m = 0; m < input.element_count; m++) { + auto& el = input.elements[m]; + auto& vtx = el.vtx_fetch; + uint32_t fetch_slot = vtx.const_index * 3 + vtx.const_index_sel; + output->append( + " float4 vf%u_%d : XE_VF%u;\n", fetch_slot, vtx.offset, el_index); + el_index++; + } } output->append( "};\n"); diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc index 4f6b31f63..8522b9eb0 100644 --- a/src/xenia/gpu/shader.cc +++ b/src/xenia/gpu/shader.cc @@ -23,7 +23,7 @@ Shader::Shader( uint64_t hash) : type_(type), hash_(hash), is_prepared_(false), disasm_src_(NULL) { xe_zero_struct(&alloc_counts_, sizeof(alloc_counts_)); - xe_zero_struct(fetch_vtx_slots_, sizeof(fetch_vtx_slots_)); + xe_zero_struct(&vtx_buffer_inputs_, sizeof(vtx_buffer_inputs_)); // Verify. dword_count_ = length / 4; @@ -146,14 +146,72 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) { // num_format_all ? integer : fraction // exp_adjust_all - [-32,31] - (2^exp_adjust_all)*fetch - 0 = default - fetch_vtxs_.push_back(*vtx); - uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; - fetch_vtx_slots_[fetch_slot] = *vtx; + auto& inputs = vtx_buffer_inputs_; + vtx_buffer_element_t* el = NULL; + for (size_t n = 0; n < inputs.count; n++) { + auto& input = inputs.descs[n]; + if (input.fetch_slot == fetch_slot) { + XEASSERT(input.element_count + 1 < XECOUNT(input.elements)); + // It may not hold that all strides are equal, but I hope it does. + XEASSERT(!vtx->stride || input.stride_words == vtx->stride); + el = &input.elements[input.element_count++]; + break; + } + } + if (!el) { + XEASSERTNOTZERO(vtx->stride); + XEASSERT(inputs.count + 1 < XECOUNT(inputs.descs)); + auto& input = inputs.descs[inputs.count++]; + input.input_index = inputs.count - 1; + input.fetch_slot = fetch_slot; + input.stride_words = vtx->stride; + el = &input.elements[input.element_count++]; + } + + el->vtx_fetch = *vtx; + el->format = vtx->format; + el->offset_words = vtx->offset; + el->size_words = 0; + switch (el->format) { + case FMT_8_8_8_8: + case FMT_2_10_10_10: + case FMT_10_11_11: + case FMT_11_11_10: + el->size_words = 1; + break; + case FMT_16_16: + case FMT_16_16_FLOAT: + el->size_words = 1; + break; + case FMT_16_16_16_16: + case FMT_16_16_16_16_FLOAT: + el->size_words = 2; + break; + case FMT_32: + case FMT_32_FLOAT: + el->size_words = 1; + break; + case FMT_32_32: + case FMT_32_32_FLOAT: + el->size_words = 2; + break; + case FMT_32_32_32_FLOAT: + el->size_words = 3; + break; + case FMT_32_32_32_32: + case FMT_32_32_32_32_FLOAT: + el->size_words = 4; + break; + default: + XELOGE("Unknown vertex format: %d", el->format); + XEASSERTALWAYS(); + break; + } } -const instr_fetch_vtx_t* Shader::GetFetchVtxBySlot(uint32_t fetch_slot) { - return &fetch_vtx_slots_[fetch_slot]; +const Shader::vtx_buffer_inputs_t* Shader::GetVertexBufferInputs() { + return &vtx_buffer_inputs_; } void Shader::GatherTextureFetch(const xenos::instr_fetch_tex_t* tex) { diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 04073085e..ff26175ec 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -34,7 +34,24 @@ public: const char* disasm_src() const { return disasm_src_; } - const xenos::instr_fetch_vtx_t* GetFetchVtxBySlot(uint32_t fetch_slot); + typedef struct { + xenos::instr_fetch_vtx_t vtx_fetch; + uint32_t format; + uint32_t offset_words; + uint32_t size_words; + } vtx_buffer_element_t; + typedef struct { + uint32_t input_index; + uint32_t fetch_slot; + uint32_t stride_words; + uint32_t element_count; + vtx_buffer_element_t elements[16]; + } vtx_buffer_desc_t; + typedef struct { + uint32_t count; + vtx_buffer_desc_t descs[16]; + } vtx_buffer_inputs_t; + const vtx_buffer_inputs_t* GetVertexBufferInputs(); typedef struct { uint32_t positions; @@ -62,8 +79,7 @@ protected: alloc_counts_t alloc_counts_; std::vector execs_; std::vector allocs_; - std::vector fetch_vtxs_; - xenos::instr_fetch_vtx_t fetch_vtx_slots_[96]; + vtx_buffer_inputs_t vtx_buffer_inputs_; std::vector fetch_texs_; };